Merged
35 changes: 34 additions & 1 deletion README.md
@@ -1662,7 +1662,8 @@ The output looks like this:
`common_extended`, `cp037_extended` are code pages that support non-printable characters, converting them to ASCII codes below 32.

## EBCDIC Processor (experimental)
The EBCDIC processor allows processing files by replacing value of fields without changing the underlying format.
The EBCDIC processor allows processing files by replacing the values of fields without changing the underlying format (`CobolProcessingStrategy.InPlace`)
or by converting the input to the variable-record-length format with big-endian RDWs (`CobolProcessingStrategy.ToVariableLength`).

The processing does not require Spark. A processing application can have only the COBOL parser as a dependency (`cobol-parser`).
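
For example, an sbt build that needs only the processor can declare a single dependency. A minimal sketch, assuming the usual Cobrix artifact coordinates; check Maven Central for the exact artifact name and current version:

```scala
// build.sbt: only the COBOL parser is needed; no Spark dependency is required for processing
libraryDependencies += "za.co.absa.cobrix" %% "cobol-parser" % "<latest-version>"
```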

@@ -1676,6 +1677,7 @@ val builder = CobolProcessor.builder(copybookContents)

val builder = CobolProcessor.builder
.withCopybookContents("...some copybook...")
.withProcessingStrategy(CobolProcessingStrategy.InPlace) // Or CobolProcessingStrategy.ToVariableLength

val processor = new RawRecordProcessor {
override def processRecord(record: Array[Byte], ctx: CobolProcessorContext): Array[Byte] = {
@@ -1699,6 +1701,7 @@ import za.co.absa.cobrix.cobol.processor.{CobolProcessor, CobolProcessorContext}

val count = CobolProcessor.builder
.withCopybookContents(copybook)
.withProcessingStrategy(CobolProcessingStrategy.InPlace) // Or CobolProcessingStrategy.ToVariableLength
.withRecordProcessor { (record: Array[Byte], ctx: CobolProcessorContext) =>
// The transformation logic goes here
val value = copybook.getFieldValueByName("some_field", record, 0)
@@ -1726,6 +1729,7 @@ val copybookContents = "...some copybook..."

SparkCobolProcessor.builder
.withCopybookContents(copybook)
.withProcessingStrategy(CobolProcessingStrategy.InPlace) // Or CobolProcessingStrategy.ToVariableLength
.withRecordProcessor { (record: Array[Byte], ctx: CobolProcessorContext) =>
// The transformation logic goes here
val value = ctx.copybook.getFieldValueByName("some_field", record, 0)
@@ -1740,6 +1744,35 @@ SparkCobolProcessor.builder
.save(outputPath)
```
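
If `CobolProcessingStrategy.ToVariableLength` was used, the output file carries big-endian RDW headers, so it should be readable back with `spark-cobol` as a variable-length file. A minimal sketch, reusing `copybookContents` and `outputPath` from the example above:

```scala
val df = spark.read
  .format("cobol")
  .option("copybook_contents", copybookContents)
  .option("record_format", "V")
  .load(outputPath)
```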

## EBCDIC Spark raw record RDD generator (experimental)
You can process the raw records of a mainframe file as an `RDD[Array[Byte]]`. This can be useful for custom processing without converting
records to Spark data types. You can still access fields via the parsed copybook.

Example:
```scala
import org.apache.spark.rdd.RDD
import za.co.absa.cobrix.spark.cobol.SparkCobolProcessor

val copybookContents = "...some copybook..."

val rddBuilder = SparkCobolProcessor.builder
.withCopybookContents(copybookContents)
.option("record_format", "F")
.load("s3://bucket/some/path")

// Fetch the parsed copybook and the RDD separately
val copybook = rddBuilder.getParsedCopybook
val rdd: RDD[Array[Byte]] = rddBuilder.toRDD

val segmentRdds: RDD[String] = rdd.map { record =>
  copybook.getFieldValueByName("SEGMENT_ID", record).toString
}

// Print the list of unique segments
segmentRdds.distinct.collect.sorted.foreach(println)
```

## EBCDIC Writer (experimental)

Cobrix's EBCDIC writer is an experimental feature that allows writing Spark DataFrames as EBCDIC mainframe files.
@@ -0,0 +1,24 @@
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.cobrix.cobol.processor

trait CobolProcessingStrategy

object CobolProcessingStrategy {
case object InPlace extends CobolProcessingStrategy
case object ToVariableLength extends CobolProcessingStrategy
}
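
The strategy is a simple two-case ADT. A caller that selects it from configuration might map a string to a strategy as in the following sketch; the helper and its config value names are hypothetical and only illustrate the intended use:

```scala
import za.co.absa.cobrix.cobol.processor.CobolProcessingStrategy

def strategyFromConfig(value: String): CobolProcessingStrategy = value.toLowerCase match {
  case "in_place"           => CobolProcessingStrategy.InPlace          // keep the original record format
  case "to_variable_length" => CobolProcessingStrategy.ToVariableLength // emit records with big-endian RDWs
  case other                => throw new IllegalArgumentException(s"Unknown processing strategy: $other")
}
```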
@@ -17,7 +17,7 @@
package za.co.absa.cobrix.cobol.processor

import za.co.absa.cobrix.cobol.parser.Copybook
import za.co.absa.cobrix.cobol.processor.impl.CobolProcessorImpl
import za.co.absa.cobrix.cobol.processor.impl.{CobolProcessorInPlace, CobolProcessorToRdw}
import za.co.absa.cobrix.cobol.reader.parameters.{CobolParametersParser, Parameters, ReaderParameters}
import za.co.absa.cobrix.cobol.reader.schema.CobolSchema
import za.co.absa.cobrix.cobol.reader.stream.{FSStream, SimpleStream}
@@ -50,6 +50,7 @@ object CobolProcessor {
private val caseInsensitiveOptions = new mutable.HashMap[String, String]()
private var copybookContentsOpt: Option[String] = None
private var rawRecordProcessorOpt: Option[RawRecordProcessor] = None
private var cobolProcessingStrategy: CobolProcessingStrategy = CobolProcessingStrategy.InPlace

def build(): CobolProcessor = {
if (copybookContentsOpt.isEmpty) {
@@ -59,7 +60,10 @@
val readerParameters = getReaderParameters
val cobolSchema = getCobolSchema(readerParameters)

new CobolProcessorImpl(readerParameters, cobolSchema.copybook, copybookContentsOpt.get, caseInsensitiveOptions.toMap)
cobolProcessingStrategy match {
case CobolProcessingStrategy.InPlace => new CobolProcessorInPlace(readerParameters, cobolSchema.copybook, copybookContentsOpt.get, caseInsensitiveOptions.toMap)
case CobolProcessingStrategy.ToVariableLength => new CobolProcessorToRdw(readerParameters, cobolSchema.copybook, copybookContentsOpt.get, caseInsensitiveOptions.toMap)
}
}

def load(path: String): CobolProcessorLoader = {
@@ -87,7 +91,7 @@ object CobolProcessor {
val readerParameters = getReaderParameters
val cobolSchema = getCobolSchema(readerParameters)

new CobolProcessorLoader(path, copybookContentsOpt.get, cobolSchema.copybook, rawRecordProcessorOpt.get, readerParameters, caseInsensitiveOptions.toMap)
new CobolProcessorLoader(path, copybookContentsOpt.get, cobolSchema.copybook, rawRecordProcessorOpt.get, readerParameters, cobolProcessingStrategy, caseInsensitiveOptions.toMap)
}

def withCopybookContents(copybookContents: String): CobolProcessorBuilder = {
@@ -100,6 +104,11 @@
this
}

def withProcessingStrategy(strategy: CobolProcessingStrategy): CobolProcessorBuilder = {
cobolProcessingStrategy = strategy
this
}

/**
* Adds a single option to the builder.
*
@@ -142,9 +151,13 @@
copybook: Copybook,
rawRecordProcessor: RawRecordProcessor,
readerParameters: ReaderParameters,
cobolProcessingStrategy: CobolProcessingStrategy,
options: Map[String, String]) {
def save(outputFile: String): Long = {
val processor = new CobolProcessorImpl(readerParameters, copybook, copybookContents, options)
val processor = cobolProcessingStrategy match {
case CobolProcessingStrategy.InPlace => new CobolProcessorInPlace(readerParameters, copybook, copybookContents, options)
case CobolProcessingStrategy.ToVariableLength => new CobolProcessorToRdw(readerParameters, copybook, copybookContents, options)
}

val ifs = new FSStream(fileToProcess)
val ofs = new BufferedOutputStream(new FileOutputStream(outputFile))
@@ -0,0 +1,50 @@
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.cobrix.cobol.processor.impl

import za.co.absa.cobrix.cobol.processor.CobolProcessor
import za.co.absa.cobrix.cobol.reader.VarLenNestedReader
import za.co.absa.cobrix.cobol.reader.extractors.raw.RawRecordExtractor
import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
import za.co.absa.cobrix.cobol.reader.stream.SimpleStream

/**
* Provides common methods for direct EBCDIC-to-EBCDIC data processor implementations.
*
* The processing can be done from inside an RDD, which is why it is serializable.
*/
abstract class CobolProcessorBase extends CobolProcessor with Serializable

object CobolProcessorBase {
def getRecordExtractor(readerParameters: ReaderParameters, copybookContents: String, inputStream: SimpleStream, headerStreamOpt: Option[SimpleStream]): RawRecordExtractor = {
val dataStream = inputStream.copyStream()
val headerStream = headerStreamOpt match {
case Some(stream) => stream
case None => inputStream.copyStream()
}

val reader = new VarLenNestedReader[Array[Any]](Seq(copybookContents), readerParameters, new ArrayOfAnyHandler)

reader.recordExtractor(0, dataStream, headerStream) match {
case Some(extractor) => extractor
case None =>
throw new IllegalArgumentException(s"Cannot create a record extractor for the given reader parameters. " +
"Please check the copybook and the reader parameters."
)
}
}
}
@@ -27,9 +27,8 @@ import java.io.OutputStream

/**
* Implementation of the CobolProcessor trait, responsible for processing COBOL data streams
* by extracting records and applying a user-defined raw record processor.
*
* The processing can be done from inside an RDD so this is why it is serializable.
* by extracting records and applying a user-defined raw record processor. This processor
* retains the original COBOL data format in the output.
*
* Please, do not use this class directly. Use `CobolProcessor.builder()` instead.
*
@@ -38,18 +37,18 @@ import java.io.OutputStream
* @param copybookContents The raw textual representation of the copybook.
* @param options A map of processing options to customize the behavior of the processor (same as for `spark-cobol`).
*/
class CobolProcessorImpl(readerParameters: ReaderParameters,
copybook: Copybook,
copybookContents: String,
options: Map[String, String]) extends CobolProcessor with Serializable {
class CobolProcessorInPlace(readerParameters: ReaderParameters,
copybook: Copybook,
copybookContents: String,
options: Map[String, String]) extends CobolProcessorBase {
override def process(inputStream: SimpleStream,
outputStream: OutputStream)
(rawRecordProcessor: RawRecordProcessor): Long = {
val recordExtractor = getRecordExtractor(readerParameters, inputStream)
val recordExtractor = CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, inputStream, None)

val dataStream = inputStream.copyStream()
try {
StreamProcessor.processStream(copybook,
StreamProcessor.processStreamInPlace(copybook,
options,
dataStream,
recordExtractor,
@@ -60,18 +59,5 @@ class CobolProcessorImpl(readerParameters: ReaderParameters,
}
}

private[processor] def getRecordExtractor(readerParameters: ReaderParameters, inputStream: SimpleStream): RawRecordExtractor = {
val dataStream = inputStream.copyStream()
val headerStream = inputStream.copyStream()

val reader = new VarLenNestedReader[Array[Any]](Seq(copybookContents), readerParameters, new ArrayOfAnyHandler)

reader.recordExtractor(0, dataStream, headerStream) match {
case Some(extractor) => extractor
case None =>
throw new IllegalArgumentException(s"Cannot create a record extractor for the given reader parameters. " +
"Please check the copybook and the reader parameters."
)
}
}
}
@@ -0,0 +1,53 @@
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.cobrix.cobol.processor.impl

import za.co.absa.cobrix.cobol.parser.Copybook
import za.co.absa.cobrix.cobol.processor.RawRecordProcessor
import za.co.absa.cobrix.cobol.reader.parameters.ReaderParameters
import za.co.absa.cobrix.cobol.reader.stream.SimpleStream

import java.io.OutputStream

/**
* Implementation of the CobolProcessor trait, responsible for processing COBOL data streams
* by extracting records and applying a user-defined raw record processor. This processor
* converts the input to the variable-record-length format with big-endian RDW headers.
*
* Please, do not use this class directly. Use `CobolProcessor.builder()` instead.
*
* @param readerParameters Configuration for record extraction and COBOL file parsing.
* @param copybook The copybook definition used for interpreting COBOL data structures.
* @param copybookContents The raw textual representation of the copybook.
* @param options A map of processing options to customize the behavior of the processor (same as for `spark-cobol`).
*/
class CobolProcessorToRdw(readerParameters: ReaderParameters,
copybook: Copybook,
copybookContents: String,
options: Map[String, String]) extends CobolProcessorBase {
override def process(inputStream: SimpleStream,
outputStream: OutputStream)
(rawRecordProcessor: RawRecordProcessor): Long = {
val recordExtractor = CobolProcessorBase.getRecordExtractor(readerParameters, copybookContents, inputStream, None)

StreamProcessor.processStreamToRdw(copybook,
options,
recordExtractor,
rawRecordProcessor,
outputStream)
}
}
@@ -36,12 +36,12 @@ object StreamProcessor {
* @param outputStream the output stream where the processed records will be written.
* @return The number of records processed.
*/
def processStream(copybook: Copybook,
options: Map[String, String],
inputStream: SimpleStream,
recordExtractor: RawRecordExtractor,
recordProcessor: RawRecordProcessor,
outputStream: OutputStream): Long = {
def processStreamInPlace(copybook: Copybook,
options: Map[String, String],
inputStream: SimpleStream,
recordExtractor: RawRecordExtractor,
recordProcessor: RawRecordProcessor,
outputStream: OutputStream): Long = {
var recordCount = 0L
while (recordExtractor.hasNext) {
recordCount += 1
@@ -68,4 +68,41 @@
}
recordCount
}

/**
* Processes a stream of COBOL raw records and writes them back in the variable-record-length format with big-endian RDW headers.
*
* @param copybook the COBOL copybook that describes the schema of the records.
* @param options arbitrary options used for splitting input data into records (same as 'spark-cobol' options).
* Keys are lower-cased for case-insensitive handling. Can contain custom options as well.
* @param recordExtractor the extractor that extracts raw records from the input stream.
* @param recordProcessor the per-record processing logic implementation.
* @param outputStream the output stream where the processed records will be written.
* @return The number of records processed.
*/
def processStreamToRdw(copybook: Copybook,
options: Map[String, String],
recordExtractor: RawRecordExtractor,
recordProcessor: RawRecordProcessor,
outputStream: OutputStream): Long = {
var recordCount = 0L

while (recordExtractor.hasNext) {
recordCount += 1
val record = recordExtractor.next()
val recordSize = record.length

val ctx = CobolProcessorContext(copybook, options, recordExtractor.offset)

val updatedRecord = recordProcessor.processRecord(record, ctx)

val rdw = Array[Byte](((updatedRecord.length >> 8) & 0xFF).toByte, ((updatedRecord.length) & 0xFF).toByte, 0, 0)

outputStream.write(rdw)
outputStream.write(updatedRecord)
}
Comment on lines +83 to +103
coderabbitai bot (Contributor), Nov 13, 2025

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Verify RDW length calculation includes RDW header.

Line 99 calculates the RDW using only updatedRecord.length. According to IBM mainframe VRL standards, the RDW length field typically includes the 4-byte RDW header itself. This means the length should be updatedRecord.length + 4.

Verify that the current calculation matches your intended RDW format specification. If this needs to be compatible with standard mainframe VRL readers, the length should likely include the RDW header.


🏁 Script executed:

#!/bin/bash
# Check if there are any tests or documentation that clarify the expected RDW format
rg -n -C3 "RDW|rdw_adjustment|Record Descriptor Word" --type=scala

Length of output: 131567




🏁 Script executed:

#!/bin/bash
# Find tests for StreamProcessor.processStreamToRdw method
rg -n "processStreamToRdw|StreamProcessor" --type=scala -A5 -B5 | head -100

Length of output: 6151




🏁 Script executed:

#!/bin/bash
# Search for tests of CobolProcessorToRdw
find . -name "*Test*" -type f -exec grep -l "CobolProcessorToRdw\|processStreamToRdw" {} \;

Length of output: 0




🌐 Web query:

IBM mainframe RDW Record Descriptor Word length field includes header standard

💡 Result:

Short answer: The RDW is 4 bytes and the length field (the first two bytes of the RDW) contains the logical record length including those 4 RDW bytes. The second two RDW bytes are the segment/spanning descriptor (zero if not spanned). [1][2]

Sources:

  • IBM z/OS SMF header documentation (RDW description). [1]
  • Variable-record format / RECFM documentation explaining 4‑byte RDW and that LRECL includes the 4 bytes. [2]

Correct the RDW length calculation to include the 4-byte header.

The RDW length field must include the 4-byte RDW header itself per IBM mainframe standard. The RDW length field contains the logical record length including those 4 RDW bytes. Line 99 should calculate the RDW as:

val rdw = Array[Byte](
  (((updatedRecord.length + 4) >> 8) & 0xFF).toByte, 
  ((updatedRecord.length + 4) & 0xFF).toByte, 
  0, 
  0
)

Without this adjustment, the output will not conform to standard mainframe VRL format and will be incompatible with standard RDW readers.

🤖 Prompt for AI Agents
In
cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/processor/impl/StreamProcessor.scala
around lines 83 to 103, the RDW length calculation currently omits the 4-byte
RDW header; update the RDW byte array computation to use (updatedRecord.length +
4) when computing the two length bytes so the RDW length field equals the
logical record length including the 4-byte header, then write the RDW and record
as before.

yruslan (Collaborator, Author)
We have observed both cases:

  • when RDW includes the size of the header itself
  • when RDW does not include the header size, only the payload.

Historically, Cobrix uses the second option, and loading files whose RDWs include the header size needs an adjustment of -4 (option("rdw_adjustment", "-4")). Going to keep this implementation because it matches Cobrix defaults. But if different types of RDWs are required later, we can always add an RDW adjustment option for the writer.
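
To make the two conventions concrete, for a 100-byte payload the payload-only RDW used by Cobrix is `00 64 00 00`, while an RDW that also counts its own 4-byte header would be `00 68 00 00`. A small illustrative sketch:

```scala
val payloadLen = 100
// Cobrix convention: the length field holds only the payload size
val cobrixRdw = Array[Byte](((payloadLen >> 8) & 0xFF).toByte, (payloadLen & 0xFF).toByte, 0, 0)          // 00 64 00 00
// IBM-style convention: the length field includes the 4-byte RDW header
val ibmRdw = Array[Byte]((((payloadLen + 4) >> 8) & 0xFF).toByte, ((payloadLen + 4) & 0xFF).toByte, 0, 0) // 00 68 00 00
```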

coderabbitai bot (Contributor)
@yruslan, thank you for the clarification! That makes perfect sense. Maintaining consistency with Cobrix's historical RDW format (payload-only) ensures backward compatibility and aligns with the existing rdw_adjustment option behavior. The flexibility to add writer-side RDW adjustment options later if needed sounds like a solid approach.

I've noted this convention for future reference.

(\__/)
(='.'=)  *noted*
(")_(")

✏️ Learnings added
Learnt from: yruslan
Repo: AbsaOSS/cobrix PR: 799
File: cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/processor/impl/StreamProcessor.scala:83-103
Timestamp: 2025-11-13T14:28:55.848Z
Learning: In Cobrix, RDW (Record Descriptor Word) length field contains only the payload size, not including the 4-byte RDW header itself. This is a historical convention. When reading files with RDW that includes the header size, users should use `option("rdw_adjustment", "-4")`.



recordCount
}

}