Feature/372 add improve null detection option #400

Merged · 3 commits · Jul 8, 2021
README.md: 1 addition & 0 deletions
@@ -1113,6 +1113,7 @@ Again, the full example is available at
| .option("floating_point_format", "IBM") | Specifies a floating-point format. Available options: `IBM` (default), `IEEE754`, `IBM_little_endian`, `IEEE754_little_endian`. |
| .option("variable_size_occurs", "false") | If `false` (default) fields that have `OCCURS 0 TO 100 TIMES DEPENDING ON` clauses always have the same size corresponding to the maximum array size (e.g. 100 in this example). If set to `true` the size of the field will shrink for each field that has less actual elements. |
| .option("occurs_mapping", "{\"FIELD\": {\"X\": 1}}") | If specified, as a JSON string, allows for String `DEPENDING ON` fields with a corresponding mapping. |
| .option("improved_null_detection", "false") | If `true`, values that contain only 0x0 ror DISPLAY strings and numbers will be considered `null`s instead of empty strings. |

##### Modifier options

CopybookParser.scala
@@ -78,21 +78,22 @@ object CopybookParser {
/**
* Tokenizes Cobol copybook contents and returns the AST.
*
* @param dataEncoding Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @param dataEncoding Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @return Seq[Group] where a group is a record inside the copybook
*/
def parse(copyBookContents: String,
@@ -103,6 +104,7 @@ object CopybookParser {
fieldParentMap: Map[String, String] = HashMap[String, String](),
stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
commentPolicy: CommentPolicy = CommentPolicy(),
improvedNullDetection: Boolean = false,
ebcdicCodePage: CodePage = new CodePageCommon,
asciiCharset: Charset = StandardCharsets.US_ASCII,
isUtf16BigEndian: Boolean = true,
@@ -118,6 +120,7 @@ object CopybookParser {
fieldParentMap,
stringTrimmingPolicy,
commentPolicy,
improvedNullDetection,
ebcdicCodePage,
asciiCharset,
isUtf16BigEndian,
@@ -130,19 +133,20 @@ object CopybookParser {
/**
* Tokenizes Cobol copybook contents and returns the AST.
*
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
*                         resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
*                         resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @return Seq[Group] where a group is a record inside the copybook
*/
def parseTree(copyBookContents: String,
@@ -152,6 +156,7 @@ object CopybookParser {
fieldParentMap: Map[String, String] = HashMap[String, String](),
stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
commentPolicy: CommentPolicy = CommentPolicy(),
improvedNullDetection: Boolean = false,
ebcdicCodePage: CodePage = new CodePageCommon,
asciiCharset: Charset = StandardCharsets.US_ASCII,
isUtf16BigEndian: Boolean = true,
@@ -167,6 +172,7 @@ object CopybookParser {
fieldParentMap,
stringTrimmingPolicy,
commentPolicy,
improvedNullDetection,
ebcdicCodePage,
asciiCharset,
isUtf16BigEndian,
@@ -179,21 +185,22 @@ object CopybookParser {
/**
* Tokenizes Cobol copybook contents and returns the AST.
*
* @param enc Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @param enc Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @return Seq[Group] where a group is a record inside the copybook
*/
@throws(classOf[SyntaxErrorException])
@@ -205,6 +212,7 @@ object CopybookParser {
fieldParentMap: Map[String, String],
stringTrimmingPolicy: StringTrimmingPolicy,
commentPolicy: CommentPolicy,
improvedNullDetection: Boolean,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
@@ -213,7 +221,7 @@ object CopybookParser {
occursHandlers: Map[String, Map[String, Int]],
debugFieldsPolicy: DebugFieldsPolicy): Copybook = {

val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, improvedNullDetection, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)

val nonTerms: Set[String] = (for (id <- nonTerminals)
yield transformIdentifier(id)
@@ -232,7 +240,7 @@ object CopybookParser {
processGroupFillers(
markDependeeFields(
addNonTerminals(
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat),
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection),
occursHandlers
), dropValueFillers
), dropGroupFillers, dropValueFillers
@@ -249,7 +257,7 @@ object CopybookParser {
renameGroupFillers(
markDependeeFields(
addNonTerminals(
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat),
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection),
occursHandlers
),
dropGroupFillers, dropValueFillers
@@ -267,7 +275,8 @@ object CopybookParser {
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
floatingPointFormat: FloatingPointFormat
floatingPointFormat: FloatingPointFormat,
improvedNullDetection: Boolean
): CopybookAST = {

def getNonTerminalName(name: String, parent: Group): String = {
@@ -292,11 +301,11 @@ object CopybookParser {
case g: Group =>
if (nonTerminals contains g.name) {
newChildren.append(
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat).copy(isRedefined = true)(g.parent)
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection).copy(isRedefined = true)(g.parent)
)
val sz = g.binaryProperties.actualSize
val dataType = AlphaNumeric(s"X($sz)", sz, enc = Some(enc))
val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)
val newName = getNonTerminalName(g.name, g.parent.get)
newChildren.append(
Primitive(
@@ -310,7 +319,7 @@ object CopybookParser {
}
else
newChildren.append(
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)
)
}
}
@@ -840,7 +849,7 @@ object CopybookParser {
* <li>Remove all groups that don't have child nodes.</li>
* </ul>
*
* @param ast An AST as a set of copybook records
* @param ast An AST as a set of copybook records
* @param dropValueFillers is there intention to drop primitive fields fillers
* @return The same AST with group fillers processed
*/
@@ -919,8 +928,8 @@ object CopybookParser {
val newGrp = processGroup(grp)
newChildren += newGrp
case st: Primitive =>
newChildren += st.withUpdatedIsRedefined(newIsRedefined = true)
newChildren += getDebugField(st)
newChildren += st.withUpdatedIsRedefined(newIsRedefined = true)
newChildren += getDebugField(st)
}
group.withUpdatedChildren(newChildren)
}
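For direct library use, the new flag can be passed by name since it has a default. A minimal sketch of calling `parseTree` with the option enabled (the import path is assumed from the usual Cobrix package layout, the copybook text is illustrative, and all other parameters keep the defaults shown in the signature above):

```scala
import za.co.absa.cobrix.cobol.parser.CopybookParser  // assumed package

val copybookText =
  """        01  RECORD.
    |            05  NAME        PIC X(10).
    |""".stripMargin

// improvedNullDetection defaults to false, so existing callers are unaffected;
// passing it by name opts in to treating all-zero string values as null.
val parsedCopybook = CopybookParser.parseTree(copybookText, improvedNullDetection = true)
```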
ANTLRParser.scala
@@ -56,11 +56,12 @@ object ANTLRParser {
enc: Encoding,
stringTrimmingPolicy: StringTrimmingPolicy,
commentPolicy: CommentPolicy,
improvedNullDetection: Boolean,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
floatingPointFormat: FloatingPointFormat): CopybookAST = {
val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)

val strippedContents = filterSpecialCharacters(copyBookContents).split("\\r?\\n").map(
line =>
ParserVisitor.scala
@@ -45,7 +45,8 @@ class ParserVisitor(enc: Encoding,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
floatingPointFormat: FloatingPointFormat) extends copybookParserBaseVisitor[Expr] {
floatingPointFormat: FloatingPointFormat,
improvedNullDetection: Boolean) extends copybookParserBaseVisitor[Expr] {
/* expressions */
case class IdentifierExpr(value: String) extends Expr
case class OccursExpr(m: Int, M: Option[Int], dep: Option[String]) extends Expr
@@ -812,7 +813,7 @@ class ParserVisitor(enc: Encoding,
Map(),
isDependee = false,
identifier.toUpperCase() == Constants.FILLER,
DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)
) (Some(parent))

parent.children.append(prim)
AsciiStringDecoderWrapper.scala
@@ -29,8 +29,9 @@ import scala.collection.mutable.ArrayBuffer
* @param asciiCharsetName A charset name of input strings
* @return A string representation of the binary data
*/
class AsciiStringDecoderWrapper(trimmingType: Int, asciiCharsetName: String) extends Serializable with (Array[Byte] => Any) {
class AsciiStringDecoderWrapper(trimmingType: Int, asciiCharsetName: String, improvedNullDetection: Boolean) extends Serializable with (Array[Byte] => Any) {
import StringDecoders._
import StringTools._

lazy val charset: Charset = Charset.forName(asciiCharsetName)

@@ -41,6 +42,9 @@ class AsciiStringDecoderWrapper(trimmingType: Int, asciiCharsetName: String) ext
* @return A string representation of the binary data
*/
def apply(bytes: Array[Byte]): String = {
if (improvedNullDetection && isArrayNull(bytes))
return null

var i = 0

// Filter out all special characters
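The early return added to `apply` above is the heart of the feature. A standalone sketch of the behavior it produces (the `isArrayNull` helper here is an illustrative stand-in; the real check lives in `StringTools` and may differ in detail, e.g. in how it treats empty buffers):

```scala
import java.nio.charset.StandardCharsets

object NullDetectionSketch {
  // Illustrative stand-in for StringTools.isArrayNull: true when every byte is 0x00.
  def isArrayNull(bytes: Array[Byte]): Boolean =
    bytes.nonEmpty && bytes.forall(_ == 0)

  // Simplified ASCII decode path; the real wrapper also filters out control
  // characters and applies the configured trimming policy.
  def decodeAscii(bytes: Array[Byte], improvedNullDetection: Boolean): String = {
    if (improvedNullDetection && isArrayNull(bytes)) return null
    new String(bytes, StandardCharsets.US_ASCII).trim
  }

  def main(args: Array[String]): Unit = {
    val zeros = Array[Byte](0, 0, 0, 0)
    println(decodeAscii(zeros, improvedNullDetection = false)) // prints an empty string
    println(decodeAscii(zeros, improvedNullDetection = true))  // prints "null"
  }
}
```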