Skip to content

Commit

Permalink
Merge pull request #400 from AbsaOSS/feature/372-add-improve-null-det…
Browse files Browse the repository at this point in the history
…ection-option

Feature/372 add improve null detection option
  • Loading branch information
yruslan authored Jul 8, 2021
2 parents fc3bbd9 + 14c1439 commit 050be67
Show file tree
Hide file tree
Showing 21 changed files with 473 additions and 277 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1113,6 +1113,7 @@ Again, the full example is available at
| .option("floating_point_format", "IBM") | Specifies a floating-point format. Available options: `IBM` (default), `IEEE754`, `IBM_little_endian`, `IEEE754_little_endian`. |
| .option("variable_size_occurs", "false") | If `false` (default) fields that have `OCCURS 0 TO 100 TIMES DEPENDING ON` clauses always have the same size corresponding to the maximum array size (e.g. 100 in this example). If set to `true` the size of the field will shrink for each field that has less actual elements. |
| .option("occurs_mapping", "{\"FIELD\": {\"X\": 1}}") | If specified, as a JSON string, allows for String `DEPENDING ON` fields with a corresponding mapping. |
| .option("improved_null_detection", "false") | If `true`, values that contain only 0x0 for DISPLAY strings and numbers will be considered `null`s instead of empty strings. |

##### Modifier options

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,21 +78,22 @@ object CopybookParser {
/**
* Tokenizes a Cobol Copybook contents and returns the AST.
*
* @param dataEncoding Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @param dataEncoding Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @return Seq[Group] where a group is a record inside the copybook
*/
def parse(copyBookContents: String,
Expand All @@ -103,6 +104,7 @@ object CopybookParser {
fieldParentMap: Map[String, String] = HashMap[String, String](),
stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
commentPolicy: CommentPolicy = CommentPolicy(),
improvedNullDetection: Boolean = false,
ebcdicCodePage: CodePage = new CodePageCommon,
asciiCharset: Charset = StandardCharsets.US_ASCII,
isUtf16BigEndian: Boolean = true,
Expand All @@ -118,6 +120,7 @@ object CopybookParser {
fieldParentMap,
stringTrimmingPolicy,
commentPolicy,
improvedNullDetection,
ebcdicCodePage,
asciiCharset,
isUtf16BigEndian,
Expand All @@ -130,19 +133,20 @@ object CopybookParser {
/**
* Tokenizes a Cobol Copybook contents and returns the AST.
*
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @return Seq[Group] where a group is a record inside the copybook
*/
def parseTree(copyBookContents: String,
Expand All @@ -152,6 +156,7 @@ object CopybookParser {
fieldParentMap: Map[String, String] = HashMap[String, String](),
stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
commentPolicy: CommentPolicy = CommentPolicy(),
improvedNullDetection: Boolean = false,
ebcdicCodePage: CodePage = new CodePageCommon,
asciiCharset: Charset = StandardCharsets.US_ASCII,
isUtf16BigEndian: Boolean = true,
Expand All @@ -167,6 +172,7 @@ object CopybookParser {
fieldParentMap,
stringTrimmingPolicy,
commentPolicy,
improvedNullDetection,
ebcdicCodePage,
asciiCharset,
isUtf16BigEndian,
Expand All @@ -179,21 +185,22 @@ object CopybookParser {
/**
* Tokenizes a Cobol Copybook contents and returns the AST.
*
* @param enc Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @param enc Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @return Seq[Group] where a group is a record inside the copybook
*/
@throws(classOf[SyntaxErrorException])
Expand All @@ -205,6 +212,7 @@ object CopybookParser {
fieldParentMap: Map[String, String],
stringTrimmingPolicy: StringTrimmingPolicy,
commentPolicy: CommentPolicy,
improvedNullDetection: Boolean,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
Expand All @@ -213,7 +221,7 @@ object CopybookParser {
occursHandlers: Map[String, Map[String, Int]],
debugFieldsPolicy: DebugFieldsPolicy): Copybook = {

val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, improvedNullDetection, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)

val nonTerms: Set[String] = (for (id <- nonTerminals)
yield transformIdentifier(id)
Expand All @@ -232,7 +240,7 @@ object CopybookParser {
processGroupFillers(
markDependeeFields(
addNonTerminals(
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat),
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection),
occursHandlers
), dropValueFillers
), dropGroupFillers, dropValueFillers
Expand All @@ -249,7 +257,7 @@ object CopybookParser {
renameGroupFillers(
markDependeeFields(
addNonTerminals(
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat),
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection),
occursHandlers
),
dropGroupFillers, dropValueFillers
Expand All @@ -267,7 +275,8 @@ object CopybookParser {
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
floatingPointFormat: FloatingPointFormat
floatingPointFormat: FloatingPointFormat,
improvedNullDetection: Boolean
): CopybookAST = {

def getNonTerminalName(name: String, parent: Group): String = {
Expand All @@ -292,11 +301,11 @@ object CopybookParser {
case g: Group =>
if (nonTerminals contains g.name) {
newChildren.append(
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat).copy(isRedefined = true)(g.parent)
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection).copy(isRedefined = true)(g.parent)
)
val sz = g.binaryProperties.actualSize
val dataType = AlphaNumeric(s"X($sz)", sz, enc = Some(enc))
val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)
val newName = getNonTerminalName(g.name, g.parent.get)
newChildren.append(
Primitive(
Expand All @@ -310,7 +319,7 @@ object CopybookParser {
}
else
newChildren.append(
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)
)
}
}
Expand Down Expand Up @@ -840,7 +849,7 @@ object CopybookParser {
* <li>Remove all groups that don't have child nodes.</li>
* </ul>
*
* @param ast An AST as a set of copybook records
* @param ast An AST as a set of copybook records
* @param dropValueFillers is there intention to drop primitive fields fillers
* @return The same AST with group fillers processed
*/
Expand Down Expand Up @@ -919,8 +928,8 @@ object CopybookParser {
val newGrp = processGroup(grp)
newChildren += newGrp
case st: Primitive =>
newChildren += st.withUpdatedIsRedefined(newIsRedefined = true)
newChildren += getDebugField(st)
newChildren += st.withUpdatedIsRedefined(newIsRedefined = true)
newChildren += getDebugField(st)
}
group.withUpdatedChildren(newChildren)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,12 @@ object ANTLRParser {
enc: Encoding,
stringTrimmingPolicy: StringTrimmingPolicy,
commentPolicy: CommentPolicy,
improvedNullDetection: Boolean,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
floatingPointFormat: FloatingPointFormat): CopybookAST = {
val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)

val strippedContents = filterSpecialCharacters(copyBookContents).split("\\r?\\n").map(
line =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ class ParserVisitor(enc: Encoding,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
floatingPointFormat: FloatingPointFormat) extends copybookParserBaseVisitor[Expr] {
floatingPointFormat: FloatingPointFormat,
improvedNullDetection: Boolean) extends copybookParserBaseVisitor[Expr] {
/* expressions */
case class IdentifierExpr(value: String) extends Expr
case class OccursExpr(m: Int, M: Option[Int], dep: Option[String]) extends Expr
Expand Down Expand Up @@ -812,7 +813,7 @@ class ParserVisitor(enc: Encoding,
Map(),
isDependee = false,
identifier.toUpperCase() == Constants.FILLER,
DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)
) (Some(parent))

parent.children.append(prim)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ import scala.collection.mutable.ArrayBuffer
* @param asciiCharsetName A charset name of input strings
* @return A string representation of the binary data
*/
class AsciiStringDecoderWrapper(trimmingType: Int, asciiCharsetName: String) extends Serializable with (Array[Byte] => Any) {
class AsciiStringDecoderWrapper(trimmingType: Int, asciiCharsetName: String, improvedNullDetection: Boolean) extends Serializable with (Array[Byte] => Any) {
import StringDecoders._
import StringTools._

lazy val charset: Charset = Charset.forName(asciiCharsetName)

Expand All @@ -41,6 +42,9 @@ class AsciiStringDecoderWrapper(trimmingType: Int, asciiCharsetName: String) ext
* @return A string representation of the binary data
*/
def apply(bytes: Array[Byte]): String = {
if (improvedNullDetection && isArrayNull(bytes))
return null

var i = 0

// Filter out all special characters
Expand Down
Loading

0 comments on commit 050be67

Please sign in to comment.