Feature/372 add improve null detection option #400

Merged · 3 commits · Jul 8, 2021
README.md: 1 addition & 0 deletions
@@ -1113,6 +1113,7 @@ Again, the full example is available at
| .option("floating_point_format", "IBM") | Specifies a floating-point format. Available options: `IBM` (default), `IEEE754`, `IBM_little_endian`, `IEEE754_little_endian`. |
| .option("variable_size_occurs", "false") | If `false` (default) fields that have `OCCURS 0 TO 100 TIMES DEPENDING ON` clauses always have the same size corresponding to the maximum array size (e.g. 100 in this example). If set to `true` the size of the field will shrink for each field that has less actual elements. |
| .option("occurs_mapping", "{\"FIELD\": {\"X\": 1}}") | If specified, as a JSON string, allows for String `DEPENDING ON` fields with a corresponding mapping. |
| .option("improved_null_detection", "false") | If `true`, values that contain only 0x0 ror DISPLAY strings and numbers will be considered `null`s instead of empty strings. |

##### Modifier options

CopybookParser.scala
@@ -78,21 +78,22 @@ object CopybookParser {
/**
* Tokenizes Cobol copybook contents and returns the AST.
*
* @param dataEncoding Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @param dataEncoding Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @return Seq[Group] where a group is a record inside the copybook
*/
def parse(copyBookContents: String,
@@ -103,6 +104,7 @@ object CopybookParser {
fieldParentMap: Map[String, String] = HashMap[String, String](),
stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
commentPolicy: CommentPolicy = CommentPolicy(),
improvedNullDetection: Boolean = false,
ebcdicCodePage: CodePage = new CodePageCommon,
asciiCharset: Charset = StandardCharsets.US_ASCII,
isUtf16BigEndian: Boolean = true,
@@ -118,6 +120,7 @@ object CopybookParser {
fieldParentMap,
stringTrimmingPolicy,
commentPolicy,
improvedNullDetection,
ebcdicCodePage,
asciiCharset,
isUtf16BigEndian,
@@ -130,19 +133,20 @@ object CopybookParser {
/**
* Tokenizes Cobol copybook contents and returns the AST.
*
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
*                         resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
*                         resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @return Seq[Group] where a group is a record inside the copybook
*/
def parseTree(copyBookContents: String,
@@ -152,6 +156,7 @@ object CopybookParser {
fieldParentMap: Map[String, String] = HashMap[String, String](),
stringTrimmingPolicy: StringTrimmingPolicy = StringTrimmingPolicy.TrimBoth,
commentPolicy: CommentPolicy = CommentPolicy(),
improvedNullDetection: Boolean = false,
ebcdicCodePage: CodePage = new CodePageCommon,
asciiCharset: Charset = StandardCharsets.US_ASCII,
isUtf16BigEndian: Boolean = true,
@@ -167,6 +172,7 @@ object CopybookParser {
fieldParentMap,
stringTrimmingPolicy,
commentPolicy,
improvedNullDetection,
ebcdicCodePage,
asciiCharset,
isUtf16BigEndian,
@@ -179,21 +185,22 @@ object CopybookParser {
/**
* Tokenizes Cobol copybook contents and returns the AST.
*
* @param enc Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @param enc Encoding of the data file (either ASCII/EBCDIC). The encoding of the copybook is expected to be ASCII.
* @param copyBookContents A string containing all lines of a copybook
* @param dropGroupFillers Drop groups marked as fillers from the output AST
* @param dropValueFillers Drop primitive fields marked as fillers from the output AST
* @param segmentRedefines A list of redefined fields that correspond to various segments. This needs to be specified for automatically
* resolving segment redefines.
* @param fieldParentMap A segment fields parent mapping
* @param stringTrimmingPolicy Specifies if and how strings should be trimmed when parsed
* @param commentPolicy Specifies a policy for comments truncation inside a copybook
* @param improvedNullDetection If true, string values that contain only zero bytes (0x0) will be considered null.
* @param ebcdicCodePage A code page for EBCDIC encoded data
* @param asciiCharset A charset for ASCII encoded data
* @param isUtf16BigEndian If true UTF-16 strings are considered big-endian.
* @param floatingPointFormat A format of floating-point numbers (IBM/IEEE754)
* @param nonTerminals A list of non-terminals that should be extracted as strings
* @param debugFieldsPolicy Specifies if debugging fields need to be added and what should they contain (false, hex, raw).
* @return Seq[Group] where a group is a record inside the copybook
*/
@throws(classOf[SyntaxErrorException])
@@ -205,6 +212,7 @@ object CopybookParser {
fieldParentMap: Map[String, String],
stringTrimmingPolicy: StringTrimmingPolicy,
commentPolicy: CommentPolicy,
improvedNullDetection: Boolean,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
@@ -213,7 +221,7 @@ object CopybookParser {
occursHandlers: Map[String, Map[String, Int]],
debugFieldsPolicy: DebugFieldsPolicy): Copybook = {

val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
val schemaANTLR: CopybookAST = ANTLRParser.parse(copyBookContents, enc, stringTrimmingPolicy, commentPolicy, improvedNullDetection, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)

val nonTerms: Set[String] = (for (id <- nonTerminals)
yield transformIdentifier(id)
@@ -232,7 +240,7 @@ object CopybookParser {
processGroupFillers(
markDependeeFields(
addNonTerminals(
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat),
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection),
occursHandlers
), dropValueFillers
), dropGroupFillers, dropValueFillers
@@ -249,7 +257,7 @@ object CopybookParser {
renameGroupFillers(
markDependeeFields(
addNonTerminals(
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat),
calculateBinaryProperties(schemaANTLR), nonTerms, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection),
occursHandlers
),
dropGroupFillers, dropValueFillers
@@ -267,7 +275,8 @@ object CopybookParser {
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
floatingPointFormat: FloatingPointFormat
floatingPointFormat: FloatingPointFormat,
improvedNullDetection: Boolean
): CopybookAST = {

def getNonTerminalName(name: String, parent: Group): String = {
@@ -292,11 +301,11 @@ object CopybookParser {
case g: Group =>
if (nonTerminals contains g.name) {
newChildren.append(
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat).copy(isRedefined = true)(g.parent)
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection).copy(isRedefined = true)(g.parent)
)
val sz = g.binaryProperties.actualSize
val dataType = AlphaNumeric(s"X($sz)", sz, enc = Some(enc))
val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)
val newName = getNonTerminalName(g.name, g.parent.get)
newChildren.append(
Primitive(
@@ -310,7 +319,7 @@ object CopybookParser {
}
else
newChildren.append(
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
addNonTerminals(g, nonTerminals, enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)
)
}
}
@@ -840,7 +849,7 @@ object CopybookParser {
* <li>Remove all groups that don't have child nodes.</li>
* </ul>
*
* @param ast An AST as a set of copybook records
* @param ast An AST as a set of copybook records
* @param dropValueFillers is there intention to drop primitive fields fillers
* @return The same AST with group fillers processed
*/
@@ -919,8 +928,8 @@ object CopybookParser {
val newGrp = processGroup(grp)
newChildren += newGrp
case st: Primitive =>
newChildren += st.withUpdatedIsRedefined(newIsRedefined = true)
newChildren += getDebugField(st)
newChildren += st.withUpdatedIsRedefined(newIsRedefined = true)
newChildren += getDebugField(st)
}
group.withUpdatedChildren(newChildren)
}
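For direct library use, the new flag can be passed by name since it has a default. A minimal sketch of calling `parseTree` with the option enabled (the import path is assumed from the usual Cobrix package layout, the copybook text is illustrative, and all other parameters keep the defaults shown in the signature above):

```scala
import za.co.absa.cobrix.cobol.parser.CopybookParser  // assumed package

val copybookText =
  """        01  RECORD.
    |            05  NAME        PIC X(10).
    |""".stripMargin

// improvedNullDetection defaults to false, so existing callers are unaffected;
// passing it by name opts in to treating all-zero string values as null.
val parsedCopybook = CopybookParser.parseTree(copybookText, improvedNullDetection = true)
```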
ANTLRParser.scala
@@ -56,11 +56,12 @@ object ANTLRParser {
enc: Encoding,
stringTrimmingPolicy: StringTrimmingPolicy,
commentPolicy: CommentPolicy,
improvedNullDetection: Boolean,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
floatingPointFormat: FloatingPointFormat): CopybookAST = {
val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
val visitor = new ParserVisitor(enc, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)

val strippedContents = filterSpecialCharacters(copyBookContents).split("\\r?\\n").map(
line =>
ParserVisitor.scala
@@ -45,7 +45,8 @@ class ParserVisitor(enc: Encoding,
ebcdicCodePage: CodePage,
asciiCharset: Charset,
isUtf16BigEndian: Boolean,
floatingPointFormat: FloatingPointFormat) extends copybookParserBaseVisitor[Expr] {
floatingPointFormat: FloatingPointFormat,
improvedNullDetection: Boolean) extends copybookParserBaseVisitor[Expr] {
/* expressions */
case class IdentifierExpr(value: String) extends Expr
case class OccursExpr(m: Int, M: Option[Int], dep: Option[String]) extends Expr
@@ -812,7 +813,7 @@ class ParserVisitor(enc: Encoding,
Map(),
isDependee = false,
identifier.toUpperCase() == Constants.FILLER,
DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat)
DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, improvedNullDetection)
) (Some(parent))

parent.children.append(prim)
AsciiStringDecoderWrapper.scala
@@ -29,8 +29,9 @@ import scala.collection.mutable.ArrayBuffer
* @param asciiCharsetName A charset name of input strings
* @return A string representation of the binary data
*/
class AsciiStringDecoderWrapper(trimmingType: Int, asciiCharsetName: String) extends Serializable with (Array[Byte] => Any) {
class AsciiStringDecoderWrapper(trimmingType: Int, asciiCharsetName: String, improvedNullDetection: Boolean) extends Serializable with (Array[Byte] => Any) {
import StringDecoders._
import StringTools._

lazy val charset: Charset = Charset.forName(asciiCharsetName)

@@ -41,6 +42,9 @@ class AsciiStringDecoderWrapper(trimmingType: Int, asciiCharsetName: String) ext
* @return A string representation of the binary data
*/
def apply(bytes: Array[Byte]): String = {
if (improvedNullDetection && isArrayNull(bytes))
return null

var i = 0

// Filter out all special characters
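The early return added to `apply` above is the heart of the feature. A standalone sketch of the behavior it produces (the `isArrayNull` helper here is an illustrative stand-in; the real check lives in `StringTools` and may differ in detail, e.g. in how it treats empty buffers):

```scala
import java.nio.charset.StandardCharsets

object NullDetectionSketch {
  // Illustrative stand-in for StringTools.isArrayNull: true when every byte is 0x00.
  def isArrayNull(bytes: Array[Byte]): Boolean =
    bytes.nonEmpty && bytes.forall(_ == 0)

  // Simplified ASCII decode path; the real wrapper also filters out control
  // characters and applies the configured trimming policy.
  def decodeAscii(bytes: Array[Byte], improvedNullDetection: Boolean): String = {
    if (improvedNullDetection && isArrayNull(bytes)) return null
    new String(bytes, StandardCharsets.US_ASCII).trim
  }

  def main(args: Array[String]): Unit = {
    val zeros = Array[Byte](0, 0, 0, 0)
    println(decodeAscii(zeros, improvedNullDetection = false)) // prints an empty string
    println(decodeAscii(zeros, improvedNullDetection = true))  // prints "null"
  }
}
```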