Skip to content

Commit 7299961

Browse files
sameeragarwal authored and yhuai committed
[SPARK-14016][SQL] Support high-precision decimals in vectorized parquet reader
## What changes were proposed in this pull request?

This patch adds support for reading `DecimalTypes` with high (> 18) precision in `VectorizedColumnReader`.

## How was this patch tested?

1. `VectorizedColumnReader` initially had a gating condition on `primitiveType.getDecimalMetadata().getPrecision() > Decimal.MAX_LONG_DIGITS()` that made us fall back on parquet-mr for handling high-precision decimals. This condition is now removed.
2. In particular, the `ParquetHadoopFsRelationSuite` (which tests all supported Hive types — including `DecimalType(25, 5)`) fails when the gating condition is removed (#11808) and should now pass with this change.

Author: Sameer Agarwal <sameer@databricks.com>

Closes #11869 from sameeragarwal/bigdecimal-parquet.
1 parent 43ef1e5 commit 7299961

File tree

2 files changed

+13
-4
lines changed

2 files changed

+13
-4
lines changed

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,11 @@ private void decodeDictionaryIds(int rowId, int num, ColumnVector column,
262262
Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
263263
column.putLong(i, CatalystRowConverter.binaryToUnscaledLong(v));
264264
}
265+
} else if (DecimalType.isByteArrayDecimalType(column.dataType())) {
266+
for (int i = rowId; i < rowId + num; ++i) {
267+
Binary v = dictionary.decodeToBinary(dictionaryIds.getInt(i));
268+
column.putByteArray(i, v.getBytes());
269+
}
265270
} else {
266271
throw new NotImplementedException();
267272
}
@@ -368,6 +373,14 @@ private void readFixedLenByteArrayBatch(int rowId, int num,
368373
column.putNull(rowId + i);
369374
}
370375
}
376+
} else if (DecimalType.isByteArrayDecimalType(column.dataType())) {
377+
for (int i = 0; i < num; i++) {
378+
if (defColumn.readInteger() == maxDefLevel) {
379+
column.putByteArray(rowId + i, data.readBinary(arrayLen).getBytes());
380+
} else {
381+
column.putNull(rowId + i);
382+
}
383+
}
371384
} else {
372385
throw new NotImplementedException("Unimplemented type: " + column.dataType());
373386
}

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -220,10 +220,6 @@ private void initializeInternal() throws IOException {
220220
originalTypes[i] != OriginalType.INT_8 && originalTypes[i] != OriginalType.INT_16) {
221221
throw new IOException("Unsupported type: " + t);
222222
}
223-
if (originalTypes[i] == OriginalType.DECIMAL &&
224-
primitiveType.getDecimalMetadata().getPrecision() > Decimal.MAX_LONG_DIGITS()) {
225-
throw new IOException("Decimal with high precision is not supported.");
226-
}
227223
if (primitiveType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT96) {
228224
throw new IOException("Int96 not supported.");
229225
}

0 commit comments

Comments (0)