[SPARK-23047][PYTHON][SQL] Change MapVector to NullableMapVector in ArrowColumnVector

icexelloss · HyukjinKwon · commit 6e509fde3f05 · 2018-01-18T07:26:57.000+09:00
## What changes were proposed in this pull request?

This PR changes the usage of `MapVector` in the Spark codebase to `NullableMapVector`. `MapVector` is an internal Arrow class that is not supposed to be used directly; `NullableMapVector` should be used instead.

## How was this patch tested?

Existing tests.

Author: Li Jin <ice.xelloss@gmail.com>

Closes #20239 from icexelloss/arrow-map-vector.

(cherry picked from commit 4e6f8fb)
Signed-off-by: hyukjinkwon <gurwls223@gmail.com>
diff --git a/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java
@@ -247,8 +247,8 @@ public ArrowColumnVector(ValueVector vector) {
 
       childColumns = new ArrowColumnVector[1];
       childColumns[0] = new ArrowColumnVector(listVector.getDataVector());
-    } else if (vector instanceof MapVector) {
-      MapVector mapVector = (MapVector) vector;
+    } else if (vector instanceof NullableMapVector) {
+      NullableMapVector mapVector = (NullableMapVector) vector;
       accessor = new StructAccessor(mapVector);
 
       childColumns = new ArrowColumnVector[mapVector.size()];
@@ -553,9 +553,16 @@ final int getArrayOffset(int rowId) {
     }
   }
 
+  /**
+   * Any call to "get" method will throw UnsupportedOperationException.
+   *
+   * Access struct values in a ArrowColumnVector doesn't use this accessor. Instead, it uses getStruct() method defined
+   * in the parent class. Any call to "get" method in this class is a bug in the code.
+   *
+   */
   private static class StructAccessor extends ArrowVectorAccessor {
 
-    StructAccessor(MapVector vector) {
+    StructAccessor(NullableMapVector vector) {
       super(vector);
     }
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala
@@ -322,6 +322,42 @@ class ArrowColumnVectorSuite extends SparkFunSuite {
     allocator.close()
   }
 
+  test("non nullable struct") {
+    val allocator = ArrowUtils.rootAllocator.newChildAllocator("struct", 0, Long.MaxValue)
+    val schema = new StructType().add("int", IntegerType).add("long", LongType)
+    val vector = ArrowUtils.toArrowField("struct", schema, nullable = false, null)
+      .createVector(allocator).asInstanceOf[NullableMapVector]
+
+    vector.allocateNew()
+    val intVector = vector.getChildByOrdinal(0).asInstanceOf[IntVector]
+    val longVector = vector.getChildByOrdinal(1).asInstanceOf[BigIntVector]
+
+    vector.setIndexDefined(0)
+    intVector.setSafe(0, 1)
+    longVector.setSafe(0, 1L)
+
+    vector.setIndexDefined(1)
+    intVector.setSafe(1, 2)
+    longVector.setNull(1)
+
+    vector.setValueCount(2)
+
+    val columnVector = new ArrowColumnVector(vector)
+    assert(columnVector.dataType === schema)
+    assert(columnVector.numNulls === 0)
+
+    val row0 = columnVector.getStruct(0, 2)
+    assert(row0.getInt(0) === 1)
+    assert(row0.getLong(1) === 1L)
+
+    val row1 = columnVector.getStruct(1, 2)
+    assert(row1.getInt(0) === 2)
+    assert(row1.isNullAt(1))
+
+    columnVector.close()
+    allocator.close()
+  }
+
   test("struct") {
     val allocator = ArrowUtils.rootAllocator.newChildAllocator("struct", 0, Long.MaxValue)
     val schema = new StructType().add("int", IntegerType).add("long", LongType)

Original file line number	Diff line number	Diff line change
`@@ -247,8 +247,8 @@ public ArrowColumnVector(ValueVector vector) {`
`247`	`247`
`248`	`248`	`childColumns = new ArrowColumnVector[1];`
`249`	`249`	`childColumns[0] = new ArrowColumnVector(listVector.getDataVector());`
`250`		`- } else if (vector instanceof MapVector) {`
`251`		`- MapVector mapVector = (MapVector) vector;`
	`250`	`+ } else if (vector instanceof NullableMapVector) {`
	`251`	`+ NullableMapVector mapVector = (NullableMapVector) vector;`
`252`	`252`	`accessor = new StructAccessor(mapVector);`
`253`	`253`
`254`	`254`	`childColumns = new ArrowColumnVector[mapVector.size()];`
`@@ -553,9 +553,16 @@ final int getArrayOffset(int rowId) {`
`553`	`553`	`}`
`554`	`554`	`}`
`555`	`555`
	`556`	`+ /**`
	`557`	`+ * Any call to "get" method will throw UnsupportedOperationException.`
	`558`	`+ *`
	`559`	`+ * Access struct values in a ArrowColumnVector doesn't use this accessor. Instead, it uses getStruct() method defined`
	`560`	`+ * in the parent class. Any call to "get" method in this class is a bug in the code.`
	`561`	`+ *`
	`562`	`+ */`
`556`	`563`	`private static class StructAccessor extends ArrowVectorAccessor {`
`557`	`564`
`558`		`- StructAccessor(MapVector vector) {`
	`565`	`+ StructAccessor(NullableMapVector vector) {`
`559`	`566`	`super(vector);`
`560`	`567`	`}`
`561`	`568`	`}`