
Commit 680fb54

[SPARK-23325] Use InternalRow when reading with DataSourceV2.
This updates the DataSourceV2 API to use InternalRow instead of Row for the default case with no scan mix-ins.

Support for readers that produce Row is added through SupportsDeprecatedScanRow, which matches the previous API. Readers that used Row now implement this class and should be migrated to InternalRow. Readers that previously implemented SupportsScanUnsafeRow have been migrated to use no SupportsScan mix-ins and produce InternalRow.

This change is covered by existing tests.

Author: Ryan Blue <blue@apache.org>

Closes apache#21118 from rdblue/SPARK-23325-datasource-v2-internal-row.
1 parent 9b85abe commit 680fb54
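
To make the migration concrete, here is a minimal sketch (not code from this commit; the class names ExampleRowReader and ExampleInternalRowReader are hypothetical) of a reader kept on the deprecated Row path next to one using the new InternalRow default:

import java.util.{List => JList}

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, InputPartition, SupportsDeprecatedScanRow}
import org.apache.spark.sql.types.StructType

// Hypothetical reader that stays on the deprecated Row API: it mixes in
// SupportsDeprecatedScanRow and implements planRowInputPartitions().
class ExampleRowReader extends DataSourceReader with SupportsDeprecatedScanRow {
  override def readSchema(): StructType = new StructType().add("i", "int")
  override def planRowInputPartitions(): JList[InputPartition[Row]] =
    java.util.Collections.emptyList()
}

// Hypothetical reader migrated to the new default: no scan mix-in,
// planInputPartitions() produces InternalRow partitions directly.
class ExampleInternalRowReader extends DataSourceReader {
  override def readSchema(): StructType = new StructType().add("i", "int")
  override def planInputPartitions(): JList[InputPartition[InternalRow]] =
    java.util.Collections.emptyList()
}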

11 files changed: 58 additions, 60 deletions


sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceReader.java

Lines changed: 6 additions & 3 deletions
@@ -20,7 +20,7 @@
 import java.util.List;
 
 import org.apache.spark.annotation.InterfaceStability;
-import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
 import org.apache.spark.sql.sources.v2.ReadSupport;
 import org.apache.spark.sql.sources.v2.ReadSupportWithSchema;
@@ -40,7 +40,10 @@
  * 2. Information Reporting. E.g., statistics reporting, ordering reporting, etc.
  *    Names of these interfaces start with `SupportsReporting`.
  * 3. Special scans. E.g, columnar scan, unsafe row scan, etc.
- *    Names of these interfaces start with `SupportsScan`.
+ *    Names of these interfaces start with `SupportsScan`. Note that a reader should only
+ *    implement at most one of the special scans, if more than one special scans are implemented,
+ *    only one of them would be respected, according to the priority list from high to low:
+ *    {@link SupportsScanColumnarBatch}, {@link SupportsDeprecatedScanRow}.
  *
  * If an exception was throw when applying any of these query optimizations, the action will fail
  * and no Spark job will be submitted.
@@ -73,5 +76,5 @@ public interface DataSourceReader {
   * If this method fails (by throwing an exception), the action will fail and no Spark job will be
   * submitted.
   */
-  List<InputPartition<Row>> planInputPartitions();
+  List<InputPartition<InternalRow>> planInputPartitions();
 }

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/InputPartitionReader.java

Lines changed: 4 additions & 3 deletions
@@ -26,9 +26,10 @@
  * An input partition reader returned by {@link InputPartition#createPartitionReader()} and is
  * responsible for outputting data for a RDD partition.
  *
- * Note that, Currently the type `T` can only be {@link org.apache.spark.sql.Row} for normal input
- * partition readers, or {@link org.apache.spark.sql.catalyst.expressions.UnsafeRow} for input
- * partition readers that mix in {@link SupportsScanUnsafeRow}.
+ * Note that, Currently the type `T` can only be {@link org.apache.spark.sql.catalyst.InternalRow}
+ * for normal data source readers, {@link org.apache.spark.sql.vectorized.ColumnarBatch} for data
+ * source readers that mix in {@link SupportsScanColumnarBatch}, or {@link org.apache.spark.sql.Row}
+ * for data source readers that mix in {@link SupportsDeprecatedScanRow}.
  */
 @InterfaceStability.Evolving
 public interface InputPartitionReader<T> extends Closeable {

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java renamed to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsDeprecatedScanRow.java

Lines changed: 9 additions & 16 deletions
@@ -17,30 +17,23 @@
 
 package org.apache.spark.sql.sources.v2.reader;
 
-import java.util.List;
-
 import org.apache.spark.annotation.InterfaceStability;
 import org.apache.spark.sql.Row;
-import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
+import org.apache.spark.sql.catalyst.InternalRow;
+
+import java.util.List;
 
 /**
  * A mix-in interface for {@link DataSourceReader}. Data source readers can implement this
- * interface to output {@link UnsafeRow} directly and avoid the row copy at Spark side.
- * This is an experimental and unstable interface, as {@link UnsafeRow} is not public and may get
- * changed in the future Spark versions.
+ * interface to output {@link Row} instead of {@link InternalRow}.
+ * This is an experimental and unstable interface.
 */
 @InterfaceStability.Unstable
-public interface SupportsScanUnsafeRow extends DataSourceReader {
-
-  @Override
-  default List<InputPartition<Row>> planInputPartitions() {
+public interface SupportsDeprecatedScanRow extends DataSourceReader {
+  default List<InputPartition<InternalRow>> planInputPartitions() {
     throw new IllegalStateException(
-        "planInputPartitions not supported by default within SupportsScanUnsafeRow");
+        "planInputPartitions not supported by default within SupportsDeprecatedScanRow");
  }
 
-  /**
-   * Similar to {@link DataSourceReader#planInputPartitions()},
-   * but returns data in unsafe row format.
-   */
-  List<InputPartition<UnsafeRow>> planUnsafeInputPartitions();
+  List<InputPartition<Row>> planRowInputPartitions();
 }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala

Lines changed: 9 additions & 8 deletions
@@ -54,12 +54,13 @@ case class DataSourceV2ScanExec(
     Seq(output, source, options).hashCode()
   }
 
-  private lazy val partitions: Seq[InputPartition[UnsafeRow]] = reader match {
-    case r: SupportsScanUnsafeRow => r.planUnsafeInputPartitions().asScala
-    case _ =>
-      reader.planInputPartitions().asScala.map {
-        new RowToUnsafeRowInputPartition(_, reader.readSchema()): InputPartition[UnsafeRow]
+  private lazy val partitions: Seq[InputPartition[InternalRow]] = reader match {
+    case r: SupportsDeprecatedScanRow =>
+      r.planRowInputPartitions().asScala.map {
+        new RowToUnsafeRowInputPartition(_, reader.readSchema()): InputPartition[InternalRow]
       }
+    case _ =>
+      reader.planInputPartitions().asScala
   }
 
   private lazy val inputRDD: RDD[InternalRow] = reader match {
@@ -80,11 +81,11 @@
   }
 
 class RowToUnsafeRowInputPartition(partition: InputPartition[Row], schema: StructType)
-  extends InputPartition[UnsafeRow] {
+  extends InputPartition[InternalRow] {
 
   override def preferredLocations: Array[String] = partition.preferredLocations
 
-  override def createPartitionReader: InputPartitionReader[UnsafeRow] = {
+  override def createPartitionReader: InputPartitionReader[InternalRow] = {
     new RowToUnsafeInputPartitionReader(
       partition.createPartitionReader, RowEncoder.apply(schema).resolveAndBind())
   }
@@ -94,7 +95,7 @@ class RowToUnsafeInputPartitionReader(
     val rowReader: InputPartitionReader[Row],
     encoder: ExpressionEncoder[Row])
 
-  extends InputPartitionReader[UnsafeRow] {
+  extends InputPartitionReader[InternalRow] {
 
   override def next: Boolean = rowReader.next
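
For readers that stay on the Row API, the wrapper above converts each Row to an InternalRow through a bound encoder. A rough sketch of that conversion (assuming Spark 2.x's ExpressionEncoder.toRow; ExampleRowConvertingReader is a hypothetical stand-in, not the literal class body from this commit):

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.sources.v2.reader.InputPartitionReader
import org.apache.spark.sql.types.StructType

// Hypothetical converter: pulls Rows from a deprecated reader and turns each
// one into an InternalRow with an encoder bound to the read schema.
class ExampleRowConvertingReader(rowReader: InputPartitionReader[Row], schema: StructType)
  extends InputPartitionReader[InternalRow] {

  private val encoder: ExpressionEncoder[Row] = RowEncoder.apply(schema).resolveAndBind()

  override def next(): Boolean = rowReader.next()
  override def get(): InternalRow = encoder.toRow(rowReader.get())
  override def close(): Unit = rowReader.close()
}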

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala

Lines changed: 2 additions & 7 deletions
@@ -124,13 +124,8 @@ object DataSourceV2Strategy extends Strategy {
       val filterCondition = postScanFilters.reduceLeftOption(And)
       val withFilter = filterCondition.map(FilterExec(_, scan)).getOrElse(scan)
 
-      val withProjection = if (withFilter.output != project) {
-        ProjectExec(project, withFilter)
-      } else {
-        withFilter
-      }
-
-      withProjection :: Nil
+      // always add the projection, which will produce unsafe rows required by some operators
+      ProjectExec(project, withFilter) :: Nil
 
     case WriteToDataSourceV2(writer, query) =>
       WriteToDataSourceV2Exec(writer, planLater(query)) :: Nil

sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java

Lines changed: 2 additions & 2 deletions
@@ -33,7 +33,7 @@
 public class JavaAdvancedDataSourceV2 implements DataSourceV2, ReadSupport {
 
   public class Reader implements DataSourceReader, SupportsPushDownRequiredColumns,
-      SupportsPushDownFilters {
+      SupportsPushDownFilters, SupportsDeprecatedScanRow {
 
     // Exposed for testing.
     public StructType requiredSchema = new StructType().add("i", "int").add("j", "int");
@@ -79,7 +79,7 @@ public Filter[] pushedFilters() {
     }
 
     @Override
-    public List<InputPartition<Row>> planInputPartitions() {
+    public List<InputPartition<Row>> planRowInputPartitions() {
       List<InputPartition<Row>> res = new ArrayList<>();
 
       Integer lowerBound = null;

sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java

Lines changed: 3 additions & 2 deletions
@@ -25,11 +25,12 @@
 import org.apache.spark.sql.sources.v2.ReadSupportWithSchema;
 import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
 import org.apache.spark.sql.sources.v2.reader.InputPartition;
+import org.apache.spark.sql.sources.v2.reader.SupportsDeprecatedScanRow;
 import org.apache.spark.sql.types.StructType;
 
 public class JavaSchemaRequiredDataSource implements DataSourceV2, ReadSupportWithSchema {
 
-  class Reader implements DataSourceReader {
+  class Reader implements DataSourceReader, SupportsDeprecatedScanRow {
     private final StructType schema;
 
     Reader(StructType schema) {
@@ -42,7 +43,7 @@ public StructType readSchema() {
     }
 
     @Override
-    public List<InputPartition<Row>> planInputPartitions() {
+    public List<InputPartition<Row>> planRowInputPartitions() {
       return java.util.Collections.emptyList();
     }
   }

sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleDataSourceV2.java

Lines changed: 3 additions & 2 deletions
@@ -28,11 +28,12 @@
 import org.apache.spark.sql.sources.v2.reader.InputPartitionReader;
 import org.apache.spark.sql.sources.v2.reader.InputPartition;
 import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
+import org.apache.spark.sql.sources.v2.reader.SupportsDeprecatedScanRow;
 import org.apache.spark.sql.types.StructType;
 
 public class JavaSimpleDataSourceV2 implements DataSourceV2, ReadSupport {
 
-  class Reader implements DataSourceReader {
+  class Reader implements DataSourceReader, SupportsDeprecatedScanRow {
     private final StructType schema = new StructType().add("i", "int").add("j", "int");
 
     @Override
@@ -41,7 +42,7 @@ public StructType readSchema() {
     }
 
     @Override
-    public List<InputPartition<Row>> planInputPartitions() {
+    public List<InputPartition<Row>> planRowInputPartitions() {
       return java.util.Arrays.asList(
         new JavaSimpleInputPartition(0, 5),
         new JavaSimpleInputPartition(5, 10));

sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaUnsafeRowDataSourceV2.java

Lines changed: 5 additions & 4 deletions
@@ -20,6 +20,7 @@
 import java.io.IOException;
 import java.util.List;
 
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
 import org.apache.spark.sql.sources.v2.DataSourceV2;
@@ -29,7 +30,7 @@
 
 public class JavaUnsafeRowDataSourceV2 implements DataSourceV2, ReadSupport {
 
-  class Reader implements DataSourceReader, SupportsScanUnsafeRow {
+  class Reader implements DataSourceReader {
     private final StructType schema = new StructType().add("i", "int").add("j", "int");
 
     @Override
@@ -38,15 +39,15 @@ public StructType readSchema() {
     }
 
     @Override
-    public List<InputPartition<UnsafeRow>> planUnsafeInputPartitions() {
+    public List<InputPartition<InternalRow>> planInputPartitions() {
       return java.util.Arrays.asList(
         new JavaUnsafeRowInputPartition(0, 5),
         new JavaUnsafeRowInputPartition(5, 10));
     }
   }
 
   static class JavaUnsafeRowInputPartition
-      implements InputPartition<UnsafeRow>, InputPartitionReader<UnsafeRow> {
+      implements InputPartition<InternalRow>, InputPartitionReader<InternalRow> {
     private int start;
     private int end;
     private UnsafeRow row;
@@ -59,7 +60,7 @@ static class JavaUnsafeRowInputPartition
     }
 
     @Override
-    public InputPartitionReader<UnsafeRow> createPartitionReader() {
+    public InputPartitionReader<InternalRow> createPartitionReader() {
       return new JavaUnsafeRowInputPartition(start - 1, end);
     }

sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala

Lines changed: 11 additions & 10 deletions
@@ -23,6 +23,7 @@ import test.org.apache.spark.sql.sources.v2._
 
 import org.apache.spark.{SparkConf, SparkException}
 import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row}
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2ScanExec}
 import org.apache.spark.sql.functions._
@@ -283,10 +284,10 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext {
 
 class SimpleDataSourceV2 extends DataSourceV2 with ReadSupport {
 
-  class Reader extends DataSourceReader {
+  class Reader extends DataSourceReader with SupportsDeprecatedScanRow {
     override def readSchema(): StructType = new StructType().add("i", "int").add("j", "int")
 
-    override def planInputPartitions(): JList[InputPartition[Row]] = {
+    override def planRowInputPartitions(): JList[InputPartition[Row]] = {
       java.util.Arrays.asList(new SimpleInputPartition(0, 5), new SimpleInputPartition(5, 10))
     }
   }
@@ -316,7 +317,7 @@ class SimpleInputPartition(start: Int, end: Int)
 
 class AdvancedDataSourceV2 extends DataSourceV2 with ReadSupport {
 
-  class Reader extends DataSourceReader
+  class Reader extends DataSourceReader with SupportsDeprecatedScanRow
     with SupportsPushDownRequiredColumns with SupportsPushDownFilters {
 
     var requiredSchema = new StructType().add("i", "int").add("j", "int")
@@ -341,7 +342,7 @@ class AdvancedDataSourceV2 extends DataSourceV2 with ReadSupport {
       requiredSchema
     }
 
-    override def planInputPartitions(): JList[InputPartition[Row]] = {
+    override def planRowInputPartitions(): JList[InputPartition[Row]] = {
       val lowerBound = filters.collect {
         case GreaterThan("i", v: Int) => v
       }.headOption
@@ -393,10 +394,10 @@ class AdvancedInputPartition(start: Int, end: Int, requiredSchema: StructType)
 
 class UnsafeRowDataSourceV2 extends DataSourceV2 with ReadSupport {
 
-  class Reader extends DataSourceReader with SupportsScanUnsafeRow {
+  class Reader extends DataSourceReader {
     override def readSchema(): StructType = new StructType().add("i", "int").add("j", "int")
 
-    override def planUnsafeInputPartitions(): JList[InputPartition[UnsafeRow]] = {
+    override def planInputPartitions(): JList[InputPartition[InternalRow]] = {
       java.util.Arrays.asList(new UnsafeRowInputPartitionReader(0, 5),
         new UnsafeRowInputPartitionReader(5, 10))
     }
@@ -406,14 +407,14 @@ class UnsafeRowDataSourceV2 extends DataSourceV2 with ReadSupport {
 }
 
 class UnsafeRowInputPartitionReader(start: Int, end: Int)
-  extends InputPartition[UnsafeRow] with InputPartitionReader[UnsafeRow] {
+  extends InputPartition[InternalRow] with InputPartitionReader[InternalRow] {
 
   private val row = new UnsafeRow(2)
   row.pointTo(new Array[Byte](8 * 3), 8 * 3)
 
   private var current = start - 1
 
-  override def createPartitionReader(): InputPartitionReader[UnsafeRow] = this
+  override def createPartitionReader(): InputPartitionReader[InternalRow] = this
 
   override def next(): Boolean = {
     current += 1
@@ -430,8 +431,8 @@ class UnsafeRowInputPartitionReader(start: Int, end: Int)
 
 class SchemaRequiredDataSource extends DataSourceV2 with ReadSupportWithSchema {
 
-  class Reader(val readSchema: StructType) extends DataSourceReader {
-    override def planInputPartitions(): JList[InputPartition[Row]] =
+  class Reader(val readSchema: StructType) extends DataSourceReader with SupportsDeprecatedScanRow {
+    override def planRowInputPartitions(): JList[InputPartition[Row]] =
      java.util.Collections.emptyList()
   }

sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala

Lines changed: 4 additions & 3 deletions
@@ -28,7 +28,7 @@ import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, Path}
 import org.apache.spark.SparkContext
 import org.apache.spark.sql.{Row, SaveMode}
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, InputPartition, InputPartitionReader}
+import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, InputPartition, InputPartitionReader, SupportsDeprecatedScanRow}
 import org.apache.spark.sql.sources.v2.writer._
 import org.apache.spark.sql.types.{DataType, StructType}
 import org.apache.spark.util.SerializableConfiguration
@@ -42,10 +42,11 @@ class SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteS
 
   private val schema = new StructType().add("i", "long").add("j", "long")
 
-  class Reader(path: String, conf: Configuration) extends DataSourceReader {
+  class Reader(path: String, conf: Configuration) extends DataSourceReader
+    with SupportsDeprecatedScanRow {
     override def readSchema(): StructType = schema
 
-    override def planInputPartitions(): JList[InputPartition[Row]] = {
+    override def planRowInputPartitions(): JList[InputPartition[Row]] = {
       val dataPath = new Path(path)
       val fs = dataPath.getFileSystem(conf)
       if (fs.exists(dataPath)) {
