Commit dcbdeb4

cloud-fan authored and rdblue committed
[SPARK-24971][SQL] remove SupportsDeprecatedScanRow
This is a follow-up of apache#21118, where we added `SupportsDeprecatedScanRow`. Ideally a data source should produce `InternalRow` instead of `Row` for better performance, so this patch removes `SupportsDeprecatedScanRow` and encourages data sources to produce `InternalRow` directly, which is also very easy to build. Covered by existing tests. Author: Wenchen Fan <wenchen@databricks.com> Closes apache#21921 from cloud-fan/row.
1 parent 680fb54 commit dcbdeb4

9 files changed: +51 −226 lines
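The commit message's point that `InternalRow` is "very easy to build" can be illustrated with a minimal sketch. The reader name and the (id, name) data below are hypothetical, not part of this patch; the sketch assumes the 2.4-era `InputPartitionReader` interface shown in the diffs and builds rows with `GenericInternalRow`, using `UTF8String` because `InternalRow` holds Catalyst's internal types.

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.sources.v2.reader.InputPartitionReader
import org.apache.spark.unsafe.types.UTF8String

// Hypothetical reader (illustration only): emits (id, name) rows directly as
// InternalRow, so no Row-to-InternalRow bridge is needed anymore.
class ExampleReader extends InputPartitionReader[InternalRow] {
  private var i = -1
  private val names = Array("a", "b", "c")

  override def next(): Boolean = { i += 1; i < names.length }

  // InternalRow expects Catalyst internal types, e.g. UTF8String for strings.
  override def get(): InternalRow =
    new GenericInternalRow(Array[Any](i, UTF8String.fromString(names(i))))

  override def close(): Unit = {}
}
```

With `SupportsDeprecatedScanRow` gone, `planInputPartitions()` is the single code path and readers hand Spark internal rows directly, skipping the encoder round-trip that the removed bridge performed.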

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceReader.java

Lines changed: 1 addition & 5 deletions
@@ -39,11 +39,7 @@
  *    pruning), etc. Names of these interfaces start with `SupportsPushDown`.
  * 2. Information Reporting. E.g., statistics reporting, ordering reporting, etc.
  *    Names of these interfaces start with `SupportsReporting`.
- * 3. Special scans. E.g, columnar scan, unsafe row scan, etc.
- *    Names of these interfaces start with `SupportsScan`. Note that a reader should only
- *    implement at most one of the special scans, if more than one special scans are implemented,
- *    only one of them would be respected, according to the priority list from high to low:
- *    {@link SupportsScanColumnarBatch}, {@link SupportsDeprecatedScanRow}.
+ * 3. Columnar scan if implements {@link SupportsScanColumnarBatch}.
  *
  * If an exception was throw when applying any of these query optimizations, the action will fail
  * and no Spark job will be submitted.

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/InputPartitionReader.java

Lines changed: 1 addition & 2 deletions
@@ -28,8 +28,7 @@
  *
  * Note that, Currently the type `T` can only be {@link org.apache.spark.sql.catalyst.InternalRow}
  * for normal data source readers, {@link org.apache.spark.sql.vectorized.ColumnarBatch} for data
- * source readers that mix in {@link SupportsScanColumnarBatch}, or {@link org.apache.spark.sql.Row}
- * for data source readers that mix in {@link SupportsDeprecatedScanRow}.
+ * source readers that mix in {@link SupportsScanColumnarBatch}.
  */
 @InterfaceStability.Evolving
 public interface InputPartitionReader<T> extends Closeable {

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala

Lines changed: 2 additions & 34 deletions
@@ -20,15 +20,12 @@ package org.apache.spark.sql.execution.datasources.v2
 import scala.collection.JavaConverters._
 
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.LeafExecNode
 import org.apache.spark.sql.execution.metric.SQLMetrics
 import org.apache.spark.sql.sources.v2.DataSourceV2
 import org.apache.spark.sql.sources.v2.reader._
-import org.apache.spark.sql.types.StructType
 
 /**
  * Physical plan node for scanning data from a data source.
@@ -54,13 +51,8 @@ case class DataSourceV2ScanExec(
     Seq(output, source, options).hashCode()
   }
 
-  private lazy val partitions: Seq[InputPartition[InternalRow]] = reader match {
-    case r: SupportsDeprecatedScanRow =>
-      r.planRowInputPartitions().asScala.map {
-        new RowToUnsafeRowInputPartition(_, reader.readSchema()): InputPartition[InternalRow]
-      }
-    case _ =>
-      reader.planInputPartitions().asScala
+  private lazy val partitions: Seq[InputPartition[InternalRow]] = {
+    reader.planInputPartitions().asScala
   }
 
   private lazy val inputRDD: RDD[InternalRow] = reader match {
@@ -79,27 +71,3 @@
     }
   }
 }
-
-class RowToUnsafeRowInputPartition(partition: InputPartition[Row], schema: StructType)
-  extends InputPartition[InternalRow] {
-
-  override def preferredLocations: Array[String] = partition.preferredLocations
-
-  override def createPartitionReader: InputPartitionReader[InternalRow] = {
-    new RowToUnsafeInputPartitionReader(
-      partition.createPartitionReader, RowEncoder.apply(schema).resolveAndBind())
-  }
-}
-
-class RowToUnsafeInputPartitionReader(
-    val rowReader: InputPartitionReader[Row],
-    encoder: ExpressionEncoder[Row])
-
-  extends InputPartitionReader[InternalRow] {
-
-  override def next: Boolean = rowReader.next
-
-  override def get: UnsafeRow = encoder.toRow(rowReader.get).asInstanceOf[UnsafeRow]
-
-  override def close(): Unit = rowReader.close()
-}
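Since Spark no longer wraps `Row`-producing partitions itself, a source that genuinely still has external `Row` objects in hand can do the same conversion the deleted bridge did. The sketch below is hedged: the class name `RowBridgingReader` is illustrative and not part of this commit, but the `RowEncoder.apply(schema).resolveAndBind()` / `toRow` calls mirror the removed `RowToUnsafeInputPartitionReader` above.

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.sources.v2.reader.InputPartitionReader
import org.apache.spark.sql.types.StructType

// Sketch of the conversion a data source can now do on its own side:
// wrap a Row-based reader and translate each external Row into an
// InternalRow via RowEncoder, exactly as the removed bridge class did.
class RowBridgingReader(rowReader: InputPartitionReader[Row], schema: StructType)
  extends InputPartitionReader[InternalRow] {

  private val encoder = RowEncoder(schema).resolveAndBind()

  override def next(): Boolean = rowReader.next()

  override def get(): InternalRow = encoder.toRow(rowReader.get())

  override def close(): Unit = rowReader.close()
}
```

Moving this conversion out of `DataSourceV2ScanExec` keeps Spark's scan path uniform on `InternalRow`; sources that can produce internal rows directly simply skip the encoder.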

sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java

Lines changed: 10 additions & 9 deletions
@@ -20,8 +20,8 @@
 import java.io.IOException;
 import java.util.*;
 
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.catalyst.expressions.GenericRow;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
 import org.apache.spark.sql.sources.Filter;
 import org.apache.spark.sql.sources.GreaterThan;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
@@ -33,7 +33,7 @@
 public class JavaAdvancedDataSourceV2 implements DataSourceV2, ReadSupport {
 
   public class Reader implements DataSourceReader, SupportsPushDownRequiredColumns,
-      SupportsPushDownFilters, SupportsDeprecatedScanRow {
+      SupportsPushDownFilters {
 
     // Exposed for testing.
     public StructType requiredSchema = new StructType().add("i", "int").add("j", "int");
@@ -79,8 +79,8 @@ public Filter[] pushedFilters() {
     }
 
     @Override
-    public List<InputPartition<Row>> planRowInputPartitions() {
-      List<InputPartition<Row>> res = new ArrayList<>();
+    public List<InputPartition<InternalRow>> planInputPartitions() {
+      List<InputPartition<InternalRow>> res = new ArrayList<>();
 
       Integer lowerBound = null;
       for (Filter filter : filters) {
@@ -107,7 +107,8 @@ public List<InputPartition<Row>> planRowInputPartitions() {
     }
   }
 
-  static class JavaAdvancedInputPartition implements InputPartition<Row>, InputPartitionReader<Row> {
+  static class JavaAdvancedInputPartition implements InputPartition<InternalRow>,
+      InputPartitionReader<InternalRow> {
     private int start;
     private int end;
     private StructType requiredSchema;
@@ -119,7 +120,7 @@ static class JavaAdvancedInputPartition implements InputPartition<Row>, InputPar
     }
 
     @Override
-    public InputPartitionReader<Row> createPartitionReader() {
+    public InputPartitionReader<InternalRow> createPartitionReader() {
       return new JavaAdvancedInputPartition(start - 1, end, requiredSchema);
     }
 
@@ -130,7 +131,7 @@ public boolean next() {
     }
 
     @Override
-    public Row get() {
+    public InternalRow get() {
       Object[] values = new Object[requiredSchema.size()];
      for (int i = 0; i < values.length; i++) {
        if ("i".equals(requiredSchema.apply(i).name())) {
@@ -139,7 +140,7 @@ public Row get() {
          values[i] = -start;
        }
      }
-      return new GenericRow(values);
+      return new GenericInternalRow(values);
     }
 
     @Override

sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java

Lines changed: 3 additions & 4 deletions
@@ -19,18 +19,17 @@
 
 import java.util.List;
 
-import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
 import org.apache.spark.sql.sources.v2.DataSourceV2;
 import org.apache.spark.sql.sources.v2.ReadSupportWithSchema;
 import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
 import org.apache.spark.sql.sources.v2.reader.InputPartition;
-import org.apache.spark.sql.sources.v2.reader.SupportsDeprecatedScanRow;
 import org.apache.spark.sql.types.StructType;
 
 public class JavaSchemaRequiredDataSource implements DataSourceV2, ReadSupportWithSchema {
 
-  class Reader implements DataSourceReader, SupportsDeprecatedScanRow {
+  class Reader implements DataSourceReader {
     private final StructType schema;
 
     Reader(StructType schema) {
@@ -43,7 +42,7 @@ public StructType readSchema() {
     }
 
     @Override
-    public List<InputPartition<Row>> planRowInputPartitions() {
+    public List<InputPartition<InternalRow>> planInputPartitions() {
       return java.util.Collections.emptyList();
     }
   }

sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleDataSourceV2.java

Lines changed: 10 additions & 9 deletions
@@ -20,20 +20,19 @@
 import java.io.IOException;
 import java.util.List;
 
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.catalyst.expressions.GenericRow;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
 import org.apache.spark.sql.sources.v2.DataSourceV2;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
 import org.apache.spark.sql.sources.v2.ReadSupport;
 import org.apache.spark.sql.sources.v2.reader.InputPartitionReader;
 import org.apache.spark.sql.sources.v2.reader.InputPartition;
 import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
-import org.apache.spark.sql.sources.v2.reader.SupportsDeprecatedScanRow;
 import org.apache.spark.sql.types.StructType;
 
 public class JavaSimpleDataSourceV2 implements DataSourceV2, ReadSupport {
 
-  class Reader implements DataSourceReader, SupportsDeprecatedScanRow {
+  class Reader implements DataSourceReader {
     private final StructType schema = new StructType().add("i", "int").add("j", "int");
 
     @Override
@@ -42,14 +41,16 @@ public StructType readSchema() {
     }
 
     @Override
-    public List<InputPartition<Row>> planRowInputPartitions() {
+    public List<InputPartition<InternalRow>> planInputPartitions() {
       return java.util.Arrays.asList(
         new JavaSimpleInputPartition(0, 5),
         new JavaSimpleInputPartition(5, 10));
     }
   }
 
-  static class JavaSimpleInputPartition implements InputPartition<Row>, InputPartitionReader<Row> {
+  static class JavaSimpleInputPartition implements InputPartition<InternalRow>,
+      InputPartitionReader<InternalRow> {
+
     private int start;
     private int end;
 
@@ -59,7 +60,7 @@ static class JavaSimpleInputPartition implements InputPartition<Row>, InputParti
     }
 
     @Override
-    public InputPartitionReader<Row> createPartitionReader() {
+    public InputPartitionReader<InternalRow> createPartitionReader() {
       return new JavaSimpleInputPartition(start - 1, end);
     }
 
@@ -70,8 +71,8 @@ public boolean next() {
     }
 
     @Override
-    public Row get() {
-      return new GenericRow(new Object[] {start, -start});
+    public InternalRow get() {
+      return new GenericInternalRow(new Object[] {start, -start});
     }
 
     @Override

sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaUnsafeRowDataSourceV2.java

Lines changed: 0 additions & 90 deletions
This file was deleted.
