
Commit cd7f5a7

cloud-fan authored and rdblue committed
[SPARK-24990][SQL] merge ReadSupport and ReadSupportWithSchema
Regarding user-specified schema, data sources may have 3 different behaviors:

1. must have a user-specified schema
2. can't have a user-specified schema
3. can accept the user-specified schema if it's given, or infer the schema

I added `ReadSupportWithSchema` to support these behaviors, following data source v1. But it turns out we don't need this extra interface: we can just add a `createReader(schema, options)` method to `ReadSupport` with a default implementation (in this patch the default throws `UnsupportedOperationException`, so only sources that accept a user-specified schema need to override it).

TODO: also fix the streaming API in follow-up PRs.

Tested with existing tests.

Author: Wenchen Fan <wenchen@databricks.com>

Closes apache#21946 from cloud-fan/ds-schema.
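With the two interfaces merged, a source that honors a user-specified schema when given and infers one otherwise (behavior 3 above) just overrides both `createReader` overloads on `ReadSupport`. Below is a minimal sketch against the post-patch API; the class name, the "inferred" schema and the empty partition list are illustrative placeholders, not part of this commit.

```java
import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.sources.v2.DataSourceOptions;
import org.apache.spark.sql.sources.v2.DataSourceV2;
import org.apache.spark.sql.sources.v2.ReadSupport;
import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
import org.apache.spark.sql.sources.v2.reader.InputPartition;
import org.apache.spark.sql.types.StructType;

// Hypothetical source: honors the user-specified schema when given, otherwise
// falls back to a schema it "infers" (hard-coded here to keep the sketch small).
public class OptionalSchemaDataSource implements DataSourceV2, ReadSupport {

  class Reader implements DataSourceReader {
    private final StructType schema;

    Reader(StructType schema) {
      this.schema = schema;
    }

    @Override
    public StructType readSchema() {
      return schema;
    }

    @Override
    public List<InputPartition<InternalRow>> planInputPartitions() {
      // A real source would plan its scan here; the sketch returns no partitions.
      return Collections.emptyList();
    }
  }

  // No user schema: infer it (a fixed placeholder schema in this sketch).
  @Override
  public DataSourceReader createReader(DataSourceOptions options) {
    return new Reader(new StructType().add("i", "int").add("s", "string"));
  }

  // User schema supplied: use it as-is.
  @Override
  public DataSourceReader createReader(StructType schema, DataSourceOptions options) {
    return new Reader(schema);
  }
}
```

A schema-required source (behavior 1) overrides the two-argument variant and makes the options-only one throw, as the test sources changed below do; a source that never accepts a user schema (behavior 2) overrides neither and simply inherits the new default, which throws `UnsupportedOperationException`.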
1 parent dcbdeb4 commit cd7f5a7

File tree

6 files changed: +46 -78 lines

sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupport.java

Lines changed: 25 additions & 0 deletions
```diff
@@ -18,7 +18,9 @@
 package org.apache.spark.sql.sources.v2;
 
 import org.apache.spark.annotation.InterfaceStability;
+import org.apache.spark.sql.sources.DataSourceRegister;
 import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
+import org.apache.spark.sql.types.StructType;
 
 /**
  * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to
@@ -27,6 +29,29 @@
 @InterfaceStability.Evolving
 public interface ReadSupport extends DataSourceV2 {
 
+  /**
+   * Creates a {@link DataSourceReader} to scan the data from this data source.
+   *
+   * If this method fails (by throwing an exception), the action will fail and no Spark job will be
+   * submitted.
+   *
+   * @param schema the user specified schema.
+   * @param options the options for the returned data source reader, which is an immutable
+   *                case-insensitive string-to-string map.
+   *
+   * By default this method throws {@link UnsupportedOperationException}, implementations should
+   * override this method to handle user specified schema.
+   */
+  default DataSourceReader createReader(StructType schema, DataSourceOptions options) {
+    String name;
+    if (this instanceof DataSourceRegister) {
+      name = ((DataSourceRegister) this).shortName();
+    } else {
+      name = this.getClass().getName();
+    }
+    throw new UnsupportedOperationException(name + " does not support user specified schema");
+  }
+
   /**
    * Creates a {@link DataSourceReader} to scan the data from this data source.
    *
```
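The new default above also determines how a non-compliant source is named in the error message: if the source registers a short name via `DataSourceRegister`, that name is used, otherwise its class name. A hedged sketch of behavior 2 under the same API (the class, short name and fixed schema are hypothetical): it overrides only `createReader(options)`, so a `.schema(...)` read against it surfaces the default's `UnsupportedOperationException` as "my-fixed-source does not support user specified schema".

```java
import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.sources.DataSourceRegister;
import org.apache.spark.sql.sources.v2.DataSourceOptions;
import org.apache.spark.sql.sources.v2.DataSourceV2;
import org.apache.spark.sql.sources.v2.ReadSupport;
import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
import org.apache.spark.sql.sources.v2.reader.InputPartition;
import org.apache.spark.sql.types.StructType;

// Hypothetical source with a fixed schema. It does NOT override
// createReader(StructType, DataSourceOptions), so a user-specified schema hits the
// default above and fails with the short name registered below.
public class FixedSchemaDataSource implements DataSourceV2, ReadSupport, DataSourceRegister {

  @Override
  public String shortName() {
    return "my-fixed-source"; // placeholder; used by the default's error message
  }

  @Override
  public DataSourceReader createReader(DataSourceOptions options) {
    return new DataSourceReader() {
      @Override
      public StructType readSchema() {
        return new StructType().add("value", "string"); // the source's own schema
      }

      @Override
      public List<InputPartition<InternalRow>> planInputPartitions() {
        return Collections.emptyList(); // sketch: no data
      }
    };
  }
}
```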

sql/core/src/main/java/org/apache/spark/sql/sources/v2/ReadSupportWithSchema.java

Lines changed: 0 additions & 49 deletions
This file was deleted.

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceReader.java

Lines changed: 1 addition & 2 deletions
```diff
@@ -23,13 +23,12 @@
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
 import org.apache.spark.sql.sources.v2.ReadSupport;
-import org.apache.spark.sql.sources.v2.ReadSupportWithSchema;
 import org.apache.spark.sql.types.StructType;
 
 /**
  * A data source reader that is returned by
  * {@link ReadSupport#createReader(DataSourceOptions)} or
- * {@link ReadSupportWithSchema#createReader(StructType, DataSourceOptions)}.
+ * {@link ReadSupport#createReader(StructType, DataSourceOptions)}.
  * It can mix in various query optimization interfaces to speed up the data scan. The actual scan
  * logic is delegated to {@link InputPartition}s, which are returned by
  * {@link #planInputPartitions()}.
```

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala

Lines changed: 2 additions & 18 deletions
```diff
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, CatalogStorageFor
 import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, NamedExpression}
 import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics, SupportsPhysicalStats}
 import org.apache.spark.sql.sources.DataSourceRegister
-import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport, ReadSupportWithSchema, WriteSupport}
+import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport, WriteSupport}
 import org.apache.spark.sql.sources.v2.reader._
 import org.apache.spark.sql.sources.v2.writer.DataSourceWriter
 import org.apache.spark.sql.types.StructType
@@ -121,22 +121,6 @@ object DataSourceV2Relation {
       source match {
         case support: ReadSupport =>
           support
-        case _: ReadSupportWithSchema =>
-          // this method is only called if there is no user-supplied schema. if there is no
-          // user-supplied schema and ReadSupport was not implemented, throw a helpful exception.
-          throw new AnalysisException(s"Data source requires a user-supplied schema: $name")
-        case _ =>
-          throw new AnalysisException(s"Data source is not readable: $name")
-      }
-    }
-
-    def asReadSupportWithSchema: ReadSupportWithSchema = {
-      source match {
-        case support: ReadSupportWithSchema =>
-          support
-        case _: ReadSupport =>
-          throw new AnalysisException(
-            s"Data source does not support user-supplied schema: $name")
         case _ =>
           throw new AnalysisException(s"Data source is not readable: $name")
       }
@@ -166,7 +150,7 @@
       val v2Options = new DataSourceOptions(options.asJava)
       userSpecifiedSchema match {
         case Some(s) =>
-          asReadSupportWithSchema.createReader(s, v2Options)
+          asReadSupport.createReader(s, v2Options)
         case _ =>
           asReadSupport.createReader(v2Options)
       }
```
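With the resolution above, the read path now keys off whether a user schema was supplied: a present `userSpecifiedSchema` routes to `asReadSupport.createReader(s, v2Options)`, otherwise to `asReadSupport.createReader(v2Options)`. A caller-side sketch of the two paths; the format class is the hypothetical `OptionalSchemaDataSource` from the earlier sketch, so this is illustrative, not a test from this commit.

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructType;

public class ReadPathDemo {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("ReadPathDemo")
        .getOrCreate();

    StructType schema = new StructType().add("i", "int").add("s", "string");

    // No user schema: DataSourceV2Relation calls createReader(options),
    // so the source supplies its own (inferred or fixed) schema.
    Dataset<Row> inferred = spark.read()
        .format(OptionalSchemaDataSource.class.getName())
        .load();

    // User schema supplied: DataSourceV2Relation calls createReader(schema, options).
    Dataset<Row> withUserSchema = spark.read()
        .format(OptionalSchemaDataSource.class.getName())
        .schema(schema)
        .load();

    inferred.printSchema();
    withUserSchema.printSchema();

    spark.stop();
  }
}
```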

sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java

Lines changed: 7 additions & 2 deletions
```diff
@@ -22,12 +22,12 @@
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
 import org.apache.spark.sql.sources.v2.DataSourceV2;
-import org.apache.spark.sql.sources.v2.ReadSupportWithSchema;
+import org.apache.spark.sql.sources.v2.ReadSupport;
 import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
 import org.apache.spark.sql.sources.v2.reader.InputPartition;
 import org.apache.spark.sql.types.StructType;
 
-public class JavaSchemaRequiredDataSource implements DataSourceV2, ReadSupportWithSchema {
+public class JavaSchemaRequiredDataSource implements DataSourceV2, ReadSupport {
 
   class Reader implements DataSourceReader {
     private final StructType schema;
@@ -47,6 +47,11 @@ public List<InputPartition<InternalRow>> planInputPartitions() {
     }
   }
 
+  @Override
+  public DataSourceReader createReader(DataSourceOptions options) {
+    throw new IllegalArgumentException("requires a user-supplied schema");
+  }
+
   @Override
   public DataSourceReader createReader(StructType schema, DataSourceOptions options) {
     return new Reader(schema);
```

sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala

Lines changed: 11 additions & 7 deletions
```diff
@@ -21,10 +21,9 @@ import java.util.{ArrayList, List => JList}
 
 import test.org.apache.spark.sql.sources.v2._
 
-import org.apache.spark.{SparkConf, SparkException}
-import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row}
+import org.apache.spark.SparkException
+import org.apache.spark.sql.{DataFrame, QueryTest, Row}
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2ScanExec}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.sources.{Filter, GreaterThan}
@@ -120,8 +119,8 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext {
   test("schema required data source") {
     Seq(classOf[SchemaRequiredDataSource], classOf[JavaSchemaRequiredDataSource]).foreach { cls =>
       withClue(cls.getName) {
-        val e = intercept[AnalysisException](spark.read.format(cls.getName).load())
-        assert(e.message.contains("requires a user-supplied schema"))
+        val e = intercept[IllegalArgumentException](spark.read.format(cls.getName).load())
+        assert(e.getMessage.contains("requires a user-supplied schema"))
 
         val schema = new StructType().add("i", "int").add("s", "string")
         val df = spark.read.format(cls.getName).schema(schema).load()
@@ -381,13 +380,18 @@ class AdvancedInputPartition(start: Int, end: Int, requiredSchema: StructType)
 }
 
 
-class SchemaRequiredDataSource extends DataSourceV2 with ReadSupportWithSchema {
+class SchemaRequiredDataSource extends DataSourceV2 with ReadSupport {
 
   class Reader(val readSchema: StructType) extends DataSourceReader {
     override def planInputPartitions(): JList[InputPartition[InternalRow]] =
       java.util.Collections.emptyList()
   }
 
-  override def createReader(schema: StructType, options: DataSourceOptions): DataSourceReader =
+  override def createReader(options: DataSourceOptions): DataSourceReader = {
+    throw new IllegalArgumentException("requires a user-supplied schema")
+  }
+
+  override def createReader(schema: StructType, options: DataSourceOptions): DataSourceReader = {
     new Reader(schema)
+  }
 }
```
