Prototype: Data Source V2 #10
DataSourceV2.java
@@ -0,0 +1,43 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.sources.v2;

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap;
import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader;
import org.apache.spark.sql.types.StructType;

/**
 * The main interface and minimal requirement for data source v2 implementations. Users can mix in
 * more interfaces to implement functionality beyond a plain scan.
 */
public interface DataSourceV2 {

  /**
   * The main entrance of the read interface.
   *
   * @param schema the full schema of this data source reader. The full schema usually maps to the
   *               physical schema of the underlying storage of this data source reader, e.g.
   *               parquet files, JDBC tables, etc., while this reader may not read data with the
   *               full schema, as column pruning or other optimizations may happen.
   * @param options the options for this data source reader, which are case-insensitive.
   * @return a reader that implements the actual read logic.
   */
  DataSourceV2Reader createReader(
      StructType schema,
      CaseInsensitiveMap<String> options);
Inline review comments:

- Maybe not a big deal at this stage, but I wonder if this should be just ...
- Please don't use a Map. It's a huge disaster in v1.
- There might also be data sources that expect case-sensitive key names. I think it would be a good idea to pass the options exactly as the user specified them and let the data source implementation handle them as appropriate for that source.
- This has bitten me in design, not realizing all the options come through lowercased.
- Please don't do that (let the source handle case sensitivity). It will be the most confusing thing to the end user.
- @rxin, what about using an option map was a disaster in v1? For case sensitivity, what happens elsewhere? I didn't think that options were case insensitive in Spark.
- Some sources interpret the options as case sensitive, and some as case insensitive. It's really confusing to the end users. On top of that, Scala Map has really bad binary compatibility across Scala versions.
- Thanks, that makes sense. What are you proposing instead of a map? A configuration object that handles resolution internally?
}
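The thread above ends with an open question about replacing the plain map with a configuration object. As a rough, non-authoritative sketch of what such an object could look like (the DataSourceOptions name and its methods are hypothetical, not part of this prototype), keys can be resolved case-insensitively while the original spelling is kept around for sources that insist on exact matching:

```java
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;

// Hypothetical options holder: resolves keys case-insensitively but keeps the
// user's original key spelling, so a source can still opt into strict matching.
public class DataSourceOptions {
  private final Map<String, String> original = new HashMap<>();
  private final Map<String, String> lowerCased = new HashMap<>();

  public DataSourceOptions(Map<String, String> userOptions) {
    for (Map.Entry<String, String> e : userOptions.entrySet()) {
      original.put(e.getKey(), e.getValue());
      lowerCased.put(e.getKey().toLowerCase(Locale.ROOT), e.getValue());
    }
  }

  // Case-insensitive lookup, the common path.
  public Optional<String> get(String key) {
    return Optional.ofNullable(lowerCased.get(key.toLowerCase(Locale.ROOT)));
  }

  // Exact lookup for sources that really need case-sensitive keys.
  public Optional<String> getExact(String key) {
    return Optional.ofNullable(original.get(key));
  }
}
```

A plain Java holder like this would also sidestep the Scala Map binary-compatibility concern raised above.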
DataSourceV2SchemaProvider.java
@@ -0,0 +1,42 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.sources.v2;

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap;
import org.apache.spark.sql.types.StructType;

/**
 * A mix-in interface for `DataSourceV2`. Users can implement this interface to provide schema
 * inference ability when scanning data.
 */
public interface DataSourceV2SchemaProvider {
  /**
   * Returns the inferred schema of this data source given these options.
   */
  StructType inferSchema(CaseInsensitiveMap<String> options);
Inline review comments:

- I think this should have a better name. Data sources with a metadata store won't infer anything; this should come from the table's metadata.
- How about ...
- +1 for ...
- I'd use a different name other than "get", since get is usually very cheap, whereas here it can potentially be very expensive. Maybe computeSchema?
- Good point about get. If we add an interface to signal schema-on-read behavior, we could also move ...

  /**
   * Whether or not this data source can accept a user-specified schema. When Spark scans a data
   * source, users can specify the schema to avoid expensive schema inference. However, some data
   * sources may have to infer the schema and reject any user-specified schema; they can override
   * this method to achieve this.
   */
  default boolean acceptsUserDefinedSchema() {
    return true;
Inline review comments:

- Why a boolean method instead of a UserDefinedSchema interface like the other traits? Also, doesn't the ...
- I explained in the doc; we have 3 requirements for the schema inference feature: ...
- I left a longer comment above. I didn't get what you were doing with these interfaces at first, since I think of most data sources as having a schema instead of being schema-on-read.
  }
}
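To make the mix-in concrete, here is a hedged sketch of a source with a fixed, catalog-style schema; the CatalogBackedSource class and its hard-coded columns are invented for illustration, and a complete source would of course also implement DataSourceV2 itself:

```java
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Hypothetical source backed by a catalog/metastore: the schema is fixed, so
// "inference" just reports it, and user-specified schemas are rejected.
public class CatalogBackedSource implements DataSourceV2SchemaProvider {

  @Override
  public StructType inferSchema(CaseInsensitiveMap<String> options) {
    // In a real source this would be looked up from the table's metadata;
    // it is hard-coded here to keep the sketch self-contained.
    return new StructType()
        .add("id", DataTypes.LongType)
        .add("value", DataTypes.StringType);
  }

  @Override
  public boolean acceptsUserDefinedSchema() {
    // A fixed-schema source cannot honor a user-provided schema.
    return false;
  }
}
```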
WritableDataSourceV2.java
@@ -0,0 +1,38 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.sources.v2;

import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap;
import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer;

/**
 * A mix-in interface for `DataSourceV2`. Users can implement this interface to provide data
 * writing ability with job-level transactions.
 */
public interface WritableDataSourceV2 extends DataSourceV2 {

  /**
   * The main entrance of the write interface.
   *
   * @param mode the save mode, which can be append, overwrite, etc.
   * @param options the options for this data source writer.
   * @return a writer that implements the actual write logic.
   */
  DataSourceV2Writer createWriter(SaveMode mode, CaseInsensitiveMap<String> options);
}
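As a minimal, non-authoritative sketch of how a writable source might dispatch on the save mode: AppendOnlySource and its newWriter factory are hypothetical, and the concrete writer is left abstract because the DataSourceV2Writer contract is not part of this file.

```java
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap;
import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer;

// Hypothetical writable source: only Append and Overwrite are supported, and
// other modes fail fast before any job-level transaction is started.
public abstract class AppendOnlySource implements WritableDataSourceV2 {

  @Override
  public DataSourceV2Writer createWriter(SaveMode mode, CaseInsensitiveMap<String> options) {
    switch (mode) {
      case Append:
        return newWriter(options, false);
      case Overwrite:
        return newWriter(options, true);
      default:
        throw new UnsupportedOperationException("Unsupported save mode: " + mode);
    }
  }

  // Left abstract: building a concrete DataSourceV2Writer depends on the
  // writer API, which is not shown in this diff.
  protected abstract DataSourceV2Writer newWriter(
      CaseInsensitiveMap<String> options, boolean truncateFirst);
}
```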
CatalystFilterPushDownSupport.java
@@ -0,0 +1,35 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.sources.v2.reader;

import org.apache.spark.annotation.Experimental;
import org.apache.spark.annotation.InterfaceStability;
import org.apache.spark.sql.catalyst.expressions.Expression;

/**
 * A mix-in interface for `DataSourceV2Reader`. Users can implement this interface to push down
 * arbitrary expressions as predicates to the data source.
 */
@Experimental
@InterfaceStability.Unstable
public interface CatalystFilterPushDownSupport {
Inline review comments:

- boolean pushDownCatalystFilter(Expression filter);
- This interface is very nice. Just wondering: for a data source to implement this method, is it OK for the implementation to access subtypes of Expression in Spark, for example functions like Abs?
- Implementations can pattern match the given expression and access any subtype you like.

  /**
   * Pushes down filters and returns the unsupported filters.
   */
  Expression[] pushDownCatalystFilters(Expression[] filters);
}
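As a rough illustration of the intended usage (not the prototype's own code), a reader mixing this in could keep the expressions it understands and hand the rest back to Spark. The FilteringReaderSketch class and its canEvaluate placeholder are assumptions; a real reader would also extend DataSourceV2Reader.

```java
import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.catalyst.expressions.Expression;

// Hypothetical mix-in usage: accepted filters are remembered for use when
// planning read tasks; everything else is returned so Spark evaluates it.
public class FilteringReaderSketch implements CatalystFilterPushDownSupport {
  private final List<Expression> pushedFilters = new ArrayList<>();

  @Override
  public Expression[] pushDownCatalystFilters(Expression[] filters) {
    List<Expression> unsupported = new ArrayList<>();
    for (Expression filter : filters) {
      if (canEvaluate(filter)) {
        pushedFilters.add(filter);
      } else {
        unsupported.add(filter);
      }
    }
    return unsupported.toArray(new Expression[0]);
  }

  // Placeholder: a real source would pattern match on the Expression subtypes
  // it understands (comparisons, IsNotNull, etc.), as noted in the thread above.
  private boolean canEvaluate(Expression filter) {
    return false;
  }
}
```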
ColumnPruningSupport.java
@@ -0,0 +1,32 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.sources.v2.reader;

import org.apache.spark.sql.types.StructType;

/**
 * A mix-in interface for `DataSourceV2Reader`. Users can implement this interface to only read
 * required columns/nested fields during scan.
 */
public interface ColumnPruningSupport {
  /**
   * Returns true if the implementation can apple this column pruning optimization, so that we can
Inline review comment:

- Nit: should be "apply", not "apple".
   * reduce the data size to be read at the very beginning.
   */
  boolean pruneColumns(StructType requiredSchema);
Inline review comments:

- Why allow the implementation to reject the required schema? If the data source supports column pruning, then it should throw an exception if the columns passed by this method are invalid (with more context). Otherwise, it should do the pruning. The only case where I think this might be valid is if there are some columns that can't be pruned, in which case it should return the actual schema that will be produced instead of rejecting by returning false. But then, the actual read schema is accessible from ...
- Agreed here!
- Actually I don't have a use case for rejecting the required schema; I just wanted to follow the other push-down interfaces and return a boolean. I'm fine to return ...
}
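A short, hypothetical sketch of how a reader might implement pruneColumns and surface the result through its read schema, following the resolution in the thread above (PruningReaderSketch is illustrative only):

```java
import org.apache.spark.sql.types.StructType;

// Hypothetical reader state: the pruned schema is remembered and later
// reported via DataSourceV2Reader.readSchema(), as discussed in the thread.
public class PruningReaderSketch implements ColumnPruningSupport {
  private StructType readSchema;

  public PruningReaderSketch(StructType fullSchema) {
    // Start with the full schema; pruning may narrow it later.
    this.readSchema = fullSchema;
  }

  @Override
  public boolean pruneColumns(StructType requiredSchema) {
    // Accept the pruning unconditionally; an invalid column would be a planner
    // bug, so a real source might throw instead of returning false.
    this.readSchema = requiredSchema;
    return true;
  }

  public StructType readSchema() {
    return readSchema;
  }
}
```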
ColumnarReadSupport.java
@@ -0,0 +1,45 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.sources.v2.reader;

import java.util.List;

import org.apache.spark.sql.execution.vectorized.ColumnarBatch;

/**
 * A mix-in interface for `DataSourceV2Reader`. Users can implement this interface to provide
 * columnar read ability for better performance.
 */
public interface ColumnarReadSupport {
  /**
   * Similar to `DataSourceV2Reader.createReadTasks`, but returns data in columnar format.
   */
  List<ReadTask<ColumnarBatch>> createColumnarReadTasks();
Inline review comments:

- This means that I'd have to implement the ... I suppose this is a mixin because of the method below. It's unclear to me that you actually want to implement both of them as a user (even if you can support columnar in some cases); I'd prefer to implement one and wrap it.
- A data source may not be able to support columnar reads for all columns, so I think it should always implement the row-based interface as a fallback.
- But if I can support columnar reads for all columns, then why make me implement the row-based interface?
- Then you can throw an exception in the row-based interface; I'll put it in the Javadoc.

  /**
   * A safety door for the columnar reader. It's possible that the implementation can only support
   * columnar reads for certain columns, so users can override this method to fall back to the
   * normal read path under some conditions.
   *
   * Note that, if the implementation always returns true here, it can throw an exception in the
   * row-based `DataSourceV2Reader.createReadTasks`, as it will never be called.
   */
  default boolean supportsColumnarReads() {
Inline review comments:

- This is a bit strange; shouldn't this return value sometimes depend on the schema?
- The schema is a state of the reader, so when a reader mixes in this interface, it should know what the current schema is, after column pruning or anything else.
- OK, so this is only called after all push-downs are done? We should specify that.
    return true;
  }
}
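Following the point in the thread that columnar support depends on the post-pruning schema, here is a hedged sketch of the kind of check a reader could run inside supportsColumnarReads(); the helper class and its list of supported types are assumptions, not the prototype's behavior:

```java
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// Hypothetical helper: whether a columnar read is possible depends on the
// schema that remains after column pruning and other push-downs.
public final class ColumnarSupportCheck {

  private ColumnarSupportCheck() {}

  // Returns true only if every remaining column has a type the (assumed)
  // vectorized path can decode; otherwise the reader should fall back to rows.
  public static boolean allColumnsVectorizable(StructType readSchema) {
    for (StructField field : readSchema.fields()) {
      DataType type = field.dataType();
      boolean supported =
          type.equals(DataTypes.IntegerType)
              || type.equals(DataTypes.LongType)
              || type.equals(DataTypes.DoubleType)
              || type.equals(DataTypes.StringType);
      if (!supported) {
        return false;
      }
    }
    return true;
  }
}
```

A reader mixing in ColumnarReadSupport could then implement supportsColumnarReads() as ColumnarSupportCheck.allColumnsVectorizable(readSchema()).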
DataReader.java
@@ -0,0 +1,27 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.sources.v2.reader;

import java.io.Closeable;
import java.util.Iterator;

/**
 * A data reader returned by a read task, responsible for outputting data for one RDD partition.
 */
public interface DataReader<T> extends Iterator<T>, Closeable {}
Inline review comments:

- Why is this an "Iterator"? Don't do this ... use an explicit next() with close().
- I think it makes sense to use something like ...
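For illustration, a minimal DataReader over in-memory data might look like the sketch below (InMemoryDataReader is hypothetical); it shows the Iterator-plus-Closeable shape the comment above is questioning:

```java
import java.io.IOException;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

// Hypothetical DataReader over an in-memory list: hasNext/next come from
// Iterator, and close() releases whatever resource backs the scan.
public class InMemoryDataReader implements DataReader<Row> {
  private final Iterator<String> values;

  public InMemoryDataReader(List<String> values) {
    this.values = values.iterator();
  }

  @Override
  public boolean hasNext() {
    return values.hasNext();
  }

  @Override
  public Row next() {
    // Wrap each value in a single-column Row.
    return RowFactory.create(values.next());
  }

  @Override
  public void close() throws IOException {
    // Nothing to release for in-memory data; a file- or network-backed reader
    // would close its underlying stream or connection here.
  }
}
```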
DataSourceV2Reader.java
@@ -0,0 +1,117 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.sources.v2.reader;

import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.spark.annotation.Experimental;
import org.apache.spark.annotation.InterfaceStability;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
import org.apache.spark.sql.types.StructType;

/**
 * The main interface and minimal requirement for a data source reader. Implementations should
 * at least implement the full scan logic; users can mix in more interfaces to implement scan
 * optimizations like column pruning, filter push-down, etc.
 */
public abstract class DataSourceV2Reader {

  /**
   * The actual schema of this data source reader, which may be different from the physical schema
   * of the underlying storage, as column pruning or other optimizations may happen.
   */
  public abstract StructType readSchema();

  /**
   * The actual read logic should be implemented here. This may not be a full scan, as
   * optimizations may have already been applied on this reader. Implementations should return a
   * list of read tasks; each task is responsible for outputting data for one RDD partition, which
   * means the number of tasks returned here will be the same as the number of RDD partitions this
   * scan outputs.
   */
  // TODO: maybe we should support arbitrary types and work with Dataset, instead of only Row.
  protected abstract List<ReadTask<Row>> createReadTasks();

  /**
   * Inside Spark, the input rows will be converted to `UnsafeRow`s before processing. To avoid
   * this conversion, implementations can override this method and output `UnsafeRow`s directly.
   * Note that this is an experimental and unstable interface, as `UnsafeRow` is not public and
   * may change in future Spark versions.
   *
   * Note that, if an implementation overrides this method, it should also override
   * `createReadTasks` to throw an exception, as it will never be called.
   */
  @Experimental
  @InterfaceStability.Unstable
  public List<ReadTask<UnsafeRow>> createUnsafeRowReadTasks() {
    StructType schema = readSchema();
    return createReadTasks().stream()
        .map(rowGenerator -> new RowToUnsafeRowReadTask(rowGenerator, schema))
        .collect(Collectors.toList());
  }
}

class RowToUnsafeRowReadTask implements ReadTask<UnsafeRow> {
  private final ReadTask<Row> rowReadTask;
  private final StructType schema;

  RowToUnsafeRowReadTask(ReadTask<Row> rowReadTask, StructType schema) {
    this.rowReadTask = rowReadTask;
    this.schema = schema;
  }

  @Override
  public String[] preferredLocations() {
    return rowReadTask.preferredLocations();
  }

  @Override
  public DataReader<UnsafeRow> getReader() {
    return new RowToUnsafeDataReader(rowReadTask.getReader(), RowEncoder.apply(schema));
  }
}

class RowToUnsafeDataReader implements DataReader<UnsafeRow> {
  private final DataReader<Row> rowReader;
  private final ExpressionEncoder<Row> encoder;

  RowToUnsafeDataReader(DataReader<Row> rowReader, ExpressionEncoder<Row> encoder) {
    this.rowReader = rowReader;
    this.encoder = encoder;
  }

  @Override
  public boolean hasNext() {
    return rowReader.hasNext();
  }

  @Override
  public UnsafeRow next() {
    return (UnsafeRow) encoder.toRow(rowReader.next());
  }

  @Override
  public void close() throws IOException {
    rowReader.close();
  }
}
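To see how the pieces fit together, here is a hedged end-to-end sketch of a trivial reader. The ReadTask contract (preferredLocations and getReader) is assumed from the wrapper classes above, and InMemoryReader/InMemoryReadTask are invented for illustration:

```java
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Hypothetical reader: one read task per in-memory partition of strings,
// each task producing single-column Rows.
public class InMemoryReader extends DataSourceV2Reader {
  private final List<List<String>> partitions;

  public InMemoryReader(List<List<String>> partitions) {
    this.partitions = partitions;
  }

  @Override
  public StructType readSchema() {
    return new StructType().add("value", DataTypes.StringType);
  }

  @Override
  protected List<ReadTask<Row>> createReadTasks() {
    List<ReadTask<Row>> tasks = new ArrayList<>();
    for (List<String> partition : partitions) {
      tasks.add(new InMemoryReadTask(partition));
    }
    return tasks;
  }
}

// One task per partition; getReader() is expected to run on the executor side.
class InMemoryReadTask implements ReadTask<Row> {
  private final List<String> data;

  InMemoryReadTask(List<String> data) {
    this.data = data;
  }

  @Override
  public String[] preferredLocations() {
    return new String[0]; // no locality preference for in-memory data
  }

  @Override
  public DataReader<Row> getReader() {
    Iterator<String> it = data.iterator();
    return new DataReader<Row>() {
      @Override
      public boolean hasNext() {
        return it.hasNext();
      }

      @Override
      public Row next() {
        return RowFactory.create(it.next());
      }

      @Override
      public void close() throws IOException {
        // nothing to release
      }
    };
  }
}
```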
Review discussion on the overall schema handling:

I think the schema should be handled a little differently. We should assume that a data source has a schema that applies to all of its data, instead of passing one in. That's the case with most datasets, like Hive tables, relational tables, and Cassandra tables. I'd also argue that it is a best practice so we don't have expensive operations to infer the schema, like we do for non-Metastore tables.

So instead of passing the full schema here and having an interface to infer a schema, the data source should be expected to report its schema while analyzing the query (which may require inference). Then the schema passed to create a reader should be the expected schema, or projection schema, after the query is optimized.

Also, I'd like to understand how column IDs should be handled. I'm building a data source that uses column IDs to implement schema evolution that supports add, delete, and rename. The column IDs for a table are only unique within that table (another table can reuse them), so it doesn't necessarily make sense to expose them to Spark. Spark will still need its own unique attribute IDs. But I'd prefer to have Spark request columns by ID rather than by name, so that Spark handles name resolution between a query and the data source.

I think what makes sense is for the data source to create a Spark schema with attributes that Spark will use in the projection schema. So if my data source has columns 0:x and 1:y, I create a Spark schema with those columns and they are assigned Spark attribute IDs in the process, say x#67 and y#68. When Spark requests a reader, the projection schema uses those attributes, which I can then map back to internal IDs. Spark should call createReader with a schema containing x#67, not x#104. Sound reasonable?

I'm +1 on it if we can rewrite the data source part from scratch... But now, since users can specify a schema in DataFrameReader, the data source should be able to take a user-specified schema. For the column id stuff, I don't totally understand it. Attribute id is internal to Spark; Spark only uses column names when interacting with data sources, which is why we only use StructType in the various APIs.

@rdblue: "We should assume that a data source has a schema that applies to all of its data, instead of passing one in. That's the case with most datasets, like Hive tables, relational tables, and Cassandra tables."

This is not true though. For a Hive table, for example, the schema is specified in the catalog, which means it has to be passed into the underlying file-based data source. The file-based data source itself is not actually dictating the schema.

I think what I didn't understand is that the data source API sits between the metastore and the files in a table. So the metastore is queried to get a schema, then the data files are treated as schema-on-read. But why should that be the case? I would rather be able to have Hive tables entirely behind the DataSource API, so they can be handled the same way as a JDBC table or another source with a pre-determined schema for its data.

With that missing context, I can see how the interface proposed here looks designed for schema-on-read, rather than for tables that already have a fixed schema: it assumes that the data source will commonly use an externally provided schema, can't necessarily infer one (e.g. JSON), and the projection is separate from the data source schema. I think it's reasonable to consider whether this API should be based on schema-on-read or should assume that the data source has a schema, along the lines of what I was suggesting before.
The proposal doc lists 3 cases: an external schema is required, schema is fixed and a user schema is not allowed, and a user schema is optional because it can be inferred.
For sources with fixed schemas, DataSourceV2SchemaProvider#inferSchema is required, but that also pulls in acceptsUserDefinedSchema, which defaults to true. A fixed-schema source also has to implement createReader with a "full schema of the data source" passed in. Those two imply that fixed-schema sources should support schema-on-read, but I think that's a bad idea. The problem is similar to why we don't want to pass a map of options: it makes the source implementation decide how to reconcile the user-defined schema with the source schema, and that probably won't be consistent across sources. I think that's a good reason to separate out the methods needed for schema-on-read to a different interface.

What about a SchemaOnRead interface that can be used to set the schema instead of passing it to the createReader method? Then acceptsUserDefinedSchema can be dropped from the provider interface because it is signalled by SchemaOnRead. (I'd also rename inferSchema to getSchema.) Then createReader here would drop the schema argument. We could also have a createReader with a schema argument that passes the requested projection; that's where I would expect to pass the requested schema.

For the other two cases, the proposed API works okay. It's a little strange that those that require a user schema won't implement
DataSourceV2SchemaProvider
, which is what signals that a user-defined schema is accepted even though one can be passed to any data source. ASchemaOnRead
interface would be slightly better for the required case as well by cleaning this up.
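To make that suggestion concrete, a SchemaOnRead mixin along the lines proposed above might look like the sketch below; this interface does not exist in the prototype, and the setReadSchema method name is an assumption:

```java
import org.apache.spark.sql.types.StructType;

// Hypothetical mix-in sketched from the discussion: implementing it signals
// that the source accepts an externally supplied (schema-on-read) schema, so
// acceptsUserDefinedSchema() and the schema argument of createReader() would
// no longer be needed.
public interface SchemaOnRead {

  /** Sets the user-specified schema that this source should apply when reading. */
  void setReadSchema(StructType schema);
}
```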