Commit 72f3c1a
SPARK-23325: Use InternalRow when reading with DataSourceV2.
This updates the DataSourceV2 API to use InternalRow instead of Row for the default case with no scan mix-ins. Because the API is changing significantly in the same places, this also renames DataReaderFactory back to ReadTask. Support for readers that produce Row is added through SupportsDeprecatedScanRow, which matches the previous API. Readers that used Row now implement this interface and should be migrated to InternalRow. Readers that previously implemented SupportsScanUnsafeRow have been migrated to use no SupportsScan mix-ins and to produce InternalRow.
1 parent 1d758dc commit 72f3c1a

29 files changed: +202 −199 lines
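For context, here is a minimal sketch (not part of the commit) of what a simple batch reader looks like against the renamed API: the default scan path, with no SupportsScan* mix-in, now returns ReadTask[InternalRow] from createReadTasks(). RangeReader and RangeReadTask are hypothetical names, and the DataReader contract (next()/get()/close()) is assumed from the existing DataSourceV2 API.

import java.util.{List => JList}

import scala.collection.JavaConverters._

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.reader.{DataReader, DataSourceReader, ReadTask}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// Hypothetical single-partition source that emits the integers 0 to 9.
class RangeReader extends DataSourceReader {

  override def readSchema(): StructType = StructType(Seq(StructField("i", IntegerType)))

  // The default (no SupportsScan* mix-in) path now produces InternalRow.
  override def createReadTasks(): JList[ReadTask[InternalRow]] = {
    val tasks: Seq[ReadTask[InternalRow]] = Seq(new RangeReadTask(0, 10))
    tasks.asJava
  }
}

// A ReadTask is serialized and sent to executors, which call createDataReader().
class RangeReadTask(start: Int, end: Int) extends ReadTask[InternalRow] {

  override def createDataReader(): DataReader[InternalRow] = new DataReader[InternalRow] {
    private var current = start - 1

    override def next(): Boolean = { current += 1; current < end }
    override def get(): InternalRow = InternalRow(current)
    override def close(): Unit = ()
  }
}

Each ReadTask corresponds to one RDD partition of the scan; the diffs below apply exactly this shape to the Kafka readers and to the v2 reader interfaces.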

external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala

Lines changed: 8 additions & 6 deletions

@@ -26,6 +26,7 @@ import org.apache.kafka.common.TopicPartition
 import org.apache.spark.TaskContext
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.kafka010.KafkaSourceProvider.{INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE, INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE}
 import org.apache.spark.sql.sources.v2.reader._
@@ -53,7 +54,7 @@ class KafkaContinuousReader(
     metadataPath: String,
     initialOffsets: KafkaOffsetRangeLimit,
     failOnDataLoss: Boolean)
-  extends ContinuousReader with SupportsScanUnsafeRow with Logging {
+  extends ContinuousReader with Logging {
 
   private lazy val session = SparkSession.getActiveSession.get
   private lazy val sc = session.sparkContext
@@ -86,7 +87,7 @@ class KafkaContinuousReader(
     KafkaSourceOffset(JsonUtils.partitionOffsets(json))
   }
 
-  override def createUnsafeRowReaderFactories(): ju.List[DataReaderFactory[UnsafeRow]] = {
+  override def createReadTasks(): ju.List[ReadTask[InternalRow]] = {
     import scala.collection.JavaConverters._
 
     val oldStartPartitionOffsets = KafkaSourceOffset.getPartitionOffsets(offset)
@@ -104,12 +105,13 @@ class KafkaContinuousReader(
       oldStartPartitionOffsets.filterKeys(!deletedPartitions.contains(_))
     knownPartitions = startOffsets.keySet
 
-    startOffsets.toSeq.map {
+    val tasks: Seq[ReadTask[InternalRow]] = startOffsets.toSeq.map {
       case (topicPartition, start) =>
         KafkaContinuousDataReaderFactory(
           topicPartition, start, kafkaParams, pollTimeoutMs, failOnDataLoss)
-          .asInstanceOf[DataReaderFactory[UnsafeRow]]
-    }.asJava
+    }
+
+    tasks.asJava
   }
 
   /** Stop this source and free any resources it has allocated. */
@@ -161,7 +163,7 @@ case class KafkaContinuousDataReaderFactory(
     startOffset: Long,
     kafkaParams: ju.Map[String, Object],
     pollTimeoutMs: Long,
-    failOnDataLoss: Boolean) extends ContinuousDataReaderFactory[UnsafeRow] {
+    failOnDataLoss: Boolean) extends ContinuousDataReaderFactory[InternalRow] {
 
   override def createDataReaderWithOffset(offset: PartitionOffset): DataReader[UnsafeRow] = {
     val kafkaOffset = offset.asInstanceOf[KafkaSourcePartitionOffset]

external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchReader.scala

Lines changed: 8 additions & 7 deletions

@@ -29,11 +29,12 @@ import org.apache.spark.SparkEnv
 import org.apache.spark.internal.Logging
 import org.apache.spark.scheduler.ExecutorCacheTaskLocation
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.execution.streaming.{HDFSMetadataLog, SerializedOffset}
 import org.apache.spark.sql.kafka010.KafkaSourceProvider.{INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE, INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE}
 import org.apache.spark.sql.sources.v2.DataSourceOptions
-import org.apache.spark.sql.sources.v2.reader.{DataReader, DataReaderFactory, SupportsScanUnsafeRow}
+import org.apache.spark.sql.sources.v2.reader.{DataReader, ReadTask}
 import org.apache.spark.sql.sources.v2.reader.streaming.{MicroBatchReader, Offset}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.UninterruptibleThread
@@ -61,7 +62,7 @@ private[kafka010] class KafkaMicroBatchReader(
     metadataPath: String,
     startingOffsets: KafkaOffsetRangeLimit,
     failOnDataLoss: Boolean)
-  extends MicroBatchReader with SupportsScanUnsafeRow with Logging {
+  extends MicroBatchReader with Logging {
 
   private var startPartitionOffsets: PartitionOffsetMap = _
   private var endPartitionOffsets: PartitionOffsetMap = _
@@ -101,7 +102,7 @@ private[kafka010] class KafkaMicroBatchReader(
     }
   }
 
-  override def createUnsafeRowReaderFactories(): ju.List[DataReaderFactory[UnsafeRow]] = {
+  override def createReadTasks(): ju.List[ReadTask[InternalRow]] = {
     // Find the new partitions, and get their earliest offsets
     val newPartitions = endPartitionOffsets.keySet.diff(startPartitionOffsets.keySet)
     val newPartitionInitialOffsets = kafkaOffsetReader.fetchEarliestOffsets(newPartitions.toSeq)
@@ -142,11 +143,11 @@ private[kafka010] class KafkaMicroBatchReader(
     val reuseKafkaConsumer = offsetRanges.map(_.topicPartition).toSet.size == offsetRanges.size
 
     // Generate factories based on the offset ranges
-    val factories = offsetRanges.map { range =>
+    val tasks: Seq[ReadTask[InternalRow]] = offsetRanges.map { range =>
       new KafkaMicroBatchDataReaderFactory(
         range, executorKafkaParams, pollTimeoutMs, failOnDataLoss, reuseKafkaConsumer)
     }
-    factories.map(_.asInstanceOf[DataReaderFactory[UnsafeRow]]).asJava
+    tasks.asJava
   }
 
   override def getStartOffset: Offset = {
@@ -299,13 +300,13 @@ private[kafka010] class KafkaMicroBatchReader(
   }
 }
 
-/** A [[DataReaderFactory]] for reading Kafka data in a micro-batch streaming query. */
+/** A [[ReadTask]] for reading Kafka data in a micro-batch streaming query. */
 private[kafka010] case class KafkaMicroBatchDataReaderFactory(
     offsetRange: KafkaOffsetRange,
     executorKafkaParams: ju.Map[String, Object],
     pollTimeoutMs: Long,
     failOnDataLoss: Boolean,
-    reuseKafkaConsumer: Boolean) extends DataReaderFactory[UnsafeRow] {
+    reuseKafkaConsumer: Boolean) extends ReadTask[InternalRow] {
 
   override def preferredLocations(): Array[String] = offsetRange.preferredLoc.toArray

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousDataReaderFactory.java

Lines changed: 2 additions & 2 deletions

@@ -21,11 +21,11 @@
 import org.apache.spark.sql.sources.v2.reader.streaming.PartitionOffset;
 
 /**
- * A mix-in interface for {@link DataReaderFactory}. Continuous data reader factories can
+ * A mix-in interface for {@link ReadTask}. Continuous data reader factories can
  * implement this interface to provide creating {@link DataReader} with particular offset.
  */
 @InterfaceStability.Evolving
-public interface ContinuousDataReaderFactory<T> extends DataReaderFactory<T> {
+public interface ContinuousDataReaderFactory<T> extends ReadTask<T> {
   /**
    * Create a DataReader with particular offset as its startOffset.
    *

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReader.java

Lines changed: 4 additions & 4 deletions

@@ -23,12 +23,12 @@
 import org.apache.spark.annotation.InterfaceStability;
 
 /**
- * A data reader returned by {@link DataReaderFactory#createDataReader()} and is responsible for
+ * A data reader returned by {@link ReadTask#createDataReader()} and is responsible for
  * outputting data for a RDD partition.
  *
- * Note that, Currently the type `T` can only be {@link org.apache.spark.sql.Row} for normal data
- * source readers, or {@link org.apache.spark.sql.catalyst.expressions.UnsafeRow} for data source
- * readers that mix in {@link SupportsScanUnsafeRow}.
+ * Note that, Currently the type `T` should be {@link org.apache.spark.sql.catalyst.InternalRow}
+ * for normal data source readers, or {@link org.apache.spark.sql.vectorized.ColumnarBatch} for
+ * data source readers that mix in {@link SupportsScanColumnarBatch}.
  */
 @InterfaceStability.Evolving
 public interface DataReader<T> extends Closeable {

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataSourceReader.java

Lines changed: 6 additions & 6 deletions

@@ -20,7 +20,7 @@
 import java.util.List;
 
 import org.apache.spark.annotation.InterfaceStability;
-import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
 import org.apache.spark.sql.sources.v2.ReadSupport;
 import org.apache.spark.sql.sources.v2.ReadSupportWithSchema;
@@ -31,19 +31,19 @@
  * {@link ReadSupport#createReader(DataSourceOptions)} or
  * {@link ReadSupportWithSchema#createReader(StructType, DataSourceOptions)}.
  * It can mix in various query optimization interfaces to speed up the data scan. The actual scan
- * logic is delegated to {@link DataReaderFactory}s that are returned by
- * {@link #createDataReaderFactories()}.
+ * logic is delegated to {@link ReadTask}s that are returned by
+ * {@link #createReadTasks()}.
  *
  * There are mainly 3 kinds of query optimizations:
  *   1. Operators push-down. E.g., filter push-down, required columns push-down(aka column
  *      pruning), etc. Names of these interfaces start with `SupportsPushDown`.
  *   2. Information Reporting. E.g., statistics reporting, ordering reporting, etc.
  *      Names of these interfaces start with `SupportsReporting`.
- *   3. Special scans. E.g, columnar scan, unsafe row scan, etc.
+ *   3. Special scans. E.g, columnar scan.
  *      Names of these interfaces start with `SupportsScan`. Note that a reader should only
  *      implement at most one of the special scans, if more than one special scans are implemented,
  *      only one of them would be respected, according to the priority list from high to low:
- *      {@link SupportsScanColumnarBatch}, {@link SupportsScanUnsafeRow}.
+ *      {@link SupportsScanColumnarBatch}, {@link SupportsDeprecatedScanRow}.
  *
  * If an exception was throw when applying any of these query optimizations, the action would fail
  * and no Spark job was submitted.
@@ -76,5 +76,5 @@ public interface DataSourceReader {
   * If this method fails (by throwing an exception), the action would fail and no Spark job was
   * submitted.
   */
-  List<DataReaderFactory<Row>> createDataReaderFactories();
+  List<ReadTask<InternalRow>> createReadTasks();
 }
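The javadoc above describes how the optimization mix-ins compose with the default scan. As a rough illustration (assuming the existing SupportsPushDownRequiredColumns mix-in and its pruneColumns(StructType) method; PrunedReader and planTasks are hypothetical names), Spark applies column pruning before it asks for read tasks, so the tasks only need to produce the pruned schema:

import java.util.{List => JList}

import scala.collection.JavaConverters._

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.reader.{ReadTask, SupportsPushDownRequiredColumns}
import org.apache.spark.sql.types.StructType

// Hypothetical base class: Spark calls pruneColumns() before createReadTasks(),
// so the planned tasks only need to produce the pruned schema.
abstract class PrunedReader(fullSchema: StructType) extends SupportsPushDownRequiredColumns {

  private var requiredSchema: StructType = fullSchema

  override def pruneColumns(required: StructType): Unit = {
    requiredSchema = required
  }

  override def readSchema(): StructType = requiredSchema

  override def createReadTasks(): JList[ReadTask[InternalRow]] = {
    planTasks(requiredSchema).asJava
  }

  // Source-specific partition planning; a hypothetical helper, not part of the API.
  protected def planTasks(schema: StructType): Seq[ReadTask[InternalRow]]
}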

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/DataReaderFactory.java renamed to sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ReadTask.java

Lines changed: 5 additions & 5 deletions

@@ -22,17 +22,17 @@
 import org.apache.spark.annotation.InterfaceStability;
 
 /**
- * A reader factory returned by {@link DataSourceReader#createDataReaderFactories()} and is
+ * A reader factory returned by {@link DataSourceReader#createReadTasks()} and is
  * responsible for creating the actual data reader. The relationship between
- * {@link DataReaderFactory} and {@link DataReader}
+ * {@link ReadTask} and {@link DataReader}
  * is similar to the relationship between {@link Iterable} and {@link java.util.Iterator}.
  *
- * Note that, the reader factory will be serialized and sent to executors, then the data reader
- * will be created on executors and do the actual reading. So {@link DataReaderFactory} must be
+ * Note that this read task will be serialized and sent to executors, then the data reader
+ * will be created on executors and do the actual reading. So {@link ReadTask} must be
  * serializable and {@link DataReader} doesn't need to be.
  */
 @InterfaceStability.Evolving
-public interface DataReaderFactory<T> extends Serializable {
+public interface ReadTask<T> extends Serializable {
 
   /**
    * The preferred locations where the data reader returned by this reader factory can run faster,
sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsDeprecatedScanRow.java (new file)

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+package org.apache.spark.sql.sources.v2.reader;
+
+import org.apache.spark.annotation.InterfaceStability;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
+
+import java.util.List;
+
+/**
+ * A mix-in interface for {@link DataSourceReader} to assist in moving from Row to InternalRow.
+ * Data source readers can implement this interface to output {@link Row}.
+ */
+@Deprecated
+@InterfaceStability.Evolving
+public interface SupportsDeprecatedScanRow extends DataSourceReader {
+  @Override
+  default List<ReadTask<InternalRow>> createReadTasks() {
+    throw new IllegalStateException(
+        "createReadTasks not supported by default within SupportsDeprecatedScanRow.");
+  }
+
+  /**
+   * Returns a list of reader factories. Each factory is responsible for creating a data reader to
+   * output data for one RDD partition. That means the number of factories returned here is same as
+   * the number of RDD partitions this scan outputs.
+   *
+   * Note that, this may not be a full scan if the data source reader mixes in other optimization
+   * interfaces like column pruning, filter push-down, etc. These optimizations are applied before
+   * Spark issues the scan request.
+   *
+   * If this method fails (by throwing an exception), the action would fail and no Spark job was
+   * submitted.
+   */
+  List<ReadTask<Row>> createDataReaderFactories();
+}
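A not-yet-migrated source would mix in this deprecated interface and keep returning Row. A minimal sketch, assuming the same DataReader contract as in the earlier example; LegacyRowReader is a hypothetical name:

import java.util.{List => JList}

import scala.collection.JavaConverters._

import org.apache.spark.sql.Row
import org.apache.spark.sql.sources.v2.reader.{DataReader, ReadTask, SupportsDeprecatedScanRow}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Hypothetical legacy source that still produces Row through the deprecated mix-in.
class LegacyRowReader extends SupportsDeprecatedScanRow {

  override def readSchema(): StructType = StructType(Seq(StructField("name", StringType)))

  // The deprecated interface keeps the old factory method, but with the renamed ReadTask type.
  override def createDataReaderFactories(): JList[ReadTask[Row]] = {
    val tasks: Seq[ReadTask[Row]] = Seq(new ReadTask[Row] {
      override def createDataReader(): DataReader[Row] = new DataReader[Row] {
        private val values = Iterator("a", "b", "c")

        override def next(): Boolean = values.hasNext
        override def get(): Row = Row(values.next())
        override def close(): Unit = ()
      }
    })
    tasks.asJava
  }
}

Migrating such a reader means switching to createReadTasks() and producing InternalRow values instead (for example via InternalRow.fromSeq, using catalyst-internal types such as UTF8String for strings).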

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@
 * A mix in interface for {@link DataSourceReader}. Data source readers can implement this
 * interface to report data partitioning and try to avoid shuffle at Spark side.
 *
- * Note that, when the reader creates exactly one {@link DataReaderFactory}, Spark may avoid
+ * Note that, when the reader creates exactly one {@link ReadTask}, Spark may avoid
 * adding a shuffle even if the reader does not implement this interface.
 */
 @InterfaceStability.Evolving

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanColumnarBatch.java

Lines changed: 6 additions & 6 deletions

@@ -20,7 +20,7 @@
 import java.util.List;
 
 import org.apache.spark.annotation.InterfaceStability;
-import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.vectorized.ColumnarBatch;
 
 /**
@@ -30,22 +30,22 @@
 @InterfaceStability.Evolving
 public interface SupportsScanColumnarBatch extends DataSourceReader {
   @Override
-  default List<DataReaderFactory<Row>> createDataReaderFactories() {
+  default List<ReadTask<InternalRow>> createReadTasks() {
     throw new IllegalStateException(
-        "createDataReaderFactories not supported by default within SupportsScanColumnarBatch.");
+        "createReadTasks not supported by default within SupportsScanColumnarBatch.");
   }
 
   /**
-   * Similar to {@link DataSourceReader#createDataReaderFactories()}, but returns columnar data
+   * Similar to {@link DataSourceReader#createReadTasks()}, but returns columnar data
    * in batches.
    */
-  List<DataReaderFactory<ColumnarBatch>> createBatchDataReaderFactories();
+  List<ReadTask<ColumnarBatch>> createBatchReadTasks();
 
   /**
    * Returns true if the concrete data source reader can read data in batch according to the scan
    * properties like required columns, pushes filters, etc. It's possible that the implementation
    * can only support some certain columns with certain types. Users can overwrite this method and
-   * {@link #createDataReaderFactories()} to fallback to normal read path under some conditions.
+   * {@link #createReadTasks()} to fallback to normal read path under some conditions.
   */
  default boolean enableBatchRead() {
    return true;
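For comparison, the columnar path keeps its own method, now named createBatchReadTasks(). A rough sketch of a single-partition columnar source follows; it assumes the ColumnarBatch constructor and the internal OnHeapColumnVector.allocateColumns helper in sql/core are usable from the source, and ColumnarRangeReader is a hypothetical name.

import java.util.{List => JList}

import scala.collection.JavaConverters._

import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector
import org.apache.spark.sql.sources.v2.reader.{DataReader, DataSourceReader, ReadTask, SupportsScanColumnarBatch}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch}

// Hypothetical columnar source: one partition that returns a single batch with the ints 0..99.
class ColumnarRangeReader extends DataSourceReader with SupportsScanColumnarBatch {

  private val schema = StructType(Seq(StructField("i", IntegerType)))

  override def readSchema(): StructType = schema

  override def createBatchReadTasks(): JList[ReadTask[ColumnarBatch]] = {
    val tasks: Seq[ReadTask[ColumnarBatch]] = Seq(new ReadTask[ColumnarBatch] {
      override def createDataReader(): DataReader[ColumnarBatch] = new DataReader[ColumnarBatch] {
        private var batch: ColumnarBatch = _

        // Emit exactly one batch: next() is true until get() has produced it.
        override def next(): Boolean = batch == null

        override def get(): ColumnarBatch = {
          // Assumed sql/core helpers: allocate writable on-heap vectors for the schema,
          // fill the single int column, then wrap the vectors in a ColumnarBatch.
          val vectors = OnHeapColumnVector.allocateColumns(100, schema)
          (0 until 100).foreach(i => vectors(0).putInt(i, i))
          batch = new ColumnarBatch(vectors.map(v => v: ColumnVector))
          batch.setNumRows(100)
          batch
        }

        override def close(): Unit = if (batch != null) batch.close()
      }
    })
    tasks.asJava
  }
}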

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsScanUnsafeRow.java

Lines changed: 0 additions & 46 deletions
This file was deleted.
