
Commit 710e4e8

arunmahadevan authored and gatorsmile committed
[SPARK-24308][SQL] Handle DataReaderFactory to InputPartition rename in left over classes
## What changes were proposed in this pull request?

SPARK-24073 renamed DataReaderFactory -> InputPartition and DataReader -> InputPartitionReader. Some classes still reflect the old names, which causes confusion. This patch renames the left-over classes to match the new interfaces and fixes a few comments.

## How was this patch tested?

Existing unit tests.

Please review http://spark.apache.org/contributing.html before opening a pull request.

Author: Arun Mahadevan <arunm@apache.org>

Closes #21355 from arunmahadevan/SPARK-24308.
1 parent a53ea70 commit 710e4e8
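
For context, a source built against the renamed interfaces looks roughly like the minimal Scala sketch below. It is not taken from this patch; the class name RangeInputPartition and its parameters are hypothetical, and it only illustrates the InputPartition / InputPartitionReader pair that replaces DataReaderFactory / DataReader.

import org.apache.spark.sql.Row
import org.apache.spark.sql.sources.v2.reader.{InputPartition, InputPartitionReader}

// Hypothetical example (not part of this patch): a partition that serves the
// integers [start, end) as single-column rows, implemented against the renamed
// InputPartition / InputPartitionReader interfaces.
class RangeInputPartition(start: Int, end: Int, hosts: Array[String])
  extends InputPartition[Row] {

  // Scheduling hint only; Spark may still run the reader on any executor.
  override def preferredLocations(): Array[String] = hosts

  override def createPartitionReader(): InputPartitionReader[Row] =
    new InputPartitionReader[Row] {
      private var current = start - 1
      override def next(): Boolean = { current += 1; current < end }
      override def get(): Row = Row(current)
      override def close(): Unit = {}
    }
}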

File tree

12 files changed, +30 -30 lines changed


external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala

Lines changed: 3 additions & 3 deletions
@@ -106,7 +106,7 @@ class KafkaContinuousReader(
 
     startOffsets.toSeq.map {
       case (topicPartition, start) =>
-        KafkaContinuousDataReaderFactory(
+        KafkaContinuousInputPartition(
           topicPartition, start, kafkaParams, pollTimeoutMs, failOnDataLoss)
           .asInstanceOf[InputPartition[UnsafeRow]]
     }.asJava
@@ -146,7 +146,7 @@ class KafkaContinuousReader(
   }
 
 /**
- * A data reader factory for continuous Kafka processing. This will be serialized and transformed
+ * An input partition for continuous Kafka processing. This will be serialized and transformed
  * into a full reader on executors.
  *
  * @param topicPartition The (topic, partition) pair this task is responsible for.
@@ -156,7 +156,7 @@ class KafkaContinuousReader(
  * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets
  *                       are skipped.
  */
-case class KafkaContinuousDataReaderFactory(
+case class KafkaContinuousInputPartition(
     topicPartition: TopicPartition,
     startOffset: Long,
     kafkaParams: ju.Map[String, Object],

external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchReader.scala

Lines changed: 2 additions & 2 deletions
@@ -143,7 +143,7 @@ private[kafka010] class KafkaMicroBatchReader(
 
     // Generate factories based on the offset ranges
     val factories = offsetRanges.map { range =>
-      new KafkaMicroBatchDataReaderFactory(
+      new KafkaMicroBatchInputPartition(
        range, executorKafkaParams, pollTimeoutMs, failOnDataLoss, reuseKafkaConsumer)
     }
     factories.map(_.asInstanceOf[InputPartition[UnsafeRow]]).asJava
@@ -300,7 +300,7 @@ private[kafka010] class KafkaMicroBatchReader(
 }
 
 /** A [[InputPartition]] for reading Kafka data in a micro-batch streaming query. */
-private[kafka010] case class KafkaMicroBatchDataReaderFactory(
+private[kafka010] case class KafkaMicroBatchInputPartition(
     offsetRange: KafkaOffsetRange,
     executorKafkaParams: ju.Map[String, Object],
     pollTimeoutMs: Long,

external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -679,7 +679,7 @@ class KafkaMicroBatchV2SourceSuite extends KafkaMicroBatchSourceSuiteBase {
         Optional.of[OffsetV2](KafkaSourceOffset(Map(tp -> 100L)))
       )
       val factories = reader.planUnsafeInputPartitions().asScala
-        .map(_.asInstanceOf[KafkaMicroBatchDataReaderFactory])
+        .map(_.asInstanceOf[KafkaMicroBatchInputPartition])
       withClue(s"minPartitions = $minPartitions generated factories $factories\n\t") {
         assert(factories.size == numPartitionsGenerated)
         factories.foreach { f => assert(f.reuseKafkaConsumer == reusesConsumers) }

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousInputPartition.java

Lines changed: 2 additions & 2 deletions
@@ -27,9 +27,9 @@
 @InterfaceStability.Evolving
 public interface ContinuousInputPartition<T> extends InputPartition<T> {
   /**
-   * Create a DataReader with particular offset as its startOffset.
+   * Create an input partition reader with particular offset as its startOffset.
    *
-   * @param offset offset want to set as the DataReader's startOffset.
+   * @param offset offset want to set as the input partition reader's startOffset.
    */
   InputPartitionReader<T> createContinuousReader(PartitionOffset offset);
 }

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/InputPartition.java

Lines changed: 3 additions & 3 deletions
@@ -36,8 +36,8 @@
 public interface InputPartition<T> extends Serializable {
 
   /**
-   * The preferred locations where the data reader returned by this partition can run faster,
-   * but Spark does not guarantee to run the data reader on these locations.
+   * The preferred locations where the input partition reader returned by this partition can run faster,
+   * but Spark does not guarantee to run the input partition reader on these locations.
    * The implementations should make sure that it can be run on any location.
    * The location is a string representing the host name.
    *
@@ -53,7 +53,7 @@ default String[] preferredLocations() {
   }
 
   /**
-   * Returns a data reader to do the actual reading work.
+   * Returns an input partition reader to do the actual reading work.
    *
    * If this method fails (by throwing an exception), the corresponding Spark task would fail and
    * get retried until hitting the maximum retry times.

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/InputPartitionReader.java

Lines changed: 3 additions & 3 deletions
@@ -23,11 +23,11 @@
 import org.apache.spark.annotation.InterfaceStability;
 
 /**
- * A data reader returned by {@link InputPartition#createPartitionReader()} and is responsible for
+ * An input partition reader returned by {@link InputPartition#createPartitionReader()} and is responsible for
  * outputting data for a RDD partition.
  *
- * Note that, Currently the type `T` can only be {@link org.apache.spark.sql.Row} for normal data
- * source readers, or {@link org.apache.spark.sql.catalyst.expressions.UnsafeRow} for data source
+ * Note that, Currently the type `T` can only be {@link org.apache.spark.sql.Row} for normal input
+ * partition readers, or {@link org.apache.spark.sql.catalyst.expressions.UnsafeRow} for input partition
  * readers that mix in {@link SupportsScanUnsafeRow}.
  */
 @InterfaceStability.Evolving
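
The next()/get()/close() contract documented above is driven by the engine in a simple pull loop. The sketch below is hypothetical and simplified (the actual consumption happens inside Spark's DataSourceRDD, which also registers task-completion callbacks); it only shows the intended call order.

import org.apache.spark.sql.sources.v2.reader.InputPartitionReader

// Simplified, hypothetical sketch of consuming an input partition reader:
// advance with next(), fetch with get(), and always close() when finished.
def drain[T](reader: InputPartitionReader[T])(f: T => Unit): Unit = {
  try {
    while (reader.next()) {
      f(reader.get())
    }
  } finally {
    reader.close()
  }
}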

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala

Lines changed: 3 additions & 3 deletions
@@ -29,12 +29,12 @@ class DataSourceRDDPartition[T : ClassTag](val index: Int, val inputPartition: I
 
 class DataSourceRDD[T: ClassTag](
     sc: SparkContext,
-    @transient private val readerFactories: Seq[InputPartition[T]])
+    @transient private val inputPartitions: Seq[InputPartition[T]])
   extends RDD[T](sc, Nil) {
 
   override protected def getPartitions: Array[Partition] = {
-    readerFactories.zipWithIndex.map {
-      case (readerFactory, index) => new DataSourceRDDPartition(index, readerFactory)
+    inputPartitions.zipWithIndex.map {
+      case (inputPartition, index) => new DataSourceRDDPartition(index, inputPartition)
     }.toArray
   }

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala

Lines changed: 2 additions & 2 deletions
@@ -85,7 +85,7 @@ class RateStreamContinuousReader(options: DataSourceOptions)
       val start = partitionStartMap(i)
       // Have each partition advance by numPartitions each row, with starting points staggered
       // by their partition index.
-      RateStreamContinuousDataReaderFactory(
+      RateStreamContinuousInputPartition(
         start.value,
         start.runTimeMs,
         i,
@@ -113,7 +113,7 @@ class RateStreamContinuousReader(options: DataSourceOptions)
 
 }
 
-case class RateStreamContinuousDataReaderFactory(
+case class RateStreamContinuousInputPartition(
     startValue: Long,
     startTimeMs: Long,
     partitionIndex: Int,

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala

Lines changed: 2 additions & 2 deletions
@@ -156,7 +156,7 @@ case class MemoryStream[A : Encoder](id: Int, sqlContext: SQLContext)
     logDebug(generateDebugString(newBlocks.flatten, startOrdinal, endOrdinal))
 
     newBlocks.map { block =>
-      new MemoryStreamDataReaderFactory(block).asInstanceOf[InputPartition[UnsafeRow]]
+      new MemoryStreamInputPartition(block).asInstanceOf[InputPartition[UnsafeRow]]
     }.asJava
   }
 }
@@ -201,7 +201,7 @@ case class MemoryStream[A : Encoder](id: Int, sqlContext: SQLContext)
 }
 
 
-class MemoryStreamDataReaderFactory(records: Array[UnsafeRow])
+class MemoryStreamInputPartition(records: Array[UnsafeRow])
   extends InputPartition[UnsafeRow] {
   override def createPartitionReader(): InputPartitionReader[UnsafeRow] = {
     new InputPartitionReader[UnsafeRow] {

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala

Lines changed: 6 additions & 6 deletions
@@ -44,8 +44,8 @@ import org.apache.spark.util.RpcUtils
  * * ContinuousMemoryStream maintains a list of records for each partition. addData() will
  *   distribute records evenly-ish across partitions.
  * * RecordEndpoint is set up as an endpoint for executor-side
- *   ContinuousMemoryStreamDataReader instances to poll. It returns the record at the specified
- *   offset within the list, or null if that offset doesn't yet have a record.
+ *   ContinuousMemoryStreamInputPartitionReader instances to poll. It returns the record at
+ *   the specified offset within the list, or null if that offset doesn't yet have a record.
  */
 class ContinuousMemoryStream[A : Encoder](id: Int, sqlContext: SQLContext, numPartitions: Int = 2)
   extends MemoryStreamBase[A](sqlContext) with ContinuousReader with ContinuousReadSupport {
@@ -106,7 +106,7 @@ class ContinuousMemoryStream[A : Encoder](id: Int, sqlContext: SQLContext, numPa
 
     startOffset.partitionNums.map {
       case (part, index) =>
-        new ContinuousMemoryStreamDataReaderFactory(
+        new ContinuousMemoryStreamInputPartition(
           endpointName, part, index): InputPartition[Row]
     }.toList.asJava
   }
@@ -157,9 +157,9 @@ object ContinuousMemoryStream {
 }
 
 /**
- * Data reader factory for continuous memory stream.
+ * An input partition for continuous memory stream.
  */
-class ContinuousMemoryStreamDataReaderFactory(
+class ContinuousMemoryStreamInputPartition(
     driverEndpointName: String,
     partition: Int,
     startOffset: Int) extends InputPartition[Row] {
@@ -168,7 +168,7 @@ class ContinuousMemoryStreamDataReaderFactory(
 }
 
 /**
- * Data reader for continuous memory stream.
+ * An input partition reader for continuous memory stream.
  *
  * Polls the driver endpoint for new records.
  */
