Commit 2cffd18

cloud-fan authored and rdblue committed
[SPARK-24991][SQL] use InternalRow in DataSourceWriter
A follow-up of apache#21118. Since we use `InternalRow` in the read API of data source v2, we should do the same thing for the write API. Tested with existing tests. Author: Wenchen Fan <wenchen@databricks.com> Closes apache#21948 from cloud-fan/row-write.
1 parent cd7f5a7 commit 2cffd18
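
For orientation, the shape of a v2 writer after this change is sketched below. This is a minimal, hypothetical example (ExampleWriter, ExampleWriterFactory and ExampleDataWriter are made-up names, not part of the patch): createWriterFactory() now returns a DataWriterFactory[InternalRow], so Spark hands rows in the internal format to the writer directly instead of going through a Row-based factory.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.writer._

// Hypothetical example, not part of this commit: a DataSourceWriter whose
// factory is parameterized with InternalRow, as the updated API requires.
class ExampleWriter extends DataSourceWriter {
  override def createWriterFactory(): DataWriterFactory[InternalRow] = new ExampleWriterFactory

  override def commit(messages: Array[WriterCommitMessage]): Unit = {}
  override def abort(messages: Array[WriterCommitMessage]): Unit = {}
}

class ExampleWriterFactory extends DataWriterFactory[InternalRow] {
  // Factory signature as of this commit: (partitionId, attemptNumber).
  override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] =
    new ExampleDataWriter
}

// A do-nothing DataWriter, just so the sketch is self-contained.
class ExampleDataWriter extends DataWriter[InternalRow] {
  override def write(record: InternalRow): Unit = ()
  override def commit(): WriterCommitMessage = null
  override def abort(): Unit = ()
}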

File tree

7 files changed (+15, -141 lines)

sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataSourceWriter.java

Lines changed: 2 additions & 2 deletions
@@ -18,8 +18,8 @@
 package org.apache.spark.sql.sources.v2.writer;
 
 import org.apache.spark.annotation.InterfaceStability;
-import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
 import org.apache.spark.sql.sources.v2.WriteSupport;
 import org.apache.spark.sql.types.StructType;
@@ -57,7 +57,7 @@ public interface DataSourceWriter {
    * If this method fails (by throwing an exception), the action will fail and no Spark job will be
    * submitted.
    */
-  DataWriterFactory<Row> createWriterFactory();
+  DataWriterFactory<InternalRow> createWriterFactory();
 
   /**
    * Returns whether Spark should use the commit coordinator to ensure that only one attempt for

sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriter.java

Lines changed: 1 addition & 3 deletions
@@ -49,9 +49,7 @@
  * successfully, and have a way to revert committed data writers without the commit message, because
  * Spark only accepts the commit message that arrives first and ignore others.
  *
- * Note that, Currently the type `T` can only be {@link org.apache.spark.sql.Row} for normal data
- * source writers, or {@link org.apache.spark.sql.catalyst.InternalRow} for data source writers
- * that mix in {@link SupportsWriteInternalRow}.
+ * Note that, Currently the type `T` can only be {@link org.apache.spark.sql.catalyst.InternalRow}.
  */
 @InterfaceStability.Evolving
 public interface DataWriter<T> {
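
With T now fixed to InternalRow, a writer reads columns through the positional, typed getters of the internal format; string columns in particular arrive as UTF8String rather than java.lang.String. A minimal sketch (LoggingDataWriter is a made-up name and the two-column schema is assumed):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.writer.{DataWriter, WriterCommitMessage}
import org.apache.spark.unsafe.types.UTF8String

// Hypothetical writer for an assumed schema (id: Long, name: String).
class LoggingDataWriter extends DataWriter[InternalRow] {
  override def write(record: InternalRow): Unit = {
    val id: Long = record.getLong(0)               // typed, positional access
    val name: UTF8String = record.getUTF8String(1) // strings come back as UTF8String
    println(s"$id,$name")
  }

  override def commit(): WriterCommitMessage = null
  override def abort(): Unit = ()
}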

sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java

Lines changed: 4 additions & 1 deletion
@@ -33,7 +33,10 @@
 public interface DataWriterFactory<T> extends Serializable {
 
   /**
-   * Returns a data writer to do the actual writing work.
+   * Returns a data writer to do the actual writing work. Note that, Spark will reuse the same data
+   * object instance when sending data to the data writer, for better performance. Data writers
+   * are responsible for defensive copies if necessary, e.g. copy the data before buffer it in a
+   * list.
    *
    * If this method fails (by throwing an exception), the action will fail and no Spark job will be
    * submitted.
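
The added javadoc is the important behavioural contract here: Spark may reuse the same InternalRow instance across write() calls, so a writer that holds on to records must copy them first. A minimal sketch of such a defensive copy (BufferingDataWriter is a made-up name, not from this commit):

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.writer.{DataWriter, WriterCommitMessage}

// Hypothetical buffering writer: without the copy(), every buffered entry
// could end up pointing at the single, reused row object.
class BufferingDataWriter extends DataWriter[InternalRow] {
  private val buffer = ArrayBuffer.empty[InternalRow]

  override def write(record: InternalRow): Unit = {
    buffer += record.copy() // defensive copy before buffering, per the new javadoc
  }

  override def commit(): WriterCommitMessage = {
    // flush the buffered rows to the sink here, then report success
    null
  }

  override def abort(): Unit = buffer.clear()
}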

sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/SupportsWriteInternalRow.java

Lines changed: 0 additions & 41 deletions
This file was deleted.

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala

Lines changed: 1 addition & 25 deletions
@@ -47,11 +47,7 @@ case class WriteToDataSourceV2Exec(writer: DataSourceWriter, query: SparkPlan) e
   override def output: Seq[Attribute] = Nil
 
   override protected def doExecute(): RDD[InternalRow] = {
-    val writeTask = writer match {
-      case w: SupportsWriteInternalRow => w.createInternalRowWriterFactory()
-      case _ => new InternalRowDataWriterFactory(writer.createWriterFactory(), query.schema)
-    }
-
+    val writeTask = writer.createWriterFactory()
     val useCommitCoordinator = writer.useCommitCoordinator
     val rdd = query.execute()
     val messages = new Array[WriterCommitMessage](rdd.partitions.length)
@@ -142,23 +138,3 @@ object DataWritingSparkTask extends Logging {
   }
 }
 
-class InternalRowDataWriterFactory(
-    rowWriterFactory: DataWriterFactory[Row],
-    schema: StructType) extends DataWriterFactory[InternalRow] {
-
-  override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = {
-    new InternalRowDataWriter(
-      rowWriterFactory.createDataWriter(partitionId, attemptNumber),
-      RowEncoder.apply(schema).resolveAndBind())
-  }
-}
-
-class InternalRowDataWriter(rowWriter: DataWriter[Row], encoder: ExpressionEncoder[Row])
-  extends DataWriter[InternalRow] {
-
-  override def write(record: InternalRow): Unit = rowWriter.write(encoder.fromRow(record))
-
-  override def commit(): WriterCommitMessage = rowWriter.commit()
-
-  override def abort(): Unit = rowWriter.abort()
-}

sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala

Lines changed: 0 additions & 7 deletions
@@ -188,13 +188,6 @@ class DataSourceV2Suite extends QueryTest with SharedSQLContext {
       assert(e2.getMessage.contains("Writing job aborted"))
       // make sure we don't have partial data.
       assert(spark.read.format(cls.getName).option("path", path).load().collect().isEmpty)
-
-      // test internal row writer
-      spark.range(5).select('id as 'i, -'id as 'j).write.format(cls.getName)
-        .option("path", path).option("internal", "true").mode("overwrite").save()
-      checkAnswer(
-        spark.read.format(cls.getName).option("path", path).load(),
-        spark.range(5).select('id, -'id))
     }
   }
 }

sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala

Lines changed: 7 additions & 62 deletions
@@ -26,7 +26,7 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, Path}
 
 import org.apache.spark.SparkContext
-import org.apache.spark.sql.{Row, SaveMode}
+import org.apache.spark.sql.SaveMode
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, InputPartition, InputPartitionReader}
 import org.apache.spark.sql.sources.v2.writer._
@@ -65,9 +65,9 @@ class SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteS
   }
 
   class Writer(jobId: String, path: String, conf: Configuration) extends DataSourceWriter {
-    override def createWriterFactory(): DataWriterFactory[Row] = {
+    override def createWriterFactory(): DataWriterFactory[InternalRow] = {
       SimpleCounter.resetCounter
-      new SimpleCSVDataWriterFactory(path, jobId, new SerializableConfiguration(conf))
+      new CSVDataWriterFactory(path, jobId, new SerializableConfiguration(conf))
     }
 
     override def onDataWriterCommit(message: WriterCommitMessage): Unit = {
@@ -97,18 +97,6 @@ class SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteS
     }
   }
 
-  class InternalRowWriter(jobId: String, path: String, conf: Configuration)
-    extends Writer(jobId, path, conf) with SupportsWriteInternalRow {
-
-    override def createWriterFactory(): DataWriterFactory[Row] = {
-      throw new IllegalArgumentException("not expected!")
-    }
-
-    override def createInternalRowWriterFactory(): DataWriterFactory[InternalRow] = {
-      new InternalRowCSVDataWriterFactory(path, jobId, new SerializableConfiguration(conf))
-    }
-  }
-
   override def createReader(options: DataSourceOptions): DataSourceReader = {
     val path = new Path(options.get("path").get())
     val conf = SparkContext.getActive.get.hadoopConfiguration
@@ -124,7 +112,6 @@ class SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteS
     assert(!SparkContext.getActive.get.conf.getBoolean("spark.speculation", false))
 
     val path = new Path(options.get("path").get())
-    val internal = options.get("internal").isPresent
     val conf = SparkContext.getActive.get.hadoopConfiguration
     val fs = path.getFileSystem(conf)
 
@@ -142,17 +129,8 @@ class SimpleWritableDataSource extends DataSourceV2 with ReadSupport with WriteS
       fs.delete(path, true)
     }
 
-    Optional.of(createWriter(jobId, path, conf, internal))
-  }
-
-  private def createWriter(
-      jobId: String, path: Path, conf: Configuration, internal: Boolean): DataSourceWriter = {
     val pathStr = path.toUri.toString
-    if (internal) {
-      new InternalRowWriter(jobId, pathStr, conf)
-    } else {
-      new Writer(jobId, pathStr, conf)
-    }
+    Optional.of(new Writer(jobId, pathStr, conf))
   }
 }
 
@@ -204,51 +182,18 @@ private[v2] object SimpleCounter {
   }
 }
 
-class SimpleCSVDataWriterFactory(path: String, jobId: String, conf: SerializableConfiguration)
-  extends DataWriterFactory[Row] {
-
-  override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[Row] = {
-    val jobPath = new Path(new Path(path, "_temporary"), jobId)
-    val filePath = new Path(jobPath, s"$jobId-$partitionId-$attemptNumber")
-    val fs = filePath.getFileSystem(conf.value)
-    new SimpleCSVDataWriter(fs, filePath)
-  }
-}
-
-class SimpleCSVDataWriter(fs: FileSystem, file: Path) extends DataWriter[Row] {
-
-  private val out = fs.create(file)
-
-  override def write(record: Row): Unit = {
-    out.writeBytes(s"${record.getLong(0)},${record.getLong(1)}\n")
-  }
-
-  override def commit(): WriterCommitMessage = {
-    out.close()
-    null
-  }
-
-  override def abort(): Unit = {
-    try {
-      out.close()
-    } finally {
-      fs.delete(file, false)
-    }
-  }
-}
-
-class InternalRowCSVDataWriterFactory(path: String, jobId: String, conf: SerializableConfiguration)
+class CSVDataWriterFactory(path: String, jobId: String, conf: SerializableConfiguration)
   extends DataWriterFactory[InternalRow] {
 
   override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = {
     val jobPath = new Path(new Path(path, "_temporary"), jobId)
     val filePath = new Path(jobPath, s"$jobId-$partitionId-$attemptNumber")
     val fs = filePath.getFileSystem(conf.value)
-    new InternalRowCSVDataWriter(fs, filePath)
+    new CSVDataWriter(fs, filePath)
  }
 }
 
-class InternalRowCSVDataWriter(fs: FileSystem, file: Path) extends DataWriter[InternalRow] {
+class CSVDataWriter(fs: FileSystem, file: Path) extends DataWriter[InternalRow] {
 
   private val out = fs.create(file)
 