
[SPARK-29331][SQL] create DS v2 Write at physical plan #26001

Closed
wants to merge 2 commits

@@ -50,7 +50,7 @@ private[kafka010] object KafkaWriter extends Logging {
topic: Option[String] = None): Unit = {
schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse(
if (topic.isEmpty) {
throw new AnalysisException(s"topic option required when no " +
throw new IllegalArgumentException(s"topic option required when no " +

Contributor Author:

Now we check the options at the physical plan phase, so this should no longer be an AnalysisException.

Contributor Author:

If this is unacceptable, then we may need separate analysis-time write info and runtime write info: Table.newWriteBuilder would take the analysis-time write info, and WriteBuilder.build would take the runtime write info.

I'm not sure if it's worth this complexity.
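
For illustration, that split might look roughly like the sketch below. The names `AnalysisWriteInfo`, `RuntimeWriteInfo`, `SketchWriteBuilder`, `SketchTable` and their fields are assumptions made up for this example; they are not part of this PR or of the existing API.

```scala
// Hypothetical sketch only: splits the write info into an analysis-time part and a
// runtime part, as described above. Names and fields are illustrative assumptions.
import org.apache.spark.sql.connector.write.BatchWrite
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

// Everything that is already known while the query is being analyzed.
case class AnalysisWriteInfo(
    queryId: String,
    schema: StructType,
    options: CaseInsensitiveStringMap)

// Things that are only known once the physical plan actually runs.
case class RuntimeWriteInfo(numInputPartitions: Int)

trait SketchWriteBuilder {
  // The runtime info arrives late, so it is an argument of build(...).
  def build(info: RuntimeWriteInfo): BatchWrite
}

trait SketchTable {
  // The analysis-time info is available when the builder is created, so option and
  // schema validation could still fail with AnalysisException during analysis.
  def newWriteBuilder(info: AnalysisWriteInfo): SketchWriteBuilder
}
```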

Contributor:

Should it be SparkException? I think the last time we discussed these, it wasn't clear what type of exception to use after analysis. Maybe we need new exception types?

Contributor Author:

We can use SparkException as well. IllegalArgumentException is a standard Java exception that indicates invalid input, so I think it's OK to use it even after analysis.

Contributor:

I think we typically want to always raise SparkException because all exception types inherit from it. Unless we are throwing an exception from a method where there is an illegal argument, but that's not what is happening here.
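
(As a generic illustration of the convention being discussed, not code from this PR: a method rejecting a bad argument throws IllegalArgumentException, while a failure during execution is wrapped in SparkException. The method names below are made up.)

```scala
// Generic illustration of the IllegalArgumentException vs SparkException convention.
import org.apache.spark.SparkException

object ExceptionConvention {
  // IllegalArgumentException: the caller passed this method a bad argument.
  def parseNumPartitions(value: String): Int =
    scala.util.Try(value.toInt).toOption.filter(_ > 0).getOrElse(
      throw new IllegalArgumentException(s"Invalid partition count: $value"))

  // SparkException: something failed while Spark was running the work itself.
  def runWrite(write: () => Unit): Unit =
    try write() catch {
      case e: Exception => throw new SparkException("Writing job aborted.", e)
    }
}
```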

Contributor Author:

> I think we typically want to always raise SparkException because all exception types inherit from it.

In Spark SQL, no exceptions inherit from it. In fact, SparkException was rarely used in Spark SQL before we added the v2 commands. SparkException is defined in Spark core and is usually used when Spark fails to run a task.

In Spark SQL, AnalysisException and standard Java exceptions are more widely used.
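
(For reference, the two declarations have roughly the shape below; the real constructors take more parameters, but the point is the inheritance.)

```scala
// Simplified shape of the two classes (real constructors have more parameters);
// the point is that AnalysisException extends Exception, not SparkException.

// Lives in the core module, package org.apache.spark:
class SparkException(message: String, cause: Throwable)
  extends Exception(message, cause)

// Lives in the catalyst module, package org.apache.spark.sql:
class AnalysisException(val message: String, val cause: Option[Throwable] = None)
  extends Exception(message) with Serializable
```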

Contributor:

Sorry, I thought that AnalysisException inherited from SparkException. Looks like I was wrong.

s"'$TOPIC_ATTRIBUTE_NAME' attribute is present. Use the " +
s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a topic.")
} else {
@@ -59,22 +59,22 @@ private[kafka010] object KafkaWriter extends Logging {
).dataType match {
case StringType => // good
case _ =>
throw new AnalysisException(s"Topic type must be a ${StringType.catalogString}")
throw new IllegalArgumentException(s"Topic type must be a ${StringType.catalogString}")
}
schema.find(_.name == KEY_ATTRIBUTE_NAME).getOrElse(
Literal(null, StringType)
).dataType match {
case StringType | BinaryType => // good
case _ =>
throw new AnalysisException(s"$KEY_ATTRIBUTE_NAME attribute type " +
throw new IllegalArgumentException(s"$KEY_ATTRIBUTE_NAME attribute type " +
s"must be a ${StringType.catalogString} or ${BinaryType.catalogString}")
}
schema.find(_.name == VALUE_ATTRIBUTE_NAME).getOrElse(
throw new AnalysisException(s"Required attribute '$VALUE_ATTRIBUTE_NAME' not found")
throw new IllegalArgumentException(s"Required attribute '$VALUE_ATTRIBUTE_NAME' not found")
).dataType match {
case StringType | BinaryType => // good
case _ =>
throw new AnalysisException(s"$VALUE_ATTRIBUTE_NAME attribute type " +
throw new IllegalArgumentException(s"$VALUE_ATTRIBUTE_NAME attribute type " +
s"must be a ${StringType.catalogString} or ${BinaryType.catalogString}")
}
schema.find(_.name == HEADERS_ATTRIBUTE_NAME).getOrElse(
@@ -83,7 +83,7 @@ private[kafka010] object KafkaWriter extends Logging {
).dataType match {
case KafkaRecordToRowConverter.headersType => // good
case _ =>
throw new AnalysisException(s"$HEADERS_ATTRIBUTE_NAME attribute type " +
throw new IllegalArgumentException(s"$HEADERS_ATTRIBUTE_NAME attribute type " +
s"must be a ${KafkaRecordToRowConverter.headersType.catalogString}")
}
}
@@ -25,6 +25,8 @@ import org.scalatest.time.SpanSugar._

import org.apache.spark.sql.{AnalysisException, DataFrame, Row}
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, SpecificInternalRow, UnsafeProjection}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.execution.streaming.sources.ContinuousMemoryStream
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.types.{BinaryType, DataType}
import org.apache.spark.util.Utils
@@ -215,6 +217,7 @@ class KafkaContinuousSinkSuite extends KafkaContinuousTest {
test("streaming - write data with bad schema") {
val inputTopic = newTopic()
testUtils.createTopic(inputTopic, partitions = 1)
testUtils.sendMessages(inputTopic, Array("0"))

val input = spark
.readStream
@@ -226,21 +229,21 @@ class KafkaContinuousSinkSuite extends KafkaContinuousTest {
val topic = newTopic()
testUtils.createTopic(topic)

val ex = intercept[AnalysisException] {
val ex = intercept[StreamingQueryException] {
/* No topic field or topic option */
createKafkaWriter(input.toDF())(
withSelectExpr = "value as key", "value"
)
).processAllAvailable()
}
assert(ex.getMessage
.toLowerCase(Locale.ROOT)
.contains("topic option required when no 'topic' attribute is present"))

val ex2 = intercept[AnalysisException] {
val ex2 = intercept[StreamingQueryException] {
/* No value field */
createKafkaWriter(input.toDF())(
withSelectExpr = s"'$topic' as topic", "value as key"
)
).processAllAvailable()
}
assert(ex2.getMessage.toLowerCase(Locale.ROOT).contains(
"required attribute 'value' not found"))
@@ -249,6 +252,7 @@ class KafkaContinuousSinkSuite extends KafkaContinuousTest {
test("streaming - write data with valid schema but wrong types") {
val inputTopic = newTopic()
testUtils.createTopic(inputTopic, partitions = 1)
testUtils.sendMessages(inputTopic, Array("0"))

val input = spark
.readStream
@@ -261,28 +265,28 @@ class KafkaContinuousSinkSuite extends KafkaContinuousTest {
val topic = newTopic()
testUtils.createTopic(topic)

val ex = intercept[AnalysisException] {
val ex = intercept[StreamingQueryException] {
/* topic field wrong type */
createKafkaWriter(input.toDF())(
withSelectExpr = s"CAST('1' as INT) as topic", "value"
)
).processAllAvailable()
}
assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("topic type must be a string"))

val ex2 = intercept[AnalysisException] {
val ex2 = intercept[StreamingQueryException] {
/* value field wrong type */
createKafkaWriter(input.toDF())(
withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as value"
)
).processAllAvailable()
}
assert(ex2.getMessage.toLowerCase(Locale.ROOT).contains(
"value attribute type must be a string or binary"))

val ex3 = intercept[AnalysisException] {
val ex3 = intercept[StreamingQueryException] {
/* key field wrong type */
createKafkaWriter(input.toDF())(
withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as key", "value"
)
).processAllAvailable()
}
assert(ex3.getMessage.toLowerCase(Locale.ROOT).contains(
"key attribute type must be a string or binary"))
@@ -330,18 +334,18 @@ class KafkaContinuousSinkSuite extends KafkaContinuousTest {
.option("subscribe", inputTopic)
.load()

val ex = intercept[IllegalArgumentException] {
val ex = intercept[StreamingQueryException] {
createKafkaWriter(
input.toDF(),
withOptions = Map("kafka.key.serializer" -> "foo"))()
withOptions = Map("kafka.key.serializer" -> "foo"))().processAllAvailable()
}
assert(ex.getMessage.toLowerCase(Locale.ROOT).contains(
"kafka option 'key.serializer' is not supported"))

val ex2 = intercept[IllegalArgumentException] {
val ex2 = intercept[StreamingQueryException] {
createKafkaWriter(
input.toDF(),
withOptions = Map("kafka.value.serializer" -> "foo"))()
withOptions = Map("kafka.value.serializer" -> "foo"))().processAllAvailable()
}
assert(ex2.getMessage.toLowerCase(Locale.ROOT).contains(
"kafka option 'value.serializer' is not supported"))
@@ -29,6 +29,7 @@ import org.apache.spark.sql.connector.read.{Scan, ScanBuilder, SupportsPushDownF
import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, MicroBatchStream}
import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan}
import org.apache.spark.sql.execution.datasources.DataSourceStrategy
import org.apache.spark.sql.execution.streaming.{WriteMicroBatch, WriteMicroBatchExec}
import org.apache.spark.sql.execution.streaming.continuous.{ContinuousCoalesceExec, WriteToContinuousDataSource, WriteToContinuousDataSourceExec}
import org.apache.spark.sql.sources
import org.apache.spark.sql.util.CaseInsensitiveStringMap
@@ -176,9 +177,6 @@ object DataSourceV2Strategy extends Strategy with PredicateHelper {

withProjection :: Nil

case WriteToDataSourceV2(writer, query) =>
WriteToDataSourceV2Exec(writer, planLater(query)) :: Nil

case CreateV2Table(catalog, ident, schema, parts, props, ifNotExists) =>
CreateTableExec(catalog, ident, schema, parts, props, ifNotExists) :: Nil

@@ -265,8 +263,13 @@ object DataSourceV2Strategy extends Strategy with PredicateHelper {
}).toArray
DeleteFromTableExec(r.table.asDeletable, filters) :: Nil

case WriteToContinuousDataSource(writer, query) =>
WriteToContinuousDataSourceExec(writer, planLater(query)) :: Nil
case WriteMicroBatch(table, query, queryId, querySchema, outputMode, options, epochId) =>
WriteMicroBatchExec(
table, planLater(query), queryId, querySchema, outputMode, options, epochId) :: Nil

case WriteToContinuousDataSource(table, query, queryId, querySchema, outputMode, options) =>
WriteToContinuousDataSourceExec(
table, planLater(query), queryId, querySchema, outputMode, options) :: Nil

case Repartition(1, false, child) =>
val isContinuous = child.find {
@@ -38,17 +38,6 @@ import org.apache.spark.sql.sources.{AlwaysTrue, Filter}
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.util.{LongAccumulator, Utils}

/**
* Deprecated logical plan for writing data into data source v2. This is being replaced by more
* specific logical plans, like [[org.apache.spark.sql.catalyst.plans.logical.AppendData]].
*/
@deprecated("Use specific logical plans like AppendData instead", "2.4.0")
case class WriteToDataSourceV2(batchWrite: BatchWrite, query: LogicalPlan)
extends LogicalPlan {
override def children: Seq[LogicalPlan] = Seq(query)
override def output: Seq[Attribute] = Nil
}

/**
* Physical plan node for v2 create table as select when the catalog does not support staging
* the table creation.
@@ -315,17 +304,6 @@ case class OverwritePartitionsDynamicExec(
}
}

case class WriteToDataSourceV2Exec(
batchWrite: BatchWrite,
query: SparkPlan) extends V2TableWriteExec {

def writeOptions: CaseInsensitiveStringMap = CaseInsensitiveStringMap.empty()

override protected def doExecute(): RDD[InternalRow] = {
writeWithV2(batchWrite)
}
}

/**
* Helper for physical plans that build batch writes.
*/
@@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution.streaming

import scala.collection.JavaConverters._

import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.{Append, Complete, Update}
import org.apache.spark.sql.connector.catalog.SupportsWrite
import org.apache.spark.sql.connector.write.SupportsTruncate
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.util.Utils

trait BaseStreamingWriteExec extends UnaryExecNode {
def table: SupportsWrite
def query: SparkPlan
def queryId: String
def querySchema: StructType
def outputMode: OutputMode
def options: Map[String, String]

override def child: SparkPlan = query
override def output: Seq[Attribute] = Nil

protected lazy val inputRDD = query.execute()
lazy val streamWrite = {
val writeBuilder = table.newWriteBuilder(new CaseInsensitiveStringMap(options.asJava))
.withQueryId(queryId)
.withInputDataSchema(querySchema)
outputMode match {
case Append =>
writeBuilder.buildForStreaming()

case Complete =>
// TODO: we should do this check earlier when we have capability API.
require(writeBuilder.isInstanceOf[SupportsTruncate],
table.name + " does not support Complete mode.")
writeBuilder.asInstanceOf[SupportsTruncate].truncate().buildForStreaming()

case Update =>
// No v2 sinks really support Update mode yet, but during tests we do want them
// to pretend to support Update mode, treating it the same as Append mode.
if (Utils.isTesting) {
writeBuilder.buildForStreaming()
} else {
throw new IllegalArgumentException(
"Data source v2 streaming sinks does not support Update mode.")
}
}
}
}
@@ -27,8 +27,8 @@ import org.apache.spark.sql.catalyst.util.truncatedString
import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability}
import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, Offset => OffsetV2, SparkDataStream}
import org.apache.spark.sql.execution.SQLExecution
import org.apache.spark.sql.execution.datasources.v2.{StreamingDataSourceV2Relation, StreamWriterCommitProgress, WriteToDataSourceV2Exec}
import org.apache.spark.sql.execution.streaming.sources.{RateControlMicroBatchStream, WriteToMicroBatchDataSource}
import org.apache.spark.sql.execution.datasources.v2.{StreamingDataSourceV2Relation, StreamWriterCommitProgress}
import org.apache.spark.sql.execution.streaming.sources.RateControlMicroBatchStream
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.util.Clock
@@ -127,8 +127,8 @@ class MicroBatchExecution(
// TODO (SPARK-27484): we should add the writing node before the plan is analyzed.
sink match {
case s: SupportsWrite =>
val streamingWrite = createStreamingWrite(s, extraOptions, _logicalPlan)
WriteToMicroBatchDataSource(streamingWrite, _logicalPlan)
WriteToMicroBatchDataSource(
s, _logicalPlan, id.toString, _logicalPlan.schema, outputMode, extraOptions)

case _ => _logicalPlan
}
@@ -557,7 +557,7 @@ class MicroBatchExecution(
nextBatch.collect()
}
lastExecution.executedPlan match {
case w: WriteToDataSourceV2Exec => w.commitProgress
case w: WriteMicroBatchExec => w.commitProgress
case _ => None
}
}
@@ -584,35 +584,6 @@ abstract class StreamExecution(
|batch = $batchDescription""".stripMargin
}

protected def createStreamingWrite(
table: SupportsWrite,
options: Map[String, String],
inputPlan: LogicalPlan): StreamingWrite = {
val writeBuilder = table.newWriteBuilder(new CaseInsensitiveStringMap(options.asJava))
.withQueryId(id.toString)
.withInputDataSchema(inputPlan.schema)
outputMode match {
case Append =>
writeBuilder.buildForStreaming()

case Complete =>
// TODO: we should do this check earlier when we have capability API.
require(writeBuilder.isInstanceOf[SupportsTruncate],
table.name + " does not support Complete mode.")
writeBuilder.asInstanceOf[SupportsTruncate].truncate().buildForStreaming()

case Update =>
// Although no v2 sinks really support Update mode now, but during tests we do want them
// to pretend to support Update mode, and treat Update mode same as Append mode.
if (Utils.isTesting) {
writeBuilder.buildForStreaming()
} else {
throw new IllegalArgumentException(
"Data source v2 streaming sinks does not support Update mode.")
}
}
}

protected def purge(threshold: Long): Unit = {
logDebug(s"Purging metadata at threshold=$threshold")
offsetLog.purge(threshold)