
Commit 65fed3a

huanliwang-db authored and anishshri-db committed
[SPARK-53303][SS][CONNECT] Use the empty state encoder when the initial state is not provided in TWS
### What changes were proposed in this pull request?

`agnosticEncoderFor[S]` returns the wrong encoder when no initial state is provided in transformWithState (TWS). We should create an empty state encoder and use that instead.

### Why are the changes needed?

Fixes the incorrect behavior for TWS without an initial state in Spark Connect.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

N/A: unfortunately, this field is not used so far and it is not easy to provide test coverage for it.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes apache#52056 from huanliwang-db/huanliwang-db/fix-tws.

Authored-by: huanliwang-db <huanli.wang@databricks.com>
Signed-off-by: Anish Shrigondekar <anish.shrigondekar@databricks.com>
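For illustration, here is a minimal standalone sketch of the encoder-selection pattern this change introduces. The `InitialStateEncoderSketch` object, its `pickInitialStateEncoder` helper, the `hasInitialState` flag, and the example package are hypothetical and not part of the patch; the sketch also assumes it lives under the `org.apache.spark.sql` package, since the catalyst encoder helpers it calls are Spark-internal APIs.

```scala
package org.apache.spark.sql.example // hypothetical package, kept under org.apache.spark.sql
                                     // because the encoder helpers below are internal APIs

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.catalyst.encoders.AgnosticEncoder
import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.agnosticEncoderFor

object InitialStateEncoderSketch {

  // Zero-field sentinel mirroring the EmptyInitialStateStruct case class added by the patch.
  case class EmptyInitialStateStruct()

  // Hypothetical helper (not part of the patch): resolve the encoder for S only when an
  // initial state was actually supplied; otherwise fall back to the empty sentinel encoder.
  // This mirrors the branch the fix adds to transformWithStateHelper.
  def pickInitialStateEncoder[S: Encoder](hasInitialState: Boolean): AgnosticEncoder[_] = {
    if (hasInitialState) {
      agnosticEncoderFor[S]
    } else {
      ScalaReflection.encoderFor[EmptyInitialStateStruct]
    }
  }
}
```

In the patched `transformWithStateHelper`, the equivalent condition is `initialState.isDefined`, and the chosen encoder replaces the old `stateEncoder` in the `inputEncoders` sequence.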
1 parent 528f3a7 · commit 65fed3a

File tree

1 file changed (+20 -2 lines)


sql/connect/common/src/main/scala/org/apache/spark/sql/connect/KeyValueGroupedDataset.scala

Lines changed: 20 additions & 2 deletions

@@ -24,6 +24,7 @@ import org.apache.spark.api.java.function._
 import org.apache.spark.connect.proto
 import org.apache.spark.sql
 import org.apache.spark.sql.{Column, Encoder, TypedColumn}
+import org.apache.spark.sql.catalyst.ScalaReflection
 import org.apache.spark.sql.catalyst.encoders.AgnosticEncoder
 import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{agnosticEncoderFor, ProductEncoder, StructEncoder}
 import org.apache.spark.sql.connect.ColumnNodeToProtoConverter.{toExpr, toExprWithTransformation, toTypedExpr}
@@ -658,8 +659,14 @@ private class KeyValueGroupedDatasetImpl[K, V, IK, IV](
       initialState: Option[sql.KeyValueGroupedDataset[K, S]] = None,
       eventTimeColumnName: String = ""): Dataset[U] = {
     val outputEncoder = agnosticEncoderFor[U]
-    val stateEncoder = agnosticEncoderFor[S]
-    val inputEncoders: Seq[AgnosticEncoder[_]] = Seq(kEncoder, stateEncoder, ivEncoder)
+    val initialStateEncoder = if (initialState.isDefined) {
+      agnosticEncoderFor[S]
+    } else {
+      // Cannot use `agnosticEncoderFor[S]` here because it points to incorrect encoder
+      // when the initial state is not provided. Using an empty state encoder instead.
+      ScalaReflection.encoderFor[EmptyInitialStateStruct]
+    }
+    val inputEncoders: Seq[AgnosticEncoder[_]] = Seq(kEncoder, initialStateEncoder, ivEncoder)
 
     // SparkUserDefinedFunction is creating a udfPacket where the input function are
     // being java serialized into bytes; we pass in `statefulProcessor` as function so it can be
@@ -780,3 +787,14 @@ private object KeyValueGroupedDatasetImpl {
     case _ => false
   }
 }
+
+/**
+ * A marker case class used as a placeholder type for initial state encoders when no actual
+ * initial state is provided to stateful streaming operations.
+ *
+ * In the `transformWithStateHelper` method, when `initialState` is not provided, we cannot use
+ * `agnosticEncoderFor[S]` for the initial state encoder because it would incorrectly point to the
+ * other encoders. Instead, we use `EmptyStruct` as a sentinel type to create a proper encoder
+ * that represents the absence of initial state data.
+ */
+case class EmptyInitialStateStruct()
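As a sanity check on the fallback, here is a small sketch of what `ScalaReflection.encoderFor` derives for a zero-field case class. The wrapper object, the local copy of the sentinel class, and the example package are hypothetical and not part of the patch, and the printed schema shape is an expectation rather than verified output.

```scala
package org.apache.spark.sql.example // hypothetical package; ScalaReflection is a Spark-internal API

import org.apache.spark.sql.catalyst.ScalaReflection

object EmptyStateEncoderCheck {
  // Local stand-in mirroring the zero-field EmptyInitialStateStruct added by the patch.
  case class EmptyInitialStateStruct()

  def main(args: Array[String]): Unit = {
    val emptyStateEncoder = ScalaReflection.encoderFor[EmptyInitialStateStruct]
    // Expected to print an empty struct schema, i.e. a StructType with zero fields.
    println(emptyStateEncoder.schema)
  }
}
```

Because the sentinel has no fields, the derived encoder should describe an empty struct, so no initial-state columns are implied when the caller never supplied an initial state.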

Comments (0)