[SPARK-50526][SS] Add store encoding format conf into offset log and block non supported stateful operators from using avro #49121


Closed. Wants to merge 7 commits.
@@ -17,6 +17,8 @@

package org.apache.spark.sql.catalyst.analysis

import java.util.Locale

import org.apache.spark.internal.{Logging, MDC}
import org.apache.spark.internal.LogKeys.{ANALYSIS_ERROR, QUERY_PLAN}
import org.apache.spark.sql.AnalysisException
@@ -140,6 +142,38 @@ object UnsupportedOperationChecker extends Logging {
}
}

private def checkAvroSupportForStatefulOperator(p: LogicalPlan): Option[String] = p match {
// TODO: remove operators from this list as support for avro encoding is added
case s: Aggregate if s.isStreaming => Some("aggregation")
// Since a Distinct node will be replaced with an Aggregate node by the optimizer rule
// [[ReplaceDistinctWithAggregate]], we also need to check every Distinct node here,
// treating it as an Aggregate.
case d @ Distinct(_: LogicalPlan) if d.isStreaming => Some("distinct")
case _ @ Join(left, right, _, _, _) if left.isStreaming && right.isStreaming => Some("join")
case f: FlatMapGroupsWithState if f.isStreaming => Some("flatMapGroupsWithState")
case f: FlatMapGroupsInPandasWithState if f.isStreaming =>
Some("applyInPandasWithState")
case d: Deduplicate if d.isStreaming => Some("dropDuplicates")
case d: DeduplicateWithinWatermark if d.isStreaming => Some("dropDuplicatesWithinWatermark")
case _ => None
}

// Rule to check that the avro encoding format is not used when any stateful
// streaming operator other than transformWithState is present in the query.
def checkSupportedStoreEncodingFormats(plan: LogicalPlan): Unit = {
val storeEncodingFormat = SQLConf.get.stateStoreEncodingFormat
if (storeEncodingFormat.toLowerCase(Locale.ROOT) == "avro") {
plan.foreach { subPlan =>
val operatorOpt = checkAvroSupportForStatefulOperator(subPlan)
if (operatorOpt.isDefined) {
val errorMsg = "State store encoding format as avro is not supported for " +
s"operator=${operatorOpt.get} used within the query"
throwError(errorMsg)(plan)
}
}
}
}

def checkForStreaming(plan: LogicalPlan, outputMode: OutputMode): Unit = {
if (!plan.isStreaming) {
throwError(
@@ -199,6 +233,11 @@
"DataFrames/Datasets")(plan)
}

// Check that if the store encoding format is set to avro, the query contains no stateful
// operators, or only variants of operators that support avro encoding, such as
// transformWithState.
checkSupportedStoreEncodingFormats(plan)

val aggregates = collectStreamingAggregates(plan)
// Disallow some output mode
outputMode match {
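To make the user-facing effect of this check concrete, here is a minimal sketch (not part of the PR): a streaming aggregation failing analysis once the conf is set. The rate source and console sink are illustrative; only the conf key and the error text come from this change.

// Minimal sketch, assuming a SparkSession `spark`; the query itself is
// illustrative, not part of this PR.
import org.apache.spark.sql.functions.col

spark.conf.set("spark.sql.streaming.stateStore.encodingFormat", "avro")

val counts = spark.readStream
  .format("rate")
  .load()
  .groupBy(col("value") % 10)
  .count()

// Starting the query invokes UnsupportedOperationChecker.checkForStreaming,
// which now fails with:
//   State store encoding format as avro is not supported for
//   operator=aggregation used within the query
counts.writeStream
  .outputMode("update")
  .format("console")
  .start()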
@@ -102,7 +102,7 @@ object OffsetSeqMetadata extends Logging {
FLATMAPGROUPSWITHSTATE_STATE_FORMAT_VERSION, STREAMING_AGGREGATION_STATE_FORMAT_VERSION,
STREAMING_JOIN_STATE_FORMAT_VERSION, STATE_STORE_COMPRESSION_CODEC,
STATE_STORE_ROCKSDB_FORMAT_VERSION, STATEFUL_OPERATOR_USE_STRICT_DISTRIBUTION,
PRUNE_FILTERS_CAN_PRUNE_STREAMING_SUBPLAN
PRUNE_FILTERS_CAN_PRUNE_STREAMING_SUBPLAN, STREAMING_STATE_STORE_ENCODING_FORMAT
)

/**
@@ -125,7 +125,8 @@
SymmetricHashJoinStateManager.legacyVersion.toString,
STATE_STORE_COMPRESSION_CODEC.key -> CompressionCodec.LZ4,
STATEFUL_OPERATOR_USE_STRICT_DISTRIBUTION.key -> "false",
PRUNE_FILTERS_CAN_PRUNE_STREAMING_SUBPLAN.key -> "true"
PRUNE_FILTERS_CAN_PRUNE_STREAMING_SUBPLAN.key -> "true",
STREAMING_STATE_STORE_ENCODING_FORMAT.key -> "unsaferow"
)

def apply(json: String): OffsetSeqMetadata = Serialization.read[OffsetSeqMetadata](json)
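Registering the conf in both relevantSQLConfs and relevantSQLConfDefaultValues is what makes the encoding format sticky across restarts: each batch records it in the offset log, recovery replays it into the session conf, and checkpoints written before this change fall back to the registered default. A rough sketch of that round trip, mirroring the new OffsetSeqLogSuite tests below (the cloned-conf handling is illustrative):

// Hedged sketch: the value recorded in the offset log wins on restart.
import org.apache.spark.sql.internal.SQLConf

val metadata = OffsetSeqMetadata(batchWatermarkMs = 0, batchTimestampMs = 0,
  Map(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key -> "avro"))
val clonedConf = spark.sessionState.conf.clone()
OffsetSeqMetadata.setSessionConf(metadata, clonedConf)
assert(clonedConf.stateStoreEncodingFormat == "avro")

// An older offset log has no entry for this conf, so setSessionConf falls
// back to the default registered above and the query keeps using unsaferow.
val oldMetadata = OffsetSeqMetadata(batchWatermarkMs = 0, batchTimestampMs = 0,
  Map.empty[String, String])
OffsetSeqMetadata.setSessionConf(oldMetadata, clonedConf)
assert(clonedConf.stateStoreEncodingFormat == "unsaferow")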
2 binary files not shown.
@@ -0,0 +1,2 @@
v1
{"nextBatchWatermarkMs":0,"stateUniqueIds":{}}
@@ -0,0 +1,2 @@
v1
{"nextBatchWatermarkMs":0,"stateUniqueIds":{}}
@@ -0,0 +1 @@
{"id":"f3f30619-9175-4329-97a7-f5629deaad89"}
2 binary files not shown.
@@ -0,0 +1,3 @@
v1
{"batchWatermarkMs":0,"batchTimestampMs":1734074255407,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.stateStore.encodingFormat":"avro","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.databricks.sql.optimizer.pruneFiltersCanPruneStreamingSubplan":"false"}}
0
@@ -0,0 +1,3 @@
v1
{"batchWatermarkMs":0,"batchTimestampMs":1734074257473,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.stateStore.encodingFormat":"avro","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.databricks.sql.optimizer.pruneFiltersCanPruneStreamingSubplan":"false"}}
1
33 binary files not shown.
@@ -0,0 +1,2 @@
v2
{"operatorInfo":{"operatorId":0,"operatorName":"transformWithStateExec"},"stateStoreInfo":[{"storeName":"default","numColsPrefixKey":0,"numPartitions":5,"stateSchemaFilePath":"file:/Users/anish.shrigondekar/spark/spark/target/tmp/spark-dcaeba6f-ff09-4f91-ba1b-4d14fe53cc9f/state/0/_stateSchema/default/0_6b12d3c5-57e6-4001-8321-3ae63d6be7a0"}],"operatorPropertiesJson":"{\"timeMode\":\"NoTime\",\"outputMode\":\"Update\",\"stateVariables\":[{\"stateName\":\"countState\",\"stateVariableType\":\"ValueState\",\"ttlEnabled\":false}]}"}
4 binary files not shown.
@@ -0,0 +1,2 @@
v1
{"nextBatchWatermarkMs":0,"stateUniqueIds":{}}
@@ -0,0 +1,2 @@
v1
{"nextBatchWatermarkMs":0,"stateUniqueIds":{}}
@@ -0,0 +1 @@
{"id":"1341f9d1-5100-4426-876c-2754aeaca02b"}
2 binary files not shown.
@@ -0,0 +1,3 @@
v1
{"batchWatermarkMs":0,"batchTimestampMs":1734074067729,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.stateStore.encodingFormat":"unsaferow","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.databricks.sql.optimizer.pruneFiltersCanPruneStreamingSubplan":"false"}}
0
@@ -0,0 +1,3 @@
v1
{"batchWatermarkMs":0,"batchTimestampMs":1734074071551,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.RocksDBStateStoreProvider","spark.sql.streaming.stateStore.rocksdb.formatVersion":"5","spark.sql.streaming.stateStore.encodingFormat":"unsaferow","spark.sql.streaming.statefulOperator.useStrictDistribution":"true","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.stateStore.compression.codec":"lz4","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.databricks.sql.optimizer.pruneFiltersCanPruneStreamingSubplan":"false"}}
1
33 binary files not shown.
@@ -0,0 +1,2 @@
v2
{"operatorInfo":{"operatorId":0,"operatorName":"transformWithStateExec"},"stateStoreInfo":[{"storeName":"default","numColsPrefixKey":0,"numPartitions":5,"stateSchemaFilePath":"file:/Users/anish.shrigondekar/spark/spark/target/tmp/spark-ae28252a-e696-4653-a9a5-7a9a0766f4c1/state/0/_stateSchema/default/0_2e8e6b52-e3c3-4184-b8ef-8d391b75d751"}],"operatorPropertiesJson":"{\"timeMode\":\"NoTime\",\"outputMode\":\"Update\",\"stateVariables\":[{\"stateName\":\"countState\",\"stateVariableType\":\"ValueState\",\"ttlEnabled\":false}]}"}
2 binary files not shown.
@@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.streaming

import java.io.File

import org.apache.commons.io.FileUtils

import org.apache.spark.sql.catalyst.util.stringToFile
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSparkSession
@@ -129,4 +131,69 @@ class OffsetSeqLogSuite extends SharedSparkSession {
val log = new OffsetSeqLog(spark, input.toString)
log.getLatest().get
}

// SPARK-50526 - sanity tests to ensure that the state store encoding format value
// is set correctly within OffsetSeqMetadata
test("offset log records defaults to unsafeRow for store encoding format") {
val offsetSeqMetadata = OffsetSeqMetadata.apply(batchWatermarkMs = 0, batchTimestampMs = 0,
spark.conf)
assert(offsetSeqMetadata.conf.get(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key) ===
Some("unsaferow"))
}

test("offset log uses the store encoding format set in the conf") {
val offsetSeqMetadata = OffsetSeqMetadata.apply(batchWatermarkMs = 0, batchTimestampMs = 0,
Map(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key -> "avro"))
assert(offsetSeqMetadata.conf.get(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key) ===
Some("avro"))
}

// Verify that the entry exists within the offset log with the right value, or that we
// pick up the correct default value when populating the session conf.
private def verifyOffsetLogEntry(
checkpointDir: String,
entryExists: Boolean,
encodingFormat: String): Unit = {
val log = new OffsetSeqLog(spark, s"$checkpointDir/offsets")
val latestBatchId = log.getLatestBatchId()
assert(latestBatchId.isDefined, "No offset log entries found in the checkpoint location")

// Read the latest offset log
val offsetSeq = log.get(latestBatchId.get).get
val offsetSeqMetadata = offsetSeq.metadata.get

if (entryExists) {
val encodingFormatOpt = offsetSeqMetadata.conf.get(
SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key)
assert(encodingFormatOpt.isDefined, "No store encoding format found in the offset log entry")
assert(encodingFormatOpt.get == encodingFormat)
}

val clonedSqlConf = spark.sessionState.conf.clone()
OffsetSeqMetadata.setSessionConf(offsetSeqMetadata, clonedSqlConf)
assert(clonedSqlConf.stateStoreEncodingFormat == encodingFormat)
}

// verify that checkpoints created with different store encoding formats are read correctly
Seq("unsaferow", "avro").foreach { storeEncodingFormat =>
test(s"verify format values from checkpoint loc - $storeEncodingFormat") {
withTempDir { checkpointDir =>
val resourceUri = this.getClass.getResource(
"/structured-streaming/checkpoint-version-4.0.0-tws-" + storeEncodingFormat + "/").toURI
FileUtils.copyDirectory(new File(resourceUri), checkpointDir.getCanonicalFile)
verifyOffsetLogEntry(checkpointDir.getAbsolutePath, entryExists = true,
storeEncodingFormat)
}
}
}

test("verify format values from old checkpoint with Spark version 3.5.1") {
withTempDir { checkpointDir =>
val resourceUri = this.getClass.getResource(
"/structured-streaming/checkpoint-version-3.5.1-streaming-deduplication/").toURI
FileUtils.copyDirectory(new File(resourceUri), checkpointDir.getCanonicalFile)
verifyOffsetLogEntry(checkpointDir.getAbsolutePath, entryExists = false,
"unsaferow")
}
}
}
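For contrast with the blocked operators, the transformWithState checkpoints exercised above represent the path that remains allowed with avro, since that operator is deliberately absent from checkAvroSupportForStatefulOperator. A hedged sketch of that path; CountProcessor stands in for any StatefulProcessor and is not part of this PR:

// Hedged sketch, assuming a test session with implicits in scope as in the
// suites above; `CountProcessor` is an illustrative StatefulProcessor.
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.{OutputMode, TimeMode}

spark.conf.set("spark.sql.streaming.stateStore.encodingFormat", "avro")
val inputData = MemoryStream[String]
val result = inputData.toDS()
  .groupByKey(identity)
  .transformWithState(new CountProcessor(), TimeMode.None(), OutputMode.Update())
// This passes checkSupportedStoreEncodingFormats because transformWithState
// supports the avro encoding format.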
@@ -874,6 +874,26 @@ class StreamingAggregationSuite extends StateStoreMetricsTest with Assertions {
)
}

testWithAllStateVersions("test that avro encoding is not supported") {
val inputData = MemoryStream[Int]

val aggregated =
inputData.toDF()
.groupBy($"value")
.agg(count("*"))
.as[(Int, Long)]

val ex = intercept[Exception] {
withSQLConf(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key -> "avro") {
testStream(aggregated, Update)(
AddData(inputData, 3),
ProcessAllAvailable()
)
}
}
assert(ex.getMessage.contains("State store encoding format as avro is not supported"))
}

private def prepareTestForChangingSchemaOfState(
tempDir: File): (MemoryStream[Int], DataFrame) = {
val inputData = MemoryStream[Int]
@@ -574,6 +574,21 @@ class StreamingDeduplicationSuite extends StateStoreMetricsTest {
matchPVals = true
)
}

test("test that avro encoding is not supported") {
val inputData = MemoryStream[String]
val result = inputData.toDS().dropDuplicates()

val ex = intercept[Exception] {
withSQLConf(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key -> "avro") {
testStream(result, Append)(
AddData(inputData, "a"),
ProcessAllAvailable()
)
}
}
assert(ex.getMessage.contains("State store encoding format as avro is not supported"))
}
}

@SlowSQLTest