
init #24


Closed - wants to merge 32 commits

Commits (32) - changes shown from all commits
9f038aa  [SPARK-50112] Moving Avro files to sql/core so they can be used by Tr… (ericm-db, Oct 25, 2024)
28c3dbd  moving scala to scala dir (ericm-db, Oct 25, 2024)
2e33fd1  adding deprecated one (ericm-db, Oct 25, 2024)
b037859  init (ericm-db, Oct 25, 2024)
c1db91d  adding enum (ericm-db, Oct 25, 2024)
a30a29d  feedback and test (ericm-db, Oct 25, 2024)
2ebf6a8  creating utils class (ericm-db, Oct 25, 2024)
0559480  micheal feedback (ericm-db, Oct 31, 2024)
d3845a5  ValueState post-refactor (ericm-db, Nov 1, 2024)
35b3b0d  multivalue state encoder (ericm-db, Nov 1, 2024)
dcf0df7  encodeToUnsafeRow avro method (ericm-db, Nov 2, 2024)
dfc6b1e  using correct val (ericm-db, Nov 4, 2024)
5b98aa6  comments (ericm-db, Nov 4, 2024)
0d37ffd  calling encodeUnsafeRow (ericm-db, Nov 4, 2024)
9a1f825  merge into upstream/master (ericm-db, Nov 5, 2024)
5c8dd33  Merge remote-tracking branch 'upstream/master' into avro (ericm-db, Nov 5, 2024)
9b8dd5d  [SPARK-50127] Implement Avro encoding for MapState and PrefixKeyScanS… (ericm-db, Nov 7, 2024)
448ea76  making schema conversion lazy (ericm-db, Nov 7, 2024)
386fbf1  batch succeeds (ericm-db, Nov 7, 2024)
896e24f  actually enabling ttl (ericm-db, Nov 7, 2024)
15c5f71  including hidden files (ericm-db, Nov 7, 2024)
1f5e5f7  testWithEncodingTypes (ericm-db, Nov 7, 2024)
1826d5a  no longer relying on unsaferow (ericm-db, Nov 8, 2024)
c5ef895  everything but batch works (ericm-db, Nov 8, 2024)
e22e1a2  splitting it up (ericm-db, Nov 8, 2024)
730cae0  easy feedback to address (ericm-db, Nov 9, 2024)
754ce6c  batch works (ericm-db, Nov 9, 2024)
b6dbfdb  added test suite for non-contiguous ordinals (ericm-db, Nov 11, 2024)
e6f0b7a  using negative/null val marker (ericm-db, Nov 11, 2024)
ca660c0  removing log line (ericm-db, Nov 11, 2024)
41de8ae  getAvroEnc (ericm-db, Nov 11, 2024)
c49acd2  init (ericm-db, Nov 5, 2024)
Files changed
@@ -24,6 +24,7 @@ import org.apache.avro.generic.GenericDatumReader
 import org.apache.avro.io.{BinaryDecoder, DecoderFactory}
 
 import org.apache.spark.SparkException
+import org.apache.spark.sql.avro.SchemaConverters
 import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, SpecificInternalRow, UnaryExpression}
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode}
 import org.apache.spark.sql.catalyst.util.{FailFastMode, ParseMode, PermissiveMode}
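The only change in this first file is the new SchemaConverters import. SchemaConverters is the existing helper that maps Catalyst schemas to Avro schemas, which the rest of this PR builds on. A minimal sketch of what it does (illustrative, not taken from this diff):

import org.apache.spark.sql.avro.SchemaConverters
import org.apache.spark.sql.types.{LongType, StructField, StructType}

// Derive an Avro record schema from a Catalyst schema - the conversion this
// PR leans on to describe state rows as Avro records.
val avroSchema = SchemaConverters.toAvroType(
  StructType(Seq(StructField("count", LongType))), nullable = false)
println(avroSchema.toString(true))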
StructFilters.scala
@@ -36,7 +36,8 @@ import org.apache.spark.util.ArrayImplicits._
  * the fields of the provided schema.
  * @param schema The required schema of records from datasource files.
  */
-abstract class StructFilters(pushedFilters: Seq[sources.Filter], schema: StructType) {
+abstract class StructFilters(
+    pushedFilters: Seq[sources.Filter], schema: StructType) extends Serializable {
 
   protected val filters = StructFilters.pushedFilters(pushedFilters.toArray, schema)
 
SQLConf.scala
@@ -2204,6 +2204,16 @@ object SQLConf {
       .intConf
       .createWithDefault(3)
 
+  val STREAMING_STATE_STORE_ENCODING_FORMAT =
+    buildConf("spark.sql.streaming.stateStore.encodingFormat")
+      .doc("The encoding format used for stateful operators to store information " +
+        "in the state store")
+      .version("4.0.0")
+      .stringConf
+      .checkValue(v => Set("UnsafeRow", "Avro").contains(v),
+        "Valid values are 'UnsafeRow' and 'Avro'")
+      .createWithDefault("UnsafeRow")
+
   // The feature is still in development, so it is still internal.
   val STATE_STORE_CHECKPOINT_FORMAT_VERSION =
     buildConf("spark.sql.streaming.stateStore.checkpointFormatVersion")
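Usage note: the conf key and valid values come straight from the hunk above; a minimal sketch of opting a query into Avro state encoding before it starts, assuming a running SparkSession named spark:

// Select the state store encoding format; "UnsafeRow" remains the default.
spark.conf.set("spark.sql.streaming.stateStore.encodingFormat", "Avro")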
AvroDeserializer.scala
@@ -52,7 +52,7 @@ private[sql] class AvroDeserializer(
     filters: StructFilters,
     useStableIdForUnionType: Boolean,
     stableIdPrefixForUnionType: String,
-    recursiveFieldMaxDepth: Int) {
+    recursiveFieldMaxDepth: Int) extends Serializable {
 
   def this(
       rootAvroType: Schema,
@@ -463,7 +463,7 @@
  * A base interface for updating values inside catalyst data structure like `InternalRow` and
  * `ArrayData`.
  */
-sealed trait CatalystDataUpdater {
+sealed trait CatalystDataUpdater extends Serializable {
   def set(ordinal: Int, value: Any): Unit
 
   def setNullAt(ordinal: Int): Unit = set(ordinal, null)
AvroSerializer.scala
@@ -47,7 +47,7 @@ private[sql] class AvroSerializer(
     rootAvroType: Schema,
     nullable: Boolean,
     positionalFieldMatch: Boolean,
-    datetimeRebaseMode: LegacyBehaviorPolicy.Value) extends Logging {
+    datetimeRebaseMode: LegacyBehaviorPolicy.Value) extends Logging with Serializable {
 
   def this(rootCatalystType: DataType, rootAvroType: Schema, nullable: Boolean) = {
     this(rootCatalystType, rootAvroType, nullable, positionalFieldMatch = false,
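The Serializable additions here, in AvroDeserializer, and in StructFilters all appear to serve the same purpose: these objects now ride along in the per-state-variable Avro encoder spec, so they must survive Java serialization when the plan ships to executors. A minimal sketch of the constraint (my illustration, not code from this PR):

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

// Round-trip a value through Java serialization, as Spark's task serialization
// does for anything referenced from a closure that runs on executors. A class
// that is not Serializable makes writeObject throw NotSerializableException.
def roundTrip[T](value: T): T = {
  val buffer = new ByteArrayOutputStream()
  val out = new ObjectOutputStream(buffer)
  out.writeObject(value)
  out.close()
  val in = new ObjectInputStream(new ByteArrayInputStream(buffer.toByteArray))
  in.readObject().asInstanceOf[T]
}

final case class Probe(n: Int) // case classes are Serializable by default
println(roundTrip(Probe(42)))  // succeeds; a non-Serializable class would throw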
IncrementalExecution.scala
@@ -259,6 +259,19 @@ class IncrementalExecution(
     }
   }
 
+  object StateStoreColumnFamilySchemasRule extends SparkPlanPartialRule {
+    override val rule: PartialFunction[SparkPlan, SparkPlan] = {
+      case statefulOp: StatefulOperator =>
+        statefulOp match {
+          case op: TransformWithStateExec =>
+            op.copy(
+              columnFamilySchemas = op.getColFamilySchemas()
+            )
+          case _ => statefulOp
+        }
+    }
+  }
+
   object StateOpIdRule extends SparkPlanPartialRule {
     override val rule: PartialFunction[SparkPlan, SparkPlan] = {
       case StateStoreSaveExec(keys, None, None, None, None, stateFormatVersion,
@@ -552,9 +565,9 @@ class IncrementalExecution(
     // The rule below doesn't change the plan but can cause the side effect that
     // metadata/schema is written in the checkpoint directory of stateful operator.
     planWithStateOpId transform StateSchemaAndOperatorMetadataRule.rule
 
-    simulateWatermarkPropagation(planWithStateOpId)
-    planWithStateOpId transform WatermarkPropagationRule.rule
+    val planWithStateSchemas = planWithStateOpId transform StateStoreColumnFamilySchemasRule.rule
+    simulateWatermarkPropagation(planWithStateSchemas)
+    planWithStateSchemas transform WatermarkPropagationRule.rule
   }
 }
 
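The new rule runs before watermark propagation, so TransformWithStateExec nodes already carry their column family schemas when later rules and execution see them. For orientation, a toy version of the transform pattern these SparkPlanPartialRules rely on (an illustrative stand-in, not Spark's TreeNode):

// Apply a partial function to each node, rebuilding children recursively.
sealed trait Plan
final case class Op(name: String, children: List[Plan] = Nil) extends Plan

def transform(plan: Plan)(rule: PartialFunction[Plan, Plan]): Plan = {
  val rewritten = rule.applyOrElse(plan, identity[Plan])
  rewritten match {
    case op: Op => op.copy(children = op.children.map(child => transform(child)(rule)))
  }
}

// Mirrors StateStoreColumnFamilySchemasRule: only the matching operator changes.
val plan = Op("project", List(Op("transformWithState")))
val tagged = transform(plan) {
  case Op("transformWithState", kids) => Op("transformWithState+schemas", kids)
}
println(tagged)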
ListStateImpl.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.streaming
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.execution.metric.SQLMetric
-import org.apache.spark.sql.execution.streaming.state.{NoPrefixKeyStateEncoderSpec, StateStore, StateStoreErrors}
+import org.apache.spark.sql.execution.streaming.state.{AvroEncoder, NoPrefixKeyStateEncoderSpec, StateStore, StateStoreErrors}
 import org.apache.spark.sql.streaming.ListState
 import org.apache.spark.sql.types.StructType
 
@@ -32,14 +32,17 @@ import org.apache.spark.sql.types.StructType
  * @param keyExprEnc - Spark SQL encoder for key
  * @param valEncoder - Spark SQL encoder for value
  * @param metrics - metrics to be updated as part of stateful processing
+ * @param avroEnc - optional Avro serializer and deserializer for this state variable that
+ *                  is used by the StateStore to encode state in Avro format
  * @tparam S - data type of object that will be stored in the list
  */
 class ListStateImpl[S](
     store: StateStore,
     stateName: String,
     keyExprEnc: ExpressionEncoder[Any],
     valEncoder: ExpressionEncoder[Any],
-    metrics: Map[String, SQLMetric] = Map.empty)
+    metrics: Map[String, SQLMetric] = Map.empty,
+    avroEnc: Option[AvroEncoder] = None)
   extends ListStateMetricsImpl
   with ListState[S]
   with Logging {
@@ -50,8 +53,13 @@ class ListStateImpl[S](
 
   private val stateTypesEncoder = StateTypesEncoder(keyExprEnc, valEncoder, stateName)
 
-  store.createColFamilyIfAbsent(stateName, keyExprEnc.schema, valEncoder.schema,
-    NoPrefixKeyStateEncoderSpec(keyExprEnc.schema), useMultipleValuesPerKey = true)
+  store.createColFamilyIfAbsent(
+    stateName,
+    keyExprEnc.schema,
+    valEncoder.schema,
+    NoPrefixKeyStateEncoderSpec(keyExprEnc.schema),
+    useMultipleValuesPerKey = true,
+    avroEncoderSpec = avroEnc)
 
   /** Whether state exists or not. */
   override def exists(): Boolean = {
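AvroEncoder itself is defined elsewhere in this PR. Judging only from how it is threaded through createColFamilyIfAbsent here, a plausible shape is a serializable bundle of key/value (de)serializers; the sketch below is a guess for orientation, not the PR's actual definition:

import org.apache.spark.sql.avro.{AvroDeserializer, AvroSerializer}

// Hypothetical shape: one serializer/deserializer pair per key and value
// schema of a column family, bundled so the whole spec can ship to executors.
case class AvroEncoder(
    keySerializer: AvroSerializer,
    keyDeserializer: AvroDeserializer,
    valueSerializer: AvroSerializer,
    valueDeserializer: AvroDeserializer) extends Serializable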
ListStateImplWithTTL.scala
@@ -20,7 +20,7 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchemaUtils._
-import org.apache.spark.sql.execution.streaming.state.{NoPrefixKeyStateEncoderSpec, StateStore, StateStoreErrors}
+import org.apache.spark.sql.execution.streaming.state.{AvroEncoder, NoPrefixKeyStateEncoderSpec, StateStore, StateStoreErrors}
 import org.apache.spark.sql.streaming.{ListState, TTLConfig}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.NextIterator
@@ -36,6 +36,10 @@ import org.apache.spark.util.NextIterator
  * @param ttlConfig - TTL configuration for values stored in this state
  * @param batchTimestampMs - current batch processing timestamp.
  * @param metrics - metrics to be updated as part of stateful processing
+ * @param avroEnc - optional Avro serializer and deserializer for this state variable that
+ *                  is used by the StateStore to encode state in Avro format
+ * @param secondaryIndexAvroEnc - optional Avro serializer and deserializer for TTL state that
+ *                                is used by the StateStore to encode state in Avro format
  * @tparam S - data type of object that will be stored
  */
 class ListStateImplWithTTL[S](
@@ -45,8 +49,11 @@ class ListStateImplWithTTL[S](
     valEncoder: ExpressionEncoder[Any],
     ttlConfig: TTLConfig,
     batchTimestampMs: Long,
-    metrics: Map[String, SQLMetric] = Map.empty)
-  extends SingleKeyTTLStateImpl(stateName, store, keyExprEnc, batchTimestampMs)
+    metrics: Map[String, SQLMetric] = Map.empty,
+    avroEnc: Option[AvroEncoder] = None,
+    secondaryIndexAvroEnc: Option[AvroEncoder] = None)
+  extends SingleKeyTTLStateImpl(
+    stateName, store, keyExprEnc, batchTimestampMs, secondaryIndexAvroEnc)
   with ListStateMetricsImpl
   with ListState[S] {
 
@@ -65,7 +72,8 @@
   private def initialize(): Unit = {
     store.createColFamilyIfAbsent(stateName, keyExprEnc.schema,
       getValueSchemaWithTTL(valEncoder.schema, true),
-      NoPrefixKeyStateEncoderSpec(keyExprEnc.schema), useMultipleValuesPerKey = true)
+      NoPrefixKeyStateEncoderSpec(keyExprEnc.schema), useMultipleValuesPerKey = true,
+      avroEncoderSpec = avroEnc)
   }
 
   /** Whether state exists or not. */
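For context, this class backs user-facing list state with TTL; the separate secondaryIndexAvroEnc covers the TTL index column family. A sketch of how a StatefulProcessor would request such state (the handle call is my approximation of the transformWithState API, not taken from this diff):

import java.time.Duration
import org.apache.spark.sql.streaming.TTLConfig

// Values expire after the configured duration.
val ttlConfig = TTLConfig(ttlDuration = Duration.ofMinutes(30))
// Inside StatefulProcessor.init, approximately:
//   events = getHandle.getListState[String]("events", Encoders.STRING, ttlConfig)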
MapStateImpl.scala
@@ -20,7 +20,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchemaUtils._
-import org.apache.spark.sql.execution.streaming.state.{PrefixKeyScanStateEncoderSpec, StateStore, StateStoreErrors, UnsafeRowPair}
+import org.apache.spark.sql.execution.streaming.state.{AvroEncoder, PrefixKeyScanStateEncoderSpec, StateStore, StateStoreErrors, UnsafeRowPair}
 import org.apache.spark.sql.streaming.MapState
 import org.apache.spark.sql.types.StructType
 
@@ -32,6 +32,8 @@ import org.apache.spark.sql.types.StructType
  * @param keyExprEnc - Spark SQL encoder for key
  * @param valEncoder - Spark SQL encoder for value
  * @param metrics - metrics to be updated as part of stateful processing
+ * @param avroEnc - optional Avro serializer and deserializer for this state variable that
+ *                  is used by the StateStore to encode state in Avro format
  * @tparam K - type of key for map state variable
  * @tparam V - type of value for map state variable
  */
@@ -41,7 +43,8 @@ class MapStateImpl[K, V](
     keyExprEnc: ExpressionEncoder[Any],
     userKeyEnc: ExpressionEncoder[Any],
     valEncoder: ExpressionEncoder[Any],
-    metrics: Map[String, SQLMetric] = Map.empty) extends MapState[K, V] with Logging {
+    metrics: Map[String, SQLMetric] = Map.empty,
+    avroEnc: Option[AvroEncoder] = None) extends MapState[K, V] with Logging {
 
   // Pack grouping key and user key together as a prefixed composite key
   private val schemaForCompositeKeyRow: StructType = {
@@ -52,7 +55,7 @@
     keyExprEnc, userKeyEnc, valEncoder, stateName)
 
   store.createColFamilyIfAbsent(stateName, schemaForCompositeKeyRow, schemaForValueRow,
-    PrefixKeyScanStateEncoderSpec(schemaForCompositeKeyRow, 1))
+    PrefixKeyScanStateEncoderSpec(schemaForCompositeKeyRow, 1), avroEncoderSpec = avroEnc)
 
   /** Whether state exists or not. */
   override def exists(): Boolean = {
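Why PrefixKeyScanStateEncoderSpec(schemaForCompositeKeyRow, 1): map entries are stored under a two-part key of (grouping key, user key), and listing one grouping key's entries is a scan over the one-column prefix. A toy model of that access pattern (illustrative only):

import scala.collection.immutable.TreeMap

// Composite keys sort by (groupingKey, userKey); enumerating one grouping
// key's map entries is a range scan over that prefix.
val store = TreeMap(
  ("user1", "clicks") -> 3,
  ("user1", "views") -> 9,
  ("user2", "clicks") -> 1)
val user1Entries = store.range(("user1", ""), ("user1", "\uffff"))
// -> TreeMap((user1,clicks) -> 3, (user1,views) -> 9)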
MapStateImplWithTTL.scala
@@ -21,7 +21,7 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions.UnsafeRow
 import org.apache.spark.sql.execution.metric.SQLMetric
 import org.apache.spark.sql.execution.streaming.TransformWithStateKeyValueRowSchemaUtils._
-import org.apache.spark.sql.execution.streaming.state.{PrefixKeyScanStateEncoderSpec, StateStore, StateStoreErrors}
+import org.apache.spark.sql.execution.streaming.state.{AvroEncoder, PrefixKeyScanStateEncoderSpec, StateStore, StateStoreErrors}
 import org.apache.spark.sql.streaming.{MapState, TTLConfig}
 import org.apache.spark.util.NextIterator
 
@@ -36,21 +36,27 @@ import org.apache.spark.util.NextIterator
  * @param ttlConfig - the ttl configuration (time to live duration etc.)
  * @param batchTimestampMs - current batch processing timestamp.
  * @param metrics - metrics to be updated as part of stateful processing
+ * @param avroEnc - optional Avro serializer and deserializer for this state variable that
+ *                  is used by the StateStore to encode state in Avro format
+ * @param secondaryIndexAvroEnc - optional Avro serializer and deserializer for TTL state that
+ *                                is used by the StateStore to encode state in Avro format
 * @tparam K - type of key for map state variable
  * @tparam V - type of value for map state variable
  * @return - instance of MapState of type [K,V] that can be used to store state persistently
  */
 class MapStateImplWithTTL[K, V](
-  store: StateStore,
-  stateName: String,
-  keyExprEnc: ExpressionEncoder[Any],
-  userKeyEnc: ExpressionEncoder[Any],
-  valEncoder: ExpressionEncoder[Any],
-  ttlConfig: TTLConfig,
-  batchTimestampMs: Long,
-  metrics: Map[String, SQLMetric] = Map.empty)
+    store: StateStore,
+    stateName: String,
+    keyExprEnc: ExpressionEncoder[Any],
+    userKeyEnc: ExpressionEncoder[Any],
+    valEncoder: ExpressionEncoder[Any],
+    ttlConfig: TTLConfig,
+    batchTimestampMs: Long,
+    metrics: Map[String, SQLMetric] = Map.empty,
+    avroEnc: Option[AvroEncoder] = None,
+    secondaryIndexAvroEnc: Option[AvroEncoder] = None)
   extends CompositeKeyTTLStateImpl[K](stateName, store,
-    keyExprEnc, userKeyEnc, batchTimestampMs)
+    keyExprEnc, userKeyEnc, batchTimestampMs, secondaryIndexAvroEnc)
   with MapState[K, V] with Logging {
 
   private val stateTypesEncoder = new CompositeKeyStateEncoder(
@@ -66,7 +72,8 @@
     getCompositeKeySchema(keyExprEnc.schema, userKeyEnc.schema)
     store.createColFamilyIfAbsent(stateName, schemaForCompositeKeyRow,
       getValueSchemaWithTTL(valEncoder.schema, true),
-      PrefixKeyScanStateEncoderSpec(schemaForCompositeKeyRow, 1))
+      PrefixKeyScanStateEncoderSpec(schemaForCompositeKeyRow, 1),
+      avroEncoderSpec = avroEnc)
   }
 
   /** Whether state exists or not. */