[SPARK-49601][SS][PYTHON] Support Initial State Handling for TransformWithStateInPandas #48005

Closed
wants to merge 12 commits
@@ -62,6 +62,7 @@ private[spark] object PythonEvalType {
val SQL_GROUPED_MAP_ARROW_UDF = 209
val SQL_COGROUPED_MAP_ARROW_UDF = 210
val SQL_TRANSFORM_WITH_STATE_PANDAS_UDF = 211
val SQL_TRANSFORM_WITH_STATE_PANDAS_INIT_STATE_UDF = 212

val SQL_TABLE_UDF = 300
val SQL_ARROW_TABLE_UDF = 301
@@ -84,6 +85,8 @@ private[spark] object PythonEvalType {
case SQL_TABLE_UDF => "SQL_TABLE_UDF"
case SQL_ARROW_TABLE_UDF => "SQL_ARROW_TABLE_UDF"
case SQL_TRANSFORM_WITH_STATE_PANDAS_UDF => "SQL_TRANSFORM_WITH_STATE_PANDAS_UDF"
case SQL_TRANSFORM_WITH_STATE_PANDAS_INIT_STATE_UDF =>
"SQL_TRANSFORM_WITH_STATE_PANDAS_INIT_STATE_UDF"
}
}

@@ -26,4 +26,5 @@ Stateful Processor

StatefulProcessor.init
StatefulProcessor.handleInputRows
StatefulProcessor.close
StatefulProcessor.close
StatefulProcessor.handleInitialState
1 change: 1 addition & 0 deletions python/pyspark/sql/pandas/_typing/__init__.pyi
@@ -56,6 +56,7 @@ PandasGroupedMapUDFWithStateType = Literal[208]
ArrowGroupedMapUDFType = Literal[209]
ArrowCogroupedMapUDFType = Literal[210]
PandasGroupedMapUDFTransformWithStateType = Literal[211]
PandasGroupedMapUDFTransformWithStateInitStateType = Literal[212]

class PandasVariadicScalarToScalarFunction(Protocol):
def __call__(self, *_: DataFrameOrSeriesLike_) -> DataFrameOrSeriesLike_: ...
2 changes: 2 additions & 0 deletions python/pyspark/sql/pandas/functions.py
@@ -414,6 +414,7 @@ def calculate(iterator: Iterator[pd.Series]) -> Iterator[pd.Series]:
PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF_WITH_STATE,
PythonEvalType.SQL_TRANSFORM_WITH_STATE_PANDAS_UDF,
PythonEvalType.SQL_TRANSFORM_WITH_STATE_PANDAS_INIT_STATE_UDF,
PythonEvalType.SQL_GROUPED_MAP_ARROW_UDF,
PythonEvalType.SQL_COGROUPED_MAP_ARROW_UDF,
None,
@@ -455,6 +456,7 @@ def _validate_pandas_udf(f, evalType) -> int:
PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF_WITH_STATE,
PythonEvalType.SQL_TRANSFORM_WITH_STATE_PANDAS_UDF,
PythonEvalType.SQL_TRANSFORM_WITH_STATE_PANDAS_INIT_STATE_UDF,
PythonEvalType.SQL_GROUPED_MAP_ARROW_UDF,
PythonEvalType.SQL_COGROUPED_MAP_ARROW_UDF,
PythonEvalType.SQL_ARROW_BATCHED_UDF,
108 changes: 92 additions & 16 deletions python/pyspark/sql/pandas/group_ops.py
@@ -16,7 +16,7 @@
#
import itertools
import sys
from typing import Any, Iterator, List, Union, TYPE_CHECKING, cast
from typing import Any, Iterator, List, Optional, Union, TYPE_CHECKING, cast
import warnings

from pyspark.errors import PySparkTypeError
@@ -373,6 +373,7 @@ def transformWithStateInPandas(
outputStructType: Union[StructType, str],
outputMode: str,
timeMode: str,
initialState: Optional["GroupedData"] = None,
) -> DataFrame:
"""
Invokes methods defined in the stateful processor used in arbitrary state API v2. It
@@ -409,6 +410,9 @@
The output mode of the stateful processor.
timeMode : str
The time mode semantics of the stateful processor for timers and TTL.
initialState : :class:`pyspark.sql.GroupedData`
Optional. A grouped DataFrame containing the initial states, used to initialize
the state variables in the first batch.

Examples
--------
@@ -493,22 +497,17 @@
from pyspark.sql.functions import pandas_udf

assert isinstance(self, GroupedData)
if initialState is not None:
assert isinstance(initialState, GroupedData)
if isinstance(outputStructType, str):
outputStructType = cast(StructType, _parse_datatype_string(outputStructType))

def transformWithStateUDF(
def handle_data_with_timers(
statefulProcessorApiClient: StatefulProcessorApiClient,
key: Any,
inputRows: Iterator["PandasDataFrameLike"],
) -> Iterator["PandasDataFrameLike"]:
handle = StatefulProcessorHandle(statefulProcessorApiClient)

if statefulProcessorApiClient.handle_state == StatefulProcessorHandleState.CREATED:
statefulProcessor.init(handle)
statefulProcessorApiClient.set_handle_state(
StatefulProcessorHandleState.INITIALIZED
)

statefulProcessorApiClient.set_implicit_key(key)

if timeMode != "none":
batch_timestamp = statefulProcessorApiClient.get_batch_timestamp()
watermark_timestamp = statefulProcessorApiClient.get_watermark_timestamp()
@@ -551,25 +550,102 @@ def transformWithStateUDF(
# TODO(SPARK-49603) set the handle state in the lazily initialized iterator

result = itertools.chain(*result_iter_list)
return result

def transformWithStateUDF(
statefulProcessorApiClient: StatefulProcessorApiClient,
key: Any,
inputRows: Iterator["PandasDataFrameLike"],
) -> Iterator["PandasDataFrameLike"]:
handle = StatefulProcessorHandle(statefulProcessorApiClient)

if statefulProcessorApiClient.handle_state == StatefulProcessorHandleState.CREATED:
statefulProcessor.init(handle)
statefulProcessorApiClient.set_handle_state(
StatefulProcessorHandleState.INITIALIZED
)

result = handle_data_with_timers(statefulProcessorApiClient, key, inputRows)
return result

def transformWithStateWithInitStateUDF(
statefulProcessorApiClient: StatefulProcessorApiClient,
key: Any,
inputRows: Iterator["PandasDataFrameLike"],
initialStates: Optional[Iterator["PandasDataFrameLike"]] = None,
) -> Iterator["PandasDataFrameLike"]:
Contributor:
Can we add some comments on the possible input combinations that we need to handle in this UDF, to make it easier for people to understand? IIUC there should be 3 cases:

  • Both inputRows and initialStates contain data. This would only happen in the first batch, when the associated grouping key has both input data and an initial state.
  • Only inputRows contains data. This could happen when either the grouping key doesn't have any initial state to process or it's not the first batch.
  • Only initialStates contains data. This could happen when the grouping key doesn't have any associated input data but does have an initial state to process.

Contributor Author:
Added the above in the comment.

"""
UDF for TWS operator with non-empty initial states. Possible input combinations
of inputRows and initialStates iterator:
- Both `inputRows` and `initialStates` are non-empty. Both input rows and initial
states contains the grouping key and data.
- `InitialStates` is non-empty, while `inputRows` is empty. Only initial states
contains the grouping key and data, and it is first batch.
- `initialStates` is empty, while `inputRows` is non-empty. Only inputRows contains the
grouping key and data, and it is first batch.
- `initialStates` is None, while `inputRows` is not empty. This is not first batch.
`initialStates` is initialized to the positional value as None.
"""
handle = StatefulProcessorHandle(statefulProcessorApiClient)

if statefulProcessorApiClient.handle_state == StatefulProcessorHandleState.CREATED:
Contributor:
There's something not very clear to me here, could you help me understand?

We only call handleInitialState when the handle state is CREATED, but after we process the initial state of the first grouping key, we update the state to INITIALIZED. Wouldn't that skip the initial state for the other grouping keys?

Contributor:
If my understanding is correct, we should move handleInitialState outside the handle-state check and do it after the init call.

Contributor Author:
You are correct. I moved the code block out and ran a local test with the partition number set to "1" to confirm the implementation is correct.

statefulProcessor.init(handle)
statefulProcessorApiClient.set_handle_state(
StatefulProcessorHandleState.INITIALIZED
)

# only process initial state if first batch and initial state is not None
if initialStates is not None:
for cur_initial_state in initialStates:
statefulProcessorApiClient.set_implicit_key(key)
# TODO(SPARK-50194) integration with new timer API with initial state
statefulProcessor.handleInitialState(key, cur_initial_state)

# if we don't have input rows for the given key but only have initial state
# for the grouping key, the inputRows iterator could be empty
input_rows_empty = False
try:
first = next(inputRows)
except StopIteration:
input_rows_empty = True
else:
inputRows = itertools.chain([first], inputRows)

if not input_rows_empty:
Contributor:
Wait, isn't there a case where the inputRows iterator is empty but a timer is expected to expire?

My understanding is that if you pass transformWithStateWithInitStateUDF as the udf, it will be used for all batches, right? Then how does this handle the case of a grouping key in batch N that has no data but has a timer to expire?

Contributor:
If you don't have a test covering this scenario, please add it as well.

Contributor Author:
You are also right about this; that is the scenario covered in this PR: #45780. I was planning to finish the integration together with the new timer API that Anish merged last week. I also left a TODO a few lines above. If that's OK with you, I will finish this portion in SPARK-50194.

Contributor:
It's OK to defer the fix, but let's add the TODO comment to the non-initial-state path as well. It also has to be changed.

Contributor Author:
The timer integration for the non-initial-state path is already done here: https://github.com/apache/spark/pull/48005/files#diff-5862151bb5e9fe7a6b2d1978301c235504dcc6c1bbbd1f9745a204a3ba93146eR568.
We are keeping it the way it was, and this is also how a non-first batch with initial state is handled. We are only missing the corner case of a timer registered in the initial state, which is covered by the TODO.

result = handle_data_with_timers(statefulProcessorApiClient, key, inputRows)
else:
result = iter([])

return result

if isinstance(outputStructType, str):
outputStructType = cast(StructType, _parse_datatype_string(outputStructType))

udf = pandas_udf(
transformWithStateUDF, # type: ignore
returnType=outputStructType,
functionType=PythonEvalType.SQL_TRANSFORM_WITH_STATE_PANDAS_UDF,
)
df = self._df

if initialState is None:
initial_state_java_obj = None
udf = pandas_udf(
transformWithStateUDF, # type: ignore
returnType=outputStructType,
functionType=PythonEvalType.SQL_TRANSFORM_WITH_STATE_PANDAS_UDF,
)
else:
initial_state_java_obj = initialState._jgd
udf = pandas_udf(
transformWithStateWithInitStateUDF, # type: ignore
returnType=outputStructType,
functionType=PythonEvalType.SQL_TRANSFORM_WITH_STATE_PANDAS_INIT_STATE_UDF,
)

udf_column = udf(*[df[col] for col in df.columns])

jdf = self._jgd.transformWithStateInPandas(
udf_column._jc,
self.session._jsparkSession.parseDataType(outputStructType.json()),
outputMode,
timeMode,
initial_state_java_obj,
)
return DataFrame(jdf, self.session)
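
A minimal usage sketch (not part of this diff) of the new initialState argument; input_df and MyProcessor are hypothetical placeholders for a streaming DataFrame with an "id" column and a StatefulProcessor subclass that also overrides handleInitialState:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Assumed initial state: one row of prior state per grouping key, grouped by that key.
init_df = spark.createDataFrame([("a", 5), ("b", 2)], ["id", "count"]).groupBy("id")

result = input_df.groupBy("id").transformWithStateInPandas(
    statefulProcessor=MyProcessor(),
    outputStructType="id string, count long",
    outputMode="Update",
    timeMode="None",
    initialState=init_df,  # new optional argument; omit it (or pass None) for the previous behavior
)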

76 changes: 76 additions & 0 deletions python/pyspark/sql/pandas/serializers.py
@@ -1190,3 +1190,79 @@ def dump_stream(self, iterator, stream):
"""
result = [(b, t) for x in iterator for y, t in x for b in y]
super().dump_stream(result, stream)


class TransformWithStateInPandasInitStateSerializer(TransformWithStateInPandasSerializer):
"""
Serializer used by the Python worker to evaluate the UDF for
:meth:`pyspark.sql.GroupedData.transformWithStateInPandas` when an initial state is provided.
Parameters
----------
Same as input parameters in TransformWithStateInPandasSerializer.
"""

def __init__(self, timezone, safecheck, assign_cols_by_name, arrow_max_records_per_batch):
super(TransformWithStateInPandasInitStateSerializer, self).__init__(
timezone, safecheck, assign_cols_by_name, arrow_max_records_per_batch
)
self.init_key_offsets = None

def load_stream(self, stream):
import pyarrow as pa

def generate_data_batches(batches):
"""
Deserialize ArrowRecordBatches and return a generator of lists of pandas.Series.
The deserialization logic assumes that the Arrow RecordBatches are ordered so that
data chunks for the same grouping key appear sequentially.
See `TransformWithStateInPandasPythonInitialStateRunner` for the Arrow batch schema
sent from the JVM.
This function flattens the columns of the input rows and initial state rows and feeds
them into the data generator.
"""

def flatten_columns(cur_batch, col_name):
state_column = cur_batch.column(cur_batch.schema.get_field_index(col_name))
state_field_names = [
state_column.type[i].name for i in range(state_column.type.num_fields)
]
state_field_arrays = [
state_column.field(i) for i in range(state_column.type.num_fields)
]
table_from_fields = pa.Table.from_arrays(
state_field_arrays, names=state_field_names
)
return table_from_fields

"""
The Arrow batch is written with the schema:
schema: StructType = new StructType()
.add("inputData", dataSchema)
.add("initState", initStateSchema)
We parse each batch into a tuple of (key, inputData, initState) and pass it into the
Python data generator. All rows in the same batch share the same grouping key.
"""
for batch in batches:
Contributor:
Maybe better to have a brief comment about how the batch is constructed, or some of its characteristics, or even where to read the code to understand the data structure. Personally I read this code before reading the part that builds the batch, and had to assume that a batch must only contain data from a single grouping key, otherwise it won't work.

flatten_state_table = flatten_columns(batch, "inputData")
data_pandas = [self.arrow_to_pandas(c) for c in flatten_state_table.itercolumns()]

flatten_init_table = flatten_columns(batch, "initState")
init_data_pandas = [
self.arrow_to_pandas(c) for c in flatten_init_table.itercolumns()
]
key_series = [data_pandas[o] for o in self.key_offsets]
init_key_series = [init_data_pandas[o] for o in self.init_key_offsets]

if any(s.empty for s in key_series):
# If any key series is empty (no input rows for this grouping key), build batch_key from init_key_series
batch_key = tuple(s[0] for s in init_key_series)
else:
# If all key series are non-empty, build batch_key from key_series
batch_key = tuple(s[0] for s in key_series)
yield (batch_key, data_pandas, init_data_pandas)

_batches = super(ArrowStreamPandasSerializer, self).load_stream(stream)
data_batches = generate_data_batches(_batches)

for k, g in groupby(data_batches, key=lambda x: x[0]):
yield (k, g)
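
To make the flatten_columns step above concrete, here is a small standalone pyarrow sketch with made-up data (not taken from this PR) showing how a struct column such as "inputData" is exploded into a flat table of its child fields, mirroring the calls used in the serializer:

import pyarrow as pa

# Two-row batch with two struct columns, shaped like the ("inputData", "initState") schema.
batch = pa.record_batch(
    [
        pa.array([{"id": "a", "value": 1}, {"id": "a", "value": 2}]),
        pa.array([{"id": "a", "count": 5}, None]),
    ],
    names=["inputData", "initState"],
)

state_column = batch.column(batch.schema.get_field_index("inputData"))
names = [state_column.type[i].name for i in range(state_column.type.num_fields)]
arrays = [state_column.field(i) for i in range(state_column.type.num_fields)]
flat = pa.Table.from_arrays(arrays, names=names)
print(flat.to_pandas())  # two rows with columns: id, value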
7 changes: 7 additions & 0 deletions python/pyspark/sql/streaming/stateful_processor.py
@@ -426,3 +426,10 @@ def close(self) -> None:
operations.
"""
...

def handleInitialState(self, key: Any, initialState: "PandasDataFrameLike") -> None:
"""
Optional to implement. Acts as a no-op if not defined or if there is no initial state input.
Invoked only in the first batch, allowing users to process their initial state.
"""
pass
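
A hedged sketch (not part of this diff) of a processor that overrides the new hook; the class, the column names, and the value-state calls (getValueState, exists-free update, .at lookup) follow the shape of the existing transformWithStateInPandas examples and are assumptions, not code from this PR:

import pandas as pd
from pyspark.sql.streaming.stateful_processor import StatefulProcessor, StatefulProcessorHandle
from pyspark.sql.types import StructType, StructField, IntegerType

class CountWithInitProcessor(StatefulProcessor):
    def init(self, handle: StatefulProcessorHandle) -> None:
        schema = StructType([StructField("count", IntegerType(), True)])
        self.count_state = handle.getValueState("count", schema)

    def handleInitialState(self, key, initialState) -> None:
        # Invoked once per grouping key in the first batch; initialState is the
        # pandas DataFrame of initial-state rows for that key (assumed here to
        # carry a "count" column).
        self.count_state.update((int(initialState.at[0, "count"]),))

    def handleInputRows(self, key, rows):
        # Pass-through for brevity; a real processor would read and update state here.
        for pdf in rows:
            yield pdf

    def close(self) -> None:
        pass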