
Commit bba8bb8

Yicong-Huang authored and zhengruifeng committed
[SPARK-54598][PYTHON] Extract logic to read UDFs
### What changes were proposed in this pull request?

This PR refactors the UDF reading logic in `read_udfs()` to eliminate code duplication. Currently, the logic for reading UDFs (functions and their argument offsets) is duplicated across multiple `eval_type` branches, with different patterns for the single-UDF vs. multiple-UDF cases.

### Why are the changes needed?

This duplication makes the code harder to maintain and increases the risk of inconsistencies. By centralizing the UDF reading logic at the beginning of `read_udfs()`, we can:

- Reduce code duplication
- Ensure consistent UDF reading behavior across all eval types
- Make it easier to add new eval types in the future

### Does this PR introduce _any_ user-facing change?

No, this is an internal refactoring that maintains backward compatibility. The API behavior remains the same from the user's perspective.

### How was this patch tested?

Existing tests.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes apache#53330 from Yicong-Huang/SPARK-54598/refactor/udf-fetching-logic.

Authored-by: Yicong-Huang <17627829+Yicong-Huang@users.noreply.github.com>
Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
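For a concrete picture of the refactor described above, here is a minimal before/after sketch; the names (`read_single_udf`, `pickleSer`, `runner_conf`, `read_int`) come from `python/pyspark/worker.py`, but the snippet is a simplified illustration, not the verbatim implementation:

```python
# Before: each eval_type branch read its own UDF(s) mid-branch, e.g.
#     arg_offsets, udf = read_single_udf(
#         pickleSer, infile, eval_type, runner_conf, udf_index=0, profiler=profiler
#     )
# repeated (with small variations) across a dozen branches.

# After: all UDFs are read once, up front, and branches index into the list.
num_udfs = read_int(infile)
udfs = [
    read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i, profiler=profiler)
    for i in range(num_udfs)
]
arg_offsets, udf = udfs[0]  # single-UDF branches simply unpack the first entry
```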
1 parent 191ce4c commit bba8bb8

File tree

1 file changed: +16 -40 lines changed


python/pyspark/worker.py

Lines changed: 16 additions & 40 deletions
```diff
@@ -2878,7 +2878,12 @@ def read_udfs(pickleSer, infile, eval_type):
     else:
         profiler = None
 
+    # Read all UDFs
     num_udfs = read_int(infile)
+    udfs = [
+        read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i, profiler=profiler)
+        for i in range(num_udfs)
+    ]
 
     is_scalar_iter = eval_type in (
         PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
```
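Each element of `udfs` is the `(arg_offsets, udf)` pair that `read_single_udf` returns, as the unpacking in the hunks below shows. A toy, self-contained illustration of that shape (the offsets and lambdas are made up):

```python
# Made-up stand-in for one entry of udfs: column offsets plus the callable.
udfs = [([0, 2], lambda x, y: x * y)]

arg_offsets, udf = udfs[0]  # how the single-UDF branches consume it

row = [3, "ignored", 4]                     # a toy input record
print(udf(*[row[o] for o in arg_offsets]))  # -> 12
```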
```diff
@@ -2896,9 +2901,7 @@ def read_udfs(pickleSer, infile, eval_type):
         if is_map_arrow_iter:
             assert num_udfs == 1, "One MAP_ARROW_ITER UDF expected here."
 
-        arg_offsets, udf = read_single_udf(
-            pickleSer, infile, eval_type, runner_conf, udf_index=0, profiler=profiler
-        )
+        arg_offsets, udf = udfs[0]
 
         def func(_, iterator):
             num_input_rows = 0
@@ -2994,9 +2997,7 @@ def extract_key_value_indexes(grouped_arg_offsets):
 
         # See FlatMapGroupsInPandasExec for how arg_offsets are used to
         # distinguish between grouping attributes and data attributes
-        arg_offsets, f = read_single_udf(
-            pickleSer, infile, eval_type, runner_conf, udf_index=0, profiler=profiler
-        )
+        arg_offsets, f = udfs[0]
         parsed_offsets = extract_key_value_indexes(arg_offsets)
 
         def mapper(series_iter):
```
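`extract_key_value_indexes` turns the flat `arg_offsets` into `[keyOffsets, dataOffsets]` pairs so the grouped-map branches can tell grouping attributes from data attributes (note the `parsed_offsets[0][0]` reads in the hunks below). A hypothetical illustration of that parsed shape, with made-up values:

```python
# Hypothetical output of extract_key_value_indexes(arg_offsets):
parsed_offsets = [[[0], [1, 2]]]  # one group: key column 0, data columns 1 and 2

key_offsets = parsed_offsets[0][0]    # -> [0]: grouping attributes
value_offsets = parsed_offsets[0][1]  # -> [1, 2]: data attributes
```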
```diff
@@ -3022,9 +3023,7 @@ def mapper(series_iter):
 
         # See TransformWithStateInPandasExec for how arg_offsets are used to
         # distinguish between grouping attributes and data attributes
-        arg_offsets, f = read_single_udf(
-            pickleSer, infile, eval_type, runner_conf, udf_index=0, profiler=profiler
-        )
+        arg_offsets, f = udfs[0]
         parsed_offsets = extract_key_value_indexes(arg_offsets)
         ser.key_offsets = parsed_offsets[0][0]
         stateful_processor_api_client = StatefulProcessorApiClient(state_server_port, key_schema)
@@ -3053,9 +3052,7 @@ def values_gen():
 
         # See TransformWithStateInPandasExec for how arg_offsets are used to
         # distinguish between grouping attributes and data attributes
-        arg_offsets, f = read_single_udf(
-            pickleSer, infile, eval_type, runner_conf, udf_index=0, profiler=profiler
-        )
+        arg_offsets, f = udfs[0]
         # parsed offsets:
         # [
         #     [groupingKeyOffsets, dedupDataOffsets],
@@ -3091,9 +3088,7 @@ def values_gen():
 
         # See TransformWithStateInPySparkExec for how arg_offsets are used to
         # distinguish between grouping attributes and data attributes
-        arg_offsets, f = read_single_udf(
-            pickleSer, infile, eval_type, runner_conf, udf_index=0, profiler=profiler
-        )
+        arg_offsets, f = udfs[0]
         parsed_offsets = extract_key_value_indexes(arg_offsets)
         ser.key_offsets = parsed_offsets[0][0]
         stateful_processor_api_client = StatefulProcessorApiClient(state_server_port, key_schema)
@@ -3118,9 +3113,7 @@ def mapper(a):
 
         # See TransformWithStateInPandasExec for how arg_offsets are used to
         # distinguish between grouping attributes and data attributes
-        arg_offsets, f = read_single_udf(
-            pickleSer, infile, eval_type, runner_conf, udf_index=0, profiler=profiler
-        )
+        arg_offsets, f = udfs[0]
         # parsed offsets:
         # [
         #     [groupingKeyOffsets, dedupDataOffsets],
@@ -3156,9 +3149,7 @@ def mapper(a):
 
         # See FlatMapGroupsInPandasExec for how arg_offsets are used to
         # distinguish between grouping attributes and data attributes
-        arg_offsets, f = read_single_udf(
-            pickleSer, infile, eval_type, runner_conf, udf_index=0, profiler=profiler
-        )
+        arg_offsets, f = udfs[0]
         parsed_offsets = extract_key_value_indexes(arg_offsets)
 
         def batch_from_offset(batch, offsets):
@@ -3187,9 +3178,7 @@ def mapper(a):
 
         # See FlatMapGroupsInPandas(WithState)Exec for how arg_offsets are used to
         # distinguish between grouping attributes and data attributes
-        arg_offsets, f = read_single_udf(
-            pickleSer, infile, eval_type, runner_conf, udf_index=0, profiler=profiler
-        )
+        arg_offsets, f = udfs[0]
         parsed_offsets = extract_key_value_indexes(arg_offsets)
 
         def mapper(a):
@@ -3223,9 +3212,7 @@ def mapper(a):
         # We assume there is only one UDF here because cogrouped map doesn't
         # support combining multiple UDFs.
         assert num_udfs == 1
-        arg_offsets, f = read_single_udf(
-            pickleSer, infile, eval_type, runner_conf, udf_index=0, profiler=profiler
-        )
+        arg_offsets, f = udfs[0]
 
         parsed_offsets = extract_key_value_indexes(arg_offsets)
 
@@ -3242,9 +3229,7 @@ def mapper(a):
         # We assume there is only one UDF here because cogrouped map doesn't
         # support combining multiple UDFs.
         assert num_udfs == 1
-        arg_offsets, f = read_single_udf(
-            pickleSer, infile, eval_type, runner_conf, udf_index=0, profiler=profiler
-        )
+        arg_offsets, f = udfs[0]
 
         parsed_offsets = extract_key_value_indexes(arg_offsets)
 
@@ -3269,9 +3254,7 @@ def mapper(a):
         # support combining multiple UDFs.
         assert num_udfs == 1
 
-        arg_offsets, f = read_single_udf(
-            pickleSer, infile, eval_type, runner_conf, udf_index=0, profiler=profiler
-        )
+        arg_offsets, f = udfs[0]
 
         # Convert to iterator of batches: Iterator[pa.Array] for single column,
         # or Iterator[Tuple[pa.Array, ...]] for multiple columns
```
```diff
@@ -3283,13 +3266,6 @@ def mapper(a):
             return f(batch_iter)
 
     else:
-        udfs = []
-        for i in range(num_udfs):
-            udfs.append(
-                read_single_udf(
-                    pickleSer, infile, eval_type, runner_conf, udf_index=i, profiler=profiler
-                )
-            )
 
         def mapper(a):
             result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)
```
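The surviving `mapper` in the `else:` branch shows why the upfront list suffices for the multi-UDF case: each `(arg_offsets, f)` pair selects its own columns from the input record, and the results are tupled. A runnable toy version, with plain lists standing in for the real Arrow/pandas batches:

```python
# Two toy "UDFs" with their column offsets, shaped like read_single_udf output.
udfs = [
    ([0, 1], lambda x, y: x + y),  # consumes columns 0 and 1
    ([2], lambda s: s.upper()),    # consumes column 2
]

def mapper(a):
    # Mirrors the diff: apply every UDF to its own slice of the record.
    return tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)

print(mapper([1, 2, "spark"]))  # -> (3, 'SPARK')
```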
