[SPARK-44839][SS][CONNECT] Better Error Logging when user tries to serialize spark session #42594

Closed · wants to merge 7 commits · Changes from 1 commit
5 changes: 5 additions & 0 deletions python/pyspark/errors/error_classes.py
@@ -708,6 +708,11 @@
"State is either not defined or has already been removed."
]
},
"STREAMING_CONNECT_SERIALIZATION_ERROR" : {
"message" : [
"Cannot serialize the function `<name>`. If you accessed the spark session, or a dataframe defined outside of the function, please be aware that they are not allowed in Spark Connect. For foreachBatch, please access the spark session using `df.sparkSession`, where `df` is the first parameter in your foreachBatch function. For StreamingQueryListener, please access the spark session using `self.spark`. For details please check out the PySpark doc for foreachBatch and StreamingQueryListener."
]
},
"STOP_ITERATION_OCCURRED" : {
"message" : [
"Caught StopIteration thrown from user's code; failing the task: <exc>"
10 changes: 9 additions & 1 deletion python/pyspark/sql/connect/streaming/query.py
@@ -17,6 +17,7 @@

import json
import sys
import pickle
from typing import TYPE_CHECKING, Any, cast, Dict, List, Optional

from pyspark.errors import StreamingQueryException, PySparkValueError
@@ -32,6 +33,7 @@
from pyspark.errors.exceptions.connect import (
    StreamingQueryException as CapturedStreamingQueryException,
)
from pyspark.errors import PySparkRuntimeError

__all__ = ["StreamingQuery", "StreamingQueryManager"]

@@ -237,7 +239,13 @@ def addListener(self, listener: StreamingQueryListener) -> None:
        listener._init_listener_id()
        cmd = pb2.StreamingQueryManagerCommand()
        expr = proto.PythonUDF()
        expr.command = CloudPickleSerializer().dumps(listener)
        try:
            expr.command = CloudPickleSerializer().dumps(listener)
        except pickle.PicklingError:
            raise PySparkRuntimeError(
Member:
@itholic do we need a dedicated error class for PicklingError? e.g., PySparkPicklingError?

Contributor:
Yes, I believe we need a new error class for this new type of user-facing error. Could you add a new PySpark error class representing pickle.PicklingError? See https://github.com/apache/spark/pull/40938/files as an example. I think we can also do it as a follow-up.

Contributor (Author):
Yes, sure, I can do that. Just to confirm: the ask is to define a new PySparkPicklingError and replace this PySparkRuntimeError with it, right?

Contributor:
> define a new PySparkPicklingError and replace this PySparkRuntimeError with that right?

Correct :-)

                error_class="STREAMING_CONNECT_SERIALIZATION_ERROR",
                message_parameters={"name": "addListener"},
            )
        expr.python_ver = get_python_ver()
        cmd.add_listener.python_listener_payload.CopyFrom(expr)
        cmd.add_listener.id = listener._id
27 changes: 20 additions & 7 deletions python/pyspark/sql/connect/streaming/readwriter.py
@@ -20,6 +20,7 @@
check_dependencies(__name__)

import sys
import pickle
from typing import cast, overload, Callable, Dict, List, Optional, TYPE_CHECKING, Union

from pyspark.serializers import CloudPickleSerializer
@@ -33,7 +34,7 @@
)
from pyspark.sql.connect.utils import get_python_ver
from pyspark.sql.types import Row, StructType
from pyspark.errors import PySparkTypeError, PySparkValueError
from pyspark.errors import PySparkTypeError, PySparkValueError, PySparkRuntimeError

if TYPE_CHECKING:
    from pyspark.sql.connect.session import SparkSession
@@ -488,18 +489,30 @@ def foreach(self, f: Union[Callable[[Row], None], "SupportsProcess"]) -> "DataStreamWriter":
        serializer = AutoBatchedSerializer(CPickleSerializer())
        command = (func, None, serializer, serializer)
        # Python ForeachWriter isn't really a PythonUDF. But we reuse it for simplicity.
        self._write_proto.foreach_writer.python_function.command = CloudPickleSerializer().dumps(
            command
        )
        try:
            self._write_proto.foreach_writer.python_function.command = (
                CloudPickleSerializer().dumps(command)
            )
        except pickle.PicklingError:
            raise PySparkRuntimeError(
                error_class="STREAMING_CONNECT_SERIALIZATION_ERROR",
                message_parameters={"name": "foreach"},
            )
        self._write_proto.foreach_writer.python_function.python_ver = "%d.%d" % sys.version_info[:2]
        return self

    foreach.__doc__ = PySparkDataStreamWriter.foreach.__doc__

    def foreachBatch(self, func: Callable[["DataFrame", int], None]) -> "DataStreamWriter":
        self._write_proto.foreach_batch.python_function.command = CloudPickleSerializer().dumps(
            func
        )
        try:
            self._write_proto.foreach_batch.python_function.command = CloudPickleSerializer().dumps(
                func
            )
        except pickle.PicklingError:
            raise PySparkRuntimeError(
                error_class="STREAMING_CONNECT_SERIALIZATION_ERROR",
                message_parameters={"name": "foreachBatch"},
            )
        self._write_proto.foreach_batch.python_function.python_ver = get_python_ver()
        return self

python/pyspark/sql/tests/connect/streaming/test_parity_foreachBatch.py
@@ -19,6 +19,7 @@

from pyspark.sql.tests.streaming.test_streaming_foreachBatch import StreamingTestsForeachBatchMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.errors import PySparkRuntimeError


class StreamingForeachBatchParityTests(StreamingTestsForeachBatchMixin, ReusedConnectTestCase):
@@ -30,6 +31,35 @@ def test_streaming_foreachBatch_propagates_python_errors(self):
    def test_streaming_foreachBatch_graceful_stop(self):
        super().test_streaming_foreachBatch_graceful_stop()

    def test_accessing_spark_session(self):
        spark = self.spark

        def func(df, _):
            spark.createDataFrame([("do", "not"), ("serialize", "spark")]).collect()

        error_thrown = False
        try:
            self.spark.readStream.format("rate").load().writeStream.foreachBatch(func).start()
        except PySparkRuntimeError as e:
            self.assertEqual(e.getErrorClass(), "STREAMING_CONNECT_SERIALIZATION_ERROR")
            error_thrown = True
        self.assertTrue(error_thrown)

    def test_accessing_spark_session_through_df(self):
        dataframe = self.spark.createDataFrame([("do", "not"), ("serialize", "dataframe")])

        def func(df, _):
            dataframe.collect()

        error_thrown = False
        try:
            self.spark.readStream.format("rate").load().writeStream.foreachBatch(func).start()
        except PySparkRuntimeError as e:
            self.assertEqual(e.getErrorClass(), "STREAMING_CONNECT_SERIALIZATION_ERROR")
            error_thrown = True
        self.assertTrue(error_thrown)


if __name__ == "__main__":
    import unittest
49 changes: 49 additions & 0 deletions python/pyspark/sql/tests/connect/streaming/test_parity_listener.py
@@ -18,6 +18,7 @@
import unittest
import time

from pyspark.errors import PySparkRuntimeError
from pyspark.sql.tests.streaming.test_streaming_listener import StreamingListenerTestsMixin
from pyspark.sql.streaming.listener import StreamingQueryListener, QueryStartedEvent
from pyspark.sql.types import StructType, StructField, StringType
@@ -83,6 +84,54 @@ def test_listener_events(self):
        # Remove again to verify this won't throw any error
        self.spark.streams.removeListener(test_listener)

    def test_accessing_spark_session(self):
        spark = self.spark

        class TestListener(StreamingQueryListener):
            def onQueryStarted(self, event):
                spark.createDataFrame([("do", "not"), ("serialize", "spark")]).collect()

            def onQueryProgress(self, event):
                pass

            def onQueryIdle(self, event):
                pass

            def onQueryTerminated(self, event):
                pass

        error_thrown = False
        try:
            self.spark.streams.addListener(TestListener())
        except PySparkRuntimeError as e:
            self.assertEqual(e.getErrorClass(), "STREAMING_CONNECT_SERIALIZATION_ERROR")
            error_thrown = True
        self.assertTrue(error_thrown)

    def test_accessing_spark_session_through_df(self):
        dataframe = self.spark.createDataFrame([("do", "not"), ("serialize", "dataframe")])

        class TestListener(StreamingQueryListener):
            def onQueryStarted(self, event):
                dataframe.collect()

            def onQueryProgress(self, event):
                pass

            def onQueryIdle(self, event):
                pass

            def onQueryTerminated(self, event):
                pass

        error_thrown = False
        try:
            self.spark.streams.addListener(TestListener())
        except PySparkRuntimeError as e:
            self.assertEqual(e.getErrorClass(), "STREAMING_CONNECT_SERIALIZATION_ERROR")
            error_thrown = True
        self.assertTrue(error_thrown)


if __name__ == "__main__":
    import unittest