Commit deac481

itholic authored and HyukjinKwon committed
[SPARK-42824][CONNECT][PYTHON] Provide a clear error message for unsupported JVM attributes
### What changes were proposed in this pull request?

This pull request proposes an improvement to the error message shown when trying to access a JVM attribute that is not supported in Spark Connect. Specifically, it adds a more informative error message that clearly indicates which attribute is unsupported, since Spark Connect does not depend on the JVM.

### Why are the changes needed?

Currently, when attempting to access an unsupported JVM attribute in Spark Connect, the error message is not very clear, making it difficult for users to understand the root cause of the issue. This improvement provides more helpful information to users, as shown below:

**Before**

```python
>>> spark._jsc
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: 'SparkSession' object has no attribute '_jsc'
```

**After**

```python
>>> spark._jsc
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/haejoon.lee/Desktop/git_store/spark/python/pyspark/sql/connect/session.py", line 490, in _jsc
    raise PySparkAttributeError(
pyspark.errors.exceptions.base.PySparkAttributeError: [JVM_ATTRIBUTE_NOT_SUPPORTED] Attribute `_jsc` is not supported in Spark Connect as it depends on the JVM. If you need to use this attribute, use the original PySpark instead of Spark Connect.
```

### Does this PR introduce _any_ user-facing change?

This PR does not introduce any user-facing change in terms of functionality. However, it improves the error message, which should improve the user experience.

### How was this patch tested?

This patch was tested by adding new unit tests that specifically target the error messages for unsupported JVM attributes. The tests were run locally on a development environment.

Closes apache#40458 from itholic/SPARK-42824.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
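For application code that must run against both classic PySpark and Spark Connect, note that the new `PySparkAttributeError` is still an `AttributeError`, so guarded attribute access keeps working. A minimal, hypothetical sketch (the helper name and fallback behavior are illustrative, not part of this PR):

```python
from pyspark.errors import PySparkAttributeError


def java_spark_context_or_none(spark):
    """Return the JVM SparkContext handle, or None under Spark Connect."""
    try:
        return spark._jsc
    except PySparkAttributeError:
        # Raised by Spark Connect sessions after this change; classic
        # PySpark sessions return the Py4J handle instead.
        return None
```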
1 parent: 1b40565

8 files changed, +110 -3 lines changed

python/pyspark/errors/__init__.py

+2
```diff
@@ -31,6 +31,7 @@
     SparkUpgradeException,
     PySparkTypeError,
     PySparkValueError,
+    PySparkAttributeError,
 )
 
 
@@ -47,4 +48,5 @@
     "SparkUpgradeException",
     "PySparkTypeError",
     "PySparkValueError",
+    "PySparkAttributeError",
 ]
```

python/pyspark/errors/error_classes.py

+5
```diff
@@ -39,6 +39,11 @@
       "Function `<func_name>` should return Column, got <return_type>."
     ]
   },
+  "JVM_ATTRIBUTE_NOT_SUPPORTED" : {
+    "message" : [
+      "Attribute `<attr_name>` is not supported in Spark Connect as it depends on the JVM. If you need to use this attribute, do not use Spark Connect when creating your session."
+    ]
+  },
   "NOT_BOOL" : {
     "message" : [
       "Argument `<arg_name>` should be a bool, got <arg_type>."
```
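The entry above is a message template: `<attr_name>` is filled in from the `message_parameters` passed to the exception. A minimal sketch of that substitution (illustrative only; the real rendering lives in the `PySparkException` machinery):

```python
import re

# Template copied from the error-class entry above (shortened).
TEMPLATE = (
    "Attribute `<attr_name>` is not supported in Spark Connect "
    "as it depends on the JVM."
)


def render(template: str, params: dict) -> str:
    # Replace each <placeholder> token with its value from message_parameters.
    return re.sub(r"<(\w+)>", lambda m: params[m.group(1)], template)


print(render(TEMPLATE, {"attr_name": "_jsc"}))
# Attribute `_jsc` is not supported in Spark Connect as it depends on the JVM.
```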

python/pyspark/errors/exceptions/base.py

+6
```diff
@@ -160,3 +160,9 @@ class PySparkTypeError(PySparkException, TypeError):
     """
     Wrapper class for TypeError to support error classes.
     """
+
+
+class PySparkAttributeError(PySparkException, AttributeError):
+    """
+    Wrapper class for AttributeError to support error classes.
+    """
```

python/pyspark/sql/connect/column.py

+11 -1

```diff
@@ -31,7 +31,7 @@
     Optional,
 )
 
-from pyspark.errors import PySparkTypeError
+from pyspark.errors import PySparkTypeError, PySparkAttributeError
 from pyspark.sql.types import DataType
 from pyspark.sql.column import Column as PySparkColumn
 
@@ -433,6 +433,10 @@ def dropFields(self, *fieldNames: str) -> "Column":
     dropFields.__doc__ = PySparkColumn.dropFields.__doc__
 
     def __getattr__(self, item: Any) -> "Column":
+        if item == "_jc":
+            raise PySparkAttributeError(
+                error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jc"}
+            )
         if item.startswith("__"):
             raise AttributeError(item)
         return self[item]
@@ -459,6 +463,12 @@ def __nonzero__(self) -> None:
 
     __bool__ = __nonzero__
 
+    @property
+    def _jc(self) -> None:
+        raise PySparkAttributeError(
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jc"}
+        )
+
 
 Column.__doc__ = PySparkColumn.__doc__
```
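The diff guards `_jc` in two places, which follows from Python's attribute-lookup rules: when a property getter raises an `AttributeError` subclass, Python falls back to `__getattr__`, which would otherwise resolve `_jc` as a column named `"_jc"`. A standalone sketch of that interaction (illustrative classes, not the Spark sources):

```python
class Demo:
    @property
    def _jc(self):
        # Raising AttributeError from a property triggers __getattr__.
        raise AttributeError("_jc raised by the property")

    def __getattr__(self, item):
        if item == "_jc":
            # Without this guard the fallback would swallow the property's
            # error and return a "column" named _jc instead.
            raise AttributeError("_jc re-raised by __getattr__")
        return f"column named {item!r}"


try:
    Demo()._jc
except AttributeError as e:
    print(e)  # _jc re-raised by __getattr__
```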

python/pyspark/sql/connect/dataframe.py

+5 -1

```diff
@@ -50,7 +50,7 @@
     DataFrameStatFunctions as PySparkDataFrameStatFunctions,
 )
 
-from pyspark.errors import PySparkTypeError
+from pyspark.errors import PySparkTypeError, PySparkAttributeError
 from pyspark.errors.exceptions.connect import SparkConnectException
 from pyspark.rdd import PythonEvalType
 import pyspark.sql.connect.plan as plan
@@ -1304,6 +1304,10 @@ def _get_alias(self) -> Optional[str]:
         return None
 
     def __getattr__(self, name: str) -> "Column":
+        if name in ["_jseq", "_jdf", "_jmap", "_jcols"]:
+            raise PySparkAttributeError(
+                error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": name}
+            )
         return self[name]
 
     @overload
```
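Unlike `Column` and `SparkSession`, `DataFrame` intercepts the JVM-only names purely in `__getattr__`: without the guard, `df._jdf` would fall through to `self[name]` and be treated as a lookup of a column named `_jdf`. Illustrative usage, assuming an existing Spark Connect session bound to `spark`:

```python
# Assumes an existing Spark Connect session bound to `spark`.
df = spark.range(10)
try:
    df._jdf  # previously resolved to a (nonexistent) column named "_jdf"
except AttributeError as e:  # PySparkAttributeError after this change
    print(type(e).__name__, e)
```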

python/pyspark/sql/connect/readwriter.py

+7
```diff
@@ -30,6 +30,7 @@
     DataFrameReader as PySparkDataFrameReader,
     DataFrameWriterV2 as PySparkDataFrameWriterV2,
 )
+from pyspark.errors import PySparkAttributeError
 
 if TYPE_CHECKING:
     from pyspark.sql.connect.dataframe import DataFrame
@@ -417,6 +418,12 @@ def jdbc(
 
     jdbc.__doc__ = PySparkDataFrameReader.jdbc.__doc__
 
+    @property
+    def _jreader(self) -> None:
+        raise PySparkAttributeError(
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jreader"}
+        )
+
 
 DataFrameReader.__doc__ = PySparkDataFrameReader.__doc__
```

python/pyspark/sql/connect/session.py

+26
```diff
@@ -67,6 +67,7 @@
     TimestampType,
 )
 from pyspark.sql.utils import to_str
+from pyspark.errors import PySparkAttributeError
 
 if TYPE_CHECKING:
     from pyspark.sql.connect._typing import OptionalPrimitiveType
@@ -484,6 +485,31 @@ def streams(self) -> Any:
     def readStream(self) -> Any:
         raise NotImplementedError("readStream() is not implemented.")
 
+    @property
+    def _jsc(self) -> None:
+        raise PySparkAttributeError(
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jsc"}
+        )
+
+    @property
+    def _jconf(self) -> None:
+        raise PySparkAttributeError(
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jconf"}
+        )
+
+    @property
+    def _jvm(self) -> None:
+        raise PySparkAttributeError(
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": "_jvm"}
+        )
+
+    @property
+    def _jsparkSession(self) -> None:
+        raise PySparkAttributeError(
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED",
+            message_parameters={"attr_name": "_jsparkSession"},
+        )
+
     @property
     def udf(self) -> "UDFRegistration":
         from pyspark.sql.connect.udf import UDFRegistration
```
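One consequence worth noting: since `PySparkAttributeError` is an `AttributeError` subclass, `hasattr(spark, "_jsc")` now returns `False` on a Connect session rather than raising, which gives a cheap (if indirect) way to distinguish session kinds. A hypothetical helper, not part of this PR:

```python
def is_connect_session(spark) -> bool:
    # Hypothetical helper: the _jsc property on a Connect session raises
    # PySparkAttributeError (an AttributeError), so hasattr() returns False.
    return not hasattr(spark, "_jsc")
```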

python/pyspark/sql/tests/connect/test_connect_basic.py

+48 -1

```diff
@@ -23,7 +23,7 @@
 import tempfile
 from collections import defaultdict
 
-from pyspark.errors import PySparkTypeError
+from pyspark.errors import PySparkAttributeError, PySparkTypeError
 from pyspark.sql import SparkSession as PySparkSession, Row
 from pyspark.sql.types import (
     StructType,
@@ -2936,6 +2936,53 @@ def test_map_has_nullable(self):
         self.assertEqual(cdf2.schema, sdf2.schema)
         self.assertEqual(cdf2.collect(), sdf2.collect())
 
+    def test_unsupported_jvm_attribute(self):
+        # Unsupported jvm attributes for Spark session.
+        unsupported_attrs = ["_jsc", "_jconf", "_jvm", "_jsparkSession"]
+        spark_session = self.connect
+        for attr in unsupported_attrs:
+            with self.assertRaises(PySparkAttributeError) as pe:
+                getattr(spark_session, attr)
+
+            self.check_error(
+                exception=pe.exception,
+                error_class="JVM_ATTRIBUTE_NOT_SUPPORTED",
+                message_parameters={"attr_name": attr},
+            )
+
+        # Unsupported jvm attributes for DataFrame.
+        unsupported_attrs = ["_jseq", "_jdf", "_jmap", "_jcols"]
+        cdf = self.connect.range(10)
+        for attr in unsupported_attrs:
+            with self.assertRaises(PySparkAttributeError) as pe:
+                getattr(cdf, attr)
+
+            self.check_error(
+                exception=pe.exception,
+                error_class="JVM_ATTRIBUTE_NOT_SUPPORTED",
+                message_parameters={"attr_name": attr},
+            )
+
+        # Unsupported jvm attributes for Column.
+        with self.assertRaises(PySparkAttributeError) as pe:
+            getattr(cdf.id, "_jc")
+
+        self.check_error(
+            exception=pe.exception,
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED",
+            message_parameters={"attr_name": "_jc"},
+        )
+
+        # Unsupported jvm attributes for DataFrameReader.
+        with self.assertRaises(PySparkAttributeError) as pe:
+            getattr(spark_session.read, "_jreader")
+
+        self.check_error(
+            exception=pe.exception,
+            error_class="JVM_ATTRIBUTE_NOT_SUPPORTED",
+            message_parameters={"attr_name": "_jreader"},
+        )
+
 
 @unittest.skipIf(not should_test_connect, connect_requirement_message)
 class ClientTests(unittest.TestCase):
```
