Skip to content

Commit 87b0f59

Browse files
committed
[SPARK-48561][PS][CONNECT] Throw PandasNotImplementedError for unsupported plotting functions
### What changes were proposed in this pull request? Throw `PandasNotImplementedError` for unsupported plotting functions: - {Frame, Series}.plot.hist - {Frame, Series}.plot.kde - {Frame, Series}.plot.density - {Frame, Series}.plot(kind="hist", ...) - {Frame, Series}.plot(kind="kde", ...) - {Frame, Series}.plot(kind="density", ...) ### Why are the changes needed? The previous error message is confusing: ``` In [3]: psdf.plot.hist() /Users/ruifeng.zheng/Dev/spark/python/pyspark/pandas/utils.py:1017: PandasAPIOnSparkAdviceWarning: The config 'spark.sql.ansi.enabled' is set to True. This can cause unexpected behavior from pandas API on Spark since pandas API on Spark follows the behavior of pandas, not SQL. warnings.warn(message, PandasAPIOnSparkAdviceWarning) [*********************************************-----------------------------------] 57.14% Complete (0 Tasks running, 1s, Scanned[*********************************************-----------------------------------] 57.14% Complete (0 Tasks running, 1s, Scanned[*********************************************-----------------------------------] 57.14% Complete (0 Tasks running, 1s, Scanned --------------------------------------------------------------------------- PySparkAttributeError Traceback (most recent call last) Cell In[3], line 1 ----> 1 psdf.plot.hist() File ~/Dev/spark/python/pyspark/pandas/plot/core.py:951, in PandasOnSparkPlotAccessor.hist(self, bins, **kwds) 903 def hist(self, bins=10, **kwds): 904 """ 905 Draw one histogram of the DataFrame’s columns. 906 A `histogram`_ is a representation of the distribution of data. (...) 
949 >>> df.plot.hist(bins=12, alpha=0.5) # doctest: +SKIP 950 """ --> 951 return self(kind="hist", bins=bins, **kwds) File ~/Dev/spark/python/pyspark/pandas/plot/core.py:580, in PandasOnSparkPlotAccessor.__call__(self, kind, backend, **kwargs) 577 kind = {"density": "kde"}.get(kind, kind) 578 if hasattr(plot_backend, "plot_pandas_on_spark"): 579 # use if there's pandas-on-Spark specific method. --> 580 return plot_backend.plot_pandas_on_spark(plot_data, kind=kind, **kwargs) 581 else: 582 # fallback to use pandas' 583 if not PandasOnSparkPlotAccessor.pandas_plot_data_map[kind]: File ~/Dev/spark/python/pyspark/pandas/plot/plotly.py:41, in plot_pandas_on_spark(data, kind, **kwargs) 39 return plot_pie(data, **kwargs) 40 if kind == "hist": ---> 41 return plot_histogram(data, **kwargs) 42 if kind == "box": 43 return plot_box(data, **kwargs) File ~/Dev/spark/python/pyspark/pandas/plot/plotly.py:87, in plot_histogram(data, **kwargs) 85 psdf, bins = HistogramPlotBase.prepare_hist_data(data, bins) 86 assert len(bins) > 2, "the number of buckets must be higher than 2." ---> 87 output_series = HistogramPlotBase.compute_hist(psdf, bins) 88 prev = float("%.9f" % bins[0]) # to make it prettier, truncate. 
89 text_bins = [] File ~/Dev/spark/python/pyspark/pandas/plot/core.py:189, in HistogramPlotBase.compute_hist(psdf, bins) 183 for group_id, (colname, bucket_name) in enumerate(zip(colnames, bucket_names)): 184 # creates a Bucketizer to get corresponding bin of each value 185 bucketizer = Bucketizer( 186 splits=bins, inputCol=colname, outputCol=bucket_name, handleInvalid="skip" 187 ) --> 189 bucket_df = bucketizer.transform(sdf) 191 if output_df is None: 192 output_df = bucket_df.select( 193 F.lit(group_id).alias("__group_id"), F.col(bucket_name).alias("__bucket") 194 ) File ~/Dev/spark/python/pyspark/ml/base.py:260, in Transformer.transform(self, dataset, params) 258 return self.copy(params)._transform(dataset) 259 else: --> 260 return self._transform(dataset) 261 else: 262 raise TypeError("Params must be a param map but got %s." % type(params)) File ~/Dev/spark/python/pyspark/ml/wrapper.py:412, in JavaTransformer._transform(self, dataset) 409 assert self._java_obj is not None 411 self._transfer_params_to_java() --> 412 return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sparkSession) File ~/Dev/spark/python/pyspark/sql/connect/dataframe.py:1696, in DataFrame.__getattr__(self, name) 1694 def __getattr__(self, name: str) -> "Column": 1695 if name in ["_jseq", "_jdf", "_jmap", "_jcols", "rdd", "toJSON"]: -> 1696 raise PySparkAttributeError( 1697 error_class="JVM_ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": name} 1698 ) 1700 if name not in self.columns: 1701 raise PySparkAttributeError( 1702 error_class="ATTRIBUTE_NOT_SUPPORTED", message_parameters={"attr_name": name} 1703 ) PySparkAttributeError: [JVM_ATTRIBUTE_NOT_SUPPORTED] Attribute `_jdf` is not supported in Spark Connect as it depends on the JVM. If you need to use this attribute, do not use Spark Connect when creating your session. Visit https://spark.apache.org/docs/latest/sql-getting-started.html#starting-point-sparksession for creating regular Spark Session in detail. 
``` after this PR: ``` In [3]: psdf.plot.hist() --------------------------------------------------------------------------- PandasNotImplementedError Traceback (most recent call last) Cell In[3], line 1 ----> 1 psdf.plot.hist() File ~/Dev/spark/python/pyspark/pandas/plot/core.py:957, in PandasOnSparkPlotAccessor.hist(self, bins, **kwds) 909 """ 910 Draw one histogram of the DataFrame’s columns. 911 A `histogram`_ is a representation of the distribution of data. (...) 954 >>> df.plot.hist(bins=12, alpha=0.5) # doctest: +SKIP 955 """ 956 if is_remote(): --> 957 return unsupported_function(class_name="pd.DataFrame", method_name="hist")() 959 return self(kind="hist", bins=bins, **kwds) File ~/Dev/spark/python/pyspark/pandas/missing/__init__.py:23, in unsupported_function.<locals>.unsupported_function(*args, **kwargs) 22 def unsupported_function(*args, **kwargs): ---> 23 raise PandasNotImplementedError( 24 class_name=class_name, method_name=method_name, reason=reason 25 ) PandasNotImplementedError: The method `pd.DataFrame.hist()` is not implemented yet. ``` ### Does this PR introduce _any_ user-facing change? yes, error message improvement ### How was this patch tested? CI ### Was this patch authored or co-authored using generative AI tooling? No Closes #46911 from zhengruifeng/ps_plotting_unsupported. Authored-by: Ruifeng Zheng <ruifengz@apache.org> Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
1 parent b7d9c31 commit 87b0f59

File tree

4 files changed

+142
-1
lines changed

4 files changed

+142
-1
lines changed

dev/sparktestsupport/modules.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1102,6 +1102,8 @@ def __hash__(self):
11021102
"python/pyspark/pandas",
11031103
],
11041104
python_test_goals=[
1105+
# unittests dedicated for Spark Connect
1106+
"pyspark.pandas.tests.connect.test_connect_plotting",
11051107
# pandas-on-Spark unittests
11061108
"pyspark.pandas.tests.connect.test_parity_categorical",
11071109
"pyspark.pandas.tests.connect.test_parity_config",

python/pyspark/pandas/plot/core.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from pandas.core.dtypes.inference import is_integer
2424

2525
from pyspark.sql import functions as F
26+
from pyspark.sql.utils import is_remote
2627
from pyspark.pandas.missing import unsupported_function
2728
from pyspark.pandas.config import get_option
2829
from pyspark.pandas.utils import name_like_string
@@ -571,10 +572,14 @@ def _get_plot_backend(backend=None):
571572
return module
572573

573574
def __call__(self, kind="line", backend=None, **kwargs):
575+
kind = {"density": "kde"}.get(kind, kind)
576+
577+
if is_remote() and kind in ["hist", "kde"]:
578+
return unsupported_function(class_name="pd.DataFrame", method_name=kind)()
579+
574580
plot_backend = PandasOnSparkPlotAccessor._get_plot_backend(backend)
575581
plot_data = self.data
576582

577-
kind = {"density": "kde"}.get(kind, kind)
578583
if hasattr(plot_backend, "plot_pandas_on_spark"):
579584
# use if there's pandas-on-Spark specific method.
580585
return plot_backend.plot_pandas_on_spark(plot_data, kind=kind, **kwargs)
@@ -948,6 +953,9 @@ def hist(self, bins=10, **kwds):
948953
>>> df = ps.from_pandas(df)
949954
>>> df.plot.hist(bins=12, alpha=0.5) # doctest: +SKIP
950955
"""
956+
if is_remote():
957+
return unsupported_function(class_name="pd.DataFrame", method_name="hist")()
958+
951959
return self(kind="hist", bins=bins, **kwds)
952960

953961
def kde(self, bw_method=None, ind=None, **kwargs):
@@ -1023,6 +1031,9 @@ def kde(self, bw_method=None, ind=None, **kwargs):
10231031
... })
10241032
>>> df.plot.kde(ind=[1, 2, 3, 4, 5, 6], bw_method=0.3) # doctest: +SKIP
10251033
"""
1034+
if is_remote():
1035+
return unsupported_function(class_name="pd.DataFrame", method_name="kde")()
1036+
10261037
return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs)
10271038

10281039
density = kde

python/pyspark/pandas/tests/connect/plot/test_parity_series_plot_matplotlib.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
class SeriesPlotMatplotlibParityTests(
2525
SeriesPlotMatplotlibTestsMixin, PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase
2626
):
27+
@unittest.skip("Test depends on Spark ML which is not supported from Spark Connect.")
28+
def test_empty_hist(self):
29+
super().test_empty_hist()
30+
2731
@unittest.skip("Test depends on Spark ML which is not supported from Spark Connect.")
2832
def test_hist(self):
2933
super().test_hist()
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
import unittest
18+
19+
import pandas as pd
20+
21+
from pyspark import pandas as ps
22+
from pyspark.pandas.exceptions import PandasNotImplementedError
23+
from pyspark.testing.connectutils import ReusedConnectTestCase
24+
from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils
25+
26+
27+
class ConnectPlottingTests(PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase):
    """Check that hist/kde/density plotting raises ``PandasNotImplementedError``.

    Every case is exercised against both the whole frame and its ``shield``
    column, first through the ``plot.<method>()`` accessors and then through
    ``plot(kind=...)``.
    """

    @property
    def pdf1(self):
        """Return a fresh 3x2 pandas frame; note the last index label is ``None``."""
        rows = [[1, 2], [4, 5], [7, 8]]
        labels = ["cobra", "viper", None]
        return pd.DataFrame(rows, index=labels, columns=["max_speed", "shield"])

    @property
    def psdf1(self):
        """Return ``pdf1`` converted to a pandas-on-Spark frame."""
        return ps.from_pandas(self.pdf1)

    def test_unsupported_functions(self):
        """Each ``plot.hist`` / ``plot.kde`` / ``plot.density`` call must raise."""
        # Frame first, then the single column, mirroring the order of the cases.
        selectors = (lambda psdf: psdf, lambda psdf: psdf.shield)
        variants = (
            ("hist", {}),
            ("hist", {"bins": 3}),
            ("kde", {}),
            ("kde", {"bw_method": 3}),
            ("density", {}),
            ("density", {"bw_method": 3}),
        )

        for select in selectors:
            for method_name, options in variants:
                with self.assertRaises(PandasNotImplementedError):
                    # A new frame is built per case via the ``psdf1`` property.
                    getattr(select(self.psdf1).plot, method_name)(**options)

    def test_unsupported_kinds(self):
        """Each ``plot(kind=...)`` call for hist/kde/density must raise."""
        selectors = (lambda psdf: psdf, lambda psdf: psdf.shield)
        variants = (
            ("hist", {}),
            ("hist", {"bins": 3}),
            ("kde", {}),
            ("kde", {"bw_method": 3}),
            ("density", {}),
            ("density", {"bw_method": 3}),
        )

        for select in selectors:
            for plot_kind, options in variants:
                with self.assertRaises(PandasNotImplementedError):
                    select(self.psdf1).plot(kind=plot_kind, **options)
113+
114+
115+
if __name__ == "__main__":
    from pyspark.pandas.tests.connect.test_connect_plotting import *  # noqa: F401

    # Default to unittest's own runner; switch to the XML runner (reports
    # written under target/test-reports) only when xmlrunner can be imported.
    runner = None
    try:
        import xmlrunner  # type: ignore[import]

        runner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        # xmlrunner is optional; keep runner as None.
        pass
    unittest.main(testRunner=runner, verbosity=2)

0 commit comments

Comments
 (0)