Skip to content

Commit 9689c44

Browse files
itholic authored and HyukjinKwon committed
[SPARK-34995] Port/integrate Koalas remaining codes into PySpark
### What changes were proposed in this pull request? Some further changes landed in Koalas after the main code porting, such as [databricks/koalas#2141](databricks/koalas@c8f803d) and [databricks/koalas#2143](databricks/koalas@913d688). This PR synchronizes those changes with `pyspark.pandas`. ### Why are the changes needed? We should port all of the Koalas code into PySpark and keep the two in sync. ### Does this PR introduce _any_ user-facing change? Yes. It fixes some behavior that was incompatible with pandas 1.2.0 and adds a Notes section (the `tabulate` requirement) to the `to_markdown` docstring. ### How was this patch tested? Manually tested locally. Closes #32154 from itholic/SPARK-34995. Authored-by: itholic <haejoon.lee@databricks.com> Signed-off-by: HyukjinKwon <gurwls223@apache.org>
1 parent 2cb962b commit 9689c44

File tree

3 files changed

+56
-43
lines changed

3 files changed

+56
-43
lines changed

python/pyspark/pandas/generic.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2872,6 +2872,10 @@ def to_markdown(self, buf=None, mode=None) -> str:
28722872
str
28732873
Series or DataFrame in Markdown-friendly format.
28742874
2875+
Notes
2876+
-----
2877+
Requires the `tabulate <https://pypi.org/project/tabulate>`_ package.
2878+
28752879
Examples
28762880
--------
28772881
>>> kser = ps.Series(["elk", "pig", "dog", "quetzal"], name="animal")

python/pyspark/pandas/indexing.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1698,6 +1698,25 @@ def _select_cols_else(
16981698
)
16991699

17001700
def __setitem__(self, key, value):
1701+
if is_list_like(value) and not isinstance(value, spark.Column):
1702+
iloc_item = self[key]
1703+
if not is_list_like(key) or not is_list_like(iloc_item):
1704+
raise ValueError("setting an array element with a sequence.")
1705+
else:
1706+
shape_iloc_item = iloc_item.shape
1707+
len_iloc_item = shape_iloc_item[0]
1708+
len_value = len(value)
1709+
if len_iloc_item != len_value:
1710+
if self._is_series:
1711+
raise ValueError(
1712+
"cannot set using a list-like indexer with a different length than "
1713+
"the value"
1714+
)
1715+
else:
1716+
raise ValueError(
1717+
"shape mismatch: value array of shape ({},) could not be broadcast "
1718+
"to indexing result of shape {}".format(len_value, shape_iloc_item)
1719+
)
17011720
super().__setitem__(key, value)
17021721
# Update again with resolved_copy to drop extra columns.
17031722
self._kdf._update_internal_frame(

python/pyspark/pandas/tests/test_ops_on_diff_frames.py

Lines changed: 33 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,25 +1151,17 @@ def test_frame_iloc_setitem(self):
11511151
pdf.iloc[[0, 1, 2], 1] = -pdf.max_speed
11521152
self.assert_eq(kdf, pdf)
11531153

1154-
# TODO: matching the behavior with pandas 1.2 and uncomment below test
1155-
# with self.assertRaisesRegex(
1156-
# ValueError,
1157-
# "shape mismatch: value array of shape (3,) could not be broadcast to indexing "
1158-
# "result of shape (2,1)",
1159-
# ):
1160-
# kdf.iloc[[1, 2], [1]] = -another_kdf.max_speed
1154+
with self.assertRaisesRegex(
1155+
ValueError, "shape mismatch",
1156+
):
1157+
kdf.iloc[[1, 2], [1]] = -another_kdf.max_speed
11611158

11621159
kdf.iloc[[0, 1, 2], 1] = 10 * another_kdf.max_speed
11631160
pdf.iloc[[0, 1, 2], 1] = 10 * pdf.max_speed
11641161
self.assert_eq(kdf, pdf)
11651162

1166-
# TODO: matching the behavior with pandas 1.2 and uncomment below test
1167-
# with self.assertRaisesRegex(
1168-
# ValueError,
1169-
# "shape mismatch: value array of shape (3,) could not be broadcast to indexing "
1170-
# "result of shape (1,)",
1171-
# ):
1172-
# kdf.iloc[[0], 1] = 10 * another_kdf.max_speed
1163+
with self.assertRaisesRegex(ValueError, "shape mismatch"):
1164+
kdf.iloc[[0], 1] = 10 * another_kdf.max_speed
11731165

11741166
def test_series_loc_setitem(self):
11751167
pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"])
@@ -1269,36 +1261,35 @@ def test_series_iloc_setitem(self):
12691261
self.assert_eq(kdf, pdf)
12701262
self.assert_eq(ksery, psery)
12711263

1272-
# TODO: matching the behavior with pandas 1.2 and uncomment below test.
1273-
# with self.assertRaisesRegex(
1274-
# ValueError,
1275-
# "cannot set using a list-like indexer with a different length than the value",
1276-
# ):
1277-
# kser.iloc[[1, 2]] = -kser_another
1264+
with self.assertRaisesRegex(
1265+
ValueError,
1266+
"cannot set using a list-like indexer with a different length than the value",
1267+
):
1268+
kser.iloc[[1, 2]] = -kser_another
12781269

12791270
kser.iloc[[0, 1, 2]] = 10 * kser_another
12801271
pser.iloc[[0, 1, 2]] = 10 * pser_another
12811272
self.assert_eq(kser, pser)
12821273
self.assert_eq(kdf, pdf)
12831274
self.assert_eq(ksery, psery)
12841275

1285-
# with self.assertRaisesRegex(
1286-
# ValueError,
1287-
# "cannot set using a list-like indexer with a different length than the value",
1288-
# ):
1289-
# kser.iloc[[0]] = 10 * kser_another
1276+
with self.assertRaisesRegex(
1277+
ValueError,
1278+
"cannot set using a list-like indexer with a different length than the value",
1279+
):
1280+
kser.iloc[[0]] = 10 * kser_another
12901281

12911282
kser1.iloc[[0, 1, 2]] = -kser_another
12921283
pser1.iloc[[0, 1, 2]] = -pser_another
12931284
self.assert_eq(kser1, pser1)
12941285
self.assert_eq(kdf, pdf)
12951286
self.assert_eq(ksery, psery)
12961287

1297-
# with self.assertRaisesRegex(
1298-
# ValueError,
1299-
# "cannot set using a list-like indexer with a different length than the value",
1300-
# ):
1301-
# kser1.iloc[[1, 2]] = -kser_another
1288+
with self.assertRaisesRegex(
1289+
ValueError,
1290+
"cannot set using a list-like indexer with a different length than the value",
1291+
):
1292+
kser1.iloc[[1, 2]] = -kser_another
13021293

13031294
pdf = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=["cobra", "viper", "sidewinder"])
13041295
kdf = ps.from_pandas(pdf)
@@ -1317,24 +1308,23 @@ def test_series_iloc_setitem(self):
13171308
self.assert_eq(kdf, pdf)
13181309
self.assert_eq(ksery, psery)
13191310

1320-
# TODO: matching the behavior with pandas 1.2 and uncomment below test.
1321-
# with self.assertRaisesRegex(
1322-
# ValueError,
1323-
# "cannot set using a list-like indexer with a different length than the value",
1324-
# ):
1325-
# kiloc[[1, 2]] = -kser_another
1311+
with self.assertRaisesRegex(
1312+
ValueError,
1313+
"cannot set using a list-like indexer with a different length than the value",
1314+
):
1315+
kiloc[[1, 2]] = -kser_another
13261316

13271317
kiloc[[0, 1, 2]] = 10 * kser_another
13281318
piloc[[0, 1, 2]] = 10 * pser_another
13291319
self.assert_eq(kser, pser)
13301320
self.assert_eq(kdf, pdf)
13311321
self.assert_eq(ksery, psery)
13321322

1333-
# with self.assertRaisesRegex(
1334-
# ValueError,
1335-
# "cannot set using a list-like indexer with a different length than the value",
1336-
# ):
1337-
# kiloc[[0]] = 10 * kser_another
1323+
with self.assertRaisesRegex(
1324+
ValueError,
1325+
"cannot set using a list-like indexer with a different length than the value",
1326+
):
1327+
kiloc[[0]] = 10 * kser_another
13381328

13391329
def test_update(self):
13401330
pdf = pd.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]})
@@ -1863,7 +1853,7 @@ def test_frame_iloc_setitem(self):
18631853
another_kdf = ps.DataFrame(pdf)
18641854

18651855
with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
1866-
kdf.iloc[[1, 2], [1]] = another_kdf.max_speed
1856+
kdf.iloc[[1, 2], [1]] = another_kdf.max_speed.iloc[[1, 2]]
18671857

18681858
def test_series_loc_setitem(self):
18691859
pser = pd.Series([1, 2, 3], index=["cobra", "viper", "sidewinder"])
@@ -1889,7 +1879,7 @@ def test_series_iloc_setitem(self):
18891879
kser_another = ps.from_pandas(pser_another)
18901880

18911881
with self.assertRaisesRegex(ValueError, "Cannot combine the series or dataframe"):
1892-
kser.iloc[[1]] = -kser_another
1882+
kser.iloc[[1]] = -kser_another.iloc[[1]]
18931883

18941884
def test_where(self):
18951885
pdf1 = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]})

0 commit comments

Comments
 (0)