Skip to content

Commit 8ebc3fc

Browse files
xinrong-meng and ueshin
authored and committed
[SPARK-35012][PYTHON] Port Koalas DataFrame-related unit tests into PySpark
### What changes were proposed in this pull request? Now that we merged the Koalas main code into the PySpark code base (#32036), we should port the Koalas DataFrame-related unit tests to PySpark. ### Why are the changes needed? Currently, the pandas-on-Spark modules are not fully tested. We should enable the DataFrame-related unit tests first. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Enable DataFrame-related unit tests. Closes #32131 from xinrong-databricks/port.test_dataframe_related. Lead-authored-by: Xinrong Meng <xinrong.meng@databricks.com> Co-authored-by: xinrong-databricks <47337188+xinrong-databricks@users.noreply.github.com> Signed-off-by: Takuya UESHIN <ueshin@databricks.com>
1 parent ee7d838 commit 8ebc3fc

File tree

4 files changed

+902
-0
lines changed

4 files changed

+902
-0
lines changed

dev/sparktestsupport/modules.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,9 @@ def __hash__(self):
612612
"pyspark.pandas.typedef.typehints",
613613
# unittests
614614
"pyspark.pandas.tests.test_dataframe",
615+
"pyspark.pandas.tests.test_dataframe_conversion",
616+
"pyspark.pandas.tests.test_dataframe_spark_io",
617+
"pyspark.pandas.tests.test_frame_spark",
615618
"pyspark.pandas.tests.test_ops_on_diff_frames",
616619
"pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
617620
"pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
Lines changed: 274 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
import os
19+
import shutil
20+
import string
21+
import tempfile
22+
import unittest
23+
24+
import numpy as np
25+
import pandas as pd
26+
27+
from pyspark import pandas as pp
28+
from distutils.version import LooseVersion
29+
from pyspark.pandas.testing.utils import ReusedSQLTestCase, SQLTestUtils, TestUtils
30+
31+
32+
class DataFrameConversionTest(ReusedSQLTestCase, SQLTestUtils, TestUtils):
    """Test cases for "small data" conversion and I/O.

    Each test builds a small pandas DataFrame (``pdf``), converts it to a
    pandas-on-Spark DataFrame (``kdf``), and asserts that the conversion /
    serialization APIs on both sides produce equivalent results.
    """

    def setUp(self):
        # Fresh scratch directory per test; file-writing tests (JSON, Excel)
        # emit their output here.
        self.tmp_dir = tempfile.mkdtemp(prefix=DataFrameConversionTest.__name__)

    def tearDown(self):
        # ignore_errors: best-effort cleanup; a vanished dir must not fail the test.
        shutil.rmtree(self.tmp_dir, ignore_errors=True)

    @property
    def pdf(self):
        """A small reference pandas DataFrame (note the non-contiguous index)."""
        return pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3])

    @property
    def kdf(self):
        """The pandas-on-Spark counterpart of :attr:`pdf`."""
        return pp.from_pandas(self.pdf)

    @staticmethod
    def strip_all_whitespace(str):
        """A helper function to remove all whitespace from a string."""
        return str.translate({ord(c): None for c in string.whitespace})

    def test_to_html(self):
        # Compare whitespace-normalized HTML so formatting differences don't matter.
        expected = self.strip_all_whitespace(
            """
            <table border="1" class="dataframe">
              <thead>
                <tr style="text-align: right;"><th></th><th>a</th><th>b</th></tr>
              </thead>
              <tbody>
                <tr><th>0</th><td>1</td><td>4</td></tr>
                <tr><th>1</th><td>2</td><td>5</td></tr>
                <tr><th>3</th><td>3</td><td>6</td></tr>
              </tbody>
            </table>
            """
        )
        got = self.strip_all_whitespace(self.kdf.to_html())
        self.assert_eq(got, expected)

        # with max_rows set
        expected = self.strip_all_whitespace(
            """
            <table border="1" class="dataframe">
              <thead>
                <tr style="text-align: right;"><th></th><th>a</th><th>b</th></tr>
              </thead>
              <tbody>
                <tr><th>0</th><td>1</td><td>4</td></tr>
                <tr><th>1</th><td>2</td><td>5</td></tr>
              </tbody>
            </table>
            """
        )
        got = self.strip_all_whitespace(self.kdf.to_html(max_rows=2))
        self.assert_eq(got, expected)

    @staticmethod
    def get_excel_dfs(koalas_location, pandas_location):
        """Read back both Excel files so the caller can compare them."""
        return {
            "got": pd.read_excel(koalas_location, index_col=0),
            "expected": pd.read_excel(pandas_location, index_col=0),
        }

    @unittest.skip("openpyxl")
    def test_to_excel(self):
        with self.temp_dir() as dirpath:
            pandas_location = dirpath + "/" + "output1.xlsx"
            koalas_location = dirpath + "/" + "output2.xlsx"

            pdf = self.pdf
            kdf = self.kdf
            kdf.to_excel(koalas_location)
            pdf.to_excel(pandas_location)
            dataframes = self.get_excel_dfs(koalas_location, pandas_location)
            self.assert_eq(dataframes["got"], dataframes["expected"])

            kdf.a.to_excel(koalas_location)
            pdf.a.to_excel(pandas_location)
            dataframes = self.get_excel_dfs(koalas_location, pandas_location)
            self.assert_eq(dataframes["got"], dataframes["expected"])

            pdf = pd.DataFrame({"a": [1, None, 3], "b": ["one", "two", None]}, index=[0, 1, 3])

            kdf = pp.from_pandas(pdf)

            kdf.to_excel(koalas_location, na_rep="null")
            pdf.to_excel(pandas_location, na_rep="null")
            dataframes = self.get_excel_dfs(koalas_location, pandas_location)
            self.assert_eq(dataframes["got"], dataframes["expected"])

            pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}, index=[0, 1, 3])

            kdf = pp.from_pandas(pdf)

            kdf.to_excel(koalas_location, float_format="%.1f")
            pdf.to_excel(pandas_location, float_format="%.1f")
            dataframes = self.get_excel_dfs(koalas_location, pandas_location)
            self.assert_eq(dataframes["got"], dataframes["expected"])

            kdf.to_excel(koalas_location, header=False)
            pdf.to_excel(pandas_location, header=False)
            dataframes = self.get_excel_dfs(koalas_location, pandas_location)
            self.assert_eq(dataframes["got"], dataframes["expected"])

            kdf.to_excel(koalas_location, index=False)
            pdf.to_excel(pandas_location, index=False)
            dataframes = self.get_excel_dfs(koalas_location, pandas_location)
            self.assert_eq(dataframes["got"], dataframes["expected"])

    def test_to_json(self):
        pdf = self.pdf
        kdf = pp.from_pandas(pdf)

        self.assert_eq(kdf.to_json(orient="records"), pdf.to_json(orient="records"))

    def test_to_json_negative(self):
        # Only orient="records" with lines=True is supported by pandas-on-Spark.
        kdf = pp.from_pandas(self.pdf)

        with self.assertRaises(NotImplementedError):
            kdf.to_json(orient="table")

        with self.assertRaises(NotImplementedError):
            kdf.to_json(lines=False)

    def test_read_json_negative(self):
        with self.assertRaises(NotImplementedError):
            pp.read_json("invalid", lines=False)

    def test_to_json_with_path(self):
        pdf = pd.DataFrame({"a": [1], "b": ["a"]})
        kdf = pp.DataFrame(pdf)

        kdf.to_json(self.tmp_dir, num_files=1)
        expected = pdf.to_json(orient="records")

        output_paths = [path for path in os.listdir(self.tmp_dir) if path.startswith("part-")]
        assert len(output_paths) > 0
        output_path = "%s/%s" % (self.tmp_dir, output_paths[0])
        # Use a context manager so the file handle is closed deterministically
        # (a bare open(...).read() leaks the handle until GC).
        with open(output_path) as f:
            # Spark writes newline-delimited records; wrap in [] to match pandas.
            self.assertEqual("[%s]" % f.read().strip(), expected)

    def test_to_json_with_partition_cols(self):
        pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
        kdf = pp.DataFrame(pdf)

        kdf.to_json(self.tmp_dir, partition_cols="b", num_files=1)

        partition_paths = [path for path in os.listdir(self.tmp_dir) if path.startswith("b=")]
        assert len(partition_paths) > 0
        for partition_path in partition_paths:
            column, value = partition_path.split("=")
            # The partition column is dropped from the written data, so drop it
            # from the pandas side too before comparing.
            expected = pdf[pdf[column] == value].drop("b", axis=1).to_json(orient="records")

            output_paths = [
                path
                for path in os.listdir("%s/%s" % (self.tmp_dir, partition_path))
                if path.startswith("part-")
            ]
            assert len(output_paths) > 0
            output_path = "%s/%s/%s" % (self.tmp_dir, partition_path, output_paths[0])
            with open(output_path) as f:
                # Read through the managed handle `f`; the original re-opened
                # the file a second time and leaked that handle.
                self.assertEqual("[%s]" % f.read().strip(), expected)

    @unittest.skip("Pyperclip could not find a copy/paste mechanism for Linux.")
    def test_to_clipboard(self):
        pdf = self.pdf
        kdf = self.kdf

        self.assert_eq(kdf.to_clipboard(), pdf.to_clipboard())
        self.assert_eq(kdf.to_clipboard(excel=False), pdf.to_clipboard(excel=False))
        self.assert_eq(
            kdf.to_clipboard(sep=";", index=False), pdf.to_clipboard(sep=";", index=False)
        )

    def test_to_latex(self):
        pdf = self.pdf
        kdf = self.kdf

        self.assert_eq(kdf.to_latex(), pdf.to_latex())
        self.assert_eq(kdf.to_latex(col_space=2), pdf.to_latex(col_space=2))
        self.assert_eq(kdf.to_latex(header=True), pdf.to_latex(header=True))
        self.assert_eq(kdf.to_latex(index=False), pdf.to_latex(index=False))
        self.assert_eq(kdf.to_latex(na_rep="-"), pdf.to_latex(na_rep="-"))
        self.assert_eq(kdf.to_latex(float_format="%.1f"), pdf.to_latex(float_format="%.1f"))
        self.assert_eq(kdf.to_latex(sparsify=False), pdf.to_latex(sparsify=False))
        self.assert_eq(kdf.to_latex(index_names=False), pdf.to_latex(index_names=False))
        self.assert_eq(kdf.to_latex(bold_rows=True), pdf.to_latex(bold_rows=True))
        self.assert_eq(kdf.to_latex(decimal=","), pdf.to_latex(decimal=","))
        # `encoding` was removed from pandas.DataFrame.to_latex in pandas 1.0.
        if LooseVersion(pd.__version__) < LooseVersion("1.0.0"):
            self.assert_eq(kdf.to_latex(encoding="ascii"), pdf.to_latex(encoding="ascii"))

    def test_to_records(self):
        # `index_dtypes` was added to pandas.DataFrame.to_records in pandas 0.24.
        if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"):
            pdf = pd.DataFrame({"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"])

            kdf = pp.from_pandas(pdf)

            self.assert_eq(kdf.to_records(), pdf.to_records())
            self.assert_eq(kdf.to_records(index=False), pdf.to_records(index=False))
            self.assert_eq(kdf.to_records(index_dtypes="<S2"), pdf.to_records(index_dtypes="<S2"))

    def test_from_records(self):
        # Assert using a dict as input
        self.assert_eq(
            pp.DataFrame.from_records({"A": [1, 2, 3]}), pd.DataFrame.from_records({"A": [1, 2, 3]})
        )
        # Assert using a list of tuples as input
        self.assert_eq(
            pp.DataFrame.from_records([(1, 2), (3, 4)]), pd.DataFrame.from_records([(1, 2), (3, 4)])
        )
        # Assert using a NumPy array as input
        self.assert_eq(pp.DataFrame.from_records(np.eye(3)), pd.DataFrame.from_records(np.eye(3)))
        # Assert using a custom index
        self.assert_eq(
            pp.DataFrame.from_records([(1, 2), (3, 4)], index=[2, 3]),
            pd.DataFrame.from_records([(1, 2), (3, 4)], index=[2, 3]),
        )
        # Assert excluding column(s)
        self.assert_eq(
            pp.DataFrame.from_records({"A": [1, 2, 3], "B": [1, 2, 3]}, exclude=["B"]),
            pd.DataFrame.from_records({"A": [1, 2, 3], "B": [1, 2, 3]}, exclude=["B"]),
        )
        # Assert limiting to certain column(s)
        self.assert_eq(
            pp.DataFrame.from_records({"A": [1, 2, 3], "B": [1, 2, 3]}, columns=["A"]),
            pd.DataFrame.from_records({"A": [1, 2, 3], "B": [1, 2, 3]}, columns=["A"]),
        )
        # Assert limiting to a number of rows
        self.assert_eq(
            pp.DataFrame.from_records([(1, 2), (3, 4)], nrows=1),
            pd.DataFrame.from_records([(1, 2), (3, 4)], nrows=1),
        )
264+
265+
266+
if __name__ == "__main__":
    # Re-export the tests so unittest discovery picks them up by name.
    from pyspark.pandas.tests.test_dataframe_conversion import *  # noqa: F401

    try:
        import xmlrunner  # type: ignore[import]
    except ImportError:
        # Fall back to the default text runner when xmlrunner is unavailable.
        testRunner = None
    else:
        # Emit JUnit-style XML reports for CI consumption.
        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    unittest.main(testRunner=testRunner, verbosity=2)

0 commit comments

Comments
 (0)