#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import shutil
import string
import tempfile
import unittest
from distutils.version import LooseVersion

import numpy as np
import pandas as pd

from pyspark import pandas as pp
from pyspark.pandas.testing.utils import ReusedSQLTestCase, SQLTestUtils, TestUtils


class DataFrameConversionTest(ReusedSQLTestCase, SQLTestUtils, TestUtils):
    """Test cases for "small data" conversion and I/O."""

    def setUp(self):
        self.tmp_dir = tempfile.mkdtemp(prefix=DataFrameConversionTest.__name__)

    def tearDown(self):
        shutil.rmtree(self.tmp_dir, ignore_errors=True)

    @property
    def pdf(self):
        return pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3])

    @property
    def kdf(self):
        return pp.from_pandas(self.pdf)

    @staticmethod
    def strip_all_whitespace(s):
        """A helper function to remove all whitespace from a string."""
        return s.translate({ord(c): None for c in string.whitespace})

    def test_to_html(self):
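        # HTML output is compared with all whitespace stripped so that differences
        # in indentation and line breaks between renderings do not affect the test.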
        expected = self.strip_all_whitespace(
            """
            <table border="1" class="dataframe">
              <thead>
                <tr style="text-align: right;"><th></th><th>a</th><th>b</th></tr>
              </thead>
              <tbody>
                <tr><th>0</th><td>1</td><td>4</td></tr>
                <tr><th>1</th><td>2</td><td>5</td></tr>
                <tr><th>3</th><td>3</td><td>6</td></tr>
              </tbody>
            </table>
            """
        )
        got = self.strip_all_whitespace(self.kdf.to_html())
        self.assert_eq(got, expected)

        # with max_rows set
        expected = self.strip_all_whitespace(
            """
            <table border="1" class="dataframe">
              <thead>
                <tr style="text-align: right;"><th></th><th>a</th><th>b</th></tr>
              </thead>
              <tbody>
                <tr><th>0</th><td>1</td><td>4</td></tr>
                <tr><th>1</th><td>2</td><td>5</td></tr>
              </tbody>
            </table>
            """
        )
        got = self.strip_all_whitespace(self.kdf.to_html(max_rows=2))
        self.assert_eq(got, expected)

    @staticmethod
    def get_excel_dfs(koalas_location, pandas_location):
        return {
            "got": pd.read_excel(koalas_location, index_col=0),
            "expected": pd.read_excel(pandas_location, index_col=0),
        }

    @unittest.skip("openpyxl")
    def test_to_excel(self):
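        # Each to_excel call below writes the pandas-on-Spark frame and the pandas
        # frame to separate files; both are read back with pd.read_excel via
        # get_excel_dfs and compared for equality.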
        with self.temp_dir() as dirpath:
            pandas_location = dirpath + "/" + "output1.xlsx"
            koalas_location = dirpath + "/" + "output2.xlsx"

            pdf = self.pdf
            kdf = self.kdf
            kdf.to_excel(koalas_location)
            pdf.to_excel(pandas_location)
            dataframes = self.get_excel_dfs(koalas_location, pandas_location)
            self.assert_eq(dataframes["got"], dataframes["expected"])

            kdf.a.to_excel(koalas_location)
            pdf.a.to_excel(pandas_location)
            dataframes = self.get_excel_dfs(koalas_location, pandas_location)
            self.assert_eq(dataframes["got"], dataframes["expected"])

            pdf = pd.DataFrame({"a": [1, None, 3], "b": ["one", "two", None]}, index=[0, 1, 3])

            kdf = pp.from_pandas(pdf)

            kdf.to_excel(koalas_location, na_rep="null")
            pdf.to_excel(pandas_location, na_rep="null")
            dataframes = self.get_excel_dfs(koalas_location, pandas_location)
            self.assert_eq(dataframes["got"], dataframes["expected"])

            pdf = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]}, index=[0, 1, 3])

            kdf = pp.from_pandas(pdf)

            kdf.to_excel(koalas_location, float_format="%.1f")
            pdf.to_excel(pandas_location, float_format="%.1f")
            dataframes = self.get_excel_dfs(koalas_location, pandas_location)
            self.assert_eq(dataframes["got"], dataframes["expected"])

            kdf.to_excel(koalas_location, header=False)
            pdf.to_excel(pandas_location, header=False)
            dataframes = self.get_excel_dfs(koalas_location, pandas_location)
            self.assert_eq(dataframes["got"], dataframes["expected"])

            kdf.to_excel(koalas_location, index=False)
            pdf.to_excel(pandas_location, index=False)
            dataframes = self.get_excel_dfs(koalas_location, pandas_location)
            self.assert_eq(dataframes["got"], dataframes["expected"])

    def test_to_json(self):
        pdf = self.pdf
        kdf = pp.from_pandas(pdf)

        self.assert_eq(kdf.to_json(orient="records"), pdf.to_json(orient="records"))

    def test_to_json_negative(self):
        kdf = pp.from_pandas(self.pdf)

        with self.assertRaises(NotImplementedError):
            kdf.to_json(orient="table")

        with self.assertRaises(NotImplementedError):
            kdf.to_json(lines=False)

    def test_read_json_negative(self):
        with self.assertRaises(NotImplementedError):
            pp.read_json("invalid", lines=False)

    def test_to_json_with_path(self):
        pdf = pd.DataFrame({"a": [1], "b": ["a"]})
        kdf = pp.DataFrame(pdf)

        kdf.to_json(self.tmp_dir, num_files=1)
        expected = pdf.to_json(orient="records")

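        # to_json with a path writes Spark-style "part-" files under the target
        # directory as JSON lines, so read the first part file and wrap its
        # contents in brackets to compare against pandas' records-oriented JSON.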
        output_paths = [path for path in os.listdir(self.tmp_dir) if path.startswith("part-")]
        assert len(output_paths) > 0
        output_path = "%s/%s" % (self.tmp_dir, output_paths[0])
        with open(output_path) as f:
            self.assertEqual("[%s]" % f.read().strip(), expected)

    def test_to_json_with_partition_cols(self):
        pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
        kdf = pp.DataFrame(pdf)

        kdf.to_json(self.tmp_dir, partition_cols="b", num_files=1)

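        # Partitioning on column "b" creates one "b=<value>" subdirectory per
        # distinct value, each holding "part-" files with the remaining columns as
        # JSON lines, so compare each partition against the matching pandas rows.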
        partition_paths = [path for path in os.listdir(self.tmp_dir) if path.startswith("b=")]
        assert len(partition_paths) > 0
        for partition_path in partition_paths:
            column, value = partition_path.split("=")
            expected = pdf[pdf[column] == value].drop("b", axis=1).to_json(orient="records")

            output_paths = [
                path
                for path in os.listdir("%s/%s" % (self.tmp_dir, partition_path))
                if path.startswith("part-")
            ]
            assert len(output_paths) > 0
            output_path = "%s/%s/%s" % (self.tmp_dir, partition_path, output_paths[0])
            with open(output_path) as f:
                self.assertEqual("[%s]" % f.read().strip(), expected)

    @unittest.skip("Pyperclip could not find a copy/paste mechanism for Linux.")
    def test_to_clipboard(self):
        pdf = self.pdf
        kdf = self.kdf

        self.assert_eq(kdf.to_clipboard(), pdf.to_clipboard())
        self.assert_eq(kdf.to_clipboard(excel=False), pdf.to_clipboard(excel=False))
        self.assert_eq(
            kdf.to_clipboard(sep=";", index=False), pdf.to_clipboard(sep=";", index=False)
        )

    def test_to_latex(self):
        pdf = self.pdf
        kdf = self.kdf

        self.assert_eq(kdf.to_latex(), pdf.to_latex())
        self.assert_eq(kdf.to_latex(col_space=2), pdf.to_latex(col_space=2))
        self.assert_eq(kdf.to_latex(header=True), pdf.to_latex(header=True))
        self.assert_eq(kdf.to_latex(index=False), pdf.to_latex(index=False))
        self.assert_eq(kdf.to_latex(na_rep="-"), pdf.to_latex(na_rep="-"))
        self.assert_eq(kdf.to_latex(float_format="%.1f"), pdf.to_latex(float_format="%.1f"))
        self.assert_eq(kdf.to_latex(sparsify=False), pdf.to_latex(sparsify=False))
        self.assert_eq(kdf.to_latex(index_names=False), pdf.to_latex(index_names=False))
        self.assert_eq(kdf.to_latex(bold_rows=True), pdf.to_latex(bold_rows=True))
        self.assert_eq(kdf.to_latex(decimal=","), pdf.to_latex(decimal=","))
        if LooseVersion(pd.__version__) < LooseVersion("1.0.0"):
            self.assert_eq(kdf.to_latex(encoding="ascii"), pdf.to_latex(encoding="ascii"))

    def test_to_records(self):
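        # DataFrame.to_records gained the index_dtypes argument in pandas 0.24.0,
        # so the assertions below only run on pandas 0.24.0 and later.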
        if LooseVersion(pd.__version__) >= LooseVersion("0.24.0"):
            pdf = pd.DataFrame({"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"])

            kdf = pp.from_pandas(pdf)

            self.assert_eq(kdf.to_records(), pdf.to_records())
            self.assert_eq(kdf.to_records(index=False), pdf.to_records(index=False))
            self.assert_eq(kdf.to_records(index_dtypes="<S2"), pdf.to_records(index_dtypes="<S2"))

    def test_from_records(self):
        # Assert using a dict as input
        self.assert_eq(
            pp.DataFrame.from_records({"A": [1, 2, 3]}), pd.DataFrame.from_records({"A": [1, 2, 3]})
        )
        # Assert using a list of tuples as input
        self.assert_eq(
            pp.DataFrame.from_records([(1, 2), (3, 4)]), pd.DataFrame.from_records([(1, 2), (3, 4)])
        )
        # Assert using a NumPy array as input
        self.assert_eq(pp.DataFrame.from_records(np.eye(3)), pd.DataFrame.from_records(np.eye(3)))
        # Assert using a custom index
        self.assert_eq(
            pp.DataFrame.from_records([(1, 2), (3, 4)], index=[2, 3]),
            pd.DataFrame.from_records([(1, 2), (3, 4)], index=[2, 3]),
        )
        # Assert excluding column(s)
        self.assert_eq(
            pp.DataFrame.from_records({"A": [1, 2, 3], "B": [1, 2, 3]}, exclude=["B"]),
            pd.DataFrame.from_records({"A": [1, 2, 3], "B": [1, 2, 3]}, exclude=["B"]),
        )
        # Assert limiting to certain column(s)
        self.assert_eq(
            pp.DataFrame.from_records({"A": [1, 2, 3], "B": [1, 2, 3]}, columns=["A"]),
            pd.DataFrame.from_records({"A": [1, 2, 3], "B": [1, 2, 3]}, columns=["A"]),
        )
        # Assert limiting to a number of rows
        self.assert_eq(
            pp.DataFrame.from_records([(1, 2), (3, 4)], nrows=1),
            pd.DataFrame.from_records([(1, 2), (3, 4)], nrows=1),
        )


if __name__ == "__main__":
    from pyspark.pandas.tests.test_dataframe_conversion import *  # noqa: F401

    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output='target/test-reports', verbosity=2)
    except ImportError:
        testRunner = None
    unittest.main(testRunner=testRunner, verbosity=2)