
Commit 98dbfed

amyskov and anmyachev committed
TEST-#2509: addressing review comments
Co-authored-by: Anatoly Myachev <45976948+anmyachev@users.noreply.github.com>
Signed-off-by: Alexander Myskov <alexander.myskov@intel.com>
1 parent b565fdc commit 98dbfed

File tree

2 files changed: +42 -106 lines changed


modin/pandas/test/test_io.py

Lines changed: 41 additions & 105 deletions
@@ -111,6 +111,7 @@ def _make_parquet_file(
     """Helper function to generate parquet files/directories.
 
     Args:
+        filename: The name of test file, that should be created.
         row_size: Number of rows for the dataframe.
         force: Create a new file/directory even if one already exists.
         directory: Create a partitioned directory using pyarrow.
@@ -498,6 +499,14 @@ def setup_fwf_file(filename=TEST_FWF_FILENAME, force=True, fwf_data=None):
 
 
 def eval_to_file(modin_obj, pandas_obj, fn, extension, **fn_kwargs):
+    """Helper function to test `to_<extension>` methods.
+
+    Args:
+        modin_obj: Modin DataFrame or Series to test `to_<extension>` method.
+        pandas_obj: Pandas DataFrame or Series to test `to_<extension>` method.
+        fn: name of the method, that should be tested.
+        extension: Extension of the test file.
+    """
     unique_filename_modin = get_unique_filename(extension=extension)
     unique_filename_pandas = get_unique_filename(extension=extension)
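For orientation, a call to the helper documented above would presumably look like the hypothetical Python sketch below; `create_test_dfs` and `TEST_DATA` are existing utilities in this test module, while the `to_csv` / `index=False` arguments are purely illustrative and not taken from this diff.

    # Hypothetical usage sketch of eval_to_file (not part of this commit):
    modin_df, pandas_df = create_test_dfs(TEST_DATA)  # matching Modin/pandas frames
    eval_to_file(modin_df, pandas_df, fn="to_csv", extension="csv", index=False)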

@@ -1118,22 +1127,12 @@ def test_read_csv_parse_dates(
 
     @pytest.mark.skipif(Engine.get() == "Python", reason="Using pandas implementation")
     def test_read_csv_s3(self):
-        dataset_url = "s3://noaa-ghcn-pds/csv/1788.csv"
-        pandas_df = pandas.read_csv(dataset_url)
-
-        # This first load is to trigger all the import deprecation warnings
-        modin_df = pd.read_csv(dataset_url)
-
-        # This will warn if it defaults to pandas behavior, but it shouldn't
-        with pytest.warns(None) as record:
-            modin_df = pd.read_csv(dataset_url)
-
-        assert not any(
-            "defaulting to pandas implementation" in str(err) for err in record.list
+        eval_io(
+            fn_name="read_csv",
+            # read_csv kwargs
+            filepath_or_buffer="s3://noaa-ghcn-pds/csv/1788.csv",
         )
 
-        df_equals(modin_df, pandas_df)
-
     @pytest.mark.parametrize("names", [list("XYZ"), None])
     @pytest.mark.parametrize("skiprows", [1, 2, 3, 4, None])
     def test_read_csv_skiprows_names(self, names, skiprows):
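The rewritten S3 test above relies entirely on `eval_io` from modin/pandas/test/utils.py. As a rough mental model only, not the actual implementation (the real helper handles additional cases such as expected errors and custom comparators), its core behavior can be sketched as:

    # Simplified sketch of the eval_io pattern; named *_sketch to make clear it is
    # an assumption about behavior, not the helper's real code.
    def eval_io_sketch(fn_name, **fn_kwargs):
        modin_result = getattr(pd, fn_name)(**fn_kwargs)  # run the Modin reader
        pandas_result = getattr(pandas, fn_name)(**fn_kwargs)  # run the pandas reader
        df_equals(modin_result, pandas_result)  # the two results must match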
@@ -1307,67 +1306,40 @@ def wrapped_read_table(file, method):
 
 
 class TestParquet:
-    def test_read_parquet(self, make_parquet_file):
+    @pytest.mark.parametrize("columns", [None, ["col1"]])
+    def test_read_parquet(self, make_parquet_file, columns):
         unique_filename = get_unique_filename(extension="parquet")
         make_parquet_file(filename=unique_filename)
 
         eval_io(
             fn_name="read_parquet",
             # read_parquet kwargs
             path=unique_filename,
+            columns=columns,
         )
 
-    def test_read_parquet_with_columns(self, make_parquet_file):
-        unique_filename = get_unique_filename(extension="parquet")
-        make_parquet_file(filename=unique_filename)
-
-        eval_io(
-            fn_name="read_parquet",
-            # read_parquet kwargs
-            path=unique_filename,
-            columns=["col1"],
-        )
-
-    def test_read_parquet_partition(self, make_parquet_file):
+    @pytest.mark.parametrize("columns", [None, ["col1"]])
+    def test_read_parquet_directory(self, make_parquet_file, columns):  #
 
         unique_filename = get_unique_filename(extension=None)
         make_parquet_file(filename=unique_filename, directory=True)
         eval_io(
             fn_name="read_parquet",
             # read_parquet kwargs
             path=unique_filename,
+            columns=columns,
         )
 
-    def test_read_parquet_partition_with_columns(self, make_parquet_file):
-
-        unique_filename = get_unique_filename(extension=None)
-        make_parquet_file(filename=unique_filename, directory=True)
-        eval_io(
-            fn_name="read_parquet",
-            # read_parquet kwargs
-            path=unique_filename,
-            columns=["col1"],
-        )
-
-    def test_read_parquet_partitioned_columns(self, make_parquet_file):
-
-        unique_filename = get_unique_filename(extension=None)
-        make_parquet_file(filename=unique_filename, partitioned_columns=["col1"])
-        eval_io(
-            fn_name="read_parquet",
-            # read_parquet kwargs
-            path=unique_filename,
-        )
-
-    def test_read_parquet_partitioned_columns_with_columns(self, make_parquet_file):
+    @pytest.mark.parametrize("columns", [None, ["col1"]])
+    def test_read_parquet_partitioned_directory(self, make_parquet_file, columns):
         unique_filename = get_unique_filename(extension=None)
         make_parquet_file(filename=unique_filename, partitioned_columns=["col1"])
 
         eval_io(
             fn_name="read_parquet",
             # read_parquet kwargs
             path=unique_filename,
-            columns=["col1"],
+            columns=columns,
         )
 
     def test_read_parquet_pandas_index(self):
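The parquet hunk above collapses six near-duplicate test methods into three parametrized ones. Each `@pytest.mark.parametrize("columns", [None, ["col1"]])` decorator makes pytest generate one test case per listed value; a small self-contained illustration of that mechanism (unrelated to Modin):

    import pytest

    @pytest.mark.parametrize("columns", [None, ["col1"]])
    def test_columns_parametrization(columns):
        # pytest runs this body twice: once with columns=None, once with columns=["col1"]
        assert columns is None or columns == ["col1"]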
@@ -1452,14 +1424,16 @@ def test_to_parquet(self):
 
 
 class TestJson:
-    def test_read_json(self):
+    @pytest.mark.parametrize("lines", [False, True])
+    def test_read_json(self, lines):
         unique_filename = get_unique_filename(extension="json")
         try:
             setup_json_file(filename=unique_filename)
             eval_io(
                 fn_name="read_json",
                 # read_json kwargs
                 path_or_buf=unique_filename,
+                lines=lines,
             )
         finally:
             teardown_test_files([unique_filename])
@@ -1472,19 +1446,6 @@ def test_read_json_categories(self):
             dtype={"one": "int64", "two": "category"},
         )
 
-    def test_read_json_lines(self):
-        unique_filename = get_unique_filename(extension="json")
-        try:
-            setup_json_lines_file(filename=unique_filename)
-            eval_io(
-                fn_name="read_json",
-                # read_json kwargs
-                path_or_buf=unique_filename,
-                lines=True,
-            )
-        finally:
-            teardown_test_files([unique_filename])
-
     @pytest.mark.parametrize(
         "data",
         [json_short_string, json_short_bytes, json_long_string, json_long_bytes],
@@ -1645,25 +1606,11 @@ def test_to_excel(self):
 
 class TestHdf:
     @pytest.mark.skipif(os.name == "nt", reason="Windows not supported")
-    def test_read_hdf(self):
+    @pytest.mark.parametrize("format", [None, "table"])
+    def test_read_hdf(self, format):
         unique_filename = get_unique_filename(extension="hdf")
         try:
-            setup_hdf_file(filename=unique_filename, format=None)
-            eval_io(
-                fn_name="read_hdf",
-                # read_hdf kwargs
-                path_or_buf=unique_filename,
-                key="df",
-            )
-        finally:
-            teardown_test_files([unique_filename])
-
-    @pytest.mark.skipif(os.name == "nt", reason="Windows not supported")
-    def test_read_hdf_format(self):
-        unique_filename = get_unique_filename(extension="hdf")
-        try:
-            setup_hdf_file(filename=unique_filename, format="table")
-
+            setup_hdf_file(filename=unique_filename, format=format)
             eval_io(
                 fn_name="read_hdf",
                 # read_hdf kwargs
@@ -1766,35 +1713,24 @@ def test_read_sql_with_chunksize(self, make_sql_connection):
         for modin_df, pandas_df in zip(modin_gen, pandas_gen):
             df_equals(modin_df, pandas_df)
 
-    def test_to_sql_without_index(self, make_sql_connection):
-        table_name = "tbl_without_index"
+    @pytest.mark.parametrize("index", [False, True])
+    def test_to_sql(self, make_sql_connection, index):
+        table_name = f"test_to_sql_{str(index)}"
         modin_df, pandas_df = create_test_dfs(TEST_DATA)
 
         # We do not pass the table name so the fixture won't generate a table
-        conn = make_sql_connection("test_to_sql.db")
-        modin_df.to_sql(table_name, conn, index=False)
-        df_modin_sql = pandas.read_sql(table_name, con=conn)
-
-        # We do not pass the table name so the fixture won't generate a table
-        conn = make_sql_connection("test_to_sql_pandas.db")
-        pandas_df.to_sql(table_name, conn, index=False)
-        df_pandas_sql = pandas.read_sql(table_name, con=conn)
-
-        assert df_modin_sql.sort_index().equals(df_pandas_sql.sort_index())
-
-    def test_to_sql_with_index(self, make_sql_connection):
-        table_name = "tbl_with_index"
-        modin_df, pandas_df = create_test_dfs(TEST_DATA)
-
-        # We do not pass the table name so the fixture won't generate a table
-        conn = make_sql_connection("test_to_sql_with_index_1.db")
-        modin_df.to_sql(table_name, conn)
-        df_modin_sql = pandas.read_sql(table_name, con=conn, index_col="index")
+        conn = make_sql_connection(f"{table_name}_modin.db")
+        modin_df.to_sql(table_name, conn, index=index)
+        df_modin_sql = pandas.read_sql(
+            table_name, con=conn, index_col="index" if index else None
+        )
 
         # We do not pass the table name so the fixture won't generate a table
-        conn = make_sql_connection("test_to_sql_with_index_2.db")
-        pandas_df.to_sql(table_name, conn)
-        df_pandas_sql = pandas.read_sql(table_name, con=conn, index_col="index")
+        conn = make_sql_connection(f"{table_name}_pandas.db")
+        pandas_df.to_sql(table_name, conn, index=index)
+        df_pandas_sql = pandas.read_sql(
+            table_name, con=conn, index_col="index" if index else None
+        )
 
         assert df_modin_sql.sort_index().equals(df_pandas_sql.sort_index())
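The consolidated `test_to_sql` above only needs `make_sql_connection` to hand back something that `DataFrame.to_sql` and `pandas.read_sql` accept as `con`. A minimal hypothetical stand-in, assuming a SQLite-backed connection string (per the in-test comment, the real fixture can also pre-create a table when given a name, and presumably handles file cleanup), might look like:

    # Hypothetical stand-in for the make_sql_connection fixture; an assumption for
    # illustration, not the fixture's actual code.
    def make_sql_connection_sketch(filename):
        return f"sqlite:///{filename}"  # SQLAlchemy-style URL that pandas accepts as `con`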

modin/pandas/test/utils.py

Lines changed: 1 addition & 1 deletion
@@ -943,7 +943,7 @@ def get_unique_filename(
     else:
         import uuid
 
-        return os.path.join(data_dir, (uuid.uuid1().hex + suffix_part + extension_part))
+        return os.path.join(data_dir, uuid.uuid1().hex + suffix_part + extension_part)
 
 
 def get_random_string():
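The utils.py change above only drops redundant parentheses around the joined filename, so behavior is unchanged. Purely for illustration, with made-up directory, suffix, and extension values the expression resolves as in this standalone snippet (not repository code):

    import os
    import uuid

    data_dir, suffix_part, extension_part = "/tmp/modin-test", "_suffix", ".parquet"
    # e.g. '/tmp/modin-test/9f0c2b1e...<hex>_suffix.parquet'
    print(os.path.join(data_dir, uuid.uuid1().hex + suffix_part + extension_part))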
