Skip to content

Commit

Permalink
SNOW-1497832 Support reading excel files in snowpandas. As seen in te…
Browse files Browse the repository at this point in the history
…lemetry. (snowflakedb#1818)
  • Loading branch information
sfc-gh-jkew authored Jul 1, 2024
1 parent 5823b0f commit 990c47e
Show file tree
Hide file tree
Showing 12 changed files with 63 additions and 69 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
- Added support for `DataFrameGroupBy.all`, `SeriesGroupBy.all`, `DataFrameGroupBy.any`, and `SeriesGroupBy.any`.
- Added support for `DataFrame.nlargest`, `DataFrame.nsmallest`, `Series.nlargest` and `Series.nsmallest`.
- Added support for `replace` and `frac > 1` in `DataFrame.sample` and `Series.sample`.
- Added support for `read_excel` (uses local pandas for processing).
- Added support for `Series.at`, `Series.iat`, `DataFrame.at`, and `DataFrame.iat`.
- Added support for `Series.dt.isocalendar`.
- Added support for `Series.case_when` except when condition or replacement is callable.
Expand Down
1 change: 1 addition & 0 deletions docs/source/modin/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Input/Output
:toctree: pandas_api/

read_csv
read_excel
read_json
read_parquet

Expand Down
10 changes: 9 additions & 1 deletion docs/source/modin/supported/general_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ Data manipulations
| | | | performance. You can force the use of the Snowflake|
| | | | parser with ``engine=snowflake`` |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``read_excel`` | Y | | Uses native pandas to read excel files, using the |
| | | | engine specified by pandas. You will need to |
| | | | separately install a supported excel reader such |
| | | | as openpyxl. Please refer to the native pandas |
| | | | `read excel`_ documentation for more details. |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``read_json`` | P | ``orient``, ``typ``, ``dtype``, | ``P``: |
| | | ``convert_axes``, ``lines``, | - if ndjson files are passed |
| | | ``convert_dates``, ``date_unit``,| - Supported parameters are ``compression`` and |
Expand Down Expand Up @@ -199,4 +205,6 @@ Importing from other DataFrame libraries
| Method | Snowpark implemented? (Y/N/P/D) | Notes for current implementation |
+---------------------------------------+---------------------------------+----------------------------------------------------+
| ``api.interchange.from_dataframe`` | N | |
+---------------------------------------+---------------------------------+----------------------------------------------------+
+---------------------------------------+---------------------------------+----------------------------------------------------+

.. _read excel: https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
"cachetools", # used in UDF doctest
"pytest-timeout",
"pytest-xdist",
"openpyxl", # used in read_excel test, not a requirement for distribution
"pre-commit",
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -279,66 +279,14 @@ def read_clipboard(cls, sep=r"\s+", **kwargs): # pragma: no cover # noqa: PR01
returns="""BaseQueryCompiler or dict/OrderedDict :
QueryCompiler or OrderedDict/dict with read data.""",
)
def read_excel(
cls,
io,
sheet_name=0,
header=0,
names=None,
index_col=None,
usecols=None,
squeeze=False,
dtype=None,
engine=None,
converters=None,
true_values=None,
false_values=None,
skiprows=None,
nrows=None,
na_values=None,
keep_default_na=True,
verbose=False,
parse_dates=False,
date_parser=None,
thousands=None,
comment=None,
skip_footer=0,
skipfooter=0,
convert_float=True,
mangle_dupe_cols=True,
na_filter=True,
**kwds,
): # noqa: PR01
if skip_footer != 0:
skipfooter = skip_footer
intermediate = pandas.read_excel(
io,
sheet_name=sheet_name,
header=header,
names=names,
index_col=index_col,
usecols=usecols,
squeeze=squeeze,
dtype=dtype,
engine=engine,
converters=converters,
true_values=true_values,
false_values=false_values,
skiprows=skiprows,
nrows=nrows,
na_values=na_values,
keep_default_na=keep_default_na,
verbose=verbose,
parse_dates=parse_dates,
date_parser=date_parser,
thousands=thousands,
comment=comment,
skipfooter=skipfooter,
convert_float=convert_float,
mangle_dupe_cols=mangle_dupe_cols,
na_filter=na_filter,
**kwds,
)
def read_excel(cls, **kwargs): # noqa: PR01
try:
intermediate = pandas.read_excel(**kwargs)
except ImportError as e:
raise ImportError(
"Snowpark Pandas requires an additional package to read excel files such as openpyxl, pyxlsb, or xlrd",
e,
)
if isinstance(intermediate, (OrderedDict, dict)):
parsed = type(intermediate)()
for key in intermediate.keys():
Expand Down
2 changes: 0 additions & 2 deletions src/snowflake/snowpark/modin/pandas/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,6 @@ def read_clipboard(
)


@_inherit_docstrings(pandas.read_excel, apilink="pandas.read_excel")
@snowpark_pandas_telemetry_standalone_function_decorator
@expanduser_path_arg("io")
def read_excel(
Expand Down Expand Up @@ -517,7 +516,6 @@ def read_excel(
engine_kwargs: dict | None = None,
) -> DataFrame | dict[IntStrT, DataFrame]: # pragma: no cover
_, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

from snowflake.snowpark.modin.core.execution.dispatching.factories.dispatcher import (
FactoryDispatcher,
)
Expand Down
5 changes: 0 additions & 5 deletions src/snowflake/snowpark/modin/plugin/io/snow_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,11 +595,6 @@ def read_html(
def read_clipboard(cls, sep=r"\s+", **kwargs):
pass # pragma: no cover

@classmethod
@pandas_module_level_function_not_implemented()
def read_excel(cls, **kwargs):
pass # pragma: no cover

@classmethod
@pandas_module_level_function_not_implemented()
def read_hdf(
Expand Down
38 changes: 38 additions & 0 deletions tests/integ/modin/io/test_read_excel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#
import modin.pandas as pd
import pandas as native_pd
import pytest

from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker
from tests.integ.modin.utils import assert_frame_equal
from tests.utils import TestFiles


def test_read_excel(resources_path):
    """Compare ``pd.read_excel`` with native pandas on the bundled xlsx fixture."""
    excel_path = TestFiles(resources_path).test_file_excel

    # Both reads and the comparison run inside the counter, which is
    # configured with query_count=1 for the whole block.
    with SqlCounter(query_count=1):
        snow_frame = pd.read_excel(excel_path)
        native_frame = native_pd.read_excel(excel_path)
        # Dtypes are deliberately not compared.
        assert_frame_equal(snow_frame, native_frame, check_dtype=False)


@sql_count_checker(query_count=0)
def test_read_excel_no_lib_negative(resources_path):
    """Requesting an engine whose backing package is missing raises ImportError.

    The assertion only runs when the optional ``python_calamine`` module is
    not installed; if it is installed the test returns without asserting
    anything, because the missing-dependency error cannot be provoked.
    """
    try:
        # Probe for the module that pandas' "calamine" engine imports. The
        # package is published as ``python-calamine`` and imported as
        # ``python_calamine``; a bare ``calamine`` module is not what the
        # engine uses, so probing for it would not detect an installed reader.
        import python_calamine  # noqa: F401
    except ImportError:
        pass
    else:
        # Reader is available (not a common library to have installed).
        return

    filename = TestFiles(resources_path).test_file_excel
    with pytest.raises(
        ImportError, match="Snowpark Pandas requires an additional package"
    ):
        pd.read_excel(filename, engine="calamine")
Binary file added tests/resources/test_excel.xlsx
Binary file not shown.
1 change: 0 additions & 1 deletion tests/unit/modin/test_unsupported.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
["read_gbq", {"query": ""}],
["read_html", {"io": ""}],
["read_clipboard", {}],
["read_excel", {"io": ""}],
["read_hdf", {"path_or_buf": ""}],
["read_feather", {"path": ""}],
["read_stata", {"filepath_or_buffer": ""}],
Expand Down
1 change: 1 addition & 0 deletions tests/unit/scala/test_utils_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,7 @@ def check_zip_files_and_close_stream(input_stream, expected_files):
"resources/test_concat_file1.csv",
"resources/test_concat_file2.csv",
"resources/test_environment.yml",
"resources/test_excel.xlsx",
"resources/test_sp_dir/",
"resources/test_sp_dir/test_sp_file.py",
"resources/test_sp_dir/test_sp_mod3_file.py",
Expand Down
4 changes: 4 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1287,6 +1287,10 @@ def test_file_csv_quotes_special(self):
def test_file_csv_special_format(self):
return os.path.join(self.resources_path, "testCSVspecialFormat.csv")

@property
def test_file_excel(self):
    """Path of the ``test_excel.xlsx`` fixture inside ``resources_path``."""
    excel_name = "test_excel.xlsx"
    return os.path.join(self.resources_path, excel_name)

@functools.cached_property
def test_file_json_special_format(self):
    """Path of the ``testJSONspecialFormat.json.gz`` fixture inside ``resources_path``."""
    json_name = "testJSONspecialFormat.json.gz"
    return os.path.join(self.resources_path, json_name)
Expand Down

0 comments on commit 990c47e

Please sign in to comment.