SNOW-1497832 Support reading excel files in snowpandas. As seen in telemetry. (snowflakedb#1818)
trakmaker · Jul 1, 2024 · 990c47e · 990c47e
1 parent 5823b0f
commit 990c47e
Show file tree

Hide file tree

Showing 12 changed files with 63 additions and 69 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -81,6 +81,7 @@
 - Added support for `DataFrameGroupBy.all`, `SeriesGroupBy.all`, `DataFrameGroupBy.any`, and `SeriesGroupBy.any`.
 - Added support for `DataFrame.nlargest`, `DataFrame.nsmallest`, `Series.nlargest` and `Series.nsmallest`.
 - Added support for `replace` and `frac > 1` in `DataFrame.sample` and `Series.sample`.
+- Added support for `read_excel` (uses local pandas for processing).
 - Added support for `Series.at`, `Series.iat`, `DataFrame.at`, and `DataFrame.iat`.
 - Added support for `Series.dt.isocalendar`.
 - Added support for `Series.case_when` except when condition or replacement is callable.

diff --git a/docs/source/modin/io.rst b/docs/source/modin/io.rst
@@ -10,6 +10,7 @@ Input/Output
     :toctree: pandas_api/
 
     read_csv
+    read_excel
     read_json
     read_parquet
 

diff --git a/docs/source/modin/supported/general_supported.rst b/docs/source/modin/supported/general_supported.rst
@@ -64,6 +64,12 @@ Data manipulations
 |                             |                                 |                                  | performance. You can force the use of the Snowflake|
 |                             |                                 |                                  | parser with ``engine=snowflake``                   |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
+| ``read_excel``              | Y                               |                                  | Uses native pandas to read excel files, using the  | 
+|                             |                                 |                                  | engine specified by pandas. You will need to       |
+|                             |                                 |                                  | separately install a supported excel reader such   |
+|                             |                                 |                                  | as openpyxl. Please refer to the native pandas     | 
+|                             |                                 |                                  | `read excel`_ documentation for more details.      |
++-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``read_json``               | P                               | ``orient``, ``typ``, ``dtype``,  | ``P``:                                             |
 |                             |                                 | ``convert_axes``, ``lines``,     | - if ndjson files are passed                       |
 |                             |                                 | ``convert_dates``, ``date_unit``,| - Supported parameters are ``compression`` and     |
@@ -199,4 +205,6 @@ Importing from other DataFrame libraries
 | Method                                | Snowpark implemented? (Y/N/P/D) | Notes for current implementation                   |
 +---------------------------------------+---------------------------------+----------------------------------------------------+
 | ``api.interchange.from_dataframe``    | N                               |                                                    |
-+---------------------------------------+---------------------------------+----------------------------------------------------+
++---------------------------------------+---------------------------------+----------------------------------------------------+
+
+.. _read excel: https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
diff --git a/setup.py b/setup.py
@@ -44,6 +44,7 @@
     "cachetools",  # used in UDF doctest
     "pytest-timeout",
     "pytest-xdist",
+    "openpyxl",  # used in read_excel test, not a requirement for distribution
     "pre-commit",
 ]
 

diff --git a/src/snowflake/snowpark/modin/core/execution/dispatching/factories/baseio.py b/src/snowflake/snowpark/modin/core/execution/dispatching/factories/baseio.py
@@ -279,66 +279,14 @@ def read_clipboard(cls, sep=r"\s+", **kwargs):  # pragma: no cover # noqa: PR01
         returns="""BaseQueryCompiler or dict/OrderedDict :
     QueryCompiler or OrderedDict/dict with read data.""",
     )
-    def read_excel(
-        cls,
-        io,
-        sheet_name=0,
-        header=0,
-        names=None,
-        index_col=None,
-        usecols=None,
-        squeeze=False,
-        dtype=None,
-        engine=None,
-        converters=None,
-        true_values=None,
-        false_values=None,
-        skiprows=None,
-        nrows=None,
-        na_values=None,
-        keep_default_na=True,
-        verbose=False,
-        parse_dates=False,
-        date_parser=None,
-        thousands=None,
-        comment=None,
-        skip_footer=0,
-        skipfooter=0,
-        convert_float=True,
-        mangle_dupe_cols=True,
-        na_filter=True,
-        **kwds,
-    ):  # noqa: PR01
-        if skip_footer != 0:
-            skipfooter = skip_footer
-        intermediate = pandas.read_excel(
-            io,
-            sheet_name=sheet_name,
-            header=header,
-            names=names,
-            index_col=index_col,
-            usecols=usecols,
-            squeeze=squeeze,
-            dtype=dtype,
-            engine=engine,
-            converters=converters,
-            true_values=true_values,
-            false_values=false_values,
-            skiprows=skiprows,
-            nrows=nrows,
-            na_values=na_values,
-            keep_default_na=keep_default_na,
-            verbose=verbose,
-            parse_dates=parse_dates,
-            date_parser=date_parser,
-            thousands=thousands,
-            comment=comment,
-            skipfooter=skipfooter,
-            convert_float=convert_float,
-            mangle_dupe_cols=mangle_dupe_cols,
-            na_filter=na_filter,
-            **kwds,
-        )
+    def read_excel(cls, **kwargs):  # noqa: PR01
+        try:
+            intermediate = pandas.read_excel(**kwargs)
+        except ImportError as e:
+            raise ImportError(
+                "Snowpark Pandas requires an additional package to read excel files such as openpyxl, pyxlsb, or xlrd",
+                e,
+            )
         if isinstance(intermediate, (OrderedDict, dict)):
             parsed = type(intermediate)()
             for key in intermediate.keys():

diff --git a/src/snowflake/snowpark/modin/pandas/io.py b/src/snowflake/snowpark/modin/pandas/io.py
@@ -478,7 +478,6 @@ def read_clipboard(
     )
 
 
-@_inherit_docstrings(pandas.read_excel, apilink="pandas.read_excel")
 @snowpark_pandas_telemetry_standalone_function_decorator
 @expanduser_path_arg("io")
 def read_excel(
@@ -517,7 +516,6 @@ def read_excel(
     engine_kwargs: dict | None = None,
 ) -> DataFrame | dict[IntStrT, DataFrame]:  # pragma: no cover
     _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
-
     from snowflake.snowpark.modin.core.execution.dispatching.factories.dispatcher import (
         FactoryDispatcher,
     )

diff --git a/src/snowflake/snowpark/modin/plugin/io/snow_io.py b/src/snowflake/snowpark/modin/plugin/io/snow_io.py
@@ -595,11 +595,6 @@ def read_html(
     def read_clipboard(cls, sep=r"\s+", **kwargs):
         pass  # pragma: no cover
 
-    @classmethod
-    @pandas_module_level_function_not_implemented()
-    def read_excel(cls, **kwargs):
-        pass  # pragma: no cover
-
     @classmethod
     @pandas_module_level_function_not_implemented()
     def read_hdf(

diff --git a/tests/integ/modin/io/test_read_excel.py b/tests/integ/modin/io/test_read_excel.py
@@ -0,0 +1,38 @@
+#
+# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
+#
+import modin.pandas as pd
+import pandas as native_pd
+import pytest
+
+from tests.integ.modin.sql_counter import SqlCounter, sql_count_checker
+from tests.integ.modin.utils import assert_frame_equal
+from tests.utils import TestFiles
+
+
+def test_read_excel(resources_path):
+    test_files = TestFiles(resources_path)
+
+    filename = test_files.test_file_excel
+
+    with SqlCounter(query_count=1):
+        assert_frame_equal(
+            pd.read_excel(filename),
+            native_pd.read_excel(filename),
+            check_dtype=False,
+        )
+
+
+@sql_count_checker(query_count=0)
+def test_read_excel_no_lib_negative(resources_path):
+    try:
+        # skip test if we actually have calamine installed
+        # this is not a common library to use
+        import calamine  # noqa
+    except Exception:
+        test_files = TestFiles(resources_path)
+        filename = test_files.test_file_excel
+        with pytest.raises(
+            ImportError, match="Snowpark Pandas requires an additional package"
+        ):
+            pd.read_excel(filename, engine="calamine")
diff --git a/tests/resources/test_excel.xlsx b/tests/resources/test_excel.xlsx
diff --git a/tests/unit/modin/test_unsupported.py b/tests/unit/modin/test_unsupported.py
@@ -20,7 +20,6 @@
         ["read_gbq", {"query": ""}],
         ["read_html", {"io": ""}],
         ["read_clipboard", {}],
-        ["read_excel", {"io": ""}],
         ["read_hdf", {"path_or_buf": ""}],
         ["read_feather", {"path": ""}],
         ["read_stata", {"filepath_or_buffer": ""}],

diff --git a/tests/unit/scala/test_utils_suite.py b/tests/unit/scala/test_utils_suite.py
@@ -302,6 +302,7 @@ def check_zip_files_and_close_stream(input_stream, expected_files):
                 "resources/test_concat_file1.csv",
                 "resources/test_concat_file2.csv",
                 "resources/test_environment.yml",
+                "resources/test_excel.xlsx",
                 "resources/test_sp_dir/",
                 "resources/test_sp_dir/test_sp_file.py",
                 "resources/test_sp_dir/test_sp_mod3_file.py",

diff --git a/tests/utils.py b/tests/utils.py
@@ -1287,6 +1287,10 @@ def test_file_csv_quotes_special(self):
     def test_file_csv_special_format(self):
         return os.path.join(self.resources_path, "testCSVspecialFormat.csv")
 
+    @property
+    def test_file_excel(self):
+        return os.path.join(self.resources_path, "test_excel.xlsx")
+
     @functools.cached_property
     def test_file_json_special_format(self):
         return os.path.join(self.resources_path, "testJSONspecialFormat.json.gz")