Merge branch 'master' of github.com:quantumblacklabs/kedro

lvijnck · Oct 25, 2021 · ab0cab9 · ab0cab9
2 parents 65de36b + 4f88b20
commit ab0cab9
Show file tree

Hide file tree

Showing 3 changed files with 41 additions and 4 deletions.
diff --git a/RELEASE.md b/RELEASE.md
@@ -9,6 +9,7 @@
 * Added support for arbitrary backends (via importable module paths) that satisfy the `pickle` interface to `PickleDataSet`.
 * Added support for `sum` syntax for connecting pipeline objects.
 * Upgraded `pip-tools`, which is used by `kedro build-reqs`, to 6.4. This `pip-tools` version requires `pip>=21.2` while [adding support for `pip>=21.3`](https://github.com/jazzband/pip-tools/pull/1501). To upgrade `pip`, please refer to [their documentation](https://pip.pypa.io/en/stable/installing/#upgrading-pip).
+* Extended `ExcelDataSet` to support saving Excel files with multiple sheets.
 * `kedro pipeline package <pipeline>` now raises an error if the `<pipeline>` argument doesn't look like a valid Python module path (e.g. has `/` instead of `.`).
 
 ## Minor breaking changes to the API

diff --git a/kedro/extras/datasets/pandas/excel_dataset.py b/kedro/extras/datasets/pandas/excel_dataset.py
@@ -32,7 +32,7 @@
 from copy import deepcopy
 from io import BytesIO
 from pathlib import PurePosixPath
-from typing import Any, Dict
+from typing import Any, Dict, Union
 
 import fsspec
 import pandas as pd
@@ -95,6 +95,7 @@ def __init__(
                 Here you can find all available arguments:
                 https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html
                 All defaults are preserved, except "engine", which is set to "xlrd".
+                To load all sheets as a dict of DataFrames, set `sheet_name` to `None` in `load_args`.
             save_args: Pandas options for saving Excel files.
                 Here you can find all available arguments:
                 https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html
@@ -161,19 +162,25 @@ def _describe(self) -> Dict[str, Any]:
             version=self._version,
         )
 
-    def _load(self) -> pd.DataFrame:
+    def _load(self) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
         load_path = get_filepath_str(self._get_load_path(), self._protocol)
 
         with self._fs.open(load_path, **self._fs_open_args_load) as fs_file:
             return pd.read_excel(fs_file, **self._load_args)
 
-    def _save(self, data: pd.DataFrame) -> None:
+    def _save(self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]]) -> None:
         output = BytesIO()
         save_path = get_filepath_str(self._get_save_path(), self._protocol)
 
         # pylint: disable=abstract-class-instantiated
         with pd.ExcelWriter(output, **self._writer_args) as writer:
-            data.to_excel(writer, **self._save_args)
+            if isinstance(data, dict):
+                for sheet_name, sheet_data in data.items():
+                    sheet_data.to_excel(
+                        writer, sheet_name=sheet_name, **self._save_args
+                    )
+            else:
+                data.to_excel(writer, **self._save_args)
 
         with self._fs.open(save_path, **self._fs_open_args_save) as fs_file:
             fs_file.write(output.getvalue())

diff --git a/tests/extras/datasets/pandas/test_excel_dataset.py b/tests/extras/datasets/pandas/test_excel_dataset.py
@@ -56,6 +56,17 @@ def excel_data_set(filepath_excel, load_args, save_args, fs_args):
     )
 
 
+@pytest.fixture
+def excel_multisheet_data_set(filepath_excel, save_args, fs_args):
+    load_args = {"sheet_name": None}
+    return ExcelDataSet(
+        filepath=filepath_excel,
+        load_args=load_args,
+        save_args=save_args,
+        fs_args=fs_args,
+    )
+
+
 @pytest.fixture
 def versioned_excel_data_set(filepath_excel, load_version, save_version):
     return ExcelDataSet(
@@ -68,6 +79,11 @@ def dummy_dataframe():
     return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})
 
 
+@pytest.fixture
+def another_dummy_dataframe():
+    return pd.DataFrame({"x": [10, 20], "y": ["hello", "world"]})
+
+
 class TestExcelDataSet:
     def test_save_and_load(self, excel_data_set, dummy_dataframe):
         """Test saving and reloading the data set."""
@@ -77,6 +93,19 @@ def test_save_and_load(self, excel_data_set, dummy_dataframe):
         assert excel_data_set._fs_open_args_load == {}
         assert excel_data_set._fs_open_args_save == {"mode": "wb"}
 
+    def test_save_and_load_multiple_sheets(
+        self, excel_multisheet_data_set, dummy_dataframe, another_dummy_dataframe
+    ):
+        """Test saving and reloading the data set with multiple sheets."""
+        dummy_multisheet = {
+            "sheet 1": dummy_dataframe,
+            "sheet 2": another_dummy_dataframe,
+        }
+        excel_multisheet_data_set.save(dummy_multisheet)
+        reloaded = excel_multisheet_data_set.load()
+        assert_frame_equal(dummy_multisheet["sheet 1"], reloaded["sheet 1"])
+        assert_frame_equal(dummy_multisheet["sheet 2"], reloaded["sheet 2"])
+
     def test_exists(self, excel_data_set, dummy_dataframe):
         """Test `exists` method invocation for both existing and
         nonexistent data set."""