From 4f88b2008d5b2ce0f3b6be22df076d6ec4bf7dd1 Mon Sep 17 00:00:00 2001
From: Simon Brugman <sbrugman@users.noreply.github.com>
Date: Mon, 25 Oct 2021 12:23:17 +0200
Subject: [PATCH] Support multiple sheets for ExcelDataSet (#963)

---
 RELEASE.md                                    |  1 +
 kedro/extras/datasets/pandas/excel_dataset.py | 15 +++++++---
 .../datasets/pandas/test_excel_dataset.py     | 29 +++++++++++++++++++
 3 files changed, 41 insertions(+), 4 deletions(-)
diff --git a/RELEASE.md b/RELEASE.md
index 81a99dfb17..b24bbd760f 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -9,6 +9,7 @@
 * Added support for arbitrary backends (via importable module paths) that satisfy the `pickle` interface to `PickleDataSet`.
 * Added support for `sum` syntax for connecting pipeline objects.
 * Upgraded `pip-tools`, which is used by `kedro build-reqs`, to 6.4. This `pip-tools` version requires `pip>=21.2` while [adding support for `pip>=21.3`](https://github.com/jazzband/pip-tools/pull/1501). To upgrade `pip`, please refer to [their documentation](https://pip.pypa.io/en/stable/installing/#upgrading-pip).
+* Extended `ExcelDataSet` to support saving Excel files with multiple sheets.
 * `kedro pipeline package <pipeline>` now raises an error if the `<pipeline>` argument doesn't look like a valid Python module path (e.g. has `/` instead of `.`).
 
 ## Minor breaking changes to the API
diff --git a/kedro/extras/datasets/pandas/excel_dataset.py b/kedro/extras/datasets/pandas/excel_dataset.py
index ac9c546929..559cd2a44a 100644
--- a/kedro/extras/datasets/pandas/excel_dataset.py
+++ b/kedro/extras/datasets/pandas/excel_dataset.py
@@ -32,7 +32,7 @@
 from copy import deepcopy
 from io import BytesIO
 from pathlib import PurePosixPath
-from typing import Any, Dict
+from typing import Any, Dict, Union
 
 import fsspec
 import pandas as pd
@@ -95,6 +95,7 @@ def __init__(
                 Here you can find all available arguments:
                 https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html
                 All defaults are preserved, but "engine", which is set to "xlrd".
+                Supports multi-sheet Excel files (include `sheet_name = None` in `load_args`).
             save_args: Pandas options for saving Excel files.
                 Here you can find all available arguments:
                 https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html
@@ -161,19 +162,25 @@ def _describe(self) -> Dict[str, Any]:
             version=self._version,
         )
 
-    def _load(self) -> pd.DataFrame:
+    def _load(self) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
         load_path = get_filepath_str(self._get_load_path(), self._protocol)
 
         with self._fs.open(load_path, **self._fs_open_args_load) as fs_file:
             return pd.read_excel(fs_file, **self._load_args)
 
-    def _save(self, data: pd.DataFrame) -> None:
+    def _save(self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]]) -> None:
         output = BytesIO()
         save_path = get_filepath_str(self._get_save_path(), self._protocol)
 
         # pylint: disable=abstract-class-instantiated
         with pd.ExcelWriter(output, **self._writer_args) as writer:
-            data.to_excel(writer, **self._save_args)
+            if isinstance(data, dict):
+                # Dict keys name the sheets and win over save_args["sheet_name"].
+                for sheet_name, sheet_data in data.items():
+                    kwargs = {**self._save_args, "sheet_name": sheet_name}
+                    sheet_data.to_excel(writer, **kwargs)
+            else:
+                data.to_excel(writer, **self._save_args)
 
         with self._fs.open(save_path, **self._fs_open_args_save) as fs_file:
             fs_file.write(output.getvalue())
diff --git a/tests/extras/datasets/pandas/test_excel_dataset.py b/tests/extras/datasets/pandas/test_excel_dataset.py
index 530f5eb424..73e7243ec4 100644
--- a/tests/extras/datasets/pandas/test_excel_dataset.py
+++ b/tests/extras/datasets/pandas/test_excel_dataset.py
@@ -56,6 +56,17 @@ def excel_data_set(filepath_excel, load_args, save_args, fs_args):
     )
 
 
+@pytest.fixture
+def excel_multisheet_data_set(filepath_excel, save_args, fs_args):
+    """Data set whose load returns all sheets as a dict of DataFrames."""
+    return ExcelDataSet(
+        filepath=filepath_excel,
+        load_args={"sheet_name": None},
+        save_args=save_args,
+        fs_args=fs_args,
+    )
+
+
 @pytest.fixture
 def versioned_excel_data_set(filepath_excel, load_version, save_version):
     return ExcelDataSet(
@@ -68,6 +79,11 @@ def dummy_dataframe():
     return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})
 
 
+@pytest.fixture
+def another_dummy_dataframe():
+    return pd.DataFrame({"x": [10, 20], "y": ["hello", "world"]})
+
+
 class TestExcelDataSet:
     def test_save_and_load(self, excel_data_set, dummy_dataframe):
         """Test saving and reloading the data set."""
@@ -77,6 +93,19 @@ def test_save_and_load(self, excel_data_set, dummy_dataframe):
         assert excel_data_set._fs_open_args_load == {}
         assert excel_data_set._fs_open_args_save == {"mode": "wb"}
 
+    def test_save_and_load_multiple_sheets(
+        self, excel_multisheet_data_set, dummy_dataframe, another_dummy_dataframe
+    ):
+        """Round-trip a dict of DataFrames, one sheet per dictionary key."""
+        sheets = {
+            "sheet 1": dummy_dataframe,
+            "sheet 2": another_dummy_dataframe,
+        }
+        excel_multisheet_data_set.save(sheets)
+        reloaded = excel_multisheet_data_set.load()
+        for sheet_name, expected in sheets.items():
+            assert_frame_equal(expected, reloaded[sheet_name])
+
     def test_exists(self, excel_data_set, dummy_dataframe):
         """Test `exists` method invocation for both existing and
         nonexistent data set."""