From 4f88b2008d5b2ce0f3b6be22df076d6ec4bf7dd1 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Mon, 25 Oct 2021 12:23:17 +0200 Subject: [PATCH] Support multiple sheets for ExcelDataSet (#963) --- RELEASE.md | 1 + kedro/extras/datasets/pandas/excel_dataset.py | 15 +++++++--- .../datasets/pandas/test_excel_dataset.py | 29 +++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 81a99dfb17..b24bbd760f 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -9,6 +9,7 @@ * Added support for arbitrary backends (via importable module paths) that satisfy the `pickle` interface to `PickleDataSet`. * Added support for `sum` syntax for connecting pipeline objects. * Upgraded `pip-tools`, which is used by `kedro build-reqs`, to 6.4. This `pip-tools` version requires `pip>=21.2` while [adding support for `pip>=21.3`](https://github.com/jazzband/pip-tools/pull/1501). To upgrade `pip`, please refer to [their documentation](https://pip.pypa.io/en/stable/installing/#upgrading-pip). +* Extended `ExcelDataSet` to support saving Excel files with multiple sheets. * `kedro pipeline package ` now raises an error if the `` argument doesn't look like a valid Python module path (e.g. has `/` instead of `.`). ## Minor breaking changes to the API diff --git a/kedro/extras/datasets/pandas/excel_dataset.py b/kedro/extras/datasets/pandas/excel_dataset.py index ac9c546929..559cd2a44a 100644 --- a/kedro/extras/datasets/pandas/excel_dataset.py +++ b/kedro/extras/datasets/pandas/excel_dataset.py @@ -32,7 +32,7 @@ from copy import deepcopy from io import BytesIO from pathlib import PurePosixPath -from typing import Any, Dict +from typing import Any, Dict, Union import fsspec import pandas as pd @@ -95,6 +95,7 @@ def __init__( Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html All defaults are preserved, but "engine", which is set to "xlrd". 
+ Supports multi-sheet Excel files (include `sheet_name = None` in `load_args`). save_args: Pandas options for saving Excel files. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html @@ -161,19 +162,25 @@ def _describe(self) -> Dict[str, Any]: version=self._version, ) - def _load(self) -> pd.DataFrame: + def _load(self) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]: load_path = get_filepath_str(self._get_load_path(), self._protocol) with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: return pd.read_excel(fs_file, **self._load_args) - def _save(self, data: pd.DataFrame) -> None: + def _save(self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]]) -> None: output = BytesIO() save_path = get_filepath_str(self._get_save_path(), self._protocol) # pylint: disable=abstract-class-instantiated with pd.ExcelWriter(output, **self._writer_args) as writer: - data.to_excel(writer, **self._save_args) + if isinstance(data, dict): + for sheet_name, sheet_data in data.items(): + sheet_data.to_excel( + writer, sheet_name=sheet_name, **self._save_args + ) + else: + data.to_excel(writer, **self._save_args) with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: fs_file.write(output.getvalue()) diff --git a/tests/extras/datasets/pandas/test_excel_dataset.py b/tests/extras/datasets/pandas/test_excel_dataset.py index 530f5eb424..73e7243ec4 100644 --- a/tests/extras/datasets/pandas/test_excel_dataset.py +++ b/tests/extras/datasets/pandas/test_excel_dataset.py @@ -56,6 +56,17 @@ def excel_data_set(filepath_excel, load_args, save_args, fs_args): ) +@pytest.fixture +def excel_multisheet_data_set(filepath_excel, save_args, fs_args): + load_args = {"sheet_name": None} + return ExcelDataSet( + filepath=filepath_excel, + load_args=load_args, + save_args=save_args, + fs_args=fs_args, + ) + + @pytest.fixture def versioned_excel_data_set(filepath_excel, load_version, save_version): return 
ExcelDataSet( @@ -68,6 +79,11 @@ def dummy_dataframe(): return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) +@pytest.fixture +def another_dummy_dataframe(): + return pd.DataFrame({"x": [10, 20], "y": ["hello", "world"]}) + + class TestExcelDataSet: def test_save_and_load(self, excel_data_set, dummy_dataframe): """Test saving and reloading the data set.""" @@ -77,6 +93,19 @@ def test_save_and_load(self, excel_data_set, dummy_dataframe): assert excel_data_set._fs_open_args_load == {} assert excel_data_set._fs_open_args_save == {"mode": "wb"} + def test_save_and_load_multiple_sheets( + self, excel_multisheet_data_set, dummy_dataframe, another_dummy_dataframe + ): + """Test saving and reloading the data set with multiple sheets.""" + dummy_multisheet = { + "sheet 1": dummy_dataframe, + "sheet 2": another_dummy_dataframe, + } + excel_multisheet_data_set.save(dummy_multisheet) + reloaded = excel_multisheet_data_set.load() + assert_frame_equal(dummy_multisheet["sheet 1"], reloaded["sheet 1"]) + assert_frame_equal(dummy_multisheet["sheet 2"], reloaded["sheet 2"]) + def test_exists(self, excel_data_set, dummy_dataframe): """Test `exists` method invocation for both existing and nonexistent data set."""