Skip to content

Commit

Permalink
Support multiple sheets for ExcelDataSet (kedro-org#963)
Browse files Browse the repository at this point in the history
  • Loading branch information
sbrugman authored Oct 25, 2021
1 parent 5402c71 commit 4f88b20
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 4 deletions.
1 change: 1 addition & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
* Added support for arbitrary backends (via importable module paths) that satisfy the `pickle` interface to `PickleDataSet`.
* Added support for `sum` syntax for connecting pipeline objects.
* Upgraded `pip-tools`, which is used by `kedro build-reqs`, to 6.4. This `pip-tools` version requires `pip>=21.2` while [adding support for `pip>=21.3`](https://github.com/jazzband/pip-tools/pull/1501). To upgrade `pip`, please refer to [their documentation](https://pip.pypa.io/en/stable/installing/#upgrading-pip).
* Extended `ExcelDataSet` to support saving Excel files with multiple sheets.
* `kedro pipeline package <pipeline>` now raises an error if the `<pipeline>` argument doesn't look like a valid Python module path (e.g. has `/` instead of `.`).

## Minor breaking changes to the API
Expand Down
15 changes: 11 additions & 4 deletions kedro/extras/datasets/pandas/excel_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from copy import deepcopy
from io import BytesIO
from pathlib import PurePosixPath
from typing import Any, Dict
from typing import Any, Dict, Union

import fsspec
import pandas as pd
Expand Down Expand Up @@ -95,6 +95,7 @@ def __init__(
Here you can find all available arguments:
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html
All defaults are preserved, except "engine", which is set to "xlrd".
Supports multi-sheet Excel files (include `sheet_name = None` in `load_args`).
save_args: Pandas options for saving Excel files.
Here you can find all available arguments:
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html
Expand Down Expand Up @@ -161,19 +162,25 @@ def _describe(self) -> Dict[str, Any]:
version=self._version,
)

def _load(self) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
    """Read the Excel file at the resolved load path with ``pd.read_excel``.

    Returns:
        Whatever ``pd.read_excel`` produces for the configured ``load_args``;
        annotated as either a single ``DataFrame`` or a dictionary of
        ``DataFrame`` objects keyed by sheet name.
    """
    resolved_path = get_filepath_str(self._get_load_path(), self._protocol)
    file_handle = self._fs.open(resolved_path, **self._fs_open_args_load)

    with file_handle as excel_file:
        loaded = pd.read_excel(excel_file, **self._load_args)
    return loaded

def _save(self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]]) -> None:
    """Serialise ``data`` to an Excel workbook and write it to the save path.

    The workbook is first rendered into an in-memory buffer and then written
    to the filesystem in a single call.

    Args:
        data: Either a single ``DataFrame`` (written with ``save_args`` as
            given), or a dictionary mapping sheet names to ``DataFrame``
            objects, in which case each entry is written to its own sheet.
    """
    output = BytesIO()
    save_path = get_filepath_str(self._get_save_path(), self._protocol)

    # pylint: disable=abstract-class-instantiated
    with pd.ExcelWriter(output, **self._writer_args) as writer:
        if isinstance(data, dict):
            for sheet_name, sheet_data in data.items():
                # The dictionary key names the sheet. Merge it in last so a
                # ``sheet_name`` entry in ``save_args`` cannot collide with
                # it; passing both as keywords would raise ``TypeError``.
                sheet_args = {**self._save_args, "sheet_name": sheet_name}
                sheet_data.to_excel(writer, **sheet_args)
        else:
            data.to_excel(writer, **self._save_args)

    with self._fs.open(save_path, **self._fs_open_args_save) as fs_file:
        fs_file.write(output.getvalue())
Expand Down
29 changes: 29 additions & 0 deletions tests/extras/datasets/pandas/test_excel_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@ def excel_data_set(filepath_excel, load_args, save_args, fs_args):
)


@pytest.fixture
def excel_multisheet_data_set(filepath_excel, save_args, fs_args):
    """Build an ``ExcelDataSet`` whose ``load_args`` set ``sheet_name`` to
    ``None``, the setting used for multi-sheet Excel files."""
    dataset_kwargs = {
        "filepath": filepath_excel,
        "load_args": {"sheet_name": None},
        "save_args": save_args,
        "fs_args": fs_args,
    }
    return ExcelDataSet(**dataset_kwargs)


@pytest.fixture
def versioned_excel_data_set(filepath_excel, load_version, save_version):
return ExcelDataSet(
Expand All @@ -68,6 +79,11 @@ def dummy_dataframe():
return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]})


@pytest.fixture
def another_dummy_dataframe():
    """Provide a second small frame with an integer and a string column."""
    columns = {"x": [10, 20], "y": ["hello", "world"]}
    return pd.DataFrame(columns)


class TestExcelDataSet:
def test_save_and_load(self, excel_data_set, dummy_dataframe):
"""Test saving and reloading the data set."""
Expand All @@ -77,6 +93,19 @@ def test_save_and_load(self, excel_data_set, dummy_dataframe):
assert excel_data_set._fs_open_args_load == {}
assert excel_data_set._fs_open_args_save == {"mode": "wb"}

def test_save_and_load_multiple_sheets(
    self, excel_multisheet_data_set, dummy_dataframe, another_dummy_dataframe
):
    """Save two frames as separate sheets, reload them, and check that each
    reloaded sheet equals the frame it was saved from."""
    sheets = {"sheet 1": dummy_dataframe, "sheet 2": another_dummy_dataframe}
    excel_multisheet_data_set.save(sheets)
    loaded_sheets = excel_multisheet_data_set.load()
    # Compare sheet by sheet, in the order the frames were saved.
    for name, expected in sheets.items():
        assert_frame_equal(expected, loaded_sheets[name])

def test_exists(self, excel_data_set, dummy_dataframe):
"""Test `exists` method invocation for both existing and
nonexistent data set."""
Expand Down

0 comments on commit 4f88b20

Please sign in to comment.