Skip to content

Commit

Permalink
Rename all DataSet mentions to Dataset (excl. docs) (#3147)
Browse files Browse the repository at this point in the history
Signed-off-by: Merel Theisen <merel.theisen@quantumblack.com>
Co-authored-by: Deepyaman Datta <deepyaman.datta@utexas.edu>
  • Loading branch information
merelcht and deepyaman committed Oct 10, 2023
1 parent bb61b17 commit 2297d23
Show file tree
Hide file tree
Showing 32 changed files with 228 additions and 443 deletions.
2 changes: 1 addition & 1 deletion .circleci/continue_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ commands:
name: Install venv for some pre-commit hooks
command: conda install -y virtualenv
- run:
# pytables does not work properly with python 3.9 to handle our HDFDataSet
# pytables does not work properly with python 3.9 to handle our HDFDataset
# if pip-installed, so we install this dependency via conda
name: Install pytables
command: conda install -c conda-forge pytables -y
Expand Down
8 changes: 0 additions & 8 deletions docs/source/kedro.io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,11 @@ kedro.io

kedro.io.AbstractDataset
kedro.io.AbstractVersionedDataset
kedro.io.CachedDataSet
kedro.io.CachedDataset
kedro.io.DataCatalog
kedro.io.IncrementalDataSet
kedro.io.IncrementalDataset
kedro.io.LambdaDataSet
kedro.io.LambdaDataset
kedro.io.MemoryDataSet
kedro.io.MemoryDataset
kedro.io.PartitionedDataSet
kedro.io.PartitionedDataset
kedro.io.Version

Expand All @@ -32,9 +27,6 @@ kedro.io
:toctree:
:template: autosummary/class.rst

kedro.io.DataSetAlreadyExistsError
kedro.io.DataSetError
kedro.io.DataSetNotFoundError
kedro.io.DatasetAlreadyExistsError
kedro.io.DatasetError
kedro.io.DatasetNotFoundError
43 changes: 0 additions & 43 deletions docs/source/kedro_datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,91 +11,48 @@ kedro_datasets
:toctree:
:template: autosummary/class.rst

kedro_datasets.api.APIDataSet
kedro_datasets.api.APIDataset
kedro_datasets.biosequence.BioSequenceDataSet
kedro_datasets.biosequence.BioSequenceDataset
kedro_datasets.dask.ParquetDataSet
kedro_datasets.dask.ParquetDataset
kedro_datasets.databricks.ManagedTableDataSet
kedro_datasets.databricks.ManagedTableDataset
kedro_datasets.email.EmailMessageDataSet
kedro_datasets.email.EmailMessageDataset
kedro_datasets.geopandas.GeoJSONDataSet
kedro_datasets.geopandas.GeoJSONDataset
kedro_datasets.holoviews.HoloviewsWriter
kedro_datasets.json.JSONDataSet
kedro_datasets.json.JSONDataset
kedro_datasets.matplotlib.MatplotlibWriter
kedro_datasets.networkx.GMLDataSet
kedro_datasets.networkx.GMLDataset
kedro_datasets.networkx.GraphMLDataSet
kedro_datasets.networkx.GraphMLDataset
kedro_datasets.networkx.JSONDataSet
kedro_datasets.networkx.JSONDataset
kedro_datasets.pandas.CSVDataSet
kedro_datasets.pandas.CSVDataset
kedro_datasets.pandas.DeltaTableDataSet
kedro_datasets.pandas.DeltaTableDataset
kedro_datasets.pandas.ExcelDataSet
kedro_datasets.pandas.ExcelDataset
kedro_datasets.pandas.FeatherDataSet
kedro_datasets.pandas.FeatherDataset
kedro_datasets.pandas.GBQQueryDataSet
kedro_datasets.pandas.GBQQueryDataset
kedro_datasets.pandas.GBQTableDataSet
kedro_datasets.pandas.GBQTableDataset
kedro_datasets.pandas.GenericDataSet
kedro_datasets.pandas.GenericDataset
kedro_datasets.pandas.HDFDataSet
kedro_datasets.pandas.HDFDataset
kedro_datasets.pandas.JSONDataSet
kedro_datasets.pandas.JSONDataset
kedro_datasets.pandas.ParquetDataSet
kedro_datasets.pandas.ParquetDataset
kedro_datasets.pandas.SQLQueryDataSet
kedro_datasets.pandas.SQLQueryDataset
kedro_datasets.pandas.SQLTableDataSet
kedro_datasets.pandas.SQLTableDataset
kedro_datasets.pandas.XMLDataSet
kedro_datasets.pandas.XMLDataset
kedro_datasets.pickle.PickleDataSet
kedro_datasets.pickle.PickleDataset
kedro_datasets.pillow.ImageDataSet
kedro_datasets.pillow.ImageDataset
kedro_datasets.plotly.JSONDataSet
kedro_datasets.plotly.JSONDataset
kedro_datasets.plotly.PlotlyDataSet
kedro_datasets.plotly.PlotlyDataset
kedro_datasets.polars.CSVDataSet
kedro_datasets.polars.CSVDataset
kedro_datasets.polars.GenericDataSet
kedro_datasets.polars.GenericDataset
kedro_datasets.redis.PickleDataSet
kedro_datasets.redis.PickleDataset
kedro_datasets.snowflake.SnowparkTableDataSet
kedro_datasets.snowflake.SnowparkTableDataset
kedro_datasets.spark.DeltaTableDataSet
kedro_datasets.spark.DeltaTableDataset
kedro_datasets.spark.SparkDataSet
kedro_datasets.spark.SparkDataset
kedro_datasets.spark.SparkHiveDataSet
kedro_datasets.spark.SparkHiveDataset
kedro_datasets.spark.SparkJDBCDataSet
kedro_datasets.spark.SparkJDBCDataset
kedro_datasets.spark.SparkStreamingDataSet
kedro_datasets.spark.SparkStreamingDataset
kedro_datasets.svmlight.SVMLightDataSet
kedro_datasets.svmlight.SVMLightDataset
kedro_datasets.tensorflow.TensorFlowModelDataSet
kedro_datasets.tensorflow.TensorFlowModelDataset
kedro_datasets.text.TextDataSet
kedro_datasets.text.TextDataset
kedro_datasets.tracking.JSONDataSet
kedro_datasets.tracking.JSONDataset
kedro_datasets.tracking.MetricsDataSet
kedro_datasets.tracking.MetricsDataset
kedro_datasets.video.VideoDataSet
kedro_datasets.video.VideoDataset
kedro_datasets.yaml.YAMLDataSet
kedro_datasets.yaml.YAMLDataset
10 changes: 5 additions & 5 deletions docs/source/tutorial/set_up_data.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ Open `conf/base/catalog.yml` for the spaceflights project to inspect the content

```yaml
companies:
type: pandas.CSVDataSet
type: pandas.CSVDataset
filepath: data/01_raw/companies.csv

reviews:
type: pandas.CSVDataSet
type: pandas.CSVDataset
filepath: data/01_raw/reviews.csv
```
</details> <br />
Expand All @@ -44,7 +44,7 @@ Likewise for the `xlsx` dataset:

```yaml
shuttles:
type: pandas.ExcelDataSet
type: pandas.ExcelDataset
filepath: data/01_raw/shuttles.xlsx
load_args:
engine: openpyxl # Use modern Excel engine (the default since Kedro 0.18.0)
Expand Down Expand Up @@ -75,7 +75,7 @@ companies.head()
<summary><b>Click to expand</b></summary>

```
INFO Loading data from 'companies' (CSVDataSet)
INFO Loading data from 'companies' (CSVDataset)
Out[1]:
id company_rating company_location total_fleet_count iata_approved
0 35029 100% Niue 4.0 f
Expand All @@ -100,7 +100,7 @@ You should see output such as the following:
<summary><b>Click to expand</b></summary>

```
INFO Loading data from 'shuttles' (ExcelDataSet)
INFO Loading data from 'shuttles' (ExcelDataset)
Out[1]:
id shuttle_location shuttle_type engine_type ... d_check_complete moon_clearance_complete price company_id
0 63561 Niue Type V5 Quantum ... f f $1,325.0 35029
Expand Down
2 changes: 1 addition & 1 deletion features/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,6 @@ def _install_project_requirements(context):
.splitlines()
)
install_reqs = [req for req in install_reqs if "{" not in req and "#" not in req]
install_reqs.append("kedro-datasets[pandas.CSVDataSet]")
install_reqs.append("kedro-datasets[pandas.CSVDataset]")
call([context.pip, "install", *install_reqs], env=context.env)
return context
8 changes: 4 additions & 4 deletions features/steps/e2e_test_catalog.yml
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
A:
type: pandas.CSVDataSet
type: pandas.CSVDataset
filepath: data/01_raw/input_1.csv
save_args:
index: False
C:
type: pandas.CSVDataSet
type: pandas.CSVDataset
filepath: data/01_raw/input_2.csv
save_args:
index: False
E:
type: pandas.CSVDataSet
type: pandas.CSVDataset
filepath: data/02_intermediate/output_1.csv
save_args:
index: False
F:
type: pandas.CSVDataSet
type: pandas.CSVDataset
filepath: data/02_intermediate/output_2.csv
save_args:
index: False
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
# An example data set definition can look as follows:
#
#bikes:
# type: pandas.CSVDataSet
# type: pandas.CSVDataset
# filepath: "data/01_raw/bikes.csv"
#
#weather:
# type: spark.SparkDataSet
# type: spark.SparkDataset
# filepath: s3a://your_bucket/data/01_raw/weather*
# file_format: csv
# credentials: dev_s3
Expand All @@ -24,7 +24,7 @@
# header: True
#
#scooters:
# type: pandas.SQLTableDataSet
# type: pandas.SQLTableDataset
# credentials: scooters_credentials
# table_name: scooters
# load_args:
Expand All @@ -35,13 +35,13 @@
# # if_exists: 'fail'
# # if_exists: 'append'
#
# The Data Catalog supports being able to reference the same file using two different DataSet implementations
# The Data Catalog supports being able to reference the same file using two different dataset implementations
# (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here:
# https://kedro.readthedocs.io/en/stable/data/data_catalog.html
#
# This is a data set used by the "Hello World" example pipeline provided with the project
# template. Please feel free to remove it once you remove the example pipeline.

example_iris_data:
type: pandas.CSVDataSet
type: pandas.CSVDataset
filepath: data/01_raw/iris.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ jupyter~=1.0
jupyterlab_server>=2.11.1, <2.16.0
jupyterlab~=3.0, <3.6.0
kedro~={{ cookiecutter.kedro_version}}
kedro-datasets[pandas.CSVDataSet]
kedro-datasets[pandas.CSVDataset]
kedro-telemetry~=0.2.0
pytest-cov~=3.0
pytest-mock>=1.7.1, <2.0
Expand Down
4 changes: 2 additions & 2 deletions kedro/config/templated_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ class TemplatedConfigLoader(AbstractConfigLoader):
environment: "dev"
datasets:
csv: "pandas.CSVDataSet"
spark: "spark.SparkDataSet"
csv: "pandas.CSVDataset"
spark: "spark.SparkDataset"
folders:
raw: "01_raw"
Expand Down
34 changes: 3 additions & 31 deletions kedro/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""
from __future__ import annotations

from .cached_dataset import CachedDataSet, CachedDataset
from .cached_dataset import CachedDataset
from .core import (
AbstractDataset,
AbstractVersionedDataset,
Expand All @@ -13,52 +13,24 @@
Version,
)
from .data_catalog import DataCatalog
from .lambda_dataset import LambdaDataSet, LambdaDataset
from .memory_dataset import MemoryDataSet, MemoryDataset
from .lambda_dataset import LambdaDataset
from .memory_dataset import MemoryDataset
from .partitioned_dataset import (
IncrementalDataSet,
IncrementalDataset,
PartitionedDataSet,
PartitionedDataset,
)

# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
DataSetError: type[DatasetError]
DataSetNotFoundError: type[DatasetNotFoundError]
DataSetAlreadyExistsError: type[DatasetAlreadyExistsError]
AbstractDataSet: type[AbstractDataset]
AbstractVersionedDataSet: type[AbstractVersionedDataset]


def __getattr__(name):
import kedro.io.core # noqa: import-outside-toplevel

if name in (kedro.io.core._DEPRECATED_CLASSES): # noqa: protected-access
return getattr(kedro.io.core, name)
raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}")


__all__ = [
"AbstractDataSet",
"AbstractDataset",
"AbstractVersionedDataSet",
"AbstractVersionedDataset",
"CachedDataSet",
"CachedDataset",
"DataCatalog",
"DataSetAlreadyExistsError",
"DatasetAlreadyExistsError",
"DataSetError",
"DatasetError",
"DataSetNotFoundError",
"DatasetNotFoundError",
"IncrementalDataSet",
"IncrementalDataset",
"LambdaDataSet",
"LambdaDataset",
"MemoryDataSet",
"MemoryDataset",
"PartitionedDataSet",
"PartitionedDataset",
"Version",
]
17 changes: 0 additions & 17 deletions kedro/io/cached_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,11 @@
from __future__ import annotations

import logging
import warnings
from typing import Any

from kedro.io.core import VERSIONED_FLAG_KEY, AbstractDataset, Version
from kedro.io.memory_dataset import MemoryDataset

# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
CachedDataSet: type[CachedDataset]


class CachedDataset(AbstractDataset):
"""``CachedDataset`` is a dataset wrapper which caches in memory the data saved,
Expand Down Expand Up @@ -121,16 +117,3 @@ def __getstate__(self):
logging.getLogger(__name__).warning("%s: clearing cache to pickle.", str(self))
self._cache.release()
return self.__dict__


def __getattr__(name):
if name == "CachedDataSet":
alias = CachedDataset
warnings.warn(
f"{repr(name)} has been renamed to {repr(alias.__name__)}, "
f"and the alias will be removed in Kedro 0.19.0",
DeprecationWarning,
stacklevel=2,
)
return alias
raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}")
29 changes: 0 additions & 29 deletions kedro/io/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,6 @@
PROTOCOL_DELIMITER = "://"
CLOUD_PROTOCOLS = ("s3", "s3n", "s3a", "gcs", "gs", "adl", "abfs", "abfss", "gdrive")

# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
DataSetError: type[DatasetError]
DataSetNotFoundError: type[DatasetNotFoundError]
DataSetAlreadyExistsError: type[DatasetAlreadyExistsError]
AbstractDataSet: type[AbstractDataset]
AbstractVersionedDataSet: type[AbstractVersionedDataset]


class DatasetError(Exception):
"""``DatasetError`` raised by ``AbstractDataset`` implementations
Expand Down Expand Up @@ -757,25 +750,3 @@ def validate_on_forbidden_chars(**kwargs):
raise DatasetError(
f"Neither white-space nor semicolon are allowed in '{key}'."
)


_DEPRECATED_CLASSES = {
"DataSetError": DatasetError,
"DataSetNotFoundError": DatasetNotFoundError,
"DataSetAlreadyExistsError": DatasetAlreadyExistsError,
"AbstractDataSet": AbstractDataset,
"AbstractVersionedDataSet": AbstractVersionedDataset,
}


def __getattr__(name):
if name in _DEPRECATED_CLASSES:
alias = _DEPRECATED_CLASSES[name]
warnings.warn(
f"{repr(name)} has been renamed to {repr(alias.__name__)}, "
f"and the alias will be removed in Kedro 0.19.0",
DeprecationWarning,
stacklevel=2,
)
return alias
raise AttributeError(f"module {repr(__name__)} has no attribute {repr(name)}")
Loading

0 comments on commit 2297d23

Please sign in to comment.