diff --git a/.circleci/config.yml b/.circleci/config.yml index 1cf2aaaa00..f10e9a2166 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -28,6 +28,11 @@ commands: - run: name: Activate conda environment command: echo "conda deactivate; conda activate kedro_builder" >> $BASH_ENV + - run: + # pytables does not work properly with python 3.9 to handle our HDFDataSet + # if pip-installed, so we install this dependency via conda + name: Install conda packages + command: echo "conda install -c conda-forge pytables -y" >> $BASH_ENV setup_requirements: steps: @@ -120,6 +125,11 @@ commands: win_setup_requirements: steps: + # pytables and Fiona have a series of binary dependencies under Windows that + # are best handled by conda-installing instead of pip-installing them. + - run: + name: Install pytables + command: conda activate kedro_builder; conda install -c conda-forge pytables -y - run: name: Install GDAL command: conda activate kedro_builder; conda install -c conda-forge gdal -y @@ -217,20 +227,9 @@ jobs: # Setting it to 2 or higher will suppress the warning messages totally. name: Set HDF5_DISABLE_VERSION_CHECK environment variable command: setx /m HDF5_DISABLE_VERSION_CHECK 1 - - unless: - condition: - equal: ["3.6", <>] - steps: - - run: - name: Run unit tests without spark - command: conda activate kedro_builder; make test-no-spark - - when: - condition: - equal: ["3.6", <>] - steps: - - run: - name: Run unit tests without spark or tensorflow - command: conda activate kedro_builder; pytest tests --no-cov --ignore tests/extras/datasets/spark --ignore tests/extras/datasets/tensorflow --numprocesses 4 --dist loadfile + - run: + name: Run unit tests without spark + command: conda activate kedro_builder; make test-no-spark lint: parameters: @@ -457,31 +456,31 @@ workflows: - e2e_tests: matrix: parameters: - python_version: ["3.6", "3.7", "3.8"] + python_version: ["3.7", "3.8", "3.9"] - win_e2e_tests: matrix: parameters: - python_version: ["3.6", "3.7", "3.8"] + python_version: ["3.7", "3.8", "3.9"] - unit_tests: matrix: parameters: - python_version: ["3.6", "3.7", "3.8"] + python_version: ["3.7", "3.8", "3.9"] - win_unit_tests: matrix: parameters: - python_version: ["3.6", "3.7", "3.8"] + python_version: ["3.7", "3.8", "3.9"] - lint: matrix: parameters: - python_version: ["3.6", "3.7", "3.8"] + python_version: ["3.7", "3.8", "3.9"] - pip_compile: matrix: parameters: - python_version: ["3.6", "3.7", "3.8"] + python_version: ["3.7", "3.8", "3.9"] - win_pip_compile: matrix: parameters: - python_version: ["3.6", "3.7", "3.8"] + python_version: ["3.7", "3.8", "3.9"] - build_docs - docs_linkcheck - all_circleci_checks_succeeded: @@ -534,7 +533,7 @@ workflows: - build_kedro: matrix: parameters: - python_version: ["3.6", "3.7", "3.8"] + python_version: ["3.7", "3.8", "3.9"] requires: - build_docker_image-<> @@ -564,7 +563,7 @@ workflows: - build_kedro: matrix: parameters: - python_version: ["3.6", "3.7", "3.8"] + python_version: ["3.7", "3.8", "3.9"] - publish_kedro: requires: - build_kedro diff --git a/.gitignore b/.gitignore index e99cbc2524..ee243eb41a 100644 --- a/.gitignore +++ b/.gitignore @@ -147,6 +147,9 @@ kedro.db .*.swo .*.swp +# Prettier +.prettierignore + .pytest_cache/ kedro/html docs/tmp-build-artifacts diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cb1eb8ea11..4c2a89bb96 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,7 +29,7 @@ repos: exclude: "^kedro/templates/|^features/steps/test_starter/" - repo: 
https://github.com/pre-commit/mirrors-mypy - rev: v0.720 + rev: v0.812 hooks: - id: mypy args: [--allow-redefinition, --ignore-missing-imports] diff --git a/.readthedocs.yml b/.readthedocs.yml index cf91de6bd1..507f8d271a 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -21,7 +21,7 @@ formats: # Optionally set the version of Python and requirements required to build your docs python: - version: 3.6 + version: 3.7 install: - method: pip path: . diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 720c97fc98..aa326aedf3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -43,7 +43,7 @@ Check the [Kedro repo to find open pull requests](https://github.com/kedro-org/k ## Contribute a fix or feature -If you're interested in contributing fixes to code or documentation, first read our [guidelines for contributing developers](https://kedro.readthedocs.io/en/stable/14_contribution/02_developer_contributor_guidelines.html) for an explanation of how to get set up and the process you'll follow. +If you're interested in contributing fixes to code or documentation, first read our [guidelines for contributing developers](https://kedro.readthedocs.io/en/stable/contribution/developer_contributor_guidelines.html) for an explanation of how to get set up and the process you'll follow. Once you are ready to contribute, a good place to start is to take a look at the `good first issues` and `help wanted issues` on [GitHub](https://github.com/kedro-org/kedro/issues). @@ -51,4 +51,4 @@ Once you are ready to contribute, a good place to start is to take a look at the You can help us improve the [Kedro documentation online](https://kedro.readthedocs.io/en/stable/). Send us feedback as a [GitHub issue](https://github.com/kedro-org/kedro/issues) or start a documentation discussion on [GitHub](https://github.com/kedro-org/kedro/discussions). -You are also welcome to make a raise a PR with a bug fix or addition to the documentation. First read the guide [Contribute to the Kedro documentation](https://kedro.readthedocs.io/en/stable/14_contribution/04_documentation_contributor_guidelines.html). +You are also welcome to make a raise a PR with a bug fix or addition to the documentation. First read the guide [Contribute to the Kedro documentation](https://kedro.readthedocs.io/en/stable/contribution/documentation_contributor_guidelines.html). diff --git a/README.md b/README.md index ac9f5c0115..a34e39fa8c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![Kedro Logo Banner](https://raw.githubusercontent.com/kedro-org/kedro/develop/static/img/kedro_banner.png) -[![Python version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue.svg)](https://pypi.org/project/kedro/) +[![Python version](https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue.svg)](https://pypi.org/project/kedro/) [![PyPI version](https://badge.fury.io/py/kedro.svg)](https://pypi.org/project/kedro/) [![Conda version](https://img.shields.io/conda/vn/conda-forge/kedro.svg)](https://anaconda.org/conda-forge/kedro) [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/kedro-org/kedro/blob/main/LICENSE.md) @@ -28,7 +28,7 @@ It is also possible to install Kedro using `conda`: conda install -c conda-forge kedro ``` -Our [Get Started guide](https://kedro.readthedocs.io/en/stable/02_get_started/01_prerequisites.html) contains full installation instructions, and includes how to set up Python virtual environments. 
+Our [Get Started guide](https://kedro.readthedocs.io/en/stable/get_started/prerequisites.html) contains full installation instructions, and includes how to set up Python virtual environments. ## What are the main features of Kedro? @@ -49,9 +49,9 @@ Our [Get Started guide](https://kedro.readthedocs.io/en/stable/02_get_started/01 ## How do I use Kedro? The [Kedro documentation](https://kedro.readthedocs.io/en/stable/) includes three examples to help get you started: -- A typical "Hello World" example, for an [entry-level description of the main Kedro concepts](https://kedro.readthedocs.io/en/stable/02_get_started/03_hello_kedro.html) -- An [introduction to the project template](https://kedro.readthedocs.io/en/stable/02_get_started/05_example_project.html) using the Iris dataset -- A more detailed [spaceflights tutorial](https://kedro.readthedocs.io/en/stable/03_tutorial/02_tutorial_template.html) to give you hands-on experience +- A typical "Hello World" example, for an [entry-level description of the main Kedro concepts](https://kedro.readthedocs.io/en/stable/get_started/hello_kedro.html) +- An [introduction to the project template](https://kedro.readthedocs.io/en/stable/get_started/example_project.html) using the Iris dataset +- A more detailed [spaceflights tutorial](https://kedro.readthedocs.io/en/stable/tutorial/tutorial_template.html) to give you hands-on experience ## Why does Kedro exist? @@ -66,7 +66,7 @@ Kedro is built upon our collective best-practice (and mistakes) trying to delive ## The humans behind Kedro -Kedro is maintained by [a product team](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html) and a number of [contributors from across the world](https://github.com/kedro-org/kedro/releases). +Kedro is maintained by [a product team](https://kedro.readthedocs.io/en/stable/faq/faq.html) and a number of [contributors from across the world](https://github.com/kedro-org/kedro/releases). ## Can I contribute? @@ -76,7 +76,7 @@ Yes! Want to help build Kedro? Check out our [guide to contributing to Kedro](ht ## Where can I learn more? -There is a growing community around Kedro. Have a look at the [Kedro FAQs](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#how-can-i-find-out-more-about-kedro) to find projects using Kedro and links to articles, podcasts and talks. +There is a growing community around Kedro. Have a look at the [Kedro FAQs](https://kedro.readthedocs.io/en/stable/faq/faq.html#how-can-i-find-out-more-about-kedro) to find projects using Kedro and links to articles, podcasts and talks. ## Who likes Kedro? diff --git a/RELEASE.md b/RELEASE.md index 669176a2a1..8bf2b52012 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,122 @@ +# Upcoming Release 0.18.0 + +## Major features and improvements +* Added support for Python 3.9, dropped support for Python 3.6. +* Support specifying parameters mapping in `pipeline()` without the `params:` prefix. +* Added new API `Pipeline.filter()` (previously in `KedroContext._filter_pipeline()`) to filter parts of a pipeline. +* Added `partitionBy` support and exposed `save_args` for `SparkHiveDataSet`. +* Exposed `open_args_save` in `fs_args` for `pandas.ParquetDataSet`. +* Bumped the minimum version of `pandas` to 1.3. Any `storage_options` should continue to be specified under `fs_args` and/or `credentials`. +* Refactored the `load` and `save` operations for `pandas` datasets in order to leverage `pandas` own API and delegate `fsspec` operations to them. This reduces the need to have our own `fsspec` wrappers. 
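As an illustration of the parameters-mapping change noted above, here is a minimal sketch; the node, dataset and parameter names are invented for the example, and it assumes the `pipeline()` helper exported by `kedro.pipeline` in 0.18:

```python
from kedro.pipeline import node, pipeline


def fit(alpha):
    return {"alpha": alpha}


base = pipeline([node(fit, inputs="params:alpha", outputs="model", name="fit")])

# Before 0.18 the mapping had to repeat the prefix on both sides:
#   pipeline(base, parameters={"params:alpha": "params:candidate_alpha"}, namespace="tuning")
# From 0.18 the prefix can be dropped:
tuned = pipeline(base, parameters={"alpha": "candidate_alpha"}, namespace="tuning")
```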
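The new `Pipeline.filter()` method mentioned above takes over from `KedroContext._filter_pipeline()`. A sketch of how it might be called, assuming it accepts the same `tags`/`node_names` filters that `kedro run` exposes:

```python
from kedro.pipeline import node, pipeline


def preprocess(raw):
    return raw


def train(table):
    return table


full = pipeline(
    [
        node(preprocess, "raw", "table", name="preprocess", tags="etl"),
        node(train, "table", "model", name="train", tags="ml"),
    ]
)

ml_only = full.filter(tags=["ml"])                 # keep only nodes tagged "ml"
just_preprocess = full.filter(node_names=["preprocess"])
```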
+* Removed `cli.py` from the Kedro project template. By default, all CLI commands, including `kedro run`, are now defined on the Kedro framework side. These can be overridden in turn by a plugin or a `cli.py` file in your project. A packaged Kedro project will respect the same hierarchy when executed with `python -m my_package`. +* Merged `pandas.AppendableExcelDataSet` into `pandas.ExcelDataSet`. +* Added `save_args` to `feather.FeatherDataSet`. +* The default `kedro` environment names can now be set in `settings.py` with the help of the `CONFIG_LOADER_ARGS` variable. The relevant keys to be supplied are `base_env` and `default_run_env`. These values are set to `base` and `local` respectively as a default. +* Added `kedro.config.abstract_config.AbstractConfigLoader` as an abstract base class for all `ConfigLoader` implementations. `ConfigLoader` and `TemplatedConfigLoader` now inherit directly from this base class. +* Streamlined the `ConfigLoader.get` and `TemplatedConfigLoader.get` API and delegated the actual `get` method functional implementation to the `kedro.config.common` module. +* Added the following new datasets: + +| Type | Description | Location | +| ------------------- | -------------------------------------------------------------- | ------------------------------ | +| `pandas.XMLDataSet` | Read XML into Pandas DataFrame. Write Pandas DataFrame to XML. | `kedro.extras.datasets.pandas` | +| `networkx.GraphMLDataSet` | Work with NetworkX using GraphML files | `kedro.extras.datasets.networkx` | +| `networkx.GMLDataSet` | Work with NetworkX using Graph Modelling Language files | `kedro.extras.datasets.networkx` | + +## Breaking changes to the API +* Add namespace to parameters in a modular pipeline, which addresses [Issue 399](https://github.com/kedro-org/kedro/issues/399) +* `pandas.ExcelDataSet` now uses `openpyxl` engine instead of `xlrd`. +* `pandas.ParquetDataSet` now calls `pd.to_parquet()` upon saving. Note that the argument `partition_cols` is not supported. +* `KedroSession.run` now raises `ValueError` rather than `KedroContextError` when the pipeline contains no nodes. The same `ValueError` is raised when there are no matching tags. +* `KedroSession.run` now raises `ValueError` rather than `KedroContextError` when the pipeline name doesn't exist in the pipeline registry. +* Removed deprecated functions `load_context` and `get_project_context`. +* `spark.SparkHiveDataSet` API has been updated to reflect `spark.SparkDataSet`. The `write_mode=insert` option has also been replaced with `write_mode=append` as per Spark styleguide. This change addresses [Issue 725](https://github.com/kedro-org/kedro/issues/725) and [Issue 745](https://github.com/kedro-org/kedro/issues/745). Additionally, `upsert` mode now leverages `checkpoint` functionality and requires a valid `checkpointDir` be set for current `SparkContext`. +* Deprecated and removed `ProjectHooks.register_config_loader` `hook_spec` in favour of loading `CONFIG_LOADER_CLASS` directly from `settings.py`. The default option for `CONFIG_LOADER_CLASS` is now set to `kedro.config.ConfigLoader`. +* Added `CONFIG_LOADER_ARGS` to `settings.py` to facilitate the provision of additional keyword arguments to the constructor of the project `config_loader`. The default option for `CONFIG_LOADER_ARGS` is an empty dictionary. +* `yaml.YAMLDataSet` can no longer save a `pandas.DataFrame` directly, but it can save a dictionary. 
Use `pandas.DataFrame.to_dict()` to convert your `pandas.DataFrame` to a dictionary before you attempt to save it to YAML. +* Removed `--version` CLI option for `kedro pipeline package` command. Specific pipeline package version can be added by setting the `__version__` variable in the pipeline package's `__init__.py` file. +* The `kedro package` and `kedro pipeline package` now save `egg` and `whl` files in the `/dist` folder (previously `/src/dist`). +* Removed `kedro pipeline list` and `kedro pipeline describe` commands in favour of `kedro registry list` and `kedro registry describe`. +* Removed `open_args_load` and `open_args_save` from the following datasets: + * `pandas.CSVDataSet` + * `pandas.ExcelDataSet` + * `pandas.FeatherDataSet` + * `pandas.JSONDataSet` + * `pandas.ParquetDataSet` +* `storage_options` are now dropped if they are specified under `load_args` or `save_args` for the following datasets: + * `pandas.CSVDataSet` + * `pandas.ExcelDataSet` + * `pandas.FeatherDataSet` + * `pandas.JSONDataSet` + * `pandas.ParquetDataSet` +* The environment defaulting behaviour has been removed from `KedroContext` and is now implemented in a `ConfigLoader` class (or equivalent) with the `base_env` and `default_run_env` attributes. +* `ConfigLoader` and `TemplatedConfigLoader` argument `conf_root` has been renamed to `conf_source` to align the API. +* The `settings.py` setting `CONF_ROOT` has been renamed to `CONF_SOURCE` to align the API. Default value of `conf` remains unchanged. +* Renamed `extra_params` to `runtime_params` in `kedro.config.config.ConfigLoader` and `kedro.config.templated_config.TemplatedConfigLoader`. +* Switched from packaging pipelines as wheel files to tar archive files compressed with gzip (`.tar.gz`) +* `kedro pipeline package` now accepts a module name and path to the pipeline or utility module to package, relative to `src//`. In addition to the `--alias` flag used to rename the package, `kedro pipeline pull` now also supports `--destination` to provide a location for pulling the package. +* Renamed `lambda_data_set`, `memory_data_set`, and `partitioned_data_set` to `lambda_dataset`, `memory_dataset`, and `partitioned_dataset`, respectively, in `kedro.io`. +* Removed the `kedro install` command in favour of using `pip install -r src/requirements.txt` to install project dependencies. +* The dataset `networkx.NetworkXDataSet` has been renamed to `networkx.JSONDataSet`. +* Removed the `config_loader` property from `KedroContext`. +* Removed decorator API from `Node` and `Pipeline`, as well as the modules `kedro.extras.decorators` and `kedro.pipeline.decorators`. +* Removed transformer API from `DataCatalog`, as well as the modules `kedro.extras.transformers` and `kedro.io.transformers`. +* Removed the `Journal` and `DataCatalogWithDefault`. +* Removed the `--parallel` flag from `kedro run` in favour of `--runner=ParallelRunner`. The `-p` flag is now an alias for `--pipeline`. +* Removed deprecated `CONF_SOURCE`, `package_name`, `pipeline`, `pipelines`, and `io` attributes from `KedroContext` as well as the deprecated `KedroContext.run` method. +* Changed the behaviour of `kedro build-reqs` to compile requirements from `requirements.txt` instead of `requirements.in` and save them to `requirements.lock` instead of `requirements.txt`. +* Removed `ProjectHooks.register_catalog` `hook_spec` in favour of loading `DATA_CATALOG_CLASS` directly from `settings.py`. The default option for `DATA_CATALOG_CLASS` is now set to `kedro.io.DataCatalog`. 
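Taken together, the `CONFIG_LOADER_CLASS`, `CONFIG_LOADER_ARGS` and `DATA_CATALOG_CLASS` settings described above replace the removed registration hooks. A minimal `settings.py` sketch (the `globals_pattern` value is purely illustrative):

```python
# settings.py (sketch)
from kedro.config import TemplatedConfigLoader
from kedro.io import DataCatalog

CONFIG_LOADER_CLASS = TemplatedConfigLoader  # defaults to kedro.config.ConfigLoader if omitted
CONFIG_LOADER_ARGS = {
    "globals_pattern": "*globals.yml",  # extra keyword arguments for the loader's constructor
    "base_env": "base",                 # default environment names, configurable since 0.18
    "default_run_env": "local",
}
DATA_CATALOG_CLASS = DataCatalog        # defaults to kedro.io.DataCatalog if omitted
```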
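And because `pandas.ParquetDataSet` now delegates to `pd.to_parquet()` upon saving (see above), saving options are passed to `save_args` directly rather than nested under `from_pandas`. A sketch with an invented file name:

```python
import pandas as pd
from kedro.extras.datasets.pandas import ParquetDataSet

# 0.17.x: save_args={"from_pandas": {"preserve_index": False}}
# 0.18.x: options are forwarded to pd.to_parquet(), e.g. index or compression:
cars = ParquetDataSet(filepath="cars.parquet", save_args={"index": False})
cars.save(pd.DataFrame({"speed": [120, 150]}))
reloaded = cars.load()
```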
+* Removed `RegistrationSpecs` and all registration hooks that belonged to it. Going forward users can register custom library components through `settings.py`. + +## Thanks for supporting contributions + +[Deepyaman Datta](https://github.com/deepyaman), [Lucas Jamar](https://github.com/lucasjamar), [Simon Brugman](https://github.com/sbrugman) + +## Migration guide from Kedro 0.17.* to 0.18.* +* Please remove any existing `hook_impl` of the `register_config_loader` and `register_catalog` methods from `ProjectHooks` (or custom alternatives). +* Populate `settings.py` with `CONFIG_LOADER_CLASS` set to your expected config loader class (for example `kedro.config.TemplatedConfigLoader` or custom implementation). If `CONFIG_LOADER_CLASS` value is not set, it will default to `kedro.config.ConfigLoader` at runtime. +* Populate `settings.py` with `CONFIG_LOADER_ARGS` set to a dictionary with expected keyword arguments. If `CONFIG_LOADER_ARGS` is not set, it will default to an empty dictionary. +* Populate `settings.py` with `DATA_CATALOG_CLASS` set to your expected data catalog class. If `DATA_CATALOG_CLASS` value is not set, it will default to `kedro.io.DataCatalog` at runtime. +* Optional: You can now remove the `params:` prefix when supplying values to the `parameters` argument in a `pipeline()` call. +* If you're using `pandas.ExcelDataSet`, make sure you have `openpyxl` installed in your environment. Note that this is automatically pulled if you specify `kedro[pandas.ExcelDataSet]==0.18.0` in your `requirements.in`. You can uninstall `xlrd` if you were only using it for this dataset. +* If you're using `pandas.ParquetDataSet`, please pass pandas saving arguments directly to `save_args` instead of nested in `from_pandas` (e.g. `save_args = {"preserve_index": False}` instead of `save_args = {"from_pandas": {"preserve_index": False}}`). +* If you're using `spark.SparkHiveDataSet` with `write_mode` option set to `insert`, please update this to `append` in line with the Spark styleguide. If you're using `spark.SparkHiveDataSet` with `write_mode` option set to `upsert`, please make sure that your `SparkContext` has a valid `checkpointDir` set either by `SparkContext.setCheckpointDir` method or directly in the `conf` folder. +* Edit any scripts containing `kedro pipeline package --version` to remove the `--version` option. If you wish to set a specific pipeline package version, set the `__version__` variable in the pipeline package's `__init__.py` file. +* If you had any `pandas.AppendableExcelDataSet` entries in your catalog, replace them with `pandas.ExcelDataSet`. +* If you were using `pandas~=1.2.0` and passing `storage_options` through `load_args` or `save_args`, please specify them under `fs_args` or via `credentials` instead. +* Update the `settings.py` setting `CONF_ROOT` to `CONF_SOURCE`. +* Update the keyword argument `conf_root` to `conf_source` when calling `ConfigLoader` or `TemplatedConfigLoader` directly. +* Rename `extra_params` to `runtime_params` in `kedro.config.config.ConfigLoader` and `kedro.config.templated_config.TemplatedConfigLoader`, or your custom implementation, if it calls `ConfigLoader` or any of its parent classes. +* If you were importing from `kedro.io.lambda_data_set`, `kedro.io.memory_data_set`, or `kedro.io.partitioned_data_set`, change the import to `kedro.io.lambda_dataset`, `kedro.io.memory_dataset`, or `kedro.io.partitioned_dataset`, respectively (or import the dataset directly from `kedro.io`).
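The module rename in the last bullet above is a one-line change; the old paths are shown only as comments:

```python
# Before 0.18 (module paths that no longer exist):
#   from kedro.io.memory_data_set import MemoryDataSet
#   from kedro.io.partitioned_data_set import PartitionedDataSet
# From 0.18, use the renamed modules or import from the package directly:
from kedro.io.memory_dataset import MemoryDataSet
from kedro.io import LambdaDataSet, PartitionedDataSet
```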
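Similarly, for the `storage_options` note above, filesystem options now travel through `fs_args` or `credentials` rather than `load_args`/`save_args`. A sketch with an invented bucket and placeholder credentials (the `s3://` protocol assumes `s3fs` is installed):

```python
from kedro.extras.datasets.pandas import CSVDataSet

# Connection options go to fsspec via fs_args/credentials; load_args keeps pandas options only.
reviews = CSVDataSet(
    filepath="s3://my-bucket/raw/reviews.csv",                    # hypothetical bucket/key
    credentials={"key": "<aws-key>", "secret": "<aws-secret>"},   # placeholders
    fs_args={"anon": False},
    load_args={"sep": ","},
)
```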
+* If you were pulling any modular pipelines with `kedro pipeline pull my_pipeline --alias other_pipeline`, please use `kedro pipeline pull my_pipeline --alias pipelines.other_pipeline` instead. +* If you were packaging any modular pipelines with `kedro pipeline package my_pipeline`, please use `kedro pipeline package pipelines.my_pipeline` instead. +* Similarly, if you were packaging any modular pipelines using `pyproject.toml`, you should modify the keys to include the full module path, and wrapped in double-quotes, e.g: + + ```toml + [tool.kedro.pipeline.package] + data_engineering = {destination = "path/to/here"} + data_science = {alias = "ds", env = "local"} + + [tool.kedro.pipeline.pull] + "s3://my_bucket/my_pipeline" = {alias = "aliased_pipeline"} + ``` + + becomes + + ```toml + [tool.kedro.pipeline.package] + "pipelines.data_engineering" = {destination = "path/to/here"} + "pipelines.data_science" = {alias = "ds", env = "local"} + + [tool.kedro.pipeline.pull] + "s3://my_bucket/my_pipeline" = {alias = "pipelines.aliased_pipeline"} + + ``` + +* If you had any `networkx.NetworkXDataSet` entries in your catalog, replace them with `networkx.JSONDataSet`. +* If you were using the `KedroContext` to access `ConfigLoader`, please use `settings.CONFIG_LOADER_CLASS` to access the currently used `ConfigLoader` instead. +* To run a pipeline in parallel, use `kedro run --runner=ParallelRunner` rather than `--parallel` or `-p`. + + # Release 0.17.7 ## Major features and improvements @@ -11,6 +130,7 @@ ## Upcoming deprecations for Kedro 0.18.0 + # Release 0.17.6 ## Major features and improvements @@ -70,7 +190,7 @@ ## Major features and improvements * Added new CLI group `registry`, with the associated commands `kedro registry list` and `kedro registry describe`, to replace `kedro pipeline list` and `kedro pipeline describe`. -* Added support for dependency management at a modular pipeline level. When a pipeline with `requirements.txt` is packaged, its dependencies are embedded in the modular pipeline wheel file. Upon pulling the pipeline, Kedro will append dependencies to the project's `requirements.in`. More information is available in [our documentation](https://kedro.readthedocs.io/en/stable/06_nodes_and_pipelines/03_modular_pipelines.html#package-a-modular-pipeline). +* Added support for dependency management at a modular pipeline level. When a pipeline with `requirements.txt` is packaged, its dependencies are embedded in the modular pipeline wheel file. Upon pulling the pipeline, Kedro will append dependencies to the project's `requirements.in`. More information is available in [our documentation](https://kedro.readthedocs.io/en/0.17.5/06_nodes_and_pipelines/03_modular_pipelines.html). * Added support for bulk packaging/pulling modular pipelines using `kedro pipeline package/pull --all` and `pyproject.toml`. * Removed `cli.py` from the Kedro project template. By default all CLI commands, including `kedro run`, are now defined on the Kedro framework side. These can be overridden in turn by a plugin or a `cli.py` file in your project. A packaged Kedro project will respect the same hierarchy when executed with `python -m my_package`. * Removed `.ipython/profile_default/startup/` from the Kedro project template in favour of `.ipython/profile_default/ipython_config.py` and the `kedro.extras.extensions.ipython`. @@ -238,7 +358,7 @@ * This release has broken the `kedro ipython` and `kedro jupyter` workflows. To fix this, follow the instructions in the migration guide below. 
* You will also need to upgrade `kedro-viz` to 3.10.1 if you use the `%run_viz` line magic in Jupyter Notebook. -> *Note:* If you're using the `ipython` [extension](https://kedro.readthedocs.io/en/stable/11_tools_integration/02_ipython.html#ipython-extension) instead, you will not encounter this problem. +> *Note:* If you're using the `ipython` [extension](https://kedro.readthedocs.io/en/0.17.1/11_tools_integration/02_ipython.html#ipython-extension) instead, you will not encounter this problem. ## Migration guide You will have to update the file `/.ipython/profile_default/startup/00-kedro-init.py` in order to make `kedro ipython` and/or `kedro jupyter` work. Add the following line before the `KedroSession` is created: @@ -270,7 +390,7 @@ from kedro.framework.session import KedroSession ## Major features and improvements -* In a significant change, [we have introduced `KedroSession`](https://kedro.readthedocs.io/en/stable/04_kedro_project_setup/03_session.html) which is responsible for managing the lifecycle of a Kedro run. +* In a significant change, [we have introduced `KedroSession`](https://kedro.readthedocs.io/en/0.17.0/04_kedro_project_setup/03_session.html) which is responsible for managing the lifecycle of a Kedro run. * Created a new Kedro Starter: `kedro new --starter=mini-kedro`. It is possible to [use the DataCatalog as a standalone component](https://github.com/kedro-org/kedro-starters/tree/master/mini-kedro) in a Jupyter notebook and transition into the rest of the Kedro framework. * Added `DatasetSpecs` with Hooks to run before and after datasets are loaded from/saved to the catalog. * Added a command: `kedro catalog create`. For a registered pipeline, it creates a `//catalog/.yml` configuration file with `MemoryDataSet` datasets for each dataset that is missing from `DataCatalog`. @@ -278,7 +398,7 @@ from kedro.framework.session import KedroSession * `ProjectContext` is no longer needed, unless for very complex customisations. `KedroContext`, `ProjectHooks` and `settings.py` together implement sensible default behaviour. As a result `context_path` is also now an _optional_ key in `pyproject.toml`. * Removed `ProjectContext` from `src//run.py`. * `TemplatedConfigLoader` now supports [Jinja2 template syntax](https://jinja.palletsprojects.com/en/2.11.x/templates/) alongside its original syntax. -* Made [registration Hooks](https://kedro.readthedocs.io/en/stable/07_extend_kedro/02_hooks.html#registration-hooks) mandatory, as the only way to customise the `ConfigLoader` or the `DataCatalog` used in a project. If no such Hook is provided in `src//hooks.py`, a `KedroContextError` is raised. There are sensible defaults defined in any project generated with Kedro >= 0.16.5. +* Made [registration Hooks](https://kedro.readthedocs.io/en/0.17.0/07_extend_kedro/02_hooks.html#registration-hooks) mandatory, as the only way to customise the `ConfigLoader` or the `DataCatalog` used in a project. If no such Hook is provided in `src//hooks.py`, a `KedroContextError` is raised. There are sensible defaults defined in any project generated with Kedro >= 0.16.5. ## Bug fixes and other changes @@ -332,14 +452,14 @@ from kedro.framework.session import KedroSession ## Migration guide from Kedro 0.16.* to 0.17.* -**Reminder:** Our documentation on [how to upgrade Kedro](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#how-do-i-upgrade-kedro) covers a few key things to remember when updating any Kedro version. 
+**Reminder:** Our documentation on [how to upgrade Kedro](https://kedro.readthedocs.io/en/0.17.0/12_faq/01_faq.html#how-do-i-upgrade-kedro) covers a few key things to remember when updating any Kedro version. The Kedro 0.17.0 release contains some breaking changes. If you update Kedro to 0.17.0 and then try to work with projects created against earlier versions of Kedro, you may encounter some issues when trying to run `kedro` commands in the terminal for that project. Here's a short guide to getting your projects running against the new version of Kedro. >*Note*: As always, if you hit any problems, please check out our documentation: ->* [How can I find out more about Kedro?](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#how-can-i-find-out-more-about-kedro) ->* [How can I get my questions answered?](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#how-can-i-get-my-question-answered). +>* [How can I find out more about Kedro?](https://kedro.readthedocs.io/en/0.17.0/12_faq/01_faq.html#how-can-i-find-out-more-about-kedro) +>* [How can I get my questions answered?](https://kedro.readthedocs.io/en/0.17.0/12_faq/01_faq.html#how-can-i-get-my-question-answered). To get an existing Kedro project to work after you upgrade to Kedro 0.17.0, we recommend that you create a new project against Kedro 0.17.0 and move the code from your existing project into it. Let's go through the changes, but first, note that if you create a new Kedro project with Kedro 0.17.0 you will not be asked whether you want to include the boilerplate code for the Iris dataset example. We've removed this option (you should now use a Kedro starter if you want to create a project that is pre-populated with code). @@ -599,7 +719,7 @@ Even though this release ships a fix for project generated with `kedro==0.16.2`, * Added `joblib` backend support to `pickle.PickleDataSet`. * Added versioning support to `MatplotlibWriter` dataset. * Added the ability to install dependencies for a given dataset with more granularity, e.g. `pip install "kedro[pandas.ParquetDataSet]"`. -* Added the ability to specify extra arguments, e.g. `encoding` or `compression`, for `fsspec.spec.AbstractFileSystem.open()` calls when loading/saving a dataset. See Example 3 under [docs](https://kedro.readthedocs.io/en/stable/04_user_guide/04_data_catalog.html#using-the-data-catalog-with-the-yaml-api). +* Added the ability to specify extra arguments, e.g. `encoding` or `compression`, for `fsspec.spec.AbstractFileSystem.open()` calls when loading/saving a dataset. See Example 3 under [docs](https://kedro.readthedocs.io/en/0.16.0/04_user_guide/04_data_catalog.html#using-the-data-catalog-with-the-yaml-api). ### Other * Added `namespace` property on ``Node``, related to the modular pipeline where the node belongs. @@ -608,14 +728,14 @@ Even though this release ships a fix for project generated with `kedro==0.16.2`, * Removed the requirement to have all dependencies for a dataset module to use only a subset of the datasets within. * Added support for `pandas>=1.0`. * Enabled Python 3.8 compatibility. _Please note that a Spark workflow may be unreliable for this Python version as `pyspark` is not fully-compatible with 3.8 yet._ -* Renamed "features" layer to "feature" layer to be consistent with (most) other layers and the [relevant FAQ](https://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention). 
+* Renamed "features" layer to "feature" layer to be consistent with (most) other layers and the [relevant FAQ](https://kedro.readthedocs.io/en/0.16.0/06_resources/01_faq.html#what-is-data-engineering-convention). ## Bug fixes and other changes * Fixed a bug where a new version created mid-run by an external system caused inconsistencies in the load versions used in the current run. * Documentation improvements * Added instruction in the documentation on how to create a custom runner). * Updated contribution process in `CONTRIBUTING.md` - added Developer Workflow. - * Documented installation of development version of Kedro in the [FAQ section](https://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#how-can-i-use-a-development-version-of-kedro). + * Documented installation of development version of Kedro in the [FAQ section](https://kedro.readthedocs.io/en/0.16.0/06_resources/01_faq.html#how-can-i-use-development-version-of-kedro). * Added missing `_exists` method to `MyOwnDataSet` example in 04_user_guide/08_advanced_io. * Fixed a bug where `PartitionedDataSet` and `IncrementalDataSet` were not working with `s3a` or `s3n` protocol. * Added ability to read partitioned parquet file from a directory in `pandas.ParquetDataSet`. @@ -650,7 +770,7 @@ Even though this release ships a fix for project generated with `kedro==0.16.2`, #### General Migration -**reminder** [How do I upgrade Kedro](https://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#how-do-i-upgrade-kedro) covers a few key things to remember when updating any kedro version. +**reminder** [How do I upgrade Kedro](https://kedro.readthedocs.io/en/0.16.0/06_resources/01_faq.html#how-do-i-upgrade-kedro) covers a few key things to remember when updating any kedro version. #### Migration for datasets @@ -755,7 +875,7 @@ You can find the list of moved files in the [`0.15.6` release notes](https://git # 0.15.6 ## Major features and improvements -> _TL;DR_ We're launching [`kedro.extras`](https://github.com/kedro-org/kedro/tree/master/extras), the new home for our revamped series of datasets, decorators and dataset transformers. The datasets in [`kedro.extras.datasets`](https://github.com/kedro-org/kedro/tree/master/extras/datasets) use [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) to access a variety of data stores including local file systems, network file systems, cloud object stores (including S3 and GCP), and Hadoop, read more about this [**here**](https://kedro.readthedocs.io/en/latest/04_user_guide/04_data_catalog.html#specifying-the-location-of-the-dataset). The change will allow [#178](https://github.com/kedro-org/kedro/issues/178) to happen in the next major release of Kedro. +> _TL;DR_ We're launching [`kedro.extras`](https://github.com/kedro-org/kedro/tree/master/extras), the new home for our revamped series of datasets, decorators and dataset transformers. The datasets in [`kedro.extras.datasets`](https://github.com/kedro-org/kedro/tree/master/extras/datasets) use [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) to access a variety of data stores including local file systems, network file systems, cloud object stores (including S3 and GCP), and Hadoop, read more about this [**here**](https://kedro.readthedocs.io/en/0.15.6/04_user_guide/04_data_catalog.html#specifying-the-location-of-the-dataset). The change will allow [#178](https://github.com/kedro-org/kedro/issues/178) to happen in the next major release of Kedro. 
An example of this new system can be seen below, loading the CSV `SparkDataSet` from S3: @@ -767,13 +887,13 @@ weather: file_format: csv ``` -You can also load data incrementally whenever it is dumped into a directory with the extension to [`PartionedDataSet`](https://kedro.readthedocs.io/en/latest/04_user_guide/08_advanced_io.html#partitioned-dataset), a feature that allows you to load a directory of files. The [`IncrementalDataSet`](https://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#incremental-loads-with-incrementaldataset) stores the information about the last processed partition in a `checkpoint`, read more about this feature [**here**](https://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#incremental-loads-with-incrementaldataset). +You can also load data incrementally whenever it is dumped into a directory with the extension to [`PartionedDataSet`](https://kedro.readthedocs.io/en/0.15.6/04_user_guide/08_advanced_io.html#partitioned-dataset), a feature that allows you to load a directory of files. The [`IncrementalDataSet`](https://kedro.readthedocs.io/en/0.15.6/04_user_guide/08_advanced_io.html#incremental-loads-with-incrementaldataset) stores the information about the last processed partition in a `checkpoint`, read more about this feature [**here**](https://kedro.readthedocs.io/en/0.15.6/04_user_guide/08_advanced_io.html#incremental-loads-with-incrementaldataset). ### New features -* Added `layer` attribute for datasets in `kedro.extras.datasets` to specify the name of a layer according to [data engineering convention](https://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention), this feature will be passed to [`kedro-viz`](https://github.com/kedro-org/kedro-viz) in future releases. +* Added `layer` attribute for datasets in `kedro.extras.datasets` to specify the name of a layer according to [data engineering convention](https://kedro.readthedocs.io/en/0.15.6/06_resources/01_faq.html#what-is-data-engineering-convention), this feature will be passed to [`kedro-viz`](https://github.com/kedro-org/kedro-viz) in future releases. * Enabled loading a particular version of a dataset in Jupyter Notebooks and iPython, using `catalog.load("dataset_name", version="<2019-12-13T15.08.09.255Z>")`. -* Added property `run_id` on `ProjectContext`, used for versioning using the [`Journal`](https://kedro.readthedocs.io/en/stable/04_user_guide/13_journal.html). To customise your journal `run_id` you can override the private method `_get_run_id()`. +* Added property `run_id` on `ProjectContext`, used for versioning using the [`Journal`](https://kedro.readthedocs.io/en/0.15.6/04_user_guide/13_journal.html). To customise your journal `run_id` you can override the private method `_get_run_id()`. * Added the ability to install all optional kedro dependencies via `pip install "kedro[all]"`. 
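To make the `PartitionedDataSet` behaviour described above concrete, a minimal sketch (the folder layout and dataset names are invented):

```python
from kedro.io import PartitionedDataSet

# One pandas.CSVDataSet per file found under the path; partitions are loaded lazily.
daily_sales = PartitionedDataSet(
    path="data/01_raw/daily_sales",   # hypothetical folder of CSV files
    dataset="pandas.CSVDataSet",
    filename_suffix=".csv",
)

for partition_id, load_partition in daily_sales.load().items():
    df = load_partition()             # the callable reads the underlying file on demand
    print(partition_id, len(df))
```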
* Modified the `DataCatalog`'s load order for datasets, loading order is the following: - `kedro.io` diff --git a/docs/conf.py b/docs/conf.py index 07f56df273..dad349e030 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -117,13 +117,10 @@ "kedro.io.core.DataSetError", "kedro.io.core.Version", "kedro.io.data_catalog.DataCatalog", - "kedro.io.transformers.AbstractTransformer", - "kedro.io.data_catalog_with_default.DataCatalogWithDefault", - "kedro.io.partitioned_data_set.PartitionedDataSet", + "kedro.io.partitioned_dataset.PartitionedDataSet", "kedro.pipeline.pipeline.Pipeline", "kedro.runner.runner.AbstractRunner", "kedro.runner.parallel_runner._SharedMemoryDataSet", - "kedro.versioning.journal.Journal", "kedro.framework.context.context.KedroContext", "kedro.framework.startup.ProjectMetadata", "abc.ABC", @@ -132,7 +129,7 @@ "requests.auth.AuthBase", "google.oauth2.credentials.Credentials", "Exception", - "CONF_ROOT", + "CONF_SOURCE", "integer -- return number of occurrences of value", "integer -- return first index of value.", "kedro.extras.datasets.pandas.json_dataset.JSONDataSet", @@ -199,12 +196,21 @@ "https://www.astronomer.io/docs/cloud/stable/get-started/quickstart#", "https://eternallybored.org/misc/wget/", "https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas", + "https://github.com/kedro-org/kedro-starters/tree/main/standalone-datacatalog", # temporary until 0.18 "https://www.oracle.com/java/technologies/javase-downloads.html", # "forbidden" url "https://towardsdatascience.com/the-importance-of-layered-thinking-in-data-engineering-a09f685edc71", "https://medium.com/quantumblack/beyond-the-notebook-and-into-the-data-science-framework-revolution-a7fd364ab9c4", "https://www.java.com/en/download/help/download_options.html", # "403 Client Error: Forbidden for url" # "anchor not found" but it's a valid selector for code examples "https://docs.delta.io/latest/delta-update.html#language-python", + + # Number section removal, temporarily ignore until 0.18.0 release + "https://kedro.readthedocs.io/en/stable/data/data_catalog.html#using-the-data-catalog-with-the-yaml-api", + "https://kedro.readthedocs.io/en/stable/faq/faq.html#what-is-data-engineering-convention", + "https://kedro.readthedocs.io/en/stable/faq/faq.html", + "https://kedro.readthedocs.io/en/stable/data/kedro_io.html#partitioned-dataset-credentials", + "https://kedro.readthedocs.io/en/stable/data/kedro_io.html", + "https://kedro.readthedocs.io/en/stable/data/kedro_io.html#checkpoint-configuration", ] # retry before render a link broken (fix for "too many requests") @@ -315,8 +321,6 @@ "kedro.config", "kedro.extras.datasets", "kedro.extras.logging", - "kedro.extras.decorators", - "kedro.extras.transformers", ] @@ -486,8 +490,8 @@ def _prepare_build_dir(app, config): build_root = Path(app.srcdir) build_out = Path(app.outdir) copy_tree(str(here / "source"), str(build_root)) - copy_tree(str(build_root / "15_api_docs"), str(build_root)) - shutil.rmtree(str(build_root / "15_api_docs")) + copy_tree(str(build_root / "api_docs"), str(build_root)) + shutil.rmtree(str(build_root / "api_docs")) shutil.rmtree(str(build_out), ignore_errors=True) copy_tree(str(build_root / "css"), str(build_out / "_static" / "css")) shutil.rmtree(str(build_root / "css")) diff --git a/docs/source/04_kedro_project_setup/04_mini_kedro.md b/docs/source/04_kedro_project_setup/04_mini_kedro.md deleted file mode 100644 index 2eb1e8a8f9..0000000000 --- a/docs/source/04_kedro_project_setup/04_mini_kedro.md +++ /dev/null @@ -1,59 
+0,0 @@ -# The `mini-kedro` Kedro starter - -## Introduction - -Mini-Kedro makes it possible to use the [`DataCatalog`](https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html) functionality in Kedro. -Use Kedro to configure and explore data sources in a Jupyter notebook using the [DataCatalog](https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html) feature. - -The DataCatalog allows you to specify data sources that you interact with for loading and saving purposes using a YAML API. See an example: - - ```yaml -# conf/base/catalog.yml -example_dataset_1: - type: pandas.CSVDataSet - filepath: folder/filepath.csv - -example_dataset_2: - type: spark.SparkDataSet - filepath: s3a://your_bucket/data/01_raw/example_dataset_2* - credentials: dev_s3 - file_format: csv - save_args: - if_exists: replace -``` - -Later on, you can build a full pipeline using the same configuration, with the following steps: - -1. Create a new empty Kedro project in a new directory - -```bash -kedro new -``` - -Let's assume that the new project is created at `/path/to/your/project`. - -2. Copy the `conf/` and `data/` directories over to the new project - -``` -cp -fR {conf,data} `/path/to/your/project` -``` - -This makes it possible to use something like `df = catalog.load("example_dataset_1")` and `df_2 = catalog.save("example_dataset_2")` to interact with data in a Jupyter notebook. -The advantage of this approach is that you never need to specify file paths for loading or saving data in your Jupyter notebook. - -## Usage - -Create a new project using this starter: - -```bash -$ kedro new --starter=mini-kedro -``` - -## Content - -The starter contains: - -* A `conf/` directory, which contains an example `DataCatalog` configuration (`catalog.yml`) -* A `data/` directory, which contains an example dataset identical to the one used by the [`pandas-iris`](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris) starter -* An example notebook showing how to instantiate the `DataCatalog` and interact with the example dataset -* A `README.md`, which explains how to use the project created by the starter and how to transition to a full Kedro project afterwards diff --git a/docs/source/07_extend_kedro/01_common_use_cases.md b/docs/source/07_extend_kedro/01_common_use_cases.md deleted file mode 100644 index 176ee51206..0000000000 --- a/docs/source/07_extend_kedro/01_common_use_cases.md +++ /dev/null @@ -1,29 +0,0 @@ -# Common use cases - -Kedro has a few built-in mechanisms for you to extend its behaviour. This document explains how to select which mechanism to employ for the most common use cases. - -## Use Case 1: How to add extra behaviour to Kedro's execution timeline - -The execution timeline of a Kedro pipeline can be thought of as a sequence of actions performed by various Kedro library components, such as the [DataSets](/kedro.extras.datasets), [DataCatalog](/kedro.io.DataCatalog), [Pipeline](/kedro.pipeline.Pipeline), and [Node](/kedro.pipeline.node.Node). - -At different points in the lifecycle of these components, you may want to add extra behaviour. For example, you could add extra computation for profiling purposes _before_ and _after_ a node runs or _before_ and _after_ the I/O actions of a dataset, namely the `load` and `save` actions. - -Before Kedro 0.17.0, we added a few different APIs to allow you to extend Kedro's behaviour. For example, to allow extra behaviour _before_ and _after_ a node runs, we introduced the [decorators](07_decorators.md) API. 
Similarly, to allow extra behaviour _before_ and _after_ dataset I/O, we introduced the [transformers](06_transformers.md) API. - -While we addressed some immediate use cases, we have since decided to provide just one, single way to extend Kedro's execution timeline: Hooks. So, from Kedro version 0.17.0, we now deprecate decorators and transformers in favour of [Hooks](./02_hooks.md), which will be the recommended approach when you need to extend Kedro's execution timeline. - -## Use Case 2: How to integrate Kedro with additional data sources - -You can use [DataSets](/kedro.extras.datasets) to interface with various different data sources. If the data source you plan to use is not supported out of the box by Kedro, you can [create a custom dataset](03_custom_datasets.md). - -## Use Case 3: How to add CLI commands that are reusable across projects - -To inject additional CLI commands intended to be reused across projects, please refer to our [plugins](./04_plugins.md) system. An example of one such command is the `kedro viz` command introduced by the official [Kedro-Viz](https://github.com/kedro-org/kedro-viz) plugin. This command is intended to work on every Kedro project and therefore must be a standalone plugin. On the other hand, if you just want to customise built-in Kedro commands, such as `kedro run` for a specific project, please modify the `cli.py` file in your project instead. - -```eval_rst -.. note:: Your plugin's implementation can take advantage of other extension mechanisms such as Hooks. -``` - -## Use Case 4: How to customise the initial boilerplate of your project - -Sometimes you might want to tailor the starting boilerplate of a Kedro project to your specific needs. For example, your organisation might have a standard CI script that you want to include in every new Kedro project. To this end, please visit our guide to [create Kedro starters](./05_create_kedro_starters.md) to solve this extension requirement. diff --git a/docs/source/07_extend_kedro/06_transformers.md b/docs/source/07_extend_kedro/06_transformers.md deleted file mode 100644 index 785b790370..0000000000 --- a/docs/source/07_extend_kedro/06_transformers.md +++ /dev/null @@ -1,131 +0,0 @@ -# Dataset transformers (deprecated) - -```eval_rst -.. warning:: The transformer API will be deprecated in 0.18.0. We recommend using the ``before_dataset_loaded``/``after_dataset_loaded`` and ``before_dataset_saved``/``after_dataset_saved`` Hooks to customise the dataset ``load`` and ``save`` methods where appropriate. -``` - -As we describe in the [documentation about how Kedro works with data](../05_data/01_data_catalog.md#transforming-datasets), Kedro transformers intercept the load and save operations on Kedro `DataSet`s. - -Use cases for Kedro transformers include: - - - Data validation - - Operation performance tracking - - Data format conversion (although we would recommend [Transcoding](../05_data/01_data_catalog.md#transcoding-datasets) for this) - -### Develop your own dataset transformer - -To illustrate the use case for operation performance tracking, this section demonstrates how to build a transformer to track memory consumption. In fact, Kedro provides a built-in memory profiler, but this example shows how to build your own, using [memory-profiler](https://github.com/pythonprofilers/memory_profiler). - - -```eval_rst -.. note:: To work with this example, you need to ``pip install memory_profiler`` before you start. 
-``` - -A custom transformer should: - -* Inherit from the `kedro.io.AbstractTransformer` base class -* Implement the `load` and `save` method - -Within the project in which you want to use the transformer, create a file in `src//` called `memory_profile.py` and paste the following code into it: - -
-Click to expand - -```python -import logging -from typing import Callable, Any - -from kedro.io import AbstractTransformer -from memory_profiler import memory_usage - - -def _normalise_mem_usage(mem_usage): - # memory_profiler < 0.56.0 returns list instead of float - return mem_usage[0] if isinstance(mem_usage, (list, tuple)) else mem_usage - - -class ProfileMemoryTransformer(AbstractTransformer): - """A transformer that logs the maximum memory consumption during load and save calls""" - - @property - def _logger(self): - return logging.getLogger(self.__class__.__name__) - - def load(self, data_set_name: str, load: Callable[[], Any]) -> Any: - mem_usage, data = memory_usage( - (load, [], {}), - interval=0.1, - max_usage=True, - retval=True, - include_children=True, - ) - # memory_profiler < 0.56.0 returns list instead of float - mem_usage = _normalise_mem_usage(mem_usage) - - self._logger.info( - "Loading %s consumed %2.2fMiB memory at peak time", data_set_name, mem_usage - ) - return data - - def save(self, data_set_name: str, save: Callable[[Any], None], data: Any) -> None: - mem_usage = memory_usage( - (save, [data], {}), - interval=0.1, - max_usage=True, - retval=False, - include_children=True, - ) - mem_usage = _normalise_mem_usage(mem_usage) - - self._logger.info( - "Saving %s consumed %2.2fMiB memory at peak time", data_set_name, mem_usage - ) -``` -
- -Next, you need to update `TransformerHooks` to apply your custom transformer. Add the following to a `hooks.py` file in your project. - -
-Click to expand - -```python -... -from .memory_profile import ProfileMemoryTransformer # new import - - -class TransformerHooks: - @hook_impl - def after_catalog_created(self, catalog: DataCatalog) -> None: - catalog.add_transformer(ProfileTimeTransformer()) - - # as memory tracking is quite time-consuming, for demonstration purposes - # let's apply profile_memory only to the model_input_table - catalog.add_transformer(ProfileMemoryTransformer(), "model_input_table") -``` -
- -Finally, update `HOOKS` variable in `settings.py` as follows: - -```python -HOOKS = (TransformerHooks(),) -``` - -Then re-run the pipeline: - -```console -$ kedro run -``` - -The output should look similar to the following: - -``` -... -2019-11-13 15:55:01,674 - kedro.io.data_catalog - INFO - Saving data to `model_input_table` (CSVDataSet)... -2019-11-13 15:55:12,322 - ProfileMemoryTransformer - INFO - Saving model_input_table consumed 606.98MiB memory at peak time -2019-11-13 15:55:12,322 - ProfileTimeTransformer - INFO - Saving model_input_table took 10.648 seconds -2019-11-13 15:55:12,357 - kedro.runner.sequential_runner - INFO - Completed 3 out of 6 tasks -2019-11-13 15:55:12,358 - kedro.io.data_catalog - INFO - Loading data from `model_input_table` (CSVDataSet)... -2019-11-13 15:55:13,933 - ProfileMemoryTransformer - INFO - Loading model_input_table consumed 533.05MiB memory at peak time -2019-11-13 15:55:13,933 - ProfileTimeTransformer - INFO - Loading model_input_table took 1.576 seconds -... -``` diff --git a/docs/source/07_extend_kedro/07_decorators.md b/docs/source/07_extend_kedro/07_decorators.md deleted file mode 100644 index 64d9ff37ea..0000000000 --- a/docs/source/07_extend_kedro/07_decorators.md +++ /dev/null @@ -1,126 +0,0 @@ -# Decorators (deprecated) - -```eval_rst -.. warning:: The decorator API will be deprecated in 0.18.0. We recommend using Hooks to extend a node's behaviour. -``` - -A decorator is a computation that runs before and after execution. You can apply [Python decorators](https://wiki.python.org/moin/PythonDecorators) to Kedro nodes or an entire Kedro pipeline. - -## How to apply a decorator to nodes - -This example illustrates decorators that modify the first string argument of a given function: - -```python -from functools import wraps -from typing import Callable - - -def apply_f(func: Callable) -> Callable: - @wraps(func) - def with_f(*args, **kwargs): - return func(*["f({})".format(a) for a in args], **kwargs) - - return with_f - - -def apply_g(func: Callable) -> Callable: - @wraps(func) - def with_g(*args, **kwargs): - return func(*["g({})".format(a) for a in args], **kwargs) - - return with_g - - -def apply_h(func: Callable) -> Callable: - @wraps(func) - def with_h(*args, **kwargs): - return func(*["h({})".format(a) for a in args], **kwargs) - - return with_h -``` - -To make sure that `apply_f` is applied to every function call, including within Kedro nodes: - -```python -@apply_f -def say_hello(name): - print("Hello {}!".format(name)) - - -hello_node = node(say_hello, "name", None) -hello_node.run(dict(name="Kedro")) -``` - -`Output`: - -```console -In [3]: hello_node.run(dict(name="Kedro")) -Hello f(Kedro)! -Out[3]: {} -``` - -If you want to apply an additional decorator to the same function, but just for another node: - -```python -hello_node_wrapped = node(apply_g(say_hello), "name", None) - -hello_node.run(dict(name="Kedro")) -hello_node_wrapped.run(dict(name="Kedro")) -``` - -`Output`: - -```console -Hello f(Kedro)! -Hello f(g(Kedro))! -Out[4]: {} -``` - -## How to apply multiple decorators to nodes - -You can also provide a list of decorators as shown here: - -```python -hello_wrapped = node(apply_g(apply_h(say_hello)), "name", None) -hello_decorated = hello_node.decorate(apply_g, apply_h) - -hello_wrapped.run(dict(name="Kedro")) -hello_decorated.run(dict(name="Kedro")) -``` - -`Output`: - -```console -Hello f(h(g(Kedro)))! -Hello f(h(g(Kedro)))! 
-``` - -## How to apply a decorator to a pipeline - -Decorators can also be useful for monitoring your pipeline. You can apply one or more decorators to an entire pipeline, much as you do for a node. - -For example, if you want to apply the decorator above to all pipeline nodes simultaneously: - -```python -hello_pipeline = Pipeline( - [node(say_hello, "name1", None), node(say_hello, "name2", None)] -).decorate(apply_g, apply_h) - -SequentialRunner().run( - hello_pipeline, DataCatalog({}, dict(name1="Kedro", name2="Python")) -) -``` - -`Output`: - -```console -Hello f(h(g(Kedro)))! -Hello f(h(g(Python)))! -Out[9]: {} -``` - -## Kedro decorators - -Kedro currently has one built-in decorator: `log_time`, which logs the time taken to execute a node. You can find it in [`kedro.pipeline.decorators`](/kedro.pipeline.decorators.log_time). - -Other decorators can be found in [`kedro.extras.decorators`](/kedro.extras.decorators), for which you will need to install the required dependencies. diff --git a/docs/source/09_development/04_lint.md b/docs/source/09_development/04_lint.md deleted file mode 100644 index 26e9ff14f0..0000000000 --- a/docs/source/09_development/04_lint.md +++ /dev/null @@ -1,35 +0,0 @@ -# Linting your Kedro project - -To follow these instructions, you will need to install the `pylint` package, subject to GPL licence. - -You can lint your project code to ensure code quality using the `kedro lint` command, your project is linted with [`black`](https://github.com/psf/black) (projects created with Python 3.6 and above), [`flake8`](https://gitlab.com/pycqa/flake8) and [`isort`](https://github.com/PyCQA/isort). If you prefer to use [pylint](https://www.pylint.org/), a popular linting tool, then the sample commands you can use to help with this are included in the script below: - -```bash -isort -pylint -j 0 src/ -pylint -j 0 --disable=missing-docstring,redefined-outer-name src/tests -``` - -Alternatively, you can opt to use it as a plugin to your Kedro project. To do this, add the following code snippet to `cli.py` in your project package directory: - -```python -@cli.command() -def lint(): - """Check the Python code quality.""" - python_call("isort", ["src/", "src/tests"]) - python_call("pylint", ["-j", "0", "src/"]) - python_call( - "pylint", - ["-j", "0", "--disable=missing-docstring,redefined-outer-name", "src/tests"], - ) -``` - -To trigger the behaviour, simply run the following command in your terminal window: -```bash -kedro lint -``` - -Make sure you also include the dependency in your `requirements.txt`, i.e: -```text -pylint>=2.3.1,<3.0 -``` diff --git a/docs/source/15_api_docs/kedro.versioning.journal.JournalFileHandler.rst b/docs/source/15_api_docs/kedro.versioning.journal.JournalFileHandler.rst deleted file mode 100644 index 386ce5313a..0000000000 --- a/docs/source/15_api_docs/kedro.versioning.journal.JournalFileHandler.rst +++ /dev/null @@ -1,6 +0,0 @@ -kedro.versioning.journal.JournalFileHandler -=========================================== - -.. currentmodule:: kedro.versioning.journal - -.. 
autoclass:: JournalFileHandler diff --git a/docs/source/15_api_docs/kedro.config.rst b/docs/source/api_docs/kedro.config.rst similarity index 100% rename from docs/source/15_api_docs/kedro.config.rst rename to docs/source/api_docs/kedro.config.rst diff --git a/docs/source/15_api_docs/kedro.extras.datasets.rst b/docs/source/api_docs/kedro.extras.datasets.rst similarity index 90% rename from docs/source/15_api_docs/kedro.extras.datasets.rst rename to docs/source/api_docs/kedro.extras.datasets.rst index c49217f409..13d1319f74 100644 --- a/docs/source/15_api_docs/kedro.extras.datasets.rst +++ b/docs/source/api_docs/kedro.extras.datasets.rst @@ -19,10 +19,11 @@ kedro.extras.datasets kedro.extras.datasets.holoviews.HoloviewsWriter kedro.extras.datasets.json.JSONDataSet kedro.extras.datasets.matplotlib.MatplotlibWriter - kedro.extras.datasets.networkx.NetworkXDataSet + kedro.extras.datasets.networkx.GMLDataSet + kedro.extras.datasets.networkx.GraphMLDataSet + kedro.extras.datasets.networkx.JSONDataSet kedro.extras.datasets.pandas.CSVDataSet kedro.extras.datasets.pandas.ExcelDataSet - kedro.extras.datasets.pandas.AppendableExcelDataSet kedro.extras.datasets.pandas.FeatherDataSet kedro.extras.datasets.pandas.GBQQueryDataSet kedro.extras.datasets.pandas.GBQTableDataSet @@ -32,6 +33,7 @@ kedro.extras.datasets kedro.extras.datasets.pandas.ParquetDataSet kedro.extras.datasets.pandas.SQLQueryDataSet kedro.extras.datasets.pandas.SQLTableDataSet + kedro.extras.datasets.pandas.XMLDataSet kedro.extras.datasets.pickle.PickleDataSet kedro.extras.datasets.pillow.ImageDataSet kedro.extras.datasets.plotly.JSONDataSet diff --git a/docs/source/15_api_docs/kedro.extras.logging.color_logger.ColorHandler.rst b/docs/source/api_docs/kedro.extras.logging.color_logger.ColorHandler.rst similarity index 100% rename from docs/source/15_api_docs/kedro.extras.logging.color_logger.ColorHandler.rst rename to docs/source/api_docs/kedro.extras.logging.color_logger.ColorHandler.rst diff --git a/docs/source/15_api_docs/kedro.framework.cli.cli.KedroCLI.rst b/docs/source/api_docs/kedro.framework.cli.cli.KedroCLI.rst similarity index 100% rename from docs/source/15_api_docs/kedro.framework.cli.cli.KedroCLI.rst rename to docs/source/api_docs/kedro.framework.cli.cli.KedroCLI.rst diff --git a/docs/source/15_api_docs/kedro.framework.cli.hooks.manager.CLIHooksManager.rst b/docs/source/api_docs/kedro.framework.cli.hooks.manager.CLIHooksManager.rst similarity index 100% rename from docs/source/15_api_docs/kedro.framework.cli.hooks.manager.CLIHooksManager.rst rename to docs/source/api_docs/kedro.framework.cli.hooks.manager.CLIHooksManager.rst diff --git a/docs/source/15_api_docs/kedro.framework.cli.jupyter.SingleKernelSpecManager.rst b/docs/source/api_docs/kedro.framework.cli.jupyter.SingleKernelSpecManager.rst similarity index 100% rename from docs/source/15_api_docs/kedro.framework.cli.jupyter.SingleKernelSpecManager.rst rename to docs/source/api_docs/kedro.framework.cli.jupyter.SingleKernelSpecManager.rst diff --git a/docs/source/15_api_docs/kedro.framework.cli.utils.CommandCollection.rst b/docs/source/api_docs/kedro.framework.cli.utils.CommandCollection.rst similarity index 100% rename from docs/source/15_api_docs/kedro.framework.cli.utils.CommandCollection.rst rename to docs/source/api_docs/kedro.framework.cli.utils.CommandCollection.rst diff --git a/docs/source/15_api_docs/kedro.framework.cli.utils.rst b/docs/source/api_docs/kedro.framework.cli.utils.rst similarity index 100% rename from 
docs/source/15_api_docs/kedro.framework.cli.utils.rst rename to docs/source/api_docs/kedro.framework.cli.utils.rst diff --git a/docs/source/15_api_docs/kedro.framework.context.rst b/docs/source/api_docs/kedro.framework.context.rst similarity index 100% rename from docs/source/15_api_docs/kedro.framework.context.rst rename to docs/source/api_docs/kedro.framework.context.rst diff --git a/docs/source/15_api_docs/kedro.framework.session.store.BaseSessionStore.rst b/docs/source/api_docs/kedro.framework.session.store.BaseSessionStore.rst similarity index 100% rename from docs/source/15_api_docs/kedro.framework.session.store.BaseSessionStore.rst rename to docs/source/api_docs/kedro.framework.session.store.BaseSessionStore.rst diff --git a/docs/source/15_api_docs/kedro.framework.session.store.ShelveStore.rst b/docs/source/api_docs/kedro.framework.session.store.ShelveStore.rst similarity index 100% rename from docs/source/15_api_docs/kedro.framework.session.store.ShelveStore.rst rename to docs/source/api_docs/kedro.framework.session.store.ShelveStore.rst diff --git a/docs/source/15_api_docs/kedro.io.rst b/docs/source/api_docs/kedro.io.rst similarity index 89% rename from docs/source/15_api_docs/kedro.io.rst rename to docs/source/api_docs/kedro.io.rst index 64a4552bb7..971ba98660 100644 --- a/docs/source/15_api_docs/kedro.io.rst +++ b/docs/source/api_docs/kedro.io.rst @@ -13,14 +13,12 @@ kedro.io kedro.io.AbstractDataSet kedro.io.AbstractVersionedDataSet - kedro.io.AbstractTransformer kedro.io.DataCatalog kedro.io.LambdaDataSet kedro.io.MemoryDataSet kedro.io.PartitionedDataSet kedro.io.IncrementalDataSet kedro.io.CachedDataSet - kedro.io.DataCatalogWithDefault kedro.io.Version .. rubric:: Exceptions diff --git a/docs/source/15_api_docs/kedro.pipeline.rst b/docs/source/api_docs/kedro.pipeline.rst similarity index 87% rename from docs/source/15_api_docs/kedro.pipeline.rst rename to docs/source/api_docs/kedro.pipeline.rst index b8c67323b4..c99f493c9e 100644 --- a/docs/source/15_api_docs/kedro.pipeline.rst +++ b/docs/source/api_docs/kedro.pipeline.rst @@ -25,12 +25,6 @@ kedro.pipeline .. rubric:: Modules -.. autosummary:: - :toctree: - :recursive: - - kedro.pipeline.decorators - .. rubric:: Exceptions .. autosummary:: diff --git a/docs/source/15_api_docs/kedro.runner.rst b/docs/source/api_docs/kedro.runner.rst similarity index 100% rename from docs/source/15_api_docs/kedro.runner.rst rename to docs/source/api_docs/kedro.runner.rst diff --git a/docs/source/14_contribution/03_backwards_compatibility.md b/docs/source/contribution/backwards_compatibility.md similarity index 100% rename from docs/source/14_contribution/03_backwards_compatibility.md rename to docs/source/contribution/backwards_compatibility.md diff --git a/docs/source/14_contribution/01_contribute_to_kedro.md b/docs/source/contribution/contribute_to_kedro.md similarity index 78% rename from docs/source/14_contribution/01_contribute_to_kedro.md rename to docs/source/contribution/contribute_to_kedro.md index 1c27557234..bc360e7377 100644 --- a/docs/source/14_contribution/01_contribute_to_kedro.md +++ b/docs/source/contribution/contribute_to_kedro.md @@ -7,5 +7,5 @@ We welcome any and all contributions to Kedro, at whatever level you can manage. - Make a pull request on the [kedro-community Github repo](https://github.com/kedro-org/kedro-community) to update the curated list of Kedro community content. 
- Report a bug or propose a new feature on [GitHub issues](https://github.com/kedro-org/kedro/issues) - [Review other contributors' PRs](https://github.com/kedro-org/kedro/pulls) -- [Contribute code](./02_developer_contributor_guidelines.md), for example to fix a bug or add a feature -- [Contribute to the documentation](04_documentation_contributor_guidelines.md) +- [Contribute code](./developer_contributor_guidelines.md), for example to fix a bug or add a feature +- [Contribute to the documentation](documentation_contributor_guidelines.md) diff --git a/docs/source/14_contribution/02_developer_contributor_guidelines.md b/docs/source/contribution/developer_contributor_guidelines.md similarity index 91% rename from docs/source/14_contribution/02_developer_contributor_guidelines.md rename to docs/source/contribution/developer_contributor_guidelines.md index ad6f35fe07..6038498e37 100644 --- a/docs/source/14_contribution/02_developer_contributor_guidelines.md +++ b/docs/source/contribution/developer_contributor_guidelines.md @@ -26,7 +26,7 @@ To work on the Kedro codebase, you will need to be set up with Git, and Make. .. note:: If your development environment is Windows, you can use the ``win_setup_conda`` and ``win_setup_env`` commands from `Circle CI configuration `_ to guide you in the correct way to do this. ``` -You will also need to create and activate virtual environment. If this is unfamiliar to you, read through our [pre-requisites documentation](../02_get_started/01_prerequisites.md). +You will also need to create and activate virtual environment. If this is unfamiliar to you, read through our [pre-requisites documentation](../get_started/prerequisites.md). Next, you'll need to fork the [Kedro source code from the Github repository](https://github.com/kedro-org/kedro): @@ -61,8 +61,8 @@ Once you are ready to contribute, a good place to start is to take a look at the We focus on three areas for contribution: `core`, `extras` and `plugin`: - `core` refers to the primary Kedro library. Read the [`core` contribution process](#core-contribution-process) for details. -- `extras` refers to features that could be added to `core` that do not introduce too many dependencies or require new Kedro CLI commands to be created e.g. [adding a new dataset](../07_extend_kedro/03_custom_datasets.md) to the `kedro.extras.dataset` data management module. All the datasets are placed under `kedro.extras.datasets` to separate heavy dependencies (e.g Pandas) from Kedro `core` components. Read the [`extras` contribution process](#extras-contribution-process) for more information. -- [`plugin`](../07_extend_kedro/04_plugins.md) refers to new functionality that requires a Kedro CLI command e.g. adding in Airflow functionality. The [`plugin` development documentation](../07_extend_kedro/04_plugins.md) contains guidance on how to design and develop a Kedro `plugin`. +- `extras` refers to features that could be added to `core` that do not introduce too many dependencies or require new Kedro CLI commands to be created e.g. [adding a new dataset](../extend_kedro/custom_datasets.md) to the `kedro.extras.dataset` data management module. All the datasets are placed under `kedro.extras.datasets` to separate heavy dependencies (e.g Pandas) from Kedro `core` components. Read the [`extras` contribution process](#extras-contribution-process) for more information. +- [`plugin`](../extend_kedro/plugins.md) refers to new functionality that requires a Kedro CLI command e.g. adding in Airflow functionality. 
The [`plugin` development documentation](../extend_kedro/plugins.md) contains guidance on how to design and develop a Kedro `plugin`. ### `core` contribution process @@ -72,7 +72,7 @@ Typically, we only accept small contributions to the `core` Kedro library but we To contribute: 1. Create a feature branch on your forked repository and push all your local changes to that feature branch. -2. Is your change [non-breaking and backwards-compatible](./03_backwards_compatibility.md)? Your feature branch should branch off from: +2. Is your change [non-breaking and backwards-compatible](./backwards_compatibility.md)? Your feature branch should branch off from:
  1. `main` if you intend for it to be a non-breaking, backwards-compatible change.
  2. `develop` if you intend for it to be a breaking change.
  3. @@ -100,7 +100,7 @@ You can add new work to `extras` if you do not need to create a new Kedro CLI co 2. Work in [`extras`](https://github.com/kedro-org/kedro/tree/main/kedro/extras) and create a feature branch on your forked repository and push all your local changes to that feature branch. 3. Before you submit a pull request, please ensure that unit, E2E tests and linting are passing for your changes by running `make test`,`make e2e-tests` and `make lint` locally, have a look at the section [Running checks locally](#ci--cd-and-running-checks-locally) below. 4. Include a `README.md` with instructions on how to use your contribution. -5. Is your change [non-breaking and backwards-compatible](./03_backwards_compatibility.md)? +5. Is your change [non-breaking and backwards-compatible](./backwards_compatibility.md)?
    1. For backwards-compatible changes, open a PR against the `kedro-org:main` branch from your feature branch.
    2. For changes that are NOT backwards compatible, open a PR against the `kedro-org:develop` branch from your feature branch.
    3. diff --git a/docs/source/14_contribution/04_documentation_contributor_guidelines.md b/docs/source/contribution/documentation_contributor_guidelines.md similarity index 90% rename from docs/source/14_contribution/04_documentation_contributor_guidelines.md rename to docs/source/contribution/documentation_contributor_guidelines.md index 26547db62f..124f05a11b 100644 --- a/docs/source/14_contribution/04_documentation_contributor_guidelines.md +++ b/docs/source/contribution/documentation_contributor_guidelines.md @@ -18,7 +18,7 @@ The following instructions are specifically for people working with documentatio ### Set up to build Kedro documentation -Follow the setup instructions in the [developer contributor guide](./02_developer_contributor_guidelines.md#before-you-start-development-set-up) +Follow the setup instructions in the [developer contributor guide](./developer_contributor_guidelines.md#before-you-start-development-set-up) to fork the Kedro repo, create and activate a Python virtual environment and install the dependencies necessary to build the documentation. @@ -49,7 +49,7 @@ To move or remove a page of documentation, first locate it in the repo, and also You need to submit any changes to the documentation via a branch. -[Find out more about the process of submitting a PR to the Kedro project](./02_developer_contributor_guidelines.md). +[Find out more about the process of submitting a PR to the Kedro project](./developer_contributor_guidelines.md). ### Help! @@ -59,7 +59,7 @@ Ask for help over on [GitHub discussions](https://github.com/kedro-org/kedro/dis ## Kedro documentation style guide -This is the style guide we have used to create [documentation about Kedro](https://kedro.readthedocs.io/en/stable/). +This is the style guide we have used to create [documentation about Kedro](../index). When you are writing documentation for your own project, you may find it useful to follow these rules. We also ask anyone kind enough to contribute to the Kedro documentation to follow our preferred style to maintain consistency and simplicity. @@ -83,19 +83,19 @@ Where it's not obvious what the style should be, it's worth consulting the [Micr * Make hyperlink descriptions as descriptive as you can. This is a good description: ```text -Learn how to [update the project pipeline](https://kedro.readthedocs.io/en/stable/03_tutorial/04_create_pipelines.html#update-the-project-pipeline) +Learn how to [update the project pipeline](../tutorial/create_pipelines.html#update-the-project-pipeline) ``` This is less helpful: ```text -Learn how to update the [project pipeline](https://kedro.readthedocs.io/en/stable/03_tutorial/04_create_pipelines.html#update-the-project-pipeline) +Learn how to update the [project pipeline](../tutorial/create_pipelines.html#update-the-project-pipeline) ``` Don't write this: ```text -To learn how to update the project pipeline, see [here](https://kedro.readthedocs.io/en/stable/03_tutorial/04_create_pipelines.html#update-the-project-pipeline) +To learn how to update the project pipeline, see [here](../tutorial/create_pipelines.html#update-the-project-pipeline) ``` ### Capitalisation @@ -134,12 +134,12 @@ We use callout sections formatted in `.rst` to bring attention to key points. Fo ### Kedro lexicon * Name of our product: Kedro and Kedro-Viz (note capitalisation). -* Use journal and pipeline as these aren't proper nouns. Tend to lower case except if there is a precedent (see next bullet). +* Use pipeline as this isn't a proper noun. 
Tend to lower case except if there is a precedent (see next bullet). * Use Hooks (not hooks, except where it's a necessary part of your code example). We are taking our lead from React here, so capitalising despite it not seeming consistent with other rules. * Use dataset (not data set, or data-set) for a generic dataset. * Use capitalised DataSet when talking about a specific Kedro dataset class e.g. CSVDataSet. * Use data catalog for a generic data catalog. - * Use Data Catalog to talk about the [Kedro Data Catalog](../05_data/01_data_catalog.md). + * Use Data Catalog to talk about the [Kedro Data Catalog](../data/data_catalog.md). ### Style * Keep your sentences short and easy to read. diff --git a/docs/source/05_data/01_data_catalog.md b/docs/source/data/data_catalog.md similarity index 82% rename from docs/source/05_data/01_data_catalog.md rename to docs/source/data/data_catalog.md index 5c95c3777b..6a6a3ff353 100644 --- a/docs/source/05_data/01_data_catalog.md +++ b/docs/source/data/data_catalog.md @@ -10,7 +10,7 @@ Kedro uses configuration to make your code reproducible when it has to reference You can copy this file and reference additional locations for the same datasets. For instance, you can use the `catalog.yml` file in `conf/base/` to register the locations of datasets that would run in production while copying and updating a second version of `catalog.yml` in `conf/local/` to register the locations of sample datasets that you are using for prototyping your data pipeline(s). -There is built-in functionality for `conf/local/` to overwrite `conf/base/` detailed [here](../04_kedro_project_setup/02_configuration.md). This means that a dataset called `cars` could exist in the `catalog.yml` files in `conf/base/` and `conf/local/`. In code, in `src`, you would only call a dataset named `cars` and Kedro would detect which definition of `cars` dataset to use to run your pipeline - `cars` definition from `conf/local/catalog.yml` would take precedence in this case. +There is built-in functionality for `conf/local/` to overwrite `conf/base/` detailed [here](../kedro_project_setup/configuration.md). This means that a dataset called `cars` could exist in the `catalog.yml` files in `conf/base/` and `conf/local/`. In code, in `src`, you would only call a dataset named `cars` and Kedro would detect which definition of `cars` dataset to use to run your pipeline - `cars` definition from `conf/local/catalog.yml` would take precedence in this case. The Data Catalog also works with the `credentials.yml` in `conf/local/`, allowing you to specify usernames and passwords that are required to load certain datasets. @@ -21,7 +21,7 @@ The are two ways of defining a Data Catalog through the use of YAML configuratio - Location of the dataset using `fsspec`, detailed in the next section - Credentials needed in order to access the dataset - Load and saving arguments - - Whether or not you want a [dataset or ML model to be versioned](02_kedro_io.md#versioning) when you run your data pipeline + - Whether or not you want a [dataset or ML model to be versioned](kedro_io.md#versioning) when you run your data pipeline ## Specifying the location of the dataset @@ -316,7 +316,7 @@ dev_abs: ## Creating a Data Catalog YAML configuration file via CLI -You can use [`kedro catalog create` command](../09_development/03_commands_reference.md#create-a-data-catalog-yaml-configuration-file) to create a Data Catalog YAML configuration. 
+You can use [`kedro catalog create` command](../development/commands_reference.md#create-a-data-catalog-yaml-configuration-file) to create a Data Catalog YAML configuration. It creates a `//catalog/.yml` configuration file with `MemoryDataSet` datasets for each dataset in a registered pipeline if it is missing from the `DataCatalog`. @@ -330,12 +330,12 @@ scooters: ## Adding parameters -You can [configure parameters](../04_kedro_project_setup/02_configuration.md#load-parameters) for your project and [reference them](../04_kedro_project_setup/02_configuration.md#use-parameters) in your nodes. Do this using the `add_feed_dict()` method ([API documentation](/kedro.io.DataCatalog)). You can use this method to add any other entry / metadata you wish on the `DataCatalog`. +You can [configure parameters](../kedro_project_setup/configuration.md#load-parameters) for your project and [reference them](../kedro_project_setup/configuration.md#use-parameters) in your nodes. Do this using the `add_feed_dict()` method ([API documentation](/kedro.io.DataCatalog)). You can use this method to add any other entry / metadata you wish on the `DataCatalog`. ## Feeding in credentials -Before instantiating the `DataCatalog` Kedro will first attempt to read the credentials from [the project configuration](../04_kedro_project_setup/02_configuration.md#aws-credentials). The resulting dictionary is then passed into `DataCatalog.from_config()` as the `credentials` argument. +Before instantiating the `DataCatalog` Kedro will first attempt to read the credentials from [the project configuration](../kedro_project_setup/configuration.md#aws-credentials). The resulting dictionary is then passed into `DataCatalog.from_config()` as the `credentials` argument. Let's assume that the project contains the file `conf/local/credentials.yml` with the following contents: @@ -461,71 +461,6 @@ In the pipeline, Kedro uses the `spark.SparkDataSet` implementation for saving a for loading, so the first node should output a `pyspark.sql.DataFrame`, while the second node would receive a `pandas.Dataframe`. -## Transforming datasets - -Transformers are used to intercept the load and save operations on Kedro `DataSet`s. Use cases for transformers include: - - - Data validation - - Tracking operation performance - - Data format conversion (although we would recommend [Transcoding](#transcoding-datasets) for this) - -### Applying built-in transformers - -Here we cover the use case of _tracking operation performance_ by applying built-in transformers to monitor the latency of load and save operations. - -Transformers are applied at the `DataCatalog` level. To apply the built-in `ProfileTimeTransformer`, you need to: - -1. Navigate to `src//hooks.py` -2. Apply `ProfileTimeTransformer` in the hook implementation `TransformerHooks.after_catalog_created` -3. Register the hook in your `src//settings.py` - -```python -# src//hooks.py - -from kedro.extras.transformers import ProfileTimeTransformer # new import -from kedro.framework.hooks import hook_impl # new import -from kedro.io import DataCatalog # new import - - -class TransformerHooks: - @hook_impl - def after_catalog_created(self, catalog: DataCatalog) -> None: - catalog.add_transformer(ProfileTimeTransformer()) -``` - -```python -# src//settings.py -from .hooks import TransformerHooks - -HOOKS = (TransformerHooks(),) -``` - -Once complete, rerun the pipeline from the terminal and you should see the following logging output: - -```console -$ kedro run - -... 
-2019-11-13 15:09:01,784 - kedro.io.data_catalog - INFO - Loading data from `companies` (CSVDataSet)... -2019-11-13 15:09:01,827 - ProfileTimeTransformer - INFO - Loading companies took 0.043 seconds -2019-11-13 15:09:01,828 - kedro.pipeline.node - INFO - Running node: preprocessing_companies: preprocess_companies([companies]) -> [preprocessed_companies] -2019-11-13 15:09:01,880 - kedro_tutorial.nodes.data_engineering - INFO - Running 'preprocess_companies' took 0.05 seconds -2019-11-13 15:09:01,880 - kedro_tutorial.nodes.data_engineering - INFO - Running 'preprocess_companies' took 0.05 seconds -2019-11-13 15:09:01,880 - kedro.io.data_catalog - INFO - Saving data to `preprocessed_companies` (CSVDataSet)... -2019-11-13 15:09:02,112 - ProfileTimeTransformer - INFO - Saving preprocessed_companies took 0.232 seconds -2019-11-13 15:09:02,113 - kedro.runner.sequential_runner - INFO - Completed 1 out of 6 tasks -... -``` - -The `ProfileTimeTransformer - INFO` log messages report the latency of dataset load and save operations. - -### Transformer scope -You can refine the scope of the transformer by specifying an optional list of the datasets it is applied to in `add_transformer`. - -For example, the command `catalog.add_transformer(profile_time, ["dataset1", "dataset2"])` applies the `profile_time` transformer _only_ to the datasets named `dataset1` and `dataset2`. - -This is useful when you need to apply a transformer to just a subset of datasets. - ## Versioning datasets and ML models Making a simple addition to your Data Catalog allows you to perform versioning of datasets and machine learning models. @@ -548,7 +483,7 @@ kedro run --load-version="cars.csv:YYYY-MM-DDThh.mm.ss.sssZ" ``` where `--load-version` is dataset name and version timestamp separated by `:`. -This section shows just the very basics of versioning, which is described further in the documentation about [Kedro IO](../05_data/02_kedro_io.md#versioning). +This section shows just the very basics of versioning, which is described further in the documentation about [Kedro IO](../data/kedro_io.md#versioning). ## Using the Data Catalog with the Code API diff --git a/docs/source/05_data/02_kedro_io.md b/docs/source/data/kedro_io.md similarity index 98% rename from docs/source/05_data/02_kedro_io.md rename to docs/source/data/kedro_io.md index 217d741756..8e577f63fb 100644 --- a/docs/source/05_data/02_kedro_io.md +++ b/docs/source/data/kedro_io.md @@ -31,9 +31,9 @@ If you have a dataset called `parts`, you can make direct calls to it like so: parts_df = parts.load() ``` -However, we recommend using a `DataCatalog` instead (for more details, see [this section](../05_data/01_data_catalog.md) in the User Guide) as it has been designed to make all datasets available to project members. +However, we recommend using a `DataCatalog` instead (for more details, see [this section](../data/data_catalog.md) in the User Guide) as it has been designed to make all datasets available to project members. -For contributors, if you would like to submit a new dataset, you will have to extend `AbstractDataSet`. For a complete guide, please read [Creating a new dataset](../07_extend_kedro/03_custom_datasets.md). +For contributors, if you would like to submit a new dataset, you will have to extend `AbstractDataSet`. For a complete guide, please read [Creating a new dataset](../extend_kedro/custom_datasets.md). ## Versioning @@ -112,7 +112,7 @@ cars: The `DataCatalog` will create a versioned `CSVDataSet` called `cars`. 
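For orientation, the same versioned dataset can also be declared through the Code API — a minimal sketch, reusing the filepath from the surrounding example (`Version(None, None)` loads the latest version and generates a fresh save version):

```python
from kedro.extras.datasets.pandas import CSVDataSet
from kedro.io import Version

# Code API equivalent of the versioned `cars` entry (illustrative filepath)
cars = CSVDataSet(
    filepath="data/01_raw/company/car_data.csv",
    version=Version(load=None, save=None),  # load latest, save under a new timestamp
)
```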
The actual csv file location will look like `data/01_raw/company/car_data.csv//car_data.csv`, where `` corresponds to a global save version string formatted as `YYYY-MM-DDThh.mm.ss.sssZ`. Every time the `DataCatalog` is instantiated, it generates a new global save version, which is propagated to all versioned datasets it contains. -`catalog.yml` only allows you to version your datasets but it does not allow you to choose which version to load or save. This is deliberate because we have chosen to separate the data catalog from any runtime configuration. If you need to pin a dataset version, you can either [specify the versions in a separate `yml` file and call it at runtime](../04_kedro_project_setup/02_configuration.md#configure-kedro-run-arguments) or [instantiate your versioned datasets using Code API and define a version parameter explicitly](#versioning-using-the-code-api). +`catalog.yml` only allows you to version your datasets but it does not allow you to choose which version to load or save. This is deliberate because we have chosen to separate the data catalog from any runtime configuration. If you need to pin a dataset version, you can either [specify the versions in a separate `yml` file and call it at runtime](../kedro_project_setup/configuration.md#configure-kedro-run-arguments) or [instantiate your versioned datasets using Code API and define a version parameter explicitly](#versioning-using-the-code-api). By default, the `DataCatalog` will load the latest version of the dataset. However, it is also possible to specify an exact load version. In order to do that, you can pass a dictionary with exact load versions to `DataCatalog.from_config`: @@ -498,7 +498,7 @@ def create_partitions() -> Dict[str, Callable[[], Any]]: } ``` -> *Note:* When using lazy saving the dataset will be written _after_ the `after_node_run` [hook](../07_extend_kedro/02_hooks). +> *Note:* When using lazy saving the dataset will be written _after_ the `after_node_run` [hook](../extend_kedro/hooks). ### Incremental loads with `IncrementalDataSet` diff --git a/docs/source/10_deployment/11_airflow_astronomer.md b/docs/source/deployment/airflow_astronomer.md similarity index 94% rename from docs/source/10_deployment/11_airflow_astronomer.md rename to docs/source/deployment/airflow_astronomer.md index 5b60abf25f..f56c58a30b 100644 --- a/docs/source/10_deployment/11_airflow_astronomer.md +++ b/docs/source/deployment/airflow_astronomer.md @@ -2,11 +2,11 @@ This tutorial explains how to deploy a Kedro project on [Apache Airflow](https://airflow.apache.org/) with [Astronomer](https://www.astronomer.io/). Apache Airflow is an extremely popular open-source workflow management platform. Workflows in Airflow are modelled and organised as [DAGs](https://en.wikipedia.org/wiki/Directed_acyclic_graph), making it a suitable engine to orchestrate and execute a pipeline authored with Kedro. [Astronomer](https://www.astronomer.io/docs/cloud/stable/develop/cli-quickstart) is a managed Airflow platform which allows users to spin up and run an Airflow cluster easily in production. Additionally, it also provides a set of tools to help users get started with Airflow locally in the easiest way possible. -The following discusses how to run the [example Iris classification pipeline](../02_get_started/05_example_project) on a local Airflow cluster with Astronomer. +The following discusses how to run the [example Iris classification pipeline](../get_started/example_project) on a local Airflow cluster with Astronomer. 
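Before moving on to the Airflow strategy below, here is a rough sketch of the `DataCatalog.from_config` call described in the Kedro IO hunks above, since its accompanying example is elided by the hunk boundary (the timestamp is illustrative):

```python
from kedro.io import DataCatalog

config = {
    "cars": {
        "type": "pandas.CSVDataSet",
        "filepath": "data/01_raw/company/car_data.csv",
        "versioned": True,
    }
}

# Pin `cars` to an exact load version instead of the latest one
catalog = DataCatalog.from_config(
    config, load_versions={"cars": "2021-11-01T12.00.00.000Z"}
)
```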
## Strategy -The general strategy to deploy a Kedro pipeline on Apache Airflow is to run every Kedro node as an [Airflow task](https://airflow.apache.org/docs/apache-airflow/stable/concepts/tasks.html) while the whole pipeline is converted into a [DAG](https://airflow.apache.org/docs/apache-airflow/stable/concepts/dags.html) for orchestration purpose. This approach mirrors the principles of running Kedro in a [distributed](03_distributed) environment. +The general strategy to deploy a Kedro pipeline on Apache Airflow is to run every Kedro node as an [Airflow task](https://airflow.apache.org/docs/apache-airflow/stable/concepts/tasks.html) while the whole pipeline is converted into a [DAG](https://airflow.apache.org/docs/apache-airflow/stable/concepts/dags.html) for orchestration purpose. This approach mirrors the principles of running Kedro in a [distributed](distributed) environment. ## Prerequisites @@ -67,7 +67,7 @@ To follow along with this tutorial, make sure you have the following: pip install kedro-airflow~=0.4 ``` -5. Run `kedro install` to install all dependencies. +5. Run `pip install -r src/requirements.txt` to install all dependencies. ## Deployment process @@ -110,7 +110,7 @@ This ensures that all datasets are persisted so all Airflow tasks can read them kedro package ``` -This step should produce a wheel file called `new_kedro_project-0.1-py3-none-any.whl` located at `src/dist`. +This step should produce a wheel file called `new_kedro_project-0.1-py3-none-any.whl` located at `dist/`. * **Step 2.2**: Add the `src/` directory to `.dockerignore`, as it's not necessary to bundle the entire code base with the container once we have the packaged wheel file. @@ -123,7 +123,7 @@ echo "src/" >> .dockerignore ```Dockerfile FROM quay.io/astronomer/ap-airflow:2.0.0-buster-onbuild -RUN pip install --user src/dist/new_kedro_project-0.1-py3-none-any.whl +RUN pip install --user dist/new_kedro_project-0.1-py3-none-any.whl ``` ### Step 3. Convert the Kedro pipeline into an Airflow DAG with `kedro airflow` diff --git a/docs/source/10_deployment/04_argo.md b/docs/source/deployment/argo.md similarity index 96% rename from docs/source/10_deployment/04_argo.md rename to docs/source/deployment/argo.md index 7d62f79d29..235749d8e7 100644 --- a/docs/source/10_deployment/04_argo.md +++ b/docs/source/deployment/argo.md @@ -19,7 +19,7 @@ To use Argo Workflows, make sure you have the following prerequisites in place: - Argo Workflows is [installed](https://github.com/argoproj/argo/blob/master/README.md#quickstart) on your Kubernetes cluster - Argo CLI is [installed](https://github.com/argoproj/argo/releases) on you machine - A `name` attribute is set for each Kedro [node](/kedro.pipeline.node) since it is used to build a DAG -- All node input/output DataSets must be configured in `catalog.yml` and refer to an external location (e.g. [AWS S3](../05_data/01_data_catalog.md#using-the-data-catalog-with-the-yaml-api)); you cannot use the `MemoryDataSet` in your workflow +- All node input/output DataSets must be configured in `catalog.yml` and refer to an external location (e.g. [AWS S3](../data/data_catalog.md#using-the-data-catalog-with-the-yaml-api)); you cannot use the `MemoryDataSet` in your workflow ```eval_rst .. note:: Each node will run in its own container. @@ -33,7 +33,7 @@ First, you need to containerise your Kedro project, using any preferred containe For the purpose of this walk-through, we are going to assume a `Docker` workflow. 
We recommend the [`Kedro-Docker`](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker) plugin to streamline the process. [Instructions for Kedro-Docker are in the plugin's README.md](https://github.com/kedro-org/kedro-plugins/blob/main/README.md). -After you’ve built the Docker image for your project locally, [transfer the image to a container registry](./02_single_machine.md#how-to-use-container-registry). +After you’ve built the Docker image for your project locally, [transfer the image to a container registry](./single_machine.md#how-to-use-container-registry). ### Create Argo Workflows spec diff --git a/docs/source/10_deployment/07_aws_batch.md b/docs/source/deployment/aws_batch.md similarity index 91% rename from docs/source/10_deployment/07_aws_batch.md rename to docs/source/deployment/aws_batch.md index 0045d43401..2edff5e5cc 100644 --- a/docs/source/10_deployment/07_aws_batch.md +++ b/docs/source/deployment/aws_batch.md @@ -5,7 +5,7 @@ AWS Batch helps you run massively parallel Kedro pipelines in a cost-effective way, and allows you to parallelise the pipeline execution across a number of compute instances. Each Batch job is run in an isolated Docker container environment. -The following sections are a guide on how to deploy a Kedro project to AWS Batch, and uses the [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) as primary example. The guide assumes that you have already completed the tutorial, and that the project was created with the project name **Kedro Tutorial**. +The following sections are a guide on how to deploy a Kedro project to AWS Batch, and uses the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) as primary example. The guide assumes that you have already completed the tutorial, and that the project was created with the project name **Kedro Tutorial**. ## Prerequisites @@ -13,7 +13,7 @@ To use AWS Batch, make sure you have the following prerequisites in place: - An [AWS account set up](https://aws.amazon.com/premiumsupport/knowledge-center/create-and-activate-aws-account/). - A `name` attribute is set for each Kedro [node](/kedro.pipeline.node). Each node will run in its own Batch job, so having sensible node names will make it easier to `kedro run --node `. -- All node input/output `DataSets` must be configured in `catalog.yml` and refer to an external location (e.g. [AWS S3](../05_data/01_data_catalog.md#using-the-data-catalog-with-the-yaml-api)). A clean way to do this is to create a new configuration environment `conf/aws_batch` containing a `catalog.yml` file with the appropriate configuration, as illustrated below. +- All node input/output `DataSets` must be configured in `catalog.yml` and refer to an external location (e.g. [AWS S3](../data/data_catalog.md#using-the-data-catalog-with-the-yaml-api)). A clean way to do this is to create a new configuration environment `conf/aws_batch` containing a `catalog.yml` file with the appropriate configuration, as illustrated below.
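The collapsed `catalog.yml` example this bullet refers to is cut off by the hunk that follows. As a rough sketch of the kind of S3-backed entry it describes, expressed through the Code API — bucket and dataset names are made up:

```python
from kedro.extras.datasets.pandas import CSVDataSet
from kedro.io import DataCatalog

# Hypothetical S3-backed dataset; the YAML equivalent would live in conf/aws_batch/catalog.yml
catalog = DataCatalog(
    {
        "companies": CSVDataSet(
            filepath="s3://my-kedro-bucket/data/01_raw/companies.csv"  # requires s3fs
        ),
    }
)
```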
      Click to expand @@ -75,7 +75,7 @@ First, you need to containerise your Kedro project, using any preferred containe For the purpose of this walk-through, we are going to assume a `Docker` workflow. We recommend using the [`Kedro-Docker`](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker) plugin to streamline the process. [Instructions for using this are in the plugin's README.md](https://github.com/kedro-org/kedro-plugins/blob/main/README.md). -After you’ve built the Docker image for your project locally, [transfer the image to a container registry](./02_single_machine.md#how-to-use-container-registry), for instance [AWS ECR](https://aws.amazon.com/ecr/). You can find instructions on how to push your Docker image to ECR [in Amazon's ECR documentation](https://docs.aws.amazon.com/AmazonECR/latest/userguide/docker-push-ecr-image.html). +After you’ve built the Docker image for your project locally, [transfer the image to a container registry](./single_machine.md#how-to-use-container-registry), for instance [AWS ECR](https://aws.amazon.com/ecr/). You can find instructions on how to push your Docker image to ECR [in Amazon's ECR documentation](https://docs.aws.amazon.com/AmazonECR/latest/userguide/docker-push-ecr-image.html). Alternatively, once you've created a container repository, click the `View Push Commands` button in the top-right corner of the [ECR dashboard](https://console.aws.amazon.com/ecr). @@ -297,19 +297,12 @@ aws_batch: #### Update CLI implementation -You're nearly there! Before being able to use the new runner, update the `run()` function in your `cli.py` file to make sure the runner class is instantiated correctly: +You're nearly there! Before you can use the new runner, you need to add a `cli.py` file at the same level as `settings.py`, using [the template we provide](../extend_kedro/common_use_cases.md#use-case-3-how-to-add-or-modify-cli-commands). Add the following `run()` function to your `cli.py` file to make sure the runner class is instantiated correctly: ```python -def run(tag, env, parallel, ...): +def run(tag, env, ...): """Run the pipeline.""" - if parallel and runner: - raise KedroCliError( - "Both --parallel and --runner options cannot be used together. " - "Please use either --parallel or --runner." - ) runner = runner or "SequentialRunner" - if parallel: - runner = "ParallelRunner" tag = _get_values_as_tuple(tag) if tag else tag node_names = _get_values_as_tuple(node_names) if node_names else node_names @@ -323,6 +316,7 @@ def run(tag, env, parallel, ...): from_nodes=from_nodes, to_nodes=to_nodes, from_inputs=from_inputs, + to_outputs=to_outputs, load_versions=load_version, pipeline_name=pipeline, ) diff --git a/docs/source/10_deployment/09_aws_sagemaker.md b/docs/source/deployment/aws_sagemaker.md similarity index 86% rename from docs/source/10_deployment/09_aws_sagemaker.md rename to docs/source/deployment/aws_sagemaker.md index 2460070a13..4a22759b66 100644 --- a/docs/source/10_deployment/09_aws_sagemaker.md +++ b/docs/source/deployment/aws_sagemaker.md @@ -2,7 +2,7 @@ This tutorial explains how to integrate a Kedro project with [Amazon SageMaker](https://aws.amazon.com/sagemaker/) in order to train a machine learning model. It shows how to build machine learning pipelines in Kedro and while taking advantage of the power of SageMaker for potentially compute-intensive machine learning tasks. 
-The Kedro project will still run locally (or on one of many supported workflow engines like [Argo](./04_argo.md), [Prefect](./05_prefect.md), [Kubeflow](./06_kubeflow.md), [AWS Batch](./07_aws_batch.md) and others), but the model training step will be offloaded onto SageMaker. +The Kedro project will still run locally (or on one of many supported workflow engines like [Argo](./argo.md), [Prefect](./prefect.md), [Kubeflow](./kubeflow.md), [AWS Batch](./aws_batch.md) and others), but the model training step will be offloaded onto SageMaker. ## Why would you use Amazon SageMaker? @@ -16,7 +16,7 @@ To use Amazon SageMaker, make sure you have the following prerequisites in place - An [AWS account set up](https://aws.amazon.com/premiumsupport/knowledge-center/create-and-activate-aws-account/) - [Configured AWS credentials](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html) on your local machine - Generated Kedro project called **Kedro Tutorial** using [Kedro Spaceflights starter](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights/) -- Completed the [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) +- Completed the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) ## Prepare the environment @@ -24,18 +24,18 @@ To use Amazon SageMaker, make sure you have the following prerequisites in place First, you should add extra package dependencies that are required to communicate with SageMaker via its [Python SDK](https://sagemaker.readthedocs.io/en/stable/). -If you have run `kedro install` at least once for your project, you should already have the `src/requirements.in` file, so you need to modify that. Otherwise, if you have never run `kedro install` for your project, you should modify `src/requirements.txt`. Open the corresponding file with a text editor and add the following lines at the end of the file: +Add the dependencies by modifying `src/requirements.txt`. Open the corresponding file with a text editor and add the following lines at the end of the file: ```text sagemaker>=2.13.0 s3fs>=0.3.0, <0.4.1 # will be needed to work with AWS S3 ``` -Since you have added two extra dependencies, you should compile and install the updated project dependencies by running the following from your terminal: +Since you have added two extra dependencies, you should install the updated project dependencies by running the following from your terminal: ```bash cd -kedro install --build-reqs +pip install -r src/requirements.txt ``` ```eval_rst @@ -65,7 +65,7 @@ It's generally a good practice to create AWS resources (like S3 bucket above) fo ### Create the configuration environment -Configuration in Kedro is logically separated into [configuration environments](../04_kedro_project_setup/02_configuration.md#additional-configuration-environments) which are loaded in specific order where the project is run. To separate SageMaker-specific configuration from the default one, let's create a new configuration environment. Go ahead and create a `conf/sagemaker` folder and then create the following files in it. +Configuration in Kedro is logically separated into [configuration environments](../kedro_project_setup/configuration.md#additional-configuration-environments) which are loaded in specific order where the project is run. To separate SageMaker-specific configuration from the default one, let's create a new configuration environment. Go ahead and create a `conf/sagemaker` folder and then create the following files in it. ```eval_rst .. 
note:: ``${key}`` in the YAML snippets below is a special syntax which allows you to template the project configuration. You don't need to replace those values, just paste them as-is. @@ -87,7 +87,7 @@ y_train: filepath: ${s3.train_path}/y_train.pickle ``` -> *Node:* `@pickle` and `@path` in the dataset names above correspond to the [dataset transcoding](https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html#transcoding-datasets) feature of Kedro. This allows to pass S3 path to the `X_train` dataset instead of the actual data itself to the `train_model_sagemaker` node that you will create shortly. +> *Node:* `@pickle` and `@path` in the dataset names above correspond to the [dataset transcoding](../data/data_catalog.md#transcoding-datasets) feature of Kedro. This allows to pass S3 path to the `X_train` dataset instead of the actual data itself to the `train_model_sagemaker` node that you will create shortly. * `parameters.yml` - contains the configuration for [SageMaker Scikit Learn Estimator](https://sagemaker.readthedocs.io/en/stable/frameworks/sklearn/sagemaker.sklearn.html#scikit-learn-estimator): @@ -111,23 +111,13 @@ s3: ### Update the project hooks -Now you need to tell Kedro to use the [`TemplatedConfigLoader`](https://kedro.readthedocs.io/en/stable/kedro.config.TemplatedConfigLoader.html) instead of the default `ConfigLoader` class to read the project configuration. It is very easy to do via [Kedro hooks](https://kedro.readthedocs.io/en/stable/07_extend_kedro/02_hooks.html) - open `src/kedro_tutorial/hooks.py` file, locate the definition of `ProjectHooks` and add the following method to it: +Now you need to tell Kedro to use the [`TemplatedConfigLoader`](/kedro.config.TemplatedConfigLoader) instead of the default `ConfigLoader` class to read the project configuration. It is very easy to do via `settings.py` file - open `src/kedro_tutorial/settings.py` file and set the `CONFIG_LOADER_CLASS` constant: ```python -from typing import Iterable - from kedro.config import TemplatedConfigLoader -from kedro.framework.hooks import hook_impl - -class ProjectHooks: - # - @hook_impl - def register_config_loader( - self, conf_paths: Iterable[str] - ) -> TemplatedConfigLoader: - return TemplatedConfigLoader(conf_paths, globals_pattern="*globals.yml") +CONFIG_LOADER_CLASS = TemplatedConfigLoader ``` ### Update the data science pipeline diff --git a/docs/source/10_deployment/10_aws_step_functions.md b/docs/source/deployment/aws_step_functions.md similarity index 95% rename from docs/source/10_deployment/10_aws_step_functions.md rename to docs/source/deployment/aws_step_functions.md index b743aefb21..b8df5a727c 100644 --- a/docs/source/10_deployment/10_aws_step_functions.md +++ b/docs/source/deployment/aws_step_functions.md @@ -8,11 +8,11 @@ A major problem when data pipelines move to production is to build and maintain In addition to on-demand compute, services like [AWS Step Functions](https://aws.amazon.com/step-functions/) offer a managed orchestration capability that makes it easy to sequence serverless functions and multiple cloud-native services into business-critical applications. From a Kedro perspective, this means the ability to run each node and retain the pipeline's correctness and reliability through a managed orchestrator without the concerns of managing underlying infrastructure. 
-The following discusses how to run the Kedro pipeline from the [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) on [AWS Step Functions](https://aws.amazon.com/step-functions/). +The following discusses how to run the Kedro pipeline from the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) on [AWS Step Functions](https://aws.amazon.com/step-functions/). ## Strategy -The general strategy to deploy a Kedro pipeline on AWS Step Functions is to run every Kedro node as an [AWS Lambda](https://aws.amazon.com/lambda/) function. The whole pipeline is converted into an [AWS Step Functions State Machine](https://docs.aws.amazon.com/step-functions/latest/dg/tutorial-creating-lambda-state-machine.html) for orchestration purpose. This approach mirrors the principles of running Kedro in a [distributed](03_distributed) environment. +The general strategy to deploy a Kedro pipeline on AWS Step Functions is to run every Kedro node as an [AWS Lambda](https://aws.amazon.com/lambda/) function. The whole pipeline is converted into an [AWS Step Functions State Machine](https://docs.aws.amazon.com/step-functions/latest/dg/tutorial-creating-lambda-state-machine.html) for orchestration purpose. This approach mirrors the principles of running Kedro in a [distributed](distributed) environment. ## Prerequisites @@ -22,7 +22,7 @@ To use AWS Step Functions, make sure you have the following: - [Configured AWS credentials](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html) on your local machine - Generated Kedro project called **Spaceflights Step Functions** using [Kedro Spaceflights starter](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights/). - The final project directory's name should be `spaceflights-step-functions`. - - You should complete the [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) to understand the project's structure. + - You should complete the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) to understand the project's structure. * In this tutorial, we will also be using [AWS Cloud Development Kit (CDK)](https://aws.amazon.com/cdk/) to write our deployment script. To install the `cdk` command, please consult [AWS guide](https://docs.aws.amazon.com/cdk/latest/guide/cli.html). The official method of installation is using [npm](https://www.npmjs.com/): @@ -39,7 +39,7 @@ The deployment process for a Kedro pipeline on AWS Step Functions consists of th * Develop the Kedro pipeline locally as normal * Create a new configuration environment in which we ensure all nodes' inputs and outputs have a persistent location on S3, since `MemoryDataSet` can't be shared between AWS Lambda functions * Package the Kedro pipeline as an [AWS Lambda-compliant Docker image](https://docs.aws.amazon.com/lambda/latest/dg/lambda-images.html) -* Write a script to convert and deploy each Kedro node as an AWS Lambda function. Each function will use the same pipeline Docker image created in the previous step and run a single Kedro node associated with it. This follows the principles laid out in our [distributed deployment guide](03_distributed). +* Write a script to convert and deploy each Kedro node as an AWS Lambda function. Each function will use the same pipeline Docker image created in the previous step and run a single Kedro node associated with it. This follows the principles laid out in our [distributed deployment guide](distributed). 
* The script above will also convert and deploy the entire Kedro pipeline as an AWS Step Functions State Machine. The final deployed AWS Step Functions State Machine will have the following visualisation in AWS Management Console: @@ -120,7 +120,7 @@ In December 2020, AWS [announced](https://aws.amazon.com/blogs/aws/new-for-aws-l $ kedro package ``` -For more information, please visit the guide on [packaging Kedro as a Python package](../03_tutorial/05_package_a_project). +For more information, please visit the guide on [packaging Kedro as a Python package](../tutorial/package_a_project). * **Step 2.2**: Create a `lambda_handler.py` file: @@ -179,7 +179,7 @@ COPY lambda_handler.py ${FUNCTION_DIR} # Add conf/ directory COPY conf ${FUNCTION_DIR}/conf # Install Kedro pipeline -COPY src/dist/spaceflights_steps_function-0.1-py3-none-any.whl . +COPY dist/spaceflights_steps_function-0.1-py3-none-any.whl . RUN python${RUNTIME_VERSION} -m pip install --no-cache-dir spaceflights_steps_function-0.1-py3-none-any.whl --target ${FUNCTION_DIR} # Install Lambda Runtime Interface Client for Python RUN python${RUNTIME_VERSION} -m pip install --no-cache-dir awslambdaric --target ${FUNCTION_DIR} @@ -383,8 +383,8 @@ If you go into the state machine and click on `Start Execution`, you will be abl ## Limitations -Generally speaking, the [limitations](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) on AWS Lambda have improved dramatically in recent years. However, it's still worth noting that each Lambda function has a 15-minute timeout, 10GB maximum memory limit and 10GB container image code package size limit. This means, for example, if you have a node that takes longer than 15 minutes to run, you should switch to some other AWS services, such as [AWS Batch](07_aws_batch) or [AWS ECS](https://aws.amazon.com/ecs/), to execute that node. +Generally speaking, the [limitations](https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html) on AWS Lambda have improved dramatically in recent years. However, it's still worth noting that each Lambda function has a 15-minute timeout, 10GB maximum memory limit and 10GB container image code package size limit. This means, for example, if you have a node that takes longer than 15 minutes to run, you should switch to some other AWS services, such as [AWS Batch](aws_batch) or [AWS ECS](https://aws.amazon.com/ecs/), to execute that node. ## Final thought -One major benefit of running a Kedro pipeline in a serverless computing platform is the ability to take advantage of other services from the same provider. For example, AWS has recently announced a [Feature Store for SageMaker](https://aws.amazon.com/sagemaker/feature-store/). We could easily use it as the Features layer in Kedro's [Data Engineering convention](../12_faq/01_faq.md#what-is-data-engineering-convention). +One major benefit of running a Kedro pipeline in a serverless computing platform is the ability to take advantage of other services from the same provider. For example, AWS has recently announced a [Feature Store for SageMaker](https://aws.amazon.com/sagemaker/feature-store/). We could easily use it as the Features layer in Kedro's [Data Engineering convention](../faq/faq.md#what-is-data-engineering-convention). 
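The body of the `lambda_handler.py` created in Step 2.2 above is elided by the surrounding hunks. A rough sketch of such a handler, assuming the Kedro 0.17-era session API and a `node_name` field in the Lambda event payload (both the field name and the configuration environment are placeholders):

```python
from kedro.framework.project import configure_project
from kedro.framework.session import KedroSession


def handler(event, context):
    # Each Lambda invocation runs exactly one Kedro node, named in the event payload
    node_name = event["node_name"]  # assumed event field
    configure_project("spaceflights_steps_function")
    # "aws" stands in for whichever configuration environment holds the S3-backed catalog
    with KedroSession.create("spaceflights_steps_function", env="aws") as session:
        session.run(node_names=[node_name])
    return {"statusCode": 200, "node_name": node_name}
```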
diff --git a/docs/source/10_deployment/08_databricks.md b/docs/source/deployment/databricks.md similarity index 98% rename from docs/source/10_deployment/08_databricks.md rename to docs/source/deployment/databricks.md index c68e943975..6643dd78ee 100644 --- a/docs/source/10_deployment/08_databricks.md +++ b/docs/source/deployment/databricks.md @@ -40,13 +40,13 @@ kedro new --starter pyspark-iris ### 2. Install dependencies and run locally -Now, as the project has been successfully created, we should move into the project root directory, install project dependencies, and then start a local test run using [Spark local execution mode](https://stackoverflow.com/a/54064507/3364156), which means that all Spark jobs will be executed in a single JVM locally, rather than in a cluster. `pyspark-iris` Kedro starter used to generate the project already has all necessary configuration for it to work, you just need to have `pyspark` Python package installed, which is done for you by `kedro install` command below. +Now, as the project has been successfully created, we should move into the project root directory, install project dependencies, and then start a local test run using [Spark local execution mode](https://stackoverflow.com/a/54064507/3364156), which means that all Spark jobs will be executed in a single JVM locally, rather than in a cluster. `pyspark-iris` Kedro starter used to generate the project already has all necessary configuration for it to work, you just need to have `pyspark` Python package installed, which is done for you by `pip install -r src/requirements.txt` command below. ```bash # change the directory to the project root cd iris-databricks/ # compile and install the project dependencies, this may take a few minutes -kedro install +pip install -r src/requirements.txt # start a local run kedro run ``` @@ -147,7 +147,7 @@ dbutils.fs.ls("dbfs:/iris-databricks/data/01_raw/") Then type `exit()` to terminate the Python session. -Finally, modify the project catalog so that the `example_iris_data` dataset points to a new DBFS location instead of local. You can use Kedro [configuration environments](../04_kedro_project_setup/02_configuration.md#additional-configuration-environments) for this. +Finally, modify the project catalog so that the `example_iris_data` dataset points to a new DBFS location instead of local. You can use Kedro [configuration environments](../kedro_project_setup/configuration.md#additional-configuration-environments) for this. Copy the `catalog.yml` from `base` into `dbfs` environment by running the CLI command: diff --git a/docs/source/10_deployment/01_deployment_guide.md b/docs/source/deployment/deployment_guide.md similarity index 51% rename from docs/source/10_deployment/01_deployment_guide.md rename to docs/source/deployment/deployment_guide.md index bab6791dbf..b41c8b9588 100644 --- a/docs/source/10_deployment/01_deployment_guide.md +++ b/docs/source/deployment/deployment_guide.md @@ -4,20 +4,20 @@ Your choice of deployment method will depend on a number of factors. In this section we provide a number of guides for different approaches. -If you decide to deploy your Kedro project on a single machine, you should consult our [guide to single-machine deployment](02_single_machine.md), and decide whether to [use Docker for container-based deployment](./02_single_machine.md#container-based) or to use [package-based deployment](./02_single_machine.md#package-based) or to [use the CLI to clone and deploy](./02_single_machine.md#cli-based) your codebase to a server. 
+If you decide to deploy your Kedro project on a single machine, you should consult our [guide to single-machine deployment](single_machine.md), and decide whether to [use Docker for container-based deployment](./single_machine.md#container-based) or to use [package-based deployment](./single_machine.md#package-based) or to [use the CLI to clone and deploy](./single_machine.md#cli-based) your codebase to a server. -If your pipeline is sizeable, you will want to run parts of it on separate machines, so will need to consult our [guide to distributed deployment](03_distributed.md). +If your pipeline is sizeable, you will want to run parts of it on separate machines, so will need to consult our [guide to distributed deployment](distributed.md). We also provide information to help you deploy to the following: -* to [Argo Workflows](04_argo.md) -* to [Prefect](05_prefect.md) -* to [Kubeflow Workflows](06_kubeflow.md) -* to [AWS Batch](07_aws_batch.md) -* to [Databricks](08_databricks.md) +* to [Argo Workflows](argo.md) +* to [Prefect](prefect.md) +* to [Kubeflow Workflows](kubeflow.md) +* to [AWS Batch](aws_batch.md) +* to [Databricks](databricks.md) -In addition, we also provide instructions on [how to integrate a Kedro project with Amazon SageMaker](09_aws_sagemaker.md). +In addition, we also provide instructions on [how to integrate a Kedro project with Amazon SageMaker](aws_sagemaker.md). ![](../meta/images/deployments.png) diff --git a/docs/source/10_deployment/03_distributed.md b/docs/source/deployment/distributed.md similarity index 93% rename from docs/source/10_deployment/03_distributed.md rename to docs/source/deployment/distributed.md index 105a8c3356..45f228210c 100644 --- a/docs/source/10_deployment/03_distributed.md +++ b/docs/source/deployment/distributed.md @@ -20,7 +20,7 @@ kedro build-reqs We then recommend the [`Kedro-Docker`](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker) plugin to streamline the process of building the image. [Instructions for using this are in the plugin's README.md](https://github.com/kedro-org/kedro-plugins/blob/main/README.md). -After you’ve built the Docker image for your project locally, you would typically have to transfer the image to a container registry, such as DockerHub or AWS Elastic Container Registry, to be able to pull it on your remote servers. You can find instructions on how to do so [in our guide for single-machine deployment](./02_single_machine.md#how-to-use-container-registry). +After you’ve built the Docker image for your project locally, you would typically have to transfer the image to a container registry, such as DockerHub or AWS Elastic Container Registry, to be able to pull it on your remote servers. You can find instructions on how to do so [in our guide for single-machine deployment](./single_machine.md#how-to-use-container-registry). ## 2. Convert your Kedro pipeline into targeted platform's primitives @@ -40,4 +40,4 @@ We encourage you to play with different ways of parameterising your runs as you ## 4. (Optional) Create starters -This is an optional step, but it may speed up your work in the long term. If you find yourself having to deploy in a similar environment or to a similar platform fairly often, you may want to build your own [Kedro starter](../02_get_started/06_starters.md). That way you will be able to re-use any deployment scripts written as part of step 2. +This is an optional step, but it may speed up your work in the long term. 
If you find yourself having to deploy in a similar environment or to a similar platform fairly often, you may want to build your own [Kedro starter](../get_started/starters.md). That way you will be able to re-use any deployment scripts written as part of step 2. diff --git a/docs/source/10_deployment/06_kubeflow.md b/docs/source/deployment/kubeflow.md similarity index 96% rename from docs/source/10_deployment/06_kubeflow.md rename to docs/source/deployment/kubeflow.md index ec6e81720d..28fcd75f29 100644 --- a/docs/source/10_deployment/06_kubeflow.md +++ b/docs/source/deployment/kubeflow.md @@ -18,7 +18,7 @@ To use Kubeflow Pipelines, make sure you have the following prerequisites in pla - Kubeflow Pipelines is [installed](https://www.kubeflow.org/docs/started/getting-started/) on your Kubernetes cluster - Kubeflow Pipelines SDK is [installed](https://www.kubeflow.org/docs/pipelines/sdk/install-sdk/) locally - A `name` attribute is set for each Kedro [node](/kedro.pipeline.node) since it is used to trigger runs -- All node input/output DataSets must be configured in `catalog.yml` and refer to an external location (e.g. [AWS S3](../05_data/01_data_catalog.md#using-the-data-catalog-with-the-yaml-api)); you cannot use the `MemoryDataSet` in your workflow +- All node input/output DataSets must be configured in `catalog.yml` and refer to an external location (e.g. [AWS S3](../data/data_catalog.md#using-the-data-catalog-with-the-yaml-api)); you cannot use the `MemoryDataSet` in your workflow ```eval_rst .. note:: Each node runs in its own container. @@ -32,7 +32,7 @@ First, you need to containerise your Kedro project, using any preferred containe For the purpose of this walk-through, we are going to assume a `Docker` workflow. We recommend the [`Kedro-Docker`](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker) plugin to streamline the process. [Instructions for Kedro-Docker are in the plugin's README.md](https://github.com/kedro-org/kedro-plugins/blob/main/README.md). -After you’ve built the Docker image for your project locally, [transfer the image to a container registry](./02_single_machine.md#how-to-use-container-registry). +After you’ve built the Docker image for your project locally, [transfer the image to a container registry](./single_machine.md#how-to-use-container-registry). ### Create a workflow spec diff --git a/docs/source/10_deployment/05_prefect.md b/docs/source/deployment/prefect.md similarity index 100% rename from docs/source/10_deployment/05_prefect.md rename to docs/source/deployment/prefect.md diff --git a/docs/source/10_deployment/02_single_machine.md b/docs/source/deployment/single_machine.md similarity index 92% rename from docs/source/10_deployment/02_single_machine.md rename to docs/source/deployment/single_machine.md index 4970f5cb61..624f176f9e 100644 --- a/docs/source/10_deployment/02_single_machine.md +++ b/docs/source/deployment/single_machine.md @@ -2,8 +2,8 @@ This topic explains how to deploy Kedro on a production server. 
You can use three alternative methods to deploy your Kedro pipelines: - Container based using [Kedro-Docker](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker) -- Package based using [`kedro package`](../09_development/03_commands_reference.md#deploy-the-project) -- CLI based using the [Kedro CLI](../09_development/03_commands_reference.md) +- Package based using [`kedro package`](../development/commands_reference.md#deploy-the-project) +- CLI based using the [Kedro CLI](../development/commands_reference.md) ## Container based @@ -45,7 +45,7 @@ If you prefer not to use containerisation, you can instead package your Kedro pr kedro package ``` -Kedro builds the package into the `src/dist/` folder of your project, and creates one `.egg` file and one `.whl` file, which are [Python packaging formats for binary distribution](https://packaging.python.org/overview/). +Kedro builds the package into the `dist/` folder of your project, and creates one `.egg` file and one `.whl` file, which are [Python packaging formats for binary distribution](https://packaging.python.org/overview/). The resulting package only contains the Python source code of your Kedro pipeline, not any of the `conf/`, `data/` and `logs/` subfolders nor the `pyproject.toml` file. This means that you can distribute the project to run elsewhere, such as on a separate computer with different configuration, data and logging. When distributed, the packaged project must be run from within a directory that contains the `pyproject.toml` file and `conf/` subfolder (and `data/` and `logs/` if your pipeline loads/saves local data or uses logging). This means that you will have to create these directories on the remote servers manually. @@ -114,7 +114,7 @@ conda install -c conda-forge kedro Install the project’s dependencies, by running the following in the project's root directory: ```console -kedro install +pip install -r src/requirements.txt ``` After having installed your project on the remote server you can run the Kedro project as follows from the root of the project: diff --git a/docs/source/09_development/03_commands_reference.md b/docs/source/development/commands_reference.md similarity index 81% rename from docs/source/09_development/03_commands_reference.md rename to docs/source/development/commands_reference.md index 79be964544..e290e7a4ea 100644 --- a/docs/source/09_development/03_commands_reference.md +++ b/docs/source/development/commands_reference.md @@ -60,10 +60,9 @@ Here is a list of Kedro CLI commands, as a shortcut to the descriptions below. P * Project-specific Kedro commands * [`kedro activate-nbstripout`](#strip-output-cells) * [`kedro build-docs`](#build-the-project-documentation) - * [`kedro build-reqs`](#build-the-project-s-dependency-tree) + * [`kedro build-reqs`](#build-the-projects-dependency-tree) * [`kedro catalog list`](#list-datasets-per-pipeline-per-type) * [`kedro catalog create`](#create-a-data-catalog-yaml-configuration-file) - * [`kedro install`](#install-all-package-dependencies) * [`kedro ipython`](#notebooks) * [`kedro jupyter convert`](#copy-tagged-cells) * [`kedro jupyter lab`](#notebooks) @@ -72,10 +71,10 @@ Here is a list of Kedro CLI commands, as a shortcut to the descriptions below. 
P * [`kedro package`](#deploy-the-project) * [`kedro pipeline create `](#create-a-new-modular-pipeline-in-your-project) * [`kedro pipeline delete `](#delete-a-modular-pipeline) - * [`kedro pipeline describe `](#describe-a-pipeline) - * [`kedro pipeline list`](#list-all-pipelines-in-your-project) * [`kedro pipeline package `](#package-a-modular-pipeline) * [`kedro pipeline pull `](#pull-a-modular-pipeline) + * [`kedro registry describe `](#describe-a-registered-pipeline) + * [`kedro registry list`](#list-all-registered-pipelines-in-your-project) * [`kedro run`](#run-the-project) * [`kedro test`](#test-your-project) @@ -146,7 +145,7 @@ kedro docs Kedro's command line interface (CLI) allows you to associate a set of commands and dependencies with a target, which you can then execute from inside the project directory. -The commands a project supports are specified in its `cli.py` file, which can be extended, either by modifying the file or by injecting commands into it via the [`plugin` framework](../07_extend_kedro/04_plugins.md). +The commands a project supports are specified on the framework side. If you want to customise any of the Kedro commands you can do this either by adding a file called `cli.py` or by injecting commands into it via the [`plugin` framework](../extend_kedro/plugins.md). Find the template for the `cli.py` file [here](../extend_kedro/common_use_cases.md#use-case-3-how-to-add-or-modify-cli-commands). ### Project setup @@ -156,19 +155,21 @@ The commands a project supports are specified in its `cli.py` file, which can be kedro build-reqs ``` -This command runs [`pip-compile`](https://github.com/jazzband/pip-tools#example-usage-for-pip-compile) on the project's `src/requirements.in` file. If the file doesn't exist, Kedro will create it by copying from `src/requirements.txt`. +This command runs [`pip-compile`](https://github.com/jazzband/pip-tools#example-usage-for-pip-compile) on the project's `src/requirements.txt` file and will create `src/requirements.lock` with the compiled requirements. -`kedro build-reqs` also accepts and passes through CLI options accepted by `pip-compile`. For example, `kedro build-reqs --generate-hashes` will call `pip-compile --generate-hashes src/requirements.in`. +`kedro build-reqs` has two optional arguments to specify which file to compile the requirements from and where to save the compiled requirements to. These arguments are `--input-file` and `--output-file` respectively. + +`kedro build-reqs` also accepts and passes through CLI options accepted by `pip-compile`. For example, `kedro build-reqs --generate-hashes` will call `pip-compile --output-file=src/requirements.lock --generate-hashes src/requirements.txt`. #### Install all package dependencies The following runs [`pip`](https://github.com/pypa/pip) to install all package dependencies specified in `src/requirements.txt`: ```bash -kedro install +pip install -r src/requirements.txt ``` -For further information, see the [`kedro install` documentation](../04_kedro_project_setup/01_dependencies.md#kedro-install). +For further information, see the [documentation on installing project-specific dependencies](../kedro_project_setup/dependencies.md#install-project-specific-dependencies). ### Run the project @@ -178,7 +179,7 @@ Call the `run()` method of the `KedroSession` defined in `kedro.framework.sessio kedro run ``` -`KedroContext` can be extended in `run.py` (`src/project-name/run.py`). 
In order to use the extended `KedroContext` you need to set `context_path` in [`pyproject.toml`](../12_faq/02_architecture_overview) configuration file. +`KedroContext` can be extended in `run.py` (`src/project-name/run.py`). In order to use the extended `KedroContext` you need to set `context_path` in [`pyproject.toml`](../faq/architecture_overview) configuration file. #### Modifying a `kedro run` @@ -196,10 +197,7 @@ Kedro has options to modify pipeline runs. Here is a list of CLI arguments suppo +---------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+-----------------------------+ | :code:`kedro run --node debug_me,debug_me_too` | Run only nodes with specified names | Yes | +---------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+-----------------------------+ -| :code:`kedro run --runner runner_name` | Run the pipeline with a specific runner. Cannot be used together with :code:`--parallel`| No | -+---------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+-----------------------------+ -| :code:`kedro run --parallel` | Run the pipeline using the :code:`ParallelRunner`. If not specified, use the | No | -| | :code:`SequentialRunner`. Cannot be used together with :code:`--runner` | | +| :code:`kedro run --runner runner_name` | Run the pipeline with a specific runner | No | +---------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+-----------------------------+ | :code:`kedro run --env env_name` | Run the pipeline in the env_name environment. Defaults to local if not provided | No | +---------------------------------------------------------------------------+-----------------------------------------------------------------------------------------+-----------------------------+ @@ -224,13 +222,13 @@ You can also combine these options together, so the following command runs all t kedro run --from-nodes split --to-nodes predict,report ``` -This functionality is extended to the `kedro run --config config.yml` command, which allows you to [specify run commands in a configuration file](../04_kedro_project_setup/02_configuration.md#configure-kedro-run-arguments). +This functionality is extended to the `kedro run --config config.yml` command, which allows you to [specify run commands in a configuration file](../kedro_project_setup/configuration.md#configure-kedro-run-arguments). -A parameterised run is best used for dynamic parameters, i.e. running the same pipeline with different inputs, for static parameters that do not change we recommend following the [Kedro project setup methodology](../04_kedro_project_setup/02_configuration.md#parameters). +A parameterised run is best used for dynamic parameters, i.e. running the same pipeline with different inputs, for static parameters that do not change we recommend following the [Kedro project setup methodology](../kedro_project_setup/configuration.md#parameters). 
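As a rough, non-authoritative illustration of how these flags relate to the underlying API: the options of `kedro run` broadly map onto keyword arguments of `KedroSession.run`. The snippet below sketches the combined run shown above from Python; the package name `spaceflights` is a placeholder and exact argument names can vary between Kedro versions, so check them against your installed release.

```python
# Hedged sketch: programmatic equivalent of
# `kedro run --from-nodes split --to-nodes predict,report --env local`
from kedro.framework.session import KedroSession

# "spaceflights" is a placeholder package name
with KedroSession.create("spaceflights", env="local") as session:
    session.run(
        from_nodes=["split"],
        to_nodes=["predict", "report"],
    )
```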
### Deploy the project -The following packages your application as one `.egg` file and one `.whl` file within the `src/dist/` folder of your project: +The following packages your application as one `.egg` file and one `.whl` file within the `dist/` folder of your project: ```bash kedro package @@ -242,10 +240,10 @@ See the Python documentation for [further information about packaging](https://p Since Kedro 0.16.4 you can pull a modular pipeline into your Kedro project as follows: ```bash -kedro pipeline pull +kedro pipeline pull ``` -The above command will take the bundled `.whl` file and do the following: +The above command will take the bundled `.tar.gz` file and do the following: * Place source code in `src//pipelines/` * Place parameters in `conf/base/parameters/.yml` @@ -254,8 +252,8 @@ The above command will take the bundled `.whl` file and do the following: `kedro pipeline pull` works with PyPI, local and cloud storage: * PyPI: `kedro pipeline pull ` with `` being a package on PyPI -* Local storage: `kedro pipeline pull /src/dist/-0.1-py3-none-any.whl` -* Cloud storage: `kedro pipeline pull s3:///-0.1-py3-none-any.whl` +* Local storage: `kedro pipeline pull /dist/-0.1.tar.gz` +* Cloud storage: `kedro pipeline pull s3:///-0.1.tar.gz` ### Project quality @@ -265,7 +263,7 @@ The above command will take the bundled `.whl` file and do the following: kedro build-docs ``` -The `build-docs` command builds [project documentation](../03_tutorial/05_package_a_project.md#add-documentation-to-your-project) using the [Sphinx](https://www.sphinx-doc.org) framework. To further customise your documentation, please refer to `docs/source/conf.py` and the [Sphinx documentation](http://www.sphinx-doc.org/en/master/usage/configuration.html). +The `build-docs` command builds [project documentation](../tutorial/package_a_project.md#add-documentation-to-your-project) using the [Sphinx](https://www.sphinx-doc.org) framework. To further customise your documentation, please refer to `docs/source/conf.py` and the [Sphinx documentation](http://www.sphinx-doc.org/en/master/usage/configuration.html). #### Lint your project @@ -274,7 +272,7 @@ The `build-docs` command builds [project documentation](../03_tutorial/05_packag kedro lint ``` -Your project is linted with [`black`](https://github.com/psf/black), [`flake8`](https://gitlab.com/pycqa/flake8) and [`isort`](https://github.com/PyCQA/isort). See our [documentation about `kedro lint`](../09_development/04_lint.md#linting-your-kedro-project) for further details. +Your project is linted with [`black`](https://github.com/psf/black), [`flake8`](https://gitlab.com/pycqa/flake8) and [`isort`](https://github.com/PyCQA/isort). 
#### Test your project @@ -289,27 +287,29 @@ kedro test #### Modular pipelines -##### Create a new [modular pipeline](../06_nodes_and_pipelines/03_modular_pipelines) in your project +##### Create a new [modular pipeline](../nodes_and_pipelines/modular_pipelines) in your project + ```bash kedro pipeline create ``` ##### Package a modular pipeline -The following command packages all the files related to a modular pipeline into a [wheel file](https://pythonwheels.com/): +The following command packages all the files related to a modular pipeline into a [Python source distribution file](https://packaging.python.org/overview/#python-source-distributions): ```bash -kedro pipeline package +kedro pipeline package ``` -Further information is available in the [pipeline documentation](../06_nodes_and_pipelines/03_modular_pipelines.md#package-a-modular-pipeline). +Further information is available in the [pipeline documentation](../nodes_and_pipelines/modular_pipelines.md#package-a-modular-pipeline). ##### Pull a modular pipeline in your project -The following command pulls all the files related to a modular pipeline from either [Pypi](https://pypi.org/) or a storage location of a [wheel file](https://pythonwheels.com/). +The following command pulls all the files related to a modular pipeline from either [Pypi](https://pypi.org/) or a storage location of a [Python source distribution file](https://packaging.python.org/overview/#python-source-distributions). + ```bash -kedro pipeline pull (or path to a wheel file) +kedro pipeline pull (or path to a sdist file) ``` -Further information is available in the [pipeline documentation](../06_nodes_and_pipelines/03_modular_pipelines.md#pull-a-modular-pipeline). +Further information is available in the [pipeline documentation](../nodes_and_pipelines/modular_pipelines.md#pull-a-modular-pipeline). ##### Delete a modular pipeline The following command deletes all the files related to a modular pipeline in your Kedro project. @@ -318,24 +318,28 @@ The following command deletes all the files related to a modular pipeline in you kedro pipeline delete ``` -Further information is available in the [pipeline documentation](../06_nodes_and_pipelines/03_modular_pipelines.md#pull-a-modular-pipeline). +Further information is available in the [pipeline documentation](../nodes_and_pipelines/modular_pipelines.md#pull-a-modular-pipeline). -##### Describe a pipeline + +#### Registered pipelines + +##### Describe a registered pipeline ```bash -kedro pipeline describe +kedro registry describe ``` The output includes all the nodes in the pipeline. If no pipeline name is provided, this command returns all nodes in the `__default__` pipeline. 
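As a side note (an illustration, not part of the command reference itself): the registry that `kedro registry list` and `kedro registry describe` read from is also importable from Python, which can be handy in plugins or an interactive session where the project has already been bootstrapped.

```python
# Sketch only: assumes it runs where the Kedro project is already configured,
# e.g. inside a plugin command or a `kedro ipython` session.
from kedro.framework.project import pipelines

print(list(pipelines))                    # names of all registered pipelines
for node in pipelines["__default__"].nodes:
    print(node.name)                      # roughly what `kedro registry describe` reports
```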
-##### List all pipelines in your project +##### List all registered pipelines in your project ```bash -kedro pipeline list +kedro registry list ``` #### Datasets ##### List datasets per pipeline per type + ```bash kedro catalog list ``` @@ -383,16 +387,16 @@ kedro ipython Every time you start or restart a notebook kernel, a startup script (`/.ipython/profile_default/startup/00-kedro-init.py`) will add the following variables in scope: -- `context`: An instance of `kedro.framework.context.KedroContext` class or custom context class extending `KedroContext` if one was set to `CONTEXT_CLASS` in `settings.py` file (further details of how to use `context` can be found [in the IPython documentation](../11_tools_integration/02_ipython.md)) +- `context`: An instance of `kedro.framework.context.KedroContext` class or custom context class extending `KedroContext` if one was set to `CONTEXT_CLASS` in `settings.py` file (further details of how to use `context` can be found [in the IPython documentation](../tools_integration/ipython.md)) - `startup_error` (`Exception`) - `catalog` To reload these variables at any point in your notebook (e.g. if you updated `catalog.yml`) use the [line magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html#line-magics) `%reload_kedro`, which can be also used to see the error message if any of the variables above are undefined. -If you get an error message `Module ```` not found. Make sure to install required project dependencies by running ``kedro install`` command first.` when running any of those commands, it indicates that some Jupyter or IPython dependencies are not installed in your environment. To resolve this you will need to do the following: +If you get an error message `Module ```` not found. Make sure to install required project dependencies by running ``pip install -r requirements.txt`` first.` when running any of those commands, it indicates that some Jupyter or IPython dependencies are not installed in your environment. To resolve this you will need to do the following: -1. Make sure the corresponding dependency is present in `src/requirements.in` (`src/requirements.txt` if not compiled) -2. Run [`kedro install`](#install-all-package-dependencies) command from your terminal +1. Make sure the corresponding dependency is present in `src/requirements.txt` +2. Run [`pip install -r src/requirements.txt`](#install-all-package-dependencies) command from your terminal ##### Copy tagged cells To copy the code from cells [tagged](https://jupyter-notebook.readthedocs.io/en/stable/changelog.html#cell-tags) with `node` tag into Python files under `src//nodes/` in a Kedro project: diff --git a/docs/source/09_development/05_debugging.md b/docs/source/development/debugging.md similarity index 90% rename from docs/source/09_development/05_debugging.md rename to docs/source/development/debugging.md index 4e5ba3c220..af262fc264 100644 --- a/docs/source/09_development/05_debugging.md +++ b/docs/source/development/debugging.md @@ -7,9 +7,9 @@ If you're running your Kedro pipeline from the CLI or you can't/don't want to ru * If you have long running nodes or pipelines, inserting `print` statements and running them multiple times quickly becomes a time-consuming procedure. 
* Debugging nodes outside the `run` session isn't very helpful because getting access to the local scope within the `node` can be hard, especially if you're dealing with large data or memory datasets, where you need to chain a few nodes together or re-run your pipeline to produce the data for debugging purposes. -This guide provides examples on how to instantiate a [post-mortem](https://docs.python.org/3/library/pdb.html#pdb.post_mortem) debugging session with [`pdb`](https://docs.python.org/3/library/pdb.html) using [Hooks](../07_extend_kedro/02_hooks.md) when an uncaught error occurs during a pipeline run. Note that [ipdb](https://pypi.org/project/ipdb/) could be integrated in the same manner. +This guide provides examples on how to instantiate a [post-mortem](https://docs.python.org/3/library/pdb.html#pdb.post_mortem) debugging session with [`pdb`](https://docs.python.org/3/library/pdb.html) using [Hooks](../extend_kedro/hooks.md) when an uncaught error occurs during a pipeline run. Note that [ipdb](https://pypi.org/project/ipdb/) could be integrated in the same manner. -If you are looking for guides on how to setup debugging with IDEs, please visit the guide for [VSCode](./01_set_up_vscode.md#debugging) and [PyCharm](./02_set_up_pycharm.md#debugging). +If you are looking for guides on how to setup debugging with IDEs, please visit the guide for [VSCode](./set_up_vscode.md#debugging) and [PyCharm](./set_up_pycharm.md#debugging). ## Debugging Node diff --git a/docs/source/09_development/02_set_up_pycharm.md b/docs/source/development/set_up_pycharm.md similarity index 98% rename from docs/source/09_development/02_set_up_pycharm.md rename to docs/source/development/set_up_pycharm.md index 689439e3c4..bbb9f5caf6 100644 --- a/docs/source/09_development/02_set_up_pycharm.md +++ b/docs/source/development/set_up_pycharm.md @@ -149,7 +149,7 @@ You can configure Pycharm's IPython to load Kedro's Extension. Click **PyCharm | Preferences** for macOS or **File | Settings**, inside **Build, Execution, Deployment** and **Console**, enter the **Python Console** configuration. -You can append the configuration necessary to use Kedro IPython to the **Starting script** as described in the [IPython configuring documentation](https://kedro.readthedocs.io/en/latest/11_tools_integration/02_ipython.html). +You can append the configuration necessary to use Kedro IPython to the **Starting script** as described in the [IPython configuring documentation](../tools_integration/ipython.md). ![](../meta/images/pycharm_ipython_starting_script.png) diff --git a/docs/source/09_development/01_set_up_vscode.md b/docs/source/development/set_up_vscode.md similarity index 100% rename from docs/source/09_development/01_set_up_vscode.md rename to docs/source/development/set_up_vscode.md diff --git a/docs/source/extend_kedro/common_use_cases.md b/docs/source/extend_kedro/common_use_cases.md new file mode 100644 index 0000000000..13d0a9b216 --- /dev/null +++ b/docs/source/extend_kedro/common_use_cases.md @@ -0,0 +1,131 @@ +# Common use cases + +Kedro has a few built-in mechanisms for you to extend its behaviour. This document explains how to select which mechanism to employ for the most common use cases. 
+ +## Use Case 1: How to add extra behaviour to Kedro's execution timeline + +The execution timeline of a Kedro pipeline can be thought of as a sequence of actions performed by various Kedro library components, such as the [DataSets](/kedro.extras.datasets), [DataCatalog](/kedro.io.DataCatalog), [Pipeline](/kedro.pipeline.Pipeline), and [Node](/kedro.pipeline.node.Node). + +At different points in the lifecycle of these components, you may want to add extra behaviour. For example, you could add extra computation for profiling purposes _before_ and _after_ a node runs or _before_ and _after_ the I/O actions of a dataset, namely the `load` and `save` actions. + +This can now be achieved by using [Hooks](./hooks.md) to define the extra behaviour and at which point in the execution timeline it should be injected. + +## Use Case 2: How to integrate Kedro with additional data sources + +You can use [DataSets](/kedro.extras.datasets) to interface with various data sources. If the data source you plan to use is not supported out of the box by Kedro, you can [create a custom dataset](custom_datasets.md). + +## Use Case 3: How to add or modify CLI commands + +If you want to customise a built-in Kedro command, such as `kedro run`, for a specific project, add a `cli.py` file that defines a custom `run()` function. You should add the `cli.py` file at the same level as `settings.py`. A template for the `cli.py` file is in the section below. + +
      +Click to expand + +``` +"""Command line tools for manipulating a Kedro project. +Intended to be invoked via `kedro`.""" +import click +from kedro.framework.cli.utils import ( + CONTEXT_SETTINGS, + _config_file_callback, + _reformat_load_versions, + _split_params, + env_option, + split_string, +) + +from kedro.framework.cli.project import ( + FROM_INPUTS_HELP, TO_OUTPUTS_HELP, FROM_NODES_HELP, TO_NODES_HELP, NODE_ARG_HELP, + RUNNER_ARG_HELP, ASYNC_ARG_HELP, TAG_ARG_HELP, LOAD_VERSION_HELP, + PIPELINE_ARG_HELP, CONFIG_FILE_HELP, PARAMS_ARG_HELP +) + + +@click.group(context_settings=CONTEXT_SETTINGS, name=__file__) +def cli(): + """Command line tools for manipulating a Kedro project.""" + + +@cli.command() +@click.option( + "--from-inputs", type=str, default="", help=FROM_INPUTS_HELP, callback=split_string +) +@click.option( + "--to-outputs", type=str, default="", help=TO_OUTPUTS_HELP, callback=split_string +) +@click.option( + "--from-nodes", type=str, default="", help=FROM_NODES_HELP, callback=split_string +) +@click.option( + "--to-nodes", type=str, default="", help=TO_NODES_HELP, callback=split_string +) +@click.option("--node", "-n", "node_names", type=str, multiple=True, help=NODE_ARG_HELP) +@click.option( + "--runner", "-r", type=str, default=None, multiple=False, help=RUNNER_ARG_HELP +) +@click.option("--async", "is_async", is_flag=True, multiple=False, help=ASYNC_ARG_HELP) +@env_option +@click.option("--tag", "-t", type=str, multiple=True, help=TAG_ARG_HELP) +@click.option( + "--load-version", + "-lv", + type=str, + multiple=True, + help=LOAD_VERSION_HELP, + callback=_reformat_load_versions, +) +@click.option("--pipeline", "-p", type=str, default=None, help=PIPELINE_ARG_HELP) +@click.option( + "--config", + "-c", + type=click.Path(exists=True, dir_okay=False, resolve_path=True), + help=CONFIG_FILE_HELP, + callback=_config_file_callback, +) +@click.option( + "--params", type=str, default="", help=PARAMS_ARG_HELP, callback=_split_params +) +def run( + tag, + env, + runner, + is_async, + node_names, + to_nodes, + from_nodes, + from_inputs, + to_outputs, + load_version, + pipeline, + config, + params, +): + """Run the pipeline.""" + + == ADD YOUR CUSTOM RUN COMMAND CODE HERE == + +``` +
      + +If you want to customise a Kedro command from a command group, such as `kedro pipeline` or `kedro jupyter`, you need to import the corresponding click command group from the Kedro framework `cli`. For `kedro pipeline` commands this would be `from kedro.framework.cli.pipeline import pipeline`, and for `kedro jupyter` commands `from kedro.framework.cli.jupyter import jupyter`. Note that you must still add the `cli` click group from the snippet above, even if you don't modify it. + +You can then add or overwrite any command by adding it to the click group, as in the snippet below: +``` +@jupyter.command("notebook") +@env_option( + help="Open a notebook" +) +def notebook_run(...): + == ADD YOUR CUSTOM NOTEBOOK COMMAND CODE HERE == +``` + +To inject additional CLI commands intended to be reused across projects, please refer to our [plugins](./plugins.md) system. An example of one such command is the `kedro viz` command introduced by the official [Kedro-Viz](https://github.com/kedro-org/kedro-viz) plugin. This command is intended to work on every Kedro project and therefore must be a standalone plugin. + +```eval_rst +.. note:: Your plugin's implementation can take advantage of other extension mechanisms such as Hooks. +``` + +## Use Case 4: How to customise the initial boilerplate of your project + +Sometimes you might want to tailor the starting boilerplate of a Kedro project to your specific needs. For example, your organisation might have a standard CI script that you want to include in every new Kedro project. To this end, please visit our guide to [create Kedro starters](./create_kedro_starters.md) to solve this extension requirement. diff --git a/docs/source/07_extend_kedro/05_create_kedro_starters.md b/docs/source/extend_kedro/create_kedro_starters.md similarity index 93% rename from docs/source/07_extend_kedro/05_create_kedro_starters.md rename to docs/source/extend_kedro/create_kedro_starters.md index 06a0026acf..e9a5f08189 100644 --- a/docs/source/07_extend_kedro/05_create_kedro_starters.md +++ b/docs/source/extend_kedro/create_kedro_starters.md @@ -51,7 +51,7 @@ If you want `cookiecutter` to provide sensible **defaults** in case a user doesn To review an example Kedro starter, clone [`pandas-iris`](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris) from Github. -When you create an Iris dataset example project by calling `kedro new`, you supply three configuration variables as the documentation in [Create a new project](../02_get_started/04_new_project.md) describes. These variables are `project_name`, `repo_name` and `python_package` and they are supplied interactively or by means of a configuration file. You can see how these variables are used by inspecting the template: +When you create an Iris dataset example project by calling `kedro new`, you supply three configuration variables as the documentation in [Create a new project](../get_started/new_project.md) describes. These variables are `project_name`, `repo_name` and `python_package` and they are supplied interactively or by means of a configuration file. 
You can see how these variables are used by inspecting the template: **project_name** @@ -79,8 +79,6 @@ Here is the layout of the project as a Cookiecutter template: └── src # Project source code └── {{ cookiecutter.python_package }} ├── __init.py__ - ├── cli.py # A collection of Kedro command line interface (CLI) commands - ├── hooks.py ├── pipelines ├── pipeline_registry.py ├── __main__.py diff --git a/docs/source/07_extend_kedro/03_custom_datasets.md b/docs/source/extend_kedro/custom_datasets.md similarity index 94% rename from docs/source/07_extend_kedro/03_custom_datasets.md rename to docs/source/extend_kedro/custom_datasets.md index d70066d327..41304cd77b 100644 --- a/docs/source/07_extend_kedro/03_custom_datasets.md +++ b/docs/source/extend_kedro/custom_datasets.md @@ -8,7 +8,7 @@ In this example, we use a [Kaggle dataset of Pokémon images and types](https:// ## Project setup -We assume that you have already [installed Kedro](../02_get_started/02_install.md). Now [create a project](../02_get_started/04_new_project.md) (feel free to name your project as you like, but here we will assume the project's repository name is `kedro-pokemon`). +We assume that you have already [installed Kedro](../get_started/install.md). Now [create a project](../get_started/new_project.md) (feel free to name your project as you like, but here we will assume the project's repository name is `kedro-pokemon`). Log into your Kaggle account to [download the Pokémon dataset](https://www.kaggle.com/vishalsubbiah/pokemon-images-and-types) and unzip it into `data/01_raw`, within a subfolder named `pokemon-images-and-types`. The data comprises a single `pokemon.csv` file plus a subfolder of images. @@ -90,7 +90,7 @@ src/kedro_pokemon/extras ## Implement the `_load` method with `fsspec` -Many of the built-in Kedro datasets rely on [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) as a consistent interface to different data sources, as described earlier in the section about the [Data Catalog](../05_data/01_data_catalog.md#specifying-the-location-of-the-dataset). In this example, it's particularly convenient to use `fsspec` in conjunction with `Pillow` to read image data, since it allows the dataset to work flexibly with different image locations and formats. +Many of the built-in Kedro datasets rely on [fsspec](https://filesystem-spec.readthedocs.io/en/latest/) as a consistent interface to different data sources, as described earlier in the section about the [Data Catalog](../data/data_catalog.md#specifying-the-location-of-the-dataset). In this example, it's particularly convenient to use `fsspec` in conjunction with `Pillow` to read image data, since it allows the dataset to work flexibly with different image locations and formats. Here is the implementation of the `_load` method using `fsspec` and `Pillow` to read the data of a single image into a `numpy` array: @@ -263,7 +263,7 @@ class ImageDataSet(AbstractDataSet): Currently, the `ImageDataSet` only works with a single image, but this example needs to load all Pokemon images from the raw data directory for further processing. -Kedro's [`PartitionedDataSet`](../05_data/02_kedro_io.md#partitioned-dataset) is a convenient way to load multiple separate data files of the same underlying dataset type into a directory. +Kedro's [`PartitionedDataSet`](../data/kedro_io.md#partitioned-dataset) is a convenient way to load multiple separate data files of the same underlying dataset type into a directory. 
To use `PartitionedDataSet` with `ImageDataSet` to load all Pokemon PNG images, add this to the data catalog YAML so that `PartitionedDataSet` loads all PNG files from the data directory using `ImageDataSet`: @@ -297,7 +297,7 @@ $ ls -la data/01_raw/pokemon-images-and-types/images/images/*.png | wc -l ```eval_rst .. note:: Versioning doesn't work with `PartitionedDataSet`. You can't use both of them at the same time. ``` -To add [Versioning](../05_data/02_kedro_io.md#versioning) support to the new dataset we need to extend the +To add [Versioning](../data/kedro_io.md#versioning) support to the new dataset we need to extend the [AbstractVersionedDataSet](/kedro.io.AbstractVersionedDataSet) to: * Accept a `version` keyword argument as part of the constructor @@ -495,7 +495,7 @@ In [2]: context.catalog.save('pikachu', data=img) Inspect the content of the data directory to find a new version of the data, written by `save`. -You may also want to consult the [in-depth documentation about the Versioning API](../05_data/02_kedro_io.md#versioning). +You may also want to consult the [in-depth documentation about the Versioning API](../data/kedro_io.md#versioning). ## Thread-safety @@ -559,7 +559,7 @@ class ImageDataSet(AbstractVersionedDataSet): ... ``` -We provide additional examples of [how to use parameters through the data catalog's YAML API](../05_data/01_data_catalog.md#using-the-data-catalog-with-the-yaml-api). For an example of how to use these parameters in your dataset's constructor, please see the [SparkDataSet](/kedro.extras.datasets.spark.SparkDataSet)'s implementation. +We provide additional examples of [how to use parameters through the data catalog's YAML API](../data/data_catalog.md#using-the-data-catalog-with-the-yaml-api). For an example of how to use these parameters in your dataset's constructor, please see the [SparkDataSet](/kedro.extras.datasets.spark.SparkDataSet)'s implementation. ## How to contribute a custom dataset implementation diff --git a/docs/source/07_extend_kedro/02_hooks.md b/docs/source/extend_kedro/hooks.md similarity index 87% rename from docs/source/07_extend_kedro/02_hooks.md rename to docs/source/extend_kedro/hooks.md index d45384f97d..97471ec967 100644 --- a/docs/source/07_extend_kedro/02_hooks.md +++ b/docs/source/extend_kedro/hooks.md @@ -4,7 +4,7 @@ Hooks are a mechanism to add extra behaviour to Kedro's main execution in an easy and consistent manner. Some examples may include: -* Adding a transformer after the data catalog is loaded +* Adding a log statement after the data catalog is loaded * Adding data validation to the inputs before a node runs, and to the outputs after a node has run. This makes it possible to integrate with other tools like [Great-Expectations](https://docs.greatexpectations.io/en/latest/) * Adding machine learning metrics tracking, e.g. using [MLflow](https://mlflow.org/), throughout a pipeline run @@ -12,15 +12,11 @@ Hooks are a mechanism to add extra behaviour to Kedro's main execution in an eas A Hook is comprised of a Hook specification and Hook implementation. 
To add Hooks to your project you will need to: -* Provide a Hook implementation for an existing Kedro-defined Hook specification -* Register your Hook implementation in the `src//settings.py` file under the `HOOKS` key +* Create or modify the file `/src//hooks.py` to define a Hook implementation for an existing Kedro-defined Hook specification +* Register your Hook implementation in the [`src//settings.py`](../kedro_project_setup/settings.md) file under the `HOOKS` key ### Hook specification -Kedro distinguishes between 2 main types of Hooks: execution timeline and component registration. - -#### Execution timeline Hooks - Kedro defines Hook specifications for particular execution points where users can inject additional behaviour. Currently, the following Hook specifications are provided in [kedro.framework.hooks](/kedro.framework.hooks): * `after_catalog_created` @@ -47,16 +43,6 @@ The naming convention for error hooks is `on__error`, in which: [kedro.framework.hooks](/kedro.framework.hooks) lists the full specifications for which you can inject additional behaviours by providing an implementation. -#### Registration Hooks - -In addition, Kedro defines Hook specifications to register certain library components to be used with the project. This is where users can define their custom class implementations. Currently, the following Hook specifications are provided: - -* `register_pipelines` -* `register_config_loader` -* `register_catalog` - -The naming convention for registration hooks is `register_`. - #### CLI hooks Lastly, Kedro defines a small set of CLI hooks that inject additional behaviour around execution of a Kedro CLI command: @@ -87,30 +73,35 @@ def after_catalog_created( pass ``` -However, if you just want to use this Hook to add transformer for a data catalog after it is created, your Hook implementation can be as simple as: +However, if you just want to use this Hook to list the contents of a data catalog after it is created, your Hook implementation can be as simple as: ```python # /src//hooks.py -from kedro.extras.transformers.time_profiler import ProfileTimeTransformer +import logging + from kedro.framework.hooks import hook_impl from kedro.io import DataCatalog -class TransformerHooks: +class DataCatalogHooks: + @property + def _logger(self): + return logging.getLogger(self.__class__.__name__) + @hook_impl def after_catalog_created(self, catalog: DataCatalog) -> None: - catalog.add_transformer(ProfileTimeTransformer()) + self._logger.info(catalog.list()) ``` ```eval_rst .. note:: The name of a module that contains Hooks implementation is arbitrary and is not restricted to ``hooks.py``. ``` -We recommend that you group related Hook implementations under a namespace, preferably a class, within a `hooks.py` file in your project. +We recommend that you group related Hook implementations under a namespace, preferably a class, within a `hooks.py` file that you create in your project. #### Registering your Hook implementations with Kedro -Hook implementations should be registered with Kedro using the `/src//settings.py` file under the `HOOKS` key. +Hook implementations should be registered with Kedro using the [`/src//settings.py`](../kedro_project_setup/settings.md) file under the `HOOKS` key. You can register more than one implementation for the same specification. They will be called in LIFO (last-in, first-out) order. 
@@ -118,12 +109,12 @@ The following example sets up a Hook so that the `after_data_catalog_created` im ```python # /src//settings.py -from .hooks import ProjectHooks, TransformerHooks +from .hooks import ProjectHooks, DataCatalogHooks -HOOKS = (ProjectHooks(), TransformerHooks()) +HOOKS = (ProjectHooks(), DataCatalogHooks()) ``` -Kedro also has auto-discovery enabled by default. This means that any installed plugins that declare a Hooks entry-point will be registered. To learn more about how to enable this for your custom plugin, see our [plugin development guide](04_plugins.md#hooks). +Kedro also has auto-discovery enabled by default. This means that any installed plugins that declare a Hooks entry-point will be registered. To learn more about how to enable this for your custom plugin, see our [plugin development guide](plugins.md#hooks). ```eval_rst .. note:: Auto-discovered Hooks will run *first*, followed by the ones specified in `settings.py`. @@ -146,9 +137,7 @@ where `` is the name of an installed plugin for which the auto-regi ### Use Hooks to extend a node's behaviour -Prior to Kedro 0.16, to add extra behaviour before and after a node's execution, we recommended using [decorators](07_decorators.md) on individual nodes. We also exposed a convenience method to apply decorators to [all nodes in a `Pipeline`](07_decorators.md#how-to-apply-a-decorator-to-nodes). - -However, after the introduction of Hooks in 0.16, this capability is readily available through the [`before_node_run` and `after_node_run` Hooks](/kedro.framework.hooks.specs.NodeSpecs). Furthermore, you can apply extra behaviour to not only an individual node or an entire Kedro pipeline, but also to a _subset_ of nodes based on their tags or namespaces. For example, let's say we want to add the following extra behaviours to a node: +You can add extra behaviour before and after a node's execution by using the [`before_node_run` and `after_node_run` Hooks](/kedro.framework.hooks.specs.NodeSpecs). Furthermore, you can apply extra behaviour to not only an individual node or an entire Kedro pipeline, but also to a _subset_ of nodes based on their tags or namespaces. For example, let's say we want to add the following extra behaviours to a node: ```python from kedro.pipeline.node import Node @@ -238,7 +227,7 @@ class ProjectHooks: node.func = retry(node.func) ``` ### Use Hooks to customise the dataset load and save methods -From Kedro 0.18.0 [Transformers](06_transformers.md) will be deprecated and we recommend using the `before_dataset_loaded`/`after_dataset_loaded` and `before_dataset_saved`/`after_dataset_saved` Hooks to customise the dataset `load` and `save` methods where appropriate. +We recommend using the `before_dataset_loaded`/`after_dataset_loaded` and `before_dataset_saved`/`after_dataset_saved` Hooks to customise the dataset `load` and `save` methods where appropriate. 
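To make the dataset Hooks just mentioned more tangible, here is a minimal sketch of the pattern, written as an assumption of how such a Hook could look rather than as the exact example from the full page. It times each dataset load, logs the result, and would be registered under `HOOKS` in `settings.py` like any other Hook implementation.

```python
# Illustrative sketch: time dataset loads via the dataset Hooks (not the page's own example).
import logging
import time

from kedro.framework.hooks import hook_impl


class LoadTimeHooks:
    def __init__(self):
        self._started = {}

    @property
    def _logger(self):
        return logging.getLogger(self.__class__.__name__)

    @hook_impl
    def before_dataset_loaded(self, dataset_name: str) -> None:
        # Record when the load started
        self._started[dataset_name] = time.time()

    @hook_impl
    def after_dataset_loaded(self, dataset_name: str, data) -> None:
        # Log how long the load took
        elapsed = time.time() - self._started.pop(dataset_name, time.time())
        self._logger.info("Loading %s took %.3f seconds", dataset_name, elapsed)
```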
For example, you can add logging about the dataset load runtime as follows: diff --git a/docs/source/07_extend_kedro/04_plugins.md b/docs/source/extend_kedro/plugins.md similarity index 91% rename from docs/source/07_extend_kedro/04_plugins.md rename to docs/source/extend_kedro/plugins.md index 047a798b24..f87f01a661 100644 --- a/docs/source/07_extend_kedro/04_plugins.md +++ b/docs/source/extend_kedro/plugins.md @@ -14,7 +14,7 @@ Here is a simple example of a plugin that prints the pipeline as JSON: ```python import click -from kedro.framework.session import KedroSession +from kedro.framework.project import pipelines @click.group(name="JSON") @@ -27,9 +27,8 @@ def commands(): @click.pass_obj def to_json(metadata): """Display the pipeline in JSON format""" - session = KedroSession.create(metadata.package_name) - context = session.load_context() - print(context.pipeline.to_json()) + pipeline = pipelines["__default__"] + print(pipeline.to_json()) ``` The plugin provides the following `entry_points` config in `setup.py`: @@ -176,7 +175,7 @@ See the full list of plugins using the GitHub tag [kedro-plugin](https://github. - [Kedro-Accelerator](https://github.com/deepyaman/kedro-accelerator), by [Deepyaman Datta](https://github.com/deepyaman), speeds up pipelines by parallelizing I/O in the background - [kedro-dataframe-dropin](https://github.com/mzjp2/kedro-dataframe-dropin), by [Zain Patel](https://github.com/mzjp2), lets you swap out pandas datasets for modin or RAPIDs equivalents for specialised use to speed up your workflows (e.g on GPUs) - [kedro-kubeflow](https://github.com/getindata/kedro-kubeflow), by [Mateusz Pytel](https://github.com/em-pe) and [Mariusz Strzelecki](https://github.com/szczeles), lets you run and schedule pipelines on Kubernetes clusters using [Kubeflow Pipelines](https://www.kubeflow.org/docs/components/pipelines/overview/) -- [kedro-mlflow](https://github.com/Galileo-Galilei/kedro-mlflow), by [Yolan Honoré-Rougé](https://github.com/galileo-galilei) and [Takieddine Kadiri](https://github.com/takikadiri), facilitates [MLflow](https://www.mlflow.org/) integration inside Kedro projects while enforcing [Kedro's principles](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#what-are-the-primary-advantages-of-kedro). Its main features are modular configuration, automatic parameters tracking, datasets versioning, Kedro pipelines packaging and serving and automatic synchronization between training and inference pipelines for high reproducibility of machine learning experiments and ease of deployment. A tutorial is provided in the [kedro-mlflow-tutorial repo](https://github.com/Galileo-Galilei/kedro-mlflow-tutorial). You can find more information in [the documentation](https://kedro-mlflow.readthedocs.io/en/stable/). +- [kedro-mlflow](https://github.com/Galileo-Galilei/kedro-mlflow), by [Yolan Honoré-Rougé](https://github.com/galileo-galilei) and [Takieddine Kadiri](https://github.com/takikadiri), facilitates [MLflow](https://www.mlflow.org/) integration inside Kedro projects while enforcing [Kedro's principles](../faq/faq.md#what-are-the-primary-advantages-of-kedro). Its main features are modular configuration, automatic parameters tracking, datasets versioning, Kedro pipelines packaging and serving and automatic synchronization between training and inference pipelines for high reproducibility of machine learning experiments and ease of deployment. A tutorial is provided in the [kedro-mlflow-tutorial repo](https://github.com/Galileo-Galilei/kedro-mlflow-tutorial). 
You can find more information in [the documentation](https://kedro-mlflow.readthedocs.io/en/stable/). - [kedro-neptune](https://github.com/neptune-ai/kedro-neptune), by [Jakub Czakon](https://github.com/jakubczakon) and [Rafał Jankowski](https://github.com/Raalsky), lets you have all the benefits of a nicely organized Kedro pipeline with Neptune: a powerful user interface built for ML metadata management. It lets you browse and filter pipeline executions, compare nodes and pipelines on metrics and parameters, and visualize pipeline metadata like learning curves, node outputs, and charts. For more information, tutorials and videos, go to [the documentation](https://docs.neptune.ai/integrations-and-supported-tools/automation-pipelines/kedro). - [kedro-dolt](https://www.dolthub.com/blog/2021-06-16-kedro-dolt-plugin/), by [Max Hoffman](https://github.com/max-hoffman) and [Oscar Batori](https://github.com/oscarbatori), allows you to expand the data versioning abilities of data scientists and engineers - [kedro-airflow-k8s](https://github.com/getindata/kedro-airflow-k8s), by [GetInData](https://github.com/getindata), enables running a Kedro pipeline with Airflow on a Kubernetes cluster diff --git a/docs/source/12_faq/02_architecture_overview.md b/docs/source/faq/architecture_overview.md similarity index 80% rename from docs/source/12_faq/02_architecture_overview.md rename to docs/source/faq/architecture_overview.md index 673a0302d0..e8bb893864 100644 --- a/docs/source/12_faq/02_architecture_overview.md +++ b/docs/source/faq/architecture_overview.md @@ -11,11 +11,9 @@ As a data pipeline developer, you will interact with a Kedro project, which cons * The **`conf/`** directory, which contains configuration for the project, such as data catalog configuration, parameters, etc. * The **`src`** directory, which contains the source code for the project, including: * The **`pipelines`** directory, which contains the source code for your pipelines. - * **`settings.py`** file contains the settings for the project, such as library component registration, custom hooks registration, etc. - * **`hooks.py`**, which contains custom [Hooks implementations](../07_extend_kedro/02_hooks) in the project, including both registration hooks and extension hooks. - * **`cli.py`** file contains project specific CLI commands (e.g., `kedro run`, `kedro test`, etc.). + * **`settings.py`** file contains the settings for the project, such as library component registration, custom hooks registration, etc. All the available settings are listed and explained [in the project settings chapter](../kedro_project_setup/settings.md). * **`pipeline_registry.py`** file defines the project pipelines, i.e. pipelines that can be run using `kedro run --pipeline`. - * **`__main__.py`** file serves as the main entry point of the project in [package mode](../03_tutorial/05_package_a_project.md#package-your-project). + * **`__main__.py`** file serves as the main entry point of the project in [package mode](../tutorial/package_a_project.md#package-your-project). * **`pyproject.toml`** identifies the project root by providing project metadata, including: * `package_name`: A valid Python package name for your project package * `project_name`: A human readable name for your project @@ -23,7 +21,7 @@ As a data pipeline developer, you will interact with a Kedro project, which cons ### Kedro starter -You can use a [Kedro starter](../02_get_started/06_starters) to generate a Kedro project that contains boilerplate code. 
We maintain a set of [official starters](https://github.com/kedro-org/kedro-starters/) but you can also use a custom starter of your choice. +You can use a [Kedro starter](../get_started/starters) to generate a Kedro project that contains boilerplate code. We maintain a set of [official starters](https://github.com/kedro-org/kedro-starters/) but you can also use a custom starter of your choice. ### Kedro library diff --git a/docs/source/12_faq/01_faq.md b/docs/source/faq/faq.md similarity index 100% rename from docs/source/12_faq/01_faq.md rename to docs/source/faq/faq.md diff --git a/docs/source/12_faq/03_kedro_principles.md b/docs/source/faq/kedro_principles.md similarity index 100% rename from docs/source/12_faq/03_kedro_principles.md rename to docs/source/faq/kedro_principles.md diff --git a/docs/source/02_get_started/05_example_project.md b/docs/source/get_started/example_project.md similarity index 91% rename from docs/source/02_get_started/05_example_project.md rename to docs/source/get_started/example_project.md index 606ae85634..d58a0044b8 100644 --- a/docs/source/02_get_started/05_example_project.md +++ b/docs/source/get_started/example_project.md @@ -10,7 +10,7 @@ The Iris dataset can be used by a machine learning model to illustrate classific ## Create the example project -You must first [create a project](./04_new_project.md). Feel free to name your project as you like, but here we will assume the project's repository name is `get-started`. +You must first [create a project](./new_project.md). Feel free to name your project as you like, but here we will assume the project's repository name is `get-started`. ```bash kedro new --starter=pandas-iris @@ -41,7 +41,7 @@ get-started ├── .coveragerc # Configuration file for the coverage reporting when doing `kedro test` ├── .gitignore # Prevent staging of unnecessary files to `git` ├── .ipython # IPython startup scripts -└── pyproject.toml # Identifies the project root and [contains configuration information](https://kedro.readthedocs.io/en/latest/11_faq/02_architecture_overview.html#kedro-yml) +└── pyproject.toml # Identifies the project root and [contains configuration information](../faq/architecture_overview.md#kedro-yml) ``` #### `conf/` @@ -54,7 +54,7 @@ For project-specific settings to share across different installations (for examp The folder contains three files for the example, but you can add others as you require: -- `catalog.yml` - [Configures the Data Catalog](../05_data/01_data_catalog.md#using-the-data-catalog-within-kedro-configuration) with the file paths and load/save configuration required for different datasets +- `catalog.yml` - [Configures the Data Catalog](../data/data_catalog.md#using-the-data-catalog-within-kedro-configuration) with the file paths and load/save configuration required for different datasets - `logging.yml` - Uses Python's default [`logging`](https://docs.python.org/3/library/logging.html) library to set up logging - `parameters.yml` - Allows you to define parameters for machine learning experiments e.g. train / test split and number of iterations @@ -64,7 +64,7 @@ The `local` subfolder of `conf` is used for **settings that should not be shared #### `data` -The `data` folder contains a number of subfolders to store project data. We recommend that you put raw data into `raw` and move processed data to other subfolders according to [data engineering convention](../12_faq/01_faq.md#what-is-data-engineering-convention). +The `data` folder contains a number of subfolders to store project data. 
We recommend that you put raw data into `raw` and move processed data to other subfolders according to [data engineering convention](../faq/faq.md#what-is-data-engineering-convention). The example project has a single file, `iris.csv`, that contains the Iris dataset. The subfolders of `data` are ignored by `git` through inclusion in `.gitignore` since data is more frequently stored elsewhere, such as in an S3 bucket. However, if you are familiar with [`.gitignore`](https://docs.github.com/en/github/using-git/ignoring-files) you can edit it, if you are confident that you need to manage your data in `git`. @@ -90,11 +90,11 @@ This subfolder contains the project's source code. It contains 2 subfolders: Once you have created the project, to run project-specific Kedro commands, you need to navigate to the directory in which it has been created. -Call `kedro install` to install the project's dependencies. Next, call `kedro run`: +Call `pip install -r src/requirements.txt` to install the project's dependencies. Next, call `kedro run`: ```bash cd getting-started -kedro install +pip install -r src/requirements.txt kedro run ``` @@ -123,7 +123,7 @@ This is the data engineering node function within `src/get_started/pipelines/dat | Node | Description | Node Function Name | +=================+================================================================+==========================+ | Split data | Splits the example | :code:`split_data` | -| | `Iris dataset `_ | | +| | `Iris dataset ` | | | | into train and test samples | | +-----------------+----------------------------------------------------------------+--------------------------+ ``` diff --git a/docs/source/02_get_started/03_hello_kedro.md b/docs/source/get_started/hello_kedro.md similarity index 100% rename from docs/source/02_get_started/03_hello_kedro.md rename to docs/source/get_started/hello_kedro.md diff --git a/docs/source/02_get_started/02_install.md b/docs/source/get_started/install.md similarity index 74% rename from docs/source/02_get_started/02_install.md rename to docs/source/get_started/install.md index 169e2820eb..7002bb69d0 100644 --- a/docs/source/02_get_started/02_install.md +++ b/docs/source/get_started/install.md @@ -28,8 +28,8 @@ You should see an ASCII art graphic and the Kedro version number. For example: ![](../meta/images/kedro_graphic.png) -If you do not see the graphic displayed, or have any issues with your installation, see the [frequently asked questions](../12_faq/01_faq.md), check out [GitHub Discussions](https://github.com/kedro-org/kedro/discussions) or talk to the community on the [Discord Server](https://discord.gg/akJDeVaxnB). +If you do not see the graphic displayed, or have any issues with your installation, see the [frequently asked questions](../faq/faq.md), check out [GitHub Discussions](https://github.com/kedro-org/kedro/discussions) or talk to the community on the [Discord Server](https://discord.gg/akJDeVaxnB). ## Install a development version -You can try out a development version of Kedro direct from the [Kedro Github repository](https://github.com/kedro-org/kedro) by following [these steps](../12_faq/01_faq.md#how-can-i-use-a-development-version-of-kedro). +You can try out a development version of Kedro direct from the [Kedro Github repository](https://github.com/kedro-org/kedro) by following [these steps](../faq/faq.md#how-can-i-use-a-development-version-of-kedro). 
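Whichever installation route you take, a quick supplementary check (a minimal sketch, assuming Kedro is importable from the same environment you installed it into) is to print the version directly from Python:

```python
# Confirm which Kedro installation the active environment resolves to.
# This is an illustrative check, not part of the official installation steps.
import kedro

print(kedro.__version__)  # e.g. "0.18.0", depending on what you installed
```

If the import fails or the version is unexpected, the most common cause is that a different virtual environment is active than the one Kedro was installed into.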
diff --git a/docs/source/02_get_started/04_new_project.md b/docs/source/get_started/new_project.md similarity index 91% rename from docs/source/02_get_started/04_new_project.md rename to docs/source/get_started/new_project.md index 6c224439b8..8409616e00 100644 --- a/docs/source/02_get_started/04_new_project.md +++ b/docs/source/get_started/new_project.md @@ -1,8 +1,8 @@ # Create a new project -Once you have [installed Kedro](./02_install.md), you can create a new, empty project by answering a series of questions, or by using settings recorded in a configuration file. +Once you have [installed Kedro](./install.md), you can create a new, empty project by answering a series of questions, or by using settings recorded in a configuration file. -If you want to create a Kedro project that is populated with some template or example code, you can use Kedro starters by specifying the `--starter` flag. Read the guide to [creating new projects with Kedro Starters](./06_starters.md) for more information. +If you want to create a Kedro project that is populated with some template or example code, you can use Kedro starters by specifying the `--starter` flag. Read the guide to [creating new projects with Kedro Starters](./starters.md) for more information. ## Create a new project interactively diff --git a/docs/source/02_get_started/01_prerequisites.md b/docs/source/get_started/prerequisites.md similarity index 93% rename from docs/source/02_get_started/01_prerequisites.md rename to docs/source/get_started/prerequisites.md index 2fcdee7e75..630bda1874 100644 --- a/docs/source/02_get_started/01_prerequisites.md +++ b/docs/source/get_started/prerequisites.md @@ -1,6 +1,6 @@ # Installation prerequisites -- Kedro supports macOS, Linux and Windows (7 / 8 / 10 and Windows Server 2016+). If you encounter any problems on these platforms, please check the [frequently asked questions](../12_faq/01_faq.md), [GitHub Discussions](https://github.com/kedro-org/kedro/discussions) or the [Discord Server](https://discord.gg/akJDeVaxnB). +- Kedro supports macOS, Linux and Windows (7 / 8 / 10 and Windows Server 2016+). If you encounter any problems on these platforms, please check the [frequently asked questions](../faq/faq.md), [GitHub Discussions](https://github.com/kedro-org/kedro/discussions) or the [Discord Server](https://discord.gg/akJDeVaxnB). - To work with Kedro, we highly recommend that you [download and install Anaconda](https://www.anaconda.com/products/individual#Downloads) (Python 3.x version). diff --git a/docs/source/get_started/standalone_use_of_datacatalog.md b/docs/source/get_started/standalone_use_of_datacatalog.md new file mode 100644 index 0000000000..9ca1a51de8 --- /dev/null +++ b/docs/source/get_started/standalone_use_of_datacatalog.md @@ -0,0 +1,66 @@ +# Standalone use of the `DataCatalog` + +## Introduction + +To make it easier to share a Jupyter notebook with others you need to avoid hard-coded file paths used to load or save data. One way to explore data within a shareable Jupyter notebook is take advantage of Kedro's [`DataCatalog`](../data/data_catalog.md), but in the early phases of a project, you may not want to use any other Kedro features. + +The Kedro starter with alias `standalone-datacatalog` (formerly known as `mini-kedro`) provides this minimal functionality. You can specify the sources required to load and save data using a YAML API. 
For example: + + ```yaml +# conf/base/catalog.yml +example_dataset_1: + type: pandas.CSVDataSet + filepath: folder/filepath.csv + +example_dataset_2: + type: spark.SparkDataSet + filepath: s3a://your_bucket/data/01_raw/example_dataset_2* + credentials: dev_s3 + file_format: csv + save_args: + if_exists: replace +``` + +This makes it possible to interact with data within your Jupyter notebook, with code much like this: + +```python +df = catalog.load("example_dataset_1") +df_2 = catalog.save("example_dataset_2") +``` + +## Usage + +Create a new project using the [`standalone-datacatalog` starter](https://github.com/kedro-org/kedro-starters/tree/main/standalone-datacatalog): + +```bash +$ kedro new --starter=standalone-datacatalog +``` + +## Content + +The starter comprises a minimal setup to use the traditional [Iris dataset](https://www.kaggle.com/uciml/iris) with Kedro's [`DataCatalog`](../data/data_catalog.md). + +The starter contains: + +* A `conf/` directory, which contains an example `DataCatalog` configuration (`catalog.yml`) +* A `data/` directory, which contains an example dataset identical to the one used by the [`pandas-iris`](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris) starter +* An example notebook showing how to instantiate the `DataCatalog` and interact with the example dataset +* A blank `README.md` which points to this page of documentation + +## Create a full Kedro project + +When you later wish to build a full pipeline, you can use the same configuration, with the following steps: + +***1. Create a new empty Kedro project in a new directory*** + +Let's assume that the new project is created at `/path/to/your/project`: + +```bash +kedro new +``` + +***2. Copy the `conf/` and `data/` directories from your `standalone-datacatalog` starter project over to your new project*** + +```bash +cp -fR {conf,data} `/path/to/your/project` +``` diff --git a/docs/source/02_get_started/06_starters.md b/docs/source/get_started/starters.md similarity index 71% rename from docs/source/02_get_started/06_starters.md rename to docs/source/get_started/starters.md index 59633e88b4..249076fd50 100644 --- a/docs/source/02_get_started/06_starters.md +++ b/docs/source/get_started/starters.md @@ -6,7 +6,7 @@ Kedro starters are used to create projects that contain code to run as-is, or to * To add a `docker-compose` setup to launch Kedro next to a monitoring stack * To add deployment scripts and CI/CD setup for your targeted infrastructure -A Kedro starter is a [Cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.2/) template that contains the boilerplate code for a Kedro project. You can create your own starters for reuse within a project or team, as described in the documentation about [how to create a Kedro starter](../07_extend_kedro/05_create_kedro_starters.md). +A Kedro starter is a [Cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.2/) template that contains the boilerplate code for a Kedro project. You can create your own starters for reuse within a project or team, as described in the documentation about [how to create a Kedro starter](../extend_kedro/create_kedro_starters.md). ## How to use Kedro starters @@ -26,7 +26,7 @@ To create a project using the `PySpark` starter: kedro new --starter=pyspark ``` -If no starter is provided to `kedro new`, the default Kedro template will be used, as documented in ["Creating a new project"](./04_new_project.md). 
+If no starter is provided to `kedro new`, the default Kedro template will be used, as documented in ["Creating a new project"](./new_project.md). ### Starter aliases @@ -46,13 +46,12 @@ kedro starter list The Kedro team maintains the following starters to bootstrap new Kedro projects: -* [Alias `astro-airflow-iris`](https://github.com/kedro-org/kedro-starters/tree/main/astro-airflow-iris): The [Kedro Iris dataset example project](https://kedro.readthedocs.io/en/stable/02_get_started/05_example_project.html) with a minimal setup for deploying the pipeline on Airflow with [Astronomer](https://www.astronomer.io/). -* [Alias `mini-kedro`](https://github.com/kedro-org/kedro-starters/tree/main/mini-kedro): A minimum setup to use the traditional [Iris dataset](https://www.kaggle.com/uciml/iris) with Kedro's [`DataCatalog`](../05_data/01_data_catalog.md), which is a core component of Kedro. This starter is of use in the exploratory phase of a project. For more information, please read the [Mini-Kedro](../04_kedro_project_setup/04_mini_kedro.md) guide. -* [Alias `pandas-iris`](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris): The [Kedro Iris dataset example project](./05_example_project.md) -* [Alias `pyspark-iris`](https://github.com/kedro-org/kedro-starters/tree/main/pyspark-iris): An alternative Kedro Iris dataset example, using [PySpark](../11_tools_integration/01_pyspark.md) -* [Alias `pyspark`](https://github.com/kedro-org/kedro-starters/tree/main/pyspark): The configuration and initialisation code for a [Kedro pipeline using PySpark](../11_tools_integration/01_pyspark.md) -* [Alias `spaceflights`](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights): The [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) example code - +* [Alias `astro-airflow-iris`](https://github.com/kedro-org/kedro-starters/tree/main/astro-airflow-iris): The [Kedro Iris dataset example project](../get_started/example_project.md) with a minimal setup for deploying the pipeline on Airflow with [Astronomer](https://www.astronomer.io/). +* [Alias `standalone-datacatalog`](https://github.com/kedro-org/kedro-starters/tree/main/standalone-datacatalog): A minimum setup to use the traditional [Iris dataset](https://www.kaggle.com/uciml/iris) with Kedro's [`DataCatalog`](../data/data_catalog.md), which is a core component of Kedro. This starter is of use in the exploratory phase of a project. For more information, read the guide to [standalone use of the `DataCatalog`](../get_started/standalone_use_of_datacatalog.md). This starter was formerly known as `mini-kedro`. 
+* [Alias `pandas-iris`](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris): The [Kedro Iris dataset example project](./example_project.md) +* [Alias `pyspark-iris`](https://github.com/kedro-org/kedro-starters/tree/main/pyspark-iris): An alternative Kedro Iris dataset example, using [PySpark](../tools_integration/pyspark.md) +* [Alias `pyspark`](https://github.com/kedro-org/kedro-starters/tree/main/pyspark): The configuration and initialisation code for a [Kedro pipeline using PySpark](../tools_integration/pyspark.md) +* [Alias `spaceflights`](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights): The [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) example code ## Starter versioning @@ -68,7 +67,7 @@ Under the hood, the value will be passed to the [`--checkout` flag in Cookiecutt ## Use a starter in interactive mode -By default, when you create a new project using a starter, `kedro new` launches [by asking a few questions](./04_new_project.md#create-a-new-project-interactively). You will be prompted to provide the following variables: +By default, when you create a new project using a starter, `kedro new` launches [by asking a few questions](./new_project.md#create-a-new-project-interactively). You will be prompted to provide the following variables: * `project_name` - A human readable name for your new project * `repo_name` - A name for the directory that holds your project repository @@ -78,7 +77,7 @@ This mode assumes that the starter doesn't require any additional configuration ## Use a starter with a configuration file -Kedro also allows you to [specify a configuration file](./04_new_project.md#Create-a-new-project-from-a-configuration-file) to create a project. Use the `--config` flag alongside the starter as follows: +Kedro also allows you to [specify a configuration file](./new_project.md#Create-a-new-project-from-a-configuration-file) to create a project. Use the `--config` flag alongside the starter as follows: ```bash kedro new --config=my_kedro_pyspark_project.yml --starter=pyspark diff --git a/docs/source/index.rst b/docs/source/index.rst index db88d1f663..e35a12ae43 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -23,9 +23,9 @@ Welcome to Kedro's documentation! :target: https://opensource.org/licenses/Apache-2.0 :alt: License is Apache 2.0 -.. image:: https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8-blue.svg +.. image:: https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue.svg :target: https://pypi.org/project/kedro/ - :alt: Python version 3.6, 3.7, 3.8 + :alt: Python version 3.7, 3.8, 3.9 .. image:: https://badge.fury.io/py/kedro.svg :target: https://pypi.org/project/kedro/ @@ -51,134 +51,137 @@ Welcome to Kedro's documentation! :maxdepth: 2 :caption: Introduction - 01_introduction/01_introduction + introduction/introduction .. toctree:: :maxdepth: 2 :caption: Get started - 02_get_started/01_prerequisites - 02_get_started/02_install - 02_get_started/03_hello_kedro - 02_get_started/04_new_project - 02_get_started/05_example_project - 02_get_started/06_starters + get_started/prerequisites + get_started/install + get_started/hello_kedro + +.. toctree:: + :maxdepth: 2 + :caption: Make a project + + get_started/new_project + get_started/example_project + get_started/starters + get_started/standalone_use_of_datacatalog .. 
toctree:: :maxdepth: 2 :caption: Tutorial - 03_tutorial/01_spaceflights_tutorial - 03_tutorial/02_tutorial_template - 03_tutorial/03_set_up_data - 03_tutorial/04_create_pipelines - 03_tutorial/05_package_a_project - 03_tutorial/06_visualise_pipeline - 03_tutorial/07_set_up_experiment_tracking + tutorial/spaceflights_tutorial + tutorial/tutorial_template + tutorial/set_up_data + tutorial/create_pipelines + tutorial/package_a_project + tutorial/visualise_pipeline + tutorial/set_up_experiment_tracking .. toctree:: :maxdepth: 2 :caption: Kedro project setup - 04_kedro_project_setup/01_dependencies - 04_kedro_project_setup/02_configuration - 04_kedro_project_setup/03_session - 04_kedro_project_setup/04_mini_kedro + kedro_project_setup/dependencies + kedro_project_setup/configuration + kedro_project_setup/session + kedro_project_setup/settings .. toctree:: :maxdepth: 2 :caption: Data Catalog - 05_data/01_data_catalog - 05_data/02_kedro_io + data/data_catalog + data/kedro_io .. toctree:: :maxdepth: 2 :caption: Nodes and pipelines - 06_nodes_and_pipelines/01_nodes - 06_nodes_and_pipelines/02_pipeline_introduction - 06_nodes_and_pipelines/03_modular_pipelines - 06_nodes_and_pipelines/04_run_a_pipeline - 06_nodes_and_pipelines/05_slice_a_pipeline + nodes_and_pipelines/nodes + nodes_and_pipelines/pipeline_introduction + nodes_and_pipelines/modular_pipelines + nodes_and_pipelines/run_a_pipeline + nodes_and_pipelines/slice_a_pipeline .. toctree:: :maxdepth: 2 :caption: Extend Kedro - 07_extend_kedro/01_common_use_cases - 07_extend_kedro/02_hooks - 07_extend_kedro/03_custom_datasets - 07_extend_kedro/04_plugins - 07_extend_kedro/05_create_kedro_starters - 07_extend_kedro/06_transformers - 07_extend_kedro/07_decorators + extend_kedro/common_use_cases + extend_kedro/hooks + extend_kedro/custom_datasets + extend_kedro/plugins + extend_kedro/create_kedro_starters .. toctree:: :maxdepth: 2 :caption: Logging - 08_logging/01_logging - 08_logging/02_experiment_tracking + logging/logging + logging/experiment_tracking .. toctree:: :maxdepth: 2 :caption: Development - 09_development/01_set_up_vscode - 09_development/02_set_up_pycharm - 09_development/03_commands_reference - 09_development/04_lint - 09_development/05_debugging + development/set_up_vscode + development/set_up_pycharm + development/commands_reference + development/debugging .. toctree:: :maxdepth: 2 :caption: Deployment - 10_deployment/01_deployment_guide - 10_deployment/02_single_machine - 10_deployment/03_distributed - 10_deployment/04_argo - 10_deployment/05_prefect - 10_deployment/06_kubeflow - 10_deployment/07_aws_batch - 10_deployment/08_databricks - 10_deployment/09_aws_sagemaker - 10_deployment/10_aws_step_functions - 10_deployment/11_airflow_astronomer + deployment/deployment_guide + deployment/single_machine + deployment/distributed + deployment/argo + deployment/prefect + deployment/kubeflow + deployment/aws_batch + deployment/databricks + deployment/aws_sagemaker + deployment/aws_step_functions + deployment/airflow_astronomer .. toctree:: :maxdepth: 2 :caption: Tools integration - 11_tools_integration/01_pyspark - 11_tools_integration/02_ipython + tools_integration/pyspark + tools_integration/ipython .. toctree:: :maxdepth: 2 :caption: FAQs - 12_faq/01_faq - 12_faq/02_architecture_overview - 12_faq/03_kedro_principles + faq/faq + faq/architecture_overview + faq/kedro_principles .. toctree:: :maxdepth: 2 :caption: Resources - 13_resources/01_logos - 13_resources/02_glossary + resources/logos + resources/glossary .. 
toctree:: :maxdepth: 2 :caption: Contribute to Kedro - 14_contribution/01_contribute_to_kedro - 14_contribution/02_developer_contributor_guidelines - 14_contribution/03_backwards_compatibility - 14_contribution/04_documentation_contributor_guidelines + contribution/contribute_to_kedro + contribution/developer_contributor_guidelines + contribution/backwards_compatibility + contribution/documentation_contributor_guidelines API documentation ================= diff --git a/docs/source/01_introduction/01_introduction.md b/docs/source/introduction/introduction.md similarity index 60% rename from docs/source/01_introduction/01_introduction.md rename to docs/source/introduction/introduction.md index 2cc4f05c20..3c8ff2bf77 100644 --- a/docs/source/01_introduction/01_introduction.md +++ b/docs/source/introduction/introduction.md @@ -6,19 +6,19 @@ For the source code, take a look at the [Kedro repository on Github](https://git ## Learn how to use Kedro -In the next few chapters, you will learn how to [install Kedro](../02_get_started/01_prerequisites.md) and set up your own production-ready data pipelines. +In the next few chapters, you will learn how to [install Kedro](../get_started/prerequisites.md) and set up your own production-ready data pipelines. Once you are set up, we suggest working through our examples, including: -- A typical "Hello World" example, for an [entry-level description of the main Kedro concepts](https://kedro.readthedocs.io/en/stable/02_get_started/03_hello_kedro.html) -- An [introduction to the project template](https://kedro.readthedocs.io/en/stable/02_get_started/05_example_project.html) using the Iris dataset -- A more detailed [spaceflights tutorial](https://kedro.readthedocs.io/en/stable/03_tutorial/02_tutorial_template.html) to give you hands-on experience +- A typical "Hello World" example, for an [entry-level description of the main Kedro concepts](../get_started/hello_kedro.md) +- An [introduction to the project template](../get_started/example_project.md) using the Iris dataset +- A more detailed [spaceflights tutorial](../tutorial/tutorial_template.md) to give you hands-on experience -We also recommend the [frequently asked questions](../12_faq/01_faq.md) and the [API reference documentation](/kedro.rst) for additional information. +We also recommend the [frequently asked questions](../faq/faq.md) and the [API reference documentation](/kedro.rst) for additional information. ## Assumptions -We have designed the documentation and the [spaceflights tutorial](../03_tutorial/01_spaceflights_tutorial.md) for anyone new to Kedro. The more knowledge of Python you have, the easier you will find the learning curve. +We have designed the documentation and the [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) for anyone new to Kedro. The more knowledge of Python you have, the easier you will find the learning curve. ```eval_rst .. note:: There are a number of excellent online resources for learning Python, but you should choose those that reference Python 3, as Kedro is built for Python 3.6+. There are many curated lists of online resources, such as the `official Python programming language website `_ and `this list of free programming books and tutorials `_. 
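Before following those links, the main concepts the introduction refers to (nodes, pipelines, the `DataCatalog` and runners) can be illustrated in a few lines. This is a minimal, self-contained sketch rather than the documented "Hello World" example itself; the function and dataset names are invented for illustration:

```python
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import Pipeline, node
from kedro.runner import SequentialRunner


def join_words(first: str, second: str) -> str:
    """A node function is an ordinary Python function."""
    return f"{first} {second}"


# The catalog maps dataset names to data; here everything is held in memory.
catalog = DataCatalog({"first": MemoryDataSet("Hello"), "second": MemoryDataSet("Kedro")})

# The pipeline wires node inputs and outputs together by dataset name.
greeting_pipeline = Pipeline([node(join_words, ["first", "second"], "greeting")])

# The runner executes the pipeline against the catalog and returns any free outputs.
print(SequentialRunner().run(greeting_pipeline, catalog))  # {'greeting': 'Hello Kedro'}
```

The Hello World page and the spaceflights tutorial linked above cover the same ideas in more depth.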
diff --git a/docs/source/04_kedro_project_setup/02_configuration.md b/docs/source/kedro_project_setup/configuration.md similarity index 92% rename from docs/source/04_kedro_project_setup/02_configuration.md rename to docs/source/kedro_project_setup/configuration.md index e2a39ea68e..f8ba8b3a15 100644 --- a/docs/source/04_kedro_project_setup/02_configuration.md +++ b/docs/source/kedro_project_setup/configuration.md @@ -4,7 +4,7 @@ This section contains detailed information about configuration, for which the re ## Configuration root -We recommend that you keep all configuration files in the `conf` directory of a Kedro project. However, if you prefer, you may point Kedro to any other directory and change the configuration paths by setting the `CONF_ROOT` variable in `src//settings.py` as follows: +We recommend that you keep all configuration files in the `conf` directory of a Kedro project. However, if you prefer, you may point Kedro to any other directory and change the configuration paths by setting the `CONF_ROOT` variable in [`src//settings.py`](settings.md) as follows: ```python CONF_ROOT = "new_conf" ``` @@ -16,12 +16,11 @@ Kedro-specific configuration (e.g., `DataCatalog` configuration for IO) is loade ```python from kedro.config import ConfigLoader -conf_paths = ["conf/base", "conf/local"] -conf_loader = ConfigLoader(conf_paths) +conf_loader = ConfigLoader(conf_root="conf", env="local") conf_catalog = conf_loader.get("catalog*", "catalog*/**") ``` -This will recursively scan for configuration files firstly in `conf/base/` and then in `conf/local/` directory according to the following rules: +This recursively scans for configuration files firstly in `conf/base/` (`base` being the default environment) and then in `conf/local/` (`local` being the designated overriding environment) directory according to the following rules: * *Either* of the following is true: * filename starts with `catalog` @@ -60,23 +59,14 @@ export KEDRO_ENV=test ## Template configuration -Kedro also provides an extension [TemplatedConfigLoader](/kedro.config.TemplatedConfigLoader) class that allows you to template values in configuration files. To apply templating in your project, you will need to update the `register_config_loader` hook implementation in your `src//hooks.py`: +Kedro also provides an extension [TemplatedConfigLoader](/kedro.config.TemplatedConfigLoader) class that allows you to template values in configuration files. To apply templating in your project, you will need to set the `CONFIG_LOADER_CLASS` constant in your `src//settings.py`: ```python from kedro.config import TemplatedConfigLoader # new import - -class ProjectHooks: - @hook_impl - def register_config_loader(self, conf_paths: Iterable[str]) -> ConfigLoader: - return TemplatedConfigLoader( - conf_paths, - globals_pattern="*globals.yml", # read the globals dictionary from project config - globals_dict={ # extra keys to add to the globals dictionary, take precedence over globals_pattern - "bucket_name": "another_bucket_name", - "non_string_key": 10, - }, - ) +... +CONFIG_LOADER_CLASS = TemplatedConfigLoader +... ``` Let's assume the project contains a `conf/base/globals.yml` file with the following contents: @@ -347,7 +337,7 @@ except MissingConfigException: .. note:: The ``kedro.framework.context.KedroContext`` class uses the approach above to load project credentials. ``` -Credentials configuration can then be used on its own or [fed into the `DataCatalog`](../05_data/01_data_catalog.md#feeding-in-credentials). 
+Credentials configuration can then be used on its own or [fed into the `DataCatalog`](../data/data_catalog.md#feeding-in-credentials). ### AWS credentials @@ -355,7 +345,7 @@ When working with AWS credentials on datasets, you are not required to store AWS ## Configure `kedro run` arguments -An extensive list of CLI options for a `kedro run` is available in the [Kedro CLI documentation](../09_development/03_commands_reference.md#run-the-project). However, instead of specifying all the command line options in a `kedro run` via the CLI, you can specify a config file that contains the arguments, say `config.yml` and run: +An extensive list of CLI options for a `kedro run` is available in the [Kedro CLI documentation](../development/commands_reference.md#run-the-project). However, instead of specifying all the command line options in a `kedro run` via the CLI, you can specify a config file that contains the arguments, say `config.yml` and run: ```console $ kedro run --config config.yml diff --git a/docs/source/04_kedro_project_setup/01_dependencies.md b/docs/source/kedro_project_setup/dependencies.md similarity index 50% rename from docs/source/04_kedro_project_setup/01_dependencies.md rename to docs/source/kedro_project_setup/dependencies.md index 626d121a1b..3261b94da4 100644 --- a/docs/source/04_kedro_project_setup/01_dependencies.md +++ b/docs/source/kedro_project_setup/dependencies.md @@ -9,45 +9,23 @@ You can add or remove dependencies. For a new project, edit `src/requirements.tx kedro build-reqs ``` -The `build-reqs` command will: - -1. Generate `src/requirements.in` from the contents of `src/requirements.txt` -2. [pip compile](https://github.com/jazzband/pip-tools#example-usage-for-pip-compile) the requirements listed in `src/requirements.in` -3. Regenerate `src/requirements.txt` to specify a list of pinned project dependencies (those with a strict version) +The `build-reqs` command will [pip compile](https://github.com/jazzband/pip-tools#example-usage-for-pip-compile) the requirements listed in `src/requirements.txt` into a `src/requirements.lock` that specifies a list of pinned project dependencies (those with a strict version). ```eval_rst -.. note:: ``src/requirements.in`` contains "source" requirements, while ``src/requirements.txt`` contains the compiled version of those and requires no manual updates. +.. note:: ``src/requirements.txt`` contains "source" requirements, while ``src/requirements.lock`` contains the compiled version of those and requires no manual updates. ``` -To further update the project requirements, you should modify `src/requirements.in` (not `src/requirements.txt`) and re-run `kedro build-reqs`. +To further update the project requirements, you should modify `src/requirements.txt` (not `src/requirements.lock`) and re-run `kedro build-reqs`. -## `kedro install` +## Install project-specific dependencies To install the project-specific dependencies, navigate to the root directory of the project and run: ```bash -kedro install -``` - -`kedro install` automatically compiles project dependencies by running `kedro build-reqs` behind the scenes if the `src/requirements.in` file doesn't exist. - -To skip the compilation step and install requirements as-is from `src/requirements.txt`, run the following: -```bash -kedro install --no-build-reqs -``` - -This takes the latest version of a dependency that is available within the range specified. It allows flexibility in the version of the dependency that `pip` installs. 
For example, if `ipython>=7.0.0,<8.0` is specified, then the most up-to-date version available is installed. - - -To force the compilation, even if `src/requirements.in` already exists, run the following: - -```bash -kedro install --build-reqs +pip install -r src/requirements.txt ``` -In some cases, such as a production setting, this is useful to eliminate ambiguity and specify exactly the version of each dependency that is installed. - ## Workflow dependencies To install all of the dependencies recorded in Kedro's [`setup.py`](https://github.com/kedro-org/kedro/blob/develop/setup.py) run: @@ -58,7 +36,7 @@ pip install "kedro[all]" ### Install dependencies related to the Data Catalog -The [Data Catalog](../05_data/01_data_catalog.md) is your way of interacting with different data types in Kedro. The modular dependencies in this category include `pandas`, `numpy`, `pyspark`, `matplotlib`, `pillow`, `dask`, and more. +The [Data Catalog](../data/data_catalog.md) is your way of interacting with different data types in Kedro. The modular dependencies in this category include `pandas`, `numpy`, `pyspark`, `matplotlib`, `pillow`, `dask`, and more. #### Install dependencies at a group-level diff --git a/docs/source/04_kedro_project_setup/03_session.md b/docs/source/kedro_project_setup/session.md similarity index 85% rename from docs/source/04_kedro_project_setup/03_session.md rename to docs/source/kedro_project_setup/session.md index c95f66e6da..995bddde7d 100644 --- a/docs/source/04_kedro_project_setup/03_session.md +++ b/docs/source/kedro_project_setup/session.md @@ -14,7 +14,7 @@ The main methods and properties of `KedroSession` are: - `create()`: Create a new instance of ``KedroSession`` with session data - `load_context()`: Instantiate `KedroContext` object - `close()`: Close the current session — although we recommend that you [use the session object as a context manager](#create-a-session), which will call `close()` automatically, as opposed to calling the method explicitly -- `run()`: Run the pipeline with the arguments provided; see [Running pipelines](../06_nodes_and_pipelines/04_run_a_pipeline) for details +- `run()`: Run the pipeline with the arguments provided; see [Running pipelines](../nodes_and_pipelines/run_a_pipeline) for details ### Create a session @@ -37,13 +37,3 @@ You need to tell `KedroSession` the package name of your Kedro project so it can - `env`: Environment for the `KedroContext` - `extra_params`: Optional dictionary containing extra project parameters for the underlying `KedroContext`; if specified, this will update (and therefore take precedence over) parameters retrieved from the project configuration - -When you want to access to the most recent session object, use the helper function `get_current_session()` as follows: - -```python -from kedro.framework.session import get_current_session - -session = get_current_session() -context = session.load_context() -context.catalog.load("my_data").head() -``` diff --git a/docs/source/kedro_project_setup/settings.md b/docs/source/kedro_project_setup/settings.md new file mode 100644 index 0000000000..41ffefb71e --- /dev/null +++ b/docs/source/kedro_project_setup/settings.md @@ -0,0 +1,31 @@ +# Project settings + +A Kedro project's `settings.py` file contains the application configuration for the project, including registration of hooks and library components. This page explains how settings work and which settings are available. + +By default, all code in `settings.py` is commented out. 
In the case that settings are not supplied, Kedro chooses sensible default values. You only need to edit `settings.py` if you wish to change to values other than the defaults. + +## Available settings + +```eval_rst ++-----------------------------------+---------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Setting | Default value | Use | ++===================================+=========================================================+=========================================================================================================================================================+ +| :code:`HOOKS` | :code:`tuple()` | Instantiate and list your `project hooks <../extend_kedro/hooks.md>`_. | ++-----------------------------------+---------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+ +| :code:`DISABLE_HOOKS_FOR_PLUGINS` | :code:`tuple()` | List the installed plugins for which to disable auto-registry of hooks. | ++-----------------------------------+---------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+ +| :code:`SESSION_STORE_CLASS` | :code:`kedro.framework.session.session.BaseSessionStore`| Define where to store data from a :code:`KedroSession`. | ++-----------------------------------+---------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+ +| :code:`SESSION_STORE_ARGS` | :code:`dict()` | Define keyword arguments to be passed to :code:`SESSION_STORE_CLASS` constructor. | ++-----------------------------------+---------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+ +| :code:`CONTEXT_CLASS` | :code:`kedro.framework.context.KedroContext` | Define custom context class. | ++-----------------------------------+---------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+ +| :code:`CONF_SOURCE` | :code:`"conf"` | Define the configuration folder. | ++-----------------------------------+---------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+ +| :code:`CONFIG_LOADER_CLASS` | :code:`kedro.config.ConfigLoader` | Define the project :code:`ConfigLoader` class. | ++-----------------------------------+---------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+ +| :code:`CONFIG_LOADER_ARGS` | :code:`dict()` | Define keyword arguments to be passed to :code:`CONFIG_LOADER_CLASS` constructor. These kwargs depend on the :code:`ConfigLoader` class implementation. 
| ++-----------------------------------+---------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+ +| :code:`DATA_CATALOG_CLASS` | :code:`kedro.io.DataCatalog` | Define the project :code:`DataCatalog` class. | ++-----------------------------------+---------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+ +``` diff --git a/docs/source/08_logging/02_experiment_tracking.md b/docs/source/logging/experiment_tracking.md similarity index 92% rename from docs/source/08_logging/02_experiment_tracking.md rename to docs/source/logging/experiment_tracking.md index f041700047..c06837869d 100644 --- a/docs/source/08_logging/02_experiment_tracking.md +++ b/docs/source/logging/experiment_tracking.md @@ -10,7 +10,7 @@ However, Kedro was missing a way to log metrics and capture all this logged data Experiment tracking in Kedro adds in the missing pieces and will be developed incrementally. -The following section outlines the setup within your Kedro project to enable experiment tracking. You can also refer to [this tutorial](../03_tutorial/07_set_up_experiment_tracking.md) for a step-by-step process to access your tracking datasets on Kedro-Viz. +The following section outlines the setup within your Kedro project to enable experiment tracking. You can also refer to [this tutorial](../tutorial/set_up_experiment_tracking.md) for a step-by-step process to access your tracking datasets on Kedro-Viz. ## Enable experiment tracking Use either one of the [`tracking.MetricsDataSet`](/kedro.extras.datasets.tracking.MetricsDataSet) or [`tracking.JSONDataSet`](/kedro.extras.datasets.tracking.JSONDataSet) in your data catalog. These datasets are versioned by default to ensure a historical record is kept of the logged data. @@ -56,4 +56,4 @@ def create_pipeline(**kwargs): ``` ## Community solutions -You can find more solutions for experiment tracking developed by the Kedro community on the [plugins page](../07_extend_kedro/04_plugins.md#community-developed-plugins). +You can find more solutions for experiment tracking developed by the Kedro community on the [plugins page](../extend_kedro/plugins.md#community-developed-plugins). diff --git a/docs/source/08_logging/01_logging.md b/docs/source/logging/logging.md similarity index 81% rename from docs/source/08_logging/01_logging.md rename to docs/source/logging/logging.md index 8db8bc9c1c..60d8b61589 100644 --- a/docs/source/08_logging/01_logging.md +++ b/docs/source/logging/logging.md @@ -4,7 +4,7 @@ Kedro uses, and facilitates, the use of Python’s `logging` library by providin ## Configure logging -You can customise project logging in `conf//logging.yml` using [standard Kedro mechanisms for handling configuration](../04_kedro_project_setup/02_configuration.md). The configuration should comply with the guidelines from the `logging` library. Find more about it in [the documentation for `logging` module](https://docs.python.org/3/library/logging.html). +You can customise project logging in `conf//logging.yml` using [standard Kedro mechanisms for handling configuration](../kedro_project_setup/configuration.md). The configuration should comply with the guidelines from the `logging` library. 
Find more about it in [the documentation for `logging` module](https://docs.python.org/3/library/logging.html). ## Use logging diff --git a/docs/source/meta/images/KedroArchitecture.drawio b/docs/source/meta/images/KedroArchitecture.drawio index f546a57896..0d32eeeebd 100644 --- a/docs/source/meta/images/KedroArchitecture.drawio +++ b/docs/source/meta/images/KedroArchitecture.drawio @@ -1 +1 @@ -7V1Zc9s4Ev41rt19kIr38egjM8nEkzjrTTl5clEkJCEmCRUJ2VJ+/QC8CUA0LfFSNs5DRBAkpa+7P3Q3GuCFeh3s/oyczfpv5AH/QpG83YV6c6Eoii0b5D/ask9bZFmV0pZVBL2srWy4hz9B1ph320IPxLWOGCEfw0290UVhCFxca3OiCL3Uuy2RX3/qxlkBruHedXy+9QF6eJ22WrpUtr8HcLXOnyxL2ZnAyTtnDfHa8dBLpUl9d6FeRwjh9FOwuwY+RS/HJb3ujwNniy8WgRC3uSB+MDYftlsX4PhG/vIHfrI+/5xp6V2eHX+b/eALxfDJ/a4W5MOKfvgIvAjRJ0XoB0U4O08eVHTJfiDe56hFaBt6gD5YIqdf1hCD+43j0rMvRFFI2xoHPjmSycdnEGFIEL/04SokbRjRDkvo+9fIR1FyR9XTgeVppD3GEXoClTOWslANo/gWVUAyjOgDwK7SlAH0J0ABwNGedMkVNpPVvq6EL6XklVx/1xWpG7nQnUzbVsWdS4GQD5lMxPJZP/58Dj4+f/sWvH8w/tq49vP3TzOFk08cufSWJyHuOfE66Zsf3DkYgyhMWhTpsFCc7MgHSyp331kA/8pxn1bJ43OhhCgkz71C5PkQUyh1qRvxaKx4ePnItkA+qtWBfIT2I5vtDWgZOQF4QdHTSCYEZGJEpsiEbMNUnX5MyDB0TkaaLpCRrPcmI05Ed3ADfEiUVJFuwDPw0QZEnAAIX2/ox23gX7qYQlXAfEv1/g7FEENE4V4gjFFwUA4VQaEtpg++Lkarjixjpth13FWBbYip63TUravPd9rtl/U7A351d0HwfQb3M1XlEAUeGVqzQxThNVqh0PHfla1XdaUv+9wiCmSC4A+A8T7zE5wtRnV8wQ7ib/TyuZ4dfa+cudlld04O9vlBSH5v5SJ6+L16rrwsOcqvq7HoQRnGaBu5oEE9syEYO9EK4IZ+SgYoRbFRJSLgOxg+1x2Y7uXLuw7RNoyFQk/spS6ofCRxCV4gEphOAD0v1QkQw5/OIrkfhX2DYIiTX6NfXeg3QuwbNZKzqsJ7zJ5Sc9CE1ibNJVkzawaXjdStwc9ufkd/TXmXmT3P75tfhJbLmCgGK6/iWx1PjDonQYLD8mTvogM6M/UatppgnFcEZKbpfQ0h+qhkVqGyktjEZNYRKdktSUkbi5PEYjI4jV6BEEQOBhMhpkbtOp2YpHnOQzmfyKfRUu7UqfW72kNRlM0J9KB7HRNlJRKLR/Kul8ul4roi79ozFobekXetG3U3z+K9a6Gbp/XnXUuciDZOSGhoBiPIW93QI4khm/P6YFJkoKqjSW6CNcyKKzsHzTgb57ij8aTI3b3q5dpjjShin9EcU1DyRfuBv4xiajFMGdL0G8W0FrA6LQHnruVvAXcmYGtaAubTP9t4Ku5go0p2EqcaUj7WnOgCkntZFuNbkjBYHcoTVHg5XiMikx3mRPmKV+HEmzT3toQ7ahidZD3lul9mSrxjZgr8MrO39NuoEetbeK0rflLbehjypPhJ5WNWNwLTiVgbtasTipKtPJd4IkWJGGowelI5Md6DOKbzAxOgJ70eAg3JTo1efgWtPIz34SJyskf0HLFbLhBH7AtL17qas2TmwyxBxC6eD1P7Al/lVZWMpEu4KlIrUZ43uUWOJ5gaG1+DbYUP4gfVYMH0RzmrOD5cMpMnGh8vfpj775aMFJNQruJnTwYtvqzgA7nR5ymAVRRTTAYsi+ez2w+nDSA94GTqgryjcBarN6D4Ufc9Qk/jJ2lZqAx7bKg0fozMHRQScYKQOnYn4tbOR9HpP5GPYiR/9AoSBVfa079+ZhsMwUSscLpB7S2u1cbNnB+Tr5NHyddpbetKus/XJZdeRpGzr3TIYs2DwZuuso4yU876Sn/TlBjlSr9Bp1Gexnt9/yZjzX9I052/XcFTOaEDmzXkuqdsGQNyqdhk7d8m285k9bYm2/kcynEmy3o4tt5ssmx/eQiT5auc3u1w5JCmaTg/hqwwM9SmyecIBjbZUac9jzPZuSQZjNka5iuGmxzdgQgS2GjqtFNrNlpa88SmvDU+nILBxgcByMhgojnpXGe7qKLSLKbgSUkPT63uNOaGVr+ov4y0xmdcrrcxrVNXpNs8ySpdo2CDQqFoxydCOx/oRosDDR7ETYSekzVpkzCFQyi/WemL4tncb+yimFY8ecRBSiwAw3AVzzcn5v070EKm9Fg227rPfcHFhxybLNEcT7BUWxasmRsWL0FVuw8nqFmKVdDdaFgJ2G0fkx/5NDpYgrpN2xTh1V/lphAxPjGfITbVcldFsABvaNB4jzKmICx9+vQpgqaq44MmKKwOEQaLNGidHPFrrXP2vQHGV4ht9tmq9Tnxe/2pQabrY/O/rHCQBTCEs6d0rmNydqkJClcGt0vef+WQAqF3STefIEeu78QxdOvg1FIMr2dWDmL5asqggpOoyCRvO7U2VGGSztxkb5otyS4rBcDdibEQgxFimkvhbvPWxOWMLddUzObJBu4Co5bpPDlzKVY0gedfxJ6S52Caw3TJfz5aXSjXfPUOURgnSWzFyWnJjYBHNAg6ftYAsDvndbc5jM22fjghiG3eNYLX9WYr7KICUZHU+mLebkqmdSaJJDPa3EnuR4wOHwSdyFFHLj44gduqedOmPMJ0OJDhFHbTkbYcaDFxjzoQCcq2MgCp8SFnBFYwTlmqrP4kVlwmJllic0KPCjH1hKUiO+1gYVXGmRBalyXVPRHajNGYPhZ/iMERrQM+Mz6bKk/J7BYwR/pqsirNJcUu/5h9LHqjLbXbWWShAip8MF7xxSoJWYk4Y0TzBuegtiyTG9KUWYaZiZj1sSGKWMp8BuHsaOZVt0mfNh0pusKG7kc6TopG7jQCIamSPh8gPMx18wAlFbs1OgHNzISLmP6XFEZT3yogPpTAswLzFYkIpSz/I0VbfvnSebhT3U2M9kh0qjyOOyVYrHZ2PDdV/tLYvUCPdKd0mePBvghLz5Ip/dIVn80q1rlKYLchRu5glEiuMs1xntxzDrmpsVysXyAzNVnqYdPlx1KPyc2e9EY9pjUA9TQVcZWekkSe46Rp9fOknTPIII1FO7pgbripjm+C+2QMtzhBnG1vG1/mc8ojL06wmM3CTXZ7/Ff6a1lZdq/M
pPJqWdTo3hf7GTL6OX5drinaeH3YBQq8G1GZRngbfU+8MFdX7LltW5aikk+aLNe3ODF728BbFzhmY60BUZm/ZqIkB+yajpHJs/VaEK3tpOYwa0F0PlGw2ELfm4iNNeptN9t86/nqxVOXfvSyFVGjstVXvKVr1i9+mTce2Mx4bQlKwwZ94YHOTxH9nw5JXbwNRwzxhHZj7W5I6n0/TzGWbRcbT2x5os57fuVqtq/xudMa9yIXW7BddV+8JnwFFR+izOd81eAEaoMNay5aON1febAQLt5jSuqUprgASRGg1Vf5uRAr0dvnfueCm8mz7TQ6W2x6ZC6YCH+Uqp5c6bpKvwj1jx9KKnlhN0/E5HWG/Gz52VUeNlrhhNPGbElQH4vohdjwodTjY+DA8PFxgoSuDbn2VgiX6G2Ivwm9E0Ln3mF5JKFr6mB1BZrabV2BUOX4taUVDk+0gso5+6a4KGpiqRyv6dsgy7lAGCbq4T7R9wMrUoA8fl/X86D3zrZFOe9CqE+fvn7436X/5Yv2Ufvx12L1sH3YCWKbvJT3Mc3dRPuOeF4AP0c+bXle1QZ03IW4KWfG8wypt8J+LJ7vyHFXjcGKOFSj2yIOocrxcXWF52vs/S9BXStbB7sp3wAcOsHbN/MZhttbc3ZdoQ+b7K/B7eQwQghXuxPWXf9Nxmja4x8= +7V3Zctu4Ev0a1dx5kIr78uglM8nEN3Gub8rJk4siIQkxSahIyJby9QNwJwDTtMRNM5EfLIKbeLr7oLvRAGfqVbD/M3K2m/8iD/gzRfL2M/V6piiKrljkH205pC2WYqQN6wh6aZNcNtzBnyBrlLLWHfRAXDsQI+RjuK03uigMgYtrbU4Uoef6YSvk1++6ddaAa7hzHZ9vvYce3mRPoUtl+3sA15v8zrKU7Qmc/OCsId44HnquNKnvZupVhBBOvwX7K+BT8HJc0vP+eGFv8cMiEOI2J8T3xvbDbucCHF/LX/7Aj9bnn3MtvcqT4++yB54phk+ud7kkX9b0y0fgRYjeKUI/KMLZfnKj4pDsAfEhRy1Cu9AD9MYS2f28gRjcbR2X7n0mekLaNjjwyZZMvj6BCEOC+IUP1yFpw4gesIK+f4V8FCVXVD0dWJ5G2mMcoUdQ2WMpS9Uwil9RBSTDiN4A7CtNGUB/AhQAHB3IIbm+ZrI61JXwuZS8omZtm4rUjVzoTqZt6+LKpUDIl0wmYvlsHn4+BR+fvn0L3t8bf21d++n7p7nCySeOXHrJkxD3nHiTHJtv3DoYgyhMWhTpZaE42ZYPVlTuvrME/qXjPq6T2+dCCVFI7nuJyP0hplDqUjfi0Vjx8PKRbYF8VKsD+QjtRzbbG9AqcgLwjKLHkUwIyMSITJEJ2YapOv2YkGHonIw0XSAjWe9NRpyIbuEW+JAoqSJdgyfgoy2IOAEQvt7Sr7vAv3AxhaqA+Ybq/S2KIYaIwr1EGKPgRTlUBIV2mN74quitOrKMuWLXcVcFtiGmrtNRty4/32o3XzbvDPjV3QfB9zk8zFWVQxR4pGvNNlGEN2iNQsd/V7Ze1pW+POYGUSATBH8AjA+Zn+DsMKrjC/YQf6OnL/Rs63tlz/U+u3Kyccg3QvK8lZPo5vfqvvK0ZCs/r8aiL8owRrvIBQ3qmXXB2InWADccp2SAUhQbVSICvoPhU92B6V6+vOsQ7cJYKPTEXuqCynsSl+AFIoHpBNDzUp0AMfzpLJPrUdi3CIY4eRr9cqZfC7Fv1EjOqgrvMbtLzUETWpu0kGTNrBlc1lO3Bj+7+C19mvIqc3uRXzc/Ca1WMVEMVl7FrzqeGHVOggSH1cneRQd0Zuo1bDVBP68IyEzT++pC9FHJrEJlJbGJyawjUrJbkpI2FieJxWRwGr0GIYgcDCZCTI3adToxSYuch3I+kU+jpdypU+tXtYeiKJsT6IvudUyUlUgsHsm7Xq1WiuuKvGvPWBp6R961btTdPIv3roVuntafdy1xIto6IaGhOYwgb3VD9ySGbC7qnYmsCnqT3ARrmBVndg6acTbOcUf9SZG7e9XLtcfqUcQ+ozmmoORZ+46/jGJqMUwZ0vQbxbQWsDotAeeu5S8BdyZga1oC5tM/u3gq7mCjSnYSpxpS3tec6AKSa1kW41uSMFgdyhNUeDleISKTPeZE+YpX4cTbNPe2gntqGJ1kPeW6X2ZKvGNmCvwys7f026gR61t4rSt+Utt6GPKk+EnlY1Y3AtOJWBu1qxOKkq08l3giRYkYajB6Ujkx3oE4puMDE6AnvR4CDclOjV5+Ba08jPfhMnKyW/QcsVsuEEfsS0vXuhqzZMbDLEHELh4PU/sCX+VVlfSkK7guUitRnje5QY4nGBobX4NthQ/iB9VgwfBHOao4PlwykycaHy++m/vfjvQUk1Cu4rEngxZfVvCBXOjzFMAqiikmA5bF89nNh9M6kB5wMnVB3lE4itUbUHyv+x6hx/GTtCxUhj02VBrfR+YOCok4QUgduxNxa+ej6PRP5KMYyYeeQaLgSnv66We0wRAMxAqHG9Te4lpt3Mz5Mfk6eZR8nda2rqT7fF1y6kUUOYfKAVms+WLwpquso8yUs75yvGlKjHKlv6DTKE/jvb7/kL7md9J06+/W8FRO6MBmDbnuKVvGgFwqNln7l8m2M1m9rcl2PoZynMmyHo6tN5sse7w8hMnyVU7v9jhySNM0nB9DVpgRatPkcwQDm+yow57HmexCkgzGbA3zFcNNtm5BBAlsNHXaqTUbLa15YkPeGh9OwWDrgwBkZDDRnHSus11UUWkWU/CkpJunVncaC0Orn9RfRlrjMy5Xu5jWqSvSTZ5kla5QsEWhULTjE6Gdd3SjxYEGD+I2Qk/JnLRJmMJLKL9Z6Yvi2dxv7KKYVjx4xEFKLADDcB0vtifm/TvQQqb0WM6ZYIjaYzGv8RqYJZrjCZZqy4I5c8PiJahq9+EENYvw3aKtn9cXVgJ2O8TkIR9HB0tQt2mbIrz6q9wUIsYn5jPEplruqggm4A0NGu9RxhSElU/vPkXQVHV80ASF1SHCYJkGrZMjfq11zr43wPgKse0hm7W+IH6vPzXIdH1s/pcVDrIAhnD+mI51TM4uNUHhyuB2yfuvHFIg9C7o4hNky/WdOIZuHZxaiuH1zMqLWL6aMqjgJCoyydtOrQ1VmKQzN9ibZkuy00oBcFdiLMRghJjmUrjLvDVxOWfLNRWzebCBO8GoZTpPzlyKFU3g+Rexp+Q5mOYwXfLPR+uZcsVX7xCFcZLEVpzsltwIeESDoONnDQC7C153m8PYbOmHE4LY5lUjeF1vtsIuKhAVSa1P5u2mZFpnkkgyo82d5H7E6PBB0IkcdeTkgxO4rZo3bcojTIcDGU6xmdr4thxoMXFPnpDtnQSlIUiNDzkjsIZxylJl9Sex4jIxyRKbE3pUiKknLBXZaQcLqzLOhNC6LKnuidDmjMb0MflDDI5oHvCZ8dlUeUpml4A50leTVWkhKXb5Ydax6Iu28kLpXmlL4YPxii9WSchKxBkjmjc4B7VlmdyQpswyzEjEvI8FUcRS5jMIZ0czr7pN+
rTpSFFlNnQ/0nEiv2ahj0BIiqUtBvCkct18gZKK1RqdgGZmwmVM/yWF0dS3CogPJfCswGJNIkIpy/9I0Y6fvnQe7lR3A6M9Ep0qj+NOCSarnR3PTZW/NHYt0CPdKb2shuibsPQsmdIvXfHZrGKeqwT2W2LkDkaJ5CrDHOfJPeeQmxrLxfoHZKYmSz1suvxY6jG50ZPeqMe0BqCepiKu0lOSyH2cNK1+nrRzBhmksWhHF4wNN9XxTXCdjOEmJ4iz7W3jy3xMeeTJCRazWLjJLo//yvFaVpbdKzOpvFoWNbp3xXqGjH6OX5drihZeH3aCAu9GVIYR3kbfEy/M1RV7YduWpajkmybL9SVOzN4W8NYFjtlYc0BU5tNMlGSDndMxMnm2nguitR3UHGYuiM4nCpY76HsTsbFGve1mmW89n7146tSPXpYialS2+oy3dM767B/zxgOb6a8tQWnYoC880Pkhon9pl9TF23DEEE9oNdbuuqTe1/MUY9l2svHEpifqvOdXzmb7Gp87rXEvcrEFy1X3xWvCV1DxIcpiwVcNTqA22LAWoonT/ZUHC+HiO9+Hh8CB4cPDBOcgaa0Dub7gEr0/61c6uJk/W+ZWVXbN/yPTwZoy2EiUpnQ7EiVUOX42UiUbnGgFlXP2S3ExDM4OleMNfX9YmT2GYaIe7iN9o6QiBcjjVwKcXCK50SYnnEceYuj806evH/5/4X/5on3Ufvy1XN/v7veC3jAv/npIvf3oMEGeV/M1P4fgeSFuCofbL57viOcLj/FEnld1fSieV3WjU54Xqhyfu6rwfI29fxNUQrGVU9vynZGhE7x9+Yfhub3RDv9d3E42yxctp4eXr6tW3/0N diff --git a/docs/source/meta/images/kedro_architecture.png b/docs/source/meta/images/kedro_architecture.png index 297049efc5..7db347260e 100644 Binary files a/docs/source/meta/images/kedro_architecture.png and b/docs/source/meta/images/kedro_architecture.png differ diff --git a/docs/source/06_nodes_and_pipelines/03_modular_pipelines.md b/docs/source/nodes_and_pipelines/modular_pipelines.md similarity index 85% rename from docs/source/06_nodes_and_pipelines/03_modular_pipelines.md rename to docs/source/nodes_and_pipelines/modular_pipelines.md index de27833461..1d291b6661 100644 --- a/docs/source/06_nodes_and_pipelines/03_modular_pipelines.md +++ b/docs/source/nodes_and_pipelines/modular_pipelines.md @@ -6,7 +6,7 @@ In many typical Kedro projects, a single (“main”) pipeline increases in comp ## How do I create a modular pipeline? -You can use a [project-specific CLI command](../09_development/03_commands_reference.md#kedro-commands) to create a modular pipeline. The pipeline name must adhere to [generic Python module naming rules](https://realpython.com/python-pep8/#naming-conventions): +You can use a [project-specific CLI command](../development/commands_reference.md#kedro-commands) to create a modular pipeline. The pipeline name must adhere to [generic Python module naming rules](https://realpython.com/python-pep8/#naming-conventions): * Can only contain alphanumeric characters and underscores (`A-Za-z0-9_`) * Must start with a letter or underscore @@ -62,8 +62,7 @@ Finally, `kedro pipeline create ` also creates a placeholder for For ease of use and portability, consider these recommendations as you develop a modular pipeline: * A modular pipeline should include a `README.md`, with all the information regarding its execution -* A modular pipeline _may_ have external dependencies specified in `requirements.txt`. These dependencies are _not_ - currently installed by the [`kedro install`](../09_development/03_commands_reference.md#install-all-package-dependencies) command, so users of your pipeline would have to run `pip install -r src//pipelines//requirements.txt` before using the pipeline +* A modular pipeline _may_ have external dependencies specified in `requirements.txt`. Users of your pipeline would have to run `pip install -r src//pipelines//requirements.txt` before using the pipeline * To ensure portability, modular pipelines should use relative imports when accessing their own objects and absolute imports otherwise. 
For example, in `pipeline.py`: ```python @@ -112,23 +111,25 @@ kedro run --pipeline mp2 ## How to share a modular pipeline ### Package a modular pipeline -Since Kedro 0.16.4 you can package a modular pipeline by executing `kedro pipeline package ` command, which will generate a new [wheel file](https://pythonwheels.com/) for it. By default, the wheel file will be saved into `src/dist` directory inside your project, however this can be changed using the `--destination` (`-d`) option. +Since Kedro 0.16.4 you can package a modular pipeline by executing `kedro pipeline package ` (e.g. `kedro pipeline package pipelines.data_science.training`). From Kedro 0.18.0 this will generate a new [Python source distribution file](https://packaging.python.org/overview/#python-source-distributions) (sdist) for it. Older versions of Kedro will generate a wheel file. By default, the sdist file, with extension `.tar.gz`, will be saved into `dist/` directory inside your project, however this can be changed using the `--destination` (`-d`) option. When you package your modular pipeline, Kedro will also automatically package files from 3 locations: -* All the modular pipeline code in `src//pipelines//` +* All the modular pipeline code in `src//...` * Parameter files that match either the glob pattern `conf//parameters*/**/.yml` or `conf//parameters*/**//*`, where `` defaults to `base`. If you need to capture the parameters from a different config environment, run `kedro pipeline package --env ` -* Pipeline unit tests in `src/tests/pipelines/` +* Pipeline unit tests in `src/tests/...` -Kedro will also include any requirements found in `src//pipelines//requirements.txt` in the modular pipeline wheel file. These requirements will later be taken into account when pulling a pipeline via `kedro pipeline pull`. +Kedro will also include any requirements found in `src///requirements.txt` in the modular pipeline sdist file. These requirements will later be taken into account when pulling a pipeline via `kedro pipeline pull`. ```eval_rst .. note:: Kedro will not package the catalog config files even if those are present in ``conf//catalog/.yml``. ``` -If you plan to publish your packaged modular pipeline to some Python package repository like [PyPI](https://pypi.org/), you need to make sure that your modular pipeline name doesn't clash with any of the existing packages in that repository. However, there is no need to rename any of your source files if that is the case. Simply alias your package with a new name by running `kedro pipeline package --alias `. +If you plan to publish your packaged modular pipeline to some Python package repository like [PyPI](https://pypi.org/), you need to make sure that your modular pipeline name doesn't clash with any of the existing packages in that repository. However, there is no need to rename any of your source files if that is the case. Simply alias your package with a new name by running `kedro pipeline package --alias `. -In addition to [PyPI](https://pypi.org/), you can also share the packaged wheel file directly, or via a cloud storage such as AWS S3. +In addition to [PyPI](https://pypi.org/), you can also share the packaged sdist file directly, or via a cloud storage such as AWS S3. 
+ +If you want to generate a wheel file from the sdist file you can run `pip wheel /dist/-.tar.gz` #### Package multiple modular pipelines @@ -136,11 +137,11 @@ To package multiple modular pipelines in bulk, run `kedro pipeline package --all ```toml [tool.kedro.pipeline.package] -first_pipeline = {alias = "aliased_pipeline", destination = "somewhere/else", env = "uat"} -second_pipeline = {} +"pipelines.first_pipeline" = {alias = "aliased_pipeline", destination = "somewhere/else", env = "uat"} +"pipelines.second_pipeline" = {} ``` -Here the keys (e.g. `first_pipeline`, `second_pipeline`) are the modular pipelines' folder names, and the values are the options that `kedro pipeline package ` accepts. +Here the keys (e.g. `pipelines.first_pipeline`, `pipelines.second_pipeline`) are the Python module paths to the modular pipelines, relative to the project's package name, and the values are the options that `kedro pipeline package ` accepts. ```eval_rst .. note:: Make sure `destination` is specified as a POSIX path even when working on a Windows machine. @@ -148,30 +149,26 @@ Here the keys (e.g. `first_pipeline`, `second_pipeline`) are the modular pipelin ### Pull a modular pipeline -You can pull a modular pipeline from a wheel file by executing `kedro pipeline pull `, where `` is either a package name on PyPI or a path to the wheel file. Kedro will unpack the wheel file, and install the files in following locations in your Kedro project: +You can pull a modular pipeline from a source distribution (sdist) file by executing `kedro pipeline pull `, where `` is either a package name on PyPI or a path to the source distribution file. Kedro will unpack the sdist file, and install the files in the following locations in your Kedro project: -* All the modular pipeline code in `src//pipelines//` +* All the modular pipeline code in `src///` * Configuration files in `conf//parameters/.yml`, where `` defaults to `base`. If you want to place the parameters from a different config environment, run `kedro pipeline pull --env ` -* Pipeline unit tests in `src/tests/pipelines/` +* Pipeline unit tests in `src/tests/` -Kedro will also parse any requirements packaged with the modular pipeline and add them to project level `requirements.in`. It is advised to do `kedro install --build-reqs` to compile and install the updated list of requirements after pulling a modular pipeline. - -```eval_rst -.. note:: If a modular pipeline has embedded requirements and a project `requirements.in` file does not already exist, it will be generated based on the project `requirements.txt` before appending the modular pipeline requirements. -``` +Kedro will also parse any requirements packaged with the modular pipeline and add them to project level `requirements.txt`. It is advised to run `kedro build-reqs` to compile and `pip install -r src/requirements.lock` to install the updated list of requirements after pulling a modular pipeline. 
You can pull a modular pipeline from different locations, including local storage, PyPI and the cloud: - Pulling a modular pipeline from a local directory: ```bash -kedro pipeline pull /src/dist/-0.1-py3-none-any.whl +kedro pipeline pull /dist/-0.1.tar.gz ``` - Pulling a modular pipeline from S3: ```bash -kedro pipeline pull https://.s3..amazonaws.com/-0.1-py3-none-any.whl +kedro pipeline pull https://.s3..amazonaws.com/-0.1.tar.gz ``` - Pulling a modular pipeline from PyPI: @@ -183,7 +180,7 @@ kedro pipeline pull If you are pulling the pipeline from a location that isn't PyPI, Kedro uses [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) to locate and pull down your pipeline. If you need to provide any `fsspec`-specific arguments (say, if you're pulling your pipeline down from an S3 bucket and want to provide the S3 credentials inline or from a local server that requires tokens in the header) then you can use the `--fs-args` option to point to a YAML (or any `anyconfig`-supported configuration) file that contains the required configuration. ```bash -kedro pipeline pull https:// --fs-args pipeline_pull_args.yml +kedro pipeline pull https:// --fs-args pipeline_pull_args.yml ``` where @@ -201,8 +198,8 @@ To pull multiple modular pipelines in bulk, run `kedro pipeline pull --all`. Thi ```toml [tool.kedro.pipeline.pull] -"src/dist/first-pipeline-0.1-py3-none-any.whl" = {} -"https://www.url.to/second-pipeline.whl" = {alias = "aliased_pipeline", fs-args = "pipeline_pull_args.yml"} +"dist/first-pipeline-0.1.tar.gz" = {} +"https://www.url.to/second-pipeline-0.1.tar.gz" = {alias = "aliased_pipeline", destination = "somewhere/else", fs-args = "pipeline_pull_args.yml"} ``` Here the keys are the package paths, and the values are the options that `kedro pipeline pull ` accepts. Package paths can be any of the locations allowed by `kedro pipeline pull`, including local storage, PyPI and the cloud. @@ -256,8 +253,6 @@ new-kedro-project │   │   │   │   └── README.md │   │   │   └── __init__.py │   │   ├── __init__.py -| | ├── cli.py -│   │   ├── hooks.py │   │   ├── pipeline_registry.py │   │   ├── __main__.py | | └── settings.py @@ -421,14 +416,14 @@ Remapping free outputs is required since "breakfast_food" and "lunch_food" are t The resulting pipeline now has two separate nodes, `breakfast.defrost_node` and `lunch.defrost_node`. Also two separate datasets `breakfast.meat` and `lunch.meat` connect the nodes inside the pipelines, causing no confusion between them. -Note that `pipeline()` will skip prefixing when node inputs contain parameter references (`params:` and `parameters`). +Note that `pipeline()` will also prefix single parameter referenced with `params:` in a node's inputs. However, it won't prefix `parameters`. 
For example: ```python raw_pipeline = Pipeline([node(node_func, ["input", "params:x"], "output")]) final_pipeline = pipeline(raw_pipeline, namespace="new") -# `final_pipeline` will be `Pipeline([node(node_func, ["new.input", "params:x"], "new.output")])` +# `final_pipeline` will be `Pipeline([node(node_func, ["new.input", "params:new.x"], "new.output")])` ``` ## How to use a modular pipeline with different parameters @@ -445,7 +440,7 @@ alpha_pipeline = Pipeline( beta_pipeline = pipeline( alpha_pipeline, inputs={"input1", "input2"}, - parameters={"params:alpha": "params:beta"}, + parameters={"alpha": "beta"}, namespace="beta", ) @@ -454,6 +449,8 @@ final_pipeline = alpha_pipeline + beta_pipeline The value of parameter `alpha` is replaced with the value of parameter `beta`, assuming they both live in your parameters configuration (`parameters.yml`). The namespace ensures that outputs are not overwritten, so intermediate and final outputs are prefixed, i.e. `beta.intermediary_output`, `beta.output`. +Note that similar to the `inputs` and `outputs` namespacing rule, if you supply a `str` or a `Set[str]`, these explicitly listed parameters won't be namespaced. + ## How to clean up a modular pipeline You can manually delete all the files that belong to a modular pipeline. However, Kedro also provides a CLI command to clean up automatically. It deletes the following files when you call `kedro pipeline delete `: diff --git a/docs/source/06_nodes_and_pipelines/01_nodes.md b/docs/source/nodes_and_pipelines/nodes.md similarity index 95% rename from docs/source/06_nodes_and_pipelines/01_nodes.md rename to docs/source/nodes_and_pipelines/nodes.md index fff2169ab1..30cd7a0c77 100644 --- a/docs/source/06_nodes_and_pipelines/01_nodes.md +++ b/docs/source/nodes_and_pipelines/nodes.md @@ -111,7 +111,7 @@ To tag a node, you can simply specify the `tags` argument, as follows: node(func=add, inputs=["a", "b"], outputs="sum", name="adding_a_and_b", tags="node_tag") ``` -Moreover, you can [tag all nodes in a `Pipeline`](./02_pipeline_introduction.md#how-to-tag-a-pipeline). If the pipeline definition contains the `tags=` argument, Kedro will attach the corresponding tag to every node within that pipeline. +Moreover, you can [tag all nodes in a `Pipeline`](./pipeline_introduction.md#how-to-tag-a-pipeline). If the pipeline definition contains the `tags=` argument, Kedro will attach the corresponding tag to every node within that pipeline. To run a pipeline using a tag: diff --git a/docs/source/06_nodes_and_pipelines/02_pipeline_introduction.md b/docs/source/nodes_and_pipelines/pipeline_introduction.md similarity index 92% rename from docs/source/06_nodes_and_pipelines/02_pipeline_introduction.md rename to docs/source/nodes_and_pipelines/pipeline_introduction.md index 9c720e4aef..85d18994c6 100644 --- a/docs/source/06_nodes_and_pipelines/02_pipeline_introduction.md +++ b/docs/source/nodes_and_pipelines/pipeline_introduction.md @@ -1,6 +1,6 @@ # Pipelines -We previously introduced [Nodes](./01_nodes.md) as building blocks that represent tasks, and which can be combined in a pipeline to build your workflow. A pipeline organises the dependencies and execution order of your collection of nodes, and connects inputs and outputs while keeping your code modular. The pipeline determines the node execution order by resolving dependencies and does *not* necessarily run the nodes in the order in which they are passed in. 
+We previously introduced [Nodes](./nodes.md) as building blocks that represent tasks, and which can be combined in a pipeline to build your workflow. A pipeline organises the dependencies and execution order of your collection of nodes, and connects inputs and outputs while keeping your code modular. The pipeline determines the node execution order by resolving dependencies and does *not* necessarily run the nodes in the order in which they are passed in. To benefit from Kedro's automatic dependency resolution, you can chain your nodes into a [pipeline](/kedro.pipeline.Pipeline), which is a list of nodes that use a shared set of variables. diff --git a/docs/source/06_nodes_and_pipelines/04_run_a_pipeline.md b/docs/source/nodes_and_pipelines/run_a_pipeline.md similarity index 95% rename from docs/source/06_nodes_and_pipelines/04_run_a_pipeline.md rename to docs/source/nodes_and_pipelines/run_a_pipeline.md index 7dd926bc24..4b32b0b480 100644 --- a/docs/source/06_nodes_and_pipelines/04_run_a_pipeline.md +++ b/docs/source/nodes_and_pipelines/run_a_pipeline.md @@ -30,22 +30,11 @@ kedro run --runner=SequentialRunner #### Multiprocessing -You can alternatively run the nodes within the pipeline concurrently, using a `ParallelRunner`. To do so, add a flag as follows: - -```bash -kedro run --parallel -``` - -or - +You can alternatively run the nodes within the pipeline concurrently, using a `ParallelRunner` as follows: ```bash kedro run --runner=ParallelRunner ``` -```eval_rst -.. note:: You cannot use both ``--parallel`` and ``--runner`` flags at the same time. (That is, ``kedro run --parallel --runner=SequentialRunner`` raises an exception). -``` - #### Multithreading While `ParallelRunner` uses multiprocessing, you can also run the pipeline with multithreading for concurrent execution by specifying `ThreadRunner` as follows: @@ -57,7 +46,7 @@ kedro run --runner=ThreadRunner .. note:: ``SparkDataSet`` doesn't work correctly with ``ParallelRunner``. To add concurrency to the pipeline with ``SparkDataSet``, you must use ``ThreadRunner``. ``` -For more information on how to maximise concurrency when using Kedro with PySpark, please visit our guide on [how to build a Kedro pipeline with PySpark](../11_tools_integration/01_pyspark.md). +For more information on how to maximise concurrency when using Kedro with PySpark, please visit our guide on [how to build a Kedro pipeline with PySpark](../tools_integration/pyspark.md). @@ -192,7 +181,7 @@ kedro run --pipeline my_pipeline .. note:: If you specify ``kedro run`` without the ``--pipeline`` option, it runs the ``__default__`` pipeline from the dictionary returned by ``register_pipelines()``. ``` -Further information about `kedro run` can be found in the [Kedro CLI documentation](../09_development/03_commands_reference.md#run-the-project). +Further information about `kedro run` can be found in the [Kedro CLI documentation](../development/commands_reference.md#run-the-project). ## Run pipelines with IO diff --git a/docs/source/06_nodes_and_pipelines/05_slice_a_pipeline.md b/docs/source/nodes_and_pipelines/slice_a_pipeline.md similarity index 96% rename from docs/source/06_nodes_and_pipelines/05_slice_a_pipeline.md rename to docs/source/nodes_and_pipelines/slice_a_pipeline.md index 27ff3196f0..a823dc1b85 100644 --- a/docs/source/06_nodes_and_pipelines/05_slice_a_pipeline.md +++ b/docs/source/nodes_and_pipelines/slice_a_pipeline.md @@ -1,8 +1,8 @@ # Slice a pipeline -Sometimes it is desirable to run a subset, or a 'slice' of a pipeline's nodes. 
In this page, we illustrate the programmatic options that Kedro provides. You can also use the [Kedro CLI to pass parameters to `kedro run`](../09_development/03_commands_reference.md#run-the-project) command and slice a pipeline. +Sometimes it is desirable to run a subset, or a 'slice' of a pipeline's nodes. In this page, we illustrate the programmatic options that Kedro provides. You can also use the [Kedro CLI to pass parameters to `kedro run`](../development/commands_reference.md#run-the-project) command and slice a pipeline. -Let's look again at the example pipeline from the [pipeline introduction documentation](./02_pipeline_introduction.md#how-to-build-a-pipeline), which computes the variance of a set of numbers: +Let's look again at the example pipeline from the [pipeline introduction documentation](./pipeline_introduction.md#how-to-build-a-pipeline), which computes the variance of a set of numbers:
      Click to expand diff --git a/docs/source/13_resources/02_glossary.md b/docs/source/resources/glossary.md similarity index 79% rename from docs/source/13_resources/02_glossary.md rename to docs/source/resources/glossary.md index b956a7b29b..b0cac79059 100644 --- a/docs/source/13_resources/02_glossary.md +++ b/docs/source/resources/glossary.md @@ -4,7 +4,7 @@ ## Data Catalog The Data Catalog is Kedro's registry of all data sources available for use in the data pipeline. It manages loading and saving of data. The Data Catalog maps the names of node inputs and outputs as keys in a Kedro `DataSet`, which can be specialised for different types of data storage. -[Further information about the Data Catalog](../05_data/01_data_catalog) +[Further information about the Data Catalog](../data/data_catalog) ## Data engineering vs Data science Data engineering is the process of wrangling data into a clean and reliable state. Data wrangling is about taking a messy or unrefined source of data and turning it into something useful by parsing and cleaning it. @@ -14,7 +14,7 @@ Data science extracts insights from data by using a combination of domain expert ## Kedro Kedro is an open-source Python framework for creating reproducible, maintainable and modular data science code. It applies software engineering best-practices to machine learning code, including modularity, separation of concerns and versioning. -[Introduction to Kedro](../01_introduction/01_introduction) +[Introduction to Kedro](../introduction/introduction) ## `KedroContext` A Python class that holds the configuration and Kedro’s main functionality. @@ -24,7 +24,7 @@ API documentation for [`KedroContext`](/kedro.framework.context.KedroContext) ## `KedroSession` A KedroSession allows you to manage the lifecycle of a Kedro run, persist runtime parameters and trace back runtime parameters, such as CLI command flags and environment variables. -[Further information about `KedroSession`](../04_kedro_project_setup/03_session) +[Further information about `KedroSession`](../kedro_project_setup/session) ## Kedro-Viz You can use Kedro-Viz to visualise your Kedro data pipelines: @@ -33,24 +33,24 @@ You can use Kedro-Viz to visualise your Kedro data pipelines: * Get a clear picture when you have lots of datasets and nodes by using tags to visualise sub-pipelines * Search for nodes and datasets -[Further information from the Kedro-Viz repository](https://github.com/kedro-org/kedro-viz) and [Kedro tutorial documentation](../03_tutorial/06_visualise_pipeline) +[Further information from the Kedro-Viz repository](https://github.com/kedro-org/kedro-viz) and [Kedro tutorial documentation](../tutorial/visualise_pipeline) ## Layers (data engineering convention) -According to [data engineering convention](../12_faq/01_faq.md#what-is-data-engineering-convention), a pipeline can be broken up into different layers according to how data is processed. This convention makes it easier to collaborate with other team members because everyone has an idea of what type of data cleaning or processing has happened. +According to [data engineering convention](../faq/faq.md#what-is-data-engineering-convention), a pipeline can be broken up into different layers according to how data is processed. This convention makes it easier to collaborate with other team members because everyone has an idea of what type of data cleaning or processing has happened. 
-Kedro-Viz makes it easy to [visualise these data processing stages](../03_tutorial/06_visualise_pipeline.md#visualise-layers) by adding a `layer` attribute to the datasets in the Data Catalog. +Kedro-Viz makes it easy to [visualise these data processing stages](../tutorial/visualise_pipeline.md#visualise-layers) by adding a `layer` attribute to the datasets in the Data Catalog. ## Modular pipeline _(See also [Pipeline](#pipeline))_ In many typical Kedro projects, a single (“main”) pipeline increases in complexity as the project evolves. To keep your project fit for purpose, you can create modular pipelines, which are logically isolated and can be reused. Modular pipelines are easier to develop, test and maintain, and are portable so they can be copied and reused between projects. -[Further information about modular pipelines](../06_nodes_and_pipelines/03_modular_pipelines) +[Further information about modular pipelines](../nodes_and_pipelines/modular_pipelines) ## Node A Kedro node is a wrapper for a Python function that names the inputs and outputs of that function. It is the building block of a pipeline. Nodes can be linked when the output of one node is the input of another. -[Further information about nodes](../06_nodes_and_pipelines/01_nodes) +[Further information about nodes](../nodes_and_pipelines/nodes) ## Node execution order The node execution order is determined by resolving the input and output data dependencies between the nodes. The pipeline determines the node execution order and does not necessarily run the nodes in the order in which they are passed in. @@ -58,7 +58,7 @@ The node execution order is determined by resolving the input and output data de ## Pipeline A Kedro pipeline organises the dependencies and execution order of a collection of nodes, and connects inputs and outputs. The pipeline determines the node execution order by resolving dependencies. -[Further information about pipelines](../06_nodes_and_pipelines/02_pipeline_introduction) +[Further information about pipelines](../nodes_and_pipelines/pipeline_introduction) **_Chonky pipeline_**: _Chonky is generally used to describe animals that are plump, rounded or simply heavier than average. A chonky pipeline is, likewise, a pipeline that is more bulky than usual._ @@ -71,7 +71,7 @@ This is when you run a subset, or a ‘slice’ of a pipeline’s nodes. You can * by tagging certain nodes (`pipeline.only_nodes_with_tags`) * by specifying certain nodes (`pipeline.only_nodes`) -[Further information about pipeline slicing](../06_nodes_and_pipelines/05_slice_a_pipeline) +[Further information about pipeline slicing](../nodes_and_pipelines/slice_a_pipeline) ## Runner Runners are different execution mechanisms to run pipelines with the specified data catalog. @@ -80,12 +80,12 @@ Runners are different execution mechanisms to run pipelines with the specified d * The parallel runner allows for concurrency by use of multiprocessing * The thread runner uses threading for concurrent execution -[Further information about runners](../06_nodes_and_pipelines/04_run_a_pipeline) +[Further information about runners](../nodes_and_pipelines/run_a_pipeline) ## Starters Kedro starters are used to create projects that contain code to run as-is, or to adapt and extend. They provide pre-defined example code and configuration that can be reused. A Kedro starter is a [Cookiecutter template](https://cookiecutter.readthedocs.io/) that contains the boilerplate code for a Kedro project. 
-[Further information about Kedro starters](../02_get_started/06_starters) +[Further information about Kedro starters](../get_started/starters) ## Tags You can apply tags to nodes or pipelines as a means of filtering which are executed. diff --git a/docs/source/13_resources/01_logos.md b/docs/source/resources/logos.md similarity index 100% rename from docs/source/13_resources/01_logos.md rename to docs/source/resources/logos.md diff --git a/docs/source/11_tools_integration/02_ipython.md b/docs/source/tools_integration/ipython.md similarity index 95% rename from docs/source/11_tools_integration/02_ipython.md rename to docs/source/tools_integration/ipython.md index d1668358cf..06b3af884d 100644 --- a/docs/source/11_tools_integration/02_ipython.md +++ b/docs/source/tools_integration/ipython.md @@ -1,6 +1,6 @@ # Use Kedro with IPython and Jupyter Notebooks/Lab -This section follows the [Iris dataset example](../02_get_started/05_example_project.md) and demonstrates how to use Kedro with IPython and Jupyter Notebooks / Lab. We also recommend a video that explains the transition from the use of vanilla Jupyter Notebooks to using Kedro, from [Data Engineer One](https://www.youtube.com/watch?v=dRnCovp1GRQ&t=50s&ab_channel=DataEngineerOne). +This section follows the [Iris dataset example](../get_started/example_project.md) and demonstrates how to use Kedro with IPython and Jupyter Notebooks / Lab. We also recommend a video that explains the transition from the use of vanilla Jupyter Notebooks to using Kedro, from [Data Engineer One](https://www.youtube.com/watch?v=dRnCovp1GRQ&t=50s&ab_channel=DataEngineerOne). @@ -29,7 +29,7 @@ exit() ``` ### Load `DataCatalog` in IPython -To test the IPython session, load the [Iris test example](https://archive.ics.uci.edu/ml/datasets/iris) data inside the IPython console as follows: +To test the IPython session, load the [Iris test example](https://www.kaggle.com/uciml/iris) data inside the IPython console as follows: ```python catalog.load("example_iris_data").head() @@ -50,7 +50,7 @@ kedro.io.data_catalog - INFO - Loading data from `example_iris_data` (CSVDataSet #### Dataset versioning -If you enable [versioning](../05_data/01_data_catalog.md#versioning-datasets-and-ml-models), you can load a particular version of a dataset. Given a catalog entry: +If you enable [versioning](../data/data_catalog.md#versioning-datasets-and-ml-models), you can load a particular version of a dataset. Given a catalog entry: ```yaml example_train_x: @@ -103,10 +103,7 @@ The `context` variable allows you to interact with Kedro library components from With `context`, you can access the following variables and methods: - `context.project_path` (`Path`) - Root directory of the project -- `context.project_name` (`str`) - Project folder name - `context.catalog` (`DataCatalog`) - An instance of [DataCatalog](/kedro.io.DataCatalog) -- `context.config_loader` (`ConfigLoader`) - An instance of [ConfigLoader](/kedro.config.ConfigLoader) -- `context.pipeline` (`Pipeline`) - The `__default__` pipeline ### Run the pipeline @@ -392,7 +389,7 @@ To reload these variables at any point (e.g., if you update `catalog.yml`), use Note that if you want to pass an argument to `reload_kedro` line magic function, you should call it like a normal Python function (e.g `reload_kedro(extra_params=extra_params)` rather than using `%reload_kedro` in a notebook cell (e.g. `%reload_kedro(extra_params=extra_params)` wouldn't work). 
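To make this concrete, here is a minimal sketch of reloading with runtime parameters inside a notebook cell; the `extra_params` key shown is hypothetical and project-specific:

```python
# Inside a notebook/IPython session started via `kedro jupyter notebook` or `kedro ipython`.
# Call reload_kedro as a normal Python function when passing arguments;
# the line-magic form `%reload_kedro(extra_params=...)` would not work.
extra_params = {"test_size": 0.3}  # hypothetical override of a project parameter
reload_kedro(extra_params=extra_params)
```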
-If the `KEDRO_ENV` environment variable is specified, the startup script loads that environment, otherwise it defaults to `local`. Instructions for setting the environment variable can be found in the [Kedro configuration documentation](../04_kedro_project_setup/02_configuration.md#additional-configuration-environments). +If the `KEDRO_ENV` environment variable is specified, the startup script loads that environment, otherwise it defaults to `local`. Instructions for setting the environment variable can be found in the [Kedro configuration documentation](../kedro_project_setup/configuration.md#additional-configuration-environments). ### Kedro-Viz and Jupyter diff --git a/docs/source/11_tools_integration/01_pyspark.md b/docs/source/tools_integration/pyspark.md similarity index 96% rename from docs/source/11_tools_integration/01_pyspark.md rename to docs/source/tools_integration/pyspark.md index 8a6029b3d0..ab78b3d0f2 100644 --- a/docs/source/11_tools_integration/01_pyspark.md +++ b/docs/source/tools_integration/pyspark.md @@ -28,6 +28,7 @@ from pathlib import Path from pyspark import SparkConf from pyspark.sql import SparkSession +from kedro.config import ConfigLoader from kedro.framework.context import KedroContext @@ -36,17 +37,18 @@ class CustomContext(KedroContext): self, package_name: str, project_path: Union[Path, str], + config_loader: ConfigLoader, env: str = None, extra_params: Dict[str, Any] = None, ): - super().__init__(package_name, project_path, env, extra_params) + super().__init__(package_name, project_path, config_loader, env, extra_params) self.init_spark_session() def init_spark_session(self) -> None: """Initialises a SparkSession using the config defined in project's conf folder.""" # Load the spark configuration in spark.yaml using the config loader - parameters = self.config_loader.get("spark*", "spark*/**") + parameters = self._config_loader.get("spark*", "spark*/**") spark_conf = SparkConf().setAll(parameters.items()) # Initialise the spark session @@ -75,7 +77,7 @@ CONTEXT_CLASS = CustomContext ## Use Kedro's built-in Spark datasets to load and save raw data -We recommend using Kedro's built-in Spark datasets to load raw data into Spark's [DataFrame](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html), as well as to write them back to storage. Some of our built-in Spark datasets include: +We recommend using Kedro's built-in Spark datasets to load raw data into Spark's [DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#dataframe-apis), as well as to write them back to storage. Some of our built-in Spark datasets include: * [spark.DeltaTableDataSet](/kedro.extras.datasets.spark.DeltaTableDataSet) * [spark.SparkDataSet](/kedro.extras.datasets.spark.SparkDataSet) @@ -121,7 +123,7 @@ assert isinstance(df, pyspark.sql.DataFrame) [Delta Lake](https://delta.io/) is an open-source project that enables building a Lakehouse architecture on top of data lakes. It provides ACID transactions and unifies streaming and batch data processing on top of existing data lakes, such as S3, ADLS, GCS, and HDFS. To setup PySpark with Delta Lake, have a look at [the recommendations in Delta Lake's documentation](https://docs.delta.io/latest/quick-start.html#python). 
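As an illustrative sketch only (the dataset path is hypothetical and a SparkSession configured with the Delta Lake packages per the quick-start above is assumed), a Delta table can also be declared with the `SparkDataSet` Python API:

```python
from kedro.extras.datasets.spark import SparkDataSet

# Hypothetical Delta table location; the SparkSession must be configured for
# Delta Lake as described in the quick-start referenced above.
weather_delta = SparkDataSet(
    filepath="data/02_intermediate/weather_delta",
    file_format="delta",
    save_args={"mode": "overwrite"},
)
```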
-We recommend the following workflow, which makes use of the [Transcoding](../05_data/01_data_catalog.md) feature in Kedro: +We recommend the following workflow, which makes use of the [Transcoding](../data/data_catalog.md) feature in Kedro: * To create a Delta table, use a `SparkDataSet` with `file_format="delta"`. You can also use this type of dataset to read from a Delta table and/or overwrite it. * To perform [Delta table deletes, updates, and merges](https://docs.delta.io/latest/delta-update.html#language-python), load the data using a `DeltaTableDataSet` and perform the write operations within the node function. diff --git a/docs/source/03_tutorial/04_create_pipelines.md b/docs/source/tutorial/create_pipelines.md similarity index 91% rename from docs/source/03_tutorial/04_create_pipelines.md rename to docs/source/tutorial/create_pipelines.md index dbe8b1b9d3..c128b15eb3 100644 --- a/docs/source/03_tutorial/04_create_pipelines.md +++ b/docs/source/tutorial/create_pipelines.md @@ -1,9 +1,9 @@ # Create a pipeline -This section covers the third part of the [standard development workflow](./01_spaceflights_tutorial.md#kedro-project-development-workflow), and covers the following: +This section covers the third part of the [standard development workflow](./spaceflights_tutorial.md#kedro-project-development-workflow), and covers the following: -* How to create each [node](../13_resources/02_glossary.md#node) required by the example -* How to set up a [pipeline](../13_resources/02_glossary.md#pipeline) +* How to create each [node](../resources/glossary.md#node) required by the example +* How to set up a [pipeline](../resources/glossary.md#pipeline) ## Data processing pipeline @@ -71,7 +71,7 @@ def preprocess_shuttles(shuttles: pd.DataFrame) -> pd.DataFrame: ### Assemble nodes into the data processing pipeline -The next steps are to create a [node](../13_resources/02_glossary.md#node) for each function, and to create a [modular pipeline](../13_resources/02_glossary.md#modular-pipeline) for data processing: +The next steps are to create a [node](../resources/glossary.md#node) for each function, and to create a [modular pipeline](../resources/glossary.md#modular-pipeline) for data processing: Add the following to `src/kedro_tutorial/pipelines/data_processing/pipeline.py`, so the `create_pipeline()` function looks as follows: @@ -116,11 +116,11 @@ You should also create a file `src/kedro_tutorial/pipelines/data_processing/__in ```python from .pipeline import create_pipeline # NOQA ``` -This file ensures that the `data_processing` folder is a Python package, in accordance with the [standard format for a modular pipeline](../06_nodes_and_pipelines/03_modular_pipelines.md#how-do-i-create-a-modular-pipeline). +This file ensures that the `data_processing` folder is a Python package, in accordance with the [standard format for a modular pipeline](../nodes_and_pipelines/modular_pipelines.md#how-do-i-create-a-modular-pipeline). ### Update the project pipeline -Now update the project's pipeline in `src/kedro_tutorial/pipeline_registry.py` to add the [modular pipeline](../13_resources/02_glossary.md#modular-pipeline) for data processing: +Now update the project's pipeline in `src/kedro_tutorial/pipeline_registry.py` to add the [modular pipeline](../resources/glossary.md#modular-pipeline) for data processing:
      Click to expand @@ -212,9 +212,9 @@ preprocessed_shuttles: The code above declares explicitly that [pandas.CSVDataSet](/kedro.extras.datasets.pandas.CSVDataSet) should be used instead of [`MemoryDataSet`](/kedro.io.MemoryDataSet). -The [Data Catalog](../13_resources/02_glossary.md#data-catalog) will take care of saving the datasets automatically (in this case as CSV data) to the path specified next time the pipeline is run. There is no need to change any code in your preprocessing functions to accommodate this change. +The [Data Catalog](../resources/glossary.md#data-catalog) will take care of saving the datasets automatically (in this case as CSV data) to the path specified next time the pipeline is run. There is no need to change any code in your preprocessing functions to accommodate this change. -In this tutorial, we chose `pandas.CSVDataSet` for its simplicity, but you can use any other available dataset implementation class, for example, a database table, cloud storage (like [AWS S3](https://aws.amazon.com/s3/), [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/), etc.) or others. If you cannot find the dataset implementation you need, you can implement your own [custom dataset](../07_extend_kedro/03_custom_datasets.md). +In this tutorial, we chose `pandas.CSVDataSet` for its simplicity, but you can use any other available dataset implementation class, for example, a database table, cloud storage (like [AWS S3](https://aws.amazon.com/s3/), [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/), etc.) or others. If you cannot find the dataset implementation you need, you can implement your own [custom dataset](../extend_kedro/custom_datasets.md). ### Extend the data processing pipeline @@ -312,19 +312,15 @@ implementation from the [scikit-learn](https://scikit-learn.org/stable/) library ### Update dependencies We now need to add `scikit-learn` to the project's dependencies. This is a slightly different process from the initial change we made early in the tutorial. -To **update** the project's dependencies, you should modify `src/requirements.in` to add the following. Note that you do not need to update ``src/requirements.txt`` as you did previously in the tutorial before you built the project's requirements with ``kedro build-reqs``: +To **update** the project's dependencies, you should modify `src/requirements.txt` to add the following: ```text scikit-learn==0.23.1 ``` -Then, re-run `kedro install` with a flag telling Kedro to recompile the requirements: +Then, re-run `pip install -r src/requirements.txt` to install the updated project requirements. -```bash -kedro install --build-reqs -``` - -You can find out more about [how to work with project dependencies](../04_kedro_project_setup/01_dependencies) in the Kedro project documentation. +You can find out more about [how to work with project dependencies](../kedro_project_setup/dependencies) in the Kedro project documentation. ### Create a data science node @@ -412,7 +408,7 @@ features: - review_scores_rating ``` -These are the parameters fed into the `DataCatalog` when the pipeline is executed. More information about [parameters](../04_kedro_project_setup/02_configuration.md#Parameters) is available in later documentation for advanced usage. Here, the parameters `test_size` and `random_state` are used as part of the train-test split, and `features` gives the names of columns in the model input table to use as features. 
+These are the parameters fed into the `DataCatalog` when the pipeline is executed. More information about [parameters](../kedro_project_setup/configuration.md#Parameters) is available in later documentation for advanced usage. Here, the parameters `test_size` and `random_state` are used as part of the train-test split, and `features` gives the names of columns in the model input table to use as features. ### Register the dataset The next step is to register the dataset that will save the trained model, by adding the following definition to `conf/base/catalog.yml`: @@ -424,7 +420,7 @@ regressor: versioned: true ``` -Versioning is enabled for `regressor`, which means that the pickled output of the `regressor` will be versioned and saved every time the pipeline is run. This allows us to keep the history of the models built using this pipeline. Further details can be found in the [Versioning](../05_data/02_kedro_io.md#versioning) section. +Versioning is enabled for `regressor`, which means that the pickled output of the `regressor` will be versioned and saved every time the pipeline is run. This allows us to keep the history of the models built using this pipeline. Further details can be found in the [Versioning](../data/kedro_io.md#versioning) section. ### Assemble the data science pipeline To create a modular pipeline for the price prediction model, add the following to the top of `src/kedro_tutorial/pipelines/data_science/pipeline.py`: @@ -562,15 +558,10 @@ There are three different Kedro runners that can run the pipeline: * `ParallelRunner` - runs your nodes in parallel; independent nodes are able to run at the same time, which is more efficient when there are independent branches in your pipeline and allows you to take advantage of multiple CPU cores. * `ThreadRunner` - runs your nodes in parallel, similarly to `ParallelRunner`, but uses multithreading instead of multiprocessing. -By default, Kedro uses a `SequentialRunner`, which is instantiated when you execute `kedro run` from the command line. If you decide to use `ParallelRunner`, provide an additional flag when running the pipeline from the command line: - -```bash -kedro run --parallel -``` - -If you want to run using `ThreadRunner` or a custom runner, you can do so by running: +By default, Kedro uses a `SequentialRunner`, which is instantiated when you execute `kedro run` from the command line. If you decide to use `ParallelRunner`, `ThreadRunner` or a custom runner, you can do so through the `--runner` flag as follows: ```bash +kedro run --runner=ParallelRunner kedro run --runner=ThreadRunner kedro run --runner=module.path.to.my.runner ``` @@ -579,7 +570,7 @@ kedro run --runner=module.path.to.my.runner .. note:: ``ParallelRunner`` performs task parallelisation, which is different from data parallelisation as seen in PySpark. ``` -You can find out more about the runners Kedro provides, and how to create your own, in the [pipeline documentation about runners](../06_nodes_and_pipelines/04_run_a_pipeline.md). +You can find out more about the runners Kedro provides, and how to create your own, in the [pipeline documentation about runners](../nodes_and_pipelines/run_a_pipeline.md). ## Slice a pipeline @@ -589,7 +580,7 @@ In some cases you may want to run just part of a pipeline. For example, you may kedro run --pipeline=ds ``` -See the [pipeline slicing documentation](../06_nodes_and_pipelines/05_slice_a_pipeline.md) for other ways to run sections of your pipeline. 
+See the [pipeline slicing documentation](../nodes_and_pipelines/slice_a_pipeline.md) for other ways to run sections of your pipeline. ```eval_rst .. note:: To successfully run the pipeline, you need to make sure that all required input datasets already exist, otherwise you may get an error similar to this: diff --git a/docs/source/03_tutorial/05_package_a_project.md b/docs/source/tutorial/package_a_project.md similarity index 92% rename from docs/source/03_tutorial/05_package_a_project.md rename to docs/source/tutorial/package_a_project.md index 542bb849c6..621e7eda84 100644 --- a/docs/source/03_tutorial/05_package_a_project.md +++ b/docs/source/tutorial/package_a_project.md @@ -19,7 +19,7 @@ To package your project, run the following in your project's root directory: kedro package ``` -Kedro builds the package into the `src/dist/` folder of your project, and creates one `.egg` file and one `.whl` file, which are [Python packaging formats for binary distribution](https://packaging.python.org/). +Kedro builds the package into the `dist/` folder of your project, and creates one `.egg` file and one `.whl` file, which are [Python packaging formats for binary distribution](https://packaging.python.org/). The resulting package only contains the Python source code of your Kedro pipeline, not any of the `conf/`, `data/` and `logs/` subfolders. This means that you can distribute the project to run elsewhere, such as on a separate computer with different configuration, data and logging. When distributed, the packaged project must be run from within a directory that contains the `conf/` subfolder (and `data/` and `logs/` if your pipeline loads/saves local data or uses logging). diff --git a/docs/source/03_tutorial/03_set_up_data.md b/docs/source/tutorial/set_up_data.md similarity index 91% rename from docs/source/03_tutorial/03_set_up_data.md rename to docs/source/tutorial/set_up_data.md index 060b4c4902..3198983005 100644 --- a/docs/source/03_tutorial/03_set_up_data.md +++ b/docs/source/tutorial/set_up_data.md @@ -1,11 +1,11 @@ # Set up the data -In this section, we discuss the data set-up phase, which is the second part of the [standard development workflow](./01_spaceflights_tutorial.md#kedro-project-development-workflow). The steps are as follows: +In this section, we discuss the data set-up phase, which is the second part of the [standard development workflow](./spaceflights_tutorial.md#kedro-project-development-workflow). The steps are as follows: -* Add datasets to your `data/` folder, according to [data engineering convention](../12_faq/01_faq.md#what-is-data-engineering-convention) +* Add datasets to your `data/` folder, according to [data engineering convention](../faq/faq.md#what-is-data-engineering-convention) * Register the datasets with the Data Catalog in `conf/base/catalog.yml`, which is the registry of all data sources available for use by the project. This ensures that your code is reproducible when it references datasets in different locations and/or environments. -You can find further information about [the Data Catalog](../05_data/01_data_catalog.md) in specific documentation covering advanced usage. +You can find further information about [the Data Catalog](../data/data_catalog.md) in specific documentation covering advanced usage. ## Add your datasets to `data` @@ -141,4 +141,4 @@ exit() Kedro supports a number of [datasets](/kedro.extras.datasets) out of the box, but you can also add support for any proprietary data format or filesystem in your pipeline. 
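To illustrate, a minimal sketch of such a custom dataset, assuming the `AbstractDataSet` interface from `kedro.io`; the class name and plain-text format are hypothetical:

```python
from pathlib import Path
from typing import Any, Dict

from kedro.io import AbstractDataSet


class TextDataSet(AbstractDataSet):
    """Hypothetical dataset that loads and saves plain-text files."""

    def __init__(self, filepath: str):
        self._filepath = Path(filepath)

    def _load(self) -> str:
        # Read the whole file as a single string
        return self._filepath.read_text()

    def _save(self, data: str) -> None:
        self._filepath.write_text(data)

    def _describe(self) -> Dict[str, Any]:
        # Used by Kedro when printing the dataset, e.g. in error messages
        return dict(filepath=str(self._filepath))
```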
-You can find further information about [how to add support for custom datasets](../07_extend_kedro/03_custom_datasets.md) in specific documentation covering advanced usage. +You can find further information about [how to add support for custom datasets](../extend_kedro/custom_datasets.md) in specific documentation covering advanced usage. diff --git a/docs/source/03_tutorial/07_set_up_experiment_tracking.md b/docs/source/tutorial/set_up_experiment_tracking.md similarity index 95% rename from docs/source/03_tutorial/07_set_up_experiment_tracking.md rename to docs/source/tutorial/set_up_experiment_tracking.md index 88c3d80828..4735641003 100644 --- a/docs/source/03_tutorial/07_set_up_experiment_tracking.md +++ b/docs/source/tutorial/set_up_experiment_tracking.md @@ -9,13 +9,13 @@ Enabling experiment tracking features on Kedro-Viz relies on: * [experiment tracking datasets to let Kedro know what metrics should be tracked](#set-up-tracking-datasets) * [modifying your nodes and pipelines to output those metrics](#setting-up-your-nodes-and-pipelines-to-log-metrics). -This tutorial will provide a step-by-step process to set up experiment tracking and access your logged metrics from each run on Kedro-Viz. It will use the spaceflights starter project that is outlined in [this tutorial](../03_tutorial/01_spaceflights_tutorial.md). You can also jump directly to [this section for direct reference in setting up experiment tracking](../08_logging/02_experiment_tracking.md) for your Kedro project. +This tutorial will provide a step-by-step process to set up experiment tracking and access your logged metrics from each run on Kedro-Viz. It will use the spaceflights starter project that is outlined in [this tutorial](../tutorial/spaceflights_tutorial.md). You can also jump directly to [this section for direct reference in setting up experiment tracking](../logging/experiment_tracking.md) for your Kedro project. You can also access a more detailed demo [here](https://kedro-viz-live-demo.hfa4c8ufrmn4u.eu-west-2.cs.amazonlightsail.com/). ## Set up a project -We assume that you have already [installed Kedro](../02_get_started/02_install.md) and [Kedro-Viz](../03_tutorial/06_visualise_pipeline.md). Set up a new project using the spaceflights starter by running: +We assume that you have already [installed Kedro](../get_started/install.md) and [Kedro-Viz](../tutorial/visualise_pipeline.md). Set up a new project using the spaceflights starter by running: ```bash kedro new --starter=spaceflights diff --git a/docs/source/03_tutorial/01_spaceflights_tutorial.md b/docs/source/tutorial/spaceflights_tutorial.md similarity index 86% rename from docs/source/03_tutorial/01_spaceflights_tutorial.md rename to docs/source/tutorial/spaceflights_tutorial.md index 30b6fe104e..68ab1fae68 100644 --- a/docs/source/03_tutorial/01_spaceflights_tutorial.md +++ b/docs/source/tutorial/spaceflights_tutorial.md @@ -9,7 +9,7 @@ In this tutorial, we illustrate the typical Kedro workflow and the steps necessa In the text, we assume that you create an empty project and follow the flow of the tutorial by copying and pasting the example code into the project as we describe. This tutorial will take approximately 2 hours and you will learn each step of the Kedro project development workflow, by working on an example to construct nodes and pipelines for the price-prediction model. ```eval_rst -.. 
note:: You may prefer to get up and running more swiftly so we provide the full spaceflights example project as a `Kedro starter <../02_get_started/06_starters.md>`_. To create the project, run ``kedro new --starter=spaceflights``. When prompted for a project name, enter ``Kedro Tutorial``. Subsequently, accept the default suggestions for ``repo_name`` and ``python_package`` by pressing enter. This will generate a project from the `Kedro starter for the spaceflights tutorial `_ so you can follow the tutorial without any of the copy/pasting. +.. note:: You may prefer to get up and running more swiftly so we provide the full spaceflights example project as a `Kedro starter <../get_started/starters.md>`_. To create the project, run ``kedro new --starter=spaceflights``. When prompted for a project name, enter ``Kedro Tutorial``. Subsequently, accept the default suggestions for ``repo_name`` and ``python_package`` by pressing enter. This will generate a project from the `Kedro starter for the spaceflights tutorial `_ so you can follow the tutorial without any of the copy/pasting. ``` ## Kedro project development workflow @@ -21,7 +21,7 @@ When building a Kedro project, you will typically follow a standard development ### 1. Set up the project template * Create a new project with `kedro new` -* Install project dependencies with `kedro install` +* Install project dependencies with `pip install -r src/requirements.txt` * Configure the following in the `conf` folder: * Logging * Credentials diff --git a/docs/source/03_tutorial/02_tutorial_template.md b/docs/source/tutorial/tutorial_template.md similarity index 80% rename from docs/source/03_tutorial/02_tutorial_template.md rename to docs/source/tutorial/tutorial_template.md index 8b1d298358..89e888b3df 100644 --- a/docs/source/03_tutorial/02_tutorial_template.md +++ b/docs/source/tutorial/tutorial_template.md @@ -1,6 +1,6 @@ # Set up the spaceflights project -In this section, we discuss the project set-up phase, which is the first part of the [standard development workflow](./01_spaceflights_tutorial.md#kedro-project-development-workflow). The set-up steps are as follows: +In this section, we discuss the project set-up phase, which is the first part of the [standard development workflow](./spaceflights_tutorial.md#kedro-project-development-workflow). The set-up steps are as follows: * Create a new project @@ -10,7 +10,7 @@ In this section, we discuss the project set-up phase, which is the first part of ## Create a new project -Navigate to your chosen working directory and run the following to [create a new empty Kedro project](../02_get_started/04_new_project.md#create-a-new-project-interactively) using the default interactive prompts: +Navigate to your chosen working directory and run the following to [create a new empty Kedro project](../get_started/new_project.md#create-a-new-project-interactively) using the default interactive prompts: ```bash kedro new @@ -18,12 +18,12 @@ kedro new When prompted for a project name, enter `Kedro Tutorial`. Subsequently, accept the default suggestions for `repo_name` and `python_package` by pressing enter. 
-## Install project dependencies with `kedro install` +## Install project dependencies To install the project-specific dependencies, navigate to the root directory of the project and run: ```bash -kedro install +pip install -r src/requirements.txt ``` ### More about project dependencies @@ -73,11 +73,11 @@ Then run the following: kedro build-reqs ``` -You can find out more about [how to work with project dependencies](../04_kedro_project_setup/01_dependencies.md) in the Kedro project documentation. In a [later step of this tutorial](./04_create_pipelines.md#update-dependencies), we will modify project's dependencies to illustrate how, once you have installed project-specific dependencies, you can update them. +You can find out more about [how to work with project dependencies](../kedro_project_setup/dependencies.md) in the Kedro project documentation. In a [later step of this tutorial](./create_pipelines.md#update-dependencies), we will modify project's dependencies to illustrate how, once you have installed project-specific dependencies, you can update them. ## Configure the project -You may optionally add in any credentials to `conf/local/credentials.yml` that you would need to load specific data sources like usernames and passwords. Some examples are given within the file to illustrate how you store credentials. Additional information can be found in the [advanced documentation on configuration](../04_kedro_project_setup/02_configuration.md). +You may optionally add in any credentials to `conf/local/credentials.yml` that you would need to load specific data sources like usernames and passwords. Some examples are given within the file to illustrate how you store credentials. Additional information can be found in the [advanced documentation on configuration](../kedro_project_setup/configuration.md). -At this stage of the workflow, you may also want to [set up logging](../08_logging/01_logging.md), but we do not use it in this tutorial. +At this stage of the workflow, you may also want to [set up logging](../logging/logging.md), but we do not use it in this tutorial. diff --git a/docs/source/03_tutorial/06_visualise_pipeline.md b/docs/source/tutorial/visualise_pipeline.md similarity index 94% rename from docs/source/03_tutorial/06_visualise_pipeline.md rename to docs/source/tutorial/visualise_pipeline.md index 5060a049fb..856474b843 100644 --- a/docs/source/03_tutorial/06_visualise_pipeline.md +++ b/docs/source/tutorial/visualise_pipeline.md @@ -1,6 +1,6 @@ # Visualise pipelines -[Kedro-Viz](https://github.com/kedro-org/kedro-viz) displays data and machine-learning pipelines in an informative way, emphasising the connections between datasets and nodes. It shows the structure of your Kedro pipeline. This exercise assumes that you have been following the [Spaceflights tutorial](01_spaceflights_tutorial.md). +[Kedro-Viz](https://github.com/kedro-org/kedro-viz) displays data and machine-learning pipelines in an informative way, emphasising the connections between datasets and nodes. It shows the structure of your Kedro pipeline. This exercise assumes that you have been following the [Spaceflights tutorial](spaceflights_tutorial.md). ## Install Kedro-Viz @@ -25,7 +25,7 @@ You may also use the `--autoreload` flag to autoreload Kedro Viz when a `Python` ![](../meta/images/kedro_viz_autoreload.gif) -If a visualisation panel opens up and a pipeline is not visible then please check that your [pipeline definition](04_create_pipelines.md) is complete. 
All other errors can be logged as GitHub Issues on the [Kedro-Viz repository](https://github.com/kedro-org/kedro-viz). +If a visualisation panel opens up and a pipeline is not visible then please check that your [pipeline definition](create_pipelines.md) is complete. All other errors can be logged as GitHub Issues on the [Kedro-Viz repository](https://github.com/kedro-org/kedro-viz). ## Exit an open visualisation @@ -33,7 +33,7 @@ You exit this visualisation by closing the open browser and entering **Ctrl+C** ## Visualise layers -A pipeline can be broken up into different layers according to how data is processed, and using a convention for layers makes it easier to collaborate. For example, the [data engineering convention](../12_faq/01_faq.md#what-is-data-engineering-convention) shown here labels datasets according to the stage of the pipeline (e.g. whether the data has been cleaned). +A pipeline can be broken up into different layers according to how data is processed, and using a convention for layers makes it easier to collaborate. For example, the [data engineering convention](../faq/faq.md#what-is-data-engineering-convention) shown here labels datasets according to the stage of the pipeline (e.g. whether the data has been cleaned). Kedro-Viz makes it easy to visualise these data processing stages by adding a `layer` attribute to the datasets in the Data Catalog. We will be modifying `catalog.yml` with the following: diff --git a/features/build_docs.feature b/features/build_docs.feature index 2408031d1f..1797a06e14 100644 --- a/features/build_docs.feature +++ b/features/build_docs.feature @@ -5,7 +5,7 @@ Feature: build-docs target in new project Given I have prepared a config file And I have run a non-interactive kedro new with starter And I have updated kedro requirements - And I have executed the kedro command "install" + And I have installed the project dependencies When I execute the kedro command "build-docs" Then I should get a successful exit code And docs should be generated diff --git a/features/install.feature b/features/install.feature deleted file mode 100644 index 4569a599aa..0000000000 --- a/features/install.feature +++ /dev/null @@ -1,18 +0,0 @@ -Feature: install target in new project - Background: - Given I have prepared a config file - And I have run a non-interactive kedro new with starter - And I have updated kedro requirements - Then src/requirements.in must not exist - - @fresh_venv - Scenario: Execute install target - When I execute the kedro command "install" - Then I should get a successful exit code - And src/requirements.in file must exist - - @fresh_venv - Scenario: Execute install target without compiled requirements - When I execute the kedro command "install --no-build-reqs" - Then I should get a successful exit code - And src/requirements.in must not exist diff --git a/features/load_context.feature b/features/load_context.feature index 5b636639af..826ed52a6d 100644 --- a/features/load_context.feature +++ b/features/load_context.feature @@ -22,12 +22,6 @@ Feature: Custom Kedro project And I should get a message including "Registered hooks from 1 installed plugin(s): test-plugin-0.1" And I should get a message including "Reached after_catalog_created hook" - Scenario: Pipelines from installed plugins are added to the project's pipelines - Given I have installed the test plugin - When I execute the kedro command "run --pipeline from_plugin" - Then I should get a successful exit code - And I should get a message including "Registered hooks from 1 installed 
plugin(s): test-plugin-0.1" - Scenario: Disable automatically registered plugin hooks Given I have installed the test plugin And I have disabled hooks for "test-plugin" plugin via config diff --git a/features/package.feature b/features/package.feature index 7132bfdce3..638c98de07 100644 --- a/features/package.feature +++ b/features/package.feature @@ -3,7 +3,7 @@ Feature: Package target in new project Background: Given I have prepared a config file And I have run a non-interactive kedro new with starter - And I have executed the kedro command "install --no-build-reqs" + And I have installed the project dependencies @fresh_venv Scenario: Install package diff --git a/features/run.feature b/features/run.feature index 527985e812..79d47e55d2 100644 --- a/features/run.feature +++ b/features/run.feature @@ -14,7 +14,7 @@ Feature: Run Project Scenario: Run parallel runner with default python entry point with example code Given I have prepared a config file And I have run a non-interactive kedro new with starter - When I execute the kedro command "run --parallel" + When I execute the kedro command "run --runner=ParallelRunner" Then I should get a successful exit code And the console log should show that "split_data" was run And the console log should show that "train_model" was run diff --git a/features/steps/cli_steps.py b/features/steps/cli_steps.py index d5b4ae56a5..ab55519545 100644 --- a/features/steps/cli_steps.py +++ b/features/steps/cli_steps.py @@ -161,12 +161,16 @@ def create_config_file(context): yaml.dump(config, config_file, default_flow_style=False) -@given('I have executed the kedro command "{command}"') -def exec_kedro_target_checked(context, command): - """Execute Kedro command and check the status.""" - cmd = [context.kedro] + command.split() - - res = run(cmd, env=context.env, cwd=str(context.root_project_dir)) +@given("I have installed the project dependencies") +@when("I install the project dependencies") +def pip_install_dependencies(context): + """Install project dependencies using pip.""" + reqs_path = "src/requirements.txt" + res = run( + [context.pip, "install", "-r", reqs_path], + env=context.env, + cwd=str(context.root_project_dir), + ) if res.returncode != OK_EXIT_CODE: print(res.stdout) @@ -219,7 +223,7 @@ def uninstall_package_via_pip(context, package): @when("I install the project's python package") def install_project_package_via_pip(context): """Install a python package using pip.""" - dist_dir = context.root_project_dir / "src" / "dist" + dist_dir = context.root_project_dir / "dist" (whl_file,) = dist_dir.glob("*.whl") run([context.pip, "install", str(whl_file)], env=context.env) @@ -321,6 +325,7 @@ def commit_changes_to_git(context): check_run(f"git commit -m 'Change {time()}'") +@given('I have executed the kedro command "{command}"') @when('I execute the kedro command "{command}"') def exec_kedro_target(context, command): """Execute Kedro target.""" @@ -461,7 +466,7 @@ def update_kedro_req(context: behave.runner.Context): @when("I add {dependency} to the requirements") def add_req(context: behave.runner.Context, dependency: str): - reqs_path = context.root_project_dir / "src" / "requirements.in" + reqs_path = context.root_project_dir / "src" / "requirements.txt" if reqs_path.is_file(): reqs_path.write_text(reqs_path.read_text() + "\n" + str(dependency) + "\n") @@ -557,7 +562,7 @@ def check_failed_status_code(context): @then("the relevant packages should be created") def check_python_packages_created(context): """Check that egg and whl files exist in dist dir.""" 
- dist_dir = context.root_project_dir / "src" / "dist" + dist_dir = context.root_project_dir / "dist" egg_file = dist_dir.glob("*.egg") whl_file = dist_dir.glob("*.whl") assert any(egg_file) @@ -690,13 +695,10 @@ def check_docs_generated(context: behave.runner.Context): @then("requirements should be generated") def check_reqs_generated(context: behave.runner.Context): - """Check that new project docs are generated.""" - reqs_path = context.root_project_dir / "src" / "requirements.in" + """Check that new project requirements are generated.""" + reqs_path = context.root_project_dir / "src" / "requirements.lock" assert reqs_path.is_file() - assert ( - "This file is autogenerated by pip-compile" - in (context.root_project_dir / "src" / "requirements.txt").read_text() - ) + assert "This file is autogenerated by pip-compile" in reqs_path.read_text() @then("{dependency} should be in the requirements") diff --git a/features/steps/sh_run.py b/features/steps/sh_run.py index e80152bb95..e994363bf0 100644 --- a/features/steps/sh_run.py +++ b/features/steps/sh_run.py @@ -83,7 +83,9 @@ def __init__(self, cmd: List[str], **kwargs) -> None: **kwargs: keyword arguments such as env and cwd """ - super().__init__(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs) + super().__init__( # type: ignore + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs + ) def terminate(self) -> None: """Terminate process and children.""" diff --git a/features/steps/test_plugin/plugin.py b/features/steps/test_plugin/plugin.py index f979f8c096..876c40bd69 100644 --- a/features/steps/test_plugin/plugin.py +++ b/features/steps/test_plugin/plugin.py @@ -2,7 +2,6 @@ import logging from kedro.framework.hooks import hook_impl -from kedro.pipeline import node, pipeline class MyPluginHook: @@ -12,11 +11,5 @@ def after_catalog_created( ): # pylint: disable=unused-argument,no-self-use logging.info("Reached after_catalog_created hook") - @hook_impl - def register_pipelines(self): # pylint: disable=no-self-use - return { - "from_plugin": pipeline([node(lambda: "sth", inputs=None, outputs="x")]) - } - hooks = MyPluginHook() diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/README.md b/features/steps/test_starter/{{ cookiecutter.repo_name }}/README.md index 9679c653cc..2bca835cf3 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/README.md +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/README.md @@ -11,7 +11,7 @@ Take a look at the [Kedro documentation](https://kedro.readthedocs.io) to get st In order to get the best out of the template: * Don't remove any lines from the `.gitignore` file we provide -* Make sure your results can be reproduced by following a [data engineering convention](https://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention) +* Make sure your results can be reproduced by following a [data engineering convention](https://kedro.readthedocs.io/en/stable/faq/faq.html#what-is-data-engineering-convention) * Don't commit data to your repository * Don't commit any credentials or your local configuration to your repository. 
Keep all your credentials and local configuration in `conf/local/` @@ -22,7 +22,7 @@ Declare any dependencies in `src/requirements.txt` for `pip` installation and `s To install them, run: ``` -kedro install +pip install -r src/requirements.txt ``` ## How to run Kedro @@ -52,11 +52,11 @@ To generate or update the dependency requirements for your project: kedro build-reqs ``` -This will copy the contents of `src/requirements.txt` into a new file `src/requirements.in` which will be used as the source for `pip-compile`. You can see the output of the resolution by opening `src/requirements.txt`. +This will `pip-compile` the contents of `src/requirements.txt` into a new file `src/requirements.lock`. You can see the output of the resolution by opening `src/requirements.lock`. -After this, if you'd like to update your project requirements, please update `src/requirements.in` and re-run `kedro build-reqs`. +After this, if you'd like to update your project requirements, please update `src/requirements.txt` and re-run `kedro build-reqs`. -[Further information about project dependencies](https://kedro.readthedocs.io/en/stable/04_kedro_project_setup/01_dependencies.html#project-specific-dependencies) +[Further information about project dependencies](https://kedro.readthedocs.io/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies) ## How to work with Kedro and notebooks @@ -96,7 +96,7 @@ kedro ipython ``` ### How to convert notebook cells to nodes in a Kedro project -You can move notebook code over into a Kedro project structure using a mixture of [cell tagging](https://jupyter-notebook.readthedocs.io/en/stable/changelog.html#release-5-0-0) and Kedro CLI commands. +You can move notebook code over into a Kedro project structure using a mixture of [cell tagging](https://jupyter-notebook.readthedocs.io/en/stable/changelog.html#id35) and Kedro CLI commands. By adding the `node` tag to a cell and running the command below, the cell's source code will be copied over to a Python file within `src//nodes/`: @@ -118,4 +118,4 @@ To automatically strip out all output cell contents before committing to `git`, ## Package your Kedro project -[Further information about building project documentation and packaging your project](https://kedro.readthedocs.io/en/stable/03_tutorial/05_package_a_project.html) +[Further information about building project documentation and packaging your project](https://kedro.readthedocs.io/en/stable/tutorial/package_a_project.html) diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml index de74c169cf..c0c61a3a2c 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml @@ -1,7 +1,7 @@ # Here you can define all your data sets by using simple YAML syntax. # # Documentation for this file format can be found in "The Data Catalog" -# Link: https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html +# Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html # # We support interacting with a variety of data stores including local file systems, cloud, network and HDFS # @@ -37,7 +37,7 @@ # # The Data Catalog supports being able to reference the same file using two different DataSet implementations # (transcoding), templating and a way to reuse arguments that are frequently repeated. 
See more here: -# https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html +# https://kedro.readthedocs.io/en/stable/data/data_catalog.html # # This is a data set used by the "Hello World" example pipeline provided with the project # template. Please feel free to remove it once you remove the example pipeline. diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/logging.yml b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/logging.yml index 3689418056..a4dcad2e08 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/logging.yml +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/logging.yml @@ -34,12 +34,6 @@ handlers: encoding: utf8 delay: True - journal_file_handler: - class: kedro.versioning.journal.JournalFileHandler - level: INFO - base_dir: logs/journals - formatter: json_formatter - loggers: anyconfig: level: WARNING @@ -56,11 +50,6 @@ loggers: handlers: [console, info_file_handler, error_file_handler] propagate: no - kedro.journal: - level: INFO - handlers: [journal_file_handler] - propagate: no - root: level: INFO handlers: [console, info_file_handler, error_file_handler] diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/requirements.txt b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/requirements.txt index 0833d99f63..0fcbb356bd 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/requirements.txt +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/requirements.txt @@ -6,7 +6,7 @@ jupyter~=1.0 jupyter_client>=5.1, <7.0 jupyterlab~=3.0 kedro[pandas.CSVDataSet]=={{ cookiecutter.kedro_version }} -kedro-telemetry~=0.1.0 +kedro-telemetry~=0.1.0; python_version < '3.9' nbstripout~=0.4 pytest-cov~=3.0 pytest-mock>=1.7.1, <2.0 diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/setup.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/setup.py index 5e80d6d547..de5796b810 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/setup.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/setup.py @@ -13,7 +13,7 @@ requires = [] for line in f: req = line.split("#", 1)[0].strip() - if req and not req.startswith("--"): + if req and not req.startswith("-r"): requires.append(req) setup( diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/tests/test_run.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/tests/test_run.py index d13cacfa09..cec7571094 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/tests/test_run.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/tests/test_run.py @@ -11,17 +11,27 @@ from pathlib import Path import pytest +from kedro.config import ConfigLoader from kedro.framework.context import KedroContext @pytest.fixture -def project_context(): - return KedroContext(package_name="{{ cookiecutter.python_package }}", project_path=Path.cwd()) +def config_loader(): + return ConfigLoader(conf_source=str(Path.cwd())) + + +@pytest.fixture +def project_context(config_loader): + return KedroContext( + package_name="{{ cookiecutter.python_package }}", + project_path=Path.cwd(), + config_loader=config_loader, + ) # The tests below are here for the demonstration purpose # and should be replaced with the ones testing the project # functionality class TestProjectContext: - def test_package_name(self, project_context): - assert project_context.package_name == "{{ 
cookiecutter.python_package }}" + def test_project_path(self, project_context): + assert project_context.project_path == Path.cwd() diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py index 628f18a70e..5f74f6b303 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py @@ -20,7 +20,7 @@ def _find_run_command(package_name): if run: # use run command from installed plugin if it exists return run - # use run command from the framework project + # use run command from `kedro.framework.cli.project` from kedro.framework.cli.project import run return run diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py deleted file mode 100644 index 98dffc60d4..0000000000 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Project hooks.""" -from typing import Any, Dict, Iterable, Optional - -from kedro.config import ConfigLoader -from kedro.framework.hooks import hook_impl -from kedro.io import DataCatalog -from kedro.versioning import Journal - - -class ProjectHooks: - @hook_impl - def register_config_loader(self, conf_paths: Iterable[str]) -> ConfigLoader: - return ConfigLoader(conf_paths) - - @hook_impl - def register_catalog( - self, - catalog: Optional[Dict[str, Dict[str, Any]]], - credentials: Dict[str, Dict[str, Any]], - load_versions: Dict[str, str], - save_version: str, - journal: Journal, - ) -> DataCatalog: - return DataCatalog.from_config( - catalog, credentials, load_versions, save_version, journal - ) diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/__init__.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/__init__.py index a6a46b2038..a3b6f56ce5 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/__init__.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/__init__.py @@ -5,5 +5,3 @@ """ from .pipeline import create_pipeline # NOQA - -__version__ = "4.20.69" diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py index cc573c8b8b..328eed3dc2 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py @@ -1,8 +1,8 @@ """Project settings.""" -from {{cookiecutter.python_package}}.hooks import ProjectHooks -# Instantiate and list your project hooks here -HOOKS = (ProjectHooks(),) +# Instantiate and list your custom project hooks here +# from {{cookiecutter.python_package}}.hooks import ProjectHooks +# HOOKS = (ProjectHooks(),) # List 
the installed plugins for which to disable auto-registry # DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",) @@ -20,4 +20,4 @@ # CONTEXT_CLASS = KedroContext # Define the configuration folder. Defaults to `conf` -# CONF_ROOT = "conf" +# CONF_SOURCE = "conf" diff --git a/features/windows_reqs.txt b/features/windows_reqs.txt index 587d9ff03c..32df180278 100644 --- a/features/windows_reqs.txt +++ b/features/windows_reqs.txt @@ -2,8 +2,8 @@ # e2e tests on Windows are slow but we don't need to install # everything, so just this subset will be enough for CI behave==1.2.6 -pandas>=0.24.0, <1.0.4 -psutil==5.6.7 +pandas~=1.2 +psutil==5.8.0 requests~=2.20 toml~=0.10.1 PyYAML>=4.2, <6.0 diff --git a/kedro/config/__init__.py b/kedro/config/__init__.py index 997a18baef..f632c4039a 100644 --- a/kedro/config/__init__.py +++ b/kedro/config/__init__.py @@ -2,11 +2,16 @@ configuration from different file formats. """ - -from .config import BadConfigException, ConfigLoader, MissingConfigException +from .abstract_config import ( + AbstractConfigLoader, + BadConfigException, + MissingConfigException, +) +from .config import ConfigLoader from .templated_config import TemplatedConfigLoader __all__ = [ + "AbstractConfigLoader", "BadConfigException", "ConfigLoader", "MissingConfigException", diff --git a/kedro/config/abstract_config.py b/kedro/config/abstract_config.py new file mode 100644 index 0000000000..5eecd68a29 --- /dev/null +++ b/kedro/config/abstract_config.py @@ -0,0 +1,71 @@ +# Copyright 2021 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. +"""This module provides ``kedro.abstract_config`` with the baseline +class model for a `ConfigLoader` implementation. +""" +from abc import ABC, abstractmethod +from typing import Any, Dict + + +class AbstractConfigLoader(ABC): + """ + ``AbstractConfigLoader`` is the abstract base class + for all `ConfigLoader` implementations. + All user-defined `ConfigLoader` implementations should inherit + from `AbstractConfigLoader` and implement all relevant abstract methods. 
+ """ + + def __init__( + self, + conf_source: str, + env: str = None, + runtime_params: Dict[str, Any] = None, + **kwargs # pylint: disable=unused-argument + ): + self.conf_source = conf_source + self.env = env + self.runtime_params = runtime_params + + @abstractmethod # pragma: no cover + def get(self) -> Dict[str, Any]: + """Required method to get all configurations.""" + pass + + +class BadConfigException(Exception): + """Raised when a configuration file cannot be loaded, for instance + due to wrong syntax or poor formatting. + """ + + pass + + +class MissingConfigException(Exception): + """Raised when no configuration files can be found within a config path""" + + pass diff --git a/kedro/config/common.py b/kedro/config/common.py new file mode 100644 index 0000000000..3f8d4c0f4a --- /dev/null +++ b/kedro/config/common.py @@ -0,0 +1,264 @@ +# Copyright 2021 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module contains methods and facade interfaces for various ConfigLoader +implementations. +""" + +import logging +from glob import iglob +from pathlib import Path +from typing import AbstractSet, Any, Dict, Iterable, List, Set +from warnings import warn + +from kedro.config import BadConfigException, MissingConfigException + +SUPPORTED_EXTENSIONS = [ + ".yml", + ".yaml", + ".json", + ".ini", + ".pickle", + ".properties", + ".xml", +] +_config_logger = logging.getLogger(__name__) + + +def _get_config_from_patterns( + conf_paths: Iterable[str], + patterns: Iterable[str] = None, + ac_template: bool = False, +) -> Dict[str, Any]: + """Recursively scan for configuration files, load and merge them, and + return them in the form of a config dictionary. + + Args: + conf_paths: List of configuration paths to directories + patterns: Glob patterns to match. Files, which names match + any of the specified patterns, will be processed. + ac_template: Boolean flag to indicate whether to use the `ac_template` + argument of the ``anyconfig.load`` method. Used in the context of + `_load_config_file` function. 
+ + Raises: + ValueError: If 2 or more configuration files inside the same + config path (or its subdirectories) contain the same + top-level key. + MissingConfigException: If no configuration files exist within + a specified config path. + BadConfigException: If configuration is poorly formatted and + cannot be loaded. + + Returns: + Dict[str, Any]: A Python dictionary with the combined + configuration from all configuration files. **Note:** any keys + that start with `_` will be ignored. + """ + + if not patterns: + raise ValueError( + "`patterns` must contain at least one glob " + "pattern to match config filenames against." + ) + + config = {} # type: Dict[str, Any] + processed_files = set() # type: Set[Path] + + for conf_path in conf_paths: + if not Path(conf_path).is_dir(): + raise ValueError( + f"Given configuration path either does not exist " + f"or is not a valid directory: {conf_path}" + ) + + config_filepaths = _lookup_config_filepaths( + Path(conf_path), patterns, processed_files, _config_logger + ) + new_conf = _load_configs( + config_filepaths=config_filepaths, ac_template=ac_template + ) + + common_keys = config.keys() & new_conf.keys() + if common_keys: + sorted_keys = ", ".join(sorted(common_keys)) + msg = ( + "Config from path `%s` will override the following " + "existing top-level config keys: %s" + ) + _config_logger.info(msg, conf_path, sorted_keys) + + config.update(new_conf) + processed_files |= set(config_filepaths) + + if not processed_files: + raise MissingConfigException( + f"No files found in {conf_paths} matching the glob " + f"pattern(s): {patterns}" + ) + return config + + +def _load_config_file(config_file: Path, ac_template: bool = False) -> Dict[str, Any]: + """Load an individual config file using `anyconfig` as a backend. + + Args: + config_file: Path to a config file to process. + ac_template: Boolean flag to indicate whether to use the `ac_template` + argument of the ``anyconfig.load`` method. + + Raises: + BadConfigException: If configuration is poorly formatted and + cannot be loaded. + + Returns: + Parsed configuration. + """ + # for performance reasons + import anyconfig # pylint: disable=import-outside-toplevel + + try: + # Default to UTF-8, which is Python 3 default encoding, to decode the file + with open(config_file, encoding="utf8") as yml: + return { + k: v + for k, v in anyconfig.load(yml, ac_template=ac_template).items() + if not k.startswith("_") + } + except AttributeError as exc: + raise BadConfigException(f"Couldn't load config file: {config_file}") from exc + + +def _load_configs(config_filepaths: List[Path], ac_template: bool) -> Dict[str, Any]: + """Recursively load all configuration files, which satisfy + a given list of glob patterns from a specific path. + + Args: + config_filepaths: Configuration files sorted in the order of precedence. + ac_template: Boolean flag to indicate whether to use the `ac_template` + argument of the ``anyconfig.load`` method. Used in the context of + `_load_config_file` function. + + Raises: + ValueError: If 2 or more configuration files contain the same key(s). + BadConfigException: If configuration is poorly formatted and + cannot be loaded. + + Returns: + Resulting configuration dictionary. 
+ + """ + + aggregate_config = {} + seen_file_to_keys = {} # type: Dict[Path, AbstractSet[str]] + + for config_filepath in config_filepaths: + single_config = _load_config_file(config_filepath, ac_template=ac_template) + _check_duplicate_keys(seen_file_to_keys, config_filepath, single_config) + seen_file_to_keys[config_filepath] = single_config.keys() + aggregate_config.update(single_config) + + return aggregate_config + + +def _lookup_config_filepaths( + conf_path: Path, + patterns: Iterable[str], + processed_files: Set[Path], + logger: Any, +) -> List[Path]: + config_files = _path_lookup(conf_path, patterns) + + seen_files = config_files & processed_files + if seen_files: + logger.warning( + "Config file(s): %s already processed, skipping loading...", + ", ".join(str(seen) for seen in sorted(seen_files)), + ) + config_files -= seen_files + + return sorted(config_files) + + +def _remove_duplicates(items: Iterable[str]): + """Remove duplicates while preserving the order.""" + unique_items = [] # type: List[str] + for item in items: + if item not in unique_items: + unique_items.append(item) + else: + warn( + f"Duplicate environment detected! " + f"Skipping re-loading from configuration path: {item}" + ) + return unique_items + + +def _check_duplicate_keys( + processed_files: Dict[Path, AbstractSet[str]], filepath: Path, conf: Dict[str, Any] +) -> None: + duplicates = [] + + for processed_file, keys in processed_files.items(): + overlapping_keys = conf.keys() & keys + + if overlapping_keys: + sorted_keys = ", ".join(sorted(overlapping_keys)) + if len(sorted_keys) > 100: + sorted_keys = sorted_keys[:100] + "..." + duplicates.append(f"{processed_file}: {sorted_keys}") + + if duplicates: + dup_str = "\n- ".join(duplicates) + raise ValueError(f"Duplicate keys found in {filepath} and:\n- {dup_str}") + + +def _path_lookup(conf_path: Path, patterns: Iterable[str]) -> Set[Path]: + """Return a set of all configuration files from ``conf_path`` or + its subdirectories, which satisfy a given list of glob patterns. + + Args: + conf_path: Path to configuration directory. + patterns: List of glob patterns to match the filenames against. + + Returns: + A set of paths to configuration files. + + """ + config_files = set() + conf_path = conf_path.resolve() + + for pattern in patterns: + # `Path.glob()` ignores the files if pattern ends with "**", + # therefore iglob is used instead + for each in iglob(str(conf_path / pattern), recursive=True): + path = Path(each).resolve() + if path.is_file() and path.suffix in SUPPORTED_EXTENSIONS: + config_files.add(path) + + return config_files diff --git a/kedro/config/config.py b/kedro/config/config.py index 70bb2daa28..c680d16e5c 100644 --- a/kedro/config/config.py +++ b/kedro/config/config.py @@ -1,51 +1,31 @@ """This module provides ``kedro.config`` with the functionality to load one or more configuration files from specified paths. 
""" -import logging -from glob import iglob from pathlib import Path -from typing import AbstractSet, Any, Dict, Iterable, List, Set, Union -from warnings import warn +from typing import Any, Dict, Iterable -SUPPORTED_EXTENSIONS = [ - ".yml", - ".yaml", - ".json", - ".ini", - ".pickle", - ".properties", - ".xml", -] +from kedro.config import AbstractConfigLoader +from kedro.config.common import _get_config_from_patterns, _remove_duplicates -class MissingConfigException(Exception): - """Raised when no configuration files can be found within a config path""" - - pass - - -class BadConfigException(Exception): - """Raised when a configuration file cannot be loaded, for instance - due to wrong syntax or poor formatting. - """ - - pass - - -class ConfigLoader: - """Recursively scan the directories specified in ``conf_paths`` for +class ConfigLoader(AbstractConfigLoader): + """Recursively scan directories (config paths) contained in ``conf_source`` for configuration files with a ``yaml``, ``yml``, ``json``, ``ini``, ``pickle``, ``xml`` or ``properties`` extension, load them, and return them in the form of a config dictionary. + The first processed config path is the ``base`` directory inside + ``conf_source``. The optional ``env`` argument can be used to specify a + subdirectory of ``conf_source`` to process as a config path after ``base``. + When the same top-level key appears in any 2 config files located in - the same ``conf_path`` (sub)directory, a ``ValueError`` is raised. + the same (sub)directory, a ``ValueError`` is raised. When the same key appears in any 2 config files located in different - ``conf_path`` directories, the last processed config path takes - precedence and overrides this key. + (sub)directories, the last processed config path takes precedence + and overrides this key. - For example, if your ``conf_path`` looks like this: + For example, if your ``conf_source`` looks like this: :: . @@ -72,8 +52,7 @@ class ConfigLoader: >>> import logging.config >>> from kedro.config import ConfigLoader >>> - >>> conf_paths = ['conf/base', 'conf/local'] - >>> conf_loader = ConfigLoader(conf_paths) + >>> conf_loader = ConfigLoader('conf', 'local') >>> >>> conf_logging = conf_loader.get('logging*') >>> logging.config.dictConfig(conf_logging) # set logging conf @@ -83,218 +62,47 @@ class ConfigLoader: """ - def __init__(self, conf_paths: Union[str, Iterable[str]]): - """Instantiate a ConfigLoader. - - Args: - conf_paths: Non-empty path or list of paths to configuration - directories. - Raises: - ValueError: If ``conf_paths`` is empty. - - """ - if not conf_paths: - raise ValueError( - "`conf_paths` must contain at least one path to " - "load configuration files from." - ) - if isinstance(conf_paths, str): - conf_paths = [conf_paths] - - self.conf_paths = _remove_duplicates(conf_paths) - self.logger = logging.getLogger(__name__) - - @staticmethod - def _load_config_file(config_file: Path) -> Dict[str, Any]: - """Load an individual config file using `anyconfig` as a backend. + def __init__( + self, + conf_source: str, + env: str = None, + runtime_params: Dict[str, Any] = None, + *, + base_env: str = "base", + default_run_env: str = "local", + ): + """Instantiates a ``ConfigLoader``. Args: - config_file: Path to a config file to process. - - Raises: - BadConfigException: If configuration is poorly formatted and - cannot be loaded. - - Returns: - Parsed configuration. 
- """ - # for performance reasons - import anyconfig # pylint: disable=import-outside-toplevel - - try: - # Default to UTF-8, which is Python 3 default encoding, to decode the file - with open(config_file, encoding="utf8") as yml: - return { - k: v - for k, v in anyconfig.load(yml).items() - if not k.startswith("_") - } - except AttributeError as exc: - raise BadConfigException( - f"Couldn't load config file: {config_file}" - ) from exc - - def _load_configs(self, config_filepaths: List[Path]) -> Dict[str, Any]: - """Recursively load all configuration files, which satisfy - a given list of glob patterns from a specific path. - - Args: - config_filepaths: Configuration files sorted in the order of precedence. - - Raises: - ValueError: If 2 or more configuration files contain the same key(s). - BadConfigException: If configuration is poorly formatted and - cannot be loaded. - - Returns: - Resulting configuration dictionary. - + conf_source: Path to use as root directory for loading configuration. + env: Environment that will take precedence over base. + runtime_params: Extra parameters passed to a Kedro run. + base_env: Name of the base environment. Defaults to `"base"`. + This is used in the `conf_paths` property method to construct + the configuration paths. + default_run_env: Name of the base environment. Defaults to `"local"`. + This is used in the `conf_paths` property method to construct + the configuration paths. Can be overriden by supplying the `env` argument. """ + super().__init__( + conf_source=conf_source, env=env, runtime_params=runtime_params + ) + self.base_env = base_env + self.default_run_env = default_run_env - aggregate_config = {} - seen_file_to_keys = {} # type: Dict[Path, AbstractSet[str]] - - for config_filepath in config_filepaths: - single_config = self._load_config_file(config_filepath) - _check_duplicate_keys(seen_file_to_keys, config_filepath, single_config) - seen_file_to_keys[config_filepath] = single_config.keys() - aggregate_config.update(single_config) - - return aggregate_config - - def _lookup_config_filepaths( - self, conf_path: Path, patterns: Iterable[str], processed_files: Set[Path] - ) -> List[Path]: - config_files = _path_lookup(conf_path, patterns) - - seen_files = config_files & processed_files - if seen_files: - self.logger.warning( - "Config file(s): %s already processed, skipping loading...", - ", ".join(str(seen) for seen in sorted(seen_files)), - ) - config_files -= seen_files - - return sorted(config_files) + @property + def conf_paths(self): + """Property method to return deduplicated configuration paths.""" + return _remove_duplicates(self._build_conf_paths()) def get(self, *patterns: str) -> Dict[str, Any]: - """Recursively scan for configuration files, load and merge them, and - return them in the form of a config dictionary. - - Args: - *patterns: Glob patterns to match. Files, which names match - any of the specified patterns, will be processed. - - Raises: - ValueError: If 2 or more configuration files inside the same - config path (or its subdirectories) contain the same - top-level key. - MissingConfigException: If no configuration files exist within - a specified config path. - BadConfigException: If configuration is poorly formatted and - cannot be loaded. - - Returns: - Dict[str, Any]: A Python dictionary with the combined - configuration from all configuration files. **Note:** any keys - that start with `_` will be ignored. 
- """ - - if not patterns: - raise ValueError( - "`patterns` must contain at least one glob " - "pattern to match config filenames against." - ) - - config = {} # type: Dict[str, Any] - processed_files = set() # type: Set[Path] - - for conf_path in self.conf_paths: - if not Path(conf_path).is_dir(): - raise ValueError( - f"Given configuration path either does not exist " - f"or is not a valid directory: {conf_path}" - ) - - config_filepaths = self._lookup_config_filepaths( - Path(conf_path), patterns, processed_files - ) - new_conf = self._load_configs(config_filepaths) - - common_keys = config.keys() & new_conf.keys() - if common_keys: - sorted_keys = ", ".join(sorted(common_keys)) - msg = ( - "Config from path `%s` will override the following " - "existing top-level config keys: %s" - ) - self.logger.info(msg, conf_path, sorted_keys) - - config.update(new_conf) - processed_files |= set(config_filepaths) - - if not processed_files: - raise MissingConfigException( - f"No files found in {self.conf_paths} matching the glob " - f"pattern(s): {list(patterns)}" - ) - return config - - -def _check_duplicate_keys( - processed_files: Dict[Path, AbstractSet[str]], filepath: Path, conf: Dict[str, Any] -) -> None: - duplicates = [] - - for processed_file, keys in processed_files.items(): - overlapping_keys = conf.keys() & keys - - if overlapping_keys: - sorted_keys = ", ".join(sorted(overlapping_keys)) - if len(sorted_keys) > 100: - sorted_keys = sorted_keys[:100] + "..." - duplicates.append(f"{processed_file}: {sorted_keys}") - - if duplicates: - dup_str = "\n- ".join(duplicates) - raise ValueError(f"Duplicate keys found in {filepath} and:\n- {dup_str}") - - -def _path_lookup(conf_path: Path, patterns: Iterable[str]) -> Set[Path]: - """Return a set of all configuration files from ``conf_path`` or - its subdirectories, which satisfy a given list of glob patterns. - - Args: - conf_path: Path to configuration directory. - patterns: List of glob patterns to match the filenames against. - - Returns: - A set of paths to configuration files. - - """ - config_files = set() - conf_path = conf_path.resolve() - - for pattern in patterns: - # `Path.glob()` ignores the files if pattern ends with "**", - # therefore iglob is used instead - for each in iglob(str(conf_path / pattern), recursive=True): - path = Path(each).resolve() - if path.is_file() and path.suffix in SUPPORTED_EXTENSIONS: - config_files.add(path) - - return config_files - - -def _remove_duplicates(items: Iterable[str]): - """Remove duplicates while preserving the order.""" - unique_items = [] # type: List[str] - for item in items: - if item not in unique_items: - unique_items.append(item) - else: - warn( - f"Duplicate environment detected! 
" - f"Skipping re-loading from configuration path: {item}" - ) - return unique_items + return _get_config_from_patterns( + conf_paths=self.conf_paths, patterns=list(patterns) + ) + + def _build_conf_paths(self) -> Iterable[str]: + run_env = self.env or self.default_run_env + return [ + str(Path(self.conf_source) / self.base_env), + str(Path(self.conf_source) / run_env), + ] diff --git a/kedro/config/templated_config.py b/kedro/config/templated_config.py index 7c62a6d00a..af9404aa2b 100644 --- a/kedro/config/templated_config.py +++ b/kedro/config/templated_config.py @@ -5,11 +5,12 @@ import re from copy import deepcopy from pathlib import Path -from typing import Any, Dict, Iterable, Optional, Union +from typing import Any, Dict, Iterable, Optional import jmespath -from kedro.config.config import ConfigLoader +from kedro.config import AbstractConfigLoader +from kedro.config.common import _get_config_from_patterns, _remove_duplicates IDENTIFIER_PATTERN = re.compile( r"""\$\{ @@ -23,31 +24,25 @@ ) -class TemplatedConfigLoader(ConfigLoader): +class TemplatedConfigLoader(AbstractConfigLoader): """ Extension of the ``ConfigLoader`` class that allows for template values, wrapped in brackets like: ${...}, to be automatically formatted based on the configs. - The easiest way to use this class is by registering it into the - ``KedroContext`` using hooks. This can be done by updating the - hook implementation `register_config_loader` in `hooks.py`, making it return - a ``TemplatedConfigLoader`` object instead of a ``ConfigLoader`` object. + The easiest way to use this class is by setting the `CONFIG_LOADER_CLASS` constant + in `settings.py`. Example: :: + >>> # in settings.py >>> from kedro.config import TemplatedConfigLoader >>> - >>> - >>> class ProjectHooks: - >>> @hook_impl - >>> def register_config_loader(self, conf_paths: Iterable[str]) -> ConfigLoader: - >>> return TemplatedConfigLoader( - >>> conf_paths, - >>> globals_pattern="*globals.yml", - >>> globals_dict={"param1": "pandas.CSVDataSet"} - >>> ) + >>> CONFIG_LOADER_CLASS = TemplatedConfigLoader + >>> CONFIG_LOADER_ARGS = { + >>> "globals_pattern": "*globals.yml", + >>> } The contents of the dictionary resulting from the `globals_pattern` get merged with the ``globals_dict``. In case of conflicts, the keys in @@ -94,16 +89,23 @@ class TemplatedConfigLoader(ConfigLoader): def __init__( self, - conf_paths: Union[str, Iterable[str]], + conf_source: str, + env: str = None, + runtime_params: Dict[str, Any] = None, *, + base_env: str = "base", + default_run_env: str = "local", globals_pattern: Optional[str] = None, globals_dict: Optional[Dict[str, Any]] = None, ): - """Instantiate a ``TemplatedConfigLoader``. + """Instantiates a ``TemplatedConfigLoader``. Args: - conf_paths: Non-empty path or list of paths to configuration - directories. + conf_source: Path to use as root directory for loading configuration. + env: Environment that will take precedence over base. + runtime_params: Extra parameters passed to a Kedro run. + base_env: + default_run_env: globals_pattern: Optional keyword-only argument specifying a glob pattern. Files that match the pattern will be loaded as a formatting dictionary. @@ -112,31 +114,28 @@ def __init__( obtained from the globals_pattern. In case of duplicate keys, the ``globals_dict`` keys take precedence. 
""" - - super().__init__(conf_paths) - - self._arg_dict = super().get(globals_pattern) if globals_pattern else {} + super().__init__( + conf_source=conf_source, env=env, runtime_params=runtime_params + ) + self.base_env = base_env + self.default_run_env = default_run_env + + self._config_mapping = ( + _get_config_from_patterns( + conf_paths=self.conf_paths, + patterns=list(globals_pattern), + ac_template=False, + ) + if globals_pattern + else {} + ) globals_dict = deepcopy(globals_dict) or {} - self._arg_dict = {**self._arg_dict, **globals_dict} - - @staticmethod - def _load_config_file(config_file: Path) -> Dict[str, Any]: - """Load an individual config file using `anyconfig` as a backend. - - Args: - config_file: Path to a config file to process. + self._config_mapping = {**self._config_mapping, **globals_dict} - Returns: - Parsed configuration. - """ - # for performance reasons - import anyconfig # pylint: disable=import-outside-toplevel - - return { - k: v - for k, v in anyconfig.load(config_file, ac_template=True).items() - if not k.startswith("_") - } + @property + def conf_paths(self): + """Property method to return deduplicated configuration paths.""" + return _remove_duplicates(self._build_conf_paths()) def get(self, *patterns: str) -> Dict[str, Any]: """Tries to resolve the template variables in the config dictionary @@ -159,9 +158,17 @@ def get(self, *patterns: str) -> Dict[str, Any]: Raises: ValueError: malformed config found. """ - - config_raw = super().get(*patterns) - return _format_object(config_raw, self._arg_dict) + config_raw = _get_config_from_patterns( + conf_paths=self.conf_paths, patterns=list(patterns), ac_template=True + ) + return _format_object(config_raw, self._config_mapping) + + def _build_conf_paths(self) -> Iterable[str]: + run_env = self.env or self.default_run_env + return [ + str(Path(self.conf_source) / self.base_env), + str(Path(self.conf_source) / run_env), + ] def _format_object(val: Any, format_dict: Dict[str, Any]) -> Any: diff --git a/kedro/extras/datasets/README.md b/kedro/extras/datasets/README.md index 54d77bc249..bb9ebfa528 100644 --- a/kedro/extras/datasets/README.md +++ b/kedro/extras/datasets/README.md @@ -8,11 +8,11 @@ We support a range of data descriptions, including CSV, Excel, Parquet, Feather, These data descriptions are supported with the APIs of `pandas`, `spark`, `networkx`, `matplotlib`, `yaml` and more. -[The Data Catalog](https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html) allows you to work with a range of file formats on local file systems, network file systems, cloud object stores, and Hadoop. +[The Data Catalog](https://kedro.readthedocs.io/en/stable/data/data_catalog.html) allows you to work with a range of file formats on local file systems, network file systems, cloud object stores, and Hadoop. Here is a full list of [supported data descriptions and APIs](https://kedro.readthedocs.io/en/stable/kedro.extras.datasets.html). ## How can I create my own `AbstractDataSet` implementation? -Take a look at our [instructions on how to create your own `AbstractDataSet` implementation](https://kedro.readthedocs.io/en/stable/07_extend_kedro/03_custom_datasets.html). +Take a look at our [instructions on how to create your own `AbstractDataSet` implementation](https://kedro.readthedocs.io/en/stable/extend_kedro/custom_datasets.html). 
diff --git a/kedro/extras/datasets/holoviews/holoviews_writer.py b/kedro/extras/datasets/holoviews/holoviews_writer.py index fec7900411..28d532eeb9 100644 --- a/kedro/extras/datasets/holoviews/holoviews_writer.py +++ b/kedro/extras/datasets/holoviews/holoviews_writer.py @@ -66,7 +66,7 @@ def __init__( E.g. for ``S3FileSystem`` it should look like: `{'key': '', 'secret': ''}}` save_args: Extra save args passed to `holoviews.save()`. See - http://holoviews.org/reference_manual/holoviews.util.html#holoviews.util.save + https://holoviews.org/reference_manual/holoviews.util.html#holoviews.util.save version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` diff --git a/kedro/extras/datasets/matplotlib/matplotlib_writer.py b/kedro/extras/datasets/matplotlib/matplotlib_writer.py index 7158f01f94..3e52782070 100644 --- a/kedro/extras/datasets/matplotlib/matplotlib_writer.py +++ b/kedro/extras/datasets/matplotlib/matplotlib_writer.py @@ -37,6 +37,16 @@ class MatplotlibWriter(AbstractVersionedDataSet): >>> plt.close() >>> single_plot_writer.save(plt) >>> + >>> # MatplotlibWriter can output other formats as well, such as PDF files. + >>> # For this, we need to specify the format: + >>> plt.plot([1, 2, 3], [4, 5, 6]) + >>> single_plot_writer = MatplotlibWriter( + >>> filepath="matplot_lib_single_plot.pdf", + >>> save_args={"format": "pdf"}, + >>> ) + >>> plt.close() + >>> single_plot_writer.save(plt) + >>> >>> # Saving dictionary of plots >>> plots_dict = dict() >>> for colour in ["blue", "green", "red"]: diff --git a/kedro/extras/datasets/networkx/__init__.py b/kedro/extras/datasets/networkx/__init__.py index 71642ef034..73674c81fe 100644 --- a/kedro/extras/datasets/networkx/__init__.py +++ b/kedro/extras/datasets/networkx/__init__.py @@ -1,9 +1,15 @@ """``AbstractDataSet`` implementation to save and load NetworkX graphs in JSON -format using ``NetworkX``.""" +, GraphML and GML formats using ``NetworkX``.""" -__all__ = ["NetworkXDataSet"] +__all__ = ["GMLDataSet", "GraphMLDataSet", "JSONDataSet"] from contextlib import suppress with suppress(ImportError): - from .networkx_dataset import NetworkXDataSet + from .gml_dataset import GMLDataSet + +with suppress(ImportError): + from .graphml_dataset import GraphMLDataSet + +with suppress(ImportError): + from .json_dataset import JSONDataSet diff --git a/kedro/extras/datasets/networkx/gml_dataset.py b/kedro/extras/datasets/networkx/gml_dataset.py new file mode 100644 index 0000000000..9543e6745a --- /dev/null +++ b/kedro/extras/datasets/networkx/gml_dataset.py @@ -0,0 +1,170 @@ +# Copyright 2021 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""NetworkX ``GMLDataSet`` loads and saves graphs to a graph modelling language (GML) +file using an underlying filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to +create GML data. +""" + +from copy import deepcopy +from pathlib import PurePosixPath +from typing import Any, Dict + +import fsspec +import networkx + +from kedro.io.core import ( + AbstractVersionedDataSet, + Version, + get_filepath_str, + get_protocol_and_path, +) + + +class GMLDataSet(AbstractVersionedDataSet): + """``GMLDataSet`` loads and saves graphs to a GML file using an + underlying filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to + create GML data. + See https://networkx.org/documentation/stable/tutorial.html for details. + + Example: + :: + + >>> from kedro.extras.datasets.networkx import GMLDataSet + >>> import networkx as nx + >>> graph = nx.complete_graph(100) + >>> graph_dataset = GMLDataSet(filepath="test.gml") + >>> graph_dataset.save(graph) + >>> reloaded = graph_dataset.load() + >>> assert nx.is_isomorphic(graph, reloaded) + + """ + + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + + # pylint: disable=too-many-arguments + def __init__( + self, + filepath: str, + load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, + version: Version = None, + credentials: Dict[str, Any] = None, + fs_args: Dict[str, Any] = None, + ) -> None: + """Creates a new instance of ``GMLDataSet``. + + Args: + filepath: Filepath in POSIX format to the NetworkX GML file. + load_args: Arguments passed on to ``networkx.read_gml``. + See the details in + https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.gml.read_gml.html + save_args: Arguments passed on to ``networkx.write_gml``. + See the details in + https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.gml.write_gml.html + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. + credentials: Credentials required to get access to the underlying filesystem. + E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as + to pass to the filesystem's `open` method through nested keys + `open_args_load` and `open_args_save`. + Here you can find all available arguments for `open`: + https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open + All defaults are preserved, except `mode`, which is set to `r` when loading + and to `w` when saving. 
+ """ + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) + _credentials = deepcopy(credentials) or {} + + protocol, path = get_protocol_and_path(filepath, version) + if protocol == "file": + _fs_args.setdefault("auto_mkdir", True) + + self._protocol = protocol + self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + + super().__init__( + filepath=PurePosixPath(path), + version=version, + exists_function=self._fs.exists, + glob_function=self._fs.glob, + ) + + # Handle default load and save arguments + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + _fs_open_args_load.setdefault("mode", "rb") + _fs_open_args_save.setdefault("mode", "wb") + self._fs_open_args_load = _fs_open_args_load + self._fs_open_args_save = _fs_open_args_save + + def _load(self) -> networkx.Graph: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: + data = networkx.read_gml(fs_file, **self._load_args) + return data + + def _save(self, data: networkx.Graph) -> None: + save_path = get_filepath_str(self._get_save_path(), self._protocol) + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + networkx.write_gml(data, fs_file, **self._save_args) + self._invalidate_cache() + + def _exists(self) -> bool: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + return self._fs.exists(load_path) + + def _describe(self) -> Dict[str, Any]: + return dict( + filepath=self._filepath, + protocol=self._protocol, + load_args=self._load_args, + save_args=self._save_args, + version=self._version, + ) + + def _release(self) -> None: + super()._release() + self._invalidate_cache() + + def _invalidate_cache(self) -> None: + """Invalidate underlying filesystem caches.""" + filepath = get_filepath_str(self._filepath, self._protocol) + self._fs.invalidate_cache(filepath) diff --git a/kedro/extras/datasets/networkx/graphml_dataset.py b/kedro/extras/datasets/networkx/graphml_dataset.py new file mode 100644 index 0000000000..b540f5344e --- /dev/null +++ b/kedro/extras/datasets/networkx/graphml_dataset.py @@ -0,0 +1,169 @@ +# Copyright 2021 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""NetworkX ``GraphMLDataSet`` loads and saves graphs to a GraphML file using an underlying +filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to create GraphML data. +""" + +from copy import deepcopy +from pathlib import PurePosixPath +from typing import Any, Dict + +import fsspec +import networkx + +from kedro.io.core import ( + AbstractVersionedDataSet, + Version, + get_filepath_str, + get_protocol_and_path, +) + + +class GraphMLDataSet(AbstractVersionedDataSet): + """``GraphMLDataSet`` loads and saves graphs to a GraphML file using an + underlying filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to + create GraphML data. + See https://networkx.org/documentation/stable/tutorial.html for details. + + Example: + :: + + >>> from kedro.extras.datasets.networkx import GraphMLDataSet + >>> import networkx as nx + >>> graph = nx.complete_graph(100) + >>> graph_dataset = GraphMLDataSet(filepath="test.graphml") + >>> graph_dataset.save(graph) + >>> reloaded = graph_dataset.load() + >>> assert nx.is_isomorphic(graph, reloaded) + + """ + + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + + # pylint: disable=too-many-arguments + def __init__( + self, + filepath: str, + load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, + version: Version = None, + credentials: Dict[str, Any] = None, + fs_args: Dict[str, Any] = None, + ) -> None: + """Creates a new instance of ``GraphMLDataSet``. + + Args: + filepath: Filepath in POSIX format to the NetworkX GraphML file. + load_args: Arguments passed on to ``networkx.read_graphml``. + See the details in + https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.graphml.read_graphml.html + save_args: Arguments passed on to ``networkx.write_graphml``. + See the details in + https://networkx.org/documentation/stable/reference/readwrite/generated/networkx.readwrite.graphml.write_graphml.html + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. + credentials: Credentials required to get access to the underlying filesystem. + E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as + to pass to the filesystem's `open` method through nested keys + `open_args_load` and `open_args_save`. + Here you can find all available arguments for `open`: + https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open + All defaults are preserved, except `mode`, which is set to `r` when loading + and to `w` when saving. 
+ """ + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) + _credentials = deepcopy(credentials) or {} + + protocol, path = get_protocol_and_path(filepath, version) + if protocol == "file": + _fs_args.setdefault("auto_mkdir", True) + + self._protocol = protocol + self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + + super().__init__( + filepath=PurePosixPath(path), + version=version, + exists_function=self._fs.exists, + glob_function=self._fs.glob, + ) + + # Handle default load and save arguments + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + _fs_open_args_load.setdefault("mode", "rb") + _fs_open_args_save.setdefault("mode", "wb") + self._fs_open_args_load = _fs_open_args_load + self._fs_open_args_save = _fs_open_args_save + + def _load(self) -> networkx.Graph: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: + data = networkx.read_graphml(fs_file, **self._load_args) + return data + + def _save(self, data: networkx.Graph) -> None: + save_path = get_filepath_str(self._get_save_path(), self._protocol) + with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + networkx.write_graphml(data, fs_file, **self._save_args) + self._invalidate_cache() + + def _exists(self) -> bool: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + return self._fs.exists(load_path) + + def _describe(self) -> Dict[str, Any]: + return dict( + filepath=self._filepath, + protocol=self._protocol, + load_args=self._load_args, + save_args=self._save_args, + version=self._version, + ) + + def _release(self) -> None: + super()._release() + self._invalidate_cache() + + def _invalidate_cache(self) -> None: + """Invalidate underlying filesystem caches.""" + filepath = get_filepath_str(self._filepath, self._protocol) + self._fs.invalidate_cache(filepath) diff --git a/kedro/extras/datasets/networkx/networkx_dataset.py b/kedro/extras/datasets/networkx/json_dataset.py similarity index 90% rename from kedro/extras/datasets/networkx/networkx_dataset.py rename to kedro/extras/datasets/networkx/json_dataset.py index 52b3714190..fae22b4128 100644 --- a/kedro/extras/datasets/networkx/networkx_dataset.py +++ b/kedro/extras/datasets/networkx/json_dataset.py @@ -1,4 +1,4 @@ -"""``NetworkXDataSet`` loads and saves graphs to a JSON file using an underlying +"""``JSONDataSet`` loads and saves graphs to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to create JSON data. """ @@ -18,8 +18,8 @@ ) -class NetworkXDataSet(AbstractVersionedDataSet): - """``NetworkXDataSet`` loads and saves graphs to a JSON file using an +class JSONDataSet(AbstractVersionedDataSet): + """NetworkX ``JSONDataSet`` loads and saves graphs to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). ``NetworkX`` is used to create JSON data. See https://networkx.org/documentation/stable/tutorial.html for details. 
@@ -27,10 +27,10 @@ class NetworkXDataSet(AbstractVersionedDataSet): Example: :: - >>> from kedro.extras.datasets.networkx import NetworkXDataSet + >>> from kedro.extras.datasets.networkx import JSONDataSet >>> import networkx as nx >>> graph = nx.complete_graph(100) - >>> graph_dataset = NetworkXDataSet(filepath="test.json") + >>> graph_dataset = JSONDataSet(filepath="test.json") >>> graph_dataset.save(graph) >>> reloaded = graph_dataset.load() >>> assert nx.is_isomorphic(graph, reloaded) @@ -50,14 +50,14 @@ def __init__( credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, ) -> None: - """Creates a new instance of ``NetworkXDataSet``. + """Creates a new instance of ``JSONDataSet``. Args: filepath: Filepath in POSIX format to the NetworkX graph JSON file. - load_args: Arguments passed on to ```networkx.node_link_graph``. + load_args: Arguments passed on to ``networkx.node_link_graph``. See the details in https://networkx.org/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_graph.html - save_args: Arguments passed on to ```networkx.node_link_data``. + save_args: Arguments passed on to ``networkx.node_link_data``. See the details in https://networkx.org/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_data.html version: If specified, should be an instance of diff --git a/kedro/extras/datasets/pandas/__init__.py b/kedro/extras/datasets/pandas/__init__.py index 8cdceaf1ac..078587d7ea 100644 --- a/kedro/extras/datasets/pandas/__init__.py +++ b/kedro/extras/datasets/pandas/__init__.py @@ -7,12 +7,12 @@ "GBQTableDataSet", "GBQQueryDataSet", "ExcelDataSet", - "AppendableExcelDataSet", "HDFDataSet", "JSONDataSet", "ParquetDataSet", "SQLQueryDataSet", "SQLTableDataSet", + "XMLDataSet", "GenericDataSet", ] @@ -22,8 +22,6 @@ from .csv_dataset import CSVDataSet with suppress(ImportError): from .excel_dataset import ExcelDataSet -with suppress(ImportError): - from .appendable_excel_dataset import AppendableExcelDataSet with suppress(ImportError): from .feather_dataset import FeatherDataSet with suppress(ImportError): @@ -36,5 +34,7 @@ from .parquet_dataset import ParquetDataSet with suppress(ImportError): from .sql_dataset import SQLQueryDataSet, SQLTableDataSet +with suppress(ImportError): + from .xml_dataset import XMLDataSet with suppress(ImportError): from .generic_dataset import GenericDataSet diff --git a/kedro/extras/datasets/pandas/appendable_excel_dataset.py b/kedro/extras/datasets/pandas/appendable_excel_dataset.py deleted file mode 100644 index 6533cb2d91..0000000000 --- a/kedro/extras/datasets/pandas/appendable_excel_dataset.py +++ /dev/null @@ -1,136 +0,0 @@ -"""``AppendableExcelDataSet`` loads/saves data from/to a local Excel file opened in append mode. -It uses pandas to handle the Excel file. -""" -from copy import deepcopy -from pathlib import Path, PurePosixPath -from typing import Any, Dict - -import pandas as pd - -from kedro.io.core import AbstractDataSet, DataSetError - - -class AppendableExcelDataSet(AbstractDataSet): - """``AppendableExcelDataSet`` loads/saves data from/to a local Excel file opened in - append mode. It uses pandas to handle the Excel file. - - Example adding a catalog entry with - `YAML API `_: - - .. 
code-block:: yaml - - >>> # AppendableExcelDataSet creates a new sheet for every dataset - >>> # ExcelDataSet restricts one dataset per file as it is overwritten - >>> - >>> preprocessed_companies: - >>> type: pandas.AppendableExcelDataSet - >>> filepath: data/02_intermediate/preprocessed.xlsx # assumes file already exists - >>> save_args: - >>> sheet_name: preprocessed_companies - >>> load_args: - >>> sheet_name: preprocessed_companies - >>> - >>> preprocessed_shuttles: - >>> type: pandas.AppendableExcelDataSet - >>> filepath: data/02_intermediate/preprocessed.xlsx - >>> save_args: - >>> sheet_name: preprocessed_shuttles - >>> load_args: - >>> sheet_name: preprocessed_shuttles - - Example using Python API: - :: - - >>> from kedro.extras.datasets.pandas import AppendableExcelDataSet - >>> from kedro.extras.datasets.pandas import ExcelDataSet - >>> import pandas as pd - >>> - >>> data_1 = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], - >>> 'col3': [5, 6]}) - >>> - >>> data_2 = pd.DataFrame({'col1': [7, 8], 'col2': [5, 7]}) - >>> - >>> regular_ds = ExcelDataSet(filepath="/tmp/test.xlsx") - >>> appendable_ds = AppendableExcelDataSet( - >>> filepath="/tmp/test.xlsx", - >>> save_args={"sheet_name": "my_sheet"}, - >>> load_args={"sheet_name": "my_sheet"} - >>> ) - >>> - >>> regular_ds.save(data_1) - >>> appendable_ds.save(data_2) - >>> reloaded = appendable_ds.load() - >>> assert data_2.equals(reloaded) - - """ - - DEFAULT_LOAD_ARGS = {"engine": "openpyxl"} - DEFAULT_SAVE_ARGS = {"index": False} - - def __init__( - self, - filepath: str, - load_args: Dict[str, Any] = None, - save_args: Dict[str, Any] = None, - ) -> None: - """Creates a new instance of ``AppendableExcelDataSet`` pointing to an existing local - Excel file to be opened in append mode. - - Args: - filepath: Filepath in POSIX format to an existing local Excel file. - load_args: Pandas options for loading Excel files. - Here you can find all available arguments: - https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html - All defaults are preserved, but "engine", which is set to "openpyxl". - save_args: Pandas options for saving Excel files. - Here you can find all available arguments: - https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html - All defaults are preserved, but "index", which is set to False. - If you would like to specify options for the `ExcelWriter`, - you can include them under "writer" key. Here you can - find all available arguments: - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html - Note: `mode` option of `ExcelWriter` is set to `a` and it can not be overridden. 
- """ - self._filepath = PurePosixPath(filepath) - - # Handle default load and save arguments - self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) - if load_args is not None: - self._load_args.update(load_args) - - save_args = deepcopy(save_args) or {} - self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) - self._writer_args = save_args.pop("writer", {}) # type: Dict[str, Any] - self._writer_args.setdefault("engine", "openpyxl") - if save_args is not None: - self._save_args.update(save_args) - - # Use only append mode - self._writer_args["mode"] = "a" - - def _describe(self) -> Dict[str, Any]: - return dict( - filepath=self._filepath, - load_args=self._load_args, - save_args=self._save_args, - writer_args=self._writer_args, - ) - - def _load(self) -> pd.DataFrame: - return pd.read_excel(str(self._filepath), **self._load_args) - - def _save(self, data: pd.DataFrame) -> None: - # pylint: disable=abstract-class-instantiated - try: - with pd.ExcelWriter(str(self._filepath), **self._writer_args) as writer: - data.to_excel(writer, **self._save_args) - except FileNotFoundError as exc: - raise DataSetError( - f"`{self._filepath}` Excel file not found. The file cannot be opened in " - f"append mode." - ) from exc - - def _exists(self) -> bool: - return Path(self._filepath.as_posix()).is_file() diff --git a/kedro/extras/datasets/pandas/csv_dataset.py b/kedro/extras/datasets/pandas/csv_dataset.py index 85b3cef572..cad57896e0 100644 --- a/kedro/extras/datasets/pandas/csv_dataset.py +++ b/kedro/extras/datasets/pandas/csv_dataset.py @@ -1,7 +1,9 @@ """``CSVDataSet`` loads/saves data from/to a CSV file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the CSV file. """ +import logging from copy import deepcopy +from io import BytesIO from pathlib import PurePosixPath from typing import Any, Dict @@ -9,6 +11,7 @@ import pandas as pd from kedro.io.core import ( + PROTOCOL_DELIMITER, AbstractVersionedDataSet, DataSetError, Version, @@ -16,6 +19,8 @@ get_protocol_and_path, ) +logger = logging.getLogger(__name__) + class CSVDataSet(AbstractVersionedDataSet): """``CSVDataSet`` loads/saves data from/to a CSV file using an underlying @@ -23,8 +28,8 @@ class CSVDataSet(AbstractVersionedDataSet): Example adding a catalog entry with `YAML API - `_: + `_: .. code-block:: yaml @@ -99,17 +104,9 @@ def __init__( credentials: Credentials required to get access to the underlying filesystem. E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor - (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as - to pass to the filesystem's `open` method through nested keys - `open_args_load` and `open_args_save`. - Here you can find all available arguments for `open`: - https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `r` when loading - and to `w` when saving. + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). 
""" _fs_args = deepcopy(fs_args) or {} - _fs_open_args_load = _fs_args.pop("open_args_load", {}) - _fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -117,7 +114,8 @@ def __init__( _fs_args.setdefault("auto_mkdir", True) self._protocol = protocol - self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self._storage_options = {**_credentials, **_fs_args} + self._fs = fsspec.filesystem(self._protocol, **self._storage_options) super().__init__( filepath=PurePosixPath(path), @@ -134,10 +132,14 @@ def __init__( if save_args is not None: self._save_args.update(save_args) - _fs_open_args_save.setdefault("mode", "w") - _fs_open_args_save.setdefault("newline", "") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + if "storage_options" in self._save_args or "storage_options" in self._load_args: + logger.warning( + "Dropping `storage_options` for %s, " + "please specify them under `fs_args` or `credentials`.", + self._filepath, + ) + self._save_args.pop("storage_options", None) + self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: return dict( @@ -149,16 +151,27 @@ def _describe(self) -> Dict[str, Any]: ) def _load(self) -> pd.DataFrame: - load_path = get_filepath_str(self._get_load_path(), self._protocol) - - with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - return pd.read_csv(fs_file, **self._load_args) + load_path = str(self._get_load_path()) + if self._protocol == "file": + # file:// protocol seems to misbehave on Windows + # (), + # so we don't join that back to the filepath; + # storage_options also don't work with local paths + return pd.read_csv(load_path, **self._load_args) + + load_path = f"{self._protocol}{PROTOCOL_DELIMITER}{load_path}" + return pd.read_csv( + load_path, storage_options=self._storage_options, **self._load_args + ) def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: - data.to_csv(path_or_buf=fs_file, **self._save_args) + buf = BytesIO() + data.to_csv(path_or_buf=buf, **self._save_args) + + with self._fs.open(save_path, mode="wb") as fs_file: + fs_file.write(buf.getvalue()) self._invalidate_cache() diff --git a/kedro/extras/datasets/pandas/excel_dataset.py b/kedro/extras/datasets/pandas/excel_dataset.py index 31ab9d8105..0a03191a48 100644 --- a/kedro/extras/datasets/pandas/excel_dataset.py +++ b/kedro/extras/datasets/pandas/excel_dataset.py @@ -1,6 +1,7 @@ """``ExcelDataSet`` loads/saves data from/to a Excel file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Excel file. """ +import logging from copy import deepcopy from io import BytesIO from pathlib import PurePosixPath @@ -10,6 +11,7 @@ import pandas as pd from kedro.io.core import ( + PROTOCOL_DELIMITER, AbstractVersionedDataSet, DataSetError, Version, @@ -17,14 +19,16 @@ get_protocol_and_path, ) +logger = logging.getLogger(__name__) + class ExcelDataSet(AbstractVersionedDataSet): """``ExcelDataSet`` loads/saves data from/to a Excel file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Excel file. Example adding a catalog entry with - `YAML API `_: + `YAML API `_: .. 
code-block:: yaml @@ -60,14 +64,14 @@ class ExcelDataSet(AbstractVersionedDataSet): """ - DEFAULT_LOAD_ARGS = {"engine": "xlrd"} + DEFAULT_LOAD_ARGS = {"engine": "openpyxl"} DEFAULT_SAVE_ARGS = {"index": False} # pylint: disable=too-many-arguments def __init__( self, filepath: str, - engine: str = "xlsxwriter", + engine: str = "openpyxl", load_args: Dict[str, Any] = None, save_args: Dict[str, Any] = None, version: Version = None, @@ -83,11 +87,11 @@ def __init__( The prefix should be any protocol supported by ``fsspec``. Note: `http(s)` doesn't support versioning. engine: The engine used to write to excel files. The default - engine is 'xlsxwriter'. + engine is 'openpyxl'. load_args: Pandas options for loading Excel files. Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html - All defaults are preserved, but "engine", which is set to "xlrd". + All defaults are preserved, but "engine", which is set to "openpyxl". Supports multi-sheet Excel files (include `sheet_name = None` in `load_args`). save_args: Pandas options for saving Excel files. Here you can find all available arguments: @@ -104,16 +108,12 @@ def __init__( credentials: Credentials required to get access to the underlying filesystem. E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor - (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as - to pass to the filesystem's `open` method through nested keys - `open_args_load` and `open_args_save`. - Here you can find all available arguments for `open`: - https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `wb` when saving. + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + + Raises: + DataSetError: If versioning is enabled while in append mode. """ _fs_args = deepcopy(fs_args) or {} - _fs_open_args_load = _fs_args.pop("open_args_load", {}) - _fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -121,7 +121,8 @@ def __init__( _fs_args.setdefault("auto_mkdir", True) self._protocol = protocol - self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self._storage_options = {**_credentials, **_fs_args} + self._fs = fsspec.filesystem(self._protocol, **self._storage_options) super().__init__( filepath=PurePosixPath(path), @@ -139,11 +140,22 @@ def __init__( self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) if save_args is not None: self._save_args.update(save_args) - self._writer_args = self._save_args.pop("writer", {"engine": engine}) - - _fs_open_args_save.setdefault("mode", "wb") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + self._writer_args = self._save_args.pop("writer", {}) # type: ignore + self._writer_args.setdefault("engine", engine or "openpyxl") # type: ignore + + if version and self._writer_args.get("mode") == "a": # type: ignore + raise DataSetError( + "`ExcelDataSet` doesn't support versioning in append mode." 
+ ) + + if "storage_options" in self._save_args or "storage_options" in self._load_args: + logger.warning( + "Dropping `storage_options` for %s, " + "please specify them under `fs_args` or `credentials`.", + self._filepath, + ) + self._save_args.pop("storage_options", None) + self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: return dict( @@ -156,10 +168,18 @@ def _describe(self) -> Dict[str, Any]: ) def _load(self) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]: - load_path = get_filepath_str(self._get_load_path(), self._protocol) - - with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - return pd.read_excel(fs_file, **self._load_args) + load_path = str(self._get_load_path()) + if self._protocol == "file": + # file:// protocol seems to misbehave on Windows + # (), + # so we don't join that back to the filepath; + # storage_options also don't work with local paths + return pd.read_excel(load_path, **self._load_args) + + load_path = f"{self._protocol}{PROTOCOL_DELIMITER}{load_path}" + return pd.read_excel( + load_path, storage_options=self._storage_options, **self._load_args + ) def _save(self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]]) -> None: output = BytesIO() @@ -175,7 +195,7 @@ def _save(self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]]) -> None: else: data.to_excel(writer, **self._save_args) - with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + with self._fs.open(save_path, mode="wb") as fs_file: fs_file.write(output.getvalue()) self._invalidate_cache() diff --git a/kedro/extras/datasets/pandas/feather_dataset.py b/kedro/extras/datasets/pandas/feather_dataset.py index a340c9baa3..d3b1ed34b8 100644 --- a/kedro/extras/datasets/pandas/feather_dataset.py +++ b/kedro/extras/datasets/pandas/feather_dataset.py @@ -2,6 +2,7 @@ using an underlying filesystem (e.g.: local, S3, GCS). The underlying functionality is supported by pandas, so it supports all operations the pandas supports. """ +import logging from copy import deepcopy from io import BytesIO from pathlib import PurePosixPath @@ -11,12 +12,15 @@ import pandas as pd from kedro.io.core import ( + PROTOCOL_DELIMITER, AbstractVersionedDataSet, Version, get_filepath_str, get_protocol_and_path, ) +logger = logging.getLogger(__name__) + class FeatherDataSet(AbstractVersionedDataSet): """``FeatherDataSet`` loads and saves data to a feather file using an @@ -44,12 +48,14 @@ class FeatherDataSet(AbstractVersionedDataSet): """ DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] # pylint: disable=too-many-arguments def __init__( self, filepath: str, load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, version: Version = None, credentials: Dict[str, Any] = None, fs_args: Dict[str, Any] = None, @@ -66,6 +72,10 @@ def __init__( Here you can find all available arguments: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html All defaults are preserved. + save_args: Pandas options for saving feather files. + Here you can find all available arguments: + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_feather.html + All defaults are preserved. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -73,16 +83,9 @@ def __init__( credentials: Credentials required to get access to the underlying filesystem. E.g. 
for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor - (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as - to pass to the filesystem's `open` method through nested keys - `open_args_load` and `open_args_save`. - Here you can find all available arguments for `open`: - https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `wb` when saving. + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). """ _fs_args = deepcopy(fs_args) or {} - _fs_open_args_load = _fs_args.pop("open_args_load", {}) - _fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -90,7 +93,8 @@ def __init__( _fs_args.setdefault("auto_mkdir", True) self._protocol = protocol - self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self._storage_options = {**_credentials, **_fs_args} + self._fs = fsspec.filesystem(self._protocol, **self._storage_options) super().__init__( filepath=PurePosixPath(path), @@ -103,10 +107,18 @@ def __init__( self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) if load_args is not None: self._load_args.update(load_args) - - _fs_open_args_save.setdefault("mode", "wb") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + if "storage_options" in self._save_args or "storage_options" in self._load_args: + logger.warning( + "Dropping `storage_options` for %s, " + "please specify them under `fs_args` or `credentials`.", + self._filepath, + ) + self._save_args.pop("storage_options", None) + self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: return dict( @@ -117,25 +129,32 @@ def _describe(self) -> Dict[str, Any]: ) def _load(self) -> pd.DataFrame: - load_path = get_filepath_str(self._get_load_path(), self._protocol) - - with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - return pd.read_feather(fs_file, **self._load_args) + load_path = str(self._get_load_path()) + if self._protocol == "file": + # file:// protocol seems to misbehave on Windows + # (), + # so we don't join that back to the filepath; + # storage_options also don't work with local paths + return pd.read_feather(load_path, **self._load_args) + + load_path = f"{self._protocol}{PROTOCOL_DELIMITER}{load_path}" + return pd.read_feather( + load_path, storage_options=self._storage_options, **self._load_args + ) def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) buf = BytesIO() - data.to_feather(buf) + data.to_feather(buf, **self._save_args) - with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: + with self._fs.open(save_path, mode="wb") as fs_file: fs_file.write(buf.getvalue()) self._invalidate_cache() def _exists(self) -> bool: load_path = get_filepath_str(self._get_load_path(), self._protocol) - return self._fs.exists(load_path) def _release(self) -> None: diff --git a/kedro/extras/datasets/pandas/gbq_dataset.py b/kedro/extras/datasets/pandas/gbq_dataset.py index 8882fef61c..49777ff590 100644 --- a/kedro/extras/datasets/pandas/gbq_dataset.py +++ b/kedro/extras/datasets/pandas/gbq_dataset.py @@ -26,8 +26,8 @@ class 
GBQTableDataSet(AbstractDataSet): It uses pandas-gbq to read and write from/to BigQuery table. Example adding a catalog entry with - `YAML API `_: + `YAML API `_: .. code-block:: yaml diff --git a/kedro/extras/datasets/pandas/generic_dataset.py b/kedro/extras/datasets/pandas/generic_dataset.py index 697f8c21f0..0bca989513 100644 --- a/kedro/extras/datasets/pandas/generic_dataset.py +++ b/kedro/extras/datasets/pandas/generic_dataset.py @@ -35,8 +35,8 @@ class GenericDataSet(AbstractVersionedDataSet): appropriate type of read/write target on a best effort basis. Example using `YAML API - `_: + `_: .. code-block:: yaml diff --git a/kedro/extras/datasets/pandas/hdf_dataset.py b/kedro/extras/datasets/pandas/hdf_dataset.py index 2816fec317..6e12350e27 100644 --- a/kedro/extras/datasets/pandas/hdf_dataset.py +++ b/kedro/extras/datasets/pandas/hdf_dataset.py @@ -25,8 +25,8 @@ class HDFDataSet(AbstractVersionedDataSet): filesystem (e.g. local, S3, GCS). It uses pandas.HDFStore to handle the hdf file. Example adding a catalog entry with - `YAML API `_: + `YAML API `_: .. code-block:: yaml diff --git a/kedro/extras/datasets/pandas/json_dataset.py b/kedro/extras/datasets/pandas/json_dataset.py index 8371ba6210..dfc7b5be19 100644 --- a/kedro/extras/datasets/pandas/json_dataset.py +++ b/kedro/extras/datasets/pandas/json_dataset.py @@ -1,7 +1,9 @@ """``JSONDataSet`` loads/saves data from/to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the JSON file. """ +import logging from copy import deepcopy +from io import BytesIO from pathlib import PurePosixPath from typing import Any, Dict @@ -9,6 +11,7 @@ import pandas as pd from kedro.io.core import ( + PROTOCOL_DELIMITER, AbstractVersionedDataSet, DataSetError, Version, @@ -16,14 +19,16 @@ get_protocol_and_path, ) +logger = logging.getLogger(__name__) + class JSONDataSet(AbstractVersionedDataSet): """``JSONDataSet`` loads/saves data from/to a JSON file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the json file. Example adding a catalog entry with - `YAML API `_: + `YAML API `_: .. code-block:: yaml @@ -91,24 +96,17 @@ def __init__( credentials: Credentials required to get access to the underlying filesystem. E.g. for ``GCSFileSystem`` it should look like `{'token': None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor - (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as - to pass to the filesystem's `open` method through nested keys - `open_args_load` and `open_args_save`. - Here you can find all available arguments for `open`: - https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `r` when loading - and to `w` when saving. + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). 
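The pandas datasets touched in this diff stop opening `fsspec` file handles for loading and instead hand pandas a `protocol://path` string together with `storage_options` built from `credentials` and `fs_args`, falling back to the bare path for the local `file` protocol. A minimal sketch of that dispatch, with a placeholder bucket, key and credentials (none of them taken from the diff):

```python
# Sketch of the load-path construction used by the updated pandas datasets;
# the bucket, key and credentials below are placeholders for illustration.
import pandas as pd

PROTOCOL_DELIMITER = "://"  # mirrors the constant imported from kedro.io.core


def load_csv(protocol: str, path: str, storage_options: dict) -> pd.DataFrame:
    if protocol == "file":
        # Local paths are passed straight to pandas;
        # storage_options are not used for the local filesystem.
        return pd.read_csv(path)
    load_path = f"{protocol}{PROTOCOL_DELIMITER}{path}"
    return pd.read_csv(load_path, storage_options=storage_options)


# e.g. load_csv("s3", "my-bucket/data/cars.csv", {"anon": True})
```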
""" _fs_args = deepcopy(fs_args) or {} - _fs_open_args_load = _fs_args.pop("open_args_load", {}) - _fs_open_args_save = _fs_args.pop("open_args_save", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) if protocol == "file": _fs_args.setdefault("auto_mkdir", True) self._protocol = protocol - self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self._storage_options = {**_credentials, **_fs_args} + self._fs = fsspec.filesystem(self._protocol, **self._storage_options) super().__init__( filepath=PurePosixPath(path), @@ -125,9 +123,14 @@ def __init__( if save_args is not None: self._save_args.update(save_args) - _fs_open_args_save.setdefault("mode", "w") - self._fs_open_args_load = _fs_open_args_load - self._fs_open_args_save = _fs_open_args_save + if "storage_options" in self._save_args or "storage_options" in self._load_args: + logger.warning( + "Dropping `storage_options` for %s, " + "please specify them under `fs_args` or `credentials`.", + self._filepath, + ) + self._save_args.pop("storage_options", None) + self._load_args.pop("storage_options", None) def _describe(self) -> Dict[str, Any]: return dict( @@ -139,16 +142,27 @@ def _describe(self) -> Dict[str, Any]: ) def _load(self) -> Any: - load_path = get_filepath_str(self._get_load_path(), self._protocol) - - with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - return pd.read_json(fs_file, **self._load_args) + load_path = str(self._get_load_path()) + if self._protocol == "file": + # file:// protocol seems to misbehave on Windows + # (), + # so we don't join that back to the filepath; + # storage_options also don't work with local paths + return pd.read_json(load_path, **self._load_args) + + load_path = f"{self._protocol}{PROTOCOL_DELIMITER}{load_path}" + return pd.read_json( + load_path, storage_options=self._storage_options, **self._load_args + ) def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: - data.to_json(path_or_buf=fs_file, **self._save_args) + buf = BytesIO() + data.to_json(path_or_buf=buf, **self._save_args) + + with self._fs.open(save_path, mode="wb") as fs_file: + fs_file.write(buf.getvalue()) self._invalidate_cache() diff --git a/kedro/extras/datasets/pandas/parquet_dataset.py b/kedro/extras/datasets/pandas/parquet_dataset.py index 2836019fd2..eca4958a03 100644 --- a/kedro/extras/datasets/pandas/parquet_dataset.py +++ b/kedro/extras/datasets/pandas/parquet_dataset.py @@ -1,17 +1,18 @@ """``ParquetDataSet`` loads/saves data from/to a Parquet file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Parquet file. """ +import logging from copy import deepcopy +from io import BytesIO from pathlib import Path, PurePosixPath from typing import Any, Dict import fsspec import pandas as pd -import pyarrow as pa import pyarrow.parquet as pq from kedro.io.core import ( - HTTP_PROTOCOLS, + PROTOCOL_DELIMITER, AbstractVersionedDataSet, DataSetError, Version, @@ -19,14 +20,16 @@ get_protocol_and_path, ) +logger = logging.getLogger(__name__) + class ParquetDataSet(AbstractVersionedDataSet): """``ParquetDataSet`` loads/saves data from/to a Parquet file using an underlying filesystem (e.g.: local, S3, GCS). It uses pandas to handle the Parquet file. Example adding a catalog entry with - `YAML API `_: + `YAML API `_: .. 
code-block:: yaml @@ -98,30 +101,19 @@ def __init__( Here you can find all available arguments when reading partitioned datasets: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.read All defaults are preserved. - save_args: Additional saving options for `pyarrow.parquet.write_table` and - `pyarrow.Table.from_pandas`. - Here you can find all available arguments for `write_table()`: - https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html?highlight=write_table#pyarrow.parquet.write_table - The arguments for `from_pandas()` should be passed through a nested - key: `from_pandas`. E.g.: `save_args = {"from_pandas": {"preserve_index": False}}` - Here you can find all available arguments for `from_pandas()`: - https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas - version: If specified, should be an instance of - ``kedro.io.core.Version``. If its ``load`` attribute is - None, the latest version will be loaded. If its ``save`` - attribute is None, save version will be autogenerated. + save_args: Additional saving options for saving Parquet file(s). + Here you can find all available arguments: + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html + All defaults are preserved. ``partition_cols`` is not supported. + version: If specified, should be an instance of ``kedro.io.core.Version``. + If its ``load`` attribute is None, the latest version will be loaded. If + its ``save`` attribute is None, save version will be autogenerated. credentials: Credentials required to get access to the underlying filesystem. E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Extra arguments to pass into underlying filesystem class constructor - (e.g. `{"project": "my-project"}` for ``GCSFileSystem``), as well as - to pass to the filesystem's `open` method through nested keys - `open_args_load` and `open_args_save`. - Here you can find all available arguments for `open`: - https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved. + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). 
""" _fs_args = deepcopy(fs_args) or {} - self._fs_open_args_load = _fs_args.pop("open_args_load", {}) _credentials = deepcopy(credentials) or {} protocol, path = get_protocol_and_path(filepath, version) @@ -129,7 +121,8 @@ def __init__( _fs_args.setdefault("auto_mkdir", True) self._protocol = protocol - self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args) + self._storage_options = {**_credentials, **_fs_args} + self._fs = fsspec.filesystem(self._protocol, **self._storage_options) super().__init__( filepath=PurePosixPath(path), @@ -138,17 +131,23 @@ def __init__( glob_function=self._fs.glob, ) - self._from_pandas_args = {} # type: Dict[str, Any] - # Handle default load and save arguments self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) if load_args is not None: self._load_args.update(load_args) self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) if save_args is not None: - self._from_pandas_args.update(save_args.pop("from_pandas", {})) self._save_args.update(save_args) + if "storage_options" in self._save_args or "storage_options" in self._load_args: + logger.warning( + "Dropping `storage_options` for %s, " + "please specify them under `fs_args` or `credentials`.", + self._filepath, + ) + self._save_args.pop("storage_options", None) + self._load_args.pop("storage_options", None) + def _describe(self) -> Dict[str, Any]: return dict( filepath=self._filepath, @@ -170,11 +169,24 @@ def _load(self) -> pd.DataFrame: .to_pandas() ) else: - with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - data = pd.read_parquet(fs_file, **self._load_args) + data = self._load_from_pandas() return data + def _load_from_pandas(self): + load_path = str(self._get_load_path()) + if self._protocol == "file": + # file:// protocol seems to misbehave on Windows + # (), + # so we don't join that back to the filepath; + # storage_options also don't work with local paths + return pd.read_parquet(load_path, **self._load_args) + + load_path = f"{self._protocol}{PROTOCOL_DELIMITER}{load_path}" + return pd.read_parquet( + load_path, storage_options=self._storage_options, **self._load_args + ) + def _save(self, data: pd.DataFrame) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) @@ -183,12 +195,17 @@ def _save(self, data: pd.DataFrame) -> None: f"Saving {self.__class__.__name__} to a directory is not supported." ) - if self._protocol not in HTTP_PROTOCOLS: - self._fs.makedirs(Path(save_path).parent.as_posix(), exist_ok=True) - table = pa.Table.from_pandas(data, **self._from_pandas_args) - pq.write_table( - table=table, where=save_path, filesystem=self._fs, **self._save_args - ) + if "partition_cols" in self._save_args: + raise DataSetError( + f"{self.__class__.__name__} does not support save argument " + f"`partition_cols`. Please use `kedro.io.PartitionedDataSet` instead." + ) + + bytes_buffer = BytesIO() + data.to_parquet(bytes_buffer, **self._save_args) + + with self._fs.open(save_path, mode="wb") as fs_file: + fs_file.write(bytes_buffer.getvalue()) self._invalidate_cache() diff --git a/kedro/extras/datasets/pandas/sql_dataset.py b/kedro/extras/datasets/pandas/sql_dataset.py index 0d01f0de78..bd5ebe582c 100644 --- a/kedro/extras/datasets/pandas/sql_dataset.py +++ b/kedro/extras/datasets/pandas/sql_dataset.py @@ -103,8 +103,8 @@ class SQLTableDataSet(AbstractDataSet): symmetric. Example adding a catalog entry with - `YAML API `_: + `YAML API `_: .. 
code-block:: yaml @@ -261,8 +261,8 @@ class SQLQueryDataSet(AbstractDataSet): Example adding a catalog entry with - `YAML API `_: + `YAML API `_: .. code-block:: yaml diff --git a/kedro/extras/datasets/pandas/xml_dataset.py b/kedro/extras/datasets/pandas/xml_dataset.py new file mode 100644 index 0000000000..08b47241c4 --- /dev/null +++ b/kedro/extras/datasets/pandas/xml_dataset.py @@ -0,0 +1,196 @@ +# Copyright 2021 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +"""``XMLDataSet`` loads/saves data from/to a XML file using an underlying +filesystem (e.g.: local, S3, GCS). It uses pandas to handle the XML file. +""" +import logging +from copy import deepcopy +from io import BytesIO +from pathlib import PurePosixPath +from typing import Any, Dict + +import fsspec +import pandas as pd + +from kedro.io.core import ( + PROTOCOL_DELIMITER, + AbstractVersionedDataSet, + DataSetError, + Version, + get_filepath_str, + get_protocol_and_path, +) + +logger = logging.getLogger(__name__) + + +class XMLDataSet(AbstractVersionedDataSet): + """``XMLDataSet`` loads/saves data from/to a XML file using an underlying + filesystem (e.g.: local, S3, GCS). It uses pandas to handle the XML file. + + Example: + :: + + >>> from kedro.extras.datasets.pandas import XMLDataSet + >>> import pandas as pd + >>> + >>> data = pd.DataFrame({'col1': [1, 2], 'col2': [4, 5], + >>> 'col3': [5, 6]}) + >>> + >>> # data_set = XMLDataSet(filepath="gcs://bucket/test.xml") + >>> data_set = XMLDataSet(filepath="test.xml") + >>> data_set.save(data) + >>> reloaded = data_set.load() + >>> assert data.equals(reloaded) + + """ + + DEFAULT_LOAD_ARGS = {} # type: Dict[str, Any] + DEFAULT_SAVE_ARGS = {"index": False} # type: Dict[str, Any] + + # pylint: disable=too-many-arguments + def __init__( + self, + filepath: str, + load_args: Dict[str, Any] = None, + save_args: Dict[str, Any] = None, + version: Version = None, + credentials: Dict[str, Any] = None, + fs_args: Dict[str, Any] = None, + ) -> None: + """Creates a new instance of ``XMLDataSet`` pointing to a concrete XML file + on a specific filesystem. 
+ + Args: + filepath: Filepath in POSIX format to a XML file prefixed with a protocol like `s3://`. + If prefix is not provided, `file` protocol (local filesystem) will be used. + The prefix should be any protocol supported by ``fsspec``. + Note: `http(s)` doesn't support versioning. + load_args: Pandas options for loading XML files. + Here you can find all available arguments: + https://pandas.pydata.org/docs/reference/api/pandas.read_xml.html + All defaults are preserved. + save_args: Pandas options for saving XML files. + Here you can find all available arguments: + https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_xml.html + All defaults are preserved, but "index", which is set to False. + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. + credentials: Credentials required to get access to the underlying filesystem. + E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. + fs_args: Extra arguments to pass into underlying filesystem class constructor + (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + """ + _fs_args = deepcopy(fs_args) or {} + _credentials = deepcopy(credentials) or {} + + protocol, path = get_protocol_and_path(filepath, version) + if protocol == "file": + _fs_args.setdefault("auto_mkdir", True) + + self._protocol = protocol + self._storage_options = {**_credentials, **_fs_args} + self._fs = fsspec.filesystem(self._protocol, **self._storage_options) + + super().__init__( + filepath=PurePosixPath(path), + version=version, + exists_function=self._fs.exists, + glob_function=self._fs.glob, + ) + + # Handle default load and save arguments + self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS) + if load_args is not None: + self._load_args.update(load_args) + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + + if "storage_options" in self._save_args or "storage_options" in self._load_args: + logger.warning( + "Dropping `storage_options` for %s, " + "please specify them under `fs_args` or `credentials`.", + self._filepath, + ) + self._save_args.pop("storage_options", None) + self._load_args.pop("storage_options", None) + + def _describe(self) -> Dict[str, Any]: + return dict( + filepath=self._filepath, + protocol=self._protocol, + load_args=self._load_args, + save_args=self._save_args, + version=self._version, + ) + + def _load(self) -> pd.DataFrame: + load_path = str(self._get_load_path()) + if self._protocol == "file": + # file:// protocol seems to misbehave on Windows + # (), + # so we don't join that back to the filepath; + # storage_options also don't work with local paths + return pd.read_xml(load_path, **self._load_args) + + load_path = f"{self._protocol}{PROTOCOL_DELIMITER}{load_path}" + return pd.read_xml( + load_path, storage_options=self._storage_options, **self._load_args + ) + + def _save(self, data: pd.DataFrame) -> None: + save_path = get_filepath_str(self._get_save_path(), self._protocol) + + buf = BytesIO() + data.to_xml(path_or_buffer=buf, **self._save_args) + + with self._fs.open(save_path, mode="wb") as fs_file: + fs_file.write(buf.getvalue()) + + self._invalidate_cache() + + def _exists(self) -> bool: + try: + load_path = get_filepath_str(self._get_load_path(), self._protocol) + except DataSetError: + return False + + return self._fs.exists(load_path) + + def _release(self) -> None: + 
super()._release() + self._invalidate_cache() + + def _invalidate_cache(self) -> None: + """Invalidate underlying filesystem caches.""" + filepath = get_filepath_str(self._filepath, self._protocol) + self._fs.invalidate_cache(filepath) diff --git a/kedro/extras/datasets/pickle/pickle_dataset.py b/kedro/extras/datasets/pickle/pickle_dataset.py index c9425be487..f77c656be3 100644 --- a/kedro/extras/datasets/pickle/pickle_dataset.py +++ b/kedro/extras/datasets/pickle/pickle_dataset.py @@ -26,8 +26,8 @@ class PickleDataSet(AbstractVersionedDataSet): supports all allowed options for loading and saving pickle files. Example adding a catalog entry with - `YAML API `_: + `YAML API `_: .. code-block:: yaml diff --git a/kedro/extras/datasets/plotly/plotly_dataset.py b/kedro/extras/datasets/plotly/plotly_dataset.py index 243b399da2..4fa3880a0c 100644 --- a/kedro/extras/datasets/plotly/plotly_dataset.py +++ b/kedro/extras/datasets/plotly/plotly_dataset.py @@ -2,6 +2,7 @@ file using an underlying filesystem (e.g.: local, S3, GCS). It loads the JSON into a plotly figure. """ +from copy import deepcopy from typing import Any, Dict import pandas as pd @@ -82,12 +83,19 @@ def __init__( `open_args_load` and `open_args_save`. Here you can find all available arguments for `open`: https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open - All defaults are preserved, except `mode`, which is set to `w` when - saving. + All defaults are preserved, except `mode`, which is set to `w` when saving. """ super().__init__(filepath, load_args, save_args, version, credentials, fs_args) self._plotly_args = plotly_args + _fs_args = deepcopy(fs_args) or {} + _fs_open_args_load = _fs_args.pop("open_args_load", {}) + _fs_open_args_save = _fs_args.pop("open_args_save", {}) + _fs_open_args_save.setdefault("mode", "w") + + self._fs_open_args_load = _fs_open_args_load + self._fs_open_args_save = _fs_open_args_save + def _describe(self) -> Dict[str, Any]: return {**super()._describe(), "plotly_args": self._plotly_args} diff --git a/kedro/extras/datasets/spark/spark_dataset.py b/kedro/extras/datasets/spark/spark_dataset.py index ba8668798d..f0eb76affb 100644 --- a/kedro/extras/datasets/spark/spark_dataset.py +++ b/kedro/extras/datasets/spark/spark_dataset.py @@ -152,8 +152,8 @@ class SparkDataSet(AbstractVersionedDataSet): """``SparkDataSet`` loads and saves Spark dataframes. Example adding a catalog entry with - `YAML API `_: + `YAML API `_: .. code-block:: yaml @@ -230,14 +230,14 @@ def __init__( # pylint: disable=too-many-arguments It is dependent on the selected file format. You can find a list of read options for each supported format in Spark DataFrame read documentation: - https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html + https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#dataframe-apis save_args: Save args passed to Spark DataFrame write options. Similar to load_args this is dependent on the selected file format. You can pass ``mode`` and ``partitionBy`` to specify your overwrite mode and partitioning respectively. You can find a list of options for each format in Spark DataFrame write documentation: - https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html + https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#dataframe-apis version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. 
If its ``save`` diff --git a/kedro/extras/datasets/spark/spark_hive_dataset.py b/kedro/extras/datasets/spark/spark_hive_dataset.py index b94c59986c..b3ce4b69d9 100644 --- a/kedro/extras/datasets/spark/spark_hive_dataset.py +++ b/kedro/extras/datasets/spark/spark_hive_dataset.py @@ -1,62 +1,17 @@ """``AbstractDataSet`` implementation to access Spark dataframes using ``pyspark`` on Apache Hive. """ - import pickle -import uuid +from copy import deepcopy from typing import Any, Dict, List -from pyspark.sql import DataFrame, SparkSession -from pyspark.sql.functions import coalesce, col, lit +from pyspark.sql import DataFrame, SparkSession, Window +from pyspark.sql.functions import col, lit, row_number from kedro.io.core import AbstractDataSet, DataSetError -class StagedHiveDataSet: - """ - Provides a context manager for temporarily writing data to a staging hive table, for example - where you want to replace the contents of a hive table with data which relies on the data - currently present in that table. - - Once initialised, the ``staged_data`` ``DataFrame`` can be queried and underlying tables used to - define the initial dataframe can be modified without affecting ``staged_data``. - - Upon exiting this object it will drop the redundant staged table. - """ - - def __init__( - self, data: DataFrame, stage_table_name: str, stage_database_name: str - ): - """ - Creates a new instance eof `StagedHiveDataSet`. - - Args: - data: The spark dataframe to be staged - stage_table_name: the database destination for the staged data - stage_database_name: the table destination for the staged data - """ - self.staged_data = None - self._data = data - self._stage_table_name = stage_table_name - self._stage_database_name = stage_database_name - self._spark_session = SparkSession.builder.getOrCreate() - - def __enter__(self): - self._data.createOrReplaceTempView("tmp") - - _table = f"{self._stage_database_name}.{self._stage_table_name}" - self._spark_session.sql( - f"create table {_table} as select * from tmp" # nosec - ).take(1) - self.staged_data = self._spark_session.sql(f"select * from {_table}") # nosec - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self._spark_session.sql( - f"drop table {self._stage_database_name}.{self._stage_table_name}" # nosec - ) - - +# pylint:disable=too-many-instance-attributes class SparkHiveDataSet(AbstractDataSet): """``SparkHiveDataSet`` loads and saves Spark dataframes stored on Hive. This data set also handles some incompatible file types such as using partitioned parquet on @@ -68,10 +23,11 @@ class SparkHiveDataSet(AbstractDataSet): duration of the pipeline) - Tables are not being externally modified during upserts. The upsert method is NOT ATOMIC to external changes to the target table while executing. + Upsert methodology works by leveraging Spark DataFrame execution plan checkpointing. Example adding a catalog entry with - `YAML API `_: + `YAML API `_: .. code-block:: yaml @@ -105,8 +61,16 @@ class SparkHiveDataSet(AbstractDataSet): >>> reloaded.take(4) """ + DEFAULT_SAVE_ARGS = {} # type: Dict[str, Any] + + # pylint:disable=too-many-arguments def __init__( - self, database: str, table: str, write_mode: str, table_pk: List[str] = None + self, + database: str, + table: str, + write_mode: str = "errorifexists", + table_pk: List[str] = None, + save_args: Dict[str, Any] = None, ) -> None: """Creates a new instance of ``SparkHiveDataSet``. @@ -116,13 +80,25 @@ def __init__( write_mode: ``insert``, ``upsert`` or ``overwrite`` are supported. 
table_pk: If performing an upsert, this identifies the primary key columns used to resolve preexisting data. Is required for ``write_mode="upsert"``. + save_args: Optional mapping of any options, + passed to the `DataFrameWriter.saveAsTable` as kwargs. + Key example of this is `partitionBy` which allows data partitioning + on a list of column names. + Other `HiveOptions` can be found here: + https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#specifying-storage-format-for-hive-tables + + Note: + For users leveraging the `upsert` functionality, + a `checkpoint` directory must be set, e.g. using + `spark.sparkContext.setCheckpointDir("/path/to/dir")` + or directly in the Spark conf folder. Raises: DataSetError: Invalid configuration supplied """ - valid_write_modes = ["insert", "upsert", "overwrite"] - if write_mode not in valid_write_modes: - valid_modes = ", ".join(valid_write_modes) + _write_modes = ["append", "error", "errorifexists", "upsert", "overwrite"] + if write_mode not in _write_modes: + valid_modes = ", ".join(_write_modes) raise DataSetError( f"Invalid `write_mode` provided: {write_mode}. " f"`write_mode` must be one of: {valid_modes}" @@ -134,10 +110,12 @@ def __init__( self._table_pk = table_pk or [] self._database = database self._table = table - self._stage_table = "_temp_" + table - - # self._table_columns is set up in _save() to speed up initialization - self._table_columns = [] # type: List[str] + self._full_table_address = f"{database}.{table}" + self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS) + if save_args is not None: + self._save_args.update(save_args) + self._format = self._save_args.get("format") or "hive" + self._eager_checkpoint = self._save_args.pop("eager_checkpoint", None) or True def _describe(self) -> Dict[str, Any]: return dict( @@ -145,86 +123,73 @@ def _describe(self) -> Dict[str, Any]: table=self._table, write_mode=self._write_mode, table_pk=self._table_pk, + partition_by=self._save_args.get("partitionBy"), + format=self._format, ) @staticmethod def _get_spark() -> SparkSession: - return SparkSession.builder.getOrCreate() - - def _create_empty_hive_table(self, data): - data.createOrReplaceTempView("tmp") - self._get_spark().sql( - f"create table {self._database}.{self._table} select * from tmp limit 1" # nosec + """ + This method should only be used to get an existing SparkSession + with valid Hive configuration. + Configuration for Hive is read from hive-site.xml on the classpath. + It supports running both SQL and HiveQL commands. + Additionally, if users are leveraging the `upsert` functionality, + then a `checkpoint` directory must be set, e.g. 
using + `spark.sparkContext.setCheckpointDir("/path/to/dir")` + """ + _spark = SparkSession.builder.getOrCreate() + return _spark + + def _create_hive_table(self, data: DataFrame, mode: str = None): + _mode: str = mode or self._write_mode + data.write.saveAsTable( + self._full_table_address, + mode=_mode, + format=self._format, + **self._save_args, ) - self._get_spark().sql(f"truncate table {self._database}.{self._table}") # nosec def _load(self) -> DataFrame: - if not self._exists(): - raise DataSetError( - f"Requested table not found: {self._database}.{self._table}" - ) - return self._get_spark().sql( - f"select * from {self._database}.{self._table}" # nosec - ) + return self._get_spark().read.table(self._full_table_address) def _save(self, data: DataFrame) -> None: - if not self._exists(): - self._create_empty_hive_table(data) - self._table_columns = data.columns - else: - self._table_columns = self._load().columns - if self._write_mode == "upsert": - non_existent_columns = set(self._table_pk) - set(self._table_columns) - if non_existent_columns: - colnames = ", ".join(sorted(non_existent_columns)) - raise DataSetError( - f"Columns [{colnames}] selected as primary key(s) not found in " - f"table {self._database}.{self._table}" - ) - self._validate_save(data) - write_methods = { - "insert": self._insert_save, - "upsert": self._upsert_save, - "overwrite": self._overwrite_save, - } - write_methods[self._write_mode](data) - - def _insert_save(self, data: DataFrame) -> None: - data.createOrReplaceTempView("tmp") - columns = ", ".join(self._table_columns) - self._get_spark().sql( - f"insert into {self._database}.{self._table} select {columns} from tmp" # nosec - ) + if self._write_mode == "upsert": + # check if _table_pk is a subset of df columns + if not set(self._table_pk) <= set(self._load().columns): + raise DataSetError( + f"Columns {str(self._table_pk)} selected as primary key(s) not found in " + f"table {self._full_table_address}" + ) + self._upsert_save(data=data) + else: + self._create_hive_table(data=data) def _upsert_save(self, data: DataFrame) -> None: - if self._load().rdd.isEmpty(): - self._insert_save(data) + if not self._exists() or self._load().rdd.isEmpty(): + self._create_hive_table(data=data, mode="overwrite") else: - joined_data = data.alias("new").join( - self._load().alias("old"), self._table_pk, "outer" + _tmp_colname = "tmp_colname" + _tmp_row = "tmp_row" + _w = Window.partitionBy(*self._table_pk).orderBy(col(_tmp_colname).desc()) + df_old = self._load().select("*", lit(1).alias(_tmp_colname)) + df_new = data.select("*", lit(2).alias(_tmp_colname)) + df_stacked = df_new.unionByName(df_old).select( + "*", row_number().over(_w).alias(_tmp_row) ) - upsert_dataset = joined_data.select( - [ # type: ignore - coalesce(f"new.{col_name}", f"old.{col_name}").alias(col_name) - for col_name in set(data.columns) - - set(self._table_pk) # type: ignore - ] - + self._table_pk + df_filtered = ( + df_stacked.filter(col(_tmp_row) == 1) + .drop(_tmp_colname, _tmp_row) + .checkpoint(eager=self._eager_checkpoint) ) - temporary_persisted_tbl_name = f"temp_{uuid.uuid4().int}" - with StagedHiveDataSet( - upsert_dataset, - stage_database_name=self._database, - stage_table_name=temporary_persisted_tbl_name, - ) as temp_table: - self._overwrite_save(temp_table.staged_data) - - def _overwrite_save(self, data: DataFrame) -> None: - self._get_spark().sql(f"truncate table {self._database}.{self._table}") # nosec - self._insert_save(data) + self._create_hive_table(data=df_filtered, mode="overwrite") 
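The rewritten `_upsert_save` above resolves conflicts with a window function rather than a staging table: new and old rows are stacked with a priority flag, `row_number` keeps the highest-priority row per primary key, and the result is checkpointed before the target table is overwritten. A standalone sketch of that step, with an illustrative checkpoint directory and column names:

```python
# Standalone sketch of the window-based "keep the newest row per key" step
# used by the new _upsert_save; SparkSession, column names and the
# checkpoint directory are illustrative assumptions.
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, lit, row_number

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setCheckpointDir("/tmp/spark-checkpoints")  # needed for upsert

old = spark.createDataFrame([(1, "old"), (2, "old")], ["id", "value"])
new = spark.createDataFrame([(2, "new"), (3, "new")], ["id", "value"])

window = Window.partitionBy("id").orderBy(col("priority").desc())
stacked = (
    new.select("*", lit(2).alias("priority"))
    .unionByName(old.select("*", lit(1).alias("priority")))
)
upserted = (
    stacked.select("*", row_number().over(window).alias("row"))
    .filter(col("row") == 1)
    .drop("priority", "row")
    .checkpoint(eager=True)
)
upserted.show()  # id=1 -> old, id=2 -> new, id=3 -> new
```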
def _validate_save(self, data: DataFrame): + # do not validate when the table doesn't exist + # or if the `write_mode` is set to overwrite + if (not self._exists()) or self._write_mode == "overwrite": + return hive_dtypes = set(self._load().dtypes) data_dtypes = set(data.dtypes) if data_dtypes != hive_dtypes: @@ -237,21 +202,15 @@ def _validate_save(self, data: DataFrame): ) def _exists(self) -> bool: - if ( + # noqa # pylint:disable=protected-access + return ( self._get_spark() - .sql("show databases") - .filter(col("namespace") == lit(self._database)) - .take(1) - ): - self._get_spark().sql(f"use {self._database}") - if ( - self._get_spark() - .sql("show tables") - .filter(col("tableName") == lit(self._table)) - .take(1) - ): - return True - return False + ._jsparkSession.catalog() + .tableExists(self._database, self._table) + ) def __getstate__(self) -> None: - raise pickle.PicklingError("PySpark datasets can't be serialized") + raise pickle.PicklingError( + "PySpark datasets objects cannot be pickled " + "or serialised as Python objects." + ) diff --git a/kedro/extras/datasets/spark/spark_jdbc_dataset.py b/kedro/extras/datasets/spark/spark_jdbc_dataset.py index 043f689ca5..0b60edd673 100644 --- a/kedro/extras/datasets/spark/spark_jdbc_dataset.py +++ b/kedro/extras/datasets/spark/spark_jdbc_dataset.py @@ -19,8 +19,8 @@ class SparkJDBCDataSet(AbstractDataSet): Example adding a catalog entry with - `YAML API `_: + `YAML API `_: .. code-block:: yaml diff --git a/kedro/extras/datasets/yaml/yaml_dataset.py b/kedro/extras/datasets/yaml/yaml_dataset.py index 979efc9db0..b8d043eb5d 100644 --- a/kedro/extras/datasets/yaml/yaml_dataset.py +++ b/kedro/extras/datasets/yaml/yaml_dataset.py @@ -3,10 +3,9 @@ """ from copy import deepcopy from pathlib import PurePosixPath -from typing import Any, Dict, Union +from typing import Any, Dict import fsspec -import pandas as pd import yaml from kedro.io.core import ( @@ -117,12 +116,8 @@ def _load(self) -> Dict: with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: return yaml.safe_load(fs_file) - def _save(self, data: Union[Dict, pd.DataFrame]) -> None: + def _save(self, data: Dict) -> None: save_path = get_filepath_str(self._get_save_path(), self._protocol) - - if isinstance(data, pd.DataFrame): - data = data.to_dict() - with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: yaml.dump(data, fs_file, **self._save_args) diff --git a/kedro/extras/decorators/README.md b/kedro/extras/decorators/README.md deleted file mode 100644 index 143bee23bd..0000000000 --- a/kedro/extras/decorators/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# Decorators - -Welcome to `kedro.extras.decorators`, the home of Kedro's node and pipeline decorators, which enable additional functionality by wrapping your functions, for example: - - Retry nodes that have failed to run - - Profile how much memory is being consumed by a node - -Further information on [node and pipeline decorators](https://kedro.readthedocs.io/en/stable/03_tutorial/04_create_pipelines.html#using-decorators-for-nodes-and-pipelines) has been added to the documentation. Before writing a decorator to implement a certain functionality that interacts a pipeline or node lifecycle event, you may want to consider using [Hooks](https://kedro.readthedocs.io/en/latest/04_user_guide/15_hooks.html) instead. - -## What decorators are currently supported? -View a full list of supported decorators [**here**](https://kedro.readthedocs.io/en/stable/kedro.extras.decorators.html). 
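For readers migrating off the removed decorators (and the transformers deleted further below), a hypothetical sketch of the Hooks-based alternative the deprecation messages point to, here a node timer built on the `before_node_run`/`after_node_run` hook specs; the class name and log format are illustrative, and the hook still needs registering, e.g. via `HOOKS` in the project's `settings.py`:

```python
# Hypothetical Hooks-based replacement for a timing decorator; names and
# logging format are illustrative, not taken from the diff.
import logging
import time

from kedro.framework.hooks import hook_impl


class NodeTimerHooks:
    def __init__(self):
        self._starts = {}

    @hook_impl
    def before_node_run(self, node):
        # record the start time keyed by node name
        self._starts[node.name] = time.perf_counter()

    @hook_impl
    def after_node_run(self, node):
        elapsed = time.perf_counter() - self._starts.pop(node.name)
        logging.getLogger(__name__).info(
            "Running node %r took %.3fs", node.name, elapsed
        )
```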
- -Examples of decorators supported include: - - **A retry decorator**: A function decorator which catches exceptions from the wrapped function at most `n_times`, after which it bundles and propagates them. By default, all exceptions are caught, but you can narrow your scope using the `exceptions` argument. You can also specify the time delay (in seconds) between a failure and the next retry, using the `delay_sec` parameter. - - **A node and pipeline memory profiler**: A function decorator which profiles the memory used when executing the function. The logged memory is collected by taking memory snapshots every 100ms, and includes memory used by children processes. The implementation uses the `memory_profiler` Python package under the hood. - -> _Note_: The node and pipeline memory profiler will only work on functions that take longer than 0.5s to execute, see [class documentation](memory_profiler.py) for more details. - -### What pre-requisites are required for the node and pipeline memory profiler? - -On Unix-like operating systems, you will need to install a C-compiler and related build tools for your platform. - - #### macOS - To install Command Line Tools for Xcode, run the following from the terminal: - - ```bash - xcode-select --install - ``` - - #### GNU / Linux - - ##### Debian/Ubuntu - - The following command (run with root permissions) will install the `build-essential` metapackage for Debian-based distributions: - - ```bash - apt-get update && apt-get install build-essential - ``` - - ##### Red Hat Enterprise Linux / Centos - The following command (run with root permissions) will install the "Develop Tools" group of packages on RHEL / Centos: - - ```bash - yum groupinstall 'Development Tools' - ``` diff --git a/kedro/extras/decorators/__init__.py b/kedro/extras/decorators/__init__.py deleted file mode 100644 index e5757090c0..0000000000 --- a/kedro/extras/decorators/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -"""``kedro.extras.decorators`` provides Node/Pipeline Decorators.""" -import warnings - -warnings.simplefilter("default", DeprecationWarning) - -warnings.warn( - "Support for decorators will be deprecated in Kedro 0.18.0. " - "Please use Hooks to extend the behaviour of a node or pipeline.", - DeprecationWarning, -) diff --git a/kedro/extras/decorators/memory_profiler.py b/kedro/extras/decorators/memory_profiler.py deleted file mode 100644 index ce36e0e832..0000000000 --- a/kedro/extras/decorators/memory_profiler.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -This module contains function decorators for memory-profiler, which can -be used as ``Node`` decorators. See ``kedro.pipeline.node.decorate`` -""" -import logging -from functools import wraps -from typing import Callable - -from kedro.pipeline.decorators import _func_full_name - -try: - from memory_profiler import memory_usage -except ImportError as exc: - raise ImportError( - f"{exc}: `pip install kedro[profilers]` to get the required " - "memory profiler dependencies." - ) from exc - - -def mem_profile(func: Callable) -> Callable: - """A function decorator which profiles the memory used when executing the - function. The logged memory is collected by using the memory_profiler - python module and includes memory used by children processes. The usage - is collected by taking memory snapshots every 100ms. This decorator will - only work with functions taking at least 0.5s to execute due to a bug in - the memory_profiler python module. 
For more information about the bug, - please see https://github.com/pythonprofilers/memory_profiler/issues/216 - - Args: - func: The function to be profiled. - - Returns: - A wrapped function, which will execute the provided function and log - its max memory usage upon completion. - - """ - - @wraps(func) - def with_memory(*args, **kwargs): - log = logging.getLogger(__name__) - mem_usage, result = memory_usage( - (func, args, kwargs), - interval=0.1, - timeout=1, - max_usage=True, - retval=True, - include_children=True, - ) - # memory_profiler < 0.56.0 returns list instead of float - mem_usage = mem_usage[0] if isinstance(mem_usage, (list, tuple)) else mem_usage - log.info( - "Running %r consumed %2.2fMiB memory at peak time", - _func_full_name(func), - mem_usage, - ) - return result - - return with_memory diff --git a/kedro/extras/decorators/retry_node.py b/kedro/extras/decorators/retry_node.py deleted file mode 100644 index 4e5750ff52..0000000000 --- a/kedro/extras/decorators/retry_node.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -This module contains the retry decorator, which can be used as -``Node`` decorators to retry nodes. See ``kedro.pipeline.node.decorate`` -""" - -import logging -from functools import wraps -from time import sleep -from typing import Callable, Type - - -def retry( - exceptions: Type[Exception] = Exception, n_times: int = 1, delay_sec: float = 0 -) -> Callable: - """ - Catches exceptions from the wrapped function at most n_times and then - bundles and propagates them. - - **Make sure your function does not mutate the arguments** - - Args: - exceptions: The superclass of exceptions to catch. - By default catch all exceptions. - n_times: At most let the function fail n_times. The bundle the - errors and propagate them. By default retry only once. - delay_sec: Delay between failure and next retry in seconds - - Returns: - The original function with retry functionality. - - """ - - def _retry(func: Callable): - @wraps(func) - def _wrapper(*args, **kwargs): - counter = n_times - errors = [] - while counter >= 0: - try: - return func(*args, **kwargs) - # pylint: disable=broad-except - except exceptions as exc: - errors.append(exc) - if counter != 0: - sleep(delay_sec) - counter -= 1 - - if errors: - log = logging.getLogger(__name__) - log.error( - "Function `%s` failed %i times. Errors:\n", func.__name__, n_times - ) - log.error("\n".join(str(err) for err in errors)) - log.error("Raising last exception") - raise errors[-1] - - return _wrapper - - return _retry diff --git a/kedro/extras/transformers/README.md b/kedro/extras/transformers/README.md deleted file mode 100644 index 9cbe3fb6fb..0000000000 --- a/kedro/extras/transformers/README.md +++ /dev/null @@ -1,43 +0,0 @@ -# Transformers - -Welcome to `kedro.extras.transformers`, the home of Kedro's dataset transformers. Transformers intercept the load and save operations on Kedro datasets. Use cases that transformers enable include: - - Performing data validation, - - Tracking operation performance, - - And, converting data between formats (although we would recommend [transcoding](https://kedro.readthedocs.io/en/stable/04_user_guide/04_data_catalog.html#transcoding-datasets) for this). - -Further information on [transformers](https://kedro.readthedocs.io/en/stable/04_user_guide/04_data_catalog.html#transforming-datasets) has been added to the documentation. - -## What transformers are currently supported? 
-View a full list of supported transformers [**here**](https://kedro.readthedocs.io/en/stable/kedro.extras.transformers.html). - -Examples of transformers supported include: - - **A dataset time profiler**: A transformer that logs the runtime of data set load and save calls - - **A dataset memory profiler**: A transformer that logs the maximum memory consumption during load and save calls - -### What pre-requisites are required for the dataset memory profiler? - -On Unix-like operating systems, you will need to install a C-compiler and related build tools for your platform. - - #### macOS - To install `Command Line Tools for Xcode`, run the following from the terminal: - - ```bash - xcode-select --install - ``` - - #### GNU / Linux - - ##### Debian / Ubuntu - - The following command (run with root permissions) will install the `build-essential` metapackage for Debian-based distributions: - - ```bash - apt-get update && apt-get install build-essential - ``` - - ##### Red Hat Enterprise Linux / Centos - The following command (run with root permissions) will install the "Develop Tools" group of packages on RHEL / Centos: - - ```bash - yum groupinstall 'Development Tools' - ``` diff --git a/kedro/extras/transformers/__init__.py b/kedro/extras/transformers/__init__.py deleted file mode 100644 index 10c7db86b1..0000000000 --- a/kedro/extras/transformers/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -"""``kedro.extras.transformers`` is the home of Kedro's dataset transformers.""" -import warnings - -from .memory_profiler import ProfileMemoryTransformer -from .time_profiler import ProfileTimeTransformer - -__all__ = ["ProfileMemoryTransformer", "ProfileTimeTransformer"] - -warnings.simplefilter("default", DeprecationWarning) - -warnings.warn( - "Support for transformers will be deprecated in Kedro 0.18.0. " - "Please use Hooks `before_dataset_loaded`/`after_dataset_loaded` or " - "`before_dataset_saved`/`after_dataset_saved` instead.", - DeprecationWarning, -) diff --git a/kedro/extras/transformers/memory_profiler.py b/kedro/extras/transformers/memory_profiler.py deleted file mode 100644 index 924e741670..0000000000 --- a/kedro/extras/transformers/memory_profiler.py +++ /dev/null @@ -1,58 +0,0 @@ -"""``Transformers`` modify the loading and saving of ``DataSets`` in a -``DataCatalog``. -""" - -import logging -from typing import Any, Callable - -from kedro.io import AbstractTransformer - -try: - from memory_profiler import memory_usage -except ImportError as exc: - raise ImportError( - f"{exc}: `pip install kedro[profilers]` to get the required " - "memory profiler dependencies." 
- ) from exc - - -def _normalise_mem_usage(mem_usage): - # memory_profiler < 0.56.0 returns list instead of float - return mem_usage[0] if isinstance(mem_usage, (list, tuple)) else mem_usage - - -class ProfileMemoryTransformer(AbstractTransformer): - """A transformer that logs the maximum memory consumption during load and save calls.""" - - @property - def _logger(self): - return logging.getLogger(self.__class__.__name__) - - def load(self, data_set_name: str, load: Callable[[], Any]) -> Any: - mem_usage, data = memory_usage( - (load, [], {}), - interval=0.1, - max_usage=True, - retval=True, - include_children=True, - ) - mem_usage = _normalise_mem_usage(mem_usage) - - self._logger.info( - "Loading %s consumed %2.2fMiB memory at peak time", data_set_name, mem_usage - ) - return data - - def save(self, data_set_name: str, save: Callable[[Any], None], data: Any) -> None: - mem_usage = memory_usage( - (save, [data], {}), - interval=0.1, - max_usage=True, - retval=False, - include_children=True, - ) - mem_usage = _normalise_mem_usage(mem_usage) - - self._logger.info( - "Saving %s consumed %2.2fMiB memory at peak time", data_set_name, mem_usage - ) diff --git a/kedro/extras/transformers/time_profiler.py b/kedro/extras/transformers/time_profiler.py deleted file mode 100644 index 31f4deec36..0000000000 --- a/kedro/extras/transformers/time_profiler.py +++ /dev/null @@ -1,32 +0,0 @@ -"""``Transformers`` modify the loading and saving of ``DataSets`` in a -``DataCatalog``. -""" - -import logging -import time -from typing import Any, Callable - -from kedro.io import AbstractTransformer - - -class ProfileTimeTransformer(AbstractTransformer): - """A transformer that logs the runtime of data set load and save calls.""" - - @property - def _logger(self): - return logging.getLogger("ProfileTimeTransformer") - - def load(self, data_set_name: str, load: Callable[[], Any]) -> Any: - start = time.time() - data = load() - self._logger.info( - "Loading %s took %0.3f seconds", data_set_name, time.time() - start - ) - return data - - def save(self, data_set_name: str, save: Callable[[Any], None], data: Any) -> None: - start = time.time() - save(data) - self._logger.info( - "Saving %s took %0.3f seconds", data_set_name, time.time() - start - ) diff --git a/kedro/framework/cli/__init__.py b/kedro/framework/cli/__init__.py index 430402d445..fcbb427ef7 100644 --- a/kedro/framework/cli/__init__.py +++ b/kedro/framework/cli/__init__.py @@ -1,7 +1,7 @@ """``kedro.framework.cli`` implements commands available from Kedro's CLI. """ -from .cli import get_project_context, main +from .cli import main from .utils import command_with_verbosity, load_entry_points -__all__ = ["get_project_context", "main", "command_with_verbosity", "load_entry_points"] +__all__ = ["main", "command_with_verbosity", "load_entry_points"] diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index c62c758aba..d4b6f0ac7c 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -37,6 +37,7 @@ def catalog(): @env_option @click.option( "--pipeline", + "-p", type=str, default="", help="Name of the modular pipeline to run. If not set, " @@ -102,6 +103,7 @@ def _map_type_to_datasets(datasets, datasets_meta): @env_option(help="Environment to create Data Catalog YAML file in. Defaults to `base`.") @click.option( "--pipeline", + "-p", "pipeline_name", type=str, required=True, @@ -116,7 +118,7 @@ def create_catalog(metadata: ProjectMetadata, pipeline_name, env): the `DataCatalog`. 
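Editor's note: the deleted transformers above point users at dataset Hooks. As an illustration of that migration path, here is a minimal sketch of a load-time profiler equivalent to `ProfileTimeTransformer`, assuming Kedro's `hook_impl` marker and the `before_dataset_loaded` / `after_dataset_loaded` specs named in the deprecation warning; the class name is invented for the example:

```python
import logging
import time
from typing import Any

from kedro.framework.hooks import hook_impl


class ProfileTimeHooks:
    """Logs how long each dataset load takes (sketch of a transformer replacement)."""

    def __init__(self):
        self._timers = {}
        self._logger = logging.getLogger(self.__class__.__name__)

    @hook_impl
    def before_dataset_loaded(self, dataset_name: str) -> None:
        self._timers[dataset_name] = time.perf_counter()

    @hook_impl
    def after_dataset_loaded(self, dataset_name: str, data: Any) -> None:
        elapsed = time.perf_counter() - self._timers.pop(dataset_name)
        self._logger.info("Loading %s took %0.3f seconds", dataset_name, elapsed)
```

Registered through the project's `HOOKS` tuple in `settings.py`, this reproduces the load-side behaviour of the removed transformer; the save-side hooks work the same way.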
The catalog configuration will be saved to - `//catalog/.yml` file. + `//catalog/.yml` file. """ env = env or "base" session = _create_session(metadata.package_name, env=env) @@ -147,7 +149,7 @@ def create_catalog(metadata: ProjectMetadata, pipeline_name, env): if missing_ds: catalog_path = ( context.project_path - / settings.CONF_ROOT + / settings.CONF_SOURCE / env / "catalog" / f"{pipeline_name}.yml" diff --git a/kedro/framework/cli/cli.py b/kedro/framework/cli/cli.py index 32487770c4..15abbe4542 100644 --- a/kedro/framework/cli/cli.py +++ b/kedro/framework/cli/cli.py @@ -3,12 +3,10 @@ This module implements commands available from the kedro CLI. """ import importlib -import warnings import webbrowser from collections import defaultdict -from copy import deepcopy from pathlib import Path -from typing import Any, Sequence +from typing import Sequence import click import pkg_resources @@ -31,7 +29,6 @@ KedroCliError, load_entry_points, ) -from kedro.framework.context.context import load_context from kedro.framework.startup import _is_project, bootstrap_project LOGO = rf""" @@ -50,8 +47,6 @@ def cli(): # pragma: no cover """Kedro is a CLI for creating and using Kedro projects. For more information, type ``kedro info``. - When inside a Kedro project (created with ``kedro new``) commands from - the project's ``cli.py`` file will also be available here. """ pass @@ -97,43 +92,6 @@ def docs(): webbrowser.open(index_path) -def get_project_context( - key: str = "context", project_path: Path = None, **kwargs -) -> Any: - """Gets the context value from context associated with the key. - - Args: - key: Optional key to get associated value from Kedro context. - Supported keys are "verbose" and "context", and it defaults to "context". - project_path: Optional path to where the project root is to load the context. - If omitted, the current working directory will be used. - **kwargs: Optional custom arguments defined by users, which will be passed into - the constructor of the projects KedroContext subclass. - - Returns: - Requested value from Kedro context dictionary or the default if the key - was not found. - - Raises: - KedroCliError: When the key is not found and the default value was not - specified. - """ - warnings.warn( - "`get_project_context` is now deprecated and will be removed in Kedro 0.18.0. " - "Please use `KedroSession.load_context()` to access the " - "`KedroContext` object. For more information, please visit " - "https://kedro.readthedocs.io/en/stable/04_kedro_project_setup/03_session.html", - DeprecationWarning, - ) - project_path = project_path or Path.cwd() - context = load_context(project_path, **kwargs) - # Dictionary to be compatible with existing Plugins. Future plugins should - # retrieve necessary Kedro project properties from context - value = {"context": context, "verbose": KedroCliError.VERBOSE_ERROR}[key] - - return deepcopy(value) - - def _init_plugins(): group = ENTRY_POINT_GROUPS["init"] for entry_point in pkg_resources.iter_entry_points(group=group): @@ -198,10 +156,13 @@ def global_groups(self) -> Sequence[click.MultiCommand]: @property def project_groups(self) -> Sequence[click.MultiCommand]: + # pylint: disable=line-too-long """Property which loads all project command groups from the project and the plugins, then combines them with the built-in ones. Built-in commands can be overridden by plugins, which can be - overridden by the project's cli.py. + overridden by a custom project cli.py. 
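Editor's note on the removal of `get_project_context` above: as its own deprecation message already advised, `KedroSession` is now the way for plugins and scripts to obtain a `KedroContext`. A minimal sketch, assuming the code runs from a Kedro project root:

```python
from pathlib import Path

from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project

project_path = Path.cwd()
metadata = bootstrap_project(project_path)  # registers settings and pipelines

with KedroSession.create(metadata.package_name, project_path) as session:
    context = session.load_context()
    catalog = context.catalog  # what the old helper handed back under the "context" key
```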
+ See https://kedro.readthedocs.io/en/stable/extend_kedro/common_use_cases.html#use-case-3-how-to-add-or-modify-cli-commands + on how to add this. """ if not self._metadata: return [] diff --git a/kedro/framework/cli/pipeline.py b/kedro/framework/cli/pipeline.py index 672ef4e96e..5ab14f4d74 100644 --- a/kedro/framework/cli/pipeline.py +++ b/kedro/framework/cli/pipeline.py @@ -1,38 +1,34 @@ # pylint: disable=too-many-lines """A collection of CLI commands for working with Kedro pipelines.""" -import json import re import shutil import sys +import tarfile import tempfile from importlib import import_module from pathlib import Path from textwrap import indent -from typing import Any, Iterable, List, NamedTuple, Optional, Set, Tuple, Union -from zipfile import ZipFile +from typing import Iterable, List, NamedTuple, Optional, Set, Tuple, Union import click import pkg_resources -import yaml from rope.base.project import Project from rope.contrib import generate from rope.refactor.move import MoveModule from rope.refactor.rename import Rename -from setuptools.dist import Distribution import kedro from kedro.framework.cli.utils import ( KedroCliError, _clean_pycache, _filter_deprecation_warnings, - _get_requirements_in, call, command_with_verbosity, env_option, python_call, ) -from kedro.framework.project import pipelines, settings +from kedro.framework.project import settings from kedro.framework.startup import ProjectMetadata _SETUP_PY_TEMPLATE = """# -*- coding: utf-8 -*- @@ -44,7 +40,6 @@ description="Modular pipeline `{name}`", packages=find_packages(), include_package_data=True, - package_data={package_data}, install_requires={install_requires}, ) """ @@ -120,8 +115,8 @@ def create_pipeline( ): # pylint: disable=unused-argument """Create a new modular pipeline by providing a name.""" package_dir = metadata.source_dir / metadata.package_name - conf_root = settings.CONF_ROOT - project_conf_path = metadata.project_path / conf_root + conf_source = settings.CONF_SOURCE + project_conf_path = metadata.project_path / conf_source env = env or "base" if not skip_config and not (project_conf_path / env).exists(): @@ -156,8 +151,8 @@ def delete_pipeline( ): # pylint: disable=unused-argument """Delete a modular pipeline by providing a name.""" package_dir = metadata.source_dir / metadata.package_name - conf_root = settings.CONF_ROOT - project_conf_path = metadata.project_path / conf_root + conf_source = settings.CONF_SOURCE + project_conf_path = metadata.project_path / conf_source env = env or "base" if not (project_conf_path / env).exists(): @@ -204,50 +199,6 @@ def delete_pipeline( ) -@pipeline.command("list") -def list_pipelines(): - """List all pipelines defined in your pipeline_registry.py file. (DEPRECATED)""" - deprecation_message = ( - "DeprecationWarning: Command `kedro pipeline list` is deprecated. " - "Please use `kedro registry list` instead." - ) - click.secho(deprecation_message, fg="red") - - click.echo(yaml.dump(sorted(pipelines))) - - -@command_with_verbosity(pipeline, "describe") -@click.argument("name", nargs=1, default="__default__") -@click.pass_obj -def describe_pipeline( - metadata: ProjectMetadata, name, **kwargs -): # pylint: disable=unused-argument, protected-access - """Describe a pipeline by providing a pipeline name. - Defaults to the __default__ pipeline. (DEPRECATED) - """ - deprecation_message = ( - "DeprecationWarning: Command `kedro pipeline describe` is deprecated. " - "Please use `kedro registry describe` instead." 
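Editor's note: `kedro pipeline list` and `kedro pipeline describe` are dropped above in favour of their `kedro registry` equivalents. Scripts that relied on the old commands can read the same `pipelines` registry the deleted code used; a sketch, assuming `bootstrap_project` has been run from the project root as in the previous example:

```python
import yaml

from kedro.framework.project import pipelines

# Equivalent of the removed `kedro pipeline list`
print(yaml.dump(sorted(pipelines)))

# Equivalent of the removed `kedro pipeline describe` for the default pipeline
default_pipeline = pipelines["__default__"]
print(yaml.dump({"Nodes": [str(node) for node in default_pipeline.nodes]}))
```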
- ) - click.secho(deprecation_message, fg="red") - - pipeline_obj = pipelines.get(name) - if not pipeline_obj: - all_pipeline_names = pipelines.keys() - existing_pipelines = ", ".join(sorted(all_pipeline_names)) - raise KedroCliError( - f"`{name}` pipeline not found. Existing pipelines: [{existing_pipelines}]" - ) - - nodes = [] - for node in pipeline_obj.nodes: - namespace = f"{node.namespace}." if node.namespace else "" - nodes.append(f"{namespace}{node._name or node._func_name} ({node._func_name})") - result = {"Nodes": nodes} - - click.echo(yaml.dump(result)) - - @command_with_verbosity(pipeline, "pull") @click.argument("package_path", nargs=1, required=False) @click.option( @@ -260,12 +211,13 @@ def describe_pipeline( @env_option( help="Environment to install the pipeline configuration to. Defaults to `base`." ) +@click.option("--alias", type=str, default="", help="Rename the package.") @click.option( - "--alias", - type=str, - default="", - callback=_check_pipeline_name, - help="Alternative name to unpackage under.", + "-d", + "--destination", + type=click.Path(file_okay=False, dir_okay=False), + default=None, + help="Module location where to unpack under.", ) @click.option( "--fs-args", @@ -277,7 +229,14 @@ def describe_pipeline( ) @click.pass_obj # this will pass the metadata as first argument def pull_package( # pylint:disable=unused-argument, too-many-arguments - metadata: ProjectMetadata, package_path, env, alias, fs_args, all_flag, **kwargs + metadata: ProjectMetadata, + package_path, + env, + alias, + destination, + fs_args, + all_flag, + **kwargs, ) -> None: """Pull and unpack a modular pipeline in your project.""" if not package_path and not all_flag: @@ -291,45 +250,66 @@ def pull_package( # pylint:disable=unused-argument, too-many-arguments _pull_packages_from_manifest(metadata) return - _pull_package(package_path, metadata, env=env, alias=alias, fs_args=fs_args) + _pull_package( + package_path, + metadata, + env=env, + alias=alias, + destination=destination, + fs_args=fs_args, + ) as_alias = f" as `{alias}`" if alias else "" message = f"Pipeline {package_path} pulled and unpacked{as_alias}!" click.secho(message, fg="green") +# pylint: disable=too-many-arguments, too-many-locals def _pull_package( package_path: str, metadata: ProjectMetadata, env: str = None, alias: str = None, + destination: str = None, fs_args: str = None, ): with tempfile.TemporaryDirectory() as temp_dir: temp_dir_path = Path(temp_dir).resolve() - _unpack_wheel(package_path, temp_dir_path, fs_args) + _unpack_sdist(package_path, temp_dir_path, fs_args) - dist_info_file = list(temp_dir_path.glob("*.dist-info")) - if len(dist_info_file) != 1: + sdist_file_name = Path(package_path).name.rstrip(".tar.gz") + egg_info_file = list((temp_dir_path / sdist_file_name).glob("*.egg-info")) + if len(egg_info_file) != 1: raise KedroCliError( - f"More than 1 or no dist-info files found from {package_path}. " - f"There has to be exactly one dist-info directory." + f"More than 1 or no egg-info files found from {package_path}. " + f"There has to be exactly one egg-info directory." 
) - # Extract package name, based on the naming convention for wheel files - # https://www.python.org/dev/peps/pep-0427/#file-name-convention - package_name = dist_info_file[0].stem.split("-")[0] - package_metadata = dist_info_file[0] / "METADATA" + package_name = egg_info_file[0].stem + package_requirements = temp_dir_path / sdist_file_name / "setup.py" + + # Finds a string representation of 'install_requires' list from setup.py + reqs_list_pattern = r"install_requires\=(.*?)\,\n" + list_reqs = re.findall( + reqs_list_pattern, package_requirements.read_text(encoding="utf-8") + ) + + # Finds all elements from the above string representation of a list + reqs_element_pattern = r"\'(.*?)\'" + package_reqs = re.findall(reqs_element_pattern, list_reqs[0]) - req_pattern = r"Requires-Dist: (.*?)\n" - package_reqs = re.findall(req_pattern, package_metadata.read_text()) if package_reqs: - requirements_in = _get_requirements_in( - metadata.source_dir, create_empty=True - ) - _append_package_reqs(requirements_in, package_reqs, package_name) + requirements_txt = metadata.source_dir / "requirements.txt" + _append_package_reqs(requirements_txt, package_reqs, package_name) _clean_pycache(temp_dir_path) - _install_files(metadata, package_name, temp_dir_path, env, alias) + _install_files( + metadata, + package_name, + temp_dir_path / sdist_file_name, + env, + alias, + destination, + ) def _pull_packages_from_manifest(metadata: ProjectMetadata) -> None: @@ -349,7 +329,7 @@ def _pull_packages_from_manifest(metadata: ProjectMetadata) -> None: for package_path, specs in build_specs.items(): if "alias" in specs: - _assert_pkg_name_ok(specs["alias"]) + _assert_pkg_name_ok(specs["alias"].split(".")[-1]) _pull_package(package_path, metadata, **specs) click.secho(f"Pulled and unpacked `{package_path}`!") @@ -395,15 +375,7 @@ def _package_pipelines_from_manifest(metadata: ProjectMetadata) -> None: "-d", "--destination", type=click.Path(resolve_path=True, file_okay=False), - help="Location where to create the wheel file. Defaults to `src/dist`.", -) -@click.option( - "-v", - "--version", - type=str, - help="Version to package under. " - "Defaults to pipeline package version or, " - "if that is not defined, the project package version.", + help="Location where to create the source distribution file. Defaults to `dist/`.", ) @click.option( "--all", @@ -412,13 +384,13 @@ def _package_pipelines_from_manifest(metadata: ProjectMetadata) -> None: is_flag=True, help="Package all pipelines in the `pyproject.toml` package manifest section.", ) -@click.argument("name", nargs=1, required=False, callback=_check_module_path) +@click.argument("module_path", nargs=1, required=False, callback=_check_module_path) @click.pass_obj # this will pass the metadata as first argument def package_pipeline( - metadata: ProjectMetadata, name, env, alias, destination, version, all_flag + metadata: ProjectMetadata, module_path, env, alias, destination, all_flag ): # pylint: disable=too-many-arguments - """Package up a modular pipeline as a Python .whl.""" - if not name and not all_flag: + """Package up a modular pipeline as Python source distribution.""" + if not module_path and not all_flag: click.secho( "Please specify a pipeline name or add '--all' to package all pipelines in " "the `pyproject.toml` package manifest section." 
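Editor's note: requirements are now recovered from the sdist's `setup.py` with two regular expressions rather than from a wheel's `METADATA`. A quick demonstration of those exact patterns against a string shaped like the `_SETUP_PY_TEMPLATE` output (the package name and pins are invented):

```python
import re

sample_setup_py = (
    "setup(\n"
    '    name="demo_pipeline",\n'
    '    version="0.1",\n'
    "    packages=find_packages(),\n"
    "    install_requires=['pandas>=1.0', 'pyyaml>=5.1'],\n"
    ")\n"
)

# Grab the string representation of the `install_requires` list...
list_reqs = re.findall(r"install_requires\=(.*?)\,\n", sample_setup_py)
# ...then pull the individual requirement strings out of it.
package_reqs = re.findall(r"\'(.*?)\'", list_reqs[0])

print(list_reqs[0])   # ['pandas>=1.0', 'pyyaml>=5.1']
print(package_reqs)   # ['pandas>=1.0', 'pyyaml>=5.1']
```

This works because the template emits `install_requires=[...]` on a single line. One small reviewer caveat further up in the same hunk: `Path(package_path).name.rstrip(".tar.gz")` strips a character set rather than a literal suffix, which is safe for the usual `name-version.tar.gz` convention (versions end in a digit) but can over-trim unusual file names.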
@@ -430,11 +402,14 @@ def package_pipeline( return result_path = _package_pipeline( - name, metadata, alias=alias, destination=destination, env=env, version=version + module_path, metadata, alias=alias, destination=destination, env=env ) as_alias = f" as `{alias}`" if alias else "" - message = f"Pipeline `{name}` packaged{as_alias}! Location: {result_path}" + message = ( + f"`{metadata.package_name}.{module_path}` packaged{as_alias}! " + f"Location: {result_path}" + ) click.secho(message, fg="green") @@ -470,28 +445,28 @@ def _get_fsspec_filesystem(location: str, fs_args: Optional[str]): return None -def _unpack_wheel(location: str, destination: Path, fs_args: Optional[str]) -> None: +def _unpack_sdist(location: str, destination: Path, fs_args: Optional[str]) -> None: filesystem = _get_fsspec_filesystem(location, fs_args) - if location.endswith(".whl") and filesystem and filesystem.exists(location): + if location.endswith(".tar.gz") and filesystem and filesystem.exists(location): with filesystem.open(location) as fs_file: - # pylint: disable=consider-using-with - ZipFile(fs_file).extractall(destination) + with tarfile.open(fileobj=fs_file, mode="r:gz") as tar_file: + tar_file.extractall(destination) else: python_call( "pip", ["download", "--no-deps", "--dest", str(destination), location] ) - wheel_file = list(destination.glob("*.whl")) - # `--no-deps` should fetch only one wheel file, and CLI should fail if that's + sdist_file = list(destination.glob("*.tar.gz")) + # `--no-deps` should fetch only one source distribution file, and CLI should fail if that's # not the case. - if len(wheel_file) != 1: - file_names = [wf.name for wf in wheel_file] + if len(sdist_file) != 1: + file_names = [sf.name for sf in sdist_file] raise KedroCliError( - f"More than 1 or no wheel files found: {file_names}. " - f"There has to be exactly one distribution file." + f"More than 1 or no sdist files found: {file_names}. " + f"There has to be exactly one source distribution file." 
) - # pylint: disable=consider-using-with - ZipFile(wheel_file[0]).extractall(destination) + with tarfile.open(sdist_file[0], "r:gz") as fs_file: + fs_file.extractall(destination) def _rename_files(conf_source: Path, old_name: str, new_name: str): @@ -510,6 +485,7 @@ def _refactor_code_for_unpacking( package_path: Path, tests_path: Path, alias: Optional[str], + destination: Optional[str], project_metadata: ProjectMetadata, ) -> Tuple[Path, Path]: """This is the reverse operation of `_refactor_code_for_package`, i.e @@ -525,13 +501,13 @@ def _refactor_code_for_unpacking( # also the root of the Rope project |__ |__ __init__.py - |__ pipelines + |__ |__ __init__.py |__ |__ __init__.py |__ tests |__ __init__.py - |__ pipelines + |__ |__ __init__.py |__ |__ __init__.py @@ -548,16 +524,24 @@ def _move_package_with_conflicting_name( return full_path pipeline_name = package_path.stem - if alias: + package_target = Path(project_metadata.package_name) + tests_target = Path("tests") + + if destination: + destination_path = Path(destination) + package_target = package_target / destination_path + tests_target = tests_target / destination_path + + if alias and alias != pipeline_name: _rename_package(project, pipeline_name, alias) pipeline_name = alias - package_target = Path(project_metadata.package_name) / "pipelines" if pipeline_name == project_metadata.package_name: full_path = _move_package_with_conflicting_name(package_target, pipeline_name) else: full_path = _create_nested_package(project, package_target) _move_package(project, pipeline_name, package_target.as_posix()) + refactored_package_path = full_path / pipeline_name if not tests_path.exists(): @@ -568,21 +552,22 @@ def _move_package_with_conflicting_name( # hence we give it a temp name, create the expected # nested folder structure, move the contents there, # then rename the temp name to . 
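Editor's note: the wheel-to-sdist switch above replaces `ZipFile` extraction with `tarfile`. A standalone sketch of the unpack step, assuming a local `dist/demo_pipeline-0.1.tar.gz` produced by `kedro pipeline package` (the path is hypothetical):

```python
import tarfile
from pathlib import Path

sdist_path = Path("dist/demo_pipeline-0.1.tar.gz")  # hypothetical packaged pipeline
destination = Path("unpacked")
destination.mkdir(exist_ok=True)

with tarfile.open(sdist_path, mode="r:gz") as tar_file:
    # Unlike a wheel, an sdist unpacks into a `<name>-<version>/` directory that
    # contains setup.py and the *.egg-info metadata the pull command reads.
    tar_file.extractall(destination)

print(sorted(p.name for p in destination.iterdir()))  # e.g. ['demo_pipeline-0.1']
```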
- tests_target = Path("tests") / "pipelines" full_path = _move_package_with_conflicting_name( tests_target, original_name="tests", desired_name=pipeline_name ) + refactored_tests_path = full_path / pipeline_name return refactored_package_path, refactored_tests_path -def _install_files( +def _install_files( # pylint: disable=too-many-arguments, too-many-locals project_metadata: ProjectMetadata, package_name: str, source_path: Path, env: str = None, alias: str = None, + destination: str = None, ): env = env or "base" @@ -593,9 +578,12 @@ def _install_files( if conf_source.is_dir() and alias: _rename_files(conf_source, package_name, alias) - pipeline_name = alias or package_name - package_dest, test_dest, conf_dest = _get_pipeline_artifacts( - project_metadata, pipeline_name=pipeline_name, env=env + module_path = alias or package_name + if destination: + module_path = f"{destination}.{module_path}" + + package_dest, test_dest, conf_dest = _get_artifacts_to_package( + project_metadata, module_path=module_path, env=env ) if conf_source.is_dir(): @@ -607,7 +595,7 @@ def _install_files( project = Project(source_path) refactored_package_source, refactored_test_source = _refactor_code_for_unpacking( - project, package_source, test_source, alias, project_metadata + project, package_source, test_source, alias, destination, project_metadata ) project.close() @@ -636,11 +624,11 @@ def _find_config_files( return config_files -def _get_default_version(metadata: ProjectMetadata, pipeline_name: str) -> str: +def _get_default_version(metadata: ProjectMetadata, pipeline_module_path: str) -> str: # default to pipeline package version try: pipeline_module = import_module( - f"{metadata.package_name}.pipelines.{pipeline_name}" + f"{metadata.package_name}.{pipeline_module_path}" ) return pipeline_module.__version__ # type: ignore except (AttributeError, ModuleNotFoundError): @@ -649,40 +637,36 @@ def _get_default_version(metadata: ProjectMetadata, pipeline_name: str) -> str: return project_module.__version__ # type: ignore -def _package_pipeline( # pylint: disable=too-many-arguments - pipeline_name: str, +def _package_pipeline( + pipeline_module_path: str, metadata: ProjectMetadata, alias: str = None, destination: str = None, env: str = None, - version: str = None, ) -> Path: + pipeline_name = pipeline_module_path.split(".")[-1] package_dir = metadata.source_dir / metadata.package_name env = env or "base" - artifacts_to_package = _get_pipeline_artifacts( - metadata, pipeline_name=pipeline_name, env=env + package_source, package_tests, package_conf = _get_artifacts_to_package( + metadata, module_path=pipeline_module_path, env=env ) - # as the wheel file will only contain parameters, we aren't listing other + # as the source distribution will only contain parameters, we aren't listing other # config files not to confuse users and avoid useless file copies configs_to_package = _find_config_files( - artifacts_to_package.pipeline_conf, + package_conf, [f"parameters*/**/{pipeline_name}.yml", f"parameters*/**/{pipeline_name}/**/*"], ) - source_paths = ( - artifacts_to_package.pipeline_dir, - artifacts_to_package.pipeline_tests, - configs_to_package, - ) + source_paths = (package_source, package_tests, configs_to_package) # Check that pipeline directory exists and not empty - _validate_dir(artifacts_to_package.pipeline_dir) + _validate_dir(package_source) - destination = Path(destination) if destination else package_dir.parent / "dist" - version = version or _get_default_version(metadata, pipeline_name) + destination = 
Path(destination) if destination else metadata.project_path / "dist" + version = _get_default_version(metadata, pipeline_module_path) - _generate_wheel_file( + _generate_sdist_file( pipeline_name=pipeline_name, destination=destination.resolve(), source_paths=source_paths, @@ -704,15 +688,8 @@ def _validate_dir(path: Path) -> None: raise KedroCliError(f"'{path}' is an empty directory.") -def _get_wheel_name(**kwargs: Any) -> str: - # https://stackoverflow.com/q/51939257/3364156 - dist = Distribution(attrs=kwargs) - bdist_wheel_cmd = dist.get_command_obj("bdist_wheel") - bdist_wheel_cmd.ensure_finalized() - - distname = bdist_wheel_cmd.wheel_dist_name - tag = "-".join(bdist_wheel_cmd.get_tag()) - return f"{distname}-{tag}.whl" +def _get_sdist_name(name, version): + return f"{name}-{version}.tar.gz" def _sync_path_list(source: List[Tuple[Path, str]], target: Path) -> None: @@ -865,7 +842,7 @@ def _move_package_with_conflicting_name(target: Path, conflicting_name: str): # pylint: disable=too-many-arguments,too-many-locals -def _generate_wheel_file( +def _generate_sdist_file( pipeline_name: str, destination: Path, source_paths: Tuple[_SourcePathType, ...], @@ -880,8 +857,8 @@ def _generate_wheel_file( temp_dir_path = Path(temp_dir).resolve() project = Project(temp_dir_path) # project where to do refactoring - _refactor_code_for_package( # type: ignore - project, package_source, tests_source, alias, metadata + _refactor_code_for_package( + project, package_source, tests_source, alias, metadata # type: ignore ) project.close() @@ -901,23 +878,25 @@ def _generate_wheel_file( cls = exc.__class__ raise KedroCliError(f"{cls.__module__}.{cls.__qualname__}: {exc}") from exc - config_files = [str(file) for file in conf_target.rglob("*") if file.is_file()] + _generate_manifest_file(temp_dir_path) setup_file = _generate_setup_file( - package_name, version, install_requires, temp_dir_path, config_files + package_name, version, install_requires, temp_dir_path ) - package_file = destination / _get_wheel_name(name=package_name, version=version) + package_file = destination / _get_sdist_name(name=package_name, version=version) + if package_file.is_file(): click.secho( f"Package file {package_file} will be overwritten!", fg="yellow" ) - # python setup.py bdist_wheel --dist-dir + # python setup.py sdist --formats=gztar --dist-dir call( [ sys.executable, str(setup_file.resolve()), - "bdist_wheel", + "sdist", + "--formats=gztar", "--dist-dir", str(destination), ], @@ -925,21 +904,26 @@ def _generate_wheel_file( ) +def _generate_manifest_file(output_dir: Path): + manifest_file = output_dir / "MANIFEST.in" + manifest_file.write_text( + """ + global-include README.md + global-include config/parameters* + global-include config/**/parameters* + global-include config/parameters*/** + global-include config/parameters*/**/* + """ + ) + + def _generate_setup_file( - package_name: str, - version: str, - install_requires: List[str], - output_dir: Path, - config_files: List[str], + package_name: str, version: str, install_requires: List[str], output_dir: Path ) -> Path: setup_file = output_dir / "setup.py" - package_data = {package_name: ["README.md"] + config_files} setup_file_context = dict( - name=package_name, - version=version, - package_data=json.dumps(package_data), - install_requires=install_requires, + name=package_name, version=version, install_requires=install_requires ) setup_file.write_text(_SETUP_PY_TEMPLATE.format(**setup_file_context)) @@ -1035,13 +1019,21 @@ def _sync_dirs(source: Path, target: Path, prefix: 
str = "", overwrite: bool = F def _get_pipeline_artifacts( project_metadata: ProjectMetadata, pipeline_name: str, env: str ) -> PipelineArtifacts: + artifacts = _get_artifacts_to_package( + project_metadata, f"pipelines.{pipeline_name}", env + ) + return PipelineArtifacts(*artifacts) + + +def _get_artifacts_to_package( + project_metadata: ProjectMetadata, module_path: str, env: str +) -> Tuple[Path, Path, Path]: """From existing project, returns in order: source_path, tests_path, config_paths""" package_dir = project_metadata.source_dir / project_metadata.package_name - conf_root = settings.CONF_ROOT - project_conf_path = project_metadata.project_path / conf_root - artifacts = PipelineArtifacts( - package_dir / "pipelines" / pipeline_name, - package_dir.parent / "tests" / "pipelines" / pipeline_name, + project_conf_path = project_metadata.project_path / settings.CONF_SOURCE + artifacts = ( + Path(package_dir, *module_path.split(".")), + Path(package_dir.parent, "tests", *module_path.split(".")), project_conf_path / env, ) return artifacts @@ -1050,7 +1042,7 @@ def _get_pipeline_artifacts( def _get_package_artifacts( source_path: Path, package_name: str ) -> Tuple[Path, Path, Path]: - """From existing unpacked wheel, returns in order: + """From existing package, returns in order: source_path, tests_path, config_path """ artifacts = ( @@ -1100,29 +1092,39 @@ def _delete_artifacts(*artifacts: Path): def _append_package_reqs( - requirements_in: Path, package_reqs: List[str], pipeline_name: str + requirements_txt: Path, package_reqs: List[str], pipeline_name: str ) -> None: - """Appends modular pipeline requirements to project level requirements.in""" - existing_reqs = _safe_parse_requirements(requirements_in.read_text()) + """Appends modular pipeline requirements to project level requirements.txt""" incoming_reqs = _safe_parse_requirements(package_reqs) - reqs_to_add = set(incoming_reqs) - set(existing_reqs) - if not reqs_to_add: - return - - sorted_reqs = sorted(str(req) for req in reqs_to_add) - sep = "\n" - with open(requirements_in, "a", encoding="utf-8") as file: - file.write( - f"\n\n# Additional requirements from modular pipeline `{pipeline_name}`:\n" + if requirements_txt.is_file(): + existing_reqs = _safe_parse_requirements(requirements_txt.read_text()) + reqs_to_add = set(incoming_reqs) - set(existing_reqs) + if not reqs_to_add: + return + + sorted_reqs = sorted(str(req) for req in reqs_to_add) + sep = "\n" + with open(requirements_txt, "a", encoding="utf-8") as file: + file.write( + f"\n\n# Additional requirements from modular pipeline `{pipeline_name}`:\n" + ) + file.write(sep.join(sorted_reqs)) + click.secho( + f"Added the following requirements from modular pipeline `{pipeline_name}` to " + f"requirements.txt:\n{sep.join(sorted_reqs)}" ) - file.write(sep.join(sorted_reqs)) - click.secho( - f"Added the following requirements from modular pipeline `{pipeline_name}` to " - f"requirements.in:\n{sep.join(sorted_reqs)}" - ) + else: + click.secho( + "No project requirements.txt found. Copying contents from pipeline requirements.txt..." + ) + sorted_reqs = sorted(str(req) for req in incoming_reqs) + sep = "\n" + with open(requirements_txt, "a", encoding="utf-8") as file: + file.write(sep.join(sorted_reqs)) + click.secho( - "Use `kedro install --build-reqs` to compile and install the updated list of " - "requirements." + "Use `kedro build-reqs` to compile and `pip install -r src/requirements.lock` to install " + "the updated list of requirements." 
) diff --git a/kedro/framework/cli/project.py b/kedro/framework/cli/project.py index b4bbb0ad52..931139bb3f 100644 --- a/kedro/framework/cli/project.py +++ b/kedro/framework/cli/project.py @@ -6,7 +6,6 @@ import sys import webbrowser from pathlib import Path -from typing import Sequence import click from click import secho @@ -15,7 +14,6 @@ KedroCliError, _check_module_importable, _config_file_callback, - _get_requirements_in, _get_values_as_tuple, _reformat_load_versions, _split_params, @@ -32,7 +30,7 @@ from kedro.utils import load_obj NO_DEPENDENCY_MESSAGE = """{module} is not installed. Please make sure {module} is in -{src}/requirements.txt and run `kedro install`.""" +{src}/requirements.txt and run `pip install -r src/requirements.txt`.""" LINT_CHECK_ONLY_HELP = """Check the files for style guide violations, unsorted / unformatted imports, and unblackened Python code without modifying the files.""" OPEN_ARG_HELP = """Open the documentation in your default browser after building.""" @@ -44,13 +42,7 @@ TO_NODES_HELP = """A list of node names which should be used as an end point.""" NODE_ARG_HELP = """Run only nodes with specified names.""" RUNNER_ARG_HELP = """Specify a runner that you want to run the pipeline with. -Available runners: `SequentialRunner`, `ParallelRunner` and `ThreadRunner`. -This option cannot be used together with --parallel.""" -PARALLEL_ARG_HELP = """(DEPRECATED) Run the pipeline using the `ParallelRunner`. -If not specified, use the `SequentialRunner`. This flag cannot be used together -with --runner. In Kedro 0.18.0, `-p` will be an alias for `--pipeline` and the -`--parallel` flag will no longer exist. Instead, the parallel runner should be used by -specifying `--runner=ParallelRunner` (or `-r ParallelRunner`).""" +Available runners: `SequentialRunner`, `ParallelRunner` and `ThreadRunner`.""" ASYNC_ARG_HELP = """Load and save node inputs and outputs asynchronously with threads. If not specified, load and save datasets synchronously.""" TAG_ARG_HELP = """Construct the pipeline using only nodes which have this tag @@ -68,18 +60,8 @@ so parameter values are allowed to contain colons, parameter keys are not. To pass a nested dictionary as parameter, separate keys by '.', example: param_group.param1:value1.""" - - -def _build_reqs(source_path: Path, args: Sequence[str] = ()): - """Run `pip-compile requirements.in` command. - - Args: - source_path: Path to the project `src` folder. - args: Optional arguments for `pip-compile` call, e.g. `--generate-hashes`. - - """ - requirements_in = _get_requirements_in(source_path) - python_call("piptools", ["compile", "-q", *args, str(requirements_in)]) +INPUT_FILE_HELP = """Name of the requirements file to compile.""" +OUTPUT_FILE_HELP = """Name of the file where compiled requirements should be stored.""" # pylint: disable=missing-function-docstring @@ -133,60 +115,6 @@ def lint( python_call("isort", ("--check",) + files if check_only else files) -@project_group.command() -@click.option( - "--build-reqs/--no-build-reqs", - "compile_flag", - default=None, - help="Run `pip-compile` on project requirements before install. " - "By default runs only if `src/requirements.in` file doesn't exist.", -) -@click.pass_obj # this will pass the metadata as first argument -def install(metadata: ProjectMetadata, compile_flag): - """Install project dependencies from both requirements.txt - and environment.yml (DEPRECATED).""" - - deprecation_message = ( - "DeprecationWarning: Command `kedro install` will be deprecated in Kedro 0.18.0. 
" - "In the future use `pip install -r src/requirements.txt` instead. " - "If you were running `kedro install` with the `--build-reqs` flag, " - "we recommend running `kedro build-reqs` followed by `pip install -r src/requirements.txt`" - ) - click.secho(deprecation_message, fg="red") - - # we cannot use `context.project_path` as in other commands since - # context instantiation might break due to missing dependencies - # we attempt to install here - # pylint: disable=consider-using-with - source_path = metadata.source_dir - environment_yml = source_path / "environment.yml" - requirements_in = source_path / "requirements.in" - requirements_txt = source_path / "requirements.txt" - - if environment_yml.is_file(): - call(["conda", "env", "update", "--file", str(environment_yml), "--prune"]) - - default_compile = bool(compile_flag is None and not requirements_in.is_file()) - do_compile = compile_flag or default_compile - if do_compile: - _build_reqs(source_path) - - pip_command = ["install", "-U", "-r", str(requirements_txt)] - - if os.name == "posix": - python_call("pip", pip_command) - else: - command = [sys.executable, "-m", "pip"] + pip_command - proc = subprocess.Popen( - command, creationflags=subprocess.CREATE_NEW_CONSOLE, stderr=subprocess.PIPE - ) - _, errs = proc.communicate() - if errs: - secho(errs.decode(), fg="red") - raise click.exceptions.Exit(code=1) - secho("Requirements installed!", fg="green") - - @forward_command(project_group, forward_help=True) @env_option @click.pass_obj # this will pass the metadata as first argument @@ -210,11 +138,27 @@ def package(metadata: ProjectMetadata): """Package the project as a Python egg and wheel.""" source_path = metadata.source_dir call( - [sys.executable, "setup.py", "clean", "--all", "bdist_egg"], + [ + sys.executable, + "setup.py", + "clean", + "--all", + "bdist_egg", + "--dist-dir", + "../dist", + ], cwd=str(source_path), ) call( - [sys.executable, "setup.py", "clean", "--all", "bdist_wheel"], + [ + sys.executable, + "setup.py", + "clean", + "--all", + "bdist_wheel", + "--dist-dir", + "../dist", + ], cwd=str(source_path), ) @@ -256,17 +200,53 @@ def build_docs(metadata: ProjectMetadata, open_docs): @forward_command(project_group, name="build-reqs") +@click.option( + "--input-file", + "input_file", + type=click.Path(exists=True, dir_okay=False, resolve_path=True), + multiple=False, + help=INPUT_FILE_HELP, +) +@click.option( + "--output-file", + "output_file", + multiple=False, + help=OUTPUT_FILE_HELP, +) @click.pass_obj # this will pass the metadata as first argument def build_reqs( - metadata: ProjectMetadata, args, **kwargs + metadata: ProjectMetadata, input_file, output_file, args, **kwargs ): # pylint: disable=unused-argument - """Build the project dependency requirements.""" + """Run `pip-compile` on src/requirements.txt or the user defined input file and save + the compiled requirements to src/requirements.lock or the user defined output file. + """ + source_path = metadata.source_dir - _build_reqs(source_path, args) + input_file = Path(input_file or source_path / "requirements.txt") + output_file = Path(output_file or source_path / "requirements.lock") + + if input_file.is_file(): + python_call( + "piptools", + [ + "compile", + *args, + str(input_file), + "--output-file", + str(output_file), + ], + ) + + else: + raise FileNotFoundError( + f"File `{input_file}` not found in the project. " + "Please specify another input or create the file and try again." + ) + secho( - "Requirements built! 
Please update requirements.in " + f"Requirements built! Please update {input_file.name} " "if you'd like to make a change in your project's dependencies, " - "and re-run build-reqs to generate the new requirements.txt.", + f"and re-run build-reqs to generate the new {output_file.name}.", fg="green", ) @@ -324,7 +304,6 @@ def activate_nbstripout( @click.option( "--runner", "-r", type=str, default=None, multiple=False, help=RUNNER_ARG_HELP ) -@click.option("--parallel", "-p", is_flag=True, multiple=False, help=PARALLEL_ARG_HELP) @click.option("--async", "is_async", is_flag=True, multiple=False, help=ASYNC_ARG_HELP) @env_option @click.option("--tag", "-t", type=str, multiple=True, help=TAG_ARG_HELP) @@ -336,7 +315,7 @@ def activate_nbstripout( help=LOAD_VERSION_HELP, callback=_reformat_load_versions, ) -@click.option("--pipeline", type=str, default=None, help=PIPELINE_ARG_HELP) +@click.option("--pipeline", "-p", type=str, default=None, help=PIPELINE_ARG_HELP) @click.option( "--config", "-c", @@ -347,11 +326,10 @@ def activate_nbstripout( @click.option( "--params", type=str, default="", help=PARAMS_ARG_HELP, callback=_split_params ) -# pylint: disable=too-many-arguments,unused-argument,too-many-locals +# pylint: disable=too-many-arguments,unused-argument def run( tag, env, - parallel, runner, is_async, node_names, @@ -365,23 +343,7 @@ def run( params, ): """Run the pipeline.""" - if parallel and runner: - raise KedroCliError( - "Both --parallel and --runner options cannot be used together. " - "Please use either --parallel or --runner." - ) - runner = runner or "SequentialRunner" - if parallel: - deprecation_message = ( - "DeprecationWarning: The behaviour of --parallel and -p flags will change. " - "In Kedro 0.18.0, `-p` will be an alias for `--pipeline` and the " - "`--parallel` flag will no longer exist. Instead, the parallel runner " - "should be used by specifying `--runner=ParallelRunner` (or " - "`-r ParallelRunner`)." - ) - click.secho(deprecation_message, fg="red") - runner = "ParallelRunner" - runner_class = load_obj(runner, "kedro.runner") + runner = load_obj(runner or "SequentialRunner", "kedro.runner") tag = _get_values_as_tuple(tag) if tag else tag node_names = _get_values_as_tuple(node_names) if node_names else node_names @@ -389,7 +351,7 @@ def run( with KedroSession.create(env=env, extra_params=params) as session: session.run( tags=tag, - runner=runner_class(is_async=is_async), + runner=runner(is_async=is_async), node_names=node_names, from_nodes=from_nodes, to_nodes=to_nodes, diff --git a/kedro/framework/cli/starters.py b/kedro/framework/cli/starters.py index cfd9e6a45b..9fd2f1e64b 100644 --- a/kedro/framework/cli/starters.py +++ b/kedro/framework/cli/starters.py @@ -248,7 +248,7 @@ def _create_project(template_path: str, cookiecutter_args: Dict[str, str]): ) click.secho( "\nA best-practice setup includes initialising git and creating " - "a virtual environment before running ``kedro install`` to install " + "a virtual environment before running ``pip install -r src/requirements.txt`` to install " "project-specific dependencies. Refer to the Kedro documentation: " "https://kedro.readthedocs.io/" ) diff --git a/kedro/framework/cli/utils.py b/kedro/framework/cli/utils.py index 1ded659302..91291e1c56 100644 --- a/kedro/framework/cli/utils.py +++ b/kedro/framework/cli/utils.py @@ -328,7 +328,7 @@ def _check_module_importable(module_name: str) -> None: except ImportError as exc: raise KedroCliError( f"Module `{module_name}` not found. 
Make sure to install required project " - f"dependencies by running the `kedro install` command first." + f"dependencies by running the `pip install -r src/requirements.txt` command first." ) from exc @@ -355,21 +355,6 @@ def load_entry_points(name: str) -> Sequence[click.MultiCommand]: return entry_point_commands -def _add_src_to_path(source_dir: Path, project_path: Path) -> None: # pragma: no cover - # for backwards compatibility with ipython & deployment scripts - # pylint: disable=import-outside-toplevel - from kedro.framework.startup import _add_src_to_path as real_add_src_to_path - - msg = ( - "kedro.framework.utils._add_src_to_path is deprecated. " - "Please import from new location kedro.framework.startup " - "or use `bootstrap_project()` instead for setting up " - "the Kedro project." - ) - warnings.warn(msg, FutureWarning) - real_add_src_to_path(source_dir, project_path) - - def _config_file_callback(ctx, param, value): # pylint: disable=unused-argument """CLI callback that replaces command line options with values specified in a config file. If command line @@ -473,44 +458,5 @@ def _update_value_nested_dict( return nested_dict -def _get_requirements_in(source_path: Path, create_empty: bool = False) -> Path: - """Get path to project level requirements.in, creating it if required. - - Args: - source_path: Path to the project `src` folder. - create_empty: Whether an empty requirements.in file should be created if - requirements.in does not exist and there is also no requirements.txt to - copy requirements from. - - Returns: - Path to requirements.in. - - Raises: - FileNotFoundError: If neither requirements.in nor requirements.txt is found. - - """ - requirements_in = source_path / "requirements.in" - if requirements_in.is_file(): - return requirements_in - - requirements_txt = source_path / "requirements.txt" - if requirements_txt.is_file(): - click.secho( - "No requirements.in found. Copying contents from requirements.txt..." - ) - shutil.copyfile(str(requirements_txt), str(requirements_in)) - return requirements_in - - if create_empty: - click.secho("Creating empty requirements.in...") - requirements_in.touch() - return requirements_in - - raise FileNotFoundError( - "No project requirements.in or requirements.txt found in `/src`. " - "Please create either and try again." - ) - - def _get_values_as_tuple(values: Iterable[str]) -> Tuple[str, ...]: return tuple(chain.from_iterable(value.split(",") for value in values)) diff --git a/kedro/framework/context/__init__.py b/kedro/framework/context/__init__.py index 89b9a7d169..41891e5c72 100644 --- a/kedro/framework/context/__init__.py +++ b/kedro/framework/context/__init__.py @@ -2,6 +2,6 @@ project context. 
""" -from .context import KedroContext, KedroContextError, load_context +from .context import KedroContext, KedroContextError -__all__ = ["KedroContext", "KedroContextError", "load_context"] +__all__ = ["KedroContext", "KedroContextError"] diff --git a/kedro/framework/context/context.py b/kedro/framework/context/context.py index 0ac14b38ef..5ee9f214d8 100644 --- a/kedro/framework/context/context.py +++ b/kedro/framework/context/context.py @@ -1,43 +1,16 @@ """This module provides context for Kedro project.""" -import functools -import logging -import os from copy import deepcopy from pathlib import Path, PurePosixPath, PureWindowsPath -from typing import Any, Dict, Iterable, Union +from typing import Any, Dict, Optional, Union from urllib.parse import urlparse from warnings import warn from kedro.config import ConfigLoader, MissingConfigException from kedro.framework.hooks import get_hook_manager -from kedro.framework.project import pipelines, settings -from kedro.framework.startup import _get_project_metadata +from kedro.framework.project import settings from kedro.io import DataCatalog -from kedro.io.core import generate_timestamp -from kedro.pipeline import Pipeline from kedro.pipeline.pipeline import _transcode_split -from kedro.runner.runner import AbstractRunner -from kedro.runner.sequential_runner import SequentialRunner -from kedro.versioning import Journal - - -def _deprecate(version): - """Decorator to deprecate a few of the context's properties.""" - - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - property_name = func.__name__ - warn( - f"Accessing {property_name} via the context will be deprecated in Kedro {version}.", - DeprecationWarning, - ) - return func(*args, **kwargs) - - return wrapper - - return decorator def _is_relative_path(path_string: str) -> bool: @@ -190,19 +163,17 @@ class KedroContext: Kedro's main functionality. """ - _CONF_ROOT = "conf" - """CONF_ROOT: Name of root directory containing project configuration. - Default name is "conf".""" - def __init__( self, package_name: str, project_path: Union[Path, str], + config_loader: ConfigLoader, env: str = None, extra_params: Dict[str, Any] = None, - ): + ): # pylint: disable=too-many-arguments """Create a context object by providing the root of a Kedro project and - the environment configuration subfolders (see ``kedro.config.ConfigLoader``) + the environment configuration subfolders + (see ``kedro.config.ConfigLoader``) Raises: KedroContextError: If there is a mismatch @@ -220,33 +191,12 @@ def __init__( """ self._project_path = Path(project_path).expanduser().resolve() self._package_name = package_name - - self._env = env or "local" + self._config_loader = config_loader + self._env = env self._extra_params = deepcopy(extra_params) @property # type: ignore - @_deprecate(version="0.18.0") - def CONF_ROOT(self) -> str: # pylint: disable=invalid-name - """Deprecated in favour of settings.CONF_ROOT - - Returns: - The root directory of the configuration directory of the project. - Raises: - DeprecationWarning - """ - return self._CONF_ROOT - - @CONF_ROOT.setter # type: ignore - @_deprecate(version="0.18.0") - def CONF_ROOT(self, value: str) -> None: # pylint: disable=invalid-name - """Deprecated in favour of settings.CONF_ROOT - Raises: - DeprecationWarning - """ - self._CONF_ROOT = value # pylint: disable=invalid-name - - @property # type: ignore - def env(self) -> str: + def env(self) -> Optional[str]: """Property for the current Kedro environment. 
Returns: @@ -255,48 +205,6 @@ def env(self) -> str: """ return self._env - @property # type: ignore - @_deprecate(version="0.18.0") - def package_name(self) -> str: - """Property for Kedro project package name. - - Returns: - Name of Kedro project package. - - """ - return self._package_name - - @property # type: ignore - @_deprecate(version="0.18.0") - def pipeline(self) -> Pipeline: - """Read-only property for an instance of Pipeline. - - Returns: - Default pipeline. - Raises: - KedroContextError: If the `__default__` pipeline is not - defined by `register_pipelines`. - - """ - try: - return pipelines["__default__"] - except KeyError as exc: # pragma: no cover - raise KedroContextError( - "Failed to find the pipeline named '__default__'. " - "It needs to be generated and returned " - "by the 'register_pipelines' function." - ) from exc - - @property # type: ignore - @_deprecate(version="0.18.0") - def pipelines(self) -> Dict[str, Pipeline]: - """Read-only property for an instance of Pipeline. - - Returns: - A dictionary of defined pipelines. - """ - return dict(pipelines) - @property def project_path(self) -> Path: """Read-only property containing Kedro's root project directory. @@ -329,7 +237,7 @@ def params(self) -> Dict[str, Any]: """ try: # '**/parameters*' reads modular pipeline configs - params = self.config_loader.get( + params = self._config_loader.get( "parameters*", "parameters*/**", "**/parameters*" ) except MissingConfigException as exc: @@ -341,7 +249,6 @@ def params(self) -> Dict[str, Any]: def _get_catalog( self, save_version: str = None, - journal: Journal = None, load_versions: Dict[str, str] = None, ) -> DataCatalog: """A hook for changing the creation of a DataCatalog instance. @@ -353,7 +260,7 @@ def _get_catalog( """ # '**/catalog*' reads modular pipeline configs - conf_catalog = self.config_loader.get("catalog*", "catalog*/**", "**/catalog*") + conf_catalog = self._config_loader.get("catalog*", "catalog*/**", "**/catalog*") # turn relative paths in conf_catalog into absolute paths # before initializing the catalog conf_catalog = _convert_paths_to_absolute_posix( @@ -361,19 +268,12 @@ def _get_catalog( ) conf_creds = self._get_config_credentials() - hook_manager = get_hook_manager() - catalog = hook_manager.hook.register_catalog( # pylint: disable=no-member + catalog = settings.DATA_CATALOG_CLASS.from_config( catalog=conf_catalog, credentials=conf_creds, load_versions=load_versions, save_version=save_version, - journal=journal, ) - if not isinstance(catalog, DataCatalog): - raise KedroContextError( - f"Expected an instance of `DataCatalog`, " - f"got `{type(catalog).__name__}` instead." - ) feed_dict = self._get_feed_dict() catalog.add_feed_dict(feed_dict) @@ -391,63 +291,6 @@ def _get_catalog( ) return catalog - @property # type: ignore - @_deprecate(version="0.18.0") - def io(self) -> DataCatalog: - """Read-only alias property referring to Kedro's ``DataCatalog`` for this - context. - - Returns: - DataCatalog defined in `catalog.yml`. - Raises: - KedroContextError: Incorrect ``DataCatalog`` registered for the project. - - """ - # pylint: disable=invalid-name - return self.catalog - - def _get_config_loader(self) -> ConfigLoader: - """A hook for changing the creation of a ConfigLoader instance. - - Returns: - Instance of `ConfigLoader` created by `register_config_loader` hook. - Raises: - KedroContextError: Incorrect ``ConfigLoader`` registered for the project. 
- - """ - conf_root = settings.CONF_ROOT - conf_paths = [ - str(self.project_path / conf_root / "base"), - str(self.project_path / conf_root / self.env), - ] - hook_manager = get_hook_manager() - config_loader = ( - hook_manager.hook.register_config_loader( # pylint: disable=no-member - conf_paths=conf_paths, - env=self.env, - extra_params=self._extra_params, - ) - ) - if not isinstance(config_loader, ConfigLoader): - raise KedroContextError( - f"Expected an instance of `ConfigLoader`, " - f"got `{type(config_loader).__name__}` instead." - ) - return config_loader - - @property - def config_loader(self) -> ConfigLoader: - """Read-only property referring to Kedro's ``ConfigLoader`` for this - context. - - Returns: - Instance of `ConfigLoader`. - Raises: - KedroContextError: Incorrect ``ConfigLoader`` registered for the project. - - """ - return self._get_config_loader() - def _get_feed_dict(self) -> Dict[str, Any]: """Get parameters and return the feed dictionary.""" params = self.params @@ -481,7 +324,7 @@ def _add_param_to_feed_dict(param_name, param_value): def _get_config_credentials(self) -> Dict[str, Any]: """Getter for credentials specified in credentials directory.""" try: - conf_creds = self.config_loader.get( + conf_creds = self._config_loader.get( "credentials*", "credentials*/**", "**/credentials*" ) except MissingConfigException as exc: @@ -489,238 +332,22 @@ def _get_config_credentials(self) -> Dict[str, Any]: conf_creds = {} return conf_creds - # pylint: disable=too-many-arguments, no-self-use - def _filter_pipeline( - self, - pipeline: Pipeline, - tags: Iterable[str] = None, - from_nodes: Iterable[str] = None, - to_nodes: Iterable[str] = None, - node_names: Iterable[str] = None, - from_inputs: Iterable[str] = None, - to_outputs: Iterable[str] = None, - ) -> Pipeline: - """Filter the pipeline as the intersection of all conditions.""" - new_pipeline = pipeline - # We need to intersect with the pipeline because the order - # of operations matters, so we don't want to do it incrementally. - # As an example, with a pipeline of nodes 1,2,3, think of - # "from 1", and "only 1 and 3" - the order you do them in results in - # either 1 & 3, or just 1. - if tags: - new_pipeline &= pipeline.only_nodes_with_tags(*tags) - if not new_pipeline.nodes: - raise KedroContextError( - f"Pipeline contains no nodes with tags: {str(tags)}" - ) - if from_nodes: - new_pipeline &= pipeline.from_nodes(*from_nodes) - if to_nodes: - new_pipeline &= pipeline.to_nodes(*to_nodes) - if node_names: - new_pipeline &= pipeline.only_nodes(*node_names) - if from_inputs: - new_pipeline &= pipeline.from_inputs(*from_inputs) - if to_outputs: - new_pipeline &= pipeline.to_outputs(*to_outputs) - - if not new_pipeline.nodes: - raise KedroContextError("Pipeline contains no nodes") - return new_pipeline - @property def run_id(self) -> Union[None, str]: - """Unique identifier for a run / journal record, defaults to None. + """Unique identifier for a run, defaults to None. If `run_id` is None, `save_version` will be used instead. """ return self._get_run_id() - def run( # pylint: disable=too-many-arguments,too-many-locals - self, - tags: Iterable[str] = None, - runner: AbstractRunner = None, - node_names: Iterable[str] = None, - from_nodes: Iterable[str] = None, - to_nodes: Iterable[str] = None, - from_inputs: Iterable[str] = None, - to_outputs: Iterable[str] = None, - load_versions: Dict[str, str] = None, - pipeline_name: str = None, - ) -> Dict[str, Any]: - """Runs the pipeline with a specified runner. 
- - Args: - tags: An optional list of node tags which should be used to - filter the nodes of the ``Pipeline``. If specified, only the nodes - containing *any* of these tags will be run. - runner: An optional parameter specifying the runner that you want to run - the pipeline with. - node_names: An optional list of node names which should be used to - filter the nodes of the ``Pipeline``. If specified, only the nodes - with these names will be run. - from_nodes: An optional list of node names which should be used as a - starting point of the new ``Pipeline``. - to_nodes: An optional list of node names which should be used as an - end point of the new ``Pipeline``. - from_inputs: An optional list of input datasets which should be used as a - starting point of the new ``Pipeline``. - to_outputs: An optional list of output datasets which should be used as an - end point of the new ``Pipeline``. - load_versions: An optional flag to specify a particular dataset version timestamp - to load. - pipeline_name: Name of the ``Pipeline`` to execute. - Defaults to "__default__". - Raises: - KedroContextError: If the resulting ``Pipeline`` is empty - or incorrect tags are provided. - Exception: Any uncaught exception will be re-raised - after being passed to``on_pipeline_error``. - Returns: - Any node outputs that cannot be processed by the ``DataCatalog``. - These are returned in a dictionary, where the keys are defined - by the node outputs. - """ - warn( - "`kedro.framework.context.KedroContext.run` is now deprecated in favour of " - "`KedroSession.run` and will be removed in Kedro 0.18.0.", - DeprecationWarning, - ) - # Report project name - logging.info("** Kedro project %s", self.project_path.name) - - name = pipeline_name or "__default__" - - try: - pipeline = pipelines[name] - except KeyError as exc: - raise KedroContextError( - f"Failed to find the pipeline named '{name}'. " - f"It needs to be generated and returned " - f"by the 'register_pipelines' function." 
- ) from exc - - filtered_pipeline = self._filter_pipeline( - pipeline=pipeline, - tags=tags, - from_nodes=from_nodes, - to_nodes=to_nodes, - node_names=node_names, - from_inputs=from_inputs, - to_outputs=to_outputs, - ) - - save_version = self._get_save_version() - run_id = self.run_id or save_version - - record_data = { - "run_id": run_id, - "project_path": str(self.project_path), - "env": self.env, - "tags": tags, - "from_nodes": from_nodes, - "to_nodes": to_nodes, - "node_names": node_names, - "from_inputs": from_inputs, - "to_outputs": to_outputs, - "load_versions": load_versions, - "pipeline_name": pipeline_name, - "extra_params": self._extra_params, - } - journal = Journal(record_data) - - catalog = self._get_catalog( - save_version=save_version, journal=journal, load_versions=load_versions - ) - - # Run the runner - runner = runner or SequentialRunner() - hook_manager = get_hook_manager() - hook_manager.hook.before_pipeline_run( # pylint: disable=no-member - run_params=record_data, pipeline=filtered_pipeline, catalog=catalog - ) - - try: - run_result = runner.run(filtered_pipeline, catalog, run_id) - except Exception as exc: - hook_manager.hook.on_pipeline_error( # pylint: disable=no-member - error=exc, - run_params=record_data, - pipeline=filtered_pipeline, - catalog=catalog, - ) - raise exc - - hook_manager.hook.after_pipeline_run( # pylint: disable=no-member - run_params=record_data, - run_result=run_result, - pipeline=filtered_pipeline, - catalog=catalog, - ) - return run_result - - def _get_run_id( + def _get_run_id( # pylint: disable=no-self-use self, *args, **kwargs # pylint: disable=unused-argument ) -> Union[None, str]: """A hook for generating a unique identifier for a - run / journal record, defaults to None. + run, defaults to None. If None, `save_version` will be used instead. """ return None - def _get_save_version( - self, *args, **kwargs # pylint: disable=unused-argument - ) -> str: - """Generate unique ID for dataset versioning, defaults to timestamp. - `save_version` MUST be something that can be ordered, in order to - easily determine the latest version. - """ - return generate_timestamp() - - -def load_context(project_path: Union[str, Path], **kwargs) -> KedroContext: - """Loads the KedroContext object of a Kedro Project. - This is the default way to load the KedroContext object for normal workflows such as - CLI, Jupyter Notebook, Plugins, etc. It assumes the following project structure - under the given project_path:: - - - |__ - |__ pyproject.toml - - The name of the is `src` by default. The `pyproject.toml` file is used - for project metadata. Kedro configuration should be under `[tool.kedro]` section. - - Args: - project_path: Path to the Kedro project. - **kwargs: Optional kwargs for ``KedroContext`` class. - - Returns: - Instance of ``KedroContext`` class defined in Kedro project. - - Raises: - KedroContextError: `pyproject.toml` was not found or the `[tool.kedro]` section - is missing, or loaded context has package conflict. 
- - """ - warn( - "`kedro.framework.context.load_context` is now deprecated in favour of " - "`KedroSession.load_context` and will be removed in Kedro 0.18.0.", - DeprecationWarning, - ) - project_path = Path(project_path).expanduser().resolve() - metadata = _get_project_metadata(project_path) - - context_class = settings.CONTEXT_CLASS - # update kwargs with env from the environment variable - # (defaults to None if not set) - # need to do this because some CLI command (e.g `kedro run`) defaults to - # passing in `env=None` - kwargs["env"] = kwargs.get("env") or os.getenv("KEDRO_ENV") - context = context_class( - package_name=metadata.package_name, project_path=project_path, **kwargs - ) - return context - class KedroContextError(Exception): """Error occurred when loading project and running context pipeline.""" diff --git a/kedro/framework/hooks/manager.py b/kedro/framework/hooks/manager.py index e3e00200d3..72084fba53 100644 --- a/kedro/framework/hooks/manager.py +++ b/kedro/framework/hooks/manager.py @@ -8,13 +8,7 @@ from pluggy import PluginManager from .markers import HOOK_NAMESPACE -from .specs import ( - DataCatalogSpecs, - DatasetSpecs, - NodeSpecs, - PipelineSpecs, - RegistrationSpecs, -) +from .specs import DataCatalogSpecs, DatasetSpecs, NodeSpecs, PipelineSpecs _hook_manager = None @@ -29,7 +23,6 @@ def _create_hook_manager() -> PluginManager: manager.add_hookspecs(NodeSpecs) manager.add_hookspecs(PipelineSpecs) manager.add_hookspecs(DataCatalogSpecs) - manager.add_hookspecs(RegistrationSpecs) manager.add_hookspecs(DatasetSpecs) return manager diff --git a/kedro/framework/hooks/specs.py b/kedro/framework/hooks/specs.py index 9fb6cc1037..72566c86e7 100644 --- a/kedro/framework/hooks/specs.py +++ b/kedro/framework/hooks/specs.py @@ -2,14 +2,11 @@ For more information about these specifications, please visit [Pluggy's documentation](https://pluggy.readthedocs.io/en/stable/#specs) """ -# pylint: disable=too-many-arguments -from typing import Any, Dict, Iterable, Optional +from typing import Any, Dict, Optional -from kedro.config import ConfigLoader from kedro.io import DataCatalog from kedro.pipeline import Pipeline from kedro.pipeline.node import Node -from kedro.versioning import Journal from .markers import hook_spec @@ -18,7 +15,7 @@ class DataCatalogSpecs: """Namespace that defines all specifications for a data catalog's lifecycle hooks.""" @hook_spec - def after_catalog_created( + def after_catalog_created( # pylint: disable=too-many-arguments self, catalog: DataCatalog, conf_catalog: Dict[str, Any], @@ -50,7 +47,7 @@ class NodeSpecs: """Namespace that defines all specifications for a node's lifecycle hooks.""" @hook_spec - def before_node_run( + def before_node_run( # pylint: disable=too-many-arguments self, node: Node, catalog: DataCatalog, @@ -106,7 +103,7 @@ def after_node_run( # pylint: disable=too-many-arguments pass @hook_spec - def on_node_error( + def on_node_error( # pylint: disable=too-many-arguments self, error: Exception, node: Node, @@ -143,7 +140,7 @@ def before_pipeline_run( Args: run_params: The params used to run the pipeline. - Should be identical to the data logged by Journal with the following schema:: + Should have the following schema:: { "run_id": str @@ -178,7 +175,7 @@ def after_pipeline_run( Args: run_params: The params used to run the pipeline. 
- Should be identical to the data logged by Journal with the following schema:: + Should have the following schema:: { "run_id": str @@ -217,7 +214,7 @@ def on_pipeline_error( Args: error: The uncaught exception thrown during the pipeline run. run_params: The params used to run the pipeline. - Should be identical to the data logged by Journal with the following schema:: + Should have the following schema:: { "run_id": str @@ -284,51 +281,3 @@ def after_dataset_saved(self, dataset_name: str, data: Any) -> None: data: the actual data that was saved to the catalog. """ pass - - -class RegistrationSpecs: - """Namespace that defines all specifications for hooks registering - library components with a Kedro project. - """ - - @hook_spec - def register_pipelines(self) -> Dict[str, Pipeline]: - """Hook to be invoked to register a project's pipelines. - - Returns: - A mapping from a pipeline name to a ``Pipeline`` object. - - """ - pass - - @hook_spec(firstresult=True) - def register_config_loader( - self, conf_paths: Iterable[str], env: str, extra_params: Dict[str, Any] - ) -> ConfigLoader: - """Hook to be invoked to register a project's config loader. - - Args: - conf_paths: the paths to the conf directory to be supplied to the config loader - env: the environment with which the config loader will be instantiated - extra_params: the extra parameters passed to a Kedro run - - Returns: - An instance of a ``ConfigLoader``. - """ - pass - - @hook_spec(firstresult=True) - def register_catalog( - self, - catalog: Optional[Dict[str, Dict[str, Any]]], - credentials: Dict[str, Dict[str, Any]], - load_versions: Dict[str, str], - save_version: str, - journal: Journal, - ) -> DataCatalog: - """Hook to be invoked to register a project's data catalog. - - Returns: - An instance of a ``DataCatalog``. - """ - pass diff --git a/kedro/framework/project/__init__.py b/kedro/framework/project/__init__.py index 9d63195528..c084fd73c3 100644 --- a/kedro/framework/project/__init__.py +++ b/kedro/framework/project/__init__.py @@ -2,10 +2,10 @@ configure a Kedro project and access its settings.""" # pylint: disable=redefined-outer-name,unused-argument,global-statement import importlib +import logging.config import operator from collections.abc import MutableMapping -from typing import Dict, Optional -from warnings import warn +from typing import Any, Dict, Optional from dynaconf import LazySettings from dynaconf.validator import ValidationError, Validator @@ -47,9 +47,9 @@ class _ProjectSettings(LazySettings): Use Dynaconf's LazySettings as base. 
""" - _CONF_ROOT = Validator("CONF_ROOT", default="conf") + _CONF_SOURCE = Validator("CONF_SOURCE", default="conf") _HOOKS = Validator("HOOKS", default=tuple()) - _CONTEXT_CLASS = Validator( + _CONTEXT_CLASS = _IsSubclassValidator( "CONTEXT_CLASS", default=_get_default_class("kedro.framework.context.KedroContext"), ) @@ -59,17 +59,27 @@ class _ProjectSettings(LazySettings): ) _SESSION_STORE_ARGS = Validator("SESSION_STORE_ARGS", default={}) _DISABLE_HOOKS_FOR_PLUGINS = Validator("DISABLE_HOOKS_FOR_PLUGINS", default=tuple()) + _CONFIG_LOADER_CLASS = _IsSubclassValidator( + "CONFIG_LOADER_CLASS", default=_get_default_class("kedro.config.ConfigLoader") + ) + _CONFIG_LOADER_ARGS = Validator("CONFIG_LOADER_ARGS", default={}) + _DATA_CATALOG_CLASS = _IsSubclassValidator( + "DATA_CATALOG_CLASS", default=_get_default_class("kedro.io.DataCatalog") + ) def __init__(self, *args, **kwargs): kwargs.update( validators=[ - self._CONF_ROOT, + self._CONF_SOURCE, self._HOOKS, self._CONTEXT_CLASS, self._SESSION_STORE_CLASS, self._SESSION_STORE_ARGS, self._DISABLE_HOOKS_FOR_PLUGINS, + self._CONFIG_LOADER_CLASS, + self._CONFIG_LOADER_ARGS, + self._DATA_CATALOG_CLASS, ] ) super().__init__(*args, **kwargs) @@ -127,19 +137,6 @@ def _load_data(self): else: project_pipelines = register_pipelines() - hook_manager = get_hook_manager() - pipelines_dicts = ( - hook_manager.hook.register_pipelines() # pylint: disable=no-member - ) - for pipeline_collection in pipelines_dicts: - duplicate_keys = pipeline_collection.keys() & project_pipelines.keys() - if duplicate_keys: - warn( - f"Found duplicate pipeline entries. " - f"The following will be overwritten: {', '.join(duplicate_keys)}" - ) - project_pipelines.update(pipeline_collection) - self._content = project_pipelines self._is_data_loaded = True @@ -170,6 +167,7 @@ def _clear(self, pipelines_module: str) -> None: PACKAGE_NAME = None +LOGGING = None settings = _ProjectSettings() @@ -199,6 +197,13 @@ def configure_project(package_name: str): PACKAGE_NAME = package_name +def configure_logging(logging_config: Dict[str, Any]) -> None: + """Configure logging to make it available as a global variable.""" + logging.config.dictConfig(logging_config) + global LOGGING + LOGGING = logging_config + + def validate_settings(): """Eagerly validate that the settings module is importable. This is desirable to surface any syntax or import errors early. In particular, without eagerly importing diff --git a/kedro/framework/session/__init__.py b/kedro/framework/session/__init__.py index cfda2f9125..b195660d3c 100644 --- a/kedro/framework/session/__init__.py +++ b/kedro/framework/session/__init__.py @@ -1,6 +1,6 @@ """``kedro.framework.session`` provides access to KedroSession responsible for project lifecycle. 
""" -from .session import KedroSession, get_current_session +from .session import KedroSession -__all__ = ["KedroSession", "get_current_session"] +__all__ = ["KedroSession"] diff --git a/kedro/framework/session/session.py b/kedro/framework/session/session.py index 2adcf7b32c..5c66e41885 100644 --- a/kedro/framework/session/session.py +++ b/kedro/framework/session/session.py @@ -7,18 +7,17 @@ import traceback from copy import deepcopy from pathlib import Path -from typing import Any, Dict, Iterable, Optional, Union +from typing import Any, Dict, Iterable, Union import click from kedro import __version__ as kedro_version +from kedro.config import ConfigLoader from kedro.framework.context import KedroContext -from kedro.framework.context.context import ( - KedroContextError, - _convert_paths_to_absolute_posix, -) +from kedro.framework.context.context import _convert_paths_to_absolute_posix from kedro.framework.hooks import get_hook_manager from kedro.framework.project import ( + configure_logging, configure_project, pipelines, settings, @@ -31,25 +30,6 @@ _active_session = None -def get_current_session(silent: bool = False) -> Optional["KedroSession"]: - """Fetch the active ``KedroSession`` instance. - - Args: - silent: Indicates to suppress the error if no active session was found. - - Raises: - RuntimeError: If no active session was found and `silent` is False. - - Returns: - KedroSession instance. - - """ - if not _active_session and not silent: - raise RuntimeError("There is no active Kedro session.") - - return _active_session - - def _activate_session(session: "KedroSession", force: bool = False) -> None: global _active_session @@ -66,7 +46,7 @@ def _deactivate_session() -> None: _active_session = None -def _describe_git(project_path: Path) -> Dict[str, Dict[str, str]]: +def _describe_git(project_path: Path) -> Dict[str, Dict[str, Any]]: project_path = str(project_path) try: @@ -78,7 +58,7 @@ def _describe_git(project_path: Path) -> Dict[str, Dict[str, str]]: logging.getLogger(__name__).warning("Unable to git describe %s", project_path) return {} - git_data = {"commit_sha": res.decode().strip()} + git_data = {"commit_sha": res.decode().strip()} # type: Dict[str, Any] res = subprocess.check_output(["git", "status", "--short"], cwd=project_path) git_data["dirty"] = bool(res.decode().strip()) @@ -197,9 +177,7 @@ def create( # pylint: disable=too-many-arguments return session def _get_logging_config(self) -> Dict[str, Any]: - context = self.load_context() - - conf_logging = context.config_loader.get( + conf_logging = self._get_config_loader().get( "logging*", "logging*/**", "**/logging*" ) # turn relative paths in logging config into absolute path @@ -212,7 +190,7 @@ def _get_logging_config(self) -> Dict[str, Any]: def _setup_logging(self) -> None: """Register logging specified in logging directory.""" conf_logging = self._get_logging_config() - logging.config.dictConfig(conf_logging) + configure_logging(conf_logging) def _init_store(self) -> BaseSessionStore: store_class = settings.SESSION_STORE_CLASS @@ -257,16 +235,31 @@ def load_context(self) -> KedroContext: """An instance of the project context.""" env = self.store.get("env") extra_params = self.store.get("extra_params") + config_loader = self._get_config_loader() context_class = settings.CONTEXT_CLASS context = context_class( package_name=self._package_name, project_path=self._project_path, + config_loader=config_loader, env=env, extra_params=extra_params, ) return context + def _get_config_loader(self) -> ConfigLoader: + """An 
instance of the config loader.""" + env = self.store.get("env") + extra_params = self.store.get("extra_params") + + config_loader_class = settings.CONFIG_LOADER_CLASS + return config_loader_class( + conf_source=str(self._project_path / settings.CONF_SOURCE), + env=env, + runtime_params=extra_params, + **settings.CONFIG_LOADER_ARGS, + ) + def close(self): """Close the current session and save its store to disk if `save_on_close` attribute is True. @@ -274,11 +267,11 @@ def close(self): if self.save_on_close: self._store.save() - if get_current_session(silent=True) is self: + if _active_session is self: _deactivate_session() def __enter__(self): - if get_current_session(silent=True) is not self: + if _active_session is not self: _activate_session(self) return self @@ -322,7 +315,7 @@ def run( # pylint: disable=too-many-arguments,too-many-locals load_versions: An optional flag to specify a particular dataset version timestamp to load. Raises: - KedroContextError: If the named or `__default__` pipeline is not + ValueError: If the named or `__default__` pipeline is not defined by `register_pipelines`. Exception: Any uncaught exception during the run will be re-raised after being passed to ``on_pipeline_error`` hook. @@ -344,14 +337,13 @@ def run( # pylint: disable=too-many-arguments,too-many-locals try: pipeline = pipelines[name] except KeyError as exc: - raise KedroContextError( + raise ValueError( f"Failed to find the pipeline named '{name}'. " f"It needs to be generated and returned " f"by the 'register_pipelines' function." ) from exc - filtered_pipeline = context._filter_pipeline( - pipeline=pipeline, + filtered_pipeline = pipeline.filter( tags=tags, from_nodes=from_nodes, to_nodes=to_nodes, diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index f172dff3aa..de6f6e49e4 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -13,19 +13,15 @@ Version, ) from .data_catalog import DataCatalog -from .data_catalog_with_default import DataCatalogWithDefault -from .lambda_data_set import LambdaDataSet -from .memory_data_set import MemoryDataSet -from .partitioned_data_set import IncrementalDataSet, PartitionedDataSet -from .transformers import AbstractTransformer +from .lambda_dataset import LambdaDataSet +from .memory_dataset import MemoryDataSet +from .partitioned_dataset import IncrementalDataSet, PartitionedDataSet __all__ = [ "AbstractDataSet", - "AbstractTransformer", "AbstractVersionedDataSet", "CachedDataSet", "DataCatalog", - "DataCatalogWithDefault", "DataSetAlreadyExistsError", "DataSetError", "DataSetNotFoundError", diff --git a/kedro/io/cached_dataset.py b/kedro/io/cached_dataset.py index 540687a9e9..e218236a61 100644 --- a/kedro/io/cached_dataset.py +++ b/kedro/io/cached_dataset.py @@ -6,7 +6,7 @@ from typing import Any, Dict, Union from kedro.io.core import VERSIONED_FLAG_KEY, AbstractDataSet, Version -from kedro.io.memory_data_set import MemoryDataSet +from kedro.io.memory_dataset import MemoryDataSet class CachedDataSet(AbstractDataSet): diff --git a/kedro/io/core.py b/kedro/io/core.py index 075b8dacf3..fa5f2625fe 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -438,7 +438,7 @@ def _load_obj(class_path: str) -> Optional[object]: f"{exc} Please see the documentation on how to " f"install relevant dependencies for {class_path}:\n" f"https://kedro.readthedocs.io/en/stable/" - f"04_kedro_project_setup/01_dependencies.html" + f"kedro_project_setup/dependencies.html" ) from exc return None @@ -521,7 +521,7 @@ def __init__( self._exists_function = exists_function 
or _local_exists self._glob_function = glob_function or iglob # 1 entry for load version, 1 for save version - self._version_cache = Cache(maxsize=2) + self._version_cache = Cache(maxsize=2) # type: Cache # 'key' is set to prevent cache key overlapping for load and save: # https://cachetools.readthedocs.io/en/stable/#cachetools.cachedmethod diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 86666afd9a..7e6945186e 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -8,10 +8,8 @@ import difflib import logging import re -import warnings from collections import defaultdict -from functools import partial -from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Type, Union +from typing import Any, Dict, List, Optional, Set, Type, Union from kedro.io.core import ( AbstractDataSet, @@ -22,9 +20,7 @@ Version, generate_timestamp, ) -from kedro.io.memory_data_set import MemoryDataSet -from kedro.io.transformers import AbstractTransformer -from kedro.versioning import Journal +from kedro.io.memory_dataset import MemoryDataSet CATALOG_KEY = "catalog" CREDENTIALS_KEY = "credentials" @@ -138,14 +134,10 @@ class DataCatalog: to the underlying data sets. """ - # pylint: disable=too-many-arguments def __init__( self, data_sets: Dict[str, AbstractDataSet] = None, feed_dict: Dict[str, Any] = None, - transformers: Dict[str, List[AbstractTransformer]] = None, - default_transformers: List[AbstractTransformer] = None, - journal: Journal = None, layers: Dict[str, Set[str]] = None, ) -> None: """``DataCatalog`` stores instances of ``AbstractDataSet`` @@ -158,18 +150,10 @@ def __init__( Args: data_sets: A dictionary of data set names and data set instances. feed_dict: A feed dict with data to be added in memory. - transformers: A dictionary of lists of transformers to be applied - to the data sets. - default_transformers: A list of transformers to be applied to any - new data sets. - journal: Instance of Journal. layers: A dictionary of data set layers. It maps a layer name to a set of data set names, according to the data engineering convention. For more details, see - https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#what-is-data-engineering-convention - Raises: - DataSetNotFoundError: When transformers are passed for a non - existent data set. + https://kedro.readthedocs.io/en/stable/faq/faq.html#what-is-data-engineering-convention Example: :: @@ -185,18 +169,6 @@ def __init__( self.datasets = _FrozenDatasets(self._data_sets) self.layers = layers - if transformers or default_transformers: - warnings.warn( - "The transformer API will be deprecated in Kedro 0.18.0." - "Please use Dataset Hooks to customise the load and save methods." 
- "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/02_hooks.html", - DeprecationWarning, - ) - self._transformers = {k: list(v) for k, v in (transformers or {}).items()} - self._default_transformers = list(default_transformers or []) - self._check_and_normalize_transformers() - self._journal = journal # import the feed dict if feed_dict: self.add_feed_dict(feed_dict) @@ -205,21 +177,6 @@ def __init__( def _logger(self): return logging.getLogger(__name__) - def _check_and_normalize_transformers(self): - data_sets = self._data_sets.keys() - transformers = self._transformers.keys() - excess_transformers = transformers - data_sets - missing_transformers = data_sets - transformers - - if excess_transformers: - raise DataSetNotFoundError( - f"Unexpected transformers for missing data_sets {', '.join(excess_transformers)}" - ) - - for data_set_name in missing_transformers: - self._transformers[data_set_name] = list(self._default_transformers) - - # pylint: disable=too-many-arguments @classmethod def from_config( cls: Type, @@ -227,7 +184,6 @@ def from_config( credentials: Dict[str, Dict[str, Any]] = None, load_versions: Dict[str, str] = None, save_version: str = None, - journal: Journal = None, ) -> "DataCatalog": """Create a ``DataCatalog`` instance from configuration. This is a factory method used to provide developers with a way to instantiate @@ -252,7 +208,6 @@ class to be loaded is specified with the key ``type`` and their case-insensitive string that conforms with operating system filename limitations, b) always return the latest version when sorted in lexicographical order. - journal: Instance of Journal. Returns: An instantiated ``DataCatalog`` containing all specified @@ -302,8 +257,7 @@ class to be loaded is specified with the key ``type`` and their data_sets = {} catalog = copy.deepcopy(catalog) or {} credentials = copy.deepcopy(credentials) or {} - run_id = journal.run_id if journal else None - save_version = save_version or run_id or generate_timestamp() + save_version = save_version or generate_timestamp() load_versions = copy.deepcopy(load_versions) or {} missing_keys = load_versions.keys() - catalog.keys() @@ -325,7 +279,7 @@ class to be loaded is specified with the key ``type`` and their ) dataset_layers = layers or None - return cls(data_sets=data_sets, journal=journal, layers=dataset_layers) + return cls(data_sets=data_sets, layers=dataset_layers) def _get_dataset( self, data_set_name: str, version: Version = None @@ -350,14 +304,6 @@ def _get_dataset( return data_set - def _get_transformed_dataset_function( - self, data_set_name: str, operation: str, data_set: AbstractDataSet - ) -> Callable: - func = getattr(data_set, operation) - for transformer in reversed(self._transformers[data_set_name]): - func = partial(getattr(transformer, operation), data_set_name, func) - return func - def load(self, name: str, version: str = None) -> Any: """Loads a registered data set. 
@@ -393,18 +339,8 @@ def load(self, name: str, version: str = None) -> Any: "Loading data from `%s` (%s)...", name, type(dataset).__name__ ) - func = self._get_transformed_dataset_function(name, "load", dataset) - result = func() - - version = ( - dataset.resolve_load_version() - if isinstance(dataset, AbstractVersionedDataSet) - else None - ) + result = dataset.load() - # Log only if versioning is enabled for the data set - if self._journal and version: - self._journal.log_catalog(name, "load", version) return result def save(self, name: str, data: Any) -> None: @@ -440,18 +376,7 @@ def save(self, name: str, data: Any) -> None: self._logger.info("Saving data to `%s` (%s)...", name, type(dataset).__name__) - func = self._get_transformed_dataset_function(name, "save", dataset) - func(data) - - version = ( - dataset.resolve_save_version() - if isinstance(dataset, AbstractVersionedDataSet) - else None - ) - - # Log only if versioning is enabled for the data set - if self._journal and version: - self._journal.log_catalog(name, "save", version) + dataset.save(data) def exists(self, name: str) -> bool: """Checks whether registered data set exists by calling its `exists()` @@ -520,7 +445,6 @@ def add( f"DataSet '{data_set_name}' has already been registered" ) self._data_sets[data_set_name] = data_set - self._transformers[data_set_name] = list(self._default_transformers) self.datasets = _FrozenDatasets(self.datasets, {data_set_name: data_set}) def add_all( @@ -591,46 +515,6 @@ def add_feed_dict(self, feed_dict: Dict[str, Any], replace: bool = False) -> Non self.add(data_set_name, data_set, replace) - def add_transformer( - self, - transformer: AbstractTransformer, - data_set_names: Union[str, Iterable[str]] = None, - ): - """Add a ``DataSet`` Transformer to the``DataCatalog``. - Transformers can modify the way Data Sets are loaded and saved. - - Args: - transformer: The transformer instance to add. - data_set_names: The Data Sets to add the transformer to. - Or None to add the transformer to all Data Sets. - Raises: - DataSetNotFoundError: When a transformer is being added to a non - existent data set. - TypeError: When transformer isn't an instance of ``AbstractTransformer`` - """ - - warnings.warn( - "The transformer API will be deprecated in Kedro 0.18.0." - "Please use Dataset Hooks to customise the load and save methods." - "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/02_hooks.html", - DeprecationWarning, - ) - - if not isinstance(transformer, AbstractTransformer): - raise TypeError( - f"Object of type {type(transformer)} is not an instance of AbstractTransformer" - ) - if data_set_names is None: - self._default_transformers.append(transformer) - data_set_names = self._transformers.keys() - elif isinstance(data_set_names, str): - data_set_names = [data_set_names] - for data_set_name in data_set_names: - if data_set_name not in self._data_sets: - raise DataSetNotFoundError(f"No data set called {data_set_name}") - self._transformers[data_set_name].append(transformer) - def list(self, regex_search: Optional[str] = None) -> List[str]: """ List of all ``DataSet`` names registered in the catalog. @@ -682,28 +566,10 @@ def shallow_copy(self) -> "DataCatalog": Returns: Copy of the current object. 
""" - return DataCatalog( - data_sets=self._data_sets, - transformers=self._transformers, - default_transformers=self._default_transformers, - journal=self._journal, - layers=self.layers, - ) + return DataCatalog(data_sets=self._data_sets, layers=self.layers) def __eq__(self, other): - return ( - self._data_sets, - self._transformers, - self._default_transformers, - self._journal, - self.layers, - ) == ( - other._data_sets, - other._transformers, - other._default_transformers, - other._journal, - other.layers, - ) + return (self._data_sets, self.layers) == (other._data_sets, other.layers) def confirm(self, name: str) -> None: """Confirm a dataset by its name. diff --git a/kedro/io/data_catalog_with_default.py b/kedro/io/data_catalog_with_default.py deleted file mode 100644 index 27c1a341d2..0000000000 --- a/kedro/io/data_catalog_with_default.py +++ /dev/null @@ -1,188 +0,0 @@ -"""A ``DataCatalog`` with a default ``DataSet`` implementation for any data set -which is not registered in the catalog. -""" -import warnings -from typing import Any, Callable, Dict, Optional - -from kedro.io.core import AbstractDataSet -from kedro.io.data_catalog import DataCatalog -from kedro.versioning import Journal - - -class DataCatalogWithDefault(DataCatalog): - """A ``DataCatalog`` with a default ``DataSet`` implementation for any - data set which is not registered in the catalog. - """ - - def __init__( - self, - data_sets: Dict[str, AbstractDataSet] = None, - default: Callable[[str], AbstractDataSet] = None, - remember: bool = False, - ): - """``DataCatalogWithDefault`` is deprecated and will be removed in Kedro 0.18.0. - A ``DataCatalog`` with a default ``DataSet`` implementation for any - data set which is not registered in the catalog. - - Args: - data_sets: A dictionary of data set names and data set instances. - default: A callable which accepts a single argument of type string, - the key of the data set, and returns an ``AbstractDataSet``. - ``load`` and ``save`` calls on data sets which are not - registered to the catalog will be delegated to this - ``AbstractDataSet``. - remember: If True, then store in the catalog any - ``AbstractDataSet``s provided by the ``default`` callable - argument. Useful when one want to transition from a - ``DataCatalogWithDefault`` to a ``DataCatalog``: just call - ``DataCatalogWithDefault.to_yaml``, after all required data - sets have been saved/loaded, and use the generated YAML file - with a new ``DataCatalog``. - Raises: - TypeError: If default is not a callable. - - Example: - :: - - >>> from kedro.extras.datasets.pandas import CSVDataSet - >>> - >>> def default_data_set(name): - >>> return CSVDataSet(filepath='data/01_raw/' + name) - >>> - >>> io = DataCatalog(data_sets={}, - >>> default=default_data_set) - >>> - >>> # load the file in data/raw/cars.csv - >>> df = io.load("cars.csv") - """ - super().__init__(data_sets) - - warnings.warn( - "`DataCatalogWithDefault` is now deprecated and will be removed in Kedro 0.18.0." - "For more information, please visit " - "https://github.com/kedro-org/kedro/blob/main/RELEASE.md", - DeprecationWarning, - ) - - if not callable(default): - raise TypeError( - "Default must be a callable with a single input " - "string argument: the key of the requested data " - "set." - ) - self._default = default - self._remember = remember - - def load(self, name: str, version: str = None) -> Any: - """Loads a registered data set - - Args: - name: A data set to be loaded. - version: Optional version to be loaded. 
- - - Returns: - The loaded data as configured. - - Raises: - DataSetNotFoundError: When a data set with the given name - has not yet been registered. - - """ - data_set = self._data_sets.get(name, self._default(name)) - - if self._remember and name not in self._data_sets: - self._data_sets[name] = data_set - - return data_set.load() - - def save(self, name: str, data: Any): - """Save data to a registered data set. - - Args: - name: A data set to be saved to. - data: A data object to be saved as configured in the registered - data set. - - Raises: - DataSetNotFoundError: When a data set with the given name - has not yet been registered. - - """ - data_set = self._data_sets.get(name, self._default(name)) - - if self._remember and name not in self._data_sets: - self._data_sets[name] = data_set - - data_set.save(data) - - # pylint: disable=too-many-arguments - @classmethod - def from_config( - cls, - catalog: Optional[Dict[str, Dict[str, Any]]], - credentials: Dict[str, Dict[str, Any]] = None, - load_versions: Dict[str, str] = None, - save_version: str = None, - journal: Journal = None, - ): - """To create a ``DataCatalogWithDefault`` from configuration, please - use: - :: - - >>> DataCatalogWithDefault.from_data_catalog( - >>> DataCatalog.from_config(catalog, credentials)) - - Args: - catalog: See ``DataCatalog.from_config`` - credentials: See ``DataCatalog.from_config`` - load_versions: See ``DataCatalog.from_config`` - save_version: See ``DataCatalog.from_config`` - journal: See ``DataCatalog.from_config`` - - Raises: - ValueError: If you try to instantiate a ``DataCatalogWithDefault`` - directly with this method. - - """ - raise ValueError( - "Cannot instantiate a `DataCatalogWithDefault` " - "directly from configuration files. Please use" - "``DataCatalogWithDefault.from_data_catalog(" - "DataCatalog.from_config(catalog, " - "credentials, journal))" - ) - - @classmethod - def from_data_catalog( - cls, data_catalog: DataCatalog, default: Callable[[str], AbstractDataSet] - ) -> "DataCatalogWithDefault": - """Convenience factory method to create a ``DataCatalogWithDefault`` - from a ``DataCatalog`` - - A ``DataCatalog`` with a default ``DataSet`` implementation for any - data set which is not registered in the catalog. - - Args: - data_catalog: The ``DataCatalog`` to convert to a - ``DataCatalogWithDefault``. - default: A callable which accepts a single argument of type string, - the key of the data set, and returns an ``AbstractDataSet``. - ``load`` and ``save`` calls on data sets which are not - registered to the catalog will be delegated to this - ``AbstractDataSet``. - - Returns: - A new ``DataCatalogWithDefault`` which contains all the - ``AbstractDataSets`` from the provided data-catalog. - - """ - # pylint: disable=protected-access - return cls({**data_catalog._data_sets}, default) - - def shallow_copy(self) -> "DataCatalogWithDefault": # pragma: no cover - """Returns a shallow copy of the current object. - Returns: - Copy of the current object. 
- """ - return DataCatalogWithDefault({**self._data_sets}, self._default) diff --git a/kedro/io/lambda_data_set.py b/kedro/io/lambda_dataset.py similarity index 100% rename from kedro/io/lambda_data_set.py rename to kedro/io/lambda_dataset.py diff --git a/kedro/io/memory_data_set.py b/kedro/io/memory_dataset.py similarity index 100% rename from kedro/io/memory_data_set.py rename to kedro/io/memory_dataset.py diff --git a/kedro/io/partitioned_data_set.py b/kedro/io/partitioned_dataset.py similarity index 97% rename from kedro/io/partitioned_data_set.py rename to kedro/io/partitioned_dataset.py index de92556dd4..1785357167 100644 --- a/kedro/io/partitioned_data_set.py +++ b/kedro/io/partitioned_dataset.py @@ -37,8 +37,8 @@ class PartitionedDataSet(AbstractDataSet): https://github.com/intake/filesystem_spec. Example adding a catalog entry with - `YAML API `_: + `YAML API `_: .. code-block:: yaml @@ -136,7 +136,7 @@ def __init__( # pylint: disable=too-many-arguments and the dataset initializer. If the dataset config contains explicit credentials spec, then such spec will take precedence. All possible credentials management scenarios are documented here: - https://kedro.readthedocs.io/en/stable/05_data/02_kedro_io.html#partitioned-dataset-credentials + https://kedro.readthedocs.io/en/stable/data/kedro_io.html#partitioned-dataset-credentials load_args: Keyword arguments to be passed into ``find()`` method of the filesystem implementation. fs_args: Extra arguments to pass into underlying filesystem class constructor @@ -155,7 +155,7 @@ def __init__( # pylint: disable=too-many-arguments self._filename_suffix = filename_suffix self._overwrite = overwrite self._protocol = infer_storage_options(self._path)["protocol"] - self._partition_cache = Cache(maxsize=1) + self._partition_cache = Cache(maxsize=1) # type: Cache dataset = dataset if isinstance(dataset, dict) else {"type": dataset} self._dataset_type, self._dataset_config = parse_dataset_definition(dataset) @@ -372,7 +372,7 @@ def __init__( with the corresponding dataset definition including ``filepath`` (unlike ``dataset`` argument). Checkpoint configuration is described here: - https://kedro.readthedocs.io/en/stable/05_data/02_kedro_io.html#checkpoint-configuration + https://kedro.readthedocs.io/en/stable/data/kedro_io.html#checkpoint-configuration Credentials for the checkpoint can be explicitly specified in this configuration. filepath_arg: Underlying dataset initializer argument that will @@ -387,7 +387,7 @@ def __init__( the dataset or the checkpoint configuration contains explicit credentials spec, then such spec will take precedence. All possible credentials management scenarios are documented here: - https://kedro.readthedocs.io/en/stable/05_data/02_kedro_io.html#partitioned-dataset-credentials + https://kedro.readthedocs.io/en/stable/data/kedro_io.html#partitioned-dataset-credentials load_args: Keyword arguments to be passed into ``find()`` method of the filesystem implementation. fs_args: Extra arguments to pass into underlying filesystem class constructor diff --git a/kedro/io/transformers.py b/kedro/io/transformers.py deleted file mode 100644 index 8a9ddb074f..0000000000 --- a/kedro/io/transformers.py +++ /dev/null @@ -1,49 +0,0 @@ -"""``Transformers`` modify the loading and saving of ``DataSets`` in a -``DataCatalog``. -""" -import abc -from typing import Any, Callable - - -class AbstractTransformer(abc.ABC): - """Transformers will be deprecated in Kedro 0.18.0 in favour of the Dataset Hooks. 
- - ``AbstractTransformer`` is the base class for all transformer implementations. - All transformer implementations should extend this abstract class - and customise the `load` and `save` methods where appropriate.""" - - def load(self, data_set_name: str, load: Callable[[], Any]) -> Any: - """ - This method will be deprecated in Kedro 0.18.0 in favour of the Dataset Hooks - `before_dataset_loaded` and `after_dataset_loaded`. - - Wrap the loading of a dataset. - Call ``load`` to get the data from the data set / next transformer. - - Args: - data_set_name: The name of the data set being loaded. - load: A callback to retrieve the data being loaded from the - data set / next transformer. - - Returns: - The loaded data. - """ - # pylint: disable=unused-argument, no-self-use - return load() - - def save(self, data_set_name: str, save: Callable[[Any], None], data: Any) -> None: - """ - This method will be deprecated in Kedro 0.18.0 in favour of the Dataset Hooks - `before_dataset_saved` and `after_dataset_saved`. - - Wrap the saving of a dataset. - Call ``save`` to pass the data to the data set / next transformer. - - Args: - data_set_name: The name of the data set being saved. - save: A callback to pass the data being saved on to the - data set / next transformer. - data: The data being saved - """ - # pylint: disable=unused-argument, no-self-use - save(data) diff --git a/kedro/pipeline/decorators.py b/kedro/pipeline/decorators.py deleted file mode 100644 index 49a4119c65..0000000000 --- a/kedro/pipeline/decorators.py +++ /dev/null @@ -1,69 +0,0 @@ -"""A module containing predefined node decorators in Kedro. -""" - -import logging -import time -import warnings -from functools import wraps -from typing import Callable - -warnings.simplefilter("default", DeprecationWarning) - -warnings.warn( - "Support for decorators will be deprecated in Kedro 0.18.0. " - "Please use Hooks to extend the behaviour of a node or pipeline.", - DeprecationWarning, -) - - -def _func_full_name(func: Callable): - if not getattr(func, "__module__", None): - return getattr(func, "__qualname__", repr(func)) - return f"{func.__module__}.{func.__qualname__}" - - -def _human_readable_time(elapsed: float): - mins, secs = divmod(elapsed, 60) - hours, mins = divmod(mins, 60) - - if hours > 0: - message = f"{int(hours)}h{int(mins):02}m{int(secs):02}s" - elif mins > 0: - message = f"{int(mins)}m{int(secs):02}s" - elif secs >= 1: - message = f"{secs:.2f}s" - else: - message = f"{secs * 1000.0:.0f}ms" - - return message - - -def log_time(func: Callable) -> Callable: - """A function decorator which logs the time taken for executing a function. - - Args: - func: The function to be logged. - - Returns: - A wrapped function, which will execute the provided function and log - the running time. 
- - """ - - @wraps(func) - def with_time(*args, **kwargs): - log = logging.getLogger(__name__) - t_start = time.time() - result = func(*args, **kwargs) - t_end = time.time() - elapsed = t_end - t_start - - log.info( - "Running %r took %s [%.3fs]", - _func_full_name(func), - _human_readable_time(elapsed), - elapsed, - ) - return result - - return with_time diff --git a/kedro/pipeline/modular_pipeline.py b/kedro/pipeline/modular_pipeline.py index ac864388e5..fe5c61d147 100644 --- a/kedro/pipeline/modular_pipeline.py +++ b/kedro/pipeline/modular_pipeline.py @@ -10,8 +10,6 @@ _transcode_split, ) -_PARAMETER_KEYWORDS = ("params:", "parameters") - class ModularPipelineError(Exception): """Raised when a modular pipeline is not adapted and integrated @@ -21,8 +19,16 @@ class ModularPipelineError(Exception): pass +def _is_all_parameters(name: str) -> bool: + return name == "parameters" + + +def _is_single_parameter(name: str) -> bool: + return name.startswith("params:") + + def _is_parameter(name: str) -> bool: - return any(name.startswith(param) for param in _PARAMETER_KEYWORDS) + return _is_single_parameter(name) or _is_all_parameters(name) def _validate_inputs_outputs( @@ -68,12 +74,86 @@ def _validate_datasets_exist( ) +def _get_dataset_names_mapping( + names: Union[str, Set[str], Dict[str, str]] = None +) -> Dict[str, str]: + """Take a name or a collection of dataset names + and turn it into a mapping from the old dataset names to the provided ones if necessary. + + Args: + names: A dataset name or collection of dataset names. + When str or Set[str] is provided, the listed names will stay + the same as they are named in the provided pipeline. + When Dict[str, str] is provided, current names will be + mapped to new names in the resultant pipeline. + Returns: + A dictionary that maps the old dataset names to the provided ones. + Examples: + >>> _get_dataset_names_mapping("dataset_name") + {"dataset_name": "dataset_name"} # a str name will stay the same + >>> _get_dataset_names_mapping(set(["ds_1", "ds_2"])) + {"ds_1": "ds_1", "ds_2": "ds_2"} # a Set[str] of names will stay the same + >>> _get_dataset_names_mapping({"ds_1": "new_ds_1_name"}) + {"ds_1": "new_ds_1_name"} # a Dict[str, str] of names will map key to value + """ + if names is None: + return {} + if isinstance(names, str): + return {names: names} + if isinstance(names, dict): + return copy.deepcopy(names) + + return {item: item for item in names} + + +def _normalize_param_name(name: str) -> str: + """Make sure that a param name has a `params:` prefix before passing to the node""" + return name if name.startswith("params:") else f"params:{name}" + + +def _get_param_names_mapping( + names: Union[str, Set[str], Dict[str, str]] = None +) -> Dict[str, str]: + """Take a parameter or a collection of parameter names + and turn it into a mapping from existing parameter names to new ones if necessary. + It follows the same rule as `_get_dataset_names_mapping` and + prefixes the keys on the resultant dictionary with `params:` to comply with node's syntax. + + Args: + names: A parameter name or collection of parameter names. + When str or Set[str] is provided, the listed names will stay + the same as they are named in the provided pipeline. + When Dict[str, str] is provided, current names will be + mapped to new names in the resultant pipeline. + Returns: + A dictionary that maps the old parameter names to the provided ones. 
+ Examples: + >>> _get_param_names_mapping("param_name") + {"params:param_name": "params:param_name"} # a str name will stay the same + >>> _get_param_names_mapping(set(["param_1", "param_2"])) + # a Set[str] of names will stay the same + {"params:param_1": "params:param_1", "params:param_2": "params:param_2"} + >>> _get_param_names_mapping({"param_1": "new_name_for_param_1"}) + # a Dict[str, str] of names will map key to value + {"params:param_1": "params:new_name_for_param_1"} + """ + params = {} + for name, new_name in _get_dataset_names_mapping(names).items(): + if _is_all_parameters(name): + params[name] = name # don't map parameters into params:parameters + else: + param_name = _normalize_param_name(name) + param_new_name = _normalize_param_name(new_name) + params[param_name] = param_new_name + return params + + def pipeline( pipe: Union[Iterable[Union[Node, Pipeline]], Pipeline], *, inputs: Union[str, Set[str], Dict[str, str]] = None, outputs: Union[str, Set[str], Dict[str, str]] = None, - parameters: Dict[str, str] = None, + parameters: Union[str, Set[str], Dict[str, str]] = None, tags: Union[str, Iterable[str]] = None, namespace: str = None, ) -> Pipeline: @@ -101,7 +181,12 @@ def pipeline( mapped to new names. Can refer to both the pipeline's free outputs, as well as intermediate results that need to be exposed. - parameters: A map of existing parameter to the new one. + parameters: A name or collection of parameters to namespace. + When str or Set[str] is provided, the listed parameter names will stay + the same as they are named in the provided pipeline. + When Dict[str, str] is provided, current parameter names will be + mapped to new names. + The parameters can be specified without the `params:` prefix. tags: Optional set of tags to be applied to all the pipeline nodes.
namespace: A prefix to give to all dataset names, except those explicitly named with the `inputs`/`outputs` @@ -126,17 +211,21 @@ def pipeline( return pipe # pylint: disable=protected-access - inputs = _to_dict(inputs) - outputs = _to_dict(outputs) - parameters = _to_dict(parameters) + inputs = _get_dataset_names_mapping(inputs) + outputs = _get_dataset_names_mapping(outputs) + parameters = _get_param_names_mapping(parameters) _validate_datasets_exist(inputs.keys(), outputs.keys(), parameters.keys(), pipe) _validate_inputs_outputs(inputs.keys(), outputs.keys(), pipe) mapping = {**inputs, **outputs, **parameters} - def _prefix(name: str) -> str: - return f"{namespace}.{name}" if namespace else name + def _prefix_dataset(name: str) -> str: + return f"{namespace}.{name}" + + def _prefix_param(name: str) -> str: + _, param_name = name.split("params:") + return f"params:{namespace}.{param_name}" def _is_transcode_base_in_mapping(name: str) -> bool: base_name, _ = _transcode_split(name) @@ -150,12 +239,14 @@ def _rename(name: str): rules = [ # if name mapped to new name, update with new name (lambda n: n in mapping, lambda n: mapping[n]), - # if it's a parameter, leave as is (don't namespace) - (_is_parameter, lambda n: n), + # if name refers to the set of all "parameters", leave as is + (_is_all_parameters, lambda n: n), # if transcode base is mapped to a new name, update with new base (_is_transcode_base_in_mapping, _map_transcode_base), - # if namespace given, prefix name using that namespace - (lambda n: bool(namespace), _prefix), + # if name refers to a single parameter and a namespace is given, apply prefix + (lambda n: bool(namespace) and _is_single_parameter(n), _prefix_param), + # if namespace given for a dataset, prefix name using that namespace + (lambda n: bool(namespace), _prefix_dataset), ] for predicate, processor in rules: @@ -197,13 +288,3 @@ def _copy_node(node: Node) -> Node: new_nodes = [_copy_node(n) for n in pipe.nodes] return Pipeline(new_nodes, tags=tags) - - -def _to_dict(element: Union[None, str, Set[str], Dict[str, str]]) -> Dict[str, str]: - if element is None: - return {} - if isinstance(element, str): - return {element: element} - if isinstance(element, dict): - return copy.deepcopy(element) - return {item: item for item in element} diff --git a/kedro/pipeline/node.py b/kedro/pipeline/node.py index 781cb2b634..d769e400f7 100644 --- a/kedro/pipeline/node.py +++ b/kedro/pipeline/node.py @@ -6,12 +6,11 @@ import logging import re from collections import Counter -from functools import reduce from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Union from warnings import warn -class Node: # pylint: disable=too-many-instance-attributes +class Node: """``Node`` is an auxiliary class facilitating the operations required to run user-provided functions as part of Kedro pipelines. """ @@ -24,7 +23,6 @@ def __init__( *, name: str = None, tags: Union[str, Iterable[str]] = None, - decorators: Iterable[Callable] = None, confirms: Union[str, List[str]] = None, namespace: str = None, ): @@ -47,7 +45,6 @@ def __init__( name: Optional node name to be used when displaying the node in logs or any other visualisations. tags: Optional set of tags to be applied to the node. - decorators: Optional list of decorators to be applied to the node. confirms: Optional name or the list of the names of the datasets that should be confirmed. This will result in calling ``confirm()`` method of the corresponding data set instance. 
@@ -108,15 +105,6 @@ def __init__( self._name = name self._namespace = namespace self._tags = set(_to_list(tags)) - if decorators: - warn( - "The node's `decorators` API will be deprecated in Kedro 0.18.0." - "Please use a node's Hooks to extend the node's behaviour in a pipeline." - "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/02_hooks.html", - DeprecationWarning, - ) - self._decorators = list(decorators or []) self._validate_unique_outputs() self._validate_inputs_dif_than_outputs() @@ -133,7 +121,6 @@ def _copy(self, **overwrite_params): "name": self._name, "namespace": self._namespace, "tags": self._tags, - "decorators": self._decorators, "confirms": self._confirms, } params.update(overwrite_params) @@ -309,91 +296,6 @@ def confirms(self) -> List[str]: """ return _to_list(self._confirms) - @property - def _decorated_func(self): - return reduce(lambda g, f: f(g), self._decorators, self._func) - - def decorate(self, *decorators: Callable) -> "Node": - """Create a new ``Node`` by applying the provided decorators to the - underlying function. If no decorators are passed, it will return a - new ``Node`` object, but with no changes to the function. - - Args: - *decorators: Decorators to be applied on the node function. - Decorators will be applied from right to left. - - Returns: - A new ``Node`` object with the decorators applied to the function. - - Example: - :: - - >>> - >>> from functools import wraps - >>> - >>> - >>> def apply_f(func: Callable) -> Callable: - >>> @wraps(func) - >>> def with_f(*args, **kwargs): - >>> args = ["f({})".format(a) for a in args] - >>> return func(*args, **kwargs) - >>> return with_f - >>> - >>> - >>> def apply_g(func: Callable) -> Callable: - >>> @wraps(func) - >>> def with_g(*args, **kwargs): - >>> args = ["g({})".format(a) for a in args] - >>> return func(*args, **kwargs) - >>> return with_g - >>> - >>> - >>> def apply_h(func: Callable) -> Callable: - >>> @wraps(func) - >>> def with_h(*args, **kwargs): - >>> args = ["h({})".format(a) for a in args] - >>> return func(*args, **kwargs) - >>> return with_h - >>> - >>> - >>> def apply_fg(func: Callable) -> Callable: - >>> @wraps(func) - >>> def with_fg(*args, **kwargs): - >>> args = ["fg({})".format(a) for a in args] - >>> return func(*args, **kwargs) - >>> return with_fg - >>> - >>> - >>> def identity(value): - >>> return value - >>> - >>> - >>> # using it as a regular python decorator - >>> @apply_f - >>> def decorated_identity(value): - >>> return value - >>> - >>> - >>> # wrapping the node function - >>> old_node = node(apply_g(decorated_identity), 'input', 'output', - >>> name='node') - >>> # using the .decorate() method to apply multiple decorators - >>> new_node = old_node.decorate(apply_h, apply_fg) - >>> result = new_node.run(dict(input=1)) - >>> - >>> assert old_node.name == new_node.name - >>> assert "output" in result - >>> assert result['output'] == "f(g(fg(h(1))))" - """ - warn( - "The node's `decorate` API will be deprecated in Kedro 0.18.0." - "Please use a node's Hooks to extend the node's behaviour in a pipeline." - "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/02_hooks.html", - DeprecationWarning, - ) - return self._copy(decorators=self._decorators + list(reversed(decorators))) - def run(self, inputs: Dict[str, Any] = None) -> Dict[str, Any]: """Run this node using the provided inputs and return its results in a dictionary. 
@@ -458,7 +360,7 @@ def _run_with_no_inputs(self, inputs: Dict[str, Any]): f"{sorted(inputs.keys())}." ) - return self._decorated_func() + return self._func() def _run_with_one_input(self, inputs: Dict[str, Any], node_input: str): if len(inputs) != 1 or node_input not in inputs: @@ -468,7 +370,7 @@ def _run_with_one_input(self, inputs: Dict[str, Any], node_input: str): f"{sorted(inputs.keys())}." ) - return self._decorated_func(inputs[node_input]) + return self._func(inputs[node_input]) def _run_with_list(self, inputs: Dict[str, Any], node_inputs: List[str]): # Node inputs and provided run inputs should completely overlap @@ -479,7 +381,7 @@ def _run_with_list(self, inputs: Dict[str, Any], node_inputs: List[str]): f"{sorted(inputs.keys())}." ) # Ensure the function gets the inputs in the correct order - return self._decorated_func(*(inputs[item] for item in node_inputs)) + return self._func(*(inputs[item] for item in node_inputs)) def _run_with_dict(self, inputs: Dict[str, Any], node_inputs: Dict[str, str]): # Node inputs and provided run inputs should completely overlap @@ -491,7 +393,7 @@ def _run_with_dict(self, inputs: Dict[str, Any], node_inputs: Dict[str, str]): f"{sorted(inputs.keys())}." ) kwargs = {arg: inputs[alias] for arg, alias in node_inputs.items()} - return self._decorated_func(**kwargs) + return self._func(**kwargs) def _outputs_to_dictionary(self, outputs): def _from_dict(): diff --git a/kedro/pipeline/pipeline.py b/kedro/pipeline/pipeline.py index 17ad1187fd..13640e1b7d 100644 --- a/kedro/pipeline/pipeline.py +++ b/kedro/pipeline/pipeline.py @@ -7,8 +7,7 @@ import json from collections import Counter, defaultdict from itertools import chain -from typing import Callable, Dict, Iterable, List, Set, Tuple, Union -from warnings import warn +from typing import Dict, Iterable, List, Set, Tuple, Union from toposort import CircularDependencyError as ToposortCircleError from toposort import toposort @@ -389,18 +388,17 @@ def only_nodes(self, *node_names: str) -> "Pipeline": return Pipeline(nodes) def only_nodes_with_namespace(self, node_namespace: str) -> "Pipeline": - """Create a new ``Pipeline`` which will contain only the specified - nodes by namespace. + """Creates a new ``Pipeline`` containing only nodes with the specified + namespace. Args: node_namespace: One node namespace. Raises: - ValueError: When pipeline contains no pipeline with the specified namespace. + ValueError: When pipeline contains no nodes with the specified namespace. Returns: - A new ``Pipeline`` with nodes starting with a specified namespace. - + A new ``Pipeline`` containing nodes with the specified namespace. """ nodes = [ n @@ -409,8 +407,7 @@ def only_nodes_with_namespace(self, node_namespace: str) -> "Pipeline": ] if not nodes: raise ValueError( - f"Pipeline does not contain nodes with namespace(s) " - f"{list(node_namespace)}." + f"Pipeline does not contain nodes with namespace `{node_namespace}`" ) return Pipeline(nodes) @@ -651,7 +648,7 @@ def to_nodes(self, *node_names: str) -> "Pipeline": return res def only_nodes_with_tags(self, *tags: str) -> "Pipeline": - """Create a new ``Pipeline`` object with the nodes which contain *any* + """Creates a new ``Pipeline`` object with the nodes which contain *any* of the provided tags. The resulting ``Pipeline`` is empty if no tags are provided. 
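To make the corrected `only_nodes_with_namespace` docstring and error message concrete, here is a small self-contained sketch; the node functions, names and namespaces are made up.

```python
# Illustrative namespace selection matching the corrected docstring above.
from kedro.pipeline import Pipeline, node


def identity(x):
    return x


pipe = Pipeline(
    [
        node(identity, "a", "b", name="clean", namespace="data_engineering"),
        node(identity, "b", "c", name="train", namespace="data_science"),
    ]
)

de_only = pipe.only_nodes_with_namespace("data_engineering")
print([n.name for n in de_only.nodes])  # expected: ['data_engineering.clean']

# An unknown namespace now raises the corrected, single-namespace message:
# ValueError: Pipeline does not contain nodes with namespace `data_quality`
try:
    pipe.only_nodes_with_namespace("data_quality")
except ValueError as err:
    print(err)
```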
@@ -667,29 +664,97 @@ def only_nodes_with_tags(self, *tags: str) -> "Pipeline": nodes = [node for node in self.nodes if tags & node.tags] return Pipeline(nodes) - def decorate(self, *decorators: Callable) -> "Pipeline": - """Create a new ``Pipeline`` by applying the provided decorators to - all the nodes in the pipeline. If no decorators are passed, it will - return a copy of the current ``Pipeline`` object. + # pylint: disable=too-many-arguments + def filter( + self, + tags: Iterable[str] = None, + from_nodes: Iterable[str] = None, + to_nodes: Iterable[str] = None, + node_names: Iterable[str] = None, + from_inputs: Iterable[str] = None, + to_outputs: Iterable[str] = None, + node_namespace: str = None, + ) -> "Pipeline": + """Creates a new ``Pipeline`` object with the nodes that meet all of the + specified filtering conditions. + + The new pipeline object is the intersection of pipelines that meet each + filtering condition. This is distinct from chaining multiple filters together. Args: - *decorators: Decorators to be applied on all node functions in - the pipeline, always applied from right to left. + tags: A list of node tags which should be used to lookup + the nodes of the new ``Pipeline``. + from_nodes: A list of node names which should be used as a + starting point of the new ``Pipeline``. + to_nodes: A list of node names which should be used as an + end point of the new ``Pipeline``. + node_names: A list of node names which should be selected for the + new ``Pipeline``. + from_inputs: A list of inputs which should be used as a starting point + of the new ``Pipeline`` + to_outputs: A list of outputs which should be the final outputs of + the new ``Pipeline``. + node_namespace: One node namespace which should be used to select + nodes in the new ``Pipeline``. Returns: - A new ``Pipeline`` object with all nodes decorated with the - provided decorators. + A new ``Pipeline`` object with nodes that meet all of the specified + filtering conditions. + Raises: + ValueError: The filtered ``Pipeline`` has no nodes. + + Example: + :: + + >>> pipeline = Pipeline( + >>> [ + >>> node(func, "A", "B", name="node1"), + >>> node(func, "B", "C", name="node2"), + >>> node(func, "C", "D", name="node3"), + >>> ] + >>> ) + >>> pipeline.filter(node_names=["node1", "node3"], from_inputs=["A"]) + >>> # Gives a new pipeline object containing node1 and node3. """ - warn( - "The pipeline's `decorate` API will be deprecated in Kedro 0.18.0." - "Please use a node's Hooks to extend the node's behaviour in a pipeline." - "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/02_hooks.html", - DeprecationWarning, - ) - nodes = [node.decorate(*decorators) for node in self.nodes] - return Pipeline(nodes) + # Use [node_namespace] so only_nodes_with_namespace can follow the same + # *filter_args pattern as the other filtering methods, which all take iterables. + node_namespace = [node_namespace] if node_namespace else None + + filter_methods = { + self.only_nodes_with_tags: tags, + self.from_nodes: from_nodes, + self.to_nodes: to_nodes, + self.only_nodes: node_names, + self.from_inputs: from_inputs, + self.to_outputs: to_outputs, + self.only_nodes_with_namespace: node_namespace, + } + + subset_pipelines = { + filter_method(*filter_args) # type: ignore + for filter_method, filter_args in filter_methods.items() + if filter_args + } + + # Intersect all the pipelines subsets. 
We apply each filter to the original + # pipeline object (self) rather than incrementally chaining filter methods + # together. Hence the order of filtering does not affect the outcome, and the + # resultant pipeline is unambiguously defined. + # If this were not the case then, for example, + # pipeline.filter(node_names=["node1", "node3"], from_inputs=["A"]) + # would give different outcomes depending on the order of filter methods: + # only_nodes and then from_inputs would give node1, while from_inputs and then + # only_nodes would give both node1 and node3. + filtered_pipeline = Pipeline(self.nodes) + for subset_pipeline in subset_pipelines: + filtered_pipeline &= subset_pipeline + + if not filtered_pipeline.nodes: + raise ValueError( + "Pipeline contains no nodes after applying all provided filters" + ) + return filtered_pipeline def tag(self, tags: Union[str, Iterable[str]]) -> "Pipeline": """Tags all the nodes in the pipeline. diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 3226a2c451..2053fc671d 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -52,11 +52,11 @@ def save(self, data: Any): # Checks if the error is due to serialisation or not try: pickle.dumps(data) - except Exception as exc: # SKIP_IF_NO_SPARK + except Exception as serialisation_exc: # SKIP_IF_NO_SPARK raise DataSetError( f"{str(data.__class__)} cannot be serialized. ParallelRunner " "implicit memory datasets can only be used with serializable data" - ) from exc + ) from serialisation_exc else: raise exc @@ -220,19 +220,19 @@ def _validate_catalog(cls, catalog: DataCatalog, pipeline: Pipeline): f"using functools.wraps()." ) - memory_data_sets = [] + memory_datasets = [] for name, data_set in data_sets.items(): if ( name in pipeline.all_outputs() and isinstance(data_set, MemoryDataSet) and not isinstance(data_set, BaseProxy) ): - memory_data_sets.append(name) + memory_datasets.append(name) - if memory_data_sets: + if memory_datasets: raise AttributeError( f"The following data sets are memory data sets: " - f"{sorted(memory_data_sets)}\n" + f"{sorted(memory_datasets)}\n" f"ParallelRunner does not support output to externally created " f"MemoryDataSets" ) @@ -270,7 +270,6 @@ def _run( # pylint: disable=too-many-locals,useless-suppression """ # pylint: disable=import-outside-toplevel,cyclic-import - from kedro.framework.session.session import get_current_session nodes = pipeline.nodes self._validate_catalog(catalog, pipeline) @@ -284,11 +283,7 @@ def _run( # pylint: disable=too-many-locals,useless-suppression done = None max_workers = self._get_required_workers_count(pipeline) - from kedro.framework.project import PACKAGE_NAME - - session = get_current_session(silent=True) - # pylint: disable=protected-access - conf_logging = session._get_logging_config() if session else None + from kedro.framework.project import LOGGING, PACKAGE_NAME with ProcessPoolExecutor(max_workers=max_workers) as pool: while True: @@ -303,7 +298,7 @@ def _run( # pylint: disable=too-many-locals,useless-suppression self._is_async, run_id, package_name=PACKAGE_NAME, - conf_logging=conf_logging, + conf_logging=LOGGING, ) ) if not futures: diff --git a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/__init__.py b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/__init__.py index 3d5ecd61b4..8e84ca9c0e 100644 --- a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/__init__.py +++ b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/__init__.py @@ -6,3
+6,5 @@ from .pipeline import create_pipeline __all__ = ["create_pipeline"] + +__version__ = "0.1" diff --git a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/config/parameters/{{ cookiecutter.pipeline_name }}.yml b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/config/parameters/{{ cookiecutter.pipeline_name }}.yml index e94e6a6efd..093b9a9f80 100644 --- a/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/config/parameters/{{ cookiecutter.pipeline_name }}.yml +++ b/kedro/templates/pipeline/{{ cookiecutter.pipeline_name }}/config/parameters/{{ cookiecutter.pipeline_name }}.yml @@ -2,4 +2,4 @@ # using Kedro {{ cookiecutter.kedro_version }}. # # Documentation for this file format can be found in "Parameters" -# Link: https://kedro.readthedocs.io/en/{{ cookiecutter.kedro_version }}/04_kedro_project_setup/02_configuration.html#parameters +# Link: https://kedro.readthedocs.io/en/{{ cookiecutter.kedro_version }}/kedro_project_setup/configuration.html#parameters diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/README.md b/kedro/templates/project/{{ cookiecutter.repo_name }}/README.md index 44a529d9b9..1493cb1020 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/README.md +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/README.md @@ -11,7 +11,7 @@ Take a look at the [Kedro documentation](https://kedro.readthedocs.io) to get st In order to get the best out of the template: * Don't remove any lines from the `.gitignore` file we provide -* Make sure your results can be reproduced by following a [data engineering convention](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#what-is-data-engineering-convention) +* Make sure your results can be reproduced by following a [data engineering convention](https://kedro.readthedocs.io/en/stable/faq/faq.html#what-is-data-engineering-convention) * Don't commit data to your repository * Don't commit any credentials or your local configuration to your repository. Keep all your credentials and local configuration in `conf/local/` @@ -22,7 +22,7 @@ Declare any dependencies in `src/requirements.txt` for `pip` installation and `s To install them, run: ``` -kedro install +pip install -r src/requirements.txt ``` ## How to run your Kedro pipeline @@ -51,17 +51,17 @@ To generate or update the dependency requirements for your project: kedro build-reqs ``` -This will copy the contents of `src/requirements.txt` into a new file `src/requirements.in` which will be used as the source for `pip-compile`. You can see the output of the resolution by opening `src/requirements.txt`. +This will `pip-compile` the contents of `src/requirements.txt` into a new file `src/requirements.lock`. You can see the output of the resolution by opening `src/requirements.lock`. -After this, if you'd like to update your project requirements, please update `src/requirements.in` and re-run `kedro build-reqs`. +After this, if you'd like to update your project requirements, please update `src/requirements.txt` and re-run `kedro build-reqs`. 
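Returning to the `Pipeline.filter()` method added in `kedro/pipeline/pipeline.py` above, here is a short sketch of its intersection semantics that mirrors the docstring example; the `identity` function is invented.

```python
# Sketch of filter()'s intersection semantics, mirroring the docstring example above.
from kedro.pipeline import Pipeline, node


def identity(x):
    return x


pipe = Pipeline(
    [
        node(identity, "A", "B", name="node1"),
        node(identity, "B", "C", name="node2"),
        node(identity, "C", "D", name="node3"),
    ]
)

# Each condition is applied to the *original* pipeline and the results intersected:
#   only_nodes("node1", "node3") -> {node1, node3}
#   from_inputs("A")             -> {node1, node2, node3}
#   intersection                 -> {node1, node3}, whatever order the filters run in.
filtered = pipe.filter(node_names=["node1", "node3"], from_inputs=["A"])
print(sorted(n.name for n in filtered.nodes))  # expected: ['node1', 'node3']
```

If the intersection is empty, the new `ValueError` ("Pipeline contains no nodes after applying all provided filters") is raised rather than returning an empty pipeline.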
-[Further information about project dependencies](https://kedro.readthedocs.io/en/stable/04_kedro_project_setup/01_dependencies.html#project-specific-dependencies) +[Further information about project dependencies](https://kedro.readthedocs.io/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies) ## How to work with Kedro and notebooks > Note: Using `kedro jupyter` or `kedro ipython` to run your notebook provides these variables in scope: `context`, `catalog`, and `startup_error`. > -> Jupyter, JupyterLab, and IPython are already included in the project requirements by default, so once you have run `kedro install` you will not need to take any extra steps before you use them. +> Jupyter, JupyterLab, and IPython are already included in the project requirements by default, so once you have run `pip install -r src/requirements.txt` you will not need to take any extra steps before you use them. ### Jupyter To use Jupyter notebooks in your Kedro project, you need to install Jupyter: @@ -119,4 +119,4 @@ To automatically strip out all output cell contents before committing to `git`, ## Package your Kedro project -[Further information about building project documentation and packaging your project](https://kedro.readthedocs.io/en/stable/03_tutorial/05_package_a_project.html) +[Further information about building project documentation and packaging your project](https://kedro.readthedocs.io/en/stable/tutorial/package_a_project.html) diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml index dfeffb42bb..3fd2208d45 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml @@ -1,4 +1,4 @@ # Here you can define all your data sets by using simple YAML syntax. 
# # Documentation for this file format can be found in "The Data Catalog" -# Link: https://kedro.readthedocs.io/en/stable/05_data/01_data_catalog.html +# Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/logging.yml b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/logging.yml index 3689418056..a4dcad2e08 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/logging.yml +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/logging.yml @@ -34,12 +34,6 @@ handlers: encoding: utf8 delay: True - journal_file_handler: - class: kedro.versioning.journal.JournalFileHandler - level: INFO - base_dir: logs/journals - formatter: json_formatter - loggers: anyconfig: level: WARNING @@ -56,11 +50,6 @@ loggers: handlers: [console, info_file_handler, error_file_handler] propagate: no - kedro.journal: - level: INFO - handlers: [journal_file_handler] - propagate: no - root: level: INFO handlers: [console, info_file_handler, error_file_handler] diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/requirements.txt b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/requirements.txt index 48e23902e9..e2f0204f69 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/requirements.txt +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/requirements.txt @@ -6,7 +6,7 @@ jupyter~=1.0 jupyter_client>=5.1, <7.0 jupyterlab~=3.0 kedro=={{ cookiecutter.kedro_version }} -kedro-telemetry~=0.1.0 +kedro-telemetry~=0.1.0; python_version < '3.9' nbstripout~=0.4 pytest-cov~=3.0 pytest-mock>=1.7.1, <2.0 diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/tests/test_run.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/tests/test_run.py index 196a9233fd..491d063b49 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/tests/test_run.py +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/tests/test_run.py @@ -25,5 +25,5 @@ def project_context(): # and should be replaced with the ones testing the project # functionality class TestProjectContext: - def test_package_name(self, project_context): - assert project_context.package_name == "{{ cookiecutter.python_package }}" + def test_project_path(self, project_context): + assert project_context.project_path == Path.cwd() diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py index 628f18a70e..5f74f6b303 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/__main__.py @@ -20,7 +20,7 @@ def _find_run_command(package_name): if run: # use run command from installed plugin if it exists return run - # use run command from the framework project + # use run command from `kedro.framework.cli.project` from kedro.framework.cli.project import run return run diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py deleted file mode 100644 index 3991d30c11..0000000000 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/hooks.py +++ 
/dev/null @@ -1,28 +0,0 @@ -"""Project hooks.""" -from typing import Any, Dict, Iterable, Optional - -from kedro.config import ConfigLoader -from kedro.framework.hooks import hook_impl -from kedro.io import DataCatalog -from kedro.versioning import Journal - - -class ProjectHooks: - @hook_impl - def register_config_loader( - self, conf_paths: Iterable[str], env: str, extra_params: Dict[str, Any], - ) -> ConfigLoader: - return ConfigLoader(conf_paths) - - @hook_impl - def register_catalog( - self, - catalog: Optional[Dict[str, Dict[str, Any]]], - credentials: Dict[str, Dict[str, Any]], - load_versions: Dict[str, str], - save_version: str, - journal: Journal, - ) -> DataCatalog: - return DataCatalog.from_config( - catalog, credentials, load_versions, save_version, journal - ) diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py index cc573c8b8b..bb12487903 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/settings.py @@ -1,8 +1,8 @@ """Project settings.""" -from {{cookiecutter.python_package}}.hooks import ProjectHooks -# Instantiate and list your project hooks here -HOOKS = (ProjectHooks(),) +# Instantiate and list your custom project hooks here +# from {{cookiecutter.python_package}}.hooks import ProjectHooks +# HOOKS = (ProjectHooks(),) # List the installed plugins for which to disable auto-registry # DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",) @@ -20,4 +20,21 @@ # CONTEXT_CLASS = KedroContext # Define the configuration folder. Defaults to `conf` -# CONF_ROOT = "conf" +# CONF_SOURCE = "conf" + +# Define the project ConfigLoader class here. +# Defaults to kedro.config.ConfigLoader +# from kedro.config import TemplatedConfigLoader +# CONFIG_LOADER_CLASS = TemplatedConfigLoader + +# Define keyword arguments to be passed to `CONFIG_LOADER_CLASS` constructor. +# These kwargs depend on the `ConfigLoader` class implementation. +# CONFIG_LOADER_ARGS = { +# "globals_pattern": "*globals.yml", +# "base_env": "base", +# "default_run_env": "local", +# } + +# Define the project DataCatalog class here. +# Defaults to kedro.io.DataCatalog +# DATA_CATALOG_CLASS = DataCatalog diff --git a/kedro/versioning/__init__.py b/kedro/versioning/__init__.py deleted file mode 100644 index d4bec012be..0000000000 --- a/kedro/versioning/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -"""``kedro.versioning`` provides functionality to setup the Journal for -capturing information required to reproduce a Kedro run. -""" - -from .journal import Journal - -__all__ = ["Journal"] diff --git a/kedro/versioning/journal.py b/kedro/versioning/journal.py deleted file mode 100644 index 6dc3412a93..0000000000 --- a/kedro/versioning/journal.py +++ /dev/null @@ -1,136 +0,0 @@ -"""This module provides journal logging to enable versioning support for -Kedro project.""" -import json -import logging -import subprocess -import warnings -from pathlib import Path -from typing import Any, Dict, Mapping, Optional, Union - -_JOURNAL_KEY = "kedro.journal" - - -class Journal: - """``Journal`` class provides journal logging to enable versioning support for - Kedro project. 
- """ - - def __init__(self, record_data: Dict[str, Any]): - """Initialise ``Journal`` as a session of the journal versioning, - and log the project context with an unique identifier. - - Args: - record_data: JSON serializable dictionary specific to project context. - Raises: - DeprecationWarning - """ - warnings.warn( - "`Journal` is now deprecated and will be removed in Kedro 0.18.0." - "For more information, please visit " - "https://github.com/kedro-org/kedro/blob/main/RELEASE.md", - DeprecationWarning, - ) - - self.run_id = record_data["run_id"] - record_data["git_sha"] = _git_sha(record_data["project_path"]) - self._log_journal("ContextJournalRecord", record_data) - - def _log_journal(self, record_type: str, record_data: Mapping) -> None: - """Log a record to journal. - - Args: - record_type: A unique type identifier. - record_data: JSON serializable dictionary, specific to ``record_type``. - - """ - # pylint: disable=no-self-use - try: - logging.getLogger(_JOURNAL_KEY).info( - json.dumps({"type": record_type, **record_data}) - ) - except TypeError: - logging.getLogger(__name__).error( - "Unable to record %s to journal, make sure it's a " - "serializable dictionary", - repr(record_data), - ) - - def log_catalog( - self, dataset_name: str, operation: str, version: str = None - ) -> None: - """Log journal record for ``DataCatalog``. - - Args: - dataset_name: Name of dataset being logged. - operation: Operation on dataset, one of {'save', 'load'}. - version: Dataset version corresponding to operation (i.e if operation - is "save" then this is "save_version"). - - """ - record_data = { - "run_id": self.run_id, - "name": dataset_name, - "operation": operation, - "version": version, - } - self._log_journal("DatasetJournalRecord", record_data) - - -def _git_sha(proj_dir: Union[str, Path] = None) -> Optional[str]: - """Git description of working tree. - - Returns: Git description or None. - - """ - proj_dir = str(proj_dir or Path.cwd()) - try: - res = subprocess.check_output( - ["git", "rev-parse", "--short", "HEAD"], cwd=proj_dir - ) - return res.decode().strip() - # `subprocess.check_output()` raises `NotADirectoryError` on Windows - except (subprocess.CalledProcessError, FileNotFoundError, NotADirectoryError): - logging.getLogger(__name__).warning("Unable to git describe %s", proj_dir) - return None - - -class JournalFileHandler(logging.Handler): - """Handler for logging journal record to a file based on journal ID.""" - - def __init__(self, base_dir: Union[str, Path]): - """Initialise ``JournalFileHandler`` which will handle logging journal record. - - Args: - base_dir: Base directory for saving journals. - - """ - super().__init__() - self.base_dir = Path(base_dir).expanduser() - self._file_handlers = {} # type:Dict[str, logging.FileHandler] - - def _generate_handler(self, run_id: str) -> logging.FileHandler: - """Generate unique filename for journal record path. - - Returns: - Logging FileHandler object. - - """ - self.base_dir.mkdir(parents=True, exist_ok=True) - handler_path = self.base_dir.resolve() / f"journal_{run_id}.log" - return logging.FileHandler(str(handler_path), mode="a") - - def emit(self, record: logging.LogRecord) -> None: - """Overriding emit function in logging.Handler, which will output the record to - the filelog based on run id. - - Args: - record: logging record. 
- - """ - message = json.loads(record.getMessage()) - - handler = self._file_handlers.setdefault( - message["run_id"], self._generate_handler(message["run_id"]) - ) - - handler.emit(record) diff --git a/pyproject.toml b/pyproject.toml index f63935b2d7..fe43a544f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,12 +69,10 @@ layers = [ "extras.datasets", "io", "pipeline", - "config", - "versioning" + "config" ] ignore_imports = [ - "kedro.runner.parallel_runner -> kedro.framework.project", - "kedro.runner.parallel_runner -> kedro.framework.session.session" + "kedro.runner.parallel_runner -> kedro.framework.project" ] [[tool.importlinter.contracts]] @@ -112,5 +110,5 @@ forbidden_modules = [ ] ignore_imports = [ "kedro.framework.context.context -> kedro.config", - "kedro.framework.hooks.specs -> kedro.config" + "kedro.framework.session.session -> kedro.config" ] diff --git a/setup.py b/setup.py index 07eb920a2a..2ebb99b533 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ here = path.abspath(path.dirname(__file__)) -PANDAS = "pandas>=0.24" +PANDAS = "pandas~=1.3" # to be able to use XMLDataSet and pandas integration with fsspec SPARK = "pyspark>=2.2, <4.0" HDFS = "hdfs>=2.5.8, <3.0" S3FS = "s3fs>=0.3.0, <0.5" @@ -58,22 +58,16 @@ def _collect_requirements(requires): api_require = {"api.APIDataSet": ["requests~=2.20"]} biosequence_require = {"biosequence.BioSequenceDataSet": ["biopython~=1.73"]} -dask_require = { - "dask.ParquetDataSet": [ - "dask>=2021.10.0, <2022.01; python_version > '3.6'", - "dask[complete]~=2.6; python_version == '3.6'", - ] -} +dask_require = {"dask.ParquetDataSet": ["dask[complete]~=2021.10"]} geopandas_require = { - "geopandas.GeoJSONDataSet": ["geopandas>=0.6.0, <1.0", "pyproj>=2.2.0, <3.0"] + "geopandas.GeoJSONDataSet": ["geopandas>=0.6.0, <1.0", "pyproj~=3.0"] } matplotlib_require = {"matplotlib.MatplotlibWriter": ["matplotlib>=3.0.3, <4.0"]} holoviews_require = {"holoviews.HoloviewsWriter": ["holoviews~=1.13.0"]} networkx_require = {"networkx.NetworkXDataSet": ["networkx~=2.4"]} pandas_require = { "pandas.CSVDataSet": [PANDAS], - "pandas.ExcelDataSet": [PANDAS, "xlrd~=1.0", "xlsxwriter~=1.0"], - "pandas.AppendableExcelDataSet": [PANDAS, "openpyxl>=3.0.3, <4.0"], + "pandas.ExcelDataSet": [PANDAS, "openpyxl>=3.0.6, <4.0"], "pandas.FeatherDataSet": [PANDAS], "pandas.GBQTableDataSet": [PANDAS, "pandas-gbq>=0.12.0, <1.0"], "pandas.GBQQueryDataSet": [PANDAS, "pandas-gbq>=0.12.0, <1.0"], @@ -83,6 +77,7 @@ def _collect_requirements(requires): "pandas.ParquetDataSet": [PANDAS, "pyarrow>=1.0, <7.0"], "pandas.SQLTableDataSet": [PANDAS, "SQLAlchemy~=1.2"], "pandas.SQLQueryDataSet": [PANDAS, "SQLAlchemy~=1.2"], + "pandas.XMLDataSet": [PANDAS, "lxml~=4.6"], "pandas.GenericDataSet": [PANDAS], } pillow_require = {"pillow.ImageDataSet": ["Pillow~=8.0"]} @@ -99,7 +94,7 @@ def _collect_requirements(requires): tensorflow_required = { "tensorflow.TensorflowModelDataset": [ # currently only TensorFlow V2 supported for saving and loading. 
- # V1 requires HDF5 and serializes differently + # V1 requires HDF5 and serialises differently "tensorflow~=2.0" ] } @@ -158,7 +153,7 @@ def _collect_requirements(requires): long_description=readme, long_description_content_type="text/markdown", url="https://github.com/kedro-org/kedro", - python_requires=">=3.6, <3.9", + python_requires=">=3.7, <3.10", packages=find_packages(exclude=["docs*", "tests*", "tools*", "features*"]), include_package_data=True, tests_require=test_requires, @@ -172,9 +167,9 @@ def _collect_requirements(requires): keywords="pipelines, machine learning, data pipelines, data science, data engineering", classifiers=[ "Development Status :: 4 - Beta", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", ], extras_require=extras_require, ) diff --git a/static/jsonschema/kedro-catalog-0.15.9.json b/static/jsonschema/kedro-catalog-0.15.9.json index beaecd5a3f..a5e755569d 100644 --- a/static/jsonschema/kedro-catalog-0.15.9.json +++ b/static/jsonschema/kedro-catalog-0.15.9.json @@ -79,7 +79,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -113,7 +113,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -149,7 +149,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention\n\nNote: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention\n\nNote: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO" } } } @@ -179,7 +179,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -207,7 +207,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -241,7 +241,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering 
convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -273,7 +273,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -307,7 +307,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -337,7 +337,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -369,7 +369,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -415,7 +415,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -447,7 +447,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -477,7 +477,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -503,7 +503,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering 
convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -537,7 +537,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -567,7 +567,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -609,7 +609,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -645,7 +645,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -677,7 +677,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -715,7 +715,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -753,7 +753,7 @@ }, "layer": { "type": "string", - "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/stable/11_faq/01_faq.html#what-is-data-engineering-convention" + "description": "The data layer according to the data engineering convention:\nhttps://kedro.readthedocs.io/en/0.15.9/06_resources/01_faq.html#what-is-data-engineering-convention" } } } @@ -849,7 +849,7 @@ }, "credentials": { "type": "object", - "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. 
If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\n**Note:** ``dataset_credentials`` key has now been deprecated\nand should not be specified.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials" + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\n**Note:** ``dataset_credentials`` key has now been deprecated\nand should not be specified.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.15.9/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials" }, "load_args": { "type": "object", diff --git a/static/jsonschema/kedro-catalog-0.16.json b/static/jsonschema/kedro-catalog-0.16.json index b8cb47db3e..32b23591bd 100644 --- a/static/jsonschema/kedro-catalog-0.16.json +++ b/static/jsonschema/kedro-catalog-0.16.json @@ -60,7 +60,7 @@ }, "credentials": { "type": "object", - "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\n**Note:** ``dataset_credentials`` key has now been deprecated\nand should not be specified.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials" + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\n**Note:** ``dataset_credentials`` key has now been deprecated\nand should not be specified.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.16.0/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials" }, "load_args": { "type": "object", diff --git a/static/jsonschema/kedro-catalog-0.17.json b/static/jsonschema/kedro-catalog-0.17.json index 149254bf6e..197e59dcb2 100644 --- a/static/jsonschema/kedro-catalog-0.17.json +++ b/static/jsonschema/kedro-catalog-0.17.json @@ -67,7 +67,7 @@ }, "credentials": { "type": "object", - "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/stable/04_user_guide/08_advanced_io.html#partitioned-dataset-credentials" + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. 
If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.17.0/05_data/02_kedro_io.html#partitioned-dataset-credentials" }, "load_args": { "type": "object", diff --git a/static/jsonschema/kedro-catalog-0.18.json b/static/jsonschema/kedro-catalog-0.18.json new file mode 100644 index 0000000000..24c6d47418 --- /dev/null +++ b/static/jsonschema/kedro-catalog-0.18.json @@ -0,0 +1,1042 @@ +{ + "type": "object", + "patternProperties": { + "^[a-z0-9-_]+$": { + "required": ["type"], + "properties": { + "type": { + "type": "string", + "enum": [ + "CachedDataSet", + "IncrementalDataSet", + "MemoryDataSet", + "LambdaDataSet", + "PartitionedDataSet", + "api.APIDataSet", + "biosequence.BioSequenceDataSet", + "dask.ParquetDataSet", + "email.EmailMessageDataSet", + "geopandas.GeoJSONDataSet", + "holoviews.HoloviewsWriter", + "json.JSONDataSet", + "matplotlib.MatplotlibWriter", + "networkx.NetworkXDataSet", + "pandas.CSVDataSet", + "pandas.ExcelDataSet", + "pandas.FeatherDataSet", + "pandas.GBQTableDataSet", + "pandas.HDFDataSet", + "pandas.JSONDataSet", + "pandas.ParquetDataSet", + "pandas.SQLTableDataSet", + "pandas.SQLQueryDataSet", + "pandas.XMLDataSet", + "pillow.ImageDataSet", + "pickle.PickleDataSet", + "plotly.PlotlyDataSet", + "spark.SparkDataSet", + "spark.SparkHiveDataSet", + "spark.SparkJDBCDataSet", + "tensorflow.TensorFlowModelDataset", + "text.TextDataSet", + "tracking.JSONDataSet", + "tracking.MetricsDataSet", + "yaml.YAMLDataSet" + ] + } + }, + "allOf": [ + { + "if": { "properties": { "type": { "const": "CachedDataSet" } } }, + "then": { + "required": ["dataset"], + "properties": { + "dataset": { + "pattern": ".*", + "description": "A Kedro DataSet object or a dictionary to cache." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "IncrementalDataSet" } } }, + "then": { + "required": ["path", "dataset"], + "properties": { + "path": { + "type": "string", + "description": "Path to the folder containing partitioned data.\nIf path starts with the protocol (e.g., ``s3://``) then the\ncorresponding ``fsspec`` concrete filesystem implementation will\nbe used. If protocol is not specified,\n``fsspec.implementations.local.LocalFileSystem`` will be used.\n**Note:** Some concrete implementations are bundled with ``fsspec``,\nwhile others (like ``s3`` or ``gcs``) must be installed separately\nprior to usage of the ``PartitionedDataSet``." + }, + "dataset": { + "pattern": ".*", + "description": "Underlying dataset definition. This is used to instantiate\nthe dataset for each file located inside the ``path``.\nAccepted formats are:\na) object of a class that inherits from ``AbstractDataSet``\nb) a string representing a fully qualified class name to such class\nc) a dictionary with ``type`` key pointing to a string from b),\nother keys are passed to the Dataset initializer.\nCredentials for the dataset can be explicitly specified in\nthis configuration." + }, + "checkpoint": { + "pattern": "object", + "description": "Optional checkpoint configuration. Accepts a dictionary\nwith the corresponding dataset definition including ``filepath``\n(unlike ``dataset`` argument). 
Checkpoint configuration is\ndescribed here:\nhttps://kedro.readthedocs.io/en/0.18.0/data/kedro_io.html#checkpoint-configuration\nCredentials for the checkpoint can be explicitly specified\nin this configuration." + }, + "filepath_arg": { + "type": "string", + "description": "Underlying dataset initializer argument that will\ncontain a path to each corresponding partition file.\nIf unspecified, defaults to \"filepath\"." + }, + "filename_suffix": { + "type": "string", + "description": "If specified, only partitions that end with this\nstring will be processed." + }, + "credentials": { + "type": ["object", "string"], + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.18.0/data/kedro_io.html#partitioned-dataset-credentials" + }, + "load_args": { + "type": "object", + "description": "Keyword arguments to be passed into ``find()`` method of\nthe filesystem implementation." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "MemoryDataSet" } } }, + "then": { + "required": [], + "properties": { + "data": { + "pattern": ".*", + "description": "Python object containing the data." + }, + "copy_mode": { + "type": "string", + "description": "The copy mode used to copy the data. Possible\nvalues are: \"deepcopy\", \"copy\" and \"assign\". If not\nprovided, it is inferred based on the data type." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "LambdaDataSet" } } }, + "then": { + "required": ["load", "save"], + "properties": { + "load": { + "pattern": ".*", + "description": "Method to load data from a data set." + }, + "save": { + "pattern": ".*", + "description": "Method to save data to a data set." + }, + "exists": { + "pattern": ".*", + "description": "Method to check whether output data already exists." + }, + "release": { + "pattern": ".*", + "description": "Method to release any cached information." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "PartitionedDataSet" } } }, + "then": { + "required": ["path", "dataset"], + "properties": { + "path": { + "type": "string", + "description": "Path to the folder containing partitioned data.\nIf path starts with the protocol (e.g., ``s3://``) then the\ncorresponding ``fsspec`` concrete filesystem implementation will\nbe used. If protocol is not specified,\n``fsspec.implementations.local.LocalFileSystem`` will be used.\n**Note:** Some concrete implementations are bundled with ``fsspec``,\nwhile others (like ``s3`` or ``gcs``) must be installed separately\nprior to usage of the ``PartitionedDataSet``." + }, + "dataset": { + "pattern": ".*", + "description": "Underlying dataset definition. 
This is used to instantiate\nthe dataset for each file located inside the ``path``.\nAccepted formats are:\na) object of a class that inherits from ``AbstractDataSet``\nb) a string representing a fully qualified class name to such class\nc) a dictionary with ``type`` key pointing to a string from b),\nother keys are passed to the Dataset initializer.\nCredentials for the dataset can be explicitly specified in\nthis configuration." + }, + "filepath_arg": { + "type": "string", + "description": "Underlying dataset initializer argument that will\ncontain a path to each corresponding partition file.\nIf unspecified, defaults to \"filepath\"." + }, + "filename_suffix": { + "type": "string", + "description": "If specified, only partitions that end with this\nstring will be processed." + }, + "credentials": { + "type": ["object", "string"], + "description": "Protocol-specific options that will be passed to\n``fsspec.filesystem``\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.filesystem\nand the dataset initializer. If the dataset config contains\nexplicit credentials spec, then such spec will take precedence.\nAll possible credentials management scenarios are documented here:\nhttps://kedro.readthedocs.io/en/0.18.0/data/kedro_io.html#partitioned-dataset-credentials" + }, + "load_args": { + "type": "object", + "description": "Keyword arguments to be passed into ``find()`` method of\nthe filesystem implementation." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "api.APIDataSet" } } }, + "then": { + "required": ["url"], + "properties": { + "url": { + "type": "string", + "description": "The API URL endpoint." + }, + "method": { + "type": "string", + "description": "The Method of the request, GET, POST, PUT, DELETE, HEAD, etc..." + }, + "data": { + "pattern": ".*", + "description": "The request payload, used for POST, PUT, etc requests\nhttps://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests" + }, + "params": { + "type": "object", + "description": "The url parameters of the API.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#passing-parameters-in-urls" + }, + "headers": { + "type": "object", + "description": "The HTTP headers.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#custom-headers" + }, + "auth": { + "pattern": ".*", + "description": "Anything ``requests`` accepts. Normally it's either ``('login', 'password')``,\nor ``AuthBase``, ``HTTPBasicAuth`` instance for more complex cases." + }, + "json": { + "pattern": ".*", + "description": "The request payload, used for POST, PUT, etc requests, passed in\nto the json kwarg in the requests object.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#more-complicated-post-requests" + }, + "timeout": { + "type": "integer", + "description": "The wait time in seconds for a response, defaults to 1 minute.\nhttps://requests.readthedocs.io/en/master/user/quickstart/#timeouts" + } + } + } + }, + { + "if": { + "properties": { + "type": { "const": "biosequence.BioSequenceDataSet" } + } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to sequence file prefixed with a protocol like\n`s3://`. 
If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``." + }, + "load_args": { + "type": "object", + "description": "Options for parsing sequence files by Biopython ``SeqIO.parse()``." + }, + "save_args": { + "type": "object", + "description": "file format supported by Biopython ``SeqIO.write()``.\nE.g. `{\"format\": \"fasta\"}`." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\n to pass to the filesystem's `open` method through nested keys\n `open_args_load` and `open_args_save`.\n Here you can find all available arguments for `open`:\n https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\n All defaults are preserved, except `mode`, which is set to `r` when loading\n and to `w` when saving.\n\nNote: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "dask.ParquetDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a parquet file\nparquet collection or the directory of a multipart parquet." + }, + "load_args": { + "type": "object", + "description": "Additional loading options `dask.dataframe.read_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.read_parquet" + }, + "save_args": { + "type": "object", + "description": "Additional saving options for `dask.dataframe.to_parquet`:\nhttps://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet" + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Optional parameters to the backend file system driver:\nhttps://docs.dask.org/en/latest/remote-data-services.html#optional-parameters" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "email.EmailMessageDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "``email`` options for parsing email messages (arguments passed\ninto ``email.parser.Parser.parse``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/email.parser.html#email.parser.Parser.parse\nIf you would like to specify options for the `Parser`,\nyou can include them under the \"parser\" key. Here you can\nfind all available arguments:\nhttps://docs.python.org/3/library/email.parser.html#email.parser.Parser\nAll defaults are preserved, but \"policy\", which is set to ``email.policy.default``." 
+ }, + "save_args": { + "type": "object", + "description": "``email`` options for generating MIME documents (arguments passed into\n``email.generator.Generator.flatten``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/email.generator.html#email.generator.Generator.flatten\nIf you would like to specify options for the `Generator`,\nyou can include them under the \"generator\" key. Here you can\nfind all available arguments:\nhttps://docs.python.org/3/library/email.generator.html#email.generator.Generator\nAll defaults are preserved." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "geopandas.GeoJSONDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a GeoJSON file prefixed with a protocol like\n`s3://`. If prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "GeoPandas options for loading GeoJSON files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference/geopandas.read_file.html" + }, + "save_args": { + "type": "object", + "description": "GeoPandas options for saving geojson files.\nHere you can find all available arguments:\nhttps://geopandas.org/reference.html#geopandas.GeoDataFrame.to_file\nThe default_save_arg driver is 'GeoJSON', all others preserved." + }, + "credentials": { + "type": ["object", "string"], + "description": "credentials required to access the underlying filesystem.\nEg. for ``GCFileSystem`` it would look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "holoviews.HoloviewsWriter" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." 
+ }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``S3FileSystem`` it should look like:\n`{'key': '', 'secret': ''}}`" + }, + "save_args": { + "type": "object", + "description": "Extra save args passed to `holoviews.save()`. See\nhttps://holoviews.org/reference_manual/holoviews.util.html#holoviews.util.save" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "json.JSONDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "matplotlib.MatplotlibWriter" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a matplot object file(s) prefixed with a protocol\nlike `s3://`. If prefix is not provided, `file` protocol (local filesystem) will be\nused. The prefix should be any protocol supported by ``fsspec``." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." 
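A small sketch of the `json.JSONDataSet` entry described above; the file path and the `indent` option are assumptions for illustration only.

```python
# Minimal sketch; only save_args is accepted for writing, mirroring the schema above.
from kedro.extras.datasets.json import JSONDataSet

metrics = JSONDataSet(
    filepath="data/08_reporting/metrics.json",  # assumed path
    save_args={"indent": 2},                    # forwarded to json.dump
)
# metrics.save({"accuracy": 0.92})   # would serialise the dict to JSON
# print(metrics.load())              # would read it back as a dict
```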
+ }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``S3FileSystem`` it should look like:\n`{'key': '', 'secret': ''}}`" + }, + "save_args": { + "type": "object", + "description": "Save args passed to `plt.savefig`. See\nhttps://matplotlib.org/api/_as_gen/matplotlib.pyplot.savefig.html" + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "networkx.NetworkXDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to the NetworkX graph JSON file." + }, + "load_args": { + "type": "object", + "description": "Arguments passed on to ```networkx.node_link_graph``.\nSee the details in\nhttps://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_graph.html" + }, + "save_args": { + "type": "object", + "description": "Arguments passed on to ```networkx.node_link_data``.\nSee the details in\nhttps://networkx.github.io/documentation/networkx-1.9.1/reference/generated/networkx.readwrite.json_graph.node_link_data.html" + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.CSVDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a CSV file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving CSV files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.ExcelDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Excel file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "engine": { + "type": "string", + "description": "The engine used to write to excel files. The default\nengine is 'xlsxwriter'." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html\nAll defaults are preserved, but \"engine\", which is set to \"xlrd\"." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving Excel files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_excel.html\nAll defaults are preserved, but \"index\", which is set to False.\nIf you would like to specify options for the `ExcelWriter`,\nyou can include them under the \"writer\" key. Here you can\nfind all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.ExcelWriter.html" + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.FeatherDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a feather file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading feather files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_feather.html\nAll defaults are preserved." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." 
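The `pandas.CSVDataSet` entry completed above is the most common case; the sketch below shows how its schema fields map onto constructor arguments. The path and the load/save options are assumptions, not values from this patch.

```python
# Minimal sketch; the path and options are illustrative only.
from kedro.extras.datasets.pandas import CSVDataSet

cars = CSVDataSet(
    filepath="data/01_raw/cars.csv",  # assumed path; any fsspec protocol such as s3:// also works
    load_args={"sep": ","},           # forwarded to pandas.read_csv
    save_args={"index": False},       # matches the documented default
)
# df = cars.load()    # would return a pandas.DataFrame
# cars.save(df)       # would call DataFrame.to_csv under the hood
```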
+ }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.GBQTableDataSet" } } + }, + "then": { + "required": ["dataset", "table_name"], + "properties": { + "dataset": { + "type": "string", + "description": "Google BigQuery dataset." + }, + "table_name": { + "type": "string", + "description": "Google BigQuery table name." + }, + "project": { + "type": "string", + "description": "Google BigQuery Account project ID.\nOptional when available from the environment.\nhttps://cloud.google.com/resource-manager/docs/creating-managing-projects" + }, + "credentials": { + "pattern": ".*", + "description": "Credentials for accessing Google APIs.\nEither ``google.auth.credentials.Credentials`` object or dictionary with\nparameters required to instantiate ``google.oauth2.credentials.Credentials``.\nHere you can find all the arguments:\nhttps://google-auth.readthedocs.io/en/latest/reference/google.oauth2.credentials.html" + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading BigQuery table into DataFrame.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_gbq.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving DataFrame to BigQuery table.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html\nAll defaults are preserved, but \"progress_bar\", which is set to False." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.HDFDataSet" } } }, + "then": { + "required": ["filepath", "key"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a hdf file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "key": { + "type": "string", + "description": "Identifier to the group in the HDF store." + }, + "load_args": { + "type": "object", + "description": "PyTables options for loading hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "PyTables options for saving hdf files.\nYou can find all available arguments at:\nhttps://www.pytables.org/usersguide/libref/top_level.html#tables.open_file\nAll defaults are preserved." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set `wb` when saving." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.JSONDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_json.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving JSON files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.ParquetDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Parquet file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nIt can also be a path to a directory. If the directory is\nprovided then it can be used for reading partitioned parquet files.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Additional options for loading Parquet file(s).\nHere you can find all available arguments when reading single file:\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_parquet.html\nHere you can find all available arguments when reading partitioned datasets:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html#pyarrow.parquet.ParquetDataset.read\nAll defaults are preserved." 
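For the `pandas.HDFDataSet` entry described above, note that both `filepath` and `key` are required; the values below are illustrative and the optional PyTables (`tables`) dependency must be installed.

```python
# Minimal sketch, assuming the optional `tables` dependency is available.
from kedro.extras.datasets.pandas import HDFDataSet

cars_hdf = HDFDataSet(
    filepath="data/02_intermediate/cars.h5",  # assumed path
    key="cars",                               # identifier of the group in the HDF store
)
# cars_hdf.save(df)      # would write the DataFrame under the 'cars' key
# df = cars_hdf.load()   # would read it back
```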
+ }, + "save_args": { + "type": "object", + "description": "Additional saving options for `pyarrow.parquet.write_table` and\n`pyarrow.Table.from_pandas`.\nHere you can find all available arguments for `write_table()`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html?highlight=write_table#pyarrow.parquet.write_table\nThe arguments for `from_pandas()` should be passed through a nested\nkey: `from_pandas`. E.g.: `save_args = {\"from_pandas\": {\"preserve_index\": False}}`\nHere you can find all available arguments for `from_pandas()`:\nhttps://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.from_pandas" + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.SQLTableDataSet" } } + }, + "then": { + "required": ["table_name", "credentials"], + "properties": { + "table_name": { + "type": "string", + "description": "The table name to load or save data to. It\noverwrites name in ``save_args`` and ``table_name``\nparameters in ``load_args``." + }, + "credentials": { + "type": ["object", "string"], + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_table``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying pandas ``to_sql`` function along\nwith the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls\nIt has ``index=False`` in the default parameters." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pandas.SQLQueryDataSet" } } + }, + "then": { + "required": ["sql", "credentials"], + "properties": { + "sql": { + "type": "string", + "description": "The sql query statement." + }, + "credentials": { + "type": ["object", "string"], + "description": "A dictionary with a ``SQLAlchemy`` connection string.\nUsers are supposed to provide the connection string 'con'\nthrough credentials. It overwrites `con` parameter in\n``load_args`` and ``save_args`` in case it is provided. 
To find\nall supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + }, + "load_args": { + "type": "object", + "description": "Provided to underlying pandas ``read_sql_query``\nfunction along with the connection string.\nTo find all supported arguments, see here:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html\nTo find all supported connection string formats, see here:\nhttps://docs.sqlalchemy.org/en/13/core/engines.html#database-urls" + } + } + } + }, + { + "if": { "properties": { "type": { "const": "pandas.XMLDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a XML file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "Pandas options for loading XML files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_xml.html\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "Pandas options for saving XML files.\nHere you can find all available arguments:\nhttps://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_xml.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pickle.PickleDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Pickle file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "backend": { + "type": "string", + "description": "Backend to use, must be one of ['pickle', 'joblib']. Defaults to 'pickle'." + }, + "load_args": { + "type": "object", + "description": "Pickle options for loading pickle files.\nHere you can find all available arguments for different backends:\npickle.load: https://docs.python.org/3/library/pickle.html#pickle.load\njoblib.load: https://joblib.readthedocs.io/en/latest/generated/joblib.load.html\nAll defaults are preserved." 
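Both SQL entries above require a connection string passed as `con` inside `credentials`; the SQLite URL, table name and query in the sketch below are illustrative assumptions.

```python
# Minimal sketch, assuming SQLAlchemy is installed; the sqlite URL is a placeholder.
from kedro.extras.datasets.pandas import SQLQueryDataSet, SQLTableDataSet

credentials = {"con": "sqlite:///kedro.db"}  # any SQLAlchemy database URL

cars_table = SQLTableDataSet(
    table_name="cars",
    credentials=credentials,
    save_args={"if_exists": "replace"},      # forwarded to DataFrame.to_sql
)
fast_cars = SQLQueryDataSet(
    sql="SELECT * FROM cars WHERE mpg > 30",  # illustrative query
    credentials=credentials,
)
# cars_table.save(df); df_fast = fast_cars.load()
```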
+ }, + "save_args": { + "type": "object", + "description": "Pickle options for saving pickle files.\nHere you can find all available arguments for different backends:\npickle.dump: https://docs.python.org/3/library/pickle.html#pickle.dump\njoblib.dump: https://joblib.readthedocs.io/en/latest/generated/joblib.dump.html\nAll defaults are preserved." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "pillow.ImageDataSet" } } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to an image file prefixed with a protocol like\n`s3://`. If prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "Pillow options for saving image files.\nHere you can find all available arguments:\nhttps://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.save\nAll defaults are preserved." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "plotly.PlotlyDataSet" } } + }, + "then": { + "required": ["filepath", "plotly_args"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a JSON file prefixed with a protocol like `s3://`.\nIf prefix is not provided `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "plotly_args": { + "type": "object", + "description": "Plotly configuration for generating a plotly graph object Figure\nrepresenting the plotted data." + }, + "load_args": { + "type": "object", + "description": "Plotly options for loading JSON files.\nHere you can find all available arguments:\nhttps://plotly.com/python-api-reference/generated/plotly.io.from_json.html#plotly.io.from_json\nAll defaults are preserved." 
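The `pickle.PickleDataSet` entry above exposes a `backend` switch; the sketch shows both documented backends, with paths assumed and joblib required only for the second.

```python
# Minimal sketch; the 'joblib' backend only works if joblib is installed.
from kedro.extras.datasets.pickle import PickleDataSet

model_pickle = PickleDataSet(filepath="data/06_models/model.pkl")  # default 'pickle' backend
model_joblib = PickleDataSet(
    filepath="data/06_models/model.joblib",
    backend="joblib",                                              # one of ['pickle', 'joblib']
)
# model_pickle.save(trained_model); trained_model = model_pickle.load()
```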
+ }, + "save_args": { + "type": "object", + "description": "Plotly options for saving JSON files.\nHere you can find all available arguments:\nhttps://plotly.com/python-api-reference/generated/plotly.io.write_json.html\nAll defaults are preserved, but \"index\", which is set to False." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested key `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `wb` when saving." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "spark.SparkDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a Spark dataframe. When using Databricks\nand working with data written to mount path points,\nspecify ``filepath``s for (versioned) ``SparkDataSet``s\nstarting with ``/dbfs/mnt``." + }, + "file_format": { + "type": "string", + "description": "File format used during load and save\noperations. These are formats supported by the running\nSparkContext include parquet, csv. For a list of supported\nformats please refer to Apache Spark documentation at\nhttps://spark.apache.org/docs/latest/sql-programming-guide.html" + }, + "load_args": { + "type": "object", + "description": "Load args passed to Spark DataFrameReader load method.\nIt is dependent on the selected file format. You can find\na list of read options for each supported format\nin Spark DataFrame read documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "save_args": { + "type": "object", + "description": "Save args passed to Spark DataFrame write options.\nSimilar to load_args this is dependent on the selected file\nformat. You can pass ``mode`` and ``partitionBy`` to specify\nyour overwrite mode and partitioning respectively. You can find\na list of options for each format in Spark DataFrame\nwrite documentation:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html" + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials to access the S3 bucket, such as\n``key``, ``secret``, if ``filepath`` prefix is ``s3a://`` or ``s3n://``.\nOptional keyword arguments passed to ``hdfs.client.InsecureClient``\nif ``filepath`` prefix is ``hdfs://``. Ignored otherwise." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "spark.SparkHiveDataSet" } } + }, + "then": { + "required": ["database", "table", "write_mode"], + "properties": { + "database": { + "type": "string", + "description": "The name of the hive database." + }, + "table": { + "type": "string", + "description": "The name of the table within the database." + }, + "write_mode": { + "type": "string", + "description": "``insert``, ``upsert`` or ``overwrite`` are supported." + }, + "table_pk": { + "type": "array", + "description": "If performing an upsert, this identifies the primary key columns used to\nresolve preexisting data. 
Is required for ``write_mode=\"upsert\"``." + } + } + } + }, + { + "if": { + "properties": { "type": { "const": "spark.SparkJDBCDataSet" } } + }, + "then": { + "required": ["url", "table"], + "properties": { + "url": { + "type": "string", + "description": "A JDBC URL of the form ``jdbc:subprotocol:subname``." + }, + "table": { + "type": "string", + "description": "The name of the table to load or save data to." + }, + "credentials": { + "type": ["object", "string"], + "description": "A dictionary of JDBC database connection arguments.\nNormally at least properties ``user`` and ``password`` with\ntheir corresponding values. It updates ``properties``\nparameter in ``load_args`` and ``save_args`` in case it is\nprovided." + }, + "load_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameReader.jdbc.html" + }, + "save_args": { + "type": "object", + "description": "Provided to underlying PySpark ``jdbc`` function along\nwith the JDBC URL and the name of the table. To find all\nsupported arguments, see here:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.jdbc.html" + } + } + } + }, + { + "if": { + "properties": { + "type": { "const": "tensorflow.TensorFlowModelDataset" } + } + }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a TensorFlow model directory prefixed with a\nprotocol like `s3://`. If prefix is not provided `file` protocol (local filesystem)\nwill be used. The prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "load_args": { + "type": "object", + "description": "TensorFlow options for loading models.\nHere you can find all available arguments:\nhttps://www.tensorflow.org/api_docs/python/tf/keras/models/load_model\nAll defaults are preserved." + }, + "save_args": { + "type": "object", + "description": "TensorFlow options for saving models.\nHere you can find all available arguments:\nhttps://www.tensorflow.org/api_docs/python/tf/keras/models/save_model\nAll defaults are preserved, except for \"save_format\", which is set to \"tf\"." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{'token': None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``)." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "text.TextDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." 
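For the `tensorflow.TensorFlowModelDataset` entry described above, the sketch below constructs the dataset; the model directory is an assumption and TensorFlow itself must be installed.

```python
# Minimal sketch, assuming TensorFlow is installed; the model directory is illustrative.
from kedro.extras.datasets.tensorflow import TensorFlowModelDataset

tf_model = TensorFlowModelDataset(
    filepath="data/06_models/tf_model",  # assumed model directory
    save_args={"save_format": "tf"},     # 'tf' is already the documented default
)
# tf_model.save(keras_model); keras_model = tf_model.load()
```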
+ }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "tracking.JSONDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "tracking.MetricsDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + }, + { + "if": { "properties": { "type": { "const": "yaml.YAMLDataSet" } } }, + "then": { + "required": ["filepath"], + "properties": { + "filepath": { + "type": "string", + "description": "Filepath in POSIX format to a YAML file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." + }, + "save_args": { + "type": "object", + "description": "PyYAML options for saving YAML files (arguments passed\ninto ```yaml.dump``). Here you can find all available arguments:\nhttps://pyyaml.org/wiki/PyYAMLDocumentation\nAll defaults are preserved, but \"default_flow_style\", which is set to False." + }, + "credentials": { + "type": ["object", "string"], + "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." + }, + "fs_args": { + "type": "object", + "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." + } + } + } + } + ] + } + } +} diff --git a/test_requirements.txt b/test_requirements.txt index 8c7337789a..4444341642 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -6,8 +6,8 @@ biopython~=1.73 black==21.5b1 blacken-docs==1.9.2 compress-pickle[lz4]~=1.2.0 -dask>=2021.10.0, <2022.01; python_version > '3.6' # not directly required, pinned by Snyk to avoid a vulnerability -dask[complete]~=2.6; python_version == '3.6' +coverage[toml] +dask[complete]~=2021.10 # pinned by Snyk to avoid a vulnerability delta-spark~=1.0 dill~=0.3.1 filelock>=3.4.0, <4.0 @@ -18,7 +18,7 @@ holoviews~=1.13.0 import-linter[toml]==1.2.6 ipython~=7.10 joblib>=0.14 -lxml~=4.6.3 +lxml~=4.6 matplotlib>=3.0.3, <3.4 # 3.4.0 breaks holoviews memory_profiler>=0.50.0, <1.0 moto==1.3.7 @@ -27,14 +27,15 @@ nbformat~=4.4 networkx~=2.4 openpyxl>=3.0.3, <4.0 pandas-gbq>=0.12.0, <1.0 -pandas>=0.24.0 # Needs to be at least 0.24.0 to make use of `pandas.DataFrame.to_numpy` (recommended alternative to `pandas.DataFrame.values`) +pandas~=1.3 # 1.3 for read_xml/to_xml Pillow~=8.0 plotly>=4.8.0, <6.0 pre-commit~=1.17 -psutil==5.6.7 +psutil==5.8.0 pyarrow>=1.0, <7.0 -pylint>=2.5.2, <3.0 -pyproj>=2.2.0, <3.0 # pyproj 3.0 requires proj>=7.2 but that conflicts with fiona which requires proj<7.2. 
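The schema closed above validates `catalog.yml` entries; at runtime the same structure is consumed as a plain dictionary, as in the sketch below. The dataset names, paths and the `dev_s3` credentials key are assumptions for illustration.

```python
# Minimal sketch of feeding schema-shaped entries to a DataCatalog; values are placeholders.
from kedro.io import DataCatalog

catalog = DataCatalog.from_config(
    catalog={
        "cars": {
            "type": "pandas.CSVDataSet",
            "filepath": "s3://my-bucket/01_raw/cars.csv",  # assumed bucket
            "save_args": {"index": False},
            "credentials": "dev_s3",                       # looked up in the credentials dict below
        },
        "model": {"type": "pickle.PickleDataSet", "filepath": "data/06_models/model.pkl"},
    },
    credentials={"dev_s3": {"key": "YOUR_KEY", "secret": "YOUR_SECRET"}},
)
# df = catalog.load("cars")   # would read the CSV, provided the file exists
```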
+pylint>=2.5.2, <2.11; python_version=='3.9' +pylint>=2.5.2, <3.0; python_version<'3.9' +pyproj~=3.0 pyspark>=2.2, <4.0 pytest-cov~=3.0 pytest-mock>=1.7.1, <2.0 @@ -46,8 +47,8 @@ s3fs>=0.3.0, <0.5 # Needs to be at least 0.3.0 to make use of `cachable` attrib SQLAlchemy~=1.2 tables~=3.6.0; platform_system == "Windows" tables~=3.6; platform_system != "Windows" -tensorflow~=2.0 +tensorflow>=2.0, <2.6; python_version=='3.9' +tensorflow>=2.0, <3.0; python_version<'3.9' trufflehog~=2.1 -wheel>=0.35, <0.37 -xlrd~=1.0 +wheel~=0.35 xlsxwriter~=1.0 diff --git a/tests/config/test_config.py b/tests/config/test_config.py index 9a220768e9..dc386af47e 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -9,6 +9,9 @@ from kedro.config import BadConfigException, ConfigLoader, MissingConfigException +_DEFAULT_RUN_ENV = "local" +_BASE_ENV = "base" + def _get_local_logging_config(): return { @@ -80,11 +83,11 @@ def local_config(tmp_path): @pytest.fixture def create_config_dir(tmp_path, base_config, local_config): - proj_catalog = tmp_path / "base" / "catalog.yml" - local_catalog = tmp_path / "local" / "catalog.yml" - local_logging = tmp_path / "local" / "logging.yml" - parameters = tmp_path / "base" / "parameters.json" - db_config_path = tmp_path / "base" / "db.ini" + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" + local_catalog = tmp_path / _DEFAULT_RUN_ENV / "catalog.yml" + local_logging = tmp_path / _DEFAULT_RUN_ENV / "logging.yml" + parameters = tmp_path / _BASE_ENV / "parameters.json" + db_config_path = tmp_path / _BASE_ENV / "db.ini" project_parameters = dict(param1=1, param2=2) _write_yaml(proj_catalog, base_config) @@ -94,20 +97,15 @@ def create_config_dir(tmp_path, base_config, local_config): _write_dummy_ini(db_config_path) -@pytest.fixture -def conf_paths(tmp_path): - return [str(tmp_path / "base"), str(tmp_path / "local")] - - @pytest.fixture def proj_catalog(tmp_path, base_config): - proj_catalog = tmp_path / "base" / "catalog.yml" + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" _write_yaml(proj_catalog, base_config) @pytest.fixture def proj_catalog_nested(tmp_path): - path = tmp_path / "base" / "catalog" / "dir" / "nested.yml" + path = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" _write_yaml(path, {"nested": {"type": "MemoryDataSet"}}) @@ -117,10 +115,10 @@ def proj_catalog_nested(tmp_path): class TestConfigLoader: @use_config_dir - def test_load_local_config(self, conf_paths): + def test_load_local_config(self, tmp_path): """Make sure that configs from `local/` override the ones from `base/`""" - conf = ConfigLoader(conf_paths) + conf = ConfigLoader(str(tmp_path), _DEFAULT_RUN_ENV) params = conf.get("parameters*") db_conf = conf.get("db*") catalog = conf.get("catalog*") @@ -134,48 +132,50 @@ def test_load_local_config(self, conf_paths): assert not catalog["cars"]["save_args"]["index"] @use_proj_catalog - def test_load_base_config(self, tmp_path, conf_paths, base_config): + def test_load_base_config(self, tmp_path, base_config): """Test config loading if `local/` directory is empty""" - (tmp_path / "local").mkdir(exist_ok=True) - catalog = ConfigLoader(conf_paths).get("catalog*.yml") + (tmp_path / _DEFAULT_RUN_ENV).mkdir(exist_ok=True) + catalog = ConfigLoader(str(tmp_path), _DEFAULT_RUN_ENV).get("catalog*.yml") assert catalog == base_config @use_proj_catalog - def test_duplicate_patterns(self, tmp_path, conf_paths, base_config): + def test_duplicate_patterns(self, tmp_path, base_config): """Test config loading if the glob patterns cover the same 
file""" - (tmp_path / "local").mkdir(exist_ok=True) - conf = ConfigLoader(conf_paths) + (tmp_path / _DEFAULT_RUN_ENV).mkdir(exist_ok=True) + conf = ConfigLoader(str(tmp_path), _DEFAULT_RUN_ENV) catalog1 = conf.get("catalog*.yml", "catalog*.yml") catalog2 = conf.get("catalog*.yml", "catalog.yml") assert catalog1 == catalog2 == base_config - def test_subdirs_dont_exist(self, tmp_path, conf_paths, base_config): + def test_subdirs_dont_exist(self, tmp_path, base_config): """Check the error when config paths don't exist""" pattern = ( r"Given configuration path either does not exist " r"or is not a valid directory\: {}" ) with pytest.raises(ValueError, match=pattern.format(".*base")): - ConfigLoader(conf_paths).get("catalog*") + ConfigLoader(str(tmp_path)).get("catalog*") with pytest.raises(ValueError, match=pattern.format(".*local")): - proj_catalog = tmp_path / "base" / "catalog.yml" + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" _write_yaml(proj_catalog, base_config) - ConfigLoader(conf_paths).get("catalog*") + ConfigLoader(str(tmp_path), _DEFAULT_RUN_ENV).get("catalog*") @pytest.mark.usefixtures("create_config_dir", "proj_catalog", "proj_catalog_nested") def test_nested(self, tmp_path): """Test loading the config from subdirectories""" - catalog = ConfigLoader(str(tmp_path / "base")).get("catalog*", "catalog*/**") + config_loader = ConfigLoader(str(tmp_path)) + config_loader.default_run_env = "" + catalog = config_loader.get("catalog*", "catalog*/**") assert catalog.keys() == {"cars", "trains", "nested"} assert catalog["cars"]["type"] == "pandas.CSVDataSet" assert catalog["cars"]["save_args"]["index"] is True assert catalog["nested"]["type"] == "MemoryDataSet" @use_config_dir - def test_nested_subdirs_duplicate(self, tmp_path, conf_paths, base_config): + def test_nested_subdirs_duplicate(self, tmp_path, base_config): """Check the error when the configs from subdirectories contain duplicate keys""" - nested = tmp_path / "base" / "catalog" / "dir" / "nested.yml" + nested = tmp_path / _BASE_ENV / "catalog" / "dir" / "nested.yml" _write_yaml(nested, base_config) pattern = ( @@ -183,19 +183,20 @@ def test_nested_subdirs_duplicate(self, tmp_path, conf_paths, base_config): r"and\:\n\- .*nested\.yml\: cars, trains" ) with pytest.raises(ValueError, match=pattern): - ConfigLoader(conf_paths).get("catalog*", "catalog*/**") + ConfigLoader(str(tmp_path)).get("catalog*", "catalog*/**") def test_ignore_hidden_keys(self, tmp_path): """Check that the config key starting with `_` are ignored and also don't cause a config merge error""" - _write_yaml(tmp_path / "base" / "catalog1.yml", {"k1": "v1", "_k2": "v2"}) - _write_yaml(tmp_path / "base" / "catalog2.yml", {"k3": "v3", "_k2": "v4"}) + _write_yaml(tmp_path / _BASE_ENV / "catalog1.yml", {"k1": "v1", "_k2": "v2"}) + _write_yaml(tmp_path / _BASE_ENV / "catalog2.yml", {"k3": "v3", "_k2": "v4"}) conf = ConfigLoader(str(tmp_path)) + conf.default_run_env = "" catalog = conf.get("**/catalog*") assert catalog.keys() == {"k1", "k3"} - _write_yaml(tmp_path / "base" / "catalog3.yml", {"k1": "dup", "_k2": "v5"}) + _write_yaml(tmp_path / _BASE_ENV / "catalog3.yml", {"k1": "dup", "_k2": "v5"}) pattern = ( r"^Duplicate keys found in .*catalog3\.yml and\:\n\- .*catalog1\.yml\: k1$" ) @@ -203,20 +204,20 @@ def test_ignore_hidden_keys(self, tmp_path): conf.get("**/catalog*") def test_bad_config_syntax(self, tmp_path): - conf_path = tmp_path / "test" + conf_path = tmp_path / _BASE_ENV conf_path.mkdir(parents=True, exist_ok=True) (conf_path / 
"catalog.yml").write_text("bad;config") pattern = f"Couldn't load config file: {conf_path / 'catalog.yml'}" with pytest.raises(BadConfigException, match=re.escape(pattern)): - ConfigLoader([str(conf_path)]).get("catalog*.yml") + ConfigLoader(str(tmp_path)).get("catalog*.yml") def test_lots_of_duplicates(self, tmp_path): """Check that the config key starting with `_` are ignored and also don't cause a config merge error""" data = {str(i): i for i in range(100)} - _write_yaml(tmp_path / "base" / "catalog1.yml", data) - _write_yaml(tmp_path / "base" / "catalog2.yml", data) + _write_yaml(tmp_path / _BASE_ENV / "catalog1.yml", data) + _write_yaml(tmp_path / _BASE_ENV / "catalog2.yml", data) conf = ConfigLoader(str(tmp_path)) pattern = r"^Duplicate keys found in .*catalog2\.yml and\:\n\- .*catalog1\.yml\: .*\.\.\.$" @@ -224,10 +225,10 @@ def test_lots_of_duplicates(self, tmp_path): conf.get("**/catalog*") @use_config_dir - def test_same_key_in_same_dir(self, tmp_path, conf_paths, base_config): + def test_same_key_in_same_dir(self, tmp_path, base_config): """Check the error if 2 files in the same config dir contain the same top-level key""" - dup_json = tmp_path / "base" / "catalog.json" + dup_json = tmp_path / _BASE_ENV / "catalog.json" _write_json(dup_json, base_config) pattern = ( @@ -235,31 +236,20 @@ def test_same_key_in_same_dir(self, tmp_path, conf_paths, base_config): r"and\:\n\- .*catalog\.json\: cars, trains" ) with pytest.raises(ValueError, match=pattern): - ConfigLoader(conf_paths).get("catalog*") - - def test_empty_conf_paths(self): - """Check the error if config paths were not specified or are empty""" - pattern = ( - r"`conf_paths` must contain at least one path to load " - r"configuration files from" - ) - with pytest.raises(ValueError, match=pattern): - ConfigLoader([]) - with pytest.raises(ValueError, match=pattern): - ConfigLoader("") + ConfigLoader(str(tmp_path)).get("catalog*") @use_config_dir - def test_empty_patterns(self, conf_paths): + def test_empty_patterns(self, tmp_path): """Check the error if no config patterns were specified""" pattern = ( r"`patterns` must contain at least one glob pattern " r"to match config filenames against" ) with pytest.raises(ValueError, match=pattern): - ConfigLoader(conf_paths).get() + ConfigLoader(str(tmp_path)).get() @use_config_dir - def test_no_files_found(self, conf_paths): + def test_no_files_found(self, tmp_path): """Check the error if no config files satisfy a given pattern""" pattern = ( r"No files found in " @@ -269,31 +259,27 @@ def test_no_files_found(self, conf_paths): r"\[\'non\-existent\-pattern\'\]" ) with pytest.raises(MissingConfigException, match=pattern): - ConfigLoader(conf_paths).get("non-existent-pattern") + ConfigLoader(str(tmp_path), _DEFAULT_RUN_ENV).get("non-existent-pattern") def test_duplicate_paths(self, tmp_path, caplog): """Check that trying to load the same environment config multiple times logs a warning and skips the reload""" - paths = [str(tmp_path / "base"), str(tmp_path / "base")] - _write_yaml(tmp_path / "base" / "catalog.yml", {"env": "base", "a": "a"}) + _write_yaml(tmp_path / _BASE_ENV / "catalog.yml", {"env": _BASE_ENV, "a": "a"}) + config_loader = ConfigLoader(str(tmp_path), _BASE_ENV) with pytest.warns(UserWarning, match="Duplicate environment detected"): - conf = ConfigLoader(paths) - assert conf.conf_paths == paths[:1] + config_paths = config_loader.conf_paths + assert config_paths == [str(tmp_path / _BASE_ENV)] - conf.get("catalog*", "catalog*/**") + config_loader.get("catalog*", 
"catalog*/**") log_messages = [record.getMessage() for record in caplog.records] assert not log_messages def test_overlapping_patterns(self, tmp_path, caplog): """Check that same configuration file is not loaded more than once.""" - paths = [ - str(tmp_path / "base"), - str(tmp_path / "dev"), - str(tmp_path / "dev" / "user1"), - ] _write_yaml( - tmp_path / "base" / "catalog0.yml", {"env": "base", "common": "common"} + tmp_path / _BASE_ENV / "catalog0.yml", + {"env": _BASE_ENV, "common": "common"}, ) _write_yaml( tmp_path / "dev" / "catalog1.yml", {"env": "dev", "dev_specific": "wiz"} @@ -301,13 +287,14 @@ def test_overlapping_patterns(self, tmp_path, caplog): _write_yaml(tmp_path / "dev" / "user1" / "catalog2.yml", {"user1_c2": True}) _write_yaml(tmp_path / "dev" / "user1" / "catalog3.yml", {"user1_c3": True}) - catalog = ConfigLoader(paths).get("catalog*", "catalog*/**", "user1/catalog2*") + catalog = ConfigLoader(str(tmp_path), "dev").get( + "catalog*", "catalog*/**", "user1/catalog2*", "../**/catalog2*" + ) expected_catalog = { "env": "dev", "common": "common", "dev_specific": "wiz", "user1_c2": True, - "user1_c3": True, } assert catalog == expected_catalog diff --git a/tests/config/test_templated_config.py b/tests/config/test_templated_config.py index 31d28fa41e..1b8cce25b4 100644 --- a/tests/config/test_templated_config.py +++ b/tests/config/test_templated_config.py @@ -7,6 +7,9 @@ from kedro.config import TemplatedConfigLoader from kedro.config.templated_config import _format_object +_DEFAULT_RUN_ENV = "local" +_BASE_ENV = "base" + def _write_yaml(filepath: Path, config: Dict): filepath.parent.mkdir(parents=True, exist_ok=True) @@ -14,11 +17,6 @@ def _write_yaml(filepath: Path, config: Dict): filepath.write_text(yaml_str) -@pytest.fixture -def conf_paths(tmp_path): - return [str(tmp_path / "base"), str(tmp_path / "local")] - - @pytest.fixture def param_config(): return { @@ -51,7 +49,7 @@ def template_config(): @pytest.fixture def catalog_with_jinja2_syntax(tmp_path): - filepath = tmp_path / "base" / "catalog.yml" + filepath = tmp_path / _BASE_ENV / "catalog.yml" catalog = """ {% for speed in ['fast', 'slow'] %} @@ -73,13 +71,13 @@ def catalog_with_jinja2_syntax(tmp_path): @pytest.fixture def proj_catalog_param(tmp_path, param_config): - proj_catalog = tmp_path / "base" / "catalog.yml" + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" _write_yaml(proj_catalog, param_config) @pytest.fixture def proj_catalog_globals(tmp_path, template_config): - global_yml = tmp_path / "base" / "globals.yml" + global_yml = tmp_path / _BASE_ENV / "globals.yml" _write_yaml(global_yml, template_config) @@ -98,7 +96,7 @@ def normal_config_advanced(): @pytest.fixture def proj_catalog_advanced(tmp_path, normal_config_advanced): - proj_catalog = tmp_path / "base" / "catalog.yml" + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" _write_yaml(proj_catalog, normal_config_advanced) @@ -128,7 +126,7 @@ def template_config_advanced(): @pytest.fixture def proj_catalog_param_w_vals_advanced(tmp_path, param_config_advanced): - proj_catalog = tmp_path / "base" / "catalog.yml" + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" _write_yaml(proj_catalog, param_config_advanced) @@ -155,7 +153,7 @@ def get_environ(): @pytest.fixture def proj_catalog_param_mixed(tmp_path, param_config_mixed): - proj_catalog = tmp_path / "base" / "catalog.yml" + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" _write_yaml(proj_catalog, param_config_mixed) @@ -177,7 +175,7 @@ def param_config_namespaced(): @pytest.fixture def 
proj_catalog_param_namespaced(tmp_path, param_config_namespaced): - proj_catalog = tmp_path / "base" / "catalog.yml" + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" _write_yaml(proj_catalog, param_config_namespaced) @@ -193,7 +191,7 @@ def template_config_exceptional(): @pytest.fixture def proj_catalog_param_w_vals_exceptional(tmp_path, param_config_exceptional): - proj_catalog = tmp_path / "base" / "catalog.yml" + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" _write_yaml(proj_catalog, param_config_exceptional) @@ -204,20 +202,19 @@ def param_config_with_default(): @pytest.fixture def proj_catalog_param_with_default(tmp_path, param_config_with_default): - proj_catalog = tmp_path / "base" / "catalog.yml" + proj_catalog = tmp_path / _BASE_ENV / "catalog.yml" _write_yaml(proj_catalog, param_config_with_default) class TestTemplatedConfigLoader: @pytest.mark.usefixtures("proj_catalog_param") - def test_catalog_parameterized_w_dict(self, tmp_path, conf_paths, template_config): + def test_catalog_parameterized_w_dict(self, tmp_path, template_config): """Test parameterized config with input from dictionary with values""" - (tmp_path / "local").mkdir(exist_ok=True) - - catalog = TemplatedConfigLoader(conf_paths, globals_dict=template_config).get( - "catalog*.yml" + config_loader = TemplatedConfigLoader( + str(tmp_path), globals_dict=template_config ) - + config_loader.default_run_env = "" + catalog = config_loader.get("catalog*.yml") assert catalog["boats"]["type"] == "SparkDataSet" assert ( catalog["boats"]["filepath"] == "s3a://boat-and-car-bucket/01_raw/boats.csv" @@ -228,13 +225,13 @@ def test_catalog_parameterized_w_dict(self, tmp_path, conf_paths, template_confi assert catalog["boats"]["users"] == ["fred", "ron"] @pytest.mark.usefixtures("proj_catalog_param", "proj_catalog_globals") - def test_catalog_parameterized_w_globals(self, tmp_path, conf_paths): + def test_catalog_parameterized_w_globals(self, tmp_path): """Test parameterized config with globals yaml file""" - (tmp_path / "local").mkdir(exist_ok=True) - - catalog = TemplatedConfigLoader(conf_paths, globals_pattern="*globals.yml").get( - "catalog*.yml" - ) + proj_catalog = tmp_path / _DEFAULT_RUN_ENV / "catalog.yml" + _write_yaml(proj_catalog, {}) + catalog = TemplatedConfigLoader( + str(tmp_path), globals_pattern="*globals.yml" + ).get("catalog*.yml") assert catalog["boats"]["type"] == "SparkDataSet" assert ( @@ -246,33 +243,31 @@ def test_catalog_parameterized_w_globals(self, tmp_path, conf_paths): assert catalog["boats"]["users"] == ["fred", "ron"] @pytest.mark.usefixtures("proj_catalog_param") - def test_catalog_parameterized_no_params_no_default(self, tmp_path, conf_paths): + def test_catalog_parameterized_no_params_no_default(self, tmp_path): """Test parameterized config without input""" - (tmp_path / "local").mkdir(exist_ok=True) - with pytest.raises(ValueError, match="Failed to format pattern"): - TemplatedConfigLoader(conf_paths).get("catalog*.yml") + config_loader = TemplatedConfigLoader(str(tmp_path)) + config_loader.default_run_env = "" + config_loader.get("catalog*.yml") @pytest.mark.usefixtures("proj_catalog_param_with_default") - def test_catalog_parameterized_empty_params_with_default( - self, tmp_path, conf_paths - ): + def test_catalog_parameterized_empty_params_with_default(self, tmp_path): """Test parameterized config with empty globals dictionary""" - (tmp_path / "local").mkdir(exist_ok=True) - - catalog = TemplatedConfigLoader(conf_paths, globals_dict={}).get("catalog*.yml") + config_loader = 
TemplatedConfigLoader(str(tmp_path), globals_dict={}) + config_loader.default_run_env = "" + catalog = config_loader.get("catalog*.yml") assert catalog["boats"]["users"] == ["fred", "ron"] @pytest.mark.usefixtures("proj_catalog_advanced") - def test_catalog_advanced(self, tmp_path, conf_paths, normal_config_advanced): + def test_catalog_advanced(self, tmp_path, normal_config_advanced): """Test whether it responds well to advanced yaml values (i.e. nested dicts, booleans, lists, etc.)""" - (tmp_path / "local").mkdir(exist_ok=True) - - catalog = TemplatedConfigLoader( - conf_paths, globals_dict=normal_config_advanced - ).get("catalog*.yml") + config_loader = TemplatedConfigLoader( + str(tmp_path), globals_dict=normal_config_advanced + ) + config_loader.default_run_env = "" + catalog = config_loader.get("catalog*.yml") assert catalog["planes"]["type"] == "SparkJDBCDataSet" assert catalog["planes"]["postgres_credentials"]["user"] == "Fakeuser" @@ -282,15 +277,13 @@ def test_catalog_advanced(self, tmp_path, conf_paths, normal_config_advanced): assert catalog["planes"]["secret_tables"] == ["models", "pilots", "engines"] @pytest.mark.usefixtures("proj_catalog_param_w_vals_advanced") - def test_catalog_parameterized_advanced( - self, tmp_path, conf_paths, template_config_advanced - ): + def test_catalog_parameterized_advanced(self, tmp_path, template_config_advanced): """Test advanced templating (i.e. nested dicts, booleans, lists, etc.)""" - (tmp_path / "local").mkdir(exist_ok=True) - - catalog = TemplatedConfigLoader( - conf_paths, globals_dict=template_config_advanced - ).get("catalog*.yml") + config_loader = TemplatedConfigLoader( + str(tmp_path), globals_dict=template_config_advanced + ) + config_loader.default_run_env = "" + catalog = config_loader.get("catalog*.yml") assert catalog["planes"]["type"] == "SparkJDBCDataSet" assert catalog["planes"]["postgres_credentials"]["user"] == "Fakeuser" @@ -300,15 +293,13 @@ def test_catalog_parameterized_advanced( assert catalog["planes"]["secret_tables"] == ["models", "pilots", "engines"] @pytest.mark.usefixtures("proj_catalog_param_mixed", "proj_catalog_globals") - def test_catalog_parameterized_w_dict_mixed( - self, tmp_path, conf_paths, get_environ - ): + def test_catalog_parameterized_w_dict_mixed(self, tmp_path, get_environ): """Test parameterized config with input from dictionary with values and globals.yml""" - (tmp_path / "local").mkdir(exist_ok=True) - + proj_catalog = tmp_path / _DEFAULT_RUN_ENV / "catalog.yml" + _write_yaml(proj_catalog, {}) catalog = TemplatedConfigLoader( - conf_paths, globals_pattern="*globals.yml", globals_dict=get_environ + str(tmp_path), globals_pattern="*globals.yml", globals_dict=get_environ ).get("catalog*.yml") assert catalog["boats"]["type"] == "SparkDataSet" @@ -322,14 +313,14 @@ def test_catalog_parameterized_w_dict_mixed( @pytest.mark.usefixtures("proj_catalog_param_namespaced") def test_catalog_parameterized_w_dict_namespaced( - self, tmp_path, conf_paths, template_config, get_environ + self, tmp_path, template_config, get_environ ): """Test parameterized config with namespacing in the template values""" - (tmp_path / "local").mkdir(exist_ok=True) - - catalog = TemplatedConfigLoader( - conf_paths, globals_dict={"global": template_config, "env": get_environ} - ).get("catalog*.yml") + config_loader = TemplatedConfigLoader( + str(tmp_path), globals_dict={"global": template_config, "env": get_environ} + ) + config_loader.default_run_env = "" + catalog = config_loader.get("catalog*.yml") assert 
catalog["boats"]["type"] == "SparkDataSet" assert ( @@ -342,23 +333,24 @@ def test_catalog_parameterized_w_dict_namespaced( @pytest.mark.usefixtures("proj_catalog_param_w_vals_exceptional") def test_catalog_parameterized_exceptional( - self, tmp_path, conf_paths, template_config_exceptional + self, tmp_path, template_config_exceptional ): """Test templating with mixed type replacement values going into one string""" - (tmp_path / "local").mkdir(exist_ok=True) - - catalog = TemplatedConfigLoader( - conf_paths, globals_dict=template_config_exceptional - ).get("catalog*.yml") + config_loader = TemplatedConfigLoader( + str(tmp_path), globals_dict=template_config_exceptional + ) + config_loader.default_run_env = "" + catalog = config_loader.get("catalog*.yml") assert catalog["postcode"] == "NW10 2JK" @pytest.mark.usefixtures("catalog_with_jinja2_syntax") - def test_catalog_with_jinja2_syntax(self, tmp_path, conf_paths, template_config): - (tmp_path / "local").mkdir(exist_ok=True) - catalog = TemplatedConfigLoader(conf_paths, globals_dict=template_config).get( - "catalog*.yml" + def test_catalog_with_jinja2_syntax(self, tmp_path, template_config): + config_loader = TemplatedConfigLoader( + str(tmp_path), globals_dict=template_config ) + config_loader.default_run_env = "" + catalog = config_loader.get("catalog*.yml") expected_catalog = { "fast-trains": {"type": "MemoryDataSet"}, "fast-cars": { diff --git a/tests/extras/datasets/networkx/test_gml_dataset.py b/tests/extras/datasets/networkx/test_gml_dataset.py new file mode 100644 index 0000000000..a35fe90ab8 --- /dev/null +++ b/tests/extras/datasets/networkx/test_gml_dataset.py @@ -0,0 +1,216 @@ +# Copyright 2021 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pathlib import Path, PurePosixPath + +import networkx +import pytest +from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from s3fs.core import S3FileSystem + +from kedro.extras.datasets.networkx import GMLDataSet +from kedro.io import DataSetError, Version +from kedro.io.core import PROTOCOL_DELIMITER + +ATTRS = { + "source": "from", + "target": "to", + "name": "fake_id", + "key": "fake_key", + "link": "fake_link", +} + + +@pytest.fixture +def filepath_gml(tmp_path): + return (tmp_path / "some_dir" / "test.gml").as_posix() + + +@pytest.fixture +def gml_data_set(filepath_gml): + return GMLDataSet( + filepath=filepath_gml, + load_args={"destringizer": int}, + save_args={"stringizer": str}, + ) + + +@pytest.fixture +def versioned_gml_data_set(filepath_gml, load_version, save_version): + return GMLDataSet( + filepath=filepath_gml, + version=Version(load_version, save_version), + load_args={"destringizer": int}, + save_args={"stringizer": str}, + ) + + +@pytest.fixture() +def dummy_graph_data(): + return networkx.complete_graph(3) + + +class TestGMLDataSet: + def test_save_and_load(self, gml_data_set, dummy_graph_data): + """Test saving and reloading the data set.""" + gml_data_set.save(dummy_graph_data) + reloaded = gml_data_set.load() + assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) + assert gml_data_set._fs_open_args_load == {"mode": "rb"} + assert gml_data_set._fs_open_args_save == {"mode": "wb"} + + def test_load_missing_file(self, gml_data_set): + """Check the error when trying to load missing file.""" + pattern = r"Failed while loading data from data set GMLDataSet\(.*\)" + with pytest.raises(DataSetError, match=pattern): + assert gml_data_set.load() + + def test_exists(self, gml_data_set, dummy_graph_data): + """Test `exists` method invocation.""" + assert not gml_data_set.exists() + gml_data_set.save(dummy_graph_data) + assert gml_data_set.exists() + + @pytest.mark.parametrize( + "filepath,instance_type", + [ + ("s3://bucket/file.gml", S3FileSystem), + ("file:///tmp/test.gml", LocalFileSystem), + ("/tmp/test.gml", LocalFileSystem), + ("gcs://bucket/file.gml", GCSFileSystem), + ("https://example.com/file.gml", HTTPFileSystem), + ], + ) + def test_protocol_usage(self, filepath, instance_type): + data_set = GMLDataSet(filepath=filepath) + assert isinstance(data_set._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(data_set._filepath) == path + assert isinstance(data_set._filepath, PurePosixPath) + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test.gml" + data_set = GMLDataSet(filepath=filepath) + data_set.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + + +class TestGMLDataSetVersioned: + def test_save_and_load(self, versioned_gml_data_set, dummy_graph_data): + """Test that saved and reloaded data matches the original one for + the versioned data set.""" + versioned_gml_data_set.save(dummy_graph_data) + reloaded = versioned_gml_data_set.load() + assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) + assert versioned_gml_data_set._fs_open_args_load == {"mode": "rb"} + assert versioned_gml_data_set._fs_open_args_save == {"mode": "wb"} + + def test_no_versions(self, versioned_gml_data_set): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any versions for GMLDataSet\(.+\)" + with 
pytest.raises(DataSetError, match=pattern): + versioned_gml_data_set.load() + + def test_exists(self, versioned_gml_data_set, dummy_graph_data): + """Test `exists` method invocation for versioned data set.""" + assert not versioned_gml_data_set.exists() + versioned_gml_data_set.save(dummy_graph_data) + assert versioned_gml_data_set.exists() + + def test_prevent_override(self, versioned_gml_data_set, dummy_graph_data): + """Check the error when attempt to override the same data set + version.""" + versioned_gml_data_set.save(dummy_graph_data) + pattern = ( + r"Save path \`.+\` for GMLDataSet\(.+\) must not " + r"exist if versioning is enabled" + ) + with pytest.raises(DataSetError, match=pattern): + versioned_gml_data_set.save(dummy_graph_data) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, versioned_gml_data_set, load_version, save_version, dummy_graph_data + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + fr"Save version `{save_version}` did not match " + fr"load version `{load_version}` for GMLDataSet\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + versioned_gml_data_set.save(dummy_graph_data) + + def test_version_str_repr(self, load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = "test.gml" + ds = GMLDataSet(filepath=filepath) + ds_versioned = GMLDataSet( + filepath=filepath, version=Version(load_version, save_version) + ) + assert filepath in str(ds) + assert "version" not in str(ds) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "GMLDataSet" in str(ds_versioned) + assert "GMLDataSet" in str(ds) + assert "protocol" in str(ds_versioned) + assert "protocol" in str(ds) + + def test_versioning_existing_dataset( + self, gml_data_set, versioned_gml_data_set, dummy_graph_data + ): + """Check the error when attempting to save a versioned dataset on top of an + already existing (non-versioned) dataset.""" + gml_data_set.save(dummy_graph_data) + assert gml_data_set.exists() + assert gml_data_set._filepath == versioned_gml_data_set._filepath + pattern = ( + f"(?=.*file with the same name already exists in the directory)" + f"(?=.*{versioned_gml_data_set._filepath.parent.as_posix()})" + ) + with pytest.raises(DataSetError, match=pattern): + versioned_gml_data_set.save(dummy_graph_data) + + # Remove non-versioned dataset and try again + Path(gml_data_set._filepath.as_posix()).unlink() + versioned_gml_data_set.save(dummy_graph_data) + assert versioned_gml_data_set.exists() diff --git a/tests/extras/datasets/networkx/test_graphml_dataset.py b/tests/extras/datasets/networkx/test_graphml_dataset.py new file mode 100644 index 0000000000..7c72f46395 --- /dev/null +++ b/tests/extras/datasets/networkx/test_graphml_dataset.py @@ -0,0 +1,216 @@ +# Copyright 2021 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path, PurePosixPath + +import networkx +import pytest +from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from s3fs.core import S3FileSystem + +from kedro.extras.datasets.networkx import GraphMLDataSet +from kedro.io import DataSetError, Version +from kedro.io.core import PROTOCOL_DELIMITER + +ATTRS = { + "source": "from", + "target": "to", + "name": "fake_id", + "key": "fake_key", + "link": "fake_link", +} + + +@pytest.fixture +def filepath_graphml(tmp_path): + return (tmp_path / "some_dir" / "test.graphml").as_posix() + + +@pytest.fixture +def graphml_data_set(filepath_graphml): + return GraphMLDataSet( + filepath=filepath_graphml, + load_args={"node_type": int}, + save_args={}, + ) + + +@pytest.fixture +def versioned_graphml_data_set(filepath_graphml, load_version, save_version): + return GraphMLDataSet( + filepath=filepath_graphml, + version=Version(load_version, save_version), + load_args={"node_type": int}, + save_args={}, + ) + + +@pytest.fixture() +def dummy_graph_data(): + return networkx.complete_graph(3) + + +class TestGraphMLDataSet: + def test_save_and_load(self, graphml_data_set, dummy_graph_data): + """Test saving and reloading the data set.""" + graphml_data_set.save(dummy_graph_data) + reloaded = graphml_data_set.load() + assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) + assert graphml_data_set._fs_open_args_load == {"mode": "rb"} + assert graphml_data_set._fs_open_args_save == {"mode": "wb"} + + def test_load_missing_file(self, graphml_data_set): + """Check the error when trying to load missing file.""" + pattern = r"Failed while loading data from data set GraphMLDataSet\(.*\)" + with pytest.raises(DataSetError, match=pattern): + assert graphml_data_set.load() + + def test_exists(self, graphml_data_set, dummy_graph_data): + """Test `exists` method invocation.""" + assert not graphml_data_set.exists() + graphml_data_set.save(dummy_graph_data) + assert graphml_data_set.exists() + + @pytest.mark.parametrize( + "filepath,instance_type", + [ + ("s3://bucket/file.graphml", S3FileSystem), + ("file:///tmp/test.graphml", LocalFileSystem), + ("/tmp/test.graphml", LocalFileSystem), + ("gcs://bucket/file.graphml", 
GCSFileSystem), + ("https://example.com/file.graphml", HTTPFileSystem), + ], + ) + def test_protocol_usage(self, filepath, instance_type): + data_set = GraphMLDataSet(filepath=filepath) + assert isinstance(data_set._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(data_set._filepath) == path + assert isinstance(data_set._filepath, PurePosixPath) + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test.graphml" + data_set = GraphMLDataSet(filepath=filepath) + data_set.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + + +class TestGraphMLDataSetVersioned: + def test_save_and_load(self, versioned_graphml_data_set, dummy_graph_data): + """Test that saved and reloaded data matches the original one for + the versioned data set.""" + versioned_graphml_data_set.save(dummy_graph_data) + reloaded = versioned_graphml_data_set.load() + assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) + assert versioned_graphml_data_set._fs_open_args_load == {"mode": "rb"} + assert versioned_graphml_data_set._fs_open_args_save == {"mode": "wb"} + + def test_no_versions(self, versioned_graphml_data_set): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any versions for GraphMLDataSet\(.+\)" + with pytest.raises(DataSetError, match=pattern): + versioned_graphml_data_set.load() + + def test_exists(self, versioned_graphml_data_set, dummy_graph_data): + """Test `exists` method invocation for versioned data set.""" + assert not versioned_graphml_data_set.exists() + versioned_graphml_data_set.save(dummy_graph_data) + assert versioned_graphml_data_set.exists() + + def test_prevent_override(self, versioned_graphml_data_set, dummy_graph_data): + """Check the error when attempt to override the same data set + version.""" + versioned_graphml_data_set.save(dummy_graph_data) + pattern = ( + r"Save path \`.+\` for GraphMLDataSet\(.+\) must not " + r"exist if versioning is enabled" + ) + with pytest.raises(DataSetError, match=pattern): + versioned_graphml_data_set.save(dummy_graph_data) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, versioned_graphml_data_set, load_version, save_version, dummy_graph_data + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + fr"Save version `{save_version}` did not match " + fr"load version `{load_version}` for GraphMLDataSet\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + versioned_graphml_data_set.save(dummy_graph_data) + + def test_version_str_repr(self, load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = "test.graphml" + ds = GraphMLDataSet(filepath=filepath) + ds_versioned = GraphMLDataSet( + filepath=filepath, version=Version(load_version, save_version) + ) + assert filepath in str(ds) + assert "version" not in str(ds) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "GraphMLDataSet" in str(ds_versioned) + assert "GraphMLDataSet" in str(ds) + assert "protocol" in str(ds_versioned) + assert "protocol" in str(ds) + + def test_versioning_existing_dataset( + self, 
graphml_data_set, versioned_graphml_data_set, dummy_graph_data + ): + """Check the error when attempting to save a versioned dataset on top of an + already existing (non-versioned) dataset.""" + graphml_data_set.save(dummy_graph_data) + assert graphml_data_set.exists() + assert graphml_data_set._filepath == versioned_graphml_data_set._filepath + pattern = ( + f"(?=.*file with the same name already exists in the directory)" + f"(?=.*{versioned_graphml_data_set._filepath.parent.as_posix()})" + ) + with pytest.raises(DataSetError, match=pattern): + versioned_graphml_data_set.save(dummy_graph_data) + + # Remove non-versioned dataset and try again + Path(graphml_data_set._filepath.as_posix()).unlink() + versioned_graphml_data_set.save(dummy_graph_data) + assert versioned_graphml_data_set.exists() diff --git a/tests/extras/datasets/networkx/test_networkx_dataset.py b/tests/extras/datasets/networkx/test_json_dataset.py similarity index 60% rename from tests/extras/datasets/networkx/test_networkx_dataset.py rename to tests/extras/datasets/networkx/test_json_dataset.py index 433ebe89fd..c89d7caaba 100644 --- a/tests/extras/datasets/networkx/test_networkx_dataset.py +++ b/tests/extras/datasets/networkx/test_json_dataset.py @@ -7,7 +7,7 @@ from gcsfs import GCSFileSystem from s3fs.core import S3FileSystem -from kedro.extras.datasets.networkx import NetworkXDataSet +from kedro.extras.datasets.networkx import JSONDataSet from kedro.io import DataSetError, Version from kedro.io.core import PROTOCOL_DELIMITER @@ -26,20 +26,20 @@ def filepath_json(tmp_path): @pytest.fixture -def networkx_data_set(filepath_json, fs_args): - return NetworkXDataSet(filepath=filepath_json, fs_args=fs_args) +def json_data_set(filepath_json, fs_args): + return JSONDataSet(filepath=filepath_json, fs_args=fs_args) @pytest.fixture -def versioned_networkx_data_set(filepath_json, load_version, save_version): - return NetworkXDataSet( +def versioned_json_data_set(filepath_json, load_version, save_version): + return JSONDataSet( filepath=filepath_json, version=Version(load_version, save_version) ) @pytest.fixture -def networkx_data_set_args(filepath_json): - return NetworkXDataSet( +def json_data_set_args(filepath_json): + return JSONDataSet( filepath=filepath_json, load_args={"attrs": ATTRS}, save_args={"attrs": ATTRS} ) @@ -49,29 +49,27 @@ def dummy_graph_data(): return networkx.complete_graph(3) -class TestNetworkXDataSet: - def test_save_and_load(self, networkx_data_set, dummy_graph_data): +class TestJSONDataSet: + def test_save_and_load(self, json_data_set, dummy_graph_data): """Test saving and reloading the data set.""" - networkx_data_set.save(dummy_graph_data) - reloaded = networkx_data_set.load() + json_data_set.save(dummy_graph_data) + reloaded = json_data_set.load() assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) - assert networkx_data_set._fs_open_args_load == {} - assert networkx_data_set._fs_open_args_save == {"mode": "w"} + assert json_data_set._fs_open_args_load == {} + assert json_data_set._fs_open_args_save == {"mode": "w"} - def test_load_missing_file(self, networkx_data_set): + def test_load_missing_file(self, json_data_set): """Check the error when trying to load missing file.""" - pattern = r"Failed while loading data from data set NetworkXDataSet\(.*\)" + pattern = r"Failed while loading data from data set JSONDataSet\(.*\)" with pytest.raises(DataSetError, match=pattern): - assert networkx_data_set.load() + assert json_data_set.load() - def test_load_args_save_args( - self, mocker, 
networkx_data_set_args, dummy_graph_data - ): + def test_load_args_save_args(self, mocker, json_data_set_args, dummy_graph_data): """Test saving and reloading with save and load arguments.""" patched_save = mocker.patch( "networkx.node_link_data", wraps=networkx.node_link_data ) - networkx_data_set_args.save(dummy_graph_data) + json_data_set_args.save(dummy_graph_data) patched_save.assert_called_once_with(dummy_graph_data, attrs=ATTRS) patched_load = mocker.patch( @@ -79,7 +77,7 @@ def test_load_args_save_args( ) # load args need to be the same attrs as the ones used for saving # in order to successfully retrieve data - reloaded = networkx_data_set_args.load() + reloaded = json_data_set_args.load() patched_load.assert_called_once_with( { @@ -102,17 +100,15 @@ def test_load_args_save_args( [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], indirect=True, ) - def test_open_extra_args(self, networkx_data_set, fs_args): - assert networkx_data_set._fs_open_args_load == fs_args["open_args_load"] - assert networkx_data_set._fs_open_args_save == { - "mode": "w" - } # default unchanged + def test_open_extra_args(self, json_data_set, fs_args): + assert json_data_set._fs_open_args_load == fs_args["open_args_load"] + assert json_data_set._fs_open_args_save == {"mode": "w"} # default unchanged - def test_exists(self, networkx_data_set, dummy_graph_data): + def test_exists(self, json_data_set, dummy_graph_data): """Test `exists` method invocation.""" - assert not networkx_data_set.exists() - networkx_data_set.save(dummy_graph_data) - assert networkx_data_set.exists() + assert not json_data_set.exists() + json_data_set.save(dummy_graph_data) + assert json_data_set.exists() @pytest.mark.parametrize( "filepath,instance_type", @@ -125,7 +121,7 @@ def test_exists(self, networkx_data_set, dummy_graph_data): ], ) def test_protocol_usage(self, filepath, instance_type): - data_set = NetworkXDataSet(filepath=filepath) + data_set = JSONDataSet(filepath=filepath) assert isinstance(data_set._fs, instance_type) path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] @@ -136,41 +132,41 @@ def test_protocol_usage(self, filepath, instance_type): def test_catalog_release(self, mocker): fs_mock = mocker.patch("fsspec.filesystem").return_value filepath = "test.json" - data_set = NetworkXDataSet(filepath=filepath) + data_set = JSONDataSet(filepath=filepath) data_set.release() fs_mock.invalidate_cache.assert_called_once_with(filepath) -class TestNetworkXDataSetVersioned: - def test_save_and_load(self, versioned_networkx_data_set, dummy_graph_data): +class TestJSONDataSetVersioned: + def test_save_and_load(self, versioned_json_data_set, dummy_graph_data): """Test that saved and reloaded data matches the original one for the versioned data set.""" - versioned_networkx_data_set.save(dummy_graph_data) - reloaded = versioned_networkx_data_set.load() + versioned_json_data_set.save(dummy_graph_data) + reloaded = versioned_json_data_set.load() assert dummy_graph_data.nodes(data=True) == reloaded.nodes(data=True) - def test_no_versions(self, versioned_networkx_data_set): + def test_no_versions(self, versioned_json_data_set): """Check the error if no versions are available for load.""" - pattern = r"Did not find any versions for NetworkXDataSet\(.+\)" + pattern = r"Did not find any versions for JSONDataSet\(.+\)" with pytest.raises(DataSetError, match=pattern): - versioned_networkx_data_set.load() + versioned_json_data_set.load() - def test_exists(self, versioned_networkx_data_set, dummy_graph_data): + def test_exists(self, 
versioned_json_data_set, dummy_graph_data): """Test `exists` method invocation for versioned data set.""" - assert not versioned_networkx_data_set.exists() - versioned_networkx_data_set.save(dummy_graph_data) - assert versioned_networkx_data_set.exists() + assert not versioned_json_data_set.exists() + versioned_json_data_set.save(dummy_graph_data) + assert versioned_json_data_set.exists() - def test_prevent_override(self, versioned_networkx_data_set, dummy_graph_data): + def test_prevent_override(self, versioned_json_data_set, dummy_graph_data): """Check the error when attempt to override the same data set version.""" - versioned_networkx_data_set.save(dummy_graph_data) + versioned_json_data_set.save(dummy_graph_data) pattern = ( - r"Save path \`.+\` for NetworkXDataSet\(.+\) must not " + r"Save path \`.+\` for JSONDataSet\(.+\) must not " r"exist if versioning is enabled" ) with pytest.raises(DataSetError, match=pattern): - versioned_networkx_data_set.save(dummy_graph_data) + versioned_json_data_set.save(dummy_graph_data) @pytest.mark.parametrize( "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True @@ -179,23 +175,23 @@ def test_prevent_override(self, versioned_networkx_data_set, dummy_graph_data): "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True ) def test_save_version_warning( - self, versioned_networkx_data_set, load_version, save_version, dummy_graph_data + self, versioned_json_data_set, load_version, save_version, dummy_graph_data ): """Check the warning when saving to the path that differs from the subsequent load path.""" pattern = ( fr"Save version `{save_version}` did not match load version " - fr"`{load_version}` for NetworkXDataSet\(.+\)" + fr"`{load_version}` for JSONDataSet\(.+\)" ) with pytest.warns(UserWarning, match=pattern): - versioned_networkx_data_set.save(dummy_graph_data) + versioned_json_data_set.save(dummy_graph_data) def test_version_str_repr(self, load_version, save_version): """Test that version is in string representation of the class instance when applicable.""" filepath = "test.json" - ds = NetworkXDataSet(filepath=filepath) - ds_versioned = NetworkXDataSet( + ds = JSONDataSet(filepath=filepath) + ds_versioned = JSONDataSet( filepath=filepath, version=Version(load_version, save_version) ) assert filepath in str(ds) @@ -204,27 +200,27 @@ def test_version_str_repr(self, load_version, save_version): assert filepath in str(ds_versioned) ver_str = f"version=Version(load={load_version}, save='{save_version}')" assert ver_str in str(ds_versioned) - assert "NetworkXDataSet" in str(ds_versioned) - assert "NetworkXDataSet" in str(ds) + assert "JSONDataSet" in str(ds_versioned) + assert "JSONDataSet" in str(ds) assert "protocol" in str(ds_versioned) assert "protocol" in str(ds) def test_versioning_existing_dataset( - self, networkx_data_set, versioned_networkx_data_set, dummy_graph_data + self, json_data_set, versioned_json_data_set, dummy_graph_data ): """Check the error when attempting to save a versioned dataset on top of an already existing (non-versioned) dataset.""" - networkx_data_set.save(dummy_graph_data) - assert networkx_data_set.exists() - assert networkx_data_set._filepath == versioned_networkx_data_set._filepath + json_data_set.save(dummy_graph_data) + assert json_data_set.exists() + assert json_data_set._filepath == versioned_json_data_set._filepath pattern = ( f"(?=.*file with the same name already exists in the directory)" - f"(?=.*{versioned_networkx_data_set._filepath.parent.as_posix()})" + 
f"(?=.*{versioned_json_data_set._filepath.parent.as_posix()})" ) with pytest.raises(DataSetError, match=pattern): - versioned_networkx_data_set.save(dummy_graph_data) + versioned_json_data_set.save(dummy_graph_data) # Remove non-versioned dataset and try again - Path(networkx_data_set._filepath.as_posix()).unlink() - versioned_networkx_data_set.save(dummy_graph_data) - assert versioned_networkx_data_set.exists() + Path(json_data_set._filepath.as_posix()).unlink() + versioned_json_data_set.save(dummy_graph_data) + assert versioned_json_data_set.exists() diff --git a/tests/extras/datasets/pandas/test_appendable_excel_dataset.py b/tests/extras/datasets/pandas/test_appendable_excel_dataset.py deleted file mode 100644 index 774d0744c1..0000000000 --- a/tests/extras/datasets/pandas/test_appendable_excel_dataset.py +++ /dev/null @@ -1,148 +0,0 @@ -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal - -from kedro.extras.datasets.pandas import AppendableExcelDataSet, ExcelDataSet -from kedro.io import DataSetError - - -@pytest.fixture -def filepath(tmp_path): - return (tmp_path / "test.xlsx").as_posix() - - -@pytest.fixture(scope="module") -def dummy_dataframe(): - return pd.DataFrame({"col1": [7, 8], "col2": [5, 8]}) - - -def setup_excel_dataset(path): - excel_dataset = ExcelDataSet(path) - df = pd.DataFrame({"col1": [1, 2]}) - excel_dataset.save(df) - return excel_dataset, df - - -@pytest.fixture -def appendable_excel_dataset(filepath, save_args, load_args): - return AppendableExcelDataSet( - filepath=filepath, load_args=load_args, save_args=save_args - ) - - -class TestAppendableExcelDataSet: - def test_save_and_load(self, dummy_dataframe, filepath): - """Test saving and reloading the data set.""" - excel_dataset, excel_df = setup_excel_dataset(filepath) - appendable_excel_dataset = AppendableExcelDataSet( - filepath=filepath, - load_args={"sheet_name": "test"}, - save_args={"sheet_name": "test"}, - ) - appendable_excel_dataset.save(dummy_dataframe) - reloaded = appendable_excel_dataset.load() - assert_frame_equal(dummy_dataframe, reloaded) - - reloaded_first_sheet = excel_dataset.load() - assert_frame_equal(reloaded_first_sheet, excel_df) - - assert appendable_excel_dataset._save_args == { - "index": False, - "sheet_name": "test", - } - assert appendable_excel_dataset._load_args == { - "engine": "openpyxl", - "sheet_name": "test", - } - assert appendable_excel_dataset._writer_args == { - "engine": "openpyxl", - "mode": "a", - } - - def test_exists(self, filepath): - """Test `exists` method invocation for both existing and - nonexistent data set.""" - appendable_excel_dataset = AppendableExcelDataSet(filepath) - assert not appendable_excel_dataset.exists() - setup_excel_dataset(filepath) - assert appendable_excel_dataset.exists() - - @pytest.mark.parametrize( - "load_args", [{"k1": "v1", "engine": "value"}], indirect=True - ) - def test_load_extra_params(self, appendable_excel_dataset, load_args): - """Test overriding the default load arguments.""" - for key, value in load_args.items(): - assert appendable_excel_dataset._load_args[key] == value - - @pytest.mark.parametrize( - "save_args", [{"k1": "v1", "index": "value"}], indirect=True - ) - def test_save_extra_params(self, appendable_excel_dataset, save_args): - """Test overriding the default save arguments.""" - for key, value in save_args.items(): - assert appendable_excel_dataset._save_args[key] == value - - @pytest.mark.parametrize( - "save_args", [{"writer": {"mode": "write", "engine": "test"}}], indirect=True 
- ) - def test_writer_args( - self, appendable_excel_dataset, save_args - ): # pylint: disable=unused-argument - """Test overriding the default writer arguments except mode.""" - assert appendable_excel_dataset._writer_args == {"engine": "test", "mode": "a"} - - def test_load_missing_file(self, appendable_excel_dataset): - """Check the error when trying to load missing file.""" - pattern = ( - r"Failed while loading data from data set AppendableExcelDataSet\(.*\)" - ) - with pytest.raises(DataSetError, match=pattern): - appendable_excel_dataset.load() - - def test_appending_to_non_existing_file( - self, appendable_excel_dataset, dummy_dataframe - ): - pattern = ( - rf"`{appendable_excel_dataset._filepath}` Excel file not found\. " - rf"The file cannot be opened in append mode\." - ) - with pytest.raises(DataSetError, match=pattern): - appendable_excel_dataset.save(dummy_dataframe) - - def test_str_repr(self, appendable_excel_dataset): - """Test that meta data is in string representation of the class instance.""" - - assert str(appendable_excel_dataset._filepath) in str(appendable_excel_dataset) - assert "version" not in str(appendable_excel_dataset) - assert "writer_args" in str(appendable_excel_dataset) - assert "save_args={" in str(appendable_excel_dataset) - assert "load_args={" in str(appendable_excel_dataset) - - def test_load_args_passed_in_read_excel(self, appendable_excel_dataset, mocker): - read_excel_mock = mocker.patch("pandas.read_excel") - appendable_excel_dataset.load() - assert appendable_excel_dataset._load_args - read_excel_mock.assert_called_once_with( - str(appendable_excel_dataset._filepath), - **appendable_excel_dataset._load_args, - ) - - def test_save_args_passed_in_to_excel_and_writer( - self, appendable_excel_dataset, dummy_dataframe, mocker - ): - to_excel_mock = mocker.patch.object(dummy_dataframe, "to_excel") - writer_mock = mocker.patch("pandas.ExcelWriter") - - appendable_excel_dataset.save(dummy_dataframe) - assert appendable_excel_dataset._writer_args - writer_mock.assert_called_once_with( - str(appendable_excel_dataset._filepath), - **appendable_excel_dataset._writer_args, - ) - - assert appendable_excel_dataset._save_args - to_excel_mock.assert_called_once_with( - writer_mock.return_value.__enter__.return_value, - **appendable_excel_dataset._save_args, - ) diff --git a/tests/extras/datasets/pandas/test_csv_dataset.py b/tests/extras/datasets/pandas/test_csv_dataset.py index 36cb7009b0..4d057549f3 100644 --- a/tests/extras/datasets/pandas/test_csv_dataset.py +++ b/tests/extras/datasets/pandas/test_csv_dataset.py @@ -45,8 +45,6 @@ def test_save_and_load(self, csv_data_set, dummy_dataframe): csv_data_set.save(dummy_dataframe) reloaded = csv_data_set.load() assert_frame_equal(dummy_dataframe, reloaded) - assert csv_data_set._fs_open_args_load == {} - assert csv_data_set._fs_open_args_save == {"mode": "w", "newline": ""} def test_exists(self, csv_data_set, dummy_dataframe): """Test `exists` method invocation for both existing and @@ -72,16 +70,26 @@ def test_save_extra_params(self, csv_data_set, save_args): assert csv_data_set._save_args[key] == value @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], - indirect=True, + "load_args,save_args", + [ + ({"storage_options": {"a": "b"}}, {}), + ({}, {"storage_options": {"a": "b"}}), + ({"storage_options": {"a": "b"}}, {"storage_options": {"x": "y"}}), + ], ) - def test_open_extra_args(self, csv_data_set, fs_args): - assert csv_data_set._fs_open_args_load == 
fs_args["open_args_load"] - assert csv_data_set._fs_open_args_save == { - "mode": "w", - "newline": "", - } # default unchanged + def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): + filepath = str(tmp_path / "test.csv") + + ds = CSVDataSet(filepath=filepath, load_args=load_args, save_args=save_args) + + records = [r for r in caplog.records if r.levelname == "WARNING"] + expected_log_message = ( + f"Dropping `storage_options` for {filepath}, " + f"please specify them under `fs_args` or `credentials`." + ) + assert records[0].getMessage() == expected_log_message + assert "storage_options" not in ds._save_args + assert "storage_options" not in ds._load_args def test_load_missing_file(self, csv_data_set): """Check the error when trying to load missing file.""" diff --git a/tests/extras/datasets/pandas/test_excel_dataset.py b/tests/extras/datasets/pandas/test_excel_dataset.py index f7c1004522..9cdc1842d9 100644 --- a/tests/extras/datasets/pandas/test_excel_dataset.py +++ b/tests/extras/datasets/pandas/test_excel_dataset.py @@ -62,8 +62,6 @@ def test_save_and_load(self, excel_data_set, dummy_dataframe): excel_data_set.save(dummy_dataframe) reloaded = excel_data_set.load() assert_frame_equal(dummy_dataframe, reloaded) - assert excel_data_set._fs_open_args_load == {} - assert excel_data_set._fs_open_args_save == {"mode": "wb"} def test_save_and_load_multiple_sheets( self, excel_multisheet_data_set, dummy_dataframe, another_dummy_dataframe @@ -102,13 +100,26 @@ def test_save_extra_params(self, excel_data_set, save_args): assert excel_data_set._save_args[key] == value @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], - indirect=True, + "load_args,save_args", + [ + ({"storage_options": {"a": "b"}}, {}), + ({}, {"storage_options": {"a": "b"}}), + ({"storage_options": {"a": "b"}}, {"storage_options": {"x": "y"}}), + ], ) - def test_open_extra_args(self, excel_data_set, fs_args): - assert excel_data_set._fs_open_args_load == fs_args["open_args_load"] - assert excel_data_set._fs_open_args_save == {"mode": "wb"} # default unchanged + def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): + filepath = str(tmp_path / "test.csv") + + ds = ExcelDataSet(filepath=filepath, load_args=load_args, save_args=save_args) + + records = [r for r in caplog.records if r.levelname == "WARNING"] + expected_log_message = ( + f"Dropping `storage_options` for {filepath}, " + f"please specify them under `fs_args` or `credentials`." 
+ ) + assert records[0].getMessage() == expected_log_message + assert "storage_options" not in ds._save_args + assert "storage_options" not in ds._load_args def test_load_missing_file(self, excel_data_set): """Check the error when trying to load missing file.""" @@ -117,16 +128,20 @@ def test_load_missing_file(self, excel_data_set): excel_data_set.load() @pytest.mark.parametrize( - "filepath,instance_type", + "filepath,instance_type,load_path", [ - ("s3://bucket/file.xlsx", S3FileSystem), - ("file:///tmp/test.xlsx", LocalFileSystem), - ("/tmp/test.xlsx", LocalFileSystem), - ("gcs://bucket/file.xlsx", GCSFileSystem), - ("https://example.com/file.xlsx", HTTPFileSystem), + ("s3://bucket/file.xlsx", S3FileSystem, "s3://bucket/file.xlsx"), + ("file:///tmp/test.xlsx", LocalFileSystem, "/tmp/test.xlsx"), + ("/tmp/test.xlsx", LocalFileSystem, "/tmp/test.xlsx"), + ("gcs://bucket/file.xlsx", GCSFileSystem, "gcs://bucket/file.xlsx"), + ( + "https://example.com/file.xlsx", + HTTPFileSystem, + "https://example.com/file.xlsx", + ), ], ) - def test_protocol_usage(self, filepath, instance_type): + def test_protocol_usage(self, filepath, instance_type, load_path, mocker): data_set = ExcelDataSet(filepath=filepath) assert isinstance(data_set._fs, instance_type) @@ -135,6 +150,11 @@ def test_protocol_usage(self, filepath, instance_type): assert str(data_set._filepath) == path assert isinstance(data_set._filepath, PurePosixPath) + mock_pandas_call = mocker.patch("pandas.read_excel") + data_set.load() + assert mock_pandas_call.call_count == 1 + assert mock_pandas_call.call_args_list[0][0][0] == load_path + def test_catalog_release(self, mocker): fs_mock = mocker.patch("fsspec.filesystem").return_value filepath = "test.xlsx" @@ -167,8 +187,8 @@ def test_version_str_repr(self, load_version, save_version): # Default save_args and load_args assert "save_args={'index': False}" in str(ds) assert "save_args={'index': False}" in str(ds_versioned) - assert "load_args={'engine': xlrd}" in str(ds_versioned) - assert "load_args={'engine': xlrd}" in str(ds) + assert "load_args={'engine': openpyxl}" in str(ds_versioned) + assert "load_args={'engine': openpyxl}" in str(ds) def test_save_and_load(self, versioned_excel_data_set, dummy_dataframe): """Test that saved and reloaded data matches the original one for @@ -183,6 +203,20 @@ def test_no_versions(self, versioned_excel_data_set): with pytest.raises(DataSetError, match=pattern): versioned_excel_data_set.load() + def test_versioning_not_supported_in_append_mode( + self, tmp_path, load_version, save_version + ): + filepath = str(tmp_path / "test.xlsx") + save_args = {"writer": {"mode": "a"}} + + pattern = "`ExcelDataSet` doesn't support versioning in append mode." 
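        # Editor's aside (illustrative, not part of the diff): with
        # AppendableExcelDataSet deleted above, append behaviour appears to be
        # configured on ExcelDataSet itself via a nested "writer" block in
        # save_args, e.g. (hypothetical filepath):
        #     ExcelDataSet("report.xlsx", save_args={"writer": {"mode": "a"}})
        # and, as this test asserts, combining append mode with versioning
        # raises DataSetError.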
+ with pytest.raises(DataSetError, match=pattern): + ExcelDataSet( + filepath=filepath, + version=Version(load_version, save_version), + save_args=save_args, + ) + def test_exists(self, versioned_excel_data_set, dummy_dataframe): """Test `exists` method invocation for versioned data set.""" assert not versioned_excel_data_set.exists() diff --git a/tests/extras/datasets/pandas/test_feather_dataset.py b/tests/extras/datasets/pandas/test_feather_dataset.py index 237d66baa9..6d4376fc93 100644 --- a/tests/extras/datasets/pandas/test_feather_dataset.py +++ b/tests/extras/datasets/pandas/test_feather_dataset.py @@ -43,8 +43,6 @@ def test_save_and_load(self, feather_data_set, dummy_dataframe): feather_data_set.save(dummy_dataframe) reloaded = feather_data_set.load() assert_frame_equal(dummy_dataframe, reloaded) - assert feather_data_set._fs_open_args_load == {} - assert feather_data_set._fs_open_args_save == {"mode": "wb"} def test_exists(self, feather_data_set, dummy_dataframe): """Test `exists` method invocation for both existing and @@ -61,6 +59,28 @@ def test_load_extra_params(self, feather_data_set, load_args): for key, value in load_args.items(): assert feather_data_set._load_args[key] == value + @pytest.mark.parametrize( + "load_args,save_args", + [ + ({"storage_options": {"a": "b"}}, {}), + ({}, {"storage_options": {"a": "b"}}), + ({"storage_options": {"a": "b"}}, {"storage_options": {"x": "y"}}), + ], + ) + def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): + filepath = str(tmp_path / "test.csv") + + ds = FeatherDataSet(filepath=filepath, load_args=load_args, save_args=save_args) + + records = [r for r in caplog.records if r.levelname == "WARNING"] + expected_log_message = ( + f"Dropping `storage_options` for {filepath}, " + f"please specify them under `fs_args` or `credentials`." 
+ ) + assert records[0].getMessage() == expected_log_message + assert "storage_options" not in ds._save_args + assert "storage_options" not in ds._load_args + def test_load_missing_file(self, feather_data_set): """Check the error when trying to load missing file.""" pattern = r"Failed while loading data from data set FeatherDataSet\(.*\)" @@ -68,25 +88,20 @@ def test_load_missing_file(self, feather_data_set): feather_data_set.load() @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], - indirect=True, - ) - def test_open_extra_args(self, feather_data_set, fs_args): - assert feather_data_set._fs_open_args_load == fs_args["open_args_load"] - assert feather_data_set._fs_open_args_save == {"mode": "wb"} - - @pytest.mark.parametrize( - "filepath,instance_type", + "filepath,instance_type,load_path", [ - ("s3://bucket/file.feather", S3FileSystem), - ("file:///tmp/test.feather", LocalFileSystem), - ("/tmp/test.feather", LocalFileSystem), - ("gcs://bucket/file.feather", GCSFileSystem), - ("https://example.com/file.feather", HTTPFileSystem), + ("s3://bucket/file.feather", S3FileSystem, "s3://bucket/file.feather"), + ("file:///tmp/test.feather", LocalFileSystem, "/tmp/test.feather"), + ("/tmp/test.feather", LocalFileSystem, "/tmp/test.feather"), + ("gcs://bucket/file.feather", GCSFileSystem, "gcs://bucket/file.feather"), + ( + "https://example.com/file.feather", + HTTPFileSystem, + "https://example.com/file.feather", + ), ], ) - def test_protocol_usage(self, filepath, instance_type): + def test_protocol_usage(self, filepath, instance_type, load_path, mocker): data_set = FeatherDataSet(filepath=filepath) assert isinstance(data_set._fs, instance_type) @@ -95,6 +110,11 @@ def test_protocol_usage(self, filepath, instance_type): assert str(data_set._filepath) == path assert isinstance(data_set._filepath, PurePosixPath) + mock_pandas_call = mocker.patch("pandas.read_feather") + data_set.load() + assert mock_pandas_call.call_count == 1 + assert mock_pandas_call.call_args_list[0][0][0] == load_path + def test_catalog_release(self, mocker): fs_mock = mocker.patch("fsspec.filesystem").return_value filepath = "test.feather" diff --git a/tests/extras/datasets/pandas/test_json_dataset.py b/tests/extras/datasets/pandas/test_json_dataset.py index 4770c2e91b..520d3ab6e1 100644 --- a/tests/extras/datasets/pandas/test_json_dataset.py +++ b/tests/extras/datasets/pandas/test_json_dataset.py @@ -47,8 +47,6 @@ def test_save_and_load(self, json_data_set, dummy_dataframe): json_data_set.save(dummy_dataframe) reloaded = json_data_set.load() assert_frame_equal(dummy_dataframe, reloaded) - assert json_data_set._fs_open_args_load == {} - assert json_data_set._fs_open_args_save == {"mode": "w"} def test_exists(self, json_data_set, dummy_dataframe): """Test `exists` method invocation for both existing and @@ -74,13 +72,26 @@ def test_save_extra_params(self, json_data_set, save_args): assert json_data_set._save_args[key] == value @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], - indirect=True, + "load_args,save_args", + [ + ({"storage_options": {"a": "b"}}, {}), + ({}, {"storage_options": {"a": "b"}}), + ({"storage_options": {"a": "b"}}, {"storage_options": {"x": "y"}}), + ], ) - def test_open_extra_args(self, json_data_set, fs_args): - assert json_data_set._fs_open_args_load == fs_args["open_args_load"] - assert json_data_set._fs_open_args_save == {"mode": "w"} # default unchanged + def test_storage_options_dropped(self, 
load_args, save_args, caplog, tmp_path): + filepath = str(tmp_path / "test.csv") + + ds = JSONDataSet(filepath=filepath, load_args=load_args, save_args=save_args) + + records = [r for r in caplog.records if r.levelname == "WARNING"] + expected_log_message = ( + f"Dropping `storage_options` for {filepath}, " + f"please specify them under `fs_args` or `credentials`." + ) + assert records[0].getMessage() == expected_log_message + assert "storage_options" not in ds._save_args + assert "storage_options" not in ds._load_args def test_load_missing_file(self, json_data_set): """Check the error when trying to load missing file.""" @@ -89,21 +100,29 @@ def test_load_missing_file(self, json_data_set): json_data_set.load() @pytest.mark.parametrize( - "filepath,instance_type,credentials", + "filepath,instance_type,credentials,load_path", [ - ("s3://bucket/file.json", S3FileSystem, {}), - ("file:///tmp/test.json", LocalFileSystem, {}), - ("/tmp/test.json", LocalFileSystem, {}), - ("gcs://bucket/file.json", GCSFileSystem, {}), - ("https://example.com/file.json", HTTPFileSystem, {}), + ("s3://bucket/file.json", S3FileSystem, {}, "s3://bucket/file.json"), + ("file:///tmp/test.json", LocalFileSystem, {}, "/tmp/test.json"), + ("/tmp/test.json", LocalFileSystem, {}, "/tmp/test.json"), + ("gcs://bucket/file.json", GCSFileSystem, {}, "gcs://bucket/file.json"), + ( + "https://example.com/file.json", + HTTPFileSystem, + {}, + "https://example.com/file.json", + ), ( "abfs://bucket/file.csv", AzureBlobFileSystem, {"account_name": "test", "account_key": "test"}, + "abfs://bucket/file.csv", ), ], ) - def test_protocol_usage(self, filepath, instance_type, credentials): + def test_protocol_usage( + self, filepath, instance_type, credentials, load_path, mocker + ): data_set = JSONDataSet(filepath=filepath, credentials=credentials) assert isinstance(data_set._fs, instance_type) @@ -112,6 +131,11 @@ def test_protocol_usage(self, filepath, instance_type, credentials): assert str(data_set._filepath) == path assert isinstance(data_set._filepath, PurePosixPath) + mock_pandas_call = mocker.patch("pandas.read_json") + data_set.load() + assert mock_pandas_call.call_count == 1 + assert mock_pandas_call.call_args_list[0][0][0] == load_path + def test_catalog_release(self, mocker): fs_mock = mocker.patch("fsspec.filesystem").return_value filepath = "test.json" diff --git a/tests/extras/datasets/pandas/test_parquet_dataset.py b/tests/extras/datasets/pandas/test_parquet_dataset.py index 69950b6133..ff5f2eccd3 100644 --- a/tests/extras/datasets/pandas/test_parquet_dataset.py +++ b/tests/extras/datasets/pandas/test_parquet_dataset.py @@ -1,7 +1,6 @@ from pathlib import Path, PurePosixPath import pandas as pd -import pyarrow as pa import pyarrow.parquet as pq import pytest from fsspec.implementations.http import HTTPFileSystem @@ -62,7 +61,6 @@ def test_save_and_load(self, tmp_path, dummy_dataframe): data_set.save(dummy_dataframe) reloaded = data_set.load() assert_frame_equal(dummy_dataframe, reloaded) - assert data_set._fs_open_args_load == {} files = [child.is_file() for child in tmp_path.iterdir()] assert all(files) @@ -98,15 +96,28 @@ def test_save_extra_params(self, parquet_data_set, save_args): """Test overriding the default save arguments.""" for key, value in save_args.items(): assert parquet_data_set._save_args[key] == value - assert parquet_data_set._from_pandas_args == {} @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"mode": "r", "compression": "gzip"}}], - indirect=True, + "load_args,save_args", + [ + 
({"storage_options": {"a": "b"}}, {}), + ({}, {"storage_options": {"a": "b"}}), + ({"storage_options": {"a": "b"}}, {"storage_options": {"x": "y"}}), + ], ) - def test_open_extra_args(self, parquet_data_set, fs_args): - assert parquet_data_set._fs_open_args_load == fs_args["open_args_load"] + def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): + filepath = str(tmp_path / "test.csv") + + ds = ParquetDataSet(filepath=filepath, load_args=load_args, save_args=save_args) + + records = [r for r in caplog.records if r.levelname == "WARNING"] + expected_log_message = ( + f"Dropping `storage_options` for {filepath}, " + f"please specify them under `fs_args` or `credentials`." + ) + assert records[0].getMessage() == expected_log_message + assert "storage_options" not in ds._save_args + assert "storage_options" not in ds._load_args def test_load_missing_file(self, parquet_data_set): """Check the error when trying to load missing file.""" @@ -115,16 +126,20 @@ def test_load_missing_file(self, parquet_data_set): parquet_data_set.load() @pytest.mark.parametrize( - "filepath,instance_type", + "filepath,instance_type,load_path", [ - ("s3://bucket/file.parquet", S3FileSystem), - ("file:///tmp/test.parquet", LocalFileSystem), - ("/tmp/test.parquet", LocalFileSystem), - ("gcs://bucket/file.parquet", GCSFileSystem), - ("https://example.com/file.parquet", HTTPFileSystem), + ("s3://bucket/file.parquet", S3FileSystem, "s3://bucket/file.parquet"), + ("file:///tmp/test.parquet", LocalFileSystem, "/tmp/test.parquet"), + ("/tmp/test.parquet", LocalFileSystem, "/tmp/test.parquet"), + ("gcs://bucket/file.parquet", GCSFileSystem, "gcs://bucket/file.parquet"), + ( + "https://example.com/file.parquet", + HTTPFileSystem, + "https://example.com/file.parquet", + ), ], ) - def test_protocol_usage(self, filepath, instance_type): + def test_protocol_usage(self, filepath, instance_type, load_path, mocker): data_set = ParquetDataSet(filepath=filepath) assert isinstance(data_set._fs, instance_type) @@ -133,6 +148,12 @@ def test_protocol_usage(self, filepath, instance_type): assert str(data_set._filepath) == path assert isinstance(data_set._filepath, PurePosixPath) + mocker.patch.object(data_set._fs, "isdir", return_value=False) + mock_pandas_call = mocker.patch("pandas.read_parquet") + data_set.load() + assert mock_pandas_call.call_count == 1 + assert mock_pandas_call.call_args_list[0][0][0] == load_path + @pytest.mark.parametrize( "protocol,path", [("https://", "example.com/"), ("s3://", "bucket/")] ) @@ -192,27 +213,16 @@ def test_read_from_file(self, mocker): data_set.load() fs_mock.isdir.assert_called_once() - fs_mock.open.assert_called_once() - # pylint: disable=unused-argument - @pytest.mark.parametrize( - "save_args", [{"from_pandas": {"preserve_index": False}}], indirect=True - ) - def test_from_pandas_args( - self, parquet_data_set, dummy_dataframe, save_args, mocker - ): - from_pandas_mock = mocker.patch( - "kedro.extras.datasets.pandas.parquet_dataset.pa", wraps=pa + def test_arg_partition_cols(self, dummy_dataframe, tmp_path): + data_set = ParquetDataSet( + filepath=(tmp_path / FILENAME).as_posix(), + save_args={"partition_cols": ["col2"]}, ) - from_pandas_args = {"preserve_index": False} - - parquet_data_set.save(dummy_dataframe) + pattern = "does not support save argument `partition_cols`" - assert parquet_data_set._save_args == {} - assert parquet_data_set._from_pandas_args == from_pandas_args - from_pandas_mock.Table.from_pandas.assert_called_once_with( - dummy_dataframe, 
**from_pandas_args - ) + with pytest.raises(DataSetError, match=pattern): + data_set.save(dummy_dataframe) class TestParquetDataSetVersioned: diff --git a/tests/extras/datasets/pandas/test_xml_dataset.py b/tests/extras/datasets/pandas/test_xml_dataset.py new file mode 100644 index 0000000000..5c4f0f3031 --- /dev/null +++ b/tests/extras/datasets/pandas/test_xml_dataset.py @@ -0,0 +1,269 @@ +# Copyright 2021 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. 
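For orientation, the new test module below covers the `XMLDataSet` added to `kedro.extras.datasets.pandas` in this change; loading delegates to `pandas.read_xml`, as the protocol-usage test asserts. A minimal usage sketch with a hypothetical local path and columns:

import pandas as pd

from kedro.extras.datasets.pandas import XMLDataSet

data_set = XMLDataSet(filepath="data/01_raw/books.xml")  # hypothetical path
data_set.save(pd.DataFrame({"title": ["Kedro"], "year": [2021]}))
reloaded = data_set.load()  # round-trips via pandas.read_xml
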
+ +from pathlib import Path, PurePosixPath + +import pandas as pd +import pytest +from adlfs import AzureBlobFileSystem +from fsspec.implementations.http import HTTPFileSystem +from fsspec.implementations.local import LocalFileSystem +from gcsfs import GCSFileSystem +from pandas.testing import assert_frame_equal +from s3fs.core import S3FileSystem + +from kedro.extras.datasets.pandas import XMLDataSet +from kedro.io import DataSetError +from kedro.io.core import PROTOCOL_DELIMITER, Version + + +@pytest.fixture +def filepath_xml(tmp_path): + return (tmp_path / "test.xml").as_posix() + + +@pytest.fixture +def xml_data_set(filepath_xml, load_args, save_args, fs_args): + return XMLDataSet( + filepath=filepath_xml, + load_args=load_args, + save_args=save_args, + fs_args=fs_args, + ) + + +@pytest.fixture +def versioned_xml_data_set(filepath_xml, load_version, save_version): + return XMLDataSet( + filepath=filepath_xml, version=Version(load_version, save_version) + ) + + +@pytest.fixture +def dummy_dataframe(): + return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) + + +class TestXMLDataSet: + def test_save_and_load(self, xml_data_set, dummy_dataframe): + """Test saving and reloading the data set.""" + xml_data_set.save(dummy_dataframe) + reloaded = xml_data_set.load() + assert_frame_equal(dummy_dataframe, reloaded) + + def test_exists(self, xml_data_set, dummy_dataframe): + """Test `exists` method invocation for both existing and + nonexistent data set.""" + assert not xml_data_set.exists() + xml_data_set.save(dummy_dataframe) + assert xml_data_set.exists() + + @pytest.mark.parametrize( + "load_args", [{"k1": "v1", "index": "value"}], indirect=True + ) + def test_load_extra_params(self, xml_data_set, load_args): + """Test overriding the default load arguments.""" + for key, value in load_args.items(): + assert xml_data_set._load_args[key] == value + + @pytest.mark.parametrize( + "save_args", [{"k1": "v1", "index": "value"}], indirect=True + ) + def test_save_extra_params(self, xml_data_set, save_args): + """Test overriding the default save arguments.""" + for key, value in save_args.items(): + assert xml_data_set._save_args[key] == value + + @pytest.mark.parametrize( + "load_args,save_args", + [ + ({"storage_options": {"a": "b"}}, {}), + ({}, {"storage_options": {"a": "b"}}), + ({"storage_options": {"a": "b"}}, {"storage_options": {"x": "y"}}), + ], + ) + def test_storage_options_dropped(self, load_args, save_args, caplog, tmp_path): + filepath = str(tmp_path / "test.csv") + + ds = XMLDataSet(filepath=filepath, load_args=load_args, save_args=save_args) + + records = [r for r in caplog.records if r.levelname == "WARNING"] + expected_log_message = ( + f"Dropping `storage_options` for {filepath}, " + f"please specify them under `fs_args` or `credentials`." 
+ ) + assert records[0].getMessage() == expected_log_message + assert "storage_options" not in ds._save_args + assert "storage_options" not in ds._load_args + + def test_load_missing_file(self, xml_data_set): + """Check the error when trying to load missing file.""" + pattern = r"Failed while loading data from data set XMLDataSet\(.*\)" + with pytest.raises(DataSetError, match=pattern): + xml_data_set.load() + + @pytest.mark.parametrize( + "filepath,instance_type,credentials,load_path", + [ + ("s3://bucket/file.xml", S3FileSystem, {}, "s3://bucket/file.xml"), + ("file:///tmp/test.xml", LocalFileSystem, {}, "/tmp/test.xml"), + ("/tmp/test.xml", LocalFileSystem, {}, "/tmp/test.xml"), + ("gcs://bucket/file.xml", GCSFileSystem, {}, "gcs://bucket/file.xml"), + ( + "https://example.com/file.xml", + HTTPFileSystem, + {}, + "https://example.com/file.xml", + ), + ( + "abfs://bucket/file.csv", + AzureBlobFileSystem, + {"account_name": "test", "account_key": "test"}, + "abfs://bucket/file.csv", + ), + ], + ) + def test_protocol_usage( + self, filepath, instance_type, credentials, load_path, mocker + ): + data_set = XMLDataSet(filepath=filepath, credentials=credentials) + assert isinstance(data_set._fs, instance_type) + + path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] + + assert str(data_set._filepath) == path + assert isinstance(data_set._filepath, PurePosixPath) + + mock_pandas_call = mocker.patch("pandas.read_xml") + data_set.load() + assert mock_pandas_call.call_count == 1 + assert mock_pandas_call.call_args_list[0][0][0] == load_path + + def test_catalog_release(self, mocker): + fs_mock = mocker.patch("fsspec.filesystem").return_value + filepath = "test.xml" + data_set = XMLDataSet(filepath=filepath) + data_set.release() + fs_mock.invalidate_cache.assert_called_once_with(filepath) + + +class TestXMLDataSetVersioned: + def test_version_str_repr(self, load_version, save_version): + """Test that version is in string representation of the class instance + when applicable.""" + filepath = "test.xml" + ds = XMLDataSet(filepath=filepath) + ds_versioned = XMLDataSet( + filepath=filepath, version=Version(load_version, save_version) + ) + assert filepath in str(ds) + assert "version" not in str(ds) + + assert filepath in str(ds_versioned) + ver_str = f"version=Version(load={load_version}, save='{save_version}')" + assert ver_str in str(ds_versioned) + assert "XMLDataSet" in str(ds_versioned) + assert "XMLDataSet" in str(ds) + assert "protocol" in str(ds_versioned) + assert "protocol" in str(ds) + + def test_save_and_load(self, versioned_xml_data_set, dummy_dataframe): + """Test that saved and reloaded data matches the original one for + the versioned data set.""" + versioned_xml_data_set.save(dummy_dataframe) + reloaded_df = versioned_xml_data_set.load() + assert_frame_equal(dummy_dataframe, reloaded_df) + + def test_no_versions(self, versioned_xml_data_set): + """Check the error if no versions are available for load.""" + pattern = r"Did not find any versions for XMLDataSet\(.+\)" + with pytest.raises(DataSetError, match=pattern): + versioned_xml_data_set.load() + + def test_exists(self, versioned_xml_data_set, dummy_dataframe): + """Test `exists` method invocation for versioned data set.""" + assert not versioned_xml_data_set.exists() + versioned_xml_data_set.save(dummy_dataframe) + assert versioned_xml_data_set.exists() + + def test_prevent_overwrite(self, versioned_xml_data_set, dummy_dataframe): + """Check the error when attempting to override the data set if the + corresponding hdf file for a given 
save version already exists.""" + versioned_xml_data_set.save(dummy_dataframe) + pattern = ( + r"Save path \`.+\` for XMLDataSet\(.+\) must " + r"not exist if versioning is enabled\." + ) + with pytest.raises(DataSetError, match=pattern): + versioned_xml_data_set.save(dummy_dataframe) + + @pytest.mark.parametrize( + "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True + ) + @pytest.mark.parametrize( + "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True + ) + def test_save_version_warning( + self, versioned_xml_data_set, load_version, save_version, dummy_dataframe + ): + """Check the warning when saving to the path that differs from + the subsequent load path.""" + pattern = ( + fr"Save version `{save_version}` did not match " + fr"load version `{load_version}` for XMLDataSet\(.+\)" + ) + with pytest.warns(UserWarning, match=pattern): + versioned_xml_data_set.save(dummy_dataframe) + + def test_http_filesystem_no_versioning(self): + pattern = r"HTTP\(s\) DataSet doesn't support versioning\." + + with pytest.raises(DataSetError, match=pattern): + XMLDataSet( + filepath="https://example.com/file.xml", version=Version(None, None) + ) + + def test_versioning_existing_dataset( + self, xml_data_set, versioned_xml_data_set, dummy_dataframe + ): + """Check the error when attempting to save a versioned dataset on top of an + already existing (non-versioned) dataset.""" + xml_data_set.save(dummy_dataframe) + assert xml_data_set.exists() + assert xml_data_set._filepath == versioned_xml_data_set._filepath + pattern = ( + f"(?=.*file with the same name already exists in the directory)" + f"(?=.*{versioned_xml_data_set._filepath.parent.as_posix()})" + ) + with pytest.raises(DataSetError, match=pattern): + versioned_xml_data_set.save(dummy_dataframe) + + # Remove non-versioned dataset and try again + Path(xml_data_set._filepath.as_posix()).unlink() + versioned_xml_data_set.save(dummy_dataframe) + assert versioned_xml_data_set.exists() diff --git a/tests/extras/datasets/spark/conftest.py b/tests/extras/datasets/spark/conftest.py index b1be145f31..4b08ae9d6c 100644 --- a/tests/extras/datasets/spark/conftest.py +++ b/tests/extras/datasets/spark/conftest.py @@ -11,7 +11,7 @@ try: from pyspark.sql import SparkSession except ImportError: # pragma: no cover - pass # this is only for test discovery to succeed on Python 3.8 + pass # this is only for test discovery to succeed on Python 3.8, 3.9 def _setup_spark_session(): diff --git a/tests/extras/datasets/spark/test_memory_dataset.py b/tests/extras/datasets/spark/test_memory_dataset.py index 5d6c62b1dd..ef47eac9d3 100644 --- a/tests/extras/datasets/spark/test_memory_dataset.py +++ b/tests/extras/datasets/spark/test_memory_dataset.py @@ -31,37 +31,37 @@ def spark_data_frame(spark_session): @pytest.fixture -def memory_data_set(spark_data_frame): +def memory_dataset(spark_data_frame): return MemoryDataSet(data=spark_data_frame) -def test_load_modify_original_data(memory_data_set, spark_data_frame): +def test_load_modify_original_data(memory_dataset, spark_data_frame): """Check that the data set object is not updated when the original SparkDataFrame is changed.""" spark_data_frame = _update_spark_df(spark_data_frame, 1, 1, -5) - assert not _check_equals(memory_data_set.load(), spark_data_frame) + assert not _check_equals(memory_dataset.load(), spark_data_frame) def test_save_modify_original_data(spark_data_frame): """Check that the data set object is not updated when the original SparkDataFrame is changed.""" - memory_data_set = MemoryDataSet() - 
memory_data_set.save(spark_data_frame) + memory_dataset = MemoryDataSet() + memory_dataset.save(spark_data_frame) spark_data_frame = _update_spark_df(spark_data_frame, 1, 1, "new value") - assert not _check_equals(memory_data_set.load(), spark_data_frame) + assert not _check_equals(memory_dataset.load(), spark_data_frame) -def test_load_returns_same_spark_object(memory_data_set, spark_data_frame): +def test_load_returns_same_spark_object(memory_dataset, spark_data_frame): """Test that consecutive loads point to the same object in case of a SparkDataFrame""" - loaded_data = memory_data_set.load() - reloaded_data = memory_data_set.load() + loaded_data = memory_dataset.load() + reloaded_data = memory_dataset.load() assert _check_equals(loaded_data, spark_data_frame) assert _check_equals(reloaded_data, spark_data_frame) assert loaded_data is reloaded_data -def test_str_representation(memory_data_set): +def test_str_representation(memory_dataset): """Test string representation of the data set""" - assert "MemoryDataSet(data=)" in str(memory_data_set) + assert "MemoryDataSet(data=)" in str(memory_dataset) diff --git a/tests/extras/datasets/spark/test_spark_hive_dataset.py b/tests/extras/datasets/spark/test_spark_hive_dataset.py index c30ca49cd3..36cd3926cf 100644 --- a/tests/extras/datasets/spark/test_spark_hive_dataset.py +++ b/tests/extras/datasets/spark/test_spark_hive_dataset.py @@ -1,4 +1,5 @@ import gc +import re from pathlib import Path from tempfile import TemporaryDirectory @@ -34,6 +35,9 @@ def spark_session(): .enableHiveSupport() .getOrCreate() ) + spark.sparkContext.setCheckpointDir( + str((Path(tmpdir) / "spark_checkpoint").absolute()) + ) yield spark # This fixture should be a dependency of other fixtures dealing with spark hive data @@ -141,7 +145,7 @@ def test_cant_pickle(self): def test_read_existing_table(self): dataset = SparkHiveDataSet( - database="default_1", table="table_1", write_mode="overwrite" + database="default_1", table="table_1", write_mode="overwrite", save_args={} ) assert_df_equal(_generate_spark_df_one(), dataset.load()) @@ -177,7 +181,7 @@ def test_insert_not_empty_table(self, spark_session): dataset = SparkHiveDataSet( database="default_1", table="test_insert_not_empty_table", - write_mode="insert", + write_mode="append", ) dataset.save(_generate_spark_df_one()) dataset.save(_generate_spark_df_one()) @@ -226,25 +230,29 @@ def test_upsert_not_empty_table(self, spark_session): ) def test_invalid_pk_provided(self): + _test_columns = ["column_doesnt_exist"] dataset = SparkHiveDataSet( database="default_1", table="table_1", write_mode="upsert", - table_pk=["column_doesnt_exist"], + table_pk=_test_columns, ) with pytest.raises( DataSetError, - match=r"Columns \[column_doesnt_exist\] selected as primary key\(s\) not " - r"found in table default_1\.table_1", + match=re.escape( + f"Columns {str(_test_columns)} selected as primary key(s) " + f"not found in table default_1.table_1", + ), ): dataset.save(_generate_spark_df_one()) def test_invalid_write_mode_provided(self): pattern = ( - r"Invalid `write_mode` provided: not_a_write_mode\. " - r"`write_mode` must be one of: insert, upsert, overwrite" + "Invalid `write_mode` provided: not_a_write_mode. 
" + "`write_mode` must be one of: " + "append, error, errorifexists, upsert, overwrite" ) - with pytest.raises(DataSetError, match=pattern): + with pytest.raises(DataSetError, match=re.escape(pattern)): SparkHiveDataSet( database="default_1", table="table_1", @@ -260,7 +268,7 @@ def test_invalid_schema_insert(self, spark_session): dataset = SparkHiveDataSet( database="default_1", table="test_invalid_schema_insert", - write_mode="insert", + write_mode="append", ) with pytest.raises( DataSetError, @@ -272,7 +280,7 @@ def test_invalid_schema_insert(self, spark_session): def test_insert_to_non_existent_table(self): dataset = SparkHiveDataSet( - database="default_1", table="table_not_yet_created", write_mode="insert" + database="default_1", table="table_not_yet_created", write_mode="append" ) dataset.save(_generate_spark_df_one()) assert_df_equal( @@ -281,10 +289,15 @@ def test_insert_to_non_existent_table(self): def test_read_from_non_existent_table(self): dataset = SparkHiveDataSet( - database="default_1", table="table_doesnt_exist", write_mode="insert" + database="default_1", table="table_doesnt_exist", write_mode="append" ) with pytest.raises( DataSetError, - match="Requested table not found: default_1.table_doesnt_exist", + match=r"Failed while loading data from data set " + r"SparkHiveDataSet\(database=default_1, format=hive, " + r"table=table_doesnt_exist, table_pk=\[\], write_mode=append\)\.\n" + r"Table or view not found: default_1.table_doesnt_exist;\n" + r"'UnresolvedRelation \[default_1, " + r"table_doesnt_exist\], \[\], false\n", ): dataset.load() diff --git a/tests/extras/datasets/yaml/test_yaml_dataset.py b/tests/extras/datasets/yaml/test_yaml_dataset.py index 7b75b45231..03428a3802 100644 --- a/tests/extras/datasets/yaml/test_yaml_dataset.py +++ b/tests/extras/datasets/yaml/test_yaml_dataset.py @@ -102,7 +102,7 @@ def test_catalog_release(self, mocker): def test_dataframe_support(self, yaml_data_set): data = pd.DataFrame({"col1": [1, 2], "col2": [4, 5]}) - yaml_data_set.save(data) + yaml_data_set.save(data.to_dict()) reloaded = yaml_data_set.load() assert isinstance(reloaded, dict) diff --git a/tests/extras/decorators/__init__.py b/tests/extras/decorators/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/extras/decorators/test_memory_profiler.py b/tests/extras/decorators/test_memory_profiler.py deleted file mode 100644 index d8d54ea106..0000000000 --- a/tests/extras/decorators/test_memory_profiler.py +++ /dev/null @@ -1,56 +0,0 @@ -import importlib -import logging -from time import sleep - -import pytest - -from kedro.extras.decorators import memory_profiler - - -def sleeping_identity(inp): - sleep(0.1) - return inp - - -def test_mem_profile(caplog): - func = memory_profiler.mem_profile(sleeping_identity) - res = func(1) - - logger_name, severity, message = caplog.record_tuples[0] - assert res == 1 - assert logger_name == "kedro.extras.decorators.memory_profiler" - assert severity == logging.INFO - expected = ( - f"Running '{sleeping_identity.__module__}.{sleeping_identity.__qualname__}' " - f"consumed" - ) - assert expected in message - - -def test_mem_profile_old_versions(caplog, mocker): - mocker.patch( - "kedro.extras.decorators.memory_profiler.memory_usage", - return_value=[[float(0)], 1], - ) - func = memory_profiler.mem_profile(sleeping_identity) - res = func(1) - - logger_name, severity, message = caplog.record_tuples[0] - assert res == 1 - assert logger_name == "kedro.extras.decorators.memory_profiler" - assert severity == logging.INFO 
- expected = ( - f"Running '{sleeping_identity.__module__}.{sleeping_identity.__qualname__}' " - f"consumed" - ) - assert expected in message - - -def test_import_error(mocker): - mocker.patch.dict("sys.modules", {"memory_profiler": None}) - pattern = ( - r".*`pip install kedro\[profilers\]` to get the required " - "memory profiler dependencies" - ) - with pytest.raises(ImportError, match=pattern): - importlib.reload(memory_profiler) diff --git a/tests/extras/decorators/test_retry_node.py b/tests/extras/decorators/test_retry_node.py deleted file mode 100644 index 6e07a14bb7..0000000000 --- a/tests/extras/decorators/test_retry_node.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest - -from kedro.extras.decorators.retry_node import retry -from kedro.pipeline import node - - -def test_retry(): - def _bigger(obj): - obj["value"] += 1 - if obj["value"] >= 0: - return True - raise ValueError("Value less than 0") - - decorated = node(_bigger, "in", "out").decorate(retry()) - - with pytest.raises(ValueError, match=r"Value less than 0"): - decorated.run({"in": {"value": -3}}) - - decorated2 = node(_bigger, "in", "out").decorate(retry(n_times=2)) - assert decorated2.run({"in": {"value": -3}}) diff --git a/tests/extras/extensions/test_ipython.py b/tests/extras/extensions/test_ipython.py index 4678e65639..49ba08b33f 100644 --- a/tests/extras/extensions/test_ipython.py +++ b/tests/extras/extensions/test_ipython.py @@ -95,6 +95,7 @@ def my_register_pipeline(): ) mocker.patch("kedro.framework.project.settings.configure") mocker.patch("kedro.framework.session.session.validate_settings") + mocker.patch("kedro.framework.session.KedroSession._get_config_loader") mocker.patch( "kedro.framework.startup.bootstrap_project", return_value=fake_metadata, diff --git a/tests/extras/transformers/__init__.py b/tests/extras/transformers/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/extras/transformers/conftest.py b/tests/extras/transformers/conftest.py deleted file mode 100644 index 703cda48a7..0000000000 --- a/tests/extras/transformers/conftest.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -This file contains the fixtures that are reusable by any tests within -this directory. You don't need to import the fixtures as pytest will -discover them automatically. 
More info here: -https://docs.pytest.org/en/latest/fixture.html -""" - -from typing import Any, Dict - -import pytest - -from kedro.io import AbstractDataSet, DataCatalog - - -class FakeDataSet(AbstractDataSet): - def __init__(self, data): - self.log = [] - self.data = data - - def _load(self) -> Any: - self.log.append(("load", self.data)) - return self.data - - def _save(self, data: Any) -> None: - self.log.append(("save", data)) - self.data = data - - def _describe(self) -> Dict[str, Any]: - return {"data": self.data} - - -@pytest.fixture -def fake_data_set(): - return FakeDataSet(123) - - -@pytest.fixture -def catalog(fake_data_set): - return DataCatalog({"test": fake_data_set}) diff --git a/tests/extras/transformers/test_memory_profiler.py b/tests/extras/transformers/test_memory_profiler.py deleted file mode 100644 index e7ff80c269..0000000000 --- a/tests/extras/transformers/test_memory_profiler.py +++ /dev/null @@ -1,28 +0,0 @@ -import importlib - -import pytest - -import kedro.extras.transformers.memory_profiler as tf - - -class TestMemoryTransformer: - def test_memory_usage(self, catalog, caplog): - expected_log = "MiB memory at peak time" - catalog.add_transformer(tf.ProfileMemoryTransformer()) - - catalog.save("test", 42) - assert "Saving test consumed" in caplog.text - assert expected_log in caplog.text - caplog.clear() - assert catalog.load("test") == 42 - assert "Loading test consumed" in caplog.text - assert expected_log in caplog.text - - def test_import_error(self, mocker): - mocker.patch.dict("sys.modules", {"memory_profiler": None}) - pattern = ( - r".*`pip install kedro\[profilers\]` to get the required " - "memory profiler dependencies" - ) - with pytest.raises(ImportError, match=pattern): - importlib.reload(tf) diff --git a/tests/extras/transformers/test_time_profiler.py b/tests/extras/transformers/test_time_profiler.py deleted file mode 100644 index d4048d21dd..0000000000 --- a/tests/extras/transformers/test_time_profiler.py +++ /dev/null @@ -1,11 +0,0 @@ -from kedro.extras.transformers import ProfileTimeTransformer - - -class TestTransformers: - def test_timing(self, catalog, caplog): - catalog.add_transformer(ProfileTimeTransformer()) - - catalog.save("test", 42) - assert "Saving test took" in caplog.text - assert catalog.load("test") == 42 - assert "Loading test took" in caplog.text diff --git a/tests/framework/cli/pipeline/conftest.py b/tests/framework/cli/pipeline/conftest.py index 0e6aefc918..2dc27dd6ce 100644 --- a/tests/framework/cli/pipeline/conftest.py +++ b/tests/framework/cli/pipeline/conftest.py @@ -12,6 +12,32 @@ def mocked_logging(mocker): return mocker.patch("logging.config.dictConfig") +@pytest.fixture(autouse=True) +def cleanup_micropackages(fake_repo_path, fake_package_path): + packages = {p.name for p in fake_package_path.iterdir() if p.is_dir()} + + yield + + created_packages = { + p.name + for p in fake_package_path.iterdir() + if p.is_dir() and p.name != "__pycache__" + } + created_packages -= packages + + for micropackage in created_packages: + shutil.rmtree(str(fake_package_path / micropackage)) + + confs = fake_repo_path / settings.CONF_SOURCE + for each in confs.rglob(f"*{micropackage}*"): + if each.is_file(): + each.unlink() + + tests = fake_repo_path / "src" / "tests" / micropackage + if tests.is_dir(): + shutil.rmtree(str(tests)) + + @pytest.fixture(autouse=True) def cleanup_pipelines(fake_repo_path, fake_package_path): pipes_path = fake_package_path / "pipelines" @@ -29,35 +55,28 @@ def cleanup_pipelines(fake_repo_path, 
fake_package_path): for pipeline in created_pipelines: shutil.rmtree(str(pipes_path / pipeline)) - confs = fake_repo_path / settings.CONF_ROOT + confs = fake_repo_path / settings.CONF_SOURCE for each in confs.rglob(f"*{pipeline}*"): # clean all pipeline config files if each.is_file(): each.unlink() - dirs_to_delete = ( - dirpath - for pattern in ("parameters", "catalog") - for dirpath in confs.rglob(pattern) - if dirpath.is_dir() and not any(dirpath.iterdir()) - ) - for dirpath in dirs_to_delete: - dirpath.rmdir() + for pattern in ("parameter", "catalog"): + for dirpath in confs.rglob(pattern): + if dirpath.is_dir() and not any(dirpath.iterdir()): + dirpath.rmdir() tests = fake_repo_path / "src" / "tests" / "pipelines" / pipeline if tests.is_dir(): shutil.rmtree(str(tests)) - # remove requirements.in and reset requirements.txt - requirements_in = fake_repo_path / "src" / "requirements.in" - if requirements_in.exists(): - requirements_in.unlink() + # reset requirements.txt requirements_txt.write_text(requirements) @pytest.fixture def cleanup_dist(fake_repo_path): yield - dist_dir = fake_repo_path / "src" / "dist" + dist_dir = fake_repo_path / "dist" if dist_dir.exists(): shutil.rmtree(str(dist_dir)) diff --git a/tests/framework/cli/pipeline/test_pipeline.py b/tests/framework/cli/pipeline/test_pipeline.py index 44b5d178f2..130a17c3c2 100644 --- a/tests/framework/cli/pipeline/test_pipeline.py +++ b/tests/framework/cli/pipeline/test_pipeline.py @@ -20,7 +20,7 @@ def make_pipelines(request, fake_repo_path, fake_package_path, mocker): source_path = fake_package_path / "pipelines" / PIPELINE_NAME tests_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME - conf_path = fake_repo_path / settings.CONF_ROOT / request.param / "parameters" + conf_path = fake_repo_path / settings.CONF_SOURCE / request.param / "parameters" for path in (source_path, tests_path, conf_path): path.mkdir(parents=True, exist_ok=True) @@ -36,26 +36,6 @@ def make_pipelines(request, fake_repo_path, fake_package_path, mocker): shutil.rmtree(str(conf_path), ignore_errors=True) -@pytest.fixture -def yaml_dump_mock(mocker): - return mocker.patch("yaml.dump", return_value="Result YAML") - - -@pytest.fixture -def pipelines_dict(): - pipelines = { - "de": ["split_data (split_data)"], - "ds": [ - "train_model (train_model)", - "predict (predict)", - "report_accuracy (report_accuracy)", - ], - "dp": ["data_processing.split_data (split_data)"], - } - pipelines["__default__"] = pipelines["de"] + pipelines["ds"] - return pipelines - - LETTER_ERROR = "It must contain only letters, digits, and/or underscores." FIRST_CHAR_ERROR = "It must start with a letter or underscore." TOO_SHORT_ERROR = "It must be at least 2 characters long." @@ -90,7 +70,7 @@ def test_create_pipeline( # pylint: disable=too-many-locals # config conf_env = env or "base" - conf_dir = (fake_repo_path / settings.CONF_ROOT / conf_env).resolve() + conf_dir = (fake_repo_path / settings.CONF_SOURCE / conf_env).resolve() actual_configs = list(conf_dir.glob(f"**/{PIPELINE_NAME}.yml")) expected_configs = [conf_dir / "parameters" / f"{PIPELINE_NAME}.yml"] assert actual_configs == expected_configs @@ -119,7 +99,7 @@ def test_create_pipeline_skip_config( assert f"Creating the pipeline `{PIPELINE_NAME}`: OK" in result.output assert f"Pipeline `{PIPELINE_NAME}` was successfully created." 
in result.output - conf_dirs = list((fake_repo_path / settings.CONF_ROOT).rglob(PIPELINE_NAME)) + conf_dirs = list((fake_repo_path / settings.CONF_SOURCE).rglob(PIPELINE_NAME)) assert conf_dirs == [] # no configs created for the pipeline test_dir = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME @@ -138,7 +118,7 @@ def test_catalog_and_params( # pylint: disable=too-many-locals assert result.exit_code == 0 # write pipeline catalog - conf_dir = fake_repo_path / settings.CONF_ROOT / "base" + conf_dir = fake_repo_path / settings.CONF_SOURCE / "base" catalog_dict = { "ds_from_pipeline": { "type": "pandas.CSVDataSet", @@ -169,7 +149,7 @@ def test_skip_copy(self, fake_repo_path, fake_project_cli, fake_metadata): for dirname in ("catalog", "parameters"): path = ( fake_repo_path - / settings.CONF_ROOT + / settings.CONF_SOURCE / "base" / dirname / f"{PIPELINE_NAME}.yml" @@ -299,7 +279,7 @@ def test_delete_pipeline( tests_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME params_path = ( fake_repo_path - / settings.CONF_ROOT + / settings.CONF_SOURCE / expected_conf / "parameters" / f"{PIPELINE_NAME}.yml" @@ -335,7 +315,7 @@ def test_delete_pipeline_skip( tests_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME params_path = ( fake_repo_path - / settings.CONF_ROOT + / settings.CONF_SOURCE / "base" / "parameters" / f"{PIPELINE_NAME}.yml" @@ -427,7 +407,7 @@ def test_pipeline_delete_confirmation( tests_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME params_path = ( fake_repo_path - / settings.CONF_ROOT + / settings.CONF_SOURCE / "base" / "parameters" / f"{PIPELINE_NAME}.yml" @@ -468,7 +448,7 @@ def test_pipeline_delete_confirmation_skip( tests_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME params_path = ( fake_repo_path - / settings.CONF_ROOT + / settings.CONF_SOURCE / "base" / "parameters" / f"{PIPELINE_NAME}.yml" @@ -489,61 +469,6 @@ def test_pipeline_delete_confirmation_skip( assert params_path.is_file() -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log") -def test_list_pipelines( - fake_project_cli, fake_metadata, yaml_dump_mock, pipelines_dict -): - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "list"], obj=fake_metadata - ) - - assert not result.exit_code - yaml_dump_mock.assert_called_once_with(sorted(pipelines_dict.keys())) - - -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log") -class TestPipelineDescribeCommand: - @pytest.mark.parametrize("pipeline_name", ["de", "ds", "dp", "__default__"]) - def test_describe_pipeline( - self, - fake_project_cli, - fake_metadata, - yaml_dump_mock, - pipeline_name, - pipelines_dict, - ): - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "describe", pipeline_name], obj=fake_metadata - ) - - assert not result.exit_code - expected_dict = {"Nodes": pipelines_dict[pipeline_name]} - yaml_dump_mock.assert_called_once_with(expected_dict) - - def test_not_found_pipeline(self, fake_project_cli, fake_metadata): - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "describe", "missing"], obj=fake_metadata - ) - - assert result.exit_code - expected_output = ( - "Error: `missing` pipeline not found. 
Existing pipelines: " - "[__default__, de, dp, ds]\n" - ) - assert expected_output in result.output - - def test_describe_pipeline_default( - self, fake_project_cli, fake_metadata, yaml_dump_mock, pipelines_dict - ): - result = CliRunner().invoke( - fake_project_cli, ["pipeline", "describe"], obj=fake_metadata - ) - - assert not result.exit_code - expected_dict = {"Nodes": pipelines_dict["__default__"]} - yaml_dump_mock.assert_called_once_with(expected_dict) - - class TestSyncDirs: @pytest.fixture(autouse=True) def mock_click(self, mocker): diff --git a/tests/framework/cli/pipeline/test_pipeline_package.py b/tests/framework/cli/pipeline/test_pipeline_package.py index f1894de041..50649fe3ca 100644 --- a/tests/framework/cli/pipeline/test_pipeline_package.py +++ b/tests/framework/cli/pipeline/test_pipeline_package.py @@ -1,12 +1,12 @@ +import tarfile import textwrap from pathlib import Path -from zipfile import ZipFile import pytest import toml from click.testing import CliRunner -from kedro.framework.cli.pipeline import _get_wheel_name +from kedro.framework.cli.pipeline import _get_sdist_name PIPELINE_NAME = "my_pipeline" @@ -17,42 +17,36 @@ @pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log", "cleanup_dist") class TestPipelinePackageCommand: - def assert_wheel_contents_correct( - self, wheel_location, package_name=PIPELINE_NAME, version="0.1" + def assert_sdist_contents_correct( + self, sdist_location, package_name=PIPELINE_NAME, version="0.1" ): - wheel_name = _get_wheel_name(name=package_name, version=version) - wheel_file = wheel_location / wheel_name - assert wheel_file.is_file() - assert len(list(wheel_location.iterdir())) == 1 + sdist_name = _get_sdist_name(name=package_name, version=version) + sdist_file = sdist_location / sdist_name + assert sdist_file.is_file() + assert len(list(sdist_location.iterdir())) == 1 + + with tarfile.open(sdist_file, "r") as tar: + sdist_contents = set(tar.getnames()) - # pylint: disable=consider-using-with - wheel_contents = set(ZipFile(str(wheel_file)).namelist()) expected_files = { - f"{package_name}/__init__.py", - f"{package_name}/README.md", - f"{package_name}/nodes.py", - f"{package_name}/pipeline.py", - f"{package_name}/config/parameters/{package_name}.yml", - "tests/__init__.py", - "tests/test_pipeline.py", + f"{package_name}-{version}/{package_name}/__init__.py", + f"{package_name}-{version}/{package_name}/README.md", + f"{package_name}-{version}/{package_name}/nodes.py", + f"{package_name}-{version}/{package_name}/pipeline.py", + f"{package_name}-{version}/{package_name}/config/parameters/{package_name}.yml", + f"{package_name}-{version}/tests/__init__.py", + f"{package_name}-{version}/tests/test_pipeline.py", } - assert expected_files <= wheel_contents + assert expected_files <= sdist_contents @pytest.mark.parametrize( - "options,package_name,version,success_message", + "options,package_name,success_message", [ - ([], PIPELINE_NAME, "0.1", f"Pipeline `{PIPELINE_NAME}` packaged!"), + ([], PIPELINE_NAME, f"`dummy_package.pipelines.{PIPELINE_NAME}` packaged!"), ( ["--alias", "alternative"], "alternative", - "0.1", - f"Pipeline `{PIPELINE_NAME}` packaged as `alternative`!", - ), - ( - ["--version", "0.3"], - PIPELINE_NAME, - "0.3", - f"Pipeline `{PIPELINE_NAME}` packaged!", + f"`dummy_package.pipelines.{PIPELINE_NAME}` packaged as `alternative`!", ), ], ) @@ -62,7 +56,6 @@ def test_package_pipeline( fake_project_cli, options, package_name, - version, success_message, fake_metadata, ): @@ -72,18 +65,18 @@ def test_package_pipeline( 
assert result.exit_code == 0 result = CliRunner().invoke( fake_project_cli, - ["pipeline", "package", PIPELINE_NAME] + options, + ["pipeline", "package", f"pipelines.{PIPELINE_NAME}"] + options, obj=fake_metadata, ) assert result.exit_code == 0 assert success_message in result.output - wheel_location = fake_repo_path / "src" / "dist" - assert f"Location: {wheel_location}" in result.output + sdist_location = fake_repo_path / "dist" + assert f"Location: {sdist_location}" in result.output - self.assert_wheel_contents_correct( - wheel_location=wheel_location, package_name=package_name, version=version + self.assert_sdist_contents_correct( + sdist_location=sdist_location, package_name=package_name, version="0.1" ) def test_pipeline_package_same_name_as_package_name( @@ -100,14 +93,16 @@ def test_pipeline_package_same_name_as_package_name( assert result.exit_code == 0 result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", pipeline_name], obj=fake_metadata + fake_project_cli, + ["pipeline", "package", f"pipelines.{pipeline_name}"], + obj=fake_metadata, ) - wheel_location = fake_repo_path / "src" / "dist" + sdist_location = fake_repo_path / "dist" assert result.exit_code == 0 - assert f"Location: {wheel_location}" in result.output - self.assert_wheel_contents_correct( - wheel_location=wheel_location, package_name=pipeline_name + assert f"Location: {sdist_location}" in result.output + self.assert_sdist_contents_correct( + sdist_location=sdist_location, package_name=pipeline_name ) def test_pipeline_package_same_name_as_package_name_alias( @@ -125,15 +120,15 @@ def test_pipeline_package_same_name_as_package_name_alias( result = CliRunner().invoke( fake_project_cli, - ["pipeline", "package", PIPELINE_NAME, "--alias", alias], + ["pipeline", "package", f"pipelines.{PIPELINE_NAME}", "--alias", alias], obj=fake_metadata, ) - wheel_location = fake_repo_path / "src" / "dist" + sdist_location = fake_repo_path / "dist" assert result.exit_code == 0 - assert f"Location: {wheel_location}" in result.output - self.assert_wheel_contents_correct( - wheel_location=wheel_location, package_name=alias + assert f"Location: {sdist_location}" in result.output + self.assert_sdist_contents_correct( + sdist_location=sdist_location, package_name=alias ) @pytest.mark.parametrize("existing_dir", [True, False]) @@ -150,25 +145,32 @@ def test_pipeline_package_to_destination( assert result.exit_code == 0 result = CliRunner().invoke( fake_project_cli, - ["pipeline", "package", PIPELINE_NAME, "--destination", str(destination)], + [ + "pipeline", + "package", + f"pipelines.{PIPELINE_NAME}", + "--destination", + str(destination), + ], obj=fake_metadata, ) assert result.exit_code == 0 success_message = ( - f"Pipeline `{PIPELINE_NAME}` packaged! Location: {destination}" + f"`dummy_package.pipelines.{PIPELINE_NAME}` packaged! 
" + f"Location: {destination}" ) assert success_message in result.output - self.assert_wheel_contents_correct(wheel_location=destination) + self.assert_sdist_contents_correct(sdist_location=destination) - def test_pipeline_package_overwrites_wheel( + def test_pipeline_package_overwrites_sdist( self, fake_project_cli, tmp_path, fake_metadata ): destination = (tmp_path / "in" / "here").resolve() destination.mkdir(parents=True) - wheel_file = destination / _get_wheel_name(name=PIPELINE_NAME, version="0.1") - wheel_file.touch() + sdist_file = destination / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + sdist_file.touch() result = CliRunner().invoke( fake_project_cli, ["pipeline", "create", PIPELINE_NAME], obj=fake_metadata @@ -176,19 +178,26 @@ def test_pipeline_package_overwrites_wheel( assert result.exit_code == 0 result = CliRunner().invoke( fake_project_cli, - ["pipeline", "package", PIPELINE_NAME, "--destination", str(destination)], + [ + "pipeline", + "package", + f"pipelines.{PIPELINE_NAME}", + "--destination", + str(destination), + ], obj=fake_metadata, ) assert result.exit_code == 0 - warning_message = f"Package file {wheel_file} will be overwritten!" + warning_message = f"Package file {sdist_file} will be overwritten!" success_message = ( - f"Pipeline `{PIPELINE_NAME}` packaged! Location: {destination}" + f"`dummy_package.pipelines.{PIPELINE_NAME}` packaged! " + f"Location: {destination}" ) assert warning_message in result.output assert success_message in result.output - self.assert_wheel_contents_correct(wheel_location=destination) + self.assert_sdist_contents_correct(sdist_location=destination) @pytest.mark.parametrize( "bad_alias,error_message", @@ -204,7 +213,7 @@ def test_package_pipeline_bad_alias( ): result = CliRunner().invoke( fake_project_cli, - ["pipeline", "package", PIPELINE_NAME, "--alias", bad_alias], + ["pipeline", "package", f"pipelines.{PIPELINE_NAME}", "--alias", bad_alias], ) assert result.exit_code assert error_message in result.output @@ -223,6 +232,7 @@ def test_package_pipeline_invalid_module_path(self, fake_project_cli): def test_package_pipeline_no_config( self, fake_repo_path, fake_project_cli, fake_metadata ): + version = "0.1" result = CliRunner().invoke( fake_project_cli, ["pipeline", "create", PIPELINE_NAME, "--skip-config"], @@ -230,39 +240,46 @@ def test_package_pipeline_no_config( ) assert result.exit_code == 0 result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", PIPELINE_NAME], obj=fake_metadata + fake_project_cli, + ["pipeline", "package", f"pipelines.{PIPELINE_NAME}"], + obj=fake_metadata, ) assert result.exit_code == 0 - assert f"Pipeline `{PIPELINE_NAME}` packaged!" in result.output + assert f"`dummy_package.pipelines.{PIPELINE_NAME}` packaged!" 
in result.output + + sdist_location = fake_repo_path / "dist" + assert f"Location: {sdist_location}" in result.output - wheel_location = fake_repo_path / "src" / "dist" - assert f"Location: {wheel_location}" in result.output + # the sdist contents are slightly different (config shouldn't be included), + # which is why we can't call self.assert_sdist_contents_correct here + sdist_file = sdist_location / _get_sdist_name( + name=PIPELINE_NAME, version=version + ) + assert sdist_file.is_file() + assert len(list((fake_repo_path / "dist").iterdir())) == 1 - # the wheel contents are slightly different (config shouldn't be included), - # which is why we can't call self.assert_wheel_contents_correct here - wheel_file = wheel_location / _get_wheel_name(name=PIPELINE_NAME, version="0.1") - assert wheel_file.is_file() - assert len(list((fake_repo_path / "src" / "dist").iterdir())) == 1 + with tarfile.open(sdist_file, "r") as tar: + sdist_contents = set(tar.getnames()) - # pylint: disable=consider-using-with - wheel_contents = set(ZipFile(str(wheel_file)).namelist()) expected_files = { - f"{PIPELINE_NAME}/__init__.py", - f"{PIPELINE_NAME}/README.md", - f"{PIPELINE_NAME}/nodes.py", - f"{PIPELINE_NAME}/pipeline.py", - "tests/__init__.py", - "tests/test_pipeline.py", + f"{PIPELINE_NAME}-{version}/{PIPELINE_NAME}/__init__.py", + f"{PIPELINE_NAME}-{version}/{PIPELINE_NAME}/README.md", + f"{PIPELINE_NAME}-{version}/{PIPELINE_NAME}/nodes.py", + f"{PIPELINE_NAME}-{version}/{PIPELINE_NAME}/pipeline.py", + f"{PIPELINE_NAME}-{version}/tests/__init__.py", + f"{PIPELINE_NAME}-{version}/tests/test_pipeline.py", } - assert expected_files <= wheel_contents - assert f"{PIPELINE_NAME}/config/parameters.yml" not in wheel_contents + assert expected_files <= sdist_contents + assert f"{PIPELINE_NAME}/config/parameters.yml" not in sdist_contents def test_package_non_existing_pipeline_dir( self, fake_package_path, fake_project_cli, fake_metadata ): result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", "non_existing"], obj=fake_metadata + fake_project_cli, + ["pipeline", "package", "pipelines.non_existing"], + obj=fake_metadata, ) assert result.exit_code == 1 pipeline_dir = fake_package_path / "pipelines" / "non_existing" @@ -276,7 +293,9 @@ def test_package_empty_pipeline_dir( pipeline_dir.mkdir() result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", "empty_dir"], obj=fake_metadata + fake_project_cli, + ["pipeline", "package", "pipelines.empty_dir"], + obj=fake_metadata, ) assert result.exit_code == 1 error_message = f"Error: '{pipeline_dir}' is an empty directory." @@ -314,25 +333,32 @@ def test_package_modular_pipeline_with_nested_parameters( (nested_param_path / "params1.yml").touch() result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", "retail"], obj=fake_metadata + fake_project_cli, + ["pipeline", "package", "pipelines.retail"], + obj=fake_metadata, ) assert result.exit_code == 0 - assert "Pipeline `retail` packaged!" in result.output + assert "`dummy_package.pipelines.retail` packaged!" 
in result.output - wheel_location = fake_repo_path / "src" / "dist" - assert f"Location: {wheel_location}" in result.output + sdist_location = fake_repo_path / "dist" + assert f"Location: {sdist_location}" in result.output - wheel_name = _get_wheel_name(name="retail", version="0.1") - wheel_file = wheel_location / wheel_name - assert wheel_file.is_file() - assert len(list(wheel_location.iterdir())) == 1 + sdist_name = _get_sdist_name(name="retail", version="0.1") + sdist_file = sdist_location / sdist_name + assert sdist_file.is_file() + assert len(list(sdist_location.iterdir())) == 1 - # pylint: disable=consider-using-with - wheel_contents = set(ZipFile(str(wheel_file)).namelist()) - assert "retail/config/parameters/retail/params1.yml" in wheel_contents - assert "retail/config/parameters/retail.yml" in wheel_contents - assert "retail/config/parameters/retail_banking.yml" not in wheel_contents + with tarfile.open(sdist_file, "r") as tar: + sdist_contents = set(tar.getnames()) + assert ( + "retail-0.1/retail/config/parameters/retail/params1.yml" in sdist_contents + ) + assert "retail-0.1/retail/config/parameters/retail.yml" in sdist_contents + assert ( + "retail-0.1/retail/config/parameters/retail_banking.yml" + not in sdist_contents + ) def test_package_pipeline_with_deep_nested_parameters( self, fake_repo_path, fake_project_cli, fake_metadata @@ -372,51 +398,97 @@ def test_package_pipeline_with_deep_nested_parameters( super_deep_nested_param_path.mkdir(parents=True, exist_ok=True) (super_deep_nested_param_path / "params3.yml").touch() result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", "retail"], obj=fake_metadata + fake_project_cli, + ["pipeline", "package", "pipelines.retail"], + obj=fake_metadata, ) assert result.exit_code == 0 - assert "Pipeline `retail` packaged!" in result.output + assert "`dummy_package.pipelines.retail` packaged!" 
in result.output - wheel_location = fake_repo_path / "src" / "dist" - assert f"Location: {wheel_location}" in result.output + sdist_location = fake_repo_path / "dist" + assert f"Location: {sdist_location}" in result.output - wheel_name = _get_wheel_name(name="retail", version="0.1") - wheel_file = wheel_location / wheel_name - assert wheel_file.is_file() - assert len(list(wheel_location.iterdir())) == 1 + sdist_name = _get_sdist_name(name="retail", version="0.1") + sdist_file = sdist_location / sdist_name + assert sdist_file.is_file() + assert len(list(sdist_location.iterdir())) == 1 # pylint: disable=consider-using-with - wheel_contents = set(ZipFile(str(wheel_file)).namelist()) - assert "retail/config/parameters/deep/retail/params1.yml" in wheel_contents - assert "retail/config/parameters/retail/deep/params1.yml" in wheel_contents - assert "retail/config/parameters/retail.yml" in wheel_contents - assert "retail/config/parameters/deep/retail.yml" in wheel_contents - assert "retail/config/parameters/a/b/c/d/retail/params3.yml" in wheel_contents - - def test_pipeline_package_version( + with tarfile.open(sdist_file, "r") as tar: + sdist_contents = set(tar.getnames()) + assert ( + "retail-0.1/retail/config/parameters/deep/retail/params1.yml" + in sdist_contents + ) + assert ( + "retail-0.1/retail/config/parameters/retail/deep/params1.yml" + in sdist_contents + ) + assert "retail-0.1/retail/config/parameters/retail.yml" in sdist_contents + assert "retail-0.1/retail/config/parameters/deep/retail.yml" in sdist_contents + assert ( + "retail-0.1/retail/config/parameters/a/b/c/d/retail/params3.yml" + in sdist_contents + ) + + def test_pipeline_package_default( self, fake_repo_path, fake_package_path, fake_project_cli, fake_metadata ): _pipeline_name = "data_engineering" - # the test version value is set separately in - # features/steps/test_starter//src//pipelines/data_engineering/__init__.py - _test_version = "4.20.69" pipelines_dir = fake_package_path / "pipelines" / _pipeline_name assert pipelines_dir.is_dir() result = CliRunner().invoke( - fake_project_cli, ["pipeline", "package", _pipeline_name], obj=fake_metadata + fake_project_cli, + ["pipeline", "package", f"pipelines.{_pipeline_name}"], + obj=fake_metadata, ) assert result.exit_code == 0 # test for actual version - wheel_location = fake_repo_path / "src" / "dist" - wheel_name = _get_wheel_name(name=_pipeline_name, version=_test_version) - wheel_file = wheel_location / wheel_name + sdist_location = fake_repo_path / "dist" + sdist_name = _get_sdist_name(name=_pipeline_name, version="0.1") + sdist_file = sdist_location / sdist_name + + assert sdist_file.is_file() + assert len(list(sdist_location.iterdir())) == 1 + + def test_pipeline_package_nested_module( + self, fake_project_cli, fake_metadata, fake_repo_path, fake_package_path + ): + CliRunner().invoke( + fake_project_cli, ["pipeline", "create", PIPELINE_NAME], obj=fake_metadata + ) - assert wheel_file.is_file() - assert len(list(wheel_location.iterdir())) == 1 + nested_utils = fake_package_path / "pipelines" / PIPELINE_NAME / "utils" + nested_utils.mkdir(parents=True) + (nested_utils / "__init__.py").touch() + (nested_utils / "useful.py").touch() + + result = CliRunner().invoke( + fake_project_cli, + ["pipeline", "package", f"pipelines.{PIPELINE_NAME}.utils"], + obj=fake_metadata, + ) + assert result.exit_code == 0 + + sdist_location = fake_repo_path / "dist" + sdist_name = _get_sdist_name(name="utils", version="0.1") + sdist_file = sdist_location / sdist_name + + assert 
sdist_file.is_file() + assert len(list(sdist_location.iterdir())) == 1 + + with tarfile.open(sdist_file, "r") as tar: + sdist_contents = set(tar.getnames()) + expected_files = { + "utils-0.1/utils/__init__.py", + "utils-0.1/utils/useful.py", + } + assert expected_files <= sdist_contents + assert f"{PIPELINE_NAME}/pipeline.py" not in sdist_contents @pytest.mark.usefixtures( @@ -436,9 +508,9 @@ def test_pipeline_package_all( # pylint: disable=too-many-locals project_toml_str = textwrap.dedent( f""" [tool.kedro.pipeline.package] - first = {{destination = "{other_dest.as_posix()}"}} - second = {{alias = "ds", env = "local"}} - third = {{}} + "pipelines.first" = {{destination = "{other_dest.as_posix()}"}} + "pipelines.second" = {{alias = "ds", env = "local"}} + "pipelines.third" = {{}} """ ) with pyproject_toml.open(mode="a") as file: diff --git a/tests/framework/cli/pipeline/test_pipeline_pull.py b/tests/framework/cli/pipeline/test_pipeline_pull.py index b37ca75e83..e838a3351f 100644 --- a/tests/framework/cli/pipeline/test_pipeline_pull.py +++ b/tests/framework/cli/pipeline/test_pipeline_pull.py @@ -1,6 +1,7 @@ import filecmp import shutil import textwrap +from pathlib import Path import pytest import toml @@ -8,7 +9,7 @@ from click import ClickException from click.testing import CliRunner -from kedro.framework.cli.pipeline import _get_wheel_name +from kedro.framework.cli.pipeline import _get_sdist_name from kedro.framework.project import settings PIPELINE_NAME = "my_pipeline" @@ -27,7 +28,9 @@ def call_pipeline_package( options = ["--alias", alias] if alias else [] options += ["--destination", str(destination)] if destination else [] result = CliRunner().invoke( - cli, ["pipeline", "package", pipeline_name, *options], obj=metadata + cli, + ["pipeline", "package", f"pipelines.{pipeline_name}", *options], + obj=metadata, ) assert result.exit_code == 0, result.output @@ -50,17 +53,26 @@ def assert_package_files_exist(self, source_path): } @pytest.mark.parametrize("env", [None, "local"]) - @pytest.mark.parametrize("alias", [None, "alias_path"]) - def test_pull_local_whl( + @pytest.mark.parametrize( + "alias, destination", + [ + (None, None), + ("aliased", None), + ("aliased", "pipelines"), + (None, "pipelines"), + ], + ) + def test_pull_local_sdist( self, fake_project_cli, fake_repo_path, fake_package_path, env, alias, + destination, fake_metadata, ): - """Test for pulling a valid wheel file locally.""" + """Test for pulling a valid sdist file locally.""" # pylint: disable=too-many-locals call_pipeline_create(fake_project_cli, fake_metadata) call_pipeline_package(fake_project_cli, fake_metadata) @@ -68,39 +80,38 @@ def test_pull_local_whl( source_path = fake_package_path / "pipelines" / PIPELINE_NAME config_path = ( - fake_repo_path / settings.CONF_ROOT / "base" / "pipelines" / PIPELINE_NAME + fake_repo_path / settings.CONF_SOURCE / "base" / "pipelines" / PIPELINE_NAME ) test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME - # Make sure the files actually deleted before pulling from the wheel file. + # Make sure the files actually deleted before pulling from the sdist file. 
assert not source_path.exists() assert not test_path.exists() assert not config_path.exists() - wheel_file = ( - fake_repo_path - / "src" - / "dist" - / _get_wheel_name(name=PIPELINE_NAME, version="0.1") + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") ) - assert wheel_file.is_file() + assert sdist_file.is_file() options = ["-e", env] if env else [] options += ["--alias", alias] if alias else [] + options += ["--destination", destination] if destination else [] result = CliRunner().invoke( fake_project_cli, - ["pipeline", "pull", str(wheel_file), *options], + ["pipeline", "pull", str(sdist_file), *options], obj=fake_metadata, ) assert result.exit_code == 0, result.output assert "pulled and unpacked" in result.output pipeline_name = alias or PIPELINE_NAME - source_dest = fake_package_path / "pipelines" / pipeline_name - test_dest = fake_repo_path / "src" / "tests" / "pipelines" / pipeline_name + destination = destination or Path() + source_dest = fake_package_path / destination / pipeline_name + test_dest = fake_repo_path / "src" / "tests" / destination / pipeline_name config_env = env or "base" params_config = ( fake_repo_path - / settings.CONF_ROOT + / settings.CONF_SOURCE / config_env / "parameters" / f"{pipeline_name}.yml" @@ -113,17 +124,26 @@ def test_pull_local_whl( assert actual_test_files == expected_test_files @pytest.mark.parametrize("env", [None, "local"]) - @pytest.mark.parametrize("alias", [None, "alias_path"]) - def test_pull_local_whl_compare( + @pytest.mark.parametrize( + "alias, destination", + [ + (None, None), + ("aliased", None), + ("aliased", "pipelines"), + (None, "pipelines"), + ], + ) + def test_pull_local_sdist_compare( self, fake_project_cli, fake_repo_path, fake_package_path, env, alias, + destination, fake_metadata, ): - """Test for pulling a valid wheel file locally, unpack it + """Test for pulling a valid sdist file locally, unpack it into another location and check that unpacked files are identical to the ones in the original modular pipeline. 
""" @@ -136,37 +156,36 @@ def test_pull_local_whl_compare( test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME source_params_config = ( fake_repo_path - / settings.CONF_ROOT + / settings.CONF_SOURCE / "base" / "parameters" / f"{PIPELINE_NAME}.yml" ) - wheel_file = ( - fake_repo_path - / "src" - / "dist" - / _get_wheel_name(name=pipeline_name, version="0.1") + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=pipeline_name, version="0.1") ) - assert wheel_file.is_file() + assert sdist_file.is_file() options = ["-e", env] if env else [] options += ["--alias", alias] if alias else [] + options += ["--destination", destination] if destination else [] result = CliRunner().invoke( fake_project_cli, - ["pipeline", "pull", str(wheel_file), *options], + ["pipeline", "pull", str(sdist_file), *options], obj=fake_metadata, ) assert result.exit_code == 0, result.output assert "pulled and unpacked" in result.output pipeline_name = alias or pipeline_name - source_dest = fake_package_path / "pipelines" / pipeline_name - test_dest = fake_repo_path / "src" / "tests" / "pipelines" / pipeline_name + destination = destination or Path() + source_dest = fake_package_path / destination / pipeline_name + test_dest = fake_repo_path / "src" / "tests" / destination / pipeline_name config_env = env or "base" dest_params_config = ( fake_repo_path - / settings.CONF_ROOT + / settings.CONF_SOURCE / config_env / "parameters" / f"{pipeline_name}.yml" @@ -176,7 +195,107 @@ def test_pull_local_whl_compare( assert not filecmp.dircmp(test_path, test_dest).diff_files assert source_params_config.read_bytes() == dest_params_config.read_bytes() - def test_pipeline_alias_refactors_imports( + def test_pipeline_pull_same_alias_package_name( + self, + fake_project_cli, + fake_repo_path, + fake_package_path, + fake_metadata, + ): + call_pipeline_create(fake_project_cli, fake_metadata) + call_pipeline_package(fake_project_cli, fake_metadata) + + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + ) + + pipeline_name = PIPELINE_NAME + destination = "tools" + + result = CliRunner().invoke( + fake_project_cli, + [ + "pipeline", + "pull", + str(sdist_file), + "--destination", + destination, + "--alias", + pipeline_name, + ], + obj=fake_metadata, + ) + assert result.exit_code == 0, result.stderr + assert "pulled and unpacked" in result.output + + source_dest = fake_package_path / destination / pipeline_name + test_dest = fake_repo_path / "src" / "tests" / destination / pipeline_name + config_env = "base" + params_config = ( + fake_repo_path + / settings.CONF_SOURCE + / config_env + / "parameters" + / f"{pipeline_name}.yml" + ) + + self.assert_package_files_exist(source_dest) + assert params_config.is_file() + actual_test_files = {f.name for f in test_dest.iterdir()} + expected_test_files = {"__init__.py", "test_pipeline.py"} + assert actual_test_files == expected_test_files + + def test_pipeline_pull_nested_destination( + self, + fake_project_cli, + fake_repo_path, + fake_package_path, + fake_metadata, + ): + call_pipeline_create(fake_project_cli, fake_metadata) + call_pipeline_package(fake_project_cli, fake_metadata) + + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") + ) + + pipeline_name = PIPELINE_NAME + destination = "pipelines/nested" + + result = CliRunner().invoke( + fake_project_cli, + [ + "pipeline", + "pull", + str(sdist_file), + "--destination", + destination, + "--alias", + pipeline_name, + ], + 
obj=fake_metadata, + ) + assert result.exit_code == 0, result.stderr + assert "pulled and unpacked" in result.output + + source_dest = fake_package_path / destination / pipeline_name + test_dest = fake_repo_path / "src" / "tests" / destination / pipeline_name + config_env = "base" + params_config = ( + fake_repo_path + / settings.CONF_SOURCE + / config_env + / "parameters" + / f"{pipeline_name}.yml" + ) + + self.assert_package_files_exist(source_dest) + assert params_config.is_file() + actual_test_files = {f.name for f in test_dest.iterdir()} + expected_test_files = {"__init__.py", "test_pipeline.py"} + assert actual_test_files == expected_test_files + + def test_pipeline_alias_refactors_imports( # pylint: disable=too-many-locals self, fake_project_cli, fake_package_path, fake_repo_path, fake_metadata ): call_pipeline_create(fake_project_cli, fake_metadata) @@ -189,32 +308,37 @@ def test_pipeline_alias_refactors_imports( package_alias = "alpha" pull_alias = "beta" + pull_destination = "pipelines/lib" call_pipeline_package( cli=fake_project_cli, metadata=fake_metadata, alias=package_alias ) - wheel_file = ( - fake_repo_path - / "src" - / "dist" - / _get_wheel_name(name=package_alias, version="0.1") + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=package_alias, version="0.1") ) CliRunner().invoke( - fake_project_cli, ["pipeline", "pull", str(wheel_file)], obj=fake_metadata + fake_project_cli, ["pipeline", "pull", str(sdist_file)], obj=fake_metadata ) CliRunner().invoke( fake_project_cli, - ["pipeline", "pull", str(wheel_file), "--alias", pull_alias], + [ + "pipeline", + "pull", + str(sdist_file), + "--alias", + pull_alias, + "--destination", + pull_destination, + ], obj=fake_metadata, ) - - for alias in (package_alias, pull_alias): - path = fake_package_path / "pipelines" / alias / "pipeline.py" + pull = f"pipelines.lib.{pull_alias}" + for alias in (package_alias, pull): + alias_path = Path(*alias.split(".")) + path = fake_package_path / alias_path / "pipeline.py" file_content = path.read_text() - expected_stmt = ( - f"import {fake_metadata.package_name}.pipelines.{alias}.nodes" - ) + expected_stmt = f"import {fake_metadata.package_name}.{alias}.nodes" assert expected_stmt in file_content def test_pipeline_pull_from_aliased_pipeline_conflicting_name( @@ -230,22 +354,19 @@ def test_pipeline_pull_from_aliased_pipeline_conflicting_name( call_pipeline_package( cli=fake_project_cli, metadata=fake_metadata, alias=package_name ) - wheel_file = ( - fake_repo_path - / "src" - / "dist" - / _get_wheel_name(name=package_name, version="0.1") + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=package_name, version="0.1") ) - assert wheel_file.is_file() + assert sdist_file.is_file() result = CliRunner().invoke( - fake_project_cli, ["pipeline", "pull", str(wheel_file)], obj=fake_metadata + fake_project_cli, ["pipeline", "pull", str(sdist_file)], obj=fake_metadata ) assert result.exit_code == 0, result.output - path = fake_package_path / "pipelines" / package_name / "pipeline.py" + path = fake_package_path / package_name / "pipeline.py" file_content = path.read_text() - expected_stmt = f"import {package_name}.pipelines.{package_name}.nodes" + expected_stmt = f"import {package_name}.{package_name}.nodes" assert expected_stmt in file_content def test_pipeline_pull_as_aliased_pipeline_conflicting_name( @@ -259,29 +380,26 @@ def test_pipeline_pull_as_aliased_pipeline_conflicting_name( f.write(import_stmt) call_pipeline_package(cli=fake_project_cli, metadata=fake_metadata) - 
wheel_file = ( - fake_repo_path - / "src" - / "dist" - / _get_wheel_name(name=PIPELINE_NAME, version="0.1") + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") ) - assert wheel_file.is_file() + assert sdist_file.is_file() result = CliRunner().invoke( fake_project_cli, - ["pipeline", "pull", str(wheel_file), "--alias", package_name], + ["pipeline", "pull", str(sdist_file), "--alias", package_name], obj=fake_metadata, ) assert result.exit_code == 0, result.output - path = fake_package_path / "pipelines" / package_name / "pipeline.py" + path = fake_package_path / package_name / "pipeline.py" file_content = path.read_text() - expected_stmt = f"import {package_name}.pipelines.{package_name}.nodes" + expected_stmt = f"import {package_name}.{package_name}.nodes" assert expected_stmt in file_content - def test_pull_whl_fs_args( + def test_pull_sdist_fs_args( self, fake_project_cli, fake_repo_path, mocker, tmp_path, fake_metadata ): - """Test for pulling a wheel file with custom fs_args specified.""" + """Test for pulling a sdist file with custom fs_args specified.""" call_pipeline_create(fake_project_cli, fake_metadata) call_pipeline_package(fake_project_cli, fake_metadata) call_pipeline_delete(fake_project_cli, fake_metadata) @@ -291,49 +409,45 @@ def test_pull_whl_fs_args( yaml.dump({"fs_arg_1": 1, "fs_arg_2": {"fs_arg_2_nested_1": 2}}, f) mocked_filesystem = mocker.patch("fsspec.filesystem") - wheel_file = ( - fake_repo_path - / "src" - / "dist" - / _get_wheel_name(name=PIPELINE_NAME, version="0.1") + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") ) options = ["--fs-args", str(fs_args_config)] CliRunner().invoke( - fake_project_cli, ["pipeline", "pull", str(wheel_file), *options] + fake_project_cli, ["pipeline", "pull", str(sdist_file), *options] ) mocked_filesystem.assert_called_once_with( "file", fs_arg_1=1, fs_arg_2=dict(fs_arg_2_nested_1=2) ) - def test_pull_two_dist_info( + def test_pull_two_egg_info( self, fake_project_cli, fake_repo_path, mocker, tmp_path, fake_metadata ): - """ - Test for pulling a wheel file with more than one dist-info directory. + """Test for pulling an sdist file with more than one + dist-info directory. """ call_pipeline_create(fake_project_cli, fake_metadata) call_pipeline_package(fake_project_cli, fake_metadata) - wheel_file = ( - fake_repo_path - / "src" - / "dist" - / _get_wheel_name(name=PIPELINE_NAME, version="0.1") + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") ) - assert wheel_file.is_file() + assert sdist_file.is_file() - (tmp_path / "dummy.dist-info").mkdir() + (tmp_path / f"{PIPELINE_NAME}-0.1" / "dummy.egg-info").mkdir(parents=True) mocker.patch( "kedro.framework.cli.pipeline.tempfile.TemporaryDirectory", return_value=tmp_path, ) result = CliRunner().invoke( - fake_project_cli, ["pipeline", "pull", str(wheel_file)], obj=fake_metadata + fake_project_cli, + ["pipeline", "pull", str(sdist_file)], + obj=fake_metadata, ) assert result.exit_code - assert "Error: More than 1 or no dist-info files found" in result.output + assert "Error: More than 1 or no egg-info files found" in result.output @pytest.mark.parametrize("env", [None, "local"]) @pytest.mark.parametrize("alias", [None, "alias_path"]) @@ -346,8 +460,8 @@ def test_pull_tests_missing( alias, fake_metadata, ): - """Test for pulling a valid wheel file locally, - but `tests` directory is missing from the wheel file. 
+        """Test for pulling a valid sdist file locally,
+        but the `tests` directory is missing from the sdist file.
         """
         # pylint: disable=too-many-locals
         call_pipeline_create(fake_project_cli, fake_metadata)
@@ -360,39 +474,36 @@ def test_pull_tests_missing(
         source_path = fake_package_path / "pipelines" / PIPELINE_NAME
         source_params_config = (
             fake_repo_path
-            / settings.CONF_ROOT
+            / settings.CONF_SOURCE
             / "base"
             / "parameters"
             / f"{PIPELINE_NAME}.yml"
         )
-        # Make sure the files actually deleted before pulling from the wheel file.
+        # Make sure the files are actually deleted before pulling from the sdist file.
         assert not source_path.exists()
         assert not source_params_config.exists()
 
-        wheel_file = (
-            fake_repo_path
-            / "src"
-            / "dist"
-            / _get_wheel_name(name=PIPELINE_NAME, version="0.1")
+        sdist_file = (
+            fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1")
         )
-        assert wheel_file.is_file()
+        assert sdist_file.is_file()
 
         options = ["-e", env] if env else []
         options += ["--alias", alias] if alias else []
         result = CliRunner().invoke(
             fake_project_cli,
-            ["pipeline", "pull", str(wheel_file), *options],
+            ["pipeline", "pull", str(sdist_file), *options],
             obj=fake_metadata,
         )
         assert result.exit_code == 0
 
         pipeline_name = alias or PIPELINE_NAME
-        source_dest = fake_package_path / "pipelines" / pipeline_name
-        test_dest = fake_repo_path / "src" / "tests" / "pipelines" / pipeline_name
+        source_dest = fake_package_path / pipeline_name
+        test_dest = fake_repo_path / "src" / "tests" / pipeline_name
         config_env = env or "base"
         params_config = (
             fake_repo_path
-            / settings.CONF_ROOT
+            / settings.CONF_SOURCE
             / config_env
             / "parameters"
             / f"{pipeline_name}.yml"
@@ -414,14 +525,14 @@ def test_pull_config_missing(
         fake_metadata,
     ):
         """
-        Test for pulling a valid wheel file locally, but `config` directory is missing
-        from the wheel file.
+        Test for pulling a valid sdist file locally, but the `config` directory
+        is missing from the sdist file.
         """
         # pylint: disable=too-many-locals
         call_pipeline_create(fake_project_cli, fake_metadata)
         source_params_config = (
             fake_repo_path
-            / settings.CONF_ROOT
+            / settings.CONF_SOURCE
             / "base"
             / "parameters"
             / f"{PIPELINE_NAME}.yml"
@@ -432,34 +543,31 @@ def test_pull_config_missing(
         source_path = fake_package_path / "pipelines" / PIPELINE_NAME
         test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME
-        # Make sure the files actually deleted before pulling from the wheel file.
+        # Make sure the files are actually deleted before pulling from the sdist file.
assert not source_path.exists() assert not test_path.exists() - wheel_file = ( - fake_repo_path - / "src" - / "dist" - / _get_wheel_name(name=PIPELINE_NAME, version="0.1") + sdist_file = ( + fake_repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") ) - assert wheel_file.is_file() + assert sdist_file.is_file() options = ["-e", env] if env else [] options += ["--alias", alias] if alias else [] result = CliRunner().invoke( fake_project_cli, - ["pipeline", "pull", str(wheel_file), *options], + ["pipeline", "pull", str(sdist_file), *options], obj=fake_metadata, ) assert result.exit_code == 0 pipeline_name = alias or PIPELINE_NAME - source_dest = fake_package_path / "pipelines" / pipeline_name - test_dest = fake_repo_path / "src" / "tests" / "pipelines" / pipeline_name + source_dest = fake_package_path / pipeline_name + test_dest = fake_repo_path / "src" / "tests" / pipeline_name config_env = env or "base" dest_params_config = ( fake_repo_path - / settings.CONF_ROOT + / settings.CONF_SOURCE / config_env / "parameters" / f"{pipeline_name}.yml" @@ -485,22 +593,23 @@ def test_pull_from_pypi( fake_metadata, ): """ - Test for pulling a valid wheel file from pypi. + Test for pulling a valid sdist file from pypi. """ # pylint: disable=too-many-locals call_pipeline_create(fake_project_cli, fake_metadata) - # We mock the `pip download` call, and manually create a package wheel file + # We mock the `pip download` call, and manually create a package sdist file # to simulate the pypi scenario instead call_pipeline_package(fake_project_cli, fake_metadata, destination=tmp_path) - wheel_file = tmp_path / _get_wheel_name(name=PIPELINE_NAME, version="0.1") - assert wheel_file.is_file() + version = "0.1" + sdist_file = tmp_path / _get_sdist_name(name=PIPELINE_NAME, version=version) + assert sdist_file.is_file() call_pipeline_delete(fake_project_cli, fake_metadata) source_path = fake_package_path / "pipelines" / PIPELINE_NAME test_path = fake_repo_path / "src" / "tests" / "pipelines" / PIPELINE_NAME source_params_config = ( fake_repo_path - / settings.CONF_ROOT + / settings.CONF_SOURCE / "base" / "parameters" / f"{PIPELINE_NAME}.yml" @@ -520,23 +629,30 @@ def test_pull_from_pypi( options += ["--alias", alias] if alias else [] result = CliRunner().invoke( fake_project_cli, - ["pipeline", "pull", PIPELINE_NAME, *options], + ["pipeline", "pull", f"{PIPELINE_NAME}-{version}", *options], obj=fake_metadata, ) assert result.exit_code == 0 assert "pulled and unpacked" in result.output python_call_mock.assert_called_once_with( - "pip", ["download", "--no-deps", "--dest", str(tmp_path), PIPELINE_NAME] + "pip", + [ + "download", + "--no-deps", + "--dest", + str(tmp_path), + f"{PIPELINE_NAME}-{version}", + ], ) pipeline_name = alias or PIPELINE_NAME - source_dest = fake_package_path / "pipelines" / pipeline_name - test_dest = fake_repo_path / "src" / "tests" / "pipelines" / pipeline_name + source_dest = fake_package_path / pipeline_name + test_dest = fake_repo_path / "src" / "tests" / pipeline_name config_env = env or "base" dest_params_config = ( fake_repo_path - / settings.CONF_ROOT + / settings.CONF_SOURCE / config_env / "parameters" / f"{pipeline_name}.yml" @@ -579,14 +695,14 @@ def test_invalid_pull_from_pypi( assert pypi_error_message in result.stdout - def test_pull_from_pypi_more_than_one_wheel_file( + def test_pull_from_pypi_more_than_one_sdist_file( self, fake_project_cli, mocker, tmp_path, fake_metadata ): """ - Test for pulling a wheel file with `pip download`, but there are more than one wheel 
+ Test for pulling a sdist file with `pip download`, but there are more than one sdist file to unzip. """ - # We mock the `pip download` call, and manually create a package wheel file + # We mock the `pip download` call, and manually create a package sdist file # to simulate the pypi scenario instead call_pipeline_create(fake_project_cli, fake_metadata) call_pipeline_package(fake_project_cli, fake_metadata, destination=tmp_path) @@ -603,14 +719,14 @@ def test_pull_from_pypi_more_than_one_wheel_file( ) assert result.exit_code - assert "Error: More than 1 or no wheel files found:" in result.output + assert "Error: More than 1 or no sdist files found:" in result.output def test_pull_unsupported_protocol_by_fsspec( self, fake_project_cli, fake_metadata, tmp_path, mocker ): protocol = "unsupported" exception_message = f"Protocol not known: {protocol}" - error_message = "Error: More than 1 or no wheel files found:" + error_message = "Error: More than 1 or no sdist files found:" package_path = f"{protocol}://{PIPELINE_NAME}" python_call_mock = mocker.patch("kedro.framework.cli.pipeline.python_call") @@ -643,23 +759,18 @@ class TestPipelinePullFromManifest: def test_pipeline_pull_all( # pylint: disable=too-many-locals self, fake_repo_path, fake_project_cli, fake_metadata, mocker ): - # pylint: disable=import-outside-toplevel + # pylint: disable=import-outside-toplevel, line-too-long from kedro.framework.cli import pipeline spy = mocker.spy(pipeline, "_pull_package") pyproject_toml = fake_repo_path / "pyproject.toml" - wheel_file = ( - lambda name: fake_repo_path - / "src" - / "dist" - / _get_wheel_name(name=name, version="0.1") - ) + sdist_file = str(fake_repo_path / "dist" / _get_sdist_name("{}", "0.1")) project_toml_str = textwrap.dedent( f""" [tool.kedro.pipeline.pull] - "{wheel_file("first")}" = {{alias = "dp"}} - "{wheel_file("second")}" = {{alias = "ds", env = "local"}} - "{wheel_file("third")}" = {{}} + "{sdist_file.format("first")}" = {{alias = "dp", destination = "pipelines"}} + "{sdist_file.format("second")}" = {{alias = "ds", destination = "pipelines", env = "local"}} + "{sdist_file.format("third")}" = {{}} """ ) @@ -681,8 +792,8 @@ def test_pipeline_pull_all( # pylint: disable=too-many-locals build_config = toml.loads(project_toml_str) pull_manifest = build_config["tool"]["kedro"]["pipeline"]["pull"] - for wheel_file, pull_specs in pull_manifest.items(): - expected_call = mocker.call(wheel_file, fake_metadata, **pull_specs) + for sdist_file, pull_specs in pull_manifest.items(): + expected_call = mocker.call(sdist_file, fake_metadata, **pull_specs) assert expected_call in spy.call_args_list def test_pipeline_pull_all_empty_toml( diff --git a/tests/framework/cli/pipeline/test_pipeline_requirements.py b/tests/framework/cli/pipeline/test_pipeline_requirements.py index 7a7f04f7ce..eb17f60422 100644 --- a/tests/framework/cli/pipeline/test_pipeline_requirements.py +++ b/tests/framework/cli/pipeline/test_pipeline_requirements.py @@ -1,7 +1,7 @@ import pytest from click.testing import CliRunner -from kedro.framework.cli.pipeline import _get_wheel_name, _safe_parse_requirements +from kedro.framework.cli.pipeline import _get_sdist_name, _safe_parse_requirements PIPELINE_NAME = "my_pipeline" @@ -41,7 +41,7 @@ class TestPipelineRequirements: - create a pipeline with some sort of requirements.txt - package the pipeline - delete the pipeline and pull in the packaged one - - assert the project's modified requirements.in is as expected + - assert the project's modified requirements.txt is as expected 
""" def call_pipeline_create(self, cli, metadata): @@ -53,7 +53,7 @@ def call_pipeline_create(self, cli, metadata): def call_pipeline_package(self, cli, metadata): result = CliRunner().invoke( cli, - ["pipeline", "package", PIPELINE_NAME], + ["pipeline", "package", f"pipelines.{PIPELINE_NAME}"], obj=metadata, ) assert result.exit_code == 0 @@ -65,17 +65,14 @@ def call_pipeline_delete(self, cli, metadata): assert result.exit_code == 0 def call_pipeline_pull(self, cli, metadata, repo_path): - wheel_file = ( - repo_path - / "src" - / "dist" - / _get_wheel_name(name=PIPELINE_NAME, version="0.1") + sdist_file = ( + repo_path / "dist" / _get_sdist_name(name=PIPELINE_NAME, version="0.1") ) - assert wheel_file.is_file() + assert sdist_file.is_file() result = CliRunner().invoke( cli, - ["pipeline", "pull", str(wheel_file)], + ["pipeline", "pull", str(sdist_file)], obj=metadata, ) assert result.exit_code == 0 @@ -83,8 +80,7 @@ def call_pipeline_pull(self, cli, metadata, repo_path): def test_existing_complex_project_requirements_txt( self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path ): - """Pipeline requirements.txt and project requirements.txt, but no project - requirements.in.""" + """Pipeline requirements.txt and project requirements.txt.""" project_requirements_txt = fake_repo_path / "src" / "requirements.txt" with open(project_requirements_txt, "a", encoding="utf-8") as file: file.write(COMPLEX_REQUIREMENTS) @@ -103,21 +99,19 @@ def test_existing_complex_project_requirements_txt( self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) packaged_requirements = _safe_parse_requirements(SIMPLE_REQUIREMENTS) - project_requirements_in = fake_repo_path / "src" / "requirements.in" pulled_requirements = _safe_parse_requirements( - project_requirements_in.read_text() + project_requirements_txt.read_text() ) - # requirements.in afterwards should be the requirements that already existed in + # The project requirements.txt afterwards should be the requirements that already existed in # project requirements.txt + those pulled in from pipeline requirements.txt. # Unparseable COMPLEX_REQUIREMENTS should still be there. assert pulled_requirements == existing_requirements | packaged_requirements - assert COMPLEX_REQUIREMENTS in project_requirements_in.read_text() + assert COMPLEX_REQUIREMENTS in project_requirements_txt.read_text() def test_existing_project_requirements_txt( self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path ): - """Pipeline requirements.txt and project requirements.txt, but no project - requirements.in.""" + """Pipeline requirements.txt and project requirements.txt.""" project_requirements_txt = fake_repo_path / "src" / "requirements.txt" existing_requirements = _safe_parse_requirements( project_requirements_txt.read_text() @@ -134,76 +128,23 @@ def test_existing_project_requirements_txt( self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) packaged_requirements = _safe_parse_requirements(SIMPLE_REQUIREMENTS) - project_requirements_in = fake_repo_path / "src" / "requirements.in" - pulled_requirements = _safe_parse_requirements( - project_requirements_in.read_text() - ) - # requirements.in afterwards should be the requirements that already existed in - # project requirements.txt + those pulled in from pipeline requirements.txt. 
- assert pulled_requirements == existing_requirements | packaged_requirements - - def test_existing_complex_project_requirements_in( - self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path - ): - """Pipeline requirements.txt and a pre-existing project requirements.in.""" - project_requirements_in = fake_repo_path / "src" / "requirements.in" - project_requirements_in.write_text(COMPLEX_REQUIREMENTS) - self.call_pipeline_create(fake_project_cli, fake_metadata) - pipeline_requirements_txt = ( - fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" - ) - pipeline_requirements_txt.write_text(SIMPLE_REQUIREMENTS) - - self.call_pipeline_package(fake_project_cli, fake_metadata) - self.call_pipeline_delete(fake_project_cli, fake_metadata) - self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) - - packaged_requirements = _safe_parse_requirements(SIMPLE_REQUIREMENTS) - existing_requirements = _safe_parse_requirements(COMPLEX_REQUIREMENTS) - pulled_requirements = _safe_parse_requirements( - project_requirements_in.read_text() - ) - # requirements.in afterwards should be the requirements that already existed in - # project requirements.txt + those pulled in from pipeline requirements.txt. - # Unparseable COMPLEX_REQUIREMENTS should still be there. - assert pulled_requirements == existing_requirements | packaged_requirements - assert COMPLEX_REQUIREMENTS in project_requirements_in.read_text() - - def test_existing_project_requirements_in( - self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path - ): - """Pipeline requirements.txt and a pre-existing project requirements.in.""" - project_requirements_in = fake_repo_path / "src" / "requirements.in" - initial_dependency = "some_package==0.1.0" - project_requirements_in.write_text(initial_dependency) - self.call_pipeline_create(fake_project_cli, fake_metadata) - pipeline_requirements_txt = ( - fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt" - ) - pipeline_requirements_txt.write_text(SIMPLE_REQUIREMENTS) - - self.call_pipeline_package(fake_project_cli, fake_metadata) - self.call_pipeline_delete(fake_project_cli, fake_metadata) - self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) - - packaged_requirements = _safe_parse_requirements(SIMPLE_REQUIREMENTS) - existing_requirements = _safe_parse_requirements(initial_dependency) pulled_requirements = _safe_parse_requirements( - project_requirements_in.read_text() + project_requirements_txt.read_text() ) - # requirements.in afterwards should be the requirements that already existed in + # Project requirements.txt afterwards should be the requirements that already existed in # project requirements.txt + those pulled in from pipeline requirements.txt. 
assert pulled_requirements == existing_requirements | packaged_requirements - def test_missing_project_requirements_in_and_txt( + def test_missing_project_requirements_txt( self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path, ): - """Pipeline requirements.txt without requirements.in or requirements.txt at + """Pipeline requirements.txt without requirements.txt at project level.""" + # Remove project requirements.txt project_requirements_txt = fake_repo_path / "src" / "requirements.txt" project_requirements_txt.unlink() @@ -220,12 +161,9 @@ def test_missing_project_requirements_in_and_txt( self.call_pipeline_delete(fake_project_cli, fake_metadata) self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) - project_requirements_in = fake_repo_path / "src" / "requirements.in" - - assert not project_requirements_txt.exists() - assert project_requirements_in.exists() + assert project_requirements_txt.exists() pulled_requirements = _safe_parse_requirements( - project_requirements_in.read_text() + project_requirements_txt.read_text() ) assert packaged_requirements == pulled_requirements @@ -235,7 +173,7 @@ def test_no_requirements( fake_metadata, fake_repo_path, ): - """No pipeline requirements.txt, and also no requirements.in or requirements.txt + """No pipeline requirements.txt, and also no requirements.txt at project level.""" # Remove project requirements.txt project_requirements_txt = fake_repo_path / "src" / "requirements.txt" @@ -246,10 +184,7 @@ def test_no_requirements( self.call_pipeline_delete(fake_project_cli, fake_metadata) self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) - project_requirements_in = fake_repo_path / "src" / "requirements.in" - project_requirements_txt = fake_repo_path / "src" / "requirements.txt" assert not project_requirements_txt.exists() - assert not project_requirements_in.exists() def test_all_requirements_already_covered( self, fake_project_cli, fake_metadata, fake_repo_path, fake_package_path @@ -268,30 +203,37 @@ def test_all_requirements_already_covered( self.call_pipeline_delete(fake_project_cli, fake_metadata) self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) - # requirements.txt expected to be copied into requirements.in without any + # Pipeline requirements.txt expected to be copied into project requirements.txt without any # addition - project_requirements_in = fake_repo_path / "src" / "requirements.in" - assert project_requirements_in.exists() - assert project_requirements_in.read_text() == SIMPLE_REQUIREMENTS + assert project_requirements_txt.read_text() == SIMPLE_REQUIREMENTS def test_no_pipeline_requirements_txt( self, fake_project_cli, fake_metadata, fake_repo_path ): - """No pipeline requirements.txt and no project requirements.in does not - create project requirements.in.""" + """No pipeline requirements.txt and no project requirements.txt does not + create project requirements.txt.""" + + # Remove project requirements.txt + project_requirements_txt = fake_repo_path / "src" / "requirements.txt" + project_requirements_txt.unlink() + self.call_pipeline_create(fake_project_cli, fake_metadata) self.call_pipeline_package(fake_project_cli, fake_metadata) self.call_pipeline_delete(fake_project_cli, fake_metadata) self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path) - project_requirements_in = fake_repo_path / "src" / "requirements.in" - assert not project_requirements_in.exists() + assert not project_requirements_txt.exists() def 
test_empty_pipeline_requirements_txt(
         self, fake_project_cli, fake_metadata, fake_package_path, fake_repo_path
     ):
-        """Empty pipeline requirements.txt and no project requirements.in does not
-        create project requirements.in."""
+        """Empty pipeline requirements.txt and no project requirements.txt does not
+        create project requirements.txt."""
+
+        # Remove project requirements.txt
+        project_requirements_txt = fake_repo_path / "src" / "requirements.txt"
+        project_requirements_txt.unlink()
+
         self.call_pipeline_create(fake_project_cli, fake_metadata)
         pipeline_requirements_txt = (
             fake_package_path / "pipelines" / PIPELINE_NAME / "requirements.txt"
@@ -301,8 +243,7 @@ def test_empty_pipeline_requirements_txt(
         self.call_pipeline_delete(fake_project_cli, fake_metadata)
         self.call_pipeline_pull(fake_project_cli, fake_metadata, fake_repo_path)
 
-        project_requirements_in = fake_repo_path / "src" / "requirements.in"
-        assert not project_requirements_in.exists()
+        assert not project_requirements_txt.exists()
 
     @pytest.mark.parametrize("requirement", COMPLEX_REQUIREMENTS.splitlines())
     def test_complex_requirements(
@@ -318,7 +259,7 @@ def test_complex_requirements(
 
         result = CliRunner().invoke(
             fake_project_cli,
-            ["pipeline", "package", PIPELINE_NAME],
+            ["pipeline", "package", f"pipelines.{PIPELINE_NAME}"],
             obj=fake_metadata,
         )
         assert result.exit_code == 1
diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py
index 16ec565c01..1e17cdc06f 100644
--- a/tests/framework/cli/test_catalog.py
+++ b/tests/framework/cli/test_catalog.py
@@ -174,7 +174,7 @@ def catalog_path(request, fake_repo_path):
     def test_pipeline_argument_is_required(self, fake_project_cli):
         result = CliRunner().invoke(fake_project_cli, ["catalog", "create"])
         assert result.exit_code
-        expected_output = "Error: Missing option '--pipeline'."
+        expected_output = "Error: Missing option '--pipeline' / '-p'."
assert expected_output in result.output @pytest.mark.usefixtures("fake_load_context") diff --git a/tests/framework/cli/test_cli.py b/tests/framework/cli/test_cli.py index 7b8ae426d5..6fc169b57e 100644 --- a/tests/framework/cli/test_cli.py +++ b/tests/framework/cli/test_cli.py @@ -10,7 +10,7 @@ from pytest import fixture, mark, raises from kedro import __version__ as version -from kedro.framework.cli import get_project_context, load_entry_points +from kedro.framework.cli import load_entry_points from kedro.framework.cli.catalog import catalog_cli from kedro.framework.cli.cli import KedroCLI, _init_plugins, cli from kedro.framework.cli.jupyter import jupyter_cli @@ -71,24 +71,6 @@ def fake_session(mocker): return mocked_session -# pylint:disable=too-few-public-methods -class DummyContext: - def __init__(self): - self.config_loader = "config_loader" - - catalog = "catalog" - pipeline = "pipeline" - project_name = "dummy_name" - project_path = "dummy_path" - - -@fixture -def mocked_load_context(mocker): - return mocker.patch( - "kedro.framework.cli.cli.load_context", return_value=DummyContext() - ) - - class TestCliCommands: def test_cli(self): """Run `kedro` without arguments.""" @@ -322,23 +304,6 @@ def test_update_value_nested_dict(self): assert actual == expected -@mark.usefixtures("mocked_load_context") -class TestGetProjectContext: - def test_get_context_without_project_path(self, mocked_load_context): - dummy_context = get_project_context("context") - mocked_load_context.assert_called_once_with(Path.cwd()) - assert isinstance(dummy_context, DummyContext) - - def test_get_context_with_project_path(self, tmpdir, mocked_load_context): - dummy_project_path = tmpdir.mkdir("dummy_project") - dummy_context = get_project_context("context", project_path=dummy_project_path) - mocked_load_context.assert_called_once_with(dummy_project_path) - assert isinstance(dummy_context, DummyContext) - - def test_verbose(self): - assert not get_project_context("verbose") - - class TestEntryPoints: def test_project_groups(self, entry_points, entry_point): entry_point.load.return_value = "groups" @@ -552,22 +517,11 @@ def test_run_with_pipeline_filters( assert isinstance(runner, SequentialRunner) assert not runner._is_async - def test_with_sequential_runner_and_parallel_flag( - self, fake_project_cli, fake_session - ): - result = CliRunner().invoke( - fake_project_cli, ["run", "--parallel", "--runner=SequentialRunner"] - ) - assert result.exit_code - assert "Please use either --parallel or --runner" in result.stdout - - fake_session.return_value.run.assert_not_called() - - def test_run_successfully_parallel_via_flag( + def test_run_successfully_parallel( self, fake_project_cli, fake_metadata, fake_session, mocker ): result = CliRunner().invoke( - fake_project_cli, ["run", "--parallel"], obj=fake_metadata + fake_project_cli, ["run", "--runner=ParallelRunner"], obj=fake_metadata ) assert not result.exit_code fake_session.run.assert_called_once_with( @@ -586,17 +540,6 @@ def test_run_successfully_parallel_via_flag( assert isinstance(runner, ParallelRunner) assert not runner._is_async - def test_run_successfully_parallel_via_name( - self, fake_project_cli, fake_metadata, fake_session - ): - result = CliRunner().invoke( - fake_project_cli, ["run", "--runner=ParallelRunner"], obj=fake_metadata - ) - assert not result.exit_code - runner = fake_session.run.call_args_list[0][1]["runner"] - assert isinstance(runner, ParallelRunner) - assert not runner._is_async - def test_run_async(self, fake_project_cli, fake_metadata, 
fake_session): result = CliRunner().invoke( fake_project_cli, ["run", "--async"], obj=fake_metadata diff --git a/tests/framework/cli/test_jupyter.py b/tests/framework/cli/test_jupyter.py index 31da834915..50134bb7c8 100644 --- a/tests/framework/cli/test_jupyter.py +++ b/tests/framework/cli/test_jupyter.py @@ -155,7 +155,7 @@ def test_fail_no_jupyter_core(self, fake_project_cli, mocker): assert result.exit_code error = ( "Module `jupyter_core` not found. Make sure to install required project " - "dependencies by running the `kedro install` command first." + "dependencies by running the `pip install -r src/requirements.txt` command first." ) assert error in result.output @@ -220,7 +220,7 @@ def test_fail_no_jupyter_core(self, fake_project_cli, mocker): assert result.exit_code error = ( "Module `jupyter_core` not found. Make sure to install required project " - "dependencies by running the `kedro install` command first." + "dependencies by running the `pip install -r src/requirements.txt` command first." ) assert error in result.output diff --git a/tests/framework/cli/test_project.py b/tests/framework/cli/test_project.py index f9635ed111..3d9fe17e55 100644 --- a/tests/framework/cli/test_project.py +++ b/tests/framework/cli/test_project.py @@ -244,226 +244,6 @@ def test_pythonpath_env_var( assert mocked_environ == {"PYTHONPATH": str(fake_repo_path / "src")} -@pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log", "fake_copyfile") -class TestInstallCommand: - def test_install_compile_default( - self, - python_call_mock, - fake_project_cli, - fake_repo_path, - fake_copyfile, - mocker, - fake_metadata, - ): - """Test that the requirements are compiled by default - if requirements.in doesn't exist""" - mocker.patch("kedro.framework.cli.project.os").name = "posix" - result = CliRunner().invoke(fake_project_cli, ["install"], obj=fake_metadata) - assert not result.exit_code, result.output - assert "Requirements installed!" in result.output - - requirements_in = fake_repo_path / "src" / "requirements.in" - requirements_txt = fake_repo_path / "src" / "requirements.txt" - expected_calls = [ - mocker.call("piptools", ["compile", "-q", str(requirements_in)]), - mocker.call("pip", ["install", "-U", "-r", str(requirements_txt)]), - ] - assert python_call_mock.mock_calls == expected_calls - fake_copyfile.assert_called_once_with( - str(requirements_txt), str(requirements_in) - ) - - def test_install_compile_force( - self, - python_call_mock, - fake_project_cli, - fake_repo_path, - fake_copyfile, - mocker, - fake_metadata, - ): - """Test that the requirements are compiled if requirements.in exists - and --build-reqs CLI option is specified""" - mocker.patch("kedro.framework.cli.project.os").name = "posix" - mocker.patch.object(Path, "is_file", return_value=True) - result = CliRunner().invoke( - fake_project_cli, ["install", "--build-reqs"], obj=fake_metadata - ) - assert not result.exit_code, result.output - assert "Requirements installed!" 
in result.output - - requirements_in = fake_repo_path / "src" / "requirements.in" - requirements_txt = fake_repo_path / "src" / "requirements.txt" - expected_calls = [ - mocker.call("piptools", ["compile", "-q", str(requirements_in)]), - mocker.call("pip", ["install", "-U", "-r", str(requirements_txt)]), - ] - assert python_call_mock.mock_calls == expected_calls - fake_copyfile.assert_not_called() - - def test_install_no_compile_default( - self, - python_call_mock, - fake_project_cli, - fake_repo_path, - fake_copyfile, - mocker, - fake_metadata, - ): - """Test that the requirements aren't compiled by default - if requirements.in exists""" - mocker.patch("kedro.framework.cli.project.os").name = "posix" - mocker.patch.object(Path, "is_file", return_value=True) - result = CliRunner().invoke(fake_project_cli, ["install"], obj=fake_metadata) - assert not result.exit_code, result.output - assert "Requirements installed!" in result.output - - requirements_txt = fake_repo_path / "src" / "requirements.txt" - python_call_mock.assert_called_once_with( - "pip", ["install", "-U", "-r", str(requirements_txt)] - ) - fake_copyfile.assert_not_called() - - def test_install_no_compile_force( - self, - python_call_mock, - fake_project_cli, - fake_repo_path, - fake_copyfile, - mocker, - fake_metadata, - ): - """Test that the requirements aren't compiled if requirements.in doesn't exist - and --no-build-reqs CLI option is specified""" - mocker.patch("kedro.framework.cli.project.os").name = "posix" - result = CliRunner().invoke( - fake_project_cli, ["install", "--no-build-reqs"], obj=fake_metadata - ) - assert not result.exit_code, result.output - assert "Requirements installed!" in result.output - - requirements_txt = fake_repo_path / "src" / "requirements.txt" - python_call_mock.assert_called_once_with( - "pip", ["install", "-U", "-r", str(requirements_txt)] - ) - fake_copyfile.assert_not_called() - - def test_with_env_file( - self, - python_call_mock, - call_mock, - fake_project_cli, - mocker, - fake_repo_path, - fake_copyfile, - fake_metadata, - ): - mocker.patch("kedro.framework.cli.project.os").name = "posix" - # Pretend env file exists: - mocker.patch.object(Path, "is_file", return_value=True) - - result = CliRunner().invoke(fake_project_cli, ["install"], obj=fake_metadata) - assert not result.exit_code, result.stdout - assert "Requirements installed!" in result.output - - requirements_txt = fake_repo_path / "src" / "requirements.txt" - expected_calls = [ - mocker.call("pip", ["install", "-U", "-r", str(requirements_txt)]) - ] - assert python_call_mock.mock_calls == expected_calls - - call_mock.assert_called_once_with( - [ - "conda", - "env", - "update", - "--file", - str(fake_repo_path / "src/environment.yml"), - "--prune", - ] - ) - fake_copyfile.assert_not_called() - - def test_windows( - self, fake_project_cli, mocker, fake_repo_path, fake_copyfile, fake_metadata - ): - mock_subprocess = mocker.patch("kedro.framework.cli.project.subprocess") - mock_subprocess.Popen.return_value.communicate.return_value = ("", b"") - # pretend we are on Windows - mocker.patch("kedro.framework.cli.project.os").name = "nt" - - result = CliRunner().invoke(fake_project_cli, ["install"], obj=fake_metadata) - assert not result.exit_code, result.stdout - assert "Requirements installed!" 
in result.output - - requirements_in = fake_repo_path / "src" / "requirements.in" - requirements_txt = fake_repo_path / "src" / "requirements.txt" - command = [ - sys.executable, - "-m", - "pip", - "install", - "-U", - "-r", - str(requirements_txt), - ] - mock_subprocess.Popen.assert_called_once_with( - command, - creationflags=mock_subprocess.CREATE_NEW_CONSOLE, - stderr=mock_subprocess.PIPE, - ) - fake_copyfile.assert_called_once_with( - str(requirements_txt), str(requirements_in) - ) - - def test_windows_err( - self, fake_project_cli, mocker, fake_repo_path, fake_copyfile, fake_metadata - ): - mock_subprocess = mocker.patch("kedro.framework.cli.project.subprocess") - mock_subprocess.Popen.return_value.communicate.return_value = ( - "", - b"Error in dependencies", - ) - # pretend we are on Windows - mocker.patch("kedro.framework.cli.project.os").name = "nt" - - result = CliRunner().invoke(fake_project_cli, ["install"], obj=fake_metadata) - assert result.exit_code, result.stdout - assert "Error in dependencies" in result.output - - def test_install_working_with_unimportable_pipelines( - self, fake_project_cli, mocker, fake_metadata - ): - """Test kedro install works even if pipelines are not importable""" - mocker.patch("kedro.framework.cli.project.os").name = "posix" - pipeline_registry = ( - fake_metadata.source_dir - / fake_metadata.package_name - / "pipeline_registry.py" - ) - pipeline_registry.write_text("import this_is_not_a_real_thing") - - result = CliRunner().invoke(fake_project_cli, ["install"], obj=fake_metadata) - assert not result.exit_code, result.output - assert "Requirements installed!" in result.output - - @pytest.mark.parametrize("os_name", ["posix", "nt"]) - def test_install_missing_requirements_in_and_txt( - self, fake_project_cli, mocker, fake_metadata, os_name - ): - """Test error when neither requirements.txt nor requirements.in exists.""" - mocker.patch("kedro.framework.cli.project.os").name = os_name - mocker.patch.object(Path, "is_file", return_value=False) - result = CliRunner().invoke( - fake_project_cli, ["install", "--build-reqs"], obj=fake_metadata - ) - assert result.exit_code # Error expected - assert isinstance(result.exception, FileNotFoundError) - assert "No project requirements.in or requirements.txt found" in str( - result.exception - ) - - @pytest.fixture def os_mock(mocker): return mocker.patch("kedro.framework.cli.project.os") @@ -537,7 +317,7 @@ def test_fail_no_ipython(self, fake_project_cli, mocker): assert result.exit_code error = ( "Module `IPython` not found. Make sure to install required project " - "dependencies by running the `kedro install` command first." + "dependencies by running the `pip install -r src/requirements.txt` command first." 
) assert error in result.output @@ -552,11 +332,27 @@ def test_happy_path( call_mock.assert_has_calls( [ mocker.call( - [sys.executable, "setup.py", "clean", "--all", "bdist_egg"], + [ + sys.executable, + "setup.py", + "clean", + "--all", + "bdist_egg", + "--dist-dir", + "../dist", + ], cwd=str(fake_repo_path / "src"), ), mocker.call( - [sys.executable, "setup.py", "clean", "--all", "bdist_wheel"], + [ + sys.executable, + "setup.py", + "clean", + "--all", + "bdist_wheel", + "--dist-dir", + "../dist", + ], cwd=str(fake_repo_path / "src"), ), ] @@ -620,7 +416,7 @@ def test_open_docs(self, open_flag, fake_project_cli, mocker, fake_metadata): @pytest.mark.usefixtures("chdir_to_dummy_project", "patch_log", "fake_copyfile") class TestBuildReqsCommand: - def test_requirements_file_exists( + def test_compile_from_requirements_file( self, python_call_mock, fake_project_cli, @@ -638,11 +434,15 @@ def test_requirements_file_exists( python_call_mock.assert_called_once_with( "piptools", - ["compile", "-q", str(fake_repo_path / "src" / "requirements.in")], + [ + "compile", + str(fake_repo_path / "src" / "requirements.txt"), + "--output-file", + str(fake_repo_path / "src" / "requirements.lock"), + ], ) - fake_copyfile.assert_not_called() - def test_requirements_file_doesnt_exist( + def test_compile_from_input_and_to_output_file( self, python_call_mock, fake_project_cli, @@ -650,18 +450,28 @@ def test_requirements_file_doesnt_exist( fake_copyfile, fake_metadata, ): - # File does not exist: - requirements_in = fake_repo_path / "src" / "requirements.in" - requirements_txt = fake_repo_path / "src" / "requirements.txt" + # File exists: + input_file = fake_repo_path / "src" / "dev-requirements.txt" + with open(input_file, "a", encoding="utf-8") as file: + file.write("") + output_file = fake_repo_path / "src" / "dev-requirements.lock" - result = CliRunner().invoke(fake_project_cli, ["build-reqs"], obj=fake_metadata) + result = CliRunner().invoke( + fake_project_cli, + [ + "build-reqs", + "--input-file", + str(input_file), + "--output-file", + str(output_file), + ], + obj=fake_metadata, + ) assert not result.exit_code, result.stdout assert "Requirements built!" in result.stdout python_call_mock.assert_called_once_with( - "piptools", ["compile", "-q", str(requirements_in)] - ) - fake_copyfile.assert_called_once_with( - str(requirements_txt), str(requirements_in) + "piptools", + ["compile", str(input_file), "--output-file", str(output_file)], ) @pytest.mark.parametrize( @@ -675,7 +485,7 @@ def test_extra_args( extra_args, fake_metadata, ): - requirements_in = fake_repo_path / "src" / "requirements.in" + requirements_txt = fake_repo_path / "src" / "requirements.txt" result = CliRunner().invoke( fake_project_cli, ["build-reqs"] + extra_args, obj=fake_metadata @@ -684,5 +494,24 @@ def test_extra_args( assert not result.exit_code, result.stdout assert "Requirements built!" 
in result.stdout - call_args = ["compile", "-q"] + extra_args + [str(requirements_in)] + call_args = ( + ["compile"] + + extra_args + + [str(requirements_txt)] + + ["--output-file", str(fake_repo_path / "src" / "requirements.lock")] + ) python_call_mock.assert_called_once_with("piptools", call_args) + + @pytest.mark.parametrize("os_name", ["posix", "nt"]) + def test_missing_requirements_txt( + self, fake_project_cli, mocker, fake_metadata, os_name, fake_repo_path + ): + """Test error when input file requirements.txt doesn't exists.""" + requirements_txt = fake_repo_path / "src" / "requirements.txt" + + mocker.patch("kedro.framework.cli.project.os").name = os_name + mocker.patch.object(Path, "is_file", return_value=False) + result = CliRunner().invoke(fake_project_cli, ["build-reqs"], obj=fake_metadata) + assert result.exit_code # Error expected + assert isinstance(result.exception, FileNotFoundError) + assert f"File `{requirements_txt}` not found" in str(result.exception) diff --git a/tests/framework/cli/test_starters.py b/tests/framework/cli/test_starters.py index abf1e54352..d206084c20 100644 --- a/tests/framework/cli/test_starters.py +++ b/tests/framework/cli/test_starters.py @@ -14,7 +14,7 @@ from kedro import __version__ as version from kedro.framework.cli.starters import _STARTER_ALIASES, TEMPLATE_PATH -FILES_IN_TEMPLATE = 35 +FILES_IN_TEMPLATE = 34 @pytest.fixture diff --git a/tests/framework/context/test_context.py b/tests/framework/context/test_context.py index 2dbe5a9dec..2f025ea6f7 100644 --- a/tests/framework/context/test_context.py +++ b/tests/framework/context/test_context.py @@ -1,9 +1,8 @@ import configparser import json import re -import sys +import textwrap from pathlib import Path, PurePath, PurePosixPath, PureWindowsPath -from time import sleep from typing import Any, Dict import pandas as pd @@ -14,30 +13,29 @@ from kedro import __version__ as kedro_version from kedro.config import ConfigLoader, MissingConfigException -from kedro.extras.datasets.pandas import CSVDataSet -from kedro.framework.context import KedroContext, KedroContextError +from kedro.framework.context import KedroContext from kedro.framework.context.context import ( _convert_paths_to_absolute_posix, _is_relative_path, _update_nested_dict, _validate_layers_for_transcoding, ) -from kedro.framework.hooks import get_hook_manager, hook_impl from kedro.framework.project import ( - Validator, - _ProjectPipelines, + ValidationError, _ProjectSettings, configure_project, pipelines, ) -from kedro.io import DataCatalog -from kedro.io.core import Version, generate_timestamp -from kedro.pipeline import Pipeline, node -from kedro.runner import ParallelRunner, SequentialRunner MOCK_PACKAGE_NAME = "mock_package_name" +class BadCatalog: # pylint: disable=too-few-public-methods + """ + Catalog class that doesn't subclass `DataCatalog`, for testing only. 
+ """ + + def _write_yaml(filepath: Path, config: Dict): filepath.parent.mkdir(parents=True, exist_ok=True) yaml_str = yaml.dump(config) @@ -107,22 +105,44 @@ def local_config(tmp_path): } +@pytest.fixture +def local_logging_config() -> Dict[str, Any]: + return { + "version": 1, + "formatters": { + "simple": {"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"} + }, + "root": {"level": "INFO", "handlers": ["console"]}, + "loggers": {"kedro": {"level": "INFO", "handlers": ["console"]}}, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "level": "INFO", + "formatter": "simple", + "stream": "ext://sys.stdout", + } + }, + } + + @pytest.fixture(params=[None]) def env(request): return request.param @pytest.fixture -def prepare_project_dir(tmp_path, base_config, local_config, env): +def prepare_project_dir(tmp_path, base_config, local_config, local_logging_config, env): env = "local" if env is None else env proj_catalog = tmp_path / "conf" / "base" / "catalog.yml" env_catalog = tmp_path / "conf" / str(env) / "catalog.yml" + logging = tmp_path / "conf" / "local" / "logging.yml" env_credentials = tmp_path / "conf" / str(env) / "credentials.yml" parameters = tmp_path / "conf" / "base" / "parameters.json" db_config_path = tmp_path / "conf" / "base" / "db.ini" project_parameters = {"param1": 1, "param2": 2, "param3": {"param4": 3}} _write_yaml(proj_catalog, base_config) _write_yaml(env_catalog, local_config) + _write_yaml(logging, local_logging_config) _write_yaml(env_credentials, local_config) _write_json(parameters, project_parameters) _write_dummy_ini(db_config_path) @@ -130,54 +150,32 @@ def prepare_project_dir(tmp_path, base_config, local_config, env): _write_toml(tmp_path / "pyproject.toml", pyproject_toml_payload) -class RegistrationHooks: - @hook_impl - def register_catalog( - self, catalog, credentials, load_versions, save_version, journal - ) -> DataCatalog: - return DataCatalog.from_config( - catalog, credentials, load_versions, save_version, journal +@pytest.fixture +def mock_settings_file_bad_data_catalog_class(tmpdir): + mock_settings_file = tmpdir.join("mock_settings_file.py") + mock_settings_file.write( + textwrap.dedent( + f""" + from {__name__} import BadCatalog + DATA_CATALOG_CLASS = BadCatalog + """ ) - - @hook_impl - def register_config_loader(self, conf_paths) -> ConfigLoader: - return ConfigLoader(conf_paths) - - -class MockSettings(_ProjectSettings): - _HOOKS = Validator("HOOKS", default=(RegistrationHooks(),)) + ) + return mock_settings_file @pytest.fixture(autouse=True) def mock_settings(mocker): - mocked_settings = MockSettings() + mocked_settings = _ProjectSettings() mocker.patch("kedro.framework.session.session.settings", mocked_settings) - mocker.patch("kedro.framework.context.context.settings", mocked_settings) return mocker.patch("kedro.framework.project.settings", mocked_settings) -@pytest.fixture(autouse=True) -def mock_pipelines(mocker): - mocker.patch.object( - _ProjectPipelines, - "_get_pipelines_registry_callable", - return_value=_create_pipelines, - ) - - @pytest.fixture def dummy_dataframe(): return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) -def identity(input1: str): - return input1 # pragma: no cover - - -def bad_node(x): - raise ValueError("Oh no!") - - expected_message_middle = ( "There are 2 nodes that have not run.\n" "You can resume the pipeline run by adding the following " @@ -203,43 +201,6 @@ def bad_node(x): } -def _create_pipelines(): - bad_pipeline_middle = Pipeline( - [ - node(identity, "cars", 
"boats", name="node1", tags=["tag1"]), - node(identity, "boats", "trains", name="node2"), - node(bad_node, "trains", "ships", name="nodes3"), - node(identity, "ships", "planes", name="node4"), - ], - tags="bad_pipeline", - ) - bad_pipeline_head = Pipeline( - [ - node(bad_node, "cars", "boats", name="node1", tags=["tag1"]), - node(identity, "boats", "trains", name="node2"), - node(identity, "trains", "ships", name="nodes3"), - node(identity, "ships", "planes", name="node4"), - ], - tags="bad_pipeline", - ) - default_pipeline = Pipeline( - [ - node(identity, "cars", "boats", name="node1", tags=["tag1"]), - node(identity, "boats", "trains", name="node2"), - node(identity, "trains", "ships", name="node3"), - node(identity, "ships", "planes", name="node4"), - ], - tags="pipeline", - ) - return { - "__default__": default_pipeline, - "empty": Pipeline([]), - "simple": Pipeline([node(identity, "cars", "boats")]), - "bad_pipeline_middle": bad_pipeline_middle, - "bad_pipeline_head": bad_pipeline_head, - } - - @pytest.fixture(params=[None]) def extra_params(request): return request.param @@ -254,55 +215,30 @@ def mocked_logging(mocker): @pytest.fixture def dummy_context( - tmp_path, prepare_project_dir, env, extra_params, mocker + tmp_path, prepare_project_dir, env, extra_params ): # pylint: disable=unused-argument configure_project(MOCK_PACKAGE_NAME) + config_loader = ConfigLoader(str(tmp_path / "conf"), env=env) context = KedroContext( - MOCK_PACKAGE_NAME, str(tmp_path), env=env, extra_params=extra_params + MOCK_PACKAGE_NAME, + str(tmp_path), + config_loader=config_loader, + env=env, + extra_params=extra_params, ) yield context pipelines._clear(MOCK_PACKAGE_NAME) -@pytest.fixture(autouse=True) -def clear_hook_manager(): - yield - hook_manager = get_hook_manager() - plugins = hook_manager.get_plugins() - for plugin in plugins: - hook_manager.unregister(plugin) - - class TestKedroContext: - def test_deprecate_reading_conf_root_from_context(self, dummy_context): - pattern = ( - "Accessing CONF_ROOT via the context will be deprecated in Kedro 0.18.0." - ) - with pytest.warns(DeprecationWarning, match=pattern): - assert dummy_context.CONF_ROOT == "conf" - - def test_deprecate_setting_conf_root_on_context(self, dummy_context): - pattern = ( - "Accessing CONF_ROOT via the context will be deprecated in Kedro 0.18.0." - ) - with pytest.warns(DeprecationWarning, match=pattern): - dummy_context.CONF_ROOT = "test_conf" - - @pytest.mark.parametrize("property_name", ["io", "pipeline", "pipelines"]) - def test_deprecate_properties_on_context(self, property_name, dummy_context): - pattern = f"Accessing {property_name} via the context will be deprecated in Kedro 0.18.0." 
- with pytest.warns(DeprecationWarning, match=pattern): - assert getattr(dummy_context, property_name) - def test_attributes(self, tmp_path, dummy_context): - project_metadata = pyproject_toml_payload["tool"]["kedro"] - assert dummy_context.package_name == project_metadata["package_name"] assert isinstance(dummy_context.project_path, Path) assert dummy_context.project_path == tmp_path.resolve() def test_get_catalog_always_using_absolute_path(self, dummy_context): - conf_catalog = dummy_context.config_loader.get("catalog*") + config_loader = dummy_context._config_loader + conf_catalog = config_loader.get("catalog*") # even though the raw configuration uses relative path assert conf_catalog["horses"]["filepath"] == "horses.csv" @@ -331,10 +267,17 @@ def test_catalog(self, dummy_context, dummy_dataframe): reloaded_df = dummy_context.catalog.load("cars") assert_frame_equal(reloaded_df, dummy_dataframe) - def test_io(self, dummy_context, dummy_dataframe): - dummy_context.io.save("cars", dummy_dataframe) - reloaded_df = dummy_context.io.load("cars") - assert_frame_equal(reloaded_df, dummy_dataframe) + def test_wrong_catalog_type(self, mock_settings_file_bad_data_catalog_class): + pattern = ( + "Invalid value `tests.framework.context.test_context.BadCatalog` received " + "for setting `DATA_CATALOG_CLASS`. " + "It must be a subclass of `kedro.io.data_catalog.DataCatalog`." + ) + mock_settings = _ProjectSettings( + settings_file=str(mock_settings_file_bad_data_catalog_class) + ) + with pytest.raises(ValidationError, match=re.escape(pattern)): + assert mock_settings.DATA_CATALOG_CLASS @pytest.mark.parametrize( "extra_params", @@ -360,8 +303,8 @@ def test_nested_params(self, param, expected, dummy_context): indirect=True, ) def test_params_missing(self, mocker, extra_params, dummy_context): - mock_config_loader = mocker.patch.object(KedroContext, "config_loader") - mock_config_loader.get.side_effect = MissingConfigException("nope") + mock_config_loader = mocker.patch("kedro.config.ConfigLoader.get") + mock_config_loader.side_effect = MissingConfigException("nope") extra_params = extra_params or {} pattern = "Parameters not found in your Kedro project config" @@ -369,22 +312,6 @@ def test_params_missing(self, mocker, extra_params, dummy_context): actual = dummy_context.params assert actual == extra_params - def test_config_loader(self, dummy_context): - params = dummy_context.config_loader.get("parameters*") - db_conf = dummy_context.config_loader.get("db*") - catalog = dummy_context.config_loader.get("catalog*") - - assert params["param1"] == 1 - assert db_conf["prod"]["url"] == "postgresql://user:pass@url_prod/db" - - assert catalog["trains"]["type"] == "pandas.CSVDataSet" - assert catalog["cars"]["type"] == "pandas.CSVDataSet" - assert catalog["boats"]["type"] == "pandas.CSVDataSet" - assert not catalog["cars"]["save_args"]["index"] - - def test_default_env(self, dummy_context): - assert dummy_context.env == "local" - @pytest.mark.parametrize("env", ["custom_env"], indirect=True) def test_custom_env(self, dummy_context, env): assert dummy_context.env == env @@ -407,279 +334,6 @@ def test_missing_credentials(self, dummy_context): with pytest.warns(UserWarning, match=re.escape(pattern)): _ = dummy_context.catalog - def test_pipeline(self, dummy_context): - assert dummy_context.pipeline.nodes[0].inputs == ["cars"] - assert dummy_context.pipeline.nodes[0].outputs == ["boats"] - assert dummy_context.pipeline.nodes[1].inputs == ["boats"] - assert dummy_context.pipeline.nodes[1].outputs == ["trains"] - - 
def test_pipelines(self, dummy_context): - assert len(dummy_context.pipelines) == 5 - assert len(dummy_context.pipelines["__default__"].nodes) == 4 - - -class TestKedroContextRun: - def test_deprecate_run(self, dummy_context, dummy_dataframe): - dummy_context.catalog.save("cars", dummy_dataframe) - pattern = ( - "`kedro.framework.context.KedroContext.run` is now deprecated in favour of " - "`KedroSession.run` and will be removed in Kedro 0.18.0." - ) - with pytest.warns(DeprecationWarning, match=pattern): - dummy_context.run() - - def test_run_output(self, dummy_context, dummy_dataframe): - dummy_context.catalog.save("cars", dummy_dataframe) - outputs = dummy_context.run() - pd.testing.assert_frame_equal(outputs["planes"], dummy_dataframe) - - def test_run_no_output(self, dummy_context, dummy_dataframe): - dummy_context.catalog.save("cars", dummy_dataframe) - outputs = dummy_context.run(node_names=["node1"]) - assert not outputs - - def test_default_run(self, dummy_context, dummy_dataframe, caplog): - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run() - - log_msgs = [record.getMessage() for record in caplog.records] - log_names = [record.name for record in caplog.records] - - assert "kedro.runner.sequential_runner" in log_names - assert "Pipeline execution completed successfully." in log_msgs - - def test_sequential_run_arg(self, dummy_context, dummy_dataframe, caplog): - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run(runner=SequentialRunner()) - - log_msgs = [record.getMessage() for record in caplog.records] - log_names = [record.name for record in caplog.records] - assert "kedro.runner.sequential_runner" in log_names - assert "Pipeline execution completed successfully." in log_msgs - - @pytest.mark.skipif( - sys.platform.startswith("win"), reason="Due to bug in parallel runner" - ) - def test_parallel_run_arg(self, dummy_context, dummy_dataframe, caplog, mocker): - mocker.patch( - "kedro.framework.context.context.load_context", return_value=dummy_context - ) - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run(runner=ParallelRunner()) - - log_msgs = [record.getMessage() for record in caplog.records] - log_names = [record.name for record in caplog.records] - assert "kedro.runner.parallel_runner" in log_names - assert "Pipeline execution completed successfully." in log_msgs - - def test_run_with_node_names(self, dummy_context, dummy_dataframe, caplog): - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run(node_names=["node1"]) - - log_msgs = [record.getMessage() for record in caplog.records] - assert "Running node: node1: identity([cars]) -> [boats]" in log_msgs - assert "Pipeline execution completed successfully." in log_msgs - assert "Running node: node2: identity([boats]) -> [trains]" not in log_msgs - - def test_run_with_node_names_and_tags(self, dummy_context, dummy_dataframe, caplog): - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run(node_names=["node1"], tags=["tag1", "pipeline"]) - - log_msgs = [record.getMessage() for record in caplog.records] - assert "Running node: node1: identity([cars]) -> [boats]" in log_msgs - assert "Pipeline execution completed successfully." 
in log_msgs - assert "Running node: node2: identity([boats]) -> [trains]" not in log_msgs - - def test_run_with_tags(self, dummy_context, dummy_dataframe, caplog): - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run(tags=["tag1"]) - log_msgs = [record.getMessage() for record in caplog.records] - - assert "Completed 1 out of 1 tasks" in log_msgs - assert "Running node: node1: identity([cars]) -> [boats]" in log_msgs - assert "Running node: node2: identity([boats]) -> [trains]" not in log_msgs - assert "Pipeline execution completed successfully." in log_msgs - - def test_run_with_wrong_tags(self, dummy_context, dummy_dataframe): - dummy_context.catalog.save("cars", dummy_dataframe) - pattern = r"Pipeline contains no nodes with tags: \['non\-existent'\]" - with pytest.raises(KedroContextError, match=pattern): - dummy_context.run(tags=["non-existent"]) - - def test_run_from_nodes(self, dummy_context, dummy_dataframe, caplog): - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run(from_nodes=["node1"]) - - log_msgs = [record.getMessage() for record in caplog.records] - assert "Completed 4 out of 4 tasks" in log_msgs - assert "Running node: node1: identity([cars]) -> [boats]" in log_msgs - assert "Pipeline execution completed successfully." in log_msgs - - def test_run_to_nodes(self, dummy_context, dummy_dataframe, caplog): - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run(to_nodes=["node2"]) - - log_msgs = [record.getMessage() for record in caplog.records] - assert "Completed 2 out of 2 tasks" in log_msgs - assert "Running node: node1: identity([cars]) -> [boats]" in log_msgs - assert "Running node: node2: identity([boats]) -> [trains]" in log_msgs - assert "Running node: node3: identity([trains]) -> [ships]" not in log_msgs - assert "Pipeline execution completed successfully." in log_msgs - - def test_run_with_node_range(self, dummy_context, dummy_dataframe, caplog): - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run(from_nodes=["node1"], to_nodes=["node3"]) - - log_msgs = [record.getMessage() for record in caplog.records] - assert "Completed 3 out of 3 tasks" in log_msgs - assert "Running node: node1: identity([cars]) -> [boats]" in log_msgs - assert "Running node: node2: identity([boats]) -> [trains]" in log_msgs - assert "Running node: node3: identity([trains]) -> [ships]" in log_msgs - assert "Pipeline execution completed successfully." in log_msgs - - def test_run_with_invalid_node_range(self, dummy_context, dummy_dataframe): - dummy_context.catalog.save("cars", dummy_dataframe) - pattern = "Pipeline contains no nodes" - - with pytest.raises(KedroContextError, match=pattern): - dummy_context.run(from_nodes=["node3"], to_nodes=["node1"]) - - def test_run_from_inputs(self, dummy_context, dummy_dataframe, caplog): - for dataset in ("cars", "trains", "boats"): - dummy_context.catalog.save(dataset, dummy_dataframe) - dummy_context.run(from_inputs=["trains"]) - - log_msgs = [record.getMessage() for record in caplog.records] - assert "Completed 2 out of 2 tasks" in log_msgs - assert "Running node: node3: identity([trains]) -> [ships]" in log_msgs - assert "Running node: node4: identity([ships]) -> [planes]" in log_msgs - assert "Pipeline execution completed successfully." 
in log_msgs - - def test_run_to_outputs(self, dummy_context, dummy_dataframe, caplog): - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run(to_outputs=["trains"]) - - log_msgs = [record.getMessage() for record in caplog.records] - assert "Completed 2 out of 2 tasks" in log_msgs - assert "Running node: node1: identity([cars]) -> [boats]" in log_msgs - assert "Running node: node2: identity([boats]) -> [trains]" in log_msgs - assert "Running node: node3: identity([trains]) -> [ships]" not in log_msgs - assert "Pipeline execution completed successfully." in log_msgs - - def test_run_load_versions(self, dummy_context, dummy_dataframe): - filepath = (dummy_context.project_path / "cars.csv").as_posix() - - old_save_version = generate_timestamp() - old_df = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]}) - old_csv_data_set = CSVDataSet( - filepath=filepath, - save_args={"sep": ","}, - version=Version(None, old_save_version), - ) - old_csv_data_set.save(old_df) - - sleep(0.5) - new_save_version = generate_timestamp() - new_csv_data_set = CSVDataSet( - filepath=filepath, - save_args={"sep": ","}, - version=Version(None, new_save_version), - ) - new_csv_data_set.save(dummy_dataframe) - - load_versions = {"cars": old_save_version} - dummy_context.run(load_versions=load_versions, pipeline_name="simple") - assert not dummy_context.catalog.load("boats").equals(dummy_dataframe) - assert dummy_context.catalog.load("boats").equals(old_df) - - def test_run_with_empty_pipeline(self, dummy_context): - with pytest.raises(KedroContextError, match="Pipeline contains no nodes"): - dummy_context.run(pipeline_name="empty") - - @pytest.mark.parametrize( - "pipeline_name,expected_message", - [ - ("bad_pipeline_middle", expected_message_middle), - ("bad_pipeline_head", expected_message_head), - ], # pylint: disable=too-many-arguments - ) - def test_run_failure_prompts_resume_command( - self, dummy_context, dummy_dataframe, caplog, pipeline_name, expected_message - ): - dummy_context.catalog.save("cars", dummy_dataframe) - with pytest.raises(ValueError, match="Oh no"): - dummy_context.run(pipeline_name=pipeline_name) - - actual_messages = [ - record.getMessage() - for record in caplog.records - if record.levelname == "WARNING" - ] - - assert expected_message in actual_messages - - def test_missing_pipeline_name(self, dummy_context, dummy_dataframe): - dummy_context.catalog.save("cars", dummy_dataframe) - - with pytest.raises(KedroContextError, match="Failed to find the pipeline"): - dummy_context.run(pipeline_name="invalid-name") - - @pytest.mark.parametrize( - "extra_params", - [None, {}, {"foo": "bar", "baz": [1, 2], "qux": None}], - indirect=True, - ) - def test_run_with_extra_params( - self, mocker, dummy_context, dummy_dataframe, extra_params - ): - mock_journal = mocker.patch("kedro.framework.context.context.Journal") - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run() - - assert mock_journal.call_args[0][0]["extra_params"] == extra_params - - def test_run_with_save_version_as_run_id( - self, mocker, dummy_context, dummy_dataframe, caplog - ): - """Test that the default behaviour, with run_id set to None, - creates a journal record with the run_id the same as save_version. 
- """ - save_version = "2020-01-01T00.00.00.000Z" - mocked_get_save_version = mocker.patch.object( - dummy_context, "_get_save_version", return_value=save_version - ) - - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run(load_versions={"boats": save_version}) - - mocked_get_save_version.assert_called_once_with() - log_msg = next( - record.getMessage() - for record in caplog.records - if record.name == "kedro.journal" - ) - assert json.loads(log_msg)["run_id"] == save_version - - def test_run_with_custom_run_id( - self, mocker, dummy_context, dummy_dataframe, caplog - ): - run_id = "001" - mocked_get_run_id = mocker.patch.object( - dummy_context, "_get_run_id", return_value=run_id - ) - - dummy_context.catalog.save("cars", dummy_dataframe) - dummy_context.run() - - # once during run, and twice for each `.catalog` - assert mocked_get_run_id.call_count == 3 - log_msg = next( - record.getMessage() - for record in caplog.records - if record.name == "kedro.journal" - ) - assert json.loads(log_msg)["run_id"] == run_id - @pytest.mark.parametrize( "path_string,expected", diff --git a/tests/framework/project/test_settings.py b/tests/framework/project/test_settings.py index c36f7be0df..41748245ec 100644 --- a/tests/framework/project/test_settings.py +++ b/tests/framework/project/test_settings.py @@ -1,6 +1,5 @@ import sys import textwrap -from unittest import mock import pytest @@ -8,17 +7,9 @@ from kedro.framework.project import configure_project, settings from kedro.framework.session.store import BaseSessionStore -MOCK_CONTEXT_CLASS = mock.patch( - "kedro.framework.context.context.KedroContext", autospec=True -) - -def test_settings_without_configure_project_show_default_values(): - assert settings.CONF_ROOT == "conf" - assert settings.CONTEXT_CLASS is KedroContext - assert settings.SESSION_STORE_CLASS is BaseSessionStore - assert settings.SESSION_STORE_ARGS == {} - assert len(settings.DISABLE_HOOKS_FOR_PLUGINS) == 0 +class MyContext(KedroContext): + pass @pytest.fixture @@ -28,9 +19,9 @@ def mock_package_name_with_settings_file(tmpdir): settings_file_path.write( textwrap.dedent( f""" - from {__name__} import MOCK_CONTEXT_CLASS - CONF_ROOT = "test_conf" - CONTEXT_CLASS = MOCK_CONTEXT_CLASS + from {__name__} import MyContext + CONF_SOURCE = "test_conf" + CONTEXT_CLASS = MyContext """ ) ) @@ -43,9 +34,17 @@ def mock_package_name_with_settings_file(tmpdir): settings.set(key, value) +def test_settings_without_configure_project_show_default_values(): + assert settings.CONF_SOURCE == "conf" + assert settings.CONTEXT_CLASS is KedroContext + assert settings.SESSION_STORE_CLASS is BaseSessionStore + assert settings.SESSION_STORE_ARGS == {} + assert len(settings.DISABLE_HOOKS_FOR_PLUGINS) == 0 + + def test_settings_after_configuring_project_shows_updated_values( mock_package_name_with_settings_file, ): configure_project(mock_package_name_with_settings_file) - assert settings.CONF_ROOT == "test_conf" - assert settings.CONTEXT_CLASS is MOCK_CONTEXT_CLASS + assert settings.CONF_SOURCE == "test_conf" + assert settings.CONTEXT_CLASS is MyContext diff --git a/tests/framework/session/conftest.py b/tests/framework/session/conftest.py index acffa55cd0..7af110f7c1 100644 --- a/tests/framework/session/conftest.py +++ b/tests/framework/session/conftest.py @@ -2,7 +2,7 @@ from logging.handlers import QueueHandler, QueueListener from multiprocessing import Queue from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, List import pandas as pd 
import pytest @@ -11,7 +11,6 @@ from dynaconf.validator import Validator from kedro import __version__ as kedro_version -from kedro.config import ConfigLoader from kedro.framework.hooks import hook_impl from kedro.framework.hooks.manager import get_hook_manager from kedro.framework.project import _ProjectPipelines, _ProjectSettings @@ -19,7 +18,6 @@ from kedro.io import DataCatalog from kedro.pipeline import Pipeline from kedro.pipeline.node import Node, node -from kedro.versioning import Journal logger = logging.getLogger(__name__) @@ -71,6 +69,10 @@ def _assert_hook_call_record_has_expected_parameters( assert hasattr(call_record, param) +def _assert_pipeline_equal(p: Pipeline, q: Pipeline): + assert sorted(p.nodes) == sorted(q.nodes) + + @pytest.fixture def local_config(tmp_path): cars_filepath = str(tmp_path / "cars.csv") @@ -343,39 +345,6 @@ def after_dataset_saved(self, dataset_name: str, data: Any) -> None: "After dataset saved", extra={"dataset_name": dataset_name, "data": data} ) - @hook_impl - def register_config_loader( - self, conf_paths: Iterable[str], env: str, extra_params: Dict[str, Any] - ) -> ConfigLoader: - logger.info( - "Registering config loader", - extra={"conf_paths": conf_paths, "env": env, "extra_params": extra_params}, - ) - return ConfigLoader(conf_paths) - - @hook_impl - def register_catalog( - self, - catalog: Optional[Dict[str, Dict[str, Any]]], - credentials: Dict[str, Dict[str, Any]], - load_versions: Dict[str, str], - save_version: str, - journal: Journal, - ) -> DataCatalog: - logger.info( - "Registering catalog", - extra={ - "catalog": catalog, - "credentials": credentials, - "load_versions": load_versions, - "save_version": save_version, - "journal": journal, - }, - ) - return DataCatalog.from_config( - catalog, credentials, load_versions, save_version, journal - ) - @pytest.fixture def project_hooks(): @@ -408,7 +377,6 @@ def mock_register_pipelines(): def _mock_imported_settings_paths(mocker, mock_settings): for path in [ - "kedro.framework.context.context.settings", "kedro.framework.session.session.settings", "kedro.framework.project.settings", ]: diff --git a/tests/framework/session/test_session.py b/tests/framework/session/test_session.py index 938f891cc4..a57b7ec6b7 100644 --- a/tests/framework/session/test_session.py +++ b/tests/framework/session/test_session.py @@ -4,22 +4,21 @@ import subprocess import textwrap from pathlib import Path -from typing import Iterable import pytest import toml from kedro import __version__ as kedro_version from kedro.config import ConfigLoader -from kedro.framework.context import KedroContext, KedroContextError -from kedro.framework.hooks import hook_impl +from kedro.framework.context import KedroContext from kedro.framework.project import ( ValidationError, Validator, + _IsSubclassValidator, _ProjectSettings, configure_project, ) -from kedro.framework.session import KedroSession, get_current_session +from kedro.framework.session import KedroSession from kedro.framework.session.store import BaseSessionStore, ShelveStore _FAKE_PROJECT_NAME = "fake_project" @@ -32,6 +31,12 @@ class BadStore: # pylint: disable=too-few-public-methods """ +class BadConfigLoader: # pylint: disable=too-few-public-methods + """ + ConfigLoader class that doesn't subclass `ConfigLoader`, for testing only. 
+ """ + + @pytest.fixture(autouse=True) def mocked_logging(mocker): # Disable logging.config.dictConfig in KedroSession._setup_logging as @@ -44,17 +49,10 @@ def mock_context_class(mocker): return mocker.patch("kedro.framework.session.session.KedroContext", autospec=True) -class ConfigLoaderHooks: - @hook_impl - def register_config_loader(self, conf_paths: Iterable[str]) -> ConfigLoader: - return ConfigLoader(conf_paths) - - def _mock_imported_settings_paths(mocker, mock_settings): for path in [ "kedro.framework.project.settings", "kedro.framework.session.session.settings", - "kedro.framework.context.context.settings", ]: mocker.patch(path, mock_settings) return mock_settings @@ -62,16 +60,12 @@ def _mock_imported_settings_paths(mocker, mock_settings): @pytest.fixture def mock_settings(mocker): - class MockSettings(_ProjectSettings): - _HOOKS = Validator("HOOKS", default=(ConfigLoaderHooks(),)) - - return _mock_imported_settings_paths(mocker, MockSettings()) + return _mock_imported_settings_paths(mocker, _ProjectSettings()) @pytest.fixture def mock_settings_context_class(mocker, mock_context_class): class MockSettings(_ProjectSettings): - _HOOKS = Validator("HOOKS", default=(ConfigLoaderHooks(),)) _CONTEXT_CLASS = Validator( "CONTEXT_CLASS", default=lambda *_: mock_context_class ) @@ -85,12 +79,38 @@ class MyContext(KedroContext): pass class MockSettings(_ProjectSettings): - _HOOKS = Validator("HOOKS", default=(ConfigLoaderHooks(),)) _CONTEXT_CLASS = Validator("CONTEXT_CLASS", default=lambda *_: MyContext) return _mock_imported_settings_paths(mocker, MockSettings()) +@pytest.fixture +def mock_settings_custom_config_loader_class(mocker): + class MyConfigLoader(ConfigLoader): + pass + + class MockSettings(_ProjectSettings): + _CONFIG_LOADER_CLASS = _IsSubclassValidator( + "CONFIG_LOADER_CLASS", default=lambda *_: MyConfigLoader + ) + + return _mock_imported_settings_paths(mocker, MockSettings()) + + +@pytest.fixture +def mock_settings_file_bad_config_loader_class(tmpdir): + mock_settings_file = tmpdir.join("mock_settings_file.py") + mock_settings_file.write( + textwrap.dedent( + f""" + from {__name__} import BadConfigLoader + CONFIG_LOADER_CLASS = BadConfigLoader + """ + ) + ) + return mock_settings_file + + @pytest.fixture def mock_settings_file_bad_session_store_class(tmpdir): mock_settings_file = tmpdir.join("mock_settings_file.py") @@ -131,8 +151,7 @@ def mock_settings_shelve_session_store(mocker, fake_project): shelve_location = fake_project / "nested" / "sessions" class MockSettings(_ProjectSettings): - _HOOKS = Validator("HOOKS", default=(ConfigLoaderHooks(),)) - _SESSION_STORE_CLASS = Validator( + _SESSION_STORE_CLASS = _IsSubclassValidator( "SESSION_STORE_CLASS", default=lambda *_: ShelveStore ) _SESSION_STORE_ARGS = Validator( @@ -227,6 +246,7 @@ def test_create( extra_params, ): mock_click_ctx = mocker.patch("click.get_current_context").return_value + mocker.patch("kedro.framework.session.KedroSession._get_logging_config") session = KedroSession.create( mock_package_name, fake_project, env=env, extra_params=extra_params ) @@ -249,15 +269,8 @@ def test_create( expected_store["extra_params"] = extra_params assert session.store == expected_store - # called for logging setup - mock_context_class.assert_called_once_with( - project_path=fake_project, - package_name=mock_package_name, - env=env, - extra_params=extra_params, - ) - assert session.load_context() is mock_context_class.return_value + assert isinstance(session._get_config_loader(), ConfigLoader) 
@pytest.mark.usefixtures("mock_settings_context_class") def test_create_no_env_extra_params( @@ -285,14 +298,8 @@ def test_create_no_env_extra_params( } assert session.store == expected_store - mock_context_class.assert_called_once_with( - project_path=fake_project, - package_name=mock_package_name, - env=None, - extra_params=None, - ) - assert session.load_context() is mock_context_class.return_value + assert isinstance(session._get_config_loader(), ConfigLoader) @pytest.mark.usefixtures("mock_settings") def test_load_context_with_envvar( @@ -308,6 +315,20 @@ def test_load_context_with_envvar( assert result.__class__.__name__ == "KedroContext" assert result.env == "my_fake_env" + @pytest.mark.usefixtures("mock_settings") + def test_load_config_loader_with_envvar( + self, fake_project, monkeypatch, mock_package_name, mocker + ): + mocker.patch("kedro.config.config.ConfigLoader.get") + monkeypatch.setenv("KEDRO_ENV", "my_fake_env") + + session = KedroSession.create(mock_package_name, fake_project) + result = session._get_config_loader() + + assert isinstance(result, ConfigLoader) + assert result.__class__.__name__ == "ConfigLoader" + assert result.env == "my_fake_env" + @pytest.mark.usefixtures("mock_settings_custom_context_class") def test_load_context_custom_context_class(self, fake_project, mock_package_name): session = KedroSession.create(mock_package_name, fake_project) @@ -316,6 +337,28 @@ def test_load_context_custom_context_class(self, fake_project, mock_package_name assert isinstance(result, KedroContext) assert result.__class__.__name__ == "MyContext" + @pytest.mark.usefixtures("mock_settings_custom_config_loader_class") + def test_load_config_loader_custom_config_loader_class( + self, fake_project, mock_package_name + ): + session = KedroSession.create(mock_package_name, fake_project) + result = session._get_config_loader() + + assert isinstance(result, ConfigLoader) + assert result.__class__.__name__ == "MyConfigLoader" + + def test_broken_config_loader(self, mock_settings_file_bad_config_loader_class): + pattern = ( + "Invalid value `tests.framework.session.test_session.BadConfigLoader` received " + "for setting `CONFIG_LOADER_CLASS`. " + "It must be a subclass of `kedro.config.config.ConfigLoader`." 
+ ) + mock_settings = _ProjectSettings( + settings_file=str(mock_settings_file_bad_config_loader_class) + ) + with pytest.raises(ValidationError, match=re.escape(pattern)): + assert mock_settings.CONFIG_LOADER_CLASS + @pytest.mark.usefixtures("mock_settings_context_class") def test_default_store( self, fake_project, fake_session_id, caplog, mock_package_name @@ -461,32 +504,18 @@ def test_log_error(self, fake_project, mock_package_name): ) @pytest.mark.usefixtures("mock_settings") - def test_get_current_session(self, fake_project, mock_package_name): - assert get_current_session(silent=True) is None # no sessions yet - - pattern = "There is no active Kedro session" - with pytest.raises(RuntimeError, match=pattern): - get_current_session() - + def test_nested_sessions(self, fake_project, mock_package_name): configure_project(mock_package_name) session1 = KedroSession.create(mock_package_name, fake_project) session2 = KedroSession.create(mock_package_name, fake_project) with session1: - assert get_current_session() is session1 - pattern = ( "Cannot activate the session as another active session already exists" ) with pytest.raises(RuntimeError, match=pattern), session2: pass # pragma: no cover - # session has been closed, so no current sessions should be available - assert get_current_session(silent=True) is None - - with session2: - assert get_current_session() is session2 - @pytest.mark.usefixtures("mock_settings_context_class") @pytest.mark.parametrize("fake_pipeline_name", [None, _FAKE_PIPELINE_NAME]) def test_run( @@ -503,17 +532,17 @@ def test_run( mock_hook = mocker.patch( "kedro.framework.session.session.get_hook_manager" ).return_value.hook - mock_pipelines = { - _FAKE_PIPELINE_NAME: mocker.Mock(), - "__default__": mocker.Mock(), - } - mocker.patch( - "kedro.framework.session.session.pipelines", return_value=mock_pipelines + mock_pipelines = mocker.patch( + "kedro.framework.session.session.pipelines", + return_value={ + _FAKE_PIPELINE_NAME: mocker.Mock(), + "__default__": mocker.Mock(), + }, ) mock_context = mock_context_class.return_value mock_catalog = mock_context._get_catalog.return_value mock_runner = mocker.Mock() - mock_pipeline = mock_context._filter_pipeline.return_value + mock_pipeline = mock_pipelines.__getitem__.return_value.filter.return_value with KedroSession.create(mock_package_name, fake_project) as session: session.run(runner=mock_runner, pipeline_name=fake_pipeline_name) @@ -556,7 +585,7 @@ def test_run_non_existent_pipeline(self, fake_project, mock_package_name, mocker "It needs to be generated and returned " "by the 'register_pipelines' function." 
) - with pytest.raises(KedroContextError, match=re.escape(pattern)): + with pytest.raises(ValueError, match=re.escape(pattern)): with KedroSession.create(mock_package_name, fake_project) as session: session.run(runner=mock_runner, pipeline_name="doesnotexist") @@ -575,19 +604,19 @@ def test_run_exception( # pylint: disable=too-many-locals mock_hook = mocker.patch( "kedro.framework.session.session.get_hook_manager" ).return_value.hook - mock_pipelines = { - _FAKE_PIPELINE_NAME: mocker.Mock(), - "__default__": mocker.Mock(), - } - mocker.patch( - "kedro.framework.session.session.pipelines", return_value=mock_pipelines + mock_pipelines = mocker.patch( + "kedro.framework.session.session.pipelines", + return_value={ + _FAKE_PIPELINE_NAME: mocker.Mock(), + "__default__": mocker.Mock(), + }, ) mock_context = mock_context_class.return_value mock_catalog = mock_context._get_catalog.return_value error = FakeException("You shall not pass!") mock_runner = mocker.Mock() mock_runner.run.side_effect = error # runner.run() raises an error - mock_pipeline = mock_context._filter_pipeline.return_value + mock_pipeline = mock_pipelines.__getitem__.return_value.filter.return_value with pytest.raises(FakeException), KedroSession.create( mock_package_name, fake_project diff --git a/tests/framework/session/test_session_extension_hooks.py b/tests/framework/session/test_session_extension_hooks.py index 7e8a98ac8b..5f1b8c1473 100644 --- a/tests/framework/session/test_session_extension_hooks.py +++ b/tests/framework/session/test_session_extension_hooks.py @@ -10,7 +10,7 @@ from kedro.framework.context.context import _convert_paths_to_absolute_posix from kedro.framework.hooks import hook_impl -from kedro.framework.project import _ProjectPipelines, _ProjectSettings +from kedro.framework.project import _ProjectPipelines, _ProjectSettings, pipelines from kedro.framework.session import KedroSession from kedro.io import DataCatalog, MemoryDataSet from kedro.pipeline import Pipeline, node @@ -19,6 +19,7 @@ from kedro.runner.runner import _run_node_async from tests.framework.session.conftest import ( _assert_hook_call_record_has_expected_parameters, + _assert_pipeline_equal, _mock_imported_settings_paths, assert_exceptions_equal, ) @@ -82,7 +83,7 @@ def test_after_catalog_created_hook(self, mocker, mock_session, caplog): project_path = context.project_path catalog = context.catalog - config_loader = context.config_loader + config_loader = mock_session._get_config_loader() relevant_records = [ r for r in caplog.records if r.getMessage() == "Catalog created" @@ -104,17 +105,23 @@ def test_after_catalog_created_hook_default_run_id( ): context = mock_session.load_context() fake_save_version = mocker.sentinel.fake_save_version - mocker.patch.object( - context, "_get_save_version", return_value=fake_save_version + + mocker.patch( + "kedro.framework.session.KedroSession.store", + new_callable=mocker.PropertyMock, + return_value={ + "session_id": fake_save_version, + "save_version": fake_save_version, + }, ) catalog = context.catalog - config_loader = context.config_loader + config_loader = mock_session._get_config_loader() project_path = context.project_path catalog.save("cars", dummy_dataframe) catalog.save("boats", dummy_dataframe) - context.run() + mock_session.run() relevant_records = [ r for r in caplog.records if r.getMessage() == "Catalog created" @@ -138,7 +145,7 @@ def test_before_and_after_pipeline_run_hooks( ): context = mock_session.load_context() catalog = context.catalog - default_pipeline = context.pipeline + 
        default_pipeline = pipelines["__default__"]
         catalog.save("cars", dummy_dataframe)
         catalog.save("boats", dummy_dataframe)
         mock_session.run()
@@ -151,7 +158,7 @@ def test_before_and_after_pipeline_run_hooks(
         ]
         assert len(before_pipeline_run_calls) == 1
         call_record = before_pipeline_run_calls[0]
-        assert call_record.pipeline is default_pipeline
+        _assert_pipeline_equal(call_record.pipeline, default_pipeline)
         _assert_hook_call_record_has_expected_parameters(
             call_record, ["pipeline", "catalog", "run_params"]
         )
@@ -167,7 +174,7 @@ def test_before_and_after_pipeline_run_hooks(
         _assert_hook_call_record_has_expected_parameters(
             call_record, ["pipeline", "catalog", "run_params"]
         )
-        assert call_record.pipeline is default_pipeline
+        _assert_pipeline_equal(call_record.pipeline, default_pipeline)

     @pytest.mark.usefixtures("mock_broken_pipelines")
     def test_on_pipeline_error_hook(self, caplog, mock_session):
diff --git a/tests/framework/session/test_session_hook_manager.py b/tests/framework/session/test_session_hook_manager.py
index 6b5a38ef36..9cc31ecf28 100644
--- a/tests/framework/session/test_session_hook_manager.py
+++ b/tests/framework/session/test_session_hook_manager.py
@@ -8,7 +8,7 @@
 from kedro.framework.session import KedroSession
 from tests.framework.session.conftest import _mock_imported_settings_paths

-MockDistInfo = namedtuple("Distinfo", ["project_name", "version"])
+MockDistInfo = namedtuple("MockDistInfo", ["project_name", "version"])


 @pytest.fixture
diff --git a/tests/framework/session/test_session_registration_hooks.py b/tests/framework/session/test_session_registration_hooks.py
deleted file mode 100644
index bb28174df1..0000000000
--- a/tests/framework/session/test_session_registration_hooks.py
+++ /dev/null
@@ -1,205 +0,0 @@
-import logging
-import re
-from typing import Any, Dict, Iterable, Optional
-
-import pytest
-from dynaconf.validator import Validator
-
-from kedro.config import ConfigLoader
-from kedro.framework.context import KedroContextError
-from kedro.framework.hooks import hook_impl
-from kedro.framework.project import _ProjectSettings, settings
-from kedro.framework.session import KedroSession
-from kedro.io import DataCatalog
-from kedro.versioning import Journal
-from tests.framework.session.conftest import (
-    _assert_hook_call_record_has_expected_parameters,
-    _mock_imported_settings_paths,
-)
-
-logger = logging.getLogger(__name__)
-
-
-@pytest.fixture
-def pipeline_registration_hook(mock_pipeline):
-    class PipelineHook:
-        @hook_impl
-        def register_pipelines(self):
-            logger.info("Registering pipelines")
-            return {"__default__": mock_pipeline}
-
-    return PipelineHook()
-
-
-def _mock_settings_with_hooks(mocker, hooks):
-    class MockSettings(_ProjectSettings):
-        _HOOKS = Validator("HOOKS", default=hooks)
-
-    return _mock_imported_settings_paths(mocker, MockSettings())
-
-
-@pytest.fixture
-def mock_settings_with_pipeline_hooks(
-    mocker, project_hooks, pipeline_registration_hook
-):
-    return _mock_settings_with_hooks(
-        mocker, hooks=(project_hooks, pipeline_registration_hook)
-    )
-
-
-@pytest.fixture
-def mock_settings_duplicate_hooks(mocker, project_hooks, pipeline_registration_hook):
-    return _mock_settings_with_hooks(
-        mocker,
-        hooks=(project_hooks, pipeline_registration_hook, pipeline_registration_hook),
-    )
-
-
-class RequiredRegistrationHooks:
-    """Mandatory registration hooks"""
-
-    @hook_impl
-    def register_config_loader(self, conf_paths: Iterable[str]) -> ConfigLoader:
-        return ConfigLoader(conf_paths)
-
-    @hook_impl
-    def register_catalog(
-        self,
catalog: Optional[Dict[str, Dict[str, Any]]], - credentials: Dict[str, Dict[str, Any]], - load_versions: Dict[str, str], - save_version: str, - journal: Journal, - ) -> DataCatalog: - return DataCatalog.from_config( # pragma: no cover - catalog, credentials, load_versions, save_version, journal - ) - - -@pytest.fixture -def mock_settings_broken_config_loader_hooks(mocker): - class BrokenConfigLoaderHooks(RequiredRegistrationHooks): - @hook_impl - def register_config_loader(self): # pylint: disable=arguments-differ - return None - - return _mock_settings_with_hooks(mocker, hooks=(BrokenConfigLoaderHooks(),)) - - -@pytest.fixture -def mock_settings_broken_catalog_hooks(mocker): - class BrokenCatalogHooks(RequiredRegistrationHooks): - @hook_impl - def register_catalog(self): # pylint: disable=arguments-differ - return None - - return _mock_settings_with_hooks(mocker, hooks=(BrokenCatalogHooks(),)) - - -@pytest.fixture -def mock_session( - mock_settings_with_pipeline_hooks, mock_package_name, tmp_path -): # pylint: disable=unused-argument - return KedroSession.create( - mock_package_name, tmp_path, extra_params={"params:key": "value"} - ) - - -class TestRegistrationHooks: - def test_register_pipelines_is_called( - self, dummy_dataframe, caplog, mock_session, mock_pipeline - ): - context = mock_session.load_context() - catalog = context.catalog - catalog.save("cars", dummy_dataframe) - catalog.save("boats", dummy_dataframe) - mock_session.run() - - register_pipelines_calls = [ - record - for record in caplog.records - if record.funcName == "register_pipelines" - ] - assert len(register_pipelines_calls) == 1 - call_record = register_pipelines_calls[0] - assert call_record.getMessage() == "Registering pipelines" - _assert_hook_call_record_has_expected_parameters(call_record, []) - - expected_pipelines = { - "__default__": mock_pipeline, - "pipe": mock_pipeline, - } - assert context.pipelines == expected_pipelines - - def test_register_config_loader_is_called(self, mock_session, caplog): - context = mock_session.load_context() - _ = context.config_loader - - relevant_records = [ - r for r in caplog.records if r.getMessage() == "Registering config loader" - ] - assert len(relevant_records) == 1 - - record = relevant_records[0] - expected_conf_paths = [ - str(context.project_path / settings.CONF_ROOT / "base"), - str(context.project_path / settings.CONF_ROOT / "local"), - ] - assert record.conf_paths == expected_conf_paths - assert record.env == context.env - assert record.extra_params == {"params:key": "value"} - - def test_register_catalog_is_called(self, mock_session, caplog): - context = mock_session.load_context() - catalog = context.catalog - assert isinstance(catalog, DataCatalog) - - relevant_records = [ - r for r in caplog.records if r.getMessage() == "Registering catalog" - ] - assert len(relevant_records) == 1 - - record = relevant_records[0] - assert record.catalog.keys() == {"cars", "boats"} - assert record.credentials == {"dev_s3": "foo"} - # save_version is only passed during a run, not on the property getter - assert record.save_version is None - assert record.load_versions is None - assert record.journal is None - - -class TestDuplicatePipelineRegistration: - """Test to make sure that if pipelines are defined in both registration hooks - and pipelines_registry, they are deduplicated and a warning is displayed. 
- """ - - @pytest.mark.usefixtures("mock_settings_duplicate_hooks") - def test_register_pipelines_with_duplicate_entries( - self, tmp_path, mock_package_name, mock_pipeline - ): - session = KedroSession.create(mock_package_name, tmp_path) - context = session.load_context() - # check that all pipeline dictionaries merged together correctly - expected_pipelines = {key: mock_pipeline for key in ("__default__", "pipe")} - pattern = ( - "Found duplicate pipeline entries. The following " - "will be overwritten: __default__" - ) - with pytest.warns(UserWarning, match=re.escape(pattern)): - assert context.pipelines == expected_pipelines - - -class TestBrokenRegistrationHooks: - @pytest.mark.usefixtures("mock_settings_broken_config_loader_hooks") - def test_broken_register_config_loader_hook(self, tmp_path, mock_package_name): - pattern = "Expected an instance of `ConfigLoader`, got `NoneType` instead." - with pytest.raises(KedroContextError, match=re.escape(pattern)): - KedroSession.create(mock_package_name, tmp_path) - - @pytest.mark.usefixtures("mock_settings_broken_catalog_hooks") - def test_broken_register_catalog_hook(self, tmp_path, mock_package_name): - pattern = "Expected an instance of `DataCatalog`, got `NoneType` instead." - with KedroSession.create(mock_package_name, tmp_path) as session: - context = session.load_context() - with pytest.raises(KedroContextError, match=re.escape(pattern)): - _ = context.catalog diff --git a/tests/io/test_cached_dataset.py b/tests/io/test_cached_dataset.py index 5f5d32f350..5e060349b5 100644 --- a/tests/io/test_cached_dataset.py +++ b/tests/io/test_cached_dataset.py @@ -137,6 +137,6 @@ def test_release(self, cached_ds): _ = cached_ds.load() def test_copy_mode(self, mocker): - mocked_memory_data_set = mocker.patch("kedro.io.cached_dataset.MemoryDataSet") + mocked_memory_dataset = mocker.patch("kedro.io.cached_dataset.MemoryDataSet") CachedDataSet(MemoryDataSet(), copy_mode="assign") - mocked_memory_data_set.assert_called_once_with(copy_mode="assign") + mocked_memory_dataset.assert_called_once_with(copy_mode="assign") diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index 28fd1ab8a8..e8b153c54e 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -20,7 +20,6 @@ MemoryDataSet, ) from kedro.io.core import VERSION_FORMAT, generate_timestamp -from kedro.versioning import Journal @pytest.fixture @@ -88,12 +87,11 @@ def data_set(filepath): @pytest.fixture -def multi_catalog(mocker): +def multi_catalog(): csv = CSVDataSet(filepath="abc.csv") parq = ParquetDataSet(filepath="xyz.parq") - journal = mocker.Mock() layers = {"raw": {"abc.csv"}, "model": {"xyz.parq"}} - return DataCatalog({"abc": csv, "xyz": parq}, journal=journal, layers=layers) + return DataCatalog({"abc": csv, "xyz": parq}, layers=layers) @pytest.fixture @@ -534,16 +532,12 @@ def test_from_sane_config_versioned(self, sane_config, dummy_dataframe): ) version = fmt.format(d=current_ts, ms=current_ts.microsecond // 1000) - journal = Journal({"run_id": "fake-id", "project_path": "fake-path"}) catalog = DataCatalog.from_config( **sane_config, load_versions={"boats": version}, save_version=version, - journal=journal, ) - assert catalog._journal == journal - catalog.save("boats", dummy_dataframe) path = Path(sane_config["catalog"]["boats"]["filepath"]) path = path / version / path.name diff --git a/tests/io/test_data_catalog_with_default.py b/tests/io/test_data_catalog_with_default.py deleted file mode 100644 index aba9e1cae8..0000000000 --- 
a/tests/io/test_data_catalog_with_default.py +++ /dev/null @@ -1,116 +0,0 @@ -import pandas as pd -import pytest - -from kedro.extras.datasets.pandas import CSVDataSet -from kedro.io import DataCatalog, DataCatalogWithDefault, MemoryDataSet - - -@pytest.fixture -def filepath(tmp_path): - return str(tmp_path / "some" / "dir" / "test.csv") - - -@pytest.fixture -def data_set(filepath): - return CSVDataSet(filepath=filepath, save_args={"index": False}) - - -def default_csv(name): - return CSVDataSet(name) - - -@pytest.fixture -def dummy_dataframe(): - return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) - - -@pytest.fixture -def sane_config(filepath): - return { - "catalog": { - "boats": { - "type": "kedro.extras.datasets.pandas.CSVDataSet", - "filepath": filepath, - }, - "cars": { - "type": "kedro.extras.datasets.pandas.CSVDataSet", - "filepath": "s3://test_bucket/test_file.csv", - "credentials": "s3_credentials", - }, - }, - "credentials": { - "s3_credentials": {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} - }, - } - - -def test_load_from_unregistered(dummy_dataframe, tmpdir): - catalog = DataCatalogWithDefault(data_sets={}, default=default_csv) - - path = str(tmpdir.mkdir("sub").join("test.csv")) - catalog.save(path, dummy_dataframe) - reloaded_df = catalog.load(path) - - assert dummy_dataframe.equals(reloaded_df) - - -def test_save_and_load_catalog(data_set, dummy_dataframe, tmpdir): - catalog = DataCatalogWithDefault(data_sets={"test": data_set}, default=default_csv) - - path = str(tmpdir.mkdir("sub").join("test")) - catalog.save(path, dummy_dataframe) - reloaded_df = catalog.load(path) - assert dummy_dataframe.equals(reloaded_df) - - -def test_from_sane_config(sane_config): - with pytest.raises( - ValueError, match="Cannot instantiate a `DataCatalogWithDefault`" - ): - DataCatalogWithDefault.from_config( - sane_config["catalog"], sane_config["credentials"] - ) - - -def test_from_sane_config_default(sane_config, dummy_dataframe, tmpdir): - catalog = DataCatalog.from_config( - sane_config["catalog"], sane_config["credentials"] - ) - catalog_with_default = DataCatalogWithDefault.from_data_catalog( - catalog, default_csv - ) - path = str(tmpdir.mkdir("sub").join("missing.csv")) - catalog_with_default.save(path, dummy_dataframe) - reloaded_df = catalog_with_default.load(path) - assert dummy_dataframe.equals(reloaded_df) - - -def test_default_none(): - with pytest.raises( - TypeError, - match="Default must be a callable with a " - "single input string argument: the " - "key of the requested data set.", - ): - DataCatalogWithDefault(data_sets={}, default=None) - - -# pylint: disable=unused-argument -def default_memory(name): - return MemoryDataSet(5) - - -def test_remember_load(): - catalog = DataCatalogWithDefault( - data_sets={}, default=default_memory, remember=True - ) - assert catalog.load("any") == 5 - assert "any" in catalog.list() - - -def test_remember_save(tmpdir, dummy_dataframe): - catalog = DataCatalogWithDefault(data_sets={}, default=default_csv, remember=True) - - path = str(tmpdir.mkdir("sub").join("test.csv")) - catalog.save(path, dummy_dataframe) - assert tmpdir.join("sub").join("test.csv") in catalog.list() diff --git a/tests/io/test_lambda_data_set.py b/tests/io/test_lambda_dataset.py similarity index 92% rename from tests/io/test_lambda_data_set.py rename to tests/io/test_lambda_dataset.py index 82d2411e38..7aed9ff3ee 100644 --- a/tests/io/test_lambda_data_set.py +++ b/tests/io/test_lambda_dataset.py @@ -28,25 +28,25 @@ def _dummy_exists(): def 
_dummy_release(): pass # pragma: no cover - assert "LambdaDataSet(load=)" in str( + assert "LambdaDataSet(load=)" in str( LambdaDataSet(_dummy_load, None) ) - assert "LambdaDataSet(save=)" in str( + assert "LambdaDataSet(save=)" in str( LambdaDataSet(None, _dummy_save) ) - assert "LambdaDataSet(exists=)" in str( + assert "LambdaDataSet(exists=)" in str( LambdaDataSet(None, None, _dummy_exists) ) assert ( - "LambdaDataSet(release=)" + "LambdaDataSet(release=)" in str(LambdaDataSet(None, None, None, _dummy_release)) ) # __init__ keys alphabetically sorted, None values not shown expected = ( - "LambdaDataSet(exists=, " - "load=, " - "save=)" + "LambdaDataSet(exists=, " + "load=, " + "save=)" ) actual = str(LambdaDataSet(_dummy_load, _dummy_save, _dummy_exists, None)) assert actual == expected diff --git a/tests/io/test_memory_data_set.py b/tests/io/test_memory_dataset.py similarity index 82% rename from tests/io/test_memory_data_set.py rename to tests/io/test_memory_dataset.py index 83491cc8d5..90118b1c69 100644 --- a/tests/io/test_memory_data_set.py +++ b/tests/io/test_memory_dataset.py @@ -6,7 +6,7 @@ import pytest from kedro.io import DataSetError, MemoryDataSet -from kedro.io.memory_data_set import _copy_with_mode, _infer_copy_mode +from kedro.io.memory_dataset import _copy_with_mode, _infer_copy_mode def _update_data(data, idx, jdx, value): @@ -48,24 +48,24 @@ def new_data(): @pytest.fixture -def memory_data_set(input_data): +def memory_dataset(input_data): return MemoryDataSet(data=input_data) @pytest.fixture def mocked_infer_mode(mocker): - return mocker.patch("kedro.io.memory_data_set._infer_copy_mode") + return mocker.patch("kedro.io.memory_dataset._infer_copy_mode") @pytest.fixture def mocked_copy_with_mode(mocker): - return mocker.patch("kedro.io.memory_data_set._copy_with_mode") + return mocker.patch("kedro.io.memory_dataset._copy_with_mode") class TestMemoryDataSet: - def test_load(self, memory_data_set, input_data): + def test_load(self, memory_dataset, input_data): """Test basic load""" - loaded_data = memory_data_set.load() + loaded_data = memory_dataset.load() assert _check_equals(loaded_data, input_data) def test_load_none(self): @@ -73,10 +73,10 @@ def test_load_none(self): assert loaded_data is None def test_load_infer_mode( - self, memory_data_set, input_data, mocked_infer_mode, mocked_copy_with_mode + self, memory_dataset, input_data, mocked_infer_mode, mocked_copy_with_mode ): """Test load calls infer_mode and copy_mode_with""" - memory_data_set.load() + memory_dataset.load() assert mocked_infer_mode.call_count == 1 assert mocked_copy_with_mode.call_count == 1 @@ -87,18 +87,18 @@ def test_load_infer_mode( assert mocked_copy_with_mode.call_args[0] assert _check_equals(mocked_copy_with_mode.call_args[0][0], input_data) - def test_save(self, memory_data_set, input_data, new_data): + def test_save(self, memory_dataset, input_data, new_data): """Test overriding the data set""" - memory_data_set.save(data=new_data) - reloaded = memory_data_set.load() + memory_dataset.save(data=new_data) + reloaded = memory_dataset.load() assert not _check_equals(reloaded, input_data) assert _check_equals(reloaded, new_data) def test_save_infer_mode( - self, memory_data_set, new_data, mocked_infer_mode, mocked_copy_with_mode + self, memory_dataset, new_data, mocked_infer_mode, mocked_copy_with_mode ): """Test save calls infer_mode and copy_mode_with""" - memory_data_set.save(data=new_data) + memory_dataset.save(data=new_data) assert mocked_infer_mode.call_count == 1 assert 
mocked_copy_with_mode.call_count == 1 @@ -109,28 +109,28 @@ def test_save_infer_mode( assert mocked_copy_with_mode.call_args[0] assert _check_equals(mocked_copy_with_mode.call_args[0][0], new_data) - def test_load_modify_original_data(self, memory_data_set, input_data): + def test_load_modify_original_data(self, memory_dataset, input_data): """Check that the data set object is not updated when the original object is changed.""" input_data = _update_data(input_data, 1, 1, -5) - assert not _check_equals(memory_data_set.load(), input_data) + assert not _check_equals(memory_dataset.load(), input_data) - def test_save_modify_original_data(self, memory_data_set, new_data): + def test_save_modify_original_data(self, memory_dataset, new_data): """Check that the data set object is not updated when the original object is changed.""" - memory_data_set.save(new_data) + memory_dataset.save(new_data) new_data = _update_data(new_data, 1, 1, "new value") - assert not _check_equals(memory_data_set.load(), new_data) + assert not _check_equals(memory_dataset.load(), new_data) @pytest.mark.parametrize( "input_data", ["dummy_dataframe", "dummy_numpy_array"], indirect=True ) - def test_load_returns_new_object(self, memory_data_set, input_data): + def test_load_returns_new_object(self, memory_dataset, input_data): """Test that consecutive loads point to different objects in case of a pandas DataFrame and numpy array""" - loaded_data = memory_data_set.load() - reloaded_data = memory_data_set.load() + loaded_data = memory_dataset.load() + reloaded_data = memory_dataset.load() assert _check_equals(loaded_data, input_data) assert _check_equals(reloaded_data, input_data) assert loaded_data is not reloaded_data @@ -161,9 +161,9 @@ def test_saving_none(self): ], indirect=["input_data"], ) - def test_str_representation(self, memory_data_set, input_data, expected): + def test_str_representation(self, memory_dataset, input_data, expected): """Test string representation of the data set""" - assert expected in str(memory_data_set) + assert expected in str(memory_dataset) def test_exists(self, new_data): """Test `exists` method invocation""" diff --git a/tests/io/test_partitioned_dataset.py b/tests/io/test_partitioned_dataset.py index 972ac4472e..800ee407e3 100644 --- a/tests/io/test_partitioned_dataset.py +++ b/tests/io/test_partitioned_dataset.py @@ -12,7 +12,7 @@ from kedro.extras.datasets.pandas import CSVDataSet, ParquetDataSet from kedro.io import DataSetError, PartitionedDataSet from kedro.io.data_catalog import CREDENTIALS_KEY -from kedro.io.partitioned_data_set import KEY_PROPAGATION_WARNING +from kedro.io.partitioned_dataset import KEY_PROPAGATION_WARNING @pytest.fixture diff --git a/tests/io/test_transformers.py b/tests/io/test_transformers.py deleted file mode 100644 index 97c4469889..0000000000 --- a/tests/io/test_transformers.py +++ /dev/null @@ -1,144 +0,0 @@ -import re -from typing import Any, Callable, Dict - -import pytest - -from kedro.io import AbstractDataSet, DataCatalog, DataSetNotFoundError -from kedro.io.transformers import AbstractTransformer - - -class FakeDataSet(AbstractDataSet): - def __init__(self, data): - self.log = [] - self.data = data - - def _load(self) -> Any: - self.log.append(("load", self.data)) - return self.data - - def _save(self, data: Any) -> None: - self.log.append(("save", data)) - self.data = data - - def _describe(self) -> Dict[str, Any]: - return {"data": self.data} - - -class NoopTransformer(AbstractTransformer): - pass - - -class FakeTransformer(AbstractTransformer): - def 
__init__(self): - self.log = [] - - def load(self, data_set_name: str, load: Callable[[], Any]) -> Any: - res = load() - self.log.append(("load", res)) - return res + 1 - - def save(self, data_set_name: str, save: Callable[[Any], None], data: Any) -> None: - self.log.append(("save", data)) - save(data + 1) - - -@pytest.fixture -def fake_data_set(): - return FakeDataSet(123) - - -@pytest.fixture -def fake_transformer(): - return FakeTransformer() - - -@pytest.fixture -def catalog(fake_data_set): - return DataCatalog({"test": fake_data_set}) - - -class TestTransformers: - def test_noop(self, fake_data_set, catalog): - catalog.add_transformer(NoopTransformer()) - - catalog.save("test", 42) - assert catalog.load("test") == 42 - assert fake_data_set.log == [("save", 42), ("load", 42)] - - def test_basic(self, fake_data_set, catalog, fake_transformer): - catalog.add_transformer(fake_transformer) - - catalog.save("test", 42) - assert catalog.load("test") == 44 - assert fake_data_set.log == [("save", 43), ("load", 43)] - assert fake_transformer.log == [("save", 42), ("load", 43)] - - def test_copy(self, fake_data_set, catalog, fake_transformer): - catalog.add_transformer(fake_transformer) - catalog = catalog.shallow_copy() - - catalog.save("test", 42) - assert catalog.load("test") == 44 - assert fake_data_set.log == [("save", 43), ("load", 43)] - assert fake_transformer.log == [("save", 42), ("load", 43)] - - def test_specific(self, fake_data_set, catalog, fake_transformer): - catalog.add_transformer(fake_transformer, "test") - - catalog.save("test", 42) - assert catalog.load("test") == 44 - assert fake_data_set.log == [("save", 43), ("load", 43)] - assert fake_transformer.log == [("save", 42), ("load", 43)] - - def test_specific_list(self, fake_data_set, catalog, fake_transformer): - catalog.add_transformer(fake_transformer, ["test"]) - - catalog.save("test", 42) - assert catalog.load("test") == 44 - assert fake_data_set.log == [("save", 43), ("load", 43)] - assert fake_transformer.log == [("save", 42), ("load", 43)] - - def test_not_found_error(self, fake_transformer): - catalog = DataCatalog() - - with pytest.raises(DataSetNotFoundError): - catalog.add_transformer(fake_transformer, "test") - - def test_not_found_error_in_constructor(self): - with pytest.raises(DataSetNotFoundError): - DataCatalog(transformers={"test": []}) - - def test_all_before_adding(self, fake_data_set, fake_transformer): - catalog = DataCatalog() - catalog.add_transformer(fake_transformer) - catalog.add("test", fake_data_set) - - catalog.save("test", 42) - assert catalog.load("test") == 44 - assert fake_data_set.log == [("save", 43), ("load", 43)] - assert fake_transformer.log == [("save", 42), ("load", 43)] - - def test_all_before_copy_and_add(self, fake_data_set, fake_transformer): - catalog = DataCatalog() - catalog.add_transformer(fake_transformer) - catalog = catalog.shallow_copy() - catalog.add("test", fake_data_set) - - catalog.save("test", 42) - assert catalog.load("test") == 44 - assert fake_data_set.log == [("save", 43), ("load", 43)] - assert fake_transformer.log == [("save", 42), ("load", 43)] - - def test_add_bad_transformer(self, catalog): - with pytest.raises(TypeError, match="not an instance of AbstractTransformer"): - catalog.add_transformer(object) - - def test_deprecation_warning(self, catalog, fake_transformer): - pattern = ( - "The transformer API will be deprecated in Kedro 0.18.0." - "Please use Dataset Hooks to customise the load and save methods." 
- "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/02_hooks.html" - ) - with pytest.warns(DeprecationWarning, match=re.escape(pattern)): - catalog.add_transformer(fake_transformer) diff --git a/tests/pipeline/test_decorators.py b/tests/pipeline/test_decorators.py deleted file mode 100644 index 25b7863376..0000000000 --- a/tests/pipeline/test_decorators.py +++ /dev/null @@ -1,79 +0,0 @@ -import logging -from functools import partial -from time import sleep - -import pytest - -from kedro.io import DataCatalog -from kedro.pipeline import Pipeline, node -from kedro.pipeline.decorators import _human_readable_time, log_time -from kedro.runner import SequentialRunner - - -def sleeping_identity(inp): - sleep(0.1) - return inp - - -def identity(arg): - return arg - - -@pytest.mark.parametrize( - "elapsed,expected", - [(3600.1, "1h00m00s"), (3599.0, "59m59s"), (59, "59.00s"), (0.1, "100ms")], -) -def test_human_readable_time(elapsed, expected): - message = _human_readable_time(elapsed) - assert message == expected - - -def test_log_time(caplog): - caplog.clear() - func = log_time(sleeping_identity) - res = func(1) - - logger_name, severity, message = caplog.record_tuples[0] - assert res == 1 - assert logger_name == "kedro.pipeline.decorators" - assert severity == logging.INFO - expected = ( - f"Running '{sleeping_identity.__module__}.{sleeping_identity.__qualname__}' " - f"took" - ) - assert expected in message - - -def test_log_time_no_module(caplog): - """When func module is not defined, function full name is not logged.""" - - def no_module(arg): - return sleeping_identity(arg) - - no_module.__module__ = None - - caplog.clear() - func = log_time(no_module) - res = func(1) - - logger_name, severity, message = caplog.record_tuples[0] - assert res == 1 - assert logger_name == "kedro.pipeline.decorators" - assert severity == logging.INFO - expected = f"Running {no_module.__qualname__!r} took" - assert expected in message - - -def test_log_time_with_partial(recwarn): - pipeline = Pipeline( - [node(partial(identity, 1), None, "output", name="identity1")] - ).decorate(log_time) - catalog = DataCatalog({}, dict(number=1)) - result = SequentialRunner().run(pipeline, catalog) - assert result["output"] == 1 - warning = recwarn.pop(UserWarning) - assert ( - "The node producing outputs `['output']` is made from a " - "`partial` function. Partial functions do not have a " - "`__name__` attribute" in str(warning.message) - ) diff --git a/tests/pipeline/test_pipeline_helper.py b/tests/pipeline/test_modular_pipeline.py similarity index 84% rename from tests/pipeline/test_pipeline_helper.py rename to tests/pipeline/test_modular_pipeline.py index 7173a2bcbe..8f2da09126 100644 --- a/tests/pipeline/test_pipeline_helper.py +++ b/tests/pipeline/test_modular_pipeline.py @@ -18,6 +18,10 @@ def biconcat(input1: str, input2: str): return input1 + input2 # pragma: no cover +def triconcat(input1: str, input2: str, input3: str): + return input1 + input2 + input3 # pragma: no cover + + class TestPipelineHelper: def test_transform_dataset_names(self): """ @@ -116,7 +120,7 @@ def test_prefix_exclude_free_inputs(self, inputs, outputs): def test_transform_params_prefix_and_parameters(self): """ - Test that transform should skip `params:` and `parameters`: str, list and dict. + Test that transform should prefix all parameters by default. 
""" raw_pipeline = Pipeline( [ @@ -133,12 +137,12 @@ def test_transform_params_prefix_and_parameters(self): resulting_pipeline = pipeline(raw_pipeline, namespace="PREFIX") nodes = sorted(resulting_pipeline.nodes) assert nodes[0]._inputs == "parameters" - assert nodes[0]._outputs == "params:B" + assert nodes[0]._outputs == "params:PREFIX.B" - assert nodes[1]._inputs == ["params:C", "PREFIX.D"] + assert nodes[1]._inputs == ["params:PREFIX.C", "PREFIX.D"] assert nodes[1]._outputs == ["parameters", "PREFIX.F"] - assert nodes[2]._inputs == {"input1": "params:H", "input2": "parameters"} + assert nodes[2]._inputs == {"input1": "params:PREFIX.H", "input2": "parameters"} assert nodes[2]._outputs == {"K": "PREFIX.L"} assert nodes[2].name == "PREFIX.node3" @@ -227,12 +231,10 @@ def test_node_properties_preserved(self): Also an explicitly defined name should get prefixed. """ raw_pipeline = Pipeline([node(identity, "A", "B", name="node1", tags=["tag1"])]) - raw_pipeline = raw_pipeline.decorate(lambda: None) resulting_pipeline = pipeline(raw_pipeline, namespace="PREFIX") assert resulting_pipeline.nodes[0].name == "PREFIX.node1" assert resulting_pipeline.nodes[0].tags == {"tag1"} - assert len(resulting_pipeline.nodes[0]._decorators) == 1 def test_default_node_name_is_namespaced(self): """Check that auto-generated node names are also namespaced""" @@ -273,10 +275,39 @@ def test_expose_intermediate_output(self): assert actual_nodes[2]._inputs == "ACTUAL.C" assert actual_nodes[2]._outputs == "ACTUAL.D" - assert actual_nodes[3]._inputs == ["ACTUAL.D", "params:x"] + assert actual_nodes[3]._inputs == ["ACTUAL.D", "params:ACTUAL.x"] assert actual_nodes[3]._outputs == "ACTUAL.X" - def test_parameters_updated(self): + def test_parameters_left_intact_when_defined_as_str(self): + raw_pipeline = Pipeline([node(biconcat, ["A", "params:x"], "AA", name="node1")]) + resulting_pipeline = pipeline( + raw_pipeline, outputs={"AA": "B"}, parameters="x", namespace="PREFIX" + ) + actual_nodes = resulting_pipeline.nodes + + assert actual_nodes[0]._inputs == ["PREFIX.A", "params:x"] + assert actual_nodes[0]._outputs == "B" + + @pytest.mark.parametrize( + "parameters", ["params:x", {"params:x"}, {"params:x": "params:x"}] + ) + def test_parameters_left_intact_when_defined_as_(self, parameters): + raw_pipeline = Pipeline( + [node(triconcat, ["A", "params:x", "params:y"], "AA", name="node1")] + ) + resulting_pipeline = pipeline( + raw_pipeline, + outputs={"AA": "B"}, + parameters=parameters, + namespace="PREFIX", + ) + actual_nodes = resulting_pipeline.nodes + + # x is left intact because it's defined in parameters but y is namespaced + assert actual_nodes[0]._inputs == ["PREFIX.A", "params:x", "params:PREFIX.y"] + assert actual_nodes[0]._outputs == "B" + + def test_parameters_updated_with_dict(self): raw_pipeline = Pipeline( [ node(biconcat, ["A", "params:x"], "AA", name="node1"), @@ -287,20 +318,36 @@ def test_parameters_updated(self): resulting_pipeline = pipeline( raw_pipeline, outputs={"B": "B_new"}, - parameters={"params:x": "params:y"}, + parameters={"x": "X"}, namespace="ACTUAL", ) actual_nodes = resulting_pipeline.nodes - assert actual_nodes[0]._inputs == ["ACTUAL.A", "params:y"] + assert actual_nodes[0]._inputs == ["ACTUAL.A", "params:X"] assert actual_nodes[0]._outputs == "ACTUAL.AA" - assert actual_nodes[1]._inputs == ["ACTUAL.AA", "params:y"] + assert actual_nodes[1]._inputs == ["ACTUAL.AA", "params:ACTUAL.y"] assert actual_nodes[1]._outputs == "B_new" - assert actual_nodes[2]._inputs == ["B_new", "params:y"] + assert 
actual_nodes[2]._inputs == ["B_new", "params:X"] assert actual_nodes[2]._outputs == "ACTUAL.BB" + def test_parameters_defined_with_params_prefix(self): + raw_pipeline = Pipeline( + [node(triconcat, ["A", "params:x", "params:y"], "AA", name="node1")] + ) + resulting_pipeline = pipeline( + raw_pipeline, + outputs={"AA": "B"}, + parameters={"params:x"}, + namespace="PREFIX", + ) + actual_nodes = resulting_pipeline.nodes + + # x is left intact because it's defined in parameters but y is namespaced + assert actual_nodes[0]._inputs == ["PREFIX.A", "params:x", "params:PREFIX.y"] + assert actual_nodes[0]._outputs == "B" + def test_parameters_specified_under_inputs(self): raw_pipeline = Pipeline( [ @@ -326,7 +373,7 @@ def test_non_existent_parameters_mapped(self): pattern = r"Failed to map datasets and/or parameters: params:beta" with pytest.raises(ModularPipelineError, match=pattern): - pipeline(raw_pipeline, parameters={"params:beta": "params:gamma"}) + pipeline(raw_pipeline, parameters={"beta": "gamma"}) pattern = r"Failed to map datasets and/or parameters: parameters" with pytest.raises(ModularPipelineError, match=pattern): diff --git a/tests/pipeline/test_node.py b/tests/pipeline/test_node.py index 487ebab38a..8291d5f391 100644 --- a/tests/pipeline/test_node.py +++ b/tests/pipeline/test_node.py @@ -347,57 +347,17 @@ def test_bad_input(func, expected): def apply_f(func: Callable) -> Callable: @wraps(func) def with_f(*args, **kwargs): - return func(*(f"f({a})" for a in args), **kwargs) + return func(*(f"f({a})" for a in args), **kwargs) # pragma: no cover return with_f -def apply_g(func: Callable) -> Callable: - @wraps(func) - def with_g(*args, **kwargs): - return func(*(f"g({a})" for a in args), **kwargs) - - return with_g - - -def apply_h(func: Callable) -> Callable: - @wraps(func) - def with_h(*args, **kwargs): - return func(*(f"h({a})" for a in args), **kwargs) - - return with_h - - -def apply_ij(func: Callable) -> Callable: - @wraps(func) - def with_ij(*args, **kwargs): - return func(*(f"ij({a})" for a in args), **kwargs) - - return with_ij - - @apply_f def decorated_identity(value): - return value - - -class TestTagDecorator: - def test_apply_decorators(self): - old_node = node(apply_g(decorated_identity), "input", "output", name="node") - pattern = ( - "The node's `decorate` API will be deprecated in Kedro 0.18.0." - "Please use a node's Hooks to extend the node's behaviour in a pipeline." 
- "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/02_hooks.html" - ) - with pytest.warns(DeprecationWarning, match=re.escape(pattern)): - new_node = old_node.decorate(apply_h, apply_ij) - result = new_node.run(dict(input=1)) + return value # pragma: no cover - assert old_node.name == new_node.name - assert "output" in result - assert result["output"] == "f(g(ij(h(1))))" +class TestTag: def test_tag_nodes(self): tagged_node = node(identity, "input", "output", tags=["hello"]).tag(["world"]) assert "hello" in tagged_node.tags @@ -408,15 +368,6 @@ def test_tag_nodes_single_tag(self): tagged_node = node(identity, "input", "output", tags="hello").tag("world") assert "hello" in tagged_node.tags assert "world" in tagged_node.tags - assert len(tagged_node.tags) == 2 - - def test_tag_and_decorate(self): - tagged_node = node(identity, "input", "output", tags=["hello"]) - tagged_node = tagged_node.decorate(apply_f) - tagged_node = tagged_node.tag(["world"]) - assert "hello" in tagged_node.tags - assert "world" in tagged_node.tags - assert tagged_node.run(dict(input=1))["output"] == "f(1)" class TestNames: diff --git a/tests/pipeline/test_node_run.py b/tests/pipeline/test_node_run.py index b817e0422c..fe3fd7d0f1 100644 --- a/tests/pipeline/test_node_run.py +++ b/tests/pipeline/test_node_run.py @@ -44,7 +44,7 @@ def test_valid_nodes(valid_nodes_with_inputs): def test_run_got_dataframe(mocked_dataset): """Check an exception when non-dictionary (class object) is passed.""" pattern = r"Node.run\(\) expects a dictionary or None, " - pattern += r"but got instead" + pattern += r"but got instead" with pytest.raises(ValueError, match=pattern): node(one_in_one_out, dict(arg="ds1"), "A").run(mocked_dataset) diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index cae3234bc4..5b5cfbecd2 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1,12 +1,9 @@ import re -from functools import wraps from itertools import chain -from typing import Callable import pytest import kedro -from kedro.io import DataCatalog from kedro.pipeline import Pipeline, node from kedro.pipeline.pipeline import ( CircularDependencyError, @@ -15,7 +12,6 @@ _strip_transcoding, _transcode_split, ) -from kedro.runner import SequentialRunner class TestTranscodeHelpers: @@ -83,17 +79,17 @@ def pipeline_list_with_lists(): node(identity, "F", ["G", "M"], name="node3"), node(identity, "E", ["F", "H"], name="node4"), node(identity, "D", None, name="node5"), - node(identity, "C", "D", name="node6"), - node(identity, "B", ["C", "E"], name="node7"), + node(identity, "C", "D", name="node6", tags=["foo"]), + node(identity, "B", ["C", "E"], name="node7", tags=["foo"]), node(identity, "A", ["B", "L"], name="node8"), node(constant_output, None, "A", name="node9"), ], "expected": [ {node(constant_output, None, "A", name="node9")}, {node(identity, "A", ["B", "L"], name="node8")}, - {node(identity, "B", ["C", "E"], name="node7")}, + {node(identity, "B", ["C", "E"], name="node7", tags=["foo"])}, { - node(identity, "C", "D", name="node6"), + node(identity, "C", "D", name="node6", tags=["foo"]), node(identity, "E", ["F", "H"], name="node4"), }, { @@ -117,17 +113,17 @@ def pipeline_with_dicts(): node(identity, "F", dict(M="M", N="G"), name="node3"), node(identity, "E", dict(O="F", P="H"), name="node4"), # NOQA node(identity, dict(input1="D"), None, name="node5"), - node(identity, "C", "D", name="node6"), - node(identity, "B", dict(P="C", Q="E"), name="node7"), 
+ node(identity, "C", "D", name="node6", tags=["foo"]), + node(identity, "B", dict(P="C", Q="E"), name="node7", tags=["foo"]), node(identity, "A", dict(R="B", S="L"), name="node8"), node(constant_output, None, "A", name="node9"), ], "expected": [ {node(constant_output, None, "A", name="node9")}, {node(identity, "A", dict(R="B", S="L"), name="node8")}, - {node(identity, "B", dict(P="C", Q="E"), name="node7")}, + {node(identity, "B", dict(P="C", Q="E"), name="node7", tags=["foo"])}, { - node(identity, "C", "D", name="node6"), + node(identity, "C", "D", name="node6", tags=["foo"]), node(identity, "E", dict(O="F", P="H"), name="node4"), # NOQA }, { @@ -219,6 +215,13 @@ def str_node_inputs_list(): } +@pytest.fixture +def complex_pipeline(pipeline_list_with_lists): + nodes = pipeline_list_with_lists["nodes"] + pipeline = Pipeline(nodes) + return pipeline + + @pytest.fixture( params=[ "branchless_pipeline", @@ -253,43 +256,6 @@ def test_grouped_nodes(self, input_data): # Check each grouped node matches with expected group assert all(g == e for g, e in zip(grouped, expected)) - @pytest.mark.parametrize( - "target_node_names", [["node2", "node3", "node4", "node8"], ["node1"]] - ) - def test_only_nodes(self, target_node_names, pipeline_list_with_lists): - full = Pipeline(pipeline_list_with_lists["nodes"]) - partial = full.only_nodes(*target_node_names) - target_list = list(target_node_names) - names = map(lambda node_: node_.name, partial.nodes) - assert sorted(names) == sorted(target_list) - - @pytest.mark.parametrize( - "target_namespace,expected_namespaces", - [ - ("katie", ["katie", "katie.lisa", "katie.lisa.john"]), - ("lisa", ["lisa", "lisa.john"]), - ("john", ["john"]), - ("katie.lisa", ["katie.lisa", "katie.lisa.john"]), - ("katie.lisa.john", ["katie.lisa.john"]), - ], - ) - def test_only_nodes_with_namespace(self, target_namespace, expected_namespaces): - pipeline = Pipeline( - [ - node(identity, "A", "B", namespace="katie"), - node(identity, "B", "C", namespace="lisa"), - node(identity, "C", "D", namespace="john"), - node(identity, "D", "E", namespace="katie.lisa"), - node(identity, "E", "F", namespace="lisa.john"), - node(identity, "F", "G", namespace="katie.lisa.john"), - ] - ) - resulting_pipeline = pipeline.only_nodes_with_namespace(target_namespace) - for actual_node, expected_namespace in zip( - sorted(resulting_pipeline.nodes), expected_namespaces - ): - assert actual_node.namespace == expected_namespace - def test_free_input(self, input_data): nodes = input_data["nodes"] inputs = input_data["free_inputs"] @@ -306,6 +272,161 @@ def test_outputs(self, input_data): assert pipeline.outputs() == set(outputs) + def test_empty_case(self): + """Empty pipeline is possible""" + Pipeline([]) + + def test_initialized_with_tags(self): + pipeline = Pipeline( + [node(identity, "A", "B", tags=["node1", "p1"]), node(identity, "B", "C")], + tags=["p1", "p2"], + ) + + node1 = pipeline.grouped_nodes[0].pop() + node2 = pipeline.grouped_nodes[1].pop() + assert node1.tags == {"node1", "p1", "p2"} + assert node2.tags == {"p1", "p2"} + + def test_node_dependencies(self, complex_pipeline): + expected = { + "node1": {"node2", "node3", "node4"}, + "node2": {"node4"}, + "node3": {"node4"}, + "node4": {"node7"}, + "node5": {"node6"}, + "node6": {"node7"}, + "node7": {"node8"}, + "node8": {"node9"}, + "node9": set(), + } + actual = { + child.name: {parent.name for parent in parents} + for child, parents in complex_pipeline.node_dependencies.items() + } + assert actual == expected + + +@pytest.fixture +def 
pipeline_with_circle(): + return [ + node(identity, "A", "B", name="node1"), + node(identity, "B", "C", name="node2"), + node(identity, "C", "A", name="node3"), # circular dependency + ] + + +@pytest.fixture +def non_unique_node_outputs(): + return [ + node(identity, "A", ["B", "C"], name="node1"), + node(identity, "C", ["D", "E", "F"], name="node2"), + # D, E non-unique + node(identity, "B", dict(out1="D", out2="E"), name="node3"), + node(identity, "D", ["E"], name="node4"), # E non-unique + ] + + +class TestInvalidPipeline: + def test_circle_case(self, pipeline_with_circle): + pattern = "Circular dependencies" + with pytest.raises(CircularDependencyError, match=pattern): + Pipeline(pipeline_with_circle) + + def test_unique_outputs(self, non_unique_node_outputs): + with pytest.raises(OutputNotUniqueError, match=r"\['D', 'E'\]"): + Pipeline(non_unique_node_outputs) + + def test_none_case(self): + with pytest.raises(ValueError, match="is None"): + Pipeline(None) + + def test_duplicate_free_nodes(self): + pattern = ( + "Pipeline nodes must have unique names. The following node " + "names appear more than once:\n\nFree nodes:\n - same_name" + ) + with pytest.raises(ValueError, match=re.escape(pattern)): + Pipeline( + [ + node(identity, "in1", "out1", name="same_name"), + node(identity, "in2", "out2", name="same_name"), + ] + ) + + pipeline = Pipeline([node(identity, "in1", "out1", name="same_name")]) + another_node = node(identity, "in2", "out2", name="same_name") + with pytest.raises(ValueError, match=re.escape(pattern)): + # 'pipeline' passes the check, 'another_node' doesn't + Pipeline([pipeline, another_node]) + + def test_duplicate_nodes_in_pipelines(self): + pipeline = Pipeline( + [node(biconcat, ["input", "input1"], ["output", "output1"], name="node")] + ) + pattern = ( + r"Pipeline nodes must have unique names\. The following node " + r"names appear more than once\:\n\nPipeline\(\[.+\]\)\:\n \- node" + ) + with pytest.raises(ValueError, match=pattern): + # the first 'pipeline' passes the check, the second doesn't + Pipeline([pipeline, pipeline]) + + another_node = node(identity, "in1", "out1", name="node") + with pytest.raises(ValueError, match=pattern): + # 'another_node' passes the check, 'pipeline' doesn't + Pipeline([another_node, pipeline]) + + def test_bad_combine_node(self): + """Node cannot be combined to pipeline.""" + fred = node(identity, "input", "output") + pipeline = Pipeline([fred]) + with pytest.raises(TypeError): + pipeline + fred # pylint: disable=pointless-statement + + def test_bad_combine_int(self): + """int cannot be combined to pipeline, tests __radd__""" + fred = node(identity, "input", "output") + pipeline = Pipeline([fred]) + with pytest.raises(TypeError): + _ = 1 + pipeline + + def test_conflicting_names(self): + """Node names must be unique.""" + pipeline1 = Pipeline( + [node(biconcat, ["input", "input1"], ["output1"], name="a")] + ) + new_pipeline = Pipeline( + [node(biconcat, ["input", "input1"], ["output2"], name="a")] + ) + pattern = ( + "Pipeline nodes must have unique names. 
The following node names " + "appear more than once:\n\nFree nodes:\n - a" + ) + with pytest.raises(ValueError, match=re.escape(pattern)): + pipeline1 + new_pipeline # pylint: disable=pointless-statement + + def test_conflicting_outputs(self): + """Node outputs must be unique.""" + pipeline1 = Pipeline( + [node(biconcat, ["input", "input1"], ["output", "output1"], name="a")] + ) + new_pipeline = Pipeline( + [node(biconcat, ["input", "input2"], ["output", "output2"], name="b")] + ) + with pytest.raises(OutputNotUniqueError, match=r"\['output'\]"): + pipeline1 + new_pipeline # pylint: disable=pointless-statement + + def test_duplicate_node_confirms(self): + """Test that non-unique dataset confirms break pipeline concatenation""" + pipeline1 = Pipeline([node(identity, "input1", "output1", confirms="other")]) + pipeline2 = Pipeline( + [node(identity, "input2", "output2", confirms=["other", "output2"])] + ) + with pytest.raises(ConfirmNotUniqueError, match=r"\['other'\]"): + pipeline1 + pipeline2 # pylint: disable=pointless-statement + + +class TestPipelineOperators: def test_combine_add(self): pipeline1 = Pipeline([node(biconcat, ["input", "input1"], "output1", name="a")]) pipeline2 = Pipeline([node(biconcat, ["input", "input2"], "output2", name="b")]) @@ -443,21 +564,6 @@ def test_invalid_union(self): with pytest.raises(TypeError, match=pattern): p | "hello" # pylint: disable=pointless-statement - def test_empty_case(self): - """Empty pipeline is possible""" - Pipeline([]) - - def test_initialized_with_tags(self): - pipeline = Pipeline( - [node(identity, "A", "B", tags=["node1", "p1"]), node(identity, "B", "C")], - tags=["p1", "p2"], - ) - - node1 = pipeline.grouped_nodes[0].pop() - node2 = pipeline.grouped_nodes[1].pop() - assert node1.tags == {"node1", "p1", "p2"} - assert node2.tags == {"p1", "p2"} - def test_node_unique_confirms(self): """Test that unique dataset confirms don't break pipeline concatenation""" pipeline1 = Pipeline([node(identity, "input1", "output1", confirms="output1")]) @@ -466,197 +572,178 @@ def test_node_unique_confirms(self): combined = pipeline1 + pipeline2 + pipeline3 assert len(combined.nodes) == 3 + def test_connected_pipeline(self, disjoint_pipeline): + """Connect two separate pipelines.""" + nodes = disjoint_pipeline["nodes"] + subpipeline = Pipeline(nodes, tags=["subpipeline"]) -def pipeline_with_circle(): - return [ - node(identity, "A", "B", name="node1"), - node(identity, "B", "C", name="node2"), - node(identity, "C", "A", name="node3"), # circular dependency - ] + assert len(subpipeline.inputs()) == 2 + assert len(subpipeline.outputs()) == 2 + pipeline = Pipeline( + [node(identity, "C", "D", name="connecting_node"), subpipeline], tags="main" + ) -def non_unique_node_outputs(): - return [ - node(identity, "A", ["B", "C"], name="node1"), - node(identity, "C", ["D", "E", "F"], name="node2"), - # D, E non-unique - node(identity, "B", dict(out1="D", out2="E"), name="node3"), - node(identity, "D", ["E"], name="node4"), # E non-unique - ] + assert len(pipeline.nodes) == 1 + len(nodes) + assert len(pipeline.inputs()) == 1 + assert len(pipeline.outputs()) == 1 -class TestInvalidPipeline: - def test_circle_case(self): - pattern = "Circular dependencies" - with pytest.raises(CircularDependencyError, match=pattern): - Pipeline(pipeline_with_circle()) +class TestPipelineDescribe: + def test_names_only(self, str_node_inputs_list): + pipeline = Pipeline(str_node_inputs_list["nodes"]) + description = pipeline.describe() - def test_unique_outputs(self): - with 
pytest.raises(OutputNotUniqueError, match=r"\['D', 'E'\]"): - Pipeline(non_unique_node_outputs()) + desc = description.split("\n") + test_desc = [ + "#### Pipeline execution order ####", + "Inputs: input1, input2", + "", + "node1", + "node2", + "", + "Outputs: input4", + "##################################", + ] - def test_none_case(self): - with pytest.raises(ValueError, match="is None"): - Pipeline(None) + assert len(desc) == len(test_desc) + for res, example in zip(desc, test_desc): + assert res == example - @pytest.mark.parametrize( - "target_node_names", [["node2", "node3", "node4", "NaN"], ["invalid"]] - ) - def test_only_nodes_missing(self, pipeline_list_with_lists, target_node_names): - pattern = r"Pipeline does not contain nodes" - full = Pipeline(pipeline_list_with_lists["nodes"]) - with pytest.raises(ValueError, match=pattern): - full.only_nodes(*target_node_names) - - @pytest.mark.parametrize("namespace", ["katie", None]) - def test_only_nodes_with_namespace_empty(self, namespace): - pipeline = Pipeline([node(identity, "A", "B", namespace=namespace)]) - pattern = r"Pipeline does not contain nodes" - with pytest.raises(ValueError, match=pattern): - pipeline.only_nodes_with_namespace("non_existent") - - def test_duplicate_free_nodes(self): - pattern = ( - "Pipeline nodes must have unique names. The following node " - "names appear more than once:\n\nFree nodes:\n - same_name" - ) - with pytest.raises(ValueError, match=re.escape(pattern)): - Pipeline( - [ - node(identity, "in1", "out1", name="same_name"), - node(identity, "in2", "out2", name="same_name"), - ] - ) + def test_full(self, str_node_inputs_list): + pipeline = Pipeline(str_node_inputs_list["nodes"]) + description = pipeline.describe(names_only=False) - pipeline = Pipeline([node(identity, "in1", "out1", name="same_name")]) - another_node = node(identity, "in2", "out2", name="same_name") - with pytest.raises(ValueError, match=re.escape(pattern)): - # 'pipeline' passes the check, 'another_node' doesn't - Pipeline([pipeline, another_node]) + desc = description.split("\n") + test_desc = [ + "#### Pipeline execution order ####", + "Inputs: input1, input2", + "", + "node1: biconcat([input1,input2]) -> [input3]", + "node2: identity([input3]) -> [input4]", + "", + "Outputs: input4", + "##################################", + ] - def test_duplicate_nodes_in_pipelines(self): - pipeline = Pipeline( - [node(biconcat, ["input", "input1"], ["output", "output1"], name="node")] - ) - pattern = ( - r"Pipeline nodes must have unique names\. 
The following node " - r"names appear more than once\:\n\nPipeline\(\[.+\]\)\:\n \- node" - ) - with pytest.raises(ValueError, match=pattern): - # the first 'pipeline' passes the check, the second doesn't - Pipeline([pipeline, pipeline]) + assert len(desc) == len(test_desc) + for res, example in zip(desc, test_desc): + assert res == example - another_node = node(identity, "in1", "out1", name="node") - with pytest.raises(ValueError, match=pattern): - # 'another_node' passes the check, 'pipeline' doesn't - Pipeline([another_node, pipeline]) - def test_bad_combine_node(self): - """Node cannot be combined to pipeline.""" - fred = node(identity, "input", "output") - pipeline = Pipeline([fred]) - with pytest.raises(TypeError): - pipeline + fred # pylint: disable=pointless-statement +@pytest.fixture +def nodes_with_tags(): + return [ + node(identity, "E", None, name="node1"), + node(identity, "D", "E", name="node2", tags=["tag1", "tag2"]), + node(identity, "C", "D", name="node3"), + node(identity, "A", "B", name="node4", tags=["tag2"]), + node(identity, "B", "C", name="node5"), + node(constant_output, None, "A", name="node6", tags=["tag1"]), + ] - def test_bad_combine_int(self): - """int cannot be combined to pipeline, tests __radd__""" - fred = node(identity, "input", "output") - pipeline = Pipeline([fred]) - with pytest.raises(TypeError): - _ = 1 + pipeline - def test_conflicting_names(self): - """Node names must be unique.""" - pipeline1 = Pipeline( - [node(biconcat, ["input", "input1"], ["output1"], name="a")] - ) - new_pipeline = Pipeline( - [node(biconcat, ["input", "input1"], ["output2"], name="a")] - ) - pattern = ( - "Pipeline nodes must have unique names. The following node names " - "appear more than once:\n\nFree nodes:\n - a" - ) - with pytest.raises(ValueError, match=re.escape(pattern)): - pipeline1 + new_pipeline # pylint: disable=pointless-statement +class TestPipelineTags: + def test_tag_existing_pipeline(self, branchless_pipeline): + pipeline = Pipeline(branchless_pipeline["nodes"]) + pipeline = pipeline.tag(["new_tag"]) + assert all("new_tag" in n.tags for n in pipeline.nodes) - def test_conflicting_outputs(self): - """Node outputs must be unique.""" - pipeline1 = Pipeline( - [node(biconcat, ["input", "input1"], ["output", "output1"], name="a")] - ) - new_pipeline = Pipeline( - [node(biconcat, ["input", "input2"], ["output", "output2"], name="b")] - ) - with pytest.raises(OutputNotUniqueError, match=r"\['output'\]"): - pipeline1 + new_pipeline # pylint: disable=pointless-statement + def test_pipeline_single_tag(self, branchless_pipeline): + p1 = Pipeline(branchless_pipeline["nodes"], tags="single_tag") + p2 = Pipeline(branchless_pipeline["nodes"]).tag("single_tag") - def test_duplicate_node_confirms(self): - """Test that non-unique dataset confirms break pipeline concatenation""" - pipeline1 = Pipeline([node(identity, "input1", "output1", confirms="other")]) - pipeline2 = Pipeline( - [node(identity, "input2", "output2", confirms=["other", "output2"])] - ) - with pytest.raises(ConfirmNotUniqueError, match=r"\['other'\]"): - pipeline1 + pipeline2 # pylint: disable=pointless-statement + for pipeline in (p1, p2): + assert all("single_tag" in n.tags for n in pipeline.nodes) @pytest.fixture -def complex_pipeline(pipeline_list_with_lists): - nodes = pipeline_list_with_lists["nodes"] - pipeline = Pipeline(nodes) - return pipeline - - -class TestComplexPipeline: - def test_from_inputs(self, complex_pipeline): - """F and H are inputs of node1, node2 and node3.""" - new_pipeline = 
complex_pipeline.from_inputs("F", "H") - nodes = {node.name for node in new_pipeline.nodes} - - assert len(new_pipeline.nodes) == 3 - assert nodes == {"node1", "node2", "node3"} - - def test_from_inputs_unknown(self, complex_pipeline): - """W and Z do not exist as inputs.""" - with pytest.raises(ValueError, match=r"\['W', 'Z'\]"): - complex_pipeline.from_inputs("Z", "W", "E", "C") +def pipeline_with_namespaces(): + return Pipeline( + [ + node(identity, "A", "B", name="node1", namespace="katie"), + node(identity, "B", "C", name="node2", namespace="lisa"), + node(identity, "C", "D", name="node3", namespace="john"), + node(identity, "D", "E", name="node4", namespace="katie.lisa"), + node(identity, "E", "F", name="node5", namespace="lisa.john"), + node(identity, "F", "G", name="node6", namespace="katie.lisa.john"), + ] + ) - def test_only_nodes_with_inputs(self, complex_pipeline): - """node1 and node2 require H as an input.""" - new_pipeline = complex_pipeline.only_nodes_with_inputs("H") - nodes = {node.name for node in new_pipeline.nodes} - assert len(new_pipeline.nodes) == 2 - assert nodes == {"node1", "node2"} +class TestPipelineFilter: + def test_no_filters(self, complex_pipeline): + filtered_pipeline = complex_pipeline.filter() + assert filtered_pipeline is not complex_pipeline + assert set(filtered_pipeline.nodes) == set(complex_pipeline.nodes) - def test_only_nodes_with_inputs_unknown(self, complex_pipeline): - with pytest.raises(ValueError, match="['W', 'Z']"): - complex_pipeline.only_nodes_with_inputs("Z", "W", "E", "C") + @pytest.mark.parametrize( + "filter_method,expected_nodes", + [ + ({"tags": ["foo"]}, {"node6", "node7"}), + ({"from_nodes": ["node4"]}, {"node1", "node2", "node3", "node4"}), + ({"to_nodes": ["node4"]}, {"node9", "node8", "node7", "node4"}), + ({"node_names": ["node4", "node5"]}, {"node4", "node5"}), + ({"from_inputs": ["F"]}, {"node1", "node3"}), + ({"to_outputs": ["F"]}, {"node4", "node7", "node8", "node9"}), + ], + ) + def test_one_filter(self, filter_method, expected_nodes, complex_pipeline): + filtered_pipeline = complex_pipeline.filter(**filter_method) + nodes = {node.name for node in filtered_pipeline.nodes} + assert nodes == expected_nodes + + def test_namespace_filter(self, pipeline_with_namespaces): + filtered_pipeline = pipeline_with_namespaces.filter(node_namespace="katie") + nodes = {node.name for node in filtered_pipeline.nodes} + assert nodes == {"katie.node1", "katie.lisa.node4", "katie.lisa.john.node6"} + + def test_two_filters(self, complex_pipeline): + filtered_pipeline = complex_pipeline.filter( + from_nodes=["node4"], to_outputs=["M"] + ) + nodes = {node.name for node in filtered_pipeline.nodes} + assert nodes == {"node3", "node4"} - def test_only_nodes_with_outputs(self, complex_pipeline): - """node4 require F and H as outputs.""" - new_pipeline = complex_pipeline.only_nodes_with_outputs("F", "H") - nodes = {node.name for node in new_pipeline.nodes} + def test_three_filters(self, complex_pipeline): + filtered_pipeline = complex_pipeline.filter( + from_nodes=["node4"], to_outputs=["M"], node_names=["node3"] + ) + nodes = {node.name for node in filtered_pipeline.nodes} + assert nodes == {"node3"} + + def test_filter_no_nodes(self, complex_pipeline): + with pytest.raises(ValueError, match="Pipeline contains no nodes"): + complex_pipeline.filter( + from_nodes=["node4"], + to_outputs=["M"], + node_names=["node3"], + to_nodes=["node4"], + ) - assert len(new_pipeline.nodes) == 1 - assert nodes == {"node4"} - def 
test_only_nodes_with_outputs_unknown(self, complex_pipeline): - with pytest.raises(ValueError, match="['W', 'Z']"): - complex_pipeline.only_nodes_with_outputs("Z", "W", "E", "C") +class TestPipelineFilterHelpers: + """Node selection functions called by Pipeline.filter.""" - def test_to_outputs(self, complex_pipeline): - """New pipeline contain all nodes to produce F and H outputs.""" - new_pipeline = complex_pipeline.to_outputs("F", "H") - nodes = {node.name for node in new_pipeline.nodes} + @pytest.mark.parametrize( + "tags,expected_nodes", + [ + (["tag1"], ["node2", "node6"]), + (["tag2"], ["node2", "node4"]), + (["tag2", "tag1"], ["node2", "node4", "node6"]), + (["tag1", "tag2", "tag-missing"], ["node2", "node4", "node6"]), + (["tag-missing"], []), + ([], []), + ], + ) + def test_only_nodes_with_tags(self, tags, expected_nodes, nodes_with_tags): + pipeline = Pipeline(nodes_with_tags) - assert len(new_pipeline.nodes) == 4 - assert nodes == {"node4", "node7", "node8", "node9"} + def get_nodes_with_tags(*tags): + p = pipeline.only_nodes_with_tags(*tags) + return sorted(n.name for n in p.nodes) - def test_to_outputs_unknown(self, complex_pipeline): - with pytest.raises(ValueError, match=r"\['W', 'Z'\]"): - complex_pipeline.to_outputs("Z", "W", "E", "C") + assert get_nodes_with_tags(*tags) == expected_nodes def test_from_nodes(self, complex_pipeline): """New pipeline contain all nodes that depend on node2 and node3.""" @@ -666,7 +753,7 @@ def test_from_nodes(self, complex_pipeline): assert len(new_pipeline.nodes) == 3 assert nodes == {"node1", "node2", "node3"} - def test_from_node_unknown(self, complex_pipeline): + def test_from_nodes_unknown(self, complex_pipeline): pattern = r"Pipeline does not contain nodes named \['missing_node'\]" with pytest.raises(ValueError, match=pattern): complex_pipeline.from_nodes("missing_node") @@ -684,178 +771,105 @@ def test_to_nodes_unknown(self, complex_pipeline): with pytest.raises(ValueError, match=pattern): complex_pipeline.to_nodes("missing_node") - def test_connected_pipeline(self, disjoint_pipeline): - """Connect two separate pipelines.""" - nodes = disjoint_pipeline["nodes"] - subpipeline = Pipeline(nodes, tags=["subpipeline"]) - - assert len(subpipeline.inputs()) == 2 - assert len(subpipeline.outputs()) == 2 - - pipeline = Pipeline( - [node(identity, "C", "D", name="connecting_node"), subpipeline], tags="main" - ) - - assert len(pipeline.nodes) == 1 + len(nodes) - assert len(pipeline.inputs()) == 1 - assert len(pipeline.outputs()) == 1 - - def test_node_dependencies(self, complex_pipeline): - expected = { - "node1": {"node2", "node3", "node4"}, - "node2": {"node4"}, - "node3": {"node4"}, - "node4": {"node7"}, - "node5": {"node6"}, - "node6": {"node7"}, - "node7": {"node8"}, - "node8": {"node9"}, - "node9": set(), - } - actual = { - child.name: {parent.name for parent in parents} - for child, parents in complex_pipeline.node_dependencies.items() - } - assert actual == expected - - -class TestPipelineDescribe: - def test_names_only(self, str_node_inputs_list): - pipeline = Pipeline(str_node_inputs_list["nodes"]) - description = pipeline.describe() - - desc = description.split("\n") - test_desc = [ - "#### Pipeline execution order ####", - "Inputs: input1, input2", - "", - "node1", - "node2", - "", - "Outputs: input4", - "##################################", - ] - - assert len(desc) == len(test_desc) - for res, example in zip(desc, test_desc): - assert res == example - - def test_full(self, str_node_inputs_list): - pipeline = 
Pipeline(str_node_inputs_list["nodes"]) - description = pipeline.describe(names_only=False) - - desc = description.split("\n") - test_desc = [ - "#### Pipeline execution order ####", - "Inputs: input1, input2", - "", - "node1: biconcat([input1,input2]) -> [input3]", - "node2: identity([input3]) -> [input4]", - "", - "Outputs: input4", - "##################################", - ] - - assert len(desc) == len(test_desc) - for res, example in zip(desc, test_desc): - assert res == example + @pytest.mark.parametrize( + "target_node_names", [["node2", "node3", "node4", "node8"], ["node1"]] + ) + def test_only_nodes(self, target_node_names, pipeline_list_with_lists): + full = Pipeline(pipeline_list_with_lists["nodes"]) + partial = full.only_nodes(*target_node_names) + target_list = list(target_node_names) + names = map(lambda node_: node_.name, partial.nodes) + assert sorted(names) == sorted(target_list) + @pytest.mark.parametrize( + "target_node_names", [["node2", "node3", "node4", "NaN"], ["invalid"]] + ) + def test_only_nodes_unknown(self, pipeline_list_with_lists, target_node_names): + pattern = r"Pipeline does not contain nodes" + full = Pipeline(pipeline_list_with_lists["nodes"]) + with pytest.raises(ValueError, match=pattern): + full.only_nodes(*target_node_names) -def apply_f(func: Callable) -> Callable: - @wraps(func) - def with_f(*args, **kwargs): - return func(*(f"f({a})" for a in args), **kwargs) + def test_from_inputs(self, complex_pipeline): + """F and H are inputs of node1, node2 and node3.""" + new_pipeline = complex_pipeline.from_inputs("F", "H") + nodes = {node.name for node in new_pipeline.nodes} - return with_f + assert len(new_pipeline.nodes) == 3 + assert nodes == {"node1", "node2", "node3"} + def test_from_inputs_unknown(self, complex_pipeline): + """W and Z do not exist as inputs.""" + with pytest.raises(ValueError, match=r"\['W', 'Z'\]"): + complex_pipeline.from_inputs("Z", "W", "E", "C") -def apply_g(func: Callable) -> Callable: - @wraps(func) - def with_g(*args, **kwargs): - return func(*(f"g({a})" for a in args), **kwargs) + def test_to_outputs(self, complex_pipeline): + """New pipeline contain all nodes to produce F and H outputs.""" + new_pipeline = complex_pipeline.to_outputs("F", "H") + nodes = {node.name for node in new_pipeline.nodes} - return with_g + assert len(new_pipeline.nodes) == 4 + assert nodes == {"node4", "node7", "node8", "node9"} + def test_to_outputs_unknown(self, complex_pipeline): + with pytest.raises(ValueError, match=r"\['W', 'Z'\]"): + complex_pipeline.to_outputs("Z", "W", "E", "C") -class TestPipelineDecorator: - def test_apply(self): - nodes = sorted( - [ - node(identity, "number", "output1", name="identity1"), - node(identity, "output1", "output2", name="biconcat"), - node(identity, "output2", "output", name="identity3"), - ], - key=lambda x: x.name, - ) - pattern = ( - "The pipeline's `decorate` API will be deprecated in Kedro 0.18.0." - "Please use a node's Hooks to extend the node's behaviour in a pipeline." 
- "For more information, please visit" - "https://kedro.readthedocs.io/en/stable/07_extend_kedro/02_hooks.html" + @pytest.mark.parametrize( + "target_namespace,expected_namespaces", + [ + ("katie", ["katie.lisa.john", "katie.lisa", "katie"]), + ("lisa", ["lisa.john", "lisa"]), + ("john", ["john"]), + ("katie.lisa", ["katie.lisa.john", "katie.lisa"]), + ("katie.lisa.john", ["katie.lisa.john"]), + ], + ) + def test_only_nodes_with_namespace( + self, target_namespace, expected_namespaces, pipeline_with_namespaces + ): + resulting_pipeline = pipeline_with_namespaces.only_nodes_with_namespace( + target_namespace ) - with pytest.warns(DeprecationWarning, match=re.escape(pattern)): - pipeline = Pipeline(nodes).decorate(apply_f, apply_g) - catalog = DataCatalog({}, dict(number=1)) - result = SequentialRunner().run(pipeline, catalog) - decorated_nodes = sorted(pipeline.nodes, key=lambda x: x.name) - - assert result["output"] == "g(f(g(f(g(f(1))))))" - assert len(pipeline.nodes) == 3 - assert all(n1.name == n2.name for n1, n2 in zip(nodes, decorated_nodes)) - - def test_empty_apply(self): - """Applying no decorators is valid.""" - identity_node = node(identity, "number", "output", name="identity") - pipeline = Pipeline([identity_node]).decorate() - catalog = DataCatalog({}, dict(number=1)) - result = SequentialRunner().run(pipeline, catalog) - assert result["output"] == 1 + for actual_node, expected_namespace in zip( + sorted(resulting_pipeline.nodes), expected_namespaces + ): + assert actual_node.namespace == expected_namespace + @pytest.mark.parametrize("namespace", ["katie", None]) + def test_only_nodes_with_namespace_unknown(self, namespace): + pipeline = Pipeline([node(identity, "A", "B", namespace=namespace)]) + pattern = r"Pipeline does not contain nodes" + with pytest.raises(ValueError, match=pattern): + pipeline.only_nodes_with_namespace("non_existent") -@pytest.fixture -def nodes_with_tags(): - return [ - node(identity, "E", None, name="node1"), - node(identity, "D", "E", name="node2", tags=["tag1", "tag2"]), - node(identity, "C", "D", name="node3"), - node(identity, "A", "B", name="node4", tags=["tag2"]), - node(identity, "B", "C", name="node5"), - node(constant_output, None, "A", name="node6", tags=["tag1"]), - ] +class TestPipelineRunnerHelpers: + """Node selection functions used in AbstractRunner.""" -class TestPipelineTags: - @pytest.mark.parametrize( - "tags,expected_nodes", - [ - (["tag1"], ["node2", "node6"]), - (["tag2"], ["node2", "node4"]), - (["tag2", "tag1"], ["node2", "node4", "node6"]), - (["tag1", "tag2", "tag-missing"], ["node2", "node4", "node6"]), - (["tag-missing"], []), - ([], []), - ], - ) - def test_from_tags(self, tags, expected_nodes, nodes_with_tags): - pipeline = Pipeline(nodes_with_tags) + def test_only_nodes_with_inputs(self, complex_pipeline): + """node1 and node2 require H as an input.""" + new_pipeline = complex_pipeline.only_nodes_with_inputs("H") + nodes = {node.name for node in new_pipeline.nodes} - def get_nodes_with_tags(*tags): - p = pipeline.only_nodes_with_tags(*tags) - return sorted(n.name for n in p.nodes) + assert len(new_pipeline.nodes) == 2 + assert nodes == {"node1", "node2"} - assert get_nodes_with_tags(*tags) == expected_nodes + def test_only_nodes_with_inputs_unknown(self, complex_pipeline): + with pytest.raises(ValueError, match="['W', 'Z']"): + complex_pipeline.only_nodes_with_inputs("Z", "W", "E", "C") - def test_tag_existing_pipeline(self, branchless_pipeline): - pipeline = Pipeline(branchless_pipeline["nodes"]) - pipeline = 
pipeline.tag(["new_tag"]) - assert all("new_tag" in n.tags for n in pipeline.nodes) + def test_only_nodes_with_outputs(self, complex_pipeline): + """node4 require F and H as outputs.""" + new_pipeline = complex_pipeline.only_nodes_with_outputs("F", "H") + nodes = {node.name for node in new_pipeline.nodes} - def test_pipeline_single_tag(self, branchless_pipeline): - p1 = Pipeline(branchless_pipeline["nodes"], tags="single_tag") - p2 = Pipeline(branchless_pipeline["nodes"]).tag("single_tag") + assert len(new_pipeline.nodes) == 1 + assert nodes == {"node4"} - for pipeline in (p1, p2): - assert all("single_tag" in n.tags for n in pipeline.nodes) + def test_only_nodes_with_outputs_unknown(self, complex_pipeline): + with pytest.raises(ValueError, match="['W', 'Z']"): + complex_pipeline.only_nodes_with_outputs("Z", "W", "E", "C") def test_pipeline_to_json(input_data): diff --git a/tests/runner/test_parallel_runner.py b/tests/runner/test_parallel_runner.py index 3df5cf2212..9c0ef564f3 100644 --- a/tests/runner/test_parallel_runner.py +++ b/tests/runner/test_parallel_runner.py @@ -12,7 +12,6 @@ MemoryDataSet, ) from kedro.pipeline import Pipeline, node -from kedro.pipeline.decorators import log_time from kedro.runner import ParallelRunner from kedro.runner.parallel_runner import ( _MAX_WINDOWS_WORKERS, @@ -69,19 +68,6 @@ def fan_out_fan_in(): ) -@pytest.fixture(autouse=True) -def mock_load_context(tmp_path, mocker): - # pylint: disable=too-few-public-methods - class DummyContext: - def __init__(self, project_path): - self.project_path = project_path - - mocker.patch( - "kedro.framework.context.context.load_context", - return_value=DummyContext(str(tmp_path)), - ) - - @pytest.mark.skipif( sys.platform.startswith("win"), reason="Due to bug in parallel runner" ) @@ -100,7 +86,7 @@ def test_parallel_run(self, is_async, fan_out_fan_in, catalog): assert result["Z"] == (42, 42, 42) @pytest.mark.parametrize("is_async", [False, True]) - def test_memory_data_set_input(self, is_async, fan_out_fan_in): + def test_memory_dataset_input(self, is_async, fan_out_fan_in): pipeline = Pipeline([fan_out_fan_in]) catalog = DataCatalog({"A": MemoryDataSet("42")}) result = ParallelRunner(is_async=is_async).run(pipeline, catalog) @@ -186,7 +172,7 @@ def test_task_exception(self, is_async, fan_out_fan_in, catalog): with pytest.raises(Exception, match="test exception"): ParallelRunner(is_async=is_async).run(pipeline, catalog) - def test_memory_data_set_output(self, is_async, fan_out_fan_in): + def test_memory_dataset_output(self, is_async, fan_out_fan_in): """ParallelRunner does not support output to externally created MemoryDataSets. 
""" @@ -258,48 +244,6 @@ def test_unable_to_schedule_all_nodes( runner.run(fan_out_fan_in, catalog) -@log_time -def decorated_identity(*args, **kwargs): - return identity(*args, **kwargs) - - -@pytest.fixture -def decorated_fan_out_fan_in(): - return Pipeline( - [ - node(decorated_identity, "A", "B"), - node(decorated_identity, "B", "C"), - node(decorated_identity, "B", "D"), - node(decorated_identity, "B", "E"), - node(fan_in, ["C", "D", "E"], "Z"), - ] - ) - - -@pytest.mark.skipif( - sys.platform.startswith("win"), reason="Due to bug in parallel runner" -) -@pytest.mark.parametrize("is_async", [False, True]) -class TestParallelRunnerDecorator: - def test_decorate_pipeline(self, is_async, fan_out_fan_in, catalog): - catalog.add_feed_dict(dict(A=42)) - result = ParallelRunner(is_async=is_async).run( - fan_out_fan_in.decorate(log_time), catalog - ) - assert "Z" in result - assert len(result["Z"]) == 3 - assert result["Z"] == (42, 42, 42) - - def test_decorated_nodes(self, is_async, decorated_fan_out_fan_in, catalog): - catalog.add_feed_dict(dict(A=42)) - result = ParallelRunner(is_async=is_async).run( - decorated_fan_out_fan_in, catalog - ) - assert "Z" in result - assert len(result["Z"]) == 3 - assert result["Z"] == (42, 42, 42) - - class LoggingDataSet(AbstractDataSet): def __init__(self, log, name, value=None): self.log = log diff --git a/tests/runner/test_thread_runner.py b/tests/runner/test_thread_runner.py index a30e894d57..70a5c51cb4 100644 --- a/tests/runner/test_thread_runner.py +++ b/tests/runner/test_thread_runner.py @@ -5,7 +5,6 @@ from kedro.io import AbstractDataSet, DataCatalog, DataSetError, MemoryDataSet from kedro.pipeline import Pipeline, node -from kedro.pipeline.decorators import log_time from kedro.runner import ThreadRunner @@ -63,7 +62,7 @@ def test_thread_run(self, fan_out_fan_in, catalog): assert "Z" in result assert result["Z"] == (42, 42, 42) - def test_memory_data_set_input(self, fan_out_fan_in): + def test_memory_dataset_input(self, fan_out_fan_in): catalog = DataCatalog({"A": MemoryDataSet("42")}) result = ThreadRunner().run(fan_out_fan_in, catalog) assert "Z" in result @@ -140,40 +139,6 @@ def test_node_returning_none(self): ThreadRunner().run(pipeline, catalog) -@log_time -def decorated_identity(*args, **kwargs): - return identity(*args, **kwargs) - - -@pytest.fixture -def decorated_fan_out_fan_in(): - return Pipeline( - [ - node(decorated_identity, "A", "B"), - node(decorated_identity, "B", "C"), - node(decorated_identity, "B", "D"), - node(decorated_identity, "B", "E"), - node(fan_in, ["C", "D", "E"], "Z"), - ] - ) - - -class TestThreadRunnerDecorator: - def test_decorate_pipeline(self, fan_out_fan_in, catalog): - catalog.add_feed_dict(dict(A=42)) - result = ThreadRunner().run(fan_out_fan_in.decorate(log_time), catalog) - assert "Z" in result - assert len(result["Z"]) == 3 - assert result["Z"] == (42, 42, 42) - - def test_decorated_nodes(self, decorated_fan_out_fan_in, catalog): - catalog.add_feed_dict(dict(A=42)) - result = ThreadRunner().run(decorated_fan_out_fan_in, catalog) - assert "Z" in result - assert len(result["Z"]) == 3 - assert result["Z"] == (42, 42, 42) - - class LoggingDataSet(AbstractDataSet): def __init__(self, log, name, value=None): self.log = log diff --git a/tests/template/conftest.py b/tests/template/conftest.py deleted file mode 100644 index 32f95e3641..0000000000 --- a/tests/template/conftest.py +++ /dev/null @@ -1,122 +0,0 @@ -""" -This file contains the fixtures that are reusable by any tests within -this directory. 
You don't need to import the fixtures as pytest will -discover them automatically. More info here: -https://docs.pytest.org/en/latest/fixture.html -""" -import shutil -import sys -import tempfile -from importlib import import_module -from pathlib import Path - -import click -import yaml -from click.testing import CliRunner -from pytest import fixture - -from kedro import __version__ as kedro_version -from kedro.framework.cli.catalog import catalog_cli -from kedro.framework.cli.cli import cli -from kedro.framework.cli.jupyter import jupyter_cli -from kedro.framework.cli.pipeline import pipeline_cli -from kedro.framework.cli.project import project_group -from kedro.framework.cli.registry import registry_cli -from kedro.framework.cli.starters import create_cli -from kedro.framework.project import configure_project, pipelines, settings -from kedro.framework.startup import ProjectMetadata - -REPO_NAME = "dummy_project" -PACKAGE_NAME = "dummy_package" - - -@fixture(scope="module") -def fake_root_dir(): - # using tempfile as tmp_path fixture doesn't support module scope - tmpdir = tempfile.mkdtemp() - try: - yield Path(tmpdir).resolve() - finally: - shutil.rmtree(tmpdir, ignore_errors=True) - - -@fixture(scope="module") -def fake_repo_path(fake_root_dir): - return fake_root_dir.resolve() / REPO_NAME - - -@fixture(scope="module") -def dummy_config(fake_root_dir, fake_metadata): - config = { - "project_name": fake_metadata.project_name, - "repo_name": REPO_NAME, - "python_package": fake_metadata.package_name, - "output_dir": str(fake_root_dir), - } - - config_path = fake_root_dir / "dummy_config.yml" - with config_path.open("w") as f: - yaml.dump(config, f) - - return config_path - - -@fixture(scope="module") -def fake_metadata(fake_root_dir): - metadata = ProjectMetadata( - fake_root_dir / REPO_NAME / "pyproject.toml", - PACKAGE_NAME, - "CLI Testing Project", - fake_root_dir / REPO_NAME, - kedro_version, - fake_root_dir / REPO_NAME / "src", - ) - return metadata - - -# This is needed just for the tests, those CLI groups are merged in our -# code when invoking `kedro` but when imported, they still need to be merged -@fixture(scope="module") -def fake_kedro_cli(): - return click.CommandCollection( - name="Kedro", - sources=[ - cli, - create_cli, - catalog_cli, - jupyter_cli, - pipeline_cli, - project_group, - registry_cli, - ], - ) - - -@fixture(scope="module") -def fake_project_cli( - fake_repo_path: Path, dummy_config: Path, fake_kedro_cli: click.CommandCollection -): - old_settings = settings.as_dict() - starter_path = Path(__file__).parents[2].resolve() - starter_path = starter_path / "features" / "steps" / "test_starter" - CliRunner().invoke( - fake_kedro_cli, ["new", "-c", str(dummy_config), "--starter", str(starter_path)] - ) - - # NOTE: Here we load a couple of modules, as they would be imported in - # the code and tests. - # It's safe to remove the new entries from path due to the python - # module caching mechanism. Any `reload` on it will not work though. 
- old_path = sys.path.copy() - sys.path = [str(fake_repo_path / "src")] + sys.path - - import_module(PACKAGE_NAME) - configure_project(PACKAGE_NAME) - yield fake_kedro_cli - - # reset side-effects of configure_project - pipelines._clear(PACKAGE_NAME) # this resets pipelines loading state - for key, value in old_settings.items(): - settings.set(key, value) - sys.path = old_path - del sys.modules[PACKAGE_NAME] diff --git a/tests/template/test_load_context_framework.py b/tests/template/test_load_context_framework.py deleted file mode 100644 index 77f410733d..0000000000 --- a/tests/template/test_load_context_framework.py +++ /dev/null @@ -1,150 +0,0 @@ -import re -import sys - -import pytest -import toml - -from kedro import __version__ as kedro_version -from kedro.framework.context import KedroContext, load_context -from kedro.framework.project import Validator, _ProjectSettings -from kedro.framework.startup import _get_project_metadata - - -@pytest.fixture(autouse=True) -def mock_logging_config(mocker): - # Disable logging.config.dictConfig in KedroContext._setup_logging as - # it changes logging.config and affects other unit tests - mocker.patch("logging.config.dictConfig") - - -def _create_kedro_config(project_path, payload): - kedro_conf = project_path / "pyproject.toml" - kedro_conf.parent.mkdir(parents=True, exist_ok=True) - toml_str = toml.dumps(payload) - kedro_conf.write_text(toml_str) - - -class MyContext(KedroContext): - pass - - -class MockSettings(_ProjectSettings): - _HOOKS = Validator("HOOKS", default=()) - _CONTEXT_CLASS = Validator("CONTEXT_CLASS", default=lambda *_: MyContext) - - -@pytest.fixture -def mock_settings(mocker): - mocked_settings = MockSettings() - mocker.patch("kedro.framework.project.settings", mocked_settings) - mocker.patch("kedro.framework.context.context.settings", mocked_settings) - - -@pytest.mark.usefixtures("fake_project_cli") -class TestLoadContext: - def test_valid_context(self, fake_repo_path, mocker): - """Test getting project context.""" - get_project_metadata_mock = mocker.patch( - "kedro.framework.context.context._get_project_metadata", - wraps=_get_project_metadata, - ) - result = load_context(str(fake_repo_path)) - assert result.package_name == "dummy_package" - assert str(fake_repo_path.resolve() / "src") in sys.path - get_project_metadata_mock.assert_called_with(fake_repo_path) - - def test_valid_context_with_env(self, mocker, monkeypatch, fake_repo_path): - """Test getting project context when Kedro config environment is - specified in the environment variable. - """ - mocker.patch("kedro.config.config.ConfigLoader.get") - monkeypatch.setenv("KEDRO_ENV", "my_fake_env") - result = load_context(str(fake_repo_path)) - assert result.env == "my_fake_env" - - def test_invalid_path(self, tmp_path): - """Test for loading context from an invalid path.""" - other_path = tmp_path / "other" - other_path.mkdir() - pattern = "Could not find the project configuration file 'pyproject.toml'" - with pytest.raises(RuntimeError, match=re.escape(pattern)): - load_context(str(other_path)) - - def test_pyproject_toml_has_missing_mandatory_keys(self, fake_repo_path): - payload = { - "tool": { - "kedro": {"fake_key": "fake_value", "project_version": kedro_version} - } - } - _create_kedro_config(fake_repo_path, payload) - - pattern = ( - "Missing required keys ['package_name', 'project_name'] " - "from 'pyproject.toml'." 
- ) - with pytest.raises(RuntimeError, match=re.escape(pattern)): - load_context(str(fake_repo_path)) - - def test_pyproject_toml_has_extra_keys(self, fake_repo_path, fake_metadata): - project_name = "Test Project" - payload = { - "tool": { - "kedro": { - "project_version": kedro_version, - "project_name": project_name, - "package_name": fake_metadata.package_name, - "unexpected_key": "hello", - } - } - } - _create_kedro_config(fake_repo_path, payload) - - pattern = ( - "Found unexpected keys in 'pyproject.toml'. Make sure it " - "only contains the following keys: ['package_name', " - "'project_name', 'project_version', 'source_dir']." - ) - with pytest.raises(RuntimeError, match=re.escape(pattern)): - load_context(str(fake_repo_path)) - - def test_settings_py_has_no_context_path(self, fake_repo_path): - """Test for loading default `KedroContext` context.""" - payload = { - "tool": { - "kedro": { - "package_name": "dummy_package", - "project_version": kedro_version, - "project_name": "fake_project", - } - } - } - _create_kedro_config(fake_repo_path, payload) - - context = load_context(str(fake_repo_path)) - assert isinstance(context, KedroContext) - assert context.__class__ is KedroContext - - @pytest.mark.usefixtures("mock_settings") - def test_settings_py_has_context_path( - self, - fake_repo_path, - fake_metadata, - ): - """Test for loading custom `ProjectContext` context.""" - payload = { - "tool": { - "kedro": { - "package_name": fake_metadata.package_name, - "project_version": kedro_version, - "project_name": "fake_project", - } - } - } - - _create_kedro_config(fake_repo_path, payload) - - context = load_context(str(fake_repo_path)) - - assert isinstance(context, KedroContext) - assert context.__class__ is not KedroContext - assert context.__class__.__name__ == "MyContext" diff --git a/tests/tools/test_cli.py b/tests/tools/test_cli.py index b6c46adf85..9c7c164191 100644 --- a/tests/tools/test_cli.py +++ b/tests/tools/test_cli.py @@ -17,7 +17,6 @@ "build-docs", "build-reqs", "catalog", - "install", "ipython", "jupyter", "lint", @@ -98,7 +97,7 @@ def test_get_cli_structure_depth(self, mocker, fake_metadata): ) kedro_cli = KedroCLI(fake_metadata.project_path) raw_cli_structure = get_cli_structure(kedro_cli, get_help=False) - assert type(raw_cli_structure["kedro"]["new"]) == dict + assert isinstance(raw_cli_structure["kedro"]["new"], dict) assert sorted(list(raw_cli_structure["kedro"]["new"].keys())) == sorted( [ "--verbose", diff --git a/tests/versioning/test_journal.py b/tests/versioning/test_journal.py deleted file mode 100644 index f103a2f7f8..0000000000 --- a/tests/versioning/test_journal.py +++ /dev/null @@ -1,100 +0,0 @@ -# pylint: disable=protected-access - -import json -import logging -import logging.config -from importlib import reload - -import pytest - -from kedro.versioning.journal import Journal, _git_sha - - -@pytest.fixture() -def setup_logging(tmp_path): - config = { - "version": 1, - "loggers": { - "kedro.journal": { - "level": "INFO", - "handlers": ["journal_file_handler"], - "propagate": False, - } - }, - "handlers": { - "journal_file_handler": { - "class": "kedro.versioning.journal.JournalFileHandler", - "level": "INFO", - "base_dir": str(tmp_path), - } - }, - } - reload(logging) - logging.config.dictConfig(config) - - -@pytest.fixture -def fake_git_sha(mocker): - return mocker.patch("kedro.versioning.journal._git_sha", return_value="git_sha") - - -@pytest.mark.usefixtures("fake_git_sha") -class TestJournal: - @pytest.mark.usefixtures("setup_logging") - def 
test_context_record(self, tmp_path): - """Test journal initialisation""" - record_data = {"run_id": "fake_id", "project_path": str(tmp_path)} - journal = Journal(record_data) - file_path = list(tmp_path.glob("journal_*")) - - assert len(file_path) == 1 - assert journal.run_id in str(file_path[0]) - log = json.loads(file_path[0].read_text()) - assert log["type"] == "ContextJournalRecord" - assert log["project_path"] == str(tmp_path) - assert log["git_sha"] == "git_sha" - assert "run_id" in log - - def test_invalid_context_record(self, tmp_path, caplog): - record_data = { - "run_id": "fake_id", - "project_path": str(tmp_path), - "blah": lambda x: x, - } - _ = Journal(record_data) - - assert "Unable to record" in caplog.record_tuples[0][2] - - @pytest.mark.usefixtures("setup_logging") - def test_log_catalog(self, tmp_path): - record_data = {"run_id": "fake_id", "project_path": str(tmp_path)} - journal = Journal(record_data) - journal.log_catalog("fake_data", "fake_operation", "fake_version") - file_path = list(tmp_path.glob("journal_*")) - - assert journal.run_id in str(file_path[0]) - assert len(file_path) == 1 - with file_path[0].open() as log_file: - context_log = json.loads(log_file.readline()) - catalog_log = json.loads(log_file.readline()) - assert catalog_log["type"] == "DatasetJournalRecord" - assert catalog_log["name"] == "fake_data" - assert catalog_log["operation"] == "fake_operation" - assert catalog_log["version"] == "fake_version" - assert catalog_log["run_id"] == context_log["run_id"] - - def test_deprecation_warning(self, tmp_path): - record_data = {"run_id": "fake_id", "project_path": str(tmp_path)} - with pytest.warns(DeprecationWarning): - Journal(record_data) - - -def test_git_sha(tmp_path, mocker): - mocker.patch("subprocess.check_output", return_value=b"mocked_return") - result = _git_sha(tmp_path) - assert result == "mocked_return" - - -def test_invalid_git_sha(tmp_path, caplog): - _git_sha(tmp_path) - assert "Unable to git describe" in caplog.record_tuples[0][2] diff --git a/tools/cli.py b/tools/cli.py index 98f1d88a51..e150631abe 100644 --- a/tools/cli.py +++ b/tools/cli.py @@ -32,8 +32,8 @@ def _recurse_cli( element_name = cli_element.name or "kedro" io_dict[element_name] = {} for command_name in cli_element.list_commands(ctx): - _recurse_cli( # type: ignore - cli_element.get_command(ctx, command_name), + _recurse_cli( + cli_element.get_command(ctx, command_name), # type: ignore ctx, io_dict[element_name], get_help,