diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f2b2bd0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,152 @@ +########################## +# KEDRO PROJECT + +# ignore all local configuration +conf/local/** +!conf/local/.gitkeep + +# ignore potentially sensitive credentials files +conf/**/*credentials* + +# ignore everything in the following folders +data/** + +# except their sub-folders +!data/**/ + +# also keep all .gitkeep files +!.gitkeep + +# also keep the example dataset +!data/01_raw/*.csv + + +########################## +# Common files + +# IntelliJ +.idea/ +*.iml +out/ +.idea_modules/ + +### macOS +*.DS_Store +.AppleDouble +.LSOverride +.Trashes + +# Vim +*~ +.*.swo +.*.swp + +# emacs +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc + +# JIRA plugin +atlassian-ide-plugin.xml + +# C extensions +*.so + +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +.static_storage/ +.media/ +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..3900c06 --- /dev/null +++ b/README.md @@ -0,0 +1,122 @@ +# kedro-dynamic-pipeline-hook-example + +## Overview + +This is your new Kedro project, which was generated using `Kedro 0.18.12`. + +Take a look at the [Kedro documentation](https://kedro.readthedocs.io) to get started. + +## Rules and guidelines + +In order to get the best out of the template: + +* Don't remove any lines from the `.gitignore` file we provide +* Make sure your results can be reproduced by following a [data engineering convention](https://kedro.readthedocs.io/en/stable/faq/faq.html#what-is-data-engineering-convention) +* Don't commit data to your repository +* Don't commit any credentials or your local configuration to your repository. Keep all your credentials and local configuration in `conf/local/` + +## How to install dependencies + +Declare any dependencies in `src/requirements.txt` for `pip` installation and `src/environment.yml` for `conda` installation. 
+ +To install them, run: + +``` +pip install -r src/requirements.txt +``` + +## How to run your Kedro pipeline + +You can run your Kedro project with: + +``` +kedro run +``` + +## How to test your Kedro project + +Have a look at the file `src/tests/test_run.py` for instructions on how to write your tests. You can run your tests as follows: + +``` +kedro test +``` + +To configure the coverage threshold, go to the `.coveragerc` file. + +## Project dependencies + +To generate or update the dependency requirements for your project: + +``` +kedro build-reqs +``` + +This will `pip-compile` the contents of `src/requirements.txt` into a new file `src/requirements.lock`. You can see the output of the resolution by opening `src/requirements.lock`. + +After this, if you'd like to update your project requirements, please update `src/requirements.txt` and re-run `kedro build-reqs`. + +[Further information about project dependencies](https://kedro.readthedocs.io/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies) + +## How to work with Kedro and notebooks + +> Note: Using `kedro jupyter` or `kedro ipython` to run your notebook provides these variables in scope: `catalog`, `context`, `pipelines` and `session`. +> +> Jupyter, JupyterLab, and IPython are already included in the project requirements by default, so once you have run `pip install -r src/requirements.txt` you will not need to take any extra steps before you use them. 
+ +### Jupyter +To use Jupyter notebooks in your Kedro project, you need to install Jupyter: + +``` +pip install jupyter +``` + +After installing Jupyter, you can start a local notebook server: + +``` +kedro jupyter notebook +``` + +### JupyterLab +To use JupyterLab, you need to install it: + +``` +pip install jupyterlab +``` + +You can also start JupyterLab: + +``` +kedro jupyter lab +``` + +### IPython +And if you want to run an IPython session: + +``` +kedro ipython +``` + +### How to convert notebook cells to nodes in a Kedro project +You can move notebook code over into a Kedro project structure using a mixture of [cell tagging](https://jupyter-notebook.readthedocs.io/en/stable/changelog.html#release-5-0-0) and Kedro CLI commands. + +By adding the `node` tag to a cell and running the command below, the cell's source code will be copied over to a Python file within `src/kedro_dynamic_pipeline_hook_example/nodes/`: + +``` +kedro jupyter convert path/to/your_notebook.ipynb +``` +> *Note:* The name of the Python file matches the name of the original notebook. + +Alternatively, you may want to transform all your notebooks in one go. Run the following command to convert all notebook files found in the project root directory and under any of its sub-folders: + +``` +kedro jupyter convert --all +``` + +### How to ignore notebook output cells in `git` +To automatically strip out all output cell contents before committing to `git`, you can run `kedro activate-nbstripout`. This will add a hook in `.git/config` which will run `nbstripout` before anything is committed to `git`. + +> *Note:* Your output cells will be retained locally. + +## Package your Kedro project + +[Further information about building project documentation and packaging your project](https://kedro.readthedocs.io/en/stable/tutorial/package_a_project.html) diff --git a/conf/README.md b/conf/README.md new file mode 100644 index 0000000..a6a80a4 --- /dev/null +++ b/conf/README.md @@ -0,0 +1,26 @@ +# What is this for? 
+ +This folder should be used to store configuration files used by Kedro or by separate tools. + +This file can be used to provide users with instructions for how to reproduce local configuration with their own credentials. You can edit the file however you like, but you may wish to retain the information below and add your own section in the [Instructions](#Instructions) section. + +## Local configuration + +The `local` folder should be used for configuration that is either user-specific (e.g. IDE configuration) or protected (e.g. security keys). + +> *Note:* Please do not check in any local configuration to version control. + +## Base configuration + +The `base` folder is for shared configuration, such as non-sensitive and project-related configuration that may be shared across team members. + +WARNING: Please do not put access credentials in the base configuration folder. + +## Instructions + + + + + +## Find out more +You can find out more about configuration from the [user guide documentation](https://kedro.readthedocs.io/en/stable/user_guide/configuration.html). diff --git a/conf/base/catalog.yml b/conf/base/catalog.yml new file mode 100644 index 0000000..c0c61a3 --- /dev/null +++ b/conf/base/catalog.yml @@ -0,0 +1,47 @@ +# Here you can define all your data sets by using simple YAML syntax. 
+# +# Documentation for this file format can be found in "The Data Catalog" +# Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html +# +# We support interacting with a variety of data stores including local file systems, cloud, network and HDFS +# +# An example data set definition can look as follows: +# +#bikes: +# type: pandas.CSVDataSet +# filepath: "data/01_raw/bikes.csv" +# +#weather: +# type: spark.SparkDataSet +# filepath: s3a://your_bucket/data/01_raw/weather* +# file_format: csv +# credentials: dev_s3 +# load_args: +# header: True +# inferSchema: True +# save_args: +# sep: '|' +# header: True +# +#scooters: +# type: pandas.SQLTableDataSet +# credentials: scooters_credentials +# table_name: scooters +# load_args: +# index_col: ['name'] +# columns: ['name', 'gear'] +# save_args: +# if_exists: 'replace' +# # if_exists: 'fail' +# # if_exists: 'append' +# +# The Data Catalog supports being able to reference the same file using two different DataSet implementations +# (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here: +# https://kedro.readthedocs.io/en/stable/data/data_catalog.html +# +# This is a data set used by the "Hello World" example pipeline provided with the project +# template. Please feel free to remove it once you remove the example pipeline. 
+ +example_iris_data: + type: pandas.CSVDataSet + filepath: data/01_raw/iris.csv diff --git a/conf/base/logging.yml b/conf/base/logging.yml new file mode 100644 index 0000000..0a0ac47 --- /dev/null +++ b/conf/base/logging.yml @@ -0,0 +1,41 @@ +version: 1 + +disable_existing_loggers: False + +formatters: + simple: + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +handlers: + console: + class: logging.StreamHandler + level: INFO + formatter: simple + stream: ext://sys.stdout + + info_file_handler: + class: logging.handlers.RotatingFileHandler + level: INFO + formatter: simple + filename: info.log + maxBytes: 10485760 # 10MB + backupCount: 20 + encoding: utf8 + delay: True + + rich: + class: kedro.logging.RichHandler + rich_tracebacks: True + # Advance options for customisation. + # See https://docs.kedro.org/en/stable/logging/logging.html#project-side-logging-configuration + # tracebacks_show_locals: False + +loggers: + kedro: + level: INFO + + kedro_dynamic_pipeline_hook_example: + level: INFO + +root: + handlers: [rich, info_file_handler] diff --git a/conf/base/parameters.yml b/conf/base/parameters.yml new file mode 100644 index 0000000..48c3cd7 --- /dev/null +++ b/conf/base/parameters.yml @@ -0,0 +1,3 @@ +train_fraction: 0.8 +random_state: 3 +target_column: species diff --git a/conf/local/.gitkeep b/conf/local/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/01_raw/.gitkeep b/data/01_raw/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/01_raw/iris.csv b/data/01_raw/iris.csv new file mode 100644 index 0000000..ba0ebd2 --- /dev/null +++ b/data/01_raw/iris.csv @@ -0,0 +1,151 @@ +sepal_length,sepal_width,petal_length,petal_width,species +5.1,3.5,1.4,0.2,setosa +4.9,3.0,1.4,0.2,setosa +4.7,3.2,1.3,0.2,setosa +4.6,3.1,1.5,0.2,setosa +5.0,3.6,1.4,0.2,setosa +5.4,3.9,1.7,0.4,setosa +4.6,3.4,1.4,0.3,setosa +5.0,3.4,1.5,0.2,setosa +4.4,2.9,1.4,0.2,setosa +4.9,3.1,1.5,0.1,setosa +5.4,3.7,1.5,0.2,setosa 
+4.8,3.4,1.6,0.2,setosa +4.8,3.0,1.4,0.1,setosa +4.3,3.0,1.1,0.1,setosa +5.8,4.0,1.2,0.2,setosa +5.7,4.4,1.5,0.4,setosa +5.4,3.9,1.3,0.4,setosa +5.1,3.5,1.4,0.3,setosa +5.7,3.8,1.7,0.3,setosa +5.1,3.8,1.5,0.3,setosa +5.4,3.4,1.7,0.2,setosa +5.1,3.7,1.5,0.4,setosa +4.6,3.6,1.0,0.2,setosa +5.1,3.3,1.7,0.5,setosa +4.8,3.4,1.9,0.2,setosa +5.0,3.0,1.6,0.2,setosa +5.0,3.4,1.6,0.4,setosa +5.2,3.5,1.5,0.2,setosa +5.2,3.4,1.4,0.2,setosa +4.7,3.2,1.6,0.2,setosa +4.8,3.1,1.6,0.2,setosa +5.4,3.4,1.5,0.4,setosa +5.2,4.1,1.5,0.1,setosa +5.5,4.2,1.4,0.2,setosa +4.9,3.1,1.5,0.1,setosa +5.0,3.2,1.2,0.2,setosa +5.5,3.5,1.3,0.2,setosa +4.9,3.1,1.5,0.1,setosa +4.4,3.0,1.3,0.2,setosa +5.1,3.4,1.5,0.2,setosa +5.0,3.5,1.3,0.3,setosa +4.5,2.3,1.3,0.3,setosa +4.4,3.2,1.3,0.2,setosa +5.0,3.5,1.6,0.6,setosa +5.1,3.8,1.9,0.4,setosa +4.8,3.0,1.4,0.3,setosa +5.1,3.8,1.6,0.2,setosa +4.6,3.2,1.4,0.2,setosa +5.3,3.7,1.5,0.2,setosa +5.0,3.3,1.4,0.2,setosa +7.0,3.2,4.7,1.4,versicolor +6.4,3.2,4.5,1.5,versicolor +6.9,3.1,4.9,1.5,versicolor +5.5,2.3,4.0,1.3,versicolor +6.5,2.8,4.6,1.5,versicolor +5.7,2.8,4.5,1.3,versicolor +6.3,3.3,4.7,1.6,versicolor +4.9,2.4,3.3,1.0,versicolor +6.6,2.9,4.6,1.3,versicolor +5.2,2.7,3.9,1.4,versicolor +5.0,2.0,3.5,1.0,versicolor +5.9,3.0,4.2,1.5,versicolor +6.0,2.2,4.0,1.0,versicolor +6.1,2.9,4.7,1.4,versicolor +5.6,2.9,3.6,1.3,versicolor +6.7,3.1,4.4,1.4,versicolor +5.6,3.0,4.5,1.5,versicolor +5.8,2.7,4.1,1.0,versicolor +6.2,2.2,4.5,1.5,versicolor +5.6,2.5,3.9,1.1,versicolor +5.9,3.2,4.8,1.8,versicolor +6.1,2.8,4.0,1.3,versicolor +6.3,2.5,4.9,1.5,versicolor +6.1,2.8,4.7,1.2,versicolor +6.4,2.9,4.3,1.3,versicolor +6.6,3.0,4.4,1.4,versicolor +6.8,2.8,4.8,1.4,versicolor +6.7,3.0,5.0,1.7,versicolor +6.0,2.9,4.5,1.5,versicolor +5.7,2.6,3.5,1.0,versicolor +5.5,2.4,3.8,1.1,versicolor +5.5,2.4,3.7,1.0,versicolor +5.8,2.7,3.9,1.2,versicolor +6.0,2.7,5.1,1.6,versicolor +5.4,3.0,4.5,1.5,versicolor +6.0,3.4,4.5,1.6,versicolor +6.7,3.1,4.7,1.5,versicolor +6.3,2.3,4.4,1.3,versicolor 
+5.6,3.0,4.1,1.3,versicolor +5.5,2.5,4.0,1.3,versicolor +5.5,2.6,4.4,1.2,versicolor +6.1,3.0,4.6,1.4,versicolor +5.8,2.6,4.0,1.2,versicolor +5.0,2.3,3.3,1.0,versicolor +5.6,2.7,4.2,1.3,versicolor +5.7,3.0,4.2,1.2,versicolor +5.7,2.9,4.2,1.3,versicolor +6.2,2.9,4.3,1.3,versicolor +5.1,2.5,3.0,1.1,versicolor +5.7,2.8,4.1,1.3,versicolor +6.3,3.3,6.0,2.5,virginica +5.8,2.7,5.1,1.9,virginica +7.1,3.0,5.9,2.1,virginica +6.3,2.9,5.6,1.8,virginica +6.5,3.0,5.8,2.2,virginica +7.6,3.0,6.6,2.1,virginica +4.9,2.5,4.5,1.7,virginica +7.3,2.9,6.3,1.8,virginica +6.7,2.5,5.8,1.8,virginica +7.2,3.6,6.1,2.5,virginica +6.5,3.2,5.1,2.0,virginica +6.4,2.7,5.3,1.9,virginica +6.8,3.0,5.5,2.1,virginica +5.7,2.5,5.0,2.0,virginica +5.8,2.8,5.1,2.4,virginica +6.4,3.2,5.3,2.3,virginica +6.5,3.0,5.5,1.8,virginica +7.7,3.8,6.7,2.2,virginica +7.7,2.6,6.9,2.3,virginica +6.0,2.2,5.0,1.5,virginica +6.9,3.2,5.7,2.3,virginica +5.6,2.8,4.9,2.0,virginica +7.7,2.8,6.7,2.0,virginica +6.3,2.7,4.9,1.8,virginica +6.7,3.3,5.7,2.1,virginica +7.2,3.2,6.0,1.8,virginica +6.2,2.8,4.8,1.8,virginica +6.1,3.0,4.9,1.8,virginica +6.4,2.8,5.6,2.1,virginica +7.2,3.0,5.8,1.6,virginica +7.4,2.8,6.1,1.9,virginica +7.9,3.8,6.4,2.0,virginica +6.4,2.8,5.6,2.2,virginica +6.3,2.8,5.1,1.5,virginica +6.1,2.6,5.6,1.4,virginica +7.7,3.0,6.1,2.3,virginica +6.3,3.4,5.6,2.4,virginica +6.4,3.1,5.5,1.8,virginica +6.0,3.0,4.8,1.8,virginica +6.9,3.1,5.4,2.1,virginica +6.7,3.1,5.6,2.4,virginica +6.9,3.1,5.1,2.3,virginica +5.8,2.7,5.1,1.9,virginica +6.8,3.2,5.9,2.3,virginica +6.7,3.3,5.7,2.5,virginica +6.7,3.0,5.2,2.3,virginica +6.3,2.5,5.0,1.9,virginica +6.5,3.0,5.2,2.0,virginica +6.2,3.4,5.4,2.3,virginica +5.9,3.0,5.1,1.8,virginica diff --git a/data/02_intermediate/.gitkeep b/data/02_intermediate/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/03_primary/.gitkeep b/data/03_primary/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/04_feature/.gitkeep b/data/04_feature/.gitkeep new file mode 100644 
index 0000000..e69de29 diff --git a/data/05_model_input/.gitkeep b/data/05_model_input/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/06_models/.gitkeep b/data/06_models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/07_model_output/.gitkeep b/data/07_model_output/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/data/08_reporting/.gitkeep b/data/08_reporting/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..d25b086 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 + + +# kedro_dynamic_pipeline_hook_example documentation build +# configuration file, created by sphinx-quickstart. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import re + +from kedro.framework.cli.utils import find_stylesheets + +from kedro_dynamic_pipeline_hook_example import __version__ as release + +# -- Project information ----------------------------------------------------- + +project = "kedro_dynamic_pipeline_hook_example" +author = "Kedro" + +# The short X.Y version. +version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1) + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. 
They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx_autodoc_typehints", + "sphinx.ext.doctest", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "sphinx.ext.mathjax", + "nbsphinx", + "sphinx_copybutton", + "myst_parser", +] + +# enable autosummary plugin (table of contents for modules/classes/class +# methods) +autosummary_generate = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} + +# The master toctree document. +master_doc = "index" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = ["_build", "**.ipynb_checkpoints"] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. 
+# +html_theme_options = {"collapse_navigation": False, "style_external_links": True} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + +html_show_sourcelink = False + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = "kedro_dynamic_pipeline_hook_exampledoc" + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + # + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + # + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + # + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ( + master_doc, + "kedro_dynamic_pipeline_hook_example.tex", + "kedro_dynamic_pipeline_hook_example Documentation", + "Kedro", + "manual", + ) +] + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). 
+man_pages = [ + ( + master_doc, + "kedro_dynamic_pipeline_hook_example", + "kedro_dynamic_pipeline_hook_example Documentation", + [author], + 1, + ) +] + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + "kedro_dynamic_pipeline_hook_example", + "kedro_dynamic_pipeline_hook_example Documentation", + author, + "kedro_dynamic_pipeline_hook_example", + "Project kedro_dynamic_pipeline_hook_example codebase.", + "Nearest-Neighbour", + ) +] + +# -- Options for todo extension ---------------------------------------------- + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + +# -- Extension configuration ------------------------------------------------- + +# nbsphinx_prolog = """ +# see here for prolog/epilog details: +# https://nbsphinx.readthedocs.io/en/0.3.1/prolog-and-epilog.html +# """ + +# -- NBconvert kernel config ------------------------------------------------- +nbsphinx_kernel_name = "python3" + + +def remove_arrows_in_examples(lines): + for i, line in enumerate(lines): + lines[i] = line.replace(">>>", "") + + +def autodoc_process_docstring(app, what, name, obj, options, lines): + remove_arrows_in_examples(lines) + + +def skip(app, what, name, obj, skip, options): + if name == "__init__": + return False + return skip + + +def setup(app): + app.connect("autodoc-process-docstring", autodoc_process_docstring) + app.connect("autodoc-skip-member", skip) + # add Kedro stylesheets + for stylesheet in find_stylesheets(): + app.add_css_file(stylesheet) + # enable rendering RST tables in Markdown diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..022a900 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,19 @@ +.. 
 kedro_dynamic_pipeline_hook_example documentation master file, created by sphinx-quickstart. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to project kedro_dynamic_pipeline_hook_example's API docs! +================================================================== + +.. toctree:: + :maxdepth: 4 + + modules + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/notebooks/.gitkeep b/notebooks/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8a124be --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,17 @@ +[tool.kedro] +package_name = "kedro_dynamic_pipeline_hook_example" +project_name = "kedro-dynamic-pipeline-hook-example" +kedro_init_version = "0.18.12" + +[tool.isort] +profile = "black" + +[tool.pytest.ini_options] +addopts = """ +--cov-report term-missing \ +--cov src/kedro_dynamic_pipeline_hook_example -ra""" + +[tool.coverage.report] +fail_under = 0 +show_missing = true +exclude_lines = ["pragma: no cover", "raise NotImplementedError"] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..63ea673 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,3 @@ +[flake8] +max-line-length=88 +extend-ignore=E203 diff --git a/src/kedro_dynamic_pipeline_hook_example/README.md b/src/kedro_dynamic_pipeline_hook_example/README.md new file mode 100644 index 0000000..64e6e64 --- /dev/null +++ b/src/kedro_dynamic_pipeline_hook_example/README.md @@ -0,0 +1,70 @@ +# Pipeline + +> *Note:* This is a `README.md` boilerplate generated using `Kedro 0.18.12`. + +## Overview + +This pipeline: +1. splits the data into a training dataset and a testing dataset using a configurable ratio found in `conf/base/parameters.yml` +2. runs a simple 1-nearest neighbour model (`make_predictions` node) and creates the prediction dataset +3. 
reports the model accuracy on a test set (`report_accuracy` node) + +## Pipeline inputs + +### `example_iris_data` + +| | | +| ---- | ------------------ | +| Type | `pandas.CSVDataSet` | +| Description | Example iris data containing columns | + + +### `parameters` + +| | | +| ---- | ------------------ | +| Type | `dict` | +| Description | Project parameter dictionary that must contain the following keys: `train_fraction` (the ratio used to determine the train-test split), `random_state` (random generator to ensure train-test split is deterministic) and `target_column` (identify the target column in the dataset) | + + +## Pipeline intermediate outputs + +### `X_train` + +| | | +| ---- | ------------------ | +| Type | `pandas.DataFrame` | +| Description | DataFrame containing train set features | + +### `y_train` + +| | | +| ---- | ------------------ | +| Type | `pandas.Series` | +| Description | Series containing train set target. | + +### `X_test` + +| | | +| ---- | ------------------ | +| Type | `pandas.DataFrame` | +| Description | DataFrame containing test set features | + +### `y_test` + +| | | +| ---- | ------------------ | +| Type | `pandas.Series` | +| Description | Series containing test set target | + +### `y_pred` + +| | | +| ---- | ------------------ | +| Type | `pandas.Series` | +| Description | Predictions from the 1-nearest neighbour model | + + +## Pipeline outputs + +### `None` diff --git a/src/kedro_dynamic_pipeline_hook_example/__init__.py b/src/kedro_dynamic_pipeline_hook_example/__init__.py new file mode 100644 index 0000000..cdea9d8 --- /dev/null +++ b/src/kedro_dynamic_pipeline_hook_example/__init__.py @@ -0,0 +1,4 @@ +"""kedro-dynamic-pipeline-hook-example +""" + +__version__ = "0.1" diff --git a/src/kedro_dynamic_pipeline_hook_example/__main__.py b/src/kedro_dynamic_pipeline_hook_example/__main__.py new file mode 100644 index 0000000..393177e --- /dev/null +++ b/src/kedro_dynamic_pipeline_hook_example/__main__.py @@ -0,0 +1,47 @@ 
def _find_run_command(package_name):
    """Return the ``run`` command to execute for *package_name*.

    The project's own ``{package_name}.cli`` module is tried first. When that
    module does not exist, the first ``run`` command offered by the
    "project" entry points is returned, and failing that the ``run``
    imported from ``kedro.framework.cli.project``.

    Raises:
        ModuleNotFoundError: If the failed import does not mention
            ``{package_name}.cli`` (some other module is missing).
        KedroCliError: If the cli module exists but has no ``cli`` attribute.
    """
    cli_module_name = f"{package_name}.cli"
    try:
        project_cli = importlib.import_module(cli_module_name)
    except ModuleNotFoundError as exc:
        # Only a missing cli.py is tolerated; any other missing module is
        # a genuine error and propagates unchanged.
        if cli_module_name not in str(exc):
            raise
        plugins = load_entry_points("project")
        plugin_run = _find_run_command_in_plugins(plugins) if plugins else None
        if plugin_run:
            # an installed plugin provides the run command
            return plugin_run
        # otherwise use the run command from `kedro.framework.cli.project`
        from kedro.framework.cli.project import run as default_run

        return default_run

    # cli.py exists: it must expose `cli`, otherwise fail loudly
    if hasattr(project_cli, "cli"):
        return project_cli.run
    raise KedroCliError(f"Cannot load commands from {package_name}.cli")


def _find_run_command_in_plugins(plugins):
    """Return the first ``run`` command found among *plugins*, or ``None``."""
    return next(
        (group.commands["run"] for group in plugins if "run" in group.commands),
        None,
    )


def main(*args, **kwargs):
    """Configure the project named after this file's directory and run it.

    All positional and keyword arguments are forwarded to the run command.
    """
    package_name = Path(__file__).parent.name
    configure_project(package_name)
    _find_run_command(package_name)(*args, **kwargs)


if __name__ == "__main__":
    main()
def make_predictions(
    X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series
) -> pd.Series:
    """Uses 1-nearest neighbour classifier to create predictions.

    Each test row receives the target of the training row with the smallest
    squared Euclidean distance; on a tie the earliest training row wins.

    Args:
        X_train: Training data of features.
        X_test: Test data for features.
        y_train: Training data for target, positionally aligned with
            ``X_train``.

    Returns:
        y_pred: Prediction of the target variable, indexed like ``X_test``.

    Raises:
        ValueError: If ``X_train`` has no rows, because no nearest
            neighbour exists.
    """
    # Fail with a clear message instead of NumPy's "attempt to get argmin of
    # an empty sequence" (also a ValueError, so callers are unaffected).
    if len(X_train) == 0:
        raise ValueError("Cannot make predictions: X_train contains no rows.")

    X_train_numpy = X_train.to_numpy()
    X_test_numpy = X_test.to_numpy()

    # Broadcast to (n_train, n_test, n_features) and sum over the feature
    # axis, leaving one squared distance per (train row, test row) pair.
    squared_distances = np.sum(
        (X_train_numpy[:, None, :] - X_test_numpy[None, :, :]) ** 2, axis=-1
    )
    # For every test row (column), the position of the closest training row.
    nearest_neighbour = squared_distances.argmin(axis=0)
    y_pred = y_train.iloc[nearest_neighbour]
    # Relabel so the predictions line up with the test rows.
    y_pred.index = X_test.index

    return y_pred
def create_pipeline(**kwargs) -> Pipeline:
    """Build the three-node pipeline: split the data, predict, report accuracy.

    Args:
        **kwargs: Accepted but not used.

    Returns:
        A pipeline of the ``split``, ``make_predictions`` and
        ``report_accuracy`` nodes.
    """
    split_node = node(
        func=split_data,
        inputs=["example_iris_data", "parameters"],
        outputs=["X_train", "X_test", "y_train", "y_test"],
        name="split",
    )
    predict_node = node(
        func=make_predictions,
        inputs=["X_train", "X_test", "y_train"],
        outputs="y_pred",
        name="make_predictions",
    )
    # This node only logs, so it declares no outputs.
    report_node = node(
        func=report_accuracy,
        inputs=["y_pred", "y_test"],
        outputs=None,
        name="report_accuracy",
    )
    return pipeline([split_node, predict_node, report_node])
+ """ + pipelines = find_pipelines() + pipelines["__default__"] = sum(pipelines.values()) + return pipelines diff --git a/src/kedro_dynamic_pipeline_hook_example/settings.py b/src/kedro_dynamic_pipeline_hook_example/settings.py new file mode 100644 index 0000000..811e55f --- /dev/null +++ b/src/kedro_dynamic_pipeline_hook_example/settings.py @@ -0,0 +1,41 @@ +"""Project settings. There is no need to edit this file unless you want to change values +from the Kedro defaults. For further information, including these default values, see +https://kedro.readthedocs.io/en/stable/kedro_project_setup/settings.html.""" + +# Instantiated project hooks. +# For example, after creating a hooks.py and defining a ProjectHooks class there, do +# from kedro_dynamic_pipeline_hook_example.hooks import ProjectHooks +# HOOKS = (ProjectHooks(),) + +# Installed plugins for which to disable hook auto-registration. +# DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",) + +# Class that manages storing KedroSession data. +# from kedro.framework.session.store import BaseSessionStore +# SESSION_STORE_CLASS = BaseSessionStore +# Keyword arguments to pass to the `SESSION_STORE_CLASS` constructor. +# SESSION_STORE_ARGS = { +# "path": "./sessions" +# } + +# Directory that holds configuration. +# CONF_SOURCE = "conf" + +# Class that manages how configuration is loaded. +# from kedro.config import OmegaConfigLoader +# CONFIG_LOADER_CLASS = OmegaConfigLoader +# Keyword arguments to pass to the `CONFIG_LOADER_CLASS` constructor. +# CONFIG_LOADER_ARGS = { +# "config_patterns": { +# "spark" : ["spark*/"], +# "parameters": ["parameters*", "parameters*/**", "**/parameters*"], +# } +# } + +# Class that manages Kedro's library components. +# from kedro.framework.context import KedroContext +# CONTEXT_CLASS = KedroContext + +# Class that manages the Data Catalog. 
+# from kedro.io import DataCatalog
+# DATA_CATALOG_CLASS = DataCatalog
diff --git a/src/requirements.txt b/src/requirements.txt
new file mode 100644
index 0000000..7c2d771
--- /dev/null
+++ b/src/requirements.txt
@@ -0,0 +1,15 @@
+black~=22.0
+flake8>=3.7.9, <4.0
+ipython>=7.31.1, <8.0; python_version < '3.8'
+ipython~=8.10; python_version >= '3.8'
+isort~=5.0
+jupyter~=1.0
+jupyterlab~=3.0
+kedro~=0.18.12
+kedro-viz~=6.0
+kedro-datasets[pandas.CSVDataSet]~=1.0
+kedro-telemetry~=0.2.0
+nbstripout~=0.4
+pytest-cov~=3.0
+pytest-mock>=1.7.1, <2.0
+pytest~=7.2
diff --git a/src/setup.py b/src/setup.py
new file mode 100644
index 0000000..54aa5ea
--- /dev/null
+++ b/src/setup.py
@@ -0,0 +1,45 @@
+"""Packaging script for the ``kedro_dynamic_pipeline_hook_example`` project."""
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+entry_point = (
+    "kedro-dynamic-pipeline-hook-example = kedro_dynamic_pipeline_hook_example.__main__:main"
+)
+
+# Resolve requirements.txt next to this file so the build does not depend on
+# the directory setup.py happens to be invoked from.
+requirements_path = Path(__file__).resolve().parent / "requirements.txt"
+
+# Collect the install requirements, one specifier per non-empty line.
+with requirements_path.open(encoding="utf-8") as f:
+    # Strip comments and pip options (e.g. "--extra-index-url") that arise
+    # from a modified pip.conf file that configures global options when
+    # running kedro build-reqs.
+    requires = []
+    for line in f:
+        req = line.split("#", 1)[0].strip()
+        if req and not req.startswith("--"):
+            requires.append(req)
+
+setup(
+    name="kedro_dynamic_pipeline_hook_example",
+    version="0.1",
+    packages=find_packages(exclude=["tests"]),
+    entry_points={"console_scripts": [entry_point]},
+    install_requires=requires,
+    extras_require={
+        "docs": [
+            "docutils<0.18.0",
+            "sphinx~=3.4.3",
+            "sphinx_rtd_theme==0.5.1",
+            "nbsphinx==0.8.1",
+            "nbstripout~=0.4",
+            "myst-parser~=0.17.2",
+            "sphinx-autodoc-typehints==1.11.1",
+            "sphinx_copybutton==0.3.1",
+            "ipykernel>=5.3, <7.0",
+            "Jinja2<3.1.0",
+        ]
+    },
+)
diff --git a/src/tests/__init__.py b/src/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/tests/test_pipeline.py b/src/tests/test_pipeline.py
new file mode 100644
index 0000000..962a6f4
--- /dev/null
+++
b/src/tests/test_pipeline.py
@@ -0,0 +1,9 @@
+"""
+This is a boilerplate test file
+generated using Kedro 0.18.12.
+Please add your pipeline tests here.
+
+Kedro recommends using `pytest` framework, more info about it can be found
+in the official documentation:
+https://docs.pytest.org/en/latest/getting-started.html
+"""
diff --git a/src/tests/test_run.py b/src/tests/test_run.py
new file mode 100644
index 0000000..fd190dd
--- /dev/null
+++ b/src/tests/test_run.py
@@ -0,0 +1,46 @@
+"""
+This module contains an example test.
+
+Tests should be placed in ``src/tests``, in modules that mirror your
+project's structure, and in files named test_*.py. They are simply functions
+named ``test_*`` which test a unit of logic.
+
+To run the tests, run ``kedro test`` from the project root directory.
+"""
+
+from pathlib import Path
+
+import pytest
+from kedro.config import ConfigLoader
+from kedro.framework.context import KedroContext
+from kedro.framework.hooks import _create_hook_manager
+
+
+@pytest.fixture
+def config_loader():
+    """Return a ``ConfigLoader`` whose ``conf_source`` is the current directory."""
+    working_dir = Path.cwd()
+    return ConfigLoader(conf_source=str(working_dir))
+
+
+@pytest.fixture
+def project_context(config_loader):
+    """Return a ``KedroContext`` whose project path is the current directory."""
+    context_kwargs = {
+        "package_name": "kedro_dynamic_pipeline_hook_example",
+        "project_path": Path.cwd(),
+        "config_loader": config_loader,
+        "hook_manager": _create_hook_manager(),
+    }
+    return KedroContext(**context_kwargs)
+
+
+# The tests below are for demonstration purposes only and should be
+# replaced with tests that exercise the project's own functionality.
+class TestProjectContext:
+    """Example checks against the ``project_context`` fixture."""
+
+    def test_project_path(self, project_context):
+        """``project_path`` equals the current working directory."""
+        expected_path = Path.cwd()
+        assert project_context.project_path == expected_path