Skip to content

Commit

Permalink
Add PySpark to the add-ons flow (#3169)
Browse files Browse the repository at this point in the history
* add pyspark config to add-ons

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* add pyspark option in starters and prompts

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* add pyspark kedro hook and make it work with add-ons

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* add pyspark hook to settings.py

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* add kedro-dataset[spark.SparkDataSet] to add-ons

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* dont add pyspark add ons for e2e test

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* fix test

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* changes based of review

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* Delete hooks.py

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* remove spark_config from utils.py

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* remove parameter files from conf/base

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* fix pyspark add-ons

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* Add guard clause for parse_add_ons_input

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* changes for refactoring

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* revert pyspark_path

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* update add-ons example usage

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* ignore coverage on None add_ons_str

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* fix linting

Signed-off-by: Nok <nok.lam.chan@quantumblack.com>

* lint

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* refactor utils.py, strip pyproject.toml and update tests

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* Fix coverage

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* Revert "Fix coverage"

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* Revert "refactor utils.py, strip pyproject.toml and update tests"

This reverts commit 343e805.

* changes based on review

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* Update kedro/framework/cli/starters.py

Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com>
Signed-off-by: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com>

* Update starters.py

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

* lint

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>

---------

Signed-off-by: SajidAlamQB <90610031+SajidAlamQB@users.noreply.github.com>
Signed-off-by: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com>
Signed-off-by: Nok Lam Chan <nok.lam.chan@quantumblack.com>
Signed-off-by: Nok <nok.lam.chan@quantumblack.com>
Co-authored-by: Nok Lam Chan <nok.lam.chan@quantumblack.com>
Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com>
  • Loading branch information
3 people authored Oct 26, 2023
1 parent cdc32d8 commit 5e9e05e
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 33 deletions.
2 changes: 1 addition & 1 deletion features/steps/cli_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def create_config_file(context):
context.root_project_dir = context.temp_dir / context.project_name
context.package_name = context.project_name.replace("-", "_")
config = {
"add_ons": "all",
"add_ons": "1-5",
"project_name": context.project_name,
"repo_name": context.project_name,
"output_dir": str(context.temp_dir),
Expand Down
45 changes: 37 additions & 8 deletions kedro/framework/cli/starters.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,10 @@ class KedroStarterSpec: # noqa: too-few-public-methods
3) Custom Logging: Provides more logging options\n
4) Documentation: Basic documentation setup with Sphinx\n
5) Data Structure: Provides a directory structure for storing data\n
6) Pyspark: Provides a basic PySpark set up\n
Example usage:\n
kedro new --addons=lint,test,log,docs,data (or any subset of these options)\n
kedro new --addons=lint,test,log,docs,data,pyspark (or any subset of these options)\n
kedro new --addons=all\n
kedro new --addons=none
"""
Expand All @@ -121,6 +122,7 @@ class KedroStarterSpec: # noqa: too-few-public-methods
"3": "Custom Logging",
"4": "Documentation",
"5": "Data Structure",
"6": "Pyspark",
}

# noqa: unused-argument
Expand Down Expand Up @@ -210,14 +212,17 @@ def _validate_range(start, end):
def _validate_selection(add_ons: list[str]):
for add_on in add_ons:
if int(add_on) < 1 or int(add_on) > len(ADD_ONS_DICT):
message = f"'{add_on}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5." # nosec
message = f"'{add_on}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5, 6." # nosec
click.secho(message, fg="red", err=True)
sys.exit(1)

if add_ons_str == "all":
return list(ADD_ONS_DICT)
if add_ons_str == "none":
return []
# Guard clause if add_ons_str is None, which can happen if prompts.yml is removed
if not add_ons_str:
return [] # pragma: no cover

# Split by comma
add_ons_choices = add_ons_str.split(",")
Expand Down Expand Up @@ -318,7 +323,10 @@ def new(config_path, starter_alias, selected_addons, checkout, directory, **kwar
config = _get_addons_from_cli_input(selected_addons, config)

cookiecutter_args = _make_cookiecutter_args(config, checkout, directory)
_create_project(template_path, cookiecutter_args)

project_template = fetch_template_based_on_add_ons(template_path, cookiecutter_args)

_create_project(project_template, cookiecutter_args)


@create_cli.group()
Expand Down Expand Up @@ -366,7 +374,14 @@ def _get_addons_from_cli_input(
Configuration for starting a new project, with the selected add-ons
from the `--addons` flag.
"""
string_to_number = {"lint": "1", "test": "2", "log": "3", "docs": "4", "data": "5"}
string_to_number = {
"lint": "1",
"test": "2",
"log": "3",
"docs": "4",
"data": "5",
"pyspark": "6",
}

if selected_addons is not None:
addons = selected_addons.split(",")
Expand All @@ -393,14 +408,14 @@ def _select_prompts_to_display(prompts_required: dict, selected_addons: str) ->
Returns:
the prompts_required dictionary, with all the redundant information removed.
"""
valid_addons = ["lint", "test", "log", "docs", "data", "all", "none"]
valid_addons = ["lint", "test", "log", "docs", "data", "pyspark", "all", "none"]

if selected_addons is not None:
addons = re.sub(r"\s", "", selected_addons).split(",")
for addon in addons:
if addon not in valid_addons:
click.secho(
"Please select from the available add-ons: lint, test, log, docs, data, all, none",
"Please select from the available add-ons: lint, test, log, docs, data, pyspark, all, none",
fg="red",
err=True,
)
Expand Down Expand Up @@ -493,6 +508,16 @@ def _make_cookiecutter_args(
return cookiecutter_args


def fetch_template_based_on_add_ons(template_path, cookiecutter_args: dict[str, Any]):
extra_context = cookiecutter_args["extra_context"]
add_ons = extra_context.get("add_ons")
if add_ons and "Pyspark" in add_ons:
cookiecutter_args["directory"] = "spaceflights-pyspark"
pyspark_path = "git+https://github.com/kedro-org/kedro-starters.git"
return pyspark_path
return template_path


def _create_project(template_path: str, cookiecutter_args: dict[str, Any]):
"""Creates a new kedro project using cookiecutter.
Expand Down Expand Up @@ -524,8 +549,12 @@ def _create_project(template_path: str, cookiecutter_args: dict[str, Any]):
)
add_ons = extra_context.get("add_ons")

# Only non-starter projects have configurable add-ons
if template_path == str(TEMPLATE_PATH):
# Only core template and spaceflights-pyspark have configurable add-ons
if (
template_path == str(TEMPLATE_PATH)
or add_ons is not None
and "Pyspark" in add_ons
):
if add_ons == "[]": # TODO: This should be a list
click.secho("\nYou have selected no add-ons")
else:
Expand Down
5 changes: 3 additions & 2 deletions kedro/templates/project/hooks/post_gen_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,19 @@
setup_template_add_ons,
sort_requirements,
)
from kedro.framework.cli.starters import _parse_add_ons_input


def main():
current_dir = Path.cwd()
requirements_file_path = current_dir / "requirements.txt"
pyproject_file_path = current_dir / "pyproject.toml"
python_package_name = '{{ cookiecutter.python_package }}'

# Get the selected add-ons from cookiecutter
selected_add_ons = "{{ cookiecutter.add_ons }}"

# Handle template directories and requirements according to selected add-ons
setup_template_add_ons(selected_add_ons, requirements_file_path, pyproject_file_path)
setup_template_add_ons(selected_add_ons, requirements_file_path, pyproject_file_path, python_package_name)

# Sort requirements.txt file in alphabetical order
sort_requirements(requirements_file_path)
Expand Down
35 changes: 34 additions & 1 deletion kedro/templates/project/hooks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
"""

docs_pyproject_requirements = """
[project.optional-dependencies]
docs = [
"docutils<0.18.0",
"sphinx~=3.4.3",
Expand All @@ -48,7 +49,7 @@
"""


def setup_template_add_ons(selected_add_ons_list, requirements_file_path, pyproject_file_path):
def setup_template_add_ons(selected_add_ons_list, requirements_file_path, pyproject_file_path, python_package_name):
"""Removes directories and files related to unwanted addons from
a Kedro project template. Adds the necessary requirements for
the addons that were selected.
Expand Down Expand Up @@ -96,6 +97,38 @@ def setup_template_add_ons(selected_add_ons_list, requirements_file_path, pyproj
if data_path.exists():
shutil.rmtree(str(data_path))

if "Pyspark" not in selected_add_ons_list: # If PySpark not selected
pass
else: # Use spaceflights-pyspark to create pyspark template
# Remove all .csv and .xlsx files from data/01_raw/
raw_data_path = current_dir / "data/01_raw/"
if raw_data_path.exists() and raw_data_path.is_dir():
for file_path in raw_data_path.glob("*.*"):
if file_path.suffix in [".csv", ".xlsx"]:
file_path.unlink()

# Remove parameter files from conf/base/
param_files = [
"parameters_data_processing.yml",
"parameters_data_science.yml",
]
conf_base_path = current_dir / "conf/base/"
if conf_base_path.exists() and conf_base_path.is_dir():
for param_file in param_files:
file_path = conf_base_path / param_file
if file_path.exists():
file_path.unlink()

# Remove specific pipeline subdirectories
pipelines_path = current_dir / f"src/{python_package_name}/pipelines/"
for pipeline_subdir in ["data_science", "data_processing"]:
shutil.rmtree(pipelines_path / pipeline_subdir, ignore_errors=True)

# Remove all test file from tests/pipelines/
test_pipeline_path = current_dir / "tests/pipelines/test_data_science.py"
if test_pipeline_path.exists():
test_pipeline_path.unlink()


def sort_requirements(requirements_file_path):
"""Sort the requirements.txt file in alphabetical order.
Expand Down
3 changes: 2 additions & 1 deletion kedro/templates/project/prompts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ add_ons:
3) Custom Logging : Provides more logging options
4) Documentation: Provides basic documentations setup with Sphinx
5) Data Structure: Provides a directory structure for storing data
6) PySpark : Provides a basic PySpark set up
Which add-ons would you like to include in your project? [1-5/1,3/all/none]:
Which add-ons would you like to include in your project? [1-6/1,3/all/none]:
regex_validator: "^(all|none|(\\d(,\\d)*|(\\d-\\d)))$"
error_message: |
Invalid input. Please select valid options for add-ons using comma-separated values, ranges, or 'all/none'.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,6 @@ dynamic = ["dependencies", "version"]

[project.entry-points."kedro.hooks"]

[project.optional-dependencies]
docs = [
"docutils<0.18.0",
"sphinx~=3.4.3",
"sphinx_rtd_theme==0.5.1",
"nbsphinx==0.8.1",
"sphinx-autodoc-typehints==1.11.1",
"sphinx_copybutton==0.3.1",
"ipykernel>=5.3, <7.0",
"Jinja2<3.1.0",
"myst-parser~=0.17.2",
]

[tool.setuptools.dynamic]
dependencies = {file = "requirements.txt"}
version = {attr = "{{ cookiecutter.python_package }}.__version__"}
Expand Down
23 changes: 16 additions & 7 deletions tests/framework/cli/test_starters.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,14 @@ def _make_cli_prompt_input_without_addons(


def _convert_addon_names_to_numbers(selected_addons: str):
string_to_number = {"lint": "1", "test": "2", "log": "3", "docs": "4", "data": "5"}
string_to_number = {
"lint": "1",
"test": "2",
"log": "3",
"docs": "4",
"data": "5",
"pyspark": "6",
}

addons = selected_addons.split(",")
for i in range(len(addons)):
Expand All @@ -81,6 +88,7 @@ def _get_expected_files(add_ons: str):
"3": 1,
"4": 2,
"5": 8,
"6": 2,
} # files added to template by each add-on
add_ons_list = _parse_add_ons_input(add_ons)

Expand Down Expand Up @@ -269,7 +277,7 @@ def test_starter_list_with_invalid_starter_plugin(
("1,2,3", ["1", "2", "3"]),
("2-4", ["2", "3", "4"]),
("3-3", ["3"]),
("all", ["1", "2", "3", "4", "5"]),
("all", ["1", "2", "3", "4", "5", "6"]),
("none", []),
],
)
Expand All @@ -291,12 +299,12 @@ def test_parse_add_ons_invalid_range(input, capsys):

@pytest.mark.parametrize(
"input,first_invalid",
[("0,3,5", "0"), ("1,3,6", "6"), ("0-4", "0"), ("3-6", "6")],
[("0,3,5", "0"), ("1,3,7", "7"), ("0-4", "0"), ("3-7", "7")],
)
def test_parse_add_ons_invalid_selection(input, first_invalid, capsys):
with pytest.raises(SystemExit):
_parse_add_ons_input(input)
message = f"'{first_invalid}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5."
message = f"'{first_invalid}' is not a valid selection.\nPlease select from the available add-ons: 1, 2, 3, 4, 5, 6."
assert message in capsys.readouterr().err


Expand Down Expand Up @@ -881,7 +889,7 @@ def test_directory_flag_with_starter_alias(self, fake_kedro_cli):
class TestAddOnsFromUserPrompts:
@pytest.mark.parametrize(
"add_ons",
["1", "2", "3", "4", "5", "none", "2,3,4", "3-5", "all"],
["1", "2", "3", "4", "5", "6", "none", "2,3,4", "3-5", "all"],
)
def test_valid_add_ons(self, fake_kedro_cli, add_ons):
result = CliRunner().invoke(
Expand Down Expand Up @@ -913,7 +921,7 @@ def test_invalid_add_ons(self, fake_kedro_cli):
class TestAddOnsFromConfigFile:
@pytest.mark.parametrize(
"add_ons",
["1", "2", "3", "4", "5", "none", "2,3,4", "3-5", "all"],
["1", "2", "3", "4", "5", "6", "none", "2,3,4", "3-5", "all"],
)
def test_valid_add_ons(self, fake_kedro_cli, add_ons):
"""Test project created from config."""
Expand Down Expand Up @@ -963,6 +971,7 @@ class TestAddOnsFromCLI:
"log",
"docs",
"data",
"pyspark",
"none",
"test,log,docs",
"test,data,lint",
Expand Down Expand Up @@ -992,7 +1001,7 @@ def test_invalid_add_ons(self, fake_kedro_cli):

assert result.exit_code != 0
assert (
"Please select from the available add-ons: lint, test, log, docs, data, all, none"
"Please select from the available add-ons: lint, test, log, docs, data, pyspark, all, none"
in result.output
)

Expand Down

0 comments on commit 5e9e05e

Please sign in to comment.