Commit

Merge branch 'main' into SajidAlamQB-patch-1
SajidAlamQB authored Oct 3, 2022
2 parents b39b74c + 929249b commit 1433f9e
Showing 10 changed files with 86 additions and 24 deletions.
9 changes: 8 additions & 1 deletion RELEASE.md
@@ -13,8 +13,15 @@
## Major features and improvements

## Bug fixes and other changes
* Fixed `kedro micropkg pull` for packages on PyPI.
* Fixed `format` in `save_args` for `SparkHiveDataSet`, previously it didn't allow you to save it as delta format.
* Updated error message for `VersionNotFoundError` to handle insufficient permission issues for cloud storage.
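
As a minimal sketch of what the `SparkHiveDataSet` fix above enables (the database and table names here are placeholders, not from the changeset): a `format` entry in `save_args` is now honoured instead of falling back to the default Hive format.

```python
from kedro.extras.datasets.spark import SparkHiveDataSet

# Hypothetical usage: with this fix, the "format" entry in save_args is
# respected, so the table is written as Delta rather than plain Hive.
dataset = SparkHiveDataSet(
    database="analytics",  # placeholder database name
    table="shuttles",      # placeholder table name
    save_args={"format": "delta"},
)
```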

## Minor breaking changes to the API

## Upcoming deprecations for Kedro 0.19.0
* `kedro test` and `kedro lint` will be deprecated.

## Breaking changes to the API

# Release 0.18.3

2 changes: 1 addition & 1 deletion docs/source/deployment/databricks.md
@@ -159,7 +159,7 @@ Then press `Confirm` button. Your cluster will be restarted to apply the changes

Congratulations, you are now ready to run your Kedro project from the Databricks!

[Create your Databricks notebook](https://docs.databricks.com/notebooks/notebooks-manage.html#create-a-notebook) and remember to [attach it to the cluster](https://docs.databricks.com/notebooks/notebooks-manage.html#attach) you have just configured.
[Create your Databricks notebook](https://docs.databricks.com/notebooks/notebooks-manage.html#create-a-notebook) and remember to attach it to the cluster you have just configured.

In your newly-created notebook, put each of the below code snippets into a separate cell, then [run all cells](https://docs.databricks.com/notebooks/notebooks-use.html#run-notebooks):

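The notebook snippets referred to above are not shown in this hunk. Purely as an assumed illustration of the kind of cell involved (the project path is a placeholder, and the actual docs snippets may differ), running a Kedro project from a Databricks notebook typically looks like this:

```python
# Hypothetical notebook cell: bootstrap the uploaded project and run its
# default pipeline. The path below is a placeholder, not from the docs page.
from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project

project_root = "/dbfs/FileStore/new-kedro-project"
bootstrap_project(project_root)

with KedroSession.create(project_path=project_root) as session:
    session.run()
```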
16 changes: 12 additions & 4 deletions docs/source/tutorial/visualise_pipeline.md
@@ -126,13 +126,16 @@ Below is an example of how to visualise plots on Kedro-Viz using `plotly.PlotlyD
The below functions can be added to the `nodes.py` and `pipeline.py` files respectively.

```python
# nodes.py
import pandas as pd
def compare_passenger_capacity(preprocessed_shuttles: pd.DataFrame):
    return preprocessed_shuttles.groupby(["shuttle_type"]).mean().reset_index()
```

```python
# pipeline.py
def create_pipeline(**kwargs) -> Pipeline:
"""This is a simple pipeline which generates a plot"""
return pipeline(
Expand Down Expand Up @@ -175,6 +178,7 @@ Below is an example of how to visualise plots using [Plotly Express](https://plo
The below functions can be added to the `nodes.py` and `pipeline.py` files respectively.

```python
# nodes.py
import plotly.express as px
import pandas as pd
@@ -200,8 +204,10 @@ def compare_passenger_capacity(preprocessed_shuttles: pd.DataFrame):
        ]
    )
    return fig
```

```python
# pipeline.py
def create_pipeline(**kwargs) -> Pipeline:
"""This is a simple pipeline which generates a plot"""
return pipeline(
Expand Down Expand Up @@ -256,6 +262,7 @@ To use this dataset, configure your plot in your Kedro node. The below functions
```python
# nodes.py
import matplotlib.pyplot as plt
import seaborn as sn
def create_confusion_matrix(companies: pd.DataFrame):
@@ -268,8 +275,9 @@ def create_confusion_matrix(companies: pd.DataFrame):
    )
    sn.heatmap(confusion_matrix, annot=True)
    return plt
```

```python
# pipeline.py
def create_pipeline(**kwargs) -> Pipeline:
"""This is a simple pipeline which generates a plot"""
Expand All @@ -287,7 +295,7 @@ def create_pipeline(**kwargs) -> Pipeline:
You must also specify the output type in the `catalog.yml` file, like below. Remember to set the versioned flag to `true` if you want to add the plots to experiment tracking as well.

```yaml
reporting.dummy_confusion_matrix:
dummy_confusion_matrix:
  type: matplotlib.MatplotlibWriter
  filepath: ${base_location}/08_reporting/dummy_confusion_matrix.png
  versioned: true
```
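
For orientation, a minimal `pipeline.py` wiring consistent with the partially collapsed snippet above might look like the sketch below. The registered output name must match the `catalog.yml` entry; the import path and the input dataset name are assumptions for illustration.

```python
# Sketch only: wiring create_confusion_matrix so its output is saved by the
# matplotlib.MatplotlibWriter entry defined in catalog.yml above.
from kedro.pipeline import Pipeline, node, pipeline

from .nodes import create_confusion_matrix  # assumed module layout


def create_pipeline(**kwargs) -> Pipeline:
    return pipeline(
        [
            node(
                func=create_confusion_matrix,
                inputs="companies",  # assumed input dataset name
                outputs="dummy_confusion_matrix",  # must match the catalog entry
            ),
        ]
    )
```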
2 changes: 1 addition & 1 deletion kedro/extras/datasets/spark/spark_hive_dataset.py
@@ -114,7 +114,7 @@ def __init__(
        self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS)
        if save_args is not None:
            self._save_args.update(save_args)
        self._format = self._save_args.get("format") or "hive"
        self._format = self._save_args.pop("format", None) or "hive"
        self._eager_checkpoint = self._save_args.pop("eager_checkpoint", None) or True

    def _describe(self) -> Dict[str, Any]:
13 changes: 6 additions & 7 deletions kedro/framework/cli/micropkg.py
@@ -141,18 +141,17 @@ def _pull_package(
):
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_dir_path = Path(temp_dir).resolve()

        _unpack_sdist(package_path, temp_dir_path, fs_args)

        sdist_file_name = Path(package_path).name.rstrip(".tar.gz")
        egg_info_file = list((temp_dir_path / sdist_file_name).glob("*.egg-info"))
        if len(egg_info_file) != 1:
        egg_info_files = list((temp_dir_path).rglob("*.egg-info"))
        if len(egg_info_files) != 1:
            raise KedroCliError(
                f"More than 1 or no egg-info files found from {package_path}. "
                f"There has to be exactly one egg-info directory."
            )
        package_name = egg_info_file[0].stem
        package_requirements = temp_dir_path / sdist_file_name / "setup.py"
        egg_info_file = egg_info_files[0]
        package_name = egg_info_file.stem
        package_requirements = egg_info_file.parent / "setup.py"

        # Finds a string representation of 'install_requires' list from setup.py
        reqs_list_pattern = r"install_requires\=(.*?)\,\n"
@@ -172,7 +171,7 @@ def _pull_package(
        _install_files(
            metadata,
            package_name,
            temp_dir_path / sdist_file_name,
            egg_info_file.parent,
            env,
            alias,
            destination,
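
As a short, assumed illustration of why the `kedro micropkg pull` fix above switches from a single-directory `glob` to a recursive `rglob`: an sdist downloaded from PyPI can unpack into a directory whose name differs from the requested package spec, so the `*.egg-info` directory is located by searching the whole extraction directory instead. Paths below are placeholders.

```python
# Illustrative sketch with placeholder paths, mirroring the new lookup logic.
from pathlib import Path

temp_dir_path = Path("/tmp/kedro-micropkg-unpack")  # placeholder extraction dir

# A PyPI sdist may unpack as e.g. "my_pipeline-0.1/" even when the user asked
# for "my-pipeline", so search recursively rather than under a guessed name.
egg_info_files = list(temp_dir_path.rglob("*.egg-info"))
if len(egg_info_files) == 1:
    package_name = egg_info_files[0].stem
    package_requirements = egg_info_files[0].parent / "setup.py"
    print(package_name, package_requirements)
```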
26 changes: 21 additions & 5 deletions kedro/framework/cli/project.py
@@ -71,7 +71,14 @@ def project_group(): # pragma: no cover
@forward_command(project_group, forward_help=True)
@click.pass_obj  # this will pass the metadata as first argument
def test(metadata: ProjectMetadata, args, **kwargs):  # pylint: disable=unused-argument
    """Run the test suite."""
    """Run the test suite. (DEPRECATED)"""
    deprecation_message = (
        "DeprecationWarning: Command 'kedro test' is deprecated and "
        "will not be available from Kedro 0.19.0. "
        "Use the command 'pytest' instead. "
    )
    click.secho(deprecation_message, fg="red")

    try:
        _check_module_importable("pytest")
    except KedroCliError as exc:
@@ -90,7 +97,13 @@ def test(metadata: ProjectMetadata, args, **kwargs): # pylint: disable=unused-a
def lint(
    metadata: ProjectMetadata, files, check_only, **kwargs
):  # pylint: disable=unused-argument
    """Run flake8, isort and black."""
    """Run flake8, isort and black. (DEPRECATED)"""
    deprecation_message = (
        "DeprecationWarning: Command 'kedro lint' is deprecated and "
        "will not be available from Kedro 0.19.0."
    )
    click.secho(deprecation_message, fg="red")

    source_path = metadata.source_dir
    package_name = metadata.package_name
    files = files or (str(source_path / "tests"), str(source_path / package_name))
@@ -171,13 +184,15 @@ def package(metadata: ProjectMetadata):
@click.pass_obj  # this will pass the metadata as first argument
def build_docs(metadata: ProjectMetadata, open_docs):
    """Build the project documentation. (DEPRECATED)"""
    source_path = metadata.source_dir
    package_name = metadata.package_name
    deprecation_message = (
        "DeprecationWarning: Command 'kedro build-docs' is deprecated and "
        "will not be available from Kedro 0.19.0."
    )
    click.secho(deprecation_message, fg="red")

    source_path = metadata.source_dir
    package_name = metadata.package_name

    python_call("pip", ["install", str(source_path / "[docs]")])
    python_call("pip", ["install", "-r", str(source_path / "requirements.txt")])
    python_call("ipykernel", ["install", "--user", f"--name={package_name}"])
@@ -262,12 +277,13 @@ def activate_nbstripout(
    metadata: ProjectMetadata, **kwargs
):  # pylint: disable=unused-argument
    """Install the nbstripout git hook to automatically clean notebooks. (DEPRECATED)"""
    source_path = metadata.source_dir
    deprecation_message = (
        "DeprecationWarning: Command 'kedro activate-nbstripout' is deprecated and "
        "will not be available from Kedro 0.19.0."
    )
    click.secho(deprecation_message, fg="red")

    source_path = metadata.source_dir
    click.secho(
        (
            "Notebook output cells will be automatically cleared before committing"
11 changes: 9 additions & 2 deletions kedro/io/core.py
@@ -538,9 +538,16 @@ def _fetch_latest_load_version(self) -> str:
        most_recent = next(
            (path for path in version_paths if self._exists_function(path)), None
        )

        protocol = getattr(self, "_protocol", None)
        if not most_recent:
            raise VersionNotFoundError(f"Did not find any versions for {self}")
            if protocol in CLOUD_PROTOCOLS:
                message = (
                    f"Did not find any versions for {self}. This could be "
                    f"due to insufficient permission."
                )
            else:
                message = f"Did not find any versions for {self}"
            raise VersionNotFoundError(message)

        return PurePath(most_recent).parent.name

11 changes: 11 additions & 0 deletions tests/extras/datasets/spark/test_spark_hive_dataset.py
@@ -301,3 +301,14 @@ def test_read_from_non_existent_table(self):
r"table_doesnt_exist\], \[\], false\n",
):
dataset.load()

def test_save_delta_format(self, mocker):
dataset = SparkHiveDataSet(
database="default_1", table="delta_table", save_args={"format": "delta"}
)
mocked_save = mocker.patch("pyspark.sql.DataFrameWriter.saveAsTable")
dataset.save(_generate_spark_df_one())
mocked_save.assert_called_with(
"default_1.delta_table", mode="errorifexists", format="delta"
)
assert dataset._format == "delta"
7 changes: 5 additions & 2 deletions tests/framework/cli/micropkg/test_micropkg_pull.py
@@ -627,9 +627,12 @@ def test_pull_from_pypi(

options = ["-e", env] if env else []
options += ["--alias", alias] if alias else []

package_name = "my-pipeline"

result = CliRunner().invoke(
fake_project_cli,
["micropkg", "pull", f"{PIPELINE_NAME}-{version}", *options],
["micropkg", "pull", package_name, *options],
obj=fake_metadata,
)
assert result.exit_code == 0
Expand All @@ -642,7 +645,7 @@ def test_pull_from_pypi(
"--no-deps",
"--dest",
str(tmp_path),
f"{PIPELINE_NAME}-{version}",
package_name,
],
)

13 changes: 12 additions & 1 deletion tests/io/test_data_catalog.py
@@ -19,7 +19,7 @@
    LambdaDataSet,
    MemoryDataSet,
)
from kedro.io.core import VERSION_FORMAT, generate_timestamp
from kedro.io.core import VERSION_FORMAT, Version, generate_timestamp


@pytest.fixture
@@ -652,3 +652,14 @@ def test_replacing_nonword_characters(self):
assert "ds2_spark" in catalog.datasets.__dict__
assert "ds3__csv" in catalog.datasets.__dict__
assert "jalapeño" in catalog.datasets.__dict__

def test_no_versions_with_cloud_protocol(self):
"""Check the error if no versions are available for load from cloud storage"""
version = Version(load=None, save=None)
versioned_dataset = CSVDataSet("s3://bucket/file.csv", version=version)
pattern = re.escape(
f"Did not find any versions for {versioned_dataset}. "
f"This could be due to insufficient permission."
)
with pytest.raises(DataSetError, match=pattern):
versioned_dataset.load()
