Merge branch 'develop' of github.com:quantumblacklabs/private-kedro into develop
QuantumBlack Labs committed Mar 30, 2020
2 parents e71222c + 8839054 commit 707f8dd
Showing 18 changed files with 182 additions and 181 deletions.
8 changes: 7 additions & 1 deletion .circleci/config.yml
@@ -147,7 +147,13 @@ jobs:

unit_tests_38:
executor: py38
steps: [unit_tests]
steps:
- checkout
- setup_conda
- setup_requirements
- run:
name: Run unit tests without Spark
command: make test-no-spark

linters_38:
executor: py38
2 changes: 0 additions & 2 deletions .coveragerc

This file was deleted.

3 changes: 3 additions & 0 deletions Makefile
@@ -21,6 +21,9 @@ lint:
test:
pytest tests

test-no-spark:
pytest tests --cov-config pyproject_no_spark.toml --ignore tests/extras/datasets/spark

e2e-tests:
behave

1 change: 1 addition & 0 deletions RELEASE.md
@@ -19,6 +19,7 @@
* Updated contribution process in `CONTRIBUTING.md` - added Developer Workflow.
* Fixed a bug where `PartitionedDataSet` and `IncrementalDataSet` were not working with `s3a` or `s3n` protocol.
* Documented installation of development version of Kedro in the [FAQ section](https://kedro.readthedocs.io/en/stable/06_resources/01_faq.html#how-can-i-use-development-version-of-kedro).
* Added the option for contributors to run Kedro tests locally without a Spark installation, using `make test-no-spark`.

## Breaking changes to the API
* Made `invalidate_cache` method on datasets private.
63 changes: 29 additions & 34 deletions kedro/cli/cli.py
@@ -41,7 +41,7 @@
from collections import defaultdict
from copy import deepcopy
from pathlib import Path
from typing import Any, Callable, Dict, List, Union
from typing import Any, Callable, Dict, List

import click
import pkg_resources
@@ -240,47 +240,40 @@ def _create_project(config_path: str, verbose: bool):
_handle_exception("Failed to generate project.")


def _get_config_from_prompts() -> Dict:
"""Ask user to provide necessary inputs.
def _get_user_input(
text: str, default: Any = None, check_input: Callable = None
) -> Any:
"""Get user input and validate it.
Args:
text: Text to display in command line prompt.
default: Default value for the input.
check_input: Function to apply to check user input.
Returns:
Resulting config dictionary.
Processed user value.
"""

def _get_user_input(
text: str,
default: Any = None,
assert_or_check_funcs: Union[Callable, List[Callable]] = None,
) -> Any:
"""Get user input and validate it.
Args:
text: Text to display in command line prompt.
default: Default value for the input.
assert_or_check_funcs: List of functions to apply to user input.
Value is overridden by function output if the latter is
not None.
Returns:
Processed user value.
"""
if callable(assert_or_check_funcs):
assert_or_check_funcs = [assert_or_check_funcs]
else:
assert_or_check_funcs = assert_or_check_funcs or []
while True:
while True:
value = click.prompt(text, default=default)
if check_input:
try:
value = click.prompt(text, default=default)
for _func in assert_or_check_funcs:
_func(value)
check_input(value)
except KedroCliError as exc:
click.secho(str(exc), fg="red", err=True)
else:
break
continue
return value


def _get_config_from_prompts() -> Dict:
"""Ask user to provide necessary inputs.
Returns:
Resulting config dictionary.
"""

# set output directory to the current directory
output_dir = os.path.abspath(os.path.curdir)

@@ -303,7 +296,9 @@ def _get_user_input(
"Lowercase is recommended.",
)
repo_name = _get_user_input(
repo_name_prompt, normalized_project_name, _assert_repo_name_ok
repo_name_prompt,
default=normalized_project_name,
check_input=_assert_repo_name_ok,
)

# get python package_name
@@ -316,7 +311,7 @@ def _get_user_input(
"or underscore.",
)
python_package = _get_user_input(
pkg_name_prompt, default_pkg_name, _assert_pkg_name_ok
pkg_name_prompt, default=default_pkg_name, check_input=_assert_pkg_name_ok
)

# option for whether iris example code is included in the project
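With the nested helper hoisted to module level, callers now pass the validator through the new check_input keyword argument and invalid input simply re-prompts. A rough usage sketch of the refactored helper, reusing the _get_user_input and KedroCliError names from kedro/cli/cli.py above; the validator and prompt text here are illustrative and not part of the commit:

# Hypothetical validator in the style of _assert_repo_name_ok / _assert_pkg_name_ok
def _assert_not_empty(value: str) -> None:
    if not value.strip():
        raise KedroCliError("Value must not be empty.")

# Re-prompts until _assert_not_empty stops raising KedroCliError
project_name = _get_user_input(
    "Project Name", default="New Kedro Project", check_input=_assert_not_empty
)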
12 changes: 6 additions & 6 deletions kedro/context/context.py
@@ -33,7 +33,7 @@
import os
import sys
from copy import deepcopy
from pathlib import Path
from pathlib import Path, PureWindowsPath
from typing import Any, Dict, Iterable, Union
from urllib.parse import urlparse
from warnings import warn
@@ -77,17 +77,17 @@ def _is_relative_path(path_string: str) -> bool:
Returns:
Whether the string is a relative path.
"""
drive, filepath = os.path.splitdrive(path_string)

is_full_windows_path_with_drive = bool(drive)
# os.path.splitdrive does not reliably work on non-Windows systems
# breaking the coverage, using PureWindowsPath instead
is_full_windows_path_with_drive = bool(PureWindowsPath(path_string).drive)
if is_full_windows_path_with_drive:
return False

is_remote_path = bool(urlparse(filepath).scheme)
is_remote_path = bool(urlparse(path_string).scheme)
if is_remote_path:
return False

is_absolute_path = Path(filepath).is_absolute()
is_absolute_path = Path(path_string).is_absolute()
if is_absolute_path:
return False

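The replacement relies on PureWindowsPath parsing drive letters the same way on every operating system, whereas os.path.splitdrive only recognises drives when running on Windows. A minimal sketch of the resulting check, with made-up example paths; it mirrors the logic above but is not copied verbatim from the commit:

from pathlib import Path, PureWindowsPath
from urllib.parse import urlparse

def is_relative_path(path_string: str) -> bool:
    # "C:\\data\\iris.csv" exposes a drive even when parsed on Linux or macOS
    if PureWindowsPath(path_string).drive:
        return False
    # "s3://bucket/iris.csv" has a URL scheme, so it is not a local relative path
    if urlparse(path_string).scheme:
        return False
    # "/var/data/iris.csv" is an absolute local path
    if Path(path_string).is_absolute():
        return False
    return True

assert is_relative_path("data/01_raw/iris.csv")
assert not is_relative_path("C:\\data\\iris.csv")
assert not is_relative_path("s3://bucket/iris.csv")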
4 changes: 2 additions & 2 deletions kedro/io/core.py
@@ -446,7 +446,7 @@ def _load_obj(class_path: str) -> Optional[object]:
return class_obj


def _local_exists(filepath: str) -> bool:
def _local_exists(filepath: str) -> bool: # SKIP_IF_NO_SPARK
filepath = Path(filepath)
return filepath.exists() or any(par.is_file() for par in filepath.parents)

@@ -600,7 +600,7 @@ def exists(self) -> bool:
return self._exists()
except VersionNotFoundError:
return False
except Exception as exc:
except Exception as exc: # SKIP_IF_NO_SPARK
message = "Failed during exists check for data set {}.\n{}".format(
str(self), str(exc)
)
3 changes: 0 additions & 3 deletions kedro/pipeline/modular_pipeline.py
@@ -148,9 +148,6 @@ def _is_transcode_base_in_mapping(name: str) -> bool:

def _map_transcode_base(name: str):
base_name, transcode_suffix = _transcode_split(name)
if not transcode_suffix:
return mapping[base_name]

return TRANSCODING_SEPARATOR.join((mapping[base_name], transcode_suffix))

def _rename(name: str):
2 changes: 1 addition & 1 deletion kedro/runner/parallel_runner.py
@@ -74,7 +74,7 @@ def save(self, data: Any):
# Checks if the error is due to serialisation or not
try:
pickle.dumps(data)
except Exception:
except Exception: # SKIP_IF_NO_SPARK
raise DataSetError(
"{} cannot be serialized. ParallelRunner implicit memory datasets "
"can only be used with serializable data".format(
5 changes: 5 additions & 0 deletions pyproject_no_spark.toml
@@ -0,0 +1,5 @@
[tool.coverage.report]
fail_under = 100
show_missing = true
omit = ["kedro/template*", "kedro/extras/datasets/spark/*", "tests/extras/datasets/spark/*"]
exclude_lines = ["pragma: no cover", "raise NotImplementedError", "SKIP_IF_NO_SPARK"]
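The SKIP_IF_NO_SPARK entry in exclude_lines makes coverage treat any source line containing that literal as excluded, and when the matched line opens a block (a def or an except clause, as in kedro/io/core.py and kedro/runner/parallel_runner.py above) the whole block is dropped from the report. That is how the no-Spark run can still enforce fail_under = 100. A hedged illustration; the function below is hypothetical and not part of Kedro:

def _local_spark_check(filepath: str) -> bool:  # SKIP_IF_NO_SPARK
    # Hypothetical Spark-dependent helper, not part of Kedro. Under
    # pyproject_no_spark.toml the SKIP_IF_NO_SPARK marker on the def line
    # matches an exclude_lines pattern, so coverage drops this whole function
    # from the report and the no-Spark run can still reach fail_under = 100.
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()
    return spark.read.parquet(filepath).count() > 0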
55 changes: 0 additions & 55 deletions tests/conftest.py
@@ -32,61 +32,11 @@
discover them automatically. More info here:
https://docs.pytest.org/en/latest/fixture.html
"""
# pylint: disable=import-error
import gc
import os
import sys
from subprocess import Popen

import mock
import pytest

if sys.version_info < (3, 8):
from pyspark import SparkContext
from pyspark.sql import SparkSession
else:
SparkContext = mock.Mock()
SparkSession = mock.Mock()

the_real_getOrCreate = None


class UseTheSparkSessionFixtureOrMock: # pylint: disable=too-few-public-methods
pass


# prevent using spark without going through the spark_session fixture
@pytest.fixture(scope="session", autouse=True)
def replace_spark_default_getorcreate():
global the_real_getOrCreate # pylint: disable=global-statement
the_real_getOrCreate = SparkSession.builder.getOrCreate
SparkSession.builder.getOrCreate = UseTheSparkSessionFixtureOrMock
return the_real_getOrCreate


# clean up pyspark after the test module finishes
@pytest.fixture(scope="module")
def spark_session():
SparkSession.builder.getOrCreate = the_real_getOrCreate
spark = SparkSession.builder.getOrCreate()
yield spark
spark.stop()
SparkSession.builder.getOrCreate = UseTheSparkSessionFixtureOrMock

# remove the cached JVM vars
SparkContext._jvm = None # pylint: disable=protected-access
SparkContext._gateway = None # pylint: disable=protected-access

# py4j doesn't shutdown properly so kill the actual JVM process
for obj in gc.get_objects():
try:
if isinstance(obj, Popen) and "pyspark" in obj.args[0]:
obj.terminate()
except ReferenceError: # pragma: no cover
# gc.get_objects may return dead weak proxy objects that will raise
# ReferenceError when you isinstance them
pass


@pytest.fixture(autouse=True)
def preserve_system_context():
@@ -100,8 +50,3 @@ def preserve_system_context():

if os.getcwd() != old_cwd:
os.chdir(old_cwd)


skip_if_py38 = pytest.mark.skipif(
sys.version_info >= (3, 8), reason="Dependency not compatible with Python 3.8 yet"
)
83 changes: 83 additions & 0 deletions tests/extras/datasets/spark/conftest.py
@@ -0,0 +1,83 @@
# Copyright 2020 QuantumBlack Visual Analytics Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
# (either separately or in combination, "QuantumBlack Trademarks") are
# trademarks of QuantumBlack. The License does not grant you any right or
# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
# Trademarks or any confusingly similar mark as a trademark for your product,
# or use the QuantumBlack Trademarks in any other manner that might cause
# confusion in the marketplace, including but not limited to in advertising,
# on websites, or on software.
#
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file contains the fixtures that are reusable by any tests within
this directory. You don't need to import the fixtures as pytest will
discover them automatically. More info here:
https://docs.pytest.org/en/latest/fixture.html
"""
import gc
from subprocess import Popen

import pytest

try:
from pyspark import SparkContext
from pyspark.sql import SparkSession
except ImportError: # pragma: no cover
pass # this is only for test discovery to succeed on Python 3.8

the_real_getOrCreate = None


class UseTheSparkSessionFixtureOrMock: # pylint: disable=too-few-public-methods
pass


# prevent using spark without going through the spark_session fixture
@pytest.fixture(scope="session", autouse=True)
def replace_spark_default_getorcreate():
global the_real_getOrCreate # pylint: disable=global-statement
the_real_getOrCreate = SparkSession.builder.getOrCreate
SparkSession.builder.getOrCreate = UseTheSparkSessionFixtureOrMock
return the_real_getOrCreate


# clean up pyspark after the test module finishes
@pytest.fixture(scope="module")
def spark_session(): # SKIP_IF_NO_SPARK
SparkSession.builder.getOrCreate = the_real_getOrCreate
spark = SparkSession.builder.getOrCreate()
yield spark
spark.stop()
SparkSession.builder.getOrCreate = UseTheSparkSessionFixtureOrMock

# remove the cached JVM vars
SparkContext._jvm = None # pylint: disable=protected-access
SparkContext._gateway = None # pylint: disable=protected-access

# py4j doesn't shutdown properly so kill the actual JVM process
for obj in gc.get_objects():
try:
if isinstance(obj, Popen) and "pyspark" in obj.args[0]:
obj.terminate()
except ReferenceError: # pragma: no cover
# gc.get_objects may return dead weak proxy objects that will raise
# ReferenceError when you isinstance them
pass
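Tests under tests/extras/datasets/spark/ now have to request the spark_session fixture explicitly; code that calls SparkSession.builder.getOrCreate() directly gets the UseTheSparkSessionFixtureOrMock sentinel instead of a real session. A minimal sketch of a test using the fixture; the test itself is illustrative and not taken from this commit:

import pytest


@pytest.fixture
def sample_df(spark_session):
    # The module-scoped spark_session fixture restores the real getOrCreate,
    # builds a session, and tears the JVM down once the test module finishes.
    return spark_session.createDataFrame([(1, "alpha"), (2, "beta")], ["id", "name"])


def test_row_count(sample_df):
    assert sample_df.count() == 2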
