Merge branch 'develop' of github.com:quantumblacklabs/private-kedro into develop
QuantumBlack Labs committed Mar 30, 2020
2 parents e71222c + 8839054 commit 707f8dd
Showing 18 changed files with 182 additions and 181 deletions.
8 changes: 7 additions & 1 deletion .circleci/config.yml
@@ -147,7 +147,13 @@ jobs:

unit_tests_38:
executor: py38
steps: [unit_tests]
steps:
- checkout
- setup_conda
- setup_requirements
- run:
name: Run unit tests without Spark
command: make test-no-spark

linters_38:
executor: py38
2 changes: 0 additions & 2 deletions .coveragerc

This file was deleted.

3 changes: 3 additions & 0 deletions Makefile
@@ -21,6 +21,9 @@ lint:
test:
pytest tests

test-no-spark:
pytest tests --cov-config pyproject_no_spark.toml --ignore tests/extras/datasets/spark

e2e-tests:
behave

1 change: 1 addition & 0 deletions RELEASE.md
@@ -19,6 +19,7 @@
* Updated contribution process in `CONTRIBUTING.md` - added Developer Workflow.
* Fixed a bug where `PartitionedDataSet` and `IncrementalDataSet` were not working with `s3a` or `s3n` protocol.
* Documented installation of development version of Kedro in the [FAQ section](https://kedro.readthedocs.io/en/stable/06_resources/01_faq.html#how-can-i-use-development-version-of-kedro).
* Added the option for contributors to run Kedro tests locally without a Spark installation, using `make test-no-spark`.

## Breaking changes to the API
* Made `invalidate_cache` method on datasets private.
63 changes: 29 additions & 34 deletions kedro/cli/cli.py
@@ -41,7 +41,7 @@
from collections import defaultdict
from copy import deepcopy
from pathlib import Path
from typing import Any, Callable, Dict, List, Union
from typing import Any, Callable, Dict, List

import click
import pkg_resources
@@ -240,47 +240,40 @@ def _create_project(config_path: str, verbose: bool):
_handle_exception("Failed to generate project.")


def _get_config_from_prompts() -> Dict:
"""Ask user to provide necessary inputs.
def _get_user_input(
text: str, default: Any = None, check_input: Callable = None
) -> Any:
"""Get user input and validate it.
Args:
text: Text to display in command line prompt.
default: Default value for the input.
check_input: Function to apply to check user input.
Returns:
Resulting config dictionary.
Processed user value.
"""

def _get_user_input(
text: str,
default: Any = None,
assert_or_check_funcs: Union[Callable, List[Callable]] = None,
) -> Any:
"""Get user input and validate it.
Args:
text: Text to display in command line prompt.
default: Default value for the input.
assert_or_check_funcs: List of functions to apply to user input.
Value is overridden by function output if the latter is
not None.
Returns:
Processed user value.
"""
if callable(assert_or_check_funcs):
assert_or_check_funcs = [assert_or_check_funcs]
else:
assert_or_check_funcs = assert_or_check_funcs or []
while True:
while True:
value = click.prompt(text, default=default)
if check_input:
try:
value = click.prompt(text, default=default)
for _func in assert_or_check_funcs:
_func(value)
check_input(value)
except KedroCliError as exc:
click.secho(str(exc), fg="red", err=True)
else:
break
continue
return value


def _get_config_from_prompts() -> Dict:
"""Ask user to provide necessary inputs.
Returns:
Resulting config dictionary.
"""

# set output directory to the current directory
output_dir = os.path.abspath(os.path.curdir)

@@ -303,7 +296,9 @@ def _get_user_input(
"Lowercase is recommended.",
)
repo_name = _get_user_input(
repo_name_prompt, normalized_project_name, _assert_repo_name_ok
repo_name_prompt,
default=normalized_project_name,
check_input=_assert_repo_name_ok,
)

# get python package_name
@@ -316,7 +311,7 @@ def _get_user_input(
"or underscore.",
)
python_package = _get_user_input(
pkg_name_prompt, default_pkg_name, _assert_pkg_name_ok
pkg_name_prompt, default=default_pkg_name, check_input=_assert_pkg_name_ok
)

# option for whether iris example code is included in the project
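With the nested helper hoisted to module level, callers now pass the validator through the new check_input keyword argument and invalid input simply re-prompts. A rough usage sketch of the refactored helper, reusing the _get_user_input and KedroCliError names from kedro/cli/cli.py above; the validator and prompt text here are illustrative and not part of the commit:

# Hypothetical validator in the style of _assert_repo_name_ok / _assert_pkg_name_ok
def _assert_not_empty(value: str) -> None:
    if not value.strip():
        raise KedroCliError("Value must not be empty.")

# Re-prompts until _assert_not_empty stops raising KedroCliError
project_name = _get_user_input(
    "Project Name", default="New Kedro Project", check_input=_assert_not_empty
)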
12 changes: 6 additions & 6 deletions kedro/context/context.py
@@ -33,7 +33,7 @@
import os
import sys
from copy import deepcopy
from pathlib import Path
from pathlib import Path, PureWindowsPath
from typing import Any, Dict, Iterable, Union
from urllib.parse import urlparse
from warnings import warn
@@ -77,17 +77,17 @@ def _is_relative_path(path_string: str) -> bool:
Returns:
Whether the string is a relative path.
"""
drive, filepath = os.path.splitdrive(path_string)

is_full_windows_path_with_drive = bool(drive)
# os.path.splitdrive does not reliably work on non-Windows systems
# breaking the coverage, using PureWindowsPath instead
is_full_windows_path_with_drive = bool(PureWindowsPath(path_string).drive)
if is_full_windows_path_with_drive:
return False

is_remote_path = bool(urlparse(filepath).scheme)
is_remote_path = bool(urlparse(path_string).scheme)
if is_remote_path:
return False

is_absolute_path = Path(filepath).is_absolute()
is_absolute_path = Path(path_string).is_absolute()
if is_absolute_path:
return False

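The replacement relies on PureWindowsPath parsing drive letters the same way on every operating system, whereas os.path.splitdrive only recognises drives when running on Windows. A minimal sketch of the resulting check, with made-up example paths; it mirrors the logic above but is not copied verbatim from the commit:

from pathlib import Path, PureWindowsPath
from urllib.parse import urlparse

def is_relative_path(path_string: str) -> bool:
    # "C:\\data\\iris.csv" exposes a drive even when parsed on Linux or macOS
    if PureWindowsPath(path_string).drive:
        return False
    # "s3://bucket/iris.csv" has a URL scheme, so it is not a local relative path
    if urlparse(path_string).scheme:
        return False
    # "/var/data/iris.csv" is an absolute local path
    if Path(path_string).is_absolute():
        return False
    return True

assert is_relative_path("data/01_raw/iris.csv")
assert not is_relative_path("C:\\data\\iris.csv")
assert not is_relative_path("s3://bucket/iris.csv")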
4 changes: 2 additions & 2 deletions kedro/io/core.py
@@ -446,7 +446,7 @@ def _load_obj(class_path: str) -> Optional[object]:
return class_obj


def _local_exists(filepath: str) -> bool:
def _local_exists(filepath: str) -> bool: # SKIP_IF_NO_SPARK
filepath = Path(filepath)
return filepath.exists() or any(par.is_file() for par in filepath.parents)

@@ -600,7 +600,7 @@ def exists(self) -> bool:
return self._exists()
except VersionNotFoundError:
return False
except Exception as exc:
except Exception as exc: # SKIP_IF_NO_SPARK
message = "Failed during exists check for data set {}.\n{}".format(
str(self), str(exc)
)
3 changes: 0 additions & 3 deletions kedro/pipeline/modular_pipeline.py
@@ -148,9 +148,6 @@ def _is_transcode_base_in_mapping(name: str) -> bool:

def _map_transcode_base(name: str):
base_name, transcode_suffix = _transcode_split(name)
if not transcode_suffix:
return mapping[base_name]

return TRANSCODING_SEPARATOR.join((mapping[base_name], transcode_suffix))

def _rename(name: str):
2 changes: 1 addition & 1 deletion kedro/runner/parallel_runner.py
@@ -74,7 +74,7 @@ def save(self, data: Any):
# Checks if the error is due to serialisation or not
try:
pickle.dumps(data)
except Exception:
except Exception: # SKIP_IF_NO_SPARK
raise DataSetError(
"{} cannot be serialized. ParallelRunner implicit memory datasets "
"can only be used with serializable data".format(
5 changes: 5 additions & 0 deletions pyproject_no_spark.toml
@@ -0,0 +1,5 @@
[tool.coverage.report]
fail_under = 100
show_missing = true
omit = ["kedro/template*", "kedro/extras/datasets/spark/*", "tests/extras/datasets/spark/*"]
exclude_lines = ["pragma: no cover", "raise NotImplementedError", "SKIP_IF_NO_SPARK"]
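The SKIP_IF_NO_SPARK entry in exclude_lines makes coverage treat any source line containing that literal as excluded, and when the matched line opens a block (a def or an except clause, as in kedro/io/core.py and kedro/runner/parallel_runner.py above) the whole block is dropped from the report. That is how the no-Spark run can still enforce fail_under = 100. A hedged illustration; the function below is hypothetical and not part of Kedro:

def _local_spark_check(filepath: str) -> bool:  # SKIP_IF_NO_SPARK
    # Hypothetical Spark-dependent helper, not part of Kedro. Under
    # pyproject_no_spark.toml the SKIP_IF_NO_SPARK marker on the def line
    # matches an exclude_lines pattern, so coverage drops this whole function
    # from the report and the no-Spark run can still reach fail_under = 100.
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()
    return spark.read.parquet(filepath).count() > 0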
55 changes: 0 additions & 55 deletions tests/conftest.py
@@ -32,61 +32,11 @@
discover them automatically. More info here:
https://docs.pytest.org/en/latest/fixture.html
"""
# pylint: disable=import-error
import gc
import os
import sys
from subprocess import Popen

import mock
import pytest

if sys.version_info < (3, 8):
from pyspark import SparkContext
from pyspark.sql import SparkSession
else:
SparkContext = mock.Mock()
SparkSession = mock.Mock()

the_real_getOrCreate = None


class UseTheSparkSessionFixtureOrMock: # pylint: disable=too-few-public-methods
pass


# prevent using spark without going through the spark_session fixture
@pytest.fixture(scope="session", autouse=True)
def replace_spark_default_getorcreate():
global the_real_getOrCreate # pylint: disable=global-statement
the_real_getOrCreate = SparkSession.builder.getOrCreate
SparkSession.builder.getOrCreate = UseTheSparkSessionFixtureOrMock
return the_real_getOrCreate


# clean up pyspark after the test module finishes
@pytest.fixture(scope="module")
def spark_session():
SparkSession.builder.getOrCreate = the_real_getOrCreate
spark = SparkSession.builder.getOrCreate()
yield spark
spark.stop()
SparkSession.builder.getOrCreate = UseTheSparkSessionFixtureOrMock

# remove the cached JVM vars
SparkContext._jvm = None # pylint: disable=protected-access
SparkContext._gateway = None # pylint: disable=protected-access

# py4j doesn't shutdown properly so kill the actual JVM process
for obj in gc.get_objects():
try:
if isinstance(obj, Popen) and "pyspark" in obj.args[0]:
obj.terminate()
except ReferenceError: # pragma: no cover
# gc.get_objects may return dead weak proxy objects that will raise
# ReferenceError when you isinstance them
pass


@pytest.fixture(autouse=True)
def preserve_system_context():
@@ -100,8 +50,3 @@ def preserve_system_context():

if os.getcwd() != old_cwd:
os.chdir(old_cwd)


skip_if_py38 = pytest.mark.skipif(
sys.version_info >= (3, 8), reason="Dependency not compatible with Python 3.8 yet"
)
83 changes: 83 additions & 0 deletions tests/extras/datasets/spark/conftest.py
@@ -0,0 +1,83 @@
# Copyright 2020 QuantumBlack Visual Analytics Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
# (either separately or in combination, "QuantumBlack Trademarks") are
# trademarks of QuantumBlack. The License does not grant you any right or
# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
# Trademarks or any confusingly similar mark as a trademark for your product,
# or use the QuantumBlack Trademarks in any other manner that might cause
# confusion in the marketplace, including but not limited to in advertising,
# on websites, or on software.
#
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file contains the fixtures that are reusable by any tests within
this directory. You don't need to import the fixtures as pytest will
discover them automatically. More info here:
https://docs.pytest.org/en/latest/fixture.html
"""
import gc
from subprocess import Popen

import pytest

try:
from pyspark import SparkContext
from pyspark.sql import SparkSession
except ImportError: # pragma: no cover
pass # this is only for test discovery to succeed on Python 3.8

the_real_getOrCreate = None


class UseTheSparkSessionFixtureOrMock: # pylint: disable=too-few-public-methods
pass


# prevent using spark without going through the spark_session fixture
@pytest.fixture(scope="session", autouse=True)
def replace_spark_default_getorcreate():
global the_real_getOrCreate # pylint: disable=global-statement
the_real_getOrCreate = SparkSession.builder.getOrCreate
SparkSession.builder.getOrCreate = UseTheSparkSessionFixtureOrMock
return the_real_getOrCreate


# clean up pyspark after the test module finishes
@pytest.fixture(scope="module")
def spark_session(): # SKIP_IF_NO_SPARK
SparkSession.builder.getOrCreate = the_real_getOrCreate
spark = SparkSession.builder.getOrCreate()
yield spark
spark.stop()
SparkSession.builder.getOrCreate = UseTheSparkSessionFixtureOrMock

# remove the cached JVM vars
SparkContext._jvm = None # pylint: disable=protected-access
SparkContext._gateway = None # pylint: disable=protected-access

# py4j doesn't shutdown properly so kill the actual JVM process
for obj in gc.get_objects():
try:
if isinstance(obj, Popen) and "pyspark" in obj.args[0]:
obj.terminate()
except ReferenceError: # pragma: no cover
# gc.get_objects may return dead weak proxy objects that will raise
# ReferenceError when you isinstance them
pass
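Tests under tests/extras/datasets/spark/ now have to request the spark_session fixture explicitly; code that calls SparkSession.builder.getOrCreate() directly gets the UseTheSparkSessionFixtureOrMock sentinel instead of a real session. A minimal sketch of a test using the fixture; the test itself is illustrative and not taken from this commit:

import pytest


@pytest.fixture
def sample_df(spark_session):
    # The module-scoped spark_session fixture restores the real getOrCreate,
    # builds a session, and tears the JVM down once the test module finishes.
    return spark_session.createDataFrame([(1, "alpha"), (2, "beta")], ["id", "name"])


def test_row_count(sample_df):
    assert sample_df.count() == 2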
