[MLflowSkinny] Base changes for MLflow Skinny Client (mlflow#3687)
* Add skinny client

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Add split off cli server tests

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Add chmod to skinny test files

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Resolve setup.py differences

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Move the fixture up a level

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Resolve imports

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Move in sqlalchemy import

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Handle lint issue

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Resolve lint issues

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Apply formatting

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Resolve test failures

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Undo conftest changes

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Move test_csv_generation to test_runs.py, remove duplicate test_mlflow_run tests, uninstall sqlalchemy before skinny client installation in install common deps

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Resolve sqlalchemy uninstall ordering

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Undo odd copy bug to vscode in csv_generation tests

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Add skinny-requirements for skinny tests

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>

* Move docker import into docker util function

Signed-off-by: Eduardo de Leon <eddeleon@microsoft.com>
eddiestudies authored Dec 23, 2020
1 parent 57050a6 commit 154712f
Showing 22 changed files with 288 additions and 161 deletions.
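For orientation before the diffs: the skinny client targets the core tracking-client surface with a smaller dependency footprint. A minimal sketch of the kind of usage it is meant to preserve, assuming a remote tracking server so no local SQL or server dependencies are required (the URI is a hypothetical placeholder):

# Sketch of the client-side workflow the skinny package targets; the
# tracking URI below is a hypothetical placeholder for a remote server.
import mlflow

mlflow.set_tracking_uri("https://tracking.example.com")
with mlflow.start_run():
    mlflow.log_param("learning_rate", 0.01)
    mlflow.log_metric("accuracy", 0.91)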
20 changes: 20 additions & 0 deletions .github/workflows/master.yml
@@ -90,6 +90,26 @@ jobs:
          name: 00check.log
          path: /tmp/00check.log

  # python-skinny tests cover the subset of mlflow functionality that is
  # meant to be supported with a smaller dependency footprint, while also
  # verifying that certain heavier dependencies are omitted.
  python-skinny:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Install dependencies
        env:
          INSTALL_SKINNY_PYTHON_DEPS: true
          MLFLOW_SKINNY: true
        run: |
          source ./dev/install-common-deps.sh
      - name: Run tests
        run: |
          export PATH="$CONDA_DIR/bin:$PATH"
          source activate test-environment
          ./dev/run-python-skinny-tests.sh
  python-small:
    runs-on: ubuntu-latest
    steps:
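tests/test_skinny.py itself is collapsed in this view; a hedged sketch of the kind of dependency-omission check such a suite might contain (module list hypothetical):

# Hypothetical sketch of a dependency-omission check; the actual
# tests/test_skinny.py is not shown in this commit view.
import importlib

import pytest


@pytest.mark.parametrize("heavy_module", ["sqlalchemy", "flask"])
def test_heavy_dependency_is_absent(heavy_module):
    # In a skinny environment, server/store packages should not be importable.
    with pytest.raises(ImportError):
        importlib.import_module(heavy_module)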
13 changes: 10 additions & 3 deletions dev/install-common-deps.sh
@@ -30,13 +30,23 @@ source activate test-environment
python --version
pip install --upgrade pip==19.3.1

if [[ "$MLFLOW_SKINNY" == "true" ]]; then
pip install . --upgrade
else
pip install .[extras] --upgrade
fi
export MLFLOW_HOME=$(pwd)

# Install Python test dependencies only if we're running Python tests
if [[ "$INSTALL_SMALL_PYTHON_DEPS" == "true" ]]; then
# When downloading large packages from PyPI, the connection is sometimes aborted by the
# remote host. See https://github.com/pypa/pip/issues/8510.
# As a workaround, we retry installation of large packages.
retry-with-backoff pip install --quiet -r ./dev/small-requirements.txt
fi
if [[ "$INSTALL_SKINNY_PYTHON_DEPS" == "true" ]]; then
retry-with-backoff pip install --quiet -r ./dev/skinny-requirements.txt
fi
if [[ "$INSTALL_LARGE_PYTHON_DEPS" == "true" ]]; then
retry-with-backoff pip install --quiet -r ./dev/large-requirements.txt
retry-with-backoff pip install --quiet -r ./dev/extra-ml-requirements.txt
@@ -48,9 +58,6 @@ if [[ "$INSTALL_LARGE_PYTHON_DEPS" == "true" ]]; then
  ls -lha $(find $CONDA_DIR/envs/test-environment/ -path "*bin/spark-*")
fi

pip install .[extras]
export MLFLOW_HOME=$(pwd)

# Print current environment info
pip list
which mlflow
26 changes: 26 additions & 0 deletions dev/run-python-skinny-tests.sh
@@ -0,0 +1,26 @@
#!/usr/bin/env bash

# Executes the subset of mlflow tests that is supported with fewer dependencies than the core mlflow package.
# The tests cover most client interactions, plus the compatibility points with mlflow plugins around
# tracking, projects, models, deployments, and the CLI.

# The SQLAlchemy store's dependencies are installed so there is a baseline client/store to test against.
# Any other example client/store with a minimal dependency footprint would also work for this purpose.

set -x
# Set err=1 if any commands exit with non-zero status as described in
# https://stackoverflow.com/a/42219754
err=0
trap 'err=1' ERR
export MLFLOW_SKINNY='true'

pytest --verbose tests/test_skinny.py
python -m pip install sqlalchemy alembic sqlparse
pytest --verbose tests/test_runs.py
pytest --verbose tests/tracking/test_client.py
pytest --verbose tests/tracking/test_tracking.py
pytest --verbose tests/projects/test_projects.py
pytest --verbose tests/deployments/test_cli.py
pytest --verbose tests/deployments/test_deployments.py
pytest --verbose tests/projects/test_projects_cli.py

test $err = 0
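The mid-script `pip install sqlalchemy alembic sqlparse` gives the remaining suites a concrete store to run against; a hedged sketch of what that enables once those packages are present (the sqlite path is a hypothetical example):

# Minimal smoke check of the SQLAlchemy-backed store the script installs
# dependencies for; the sqlite path is a hypothetical example.
import mlflow

mlflow.set_tracking_uri("sqlite:///mlruns.db")
with mlflow.start_run():
    mlflow.log_param("backend", "sqlalchemy")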
7 changes: 7 additions & 0 deletions dev/skinny-requirements.txt
@@ -0,0 +1,7 @@
## Small test reqs
scipy
## Test-only dependencies
pytest==3.2.1
pytest-cov==2.6.0
# Test plugin, used to verify correctness of MLflow plugin APIs
tests/resources/mlflow-test-plugin/
11 changes: 6 additions & 5 deletions mlflow/cli.py
@@ -15,10 +15,7 @@
import mlflow.runs
import mlflow.sagemaker.cli
import mlflow.store.artifact.cli
import mlflow.store.db.utils
from mlflow import tracking
from mlflow.server import _run_server
from mlflow.server.handlers import initialize_backend_stores
from mlflow.store.tracking import DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH
from mlflow.store.artifact.artifact_repository_registry import get_artifact_repository
from mlflow.tracking import _get_store
@@ -261,6 +258,8 @@ def ui(backend_store_uri, default_artifact_root, port, host):
    need to pass ``--host 0.0.0.0`` to listen on all network interfaces (or a specific interface
    address).
    """
    from mlflow.server import _run_server
    from mlflow.server.handlers import initialize_backend_stores

    # Ensure that both backend_store_uri and default_artifact_uri are set correctly.
    if not backend_store_uri:
@@ -364,6 +363,8 @@ def server(
    to pass ``--host 0.0.0.0`` to listen on all network interfaces
    (or a specific interface address).
    """
    from mlflow.server import _run_server
    from mlflow.server.handlers import initialize_backend_stores

    _validate_server_args(gunicorn_opts=gunicorn_opts, workers=workers, waitress_opts=waitress_opts)

@@ -443,8 +444,8 @@ def gc(backend_store_uri, run_ids):
        run = backend_store.get_run(run_id)
        if run.info.lifecycle_stage != LifecycleStage.DELETED:
            raise MlflowException(
                "Run {} is not in `deleted` lifecycle stage. Only runs in "
                "`deleted` lifecycle stage can be deleted.".format(run_id)
                "Run %s is not in `deleted` lifecycle stage. Only runs in "
                "`deleted` lifecycle stage can be deleted." % run_id
            )
        artifact_repo = get_artifact_repository(run.info.artifact_uri)
        artifact_repo.delete_artifacts()
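The cli.py change above is one instance of the pattern this PR applies throughout: imports with heavy transitive dependencies move from module scope into the functions that need them, so importing the package succeeds in a skinny environment and the cost is paid only when the command actually runs. A generic sketch of the pattern (function names hypothetical):

# Generic sketch of the deferred-import pattern used across this PR;
# sqlalchemy stands in for any package shipped only with the full install.
def light_command():
    # No heavy imports needed; works in a skinny install.
    return "ok"


def heavy_command():
    # Deferred import: resolving the dependency is postponed until this
    # command runs, so importing the enclosing module stays lightweight.
    import sqlalchemy  # assumed installed only in the full distribution

    return sqlalchemy.__version__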
4 changes: 2 additions & 2 deletions mlflow/db.py
@@ -1,7 +1,5 @@
import click

import mlflow.store.db.utils


@click.group("db")
def commands():
@@ -24,6 +22,8 @@ def upgrade(url):
    large migrations and includes information about how to estimate their performance and
    recover from failures.
    """
    import mlflow.store.db.utils

    engine = mlflow.store.db.utils.create_sqlalchemy_engine_with_retry(url)
    if mlflow.store.db.utils._is_initialized_before_mlflow_1(engine):
        mlflow.store.db.utils._upgrade_db_initialized_before_mlflow_1(engine)
5 changes: 2 additions & 3 deletions mlflow/deployments/cli.py
@@ -4,8 +4,7 @@
import pandas as pd
from mlflow.utils import cli_args
from mlflow.deployments import interface
from mlflow.pyfunc.scoring_server import _get_jsonable_obj
from mlflow.utils.proto_json_utils import NumpyEncoder
from mlflow.utils.proto_json_utils import NumpyEncoder, _get_jsonable_obj


def _user_args_to_dict(user_list):
@@ -69,7 +68,7 @@ def _user_args_to_dict(user_list):
)

parse_input = click.option(
    "--input-path", "-I", required=True, help="Path to input json file for prediction",
    "--input-path", "-I", required=True, help="Path to input json file for prediction"
)

parse_output = click.option(
3 changes: 2 additions & 1 deletion mlflow/models/cli.py
@@ -4,7 +4,6 @@

from mlflow.models import Model
from mlflow.models.model import MLMODEL_FILE_NAME
from mlflow.models.flavor_backend_registry import get_flavor_backend
from mlflow.store.artifact.models_artifact_repo import ModelsArtifactRepository
from mlflow.tracking.artifact_utils import _download_artifact_from_uri
from mlflow.utils import cli_args
@@ -169,6 +168,8 @@ def build_docker(model_uri, name, install_mlflow):


def _get_flavor_backend(model_uri, **kwargs):
    from mlflow.models.flavor_backend_registry import get_flavor_backend

    with TempDir() as tmp:
        if ModelsArtifactRepository.is_models_uri(model_uri):
            underlying_model_uri = ModelsArtifactRepository.get_underlying_uri(model_uri)
9 changes: 4 additions & 5 deletions mlflow/models/flavor_backend_registry.py
@@ -7,14 +7,13 @@
Not all flavors have a flavor backend.
"""
import mlflow.pyfunc as pyfunc
from mlflow.pyfunc.backend import PyFuncBackend
from mlflow.rfunc.backend import RFuncBackend


_flavor_backends = {pyfunc.FLAVOR_NAME: PyFuncBackend, "crate": RFuncBackend}


def get_flavor_backend(model, build_docker=True, **kwargs):
    from mlflow.pyfunc.backend import PyFuncBackend
    from mlflow.rfunc.backend import RFuncBackend

    _flavor_backends = {pyfunc.FLAVOR_NAME: PyFuncBackend, "crate": RFuncBackend}
    for flavor_name, flavor_config in model.flavors.items():
        if flavor_name in _flavor_backends:
            backend = _flavor_backends[flavor_name](flavor_config, **kwargs)
15 changes: 6 additions & 9 deletions mlflow/projects/__init__.py
@@ -23,17 +23,9 @@
    PROJECT_STORAGE_DIR,
    PROJECT_DOCKER_ARGS,
)
from mlflow.projects.docker import (
    build_docker_image,
    validate_docker_env,
    validate_docker_installation,
)
from mlflow.projects.backend import loader
from mlflow.tracking.fluent import _get_experiment_id
from mlflow.utils.mlflow_tags import (
    MLFLOW_PROJECT_ENV,
    MLFLOW_PROJECT_BACKEND,
)
from mlflow.utils.mlflow_tags import MLFLOW_PROJECT_ENV, MLFLOW_PROJECT_BACKEND
import mlflow.utils.uri

_logger = logging.getLogger(__name__)
@@ -136,6 +128,11 @@ def _run(
        )

    elif backend_name == "kubernetes":
        from mlflow.projects.docker import (
            build_docker_image,
            validate_docker_env,
            validate_docker_installation,
        )
        from mlflow.projects import kubernetes as kb

        tracking.MlflowClient().set_tag(active_run.info.run_id, MLFLOW_PROJECT_ENV, "docker")
15 changes: 9 additions & 6 deletions mlflow/projects/backend/local.py
@@ -7,12 +7,7 @@

import mlflow
from mlflow.exceptions import MlflowException
from mlflow.projects.docker import (
    validate_docker_env,
    validate_docker_installation,
    build_docker_image,
    get_docker_tracking_cmd_and_envs,
)

from mlflow.projects.submitted_run import LocalSubmittedRun
from mlflow.projects.backend.abstract_backend import AbstractBackend
from mlflow.projects.utils import (
@@ -65,6 +60,12 @@ def run(
        # If a docker_env attribute is defined in MLproject then it takes precedence over conda yaml
        # environments, so the project will be executed inside a docker container.
        if project.docker_env:
            from mlflow.projects.docker import (
                validate_docker_env,
                validate_docker_installation,
                build_docker_image,
            )

            tracking.MlflowClient().set_tag(active_run.info.run_id, MLFLOW_PROJECT_ENV, "docker")
            validate_docker_env(project)
            validate_docker_installation()
@@ -199,6 +200,8 @@ def _run_entry_point(command, work_dir, experiment_id, run_id):


def _get_docker_command(image, active_run, docker_args=None, volumes=None, user_env_vars=None):
    from mlflow.projects.docker import get_docker_tracking_cmd_and_envs

    docker_path = "docker"
    cmd = [docker_path, "run", "--rm"]

21 changes: 1 addition & 20 deletions mlflow/pyfunc/scoring_server/__init__.py
@@ -27,7 +27,7 @@
# All of the mlflow dependencies below need to be backwards compatible.
from mlflow.exceptions import MlflowException
from mlflow.types import Schema
from mlflow.utils.proto_json_utils import NumpyEncoder, _dataframe_from_json
from mlflow.utils.proto_json_utils import NumpyEncoder, _dataframe_from_json, _get_jsonable_obj

try:
    from mlflow.pyfunc import load_model, PyFuncModel
@@ -253,22 +253,3 @@ def _predict(model_uri, input_path, output_path, content_type, json_format):
def _serve(model_uri, port, host):
    pyfunc_model = load_model(model_uri)
    init(pyfunc_model).run(port=port, host=host)


def _get_jsonable_obj(data, pandas_orient="records"):
    """Attempt to make the data json-able via standard library.
    Look for some commonly used types that are not jsonable and convert them into json-able ones.
    Unknown data types are returned as is.
    :param data: data to be converted, works with pandas and numpy, rest will be returned as is.
    :param pandas_orient: If `data` is a Pandas DataFrame, it will be converted to a JSON
                          dictionary using this Pandas serialization orientation.
    """
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, pd.DataFrame):
        return data.to_dict(orient=pandas_orient)
    if isinstance(data, pd.Series):
        return pd.DataFrame(data).to_dict(orient=pandas_orient)
    else:  # by default just return whatever this is and hope for the best
        return data
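_get_jsonable_obj now lives in mlflow.utils.proto_json_utils (see the updated imports above); its behavior is unchanged. A short usage sketch based directly on the function body shown here:

# Arrays and frames become plain Python structures; anything else
# passes through unchanged, per the function body above.
import numpy as np
import pandas as pd

from mlflow.utils.proto_json_utils import _get_jsonable_obj

_get_jsonable_obj(np.array([1, 2]))             # -> [1, 2]
_get_jsonable_obj(pd.DataFrame({"a": [1, 2]}))  # -> [{'a': 1}, {'a': 2}]
_get_jsonable_obj("already-jsonable")           # -> 'already-jsonable'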
7 changes: 6 additions & 1 deletion mlflow/server/handlers.py
@@ -8,7 +8,6 @@

from flask import Response, request, send_file
from google.protobuf import descriptor
from querystring_parser import parser

from mlflow.entities import Metric, Param, RunTag, ViewType, ExperimentTag
from mlflow.entities.model_registry import RegisteredModelTag, ModelVersionTag
@@ -155,6 +154,8 @@ def _get_request_json(flask_request=request):


def _get_request_message(request_message, flask_request=request):
    from querystring_parser import parser

    if flask_request.method == "GET" and len(flask_request.query_string) > 0:
        # This is a hack to make arrays of length 1 work with the parser.
        # for example experiment_ids%5B%5D=0 should be parsed to {experiment_ids: [0]}
@@ -239,6 +240,8 @@ def wrapper(*args, **kwargs):

@catch_mlflow_exception
def get_artifact_handler():
    from querystring_parser import parser

    query_string = request.query_string.decode("utf-8")
    request_dict = parser.parse(query_string, normalized=True)
    run_id = request_dict.get("run_id") or request_dict.get("run_uuid")
@@ -695,6 +698,8 @@ def _create_model_version():

@catch_mlflow_exception
def get_model_version_artifact_handler():
    from querystring_parser import parser

    query_string = request.query_string.decode("utf-8")
    request_dict = parser.parse(query_string, normalized=True)
    name = request_dict.get("name")
6 changes: 3 additions & 3 deletions mlflow/store/artifact/databricks_artifact_repo.py
@@ -5,9 +5,6 @@
import requests
import uuid

from azure.core.exceptions import ClientAuthenticationError
from azure.storage.blob import BlobClient

import mlflow.tracking
from mlflow.entities import FileInfo
from mlflow.exceptions import MlflowException
@@ -155,6 +152,9 @@ def _azure_upload_file(self, credentials, local_file, artifact_path):
        Finally, since the prevailing credentials could expire in the time between the last
        stage_block and the commit, a second try-except block refreshes credentials if needed.
        """
        from azure.core.exceptions import ClientAuthenticationError
        from azure.storage.blob import BlobClient

        try:
            headers = self._extract_headers_from_credentials(credentials.headers)
            service = BlobClient.from_blob_url(
3 changes: 2 additions & 1 deletion mlflow/store/tracking/file_store.py
@@ -56,7 +56,6 @@
    local_file_uri_to_path,
    path_to_local_file_uri,
)
from mlflow.utils.search_utils import SearchUtils
from mlflow.utils.string_utils import is_string_type
from mlflow.utils.uri import append_to_uri_path
from mlflow.utils.mlflow_tags import MLFLOW_LOGGED_MODELS
@@ -720,6 +719,8 @@ def _list_run_infos(self, experiment_id, view_type):
    def _search_runs(
        self, experiment_ids, filter_string, run_view_type, max_results, order_by, page_token
    ):
        from mlflow.utils.search_utils import SearchUtils

        if max_results > SEARCH_MAX_RESULTS_THRESHOLD:
            raise MlflowException(
                "Invalid value for request parameter max_results. It must be at "