[ML-16200] Add pip_requirements and additional_pip_requirements to `log_model` (mlflow#4519)

* revert changes on other flavors

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* Rename additional_pip_requirements to extra_pip_requirements

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* fix

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* clean up

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>
harupy authored Jul 15, 2021
1 parent 8ec8603 commit c89caef
Showing 11 changed files with 684 additions and 20 deletions.
2 changes: 2 additions & 0 deletions dev/small-requirements.txt
@@ -1,4 +1,6 @@
## Small test reqs
# Required for testing utilities for parsing pip requirements
pip>=20.1
scipy
# NB: We're specifying a test-only minimum version bound for sqlalchemy in order to reliably
# execute schema consistency checks, the semantics of which were changed in sqlalchemy 1.3.21
8 changes: 8 additions & 0 deletions docs/source/tutorials-and-examples/tutorial.rst
@@ -240,6 +240,14 @@ Now that you have your training code, you can package it so that other data scie
parameters = list(alpha = 0.2)
)
.. _pip-requirements-example:

Specifying pip requirements using ``pip_requirements`` and ``extra_pip_requirements``
-------------------------------------------------------------------------------------

.. literalinclude:: ../../../examples/pip_requirements/pip_requirements.py


Serving the Model
-----------------

72 changes: 72 additions & 0 deletions examples/pip_requirements/pip_requirements.py
@@ -0,0 +1,72 @@
"""
This example demonstrates how to specify pip requirements using `pip_requirements` and
`extra_pip_requirements` when logging a model via `mlflow.*.log_model`.
"""

import tempfile

import sklearn
from sklearn.datasets import load_iris
import xgboost as xgb
import mlflow


def get_pip_requirements(run_id, artifact_path):
client = mlflow.tracking.MlflowClient()
local_path = client.download_artifacts(run_id, f"{artifact_path}/requirements.txt")
with open(local_path) as f:
return f.read().splitlines()


def main():
iris = load_iris()
dtrain = xgb.DMatrix(iris.data, iris.target)
model = xgb.train({}, dtrain)

xgb_req = f"xgboost=={xgb.__version__}"
sklearn_req = f"scikit-learn=={sklearn.__version__}"

with mlflow.start_run() as run:
run_id = run.info.run_id

# Default (both `pip_requirements` and `extra_pip_requirements` are unspecified)
artifact_path = "default"
mlflow.xgboost.log_model(model, artifact_path)
pip_reqs = get_pip_requirements(run_id, artifact_path)
assert pip_reqs == ["mlflow", xgb_req], pip_reqs

# Overwrite the default set of pip requirements using `pip_requirements`
artifact_path = "pip_requirements"
mlflow.xgboost.log_model(model, artifact_path, pip_requirements=[sklearn_req])
pip_reqs = get_pip_requirements(run_id, artifact_path)
assert pip_reqs == ["mlflow", sklearn_req], pip_reqs

# Add extra pip requirements on top of the default set of pip requirements
# using `extra_pip_requirements`
artifact_path = "extra_pip_requirements"
mlflow.xgboost.log_model(model, artifact_path, extra_pip_requirements=[sklearn_req])
pip_reqs = get_pip_requirements(run_id, artifact_path)
assert pip_reqs == ["mlflow", xgb_req, sklearn_req], pip_reqs

# Specify pip requirements using a requirements file
with tempfile.NamedTemporaryFile("w", suffix=".requirements.txt") as f:
f.write(sklearn_req)
f.flush()

# Path to a pip requirements file
artifact_path = "requirements_file_path"
mlflow.xgboost.log_model(model, artifact_path, pip_requirements=f.name)
pip_reqs = get_pip_requirements(run_id, artifact_path)
assert pip_reqs == ["mlflow", sklearn_req], pip_reqs

# List of pip requirement strings
artifact_path = "requirements_file_list"
mlflow.xgboost.log_model(
model, artifact_path, pip_requirements=[xgb_req, f"-r {f.name}"]
)
pip_reqs = get_pip_requirements(run_id, artifact_path)
assert pip_reqs == ["mlflow", xgb_req, sklearn_req], pip_reqs


if __name__ == "__main__":
main()
78 changes: 75 additions & 3 deletions mlflow/utils/environment.py
@@ -1,7 +1,9 @@
import yaml
import tempfile
import os

from mlflow.utils import PYTHON_VERSION
from mlflow.utils.requirements_utils import _parse_requirements

_conda_header = """\
name: mlflow-env
@@ -68,9 +70,9 @@ def _mlflow_additional_pip_env(
return requirements


def _get_additional_pip_dep(conda_env):
def _get_pip_deps(conda_env):
"""
:return: The additional pip dependencies from the conda env
:return: The pip dependencies from the conda env
"""
if conda_env is not None:
for dep in conda_env["dependencies"]:
@@ -80,5 +82,75 @@ def _get_additional_pip_dep(conda_env):


def _log_pip_requirements(conda_env, path, requirements_file="requirements.txt"):
pip_deps = _get_additional_pip_dep(conda_env)
pip_deps = _get_pip_deps(conda_env)
_mlflow_additional_pip_env(pip_deps, path=os.path.join(path, requirements_file))


def _parse_pip_requirements(pip_requirements):
"""
Parses an iterable of pip requirement strings or a pip requirements file.
:param pip_requirements: Either an iterable of pip requirement strings
(e.g. ``["scikit-learn", "-r requirements.txt"]``) or the string path to a pip requirements
file on the local filesystem (e.g. ``"requirements.txt"``). If ``None``, an empty list will
be returned.
:return: A list of pip requirement strings.
"""
if pip_requirements is None:
return []

def _is_string(x):
return isinstance(x, str)

def _is_iterable(x):
try:
iter(x)
return True
except Exception:
return False

if _is_string(pip_requirements):
return list(_parse_requirements(pip_requirements))
elif _is_iterable(pip_requirements) and all(map(_is_string, pip_requirements)):
try:
# Create a temporary requirements file in the current working directory
tmp_req_file = tempfile.NamedTemporaryFile(
mode="w",
prefix="mlflow.",
suffix=".tmp.requirements.txt",
dir=os.getcwd(),
# Setting `delete` to True causes a permission-denied error on Windows
# while trying to read the generated temporary file.
delete=False,
)
tmp_req_file.write("\n".join(pip_requirements))
tmp_req_file.close()
return _parse_pip_requirements(tmp_req_file.name)
finally:
# Clean up the temporary requirements file
os.remove(tmp_req_file.name)
else:
raise TypeError(
"`pip_requirements` must be either a string path to a pip requirements file on the "
"local filesystem or an iterable of pip requirement strings, but got `{}`".format(
type(pip_requirements)
)
)


def _validate_env_arguments(conda_env, pip_requirements, extra_pip_requirements):
"""
Validates that at most one of `conda_env`, `pip_requirements`, and
`extra_pip_requirements` is specified.
"""
args = [
conda_env,
pip_requirements,
extra_pip_requirements,
]
specified = [arg for arg in args if arg is not None]
if len(specified) > 1:
raise ValueError(
"Only one of `conda_env`, `pip_requirements`, and "
"`extra_pip_requirements` can be specified"
)
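
For readers skimming the diff, here is a minimal sketch of how the two new helpers behave. It assumes a checkout that contains this commit (so that `mlflow.utils.environment` exposes `_parse_pip_requirements` and `_validate_env_arguments`); the package names, file paths, and printed results are illustrative only.

import os
import tempfile

from mlflow.utils.environment import _parse_pip_requirements, _validate_env_arguments

# An iterable of requirement strings is written to a temporary requirements file,
# parsed, and returned as a list of requirement strings.
print(_parse_pip_requirements(["scikit-learn", "xgboost==1.4.2"]))
# Expected (per the diff above): ['scikit-learn', 'xgboost==1.4.2']

# A string argument is treated as a path to a requirements file on the local filesystem.
with tempfile.TemporaryDirectory() as tmpdir:
    req_path = os.path.join(tmpdir, "requirements.txt")
    with open(req_path, "w") as f:
        f.write("pandas\n# a comment that should be stripped\n")
    print(_parse_pip_requirements(req_path))
    # Expected: ['pandas']

# Specifying more than one of `conda_env`, `pip_requirements`, and
# `extra_pip_requirements` is rejected.
try:
    _validate_env_arguments(
        conda_env={"dependencies": []},
        pip_requirements=["pandas"],
        extra_pip_requirements=None,
    )
except ValueError as exc:
    print(exc)

Note that the temporary file created for the iterable case is placed in the current working directory and removed in the `finally` block above, so the sketch needs write access to the working directory.
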
86 changes: 86 additions & 0 deletions mlflow/utils/requirements_utils.py
@@ -0,0 +1,86 @@
"""
This module provides a set of utilities for interpreting and creating requirements files
(e.g. pip's `requirements.txt`), which is useful for managing ML software environments.
"""
import os
from itertools import filterfalse


def _is_comment(line):
return line.startswith("#")


def _is_empty(line):
return line == ""


def _strip_inline_comment(line):
return line[: line.find(" #")].rstrip() if " #" in line else line


def _is_requirements_file(line):
return line.startswith("-r ") or line.startswith("--requirement ")


def _join_continued_lines(lines):
"""
Joins lines ending with '\\'.
>>> list(_join_continued_lines(["a\\", "b\\", "c"]))
['abc']
"""
continued_lines = []

for line in lines:
if line.endswith("\\"):
continued_lines.append(line.rstrip("\\"))
else:
continued_lines.append(line)
yield "".join(continued_lines)
continued_lines.clear()

# The last line ends with '\'
if continued_lines:
yield "".join(continued_lines)


# TODO: Add support for constraint files:
# https://github.com/mlflow/mlflow/pull/4519#discussion_r668412179
def _parse_requirements(requirements_file):
"""
A simplified version of `pip._internal.req.parse_requirements` which performs the following
operations on the given requirements file and yields the parsed requirements.
- Remove comments and blank lines
- Join continued lines
- Resolve requirements file references (e.g. '-r requirements.txt')
References:
- `pip._internal.req.parse_requirements`:
https://github.com/pypa/pip/blob/7a77484a492c8f1e1f5ef24eaf71a43df9ea47eb/src/pip/_internal/req/req_file.py#L118
- Requirements File Format:
https://pip.pypa.io/en/stable/cli/pip_install/#requirements-file-format
:param requirements_file: A string path to a requirements file on the local filesystem.
:return: A generator that yields the parsed requirement strings (e.g. ``"scikit-learn==0.24.2"``).
"""
with open(requirements_file) as f:
lines = f.read().splitlines()

lines = map(str.strip, lines)
lines = map(_strip_inline_comment, lines)
lines = _join_continued_lines(lines)
lines = filterfalse(_is_comment, lines)
lines = filterfalse(_is_empty, lines)

for line in lines:
if _is_requirements_file(line):
req_file = line.split(maxsplit=1)[1]
abs_path = (
req_file
if os.path.isabs(req_file)
else os.path.join(os.path.dirname(requirements_file), req_file)
)
yield from _parse_requirements(abs_path)
else:
yield line
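
To make the parsing rules above concrete (inline-comment stripping, line continuation, and `-r` references resolved relative to the referencing file), a small self-contained sketch; the file names and version pins are hypothetical, and the expected output assumes the behavior implemented above.

import os
import tempfile

from mlflow.utils.requirements_utils import _parse_requirements

with tempfile.TemporaryDirectory() as tmpdir:
    # A referenced requirements file with an inline comment.
    base = os.path.join(tmpdir, "base-requirements.txt")
    with open(base, "w") as f:
        f.write("numpy==1.21.0  # pinned for this sketch\n")

    # The main requirements file: a comment, a blank line, a continued line,
    # and a '-r' reference that is resolved relative to this file.
    main = os.path.join(tmpdir, "requirements.txt")
    with open(main, "w") as f:
        f.write("# build-time comment\n")
        f.write("\n")
        f.write("scikit-\\\n")
        f.write("learn\n")
        f.write("-r base-requirements.txt\n")

    print(list(_parse_requirements(main)))
    # Expected (assuming the behavior above): ['scikit-learn', 'numpy==1.21.0']
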
