aai-institute · AdrianoKF · Mar 25, 2025 · Mar 26, 2025 · Mar 26, 2025
diff --git a/deploy/nannyml/Dockerfile b/deploy/nannyml/Dockerfile
@@ -7,21 +7,26 @@ ENV UV_LINK_MODE=copy
 WORKDIR /app
 
 RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=uv.lock,target=uv.lock \
-    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
-    uv sync --frozen --no-install-project --group monitoring
+  --mount=type=bind,source=uv.lock,target=uv.lock \
+  --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+  uv sync --frozen --no-install-project --group monitoring
 
 
 FROM python:3.12-slim
 WORKDIR /app
 COPY --from=builder /app/.venv /app/.venv
 
 RUN apt-get update && \
-    apt-get install -yqq --no-install-recommends libgomp1
+  apt-get install -yqq --no-install-recommends libgomp1 && rm -rf /var/lib/apt/lists/*
 
-ARG NANNYML_ESTIMATOR
 COPY deploy/nannyml/main.py .
+
+ARG NANNYML_ESTIMATOR
 COPY $NANNYML_ESTIMATOR ./nannyml_estimator.pkl
 ENV NANNYML_ESTIMATOR=/app/nannyml_estimator.pkl
 
+ARG NANNYML_DRIFT_CALCULATOR
+COPY $NANNYML_DRIFT_CALCULATOR ./nannyml_drift_calc.pkl
+ENV NANNYML_DRIFT_CALCULATOR=/app/nannyml_drift_calc.pkl
+
 CMD ["/app/.venv/bin/uvicorn", "--reload", "main:app", "--host", "0.0.0.0", "--port", "4040"]
diff --git a/src/income_prediction/assets/__init__.py b/src/income_prediction/assets/__init__.py
@@ -19,6 +19,7 @@
 from .fairness import evaluate_fairness
 from .model import model_container as model_container
 from .monitoring import nannyml_container as nannyml_container
+from .monitoring import nannyml_drift_calculator as nannyml_drift_calculator
 from .monitoring import nannyml_estimator as nannyml_estimator
 from .monitoring import reference_dataset as reference_dataset
 
@@ -108,16 +109,6 @@ def optuna_search_xgb(
             )
             mlflow.log_input(test_ds)
 
-            mlflow.sklearn.log_model(
-                best_model,
-                artifact_path="model",
-                registered_model_name=model_name,
-                code_paths=["src/asec"],
-                input_example=train_data.drop(columns=CensusASECMetadata.TARGET).head(
-                    5
-                ),
-            )
-
             mlflow.evaluate(
                 model=best_model.predict,
                 data=test_ds,
@@ -135,4 +126,13 @@ def optuna_search_xgb(
             fairness_metrics = evaluate_fairness(test_data, y_pred)
             log_fairness_metrics(fairness_metrics)
 
+            mlflow.sklearn.log_model(
+                best_model,
+                artifact_path="model",
+                registered_model_name=model_name,
+                code_paths=["src/asec"],
+                input_example=train_data.drop(columns=CensusASECMetadata.TARGET).head(
+                    5
+                ),
+            )
             return best_model
diff --git a/src/income_prediction/assets/monitoring.py b/src/income_prediction/assets/monitoring.py
@@ -63,31 +63,70 @@ def nannyml_estimator(
     return estimator
 
 
-@dg.asset(kinds={"docker"}, group_name="deployment", deps=["nannyml_estimator"])
+@dg.asset(group_name="deployment")
+def nannyml_drift_calculator(
+    reference_dataset: pd.DataFrame,
+    nanny_ml_config: NannyMLConfig,
+) -> nml.UnivariateDriftCalculator:
+    """Fit a univariate drift calculator on the reference feature columns."""
+    excluded_prefixes = ("prediction", "prob_", "target")  # not monitored for drift
+    monitored_columns = [
+        name
+        for name in reference_dataset.columns
+        if not name.startswith(excluded_prefixes)
+    ]
+    drift_calculator = nml.UnivariateDriftCalculator(
+        column_names=monitored_columns,
+        treat_as_categorical=CensusASECMetadata.CATEGORICAL_FEATURES,
+        continuous_methods=["kolmogorov_smirnov"],
+        categorical_methods=["chi2"],
+        chunk_size=nanny_ml_config.chunk_size,
+    )
+    drift_calculator.fit(reference_dataset)
+    return drift_calculator
+
+
+@dg.asset(
+    kinds={"docker"},
+    group_name="deployment",
+    deps=["nannyml_estimator", "nannyml_drift_calculator"],
+)
 def nannyml_container(
     context: dg.AssetExecutionContext,
     model_version: ModelVersion,
     nannyml_estimator: nml.CBPE,
+    nannyml_drift_calculator: nml.UnivariateDriftCalculator,
 ) -> dg.Output:
     build_context = Path(__file__).parents[3]
     image_tags = [f"nannyml:{suffix}" for suffix in [model_version.version, "latest"]]
     context.log.info(f"{image_tags=}")
 
     # Create tempfile inside the build context, so it can be copied into the image
-    with NamedTemporaryFile(
-        prefix="nannyml-cbpe-", suffix=".pkl", dir=build_context
-    ) as tmp_file:
-        pkl_path = Path(tmp_file.name)
-
-        with open(pkl_path, "wb") as f:
-            pickle.dump(nannyml_estimator, f)
+    with (
+        NamedTemporaryFile(
+            prefix="nannyml-cbpe-", suffix=".pkl", dir=build_context
+        ) as cbpe_file,
+        NamedTemporaryFile(
+            prefix="nannyml-drift-", suffix=".pkl", dir=build_context
+        ) as drift_file,
+    ):
+        pickle.dump(nannyml_estimator, cbpe_file)
+        pickle.dump(nannyml_drift_calculator, drift_file)
+
+        drift_file.flush()
+        cbpe_file.flush()
 
-        context.log.info(f"{pkl_path=}")
-        context.log.info(f"{build_context=}")
         build_result = build_container_image(
             build_context,
             image_tags,
-            build_args={"NANNYML_ESTIMATOR": str(pkl_path.name)},
+            build_args={
+                "NANNYML_ESTIMATOR": str(
+                    Path(cbpe_file.name).relative_to(build_context)
+                ),
+                "NANNYML_DRIFT_CALCULATOR": str(
+                    Path(drift_file.name).relative_to(build_context)
+                ),
+            },
             docker_file=build_context / "deploy" / "nannyml" / "Dockerfile",
         )
 

diff --git a/src/income_prediction/jobs.py b/src/income_prediction/jobs.py
@@ -1,7 +1,12 @@
 import dagster as dg
 
 from .assets.model import model_container
-from .assets.monitoring import nannyml_container, nannyml_estimator, reference_dataset
+from .assets.monitoring import (
+    nannyml_container,
+    nannyml_estimator,
+    reference_dataset,
+    nannyml_drift_calculator,
+)
 
 model_container_job = dg.define_asset_job(
     name="model_container_job",
@@ -12,5 +17,10 @@
 nannyml_container_job = dg.define_asset_job(
     name="nannyml_container_job",
     description="Monitoring service container image build",
-    selection=[reference_dataset, nannyml_estimator, nannyml_container],
+    selection=[
+        reference_dataset,
+        nannyml_estimator,
+        nannyml_drift_calculator,
+        nannyml_container,
+    ],
 )