using whylogs to monitor df

eeeds · Aug 17, 2022 · c808326 · c808326
1 parent 8bcb4eb
commit c808326
Show file tree

Hide file tree

Showing 6 changed files with 81 additions and 18 deletions.
diff --git a/.gitignore b/.gitignore
@@ -9,4 +9,6 @@
 7-Project/__pycache__
 7-Project/.vscode
 7-Project/.pytest_cache
-7-Project/tests/__pycache__
+7-Project/tests/__pycache__
+7-Project/whylog*
+7-Project/keys/
diff --git a/7-Project/README.md b/7-Project/README.md
@@ -34,9 +34,15 @@
   - [Start an agent](#start-an-agent)
   - [Schedule the deployment](#schedule-the-deployment)
 - [Monitoring](#monitoring)
-  - [Install evidently](#install-evidently)
+  - [Evidently](#evidently)
+    - [Install evidently](#install-evidently)
   - [Dashboard for classification report](#dashboard-for-classification-report)
   - [Results](#results)
+  - [Whylogs](#whylogs)
+    - [Install whylogs](#install-whylogs)
+    - [Get your API key](#get-your-api-key)
+    - [First approach:Connect dataset](#first-approachconnect-dataset)
+    - [Results](#results-1)
 - [Tests](#tests)
   - [Configure Tests](#configure-tests)
 - [Linting and Formatting](#linting-and-formatting)
@@ -49,6 +55,8 @@
   - [Add Black to pyproject.toml](#add-black-to-pyprojecttoml)
   - [Apply Isort](#apply-isort)
   - [Add Isort to pyproject.toml](#add-isort-to-pyprojecttoml)
+- [Git pre-commits hooks](#git-pre-commits-hooks)
+  - [Install pre-commit](#install-pre-commit)
 
 # Problem Explanation
 
@@ -163,9 +171,9 @@ Now, when you run a deployment with the `-t tag` option, the agent will pick up
 - `Timezone` is important, so, be sure to select the correct timezone.
 
 # Monitoring
-I'm going to use [Evidently](https://evidentlyai.com/) to monitor the experiment.
-
-## Install evidently
+I'm going to use [Evidently](https://evidentlyai.com/) and [Whylogs](https://github.com/whylabs/whylogs) to monitor the experiment.
+## Evidently
+### Install evidently
 You can install it with the following command:
 ```
 pip install evidently
@@ -180,7 +188,31 @@ This report can be generated for a single model, or as a comparison. You can con
 Using train data and valid data to evaluate the model I've created the following dashboard:
 ![Results](images/evidently-dashboard.PNG)
 You can see the results in the [`dashboard`](dashboards/df_model_performance.html) folder.
+## Whylogs
+### Install whylogs
+```
+pip install "whylogs<1.0" 
+```
+We're installing this version because the platform doesn't yet support v1.
+### Get your API key
+Go to [whylogs.com](https://whylogs.com/) and create an account, then go to your profile and click on the `API` tab.
+### First approach:Connect dataset
+As a first approach, we can connect the dataset to the experiment.
+
+I use the following code to connect the dataset to the experiment:
+```
+import whylogs as why
+from whylogs.app import Session
+from whylogs.app.writers import WhyLabsWriter
 
+writer = WhyLabsWriter("", formats=[])
+session = Session(project="model-1", pipeline="mlops-project-pipeline", writers=[writer])
+
+with session.logger(tags={"datasetId": "model-1"}) as ylog:
+        ylog.log_dataframe(df)
+```
+### Results
+![images](images/whylogs-df.PNG)
 # Tests 
 I'll use Pytest to test the model.
 
@@ -259,4 +291,11 @@ order_by_type = true
 where:
 - `multi_line_output` selects the wrapping mode isort uses for imports that span multiple lines.
 - `length_sort` is a boolean that indicates if you want to sort by length.
-- `order_by_type` is a boolean that indicates if you want to order by type.
+- `order_by_type` is a boolean that indicates if you want to order by type.
+
+# Git pre-commits hooks
+I'm going to install the `pre-commit` library. [More info here](https://pre-commit.com/).
+## Install pre-commit
+```
+pip install pre-commit
+```
diff --git a/7-Project/dashboards/df_model_performance.html b/7-Project/dashboards/df_model_performance.html
diff --git a/7-Project/images/whylogs-df.PNG b/7-Project/images/whylogs-df.PNG
diff --git a/7-Project/model.py b/7-Project/model.py
@@ -1,7 +1,10 @@
 # Import libraries
+import os
+import sys
 import pickle
 import pandas as pd
-
+sys.path.append(r'C:\Users\User\Desktop\Github\MLOps-Camp\7-Project\keys')
+import keys_apis
 
 from sklearn.linear_model import LogisticRegression
 from sklearn.feature_extraction import DictVectorizer
@@ -20,11 +23,26 @@
 
 import mlflow
 
-mlflow.set_tracking_uri("sqlite:///mydb.sqlite")
-EXPERIMENT_NAME = "hr-employee-attrition-project"
-mlflow.set_experiment(EXPERIMENT_NAME)
+import whylogs as why
+from whylogs.app import Session
+from whylogs.app.writers import WhyLabsWriter
+
 
 
+
+mlflow.set_tracking_uri("sqlite:///mydb.sqlite")
+EXPERIMENT_NAME = "hr-employee-attrition-project"
+mlflow.set_experiment(EXPERIMENT_NAME)  
+@task(name = 'Starting Whylogs', retries = 3)
+def starting_whylogs():
+    k = keys_apis.Keys()
+    k.obtain_whylogs_key()
+    os.environ["WHYLABS_API_KEY"] = k.whylog_key
+    os.environ["WHYLABS_DEFAULT_ORG_ID"] = "org-tgNtgy"
+    # Adding the WhyLabs Writer to utilize WhyLabs platform
+    writer = WhyLabsWriter("", formats=[])
+    session = Session(project="model-1", pipeline="mlops-project-pipeline", writers=[writer])
+    return writer,session
 @task(name="Model Performance Dashboard", retries=3)
 def model_performance_dashboard(
     df_train, train_dicts, df_val, val_dicts, numerical_features, categorical_features
@@ -89,7 +107,7 @@ def create_pipeline(train_dicts, y_train):
 
 
 @task(name="Extract_Data", retries=3)
-def extract_data() -> pd.DataFrame:
+def extract_data(writer, session) -> pd.DataFrame:
     """
     Extract data from csv file and return dataframe
     Returns:
@@ -102,6 +120,9 @@ def extract_data() -> pd.DataFrame:
     df["Attrition"] = df["Attrition"].apply(lambda x: 1 if x == "Yes" else 0)
     df["Over18"] = df["Over18"].apply(lambda x: 1 if x == "Yes" else 0)
     df["OverTime"] = df["OverTime"].apply(lambda x: 1 if x == "Yes" else 0)
+
+    with session.logger(tags={"datasetId": "model-1"}) as ylog:
+        ylog.log_dataframe(df)
     return df
 
 
@@ -194,7 +215,8 @@ def applying_model():
     Returns:
         None
     """
-    df = extract_data()
+    writer, session = starting_whylogs()
+    df = extract_data(writer, session)
     (
         X_train,
         y_train,

diff --git a/7-Project/mydb.sqlite b/7-Project/mydb.sqlite