Fix models scripts/notebooks (nv-morpheus#1051)
- Update broken reference in the hammah_inference script.
- Fix ABP notebook scoring by converting cupy/cudf output to numpy.
- Fix AutoEncoder parameters in the Hammah notebook.
- Add periodicity_detection module for hammah inference.


Closes nv-morpheus#1052

Authors:
  - Tad ZeMicheal (https://github.com/tzemicheal)

Approvers:
  - https://github.com/hsin-c
  - https://github.com/raykallen

URL: nv-morpheus#1051
tzemicheal authored Jul 13, 2023
1 parent e50a3a9 commit 7c2db78
Showing 4 changed files with 143 additions and 36 deletions.
@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -18,6 +19,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -32,6 +34,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -54,6 +57,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -85,6 +89,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -142,20 +147,23 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"There are no categorical features in our dataset. `nvidia_smi_log.timestamp` can be used to return the indices."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Check categories"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -185,6 +193,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -202,6 +211,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -219,6 +229,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -235,13 +246,15 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Information on XGBoost parameters can be found [here](https://xgboost.readthedocs.io/en/latest/)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -280,6 +293,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -296,6 +310,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -348,12 +363,13 @@
],
"source": [
"\n",
"y_pred = fil_preds_gpu\n",
"y_true = y_test\n",
"y_pred = fil_preds_gpu.values.get()\n",
"y_true = y_test.to_numpy()\n",
"accuracy_score(y_true, y_pred)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down
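Note on the scoring fix above: sklearn's `accuracy_score` runs on the CPU, while the notebook's FIL predictions and test labels are GPU-resident cudf/cupy objects, so both must be copied to host numpy arrays first. A minimal sketch of that conversion with hypothetical stand-in data (in the notebook, `fil_preds_gpu` comes from the trained XGBoost/FIL model and `y_test` from the test split):

```python
import cudf
from sklearn.metrics import accuracy_score

# Hypothetical stand-ins for the notebook's GPU-resident predictions and labels.
fil_preds_gpu = cudf.Series([0.0, 1.0, 1.0, 0.0])
y_test = cudf.Series([0, 1, 0, 0])

# .values yields a CuPy array; .get() copies it to host memory.
y_pred = fil_preds_gpu.values.get()
# cudf.Series.to_numpy() likewise returns a host numpy array.
y_true = y_test.to_numpy()

print(accuracy_score(y_true, y_pred))  # 0.75, computed entirely on the host
```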
16 changes: 15 additions & 1 deletion models/training-tuning-scripts/dfp-models/hammah-20211017.ipynb
@@ -1,13 +1,15 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Humans and Machines AutoEncoder Training Notebook"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -19,6 +21,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -33,6 +36,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -62,6 +66,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -103,6 +108,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -133,6 +139,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -174,6 +181,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -213,6 +221,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -248,6 +257,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -283,10 +293,11 @@
}
],
"source": [
"model.fit(X_train, epochs=25, val=X_val)"
"model.fit(X_train, epochs=25, val_data=X_val, run_validation=True)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -314,6 +325,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -1003,6 +1015,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -1240,6 +1253,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down
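For context on the `model.fit` change above: the AutoEncoder's fit keyword is `val_data` rather than the older `val`, and validation only runs when `run_validation=True` is also passed. A minimal sketch under stated assumptions (the import path, constructor defaults, and tiny stand-in DataFrames are illustrative; the notebook builds `X_train`/`X_val` from CloudTrail logs and sets its own hyperparameters):

```python
import pandas as pd

# Import path is an assumption; the notebook may import AutoEncoder differently.
from morpheus.models.dfencoder import AutoEncoder

# Tiny stand-in frames; the notebook derives these from CloudTrail features.
X_train = pd.DataFrame({"eventSource": ["s3", "ec2", "s3", "iam"],
                        "eventName": ["Get", "Run", "Put", "List"]})
X_val = pd.DataFrame({"eventSource": ["ec2"], "eventName": ["Stop"]})

model = AutoEncoder()  # the notebook passes its own hyperparameters here

# Validation data goes through `val_data`, and validation must be enabled
# explicitly with `run_validation=True` (the old `val=` keyword is gone).
model.fit(X_train, epochs=25, val_data=X_val, run_validation=True)
```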
@@ -14,45 +14,47 @@
# limitations under the License.
"""
Example Usage:
-python hammah-inference.py \
-    --validationdata ../../datasets/validation-data/hammah-user123-validation-data.csv \
-    --model ../../hammah-models/hammah-user123-20211017-dill.pkl \
-    --output abp-validation-output.csv
+python hammah_inference.py --validationdata \
+    ../../datasets/validation-data/dfp-cloudtrail-user123-validation-data-input.csv \
+    --model ../../training-tuning-scripts/dfp-models/hammah-user123-20211017dill.pkl \
+    --output abp-validation-output.csv
"""

import argparse
import datetime

-import clx.analytics.periodicity_detection as pdd
import cupy as cp
import dill
import pandas as pd
+import periodicity_detection as pdd

import cudf


def infer(validationdata, model, output):

    def zscore(data):
-        mu = cp.mean(data)
+        mean = cp.mean(data)
        std = cp.std(data)
-        return (data - mu) / std
+        return (data - mean) / std

    def date2min(time):
-        start = START
+        start = start_time
        timesince = time - start
        return int(timesince.total_seconds() // 60)

    form = "%Y-%m-%dT%H:%M:%SZ"

-    def stript(s):
-        obj = datetime.datetime.strptime(s, form)
+    def stript(input_str):
+        obj = datetime.datetime.strptime(input_str, form)
        return obj

    def back_to_string(obj):
-        return "{}-{}:{}:{}".format(f"{obj.month:02}", f"{obj.day:02}", f"{obj.hour:02}", f"{obj.minute:02}")
+        # return "{}-{}:{}:{}".format(f"{obj.month:02}", f"{obj.day:02}", f"{obj.hour:02}", f"{obj.minute:02}")
+        return f"{obj.month:02}-{obj.day:02}-{obj.hour:02}-{obj.minute:02}"

-    X_val = pd.read_csv(validationdata)
+    x_validation = pd.read_csv(validationdata)

    col_list = [
        'userIdentityaccountId',
@@ -71,49 +73,50 @@ def back_to_string(obj):
        'userIdentitysessionContextsessionIssueruserName'
    ]

-    for i in list(X_val):
+    for i in list(x_validation):
        if i not in col_list:
-            X_val = X_val.drop(i, axis=1)
+            x_validation = x_validation.drop(i, axis=1)

    with open(model, 'rb') as f:
        model = dill.load(f)

-    scores = model.get_anomaly_score(X_val)[3]
-    X_val['ae_anomaly_score'] = scores
+    scores = model.get_anomaly_score(x_validation)[3]
+    x_validation['ae_anomaly_score'] = scores

-    X_val.sort_values('ae_anomaly_score', ascending=False).head(10)
+    x_validation.sort_values('ae_anomaly_score', ascending=False).head(10)
    # since inference is done, add the original columns back so the output will be the same as the input format
    # X_val['ts_anomaly']=X_val_original['ts_anomaly']
    df = cudf.read_csv(validationdata)
    df = df.sort_values(by=['eventTime'])
-    timearr = df.eventTime.to_array()
-    START = stript(timearr[0])
+    timearr = df.eventTime.to_numpy()
+    start_time = stript(timearr[0])
    timeobj = list(map(stript, timearr))
-    hs = list(map(date2min, timeobj))
-    n, _ = cp.histogram(cp.array(hs), bins=cp.arange(0, max(hs)))
-    signal = cudf.Series(n)
-    a = cp.fromDlpack(signal.to_dlpack())
+    h_s = list(map(date2min, timeobj))
+    num, _ = cp.histogram(cp.array(h_s), bins=cp.arange(0, max(h_s)))
+    signal = cudf.Series(num)
+    # was 'a' before pylint
+    a_vector = cp.fromDlpack(signal.to_dlpack())
    periodogram = pdd.to_periodogram(signal)
    periodogram = periodogram[:int((len(signal) / 2))]
    threshold = float(cp.percentile(cp.array(periodogram), 90))
-    indices = cudf.Series(cp.arange(len(periodogram)))[periodogram < threshold].to_array()
-    rft = cp.fft.rfft(a)
+    indices = cudf.Series(cp.arange(len(periodogram)))[periodogram < threshold].to_numpy()
+    rft = cp.fft.rfft(a_vector)
    rft[indices] = 0
    recon = cp.fft.irfft(rft)
-    err = (abs(recon - a))
-    z = zscore(err)
-    indices = cudf.Series(cp.arange(len(z)))[z >= 8].to_array()
+    err = (abs(recon - a_vector))
+    z_score_value = zscore(err)
+    indices = cudf.Series(cp.arange(len(z_score_value)))[z_score_value >= 8].to_numpy()
    strlist = []
    for mins in indices:
-        from_start = START + datetime.timedelta(minutes=int(mins))
+        from_start = start_time + datetime.timedelta(minutes=int(mins))
        strlist.append(back_to_string(from_start))
    df['ts_anomaly'] = False
    for i in strlist:
        df['ts_anomaly'] = df['eventTime'].str.contains(i)
-    X_val.insert(0, 'eventID', X_val.index)
-    X_val.insert(0, '', X_val.index)
-    X_val['ts_anomaly'] = df['ts_anomaly'].to_pandas()
-    X_val.to_csv(output, index=False)
+    x_validation.insert(0, 'eventID', x_validation.index)
+    x_validation.insert(0, '', x_validation.index)
+    x_validation['ts_anomaly'] = df['ts_anomaly'].to_pandas()
+    x_validation.to_csv(output, index=False)


def main():
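The time-series branch of the script histograms the CloudTrail event times into per-minute counts, computes a periodogram, zeroes out the weakest 90% of frequency components, reconstructs the periodic part of the signal with an inverse FFT, and flags minutes whose reconstruction-error z-score reaches 8. The new `periodicity_detection` module itself is not shown in this diff, so here is a CPU-only NumPy sketch of the same idea, assuming `to_periodogram` amounts to the squared magnitude of the signal's FFT:

```python
import numpy as np

rng = np.random.default_rng(0)
# Synthetic per-minute event counts: a baseline, an hourly cycle, mild noise,
# and one injected anomalous minute at index 700.
counts = 10 + np.sin(np.arange(1440) * 2 * np.pi / 60) + rng.normal(0, 0.1, 1440)
counts[700] += 25

# Periodogram: power of each frequency component of the signal.
fft = np.fft.rfft(counts - counts.mean())
power = np.abs(fft) ** 2

# Keep only the strongest 10% of frequencies (the dominant periods).
fft[power < np.percentile(power, 90)] = 0

# Reconstruct the periodic part and score each minute's deviation from it.
recon = np.fft.irfft(fft, n=len(counts)) + counts.mean()
err = np.abs(recon - counts)
z_score = (err - err.mean()) / err.std()

print(np.nonzero(z_score >= 8)[0])  # should flag the injected minute, [700]
```

The z-score cutoff of 8 mirrors the script's deliberately conservative threshold: only minutes that deviate wildly from the reconstructed periodic baseline are marked as `ts_anomaly`.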