Fix models scripts/notebooks (nv-morpheus#1051)
- Update broken reference in the hammah_inference script.
- Fix ABP notebook scoring by converting cupy/cudf output to numpy.
- Fix AutoEncoder parameters in the Hammah notebook.
- Add periodicity_detection module for hammah inference.


Closes nv-morpheus#1052

Authors:
  - Tad ZeMicheal (https://github.com/tzemicheal)

Approvers:
  - https://github.com/hsin-c
  - https://github.com/raykallen

URL: nv-morpheus#1051
tzemicheal authored Jul 13, 2023
1 parent e50a3a9 commit 7c2db78
Showing 4 changed files with 143 additions and 36 deletions.
@@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -18,6 +19,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -32,6 +34,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -54,6 +57,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -85,6 +89,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -142,20 +147,23 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"There are no categorical features in our dataset. `nvidia_smi_log.timestamp` can be used to return the indices."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Check categories"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -185,6 +193,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -202,6 +211,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -219,6 +229,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -235,13 +246,15 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Information on XGBoost parameters can be found [here](https://xgboost.readthedocs.io/en/latest/)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -280,6 +293,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -296,6 +310,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -348,12 +363,13 @@
],
"source": [
"\n",
"y_pred = fil_preds_gpu\n",
"y_true = y_test\n",
"y_pred = fil_preds_gpu.values.get()\n",
"y_true = y_test.to_numpy()\n",
"accuracy_score(y_true, y_pred)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down
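Note on the scoring fix above: sklearn's `accuracy_score` runs on the CPU, while the notebook's FIL predictions and test labels are GPU-resident cudf/cupy objects, so both must be copied to host numpy arrays first. A minimal sketch of that conversion with hypothetical stand-in data (in the notebook, `fil_preds_gpu` comes from the trained XGBoost/FIL model and `y_test` from the test split):

```python
import cudf
from sklearn.metrics import accuracy_score

# Hypothetical stand-ins for the notebook's GPU-resident predictions and labels.
fil_preds_gpu = cudf.Series([0.0, 1.0, 1.0, 0.0])
y_test = cudf.Series([0, 1, 0, 0])

# .values yields a CuPy array; .get() copies it to host memory.
y_pred = fil_preds_gpu.values.get()
# cudf.Series.to_numpy() likewise returns a host numpy array.
y_true = y_test.to_numpy()

print(accuracy_score(y_true, y_pred))  # 0.75, computed entirely on the host
```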
16 changes: 15 additions & 1 deletion models/training-tuning-scripts/dfp-models/hammah-20211017.ipynb
@@ -1,13 +1,15 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Humans and Machines AutoEncoder Training Notebook"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -19,6 +21,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand All @@ -33,6 +36,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -62,6 +66,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -103,6 +108,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -133,6 +139,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -174,6 +181,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -213,6 +221,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -248,6 +257,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -283,10 +293,11 @@
}
],
"source": [
"model.fit(X_train, epochs=25, val=X_val)"
"model.fit(X_train, epochs=25, val_data=X_val, run_validation=True)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -314,6 +325,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -1003,6 +1015,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down Expand Up @@ -1240,6 +1253,7 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
Expand Down
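For context on the `model.fit` change above: the AutoEncoder's fit keyword is `val_data` rather than the older `val`, and validation only runs when `run_validation=True` is also passed. A minimal sketch under stated assumptions (the import path, constructor defaults, and tiny stand-in DataFrames are illustrative; the notebook builds `X_train`/`X_val` from CloudTrail logs and sets its own hyperparameters):

```python
import pandas as pd

# Import path is an assumption; the notebook may import AutoEncoder differently.
from morpheus.models.dfencoder import AutoEncoder

# Tiny stand-in frames; the notebook derives these from CloudTrail features.
X_train = pd.DataFrame({"eventSource": ["s3", "ec2", "s3", "iam"],
                        "eventName": ["Get", "Run", "Put", "List"]})
X_val = pd.DataFrame({"eventSource": ["ec2"], "eventName": ["Stop"]})

model = AutoEncoder()  # the notebook passes its own hyperparameters here

# Validation data goes through `val_data`, and validation must be enabled
# explicitly with `run_validation=True` (the old `val=` keyword is gone).
model.fit(X_train, epochs=25, val_data=X_val, run_validation=True)
```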
@@ -14,45 +14,47 @@
# limitations under the License.
"""
Example Usage:
-python hammah-inference.py \
-    --validationdata ../../datasets/validation-data/hammah-user123-validation-data.csv \
-    --model ../../hammah-models/hammah-user123-20211017-dill.pkl \
-    --output abp-validation-output.csv
+python hammah_inference.py --validationdata \
+    ../../datasets/validation-data/dfp-cloudtrail-user123-validation-data-input.csv \
+    --model ../../training-tuning-scripts/dfp-models/hammah-user123-20211017dill.pkl \
+    --output abp-validation-output.csv
"""

import argparse
import datetime

-import clx.analytics.periodicity_detection as pdd
import cupy as cp
import dill
import pandas as pd
+import periodicity_detection as pdd

import cudf


def infer(validationdata, model, output):

    def zscore(data):
-        mu = cp.mean(data)
+        mean = cp.mean(data)
        std = cp.std(data)
-        return (data - mu) / std
+        return (data - mean) / std

    def date2min(time):
-        start = START
+        start = start_time
        timesince = time - start
        return int(timesince.total_seconds() // 60)

    form = "%Y-%m-%dT%H:%M:%SZ"

-    def stript(s):
-        obj = datetime.datetime.strptime(s, form)
+    def stript(input_str):
+        obj = datetime.datetime.strptime(input_str, form)
        return obj

    def back_to_string(obj):
-        return "{}-{}:{}:{}".format(f"{obj.month:02}", f"{obj.day:02}", f"{obj.hour:02}", f"{obj.minute:02}")
+        # return "{}-{}:{}:{}".format(f"{obj.month:02}", f"{obj.day:02}", f"{obj.hour:02}", f"{obj.minute:02}")
+        return f"{obj.month:02}-{obj.day:02}-{obj.hour:02}-{obj.minute:02}"

-    X_val = pd.read_csv(validationdata)
+    x_validation = pd.read_csv(validationdata)

    col_list = [
        'userIdentityaccountId',
@@ -71,49 +73,50 @@ def back_to_string(obj):
        'userIdentitysessionContextsessionIssueruserName'
    ]

-    for i in list(X_val):
+    for i in list(x_validation):
        if i not in col_list:
-            X_val = X_val.drop(i, axis=1)
+            x_validation = x_validation.drop(i, axis=1)

    with open(model, 'rb') as f:
        model = dill.load(f)

-    scores = model.get_anomaly_score(X_val)[3]
-    X_val['ae_anomaly_score'] = scores
+    scores = model.get_anomaly_score(x_validation)[3]
+    x_validation['ae_anomaly_score'] = scores

-    X_val.sort_values('ae_anomaly_score', ascending=False).head(10)
+    x_validation.sort_values('ae_anomaly_score', ascending=False).head(10)
    # since inference is done, add the original columns back so the output will be the same as the input format
    # X_val['ts_anomaly']=X_val_original['ts_anomaly']
    df = cudf.read_csv(validationdata)
    df = df.sort_values(by=['eventTime'])
-    timearr = df.eventTime.to_array()
-    START = stript(timearr[0])
+    timearr = df.eventTime.to_numpy()
+    start_time = stript(timearr[0])
    timeobj = list(map(stript, timearr))
-    hs = list(map(date2min, timeobj))
-    n, _ = cp.histogram(cp.array(hs), bins=cp.arange(0, max(hs)))
-    signal = cudf.Series(n)
-    a = cp.fromDlpack(signal.to_dlpack())
+    h_s = list(map(date2min, timeobj))
+    num, _ = cp.histogram(cp.array(h_s), bins=cp.arange(0, max(h_s)))
+    signal = cudf.Series(num)
+    # was 'a' before pylint
+    a_vector = cp.fromDlpack(signal.to_dlpack())
    periodogram = pdd.to_periodogram(signal)
    periodogram = periodogram[:int((len(signal) / 2))]
    threshold = float(cp.percentile(cp.array(periodogram), 90))
-    indices = cudf.Series(cp.arange(len(periodogram)))[periodogram < threshold].to_array()
-    rft = cp.fft.rfft(a)
+    indices = cudf.Series(cp.arange(len(periodogram)))[periodogram < threshold].to_numpy()
+    rft = cp.fft.rfft(a_vector)
    rft[indices] = 0
    recon = cp.fft.irfft(rft)
-    err = (abs(recon - a))
-    z = zscore(err)
-    indices = cudf.Series(cp.arange(len(z)))[z >= 8].to_array()
+    err = (abs(recon - a_vector))
+    z_score_value = zscore(err)
+    indices = cudf.Series(cp.arange(len(z_score_value)))[z_score_value >= 8].to_numpy()
    strlist = []
    for mins in indices:
-        from_start = START + datetime.timedelta(minutes=int(mins))
+        from_start = start_time + datetime.timedelta(minutes=int(mins))
        strlist.append(back_to_string(from_start))
    df['ts_anomaly'] = False
    for i in strlist:
        df['ts_anomaly'] = df['eventTime'].str.contains(i)
-    X_val.insert(0, 'eventID', X_val.index)
-    X_val.insert(0, '', X_val.index)
-    X_val['ts_anomaly'] = df['ts_anomaly'].to_pandas()
-    X_val.to_csv(output, index=False)
+    x_validation.insert(0, 'eventID', x_validation.index)
+    x_validation.insert(0, '', x_validation.index)
+    x_validation['ts_anomaly'] = df['ts_anomaly'].to_pandas()
+    x_validation.to_csv(output, index=False)


def main():
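The time-series branch of the script histograms the CloudTrail event times into per-minute counts, computes a periodogram, zeroes out the weakest 90% of frequency components, reconstructs the periodic part of the signal with an inverse FFT, and flags minutes whose reconstruction-error z-score reaches 8. The new `periodicity_detection` module itself is not shown in this diff, so here is a CPU-only NumPy sketch of the same idea, assuming `to_periodogram` amounts to the squared magnitude of the signal's FFT:

```python
import numpy as np

rng = np.random.default_rng(0)
# Synthetic per-minute event counts: a baseline, an hourly cycle, mild noise,
# and one injected anomalous minute at index 700.
counts = 10 + np.sin(np.arange(1440) * 2 * np.pi / 60) + rng.normal(0, 0.1, 1440)
counts[700] += 25

# Periodogram: power of each frequency component of the signal.
fft = np.fft.rfft(counts - counts.mean())
power = np.abs(fft) ** 2

# Keep only the strongest 10% of frequencies (the dominant periods).
fft[power < np.percentile(power, 90)] = 0

# Reconstruct the periodic part and score each minute's deviation from it.
recon = np.fft.irfft(fft, n=len(counts)) + counts.mean()
err = np.abs(recon - counts)
z_score = (err - err.mean()) / err.std()

print(np.nonzero(z_score >= 8)[0])  # should flag the injected minute, [700]
```

The z-score cutoff of 8 mirrors the script's deliberately conservative threshold: only minutes that deviate wildly from the reconstructed periodic baseline are marked as `ts_anomaly`.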