Skip to content

Commit

Permalink
Merge branch 'branch-23.07' of github.com:nv-morpheus/Morpheus into d…
Browse files Browse the repository at this point in the history
…avid-rest-source-sink
  • Loading branch information
dagardner-nv committed Jun 8, 2023
2 parents 7c518a4 + 59ade8c commit 98c0625
Show file tree
Hide file tree
Showing 37 changed files with 455 additions and 832 deletions.
6 changes: 4 additions & 2 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -284,8 +284,10 @@ RUN --mount=type=cache,id=apt,target=/var/cache/apt \
rm -rf /var/lib/apt/lists/*

# Install camouflage needed for unittests to mock a triton server
RUN source activate morpheus &&\
npm install -g camouflage-server
# Pin to v0.9 until #967 is resolved
RUN source activate morpheus && \
npm install -g camouflage-server@0.9 && \
npm cache clean --force

# Setup git to allow other users to access /workspace. Requires git 2.35.3 or
# greater. See https://marc.info/?l=git&m=164989570902912&w=2. Only enable for
Expand Down
2 changes: 1 addition & 1 deletion docker/conda/environments/cuda11.8_dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ dependencies:
- configargparse=1.5
- cuda-compiler=11.8
- cuda-nvml-dev=11.8
- cudatoolkit=11.8
- cuda-toolkit=11.8
- cudf=23.02
- cupy=11.6.0
- cxx-compiler
Expand Down
11 changes: 10 additions & 1 deletion docs/source/developer_guide/contributing.md
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,16 @@ git submodule update --init --recursive
pip install -e ${MORPHEUS_ROOT}
```
Once Morpheus has been built, it can be installed into the current virtual environment.
1. Test the build (Note: some tests will be skipped)
1. Test the build (Note: some tests will be skipped)\
Some of the tests rely on external data sets, which must be fetched before running the tests.
```bash
MORPHEUS_ROOT=${PWD}
git lfs install
git lfs update
./scripts/fetch_data.py fetch all
```
This script fetches the required data sets. Once it has finished, run:
```bash
pytest
```
Expand Down
3 changes: 0 additions & 3 deletions models/phishing-models/phishing-bert-20230421.onnx

This file was deleted.

3 changes: 0 additions & 3 deletions models/phishing-models/phishing-bert-20230421.pt

This file was deleted.

3 changes: 3 additions & 0 deletions models/phishing-models/phishing-bert-20230517.onnx
Git LFS file not shown
3 changes: 3 additions & 0 deletions models/phishing-models/phishing-bert-20230517.pt
Git LFS file not shown
3 changes: 0 additions & 3 deletions models/root-cause-models/root-cause-binary-bert-20221118.onnx

This file was deleted.

3 changes: 3 additions & 0 deletions models/root-cause-models/root-cause-binary-bert-20230517.onnx
Git LFS file not shown
16 changes: 16 additions & 0 deletions models/training-tuning-scripts/common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Common tools and utilities for model training and tuning.
"""
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -17,10 +17,8 @@
log = logging.getLogger(__name__)


class DataLoader(object):
"""
Wrapper class is used to return dataframe partitions based on batchsize.
"""
class DataLoader():
"""Wrapper class is used to return dataframe partitions based on batchsize."""

def __init__(self, dataset, batchsize=1000):
"""Constructor to create dataframe partitions.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.


class Dataset(object):
class Dataset():

def __init__(self, df):
self._df = df.reset_index(drop=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,39 +14,43 @@
# limitations under the License.

import logging
import os
from abc import ABC
from abc import abstractmethod

import cudf
from cudf.core.subword_tokenizer import SubwordTokenizer
import cupy

import torch
from dataloader import DataLoader
from dataset import Dataset
from torch.optim import AdamW
from torch.utils.dlpack import to_dlpack
from tqdm import trange
from transformers import AutoModelForSequenceClassification

import cudf
from cudf.core.subword_tokenizer import SubwordTokenizer
from .dataloader import DataLoader
from .dataset import Dataset

log = logging.getLogger(__name__)


class SequenceClassifier(ABC):
class SequenceClassifier:
"""
Sequence Classifier using BERT. This class provides methods for training/loading BERT models, evaluation and
prediction.
"""

def __init__(self):
self._device = None
self._model = None
def __init__(self, model_or_path: str, hash_file: str, do_lower: bool = True, num_labels: int = 2):

self._optimizer = None
self._hashpath = self._get_hash_table_path()

@abstractmethod
def predict(self, input_data, max_seq_len=128, batch_size=32, threshold=0.5):
pass
self._model = AutoModelForSequenceClassification.from_pretrained(model_or_path, num_labels=num_labels)

if torch.cuda.is_available():
self._device = torch.device("cuda")
self._model = self._model.cuda()
# self._model = nn.DataParallel(self._model)
else:
self._device = torch.device("cpu")

self._tokenizer = SubwordTokenizer(hash_file, do_lower_case=do_lower)

def train_model(
self,
Expand All @@ -72,8 +76,8 @@ def train_model(
:type max_seq_len: int
:param batch_size: batch size
:type batch_size: int
:param epoch: epoch, default is 5
:type epoch: int
:param epochs: epoch, default is 5
:type epochs: int
Examples
--------
Expand All @@ -92,15 +96,14 @@ def train_model(

self._config_optimizer(learning_rate)
self._model.train() # Enable training mode
self._tokenizer = SubwordTokenizer(self._hashpath, do_lower_case=True)

for _ in trange(epochs, desc="Epoch"):
tr_loss = 0 # Tracking variables
nb_tr_examples, nb_tr_steps = 0, 0
for df in train_dataloader.get_chunks():
b_input_ids, b_input_mask = self._bert_uncased_tokenize(df["text"], max_seq_len)

b_labels = torch.tensor(df["label"].to_numpy())
b_labels = torch.tensor(df["label"].to_numpy()).cuda()
self._optimizer.zero_grad() # Clear out the gradients
loss = self._model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,
labels=b_labels)[0] # forwardpass
Expand All @@ -111,7 +114,7 @@ def train_model(
nb_tr_examples += b_input_ids.size(0)
nb_tr_steps += 1

print("Train loss: {}".format(tr_loss / nb_tr_steps))
print(f"Train loss: {tr_loss / nb_tr_steps}")

def evaluate_model(self, test_data, labels, max_seq_len=128, batch_size=32):
"""
Expand All @@ -121,9 +124,9 @@ def evaluate_model(self, test_data, labels, max_seq_len=128, batch_size=32):
:type test_data: cudf.Series
:param labels: labels for each element in test_data
:type labels: cudf.Series
:param max_seq_len: Limits the length of the sequence returned by tokenizer. If tokenized sentence is shorter
than max_seq_len, output will be padded with 0s. If the tokenized sentence is longer than max_seq_len it
will be truncated to max_seq_len.
:param max_seq_len: Limits the length of the sequence returned by tokenizer. If tokenized sentence
is shorter than max_seq_len, output will be padded with 0s. If the tokenized sentence is
longer than max_seq_len it will be truncated to max_seq_len.
:type max_seq_len: int
:param batch_size: batch size
:type batch_size: int
Expand Down Expand Up @@ -165,6 +168,52 @@ def evaluate_model(self, test_data, labels, max_seq_len=128, batch_size=32):

return float(accuracy)

def predict(self, input_data, max_seq_len=128, batch_size=32):
    """
    Predict the class of each input text with the trained model.

    The input is tokenized and run through the model in chunks of ``batch_size`` rows. The predicted
    label of each row is the index of its largest logit (argmax over the class axis); no probability
    threshold is applied.

    :param input_data: input text data for prediction
    :type input_data: cudf.Series
    :param max_seq_len: Limits the length of the sequence returned by tokenizer. If tokenized sentence
        is shorter than max_seq_len, output will be padded with 0s. If the tokenized sentence is
        longer than max_seq_len it will be truncated to max_seq_len.
    :type max_seq_len: int
    :param batch_size: batch size
    :type batch_size: int
    :return: predictions: predicted label index for each element of input_data. The index of the
        returned series restarts at 0 for every batch, exactly as produced per batch.
    :rtype: cudf.Series

    Examples
    --------
    >>> from cuml.preprocessing.model_selection import train_test_split
    >>> emails_train, emails_test, labels_train, labels_test = train_test_split(
    ...     train_emails_df, 'label', train_size=0.8)
    >>> sc.train_model(emails_train, labels_train)
    >>> predictions = sc.predict(emails_test)
    """

    predict_gdf = cudf.DataFrame()
    predict_gdf["text"] = input_data

    predict_dataset = Dataset(predict_gdf)
    predict_dataloader = DataLoader(predict_dataset, batchsize=batch_size)

    # Collect per-batch results and concatenate once at the end; appending to a Series inside the
    # loop would re-copy everything accumulated so far on every iteration.
    batch_preds = []

    self._model.eval()  # Disable dropout etc. for inference
    for df in predict_dataloader.get_chunks():
        b_input_ids, b_input_mask = self._bert_uncased_tokenize(df["text"], max_seq_len)
        with torch.no_grad():
            logits = self._model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]

        # Cast to float64 directly on the target device. ``Tensor.type(torch.DoubleTensor)`` would
        # first copy the logits to host memory, only for them to be moved straight back.
        logits = logits.to(self._device, dtype=torch.float64)
        logits = cupy.fromDlpack(to_dlpack(logits))
        b_preds = cupy.argmax(logits, axis=1).flatten()
        batch_preds.append(cudf.Series(b_preds))

    if not batch_preds:
        # No input rows: return an empty series rather than calling concat on an empty list.
        return cudf.Series()

    return cudf.concat(batch_preds)

def save_model(self, save_to_path="."):
"""
Save trained model
Expand All @@ -181,7 +230,7 @@ def save_model(self, save_to_path="."):
>>> sc.save_model()
"""

self._model.module.save_pretrained(save_to_path)
self._model.save_pretrained(save_to_path)

def save_checkpoint(self, file_path):
"""
Expand Down Expand Up @@ -216,10 +265,6 @@ def load_checkpoint(self, file_path):
model_dict = torch.load(file_path)
self._model.module.load_state_dict(model_dict["state_dict"])

def _get_hash_table_path(self):
hash_table_path = "%s/resources/bert-base-uncased-hash.txt" % os.path.dirname(os.path.realpath(__file__))
return hash_table_path

def _config_optimizer(self, learning_rate):
param_optimizer = list(self._model.named_parameters())
no_decay = ["bias", "gamma", "beta"]
Expand Down
Loading

0 comments on commit 98c0625

Please sign in to comment.