Merge branch 'branch-23.07' of github.com:nv-morpheus/Morpheus into david-rest-source-sink
nv-morpheus · Jun 8, 2023 · 98c0625 · 98c0625
2 parents 7c518a4 + 59ade8c
commit 98c0625
Show file tree

Hide file tree

Showing 37 changed files with 455 additions and 832 deletions.
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -284,8 +284,10 @@ RUN --mount=type=cache,id=apt,target=/var/cache/apt \
     rm -rf /var/lib/apt/lists/*
 
 # Install camouflage needed for unittests to mock a triton server
-RUN source activate morpheus &&\
-    npm install -g camouflage-server
+# Pin to v0.9 until #967 is resolved
+RUN source activate morpheus && \
+    npm install -g camouflage-server@0.9 && \
+    npm cache clean --force
 
 # Setup git to allow other users to access /workspace. Requires git 2.35.3 or
 # greater. See https://marc.info/?l=git&m=164989570902912&w=2. Only enable for

diff --git a/docker/conda/environments/cuda11.8_dev.yml b/docker/conda/environments/cuda11.8_dev.yml
@@ -33,7 +33,7 @@ dependencies:
     - configargparse=1.5
     - cuda-compiler=11.8
     - cuda-nvml-dev=11.8
-    - cudatoolkit=11.8
+    - cuda-toolkit=11.8
     - cudf=23.02
     - cupy=11.6.0
     - cxx-compiler

diff --git a/docs/source/developer_guide/contributing.md b/docs/source/developer_guide/contributing.md
@@ -220,7 +220,16 @@ git submodule update --init --recursive
    pip install -e ${MORPHEUS_ROOT}
    ```
    Once Morpheus has been built, it can be installed into the current virtual environment.
-1. Test the build (Note: some tests will be skipped)
+1. Test the build (Note: some tests will be skipped)\
+   Some of the tests will rely on external data sets.
+   ```bash
+   MORPHEUS_ROOT=${PWD}
+
+   git lfs install
+   git lfs update
+   ./scripts/fetch_data.py fetch all
+   ```
+   This script will fetch the data sets needed. Then run:
    ```bash
    pytest
    ```

diff --git a/models/phishing-models/phishing-bert-20230421.onnx b/models/phishing-models/phishing-bert-20230421.onnx
diff --git a/models/phishing-models/phishing-bert-20230421.pt b/models/phishing-models/phishing-bert-20230421.pt
diff --git a/models/phishing-models/phishing-bert-20230517.onnx b/models/phishing-models/phishing-bert-20230517.onnx
diff --git a/models/phishing-models/phishing-bert-20230517.pt b/models/phishing-models/phishing-bert-20230517.pt
diff --git a/models/root-cause-models/root-cause-binary-bert-20221118.onnx b/models/root-cause-models/root-cause-binary-bert-20221118.onnx
diff --git a/models/root-cause-models/root-cause-binary-bert-20230517.onnx b/models/root-cause-models/root-cause-binary-bert-20230517.onnx
diff --git a/models/training-tuning-scripts/common/__init__.py b/models/training-tuning-scripts/common/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Common tools and utilities for model training and tuning.
+"""
diff --git a/...ing-scripts/phishing-models/dataloader.py → ...ining-tuning-scripts/common/dataloader.py b/...ing-scripts/phishing-models/dataloader.py → ...ining-tuning-scripts/common/dataloader.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,10 +17,8 @@
 log = logging.getLogger(__name__)
 
 
-class DataLoader(object):
-    """
-    Wrapper class is used to return dataframe partitions based on batchsize.
-    """
+class DataLoader():
+    """Wrapper class used to return dataframe partitions based on batchsize."""
 
     def __init__(self, dataset, batchsize=1000):
         """Constructor to create dataframe partitions.

diff --git a/...tuning-scripts/phishing-models/dataset.py → ...training-tuning-scripts/common/dataset.py b/...tuning-scripts/phishing-models/dataset.py → ...training-tuning-scripts/common/dataset.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 
-class Dataset(object):
+class Dataset():
 
     def __init__(self, df):
         self._df = df.reset_index(drop=True)

diff --git a/...ts/phishing-models/sequence_classifier.py → ...ing-scripts/common/sequence_classifier.py b/...ts/phishing-models/sequence_classifier.py → ...ing-scripts/common/sequence_classifier.py
@@ -14,39 +14,43 @@
 # limitations under the License.
 
 import logging
-import os
-from abc import ABC
-from abc import abstractmethod
 
+import cudf
+from cudf.core.subword_tokenizer import SubwordTokenizer
 import cupy
+
 import torch
-from dataloader import DataLoader
-from dataset import Dataset
 from torch.optim import AdamW
 from torch.utils.dlpack import to_dlpack
 from tqdm import trange
+from transformers import AutoModelForSequenceClassification
 
-import cudf
-from cudf.core.subword_tokenizer import SubwordTokenizer
+from .dataloader import DataLoader
+from .dataset import Dataset
 
 log = logging.getLogger(__name__)
 
 
-class SequenceClassifier(ABC):
+class SequenceClassifier:
     """
     Sequence Classifier using BERT. This class provides methods for training/loading BERT models, evaluation and
     prediction.
     """
 
-    def __init__(self):
-        self._device = None
-        self._model = None
+    def __init__(self, model_or_path: str, hash_file: str, do_lower: bool = True, num_labels: int = 2):
+
         self._optimizer = None
-        self._hashpath = self._get_hash_table_path()
 
-    @abstractmethod
-    def predict(self, input_data, max_seq_len=128, batch_size=32, threshold=0.5):
-        pass
+        self._model = AutoModelForSequenceClassification.from_pretrained(model_or_path, num_labels=num_labels)
+
+        if torch.cuda.is_available():
+            self._device = torch.device("cuda")
+            self._model = self._model.cuda()
+            # self._model = nn.DataParallel(self._model)
+        else:
+            self._device = torch.device("cpu")
+
+        self._tokenizer = SubwordTokenizer(hash_file, do_lower_case=do_lower)
 
     def train_model(
         self,
@@ -72,8 +76,8 @@ def train_model(
         :type max_seq_len: int
         :param batch_size: batch size
         :type batch_size: int
-        :param epoch: epoch, default is 5
-        :type epoch: int
+        :param epochs: number of training epochs, default is 5
+        :type epochs: int
 
         Examples
         --------
@@ -92,15 +96,14 @@ def train_model(
 
         self._config_optimizer(learning_rate)
         self._model.train()  # Enable training mode
-        self._tokenizer = SubwordTokenizer(self._hashpath, do_lower_case=True)
 
         for _ in trange(epochs, desc="Epoch"):
             tr_loss = 0  # Tracking variables
             nb_tr_examples, nb_tr_steps = 0, 0
             for df in train_dataloader.get_chunks():
                 b_input_ids, b_input_mask = self._bert_uncased_tokenize(df["text"], max_seq_len)
 
-                b_labels = torch.tensor(df["label"].to_numpy())
+                b_labels = torch.tensor(df["label"].to_numpy()).cuda()
                 self._optimizer.zero_grad()  # Clear out the gradients
                 loss = self._model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,
                                    labels=b_labels)[0]  # forwardpass
@@ -111,7 +114,7 @@ def train_model(
                 nb_tr_examples += b_input_ids.size(0)
                 nb_tr_steps += 1
 
-            print("Train loss: {}".format(tr_loss / nb_tr_steps))
+            print(f"Train loss: {tr_loss / nb_tr_steps}")
 
     def evaluate_model(self, test_data, labels, max_seq_len=128, batch_size=32):
         """
@@ -121,9 +124,9 @@ def evaluate_model(self, test_data, labels, max_seq_len=128, batch_size=32):
         :type test_data: cudf.Series
         :param labels: labels for each element in test_data
         :type labels: cudf.Series
-        :param max_seq_len: Limits the length of the sequence returned by tokenizer. If tokenized sentence is shorter
-            than max_seq_len, output will be padded with 0s. If the tokenized sentence is longer than max_seq_len it
-            will be truncated to max_seq_len.
+        :param max_seq_len: Limits the length of the sequence returned by tokenizer. If tokenized sentence
+            is shorter than max_seq_len, output will be padded with 0s. If the tokenized sentence is
+            longer than max_seq_len it will be truncated to max_seq_len.
         :type max_seq_len: int
         :param batch_size: batch size
         :type batch_size: int
@@ -165,6 +168,52 @@ def evaluate_model(self, test_data, labels, max_seq_len=128, batch_size=32):
 
         return float(accuracy)
 
+    def predict(self, input_data, max_seq_len=128, batch_size=32):
+        """
+        Predict the class of each input text with the trained model
+
+        :param input_data: input text data for prediction
+        :type input_data: cudf.Series
+        :param max_seq_len: Limits the length of the sequence returned by tokenizer. If tokenized sentence
+            is shorter than max_seq_len, output will be padded with 0s. If the tokenized sentence is
+            longer than max_seq_len it will be truncated to max_seq_len.
+        :type max_seq_len: int
+        :param batch_size: batch size
+        :type batch_size: int
+        :return: predictions: predicted class label for each input (argmax over the model's output logits)
+        :rtype: cudf.Series
+
+        Examples
+        --------
+        >>> from cuml.preprocessing.model_selection import train_test_split
+        >>> emails_train, emails_test, labels_train, labels_test = train_test_split(
+        ...     train_emails_df, 'label', train_size=0.8)
+        >>> sc.train_model(emails_train, labels_train)
+        >>> predictions = sc.predict(emails_test)
+        """
+
+        predict_gdf = cudf.DataFrame()
+        predict_gdf["text"] = input_data
+        # Wrap the input so it can be consumed in chunks of batch_size rows
+        predict_dataset = Dataset(predict_gdf)
+        predict_dataloader = DataLoader(predict_dataset, batchsize=batch_size)
+
+        preds = cudf.Series()
+        # Switch the model to evaluation mode before running inference
+        self._model.eval()
+        for df in predict_dataloader.get_chunks():
+            b_input_ids, b_input_mask = self._bert_uncased_tokenize(df["text"], max_seq_len)
+            with torch.no_grad():
+                logits = self._model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
+            # Cast on the tensor's own device: torch.DoubleTensor is a CPU type and would force a host round trip
+            logits = logits.to(dtype=torch.float64)
+            logits = cupy.fromDlpack(to_dlpack(logits))  # hand the tensor to CuPy via DLPack
+            b_preds = cupy.argmax(logits, axis=1).flatten()
+            b_preds = cudf.Series(b_preds)
+            preds = preds.append(b_preds)
+
+        return preds
+
     def save_model(self, save_to_path="."):
         """
         Save trained model
@@ -181,7 +230,7 @@ def save_model(self, save_to_path="."):
         >>> sc.save_model()
         """
 
-        self._model.module.save_pretrained(save_to_path)
+        self._model.save_pretrained(save_to_path)
 
     def save_checkpoint(self, file_path):
         """
@@ -216,10 +265,6 @@ def load_checkpoint(self, file_path):
         model_dict = torch.load(file_path)
         self._model.module.load_state_dict(model_dict["state_dict"])
 
-    def _get_hash_table_path(self):
-        hash_table_path = "%s/resources/bert-base-uncased-hash.txt" % os.path.dirname(os.path.realpath(__file__))
-        return hash_table_path
-
     def _config_optimizer(self, learning_rate):
         param_optimizer = list(self._model.named_parameters())
         no_decay = ["bias", "gamma", "beta"]