Phishing model and data updates #462
Merged · 10 commits · Nov 18, 2022

Commit d981a91b0007177ed57b39c130ebf3bfd07e9a4a: updates to work with cudf 22.08
efajardo-nv committed Nov 17, 2022
@@ -38,17 +38,13 @@ class BinarySequenceClassifier(SequenceClassifier):
     def init_model(self, model_or_path):
         """
         Load model from huggingface or locally saved model.
-
         :param model_or_path: huggingface pretrained model name or directory path to model
         :type model_or_path: str
-
         Examples
         --------
         >>> from clx.analytics.binary_sequence_classifier import BinarySequenceClassifier
         >>> sc = BinarySequenceClassifier()
-
         >>> sc.init_model("bert-base-uncased") # huggingface pre-trained model
-
         >>> sc.init_model(model_path) # locally saved model
         """
         self._model = AutoModelForSequenceClassification.from_pretrained(model_or_path)
@@ -65,7 +61,6 @@ def init_model(self, model_or_path):
     def predict(self, input_data, max_seq_len=128, batch_size=32, threshold=0.5):
         """
         Predict the class with the trained model
-
         :param input_data: input text data for prediction
         :type input_data: cudf.Series
         :param max_seq_len: Limits the length of the sequence returned by tokenizer. If tokenized sentence is shorter
@@ -78,7 +73,6 @@ def predict(self, input_data, max_seq_len=128, batch_size=32, threshold=0.5):
         :type threshold: float
         :return: predictions, probabilities: predictions are labels (0 or 1) based on minimum threshold
         :rtype: cudf.Series, cudf.Series
-
         Examples
         --------
         >>> from cuml.preprocessing.model_selection import train_test_split
@@ -95,20 +89,23 @@
         predict_dataset = Dataset(predict_gdf)
         predict_dataloader = DataLoader(predict_dataset, batchsize=batch_size)

-        preds = cudf.Series()
-        probs = cudf.Series()
+        preds_l = []
+        probs_l = []

         self._model.eval()
         for df in predict_dataloader.get_chunks():
             b_input_ids, b_input_mask = self._bert_uncased_tokenize(df["text"], max_seq_len)
             with torch.no_grad():
                 logits = self._model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
                 b_probs = torch.sigmoid(logits[:, 1])
-                b_preds = b_probs.ge(threshold)
+                b_preds = b_probs.ge(threshold).type(torch.int8)

             b_probs = cudf.io.from_dlpack(to_dlpack(b_probs))
-            b_preds = cudf.io.from_dlpack(to_dlpack(b_preds))
-            preds = preds.append(b_preds)
-            probs = probs.append(b_probs)
+            b_preds = cudf.io.from_dlpack(to_dlpack(b_preds)).astype("boolean")
+            preds_l.append(b_preds)
+            probs_l.append(b_probs)
+
+        preds = cudf.concat(preds_l)
+        probs = cudf.concat(probs_l)

         return preds, probs
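Two cudf-22.08-era changes drive the rewrite above: the repeated cudf.Series.append calls (deprecated upstream) are replaced by collecting per-batch results in a Python list and calling cudf.concat once, and the predictions are cast to int8 before the DLPack hand-off (DLPack does not carry boolean tensors), then back to cudf's nullable "boolean" dtype. A minimal sketch of the same round trip, with made-up batch values and a GPU assumed:

    # Sketch only: stand-ins for the per-batch loop in predict() above.
    import cudf
    import torch
    from torch.utils.dlpack import to_dlpack

    preds_l = []
    probs_l = []
    for _ in range(3):  # stand-in for predict_dataloader.get_chunks()
        b_probs = torch.sigmoid(torch.randn(4, device="cuda"))  # fake per-batch scores
        b_preds = b_probs.ge(0.5).type(torch.int8)  # bool -> int8, DLPack-safe
        probs_l.append(cudf.io.from_dlpack(to_dlpack(b_probs)))
        preds_l.append(cudf.io.from_dlpack(to_dlpack(b_preds)).astype("boolean"))

    # One concat at the end instead of repeated (deprecated) Series.append.
    preds = cudf.concat(preds_l)
    probs = cudf.concat(probs_l)
    print(len(preds), preds.dtype)  # 12 boolean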
@@ -16,13 +16,15 @@
 Example Usage:
         python phish-bert-training-script.py
 """
-import cudf
-import binary_sequence_classifier
-import requests
 import os.path
 import zipfile
-from sklearn.model_selection import train_test_split

+import binary_sequence_classifier
+import requests
+from sklearn.metrics import f1_score
+from sklearn.model_selection import train_test_split
+
+import cudf


 def preprocessing():
@@ -42,8 +44,7 @@ def preprocessing():
     # convert label to binary 0 = ham, 1 = spam
     df["label"] = df["spam/ham"].str.match('spam').astype(int)
     # split into 80% training, 20% testing datasets
-    X_train, X_test, y_train, y_test = train_test_split(df["message"], df["label"], train_size=0.8,
-                                                        random_state=42)
+    X_train, X_test, y_train, y_test = train_test_split(df["message"], df["label"], train_size=0.8, random_state=42)

     return (X_train, y_train, X_test, y_test)
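The label binarization in this hunk uses cudf's GPU string matching; a tiny self-contained sketch of the same idea (the three-row frame is hypothetical):

    # Hypothetical mini-frame mirroring the ham/spam labeling step above.
    import cudf

    df = cudf.DataFrame({"spam/ham": ["ham", "spam", "ham"]})
    df["label"] = df["spam/ham"].str.match("spam").astype(int)  # 0 = ham, 1 = spam
    print(df["label"].to_pandas().tolist())  # [0, 1, 0]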

@@ -66,8 +67,8 @@ def main():
     print("Model Evaluation")
     print("Accuracy:")
     print(seq_classifier.evaluate_model(X_test, y_test))
-    test_preds = seq_classifier.predict(X_test, batch_size=128)[0].to_array()
-    true_labels = y_test.to_array()
+    test_preds = seq_classifier.predict(X_test, batch_size=128)[0].to_numpy()
+    true_labels = y_test.to_numpy()
     print("F1 Score:")
     print(f1_score(true_labels, test_preds))
@@ -44,7 +44,16 @@
     "cell_type": "code",
     "execution_count": 1,
     "metadata": {},
-    "outputs": [],
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "/opt/conda/envs/morpheus/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+       "  from .autonotebook import tqdm as notebook_tqdm\n"
+      ]
+     }
+    ],
     "source": [
      "import cudf\n",
      "from sklearn.model_selection import train_test_split\n",
@@ -79,7 +88,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 20,
+    "execution_count": 2,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -91,7 +100,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 21,
+    "execution_count": 3,
     "metadata": {},
     "outputs": [
      {
@@ -184,14 +193,14 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 13,
+    "execution_count": 9,
     "metadata": {},
     "outputs": [
      {
       "name": "stderr",
       "output_type": "stream",
       "text": [
-       "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']\n",
+       "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']\n",
       "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
       "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
       "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
@@ -213,7 +222,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 14,
+    "execution_count": 10,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -226,35 +235,35 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 15,
+    "execution_count": 11,
     "metadata": {},
     "outputs": [
      {
       "name": "stderr",
       "output_type": "stream",
       "text": [
-       "Epoch: 50%|█████     | 1/2 [02:25<02:25, 145.90s/it]"
+       "Epoch: 50%|█████     | 1/2 [00:35<00:35, 35.78s/it]"
       ]
      },
      {
       "name": "stdout",
       "output_type": "stream",
       "text": [
-       "Train loss: 0.08483679607217465\n"
+       "Train loss: 0.09204745624946165\n"
       ]
      },
      {
       "name": "stderr",
       "output_type": "stream",
       "text": [
-       "Epoch: 100%|██████████| 2/2 [04:53<00:00, 146.98s/it]"
+       "Epoch: 100%|██████████| 2/2 [01:11<00:00, 35.92s/it]"
       ]
      },
      {
       "name": "stdout",
       "output_type": "stream",
       "text": [
-       "Train loss: 0.0388483095395192\n"
+       "Train loss: 0.01900260798949083\n"
       ]
      },
      {
@@ -271,7 +280,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 16,
+    "execution_count": 12,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -295,16 +304,16 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 17,
+    "execution_count": 13,
     "metadata": {},
     "outputs": [
      {
       "data": {
        "text/plain": [
-        "0.9875"
+        "0.99375"
        ]
       },
-      "execution_count": 17,
+      "execution_count": 13,
       "metadata": {},
       "output_type": "execute_result"
      }
@@ -315,7 +324,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 18,
+    "execution_count": 14,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -331,23 +340,23 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 19,
+    "execution_count": 15,
     "metadata": {},
     "outputs": [
      {
       "data": {
        "text/plain": [
-        "0.9530201342281879"
+        "0.9729729729729729"
        ]
       },
-      "execution_count": 19,
+      "execution_count": 15,
       "metadata": {},
       "output_type": "execute_result"
      }
     ],
     "source": [
-     "tests = test_preds[0].to_array()\n",
-     "true_labels = y_test.to_array()\n",
+     "tests = test_preds[0].to_numpy()\n",
+     "true_labels = y_test.to_numpy()\n",
      "f1_score(true_labels, tests)"
     ]
    },
@@ -396,7 +405,7 @@
   "toc_visible": true
  },
  "kernelspec": {
-  "display_name": "Python 3",
+  "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -410,7 +419,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.8.2"
+  "version": "3.8.13"
  },
  "vscode": {
   "interpreter": {
@@ -1,9 +1,8 @@
-cudf==0+untagged.1.g788bddd
-cupy==9.1.0
-numpy==1.23.4
-requests==2.25.1
+cudf==22.8.1
+cupy==10.6.0
+numpy==1.22.4
 pandas==1.3.5
 scikit_learn==1.1.3
 scipy==1.6.0
-torch==1.7.1
-tqdm==4.61.1
-transformers==4.6.1
+torch==1.12.0+cu113
+tqdm==4.64.1
+transformers==4.24.0
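A quick, hedged sanity check that an environment actually picked up these pins (note the +cu113 torch build is published on the PyTorch cu113 wheel index rather than PyPI):

    # Expect the pinned versions above to print back.
    import cudf
    import cupy
    import torch
    import transformers

    print(cudf.__version__, cupy.__version__, torch.__version__, transformers.__version__)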
@@ -22,9 +22,9 @@
 import torch
 from dataloader import DataLoader
 from dataset import Dataset
+from torch.optim import AdamW
 from torch.utils.dlpack import to_dlpack
 from tqdm import trange
-from transformers import AdamW

 import cudf
 from cudf.core.subword_tokenizer import SubwordTokenizer
@@ -100,10 +100,10 @@ def train_model(
         for df in train_dataloader.get_chunks():
             b_input_ids, b_input_mask = self._bert_uncased_tokenize(df["text"], max_seq_len)

-            b_labels = torch.tensor(df["label"].to_array())
+            b_labels = torch.tensor(df["label"].to_numpy())
             self._optimizer.zero_grad()  # Clear out the gradients
-            # forwardpass
-            loss = self._model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)[0]
+            loss = self._model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,
+                               labels=b_labels)[0]  # forwardpass

             loss.sum().backward()
             self._optimizer.step()  # update parameters
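The import swap at the top of this file (transformers.AdamW → torch.optim.AdamW) pairs with this loop: Hugging Face deprecated its AdamW implementation, and the torch optimizer is a near drop-in for the arguments used here. A standalone sketch of the same train step, with a small linear layer standing in for the BERT model:

    # Sketch of one optimizer step with torch.optim.AdamW (stand-in model).
    import torch
    from torch.optim import AdamW

    model = torch.nn.Linear(8, 2)                 # stand-in for BertForSequenceClassification
    optimizer = AdamW(model.parameters(), lr=2e-5)

    logits = model(torch.randn(4, 8))             # fake batch of 4 examples
    loss = torch.nn.functional.cross_entropy(logits, torch.tensor([0, 1, 0, 1]))
    optimizer.zero_grad()                         # clear out the gradients
    loss.backward()                               # backward pass
    optimizer.step()                              # update parameters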
@@ -148,7 +148,7 @@ def evaluate_model(self, test_data, labels, max_seq_len=128, batch_size=32):
         nb_eval_steps = 0
         for df in test_dataloader.get_chunks():
             b_input_ids, b_input_mask = self._bert_uncased_tokenize(df["text"], max_seq_len)
-            b_labels = torch.tensor(df["label"].to_array())
+            b_labels = torch.tensor(df["label"].to_numpy())
             with torch.no_grad():
                 logits = self._model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
@@ -178,7 +178,6 @@ def save_model(self, save_to_path="."):
         >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df,
                                                                                     'label',
                                                                                     train_size=0.8)
-
         >>> sc.train_model(emails_train, labels_train)
         >>> sc.save_model()
         """
@@ -23,12 +23,14 @@
 import argparse
 import json

 import numpy as np
 import onnxruntime
 import torch
-from cudf.core.subword_tokenizer import SubwordTokenizer
 from scipy.special import expit

+import cudf
+from cudf.core.subword_tokenizer import SubwordTokenizer
+

 def infer(validationdata, vocab, model, output):
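This script tokenizes on the GPU with cudf's SubwordTokenizer instead of a Hugging Face tokenizer. A hedged sketch of that API; the "vocab-hash.txt" path is hypothetical (such a file is produced from a BERT vocabulary with cudf's hash-vocab utility):

    # Sketch: GPU subword tokenization with cudf (hypothetical hash file).
    import cudf
    from cudf.core.subword_tokenizer import SubwordTokenizer

    tokenizer = SubwordTokenizer("vocab-hash.txt", do_lower_case=True)
    output = tokenizer(cudf.Series(["click this link to verify your account"]),
                       max_length=128,
                       max_num_rows=1,
                       padding="max_length",
                       return_tensors="pt",
                       truncation=True)
    input_ids, attention_mask = output["input_ids"], output["attention_mask"]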
@@ -1,5 +1,5 @@
-cudf==0+untagged.1.g788bddd
-numpy==1.23.4
+cudf==22.8.1
+numpy==1.22.4
 onnxruntime==1.13.1
-scipy==1.6.0
-torch==1.7.1
+scipy==1.9.1
+torch==1.12.0+cu113