Phishing model and data updates #462
Merged · 10 commits · Nov 18, 2022

Commit d981a91b0007177ed57b39c130ebf3bfd07e9a4a: updates to work with cudf 22.08
efajardo-nv committed Nov 17, 2022
@@ -38,17 +38,13 @@ class BinarySequenceClassifier(SequenceClassifier):
     def init_model(self, model_or_path):
         """
         Load model from huggingface or locally saved model.
-
         :param model_or_path: huggingface pretrained model name or directory path to model
         :type model_or_path: str
-
         Examples
         --------
         >>> from clx.analytics.binary_sequence_classifier import BinarySequenceClassifier
         >>> sc = BinarySequenceClassifier()
-
         >>> sc.init_model("bert-base-uncased") # huggingface pre-trained model
-
         >>> sc.init_model(model_path) # locally saved model
         """
         self._model = AutoModelForSequenceClassification.from_pretrained(model_or_path)
@@ -65,7 +61,6 @@ def init_model(self, model_or_path):
     def predict(self, input_data, max_seq_len=128, batch_size=32, threshold=0.5):
         """
         Predict the class with the trained model
-
         :param input_data: input text data for prediction
         :type input_data: cudf.Series
         :param max_seq_len: Limits the length of the sequence returned by tokenizer. If tokenized sentence is shorter
@@ -78,7 +73,6 @@ def predict(self, input_data, max_seq_len=128, batch_size=32, threshold=0.5):
         :type threshold: float
         :return: predictions, probabilities: predictions are labels (0 or 1) based on minimum threshold
         :rtype: cudf.Series, cudf.Series
-
         Examples
         --------
         >>> from cuml.preprocessing.model_selection import train_test_split
@@ -95,20 +89,23 @@
         predict_dataset = Dataset(predict_gdf)
         predict_dataloader = DataLoader(predict_dataset, batchsize=batch_size)

-        preds = cudf.Series()
-        probs = cudf.Series()
+        preds_l = []
+        probs_l = []

         self._model.eval()
         for df in predict_dataloader.get_chunks():
             b_input_ids, b_input_mask = self._bert_uncased_tokenize(df["text"], max_seq_len)
             with torch.no_grad():
                 logits = self._model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
                 b_probs = torch.sigmoid(logits[:, 1])
-                b_preds = b_probs.ge(threshold)
+                b_preds = b_probs.ge(threshold).type(torch.int8)

             b_probs = cudf.io.from_dlpack(to_dlpack(b_probs))
-            b_preds = cudf.io.from_dlpack(to_dlpack(b_preds))
-            preds = preds.append(b_preds)
-            probs = probs.append(b_probs)
+            b_preds = cudf.io.from_dlpack(to_dlpack(b_preds)).astype("boolean")
+            preds_l.append(b_preds)
+            probs_l.append(b_probs)
+
+        preds = cudf.concat(preds_l)
+        probs = cudf.concat(probs_l)

         return preds, probs
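Two cudf-22.08-era changes drive the rewrite above: the repeated cudf.Series.append calls (deprecated upstream) are replaced by collecting per-batch results in a Python list and calling cudf.concat once, and the predictions are cast to int8 before the DLPack hand-off (DLPack does not carry boolean tensors), then back to cudf's nullable "boolean" dtype. A minimal sketch of the same round trip, with made-up batch values and a GPU assumed:

    # Sketch only: stand-ins for the per-batch loop in predict() above.
    import cudf
    import torch
    from torch.utils.dlpack import to_dlpack

    preds_l = []
    probs_l = []
    for _ in range(3):  # stand-in for predict_dataloader.get_chunks()
        b_probs = torch.sigmoid(torch.randn(4, device="cuda"))  # fake per-batch scores
        b_preds = b_probs.ge(0.5).type(torch.int8)  # bool -> int8, DLPack-safe
        probs_l.append(cudf.io.from_dlpack(to_dlpack(b_probs)))
        preds_l.append(cudf.io.from_dlpack(to_dlpack(b_preds)).astype("boolean"))

    # One concat at the end instead of repeated (deprecated) Series.append.
    preds = cudf.concat(preds_l)
    probs = cudf.concat(probs_l)
    print(len(preds), preds.dtype)  # 12 boolean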
@@ -16,13 +16,15 @@
 Example Usage:
         python phish-bert-training-script.py
 """
-import cudf
-import binary_sequence_classifier
-import requests
 import os.path
 import zipfile
-from sklearn.model_selection import train_test_split

+import binary_sequence_classifier
+import requests
+from sklearn.metrics import f1_score
+from sklearn.model_selection import train_test_split
+
+import cudf


 def preprocessing():
@@ -42,8 +44,7 @@ def preprocessing():
     # convert label to binary 0 = ham, 1 = spam
     df["label"] = df["spam/ham"].str.match('spam').astype(int)
     # split into 80% training, 20% testing datasets
-    X_train, X_test, y_train, y_test = train_test_split(df["message"], df["label"], train_size=0.8,
-                                                        random_state=42)
+    X_train, X_test, y_train, y_test = train_test_split(df["message"], df["label"], train_size=0.8, random_state=42)

     return (X_train, y_train, X_test, y_test)
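The label binarization in this hunk uses cudf's GPU string matching; a tiny self-contained sketch of the same idea (the three-row frame is hypothetical):

    # Hypothetical mini-frame mirroring the ham/spam labeling step above.
    import cudf

    df = cudf.DataFrame({"spam/ham": ["ham", "spam", "ham"]})
    df["label"] = df["spam/ham"].str.match("spam").astype(int)  # 0 = ham, 1 = spam
    print(df["label"].to_pandas().tolist())  # [0, 1, 0]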

@@ -66,8 +67,8 @@ def main():
     print("Model Evaluation")
     print("Accuracy:")
     print(seq_classifier.evaluate_model(X_test, y_test))
-    test_preds = seq_classifier.predict(X_test, batch_size=128)[0].to_array()
-    true_labels = y_test.to_array()
+    test_preds = seq_classifier.predict(X_test, batch_size=128)[0].to_numpy()
+    true_labels = y_test.to_numpy()
     print("F1 Score:")
     print(f1_score(true_labels, test_preds))
@@ -44,7 +44,16 @@
     "cell_type": "code",
     "execution_count": 1,
     "metadata": {},
-    "outputs": [],
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "/opt/conda/envs/morpheus/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+       "  from .autonotebook import tqdm as notebook_tqdm\n"
+      ]
+     }
+    ],
     "source": [
      "import cudf\n",
      "from sklearn.model_selection import train_test_split\n",
@@ -79,7 +88,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 20,
+    "execution_count": 2,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -91,7 +100,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 21,
+    "execution_count": 3,
     "metadata": {},
     "outputs": [
      {
@@ -184,14 +193,14 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 13,
+    "execution_count": 9,
     "metadata": {},
     "outputs": [
      {
       "name": "stderr",
       "output_type": "stream",
       "text": [
-       "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']\n",
+       "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']\n",
       "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
       "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
       "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
@@ -213,7 +222,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 14,
+    "execution_count": 10,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -226,35 +235,35 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 15,
+    "execution_count": 11,
     "metadata": {},
     "outputs": [
      {
       "name": "stderr",
       "output_type": "stream",
       "text": [
-       "Epoch: 50%|█████     | 1/2 [02:25<02:25, 145.90s/it]"
+       "Epoch: 50%|█████     | 1/2 [00:35<00:35, 35.78s/it]"
       ]
      },
      {
       "name": "stdout",
       "output_type": "stream",
       "text": [
-       "Train loss: 0.08483679607217465\n"
+       "Train loss: 0.09204745624946165\n"
       ]
      },
      {
       "name": "stderr",
       "output_type": "stream",
       "text": [
-       "Epoch: 100%|██████████| 2/2 [04:53<00:00, 146.98s/it]"
+       "Epoch: 100%|██████████| 2/2 [01:11<00:00, 35.92s/it]"
       ]
      },
      {
       "name": "stdout",
       "output_type": "stream",
       "text": [
-       "Train loss: 0.0388483095395192\n"
+       "Train loss: 0.01900260798949083\n"
       ]
      },
      {
@@ -271,7 +280,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 16,
+    "execution_count": 12,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -295,16 +304,16 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 17,
+    "execution_count": 13,
     "metadata": {},
     "outputs": [
      {
       "data": {
        "text/plain": [
-        "0.9875"
+        "0.99375"
        ]
       },
-      "execution_count": 17,
+      "execution_count": 13,
       "metadata": {},
       "output_type": "execute_result"
      }
@@ -315,7 +324,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 18,
+    "execution_count": 14,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -331,23 +340,23 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 19,
+    "execution_count": 15,
     "metadata": {},
     "outputs": [
      {
       "data": {
        "text/plain": [
-        "0.9530201342281879"
+        "0.9729729729729729"
        ]
       },
-      "execution_count": 19,
+      "execution_count": 15,
       "metadata": {},
       "output_type": "execute_result"
      }
     ],
     "source": [
-     "tests = test_preds[0].to_array()\n",
-     "true_labels = y_test.to_array()\n",
+     "tests = test_preds[0].to_numpy()\n",
+     "true_labels = y_test.to_numpy()\n",
      "f1_score(true_labels, tests)"
     ]
    },
@@ -396,7 +405,7 @@
   "toc_visible": true
  },
  "kernelspec": {
-  "display_name": "Python 3",
+  "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -410,7 +419,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.8.2"
+  "version": "3.8.13"
  },
  "vscode": {
   "interpreter": {
@@ -1,9 +1,8 @@
-cudf==0+untagged.1.g788bddd
-cupy==9.1.0
-numpy==1.23.4
-requests==2.25.1
+cudf==22.8.1
+cupy==10.6.0
+numpy==1.22.4
 pandas==1.3.5
 scikit_learn==1.1.3
 scipy==1.6.0
-torch==1.7.1
-tqdm==4.61.1
-transformers==4.6.1
+torch==1.12.0+cu113
+tqdm==4.64.1
+transformers==4.24.0
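A quick, hedged sanity check that an environment actually picked up these pins (note the +cu113 torch build is published on the PyTorch cu113 wheel index rather than PyPI):

    # Expect the pinned versions above to print back.
    import cudf
    import cupy
    import torch
    import transformers

    print(cudf.__version__, cupy.__version__, torch.__version__, transformers.__version__)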
@@ -22,9 +22,9 @@
 import torch
 from dataloader import DataLoader
 from dataset import Dataset
+from torch.optim import AdamW
 from torch.utils.dlpack import to_dlpack
 from tqdm import trange
-from transformers import AdamW

 import cudf
 from cudf.core.subword_tokenizer import SubwordTokenizer
@@ -100,10 +100,10 @@ def train_model(
         for df in train_dataloader.get_chunks():
             b_input_ids, b_input_mask = self._bert_uncased_tokenize(df["text"], max_seq_len)

-            b_labels = torch.tensor(df["label"].to_array())
+            b_labels = torch.tensor(df["label"].to_numpy())
             self._optimizer.zero_grad()  # Clear out the gradients
-            # forwardpass
-            loss = self._model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)[0]
+            loss = self._model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask,
+                               labels=b_labels)[0]  # forwardpass

             loss.sum().backward()
             self._optimizer.step()  # update parameters
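The import swap at the top of this file (transformers.AdamW → torch.optim.AdamW) pairs with this loop: Hugging Face deprecated its AdamW implementation, and the torch optimizer is a near drop-in for the arguments used here. A standalone sketch of the same train step, with a small linear layer standing in for the BERT model:

    # Sketch of one optimizer step with torch.optim.AdamW (stand-in model).
    import torch
    from torch.optim import AdamW

    model = torch.nn.Linear(8, 2)                 # stand-in for BertForSequenceClassification
    optimizer = AdamW(model.parameters(), lr=2e-5)

    logits = model(torch.randn(4, 8))             # fake batch of 4 examples
    loss = torch.nn.functional.cross_entropy(logits, torch.tensor([0, 1, 0, 1]))
    optimizer.zero_grad()                         # clear out the gradients
    loss.backward()                               # backward pass
    optimizer.step()                              # update parameters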
@@ -148,7 +148,7 @@ def evaluate_model(self, test_data, labels, max_seq_len=128, batch_size=32):
         nb_eval_steps = 0
         for df in test_dataloader.get_chunks():
             b_input_ids, b_input_mask = self._bert_uncased_tokenize(df["text"], max_seq_len)
-            b_labels = torch.tensor(df["label"].to_array())
+            b_labels = torch.tensor(df["label"].to_numpy())
             with torch.no_grad():
                 logits = self._model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
@@ -178,7 +178,6 @@ def save_model(self, save_to_path="."):
         >>> emails_train, emails_test, labels_train, labels_test = train_test_split(train_emails_df,
                                                                                     'label',
                                                                                     train_size=0.8)
-
         >>> sc.train_model(emails_train, labels_train)
         >>> sc.save_model()
         """
@@ -23,12 +23,14 @@
 import argparse
 import json

 import numpy as np
 import onnxruntime
 import torch
-from cudf.core.subword_tokenizer import SubwordTokenizer
 from scipy.special import expit

+import cudf
+from cudf.core.subword_tokenizer import SubwordTokenizer
+

 def infer(validationdata, vocab, model, output):
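This script tokenizes on the GPU with cudf's SubwordTokenizer instead of a Hugging Face tokenizer. A hedged sketch of that API; the "vocab-hash.txt" path is hypothetical (such a file is produced from a BERT vocabulary with cudf's hash-vocab utility):

    # Sketch: GPU subword tokenization with cudf (hypothetical hash file).
    import cudf
    from cudf.core.subword_tokenizer import SubwordTokenizer

    tokenizer = SubwordTokenizer("vocab-hash.txt", do_lower_case=True)
    output = tokenizer(cudf.Series(["click this link to verify your account"]),
                       max_length=128,
                       max_num_rows=1,
                       padding="max_length",
                       return_tensors="pt",
                       truncation=True)
    input_ids, attention_mask = output["input_ids"], output["attention_mask"]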
@@ -1,5 +1,5 @@
-cudf==0+untagged.1.g788bddd
-numpy==1.23.4
+cudf==22.8.1
+numpy==1.22.4
 onnxruntime==1.13.1
-scipy==1.6.0
-torch==1.7.1
+scipy==1.9.1
+torch==1.12.0+cu113