From 05d67474dd964a9425e1c50f1ce89993bf233dd5 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Wed, 17 Jan 2024 15:39:29 -0500 Subject: [PATCH] GNN fraud detection notebook fix (#1450) - Add notebook fixes from @tzemicheal - Migrate remaining pandas code to cudf Closes #1265 ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - Eli Fajardo (https://github.com/efajardo-nv) Approvers: - Tad ZeMicheal (https://github.com/tzemicheal) URL: https://github.com/nv-morpheus/Morpheus/pull/1450 --- .../gnn-fraud-detection-training.ipynb | 432 +++++++++--------- 1 file changed, 220 insertions(+), 212 deletions(-) diff --git a/models/training-tuning-scripts/fraud-detection-models/gnn-fraud-detection-training.ipynb b/models/training-tuning-scripts/fraud-detection-models/gnn-fraud-detection-training.ipynb index d66234974d..7decd59636 100644 --- a/models/training-tuning-scripts/fraud-detection-models/gnn-fraud-detection-training.ipynb +++ b/models/training-tuning-scripts/fraud-detection-models/gnn-fraud-detection-training.ipynb @@ -50,16 +50,16 @@ "source": [ "%load_ext autoreload\n", "%autoreload 2\n", - "import pandas as pd\n", - "import numpy as np\n", "import os\n", + "\n", "import dgl\n", + "import matplotlib.pylab as plt\n", "import numpy as np\n", - "import pandas as pd\n", "import torch\n", "import torch.nn as nn\n", "from model import HeteroRGCN\n", "from model import HinSAGE\n", + "from model import prepare_data\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import auc\n", "from sklearn.metrics import average_precision_score\n", @@ -68,9 +68,15 @@ "from sklearn.metrics import roc_curve\n", "from torchmetrics.functional import accuracy\n", "from tqdm import trange\n", + "from training import build_fsi_graph\n", + "from training import evaluate\n", + "from training import get_metrics\n", + "from training import init_loaders\n", + "from training import save_model\n", + "from training import train\n", "from xgboost import XGBClassifier\n", - "from training import (get_metrics, evaluate, init_loaders, build_fsi_graph,\n", - " map_node_id, prepare_data, save_model, train)\n" + "\n", + "import cudf" ] }, { @@ -85,26 +91,6 @@ "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")" ] }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "device(type='cuda', index=0)" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#device " - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -122,8 +108,8 @@ "# Replace training-data.csv and validation-data.csv with training & validation csv in dataset file.\n", "TRAINING_DATA ='../../datasets/training-data/fraud-detection-training-data.csv'\n", "VALIDATION_DATA = '../../datasets/validation-data/fraud-detection-validation-data.csv'\n", - "train_data = pd.read_csv(TRAINING_DATA)\n", - "inductive_data = pd.read_csv(VALIDATION_DATA)" + "train_data = cudf.read_csv(TRAINING_DATA)\n", + "inductive_data = cudf.read_csv(VALIDATION_DATA)" ] }, { @@ -141,16 +127,15 @@ "outputs": [], "source": [ "# Increase number of samples.\n", - "def augement_data(train_data=train_data, n=20):\n", - " max_id = inductive_data.index.max()\n", + "def augment_data(train_data=train_data, n=20):\n", + " train_data.drop(columns=['index'], inplace=True, axis=1)\n", " non_fraud = train_data[train_data['fraud_label'] == 0]\n", - " \n", - " non_fraud = non_fraud.drop(['index'], axis=1)\n", - " df_fraud = pd.concat([non_fraud for i in range(n)])\n", - " df_fraud.index = np.arange(1076, 1076 + df_fraud.shape[0])\n", - " df_fraud['index'] = df_fraud.index\n", - " \n", - " return pd.concat((train_data, df_fraud))" + " df_fraud = cudf.concat([non_fraud for _ in range(n)])\n", + " df_train = cudf.concat([train_data, df_fraud])\n", + " df_train.reset_index(inplace=True)\n", + " df_train['index'] = df_train.index\n", + "\n", + " return df_train" ] }, { @@ -159,7 +144,19 @@ "metadata": {}, "outputs": [], "source": [ - "train_data = augement_data(train_data, n=20)" + "train_data = augment_data(train_data, n=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# re-arange test data index\n", + "last_train_index = train_data.index.max()+1\n", + "inductive_data.index = np.arange(last_train_index, last_train_index + inductive_data.shape[0])\n", + "inductive_data['index'] = inductive_data.index" ] }, { @@ -173,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -183,11 +180,11 @@ "The distribution of fraud for the train data is:\n", " 0 11865\n", "1 188\n", - "Name: fraud_label, dtype: int64\n", + "Name: fraud_label, dtype: int32\n", "The distribution of fraud for the inductive data is:\n", " 0 244\n", "1 21\n", - "Name: fraud_label, dtype: int64\n" + "Name: fraud_label, dtype: int32\n" ] } ], @@ -196,38 +193,13 @@ "print('The distribution of fraud for the inductive data is:\\n', inductive_data['fraud_label'].value_counts())" ] }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# split train, test and create nodes index\n", - "def prepare_data(df_train, df_test):\n", - " \n", - " train_idx_ = df_train.shape[0]\n", - " df = pd.concat([df_train, df_test], axis=0)\n", - " df['tran_id'] = df['index']\n", - "\n", - " meta_cols = ['tran_id', 'client_node', 'merchant_node']\n", - " for col in meta_cols:\n", - " map_node_id(df, col)\n", - "\n", - " train_idx = df['tran_id'][:train_idx_]\n", - " test_idx = df['tran_id'][train_idx_:]\n", - "\n", - " df['index'] = df['tran_id']\n", - " df.index = df['index']\n", - "\n", - " return (df.iloc[train_idx, :], df.iloc[test_idx, :], train_idx, test_idx, df['fraud_label'].values, df)" - ] - }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ + "# Split into training, testing datasets\n", "train_data, test_data, train_idx, inductive_idx, labels, df = prepare_data(train_data, inductive_data)" ] }, @@ -236,7 +208,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 3. Construct transasction graph network" + "### 3. Construct transaction graph network" ] }, { @@ -253,45 +225,17 @@ "metadata": {}, "outputs": [], "source": [ - "meta_cols = [\"client_node\", \"merchant_node\", \"fraud_label\", \"index\", \"tran_id\"]\n", + "meta_cols = [\"client_node\", \"merchant_node\", \"index\"]\n", "\n", "# Build graph\n", "whole_graph, feature_tensors = build_fsi_graph(df, meta_cols)\n", "train_graph, _ = build_fsi_graph(train_data, meta_cols)\n", - "whole_graph = whole_graph.to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Dataset to tensors\n", - "feature_tensors = feature_tensors.to(device)\n", - "train_idx = torch.from_numpy(train_idx.values).to(device)\n", - "inductive_idx = torch.from_numpy(inductive_idx.values).to(device)\n", - "labels = torch.LongTensor(labels).to(device)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Graph(num_nodes={'client': 623, 'merchant': 388, 'transaction': 12053},\n", - " num_edges={('client', 'buy', 'transaction'): 12053, ('merchant', 'sell', 'transaction'): 12053, ('transaction', 'bought', 'client'): 12053, ('transaction', 'issued', 'merchant'): 12053},\n", - " metagraph=[('client', 'transaction', 'buy'), ('transaction', 'client', 'bought'), ('transaction', 'merchant', 'issued'), ('merchant', 'transaction', 'sell')])\n" - ] - } - ], - "source": [ - "# Show structure of training graph.\n", - "print(train_graph)" + "\n", + "# Dataset\n", + "feature_tensors = feature_tensors.float()\n", + "train_idx = torch.from_dlpack(train_idx.values.toDlpack()).long()\n", + "inductive_idx = torch.from_dlpack(inductive_idx.values.toDlpack()).long()\n", + "labels = torch.from_dlpack(labels.toDlpack()).long()" ] }, { @@ -312,31 +256,34 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Hyperparameters\n", "target_node = \"transaction\"\n", - "epochs = 20\n", + "epochs = 25\n", "in_size, hidden_size, out_size, n_layers,\\\n", " embedding_size = 111, 64, 2, 2, 1\n", - "batch_size = 100\n", - "hyperparameters = {\"in_size\": in_size, \"hidden_size\": hidden_size,\n", - " \"out_size\": out_size, \"n_layers\": n_layers,\n", - " \"embedding_size\": embedding_size,\n", - " \"target_node\": target_node,\n", - " \"epoch\": epochs}\n", - "\n", + "batch_size = 256\n", + "in_size, hidden_size, out_size, n_layers, embedding_size = 111, 64, 2, 2, 1\n", + "hyperparameters = {\n", + " \"in_size\": in_size,\n", + " \"hidden_size\": hidden_size,\n", + " \"out_size\": out_size,\n", + " \"n_layers\": n_layers,\n", + " \"embedding_size\": embedding_size,\n", + " \"target_node\": target_node,\n", + " \"epoch\": epochs\n", + "}\n", "\n", - "scale_pos_weight = train_data['fraud_label'].sum() / train_data.shape[0]\n", - "scale_pos_weight = torch.tensor(\n", - " [scale_pos_weight, 1-scale_pos_weight]).to(device)" + "scale_pos_weight = (labels[train_idx].sum() / train_data.shape[0]).item()\n", + "scale_pos_weight = torch.FloatTensor([scale_pos_weight, 1 - scale_pos_weight]).to(device)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -345,7 +292,6 @@ " device), train_idx, test_idx=inductive_idx,\n", " val_idx=inductive_idx, g_test=whole_graph, batch_size=batch_size)\n", "\n", - "\n", "# Set model variables\n", "model = HinSAGE(train_graph, in_size, hidden_size, out_size, n_layers, embedding_size).to(device)\n", "optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)\n", @@ -354,314 +300,384 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " 0%| | 0/20 [00:00#sk-container-id-2 {color: black;background-color: white;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}
XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
+       "
XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
        "              colsample_bylevel=None, colsample_bynode=None,\n",
        "              colsample_bytree=None, early_stopping_rounds=None,\n",
        "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
@@ -806,7 +820,7 @@
        "              max_delta_step=None, max_depth=None, max_leaves=None,\n",
        "              min_child_weight=None, missing=nan, monotone_constraints=None,\n",
        "              n_estimators=100, n_jobs=None, num_parallel_tree=None,\n",
-       "              predictor=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.