Skip to content

Commit

Permalink
code
Browse files Browse the repository at this point in the history
  • Loading branch information
Jiang Wenxin committed Aug 12, 2021
1 parent 6559673 commit 12d55dd
Show file tree
Hide file tree
Showing 10 changed files with 2,009 additions and 2 deletions.
37 changes: 35 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,35 @@
# PTC-Imputation-DNN
Code for article Analysis of Papillary Thyroid Carcinoma Metastasis Using Missing Value Imputation and Deep Transfer Learning Network
# Missing Value Imputation and Deep Transfer Learning Network

This repository contains the source code for the paper **Analysis of Papillary Thyroid Carcinoma Metastasis Using Missing Value Imputation and Deep Transfer Learning Network**. This paper was accepted by the 2021 IEEE-IUS (2021 IEEE International Ultrasonics Symposium).

Missing values, diversified data types and insufficient sample size have become obstacles to clinical data analysis. In this paper, we propose a novel machine learning pipeline aiming to make full use of medical data. Missing value imputation via MICE, image analysis via deep learning and feature fusion via SVM were applied to predict the metastasis of Papillary Thyroid Carcinoma. Due to the privacy policy, the data set is not available in this repository.

This repository contains the code on data preprocessing, model training and result visualization. Please refer to the article for details.

- [Missing Value Imputation and Deep Transfer Learning Network](#missing-value-imputation-and-deep-transfer-learning-network)
- [Data Preprocessing for Clinical Features](#data-preprocessing-for-clinical-features)
- [Deep Learning Model for Image Analysis](#deep-learning-model-for-image-analysis)
- [SVM for Feature Fusion](#svm-for-feature-fusion)
- [Visualization](#visualization)


## Data Preprocessing for Clinical Features
* [summary](summary.R) used to summarize the clinical features;
* [dummy](dummy.R) used to convert categorical variable to dummy variable;
* [impute](impute.R) used to impute the missing value;
* [heatmap2](heatmap2.R) used to visualize the heat map and cluster analysis of the clinical features.

## Deep Learning Model for Image Analysis
* [resnet50_basic_cv](resnet50_basic_cv.ipynb) builds a ResNet-50 model;
* [get_prediction](get_prediction.ipynb) calculates statistical indicators to evaluate the deep learning model.

## SVM for Feature Fusion
* [SVM](SVM.ipynb) builds an SVM classifier.

## Visualization
* [gradcam](gradcam.py) visualizes the deep learning model via heat maps;
* [roc_curves](roc_curves.ipynb) visualizes our results with ROC curves.




349 changes: 349 additions & 0 deletions SVM.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,349 @@
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python362jvsc74a57bd06ad2903d9f8eb78095a8e1054d3b7d96029c25240c1559c5c7674409ebbac066",
"display_name": "Python 3.6.2 64-bit (conda)"
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "markdown",
"source": [
"SVM for Feature Fusion"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"import torch\r\n",
"import torch.nn as nn\r\n",
"import torch.nn.functional as F\r\n",
"import torch.optim as optim\r\n",
"import torch.optim.lr_scheduler as lr_scheduler\r\n",
"from torch.optim.lr_scheduler import _LRScheduler\r\n",
"import torch.utils.data as data\r\n",
"import torchvision.transforms as transforms\r\n",
"import torchvision.datasets as datasets\r\n",
"import torchvision.models as models\r\n",
"from torch.utils.data import Dataset, DataLoader\r\n",
"from torchvision import utils\r\n",
"\r\n",
"from sklearn import decomposition\r\n",
"from sklearn import manifold\r\n",
"from sklearn import metrics\r\n",
"from sklearn import model_selection\r\n",
"from sklearn.metrics import confusion_matrix\r\n",
"\r\n",
"import matplotlib.pyplot as plt\r\n",
"%matplotlib inline\r\n",
"\r\n",
"import copy\r\n",
"from collections import namedtuple\r\n",
"import os, re, time\r\n",
"import random\r\n",
"import shutil\r\n",
"import math\r\n",
"\r\n",
"import pandas as pd\r\n",
"import numpy as np\r\n",
"from sklearn import svm\r\n",
"\r\n",
"# set seed to make sure the results are reproducible\r\n",
"SEED = 123\r\n",
"random.seed(SEED)\r\n",
"np.random.seed(SEED)\r\n",
"torch.manual_seed(SEED)\r\n",
"torch.cuda.manual_seed(SEED)\r\n",
"torch.backends.cudnn.deterministic = True\r\n",
"\r\n",
"# change directory of data\r\n",
"CV_ID = 1\r\n",
"CV_ID = str(CV_ID)\r\n",
"datapath = os.path.join(\".data\",\"hcb\",\"cv\" + CV_ID)\r\n",
"load_name = 'basic-model-cv'+CV_ID+'.pt'\r\n",
"train_dir = os.path.join(datapath, 'train')\r\n",
"val_dir = os.path.join(datapath, 'val')\r\n",
"test_dir = os.path.join(datapath, 'test')\r\n",
"\r\n",
"pretrained_size = (224, 224)\r\n",
"pretrained_means = [0.485, 0.456, 0.406]\r\n",
"pretrained_stds= [0.229, 0.224, 0.225]\r\n",
"test_transforms = transforms.Compose([\r\n",
" transforms.Resize(pretrained_size),\r\n",
" transforms.ToTensor(),\r\n",
" transforms.Normalize(mean = pretrained_means, \r\n",
" std = pretrained_stds)\r\n",
" ])\r\n",
"# in order to load each image with its label\r\n",
"class MyImageFolder(datasets.ImageFolder):\r\n",
" def __getitem__(self, index):\r\n",
" path, _ = self.imgs[index] #img path, label\r\n",
" return super(MyImageFolder, self).__getitem__(index), path # return image path\r\n",
" \r\n",
"\r\n",
"BATCH_SIZE = 1000 #4\r\n",
"train_data = MyImageFolder(root = train_dir,\r\n",
" transform = test_transforms)\r\n",
"test_data = MyImageFolder(root = test_dir,\r\n",
" transform = test_transforms)\r\n",
"valid_data = MyImageFolder(root = val_dir,\r\n",
" transform = test_transforms)\r\n",
"# train_iterator = data.DataLoader(train_data, \r\n",
"# shuffle = True,\r\n",
"# drop_last = True,\r\n",
"# batch_size = BATCH_SIZE) \r\n",
"# valid_iterator = data.DataLoader(valid_data, drop_last = True,\r\n",
"# batch_size = BATCH_SIZE)\r\n",
"# test_iterator = data.DataLoader(test_data, drop_last = True,\r\n",
"# batch_size = BATCH_SIZE) \r\n",
"\r\n",
"\r\n",
"# read csv data\r\n",
"path_to_text = '.data/text.csv'\r\n",
"text_table = pd.read_csv(path_to_text, header=0, index_col=0)\r\n",
"Dimension_Text = len(text_table.columns)\r\n",
"\r\n",
"def imgid2index(id):\r\n",
" return 1000*int(id[0])+int(id[2:])\r\n",
"\r\n",
"def imgid2textinfo(imgid):\r\n",
" # convert img path into text info in .csv\r\n",
" # input: ['.data\\\\train\\\\zy\\\\1_152.bmp','2_8.bmp']\r\n",
" # output: tensor([[ 1, 4, 7, 10],\r\n",
" # [ 3, 6, 9, 12]])\r\n",
" return torch.tensor(text_table.loc[[imgid2index(re.search('\\d_\\d+',i.split('\\\\')[-1]).group()) for i in imgid],:].values, dtype=torch.float32, device=device)\r\n",
"\r\n",
"basic_model = torch.hub.load('pytorch/vision:v0.6.0', 'resnet50')\r\n",
"# change output dimension to what we need\r\n",
"IN_FEATURES = basic_model.fc.in_features \r\n",
"OUTPUT_DIM = 2\r\n",
"basic_model.fc = nn.Linear(IN_FEATURES, OUTPUT_DIM)\r\n",
"fc1=nn.Linear(IN_FEATURES, 32)\r\n",
"fc2=nn.Linear(32,OUTPUT_DIM)\r\n",
"basic_model.fc = nn.Sequential(fc1, fc2)\r\n",
"basic_model.load_state_dict(torch.load(load_name))\r\n",
"basic_model.fc[1] = nn.Identity()\r\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\r\n",
"# text_table.head()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# load train data\r\n",
"num_train = len(train_data)\r\n",
"train_data_combine_model = data.DataLoader(train_data,\r\n",
" shuffle=True,\r\n",
" batch_size=num_train)\r\n",
"xy, train_id = next(iter(train_data_combine_model))\r\n",
"x, y = xy\r\n",
"img_output = basic_model(x)\r\n",
"train_x = torch.cat((img_output, imgid2textinfo(\r\n",
" train_id).cpu()), dim=-1).detach().numpy()\r\n",
"train_y = y\r\n",
"# load val data and combine with train\r\n",
"num_val = len(valid_data)\r\n",
"val_data_combine_model = data.DataLoader(valid_data,\r\n",
" shuffle=True,\r\n",
" batch_size=num_val)\r\n",
"xy, val_id = next(iter(val_data_combine_model))\r\n",
"x, y = xy\r\n",
"img_output = basic_model(x)\r\n",
"val_x = torch.cat((img_output, imgid2textinfo(val_id).cpu()),\r\n",
" dim=-1).detach().numpy()\r\n",
"val_y = y\r\n",
"\r\n",
"train_x = np.concatenate((train_x, val_x), axis=0)\r\n",
"train_y = np.concatenate((train_y, val_y), axis=0)\r\n",
"# load test data\r\n",
"num_test = len(test_data)\r\n",
"test_data_combine_model = data.DataLoader(test_data, batch_size=num_test)\r\n",
"xy, test_id = next(iter(test_data_combine_model))\r\n",
"x, y = xy\r\n",
"img_output = basic_model(x)\r\n",
"test_x = torch.cat((img_output, imgid2textinfo(\r\n",
" test_id).cpu()), dim=-1).detach().numpy()\r\n",
"test_y = y.numpy()\r\n",
"tmpt = [imgid2index(re.search('\\d_\\d+', i.split('\\\\')[-1]).group())\r\n",
" for i in train_id]\r\n",
"tmpv = [imgid2index(re.search('\\d_\\d+', i.split('\\\\')[-1]).group())\r\n",
" for i in val_id]\r\n",
"train_ID = tmpt+tmpv\r\n",
"test_ID = [imgid2index(re.search('\\d_\\d+', i.split('\\\\')[-1]).group())\r\n",
" for i in test_id]\r\n",
"\r\n",
"x_train = text_table.loc[train_ID, :].values\r\n",
"y_train = [i//1000-1 for i in train_ID]\r\n",
"# print(x_train,y_train)\r\n",
"\r\n",
"x_test = text_table.loc[test_ID, :].values\r\n",
"y_test = [i//1000-1 for i in test_ID]\r\n",
"# print(x_test,y_test)"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"# SVM TEXT+IMG"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"#search best params for SVM\r\n",
"svm_para = svm.SVC() \r\n",
"param_grid = {'C': range(5,30), 'gamma': [1e-2, 7e-3, 5e-3, 3e-3, 1e-3, 7e-4, 5e-4, 3e-4, 1e-4, 7e-5], \r\n",
" 'kernel': ['rbf', 'linear', 'poly', 'sigmoid']} \r\n",
"grid_search = model_selection.GridSearchCV(svm_para, param_grid) \r\n",
"grid_search.fit(train_x, train_y) \r\n",
"best_parameters = grid_search.best_estimator_.get_params()\r\n",
"for para, val in list(best_parameters.items()): \r\n",
" print(para, val) "
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# combine text info and output of img model with Support Vector Machine\r\n",
"rbf_svc = svm.SVC(C=best_parameters['C'], gamma=best_parameters['gamma'], kernel=best_parameters['kernel'])\r\n",
"# rbf_svc = svm.SVC(kernel='rbf',C = 18, gamma = 0.001)\r\n",
"rbf_svc.fit(train_x, train_y)\r\n",
"\r\n",
"svm_pred = rbf_svc.predict(train_x)\r\n",
"acc = np.equal(svm_pred, train_y).sum() / len(train_y)\r\n",
"fpr, tpr, thresholds = metrics.roc_curve(train_y, rbf_svc.decision_function(train_x), pos_label=1)\r\n",
"roc_auc = metrics.auc(fpr, tpr)\r\n",
"plt.plot(fpr, tpr, 'b',label='AUC = %0.2f'% roc_auc)\r\n",
"plt.legend(loc='lower right')\r\n",
"# plt.plot([0, 1], [0, 1], 'r--')\r\n",
"plt.xlim([-0.1, 1.1])\r\n",
"plt.ylim([-0.1, 1.1])\r\n",
"plt.xlabel('False Positive Rate') # x-axis is FPR\r\n",
"plt.ylabel('True Positive Rate') # y-axis is TPR\r\n",
"plt.title('Receiver operating characteristic Curve')\r\n",
"plt.show()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"svm_pred = rbf_svc.predict(test_x)\r\n",
"acc = np.equal(svm_pred, test_y).sum() / len(test_y)\r\n",
"fpr, tpr, thresholds = metrics.roc_curve(test_y, rbf_svc.decision_function(test_x), pos_label=1)\r\n",
"roc_auc = metrics.auc(fpr, tpr)\r\n",
"plt.plot(fpr, tpr, 'b',label='AUC = %0.2f'% roc_auc)\r\n",
"plt.legend(loc='lower right')\r\n",
"# plt.plot([0, 1], [0, 1], 'r--')\r\n",
"plt.xlim([-0.1, 1.1])\r\n",
"plt.ylim([-0.1, 1.1])\r\n",
"plt.xlabel('False Positive Rate') # x-axis is FPR\r\n",
"plt.ylabel('True Positive Rate') # y-axis is TPR\r\n",
"plt.title('Receiver operating characteristic Curve')\r\n",
"plt.show()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"# SVM TEXT"
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"#search best params for SVM\r\n",
"svm_para = svm.SVC() \r\n",
"param_grid = {'C': range(5,20), 'gamma': [1e-2, 7e-3, 5e-3, 3e-3, 1e-3, 7e-4], \r\n",
" 'kernel': ['rbf', 'linear', 'poly', 'sigmoid']} \r\n",
"grid_search = model_selection.GridSearchCV(svm_para, param_grid) \r\n",
"grid_search.fit(x_train, y_train) \r\n",
"best_parameters = grid_search.best_estimator_.get_params()\r\n",
"for para, val in list(best_parameters.items()): \r\n",
" print(para, val) "
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# combine text info and output of img model with Support Vector Machine\r\n",
"rbf_svc = svm.SVC(C=best_parameters['C'], gamma=best_parameters['gamma'], kernel=best_parameters['kernel'])\r\n",
"# rbf_svc = svm.SVC(kernel='rbf',C = 18, gamma = 0.001)\r\n",
"rbf_svc.fit(x_train, y_train)\r\n",
"\r\n",
"svm_pred = rbf_svc.predict(x_train)\r\n",
"acc = np.equal(svm_pred, y_train).sum() / len(y_train)\r\n",
"fpr, tpr, thresholds = metrics.roc_curve(y_train, rbf_svc.decision_function(x_train), pos_label=1)\r\n",
"roc_auc = metrics.auc(fpr, tpr)\r\n",
"plt.plot(fpr, tpr, 'b',label='AUC = %0.2f'% roc_auc)\r\n",
"plt.legend(loc='lower right')\r\n",
"# plt.plot([0, 1], [0, 1], 'r--')\r\n",
"plt.xlim([-0.1, 1.1])\r\n",
"plt.ylim([-0.1, 1.1])\r\n",
"plt.xlabel('False Positive Rate') # x-axis is FPR\r\n",
"plt.ylabel('True Positive Rate') # y-axis is TPR\r\n",
"plt.title('Receiver operating characteristic Curve')\r\n",
"plt.show()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"svm_pred = rbf_svc.predict(x_test)\r\n",
"acc = np.equal(svm_pred, y_test).sum() / len(y_test)\r\n",
"fpr, tpr, thresholds = metrics.roc_curve(y_test, rbf_svc.decision_function(x_test), pos_label=1)\r\n",
"roc_auc = metrics.auc(fpr, tpr)\r\n",
"plt.plot(fpr, tpr, 'b',label='AUC = %0.2f'% roc_auc)\r\n",
"plt.legend(loc='lower right')\r\n",
"# plt.plot([0, 1], [0, 1], 'r--')\r\n",
"plt.xlim([-0.1, 1.1])\r\n",
"plt.ylim([-0.1, 1.1])\r\n",
"plt.xlabel('False Positive Rate') # x-axis is FPR\r\n",
"plt.ylabel('True Positive Rate') # y-axis is TPR\r\n",
"plt.title('Receiver operating characteristic Curve')\r\n",
"plt.show()"
],
"outputs": [],
"metadata": {}
}
]
}
Loading

0 comments on commit 12d55dd

Please sign in to comment.