-
Notifications
You must be signed in to change notification settings - Fork 0
/
BreastCancerCaseStudy.R
224 lines (155 loc) · 9.06 KB
/
BreastCancerCaseStudy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File: EARIN miniproyect 2
# Authors: Alejandro Cirugeda
#
# Description:
# The task of this project is to make a case study of the different classifiers and techniques
# used in datamining and machine learning and the advantages and disadvantages of each one of them.
# In order to do that we will using a Breast Cancer Dataset. After the case study we will be able to
# distinguish the best technique to create a classifier for this data for distinguish between malign
# and benign tumours.
#Data: https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)
library(caret)
library(party)
library(rpart)
library(e1071)
library(ggplot2)
library(randomForest)
library(mlbench)
library(rpart.plot)
library(ROCR)
# We will be using BreastCancer dataset form mlbech pacakge which is the same one
# from UCI but with headers and NA values removed.
data(BreastCancer)
head(BreastCancer)
# Attribute Domain
#--------------------------------------------
# 1. ID id number
# 2. Cl.thickness 1 - 10
# 3. Cell Size 1 - 10
# 4. Cell Shape 1 - 10
# 5. Marginal Adhesion 1 - 10
# 6. Single Epithelial Cell Size 1 - 10
# 7. Bare Nuclei 1 - 10
# 8. Bland Chromatin 1 - 10
# 9. Normal Nucleoli 1 - 10
# 10. Mitoses 1 - 10
# 11. Class Bening - Malign
summary(BreastCancer)
barplot(prop.table(table(BreastCancer$Class)), col=c("#99CCFF","#FFCC99"))
#### Data preparation #######
# In order to improve the results of te classifiers we need to prepare the data
# First we ommit the empty values because can produce some errors in some classifier
BreastCancer <- na.omit(BreastCancer)
# We convert the class attribute to categorical
BreastCancer$Class <- factor(BreastCancer$Class)
# In this dataset we have a Id collumn that wont be usefull, therefore we remove it
BreastCancer$Id <- NULL
# Also we use preprocesing in order to improve the outcome of the differents classifiers:
procMethod <- preProcess(BreastCancer, method = c("center","scale"));
BreastCancer <- predict(procMethod,BreastCancer)
# #### DATA SPLIT #####
# Now we partition the data between trainset and testset using a vector variable called ind.
# Each split will be assign a value.
# trainset will be 75% -- ind == 1
# testset will be 25% -- ind == 2
set.seed(12345)
ind <- sample(2, nrow(BreastCancer), replace = TRUE, prob=c(0.75, 0.25))
# We need to have both classes in each split of the data in order for the creation of correct models
table(BreastCancer[ind == 1,]$Class)
# benign = 321
# malign = 174
table(BreastCancer[ind == 2,]$Class)
# benign = 123
# malign = 65
# ################### CTREES ########################
#Decision trees are simple top-down models in which each node in the tree creates a binary split which after a
#succession of internal nodes we arrived at the terminal node which the prediction is made.
model_ctree <- ctree(Class ~ ., data=BreastCancer[ind == 1,])
plot(model_ctree, main="Decision tree with ctrees")
model_ctree.prediction <- predict(model_ctree, newdata=BreastCancer[ind == 2,])
confusionMatrix(model_ctree.prediction, BreastCancer[ind == 2,]$Class)
#accuracy = 0.9468
# Now we represent the results with a ROC curve
model_ctree.prob <- 1- unlist(treeresponse(model_ctree, BreastCancer[ind == 2,]), use.names=F)[seq(1,nrow(BreastCancer[ind == 2,])*2,2)]
model_ctree.prob.rocr <- prediction(model_ctree.prob, BreastCancer[ind == 2,'Class'])
model_ctree.perf <- performance(model_ctree.prob.rocr, measure = "tpr",x.measure = "fpr")
plot(model_ctree.perf, col=2, main="ROC Curve ctree")
# ################## RPART ######################
# recursive partitioning and regression trees. Categorical or continuous variables can be used depending on
#whether one wants classification trees or regression trees. we will used rpart library.
model_rpart <- rpart(Class ~ .,data = BreastCancer[ind == 1,])
rpart.plot(model_rpart, main="Decision tree with rpart")
model_rpart.prediction <- predict(model_rpart, newdata=BreastCancer[ind == 2,])
# Have to change here the form of confusion matrix due an error with ConfusionMatrix function
model_rpart.pred <- predict(model_rpart, newdata=BreastCancer[ind == 2,], type = 'class')
confusionMatrix(table(model_rpart.pred, BreastCancer[ind == 2,]$Class))
#accuracy = 0.9202
#Now we represent the roc curve:
pred <- prediction(model_rpart.prediction[, 2], BreastCancer[ind == 2,]$Class)
model_rpart.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(model_rpart.perf, col = 3, main = "ROC Curve rpart")
# ################## RANDOM FOREST #################
# Now we will use a technique call Random Forest
#Random forest is very similar to bagging, the main different is that they used a modified tree
#learning algorithm which decided a random subset of features. For this classification we will use the
#“caret” and “randomForest” libraries
model_rf <- cforest(Class ~ ., data=BreastCancer[ind == 1,], control = cforest_unbiased(mtry = ncol(BreastCancer)-2))
model_rf.prediction <- predict(model_rf, newdata=BreastCancer[ind == 2,])
# Now we calculate the confusion matrix
confusionMatrix(model_rf.prediction, BreastCancer[ind == 2,]$Class)
#accuracy = 0.9628
model_rf.prob <- 1- unlist(treeresponse(model_rf, BreastCancer[ind == 2,]), use.names=F)[seq(1,nrow(BreastCancer[ind == 2,])*2,2)]
model_rf.prob.rocr <- prediction(model_rf.prob, BreastCancer[ind == 2,'Class'])
model_rf.perf <- performance(model_rf.prob.rocr, measure = "tpr",x.measure = "fpr")
plot(model_rf.perf, col=4, main = "random forest ROC Curve")
# ################ BAYER CLASSIFIER ##################
#Naive Bayes is a supervised learning classification algorithm based on Bayes Theorem.
#It is an extension of Exact Bayes classifier and assumes all variables are independent of each other.
#This method is suitable when dimensionality of inputs is high. It uses a simple method of conditional probability
#and can be computationally inexpensive and outperform more sophisticated models. We will use the “e1071” library
#for this task
model_bayer<- naiveBayes(Class ~ ., BreastCancer[ind == 1,])
model_bayer.prediction <- predict(model_bayer, newdata=BreastCancer[ind == 2,])
# Now we calculate the confusion matrix
confusionMatrix(model_bayer.prediction, BreastCancer[ind == 2,]$Class)
#accuracy = 0.9628
model_bayer.prob <- predict(model_bayer, type="raw", newdata=BreastCancer[ind == 2,], probability = TRUE)
model_bayer.prob.rocr <- prediction(model_bayer.prob[,2], BreastCancer[ind == 2,'Class'])
model_bayer.perf <- performance(model_bayer.prob.rocr, measure = "tpr",x.measure = "fpr")
plot(model_bayer.perf, col=5, main = "Bayer ROC Curve")
# ##################### SVM ##############################
#Support Vector Machines generates a partition of the data produce by a hyperlane. This line is produce using all
#the attributes of the data and by a clear gap that should be as wide as possible. For this classifier the library
#“e1071” will used.
#For the svm classifier firts we need to tune it to discover the optimal parameters
model_svm.tune <- tune(svm, Class~., data = BreastCancer[ind == 1,],
ranges = list(gamma = 2^(-8:1), cost = 2^(0:4)),
tunecontrol = tune.control(sampling = "fix"))
model_svm.tune # We obtain the tune parameter and use it in the svm classifier
# cost = 1
# gamma = 0.125
# and use it for the creation of the model
model_svm <- svm(Class~., data = BreastCancer[ind == 1,], cost=1, gamma=0.125, probability = TRUE)
model_svm.prediction <- predict(model_svm, newdata=BreastCancer[ind == 2,])
# and calculate the confusion matrix
confusionMatrix(model_svm.prediction, BreastCancer[ind == 2,]$Class)
#accuracy = 0.9521
model_svm.prob <- predict(model_svm, type="prob", newdata=BreastCancer[ind == 2,], probability = TRUE)
model_svm.prob.rocr <- prediction(attr(model_svm.prob, "probabilities")[,2], BreastCancer[ind == 2,'Class'])
model_svm.perf <- performance(model_svm.prob.rocr, measure = "tpr",x.measure = "fpr")
plot(model_svm.perf, col=6, main="SVM ROC Curve")
# ######## COMPARATION OF THE CLASSIFIERS ########
# Now we can plot the different results of the classifier in the same graph in order to compare them
plot(model_ctree.perf, col=2, main="comparation ROC curves", lwd=2) # ctree
plot(model_rpart.perf, col=3, add=TRUE, , lwd=2) # rpart
plot(model_rf.perf, col=4, add=TRUE, lwd=2) # random forest
plot(model_svm.perf, col=5, add=TRUE, lwd=2) # svm
plot(model_bayer.perf, col=6, add=TRUE, lwd=2) # bayer
# the legend
legend(0.6, 0.8, c("ctree", "rpart", "rand forest", "SVM", "Bayer"), 2:6)
# Conclusion:
# As we can see in the final graph both Support Vector Machines and Bayer classifier have very similar
# ROC curves. But if we check the accuracy of the classifiers we can see that bayer has 96.28%
# against 95.21% of the SVM.
#
# Therefore we can conclude that for this dataset the best classifier is Naive Bayes