-
Notifications
You must be signed in to change notification settings - Fork 0
/
countvectorizer.py
142 lines (115 loc) · 4.36 KB
/
countvectorizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Hotel Reviews Sentiment Analysis Project
# This Project Uses Count Vectorizer for the models
# @author Simran
# @version 1.0
import pandas as pd # analyse data
import numpy as np # for working with arrays
# machine learning library
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
# >>- importing dataset ->>
# The CSV path is relative, so the script must be run from the directory
# that contains the data file.
print('>>- importing dataset ->>')
df = pd.read_csv('Datafiniti_Hotel_Reviews.csv')
# >>- getting rid of null values ->>
# dropna() with no arguments removes every row that has a null in ANY column.
print('>>- getting rid of null values ->>')
df = df.dropna()
# >>- taking 30% representative sample ->>
print('>>- taking 30% representative sample ->>')
# Seed the draw through sample() itself rather than through numpy's global
# generator, so the sample stays reproducible no matter what other code
# touches the global random state beforehand.
df1 = df.sample(frac=0.3, random_state=34)
# >>- adding sentiments column ->>
# Binary label: ratings of 1 or 2 become 0, every other rating becomes 1.
print('>>- adding sentiments column ->>')
df1['sentiments'] = df1['rating'].apply(lambda x: 0 if x in [1, 2] else 1)
# >>- defining input training features and labels ->>
print('>>- defining input training features and labels ->>')
X = df1['reviews']  # input feature: the 'reviews' column
Y = df1['sentiments']  # label: the 0/1 column derived from the rating
# >>- Using Count Vectorizer ->>
# The review text is converted into token-count features here; the
# classifiers further down are trained and scored on those features.
print('>>- Using Count Vectorizer ->>')
# Split the data into a training half and a testing half. random_state
# fixes the shuffle so the same split is produced on every run.
(X_train, X_test, Y_train, Y_test) = train_test_split(
    X, Y, test_size=0.5, random_state=24
)
# Vectorizing the text data
print('Vectorizing the text data')
cv = CountVectorizer()
# Fit the vectorizer on the training reviews only, then encode the test
# reviews with that same fitted vectorizer.
ctmTr = cv.fit_transform(X_train)
X_test_dtm = cv.transform(X_test)
# >- Logistic Regression ->
print('>- Logistic Regression ->')
# Training the model
print('Training the model')
lr = LogisticRegression()
lr.fit(ctmTr, Y_train)
# Generating Accuracy score
print('Generating Accuracy score')
lr_score = lr.score(X_test_dtm, Y_test)
print("Results for Logistic Regression with CountVectorizer")
print(lr_score)
# Predicting the labels for the test data
print('Predicting the labels for the test data')
Y_pred_lr = lr.predict(X_test_dtm)
# Setting up Confusion matrix
print('Setting up Confusion matrix')
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from
# the test labels and predictions, so the four-way unpacking cannot fail.
cm_lr = confusion_matrix(Y_test, Y_pred_lr, labels=[0, 1])
# Unpack from the matrix already computed instead of building it again.
tn, fp, fn, tp = cm_lr.ravel()
print(tn, fp, fn, tp)
# Printing True Positive and Negative rates
print('Printing True Positive and Negative rates')
# TPR = tp / (tp + fn); TNR = tn / (tn + fp); both rounded to 4 decimals.
tpr_lr = round(tp / (tp + fn), 4)
tnr_lr = round(tn / (tn + fp), 4)
print(tpr_lr, tnr_lr)
# >- Support Vector Machine ->
print('>- Support Vector Machine ->')
# Training the model
print('Training the model')
svcl = svm.SVC()
svcl.fit(ctmTr, Y_train)
# Generating Accuracy score
print('Generating Accuracy score')
svcl_score = svcl.score(X_test_dtm, Y_test)
print('Results for Support Vector Machine with CountVectorizer')
print(svcl_score)
# Predicting the labels for the test data
print('Predicting the labels for the test data')
Y_pred_sv = svcl.predict(X_test_dtm)
# Setting up Confusion matrix
print('Setting up Confusion matrix')
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from
# the test labels and predictions, so the four-way unpacking cannot fail.
cm_sv = confusion_matrix(Y_test, Y_pred_sv, labels=[0, 1])
# Unpack from the matrix already computed instead of building it again.
tn, fp, fn, tp = cm_sv.ravel()
print(tn, fp, fn, tp)
# Printing True Positive and Negative rates
print('Printing True Positive and Negative rates')
# TPR = tp / (tp + fn); TNR = tn / (tn + fp); both rounded to 4 decimals.
tpr_sv = round(tp / (tp + fn), 4)
tnr_sv = round(tn / (tn + fp), 4)
print(tpr_sv, tnr_sv)
# >- K Nearest Neighbor ->
print('>- K Nearest Neighbor ->')
# Training the model
print('Training the model')
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(ctmTr, Y_train)
# Generating Accuracy score
print('Generating Accuracy score')
knn_score = knn.score(X_test_dtm, Y_test)
print('Results for K Nearest Neighbor with CountVectorizer')
print(knn_score)
# Predicting the labels for the test data
print('Predicting the labels for the test data')
Y_pred_knn = knn.predict(X_test_dtm)
# Setting up Confusion matrix
print('Setting up Confusion matrix')
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from
# the test labels and predictions, so the four-way unpacking cannot fail.
cm_knn = confusion_matrix(Y_test, Y_pred_knn, labels=[0, 1])
# Unpack from the matrix already computed instead of building it again.
tn, fp, fn, tp = cm_knn.ravel()
print(tn, fp, fn, tp)
# Printing True Positive and Negative rates
print('Printing True Positive and Negative rates')
# TPR = tp / (tp + fn); TNR = tn / (tn + fp); both rounded to 4 decimals.
tpr_knn = round(tp / (tp + fn), 4)
tnr_knn = round(tn / (tn + fp), 4)
print(tpr_knn, tnr_knn)