-
Notifications
You must be signed in to change notification settings - Fork 8
/
emotion_recognizer_svm.py
105 lines (81 loc) · 3.05 KB
/
emotion_recognizer_svm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#important libraries
import pandas as pd
import numpy as np
import nltk
import re
#importing stopwords is optional, in this case it decreased accuracy
#from nltk.corpus import stopwords
import itertools
import time
# Wall-clock timer for the whole run; it is read again at the end of the script.
start_time = time.time()

import os
from nltk.stem.wordnet import WordNetLemmatizer

# Work from the download folder so the CSV can be opened by its bare file name.
# NOTE(review): this path is machine-specific.
os.chdir('/tmp/guest-pltjjp/Downloads')
data = pd.read_csv('text_emotion.csv')
# Uncomment to experiment on the first 100 rows only:
#data = data.iloc[:100,:]
# Stop-word filtering is disabled (see the note on the stopwords import):
#stopset = set(stopwords.words('english'))

# Shared lemmatizer instance used by cleaning().
lem = WordNetLemmatizer()
#comprehensive cleaning
def cleaning(text):
    """Normalise one raw tweet into a list of verb-lemmatised tokens.

    Steps, in order:
      1. Coerce ``text`` to ``str`` and strip anything matching ``http\\S+``.
      2. Drop every whitespace-separated token that starts with ``@``.
      3. Replace each non-word character with a space.
      4. Cap runs of the same character at two (``"soooo"`` -> ``"soo"``).
      5. Tokenise with NLTK and lemmatise each token as a verb.

    Args:
        text: The raw tweet; any object is accepted and passed through
            ``str()``.

    Returns:
        A list of lemmatised tokens, or the placeholder string ``'no text'``
        when nothing is left after cleaning. Callers must therefore be
        prepared for either a list or a string.
    """
    txt = re.sub(r"http\S+", "", str(text))

    # Filter mentions with a predicate instead of deleting a single position:
    # text without any mention keeps its first word, every mention is
    # removed (not just the last one), and whitespace-only input cannot
    # index into an empty sequence.
    kept = [token for token in txt.split() if not token.startswith('@')]
    if not kept:
        return 'no text'

    # Punctuation (apostrophes included) becomes a space here, so no separate
    # quote stripping is needed afterwards.
    txt = re.sub(r'[^\w]', ' ', ' '.join(kept))

    # groupby yields runs of identical consecutive characters; keep at most
    # two characters from each run.
    txt = ''.join(''.join(run)[:2] for _, run in itertools.groupby(txt))

    #data.content[i] = [w for w in data.content[i] if not w in stopset]
    lemmas = [lem.lemmatize(token, "v")
              for token in nltk.tokenize.word_tokenize(txt)]
    if not lemmas:
        return 'no text'
    return lemmas
# Clean every tweet, then collapse each token list back into a single
# space-separated string (TfidfVectorizer is later fed these strings).
#
# cleaning() returns either a list of tokens or the placeholder string
# 'no text'. The placeholder is kept as-is: joining a str would iterate it
# character by character and yield 'n o   t e x t'.
#
# The whole column is assigned in one step instead of writing
# data.content[i] row by row; that chained assignment is not guaranteed to
# update the frame and is much slower on large inputs.
data['content'] = data['content'].map(cleaning).map(
    lambda cleaned: cleaned if isinstance(cleaned, str) else ' '.join(cleaned)
)
data = data.reset_index(drop=True)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for evaluation (fixed seed), and renumber
# each of the four parts from zero.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        data.content, data.sentiment, test_size=0.25, random_state=0
    )
)

# Fit the TF-IDF vocabulary on the training text only, then reuse it for the
# held-out text.
vectorizer = TfidfVectorizer(min_df=3, max_df=0.9)
train_vectors = vectorizer.fit_transform(x_train)
test_vectors = vectorizer.transform(x_test)

# Linear-kernel SVM classifier trained on the TF-IDF features.
model = svm.SVC(kernel='linear')
model.fit(train_vectors, y_train)

# Predict the held-out rows and print the per-class report.
predicted_sentiment = model.predict(test_vectors)
print(classification_report(y_test, predicted_sentiment))
# Element-by-element copy of the predictions as a plain list (not read again
# in the visible part of the script).
predicted_sentiments = list(predicted_sentiment)

# Write cleaned text, predicted label and true label side by side to a CSV
# in the current working directory.
prediction_df = pd.DataFrame({
    'Content': x_test,
    'Emotion_predicted': predicted_sentiment,
    'Emotion_actual': y_test,
})
prediction_df.to_csv('emotion_recognizer_svm.csv', index = False)

# Report the total wall-clock time since start_time was taken.
elapsed_time = time.time() - start_time
print ("processing time:", elapsed_time, "seconds")