import itertools
import re
import string
import numpy as np
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet, return_list=False):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
        return_list: if True, return the tokens as a list;
            otherwise join them into a single space-separated string
    Output:
        the processed tweet, either as a list of stemmed tokens or as a string
    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove the hash # sign from hashtags (keep the word itself)
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    final_tweet_tokens = []
    for word in tweet_tokens:
        if (word not in stopwords_english and     # remove stopwords
                word not in string.punctuation):  # remove punctuation
            stem_word = stemmer.stem(word)  # stemming word
            final_tweet_tokens.append(stem_word)
    return final_tweet_tokens if return_list else " ".join(final_tweet_tokens)
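

# Minimal usage sketch (assumes the NLTK 'stopwords' corpus has been
# downloaded, e.g. via nltk.download('stopwords'); the sample tweet is
# illustrative and the exact output may vary across NLTK versions):
if __name__ == "__main__":
    sample = "RT @user: I love #NLP! https://t.co/xyz $TSLA"
    print(process_tweet(sample))                     # e.g. 'love nlp'
    print(process_tweet(sample, return_list=True))   # e.g. ['love', 'nlp']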


def convert_tweets_to_doc_vec(tweets, w2v_model):
    """Average the word2vec vectors of each tweet's tokens into one document
    vector per tweet; returns an array of shape (len(tweets), vector_size)."""
    np.seterr(divide='ignore', invalid='ignore')
    docs = []
    for tweet in tweets:
        tokens = process_tweet(tweet, return_list=True)
        tweet_vec = np.zeros(shape=(1, w2v_model.wv.vector_size))
        tweet_words_counter = len(tokens)
        for token in tokens:
            try:
                tweet_vec += w2v_model.wv[token]
            except KeyError:
                # token not in the word2vec vocabulary; contributes nothing
                pass
        # a tweet with no tokens yields 0 / 0 == NaN here, which
        # np.nan_to_num maps back to zeros below
        docs.append(tweet_vec / tweet_words_counter)
    docs = np.array(docs)
    shape1, shape2, shape3 = docs.shape
    docs = docs.reshape((shape1, shape2 * shape3))
    docs = np.nan_to_num(docs)
    return docs
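

# Minimal usage sketch (assumes gensim 4.x is installed, where Word2Vec
# takes the vector_size keyword; the tiny corpus and min_count=1 are
# illustrative only):
if __name__ == "__main__":
    from gensim.models import Word2Vec
    raw_tweets = ["I am so happy today :)", "This is a sad, sad day"]
    corpus = [process_tweet(t, return_list=True) for t in raw_tweets]
    w2v = Word2Vec(sentences=corpus, vector_size=10, min_count=1)
    doc_vecs = convert_tweets_to_doc_vec(raw_tweets, w2v)
    print(doc_vecs.shape)  # (2, 10)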


# freqs maps (word, class) -> count; build_probs below rescales each
# count by the vocabulary size.
def build_freqs(tweets, ys):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        words_count: the total number of processed tokens seen
        freqs: a dictionary mapping each (word, sentiment) pair to its
            frequency
        vocab: the set of all distinct processed words
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()
    # Start with an empty dictionary and populate it by looping over all
    # tweets and over all processed words in each tweet.
    freqs = {}
    vocab = set()
    words_count = 0
    for y, tweet in zip(yslist, tweets):
        # return_list=True is required here: the default string return
        # value would make this loop iterate over single characters
        for word in process_tweet(tweet, return_list=True):
            words_count += 1
            vocab.add(word)
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1
    return words_count, freqs, vocab
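

# Minimal usage sketch (the tweets and labels are illustrative; assumes
# the NLTK 'stopwords' corpus is available, as for process_tweet above):
if __name__ == "__main__":
    train_tweets = ["I am happy :)", "I am sad"]
    train_ys = np.array([[1], [0]])
    n_words, freqs, vocab = build_freqs(train_tweets, train_ys)
    print(n_words, freqs, vocab)
    # e.g. 3 {('happi', 1): 1, (':)', 1): 1, ('sad', 0): 1} {'happi', ':)', 'sad'}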


def build_probs(freqs, vocab):
    """Rescale each (word, class) count by the vocabulary size, producing
    the probability-like scores used downstream."""
    probs = {}
    for pair, count in freqs.items():
        probs[pair] = count / len(vocab)
    return probs


def get_word_class_count(freqs, word, label):
    """
    Input:
        freqs: a dictionary with the frequency of each (word, label) pair
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears.
    """
    return freqs.get((word, label), 0)
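

# Minimal usage sketch for the two helpers above (the tiny freqs and
# vocab are illustrative only):
if __name__ == "__main__":
    demo_freqs = {("happi", 1): 1, ("sad", 0): 2}
    demo_vocab = {"happi", "sad"}
    print(build_probs(demo_freqs, demo_vocab))         # {('happi', 1): 0.5, ('sad', 0): 1.0}
    print(get_word_class_count(demo_freqs, "sad", 0))  # 2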


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        # normalize each row so cells show per-class proportions
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label', fontsize=15)
    plt.xlabel('Predicted label', fontsize=15)
    plt.show()
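

# Minimal usage sketch (assumes scikit-learn is installed; the labels and
# predictions are illustrative only):
if __name__ == "__main__":
    from sklearn.metrics import confusion_matrix
    y_true = [1, 0, 1, 1, 0]
    y_pred = [1, 0, 0, 1, 0]
    cm = confusion_matrix(y_true, y_pred)
    plot_confusion_matrix(cm, classes=["negative", "positive"], normalize=True)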