-
Notifications
You must be signed in to change notification settings - Fork 0
/
import_new.R
192 lines (137 loc) · 8.63 KB
/
import_new.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# David Ebert
# September 13, 2016
source("functions.R")

#############################################################
###### Import one raw data frame
#############################################################
#library(streamR)
# Parse a single day's raw JSON into a data frame (parseTweets() comes from
# functions.R / streamR -- TODO confirm), then keep only the first 1000 rows
# for quick inspection.
temp_df <- parseTweets(tweets = "~/Desktop/Huang Research/LAR_Data/raw_data/2016-09/2016-09-06.json")
temp_df <- temp_df[seq_len(1000), ]
#############################################################
###### Import NEW 2016 tweets. Then analyze them.
#############################################################
#new_data_path = "~/Desktop/Huang Research/September_LaPY_data/" #Linux
#new_data_path = "C:/Users/000678922/Desktop/LAR_data" #Windows
#setwd("~/Desktop/Huang Research/September_LaPY_data")
# import_tweets_from_json() (defined in functions.R) reads the month's raw
# .json files and writes cleaned .feather files alongside them.
start_time <- Sys.time()
# This takes a long time!
import_tweets_from_json(months_to_import = "2016-08/",
                        base_file_path = "~/Desktop/Huang Research/LAR_Data")
Sys.time() - start_time # takes about 30 minutes for a week's worth of tweets

# Read (currently one) cleaned feather file back into R
library(feather)
#tweet_df = read_feather(path = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/1.feather")
# This runs fast... sometimes...
################################
#### Make august data frame with 5.6 million tweets; write all august tweets to feather file
################################
library(feather)
a <- Sys.time()
# Combine the month's cleaned feather chunks (1.feather .. 3.feather) into one
# data frame. Reading the pieces into a list and rbind-ing ONCE avoids the
# quadratic copy cost of growing the data frame with repeated rbind() calls,
# and the shared directory path is no longer triplicated.
feather_dir <- "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08"
chunk_paths <- file.path(feather_dir, paste0(1:3, ".feather"))
all_august_tweets <- do.call(rbind, lapply(chunk_paths, read_feather))
#write_feather(all_august_tweets, path = file.path(feather_dir, "all_august.feather"))
# NOTE(review): this read immediately replaces the rbind result above with the
# previously saved full-month file -- confirm which of the two is intended.
all_august_tweets <- read_feather(path = file.path(feather_dir, "all_august.feather"))
# 4651427 tweets from August
Sys.time() - a
################################
#### Build classifier from all_august_tweets
################################
# Extract emoticon tweets from all_august_tweets. The half/half sampling
# below assumes one polarity occupies the first half of the rows and the
# other the second half -- TODO confirm against extract_emoticon_tweets().
emoticon_df <- extract_emoticon_tweets(all_august_tweets)
print(paste("Finished extracting", nrow(emoticon_df),"emoticon tweets and building NDSI lexicon using tweets from tweet_df..."))
# 352078 emoticon tweets, evenly balanced
# sad happy
# 176039 214077
# Remove all_august_tweets from memory
rm(all_august_tweets)
# Subsample at most 40000 rows (20000 from each half), then save to feather file.
# BUG FIX: the original expression `1:nrow(emoticon_df)/2` parsed as
# `(1:nrow(emoticon_df))/2` because `:` binds tighter than `/`. That produced
# fractional positions over the WHOLE data frame which as.integer() truncated
# -- index 0 plus heavy duplication -- so the first draw never actually
# sampled "the first half". The code below samples 20000 distinct rows from
# each half of the data frame, as intended.
n_emo <- nrow(emoticon_df)
half <- n_emo %/% 2
indices <- c(sample(seq_len(half), 20000),
             sample(seq(half + 1L, n_emo), 20000))
emoticon_df <- emoticon_df[indices, ]
# table(emoticon_df$polarity) # Check for balance
#write.csv(x = emoticon_df, file = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/emoticon_df.csv", row.names = FALSE)
#emoticon_df2 = read.csv(file = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/emoticon_df.csv") #NOT THE SAME!
#write_feather(emoticon_df, path = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/emoticon_df.feather")
#emoticon_df = read_feather(path = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/emoticon_df.feather") # WHY THIS ERROR??
# Build the NDSI lexicon (top discriminative words, smoothed) and time it.
a <- Sys.time()
ndsi_lexicon_df <- make_ndsi_lexicon(emoticon_df, max_words = 2000, smoothing_alpha = 2^12)
Sys.time() - a
#write.csv(x = ndsi_lexicon_df, file = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/ndsi_lexicon_df.csv", row.names = FALSE)
#ndsi_lexicon_df2 = read.csv(file = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/ndsi_lexicon_df.csv")
#write_feather(ndsi_lexicon_df, path = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/ndsi_lexicon_df.feather")
#ndsi_lexicon_df = read_feather(path = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/ndsi_lexicon_df.feather")

# Term-frequency data frame: one row per emoticon tweet, counts for each
# lexicon word.
emoticon_term_freq <- make_term_freq(emoticon_df, ndsi_lexicon_df)
#write.csv(x = emoticon_term_freq, file = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/emoticon_term_freq.csv", row.names = FALSE)
#emoticon_term_freq2 = write.csv(file = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/emoticon_term_freq.csv") # NOT THE SAME!!!
#write_feather(emoticon_term_freq, path = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/emoticon_term_freq.feather")
#emoticon_term_freq = read_feather(path = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/emoticon_term_freq.feather")
#print(paste("Finished making term-frequency data frame from tweet_df with", ncol(emoticon_term_freq)-6, "NDSI words..."))
# Train the random forest classifier and persist it. <- This takes some time...
a <- Sys.time()
print("Building random forest classifier...")
rf_model <- make_rf_classifier(emoticon_term_freq, ndsi_lexicon_df, ntrain = 28000)
# Training-time notes (target was ntrain = 40000 * 0.7 = 28000):
#   ntrain =  5000 -> ~5 min
#   ntrain = 10000 -> ~13 min
#   ntrain = 20000 -> ~35 min
#   ntrain = 28000 -> ~1.17 h
save(rf_model, file = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/rf_model.RData")
# load(file = "~/Desktop/Huang Research/LAR_Data/feather_data/2016-08/rf_model.RData")
Sys.time() - a
beepr::beep(3)
# Inspect accuracy on the train/test splits and the Sentiment140 benchmark.
rf_model$train_accuracy
rf_model$test_accuracy
rf_model$sent140_accuracy
################################
#### Apply rf_model to all_august_tweets
################################
# Get all_august_tweets from above
# then classify them!
# BUG FIX: the original call passed `max_imbalance_lexicon` and
# `final_model_emoji`, neither of which is defined anywhere in this script
# (it would error with "object not found"). Per the section header, this is
# meant to apply the rf_model / ndsi_lexicon_df built above.
all_august_tweets$pred_polarity <- classify.polarity.machine(tweet_df = all_august_tweets,
                                                             chunk.size = 10000,
                                                             ndsi_lexicon = ndsi_lexicon_df,
                                                             model = rf_model)
#cutoff = 0.5
#all_august_tweets$pred_polarity = as.numeric(all_august_tweets$pred_polarity>cutoff)
write.csv(x = all_august_tweets, file = "~/Desktop/Huang Research/LAR_Data/all_august_pred.csv", row.names = FALSE)
write_feather(all_august_tweets, path = "~/Desktop/Huang Research/LAR_Data/all_august_pred.feather")
beepr::beep(3)
# BUG FIX: read.csv() does not accept `row.names = FALSE` (that is a
# write.csv() argument) and errors on it. The CSV read is superseded by the
# feather read on the next line anyway.
all_august_pred_import <- read.csv(file = "~/Desktop/Huang Research/LAR_Data/all_august_pred.csv")
all_august_pred_import <- read_feather(path = "~/Desktop/Huang Research/LAR_Data/all_august_pred.feather")
# Write all_august_pred to 2 shortened files
library(feather)
keep_cols <- c("screen_name", "id_str", "lat", "lon", "afinn_score", "pred_polarity")
august_pred_import_1 <- all_august_pred_import[1:2000000, keep_cols]
august_pred_import_2 <- all_august_pred_import[2000001:nrow(all_august_pred_import), keep_cols]
write.csv(x = august_pred_import_1, file = "~/Desktop/Huang Research/LAR_Data/august_pred_import_1.csv", row.names = FALSE)
# BUG FIX: the original wrote august_pred_import_1 to BOTH csv files, so
# august_pred_import_2.csv never contained the second half of the data.
write.csv(x = august_pred_import_2, file = "~/Desktop/Huang Research/LAR_Data/august_pred_import_2.csv", row.names = FALSE)
#a = Sys.time()
#tweet_df = make_tweet_df(1:4)
#model_result = build_classifier_from_tweets(tweet_df, ntrain = 100000, ntest = 1000)
#Sys.time()-a
#model_result
#optimize_rf_cutoff(score.vec = model_result$sent140_phat[,2],
# polarity.vec = as.numeric(model_result$sent140_actual)-1,
# min = 0.25,
# max = 0.7,
# step = .01)
#Check high and low afinn scores:
# Spot-check tweets the model scored as strongly negative (pred_polarity < 0.2)
# and tweets with very negative AFINN scores, to eyeball agreement.
all_august_pred_import[all_august_pred_import$pred_polarity < 0.2, c("text", "afinn_score","pred_polarity", "polarity", "id_str")]
all_august_pred_import[all_august_pred_import$afinn_score < (-10), c("text", "afinn_score","pred_polarity", "polarity", "id_str")]
# Check rows with individual words.
# word_lookup() (from functions.R) presumably returns a logical/index vector
# of tweets containing the given word -- TODO confirm.
all_august_pred_import[word_lookup(all_august_pred_import$text,'it'), c("text", "afinn_score","pred_polarity", "polarity", "id_str")]
# Check pred_polarity for tweets with known polarity
# (rows where the emoticon-derived `polarity` label is present).
known_polarity_tweets = all_august_pred_import[!is.na(all_august_pred_import$polarity),]
# Mean predicted polarity for known-sad (0) and known-happy (1) tweets.
# The as.data.frame(...)[,1] dance extracts a plain vector from tibble-style
# single-column subsetting.
mean(as.data.frame(known_polarity_tweets[known_polarity_tweets$polarity == 0, "pred_polarity"])[,1])
mean(as.data.frame(known_polarity_tweets[known_polarity_tweets$polarity == 1, "pred_polarity"])[,1])
# WHY DO SAD TWEETS HAVE HIGHER PREDICTED POLARITY???
# Check the average pred_polarity of each word in the lexicon over ALL tweets.
# For each lexicon word, average pred_polarity across every tweet whose text
# contains that word (membership decided by word_lookup() from functions.R).
ndsi_lexicon_df$avg_polarity <- NA_real_  # numeric NA so the column starts as double
# seq_len() instead of 1:nrow(): an empty lexicon yields zero iterations
# rather than the bogus c(1, 0) that 1:0 would produce.
for (i in seq_len(nrow(ndsi_lexicon_df))) {
  lookup_word <- ndsi_lexicon_df$word[i]
  matching_rows <- word_lookup(all_august_pred_import$text, lookup_word)
  # as.data.frame(...)[,1] extracts a plain vector from tibble-style
  # single-column subsetting before taking the mean.
  ndsi_lexicon_df$avg_polarity[i] <-
    mean(as.data.frame(all_august_pred_import[matching_rows, "pred_polarity"])[,1])
  print(i)  # progress indicator -- the loop is slow over millions of tweets
}