-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapp.py
116 lines (86 loc) · 3.43 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from flask import Flask, render_template
import flask
import pickle
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import pandas as pd
import numpy as np
import praw
# Flask application object; the @app.route decorators further down register
# their view functions on it, and the __main__ guard at the bottom runs it.
app = Flask(__name__)
# text pre processing
def text_preprocess(text):
    """Normalise *text* for the title-embedding lookup.

    Every character that is neither a word character nor whitespace is
    removed, the remainder is lower-cased and split on whitespace, and any
    token found in ``ENGLISH_STOP_WORDS`` is discarded.

    Returns the surviving tokens joined by single spaces (an empty string
    when nothing survives).
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    kept_tokens = []
    for token in without_punctuation.lower().split():
        if token in ENGLISH_STOP_WORDS:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)
# loading all models
# NOTE(review): unpickling can run arbitrary code, so these three files must
# only ever come from a trusted source.
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


enc = _load_pickle('one_hot.pkl')  # encoder applied to the categorical columns in predict()
rf = _load_pickle('rf.pkl')        # model whose .predict() yields the score in predict()
sid = _load_pickle('senti.pkl')    # object whose .polarity_scores() is used in get_data()
def _sentiment_label(compound):
    """Map a compound sentiment score to the label used as a model feature.

    ``compound >= 0.5`` is 'positive', ``0 <= compound < 0.5`` is 'neutral'
    and everything else is 'negative'.  The final branch is unconditional so
    that every score receives a label.
    """
    if compound >= 0.5:
        return 'positive'
    if compound >= 0:
        return 'neutral'
    return 'negative'


def get_data(url):
    """Fetch a Reddit submission and return its features as a one-row DataFrame.

    Args:
        url: URL of the Reddit submission (converted with ``str`` before use).

    Returns:
        A ``pandas.DataFrame`` with exactly one row and the columns
        ``Title``, ``Gilded``, ``Over_18``, ``Number_of_Comments`` and
        ``Predicted_value`` (the sentiment label of the title), in that order.

    Any exception raised by PRAW while resolving or fetching the submission
    propagates to the caller.
    """
    import os  # local import keeps this block independent of the file header

    # NOTE(review): the literal id/secret below are committed to source
    # control and should be rotated.  They are kept only as a fallback so
    # existing setups keep working; the environment variables (named after
    # PRAW's own configuration variables) take precedence when set.
    reddit = praw.Reddit(
        client_id=os.environ.get('praw_client_id') or 'rGGcUZbUNTiCFw',
        client_secret=(os.environ.get('praw_client_secret')
                       or 'hD5kq4AUUN4qhvLLrqO77B0FvAGseQ'),
        user_agent='Reddit WebScrapping',
    )
    sub_data = reddit.submission(url=str(url))

    # Sentiment of the title, reduced to a three-way categorical feature.
    compound = sid.polarity_scores(sub_data.title)['compound']

    # Each value is a one-element list so the DataFrame has a single row.
    data = {
        'Title': [str(sub_data.title)],
        'Gilded': [sub_data.gilded],
        'Over_18': [sub_data.over_18],
        'Number_of_Comments': [sub_data.num_comments],
        'Predicted_value': [_sentiment_label(compound)],
    }
    return pd.DataFrame(data)
@app.route('/')
def home():
    """Render the ``Index.html`` template for the site root, with no result text."""
    return flask.render_template('Index.html')
# Parsed copies of tokens.csv / embedding_matrix.csv, filled on first use so
# the files are not re-read and re-parsed on every request.
_EMBEDDING_RESOURCES = {}


def _load_embedding_resources():
    """Return ``(token_table, embedding_matrix)``, reading the CSV files once."""
    if 'loaded' not in _EMBEDDING_RESOURCES:
        token_table = pd.read_csv('tokens.csv')
        embedding_matrix = np.array(pd.read_csv('embedding_matrix.csv', sep=' '))
        # Stored with one assignment so both objects become visible together.
        _EMBEDDING_RESOURCES['loaded'] = (token_table, embedding_matrix)
    return _EMBEDDING_RESOURCES['loaded']


def _embed_title(title, maxlen=300):
    """Return a one-row DataFrame holding the mean embedding of *title*.

    *title* is expected to be already pre-processed (space-separated tokens).
    Tokens that are not a column of tokens.csv are skipped; the index list is
    padded with 0 up to *maxlen* entries before the rows are averaged.
    """
    token_table, embedding_matrix = _load_embedding_resources()

    # Collect plain integer row indices.  Appending whole DataFrame columns
    # and mixing them with integer padding produces, once flattened, a ragged
    # list of row vectors and scalars that np.array() refuses to build.
    # NOTE(review): presumably tokens.csv has a single data row mapping each
    # word (column name) to its row in the embedding matrix -- confirm.
    token_ids = []
    for word in title.split():
        if word in token_table.columns:
            token_ids.extend(int(value) for value in token_table[word])

    # padding with maxlength as 300 (a negative difference adds nothing)
    token_ids += [0] * (maxlen - len(token_ids))

    # One fancy-indexing step gives a (len(token_ids), dim) array; averaging
    # over axis 0 always yields a 1-D vector, even for an all-padding title.
    mean_vector = embedding_matrix[token_ids].mean(axis=0)
    return pd.DataFrame(mean_vector).T


@app.route('/predict', methods=['POST'])
def predict():
    """Predict the score of the Reddit post whose URL was submitted.

    Reads the ``url`` field of the POSTed form, gathers the submission's
    features with ``get_data``, turns the cleaned title into a mean embedding
    vector, one-hot encodes the categorical columns, and feeds the
    concatenated row to the random-forest model ``rf``.

    Returns:
        The rendered ``Index.html`` template with ``results`` set to a
        message containing the predicted score as an integer.
    """
    url = str(flask.request.form['url'])

    # get data from url and store it in the form of DataFrame
    data = get_data(url)

    # text pre processing, then title -> single embedding row
    title = text_preprocess(data['Title'][0])
    title_vector = _embed_title(title)

    # one hot encoding with column names
    # NOTE(review): the labels below presumably mirror the category order the
    # encoder was fitted with -- confirm against the training code.
    categories = ['Over_18', 'Predicted_value']
    col_names = [False, True, 'negative', 'neutral', 'positive']
    encoded = pd.DataFrame(enc.transform(data[categories]).todense(),
                           columns=col_names)

    # drop the columns that were replaced by the embedding / one-hot features;
    # done on a copy so the frame returned by get_data() is left untouched
    numeric = data.drop(["Title", 'Over_18', 'Predicted_value'], axis=1)
    numeric = numeric.reset_index(drop=True)

    # concatenate everything (column order matters to the model)
    X_test = pd.concat([numeric, title_vector, encoded], axis=1)

    # prediction: take the single element explicitly instead of relying on
    # implicit array-to-scalar conversion
    results = int(np.ravel(rf.predict(X_test))[0])
    return render_template(
        'Index.html',
        results='Predicted score for the post is: {}'.format(results),
    )
if __name__ == "__main__":
    # Start Flask's built-in server only when this file is run as a script.
    # NOTE(review): debug=True is meant for local development -- confirm this
    # entry point is not what serves production traffic.
    app.run(debug=True)