-
Notifications
You must be signed in to change notification settings - Fork 1
/
satire.py
78 lines (61 loc) · 2.24 KB
/
satire.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas as pd
import os
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from bs4 import BeautifulSoup
from sklearn.neural_network import MLPClassifier
import pickle
from nltk.tokenize import word_tokenize
import sys
# Punctuation patterns used by clean_text.
# Raw strings are used so that the regex escapes (\[, \], \|) reach the regex
# engine untouched instead of being parsed as (invalid) string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Matches every character that is not a digit, a lowercase ASCII letter,
# a space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stopwords as a set, built once at import time for fast membership tests.
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    """Normalise raw text into the one-document list passed to the classifier.

    Steps, in order: extract the text from any HTML with BeautifulSoup,
    lowercase it, delete every character matched by REPLACE_BY_SPACE_RE and
    BAD_SYMBOLS_RE, drop English stopwords, tokenise with NLTK's
    word_tokenize, drop stopwords once more, and Porter-stem the remainder.

    Args:
        text: Raw input string; may contain HTML markup.

    Returns:
        A one-element list whose only item is the stemmed tokens, each
        followed by a single space (a non-empty result therefore ends with
        a trailing space; an input with no surviving tokens yields ['']).
    """
    text = BeautifulSoup(text, "lxml").text  # HTML decoding
    text = text.lower()
    # NOTE(review): despite the pattern's name, matched characters are deleted
    # rather than replaced by a space ("a/b" becomes "ab"). Left unchanged
    # because the pickled model was presumably trained on this exact
    # preprocessing -- confirm before altering.
    text = REPLACE_BY_SPACE_RE.sub('', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)

    stemmer = PorterStemmer()
    # Filter again after tokenising: word_tokenize may split the text
    # differently from str.split, so new stopword tokens can appear here.
    kept_tokens = [tok for tok in word_tokenize(text) if tok not in STOPWORDS]
    return [''.join(stemmer.stem(tok) + ' ' for tok in kept_tokens)]
def find_satire(text):
    """Classify *text* using the pickled model stored next to this file.

    The model is read from ``classifier.pickle`` in the directory that
    contains this source file, and is reloaded on every call.

    Args:
        text: Raw text to classify; it is run through clean_text first.

    Returns:
        The result of the loaded classifier's ``predict`` for the single
        cleaned document.

    Raises:
        OSError: If ``classifier.pickle`` cannot be opened.
    """
    model_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'classifier.pickle')
    # NOTE: unpickling can execute arbitrary code; classifier.pickle must
    # come from a trusted source.
    with open(model_path, 'rb') as model_file:  # closed even if loading fails
        loaded_clf = pickle.load(model_file)
    return loaded_clf.predict(clean_text(text))
def main():
    """Classify the text passed as the first command-line argument.

    Writes the first prediction returned by find_satire to stdout, without
    a trailing newline. If no argument was supplied, exits with a usage
    message on stderr and a non-zero status instead of an IndexError
    traceback.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: python satire.py "<text to classify>"')
    predictions = find_satire(sys.argv[1])
    sys.stdout.write(str(predictions[0]))


if __name__ == "__main__":
    main()