Preprocessing.py
import nltk
import string

# Boolean query operators that should pass through preprocessing unchanged.
operation = ["AND", "OR", "NOT"]


class Preprocessing:
    @staticmethod
    def Stemming(tokens):
        # Reduce each token to its Porter stem; leave query operators untouched.
        # nltk.download('punkt')
        from nltk.stem import PorterStemmer
        stemmer = PorterStemmer()
        stemToken = []
        for token in tokens:
            if token not in operation:
                stemToken.append(stemmer.stem(token))
            else:
                stemToken.append(token)
        return stemToken

    @staticmethod
    def Lemmatize(tokens):
        # Lemmatize tokens using their POS tags; leave query operators untouched.
        # nltk.download('wordnet')
        # nltk.download('averaged_perceptron_tagger')
        from nltk.stem import WordNetLemmatizer
        from nltk.tag import pos_tag
        lemmatizer = WordNetLemmatizer()  # Create the lemmatizer object
        lemToken = []
        for token, tag in pos_tag(tokens):
            if token not in operation:
                if tag.startswith("NN"):
                    lemToken.append(lemmatizer.lemmatize(token, pos="n"))
                elif tag.startswith("VB"):
                    lemToken.append(lemmatizer.lemmatize(token, pos="v"))
                elif tag.startswith("JJ"):
                    lemToken.append(lemmatizer.lemmatize(token, pos="a"))
                else:
                    lemToken.append(token)
            else:
                # Keep operators instead of silently dropping them.
                lemToken.append(token)
        return lemToken

    @staticmethod
    def StopWord(tokens):
        # Remove English stop words, but never remove query operators.
        # nltk.download('stopwords')
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words('english'))
        new_tokens = []
        for token in tokens:
            if token not in stop_words or token in operation:
                new_tokens.append(token)
        return new_tokens

    @staticmethod
    def tokenize(content=""):
        # Split text into word tokens and drop bare punctuation tokens.
        return [t for t in nltk.word_tokenize(content) if t not in string.punctuation]

    @staticmethod
    def normalization(text):
        # Lowercase the text and strip all punctuation characters.
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        return text
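

# A minimal usage sketch, not part of the original file: the sample query and
# the pipeline order (tokenize -> stop-word removal -> stemming/lemmatization)
# are assumptions. Normalization is skipped here because lowercasing would turn
# the Boolean operators into ordinary stop words. Assumes the NLTK resources
# 'punkt', 'stopwords', 'wordnet', and 'averaged_perceptron_tagger' are
# already downloaded.
if __name__ == "__main__":
    query = "information AND retrieval NOT databases"
    tokens = Preprocessing.tokenize(query)
    tokens = Preprocessing.StopWord(tokens)
    print(Preprocessing.Stemming(tokens))   # stems, operators preserved
    print(Preprocessing.Lemmatize(tokens))  # lemmas, operators preserved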