Skip to content

Commit 00b5a37

Browse files
committed
Add ngram support to the api
1 parent b8be963 commit 00b5a37

File tree

3 files changed

+22
-12
lines changed

3 files changed

+22
-12
lines changed

__pycache__/utils.cpython-37.pyc

1.15 KB
Binary file not shown.

app.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,11 @@
1212
class WordListView(MethodView):
1313
def post(self):
1414
text_data_dict = json.loads(request.data)
15-
phrases_list, unique_word_list = clean_text(text_data_dict["texts"])
15+
ngrams_args = request.args.get('ngrams', None)
16+
phrases_list, unique_word_list = clean_text(text_data_dict["texts"], ngrams_args)
1617
word_counter = {}
17-
18+
import pdb
19+
pdb.set_trace()
1820
for index, phrase in enumerate(phrases_list):
1921
word_counter[index] = [phrase.count(word) for word in unique_word_list]
2022

utils.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,30 @@
11
import re
2-
import string
2+
from nltk.util import ngrams
33

44
from constants import STOP_WORDS
55

66

77
def _text_cleaner(text):
88
# function that lowercases letters, removes punctuation and
99
# strings that start with numbers, and returns the cleaned text
10-
return re.sub(r"\s*\b(\d+\w*)", "", "".join(text).lower()).translate(
11-
str.maketrans("", "", string.punctuation)
12-
)
10+
return re.findall(r'\w+', re.sub(r"\s*\b(\d+\w*)", "", "".join(text).lower()))
1311

1412

15-
def clean_text(text):
16-
unique_word_list = set(_text_cleaner(text).split())
17-
filtered_unique_words_list = set(
18-
[i for i in unique_word_list if i not in STOP_WORDS]
19-
)
20-
phrases_list = [_text_cleaner(i).split() for i in text]
13+
def clean_text(text, ngrams_number: int = None):
14+
if ngrams_number is None:
15+
unique_word_list = set(_text_cleaner(text))
16+
filtered_unique_words_list = set(
17+
[i for i in unique_word_list if i not in STOP_WORDS]
18+
)
19+
else:
20+
filtered_words_list = set
21+
22+
for word in text:
23+
cleaned_word = set(ngrams(_text_cleaner(word),2))
24+
filtered_words_list = filtered_words_list.union(cleaned_word)
25+
26+
filtered_unique_words_list = [' '.join(i) for i in filtered_words_list]
27+
28+
phrases_list = [' '.join(_text_cleaner(i)) for i in text]
2129

2230
return phrases_list, sorted(filtered_unique_words_list)

0 commit comments

Comments
 (0)