|
1 | 1 | import re
|
2 |
| -import string |
| 2 | +from nltk.util import ngrams |
3 | 3 |
|
4 | 4 | from constants import STOP_WORDS
|
5 | 5 |
|
6 | 6 |
|
7 | 7 | def _text_cleaner(text):
|
8 | 8 | # function that lower letters, clean punctuation and
|
9 | 9 | # strings that start with numbers, return a cleaned text
|
10 |
| - return re.sub(r"\s*\b(\d+\w*)", "", "".join(text).lower()).translate( |
11 |
| - str.maketrans("", "", string.punctuation) |
12 |
| - ) |
| 10 | + return re.findall(r'\w+', re.sub(r"\s*\b(\d+\w*)", "", "".join(text).lower())) |
13 | 11 |
|
14 | 12 |
|
15 |
def clean_text(text, ngrams_number: int = None):
    """Clean an iterable of phrases and collect its unique terms.

    Args:
        text: iterable of phrase strings.
        ngrams_number: when ``None`` (default), collect unique single
            words with ``STOP_WORDS`` removed; otherwise collect unique
            n-grams of this size from each phrase.

    Returns:
        Tuple ``(phrases_list, sorted_terms)``: the cleaned phrases
        (one space-joined string per input phrase) and the sorted
        unique words / n-grams.
    """
    if ngrams_number is None:
        # Single-word mode: unique cleaned words minus stop words.
        unique_word_list = set(_text_cleaner(text))
        filtered_unique_words_list = {
            i for i in unique_word_list if i not in STOP_WORDS
        }
    else:
        # BUG FIX: was `filtered_words_list = set` — the class object,
        # not an instance. It only worked by accident (unbound
        # `set.union`) and raised TypeError when `text` was empty.
        filtered_words_list = set()

        for word in text:
            # BUG FIX: the n-gram size was hard-coded to 2, silently
            # ignoring the `ngrams_number` argument; honour it here.
            cleaned_word = set(ngrams(_text_cleaner(word), ngrams_number))
            filtered_words_list = filtered_words_list.union(cleaned_word)

        filtered_unique_words_list = [' '.join(i) for i in filtered_words_list]

    phrases_list = [' '.join(_text_cleaner(i)) for i in text]

    return phrases_list, sorted(filtered_unique_words_list)
|
0 commit comments