|
1 | 1 | import re
|
2 |
| -import string |
| 2 | +from nltk.util import ngrams |
3 | 3 |
|
4 | 4 | from constants import STOP_WORDS
|
5 | 5 |
|
6 | 6 |
|
7 | 7 | def _text_cleaner(text):
|
8 | 8 | # function that lower letters, clean punctuation and
|
9 | 9 | # strings that start with numbers, return a cleaned text
|
10 |
| - return re.sub(r"\s*\b(\d+\w*)", "", "".join(text).lower()).translate( |
11 |
| - str.maketrans("", "", string.punctuation) |
12 |
| - ) |
| 10 | + return re.findall(r'\w+', re.sub(r"\s*\b(\d+\w*)", "", "".join(text).lower())) |
13 | 11 |
|
14 | 12 |
|
15 |
def clean_text(text, ngrams_number: int = None):
    """Clean an iterable of phrases and collect its unique terms.

    Args:
        text: iterable of phrase strings.
        ngrams_number: when ``None`` (default), collect unique single
            words with ``STOP_WORDS`` removed; otherwise collect unique
            n-grams of this size from each phrase.

    Returns:
        Tuple ``(phrases_list, sorted_terms)``: the cleaned phrases
        (one space-joined string per input phrase) and the sorted
        unique words / n-grams.
    """
    if ngrams_number is None:
        # Single-word mode: unique cleaned words minus stop words.
        unique_word_list = set(_text_cleaner(text))
        filtered_unique_words_list = {
            i for i in unique_word_list if i not in STOP_WORDS
        }
    else:
        # BUG FIX: was `filtered_words_list = set` — the class object,
        # not an instance. It only worked by accident (unbound
        # `set.union`) and raised TypeError when `text` was empty.
        filtered_words_list = set()

        for word in text:
            # BUG FIX: the n-gram size was hard-coded to 2, silently
            # ignoring the `ngrams_number` argument; honour it here.
            cleaned_word = set(ngrams(_text_cleaner(word), ngrams_number))
            filtered_words_list = filtered_words_list.union(cleaned_word)

        filtered_unique_words_list = [' '.join(i) for i in filtered_words_list]

    phrases_list = [' '.join(_text_cleaner(i)) for i in text]

    return phrases_list, sorted(filtered_unique_words_list)
|
0 commit comments