@@ -14,14 +14,9 @@ class TextParser():
14
14
@classmethod
def get_distribuition_of(cls, text: str) -> dict:
    """Return the frequency of each stemmed token found in *text*.

    The text is tokenized by ``TextParser.pre_proccess`` (which lower-cases
    the words and drops those in ``TextParser.COMPLETE_FILTER``), every
    token is stemmed with ``TextParser.PORTUGUESE_STEMMER`` and the stems
    are counted with ``FreqDist``.

    Args:
        text: Raw text to analyse.

    Returns:
        A plain ``dict`` mapping each stem to its number of occurrences.
    """
    stem = TextParser.PORTUGUESE_STEMMER.stem
    # Feed the stems lazily into FreqDist: no intermediate list has to be
    # built (or manually released) just to be counted once.
    # NOTE(review): assumes FreqDist accepts any iterable, as nltk's
    # Counter-based FreqDist does -- confirm against the file's import.
    token_frequency = FreqDist(
        stem(word) for word in TextParser.pre_proccess(text)
    )
    return dict(token_frequency)
26
21
27
22
@classmethod
@@ -32,7 +27,11 @@ def pre_proccess(cls, text:str) -> list:
32
27
33
28
text = TextParser .split_on_upper (text )
34
29
text = text .rstrip ('\n ' ).strip ()
35
- return [word .lower () for word in word_tokenize (text ) if len (word ) <= TextParser .MAX_TAM_WORD ]
30
+ for word in word_tokenize (text ):
31
+ if len (word ) <= TextParser .MAX_TAM_WORD :
32
+ word_lower = word .lower ()
33
+ if word_lower not in TextParser .COMPLETE_FILTER :
34
+ yield word_lower
36
35
37
36
@classmethod
38
37
def stem (cls , text_list :list ) -> list :
@@ -49,11 +48,6 @@ def split_on_upper(cls, text:str) -> str:
49
48
50
49
return text .replace (" " , " " ).lstrip ()
51
50
52
- @classmethod
53
- def is_portuguese (cls , text :str ) -> bool :
54
- #return TextParser.language_of(text[:50]) in TextParser.ACCEPTED_LANGUAGES_DETECTED
55
- return True
56
-
57
51
@classmethod
def language_of(cls, text: str) -> str:
    """Return the language guessed for *text*.

    Delegates to ``TextParser.TEXT_CLASSIFIER.guess_language`` and returns
    its result unchanged.
    """
    classifier = TextParser.TEXT_CLASSIFIER
    guessed_language = classifier.guess_language(text)
    return guessed_language
0 commit comments