Skip to content

Commit

Permalink
improve join_abbreviation performance by flashtext
Browse files Browse the repository at this point in the history
  • Loading branch information
sir-kokabi committed Sep 28, 2023
1 parent ec3352c commit 6957fbc
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 15 deletions.
32 changes: 17 additions & 15 deletions hazm/word_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from hazm import default_verbs
from hazm import default_words
from hazm import words_list
from flashtext import KeywordProcessor


class WordTokenizer(TokenizerI):
Expand Down Expand Up @@ -286,18 +287,18 @@ def tokenize(self: "WordTokenizer", text: str) -> List[str]:
# 📍 عرضه بلوک NUM2 درصدی TAG های وب به قیمت


if self._join_abbreviation:
replaced_abbrs = []
if self._join_abbreviation:

rnd = "_"
rnd = 313 # arbitrary starting number that is unlikely to appear within the text
while str(rnd) in text: rnd=rnd+1 # if rnd is found within the text, increment it by 1 until it no longer appears in the text.
rnd = str (rnd)

keyword_processor = KeywordProcessor()

while rnd in text: rnd +="_" # if rnd occurs in text, keep appending "_" until it no longer does

for abbr in self.abbreviations:
pattern = re.escape(abbr)
pattern = r"(?<!\w)" + pattern + r"(?!\w)"
text = re.sub(pattern, rnd, text)
replaced_abbrs.append(abbr)
for (i, abbr) in enumerate(self.abbreviations):
keyword_processor.add_keyword(abbr, rnd+str(i))

text = keyword_processor.replace_keywords(text)

if self.separate_emoji:
text = self.emoji_pattern.sub(self.emoji_repl, text)
Expand All @@ -319,11 +320,12 @@ def tokenize(self: "WordTokenizer", text: str) -> List[str]:


tokens = self.join_verb_parts(tokens) if self._join_verb_parts else tokens

if self._join_abbreviation:
for i in range(len(tokens)):
if tokens[i] == rnd:
tokens[i] = replaced_abbrs.pop(0)

if self._join_abbreviation:
reversed_dict = {value: key for key, value in keyword_processor.get_all_keywords().items()}
for i, token in enumerate(tokens):
if token in reversed_dict:
tokens[i] = reversed_dict[token]

return tokens

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ python-crfsuite="^0.9.9"
numpy = "^1.24.3"
scikit-learn = "^1.2.2"
fasttext-wheel = "^0.9.2"
flashtext = "^2.7"

[tool.poetry.group.docs.dependencies]
mkdocs="^1.4.3"
Expand Down

0 comments on commit 6957fbc

Please sign in to comment.