From 6957fbcbd4faed21b167d68f6694d4ea5b58b1cf Mon Sep 17 00:00:00 2001 From: Ayub Date: Thu, 28 Sep 2023 15:43:15 +0330 Subject: [PATCH] improve join_abbreviation performance by flashtext --- hazm/word_tokenizer.py | 32 +++++++++++++++++--------------- pyproject.toml | 1 + 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/hazm/word_tokenizer.py b/hazm/word_tokenizer.py index f897cffa..2b3c3780 100644 --- a/hazm/word_tokenizer.py +++ b/hazm/word_tokenizer.py @@ -16,6 +16,7 @@ from hazm import default_verbs from hazm import default_words from hazm import words_list +from flashtext import KeywordProcessor class WordTokenizer(TokenizerI): @@ -286,18 +287,18 @@ def tokenize(self: "WordTokenizer", text: str) -> List[str]: # 📍 عرضه بلوک NUM2 درصدی TAG های وب به قیمت - if self._join_abbreviation: - replaced_abbrs = [] + if self._join_abbreviation: - rnd = "_" + rnd = 313 # random number that is less likely to appear within the text + while str(rnd) in text: rnd=rnd+1 # if rnd is found within the text, increment it by 1 until it no longer appears in the text. + rnd = str (rnd) + + keyword_processor = KeywordProcessor() - while rnd in text: rnd +="_" # if rnd exists in text, add loop until text has no rnd - - for abbr in self.abbreviations: - pattern = re.escape(abbr) - pattern = r"(? List[str]: tokens = self.join_verb_parts(tokens) if self._join_verb_parts else tokens - - if self._join_abbreviation: - for i in range(len(tokens)): - if tokens[i] == rnd: - tokens[i] = replaced_abbrs.pop(0) + + if self._join_abbreviation: + reversed_dict = {value: key for key, value in keyword_processor.get_all_keywords().items()} + for i, token in enumerate(tokens): + if token in reversed_dict: + tokens[i] = reversed_dict[token] return tokens diff --git a/pyproject.toml b/pyproject.toml index e5030f79..6c455705 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ python-crfsuite="^0.9.9" numpy = "^1.24.3" scikit-learn = "^1.2.2" fasttext-wheel = "^0.9.2" +flashtext = "^2.7" [tool.poetry.group.docs.dependencies] mkdocs="^1.4.3"