This repository was archived by the owner on Dec 17, 2024. It is now read-only.
File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 11# Python interface to TEM
22
33- install ` requirements.txt `
4- - ` python3 -m nltk.download punkt stopwords `
4+ - ` python3 -m nltk.downloader punkt `
5+ - for English: ` python3 -m spacy download en_core_web_md `
Original file line number Diff line number Diff line change 11import re
22
3- from nltk .corpus import wordnet
4- from nltk .tokenize import word_tokenize , sent_tokenize
5- from nltk import WordNetLemmatizer , pos_tag
3+ from nltk .tokenize import sent_tokenize
4+ import spacy
65
76# ------------------------------------------------------------------------------
87# MARK: normalization
@@ -28,26 +27,9 @@ def normalize_quotes(doc: str) -> str:
2827def normal_str (doc : str ) -> str :
2928 return lower_alnum (normalize_quotes (doc ))
3029
31- __lemma = WordNetLemmatizer ()
32-
30+ __lemma = spacy .load ('en_core_web_md' )
3331def normal_tokens (doc : str ) -> list [str ]:
34- def __get_wordnet_pos (treebank_tag ):
35- if treebank_tag .startswith ('J' ):
36- return wordnet .ADJ
37- elif treebank_tag .startswith ('V' ):
38- return wordnet .VERB
39- elif treebank_tag .startswith ('N' ):
40- return wordnet .NOUN
41- elif treebank_tag .startswith ('R' ):
42- return wordnet .ADV
43- else :
44- return wordnet .NOUN
45-
46- token_pos = pos_tag (word_tokenize (normal_str (doc )))
47- return [
48- __lemma .lemmatize (token , __get_wordnet_pos (pos ))
49- for token , pos in token_pos
50- ]
32+ return [token .lemma_ for token in __lemma (normal_str (doc ))]
5133
5234# ------------------------------------------------------------------------------
5335# MARK: TEM prep
Original file line number Diff line number Diff line change @@ -3,3 +3,6 @@ nltk>=3.6.7
33numpy >= 1.23.5
44pexpect >= 4.8.0
55pyyaml >= 6.0
6+ spacy >= 3.7.4
7+ spacy-legacy >= 3.0.12
8+ spacy-loggers >= 1.0.5
You can’t perform that action at this time.
0 commit comments