Skip to content
This repository was archived by the owner on Dec 17, 2024. It is now read-only.

Commit 2bafac0

Browse files
committed
feat(#87): switch to spacy for lemmatization
1 parent e2da0ae commit 2bafac0

3 files changed

Lines changed: 9 additions & 23 deletions

File tree

script/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Python interface to TEM
22

33
- install `requirements.txt`
4-
- `python3 -m nltk.download punkt stopwords`
4+
- `python3 -m nltk.downloader punkt`
5+
- for English: `python3 -m spacy download en_core_web_md`

script/nlp.py

Lines changed: 4 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
import re
22

3-
from nltk.corpus import wordnet
4-
from nltk.tokenize import word_tokenize, sent_tokenize
5-
from nltk import WordNetLemmatizer, pos_tag
3+
from nltk.tokenize import sent_tokenize
4+
import spacy
65

76
# ------------------------------------------------------------------------------
87
# MARK: normalization
@@ -28,26 +27,9 @@ def normalize_quotes(doc: str) -> str:
2827
def normal_str(doc: str) -> str:
2928
return lower_alnum(normalize_quotes(doc))
3029

31-
__lemma = WordNetLemmatizer()
32-
30+
__lemma = spacy.load('en_core_web_md')
3331
def normal_tokens(doc: str) -> list[str]:
34-
def __get_wordnet_pos(treebank_tag):
35-
if treebank_tag.startswith('J'):
36-
return wordnet.ADJ
37-
elif treebank_tag.startswith('V'):
38-
return wordnet.VERB
39-
elif treebank_tag.startswith('N'):
40-
return wordnet.NOUN
41-
elif treebank_tag.startswith('R'):
42-
return wordnet.ADV
43-
else:
44-
return wordnet.NOUN
45-
46-
token_pos = pos_tag(word_tokenize(normal_str(doc)))
47-
return [
48-
__lemma.lemmatize(token, __get_wordnet_pos(pos))
49-
for token, pos in token_pos
50-
]
32+
return [token.lemma_ for token in __lemma(normal_str(doc))]
5133

5234
# ------------------------------------------------------------------------------
5335
# MARK: TEM prep

script/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,6 @@ nltk>=3.6.7
33
numpy>=1.23.5
44
pexpect>=4.8.0
55
pyyaml>=6.0
6+
spacy>=3.7.4
7+
spacy-legacy>=3.0.12
8+
spacy-loggers>=1.0.5

0 commit comments

Comments
 (0)