handle missing `noun_chunks` in some language models DerwenAI#204
ceteri committed Mar 6, 2022
1 parent c9f3d24 commit 18f0f05
Showing 4 changed files with 69 additions and 18 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -4,6 +4,7 @@

2022-03-06

* handles missing `noun_chunks` in some language models (e.g., "ru")
* add *TopicRank* algorithm; kudos @tomaarsen
* improved test suite; fixed tests for newer spacy releases; kudos @tomaarsen

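For context, a minimal sketch of the behavior this entry describes, not part of the diff below: it assumes spaCy's "ru_core_news_sm" model is installed, and that this model provides named entities but no `noun_chunks` syntax iterator.

import spacy
import pytextrank  # noqa: F401 -- registers the "textrank" pipeline factory

# Russian models lack a `noun_chunks` syntax iterator; before this fix
# the pipeline raised an exception instead of returning phrases
nlp = spacy.load("ru_core_news_sm")
nlp.add_pipe("textrank")

doc = nlp("Каспаров играл против Deep Blue в 1996 году.")

# with the fix, the ranked phrases fall back to named entities alone
for phrase in doc._.phrases:
    print(phrase.text, phrase.rank)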
9 changes: 8 additions & 1 deletion pytextrank/base.py
@@ -357,7 +357,14 @@ def calc_textrank (
# agglomerate the lemmas ranked in the lemma graph into ranked
# phrases, leveraging information from earlier stages of the
# pipeline: noun chunks and named entities
nc_phrases: typing.Dict[Span, float] = self._collect_phrases(self.doc.noun_chunks, self.ranks)
nc_phrases: typing.Dict[Span, float] = {}

try:
nc_phrases = self._collect_phrases(self.doc.noun_chunks, self.ranks)
except AttributeError:
# some languages do not have `noun_chunks` support in spaCy models
pass

ent_phrases: typing.Dict[Span, float] = self._collect_phrases(self.doc.ents, self.ranks)
all_phrases: typing.Dict[Span, float] = { **nc_phrases, **ent_phrases }

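A side note on the merge in the final line above: when `noun_chunks` raises, `nc_phrases` stays empty and `all_phrases` reduces to the entity phrases alone. More generally, `{ **nc_phrases, **ent_phrases }` uses plain Python dict-unpacking semantics, so the later mapping wins whenever the two share a key. A standalone illustration, with string keys and made-up scores standing in for the `Span` keys used above:

# later-mapping-wins semantics of `{ **a, **b }`, shown in isolation
nc = { "deep blue": 0.12, "the match": 0.08 }
ents = { "deep blue": 0.15, "kasparov": 0.11 }

merged = { **nc, **ents }
assert merged["deep blue"] == 0.15  # the entity score wins on collision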
17 changes: 11 additions & 6 deletions pytextrank/topicrank.py
@@ -238,14 +238,19 @@ def _get_candidates (
returns:
list of candidate spans
"""
noun_chunks = list(self.doc.noun_chunks)
candidates = []

for chunk in noun_chunks:
for token in chunk:
if self._keep_token(token):
candidates.append(self.doc[token.i : chunk.end])
break
try:
noun_chunks = list(self.doc.noun_chunks)

for chunk in noun_chunks:
for token in chunk:
if self._keep_token(token):
candidates.append(self.doc[token.i : chunk.end])
break
except AttributeError:
# some languages do not have `noun_chunks` support in spaCy models
pass

return candidates

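The inner loop above left-trims each noun chunk: leading tokens that fail `_keep_token` are skipped, and the first kept token starts a candidate span that runs to the end of the chunk. The same logic in isolation, with a hypothetical `keep` predicate and plain lists standing in for `_keep_token` and spaCy spans:

# left-trim a chunk: skip leading tokens that fail the predicate, then
# keep the remainder as one candidate; returns None when no token
# qualifies (mirrors the `break` after the first kept token above)
def trim_chunk (tokens, keep):
    for i, token in enumerate(tokens):
        if keep(token):
            return tokens[i:]
    return None

assert trim_chunk(["the", "deep", "blue"], lambda t: t != "the") == ["deep", "blue"]
assert trim_chunk(["the", "a"], lambda t: False) is None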
60 changes: 49 additions & 11 deletions sample.py
@@ -38,19 +38,21 @@
print("{:.4f} {:5d} {}".format(phrase.rank, phrase.count, phrase.text))
ic(phrase.chunks)


# switch to a longer text document...
print("\n----\n")
print("dat/lee.txt")
print("dat/lee.txt:")

text = pathlib.Path("dat/lee.txt").read_text()
doc = nlp(text)

for phrase in doc._.phrases[:20]:
ic(phrase)


# to show use of stopwords: first we output a baseline...
print("\n----\n")
print("dat/gen.txt")
print("dat/gen.txt:")

text = pathlib.Path("dat/gen.txt").read_text()
doc = nlp(text)
@@ -61,7 +63,7 @@
# now add `"word": ["NOUN"]` to the stop words, to remove instances
# of `"word"` or `"words"` then see how the ranked phrases differ...
print("\n----\n")
print("stopwords:")
print("stop words:")

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank", config={ "stopwords": { "word": ["NOUN"] } })
@@ -71,21 +73,22 @@
for phrase in doc._.phrases[:10]:
ic(phrase)

# generate a GraphViz doc to visualize the lemma graph
print("\n----\n")
print("extractive summarization:")

# generate a GraphViz doc to visualize the lemma graph
tr = doc._.textrank
tr.write_dot(path="lemma_graph.dot")


# summarize the document based on its top 15 phrases,
# yielding its top 5 sentences...
print("\n----\n")
print("extractive summarization:")

for sent in tr.summary(limit_phrases=15, limit_sentences=5):
ic(sent)

print("\n----\n")
print("Biased TextRank:")

# compare results among the implemented textgraph algorithms
EXPECTED_PHRASES = [
"grandmaster Lee Sedol",
"Lee Sedol",
@@ -97,30 +100,65 @@
"Kasparov",
]

ic(EXPECTED_PHRASES)

# show use of Biased TextRank algorithm
# show use of TopicRank algorithm
print("\n----\n")
print("TopicRank:")

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("biasedtextrank")
nlp.add_pipe("topicrank")

text = pathlib.Path("dat/lee.txt").read_text()
doc = nlp(text)

for phrase in doc._.phrases[:len(EXPECTED_PHRASES)]:
ic(phrase)

tr = doc._.textrank


# show use of PositionRank algorithm
print("\n----\n")
print("PositionRank:")

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("positionrank")

text = pathlib.Path("dat/lee.txt").read_text()
doc = nlp(text)

for phrase in doc._.phrases[:len(EXPECTED_PHRASES)]:
ic(phrase)

tr = doc._.textrank


# show use of Biased TextRank algorithm
print("\n----\n")
print("Biased TextRank:")

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("biasedtextrank")

text = pathlib.Path("dat/lee.txt").read_text()
doc = nlp(text)

for phrase in doc._.phrases[:len(EXPECTED_PHRASES)]:
ic(phrase)

# note how the bias parameters get set here, to help emphasize
# the *focus set*
tr = doc._.textrank

phrases = tr.change_focus(
focus="It wasn't until the following year that Deep Blue topped Kasparov over the course of a six-game contest.",
bias=10.0,
default_bias=0.0,
)

print("\n----\n")
ic(EXPECTED_PHRASES)

for phrase in phrases[:len(EXPECTED_PHRASES)]:
ic(phrase.text)
ic(phrase.text in EXPECTED_PHRASES)

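As an optional follow-on to the sample script, the `lemma_graph.dot` file written by `tr.write_dot()` can be rendered to an image. A sketch assuming the `graphviz` Python package and the Graphviz system binaries are installed; it is not part of this commit:

# render the lemma graph written by `tr.write_dot(path="lemma_graph.dot")`
import graphviz

with open("lemma_graph.dot") as f:
    src = graphviz.Source(f.read())

# writes lemma_graph.png; cleanup=True removes the intermediate DOT copy
src.render("lemma_graph", format="png", cleanup=True)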