handle missing `noun_chunks` in some language models DerwenAI#204
ceteri committed Mar 6, 2022
1 parent c9f3d24 commit 18f0f05
Showing 4 changed files with 69 additions and 18 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -4,6 +4,7 @@

2022-03-06

* handles missing `noun_chunks` in some language models (e.g., "ru")
* add *TopicRank* algorithm; kudos @tomaarsen
* improved test suite; fixed tests for newer spacy releases; kudos @tomaarsen

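For context, a minimal sketch of the behavior this entry describes, not part of the diff below: it assumes spaCy's "ru_core_news_sm" model is installed, and that this model provides named entities but no `noun_chunks` syntax iterator.

import spacy
import pytextrank  # noqa: F401 -- registers the "textrank" pipeline factory

# Russian models lack a `noun_chunks` syntax iterator; before this fix
# the pipeline raised an exception instead of returning phrases
nlp = spacy.load("ru_core_news_sm")
nlp.add_pipe("textrank")

doc = nlp("Каспаров играл против Deep Blue в 1996 году.")

# with the fix, the ranked phrases fall back to named entities alone
for phrase in doc._.phrases:
    print(phrase.text, phrase.rank)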
9 changes: 8 additions & 1 deletion pytextrank/base.py
@@ -357,7 +357,14 @@ def calc_textrank (
# agglomerate the lemmas ranked in the lemma graph into ranked
# phrases, leveraging information from earlier stages of the
# pipeline: noun chunks and named entities
nc_phrases: typing.Dict[Span, float] = self._collect_phrases(self.doc.noun_chunks, self.ranks)
nc_phrases: typing.Dict[Span, float] = {}

try:
nc_phrases = self._collect_phrases(self.doc.noun_chunks, self.ranks)
except AttributeError:
# some languages do not have `noun_chunks` support in spaCy models
pass

ent_phrases: typing.Dict[Span, float] = self._collect_phrases(self.doc.ents, self.ranks)
all_phrases: typing.Dict[Span, float] = { **nc_phrases, **ent_phrases }

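A side note on the merge in the final line above: when `noun_chunks` raises, `nc_phrases` stays empty and `all_phrases` reduces to the entity phrases alone. More generally, `{ **nc_phrases, **ent_phrases }` uses plain Python dict-unpacking semantics, so the later mapping wins whenever the two share a key. A standalone illustration, with string keys and made-up scores standing in for the `Span` keys used above:

# later-mapping-wins semantics of `{ **a, **b }`, shown in isolation
nc = { "deep blue": 0.12, "the match": 0.08 }
ents = { "deep blue": 0.15, "kasparov": 0.11 }

merged = { **nc, **ents }
assert merged["deep blue"] == 0.15  # the entity score wins on collision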
17 changes: 11 additions & 6 deletions pytextrank/topicrank.py
@@ -238,14 +238,19 @@ def _get_candidates (
returns:
list of candidate spans
"""
noun_chunks = list(self.doc.noun_chunks)
candidates = []

for chunk in noun_chunks:
for token in chunk:
if self._keep_token(token):
candidates.append(self.doc[token.i : chunk.end])
break
try:
noun_chunks = list(self.doc.noun_chunks)

for chunk in noun_chunks:
for token in chunk:
if self._keep_token(token):
candidates.append(self.doc[token.i : chunk.end])
break
except AttributeError:
# some languages do not have `noun_chunks` support in spaCy models
pass

return candidates

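The inner loop above left-trims each noun chunk: leading tokens that fail `_keep_token` are skipped, and the first kept token starts a candidate span that runs to the end of the chunk. The same logic in isolation, with a hypothetical `keep` predicate and plain lists standing in for `_keep_token` and spaCy spans:

# left-trim a chunk: skip leading tokens that fail the predicate, then
# keep the remainder as one candidate; returns None when no token
# qualifies (mirrors the `break` after the first kept token above)
def trim_chunk (tokens, keep):
    for i, token in enumerate(tokens):
        if keep(token):
            return tokens[i:]
    return None

assert trim_chunk(["the", "deep", "blue"], lambda t: t != "the") == ["deep", "blue"]
assert trim_chunk(["the", "a"], lambda t: False) is None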
60 changes: 49 additions & 11 deletions sample.py
@@ -38,19 +38,21 @@
print("{:.4f} {:5d} {}".format(phrase.rank, phrase.count, phrase.text))
ic(phrase.chunks)


# switch to a longer text document...
print("\n----\n")
print("dat/lee.txt")
print("dat/lee.txt:")

text = pathlib.Path("dat/lee.txt").read_text()
doc = nlp(text)

for phrase in doc._.phrases[:20]:
ic(phrase)


# to show use of stopwords: first we output a baseline...
print("\n----\n")
print("dat/gen.txt")
print("dat/gen.txt:")

text = pathlib.Path("dat/gen.txt").read_text()
doc = nlp(text)
@@ -61,7 +63,7 @@
# now add `"word": ["NOUN"]` to the stop words, to remove instances
# of `"word"` or `"words"` then see how the ranked phrases differ...
print("\n----\n")
print("stopwords:")
print("stop words:")

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank", config={ "stopwords": { "word": ["NOUN"] } })
@@ -71,21 +73,22 @@
for phrase in doc._.phrases[:10]:
ic(phrase)

# generate a GraphViz doc to visualize the lemma graph
print("\n----\n")
print("extractive summarization:")

# generate a GraphViz doc to visualize the lemma graph
tr = doc._.textrank
tr.write_dot(path="lemma_graph.dot")


# summarize the document based on its top 15 phrases,
# yielding its top 5 sentences...
print("\n----\n")
print("extractive summarization:")

for sent in tr.summary(limit_phrases=15, limit_sentences=5):
ic(sent)

print("\n----\n")
print("Biased TextRank:")

# compare results among the implemented textgraph algorithms
EXPECTED_PHRASES = [
"grandmaster Lee Sedol",
"Lee Sedol",
@@ -97,30 +100,65 @@
"Kasparov",
]

ic(EXPECTED_PHRASES)

# show use of Biased TextRank algorithm
# show use of TopicRank algorithm
print("\n----\n")
print("TopicRank:")

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("biasedtextrank")
nlp.add_pipe("topicrank")

text = pathlib.Path("dat/lee.txt").read_text()
doc = nlp(text)

for phrase in doc._.phrases[:len(EXPECTED_PHRASES)]:
ic(phrase)

tr = doc._.textrank


# show use of PositionRank algorithm
print("\n----\n")
print("PositionRank:")

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("positionrank")

text = pathlib.Path("dat/lee.txt").read_text()
doc = nlp(text)

for phrase in doc._.phrases[:len(EXPECTED_PHRASES)]:
ic(phrase)

tr = doc._.textrank


# show use of Biased TextRank algorithm
print("\n----\n")
print("Biased TextRank:")

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("biasedtextrank")

text = pathlib.Path("dat/lee.txt").read_text()
doc = nlp(text)

for phrase in doc._.phrases[:len(EXPECTED_PHRASES)]:
ic(phrase)

# note how the bias parameters get set here, to help emphasize
# the *focus set*
tr = doc._.textrank

phrases = tr.change_focus(
focus="It wasn't until the following year that Deep Blue topped Kasparov over the course of a six-game contest.",
bias=10.0,
default_bias=0.0,
)

print("\n----\n")
ic(EXPECTED_PHRASES)

for phrase in phrases[:len(EXPECTED_PHRASES)]:
ic(phrase.text)
ic(phrase.text in EXPECTED_PHRASES)

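As an optional follow-on to the sample script, the `lemma_graph.dot` file written by `tr.write_dot()` can be rendered to an image. A sketch assuming the `graphviz` Python package and the Graphviz system binaries are installed; it is not part of this commit:

# render the lemma graph written by `tr.write_dot(path="lemma_graph.dot")`
import graphviz

with open("lemma_graph.dot") as f:
    src = graphviz.Source(f.read())

# writes lemma_graph.png; cleanup=True removes the intermediate DOT copy
src.render("lemma_graph", format="png", cleanup=True)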