Adding Pyate phrases, minor tweaks to js regex, stoplist

JasonKessler · Oct 8, 2020 · 1002070 · 1002070
1 parent c26e2ad
commit 1002070
Show file tree

Hide file tree

Showing 8 changed files with 24 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![Gitter Chat](https://img.shields.io/badge/GITTER-join%20chat-green.svg)](https://gitter.im/scattertext/Lobby)
 [![Twitter Follow](https://img.shields.io/twitter/follow/espadrine.svg?style=social&label=Follow)](https://twitter.com/jasonkessler)
 
-# Scattertext 0.0.2.68
+# Scattertext 0.0.2.69
 
 A tool for finding distinguishing terms in corpora, and presenting them in an 
 interactive, HTML scatter plot. Points corresponding to terms are selectively labeled

diff --git a/scattertext/Common.py b/scattertext/Common.py
@@ -50,7 +50,7 @@
     'you', "dasn't", 'all', 'ought', 'noun', 'was', 'who', 'let', 'didn', "y'all'd've", 'must', "to've", "'re", "'m",
     'gon', 'do', 'isn', 'may', 'weren', "had've", 'they', "'ve", "may've", 'finna', 'which', "I'm'a", "n't", 'should',
     "e'er", 'when', 'ta', 'nal', 'haven', "y'all'd'n've", "those're", 'don', 'wasn', 'everybody', 'wouldn', "ne'er",
-    'something', 'that', 'everyone', 'methinks', '-'
+    'something', 'that', 'everyone', 'methinks', '-', '-pron-'
 }
 
 # Qualitative Colors From Tableau

diff --git a/scattertext/__init__.py b/scattertext/__init__.py
@@ -1,6 +1,8 @@
 from __future__ import print_function
 
-version = [0, 0, 2, 68]
+from scattertext.features.PyatePhrases import PyatePhrases
+
+version = [0, 0, 2, 69]
 __version__ = '.'.join([str(e) for e in version])
 import re
 import numpy as np

diff --git a/scattertext/data/viz/scripts/main.js b/scattertext/data/viz/scripts/main.js
@@ -1195,7 +1195,7 @@ buildViz = function (d3) {
 
                 // https://stackoverflow.com/questions/3446170/escape-string-for-use-in-javascript-regex
                 function escapeRegExp(string) {
-                    return string.replace(/[#.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string
+                    return string.replace(/[#.*+?^${}()|[\]\\]'\%/g, '\\$&'); // $& means the whole matched string
                 }
 
                 /*
@@ -1208,11 +1208,11 @@ buildViz = function (d3) {
                 termToRegex = escapeRegExp(termToRegex);
                 console.log("termToRegex")
                 console.log(termToRegex)
+
                 var regexp = new RegExp(boundary + '('
                     + removeUnderScoreJoin(
                         termToRegex.replace(' ', wordSep, 'gim')
-                    )
-                    + ')' + boundary, 'gim');
+                    ) + ')' + boundary, 'gim');
                 console.log(regexp);
                 try {
                     regexp.exec('X');

diff --git a/scattertext/features/FeatsFromSpacyDoc.py b/scattertext/features/FeatsFromSpacyDoc.py
@@ -68,7 +68,7 @@ def _get_unigram_feats(self, sent):
 				elif tok.tag_ in self._tag_types_to_censor:
 					unigrams.append(tok.tag_)
 				elif self._use_lemmas and tok.lemma_.strip():
-					unigrams.append(self._post_process_term(tok.lemma_.strip()))
+					unigrams.append(self._post_process_term(tok.lemma_.strip().lower()))
 				elif tok.lower_.strip():
 					unigrams.append(self._post_process_term(tok.lower_.strip()))
 		return unigrams

diff --git a/scattertext/features/PyatePhrases.py b/scattertext/features/PyatePhrases.py
@@ -0,0 +1,13 @@
+from collections import Counter
+from scattertext.features.FeatsFromSpacyDoc import FeatsFromSpacyDoc
+
+
+class PyatePhrases(FeatsFromSpacyDoc):
+    def __init__(self, extractor=None, **args):
+        import pyate
+        self._extractor = pyate.combo_basic if extractor is None else extractor
+        FeatsFromSpacyDoc.__init__(self, **args)
+
+    def get_feats(self, doc):
+        return Counter(self._extractor(str(doc)).to_dict())
+
diff --git a/scattertext/semioticsquare/SemioticSquareFromAxes.py b/scattertext/semioticsquare/SemioticSquareFromAxes.py
@@ -74,5 +74,6 @@ def get_lexicons(self, num_terms=10):
         ]:
             #scores = np.linalg.norm(np.array([self.axes['x'] - x_coord, self.axes['y'] - y_coord]), 2, axis=0)
             scores = self._distance_measure.distances(x_coord, y_coord, self.axes['x'], self.axes['y'])
+
             lexicons[label] = list(self.axes.index[np.argsort(scores)])[:num_terms]
         return lexicons
diff --git a/setup.py b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
 setup(name='scattertext',
-      version='0.0.2.68',
+      version='0.0.2.69',
       description='An NLP package to visualize interesting terms in text.',
       url='https://github.com/JasonKessler/scattertext',
       author='Jason Kessler',