Intermediate checkin... lots of work done.

JasonKessler · Jan 19, 2018 · 0ed3e9e · 0ed3e9e
1 parent 7a129c3
commit 0ed3e9e
Show file tree

Hide file tree

Showing 26 changed files with 923 additions and 153 deletions.
diff --git a/README.md b/README.md
@@ -3,31 +3,11 @@
 [![Gitter Chat](https://img.shields.io/badge/GITTER-join%20chat-green.svg)](https://gitter.im/scattertext/Lobby)
 [![Twitter Follow](https://img.shields.io/twitter/follow/espadrine.svg?style=social&label=Follow)](https://twitter.com/jasonkessler)
 
-# Scattertext 0.0.2.16.1
+# Scattertext 0.0.2.17
 ### Updates
+Incorporated the [General Inquirer](http://www.wjh.harvard.edu/~inquirer/homecat.htm) 
+lexicon.
 
-Added a very simple semiotic square creator.
-
-The idea to build a semiotic square that contrasts two categories in a Term Document Matrix
-while using other categories as neutral categories.   
-
-See [Creating semiotic squares](#creating-semiotic-squares) for an overview on how to 
-use this functionality and semiotic squares.
-
-Added a parameter to disable the display of the top-terms sidebar, e.g.,
-`produce_scattertext_explorer(..., show_top_terms=False, ...)`.
-
-An interface to part of the subjectivity/sentiment dataset from 
-Bo Pang and Lillian Lee. ``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization 
-Based on Minimum Cuts''. ACL. 2004. See `SampleCorpora.RottenTomatoes`.
-
-Fixed bug that caused tooltip placement to be off after scrolling.   
-
-Made `category_name` and `not_category_name` optional in `produce_scattertext_explorer` etc.
-
-Created the ability to customize tooltips via the `get_tooltip_content` argument to
- `produce_scattertext_explorer` etc., control axes labels via `x_axis_values` 
- and `y_axis_values`, an d
 
 **Table of Contents**
 
@@ -811,6 +791,32 @@ $ python2.7 src/main.py <script file name> --enable-volume-trees \
 
 ## What's new
 
+### 0.0.2.15-16
+Added a very simple semiotic square creator.
+
+The idea is to build a semiotic square that contrasts two categories in a Term Document Matrix
+while using other categories as neutral categories.   
+
+See [Creating semiotic squares](#creating-semiotic-squares) for an overview on how to 
+use this functionality and semiotic squares.
+
+Added a parameter to disable the display of the top-terms sidebar, e.g.,
+`produce_scattertext_explorer(..., show_top_terms=False, ...)`.
+
+An interface to part of the subjectivity/sentiment dataset from 
+Bo Pang and Lillian Lee. ``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization 
+Based on Minimum Cuts''. ACL. 2004. See `SampleCorpora.RottenTomatoes`.
+
+Fixed bug that caused tooltip placement to be off after scrolling.   
+
+Made `category_name` and `not_category_name` optional in `produce_scattertext_explorer` etc.
+
+Created the ability to customize tooltips via the `get_tooltip_content` argument to
+ `produce_scattertext_explorer` etc., control axes labels via `x_axis_values` 
+ and `y_axis_values`.  The `color_func` parameter is a JavaScript function that controls the color of a point.  The function takes one parameter,
+ a dictionary entry produced by `ScatterChartExplorer.to_dict`, and returns a string.
+
+
 ### 0.0.2.14
 Integration with Scikit-Learn's text-analysis pipeline led to the creation of the
 `CorpusFromScikit` and `TermDocMatrixFromScikit` classes.

diff --git a/demo_empath.py b/demo_empath.py
@@ -1,14 +1,11 @@
 from __future__ import print_function
 
-import spacy
-
 from scattertext import CorpusFromParsedDocuments, produce_scattertext_explorer
 from scattertext import FeatsFromOnlyEmpath
 from scattertext import SampleCorpora
 
 
 def main():
-	nlp = spacy.load('en')
 	convention_df = SampleCorpora.ConventionData2012.get_data()
 
 	corpus = CorpusFromParsedDocuments(convention_df,

diff --git a/demo_log_odds_ratio_prior.py b/demo_log_odds_ratio_prior.py
@@ -1,23 +1,19 @@
-import numpy as np
-import spacy
-
-from scattertext import SampleCorpora, LogOddsRatioUninformativeDirichletPrior
-from scattertext import produce_scattertext_explorer
+from scattertext import SampleCorpora, produce_fightin_words_explorer
 from scattertext.CorpusFromPandas import CorpusFromPandas
-from scattertext.Scalers import scale_neg_1_to_1_with_zero_mean_abs_max, scale
+from scattertext.WhitespaceNLP import whitespace_nlp_with_sentences
+from scattertext.termsignificance.LogOddsRatioInformativeDirichletPiror import LogOddsRatioInformativeDirichletPrior
 
-nlp = spacy.load('en')
 convention_df = SampleCorpora.ConventionData2012.get_data()
 corpus = CorpusFromPandas(convention_df,
                           category_col='party',
                           text_col='text',
-                          nlp=nlp).build()
-
+                          nlp=whitespace_nlp_with_sentences).build()
+'''
 term_freq_df = corpus.get_term_freq_df()
 frequencies_scaled = scale(np.log(term_freq_df.sum(axis=1).values))
 zeta_i_j = (LogOddsRatioUninformativeDirichletPrior()
-            .get_zeta_i_j_given_separate_counts(term_freq_df['democrat freq'],
-                                                term_freq_df['republican freq']))
+	.get_zeta_i_j_given_separate_counts(term_freq_df['democrat freq'],
+                                      term_freq_df['republican freq']))
 zeta_scaled_for_charting = scale_neg_1_to_1_with_zero_mean_abs_max(zeta_i_j)
 
 html = produce_scattertext_explorer(corpus,
@@ -33,6 +29,33 @@
                                     metadata=convention_df['speaker'],
                                     x_label='Log Frequency',
                                     y_label='Log Odds Ratio w/ Prior (a_w=0.01)')
-file_name = 'demo_log_odds_ratio_prior.html'
+'''
+
+bg_df = (corpus
+	.get_term_and_background_counts()
+	.where(lambda x: x.corpus > 0).dropna()
+)
+bg_df.background += bg_df.corpus
+corpus_bg = corpus.remove_terms(set(corpus.get_terms()) - set(bg_df.index))
+priors = (corpus_bg
+	.get_term_and_background_counts()
+	.reindex(corpus_bg.get_terms())['background']
+)
+term_scorer = LogOddsRatioInformativeDirichletPrior(priors.values, 10)
+
+
+tooltip_context = '''(function(d) {
+	return d.term+"<br/>Count ratio (per 25k): "+d.cat25k+":"+d.ncat25k+"<br/>Z-score: "+ Number(Math.round(d.os+'e3')+'e-3');
+})'''
+
+html = produce_fightin_words_explorer(corpus_bg,
+                                      category='democrat',
+                                      category_name='Democratic',
+                                      not_category_name='Republican',
+                                      minimum_term_frequency=5,
+                                      get_tooltip_content = tooltip_context,
+                                      term_scorer=term_scorer)
+
+file_name = 'demo_log_odds_ratio_prior_10.html'
 open(file_name, 'wb').write(html.encode('utf-8'))
 print('Open %s in Chrome or Firefox.' % file_name)
diff --git a/demo_log_odds_ratio_prior_rotten_tomatoes.com.py b/demo_log_odds_ratio_prior_rotten_tomatoes.com.py
@@ -0,0 +1,32 @@
+import scattertext as st
+
+# Scores terms for the 'fresh' category against 'rotten' on the Rotten Tomatoes
+# sample corpus and writes the resulting explorer HTML to `fn`.
+fn = 'rotten_fresh2.html'
+df = st.SampleCorpora.RottenTomatoes.get_data()
+corpus = (st.CorpusFromPandas(df,
+                              category_col='category',
+                              text_col='text',
+                              nlp=st.whitespace_nlp_with_sentences)
+	.build())
+term_scorer = (st.LORIDPFactory(corpus,
+                                category='fresh',
+                                not_categories=['rotten'],
+                                starting_count=1,
+                                alpha=10)
+	.use_general_term_frequencies()
+	.use_all_categories()
+	.get_term_scorer())
+# NOTE(review): tdf is never read below -- confirm whether this call is still needed.
+tdf = corpus.get_term_freq_df()
+html = st.produce_fightin_words_explorer(
+	corpus,
+	category='fresh',
+	not_categories=['rotten'],
+	metadata=df['movie_name'],
+	term_scorer=term_scorer,
+	transform=st.Scalers.percentile_dense)
+# Build the HTML before opening the file, and close the handle deterministically.
+with open(fn, 'wb') as out:
+	out.write(html.encode('utf-8'))
+print(fn)
diff --git a/demo_scaled_f_score.py b/demo_scaled_f_score.py
@@ -1,5 +1,5 @@
 from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_fightin_words_explorer, \
-  ScaledFScoreSignificance
+	ScaledFScoreSignificance
 from scattertext.CorpusFromPandas import CorpusFromPandas
 
 convention_df = SampleCorpora.ConventionData2012.get_data()
@@ -14,7 +14,8 @@
                                       not_category_name='Republican',
                                       minimum_term_frequency=5,
                                       width_in_pixels=1000,
-                                      term_scorer=ScaledFScoreSignificance(beta=1),
+                                      term_scorer=ScaledFScoreSignificance(
+	                                      beta=0.5, scaler_algo='percentiledense'),
                                       metadata=convention_df['speaker'])
 open('./demo_scaled_f_score.html', 'wb').write(html.encode('utf-8'))
 print('Open ./demo_scaled_f_score.html in Chrome or Firefox.')
diff --git a/demo_semiotic.py b/demo_semiotic.py
@@ -1,4 +1,5 @@
 import scattertext as st
+from scattertext import ScaledFZScore
 
 movie_df = st.SampleCorpora.RottenTomatoes.get_data()
 
@@ -9,12 +10,23 @@
 	nlp=st.whitespace_nlp_with_sentences
 ).build().get_unigram_corpus()
 
+term_scorer = (st.LORIDPFactory(corpus,
+                                category='fresh',
+                                starting_count=100,
+                                alpha=10,
+                                term_freq_df_func=lambda x: x.get_term_freq_df())
+	.use_general_term_frequencies()
+	.use_all_categories()
+	.get_term_scorer())
+
 semiotic_square = st.SemioticSquare(
 	corpus,
 	category_a='fresh',
 	category_b='rotten',
 	neutral_categories=['plot'],
-	scorer=st.LogOddsRatioUninformativeDirichletPrior(alpha_w=0.001)
+	term_freq_func=lambda x: x.get_term_freq_df(),
+	scorer=ScaledFZScore(beta=1)
+	#term_scorer#st.LogOddsRatioUninformativeDirichletPrior(alpha_w=0.001)
 )
 
 html = st.produce_semiotic_square_explorer(semiotic_square,

diff --git a/scattertext/Common.py b/scattertext/Common.py
@@ -14,6 +14,9 @@
 POLITICAL_DATA_URL = 'https://gitcdn.xyz/repo/JasonKessler/scattertext/master/scattertext/data/political_data.json'
 ROTTEN_TOMATOES_DATA_URL = 'https://gitcdn.xyz/repo/JasonKessler/scattertext/master/scattertext/data/rotten_tomatoes_corpus.csv.bz2'
 
+# General inquirer data
+GENERAL_INQUIRER_URL = 'http://www.wjh.harvard.edu/~inquirer/inqtabs.txt'
+
 # For sample corpus
 DEFAULT_D3_URL \
 	= 'http://cdnjs.cloudflare.com/ajax/libs/d3/4.6.0/d3.min.js'

diff --git a/scattertext/Corpus.py b/scattertext/Corpus.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 from numpy import nonzero
 
@@ -84,12 +85,19 @@ def _document_index_mask(self, ngram):
 		mask = (self._X[:, idx] > 0).todense().A1
 		return mask
 
-	def _term_doc_matrix_with_new_X(self, new_X, new_term_idx_store):
+	def _make_new_term_doc_matrix(self,
+	                              new_X,
+	                              new_mX,
+	                              new_y,
+	                              new_term_idx_store,
+	                              new_category_idx_store,
+	                              new_metadata_idx_store,
+	                              new_y_mask):
 		return Corpus(X=new_X,
-		              mX=self._mX,
-		              y=self._y,
+		              mX=new_mX,
+		              y=new_y,
 		              term_idx_store=new_term_idx_store,
-		              category_idx_store=self._category_idx_store,
-		              metadata_idx_store=self._metadata_idx_store,
-		              raw_texts=self.get_texts(),
+		              category_idx_store=new_category_idx_store,
+		              metadata_idx_store=new_metadata_idx_store,
+		              raw_texts=np.array(self.get_texts())[new_y_mask],
 		              unigram_frequency_path=self._unigram_frequency_path)
diff --git a/scattertext/Formatter.py b/scattertext/Formatter.py
@@ -0,0 +1,36 @@
+def large_int_format(x):
+	"""Format a number as a short string rounded down to one significant digit.
+
+	The value is first rounded with ``round_downer`` and then abbreviated with a
+	suffix: ``k`` for thousands, ``mm`` for millions and ``b`` for billions
+	(e.g. 1234 -> '1k', 2500000 -> '2mm', -1234 -> '-1k').  Values whose
+	magnitude is below 1000 get no suffix (e.g. 987 -> '900').
+
+	:param x: int or float, number to format
+	:return: str
+	"""
+	num = round_downer(x)
+	sign = '-' if num < 0 else ''
+	magnitude = abs(num)
+	for threshold, suffix in ((1000000000, 'b'), (1000000, 'mm'), (1000, 'k')):
+		if magnitude >= threshold:
+			return sign + str(magnitude // threshold) + suffix
+	return str(num)
+
+
+def round_downer(x):
+	"""Round a number toward zero, keeping only its leading digit.
+
+	Any fractional part is discarded first, so the result is always an int
+	(e.g. 1234 -> 1000, 987.6 -> 900, -1234 -> -1000).
+
+	:param x: int or float, number to round
+	:return: int
+	"""
+	whole = int(x)
+	magnitude = abs(whole)
+	# Count digits on the absolute value so that a minus sign is not
+	# mistaken for an extra digit.
+	power_of_ten = 10 ** (len(str(magnitude)) - 1)
+	rounded = power_of_ten * (magnitude // power_of_ten)
+	return -rounded if whole < 0 else rounded