Skip to content

Commit

Permalink
Intermediate checkin... lots of work done.
Browse files Browse the repository at this point in the history
  • Loading branch information
JasonKessler committed Jan 19, 2018
1 parent 7a129c3 commit 0ed3e9e
Show file tree
Hide file tree
Showing 26 changed files with 923 additions and 153 deletions.
52 changes: 29 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,11 @@
[![Gitter Chat](https://img.shields.io/badge/GITTER-join%20chat-green.svg)](https://gitter.im/scattertext/Lobby)
[![Twitter Follow](https://img.shields.io/twitter/follow/espadrine.svg?style=social&label=Follow)](https://twitter.com/jasonkessler)

# Scattertext 0.0.2.16.1
# Scattertext 0.0.2.17
### Updates
Incorporated the [General Inquirer](http://www.wjh.harvard.edu/~inquirer/homecat.htm)
lexicon.

Added a very simple semiotic square creator.

The idea is to build a semiotic square that contrasts two categories in a Term Document Matrix
while using other categories as neutral categories.

See [Creating semiotic squares](#creating-semiotic-squares) for an overview on how to
use this functionality and semiotic squares.

Added a parameter to disable the display of the top-terms sidebar, e.g.,
`produce_scattertext_explorer(..., show_top_terms=False, ...)`.

An interface to part of the subjectivity/sentiment dataset from
Bo Pang and Lillian Lee. ``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization
Based on Minimum Cuts''. ACL. 2004. See `SampleCorpora.RottenTomatoes`.

Fixed bug that caused tooltip placement to be off after scrolling.

Made `category_name` and `not_category_name` optional in `produce_scattertext_explorer` etc.

Created the ability to customize tooltips via the `get_tooltip_content` argument to
`produce_scattertext_explorer` etc., control axes labels via `x_axis_values`
and `y_axis_values`.

**Table of Contents**

Expand Down Expand Up @@ -811,6 +791,32 @@ $ python2.7 src/main.py <script file name> --enable-volume-trees \
## What's new
### 0.0.2.15-16
Added a very simple semiotic square creator.
The idea is to build a semiotic square that contrasts two categories in a Term Document Matrix
while using other categories as neutral categories.
See [Creating semiotic squares](#creating-semiotic-squares) for an overview on how to
use this functionality and semiotic squares.
Added a parameter to disable the display of the top-terms sidebar, e.g.,
`produce_scattertext_explorer(..., show_top_terms=False, ...)`.
An interface to part of the subjectivity/sentiment dataset from
Bo Pang and Lillian Lee. ``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization
Based on Minimum Cuts''. ACL. 2004. See `SampleCorpora.RottenTomatoes`.
Fixed bug that caused tooltip placement to be off after scrolling.
Made `category_name` and `not_category_name` optional in `produce_scattertext_explorer` etc.
Created the ability to customize tooltips via the `get_tooltip_content` argument to
`produce_scattertext_explorer` etc., control axes labels via `x_axis_values`
and `y_axis_values`. The `color_func` parameter is a Javascript function to control color of a point. Function takes a parameter
which is a dictionary entry produced by `ScatterChartExplorer.to_dict` and returns a string.
### 0.0.2.14
Integration with Scikit-Learn's text-analysis pipeline led to the creation of the
`CorpusFromScikit` and `TermDocMatrixFromScikit` classes.
Expand Down
3 changes: 0 additions & 3 deletions demo_empath.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
from __future__ import print_function

import spacy

from scattertext import CorpusFromParsedDocuments, produce_scattertext_explorer
from scattertext import FeatsFromOnlyEmpath
from scattertext import SampleCorpora


def main():
nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()

corpus = CorpusFromParsedDocuments(convention_df,
Expand Down
47 changes: 35 additions & 12 deletions demo_log_odds_ratio_prior.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,19 @@
import numpy as np
import spacy

from scattertext import SampleCorpora, LogOddsRatioUninformativeDirichletPrior
from scattertext import produce_scattertext_explorer
from scattertext import SampleCorpora, produce_fightin_words_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.Scalers import scale_neg_1_to_1_with_zero_mean_abs_max, scale
from scattertext.WhitespaceNLP import whitespace_nlp_with_sentences
from scattertext.termsignificance.LogOddsRatioInformativeDirichletPiror import LogOddsRatioInformativeDirichletPrior

nlp = spacy.load('en')
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
category_col='party',
text_col='text',
nlp=nlp).build()

nlp=whitespace_nlp_with_sentences).build()
'''
term_freq_df = corpus.get_term_freq_df()
frequencies_scaled = scale(np.log(term_freq_df.sum(axis=1).values))
zeta_i_j = (LogOddsRatioUninformativeDirichletPrior()
.get_zeta_i_j_given_separate_counts(term_freq_df['democrat freq'],
term_freq_df['republican freq']))
.get_zeta_i_j_given_separate_counts(term_freq_df['democrat freq'],
term_freq_df['republican freq']))
zeta_scaled_for_charting = scale_neg_1_to_1_with_zero_mean_abs_max(zeta_i_j)
html = produce_scattertext_explorer(corpus,
Expand All @@ -33,6 +29,33 @@
metadata=convention_df['speaker'],
x_label='Log Frequency',
y_label='Log Odds Ratio w/ Prior (a_w=0.01)')
file_name = 'demo_log_odds_ratio_prior.html'
'''

bg_df = (corpus
.get_term_and_background_counts()
.where(lambda x: x.corpus > 0).dropna()
)
bg_df.background += bg_df.corpus
corpus_bg = corpus.remove_terms(set(corpus.get_terms()) - set(bg_df.index))
priors = (corpus_bg
.get_term_and_background_counts()
.reindex(corpus_bg.get_terms())['background']
)
term_scorer = LogOddsRatioInformativeDirichletPrior(priors.values, 10)


tooltip_context = '''(function(d) {
return d.term+"<br/>Count ratio (per 25k): "+d.cat25k+":"+d.ncat25k+"<br/>Z-score: "+ Number(Math.round(d.os+'e3')+'e-3');
})'''

html = produce_fightin_words_explorer(corpus_bg,
category='democrat',
category_name='Democratic',
not_category_name='Republican',
minimum_term_frequency=5,
get_tooltip_content = tooltip_context,
term_scorer=term_scorer)

file_name = 'demo_log_odds_ratio_prior_10.html'
open(file_name, 'wb').write(html.encode('utf-8'))
print('Open %s in Chrome or Firefox.' % file_name)
30 changes: 30 additions & 0 deletions demo_log_odds_ratio_prior_rotten_tomatoes.com.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Demo: fightin' words explorer over the Rotten Tomatoes sample corpus.

Builds a corpus from ``SampleCorpora.RottenTomatoes``, obtains a term scorer
from ``LORIDPFactory`` for the 'fresh' category against 'rotten', renders the
explorer HTML and writes it (UTF-8 encoded) to ``fn``.
"""
import scattertext as st

# Output path of the generated HTML page.
fn = 'rotten_fresh2.html'

df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = (st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build())

term_scorer = (st.LORIDPFactory(corpus,
                                category='fresh',
                                not_categories=['rotten'],
                                starting_count=1,
                                alpha=10)
               .use_general_term_frequencies()
               .use_all_categories()
               .get_term_scorer())

# NOTE(review): tdf is not referenced again in this script; kept in case the
# call is wanted for inspection -- confirm whether it can be dropped.
tdf = corpus.get_term_freq_df()

# Render first, then open the file, so a failure while building the explorer
# does not leave an empty output file behind.
html = st.produce_fightin_words_explorer(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    metadata=df['movie_name'],
    term_scorer=term_scorer,
    transform=st.Scalers.percentile_dense)

# Use a context manager so the file handle is closed deterministically.
with open(fn, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print(fn)
5 changes: 3 additions & 2 deletions demo_scaled_f_score.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_fightin_words_explorer, \
ScaledFScoreSignificance
ScaledFScoreSignificance
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
Expand All @@ -14,7 +14,8 @@
not_category_name='Republican',
minimum_term_frequency=5,
width_in_pixels=1000,
term_scorer=ScaledFScoreSignificance(beta=1),
term_scorer=ScaledFScoreSignificance(
beta=0.5, scaler_algo='percentiledense'),
metadata=convention_df['speaker'])
open('./demo_scaled_f_score.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_scaled_f_score.html in Chrome or Firefox.')
14 changes: 13 additions & 1 deletion demo_semiotic.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import scattertext as st
from scattertext import ScaledFZScore

movie_df = st.SampleCorpora.RottenTomatoes.get_data()

Expand All @@ -9,12 +10,23 @@
nlp=st.whitespace_nlp_with_sentences
).build().get_unigram_corpus()

term_scorer = (st.LORIDPFactory(corpus,
category='fresh',
starting_count=100,
alpha=10,
term_freq_df_func=lambda x: x.get_term_freq_df())
.use_general_term_frequencies()
.use_all_categories()
.get_term_scorer())

semiotic_square = st.SemioticSquare(
corpus,
category_a='fresh',
category_b='rotten',
neutral_categories=['plot'],
scorer=st.LogOddsRatioUninformativeDirichletPrior(alpha_w=0.001)
term_freq_func=lambda x: x.get_term_freq_df(),
scorer=ScaledFZScore(beta=1)
#term_scorer#st.LogOddsRatioUninformativeDirichletPrior(alpha_w=0.001)
)

html = st.produce_semiotic_square_explorer(semiotic_square,
Expand Down
3 changes: 3 additions & 0 deletions scattertext/Common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
POLITICAL_DATA_URL = 'https://gitcdn.xyz/repo/JasonKessler/scattertext/master/scattertext/data/political_data.json'
ROTTEN_TOMATOES_DATA_URL = 'https://gitcdn.xyz/repo/JasonKessler/scattertext/master/scattertext/data/rotten_tomatoes_corpus.csv.bz2'

# General inquirer data
GENERAL_INQUIRER_URL = 'http://www.wjh.harvard.edu/~inquirer/inqtabs.txt'

# For sample corpus
DEFAULT_D3_URL \
= 'http://cdnjs.cloudflare.com/ajax/libs/d3/4.6.0/d3.min.js'
Expand Down
20 changes: 14 additions & 6 deletions scattertext/Corpus.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd
from numpy import nonzero

Expand Down Expand Up @@ -84,12 +85,19 @@ def _document_index_mask(self, ngram):
mask = (self._X[:, idx] > 0).todense().A1
return mask

def _term_doc_matrix_with_new_X(self, new_X, new_term_idx_store):
def _make_new_term_doc_matrix(self,
new_X,
new_mX,
new_y,
new_term_idx_store,
new_category_idx_store,
new_metadata_idx_store,
new_y_mask):
return Corpus(X=new_X,
mX=self._mX,
y=self._y,
mX=new_mX,
y=new_y,
term_idx_store=new_term_idx_store,
category_idx_store=self._category_idx_store,
metadata_idx_store=self._metadata_idx_store,
raw_texts=self.get_texts(),
category_idx_store=new_category_idx_store,
metadata_idx_store=new_metadata_idx_store,
raw_texts=np.array(self.get_texts())[new_y_mask],
unigram_frequency_path=self._unigram_frequency_path)
16 changes: 16 additions & 0 deletions scattertext/Formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
def large_int_format(x):
    """Format a number as a short magnitude string such as '3k' or '2mm'.

    ``x`` is first reduced by ``round_downer`` and then rendered with a
    suffix chosen by size:

    * at least 10**9 -> ``'<n>b'``
    * at least 10**6 -> ``'<n>mm'``
    * at least 10**3 -> ``'<n>k'``
    * otherwise      -> the number itself, with no suffix

    :param x: number to format (anything ``round_downer`` accepts).
    :return: str, the abbreviated representation.
    """
    # round_downer floor-divides x, so a float input comes back as a float.
    # Coerce to int so 1500.0 renders as '1k' rather than '1.0k'.
    num = int(round_downer(x))
    if num >= 1000000000:
        return str(num // 1000000000) + 'b'
    if num >= 1000000:
        return str(num // 1000000) + 'mm'
    if num >= 1000:
        return str(num // 1000) + 'k'
    return str(num)


def round_downer(x):
    """Round ``x`` down to a multiple of its leading power of ten.

    Examples: 1234 -> 1000, 999 -> 900, 7 -> 7, 0 -> 0, -1234 -> -2000.
    The result keeps the numeric type produced by ``x // power_of_ten``
    (so a float input yields a float).

    :param x: number to round; must be convertible with ``int()``.
    :return: ``x`` floored to a multiple of 10 ** (number of digits - 1).
    """
    # Count digits of the magnitude only: str() of a negative int includes
    # the '-' sign, which would inflate the power of ten by one place.
    num_digits = len(str(abs(int(x))))
    power_of_ten = 10 ** (num_digits - 1)
    return power_of_ten * (x // power_of_ten)
Loading

0 comments on commit 0ed3e9e

Please sign in to comment.