Skip to content

Commit

Permalink
0.1.1 Readme update is pending. Fixing issue in PyTextRank. See demo_…
Browse files Browse the repository at this point in the history
…dispersion.py, demo_label_coloring.py, and demo_tokenizer_roberta.py for examples of features added in this update. Preparing for R release. Added a number of new features to Dispersion, as well as the ability to compute a trend line and add it to a plot.
  • Loading branch information
JasonKessler committed Mar 8, 2021
1 parent ca45729 commit ff26aee
Show file tree
Hide file tree
Showing 35 changed files with 777 additions and 255 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![Gitter Chat](https://img.shields.io/badge/GITTER-join%20chat-green.svg)](https://gitter.im/scattertext/Lobby)
[![Twitter Follow](https://img.shields.io/twitter/follow/espadrine.svg?style=social&label=Follow)](https://twitter.com/jasonkessler)

# Scattertext 0.1.0.0
# Scattertext 0.1.1

A tool for finding distinguishing terms in corpora, and presenting them in an
interactive, HTML scatter plot. Points corresponding to terms are selectively labeled
Expand Down Expand Up @@ -493,7 +493,7 @@ import scattertext as st
nlp = spacy.load('en')
convention_df = st.SampleCorpora.ConventionData2012.get_data().assign(
parse=lambda df: df.text.apply(nlp)
parse=lambda df: df.text.apply(nlp),
party=lambda df: df.party.apply({'democrat': 'Democratic', 'republican': 'Republican'}.get)
)
corpus = st.CorpusFromParsedDocuments(
Expand Down
65 changes: 53 additions & 12 deletions demo_dispersion.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from sklearn.neighbors import KNeighborsRegressor

import scattertext as st
import pandas as pd

from scattertext.smoothing.lowess import Lowess

df = st.SampleCorpora.ConventionData2012.get_data().assign(
parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
Expand All @@ -17,32 +18,72 @@
dispersion_df = dispersion.get_df().assign(
X=lambda df: df.Frequency,
Xpos=lambda df: st.Scalers.log_scale(df.X),
Y=lambda df: dispersion.rosengrens(),
Y=lambda df: dispersion.da(),
Ypos=lambda df: st.Scalers.scale(df.Y),
)

dispersion_df = dispersion_df.assign(
Expected=lambda df: KNeighborsRegressor(n_neighbors=10).fit(
df.X.values.reshape(-1, 1), df.Y
).predict(df.X.values.reshape(-1, 1)),
Residual=lambda df: df.Y - df.Expected,
Expected=lambda df: Lowess().fit_predict(df.Xpos.values, df.Ypos.values),
Residual=lambda df: df.Ypos - df.Expected,
ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.Residual)
)

line_df = pd.DataFrame({
'x': dispersion_df.Xpos.values,
'y': dispersion_df.Expected.values,
}).sort_values(by='x')

html = st.dataframe_scattertext(
corpus,
plot_df=dispersion_df,
metadata=corpus.get_df()['speaker'] + ' (' + corpus.get_df()['party'].str.upper() + ')',
ignore_categories=True,
x_label='Log Frequency',
y_label="Rosengren's S",
y_label='DA',
y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'],
color_score_column='ColorScore',
tooltip_columns=['Frequency', 'DA'],
header_names={'upper': 'Lower than Expected', 'lower': 'More than Expected'},
left_list_column='Residual',
background_color='#e5e5e3'
background_color='#e5e5e3',
line_coordinates = line_df.to_dict('records')
)

fn = 'demo_dispersion.html'
open(fn, 'w').write(html)
print('open ./%s in Chrome' % fn)




residual_dispersion_df = dispersion_df.assign(
Expected=lambda df: Lowess().fit_predict(df.X.values, df.Y.values),
Y=lambda df: df.Y - df.Expected,
Ypos=lambda df: st.Scalers.scale(df.Y),
ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.Y)
)

line_df = pd.DataFrame({
'x': dispersion_df.Xpos.values,
'y': st.Scalers.scale(dispersion_df.Expected),
}).sort_values(by='x')

html = st.dataframe_scattertext(
corpus,
plot_df=residual_dispersion_df,
unified_context=False,
metadata=corpus.get_df()['speaker'] + ' (' + corpus.get_df()['party'].str.upper() + ')',
x_label='Log Frequency',
y_label='DA - E[DA] via Lowess',
y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'],
color_score_column='ColorScore',
tooltip_columns=['Frequency', 'DA - E[DA]'],
header_names={'upper': 'Lower than Expected', 'lower': 'More than Expected'},
left_list_column='Residual',
background_color='#e5e5e3',
line_coordinates = line_df.to_dict('records')
)



fn = 'demo_dispersion_residual.html'
open(fn, 'w').write(html)
print('open ./%s in Chrome' % fn)

13 changes: 7 additions & 6 deletions demo_label_coloring.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,16 @@
MatchesQuery=lambda df: np.array([query.match(word) is not None for word in df.index]),
Frequency=lambda df: df.sum(axis=1),
TextColor=lambda df: [
'blue' if dem_query.match(term) is not None
else 'red' if rep_query.match(term) is not None
else 'rgb(200, 200, 200)'
'#1b4b5a' if dem_query.match(term) is not None
else '#d35c37' if rep_query.match(term) is not None
else '#d6c6b9'
for term in df.index
],
SuppressText=lambda df: df.apply(
lambda row: not (row.MatchesQuery or row.Frequency < 30),
axis=1
),
PointColor=lambda df: df.TextColor,
LabelPriority=lambda df: -(df.MatchesQuery).astype(int),
PointColor=lambda df: df.TextColor
)

html = st.produce_scattertext_explorer(
Expand All @@ -43,10 +42,12 @@
transform=st.Scalers.dense_rank,
max_overlapping=3,
term_metadata_df=term_metadata_df,
header_names={'right': 'Most Frequent'},
text_color_column='TextColor',
suppress_text_column='SuppressText',
color_column='PointColor',
label_priority_column='LabelPriority'
label_priority_column='MatchesQuery',
right_order_column='Frequency'
)
fn = 'demo_label_coloring.html'
open(fn, 'w').write(html)
Expand Down
59 changes: 59 additions & 0 deletions demo_tokenizer_roberta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer
from transformers import RobertaTokenizerFast
import scattertext as st

tokenizer_fast = RobertaTokenizerFast.from_pretrained(
"roberta-base", add_prefix_space=True)
tokenizer = st.RobertaTokenizerWrapper(tokenizer_fast)

df = st.SampleCorpora.ConventionData2012.get_data().assign(
parse = lambda df: df.text.apply(tokenizer.tokenize)
)

corpus = st.OffsetCorpusFactory(
df,
category_col='party',
parsed_col='parse',
feat_and_offset_getter=st.TokenFeatAndOffsetGetter()
).build()

# Remove words occur less than 5 times
corpus = corpus.remove_infrequent_words(5, non_text=True)

plot_df = corpus.get_metadata_freq_df('').assign(
Y=lambda df: df.democrat,
X=lambda df: df.republican,
Ypos=lambda df: st.Scalers.dense_rank(df.Y),
Xpos=lambda df: st.Scalers.dense_rank(df.X),
SuppressDisplay=False,
ColorScore=lambda df: st.Scalers.scale_center_zero(df.Ypos - df.Xpos),
)

html = st.dataframe_scattertext(
corpus,
plot_df=plot_df,
category='democrat',
category_name='Democratic',
not_category_name='Republican',
width_in_pixels=1000,
suppress_text_column='Display',
metadata=corpus.get_df()['speaker'],
use_non_text_features=True,
ignore_categories=False,
use_offsets=True,
unified_context=False,
color_score_column='ColorScore',
left_list_column='ColorScore',
y_label='Democarats',
x_label='Republicans',
header_names={'upper': 'Top Democratic', 'lower': 'Top Republican', 'right': 'Most Frequent'},
subword_encoding='RoBERTa'
)

fn = 'roberta_sentence_piece.html'
with open(fn, 'w') as of:
of.write(html)

print("Open ./" + fn + ' in Chrome.')
5 changes: 4 additions & 1 deletion scattertext/CorpusWithoutCategoriesFromParsedDocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,8 @@ def build(self):
while category_col in self.df:
category_col = 'Category_' + ''.join(np.random.choice(string.ascii_letters) for _ in range(5))
return CorpusFromParsedDocuments(
self.df.assign(**{category_col: '_'}), category_col, self.parsed_col
self.df.assign(**{category_col: '_'}),
category_col,
self.parsed_col,
feats_from_spacy_doc=self.feats_from_spacy_doc,
).build()
71 changes: 71 additions & 0 deletions scattertext/OffsetCorpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from scattertext.DataFrameCorpus import DataFrameCorpus
from scattertext.ParsedCorpus import ParsedDataFrameCorpus


class OffsetCorpus(ParsedDataFrameCorpus):
def __init__(self,
df,
X,
mX,
y,
term_idx_store,
category_idx_store,
metadata_idx_store,
parsed_col,
category_col,
term_offsets,
metadata_offsets,
unigram_frequency_path=None):
self._term_offsets = term_offsets
self._metadata_offsets = metadata_offsets
ParsedDataFrameCorpus.__init__(self, parsed_col, category_col)
DataFrameCorpus.__init__(self, X, mX, y, term_idx_store, category_idx_store,
metadata_idx_store,
df[self._parsed_col],
df,
unigram_frequency_path)

def get_offsets(self):
return self._metadata_offsets

def _make_new_term_doc_matrix(self,
new_X=None,
new_mX=None,
new_y=None,
new_term_idx_store=None,
new_category_idx_store=None,
new_metadata_idx_store=None,
new_y_mask=None,
new_df=None,
new_term_offsets=None,
new_metadata_offsets=None):

X, mX, y = self._update_X_mX_y(new_X, new_mX, new_y, new_y_mask)
metadata_offsets, term_offsets = self._update_offsets(new_metadata_idx_store, new_metadata_offsets,
new_term_idx_store, new_term_offsets)

return OffsetCorpus(
X=X,
mX=mX,
y=y,
parsed_col=self._parsed_col,
category_col=self._category_col,
term_idx_store=new_term_idx_store if new_term_idx_store is not None else self._term_idx_store,
category_idx_store=new_category_idx_store if new_category_idx_store is not None \
else self._category_idx_store,
metadata_idx_store=new_metadata_idx_store if new_metadata_idx_store is not None \
else self._metadata_idx_store,
df=self._apply_mask_to_df(new_y_mask, new_df),
term_offsets=term_offsets,
metadata_offsets=metadata_offsets,
unigram_frequency_path=self._unigram_frequency_path,
)

def _update_offsets(self, new_metadata_idx_store, new_metadata_offsets, new_term_idx_store, new_term_offsets):
term_offsets = self._term_offsets if new_term_offsets is None else new_term_offsets
metadata_offsets = self._metadata_offsets if new_metadata_offsets is None else new_metadata_offsets
if new_term_idx_store is not None:
term_offsets = {k: term_offsets[k] for k in new_term_idx_store.values()}
if new_metadata_idx_store is not None:
metadata_offsets = {k: metadata_offsets[k] for k in new_metadata_idx_store.values()}
return metadata_offsets, term_offsets
88 changes: 88 additions & 0 deletions scattertext/OffsetCorpusFactory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import string

import numpy as np

from scattertext.OffsetCorpus import OffsetCorpus

from scattertext.CSRMatrixTools import CSRMatrixFactory
from scattertext.indexstore.IndexStore import IndexStore


class OffsetCorpusFactory(object):
def __init__(self,
df,
parsed_col,
feat_and_offset_getter,
category_col=None):

'''
Parameters
----------
df : pd.DataFrame
contains category_col, and parse_col, were parsed col is entirely spacy docs
parsed_col : str
name of spacy parsed column in convention_df
feats_from_spacy_doc : FeatsFromSpacyDoc
category_col : str, Optional
name of category column in df; if None, all category names will be '_'
'''
self._df = df.reset_index()
self._category_col = category_col
self._parsed_col = parsed_col
self._category_idx_store = IndexStore()
self._X_factory = CSRMatrixFactory()
self._mX_factory = CSRMatrixFactory()
self._term_idx_store = IndexStore()
self._metadata_idx_store = IndexStore()
self._feat_and_offset_getter = feat_and_offset_getter
self._term_offsets = {}
self._metadata_offsets = {}

def build(self):
'''Constructs the term doc matrix.
Returns
-------
scattertext.ParsedCorpus.ParsedCorpus
'''
self._ensure_category_col_is_in_df()

y = self._get_y_and_populate_category_idx_store(self._df[self._category_col])
self._df.apply(self._add_to_x_factory, axis=1)
self._mX = self._mX_factory.set_last_row_idx(len(y) - 1).get_csr_matrix()
return OffsetCorpus(
df=self._df,
X=self._X_factory.set_last_row_idx(len(y) - 1).get_csr_matrix(),
mX=self._mX_factory.set_last_row_idx(len(y) - 1).get_csr_matrix(),
y=self._get_y_and_populate_category_idx_store(self._df[self._category_col]),
term_idx_store=self._term_idx_store,
category_idx_store=self._category_idx_store,
metadata_idx_store=self._metadata_idx_store,
parsed_col=self._parsed_col,
category_col=self._category_col,
term_offsets=self._term_offsets,
metadata_offsets=self._metadata_offsets
)

def _ensure_category_col_is_in_df(self):
if self._category_col not in self._df:
self._category_col = 'Category'
while self._category_col in self._df:
self._category_col = 'Category_' + ''.join(np.random.choice(string.ascii_letters) for _ in range(5))

def _get_y_and_populate_category_idx_store(self, categories):
return np.array(categories.apply(self._category_idx_store.getidx))

def _add_to_x_factory(self, row):
parsed_text = row[self._parsed_col]
for term, (count, offsets) in self._feat_and_offset_getter.get_term_offsets(parsed_text):
term_idx = self._term_idx_store.getidx(term)
self._X_factory[row.name, term_idx] = count
if offsets is not None:
self._term_offsets.setdefault(term, {}).setdefault(row.name, []).extend(offsets)

for meta, (val, offsets) in self._feat_and_offset_getter.get_metadata_offsets(parsed_text):
meta_idx = self._metadata_idx_store.getidx(meta)
self._mX_factory[row.name, meta_idx] = val
if offsets is not None:
self._metadata_offsets.setdefault(meta, {}).setdefault(row.name, []).extend(offsets)
Loading

0 comments on commit ff26aee

Please sign in to comment.