-
Notifications
You must be signed in to change notification settings - Fork 292
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
0.1.1 Readme update is pending. Fixing issue in PyTextRank. See demo_…
…dispersion.py, demo_label_coloring.py, and demo_tokenizer_roberta.py for examples of features added to update. Preparing for R release. Added a number of new features to Dispersion, and the ability to compute a trend line and add it
- Loading branch information
1 parent
ca45729
commit ff26aee
Showing
35 changed files
with
777 additions
and
255 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# Demo: build a Scattertext plot from RoBERTa subword tokens, keeping
# per-token character offsets so terms can be highlighted in context.
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer
from transformers import RobertaTokenizerFast
import scattertext as st

# Wrap the fast RoBERTa tokenizer; add_prefix_space=True is required for
# pre-tokenized/word-level input handling with RoBERTa.
tokenizer_fast = RobertaTokenizerFast.from_pretrained(
    "roberta-base", add_prefix_space=True)
tokenizer = st.RobertaTokenizerWrapper(tokenizer_fast)

# Tokenize each 2012 convention speech into a 'parse' column.
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(tokenizer.tokenize)
)

# Build an offset-aware corpus, categorized by party.
corpus = st.OffsetCorpusFactory(
    df,
    category_col='party',
    parsed_col='parse',
    feat_and_offset_getter=st.TokenFeatAndOffsetGetter()
).build()

# Remove words which occur fewer than 5 times.
corpus = corpus.remove_infrequent_words(5, non_text=True)

# Position each token by its dense frequency rank in each category; color
# by the rank difference, scaled to center at zero.
plot_df = corpus.get_metadata_freq_df('').assign(
    Y=lambda df: df.democrat,
    X=lambda df: df.republican,
    Ypos=lambda df: st.Scalers.dense_rank(df.Y),
    Xpos=lambda df: st.Scalers.dense_rank(df.X),
    SuppressDisplay=False,
    ColorScore=lambda df: st.Scalers.scale_center_zero(df.Ypos - df.Xpos),
)

html = st.dataframe_scattertext(
    corpus,
    plot_df=plot_df,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    width_in_pixels=1000,
    suppress_text_column='Display',
    metadata=corpus.get_df()['speaker'],
    use_non_text_features=True,
    ignore_categories=False,
    use_offsets=True,
    unified_context=False,
    color_score_column='ColorScore',
    left_list_column='ColorScore',
    y_label='Democrats',  # fixed typo: was 'Democarats'
    x_label='Republicans',
    header_names={'upper': 'Top Democratic', 'lower': 'Top Republican', 'right': 'Most Frequent'},
    subword_encoding='RoBERTa'
)

# Write the interactive visualization to an HTML file.
fn = 'roberta_sentence_piece.html'
with open(fn, 'w') as of:
    of.write(html)

print("Open ./" + fn + ' in Chrome.')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
from scattertext.DataFrameCorpus import DataFrameCorpus | ||
from scattertext.ParsedCorpus import ParsedDataFrameCorpus | ||
|
||
|
||
class OffsetCorpus(ParsedDataFrameCorpus):
    """A parsed corpus that additionally records, per term and per metadata
    feature, the offsets at which the feature occurs in each document.

    Offsets are kept in dicts keyed first by feature, then by document
    index: {feature: {doc_idx: [offset, ...]}} — as built by
    OffsetCorpusFactory.  # NOTE(review): inferred from factory usage; confirm.
    """

    def __init__(self,
                 df,
                 X,
                 mX,
                 y,
                 term_idx_store,
                 category_idx_store,
                 metadata_idx_store,
                 parsed_col,
                 category_col,
                 term_offsets,
                 metadata_offsets,
                 unigram_frequency_path=None):
        # Stash offsets before delegating to the parent constructors.
        self._term_offsets = term_offsets
        self._metadata_offsets = metadata_offsets
        # Order matters: ParsedDataFrameCorpus.__init__ sets self._parsed_col,
        # which is read in the DataFrameCorpus.__init__ call just below.
        ParsedDataFrameCorpus.__init__(self, parsed_col, category_col)
        DataFrameCorpus.__init__(self, X, mX, y, term_idx_store, category_idx_store,
                                 metadata_idx_store,
                                 df[self._parsed_col],
                                 df,
                                 unigram_frequency_path)

    def get_offsets(self):
        # Returns only the metadata offsets, not the term offsets.
        return self._metadata_offsets

    def _make_new_term_doc_matrix(self,
                                  new_X=None,
                                  new_mX=None,
                                  new_y=None,
                                  new_term_idx_store=None,
                                  new_category_idx_store=None,
                                  new_metadata_idx_store=None,
                                  new_y_mask=None,
                                  new_df=None,
                                  new_term_offsets=None,
                                  new_metadata_offsets=None):
        # Build a derived OffsetCorpus, substituting any supplied component
        # and falling back to this corpus's own state for the rest.

        X, mX, y = self._update_X_mX_y(new_X, new_mX, new_y, new_y_mask)
        # Restrict the offset dicts to whatever features survive in the new
        # index stores (e.g. after infrequent-word removal).
        metadata_offsets, term_offsets = self._update_offsets(new_metadata_idx_store, new_metadata_offsets,
                                                              new_term_idx_store, new_term_offsets)

        return OffsetCorpus(
            X=X,
            mX=mX,
            y=y,
            parsed_col=self._parsed_col,
            category_col=self._category_col,
            term_idx_store=new_term_idx_store if new_term_idx_store is not None else self._term_idx_store,
            category_idx_store=new_category_idx_store if new_category_idx_store is not None \
                else self._category_idx_store,
            metadata_idx_store=new_metadata_idx_store if new_metadata_idx_store is not None \
                else self._metadata_idx_store,
            df=self._apply_mask_to_df(new_y_mask, new_df),
            term_offsets=term_offsets,
            metadata_offsets=metadata_offsets,
            unigram_frequency_path=self._unigram_frequency_path,
        )

    def _update_offsets(self, new_metadata_idx_store, new_metadata_offsets, new_term_idx_store, new_term_offsets):
        # Choose the explicit replacement offsets if given, else this corpus's.
        term_offsets = self._term_offsets if new_term_offsets is None else new_term_offsets
        metadata_offsets = self._metadata_offsets if new_metadata_offsets is None else new_metadata_offsets
        # When an index store shrank, drop offsets for features it no longer
        # contains.  NOTE(review): assumes every surviving feature has an
        # offsets entry — a missing key would raise KeyError here; confirm.
        if new_term_idx_store is not None:
            term_offsets = {k: term_offsets[k] for k in new_term_idx_store.values()}
        if new_metadata_idx_store is not None:
            metadata_offsets = {k: metadata_offsets[k] for k in new_metadata_idx_store.values()}
        return metadata_offsets, term_offsets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import string | ||
|
||
import numpy as np | ||
|
||
from scattertext.OffsetCorpus import OffsetCorpus | ||
|
||
from scattertext.CSRMatrixTools import CSRMatrixFactory | ||
from scattertext.indexstore.IndexStore import IndexStore | ||
|
||
|
||
class OffsetCorpusFactory(object):
    """Builds an OffsetCorpus from a dataframe of parsed documents,
    recording term/metadata counts and their per-document offsets."""

    def __init__(self,
                 df,
                 parsed_col,
                 feat_and_offset_getter,
                 category_col=None):
        '''
        Parameters
        ----------
        df : pd.DataFrame
            contains category_col, and parsed_col, where parsed_col is entirely spacy docs
        parsed_col : str
            name of spacy parsed column in df
        feat_and_offset_getter : object
            yields (feature, (count, offsets)) pairs via get_term_offsets
            and get_metadata_offsets
        category_col : str, Optional
            name of category column in df; if None, a fresh category column
            name will be generated at build time
        '''
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._feat_and_offset_getter = feat_and_offset_getter
        # feature -> {row index -> [offsets]}; populated during build()
        self._term_offsets = {}
        self._metadata_offsets = {}

    def build(self):
        '''Constructs the term-document and metadata matrices and wraps them
        in an offset-aware corpus.

        Returns
        -------
        scattertext.OffsetCorpus.OffsetCorpus
        '''
        self._ensure_category_col_is_in_df()

        y = self._get_y_and_populate_category_idx_store(self._df[self._category_col])
        self._df.apply(self._add_to_x_factory, axis=1)
        self._mX = self._mX_factory.set_last_row_idx(len(y) - 1).get_csr_matrix()
        return OffsetCorpus(
            df=self._df,
            X=self._X_factory.set_last_row_idx(len(y) - 1).get_csr_matrix(),
            # reuse the matrix computed above rather than rebuilding it
            mX=self._mX,
            # reuse y rather than re-walking the category column
            y=y,
            term_idx_store=self._term_idx_store,
            category_idx_store=self._category_idx_store,
            metadata_idx_store=self._metadata_idx_store,
            parsed_col=self._parsed_col,
            category_col=self._category_col,
            term_offsets=self._term_offsets,
            metadata_offsets=self._metadata_offsets
        )

    def _ensure_category_col_is_in_df(self):
        # If no (or an unknown) category column was given, synthesize one
        # that does not collide with an existing column.
        if self._category_col not in self._df:
            self._category_col = 'Category'
            while self._category_col in self._df:
                self._category_col = 'Category_' + ''.join(np.random.choice(string.ascii_letters) for _ in range(5))

    def _get_y_and_populate_category_idx_store(self, categories):
        # Map each category label to a stable integer index.
        return np.array(categories.apply(self._category_idx_store.getidx))

    def _add_to_x_factory(self, row):
        # Accumulate one document's term/metadata counts and offsets.
        parsed_text = row[self._parsed_col]
        for term, (count, offsets) in self._feat_and_offset_getter.get_term_offsets(parsed_text):
            term_idx = self._term_idx_store.getidx(term)
            self._X_factory[row.name, term_idx] = count
            if offsets is not None:
                self._term_offsets.setdefault(term, {}).setdefault(row.name, []).extend(offsets)

        for meta, (val, offsets) in self._feat_and_offset_getter.get_metadata_offsets(parsed_text):
            meta_idx = self._metadata_idx_store.getidx(meta)
            self._mX_factory[row.name, meta_idx] = val
            if offsets is not None:
                self._metadata_offsets.setdefault(meta, {}).setdefault(row.name, []).extend(offsets)
Oops, something went wrong.