diff --git a/README.md b/README.md
index ed351ce..cbf8059 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
[![Gitter Chat](https://img.shields.io/badge/GITTER-join%20chat-green.svg)](https://gitter.im/scattertext/Lobby)
[![Twitter Follow](https://img.shields.io/twitter/follow/espadrine.svg?style=social&label=Follow)](https://twitter.com/jasonkessler)
-# Scattertext 0.1.0.0
+# Scattertext 0.1.1
A tool for finding distinguishing terms in corpora, and presenting them in an
interactive, HTML scatter plot. Points corresponding to terms are selectively labeled
@@ -493,7 +493,7 @@ import scattertext as st
nlp = spacy.load('en')
convention_df = st.SampleCorpora.ConventionData2012.get_data().assign(
- parse=lambda df: df.text.apply(nlp)
+ parse=lambda df: df.text.apply(nlp),
party=lambda df: df.party.apply({'democrat': 'Democratic', 'republican': 'Republican'}.get)
)
corpus = st.CorpusFromParsedDocuments(
diff --git a/demo_dispersion.py b/demo_dispersion.py
index 605748a..b3ca40e 100644
--- a/demo_dispersion.py
+++ b/demo_dispersion.py
@@ -1,6 +1,7 @@
-from sklearn.neighbors import KNeighborsRegressor
-
import scattertext as st
+import pandas as pd
+
+from scattertext.smoothing.lowess import Lowess
df = st.SampleCorpora.ConventionData2012.get_data().assign(
parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
@@ -17,32 +18,72 @@
dispersion_df = dispersion.get_df().assign(
X=lambda df: df.Frequency,
Xpos=lambda df: st.Scalers.log_scale(df.X),
- Y=lambda df: dispersion.rosengrens(),
+ Y=lambda df: dispersion.da(),
Ypos=lambda df: st.Scalers.scale(df.Y),
-)
-
-dispersion_df = dispersion_df.assign(
- Expected=lambda df: KNeighborsRegressor(n_neighbors=10).fit(
- df.X.values.reshape(-1, 1), df.Y
- ).predict(df.X.values.reshape(-1, 1)),
- Residual=lambda df: df.Y - df.Expected,
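+ # Expected dispersion at each frequency via LOWESS; points are colored by residual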
+ Expected=lambda df: Lowess().fit_predict(df.Xpos.values, df.Ypos.values),
+ Residual=lambda df: df.Ypos - df.Expected,
ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.Residual)
)
+line_df = pd.DataFrame({
+ 'x': dispersion_df.Xpos.values,
+ 'y': dispersion_df.Expected.values,
+}).sort_values(by='x')
+
html = st.dataframe_scattertext(
corpus,
plot_df=dispersion_df,
metadata=corpus.get_df()['speaker'] + ' (' + corpus.get_df()['party'].str.upper() + ')',
ignore_categories=True,
x_label='Log Frequency',
- y_label="Rosengren's S",
+ y_label='DA',
y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'],
color_score_column='ColorScore',
+ tooltip_columns=['Frequency', 'DA'],
header_names={'upper': 'Lower than Expected', 'lower': 'More than Expected'},
left_list_column='Residual',
- background_color='#e5e5e3'
+ background_color='#e5e5e3',
+ line_coordinates=line_df.to_dict('records')
)
fn = 'demo_dispersion.html'
open(fn, 'w').write(html)
print('open ./%s in Chrome' % fn)
+
+
+residual_dispersion_df = dispersion_df.assign(
+ Expected=lambda df: Lowess().fit_predict(df.X.values, df.Y.values),
+ Y=lambda df: df.Y - df.Expected,
+ Ypos=lambda df: st.Scalers.scale(df.Y),
+ ColorScore=lambda df: st.Scalers.scale_center_zero_abs(df.Y)
+)
+
+line_df = pd.DataFrame({
+ 'x': dispersion_df.Xpos.values,
+ 'y': st.Scalers.scale(dispersion_df.Expected),
+}).sort_values(by='x')
+
+html = st.dataframe_scattertext(
+ corpus,
+ plot_df=residual_dispersion_df,
+ unified_context=False,
+ metadata=corpus.get_df()['speaker'] + ' (' + corpus.get_df()['party'].str.upper() + ')',
+ x_label='Log Frequency',
+ y_label='DA - E[DA] via Lowess',
+ y_axis_labels=['More Dispersion', 'Medium', 'Less Dispersion'],
+ color_score_column='ColorScore',
+ tooltip_columns=['Frequency', 'Y'],
+ tooltip_column_names={'Y': 'DA - E[DA]'},
+ header_names={'upper': 'Lower than Expected', 'lower': 'More than Expected'},
+ left_list_column='Residual',
+ background_color='#e5e5e3',
+ line_coordinates=line_df.to_dict('records')
+)
+
+fn = 'demo_dispersion_residual.html'
+open(fn, 'w').write(html)
+print('open ./%s in Chrome' % fn)
diff --git a/demo_label_coloring.py b/demo_label_coloring.py
index 97db0bc..60dbf17 100644
--- a/demo_label_coloring.py
+++ b/demo_label_coloring.py
@@ -18,17 +18,16 @@
MatchesQuery=lambda df: np.array([query.match(word) is not None for word in df.index]),
Frequency=lambda df: df.sum(axis=1),
TextColor=lambda df: [
- 'blue' if dem_query.match(term) is not None
- else 'red' if rep_query.match(term) is not None
- else 'rgb(200, 200, 200)'
+ '#1b4b5a' if dem_query.match(term) is not None
+ else '#d35c37' if rep_query.match(term) is not None
+ else '#d6c6b9'
for term in df.index
],
SuppressText=lambda df: df.apply(
lambda row: not (row.MatchesQuery or row.Frequency < 30),
axis=1
),
- PointColor=lambda df: df.TextColor,
- LabelPriority=lambda df: -(df.MatchesQuery).astype(int),
+ PointColor=lambda df: df.TextColor
)
html = st.produce_scattertext_explorer(
@@ -43,10 +42,12 @@
transform=st.Scalers.dense_rank,
max_overlapping=3,
term_metadata_df=term_metadata_df,
+ header_names={'right': 'Most Frequent'},
text_color_column='TextColor',
suppress_text_column='SuppressText',
color_column='PointColor',
- label_priority_column='LabelPriority'
+ label_priority_column='MatchesQuery',
+ right_order_column='Frequency'
)
fn = 'demo_label_coloring.html'
open(fn, 'w').write(html)
diff --git a/demo_tokenizer_roberta.py b/demo_tokenizer_roberta.py
new file mode 100644
index 0000000..c8dc614
--- /dev/null
+++ b/demo_tokenizer_roberta.py
@@ -0,0 +1,59 @@
+from transformers import RobertaTokenizerFast
+import scattertext as st
+
+tokenizer_fast = RobertaTokenizerFast.from_pretrained(
+ "roberta-base", add_prefix_space=True)
+tokenizer = st.RobertaTokenizerWrapper(tokenizer_fast)
+
+df = st.SampleCorpora.ConventionData2012.get_data().assign(
+ parse=lambda df: df.text.apply(tokenizer.tokenize)
+)
+
+corpus = st.OffsetCorpusFactory(
+ df,
+ category_col='party',
+ parsed_col='parse',
+ feat_and_offset_getter=st.TokenFeatAndOffsetGetter()
+).build()
+
+# Remove words that occur five times or fewer
+corpus = corpus.remove_infrequent_words(5, non_text=True)
+
+plot_df = corpus.get_metadata_freq_df('').assign(
+ Y=lambda df: df.democrat,
+ X=lambda df: df.republican,
+ Ypos=lambda df: st.Scalers.dense_rank(df.Y),
+ Xpos=lambda df: st.Scalers.dense_rank(df.X),
+ SuppressDisplay=False,
+ ColorScore=lambda df: st.Scalers.scale_center_zero(df.Ypos - df.Xpos),
+)
+
+html = st.dataframe_scattertext(
+ corpus,
+ plot_df=plot_df,
+ category='democrat',
+ category_name='Democratic',
+ not_category_name='Republican',
+ width_in_pixels=1000,
+ suppress_text_column='SuppressDisplay',
+ metadata=corpus.get_df()['speaker'],
+ use_non_text_features=True,
+ ignore_categories=False,
+ use_offsets=True,
+ unified_context=False,
+ color_score_column='ColorScore',
+ left_list_column='ColorScore',
+ y_label='Democrats',
+ x_label='Republicans',
+ header_names={'upper': 'Top Democratic', 'lower': 'Top Republican', 'right': 'Most Frequent'},
+ subword_encoding='RoBERTa'
+)
+
+fn = 'roberta_sentence_piece.html'
+with open(fn, 'w') as of:
+ of.write(html)
+
+print("Open ./" + fn + ' in Chrome.')
diff --git a/scattertext/CorpusWithoutCategoriesFromParsedDocuments.py b/scattertext/CorpusWithoutCategoriesFromParsedDocuments.py
index 579aa16..fdd4543 100644
--- a/scattertext/CorpusWithoutCategoriesFromParsedDocuments.py
+++ b/scattertext/CorpusWithoutCategoriesFromParsedDocuments.py
@@ -34,5 +34,8 @@ def build(self):
while category_col in self.df:
category_col = 'Category_' + ''.join(np.random.choice(string.ascii_letters) for _ in range(5))
return CorpusFromParsedDocuments(
- self.df.assign(**{category_col: '_'}), category_col, self.parsed_col
+ self.df.assign(**{category_col: '_'}),
+ category_col,
+ self.parsed_col,
+ feats_from_spacy_doc=self.feats_from_spacy_doc,
).build()
\ No newline at end of file
diff --git a/scattertext/OffsetCorpus.py b/scattertext/OffsetCorpus.py
new file mode 100644
index 0000000..d12c6a1
--- /dev/null
+++ b/scattertext/OffsetCorpus.py
@@ -0,0 +1,71 @@
+from scattertext.DataFrameCorpus import DataFrameCorpus
+from scattertext.ParsedCorpus import ParsedDataFrameCorpus
+
+
+class OffsetCorpus(ParsedDataFrameCorpus):
+ def __init__(self,
+ df,
+ X,
+ mX,
+ y,
+ term_idx_store,
+ category_idx_store,
+ metadata_idx_store,
+ parsed_col,
+ category_col,
+ term_offsets,
+ metadata_offsets,
+ unigram_frequency_path=None):
+ self._term_offsets = term_offsets
+ self._metadata_offsets = metadata_offsets
+ ParsedDataFrameCorpus.__init__(self, parsed_col, category_col)
+ DataFrameCorpus.__init__(self, X, mX, y, term_idx_store, category_idx_store,
+ metadata_idx_store,
+ df[self._parsed_col],
+ df,
+ unigram_frequency_path)
+
+ def get_offsets(self):
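+ # {metadata term: {document index: [(start, end), ...]}}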
+ return self._metadata_offsets
+
+ def _make_new_term_doc_matrix(self,
+ new_X=None,
+ new_mX=None,
+ new_y=None,
+ new_term_idx_store=None,
+ new_category_idx_store=None,
+ new_metadata_idx_store=None,
+ new_y_mask=None,
+ new_df=None,
+ new_term_offsets=None,
+ new_metadata_offsets=None):
+
+ X, mX, y = self._update_X_mX_y(new_X, new_mX, new_y, new_y_mask)
+ metadata_offsets, term_offsets = self._update_offsets(new_metadata_idx_store, new_metadata_offsets,
+ new_term_idx_store, new_term_offsets)
+
+ return OffsetCorpus(
+ X=X,
+ mX=mX,
+ y=y,
+ parsed_col=self._parsed_col,
+ category_col=self._category_col,
+ term_idx_store=new_term_idx_store if new_term_idx_store is not None else self._term_idx_store,
+ category_idx_store=new_category_idx_store if new_category_idx_store is not None \
+ else self._category_idx_store,
+ metadata_idx_store=new_metadata_idx_store if new_metadata_idx_store is not None \
+ else self._metadata_idx_store,
+ df=self._apply_mask_to_df(new_y_mask, new_df),
+ term_offsets=term_offsets,
+ metadata_offsets=metadata_offsets,
+ unigram_frequency_path=self._unigram_frequency_path,
+ )
+
+ def _update_offsets(self, new_metadata_idx_store, new_metadata_offsets, new_term_idx_store, new_term_offsets):
+ term_offsets = self._term_offsets if new_term_offsets is None else new_term_offsets
+ metadata_offsets = self._metadata_offsets if new_metadata_offsets is None else new_metadata_offsets
+ if new_term_idx_store is not None:
+ term_offsets = {k: term_offsets[k] for k in new_term_idx_store.values()}
+ if new_metadata_idx_store is not None:
+ metadata_offsets = {k: metadata_offsets[k] for k in new_metadata_idx_store.values()}
+ return metadata_offsets, term_offsets
diff --git a/scattertext/OffsetCorpusFactory.py b/scattertext/OffsetCorpusFactory.py
new file mode 100644
index 0000000..40b133d
--- /dev/null
+++ b/scattertext/OffsetCorpusFactory.py
@@ -0,0 +1,88 @@
+import string
+
+import numpy as np
+
+from scattertext.OffsetCorpus import OffsetCorpus
+
+from scattertext.CSRMatrixTools import CSRMatrixFactory
+from scattertext.indexstore.IndexStore import IndexStore
+
+
+class OffsetCorpusFactory(object):
+ def __init__(self,
+ df,
+ parsed_col,
+ feat_and_offset_getter,
+ category_col=None):
+
+ '''
+ Parameters
+ ----------
+ df : pd.DataFrame
+ contains category_col and parsed_col, where parsed_col consists entirely of parsed documents
+ parsed_col : str
+ name of the parsed-document column in df
+ feat_and_offset_getter : FeatAndOffsetGetter
+ yields (feature, (count, [(start, end), ...])) pairs for terms and metadata
+ category_col : str, Optional
+ name of category column in df; if None, all category names will be '_'
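+
+ Example (cf. demo_tokenizer_roberta.py; imports omitted):
+
+ >>> corpus = OffsetCorpusFactory(
+ ... df,
+ ... parsed_col='parse',
+ ... category_col='party',
+ ... feat_and_offset_getter=TokenFeatAndOffsetGetter()
+ ... ).build()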
+ '''
+ self._df = df.reset_index()
+ self._category_col = category_col
+ self._parsed_col = parsed_col
+ self._category_idx_store = IndexStore()
+ self._X_factory = CSRMatrixFactory()
+ self._mX_factory = CSRMatrixFactory()
+ self._term_idx_store = IndexStore()
+ self._metadata_idx_store = IndexStore()
+ self._feat_and_offset_getter = feat_and_offset_getter
+ self._term_offsets = {}
+ self._metadata_offsets = {}
+
+ def build(self):
+ '''Constructs the term doc matrix.
+
+ Returns
+ -------
+ scattertext.OffsetCorpus.OffsetCorpus
+ '''
+ self._ensure_category_col_is_in_df()
+
+ y = self._get_y_and_populate_category_idx_store(self._df[self._category_col])
+ self._df.apply(self._add_to_x_factory, axis=1)
+ return OffsetCorpus(
+ df=self._df,
+ X=self._X_factory.set_last_row_idx(len(y) - 1).get_csr_matrix(),
+ mX=self._mX_factory.set_last_row_idx(len(y) - 1).get_csr_matrix(),
+ y=y,
+ term_idx_store=self._term_idx_store,
+ category_idx_store=self._category_idx_store,
+ metadata_idx_store=self._metadata_idx_store,
+ parsed_col=self._parsed_col,
+ category_col=self._category_col,
+ term_offsets=self._term_offsets,
+ metadata_offsets=self._metadata_offsets
+ )
+
+ def _ensure_category_col_is_in_df(self):
+ if self._category_col not in self._df:
+ self._category_col = 'Category'
+ while self._category_col in self._df:
+ self._category_col = 'Category_' + ''.join(np.random.choice(string.ascii_letters) for _ in range(5))
+
+ def _get_y_and_populate_category_idx_store(self, categories):
+ return np.array(categories.apply(self._category_idx_store.getidx))
+
+ def _add_to_x_factory(self, row):
+ parsed_text = row[self._parsed_col]
+ for term, (count, offsets) in self._feat_and_offset_getter.get_term_offsets(parsed_text):
+ term_idx = self._term_idx_store.getidx(term)
+ self._X_factory[row.name, term_idx] = count
+ if offsets is not None:
+ self._term_offsets.setdefault(term, {}).setdefault(row.name, []).extend(offsets)
+
+ for meta, (val, offsets) in self._feat_and_offset_getter.get_metadata_offsets(parsed_text):
+ meta_idx = self._metadata_idx_store.getidx(meta)
+ self._mX_factory[row.name, meta_idx] = val
+ if offsets is not None:
+ self._metadata_offsets.setdefault(meta, {}).setdefault(row.name, []).extend(offsets)
diff --git a/scattertext/ParsedCorpus.py b/scattertext/ParsedCorpus.py
index a3c85c4..78bc9cc 100644
--- a/scattertext/ParsedCorpus.py
+++ b/scattertext/ParsedCorpus.py
@@ -6,39 +6,10 @@
from scattertext.indexstore.IndexStore import IndexStore
-class ParsedCorpus(DataFrameCorpus):
- def __init__(self,
- df,
- X,
- mX,
- y,
- term_idx_store,
- category_idx_store,
- metadata_idx_store,
- parsed_col,
- category_col,
- unigram_frequency_path=None):
- '''
-
- Parameters
- ----------
- convention_df pd.DataFrame, contains parsed_col and metadata
- X, csr_matrix
- mX csr_matrix
- y, np.array
- term_idx_store, IndexStore
- category_idx_store, IndexStore
- parsed_col str, column in convention_df containing parsed documents
- category_col str, columns in convention_df containing category
- unigram_frequency_path str, None by default, path of unigram counts file
- '''
+class ParsedDataFrameCorpus(DataFrameCorpus):
+ def __init__(self, parsed_col, category_col):
self._parsed_col = parsed_col
self._category_col = category_col
- DataFrameCorpus.__init__(self, X, mX, y, term_idx_store, category_idx_store,
- metadata_idx_store,
- df[self._parsed_col],
- df,
- unigram_frequency_path)
def get_texts(self):
'''
@@ -88,6 +59,49 @@ def term_group_freq_df(self, group_col):
catX = self._change_document_type_in_matrix(newX, category_row)
return self._term_freq_df_from_matrix(catX)
+
+ def _get_group_docids_and_index_store(self, X, group_col, group_idx_store):
+ row_group_cat = X.tocoo().row
+ group_idx_to_cat_idx = {}
+ for doc_idx, row in self._df.iterrows():
+ group_idx = group_idx_store.getidx(row[group_col] + '-' + row[self._category_col])
+ row_group_cat[row_group_cat == doc_idx] = group_idx
+ group_idx_to_cat_idx[group_idx] = self._y[doc_idx]
+ return group_idx_to_cat_idx, row_group_cat
+
+class ParsedCorpus(ParsedDataFrameCorpus):
+ def __init__(self,
+ df,
+ X,
+ mX,
+ y,
+ term_idx_store,
+ category_idx_store,
+ metadata_idx_store,
+ parsed_col,
+ category_col,
+ unigram_frequency_path=None):
+ '''
+
+ Parameters
+ ----------
+ convention_df pd.DataFrame, contains parsed_col and metadata
+ X, csr_matrix
+ mX csr_matrix
+ y, np.array
+ term_idx_store, IndexStore
+ category_idx_store, IndexStore
+ parsed_col str, column in convention_df containing parsed documents
+ category_col str, columns in convention_df containing category
+ unigram_frequency_path str, None by default, path of unigram counts file
+ '''
+ ParsedDataFrameCorpus.__init__(self, parsed_col, category_col)
+ DataFrameCorpus.__init__(self, X, mX, y, term_idx_store, category_idx_store,
+ metadata_idx_store,
+ df[self._parsed_col],
+ df,
+ unigram_frequency_path)
+
def _make_new_term_doc_matrix(self,
new_X=None,
new_mX=None,
@@ -111,12 +125,3 @@ def _make_new_term_doc_matrix(self,
df=self._apply_mask_to_df(new_y_mask, new_df),
unigram_frequency_path=self._unigram_frequency_path
)
-
- def _get_group_docids_and_index_store(self, X, group_col, group_idx_store):
- row_group_cat = X.tocoo().row
- group_idx_to_cat_idx = {}
- for doc_idx, row in self._df.iterrows():
- group_idx = group_idx_store.getidx(row[group_col] + '-' + row[self._category_col])
- row_group_cat[row_group_cat == doc_idx] = group_idx
- group_idx_to_cat_idx[group_idx] = self._y[doc_idx]
- return group_idx_to_cat_idx, row_group_cat
diff --git a/scattertext/Scalers.py b/scattertext/Scalers.py
index 5a00232..0df5c9b 100644
--- a/scattertext/Scalers.py
+++ b/scattertext/Scalers.py
@@ -3,8 +3,10 @@
from scipy.stats import rankdata
-def scale(vec, terms=None, other_vec=None):
- return (vec - vec.min()) / (vec.max() - vec.min())
+def scale(vec, other_vec=None):
+ if other_vec is None:
+ other_vec = vec
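+ # When other_vec is given, it is scaled against vec's min/max, so values outside vec's range fall outside [0, 1]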
+ return (other_vec - vec.min()) / (vec.max() - vec.min())
class Coordinates:
@@ -36,14 +38,18 @@ def rotate_radians(y, x, radians):
x * np.sin(radians) + y * np.cos(radians)
)
-def scale_center_zero(vec):
- return ((((vec > 0).astype(float) * (vec / vec.max())) * 0.5 + 0.5)
- + ((vec < 0).astype(float) * (vec / (-vec.min())) * 0.5))
+def scale_center_zero(vec, other_vec=None):
+ if other_vec is None:
+ other_vec = vec
+ return ((((other_vec > 0).astype(float) * (other_vec / vec.max())) * 0.5 + 0.5)
+ + ((other_vec < 0).astype(float) * (other_vec / (-vec.min())) * 0.5))
-def scale_center_zero_abs(vec):
+def scale_center_zero_abs(vec, other_vec=None):
+ if other_vec is None:
+ other_vec = vec
max_abs = max(vec.max(), -vec.min())
- return ((((vec > 0).astype(float) * (vec / max_abs)) * 0.5 + 0.5)
- + ((vec < 0).astype(float) * (vec / max_abs) * 0.5))
+ return ((((other_vec > 0).astype(float) * (other_vec / max_abs)) * 0.5 + 0.5)
+ + ((other_vec < 0).astype(float) * (other_vec / max_abs) * 0.5))
def scale_neg_1_to_1_with_zero_mean_abs_max(vec):
diff --git a/scattertext/ScatterChart.py b/scattertext/ScatterChart.py
index 4b22a3a..4442158 100644
--- a/scattertext/ScatterChart.py
+++ b/scattertext/ScatterChart.py
@@ -217,6 +217,7 @@ def to_dict(self,
neutral_categories=None,
extra_categories=None,
background_scorer=None,
+ use_offsets=False,
**kwargs):
'''
@@ -392,6 +393,10 @@ def better_title(x):
for term_obj in j['data']:
if term_obj['term'] in self.hidden_terms:
term_obj['display'] = False
+
+ if use_offsets:
+ j['offsets'] = self.term_doc_matrix.get_offsets()
+
return j
def _add_x_and_y_coords_to_term_df_if_injected(self, df):
diff --git a/scattertext/ScatterChartExplorer.py b/scattertext/ScatterChartExplorer.py
index 144c625..425c970 100644
--- a/scattertext/ScatterChartExplorer.py
+++ b/scattertext/ScatterChartExplorer.py
@@ -1,6 +1,7 @@
import logging
import numpy as np
+import pandas as pd
from scattertext import ScatterChart, TermCategoryFrequencies, ParsedCorpus, CorpusDF
from scattertext.Scalers import percentile_alphabetical
@@ -16,7 +17,7 @@ def __init__(self,
'''See ScatterChart. This lets you click on terms to see what contexts they tend to appear in.
Running the `to_dict` function outputs
'''
- #if not (isinstance(corpus, (Corpus, ParsedCorpus, CorpusDF, TermCategoryFrequencies))
+ # if not (isinstance(corpus, (Corpus, ParsedCorpus, CorpusDF, TermCategoryFrequencies))
# or (issubclass(type(corpus), (Corpus, ParsedCorpus, CorpusDF, TermCategoryFrequencies)))):
# raise AssertionError(corpus, 'of type', type(corpus),
# 'must be a subclass of Corpus or TermCategoryFrequencies.')
@@ -40,6 +41,7 @@ def to_dict(self,
extra_category_name=None,
background_scorer=None,
include_term_category_counts=False,
+ use_offsets=False,
**kwargs):
'''
@@ -120,7 +122,8 @@ def to_dict(self,
not_categories=not_categories,
neutral_categories=neutral_categories,
extra_categories=extra_categories,
- background_scorer=background_scorer)
+ background_scorer=background_scorer,
+ use_offsets=use_offsets)
docs_getter = self._make_docs_getter(max_docs_per_category, alternative_text_field)
if neutral_category_name is None:
neutral_category_name = 'Neutral'
@@ -142,14 +145,17 @@ def _get_term_doc_counts(self, terms):
else:
term_doc_counts = self.term_doc_matrix.get_term_doc_count_df('').loc[terms]
term_doc_freq = self.term_doc_matrix.get_term_freq_df('').loc[terms]
- # this can possibly be vectorized
- for category_i, category in enumerate(term_doc_freq.columns):
- category_counts = {}
- for term_i, val in enumerate(term_doc_freq[category].values):
- if val > 0:
- category_counts[term_i] = [val, term_doc_counts.iloc[term_i, category_i]]
- term_counts.append(category_counts)
+ # Partially vectorized; the loop over categories could still be eliminated
+ term2idx = pd.Series(np.arange(len(terms)), index=terms)
+ for category_i, category in enumerate(self.term_doc_matrix.get_categories()):
+ term_ser = term_doc_freq[category]
+ doc_ser = term_doc_counts[category]
+ term_ser = term_ser[term_ser.values > 0]
+ doc_ser = doc_ser[doc_ser.values > 0]
+ category_counts = pd.Series(np.array([term_ser.values, doc_ser.values]).T.tolist(),
+ index=term2idx[term_ser.index].values).to_dict()
+ term_counts.append(category_counts)
return term_counts
def _make_docs_getter(self, max_docs_per_category, alternative_text_field):
@@ -186,7 +192,6 @@ def inject_term_metadata(self, metadata):
self._term_metadata = metadata
return self
-
def inject_term_metadata_df(self, metadata_df):
'''
diff --git a/scattertext/TermDocMatrix.py b/scattertext/TermDocMatrix.py
index cb1464d..7a8bdfc 100644
--- a/scattertext/TermDocMatrix.py
+++ b/scattertext/TermDocMatrix.py
@@ -781,7 +781,7 @@ def use_external_metadata_lists(self, metadata_lists):
new_y_mask=self._y == self._y
)
- def use_doc_labeled_terms_as_metadata(self, doc_labels, separator='_'):
+ def use_doc_labeled_terms_as_metadata(self, doc_labels, separator='_', replace_metadata=True):
'''
Makes the metadata of a new TermDocMatrix a copy of the term-document matrix, except each term is prefixed
by its document's label followed by the separator.
@@ -800,10 +800,14 @@ def use_doc_labeled_terms_as_metadata(self, doc_labels, separator='_'):
new_meta_X = None
ordered_doc_labels = list(sorted(set(doc_labels)))
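+ # With replace_metadata (the default), copy features from the metadata matrix; otherwise, from the term matrix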
+ X = self._X
+ if replace_metadata:
+ X = self._mX
+
for doc_label in ordered_doc_labels:
label_doc_mask = doc_labels == doc_label
- label_X = self._X[label_doc_mask, :]
- label_term_mask = (label_X.sum(axis=0) > 0).A1
+ label_X = X[label_doc_mask, :]
+ label_term_mask = (label_X.sum(axis=0) > 0).A1
label_X = label_X[:, label_term_mask]
cols_to_pad = len(new_metadata_list)
diff --git a/scattertext/TermDocMatrixWithoutCategories.py b/scattertext/TermDocMatrixWithoutCategories.py
index be3377d..9607b1a 100644
--- a/scattertext/TermDocMatrixWithoutCategories.py
+++ b/scattertext/TermDocMatrixWithoutCategories.py
@@ -205,14 +205,18 @@ def _get_relevant_X(self, non_text):
def _get_relevant_idx_store(self, non_text):
return self._metadata_idx_store if non_text else self._term_idx_store
- def remove_infrequent_words(self, minimum_term_count, term_ranker=AbsoluteFrequencyRanker):
+ def remove_infrequent_words(self, minimum_term_count, term_ranker=AbsoluteFrequencyRanker, non_text=False):
'''
Returns
-------
A new TermDocumentMatrix consisting of only terms which occur at least minimum_term_count.
'''
- tdf = term_ranker(self).get_ranks().sum(axis=1)
- return self.remove_terms(list(tdf[tdf <= minimum_term_count].index))
+ ranker = term_ranker(self)
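+ # With non_text=True, rank and remove metadata features rather than text terms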
+ if non_text:
+ ranker = ranker.use_non_text_features()
+ tdf = ranker.get_ranks().sum(axis=1)
+
+ return self.remove_terms(list(tdf[tdf <= minimum_term_count].index), non_text=non_text)
def remove_entity_tags(self):
'''
diff --git a/scattertext/__init__.py b/scattertext/__init__.py
index c374fac..2bd3e4f 100644
--- a/scattertext/__init__.py
+++ b/scattertext/__init__.py
@@ -1,8 +1,6 @@
from __future__ import print_function
-from scattertext.dispersion.Dispersion import Dispersion
-
-version = [0, 1, 0, 0]
+version = [0, 1, 1]
__version__ = '.'.join([str(e) for e in version])
import re
@@ -40,7 +38,7 @@
from scattertext.TermCategoryFrequencies import TermCategoryFrequencies
from scattertext.features.FeatsFromTopicModel import FeatsFromTopicModel
from scattertext.termscoring.BM25Difference import BM25Difference
-from scattertext import SampleCorpora, SampleLexicons
+from scattertext import SampleCorpora, SampleLexicons, smoothing
from scattertext import Scalers, ScatterChart
from scattertext import termranking
from scattertext.AsianNLP import chinese_nlp, japanese_nlp
@@ -125,6 +123,13 @@
from scattertext.helpers.MakeUnique import make_unique
from scattertext.viz.TermInfo import get_tooltip_js_function, get_custom_term_info_js_function
from scattertext.CorpusWithoutCategoriesFromParsedDocuments import CorpusWithoutCategoriesFromParsedDocuments
+from scattertext.OffsetCorpus import OffsetCorpus
+from scattertext.OffsetCorpusFactory import OffsetCorpusFactory
+from scattertext.dispersion.Dispersion import Dispersion
+from scattertext.features import featoffsets
+from scattertext.features.featoffsets.feat_and_offset_getter import FeatAndOffsetGetter
+from scattertext.features.featoffsets.token_and_feat_offset_getter import TokenFeatAndOffsetGetter
+from scattertext.tokenizers.roberta import RobertaTokenizerWrapper
PhraseFeatsFromTopicModel = FeatsFromTopicModel # Ensure backwards compatibility
@@ -242,6 +247,10 @@ def produce_scattertext_explorer(corpus,
background_color=None,
left_list_column=None,
censor_point_column=None,
+ right_order_column=None,
+ line_coordinates=None,
+ subword_encoding=None,
+ use_offsets=False,
return_data=False,
return_scatterplot_structure=False):
'''Returns html code of visualization.
@@ -455,7 +464,7 @@ def produce_scattertext_explorer(corpus,
Dict mapping terms to dictionaries containing additional information which can be used in the color_func
or the get_tooltip_content function. These will appear in termDict.etc
term_metadata_df : pd.DataFrame, None by default
- Datframe version of term_metadata
+ Dataframe version of term_metadata
include_all_contexts: bool, default False
Include all contexts, even non-matching ones, in interface
max_overlapping: int, default -1
@@ -493,11 +502,19 @@ def produce_scattertext_explorer(corpus,
color_score_column: str, default None
column in term_metadata df; contains value between 0 and 1 which will be used to assign a color
label_priority_column : str, default None
- Column in term_metadata_df; smaller values in the column indicate a term should be labeled first
+ Column in term_metadata_df; larger values in the column indicate a term should be labeled first
censor_point_column : str, default None
Should we allow labels to be drawn over point?
+ right_order_column : str, default None
+ Order for right column ("characteristic" by default); largest first
background_color : str, default None
Changes document.body's background color to background_color
+ line_coordinates : list, default None
+ Coordinates for drawing a line under the plot
+ subword_encoding : str, default None
+ Type of subword encoding to use; None by default. Currently "RoBERTa" is the only supported value.
+ use_offsets : bool, default False
+ Enable the use of metadata offsets
return_data : bool default False
Return a dict containing the output of `ScatterChartExplorer.to_dict` instead of
an html.
@@ -590,8 +607,12 @@ def produce_scattertext_explorer(corpus,
extra_categories=extra_categories,
background_scorer=characteristic_scorer,
include_term_category_counts=include_term_category_counts,
+ use_offsets=use_offsets
)
+ if line_coordinates is not None:
+ scatter_chart_data['line'] = line_coordinates
+
if return_data:
return scatter_chart_data
@@ -618,7 +639,7 @@ def produce_scattertext_explorer(corpus,
if color_score_column:
assert color_func is None
- color_func = '(function(d) {return d3.interpolateWarm(d.etc["%s"])})' % color_score_column
+ color_func = '(function(d) {return d3.interpolateRdYlBu(d.etc["%s"])})' % color_score_column
if header_sorting_algos is not None:
assert 'upper' not in header_sorting_algos
@@ -627,8 +648,8 @@ def produce_scattertext_explorer(corpus,
assert term_metadata_df is not None
assert left_list_column in term_metadata_df
header_sorting_algos = {
- "upper": '((a,b) => b.etc["'+left_list_column+'"] - a.etc["'+left_list_column+'"])',
- "lower": '((a,b) => a.etc["'+left_list_column+'"] - b.etc["'+left_list_column+'"])'
+ "upper": '((a,b) => b.etc["' + left_list_column + '"] - a.etc["' + left_list_column + '"])',
+ "lower": '((a,b) => a.etc["' + left_list_column + '"] - b.etc["' + left_list_column + '"])'
}
scatterplot_structure = ScatterplotStructure(VizDataAdapter(scatter_chart_data),
@@ -692,7 +713,9 @@ def produce_scattertext_explorer(corpus,
text_color_column=text_color_column,
suppress_text_column=suppress_text_column,
background_color=background_color,
- censor_point_column=censor_point_column)
+ censor_point_column=censor_point_column,
+ right_order_column=right_order_column,
+ subword_encoding=subword_encoding)
if return_scatterplot_structure:
return scatterplot_structure
@@ -1903,25 +1926,22 @@ def dataframe_scattertext(
kwargs['tooltip_columns'] = ['Xpos', 'Ypos']
kwargs['tooltip_column_names'] = {'Xpos': kwargs.get('x_label', 'X'), 'Ypos': kwargs.get('y_label', 'Y')}
- #kwargs.setdefault('color_func',
- # "(function(d) {return d.etc['Color']})" if 'Color' in plot_df else None)
kwargs.setdefault('metadata', None),
kwargs.setdefault('scores', plot_df['Score'] if 'Score' in plot_df else 0),
kwargs.setdefault('minimum_term_frequency', 0)
kwargs.setdefault('pmi_threshold_coefficient', 0)
kwargs.setdefault('category', corpus.get_categories()[0])
- kwargs.setdefault('original_x', plot_df['X'])
- kwargs.setdefault('original_y', plot_df['Y'])
- kwargs.setdefault('x_coords', plot_df['Xpos'])
- kwargs.setdefault('y_coords', plot_df['Ypos'])
+ kwargs.setdefault('original_x', plot_df['X'].values)
+ kwargs.setdefault('original_y', plot_df['Y'].values)
+ kwargs.setdefault('x_coords', plot_df['Xpos'].values)
+ kwargs.setdefault('y_coords', plot_df['Ypos'].values)
kwargs.setdefault('use_global_scale', True)
kwargs.setdefault('ignore_categories', True)
- kwargs.setdefault('show_axes_and_cross_hairs', 1)
+ kwargs.setdefault('show_axes_and_cross_hairs', 0)
kwargs.setdefault('unified_context', 1)
kwargs.setdefault('show_top_terms', False)
kwargs.setdefault('x_label', 'X')
kwargs.setdefault('y_label', 'Y')
-
return produce_scattertext_explorer(
corpus,
term_metadata_df=plot_df,
diff --git a/scattertext/categoryprojector/CategoryProjector.py b/scattertext/categoryprojector/CategoryProjector.py
index 4d94ddb..e4aba08 100644
--- a/scattertext/categoryprojector/CategoryProjector.py
+++ b/scattertext/categoryprojector/CategoryProjector.py
@@ -132,9 +132,9 @@ def normalize(self, weighted_category_counts):
return weighted_category_counts
def select(self, corpus):
- if self.selector_ is not None:
- corpus = corpus.select(self.selector_)
- return corpus
+ if self.selector_ is None:
+ return corpus
+ return corpus.select(self.selector_)
def _project_category_corpus(self, category_corpus, x_dim=0, y_dim=1):
normalized_counts = self.get_category_embeddings(category_corpus)
diff --git a/scattertext/categoryprojector/pairplot.py b/scattertext/categoryprojector/pairplot.py
index 6234d2d..378a563 100644
--- a/scattertext/categoryprojector/pairplot.py
+++ b/scattertext/categoryprojector/pairplot.py
@@ -32,10 +32,12 @@ def produce_category_focused_pairplot(corpus,
'''
category_num = corpus.get_categories().index(category)
-
- uncorrelated_components_projection = (category_projector.project(corpus)
- if category_projection is None
- else category_projection)
+ uncorrelated_components_projection = category_projection
+ if category_projection is None:
+ if 'use_metadata' in kwargs and kwargs['use_metadata']:
+ uncorrelated_components_projection = category_projector.project_with_metadata(corpus)
+ else:
+ uncorrelated_components_projection = category_projector.project(corpus)
distances = cosine_distances(uncorrelated_components_projection.get_category_embeddings().T)
@@ -112,8 +114,8 @@ def produce_pairplot(corpus,
category_tooltip_func = '(function(d) {return d.term})'
- initial_category_idx = corpus.get_categories().index(initial_category)
- term_plot_change_func = _get_term_plot_change_js_func(wordfish_style, category_focused, initial_category_idx)
+ #initial_category_idx = corpus.get_categories().index(initial_category)
+ term_plot_change_func = _get_term_plot_change_js_func(wordfish_style, category_focused, initial_category)
category_scatterplot_structure = ScatterplotStructure(
VizDataAdapter(category_scatter_chart_data),
@@ -274,11 +276,11 @@ def _get_category_scatter_chart_explorer(category_projection, scaler, term_ranke
return category_scatter_chart_explorer
-def _get_term_plot_change_js_func(wordfish_style, category_focused, initial_category_idx):
+def _get_term_plot_change_js_func(wordfish_style, category_focused, initial_category):
if wordfish_style:
return '(function (termInfo) {termPlotInterface.yAxisLogCounts(termInfo.term); return false;})'
if category_focused:
- return '(function (termInfo) {termPlotInterface.drawCategoryAssociation(%s, termInfo.i); return false;})' % (
- initial_category_idx
- )
- return '(function (termInfo) {termPlotInterface.drawCategoryAssociation(termInfo.i); return false;})'
+ return '(function (termInfo) {termPlotInterface.drawCategoryAssociation("%s", termInfo.term); return false;})' \
+ % (initial_category.replace('"', '\\"'))
+ #return '(function (termInfo) {termPlotInterface.drawCategoryAssociation(termInfo.i); return false;})'
+ return '(function (termInfo) {termPlotInterface.drawCategoryAssociation(termInfo.term); return false;})'
diff --git a/scattertext/data/viz/scripts/main.js b/scattertext/data/viz/scripts/main.js
index c42107b..64431d2 100644
--- a/scattertext/data/viz/scripts/main.js
+++ b/scattertext/data/viz/scripts/main.js
@@ -61,7 +61,15 @@ buildViz = function (d3) {
suppressTextColumn = undefined,
backgroundColor = undefined,
censorPointColumn = undefined,
+ rightOrderColumn = undefined,
+ subwordEncoding = null
) {
+ function formatTermForDisplay(term) {
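+ // Replace RoBERTa's word-start marker Ġ (char code 288 or 289) with '_' for display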
+ if (subwordEncoding === 'RoBERTa' && (term.charCodeAt(0) === 288 || term.charCodeAt(0) === 289))
+ term = '_' + term.substr(1, term.length - 1);
+ return term;
+ }
+
//var divName = 'd3-div-1';
// Set the dimensions of the canvas / graph
var padding = {top: 30, right: 20, bottom: 30, left: 50};
@@ -251,9 +259,8 @@ buildViz = function (d3) {
// setup fill color
if (color == null) {
color = d3.interpolateRdYlBu;
- //color = d3.interpolateWarm;
}
- if((headerNames !== undefined && headerNames !== null)
+ if ((headerNames !== undefined && headerNames !== null)
&& (headerSortingAlgos !== undefined && headerSortingAlgos !== null)) {
showTopTerms = true;
}
@@ -378,11 +385,14 @@ buildViz = function (d3) {
function getDenseRanks(fullData, categoryNum) {
+ console.log("GETTING DENSE RANKS")
+ console.log("CAT NUM " + categoryNum)
+ console.log(fullData)
+
var fgFreqs = Array(fullData.data.length).fill(0);
var bgFreqs = Array(fullData.data.length).fill(0);
var categoryTermCounts = fullData.termCounts[categoryNum];
-
Object.keys(categoryTermCounts).forEach(
key => fgFreqs[key] = categoryTermCounts[key][0]
)
@@ -410,7 +420,11 @@ buildViz = function (d3) {
x => (x - minbgDenseRanks) / (maxbgDenseRanks - minbgDenseRanks)
)
- return {'fg': scalefgDenseRanks, 'bg': scalebgDenseRanks, 'bgFreqs': bgFreqs, 'fgFreqs': fgFreqs}
+ return {'fg': scalefgDenseRanks,
+ 'bg': scalebgDenseRanks,
+ 'bgFreqs': bgFreqs,
+ 'fgFreqs': fgFreqs,
+ 'term': fullData.data.map((x)=>x.term)}
}
function getCategoryDenseRankScores(fullData, categoryNum) {
@@ -700,20 +714,50 @@ buildViz = function (d3) {
continue;
}
var text = fullData.docs.texts[i];
- if (!useFullDoc)
- text = text.slice(0, 300);
- if (pattern !== null) {
- text = text.replace(pattern, '<b>$&</b>');
- }
- var curMatch = {
- 'id': i,
- 'snippets': [text],
- 'strength': strength,
- 'docLabel': docLabel,
- 'meta': fullData.docs.meta ? fullData.docs.meta[i] : ""
- }
- matches[numericLabel].push(curMatch);
+ if (fullData.offsets !== undefined) {
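+ // Offset mode: fullData.offsets maps term -> doc id -> a list of [start, end)
+ // character spans; build a snippet of +/-50 characters around each span.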
+
+ if (fullData.offsets[term] !== undefined && fullData.offsets[term][i] !== undefined) {
+ var curMatch = {
+ 'id': i,
+ 'snippets': [],
+ 'strength': strength,
+ 'docLabel': docLabel,
+ 'meta': fullData.docs.meta ? fullData.docs.meta[i] : ""
+ }
+ for (const offset_i in fullData.offsets[term][i]) {
+ var offset = fullData.offsets[term][i][offset_i];
+ var spanStart = Math.max(offset[0] - 50, 0);
+ var spanEnd = Math.min(50, text.length-offset[1]);
+ var leftContext = text.substr(spanStart, offset[0] - spanStart);
+ var matchStr = text.substr(offset[0], offset[1] - offset[0]);
+ var rightContext = text.substr(offset[1], spanEnd);
+ var snippet = leftContext + '<b>' + matchStr + '</b>' + rightContext;
+ if (spanStart > 0)
+ snippet = '...' + snippet;
+ if (text.length - offset[1] > 50)
+ snippet = snippet + '...';
+ curMatch.snippets.push(snippet);
+ }
+ matches[numericLabel].push(curMatch);
+ }
+ } else {
+
+ if (!useFullDoc)
+ text = text.slice(0, 300);
+ if (pattern !== null) {
+ text = text.replace(pattern, '<b>$&</b>');
+ }
+ var curMatch = {
+ 'id': i,
+ 'snippets': [text],
+ 'strength': strength,
+ 'docLabel': docLabel,
+ 'meta': fullData.docs.meta ? fullData.docs.meta[i] : ""
+ }
+
+ matches[numericLabel].push(curMatch);
+ }
}
}
for (var i in [0, 1]) {
@@ -991,8 +1035,6 @@ buildViz = function (d3) {
if (max_snippets != null) {
var contextsToDisplay = contexts[catIndex].slice(0, max_snippets);
}
- console.log("CATCAT")
- console.log(catName, catIndex)
//var divId = catName == catInternalName ? '#cat' : '#notcat';
var divId = null
if (fullData.info.category_internal_name == catName) {
@@ -1006,8 +1048,6 @@ buildViz = function (d3) {
} else {
return;
}
- console.log('divid');
- console.log(divId)
var temp = d3.select(divId).selectAll("div").remove();
contexts[catIndex].forEach(function (context) {
@@ -1028,14 +1068,11 @@ buildViz = function (d3) {
d3.select('#' + divName + '-' + 'termstats')
.selectAll("div")
.remove();
- var termHtml = 'Term: <b>' + info.term + '</b>';
+ var termHtml = 'Term: <b>' + formatTermForDisplay(info.term) + '</b>';
if ('metalists' in fullData && info.term in fullData.metalists) {
- termHtml = 'Topic: <b>' + info.term + '</b>';
+ termHtml = 'Topic: <b>' + formatTermForDisplay(info.term) + '</b>';
}
- console.log("HERE")
- console.log(getCustomTermHtml)
if (getCustomTermHtml !== null) {
- console.log("Making custom html")
termHtml = getCustomTermHtml(info);
}
d3.select('#' + divName + '-' + 'termstats')
@@ -1144,12 +1181,7 @@ buildViz = function (d3) {
info.ncat,
termInfo.contexts[1].length * 1000 / numNCatDocs)
);
- console.log("TermINfo")
- console.log(termInfo);
- console.log(info)
if (showNeutral) {
- console.log("NEUTRAL")
-
var numList = fullData.docs.categories.map(function (x, i) {
if (fullData.info.neutral_category_internal_names.indexOf(x) > -1) {
return i;
@@ -1178,7 +1210,6 @@ buildViz = function (d3) {
);
if (showExtra) {
- console.log("EXTRA")
var numList = fullData.docs.categories.map(function (x, i) {
if (fullData.info.extra_category_internal_names.indexOf(x) > -1) {
return i;
@@ -1252,6 +1283,7 @@ buildViz = function (d3) {
function buildMatcher(term) {
+
var boundary = '(?:\\W|^|$)';
var wordSep = "[^\\w]+";
if (asianMode) {
@@ -1267,6 +1299,7 @@ buildViz = function (d3) {
}
var termToRegex = term;
+
// https://stackoverflow.com/questions/3446170/escape-string-for-use-in-javascript-regex
function escapeRegExp(string) {
return string.replace(/[\-\[\]\/\{\}\(\)\*\+\?\.\,\\\^\$\|\'#?]/g, "\\$&");
@@ -1289,6 +1322,20 @@ buildViz = function (d3) {
termToRegex.replace(' ', wordSep, 'gim')
) + ')' + boundary, 'gim');
console.log(regexp);
+
+ if (subwordEncoding === 'RoBERTa') {
+ if (term.charCodeAt(0) === 288 || term.charCodeAt(0) === 289) {
+ // Starts with character Ġ indicating it's a word start
+ regexp = new RegExp(boundary + escapeRegExp(term.substr(1, term.length)), 'gim');
+ } else {
+ // Word-internal subword: require a preceding word character
+ regexp = new RegExp("\\w" + escapeRegExp(term), 'gim');
+ }
+ }
+
+
try {
regexp.exec('X');
} catch (err) {
@@ -1410,13 +1457,17 @@ buildViz = function (d3) {
}
function getDefaultTooltipContent(d) {
- var message = d.term + "<br>" + d.cat25k + ":" + d.ncat25k + " per 25k words";
+ var term = formatTermForDisplay(d.term);
+
+ var message = term + "<br>" + d.cat25k + ":" + d.ncat25k + " per 25k words";
message += '<br>score: ' + d.os.toFixed(5);
return message;
}
function getDefaultTooltipContentWithoutScore(d) {
- var message = d.term + "<br>" + d.cat25k + ":" + d.ncat25k + " per 25k words";
+ var term = formatTermForDisplay(d.term);
+
+ var message = term + "<br>" + d.cat25k + ":" + d.ncat25k + " per 25k words";
return message;
}
@@ -1425,7 +1476,7 @@ buildViz = function (d3) {
var matches = (data.filter(function (term) {
return term.x === d.x && term.y === d.y && (term.display === undefined || term.display === true);
}).map(function (term) {
- return term.term
+ return formatTermForDisplay(term.term)
}).sort()
);
return matches;
@@ -1574,9 +1625,6 @@ buildViz = function (d3) {
function makeWordInteractive(data, svg, domObj, term, termInfo, showObscured = true) {
return domObj
.on("mouseover", function (d) {
- console.log("mouseover")
- console.log(term)
- console.log(termInfo)
showToolTipForTerm(data, svg, term, termInfo, showObscured);
d3.select(this).style("stroke", "black");
})
@@ -1602,6 +1650,8 @@ buildViz = function (d3) {
});
}
+
+
function processData(fullData) {
modelInfo = fullData['info'];
@@ -1671,8 +1721,8 @@ buildViz = function (d3) {
data.forEach(function (d, i) {
d.ci = i
});
- //console.log('XXXXX'); console.log(data)
+ //console.log('XXXXX'); console.log(data)
function getFilter(data) {
@@ -1680,8 +1730,6 @@ buildViz = function (d3) {
}
-
-
var mysvg = svg
.selectAll("dot")
.data(getFilter(data))
@@ -1855,6 +1903,8 @@ buildViz = function (d3) {
if (textColorColumn !== undefined && datum.etc !== undefined && datum.etc[textColorColumn] !== undefined) {
termColor = datum.etc[textColorColumn];
}
+ term = formatTermForDisplay(term);
+
for (var configI in configs) {
var config = configs[configI];
var curLabel = svg.append("text")
@@ -1980,20 +2030,22 @@ buildViz = function (d3) {
}
var sortedData = data.map(x => x).sort(sortByDist ? euclideanDistanceSort : scoreSort);
- console.log("CENSOR COL"); console.log(censorPointColumn);
if (doCensorPoints) {
for (var i in data) {
var d = sortedData[i];
- if(!(censorPointColumn !== undefined
+ if (!(censorPointColumn !== undefined
&& d.etc !== undefined
&& d.etc[censorPointColumn] === false)) {
- console.log("CENSOR COL"); console.log(censorPointColumn);
censorPoints(
d,
- function (d) { return d.x },
- function (d) { return d.y }
+ function (d) {
+ return d.x
+ },
+ function (d) {
+ return d.y
+ }
);
}
@@ -2064,6 +2116,20 @@ buildViz = function (d3) {
}
}
+ if (fullData['line'] !== undefined) {
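+ // Draw the optional line passed via line_coordinates ([{x, y}, ...]) as an SVG path behind the points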
+ var valueline = d3.line()
+ .x(function (d) {
+ return x(d.x);
+ })
+ .y(function (d) {
+ return y(d.y);
+ });
+ fullData.line = fullData.line.sort((a, b) => b.x - a.x);
+ svg.append("path")
+ .attr("class", "line")
+ .style("stroke-width", "1px")
+ .attr("d", valueline(fullData['line'])).moveToBack();
+ }
if (showAxes || showAxesAndCrossHairs) {
var myXAxis = svg.append("g")
@@ -2216,7 +2282,7 @@ buildViz = function (d3) {
.attr("x", word.node().getBBox().x)
.attr("y", word.node().getBBox().y
+ 2 * word.node().getBBox().height)
- .text(curTerm);
+ .text(formatTermForDisplay(curTerm));
wordObjList.push(curWordPrinted)
return makeWordInteractive(
termDataList, //data,
@@ -2415,10 +2481,15 @@ buildViz = function (d3) {
.attr("dy", "6px")
.text(title);
+ var rightSortMethod = sortMethod;
+ if (rightOrderColumn !== undefined && rightOrderColumn !== null) {
+ rightSortMethod = ((a, b) => b.etc[rightOrderColumn] - a.etc[rightOrderColumn]);
+ }
+
var wordListData = showWordList(
word,
data.filter(term => (term.display === undefined || term.display === true))
- .sort(sortMethod).slice(0, 30)
+ .sort(rightSortMethod).slice(0, 30)
);
word = wordListData.word;
@@ -2449,11 +2520,6 @@ buildViz = function (d3) {
//console.log(datum.i, datum.ci, i)
//var label = labelPointsIfPossible(i, getX(filteredData[i]), getY(filteredData[i]));
if (datum.display === undefined || datum.display === true) {
- if (i === 1) {
- console.log("trying to label datum # " + i + ": " + datum.term)
- console.log(datum)
- console.log([getX(datum), getY(datum)])
- }
var label = labelPointsIfPossible(datum, getX(datum), getY(datum));
if (label !== false) {
//console.log("labeled")
@@ -2469,10 +2535,11 @@ buildViz = function (d3) {
var labeledPoints = [];
var labelPriorityFunction = ((a, b) => Math.min(a.x, 1 - a.x, a.y, 1 - a.y) - Math.min(b.x, 1 - b.x, b.y, 1 - b.y))
if (labelPriorityColumn !== undefined && labelPriorityColumn !== null) {
- labelPriorityFunction = (a, b) => a.etc[labelPriorityColumn] - b.etc[labelPriorityColumn];
+ labelPriorityFunction = (a, b) => b.etc[labelPriorityColumn] - a.etc[labelPriorityColumn];
}
- labeledPoints = performPartialLabeling(data,
+ labeledPoints = performPartialLabeling(
+ data,
labeledPoints,
function (d) {
return d.x
@@ -2857,13 +2924,28 @@ buildViz = function (d3) {
)
};
- plotInterface.drawCategoryAssociation = function (categoryNum, otherCategoryNum = null) {
+ plotInterface.drawCategoryAssociation = function (category, otherCategory = null) {
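+ // Categories are now passed by name; resolve them to numeric indices here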
+ console.log("+++++++ Entering drawCategoryAssociation")
+ console.log("Category: " + category)
+ console.log("Other Category: " + otherCategory)
+ var categoryNum = this.fullData.info.categories.indexOf(category);
+
+ var otherCategoryNum = null;
+ if(otherCategory !== null)
+ otherCategoryNum = this.fullData.info.categories.indexOf(otherCategory);
+
+ console.log("cat/other: " + category + "/" + otherCategory + " ::: " + categoryNum + "/" + otherCategoryNum)
+
+ console.log("Full Data")
+ console.log(this.fullData)
+ /*
var rawLogTermCounts = getTermCounts(this.fullData).map(Math.log);
var maxRawLogTermCounts = Math.max(...rawLogTermCounts);
var minRawLogTermCounts = Math.min(...rawLogTermCounts);
var logTermCounts = rawLogTermCounts.map(
x => (x - minRawLogTermCounts) / maxRawLogTermCounts
)
+ */
//var rawScores = getCategoryDenseRankScores(this.fullData, categoryNum);
//console.log("RAW SCORES")
@@ -2891,15 +2973,10 @@ buildViz = function (d3) {
var denseRanks = getDenseRanks(this.fullData, categoryNum)
- console.log("denseRanks")
- console.log(denseRanks);
if (otherCategoryNum !== null) {
var otherDenseRanks = getDenseRanks(this.fullData, otherCategoryNum);
- console.log("otherDenseRanks");
- console.log(otherDenseRanks);
denseRanks.bg = otherDenseRanks.fg;
denseRanks.bgFreqs = otherDenseRanks.fgFreqs;
-
}
var rawScores = denseRanks.fg.map((x, i) => x - denseRanks.bg[i]);
@@ -2937,16 +3014,16 @@ buildViz = function (d3) {
this.fullData.data = this.fullData.data.map(function (term, i) {
//term.ci = i;
- term.s = scores[i];
- term.os = rawScores[i];
- term.cat = denseRanks.fgFreqs[i];
- term.ncat = denseRanks.bgFreqs[i];
- term.cat25k = parseInt(denseRanks.fgFreqs[i] * 25000 / fgFreqSum);
- term.ncat25k = parseInt(denseRanks.bgFreqs[i] * 25000 / bgFreqSum);
- term.x = xf(ox[i]) // logTermCounts[term.i];
- term.y = yf(oy[i]) // scores[term.i];
- term.ox = ox[i];
- term.oy = oy[i];
+ term.s = scores[term.i];
+ term.os = rawScores[term.i];
+ term.cat = denseRanks.fgFreqs[term.i];
+ term.ncat = denseRanks.bgFreqs[term.i];
+ term.cat25k = parseInt(denseRanks.fgFreqs[term.i] * 25000 / fgFreqSum);
+ term.ncat25k = parseInt(denseRanks.bgFreqs[term.i] * 25000 / bgFreqSum);
+ term.x = xf(ox[term.i]) // logTermCounts[term.i];
+ term.y = yf(oy[term.i]) // scores[term.i];
+ term.ox = ox[term.i];
+ term.oy = oy[term.i];
term.display = false;
return term;
})
diff --git a/scattertext/dispersion/Dispersion.py b/scattertext/dispersion/Dispersion.py
index 6a1237e..04f9555 100644
--- a/scattertext/dispersion/Dispersion.py
+++ b/scattertext/dispersion/Dispersion.py
@@ -4,7 +4,7 @@
import pandas as pd
class Dispersion(object):
- def __init__(self, corpus=None, term_doc_mat=None):
+ def __init__(self, corpus=None, term_doc_mat=None, use_metadata=False):
"""
From https://www.researchgate.net/publication/332120488_Analyzing_dispersion
Stefan Th. Gries. Analyzing dispersion. April 2019. Practical handbook of corpus linguistics. Springer.
@@ -30,11 +30,13 @@ def __init__(self, corpus=None, term_doc_mat=None):
(6) p = (1/9, 2/10, 3/10, 4/10, 5 /11) (the percentages a makes up of each corpus part 1-n)
'''
self.corpus = None
+ X = term_doc_mat
if corpus is not None:
self.corpus = corpus
- X = corpus.get_term_doc_mat()
- else:
- X = term_doc_mat
+ if use_metadata:
+ X = corpus.get_metadata_doc_mat()
+ else:
+ X = corpus.get_term_doc_mat()
part_sizes = X.sum(axis=1)
self.l = X.sum().sum()
self.n = X.shape[0]
@@ -64,6 +66,8 @@ def vc(self):
def jullands_d(self):
"""
+ Direct quote from Gries (2019)
+
The version of Juilland's D that can handle differently large corpus parts is then computed
as shown in (10). In order to accommodate the different sizes of the corpus parts, however, the
variation coefficient is not computed using the observed frequencies v1-n (i.e. 1, 2, 3, 4, 5 in files
@@ -76,6 +80,8 @@ def jullands_d(self):
def rosengrens(self):
'''
+ Direct quote from Gries (2019)
+
The version of Rosengren’s S that can handle differently large corpus parts is
shown in (12). Each corpus part size’s in percent (in s) is multiplied with the
frequencies of the element in question in each corpus part (in v1-n); of each product,
@@ -88,6 +94,8 @@ def rosengrens(self):
def dp(self):
'''
+ Direct quote from Gries (2019)
+
Finally, Gries (2008, 2010) and the follow-up by Lijffijt and Gries (2012)
proposed a measure called DP (for deviation of proportions), which falls between
1-min s (for an extremely even distribution) and 1 (for an extremely clumpy
@@ -106,7 +114,9 @@ def dp_norm(self):
return self.dp() / (1 - self.s.min())
def kl_divergence(self):
- '''The final measure to be discussed here is one that, as far as I can tell, has never
+ '''
+ Direct quote from Gries (2019)
+ The final measure to be discussed here is one that, as far as I can tell, has never
been proposed as a measure of dispersion, but seems to me to be ideally suited to be
one, namely the Kullback-Leibler (or KL-) divergence, a non-symmetric measure
that quantifies how different one probability distribution (e.g., the distribution of
@@ -137,8 +147,9 @@ def da(self):
for word_i in range(self.v.shape[1]):
y = self.v.T[word_i].todense().A1
yt = np.tile(y, (n, 1))
- s = np.sum(np.abs(yt - yt.T)) / 2
- da.append(1 - constant * s * 0.5 * y.mean())
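+ # DA: one minus the (scaled) sum of pairwise absolute differences between
+ # per-part frequencies, divided by twice the mean part frequency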
+ pairs_sum = np.sum(np.abs(yt - yt.T)) / 2
+ da_score = 1 - pairs_sum * constant / (2 * y.mean())
+ da.append(da_score)
return np.array(da)
@@ -154,7 +165,8 @@ def get_df(self, terms = None):
"Rosengren's S": self.rosengrens(),
'DP': self.dp(),
'DP norm': self.dp_norm(),
- 'KL-divergence': self.kl_divergence()
+ 'KL-divergence': self.kl_divergence(),
+ 'DA': self.da()
}
if terms is None:
return pd.DataFrame(df_content)
diff --git a/scattertext/features/FeatsFromOnlyEmpath.py b/scattertext/features/FeatsFromOnlyEmpath.py
index a136175..bf8ebbd 100644
--- a/scattertext/features/FeatsFromOnlyEmpath.py
+++ b/scattertext/features/FeatsFromOnlyEmpath.py
@@ -6,6 +6,6 @@ class FeatsFromOnlyEmpath(FeatsFromSpacyDocAndEmpath):
def get_feats(self, doc):
return Counter()
def get_doc_metadata(self, doc, prefix=''):
- return super(FeatsFromOnlyEmpath, self).get_doc_metadata(doc, prefix=prefix)
+ return FeatsFromSpacyDocAndEmpath.get_doc_metadata(self, doc, prefix=prefix)
diff --git a/scattertext/features/FeatsFromSpacyDocAndEmpath.py b/scattertext/features/FeatsFromSpacyDocAndEmpath.py
index 4326a4d..43db642 100644
--- a/scattertext/features/FeatsFromSpacyDocAndEmpath.py
+++ b/scattertext/features/FeatsFromSpacyDocAndEmpath.py
@@ -6,50 +6,50 @@
class FeatsFromSpacyDocAndEmpath(FeatsFromSpacyDoc):
- def __init__(self,
- use_lemmas=False,
- entity_types_to_censor=set(),
- tag_types_to_censor=set(),
- strip_final_period=False,
- empath_analyze_function=None,
- **kwargs):
- '''
- Parameters
- ----------
- empath_analyze_function: function (default=empath.Empath().analyze)
- Function that produces a dictionary mapping Empath categories to
+ def __init__(self,
+ use_lemmas=False,
+ entity_types_to_censor=set(),
+ tag_types_to_censor=set(),
+ strip_final_period=False,
+ empath_analyze_function=None,
+ **kwargs):
+ '''
+ Parameters
+ ----------
+ empath_analyze_function: function (default=empath.Empath().analyze)
+ Function that produces a dictionary mapping Empath categories to scores
- Other parameters from FeatsFromSpacyDoc.__init__
- '''
- if empath_analyze_function is None:
- try:
- import empath
- except ImportError:
- raise Exception("Please install the empath library to use FeatsFromSpacyDocAndEmpath.")
- self._empath_analyze_function = empath.Empath().analyze
- else:
- self._empath_analyze_function = partial(empath_analyze_function,
- kwargs={'tokenizer': 'bigram'})
- super(FeatsFromSpacyDocAndEmpath, self).__init__(use_lemmas,
- entity_types_to_censor,
- tag_types_to_censor,
- strip_final_period)
+ Other parameters from FeatsFromSpacyDoc.__init__
+ '''
+ if empath_analyze_function is None:
+ try:
+ import empath
+ except ImportError:
+ raise Exception("Please install the empath library to use FeatsFromSpacyDocAndEmpath.")
+ self._empath_analyze_function = empath.Empath().analyze
+ else:
+ self._empath_analyze_function = partial(empath_analyze_function,
+ kwargs={'tokenizer': 'bigram'})
+ FeatsFromSpacyDoc.__init__(self, use_lemmas,
+ entity_types_to_censor,
+ tag_types_to_censor,
+ strip_final_period)
- def get_doc_metadata(self, doc, prefix=''):
- empath_counter = Counter()
- if version_info[0] >= 3:
- doc = str(doc)
- for empath_category, score in self._empath_analyze_function(doc).items():
- if score > 0:
- empath_counter[prefix + empath_category] = int(score)
- return empath_counter
+ def get_doc_metadata(self, doc, prefix=''):
+ empath_counter = Counter()
+ if version_info[0] >= 3:
+ doc = str(doc)
+ for empath_category, score in self._empath_analyze_function(doc).items():
+ if score > 0:
+ empath_counter[prefix + empath_category] = int(score)
+ return empath_counter
- def has_metadata_term_list(self):
- return True
+ def has_metadata_term_list(self):
+ return True
- def get_top_model_term_lists(self):
- try:
- import empath
- except ImportError:
- raise Exception("Please install the empath library to use FeatsFromSpacyDocAndEmpath.")
- return dict(empath.Empath().cats)
+ def get_top_model_term_lists(self):
+ try:
+ import empath
+ except ImportError:
+ raise Exception("Please install the empath library to use FeatsFromSpacyDocAndEmpath.")
+ return dict(empath.Empath().cats)
diff --git a/scattertext/features/featoffsets/__init__.py b/scattertext/features/featoffsets/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scattertext/features/featoffsets/feat_and_offset_getter.py b/scattertext/features/featoffsets/feat_and_offset_getter.py
new file mode 100644
index 0000000..0040eca
--- /dev/null
+++ b/scattertext/features/featoffsets/feat_and_offset_getter.py
@@ -0,0 +1,6 @@
+class FeatAndOffsetGetter(object):
+ def get_term_offsets(self, doc):
+ return None
+
+ def get_metadata_offsets(self, doc):
+ return None
\ No newline at end of file
diff --git a/scattertext/features/featoffsets/token_and_feat_offset_getter.py b/scattertext/features/featoffsets/token_and_feat_offset_getter.py
new file mode 100644
index 0000000..df86f3b
--- /dev/null
+++ b/scattertext/features/featoffsets/token_and_feat_offset_getter.py
@@ -0,0 +1,15 @@
+from scattertext.features.featoffsets.feat_and_offset_getter import FeatAndOffsetGetter
+
+
+class TokenFeatAndOffsetGetter(FeatAndOffsetGetter):
+ def get_term_offsets(self, doc):
+ return []
+
+ def get_metadata_offsets(self, doc):
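+ # Returns (token, (count, [(start, end), ...])) pairs: each token's frequency
+ # in the document along with its character offsets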
+ offset_tokens = {}
+ for sent in doc.sents:
+ for tok in sent:
+ token_stats = offset_tokens.setdefault(tok.lower_, [0, []])
+ token_stats[0] += 1
+ token_stats[1].append((tok.idx, tok.idx + len(tok.lower_)))
+ return offset_tokens.items()
\ No newline at end of file
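
A sketch (not part of this diff) of the structure get_metadata_offsets yields: each lowercased token maps to a count and a list of (start_char, end_char) spans. It assumes a spaCy pipeline that sets sentence boundaries.

    import spacy
    from scattertext.features.featoffsets.token_and_feat_offset_getter import TokenFeatAndOffsetGetter

    nlp = spacy.load('en_core_web_sm')
    doc = nlp('A dog ran. A cat sat.')
    for token, (count, spans) in TokenFeatAndOffsetGetter().get_metadata_offsets(doc):
        # count is the token's frequency; spans locate each occurrence
        print(token, count, spans)  # e.g., a 2 [(0, 1), (11, 12)]
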
diff --git a/scattertext/smoothing/__init__.py b/scattertext/smoothing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scattertext/smoothing/lowess.py b/scattertext/smoothing/lowess.py
new file mode 100644
index 0000000..04f0f90
--- /dev/null
+++ b/scattertext/smoothing/lowess.py
@@ -0,0 +1,16 @@
+import numpy as np
+
+class Lowess(object):
+    def __init__(self):
+        self.model = None
+
+    def fit(self, xdata, ydata):
+        import statsmodels.api as sm
+        self.model = sm.nonparametric.lowess(ydata, xdata, frac=1. / 3)
+        return self
+
+    def predict(self, x):
+        return np.interp(x, self.model.T[0], self.model.T[1])
+
+    def fit_predict(self, xdata, ydata):
+        return self.fit(xdata, ydata).predict(xdata)
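
Usage sketch (not part of this diff; requires statsmodels): fit_predict smooths y as a function of x and evaluates the smoother at the training points via interpolation.

    import numpy as np
    from scattertext.smoothing.lowess import Lowess

    x = np.linspace(0, 10, 200)
    y = np.sin(x) + np.random.normal(scale=0.3, size=x.shape)
    smoothed = Lowess().fit_predict(x, y)  # same shape as x
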
diff --git a/scattertext/smoothing/mean_isotonic.py b/scattertext/smoothing/mean_isotonic.py
new file mode 100644
index 0000000..9d98f05
--- /dev/null
+++ b/scattertext/smoothing/mean_isotonic.py
@@ -0,0 +1,25 @@
+import pandas as pd
+import numpy as np
+from sklearn.isotonic import IsotonicRegression
+
+
+class MeanIsotonic:
+    def __init__(self, n=1000):
+        self.n = n
+
+    def fit_predict(self, x, y):
+        assert len(x) == len(y)
+        df = pd.DataFrame({
+            'x': x,
+            'y': y
+        })
+
+        # Average predictions from isotonic regressions fit on random half-samples
+        pred = np.zeros(len(df), dtype=float)
+        for i in range(self.n):
+            sample_df = df.sample(frac=0.5)
+            pred += 1 / self.n * IsotonicRegression(
+                y_max=1, y_min=0, out_of_bounds='clip'
+            ).fit(sample_df.x.values, sample_df.y.values).predict(df.x.values)
+        return pred
+
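
Usage sketch (not part of this diff): each of the n rounds fits an isotonic regression on a random half-sample, and the averaged predictions give a smoother monotone curve, clipped to [0, 1].

    import numpy as np
    from scattertext.smoothing.mean_isotonic import MeanIsotonic

    x = np.random.rand(500)
    y = np.clip(x + np.random.normal(scale=0.1, size=500), 0, 1)
    smoothed = MeanIsotonic(n=100).fit_predict(x, y)
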
diff --git a/scattertext/helpers/power_law.py b/scattertext/smoothing/power_law.py
similarity index 68%
rename from scattertext/helpers/power_law.py
rename to scattertext/smoothing/power_law.py
index a954718..65e5632 100644
--- a/scattertext/helpers/power_law.py
+++ b/scattertext/smoothing/power_law.py
@@ -13,8 +13,12 @@ def errfunc(p, x, y):
class PowerLaw(object):
    def __init__(self):
-        pass
+        self.partial_func = None
    def fit(self, xdata, ydata):
        params, _ = leastsq(errfunc, [max(ydata), -1, -0.5], args=(xdata, ydata), maxfev=500)
-        return partial(fitfunc, params)
+        self.partial_func = partial(fitfunc, params)
+        return self
+
+    def predict(self, x):
+        return self.partial_func(x)
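
With this refactor fit returns self, so fit and predict chain; a sketch on synthetic power-law data (not part of this diff):

    import numpy as np
    from scattertext.smoothing.power_law import PowerLaw

    x = np.arange(1, 200, dtype=float)
    y = 3 * x ** -0.7 + np.random.normal(scale=0.01, size=x.shape)
    yhat = PowerLaw().fit(x, y).predict(x)
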
diff --git a/scattertext/smoothing/sigmoidal.py b/scattertext/smoothing/sigmoidal.py
new file mode 100644
index 0000000..756ab44
--- /dev/null
+++ b/scattertext/smoothing/sigmoidal.py
@@ -0,0 +1,27 @@
+import numpy as np
+from scipy.optimize import curve_fit
+
+
+# from https://stackoverflow.com/questions/55725139/fit-sigmoid-function-s-shape-curve-to-data-using-python
+def sigmoid(x, L, x0, k, b):
+    y = L / (1 + np.exp(-k * (x - x0))) + b
+    return y
+
+class Sigmoidal:
+    def __init__(self):
+        self.popt = None
+
+    def fit(self, x, y):
+        assert len(x) == len(y)
+
+        p0 = [max(y), np.median(x), 1, min(y)]  # a mandatory initial guess
+
+        self.popt, pcov = curve_fit(sigmoid, x, y, p0, method='dogbox', maxfev=10000)
+        return self
+
+    def fit_predict(self, x, y):
+        self.fit(x, y)
+        return self.predict(x)
+
+    def predict(self, x):
+        return sigmoid(x, *self.popt)
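
Usage sketch (not part of this diff): fitting the four-parameter curve L / (1 + exp(-k * (x - x0))) + b to noisy sigmoid data.

    import numpy as np
    from scattertext.smoothing.sigmoidal import Sigmoidal

    x = np.linspace(-5, 5, 200)
    y = 1 / (1 + np.exp(-2 * x)) + np.random.normal(scale=0.05, size=x.shape)
    yhat = Sigmoidal().fit_predict(x, y)
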
diff --git a/scattertext/test/test_HTMLVisualizationAssembly.py b/scattertext/test/test_HTMLVisualizationAssembly.py
index de05dab..3ea75c9 100644
--- a/scattertext/test/test_HTMLVisualizationAssembly.py
+++ b/scattertext/test/test_HTMLVisualizationAssembly.py
@@ -20,7 +20,8 @@ def get_params(self, param_dict={}):
                  '"' + DEFAULT_D3_AXIS_VALUE_FORMAT + '"',
                  '"' + DEFAULT_D3_AXIS_VALUE_FORMAT + '"',
                  'false', '-1', 'true', 'false', 'true', 'false', 'false', 'false', 'true', 'null', 'null', 'null',
-                 'false', 'null', 'undefined', 'undefined', 'undefined', 'undefined', 'undefined']
+                 'false', 'null', 'undefined', 'undefined', 'undefined', 'undefined', 'undefined', 'undefined',
+                 'undefined']
        for i, val in param_dict.items():
            params[i] = val
        return 'buildViz(' + ',\n'.join(params) + ');\n'
@@ -526,4 +527,16 @@ def test_censor_point_column(self):
        visualization_data = self.make_adapter()
        params = (ScatterplotStructure(visualization_data, censor_point_column='CensorPoint')
                  .call_build_visualization_in_javascript())
-        self.assertEqual(params, self.get_params({61: '"CensorPoint"'}))
\ No newline at end of file
+        self.assertEqual(params, self.get_params({61: '"CensorPoint"'}))
+
+    def test_right_order_column(self):
+        visualization_data = self.make_adapter()
+        params = (ScatterplotStructure(visualization_data, right_order_column='Priority')
+                  .call_build_visualization_in_javascript())
+        self.assertEqual(params, self.get_params({62: '"Priority"'}))
+
+    def test_sentence_piece(self):
+        visualization_data = self.make_adapter()
+        params = (ScatterplotStructure(visualization_data, subword_encoding='RoBERTa')
+                  .call_build_visualization_in_javascript())
+        self.assertEqual(params, self.get_params({63: '"RoBERTa"'}))
diff --git a/scattertext/tokenizers/__init__.py b/scattertext/tokenizers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scattertext/viz/PairPlotFromScattertextStructure.py b/scattertext/viz/PairPlotFromScattertextStructure.py
index ebb9a52..74ca86c 100644
--- a/scattertext/viz/PairPlotFromScattertextStructure.py
+++ b/scattertext/viz/PairPlotFromScattertextStructure.py
@@ -4,7 +4,6 @@
from scattertext.viz.HTMLSemioticSquareViz import ClickableTerms
-
class PairPlotFromScatterplotStructure(object):
    def __init__(self,
                 category_scatterplot_structure,
@@ -74,8 +73,8 @@ def to_html(self):
                self.term_plot_interface
            ),
            self.term_scatterplot_structure.get_js_reset_function(
-                values_to_set = [self.category_plot_interface, self.term_plot_interface],
-                functions_to_reset = ['build'+self.category_plot_interface, 'build'+self.term_plot_interface]
+                values_to_set=[self.category_plot_interface, self.term_plot_interface],
+                functions_to_reset=['build' + self.category_plot_interface, 'build' + self.term_plot_interface]
            ),
            PackedDataUtils.javascript_post_build_viz('categorySearch', self.category_plot_interface),
            PackedDataUtils.javascript_post_build_viz('termSearch', self.term_plot_interface),
@@ -104,8 +103,6 @@ def _get_html_template(self):
            return PackedDataUtils.get_packaged_html_template_content(PAIR_PLOT_HTML_VIZ_FILE_NAME)
        return PackedDataUtils.get_packaged_html_template_content(PAIR_PLOT_WITHOUT_HALO_HTML_VIZ_FILE_NAME)
-
-
    def _get_lexicon_html(self, terms):
        lexicon_html = ''
        for i, term in enumerate(terms):
diff --git a/scattertext/viz/ScatterplotStructure.py b/scattertext/viz/ScatterplotStructure.py
index d8dea79..b2f5109 100644
--- a/scattertext/viz/ScatterplotStructure.py
+++ b/scattertext/viz/ScatterplotStructure.py
@@ -72,7 +72,9 @@ def __init__(
            text_color_column=None,
            suppress_text_column=None,
            censor_point_column=None,
-            background_color=None
+            background_color=None,
+            right_order_column=None,
+            subword_encoding=None
    ):
        '''
@@ -209,15 +211,21 @@ def __init__(
        background_labels: List[Dict]: default None
            List of [{"Text": "Label", "X": xpos, "Y": ypos}, ...] to be background labels on plot
        label_priority_column : str, default None
-            Column in term_metadata_df; smaler values in the column indicate a term should be labeled first
+            Column in term_metadata_df; larger values in the column indicate a term should be labeled first
        text_color_column: str, default None
            Column in term_metadata_df which supplies term colors
        suppress_text_column: str, default None
            Column in term_metadata_df which is of boolean value. Indicates if a term should be labeled.
        censor_point_column : str, default None
            Should we prevent labels from being drawn over a point?
+        right_order_column : str, default None
+            Order for the right-hand column ("characteristic" by default); largest values first
+        subword_encoding : str, default None
+            Type of subword encoding to use; None for none. Currently supports "RoBERTa".
        background_color: str, default None
            Color to set document.body's background
        '''
        self._visualization_data = visualization_data
        self._width_in_pixels = width_in_pixels if width_in_pixels is not None else 1000
@@ -281,7 +289,9 @@ def __init__(
        self._text_color_column = text_color_column
        self._suppress_label_column = suppress_text_column
        self._censor_point_column = censor_point_column
+        self._right_order_column = right_order_column
        self._background_color = background_color
+        self._subword_encoding = subword_encoding
    def call_build_visualization_in_javascript(self):
        def js_default_value(x):
@@ -387,6 +397,8 @@ def json_with_jsvalue_or_null(x):
            js_default_string(self._suppress_label_column),
            js_default_string(self._background_color),
            js_default_string(self._censor_point_column),
+            js_default_string(self._right_order_column),
+            js_default_string(self._subword_encoding)
        ]
        return 'buildViz(' + ',\n'.join(arguments) + ');\n'
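
The two new keyword arguments in use, mirroring the tests above (a sketch, not part of this diff; the VizDataAdapter payload is a stand-in for what the plot builders normally produce):

    from scattertext.viz import VizDataAdapter  # assumed import location
    from scattertext.viz.ScatterplotStructure import ScatterplotStructure

    visualization_data = VizDataAdapter({'data': []})  # stand-in payload
    structure = ScatterplotStructure(
        visualization_data,
        right_order_column='Priority',  # larger values listed first in the right-hand column
        subword_encoding='RoBERTa'      # RoBERTa-style subword term matching in the JS
    )
    js = structure.call_build_visualization_in_javascript()
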
diff --git a/scattertext/viz/TermInfo.py b/scattertext/viz/TermInfo.py
index 44bc605..45c0534 100644
--- a/scattertext/viz/TermInfo.py
+++ b/scattertext/viz/TermInfo.py
@@ -2,7 +2,6 @@
import pandas as pd
-
def get_tooltip_js_function(plot_df, tooltip_column_names, tooltip_columns):
    if len(tooltip_columns) > 2:
        raise Exception("You can have at most two columns in a tooltip.")
@@ -10,7 +9,9 @@ def get_tooltip_js_function(plot_df, tooltip_column_names, tooltip_columns):
    tooltip_column_names = {} if tooltip_column_names is None else tooltip_column_names
    for col in tooltip_columns:
        assert col in plot_df
-        formatting = '.toFixed(6)' if pd.api.types.is_float(plot_df[col].iloc[0]) else ''
+        formatting = ''
+        if pd.api.types.is_float(plot_df[col].iloc[0]):
+            formatting = '.toFixed(6)'
        tooltip_content += '+ "<br/>%s: " + d.etc["%s"]%s' % (
            html.escape(tooltip_column_names.get(col, col)),
            col.replace('"', '\\"').replace("'", "\\'"), formatting)
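
A small sketch (not part of this diff) of the behavior this rewrite preserves: float-typed tooltip columns are rendered with JavaScript's .toFixed(6), while other types are left unformatted.

    import pandas as pd
    from scattertext.viz.TermInfo import get_tooltip_js_function

    plot_df = pd.DataFrame({'Frequency': [12], 'DA': [0.731]})
    js = get_tooltip_js_function(plot_df, None, ['Frequency', 'DA'])
    # 'DA' holds floats, so its tooltip value gets .toFixed(6); 'Frequency' does not
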
diff --git a/setup.py b/setup.py
index 4da1923..7dd5a3b 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages
setup(name='scattertext',
-      version='0.1.0.0',
+      version='0.1.1',
      description='An NLP package to visualize interesting terms in text.',
      url='https://github.com/JasonKessler/scattertext',
      author='Jason Kessler',
@@ -16,7 +16,8 @@
          'six',
          'mock',
          'statsmodels',
-          'flashtext'
+          'flashtext',
+          'pytextrank==2.1.0'
          #'spacy',
          #'jieba',
          #'tinysegmenter',