Skip to content

Commit

Permalink
Adding in code to ensure that term statistics will show up even if no…
Browse files Browse the repository at this point in the history
… documents are present in visualization.
  • Loading branch information
JasonKessler committed Apr 26, 2020
1 parent 83c5db8 commit 948d244
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 15 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![Gitter Chat](https://img.shields.io/badge/GITTER-join%20chat-green.svg)](https://gitter.im/scattertext/Lobby)
[![Twitter Follow](https://img.shields.io/twitter/follow/espadrine.svg?style=social&label=Follow)](https://twitter.com/jasonkessler)

# Scattertext 0.0.2.63
# Scattertext 0.0.2.64

A tool for finding distinguishing terms in corpora, and presenting them in an
interactive, HTML scatter plot. Points corresponding to terms are selectively labeled
Expand Down
22 changes: 22 additions & 0 deletions demo_compact_suppress_documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import scattertext as st

# Demo script: build a compacted unigram corpus from the bundled 2012
# convention sample data and write a Scattertext explorer to an HTML file
# with max_docs_per_category=0.
# NOTE(review): max_docs_per_category=0 presumably keeps document excerpts
# out of the generated page -- confirm against produce_scattertext_explorer.

# Add a 'parse' column by applying st.whitespace_nlp_with_sentences to the
# text of every row.
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
)

# Categories come from the 'party' column; the corpus is reduced to unigrams
# and then compacted with AssociationCompactor(2000).
# NOTE(review): 2000 is presumably the number of terms to keep -- confirm.
corpus = (
    st.CorpusFromParsedDocuments(df, category_col='party', parsed_col='parse')
    .build()
    .get_unigram_corpus()
    .compact(st.AssociationCompactor(2000))
)

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    max_docs_per_category=0,
)

# Use a context manager so the file is flushed and closed deterministically
# instead of relying on garbage collection of an anonymous file object.
with open('./demo_compact_suppress_documents.html', 'w') as out_file:
    out_file.write(html)
print('open ./demo_compact_suppress_documents.html in Chrome')
7 changes: 4 additions & 3 deletions demo_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
minimum_term_frequency=0, pmi_threshold_coefficient=0,
width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
transform=st.Scalers.dense_rank,
max_overlapping=10
max_overlapping=10,
max_docs_per_category=0
)
open('./demo_names.html', 'w').write(html)
print('open ./demo_name.html in Chrome')
open('./demo_names2.html', 'w').write(html)
print('open ./demo_names2.html in Chrome')
2 changes: 1 addition & 1 deletion scattertext/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from scattertext.diachronic.TimeStructure import TimeStructure

version = [0, 0, 2, 63]
version = [0, 0, 2, 64]
__version__ = '.'.join([str(e) for e in version])
import re
import numpy as np
Expand Down
23 changes: 14 additions & 9 deletions scattertext/data/viz/scripts/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -750,7 +750,7 @@ buildViz = function (d3) {
var info = termInfo.info;
var notmatches = termInfo.notmatches;
if (contexts[0].length + contexts[1].length + contexts[2].length + contexts[3].length == 0) {
return null;
//return null;
}
//!!! Future feature: context words
//var contextWords = getContextWordSFS(info.term);
Expand Down Expand Up @@ -977,7 +977,7 @@ buildViz = function (d3) {
})
.reduce(function (a, b) {
return a + b;
})
}, 0);

var notCategoryNumList = fullData.docs.categories.map(function (x, i) {
if (fullData.info.not_category_internal_names.indexOf(x) > -1) {
Expand All @@ -996,16 +996,21 @@ buildViz = function (d3) {
})
.reduce(function (a, b) {
return a + b;
});
}, 0);

function getFrequencyDescription(name, count25k, count, ndocs) {
var desc = name + ' frequency: <div class=text_subhead>' + count25k
+ ' per 25,000 terms</div><div class=text_subhead>' + Math.round(ndocs)
+ ' per 1,000 docs</div>';
var desc = name + ' frequency: <div class=text_subhead>' + count25k + ' per 25,000 terms</div>';
if (!isNaN(Math.round(ndocs))) {
desc += '<div class=text_subhead>' + Math.round(ndocs) + ' per 1,000 docs</div>';
}
if (count == 0) {
desc += '<u>Not found in any ' + name + ' documents.</u>';
} else {
desc += '<u>Some of the ' + count + ' mentions:</u>';
if (!isNaN(Math.round(ndocs))) {
desc += '<u>Some of the ' + count + ' mentions:</u>';
} else {
desc += count + ' mentions';
}
}
/*
desc += '<br><b>Discriminative:</b> ';
Expand Down Expand Up @@ -1063,7 +1068,7 @@ buildViz = function (d3) {
})
.reduce(function (a, b) {
return a + b;
});
}, 0);

d3.select("#" + divName + "-neuthead")
.style('fill', color(0))
Expand Down Expand Up @@ -1092,7 +1097,7 @@ buildViz = function (d3) {
})
.reduce(function (a, b) {
return a + b;
});
}, 0);

d3.select("#" + divName + "-extrahead")
.style('fill', color(0))
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages

setup(name='scattertext',
version='0.0.2.63',
version='0.0.2.64',
description='An NLP package to visualize interesting terms in text.',
url='https://github.com/JasonKessler/scattertext',
author='Jason Kessler',
Expand Down

0 comments on commit 948d244

Please sign in to comment.