SST unit improvements
cgpotts committed Mar 29, 2019
1 parent e516fad commit 848a6e1
Showing 5 changed files with 231 additions and 313 deletions.
Binary file added fig/tree_nn_subtrees.png
169 changes: 42 additions & 127 deletions sst.py
@@ -7,28 +7,18 @@
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.metrics import classification_report, f1_score
import scipy.stats
import utils

__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2019"


def sentiment_treebank_reader(
src_filename,
include_subtrees=False,
replace_root_score=True,
class_func=None):
def sentiment_treebank_reader(src_filename, class_func=None):
"""Iterator for the Penn-style distribution of the Stanford
Sentiment Treebank. The iterator yields (tree, label) pairs.
The root node of the tree is the label, so the root node itself is
replaced with a string to ensure that it doesn't get used as a
predictor. The subtree labels are retained. If they are used, it can
feel like cheating (see `root_daughter_scores_phis` below), so take
care!
The labels are strings. They do not make sense as a linear order
because negative ('0', '1'), neutral ('2'), and positive ('3','4')
do not form a linear order conceptually, and because '0' is
@@ -38,19 +28,11 @@ def sentiment_treebank_reader(
----------
src_filename : str
Full path to the file to be read.
include_subtrees : boolean (default: False)
Whether to yield all the subtrees with labels or just the full
tree. In both cases, the label is the root of the subtree.
replace_root_score : boolean (default: True)
The root node of the tree is the label, so, by default, the root
node itself is replaced with a string to ensure that it doesn't
get used as a predictor.
class_func : None, or function mapping labels to labels or None
If this is None, then the original 5-way labels are returned.
Other options: `binary_class_func` and `ternary_class_func`
(or you could write your own).
Yields
------
(tree, label)
@@ -62,21 +44,14 @@ def sentiment_treebank_reader(
with open(src_filename) as f:
for line in f:
tree = Tree.fromstring(line)
if include_subtrees:
label = class_func(tree.label())
# As in the paper, if the root node doesn't fall into any
# of the classes for this version of the problem, then
# we drop the example:
if label:
for subtree in tree.subtrees():
label = subtree.label()
label = class_func(label)
if label:
if replace_root_score:
subtree.set_label("X")
yield (subtree, label)
else:
label = tree.label()
label = class_func(label)
if label:
if replace_root_score:
tree.set_label("S")
yield (tree, label)
subtree.set_label(class_func(subtree.label()))
yield (tree, label)
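# Illustrative usage sketch, not part of this commit: `_demo_reader` is a
# hypothetical helper showing the simplified reader in action. 'trees' is a
# placeholder path to the SST distribution; `ternary_class_func` is defined
# below in this module.
def _demo_reader(sst_home='trees'):
    src = os.path.join(sst_home, 'train.txt')
    reader = sentiment_treebank_reader(src, class_func=ternary_class_func)
    tree, label = next(reader)
    # Every subtree label, including the root, has been mapped through
    # `class_func`, so the tree can also be used for subtree supervision.
    print(label, ' '.join(tree.leaves()))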


def binary_class_func(y):
@@ -116,7 +91,7 @@ def ternary_class_func(y)
def train_reader(sst_home, **kwargs):
"""Convenience function for reading the train file, full-trees only."""
src = os.path.join(sst_home, 'train.txt')
return sentiment_treebank_reader(src,**kwargs)
return sentiment_treebank_reader(src, **kwargs)


def dev_reader(sst_home, **kwargs):
@@ -134,18 +109,6 @@ def test_reader(sst_home, **kwargs):
return sentiment_treebank_reader(src, **kwargs)
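# Illustrative sketch, not part of this commit: the convenience readers pass
# keyword arguments through to `sentiment_treebank_reader`, so a quick label
# distribution for the ternary problem can be obtained as follows ('trees' is
# a placeholder path).
def _demo_label_distribution(sst_home='trees'):
    from collections import Counter
    return Counter(
        label for _, label in dev_reader(sst_home, class_func=ternary_class_func))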


def allnodes_train_reader(sst_home, **kwargs):
"""Convenience function for reading the train file, all nodes."""
src = os.path.join(sst_home, 'train.txt')
return sentiment_treebank_reader(src, include_subtrees=True, **kwargs)


def allnodes_dev_reader(sst_home):
"""Convenience function for reading the dev file, all nodes."""
src = os.path.join(sst_home, 'dev.txt')
return sentiment_treebank_reader(src, include_subtrees=True, **kwargs)


def build_dataset(sst_home, reader, phi, class_func, vectorizer=None, vectorize=True):
"""Core general function for building experimental datasets.
@@ -216,7 +179,7 @@ def experiment(
train_reader=train_reader,
assess_reader=None,
train_size=0.7,
class_func=binary_class_func,
class_func=ternary_class_func,
score_func=utils.safe_macro_f1,
vectorize=True,
verbose=True,
@@ -232,7 +195,7 @@
phi : feature function
Any function that takes an `nltk.Tree` instance as input
and returns a bool/int/float-valued dict as output.
train_func : model wrapper (default: `fit_maxent_classifier`)
train_func : model wrapper
Any function that takes a feature matrix and a label list
as its values and returns a fitted model with a `predict`
function that operates on feature matrices.
@@ -272,8 +235,8 @@ def experiment(
Prints
-------
To standard output, if `verbose=True`
Model accuracy and a model precision/recall/F1 report. Accuracy is
reported because many SST papers report that figure, but the
Model precision/recall/F1 report. Accuracy is micro-F1 and is
reported because many SST papers report that figure, but macro
precision/recall/F1 is better given the class imbalances and the
fact that performance across the classes can be highly variable.
@@ -326,7 +289,6 @@ def experiment(
predictions = mod.predict(X_assess)
# Report:
if verbose:
print('Accuracy: {0:0.03f}'.format(accuracy_score(y_assess, predictions)))
print(classification_report(y_assess, predictions, digits=3))
# Return the overall score and experimental info:
return {
@@ -395,11 +357,6 @@ def compare_models(
The first two are the scores from each model (length `trials`),
and the third is the p-value returned by stats_test.
TODO
----
This function can easily be parallelized. The ParallelPython
package makes this easy: http://www.parallelpython.com
"""
if phi2 == None:
phi2 = phi1
@@ -433,95 +390,53 @@ def compare_models(
return (scores1, scores2, pval)
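# Illustrative sketch, not part of this commit: a minimal end-to-end call to
# `experiment`. The feature function and training wrapper below are simple
# stand-ins, not the module's own `unigrams_phi` or `fit_maxent_classifier`.
def _unigrams_phi_sketch(tree):
    # Bag-of-words feature dict over the tree's leaves.
    from collections import Counter
    return Counter(tree.leaves())

def _fit_logistic_sketch(X, y):
    # `experiment` expects a wrapper that fits a model and returns it.
    mod = LogisticRegression(solver='liblinear')
    mod.fit(X, y)
    return mod

def _demo_experiment(sst_home='trees'):
    # Returns the dict of scores and experimental info built by `experiment`.
    return experiment(
        sst_home,
        phi=_unigrams_phi_sketch,
        train_func=_fit_logistic_sketch,
        class_func=ternary_class_func,
        assess_reader=dev_reader)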


def compare_models_mcnemar(
sst_home,
phi1,
train_func1,
phi2=None,
train_func2=None,
vectorize1=True,
vectorize2=True,
train_reader=train_reader,
assess_reader=dev_reader,
class_func=ternary_class_func):
"""Wrapper for comparing models. The parameters are like those of
`experiment`, with the same defaults, except
def build_rnn_dataset(sst_home, reader, class_func=ternary_class_func):
"""Given an SST reader, return the `class_func` version of the
dataset as (X, y) training pair.
Parameters
----------
sst_home : str
Full path to the 'trees' directory for SST.
phi1, phi2
Just like `phi` for `experiment`. `phi1` defaults to
`unigrams_phi`. If `phi2` is None, then it is set equal
to `phi1`.
train_func1, train_func2
Just like `train_func` for `experiment`. If `train_func2`
is None, then it is set equal to `train_func`.
vectorize1, vectorize2 : bool
Whether to vectorize the respective inputs. Use `False` for
deep learning models that featurize their own input.
train_reader : SST iterator (default: `train_reader`)
Iterator for training data.
assess_reader : iterator
Iterator for assessment data.
class_func :
Prints
------
To standard output
A report of the assessment.
reader : train_reader or dev_reader
class_func : function on the SST labels
Returns
-------
(float, float)
Test statistic and p-value.
X, y
Where X is a list of list of str, and y is the output label list.
"""
if phi2 == None:
phi2 = phi1
if train_func2 == None:
train_func2 = train_func1
exp1 = experiment(
sst_home,
train_reader=train_reader,
assess_reader=assess_reader,
phi=phi1,
train_func=train_func1,
class_func=class_func,
vectorize=vectorize1,
verbose=False)
exp2 = experiment(
sst_home,
train_reader=train_reader,
assess_reader=assess_reader,
phi=phi2,
train_func=train_func2,
class_func=class_func,
vectorize=vectorize2,
verbose=False)
assert exp1['assess_dataset']['y'] == exp2['assess_dataset']['y']
gold = exp1['assess_dataset']['y']
preds1 = exp1['predictions']
preds2 = exp2['predictions']
return utils.mcnemar(gold, preds1, preds2)
r = reader(sst_home, class_func=class_func)
data = [(tree.leaves(), label) for tree, label in r]
X, y = zip(*data)
return list(X), list(y)
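# Illustrative sketch, not part of this commit: `build_rnn_dataset` returns
# token sequences and labels suitable for sequence models ('trees' is a
# placeholder path).
def _demo_rnn_dataset(sst_home='trees'):
    X, y = build_rnn_dataset(sst_home, train_reader, class_func=ternary_class_func)
    # X[0] is a list of token strings (the tree's leaves); y[0] is its label.
    return X[0], y[0]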


def build_binary_rnn_dataset(sst_home, reader):
"""Given an SST reader, return the binary version of the dataset
as (X, y) training pair.
def build_tree_dataset(sst_home, reader, class_func=ternary_class_func):
"""Given an SST reader, return the `class_func` version of the
dataset. The root node of each tree (`tree.label()`) is set to
the class for that tree. We also return the label vector for
assessment.
Parameters
----------
sst_home : str
Full path to the 'trees' directory for SST.
reader : train_reader or dev_reader
class_func : function on the SST labels
Returns
-------
X, y
Where X is a list of list of str, and y is the output label list.
Where X is a list of `nltk.tree.Tree`, and y is the output
label list.
"""
r = reader(sst_home, class_func=binary_class_func)
data = [(tree.leaves(), label) for tree, label in r]
X, y = zip(*data)
return list(X), list(y)
data = []
labels = []
for (tree, label) in reader(sst_home, class_func=class_func):
tree.set_label(label)
data.append(tree)
labels.append(label)
return data, labels
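# Illustrative sketch, not part of this commit: `build_tree_dataset` returns
# full `nltk.Tree` objects whose labels have been mapped through `class_func`,
# e.g. for tree-structured models ('trees' is a placeholder path).
def _demo_tree_dataset(sst_home='trees'):
    X, y = build_tree_dataset(sst_home, dev_reader, class_func=ternary_class_func)
    tree = X[0]
    # The root label matches y[0]; subtree labels were set by the reader.
    return tree.label(), y[0]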
24 changes: 4 additions & 20 deletions sst_01_overview.ipynb

Large diffs are not rendered by default.

