
Commit

Improved notebook and supporting code
cgpotts committed Mar 20, 2019
1 parent c018c81 commit 0f56eb8
Showing 21 changed files with 4,385 additions and 4,426 deletions.
6 changes: 3 additions & 3 deletions evaluation_methods.ipynb
@@ -14,7 +14,7 @@
"outputs": [],
"source": [
"__author__ = \"Christopher Potts\"\n",
"__version__ = \"CS224u, Stanford, Spring 2018 term\""
"__version__ = \"CS224u, Stanford, Spring 2019\""
]
},
{
@@ -23,7 +23,7 @@
"source": [
"## Contents\n",
"\n",
"0. [Overview](#Overview)\n",
"1. [Overview](#Overview)\n",
"0. [Set-up](#Set-up)\n",
"0. [Data organization](#Data-organization)\n",
" 0. [Train/dev/test](#Train/dev/test)\n",
@@ -743,7 +743,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
"version": "3.7.1"
}
},
"nbformat": 4,
6 changes: 3 additions & 3 deletions evaluation_metrics.ipynb
@@ -14,14 +14,14 @@
"outputs": [],
"source": [
"__author__ = \"Christopher Potts\"\n",
"__version__ = \"CS224u, Stanford, Spring 2018 term\""
"__version__ = \"CS224u, Stanford, Spring 2019\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"0. [Overview](#Overview)\n",
"1. [Overview](#Overview)\n",
"0. [Set-up](#Set-up)\n",
"0. [Classifier metrics](#Classifier-metrics)\n",
" 0. [Confusion matrix](#Confusion-matrix)\n",
@@ -1881,7 +1881,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
"version": "3.7.1"
}
},
"nbformat": 4,
Binary file added fig/wordentail-diagram.png
207 changes: 118 additions & 89 deletions nli.py
@@ -5,104 +5,119 @@
import os
import random
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import utils


__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2018"
__version__ = "CS224u, Stanford, Spring 2019"


BAKEOFF_CONDITION_NAMES = ['edge_disjoint', 'word_disjoint', 'word_disjoint_balanced']
CONDITION_NAMES = [
'edge_disjoint',
'word_disjoint',
'word_disjoint_balanced']


def build_bakeoff_dataset(wordentail_data, vector_func, vector_combo_func):
"""
def word_entail_featurize(data, vector_func, vector_combo_func):
X = []
y = []
for (w1, w2), label in data:
rep = vector_combo_func(vector_func(w1), vector_func(w2))
X.append(rep)
y.append(label)
return X, y


def wordentail_experiment(
train_data,
assess_data,
vector_func,
vector_combo_func,
model):
"""Train and evaluation code for the word-level entailment task.
Parameters
----------
wordentail_data
The contents of `wordentail_filename` loaded from JSON.
train_data : list
assess_data : list
vector_func : function
Any function mapping words in the vocab to vector
representations.
vector_combo_func : function
Any function for combining two vectors into a new vector
of fixed dimensionality.
model : class with `fit` and `predict` methods
Prints
------
To standard output
An sklearn classification report for the assessment data.
Returns
-------
A dict in the same format as `wordentail_data` but with the
pairs of strings for each example replaced by a single vector.
dict with structure
'model': the trained model
'train_data': train_data
'assess_data': assess_data
'macro-F1': macro-F1 score on the assessment data
'vector_func': vector_func
'vector_combo_func': vector_combo_func
We pass 'vector_func' and 'vector_combo_func' through to ensure alignment
between these experiments and the bake-off evaluation.
"""
# A mapping from words (as strings) to their vector
# representations, as determined by vector_func:
vocab = wordentail_data['vocab']
vectors = {w: vector_func(w) for w in vocab}
# Dataset in the format required by the neural network:
dataset = defaultdict(lambda: defaultdict(list))
for condition in BAKEOFF_CONDITION_NAMES:
for split, data in wordentail_data[condition].items():
for (w1, w2), label in data:
# Use vector_combo_func to combine the word vectors for
# w1 and w2, as given by the vectors dictionary above,
# and pair it with the singleton array containing clsname:
rep = vector_combo_func(vectors[w1], vectors[w2])
example = [rep, label]
dataset[condition][split].append(example)
dataset['vocab'] = vocab
return dataset


def bakeoff_experiment(dataset, model, conditions=None):
"""Train and evaluation code for the word-level entailment task.
X_train, y_train = word_entail_featurize(
train_data, vector_func, vector_combo_func)
X_dev, y_dev = word_entail_featurize(
assess_data, vector_func, vector_combo_func)
model.fit(X_train, y_train)
predictions = model.predict(X_dev)
# Report:
print(classification_report(y_dev, predictions))
macrof1 = utils.safe_macro_f1(y_dev, predictions)
return {
'model': model,
'train_data': train_data,
'assess_data': assess_data,
'macro-F1': macrof1,
'vector_func': vector_func,
'vector_combo_func': vector_combo_func}
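
For orientation, here is a minimal sketch of how `wordentail_experiment` might be invoked. The training-data filename and the 'train'/'dev' split keys are assumptions (only the 'word_disjoint' condition and the test filename are confirmed in this module); `randvec_lookup` and `vec_concatenate` are hypothetical stand-ins for a real embedding lookup and combiner:

import json
import os

import numpy as np
from sklearn.linear_model import LogisticRegression

import nli

# Assumed location of the training data; only the test filename is
# confirmed elsewhere in this module.
wordentail_filename = os.path.join(
    'data', 'nlidata', 'nli_wordentail_bakeoff_data.json')

with open(wordentail_filename) as f:
    wordentail_data = json.load(f)

vector_cache = {}

def randvec_lookup(w, n=50):
    # Stand-in vector_func: a fixed random vector per word. A real
    # experiment would look words up in GloVe or similar embeddings.
    if w not in vector_cache:
        vector_cache[w] = np.random.uniform(low=-1.0, high=1.0, size=n)
    return vector_cache[w]

def vec_concatenate(u, v):
    # Stand-in vector_combo_func: concatenate the two word vectors.
    return np.concatenate((u, v))

results = nli.wordentail_experiment(
    train_data=wordentail_data['word_disjoint']['train'],
    assess_data=wordentail_data['word_disjoint']['dev'],
    vector_func=randvec_lookup,
    vector_combo_func=vec_concatenate,
    model=LogisticRegression(solver='liblinear'))

print(results['macro-F1'])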


def bake_off_evaluation(experiment_results, test_data_filename=None):
"""Function for evaluating a trained model on the bake-off test set.
Parameters
----------
dataset : dict
With keys `BAKEOFF_CONDITION_NAMES`, each with values that are lists of
vector pairs, the first giving the example representation and the second
giving its 1d output vector. The expectation is that this was created
by `build_bakeoff_dataset`.
model : class with `fit` and `predict` methods
conditions : list or None
If None, then all of `BAKEOFF_CONDITION_NAMES` are evaluated.
If this is a list, then it should be a subset of
`BAKEOFF_CONDITION_NAMES`.
experiment_results : dict
This should be the return value of `wordentail_experiment`, with at least
keys 'model', 'vector_func', and 'vector_combo_func'.
test_data_filename : str or None
Full path to the test data. If `None`, then we assume the file is
'data/nlidata/nli_wordentail_bakeoff_data-test.json'.
Prints
------
To standard output
An sklearn classification report for the bake-off test set.
"""
if conditions is None:
conditions = BAKEOFF_CONDITION_NAMES
else:
for c in conditions:
if c not in BAKEOFF_CONDITION_NAMES:
raise ValueError(
"Condition {} is not recogized. Conditions must "
"be in {}".format(c, BAKEOFF_CONDITION_NAMES))
# Train the network:
for condition in conditions:
cond_data = dataset[condition]
X_train, y_train = zip(*cond_data['train'])
model.fit(X_train, y_train)
X_dev, y_dev = zip(*cond_data['dev'])
predictions = model.predict(X_dev)
# Report:
print("="*70)
print("{}".format(condition))
print(classification_report(y_dev, predictions))
if condition == 'word_disjoint_balanced':
X_train, y_train = zip(*dataset['word_disjoint']['train'])
model.fit(X_train, y_train)
predictions = model.predict(X_dev)
# Report:
print("="*70)
print("{}, training on word_disjoint".format(condition))
print(classification_report(y_dev, predictions))
if test_data_filename is None:
test_data_filename = os.path.join(
'data', 'nlidata', 'nli_wordentail_bakeoff_data-test.json')
with open(test_data_filename) as f:
wordentail_data = json.load(f)
X_test, y_test = word_entail_featurize(
wordentail_data['word_disjoint']['test'],
vector_func=experiment_results['vector_func'],
vector_combo_func=experiment_results['vector_combo_func'])
predictions = experiment_results['model'].predict(X_test)
# Report:
print(classification_report(y_test, predictions))
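
Continuing the sketch above, the dict returned by `wordentail_experiment` can be passed straight to this function, which re-featurizes the held-out test set with the same `vector_func` and `vector_combo_func`:

import os
import nli

# `results` is the return value of the `wordentail_experiment` sketch
# above; the filename shown is just the documented default made explicit.
nli.bake_off_evaluation(
    results,
    test_data_filename=os.path.join(
        'data', 'nlidata', 'nli_wordentail_bakeoff_data-test.json'))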



def str2tree(s, binarize=False):
@@ -214,47 +229,43 @@ def __repr__(self):
return """"NLIReader({})""".format(d)


SNLI_HOME = os.path.join("nlidata", "snli_1.0")

MULTINLI_HOME = os.path.join("nlidata", "multinli_1.0")


class SNLITrainReader(NLIReader):
def __init__(self, snli_home=SNLI_HOME, **kwargs):
def __init__(self, snli_home, **kwargs):
src_filename = os.path.join(
snli_home, "snli_1.0_train.jsonl")
super(SNLITrainReader, self).__init__(src_filename, **kwargs)


class SNLIDevReader(NLIReader):
def __init__(self, snli_home=SNLI_HOME, **kwargs):
def __init__(self, snli_home, **kwargs):
src_filename = os.path.join(
snli_home, "snli_1.0_dev.jsonl")
super(SNLIDevReader, self).__init__(src_filename, **kwargs)


class MultiNLITrainReader(NLIReader):
def __init__(self, snli_home=MULTINLI_HOME, **kwargs):
def __init__(self, snli_home, **kwargs):
src_filename = os.path.join(
snli_home, "multinli_1.0_train.jsonl")
super(MultiNLITrainReader, self).__init__(src_filename, **kwargs)


class MultiNLIMatchedDevReader(NLIReader):
def __init__(self, multinli_home=MULTINLI_HOME, **kwargs):
def __init__(self, multinli_home, **kwargs):
src_filename = os.path.join(
multinli_home, "multinli_1.0_dev_matched.jsonl")
super(MultiNLIMatchedDevReader, self).__init__(src_filename, **kwargs)


class MultiNLIMismatchedDevReader(NLIReader):
def __init__(self, multinli_home=MULTINLI_HOME, **kwargs):
def __init__(self, multinli_home, **kwargs):
src_filename = os.path.join(
multinli_home, "multinli_1.0_dev_mismatched.jsonl")
super(MultiNLIMismatchedDevReader, self).__init__(src_filename, **kwargs)
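
Since the module-level `SNLI_HOME` and `MULTINLI_HOME` defaults are removed in this commit, callers now pass corpus locations explicitly. A sketch of the new calling convention, where the local paths and the `samp_percentage` keyword (forwarded through `**kwargs` to `NLIReader`) are assumptions:

import os
import nli

# Hypothetical local corpus locations.
SNLI_HOME = os.path.join('data', 'nlidata', 'snli_1.0')
MULTINLI_HOME = os.path.join('data', 'nlidata', 'multinli_1.0')

# Readers are now constructed with an explicit home directory:
train_reader = nli.SNLITrainReader(SNLI_HOME, samp_percentage=0.10)
dev_reader = nli.MultiNLIMatchedDevReader(MULTINLI_HOME)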


def read_annotated_subset(src_filename):
def read_annotated_subset(src_filename, multinli_home):
"""Given an annotation filename from MultiNLI's separate
annotation distribution, associate it with the appropriate
dev examples.
@@ -263,6 +274,8 @@ def read_annotated_subset(src_filename):
----------
src_filename : str
Full path to the annotation file.
multinli_home : str
Full path to the MultiNLI corpus directory.
Returns
-------
@@ -273,9 +286,9 @@
"""
if 'mismatched' in src_filename:
reader = MultiNLIMismatchedDevReader()
reader = MultiNLIMismatchedDevReader(multinli_home)
else:
reader = MultiNLIMatchedDevReader()
reader = MultiNLIMatchedDevReader(multinli_home)
id2ex = {ex.pairID: ex for ex in reader.read()}
data = {}
with open(src_filename) as f:
@@ -404,8 +417,13 @@ def experiment(
Returns
-------
float
The overall scoring metric as determined by `score_metric`.
dict with keys
'model': trained model
'train_dataset': a dataset as returned by `build_dataset`
'assess_dataset': a dataset as returned by `build_dataset`
'predictions': predictions on the assessment data
'metric': `score_func.__name__`
'score': the `score_func` score on the assessment data
"""
# Train dataset:
@@ -417,12 +435,16 @@
# Manage the assessment set-up:
X_train = train['X']
y_train = train['y']
X_assess = None
y_assess = None
raw_train = train['raw_examples']
if assess_reader is None:
X_train, X_assess, y_train, y_assess = train_test_split(
X_train, y_train, train_size=train_size, test_size=None,
random_state=random_state)
X_train, X_assess, y_train, y_assess, raw_train, raw_assess = train_test_split(
X_train, y_train, raw_train,
train_size=train_size, test_size=None, random_state=random_state)
assess = {
'X': X_assess,
'y': y_assess,
'vectorizer': train['vectorizer'],
'raw_examples': raw_assess}
else:
# Assessment dataset using the training vectorizer:
assess = build_dataset(
@@ -437,6 +459,13 @@
predictions = mod.predict(X_assess)
# Report:
if verbose:
print('Accuracy: {0:0.03f}'.format(accuracy_score(y_assess, predictions)))
print(classification_report(y_assess, predictions, digits=3))
# Return the overall score:
return score_func(y_assess, predictions)
# Return the overall score and experimental info:
return {
'model': mod,
'train_dataset': train,
'assess_dataset': assess,
'predictions': predictions,
'metric': score_func.__name__,
'score': score_func(y_assess, predictions)}
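
A hedged sketch of how the new dict-returning `experiment` might be called. The `phi` and `train_func` parameter names, and the assumption that `phi` receives the two parsed sentences as trees, come from the surrounding course materials rather than this diff; `word_overlap_phi` and `fit_softmax_classifier` are illustrative helpers:

from collections import Counter

from sklearn.linear_model import LogisticRegression

import nli

def word_overlap_phi(t1, t2):
    # Indicator features for the words shared by the two sentences,
    # assuming t1 and t2 are trees with a leaves() method.
    return Counter(set(t1.leaves()) & set(t2.leaves()))

def fit_softmax_classifier(X, y):
    mod = LogisticRegression(solver='liblinear', multi_class='auto')
    mod.fit(X, y)
    return mod

# SNLI_HOME as in the reader sketch above.
results = nli.experiment(
    train_reader=nli.SNLITrainReader(SNLI_HOME, samp_percentage=0.10),
    phi=word_overlap_phi,
    train_func=fit_softmax_classifier,
    assess_reader=None,   # triggers the train/assess split shown above
    random_state=42)

print(results['metric'], results['score'])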
