diff --git a/fig/rnn_classifier.png b/fig/rnn_classifier.png
new file mode 100644
index 0000000..0e4fd05
Binary files /dev/null and b/fig/rnn_classifier.png differ
diff --git a/fig/tree_nn.png b/fig/tree_nn.png
new file mode 100644
index 0000000..9b471d2
Binary files /dev/null and b/fig/tree_nn.png differ
diff --git a/sgd_classifier.py b/sgd_classifier.py
new file mode 100644
index 0000000..8aada14
--- /dev/null
+++ b/sgd_classifier.py
@@ -0,0 +1,178 @@
+import numpy as np
+import random
+
+__author__ = "Christopher Potts"
+__version__ = "CS224u, Stanford, Spring 2018 term"
+
+
+class BasicSGDClassifier:
+ """Basic implementation of hinge-loss stochastic sub-gradient descent
+ optimization, intended to illustrate the basic concepts of classifier
+ optimization in code."""
+ def __init__(self, max_iter=10, eta=0.1):
+ """
+ Parameters
+ ----------
+ max_iter : int (default: 10)
+ Number of training epochs (full runs through shuffled data).
+ eta : float (default: 0.1)
+ Learning rate parameter.
+
+ """
+ self.max_iter = max_iter
+ self.eta = eta
+ self.params = ['max_iter', 'eta']
+
+ def fit(self, feat_matrix, labels):
+ """Core optimization function.
+
+ Parameters
+ ----------
+ feat_matrix : 2d matrix (np.array or any scipy.sparse type)
+ The design matrix, one row per example. Hence, the row
+ dimensionality is the example count and the column
+ dimensionality is number of features.
+
+ labels : list
+ The labels for each example, hence assumed to have the
+ same length as, and be aligned with, `feat_matrix`.
+
+ For attributes, we follow the `sklearn` style of using a
+ final `_` for attributes that are created by `fit` methods:
+
+ Attributes
+ ----------
+ self.classes_ : list
+ The set of class labels in sorted order.
+
+ self.n_classes_ : int
+ Length of `self.classes_`
+
+ self.coef_ : np.array of dimension (class count, feature count)
+ These are the weights, named as in `sklearn`. They are
+ organized so that each row represents the feature weights
+ for a given class, as is typical in `sklearn`.
+
+ """
+ # We'll deal with the labels via their indices into self.classes_:
+ self.classes_ = sorted(set(labels))
+ self.n_classes_ = len(self.classes_)
+ # Useful dimensions to store:
+ examplecount, featcount = feat_matrix.shape
+ # The weight matrix -- classes by row:
+ self.coef_ = np.zeros((self.n_classes_, featcount))
+ # Indices for shuffling the data at the start of each epoch:
+ indices = list(range(examplecount))
+ for _ in range(self.max_iter):
+ random.shuffle(indices)
+ for i in indices:
+ # Training instance as a feature rep and a label index:
+ rep = feat_matrix[i]
+ label_index = self.classes_.index(labels[i])
+ # Costs are 1.0 except for the true label:
+ costs = np.ones(self.n_classes_)
+ costs[label_index] = 0.0
+ # Make a prediction:
+ predicted_index = self.predict_one(rep, costs=costs)
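+ # Weight update if it's an incorrect prediction. NOTE(review): only the true label's row is moved toward `rep`; the predicted label's row is left unchanged -- confirm this is intended: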
+ if predicted_index != label_index:
+ self.coef_[label_index] += self.eta * rep
+
+ def predict_one(self, rep, costs=0.0):
+ """The core classification function. After computing a score
+ for every class, the code just needs to figure out which
+ class is highest scoring and make a random choice from that
+ set (in case of ties).
+
+ Parameters
+ ----------
+ rep : np.array of dimension featcount or
+ `scipy.sparse` matrix of dimension (1 x `featcount`)
+
+ costs : float or np.array of dimension self.n_classes_
+ Where this is 0.0, we're doing prediction. Where it
+ is an array, we expect a 0.0 at the coordinate
+ corresponding to the true label and a 1.0 in all
+ other positions.
+
+ Returns
+ -------
+ int
+ The index of the predicted class. This is for the
+ sake of the `fit` method. `predict` returns the class
+ names themselves.
+
+ """
+ scores = rep.dot(self.coef_.T) + costs
+ # Manage the difference between scipy and numpy 1d matrices:
+ scores = scores.reshape(self.n_classes_)
+ # Set of highest scoring label indices (in case of ties):
+ candidates = np.argwhere(scores==np.max(scores)).flatten()
+ return random.choice(candidates)
+
+ def predict(self, reps):
+ """Batch prediction function for experiments.
+
+ Parameters
+ ----------
+ reps : list or feature matrix
+ A featurized set of examples to make predictions about.
+
+ Returns
+ -------
+ list
+ A list of class labels -- the predictions. Unlike `predict_one`,
+ it returns the class name rather than its index.
+
+ """
+ return [self.classes_[self.predict_one(rep)] for rep in reps]
+
+ def get_params(self, deep=True):
+ """Gets the hyperparameters for the model, as given by the
+ `self.params` attribute. This is called `get_params` for
+ compatibility with sklearn; `deep` is accepted but ignored.
+
+ Returns
+ -------
+ dict
+ Map from attribute names to their values.
+
+ """
+ return {p: getattr(self, p) for p in self.params}
+
+ def set_params(self, **params):
+ for key, val in params.items():
+ setattr(self, key, val)
+ return self
+
+
+def simple_example():
+ """Assess on the digits dataset and informally compare
+ against LogisticRegression.
+ """
+ from sklearn.datasets import load_digits
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import classification_report
+ from sklearn.linear_model import LogisticRegression
+
+ digits = load_digits()
+ X = digits.data
+ y = digits.target
+
+ X_train, X_test, y_train, y_test = train_test_split(
+ X, y, test_size=0.33, random_state=42)
+
+ models = [
+ BasicSGDClassifier(max_iter=500),
+ LogisticRegression()
+ ]
+
+ for mod in models:
+ print(mod)
+ mod.fit(X_train, y_train)
+ predictions = mod.predict(X_test)
+ print(classification_report(y_test, predictions))
+
+
+if __name__ == '__main__':
+ simple_example()
diff --git a/sst_01_overview.ipynb b/sst_01_overview.ipynb
new file mode 100644
index 0000000..56e2445
--- /dev/null
+++ b/sst_01_overview.ipynb
@@ -0,0 +1,717 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Supervised sentiment: Overview of the Stanford Sentiment Treebank"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "__author__ = \"Christopher Potts\"\n",
+ "__version__ = \"CS224u, Stanford, Spring 2018 term\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "-"
+ }
+ },
+ "source": [
+ "## Contents\n",
+ "\n",
+ "0. [Overview of this unit](#Overview-of-this-unit)\n",
+ "0. [Paths through the material](#Paths-through-the-material)\n",
+ "0. [Overview of this notebook](#Overview-of-this-notebook)\n",
+ "0. [The complexity of sentiment analysis](#The-complexity-of-sentiment-analysis)\n",
+ "0. [Set-up](#Set-up)\n",
+ "0. [Data readers](#Data-readers)\n",
+ " 0. [Main readers](#Main-readers)\n",
+ " 0. [All-nodes readers](#All-nodes-readers)\n",
+ " 0. [Methodological notes](#Methodological-notes)\n",
+ "0. [Modeling the SST labels](#Modeling-the-SST-labels)\n",
+ " 0. [Train label distributions](#Train-label-distributions)\n",
+ " 0. [Dev label distributions](#Dev-label-distributions)\n",
+ "0. [Additional sentiment resources](#Additional-sentiment-resources)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Overview of this unit\n",
+ "\n",
+ "We have a few inter-related goals for this unit:\n",
+ "\n",
+ "* Provide a basic introduction to supervised learning in the context of a problem that has long been central to academic research and industry applications: __sentiment analysis__.\n",
+ "\n",
+ "* Explore and evaluate a diverse array of methods for modeling sentiment:\n",
+ " * Hand-built feature functions with (mostly linear) classifiers\n",
+ " * Dense feature representations derived from VSMs as we built them in the previous unit\n",
+ " * Recurrent neural networks (RNNs)\n",
+ " * Tree-structured neural networks\n",
+ " \n",
+ "* Begin discussing and implementing responsible methods for __hyperparameter optimization__ and __classifier assessment and comparison__.\n",
+ "\n",
+ "The unit is built around the [Stanford Sentiment Treebank (SST)](http://nlp.stanford.edu/sentiment/), a widely-used resource for evaluating supervised NLU models, and one that provides rich linguistic representations."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Paths through the material\n",
+ "\n",
+ "* If you're relatively new to supervised learning, we suggest studying the details of this notebook closely and following the links to [additional resources](#Additional-sentiment-resources). \n",
+ "\n",
+ "* If you're familiar with supervised learning, then you can focus right away on innovative feature representations and modeling. \n",
+ "\n",
+ "* As of this writing, the state-of-the-art for the SST seems to be around 88% accuracy for the binary problem and 48% accuracy for the five-class problem. Perhaps you can best these numbers!"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Overview of this notebook\n",
+ "\n",
+ "This is the first notebook in this unit. It does two things:\n",
+ "\n",
+ "* Introduces sentiment analysis as a task.\n",
+ "* Introduces the SST and our tools for reading that corpus. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## The complexity of sentiment analysis\n",
+ "\n",
+ "Sentiment analysis seems simple at first but turns out to exhibit all of the complexity of full natural language understanding. To see this, consider how your intuitions about the sentiment of the following sentences can change depending on perspective, social relationships, tone of voice, and other aspects of the context of utterance:\n",
+ "\n",
+ "1. There was an earthquake in LA.\n",
+ "1. The team failed the physical challenge. (We win/lose!)\n",
+ "1. They said it would be great. They were right/wrong.\n",
+ "1. Many consider the masterpiece bewildering, boring, slow-moving or annoying.\n",
+ "1. The party fat-cats are sipping their expensive, imported wines.\n",
+ "1. Oh, you're terrible!\n",
+ "\n",
+ "SST mostly steers around these challenges by including only focused, evaluative texts (sentences from movie reviews), but you should have them in mind if you consider new domains and applications for the ideas."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Set-up\n",
+ "\n",
+ "* Make sure your environment includes all the requirements for [the cs224u repository](https://github.com/cgpotts/cs224u).\n",
+ "\n",
+ "* Download [the train/dev/test Stanford Sentiment Treebank distribution](http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip), unzip it, and put the resulting folder in the same directory as this notebook. It will be called `trees`. (If you want to put it somewhere else, change sst_home below.)\n",
+ "\n",
+ "* Make sure you still have the `vsmdata` directory and its contents. ([Here's a link in case you need to redownload it.](http://web.stanford.edu/class/cs224u/data/vsmdata.zip)) In addition, you might want [the Wikipedia 2014 + Gigaword 5 distribution of the pretrained GloVe vectors](http://nlp.stanford.edu/data/glove.6B.zip). This might already be in `vsmdata`, depending on what kind of work you did as part of the VSM unit."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nltk.tree import Tree\n",
+ "import pandas as pd\n",
+ "import sst"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Data readers\n",
+ "\n",
+ "* The train/dev/test SST distribution contains files that are lists of trees where the part-of-speech tags have been replaced with sentiment scores `0...4`:\n",
+ " * `0` and `1` are negative labels.\n",
+ " * `2` is a neutral label.\n",
+ " * `3` and `4` are positive labels. \n",
+ "\n",
+ "* Our readers are iterators that yield `(tree, score)` pairs, where `tree` is an [NLTK Tree](http://www.nltk.org/_modules/nltk/tree.html) instance and `score` is a string."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Main readers\n",
+ "\n",
+ "We'll mainly work with `sst.train_reader` and `sst.dev_reader`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tree, score = next(sst.train_reader())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here, `score` is one of the labels. `tree` is an NLTK Tree instance. It should render pretty legibly in your browser:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "Tree('S', [Tree('2', [Tree('2', ['The']), Tree('2', ['Rock'])]), Tree('4', [Tree('3', [Tree('2', ['is']), Tree('4', [Tree('2', ['destined']), Tree('2', [Tree('2', [Tree('2', [Tree('2', [Tree('2', ['to']), Tree('2', [Tree('2', ['be']), Tree('2', [Tree('2', ['the']), Tree('2', [Tree('2', ['21st']), Tree('2', [Tree('2', [Tree('2', ['Century']), Tree('2', [\"'s\"])]), Tree('2', [Tree('3', ['new']), Tree('2', [Tree('2', ['``']), Tree('2', ['Conan'])])])])])])])]), Tree('2', [\"''\"])]), Tree('2', ['and'])]), Tree('3', [Tree('2', ['that']), Tree('3', [Tree('2', ['he']), Tree('3', [Tree('2', [\"'s\"]), Tree('3', [Tree('2', ['going']), Tree('3', [Tree('2', ['to']), Tree('4', [Tree('3', [Tree('2', ['make']), Tree('3', [Tree('3', [Tree('2', ['a']), Tree('3', ['splash'])]), Tree('2', [Tree('2', ['even']), Tree('3', ['greater'])])])]), Tree('2', [Tree('2', ['than']), Tree('2', [Tree('2', [Tree('2', [Tree('2', [Tree('1', [Tree('2', ['Arnold']), Tree('2', ['Schwarzenegger'])]), Tree('2', [','])]), Tree('2', [Tree('2', ['Jean-Claud']), Tree('2', [Tree('2', ['Van']), Tree('2', ['Damme'])])])]), Tree('2', ['or'])]), Tree('2', [Tree('2', ['Steven']), Tree('2', ['Segal'])])])])])])])])])])])])]), Tree('2', ['.'])])])"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tree"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This is what it actually looks like, of course:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(Tree('S', [Tree('2', [Tree('2', ['The']), Tree('2', ['Rock'])]), Tree('4', [Tree('3', [Tree('2', ['is']), Tree('4', [Tree('2', ['destined']), Tree('2', [Tree('2', [Tree('2', [Tree('2', [Tree('2', ['to']), Tree('2', [Tree('2', ['be']), Tree('2', [Tree('2', ['the']), Tree('2', [Tree('2', ['21st']), Tree('2', [Tree('2', [Tree('2', ['Century']), Tree('2', [\"'s\"])]), Tree('2', [Tree('3', ['new']), Tree('2', [Tree('2', ['``']), Tree('2', ['Conan'])])])])])])])]), Tree('2', [\"''\"])]), Tree('2', ['and'])]), Tree('3', [Tree('2', ['that']), Tree('3', [Tree('2', ['he']), Tree('3', [Tree('2', [\"'s\"]), Tree('3', [Tree('2', ['going']), Tree('3', [Tree('2', ['to']), Tree('4', [Tree('3', [Tree('2', ['make']), Tree('3', [Tree('3', [Tree('2', ['a']), Tree('3', ['splash'])]), Tree('2', [Tree('2', ['even']), Tree('3', ['greater'])])])]), Tree('2', [Tree('2', ['than']), Tree('2', [Tree('2', [Tree('2', [Tree('2', [Tree('1', [Tree('2', ['Arnold']), Tree('2', ['Schwarzenegger'])]), Tree('2', [','])]), Tree('2', [Tree('2', ['Jean-Claud']), Tree('2', [Tree('2', ['Van']), Tree('2', ['Damme'])])])]), Tree('2', ['or'])]), Tree('2', [Tree('2', ['Steven']), Tree('2', ['Segal'])])])])])])])])])])])])]), Tree('2', ['.'])])]),)"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(tree,)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here's a smaller example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAALgAAACMCAIAAABAuvQrAAAACXBIWXMAAA3XAAAN1wFCKJt4AAAAHXRFWHRTb2Z0d2FyZQBHUEwgR2hvc3RzY3JpcHQgOS4xNnO9PXQAAAnVSURBVHic7Z0xbONGFoZnD1cE62tYeNsA3M4ueVtrA5CN3Jpq7TQUkkW6BGSbjoT6AGSTTXcgXR2wbsjCbrMaIM2q88DbWoAGCE5ODsFFVwxujktL1IgSNTPU+ypRIqk3nF9vZjjDX88WiwUCgHX8RXYAgB6AUAAhQCiAECAUQAgQCiCEHkIJgiAIAtlRHDQaCCVJEsuyMMayAzloVBcKpRRj7Lqu7EAOHdWFEkWR53myowDUFgohBCFkWZbsQAD0V9kB1BEEQRzHsqMAEFJZKBhjSmkURWyTEJJlGXRWZKGuUEzT9H2fbxZFYZqmxHgOnGe6zB47jpPnuewoDhelO7OMLMscx8EYwz03iWiTUQC5aJBRABUAoQBCgFAAITQQCn18/Ocvv5DpVHYgB42691EQQvTxMbq+Tm5u/vPnn7/+9pv3+rXf75vHx7LjOkQUHfVwidD53Hv9+qsvvvjHzz/zTZDL/lFOKBWJlDVR8xHQNgoJRVAHIBcpKCGUBnUPctkzkoWyZX2DXPaGNKHssI5BLntAglBaqleQS6vsVSh7qEuQS0vsSSh7rj+Qy85pXSgS6wzkskNaFIoi9aRIGLrTilAUrBsFQ9KLHQtF8fpQPDyV2ZlQNKoDjUJVhx0IRdPrrmnYsthKKB241h0own5oKJSOXd+OFacNmgiFTKd///777l3TslzSr792X72SHZFCNMwowdWV1+t1RiJlmFz8ft94/lx2LAqhxHoUQH00WIUPqAAIBRAChAIIsdlzPRjjKIoopQgh13U77K7GnBPCMJQdiCpskFEopcwrK8/zPM8ppVmWtReZRMCw9CkbCIUQ4vu+YRhs0/f9TgoFDEuXskHTU3FnxBh30isLDEuX0rAzSylNkqTssdYNwLB0FU2EQikdDAae5/FmqDMEQdA99e+Ejd0MWJc2DMPu/ezAsLSGzYRCCImiKAzD7uUSBIal9SyEGY/Htm3PZjP+ThiG4odrh23bskNQiA0mBR3HoZRWckknvV+zLEuSBGPseR7cc2PA7DEgBMz1AEKAUAAhQCiAEEq7QkqBLYX89x9/fGPbnVzr2Ywmndng6grf3+fffttGQHJJbm+DLKPz+d8+++xfv//esdXj29Awo3TPHji5vY2ur8nDg316Gl9cGEdHbEV+cnPjn53BWuuGGSV6927x449tBLR/iskkyDJ8f2+fnvr9vn1ywj9izVD07p1xdMSyy8HK5aD7KMVkEl1fFx8+mC9e5N99V5YIw3j+PDw/93o9Jpfk5uZg5XKgGYVMp6xlMV+88Pt9r9cTP+Qws8vBZZRyfftnZ+H5ueCB5vFxfHHh9/ssu2Tv3wsqrBs0ySjFZOKMRtplFP7EKEJoy5SAP34Msoy1WQcil0MRSnB1xZ9BD113J61GuYvTebl0v+nh496d3xSxT07skxMml+Hbt9H1dXh+3tVH25sLpZhMng4TlCJ7/z64uuK3RlqKtiyXwQ8/PB1jd4NuZhTeKLQqkTJMLix7OaNR9+TSNaGQ6XT400+s3xBfXu653+D1el6v10m5dEco5XHv/iVS5qlc4osL7SeMGiyfvHt4QJeX+YcPu1uRuRWz+dzPMnR5abx542fZbD6XHdH/iW9ujDdv0OWl9/bt3cOD7HCa0ySjqPPjKJtpqTl15/V67qtXfH5R3+nohmtmn3355dLJkX3ClwRocfV19xNsLhSJ/YDKkgCNrrj6KXAVzYWy0UTJrqhZEqAROq5eaDjqsU9P9/87JtOpMxqtWhKgEZXVC3Q+jy8uZAe1Bs2e68EfP1qffy47il3C1gqq33pqJhRAFvC4BiAECAUQ4pPOLCGEEGKaJrd7KIoCIWQY
BjO/Y36QCKHyPk/3fLq5Dd22otTFfvIToWCMi6LAGOd5bhgGIYRtmqYZxzGz9mPeMqZpliuMec6g/ymDH2hZ1pZCYb49aZoyF4UoirpkbsPsJ5MkkR2IAJVb+nme+77v+z5/x/f9PM/55irXEOYpWn6nfJLGjMfjymld193+tCowm808z1toYsSypI9i2zallNneSceyLNu2+WaXrCj1sp9c3pkNw5C1nUrRJStK7ewnlwuFdTWU8hvumBWldvaTK4fHzJiaD3NWsZ/E0zErSm4/GQRBEAR8KKAydXM9nudxL81VlA3jK92atSITpHtWlDraT9bdcGO9WsH6tiyL3TthiB9YD8Z4OByWVbJWu+pjGIZdYid3m9rmk7keQshgMEAImaaZpilCiFL68uXLNE1t2x4MBuwPBSql4saQRVFEUcQ+xRjzmx/b0G0rSo3sJ3c/KVgUhRY/EWAjYPYYEAImBQEhQCiAECAUQAidhFJMJvTxUXYUO6aYTLRwTtRJKM5ohO/vZUexY5zRKLm9lR3FenQSCiAREIpk7NNT2SEIAUIBhAChAEKAUOSjRQ8dhAIIAUIBhAChAEKAUCSj/uPpDBCKZIyjI9khCAFCAYQAocgHJgUBIcjDg+wQ1gNCAYQAoQBCgFAAIUAoktHF3lInofhnZ7rcnhLHPD72z85kR7EeeK4HEEKnjAJIBIQCCKGuUMomlAqieHgVto9WXaEEQVA2X9knSZIEQTAcDmsC2Cg8QgjzzKnZp1XZbX8x1RUKMymV8tXMhMIwjJrK2yg80zTDMKyvquFwuFmUm7D9xVT3PwUVd0zceXitZpTto1VRKDxPLjVtwxizHG4YBvuVLLWgybKMedCx3Xzf5248g8GAOW8z5zT2cxf3/KkJL0kSbszMTPAIIcySiO9QFAWltPylhBDWzDmOw3YzDKN81DZlqYm2/thyWdI0bfLnk/uhYoTMsSxrNpux1+PxeKmbb5qmzOuX71axMTYMI45j/ml557UBrPo0TdOKkXPltPVfusqWuNWyrDq2UpbFYqGfUFzXTdOUb97d3S3dp/JOHMfloyq1srSSNhVK5SSz2az+W0RiWLRcllXHPj2Jik1PPXEcs1EJIcQwjKWtb1EUPI1z9uygz1vGLVGhLEjNPkoNrMfHvTcppY7jjMfjym62bZfb+P1Q6eWwvyrZ/rQqlAWpPDxeCsa4/F8Uq3qgrutWXEZ3VW01eJ43HA6ZlJmD8kaOh6Zplgc+/LUKZUEKTgoyR02EEGtZmBTiOGZpvCiKJEn4+4QQ27aXtj5RFPF/WGBF5bdGBoNB2bEzCIIkSfhmfQAi4bEq930/SRJ2zqdf6jgOxth13TiOWcBsNMe0xQLmH7VRlrXHlssSx7FyQhGB2d0ihMp/vLEU5pFsWZYU1+vhcMgrWwRerqUByy2LlkLRgqIoiqJQ3GZYHM06s4rD8jl7bVmW1ioplyXPc8gogBCajXoAWYBQACFAKIAQIBRAiP8CBCFm1Qc2iiMAAAAASUVORK5CYII=",
+ "text/plain": [
+ "Tree('4', [Tree('2', ['NLU']), Tree('4', [Tree('2', ['is']), Tree('4', ['enlightening'])])])"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Tree.fromstring(\"\"\"(4 (2 NLU) (4 (2 is) (4 enlightening)))\"\"\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### All-nodes readers\n",
+ "\n",
+ "In SST parlance, the __all-nodes task__ trains and assesses, not just with the full sentence, but also with all the labeled subtrees. We won't explore this task here, but it's good to know about it, and these readers will give you access to this version of the dataset:\n",
+ " * `sst.allnodes_train_reader`\n",
+ " * `sst.allnodes_dev_reader`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Methodological notes\n",
+ "\n",
+ "* We've deliberately ignored `test` readers. We urge you not to use the `test` set until and unless you are running experiments for a final project or similar. Overuse of test-sets corrupts them, since even subtle lessons learned from those runs can be incorporated back into model-building efforts.\n",
+ "\n",
+ "* We actually have mixed feelings about the overuse of `dev` that might result from working with these notebooks! We've tried to encourage using just splits of the training data for assessment most of the time, with only occasional use of `dev`. This will give you a clearer picture of how you will ultimately do on test; over-use of `dev` can lead to over-fitting on that particular dataset with a resulting loss of performance on `test`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": true,
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Modeling the SST labels\n",
+ "\n",
+ "Working with the SST involves making decisions about how to handle the raw SST labels. The interpretation of these labels is as follows ([Socher et al., sec. 3](http://www.aclweb.org/anthology/D/D13/D13-1170.pdf)):\n",
+ "\n",
+ "* `'0'`: very negative\n",
+ "* `'1'`: negative\n",
+ "* `'2'`: neutral\n",
+ "* `'3'`: positive\n",
+ "* `'4'`: very positive\n",
+ "\n",
+ "The labels look like they could be treated as totally ordered, even continuous. However, conceptually, they do not form such an order. Rather, they consist of three separate classes, with the negative and positive classes being totally ordered in opposite directions:\n",
+ "\n",
+ "* `'0' > '1'`: negative\n",
+ "* `'2'`: neutral\n",
+ "* `'4' > '3'`: positive\n",
+ "\n",
+ "Thus, in this notebook, we'll look mainly at binary (positive/negative) and ternary tasks.\n",
+ "\n",
+ "A related note: the above shows that the __fine-grained sentiment task__ for the SST is particularly punishing as usually formulated, since it ignores the partial-order structure in the categories completely. As a result, mistaking `'0'` for `'1'` is as bad as mistaking `'0'` for `'4'`, though the first error is clearly less severe than the second.\n",
+ "\n",
+ "The functions `sst.binary_class_func` and `sst.ternary_class_func` will convert the labels for you. Let's now use them to study the label distributions."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Train label distributions\n",
+ "\n",
+ "Check that these numbers all match those reported in [Socher et al. 2013, sec 5.1](http://www.aclweb.org/anthology/D/D13/D13-1170.pdf)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_labels = [y for tree, y in sst.train_reader()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total train examples: 8,544\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Total train examples: {:,}\".format(len(train_labels)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Distribution over the full label set:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "3 2322\n",
+ "1 2218\n",
+ "2 1624\n",
+ "4 1288\n",
+ "0 1092\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.Series(train_labels).value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "Binary label conversion:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "binary_train_labels = [\n",
+ " y for tree, y in sst.train_reader(class_func=sst.binary_class_func)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total binary train examples: 6,920\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Total binary train examples: {:,}\".format(len(binary_train_labels)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "positive 3610\n",
+ "negative 3310\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.Series(binary_train_labels).value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "Ternary label conversion:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "positive 3610\n",
+ "negative 3310\n",
+ "neutral 1624\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ternary_train_labels = [\n",
+ " y for tree, y in sst.train_reader(class_func=sst.ternary_class_func)]\n",
+ "\n",
+ "pd.Series(ternary_train_labels).value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Dev label distributions\n",
+ "\n",
+ "Check that these numbers all match those reported in [Socher et al. 2013, sec 5.1](http://www.aclweb.org/anthology/D/D13/D13-1170.pdf)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dev_labels = [y for tree, y in sst.dev_reader()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total dev examples: 1,101\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Total dev examples: {:,}\".format(len(dev_labels)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1 289\n",
+ "3 279\n",
+ "2 229\n",
+ "4 165\n",
+ "0 139\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.Series(dev_labels).value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "Binary label conversion:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "binary_dev_labels = [\n",
+ " y for tree, y in sst.dev_reader(class_func=sst.binary_class_func)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total binary dev examples: 872\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Total binary dev examples: {:,}\".format(len(binary_dev_labels)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "positive 444\n",
+ "negative 428\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.Series(binary_dev_labels).value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "Ternary label conversion:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "positive 444\n",
+ "negative 428\n",
+ "neutral 229\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ternary_dev_labels = [\n",
+ " y for tree, y in sst.dev_reader(class_func=sst.ternary_class_func)]\n",
+ "\n",
+ "pd.Series(ternary_dev_labels).value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Additional sentiment resources\n",
+ "\n",
+ "Here are a few publicly available datasets and other resources; if you decide to work on sentiment analysis, get in touch with the teaching staff — we have a number of other resources that we can point you to.\n",
+ "\n",
+ "* Sentiment lexica: http://sentiment.christopherpotts.net/lexicons.html\n",
+ "* NLTK now has a SentiWordNet module: http://www.nltk.org/api/nltk.corpus.reader.html#module-nltk.corpus.reader.sentiwordnet\n",
+ "* Stanford Large Movie Review Dataset: http://ai.stanford.edu/~amaas/data/sentiment/index.html\n",
+ "* SemEval-2013: Sentiment Analysis in Twitter: https://www.cs.york.ac.uk/semeval-2013/task2/\n",
+ "* Starter code for a sentiment-aware tokenizer: http://sentiment.christopherpotts.net/code-data/happyfuntokenizing.py"
+ ]
+ }
+ ],
+ "metadata": {
+ "celltoolbar": "Slideshow",
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.4"
+ },
+ "widgets": {
+ "state": {},
+ "version": "1.1.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/sst_02_hand_built_features.ipynb b/sst_02_hand_built_features.ipynb
new file mode 100644
index 0000000..2277b66
--- /dev/null
+++ b/sst_02_hand_built_features.ipynb
@@ -0,0 +1,993 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Supervised sentiment: Hand-built feature functions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "__author__ = \"Christopher Potts\"\n",
+ "__version__ = \"CS224u, Stanford, Spring 2018 term\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Contents\n",
+ "\n",
+ "0. [Overview](#Overview)\n",
+ "0. [Set-up](#Set-up)\n",
+ "0. [Feature functions](#Feature-functions)\n",
+ "0. [Building datasets for experiments](#Building-datasets-for-experiments)\n",
+ "0. [Basic optimization](#Basic-optimization)\n",
+ " 0. [Wrapper for SGDClassifier](#Wrapper-for-SGDClassifier)\n",
+ " 0. [Wrapper for LogisticRegression](#Wrapper-for-LogisticRegression)\n",
+ " 0. [Other scikit-learn models](#Other-scikit-learn-models)\n",
+ "0. [Experiments](#Experiments)\n",
+ " 0. [Experiment with default values](#Experiment-with-default-values)\n",
+ " 0. [A dev set run](#A-dev-set-run)\n",
+ " 0. [Assessing BasicSGDClassifier](#Assessing-BasicSGDClassifier)\n",
+ " 0. [Comparison with the baselines from Socher et al. 2013](#Comparison-with-the-baselines-from-Socher-et-al.-2013)\n",
+ " 0. [A shallow neural network classifier](#A-shallow-neural-network-classifier)\n",
+ "0. [Hyperparameter search](#Hyperparameter-search)\n",
+ " 0. [sst.fit_classifier_with_crossvalidation](#sst.fit_classifier_with_crossvalidation)\n",
+ " 0. [Example using LogisticRegression](#Example-using-LogisticRegression)\n",
+ " 0. [Example using BasicSGDClassifier](#Example-using-BasicSGDClassifier)\n",
+ "0. [Statistical comparison of classifier models](#Statistical-comparison-of-classifier-models)\n",
+ "0. [Exploratory exercise: The core development cycle](#Exploratory-exercise:-The-core-development-cycle)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Overview\n",
+ "\n",
+ "* The focus of this notebook is __building feature representations__ for use with (mostly linear) classifiers (though you're encouraged to try out some non-linear ones as well!)\n",
+ "\n",
+ "* The core characteristics of the feature functions we'll build here:\n",
+ " * They represent examples in __very large, very sparse feature spaces__.\n",
+ " * The individual feature functions can be __highly refined__, drawing on expert human knowledge of the domain. \n",
+ " * Taken together, these representations don't comprehensively represent the input examples. They just identify aspects of the inputs that the classifier model can make good use of (we hope).\n",
+ " \n",
+ "* These classifiers tend to be __highly competitive__. We'll look at more powerful deep learning models in the next notebook, and it will immediately become apparent that it is very difficult to get them to measure up to well-built classifiers based in sparse feature representations."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Set-up\n",
+ "\n",
+ "See [the previous notebook](sst_01_overview.ipynb#Set-up) for set-up instructions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Applications/anaconda/envs/nlu/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
+ " from ._conv import register_converters as _register_converters\n"
+ ]
+ }
+ ],
+ "source": [
+ "from collections import Counter\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "import scipy.stats\n",
+ "from sgd_classifier import BasicSGDClassifier\n",
+ "from tf_shallow_neural_classifier import TfShallowNeuralClassifier\n",
+ "import sst\n",
+ "import utils"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Feature functions\n",
+ "\n",
+ "* Feature representation is arguably __the most important step in any machine learning task__. As you experiment with the SST, you'll come to appreciate this fact, since your choice of feature function will have a far greater impact on the effectiveness of your models than any other choice you make.\n",
+ "\n",
+ "* We will define our feature functions as `dict`s mapping feature names (which can be any object that can be a `dict` key) to their values (which must be `bool`, `int`, or `float`). \n",
+ "\n",
+ "* To prepare for optimization, we will use `sklearn`'s [DictVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html) class to turn these into matrices of features. \n",
+ "\n",
+ "* The `dict`-based approach gives us a lot of flexibility and frees us from having to worry about the underlying feature matrix."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "A typical baseline or default feature representation in NLP or NLU is built from unigrams. Here, those are the leaf nodes of the tree:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def unigrams_phi(tree):\n",
+ " \"\"\"The basis for a unigrams feature function.\n",
+ " \n",
+ " Parameters\n",
+ " ----------\n",
+ " tree : nltk.tree\n",
+ " The tree to represent.\n",
+ " \n",
+ " Returns\n",
+ " ------- \n",
+ " collections.Counter\n",
+ " A map from strings to their counts in `tree`. (Counter maps a \n",
+ " list to a dict of counts of the elements in that list.)\n",
+ " \n",
+ " \"\"\"\n",
+ " return Counter(tree.leaves())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "In the docstring for `sst.sentiment_treebank_reader`, I pointed out that the labels on the subtrees can be used in a way that feels like cheating. Here's the most dramatic instance of this: `root_daughter_scores_phi` uses just the labels on the daughters of the root to predict the root (label). This will result in performance well north of 90% F1, but that's hardly worth reporting. (Interestingly, using the labels on the leaf nodes is much less powerful.) Anyway, don't use this function!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def root_daughter_scores_phi(tree): \n",
+ " \"\"\"The best way we've found to cheat without literally using the \n",
+ " labels as part of the feature representations. \n",
+ " \n",
+ " Don't use this for any real experiments!\n",
+ " \n",
+ " \"\"\"\n",
+ " return Counter([child.label() for child in tree])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "It's generally good design to __write lots of atomic feature functions__ and then bring them together into a single function when running experiments. This will lead to reusable parts that you can assess independently and in sub-groups as part of development."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Building datasets for experiments\n",
+ "\n",
+ "The second major phase for our analysis is a kind of set-up phase. Ingredients:\n",
+ "\n",
+ "* A reader like `train_reader`\n",
+ "* A feature function like `unigrams_phi`\n",
+ "* A class function like `binary_class_func`\n",
+ "\n",
+ "The convenience function `sst.build_dataset` uses these to build a dataset for training and assessing a model. See its documentation for details on how it works. Much of this is about taking advantage of `sklearn`'s many functions for model building."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_dataset = sst.build_dataset(\n",
+ " reader=sst.train_reader,\n",
+ " phi=unigrams_phi,\n",
+ " class_func=sst.binary_class_func,\n",
+ " vectorizer=None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train dataset with unigram features has 6,920 examples and 16,282 features\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Train dataset with unigram features has {:,} examples and {:,} features\".format(\n",
+ " *train_dataset['X'].shape))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Notice that `sst.build_dataset` has an optional argument `vectorizer`:\n",
+ "\n",
+ "* If it is `None`, then a new vectorizer is used and returned as `dataset['vectorizer']`. This is the usual scenario when training. \n",
+ "\n",
+ "* For evaluation, one wants to represent examples exactly as they were represented during training. To ensure that this happens, pass the training `vectorizer` to this function:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dev_dataset = sst.build_dataset(\n",
+ " reader=sst.dev_reader,\n",
+ " phi=unigrams_phi,\n",
+ " class_func=sst.binary_class_func,\n",
+ " vectorizer=train_dataset['vectorizer'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dev dataset with unigram features has 872 examples and 16,282 features\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Dev dataset with unigram features has {:,} examples and {:,} features\".format(\n",
+ " *dev_dataset['X'].shape))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Basic optimization\n",
+ "\n",
+ "We're now in a position to begin training supervised models!\n",
+ "\n",
+ "For the most part, in this course, we will not study the theoretical aspects of machine learning optimization, concentrating instead on how to optimize systems effectively in practice. That is, this isn't a theory course, but rather an experimental, project-oriented one.\n",
+ "\n",
+ "Nonetheless, we do want to avoid treating our optimizers as black boxes that work their magic and give us some assessment figures for whatever we feed into them. That seems irresponsible from a scientific and engineering perspective, and it also sends the false signal that the optimization process is inherently mysterious. So we do want to take a minute to demystify it with some simple code.\n",
+ "\n",
+ "The module `sgd_classifier` contains a complete optimization framework, as `BasicSGDClassifier`. Well, it's complete in the sense that it achieves our full task of supervised learning. It's incomplete in the sense that it is very basic. You probably wouldn't want to use it in experiments. Rather, we're going to encourage you to rely on `sklearn` for your experiments (see below). Still, this is a good basic picture of what's happening under the hood.\n",
+ "\n",
+ "So what is `BasicSGDClassifier` doing? The heart of it is the `fit` function (reflecting the usual `sklearn` naming system). This method implements a hinge-loss stochastic sub-gradient descent optimization. Intuitively, it works as follows:\n",
+ "\n",
+ "0. Start by assuming that all the feature weights are `0`.\n",
+ "0. Move through the dataset instance-by-instance in random order.\n",
+ "0. For each instance, classify it using the current weights. \n",
+ "0. If the classification is incorrect, move the weights in the direction of the correct classification\n",
+ "\n",
+ "This process repeats for a user-specified number of iterations (default `10` below), and the weight movement is tempered by a learning-rate parameter `eta` (default `0.1`). The output is a set of weights that can be used to make predictions about new (properly featurized) examples.\n",
+ "\n",
+ "In more technical terms, the objective function is \n",
+ "\n",
+ "$$\n",
+ " \\min_{\\mathbf{w} \\in \\mathbb{R}^{d}}\n",
+ " \\sum_{(x,y)\\in\\mathcal{D}} \n",
+ " \\max_{y'\\in\\mathbf{Y}}\n",
+ " \\left[\\mathbf{Score}_{\\textbf{w}, \\phi}(x,y') + \\mathbf{cost}(y,y')\\right] - \\mathbf{Score}_{\\textbf{w}, \\phi}(x,y)\n",
+ "$$\n",
+ "\n",
+ "where $\\mathbf{w}$ is the set of weights to be learned, $\\mathcal{D}$ is the training set of example–label pairs, $\\mathbf{Y}$ is the set of labels, $\\mathbf{cost}(y,y') = 0$ if $y=y'$, else $1$, and $\\mathbf{Score}_{\\textbf{w}, \\phi}(x,y')$ is the inner product of the weights \n",
+ "$\\mathbf{w}$ and the example as featurized according to $\\phi$.\n",
+ "\n",
+ "The `fit` method is then calculating the sub-gradient of this objective. In succinct pseudo-code:\n",
+ "\n",
+ "* Initialize $\\mathbf{w} = \\mathbf{0}$\n",
+ "* Repeat $T$ times:\n",
+ " * for each $(x,y)$ in $\\mathcal{D}$ (in random order):\n",
+ " * $\\tilde{y} = \\text{argmax}_{y'\\in \\mathbf{Y}} \\mathbf{Score}_{\\textbf{w}, \\phi}(x,y') + \\mathbf{cost}(y,y')$\n",
+ " * $\\mathbf{w} = \\mathbf{w} + \\eta(\\phi(x,y) - \\phi(x,\\tilde{y}))$\n",
+ " \n",
+ "This is very intuitive – push the weights in the direction of the positive cases. It doesn't require any probability theory. And such loss functions have proven highly effective in many settings. For a more powerful version of this classifier, see [sklearn.linear_model.SGDClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier). With `loss='hinge'`, it should behave much like `BasicSGDClassifier` (but faster!)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Wrapper for SGDClassifier\n",
+ "\n",
+ "For the sake of our experimental framework, a simple wrapper for `BasicSGDClassifier`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fit_basic_sgd_classifier(X, y): \n",
+ " \"\"\"Wrapper for `BasicSGDClassifier`.\n",
+ " \n",
+ " Parameters\n",
+ " ----------\n",
+ " X : 2d np.array\n",
+ " The matrix of features, one example per row.\n",
+ " \n",
+ " y : list\n",
+ " The list of labels for rows in `X`.\n",
+ " \n",
+ " Returns\n",
+ " -------\n",
+ " BasicSGDClassifier\n",
+ " A trained `BasicSGDClassifier` instance.\n",
+ " \n",
+ " \"\"\" \n",
+ " mod = BasicSGDClassifier()\n",
+ " mod.fit(X, y)\n",
+ " return mod"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Wrapper for LogisticRegression\n",
+ "\n",
+ "As I said above, we likely don't want to rely on `BasicSGDClassifier` (though it does a good job with SST!). Instead, we want to rely on `sklearn`. Here's a simple wrapper for [sklearn.linear_model.LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) using our \n",
+ "`build_dataset` paradigm."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fit_maxent_classifier(X, y): \n",
+ " \"\"\"Wrapper for `sklearn.linear_model.LogisticRegression`. This is also \n",
+ " called a Maximum Entropy (MaxEnt) Classifier, which is more fitting \n",
+ " for the multiclass case.\n",
+ " \n",
+ " Parameters\n",
+ " ----------\n",
+ " X : 2d np.array\n",
+ " The matrix of features, one example per row.\n",
+ " \n",
+ " y : list\n",
+ " The list of labels for rows in `X`.\n",
+ " \n",
+ " Returns\n",
+ " -------\n",
+ " sklearn.linear_model.LogisticRegression\n",
+ " A trained `LogisticRegression` instance.\n",
+ " \n",
+ " \"\"\"\n",
+ " mod = LogisticRegression(fit_intercept=True)\n",
+ " mod.fit(X, y)\n",
+ " return mod"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Other scikit-learn models\n",
+ "\n",
+ "* The [sklearn.linear_model](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model) package has a number of other classifier models that could be effective for SST.\n",
+ "\n",
+ "* The [sklearn.ensemble](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble) package contains powerful classifiers as well. The theme that runs through all of them is that one can get better results by averaging the predictions of a bunch of more basic classifiers. A [RandomForestClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier) will bring some of the power of deep learning models without the optimization challenges (though see [this blog post on some limitations of the current sklearn implementation](https://roamanalytics.com/2016/10/28/are-categorical-variables-getting-lost-in-your-random-forests/)).\n",
+ "\n",
+ "* The [sklearn.svm](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm) contains variations on Support Vector Machines (SVMs)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Experiments\n",
+ "\n",
+ "We now have all the pieces needed to run experiments. And we're going to want to run a lot of experiments, trying out different feature functions, taking different perspectives on the data and labels, and using different models. \n",
+ "\n",
+ "To make that process efficient and regimented, `sst` contains a function `experiment`. All it does is pull together these pieces and use them for training and assessment. It's complicated, but the flexibility will turn out to be an asset."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Experiment with default values"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.617\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.634 0.696 0.664 997\n",
+ " neutral 0.239 0.106 0.147 483\n",
+ " positive 0.666 0.772 0.715 1084\n",
+ "\n",
+ "avg / total 0.573 0.617 0.588 2564\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "_ = sst.experiment(\n",
+ " unigrams_phi,\n",
+ " fit_maxent_classifier,\n",
+ " train_reader=sst.train_reader, \n",
+ " assess_reader=None, \n",
+ " train_size=0.7,\n",
+ " class_func=sst.ternary_class_func,\n",
+ " score_func=utils.safe_macro_f1,\n",
+ " verbose=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A few notes on this function call:\n",
+ " \n",
+ "* Since `assess_reader=None`, the function reports performance on a random train–test split. Give `sst.dev_reader` as the argument to assess against the `dev` set.\n",
+ "\n",
+ "* `unigrams_phi` is the function we defined above. By changing/expanding this function, you can start to improve on the above baseline, perhaps periodically seeing how you do on the dev set.\n",
+ "\n",
+ "* `fit_maxent_classifier` is the wrapper we defined above. To assess new models, simply define more functions like this one. Such functions just need to consume an `(X, y)` constituting a dataset and return a model."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### A dev set run"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.602\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.628 0.689 0.657 428\n",
+ " neutral 0.343 0.153 0.211 229\n",
+ " positive 0.629 0.750 0.684 444\n",
+ "\n",
+ "avg / total 0.569 0.602 0.575 1101\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "_ = sst.experiment(\n",
+ " unigrams_phi,\n",
+ " fit_maxent_classifier,\n",
+ " class_func=sst.ternary_class_func,\n",
+ " assess_reader=sst.dev_reader)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Assessing BasicSGDClassifier"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.572\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.624 0.589 0.606 428\n",
+ " neutral 0.293 0.170 0.215 229\n",
+ " positive 0.601 0.764 0.673 444\n",
+ "\n",
+ "avg / total 0.546 0.572 0.552 1101\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "_ = sst.experiment(\n",
+ " unigrams_phi,\n",
+ " fit_basic_sgd_classifier,\n",
+ " class_func=sst.ternary_class_func,\n",
+ " assess_reader=sst.dev_reader)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Comparison with the baselines from Socher et al. 2013\n",
+ "\n",
+ "Where does our default set-up sit with regard to published baselines for the binary problem? (Compare [Socher et al., Table 1](http://www.aclweb.org/anthology/D/D13/D13-1170.pdf).)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.772\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.783 0.741 0.761 428\n",
+ " positive 0.762 0.802 0.782 444\n",
+ "\n",
+ "avg / total 0.772 0.772 0.772 872\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "_ = sst.experiment(\n",
+ " unigrams_phi,\n",
+ " fit_maxent_classifier,\n",
+ " class_func=sst.binary_class_func,\n",
+ " assess_reader=sst.dev_reader)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### A shallow neural network classifier\n",
+ "\n",
+ "While we're at it, we might as well see whether adding a hidden layer to our maxent classifier yields any benefits. Whereas `LogisticRegression` is, at its core, computing\n",
+ "\n",
+ "$$\\begin{align*}\n",
+ "y &= \\textbf{softmax}(xW_{xy} + b_{y})\n",
+ "\\end{align*}$$\n",
+ "\n",
+ "this model inserts a hidden layer with a non-linear activation applied to it:\n",
+ "\n",
+ "$$\\begin{align*}\n",
+ "h &= \\tanh(xW_{xh} + b_{h}) \\\\\n",
+ "y &= \\textbf{softmax}(hW_{hy} + b_{y})\n",
+ "\\end{align*}$$"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fit_nn_classifier(X, y):\n",
+ " mod = TfShallowNeuralClassifier(hidden_dim=50, max_iter=100)\n",
+ " mod.fit(X, y)\n",
+ " return mod"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 100: loss: 3.178435742855072"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.645\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.623 0.595 0.609 964\n",
+ " positive 0.662 0.687 0.674 1112\n",
+ "\n",
+ "avg / total 0.644 0.645 0.644 2076\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "_ = sst.experiment(\n",
+ " unigrams_phi,\n",
+ " fit_nn_classifier,\n",
+ " class_func=sst.binary_class_func)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "It looks like, with enough iterations (and perhaps some fiddling with the activation function and hidden dimensionality), this classifier would meet or exceed the baseline set up by `LogisticRegression`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Hyperparameter search\n",
+ "\n",
+ "The training process learns __parameters__ — the weights. There are typically lots of other parameters that need to be set. For instance, our `BasicSGDClassifier` has a learning rate parameter and a training iteration parameter. These are called __hyperparameters__. The more powerful `sklearn` classifiers often have many more such hyperparameters. These are outside of the explicitly stated objective, hence the \"hyper\" part. \n",
+ "\n",
+ "So far, we have just set the hyperparameters by hand. However, their optimal values can vary widely between datasets, and choices here can dramatically impact performance, so we would like to set them as part of the overall experimental framework."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### sst.fit_classifier_with_crossvalidation\n",
+ "\n",
+ "Luckily, `sklearn` provides a lot of functionality for setting hyperparameters via cross-validation. The function `sst.fit_classifier_with_crossvalidation` implements a basic framework for taking advantage of these options. \n",
+ "\n",
+ "\n",
+ "This method has the same basic shape as `fit_maxent_classifier` above: it takes a dataset as input and returns a trained model. However, to find its favored model, it explores a space of hyperparameters supplied by the user, seeking the optimal combination of settings.\n",
+ "\n",
+ "__Note__: this kind of search seems not to have a large impact for SST as we're using it. However, it can matter a lot for other data sets, and it's also an important step to take when trying to publish, since __reviewers are likely to want to check that your comparisons aren't based in part on opportunistic or ill-considered choices for the hyperparameters__."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Example using LogisticRegression\n",
+ "\n",
+ "Here's a fairly full-featured use of the above for the `LogisticRegression` model family:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fit_maxent_with_crossvalidation(X, y):\n",
+ " \"\"\"A MaxEnt model of dataset with hyperparameter \n",
+ " cross-validation. Some notes:\n",
+ " \n",
+ " * 'fit_intercept': whether to include the class bias feature.\n",
+ " * 'C': weight for the regularization term (smaller is more regularized).\n",
+ " * 'penalty': type of regularization -- roughly, 'l1' encourages small \n",
+ " sparse models, and 'l2' encourages the weights to conform to a \n",
+ " gaussian prior distribution.\n",
+ " \n",
+ " Other arguments can be cross-validated; see \n",
+ " http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n",
+ " \n",
+ " Parameters\n",
+ " ----------\n",
+ " X : 2d np.array\n",
+ " The matrix of features, one example per row.\n",
+ " \n",
+ " y : list\n",
+ " The list of labels for rows in `X`. \n",
+ " \n",
+ " Returns\n",
+ " -------\n",
+ " sklearn.linear_model.LogisticRegression\n",
+ " A trained model instance, the best model found.\n",
+ " \n",
+ " \"\"\" \n",
+ " basemod = LogisticRegression()\n",
+ " cv = 5\n",
+ " param_grid = {'fit_intercept': [True, False], \n",
+ " 'C': [0.4, 0.6, 0.8, 1.0, 2.0, 3.0],\n",
+ " 'penalty': ['l1','l2']} \n",
+ " return sst.fit_classifier_with_crossvalidation(X, y, basemod, cv, param_grid)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "slideshow": {
+ "slide_type": "-"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Best params {'C': 2.0, 'fit_intercept': True, 'penalty': 'l2'}\n",
+ "Best score: 0.755\n",
+ "Accuracy: 0.772\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.762 0.742 0.752 966\n",
+ " positive 0.781 0.798 0.789 1110\n",
+ "\n",
+ "avg / total 0.772 0.772 0.772 2076\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "_ = sst.experiment(\n",
+ " unigrams_phi,\n",
+ " fit_maxent_with_crossvalidation, \n",
+ " class_func=sst.binary_class_func)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "### Example using BasicSGDClassifier"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The models written for this course are also compatible with this framework. They [\"duck type\"](https://en.wikipedia.org/wiki/Duck_typing) the sklearn models by having methods `fit`, `predict`, `get_params`, and `set_params`, and an attribute `params`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fit_basic_sgd_classifier_with_crossvalidation(X, y):\n",
+ " basemod = BasicSGDClassifier()\n",
+ " cv = 5\n",
+ " param_grid = {'eta': [0.01, 0.1, 1.0], 'max_iter': [10]}\n",
+ " return sst.fit_classifier_with_crossvalidation(X, y, basemod, cv, param_grid)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Best params {'eta': 0.01, 'max_iter': 10}\n",
+ "Best score: 0.743\n",
+ "Accuracy: 0.752\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.717 0.787 0.750 980\n",
+ " positive 0.791 0.722 0.755 1096\n",
+ "\n",
+ "avg / total 0.756 0.752 0.753 2076\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "_ = sst.experiment(\n",
+ " unigrams_phi,\n",
+ " fit_basic_sgd_classifier_with_crossvalidation, \n",
+ " class_func=sst.binary_class_func)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Statistical comparison of classifier models\n",
+ "\n",
+ "Suppose two classifiers differ according to an effectiveness measure like F1 or accuracy. Are they meaningfully different?\n",
+ "\n",
+ "* For very large datasets, the answer might be clear: if performance is very stable across different train/assess splits and the difference in terms of correct predictions has practical import, then you can clearly say yes. \n",
+ "\n",
+ "* With smaller datasets, or models whose performance is closer together, it can be harder to determine whether the two models are different. We can address this question in a basic way with repeated runs and basic null-hypothesis testing on the resulting score vectors.\n",
+ "\n",
+ "The function `sst.compare_models` is designed for such testing. The default set-up uses the non-parametric [Wilcoxon signed-rank test](https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test) to make the comparisons, which is relatively conservative and recommended by [Demšar 2006](http://www.jmlr.org/papers/v7/demsar06a.html) for cases where one can afford to do multiple assessments.\n",
+ "\n",
+ "Here's an example showing the default parameter values and comparing `LogisticRegression` and `BasicSGDClassifier`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model 1 mean: 0.515\n",
+ "Model 2 mean: 0.505\n",
+ "p = 0.074\n"
+ ]
+ }
+ ],
+ "source": [
+ "_ = sst.compare_models(\n",
+ " unigrams_phi,\n",
+ " fit_maxent_classifier,\n",
+ " stats_test=scipy.stats.wilcoxon,\n",
+ " trials=10,\n",
+ " phi2=None, # Defaults to same as first required argument.\n",
+ " train_func2=fit_basic_sgd_classifier, # Defaults to same as second required argument.\n",
+ " reader=sst.train_reader, \n",
+ " train_size=0.7, \n",
+ " class_func=sst.ternary_class_func, \n",
+ " score_func=utils.safe_macro_f1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In general, one wants to compare __two feature functions against the same model__, or one wants to compare __two models with the same feature function used for both__. If both are changed at the same time, then it will be hard to figure out what is causing any differences you see."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "## Exploratory exercise: The core development cycle\n",
+ "\n",
+ "In order to get a feel for the codebase and prepare for the in-class bake-off, we suggest some rounds of __the basic development cycle for models based in hand-built feature functions__:\n",
+ "\n",
+ "0. Write a new feature function. We recommend starting with something simple.\n",
+ "0. Use `sst.experiment` to evaluate your new feature function on the binary and ternary versions of SST, with at least `fit_basic_sgd_classifier` and `fit_maxent_classifier`.\n",
+ "0. If you have time, compare your feature function with `unigrams_phi` using `compare_models`.\n",
+ "0. Return to step 1, or stop the cycle and conduct a more rigorous evaluation with hyperparameter tuning and assessment on the `dev` set.\n",
+ "\n",
+ "Error analysis is one of the most important methods for steadily improving a system, as it facilitates a kind of human-powered hill-climbing on your ultimate objective. Often, it takes a careful human analyst just a few examples to spot a major pattern that can lead to a beneficial change to the feature representations.\n",
+ "\n",
+ "To bring error analysis into your development cycle, you could improve `sst.experiment` by adding a keyword argument `view_errors` with default value `0`. Where the value is `n`, the function prints out a random selection of `n` errors: the underlying tree, the correct label, and the predicted label."
+ ]
+ }
+ ],
+ "metadata": {
+ "celltoolbar": "Slideshow",
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.4"
+ },
+ "widgets": {
+ "state": {},
+ "version": "1.1.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/sst_03_neural_networks.ipynb b/sst_03_neural_networks.ipynb
new file mode 100644
index 0000000..7541586
--- /dev/null
+++ b/sst_03_neural_networks.ipynb
@@ -0,0 +1,1187 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Supervised sentiment: Dense feature representations and neural networks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "__author__ = \"Christopher Potts\"\n",
+ "__version__ = \"CS224u, Stanford, Spring 2018 term\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Contents\n",
+ "\n",
+ "0. [Overview of this unit](#Overview-of-this-unit)\n",
+ "0. [Set-up](#Set-up)\n",
+ "0. [Distributed representations as features](#Distributed-representations-as-features)\n",
+ " 0. [GloVe inputs](#GloVe-inputs)\n",
+ " 0. [IMDB representations](#IMDB-representations)\n",
+ " 0. [Remarks on this approach](#Remarks-on-this-approach)\n",
+ "0. [RNN classifiers](#RNN-classifiers)\n",
+ " 0. [RNN dataset preparation](#RNN-dataset-preparation)\n",
+ " 0. [Vocabulary for embedding](#Vocabulary-for-embedding)\n",
+ " 0. [Pure NumPy RNN implementation](#Pure-NumPy-RNN-implementation)\n",
+ " 0. [TensorFlow implementation](#TensorFlow-implementation)\n",
+ "0. [Tree-structured neural networks](#Tree-structured-neural-networks)\n",
+ " 0. [TreeNN dataset preparation](#TreeNN-dataset-preparation)\n",
+ " 0. [Pure NumPy TreeNN implementation](#Pure-NumPy-TreeNN-implementation)\n",
+ "0. [Exploratory exercises](#Exploratory-exercises)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Overview of this unit"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Set-up\n",
+ "\n",
+ "See [the first notebook in this unit](sst_01_overview.ipynb#Set-up) for set-up instructions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Applications/anaconda/envs/nlu/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
+ " from ._conv import register_converters as _register_converters\n"
+ ]
+ }
+ ],
+ "source": [
+ "from collections import Counter\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import pandas as pd\n",
+ "import random\n",
+ "from rnn_classifier import RNNClassifier\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "from sklearn.metrics import classification_report\n",
+ "import tensorflow as tf\n",
+ "from tf_rnn_classifier import TfRNNClassifier\n",
+ "from tree_nn import TreeNN\n",
+ "import sst\n",
+ "import vsm\n",
+ "import utils"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "vsmdata_home = 'vsmdata'\n",
+ "\n",
+ "glove_home = os.path.join(vsmdata_home, 'glove.6B')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Distributed representations as features\n",
+ "\n",
+ "As a first step in the direction of neural networks for sentiment, we can connect with our previous unit on distributed representations. Arguably, more than any specific model architecture, this is the major innovation of deep learning: __rather than designing feature functions by hand, we use dense, distributed representations, often derived from unsupervised models__."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Our model will just be `LogisticRegression`, and we'll continue with the experiment framework from the previous notebook. Here is `fit_maxent_classifier` again:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fit_maxent_classifier(X, y): \n",
+ " mod = LogisticRegression(fit_intercept=True)\n",
+ " mod.fit(X, y)\n",
+ " return mod"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### GloVe inputs\n",
+ "\n",
+ "To illustrate this process, we'll use the general purpose GloVe representations released by the GloVe team, at 50d:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "glove_lookup = utils.glove2dict(\n",
+ " os.path.join(glove_home, 'glove.6B.50d.txt'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def vsm_leaves_phi(tree, lookup, np_func=np.sum):\n",
+ " \"\"\"Represent tree as a combination of the vector of its words.\n",
+ " \n",
+ " Parameters\n",
+ " ----------\n",
+ " tree : nltk.Tree \n",
+ " lookup : dict\n",
+ " From words to vectors.\n",
+ " np_func : function (default: np.sum)\n",
+ " A numpy matrix operation that can be applied columnwise, \n",
+ " like `np.mean`, `np.sum`, or `np.prod`. The requirement is that \n",
+ " the function take `axis=0` as one of its arguments (to ensure\n",
+ " columnwise combination) and that it return a vector of a \n",
+ " fixed length, no matter what the size of the tree is.\n",
+ " \n",
+ " Returns\n",
+ " -------\n",
+ " np.array, dimension `X.shape[1]`\n",
+ " \n",
+ " \"\"\"\n",
+ " dim = len(next(iter(lookup.values()))) \n",
+ " allvecs = np.array([lookup[w] for w in tree.leaves() if w in lookup])\n",
+ " if len(allvecs) == 0:\n",
+ " feats = np.zeros(dim)\n",
+ " else: \n",
+ " feats = np_func(allvecs, axis=0) \n",
+ " return feats"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def glove_leaves_phi(tree, np_func=np.sum):\n",
+ " return vsm_leaves_phi(tree, glove_lookup, np_func=np_func)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.732\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.715 0.718 0.717 981\n",
+ " positive 0.746 0.744 0.745 1095\n",
+ "\n",
+ "avg / total 0.732 0.732 0.732 2076\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "_ = sst.experiment(\n",
+ " glove_leaves_phi,\n",
+ " fit_maxent_classifier,\n",
+ " class_func=sst.binary_class_func,\n",
+ " vectorize=False) # Tell `experiment` that we already have our feature vectors."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### IMDB representations\n",
+ "\n",
+ "Our IMDB VSMs seem pretty well-attuned to the Stanford Sentiment Treebank, so we might think that they can do even better than the general-purpose GloVe inputs. Here are two quick assessments of that idea:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "imdb20 = pd.read_csv(\n",
+ " os.path.join(vsmdata_home, 'imdb_window20-flat.csv.gz'), index_col=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "imdb20_ppmi = vsm.pmi(imdb20, positive=False) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "imdb20_ppmi_svd = vsm.lsa(imdb20_ppmi, k=50) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "imdb_lookup = dict(zip(imdb20_ppmi_svd.index, imdb20_ppmi_svd.values))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def imdb_phi(tree, np_func=np.sum):\n",
+ " return vsm_leaves_phi(tree, imdb_lookup, np_func=np_func)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.751\n",
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.737 0.739 0.738 984\n",
+ " positive 0.764 0.762 0.763 1092\n",
+ "\n",
+ "avg / total 0.751 0.751 0.751 2076\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "_ = sst.experiment(\n",
+ " imdb_phi,\n",
+ " fit_maxent_classifier,\n",
+ " class_func=sst.binary_class_func,\n",
+ " vectorize=False) # Tell `experiment` that we already have our feature vectors."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Remarks on this approach\n",
+ "\n",
+ "* Recall that our `unigrams_phi` created feature representations with over 16K dimensions and got about 0.77.\n",
+ "\n",
+ "* The above models have only 50 dimensions and come close in terms of performance. In many ways, it's striking that we can get a model that is pretty competitive with so few dimensions.\n",
+ "\n",
+ "* The promise of the Mittens model of [Dingwall and Potts 2018](https://arxiv.org/abs/1803.09901) is that we can use GloVe itself to update the general purpose information in the 'glove.6B' vectors with specialized information from one of these IMDB count matrices. That might be worth trying; the `mittens` package already implements this!\n",
+ "\n",
+ "* That said, just summing up all the word representations is pretty unappealing linguistically. There's no doubt that we're losing a lot of valuable information in doing this. The models we turn to now can be seen as addressing this shortcoming while retaining the insight that our distributed representations are valuable for this task."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## RNN classifiers\n",
+ "\n",
+ "A recurrent neural network (RNN) is any deep learning model that processes its inputs sequentially. There are many variations on this theme. The one that we use here is an __RNN classifier__.\n",
+ "\n",
+ "![RNN classifier](fig/rnn_classifier.png)\n",
+ "\n",
+ "For a sequence of length $n$:\n",
+ "\n",
+ "$$\\begin{align*}\n",
+ "h_{t} &= \\tanh(x_{t}W_{xh} + h_{t-1}W_{hh}) \\\\\n",
+ "y &= \\textbf{softmax}(h_{n}W_{hy} + b)\n",
+ "\\end{align*}$$\n",
+ "\n",
+ "where $1 \\leqslant t \\leqslant n$. As indicated in the above diagram, the sequence of hidden states is padded with an initial state $h_{0}$. In our implementations, this is always an all-$0$ vector, but it can be initialized in more sophisticated ways (some of which we will explore in our unit on natural language inference).\n",
+ "\n",
+ "This is a potential gain over our sum-the-word-vectors baseline, in that it processes each word individually, in the context of those that came before it. Thus, not only is this sensitive to word order, but the hidden representation gives us the potential to encode how the preceding context for a word affects its interpretation.\n",
+ "\n",
+ "The downside of this, of course, is that this model is much more difficult to set up and optimize. Let's dive into those details."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### RNN dataset preparation\n",
+ "\n",
+ "SST contains trees, but the RNN processes just the sequence of leaf nodes. The function `sst.build_binary_rnn_dataset` creates datasets in this format:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_rnn_train, y_rnn_train = sst.build_binary_rnn_dataset(sst.train_reader)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here, `X_rnn_train` is a list of lists of words, so each member is a list of words. Here's a look at the start of the first:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['The', 'Rock', 'is', 'destined', 'to', 'be']"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_rnn_train[0][: 6]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Because this is a classifier, `y_rnn_train` is just a list of labels, one per example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'positive'"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y_rnn_train[0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For experiments, let's build a `dev` dataset as well:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_rnn_dev, y_rnn_dev = sst.build_binary_rnn_dataset(sst.dev_reader)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Vocabulary for embedding\n",
+ "\n",
+ "The first delicate issue we need to address is the vocabulary for our model:\n",
+ "\n",
+ "* As indicated in the figure above, the first thing we do when processing an example is look up the words in an embedding (a VSM), which has to have a fixed dimensionality. \n",
+ "\n",
+ "* We can use our training data to specify the vocabulary for this embedding; at prediction time, though, we will inevitably encounter words we haven't seen before. \n",
+ "\n",
+ "* The convention we adopt here is to map them to an `$UNK` token that is in our pre-specified vocabulary.\n",
+ "\n",
+ "* At the same time, we might want to collapse infrequent tokens into `$UNK` to make optimization easier.\n",
+ "\n",
+ "In `sst`, the function `get_vocab` implements these strategies:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now we can extract the training vocab and use it for the model embedding, secure in the knowledge that we will be able to process tokens outside of this set (by mapping them to `$UNK`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sst_full_train_vocab = sst.get_vocab(X_rnn_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "sst_full_train_vocab has 16,283 items\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"sst_full_train_vocab has {:,} items\".format(len(sst_full_train_vocab)))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This frankly seems too big. Let's restrict to just 3000 words:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sst_train_vocab = sst.get_vocab(X_rnn_train, n_words=3000)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Pure NumPy RNN implementation\n",
+ "\n",
+ "The first implementation we'll look at is a pure NumPy implementation of exactly the model depicted above. This implementation is a bit slow and might not be all that effective, but it is useful to have available in case one really wants to inspect the fine details of how these models process examples."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rnn = RNNClassifier(\n",
+ " sst_train_vocab,\n",
+ " embedding=None, # Will be randomly initialized.\n",
+ " embed_dim=50,\n",
+ " hidden_dim=50,\n",
+ " max_iter=50,\n",
+ " eta=0.05) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Finished epoch 50 of 50; error is 1.3376949593831804"
+ ]
+ }
+ ],
+ "source": [
+ "rnn.fit(X_rnn_train, y_rnn_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rnn_dev_predictions = rnn.predict(X_rnn_dev)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.48 0.65 0.55 428\n",
+ " positive 0.48 0.30 0.37 444\n",
+ "\n",
+ "avg / total 0.48 0.48 0.46 872\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(classification_report(y_rnn_dev, rnn_dev_predictions))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### TensorFlow implementation\n",
+ "\n",
+ "The included TensorFlow implementation is much faster and more configurable. Its only downside is that it requires the user to specify a maximum length for all incoming sequences: \n",
+ "\n",
+ "* Examples that are shorter than this maximum are padded (and the implementation ignores those dimensions)\n",
+ "* Examples that are longer than this maximum are clipped from the start (on the assumption that later words in the sentences will tend to be more informative).\n",
+ "\n",
+ "The function `utils.sequence_length_report` will help you make informed decisions:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Max sequence length: 52\n",
+ "Min sequence length: 2\n",
+ "Mean sequence length: 19.30\n",
+ "Median sequence length: 19.00\n",
+ "Sequences longer than 50: 6 of 6,920\n"
+ ]
+ }
+ ],
+ "source": [
+ "utils.sequence_length_report(X_rnn_train)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Thus, if we choose `max_length=52`, no training examples will get clipped. (When making predictions about new examples, some might still get clipped.)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The class `TfRNNClassifier` takes a parameter for specifying this maximum length. It has many others as well:\n",
+ " \n",
+ "* `hidden_activation`: the activation function for the hidden layers (default: `tf.nn.tanh`).\n",
+ "* `cell_class`: which TensorFlow cell-type to use: \n",
+ " * The default is an LSTM, which should help ensure that we get a good gradient signal all the way through even long sequences.\n",
+ " * `tf.nn.rnn_cell.BasicRNNCell` is the same as what we defined in pure NumPy.\n",
+ " * `tf.nn.rnn_cell.GRUCell` should be similar to the LSTM.\n",
+ "* `train_embedding`: whether to update the embedding during training."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf_rnn = TfRNNClassifier(\n",
+ " sst_train_vocab,\n",
+ " embed_dim=50,\n",
+ " hidden_dim=50,\n",
+ " max_length=52,\n",
+ " hidden_activation=tf.nn.tanh,\n",
+ " cell_class=tf.nn.rnn_cell.LSTMCell,\n",
+ " train_embedding=True,\n",
+ " max_iter=500,\n",
+ " eta=0.05) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 500: loss: 3.9221649765968323"
+ ]
+ }
+ ],
+ "source": [
+ "_ = tf_rnn.fit(X_rnn_train, y_rnn_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf_rnn_dev_predictions = tf_rnn.predict(X_rnn_dev)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.73 0.54 0.62 428\n",
+ " positive 0.64 0.81 0.72 444\n",
+ "\n",
+ "avg / total 0.69 0.68 0.67 872\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(classification_report(y_rnn_dev, tf_rnn_dev_predictions))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "It can be challenging to make sense of the errors that these models are making, but we should try. Here's a function for viewing random errors:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def view_error(model):\n",
+ " data = list(zip(X_rnn_dev, y_rnn_dev))\n",
+ " for _ in range(len(data)):\n",
+ " ex, label = random.choice(data)\n",
+ " pred = model.predict([ex])[0]\n",
+ " if label != pred:\n",
+ " print(\" \".join(ex))\n",
+ " print(\"Correct label: {}\".format(label))\n",
+ " return"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Affleck and Jackson are good sparring partners .\n",
+ "Correct label: positive\n"
+ ]
+ }
+ ],
+ "source": [
+ "view_error(tf_rnn)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "It can also be informative to invent examples and see how the model deals with them:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def process_new_examples(model):\n",
+ " examples = [\n",
+ " ['great'], ['excellent'], ['bad'], \n",
+ " ['boring'], ['not', 'good']\n",
+ " ] \n",
+ " for ex in examples:\n",
+ " ex = ['This', 'was'] + ex\n",
+ " prediction = model.predict([ex])[0]\n",
+ " print(\"{0:<30} {1:}\".format(\" \".join(ex), prediction))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "This was great negative\n",
+ "This was excellent negative\n",
+ "This was bad negative\n",
+ "This was boring positive\n",
+ "This was not good negative\n"
+ ]
+ }
+ ],
+ "source": [
+ "process_new_examples(tf_rnn)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Such experiments often reveal that the model is more sensitive to minor variation in the inputs than one would like!"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "With `embedding=None`, the `TfRNNClassifier` creates a random embedding space in which the values are drawn from a uniform distribution with bounds `[-1, 1)`. You can also pass in an embedding, as long as you make sure it has the right vocabulary:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sst_glove_vocab = sorted(set(glove_lookup) & set(sst_train_vocab))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "glove_embedding = np.array([glove_lookup[w] for w in sst_glove_vocab])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Add $UNK and its random representation:\n",
+ "\n",
+ "sst_glove_vocab.append(\"$UNK\")\n",
+ "\n",
+ "glove_embedding = np.vstack(\n",
+ " (glove_embedding, utils.randvec(glove_embedding.shape[1])))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf_rnn_glove = TfRNNClassifier(\n",
+ " sst_glove_vocab,\n",
+ " embedding=glove_embedding,\n",
+ " hidden_dim=50,\n",
+ " max_length=52,\n",
+ " hidden_activation=tf.nn.tanh,\n",
+ " cell_class=tf.nn.rnn_cell.LSTMCell,\n",
+ " train_embedding=True,\n",
+ " max_iter=500,\n",
+ " eta=0.05) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Iteration 500: loss: 3.3262188732624054"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tf_rnn_glove.fit(X_rnn_train, y_rnn_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf_rnn_imdb_dev_predictions = tf_rnn_glove.predict(X_rnn_dev)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.76 0.70 0.73 428\n",
+ " positive 0.73 0.79 0.76 444\n",
+ "\n",
+ "avg / total 0.75 0.75 0.75 872\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(classification_report(y_rnn_dev, tf_rnn_imdb_dev_predictions))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "This was great positive\n",
+ "This was excellent positive\n",
+ "This was bad negative\n",
+ "This was boring negative\n",
+ "This was not good positive\n"
+ ]
+ }
+ ],
+ "source": [
+ "process_new_examples(tf_rnn_glove)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Tree-structured neural networks\n",
+ "\n",
+ "Tree-structured neural networks (TreeNNs) are close relatives of RNN classifiers. (If you tilt your head, you can see the above sequence model as a kind of tree.) The TreeNNs we explore here are the simplest possible and actually have many fewer parameters than RNNs. Here's a summary:\n",
+ "\n",
+ "![Tree-structured neural network](fig/tree_nn.png)\n",
+ "\n",
+ "The crucial property of these networks is the way they employ recursion: the representation of a parent node $p$ has the same dimensionality as the word representations, allowing seamless repeated application of the central combination function:\n",
+ "\n",
+ "$$p = \\tanh([x_{L};x_{R}]W_{wh} + b)$$\n",
+ "\n",
+ "Here, $[x_{L};x_{R}]$ is the concatenation of the left and right child representations, and $p$ is the resulting parent node, which can then be a child node in a higher subtree.\n",
+ "\n",
+ "When we reach the root node $h_{r}$ of the tree, we apply a softmax classifier using that top node's representation:\n",
+ "\n",
+ "$$y = \\textbf{softmax}(h_{r}W_{hy} + b)$$"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### TreeNN dataset preparation\n",
+ "\n",
+ "This is the only model under consideration here that makes use of the tree structures in the SST:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_tree_dataset(reader):\n",
+ " data = [(tree, label) for tree, label in reader(class_func=sst.binary_class_func)]\n",
+ " X, y = zip(*data)\n",
+ " return list(X), list(y)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_tree_train, y_tree_train = get_tree_dataset(sst.train_reader)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "Tree('S', [Tree('2', [Tree('2', ['The']), Tree('2', ['Rock'])]), Tree('4', [Tree('3', [Tree('2', ['is']), Tree('4', [Tree('2', ['destined']), Tree('2', [Tree('2', [Tree('2', [Tree('2', [Tree('2', ['to']), Tree('2', [Tree('2', ['be']), Tree('2', [Tree('2', ['the']), Tree('2', [Tree('2', ['21st']), Tree('2', [Tree('2', [Tree('2', ['Century']), Tree('2', [\"'s\"])]), Tree('2', [Tree('3', ['new']), Tree('2', [Tree('2', ['``']), Tree('2', ['Conan'])])])])])])])]), Tree('2', [\"''\"])]), Tree('2', ['and'])]), Tree('3', [Tree('2', ['that']), Tree('3', [Tree('2', ['he']), Tree('3', [Tree('2', [\"'s\"]), Tree('3', [Tree('2', ['going']), Tree('3', [Tree('2', ['to']), Tree('4', [Tree('3', [Tree('2', ['make']), Tree('3', [Tree('3', [Tree('2', ['a']), Tree('3', ['splash'])]), Tree('2', [Tree('2', ['even']), Tree('3', ['greater'])])])]), Tree('2', [Tree('2', ['than']), Tree('2', [Tree('2', [Tree('2', [Tree('2', [Tree('1', [Tree('2', ['Arnold']), Tree('2', ['Schwarzenegger'])]), Tree('2', [','])]), Tree('2', [Tree('2', ['Jean-Claud']), Tree('2', [Tree('2', ['Van']), Tree('2', ['Damme'])])])]), Tree('2', ['or'])]), Tree('2', [Tree('2', ['Steven']), Tree('2', ['Segal'])])])])])])])])])])])])]), Tree('2', ['.'])])])"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "X_tree_train[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_tree_dev, y_tree_dev = get_tree_dataset(sst.dev_reader)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Pure NumPy TreeNN implementation\n",
+ "\n",
+ "`TreeNN` is a pure NumPy implementation of this model. It should be regarded as a baseline for models of this form. The original SST paper includes evaluations of a wide range of these models."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tree_nn = TreeNN(\n",
+ " sst_train_vocab, \n",
+ " embed_dim=100, \n",
+ " max_iter=100,\n",
+ " eta=0.05) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Finished epoch 100 of 100; error is 2.416002630512955"
+ ]
+ }
+ ],
+ "source": [
+ "tree_nn.fit(X_tree_train, y_tree_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tree_dev_predictions = tree_nn.predict(X_tree_dev)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.51 0.56 0.53 428\n",
+ " positive 0.53 0.47 0.50 444\n",
+ "\n",
+ "avg / total 0.52 0.52 0.52 872\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(classification_report(y_tree_dev, tree_dev_predictions))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tree_nn_glove = TreeNN(\n",
+ " sst_glove_vocab,\n",
+ " embedding=glove_embedding,\n",
+ " embed_dim=None, # Ignored when embedding is not `None`\n",
+ " max_iter=100,\n",
+ " eta=0.05) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Finished epoch 100 of 100; error is 1.337790645988493"
+ ]
+ }
+ ],
+ "source": [
+ "tree_nn_glove.fit(X_tree_train, y_tree_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tree_glove_dev_predictions = tree_nn_glove.predict(X_tree_dev)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " negative 0.54 0.35 0.43 428\n",
+ " positive 0.53 0.71 0.61 444\n",
+ "\n",
+ "avg / total 0.54 0.54 0.52 872\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(classification_report(y_tree_dev, tree_glove_dev_predictions))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Exploratory exercises\n",
+ "\n",
+ "These are largely meant to give you a feel for the material, but some of them could lead to projects and help you with future work for the course (especially the SST bake-off). These are not for credit.\n",
+ "\n",
+ "1. In the [distributed representations as features](#Distributed-representations-as-features) section, we just summed all of the leaf-node GloVe vectors to obtain a fixed-dimensional representation for all sentences. This ignores all of the tree structure. See if you can do better by paying attention to the binary tree structure: write a function `glove_subtree_phi` that obtains a vector representation for each subtree by combining the vectors of its daughters, with the leaf nodes again given by GloVe (any dimension you like) and the full representation of the sentence given by the final vector obtained by this recursive process. You can decide on how you combine the vectors. \n",
+ "\n",
+ "1. The default hidden activation function for `TfRNNClassifier` is `tf.nn.tanh`. This value is exposed via the `hidden_activation` keyword argument. See if you can get better results with [a different activation function](https://www.tensorflow.org/api_guides/python/nn). If you're feeling really ambitious, you could subclass `TfRNNClassifier` to define a new cost function (maybe with regularization) or optimizer. Deep learning offers a lot of design choices!\n",
+ "\n",
+ "1. Most of the experiments above used random initial representations of the words. Try other initialization schemes by importing or rebuilding vectors from the VSM unit.\n",
+ "\n",
+ "1. Tree-structured networks can be defined relatively easily in TensorFlow. [This excellent repository](https://github.com/erickrf/treernn) provides working code and a clear explanation. Get to know this code, and consider adapting it for use in SST experiments.\n",
+ "\n",
+ "1. The final hidden state in the RNN Classifier is the basis for the classification decisions. It should also be a good embedding for the entire example. Is it? The function `sst.get_sentence_embedding_from_rnn` extracts this space from a trained model given a set of examples. Study this space to see what insights it can provide about what the model has learned."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.4"
+ },
+ "widgets": {
+ "state": {},
+ "version": "1.1.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}