Unit notebooks and figures
cgpotts committed Apr 4, 2018
1 parent f3617d0 commit 5b06322
Showing 6 changed files with 3,075 additions and 0 deletions.
Binary file added fig/rnn_classifier.png
Binary file added fig/tree_nn.png
178 changes: 178 additions & 0 deletions sgd_classifier.py
@@ -0,0 +1,178 @@
import numpy as np
import random

__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2018 term"


class BasicSGDClassifier:
    """Basic implementation of hinge-loss stochastic sub-gradient
    descent optimization, intended to illustrate the basic concepts
    of classifier optimization in code."""
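    # In outline: on each shuffled pass, `fit` makes a cost-augmented
    # prediction for every example (each wrong class gets a 1.0 head
    # start) and, when a wrong class still wins, moves weight mass from
    # that class onto the true one.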
    def __init__(self, max_iter=10, eta=0.1):
        """
        Parameters
        ----------
        max_iter : int (default: 10)
            Number of training epochs (full runs through shuffled data).
        eta : float (default: 0.1)
            Learning rate parameter.
        """
        self.max_iter = max_iter
        self.eta = eta
        self.params = ['max_iter', 'eta']

    def fit(self, feat_matrix, labels):
        """Core optimization function.

        Parameters
        ----------
        feat_matrix : 2d matrix (np.array or any scipy.sparse type)
            The design matrix, one row per example. Hence, the row
            dimensionality is the example count and the column
            dimensionality is the feature count.
        labels : list
            The labels for each example, hence assumed to have the
            same length as, and be aligned with, `feat_matrix`.

        For attributes, we follow the `sklearn` style of using a
        final `_` for attributes that are created by `fit` methods:

        Attributes
        ----------
        self.classes_ : list
            The set of class labels in sorted order.
        self.n_classes_ : int
            Length of `self.classes_`.
        self.coef_ : np.array of dimension (class count, feature count)
            These are the weights, named as in `sklearn`. They are
            organized so that each row represents the feature weights
            for a given class, as is typical in `sklearn`.
        """
        # We'll deal with the labels via their indices into self.classes_:
        self.classes_ = sorted(set(labels))
        self.n_classes_ = len(self.classes_)
        # Useful dimensions to store:
        examplecount, featcount = feat_matrix.shape
        # The weight matrix -- classes by row:
        self.coef_ = np.zeros((self.n_classes_, featcount))
        # Indices for shuffling the data at the start of each epoch:
        indices = list(range(examplecount))
        for _ in range(self.max_iter):
            random.shuffle(indices)
            for i in indices:
                # Training instance as a feature rep and a label index:
                rep = feat_matrix[i]
                label_index = self.classes_.index(labels[i])
                # Costs are 1.0 except for the true label:
                costs = np.ones(self.n_classes_)
                costs[label_index] = 0.0
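                # Setting a cost of 1.0 on every wrong class turns the
                # prediction below into loss-augmented inference: the
                # true class has to beat every rival by a margin of 1.0,
                # or else a weight update fires.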
                # Make a prediction:
                predicted_index = self.predict_one(rep, costs=costs)
                # Weight update if it's an incorrect prediction --
                # promote the true class, demote the over-scoring one:
                if predicted_index != label_index:
                    self.coef_[label_index] += self.eta * rep
                    self.coef_[predicted_index] -= self.eta * rep

    def predict_one(self, rep, costs=0.0):
        """The core classification function. It scores `rep` against
        each class (with optional additive `costs`), finds the set of
        highest-scoring classes, and makes a random choice from that
        set (in case of ties).

        Parameters
        ----------
        rep : np.array of dimension featcount or
            `scipy.sparse` matrix of dimension (1 x featcount)
        costs : float or np.array of dimension `self.n_classes_`
            Where this is 0.0, we're doing prediction. Where it
            is an array, we expect a 0.0 at the coordinate
            corresponding to the true label and a 1.0 in all
            other positions.

        Returns
        -------
        int
            The index of the predicted class. This is for the
            sake of the `fit` method. `predict` returns the class
            names themselves.
        """
        scores = rep.dot(self.coef_.T) + costs
        # Manage the difference between scipy and numpy 1d matrices:
        scores = scores.reshape(self.n_classes_)
        # Set of highest scoring label indices (in case of ties):
        candidates = np.argwhere(scores == np.max(scores)).flatten()
        return random.choice(candidates)

    def predict(self, reps):
        """Batch prediction function for experiments.

        Parameters
        ----------
        reps : list or feature matrix
            A featurized set of examples to make predictions about.

        Returns
        -------
        list of str
            A list of class names -- the predictions. Unlike
            `predict_one`, it returns the class name rather than
            its index.
        """
        return [self.classes_[self.predict_one(rep)] for rep in reps]

    def get_params(self, deep=True):
        """Gets the hyperparameters for the model, as given by the
        `self.params` attribute. This is called `get_params` for
        compatibility with `sklearn`.

        Returns
        -------
        dict
            Map from attribute names to their values.
        """
        return {p: getattr(self, p) for p in self.params}

    def set_params(self, **params):
        """Sets hyperparameters by name; called `set_params` for
        compatibility with `sklearn`."""
        for key, val in params.items():
            setattr(self, key, val)
        return self
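    # With `fit`, `predict`, `get_params`, and `set_params` in place,
    # the class follows enough of the `sklearn` estimator protocol that
    # utilities such as `sklearn.model_selection.cross_validate` should,
    # by duck typing, be able to wrap it (a compatibility assumption,
    # not something this file verifies).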


def simple_example():
    """Assess on the digits dataset and informally compare
    against LogisticRegression.
    """
    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.linear_model import LogisticRegression

    digits = load_digits()
    X = digits.data
    y = digits.target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

    models = [
        BasicSGDClassifier(max_iter=500),
        LogisticRegression()
    ]

    for mod in models:
        print(mod)
        mod.fit(X_train, y_train)
        predictions = mod.predict(X_test)
        print(classification_report(y_test, predictions))


if __name__ == '__main__':
    simple_example()
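
In effect, `fit` runs stochastic sub-gradient descent on the cost-augmented multiclass hinge loss. As a gloss on the code above (writing $w_{y'}$ for the weight row of class $y'$), the per-example loss on $(x, y)$ is

$$\ell(x, y) = \max_{y'}\bigl(w_{y'} \cdot x + \mathbf{1}[y' \neq y]\bigr) - w_y \cdot x,$$

where the indicator term is exactly what the `costs` vector encodes. Whenever the cost-augmented argmax $\hat{y}$ differs from $y$, the update adds $\eta x$ to $w_y$ and subtracts $\eta x$ from $w_{\hat{y}}$.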
717 changes: 717 additions & 0 deletions sst_01_overview.ipynb
