Commit

creg2 checked in
redpony committed Mar 6, 2014
1 parent 1fbf2bb commit 2a3563b
Showing 6 changed files with 387 additions and 0 deletions.
19 changes: 19 additions & 0 deletions creg2/README.md
@@ -0,0 +1,19 @@

This directory contains some experimental learning code for logistic regression on structured label spaces.

It is possible to:
* define features of inputs and outputs separately; the classifier then operates on the outer product of the two feature spaces (see the sketch below)
* target an empirical distribution over labels, rather than a single gold-standard label
* specify a different output space (a neighborhood of candidate labels) for each training instance
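
Concretely, with input features f(x), label features g(y), and a weight matrix W, a candidate label y is scored with the bilinear form f(x)' W g(y) and normalized over that instance's neighborhood only. A minimal sketch of this computation (the function name is illustrative; the actual implementation lives in `iologreg.py`):

```python
import numpy as np

def label_distribution(x, neighborhood, W, label_feats):
    # x: input feature vector f(x); label_feats[y]: label feature vector g(y)
    # score each candidate label with the bilinear form f(x)' W g(y)
    scores = np.array([x.dot(W).dot(label_feats[y]) for y in neighborhood])
    # softmax over the neighborhood only; labels outside it get probability 0
    scores -= scores.max()
    p = np.exp(scores)
    return dict(zip(neighborhood, p / p.sum()))
```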

Usage

The inputs are the following (example formats below):
* `labels.feat` defines the feature map for the output space; providing an independent binary feature for each output reduces the problem to familiar multiclass logistic regression
* `train.feat` defines the feature maps for the input data and the discriminative neighborhood (set of candidate labels) for each training instance
* `train.resp` defines the response variable or the response distribution
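
For example, the three files might look like this (a hypothetical sketch inferred from how `creg2.py` parses them; fields are tab-separated and feature maps are JSON objects):

```
# labels.feat: <label> TAB <JSON feature map over output features>
setosa	{"L=setosa": 1}
versicolor	{"L=versicolor": 1}
virginica	{"L=virginica": 1}

# train.feat: <id> TAB <JSON input feature map> TAB <neighborhood {"N": [...]}>
ex1	{"sepal_length": 5.1, "petal_width": 0.4}	{"N": ["setosa", "versicolor", "virginica"]}

# train.resp: <id> TAB <gold label>
ex1	setosa
```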

Example invocation:

python creg2.py test_data/iris/labels.feat test_data/iris/iris.trainfeat test_data/iris/iris.trainresp
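
The script writes the label set and feature names to stderr, then prints, for each training instance, a dictionary mapping every label in that instance's neighborhood to its predicted probability (this mirrors the output loop at the end of `creg2.py`).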

64 changes: 64 additions & 0 deletions creg2/creg2.py
@@ -0,0 +1,64 @@
import sys
import json
from sklearn import preprocessing
from sklearn import feature_extraction
from iologreg import IOLogisticRegression

features = []
labels = {}
invlabels = {}
# read labels and associated features
for line in open(sys.argv[1]):
(label, f) = line.strip().split('\t')
invlabels[len(labels)] = label
labels[label] = len(labels)
features.append(json.loads(f))
label_dict = feature_extraction.DictVectorizer()
label_features = label_dict.fit_transform(features).toarray()

sys.stderr.write(' LABELS: %s\n' % ' '.join(labels.keys()))
sys.stderr.write('LABEL-FEATURES: %s\n' % ' '.join(label_dict.get_feature_names()))
out_dim = len(label_dict.get_feature_names())

ids = {}
X = []
N = []
# read training instances and neighborhoods
for line in open(sys.argv[2]):
(id, xfeats, n) = line.strip().split('\t')
ids[id] = len(ids)
X.append(json.loads(xfeats))
neighborhood = json.loads(n)['N']
if len(neighborhood) == 0:
sys.stderr.write('[ERROR] empty neighborhood in line:\n%s' % line)
sys.exit(1)
if len(neighborhood) == 1:
sys.stderr.write('[WARNING] neighborhood for id="%s" is singleton: %s\n' % (id, str(neighborhood)))
n = [labels[x] for x in neighborhood]
N.append(n)
X_dict = feature_extraction.DictVectorizer()
X = X_dict.fit_transform(X).toarray()

sys.stderr.write(' rows(X): %d\n' % len(X))
sys.stderr.write('INPUT-FEATURES: %s\n' % ' '.join(X_dict.get_feature_names()))
in_dim = len(X_dict.get_feature_names())

# read gold labels
Y = [0 for x in xrange(len(X))]
for line in open(sys.argv[3]):
(id, y) = line.strip().split('\t')
Y[ids[id]] = labels[y]

assert len(X) == len(N)
assert len(Y) == len(X)

model = IOLogisticRegression()
model.fit(in_dim, out_dim, X, N, Y, label_features, len(labels), iterations = 1000, minibatch_size=10)

D = model.predict_proba(X, N)
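# print, for each training instance, the predicted distribution over the labels in its neighborhood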
for row in D:
dist = {}
for i in range(len(row)):
if row[i] > 0.0: dist[invlabels[i]] = row[i]
print dist

101 changes: 101 additions & 0 deletions creg2/iologreg.py
@@ -0,0 +1,101 @@
import numpy as np
import random
import math
import sys

INFINITY = float('inf')

def logadd(a,b):
"""
compute log(exp(a) + exp(b))
"""
if a == -INFINITY:
return b
if b == -INFINITY:
return a
if b < a: # b - a < 0
return a + math.log1p(math.exp(b - a))
else: # a - b < 0
return b + math.log1p(math.exp(a - b))

class IOLogisticRegression:
"""
Logistic regression.
Minimize regularized log-loss:
L(x, y|w) = - sum_i log p(y_i|x_i, w) + l2 ||w||^2
p(y|x, w) = exp(w[y].x) / (sum_y' exp(w[y'].x))
Parameters
----------
l2: float, default=0
L2 regularization strength
"""
def __init__(self, l1=0.0, l2=0.0):
self.l1 = l1
self.l2 = l2

def gradient(self, x, n, y, y_feats, W, G):
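        # accumulate into G the gradient of the negative log-likelihood of gold label y,
        # normalizing only over the neighborhood n; returns this instance's log-loss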
z = -INFINITY
log_probs = np.zeros(self.num_labels)
xw = x.dot(W)
found = False
for yi in n:
if yi == y: found = True
u = xw.dot(y_feats[yi])
log_probs[yi] = u
z = logadd(z, u)
if not found:
            sys.stderr.write('[ERROR] for training instance %s gold label %s not found in neighborhood %s\n' % (x, y, n))
            raise ValueError('gold label not in neighborhood')
loss = -(log_probs[y] - z)
for yi in n:
delta = math.exp(log_probs[yi] - z) - (yi == y)
G += np.outer(x, y_feats[yi]) * delta
return loss

def fit(self, infeats, outfeats, X, N, Y, y_feats, num_labels, iterations=300, minibatch_size=1000, eta=1.0):
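        # X: dense input feature matrix; N: per-instance lists of candidate label indices;
        # Y: gold label indices; y_feats: dense label feature matrix (one row per label)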
minibatch_size = min(minibatch_size, len(X))
self.num_labels = num_labels
self.y_feats = y_feats
self.W = np.zeros(shape=(infeats, outfeats))
G = np.zeros(shape=(infeats, outfeats))
H = np.ones(shape=(infeats, outfeats)) * 1e-300
for i in range(iterations):
sys.stderr.write('Iteration: %d\n' % i)
G.fill(0.0)
loss = 0
for s in random.sample(range(X.shape[0]), minibatch_size):
loss += self.gradient(X[s], N[s], Y[s], y_feats, self.W, G)

#for k in range(self.n_classes - 1):
# offset = (self.n_features + 1) * k
# for j in range(self.n_features):
# loss += self.l2 * self.coef_[offset + j]**2
# g[offset + j] += 2 * self.l2 * self.coef_[offset + j]

sys.stderr.write(' Loss = %f\n' % loss)
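            # AdaGrad-style update: average the minibatch gradient, accumulate its square,
            # and scale the step by the inverse root of the accumulated squared gradients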
G /= minibatch_size
H += np.square(G)
self.W -= np.divide(G, np.sqrt(H)) * eta
return self

def predict_(self, x, n, probs):
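        # fill probs with the softmax of x.W.g(y) over the labels in neighborhood n;
        # labels outside the neighborhood keep probability 0.0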
probs.fill(0.0)
z = -INFINITY
xw = x.dot(self.W)
for y in n:
u = xw.dot(self.y_feats[y])
probs[y] = u
z = logadd(z, u)
for y in n:
probs[y] = math.exp(probs[y] - z)

    def predict(self, X, N):
        # return the index of the highest-probability label for each instance
        return np.argmax(self.predict_proba(X, N), axis=1)

def predict_proba(self, X, N):
post = np.zeros(shape=(len(X),self.num_labels))
for (x, n, p) in zip(X, N, post):
self.predict_(x, n, p)
return post
