Showing 6 changed files with 387 additions and 0 deletions.
README
This directory contains some experimental learning code for logistic regression on structured label spaces.

It is possible to
* define features of inputs and outputs separately; the classifier operates on the outer product space (see the sketch after this list)
* target an empirical distribution over labels, rather than a single gold-standard label
* specify a different output space for each training instance.
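A minimal numpy sketch of the outer-product scoring this implies; the shapes, values, and names (`phi_x`, `psi_y`) are illustrative, not part of the repository:

    import numpy as np

    # score(x, y) = f(x)^T W g(y): one weight per pair of an input feature
    # and a label feature, i.e. the outer product of the two feature spaces
    phi_x = np.array([1.0, 0.5])         # input feature vector f(x)
    psi_y = np.array([0.0, 1.0, 1.0])    # label feature vector g(y)
    W = np.zeros((2, 3))                 # weight matrix over the product space
    score = phi_x.dot(W).dot(psi_y)      # same form as xw.dot(y_feats[yi]) in iologreg.py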
Usage

The inputs are the following (illustrative records for each file appear after this list):
* `labels.feat` defines the feature map for the output space; providing an independent binary feature for each output reduces the problem to familiar multiclass logistic regression
* `train.feat` defines the feature maps for the input data and the discriminative neighborhoods for each training instance
* `train.resp` defines the response variable or the response distribution
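The formats below are inferred from how `creg2.py` parses these files; the records themselves are made up for illustration. All three files are tab-separated, with JSON feature maps:

    # labels.feat: label <TAB> JSON feature map
    setosa	{"is_setosa": 1}
    versicolor	{"is_versicolor": 1}

    # train.feat: id <TAB> JSON feature map <TAB> neighborhood of candidate labels
    x1	{"sepal_len": 5.1, "sepal_wid": 3.5}	{"N": ["setosa", "versicolor"]}

    # train.resp: id <TAB> gold label
    x1	setosa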
Example invocation:

    python creg2.py test_data/iris/labels.feat test_data/iris/iris.trainfeat test_data/iris/iris.trainresp
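The script trains on the full training set and then prints one Python dict per instance, mapping each label in that instance's neighborhood to its predicted probability; the values shown here are illustrative:

    {'setosa': 0.93, 'versicolor': 0.07}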
creg2.py
import sys
import json
from sklearn import feature_extraction

from iologreg import IOLogisticRegression

features = []
labels = {}
invlabels = {}
# read labels and their feature maps: label <TAB> JSON feature dict
for line in open(sys.argv[1]):
    (label, f) = line.strip().split('\t')
    invlabels[len(labels)] = label
    labels[label] = len(labels)
    features.append(json.loads(f))
# vectorize the label feature dicts into a dense matrix, one row per label
label_dict = feature_extraction.DictVectorizer()
label_features = label_dict.fit_transform(features).toarray()

sys.stderr.write('        LABELS: %s\n' % ' '.join(labels.keys()))
sys.stderr.write('LABEL-FEATURES: %s\n' % ' '.join(label_dict.get_feature_names()))
out_dim = len(label_dict.get_feature_names())

ids = {}
X = []
N = []
# read training instances and neighborhoods: id <TAB> JSON feature dict <TAB> {"N": [...]}
for line in open(sys.argv[2]):
    (id, xfeats, n) = line.strip().split('\t')
    ids[id] = len(ids)
    X.append(json.loads(xfeats))
    neighborhood = json.loads(n)['N']
    if len(neighborhood) == 0:
        sys.stderr.write('[ERROR] empty neighborhood in line:\n%s' % line)
        sys.exit(1)
    if len(neighborhood) == 1:
        sys.stderr.write('[WARNING] neighborhood for id="%s" is singleton: %s\n' % (id, str(neighborhood)))
    n = [labels[x] for x in neighborhood]
    N.append(n)
X_dict = feature_extraction.DictVectorizer()
X = X_dict.fit_transform(X).toarray()

sys.stderr.write('       rows(X): %d\n' % len(X))
sys.stderr.write('INPUT-FEATURES: %s\n' % ' '.join(X_dict.get_feature_names()))
in_dim = len(X_dict.get_feature_names())

# read gold labels: id <TAB> label
Y = [0 for x in xrange(len(X))]
for line in open(sys.argv[3]):
    (id, y) = line.strip().split('\t')
    Y[ids[id]] = labels[y]

assert len(X) == len(N)
assert len(Y) == len(X)

# train with minibatch AdaGrad updates (see iologreg.py)
model = IOLogisticRegression()
model.fit(in_dim, out_dim, X, N, Y, label_features, len(labels), iterations=1000, minibatch_size=10)

# print, for each training instance, the predicted distribution over its neighborhood
D = model.predict_proba(X, N)
for row in D:
    dist = {}
    for i in range(len(row)):
        if row[i] > 0.0: dist[invlabels[i]] = row[i]
    print dist
iologreg.py
import numpy as np
import random
import math
import sys

INFINITY = float('inf')

def logadd(a, b):
    """
    compute log(exp(a) + exp(b)) stably, without overflowing exp
    """
    if a == -INFINITY:
        return b
    if b == -INFINITY:
        return a
    if b < a:  # b - a < 0
        return a + math.log1p(math.exp(b - a))
    else:      # a - b < 0
        return b + math.log1p(math.exp(a - b))

class IOLogisticRegression:
    """
    Logistic regression on input and output features.
    Minimize regularized log-loss:
        L(x, y|W) = - sum_i log p(y_i|x_i, W) + l2 ||W||^2
        p(y|x, W) = exp(f(x).W.g(y)) / (sum_{y' in N(x)} exp(f(x).W.g(y')))
    where f(x) is the input feature vector, g(y) the label feature
    vector, and N(x) the neighborhood (candidate label set) of x.
    Parameters
    ----------
    l1: float, default=0
        L1 regularization strength (stored but not yet applied; see fit)
    l2: float, default=0
        L2 regularization strength (stored but not yet applied; see fit)
    """
    def __init__(self, l1=0.0, l2=0.0):
        self.l1 = l1
        self.l2 = l2

    def gradient(self, x, n, y, y_feats, W, G):
        z = -INFINITY
        log_probs = np.zeros(self.num_labels)
        xw = x.dot(W)
        found = False
        # accumulate unnormalized log-probs over the neighborhood n
        for yi in n:
            if yi == y: found = True
            u = xw.dot(y_feats[yi])
            log_probs[yi] = u
            z = logadd(z, u)
        if not found:
            print '[ERROR] for training instance', x, 'gold label', y, 'not found in neighborhood', n
            raise Exception('gold label not in neighborhood')
        loss = -(log_probs[y] - z)
        # gradient of the log-loss: (model probability - gold indicator)
        # times the outer product of input and label features
        for yi in n:
            delta = math.exp(log_probs[yi] - z) - (yi == y)
            G += np.outer(x, y_feats[yi]) * delta
        return loss

    def fit(self, infeats, outfeats, X, N, Y, y_feats, num_labels, iterations=300, minibatch_size=1000, eta=1.0):
        minibatch_size = min(minibatch_size, len(X))
        self.num_labels = num_labels
        self.y_feats = y_feats
        self.W = np.zeros(shape=(infeats, outfeats))
        G = np.zeros(shape=(infeats, outfeats))
        # AdaGrad accumulator of squared gradients (tiny init avoids division by zero)
        H = np.ones(shape=(infeats, outfeats)) * 1e-300
        for i in range(iterations):
            sys.stderr.write('Iteration: %d\n' % i)
            G.fill(0.0)
            loss = 0
            for s in random.sample(range(X.shape[0]), minibatch_size):
                loss += self.gradient(X[s], N[s], Y[s], y_feats, self.W, G)

            # regularization is not yet ported to this implementation:
            #for k in range(self.n_classes - 1):
            #    offset = (self.n_features + 1) * k
            #    for j in range(self.n_features):
            #        loss += self.l2 * self.coef_[offset + j]**2
            #        g[offset + j] += 2 * self.l2 * self.coef_[offset + j]

            sys.stderr.write('  Loss = %f\n' % loss)
            G /= minibatch_size
            # AdaGrad update: per-weight step size eta / sqrt(sum of squared gradients)
            H += np.square(G)
            self.W -= np.divide(G, np.sqrt(H)) * eta
        return self

    def predict_(self, x, n, probs):
        probs.fill(0.0)
        z = -INFINITY
        xw = x.dot(self.W)
        for y in n:
            u = xw.dot(self.y_feats[y])
            probs[y] = u
            z = logadd(z, u)
        # normalize within the neighborhood; labels outside n keep probability 0
        for y in n:
            probs[y] = math.exp(probs[y] - z)

    def predict(self, X, N):
        # return the index of the most probable label for each instance
        return self.predict_proba(X, N).argmax(axis=1)

    def predict_proba(self, X, N):
        post = np.zeros(shape=(len(X), self.num_labels))
        for (x, n, p) in zip(X, N, post):
            self.predict_(x, n, p)
        return post
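A minimal self-contained usage sketch of the class above, with synthetic data (the shapes and values are made up; indicator label features make this plain two-class logistic regression):

    import numpy as np
    from iologreg import IOLogisticRegression

    # two instances, two input features, two labels with indicator features
    X = np.array([[1.0, 0.0],
                  [0.0, 1.0]])
    y_feats = np.array([[1.0, 0.0],     # features of label 0
                        [0.0, 1.0]])    # features of label 1
    N = [[0, 1], [0, 1]]                # both labels are candidates everywhere
    Y = [0, 1]                          # gold label indices

    model = IOLogisticRegression()
    model.fit(2, 2, X, N, Y, y_feats, 2, iterations=50, minibatch_size=2)
    print(model.predict_proba(X, N))    # rows sum to 1 over each neighborhood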