Commit

creg2 checked in
redpony committed Mar 6, 2014
1 parent 1fbf2bb commit 2a3563b
Showing 6 changed files with 387 additions and 0 deletions.
19 changes: 19 additions & 0 deletions creg2/README.md
@@ -0,0 +1,19 @@

This directory contains some experimental learning code for logistic regression on structured label spaces.

It is possible to:
* define features of inputs and outputs separately; the classifier then operates on the outer product of the two feature spaces (see the sketch below)
* target an empirical distribution over labels, rather than a single gold-standard label
* specify a different output space (a neighborhood of candidate labels) for each training instance
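
Concretely, with input features f(x), label features g(y), and a weight matrix W, a candidate label y is scored with the bilinear form f(x)' W g(y) and normalized over that instance's neighborhood only. A minimal sketch of this computation (the function name is illustrative; the actual implementation lives in `iologreg.py`):

```python
import numpy as np

def label_distribution(x, neighborhood, W, label_feats):
    # x: input feature vector f(x); label_feats[y]: label feature vector g(y)
    # score each candidate label with the bilinear form f(x)' W g(y)
    scores = np.array([x.dot(W).dot(label_feats[y]) for y in neighborhood])
    # softmax over the neighborhood only; labels outside it get probability 0
    scores -= scores.max()
    p = np.exp(scores)
    return dict(zip(neighborhood, p / p.sum()))
```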

Usage

The inputs are the following (example formats below):
* `labels.feat` defines the feature map for the output space; providing an independent binary feature for each output reduces the problem to familiar multiclass logistic regression
* `train.feat` defines the feature maps for the input data and the discriminative neighborhood (set of candidate labels) for each training instance
* `train.resp` defines the response variable or the response distribution
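
For example, the three files might look like this (a hypothetical sketch inferred from how `creg2.py` parses them; fields are tab-separated and feature maps are JSON objects):

```
# labels.feat: <label> TAB <JSON feature map over output features>
setosa	{"L=setosa": 1}
versicolor	{"L=versicolor": 1}
virginica	{"L=virginica": 1}

# train.feat: <id> TAB <JSON input feature map> TAB <neighborhood {"N": [...]}>
ex1	{"sepal_length": 5.1, "petal_width": 0.4}	{"N": ["setosa", "versicolor", "virginica"]}

# train.resp: <id> TAB <gold label>
ex1	setosa
```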

Example invocation:

python creg2.py test_data/iris/labels.feat test_data/iris/iris.trainfeat test_data/iris/iris.trainresp
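
The script writes the label set and feature names to stderr, then prints, for each training instance, a dictionary mapping every label in that instance's neighborhood to its predicted probability (this mirrors the output loop at the end of `creg2.py`).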

64 changes: 64 additions & 0 deletions creg2/creg2.py
@@ -0,0 +1,64 @@
import sys
import json
from sklearn import preprocessing
from sklearn import feature_extraction
from iologreg import IOLogisticRegression

features = []
labels = {}
invlabels = {}
# read labels and associated features
for line in open(sys.argv[1]):
(label, f) = line.strip().split('\t')
invlabels[len(labels)] = label
labels[label] = len(labels)
features.append(json.loads(f))
label_dict = feature_extraction.DictVectorizer()
label_features = label_dict.fit_transform(features).toarray()

sys.stderr.write(' LABELS: %s\n' % ' '.join(labels.keys()))
sys.stderr.write('LABEL-FEATURES: %s\n' % ' '.join(label_dict.get_feature_names()))
out_dim = len(label_dict.get_feature_names())

ids = {}
X = []
N = []
# read training instances and neighborhoods
for line in open(sys.argv[2]):
(id, xfeats, n) = line.strip().split('\t')
ids[id] = len(ids)
X.append(json.loads(xfeats))
neighborhood = json.loads(n)['N']
if len(neighborhood) == 0:
sys.stderr.write('[ERROR] empty neighborhood in line:\n%s' % line)
sys.exit(1)
if len(neighborhood) == 1:
sys.stderr.write('[WARNING] neighborhood for id="%s" is singleton: %s\n' % (id, str(neighborhood)))
n = [labels[x] for x in neighborhood]
N.append(n)
X_dict = feature_extraction.DictVectorizer()
X = X_dict.fit_transform(X).toarray()

sys.stderr.write(' rows(X): %d\n' % len(X))
sys.stderr.write('INPUT-FEATURES: %s\n' % ' '.join(X_dict.get_feature_names()))
in_dim = len(X_dict.get_feature_names())

# read gold labels
Y = [0 for x in xrange(len(X))]
for line in open(sys.argv[3]):
(id, y) = line.strip().split('\t')
Y[ids[id]] = labels[y]

assert len(X) == len(N)
assert len(Y) == len(X)

model = IOLogisticRegression()
model.fit(in_dim, out_dim, X, N, Y, label_features, len(labels), iterations = 1000, minibatch_size=10)

D = model.predict_proba(X, N)
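# print, for each training instance, the predicted distribution over the labels in its neighborhood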
for row in D:
dist = {}
for i in range(len(row)):
if row[i] > 0.0: dist[invlabels[i]] = row[i]
print dist

101 changes: 101 additions & 0 deletions creg2/iologreg.py
@@ -0,0 +1,101 @@
import numpy as np
import random
import math
import sys

INFINITY = float('inf')

def logadd(a,b):
"""
compute log(exp(a) + exp(b))
"""
if a == -INFINITY:
return b
if b == -INFINITY:
return a
if b < a: # b - a < 0
return a + math.log1p(math.exp(b - a))
else: # a - b < 0
return b + math.log1p(math.exp(a - b))

class IOLogisticRegression:
"""
Logistic regression.
Minimize regularized log-loss:
L(x, y|w) = - sum_i log p(y_i|x_i, w) + l2 ||w||^2
p(y|x, w) = exp(w[y].x) / (sum_y' exp(w[y'].x))
Parameters
----------
l2: float, default=0
L2 regularization strength
"""
def __init__(self, l1=0.0, l2=0.0):
self.l1 = l1
self.l2 = l2

def gradient(self, x, n, y, y_feats, W, G):
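        # accumulate into G the gradient of the negative log-likelihood of gold label y,
        # normalizing only over the neighborhood n; returns this instance's log-loss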
z = -INFINITY
log_probs = np.zeros(self.num_labels)
xw = x.dot(W)
found = False
for yi in n:
if yi == y: found = True
u = xw.dot(y_feats[yi])
log_probs[yi] = u
z = logadd(z, u)
if not found:
            sys.stderr.write('[ERROR] for training instance %s gold label %s not found in neighborhood %s\n' % (x, y, n))
            raise ValueError('gold label not in neighborhood')
loss = -(log_probs[y] - z)
for yi in n:
delta = math.exp(log_probs[yi] - z) - (yi == y)
G += np.outer(x, y_feats[yi]) * delta
return loss

def fit(self, infeats, outfeats, X, N, Y, y_feats, num_labels, iterations=300, minibatch_size=1000, eta=1.0):
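        # X: dense input feature matrix; N: per-instance lists of candidate label indices;
        # Y: gold label indices; y_feats: dense label feature matrix (one row per label)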
minibatch_size = min(minibatch_size, len(X))
self.num_labels = num_labels
self.y_feats = y_feats
self.W = np.zeros(shape=(infeats, outfeats))
G = np.zeros(shape=(infeats, outfeats))
H = np.ones(shape=(infeats, outfeats)) * 1e-300
for i in range(iterations):
sys.stderr.write('Iteration: %d\n' % i)
G.fill(0.0)
loss = 0
for s in random.sample(range(X.shape[0]), minibatch_size):
loss += self.gradient(X[s], N[s], Y[s], y_feats, self.W, G)

#for k in range(self.n_classes - 1):
# offset = (self.n_features + 1) * k
# for j in range(self.n_features):
# loss += self.l2 * self.coef_[offset + j]**2
# g[offset + j] += 2 * self.l2 * self.coef_[offset + j]

sys.stderr.write(' Loss = %f\n' % loss)
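            # AdaGrad-style update: average the minibatch gradient, accumulate its square,
            # and scale the step by the inverse root of the accumulated squared gradients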
G /= minibatch_size
H += np.square(G)
self.W -= np.divide(G, np.sqrt(H)) * eta
return self

def predict_(self, x, n, probs):
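        # fill probs with the softmax of x.W.g(y) over the labels in neighborhood n;
        # labels outside the neighborhood keep probability 0.0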
probs.fill(0.0)
z = -INFINITY
xw = x.dot(self.W)
for y in n:
u = xw.dot(self.y_feats[y])
probs[y] = u
z = logadd(z, u)
for y in n:
probs[y] = math.exp(probs[y] - z)

    def predict(self, X, N):
        # return the index of the highest-probability label for each instance
        return np.argmax(self.predict_proba(X, N), axis=1)

def predict_proba(self, X, N):
post = np.zeros(shape=(len(X),self.num_labels))
for (x, n, p) in zip(X, N, post):
self.predict_(x, n, p)
return post
