Add Python code, README and plot notebook

negar-foroutan · May 28, 2018 · 1a17873 · 1a17873
1 parent d331cf8
commit 1a17873
Show file tree

Hide file tree

Showing 29 changed files with 1,633 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,39 @@
-# sparsifiedSGD
-Code for Sparsified SGD.
+# Sparsified SGD with Memory
+
+Code for the experimental part of the paper [Sparsified SGD with Memory TODO link](). It contains the code for the following experiments:
+
+- Theoretical convergence with different sparsification operators
+- Comparison with QSGD
+- Multi-core experiments
+
+Use `notebooks/plots.ipynb` to visualize the results.
+
+Please open an issue if you have questions or problems.
+
+### Reproduce the results
+
+To reproduce the results, you can download the datasets from [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html)
+
+```bash
+mkdir data
+cd data/
+wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_test.binary.bz2
+wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/epsilon_normalized.bz2
+```
+
+We decompress the libsvm file and use the pickle format instead. It takes more space but is faster to load. You can create the pickle file as follows:
+
+```python
+import pickle
+from sklearn.datasets import load_svmlight_file
+X, y = load_svmlight_file('rcv1_test.binary.bz2')
+with open('rcv1_test.pickle', 'wb') as f:
+    pickle.dump((X, y), f)
+```
+
+After updating the path to the data files in `experiment.py`, you can then run our experiments, for example:
+
+```bash
+python3 experiment.py rcv1-th results/rcv1-th --nproc 10
+```
+
diff --git a/base_logistic.py b/base_logistic.py
@@ -0,0 +1,68 @@
+import numpy as np
+from scipy.special import expit as sigmoid
+
+from parameters import Parameters
+
+
+class BaseLogistic:
+    def __init__(self, params: Parameters):
+        self.params = params
+        self.w_estimate = None
+        self.w = None
+
+    def lr(self, epoch, iteration, num_samples, d):
+        p = self.params
+        t = epoch * num_samples + iteration
+        if p.lr_type == 'constant':
+            return p.initial_lr
+        if p.lr_type == 'epoch-decay':
+            return p.initial_lr * (p.epoch_decay_lr ** epoch)
+        if p.lr_type == 'decay':
+            return p.initial_lr / (p.regularizer * (t + p.tau))
+
+    def loss(self, X, y):
+        w = self.w_estimate if self.w_estimate is not None else self.w
+        w = w.copy()
+        p = self.params
+        loss = np.sum(np.log(1 + np.exp(-y * (X @ w)))) / X.shape[0]
+        if p.regularizer:
+            loss += p.regularizer * np.square(w).sum()
+        return loss
+
+    def predict(self, X):
+        w = self.w_estimate if self.w_estimate is not None else self.w
+        logits = X @ w
+        pred = 1 * (logits >= 0.)
+        return pred
+
+    def predict_proba(self, X):
+        w = self.w_estimate if self.w_estimate is not None else self.w
+        logits = X @ w
+        return sigmoid(logits)
+
+    def score(self, X, y):
+        w = self.w_estimate if self.w_estimate is not None else self.w
+        logits = X @ w
+        pred = 2 * (logits >= 0.) - 1
+        acc = np.mean(pred == y)
+        return acc
+
+    def update_estimate(self, t):
+        p = self.params
+        if p.estimate == 'final':
+            self.w_estimate = self.w
+        elif p.estimate == 'mean':
+            rho = 1 / (t + 1)
+            self.w_estimate = self.w_estimate * (1 - rho) + self.w * rho
+        elif p.estimate == 't+tau':
+            rho = 2 * (t + p.tau) / ((1 + t) * (t + 2 * p.tau))
+            self.w_estimate = self.w_estimate * (1 - rho) + self.w * rho
+        elif p.estimate == '(t+tau)^2':
+            rho = 6 * ((t + p.tau) ** 2) / ((1 + t) * (6 * (p.tau ** 2) + t + 6 * p.tau * t + 2 * (t ** 2)))
+            self.w_estimate = self.w_estimate * (1 - rho) + self.w * rho
+
+    def __str__(self):
+        return "{}({})".format(self.__class__.__name__, self.params)
+
+    def __repr__(self):
+        return str(self)
diff --git a/baselines.py b/baselines.py
@@ -0,0 +1,58 @@
+import argparse
+import os
+import pickle
+
+import numpy as np
+from sklearn.linear_model import SGDClassifier
+
+from utils import pickle_it
+
+"""Arguments"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument('directory', type=str)
+
+args = parser.parse_args()
+if not os.path.exists(args.directory):
+    print('create {}'.format(args.directory))
+    os.makedirs(args.directory)
+
+baselines = {}
+
+
+def loss(clf, X, y, reg):
+    baseline_loss = np.sum(np.log(1 + np.exp(-y * (X @ clf.coef_.transpose()).squeeze()))) / X.shape[0]
+    baseline_loss += reg * np.sum(np.square(clf.coef_))
+    return baseline_loss
+
+
+""" RCV1 test"""
+print('RCV1-test')
+with open(os.path.expanduser('/mlodata1/jb/data/rcv1-test-1.pickle'), 'rb') as f:
+    X, y = pickle.load(f)
+
+reg = 1 / X.shape[0]
+clf = SGDClassifier(tol=1e-4, loss='log', penalty='l2', alpha=reg, fit_intercept=False)
+clf.fit(X, y)
+l = loss(clf, X, y, reg)
+print("loss: {}".format(l))
+print("train accuracy: {}".format(clf.score(X, y)))
+baselines['RCV1-test'] = l
+
+""" EPSILON """
+
+print('epsilon')
+with open(os.path.expanduser('/mlodata1/jb/data/epsilon_normalized_1.pickle'), 'rb') as f:
+    X, y = pickle.load(f)
+
+reg = 1 / X.shape[0]
+clf = SGDClassifier(tol=1e-4, loss='log', penalty='l2', alpha=reg)
+clf.fit(X, y)
+l = loss(clf, X, y, reg)
+print("loss: {}".format(l))
+print("train accuracy: {}".format(clf.score(X, y)))
+baselines['epsilon'] = l
+
+""" Pickle """
+print('baselines', baselines)
+pickle_it(baselines, 'baselines', args.directory)
diff --git a/constants.py b/constants.py
@@ -0,0 +1,2 @@
+# NOTE(review): presumably the standard deviation used when randomly initialising
+# the weights; the use site is not part of this file, so confirm there.
+INIT_WEIGHT_STD = 0.01
+# NOTE(review): presumably how many times the loss is recorded per epoch;
+# the use site is not part of this file, so confirm there.
+LOSS_PER_EPOCH = 100
diff --git a/experiment.py b/experiment.py
@@ -0,0 +1,197 @@
+import argparse
+import multiprocessing as mp
+import os
+import pickle
+
+import numpy as np
+
+from logistic import LogisticSGD
+from logistic_parallel import LogisticParallelSGD
+from parameters import Parameters
+from utils import pickle_it
+
+X, y = None, None
+
+
+def run_logistic(param):
+    m = LogisticSGD(param)
+    res = m.fit(X, y)
+    print('{} - score: {}'.format(param, m.score(X, y)))
+    return res
+
+
+def run_experiment(directory, dataset_pickle, params, nproc=None):
+    global X, y
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+    pickle_it(params, 'params', directory)
+
+    print('load dataset')
+    with open(dataset_pickle, 'rb') as f:
+        X, y = pickle.load(f)
+
+    print('start experiment')
+    with mp.Pool(nproc) as pool:
+        results = pool.map(run_logistic, params)
+
+    pickle_it(results, 'results', directory)
+    print('results saved in "{}"'.format(directory))
+
+
+def run_parallel_experiment(directory, dataset_pickle, models, cores, baseline, repeat=3):
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+    pickle_it([m(1) for m in models], 'models', directory)
+    pickle_it(cores, 'cores', directory)
+
+    print('load dataset')
+    with open(dataset_pickle, 'rb') as f:
+        X, y = pickle.load(f)
+
+    print('start experiment')
+
+    chronos = np.zeros((len(models), len(cores), repeat))
+    stop_times = np.zeros((len(models), len(cores), repeat), dtype=int)
+
+    for r in range(repeat):
+        for c_idx, core in enumerate(cores):
+            for m_idx, model in enumerate(models):
+                p = model(core)
+                print("{} - cores {} - repeat {}".format(p, core, r))
+                m = LogisticParallelSGD(p)
+                timing, epoch, iteration, losses = m.fit_until(X, y, num_features=X.shape[1], num_samples=X.shape[0],
+                                                               baseline=baseline)
+                chronos[m_idx, c_idx, r] = timing
+                stop_times[m_idx, c_idx, r] = epoch * X.shape[0] + iteration
+
+                pickle_it(chronos, 'chronos', directory)
+                pickle_it(stop_times, 'stop_times', directory)
+
+    pickle_it(chronos, 'chronos', directory)
+    pickle_it(stop_times, 'stop_times', directory)
+    print('results saved in "{}"'.format(directory))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('experiment', type=str)
+    parser.add_argument('directory', type=str)
+    parser.add_argument('--nproc', type=int, default=1)
+    args = parser.parse_args()
+
+    assert args.experiment in ['epsilon-th', 'epsilon-quant', 'epsilon-parallel',
+                               'rcv1-th', 'rcv1-quant', 'rcv1-parallel']
+
+    # dataset
+    if args.experiment.startswith('epsilon'):
+        dataset = os.path.expanduser('/mlodata1/jb/data/epsilon_normalized_1.pickle')
+        n, d = 400000, 2000
+    elif args.experiment.startswith('rcv1'):
+        dataset = os.path.expanduser('/mlodata1/jb/data/rcv1-test-1.pickle')
+        n, d = 677399, 47236
+
+    # parameters to evaluate
+    if args.experiment == 'epsilon-th':
+        params = [
+            Parameters(name="full-sgd", num_epoch=3, lr_type='decay', initial_lr=2, tau=1,
+                       regularizer=1 / n, estimate='(t+tau)^2'),
+            Parameters(name="top1", num_epoch=3, lr_type='decay', initial_lr=2, tau=d,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True, take_top=True),
+            Parameters(name="top1-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=1,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True, take_top=True),
+            Parameters(name="rand1", num_epoch=3, lr_type='decay', initial_lr=2, tau=d,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True),
+            Parameters(name="rand1-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=1,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True),
+            Parameters(name="rand2", num_epoch=3, lr_type='decay', initial_lr=2, tau=d / 2,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=2, with_memory=True),
+            Parameters(name="rand2-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=1,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=2, with_memory=True),
+            Parameters(name="rand3", num_epoch=3, lr_type='decay', initial_lr=2, tau=d / 3,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=3, with_memory=True),
+            Parameters(name="rand3-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=1,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=3, with_memory=True),
+        ]
+    elif args.experiment == 'epsilon-quant':
+        params = [
+            Parameters(name="qsgd-8bits", num_epoch=3, lr_type='decay', initial_lr=2, tau=d,
+                       regularizer=1 / n, estimate='(t+tau)^2', qsgd_s=2 ** 8),
+            Parameters(name="qsgd-4bits", num_epoch=3, lr_type='decay', initial_lr=2, tau=d,
+                       regularizer=1 / n, estimate='(t+tau)^2', qsgd_s=2 ** 4),
+            Parameters(name="top1", num_epoch=3, lr_type='decay', initial_lr=2, tau=d,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True, take_top=True),
+            Parameters(name="rand1", num_epoch=3, lr_type='decay', initial_lr=2, tau=d,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True),
+        ]
+    elif args.experiment == 'epsilon-parallel':
+        models = [
+            lambda n_cores: Parameters(name="rand1", num_epoch=5, lr_type='constant', initial_lr=.05, n_cores=n_cores,
+                                       regularizer=1 / n, take_k=1, with_memory=True, estimate='final'),
+            lambda n_cores: Parameters(name="top1", num_epoch=5, lr_type='constant', initial_lr=.05, n_cores=n_cores,
+                                       regularizer=1 / n, take_k=1, take_top=True, with_memory=True, estimate='final'),
+            lambda n_cores: Parameters(name="hogwild", num_epoch=5, lr_type='constant', initial_lr=.05, n_cores=n_cores,
+                                       regularizer=1 / n, estimate='final'),
+        ]
+        cores = [1, 2, 3, 5, 8, 10, 12, 14, 16, 18, 20, 22, 24]
+        baseline = 0.305
+
+    elif args.experiment == 'rcv1-th':
+        params = [
+            Parameters(name="full-sgd", num_epoch=3, lr_type='decay', initial_lr=2, tau=10,
+                       regularizer=1 / n, estimate='(t+tau)^2'),
+            Parameters(name="top10", num_epoch=3, lr_type='decay', initial_lr=2, tau=10 * d / 10,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=10, with_memory=True, take_top=True),
+            Parameters(name="top10-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=10,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=10, with_memory=True, take_top=True),
+            Parameters(name="rand10", num_epoch=3, lr_type='decay', initial_lr=2, tau=10 * d / 10,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=10, with_memory=True),
+            Parameters(name="rand10-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=10,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=10, with_memory=True),
+            Parameters(name="rand20", num_epoch=3, lr_type='decay', initial_lr=2, tau=10 * d / 20,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=20, with_memory=True),
+            Parameters(name="rand20-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=10,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=20, with_memory=True),
+            Parameters(name="rand30", num_epoch=3, lr_type='decay', initial_lr=2, tau=10 * d / 30,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=30, with_memory=True),
+            Parameters(name="rand30-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=10,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=30, with_memory=True),
+        ]
+    elif args.experiment == 'rcv1-quant':
+        params = [
+            Parameters(name="qsgd-8bits", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
+                       regularizer=1 / n, estimate='(t+tau)^2', qsgd_s=2 ** 8),
+            Parameters(name="qsgd-4bits", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
+                       regularizer=1 / n, estimate='(t+tau)^2', qsgd_s=2 ** 4),
+            Parameters(name="qsgd-2bits", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
+                       regularizer=1 / n, estimate='(t+tau)^2', qsgd_s=2 ** 2),
+            Parameters(name="top1", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True, take_top=True),
+            Parameters(name="rand1", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True),
+            Parameters(name="top10", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=10, with_memory=True, take_top=True),
+            Parameters(name="rand10", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
+                       regularizer=1 / n, estimate='(t+tau)^2', take_k=10, with_memory=True),
+        ]
+    elif args.experiment == 'rcv1-parallel':
+        models = [
+            lambda n_cores: Parameters(name="top100", num_epoch=6, lr_type='decay', initial_lr=2., n_cores=n_cores,
+                                       tau=10 / 100 * d,
+                                       regularizer=1 / n, estimate='final', take_k=100, take_top=True,
+                                       with_memory=True),
+            lambda n_cores: Parameters(name="rand100", num_epoch=6, lr_type='decay', initial_lr=2., n_cores=n_cores,
+                                       tau=10 / 100 * d,
+                                       regularizer=1 / n, estimate='final', take_k=100, take_top=False,
+                                       with_memory=True),
+            lambda n_cores: Parameters(name="hogwild", num_epoch=6, lr_type='decay', initial_lr=2., n_cores=n_cores,
+                                       tau=10, regularizer=1 / n,
+                                       estimate='final'),
+        ]
+
+        cores = [1, 2, 3, 5, 8, 10, 12, 14, 16, 18, 20, 22, 24]
+        baseline = 0.101
+
+    if 'parallel' in args.experiment:
+        run_parallel_experiment(args.directory, dataset, models, cores, baseline, repeat=3)
+    else:
+        run_experiment(args.directory, dataset, params, nproc=args.nproc)