Working on

negar-foroutan · Sep 13, 2018 · 2e93e9c · 2e93e9c
1 parent 3598c85
commit 2e93e9c
Show file tree

Hide file tree

Showing 16 changed files with 5,611 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -25,15 +25,30 @@ We decompress the libsvm file and use pickle format instead. It takes more space
 
 ```python
 import pickle
+import os
 from sklearn.datasets import load_svmlight_file
-X, y = load_svmlight_file('rcv1_test.binary.bz2')
-with open('rcv1_test.pickle', 'wb') as f:
+
+if not os.path.exists('data'):
+    os.makedirs('data')
+
+X, y = load_svmlight_file('data/rcv1_test.binary.bz2')
+with open('data/rcv1.pickle', 'wb') as f:
     pickle.dump((X, y), f)
+
+X, y = load_svmlight_file('data/epsilon_normalized.bz2')
+with open('data/epsilon.pickle', 'wb') as f:
+    pickle.dump((X, y), f)
+```
+
+After updating the path to the data files in `experiment.py`, you can run the baselines:
+
+```bash
+python3 baselines.py ./data results/baselines
 ```
 
-After updating the path to the data files in `experiment.py` , you can then run our experiments, for example
+and then run our experiments, for example:
 
 ```bash
-python3 experiment.py rcv1-th results/rcv1-th --nproc 10
+python3 experiment.py rcv1-th ./data results/rcv1-th --nproc 10
 ```
 
diff --git a/base_logistic.py b/base_logistic.py
@@ -19,6 +19,8 @@ def lr(self, epoch, iteration, num_samples, d):
             return p.initial_lr * (p.epoch_decay_lr ** epoch)
         if p.lr_type == 'decay':
             return p.initial_lr / (p.regularizer * (t + p.tau))
+        if p.lr_type == 'bottou':
+            return p.initial_lr / (1 + p.initial_lr * p.regularizer * t)
 
     def loss(self, X, y):
         w = self.w_estimate if self.w_estimate is not None else self.w

diff --git a/baselines.py b/baselines.py
@@ -10,12 +10,16 @@
 """Arguments"""
 
 parser = argparse.ArgumentParser()
-parser.add_argument('directory', type=str)
+parser.add_argument('data', type=str)
+parser.add_argument('output', type=str)
+
+EPSILON_NAME = "epsilon.pickle"
+RCV1_NAME = "rcv1.pickle"
 
 args = parser.parse_args()
-if not os.path.exists(args.directory):
-    print('create {}'.format(args.directory))
-    os.makedirs(args.directory)
+if not os.path.exists(args.output):
+    print('create {}'.format(args.output))
+    os.makedirs(args.output)
 
 baselines = {}
 
@@ -28,7 +32,7 @@ def loss(clf, X, y, reg):
 
 """ RCV1 test"""
 print('RCV1-test')
-with open(os.path.expanduser('/mlodata1/jb/data/rcv1-test-1.pickle'), 'rb') as f:
+with open(os.path.join(args.data, RCV1_NAME), 'rb') as f:
     X, y = pickle.load(f)
 
 reg = 1 / X.shape[0]
@@ -42,7 +46,7 @@ def loss(clf, X, y, reg):
 """ EPSILON """
 
 print('epsilon')
-with open(os.path.expanduser('/mlodata1/jb/data/epsilon_normalized_1.pickle'), 'rb') as f:
+with open(os.path.join(args.data, EPSILON_NAME), 'rb') as f:
     X, y = pickle.load(f)
 
 reg = 1 / X.shape[0]
@@ -55,4 +59,4 @@ def loss(clf, X, y, reg):
 
 """ Pickle """
 print('baselines', baselines)
-pickle_it(baselines, 'baselines', args.directory)
+pickle_it(baselines, 'baselines', args.output)
diff --git a/eps-quant.py b/eps-quant.py
@@ -0,0 +1,30 @@
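+# Settings for plain SGD with the 'bottou' learning-rate schedule and the 'final' estimate.
+# The best initial learning rate found for this setup is lr = 1.0.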
+from parameters import Parameters
+from logistic import LogisticSGD
+from experiment import run_experiment
+
+n = 400000
+params = []
+
+num_epoch=5
+lr=1.
+params.append(Parameters(name="full-sgd", num_epoch=num_epoch, lr_type='bottou', initial_lr=lr,
+                   regularizer=1 / n, estimate='final'))
+params.append(Parameters(name="qsgd-8bit", num_epoch=num_epoch, lr_type='bottou', initial_lr=lr,
+                   regularizer=1 / n, estimate='final',
+                        qsgd_s=2 ** 8))
+params.append(Parameters(name="qsgd-4bit", num_epoch=num_epoch, lr_type='bottou', initial_lr=lr,
+                   regularizer=1 / n, estimate='final',
+                        qsgd_s=2 ** 4))
+params.append(Parameters(name="qsgd-2bit", num_epoch=num_epoch, lr_type='bottou', initial_lr=lr,
+                   regularizer=1 / n, estimate='final',
+                        qsgd_s=2 ** 2))
+params.append(Parameters(name="top1", num_epoch=num_epoch, lr_type='bottou', initial_lr=lr,
+                   regularizer=1 / n, estimate='final',
+                        take_k=1, take_top=True, with_memory=True))
+params.append(Parameters(name="rand1", num_epoch=num_epoch, lr_type='bottou', initial_lr=lr,
+                   regularizer=1 / n, estimate='final',
+                        take_k=1, take_top=False, with_memory=True))
+
+run_experiment('eps-quantized', '/mlodata1/jb/data/epsilon_normalized_1.pickle', params, nproc=12)
diff --git a/eps-quantized-search.py b/eps-quantized-search.py
@@ -0,0 +1,117 @@
+import argparse
+import multiprocessing as mp
+import os
+import pickle
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from logistic import LogisticSGD
+from parameters import Parameters
+from utils import pickle_it, unpickle_dir
+
+plt.switch_backend('agg')
+from matplotlib import rc
+rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
+## for Palatino and other serif fonts use:
+#rc('font',**{'family':'serif','serif':['Palatino']})
+rc('text', usetex=True)
+
+parser = argparse.ArgumentParser()
+parser.add_argument('data_dir', type=str)
+parser.add_argument('result_dir', type=str)
+args = parser.parse_args()
+
+DATA_DIR = args.data_dir
+RESULT_DIR = args.result_dir
+DATASET = 'epsilon.pickle'
+SUBSAMPLE = 0.01
+SEED = 2018
+NUM_EPOCH = 10
+
+print('load dataset')
+dataset = os.path.join(DATA_DIR, DATASET)
+with open(dataset, 'rb') as f:
+    X, y = pickle.load(f)
+
+print('down sample dataset')
+np.random.seed(SEED)
+n, d = X.shape
+sub_idx = np.random.choice(n, int(SUBSAMPLE * n), replace=False)
+X, y = X[sub_idx, :], y[sub_idx]
+
+params = []
+lrs = [0.01, 0.1, 1., 10., 100.]
+
+for lr in lrs:
+    params.append(Parameters(name="full-sgd-{}".format(lr), num_epoch=NUM_EPOCH, lr_type='bottou', initial_lr=lr,
+                             regularizer=1 / n, estimate='mean'))
+    params.append(Parameters(name="top1-{}".format(lr), num_epoch=NUM_EPOCH, lr_type='bottou', initial_lr=lr,
+                             regularizer=1 / n, estimate='mean',
+                             take_k=1, take_top=True, with_memory=True))
+    params.append(Parameters(name="rand1-{}".format(lr), num_epoch=NUM_EPOCH, lr_type='bottou', initial_lr=lr,
+                             regularizer=1 / n, estimate='mean',
+                             take_k=1, take_top=False, with_memory=True))
+    params.append(Parameters(name="qsgd-8bit-{}".format(lr), num_epoch=NUM_EPOCH, lr_type='bottou', initial_lr=lr,
+                             regularizer=1 / n, estimate='mean',
+                             qsgd_s=2 ** 8))
+    params.append(Parameters(name="qsgd-4bit-{}".format(lr), num_epoch=NUM_EPOCH, lr_type='bottou', initial_lr=lr,
+                             regularizer=1 / n, estimate='mean',
+                             qsgd_s=2 ** 4))
+    params.append(Parameters(name="qsgd-2bit-{}".format(lr), num_epoch=NUM_EPOCH, lr_type='bottou', initial_lr=lr,
+                             regularizer=1 / n, estimate='mean',
+                             qsgd_s=2 ** 2))
+
+
+def run_logistic(param):
+    m = LogisticSGD(param)
+    res = m.fit(X, y)
+    print('{} - score: {}'.format(param, m.score(X, y)))
+    return res
+
+
+if not os.path.exists(RESULT_DIR):
+    os.makedirs(RESULT_DIR)
+pickle_it(params, 'params', RESULT_DIR)
+
+print('start experiment')
+with mp.Pool(len(params)) as pool:
+    results = pool.map(run_logistic, params)
+
+pickle_it(results, 'results', RESULT_DIR)
+print('results saved in "{}"'.format(RESULT_DIR))
+
+# process data
+
+res_and_infos = []
+names = []
+lrs = []
+for p, res in zip(params, results):
+    lr = p.initial_lr
+    name = str(p)[:-(len(str(lr)) + 1)]
+    names.append(name)
+    lrs.append(lr)
+    res_and_infos.append((name, lr, res[1][:-1]))
+
+names = sorted(list(set(names)))
+lrs = sorted(list(set(lrs)))
+
+# plot
+f, axarr = plt.subplots(1, len(names), figsize=(20, 4), sharey=True)
+
+for name, ax in zip(names, axarr):
+    ax.set_title(name)
+    ax.set_xlabel('epoch')
+    ax.set_ylim(0., 2.)
+
+for name, lr, loss in res_and_infos:
+    ax = axarr[names.index(name)]
+    idx = lrs.index(lr)
+    ax.plot(np.arange(len(loss)) / 10, loss, "C{}".format(idx), label=str(lr))
+
+
+axarr[0].set_ylabel('loss')
+axarr[0].legend();
+result_pdf = os.path.join(RESULT_DIR, 'figure.pdf')
+f.savefig(result_pdf)
+print('figure saved in {}'.format(result_pdf))
diff --git a/eps-quantized.py b/eps-quantized.py
@@ -0,0 +1,101 @@
+import argparse
+import multiprocessing as mp
+import os
+import pickle
+
+import matplotlib.pyplot as plt
+
+from logistic import LogisticSGD
+from parameters import Parameters
+from utils import pickle_it
+
+plt.switch_backend('agg')
+
+parser = argparse.ArgumentParser()
+parser.add_argument('data_dir', type=str)
+parser.add_argument('result_dir', type=str)
+args = parser.parse_args()
+
+DATA_DIR = args.data_dir
+RESULT_DIR = args.result_dir
+DATASET = 'epsilon.pickle'
+NUM_EPOCH = 10
+
+print('load dataset')
+dataset = os.path.join(DATA_DIR, DATASET)
+with open(dataset, 'rb') as f:
+    X, y = pickle.load(f)
+
+n, d = X.shape
+
+params = []
+
+params.append(Parameters(name="full-sgd", num_epoch=NUM_EPOCH, lr_type='bottou', initial_lr=10.,
+                         regularizer=1 / n, estimate='mean'))
+params.append(Parameters(name="top1", num_epoch=NUM_EPOCH, lr_type='bottou', initial_lr=10.,
+                         regularizer=1 / n, estimate='mean',
+                         take_k=1, take_top=True, with_memory=True))
+params.append(Parameters(name="rand1", num_epoch=NUM_EPOCH, lr_type='bottou', initial_lr=10.,
+                         regularizer=1 / n, estimate='mean',
+                         take_k=1, take_top=False, with_memory=True))
+params.append(Parameters(name="qsgd-8bit", num_epoch=NUM_EPOCH, lr_type='bottou', initial_lr=10.,
+                         regularizer=1 / n, estimate='mean',
+                         qsgd_s=2 ** 8))
+params.append(Parameters(name="qsgd-4bit", num_epoch=NUM_EPOCH, lr_type='bottou', initial_lr=1.,
+                         regularizer=1 / n, estimate='mean',
+                         qsgd_s=2 ** 4))
+params.append(Parameters(name="qsgd-2bit", num_epoch=NUM_EPOCH, lr_type='bottou', initial_lr=1.,
+                         regularizer=1 / n, estimate='mean',
+                         qsgd_s=2 ** 2))
+
+
+def run_logistic(param):
+    m = LogisticSGD(param)
+    res = m.fit(X, y)
+    print('{} - score: {}'.format(param, m.score(X, y)))
+    return res
+
+
+if not os.path.exists(RESULT_DIR):
+    os.makedirs(RESULT_DIR)
+pickle_it(params, 'params', RESULT_DIR)
+
+print('start experiment')
+with mp.Pool(len(params)) as pool:
+    results = pool.map(run_logistic, params)
+
+pickle_it(results, 'results', RESULT_DIR)
+print('results saved in "{}"'.format(RESULT_DIR))
+
+# process data
+
+# res_and_infos = []
+# names = []
+# lrs = []
+# for p, res in zip(params, results):
+#     lr = p.initial_lr
+#     name = str(p)[:-(len(str(lr)) + 1)]
+#     names.append(name)
+#     lrs.append(lr)
+#     res_and_infos.append((name, lr, res[1][:-1]))
+#
+# names = sorted(list(set(names)))
+# lrs = sorted(list(set(lrs)))
+#
+# # plot
+# f, axarr = plt.subplots(1, len(names), figsize=(20, 4), sharey=True)
+#
+# for name, lr, loss in res_and_infos:
+#     ax = axarr[names.index(name)]
+#     idx = lrs.index(lr)
+#     ax.plot(loss, "C{}".format(idx), label=str(lr))
+#
+# for name, ax in zip(names, axarr):
+#     ax.set_title(name)
+#     ax.set_ylim(top=2.)
+#
+# axarr[0].set_ylabel('loss')
+# axarr[0].legend();
+# result_pdf = os.path.join(RESULT_DIR, 'figure.pdf')
+# f.savefig(result_pdf)
+# print('figure saved in {}'.format(result_pdf))