Skip to content

Commit

Permalink
Working on
Browse files Browse the repository at this point in the history
  • Loading branch information
jbcdnr committed Sep 13, 2018
1 parent 3598c85 commit 2e93e9c
Show file tree
Hide file tree
Showing 16 changed files with 5,611 additions and 28 deletions.
23 changes: 19 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,30 @@ We decompress the libsvm file and use pickle format instead. It takes more space

```python
import pickle
import os
from sklearn.datasets import load_svmlight_file
X, y = load_svmlight_file('rcv1_test.binary.bz2')
with open('rcv1_test.pickle', 'wb') as f:

# All inputs and outputs live in ./data: the scripts are invoked with that
# directory and look for 'rcv1.pickle' / 'epsilon.pickle' inside it.
os.makedirs('data', exist_ok=True)

# RCV1: decompress the libsvm file and store it as a (X, y) pickle.
X, y = load_svmlight_file('data/rcv1_test.binary.bz2')
with open(os.path.join('data', 'rcv1.pickle'), 'wb') as f:
    pickle.dump((X, y), f)

# epsilon: same conversion.
X, y = load_svmlight_file('data/epsilon_normalized.bz2')
with open(os.path.join('data', 'epsilon.pickle'), 'wb') as f:
    pickle.dump((X, y), f)
```

After updating the path to the data files in `experiment.py`, you can run the baselines:

```bash
python3 baselines.py ./data results/baselines
```

After updating the path to the data files in `experiment.py` , you can then run our experiments, for example
Then run our experiments, for example:

```bash
python3 experiment.py rcv1-th results/rcv1-th --nproc 10
python3 experiment.py rcv1-th ./data results/rcv1-th --nproc 10
```

2 changes: 2 additions & 0 deletions base_logistic.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ def lr(self, epoch, iteration, num_samples, d):
return p.initial_lr * (p.epoch_decay_lr ** epoch)
if p.lr_type == 'decay':
return p.initial_lr / (p.regularizer * (t + p.tau))
if p.lr_type == 'bottou':
return p.initial_lr / (1 + p.initial_lr * p.regularizer * t)

def loss(self, X, y):
w = self.w_estimate if self.w_estimate is not None else self.w
Expand Down
18 changes: 11 additions & 7 deletions baselines.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,16 @@
"""Arguments"""

parser = argparse.ArgumentParser()
parser.add_argument('directory', type=str)
parser.add_argument('data', type=str)
parser.add_argument('output', type=str)

EPSILON_NAME = "epsilon.pickle"
RCV1_NAME = "rcv1.pickle"

args = parser.parse_args()
if not os.path.exists(args.directory):
print('create {}'.format(args.directory))
os.makedirs(args.directory)
if not os.path.exists(args.output):
print('create {}'.format(args.output))
os.makedirs(args.output)

baselines = {}

Expand All @@ -28,7 +32,7 @@ def loss(clf, X, y, reg):

""" RCV1 test"""
print('RCV1-test')
with open(os.path.expanduser('/mlodata1/jb/data/rcv1-test-1.pickle'), 'rb') as f:
with open(os.path.join(args.data, RCV1_NAME), 'rb') as f:
X, y = pickle.load(f)

reg = 1 / X.shape[0]
Expand All @@ -42,7 +46,7 @@ def loss(clf, X, y, reg):
""" EPSILON """

print('epsilon')
with open(os.path.expanduser('/mlodata1/jb/data/epsilon_normalized_1.pickle'), 'rb') as f:
with open(os.path.join(args.data, EPSILON_NAME), 'rb') as f:
X, y = pickle.load(f)

reg = 1 / X.shape[0]
Expand All @@ -55,4 +59,4 @@ def loss(clf, X, y, reg):

""" Pickle """
print('baselines', baselines)
pickle_it(baselines, 'baselines', args.directory)
pickle_it(baselines, 'baselines', args.output)
30 changes: 30 additions & 0 deletions eps-quant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# For normal SGD bottou final
# best is lr = 1.0
from parameters import Parameters
from logistic import LogisticSGD
from experiment import run_experiment

# Settings shared by every run in this comparison.
n = 400000
num_epoch = 5
lr = 1.

# (run name, method-specific keyword arguments). The "<k>bit" runs pass
# qsgd_s = 2 ** k; top1 / rand1 differ only in take_top.
variants = [
    ("full-sgd", {}),
    ("qsgd-8bit", dict(qsgd_s=2 ** 8)),
    ("qsgd-4bit", dict(qsgd_s=2 ** 4)),
    ("qsgd-2bit", dict(qsgd_s=2 ** 2)),
    ("top1", dict(take_k=1, take_top=True, with_memory=True)),
    ("rand1", dict(take_k=1, take_top=False, with_memory=True)),
]

# One Parameters object per variant, all with the 'bottou' learning-rate
# schedule, regularizer 1 / n and the 'final' estimate.
params = [
    Parameters(name=run_name, num_epoch=num_epoch, lr_type='bottou', initial_lr=lr,
               regularizer=1 / n, estimate='final', **extra)
    for run_name, extra in variants
]

# NOTE(review): the dataset path is a hard-coded absolute path, unlike the
# other scripts in this commit which take a data directory on the command line.
run_experiment('eps-quantized', '/mlodata1/jb/data/epsilon_normalized_1.pickle', params, nproc=12)
117 changes: 117 additions & 0 deletions eps-quantized-search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import argparse
import multiprocessing as mp
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np

from logistic import LogisticSGD
from parameters import Parameters
from utils import pickle_it, unpickle_dir

# Select the 'agg' backend before any figure is created; the script only
# writes the figure to a file (see the savefig call at the end).
plt.switch_backend('agg')
from matplotlib import rc
# Sans-serif (Helvetica) fonts for all figure text.
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
## for Palatino and other serif fonts use:
#rc('font',**{'family':'serif','serif':['Palatino']})
# Render text through LaTeX (matplotlib's usetex mode needs a working LaTeX
# installation on the machine running the script).
rc('text', usetex=True)

# Command line: two positional directories, the data location and the
# output location.
parser = argparse.ArgumentParser()
for positional in ('data_dir', 'result_dir'):
    parser.add_argument(positional, type=str)
args = parser.parse_args()

# Experiment configuration.
DATA_DIR, RESULT_DIR = args.data_dir, args.result_dir
DATASET = 'epsilon.pickle'  # file name looked up inside DATA_DIR
SUBSAMPLE = 0.01            # fraction of the rows kept for the search
SEED = 2018                 # seed for the row subsampling
NUM_EPOCH = 10

# Load the pickled (X, y) pair.
# NOTE: pickle.load can execute arbitrary code; only point DATA_DIR at
# files you produced yourself.
print('load dataset')
dataset = os.path.join(DATA_DIR, DATASET)
with open(dataset, 'rb') as dataset_file:
    X, y = pickle.load(dataset_file)

# Keep a random SUBSAMPLE fraction of the rows (drawn without replacement,
# reproducible through SEED).
print('down sample dataset')
np.random.seed(SEED)
n, d = X.shape
num_kept = int(SUBSAMPLE * n)
sub_idx = np.random.choice(n, num_kept, replace=False)
# NOTE(review): n keeps the size of the *full* dataset after this point, so
# later uses such as regularizer=1 / n refer to the original row count, not
# the subsample -- confirm this is intended.
X = X[sub_idx, :]
y = y[sub_idx]

# Learning rates to try for every method.
lrs = [0.01, 0.1, 1., 10., 100.]

# (name prefix, method-specific keyword arguments). The "<k>bit" runs pass
# qsgd_s = 2 ** k; top1 / rand1 differ only in take_top.
variants = [
    ("full-sgd", {}),
    ("top1", dict(take_k=1, take_top=True, with_memory=True)),
    ("rand1", dict(take_k=1, take_top=False, with_memory=True)),
    ("qsgd-8bit", dict(qsgd_s=2 ** 8)),
    ("qsgd-4bit", dict(qsgd_s=2 ** 4)),
    ("qsgd-2bit", dict(qsgd_s=2 ** 2)),
]

# Full grid, learning rate in the outer loop and method in the inner loop.
# Each run is named "<prefix>-<lr>".
params = [
    Parameters(name="{}-{}".format(prefix, lr), num_epoch=NUM_EPOCH, lr_type='bottou',
               initial_lr=lr, regularizer=1 / n, estimate='mean', **extra)
    for lr in lrs
    for prefix, extra in variants
]


def run_logistic(param):
    """Train one LogisticSGD model on the module-level (X, y) data.

    Prints the configuration together with the model's score on that same
    data, and returns whatever ``fit`` returned.
    """
    model = LogisticSGD(param)
    fit_output = model.fit(X, y)
    score = model.score(X, y)
    print('{} - score: {}'.format(param, score))
    return fit_output


# Create the output directory if it is missing. exist_ok=True replaces the
# separate "exists?" check, which could race with another process creating
# the same directory between the check and the makedirs call.
os.makedirs(RESULT_DIR, exist_ok=True)
# Save the configurations first so they are on disk before the runs start.
pickle_it(params, 'params', RESULT_DIR)

print('start experiment')
# One worker process per configuration; pool.map returns the results in the
# same order as params, which the post-processing below relies on.
with mp.Pool(len(params)) as pool:
    results = pool.map(run_logistic, params)

pickle_it(results, 'results', RESULT_DIR)
print('results saved in "{}"'.format(RESULT_DIR))

# Post-process: group the runs by method name and learning rate.

res_and_infos = []
for param, result in zip(params, results):
    step = param.initial_lr
    # Drop the trailing "-<lr>" from the string form of the parameters.
    # Presumably str(param) is the run name given at construction
    # ("<method>-<lr>") -- verify against Parameters.__str__.
    suffix_len = len(str(step)) + 1
    method = str(param)[:-suffix_len]
    # Second element of the fit output without its last entry; it is plotted
    # as the loss curve below.
    res_and_infos.append((method, step, result[1][:-1]))

# Sorted unique method names and learning rates; their positions select the
# subplot and the line colour.
names = sorted({method for method, _, _ in res_and_infos})
lrs = sorted({step for _, step, _ in res_and_infos})

# Plot: one subplot per method, one curve per learning rate.
f, axarr = plt.subplots(1, len(names), figsize=(20, 4), sharey=True)

axis_for = dict(zip(names, axarr))
for title, axis in axis_for.items():
    axis.set_title(title)
    axis.set_xlabel('epoch')
    axis.set_ylim(0., 2.)

# The same learning rate gets the same colour-cycle entry in every subplot.
color_for = {step: "C{}".format(position) for position, step in enumerate(lrs)}
for method, step, loss in res_and_infos:
    # The x axis is the sample index divided by 10 -- presumably ten loss
    # values are recorded per epoch; confirm against LogisticSGD.fit.
    epochs = np.arange(len(loss)) / 10
    axis_for[method].plot(epochs, loss, color_for[step], label=str(step))

first_axis = axarr[0]
first_axis.set_ylabel('loss')
first_axis.legend()

result_pdf = os.path.join(RESULT_DIR, 'figure.pdf')
f.savefig(result_pdf)
print('figure saved in {}'.format(result_pdf))
101 changes: 101 additions & 0 deletions eps-quantized.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import argparse
import multiprocessing as mp
import os
import pickle

import matplotlib.pyplot as plt

from logistic import LogisticSGD
from parameters import Parameters
from utils import pickle_it

# Select the 'agg' backend up front.
plt.switch_backend('agg')

# Command line: two positional directories, the data location and the
# output location.
parser = argparse.ArgumentParser()
for positional in ('data_dir', 'result_dir'):
    parser.add_argument(positional, type=str)
args = parser.parse_args()

# Experiment configuration.
DATA_DIR, RESULT_DIR = args.data_dir, args.result_dir
DATASET = 'epsilon.pickle'  # file name looked up inside DATA_DIR
NUM_EPOCH = 10

# Load the pickled (X, y) pair.
# NOTE: pickle.load can execute arbitrary code; only point DATA_DIR at
# files you produced yourself.
print('load dataset')
dataset = os.path.join(DATA_DIR, DATASET)
with open(dataset, 'rb') as dataset_file:
    X, y = pickle.load(dataset_file)

# n (number of rows) sets the regularizer of every run below.
n, d = X.shape

# (run name, initial learning rate, method-specific keyword arguments).
# The 4-bit and 2-bit runs use initial_lr=1., all others use 10.
variants = [
    ("full-sgd", 10., {}),
    ("top1", 10., dict(take_k=1, take_top=True, with_memory=True)),
    ("rand1", 10., dict(take_k=1, take_top=False, with_memory=True)),
    ("qsgd-8bit", 10., dict(qsgd_s=2 ** 8)),
    ("qsgd-4bit", 1., dict(qsgd_s=2 ** 4)),
    ("qsgd-2bit", 1., dict(qsgd_s=2 ** 2)),
]

# One Parameters object per variant, all with the 'bottou' learning-rate
# schedule, regularizer 1 / n and the 'mean' estimate.
params = [
    Parameters(name=run_name, num_epoch=NUM_EPOCH, lr_type='bottou', initial_lr=initial_lr,
               regularizer=1 / n, estimate='mean', **extra)
    for run_name, initial_lr, extra in variants
]


def run_logistic(param):
    """Train one LogisticSGD model on the module-level (X, y) data.

    Prints the configuration together with the model's score on that same
    data, and returns whatever ``fit`` returned.
    """
    model = LogisticSGD(param)
    fit_output = model.fit(X, y)
    score = model.score(X, y)
    print('{} - score: {}'.format(param, score))
    return fit_output


# Create the output directory if it is missing. exist_ok=True replaces the
# separate "exists?" check, which could race with another process creating
# the same directory between the check and the makedirs call.
os.makedirs(RESULT_DIR, exist_ok=True)
# Save the configurations first so they are on disk before the runs start.
pickle_it(params, 'params', RESULT_DIR)

print('start experiment')
# One worker process per configuration; pool.map returns the results in the
# same order as params.
with mp.Pool(len(params)) as pool:
    results = pool.map(run_logistic, params)

pickle_it(results, 'results', RESULT_DIR)
print('results saved in "{}"'.format(RESULT_DIR))

# process data

# res_and_infos = []
# names = []
# lrs = []
# for p, res in zip(params, results):
# lr = p.initial_lr
# name = str(p)[:-(len(str(lr)) + 1)]
# names.append(name)
# lrs.append(lr)
# res_and_infos.append((name, lr, res[1][:-1]))
#
# names = sorted(list(set(names)))
# lrs = sorted(list(set(lrs)))
#
# # plot
# f, axarr = plt.subplots(1, len(names), figsize=(20, 4), sharey=True)
#
# for name, lr, loss in res_and_infos:
# ax = axarr[names.index(name)]
# idx = lrs.index(lr)
# ax.plot(loss, "C{}".format(idx), label=str(lr))
#
# for name, ax in zip(names, axarr):
# ax.set_title(name)
# ax.set_ylim(top=2.)
#
# axarr[0].set_ylabel('loss')
# axarr[0].legend();
# result_pdf = os.path.join(RESULT_DIR, 'figure.pdf')
# f.savefig(result_pdf)
# print('figure saved in {}'.format(result_pdf))
Loading

0 comments on commit 2e93e9c

Please sign in to comment.