forked from epfml/sparsifiedSGD
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Python code, README and plot notebook
- Loading branch information
Showing
29 changed files
with
1,633 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,39 @@ | ||
# sparsifiedSGD | ||
Code for Sparsified SGD. | ||
# Sparsified SGD with Memory | ||
|
||
Code for the experimental part of the paper [Sparsified SGD with Memory TODO link](). It contains the code for the following experiments: | |
|
||
- Theoretical convergence with different sparsification operators | |
- Comparison with QSGD | ||
- Multi-core experiments | ||
|
||
Use `notebooks/plots.ipynb` to visualize the results. | ||
|
||
Please open an issue if you have questions or problems. | ||
|
||
### Reproduce the results | ||
|
||
To reproduce the results, you can download the datasets from [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html): | |
|
||
```bash | ||
mkdir data | ||
cd data/ | ||
wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_test.binary.bz2 | ||
wget https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/epsilon_normalized.bz2 | ||
``` | ||
|
||
We decompress the libsvm file and use the pickle format instead. It takes more space but is faster to load. You can create such a file as follows: | |
|
||
```python | ||
import pickle | ||
from sklearn.datasets import load_svmlight_file | ||
X, y = load_svmlight_file('rcv1_test.binary.bz2') | ||
with open('rcv1_test.pickle', 'wb') as f: | ||
pickle.dump((X, y), f) | ||
``` | ||
|
||
After updating the path to the data files in `experiment.py`, you can then run our experiments, for example: | |
|
||
```bash | ||
python3 experiment.py rcv1-th results/rcv1-th --nproc 10 | ||
``` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import numpy as np | ||
from scipy.special import expit as sigmoid | ||
|
||
from parameters import Parameters | ||
|
||
|
||
class BaseLogistic:
    """Shared pieces of an L2-regularised logistic-regression model.

    The class holds the current weight vector ``w`` and an optional running
    estimate ``w_estimate``.  Whenever ``w_estimate`` is set it is the vector
    used for evaluation (``loss``, ``predict``, ``predict_proba``, ``score``);
    otherwise ``w`` is used.  Neither vector is allocated here.
    """

    def __init__(self, params: "Parameters"):
        self.params = params
        self.w_estimate = None
        self.w = None

    def _active_weights(self):
        """Return the weights used for evaluation (estimate if set, else ``w``)."""
        return self.w_estimate if self.w_estimate is not None else self.w

    def lr(self, epoch, iteration, num_samples, d):
        """Return the learning rate for the given position in training.

        Args:
            epoch: zero-based epoch index.
            iteration: iteration index inside the epoch.
            num_samples: number of iterations per epoch, used to build the
                global step ``t = epoch * num_samples + iteration``.
            d: accepted for interface compatibility; not used here.

        Raises:
            ValueError: if ``params.lr_type`` is not one of ``'constant'``,
                ``'epoch-decay'`` or ``'decay'`` (previously this fell through
                and returned ``None``).
        """
        p = self.params
        t = epoch * num_samples + iteration
        if p.lr_type == 'constant':
            return p.initial_lr
        if p.lr_type == 'epoch-decay':
            return p.initial_lr * (p.epoch_decay_lr ** epoch)
        if p.lr_type == 'decay':
            return p.initial_lr / (p.regularizer * (t + p.tau))
        raise ValueError("unsupported lr_type: {!r}".format(p.lr_type))

    def loss(self, X, y):
        """Return the mean logistic loss on ``(X, y)`` plus the L2 penalty.

        ``y`` is multiplied with the logits, i.e. it is used as a signed label.
        """
        # Work on a private copy of the weights.
        # NOTE(review): presumably a snapshot because `w` may be updated while
        # the loss is being computed — confirm against the subclasses.
        w = self._active_weights().copy()
        p = self.params
        margins = y * (X @ w)
        # log(1 + exp(-m)) computed as logaddexp(0, -m): identical value, but
        # it does not overflow to inf for large negative margins.
        loss = np.sum(np.logaddexp(0, -margins)) / X.shape[0]
        if p.regularizer:
            loss += p.regularizer * np.square(w).sum()
        return loss

    def predict(self, X):
        """Return 0/1 predictions (1 where the logit is non-negative)."""
        logits = X @ self._active_weights()
        pred = 1 * (logits >= 0.)
        return pred

    def predict_proba(self, X):
        """Return the sigmoid of the logits."""
        logits = X @ self._active_weights()
        return sigmoid(logits)

    def score(self, X, y):
        """Return the accuracy against labels encoded as -1 / +1."""
        logits = X @ self._active_weights()
        pred = 2 * (logits >= 0.) - 1
        acc = np.mean(pred == y)
        return acc

    def update_estimate(self, t):
        """Fold the current ``w`` into ``w_estimate`` for step ``t``.

        ``params.estimate`` selects the scheme: ``'final'`` tracks ``w``
        itself, while ``'mean'``, ``'t+tau'`` and ``'(t+tau)^2'`` keep a
        running average ``(1 - rho) * w_estimate + rho * w`` with a
        scheme-specific ``rho``.  Any other value leaves the estimate
        untouched.
        """
        p = self.params
        if p.estimate == 'final':
            # Deliberately the same object as `w`, not a copy.
            self.w_estimate = self.w
            return

        if p.estimate == 'mean':
            rho = 1 / (t + 1)
        elif p.estimate == 't+tau':
            rho = 2 * (t + p.tau) / ((1 + t) * (t + 2 * p.tau))
        elif p.estimate == '(t+tau)^2':
            rho = 6 * ((t + p.tau) ** 2) / ((1 + t) * (6 * (p.tau ** 2) + t + 6 * p.tau * t + 2 * (t ** 2)))
        else:
            return

        # Without a previous estimate, start the average from the current
        # weights instead of failing on `None * float`.
        previous = self.w if self.w_estimate is None else self.w_estimate
        self.w_estimate = previous * (1 - rho) + self.w * rho

    def __str__(self):
        return "{}({})".format(self.__class__.__name__, self.params)

    def __repr__(self):
        return str(self)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import argparse | ||
import os | ||
import pickle | ||
|
||
import numpy as np | ||
from sklearn.linear_model import SGDClassifier | ||
|
||
from utils import pickle_it | ||
|
||
# Arguments: a single positional output directory, created when missing.
parser = argparse.ArgumentParser()
parser.add_argument('directory', type=str)
args = parser.parse_args()

if not os.path.exists(args.directory):
    print('create {}'.format(args.directory))
    os.makedirs(args.directory)

# Maps a dataset name to the loss reached by the reference classifier.
baselines = dict()
|
||
|
||
def loss(clf, X, y, reg):
    """Return the L2-regularised mean logistic loss of a fitted linear model.

    Only ``clf.coef_`` is used; an intercept, if the classifier has one, is
    ignored.

    Args:
        clf: object exposing a ``coef_`` array; it is transposed and
            multiplied with ``X``.
        X: feature matrix; ``X.shape[0]`` is the divisor of the mean.
        y: labels multiplied element-wise with the logits (signed labels).
        reg: coefficient applied to the sum of squared coefficients.
    """
    margins = y * (X @ clf.coef_.transpose()).squeeze()
    # logaddexp(0, -m) == log(1 + exp(-m)) but stays finite when exp(-m)
    # would overflow.
    baseline_loss = np.sum(np.logaddexp(0, -margins)) / X.shape[0]
    baseline_loss += reg * np.sum(np.square(clf.coef_))
    return baseline_loss
|
||
|
||
# NOTE: both datasets are unpickled from fixed local paths; only load pickle
# files you trust.
# NOTE(review): recent scikit-learn releases spell the logistic loss
# 'log_loss' instead of 'log' — adjust if this fails on your version.

# RCV1 test
print('RCV1-test')
with open(os.path.expanduser('/mlodata1/jb/data/rcv1-test-1.pickle'), 'rb') as f:
    X, y = pickle.load(f)

reg = 1 / X.shape[0]
clf = SGDClassifier(tol=1e-4, loss='log', penalty='l2', alpha=reg, fit_intercept=False)
clf.fit(X, y)
baseline_loss = loss(clf, X, y, reg)
print("loss: {}".format(baseline_loss))
print("train accuracy: {}".format(clf.score(X, y)))
baselines['RCV1-test'] = baseline_loss

# EPSILON
print('epsilon')
with open(os.path.expanduser('/mlodata1/jb/data/epsilon_normalized_1.pickle'), 'rb') as f:
    X, y = pickle.load(f)

reg = 1 / X.shape[0]
# fit_intercept=False, as for RCV1: `loss` evaluates `coef_` only, so a fitted
# intercept would make the reported loss belong to a different model than the
# one that was trained.
clf = SGDClassifier(tol=1e-4, loss='log', penalty='l2', alpha=reg, fit_intercept=False)
clf.fit(X, y)
baseline_loss = loss(clf, X, y, reg)
print("loss: {}".format(baseline_loss))
print("train accuracy: {}".format(clf.score(X, y)))
baselines['epsilon'] = baseline_loss

# Pickle
print('baselines', baselines)
pickle_it(baselines, 'baselines', args.directory)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# NOTE(review): presumably the standard deviation used when randomly
# initialising model weights — confirm against the modules that import it.
INIT_WEIGHT_STD = 0.01
# NOTE(review): presumably how many times per epoch the loss is recorded —
# confirm against the modules that import it.
LOSS_PER_EPOCH = 100
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,197 @@ | ||
import argparse | ||
import multiprocessing as mp | ||
import os | ||
import pickle | ||
|
||
import numpy as np | ||
|
||
from logistic import LogisticSGD | ||
from logistic_parallel import LogisticParallelSGD | ||
from parameters import Parameters | ||
from utils import pickle_it | ||
|
||
# Module-level dataset handles: `run_experiment` assigns them (through a
# `global` statement) after loading the pickle, and `run_logistic` reads them.
X, y = None, None
|
||
|
||
def run_logistic(param):
    """Fit one `LogisticSGD` configured by `param` on the module-level dataset.

    Prints the score on that same dataset and returns whatever `fit` returned.
    """
    model = LogisticSGD(param)
    fit_result = model.fit(X, y)
    accuracy = model.score(X, y)
    print('{} - score: {}'.format(param, accuracy))
    return fit_result
|
||
|
||
def _init_worker(data_X, data_y):
    """Pool initializer: publish the dataset as module globals in a worker."""
    global X, y
    X, y = data_X, data_y


def run_experiment(directory, dataset_pickle, params, nproc=None):
    """Run `run_logistic` for every entry of `params` in a process pool.

    Args:
        directory: output directory; created if missing. Receives the
            pickled `params` and the pickled list of results.
        dataset_pickle: path of a pickle file holding an ``(X, y)`` pair.
        params: iterable of parameter objects, one per run.
        nproc: number of worker processes handed to `multiprocessing.Pool`.
    """
    global X, y
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)
    pickle_it(params, 'params', directory)

    print('load dataset')
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open(dataset_pickle, 'rb') as f:
        X, y = pickle.load(f)

    print('start experiment')
    # Hand the dataset to every worker explicitly. Relying on the module
    # globals alone only works when workers are forked from this process;
    # with the "spawn" start method they re-import the module and would see
    # X = y = None.
    with mp.Pool(nproc, initializer=_init_worker, initargs=(X, y)) as pool:
        results = pool.map(run_logistic, params)

    pickle_it(results, 'results', directory)
    print('results saved in "{}"'.format(directory))
|
||
|
||
def run_parallel_experiment(directory, dataset_pickle, models, cores, baseline, repeat=3):
    """Time `LogisticParallelSGD.fit_until` for every model / core-count pair.

    Args:
        directory: output directory; created if missing.
        dataset_pickle: path of a pickle file holding an ``(X, y)`` pair.
        models: callables mapping a core count to a parameter object.
        cores: core counts to evaluate.
        baseline: value forwarded to `fit_until` as ``baseline``.
        repeat: number of repetitions of the whole grid.

    Writes the pickles 'models', 'cores', 'chronos' and 'stop_times' into
    `directory`.
    """
    if not os.path.exists(directory):
        os.makedirs(directory)
    # The factories are called with one core and the resulting objects are
    # stored, rather than the callables themselves.
    pickle_it([m(1) for m in models], 'models', directory)
    pickle_it(cores, 'cores', directory)

    print('load dataset')
    # X and y are locals here: unlike run_experiment there is no `global`.
    with open(dataset_pickle, 'rb') as f:
        X, y = pickle.load(f)

    print('start experiment')

    # Indexed as [model, core setting, repetition].
    chronos = np.zeros((len(models), len(cores), repeat))
    stop_times = np.zeros((len(models), len(cores), repeat), dtype=int)

    for r in range(repeat):
        for c_idx, core in enumerate(cores):
            for m_idx, model in enumerate(models):
                p = model(core)
                print("{} - cores {} - repeat {}".format(p, core, r))
                m = LogisticParallelSGD(p)
                # `losses` is returned by fit_until but not used below.
                timing, epoch, iteration, losses = m.fit_until(X, y, num_features=X.shape[1], num_samples=X.shape[0],
                                                               baseline=baseline)
                chronos[m_idx, c_idx, r] = timing
                # Stopping point expressed as a global iteration count.
                stop_times[m_idx, c_idx, r] = epoch * X.shape[0] + iteration

                # NOTE(review): intermediate save of the partial results. The
                # nesting level is reconstructed (the source lost its
                # indentation) — confirm it belongs inside the innermost loop.
                pickle_it(chronos, 'chronos', directory)
                pickle_it(stop_times, 'stop_times', directory)

    pickle_it(chronos, 'chronos', directory)
    pickle_it(stop_times, 'stop_times', directory)
    print('results saved in "{}"'.format(directory))
|
||
|
||
if __name__ == "__main__":
    # Command line: <experiment> <output directory> [--nproc N].
    # The experiment name is validated by argparse itself; an `assert` would
    # be stripped under `python -O` and let an unknown name fall through to a
    # NameError further down.
    experiments = ['epsilon-th', 'epsilon-quant', 'epsilon-parallel',
                   'rcv1-th', 'rcv1-quant', 'rcv1-parallel']

    parser = argparse.ArgumentParser()
    parser.add_argument('experiment', type=str, choices=experiments)
    parser.add_argument('directory', type=str)
    parser.add_argument('--nproc', type=int, default=1)
    args = parser.parse_args()

    # dataset: pickle path plus its number of samples (n) and features (d),
    # which the parameter grids below use for the regulariser and for tau.
    if args.experiment.startswith('epsilon'):
        dataset = os.path.expanduser('/mlodata1/jb/data/epsilon_normalized_1.pickle')
        n, d = 400000, 2000
    elif args.experiment.startswith('rcv1'):
        dataset = os.path.expanduser('/mlodata1/jb/data/rcv1-test-1.pickle')
        n, d = 677399, 47236

    # parameters to evaluate
    if args.experiment == 'epsilon-th':
        params = [
            Parameters(name="full-sgd", num_epoch=3, lr_type='decay', initial_lr=2, tau=1,
                       regularizer=1 / n, estimate='(t+tau)^2'),
            Parameters(name="top1", num_epoch=3, lr_type='decay', initial_lr=2, tau=d,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True, take_top=True),
            Parameters(name="top1-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=1,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True, take_top=True),
            Parameters(name="rand1", num_epoch=3, lr_type='decay', initial_lr=2, tau=d,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True),
            Parameters(name="rand1-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=1,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True),
            Parameters(name="rand2", num_epoch=3, lr_type='decay', initial_lr=2, tau=d / 2,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=2, with_memory=True),
            Parameters(name="rand2-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=1,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=2, with_memory=True),
            Parameters(name="rand3", num_epoch=3, lr_type='decay', initial_lr=2, tau=d / 3,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=3, with_memory=True),
            Parameters(name="rand3-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=1,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=3, with_memory=True),
        ]
    elif args.experiment == 'epsilon-quant':
        params = [
            Parameters(name="qsgd-8bits", num_epoch=3, lr_type='decay', initial_lr=2, tau=d,
                       regularizer=1 / n, estimate='(t+tau)^2', qsgd_s=2 ** 8),
            Parameters(name="qsgd-4bits", num_epoch=3, lr_type='decay', initial_lr=2, tau=d,
                       regularizer=1 / n, estimate='(t+tau)^2', qsgd_s=2 ** 4),
            Parameters(name="top1", num_epoch=3, lr_type='decay', initial_lr=2, tau=d,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True, take_top=True),
            Parameters(name="rand1", num_epoch=3, lr_type='decay', initial_lr=2, tau=d,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True),
        ]
    elif args.experiment == 'epsilon-parallel':
        # Factories: the core count is only known inside the timing loop.
        models = [
            lambda n_cores: Parameters(name="rand1", num_epoch=5, lr_type='constant', initial_lr=.05, n_cores=n_cores,
                                       regularizer=1 / n, take_k=1, with_memory=True, estimate='final'),
            lambda n_cores: Parameters(name="top1", num_epoch=5, lr_type='constant', initial_lr=.05, n_cores=n_cores,
                                       regularizer=1 / n, take_k=1, take_top=True, with_memory=True, estimate='final'),
            lambda n_cores: Parameters(name="hogwild", num_epoch=5, lr_type='constant', initial_lr=.05, n_cores=n_cores,
                                       regularizer=1 / n, estimate='final'),
        ]
        cores = [1, 2, 3, 5, 8, 10, 12, 14, 16, 18, 20, 22, 24]
        baseline = 0.305

    elif args.experiment == 'rcv1-th':
        params = [
            Parameters(name="full-sgd", num_epoch=3, lr_type='decay', initial_lr=2, tau=10,
                       regularizer=1 / n, estimate='(t+tau)^2'),
            Parameters(name="top10", num_epoch=3, lr_type='decay', initial_lr=2, tau=10 * d / 10,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=10, with_memory=True, take_top=True),
            Parameters(name="top10-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=10,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=10, with_memory=True, take_top=True),
            Parameters(name="rand10", num_epoch=3, lr_type='decay', initial_lr=2, tau=10 * d / 10,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=10, with_memory=True),
            Parameters(name="rand10-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=10,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=10, with_memory=True),
            Parameters(name="rand20", num_epoch=3, lr_type='decay', initial_lr=2, tau=10 * d / 20,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=20, with_memory=True),
            Parameters(name="rand20-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=10,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=20, with_memory=True),
            Parameters(name="rand30", num_epoch=3, lr_type='decay', initial_lr=2, tau=10 * d / 30,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=30, with_memory=True),
            Parameters(name="rand30-no-shift", num_epoch=3, lr_type='decay', initial_lr=2, tau=10,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=30, with_memory=True),
        ]
    elif args.experiment == 'rcv1-quant':
        params = [
            Parameters(name="qsgd-8bits", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
                       regularizer=1 / n, estimate='(t+tau)^2', qsgd_s=2 ** 8),
            Parameters(name="qsgd-4bits", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
                       regularizer=1 / n, estimate='(t+tau)^2', qsgd_s=2 ** 4),
            Parameters(name="qsgd-2bits", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
                       regularizer=1 / n, estimate='(t+tau)^2', qsgd_s=2 ** 2),
            Parameters(name="top1", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True, take_top=True),
            Parameters(name="rand1", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=1, with_memory=True),
            Parameters(name="top10", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=10, with_memory=True, take_top=True),
            Parameters(name="rand10", num_epoch=2, lr_type='decay', initial_lr=2, tau=10 * d,
                       regularizer=1 / n, estimate='(t+tau)^2', take_k=10, with_memory=True),
        ]
    elif args.experiment == 'rcv1-parallel':
        # Factories: the core count is only known inside the timing loop.
        models = [
            lambda n_cores: Parameters(name="top100", num_epoch=6, lr_type='decay', initial_lr=2., n_cores=n_cores,
                                       tau=10 / 100 * d,
                                       regularizer=1 / n, estimate='final', take_k=100, take_top=True,
                                       with_memory=True),
            lambda n_cores: Parameters(name="rand100", num_epoch=6, lr_type='decay', initial_lr=2., n_cores=n_cores,
                                       tau=10 / 100 * d,
                                       regularizer=1 / n, estimate='final', take_k=100, take_top=False,
                                       with_memory=True),
            lambda n_cores: Parameters(name="hogwild", num_epoch=6, lr_type='decay', initial_lr=2., n_cores=n_cores,
                                       tau=10, regularizer=1 / n,
                                       estimate='final'),
        ]

        cores = [1, 2, 3, 5, 8, 10, 12, 14, 16, 18, 20, 22, 24]
        baseline = 0.101

    # The '*-parallel' experiments time multi-core runs; all others sweep a
    # list of parameter sets in a process pool of size --nproc.
    if 'parallel' in args.experiment:
        run_parallel_experiment(args.directory, dataset, models, cores, baseline, repeat=3)
    else:
        run_experiment(args.directory, dataset, params, nproc=args.nproc)
Oops, something went wrong.