Skip to content

Commit

Permalink
fixing merge conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
dsullivan7 committed Sep 10, 2014
2 parents 149d97f + 5fc0e8f commit 8081745
Show file tree
Hide file tree
Showing 199 changed files with 8,480 additions and 12,622 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ pip-log.txt
scikit_learn.egg-info/
.coverage
coverage
*.py,cover
tags
covtype.data.gz
20news-18828/
Expand Down
21 changes: 13 additions & 8 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ environment:
# /E:ON and /V:ON options are not enabled in the batch script interpreter
# See: http://stackoverflow.com/a/13751649/163740
CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\continuous_integration\\appveyor\\run_with_env.cmd"
WHEELHOUSE_UPLOADER_USERNAME: sklearn-appveyor
WHEELHOUSE_UPLOADER_SECRET:
secure: BQm8KfEj6v2Y+dQxb2syQvTFxDnHXvaNktkLcYSq7jfbTOO6eH9n09tfQzFUVcWZ

matrix:
- PYTHON: "C:\\Python27_32"
Expand All @@ -25,11 +28,6 @@ environment:
PYTHON_VERSION: "3.4.1"
PYTHON_ARCH: "64"

branches:
only:
- master
- 0.15.X

install:
# Install Python (from the official .msi of http://python.org) and pip when
# not already installed.
Expand All @@ -54,15 +52,22 @@ build: false
test_script:
# Change to a non-source folder to make sure we run the tests on the
# installed library.
- "cd C:\\"
- "mkdir empty_folder"
- "cd empty_folder"

# Skip joblib tests that require multiprocessing as they are prone to random
# slowdowns
- "python -c \"import nose; nose.main()\" -s sklearn"

# Move back to the project folder
- "cd .."

artifacts:
# Archive the generated wheel package in the ci.appveyor.com build report.
- path: dist\*

#on_success:
# - TODO: upload the content of dist/*.whl to a public wheelhouse
on_success:
# Upload the generated wheel package to Rackspace
# On Windows, Apache Libcloud cannot find a standard CA cert bundle so we
# disable the ssl checks.
- "python -m wheelhouse_uploader upload --no-ssl-check --local-folder=dist sklearn-windows-wheels"
282 changes: 101 additions & 181 deletions benchmarks/bench_covertype.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,13 @@
"""
from __future__ import division, print_function

print(__doc__)

# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Arnaud Joly <arnaud.v.joly@gmail.com>
# License: BSD 3 clause

import logging
import os
import sys
from time import time
from optparse import OptionParser

import argparse
import numpy as np

from sklearn.datasets import fetch_covtype, get_data_home
Expand All @@ -62,73 +58,35 @@
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import zero_one_loss
from sklearn.externals.joblib import Memory

logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(levelname)s %(message)s')
logger = logging.getLogger(__name__)

op = OptionParser()
op.add_option("--classifiers",
dest="classifiers", default='liblinear,GaussianNB,SGD,CART',
help="comma-separated list of classifiers to benchmark. "
"default: %default. available: "
"liblinear, GaussianNB, SGD, CART, ExtraTrees,\n"
"RandomForest, GBRT")

op.add_option("--n-jobs",
dest="n_jobs", default=1, type=int,
help="Number of concurrently running workers for models that"
" support parallelism.")

# Each random number generator uses the same seed to avoid coupling issues
# between estimators.
op.add_option("--random-seed",
dest="random_seed", default=13, type=int,
help="Common seed used by random number generator.")

op.print_help()

(opts, args) = op.parse_args()
if len(args) > 0:
op.error("this script takes no arguments.")
sys.exit(1)
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
joblib_cache_folder = os.path.join(get_data_home(), 'covertype_benchmark_data')
m = Memory(joblib_cache_folder, mmap_mode='r')
memory = Memory(os.path.join(get_data_home(), 'covertype_benchmark_data'),
mmap_mode='r')


# Load the data, then cache and memmap the train/test split
@m.cache
def load_data(dtype=np.float32, order='C'):
@memory.cache
def load_data(dtype=np.float32, order='C', random_state=13):
"""Load the data, then cache and memmap the train/test split"""
######################################################################
## Load dataset
print("Loading dataset...")
data = fetch_covtype(download_if_missing=True, shuffle=True,
random_state=opts.random_seed)
X, y = data['data'], data['target']
X = np.asarray(X, dtype=dtype)

if order.lower() == 'f':
X = np.asfortranarray(X)
random_state=random_state)
X = check_array(data['data'], dtype=dtype, order=order)
y = (data['target'] != 1).astype(np.int)

# class 1 vs. all others.
y[np.where(y != 1)] = -1

######################################################################
## Create train-test split (as [Joachims, 2006])
logger.info("Creating train-test split...")
print("Creating train-test split...")
n_train = 522911

X_train = X[:n_train]
y_train = y[:n_train]
X_test = X[n_train:]
y_test = y[n_train:]

######################################################################
## Standardize first 10 features (the numerical ones)
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)
Expand All @@ -139,130 +97,92 @@ def load_data(dtype=np.float32, order='C'):
return X_train, X_test, y_train, y_test


X_train, X_test, y_train, y_test = load_data()

######################################################################
## Print dataset statistics
print("")
print("Dataset statistics:")
print("===================")
print("%s %d" % ("number of features:".ljust(25),
X_train.shape[1]))
print("%s %d" % ("number of classes:".ljust(25),
np.unique(y_train).shape[0]))
print("%s %s" % ("data type:".ljust(25), X_train.dtype))
print("%s %d (pos=%d, neg=%d, size=%dMB)"
% ("number of train samples:".ljust(25),
X_train.shape[0], np.sum(y_train == 1),
np.sum(y_train == -1), int(X_train.nbytes / 1e6)))
print("%s %d (pos=%d, neg=%d, size=%dMB)"
% ("number of test samples:".ljust(25),
X_test.shape[0], np.sum(y_test == 1),
np.sum(y_test == -1), int(X_test.nbytes / 1e6)))


classifiers = dict()


######################################################################
## Benchmark classifiers
def benchmark(clf):
t0 = time()
clf.fit(X_train, y_train)
train_time = time() - t0
t0 = time()
pred = clf.predict(X_test)
test_time = time() - t0
err = metrics.zero_one_loss(y_test, pred, normalize=True)
return err, train_time, test_time

######################################################################
## Train Liblinear model
liblinear_parameters = {
'loss': 'l2',
'penalty': 'l2',
'C': 1000,
'dual': False,
'tol': 1e-3,
"random_state": opts.random_seed,
ESTIMATORS = {
'GBRT': GradientBoostingClassifier(n_estimators=250),
'ExtraTrees': ExtraTreesClassifier(n_estimators=20),
'RandomForest': RandomForestClassifier(n_estimators=20),
'CART': DecisionTreeClassifier(min_samples_split=5),
'SGD': SGDClassifier(alpha=0.001, n_iter=2),
'GaussianNB': GaussianNB(),
'liblinear': LinearSVC(loss="l2", penalty="l2", C=1000, dual=False,
tol=1e-3)
}
classifiers['liblinear'] = LinearSVC(**liblinear_parameters)

######################################################################
## Train GaussianNB model
classifiers['GaussianNB'] = GaussianNB()

######################################################################
## Train SGD model
sgd_parameters = {
'alpha': 0.001,
'n_iter': 2,
'n_jobs': opts.n_jobs,
"random_state": opts.random_seed,
}
classifiers['SGD'] = SGDClassifier(**sgd_parameters)

######################################################################
## Train CART model
classifiers['CART'] = DecisionTreeClassifier(min_samples_split=5,
max_depth=None,
random_state=opts.random_seed)

######################################################################
## Train RandomForest model
rf_parameters = {
"n_estimators": 20,
"n_jobs": opts.n_jobs,
"random_state": opts.random_seed,
}
classifiers['RandomForest'] = RandomForestClassifier(**rf_parameters)

######################################################################
## Train Extra-Trees model
classifiers['ExtraTrees'] = ExtraTreesClassifier(n_estimators=20,
n_jobs=opts.n_jobs,
random_state=opts.random_seed)

######################################################################
## Train GBRT model
classifiers['GBRT'] = GradientBoostingClassifier(n_estimators=250,
random_state=opts.random_seed)


selected_classifiers = opts.classifiers.split(',')
for name in selected_classifiers:
if name not in classifiers:
op.error('classifier %r unknown' % name)
sys.exit(1)

print()
print("Training Classifiers")
print("====================")
print()
err, train_time, test_time = {}, {}, {}
for name in sorted(selected_classifiers):
print("Training %s ..." % name)
err[name], train_time[name], test_time[name] = benchmark(classifiers[name])

######################################################################
## Print classification performance
print()
print("Classification performance:")
print("===========================")
print()


def print_row(clf_type, train_time, test_time, err):
print("%s %s %s %s" % (clf_type.ljust(12),
("%.4fs" % train_time).center(10),
("%.4fs" % test_time).center(10),
("%.4f" % err).center(10)))

print("%s %s %s %s" % ("Classifier ", "train-time", "test-time",
"error-rate"))
print("-" * 44)

for name in sorted(selected_classifiers, key=lambda name: err[name]):
print_row(name, train_time[name], test_time[name], err[name])
print()
print()


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--classifiers', nargs="+",
choices=ESTIMATORS, type=str,
default=['liblinear', 'GaussianNB', 'SGD', 'CART'],
help="list of classifiers to benchmark.")
parser.add_argument('--n-jobs', nargs="?", default=1, type=int,
help="Number of concurrently running workers for "
"models that support parallelism.")
parser.add_argument('--order', nargs="?", default="C", type=str,
choices=["F", "C"],
help="Allow to choose between fortran and C ordered "
"data")
parser.add_argument('--random-seed', nargs="?", default=13, type=int,
help="Common seed used by random number generator.")
args = vars(parser.parse_args())

print(__doc__)

X_train, X_test, y_train, y_test = load_data(
order=args["order"], random_state=args["random_seed"])

print("")
print("Dataset statistics:")
print("===================")
print("%s %d" % ("number of features:".ljust(25), X_train.shape[1]))
print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size))
print("%s %s" % ("data type:".ljust(25), X_train.dtype))
print("%s %d (pos=%d, neg=%d, size=%dMB)"
% ("number of train samples:".ljust(25),
X_train.shape[0], np.sum(y_train == 1),
np.sum(y_train == 0), int(X_train.nbytes / 1e6)))
print("%s %d (pos=%d, neg=%d, size=%dMB)"
% ("number of test samples:".ljust(25),
X_test.shape[0], np.sum(y_test == 1),
np.sum(y_test == 0), int(X_test.nbytes / 1e6)))

print()
print("Training Classifiers")
print("====================")
error, train_time, test_time = {}, {}, {}
for name in sorted(args["classifiers"]):
print("Training %s ... " % name, end="")
estimator = ESTIMATORS[name]
estimator_params = estimator.get_params()

if "random_state" in estimator_params:
estimator.set_params(random_state=args["random_seed"])

if "n_jobs" in estimator_params:
estimator.set_params(n_jobs=args["n_jobs"])

time_start = time()
estimator.fit(X_train, y_train)
train_time[name] = time() - time_start

time_start = time()
y_pred = estimator.predict(X_test)
test_time[name] = time() - time_start

error[name] = zero_one_loss(y_test, y_pred)

print("done")

print()
print("Classification performance:")
print("===========================")
print("%s %s %s %s"
% ("Classifier ", "train-time", "test-time", "error-rate"))
print("-" * 44)
for name in sorted(args["classifiers"], key=error.get):
print("%s %s %s %s" % (name.ljust(12),
("%.4fs" % train_time[name]).center(10),
("%.4fs" % test_time[name]).center(10),
("%.4f" % error[name]).center(10)))

print()
Loading

0 comments on commit 8081745

Please sign in to comment.