fixing merge conflicts

Atthemoment · Sep 10, 2014 · 8081745 · 8081745
2 parents 149d97f + 5fc0e8f
commit 8081745
Show file tree

Hide file tree

Showing 199 changed files with 8,480 additions and 12,622 deletions.
diff --git a/.gitignore b/.gitignore
@@ -22,6 +22,7 @@ pip-log.txt
 scikit_learn.egg-info/
 .coverage
 coverage
+*.py,cover
 tags
 covtype.data.gz
 20news-18828/

diff --git a/appveyor.yml b/appveyor.yml
@@ -7,6 +7,9 @@ environment:
     # /E:ON and /V:ON options are not enabled in the batch script interpreter
     # See: http://stackoverflow.com/a/13751649/163740
     CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\continuous_integration\\appveyor\\run_with_env.cmd"
+    WHEELHOUSE_UPLOADER_USERNAME: sklearn-appveyor
+    WHEELHOUSE_UPLOADER_SECRET:
+      secure: BQm8KfEj6v2Y+dQxb2syQvTFxDnHXvaNktkLcYSq7jfbTOO6eH9n09tfQzFUVcWZ
 
   matrix:
     - PYTHON: "C:\\Python27_32"
@@ -25,11 +28,6 @@ environment:
       PYTHON_VERSION: "3.4.1"
       PYTHON_ARCH: "64"
 
-branches:
-  only:
-    - master
-    - 0.15.X
-
 install:
   # Install Python (from the official .msi of http://python.org) and pip when
   # not already installed.
@@ -54,15 +52,22 @@ build: false
 test_script:
   # Change to a non-source folder to make sure we run the tests on the
   # installed library.
-  - "cd C:\\"
+  - "mkdir empty_folder"
+  - "cd empty_folder"
 
   # Skip joblib tests that require multiprocessing as they are prone to random
   # slow down
   - "python -c \"import nose; nose.main()\" -s sklearn"
 
+  # Move back to the project folder
+  - "cd .."
+
 artifacts:
   # Archive the generated wheel package in the ci.appveyor.com build report.
   - path: dist\*
 
-#on_success:
-#  - TODO: upload the content of dist/*.whl to a public wheelhouse
+on_success:
+  # Upload the generated wheel package to Rackspace
+  # On Windows, Apache Libcloud cannot find a standard CA cert bundle so we
+  # disable the ssl checks.
+  - "python -m wheelhouse_uploader upload --no-ssl-check --local-folder=dist sklearn-windows-wheels"
diff --git a/benchmarks/bench_covertype.py b/benchmarks/bench_covertype.py
@@ -42,17 +42,13 @@
 """
 from __future__ import division, print_function
 
-print(__doc__)
-
 # Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
+#         Arnaud Joly <arnaud.v.joly@gmail.com>
 # License: BSD 3 clause
 
-import logging
 import os
-import sys
 from time import time
-from optparse import OptionParser
-
+import argparse
 import numpy as np
 
 from sklearn.datasets import fetch_covtype, get_data_home
@@ -62,73 +58,35 @@
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
 from sklearn.ensemble import GradientBoostingClassifier
-from sklearn import metrics
+from sklearn.metrics import zero_one_loss
 from sklearn.externals.joblib import Memory
-
-logging.basicConfig(level=logging.INFO,
-                    format='%(asctime)s %(levelname)s %(message)s')
-logger = logging.getLogger(__name__)
-
-op = OptionParser()
-op.add_option("--classifiers",
-              dest="classifiers", default='liblinear,GaussianNB,SGD,CART',
-              help="comma-separated list of classifiers to benchmark. "
-                   "default: %default. available: "
-                   "liblinear, GaussianNB, SGD, CART, ExtraTrees,\n"
-                   "RandomForest, GBRT")
-
-op.add_option("--n-jobs",
-              dest="n_jobs", default=1, type=int,
-              help="Number of concurrently running workers for models that"
-                   " support parallelism.")
-
-# Each number generator use the same seed to avoid coupling issue between
-# estimators.
-op.add_option("--random-seed",
-              dest="random_seed", default=13, type=int,
-              help="Common seed used by random number generator.")
-
-op.print_help()
-
-(opts, args) = op.parse_args()
-if len(args) > 0:
-    op.error("this script takes no arguments.")
-    sys.exit(1)
+from sklearn.utils import check_array
 
 # Memoize the data extraction and memory map the resulting
 # train / test splits in readonly mode
-joblib_cache_folder = os.path.join(get_data_home(), 'covertype_benchmark_data')
-m = Memory(joblib_cache_folder, mmap_mode='r')
+memory = Memory(os.path.join(get_data_home(), 'covertype_benchmark_data'),
+                mmap_mode='r')
 
 
-# Load the data, then cache and memmap the train/test split
-@m.cache
-def load_data(dtype=np.float32, order='C'):
+@memory.cache
+def load_data(dtype=np.float32, order='C', random_state=13):
+    """Load the data, then cache and memmap the train/test split"""
     ######################################################################
     ## Load dataset
     print("Loading dataset...")
     data = fetch_covtype(download_if_missing=True, shuffle=True,
-                         random_state=opts.random_seed)
-    X, y = data['data'], data['target']
-    X = np.asarray(X, dtype=dtype)
-
-    if order.lower() == 'f':
-        X = np.asfortranarray(X)
+                         random_state=random_state)
+    X = check_array(data['data'], dtype=dtype, order=order)
+    y = (data['target'] != 1).astype(np.int)
 
-    # class 1 vs. all others.
-    y[np.where(y != 1)] = -1
-
-    ######################################################################
     ## Create train-test split (as [Joachims, 2006])
-    logger.info("Creating train-test split...")
+    print("Creating train-test split...")
     n_train = 522911
-
     X_train = X[:n_train]
     y_train = y[:n_train]
     X_test = X[n_train:]
     y_test = y[n_train:]
 
-    ######################################################################
     ## Standardize first 10 features (the numerical ones)
     mean = X_train.mean(axis=0)
     std = X_train.std(axis=0)
@@ -139,130 +97,92 @@ def load_data(dtype=np.float32, order='C'):
     return X_train, X_test, y_train, y_test
 
 
-X_train, X_test, y_train, y_test = load_data()
-
-######################################################################
-## Print dataset statistics
-print("")
-print("Dataset statistics:")
-print("===================")
-print("%s %d" % ("number of features:".ljust(25),
-                 X_train.shape[1]))
-print("%s %d" % ("number of classes:".ljust(25),
-                 np.unique(y_train).shape[0]))
-print("%s %s" % ("data type:".ljust(25), X_train.dtype))
-print("%s %d (pos=%d, neg=%d, size=%dMB)"
-      % ("number of train samples:".ljust(25),
-         X_train.shape[0], np.sum(y_train == 1),
-         np.sum(y_train == -1), int(X_train.nbytes / 1e6)))
-print("%s %d (pos=%d, neg=%d, size=%dMB)"
-      % ("number of test samples:".ljust(25),
-      X_test.shape[0], np.sum(y_test == 1),
-      np.sum(y_test == -1), int(X_test.nbytes / 1e6)))
-
-
-classifiers = dict()
-
-
-######################################################################
-## Benchmark classifiers
-def benchmark(clf):
-    t0 = time()
-    clf.fit(X_train, y_train)
-    train_time = time() - t0
-    t0 = time()
-    pred = clf.predict(X_test)
-    test_time = time() - t0
-    err = metrics.zero_one_loss(y_test, pred, normalize=True)
-    return err, train_time, test_time
-
-######################################################################
-## Train Liblinear model
-liblinear_parameters = {
-    'loss': 'l2',
-    'penalty': 'l2',
-    'C': 1000,
-    'dual': False,
-    'tol': 1e-3,
-    "random_state": opts.random_seed,
+ESTIMATORS = {
+    'GBRT': GradientBoostingClassifier(n_estimators=250),
+    'ExtraTrees': ExtraTreesClassifier(n_estimators=20),
+    'RandomForest': RandomForestClassifier(n_estimators=20),
+    'CART': DecisionTreeClassifier(min_samples_split=5),
+    'SGD': SGDClassifier(alpha=0.001, n_iter=2),
+    'GaussianNB': GaussianNB(),
+    'liblinear': LinearSVC(loss="l2", penalty="l2", C=1000, dual=False,
+                           tol=1e-3)
 }
-classifiers['liblinear'] = LinearSVC(**liblinear_parameters)
-
-######################################################################
-## Train GaussianNB model
-classifiers['GaussianNB'] = GaussianNB()
-
-######################################################################
-## Train SGD model
-sgd_parameters = {
-    'alpha': 0.001,
-    'n_iter': 2,
-    'n_jobs': opts.n_jobs,
-    "random_state": opts.random_seed,
-}
-classifiers['SGD'] = SGDClassifier(**sgd_parameters)
-
-######################################################################
-## Train CART model
-classifiers['CART'] = DecisionTreeClassifier(min_samples_split=5,
-                                             max_depth=None,
-                                             random_state=opts.random_seed)
-
-######################################################################
-## Train RandomForest model
-rf_parameters = {
-    "n_estimators": 20,
-    "n_jobs": opts.n_jobs,
-    "random_state": opts.random_seed,
-}
-classifiers['RandomForest'] = RandomForestClassifier(**rf_parameters)
-
-######################################################################
-## Train Extra-Trees model
-classifiers['ExtraTrees'] = ExtraTreesClassifier(n_estimators=20,
-                                                 n_jobs=opts.n_jobs,
-                                                 random_state=opts.random_seed)
-
-######################################################################
-## Train GBRT model
-classifiers['GBRT'] = GradientBoostingClassifier(n_estimators=250,
-                                                 random_state=opts.random_seed)
-
-
-selected_classifiers = opts.classifiers.split(',')
-for name in selected_classifiers:
-    if name not in classifiers:
-        op.error('classifier %r unknown' % name)
-        sys.exit(1)
-
-print()
-print("Training Classifiers")
-print("====================")
-print()
-err, train_time, test_time = {}, {}, {}
-for name in sorted(selected_classifiers):
-    print("Training %s ..." % name)
-    err[name], train_time[name], test_time[name] = benchmark(classifiers[name])
-
-######################################################################
-## Print classification performance
-print()
-print("Classification performance:")
-print("===========================")
-print()
-
-
-def print_row(clf_type, train_time, test_time, err):
-    print("%s %s %s %s" % (clf_type.ljust(12),
-                           ("%.4fs" % train_time).center(10),
-                           ("%.4fs" % test_time).center(10),
-                           ("%.4f" % err).center(10)))
-
-print("%s %s %s %s" % ("Classifier  ", "train-time", "test-time",
-                       "error-rate"))
-print("-" * 44)
-
-for name in sorted(selected_classifiers, key=lambda name: err[name]):
-    print_row(name, train_time[name], test_time[name], err[name])
-print()
-print()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--classifiers', nargs="+",
+                        choices=ESTIMATORS, type=str,
+                        default=['liblinear', 'GaussianNB', 'SGD', 'CART'],
+                        help="list of classifiers to benchmark.")
+    parser.add_argument('--n-jobs', nargs="?", default=1, type=int,
+                        help="Number of concurrently running workers for "
+                             "models that support parallelism.")
+    parser.add_argument('--order', nargs="?", default="C", type=str,
+                        choices=["F", "C"],
+                        help="Allow to choose between fortran and C ordered "
+                             "data")
+    parser.add_argument('--random-seed', nargs="?", default=13, type=int,
+                        help="Common seed used by random number generator.")
+    args = vars(parser.parse_args())
+
+    print(__doc__)
+
+    X_train, X_test, y_train, y_test = load_data(
+        order=args["order"], random_state=args["random_seed"])
+
+    print("")
+    print("Dataset statistics:")
+    print("===================")
+    print("%s %d" % ("number of features:".ljust(25), X_train.shape[1]))
+    print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size))
+    print("%s %s" % ("data type:".ljust(25), X_train.dtype))
+    print("%s %d (pos=%d, neg=%d, size=%dMB)"
+          % ("number of train samples:".ljust(25),
+             X_train.shape[0], np.sum(y_train == 1),
+             np.sum(y_train == 0), int(X_train.nbytes / 1e6)))
+    print("%s %d (pos=%d, neg=%d, size=%dMB)"
+          % ("number of test samples:".ljust(25),
+             X_test.shape[0], np.sum(y_test == 1),
+             np.sum(y_test == 0), int(X_test.nbytes / 1e6)))
+
+    print()
+    print("Training Classifiers")
+    print("====================")
+    error, train_time, test_time = {}, {}, {}
+    for name in sorted(args["classifiers"]):
+        print("Training %s ... " % name, end="")
+        estimator = ESTIMATORS[name]
+        estimator_params = estimator.get_params()
+
+        if "random_state" in estimator_params:
+            estimator.set_params(random_state=args["random_seed"])
+
+        if "n_jobs" in estimator_params:
+            estimator.set_params(n_jobs=args["n_jobs"])
+
+        time_start = time()
+        estimator.fit(X_train, y_train)
+        train_time[name] = time() - time_start
+
+        time_start = time()
+        y_pred = estimator.predict(X_test)
+        test_time[name] = time() - time_start
+
+        error[name] = zero_one_loss(y_test, y_pred)
+
+        print("done")
+
+    print()
+    print("Classification performance:")
+    print("===========================")
+    print("%s %s %s %s"
+          % ("Classifier  ", "train-time", "test-time", "error-rate"))
+    print("-" * 44)
+    for name in sorted(args["classifiers"], key=error.get):
+        print("%s %s %s %s" % (name.ljust(12),
+                               ("%.4fs" % train_time[name]).center(10),
+                               ("%.4fs" % test_time[name]).center(10),
+                               ("%.4f" % error[name]).center(10)))
+
+    print()