remove catboost training dir; ensemble api; blendsearch for hierarchical space; ranking task; forecast improvement (microsoft#178)

* remove catboost training dir

* close microsoft#48

* blendsearch for hierarchical space. close microsoft#85

* retrain for hierarchical space

* clean ml (microsoft#180)

Co-authored-by: Qingyun Wu <qxw5138@psu.edu>

* support ranking task

* examples

* cv shuffle

* forecast api and implementation cleaner

* period constraints

* delete groups after fit
sonichi authored Sep 1, 2021
1 parent 1bc8786 commit 6ab0730
Showing 21 changed files with 1,398 additions and 1,125 deletions.
39 changes: 36 additions & 3 deletions README.md
@@ -65,7 +65,7 @@ tune.run(train_with_config, config={…}, low_cost_partial_config={…}, time_bu

## Advantages

-* For classification and regression tasks, find quality models with lower computational resources.
+* For common machine learning tasks like classification and regression, find quality models with small computational resources.
* Users can choose their desired customizability: minimal customization (computational resource budget), medium customization (e.g., scikit-style learner, search space and metric), full customization (arbitrary training and evaluation code).
* Allow human guidance in hyperparameter tuning to respect prior knowledge on certain subspaces while still exploring others. Read more about the
hyperparameter optimization methods
@@ -75,7 +75,7 @@ And they can be used in distributed HPO frameworks such as ray tune or nni.
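Beyond the `AutoML` class, the `tune.run` interface shown in the hunk above can tune arbitrary training code. A minimal sketch, assuming the `flaml.tune` API of this release; the toy objective and search space are illustrative, not from the repository:

```python
from flaml import tune

def train_with_config(config):
    # stand-in for arbitrary training and evaluation code
    score = (config["x"] - 2) ** 2 + config["y"]
    tune.report(score=score)

analysis = tune.run(
    train_with_config,
    config={"x": tune.uniform(-10, 10), "y": tune.randint(1, 100)},
    low_cost_partial_config={"y": 1},  # known-cheap subspace to start from
    metric="score", mode="min",
    num_samples=-1, time_budget_s=10,  # search until the time budget runs out
)
print(analysis.best_config)
```

`low_cost_partial_config` is how a prior on cheap subspaces is expressed: the search starts there and expands to other subspaces as the budget allows.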

## Examples

-A basic classification example.
+- A basic classification example.

```python
from flaml import AutoML
@@ -99,7 +99,7 @@ print(automl.predict_proba(X_train))
print(automl.model)
```

-A basic regression example.
+- A basic regression example.

```python
from flaml import AutoML
@@ -123,6 +123,39 @@ print(automl.predict(X_train))
print(automl.model)
```

- Time series forecasting.

```python
# pip install flaml[forecast]
import numpy as np
from flaml import AutoML
X_train = np.arange('2014-01', '2021-01', dtype='datetime64[M]')
y_train = np.random.random(size=72)
automl = AutoML()
automl.fit(X_train=X_train[:72],  # a single column of timestamps
           y_train=y_train,  # value for each timestamp
           period=12,  # time horizon to forecast, e.g., 12 months
           task='forecast', time_budget=15,  # time budget in seconds
           log_file_name="test/forecast.log")
print(automl.predict(X_train[72:]))
```

- Learning to rank.

```python
from sklearn.datasets import fetch_openml
from flaml import AutoML
X_train, y_train = fetch_openml(name="credit-g", return_X_y=True)
# not a real learning-to-rank dataset
groups = [200] * 4 + [100] * 2  # group counts
automl = AutoML()
automl.fit(
    X_train, y_train, groups=groups,
    task='rank', time_budget=10,  # in seconds
)
```
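Here `groups` gives the number of consecutive training rows that belong to each query group; the counts should sum to the number of rows (4 × 200 + 2 × 100 = 1000 for credit-g).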

More examples can be found in [notebooks](https://github.com/microsoft/FLAML/tree/main/notebook/).

## Documentation
368 changes: 228 additions & 140 deletions flaml/automl.py

Large diffs are not rendered by default.

21 changes: 15 additions & 6 deletions flaml/data.py
@@ -146,7 +146,7 @@ def get_output_from_log(filename, time_budget):
            config = record.config
            learner = record.learner.split('_')[0]
            sample_size = record.sample_size
-            train_loss = record.logged_metric
+            metric = record.logged_metric

            if time_used < time_budget and np.isfinite(val_loss):
                if val_loss < best_val_loss:
@@ -156,7 +156,7 @@ def get_output_from_log(filename, time_budget):
                    best_config_list.append(best_config)
                search_time_list.append(time_used)
                best_error_list.append(best_val_loss)
-                logged_metric_list.append(train_loss)
+                logged_metric_list.append(metric)
                error_list.append(val_loss)
                config_list.append({"Current Learner": learner,
                                    "Current Sample": sample_size,
@@ -242,8 +242,12 @@ def fit_transform(self, X, y, task):
            X[cat_columns] = X[cat_columns].astype('category')
        if num_columns:
            X_num = X[num_columns]
-            if drop and np.issubdtype(X_num.columns.dtype, np.integer):
+            if np.issubdtype(X_num.columns.dtype, np.integer) and (
+                drop or min(X_num.columns) != 0
+                or max(X_num.columns) != X_num.shape[1] - 1
+            ):
                X_num.columns = range(X_num.shape[1])
+                drop = True
            else:
                drop = False
            from sklearn.impute import SimpleImputer
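A hedged reading of this change: integer column labels are now renumbered to `0..n-1` whenever they are not already in that form (previously only when `drop` was set), and `drop` is forced to `True` so that `transform` repeats the same renaming. A minimal illustration of the check, with made-up data:

```python
import numpy as np
import pandas as pd

X_num = pd.DataFrame({3: [1.0, 2.0], 7: [0.5, np.nan]})  # integer labels, not 0..n-1
if np.issubdtype(X_num.columns.dtype, np.integer) and (
        min(X_num.columns) != 0 or max(X_num.columns) != X_num.shape[1] - 1):
    X_num.columns = range(X_num.shape[1])
print(list(X_num.columns))  # [0, 1]
```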
@@ -257,12 +261,12 @@ def fit_transform(self, X, y, task):
                cat_columns, num_columns, datetime_columns
            self._drop = drop

-        if task == 'regression':
-            self.label_transformer = None
-        else:
+        if task in ('binary:logistic', 'multi:softmax'):
            from sklearn.preprocessing import LabelEncoder
            self.label_transformer = LabelEncoder()
            y = self.label_transformer.fit_transform(y)
+        else:
+            self.label_transformer = None
        return X, y

    def transform(self, X):
@@ -302,3 +306,8 @@ def transform(self, X):
            X_num.columns = range(X_num.shape[1])
        X[num_columns] = self.transformer.transform(X_num)
        return X
+
+
+def group_counts(groups):
+    _, i, c = np.unique(groups, return_counts=True, return_index=True)
+    return c[np.argsort(i)]
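A quick illustration of the new helper, with made-up groups: it returns the size of each group ordered by first appearance rather than by sorted key, presumably to match the row order that ranking learners expect:

```python
import numpy as np

def group_counts(groups):
    _, i, c = np.unique(groups, return_counts=True, return_index=True)
    return c[np.argsort(i)]

print(group_counts(np.array([2, 2, 2, 1, 1])))  # [3 2], not [2 3]
```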