Skip to content

Commit 59d976d

Browse files
author
Github Actions
committed
Ravin Kohli: [ADD] Allow users to pass feat types to tabular validator (#441)
1 parent d147c30 commit 59d976d

File tree

45 files changed

+1268
-321
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+1268
-321
lines changed
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
"""
2+
=====================================================
3+
Tabular Classification with user passed feature types
4+
=====================================================
5+
6+
The following example shows how to pass feature types for datasets which are in
7+
numpy format (also works for dataframes and lists) and fit a sample classification
8+
model with AutoPyTorch.
9+
10+
AutoPyTorch relies on column dtypes for interpreting the feature types. But they
11+
can be misinterpreted; for example, when a dataset is passed as a numpy array, all
12+
the data is interpreted as numerical if its dtype is int or float. However, the
13+
categorical values could have been encoded as integers.
14+
15+
Passing feature types helps AutoPyTorch interpret them correctly as well as
16+
validate the dataset by checking the dtype of the columns for any incompatibilities.
17+
"""
18+
import os
19+
import tempfile as tmp
20+
import warnings
21+
22+
os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
23+
os.environ['OMP_NUM_THREADS'] = '1'
24+
os.environ['OPENBLAS_NUM_THREADS'] = '1'
25+
os.environ['MKL_NUM_THREADS'] = '1'
26+
27+
warnings.simplefilter(action='ignore', category=UserWarning)
28+
warnings.simplefilter(action='ignore', category=FutureWarning)
29+
30+
import openml
31+
import sklearn.model_selection
32+
33+
from autoPyTorch.api.tabular_classification import TabularClassificationTask
34+
35+
36+
############################################################################
37+
# Data Loading
38+
# ============
39+
task = openml.tasks.get_task(task_id=146821)
40+
dataset = task.get_dataset()
41+
X, y, categorical_indicator, _ = dataset.get_data(
42+
dataset_format='array',
43+
target=dataset.default_target_attribute,
44+
)
45+
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
46+
X,
47+
y,
48+
random_state=1,
49+
)
50+
51+
feat_types = ["numerical" if not indicator else "categorical" for indicator in categorical_indicator]
52+
53+
#
54+
############################################################################
55+
# Build and fit a classifier
56+
# ==========================
57+
api = TabularClassificationTask(
58+
# To maintain logs of the run, you can uncomment the
59+
# following lines
60+
# temporary_directory='./tmp/autoPyTorch_example_tmp_01',
61+
# output_directory='./tmp/autoPyTorch_example_out_01',
62+
# delete_tmp_folder_after_terminate=False,
63+
# delete_output_folder_after_terminate=False,
64+
seed=42,
65+
)
66+
67+
############################################################################
68+
# Search for an ensemble of machine learning algorithms
69+
# =====================================================
70+
api.search(
71+
X_train=X_train,
72+
y_train=y_train,
73+
X_test=X_test.copy(),
74+
y_test=y_test.copy(),
75+
dataset_name='Australian',
76+
optimize_metric='accuracy',
77+
total_walltime_limit=100,
78+
func_eval_time_limit_secs=50,
79+
feat_types=feat_types,
80+
enable_traditional_pipeline=False
81+
)
82+
83+
############################################################################
84+
# Print the final ensemble performance
85+
# ====================================
86+
y_pred = api.predict(X_test)
87+
score = api.score(y_pred, y_test)
88+
print(score)
89+
# Print the final ensemble built by AutoPyTorch
90+
print(api.show_models())
91+
92+
# Print statistics from search
93+
print(api.sprint_statistics())
Binary file not shown.
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {
7+
"collapsed": false
8+
},
9+
"outputs": [],
10+
"source": [
11+
"%matplotlib inline"
12+
]
13+
},
14+
{
15+
"cell_type": "markdown",
16+
"metadata": {},
17+
"source": [
18+
"\n# Tabular Classification with user passed feature types\n\nThe following example shows how to pass feature types for datasets which are in \nnumpy format (also works for dataframes and lists) and fit a sample classification \nmodel with AutoPyTorch.\n\nAutoPyTorch relies on column dtypes for interpreting the feature types. But they \ncan be misinterpreted; for example, when a dataset is passed as a numpy array, all \nthe data is interpreted as numerical if its dtype is int or float. However, the \ncategorical values could have been encoded as integers.\n\nPassing feature types helps AutoPyTorch interpret them correctly as well as\nvalidate the dataset by checking the dtype of the columns for any incompatibilities.\n"
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": null,
24+
"metadata": {
25+
"collapsed": false
26+
},
27+
"outputs": [],
28+
"source": [
29+
"import os\nimport tempfile as tmp\nimport warnings\n\nos.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()\nos.environ['OMP_NUM_THREADS'] = '1'\nos.environ['OPENBLAS_NUM_THREADS'] = '1'\nos.environ['MKL_NUM_THREADS'] = '1'\n\nwarnings.simplefilter(action='ignore', category=UserWarning)\nwarnings.simplefilter(action='ignore', category=FutureWarning)\n\nimport openml\nimport sklearn.model_selection\n\nfrom autoPyTorch.api.tabular_classification import TabularClassificationTask"
30+
]
31+
},
32+
{
33+
"cell_type": "markdown",
34+
"metadata": {},
35+
"source": [
36+
"## Data Loading\n\n"
37+
]
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": null,
42+
"metadata": {
43+
"collapsed": false
44+
},
45+
"outputs": [],
46+
"source": [
47+
"task = openml.tasks.get_task(task_id=146821)\ndataset = task.get_dataset()\nX, y, categorical_indicator, _ = dataset.get_data(\n dataset_format='array',\n target=dataset.default_target_attribute,\n)\nX_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n X,\n y,\n random_state=1,\n)\n\nfeat_types = [\"numerical\" if not indicator else \"categorical\" for indicator in categorical_indicator]\n\n#"
48+
]
49+
},
50+
{
51+
"cell_type": "markdown",
52+
"metadata": {},
53+
"source": [
54+
"## Build and fit a classifier\n\n"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": null,
60+
"metadata": {
61+
"collapsed": false
62+
},
63+
"outputs": [],
64+
"source": [
65+
"api = TabularClassificationTask(\n # To maintain logs of the run, you can uncomment the\n # Following lines\n # temporary_directory='./tmp/autoPyTorch_example_tmp_01',\n # output_directory='./tmp/autoPyTorch_example_out_01',\n # delete_tmp_folder_after_terminate=False,\n # delete_output_folder_after_terminate=False,\n seed=42,\n)"
66+
]
67+
},
68+
{
69+
"cell_type": "markdown",
70+
"metadata": {},
71+
"source": [
72+
"## Search for an ensemble of machine learning algorithms\n\n"
73+
]
74+
},
75+
{
76+
"cell_type": "code",
77+
"execution_count": null,
78+
"metadata": {
79+
"collapsed": false
80+
},
81+
"outputs": [],
82+
"source": [
83+
"api.search(\n X_train=X_train,\n y_train=y_train,\n X_test=X_test.copy(),\n y_test=y_test.copy(),\n dataset_name='Australian',\n optimize_metric='accuracy',\n total_walltime_limit=100,\n func_eval_time_limit_secs=50,\n feat_types=feat_types,\n enable_traditional_pipeline=False\n)"
84+
]
85+
},
86+
{
87+
"cell_type": "markdown",
88+
"metadata": {},
89+
"source": [
90+
"## Print the final ensemble performance\n\n"
91+
]
92+
},
93+
{
94+
"cell_type": "code",
95+
"execution_count": null,
96+
"metadata": {
97+
"collapsed": false
98+
},
99+
"outputs": [],
100+
"source": [
101+
"y_pred = api.predict(X_test)\nscore = api.score(y_pred, y_test)\nprint(score)\n# Print the final ensemble built by AutoPyTorch\nprint(api.show_models())\n\n# Print statistics from search\nprint(api.sprint_statistics())"
102+
]
103+
}
104+
],
105+
"metadata": {
106+
"kernelspec": {
107+
"display_name": "Python 3",
108+
"language": "python",
109+
"name": "python3"
110+
},
111+
"language_info": {
112+
"codemirror_mode": {
113+
"name": "ipython",
114+
"version": 3
115+
},
116+
"file_extension": ".py",
117+
"mimetype": "text/x-python",
118+
"name": "python",
119+
"nbconvert_exporter": "python",
120+
"pygments_lexer": "ipython3",
121+
"version": "3.8.13"
122+
}
123+
},
124+
"nbformat": 4,
125+
"nbformat_minor": 0
126+
}
Binary file not shown.
Loading
Loading
Loading
Loading
Loading

development/_modules/autoPyTorch/api/tabular_classification.html

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,7 @@ <h1>Source code for autoPyTorch.api.tabular_classification</h1><div class="highl
282282
<span class="n">resampling_strategy_args</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
283283
<span class="n">dataset_name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
284284
<span class="n">dataset_compression</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">DatasetCompressionSpec</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
285+
<span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
285286
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">TabularDataset</span><span class="p">,</span> <span class="n">TabularInputValidator</span><span class="p">]:</span>
286287
<span class="sd">&quot;&quot;&quot;</span>
287288
<span class="sd"> Returns an object of `TabularDataset` and an object of</span>
@@ -308,6 +309,9 @@ <h1>Source code for autoPyTorch.api.tabular_classification</h1><div class="highl
308309
<span class="sd"> dataset_compression (Optional[DatasetCompressionSpec]):</span>
309310
<span class="sd"> specifications for dataset compression. For more info check</span>
310311
<span class="sd"> documentation for `BaseTask.get_dataset`.</span>
312+
<span class="sd"> kwargs (Any):</span>
313+
<span class="sd"> Currently for tabular tasks, expect `feat_types: Optional[List[str]]` which</span>
314+
<span class="sd"> specifies whether a feature is &#39;numerical&#39; or &#39;categorical&#39;.</span>
311315

312316
<span class="sd"> Returns:</span>
313317
<span class="sd"> TabularDataset:</span>
@@ -320,12 +324,14 @@ <h1>Source code for autoPyTorch.api.tabular_classification</h1><div class="highl
320324
<span class="n">resampling_strategy_args</span> <span class="o">=</span> <span class="n">resampling_strategy_args</span> <span class="k">if</span> <span class="n">resampling_strategy_args</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> \
321325
<span class="bp">self</span><span class="o">.</span><span class="n">resampling_strategy_args</span>
322326

327+
<span class="n">feat_types</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;feat_types&#39;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
323328
<span class="c1"># Create a validator object to make sure that the data provided by</span>
324329
<span class="c1"># the user matches the autopytorch requirements</span>
325330
<span class="n">input_validator</span> <span class="o">=</span> <span class="n">TabularInputValidator</span><span class="p">(</span>
326331
<span class="n">is_classification</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
327332
<span class="n">logger_port</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_logger_port</span><span class="p">,</span>
328-
<span class="n">dataset_compression</span><span class="o">=</span><span class="n">dataset_compression</span>
333+
<span class="n">dataset_compression</span><span class="o">=</span><span class="n">dataset_compression</span><span class="p">,</span>
334+
<span class="n">feat_types</span><span class="o">=</span><span class="n">feat_types</span>
329335
<span class="p">)</span>
330336

331337
<span class="c1"># Fit a input validator to check the provided data</span>
@@ -352,6 +358,7 @@ <h1>Source code for autoPyTorch.api.tabular_classification</h1><div class="highl
352358
<span class="n">X_test</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
353359
<span class="n">y_test</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
354360
<span class="n">dataset_name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
361+
<span class="n">feat_types</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
355362
<span class="n">budget_type</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s1">&#39;epochs&#39;</span><span class="p">,</span>
356363
<span class="n">min_budget</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">,</span>
357364
<span class="n">max_budget</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">50</span><span class="p">,</span>
@@ -380,6 +387,10 @@ <h1>Source code for autoPyTorch.api.tabular_classification</h1><div class="highl
380387
<span class="sd"> A pair of features (X_train) and targets (y_train) used to fit a</span>
381388
<span class="sd"> pipeline. Additionally, a holdout of this pairs (X_test, y_test) can</span>
382389
<span class="sd"> be provided to track the generalization performance of each stage.</span>
390+
<span class="sd"> feat_types (Optional[List[str]]):</span>
391+
<span class="sd"> Description about the feature types of the columns.</span>
392+
<span class="sd"> Accepts `numerical` for integers, float data and `categorical`</span>
393+
<span class="sd"> for categories, strings and bool. Defaults to None.</span>
383394
<span class="sd"> optimize_metric (str):</span>
384395
<span class="sd"> name of the metric that is used to evaluate a pipeline.</span>
385396
<span class="sd"> budget_type (str):</span>
@@ -547,7 +558,8 @@ <h1>Source code for autoPyTorch.api.tabular_classification</h1><div class="highl
547558
<span class="n">resampling_strategy</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">resampling_strategy</span><span class="p">,</span>
548559
<span class="n">resampling_strategy_args</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">resampling_strategy_args</span><span class="p">,</span>
549560
<span class="n">dataset_name</span><span class="o">=</span><span class="n">dataset_name</span><span class="p">,</span>
550-
<span class="n">dataset_compression</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_dataset_compression</span><span class="p">)</span>
561+
<span class="n">dataset_compression</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_dataset_compression</span><span class="p">,</span>
562+
<span class="n">feat_types</span><span class="o">=</span><span class="n">feat_types</span><span class="p">)</span>
551563

552564
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_search</span><span class="p">(</span>
553565
<span class="n">dataset</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">dataset</span><span class="p">,</span>

0 commit comments

Comments
 (0)