automl
diff --git a/‎development/_downloads/b94e5326627884f180b75cfade993f09/example_pass_feature_types.py
Lines changed: 93 additions & 0 deletions b/‎development/_downloads/b94e5326627884f180b75cfade993f09/example_pass_feature_types.py
Lines changed: 93 additions & 0 deletions
diff --git a/‎development/_downloads/bc82bea3a5dd7bdba60b65220891d9e5/examples_python.zip
3.23 KB b/‎development/_downloads/bc82bea3a5dd7bdba60b65220891d9e5/examples_python.zip
3.23 KB
diff --git a/‎development/_downloads/f12b80fbfb7d358449eeb84e7ac56173/example_pass_feature_types.ipynb
Lines changed: 126 additions & 0 deletions b/‎development/_downloads/f12b80fbfb7d358449eeb84e7ac56173/example_pass_feature_types.ipynb
Lines changed: 126 additions & 0 deletions
diff --git a/‎development/_downloads/fb625db3c50d423b1b7881136ffdeec8/examples_jupyter.zip
4.86 KB b/‎development/_downloads/fb625db3c50d423b1b7881136ffdeec8/examples_jupyter.zip
4.86 KB
diff --git a/‎development/_images/sphx_glr_example_pass_feature_types_thumb.png
26.2 KB b/‎development/_images/sphx_glr_example_pass_feature_types_thumb.png
26.2 KB
diff --git a/‎development/_images/sphx_glr_example_plot_over_time_001.png
3.73 KB b/‎development/_images/sphx_glr_example_plot_over_time_001.png
3.73 KB
diff --git a/‎development/_images/sphx_glr_example_plot_over_time_thumb.png
2.36 KB b/‎development/_images/sphx_glr_example_plot_over_time_thumb.png
2.36 KB
diff --git a/‎development/_images/sphx_glr_example_visualization_001.png
-4.68 KB b/‎development/_images/sphx_glr_example_visualization_001.png
-4.68 KB
diff --git a/‎development/_images/sphx_glr_example_visualization_thumb.png
-2.51 KB b/‎development/_images/sphx_glr_example_visualization_thumb.png
-2.51 KB
diff --git a/‎development/_modules/autoPyTorch/api/tabular_classification.html
Lines changed: 14 additions & 2 deletions b/‎development/_modules/autoPyTorch/api/tabular_classification.html
Lines changed: 14 additions & 2 deletions
@@ -0,0 +1,93 @@
+"""
+=====================================================
+Tabular Classification with user passed feature types
+=====================================================
+
+The following example shows how to pass feature types for datasets which are in
+numpy format (this also works for dataframes and lists) and fit a sample
+classification model with AutoPyTorch.
+
+AutoPyTorch relies on column dtypes for interpreting the feature types. But they
+can be misinterpreted: for example, when a dataset is passed as a numpy array, all
+the data is interpreted as numerical if its dtype is int or float. However, the
+categorical values could have been encoded as integers.
+
+Passing feature types helps AutoPyTorch interpret them correctly and also
+validates the dataset by checking the dtype of the columns for any incompatibilities.
+"""
+import os
+import tempfile as tmp
+import warnings
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import openml
+import sklearn.model_selection
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+
+
+############################################################################
+# Data Loading
+# ============
+task = openml.tasks.get_task(task_id=146821)
+dataset = task.get_dataset()
+X, y, categorical_indicator, _ = dataset.get_data(
+    dataset_format='array',
+    target=dataset.default_target_attribute,
+)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+    X,
+    y,
+    random_state=1,
+)
+
+feat_types = ["numerical" if not indicator else "categorical" for indicator in categorical_indicator]
+
+# 
+############################################################################
+# Build and fit a classifier
+# ==========================
+api = TabularClassificationTask(
+    # To maintain logs of the run, you can uncomment the
+    # following lines
+    # temporary_directory='./tmp/autoPyTorch_example_tmp_01',
+    # output_directory='./tmp/autoPyTorch_example_out_01',
+    # delete_tmp_folder_after_terminate=False,
+    # delete_output_folder_after_terminate=False,
+    seed=42,
+)
+
+############################################################################
+# Search for an ensemble of machine learning algorithms
+# =====================================================
+api.search(
+    X_train=X_train,
+    y_train=y_train,
+    X_test=X_test.copy(),
+    y_test=y_test.copy(),
+    dataset_name='Australian',
+    optimize_metric='accuracy',
+    total_walltime_limit=100,
+    func_eval_time_limit_secs=50,
+    feat_types=feat_types,
+    enable_traditional_pipeline=False
+)
+
+############################################################################
+# Print the final ensemble performance
+# ====================================
+y_pred = api.predict(X_test)
+score = api.score(y_pred, y_test)
+print(score)
+# Print the final ensemble built by AutoPyTorch
+print(api.show_models())
+
+# Print statistics from search
+print(api.sprint_statistics())
@@ -0,0 +1,126 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Tabular Classification with user passed feature types\n\nThe following example shows how to pass feature types for datasets which are in\nnumpy format (this also works for dataframes and lists) and fit a sample\nclassification model with AutoPyTorch.\n\nAutoPyTorch relies on column dtypes for interpreting the feature types. But they\ncan be misinterpreted: for example, when a dataset is passed as a numpy array, all\nthe data is interpreted as numerical if its dtype is int or float. However, the\ncategorical values could have been encoded as integers.\n\nPassing feature types helps AutoPyTorch interpret them correctly and also\nvalidates the dataset by checking the dtype of the columns for any incompatibilities.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import os\nimport tempfile as tmp\nimport warnings\n\nos.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()\nos.environ['OMP_NUM_THREADS'] = '1'\nos.environ['OPENBLAS_NUM_THREADS'] = '1'\nos.environ['MKL_NUM_THREADS'] = '1'\n\nwarnings.simplefilter(action='ignore', category=UserWarning)\nwarnings.simplefilter(action='ignore', category=FutureWarning)\n\nimport openml\nimport sklearn.model_selection\n\nfrom autoPyTorch.api.tabular_classification import TabularClassificationTask"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Data Loading\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "task = openml.tasks.get_task(task_id=146821)\ndataset = task.get_dataset()\nX, y, categorical_indicator, _ = dataset.get_data(\n    dataset_format='array',\n    target=dataset.default_target_attribute,\n)\nX_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n    X,\n    y,\n    random_state=1,\n)\n\nfeat_types = [\"numerical\" if not indicator else \"categorical\" for indicator in categorical_indicator]\n\n#"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Build and fit a classifier\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "api = TabularClassificationTask(\n    # To maintain logs of the run, you can uncomment the\n    # Following lines\n    # temporary_directory='./tmp/autoPyTorch_example_tmp_01',\n    # output_directory='./tmp/autoPyTorch_example_out_01',\n    # delete_tmp_folder_after_terminate=False,\n    # delete_output_folder_after_terminate=False,\n    seed=42,\n)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Search for an ensemble of machine learning algorithms\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "api.search(\n    X_train=X_train,\n    y_train=y_train,\n    X_test=X_test.copy(),\n    y_test=y_test.copy(),\n    dataset_name='Australian',\n    optimize_metric='accuracy',\n    total_walltime_limit=100,\n    func_eval_time_limit_secs=50,\n    feat_types=feat_types,\n    enable_traditional_pipeline=False\n)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Print the final ensemble performance\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "y_pred = api.predict(X_test)\nscore = api.score(y_pred, y_test)\nprint(score)\n# Print the final ensemble built by AutoPyTorch\nprint(api.show_models())\n\n# Print statistics from search\nprint(api.sprint_statistics())"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.8.13"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
@@ -282,6 +282,7 @@ <h1>Source code for autoPyTorch.api.tabular_classification</h1><div class="highl
         <span class="n">resampling_strategy_args</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
         <span class="n">dataset_name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
         <span class="n">dataset_compression</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">DatasetCompressionSpec</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+        <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
     <span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">TabularDataset</span><span class="p">,</span> <span class="n">TabularInputValidator</span><span class="p">]:</span>
         <span class="sd">&quot;&quot;&quot;</span>
 <span class="sd">        Returns an object of `TabularDataset` and an object of</span>
@@ -308,6 +309,9 @@ <h1>Source code for autoPyTorch.api.tabular_classification</h1><div class="highl
 <span class="sd">            dataset_compression (Optional[DatasetCompressionSpec]):</span>
 <span class="sd">                specifications for dataset compression. For more info check</span>
 <span class="sd">                documentation for `BaseTask.get_dataset`.</span>
+<span class="sd">            kwargs (Any):</span>
+<span class="sd">                Currently, for tabular tasks, expects `feat_types` (Optional[List[str]]), which</span>
+<span class="sd">                specifies whether a feature is &#39;numerical&#39; or &#39;categorical&#39;.</span>
 
 <span class="sd">        Returns:</span>
 <span class="sd">            TabularDataset:</span>
@@ -320,12 +324,14 @@ <h1>Source code for autoPyTorch.api.tabular_classification</h1><div class="highl
         <span class="n">resampling_strategy_args</span> <span class="o">=</span> <span class="n">resampling_strategy_args</span> <span class="k">if</span> <span class="n">resampling_strategy_args</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="k">else</span> \
             <span class="bp">self</span><span class="o">.</span><span class="n">resampling_strategy_args</span>
 
+        <span class="n">feat_types</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;feat_types&#39;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
         <span class="c1"># Create a validator object to make sure that the data provided by</span>
         <span class="c1"># the user matches the autopytorch requirements</span>
         <span class="n">input_validator</span> <span class="o">=</span> <span class="n">TabularInputValidator</span><span class="p">(</span>
             <span class="n">is_classification</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
             <span class="n">logger_port</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_logger_port</span><span class="p">,</span>
-            <span class="n">dataset_compression</span><span class="o">=</span><span class="n">dataset_compression</span>
+            <span class="n">dataset_compression</span><span class="o">=</span><span class="n">dataset_compression</span><span class="p">,</span>
+            <span class="n">feat_types</span><span class="o">=</span><span class="n">feat_types</span>
         <span class="p">)</span>
 
         <span class="c1"># Fit a input validator to check the provided data</span>
@@ -352,6 +358,7 @@ <h1>Source code for autoPyTorch.api.tabular_classification</h1><div class="highl
         <span class="n">X_test</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
         <span class="n">y_test</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
         <span class="n">dataset_name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
+        <span class="n">feat_types</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
         <span class="n">budget_type</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s1">&#39;epochs&#39;</span><span class="p">,</span>
         <span class="n">min_budget</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">,</span>
         <span class="n">max_budget</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">50</span><span class="p">,</span>
@@ -380,6 +387,10 @@ <h1>Source code for autoPyTorch.api.tabular_classification</h1><div class="highl
 <span class="sd">                A pair of features (X_train) and targets (y_train) used to fit a</span>
 <span class="sd">                pipeline. Additionally, a holdout of this pairs (X_test, y_test) can</span>
 <span class="sd">                be provided to track the generalization performance of each stage.</span>
+<span class="sd">            feat_types (Optional[List[str]]):</span>
+<span class="sd">                Description about the feature types of the columns.</span>
+<span class="sd">                Accepts `numerical` for integers, float data and `categorical`</span>
+<span class="sd">                for categories, strings and bool. Defaults to None.</span>
 <span class="sd">            optimize_metric (str):</span>
 <span class="sd">                name of the metric that is used to evaluate a pipeline.</span>
 <span class="sd">            budget_type (str):</span>
@@ -547,7 +558,8 @@ <h1>Source code for autoPyTorch.api.tabular_classification</h1><div class="highl
             <span class="n">resampling_strategy</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">resampling_strategy</span><span class="p">,</span>
             <span class="n">resampling_strategy_args</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">resampling_strategy_args</span><span class="p">,</span>
             <span class="n">dataset_name</span><span class="o">=</span><span class="n">dataset_name</span><span class="p">,</span>
-            <span class="n">dataset_compression</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_dataset_compression</span><span class="p">)</span>
+            <span class="n">dataset_compression</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_dataset_compression</span><span class="p">,</span>
+            <span class="n">feat_types</span><span class="o">=</span><span class="n">feat_types</span><span class="p">)</span>
 
         <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_search</span><span class="p">(</span>
             <span class="n">dataset</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">dataset</span><span class="p">,</span>