Skip to content

Commit

Permalink
Ravin Kohli: Final changes for v0.1.0 (#341)
Browse files Browse the repository at this point in the history
  • Loading branch information
Github Actions committed Nov 23, 2021
1 parent bec0fc3 commit 883d627
Show file tree
Hide file tree
Showing 57 changed files with 1,823 additions and 2,551 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,13 @@
from autoPyTorch.api.tabular_classification import TabularClassificationTask
from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes

############################################################################
# Default Resampling Strategy
# ============================

############################################################################
# Data Loading
# ============
# ------------
X, y = sklearn.datasets.fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
X,
Expand All @@ -39,7 +42,7 @@

############################################################################
# Build and fit a classifier with default resampling strategy
# ===========================================================
# -----------------------------------------------------------
api = TabularClassificationTask(
# 'HoldoutValTypes.holdout_validation' with 'val_share': 0.33
# is the default argument setting for TabularClassificationTask.
Expand All @@ -51,7 +54,7 @@

############################################################################
# Search for an ensemble of machine learning algorithms
# =====================================================
# -----------------------------------------------------
api.search(
X_train=X_train,
y_train=y_train,
Expand All @@ -64,27 +67,34 @@

############################################################################
# Print the final ensemble performance
# ====================================
print(api.run_history, api.trajectory)
# ------------------------------------
y_pred = api.predict(X_test)
score = api.score(y_pred, y_test)
print(score)
# Print the final ensemble built by AutoPyTorch
print(api.show_models())

# Print statistics from search
print(api.sprint_statistics())

############################################################################

############################################################################
# Cross validation Resampling Strategy
# =====================================

############################################################################
# Build and fit a classifier with Cross validation resampling strategy
# ====================================================================
# --------------------------------------------------------------------
api = TabularClassificationTask(
resampling_strategy=CrossValTypes.k_fold_cross_validation,
resampling_strategy_args={'num_splits': 3}
)

############################################################################
# Search for an ensemble of machine learning algorithms
# =====================================================
# -----------------------------------------------------

api.search(
X_train=X_train,
y_train=y_train,
Expand All @@ -97,19 +107,25 @@

############################################################################
# Print the final ensemble performance
# ====================================
print(api.run_history, api.trajectory)
# ------------------------------------
y_pred = api.predict(X_test)
score = api.score(y_pred, y_test)
print(score)
# Print the final ensemble built by AutoPyTorch
print(api.show_models())

# Print statistics from search
print(api.sprint_statistics())

############################################################################

############################################################################
# Stratified Resampling Strategy
# ===============================

############################################################################
# Build and fit a classifier with Stratified resampling strategy
# ==============================================================
# --------------------------------------------------------------
api = TabularClassificationTask(
# For demonstration purposes, we use
# Stratified hold out validation. However,
Expand All @@ -120,7 +136,7 @@

############################################################################
# Search for an ensemble of machine learning algorithms
# =====================================================
# -----------------------------------------------------
api.search(
X_train=X_train,
y_train=y_train,
Expand All @@ -134,9 +150,11 @@
############################################################################
# Print the final ensemble performance
# ------------------------------------
print(api.run_history, api.trajectory)
y_pred = api.predict(X_test)
score = api.score(y_pred, y_test)
print(score)
# Print the final ensemble built by AutoPyTorch
print(api.show_models())

# Print statistics from search
print(api.sprint_statistics())
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@
},
"outputs": [],
"source": [
"print(api.run_history, api.trajectory)\ny_pred = api.predict(X_test)\n\n# Rescale the Neural Network predictions into the original target range\nscore = api.score(y_pred, y_test)\n\nprint(score)\n# Print the final ensemble built by AutoPyTorch\nprint(api.show_models())"
"y_pred = api.predict(X_test)\n\n# Rescale the Neural Network predictions into the original target range\nscore = api.score(y_pred, y_test)\n\nprint(score)\n# Print the final ensemble built by AutoPyTorch\nprint(api.show_models())\n\n# Print statistics from search\nprint(api.sprint_statistics())"
]
}
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@
},
"outputs": [],
"source": [
"# We will plot the search incumbent through time.\n\n# Collect the performance of individual machine learning algorithms\n# found by SMAC\nindividual_performances = []\nfor run_key, run_value in estimator.run_history.data.items():\n if run_value.status != StatusType.SUCCESS:\n # Ignore crashed runs\n continue\n individual_performances.append({\n 'Timestamp': pd.Timestamp(\n time.strftime(\n '%Y-%m-%d %H:%M:%S',\n time.localtime(run_value.endtime)\n )\n ),\n 'single_best_optimization_accuracy': accuracy._optimum - run_value.cost,\n 'single_best_test_accuracy': np.nan if run_value.additional_info is None else\n accuracy._optimum - run_value.additional_info['test_loss'],\n })\nindividual_performance_frame = pd.DataFrame(individual_performances)\n\n# Collect the performance of the ensemble through time\n# This ensemble is built from the machine learning algorithms\n# found by SMAC\nensemble_performance_frame = pd.DataFrame(estimator.ensemble_performance_history)\n\n# As we are tracking the incumbent, we are interested in the cummax() performance\nensemble_performance_frame['ensemble_optimization_accuracy'] = ensemble_performance_frame[\n 'train_accuracy'\n].cummax()\nensemble_performance_frame['ensemble_test_accuracy'] = ensemble_performance_frame[\n 'test_accuracy'\n].cummax()\nensemble_performance_frame.drop(columns=['test_accuracy', 'train_accuracy'], inplace=True)\nindividual_performance_frame['single_best_optimization_accuracy'] = individual_performance_frame[\n 'single_best_optimization_accuracy'\n].cummax()\nindividual_performance_frame['single_best_test_accuracy'] = individual_performance_frame[\n 'single_best_test_accuracy'\n].cummax()\n\npd.merge(\n ensemble_performance_frame,\n individual_performance_frame,\n on=\"Timestamp\", how='outer'\n).sort_values('Timestamp').fillna(method='ffill').plot(\n x='Timestamp',\n kind='line',\n legend=True,\n title='Auto-PyTorch accuracy over time',\n grid=True,\n)\nplt.show()\n\n# We then can understand the importance of 
each input feature using\n# a permutation importance analysis. This is done as a proof of concept, to\n# showcase that we can leverage the scikit-learn API.\nresult = permutation_importance(estimator, X_train, y_train, n_repeats=5,\n scoring='accuracy',\n random_state=seed)\nsorted_idx = result.importances_mean.argsort()\n\nfig, ax = plt.subplots()\nax.boxplot(result.importances[sorted_idx].T,\n vert=False, labels=X_test.columns[sorted_idx])\nax.set_title(\"Permutation Importances (Train set)\")\nfig.tight_layout()\nplt.show()"
"# We will plot the search incumbent through time.\n\n# Collect the performance of individual machine learning algorithms\n# found by SMAC\nindividual_performances = []\nfor run_key, run_value in estimator.run_history.data.items():\n if run_value.status != StatusType.SUCCESS:\n # Ignore crashed runs\n continue\n individual_performances.append({\n 'Timestamp': pd.Timestamp(\n time.strftime(\n '%Y-%m-%d %H:%M:%S',\n time.localtime(run_value.endtime)\n )\n ),\n 'single_best_optimization_accuracy': accuracy._optimum - run_value.cost,\n 'single_best_test_accuracy': np.nan if run_value.additional_info is None else\n accuracy._optimum - run_value.additional_info['test_loss']['accuracy'],\n })\nindividual_performance_frame = pd.DataFrame(individual_performances)\n\n# Collect the performance of the ensemble through time\n# This ensemble is built from the machine learning algorithms\n# found by SMAC\nensemble_performance_frame = pd.DataFrame(estimator.ensemble_performance_history)\n\n# As we are tracking the incumbent, we are interested in the cummax() performance\nensemble_performance_frame['ensemble_optimization_accuracy'] = ensemble_performance_frame[\n 'train_accuracy'\n].cummax()\nensemble_performance_frame['ensemble_test_accuracy'] = ensemble_performance_frame[\n 'test_accuracy'\n].cummax()\nensemble_performance_frame.drop(columns=['test_accuracy', 'train_accuracy'], inplace=True)\nindividual_performance_frame['single_best_optimization_accuracy'] = individual_performance_frame[\n 'single_best_optimization_accuracy'\n].cummax()\nindividual_performance_frame['single_best_test_accuracy'] = individual_performance_frame[\n 'single_best_test_accuracy'\n].cummax()\n\npd.merge(\n ensemble_performance_frame,\n individual_performance_frame,\n on=\"Timestamp\", how='outer'\n).sort_values('Timestamp').fillna(method='ffill').plot(\n x='Timestamp',\n kind='line',\n legend=True,\n title='Auto-PyTorch accuracy over time',\n grid=True,\n)\nplt.show()\n\n# We then can understand the 
importance of each input feature using\n# a permutation importance analysis. This is done as a proof of concept, to\n# showcase that we can leverage the scikit-learn API.\nresult = permutation_importance(estimator, X_train, y_train, n_repeats=5,\n scoring='accuracy',\n random_state=seed)\nsorted_idx = result.importances_mean.argsort()\n\nfig, ax = plt.subplots()\nax.boxplot(result.importances[sorted_idx].T,\n vert=False, labels=X_test.columns[sorted_idx])\nax.set_title(\"Permutation Importances (Train set)\")\nfig.tight_layout()\nplt.show()"
]
}
],
Expand Down
Loading

0 comments on commit 883d627

Please sign in to comment.