Skip to content

Commit

Permalink
Split EDA and training notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
brendancsmith committed Sep 24, 2024
1 parent d1f08fa commit dc6011e
Show file tree
Hide file tree
Showing 2 changed files with 405 additions and 296 deletions.
310 changes: 14 additions & 296 deletions notebooks/eda.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,9 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"# Add the src directory to the Python path\n",
"sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))"
"%reload_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline"
]
},
{
Expand All @@ -26,9 +24,11 @@
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"import sys\n",
"\n",
"pd.set_option('display.max_columns', None)"
"# Add the src directory to the Python path\n",
"sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))"
]
},
{
Expand All @@ -37,6 +37,10 @@
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"pd.set_option('display.max_columns', None)\n",
"\n",
"# suppress debugging warnings\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
Expand Down Expand Up @@ -532,290 +536,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training\n",
"\n",
"### Train the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build the estimator with log-loss as its evaluation metric, then fit it on the training split.\n",
"# The fitted `model` is reused by the inference, evaluation and importance cells below.\n",
"model = create_model(eval_metric=\"logloss\")\n",
"model.fit(X_train, y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Inference\n",
"### Make predictions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hard class labels for the test split\n",
"y_pred = model.predict(X_test)\n",
"\n",
"# Second column of predict_proba (class index 1) -- presumably the positive class; confirm against the label encoding\n",
"y_proba = model.predict_proba(X_test)[:, 1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Accuracy"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score\n",
"\n",
"# Share of test rows whose predicted label equals the true label\n",
"accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)\n",
"print(f\"Accuracy: {accuracy:.4f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Classification Report"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import classification_report\n",
"\n",
"# Text summary of per-class metrics for the test-split predictions\n",
"print(\"Classification Report:\\n\", classification_report(y_true=y_test, y_pred=y_pred))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Confusion Matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# True vs. predicted test labels, drawn by the project's `visualize` helper\n",
"visualize.confusion_matrix(y_test, y_pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ROC Curve"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import roc_auc_score\n",
"\n",
"# Area under the ROC curve, computed from the class-1 probabilities rather than hard labels\n",
"roc_auc = roc_auc_score(y_true=y_test, y_score=y_proba)\n",
"print(f\"ROC-AUC Score: {roc_auc:.4f}\")\n",
"\n",
"# Draw the curve; the AUC value is handed to the plotting helper as its third argument\n",
"visualize.roc_curve(y_test, y_proba, roc_auc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Cross-Validation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import cross_val_score\n",
"\n",
"# 5-fold cross-validation on the training split, scored by ROC-AUC\n",
"cv_scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='roc_auc', cv=5)\n",
"\n",
"# Summarise the per-fold scores\n",
"print(\"Cross Validation ROC-AUC Scores:\")\n",
"print(f\"Mean: {cv_scores.mean():.4f}\")\n",
"print(f\"Standard deviation: {cv_scores.std():.2e}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature importance"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Importance reported by the fitted model for each training column, most important first\n",
"feature_importances = pd.DataFrame({\n",
"    'columns': X_train.columns,\n",
"    'importance': model.feature_importances_\n",
"}).sort_values(by='importance', ascending=False)\n",
"\n",
"# Lift the row limit for this display only, so every feature is listed\n",
"with pd.option_context('display.max_rows', None):\n",
"    display(feature_importances)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.inspection import permutation_importance\n",
"\n",
"# Calculate permutation importance\n",
"# NOTE(review): n_repeats reuses FOLDS, which is also passed as the search's `cv` value;\n",
"# the two are unrelated quantities -- consider a dedicated constant.\n",
"# NOTE(review): importances are measured on the training split; X_test/y_test would show\n",
"# which features matter for held-out data -- TODO confirm intent.\n",
"result = permutation_importance(\n",
"    model, X_train, y_train, n_repeats=FOLDS, random_state=42)\n",
"\n",
"# Get importance values (mean score drop across the repeats)\n",
"importances = result['importances_mean']\n",
"\n",
"# Create a DataFrame with feature names and importances.\n",
"# This rebinds `feature_importances` from the previous cell, with different column names.\n",
"feature_importances = pd.DataFrame({\n",
"    'Feature': X_train.columns,\n",
"    'Importance': importances\n",
"})\n",
"\n",
"# Sort the DataFrame by importance in descending order\n",
"feature_importances = feature_importances.sort_values('Importance', ascending=False)\n",
"\n",
"# Display the feature importances DataFrame\n",
"with pd.option_context('display.max_rows', None):\n",
"    display(feature_importances)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hyperparameter Tuning\n",
"\n",
"### HP search"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from scipy import stats\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
"\n",
"# Takes 14.5min at 30 iterations\n",
"param_dist = {\n",
"    # NOTE(review): randint's upper bound is exclusive, so this always samples 2.\n",
"    # Use stats.randint(2, 4) if depth 3 was meant to be searched as well.\n",
"    \"max_depth\": stats.randint(2, 3),\n",
"    \"learning_rate\": stats.uniform(loc=0.93, scale=0.07),  # uniform on [0.93, 1.00]\n",
"    \"n_estimators\": stats.randint(100, 1000),\n",
"    # Normal around 0.85 (sd 0.05) truncated at +/-3 sd, i.e. [0.70, 1.00].\n",
"    # An unbounded normal can draw values above 1, which is not a valid row fraction.\n",
"    \"subsample\": stats.truncnorm(-3, 3, loc=0.85, scale=0.05),\n",
"    \"colsample_bytree\": stats.uniform(loc=0.98, scale=0.02),  # uniform on [0.98, 1.00]\n",
"}\n",
"\n",
"# Setup the randomized search.\n",
"# `scoring` is set explicitly: without it the search ranks candidates by the estimator's\n",
"# default score, and best_score_ would not be the ROC-AUC that is printed below.\n",
"search = RandomizedSearchCV(\n",
"    estimator=create_model(eval_metric='logloss'),\n",
"    param_distributions=param_dist,\n",
"    n_iter=30,  # ~28sec/iter\n",
"    scoring='roc_auc',\n",
"    cv=FOLDS,\n",
"    verbose=0,\n",
"    random_state=42,\n",
"    n_jobs=-1\n",
")\n",
"\n",
"# Fit the model\n",
"search.fit(X_train, y_train)\n",
"\n",
"# Best Model\n",
"print(\"Best Parameters:\", search.best_params_)\n",
"print(f\"Best ROC-AUC Score: {search.best_score_:.8f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Retrain with these hyperparameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rebuild the model with the winning hyperparameters and refit on the full training split.\n",
"# eval_metric is passed explicitly: the searched estimator was created with it, and\n",
"# best_params_ only contains the keys of param_dist, so it would otherwise be dropped.\n",
"best_params = search.best_params_\n",
"model_best = create_model(eval_metric='logloss', **best_params)\n",
"model_best.fit(X_train, y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Evaluate the optimized model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Predict on test data\n",
"y_pred_best = model_best.predict(X_test)\n",
"y_proba_best = model_best.predict_proba(X_test)[:,1]\n",
"\n",
"# Accuracy\n",
"accuracy_best = accuracy_score(y_test, y_pred_best)\n",
"print(f\"Optimized Accuracy: {accuracy_best:.4f}\")\n",
"\n",
"print(\"Optimized Classification Report:\\n\", classification_report(y_test, y_pred_best))"
"## Write the preprocessed data"
]
},
{
Expand All @@ -824,12 +545,9 @@
"metadata": {},
"outputs": [],
"source": [
"# ROC-AUC score\n",
"roc_auc = roc_auc_score(y_test, y_proba_best)\n",
"print(f\"Optimized ROC-AUC Score: {roc_auc:.4f}\")\n",
"import joblib\n",
"\n",
"# Plot ROC curve\n",
"visualize.roc_curve(y_test, y_proba_best, roc_auc)"
"joblib.dump((X_train, X_test, y_train, y_test), '../data/processed/accepted_2007_to_2018Q4.pkl')"
]
}
],
Expand Down
Loading

0 comments on commit dc6011e

Please sign in to comment.