Split EDA and training notebooks

brendancsmith · Sep 24, 2024 · dc6011e
1 parent d1f08fa
commit dc6011e
Show file tree

Hide file tree

Showing 2 changed files with 405 additions and 296 deletions.
diff --git a/notebooks/eda.ipynb b/notebooks/eda.ipynb
@@ -13,11 +13,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
-    "import sys\n",
-    "\n",
-    "# Add the src directory to the Python path\n",
-    "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))"
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "%matplotlib inline"
    ]
   },
   {
@@ -26,9 +24,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
+    "import os\n",
+    "import sys\n",
     "\n",
-    "pd.set_option('display.max_columns', None)"
+    "# Add the parent of the current working directory to the Python path\n",
+    "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))"
    ]
   },
   {
@@ -37,6 +37,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import pandas as pd\n",
+    "\n",
+    "pd.set_option('display.max_columns', None)\n",
+    "\n",
     "# suppress debugging warnings\n",
     "import warnings\n",
     "warnings.filterwarnings('ignore')"
@@ -532,290 +536,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Training\n",
-    "\n",
-    "### Train the model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model = create_model(eval_metric='logloss')\n",
-    "\n",
-    "model.fit(X_train, y_train)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Inference\n",
-    "### Make predictions"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "y_pred = model.predict(X_test)\n",
-    "y_proba = model.predict_proba(X_test)[:,1]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Evaluation"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Accuracy"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Accuracy\n",
-    "from sklearn.metrics import accuracy_score\n",
-    "\n",
-    "accuracy = accuracy_score(y_test, y_pred)\n",
-    "print(f\"Accuracy: {accuracy:.4f}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Classification Report"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Classification report\n",
-    "from sklearn.metrics import classification_report\n",
-    "\n",
-    "print(\"Classification Report:\\n\", classification_report(y_test, y_pred))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Confusion Matrix"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Confusion matrix\n",
-    "visualize.confusion_matrix(y_test, y_pred)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### ROC Curve"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.metrics import roc_auc_score\n",
-    "\n",
-    "# ROC-AUC score\n",
-    "roc_auc = roc_auc_score(y_test, y_proba)\n",
-    "print(f\"ROC-AUC Score: {roc_auc:.4f}\")\n",
-    "\n",
-    "# Plot ROC curve\n",
-    "visualize.roc_curve(y_test, y_proba, roc_auc)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Cross-Validation"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.model_selection import cross_val_score\n",
-    "\n",
-    "# Perform cross-validation\n",
-    "cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')\n",
-    "\n",
-    "print(\"Cross Validation ROC-AUC Scores:\")\n",
-    "print(f\"Mean: {cv_scores.mean():.4f}\")\n",
-    "print(f\"Standard deviation: {cv_scores.std():.2e}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Feature importance"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "feature_importances = pd.DataFrame({\n",
-    "    'columns': X_train.columns,\n",
-    "    'importance': model.feature_importances_\n",
-    "}).sort_values(by='importance', ascending=False)\n",
-    "\n",
-    "with pd.option_context('display.max_rows', None):\n",
-    "    display(feature_importances)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.ensemble import RandomForestRegressor\n",
-    "from sklearn.datasets import make_regression\n",
-    "from sklearn.inspection import permutation_importance\n",
-    "\n",
-    "# Calculate permutation importance\n",
-    "result = permutation_importance(\n",
-    "    model, X_train, y_train, n_repeats=FOLDS, random_state=42)\n",
-    "\n",
-    "# Get importance values\n",
-    "importances = result['importances_mean']\n",
-    "\n",
-    "# Create a DataFrame with feature names and importances\n",
-    "feature_importances = pd.DataFrame({\n",
-    "    'Feature': X_train.columns,\n",
-    "    'Importance': importances\n",
-    "})\n",
-    "\n",
-    "# Sort the DataFrame by importance in descending order\n",
-    "feature_importances = feature_importances.sort_values('Importance', ascending=False)\n",
-    "\n",
-    "# Display the feature importances DataFrame\n",
-    "with pd.option_context('display.max_rows', None):\n",
-    "    display(feature_importances)\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Hyperparameter Tuning\n",
-    "\n",
-    "### HP search"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import random\n",
-    "\n",
-    "from sklearn.model_selection import RandomizedSearchCV\n",
-    "from scipy import stats\n",
-    "\n",
-    "# Takes 14.5min at 30 iterations\n",
-    "param_dist = {\n",
-    "    \"max_depth\": stats.randint(2, 3),\n",
-    "    \"learning_rate\": stats.uniform(loc=0.93, scale=0.07),\n",
-    "    \"n_estimators\": stats.randint(100, 1000),\n",
-    "    \"subsample\": stats.norm(0.85, scale=0.05),\n",
-    "    \"colsample_bytree\": stats.uniform(loc=0.98, scale=0.02),\n",
-    "}\n",
-    "\n",
-    "# Setup the randomized search\n",
-    "search = RandomizedSearchCV(\n",
-    "    estimator=create_model(eval_metric='logloss'),\n",
-    "    param_distributions=param_dist,\n",
-    "    n_iter=30, # ~28sec/iter\n",
-    "    cv=FOLDS,\n",
-    "    verbose=0,\n",
-    "    random_state=42,\n",
-    "    n_jobs=(-1)\n",
-    ")\n",
-    "\n",
-    "# Fit the model\n",
-    "search.fit(X_train, y_train)\n",
-    "\n",
-    "# Best Model\n",
-    "print(\"Best Parameters:\", search.best_params_)\n",
-    "print(f\"Best ROC-AUC Score: {search.best_score_:.8f}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Retrain with these hyperparameters"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "best_params = search.best_params_\n",
-    "model_best = create_model(**best_params)\n",
-    "model_best.fit(X_train, y_train)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Evaluate the optimized model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Predict on test data\n",
-    "y_pred_best = model_best.predict(X_test)\n",
-    "y_proba_best = model_best.predict_proba(X_test)[:,1]\n",
-    "\n",
-    "# Accuracy\n",
-    "accuracy_best = accuracy_score(y_test, y_pred_best)\n",
-    "print(f\"Optimized Accuracy: {accuracy_best:.4f}\")\n",
-    "\n",
-    "print(\"Optimized Classification Report:\\n\", classification_report(y_test, y_pred_best))"
+    "## Write the preprocessed data"
    ]
   },
   {
@@ -824,12 +545,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ROC-AUC score\n",
-    "roc_auc = roc_auc_score(y_test, y_proba_best)\n",
-    "print(f\"Optimized ROC-AUC Score: {roc_auc:.4f}\")\n",
+    "import joblib\n",
     "\n",
-    "# Plot ROC curve\n",
-    "visualize.roc_curve(y_test, y_proba_best, roc_auc)"
+    "joblib.dump((X_train, X_test, y_train, y_test), '../data/processed/accepted_2007_to_2018Q4.pkl')"
    ]
   }
  ],