Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
250 changes: 250 additions & 0 deletions Decision Trees.ipynb

Large diffs are not rendered by default.

306 changes: 306 additions & 0 deletions Random Forests.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,306 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# <font color='#31394d'> Random Forests Practice Exercise </font>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this exercise we're going to use the famous <a href=\"https://archive.ics.uci.edu/ml/datasets/iris\" target=\"_blank\">Iris dataset</a> to determine the species of iris using a random forest classifier. Begin by importing the necessary libraries and loading the Iris dataset from sklearn."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"sns.set(rc={'figure.figsize':(6,6)}) \n",
"import warnings\n",
"warnings.simplefilter(\"ignore\")\n",
"\n",
"%matplotlib inline\n",
"\n",
"from sklearn import datasets\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.model_selection import cross_validate\n",
"from sklearn.metrics import SCORERS"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sepal length (cm)</th>\n",
" <th>sepal width (cm)</th>\n",
" <th>petal length (cm)</th>\n",
" <th>petal width (cm)</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5.1</td>\n",
" <td>3.5</td>\n",
" <td>1.4</td>\n",
" <td>0.2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4.9</td>\n",
" <td>3.0</td>\n",
" <td>1.4</td>\n",
" <td>0.2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4.7</td>\n",
" <td>3.2</td>\n",
" <td>1.3</td>\n",
" <td>0.2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.6</td>\n",
" <td>3.1</td>\n",
" <td>1.5</td>\n",
" <td>0.2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5.0</td>\n",
" <td>3.6</td>\n",
" <td>1.4</td>\n",
" <td>0.2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n",
"0 5.1 3.5 1.4 0.2 \n",
"1 4.9 3.0 1.4 0.2 \n",
"2 4.7 3.2 1.3 0.2 \n",
"3 4.6 3.1 1.5 0.2 \n",
"4 5.0 3.6 1.4 0.2 \n",
"\n",
" target \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = datasets.load_iris()\n",
"\n",
"# for display purposes\n",
"iris = pd.DataFrame(data.data, columns=data.feature_names)\n",
"iris[\"target\"] = data.target\n",
"iris.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"🚀 <font color='#D9C4B1'>Exercise: </font> Build a random forest classifier, train and evaluate it using cross-validation. You can use the functions below."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>fit_time</th>\n",
" <td>0.504849</td>\n",
" </tr>\n",
" <tr>\n",
" <th>score_time</th>\n",
" <td>0.031602</td>\n",
" </tr>\n",
" <tr>\n",
" <th>test_score</th>\n",
" <td>0.960000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>train_score</th>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0\n",
"fit_time 0.504849\n",
"score_time 0.031602\n",
"test_score 0.960000\n",
"train_score 1.000000"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X = data.data\n",
"y = data.target\n",
"\n",
"# Instantiate model\n",
"rfc = RandomForestClassifier(n_estimators=50, random_state=42)\n",
"\n",
"def evaluate_model(estimator):\n",
" cv_results = cross_validate(estimator, X, y, scoring='accuracy', n_jobs=-1, cv=10, return_train_score=True)\n",
" return pd.DataFrame(cv_results).abs().mean().to_dict()\n",
"\n",
"# Evaluate\n",
"results = evaluate_model(rfc)\n",
"\n",
"def display_results(results):\n",
" results_df = pd.DataFrame(results, index=[0]).T\n",
" results_cols = results_df.columns\n",
" for col in results_df:\n",
" results_df[col] = results_df[col].apply(np.mean)\n",
" return results_df\n",
"\n",
"# Display results\n",
"display_results(results)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"🚀 <font color='#D9C4B1'>Exercise: </font> Adjust the hyperparameters (e.g. number of trees). Does model performance decrease or increase? "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best hyperparameters: {'n_estimators': 200}\n",
"Best accuracy score: 0.9666666666666666\n"
]
}
],
"source": [
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"# Define the parameter grid\n",
"param_grid = {'n_estimators': [50, 100, 200, 300, 400, 500]}\n",
"\n",
"rfc = RandomForestClassifier(random_state=42)\n",
"\n",
"grid_search = GridSearchCV(rfc, param_grid=param_grid, scoring='accuracy', n_jobs=-1, cv=10)\n",
"\n",
"# Fit the grid search object to the data\n",
"grid_search.fit(X, y)\n",
"\n",
"# Display the best hyperparameters\n",
"print(\"Best hyperparameters: \", grid_search.best_params_)\n",
"\n",
"# Display the average accuracy score of the best model\n",
"print(\"Best accuracy score: \", grid_search.best_score_)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading