Skip to content

Commit

Permalink
Merge pull request #33 from iancovert/pre-commit-ci-update-config
Browse files Browse the repository at this point in the history
[pre-commit.ci] pre-commit autoupdate
  • Loading branch information
iancovert authored Dec 17, 2024
2 parents 4e37ae0 + 858caf2 commit d84ce84
Show file tree
Hide file tree
Showing 14 changed files with 460 additions and 309 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
rev: v5.0.0
hooks:
- id: check-added-large-files
- id: check-builtin-literals
Expand All @@ -12,7 +12,7 @@ repos:
- id: end-of-file-fixer
- id: mixed-line-ending
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.5.6
rev: v0.8.3
hooks:
- id: ruff
args:
Expand Down
105 changes: 62 additions & 43 deletions notebooks/airbnb.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@
"outputs": [],
"source": [
"import re\n",
"import sage\n",
"\n",
"import gender_guesser.detector as detector\n",
"import numpy as np\n",
"import pandas as pd\n",
"import gender_guesser.detector as detector\n",
"from sklearn.model_selection import train_test_split"
"from sklearn.model_selection import train_test_split\n",
"\n",
"import sage"
]
},
{
Expand Down Expand Up @@ -217,7 +219,7 @@
"outputs": [],
"source": [
"# Categorical features\n",
"categorical_columns = ['neighbourhood_group', 'neighbourhood', 'room_type']\n",
"categorical_columns = [\"neighbourhood_group\", \"neighbourhood\", \"room_type\"]\n",
"for column in categorical_columns:\n",
" df[column] = pd.Categorical(df[column]).codes"
]
Expand All @@ -229,7 +231,7 @@
"outputs": [],
"source": [
"# Exclude outliers (top 0.5%)\n",
"df = df[df['price'] < df['price'].quantile(0.995)]"
"df = df[df[\"price\"] < df[\"price\"].quantile(0.995)]"
]
},
{
Expand All @@ -239,9 +241,9 @@
"outputs": [],
"source": [
"# Features derived from name\n",
"df['name_length'] = df['name'].apply(lambda x: len(x))\n",
"df['name_isupper'] = df['name'].apply(lambda x: int(x.isupper()))\n",
"df['name_words'] = df['name'].apply(lambda x: len(re.findall(r'\\w+', x)))"
"df[\"name_length\"] = df[\"name\"].apply(lambda x: len(x))\n",
"df[\"name_isupper\"] = df[\"name\"].apply(lambda x: int(x.isupper()))\n",
"df[\"name_words\"] = df[\"name\"].apply(lambda x: len(re.findall(r\"\\w+\", x)))"
]
},
{
Expand All @@ -252,8 +254,8 @@
"source": [
"# Host gender guess\n",
"guesser = detector.Detector()\n",
"df['host_gender'] = df['host_name'].apply(lambda x: guesser.get_gender(x.split(' ')[0]))\n",
"df['host_gender'] = pd.Categorical(df['host_gender']).codes"
"df[\"host_gender\"] = df[\"host_name\"].apply(lambda x: guesser.get_gender(x.split(\" \")[0]))\n",
"df[\"host_gender\"] = pd.Categorical(df[\"host_gender\"]).codes"
]
},
{
Expand All @@ -263,10 +265,12 @@
"outputs": [],
"source": [
"# Number of days since last review\n",
"most_recent = df['last_review'].max()\n",
"df['last_review'] = (most_recent - df['last_review']).dt.days\n",
"df['last_review'] = (df['last_review'] - df['last_review'].mean()) / df['last_review'].std()\n",
"df['last_review'] = df['last_review'].fillna(-5)"
"most_recent = df[\"last_review\"].max()\n",
"df[\"last_review\"] = (most_recent - df[\"last_review\"]).dt.days\n",
"df[\"last_review\"] = (df[\"last_review\"] - df[\"last_review\"].mean()) / df[\n",
" \"last_review\"\n",
"].std()\n",
"df[\"last_review\"] = df[\"last_review\"].fillna(-5)"
]
},
{
Expand All @@ -276,7 +280,7 @@
"outputs": [],
"source": [
"# Missing values\n",
"df['reviews_per_month'] = df['reviews_per_month'].fillna(0)"
"df[\"reviews_per_month\"] = df[\"reviews_per_month\"].fillna(0)"
]
},
{
Expand All @@ -286,9 +290,15 @@
"outputs": [],
"source": [
"# Normalize other numerical features\n",
"df['number_of_reviews'] = (df['number_of_reviews'] - df['number_of_reviews'].mean()) / df['number_of_reviews'].std()\n",
"df['availability_365'] = (df['availability_365'] - df['availability_365'].mean()) / df['availability_365'].std()\n",
"df['name_length'] = (df['name_length'] - df['name_length'].mean()) / df['name_length'].std()"
"df[\"number_of_reviews\"] = (\n",
" df[\"number_of_reviews\"] - df[\"number_of_reviews\"].mean()\n",
") / df[\"number_of_reviews\"].std()\n",
"df[\"availability_365\"] = (df[\"availability_365\"] - df[\"availability_365\"].mean()) / df[\n",
" \"availability_365\"\n",
"].std()\n",
"df[\"name_length\"] = (df[\"name_length\"] - df[\"name_length\"].mean()) / df[\n",
" \"name_length\"\n",
"].std()"
]
},
{
Expand All @@ -298,8 +308,8 @@
"outputs": [],
"source": [
"# Normalize latitude and longitude\n",
"df['latitude'] = (df['latitude'] - df['latitude'].mean()) / df['latitude'].std()\n",
"df['longitude'] = (df['longitude'] - df['longitude'].mean()) / df['longitude'].std()"
"df[\"latitude\"] = (df[\"latitude\"] - df[\"latitude\"].mean()) / df[\"latitude\"].std()\n",
"df[\"longitude\"] = (df[\"longitude\"] - df[\"longitude\"].mean()) / df[\"longitude\"].std()"
]
},
{
Expand All @@ -309,7 +319,7 @@
"outputs": [],
"source": [
"# Drop columns\n",
"df = df.drop(['id', 'host_id', 'host_name', 'name'], axis=1)"
"df = df.drop([\"id\", \"host_id\", \"host_name\", \"name\"], axis=1)"
]
},
{
Expand Down Expand Up @@ -503,7 +513,7 @@
"outputs": [],
"source": [
"# Rearrange columns\n",
"target_col = 'price'\n",
"target_col = \"price\"\n",
"cols = df.columns.tolist()\n",
"del cols[cols.index(target_col)]\n",
"cols.append(target_col)\n",
Expand All @@ -512,9 +522,11 @@
"\n",
"# Split data\n",
"train, test = train_test_split(\n",
" df.values, test_size=int(0.1 * len(df.values)), random_state=0)\n",
" df.values, test_size=int(0.1 * len(df.values)), random_state=0\n",
")\n",
"train, val = train_test_split(\n",
" train, test_size=int(0.1 * len(df.values)), random_state=0)\n",
" train, test_size=int(0.1 * len(df.values)), random_state=0\n",
")\n",
"Y_train = train[:, -1:].copy()\n",
"Y_val = val[:, -1:].copy()\n",
"Y_test = test[:, -1:].copy()\n",
Expand All @@ -536,11 +548,12 @@
"metadata": {},
"outputs": [],
"source": [
"from copy import deepcopy\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from copy import deepcopy\n",
"from torch.utils.data import TensorDataset, DataLoader"
"from torch.utils.data import DataLoader, TensorDataset"
]
},
{
Expand All @@ -552,13 +565,14 @@
"outputs": [],
"source": [
"# Create model\n",
"device = torch.device('cuda')\n",
"device = torch.device(\"cuda\")\n",
"model = nn.Sequential(\n",
" nn.Linear(len(feature_names), 512),\n",
" nn.ELU(),\n",
" nn.Linear(512, 512),\n",
" nn.ELU(),\n",
" nn.Linear(512, 1)).to(device)\n",
" nn.Linear(512, 1),\n",
").to(device)\n",
"\n",
"# Training parameters\n",
"lr = 1e-3\n",
Expand All @@ -570,8 +584,8 @@
"\n",
"# Data loaders\n",
"train_set = TensorDataset(\n",
" torch.tensor(train, dtype=torch.float32),\n",
" torch.tensor(Y_train, dtype=torch.float32))\n",
" torch.tensor(train, dtype=torch.float32), torch.tensor(Y_train, dtype=torch.float32)\n",
")\n",
"train_loader = DataLoader(train_set, batch_size=mbsize, shuffle=True)\n",
"val_x = torch.tensor(val, dtype=torch.float32, device=device)\n",
"val_y = torch.tensor(Y_val, dtype=torch.float32, device=device)\n",
Expand Down Expand Up @@ -601,8 +615,8 @@
" # Calculate validation loss.\n",
" val_loss = loss_fn(model(val_x), val_y).item()\n",
" if verbose:\n",
" print('{}Epoch = {}{}'.format('-' * 10, epoch + 1, '-' * 10))\n",
" print('Val loss = {:.4f}'.format(val_loss))\n",
" print(\"{}Epoch = {}{}\".format(\"-\" * 10, epoch + 1, \"-\" * 10))\n",
" print(\"Val loss = {:.4f}\".format(val_loss))\n",
"\n",
" # Check convergence criterion.\n",
" if val_loss < min_criterion:\n",
Expand All @@ -611,7 +625,7 @@
" best_model = deepcopy(model)\n",
" elif (epoch - min_epoch) == lookback:\n",
" if verbose:\n",
" print('Stopping early')\n",
" print(\"Stopping early\")\n",
" break\n",
"\n",
"# Keep best model\n",
Expand All @@ -638,8 +652,8 @@
"base_mse = nn.MSELoss()(mean.repeat(len(test_y), 1), test_y)\n",
"mse = nn.MSELoss()(model(test_x), test_y)\n",
"\n",
"print('Base rate MSE = {:.2f}'.format(base_mse))\n",
"print('Model MSE = {:.2f}'.format(mse))"
"print(\"Base rate MSE = {:.2f}\".format(base_mse))\n",
"print(\"Model MSE = {:.2f}\".format(mse))"
]
},
{
Expand Down Expand Up @@ -679,7 +693,7 @@
"source": [
"# Setup and calculate\n",
"imputer = sage.MarginalImputer(model, test[:512])\n",
"estimator = sage.PermutationEstimator(imputer, 'mse')\n",
"estimator = sage.PermutationEstimator(imputer, \"mse\")\n",
"sage_values = estimator(test, Y_test)"
]
},
Expand Down Expand Up @@ -721,12 +735,17 @@
"source": [
"# Feature groups\n",
"feature_groups = group_names = {\n",
" 'location (grouped)': ['latitude', 'longitude', 'neighbourhood', 'neighbourhood_group'],\n",
" 'name (grouped)': ['name_words', 'name_length', 'name_isupper'],\n",
" 'reviews (grouped)': ['last_review', 'reviews_per_month', 'number_of_reviews'],\n",
" 'host (grouped)': ['host_gender', 'calculated_host_listings_count'],\n",
" 'availability': ['availability_365'],\n",
" 'room_type': ['room_type']\n",
" \"location (grouped)\": [\n",
" \"latitude\",\n",
" \"longitude\",\n",
" \"neighbourhood\",\n",
" \"neighbourhood_group\",\n",
" ],\n",
" \"name (grouped)\": [\"name_words\", \"name_length\", \"name_isupper\"],\n",
" \"reviews (grouped)\": [\"last_review\", \"reviews_per_month\", \"number_of_reviews\"],\n",
" \"host (grouped)\": [\"host_gender\", \"calculated_host_listings_count\"],\n",
" \"availability\": [\"availability_365\"],\n",
" \"room_type\": [\"room_type\"],\n",
"}\n",
"group_names = [group for group in feature_groups]\n",
"for col in feature_names:\n",
Expand Down Expand Up @@ -772,7 +791,7 @@
"source": [
"# Setup and calculate\n",
"imputer = sage.GroupedMarginalImputer(model, test[:512], groups)\n",
"estimator = sage.PermutationEstimator(imputer, 'mse')\n",
"estimator = sage.PermutationEstimator(imputer, \"mse\")\n",
"sage_values = estimator(test, Y_test)"
]
},
Expand Down
Loading

0 comments on commit d84ce84

Please sign in to comment.