
Commit 8c2e586

added early stopping, validation and testing
1 parent 0b10ff3 commit 8c2e586

8 files changed: +302 -206 lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -5,3 +5,7 @@
 /mlruns
 /default
 /1
+/multi_input
+/my_model
+/image_only
+/temp_model.ckpt

boxplot.png (7.58 KB)

lr_i.png (31.5 KB)

lr_mi.png (38 KB)

preprocessing.py

Lines changed: 16 additions & 8 deletions
@@ -43,32 +43,40 @@ def remove_outliers(df: pd.DataFrame, col: str):
 
 images = os.listdir(f"{data_path}images")
 df = pd.read_pickle(f"{data_path}ny_dataframe.pkl")
+# print(images)
 
 
 df = select_rows_with_images(images, df)
-df = df.iloc[0:800]
+# df = df.iloc[0:800]
 
 df["unformattedPrice"] = df["unformattedPrice"].astype(float)
 df["latLong_latitude"] = df["latLong_latitude"].astype(float)
 df["latLong_longitude"] = df["latLong_longitude"].astype(float)
 df["beds"] = df["beds"].astype(float)
 df["baths"] = df["baths"].astype(float)
 df["area"] = df["area"].astype(float)
+
+# df["unformattedPrice"] = df["unformattedPrice"]/df["area"]
+df.columns = ["zpid", "price", "latitude", "longitude", "beds", "baths", "area"]
 print(df.describe())
-ax = sns.boxplot(x=df["unformattedPrice"])
-plt.show()
+ax = sns.boxplot(x=df["price"])
+# plt.show()
 print(df.dtypes)
 
 # for col in df.columns[1:]:
 #     df = remove_outliers(df, col)
 
-df = remove_outliers(df, "unformattedPrice")
+df = remove_outliers(df, "price")
 df = remove_outliers(df, "beds")
 df = remove_outliers(df, "baths")
 df = remove_outliers(df, "area")
+# dataset has to be divisible by 0.8!
+
+df = df.iloc[3:]
 print(df.describe())
-ax = sns.boxplot(x=df["unformattedPrice"])
-plt.show()
+ax = sns.boxplot(x=df["price"])
+# plt.show()
+print(df)
 df.to_pickle(f"{data_path}df.pkl")
-
-# resize_images(images)
+# print(list(df["zpid"] + ".png"))
+# resize_images(list(df["zpid"] + ".png"))
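Note on the "dataset has to be divisible by 0.8" comment and the df = df.iloc[3:] trim: the 80/10/10 split in pytorch_lightning_regression.py computes integer sizes with int(), and torch.utils.data.random_split only accepts sizes that sum exactly to the dataset length, so a few leading rows are dropped here until the three parts add up. A minimal sketch of that constraint, assuming N is the number of rows left after outlier removal:

    N = len(df)
    train_size = int(0.80 * N)
    val_size = int((N - train_size) / 2)
    test_size = int((N - train_size) / 2)
    # random_split raises an error unless the parts add up to N exactly
    assert train_size + val_size + test_size == N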

pytorch_lightning_regression.py

Lines changed: 108 additions & 53 deletions
@@ -1,18 +1,16 @@
+import pandas as pd
+import numpy as np
+from PIL import Image
+
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from torch.utils.data import Dataset, DataLoader, random_split
 from torchvision import transforms
-from PIL import Image
-import os
-import pandas as pd
-import numpy as np
-from tqdm import tqdm
-from datetime import datetime
 
 import pytorch_lightning as pl
-from pytorch_lightning import loggers as pl_loggers
-import matplotlib.pyplot as plt
+from pytorch_lightning.logging import TensorBoardLogger
+from pytorch_lightning.callbacks.early_stopping import EarlyStopping
+
 
 data_path = "./data/"
 
@@ -23,11 +21,8 @@ class ImageDataset(Dataset):
     def __init__(self, pickle_file, image_dir):
         self.image_dir = image_dir
         self.pickle_file = pickle_file
-
         self.tabular = pd.read_pickle(pickle_file)
 
-        print(self.tabular)
-
     def __len__(self):
         return len(self.tabular)
 
@@ -37,19 +32,19 @@ def __getitem__(self, idx):
 
         tabular = self.tabular.iloc[idx, 0:]
 
-        y = tabular["unformattedPrice"]
+        y = tabular["price"]
 
         image = Image.open(f"{self.image_dir}/{tabular['zpid']}.png")
         image = np.array(image)
         image = image[..., :3]
 
         image = transforms.functional.to_tensor(image)
 
-        tabular = tabular[["latLong_latitude", "latLong_longitude", "beds", "baths", "area"]]
+        tabular = tabular[["latitude", "longitude", "beds", "baths", "area"]]
         tabular = tabular.tolist()
         tabular = torch.FloatTensor(tabular)
 
-        return image, y
+        return image, tabular, y
 
 
 def conv_block(input_size, output_size):
@@ -61,73 +56,133 @@ def conv_block(input_size, output_size):
 
 
 class LitClassifier(pl.LightningModule):
-    def __init__(self, lr=1e-3):
+    def __init__(
+        self, lr: float = 1e-3, num_workers: int = 4, batch_size: int = 32,
+    ):
         super().__init__()
         self.lr = lr
+        self.num_workers = num_workers
+        self.batch_size = batch_size
+
         self.conv1 = conv_block(3, 16)
         self.conv2 = conv_block(16, 32)
         self.conv3 = conv_block(32, 64)
-        # conv2d -> -2 pixels
-        # max pool -> pixels/2
-        # remainder will be dropped
+
         self.ln1 = nn.Linear(64 * 26 * 26, 16)
         self.relu = nn.ReLU()
         self.batchnorm = nn.BatchNorm1d(16)
         self.dropout = nn.Dropout2d(0.5)
-        self.ln2 = nn.Linear(16, 4)
-        self.ln3 = nn.Linear(4, 1)
-
-    def forward(self, x):
-        x = self.conv1(x)
-        x = self.conv2(x)
-        x = self.conv3(x)
-        x = x.reshape(x.shape[0], -1)
-        x = self.ln1(x)
-        x = self.relu(x)
-        x = self.batchnorm(x)
-        x = self.dropout(x)
-        x = self.ln2(x)
+        self.ln2 = nn.Linear(16, 5)
+
+        self.ln4 = nn.Linear(5, 10)
+        self.ln5 = nn.Linear(10, 10)
+        self.ln6 = nn.Linear(10, 5)
+        self.ln7 = nn.Linear(10, 1)
+
+    def forward(self, img, tab):
+        img = self.conv1(img)
+
+        img = self.conv2(img)
+        img = self.conv3(img)
+        img = img.reshape(img.shape[0], -1)
+        img = self.ln1(img)
+        img = self.relu(img)
+        img = self.batchnorm(img)
+        img = self.dropout(img)
+        img = self.ln2(img)
+        img = self.relu(img)
+
+        tab = self.ln4(tab)
+        tab = self.relu(tab)
+        tab = self.ln5(tab)
+        tab = self.relu(tab)
+        tab = self.ln6(tab)
+        tab = self.relu(tab)
+
+        x = torch.cat((img, tab), dim=1)
         x = self.relu(x)
-        # x = self.ln3(x)
-        # print(x)
-        return self.ln3(x)
 
-    def train_dataloader(self):
-        return DataLoader(image_data, batch_size=32)
-
-    def training_step(self, batch, batch_nb):
-        x, y = batch
-        # print(x)
-        # print(y)
-        # print(self(x))
-        # print(y)
-        # print(torch.flatten(self(x)))
+        return self.ln7(x)
+
+    def training_step(self, batch, batch_idx):
+        image, tabular, y = batch
+
         criterion = torch.nn.L1Loss()
-        y_pred = torch.flatten(self(x))
+        y_pred = torch.flatten(self(image, tabular))
        y_pred = y_pred.double()
-        # loss = torch.sqrt(criterion(y_pred, y))
+
         loss = criterion(y_pred, y)
 
         tensorboard_logs = {"train_loss": loss}
         return {"loss": loss, "log": tensorboard_logs}
 
+    def validation_step(self, batch, batch_idx):
+        image, tabular, y = batch
+
+        criterion = torch.nn.L1Loss()
+        y_pred = torch.flatten(self(image, tabular))
+        y_pred = y_pred.double()
+
+        val_loss = criterion(y_pred, y)
+
+        return {"val_loss": val_loss}
+
+    def validation_epoch_end(self, outputs):
+        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
+        tensorboard_logs = {"val_loss": avg_loss}
+        return {"val_loss": avg_loss, "log": tensorboard_logs}
+
+    def test_step(self, batch, batch_idx):
+        image, tabular, y = batch
+
+        criterion = torch.nn.L1Loss()
+        y_pred = torch.flatten(self(image, tabular))
+        y_pred = y_pred.double()
+
+        test_loss = criterion(y_pred, y)
+
+        return {"test_loss": test_loss}
+
+    def test_epoch_end(self, outputs):
+        avg_loss = torch.stack([x["test_loss"] for x in outputs]).mean()
+        logs = {"test_loss": avg_loss}
+        return {"test_loss": avg_loss, "log": logs, "progress_bar": logs}
+
+    def setup(self, stage):
+
+        image_data = ImageDataset(pickle_file=f"{data_path}df.pkl", image_dir=f"{data_path}processed_images/")
+
+        train_size = int(0.80 * len(image_data))
+        val_size = int((len(image_data) - train_size) / 2)
+        test_size = int((len(image_data) - train_size) / 2)
+
+        self.train_set, self.val_set, self.test_set = random_split(image_data, (train_size, val_size, test_size))
+
     def configure_optimizers(self):
         return torch.optim.Adam(self.parameters(), lr=(self.lr))
 
+    def train_dataloader(self):
+        return DataLoader(self.train_set, batch_size=self.batch_size)
+
+    def val_dataloader(self):
+        return DataLoader(self.val_set, batch_size=self.batch_size)
+
+    def test_dataloader(self):
+        return DataLoader(self.test_set, batch_size=self.batch_size)
+
 
 if __name__ == "__main__":
-    image_data = ImageDataset(pickle_file=f"{data_path}df.pkl", image_dir=f"{data_path}processed_images/")
+    logger = TensorBoardLogger("lightning_logs", name="multi_input")
+    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=5000, patience=7, verbose=False, mode="min")
 
     model = LitClassifier()
-    # mlflow_logger = pl_loggers.MLFlowLogger("logs/")
-    trainer = pl.Trainer(gpus=1)
+    trainer = pl.Trainer(gpus=1, logger=logger, early_stop_callback=early_stop_callback)
 
     lr_finder = trainer.lr_find(model)
-    print(lr_finder.results)
     fig = lr_finder.plot(suggest=True, show=True)
-
     new_lr = lr_finder.suggestion()
     print(new_lr)
-    model.hparams.lr = new_lr # 1e-2
+    model.hparams.lr = new_lr
 
     trainer.fit(model)
+    trainer.test(model)
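A note on the early-stopping wiring added above: monitor="val_loss" works because validation_step/validation_epoch_end now return a val_loss key, and the unusually large min_delta=5000 presumably reflects that the L1 loss is measured in dollars of listing price, so improvements smaller than roughly $5,000 MAE do not count toward resetting the patience counter. The import path pytorch_lightning.logging and the Trainer(early_stop_callback=...) argument belong to the older 0.x Lightning API; on Lightning 1.x the same wiring would look roughly like the sketch below (version-dependent, not part of this commit):

    from pytorch_lightning import Trainer
    from pytorch_lightning.loggers import TensorBoardLogger
    from pytorch_lightning.callbacks import EarlyStopping

    logger = TensorBoardLogger("lightning_logs", name="multi_input")
    early_stop = EarlyStopping(monitor="val_loss", min_delta=5000, patience=7, mode="min")

    # Lightning 1.x passes callbacks as a list instead of early_stop_callback=
    trainer = Trainer(gpus=1, logger=logger, callbacks=[early_stop])
    trainer.fit(model)
    trainer.test(model)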
