
Commit c0511cd (0 parents)

first commit

File tree

4 files changed: +329, -0 lines

.gitignore

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
/env
/.vscode
/data
/lightning_logs
/mlruns
/default
/1

preprocessing.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
import pandas as pd
import os
from typing import List, Dict
from PIL import Image
import seaborn as sns
import matplotlib.pyplot as plt

data_path = "./data/"


def select_rows_with_images(images: List[str], df: pd.DataFrame) -> pd.DataFrame:
    dicts: List[Dict] = []
    for index, row in df.iterrows():
        image_file = row["zpid"] + ".png"
        if image_file in images:
            data_row = row.to_dict()
            dicts.append(data_row)

    df_new = pd.DataFrame(dicts)

    df = df_new[["zpid", "unformattedPrice", "latLong_latitude", "latLong_longitude", "beds", "baths", "area"]]

    return df


def resize_images(images: List[str]):
    for i in images:
        image = Image.open(f"{data_path}images/{i}")
        new_image = image.resize((224, 224))
        new_image.save(f"{data_path}processed_images/{i}")


images = os.listdir(f"{data_path}images")
df = pd.read_pickle(f"{data_path}ny_dataframe.pkl")


df = select_rows_with_images(images, df)
df = df.iloc[0:800]
print(df.describe())
ax = sns.boxplot(x=df["baths"])
plt.show()

df.to_pickle(f"{data_path}df.pkl")
# resize_images(images)
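Note: resize_images() saves into {data_path}processed_images/, which this script never creates. A minimal sketch, assuming the directory simply needs to exist before the commented-out call is enabled (the makedirs guard is an assumption, not part of this commit):

os.makedirs(f"{data_path}processed_images", exist_ok=True)  # assumption: target directory must exist before saving
resize_images(images)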

pytorch_lightning_regression.py

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
import matplotlib.pyplot as plt

data_path = "./data/"


class ImageDataset(Dataset):
    """Tabular and Image dataset."""

    def __init__(self, pickle_file, image_dir):
        self.image_dir = image_dir
        self.pickle_file = pickle_file

        self.tabular = pd.read_pickle(pickle_file)

        print(self.tabular)

    def __len__(self):
        return len(self.tabular)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        tabular = self.tabular.iloc[idx, 0:]

        y = tabular["unformattedPrice"]

        image = Image.open(f"{self.image_dir}/{tabular['zpid']}.png")
        image = np.array(image)
        image = image[..., :3]

        image = transforms.functional.to_tensor(image)

        tabular = tabular[["latLong_latitude", "latLong_longitude", "beds", "baths", "area"]]
        tabular = tabular.tolist()
        tabular = torch.FloatTensor(tabular)

        return image, y


def conv_block(input_size, output_size):
    block = nn.Sequential(
        nn.Conv2d(input_size, output_size, (3, 3)), nn.ReLU(), nn.BatchNorm2d(output_size), nn.MaxPool2d((2, 2)),
    )

    return block


class LitClassifier(pl.LightningModule):
    def __init__(self, lr):
        super().__init__()
        self.lr = lr
        self.conv1 = conv_block(3, 16)
        self.conv2 = conv_block(16, 32)
        self.conv3 = conv_block(32, 64)
        # conv2d -> -2 pixels
        # max pool -> pixels/2
        # remainder will be dropped
        self.ln1 = nn.Linear(64 * 26 * 26, 16)
        self.relu = nn.ReLU()
        self.batchnorm = nn.BatchNorm1d(16)
        self.dropout = nn.Dropout2d(0.5)
        self.ln2 = nn.Linear(16, 4)
        self.ln3 = nn.Linear(4, 1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = x.reshape(x.shape[0], -1)
        x = self.ln1(x)
        x = self.relu(x)
        x = self.batchnorm(x)
        x = self.dropout(x)
        x = self.ln2(x)
        x = self.relu(x)
        # x = self.ln3(x)
        # print(x)
        return self.ln3(x)

    def train_dataloader(self):
        return DataLoader(image_data, batch_size=64)

    def training_step(self, batch, batch_nb):
        x, y = batch
        # print(x)
        # print(y)
        # print(self(x))
        # print(y)
        # print(torch.flatten(self(x)))
        loss = torch.nn.functional.l1_loss(torch.flatten(self(x)), y.float())  # cast targets to float to match the predictions

        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=(self.lr), weight_decay=1e-3 / 200)


if __name__ == "__main__":
    image_data = ImageDataset(pickle_file=f"{data_path}df.pkl", image_dir=f"{data_path}processed_images/")
    params = {"batch_size": 64}
    # train_loader = DataLoader(image_data, **params)

    model = LitClassifier(0.3)
    mlflow_logger = pl_loggers.MLFlowLogger("logs/")
    trainer = pl.Trainer(gpus=1, logger=mlflow_logger)

    lr_finder = trainer.lr_find(model)
    print(lr_finder.results)
    fig = lr_finder.plot(suggest=True, show=True)

    # fig.show(block=True)

    new_lr = lr_finder.suggestion()
    print(new_lr)
    model.lr = new_lr  # configure_optimizers() reads self.lr

    trainer.fit(model)
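A quick check of the 64 * 26 * 26 input size of ln1 (used in both LitClassifier here and ImageNet below), assuming the 224x224 images produced by preprocessing.py: each conv_block removes 2 pixels with its 3x3 convolution and then halves the result, dropping any remainder.

# 224 -> conv: 222 -> maxpool: 111
# 111 -> conv: 109 -> maxpool:  54   (remainder dropped)
#  54 -> conv:  52 -> maxpool:  26
# flattened size: 64 channels * 26 * 26 = 43264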

pytorch_regression_model.py

Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

data_path = "./data/"


class ImageDataset(Dataset):
    """Tabular and Image dataset."""

    def __init__(self, pickle_file, image_dir):
        self.image_dir = image_dir
        self.pickle_file = pickle_file

        self.tabular = pd.read_pickle(pickle_file)

        print(self.tabular)

    def __len__(self):
        return len(self.tabular)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        tabular = self.tabular.iloc[idx, 0:]

        y = tabular["unformattedPrice"]

        image = Image.open(f"{self.image_dir}/{tabular['zpid']}.png")
        image = np.array(image)
        image = image[..., :3]

        image = transforms.functional.to_tensor(image)

        tabular = tabular[["latLong_latitude", "latLong_longitude", "beds", "baths", "area"]]
        tabular = tabular.tolist()
        tabular = torch.FloatTensor(tabular)

        return image, y


def conv_block(input_size, output_size):
    block = nn.Sequential(
        nn.Conv2d(input_size, output_size, (3, 3)), nn.ReLU(), nn.BatchNorm2d(output_size), nn.MaxPool2d((2, 2)),
    )

    return block


class ImageNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = conv_block(3, 16)
        self.conv2 = conv_block(16, 32)
        self.conv3 = conv_block(32, 64)
        # conv2d -> -2 pixels
        # max pool -> pixels/2
        # remainder will be dropped
        self.ln1 = nn.Linear(64 * 26 * 26, 16)
        self.relu = nn.ReLU()
        self.batchnorm = nn.BatchNorm1d(16)
        self.dropout = nn.Dropout2d(0.5)
        self.ln2 = nn.Linear(16, 4)
        self.ln3 = nn.Linear(4, 1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = x.reshape(x.shape[0], -1)
        x = self.ln1(x)
        x = self.relu(x)
        x = self.batchnorm(x)
        x = self.dropout(x)
        x = self.ln2(x)
        x = self.relu(x)
        x = self.ln3(x)
        # print(x)
        return x


def train(model, device, train_loader):
    start_time = datetime.now()
    model.train()
    running_loss = 0.0
    for i_batch, local_batch in enumerate(train_loader):
        local_batch_X = local_batch[0]
        local_batch_y = local_batch[1]

        local_batch_X, local_batch_y = (
            local_batch_X.to(device),
            local_batch_y.to(device),
        )

        optimizer.zero_grad()
        y_pred = model(local_batch_X.float())

        loss = criterion(torch.flatten(y_pred), local_batch_y.float())

        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    epoch_loss = running_loss / len(train_loader)
    end_time = datetime.now() - start_time
    return epoch_loss


if __name__ == "__main__":
    torch.manual_seed(42)
    random_seed = 42
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    image_data = ImageDataset(pickle_file=f"{data_path}df.pkl", image_dir=f"{data_path}processed_images/")
    params = {"batch_size": 4, "shuffle": True, "num_workers": 4}
    max_epochs = 200

    train_loader = DataLoader(image_data, **params)
    # for x in train_loader:
    #     print(x)
    model = ImageNet()
    model = model.to(device)
    print(model)
    criterion = torch.nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, weight_decay=1e-3 / 200)

    for epoch in range(max_epochs):
        start_time = datetime.now()
        training_loss = train(model, device, train_loader)
        print(training_loss)

# TODO: boxplots, remove outliers
# TODO: validation loop
# TODO: add tabular data network -> check dimensions (before was 4, now should be 5)
# TODO: print nicer
# TODO: add simple plot
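For the third TODO above: ImageDataset.__getitem__ already builds a 5-feature tabular tensor but discards it when returning (image, y). One possible sketch of the fusion network, reusing conv_block from this file; the class name, layer sizes, and the assumption that __getitem__ is changed to return (image, tabular, y) are illustrative, not part of this commit.

class ImageTabularNet(nn.Module):
    """Hypothetical fusion model: image CNN branch concatenated with the 5 tabular features."""

    def __init__(self):
        super().__init__()
        self.conv1 = conv_block(3, 16)
        self.conv2 = conv_block(16, 32)
        self.conv3 = conv_block(32, 64)
        self.ln1 = nn.Linear(64 * 26 * 26, 16)
        self.relu = nn.ReLU()
        self.ln2 = nn.Linear(16 + 5, 4)  # 16 image features + 5 tabular features
        self.ln3 = nn.Linear(4, 1)

    def forward(self, img, tab):
        x = self.conv3(self.conv2(self.conv1(img)))          # (batch, 64, 26, 26)
        x = self.relu(self.ln1(x.reshape(x.shape[0], -1)))   # (batch, 16)
        x = torch.cat((x, tab), dim=1)                       # fuse image and tabular branches
        x = self.relu(self.ln2(x))
        return self.ln3(x)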
