
Commit c0511cd (0 parents)

first commit

File tree

4 files changed: +329, -0 lines

.gitignore

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
/env
/.vscode
/data
/lightning_logs
/mlruns
/default
/1

preprocessing.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
import pandas as pd
import os
from typing import List, Dict
from PIL import Image
import seaborn as sns
import matplotlib.pyplot as plt

data_path = "./data/"


def select_rows_with_images(images: List[str], df: pd.DataFrame) -> pd.DataFrame:
    dicts: List[Dict] = []
    for index, row in df.iterrows():
        image_file = row["zpid"] + ".png"
        if image_file in images:
            data_row = row.to_dict()
            dicts.append(data_row)

    df_new = pd.DataFrame(dicts)

    df = df_new[["zpid", "unformattedPrice", "latLong_latitude", "latLong_longitude", "beds", "baths", "area"]]

    return df


def resize_images(images: List[str]):
    for i in images:
        image = Image.open(f"{data_path}images/{i}")
        new_image = image.resize((224, 224))
        new_image.save(f"{data_path}processed_images/{i}")


images = os.listdir(f"{data_path}images")
df = pd.read_pickle(f"{data_path}ny_dataframe.pkl")


df = select_rows_with_images(images, df)
df = df.iloc[0:800]
print(df.describe())
ax = sns.boxplot(x=df["baths"])
plt.show()

df.to_pickle(f"{data_path}df.pkl")
# resize_images(images)
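Note: resize_images() saves into {data_path}processed_images/, which this script never creates. A minimal sketch, assuming the directory simply needs to exist before the commented-out call is enabled (the makedirs guard is an assumption, not part of this commit):

os.makedirs(f"{data_path}processed_images", exist_ok=True)  # assumption: target directory must exist before saving
resize_images(images)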

pytorch_lightning_regression.py

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
import matplotlib.pyplot as plt

data_path = "./data/"


class ImageDataset(Dataset):
    """Tabular and Image dataset."""

    def __init__(self, pickle_file, image_dir):
        self.image_dir = image_dir
        self.pickle_file = pickle_file

        self.tabular = pd.read_pickle(pickle_file)

        print(self.tabular)

    def __len__(self):
        return len(self.tabular)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        tabular = self.tabular.iloc[idx, 0:]

        y = tabular["unformattedPrice"]

        image = Image.open(f"{self.image_dir}/{tabular['zpid']}.png")
        image = np.array(image)
        image = image[..., :3]

        image = transforms.functional.to_tensor(image)

        tabular = tabular[["latLong_latitude", "latLong_longitude", "beds", "baths", "area"]]
        tabular = tabular.tolist()
        tabular = torch.FloatTensor(tabular)

        return image, y


def conv_block(input_size, output_size):
    block = nn.Sequential(
        nn.Conv2d(input_size, output_size, (3, 3)), nn.ReLU(), nn.BatchNorm2d(output_size), nn.MaxPool2d((2, 2)),
    )

    return block


class LitClassifier(pl.LightningModule):
    def __init__(self, lr):
        super().__init__()
        self.lr = lr
        self.conv1 = conv_block(3, 16)
        self.conv2 = conv_block(16, 32)
        self.conv3 = conv_block(32, 64)
        # conv2d -> -2 pixels
        # max pool -> pixels/2
        # remainder will be dropped
        self.ln1 = nn.Linear(64 * 26 * 26, 16)
        self.relu = nn.ReLU()
        self.batchnorm = nn.BatchNorm1d(16)
        self.dropout = nn.Dropout2d(0.5)
        self.ln2 = nn.Linear(16, 4)
        self.ln3 = nn.Linear(4, 1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = x.reshape(x.shape[0], -1)
        x = self.ln1(x)
        x = self.relu(x)
        x = self.batchnorm(x)
        x = self.dropout(x)
        x = self.ln2(x)
        x = self.relu(x)
        # x = self.ln3(x)
        # print(x)
        return self.ln3(x)

    def train_dataloader(self):
        return DataLoader(image_data, batch_size=64)

    def training_step(self, batch, batch_nb):
        x, y = batch
        # print(x)
        # print(y)
        # print(self(x))
        # print(y)
        # print(torch.flatten(self(x)))
        loss = torch.nn.functional.l1_loss(torch.flatten(self(x)), y.float())  # cast targets to float to match the predictions

        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=(self.lr), weight_decay=1e-3 / 200)


if __name__ == "__main__":
    image_data = ImageDataset(pickle_file=f"{data_path}df.pkl", image_dir=f"{data_path}processed_images/")
    params = {"batch_size": 64}
    # train_loader = DataLoader(image_data, **params)

    model = LitClassifier(0.3)
    mlflow_logger = pl_loggers.MLFlowLogger("logs/")
    trainer = pl.Trainer(gpus=1, logger=mlflow_logger)

    lr_finder = trainer.lr_find(model)
    print(lr_finder.results)
    fig = lr_finder.plot(suggest=True, show=True)

    # fig.show(block=True)

    new_lr = lr_finder.suggestion()
    print(new_lr)
    model.lr = new_lr  # configure_optimizers() reads self.lr

    trainer.fit(model)
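A quick check of the 64 * 26 * 26 input size of ln1 (used in both LitClassifier here and ImageNet below), assuming the 224x224 images produced by preprocessing.py: each conv_block removes 2 pixels with its 3x3 convolution and then halves the result, dropping any remainder.

# 224 -> conv: 222 -> maxpool: 111
# 111 -> conv: 109 -> maxpool:  54   (remainder dropped)
#  54 -> conv:  52 -> maxpool:  26
# flattened size: 64 channels * 26 * 26 = 43264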

pytorch_regression_model.py

Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

data_path = "./data/"


class ImageDataset(Dataset):
    """Tabular and Image dataset."""

    def __init__(self, pickle_file, image_dir):
        self.image_dir = image_dir
        self.pickle_file = pickle_file

        self.tabular = pd.read_pickle(pickle_file)

        print(self.tabular)

    def __len__(self):
        return len(self.tabular)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        tabular = self.tabular.iloc[idx, 0:]

        y = tabular["unformattedPrice"]

        image = Image.open(f"{self.image_dir}/{tabular['zpid']}.png")
        image = np.array(image)
        image = image[..., :3]

        image = transforms.functional.to_tensor(image)

        tabular = tabular[["latLong_latitude", "latLong_longitude", "beds", "baths", "area"]]
        tabular = tabular.tolist()
        tabular = torch.FloatTensor(tabular)

        return image, y


def conv_block(input_size, output_size):
    block = nn.Sequential(
        nn.Conv2d(input_size, output_size, (3, 3)), nn.ReLU(), nn.BatchNorm2d(output_size), nn.MaxPool2d((2, 2)),
    )

    return block


class ImageNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = conv_block(3, 16)
        self.conv2 = conv_block(16, 32)
        self.conv3 = conv_block(32, 64)
        # conv2d -> -2 pixels
        # max pool -> pixels/2
        # remainder will be dropped
        self.ln1 = nn.Linear(64 * 26 * 26, 16)
        self.relu = nn.ReLU()
        self.batchnorm = nn.BatchNorm1d(16)
        self.dropout = nn.Dropout2d(0.5)
        self.ln2 = nn.Linear(16, 4)
        self.ln3 = nn.Linear(4, 1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = x.reshape(x.shape[0], -1)
        x = self.ln1(x)
        x = self.relu(x)
        x = self.batchnorm(x)
        x = self.dropout(x)
        x = self.ln2(x)
        x = self.relu(x)
        x = self.ln3(x)
        # print(x)
        return x


def train(model, device, train_loader):
    start_time = datetime.now()
    model.train()
    running_loss = 0.0
    for i_batch, local_batch in enumerate(train_loader):
        local_batch_X = local_batch[0]
        local_batch_y = local_batch[1]

        local_batch_X, local_batch_y = (
            local_batch_X.to(device),
            local_batch_y.to(device),
        )

        optimizer.zero_grad()
        y_pred = model(local_batch_X.float())

        loss = criterion(torch.flatten(y_pred), local_batch_y.float())

        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    epoch_loss = running_loss / len(train_loader)
    end_time = datetime.now() - start_time
    return epoch_loss


if __name__ == "__main__":
    torch.manual_seed(42)
    random_seed = 42
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    image_data = ImageDataset(pickle_file=f"{data_path}df.pkl", image_dir=f"{data_path}processed_images/")
    params = {"batch_size": 4, "shuffle": True, "num_workers": 4}
    max_epochs = 200

    train_loader = DataLoader(image_data, **params)
    # for x in train_loader:
    #     print(x)
    model = ImageNet()
    model = model.to(device)
    print(model)
    criterion = torch.nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, weight_decay=1e-3 / 200)

    for epoch in range(max_epochs):
        start_time = datetime.now()
        training_loss = train(model, device, train_loader)
        print(training_loss)

# TODO: boxplots, remove outliers
# TODO: validation loop
# TODO: add tabular data network -> check dimensions (before was 4, now should be 5)
# TODO: print nicer
# TODO: add simple plot
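For the third TODO above: ImageDataset.__getitem__ already builds a 5-feature tabular tensor but discards it when returning (image, y). One possible sketch of the fusion network, reusing conv_block from this file; the class name, layer sizes, and the assumption that __getitem__ is changed to return (image, tabular, y) are illustrative, not part of this commit.

class ImageTabularNet(nn.Module):
    """Hypothetical fusion model: image CNN branch concatenated with the 5 tabular features."""

    def __init__(self):
        super().__init__()
        self.conv1 = conv_block(3, 16)
        self.conv2 = conv_block(16, 32)
        self.conv3 = conv_block(32, 64)
        self.ln1 = nn.Linear(64 * 26 * 26, 16)
        self.relu = nn.ReLU()
        self.ln2 = nn.Linear(16 + 5, 4)  # 16 image features + 5 tabular features
        self.ln3 = nn.Linear(4, 1)

    def forward(self, img, tab):
        x = self.conv3(self.conv2(self.conv1(img)))          # (batch, 64, 26, 26)
        x = self.relu(self.ln1(x.reshape(x.shape[0], -1)))   # (batch, 16)
        x = torch.cat((x, tab), dim=1)                       # fuse image and tabular branches
        x = self.relu(self.ln2(x))
        return self.ln3(x)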
