From 39d439ecb52145382e3b0bf74dd5853becd64ca7 Mon Sep 17 00:00:00 2001 From: Andrea Lombardi Date: Fri, 5 Aug 2022 15:48:15 +0200 Subject: [PATCH] first commit --- ApolloScapeDataset.py | 185 ++++++++++++++++++++++++++ CityscapesDataset.py | 177 +++++++++++++++++++++++++ KITTIDataset.py | 81 ++++++++++++ configuration.ini | 20 +++ eval.py | 82 ++++++++++++ modelv1/Decoder.py | 31 +++++ modelv1/Encoder.py | 56 ++++++++ modelv1/EncoderBlock.py | 127 ++++++++++++++++++ modelv1/EncoderStage.py | 31 +++++ modelv1/SegmentationHead.py | 17 +++ modelv1/model.py | 47 +++++++ modelv2/SegformerDecodeHead.py | 169 ++++++++++++++++++++++++ modelv2/SegformerSkipDecodeHead.py | 133 +++++++++++++++++++ modelv2/Segformer_model.py | 70 ++++++++++ test.py | 100 +++++++++++++++ train.py | 170 ++++++++++++++++++++++++ train_hf.py | 200 +++++++++++++++++++++++++++++ utils.py | 35 +++++ 18 files changed, 1731 insertions(+) create mode 100644 ApolloScapeDataset.py create mode 100644 CityscapesDataset.py create mode 100644 KITTIDataset.py create mode 100644 configuration.ini create mode 100644 eval.py create mode 100644 modelv1/Decoder.py create mode 100644 modelv1/Encoder.py create mode 100644 modelv1/EncoderBlock.py create mode 100644 modelv1/EncoderStage.py create mode 100644 modelv1/SegmentationHead.py create mode 100644 modelv1/model.py create mode 100644 modelv2/SegformerDecodeHead.py create mode 100644 modelv2/SegformerSkipDecodeHead.py create mode 100644 modelv2/Segformer_model.py create mode 100644 test.py create mode 100644 train.py create mode 100644 train_hf.py create mode 100644 utils.py diff --git a/ApolloScapeDataset.py b/ApolloScapeDataset.py new file mode 100644 index 0000000..c35a1ea --- /dev/null +++ b/ApolloScapeDataset.py @@ -0,0 +1,185 @@ +import random + +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" Zpark labels""" + +from collections import namedtuple + +from transformers import SegformerFeatureExtractor +from torch.utils.data import Dataset +import os +import cv2 +import torchvision.transforms.functional as TF + +from torchvision import transforms as tfs +import numpy as np + +class ApolloScapeDataset(Dataset): + """KITTI semantic segmentation dataset.""" + + def __init__(self, root_dir:str, split:str='train', transforms=None): + """ + Args: + root_dir (string): Root directory of the dataset containing the images + annotations. 
+ split: the split of the dataset (train, test or val) + """ + assert split=='train' or split=='test' or split=='val', "The split of the dataset must be one between 'train', 'test' or 'val'" + + self.root_dir = root_dir + self.feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-cityscapes-512-1024") + self.transforms = transforms + + self.img_dir = os.path.join(self.root_dir, "ColorImage", split) + self.ann_dir = os.path.join(self.root_dir, "Label", split) + + # read images + image_file_names = [] + for root, dirs, files in os.walk(self.img_dir): + for f in files: + complete_path = os.path.join(root, f) + #print(complete_path) + image_file_names.append(complete_path) + + self.images = sorted(image_file_names) + # read annotations + annotation_file_names = [] + for root, dirs, files in os.walk(self.ann_dir): + for f in files: + complete_path = os.path.join(root, f) + annotation_file_names.append(complete_path) + + self.annotations = sorted(annotation_file_names) + + + assert len(self.images) == len(self.annotations), "There must be as many images as there are segmentation maps" + + # a label and all meta information + Label = namedtuple('Label', [ + 'name' , # The identifier of this label, e.g. 'car', 'person', ... . + # We use them to uniquely name a class + 'clsId' , + 'id' , # An integer ID that is associated with this label. + 'trainId' , + 'category' , # The name of the category that this label belongs to + 'categoryId' , # The ID of this category. Used to create ground truth images on category level. + 'hasInstances', # Whether this label distinguishes between single instances or not + 'ignoreInEval', # Whether pixels having this class as ground truth label are ignored during evaluations or not + 'color' , # The color of this label + ]) + #-------------------------------------------------------------------------------- + # A list of all labels + #-------------------------------------------------------------------------------- + + self.labels = [ + # name clsId id trainId category catId hasInstanceignoreInEval color + Label('others' , 0 , 0, 0 , 'others' , 0 ,False , True , (0, 0, 0) ), + Label('rover' , 0x01 , 1, 1 , 'others' , 0 ,False , True , (0, 0, 0) ), + Label('sky' , 0x11 , 17, 2 , 'sky' , 1 ,False , False , (70, 130, 180) ), + Label('car' , 0x21 , 33, 3 , 'movable object', 2 ,True , False , (0, 0, 142) ), + Label('car_groups' , 0xA1 , 161, 4 , 'movable object', 2 ,True , False , (0, 0, 142) ), + Label('motorbicycle' , 0x22 , 34, 5 , 'movable object', 2 ,True , False , (0, 0, 230) ), + Label('motorbicycle_group' , 0xA2 , 162, 6 , 'movable object', 2 ,True , False , (0, 0, 230) ), + Label('bicycle' , 0x23 , 35, 7 , 'movable object', 2 ,True , False , (119, 11, 32) ), + Label('bicycle_group' , 0xA3 , 163, 8 , 'movable object', 2 ,True , False , (119, 11, 32) ), + Label('person' , 0x24 , 36, 9 , 'movable object', 2 ,True , False , (0, 128, 192) ), + Label('person_group' , 0xA4 , 164, 10 , 'movable object', 2 ,True , False , (0, 128, 192) ), + Label('rider' , 0x25 , 37, 11 , 'movable object', 2 ,True , False , (128, 64, 128) ), + Label('rider_group' , 0xA5 , 165, 12 , 'movable object', 2 ,True , False , (128, 64, 128) ), + Label('truck' , 0x26 , 38, 13 , 'movable object', 2 ,True , False , (128, 0, 192) ), + Label('truck_group' , 0xA6 , 166, 14 , 'movable object', 2 ,True , False , (128, 0, 192) ), + Label('bus' , 0x27 , 39, 15 , 'movable object', 2 ,True , False , (192, 0, 64) ), + Label('bus_group' , 0xA7 , 167, 16 , 'movable object', 2 
,True , False , (192, 0, 64) ), + Label('tricycle' , 0x28 , 40, 17 , 'movable object', 2 ,True , False , (128, 128, 192) ), + Label('tricycle_group' , 0xA8 , 168, 18 , 'movable object', 2 ,True , False , (128, 128, 192) ), + Label('road' , 0x31 , 49, 19 , 'flat' , 3 ,False , False , (192, 128, 192) ), + Label('siderwalk' , 0x32 , 50, 20 , 'flat' , 3 ,False , False , (192, 128, 64) ), + Label('traffic_cone' , 0x41 , 65, 21 , 'road obstacles', 4 ,False , False , (0, 0, 64) ), + Label('road_pile' , 0x42 , 66, 22 , 'road obstacles', 4 ,False , False , (0, 0, 192) ), + Label('fence' , 0x43 , 67, 23 , 'road obstacles', 4 ,False , False , (64, 64, 128) ), + Label('traffic_light' , 0x51 , 81, 24 , 'Roadside objects', 5 ,False , False , (192, 64, 128) ), + Label('pole' , 0x52 , 82, 25 , 'Roadside objects', 5 ,False , False , (192, 128, 128) ), + Label('traffic_sign' , 0x53 , 83, 26 , 'Roadside objects', 5 ,False , False , (0, 64, 64) ), + Label('wall' , 0x54 , 84, 27 , 'Roadside objects', 5 ,False , False , (192, 192, 128) ), + Label('dustbin' , 0x55 , 85, 28 , 'Roadside objects', 5 ,False , False , (64, 0, 192) ), + Label('billboard' , 0x56 , 86, 29 , 'Roadside objects', 5 ,False , False , (192, 0, 192) ), + Label('building' , 0x61 , 97, 30 , 'building' , 6 ,False , False , (192, 0, 128) ), + Label('bridge' , 0x62 , 98, 31 , 'building' , 6 ,False , True , (128, 128, 0) ), + Label('tunnel' , 0x63 , 99, 32 , 'building' , 6 ,False , True , (128, 0, 0) ), + Label('overpass' , 0x64 , 100, 33 , 'building' , 6 ,False , True , (64, 128, 64) ), + Label('vegatation' , 0x71 , 113, 34 , 'natural' , 7 ,False , False , (128, 128, 64) ), + Label('unlabeled' , 0xFF , -1 , -1 , 'unlabeled' , 8 ,False , True , (255, 255, 255) ), + ] + + #-------------------------------------------------------------------------------- + # Create dictionaries for a fast lookup + #-------------------------------------------------------------------------------- + def get_id2label(self): + # return id to label object + id2label = { label.id : label for label in self.labels } + return id2label + + def get_label2id(self): + # return name to label object + name2label = { label.name : label for label in self.labels } + return name2label + + def get_trainId2label(self): + # trainId to label object. This is used as a id2label. 
+ trainId2label = {label.trainId: label for label in self.labels} + return trainId2label + + def get_label2color(self): + # return label to color code dictionary + label2color = {label.color : label for label in self.labels} + return label2color + #-------------------------------------------------------------------------------- + + def __len__(self): + return len(self.images) + + def __getitem__(self, idx): + + image = cv2.imread(os.path.join(self.img_dir, self.images[idx]), cv2.IMREAD_COLOR) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + segmentation_map = cv2.imread(os.path.join(self.ann_dir, self.annotations[idx]), cv2.IMREAD_GRAYSCALE) + for l in self.labels: + segmentation_map = np.where(segmentation_map!=l.id, segmentation_map, l.trainId).astype(np.uint8) + #segmentation_map = cv2.cvtColor(segmentation_map, cv2.COLOR_BGR2GRAY) + + #image = Image.open() + #segmentation_map = Image.open() + + if self.transforms is not None: + augmented = self.transforms(image=image, mask=segmentation_map) + # randomly crop + pad both image and segmentation map to same size + encoded_inputs = self.feature_extractor(augmented['image'], augmented['mask'], return_tensors="pt") + else: + encoded_inputs = self.feature_extractor(image, segmentation_map, return_tensors="pt") + + for k,v in encoded_inputs.items(): + encoded_inputs[k].squeeze_() # remove batch dimension + + return encoded_inputs +''' +ds = ApolloScapeDataset("/home/a.lombardi/ApolloScape_Dataset", split='test', transforms=None) +print(len(ds.labels)) + +prova = ds[55] + +print(prova["pixel_values"].shape) +print(prova["labels"].shape) + +import matplotlib.pyplot as plt +import numpy as np +plt.imshow(prova["labels"].numpy()) +plt.savefig("prova.png") + +p = prova["pixel_values"].numpy() +p = np.swapaxes(p, 0, 2) +p = np.swapaxes(p, 0, 1) +plt.imshow(p) +plt.savefig("prova2.png") +''' \ No newline at end of file diff --git a/CityscapesDataset.py b/CityscapesDataset.py new file mode 100644 index 0000000..b1b1a28 --- /dev/null +++ b/CityscapesDataset.py @@ -0,0 +1,177 @@ +import random +from torch import IntTensor +from transformers import SegformerFeatureExtractor +from torch.utils.data import Dataset +from torchvision.datasets import Cityscapes +import torchvision.transforms.functional as TF + +from torchvision import transforms as tfs + +from collections import namedtuple + +class CityscapesDataset(Dataset): + def __init__(self, path: str, split: str, transforms: bool=False, mode='fine', target_type='semantic'): + """ + Args: + root_dir (string): Root directory of the dataset. + split: Whether to load "training", "validation" or "test" set. + mode: 'fine' or 'coarse" + target_type: for the label type, that can be 'instance', 'semantic' or 'panoptic' + """ + self.split = split + self.dataset = Cityscapes(path, split=split, mode=mode, target_type=target_type) + self.feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-cityscapes-512-1024") #SegformerFeatureExtractor(align=False, reduce_zero_label=False) + self.transforms = transforms + + + Label = namedtuple( 'Label' , [ + 'name' , # The identifier of this label, e.g. 'car', 'person', ... . + # We use them to uniquely name a class + 'id' , # An integer ID that is associated with this label. + # The IDs are used to represent the label in ground truth images + # An ID of -1 means that this label does not have an ID and thus + # is ignored when creating ground truth images (e.g. license plate). 
+ # Do not modify these IDs, since exactly these IDs are expected by the + # evaluation server. + 'trainId' , # Feel free to modify these IDs as suitable for your method. Then create + # ground truth images with train IDs, using the tools provided in the + # 'preparation' folder. However, make sure to validate or submit results + # to our evaluation server using the regular IDs above! + 'category' , # The name of the category that this label belongs to + 'categoryId' , # The ID of this category. Used to create ground truth images on category level. + 'hasInstances', # Whether this label distinguishes between single instances or not + 'ignoreInEval', # Whether pixels having this class as ground truth label are ignored during evaluations or not + 'color' , # The color of this label + ] ) + + #-------------------------------------------------------------------------------- + # A list of all labels + #-------------------------------------------------------------------------------- + + self.labels = [ + # name id trainId category catId hasInstances ignoreInEval color + Label( 'unlabeled' , 0 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'ego vehicle' , 1 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'rectification border' , 2 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'out of roi' , 3 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'static' , 4 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'dynamic' , 5 , 255 , 'void' , 0 , False , True , (111, 74, 0) ), + Label( 'ground' , 6 , 255 , 'void' , 0 , False , True , ( 81, 0, 81) ), + Label( 'road' , 7 , 0 , 'flat' , 1 , False , False , (128, 64,128) ), + Label( 'sidewalk' , 8 , 1 , 'flat' , 1 , False , False , (244, 35,232) ), + Label( 'parking' , 9 , 255 , 'flat' , 1 , False , True , (250,170,160) ), + Label( 'rail track' , 10 , 255 , 'flat' , 1 , False , True , (230,150,140) ), + Label( 'building' , 11 , 2 , 'construction' , 2 , False , False , ( 70, 70, 70) ), + Label( 'wall' , 12 , 3 , 'construction' , 2 , False , False , (102,102,156) ), + Label( 'fence' , 13 , 4 , 'construction' , 2 , False , False , (190,153,153) ), + Label( 'guard rail' , 14 , 255 , 'construction' , 2 , False , True , (180,165,180) ), + Label( 'bridge' , 15 , 255 , 'construction' , 2 , False , True , (150,100,100) ), + Label( 'tunnel' , 16 , 255 , 'construction' , 2 , False , True , (150,120, 90) ), + Label( 'pole' , 17 , 5 , 'object' , 3 , False , False , (153,153,153) ), + Label( 'polegroup' , 18 , 255 , 'object' , 3 , False , True , (153,153,153) ), + Label( 'traffic light' , 19 , 6 , 'object' , 3 , False , False , (250,170, 30) ), + Label( 'traffic sign' , 20 , 7 , 'object' , 3 , False , False , (220,220, 0) ), + Label( 'vegetation' , 21 , 8 , 'nature' , 4 , False , False , (107,142, 35) ), + Label( 'terrain' , 22 , 9 , 'nature' , 4 , False , False , (152,251,152) ), + Label( 'sky' , 23 , 10 , 'sky' , 5 , False , False , ( 70,130,180) ), + Label( 'person' , 24 , 11 , 'human' , 6 , True , False , (220, 20, 60) ), + Label( 'rider' , 25 , 12 , 'human' , 6 , True , False , (255, 0, 0) ), + Label( 'car' , 26 , 13 , 'vehicle' , 7 , True , False , ( 0, 0,142) ), + Label( 'truck' , 27 , 14 , 'vehicle' , 7 , True , False , ( 0, 0, 70) ), + Label( 'bus' , 28 , 15 , 'vehicle' , 7 , True , False , ( 0, 60,100) ), + Label( 'caravan' , 29 , 255 , 'vehicle' , 7 , True , True , ( 0, 0, 90) ), + Label( 'trailer' , 30 , 255 , 'vehicle' , 7 , True , True , ( 0, 0,110) ), + Label( 'train' , 31 , 16 , 'vehicle' , 7 , True , 
False , ( 0, 80,100) ), + Label( 'motorcycle' , 32 , 17 , 'vehicle' , 7 , True , False , ( 0, 0,230) ), + Label( 'bicycle' , 33 , 18 , 'vehicle' , 7 , True , False , (119, 11, 32) ), + Label( 'license plate' , -1 , -1 , 'vehicle' , 7 , False , True , ( 0, 0,142) ), + ] + + + #-------------------------------------------------------------------------------- + # Create dictionaries for a fast lookup + #-------------------------------------------------------------------------------- + def get_id2label(self): + # return id to label object + id2label = { label.id : label for label in self.labels } + return id2label + + def get_label2id(self): + # return name to label object + name2label = { label.name : label for label in self.labels } + return name2label + + def get_label2color(self): + # return label to color code dictionary + label2color = {label.color : label for label in self.labels} + return label2color + #-------------------------------------------------------------------------------- + + def getNumClasses(self): + return len(self.dataset.classes) + + def __len__(self): + return len(self.dataset) + + def __transform__(self, image, mask): + + # Resize + #resize = tfs.Resize(size=(512, 1024)) + #image = resize(image) + #segmentation_map = resize(segmentation_map) + + # Random crop + i, j, h, w = tfs.RandomCrop.get_params( + image, output_size=(512,512)) + image = TF.crop(image, i, j, h, w) + mask = TF.crop(mask, i, j, h, w) + + # Random horizontal flipping + if random.random() > 0.5: + image = TF.hflip(image) + mask = TF.hflip(mask) + + # Transform to tensor + #image = TF.to_tensor(image) + #mask = TF.to_tensor(mask) + return image, mask + + def __getitem__(self, idx): + + image, segmentation_map = self.dataset[idx] + + if self.transforms: + + #image = self.cvt_to_tensor(image).numpy() + #segmentation_map = self.cvt_to_tensor(segmentation_map).numpy() + #augmented = self.transforms(image=image, mask=segmentation_map) + + image, segmentation_map = self.__transform__(image, segmentation_map) + + encoded_inputs = self.feature_extractor(images=image, segmentation_maps=segmentation_map, return_tensors="pt") + else: + encoded_inputs = self.feature_extractor(image, segmentation_map, return_tensors="pt") + + for k,v in encoded_inputs.items(): + encoded_inputs[k].squeeze_() # remove batch dimension + + return encoded_inputs +''' +ds = CityscapesDataset(path='/home/a.lombardi/CityScapes_Dataset', split='test', transforms=False) + +prova = ds[60] + +print(prova["pixel_values"].shape) +print(prova["labels"].shape) + +import matplotlib.pyplot as plt +import numpy as np +plt.imshow(prova["labels"].numpy()) +plt.savefig("prova.png") + +p = prova["pixel_values"].numpy() +p = np.swapaxes(p, 0, 2) +p = np.swapaxes(p, 0, 1) +plt.imshow(p) +plt.savefig("prova2.png") +''' \ No newline at end of file diff --git a/KITTIDataset.py b/KITTIDataset.py new file mode 100644 index 0000000..edf6c3b --- /dev/null +++ b/KITTIDataset.py @@ -0,0 +1,81 @@ +import random +from torch import IntTensor +from transformers import SegformerFeatureExtractor +from torch.utils.data import Dataset +from torchvision.datasets import Cityscapes +import os +import cv2 +import torchvision.transforms.functional as TF + +from torchvision import transforms as tfs + +class KITTIDataset(Dataset): + """KITTI semantic segmentation dataset.""" + + def __init__(self, root_dir:str, transforms=None, split:str = 'training'): + """ + Args: + root_dir (string): Root directory of the dataset containing the images + annotations. 
+ feature_extractor (SegFormerFeatureExtractor): feature extractor to prepare images + segmentation maps. + train (bool): Whether to load "training" or "validation" images + annotations. + """ + self.root_dir = root_dir + self.feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-cityscapes-512-1024") + self.transforms = transforms + + self.img_dir = os.path.join(self.root_dir, split, "image_2") + self.ann_dir = os.path.join(self.root_dir, split, "semantic_rgb") + + # read images + image_file_names = [] + for root, dirs, files in os.walk(self.img_dir): + image_file_names.extend(files) + self.images = image_file_names + #self.images = sorted(image_file_names) #They are already sorted + + # read annotations + annotation_file_names = [] + for root, dirs, files in os.walk(self.ann_dir): + annotation_file_names.extend(files) + self.annotations = annotation_file_names + # self.annotations = sorted(annotation_file_names) #They are already sorted + + assert len(self.images) == len(self.annotations), "There must be as many images as there are segmentation maps" + + def __len__(self): + return len(self.images) + + def __getitem__(self, idx): + + #print(self.img_dir) + image = cv2.imread(os.path.join(self.img_dir, self.images[idx])) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + segmentation_map = cv2.imread(os.path.join(self.ann_dir, self.annotations[idx])) + segmentation_map = cv2.cvtColor(segmentation_map, cv2.COLOR_BGR2GRAY) + + #image = Image.open() + #segmentation_map = Image.open() + + if self.transforms is not None: + augmented = self.transforms(image=image, mask=segmentation_map) + # randomly crop + pad both image and segmentation map to same size + encoded_inputs = self.feature_extractor(augmented['image'], augmented['mask'], return_tensors="pt") + else: + encoded_inputs = self.feature_extractor(image, segmentation_map, return_tensors="pt") + + for k,v in encoded_inputs.items(): + encoded_inputs[k].squeeze_() # remove batch dimension + + return encoded_inputs + +ds = KITTIDataset("/home/a.lombardi/KITTI_Dataset", transforms=None, split='training') + +prova = ds[0] + +print(prova["pixel_values"].shape) +print(prova["labels"].shape) + +import matplotlib.pyplot as plt +plt.imshow(prova["labels"].numpy()) +plt.savefig("prova.png") \ No newline at end of file diff --git a/configuration.ini b/configuration.ini new file mode 100644 index 0000000..94fea4e --- /dev/null +++ b/configuration.ini @@ -0,0 +1,20 @@ +[TRAINING] +batch_size=8 +learning_rate=0.00006 + +[MODEL] +#nvidia/mit-b0 +pretrained_type = nvidia/mit-b1 +img_train_size=1024 +model_to_test = /home/a.lombardi/my_segformer/models/b1ApolloNoAug_TRAIN_0.095583_VAL_0.147117/ +data_aug=True +in_channels=3 +widths=[32, 64, 128, 256] +depths=[2, 2, 2, 2] +all_num_heads=[1, 2, 4, 8] +patch_sizes=[7, 3, 3, 3] +overlap_sizes=[4, 2, 2, 2] +reduction_ratios=[8, 4, 2, 1] +mlp_expansions=[4, 4, 4, 4] +decoder_channels=128 +scale_factors=[8, 4, 2, 1] \ No newline at end of file diff --git a/eval.py b/eval.py new file mode 100644 index 0000000..962eafe --- /dev/null +++ b/eval.py @@ -0,0 +1,82 @@ +from transformers import SegformerFeatureExtractor +from modelv2.Segformer_model import SegformerForSemanticSegmentation +from CityscapesDataset import CityscapesDataset +from ApolloScapeDataset import ApolloScapeDataset +from torch.utils.data import DataLoader +from configparser import ConfigParser +import torch +from utils import bcolors +from PIL import Image +import numpy as np +import matplotlib.pyplot as plt + 
+# assign gpu devices +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +def evaluateOnImage(model: torch.nn.Module, image_path:str, label2color:dict): + + image = Image.open(image_path) + image = image.convert("RGB") + # prepare the image for the model (aligned resize) + feature_extractor_inference = SegformerFeatureExtractor(do_random_crop=False, do_pad=False) + + pixel_values = feature_extractor_inference(image, return_tensors="pt").pixel_values.to(device) + + model.eval() + outputs = model(pixel_values=pixel_values)# logits are of shape (batch_size, num_labels, height/4, width/4) + logits = outputs.logits.cpu() + # First, rescale logits to original image size + upsampled_logits = torch.nn.functional.interpolate(logits, + size=image.size[::-1], # (height, width) + mode='bilinear', + align_corners=False) + + # Second, apply argmax on the class dimension + seg = upsampled_logits.argmax(dim=1)[0] + color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) # height, width, 3\ + + palette = label2color + + for label, color in enumerate(palette): + color_seg[seg == label, :] = color + + # Show image + mask + img = np.array(image) * 0.5 + color_seg * 0.5 + img = img.astype(np.uint8) + + fig, axs = plt.subplots(1, 2, figsize=(20, 10)) + axs[0].imshow(img) + axs[1].imshow(color_seg) + plt.savefig("prova.png") + +if __name__ == "__main__": + torch.cuda.empty_cache() + + ############################################### + ####### Getting configuration settings ######## + config = ConfigParser() + config.read('/home/a.lombardi/my_segformer/configuration.ini') + BATCH_SIZE = config.getint('TRAINING', 'batch_size') + PRETRAINED_WEIGHTS = config.get('MODEL', 'model_to_test') + ############################################### + + ############################################### + ############## Preparing the model ############ + model = SegformerForSemanticSegmentation.from_pretrained(PRETRAINED_WEIGHTS, # Encoder pretrained weights + ignore_mismatched_sizes=True, + #num_labels=len(test_set.labels), + #id2label=test_set.get_id2label(), + #label2id=test_set.get_label2id(), + reshape_last_stage=True) + + if torch.cuda.is_available(): + print("Loading the model on GPU: ", torch.cuda.get_device_name(0)) + model = model.cuda() + else: + print("Using the model on CPU\n") + ############################################### + + image_path = "/home/a.lombardi/CityScapes_Dataset/leftImg8bit/val/munster/munster_000069_000019_leftImg8bit.png" + label2color = CityscapesDataset(path='/home/a.lombardi/CityScapes_Dataset', split='test').get_label2color() + #label2color = ApolloScapeDataset("/home/a.lombardi/ApolloScape_Dataset", split='test', transforms=None).get_label2color() + evaluateOnImage(model=model, image_path=image_path, label2color=label2color) \ No newline at end of file diff --git a/modelv1/Decoder.py b/modelv1/Decoder.py new file mode 100644 index 0000000..06c4632 --- /dev/null +++ b/modelv1/Decoder.py @@ -0,0 +1,31 @@ +from typing import List +from torch import nn + +# A single SegFormerDecoderBlock contains one upsample layer (for the spatial dimension) +# and one conv layer (for the channels). +# The scale_factor parameter is needed to tell it how much we want to upsample the feature. 
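+# A rough shape sketch (assuming `import torch`, and the first decoder stage implied by
+# configuration.ini: in_channels=256, out_channels=128, scale_factor=8 on a 512x512 input):
+#   block = SegFormerDecoderBlock(in_channels=256, out_channels=128, scale_factor=8)
+#   block(torch.randn(1, 256, 16, 16)).shape  # upsample x8, then 1x1 conv -> torch.Size([1, 128, 128, 128])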
+class SegFormerDecoderBlock(nn.Sequential): + def __init__(self, in_channels: int, out_channels: int, scale_factor: int = 2): + super().__init__( + nn.UpsamplingBilinear2d(scale_factor=scale_factor), + nn.Conv2d(in_channels, out_channels, kernel_size=1), + ) + +# SegFormerDecoder is just a list of DecoderBlocks. +# It takes a list of features and returns a list of new features with the same spatial size and channels. +class SegFormerDecoder(nn.Module): + def __init__(self, out_channels: int, widths: List[int], scale_factors: List[int]): + super().__init__() + self.stages = nn.ModuleList( + [ + SegFormerDecoderBlock(in_channels, out_channels, scale_factor) + for in_channels, scale_factor in zip(widths, scale_factors) + ] + ) + + def forward(self, features): + new_features = [] + for feature, stage in zip(features,self.stages): + x = stage(feature) + new_features.append(x) + return new_features \ No newline at end of file diff --git a/modelv1/Encoder.py b/modelv1/Encoder.py new file mode 100644 index 0000000..cd071b3 --- /dev/null +++ b/modelv1/Encoder.py @@ -0,0 +1,56 @@ +from typing import Iterable, List + +import torch +from torch import nn + +from model.EncoderStage import * + +def chunks(data: Iterable, sizes: List[int]): + """ + Given an iterable, returns slices using sizes as indices + """ + curr = 0 + for size in sizes: + chunk = data[curr: curr + size] + curr += size + yield chunk + +class SegFormerEncoder(nn.Module): + def __init__( + self, + in_channels: int, + widths: List[int], + depths: List[int], + all_num_heads: List[int], + patch_sizes: List[int], + overlap_sizes: List[int], + reduction_ratios: List[int], + mlp_expansions: List[int], + drop_prob: float = .0 + ): + super().__init__() + # create drop paths probabilities (one for each stage's block) + drop_probs = [x.item() for x in torch.linspace(0, drop_prob, sum(depths))] + self.stages = nn.ModuleList( + [ + SegFormerEncoderStage(*args) + for args in zip( + [in_channels, *widths], + widths, + patch_sizes, + overlap_sizes, + chunks(drop_probs, sizes=depths), + depths, + reduction_ratios, + all_num_heads, + mlp_expansions + ) + ] + ) + + def forward(self, x): + features = [] + for stage in self.stages: + x = stage(x) + features.append(x) + return features \ No newline at end of file diff --git a/modelv1/EncoderBlock.py b/modelv1/EncoderBlock.py new file mode 100644 index 0000000..3841c68 --- /dev/null +++ b/modelv1/EncoderBlock.py @@ -0,0 +1,127 @@ +import torch +from einops import rearrange +from torch import nn + +# Since nn.LayerNorm in PyTorch works for tensors of shape batch, ...., channels, +# we can create a LayerNorm2d that first swaps the channels axis with the last one, +# then applies layer norm, and swaps it back. +# einops to make the code more readable +class LayerNorm2d(nn.LayerNorm): + def forward(self, x): + x = rearrange(x, "b c h w -> b h w c") + x = super().forward(x) + x = rearrange(x, "b h w c -> b c h w") + return x + +class OverlapPatchMerging(nn.Sequential): + """ Image to Patch Embedding """ + def __init__( + self, in_channels: int, out_channels: int, patch_size: int, overlap_size: int + ): + super().__init__( + nn.Conv2d( + in_channels, + out_channels, + kernel_size=patch_size, + stride=overlap_size, + padding=(patch_size // 2), + bias=False + ), + LayerNorm2d(out_channels) + ) + +# Quoting from the paper: +# We argue that positional encoding is not necessary for semantic segmentation. +# Instead, we introduce Mix-FFN which considers the effect of zero padding to leak location information. 
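+# In the paper this is written (roughly) as x_out = MLP(GELU(Conv3x3(MLP(x_in)))) + x_in.
+# The MixMLP module below implements the inner MLP -> 3x3 depthwise conv -> GELU -> MLP part
+# (here the channel expansion happens in the depthwise conv rather than the first 1x1), while
+# the residual "+ x_in" is added later by the ResidualAdd wrapper in SegFormerEncoderBlock.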
+ +class MixMLP(nn.Sequential): + def __init__(self, channels: int, expansion: int = 4): + super().__init__( + # dense layer + nn.Conv2d(channels, channels, kernel_size=1), + # depth wise conv + nn.Conv2d( + channels, + channels * expansion, + kernel_size=3, + groups=channels, + padding=1, + ), + nn.GELU(), + # dense layer + nn.Conv2d(channels * expansion, channels, kernel_size=1), + ) + # Not using dropout layer as the paper, but very similar to ViT, + # we have skip connections and normalization layers + Stochastic Depth, also known as Drop Path + +from torchvision.ops import StochasticDepth + +class ResidualAdd(nn.Module): + """Just an util layer""" + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x, **kwargs): + out = self.fn(x, **kwargs) + x = x + out + return x + + +# We all know attention has a square complexity O(N^2) where N=H*W in our case. +# We can reduce N by a factor of R, the complexity becomes O(N^2/R). +# One easy way is to flat the spatial dimension and use a linear layer. + +# We have reduced the spatial size by r=4, so by 2 on each dimension (height and width). +# So we can use a convolution layer with a kernel_size=r and a stride=r to achieve the same effect. + +class EfficientMultiHeadAttention(nn.Module): + def __init__(self, channels: int, reduction_ratio: int = 1, num_heads: int = 8): + super().__init__() + self.reducer = nn.Sequential( + nn.Conv2d( + channels, channels, kernel_size=reduction_ratio, stride=reduction_ratio + ), + LayerNorm2d(channels), + ) + self.att = nn.MultiheadAttention( + channels, num_heads=num_heads, batch_first=True + ) + + def forward(self, x): + _, _, h, w = x.shape + reduced_x = self.reducer(x) + # attention needs tensor of shape (batch, sequence_length, channels) + reduced_x = rearrange(reduced_x, "b c h w -> b (h w) c") + x = rearrange(x, "b c h w -> b (h w) c") + out = self.att(x, reduced_x, reduced_x)[0] + # reshape it back to (batch, channels, height, width) + out = rearrange(out, "b (h w) c -> b c h w", h=h, w=w) + return out + +class SegFormerEncoderBlock(nn.Sequential): + def __init__( + self, + channels: int, + reduction_ratio: int = 1, + num_heads: int = 8, + mlp_expansion: int = 4, + drop_path_prob: float = .0 + ): + super().__init__( + ResidualAdd( + nn.Sequential( + LayerNorm2d(channels), + EfficientMultiHeadAttention(channels, reduction_ratio, num_heads), + ) + ), + ResidualAdd( + nn.Sequential( + LayerNorm2d(channels), + MixMLP(channels, expansion=mlp_expansion), + StochasticDepth(p=drop_path_prob, mode="batch") + ) + ), + ) + + diff --git a/modelv1/EncoderStage.py b/modelv1/EncoderStage.py new file mode 100644 index 0000000..60b0b66 --- /dev/null +++ b/modelv1/EncoderStage.py @@ -0,0 +1,31 @@ +from typing import List + +from model.EncoderBlock import * + +class SegFormerEncoderStage(nn.Sequential): + def __init__( + self, + in_channels: int, + out_channels: int, + patch_size: int, + overlap_size: int, + drop_probs: List[int], + depth: int = 2, + reduction_ratio: int = 1, + num_heads: int = 8, + mlp_expansion: int = 4, + ): + super().__init__() + + self.overlap_patch_merge = OverlapPatchMerging( + in_channels, out_channels, patch_size, overlap_size, + ) + self.blocks = nn.Sequential( + *[ + SegFormerEncoderBlock( + out_channels, reduction_ratio, num_heads, mlp_expansion, drop_probs[i] + ) + for i in range(depth) + ] + ) + self.norm = LayerNorm2d(out_channels) \ No newline at end of file diff --git a/modelv1/SegmentationHead.py b/modelv1/SegmentationHead.py new file mode 100644 index 
0000000..ba35757 --- /dev/null +++ b/modelv1/SegmentationHead.py @@ -0,0 +1,17 @@ +from torch import nn, cat + +class SegFormerSegmentationHead(nn.Module): + def __init__(self, channels: int, num_classes: int, num_features: int = 4): + super().__init__() + self.fuse = nn.Sequential( + nn.Conv2d(channels * num_features, channels, kernel_size=1, bias=False), + nn.ReLU(), # why relu? Who knows + nn.BatchNorm2d(channels) # why batchnorm and not layer norm? Idk + ) + self.predict = nn.Conv2d(channels, num_classes, kernel_size=1) + + def forward(self, features): + x = cat(features, dim=1) + x = self.fuse(x) + x = self.predict(x) + return x \ No newline at end of file diff --git a/modelv1/model.py b/modelv1/model.py new file mode 100644 index 0000000..a62f7ea --- /dev/null +++ b/modelv1/model.py @@ -0,0 +1,47 @@ +from typing import List +from torch import nn + +from model.Decoder import SegFormerDecoder +from model.Encoder import * +from model.SegmentationHead import SegFormerSegmentationHead + + +class SegFormer(nn.Module): + def __init__( + self, + in_channels: int, + widths: List[int], + depths: List[int], + all_num_heads: List[int], + patch_sizes: List[int], + overlap_sizes: List[int], + reduction_ratios: List[int], + mlp_expansions: List[int], + decoder_channels: int, + scale_factors: List[int], + num_classes: int, + drop_prob: float = 0.0, + ): + + super().__init__() + self.encoder = SegFormerEncoder( + in_channels, + widths, + depths, + all_num_heads, + patch_sizes, + overlap_sizes, + reduction_ratios, + mlp_expansions, + drop_prob, + ) + self.decoder = SegFormerDecoder(decoder_channels, widths[::-1], scale_factors) + self.head = SegFormerSegmentationHead( + decoder_channels, num_classes, num_features=len(widths) + ) + + def forward(self, x): + features = self.encoder(x) + features = self.decoder(features[::-1]) + segmentation = self.head(features) + return segmentation \ No newline at end of file diff --git a/modelv2/SegformerDecodeHead.py b/modelv2/SegformerDecodeHead.py new file mode 100644 index 0000000..9f9fcda --- /dev/null +++ b/modelv2/SegformerDecodeHead.py @@ -0,0 +1,169 @@ +from transformers import SegformerPreTrainedModel, SegformerConfig +from torch import nn, cat, Tensor +import math + +class SegformerMLP(nn.Module): + """ + Linear Embedding. 
+ """ + + def __init__(self, config: SegformerConfig, input_dim): + super().__init__() + self.proj = nn.Linear(input_dim, config.decoder_hidden_size) + + def forward(self, hidden_states: Tensor): + hidden_states = hidden_states.flatten(2).transpose(1, 2) + hidden_states = self.proj(hidden_states) + return hidden_states + +class SegformerDecodeHead(SegformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size + mlps = [] + for i in range(config.num_encoder_blocks): + mlp = SegformerMLP(config, input_dim=config.hidden_sizes[i]) + mlps.append(mlp) + self.linear_c = nn.ModuleList(mlps) + #self.linear_c = nn.ModuleList(mlps.reverse()) + + # the following 3 layers implement the ConvModule of the original implementation + self.linear_fuse = nn.Conv2d( + in_channels=config.decoder_hidden_size * config.num_encoder_blocks, + out_channels=config.decoder_hidden_size, + kernel_size=1, + bias=False, + ) + + self.linear_fuse_2_hidden_states = nn.Conv2d( + in_channels=config.decoder_hidden_size * 2, + out_channels=config.decoder_hidden_size, + kernel_size=1, + bias=False, + ) + + self.batch_norm = nn.BatchNorm2d(config.decoder_hidden_size) + self.activation = nn.ReLU() + + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Conv2d(config.decoder_hidden_size, config.num_labels, kernel_size=1) + + self.config = config + + def forward(self, encoder_hidden_states): + batch_size = encoder_hidden_states[-1].shape[0] + + all_hidden_states = () + #print(encoder_hidden_states[0].shape) + #print(encoder_hidden_states[1].shape) + #print(encoder_hidden_states[2].shape) + #print(encoder_hidden_states[3].shape) + #print(encoder_hidden_states[4].shape) + #input() + # MY VERSION + + #reversed(encoder_hidden_states) + #FROM + #0. torch.Size([8, 32, 128, 128]) + #1. torch.Size([8, 64, 64, 64]) + #2. torch.Size([8, 160, 32, 32]) + #3. torch.Size([8, 256, 16, 16]) + + #TO + #0. torch.Size([8, 256, 16, 16]) + #1. torch.Size([8, 160, 32, 32]) + #2. torch.Size([8, 64, 64, 64]) + #3. torch.Size([8, 32, 128, 128]) + + ''' + for idx, (encoder_hidden_state, mlp) in reversed(list(enumerate(zip(encoder_hidden_states, self.linear_c)))): + if self.config.reshape_last_stage is False and encoder_hidden_state.ndim == 3: + height = width = int(math.sqrt(encoder_hidden_state.shape[-1])) + encoder_hidden_state = ( + encoder_hidden_state.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous() + ) + + if idx==3: + # 1. First, multi-level features Fi from the MiT encoder goes through an MLP layer to unify the channel dimension + height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3] + encoder_hidden_state = mlp(encoder_hidden_state) + #print(encoder_hidden_state.shape) + encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1) + encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width) + # Partendo dall'ultimo... es. H/32xW/32 + # 2. Features are upsampled to the previous encoder block size + encoder_hidden_state = nn.functional.interpolate( + encoder_hidden_state, size=encoder_hidden_states[idx-1].size()[2:], mode="bilinear", align_corners=False + ) + + + all_hidden_states += (encoder_hidden_state,) + else: + # 1. 
First, multi-level features Fi from the MiT encoder goes through an MLP layer to unify the channel dimension + height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3] + encoder_hidden_state = mlp(encoder_hidden_state) + + encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1) + encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width) + + all_hidden_states += (encoder_hidden_state,) + #print(all_hidden_states[0].shape) + #print(all_hidden_states[1].shape) + #fuse the concatenated features + hidden_states = self.linear_fuse_2_hidden_states(cat(all_hidden_states[::-1], dim=1)) + hidden_states = self.batch_norm(hidden_states) + hidden_states = self.activation(hidden_states) + fused_hidden_states = self.dropout(hidden_states) + + #print("fused: ", fused_hidden_states.shape) + + if idx!=0: + #print(idx) + # 2. Features are upsampled to the previous encoder block size + upsampled_hidden_states = nn.functional.interpolate( + fused_hidden_states, size=encoder_hidden_states[idx-1].size()[2:], mode="bilinear", align_corners=False + ) + #print("upsampled: ", upsampled_hidden_states.shape) + all_hidden_states = () + all_hidden_states += (upsampled_hidden_states,) + + logits = self.classifier(fused_hidden_states) + ''' + ########################### + for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.linear_c): + if self.config.reshape_last_stage is False and encoder_hidden_state.ndim == 3: + height = width = int(math.sqrt(encoder_hidden_state.shape[-1])) + encoder_hidden_state = ( + encoder_hidden_state.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous() + ) + + + # 1. First, multi-level features Fi from the MiT encoder go through an MLP layer to unify the channel dimension + height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3] + encoder_hidden_state = mlp(encoder_hidden_state) + encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1) + encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width) + # 2. Features are upsampled to 1/4th and concatenated togheter + encoder_hidden_state = nn.functional.interpolate( + encoder_hidden_state, size=encoder_hidden_states[0].size()[2:], mode="bilinear", align_corners=False + ) + # concatenate + all_hidden_states += (encoder_hidden_state,) + + # ALL_HIDDEN_STATES SIZES ARE: + #torch.Size([8, 256, 128, 128]) + #torch.Size([8, 256, 128, 128]) + #torch.Size([8, 256, 128, 128]) + #torch.Size([8, 256, 128, 128]) + + # 3. fuse the concatenated features + hidden_states = self.linear_fuse(cat(all_hidden_states[::-1], dim=1)) + hidden_states = self.batch_norm(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + + # 4. MLP layer taking the fused features to make the predictions + # logits are of shape (batch_size, num_labels, height/4, width/4) + logits = self.classifier(hidden_states) + + return logits \ No newline at end of file diff --git a/modelv2/SegformerSkipDecodeHead.py b/modelv2/SegformerSkipDecodeHead.py new file mode 100644 index 0000000..1b7a68b --- /dev/null +++ b/modelv2/SegformerSkipDecodeHead.py @@ -0,0 +1,133 @@ +from transformers import SegformerPreTrainedModel, SegformerConfig +from torch import nn, cat, Tensor +import math + +class SegformerMLP(nn.Module): + """ + Linear Embedding. 
+ """ + + def __init__(self, config: SegformerConfig, input_dim): + super().__init__() + self.proj = nn.Linear(input_dim, config.decoder_hidden_size) + + def forward(self, hidden_states: Tensor): + hidden_states = hidden_states.flatten(2).transpose(1, 2) + hidden_states = self.proj(hidden_states) + return hidden_states + +class SegformerSkipDecodeHead(SegformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size + mlps = [] + for i in range(config.num_encoder_blocks): + mlp = SegformerMLP(config, input_dim=config.hidden_sizes[i]) + mlps.append(mlp) + self.linear_c = nn.ModuleList(mlps) + #self.linear_c = nn.ModuleList(mlps.reverse()) + + # the following 3 layers implement the ConvModule of the original implementation + self.linear_fuse = nn.Conv2d( + in_channels=config.decoder_hidden_size * config.num_encoder_blocks, + out_channels=config.decoder_hidden_size, + kernel_size=1, + bias=False, + ) + + self.linear_fuse_2_hidden_states = nn.Conv2d( + in_channels=config.decoder_hidden_size * 2, + out_channels=config.decoder_hidden_size, + kernel_size=1, + bias=False, + ) + + self.batch_norm = nn.BatchNorm2d(config.decoder_hidden_size) + self.activation = nn.ReLU() + + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Conv2d(config.decoder_hidden_size, config.num_labels, kernel_size=1) + + self.config = config + + def forward(self, encoder_hidden_states): + batch_size = encoder_hidden_states[-1].shape[0] + + all_hidden_states = () + #print(encoder_hidden_states[0].shape) + #print(encoder_hidden_states[1].shape) + #print(encoder_hidden_states[2].shape) + #print(encoder_hidden_states[3].shape) + #print(encoder_hidden_states[4].shape) + #input() + # MY VERSION + + #reversed(encoder_hidden_states) + #FROM + #0. torch.Size([8, 32, 128, 128]) + #1. torch.Size([8, 64, 64, 64]) + #2. torch.Size([8, 160, 32, 32]) + #3. torch.Size([8, 256, 16, 16]) + + #TO + #0. torch.Size([8, 256, 16, 16]) + #1. torch.Size([8, 160, 32, 32]) + #2. torch.Size([8, 64, 64, 64]) + #3. torch.Size([8, 32, 128, 128]) + + + for idx, (encoder_hidden_state, mlp) in reversed(list(enumerate(zip(encoder_hidden_states, self.linear_c)))): + if self.config.reshape_last_stage is False and encoder_hidden_state.ndim == 3: + height = width = int(math.sqrt(encoder_hidden_state.shape[-1])) + encoder_hidden_state = ( + encoder_hidden_state.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous() + ) + + if idx==3: + # 1. First, multi-level features Fi from the MiT encoder goes through an MLP layer to unify the channel dimension + height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3] + encoder_hidden_state = mlp(encoder_hidden_state) + #print(encoder_hidden_state.shape) + encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1) + encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width) + # Partendo dall'ultimo... es. H/32xW/32 + # 2. Features are upsampled to the previous encoder block size + encoder_hidden_state = nn.functional.interpolate( + encoder_hidden_state, size=encoder_hidden_states[idx-1].size()[2:], mode="bilinear", align_corners=False + ) + + + all_hidden_states += (encoder_hidden_state,) + else: + # 1. 
First, multi-level features Fi from the MiT encoder goes through an MLP layer to unify the channel dimension + height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3] + encoder_hidden_state = mlp(encoder_hidden_state) + + encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1) + encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width) + + all_hidden_states += (encoder_hidden_state,) + #print(all_hidden_states[0].shape) + #print(all_hidden_states[1].shape) + #fuse the concatenated features + hidden_states = self.linear_fuse_2_hidden_states(cat(all_hidden_states[::-1], dim=1)) + hidden_states = self.batch_norm(hidden_states) + hidden_states = self.activation(hidden_states) + fused_hidden_states = self.dropout(hidden_states) + + #print("fused: ", fused_hidden_states.shape) + + if idx!=0: + #print(idx) + # 2. Features are upsampled to the previous encoder block size + upsampled_hidden_states = nn.functional.interpolate( + fused_hidden_states, size=encoder_hidden_states[idx-1].size()[2:], mode="bilinear", align_corners=False + ) + #print("upsampled: ", upsampled_hidden_states.shape) + all_hidden_states = () + all_hidden_states += (upsampled_hidden_states,) + + logits = self.classifier(fused_hidden_states) + + + return logits \ No newline at end of file diff --git a/modelv2/Segformer_model.py b/modelv2/Segformer_model.py new file mode 100644 index 0000000..6c7f576 --- /dev/null +++ b/modelv2/Segformer_model.py @@ -0,0 +1,70 @@ +from typing import Optional, Tuple, Union +import torch +from transformers import SegformerModel, SegformerPreTrainedModel, modeling_outputs +from torch.nn import CrossEntropyLoss +from modelv2.SegformerDecodeHead import SegformerDecodeHead +from modelv2.SegformerSkipDecodeHead import SegformerSkipDecodeHead + +class SegformerForSemanticSegmentation(SegformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.segformer = SegformerModel(config) + self.decode_head = SegformerDecodeHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + pixel_values: torch.FloatTensor, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, modeling_outputs.SemanticSegmenterOutput]: + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.segformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + + encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1] + + logits = self.decode_head(encoder_hidden_states) + + loss = None + if labels is not None: # So, if we need output loss calcs as in a training step + if self.config.num_labels == 1: + raise ValueError("The number of labels should be greater than one") + else: + # upsample logits to the images' original size + upsampled_logits = torch.nn.functional.interpolate( + logits, size=labels.shape[-2:], mode="bilinear", align_corners=False + ) + #print("max ", labels.max()) + #print("min ", labels.min()) + # calculate the loss + loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index) + loss = loss_fct(upsampled_logits, labels) + + if not return_dict: 
+ if output_hidden_states: + output = (logits,) + outputs[1:] + else: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return modeling_outputs.SemanticSegmenterOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + diff --git a/test.py b/test.py new file mode 100644 index 0000000..5d9280f --- /dev/null +++ b/test.py @@ -0,0 +1,100 @@ +from codecs import ignore_errors +from modelv2.Segformer_model import SegformerForSemanticSegmentation +from CityscapesDataset import CityscapesDataset +from ApolloScapeDataset import ApolloScapeDataset +from torch.utils.data import DataLoader +from configparser import ConfigParser +import torch +from sklearn.metrics import jaccard_score +from torchmetrics import JaccardIndex +from utils import bcolors +import tqdm +from numpy import mean +import numpy as np +import matplotlib.pyplot as plt + +from datasets import load_metric +# assign gpu devices +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +def test(model: torch.nn.Module, test_loader, num_labels): + + jaccards = [] + jaccard = JaccardIndex(num_classes=num_labels, average="macro", ignore_index=255) + metric = load_metric("mean_iou") + print("-> Testing started:") + with torch.no_grad(): + model.eval() + + for batch in tqdm.tqdm(test_loader): + pixel_values = batch["pixel_values"] + labels = batch["labels"] + + if torch.cuda.is_available(): + pixel_values, labels = pixel_values.cuda(), labels.cuda() + + # evaluate + outputs = model(pixel_values=pixel_values) + # First, rescale logits to original image size + upsampled_logits = torch.nn.functional.interpolate(outputs.logits, size=labels.shape[-2:], mode="bilinear", align_corners=False) + # Second, apply argmax on the class dimension + predicted = upsampled_logits.argmax(dim=1) + + mask = (labels != 255) # we don't include the background class in the accuracy calculation + pred_labels = predicted.detach().cpu() + true_labels = labels.detach().cpu() + + # note that the metric expects predictions + labels as numpy arrays + metric.add_batch(predictions=pred_labels.numpy(), references=true_labels.numpy()) + + #jaccards.append(jaccard_score(y_pred=pred_labels.numpy(), y_true=true_labels.numpy(), average='macro')) + #jaccards.append(jaccard(pred_labels[mask], true_labels[mask])) + + #meanIoU = mean(jaccards) + metrics = metric.compute(num_labels=num_labels, ignore_index=255, + reduce_labels=False)# we've already reduced the labels before + + print("Mean_iou: ", metrics["mean_iou"]) + print("Mean accuracy: ", metrics["mean_accuracy"]) + #print("Jaccard index (mIoU): ", meanIoU) + + +if __name__ == "__main__": + torch.cuda.empty_cache() + + ############################################### + ####### Getting configuration settings ######## + config = ConfigParser() + config.read('/home/a.lombardi/my_segformer/configuration.ini') + BATCH_SIZE = config.getint('TRAINING', 'batch_size') + MODEL = config.get('MODEL', 'model_to_test') + ############################################### + + ############################################### + ############ Preparing the dataset ############ + #test_set = CityscapesDataset(path='/home/a.lombardi/CityScapes_Dataset', split='val', transforms=False) + test_set = ApolloScapeDataset("/home/a.lombardi/ApolloScape_Dataset", split='test', transforms=None) + test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4) + 
############################################### + + num_labels = len(test_set.get_label2id()) + + ############################################### + ############## Preparing the model ############ + model = SegformerForSemanticSegmentation.from_pretrained(MODEL, # Encoder pretrained weights + ignore_mismatched_sizes=True, + num_labels=num_labels, + id2label=test_set.get_id2label(), + label2id=test_set.get_label2id(), + reshape_last_stage=True) + + if torch.cuda.is_available(): + print("Loading the model on GPU: ", torch.cuda.get_device_name(0)) + model = model.cuda() + else: + print("Using the model on CPU\n") + ############################################### + + test(model, + test_loader=test_loader, + num_labels=num_labels) \ No newline at end of file diff --git a/train.py b/train.py new file mode 100644 index 0000000..fb93faa --- /dev/null +++ b/train.py @@ -0,0 +1,170 @@ +from requests import patch +import torch +from math import inf +from utils import bcolors +from model.model import SegFormer + +from Dataset import Cityscapes_Dataset +from torch.utils.data import DataLoader, distributed +import torch.distributed as dist + +from torchsummary import summary +# mean IoU +from configparser import ConfigParser +import json +import os + +# assign gpu devices +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +''' +nodes = 4 +gpus = 4 +nr = 0 +world_size = nodes*gpus +rank = nr * gpus + 0 + +os.environ['MASTER_ADDR'] = '193.205.230.3' +os.environ['MASTER_PORT'] = '8889' +''' + +def training(model: torch.nn.Module, device, train_loader, val_loader, criterion, optimizer, num_classes, epochs=1, print_step=10): + + print("-> Run training") + ''' + dist.init_process_group( + backend='nccl', + init_method='env://', + world_size=world_size, + rank=rank + ) + + # Wrap the model ########## + model = torch.nn.parallel.DistributedDataParallel(model,device_ids=[device]) + ########################### + ''' + + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=10, min_lr=0.0000001, verbose=True) + min_valid_loss = inf + + # cycle through epochs + for e in range(epochs): + + model.train() # turn on train mode + + # TRAINING STEP ---------------------| + train_loss = 0.0 + + for data, labels in train_loader: # cycle thorugh batches + + if torch.cuda.is_available(): + data, labels = data.cuda(), labels.cuda() + + #print("Data shape: ",data.shape) + #print("Labels shape: ",labels.shape) + # forward pass + output = model(data) + #print("Output shape: ",output.shape) + # find the loss + loss = criterion(output, labels) + # clear the gradients + optimizer.zero_grad() + # calculate gradients + loss.backward() + # update weights + optimizer.step() + # update the loss + train_loss += loss.item() + # ------------------------------------| + + # VALIDATION STEP --------------------| + valid_loss = 0.0 + with torch.no_grad(): # disable gradient calculation + model.eval() + + for data, labels in val_loader: + + if torch.cuda.is_available(): + data, labels = data.cuda(), labels.cuda() + + # forward pass + output = model(data) + # find the loss + loss = criterion(output, labels) + # calculate loss + valid_loss += loss.item() + # ------------------------------------| + + scheduler.step(valid_loss) + + # the loss is printed each print_step epochs + if e%print_step==0: + print(f'| {bcolors.BOLD}Epoch {e+1}{bcolors.ENDC} | Training Loss: {train_loss/len(train_loader):.5f} | Validation Loss: {valid_loss/len(val_loader):.5f}' ) + if min_valid_loss > valid_loss: + 
if e%print_step==0: + print(f'{bcolors.OKGREEN}| Val loss decreased ({min_valid_loss:.5f}--->{valid_loss:.5f}) - Saving model.{bcolors.ENDC}') + min_valid_loss = valid_loss + + # Saving the model + model_name = str(f'model_TRAIN_{train_loss/len(train_loader):.6f}_VAL_{valid_loss/len(val_loader):.6f}.pth') + torch.save(model.state_dict(), '/home/a.lombardi/my_segformer/models/' + model_name) + + input() +if __name__ == "__main__": + torch.cuda.empty_cache() + + ################ Getting configuration settings ############### + config = ConfigParser() + config.read('/home/a.lombardi/my_segformer/configuration.ini') + BATCH_SIZE = config.getint('TRAINING', 'batch_size') + ################################################################ + + # load data + train_set = Cityscapes_Dataset(path='/home/a.lombardi/CityScapes_Dataset', + batch_size=BATCH_SIZE, + image_size=config.getint('MODEL','img_train_size'), + split='train') + val_set = Cityscapes_Dataset(path='/home/a.lombardi/CityScapes_Dataset', + batch_size=BATCH_SIZE, + image_size=config.getint('MODEL','img_train_size'), + split='train') + + ''' + train_sampler = distributed.DistributedSampler(train_set,num_replicas=world_size,rank=rank) + val_sampler = distributed.DistributedSampler(val_set,num_replicas=world_size,rank=rank) + ''' + train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4) + + val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4) + + # define and load the model + model_type = 'MODEL' + model = SegFormer(in_channels=config.getint(model_type,'in_channels'), #image channels + widths=json.loads(config.get(model_type,'widths')), + depths=json.loads(config.get(model_type,'depths')), + all_num_heads=json.loads(config.get(model_type,'all_num_heads')), + patch_sizes=json.loads(config.get(model_type,'patch_sizes')), + overlap_sizes=json.loads(config.get(model_type,'overlap_sizes')), + reduction_ratios=json.loads(config.get(model_type,'reduction_ratios')), + mlp_expansions=json.loads(config.get(model_type,'mlp_expansions')), + decoder_channels=config.getint(model_type,'decoder_channels'), + scale_factors=json.loads(config.get(model_type,'scale_factors')), + num_classes=train_set.getNumClasses(), + ).to(device) + + pytorch_total_params = sum(p.numel() for p in model.parameters()) + print(pytorch_total_params) + + if torch.cuda.is_available(): + print("Loading the model on GPU: ", torch.cuda.get_device_name(0)) + model = model.cuda() + else: + print("Using the model on CPU\n") + + # loss function and optimizer + #criterion = torch.nn.MSELoss() + criterion = torch.nn.CrossEntropyLoss() + #criterion = + optimizer = torch.optim.AdamW(model.parameters(),lr=0.00008) # Adam, AdamW or RMSprop + + training(model, device, train_loader, val_loader, criterion, optimizer, num_classes=train_set.getNumClasses(), epochs=500) \ No newline at end of file diff --git a/train_hf.py b/train_hf.py new file mode 100644 index 0000000..d72be57 --- /dev/null +++ b/train_hf.py @@ -0,0 +1,200 @@ +#from transformers import SegformerForSemanticSegmentation +from modelv2.Segformer_model import SegformerForSemanticSegmentation +from CityscapesDataset import CityscapesDataset +from ApolloScapeDataset import ApolloScapeDataset +from torch.utils.data import DataLoader +from configparser import ConfigParser +from math import ceil, inf +import torch +from sklearn.metrics import accuracy_score +from utils import bcolors +from torchvision import transforms as tfs +from torch.utils.tensorboard import SummaryWriter 
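+# Note: SummaryWriter() with no arguments logs to the ./runs/ directory by default; the scalars
+# written below via writer.add_scalar(...) can be inspected with `tensorboard --logdir runs`.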
+writer = SummaryWriter() +writer.flush() +# assign gpu devices +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +def training(model: torch.nn.Module, model_type:str, train_loader, val_loader, criterion, optimizer, epochs=1, print_step=1): + """ Training function + + Args: + model (torch.nn.Module): model to train + model_type (str): name of the model type + device (_type_): GPU or CPU device + train_loader (_type_): train data laoder + val_loader (_type_): validation data loader + criterion (_type_): already initialized loss function + optimizer (_type_): + num_classes (_type_): number of the classes to predict + epochs (int, optional): Training epochs. Defaults to 1. + print_step (int, optional): Defaults to 1. + """ + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', + factor=0.5, patience=5, + min_lr=0.0000001, verbose=True) + + if model_type is None: + model_type = "model" + + # utils data + min_valid_loss = inf + accuracies = [] + losses = [] + val_accuracies = [] + val_losses = [] + + print("-> Training started:") + # cycle through epochs + for e in range(epochs): + + model.train() # turn on train mode + + ############################################### + ############### TRAINING STEP ################# + for batch in train_loader: + pixel_values = batch["pixel_values"] + labels = batch["labels"] + + if torch.cuda.is_available(): + pixel_values, labels = pixel_values.cuda(), labels.cuda() + + # forward pass + outputs = model(pixel_values=pixel_values, labels=labels) + + # First, rescale logits to original image size + upsampled_logits = torch.nn.functional.interpolate(outputs.logits, size=labels.shape[-2:], mode="bilinear", align_corners=False) + # Second, apply argmax on the class dimension + predicted = upsampled_logits.argmax(dim=1) + + mask = (labels != 255) # we don't include the background class in the accuracy calculation + pred_labels = predicted[mask].detach().cpu().numpy() + true_labels = labels[mask].detach().cpu().numpy() + accuracy = accuracy_score(pred_labels, true_labels) + loss = outputs.loss + accuracies.append(accuracy) + losses.append(loss.item()) + + writer.add_scalar("Train loss", loss.item(), e) + + # clear the gradients + optimizer.zero_grad() + # calculate gradients + loss.backward() + # update weights + optimizer.step() + ############################################### + + ############################################### + ################## VAL STEP ################### + with torch.no_grad(): # disable gradient calculation + model.eval() + + for batch in val_loader: + pixel_values = batch["pixel_values"] + labels = batch["labels"] + + if torch.cuda.is_available(): + pixel_values, labels = pixel_values.cuda(), labels.cuda() + + # evaluate + outputs = model(pixel_values=pixel_values, labels=labels) + # First, rescale logits to original image size + upsampled_logits = torch.nn.functional.interpolate(outputs.logits, size=labels.shape[-2:], mode="bilinear", align_corners=False) + predicted = upsampled_logits.argmax(dim=1) + + mask = (labels != 255) # we don't include the background class in the accuracy calculation + pred_labels = predicted[mask].detach().cpu().numpy() + true_labels = labels[mask].detach().cpu().numpy() + accuracy = accuracy_score(pred_labels, true_labels) + val_loss = outputs.loss + val_accuracies.append(accuracy) + val_losses.append(val_loss.item()) + + writer.add_scalar("Val loss", val_loss.item(), e) + ############################################### + + 
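# Note: accuracies/losses and val_accuracies/val_losses accumulate across epochs, so the averages
# printed below and the value passed to the scheduler are running means over all epochs so far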
scheduler.step(sum(val_losses)/len(val_losses)) + + # Print the loss each print_step epochs + if e%print_step==0: + print(f"| Epoch {e}") + print(f"| Train Pixel-wise accuracy: {sum(accuracies)/len(accuracies)}\ + Train Loss: {sum(losses)/len(losses)}\ + Val Pixel-wise accuracy: {sum(val_accuracies)/len(val_accuracies)}\ + Val Loss: {sum(val_losses)/len(val_losses)}\n") + + # Save best model weights + if min_valid_loss > (sum(val_losses)/len(val_losses)): + + train_loss = sum(losses)/len(losses) + min_valid_loss = sum(val_losses)/len(val_losses) + + # Saving the model + print(f"| Epoch {e} - Saving the model with train loss={train_loss:.6f} and val_loss={min_valid_loss:.6f}") + model_name = str(f'{model_type}_TRAIN_{train_loss:.6f}_VAL_{min_valid_loss:.6f}') + model.save_pretrained(str('/home/a.lombardi/my_segformer/models/' + model_name + "/")) + + + +if __name__ == "__main__": + torch.cuda.empty_cache() + + ############################################### + ####### Getting configuration settings ######## + config = ConfigParser() + config.read('/home/a.lombardi/my_segformer/configuration.ini') + BATCH_SIZE = config.getint('TRAINING', 'batch_size') + PRETRAINED_WEIGHTS = config.get('MODEL', 'pretrained_type') + ############################################### + + ############################################### + ############ Preparing the dataset ############ + + transforms = tfs.Compose([ + tfs.RandomHorizontalFlip(p=0.5), + #aug.CenterCrop(1024,1024, always_apply=False, p=0.5), + tfs.RandomCrop(1024), + ]) + + #train_set = CityscapesDataset(path='/home/a.lombardi/CityScapes_Dataset', split='train', transforms=True) + #val_set = CityscapesDataset(path='/home/a.lombardi/CityScapes_Dataset', split='val', transforms=False) + + train_set = ApolloScapeDataset("/home/a.lombardi/ApolloScape_Dataset", split='train', transforms=None) + print(f"trainset len {len(train_set)}") + val_set = ApolloScapeDataset("/home/a.lombardi/ApolloScape_Dataset", split='val', transforms=None) + print(f"valset len {len(val_set)}") + + train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4) + val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4) + ############################################### + + num_labels = len(train_set.get_label2id()) + + ############################################### + ############## Preparing the model ############ + model = SegformerForSemanticSegmentation.from_pretrained(PRETRAINED_WEIGHTS, # Encoder pretrained weights + ignore_mismatched_sizes=True, + num_labels=num_labels, + id2label=train_set.get_id2label(), + label2id=train_set.get_label2id(), + reshape_last_stage=True) + + if torch.cuda.is_available(): + print("Loading the model on GPU: ", torch.cuda.get_device_name(0)) + model = model.cuda() + else: + print("Using the model on CPU\n") + ############################################### + + # loss function and optimizer + # The loss is already been used in the model decode head as the output is given + criterion = torch.nn.CrossEntropyLoss() + #criterion = + optimizer = torch.optim.AdamW(model.parameters(),lr=0.00006) # Adam, AdamW or RMSprop + + training(model=model, model_type="b1ApolloMODIFIED", + train_loader=train_loader, val_loader=val_loader, + criterion=None, optimizer=optimizer, epochs=250, print_step=1) + + \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..637e12d --- /dev/null +++ b/utils.py @@ -0,0 +1,35 @@ +class bcolors: + ''' Basic class to output colored 
strings''' + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKCYAN = '\033[96m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + +import numpy as np + +def to_categorical(y, num_classes=None, dtype="float32"): + ''' + Converts a class vector (integers) to binary class matrix. + E.g. for use with `categorical_crossentropy`. + ''' + + y = np.array(y, dtype="int") + input_shape = y.shape + if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: + input_shape = tuple(input_shape[:-1]) + y = y.ravel() + if not num_classes: + num_classes = np.max(y) + 1 + n = y.shape[0] + categorical = np.zeros((n, num_classes), dtype=dtype) + categorical[np.arange(n), y] = 1 + output_shape = input_shape + (num_classes,) + categorical = np.reshape(categorical, output_shape) + + return categorical +
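to_categorical follows the Keras utility of the same name: it one-hot encodes an integer class array and appends the class dimension last. A quick usage sketch, with values chosen purely for illustration:

import numpy as np
from utils import to_categorical

# a 2x2 "segmentation map" with three classes
y = np.array([[0, 2],
              [1, 1]])

one_hot = to_categorical(y, num_classes=3)
print(one_hot.shape)   # (2, 2, 3)
print(one_hot[0, 1])   # [0. 0. 1.] -> class 2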
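train_hf.py saves the best checkpoint with save_pretrained, and the resulting directory can be reloaded for inference. The snippet below is a minimal sketch under stated assumptions: the checkpoint directory and image path are placeholders, and the stock Hugging Face SegformerForSemanticSegmentation and SegformerFeatureExtractor classes are used here rather than the modified model in modelv2.

import torch
import torch.nn.functional as F
from PIL import Image
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation

# Placeholders: point these at a directory written by model.save_pretrained(...) and at any test image
CHECKPOINT_DIR = "/home/a.lombardi/my_segformer/models/<saved_checkpoint>/"
IMAGE_PATH = "<test_image>.jpg"

# Any SegFormer feature extractor works for preprocessing; this checkpoint name is just an example
feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-cityscapes-512-1024")
model = SegformerForSemanticSegmentation.from_pretrained(CHECKPOINT_DIR)
model.eval()

image = Image.open(IMAGE_PATH).convert("RGB")
inputs = feature_extractor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(pixel_values=inputs["pixel_values"])

# Logits come out at a reduced resolution; upsample to the image size before the argmax
logits = F.interpolate(outputs.logits, size=image.size[::-1], mode="bilinear", align_corners=False)
segmentation_map = logits.argmax(dim=1)[0]   # (H, W) tensor of predicted class ids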