From 39d439ecb52145382e3b0bf74dd5853becd64ca7 Mon Sep 17 00:00:00 2001 From: Andrea Lombardi Date: Fri, 5 Aug 2022 15:48:15 +0200 Subject: [PATCH] first commit --- ApolloScapeDataset.py | 185 ++++++++++++++++++++++++++ CityscapesDataset.py | 177 +++++++++++++++++++++++++ KITTIDataset.py | 81 ++++++++++++ configuration.ini | 20 +++ eval.py | 82 ++++++++++++ modelv1/Decoder.py | 31 +++++ modelv1/Encoder.py | 56 ++++++++ modelv1/EncoderBlock.py | 127 ++++++++++++++++++ modelv1/EncoderStage.py | 31 +++++ modelv1/SegmentationHead.py | 17 +++ modelv1/model.py | 47 +++++++ modelv2/SegformerDecodeHead.py | 169 ++++++++++++++++++++++++ modelv2/SegformerSkipDecodeHead.py | 133 +++++++++++++++++++ modelv2/Segformer_model.py | 70 ++++++++++ test.py | 100 +++++++++++++++ train.py | 170 ++++++++++++++++++++++++ train_hf.py | 200 +++++++++++++++++++++++++++++ utils.py | 35 +++++ 18 files changed, 1731 insertions(+) create mode 100644 ApolloScapeDataset.py create mode 100644 CityscapesDataset.py create mode 100644 KITTIDataset.py create mode 100644 configuration.ini create mode 100644 eval.py create mode 100644 modelv1/Decoder.py create mode 100644 modelv1/Encoder.py create mode 100644 modelv1/EncoderBlock.py create mode 100644 modelv1/EncoderStage.py create mode 100644 modelv1/SegmentationHead.py create mode 100644 modelv1/model.py create mode 100644 modelv2/SegformerDecodeHead.py create mode 100644 modelv2/SegformerSkipDecodeHead.py create mode 100644 modelv2/Segformer_model.py create mode 100644 test.py create mode 100644 train.py create mode 100644 train_hf.py create mode 100644 utils.py diff --git a/ApolloScapeDataset.py b/ApolloScapeDataset.py new file mode 100644 index 0000000..c35a1ea --- /dev/null +++ b/ApolloScapeDataset.py @@ -0,0 +1,185 @@ +import random + +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" Zpark labels""" + +from collections import namedtuple + +from transformers import SegformerFeatureExtractor +from torch.utils.data import Dataset +import os +import cv2 +import torchvision.transforms.functional as TF + +from torchvision import transforms as tfs +import numpy as np + +class ApolloScapeDataset(Dataset): + """KITTI semantic segmentation dataset.""" + + def __init__(self, root_dir:str, split:str='train', transforms=None): + """ + Args: + root_dir (string): Root directory of the dataset containing the images + annotations. 
+ split: the split of the dataset (train, test or val) + """ + assert split=='train' or split=='test' or split=='val', "The split of the dataset must be one between 'train', 'test' or 'val'" + + self.root_dir = root_dir + self.feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-cityscapes-512-1024") + self.transforms = transforms + + self.img_dir = os.path.join(self.root_dir, "ColorImage", split) + self.ann_dir = os.path.join(self.root_dir, "Label", split) + + # read images + image_file_names = [] + for root, dirs, files in os.walk(self.img_dir): + for f in files: + complete_path = os.path.join(root, f) + #print(complete_path) + image_file_names.append(complete_path) + + self.images = sorted(image_file_names) + # read annotations + annotation_file_names = [] + for root, dirs, files in os.walk(self.ann_dir): + for f in files: + complete_path = os.path.join(root, f) + annotation_file_names.append(complete_path) + + self.annotations = sorted(annotation_file_names) + + + assert len(self.images) == len(self.annotations), "There must be as many images as there are segmentation maps" + + # a label and all meta information + Label = namedtuple('Label', [ + 'name' , # The identifier of this label, e.g. 'car', 'person', ... . + # We use them to uniquely name a class + 'clsId' , + 'id' , # An integer ID that is associated with this label. + 'trainId' , + 'category' , # The name of the category that this label belongs to + 'categoryId' , # The ID of this category. Used to create ground truth images on category level. + 'hasInstances', # Whether this label distinguishes between single instances or not + 'ignoreInEval', # Whether pixels having this class as ground truth label are ignored during evaluations or not + 'color' , # The color of this label + ]) + #-------------------------------------------------------------------------------- + # A list of all labels + #-------------------------------------------------------------------------------- + + self.labels = [ + # name clsId id trainId category catId hasInstanceignoreInEval color + Label('others' , 0 , 0, 0 , 'others' , 0 ,False , True , (0, 0, 0) ), + Label('rover' , 0x01 , 1, 1 , 'others' , 0 ,False , True , (0, 0, 0) ), + Label('sky' , 0x11 , 17, 2 , 'sky' , 1 ,False , False , (70, 130, 180) ), + Label('car' , 0x21 , 33, 3 , 'movable object', 2 ,True , False , (0, 0, 142) ), + Label('car_groups' , 0xA1 , 161, 4 , 'movable object', 2 ,True , False , (0, 0, 142) ), + Label('motorbicycle' , 0x22 , 34, 5 , 'movable object', 2 ,True , False , (0, 0, 230) ), + Label('motorbicycle_group' , 0xA2 , 162, 6 , 'movable object', 2 ,True , False , (0, 0, 230) ), + Label('bicycle' , 0x23 , 35, 7 , 'movable object', 2 ,True , False , (119, 11, 32) ), + Label('bicycle_group' , 0xA3 , 163, 8 , 'movable object', 2 ,True , False , (119, 11, 32) ), + Label('person' , 0x24 , 36, 9 , 'movable object', 2 ,True , False , (0, 128, 192) ), + Label('person_group' , 0xA4 , 164, 10 , 'movable object', 2 ,True , False , (0, 128, 192) ), + Label('rider' , 0x25 , 37, 11 , 'movable object', 2 ,True , False , (128, 64, 128) ), + Label('rider_group' , 0xA5 , 165, 12 , 'movable object', 2 ,True , False , (128, 64, 128) ), + Label('truck' , 0x26 , 38, 13 , 'movable object', 2 ,True , False , (128, 0, 192) ), + Label('truck_group' , 0xA6 , 166, 14 , 'movable object', 2 ,True , False , (128, 0, 192) ), + Label('bus' , 0x27 , 39, 15 , 'movable object', 2 ,True , False , (192, 0, 64) ), + Label('bus_group' , 0xA7 , 167, 16 , 'movable object', 2 
,True , False , (192, 0, 64) ), + Label('tricycle' , 0x28 , 40, 17 , 'movable object', 2 ,True , False , (128, 128, 192) ), + Label('tricycle_group' , 0xA8 , 168, 18 , 'movable object', 2 ,True , False , (128, 128, 192) ), + Label('road' , 0x31 , 49, 19 , 'flat' , 3 ,False , False , (192, 128, 192) ), + Label('siderwalk' , 0x32 , 50, 20 , 'flat' , 3 ,False , False , (192, 128, 64) ), + Label('traffic_cone' , 0x41 , 65, 21 , 'road obstacles', 4 ,False , False , (0, 0, 64) ), + Label('road_pile' , 0x42 , 66, 22 , 'road obstacles', 4 ,False , False , (0, 0, 192) ), + Label('fence' , 0x43 , 67, 23 , 'road obstacles', 4 ,False , False , (64, 64, 128) ), + Label('traffic_light' , 0x51 , 81, 24 , 'Roadside objects', 5 ,False , False , (192, 64, 128) ), + Label('pole' , 0x52 , 82, 25 , 'Roadside objects', 5 ,False , False , (192, 128, 128) ), + Label('traffic_sign' , 0x53 , 83, 26 , 'Roadside objects', 5 ,False , False , (0, 64, 64) ), + Label('wall' , 0x54 , 84, 27 , 'Roadside objects', 5 ,False , False , (192, 192, 128) ), + Label('dustbin' , 0x55 , 85, 28 , 'Roadside objects', 5 ,False , False , (64, 0, 192) ), + Label('billboard' , 0x56 , 86, 29 , 'Roadside objects', 5 ,False , False , (192, 0, 192) ), + Label('building' , 0x61 , 97, 30 , 'building' , 6 ,False , False , (192, 0, 128) ), + Label('bridge' , 0x62 , 98, 31 , 'building' , 6 ,False , True , (128, 128, 0) ), + Label('tunnel' , 0x63 , 99, 32 , 'building' , 6 ,False , True , (128, 0, 0) ), + Label('overpass' , 0x64 , 100, 33 , 'building' , 6 ,False , True , (64, 128, 64) ), + Label('vegatation' , 0x71 , 113, 34 , 'natural' , 7 ,False , False , (128, 128, 64) ), + Label('unlabeled' , 0xFF , -1 , -1 , 'unlabeled' , 8 ,False , True , (255, 255, 255) ), + ] + + #-------------------------------------------------------------------------------- + # Create dictionaries for a fast lookup + #-------------------------------------------------------------------------------- + def get_id2label(self): + # return id to label object + id2label = { label.id : label for label in self.labels } + return id2label + + def get_label2id(self): + # return name to label object + name2label = { label.name : label for label in self.labels } + return name2label + + def get_trainId2label(self): + # trainId to label object. This is used as a id2label. 
+ trainId2label = {label.trainId: label for label in self.labels} + return trainId2label + + def get_label2color(self): + # return label to color code dictionary + label2color = {label.color : label for label in self.labels} + return label2color + #-------------------------------------------------------------------------------- + + def __len__(self): + return len(self.images) + + def __getitem__(self, idx): + + image = cv2.imread(os.path.join(self.img_dir, self.images[idx]), cv2.IMREAD_COLOR) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + segmentation_map = cv2.imread(os.path.join(self.ann_dir, self.annotations[idx]), cv2.IMREAD_GRAYSCALE) + for l in self.labels: + segmentation_map = np.where(segmentation_map!=l.id, segmentation_map, l.trainId).astype(np.uint8) + #segmentation_map = cv2.cvtColor(segmentation_map, cv2.COLOR_BGR2GRAY) + + #image = Image.open() + #segmentation_map = Image.open() + + if self.transforms is not None: + augmented = self.transforms(image=image, mask=segmentation_map) + # randomly crop + pad both image and segmentation map to same size + encoded_inputs = self.feature_extractor(augmented['image'], augmented['mask'], return_tensors="pt") + else: + encoded_inputs = self.feature_extractor(image, segmentation_map, return_tensors="pt") + + for k,v in encoded_inputs.items(): + encoded_inputs[k].squeeze_() # remove batch dimension + + return encoded_inputs +''' +ds = ApolloScapeDataset("/home/a.lombardi/ApolloScape_Dataset", split='test', transforms=None) +print(len(ds.labels)) + +prova = ds[55] + +print(prova["pixel_values"].shape) +print(prova["labels"].shape) + +import matplotlib.pyplot as plt +import numpy as np +plt.imshow(prova["labels"].numpy()) +plt.savefig("prova.png") + +p = prova["pixel_values"].numpy() +p = np.swapaxes(p, 0, 2) +p = np.swapaxes(p, 0, 1) +plt.imshow(p) +plt.savefig("prova2.png") +''' \ No newline at end of file diff --git a/CityscapesDataset.py b/CityscapesDataset.py new file mode 100644 index 0000000..b1b1a28 --- /dev/null +++ b/CityscapesDataset.py @@ -0,0 +1,177 @@ +import random +from torch import IntTensor +from transformers import SegformerFeatureExtractor +from torch.utils.data import Dataset +from torchvision.datasets import Cityscapes +import torchvision.transforms.functional as TF + +from torchvision import transforms as tfs + +from collections import namedtuple + +class CityscapesDataset(Dataset): + def __init__(self, path: str, split: str, transforms: bool=False, mode='fine', target_type='semantic'): + """ + Args: + root_dir (string): Root directory of the dataset. + split: Whether to load "training", "validation" or "test" set. + mode: 'fine' or 'coarse" + target_type: for the label type, that can be 'instance', 'semantic' or 'panoptic' + """ + self.split = split + self.dataset = Cityscapes(path, split=split, mode=mode, target_type=target_type) + self.feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-cityscapes-512-1024") #SegformerFeatureExtractor(align=False, reduce_zero_label=False) + self.transforms = transforms + + + Label = namedtuple( 'Label' , [ + 'name' , # The identifier of this label, e.g. 'car', 'person', ... . + # We use them to uniquely name a class + 'id' , # An integer ID that is associated with this label. + # The IDs are used to represent the label in ground truth images + # An ID of -1 means that this label does not have an ID and thus + # is ignored when creating ground truth images (e.g. license plate). 
+ # Do not modify these IDs, since exactly these IDs are expected by the + # evaluation server. + 'trainId' , # Feel free to modify these IDs as suitable for your method. Then create + # ground truth images with train IDs, using the tools provided in the + # 'preparation' folder. However, make sure to validate or submit results + # to our evaluation server using the regular IDs above! + 'category' , # The name of the category that this label belongs to + 'categoryId' , # The ID of this category. Used to create ground truth images on category level. + 'hasInstances', # Whether this label distinguishes between single instances or not + 'ignoreInEval', # Whether pixels having this class as ground truth label are ignored during evaluations or not + 'color' , # The color of this label + ] ) + + #-------------------------------------------------------------------------------- + # A list of all labels + #-------------------------------------------------------------------------------- + + self.labels = [ + # name id trainId category catId hasInstances ignoreInEval color + Label( 'unlabeled' , 0 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'ego vehicle' , 1 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'rectification border' , 2 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'out of roi' , 3 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'static' , 4 , 255 , 'void' , 0 , False , True , ( 0, 0, 0) ), + Label( 'dynamic' , 5 , 255 , 'void' , 0 , False , True , (111, 74, 0) ), + Label( 'ground' , 6 , 255 , 'void' , 0 , False , True , ( 81, 0, 81) ), + Label( 'road' , 7 , 0 , 'flat' , 1 , False , False , (128, 64,128) ), + Label( 'sidewalk' , 8 , 1 , 'flat' , 1 , False , False , (244, 35,232) ), + Label( 'parking' , 9 , 255 , 'flat' , 1 , False , True , (250,170,160) ), + Label( 'rail track' , 10 , 255 , 'flat' , 1 , False , True , (230,150,140) ), + Label( 'building' , 11 , 2 , 'construction' , 2 , False , False , ( 70, 70, 70) ), + Label( 'wall' , 12 , 3 , 'construction' , 2 , False , False , (102,102,156) ), + Label( 'fence' , 13 , 4 , 'construction' , 2 , False , False , (190,153,153) ), + Label( 'guard rail' , 14 , 255 , 'construction' , 2 , False , True , (180,165,180) ), + Label( 'bridge' , 15 , 255 , 'construction' , 2 , False , True , (150,100,100) ), + Label( 'tunnel' , 16 , 255 , 'construction' , 2 , False , True , (150,120, 90) ), + Label( 'pole' , 17 , 5 , 'object' , 3 , False , False , (153,153,153) ), + Label( 'polegroup' , 18 , 255 , 'object' , 3 , False , True , (153,153,153) ), + Label( 'traffic light' , 19 , 6 , 'object' , 3 , False , False , (250,170, 30) ), + Label( 'traffic sign' , 20 , 7 , 'object' , 3 , False , False , (220,220, 0) ), + Label( 'vegetation' , 21 , 8 , 'nature' , 4 , False , False , (107,142, 35) ), + Label( 'terrain' , 22 , 9 , 'nature' , 4 , False , False , (152,251,152) ), + Label( 'sky' , 23 , 10 , 'sky' , 5 , False , False , ( 70,130,180) ), + Label( 'person' , 24 , 11 , 'human' , 6 , True , False , (220, 20, 60) ), + Label( 'rider' , 25 , 12 , 'human' , 6 , True , False , (255, 0, 0) ), + Label( 'car' , 26 , 13 , 'vehicle' , 7 , True , False , ( 0, 0,142) ), + Label( 'truck' , 27 , 14 , 'vehicle' , 7 , True , False , ( 0, 0, 70) ), + Label( 'bus' , 28 , 15 , 'vehicle' , 7 , True , False , ( 0, 60,100) ), + Label( 'caravan' , 29 , 255 , 'vehicle' , 7 , True , True , ( 0, 0, 90) ), + Label( 'trailer' , 30 , 255 , 'vehicle' , 7 , True , True , ( 0, 0,110) ), + Label( 'train' , 31 , 16 , 'vehicle' , 7 , True , 
False , ( 0, 80,100) ), + Label( 'motorcycle' , 32 , 17 , 'vehicle' , 7 , True , False , ( 0, 0,230) ), + Label( 'bicycle' , 33 , 18 , 'vehicle' , 7 , True , False , (119, 11, 32) ), + Label( 'license plate' , -1 , -1 , 'vehicle' , 7 , False , True , ( 0, 0,142) ), + ] + + + #-------------------------------------------------------------------------------- + # Create dictionaries for a fast lookup + #-------------------------------------------------------------------------------- + def get_id2label(self): + # return id to label object + id2label = { label.id : label for label in self.labels } + return id2label + + def get_label2id(self): + # return name to label object + name2label = { label.name : label for label in self.labels } + return name2label + + def get_label2color(self): + # return label to color code dictionary + label2color = {label.color : label for label in self.labels} + return label2color + #-------------------------------------------------------------------------------- + + def getNumClasses(self): + return len(self.dataset.classes) + + def __len__(self): + return len(self.dataset) + + def __transform__(self, image, mask): + + # Resize + #resize = tfs.Resize(size=(512, 1024)) + #image = resize(image) + #segmentation_map = resize(segmentation_map) + + # Random crop + i, j, h, w = tfs.RandomCrop.get_params( + image, output_size=(512,512)) + image = TF.crop(image, i, j, h, w) + mask = TF.crop(mask, i, j, h, w) + + # Random horizontal flipping + if random.random() > 0.5: + image = TF.hflip(image) + mask = TF.hflip(mask) + + # Transform to tensor + #image = TF.to_tensor(image) + #mask = TF.to_tensor(mask) + return image, mask + + def __getitem__(self, idx): + + image, segmentation_map = self.dataset[idx] + + if self.transforms: + + #image = self.cvt_to_tensor(image).numpy() + #segmentation_map = self.cvt_to_tensor(segmentation_map).numpy() + #augmented = self.transforms(image=image, mask=segmentation_map) + + image, segmentation_map = self.__transform__(image, segmentation_map) + + encoded_inputs = self.feature_extractor(images=image, segmentation_maps=segmentation_map, return_tensors="pt") + else: + encoded_inputs = self.feature_extractor(image, segmentation_map, return_tensors="pt") + + for k,v in encoded_inputs.items(): + encoded_inputs[k].squeeze_() # remove batch dimension + + return encoded_inputs +''' +ds = CityscapesDataset(path='/home/a.lombardi/CityScapes_Dataset', split='test', transforms=False) + +prova = ds[60] + +print(prova["pixel_values"].shape) +print(prova["labels"].shape) + +import matplotlib.pyplot as plt +import numpy as np +plt.imshow(prova["labels"].numpy()) +plt.savefig("prova.png") + +p = prova["pixel_values"].numpy() +p = np.swapaxes(p, 0, 2) +p = np.swapaxes(p, 0, 1) +plt.imshow(p) +plt.savefig("prova2.png") +''' \ No newline at end of file diff --git a/KITTIDataset.py b/KITTIDataset.py new file mode 100644 index 0000000..edf6c3b --- /dev/null +++ b/KITTIDataset.py @@ -0,0 +1,81 @@ +import random +from torch import IntTensor +from transformers import SegformerFeatureExtractor +from torch.utils.data import Dataset +from torchvision.datasets import Cityscapes +import os +import cv2 +import torchvision.transforms.functional as TF + +from torchvision import transforms as tfs + +class KITTIDataset(Dataset): + """KITTI semantic segmentation dataset.""" + + def __init__(self, root_dir:str, transforms=None, split:str = 'training'): + """ + Args: + root_dir (string): Root directory of the dataset containing the images + annotations. 
+ feature_extractor (SegFormerFeatureExtractor): feature extractor to prepare images + segmentation maps. + train (bool): Whether to load "training" or "validation" images + annotations. + """ + self.root_dir = root_dir + self.feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-cityscapes-512-1024") + self.transforms = transforms + + self.img_dir = os.path.join(self.root_dir, split, "image_2") + self.ann_dir = os.path.join(self.root_dir, split, "semantic_rgb") + + # read images + image_file_names = [] + for root, dirs, files in os.walk(self.img_dir): + image_file_names.extend(files) + self.images = image_file_names + #self.images = sorted(image_file_names) #They are already sorted + + # read annotations + annotation_file_names = [] + for root, dirs, files in os.walk(self.ann_dir): + annotation_file_names.extend(files) + self.annotations = annotation_file_names + # self.annotations = sorted(annotation_file_names) #They are already sorted + + assert len(self.images) == len(self.annotations), "There must be as many images as there are segmentation maps" + + def __len__(self): + return len(self.images) + + def __getitem__(self, idx): + + #print(self.img_dir) + image = cv2.imread(os.path.join(self.img_dir, self.images[idx])) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + segmentation_map = cv2.imread(os.path.join(self.ann_dir, self.annotations[idx])) + segmentation_map = cv2.cvtColor(segmentation_map, cv2.COLOR_BGR2GRAY) + + #image = Image.open() + #segmentation_map = Image.open() + + if self.transforms is not None: + augmented = self.transforms(image=image, mask=segmentation_map) + # randomly crop + pad both image and segmentation map to same size + encoded_inputs = self.feature_extractor(augmented['image'], augmented['mask'], return_tensors="pt") + else: + encoded_inputs = self.feature_extractor(image, segmentation_map, return_tensors="pt") + + for k,v in encoded_inputs.items(): + encoded_inputs[k].squeeze_() # remove batch dimension + + return encoded_inputs + +ds = KITTIDataset("/home/a.lombardi/KITTI_Dataset", transforms=None, split='training') + +prova = ds[0] + +print(prova["pixel_values"].shape) +print(prova["labels"].shape) + +import matplotlib.pyplot as plt +plt.imshow(prova["labels"].numpy()) +plt.savefig("prova.png") \ No newline at end of file diff --git a/configuration.ini b/configuration.ini new file mode 100644 index 0000000..94fea4e --- /dev/null +++ b/configuration.ini @@ -0,0 +1,20 @@ +[TRAINING] +batch_size=8 +learning_rate=0.00006 + +[MODEL] +#nvidia/mit-b0 +pretrained_type = nvidia/mit-b1 +img_train_size=1024 +model_to_test = /home/a.lombardi/my_segformer/models/b1ApolloNoAug_TRAIN_0.095583_VAL_0.147117/ +data_aug=True +in_channels=3 +widths=[32, 64, 128, 256] +depths=[2, 2, 2, 2] +all_num_heads=[1, 2, 4, 8] +patch_sizes=[7, 3, 3, 3] +overlap_sizes=[4, 2, 2, 2] +reduction_ratios=[8, 4, 2, 1] +mlp_expansions=[4, 4, 4, 4] +decoder_channels=128 +scale_factors=[8, 4, 2, 1] \ No newline at end of file diff --git a/eval.py b/eval.py new file mode 100644 index 0000000..962eafe --- /dev/null +++ b/eval.py @@ -0,0 +1,82 @@ +from transformers import SegformerFeatureExtractor +from modelv2.Segformer_model import SegformerForSemanticSegmentation +from CityscapesDataset import CityscapesDataset +from ApolloScapeDataset import ApolloScapeDataset +from torch.utils.data import DataLoader +from configparser import ConfigParser +import torch +from utils import bcolors +from PIL import Image +import numpy as np +import matplotlib.pyplot as plt + 
+# assign gpu devices +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +def evaluateOnImage(model: torch.nn.Module, image_path:str, label2color:dict): + + image = Image.open(image_path) + image = image.convert("RGB") + # prepare the image for the model (aligned resize) + feature_extractor_inference = SegformerFeatureExtractor(do_random_crop=False, do_pad=False) + + pixel_values = feature_extractor_inference(image, return_tensors="pt").pixel_values.to(device) + + model.eval() + outputs = model(pixel_values=pixel_values)# logits are of shape (batch_size, num_labels, height/4, width/4) + logits = outputs.logits.cpu() + # First, rescale logits to original image size + upsampled_logits = torch.nn.functional.interpolate(logits, + size=image.size[::-1], # (height, width) + mode='bilinear', + align_corners=False) + + # Second, apply argmax on the class dimension + seg = upsampled_logits.argmax(dim=1)[0] + color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) # height, width, 3\ + + palette = label2color + + for label, color in enumerate(palette): + color_seg[seg == label, :] = color + + # Show image + mask + img = np.array(image) * 0.5 + color_seg * 0.5 + img = img.astype(np.uint8) + + fig, axs = plt.subplots(1, 2, figsize=(20, 10)) + axs[0].imshow(img) + axs[1].imshow(color_seg) + plt.savefig("prova.png") + +if __name__ == "__main__": + torch.cuda.empty_cache() + + ############################################### + ####### Getting configuration settings ######## + config = ConfigParser() + config.read('/home/a.lombardi/my_segformer/configuration.ini') + BATCH_SIZE = config.getint('TRAINING', 'batch_size') + PRETRAINED_WEIGHTS = config.get('MODEL', 'model_to_test') + ############################################### + + ############################################### + ############## Preparing the model ############ + model = SegformerForSemanticSegmentation.from_pretrained(PRETRAINED_WEIGHTS, # Encoder pretrained weights + ignore_mismatched_sizes=True, + #num_labels=len(test_set.labels), + #id2label=test_set.get_id2label(), + #label2id=test_set.get_label2id(), + reshape_last_stage=True) + + if torch.cuda.is_available(): + print("Loading the model on GPU: ", torch.cuda.get_device_name(0)) + model = model.cuda() + else: + print("Using the model on CPU\n") + ############################################### + + image_path = "/home/a.lombardi/CityScapes_Dataset/leftImg8bit/val/munster/munster_000069_000019_leftImg8bit.png" + label2color = CityscapesDataset(path='/home/a.lombardi/CityScapes_Dataset', split='test').get_label2color() + #label2color = ApolloScapeDataset("/home/a.lombardi/ApolloScape_Dataset", split='test', transforms=None).get_label2color() + evaluateOnImage(model=model, image_path=image_path, label2color=label2color) \ No newline at end of file diff --git a/modelv1/Decoder.py b/modelv1/Decoder.py new file mode 100644 index 0000000..06c4632 --- /dev/null +++ b/modelv1/Decoder.py @@ -0,0 +1,31 @@ +from typing import List +from torch import nn + +# A single SegFormerDecoderBlock contains one upsample layer (for the spatial dimension) +# and one conv layer (for the channels). +# The scale_factor parameter is needed to tell it how much we want to upsample the feature. 
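+# A rough shape sketch (assuming `import torch`, and the first decoder stage implied by
+# configuration.ini: in_channels=256, out_channels=128, scale_factor=8 on a 512x512 input):
+#   block = SegFormerDecoderBlock(in_channels=256, out_channels=128, scale_factor=8)
+#   block(torch.randn(1, 256, 16, 16)).shape  # upsample x8, then 1x1 conv -> torch.Size([1, 128, 128, 128])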
+class SegFormerDecoderBlock(nn.Sequential): + def __init__(self, in_channels: int, out_channels: int, scale_factor: int = 2): + super().__init__( + nn.UpsamplingBilinear2d(scale_factor=scale_factor), + nn.Conv2d(in_channels, out_channels, kernel_size=1), + ) + +# SegFormerDecoder is just a list of DecoderBlocks. +# It takes a list of features and returns a list of new features with the same spatial size and channels. +class SegFormerDecoder(nn.Module): + def __init__(self, out_channels: int, widths: List[int], scale_factors: List[int]): + super().__init__() + self.stages = nn.ModuleList( + [ + SegFormerDecoderBlock(in_channels, out_channels, scale_factor) + for in_channels, scale_factor in zip(widths, scale_factors) + ] + ) + + def forward(self, features): + new_features = [] + for feature, stage in zip(features,self.stages): + x = stage(feature) + new_features.append(x) + return new_features \ No newline at end of file diff --git a/modelv1/Encoder.py b/modelv1/Encoder.py new file mode 100644 index 0000000..cd071b3 --- /dev/null +++ b/modelv1/Encoder.py @@ -0,0 +1,56 @@ +from typing import Iterable, List + +import torch +from torch import nn + +from model.EncoderStage import * + +def chunks(data: Iterable, sizes: List[int]): + """ + Given an iterable, returns slices using sizes as indices + """ + curr = 0 + for size in sizes: + chunk = data[curr: curr + size] + curr += size + yield chunk + +class SegFormerEncoder(nn.Module): + def __init__( + self, + in_channels: int, + widths: List[int], + depths: List[int], + all_num_heads: List[int], + patch_sizes: List[int], + overlap_sizes: List[int], + reduction_ratios: List[int], + mlp_expansions: List[int], + drop_prob: float = .0 + ): + super().__init__() + # create drop paths probabilities (one for each stage's block) + drop_probs = [x.item() for x in torch.linspace(0, drop_prob, sum(depths))] + self.stages = nn.ModuleList( + [ + SegFormerEncoderStage(*args) + for args in zip( + [in_channels, *widths], + widths, + patch_sizes, + overlap_sizes, + chunks(drop_probs, sizes=depths), + depths, + reduction_ratios, + all_num_heads, + mlp_expansions + ) + ] + ) + + def forward(self, x): + features = [] + for stage in self.stages: + x = stage(x) + features.append(x) + return features \ No newline at end of file diff --git a/modelv1/EncoderBlock.py b/modelv1/EncoderBlock.py new file mode 100644 index 0000000..3841c68 --- /dev/null +++ b/modelv1/EncoderBlock.py @@ -0,0 +1,127 @@ +import torch +from einops import rearrange +from torch import nn + +# Since nn.LayerNorm in PyTorch works for tensors of shape batch, ...., channels, +# we can create a LayerNorm2d that first swaps the channels axis with the last one, +# then applies layer norm, and swaps it back. +# einops to make the code more readable +class LayerNorm2d(nn.LayerNorm): + def forward(self, x): + x = rearrange(x, "b c h w -> b h w c") + x = super().forward(x) + x = rearrange(x, "b h w c -> b c h w") + return x + +class OverlapPatchMerging(nn.Sequential): + """ Image to Patch Embedding """ + def __init__( + self, in_channels: int, out_channels: int, patch_size: int, overlap_size: int + ): + super().__init__( + nn.Conv2d( + in_channels, + out_channels, + kernel_size=patch_size, + stride=overlap_size, + padding=(patch_size // 2), + bias=False + ), + LayerNorm2d(out_channels) + ) + +# Quoting from the paper: +# We argue that positional encoding is not necessary for semantic segmentation. +# Instead, we introduce Mix-FFN which considers the effect of zero padding to leak location information. 
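+# In the paper this is written (roughly) as x_out = MLP(GELU(Conv3x3(MLP(x_in)))) + x_in.
+# The MixMLP module below implements the inner MLP -> 3x3 depthwise conv -> GELU -> MLP part
+# (here the channel expansion happens in the depthwise conv rather than the first 1x1), while
+# the residual "+ x_in" is added later by the ResidualAdd wrapper in SegFormerEncoderBlock.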
+ +class MixMLP(nn.Sequential): + def __init__(self, channels: int, expansion: int = 4): + super().__init__( + # dense layer + nn.Conv2d(channels, channels, kernel_size=1), + # depth wise conv + nn.Conv2d( + channels, + channels * expansion, + kernel_size=3, + groups=channels, + padding=1, + ), + nn.GELU(), + # dense layer + nn.Conv2d(channels * expansion, channels, kernel_size=1), + ) + # Not using dropout layer as the paper, but very similar to ViT, + # we have skip connections and normalization layers + Stochastic Depth, also known as Drop Path + +from torchvision.ops import StochasticDepth + +class ResidualAdd(nn.Module): + """Just an util layer""" + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x, **kwargs): + out = self.fn(x, **kwargs) + x = x + out + return x + + +# We all know attention has a square complexity O(N^2) where N=H*W in our case. +# We can reduce N by a factor of R, the complexity becomes O(N^2/R). +# One easy way is to flat the spatial dimension and use a linear layer. + +# We have reduced the spatial size by r=4, so by 2 on each dimension (height and width). +# So we can use a convolution layer with a kernel_size=r and a stride=r to achieve the same effect. + +class EfficientMultiHeadAttention(nn.Module): + def __init__(self, channels: int, reduction_ratio: int = 1, num_heads: int = 8): + super().__init__() + self.reducer = nn.Sequential( + nn.Conv2d( + channels, channels, kernel_size=reduction_ratio, stride=reduction_ratio + ), + LayerNorm2d(channels), + ) + self.att = nn.MultiheadAttention( + channels, num_heads=num_heads, batch_first=True + ) + + def forward(self, x): + _, _, h, w = x.shape + reduced_x = self.reducer(x) + # attention needs tensor of shape (batch, sequence_length, channels) + reduced_x = rearrange(reduced_x, "b c h w -> b (h w) c") + x = rearrange(x, "b c h w -> b (h w) c") + out = self.att(x, reduced_x, reduced_x)[0] + # reshape it back to (batch, channels, height, width) + out = rearrange(out, "b (h w) c -> b c h w", h=h, w=w) + return out + +class SegFormerEncoderBlock(nn.Sequential): + def __init__( + self, + channels: int, + reduction_ratio: int = 1, + num_heads: int = 8, + mlp_expansion: int = 4, + drop_path_prob: float = .0 + ): + super().__init__( + ResidualAdd( + nn.Sequential( + LayerNorm2d(channels), + EfficientMultiHeadAttention(channels, reduction_ratio, num_heads), + ) + ), + ResidualAdd( + nn.Sequential( + LayerNorm2d(channels), + MixMLP(channels, expansion=mlp_expansion), + StochasticDepth(p=drop_path_prob, mode="batch") + ) + ), + ) + + diff --git a/modelv1/EncoderStage.py b/modelv1/EncoderStage.py new file mode 100644 index 0000000..60b0b66 --- /dev/null +++ b/modelv1/EncoderStage.py @@ -0,0 +1,31 @@ +from typing import List + +from model.EncoderBlock import * + +class SegFormerEncoderStage(nn.Sequential): + def __init__( + self, + in_channels: int, + out_channels: int, + patch_size: int, + overlap_size: int, + drop_probs: List[int], + depth: int = 2, + reduction_ratio: int = 1, + num_heads: int = 8, + mlp_expansion: int = 4, + ): + super().__init__() + + self.overlap_patch_merge = OverlapPatchMerging( + in_channels, out_channels, patch_size, overlap_size, + ) + self.blocks = nn.Sequential( + *[ + SegFormerEncoderBlock( + out_channels, reduction_ratio, num_heads, mlp_expansion, drop_probs[i] + ) + for i in range(depth) + ] + ) + self.norm = LayerNorm2d(out_channels) \ No newline at end of file diff --git a/modelv1/SegmentationHead.py b/modelv1/SegmentationHead.py new file mode 100644 index 
0000000..ba35757 --- /dev/null +++ b/modelv1/SegmentationHead.py @@ -0,0 +1,17 @@ +from torch import nn, cat + +class SegFormerSegmentationHead(nn.Module): + def __init__(self, channels: int, num_classes: int, num_features: int = 4): + super().__init__() + self.fuse = nn.Sequential( + nn.Conv2d(channels * num_features, channels, kernel_size=1, bias=False), + nn.ReLU(), # why relu? Who knows + nn.BatchNorm2d(channels) # why batchnorm and not layer norm? Idk + ) + self.predict = nn.Conv2d(channels, num_classes, kernel_size=1) + + def forward(self, features): + x = cat(features, dim=1) + x = self.fuse(x) + x = self.predict(x) + return x \ No newline at end of file diff --git a/modelv1/model.py b/modelv1/model.py new file mode 100644 index 0000000..a62f7ea --- /dev/null +++ b/modelv1/model.py @@ -0,0 +1,47 @@ +from typing import List +from torch import nn + +from model.Decoder import SegFormerDecoder +from model.Encoder import * +from model.SegmentationHead import SegFormerSegmentationHead + + +class SegFormer(nn.Module): + def __init__( + self, + in_channels: int, + widths: List[int], + depths: List[int], + all_num_heads: List[int], + patch_sizes: List[int], + overlap_sizes: List[int], + reduction_ratios: List[int], + mlp_expansions: List[int], + decoder_channels: int, + scale_factors: List[int], + num_classes: int, + drop_prob: float = 0.0, + ): + + super().__init__() + self.encoder = SegFormerEncoder( + in_channels, + widths, + depths, + all_num_heads, + patch_sizes, + overlap_sizes, + reduction_ratios, + mlp_expansions, + drop_prob, + ) + self.decoder = SegFormerDecoder(decoder_channels, widths[::-1], scale_factors) + self.head = SegFormerSegmentationHead( + decoder_channels, num_classes, num_features=len(widths) + ) + + def forward(self, x): + features = self.encoder(x) + features = self.decoder(features[::-1]) + segmentation = self.head(features) + return segmentation \ No newline at end of file diff --git a/modelv2/SegformerDecodeHead.py b/modelv2/SegformerDecodeHead.py new file mode 100644 index 0000000..9f9fcda --- /dev/null +++ b/modelv2/SegformerDecodeHead.py @@ -0,0 +1,169 @@ +from transformers import SegformerPreTrainedModel, SegformerConfig +from torch import nn, cat, Tensor +import math + +class SegformerMLP(nn.Module): + """ + Linear Embedding. 
+ """ + + def __init__(self, config: SegformerConfig, input_dim): + super().__init__() + self.proj = nn.Linear(input_dim, config.decoder_hidden_size) + + def forward(self, hidden_states: Tensor): + hidden_states = hidden_states.flatten(2).transpose(1, 2) + hidden_states = self.proj(hidden_states) + return hidden_states + +class SegformerDecodeHead(SegformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size + mlps = [] + for i in range(config.num_encoder_blocks): + mlp = SegformerMLP(config, input_dim=config.hidden_sizes[i]) + mlps.append(mlp) + self.linear_c = nn.ModuleList(mlps) + #self.linear_c = nn.ModuleList(mlps.reverse()) + + # the following 3 layers implement the ConvModule of the original implementation + self.linear_fuse = nn.Conv2d( + in_channels=config.decoder_hidden_size * config.num_encoder_blocks, + out_channels=config.decoder_hidden_size, + kernel_size=1, + bias=False, + ) + + self.linear_fuse_2_hidden_states = nn.Conv2d( + in_channels=config.decoder_hidden_size * 2, + out_channels=config.decoder_hidden_size, + kernel_size=1, + bias=False, + ) + + self.batch_norm = nn.BatchNorm2d(config.decoder_hidden_size) + self.activation = nn.ReLU() + + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Conv2d(config.decoder_hidden_size, config.num_labels, kernel_size=1) + + self.config = config + + def forward(self, encoder_hidden_states): + batch_size = encoder_hidden_states[-1].shape[0] + + all_hidden_states = () + #print(encoder_hidden_states[0].shape) + #print(encoder_hidden_states[1].shape) + #print(encoder_hidden_states[2].shape) + #print(encoder_hidden_states[3].shape) + #print(encoder_hidden_states[4].shape) + #input() + # MY VERSION + + #reversed(encoder_hidden_states) + #FROM + #0. torch.Size([8, 32, 128, 128]) + #1. torch.Size([8, 64, 64, 64]) + #2. torch.Size([8, 160, 32, 32]) + #3. torch.Size([8, 256, 16, 16]) + + #TO + #0. torch.Size([8, 256, 16, 16]) + #1. torch.Size([8, 160, 32, 32]) + #2. torch.Size([8, 64, 64, 64]) + #3. torch.Size([8, 32, 128, 128]) + + ''' + for idx, (encoder_hidden_state, mlp) in reversed(list(enumerate(zip(encoder_hidden_states, self.linear_c)))): + if self.config.reshape_last_stage is False and encoder_hidden_state.ndim == 3: + height = width = int(math.sqrt(encoder_hidden_state.shape[-1])) + encoder_hidden_state = ( + encoder_hidden_state.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous() + ) + + if idx==3: + # 1. First, multi-level features Fi from the MiT encoder goes through an MLP layer to unify the channel dimension + height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3] + encoder_hidden_state = mlp(encoder_hidden_state) + #print(encoder_hidden_state.shape) + encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1) + encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width) + # Partendo dall'ultimo... es. H/32xW/32 + # 2. Features are upsampled to the previous encoder block size + encoder_hidden_state = nn.functional.interpolate( + encoder_hidden_state, size=encoder_hidden_states[idx-1].size()[2:], mode="bilinear", align_corners=False + ) + + + all_hidden_states += (encoder_hidden_state,) + else: + # 1. 
First, multi-level features Fi from the MiT encoder goes through an MLP layer to unify the channel dimension + height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3] + encoder_hidden_state = mlp(encoder_hidden_state) + + encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1) + encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width) + + all_hidden_states += (encoder_hidden_state,) + #print(all_hidden_states[0].shape) + #print(all_hidden_states[1].shape) + #fuse the concatenated features + hidden_states = self.linear_fuse_2_hidden_states(cat(all_hidden_states[::-1], dim=1)) + hidden_states = self.batch_norm(hidden_states) + hidden_states = self.activation(hidden_states) + fused_hidden_states = self.dropout(hidden_states) + + #print("fused: ", fused_hidden_states.shape) + + if idx!=0: + #print(idx) + # 2. Features are upsampled to the previous encoder block size + upsampled_hidden_states = nn.functional.interpolate( + fused_hidden_states, size=encoder_hidden_states[idx-1].size()[2:], mode="bilinear", align_corners=False + ) + #print("upsampled: ", upsampled_hidden_states.shape) + all_hidden_states = () + all_hidden_states += (upsampled_hidden_states,) + + logits = self.classifier(fused_hidden_states) + ''' + ########################### + for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.linear_c): + if self.config.reshape_last_stage is False and encoder_hidden_state.ndim == 3: + height = width = int(math.sqrt(encoder_hidden_state.shape[-1])) + encoder_hidden_state = ( + encoder_hidden_state.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous() + ) + + + # 1. First, multi-level features Fi from the MiT encoder go through an MLP layer to unify the channel dimension + height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3] + encoder_hidden_state = mlp(encoder_hidden_state) + encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1) + encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width) + # 2. Features are upsampled to 1/4th and concatenated togheter + encoder_hidden_state = nn.functional.interpolate( + encoder_hidden_state, size=encoder_hidden_states[0].size()[2:], mode="bilinear", align_corners=False + ) + # concatenate + all_hidden_states += (encoder_hidden_state,) + + # ALL_HIDDEN_STATES SIZES ARE: + #torch.Size([8, 256, 128, 128]) + #torch.Size([8, 256, 128, 128]) + #torch.Size([8, 256, 128, 128]) + #torch.Size([8, 256, 128, 128]) + + # 3. fuse the concatenated features + hidden_states = self.linear_fuse(cat(all_hidden_states[::-1], dim=1)) + hidden_states = self.batch_norm(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + + # 4. MLP layer taking the fused features to make the predictions + # logits are of shape (batch_size, num_labels, height/4, width/4) + logits = self.classifier(hidden_states) + + return logits \ No newline at end of file diff --git a/modelv2/SegformerSkipDecodeHead.py b/modelv2/SegformerSkipDecodeHead.py new file mode 100644 index 0000000..1b7a68b --- /dev/null +++ b/modelv2/SegformerSkipDecodeHead.py @@ -0,0 +1,133 @@ +from transformers import SegformerPreTrainedModel, SegformerConfig +from torch import nn, cat, Tensor +import math + +class SegformerMLP(nn.Module): + """ + Linear Embedding. 
+ """ + + def __init__(self, config: SegformerConfig, input_dim): + super().__init__() + self.proj = nn.Linear(input_dim, config.decoder_hidden_size) + + def forward(self, hidden_states: Tensor): + hidden_states = hidden_states.flatten(2).transpose(1, 2) + hidden_states = self.proj(hidden_states) + return hidden_states + +class SegformerSkipDecodeHead(SegformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size + mlps = [] + for i in range(config.num_encoder_blocks): + mlp = SegformerMLP(config, input_dim=config.hidden_sizes[i]) + mlps.append(mlp) + self.linear_c = nn.ModuleList(mlps) + #self.linear_c = nn.ModuleList(mlps.reverse()) + + # the following 3 layers implement the ConvModule of the original implementation + self.linear_fuse = nn.Conv2d( + in_channels=config.decoder_hidden_size * config.num_encoder_blocks, + out_channels=config.decoder_hidden_size, + kernel_size=1, + bias=False, + ) + + self.linear_fuse_2_hidden_states = nn.Conv2d( + in_channels=config.decoder_hidden_size * 2, + out_channels=config.decoder_hidden_size, + kernel_size=1, + bias=False, + ) + + self.batch_norm = nn.BatchNorm2d(config.decoder_hidden_size) + self.activation = nn.ReLU() + + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Conv2d(config.decoder_hidden_size, config.num_labels, kernel_size=1) + + self.config = config + + def forward(self, encoder_hidden_states): + batch_size = encoder_hidden_states[-1].shape[0] + + all_hidden_states = () + #print(encoder_hidden_states[0].shape) + #print(encoder_hidden_states[1].shape) + #print(encoder_hidden_states[2].shape) + #print(encoder_hidden_states[3].shape) + #print(encoder_hidden_states[4].shape) + #input() + # MY VERSION + + #reversed(encoder_hidden_states) + #FROM + #0. torch.Size([8, 32, 128, 128]) + #1. torch.Size([8, 64, 64, 64]) + #2. torch.Size([8, 160, 32, 32]) + #3. torch.Size([8, 256, 16, 16]) + + #TO + #0. torch.Size([8, 256, 16, 16]) + #1. torch.Size([8, 160, 32, 32]) + #2. torch.Size([8, 64, 64, 64]) + #3. torch.Size([8, 32, 128, 128]) + + + for idx, (encoder_hidden_state, mlp) in reversed(list(enumerate(zip(encoder_hidden_states, self.linear_c)))): + if self.config.reshape_last_stage is False and encoder_hidden_state.ndim == 3: + height = width = int(math.sqrt(encoder_hidden_state.shape[-1])) + encoder_hidden_state = ( + encoder_hidden_state.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous() + ) + + if idx==3: + # 1. First, multi-level features Fi from the MiT encoder goes through an MLP layer to unify the channel dimension + height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3] + encoder_hidden_state = mlp(encoder_hidden_state) + #print(encoder_hidden_state.shape) + encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1) + encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width) + # Partendo dall'ultimo... es. H/32xW/32 + # 2. Features are upsampled to the previous encoder block size + encoder_hidden_state = nn.functional.interpolate( + encoder_hidden_state, size=encoder_hidden_states[idx-1].size()[2:], mode="bilinear", align_corners=False + ) + + + all_hidden_states += (encoder_hidden_state,) + else: + # 1. 
First, multi-level features Fi from the MiT encoder goes through an MLP layer to unify the channel dimension + height, width = encoder_hidden_state.shape[2], encoder_hidden_state.shape[3] + encoder_hidden_state = mlp(encoder_hidden_state) + + encoder_hidden_state = encoder_hidden_state.permute(0, 2, 1) + encoder_hidden_state = encoder_hidden_state.reshape(batch_size, -1, height, width) + + all_hidden_states += (encoder_hidden_state,) + #print(all_hidden_states[0].shape) + #print(all_hidden_states[1].shape) + #fuse the concatenated features + hidden_states = self.linear_fuse_2_hidden_states(cat(all_hidden_states[::-1], dim=1)) + hidden_states = self.batch_norm(hidden_states) + hidden_states = self.activation(hidden_states) + fused_hidden_states = self.dropout(hidden_states) + + #print("fused: ", fused_hidden_states.shape) + + if idx!=0: + #print(idx) + # 2. Features are upsampled to the previous encoder block size + upsampled_hidden_states = nn.functional.interpolate( + fused_hidden_states, size=encoder_hidden_states[idx-1].size()[2:], mode="bilinear", align_corners=False + ) + #print("upsampled: ", upsampled_hidden_states.shape) + all_hidden_states = () + all_hidden_states += (upsampled_hidden_states,) + + logits = self.classifier(fused_hidden_states) + + + return logits \ No newline at end of file diff --git a/modelv2/Segformer_model.py b/modelv2/Segformer_model.py new file mode 100644 index 0000000..6c7f576 --- /dev/null +++ b/modelv2/Segformer_model.py @@ -0,0 +1,70 @@ +from typing import Optional, Tuple, Union +import torch +from transformers import SegformerModel, SegformerPreTrainedModel, modeling_outputs +from torch.nn import CrossEntropyLoss +from modelv2.SegformerDecodeHead import SegformerDecodeHead +from modelv2.SegformerSkipDecodeHead import SegformerSkipDecodeHead + +class SegformerForSemanticSegmentation(SegformerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.segformer = SegformerModel(config) + self.decode_head = SegformerDecodeHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + pixel_values: torch.FloatTensor, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, modeling_outputs.SemanticSegmenterOutput]: + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.segformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + + encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1] + + logits = self.decode_head(encoder_hidden_states) + + loss = None + if labels is not None: # So, if we need output loss calcs as in a training step + if self.config.num_labels == 1: + raise ValueError("The number of labels should be greater than one") + else: + # upsample logits to the images' original size + upsampled_logits = torch.nn.functional.interpolate( + logits, size=labels.shape[-2:], mode="bilinear", align_corners=False + ) + #print("max ", labels.max()) + #print("min ", labels.min()) + # calculate the loss + loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index) + loss = loss_fct(upsampled_logits, labels) + + if not return_dict: 
+ if output_hidden_states: + output = (logits,) + outputs[1:] + else: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return modeling_outputs.SemanticSegmenterOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + diff --git a/test.py b/test.py new file mode 100644 index 0000000..5d9280f --- /dev/null +++ b/test.py @@ -0,0 +1,100 @@ +from codecs import ignore_errors +from modelv2.Segformer_model import SegformerForSemanticSegmentation +from CityscapesDataset import CityscapesDataset +from ApolloScapeDataset import ApolloScapeDataset +from torch.utils.data import DataLoader +from configparser import ConfigParser +import torch +from sklearn.metrics import jaccard_score +from torchmetrics import JaccardIndex +from utils import bcolors +import tqdm +from numpy import mean +import numpy as np +import matplotlib.pyplot as plt + +from datasets import load_metric +# assign gpu devices +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +def test(model: torch.nn.Module, test_loader, num_labels): + + jaccards = [] + jaccard = JaccardIndex(num_classes=num_labels, average="macro", ignore_index=255) + metric = load_metric("mean_iou") + print("-> Testing started:") + with torch.no_grad(): + model.eval() + + for batch in tqdm.tqdm(test_loader): + pixel_values = batch["pixel_values"] + labels = batch["labels"] + + if torch.cuda.is_available(): + pixel_values, labels = pixel_values.cuda(), labels.cuda() + + # evaluate + outputs = model(pixel_values=pixel_values) + # First, rescale logits to original image size + upsampled_logits = torch.nn.functional.interpolate(outputs.logits, size=labels.shape[-2:], mode="bilinear", align_corners=False) + # Second, apply argmax on the class dimension + predicted = upsampled_logits.argmax(dim=1) + + mask = (labels != 255) # we don't include the background class in the accuracy calculation + pred_labels = predicted.detach().cpu() + true_labels = labels.detach().cpu() + + # note that the metric expects predictions + labels as numpy arrays + metric.add_batch(predictions=pred_labels.numpy(), references=true_labels.numpy()) + + #jaccards.append(jaccard_score(y_pred=pred_labels.numpy(), y_true=true_labels.numpy(), average='macro')) + #jaccards.append(jaccard(pred_labels[mask], true_labels[mask])) + + #meanIoU = mean(jaccards) + metrics = metric.compute(num_labels=num_labels, ignore_index=255, + reduce_labels=False)# we've already reduced the labels before + + print("Mean_iou: ", metrics["mean_iou"]) + print("Mean accuracy: ", metrics["mean_accuracy"]) + #print("Jaccard index (mIoU): ", meanIoU) + + +if __name__ == "__main__": + torch.cuda.empty_cache() + + ############################################### + ####### Getting configuration settings ######## + config = ConfigParser() + config.read('/home/a.lombardi/my_segformer/configuration.ini') + BATCH_SIZE = config.getint('TRAINING', 'batch_size') + MODEL = config.get('MODEL', 'model_to_test') + ############################################### + + ############################################### + ############ Preparing the dataset ############ + #test_set = CityscapesDataset(path='/home/a.lombardi/CityScapes_Dataset', split='val', transforms=False) + test_set = ApolloScapeDataset("/home/a.lombardi/ApolloScape_Dataset", split='test', transforms=None) + test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4) + 
############################################### + + num_labels = len(test_set.get_label2id()) + + ############################################### + ############## Preparing the model ############ + model = SegformerForSemanticSegmentation.from_pretrained(MODEL, # Encoder pretrained weights + ignore_mismatched_sizes=True, + num_labels=num_labels, + id2label=test_set.get_id2label(), + label2id=test_set.get_label2id(), + reshape_last_stage=True) + + if torch.cuda.is_available(): + print("Loading the model on GPU: ", torch.cuda.get_device_name(0)) + model = model.cuda() + else: + print("Using the model on CPU\n") + ############################################### + + test(model, + test_loader=test_loader, + num_labels=num_labels) \ No newline at end of file diff --git a/train.py b/train.py new file mode 100644 index 0000000..fb93faa --- /dev/null +++ b/train.py @@ -0,0 +1,170 @@ +from requests import patch +import torch +from math import inf +from utils import bcolors +from model.model import SegFormer + +from Dataset import Cityscapes_Dataset +from torch.utils.data import DataLoader, distributed +import torch.distributed as dist + +from torchsummary import summary +# mean IoU +from configparser import ConfigParser +import json +import os + +# assign gpu devices +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +''' +nodes = 4 +gpus = 4 +nr = 0 +world_size = nodes*gpus +rank = nr * gpus + 0 + +os.environ['MASTER_ADDR'] = '193.205.230.3' +os.environ['MASTER_PORT'] = '8889' +''' + +def training(model: torch.nn.Module, device, train_loader, val_loader, criterion, optimizer, num_classes, epochs=1, print_step=10): + + print("-> Run training") + ''' + dist.init_process_group( + backend='nccl', + init_method='env://', + world_size=world_size, + rank=rank + ) + + # Wrap the model ########## + model = torch.nn.parallel.DistributedDataParallel(model,device_ids=[device]) + ########################### + ''' + + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=10, min_lr=0.0000001, verbose=True) + min_valid_loss = inf + + # cycle through epochs + for e in range(epochs): + + model.train() # turn on train mode + + # TRAINING STEP ---------------------| + train_loss = 0.0 + + for data, labels in train_loader: # cycle thorugh batches + + if torch.cuda.is_available(): + data, labels = data.cuda(), labels.cuda() + + #print("Data shape: ",data.shape) + #print("Labels shape: ",labels.shape) + # forward pass + output = model(data) + #print("Output shape: ",output.shape) + # find the loss + loss = criterion(output, labels) + # clear the gradients + optimizer.zero_grad() + # calculate gradients + loss.backward() + # update weights + optimizer.step() + # update the loss + train_loss += loss.item() + # ------------------------------------| + + # VALIDATION STEP --------------------| + valid_loss = 0.0 + with torch.no_grad(): # disable gradient calculation + model.eval() + + for data, labels in val_loader: + + if torch.cuda.is_available(): + data, labels = data.cuda(), labels.cuda() + + # forward pass + output = model(data) + # find the loss + loss = criterion(output, labels) + # calculate loss + valid_loss += loss.item() + # ------------------------------------| + + scheduler.step(valid_loss) + + # the loss is printed each print_step epochs + if e%print_step==0: + print(f'| {bcolors.BOLD}Epoch {e+1}{bcolors.ENDC} | Training Loss: {train_loss/len(train_loader):.5f} | Validation Loss: {valid_loss/len(val_loader):.5f}' ) + if min_valid_loss > valid_loss: + 
if e%print_step==0: + print(f'{bcolors.OKGREEN}| Val loss decreased ({min_valid_loss:.5f}--->{valid_loss:.5f}) - Saving model.{bcolors.ENDC}') + min_valid_loss = valid_loss + + # Saving the model + model_name = str(f'model_TRAIN_{train_loss/len(train_loader):.6f}_VAL_{valid_loss/len(val_loader):.6f}.pth') + torch.save(model.state_dict(), '/home/a.lombardi/my_segformer/models/' + model_name) + + input() +if __name__ == "__main__": + torch.cuda.empty_cache() + + ################ Getting configuration settings ############### + config = ConfigParser() + config.read('/home/a.lombardi/my_segformer/configuration.ini') + BATCH_SIZE = config.getint('TRAINING', 'batch_size') + ################################################################ + + # load data + train_set = Cityscapes_Dataset(path='/home/a.lombardi/CityScapes_Dataset', + batch_size=BATCH_SIZE, + image_size=config.getint('MODEL','img_train_size'), + split='train') + val_set = Cityscapes_Dataset(path='/home/a.lombardi/CityScapes_Dataset', + batch_size=BATCH_SIZE, + image_size=config.getint('MODEL','img_train_size'), + split='train') + + ''' + train_sampler = distributed.DistributedSampler(train_set,num_replicas=world_size,rank=rank) + val_sampler = distributed.DistributedSampler(val_set,num_replicas=world_size,rank=rank) + ''' + train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4) + + val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4) + + # define and load the model + model_type = 'MODEL' + model = SegFormer(in_channels=config.getint(model_type,'in_channels'), #image channels + widths=json.loads(config.get(model_type,'widths')), + depths=json.loads(config.get(model_type,'depths')), + all_num_heads=json.loads(config.get(model_type,'all_num_heads')), + patch_sizes=json.loads(config.get(model_type,'patch_sizes')), + overlap_sizes=json.loads(config.get(model_type,'overlap_sizes')), + reduction_ratios=json.loads(config.get(model_type,'reduction_ratios')), + mlp_expansions=json.loads(config.get(model_type,'mlp_expansions')), + decoder_channels=config.getint(model_type,'decoder_channels'), + scale_factors=json.loads(config.get(model_type,'scale_factors')), + num_classes=train_set.getNumClasses(), + ).to(device) + + pytorch_total_params = sum(p.numel() for p in model.parameters()) + print(pytorch_total_params) + + if torch.cuda.is_available(): + print("Loading the model on GPU: ", torch.cuda.get_device_name(0)) + model = model.cuda() + else: + print("Using the model on CPU\n") + + # loss function and optimizer + #criterion = torch.nn.MSELoss() + criterion = torch.nn.CrossEntropyLoss() + #criterion = + optimizer = torch.optim.AdamW(model.parameters(),lr=0.00008) # Adam, AdamW or RMSprop + + training(model, device, train_loader, val_loader, criterion, optimizer, num_classes=train_set.getNumClasses(), epochs=500) \ No newline at end of file diff --git a/train_hf.py b/train_hf.py new file mode 100644 index 0000000..d72be57 --- /dev/null +++ b/train_hf.py @@ -0,0 +1,200 @@ +#from transformers import SegformerForSemanticSegmentation +from modelv2.Segformer_model import SegformerForSemanticSegmentation +from CityscapesDataset import CityscapesDataset +from ApolloScapeDataset import ApolloScapeDataset +from torch.utils.data import DataLoader +from configparser import ConfigParser +from math import ceil, inf +import torch +from sklearn.metrics import accuracy_score +from utils import bcolors +from torchvision import transforms as tfs +from torch.utils.tensorboard import SummaryWriter 
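+# Note: SummaryWriter() with no arguments logs to the ./runs/ directory by default; the scalars
+# written below via writer.add_scalar(...) can be inspected with `tensorboard --logdir runs`.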
+writer = SummaryWriter() +writer.flush() +# assign gpu devices +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +def training(model: torch.nn.Module, model_type:str, train_loader, val_loader, criterion, optimizer, epochs=1, print_step=1): + """ Training function + + Args: + model (torch.nn.Module): model to train + model_type (str): name of the model type + device (_type_): GPU or CPU device + train_loader (_type_): train data laoder + val_loader (_type_): validation data loader + criterion (_type_): already initialized loss function + optimizer (_type_): + num_classes (_type_): number of the classes to predict + epochs (int, optional): Training epochs. Defaults to 1. + print_step (int, optional): Defaults to 1. + """ + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', + factor=0.5, patience=5, + min_lr=0.0000001, verbose=True) + + if model_type is None: + model_type = "model" + + # utils data + min_valid_loss = inf + accuracies = [] + losses = [] + val_accuracies = [] + val_losses = [] + + print("-> Training started:") + # cycle through epochs + for e in range(epochs): + + model.train() # turn on train mode + + ############################################### + ############### TRAINING STEP ################# + for batch in train_loader: + pixel_values = batch["pixel_values"] + labels = batch["labels"] + + if torch.cuda.is_available(): + pixel_values, labels = pixel_values.cuda(), labels.cuda() + + # forward pass + outputs = model(pixel_values=pixel_values, labels=labels) + + # First, rescale logits to original image size + upsampled_logits = torch.nn.functional.interpolate(outputs.logits, size=labels.shape[-2:], mode="bilinear", align_corners=False) + # Second, apply argmax on the class dimension + predicted = upsampled_logits.argmax(dim=1) + + mask = (labels != 255) # we don't include the background class in the accuracy calculation + pred_labels = predicted[mask].detach().cpu().numpy() + true_labels = labels[mask].detach().cpu().numpy() + accuracy = accuracy_score(pred_labels, true_labels) + loss = outputs.loss + accuracies.append(accuracy) + losses.append(loss.item()) + + writer.add_scalar("Train loss", loss.item(), e) + + # clear the gradients + optimizer.zero_grad() + # calculate gradients + loss.backward() + # update weights + optimizer.step() + ############################################### + + ############################################### + ################## VAL STEP ################### + with torch.no_grad(): # disable gradient calculation + model.eval() + + for batch in val_loader: + pixel_values = batch["pixel_values"] + labels = batch["labels"] + + if torch.cuda.is_available(): + pixel_values, labels = pixel_values.cuda(), labels.cuda() + + # evaluate + outputs = model(pixel_values=pixel_values, labels=labels) + # First, rescale logits to original image size + upsampled_logits = torch.nn.functional.interpolate(outputs.logits, size=labels.shape[-2:], mode="bilinear", align_corners=False) + predicted = upsampled_logits.argmax(dim=1) + + mask = (labels != 255) # we don't include the background class in the accuracy calculation + pred_labels = predicted[mask].detach().cpu().numpy() + true_labels = labels[mask].detach().cpu().numpy() + accuracy = accuracy_score(pred_labels, true_labels) + val_loss = outputs.loss + val_accuracies.append(accuracy) + val_losses.append(val_loss.item()) + + writer.add_scalar("Val loss", val_loss.item(), e) + ############################################### + + 
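# Note: accuracies/losses and val_accuracies/val_losses accumulate across epochs, so the averages
# printed below and the value passed to the scheduler are running means over all epochs so far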
scheduler.step(sum(val_losses)/len(val_losses)) + + # Print the loss each print_step epochs + if e%print_step==0: + print(f"| Epoch {e}") + print(f"| Train Pixel-wise accuracy: {sum(accuracies)/len(accuracies)}\ + Train Loss: {sum(losses)/len(losses)}\ + Val Pixel-wise accuracy: {sum(val_accuracies)/len(val_accuracies)}\ + Val Loss: {sum(val_losses)/len(val_losses)}\n") + + # Save best model weights + if min_valid_loss > (sum(val_losses)/len(val_losses)): + + train_loss = sum(losses)/len(losses) + min_valid_loss = sum(val_losses)/len(val_losses) + + # Saving the model + print(f"| Epoch {e} - Saving the model with train loss={train_loss:.6f} and val_loss={min_valid_loss:.6f}") + model_name = str(f'{model_type}_TRAIN_{train_loss:.6f}_VAL_{min_valid_loss:.6f}') + model.save_pretrained(str('/home/a.lombardi/my_segformer/models/' + model_name + "/")) + + + +if __name__ == "__main__": + torch.cuda.empty_cache() + + ############################################### + ####### Getting configuration settings ######## + config = ConfigParser() + config.read('/home/a.lombardi/my_segformer/configuration.ini') + BATCH_SIZE = config.getint('TRAINING', 'batch_size') + PRETRAINED_WEIGHTS = config.get('MODEL', 'pretrained_type') + ############################################### + + ############################################### + ############ Preparing the dataset ############ + + transforms = tfs.Compose([ + tfs.RandomHorizontalFlip(p=0.5), + #aug.CenterCrop(1024,1024, always_apply=False, p=0.5), + tfs.RandomCrop(1024), + ]) + + #train_set = CityscapesDataset(path='/home/a.lombardi/CityScapes_Dataset', split='train', transforms=True) + #val_set = CityscapesDataset(path='/home/a.lombardi/CityScapes_Dataset', split='val', transforms=False) + + train_set = ApolloScapeDataset("/home/a.lombardi/ApolloScape_Dataset", split='train', transforms=None) + print(f"trainset len {len(train_set)}") + val_set = ApolloScapeDataset("/home/a.lombardi/ApolloScape_Dataset", split='val', transforms=None) + print(f"valset len {len(val_set)}") + + train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4) + val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4) + ############################################### + + num_labels = len(train_set.get_label2id()) + + ############################################### + ############## Preparing the model ############ + model = SegformerForSemanticSegmentation.from_pretrained(PRETRAINED_WEIGHTS, # Encoder pretrained weights + ignore_mismatched_sizes=True, + num_labels=num_labels, + id2label=train_set.get_id2label(), + label2id=train_set.get_label2id(), + reshape_last_stage=True) + + if torch.cuda.is_available(): + print("Loading the model on GPU: ", torch.cuda.get_device_name(0)) + model = model.cuda() + else: + print("Using the model on CPU\n") + ############################################### + + # loss function and optimizer + # The loss is already been used in the model decode head as the output is given + criterion = torch.nn.CrossEntropyLoss() + #criterion = + optimizer = torch.optim.AdamW(model.parameters(),lr=0.00006) # Adam, AdamW or RMSprop + + training(model=model, model_type="b1ApolloMODIFIED", + train_loader=train_loader, val_loader=val_loader, + criterion=None, optimizer=optimizer, epochs=250, print_step=1) + + \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..637e12d --- /dev/null +++ b/utils.py @@ -0,0 +1,35 @@ +class bcolors: + ''' Basic class to output colored 
strings''' + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKCYAN = '\033[96m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + +import numpy as np + +def to_categorical(y, num_classes=None, dtype="float32"): + ''' + Converts a class vector (integers) to binary class matrix. + E.g. for use with `categorical_crossentropy`. + ''' + + y = np.array(y, dtype="int") + input_shape = y.shape + if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: + input_shape = tuple(input_shape[:-1]) + y = y.ravel() + if not num_classes: + num_classes = np.max(y) + 1 + n = y.shape[0] + categorical = np.zeros((n, num_classes), dtype=dtype) + categorical[np.arange(n), y] = 1 + output_shape = input_shape + (num_classes,) + categorical = np.reshape(categorical, output_shape) + + return categorical +
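to_categorical follows the Keras utility of the same name: it one-hot encodes an integer class array and appends the class dimension last. A quick usage sketch, with values chosen purely for illustration:

import numpy as np
from utils import to_categorical

# a 2x2 "segmentation map" with three classes
y = np.array([[0, 2],
              [1, 1]])

one_hot = to_categorical(y, num_classes=3)
print(one_hot.shape)   # (2, 2, 3)
print(one_hot[0, 1])   # [0. 0. 1.] -> class 2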
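train_hf.py saves the best checkpoint with save_pretrained, and the resulting directory can be reloaded for inference. The snippet below is a minimal sketch under stated assumptions: the checkpoint directory and image path are placeholders, and the stock Hugging Face SegformerForSemanticSegmentation and SegformerFeatureExtractor classes are used here rather than the modified model in modelv2.

import torch
import torch.nn.functional as F
from PIL import Image
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation

# Placeholders: point these at a directory written by model.save_pretrained(...) and at any test image
CHECKPOINT_DIR = "/home/a.lombardi/my_segformer/models/<saved_checkpoint>/"
IMAGE_PATH = "<test_image>.jpg"

# Any SegFormer feature extractor works for preprocessing; this checkpoint name is just an example
feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-cityscapes-512-1024")
model = SegformerForSemanticSegmentation.from_pretrained(CHECKPOINT_DIR)
model.eval()

image = Image.open(IMAGE_PATH).convert("RGB")
inputs = feature_extractor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(pixel_values=inputs["pixel_values"])

# Logits come out at a reduced resolution; upsample to the image size before the argmax
logits = F.interpolate(outputs.logits, size=image.size[::-1], mode="bilinear", align_corners=False)
segmentation_map = logits.argmax(dim=1)[0]   # (H, W) tensor of predicted class ids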