Skip to content

Commit 1d62b49

Browse files
committed
Updated Dataloader for all 4 splits
1 parent e8cda08 commit 1d62b49

File tree

3 files changed

+118
-37
lines changed

dataloader/CaptainCookStepDataset.py

Lines changed: 113 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -15,46 +15,127 @@ def __init__(self, config, phase, split):
1515
self._phase = phase
1616
self._split = split
1717

18-
if self._split is None:
19-
self._split = "recordings"
18+
with open('../annotations/annotation_json/step_annotations.json', 'r') as f:
19+
self._annotations = json.load(f)
2020

2121
assert self._phase in ["train", "val", "test"], f"Invalid phase: {self._phase}"
2222
self._features_directory = self._config.features_directory
2323

24-
self._recording_ids_file = f"{self._split}_data_split_combined.json"
24+
if self._split == 'shuffle':
25+
self._recording_ids_file = f"recordings_combined_splits.json"
26+
print(f"Loading recording ids from {self._recording_ids_file}")
27+
28+
with open(f'../er_annotations/{self._recording_ids_file}', 'r') as file:
29+
self._recording_ids_json = json.load(file)
30+
31+
self._recording_ids = self._recording_ids_json['train'] + self._recording_ids_json['val'] + self._recording_ids_json['test']
32+
33+
self._step_dict = {}
34+
step_index_id = 0
35+
for recording_id in self._recording_ids:
36+
self._normal_step_dict = {}
37+
self._error_step_dict = {}
38+
normal_index_id = 0
39+
error_index_id = 0
40+
# 1. Prepare step_id, list(<start, end>) for the recording_id
41+
recording_step_dictionary = {}
42+
for step in self._annotations[recording_id]['steps']:
43+
if step['start_time'] < 0 or step['end_time'] < 0:
44+
# Ignore missing steps
45+
continue
46+
if recording_step_dictionary.get(step['step_id']) is None:
47+
recording_step_dictionary[step['step_id']] = []
48+
49+
recording_step_dictionary[step['step_id']].append(
50+
(math.floor(step['start_time']), math.ceil(step['end_time']), step['has_errors']))
51+
52+
# 2. Add step start and end time list to the step_dict
53+
for step_id in recording_step_dictionary.keys():
54+
# If the step has errors, add it to the error_step_dict, else add it to the normal_step_dict
55+
if recording_step_dictionary[step_id][0][2]:
56+
self._error_step_dict[f'E{error_index_id}'] = (recording_id, recording_step_dictionary[step_id])
57+
error_index_id += 1
58+
else:
59+
self._normal_step_dict[f'N{normal_index_id}'] = (
60+
recording_id, recording_step_dictionary[step_id])
61+
normal_index_id += 1
62+
63+
np.random.seed(config.seed)
64+
np.random.shuffle(list(self._normal_step_dict.keys()))
65+
np.random.shuffle(list(self._error_step_dict.keys()))
66+
67+
normal_step_indices = list(self._normal_step_dict.keys())
68+
error_step_indices = list(self._error_step_dict.keys())
69+
70+
self._split_proportion = [0.75, 0.16, 0.9]
71+
72+
num_normal_steps = len(normal_step_indices)
73+
num_error_steps = len(error_step_indices)
74+
75+
self._split_proportion_normal = [int(num_normal_steps * self._split_proportion[0]),
76+
int(num_normal_steps * (
77+
self._split_proportion[0] + self._split_proportion[1]))]
78+
self._split_proportion_error = [int(num_error_steps * self._split_proportion[0]),
79+
int(num_error_steps * (
80+
self._split_proportion[0] + self._split_proportion[1]))]
81+
82+
if phase == 'train':
83+
self._train_normal = normal_step_indices[:self._split_proportion_normal[0]]
84+
self._train_error = error_step_indices[:self._split_proportion_error[0]]
85+
train_indices = self._train_normal + self._train_error
86+
for index_id in train_indices:
87+
self._step_dict[step_index_id] = self._normal_step_dict.get(index_id,
88+
self._error_step_dict.get(index_id))
89+
step_index_id += 1
90+
elif phase == 'test':
91+
self._val_normal = normal_step_indices[
92+
self._split_proportion_normal[0]:self._split_proportion_normal[1]]
93+
self._val_error = error_step_indices[
94+
self._split_proportion_error[0]:self._split_proportion_error[1]]
95+
val_indices = self._val_normal + self._val_error
96+
for index_id in val_indices:
97+
self._step_dict[step_index_id] = self._normal_step_dict.get(index_id,
98+
self._error_step_dict.get(index_id))
99+
step_index_id += 1
100+
elif phase == 'val':
101+
self._test_normal = normal_step_indices[self._split_proportion_normal[1]:]
102+
self._test_error = error_step_indices[self._split_proportion_error[1]:]
103+
test_indices = self._test_normal + self._test_error
104+
for index_id in test_indices:
105+
self._step_dict[step_index_id] = self._normal_step_dict.get(index_id,
106+
self._error_step_dict.get(index_id))
107+
step_index_id += 1
25108

26-
print(f"Loading recording ids from {self._recording_ids_file}")
109+
else:
27110

28-
with open(f'../annotations/data_splits/{self._recording_ids_file}', 'r') as file:
29-
self._recording_ids_json = json.load(file)
111+
self._recording_ids_file = f"{self._split}_combined_splits.json"
112+
113+
print(f"Loading recording ids from {self._recording_ids_file}")
114+
115+
with open(f'../er_annotations/{self._recording_ids_file}', 'r') as file:
116+
self._recording_ids_json = json.load(file)
30117

31-
if self._phase == 'train':
32-
self._recording_ids = self._recording_ids_json['train'] + self._recording_ids_json['val']
33-
else:
34118
self._recording_ids = self._recording_ids_json[self._phase]
35119

36-
with open('../annotations/annotation_json/step_annotations.json', 'r') as f:
37-
self._annotations = json.load(f)
38-
39-
self._step_dict = {}
40-
index_id = 0
41-
for recording in self._recording_ids:
42-
# 1. Prepare step_id, list(<start, end>) for the recording_id
43-
recording_step_dictionary = {}
44-
for step in self._annotations[recording]['steps']:
45-
if step['start_time'] < 0 or step['end_time'] < 0:
46-
# Ignore missing steps
47-
continue
48-
if recording_step_dictionary.get(step['step_id']) is None:
49-
recording_step_dictionary[step['step_id']] = []
50-
51-
recording_step_dictionary[step['step_id']].append(
52-
(math.floor(step['start_time']), math.ceil(step['end_time']), step['has_errors']))
53-
54-
# 2. Add step start and end time list to the step_dict
55-
for step_id in recording_step_dictionary.keys():
56-
self._step_dict[index_id] = (recording, recording_step_dictionary[step_id])
57-
index_id += 1
120+
self._step_dict = {}
121+
index_id = 0
122+
for recording in self._recording_ids:
123+
# 1. Prepare step_id, list(<start, end>) for the recording_id
124+
recording_step_dictionary = {}
125+
for step in self._annotations[recording]['steps']:
126+
if step['start_time'] < 0 or step['end_time'] < 0:
127+
# Ignore missing steps
128+
continue
129+
if recording_step_dictionary.get(step['step_id']) is None:
130+
recording_step_dictionary[step['step_id']] = []
131+
132+
recording_step_dictionary[step['step_id']].append(
133+
(math.floor(step['start_time']), math.ceil(step['end_time']), step['has_errors']))
134+
135+
# 2. Add step start and end time list to the step_dict
136+
for step_id in recording_step_dictionary.keys():
137+
self._step_dict[index_id] = (recording, recording_step_dictionary[step_id])
138+
index_id += 1
58139

59140
def __len__(self):
60141
assert len(self._step_dict) > 0, "No data found in the dataset"
@@ -97,4 +178,4 @@ def collate_fn(batch):
97178
step_features = torch.cat(step_features, dim=0)
98179
step_labels = torch.cat(step_labels, dim=0)
99180

100-
return step_features, step_labels
181+
return step_features, step_labels

train_er.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -162,18 +162,18 @@ def train_step_test_step_er(config):
162162
# val_dataset = CaptainCookStepDataset(config, const.TEST, config.split)
163163
# val_loader = DataLoader(val_dataset, collate_fn=collate_fn, **test_kwargs)
164164

165-
train_dataset = CaptainCookStepShuffleDataset(config, const.TRAIN)
165+
train_dataset = CaptainCookStepDataset(config, const.TRAIN, config.split)
166166
train_loader = DataLoader(train_dataset, collate_fn=collate_fn, **train_kwargs)
167-
val_dataset = CaptainCookStepShuffleDataset(config, const.VAL)
167+
val_dataset = CaptainCookStepDataset(config, const.VAL, config.split)
168168
val_loader = DataLoader(val_dataset, collate_fn=collate_fn, **test_kwargs)
169-
test_dataset = CaptainCookStepShuffleDataset(config, const.TEST)
169+
test_dataset = CaptainCookStepDataset(config, const.TEST, config.split)
170170
test_loader = DataLoader(test_dataset, collate_fn=collate_fn, **test_kwargs)
171171

172172
train_er_model(train_loader, val_loader, device, config, test_loader=test_loader)
173173

174174

175175
if __name__ == "__main__":
176176
conf = Config()
177-
init_logger_and_wandb(conf)
177+
# init_logger_and_wandb(conf)
178178
train_step_test_step_er(conf)
179-
wandb.finish()
179+
# wandb.finish()

0 commit comments

Comments (0)