Fix npz load issue

sayands · sayands · commit 1fb9a3ebdaa4 · 2025-06-05T11:11:50.000-07:00
diff --git a/common/load_utils.py b/common/load_utils.py
@@ -50,6 +50,20 @@ def write_json(data_dict: Any, filename: str) -> None:
     with open(filename, "w") as outfile:
         outfile.write(json_obj)
 
+def load_npz_as_dict(filename: str) -> dict:
+    """Return an ``.npz`` archive's entries as a dict; raise ValueError for non-archives."""
+    # NOTE(security): allow_pickle=True unpickles arbitrary objects -- trusted files only.
+    loaded = np.load(filename, allow_pickle=True)
+    if not isinstance(loaded, np.lib.npyio.NpzFile):  # e.g. a bare .npy file
+        raise ValueError(f"Expected an .npz archive, got {type(loaded).__name__}: {filename}")
+    with loaded as npz:
+        out = {}
+        for k in npz.files:
+            val = npz[k]
+            wrapped = isinstance(val, np.ndarray) and val.dtype == object and val.shape == ()
+            out[k] = val.item() if wrapped else val  # 0-d object array -> the object it holds
+    return out
+
 def get_print_format(value: Any) -> str:
     """Determines the appropriate format string for a given value."""
     if isinstance(value, int):
diff --git a/configs/preprocess/process_2d.yaml b/configs/preprocess/process_2d.yaml
@@ -34,6 +34,7 @@ data:
     processor2D    : ARKitScenes2DProcessor
     processor1D    : ARKitScenes1DProcessor
     skip_frames    : 1
+
   MultiScan:
     base_dir       : /media/sayan/Expansion/data/datasets/MultiScan
     process_dir    : ${data.process_dir}/MultiScan
diff --git a/configs/train/train_scene_crossover.yaml b/configs/train/train_scene_crossover.yaml
@@ -72,7 +72,7 @@ task:
     scene_modalities : ['rgb', 'point', 'floorplan', 'referral']
     train            : [Scannet, Scan3R, MultiScan, ARKitScenes]
     val              : [Scannet, Scan3R, MultiScan, ARKitScenes]
-    object_enc_ckpt  : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan+arkitscenes.pth
+    object_enc_ckpt  : /drive/dumps/multimodal-spaces/runs/curr_runs/instance_crossover_scannet+scan3r+multiscan+arkitscenes.pth
     
 trainer: UnifiedTrainer
 
diff --git a/data/datasets/scanbase.py b/data/datasets/scanbase.py
@@ -10,9 +10,11 @@
 from omegaconf import DictConfig
 from typing import List, Dict, Any
 
+from common.load_utils import load_npz_as_dict
 from ..transforms import get_transform
 from ..data_utils import pad_tensors
 
+
 class ScanObjectBase(Dataset):
     """Base Dataset class for instance level training"""
     def __init__(self, data_config: DictConfig, split: str) -> None:
@@ -131,18 +133,13 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
         
         scan_process_dir = osp.join(self.process_dir, 'scans', scan_id)
         
-        # scan_objects_data = torch.load(osp.join(scan_process_dir, 'objectsDataMultimodal.pt'))
-        scan_objects_data = np.load(osp.join(scan_process_dir, 'objectsDataMultimodal.npz'), allow_pickle=True)
-        
-        # scandata_1d = torch.load(osp.join(scan_process_dir, 'data1D.pt'))
-        scandata_1d = np.load(osp.join(scan_process_dir, 'data1D.npz'), allow_pickle=True)
-        # scandata_2d = torch.load(osp.join(scan_process_dir, 'data2D.pt'))
-        scandata_2d = np.load(osp.join(scan_process_dir, 'data2D.npz'), allow_pickle=True)
-        # scandata_3d = torch.load(osp.join(scan_process_dir, 'data3D.pt'))
-        scandata_3d = np.load(osp.join(scan_process_dir, 'data3D.npz'), allow_pickle=True)
+        scan_objects_data = load_npz_as_dict(osp.join(scan_process_dir, 'objectsDataMultimodal.npz'))
+        scandata_1d = load_npz_as_dict(osp.join(scan_process_dir, 'data1D.npz'))
+        scandata_2d = load_npz_as_dict(osp.join(scan_process_dir, 'data2D.npz'))
+        scandata_3d = load_npz_as_dict(osp.join(scan_process_dir, 'data3D.npz'))
         
         # Point Cloud Data -- Scene
-        points, feats, scene_label = scandata_3d['scene'].item()['pcl_coords'], scandata_3d['scene'].item()['pcl_feats'], scandata_3d['scene'].item()['scene_label']
+        points, feats, scene_label = scandata_3d['scene']['pcl_coords'], scandata_3d['scene']['pcl_feats'], scandata_3d['scene']['scene_label']
         feats /= 255.
         feats -= 0.5
         
@@ -189,24 +186,25 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
         
         scene_dict['scene_masks'] = {}
         
-        rgb_embedding = torch.from_numpy(scandata_2d['scene'].item()['scene_embeddings'])
+        rgb_embedding = torch.from_numpy(scandata_2d['scene']['scene_embeddings'])
         rgb_embedding = torch.concatenate([rgb_embedding[:, 0, :], rgb_embedding[:, 1:, :].mean(dim=1)], dim=1)
+        rgb_embedding = rgb_embedding[list(range(0, rgb_embedding.shape[0], 2)), :]
         scene_dict['rgb_embedding'] = rgb_embedding
         
         scene_dict['scene_masks']['rgb'] = torch.Tensor([1.0])
         scene_dict['scene_masks']['point'] = torch.Tensor([1.0])
         scene_dict['scene_masks']['object'] = torch.Tensor([1.0])
         
         referral_mask = torch.Tensor([0.0])       
-        referral_embedding = scandata_1d['scene'].item()['referral_embedding']
+        referral_embedding = scandata_1d['scene']['referral_embedding']
         
         if referral_embedding is not None:
             referral_embedding = torch.from_numpy(referral_embedding[0]['feat']).reshape(-1,)
             referral_mask = torch.Tensor([1.0])
         else:
             referral_embedding = torch.zeros((scene_dict['rgb_embedding'].shape[-1] // 4, ))
         
-        floorplan_embedding = scandata_2d['scene'].item()['floorplan']['embedding']
+        floorplan_embedding = scandata_2d['scene']['floorplan']['embedding']
         floorplan_mask = torch.Tensor([0.0])
         if floorplan_embedding is not None:
             floorplan_embedding = torch.from_numpy(floorplan_embedding[0, 0]).reshape(-1, )
diff --git a/preprocess/feat1D/arkit.py b/preprocess/feat1D/arkit.py
@@ -59,8 +59,8 @@ def compute1DFeaturesEachScan(self, scan_id):
         if osp.exists(pt_1d_path):
             os.remove(pt_1d_path)
         
-        npz_data = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)
-        objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'].item()
+        npz_data = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))
+        objectID_to_labelID_map = npz_data['obj_id_to_label_id_map']
         
         scan_objects = self.load_objects_for_scan(scan_id)
 
diff --git a/preprocess/feat1D/multiscan.py b/preprocess/feat1D/multiscan.py
@@ -55,9 +55,8 @@ def compute1DFeaturesEachScan(self, scan_id):
         if osp.exists(pt_1d_path):
             os.remove(pt_1d_path)
 
-        # else:   
-        npz_data = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)
-        objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'].item()
+        npz_data = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))
+        objectID_to_labelID_map = npz_data['obj_id_to_label_id_map']
         
         scan_objects = self.load_objects_for_scan(scan_id)
 
diff --git a/preprocess/feat1D/scan3r.py b/preprocess/feat1D/scan3r.py
@@ -39,8 +39,8 @@ def compute1DFeaturesEachScan(self, scan_id: str) -> None:
         if osp.exists(pt_1d_path):
             os.remove(pt_1d_path)   
         
-        npz_data = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)
-        objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'].item()
+        npz_data = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))
+        objectID_to_labelID_map = npz_data['obj_id_to_label_id_map']
         scan_objects = [obj_data for obj_data in self.objects if obj_data['scan'] == scan_id][0]['objects']
 
         object_referral_embeddings, scene_referral_embeddings = {}, None
diff --git a/preprocess/feat1D/scannet.py b/preprocess/feat1D/scannet.py
@@ -41,9 +41,8 @@ def compute1DFeaturesEachScan(self, scan_id: str) -> None:
         if osp.exists(pt_1d_path):
             os.remove(pt_1d_path)
         
-        # objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map']
-        npz_data = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)
-        objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'].item()
+        npz_data = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))
+        objectID_to_labelID_map = npz_data['obj_id_to_label_id_map']
         objects = [objects['objects'] for objects in self.objects if objects['scan'] == scan_id]
         
         object_referral_embeddings, scene_referral_embeddings = {}, None
diff --git a/preprocess/feat2D/arkit.py b/preprocess/feat2D/arkit.py
@@ -45,32 +45,22 @@ def __init__(self, config_data: DictConfig, config_2D: DictConfig, split: str) -
         for scan_id in self.scan_ids:
             pose_data = arkit.load_poses(osp.join(self.data_dir, 'scans', scan_id),scan_id, skip=self.frame_skip)
             self.frame_pose_data[scan_id] = pose_data
-        
 
     def compute2DFeatures(self) -> None:
         for scan_id in tqdm(self.scan_ids):
             self.compute2DImagesAndSeg(scan_id)
-            self.compute2DFeaturesEachScan(scan_id)
+            self.compute2DFeaturesEachScan(scan_id)    
     
     def compute2DImagesAndSeg(self, scan_id: str) -> None:
         obj_id_imgs = {}
-        scene_folder = osp.join(self.data_dir, 'scans', scan_id)
         
         scene_out_dir = osp.join(self.out_dir, scan_id)
         load_utils.ensure_dir(scene_out_dir)
         
         objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json")
         if not osp.exists(objects_path):
             raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}")
-        
-        gt_pt_path = osp.join(scene_folder, 'gt-projection-seg.pt')
-        if osp.exists(gt_pt_path):
-            os.remove(gt_pt_path)
-        
-        gt_pt_path = osp.join(scene_out_dir, 'gt-projection-seg.pt')
-        if osp.exists(gt_pt_path):
-            os.remove(gt_pt_path)
-        
+    
         annotations = load_utils.load_json(objects_path)        
         ply_data = arkit.load_ply_data(osp.join(self.data_dir,'scans'), scan_id, annotations)
         instance_ids = ply_data['objectId']
@@ -110,11 +100,8 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None:
         
         scene_out_dir = osp.join(self.out_dir, scan_id)
         load_utils.ensure_dir(scene_out_dir)
-        pt_2d_path = osp.join(scene_out_dir, 'data2D.pt')
-        if osp.exists(pt_2d_path):
-            os.remove(pt_2d_path)
-            
-        obj_id_to_label_id_map = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)['obj_id_to_label_id_map'].item()
+        
+        obj_id_to_label_id_map = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))['obj_id_to_label_id_map']
         
         # Multi-view Image -- Object (Embeddings)
         object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map)
@@ -147,36 +134,6 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None:
         
         np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D)
     
-    def computeAllImageFeaturesEachScan(self, scan_id: str) -> None:
-        scene_folder = osp.join(self.data_dir, 'scans', scan_id)
-        color_path = osp.join(scene_folder,f'{scan_id}_frames', 'lowres_wide')
-        
-        scene_out_dir = osp.join(self.out_dir, scan_id)
-        load_utils.ensure_dir(scene_out_dir)
-        
-        frame_idxs = list(self.frame_pose_data[scan_id].keys())
-        
-        # Extract Scene Image Features
-        scene_images_pt = []
-        scene_image_embeddings = []
-        # sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0]
-            
-        for frame_index in frame_idxs:
-            image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png'))
-                
-            image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC)
-            image_pt = self.model.base_tf(image)
-            
-            scene_image_embeddings.append(self.extractFeatures([image_pt], return_only_cls_mean= False))
-            scene_images_pt.append(image_pt)
-        
-        scene_image_embeddings = np.concatenate(scene_image_embeddings)
-        data2D = {} 
-        data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, 
-                           'frame_idxs' : frame_idxs}
-        # torch.save(data2D, osp.join(scene_out_dir, 'data2D_all_images.pt'))
-        np.savez_compressed(osp.join(scene_out_dir, 'data2D_all_images.npz'), **data2D)
-    
     def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]:
         # Sample Camera Indexes Based on Rotation Matrix From Grid
         pose_data = []
@@ -204,9 +161,7 @@ def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, fr
         scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False)
         
         return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs
-        # return pose_data, None, None, sampled_frame_idxs
         
-    
     def computeImageFeaturesAllObjectsEachScan(self, scene_folder: str, scene_out_dir: str, obj_id_to_label_id_map: dict) -> Tuple[Dict[int, Dict[int, np.ndarray]], Dict[int, List[int]], List[str]]:
         object_anno_2D = np.load(osp.join(scene_out_dir, 'gt-projection-seg.npz'),allow_pickle=True)
         object_image_votes = {}
diff --git a/preprocess/feat2D/multiscan.py b/preprocess/feat2D/multiscan.py
@@ -45,16 +45,6 @@ def __init__(self, config_data, config_2D, split) -> None:
             while(len(frame_idxs) > 500):
                 self.frame_skip += 2
                 frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=self.frame_skip)
-            # if len(frame_idxs) > 500:
-            #     frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=2)
-            # if len(frame_idxs) > 500:
-            #     frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=5)
-            # if len(frame_idxs) > 500:
-            #     frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=10)
-            # if len(frame_idxs) > 500:
-            #     frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=15)
-            # if len(frame_idxs) > 500:
-            #     frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=20)
             
             pose_data = multiscan.load_all_poses(scene_folder, frame_idxs)
             self.frame_pose_data[scan_id] = pose_data
@@ -72,15 +62,6 @@ def compute2DImagesAndSeg(self, scan_id):
         scene_out_dir = osp.join(self.out_dir, scan_id)
         load_utils.ensure_dir(scene_out_dir)
         
-        gt_pt_path = osp.join(scene_folder, 'gt-projection-seg.pt')
-        if osp.exists(gt_pt_path):
-            os.remove(gt_pt_path)
-            
-        gt_pt_path = osp.join(scene_out_dir, 'gt-projection-seg.pt')
-        if osp.exists(gt_pt_path):
-            os.remove(gt_pt_path)
-                      
-        # else:     
         mesh_file = osp.join(scene_folder, '{}.ply'.format(scan_id))
         ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id)
         instance_ids = ply_data['objectId']
@@ -119,14 +100,7 @@ def compute2DFeaturesEachScan(self, scan_id):
         scene_out_dir = osp.join(self.out_dir, scan_id)
         load_utils.ensure_dir(scene_out_dir)
         
-        pt_2d_path = osp.join(scene_out_dir, 'data2D.pt')
-        if osp.exists(pt_2d_path):
-            os.remove(pt_2d_path)
-        
-        
-            
-        # else:
-        obj_id_to_label_id_map = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)['obj_id_to_label_id_map'].item()
+        obj_id_to_label_id_map = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))['obj_id_to_label_id_map']
         
         # Multi-view Image -- Object (Embeddings)
         object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map)
diff --git a/preprocess/feat2D/scan3r.py b/preprocess/feat2D/scan3r.py
@@ -50,11 +50,6 @@ def __init__(self, config_data: DictConfig, config_2D: DictConfig, split: str) -
             self.frame_pose_data[scan_id] = pose_data
 
     def compute2DFeatures(self) -> None:
-        if self.split == 'train':
-            self.scan_ids = self.scan_ids[13+102+295:]
-        else:
-            self.scan_ids = self.scan_ids[:]
-
         for scan_id in tqdm(self.scan_ids):
             self.compute2DImagesAndSeg(scan_id)
             self.compute2DFeaturesEachScan(scan_id)   
@@ -66,15 +61,7 @@ def compute2DImagesAndSeg(self, scan_id: str) -> None:
         scene_out_dir = osp.join(self.out_dir, scan_id)
         load_utils.ensure_dir(scene_out_dir)
         
-        obj_id_imgs = {}
-        gt_pt_path = osp.join(scene_out_dir, 'gt-projection-seg.pt')
-        if osp.exists(gt_pt_path):
-            os.remove(gt_pt_path)
-        
-        gt_pt_path = osp.join(scene_folder, 'gt-projection-seg.pt')
-        if osp.exists(gt_pt_path):
-            os.remove(gt_pt_path)
-                
+        obj_id_imgs = {}        
         ply_data = scan3r.load_ply_data(self.data_dir, scan_id, self.label_filename)
         instance_ids = ply_data['objectId']
         
@@ -113,11 +100,7 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None:
         scene_out_dir = osp.join(self.out_dir, scan_id)
         load_utils.ensure_dir(scene_out_dir)
         
-        pt_2d_path = osp.join(scene_out_dir, 'data2D.pt')
-        if osp.exists(pt_2d_path):
-            os.remove(pt_2d_path)
-        
-        obj_id_to_label_id_map = np.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'),allow_pickle=True)['obj_id_to_label_id_map'].item()
+        obj_id_to_label_id_map = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))['obj_id_to_label_id_map']
         
         # Multi-view Image -- Object (Embeddings)
         object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map)
diff --git a/preprocess/feat2D/scannet.py b/preprocess/feat2D/scannet.py
@@ -87,9 +87,7 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None:
         
         scene_out_dir = osp.join(self.out_dir, scan_id)
         load_utils.ensure_dir(scene_out_dir)
-        pt_2d_path = osp.join(scene_out_dir, 'data2D.pt')
-        if osp.exists(pt_2d_path):
-            os.remove(pt_2d_path)
+        
             
         # Floor-plan rendering
         render_img = self.renderShapeAndFloorplan(scene_folder, scene_out_dir, scan_id)
diff --git a/preprocess/multimodal_preprocess.py b/preprocess/multimodal_preprocess.py
diff --git a/scripts/evaluation/eval_object_retrieval.sh b/scripts/evaluation/eval_object_retrieval.sh
diff --git a/scripts/preprocess/process_arkit.sh b/scripts/preprocess/process_arkit.sh
diff --git a/scripts/preprocess/process_multiscan.sh b/scripts/preprocess/process_multiscan.sh
diff --git a/scripts/preprocess/process_scan3r.sh b/scripts/preprocess/process_scan3r.sh
diff --git a/scripts/preprocess/process_scannet.sh b/scripts/preprocess/process_scannet.sh