Install fix under mac, plus some small improvements.
luigifreda committed Dec 16, 2024
1 parent 36158b4 commit 0c8aa8f
Showing 10 changed files with 134 additions and 81 deletions.
145 changes: 81 additions & 64 deletions depth_estimator_crestereo.py
@@ -22,6 +22,7 @@
 import numpy as np
 import os
 import sys
+import platform
 
 import config
 config.cfg.set_lib('crestereo')
@@ -41,6 +42,7 @@
 
 
 def enforce_megengine_linking():
+    # Experimental: trying to enforce correct linking
     import ctypes
     # Path to the directory containing your libraries
     site_packages_path = next(p for p in sys.path if 'site-packages' in p)
@@ -63,9 +65,11 @@ def enforce_megengine_linking():
     ctypes.CDLL(os.path.join(lib_path, "libmegengine_shared.so")) # Load the main library last
 
 
-#enforce_megengine_linking()
-import megengine as mge
-import crestereo.nets as crestereo_nets
+if platform.system() != 'Darwin':
+    # Unfortunately, megengine is not fully supported on macOS
+    #enforce_megengine_linking()
+    import megengine as mge
+    import crestereo.nets as crestereo_nets
 
 
 # Stereo depth prediction using the Crestereo model.
@@ -87,70 +91,83 @@ def __init__(self, device=None, camera:Camera=None,
                          dataset_env_type=dataset_env_type, precision=None)
 
     def load_model(self, model_path=kCrestereoModelPath):
-        print("DepthEstimatorCrestereo: Loading model:", os.path.abspath(model_path))
-        if not os.path.exists(model_path):
-            raise FileNotFoundError(f"Model file not found: {model_path}")
-        pretrained_dict = mge.load(model_path)
-        model = crestereo_nets.Model(max_disp=self.max_disparity, mixed_precision=False, test_mode=True)
-        model.load_state_dict(pretrained_dict["state_dict"], strict=True)
-        model.eval()
-        return model
-
-    def infer(self, image, image_right=None):
-        if image_right is None:
-            message = 'Image right is None. Are you using a stereo dataset?'
-            Printer.red(message)
-            raise ValueError(message)
+        if platform.system() != 'Darwin':
+
+            print("DepthEstimatorCrestereo: Loading model:", os.path.abspath(model_path))
+            if not os.path.exists(model_path):
+                raise FileNotFoundError(f"Model file not found: {model_path}")
+            pretrained_dict = mge.load(model_path)
+            model = crestereo_nets.Model(max_disp=self.max_disparity, mixed_precision=False, test_mode=True)
+            model.load_state_dict(pretrained_dict["state_dict"], strict=True)
+            model.eval()
+            return model
 
-        in_shape = image.shape
-        print(f'DepthEstimatorCrestereo: Running inference: {image.shape} {image_right.shape}')
-
-        # Compute disparity map
-        imgL = image.transpose(2, 0, 1)
-        imgR = image_right.transpose(2, 0, 1)
-        imgL = np.ascontiguousarray(imgL[None, :, :, :])
-        imgR = np.ascontiguousarray(imgR[None, :, :, :])
-
-        imgL = mge.tensor(imgL).astype("float32")
-        imgR = mge.tensor(imgR).astype("float32")
-
-        imgL_dw2 = mge.functional.nn.interpolate(
-            imgL,
-            size=(imgL.shape[2] // 2, imgL.shape[3] // 2),
-            mode="bilinear",
-            align_corners=True,
-        )
-        imgR_dw2 = mge.functional.nn.interpolate(
-            imgR,
-            size=(imgL.shape[2] // 2, imgL.shape[3] // 2),
-            mode="bilinear",
-            align_corners=True,
-        )
-        pred_flow_dw2 = self.model(imgL_dw2, imgR_dw2, iters=self.n_iter, flow_init=None)
-
-        pred_flow = self.model(imgL, imgR, iters=self.n_iter, flow_init=pred_flow_dw2)
-        disparity_map = mge.functional.squeeze(pred_flow[:, 0, :, :]).numpy()
+        else:
+            Printer.red("DepthEstimatorCrestereo: Not implemented for MacOS")
+            raise NotImplementedError
+        return None
+
+    def infer(self, image, image_right=None):
+        if platform.system() != 'Darwin':
+
+            if image_right is None:
+                message = 'Image right is None. Are you using a stereo dataset?'
+                Printer.red(message)
+                raise ValueError(message)
 
-        out_shape = disparity_map.shape
-        if out_shape[0] != in_shape[0] or out_shape[1] != in_shape[1]:
-            in_w = in_shape[1]
-            out_w = out_shape[1]
-            # got this formula from the original testing code
-            t = float(in_w) / float(out_w)
-            disparity_map = cv2.resize(disparity_map, (in_shape[1], in_shape[0]), interpolation=cv2.INTER_AREA)*t
-
-        self.disparity_map = disparity_map
-
-        bf = self.camera.bf if self.camera is not None else 1.0
-        if self.camera is None:
-            Printer.red('Camera is None!')
+            in_shape = image.shape
+            print(f'DepthEstimatorCrestereo: Running inference: {image.shape} {image_right.shape}')
+
+            # Compute disparity map
+            imgL = image.transpose(2, 0, 1)
+            imgR = image_right.transpose(2, 0, 1)
+            imgL = np.ascontiguousarray(imgL[None, :, :, :])
+            imgR = np.ascontiguousarray(imgR[None, :, :, :])
+
+            imgL = mge.tensor(imgL).astype("float32")
+            imgR = mge.tensor(imgR).astype("float32")
+
+            imgL_dw2 = mge.functional.nn.interpolate(
+                imgL,
+                size=(imgL.shape[2] // 2, imgL.shape[3] // 2),
+                mode="bilinear",
+                align_corners=True,
+            )
+            imgR_dw2 = mge.functional.nn.interpolate(
+                imgR,
+                size=(imgL.shape[2] // 2, imgL.shape[3] // 2),
+                mode="bilinear",
+                align_corners=True,
+            )
+            pred_flow_dw2 = self.model(imgL_dw2, imgR_dw2, iters=self.n_iter, flow_init=None)
+
+            pred_flow = self.model(imgL, imgR, iters=self.n_iter, flow_init=pred_flow_dw2)
+            disparity_map = mge.functional.squeeze(pred_flow[:, 0, :, :]).numpy()
 
-        # Compute depth map
-        abs_disparity_map = np.abs(disparity_map, dtype=float)
-        depth_map = np.where(abs_disparity_map > self.min_disparity, bf / abs_disparity_map, 0.0)
-        self.depth_map = depth_map
+            out_shape = disparity_map.shape
+            if out_shape[0] != in_shape[0] or out_shape[1] != in_shape[1]:
+                in_w = in_shape[1]
+                out_w = out_shape[1]
+                # got this formula from the original testing code
+                t = float(in_w) / float(out_w)
+                disparity_map = cv2.resize(disparity_map, (in_shape[1], in_shape[0]), interpolation=cv2.INTER_AREA)*t
+
+            self.disparity_map = disparity_map
+
+            bf = self.camera.bf if self.camera is not None else 1.0
+            if self.camera is None:
+                Printer.red('Camera is None!')
 
-        print(f'DepthEstimatorCrestereo: Depth map shape: {depth_map.shape}')
-        return depth_map
+            # Compute depth map
+            abs_disparity_map = np.abs(disparity_map, dtype=float)
+            depth_map = np.where(abs_disparity_map > self.min_disparity, bf / abs_disparity_map, 0.0)
+            self.depth_map = depth_map
+
+            print(f'DepthEstimatorCrestereo: Depth map shape: {depth_map.shape}')
+            return depth_map
+        else:
+            Printer.red("DepthEstimatorCrestereo: Not implemented for MacOS")
+            raise NotImplementedError


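Note on the depth computation in infer() above: the commit keeps the standard pinhole-stereo relation depth = bf / disparity, where bf is the stereo baseline multiplied by the focal length in pixels. A minimal self-contained sketch of this conversion (illustrative only, not part of the commit; the camera values below are made-up, KITTI-like numbers):

    import numpy as np

    def disparity_to_depth(disparity_map, bf, min_disparity=0.0):
        # depth = baseline * focal_length / disparity; 0 where disparity is invalid
        abs_disp = np.abs(disparity_map.astype(float))
        return np.where(abs_disp > min_disparity, bf / abs_disp, 0.0)

    # Hypothetical values: baseline ~0.54 m, focal ~718.86 px => bf ~388.18
    disparity = np.array([[38.8, 0.0], [19.4, 77.6]])
    print(disparity_to_depth(disparity, bf=388.18))  # ~[[10, 0], [20, 5]] meters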
9 changes: 8 additions & 1 deletion depth_estimator_depth_anything_v2.py
@@ -22,6 +22,7 @@
 import numpy as np
 import os
 import sys
+import platform
 
 import config
 config.cfg.set_lib('depth_anything_v2')
@@ -58,10 +59,16 @@ def __init__(self, device=None, camera:Camera=None,
                  encoder_name='vitl', precision=None): # or 'vits', 'vitb' we use the largest by default
         if device is None:
             device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if device.type != 'cuda':
+            device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
         if device.type == 'cuda':
             print('DepthEstimatorDepthPro: Using CUDA')
+        elif device.type == 'mps':
+            if not torch.backends.mps.is_available(): # Should return True for MPS availability
+                raise Exception('DepthEstimatorDepthPro: MPS is not available')
+            print('DepthEstimatorDepthPro: Using MPS')
         else:
-            print('DepthEstimatorDepthPro: Using CPU')
+            print('DepthEstimatorDepthPro: Using CPU')
         transform = None
         model = DepthAnythingV2(**{**DepthEstimatorDepthAnythingV2.model_configs[encoder_name], 'max_depth': max_depth})
         dataset_name = 'vkitti' if dataset_env_type == DatasetEnvironmentType.OUTDOOR else 'hypersim'
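The device-selection change above prefers CUDA, then falls back to Apple's MPS backend, then to CPU. The same cascade as a standalone helper (an illustrative sketch, not code from the commit):

    import torch

    def pick_device() -> torch.device:
        # Prefer CUDA, then Apple MPS, then plain CPU
        if torch.cuda.is_available():
            return torch.device('cuda')
        if torch.backends.mps.is_available():
            return torch.device('mps')
        return torch.device('cpu')

    print(f'using device: {pick_device().type}')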
7 changes: 5 additions & 2 deletions depth_estimator_factory.py
@@ -25,6 +25,7 @@
 
 import config
 import torch
+import platform
 
 from camera import Camera
 from dataset import DatasetEnvironmentType
@@ -48,9 +49,9 @@ class DepthEstimatorType(SerializableEnum):
     DEPTH_SGBM = 0 # Depth SGBM from OpenCV [Stereo, it requires a stereo dataset]
     DEPTH_ANYTHING_V2 = 1 # "Depth Anything V2" [Monocular]
     DEPTH_PRO = 2 # "Depth Pro: Sharp Monocular Metric Depth in Less Than a Second" [Monocular]
-    DEPTH_RAFT_STEREO = 3 # "RAFT-Stereo: Multilevel Recurrent Field Transforms for Stereo Matching" [Stereo, it requires a stereo dataset]
+    DEPTH_RAFT_STEREO = 3 # "RAFT-Stereo: Multilevel Recurrent Field Transforms for Stereo Matching" [Stereo, it requires a stereo dataset]
     DEPTH_CRESTEREO = 4 # "Practical Stereo Matching via Cascaded Recurrent Network with Adaptive Correlation" [Stereo, it requires a stereo dataset]
-                        # Note: Unstable and with linking problems [WIP] => use DepthEstimatorType.DEPTH_CRESTEREO_PYTORCH
+                        # Note: Unstable and with linking problems under Linux, not supported under mac [WIP] => use DepthEstimatorType.DEPTH_CRESTEREO_PYTORCH
     DEPTH_CRESTEREO_PYTORCH = 5 # "Practical Stereo Matching via Cascaded Recurrent Network with Adaptive Correlation", pytorch implementation [Stereo, it requires a stereo dataset]


@@ -86,6 +87,8 @@ def depth_estimator_factory(depth_estimator_type=DepthEstimatorType.DEPTH_ANYTHING_V2,
                                        min_depth=min_depth, max_depth=max_depth,
                                        dataset_env_type=dataset_env_type)
     elif depth_estimator_type == DepthEstimatorType.DEPTH_CRESTEREO:
+        if platform.system() == 'Darwin':
+            raise ValueError('DepthEstimatorType.DEPTH_CRESTEREO is not supported on macOS. Use DepthEstimatorType.DEPTH_CRESTEREO_PYTORCH instead!')
         return DepthEstimatorCrestereo(device=device, camera=camera,
                                        min_depth=min_depth, max_depth=max_depth,
                                        dataset_env_type=dataset_env_type)
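With the new guard, requesting the megengine-based estimator on macOS fails fast with an actionable message instead of crashing inside the megengine import. A hypothetical usage sketch (not from the commit):

    from depth_estimator_factory import depth_estimator_factory, DepthEstimatorType

    try:
        estimator = depth_estimator_factory(depth_estimator_type=DepthEstimatorType.DEPTH_CRESTEREO)
    except ValueError as e:
        print(e)  # on Darwin, suggests DEPTH_CRESTEREO_PYTORCH as the replacement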
2 changes: 1 addition & 1 deletion depth_estimator_raft_stereo.py
@@ -114,7 +114,7 @@ def __init__(self, device=None, camera:Camera=None,
             raise ValueError(f'Model {restore_ckpt} does not exist')
 
         self.model = torch.nn.DataParallel(RAFTStereo(self.model_config), device_ids=[0])
-        self.model.load_state_dict(torch.load(restore_ckpt))
+        self.model.load_state_dict(torch.load(restore_ckpt, map_location='cpu'))
         self.module = self.model.module
         self.module.to(device)
         self.module.eval()
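The map_location='cpu' change matters on hosts without CUDA (including Apple-silicon Macs): a checkpoint saved from a GPU process stores tensors tagged with 'cuda:0', and torch.load would otherwise fail trying to restore them onto a missing device. A small illustration of the pattern (the path is a placeholder):

    import torch

    ckpt_path = 'models/raftstereo-middlebury.pth'  # placeholder path
    # Remapping to CPU always deserializes; the model is moved to the real device afterwards
    state_dict = torch.load(ckpt_path, map_location='cpu')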
3 changes: 2 additions & 1 deletion ground_truth.py
@@ -208,6 +208,7 @@ def getTimestampPositionAndAbsoluteScale(self, frame_id):
             abs_scale = 1
         else:
             abs_scale = np.sqrt((x - x_prev)*(x - x_prev) + (y - y_prev)*(y - y_prev) + (z - z_prev)*(z - z_prev))
+        #print(f'reading frame {frame_id}, timestamp: {timestamp:.15f}, x: {x:.15f}, y: {y:.15f}, z: {z:.15f}, scale: {abs_scale:.15f}')
         return timestamp,x,y,z,abs_scale
 
     # return timestamp, x,y,z, qx,qy,qz,qw, scale
@@ -443,7 +444,7 @@ def __init__(self, path, name, associations=None, start_frame_id=0, type = Groun
         print('base_path: ', base_path)
 
         if not os.path.isfile(self.filename):
-            error_message = f'ERROR: Groundtruth file not found: {self.filename}'
+            error_message = f'ERROR: Groundtruth file not found: {self.filename}. Use the script groundtruth/generate_euroc_groundtruths_as_tum.sh to generate these groundtruth files!'
             Printer.red(error_message)
             sys.exit(error_message)
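For context, the abs_scale computed in getTimestampPositionAndAbsoluteScale() above is just the Euclidean distance between consecutive ground-truth positions, later used as the inter-frame translation scale. Equivalently (made-up positions, for illustration):

    import numpy as np

    prev = np.array([0.0, 0.0, 0.0])
    curr = np.array([0.3, 0.0, 0.4])
    abs_scale = np.linalg.norm(curr - prev)  # sqrt(0.3**2 + 0.4**2) = 0.5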
9 changes: 6 additions & 3 deletions install_pip3_packages.sh
@@ -115,8 +115,11 @@ fi
 
 
 # crestereo
-pip install --upgrade cryptography pyOpenSSL
-python3 -m pip install megengine -f https://megengine.org.cn/whl/mge.html # This brings issues when launched in parallel processes
-#pip install megengine # This brings issues with non-supported CUDA architecture
+if [[ "$OSTYPE" != "darwin"* ]]; then
+    # Unfortunately, megengine is not supported on macOS with arm architecture
+    pip install --upgrade cryptography pyOpenSSL
+    python3 -m pip install megengine -f https://megengine.org.cn/whl/mge.html # This brings issues when launched in parallel processes
+    #pip install megengine # This brings issues with non-supported CUDA architecture on my machine
+fi
 
 pip install gdown # to download from google drive
2 changes: 2 additions & 0 deletions install_thirdparty.sh
@@ -235,6 +235,8 @@ if [ ! -d raft_stereo ]; then
     cd raft_stereo
     #git checkout 6068c1a26f84f8132de10f60b2bc0ce61568e085 # use this commit if you hit any problems
 
+    git apply ../raft_stereo.patch
+
     ./download_models.sh
 fi

6 changes: 6 additions & 0 deletions pyenv-activate.sh
@@ -17,3 +17,9 @@ export PYTHONPATH="" # clean python path => for me, remove ROS stuff
 
 # N.B.: in order to deactivate the virtual environment run:
 # $ deactivate
+
+# Check if the operating system is Darwin (macOS)
+if [[ $OSTYPE == 'darwin'* ]]; then
+    export PYTORCH_ENABLE_MPS_FALLBACK=1
+    echo "setting PYTORCH_ENABLE_MPS_FALLBACK: $PYTORCH_ENABLE_MPS_FALLBACK"
+fi
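PYTORCH_ENABLE_MPS_FALLBACK=1 tells PyTorch to run any operator the MPS backend does not implement on the CPU instead of raising an error; the variable must be set before torch is imported, which is why it is exported in the activation script. A quick check that the fallback environment is in place (an illustrative sketch, not part of the commit):

    import os
    os.environ.setdefault('PYTORCH_ENABLE_MPS_FALLBACK', '1')  # must precede the torch import

    import torch
    if torch.backends.mps.is_available():
        x = torch.ones(3, device='mps')
        print(x * 2)  # ops missing on MPS would transparently run on CPU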
13 changes: 13 additions & 0 deletions thirdparty/raft_stereo.patch
@@ -0,0 +1,13 @@
diff --git a/download_models.sh b/download_models.sh
index 81582c2..489ba6a 100755
--- a/download_models.sh
+++ b/download_models.sh
@@ -3,4 +3,6 @@ mkdir models -p
cd models
wget https://www.dropbox.com/s/ftveifyqcomiwaq/models.zip
unzip models.zip
-rm models.zip -f
+if [ -f models.zip ]; then
+ rm -f models.zip
+fi
\ No newline at end of file
19 changes: 10 additions & 9 deletions visual_odometry.py
@@ -25,30 +25,31 @@
 from feature_tracker import FeatureTrackerTypes, FeatureTrackingResult, FeatureTracker
 from utils_geom import poseRt, is_rotation_matrix, closest_rotation_matrix
 from timer import TimerFps
+from ground_truth import GroundTruth
 
 class VoStage(Enum):
     NO_IMAGES_YET = 0   # no image received
     GOT_FIRST_IMAGE = 1 # got first image, we can proceed in a normal way (match current image with previous image)
 
 kVerbose=True
 kMinNumFeature = 2000
-kRansacThresholdNormalized = 0.0004 # metric threshold used for normalized image coordinates (originally 0.0003)
-kRansacThresholdPixels = 0.1 # pixel threshold used for image coordinates
-kAbsoluteScaleThresholdKitti = 0.1 # absolute translation scale; it is also the minimum translation norm for an accepted motion
-kAbsoluteScaleThresholdIndoor = 0.01 # absolute translation scale; it is also the minimum translation norm for an accepted motion
-kUseEssentialMatrixEstimation = True # using the essential matrix fitting algorithm is more robust RANSAC given five-point algorithm solver
-kRansacProb = 0.999 # (originally 0.999)
+kRansacThresholdNormalized = 0.0004 # metric threshold used for normalized image coordinates (originally 0.0003)
+kRansacThresholdPixels = 0.1 # pixel threshold used for image coordinates
+kAbsoluteScaleThresholdKitti = 0.1 # absolute translation scale; it is also the minimum translation norm for an accepted motion
+kAbsoluteScaleThresholdIndoor = 0.01 # absolute translation scale; it is also the minimum translation norm for an accepted motion
+kUseEssentialMatrixEstimation = True # using the essential matrix fitting algorithm is more robust RANSAC given five-point algorithm solver
+kRansacProb = 0.999 # (originally 0.999)
+kMinAveragePixelShiftForMotionEstimation = 1.5 # if the average pixel shift is below this threshold, motion is considered to be small enough to be ignored
 kUseGroundTruthScale = True
 
 
 # This class is a first start to understand the basics of inter frame feature tracking and camera pose estimation.
 # It combines the simplest VO ingredients without performing any image point triangulation or
 # windowed bundle adjustment. At each step $k$, it estimates the current camera pose $C_k$ with respect to the previous one $C_{k-1}$.
 # The inter frame pose estimation returns $[R_{k-1,k},t_{k-1,k}]$ with $||t_{k-1,k}||=1$.
 # With this very basic approach, you need to use a ground truth in order to recover a correct inter-frame scale $s$ and estimate a
 # valid trajectory by composing $C_k = C_{k-1} * [R_{k-1,k}, s t_{k-1,k}]$.
 class VisualOdometry(object):
-    def __init__(self, cam, groundtruth, feature_tracker : FeatureTracker):
+    def __init__(self, cam, groundtruth: GroundTruth, feature_tracker: FeatureTracker):
         self.stage = VoStage.NO_IMAGES_YET
         self.cam = cam
 
@@ -189,7 +190,7 @@ def processFrame(self, frame_id):
         # t is estimated up to scale (i.e. the algorithm always returns ||trc||=1, we need a scale in order to recover a translation which is coherent with the previous estimated ones)
         absolute_scale = self.getAbsoluteScale(frame_id)
         # NOTE: This simplistic estimation approach provide reasonable results with Kitti where a good ground velocity provides a decent interframe parallax. It does not work with indoor datasets.
-        if(absolute_scale > self.absolute_scale_threshold and self.average_pixel_shift > 1):
+        if absolute_scale > self.absolute_scale_threshold and self.average_pixel_shift > kMinAveragePixelShiftForMotionEstimation:
             # compose absolute motion [Rwa,twa] with estimated relative motion [Rab,s*tab] (s is the scale extracted from the ground truth)
             # [Rwb,twb] = [Rwa,twa]*[Rab,tab] = [Rwa*Rab|twa + Rwa*tab]
             print('estimated t with norm |t|: ', np.linalg.norm(t), ' (just for sake of clarity)')
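The composition referenced in the comments, C_k = C_{k-1} * [R_{k-1,k}, s t_{k-1,k}], expands to R_w <- R_w R and t_w <- t_w + s (R_w t), which is what processFrame() applies once the scale and pixel-shift checks pass. A minimal numpy sketch (names are illustrative, not taken from the file):

    import numpy as np

    def compose_pose(R_w, t_w, R_rel, t_rel, scale):
        # [Rwb | twb] = [Rwa | twa] * [Rab | s*tab] = [Rwa*Rab | twa + s*Rwa*tab]
        return R_w @ R_rel, t_w + scale * (R_w @ t_rel)

    R_w, t_w = np.eye(3), np.zeros(3)
    # Identity start; unit-norm relative translation scaled by the ground-truth scale
    R_w, t_w = compose_pose(R_w, t_w, np.eye(3), np.array([0.0, 0.0, 1.0]), scale=0.5)
    print(t_w)  # [0.  0.  0.5]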
