nuScenes KITTI 2d boxes (nutonomy#158)

* Convert 3d boxes to 2d for kitti * Bug fix with swapped sign * Ignore boxes that are behind the camera * Fix wrong type hint * Fix order of statements * Return none if no corners in image * Added assertion for debugging and shortened output precision * Fix cropping and ignore boxes that don't fall into the image * Tools to plot 2d boxes * Debug output * Fleshed out comments * Added disclaimer to 3d-2d projection
tchigher · May 20, 2019 · 57de64c · 57de64c
1 parent 5ec8a39
commit 57de64c
Show file tree

Hide file tree

Showing 3 changed files with 123 additions and 43 deletions.
diff --git a/python-sdk/nuscenes/scripts/export_2d_annotations_as_json.py b/python-sdk/nuscenes/scripts/export_2d_annotations_as_json.py
@@ -3,7 +3,12 @@
 # Licensed under the Creative Commons [see license.txt]
 
 """
-Export 2D annotations (xmin, ymin,xmax, ymax) from re-projections of our annotated 3D bounding boxes to a .json file.
+Export 2D annotations (xmin, ymin, xmax, ymax) from re-projections of our annotated 3D bounding boxes to a .json file.
+
+Note: Projecting tight 3d boxes to 2d generally leads to non-tight boxes.
+      Furthermore, it is non-trivial to determine whether a box falls into the image, rather than behind or around it.
+      Finally, some of the objects may be occluded by other objects, in particular when the lidar can see them, but the
+      cameras cannot.
 """
 
 from nuscenes.nuscenes import NuScenes
@@ -138,7 +143,7 @@ def get_2d_boxes(sample_data_token: str, visibilities: List[str]) -> List[Ordere
         in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
         corners_3d = corners_3d[:, in_front]
 
-        # Applying the re-projection algorithm post-processing step..
+        # Applying the re-projection algorithm post-processing step.
         corner_coords = view_points(corners_3d, camera_intrinsic, True).T[:, :2].tolist()
         final_coords = post_process_coords(corner_coords)
 

diff --git a/python-sdk/nuscenes/scripts/export_kitti.py b/python-sdk/nuscenes/scripts/export_kitti.py
@@ -27,8 +27,9 @@
 
 To launch these scripts run:
 - python export_kitti.py nuscenes_gt_to_kitti --nusc_kitti_dir ~/nusc_kitti
-- python export_kitti.py render_kitti --nusc_kitti_dir ~/nusc_kitti
+- python export_kitti.py render_kitti --nusc_kitti_dir ~/nusc_kitti --render_2d False
 - python export_kitti.py kitti_res_to_nuscenes --nusc_kitti_dir ~/nusc_kitti
+Note: The parameter --render_2d specifies whether to draw the 2d boxes (True) or the projected 3d boxes (False).
 
 To work with the original KITTI dataset, use these parameters:
  --nusc_kitti_dir /data/sets/kitti --split training
@@ -90,6 +91,7 @@ def nuscenes_gt_to_kitti(self) -> None:
         """
         kitti_to_nu_lidar = Quaternion(axis=(0, 0, 1), angle=np.pi / 2)
         kitti_to_nu_lidar_inv = kitti_to_nu_lidar.inverse
+        imsize = (1600, 900)
 
         token_idx = 0  # Start tokens from 0.
 
@@ -216,7 +218,7 @@ def nuscenes_gt_to_kitti(self) -> None:
                     # Truncated: Set all objects to 0 which means untruncated.
                     truncated = 0.0
 
-                    # Occluded: Hard-coded: Full visibility.
+                    # Occluded: Set all objects to full visibility as this information is not available in nuScenes.
                     occluded = 0
 
                     # Convert nuScenes category to nuScenes detection challenge category.
@@ -226,20 +228,35 @@ def nuscenes_gt_to_kitti(self) -> None:
                     if detection_name is None:
                         continue
 
-                    # Convert to KITTI 3d and 2d box and KITTI output format.
+                    # Convert from nuScenes to KITTI box format.
                     box_cam_kitti = KittiDB.box_nuscenes_to_kitti(
                         box_lidar_nusc, Quaternion(matrix=velo_to_cam_rot), velo_to_cam_trans, r0_rect)
-                    box_cam_kitti.score = 0  # Set dummy score so we can use this file as result.
-                    output = KittiDB.box_to_string(name=detection_name, box=box_cam_kitti, truncation=truncated,
-                                                   occlusion=occluded)
+
+                    # Project 3d box to 2d box in image, ignore box if it does not fall inside.
+                    bbox_2d = KittiDB.project_kitti_box_to_image(box_cam_kitti, p_left_kitti, imsize=imsize)
+                    if bbox_2d is None:
+                        continue
+
+                    # Set dummy score so we can use this file as result.
+                    box_cam_kitti.score = 0
+
+                    # Convert box to output string format.
+                    output = KittiDB.box_to_string(name=detection_name, box=box_cam_kitti, bbox_2d=bbox_2d,
+                                                   truncation=truncated, occlusion=occluded)
 
                     # Write to disk.
                     label_file.write(output + '\n')
 
-    def render_kitti(self) -> None:
+    def render_kitti(self, render_2d: bool) -> None:
         """
         Renders the annotations in the KITTI dataset from a lidar and a camera view.
+        :param render_2d: Whether to render 2d boxes (only works for camera data).
         """
+        if render_2d:
+            print('Rendering 2d boxes from KITTI format')
+        else:
+            print('Rendering 3d boxes projected from 3d KITTI format')
+
         # Load the KITTI dataset.
         kitti = KittiDB(root=self.nusc_kitti_dir, splits=(self.split,))
 
@@ -253,7 +270,7 @@ def render_kitti(self) -> None:
             for sensor in ['lidar', 'camera']:
                 out_path = os.path.join(render_dir, '%s_%s.png' % (token, sensor))
                 print('Rendering file to disk: %s' % out_path)
-                kitti.render_sample_data(token, sensor_modality=sensor, out_path=out_path)
+                kitti.render_sample_data(token, sensor_modality=sensor, out_path=out_path, render_2d=render_2d)
                 plt.close()  # Close the windows to avoid a warning of too many open windows.
 
     def kitti_res_to_nuscenes(self, meta: Dict[str, bool] = None) -> None:

diff --git a/python-sdk/nuscenes/utils/kitti.py b/python-sdk/nuscenes/utils/kitti.py
@@ -4,7 +4,7 @@
 
 import os
 from os import path as osp
-from typing import List, Tuple, Any
+from typing import List, Tuple, Any, Union
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -153,13 +153,14 @@ def box_nuscenes_to_kitti(box: Box, velo_to_cam_rot: Quaternion,
         return box
 
     @staticmethod
-    def project_kitti_box_to_image(box: Box, p_left: np.ndarray, imsize: Tuple[int, int]) -> Tuple[int, int, int, int]:
+    def project_kitti_box_to_image(box: Box, p_left: np.ndarray, imsize: Tuple[int, int]) \
+            -> Union[None, Tuple[int, int, int, int]]:
         """
         Projects 3D box into KITTI image FOV.
         :param box: 3D box in KITTI reference frame.
         :param p_left: <np.float: 3, 4>. Projection matrix.
-        :param imsize: (width , height). Image size.
-        :return: (xmin, ymin, xmax, ymax). Bounding box in image plane.
+        :param imsize: (width, height). Image size.
+        :return: (xmin, ymin, xmax, ymax). Bounding box in image plane or None if box is not in the image.
         """
 
         # Create a new box.
@@ -169,13 +170,25 @@ def project_kitti_box_to_image(box: Box, p_left: np.ndarray, imsize: Tuple[int,
         # We use the true center, so we need to adjust half height in negative y direction.
         box.translate(np.array([0, -box.wlh[2] / 2, 0]))
 
-        # Project corners to 2d to get bbox in pixel coords.
+        # Keep only the corners that are in front of the camera; give up if none remain.
         corners = np.array([corner for corner in box.corners().T if corner[2] > 0]).T
+        if len(corners) == 0:
+            return None
+
+        # Project corners that are in front of the camera to 2d to get bbox in pixel coords.
         imcorners = view_points(corners, p_left, normalize=True)[:2]
         bbox = (np.min(imcorners[0]), np.min(imcorners[1]), np.max(imcorners[0]), np.max(imcorners[1]))
 
-        # Crop bbox to prevent it extending outside image
-        bbox_crop = (max(0, bbox[0]), max(0, bbox[1]), min(imsize[0], bbox[2]), min(imsize[1], bbox[3]))
+        # Crop bbox to prevent it extending outside image.
+        bbox_crop = tuple(max(0, b) for b in bbox)
+        bbox_crop = (min(imsize[0], bbox_crop[0]),
+                     min(imsize[1], bbox_crop[1]),
+                     min(imsize[0], bbox_crop[2]),
+                     min(imsize[1], bbox_crop[3]))
+
+        # Detect if a cropped box is empty.
+        if bbox_crop[0] >= bbox_crop[2] or bbox_crop[1] >= bbox_crop[3]:
+            return None
 
         return bbox_crop
 
@@ -275,7 +288,6 @@ def get_boxes(self,
             return boxes
 
         with open(KittiDB.get_filepath(token, 'label_2', root=self.root), 'r') as f:
-
             for line in f:
                 # Parse this line into box information.
                 parsed_line = self.parse_label_line(line)
@@ -333,18 +345,51 @@ def get_boxes(self,
 
         return boxes
 
+    def get_boxes_2d(self,
+                     token: str,
+                     filter_classes: List[str] = None) -> Tuple[
+            List[Tuple[float, float, float, float]],
+            List[str]
+        ]:
+        """
+        Get the 2d boxes associated with a sample.
+        :return: A list of boxes in KITTI format (xmin, ymin, xmax, ymax) and a list of the class names.
+        """
+        boxes = []
+        names = []
+        with open(KittiDB.get_filepath(token, 'label_2', root=self.root), 'r') as f:
+            for line in f:
+                # Parse this line into box information.
+                parsed_line = self.parse_label_line(line)
+
+                if parsed_line['name'] in {'DontCare', 'Misc'}:
+                    continue
+
+                bbox_2d = parsed_line['bbox_camera']
+                name = parsed_line['name']
+
+                # Optional: Filter classes.
+                if filter_classes is not None and name not in filter_classes:
+                    continue
+
+                boxes.append(bbox_2d)
+                names.append(name)
+        return boxes, names
+
+
     @staticmethod
     def box_to_string(name: str,
                       box: Box,
-                      bbox: Tuple[float, float, float, float] = (-1.0, -1.0, -1.0, -1.0),
+                      bbox_2d: Tuple[float, float, float, float] = (-1.0, -1.0, -1.0, -1.0),
                       truncation: float = -1.0,
                       occlusion: int = -1,
                       alpha: float = -10.0) -> str:
         """
         Convert box in KITTI image frame to official label string format.
         :param name: KITTI name of the box.
         :param box: Box class in KITTI image frame.
-        :param bbox: Optional, 2D bounding box obtained by projected Box into image. Otherwise set to KITTI default.
+        :param bbox_2d: Optional, 2D bounding box obtained by projected Box into image (xmin, ymin, xmax, ymax).
+            Otherwise set to KITTI default.
         :param truncation: Optional truncation, otherwise set to KITTI default.
         :param occlusion: Optional occlusion, otherwise set to KITTI default.
         :param alpha: Optional alpha, otherwise set to KITTI default.
@@ -356,13 +401,13 @@ def box_to_string(name: str,
 
         # Prepare output.
         name += ' '
-        trunc = '{:.3f} '.format(truncation)
+        trunc = '{:.2f} '.format(truncation)
         occ = '{:d} '.format(occlusion)
-        a = '{:.3f} '.format(alpha)
-        bb = '{:.3f} {:.3f} {:.3f} {:.3f} '.format(bbox[0], bbox[1], bbox[2], bbox[3])  # bbox (xmin, ymin, xmax, ymax).
-        hwl = '{:.3f} {:.3f} {:.3f} '.format(box.wlh[2], box.wlh[0], box.wlh[1])  # height, width, length.
-        xyz = '{:.3f} {:.3f} {:.3f} '.format(box.center[0], box.center[1], box.center[2])  # x, y, z.
-        y = '{:.3f}'.format(yaw)  # Yaw angle.
+        a = '{:.2f} '.format(alpha)
+        bb = '{:.2f} {:.2f} {:.2f} {:.2f} '.format(bbox_2d[0], bbox_2d[1], bbox_2d[2], bbox_2d[3])
+        hwl = '{:.2f} {:.2f} {:.2f} '.format(box.wlh[2], box.wlh[0], box.wlh[1])  # height, width, length.
+        xyz = '{:.2f} {:.2f} {:.2f} '.format(box.center[0], box.center[1], box.center[2])  # x, y, z.
+        y = '{:.2f}'.format(yaw)  # Yaw angle.
         s = ' {:.4f}'.format(box.score)  # Classification score.
 
         output = name + trunc + occ + a + bb + hwl + xyz + y
@@ -407,7 +452,8 @@ def render_sample_data(self,
                            box_linewidth: int = 2,
                            filter_classes: List[str] = None,
                            max_dist: float = None,
-                           out_path: str = None) -> None:
+                           out_path: str = None,
+                           render_2d: bool = False) -> None:
         """
         Render sample data onto axis. Visualizes lidar in nuScenes lidar frame and camera in camera frame.
         :param token: KITTI token.
@@ -422,6 +468,7 @@ def render_sample_data(self,
         :param filter_classes: Optionally filter the classes to render.
         :param max_dist: Maximum distance in m to still draw a box.
         :param out_path: Optional path to save the rendered figure to disk.
+        :param render_2d: Whether to render 2d boxes (only works for camera data).
         """
         # Default settings.
         if color_func is None:
@@ -456,7 +503,6 @@ def render_sample_data(self,
                     box.render(ax, view=view_3d, colors=(color, color, 'k'), linewidth=box_linewidth)
 
         elif sensor_modality == 'camera':
-            transforms = self.get_transforms(token, self.root)
             im_path = KittiDB.get_filepath(token, 'image_2', root=self.root)
             im = Image.open(im_path)
 
@@ -469,21 +515,33 @@ def render_sample_data(self,
                 ax.set_ylim(im.size[1], 0)
 
             if with_anns:
-                for box in boxes:
-                    # Undo the transformations in get_boxes() to get back to the camera frame.
-                    box.rotate(self.kitti_to_nu_lidar_inv)  # In KITTI lidar frame.
-                    box.rotate(Quaternion(matrix=transforms['velo_to_cam']['R']))
-                    box.translate(transforms['velo_to_cam']['T'])  # In KITTI camera frame, un-rectified.
-                    box.rotate(Quaternion(matrix=transforms['r0_rect']))  # In KITTI camera frame, rectified.
-
-                    # Filter boxes outside the image (relevant when visualizing nuScenes data in KITTI format).
-                    if not box_in_image(box, transforms['p_left'][:3, :3], im.size, vis_level=BoxVisibility.ANY):
-                        continue
-
-                    # Render.
-                    color = np.array(color_func(box.name)) / 255
-                    box.render(ax, view=transforms['p_left'][:3, :3], normalize=True, colors=(color, color, 'k'),
-                               linewidth=box_linewidth)
+                if render_2d:
+                    # Use KITTI's 2d boxes.
+                    boxes_2d, names = self.get_boxes_2d(token, filter_classes=filter_classes)
+                    for box, name in zip(boxes_2d, names):
+                        color = np.array(color_func(name)) / 255
+                        ax.plot([box[0], box[0]], [box[1], box[3]], color=color, linewidth=box_linewidth)
+                        ax.plot([box[2], box[2]], [box[1], box[3]], color=color, linewidth=box_linewidth)
+                        ax.plot([box[0], box[2]], [box[1], box[1]], color=color, linewidth=box_linewidth)
+                        ax.plot([box[0], box[2]], [box[3], box[3]], color=color, linewidth=box_linewidth)
+                else:
+                    # Project 3d boxes to 2d.
+                    transforms = self.get_transforms(token, self.root)
+                    for box in boxes:
+                        # Undo the transformations in get_boxes() to get back to the camera frame.
+                        box.rotate(self.kitti_to_nu_lidar_inv)  # In KITTI lidar frame.
+                        box.rotate(Quaternion(matrix=transforms['velo_to_cam']['R']))
+                        box.translate(transforms['velo_to_cam']['T'])  # In KITTI camera frame, un-rectified.
+                        box.rotate(Quaternion(matrix=transforms['r0_rect']))  # In KITTI camera frame, rectified.
+
+                        # Filter boxes outside the image (relevant when visualizing nuScenes data in KITTI format).
+                        if not box_in_image(box, transforms['p_left'][:3, :3], im.size, vis_level=BoxVisibility.ANY):
+                            continue
+
+                        # Render.
+                        color = np.array(color_func(box.name)) / 255
+                        box.render(ax, view=transforms['p_left'][:3, :3], normalize=True, colors=(color, color, 'k'),
+                                   linewidth=box_linewidth)
         else:
             raise ValueError("Unrecognized modality {}.".format(sensor_modality))