nuScenes tracking evaluation improvements (nutonomy#251)
* Remove lap solver, which is slower and causes issues in continuous integration

* Switch to 20-point evaluation for more accurate results

* Switch to 40-point evaluation

* Remove unused config fields

* Make test_algo independent of the number of recall thresholds

* Update test target values after changing number of recall thresholds

* Optimize memory usage by freeing up the NuScenes object

* Update README to reflect val results and 40-point interpolation
holger-motional authored and oscar-nutonomy committed Nov 5, 2019
1 parent 0e32541 commit a093f3a
Showing 7 changed files with 52 additions and 53 deletions.
10 changes: 5 additions & 5 deletions python-sdk/nuscenes/eval/tracking/README.md
@@ -198,8 +198,8 @@ The matching threshold (center distance) is 2m.

### AMOTA and AMOTP metrics
Our main metrics are the AMOTA and AMOTP metrics developed in \[2\].
- These are integrals over the MOTA/MOTP curves using `n`-point interpolation (`n` to be determined).
- Similar to the detection challenge, we drop points with `recall < 0.1` (not shown in the equation), as these are typically noisy.
+ These are integrals over the MOTA/MOTP curves using `n`-point interpolation (`n = 40`).
+ Similar to the detection challenge, we do not include points with `recall < 0.1` (not shown in the equation), as these are typically noisy.

- **AMOTA** (average multi object tracking accuracy):
Average over the MOTA \[3\] metric (see below) at different recall thresholds.
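As a quick illustration of the `n = 40` interpolation described above: the metric is evaluated at evenly spaced recall thresholds and averaged, with unachieved thresholds scored at a configured worst-case value (see `metric_worst` in the config change below). A minimal sketch, not the devkit's implementation; the names and the NaN convention are assumptions:

```python
import numpy as np

# 40 evenly spaced recall thresholds; points with recall < 0.1 are excluded.
recall_thresholds = np.linspace(0.1, 1.0, 40)

def average_over_recall(metric_at_thresholds: np.ndarray, worst: float) -> float:
    """Average a per-threshold metric, scoring unachieved (NaN) thresholds as worst."""
    values = np.where(np.isnan(metric_at_thresholds), worst, metric_at_thresholds)
    return float(np.mean(values))
```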
@@ -261,9 +261,9 @@ Note that these numbers are measured on the val split and therefore not identical

| Method | NDS | mAP | AMOTA | AMOTP | Modality | Detections download | Tracking download |
| --- | --- | --- | --- | --- | --- | --- | --- |
- | Megvii \[6\] | 62.8 | 51.9 | 28.2 | 1.49 | Lidar | [link](https://www.nuscenes.org/data/detection-megvii.zip) | [link](https://www.nuscenes.org/data/tracking-megvii.zip) |
- | PointPillars \[5\] | 44.8 | 29.5 | 6.9 | 1.69 | Lidar | [link](https://www.nuscenes.org/data/detection-pointpillars.zip) | [link](https://www.nuscenes.org/data/tracking-pointpillars.zip) |
- | Mapillary \[7\] | 36.9 | 29.8 | 8.2 | 1.77 | Camera | [link](https://www.nuscenes.org/data/detection-mapillary.zip) | [link](https://www.nuscenes.org/data/tracking-mapillary.zip) |
+ | Megvii \[6\] | 62.8 | 51.9 | 27.9 | 1.50 | Lidar | [link](https://www.nuscenes.org/data/detection-megvii.zip) | [link](https://www.nuscenes.org/data/tracking-megvii.zip) |
+ | PointPillars \[5\] | 44.8 | 29.5 | 13.1 | 1.69 | Lidar | [link](https://www.nuscenes.org/data/detection-pointpillars.zip) | [link](https://www.nuscenes.org/data/tracking-pointpillars.zip) |
+ | Mapillary \[7\] | 36.9 | 29.8 | 10.3 | 1.79 | Camera | [link](https://www.nuscenes.org/data/detection-mapillary.zip) | [link](https://www.nuscenes.org/data/tracking-mapillary.zip) |

#### Overfitting
Some object detection methods overfit to the training data.
python-sdk/nuscenes/eval/tracking/configs/tracking_nips_2019.json

@@ -9,10 +9,8 @@
"bicycle": 40
},
"dist_fcn": "center_distance",
"dist_ths": [0.5, 1.0, 2.0, 4.0],
"dist_th_tp": 2.0,
"min_recall": 0.1,
"min_precision": 0.1,
"max_boxes_per_sample": 500,
"metric_worst": {
"amota": 0.0,
@@ -33,5 +31,5 @@
"tid": 20,
"lgd": 20
},
"num_thresholds": 10
"num_thresholds": 40
}
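For reference, a short sketch of loading the updated config through the devkit's `TrackingConfig` (see the class change below); the file path is illustrative:

```python
import json
from nuscenes.eval.tracking.data_classes import TrackingConfig

# Load the slimmed-down config; the removed dist_ths and min_precision
# keys are simply no longer read. Path is illustrative.
with open('tracking_nips_2019.json', 'r') as f:
    cfg = TrackingConfig.deserialize(json.load(f))

assert cfg.num_thresholds == 40
```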
9 changes: 0 additions & 9 deletions python-sdk/nuscenes/eval/tracking/data_classes.py
@@ -16,23 +16,18 @@ class TrackingConfig:
def __init__(self,
class_range: Dict[str, int],
dist_fcn: str,
- dist_ths: List[float],
dist_th_tp: float,
min_recall: float,
- min_precision: float,
max_boxes_per_sample: float,
metric_worst: Dict[str, float],
num_thresholds: int):

assert set(class_range.keys()) == set(TRACKING_NAMES), "Class count mismatch."
- assert dist_th_tp in dist_ths, "dist_th_tp must be in set of dist_ths."

self.class_range = class_range
self.dist_fcn = dist_fcn
- self.dist_ths = dist_ths
self.dist_th_tp = dist_th_tp
self.min_recall = min_recall
- self.min_precision = min_precision
self.max_boxes_per_sample = max_boxes_per_sample
self.metric_worst = metric_worst
self.num_thresholds = num_thresholds
@@ -52,10 +47,8 @@ def serialize(self) -> dict:
return {
'class_range': self.class_range,
'dist_fcn': self.dist_fcn,
- 'dist_ths': self.dist_ths,
'dist_th_tp': self.dist_th_tp,
'min_recall': self.min_recall,
- 'min_precision': self.min_precision,
'max_boxes_per_sample': self.max_boxes_per_sample,
'metric_worst': self.metric_worst,
'num_thresholds': self.num_thresholds
@@ -66,10 +59,8 @@ def deserialize(cls, content: dict):
""" Initialize from serialized dictionary. """
return cls(content['class_range'],
content['dist_fcn'],
- content['dist_ths'],
content['dist_th_tp'],
content['min_recall'],
- content['min_precision'],
content['max_boxes_per_sample'],
content['metric_worst'],
content['num_thresholds'])
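After this change, a `TrackingConfig` is constructed without `dist_ths` and `min_precision`. A minimal sketch of the slimmed constructor; the `class_range` and `metric_worst` values below are illustrative, not the official ones:

```python
from nuscenes.eval.tracking.constants import TRACKING_NAMES
from nuscenes.eval.tracking.data_classes import TrackingConfig

cfg = TrackingConfig(
    class_range={name: 40 for name in TRACKING_NAMES},  # illustrative ranges
    dist_fcn='center_distance',
    dist_th_tp=2.0,
    min_recall=0.1,
    max_boxes_per_sample=500,
    metric_worst={'amota': 0.0, 'tid': 20, 'lgd': 20},  # truncated; full set in the JSON config
    num_thresholds=40)
```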
17 changes: 11 additions & 6 deletions python-sdk/nuscenes/eval/tracking/evaluate.py
@@ -37,19 +37,21 @@ class TrackingEval:
Please see https://www.nuscenes.org/tracking for more details.
"""
def __init__(self,
- nusc: NuScenes,
config: TrackingConfig,
result_path: str,
eval_set: str,
- output_dir: str = None,
+ output_dir: str,
+ nusc_version: str,
+ nusc_dataroot: str,
verbose: bool = True):
"""
Initialize a TrackingEval object.
- :param nusc: A NuScenes object.
:param config: A TrackingConfig object.
:param result_path: Path of the nuScenes JSON result file.
:param eval_set: The dataset split to evaluate on, e.g. train, val or test.
:param output_dir: Folder to save plots and results to.
+ :param nusc_version: The version of the NuScenes dataset.
+ :param nusc_dataroot: Path of the nuScenes dataset on disk.
:param verbose: Whether to print to stdout.
"""
self.cfg = config
@@ -68,6 +70,10 @@ def __init__(self,
if not os.path.isdir(self.plot_dir):
os.makedirs(self.plot_dir)

+ # Initialize NuScenes object.
+ # We do not store it in self to let garbage collection take care of it and save memory.
+ nusc = NuScenes(version=nusc_version, verbose=verbose, dataroot=nusc_dataroot)

# Load data.
if verbose:
print('Initializing nuScenes tracking evaluation')
@@ -252,7 +258,6 @@ def main(self, render_curves: bool = True) -> TrackingMetrics:
with open(config_path, 'r') as _f:
cfg_ = TrackingConfig.deserialize(json.load(_f))

- nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_)
- nusc_eval = TrackingEval(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_,
-                          output_dir=output_dir_, verbose=verbose_)
+ nusc_eval = TrackingEval(config=cfg_, result_path=result_path_, eval_set=eval_set_, output_dir=output_dir_,
+                          nusc_version=version_, nusc_dataroot=dataroot_, verbose=verbose_)
nusc_eval.main(render_curves=render_curves_)
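The new entry point takes the dataset version and root instead of a pre-built `NuScenes` object. A sketch of the updated call (the result, output, and data paths are illustrative):

```python
from nuscenes.eval.common.config import config_factory
from nuscenes.eval.tracking.evaluate import TrackingEval

# Build the standard tracking config and run the evaluation; the NuScenes
# object is now created (and garbage-collected) inside TrackingEval.
cfg = config_factory('tracking_nips_2019')
nusc_eval = TrackingEval(config=cfg,
                         result_path='results/tracking_results.json',  # illustrative
                         eval_set='val',
                         output_dir='results/eval',                    # illustrative
                         nusc_version='v1.0-trainval',
                         nusc_dataroot='/data/sets/nuscenes',          # illustrative
                         verbose=True)
metrics = nusc_eval.main(render_curves=False)
```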
53 changes: 28 additions & 25 deletions python-sdk/nuscenes/eval/tracking/tests/test_algo.py
@@ -71,8 +71,8 @@ def test_empty_submission(self):

# Remove all predictions.
timestamp_boxes_pred = copy.deepcopy(tracks_gt['scene-1'])
- for id, box in timestamp_boxes_pred.items():
-     timestamp_boxes_pred[id] = []
+ for timestamp, box in timestamp_boxes_pred.items():
+     timestamp_boxes_pred[timestamp] = []
tracks_pred = {'scene-1': timestamp_boxes_pred}

# Accumulate metrics.
@@ -116,14 +116,15 @@ def test_drop_prediction(self):

# Check outputs.
# Recall values above 0.75 (3/4 correct) are not achieved and therefore nan.
- assert np.all(np.isnan(md.confidence[md.recall_hypo > 0.75]))
- assert md.tp[3] == 3
- assert md.fp[3] == 0
- assert md.fn[3] == 1
- assert md.lgd[3] == 0.5
- assert md.tid[3] == 0
- assert md.frag[3] == 1
- assert md.ids[3] == 0
+ first_achieved = np.where(md.recall_hypo <= 0.75)[0][0]
+ assert np.all(np.isnan(md.confidence[:first_achieved]))
+ assert md.tp[first_achieved] == 3
+ assert md.fp[first_achieved] == 0
+ assert md.fn[first_achieved] == 1
+ assert md.lgd[first_achieved] == 0.5
+ assert md.tid[first_achieved] == 0
+ assert md.frag[first_achieved] == 1
+ assert md.ids[first_achieved] == 0

def test_drop_prediction_multiple(self):
""" Drop the first three predictions from the GT submission. """
@@ -150,14 +151,15 @@

# Check outputs.
# Recall values above 0.25 (1/4 correct) are not achieved and therefore nan.
- assert np.all(np.isnan(md.confidence[md.recall_hypo > 0.25]))
- assert md.tp[8] == 1
- assert md.fp[8] == 0
- assert md.fn[8] == 3
- assert md.lgd[8] == 3 * 0.5
- assert md.tid[8] == 3 * 0.5
- assert md.frag[8] == 0
- assert md.ids[8] == 0
+ first_achieved = np.where(md.recall_hypo <= 0.25)[0][0]
+ assert np.all(np.isnan(md.confidence[:first_achieved]))
+ assert md.tp[first_achieved] == 1
+ assert md.fp[first_achieved] == 0
+ assert md.fn[first_achieved] == 3
+ assert md.lgd[first_achieved] == 3 * 0.5
+ assert md.tid[first_achieved] == 3 * 0.5
+ assert md.frag[first_achieved] == 0
+ assert md.ids[first_achieved] == 0

def test_identity_switch(self):
""" Change the tracking_id of one frame from the GT submission. """
@@ -181,13 +183,14 @@
md = ev.accumulate()

# Check outputs.
- assert md.tp[5] == 2
- assert md.fp[5] == 0
- assert md.fn[5] == 0
- assert md.lgd[5] == 0
- assert md.tid[5] == 0
- assert md.frag[5] == 0
- assert md.ids[5] == 2  # One wrong id leads to 2 identity switches.
+ first_achieved = np.where(md.recall_hypo <= 0.5)[0][0]
+ assert md.tp[first_achieved] == 2
+ assert md.fp[first_achieved] == 0
+ assert md.fn[first_achieved] == 0
+ assert md.lgd[first_achieved] == 0
+ assert md.tid[first_achieved] == 0
+ assert md.frag[first_achieved] == 0
+ assert md.ids[first_achieved] == 2  # One wrong id leads to 2 identity switches.

def test_drop_gt(self):
""" Drop one box from the GT. """
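The recurring pattern in the tests above replaces hard-coded threshold indices with a lookup of the first achieved recall threshold. A standalone sketch of that lookup, assuming (as the `confidence[:first_achieved]` checks imply) that `recall_hypo` is sorted in descending order:

```python
import numpy as np

# Illustrative 40-point recall grid, highest threshold first.
recall_hypo = np.linspace(1.0, 0.1, 40)

# Index of the first threshold the tracker actually reaches; thresholds
# before it were never achieved, so their confidences are NaN.
first_achieved = np.where(recall_hypo <= 0.75)[0][0]
```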
11 changes: 7 additions & 4 deletions python-sdk/nuscenes/eval/tracking/tests/test_evaluate.py
@@ -163,8 +163,8 @@ def basic_test(self,
json.dump(mock, f, indent=2)

cfg = config_factory('tracking_nips_2019')
- nusc_eval = TrackingEval(nusc, cfg, self.res_mockup, eval_set=eval_set, output_dir=self.res_eval_folder,
-                          verbose=False)
+ nusc_eval = TrackingEval(cfg, self.res_mockup, eval_set=eval_set, output_dir=self.res_eval_folder,
+                          nusc_version=version, nusc_dataroot=os.environ['NUSCENES'], verbose=False)
metrics = nusc_eval.main(render_curves=render_curves)

return metrics
@@ -184,8 +184,11 @@ def test_delta_mock(self,

# Compare metrics to known solution.
if eval_set == 'mini_val':
- self.assertAlmostEqual(metrics.compute_metric('mota'), 0.24081829757545278)
- self.assertAlmostEqual(metrics.compute_metric('motp'), 1.2974351821696868)
+ self.assertAlmostEqual(metrics.compute_metric('amota'), 0.5383961573989436)
+ self.assertAlmostEqual(metrics.compute_metric('amotp'), 1.5275400961369252)
+ self.assertAlmostEqual(metrics.compute_metric('motar'), 0.8261827096838301)
+ self.assertAlmostEqual(metrics.compute_metric('mota'), 0.25003943918566174)
+ self.assertAlmostEqual(metrics.compute_metric('motp'), 1.2976508610883917)
else:
print('Skipping checks due to choice of custom eval_set: %s' % eval_set)

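Since `basic_test` now reads the dataset root from the `NUSCENES` environment variable instead of receiving a ready-made `NuScenes` object, that variable must be set before the tests run. A sketch, with an illustrative path:

```python
import os
import unittest

# Point the tests at a local nuScenes installation (illustrative path),
# then run the tracking evaluation test module.
os.environ['NUSCENES'] = '/data/sets/nuscenes'
unittest.main(module='nuscenes.eval.tracking.tests.test_evaluate')
```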
1 change: 0 additions & 1 deletion setup/requirements.txt
@@ -1,7 +1,6 @@
cachetools
descartes
jupyter
- lap
matplotlib
motmetrics
numpy
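With `lap` dropped from the requirements, the assignment step falls back to another solver; my understanding (an assumption, not stated in this commit) is that `motmetrics` can use SciPy's Hungarian implementation instead. A sketch of the equivalent matching:

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

# Illustrative cost matrix of GT-to-prediction center distances.
cost = np.array([[0.5, 2.0],
                 [1.5, 0.3]])

# Optimal one-to-one matching without the lap package.
rows, cols = linear_sum_assignment(cost)
print(rows, cols)  # [0 1] [0 1]
```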
