From 53fbadf3497d86857aca891333a467ca0f0f4af8 Mon Sep 17 00:00:00 2001
From: i-aki-y <aki.y.ishikawa@gmail.com>
Date: Thu, 30 May 2024 21:30:00 +0900
Subject: [PATCH] fix unpredictable panoptic_quality output when
 `return_per_class=True` (#2548)

* fix panoptic_quality class order

* added changelog and updated test-case

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com>
---
 CHANGELOG.md                                  |  3 +--
 .../detection/panoptic_qualities.py           |  8 +++---
 .../detection/_panoptic_quality_common.py     |  4 +--
 .../detection/test_panoptic_quality.py        | 27 +++++++++++++++++++
 4 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab55af30265..37e3b5d4963 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,8 +27,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
--
-
+- Fixed class order of `panoptic_quality(..., return_per_class=True)` output ([#2548](https://github.com/Lightning-AI/torchmetrics/pull/2548))
 
 
 ## [1.4.0] - 2024-05-03
diff --git a/src/torchmetrics/detection/panoptic_qualities.py b/src/torchmetrics/detection/panoptic_qualities.py
index b5696dd7bbd..50b7c4c5594 100644
--- a/src/torchmetrics/detection/panoptic_qualities.py
+++ b/src/torchmetrics/detection/panoptic_qualities.py
@@ -66,9 +66,11 @@ class PanopticQuality(Metric):
           single scalar tensor is returned with average panoptic quality over all classes. If ``return_sq_and_rq=True``
           and ``return_per_class=False`` a tensor of length 3 is returned with panoptic, segmentation and recognition
           quality (in that order). If If ``return_sq_and_rq=False`` and ``return_per_class=True`` a tensor of length
-          equal to the number of classes are returned, with panoptic quality for each class. Finally, if both arguments
-          are ``True`` a tensor of shape ``(3, C)`` is returned with individual panoptic, segmentation and recognition
-          quality for each class.
+          equal to the number of classes is returned, with panoptic quality for each class. The order of classes is
+          ``things`` first and then ``stuffs``, and numerically sorted within each.
+          (e.g. with ``things=[4, 1], stuffs=[3, 2]``, the output classes are ordered as ``[1, 4, 2, 3]``)
+          Finally, if both arguments are ``True`` a tensor of shape ``(3, C)`` is returned with individual panoptic,
+          segmentation and recognition quality for each class.
 
     Args:
         things:
diff --git a/src/torchmetrics/functional/detection/_panoptic_quality_common.py b/src/torchmetrics/functional/detection/_panoptic_quality_common.py
index e00beb98bd2..c94978e3435 100644
--- a/src/torchmetrics/functional/detection/_panoptic_quality_common.py
+++ b/src/torchmetrics/functional/detection/_panoptic_quality_common.py
@@ -148,9 +148,9 @@ def _get_category_id_to_continuous_id(things: Set[int], stuffs: Set[int]) -> Dic
 
     """
     # things metrics are stored with a continuous id in [0, len(things)[,
-    thing_id_to_continuous_id = {thing_id: idx for idx, thing_id in enumerate(things)}
+    thing_id_to_continuous_id = {thing_id: idx for idx, thing_id in enumerate(sorted(things))}
     # stuff metrics are stored with a continuous id in [len(things), len(things) + len(stuffs)[
-    stuff_id_to_continuous_id = {stuff_id: idx + len(things) for idx, stuff_id in enumerate(stuffs)}
+    stuff_id_to_continuous_id = {stuff_id: idx + len(things) for idx, stuff_id in enumerate(sorted(stuffs))}
     cat_id_to_continuous_id = {}
     cat_id_to_continuous_id.update(thing_id_to_continuous_id)
     cat_id_to_continuous_id.update(stuff_id_to_continuous_id)
diff --git a/tests/unittests/detection/test_panoptic_quality.py b/tests/unittests/detection/test_panoptic_quality.py
index 4d087073266..c7333ccac06 100644
--- a/tests/unittests/detection/test_panoptic_quality.py
+++ b/tests/unittests/detection/test_panoptic_quality.py
@@ -61,10 +61,26 @@
     .reshape((1, 1, 5, 2))
     .repeat(2, 1, 1, 1),
 )
+
 _ARGS_0 = {"things": {0, 1}, "stuffs": {6, 7}}
 _ARGS_1 = {"things": {2}, "stuffs": {3}, "allow_unknown_preds_category": True}
 _ARGS_2 = {"things": {0, 1}, "stuffs": {10, 11}}
 
+
+def _get_class_order_test_input_args(class_type, class1, class2, class3) -> tuple:
+    """Build an ``(_Input, kwargs)`` pair for the class-order tests; ``class_type`` is "things" or "stuffs"."""
+    # Each point is a [class_id, 0] pair: class1 matches exactly, class2 and class3 only partially overlap.
+    a, b, c = [class1, 0], [class2, 0], [class3, 0]
+    _input = _Input(
+        # Shape of input tensors is (num_batches, batch_size, num_points, 2).
+        preds=torch.tensor([a, a, b, b, b, c]).reshape((1, 1, 6, 2)).repeat(2, 1, 1, 1),
+        target=torch.tensor([a, a, b, b, c, c]).reshape((1, 1, 6, 2)).repeat(2, 1, 1, 1),
+    )
+    _args = {"things": [], "stuffs": [], "return_per_class": True}
+    _args[class_type] = [class1, class2, class3]
+    return _input, _args
+
+
 # TODO: Improve _reference_fn by calling https://github.com/cocodataset/panopticapi/blob/master/panopticapi/evaluation.py
 # directly and compare at runtime on multiple examples.
 
@@ -84,6 +100,11 @@ def _reference_fn_1_2(preds, target) -> np.ndarray:
     return np.array([(2 / 3 + 1 + 2 / 3) / 3])
 
 
+def _reference_fn_class_order(preds, target) -> np.ndarray:
+    """Expected per-class result, in sorted class-id order, for `_get_class_order_test_input_args` inputs."""
+    return np.array([1, 0, 2 / 3])
+
+
 @pytest.mark.skipif(not _TORCH_GREATER_EQUAL_1_12, reason="PanopticQuality metric only supports PyTorch >= 1.12")
 class TestPanopticQuality(MetricTester):
     """Test class for `PanopticQuality` metric."""
@@ -95,6 +116,12 @@ class TestPanopticQuality(MetricTester):
             (_INPUTS_0, _ARGS_0, _reference_fn_0_0),
             (_INPUTS_0, _ARGS_1, _reference_fn_0_1),
             (_INPUTS_1, _ARGS_2, _reference_fn_1_2),
+            (*_get_class_order_test_input_args("stuffs", 0, 2, 1), _reference_fn_class_order),
+            (*_get_class_order_test_input_args("stuffs", 0, 3, 2), _reference_fn_class_order),
+            (*_get_class_order_test_input_args("stuffs", 0, 10, 2), _reference_fn_class_order),
+            (*_get_class_order_test_input_args("things", 0, 2, 1), _reference_fn_class_order),
+            (*_get_class_order_test_input_args("things", 0, 3, 2), _reference_fn_class_order),
+            (*_get_class_order_test_input_args("things", 0, 10, 2), _reference_fn_class_order),
         ],
     )
     def test_panoptic_quality_class(self, ddp, inputs, args, reference_metric):