From a66fa8314c7527043b010135aa9dd25c95900e8f Mon Sep 17 00:00:00 2001
From: Glenn Jocher <glenn.jocher@ultralytics.com>
Date: Fri, 14 Apr 2023 18:11:15 +0200
Subject: [PATCH] Add NMS to CoreML exports (#11361)

* Add NMS to CoreML exports

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 export.py | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 148 insertions(+), 2 deletions(-)

diff --git a/export.py b/export.py
index 73f4bb8c0870..5f8e1c4821da 100644
--- a/export.py
+++ b/export.py
@@ -77,6 +77,25 @@
 MACOS = platform.system() == 'Darwin'  # macOS environment
 
 
+class iOSModel(torch.nn.Module):
+
+    def __init__(self, model, im):
+        super().__init__()
+        b, c, h, w = im.shape  # batch, channel, height, width
+        self.model = model
+        self.nc = model.nc  # number of classes
+        if w == h:
+            self.normalize = 1. / w
+        else:
+            self.normalize = torch.tensor([1. / w, 1. / h, 1. / w, 1. / h])  # broadcast (slower, smaller)
+            # np = model(im)[0].shape[1]  # number of points
+            # self.normalize = torch.tensor([1. / w, 1. / h, 1. / w, 1. / h]).expand(np, 4)  # explicit (faster, larger)
+
+    def forward(self, x):
+        xywh, conf, cls = self.model(x)[0].squeeze().split((4, 1, self.nc), 1)
+        return cls * conf, xywh * self.normalize  # confidence (3780, 80), coordinates (3780, 4)
+
+
 def export_formats():
     # YOLOv5 export formats
     x = [
@@ -223,7 +242,7 @@ def export_paddle(model, im, file, metadata, prefix=colorstr('PaddlePaddle:')):
 
 
 @try_export
-def export_coreml(model, im, file, int8, half, prefix=colorstr('CoreML:')):
+def export_coreml(model, im, file, int8, half, nms, prefix=colorstr('CoreML:')):
     # YOLOv5 CoreML export
     check_requirements('coremltools')
     import coremltools as ct
@@ -231,6 +250,8 @@ def export_coreml(model, im, file, int8, half, prefix=colorstr('CoreML:')):
     LOGGER.info(f'\n{prefix} starting export with coremltools {ct.__version__}...')
     f = file.with_suffix('.mlmodel')
 
+    if nms:
+        model = iOSModel(model, im)
     ts = torch.jit.trace(model, im, strict=False)  # TorchScript model
     ct_model = ct.convert(ts, inputs=[ct.ImageType('image', shape=im.shape, scale=1 / 255, bias=[0, 0, 0])])
     bits, mode = (8, 'kmeans_lut') if int8 else (16, 'linear') if half else (32, None)
@@ -506,6 +527,129 @@ def add_tflite_metadata(file, metadata, num_outputs):
         tmp_file.unlink()
 
 
+def pipeline_coreml(model, im, file, names, y, prefix=colorstr('CoreML Pipeline:')):
+    # YOLOv5 CoreML pipeline
+    import coremltools as ct
+    from PIL import Image
+
+    print(f'{prefix} starting pipeline with coremltools {ct.__version__}...')
+    batch_size, ch, h, w = list(im.shape)  # BCHW
+    t = time.time()
+
+    # Output shapes
+    spec = model.get_spec()
+    out0, out1 = iter(spec.description.output)
+    if platform.system() == 'Darwin':
+        img = Image.new('RGB', (w, h))  # img(192 width, 320 height)
+        # img = torch.zeros((*opt.img_size, 3)).numpy()  # img size(320,192,3) iDetection
+        out = model.predict({'image': img})
+        out0_shape, out1_shape = out[out0.name].shape, out[out1.name].shape
+    else:  # linux and windows can not run model.predict(), get sizes from pytorch output y
+        s = tuple(y[0].shape)
+        out0_shape, out1_shape = (s[1], s[2] - 5), (s[1], 4)  # (3780, 80), (3780, 4)
+
+    # Checks
+    nx, ny = spec.description.input[0].type.imageType.width, spec.description.input[0].type.imageType.height
+    na, nc = out0_shape
+    # na, nc = out0.type.multiArrayType.shape  # number anchors, classes
+    assert len(names) == nc, f'{len(names)} names found for nc={nc}'  # check
+
+    # Define output shapes (missing)
+    out0.type.multiArrayType.shape[:] = out0_shape  # (3780, 80)
+    out1.type.multiArrayType.shape[:] = out1_shape  # (3780, 4)
+    # spec.neuralNetwork.preprocessing[0].featureName = '0'
+
+    # Flexible input shapes
+    # from coremltools.models.neural_network import flexible_shape_utils
+    # s = []  # shapes
+    # s.append(flexible_shape_utils.NeuralNetworkImageSize(320, 192))
+    # s.append(flexible_shape_utils.NeuralNetworkImageSize(640, 384))  # (height, width)
+    # flexible_shape_utils.add_enumerated_image_sizes(spec, feature_name='image', sizes=s)
+    # r = flexible_shape_utils.NeuralNetworkImageSizeRange()  # shape ranges
+    # r.add_height_range((192, 640))
+    # r.add_width_range((192, 640))
+    # flexible_shape_utils.update_image_size_range(spec, feature_name='image', size_range=r)
+
+    # Print
+    print(spec.description)
+
+    # Model from spec
+    model = ct.models.MLModel(spec)
+
+    # 3. Create NMS protobuf
+    nms_spec = ct.proto.Model_pb2.Model()
+    nms_spec.specificationVersion = 5
+    for i in range(2):
+        decoder_output = model._spec.description.output[i].SerializeToString()
+        nms_spec.description.input.add()
+        nms_spec.description.input[i].ParseFromString(decoder_output)
+        nms_spec.description.output.add()
+        nms_spec.description.output[i].ParseFromString(decoder_output)
+
+    nms_spec.description.output[0].name = 'confidence'
+    nms_spec.description.output[1].name = 'coordinates'
+
+    output_sizes = [nc, 4]
+    for i in range(2):
+        ma_type = nms_spec.description.output[i].type.multiArrayType
+        ma_type.shapeRange.sizeRanges.add()
+        ma_type.shapeRange.sizeRanges[0].lowerBound = 0
+        ma_type.shapeRange.sizeRanges[0].upperBound = -1
+        ma_type.shapeRange.sizeRanges.add()
+        ma_type.shapeRange.sizeRanges[1].lowerBound = output_sizes[i]
+        ma_type.shapeRange.sizeRanges[1].upperBound = output_sizes[i]
+        del ma_type.shape[:]
+
+    nms = nms_spec.nonMaximumSuppression
+    nms.confidenceInputFeatureName = out0.name  # 1x507x80
+    nms.coordinatesInputFeatureName = out1.name  # 1x507x4
+    nms.confidenceOutputFeatureName = 'confidence'
+    nms.coordinatesOutputFeatureName = 'coordinates'
+    nms.iouThresholdInputFeatureName = 'iouThreshold'
+    nms.confidenceThresholdInputFeatureName = 'confidenceThreshold'
+    nms.iouThreshold = 0.45
+    nms.confidenceThreshold = 0.25
+    nms.pickTop.perClass = True
+    nms.stringClassLabels.vector.extend(names.values())
+    nms_model = ct.models.MLModel(nms_spec)
+
+    # 4. Pipeline models together
+    pipeline = ct.models.pipeline.Pipeline(input_features=[('image', ct.models.datatypes.Array(3, ny, nx)),
+                                                           ('iouThreshold', ct.models.datatypes.Double()),
+                                                           ('confidenceThreshold', ct.models.datatypes.Double())],
+                                           output_features=['confidence', 'coordinates'])
+    pipeline.add_model(model)
+    pipeline.add_model(nms_model)
+
+    # Correct datatypes
+    pipeline.spec.description.input[0].ParseFromString(model._spec.description.input[0].SerializeToString())
+    pipeline.spec.description.output[0].ParseFromString(nms_model._spec.description.output[0].SerializeToString())
+    pipeline.spec.description.output[1].ParseFromString(nms_model._spec.description.output[1].SerializeToString())
+
+    # Update metadata
+    pipeline.spec.specificationVersion = 5
+    pipeline.spec.description.metadata.versionString = 'https://github.com/ultralytics/yolov5'
+    pipeline.spec.description.metadata.shortDescription = 'https://github.com/ultralytics/yolov5'
+    pipeline.spec.description.metadata.author = 'glenn.jocher@ultralytics.com'
+    pipeline.spec.description.metadata.license = 'https://github.com/ultralytics/yolov5/blob/master/LICENSE'
+    pipeline.spec.description.metadata.userDefined.update({
+        'classes': ','.join(names.values()),
+        'iou_threshold': str(nms.iouThreshold),
+        'confidence_threshold': str(nms.confidenceThreshold)})
+
+    # Save the model
+    f = file.with_suffix('.mlmodel')  # filename
+    model = ct.models.MLModel(pipeline.spec)
+    model.input_description['image'] = 'Input image'
+    model.input_description['iouThreshold'] = f'(optional) IOU Threshold override (default: {nms.iouThreshold})'
+    model.input_description['confidenceThreshold'] = \
+        f'(optional) Confidence Threshold override (default: {nms.confidenceThreshold})'
+    model.output_description['confidence'] = 'Boxes × Class confidence (see user-defined metadata "classes")'
+    model.output_description['coordinates'] = 'Boxes × [x, y, width, height] (relative to image size)'
+    model.save(f)  # pipelined
+    print(f'{prefix} pipeline success ({time.time() - t:.2f}s), saved as {f} ({file_size(f):.1f} MB)')
+
+
 @smart_inference_mode()
 def run(
         data=ROOT / 'data/coco128.yaml',  # 'dataset.yaml path'
@@ -584,7 +728,9 @@ def run(
     if xml:  # OpenVINO
         f[3], _ = export_openvino(file, metadata, half)
     if coreml:  # CoreML
-        f[4], _ = export_coreml(model, im, file, int8, half)
+        f[4], ct_model = export_coreml(model, im, file, int8, half, nms)
+        if nms:
+            pipeline_coreml(ct_model, im, file, model.names, y)
     if any((saved_model, pb, tflite, edgetpu, tfjs)):  # TensorFlow formats
         assert not tflite or not tfjs, 'TFLite and TF.js models must be exported separately, please pass only one type.'
         assert not isinstance(model, ClassificationModel), 'ClassificationModel export to TF formats not yet supported.'
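
A minimal usage sketch of the path this patch adds, assuming a default 640x640 export; the weight and image filenames below are placeholders. Passing export.py's existing --nms flag now wraps the detection model in iOSModel and appends the CoreML NMS stage via pipeline_coreml(), so the saved .mlmodel returns post-NMS confidence and coordinates directly:

    # Export with the NMS pipeline (from the yolov5 repo root):
    #   python export.py --weights yolov5s.pt --include coreml --nms
    # Then run the pipelined model with coremltools (macOS only; see the
    # platform check inside pipeline_coreml() above):
    import coremltools as ct
    from PIL import Image

    model = ct.models.MLModel('yolov5s.mlmodel')    # saved by pipeline_coreml()
    img = Image.open('bus.jpg').resize((640, 640))  # must match export --imgsz
    out = model.predict({
        'image': img,
        'iouThreshold': 0.45,            # same defaults baked into the NMS spec
        'confidenceThreshold': 0.25})
    print(out['confidence'].shape, out['coordinates'].shape)  # (n, nc) class scores, (n, 4) normalized xywh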