#!/bin/bash
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# ===========================================================================
# qa/L0_backend_python/custom_metrics/test.sh  (new file in this patch)
#
# Stages the custom_metrics unit-test model, starts tritonserver, and runs
# the generic python_unittest.py client against it.  The model itself runs a
# unittest suite exercising the Python custom-metrics API.
# ===========================================================================

CLIENT_PY=../python_unittest.py
CLIENT_LOG="./client.log"
EXPECTED_NUM_TESTS="1"
TEST_RESULT_FILE='test_results.txt'
source ../../common/util.sh

TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
SERVER=${TRITON_DIR}/bin/tritonserver
BACKEND_DIR=${TRITON_DIR}/backends
SERVER_ARGS="--model-repository=`pwd`/models --backend-directory=${BACKEND_DIR} --log-verbose=1"
SERVER_LOG="./inference_server.log"

RET=0
# Start from a clean slate: logs, the staged model repo, and result files.
rm -fr *.log ./models *.txt

# Stage the unit-test model into the repository the server will load.
mkdir -p models/custom_metrics/1/
cp ../../python_models/custom_metrics/model.py models/custom_metrics/1/
cp ../../python_models/custom_metrics/config.pbtxt models/custom_metrics

run_server
if [ "$SERVER_PID" == "0" ]; then
    echo -e "\n***\n*** Failed to start $SERVER\n***"
    cat $SERVER_LOG
    exit 1
fi

set +e

# python_unittest.py discovers the suite via the MODEL_NAME env var.
export MODEL_NAME='custom_metrics'
python3 $CLIENT_PY >> $CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
    echo -e "\n***\n*** 'Custom Metrics' test FAILED. \n***"
    cat $CLIENT_LOG
    RET=1
else
    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
    if [ $? -ne 0 ]; then
        cat $CLIENT_LOG
        echo -e "\n***\n*** Test Result Verification Failed\n***"
        RET=1
    fi
fi

set -e

kill $SERVER_PID
wait $SERVER_PID

if [ $RET -eq 1 ]; then
    cat $CLIENT_LOG
    cat $SERVER_LOG
    echo -e "\n***\n*** Custom Metrics test FAILED. \n***"
else
    echo -e "\n***\n*** Custom Metrics test PASSED. \n***"
fi

exit $RET

# ===========================================================================
# Section appended to qa/L0_backend_python/examples/test.sh by this patch.
#
# FIX(review): the original hunk set RET=1 on a failed server start but then
# fell through, running the client against a dead server and finally calling
# `kill $SERVER_PID` with SERVER_PID=0 — `kill 0` signals the entire process
# group, which would kill the test harness itself.  The client run and the
# kill/wait are now guarded behind a successful start.
# ===========================================================================

# Custom Metrics
CLIENT_LOG="./custom_metrics_client.log"
mkdir -p models/custom_metrics/1
cp examples/custom_metrics/model.py models/custom_metrics/1/model.py
cp examples/custom_metrics/config.pbtxt models/custom_metrics/config.pbtxt
run_server
if [ "$SERVER_PID" == "0" ]; then
    echo -e "\n***\n*** Failed to start $SERVER\n***"
    cat $SERVER_LOG
    RET=1
else
    set +e
    python3 examples/custom_metrics/client.py > $CLIENT_LOG
    if [ $? -ne 0 ]; then
        echo -e "\n***\n*** Failed to verify Custom Metrics example. \n***"
        RET=1
    fi

    # The example client prints PASS on success; require it in the log.
    grep "PASS" $CLIENT_LOG
    if [ $? -ne 0 ]; then
        echo -e "\n***\n*** Failed to verify Custom Metrics example. \n***"
        cat $CLIENT_LOG
        RET=1
    fi
    set -e

    kill $SERVER_PID
    wait $SERVER_PID
fi

# ===========================================================================
# Lines added to qa/L0_backend_python/test.sh by this patch: run the
# custom_metrics sub-test in its own directory and propagate its failure.
# ===========================================================================

(cd custom_metrics && bash -ex test.sh)
if [ $? -ne 0 ]; then
    RET=1
fi
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# Model configuration for the Python-backend custom-metrics unit-test model
# (qa/python_models/custom_metrics/model.py).  The model takes no inputs;
# execute() runs a unittest suite and reports its pass/fail result via
# OUTPUT0.
name: "custom_metrics"
backend: "python"

output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    # NOTE(review): the model emits a single-element result; dims [ 16 ]
    # follows the python_unittest model convention — confirm the backend
    # does not enforce this shape.
    dims: [ 16 ]
  }
]

# Three CPU instances so the metrics APIs are exercised concurrently from
# multiple model instances sharing one server metrics registry.
instance_group [
  {
    count: 3
    kind: KIND_CPU
  }
]
import unittest

import numpy as np
import requests

import triton_python_backend_utils as pb_utils


class PBCustomMetricsTest(unittest.TestCase):
    """In-server unit tests for the Python backend custom-metrics API.

    The suite is executed inside the model's ``execute`` (see
    ``TritonPythonModel`` below) so that ``pb_utils.MetricFamily`` /
    ``Metric`` run in a real backend stub process, and the results are
    cross-checked against the server's Prometheus metrics endpoint.
    """

    def _get_metrics(self):
        """Scrape and return the server's Prometheus metrics text.

        Assumes the default metrics port 8002 on localhost; raises
        ``requests.HTTPError`` on a non-2xx response.
        """
        metrics_url = "http://localhost:8002/metrics"
        r = requests.get(metrics_url)
        r.raise_for_status()
        return r.text

    def _metric_api_helper(self, metric, kind):
        """Exercise value()/increment()/set() for a counter or gauge.

        Args:
            metric: a freshly created ``pb_utils.Metric`` (value must be 0).
            kind: ``'counter'`` or ``'gauge'`` — counters must reject
                negative increments and ``set()``.
        """
        # Adding logger to test if custom metrics and logging work together
        # as they use the same message queue.
        logger = pb_utils.Logger

        # The value should be 0.0 before the test
        self.assertEqual(metric.value(), 0.0)

        # Test increment positive value
        increment = 2023.0
        metric.increment(increment)
        self.assertEqual(metric.value(), increment)
        logger.log_info("Incremented metric to : {}".format(metric.value()))

        # Test increment negative value
        decrement = -23.5
        if kind == 'counter':
            # Counter should not accept negative values
            with self.assertRaises(pb_utils.TritonModelException):
                metric.increment(decrement)
        else:
            metric.increment(decrement)
            self.assertEqual(metric.value(), increment + decrement)
            logger.log_info(
                "Decremented metric to : {}".format(metric.value()))

        # Test set value
        value = 999.9
        if kind == 'counter':
            # Counter does not support set
            with self.assertRaises(pb_utils.TritonModelException):
                metric.set(value)
        else:
            metric.set(value)
            self.assertEqual(metric.value(), value)
            logger.log_info("Set metric to : {}".format(metric.value()))

    def _dup_metric_helper(self, labels=None):
        """Verify duplicate Metric objects share one underlying metric.

        Args:
            labels: label dict passed to both ``Metric`` constructors;
                ``None`` means no labels.
        """
        # FIX: the original used a mutable default argument (labels={}),
        # a classic Python anti-pattern — the dict is shared across calls.
        if labels is None:
            labels = {}

        # Adding logger to test if custom metrics and logging work together
        # as they use the same message queue.
        logger = pb_utils.Logger

        description = "dup metric"
        metric_family = pb_utils.MetricFamily(
            name="test_dup_metric",
            description=description,
            kind=pb_utils.MetricFamily.COUNTER)

        # Verify dupe metrics reference same underlying metric
        metric1 = metric_family.Metric(labels=labels)
        metric2 = metric_family.Metric(labels=labels)

        # The value should be 0 before the test
        self.assertEqual(metric1.value(), 0.0)
        self.assertEqual(metric2.value(), 0.0)

        # Increment metric 1, check metric 2 == metric 1
        increment = 7.5
        metric1.increment(increment)
        self.assertEqual(metric1.value(), metric2.value())
        logger.log_info("Incremented metric1 to : {}".format(metric1.value()))
        logger.log_info("Incremented metric2 to : {}".format(metric2.value()))

        # Assert custom metric/family remains when there's still a
        # reference to it
        del metric1
        metrics = self._get_metrics()
        self.assertIn(description, metrics)

    def test_counter_e2e(self):
        """Counter kind end to end: API behavior plus Prometheus export."""
        metric_family = pb_utils.MetricFamily(
            name="test_counter_e2e",
            description="test metric counter kind end to end",
            kind=pb_utils.MetricFamily.COUNTER)
        labels = {"example1": "counter_label1", "example2": "counter_label2"}
        metric = metric_family.Metric(labels=labels)
        self._metric_api_helper(metric, 'counter')

        # The exported sample must carry both labels in sorted-key order.
        pattern = ('test_counter_e2e{example1="counter_label1",'
                   'example2="counter_label2"}')
        metrics = self._get_metrics()
        self.assertIn(pattern, metrics)

    def test_gauge_e2e(self):
        """Gauge kind end to end: API behavior plus Prometheus export."""
        metric_family = pb_utils.MetricFamily(
            name="test_gauge_e2e",
            description="test metric gauge kind end to end",
            kind=pb_utils.MetricFamily.GAUGE)
        labels = {"example1": "counter_label1", "example2": "counter_label2"}
        metric = metric_family.Metric(labels=labels)
        self._metric_api_helper(metric, 'gauge')

        pattern = ('test_gauge_e2e{example1="counter_label1",'
                   'example2="counter_label2"}')
        metrics = self._get_metrics()
        self.assertIn(pattern, metrics)

    def test_dup_metric_family_diff_kind(self):
        """A family name cannot be re-registered with a different kind."""
        metric_family1 = pb_utils.MetricFamily(
            name="test_dup_metric_family_diff_kind",
            description="test metric family with same name but different kind",
            kind=pb_utils.MetricFamily.COUNTER)
        # FIX: the original asserted `assertIsNone(metric_family2)` inside
        # this `with` block, after the raising constructor call — that line
        # was unreachable dead code (the exception aborts the block first).
        with self.assertRaises(pb_utils.TritonModelException):
            pb_utils.MetricFamily(
                name="test_dup_metric_family_diff_kind",
                description=
                "test metric family with same name but different kind",
                kind=pb_utils.MetricFamily.GAUGE)

        self.assertIsNotNone(metric_family1)

    def test_dup_metric_family_diff_description(self):
        """A duplicate family name keeps the original description."""
        # Test that a duplicate metric family name will still return the
        # original metric family even if the description is changed
        metric_family1 = pb_utils.MetricFamily(
            name="test_dup_metric_family_diff_description",
            description="first description",
            kind=pb_utils.MetricFamily.COUNTER)
        metric_family2 = pb_utils.MetricFamily(
            name="test_dup_metric_family_diff_description",
            description="second description",
            kind=pb_utils.MetricFamily.COUNTER)

        metric2 = metric_family2.Metric()
        self.assertEqual(metric2.value(), 0)

        # Delete metric_family1 and check if metric_family2 still
        # references it
        del metric_family1
        pattern = 'test_dup_metric_family_diff_description first description'
        metrics = self._get_metrics()
        self.assertIn(pattern, metrics)

        # The first description will be kept if adding a duplicate metric
        # family name with a different description
        pattern = 'test_dup_metric_family_diff_description second description'
        self.assertNotIn(pattern, metrics)

    def test_dup_metric_family(self):
        """Duplicate family registration reuses the original registry entry."""
        metric_family1 = pb_utils.MetricFamily(
            name="test_dup_metric_family",
            description="dup description",
            kind=pb_utils.MetricFamily.COUNTER)
        metric_family2 = pb_utils.MetricFamily(
            name="test_dup_metric_family",
            description="dup description",
            kind=pb_utils.MetricFamily.COUNTER)

        metric_key = "custom_metric_key"
        metric1 = metric_family1.Metric(labels={metric_key: "label1"})
        metric2 = metric_family2.Metric(labels={metric_key: "label2"})

        self.assertEqual(metric1.value(), 0)
        self.assertEqual(metric2.value(), 0)

        # One HELP/TYPE pair, two labeled samples — i.e. a single family.
        patterns = [
            '# HELP test_dup_metric_family dup description',
            '# TYPE test_dup_metric_family counter',
            'test_dup_metric_family{custom_metric_key="label2"} 0',
            'test_dup_metric_family{custom_metric_key="label1"} 0'
        ]
        metrics = self._get_metrics()
        for pattern in patterns:
            self.assertIn(pattern, metrics)

    def test_dup_metric_labels(self):
        # Test that adding a duplicate metric will refer to the same
        # underlying metric, and all instances will be updated
        labels = {"example1": "label1", "example2": "label2"}
        self._dup_metric_helper(labels)

    def test_dup_metric_empty_labels(self):
        # Test that adding a duplicate metric will refer to the same
        # underlying metric, and all instances will be updated
        self._dup_metric_helper()


class TritonPythonModel:
    """Model entry point: runs the suite above once per request."""

    def execute(self, requests):
        """Run the unittest suite and report overall success in OUTPUT0.

        Returns one InferenceResponse per request whose OUTPUT0 holds 1.0
        when every test passed and 0.0 otherwise.
        """
        responses = []
        for _ in requests:
            # Run the unittest and store the results in InferenceResponse.
            test = unittest.main('model', exit=False)
            responses.append(
                pb_utils.InferenceResponse([
                    pb_utils.Tensor(
                        'OUTPUT0',
                        # FIX: use float32 to match OUTPUT0's declared
                        # TYPE_FP32 in config.pbtxt (was np.float16).
                        np.array([test.result.wasSuccessful()],
                                 dtype=np.float32))
                ]))
        return responses