From 26553e1a3d7dea7bbc3466654a6cdf94231992b6 Mon Sep 17 00:00:00 2001
From: Ryan McCormick <rmccormick@nvidia.com>
Date: Mon, 15 May 2023 23:02:06 -0700
Subject: [PATCH] Fix L0_memory_growth (#5795)

(1) reduce MAX_ALLOWED_ALLOC to be more strict for bounded tests, and generous for unbounded tests.
(2) allow unstable measurement from PA.
(3) improve logging for future triage
---
 qa/L0_memory_growth/test.sh | 41 ++++++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/qa/L0_memory_growth/test.sh b/qa/L0_memory_growth/test.sh
index d6778a168e..64277e6b6e 100755
--- a/qa/L0_memory_growth/test.sh
+++ b/qa/L0_memory_growth/test.sh
@@ -92,8 +92,12 @@ else
 fi
 
 # Threshold memory growth in MB
-MAX_ALLOWED_ALLOC="150"
-export MAX_ALLOWED_ALLOC
+# NOTES:
+# - Bounded memory growth tests typically show < 70 MB usage
+#   - Plan/ONNX is typically between 20-40 MB
+#   - Savedmodel is closer to 50-70 MB
+# - Unbounded memory growth test typically shows > 100 MB usage
+export MAX_ALLOWED_ALLOC="100"
 
 # Create local model repository
 mkdir -p models/
@@ -164,23 +168,24 @@ for MODEL in $(ls models); do
     SECONDS=0
     # Run the perf analyzer 'REPETITION' times
     for ((i=1; i<=$REPETITION; i++)); do
+        # [TMA-621] Use --no-stability mode in perf analyzer when available
         $PERF_ANALYZER -v -m $MODEL -i grpc --concurrency-range $CONCURRENCY -b $CLIENT_BS > $TEMP_CLIENT_LOG 2>&1
-        # Only record failure log for unexpected perf analyzer error
-        # [TMA-625] Currently check failure log as WAR, should check for specific
-        # code once perf analyzer returns different code for different error
-        if [ $? -ne 0 ] && [ `grep -c "^No valid requests recorded" $TEMP_CLIENT_LOG` == "0" ]; then
+        PA_RET=$?
+        # Success
+        if [ ${PA_RET} -eq 0 ]; then
+          continue
+        # Unstable measurement: OK for this test
+        elif [ ${PA_RET} -eq 2 ]; then
+          continue
+        # Other failures unexpected, report error
+        else
             cat $TEMP_CLIENT_LOG >> $CLIENT_LOG
             echo -e "\n***\n*** perf_analyzer for $MODEL failed on iteration $i\n***" >> $CLIENT_LOG
-            TEMP_RET=1
+            RET=1
         fi
     done
     TEST_DURATION=$SECONDS
 
-    if [ $TEMP_RET -ne 0 ]; then
-        cat $CLIENT_LOG
-        RET=1
-    fi
-
     set -e
 
     # Stop Server
@@ -200,9 +205,11 @@ for MODEL in $(ls models); do
     python $MASSIF_TEST $MASSIF_LOG $MAX_ALLOWED_ALLOC --start-from-middle >> $CLIENT_LOG 2>&1
     if [ $? -ne 0 ]; then
         cat $CLIENT_LOG
-        echo -e "\n***\n*** Test for $MODEL Failed\n***"
+        echo -e "\n***\n*** Test for $MODEL Failed.\n***"
         RET=1
     fi
+    # Always output memory usage for easier triage of MAX_ALLOWED_ALLOC settings in the future
+    grep -i "Change in memory allocation" "${CLIENT_LOG}" || true
     set -e
 done
 
@@ -272,11 +279,17 @@ if [ $SKIP_BUSYOP -ne 1 ]; then
     cat ${GRAPH_LOG}
     # Check the massif output
     python $MASSIF_TEST $MASSIF_LOG $MAX_ALLOWED_ALLOC --start-from-middle >> $CLIENT_LOG 2>&1
+    # This busyop test is expected to return a non-zero error since it is
+    # intentionally testing unbounded growth. If it returns success for some
+    # reason, raise error.
     if [ $? -ne 1 ]; then
         cat $CLIENT_LOG
-        echo -e "\n***\n*** Test for graphdef_busyop Failed\n***"
+        echo -e "\n***\n*** Massif test for graphdef_busyop Failed\n***"
+        echo -e "\n***\n*** Expected unbounded growth, but found acceptable growth within ${MAX_ALLOWED_ALLOC} MB\n***"
         RET=1
     fi
+    # Always output memory usage for easier triage of MAX_ALLOWED_ALLOC settings in the future
+    grep -i "Change in memory allocation" "${CLIENT_LOG}" || true
 fi
 set -e