Skip to content

Commit

Permalink
Fix long-running stress test (triton-inference-server#3346)
Browse files Browse the repository at this point in the history
* Change tensor shape for plan models

* Increase timeout

* Increase sequence idle time

* Add CI output link

* Fix for sequence_no_start

* Use last_seq_choices for sequence-only cases

* Use dict to remember the last-used model for no-end cases

Co-authored-by: Kris Hung <krish@krish-dt.nvidia.com>
  • Loading branch information
krishung5 and Kris Hung authored Sep 13, 2021
1 parent f49f7a8 commit 4301e4b
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 35 deletions.
3 changes: 3 additions & 0 deletions qa/L0_long_running_stress/crashing_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ def crashing_client(model_name,
triton_client,
tensor_shape=(1000,),
input_name="INPUT0"):
if "plan" in model_name:
tensor_shape = (32,)

in0 = np.random.random(tensor_shape).astype(dtype)
if "libtorch" in model_name:
input_name = "INPUT__0"
Expand Down
88 changes: 54 additions & 34 deletions qa/L0_long_running_stress/stress.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@
import Queue as queue

FLAGS = None
CORRELATION_ID_BLOCK_SIZE = 100
DEFAULT_TIMEOUT_MS = 10000
CORRELATION_ID_BLOCK_SIZE = 1024 * 1024
DEFAULT_TIMEOUT_MS = 25000
SEQUENCE_LENGTH_MEAN = 16
SEQUENCE_LENGTH_STDEV = 8
BACKENDS = os.environ.get('BACKENDS', "graphdef savedmodel onnx plan")
Expand Down Expand Up @@ -163,7 +163,8 @@ def check_sequence_async(client_metadata,
now_ms = int(round(time.time() * 1000))
if (now_ms - seq_start_ms) > timeout_ms:
raise TimeoutException(
"Timeout expired for {}".format(sequence_name))
"Timeout expired for {}, got {} ms".format(
sequence_name, (now_ms - seq_start_ms)))

result = results.as_numpy(
output_name)[0] if "nobatch" in trial else results.as_numpy(
Expand Down Expand Up @@ -553,19 +554,18 @@ def stress_thread(name, seed, test_duration, correlation_id_base,
# inference requests. Also create some rare-use contexts that
# are used to make requests with rarely-used correlation IDs.
#
# Need to remember the last choice for each context since we
# don't want some choices to follow others since that gives
# results not expected. See below for details.
Need to remember whether the last sequence case run on each model
was a no-end case, since we don't want some choices to follow others,
as that gives unexpected results. See below for details.
common_cnt = 2
rare_cnt = 8
last_choices = []
is_last_used_no_end = {}

for c in range(common_cnt + rare_cnt):
client_metadata_list.append(
(grpcclient.InferenceServerClient("localhost:8001",
verbose=FLAGS.verbose),
correlation_id_base + c))
last_choices.append(None)

rare_idx = 0
start_time = time.time()
Expand All @@ -587,7 +587,8 @@ def stress_thread(name, seed, test_duration, correlation_id_base,
# scheduler
if choice < 0.33:
count_test_case("sequence_no_end", test_case_count)
last_choices[client_idx] = "sequence_no_end"
is_last_used_no_end[model_name] = True
last_choice = "sequence_no_end"
sequence_no_end(
client_metadata_list[client_idx],
rng,
Expand All @@ -600,7 +601,8 @@ def stress_thread(name, seed, test_duration, correlation_id_base,
sequence_request_count=sequence_request_count)
elif choice < 0.66:
count_test_case("sequence_valid_no_end", test_case_count)
last_choices[client_idx] = "sequence_valid_no_end"
is_last_used_no_end[model_name] = True
last_choice = "sequence_valid_no_end"
sequence_valid_no_end(
client_metadata_list[client_idx],
rng,
Expand All @@ -613,7 +615,8 @@ def stress_thread(name, seed, test_duration, correlation_id_base,
sequence_request_count=sequence_request_count)
else:
count_test_case("sequence_valid_valid", test_case_count)
last_choices[client_idx] = "sequence_valid_valid"
is_last_used_no_end[model_name] = False
last_choice = "sequence_valid_valid"
sequence_valid_valid(
client_metadata_list[client_idx],
rng,
Expand All @@ -630,30 +633,45 @@ def stress_thread(name, seed, test_duration, correlation_id_base,
# Common context...
client_idx = 0
client_metadata = client_metadata_list[client_idx]
last_choice = last_choices[client_idx]

choice = rng.rand()

# no-start cannot follow no-end since the server will
# just assume that the no-start is a continuation of
# the no-end sequence instead of being a sequence
# missing a start flag.
if ((last_choice != "sequence_no_end") and
(last_choice != "sequence_valid_no_end") and
(choice < 0.01)):
count_test_case("sequence_no_start", test_case_count)
last_choices[client_idx] = "sequence_no_start"
sequence_no_start(
client_metadata,
rng,
trial,
model_name,
dtype,
sequence_name=name,
sequence_request_count=sequence_request_count)
if model_name in is_last_used_no_end:
if ((not is_last_used_no_end[model_name]) and
(choice < 0.01)):
count_test_case("sequence_no_start", test_case_count)
is_last_used_no_end[model_name] = False
last_choice = "sequence_no_start"
sequence_no_start(
client_metadata,
rng,
trial,
model_name,
dtype,
sequence_name=name,
sequence_request_count=sequence_request_count)
else:
if choice < 0.01:
count_test_case("sequence_no_start",
test_case_count)
is_last_used_no_end[model_name] = False
last_choice = "sequence_no_start"
sequence_no_start(
client_metadata,
rng,
trial,
model_name,
dtype,
sequence_name=name,
sequence_request_count=sequence_request_count)
elif choice < 0.05:
count_test_case("sequence_no_end", test_case_count)
last_choices[client_idx] = "sequence_no_end"
is_last_used_no_end[model_name] = True
last_choice = "sequence_no_end"
sequence_no_end(
client_metadata,
rng,
Expand All @@ -666,7 +684,8 @@ def stress_thread(name, seed, test_duration, correlation_id_base,
sequence_request_count=sequence_request_count)
elif choice < 0.10:
count_test_case("sequence_valid_no_end", test_case_count)
last_choices[client_idx] = "sequence_valid_no_end"
is_last_used_no_end[model_name] = True
last_choice = "sequence_valid_no_end"
sequence_valid_no_end(
client_metadata,
rng,
Expand All @@ -679,7 +698,8 @@ def stress_thread(name, seed, test_duration, correlation_id_base,
sequence_request_count=sequence_request_count)
elif choice < 0.15:
count_test_case("sequence_valid_valid", test_case_count)
last_choices[client_idx] = "sequence_valid_valid"
is_last_used_no_end[model_name] = False
last_choice = "sequence_valid_valid"
sequence_valid_valid(
client_metadata,
rng,
Expand All @@ -692,7 +712,8 @@ def stress_thread(name, seed, test_duration, correlation_id_base,
sequence_request_count=sequence_request_count)
else:
count_test_case("sequence_valid", test_case_count)
last_choices[client_idx] = "sequence_valid"
is_last_used_no_end[model_name] = False
last_choice = "sequence_valid"
sequence_valid(
client_metadata,
rng,
Expand All @@ -710,26 +731,25 @@ def stress_thread(name, seed, test_duration, correlation_id_base,

if choice < 0.3:
count_test_case("timeout_client", test_case_count)
last_choices[client_idx] = "timeout_client"
last_choice = "timeout_client"
timeout_client(
client_metadata=client_metadata_list[client_idx],
sequence_name=name,
sequence_request_count=sequence_request_count)
elif choice < 0.7:
count_test_case("resnet_model_request", test_case_count)
last_choices[client_idx] = "resnet_model_request"
last_choice = "resnet_model_request"
resnet_model_request(
sequence_name=name,
sequence_request_count=sequence_request_count)
else:
count_test_case("crashing_client", test_case_count)
last_choices[client_idx] = "crashing_client"
last_choice = "crashing_client"
crashing_client(
sequence_name=name,
sequence_request_count=sequence_request_count)
except Exception as ex:
count_failed_test_case(last_choices[client_idx],
failed_test_case_count)
count_failed_test_case(last_choice, failed_test_case_count)
_thread_exceptions_mutex.acquire()
try:
_thread_exceptions.append(traceback.format_exc())
Expand Down
5 changes: 5 additions & 0 deletions qa/L0_long_running_stress/stress_mail.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,21 @@
import sys
sys.path.append("../common")

import os
import nightly_email_helper

import glob
from datetime import date

CI_JOB_ID = os.environ.get('CI_JOB_ID', '')

if __name__ == '__main__':
today = date.today().strftime("%Y-%m-%d")
subject = "Triton Long-Running Stress Test Summary: " + today
stress_report = "stress_report.txt"
link = "https://gitlab-master.nvidia.com/dl/dgx/tritonserver/-/jobs/" + CI_JOB_ID
write_up = "<p>The table below includes results from long-running stress test. Please refer to the description of each test case to see what different kinds of inference requests were sent. Request concurrency is set to 8.</p>"
write_up += "<p>Please check the CI output webpage for the details of the failures: " + link + "</p>"
html_content = "<html><head></head><body><pre style=\"font-size:11pt;font-family:Arial, sans-serif;\">" + write_up + "</pre><pre style=\"font-size:11pt;font-family:Consolas;\">"
with open(stress_report, "r") as f:
html_content += f.read() + "\n"
Expand Down
6 changes: 5 additions & 1 deletion qa/L0_long_running_stress/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,11 @@ source ../common/util.sh
RET=0

# If BACKENDS not specified, set to all
BACKENDS=${BACKENDS:="graphdef savedmodel onnx plan libtorch"}
BACKENDS=${BACKENDS:="graphdef savedmodel onnx libtorch"}
export BACKENDS

export CI_JOB_ID=${CI_JOB_ID}

MODEL_DIR=models

rm -fr *.log *.txt *.serverlog models && mkdir models
Expand All @@ -76,6 +78,7 @@ for MODEL in $MODELS; do
cp -r $MODEL $MODEL_DIR/. && \
(cd $MODEL_DIR/$(basename $MODEL) && \
sed -i "s/^max_batch_size:.*/max_batch_size: 2/" config.pbtxt && \
sed -i "s/max_sequence_idle_microseconds:.*/max_sequence_idle_microseconds: 7000000/" config.pbtxt && \
sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 2/" config.pbtxt && \
sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 2/" config.pbtxt)
done
Expand All @@ -89,6 +92,7 @@ done
for MODEL in $MODELS; do
cp -r $MODEL $MODEL_DIR/. && \
(cd $MODEL_DIR/$(basename $MODEL) && \
sed -i "s/max_sequence_idle_microseconds:.*/max_sequence_idle_microseconds: 7000000/" config.pbtxt && \
sed -i "s/kind: KIND_GPU/kind: KIND_GPU\\ncount: 2/" config.pbtxt && \
sed -i "s/kind: KIND_CPU/kind: KIND_CPU\\ncount: 2/" config.pbtxt)
done
Expand Down

0 comments on commit 4301e4b

Please sign in to comment.