Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,18 +1,6 @@
#!/usr/bin/env python3

# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
from copy import deepcopy
Expand Down Expand Up @@ -132,9 +120,11 @@ def _binary_search_over_top_results(self) -> Generator[RunConfig, None, None]:
for result in top_results:
run_config = deepcopy(result.run_config())
model_parameters = self._get_model_parameters(model_name)
perf_analyzer_flags = self._get_model_perf_analyzer_flags(model_name)
parameter_search = ParameterSearch(
config=self._config,
model_parameters=model_parameters,
perf_analyzer_flags=perf_analyzer_flags,
skip_parameter_sweep=True,
)
for parameter in parameter_search.search_parameters():
Expand All @@ -151,6 +141,12 @@ def _get_model_parameters(self, model_name: str) -> Dict:

return {}

def _get_model_perf_analyzer_flags(self, model_name: str) -> Dict:
for model in self._models:
if model_name == model.model_name():
return model.perf_analyzer_flags()
return {}

def _set_parameter(
self, run_config: RunConfig, model_parameters: Dict, parameter: int
) -> RunConfig:
Expand Down
28 changes: 14 additions & 14 deletions model_analyzer/config/generate/model_profile_spec.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,6 @@
#!/usr/bin/env python3

# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from copy import deepcopy
from typing import List
Expand All @@ -22,6 +10,7 @@
ConfigModelProfileSpec,
)
from model_analyzer.device.gpu_device import GPUDevice
from model_analyzer.perf_analyzer.perf_config import PerfAnalyzerConfig
from model_analyzer.triton.client.client import TritonClient
from model_analyzer.triton.model.model_config import ModelConfig

Expand Down Expand Up @@ -72,3 +61,14 @@ def supports_dynamic_batching(self) -> bool:
def is_ensemble(self) -> bool:
"""Returns true if the model is an ensemble"""
return "ensemble_scheduling" in self._default_model_config

def is_load_specified(self) -> bool:
    """
    Returns true if the model's PA config has specified any of the
    inference load args (such as concurrency). Else returns false
    """
    pa_flags = self.perf_analyzer_flags()
    if pa_flags is None:
        return False
    # Any one of the inference load args being present means the user
    # has pinned the load themselves.
    for load_arg in PerfAnalyzerConfig.get_inference_load_args():
        if load_arg in pa_flags:
            return True
    return False
36 changes: 14 additions & 22 deletions model_analyzer/config/generate/perf_analyzer_config_generator.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,6 @@
#!/usr/bin/env python3

# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
from typing import Generator, List, Optional
Expand Down Expand Up @@ -169,10 +157,12 @@ def set_last_results(
self._parameter_results.extend(measurement)

def _create_parameter_list(self) -> List[int]:
# The two possible parameters are request rate or concurrency
# Concurrency is the default and will be used unless the user specifies
# request rate, either as a model parameter or a config option
if self._cli_config.is_request_rate_specified(self._model_parameters):
# Determines the inference load (concurrency or request-rate or request-intervals)
# and creates the list of values to use. If nothing is specified by the user, then
# concurrency will be used.
if "request-intervals" in self._perf_analyzer_flags:
return [self._perf_analyzer_flags["request-intervals"]]
elif self._cli_config.is_request_rate_specified(self._model_parameters):
return self._create_request_rate_list()
else:
return self._create_concurrency_list()
Expand Down Expand Up @@ -207,7 +197,7 @@ def _generate_perf_configs(self) -> None:
for params in utils.generate_parameter_combinations(
perf_config_non_parameter_values
):
configs_with_concurrency = []
configs_with_inference_load = []
for parameter in self._parameters:
new_perf_config = PerfAnalyzerConfig()

Expand All @@ -217,16 +207,18 @@ def _generate_perf_configs(self) -> None:

new_perf_config.update_config(params)

if self._cli_config.is_request_rate_specified(self._model_parameters):
if "request-intervals" in self._perf_analyzer_flags:
pass
elif self._cli_config.is_request_rate_specified(self._model_parameters):
new_perf_config.update_config({"request-rate-range": parameter})
else:
new_perf_config.update_config({"concurrency-range": parameter})

# User provided flags can override the search parameters
new_perf_config.update_config(self._perf_analyzer_flags)

configs_with_concurrency.append(new_perf_config)
self._configs.append(configs_with_concurrency)
configs_with_inference_load.append(new_perf_config)
self._configs.append(configs_with_inference_load)

def _create_non_parameter_perf_config_values(self) -> dict:
perf_config_values = {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,21 +1,8 @@
#!/usr/bin/env python3

# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
from copy import deepcopy
from typing import Generator, List, Optional

from model_analyzer.config.generate.concurrency_sweeper import ConcurrencySweeper
Expand All @@ -30,7 +17,6 @@
from model_analyzer.config.input.config_command_profile import ConfigCommandProfile
from model_analyzer.config.run.run_config import RunConfig
from model_analyzer.constants import LOGGER_NAME
from model_analyzer.result.parameter_search import ParameterSearch
from model_analyzer.result.result_manager import ResultManager
from model_analyzer.result.run_config_measurement import RunConfigMeasurement

Expand Down
44 changes: 16 additions & 28 deletions model_analyzer/config/generate/quick_run_config_generator.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,6 @@
#!/usr/bin/env python3

# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging
from sys import maxsize
Expand Down Expand Up @@ -507,13 +495,13 @@ def _get_next_perf_analyzer_config(

perf_analyzer_config.update_config_from_profile_config(model_name, self._config)

concurrency = self._calculate_concurrency(dimension_values)

perf_config_params = {
"batch-size": DEFAULT_BATCH_SIZES,
"concurrency-range": concurrency,
}
perf_analyzer_config.update_config(perf_config_params)
if not model.is_load_specified():
concurrency = self._calculate_concurrency(dimension_values)
perf_config_params = {
"batch-size": DEFAULT_BATCH_SIZES,
"concurrency-range": concurrency,
}
perf_analyzer_config.update_config(perf_config_params)

perf_analyzer_config.update_config(model.perf_analyzer_flags())
return perf_analyzer_config
Expand Down Expand Up @@ -703,13 +691,13 @@ def _create_default_perf_analyzer_config(
model_config.get_field("name"), self._config
)

default_concurrency = self._calculate_default_concurrency(model_config)

perf_config_params = {
"batch-size": DEFAULT_BATCH_SIZES,
"concurrency-range": default_concurrency,
}
default_perf_analyzer_config.update_config(perf_config_params)
if not model.is_load_specified():
default_concurrency = self._calculate_default_concurrency(model_config)
perf_config_params = {
"batch-size": DEFAULT_BATCH_SIZES,
"concurrency-range": default_concurrency,
}
default_perf_analyzer_config.update_config(perf_config_params)

default_perf_analyzer_config.update_config(model.perf_analyzer_flags())

Expand Down
38 changes: 23 additions & 15 deletions model_analyzer/perf_analyzer/perf_config.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,6 @@
#!/usr/bin/env python3

# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# SPDX-FileCopyrightText: Copyright (c) 2020-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from typing import List

Expand Down Expand Up @@ -98,6 +86,13 @@ class PerfAnalyzerConfig:
"collect-metrics",
]

# Only one of these args can be sent to PA, as each one controls the inference load in a different way
inference_load_args = [
"concurrency-range",
"request-rate-range",
"request-intervals",
]

def __init__(self):
"""
Construct a PerfAnalyzerConfig
Expand All @@ -108,7 +103,9 @@ def __init__(self):
self._options = {
"-m": None,
"-x": None,
"-b": None,
# Default to batch size of 1. This would be handled by PA if unspecified,
# but we want to be explicit so we can properly print/track values
"-b": 1,
"-u": None,
"-i": None,
"-f": None,
Expand Down Expand Up @@ -160,6 +157,16 @@ def additive_keys(cls):

return cls.additive_args[:]

@classmethod
def get_inference_load_args(cls):
    """
    Returns
    -------
    list of str
        The Perf Analyzer args that control the inference load
        (only one of these may be passed to PA at a time)
    """
    # Return a copy so callers cannot mutate the shared class-level
    # list — consistent with additive_keys(), which also copies.
    return cls.inference_load_args[:]

def update_config(self, params=None):
"""
Allows setting values from a params dict
Expand Down Expand Up @@ -275,6 +282,7 @@ def extract_model_specific_parameters(self):
"batch-size": self._options["-b"],
"concurrency-range": self._args["concurrency-range"],
"request-rate-range": self._args["request-rate-range"],
"request-intervals": self._args["request-intervals"],
}

@classmethod
Expand Down
Loading
Loading