diff --git a/run.py b/run.py
index ff12c2cc6..c41d90d83 100644
--- a/run.py
+++ b/run.py
@@ -477,18 +477,17 @@ def main() -> None:
     )
     parser.add_argument(
         "--metrics-gpu-backend",
-        choices=["dcgm", "default"],
-        default="default",
+        choices=["torch", "nvml", "dcgm"],
+        default="torch",
         help="""
-        Specify the backend [dcgm, default] to collect metrics.
-        In default mode, the latency(execution time) is collected by time.time_ns() and it is always enabled.
-        Optionally,
-        - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process().
-        - you can specify gpu peak memory usage by --metrics gpu_peak_mem, and it is collected by nvml library.
-        - you can specify flops by --metrics flops, and it is collected by fvcore.
-        In dcgm mode, the latency(execution time) is collected by time.time_ns() and it is always enabled.
-        Optionally,
-        - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process().
-        - you can specify cpu and gpu peak memory usage by --metrics cpu_peak_mem,gpu_peak_mem, and they are collected by dcgm library.""",
+        Specify the backend [torch, nvml, dcgm] to collect metrics. In all modes,
+        the latency (execution time) is always collected using `time.time_ns()`. The CPU
+        and GPU peak memory usage metrics are optional. The CPU peak memory usage is
+        collected by `psutil.Process()` in all modes. In nvml mode, the GPU peak memory
+        usage is collected by the `nvml` library. In dcgm mode, the GPU peak memory usage is
+        collected by the `dcgm` library. In torch mode, the GPU peak memory usage is collected
+        by `torch.cuda.max_memory_allocated()`.
+        """,
     )
     args, extra_args = parser.parse_known_args()
     if args.cudastreams and not args.device == "cuda":
@@ -541,7 +540,7 @@ def main() -> None:
            )
 
            check_dcgm()
-        elif "gpu_peak_mem" in metrics_needed:
+        elif metrics_gpu_backend == "nvml":
            from torchbenchmark._components.model_analyzer.TorchBenchAnalyzer import (
                check_nvml,
            )
diff --git a/torchbenchmark/util/experiment/metrics.py b/torchbenchmark/util/experiment/metrics.py
index ed0a21720..fd206b22a 100644
--- a/torchbenchmark/util/experiment/metrics.py
+++ b/torchbenchmark/util/experiment/metrics.py
@@ -4,11 +4,15 @@
 
 import copy
 import dataclasses
+import os
 import pathlib
 import time
 from typing import List, Optional, Tuple, Union
 
+import psutil
+
 import torch
+
 from torchbenchmark import ModelTask
 from torchbenchmark.util.experiment.instantiator import TorchBenchModelConfig
 from torchbenchmark.util.model import BenchmarkModel
@@ -31,6 +35,11 @@ class TorchBenchModelMetrics:
     model_flops: Optional[float]
 
 
+def maybe_synchronize(device: str):
+    if device == "cuda":
+        torch.cuda.synchronize()
+
+
 def get_latencies(
     func, device: str, nwarmup=WARMUP_ROUNDS, num_iter=BENCHMARK_ITERS
 ) -> List[float]:
@@ -62,14 +71,10 @@ def get_peak_memory(
     num_iter=MEMPROF_ITER,
     export_metrics_file="",
     metrics_needed=[],
-    metrics_gpu_backend="dcgm",
+    metrics_gpu_backend="torch",
     cpu_monitored_pid=None,
 ) -> Tuple[Optional[float], Optional[str], Optional[float]]:
     "Run one step of the model, and return the peak memory in MB."
-    from torchbenchmark._components.model_analyzer.TorchBenchAnalyzer import (
-        ModelAnalyzer,
-    )
-
     new_metrics_needed = [
         _ for _ in metrics_needed if _ in ["cpu_peak_mem", "gpu_peak_mem"]
     ]
@@ -77,10 +82,19 @@ def get_peak_memory(
         raise ValueError(
             f"Expected metrics_needed to be non-empty, get: {metrics_needed}"
         )
-    mem_model_analyzer = ModelAnalyzer(
-        export_metrics_file, new_metrics_needed, metrics_gpu_backend, cpu_monitored_pid
-    )
-    continue_num_iter = BENCHMARK_ITERS - num_iter
+    if metrics_gpu_backend in ["dcgm", "nvml"]:
+        from torchbenchmark._components.model_analyzer.TorchBenchAnalyzer import (
+            ModelAnalyzer,
+        )
+
+        mem_model_analyzer = ModelAnalyzer(
+            export_metrics_file,
+            new_metrics_needed,
+            metrics_gpu_backend,
+            cpu_monitored_pid,
+        )
+    else:
+        mem_model_analyzer = None
 
     def work_func():
         if device == "cuda":
@@ -99,22 +113,37 @@ def work_func():
         num_iter = BENCHMARK_ITERS
     else:
         num_iter = MEMPROF_ITER
-    mem_model_analyzer.start_monitor()
-    for _i in range(num_iter):
-        work_func()
-    mem_model_analyzer.stop_monitor()
-    mem_model_analyzer.aggregate()
     device_id = None
     gpu_peak_mem = None
     cpu_peak_mem = None
-    if "gpu_peak_mem" in metrics_needed:
-        device_id, gpu_peak_mem = mem_model_analyzer.calculate_gpu_peak_mem()
-    if "cpu_peak_mem" in metrics_needed:
-        cpu_peak_mem = mem_model_analyzer.calculate_cpu_peak_mem()
-    if export_metrics_file:
-        mem_model_analyzer.update_export_name("_peak_memory")
-        mem_model_analyzer.export_all_records_to_csv()
+
+    if mem_model_analyzer:
+        mem_model_analyzer.start_monitor()
+        for _i in range(num_iter):
+            work_func()
+        mem_model_analyzer.stop_monitor()
+        mem_model_analyzer.aggregate()
+
+        if "gpu_peak_mem" in metrics_needed:
+            device_id, gpu_peak_mem = mem_model_analyzer.calculate_gpu_peak_mem()
+        if "cpu_peak_mem" in metrics_needed:
+            cpu_peak_mem = mem_model_analyzer.calculate_cpu_peak_mem()
+        if export_metrics_file:
+            mem_model_analyzer.update_export_name("_peak_memory")
+            mem_model_analyzer.export_all_records_to_csv()
+    else:
+        if device == "cuda":
+            torch.cuda.reset_peak_memory_stats()
+            torch.cuda.empty_cache()
+        for _ in range(num_iter):
+            work_func()
+        if device == "cuda":
+            device_id = torch.cuda.current_device()
+            gpu_peak_mem = torch.cuda.max_memory_allocated() / 10**9
+        total = psutil.virtual_memory().total
+        percentage = psutil.Process(os.getpid()).memory_percent()
+        cpu_peak_mem = percentage * total / 10**9
     return cpu_peak_mem, device_id, gpu_peak_mem
diff --git a/torchbenchmark/util/triton_op.py b/torchbenchmark/util/triton_op.py
index 45287556b..ded41ee8d 100644
--- a/torchbenchmark/util/triton_op.py
+++ b/torchbenchmark/util/triton_op.py
@@ -892,7 +892,7 @@ def _init_extra_metrics() -> Dict[str, Any]:
                    or "gpu_peak_mem" in self.required_metrics
                ):
                    metrics.cpu_peak_mem, _device_id, metrics.gpu_peak_mem = (
-                        self.get_peak_mem(fn)
+                        self.get_peak_mem(fn, self.tb_args.metrics_gpu_backend)
                    )
                if not baseline and "accuracy" in self.required_metrics:
                    metrics.accuracy = (
@@ -1014,13 +1014,13 @@ def _init_extra_metrics() -> Dict[str, Any]:
         return metrics
 
     def get_peak_mem(
-        self, fn: Callable
+        self, fn: Callable, metrics_memory_usage_backend: str
     ) -> Tuple[Optional[float], Optional[str], Optional[float]]:
         return get_peak_memory(
             func=fn,
             device=self.device,
             metrics_needed=["gpu_peak_mem", "cpu_peak_mem"],
-            metrics_gpu_backend="nvml",
+            metrics_gpu_backend=metrics_memory_usage_backend,
         )
 
     def nsys_rep(self, input_id: int, fn_name: str) -> str:
diff --git a/userbenchmark/triton/run.py b/userbenchmark/triton/run.py
index d9ac57e6a..32dd41dd3 100644
--- a/userbenchmark/triton/run.py
+++ b/userbenchmark/triton/run.py
@@ -93,6 +93,18 @@ def get_parser(args=None):
         default=None,
         help="Metrics to collect, split with comma. E.g., --metrics latency,tflops,speedup.",
     )
+    parser.add_argument(
+        "--metrics-gpu-backend",
+        choices=["torch", "nvml"],
+        default="torch",
+        help=(
+            "Specify the backend [torch, nvml] to collect metrics. In all modes, the latency "
+            "(execution time) is always collected using `time.time_ns()`. The CPU peak memory "
+            "usage is collected by `psutil.Process()`. In nvml mode, the GPU peak memory usage "
+            "is collected by the `nvml` library. In torch mode, the GPU peak memory usage is "
+            "collected by `torch.cuda.max_memory_allocated()`."
+        ),
+    )
     parser.add_argument(
         "--only",
         default=None,
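
For reference, below is a minimal standalone sketch of the measurement pattern the torch backend above relies on: reset the CUDA allocator's high-water mark, run the workload, then read `torch.cuda.max_memory_allocated()`. The CPU-side figure here is taken from the process RSS via `psutil` (the patch itself derives it from `psutil.Process().memory_percent()`). Both `measure_peak_memory` and `run_one_step` are hypothetical names used only for this illustration; this is not code from the patch.

import os

import psutil
import torch


def measure_peak_memory(run_one_step, device: str = "cuda", num_iter: int = 2):
    """Return (cpu_peak_mem_gb, device_id, gpu_peak_mem_gb) using the torch-backend pattern."""
    device_id = None
    gpu_peak_mem = None
    if device == "cuda":
        # Reset the allocator's peak counter and drop cached blocks before measuring.
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.empty_cache()
    for _ in range(num_iter):
        run_one_step()
    if device == "cuda":
        # Wait for outstanding kernels, mirroring maybe_synchronize() above.
        torch.cuda.synchronize()
        device_id = torch.cuda.current_device()
        # Peak bytes allocated by this process on the current device, in GB.
        gpu_peak_mem = torch.cuda.max_memory_allocated() / 10**9
    # Resident set size of the current process, in GB.
    cpu_peak_mem = psutil.Process(os.getpid()).memory_info().rss / 10**9
    return cpu_peak_mem, device_id, gpu_peak_mem

With the patch applied, passing --metrics cpu_peak_mem,gpu_peak_mem --metrics-gpu-backend torch to run.py should exercise the equivalent branch inside get_peak_memory().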