diff --git a/run.py b/run.py
index ff12c2cc6..c41d90d83 100644
--- a/run.py
+++ b/run.py
@@ -477,18 +477,17 @@ def main() -> None:
     )
     parser.add_argument(
         "--metrics-gpu-backend",
-        choices=["dcgm", "default"],
-        default="default",
+        choices=["torch", "nvml", "dcgm"],
+        default="torch",
         help="""
-        Specify the backend [dcgm, default] to collect metrics.
-        In default mode, the latency(execution time) is collected by time.time_ns() and it is always enabled.
-        Optionally,
-        - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process().
-        - you can specify gpu peak memory usage by --metrics gpu_peak_mem, and it is collected by nvml library.
-        - you can specify flops by --metrics flops, and it is collected by fvcore.
-        In dcgm mode, the latency(execution time) is collected by time.time_ns() and it is always enabled.
-        Optionally,
-        - you can specify cpu peak memory usage by --metrics cpu_peak_mem, and it is collected by psutil.Process().
-        - you can specify cpu and gpu peak memory usage by --metrics cpu_peak_mem,gpu_peak_mem, and they are collected by dcgm library.""",
+        Specify the backend [torch, nvml, dcgm] to collect metrics. In all modes,
+        the latency (execution time) is always collected using `time.time_ns()`. The CPU
+        and GPU peak memory usage metrics are optional. The CPU peak memory usage is
+        collected by `psutil.Process()` in all modes. In nvml mode, the GPU peak memory
+        usage is collected by the `nvml` library. In dcgm mode, the GPU peak memory usage is
+        collected by the `dcgm` library. In torch mode, the GPU peak memory usage is collected
+        by `torch.cuda.max_memory_allocated()`.
+        """,
     )
     args, extra_args = parser.parse_known_args()
     if args.cudastreams and not args.device == "cuda":
@@ -541,7 +540,7 @@ def main() -> None:
            )
 
            check_dcgm()
-        elif "gpu_peak_mem" in metrics_needed:
+        elif metrics_gpu_backend == "nvml":
            from torchbenchmark._components.model_analyzer.TorchBenchAnalyzer import (
                check_nvml,
            )
diff --git a/torchbenchmark/util/experiment/metrics.py b/torchbenchmark/util/experiment/metrics.py
index ed0a21720..fd206b22a 100644
--- a/torchbenchmark/util/experiment/metrics.py
+++ b/torchbenchmark/util/experiment/metrics.py
@@ -4,11 +4,15 @@
 
 import copy
 import dataclasses
+import os
 import pathlib
 import time
 from typing import List, Optional, Tuple, Union
 
+import psutil
+
 import torch
+
 from torchbenchmark import ModelTask
 from torchbenchmark.util.experiment.instantiator import TorchBenchModelConfig
 from torchbenchmark.util.model import BenchmarkModel
@@ -31,6 +35,11 @@ class TorchBenchModelMetrics:
     model_flops: Optional[float]
 
 
+def maybe_synchronize(device: str):
+    if device == "cuda":
+        torch.cuda.synchronize()
+
+
 def get_latencies(
     func, device: str, nwarmup=WARMUP_ROUNDS, num_iter=BENCHMARK_ITERS
 ) -> List[float]:
@@ -62,14 +71,10 @@ def get_peak_memory(
     num_iter=MEMPROF_ITER,
     export_metrics_file="",
     metrics_needed=[],
-    metrics_gpu_backend="dcgm",
+    metrics_gpu_backend="torch",
     cpu_monitored_pid=None,
 ) -> Tuple[Optional[float], Optional[str], Optional[float]]:
     "Run one step of the model, and return the peak memory in MB."
-    from torchbenchmark._components.model_analyzer.TorchBenchAnalyzer import (
-        ModelAnalyzer,
-    )
-
     new_metrics_needed = [
         _ for _ in metrics_needed if _ in ["cpu_peak_mem", "gpu_peak_mem"]
     ]
@@ -77,10 +82,19 @@ def get_peak_memory(
         raise ValueError(
             f"Expected metrics_needed to be non-empty, get: {metrics_needed}"
         )
-    mem_model_analyzer = ModelAnalyzer(
-        export_metrics_file, new_metrics_needed, metrics_gpu_backend, cpu_monitored_pid
-    )
-    continue_num_iter = BENCHMARK_ITERS - num_iter
+    if metrics_gpu_backend in ["dcgm", "nvml"]:
+        from torchbenchmark._components.model_analyzer.TorchBenchAnalyzer import (
+            ModelAnalyzer,
+        )
+
+        mem_model_analyzer = ModelAnalyzer(
+            export_metrics_file,
+            new_metrics_needed,
+            metrics_gpu_backend,
+            cpu_monitored_pid,
+        )
+    else:
+        mem_model_analyzer = None
 
     def work_func():
         if device == "cuda":
@@ -99,22 +113,37 @@ def work_func():
         num_iter = BENCHMARK_ITERS
     else:
         num_iter = MEMPROF_ITER
-    mem_model_analyzer.start_monitor()
-    for _i in range(num_iter):
-        work_func()
-    mem_model_analyzer.stop_monitor()
-    mem_model_analyzer.aggregate()
     device_id = None
     gpu_peak_mem = None
     cpu_peak_mem = None
-    if "gpu_peak_mem" in metrics_needed:
-        device_id, gpu_peak_mem = mem_model_analyzer.calculate_gpu_peak_mem()
-    if "cpu_peak_mem" in metrics_needed:
-        cpu_peak_mem = mem_model_analyzer.calculate_cpu_peak_mem()
-    if export_metrics_file:
-        mem_model_analyzer.update_export_name("_peak_memory")
-        mem_model_analyzer.export_all_records_to_csv()
+
+    if mem_model_analyzer:
+        mem_model_analyzer.start_monitor()
+        for _i in range(num_iter):
+            work_func()
+        mem_model_analyzer.stop_monitor()
+        mem_model_analyzer.aggregate()
+
+        if "gpu_peak_mem" in metrics_needed:
+            device_id, gpu_peak_mem = mem_model_analyzer.calculate_gpu_peak_mem()
+        if "cpu_peak_mem" in metrics_needed:
+            cpu_peak_mem = mem_model_analyzer.calculate_cpu_peak_mem()
+        if export_metrics_file:
+            mem_model_analyzer.update_export_name("_peak_memory")
+            mem_model_analyzer.export_all_records_to_csv()
+    else:
+        if device == "cuda":
+            torch.cuda.reset_peak_memory_stats()
+            torch.cuda.empty_cache()
+        for _ in range(num_iter):
+            work_func()
+        if device == "cuda":
+            device_id = torch.cuda.current_device()
+            gpu_peak_mem = torch.cuda.max_memory_allocated() / 10**9
+        total = psutil.virtual_memory().total
+        percentage = psutil.Process(os.getpid()).memory_percent()
+        cpu_peak_mem = percentage * total / 10**9
     return cpu_peak_mem, device_id, gpu_peak_mem
diff --git a/torchbenchmark/util/triton_op.py b/torchbenchmark/util/triton_op.py
index 45287556b..ded41ee8d 100644
--- a/torchbenchmark/util/triton_op.py
+++ b/torchbenchmark/util/triton_op.py
@@ -892,7 +892,7 @@ def _init_extra_metrics() -> Dict[str, Any]:
                    or "gpu_peak_mem" in self.required_metrics
                ):
                    metrics.cpu_peak_mem, _device_id, metrics.gpu_peak_mem = (
-                        self.get_peak_mem(fn)
+                        self.get_peak_mem(fn, self.tb_args.metrics_gpu_backend)
                    )
                if not baseline and "accuracy" in self.required_metrics:
                    metrics.accuracy = (
@@ -1014,13 +1014,13 @@ def _init_extra_metrics() -> Dict[str, Any]:
         return metrics
 
     def get_peak_mem(
-        self, fn: Callable
+        self, fn: Callable, metrics_memory_usage_backend: str
     ) -> Tuple[Optional[float], Optional[str], Optional[float]]:
         return get_peak_memory(
             func=fn,
             device=self.device,
             metrics_needed=["gpu_peak_mem", "cpu_peak_mem"],
-            metrics_gpu_backend="nvml",
+            metrics_gpu_backend=metrics_memory_usage_backend,
         )
 
     def nsys_rep(self, input_id: int, fn_name: str) -> str:
diff --git a/userbenchmark/triton/run.py b/userbenchmark/triton/run.py
index d9ac57e6a..32dd41dd3 100644
--- a/userbenchmark/triton/run.py
+++ b/userbenchmark/triton/run.py
@@ -93,6 +93,18 @@ def get_parser(args=None):
         default=None,
         help="Metrics to collect, split with comma. E.g., --metrics latency,tflops,speedup.",
     )
+    parser.add_argument(
+        "--metrics-gpu-backend",
+        choices=["torch", "nvml"],
+        default="torch",
+        help=(
+            "Specify the backend [torch, nvml] to collect metrics. In all modes, the latency "
+            "(execution time) is always collected using `time.time_ns()`. The CPU peak memory "
+            "usage is collected by `psutil.Process()`. In nvml mode, the GPU peak memory usage "
+            "is collected by the `nvml` library. In torch mode, the GPU peak memory usage is "
+            "collected by `torch.cuda.max_memory_allocated()`."
+        ),
+    )
     parser.add_argument(
         "--only",
         default=None,
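
For reference, below is a minimal standalone sketch of the measurement pattern the torch backend above relies on: reset the CUDA allocator's high-water mark, run the workload, then read `torch.cuda.max_memory_allocated()`. The CPU-side figure here is taken from the process RSS via `psutil` (the patch itself derives it from `psutil.Process().memory_percent()`). Both `measure_peak_memory` and `run_one_step` are hypothetical names used only for this illustration; this is not code from the patch.

import os

import psutil
import torch


def measure_peak_memory(run_one_step, device: str = "cuda", num_iter: int = 2):
    """Return (cpu_peak_mem_gb, device_id, gpu_peak_mem_gb) using the torch-backend pattern."""
    device_id = None
    gpu_peak_mem = None
    if device == "cuda":
        # Reset the allocator's peak counter and drop cached blocks before measuring.
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.empty_cache()
    for _ in range(num_iter):
        run_one_step()
    if device == "cuda":
        # Wait for outstanding kernels, mirroring maybe_synchronize() above.
        torch.cuda.synchronize()
        device_id = torch.cuda.current_device()
        # Peak bytes allocated by this process on the current device, in GB.
        gpu_peak_mem = torch.cuda.max_memory_allocated() / 10**9
    # Resident set size of the current process, in GB.
    cpu_peak_mem = psutil.Process(os.getpid()).memory_info().rss / 10**9
    return cpu_peak_mem, device_id, gpu_peak_mem

With the patch applied, passing --metrics cpu_peak_mem,gpu_peak_mem --metrics-gpu-backend torch to run.py should exercise the equivalent branch inside get_peak_memory().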