[Misc] Fix arg names #5524

Merged
merged 1 commit on Jun 14, 2024
benchmarks/kernels/benchmark_paged_attention.py (1 addition, 1 deletion)
@@ -165,7 +165,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
 choices=["v1", "v2"],
 default="v2")
 parser.add_argument("--batch-size", type=int, default=8)
-parser.add_argument("--seq_len", type=int, default=4096)
+parser.add_argument("--seq-len", type=int, default=4096)
 parser.add_argument("--num-query-heads", type=int, default=64)
 parser.add_argument("--num-kv-heads", type=int, default=8)
 parser.add_argument("--head-size",
examples/aqlm_example.py (1 addition, 1 deletion)
@@ -17,7 +17,7 @@ def main():
 type=int,
 default=0,
 help='known good models by index, [0-4]')
-parser.add_argument('--tensor_parallel_size',
+parser.add_argument('--tensor-parallel-size',
 '-t',
 type=int,
 default=1,
examples/fp8/extract_scales.py (4 additions, 4 deletions)
@@ -327,7 +327,7 @@ def main(args):
"--quantization-param-path <filename>). This is only used "
"if the KV cache dtype is FP8 and on ROCm (AMD GPU).")
parser.add_argument(
"--quantized_model",
"--quantized-model",
help="Specify the directory containing a single quantized HF model. "
"It is expected that the quantization format is FP8_E4M3, for use "
"on ROCm (AMD GPU).",
@@ -339,18 +339,18 @@
choices=["auto", "safetensors", "npz", "pt"],
default="auto")
parser.add_argument(
"--output_dir",
"--output-dir",
help="Optionally specify the output directory. By default the "
"KV cache scaling factors will be saved in the model directory, "
"however you can override this behavior here.",
default=None)
parser.add_argument(
"--output_name",
"--output-name",
help="Optionally specify the output filename.",
# TODO: Change this once additional scaling factors are enabled
default="kv_cache_scales.json")
parser.add_argument(
"--tp_size",
"--tp-size",
help="Optionally specify the tensor-parallel (TP) size that the "
"quantized model should correspond to. If specified, during KV "
"cache scaling factor extraction the observed TP size will be "
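These renames only standardize the command-line flag spelling on hyphens; argparse derives the attribute name (`dest`) of a long option by replacing hyphens with underscores, so `args.seq_len`, `args.tensor_parallel_size`, `args.output_dir`, and the rest resolve exactly as before. A minimal sketch of that behavior (illustrative, not part of this diff):

```python
import argparse

parser = argparse.ArgumentParser()
# The hyphenated flag is what users type on the command line.
parser.add_argument("--seq-len", type=int, default=4096)

# argparse turns "--seq-len" into the attribute name "seq_len".
args = parser.parse_args(["--seq-len", "2048"])
print(args.seq_len)  # 2048
```

Because of this mapping, changing `--seq_len` to `--seq-len` is purely a user-facing consistency fix and requires no change at the call sites that read the parsed values.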