Skip to content

Commit 85a69e8

Browse files
satpalsr and mrwyattii authored
fix HF infer & update mpsize arg (deepspeedai#262)
Co-authored-by: Michael Wyatt <mrwyattii@gmail.com>
1 parent 7dd11d3 commit 85a69e8

File tree

2 files changed

+6
-4
lines changed

2 files changed

+6
-4
lines changed

benchmarks/inference/bert-bench.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def print_latency(latency_set, title, warmup=3):
7070
if args.deepspeed:
7171
pipe.model = deepspeed.init_inference(pipe.model,
7272
dtype=dtype,
73-
mp_size=1,
73+
tensor_parallel={"tp_size": 1},
7474
replace_with_kernel_inject=args.kernel_inject,
7575
enable_cuda_graph=args.graphs)
7676
pipe.model.profile_model_time()
@@ -90,6 +90,7 @@ def print_latency(latency_set, title, warmup=3):
9090
mtimes += pipe.model.model_times()
9191

9292
print_latency(times, "e2e latency")
93-
print_latency(mtimes, "model latency")
93+
if args.deepspeed:
94+
print_latency(mtimes, "model latency")
9495

9596
print(responses[0:3])

benchmarks/inference/gpt-bench.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def print_latency(latency_set, title, warmup=3):
7979
pipe.model = deepspeed.init_inference(
8080
pipe.model,
8181
dtype=dtype,
82-
mp_size=args.world_size,
82+
tensor_parallel={"tp_size": args.world_size},
8383
replace_with_kernel_inject=args.kernel_inject,
8484
enable_cuda_graph=args.graphs,
8585
)
@@ -101,7 +101,8 @@ def print_latency(latency_set, title, warmup=3):
101101

102102
if args.local_rank == 0:
103103
print_latency(times, "(e2e) latency")
104-
print_latency(mtimes, "(model-only) latency")
104+
if args.deepspeed:
105+
print_latency(mtimes, "(model-only) latency")
105106
print_latency(map(lambda t: t / (args.max_tokens - 3), times), "(e2e) per token latency")
106107
print(f"RESPONSE 0:")
107108
print("-" * 30)

0 commit comments

Comments (0)