Skip to content

Commit

Permalink
fix HF inference & update mp_size arg (microsoft#262)
Browse files Browse the repository at this point in the history
Co-authored-by: Michael Wyatt <mrwyattii@gmail.com>
  • Loading branch information
satpalsr and mrwyattii authored Apr 11, 2023
1 parent 7dd11d3 commit 85a69e8
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 4 deletions.
5 changes: 3 additions & 2 deletions benchmarks/inference/bert-bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def print_latency(latency_set, title, warmup=3):
if args.deepspeed:
pipe.model = deepspeed.init_inference(pipe.model,
dtype=dtype,
mp_size=1,
tensor_parallel={"tp_size": 1},
replace_with_kernel_inject=args.kernel_inject,
enable_cuda_graph=args.graphs)
pipe.model.profile_model_time()
Expand All @@ -90,6 +90,7 @@ def print_latency(latency_set, title, warmup=3):
mtimes += pipe.model.model_times()

print_latency(times, "e2e latency")
print_latency(mtimes, "model latency")
if args.deepspeed:
print_latency(mtimes, "model latency")

print(responses[0:3])
5 changes: 3 additions & 2 deletions benchmarks/inference/gpt-bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def print_latency(latency_set, title, warmup=3):
pipe.model = deepspeed.init_inference(
pipe.model,
dtype=dtype,
mp_size=args.world_size,
tensor_parallel={"tp_size": args.world_size},
replace_with_kernel_inject=args.kernel_inject,
enable_cuda_graph=args.graphs,
)
Expand All @@ -101,7 +101,8 @@ def print_latency(latency_set, title, warmup=3):

if args.local_rank == 0:
print_latency(times, "(e2e) latency")
print_latency(mtimes, "(model-only) latency")
if args.deepspeed:
print_latency(mtimes, "(model-only) latency")
print_latency(map(lambda t: t / (args.max_tokens - 3), times), "(e2e) per token latency")
print(f"RESPONSE 0:")
print("-" * 30)
Expand Down

0 comments on commit 85a69e8

Please sign in to comment.