From d77ccc77be801041658b675525b2f47100c7382f Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 6 Apr 2024 14:32:30 -0700 Subject: [PATCH] [CI/Benchmark] add more iterations and use median for robust latency benchmark (#3889) --- benchmarks/benchmark_latency.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index d4ebd200ff502..e2d358ea6631e 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -68,7 +68,8 @@ def run_to_completion(profile_dir: Optional[str] = None): return latency print("Warming up...") - run_to_completion(profile_dir=None) + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + run_to_completion(profile_dir=None) if args.profile: profile_dir = args.profile_result_dir @@ -84,7 +85,12 @@ def run_to_completion(profile_dir: Optional[str] = None): latencies = [] for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): latencies.append(run_to_completion(profile_dir=None)) + latencies = np.array(latencies) + percentages = [10, 25, 50, 75, 90] + percentiles = np.percentile(latencies, percentages) print(f'Avg latency: {np.mean(latencies)} seconds') + for percentage, percentile in zip(percentages, percentiles): + print(f'{percentage}% percentile latency: {percentile} seconds') if __name__ == '__main__': @@ -106,9 +112,13 @@ def run_to_completion(profile_dir: Optional[str] = None): default=1, help='Number of generated sequences per prompt.') parser.add_argument('--use-beam-search', action='store_true') + parser.add_argument('--num-iters-warmup', + type=int, + default=10, + help='Number of iterations to run for warmup.') parser.add_argument('--num-iters', type=int, - default=3, + default=30, help='Number of iterations to run.') parser.add_argument('--trust-remote-code', action='store_true',