saving measurements during compile and run time #108

Closed · wants to merge 3 commits

50 changes: 47 additions & 3 deletions QEfficient/cloud/infer.py
@@ -8,12 +8,19 @@
import argparse
import logging
import os
import time
from typing import List, Optional

import QEfficient
from QEfficient.cloud.export import get_onnx_model_path
from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists
from QEfficient.utils import (
check_and_assign_cache_dir,
get_qpc_dir_path,
load_hf_tokenizer,
qpc_exists,
tabulate_measurements,
)
from QEfficient.utils.logging_utils import logger


@@ -35,6 +42,7 @@ def main(
local_model_dir: Optional[str] = None,
cache_dir: Optional[str] = None,
hf_token: Optional[str] = None,
benchmark: bool = False,
Contributor:
Can you update the docstring with this flag's usage?

Contributor (Author):
Couldn't find it. Can you point me to the right docstring path?

Contributor:
You can add it on line number 71.
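
For reference, a docstring entry for the new flag could follow the :name (type): style used elsewhere in this PR; the exact wording and placement inside main()'s docstring are assumptions, not the merged text:

    :benchmark (bool): If True, save compile-time and run-time measurements to a CSV
        table under the model card directory. Defaults to False.
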

) -> None:
"""
1. Check if compiled qpc for given config already exists, if it does jump to execute, else
@@ -78,8 +86,11 @@ def main(
)

# Handle qpc generation

if qpc_exists(qpc_dir_path):
logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt")
compile_time = "pre-compiled"

else:
# Handle onnx model generation
onnx_model_path = get_onnx_model_path(
@@ -89,6 +100,9 @@
#########
# Compile
#########

compile_start_time = time.perf_counter()

_ = QEfficient.compile(
onnx_path=onnx_model_path,
qpc_path=os.path.dirname(
@@ -106,10 +120,13 @@
full_batch_size=full_batch_size,
)

compile_time = (time.perf_counter() - compile_start_time) // 1
Contributor:
This can also be moved under the if condition of the benchmark flag. Also, why keep it as an integer?
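
One way to apply this suggestion, sketched as a small helper rather than the merged change (compile_fn stands in for the QEfficient.compile(...) call above):

import time
from typing import Callable, Optional

def timed_compile(compile_fn: Callable[[], object], benchmark: bool = False) -> Optional[float]:
    # Measure only when benchmarking is requested, and keep fractional seconds
    # rather than truncating with `// 1`.
    if not benchmark:
        compile_fn()
        return None
    start = time.perf_counter()
    compile_fn()
    return time.perf_counter() - start
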


#########
# Execute
#########
cloud_ai_100_exec_kv(

execinfo = cloud_ai_100_exec_kv(
tokenizer=tokenizer,
qpc_path=qpc_dir_path,
device_id=device_group,
@@ -119,6 +136,27 @@
full_batch_size=full_batch_size,
)

#########
# Log
#########

if benchmark:
_ = tabulate_measurements(
model_name=model_name,
tokenizer=tokenizer,
prompt=prompt,
batch_size=batch_size,
full_batch_size=full_batch_size,
prompt_len=prompt_len,
ctx_len=ctx_len,
num_cores=num_cores,
device_group=device_group,
mxfp6=mxfp6,
mxint8=mxint8,
compile_time=compile_time,
execinfo=execinfo,
)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
@@ -198,9 +236,15 @@ def main(
default=None,
help="Set full batch size to enable continuous batching mode, default is None",
)

parser.add_argument(
"--benchmark",
"-b",
action="store_true",
help="store measurements into a csv table at model_card_dir",
)
args = parser.parse_args()
if args.verbose:
logger.setLevel(logging.INFO)
del args.verbose # type: ignore

main(**args.__dict__)
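
With the new --benchmark flag wired in above, a programmatic run that also records measurements might look like the following sketch; the model name, prompt, and core count are illustrative, and the other keyword names are assumed from how main() uses them in this diff:

from QEfficient.cloud.infer import main

# Equivalent to passing --benchmark on the command line (illustrative values).
main(
    model_name="gpt2",
    num_cores=14,
    prompt="My name is",
    benchmark=True,  # appends one row to <model_card_dir>/<model_name>_benchmarking.csv
)
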
18 changes: 11 additions & 7 deletions QEfficient/generation/text_generation_inference.py
@@ -34,6 +34,7 @@ class CloudAI100ExecInfo:
:decode_perf (float): Decoding performance.
:total_perf (float): Total performance.
:total_time (float): Total time.
:total_decode_tokens (int): Total decode tokens.
"""

batch_size: int
@@ -43,6 +44,7 @@ class CloudAI100ExecInfo:
decode_perf: float
total_perf: float
total_time: float
total_decode_tokens: float

def __repr__(self):
return f"Average Prefill time a.k.a TTFT is= {round(self.prefill_time, 2)}\
@@ -276,7 +278,7 @@ def cloud_ai_100_exec_kv(
total_time = np.average([info.total_time for info in exec_info])
generated_texts = [info.generated_texts for info in exec_info]
generated_ids = [info.generated_ids for info in exec_info]

total_decode_tokens = np.average([info.total_decode_tokens for info in exec_info])
exec_info = CloudAI100ExecInfo(
batch_size=batch_size,
generated_texts=generated_texts,
@@ -285,6 +287,7 @@
decode_perf=decode_perf,
total_perf=total_perf,
total_time=total_time,
total_decode_tokens=total_decode_tokens,
)
else:
exec_info = generate_text.cloud_ai_100_exec_kv_helper(prompt=prompt, generation_len=generation_len)
@@ -687,7 +690,7 @@ def regular_model_execution(self, prompt, generation_len):
prefill_time, decode_perf, total_perf, total_time = self.calculate_latency(
total_decode_tokens, loop_start, start, end
)
return prefill_time, decode_perf, total_perf, total_time, generated_texts
return prefill_time, decode_perf, total_perf, total_time, generated_texts, total_decode_tokens

def continuous_batching_execution(self, prompt, prompt_queue, generation_len):
"""
@@ -718,7 +721,7 @@ def continuous_batching_execution(self, prompt, prompt_queue, generation_len):
total_decode_tokens, loop_start, start, end, decode_pause_time
)
prefill_time /= len(prompt) # Average prefill time for continuous batching
return prefill_time, decode_perf, total_perf, total_time, generated_texts
return prefill_time, decode_perf, total_perf, total_time, generated_texts, total_decode_tokens

def cloud_ai_100_exec_kv_helper(self, prompt: List[str], generation_len: Optional[int] = None):
"""
@@ -753,14 +756,14 @@ def cloud_ai_100_exec_kv_helper(self, prompt: List[str], generation_len: Optiona

if self.full_batch_size is not None:
logger.warning("Streamer is currently unavailable for continuous batch execution.")
prefill_time, decode_perf, total_perf, total_time, generated_texts = self.continuous_batching_execution(
prompt, prompt_queue, generation_len
prefill_time, decode_perf, total_perf, total_time, generated_texts, total_decode_tokens = (
self.continuous_batching_execution(prompt, prompt_queue, generation_len)
)
else:
if self.stream:
self.streamer.on_finalized_text("\nPrompt : " + prompt[0] + "\nCompletion :")
prefill_time, decode_perf, total_perf, total_time, generated_texts = self.regular_model_execution(
prompt, generation_len
prefill_time, decode_perf, total_perf, total_time, generated_texts, total_decode_tokens = (
self.regular_model_execution(prompt, generation_len)
)

if self.stream:
@@ -779,5 +782,6 @@ def cloud_ai_100_exec_kv_helper(self, prompt: List[str], generation_len: Optiona
decode_perf=decode_perf,
total_perf=total_perf,
total_time=total_time,
total_decode_tokens=total_decode_tokens,
)
return latency_stats
1 change: 1 addition & 0 deletions QEfficient/utils/__init__.py
@@ -17,4 +17,5 @@
onnx_exists,
padding_check_and_fix,
qpc_exists,
tabulate_measurements,
)
57 changes: 57 additions & 0 deletions QEfficient/utils/_utils.py
@@ -5,6 +5,7 @@
#
# -----------------------------------------------------------------------------

import csv
import os
from typing import List, Optional, Tuple, Union

@@ -277,3 +278,59 @@ def get_num_layers_from_config(config):
raise ValueError("Invalid model configuration: n_layer/n_layers or num_hidden_layers not found.")

return n_layer


def tabulate_measurements(
model_name,
tokenizer,
prompt,
batch_size,
full_batch_size,
prompt_len,
ctx_len,
num_cores,
device_group,
mxfp6,
mxint8,
compile_time,
execinfo,
):
input_len = max([len(x) for x in tokenizer(prompt, return_tensors="np").input_ids])

fields = {
Contributor:
Two more fields, mos and aic_enable_depth_first, could be added here.
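# Per the suggestion above, two more compile options could be recorded as well; a sketch
# only (it assumes `mos` and `aic_enable_depth_first` would be passed into this helper
# like the other compile arguments, and the column names are illustrative):
#     "MOS": mos,
#     "DFS": aic_enable_depth_first,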

"MODEL\nNAME": model_name,
"BATCH\nSIZE": batch_size,
"FULL\nBATCH_SIZE": full_batch_size,
"CPL": prompt_len,
"PL": input_len,
"GL": int(execinfo.total_decode_tokens + 1),
"CL": ctx_len,
"CORES": num_cores,
"NUM\nSOCS": len(device_group) if device_group else 1,
"DEVICE\nID": device_group,
"MXFP6\nW": mxfp6,
"MXINT8\n$KV": mxint8,
"COMPILE\nTIME (S)": compile_time,
"PREFILL\nTIME (S)": round(execinfo.prefill_time, 2),
"DECODE\nTOK/S": round(execinfo.decode_perf, 2),
"TOTAL\nTOK/S": round(execinfo.total_perf, 2),
"TOTAL\nTIME (S)": round(execinfo.total_time, 2),
}

model_card_dir = os.path.join(QEFF_MODELS_DIR, str(model_name))
model_name = model_name.replace("/", "-")
file_name = f"{model_card_dir}/{model_name}_benchmarking.csv"

try:
os.makedirs(model_card_dir, exist_ok=True)
if not os.path.exists(file_name):
with open(file_name, "w") as csvfile:
csvwriter = csv.writer(csvfile)
csvwriter.writerow(list(fields.keys()))
with open(file_name, "a", newline="") as csvfile:
csvwriter = csv.writer(csvfile)
csvwriter.writerow(list(fields.values()))
except OSError as e:
print(f"An error occurred while handling file {file_name}: {e}")

return file_name
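
Because the column headers above embed literal newlines, the rows can still be read back with the standard csv module. A minimal sketch; the path is illustrative and assumes QEFF_MODELS_DIR resolves to ./qeff_models:

import csv

# Illustrative path: <QEFF_MODELS_DIR>/<model_name>/<model_name>_benchmarking.csv
with open("qeff_models/gpt2/gpt2_benchmarking.csv", newline="") as f:
    for row in csv.DictReader(f):
        # Keys keep the embedded newlines, e.g. "DECODE\nTOK/S".
        print(row["MODEL\nNAME"], row["DECODE\nTOK/S"], row["TOTAL\nTIME (S)"])
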