adding online benchmarking scripts #55

Merged: 42 commits, merged Dec 31, 2024

Changes from 1 commit

Commits (42, all authored by tstescoTT):
7d68656  add print_prompts cli arg (Dec 4, 2024)
8d78d64  remove redundant stop token from vLLM example api calls (Dec 4, 2024)
3108bc0  add capture_trace.py util to pre-prompt vllm server to capture all tr… (Dec 4, 2024)
ea3d75d  adding utils/startup_utils.py to refine handling of startup in automa… (Dec 4, 2024)
cc1d17a  adding force_max_tokens as option to call_inference_api(), add input_… (Dec 4, 2024)
059d513  faster mock model prefill (Dec 4, 2024)
48d17de  make it not send stop tokens by default and speed up mock model decod… (Dec 5, 2024)
fead1aa  adding token count verification for vllm open ai api server to prompt… (Dec 5, 2024)
5a80551  add max-log-len to limit logging of prompts to avoid clutter in logs (Dec 5, 2024)
d845f08  add InferenceServerContext to startup_utils.py, improve wait_for_healthy (Dec 5, 2024)
632ac83  add all_responses to utils/prompt_client_cli.py not using globals (Dec 5, 2024)
f563e32  adding new utils/prompt_client_cli.py using utils/prompt_client.py an… (Dec 5, 2024)
2467c74  fix health endpoint (Dec 5, 2024)
af5e8dc  add vllm_model to EnvironmentConfig instead of BatchConfig (Dec 5, 2024)
60c7ab2  refactor utils/capture_traces.py with new prompt_client (Dec 5, 2024)
10993a2  fix utils imports (Dec 5, 2024)
20ccdf4  fix BatchConfig usage (Dec 6, 2024)
eab7e76  add benchmarking/online_benchmark_prompt_client.py using prompt_clien… (Dec 6, 2024)
90acdf6  add benchmarking/online_benchmark_prompt_client.py using prompt_clien… (Dec 6, 2024)
ec486ad  add benchmarking, evals, and tests dirs to Dockerfile (Dec 6, 2024)
c58d7b3  update patchfile and benchmarking README.md with commands (Dec 6, 2024)
fe4f96d  update Docker IMAGE_VERSION to v0.0.3 (Dec 6, 2024)
f3d815a  improve doc (Dec 6, 2024)
8246a72  update benchmark_serving.patch (Dec 6, 2024)
765c4be  add tt_model_runner.py patch for best_of (Dec 6, 2024)
b93370d  update benchmarking/benchmark_serving.patch (Dec 6, 2024)
5e07baa  use CACHE_ROOT for vllm_online_benchmark_results dir (Dec 6, 2024)
d0e0b0f  adding timestamped online benchmark run result directory, rps=1 for v… (Dec 9, 2024)
5db2523  update benchmark output file naming convention (Dec 9, 2024)
5ab742c  rename benchmarking/online_benchmark_prompt_client.py to benchmarking… (Dec 9, 2024)
06420bd  increase num_prompts default, default to 128/128 online test (Dec 9, 2024)
b7e4cfc  use min_tokens and ignore_eos=True to force output seq len (Dec 9, 2024)
dda29a9  adding min_tokens to locust requests (Dec 9, 2024)
f8b3033  add --ignore-eos to vllm_online_benchmark.py to force the output seq … (Dec 10, 2024)
12c38fc  add context_lens (isl, osl) pairs to capture_traces() to capture corr… (Dec 10, 2024)
1cabdc9  add trace pre-capture to prompt_client_cli.py with option to disable (Dec 10, 2024)
68f08d0  better comment and logs for trace capture (Dec 10, 2024)
962c507  use TPOT and TPS in benchmarking/prompt_client_online_benchmark.py, a… (Dec 12, 2024)
62bf427  update utils/prompt_client_cli.py and docs (Dec 12, 2024)
d9e163c  remove WIP utils/startup_utils.py from this branch (Dec 12, 2024)
cd29085  adding doc string to BatchProcessor (Dec 31, 2024)
376403d  add output_path arg to batch_processor.py::BatchProcessor to optional… (Dec 31, 2024)
Commit 48d17deb89107977792d19fb3f023151f6bd3efe
make it not send stop tokens by default and speed up mock model decode and prefill
tstescoTT committed Dec 20, 2024

tests/mock_vllm_model.py (63 changes: 36 additions & 27 deletions)
@@ -14,12 +14,12 @@
import torch
from huggingface_hub import hf_hub_download

from vllm.engine.metrics import logger

# mock out ttnn fully so we can import ttnn without using it
sys.modules["ttnn"] = MagicMock()
sys.modules["ttnn.device"] = MagicMock()

from vllm.engine.metrics import logger

from models.demos.t3000.llama2_70b.tt.llama_common import (
setup_llama_env,
)
@@ -31,6 +31,8 @@
get_model_config,
)

torch.manual_seed(9387)


def setup_mock_model_weights(cache_root: str, weights_dir: str, hf_token: str):
if not hf_token:
@@ -269,10 +271,11 @@ def prefill_forward(
"""

batch, batch_seq_len = tokens.shape
# faster prefill that does not mimic the actual prefill process
fast_prefill = True
if fast_prefill:
output_logits = torch.randn((batch, 1, self.params.vocab_size))
# faster prefill that does not mimic the actual prefill process
logger.info("Filling kv cache via fast_prefill in mock model")
output_logits = self.decode_forward(tokens=tokens, start_pos=start_pos)
else:
output_logits = torch.zeros(batch, 1, self.params.vocab_size)
prompt_lens = (
@@ -304,29 +307,27 @@ def prefill_forward(
output_logits[user_id] = logits[
:, last_token_idx % 32 : last_token_idx % 32 + 1, :
]

return output_logits

def decode_mock_send_token(self, logits, start_pos, batch, send_eot=False):
def decode_send_stop_token(self, logits, start_pos, batch):
# tooling for sending EOT token or other specific token at specific output position
EOT_ID = 128009
send_index = 200
send_token = EOT_ID
if send_eot:
if start_pos is not None:
if isinstance(start_pos, int):
# if start pos is same across batch, ie. now in prefill
cache_idxs = torch.tensor(
[start_pos for _ in range(batch)], dtype=torch.int64
)
else: # if start_pos is a tensor ie. is different across batch, now in decode mode
# if start position is greater than index to send EOT
cache_idxs = start_pos.to(dtype=torch.int64)
send_token_mask = cache_idxs > send_index
# find positions where start pos passes send_index (ie. done decoding) + make 1D
batch_indices = torch.nonzero(send_token_mask).squeeze()
# assign a high logit at the send_token index so the model will select it and generate the EOT, stopping generation
logits[batch_indices, 0, send_token] = 100.0
if start_pos is not None:
if isinstance(start_pos, int):
# if start pos is same across batch, ie. now in prefill
cache_idxs = torch.tensor(
[start_pos for _ in range(batch)], dtype=torch.int64
)
else: # if start_pos is a tensor ie. is different across batch, now in decode mode
# if start position is greater than index to send EOT
cache_idxs = start_pos.to(dtype=torch.int64)
send_token_mask = cache_idxs > send_index
# find positions where start pos passes send_index (ie. done decoding) + make 1D
batch_indices = torch.nonzero(send_token_mask).squeeze()
# assign a high logit at the send_token index so the model will select it and generate the EOT, stopping generation
logits[batch_indices, 0, send_token] = 100.0
return logits

def decode_forward(
@@ -342,15 +343,23 @@
assert len(tokens.shape) == 2
batch, seqlen = tokens.shape
forward_start = time.time()
simulated_tps = 10000.0
simulated_tps = 100000.0
simulated_duration = 1.0 / simulated_tps
# update the new tokens generated to the input id
# vocab_size = tokenizer.nwords
low_value = -100.0
high_value = 100.0
vocab_size = 128256
unreserved_vocab_size = 128000
# logits: [batch, seqlen, vocab_size]
logits = torch.randn((batch, seqlen, 128256))
logits = self.decode_mock_send_token(logits, start_pos, batch, send_eot=True)
actual_duration = time.time() - forward_start
logits = torch.full((batch, seqlen, vocab_size), low_value)
# set randomly selected tokens to high value
gen_token_ids = torch.randint(0, unreserved_vocab_size, (batch,))
logits[:, :, gen_token_ids] = high_value
send_eot = False
if send_eot:
# optionally send EOT token with some logic
logits = self.decode_send_stop_token(logits, start_pos, batch)
# simulate forward latency
actual_duration = time.time() - forward_start
time.sleep(max(simulated_duration - actual_duration, 0))
return logits

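For reference, the mock decode path introduced by this commit can be summarized as a standalone function: logits start at a low value everywhere, randomly drawn unreserved token ids are boosted so sampling selects them, an EOT logit can optionally be forced once a row's position passes a threshold, and a sleep pads the step out to a simulated decode rate. This is a condensed sketch of the diff above, not the exact tests/mock_vllm_model.py code; the function name, signature, and defaults are illustrative.

```python
import time
import torch

VOCAB_SIZE = 128256             # vocab size used by the mock
UNRESERVED_VOCAB_SIZE = 128000  # avoid special/reserved token ids when picking outputs
EOT_ID = 128009                 # end-of-turn token id


def mock_decode_forward(tokens, start_pos, send_eot=False, send_index=200,
                        simulated_tps=100000.0):
    """Return mock logits of shape [batch, seqlen, vocab] for one decode step."""
    batch, seqlen = tokens.shape
    forward_start = time.time()

    # Low logits everywhere, high logits at randomly drawn unreserved token ids
    # (shared across the batch, as in the diff) so sampling picks those tokens.
    logits = torch.full((batch, seqlen, VOCAB_SIZE), -100.0)
    gen_token_ids = torch.randint(0, UNRESERVED_VOCAB_SIZE, (batch,))
    logits[:, :, gen_token_ids] = 100.0

    if send_eot and start_pos is not None:
        # Force the EOT logit for rows whose position has passed send_index,
        # so generation stops for those rows.
        if isinstance(start_pos, int):
            cache_idxs = torch.full((batch,), start_pos, dtype=torch.int64)
        else:
            cache_idxs = start_pos.to(dtype=torch.int64)
        done_rows = torch.nonzero(cache_idxs > send_index).squeeze(-1)
        logits[done_rows, 0, EOT_ID] = 100.0

    # Pad the step out to the simulated decode throughput.
    simulated_duration = 1.0 / simulated_tps
    time.sleep(max(simulated_duration - (time.time() - forward_start), 0))
    return logits


# Example: a batch of 32 users, each at decode position 5.
logits = mock_decode_forward(
    tokens=torch.zeros((32, 1), dtype=torch.int64),
    start_pos=torch.full((32,), 5, dtype=torch.int64),
)
next_tokens = logits[:, -1, :].argmax(dim=-1)  # greedy pick of a boosted id
```

With send_eot left False (the new default in this commit), the mock never emits a stop token on its own, so output length is controlled entirely by the caller, which is what the min_tokens / ignore_eos benchmarking flow in this PR expects.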