
Commit 699a0dc

Author: Tristsky
Message: change for eval
Parent: 37f8c7b

4 files changed, 78 insertions(+), 2 deletions(-)

common/arg.cpp

Lines changed: 7 additions & 0 deletions

@@ -664,6 +664,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.n_predict = value;
         }
     ).set_env("LLAMA_ARG_N_PREDICT"));
+    add_opt(llama_arg(
+        {"-promp_len", "--prompt-length"}, "N",
+        format("number of tokens to process (default: %d)", params.prompt_length),
+        [](gpt_params & params, int value) {
+            params.prompt_length = value;
+        }
+    ).set_env("LLAMA_ARG_PROMPT_LENGTH"));
     add_opt(llama_arg(
         {"-b", "--batch-size"}, "N",
         format("logical maximum batch size (default: %d)", params.n_batch),

common/common.h

Lines changed: 1 addition & 0 deletions

@@ -143,6 +143,7 @@ struct gpt_sampler_params {

 struct gpt_params {
     int32_t n_predict     =   -1; // new tokens to predict
+    int32_t prompt_length =   -1; // number of prompt tokens to process (-1 = use the full prompt)
     int32_t n_ctx         =    0; // context size
     int32_t n_batch       = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch      =  512; // physical batch size for prompt processing (must be >=32 to use BLAS)
examples/main/main.cpp

Lines changed: 6 additions & 2 deletions

@@ -301,7 +301,7 @@ int main(int argc, char ** argv) {
            LOG_DBG("use session tokens\n");
            embd_inp = session_tokens;
        }
-
+        if (params.prompt_length > 0) { embd_inp.resize(params.prompt_length); } // truncate the tokenized prompt only when a length was requested
        LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
        LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
    }

@@ -927,7 +927,11 @@ int main(int argc, char ** argv) {
    LOG("\n\n");
    gpt_perf_print(ctx, smpl);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
-
+    auto timings = llama_perf_context(ctx);
+    std::ofstream file; // append one CSV row per run: prompt_length, n_predict, decode speed (tokens/s)
+    file.open("latency.txt", std::ios::app);
+    file << params.prompt_length << "," << params.n_predict << "," << 1e3 / timings.t_eval_ms * timings.n_eval << std::endl;
+    file.close();
    gpt_sampler_free(smpl);

    llama_free(ctx);
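
The value written per run is 1e3 / t_eval_ms * n_eval, i.e. the number of decoded tokens divided by the evaluation time in seconds, so each line of latency.txt reads prompt_length,n_predict,decode tokens/s. A small sketch of how such a file could be summarized after a sweep; the aggregation itself is illustrative and not part of this commit:

import csv
from collections import defaultdict

# Group decode speed (tokens/s) by (prompt_length, n_predict) and report the mean per pair.
speeds = defaultdict(list)
with open("latency.txt") as f:
    for row in csv.reader(f):
        try:
            prompt_len, n_predict, tok_per_s = int(row[0]), int(row[1]), float(row[2])
        except (ValueError, IndexError):
            continue  # skip header lines and anything malformed
        speeds[(prompt_len, n_predict)].append(tok_per_s)

for (prompt_len, n_predict), vals in sorted(speeds.items()):
    avg = sum(vals) / len(vals)
    print(f"prompt_length={prompt_len} n_predict={n_predict}: {avg:.2f} tok/s over {len(vals)} run(s)")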

latency.py

Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
"""Microbenchmarking for CPU offloading"""

import json
import random
import subprocess

if __name__ == "__main__":
    # first-round ShareGPT prompts are used as benchmark inputs
    path_json = "../ShareGPT_V3_unfiltered_cleaned_split.json"
    with open(path_json, "r") as f:
        data = json.load(f)

    texts = []
    for d in data:
        if len(d["conversations"]) == 0:
            continue
        # the input of the first round
        texts.append(" ".join(d["conversations"][0]["value"].split()))

    random.seed(0)
    random.shuffle(texts)
    n_sample = 3  # repetitions per configuration (not used by the single-run loop below)
    # for input_token in [16, 32, 64, 128]:
    #     for output_token in [16, 32, 64, 128, 256, 512]:

    # CSV header matching the three fields appended by llama-cli (see examples/main/main.cpp)
    with open("./latency.txt", "a") as f:
        f.write("input_token,output_token,token/s\n")

    idx_text = 0
    # for input_token in [32, 64, 128, 256, 512]:
    #     for output_token in [64, 128, 256, 512]:
    for input_token in [32]:
        for output_token in [64]:
            # pick the next prompt that is long enough for the requested input length
            while True:
                text = texts[idx_text]
                idx_text += 1
                if len(text.split()) >= input_token:
                    # enough input length
                    break
            total_time_sum = 0  # accumulator reserved for repeated runs (unused here)
            print(f"input_token: {input_token}, output_token: {output_token}")
            # print("text:", text)
            subprocess.run(
                [
                    "./build/bin/llama-cli",
                    "-m",
                    "/root/llama.cpp/models/mixtral-87B-v0.1.gguf",
                    "-p",
                    text,  # passed verbatim by subprocess; no extra quoting needed
                    "--prompt-length",  # long form of the option added in common/arg.cpp
                    str(input_token),
                    "-n",
                    str(output_token),
                    "-e",
                    "-ngl",
                    "8",
                    "-t",
                    "16",
                ],
                check=True,  # raise CalledProcessError if llama-cli exits with a non-zero status
            )
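
latency.py defines n_sample and keeps two token grids commented out, which suggests the intended workflow is a repeated full sweep. A hedged sketch of that sweep, reusing the grid values from the comments and the same llama-cli flags; the repetition loop and prompt selection below are assumptions layered on top of this commit:

"""Illustrative sweep: repeat each (input_token, output_token) pair n_sample times."""
import json
import random
import subprocess

with open("../ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
    texts = [" ".join(d["conversations"][0]["value"].split())
             for d in json.load(f) if d["conversations"]]
random.seed(0)
random.shuffle(texts)

n_sample = 3
idx_text = 0
for input_token in [32, 64, 128, 256, 512]:
    for output_token in [64, 128, 256, 512]:
        # pick the next prompt with at least input_token whitespace-separated words
        while len(texts[idx_text].split()) < input_token:
            idx_text += 1
        text = texts[idx_text]
        idx_text += 1
        for _ in range(n_sample):
            subprocess.run(
                ["./build/bin/llama-cli",
                 "-m", "/root/llama.cpp/models/mixtral-87B-v0.1.gguf",
                 "-p", text,
                 "--prompt-length", str(input_token),
                 "-n", str(output_token),
                 "-e", "-ngl", "8", "-t", "16"],
                check=True,
            )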
