
Commit 699a0dc

Author: Tristsky
Message: change for eval
Parent: 37f8c7b

4 files changed, 78 insertions(+), 2 deletions(-)

common/arg.cpp

Lines changed: 7 additions & 0 deletions

@@ -664,6 +664,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.n_predict = value;
         }
     ).set_env("LLAMA_ARG_N_PREDICT"));
+    add_opt(llama_arg(
+        {"-promp_len", "--prompt-length"}, "N",
+        format("number of tokens to process (default: %d)", params.prompt_length),
+        [](gpt_params & params, int value) {
+            params.prompt_length = value;
+        }
+    ).set_env("LLAMA_ARG_PROMPT_LENGTH"));
     add_opt(llama_arg(
         {"-b", "--batch-size"}, "N",
         format("logical maximum batch size (default: %d)", params.n_batch),

common/common.h

Lines changed: 1 addition & 0 deletions

@@ -143,6 +143,7 @@ struct gpt_sampler_params {

 struct gpt_params {
     int32_t n_predict     =   -1; // new tokens to predict
+    int32_t prompt_length =   -1; // number of prompt tokens to process (-1 = use the full prompt)
     int32_t n_ctx         =    0; // context size
     int32_t n_batch       = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch      =  512; // physical batch size for prompt processing (must be >=32 to use BLAS)
examples/main/main.cpp

Lines changed: 6 additions & 2 deletions

@@ -301,7 +301,7 @@ int main(int argc, char ** argv) {
            LOG_DBG("use session tokens\n");
            embd_inp = session_tokens;
        }
-
+        if (params.prompt_length > 0) { embd_inp.resize(params.prompt_length); } // truncate the tokenized prompt only when a length was requested
        LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
        LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
    }

@@ -927,7 +927,11 @@ int main(int argc, char ** argv) {
    LOG("\n\n");
    gpt_perf_print(ctx, smpl);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
-
+    auto timings = llama_perf_context(ctx);
+    std::ofstream file; // append one CSV row per run: prompt_length, n_predict, decode speed (tokens/s)
+    file.open("latency.txt", std::ios::app);
+    file << params.prompt_length << "," << params.n_predict << "," << 1e3 / timings.t_eval_ms * timings.n_eval << std::endl;
+    file.close();
    gpt_sampler_free(smpl);

    llama_free(ctx);
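
The value written per run is 1e3 / t_eval_ms * n_eval, i.e. the number of decoded tokens divided by the evaluation time in seconds, so each line of latency.txt reads prompt_length,n_predict,decode tokens/s. A small sketch of how such a file could be summarized after a sweep; the aggregation itself is illustrative and not part of this commit:

import csv
from collections import defaultdict

# Group decode speed (tokens/s) by (prompt_length, n_predict) and report the mean per pair.
speeds = defaultdict(list)
with open("latency.txt") as f:
    for row in csv.reader(f):
        try:
            prompt_len, n_predict, tok_per_s = int(row[0]), int(row[1]), float(row[2])
        except (ValueError, IndexError):
            continue  # skip header lines and anything malformed
        speeds[(prompt_len, n_predict)].append(tok_per_s)

for (prompt_len, n_predict), vals in sorted(speeds.items()):
    avg = sum(vals) / len(vals)
    print(f"prompt_length={prompt_len} n_predict={n_predict}: {avg:.2f} tok/s over {len(vals)} run(s)")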

latency.py

Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
"""Microbenchmarking for CPU offloading"""

import json
import random
import subprocess

if __name__ == "__main__":
    # first-round ShareGPT prompts are used as benchmark inputs
    path_json = "../ShareGPT_V3_unfiltered_cleaned_split.json"
    with open(path_json, "r") as f:
        data = json.load(f)

    texts = []
    for d in data:
        if len(d["conversations"]) == 0:
            continue
        # the input of the first round
        texts.append(" ".join(d["conversations"][0]["value"].split()))

    random.seed(0)
    random.shuffle(texts)
    n_sample = 3  # repetitions per configuration (not used by the single-run loop below)
    # for input_token in [16, 32, 64, 128]:
    #     for output_token in [16, 32, 64, 128, 256, 512]:

    # CSV header matching the three fields appended by llama-cli (see examples/main/main.cpp)
    with open("./latency.txt", "a") as f:
        f.write("input_token,output_token,token/s\n")

    idx_text = 0
    # for input_token in [32, 64, 128, 256, 512]:
    #     for output_token in [64, 128, 256, 512]:
    for input_token in [32]:
        for output_token in [64]:
            # pick the next prompt that is long enough for the requested input length
            while True:
                text = texts[idx_text]
                idx_text += 1
                if len(text.split()) >= input_token:
                    # enough input length
                    break
            total_time_sum = 0  # accumulator reserved for repeated runs (unused here)
            print(f"input_token: {input_token}, output_token: {output_token}")
            # print("text:", text)
            subprocess.run(
                [
                    "./build/bin/llama-cli",
                    "-m",
                    "/root/llama.cpp/models/mixtral-87B-v0.1.gguf",
                    "-p",
                    text,  # passed verbatim by subprocess; no extra quoting needed
                    "--prompt-length",  # long form of the option added in common/arg.cpp
                    str(input_token),
                    "-n",
                    str(output_token),
                    "-e",
                    "-ngl",
                    "8",
                    "-t",
                    "16",
                ],
                check=True,  # raise CalledProcessError if llama-cli exits with a non-zero status
            )
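
latency.py defines n_sample and keeps two token grids commented out, which suggests the intended workflow is a repeated full sweep. A hedged sketch of that sweep, reusing the grid values from the comments and the same llama-cli flags; the repetition loop and prompt selection below are assumptions layered on top of this commit:

"""Illustrative sweep: repeat each (input_token, output_token) pair n_sample times."""
import json
import random
import subprocess

with open("../ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
    texts = [" ".join(d["conversations"][0]["value"].split())
             for d in json.load(f) if d["conversations"]]
random.seed(0)
random.shuffle(texts)

n_sample = 3
idx_text = 0
for input_token in [32, 64, 128, 256, 512]:
    for output_token in [64, 128, 256, 512]:
        # pick the next prompt with at least input_token whitespace-separated words
        while len(texts[idx_text].split()) < input_token:
            idx_text += 1
        text = texts[idx_text]
        idx_text += 1
        for _ in range(n_sample):
            subprocess.run(
                ["./build/bin/llama-cli",
                 "-m", "/root/llama.cpp/models/mixtral-87B-v0.1.gguf",
                 "-p", text,
                 "--prompt-length", str(input_token),
                 "-n", str(output_token),
                 "-e", "-ngl", "8", "-t", "16"],
                check=True,
            )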
