
Commit 1e9a596

cccclai authored and facebook-github-bot committed
Support multiple prompts in the runner
Summary: As preparation for multiturn conversation, the runner can now accept multiple prompts and execute them in sequence. Example command:

```
./qnn_llama3_2_runner --model_path hybrid_llama_qnn.pte --tokenizer_path tiktokenizer.bin --eval_mode 1 --prompt "Once upon a time" --prompt "girl named Lily." --prompt "her toys and her favorite toy was a big," --kv_updater "ShiftPointer" --logits_scale 0.1 --output_path output.txt --num_iters 1
```

Since it would be hard to pick a delimiter character that can never appear in prompt text, each prompt is explicitly marked with its own `--prompt` flag, and the runner collects them together.

Differential Revision: D72276104
1 parent 150cbe1 commit 1e9a596

File tree

1 file changed: 19 additions, 3 deletions

examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp

Lines changed: 19 additions & 3 deletions
```diff
@@ -35,6 +35,7 @@ DEFINE_string(
     "Records inference speed. For CI purpose.");
 DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff.");
 DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");
+
 DEFINE_string(
     system_prompt,
     "",
@@ -59,9 +60,22 @@ DEFINE_string(
     "SmartMask");
 DEFINE_int32(num_iters, 1, "total num of iterations to run.");
 
+std::vector<std::string> CollectPrompts(int argc, char** argv) {
+  // Collect all prompts from command line, example usage:
+  // --prompt "prompt1" --prompt "prompt2" --prompt "prompt3"
+  std::vector<std::string> prompts;
+  for (int i = 1; i < argc; i++) {
+    if (std::string(argv[i]) == "--prompt" && i + 1 < argc) {
+      prompts.push_back(argv[i + 1]);
+      i++; // Skip the next argument
+    }
+  }
+  return prompts;
+}
+
 int main(int argc, char** argv) {
+  std::vector<std::string> prompts = CollectPrompts(argc, argv);
   gflags::ParseCommandLineFlags(&argc, &argv, true);
-
   // create llama runner
   example::Runner runner(
       {FLAGS_model_path},
@@ -83,11 +97,13 @@ int main(int argc, char** argv) {
   };
   // generate tokens & store inference output
   for (int i = 0; i < FLAGS_num_iters; i++) {
-    runner.generate(
+    for (const auto& prompt : prompts) {
+      runner.generate(
         FLAGS_seq_len,
-        FLAGS_prompt.c_str(),
+        prompt.c_str(),
         FLAGS_system_prompt.c_str(),
         callback);
+    }
   }
   fout.write(buf.data(), buf.size());
   fout.close();
```

0 commit comments
