huggingface · clefourrier · Feb 22, 2024 · Feb 16, 2024 · Feb 22, 2024 · Feb 22, 2024
diff --git a/run_evals_accelerate.py b/run_evals_accelerate.py
@@ -43,6 +43,7 @@ def get_parser():
         help="Whether to force multiple choice continuations to not start with a space",
     )
     parser.add_argument("--use_chat_template", default=False, action="store_true")
+    parser.add_argument("--system_prompt", type=str, default=None)
     # Model type 2) TGI
     task_type_group.add_argument("--inference_server_address", type=str)
     parser.add_argument("--inference_server_auth", type=str, default=None)

diff --git a/src/lighteval/few_shot_manager.py b/src/lighteval/few_shot_manager.py
@@ -163,10 +163,12 @@ def get_examples_with_chat_template(
         example: str,
         instruction: str,
         fewshot_ex: list[str],
+        system_prompt: str,
     ):
         examples = []
+        if system_prompt is not None:
+            examples.append({"role": "system", "content": system_prompt})
         for ex in fewshot_ex:
-            # many places to put these "\n" though
             examples.append({"role": "user", "content": task.doc_to_text_without_instructions(ex)})
             examples.append({"role": "assistant", "content": task.doc_to_target(ex)})
         # We add the actual example
@@ -202,6 +204,7 @@ def fewshot_context(
         max_model_length: Optional[int] = None,
         tokenizer: Optional[AutoTokenizer] = None,
         use_chat_template=False,
+        system_prompt: str = None,
     ):
         """Returns a fewshot context string that is made up of a prepended description
         (if provided), the `num_fewshot` number of examples, and an appended prompt example.
@@ -230,7 +233,12 @@ def fewshot_context(
 
         if use_chat_template:
             output = self.get_examples_with_chat_template(
-                task=task, tokenizer=tokenizer, example=example, instruction=instruction, fewshot_ex=fewshot_ex
+                task=task,
+                tokenizer=tokenizer,
+                example=example,
+                instruction=instruction,
+                fewshot_ex=fewshot_ex,
+                system_prompt=system_prompt,
             )
             toks = tokenizer(output)["input_ids"]
         else:
@@ -254,6 +262,7 @@ def fewshot_context(
                         example=example,
                         instruction=instruction,
                         fewshot_ex=fewshot_ex[:num_effective_fewshots],
+                        system_prompt=system_prompt,
                     )
                     toks = tokenizer(output)["input_ids"]
                 else:

diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py
@@ -69,13 +69,14 @@ def main(args):
 
             hlog("Loading documents, and requests")
             requests, docs = create_requests_from_tasks(
-                task_dict,
-                few_shots_dict,
-                args.num_fewshot_seeds,
-                model,
-                args.max_samples,
-                evaluation_tracker,
-                args.use_chat_template,
+                task_dict=task_dict,
+                fewshot_dict=few_shots_dict,
+                num_fewshot_seeds=args.num_fewshot_seeds,
+                lm=model,
+                max_samples=args.max_samples,
+                evaluation_tracker=evaluation_tracker,
+                use_chat_template=args.use_chat_template,
+                system_prompt=args.system_prompt,
             )
 
     with htrack_block("Setting seeds and waiting for all processes"):

diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py
@@ -129,6 +129,7 @@ def main(
                 max_samples=lighteval_config.tasks.max_samples,
                 evaluation_tracker=evaluation_tracker,
                 use_chat_template=False,
+                system_prompt=None,
             )
 
     with htrack_block("Setting seeds and waiting for all processes"):

diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
@@ -529,6 +529,7 @@ def create_requests_from_tasks(  # noqa: C901
     max_samples: int,
     evaluation_tracker: "EvaluationTracker",
     use_chat_template: bool,
+    system_prompt: str,
 ) -> Tuple[dict[RequestType, list[Request]], dict[TaskExampleId, Doc]]:
     """
     Takes a task dict and a fewshot dict and returns a dict of requests, a dict
@@ -598,10 +599,16 @@ def create_requests_from_tasks(  # noqa: C901
                         sampler=rnd,
                         tokenizer=lm.tokenizer,
                         use_chat_template=use_chat_template,
+                        system_prompt=system_prompt,
                     )
                     doc.num_effective_few_shots = num_effective_few_shots
                     doc.num_asked_few_shots = num_fewshot
                     doc.ctx = ctx
+                    if use_chat_template:  # tokenize=False: keep choices as text; the default returns token ids
+                        doc.choices = [
+                            lm.tokenizer.apply_chat_template([{"role": "assistant", "content": c}], tokenize=False)
+                            for c in doc.choices
+                        ]
 
                     # Constructing the requests
                     docs[TaskExampleId(cur_task_name, doc_id_seed)] = doc