Go back to repetition_penalty 1.0 as default; 1.07 hurts larger models too much when repeated tokens are normal, e.g. answering "What is bigger, 9.9 or 9.11?" with CoT. Also fix system prompt use when checking for exact matches.
pseudotensor committed Aug 9, 2024
1 parent 842e770 commit 4a8bda6
Showing 5 changed files with 8 additions and 7 deletions.
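
For context on the first change: in the standard Hugging Face implementation, a repetition_penalty above 1.0 divides the logit of every token already present in the context, so answers that must legitimately repeat tokens, such as the digits in "9.9" vs "9.11" during chain-of-thought, get suppressed. A minimal sketch of the effect using the stock transformers processor; the token ids and logits below are toy values for illustration, not h2ogpt code:

import torch
from transformers import RepetitionPenaltyLogitsProcessor

# Toy setup: 4-token vocabulary, token 0 plays the role of "9".
input_ids = torch.tensor([[0, 1, 0]])          # already generated: "9", ".", "9"
scores = torch.tensor([[3.0, 1.0, 0.5, 0.2]])  # raw next-token logits

for penalty in (1.0, 1.07):
    processor = RepetitionPenaltyLogitsProcessor(penalty=penalty)
    print(penalty, processor(input_ids, scores.clone())[0].tolist())

# penalty=1.0 leaves the logits unchanged; penalty=1.07 drops the logit of
# the repeated "9" from 3.0 to ~2.80, steering generation away from digits
# the model needs to repeat.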
3 changes: 2 additions & 1 deletion gradio_utils/grclient.py
@@ -732,7 +732,8 @@ def query_or_summarize_or_extract(
             temperature: float = 0.0,
             top_p: float = 1.0,
             top_k: int = 40,
-            repetition_penalty: float = 1.07,
+            # 1.07 causes issues still with more repetition
+            repetition_penalty: float = 1.0,
             penalty_alpha: float = 0.0,
             max_time: int = 360,
             max_new_tokens: int = 1024,
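
Callers of the Gradio client that prefer the old behavior can still pass the previous value explicitly. A hypothetical usage sketch: the host URL and instruction text are assumptions, and only repetition_penalty comes from this diff:

from gradio_utils.grclient import GradioClient

client = GradioClient("http://localhost:7860")  # assumed local h2oGPT server
answer = client.query_or_summarize_or_extract(
    instruction="What is bigger, 9.9 or 9.11?",  # assumed parameter usage
    repetition_penalty=1.07,  # opt back into the old default; new default is 1.0
)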
4 changes: 2 additions & 2 deletions src/gen.py
@@ -4494,7 +4494,7 @@ def get_generate_params(model_lower,
         penalty_alpha = 0 if penalty_alpha is None else penalty_alpha
         num_beams = num_beams or 1
         max_new_tokens = max_new_tokens or 512
-        repetition_penalty = repetition_penalty or 1.07
+        repetition_penalty = repetition_penalty or 1.0  # 1.07 causes issues still with more repetition
         num_return_sequences = min(num_beams, num_return_sequences or 1)
         do_sample = False if do_sample is None else do_sample
     else:
@@ -4504,7 +4504,7 @@ def get_generate_params(model_lower,
         penalty_alpha = 0 if penalty_alpha is None else penalty_alpha
         num_beams = num_beams or 1
         max_new_tokens = max_new_tokens or 1024
-        repetition_penalty = repetition_penalty or 1.07
+        repetition_penalty = repetition_penalty or 1.0  # 1.07 causes issues still with more repetition
         num_return_sequences = min(num_beams, num_return_sequences or 1)
         do_sample = False if do_sample is None else do_sample
         # doesn't include chat, instruction_nochat, iinput_nochat, added later
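
A side note on the `or`-default idiom used in get_generate_params: any falsy value (None, 0, 0.0) falls through to the default, which is the intended behavior for an unset parameter. A tiny standalone sketch:

# Falsy inputs (None, 0.0) fall back to the default; explicit values pass through.
for passed in (None, 0.0, 1.07):
    repetition_penalty = passed or 1.0
    print(passed, "->", repetition_penalty)
# None -> 1.0, 0.0 -> 1.0, 1.07 -> 1.07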
4 changes: 2 additions & 2 deletions src/prompter.py
@@ -2406,14 +2406,14 @@ def apply_chat_template(instruction, system_prompt, history, image_file,
     for si, system_prompt_to_use in enumerate(system_prompts_to_use):
         try:
             messages = structure_to_messages(instruction,
-                                             system_prompt_to_use,
+                                             system_prompt_to_use.strip() if system_prompt_to_use else system_prompt_to_use,
                                              history,
                                              image_file,
                                              )
             if not messages:
                 return ''
             prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-            if si == 0 and system_prompt_to_use not in [None, ''] and system_prompt_to_use not in prompt:
+            if si == 0 and system_prompt_to_use not in [None, ''] and system_prompt_to_use.strip() != '' and system_prompt_to_use.strip() not in prompt.strip():
                 raise ValueError("System prompt not used: %s" % system_prompt_to_use)
             break
         except Exception as e:
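
The prompter.py change matters because many chat templates trim whitespace around the system message when rendering, so a raw substring check can report a false negative for a system prompt that was actually used. A self-contained illustration; the template string is hypothetical, not from h2ogpt:

system_prompt = "You are helpful.\n"             # trailing newline from config
prompt = "<|system|>You are helpful.<|user|>hi"  # template trimmed the newline

print(system_prompt in prompt)                    # False: raw exact match fails
print(system_prompt.strip() in prompt.strip())    # True: stripped match succeeds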
2 changes: 1 addition & 1 deletion src/version.py
@@ -1 +1 @@
-__version__ = "794ec254460a0c38a2e3ae3e4437f5dc0f695a09"
+__version__ = "3618868401689179d98c95be3ecdcc64d44d2acd"
2 changes: 1 addition & 1 deletion tests/test_eval.py
@@ -79,7 +79,7 @@ def run_eval1(cpu=False, bits=None, base_model='h2oai/h2ogpt-oig-oasst1-512-6_9b
     kwargs = dict(
         stream_output=False, prompt_type=prompt_type, prompt_dict='',
         temperature=0.4, top_p=0.85, top_k=70, penalty_alpha=0.0, num_beams=1, max_new_tokens=256,
-        min_new_tokens=0, early_stopping=False, max_time=180, repetition_penalty=1.07,
+        min_new_tokens=0, early_stopping=False, max_time=180, repetition_penalty=1.0,
         num_return_sequences=1, do_sample=True, seed=0, chat=False,
         langchain_mode=langchain_mode, add_chat_history_to_context=True,
         add_search_to_context=False,
