
Commit 7c052a9

v1.3.2 (#40)
#### Added features

* Allow for configuration and evaluation of system prompts in all LLM-Classes
* CSV Callback is now FileOutputCallback and able to write Parquet files
* Fixed LLM-Call templates in VLLM
* Refined OPRO-implementation to be closer to the paper
1 parent c12ab62 commit 7c052a9

File tree: 16 files changed (+319, -173 lines)

docs/release-notes.md (12 additions, 0 deletions)

@@ -1,5 +1,15 @@
 # Release Notes
 
+## Release v1.3.2
+### What's changed
+#### Added features
+* Allow for configuration and evaluation of system prompts in all LLM-Classes
+* CSV Callback is now FileOutputCallback and able to write Parquet files
+* Fixed LLM-Call templates in VLLM
+* refined OPRO-implementation to be closer to the paper
+
+**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.3.1...v1.3.2)
+
 ## Release v1.3.1
 ### What's changed
 #### Added features
@@ -9,6 +19,8 @@
 * generalize the Classificator
 * add verbosity and callback handling in EvoPromptGA
 * add timestamp to the callback
+* removed datasets from repo
+* changed task creation (now by default with a dataset)
 
 **Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.3.0...v1.3.1)
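As a quick illustration of the headline change, here is a minimal, hedged sketch of passing system prompts through the updated `get_response` signature. The `APILLM` class name, its constructor arguments, and the model id are assumptions for illustration and are not taken from this diff:

```python
# Hedged sketch of the new system_prompts argument on get_response.
# APILLM, its constructor arguments, and the model id are illustrative assumptions.
from promptolution.llms.api_llm import APILLM

llm = APILLM(model_id="meta-llama/Meta-Llama-3-8B-Instruct", token="<api-token>")

responses = llm.get_response(
    ["Classify the sentiment of: 'The plot was delightful.'"],
    system_prompts="Answer with exactly one word: positive or negative.",
)
print(responses[0])
```

Omitting `system_prompts` falls back to `DEFAULT_SYS_PROMPT`, as the base-class diff further below shows.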

promptolution/callbacks.py (35 additions, 48 deletions)

@@ -66,6 +66,8 @@ def on_step_end(self, optimizer):
         self.step += 1
         time = datetime.now().strftime("%d-%m-%y %H:%M:%S:%f")
         self.logger.critical(f"{time} - ✨Step {self.step} ended✨")
+        time = datetime.now().strftime("%d-%m-%y %H:%M:%S:%f")
+        self.logger.critical(f"{time} - ✨Step {self.step} ended✨")
         for i, (prompt, score) in enumerate(zip(optimizer.prompts, optimizer.scores)):
             self.logger.critical(f"*** Prompt {i}: Score: {score}")
             self.logger.critical(f"{prompt}")
@@ -80,40 +82,48 @@ def on_train_end(self, optimizer, logs=None):
             logs: Additional information to log.
         """
         time = datetime.now().strftime("%d-%m-%y %H:%M:%S:%f")
+        time = datetime.now().strftime("%d-%m-%y %H:%M:%S:%f")
         if logs is None:
             self.logger.critical(f"{time} - Training ended")
+            self.logger.critical(f"{time} - Training ended")
         else:
             self.logger.critical(f"{time} - Training ended - {logs}")
+            self.logger.critical(f"{time} - Training ended - {logs}")
 
         return True
 
 
-class CSVCallback(Callback):
-    """Callback for saving optimization progress to a CSV file.
+class FileOutputCallback(Callback):
+    """Callback for saving optimization progress to a specified file type.
 
-    This callback saves prompts and scores at each step to a CSV file.
+    This callback saves information about each step to a file.
 
     Attributes:
-        dir (str): Directory the CSV file is saved to.
+        dir (str): Directory the file is saved to.
         step (int): The current step number.
+        file_type (str): The type of file to save the output to.
     """
 
-    def __init__(self, dir):
-        """Initialize the CSVCallback.
+    def __init__(self, dir, file_type: Literal["parquet", "csv"] = "parquet"):
+        """Initialize the FileOutputCallback.
 
         Args:
             dir (str): Directory the CSV file is saved to.
+            file_type (str): The type of file to save the output to.
         """
         if not os.path.exists(dir):
             os.makedirs(dir)
 
-        self.dir = dir
-        self.dir = dir
+        self.file_type = file_type
+
+        if file_type == "parquet":
+            self.path = dir + "/step_results.parquet"
+        elif file_type == "csv":
+            self.path = dir + "/step_results.csv"
+        else:
+            raise ValueError(f"File type {file_type} not supported.")
+
         self.step = 0
-        self.input_tokens = 0
-        self.output_tokens = 0
-        self.start_time = datetime.now()
-        self.step_time = datetime.now()
 
     def on_step_end(self, optimizer):
         """Save prompts and scores to csv.
@@ -125,47 +135,24 @@ def on_step_end(self, optimizer):
         df = pd.DataFrame(
             {
                 "step": [self.step] * len(optimizer.prompts),
-                "input_tokens": [optimizer.meta_llm.input_token_count - self.input_tokens] * len(optimizer.prompts),
-                "output_tokens": [optimizer.meta_llm.output_token_count - self.output_tokens] * len(optimizer.prompts),
-                "time_elapsed": [(datetime.now() - self.step_time).total_seconds()] * len(optimizer.prompts),
+                "input_tokens": [optimizer.meta_llm.input_token_count] * len(optimizer.prompts),
+                "output_tokens": [optimizer.meta_llm.output_token_count] * len(optimizer.prompts),
+                "time": [datetime.now().total_seconds()] * len(optimizer.prompts),
                 "score": optimizer.scores,
                 "prompt": optimizer.prompts,
             }
         )
-        self.step_time = datetime.now()
-        self.input_tokens = optimizer.meta_llm.input_token_count
-        self.output_tokens = optimizer.meta_llm.output_token_count
-
-        if not os.path.exists(self.dir + "step_results.csv"):
-            df.to_csv(self.dir + "step_results.csv", index=False)
-        else:
-            df.to_csv(self.dir + "step_results.csv", mode="a", header=False, index=False)
-
-        return True
-
-    def on_train_end(self, optimizer):
-        """Called at the end of training.
-
-        Args:
-            optimizer: The optimizer object that called the callback.
-        """
-        df = pd.DataFrame(
-            dict(
-                steps=self.step,
-                input_tokens=optimizer.meta_llm.input_token_count,
-                output_tokens=optimizer.meta_llm.output_token_count,
-                time_elapsed=(datetime.now() - self.start_time).total_seconds(),
-                time=datetime.now(),
-                score=np.array(optimizer.scores).mean(),
-                best_prompts=str(optimizer.prompts),
-            ),
-            index=[0],
-        )
 
-        if not os.path.exists(self.dir + "train_results.csv"):
-            df.to_csv(self.dir + "train_results.csv", index=False)
-        else:
-            df.to_csv(self.dir + "train_results.csv", mode="a", header=False, index=False)
+        if self.file_type == "parquet":
+            if self.step == 1:
+                df.to_parquet(self.path, index=False)
+            else:
+                df.to_parquet(self.path, mode="a", index=False)
+        elif self.file_type == "csv":
+            if self.step == 1:
+                df.to_csv(self.path, index=False)
+            else:
+                df.to_csv(self.path, mode="a", header=False, index=False)
 
         return True
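A hedged usage sketch of the renamed callback: only the `FileOutputCallback` constructor comes from the diff, while the optimizer wiring (a `callbacks=[...]` argument on `EvoPromptGA`) is an assumption for illustration.

```python
# Hedged sketch: wiring FileOutputCallback into a run and inspecting its output.
# The optimizer wiring (callbacks=[...]) is an assumption for illustration.
import pandas as pd

from promptolution.callbacks import FileOutputCallback

callback = FileOutputCallback(dir="results", file_type="csv")

# optimizer = EvoPromptGA(..., callbacks=[callback])  # assumed wiring
# optimizer.optimize(n_steps=10)

# After a run, step_results.csv holds one row per prompt per step:
df = pd.read_csv("results/step_results.csv")
print(df.groupby("step")["score"].max())
```

Note that the default pandas/pyarrow writer cannot append to an existing Parquet file, so the `csv` option may be the more robust choice when results are written incrementally.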

promptolution/llms/api_llm.py (6 additions, 4 deletions)

@@ -10,19 +10,20 @@
 import requests
 from langchain_anthropic import ChatAnthropic
 from langchain_community.chat_models.deepinfra import ChatDeepInfra, ChatDeepInfraException
-from langchain_core.messages import HumanMessage
+from langchain_core.messages import HumanMessage, SystemMessage
 from langchain_openai import ChatOpenAI
 
 from promptolution.llms.base_llm import BaseLLM
 
 logger = Logger(__name__)
 
 
-async def invoke_model(prompt, model, semaphore):
+async def invoke_model(prompt, system_prompt, model, semaphore):
     """Asynchronously invoke a language model with retry logic.
 
     Args:
         prompt (str): The input prompt for the model.
+        system_prompt (str): The system prompt for the model.
         model: The language model to invoke.
         semaphore (asyncio.Semaphore): Semaphore to limit concurrent calls.
 
@@ -39,7 +40,7 @@ async def invoke_model(prompt, model, semaphore):
 
     while attempts < max_retries:
         try:
-            response = await model.ainvoke([HumanMessage(content=prompt)])
+            response = await model.ainvoke([SystemMessage(content=system_prompt), HumanMessage(content=prompt)])
             return response.content
         except ChatDeepInfraException as e:
             print(f"DeepInfra error: {e}. Attempt {attempts}/{max_retries}. Retrying in {delay} seconds...")
@@ -80,13 +81,14 @@ def __init__(self, model_id: str, token: str = None, **kwargs: Any):
         else:
             self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token)
 
-    def _get_response(self, prompts: List[str]) -> List[str]:
+    def _get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]:
         """Get responses for a list of prompts in a synchronous manner.
 
         This method includes retry logic for handling connection errors and rate limits.
 
         Args:
             prompts (list[str]): List of input prompts.
+            system_prompts (list[str]): List of system prompts. If not provided, uses default system_prompts
 
         Returns:
             list[str]: List of model responses.
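A self-contained sketch of the semaphore-limited invocation pattern with the new `SystemMessage`; the stub `EchoModel` stands in for a real LangChain chat model so the example runs without API credentials, and the retry logic from the diff is omitted for brevity:

```python
# Self-contained sketch of the core of invoke_model: a semaphore caps concurrency
# and a SystemMessage is sent alongside each HumanMessage.
# EchoModel is a stub so the example runs without API credentials.
import asyncio
from types import SimpleNamespace

from langchain_core.messages import HumanMessage, SystemMessage


class EchoModel:
    """Stand-in for a LangChain chat model; only ainvoke is used here."""

    async def ainvoke(self, messages):
        return SimpleNamespace(content=" | ".join(m.content for m in messages))


async def invoke_model(prompt, system_prompt, model, semaphore):
    async with semaphore:
        response = await model.ainvoke([SystemMessage(content=system_prompt), HumanMessage(content=prompt)])
        return response.content


async def main():
    semaphore = asyncio.Semaphore(2)  # cap concurrent model calls
    model = EchoModel()
    prompts = ["What is 2 + 2?", "Name a prime number."]
    results = await asyncio.gather(
        *(invoke_model(p, "You are a terse assistant.", model, semaphore) for p in prompts)
    )
    print(results)


asyncio.run(main())
```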

promptolution/llms/base_llm.py (21 additions, 5 deletions)

@@ -6,6 +6,8 @@
 
 import numpy as np
 
+from promptolution.templates import DEFAULT_SYS_PROMPT
+
 logger = logging.getLogger(__name__)
 
 
@@ -54,7 +56,7 @@ def update_token_count(self, inputs: List[str], outputs: List[str]):
         self.input_token_count += input_tokens
         self.output_token_count += output_tokens
 
-    def get_response(self, prompts: str) -> str:
+    def get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]:
         """Generate responses for the given prompts.
 
         This method calls the _get_response method to generate responses
@@ -64,31 +66,45 @@ def get_response(self, prompts: str) -> str:
         Args:
             prompts (str or List[str]): Input prompt(s). If a single string is provided,
                 it's converted to a list containing that string.
+            system_prompts (str or List[str]): System prompt(s) to provide context to the model.
 
         Returns:
             List[str]: A list of generated responses, one for each input prompt.
         """
+        if system_prompts is None:
+            system_prompts = DEFAULT_SYS_PROMPT
         if isinstance(prompts, str):
             prompts = [prompts]
-        responses = self._get_response(prompts)
-        self.update_token_count(prompts, responses)
+        if isinstance(system_prompts, str):
+            system_prompts = [system_prompts] * len(prompts)
+        responses = self._get_response(prompts, system_prompts)
+        self.update_token_count(prompts + system_prompts, responses)
 
         return responses
 
+    def set_generation_seed(self, seed: int):
+        """Set the random seed for reproducibility per request.
+
+        Args:
+            seed (int): Random seed value.
+        """
+        pass
+
     @abstractmethod
-    def _get_response(self, prompts: List[str]) -> List[str]:
+    def _get_response(self, prompts: List[str], system_prompts: List[str] = None) -> List[str]:
         """Generate responses for the given prompts.
 
         This method should be implemented by subclasses to define how
         the LLM generates responses.
 
         Args:
             prompts (List[str]): A list of input prompts.
+            system_prompts (List[str]): A list of system prompts to provide context to the model.
 
         Returns:
             List[str]: A list of generated responses corresponding to the input prompts.
         """
-        pass
+        raise NotImplementedError
 
 
 class DummyLLM(BaseLLM):
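To make the new normalization logic concrete, here is a self-contained sketch that mirrors the updated `get_response` flow with a toy subclass; `EchoLLM` and the local `DEFAULT_SYS_PROMPT` are stand-ins, not promptolution code, so the snippet runs without the package installed:

```python
# Mirror of the prompt/system-prompt normalization added to BaseLLM.get_response.
from typing import List

DEFAULT_SYS_PROMPT = "You are a helpful assistant."  # stand-in for promptolution.templates


class EchoLLM:
    def get_response(self, prompts, system_prompts=None) -> List[str]:
        if system_prompts is None:
            system_prompts = DEFAULT_SYS_PROMPT
        if isinstance(prompts, str):
            prompts = [prompts]
        if isinstance(system_prompts, str):
            # A single system prompt is broadcast to every user prompt.
            system_prompts = [system_prompts] * len(prompts)
        return self._get_response(prompts, system_prompts)

    def _get_response(self, prompts, system_prompts):
        return [f"[{s}] {p}" for p, s in zip(prompts, system_prompts)]


llm = EchoLLM()
print(llm.get_response("Summarise the release notes."))  # falls back to DEFAULT_SYS_PROMPT
print(llm.get_response(["a", "b"], system_prompts="Reply in one word."))
```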

promptolution/llms/local_llm.py (6 additions, 2 deletions)

@@ -50,7 +50,7 @@ def __init__(self, model_id: str, batch_size=8):
         self.pipeline.tokenizer.pad_token_id = self.pipeline.tokenizer.eos_token_id
         self.pipeline.tokenizer.padding_side = "left"
 
-    def _get_response(self, prompts: list[str]):
+    def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[str]:
         """Generate responses for a list of prompts using the local language model.
 
         Args:
@@ -63,8 +63,12 @@ def _get_response(self, prompts: list[str]):
         This method uses torch.no_grad() for inference to reduce memory usage.
         It handles both single and batch inputs, ensuring consistent output format.
         """
+        inputs = []
+        for prompt, sys_prompt in zip(prompts, system_prompts):
+            inputs.append([{"role": "system", "prompt": sys_prompt}, {"role": "user", "prompt": prompt}])
+
         with torch.no_grad():
-            response = self.pipeline(prompts, pad_token_id=self.pipeline.tokenizer.eos_token_id)
+            response = self.pipeline(inputs, pad_token_id=self.pipeline.tokenizer.eos_token_id)
 
         if len(response) != 1:
             response = [r[0] if isinstance(r, list) else r for r in response]
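The diff builds each message as `{"role": ..., "prompt": ...}`, while the conventional schema that Hugging Face chat templates expect is `role`/`content`. A hedged sketch of chat-formatted pipeline input using that conventional schema follows; the model id is illustrative and any chat-tuned model would do:

```python
# Hedged sketch of chat-formatted input for a transformers text-generation pipeline.
from transformers import pipeline

pipe = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-135M-Instruct")

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Give me one prompt-engineering tip."},
]

out = pipe(messages, max_new_tokens=32, pad_token_id=pipe.tokenizer.eos_token_id)
# The pipeline returns the conversation with the assistant's reply appended.
print(out[0]["generated_text"][-1]["content"])
```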

promptolution/llms/vllm.py (13 additions, 4 deletions)

@@ -108,7 +108,7 @@ def __init__(
         # Initialize tokenizer separately for potential pre-processing
         self.tokenizer = AutoTokenizer.from_pretrained(model_id)
 
-    def _get_response(self, inputs: list[str]):
+    def _get_response(self, prompts: list[str], system_prompts: list[str]) -> list[str]:
         """Generate responses for a list of prompts using the vLLM engine.
 
         Args:
@@ -126,13 +126,14 @@ def _get_response(self, inputs: list[str]):
                 [
                     {
                         "role": "system",
-                        "content": "You are a helpful assistant.",
+                        "content": sys_prompt,
                     },
-                    {"role": "user", "content": input},
+                    {"role": "user", "content": prompt},
                 ],
                 tokenize=False,
+                add_generation_prompt=True,
             )
-            for input in inputs
+            for prompt, sys_prompt in zip(prompts, system_prompts)
         ]
 
         # generate responses for self.batch_size prompts at the same time
@@ -161,6 +162,14 @@ def update_token_count(self, inputs: List[str], outputs: List[str]):
         for output in outputs:
             self.output_token_count += len(self.tokenizer.encode(output))
 
+    def set_generation_seed(self, seed):
+        """Set the random seed for text generation.
+
+        Args:
+            seed (int): Random seed for text generation.
+        """
+        self.sampling_params.seed = seed
+
     def __del__(self):
         """Cleanup method to delete the LLM instance and free up GPU memory."""
         del self.llm
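A hedged sketch of the chat-template preprocessing and the per-request seed; the tokenizer id is illustrative, and vLLM itself is only needed for the commented-out part:

```python
# Hedged sketch of the chat-template step used above; tokenizer id is illustrative.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")

prompts = ["What does OPRO optimize?"]
system_prompts = ["You are a concise assistant."]

templated = [
    tokenizer.apply_chat_template(
        [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": prompt},
        ],
        tokenize=False,
        add_generation_prompt=True,  # append the assistant header so generation starts a fresh reply
    )
    for prompt, sys_prompt in zip(prompts, system_prompts)
]
print(templated[0])

# With vLLM installed, a fixed seed makes sampling reproducible per request,
# mirroring set_generation_seed above:
# from vllm import SamplingParams
# sampling_params = SamplingParams(temperature=0.7, seed=42)
```

Without `add_generation_prompt=True`, many chat templates leave the string ending after the user turn, which can cause the model to continue the user message instead of answering it.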
