Merged
Commits
39 commits
f9f1d40
add token count, flexible batch size and kwargs to vllm class
mo374z Mar 6, 2025
b20495f
add testing script for implementation
mo374z Mar 6, 2025
e27fa6c
fix batch size calculation
mo374z Mar 6, 2025
01eeb6d
small changes
mo374z Mar 6, 2025
045ffb8
add revision test
mo374z Mar 6, 2025
ad54496
add argument to parser
mo374z Mar 6, 2025
fc8d779
max model len to int
mo374z Mar 6, 2025
469117c
remove script
mo374z Mar 6, 2025
6b543fa
Change version and Release notes
mo374z Mar 6, 2025
619ce65
changed callback behaviour and implemented token count callback
finitearth Mar 7, 2025
2588664
added super inits
finitearth Mar 7, 2025
8c365c7
allow for splits not based on white space (such as new line break etc)
finitearth Mar 8, 2025
7e7d2b5
include task descriptions
finitearth Mar 8, 2025
edcd28d
add tokenizer based token count to vllm class
mo374z Mar 8, 2025
f2d73d4
update test run script
mo374z Mar 8, 2025
a725384
use classifiers accordingly
mo374z Mar 8, 2025
b0f7931
small fix
mo374z Mar 8, 2025
30e1712
add storage path
mo374z Mar 8, 2025
80b19d2
helpers should use classificator
mo374z Mar 8, 2025
ec4861a
use different model
mo374z Mar 8, 2025
bf7f1df
changes in opro test
mo374z Mar 8, 2025
3969e03
change get_predictor function
mo374z Mar 8, 2025
bd05cd8
fix callback calling
mo374z Mar 8, 2025
96e1bf6
change optimizer test run script
mo374z Mar 8, 2025
62c8de7
small alignments
mo374z Mar 8, 2025
1aa5606
small alignments
mo374z Mar 8, 2025
7214658
small alignments
mo374z Mar 8, 2025
0b15410
some changes to match the current optimizer implementation
mo374z Mar 8, 2025
3967978
changes in template and config
finitearth Mar 9, 2025
9f8c0b6
allow for batching of prompt creation
finitearth Mar 9, 2025
d1c9b54
update release notes and version
mo374z Mar 9, 2025
861a30c
extend csvcallback functionality
mo374z Mar 9, 2025
3bfa208
change callback csv export
mo374z Mar 9, 2025
4ef965e
change step time calculation
mo374z Mar 9, 2025
c997391
small changes
mo374z Mar 9, 2025
cde98f0
remove llm_test_run script
mo374z Mar 9, 2025
d1ba099
update release notes
mo374z Mar 9, 2025
b9f3568
fix issues in stepwise token calculation
mo374z Mar 9, 2025
a894538
small fix
mo374z Mar 9, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -7,4 +7,5 @@ __pycache__/
temp/
dist/
outputs/
results/
poetry.lock
14 changes: 14 additions & 0 deletions docs/release-notes.md
@@ -1,5 +1,19 @@
# Release Notes

## Release v1.3.0
### What's changed
#### Added features
* new features for the VLLM Wrapper (automatic batch size determination, accepting kwargs)
* allow callbacks to terminate optimization run
* add token count functionality
* renamed "Classificator"-Predictor to "FirstOccurenceClassificator"
* introduced "MarkerBasedClassifcator"
* automatic task description creation
* use task description in prompt creation
* implement CSV callbacks

**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.2.0...v1.3.0)

## Release v1.2.0
### What's changed
#### Added features
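The token-count feature listed under v1.3.0 can be exercised roughly as follows. This is a minimal sketch: the model id and token value are placeholders, and only `get_llm`, `get_response`, `get_token_count()`, and the `"total_tokens"` key are taken from the diffs below.

```python
from promptolution.llms import get_llm

# Minimal sketch; get_token_count() and the "total_tokens" key are assumed
# from the TokenCountCallback diff further down this PR.
llm = get_llm("meta-llama/Meta-Llama-3-8B-Instruct", token="<api-token>")
responses = llm.get_response(["Classify the sentiment of: 'Great movie!'"])
counts = llm.get_token_count()
print(counts["total_tokens"])  # input + output tokens accumulated so far
```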
110 changes: 92 additions & 18 deletions promptolution/callbacks.py
@@ -1,7 +1,9 @@
"""Callback classes for logging, saving, and tracking optimization progress."""

import os
import time

import numpy as np
import pandas as pd
from tqdm import tqdm

@@ -14,24 +16,33 @@ def on_step_end(self, optimizer):

Args:
optimizer: The optimizer object that called the callback.

Returns:
Bool: True if the optimization should continue, False if it should stop.
"""
pass
return True

def on_epoch_end(self, optimizer):
"""Called at the end of each optimization epoch.

Args:
optimizer: The optimizer object that called the callback.

Returns:
Bool: True if the optimization should continue, False if it should stop.
"""
pass
return True

def on_train_end(self, optimizer):
"""Called at the end of the entire optimization process.

Args:
optimizer: The optimizer object that called the callback.

Returns:
Bool: True if the optimization should continue, False if it should stop.
"""
pass
return True


class LoggerCallback(Callback):
@@ -57,14 +68,21 @@ def on_step_end(self, optimizer):
self.logger.critical(f"*** Prompt {i}: Score: {score}")
self.logger.critical(f"{prompt}")

return True

def on_train_end(self, optimizer, logs=None):
"""Log information at the end of training.

Args:
optimizer: The optimizer object that called the callback.
logs: Additional information to log.
"""
self.logger.critical(f"Training ended - {logs}")
if logs is None:
self.logger.critical("Training ended")
else:
self.logger.critical(f"Training ended - {logs}")

return True


class CSVCallback(Callback):
@@ -73,25 +91,24 @@ class CSVCallback(Callback):
This callback saves prompts and scores at each step to a CSV file.

Attributes:
path (str): The path to the CSV file.
dir (str): Directory the CSV file is saved to.
step (int): The current step number.
"""

def __init__(self, path):
def __init__(self, dir):
"""Initialize the CSVCallback.

Args:
path (str): The path to the CSV file.
dir (str): Directory the CSV file is saved to.
"""
# if dir does not exist
if not os.path.exists(os.path.dirname(path)):
os.makedirs(os.path.dirname(path))

# create file in path with header: "step,prompt,score"
with open(path, "w") as f:
f.write("step,prompt,score\n")
self.path = path
if not os.path.exists(dir):
os.makedirs(dir)

self.dir = dir
self.step = 0
self.input_tokens = 0
self.output_tokens = 0
self.step_time = time.time()

def on_step_end(self, optimizer):
"""Save prompts and scores to csv.
@@ -101,17 +118,50 @@ def on_step_end(self, optimizer):
"""
self.step += 1
df = pd.DataFrame(
{"step": [self.step] * len(optimizer.prompts), "prompt": optimizer.prompts, "score": optimizer.scores}
{
"step": [self.step] * len(optimizer.prompts),
"input_tokens": [optimizer.meta_llm.input_token_count - self.input_tokens] * len(optimizer.prompts),
"output_tokens": [optimizer.meta_llm.output_token_count - self.output_tokens] * len(optimizer.prompts),
"time_elapsed": [time.time() - self.step_time] * len(optimizer.prompts),
"score": optimizer.scores,
"prompt": optimizer.prompts,
}
)
df.to_csv(self.path, mode="a", header=False, index=False)
self.step_time = time.time()
self.input_tokens = optimizer.meta_llm.input_token_count
self.output_tokens = optimizer.meta_llm.output_token_count

if not os.path.exists(self.dir + "step_results.csv"):
df.to_csv(self.dir + "step_results.csv", index=False)
else:
df.to_csv(self.dir + "step_results.csv", mode="a", header=False, index=False)

return True

def on_train_end(self, optimizer):
"""Called at the end of training.

Args:
optimizer: The optimizer object that called the callback.
"""
pass
df = pd.DataFrame(
dict(
steps=self.step,
input_tokens=optimizer.meta_llm.input_token_count,
output_tokens=optimizer.meta_llm.output_token_count,
time_elapsed=time.time() - optimizer.start_time,
score=np.array(optimizer.scores).mean(),
best_prompts=str(optimizer.prompts),
),
index=[0],
)

if not os.path.exists(self.dir + "train_results.csv"):
df.to_csv(self.dir + "train_results.csv", index=False)
else:
df.to_csv(self.dir + "train_results.csv", mode="a", header=False, index=False)

return True


class BestPromptCallback(Callback):
@@ -139,6 +189,8 @@ def on_step_end(self, optimizer):
self.best_score = optimizer.scores[0]
self.best_prompt = optimizer.prompts[0]

return True

def get_best_prompt(self):
"""Get the best prompt and score achieved during optimization.

@@ -173,10 +225,32 @@ def on_step_end(self, optimizer):
"""
self.pbar.update(1)

return True

def on_train_end(self, optimizer):
"""Close the progress bar at the end of training.

Args:
optimizer: The optimizer object that called the callback.
"""
self.pbar.close()

return True


class TokenCountCallback(Callback):
"""Callback for stopping optimization based on the total token count."""

def __init__(self, max_tokens_for_termination):
"""Initialize the TokenCountCallback."""
self.max_tokens_for_termination = max_tokens_for_termination

def on_step_end(self, optimizer):
"""Check if the total token count exceeds the maximum allowed. If so, stop the optimization."""
token_counts = optimizer.predictor.llm.get_token_count()
total_token_count = token_counts["total_tokens"]

if total_token_count > self.max_tokens_for_termination:
return False

return True
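
For context, a minimal sketch of how an optimizer loop might act on these boolean return values. The real loop lives in the optimizer classes, so everything here apart from the callback protocol (on_step_end / on_train_end returning True to continue, False to stop) is an assumption:

```python
# Sketch only: illustrates the callback protocol; the actual optimizers differ.
class ToyOptimizer:
    def __init__(self, callbacks):
        self.callbacks = callbacks
        self.prompts = []
        self.scores = []

    def _step(self):
        # placeholder for prompt generation and evaluation
        pass

    def optimize(self, n_steps):
        for _ in range(n_steps):
            self._step()
            # any callback returning False terminates the run early
            if not all(cb.on_step_end(self) for cb in self.callbacks):
                break
        for cb in self.callbacks:
            cb.on_train_end(self)
        return self.prompts
```

Under this protocol, a `TokenCountCallback` turns a token budget into a hard stop without touching the optimizer itself.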
8 changes: 6 additions & 2 deletions promptolution/config.py
@@ -17,15 +17,17 @@ class Config:
ds_path (str): Path to the dataset. Should not be None if used.
n_steps (int): Number of optimization steps. Should not be None if used.
optimizer (str): Name of the optimizer to use. Should not be None if used.
predictor (str): Name of the predictor to use. Defaults to "FirstOccurenceClassificator".
meta_llm (str): Name of the meta language model. Should not be None if used.
downstream_llm (str): Name of the downstream language model. Should not be None if used.
evaluation_llm (str): Name of the evaluation language model. Should not be None if used.
init_pop_size (int): Initial population size. Defaults to 10.
logging_dir (str): Directory for logging. Defaults to "logs/run.csv".
experiment_name (str): Name of the experiment. Defaults to "experiment".
include_task_desc (bool): Whether to include task description. Defaults to False.
task_description (str): Task Description fed to the optimizer. Defaults to None.
donor_random (bool): Whether to use random donor prompts for EvoPromptDE. Defaults to False.
random_seed (int): Random seed for reproducibility. Defaults to 42.
model_storage_path (str): Path to the model storage directory (used for VLLM). Defaults to "../models/".
selection_mode (str): Selection mode for EvoPromptGA. Defaults to "random".
meta_bs (int): Batch size for local meta LLM. Should not be None if llm is run locally. Defaults to None.
downstream_bs (int): Batch size for local downstream LLM.
@@ -46,16 +48,18 @@ class Config:
task_name: str = None
ds_path: Path = None
optimizer: str = None
predictor: Literal["MarkerBasedClassificator", "FirstOccurenceClassificator"] = "FirstOccurenceClassificator"
meta_llm: str = None
downstream_llm: str = None
evaluation_llm: str = None
n_steps: int = None
init_pop_size: int = None
logging_dir: Path = Path("logs/run.csv")
experiment_name: str = "experiment"
include_task_desc: bool = True
task_description: str = None
donor_random: bool = False
random_seed: int = 42
model_storage_path: Optional[Path] = Path("../models/")
selection_mode: Optional[Literal["random", "wheel", "tour"]] = "random"
meta_bs: Optional[int] = None
downstream_bs: Optional[int] = None
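A hedged example of a `Config` using the new fields. All values are placeholders; only the field names come from the diff above:

```python
from pathlib import Path

from promptolution.config import Config

config = Config(
    task_name="agnews",                       # placeholder task
    ds_path=Path("data/agnews"),              # placeholder dataset path
    optimizer="evopromptga",                  # placeholder optimizer name
    predictor="MarkerBasedClassificator",     # default is "FirstOccurenceClassificator"
    meta_llm="meta-llama/Meta-Llama-3-8B-Instruct",
    downstream_llm="meta-llama/Meta-Llama-3-8B-Instruct",
    evaluation_llm="meta-llama/Meta-Llama-3-8B-Instruct",
    n_steps=10,
    init_pop_size=10,
    model_storage_path=Path("../models/"),    # used for locally stored VLLM weights
    random_seed=42,
)
```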
17 changes: 12 additions & 5 deletions promptolution/helpers.py
@@ -9,7 +9,7 @@
from promptolution.exemplar_selectors import get_exemplar_selector
from promptolution.llms import get_llm
from promptolution.optimizers import get_optimizer
from promptolution.predictors import Classificator
from promptolution.predictors import FirstOccurrenceClassificator, MarkerBasedClassificator
from promptolution.tasks import get_task


@@ -27,7 +27,7 @@ def run_experiment(config: Config):
return df


def run_optimization(config: Config):
def run_optimization(config: Config, callbacks: List = None):
"""Run the optimization phase of the experiment.

Args:
@@ -37,8 +37,13 @@ def run_optimization(config: Config, callbacks: List = None):
List[str]: The optimized list of prompts.
"""
task = get_task(config)
llm = get_llm(config.meta_llm, token=config.api_token)
predictor = Classificator(llm, classes=task.classes)
llm = get_llm(config.meta_llm, token=config.api_token, model_storage_path=config.model_storage_path)
if config.predictor == "MarkerBasedClassificator":
predictor = MarkerBasedClassificator(llm, classes=task.classes)
elif config.predictor == "FirstOccurenceClassificator":
predictor = FirstOccurrenceClassificator(llm, classes=task.classes)
else:
raise ValueError(f"Predictor {config.predictor} not supported.")

if config.init_pop_size:
init_pop = np.random.choice(task.initial_population, size=config.init_pop_size, replace=True)
@@ -52,6 +57,8 @@
task=task,
predictor=predictor,
n_eval_samples=config.n_eval_samples,
callbacks=callbacks,
task_description=predictor.extraction_description,
)

prompts = optimizer.optimize(n_steps=config.n_steps)
@@ -76,7 +83,7 @@ def run_evaluation(config: Config, prompts: List[str]):
task = get_task(config, split="test")

llm = get_llm(config.evaluation_llm, token=config.api_token)
predictor = Classificator(llm, classes=task.classes)
predictor = FirstOccurrenceClassificator(llm, classes=task.classes)

scores = task.evaluate(prompts, predictor, subsample=True, n_samples=config.n_eval_samples)
df = pd.DataFrame(dict(prompt=prompts, score=scores))
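A usage sketch of the extended `run_optimization` signature, combining the new callbacks. The directory and token budget are placeholders, and `config` is assumed to be a populated `Config` like the one sketched above:

```python
from promptolution.callbacks import CSVCallback, TokenCountCallback
from promptolution.helpers import run_optimization

# config: a Config instance as constructed in the previous sketch
callbacks = [
    # note the trailing slash: CSVCallback concatenates dir + "step_results.csv"
    CSVCallback(dir="results/run_01/"),
    TokenCountCallback(max_tokens_for_termination=1_000_000),
]

prompts = run_optimization(config, callbacks=callbacks)
```

Because callbacks can now return False, the token budget above ends the run early instead of merely logging.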
7 changes: 4 additions & 3 deletions promptolution/llms/api_llm.py
@@ -3,7 +3,7 @@
import asyncio
import time
from logging import INFO, Logger
from typing import List
from typing import Any, List

import nest_asyncio
import openai
@@ -63,7 +63,7 @@ class APILLM(BaseLLM):
get_response_async: Asynchronously get responses for a list of prompts.
"""

def __init__(self, model_id: str, token: str = None):
def __init__(self, model_id: str, token: str = None, **kwargs: Any):
"""Initialize the APILLM with a specific model.

Args:
@@ -73,14 +73,15 @@ def __init__(self, model_id: str, token: str = None):
Raises:
ValueError: If an unknown model identifier is provided.
"""
super().__init__()
if "claude" in model_id:
self.model = ChatAnthropic(model=model_id, api_key=token)
elif "gpt" in model_id:
self.model = ChatOpenAI(model=model_id, api_key=token)
else:
self.model = ChatDeepInfra(model_name=model_id, deepinfra_api_token=token)

def get_response(self, prompts: List[str]) -> List[str]:
def _get_response(self, prompts: List[str]) -> List[str]:
"""Get responses for a list of prompts in a synchronous manner.

This method includes retry logic for handling connection errors and rate limits.
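The rename of `get_response` to `_get_response` together with the added `super().__init__()` suggests that `BaseLLM.get_response` now wraps the subclass hook and accumulates token counts. A sketch of that pattern, under that assumption; the whitespace-based counting is a naive stand-in, not the library's implementation:

```python
from typing import List


class BaseLLMSketch:
    """Illustrative only: assumed shape of BaseLLM's wrapping behaviour."""

    def __init__(self):
        self.input_token_count = 0
        self.output_token_count = 0

    def get_response(self, prompts: List[str]) -> List[str]:
        responses = self._get_response(prompts)
        # naive whitespace count as a stand-in; the real class may use a tokenizer
        self.input_token_count += sum(len(p.split()) for p in prompts)
        self.output_token_count += sum(len(r.split()) for r in responses)
        return responses

    def get_token_count(self) -> dict:
        return {
            "input_tokens": self.input_token_count,
            "output_tokens": self.output_token_count,
            "total_tokens": self.input_token_count + self.output_token_count,
        }

    def _get_response(self, prompts: List[str]) -> List[str]:
        raise NotImplementedError
```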