v1.2.0 (#31)

mo374z · finitearth · timo282 · web-flow · commit 0eb5409df6f0 · 2025-03-06T13:48:59.000+01:00
* Add vllm as feature and a llm_test_run_script

* small fixes in vllm class

* differentiate between vllm and api inference

* add base llm super class

* add changes from PR review

* change some VLLM params

* add batching to vllm

* Add release notes and increase version number

* change system prompt

---------
Co-authored-by: Tom Zehle <t.zehle@gmail.com>
Co-authored-by: Timo Heiß <ti-heiss@t-online.de>
diff --git a/.flake8 b/.flake8
@@ -1,3 +1,3 @@
 [flake8]
 max-line-length = 120
-ignore = F401, W503
+ignore = E731,E231,E203,E501,F401,W503
diff --git a/.gitignore b/.gitignore
@@ -6,4 +6,5 @@ rsync_exclude.txt
 __pycache__/
 temp/
 dist/
+outputs/
 poetry.lock
diff --git a/docs/release-notes.md b/docs/release-notes.md
@@ -1,10 +1,19 @@
 # Release Notes
 
+## Release v1.2.0
+### What's changed
+#### Added features
+* New LLM wrapper: VLLM for local inference with batches
+
+**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.1.1...v1.2.0)
+
 ## Release v1.1.1
 ### What's Changed
 #### Further Changes:
 - deleted poetry.lock
-- updated transformers dependency: bumped from 4.46.3 to 4.48.0 
+- updated transformers dependency: bumped from 4.46.3 to 4.48.0
+
+**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.1.0...v1.1.1)
 
 ## Release v1.1.0
 ### What's changed
@@ -16,6 +25,8 @@
 * improved opros meta-prompt
 * added support for python versions from 3.9 onwards (previously 3.11)
 
+**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.0.1...v1.1.0)
+
 ## Release v1.0.1
 ### What's changed
 #### Added features
@@ -24,6 +35,8 @@
 #### Further Changes:
 * fixed release notes
 
+**Full Changelog**: [here](https://github.com/finitearth/promptolution/compare/v1.0.0...v1.0.1)
+
 ## Release v1.0.0
 ### What's changed
 #### Added Features:
diff --git a/promptolution/llms/__init__.py b/promptolution/llms/__init__.py
@@ -3,20 +3,23 @@
 from .api_llm import APILLM
 from .base_llm import DummyLLM
 from .local_llm import LocalLLM
+from .vllm import VLLM
 
 
 def get_llm(model_id: str, *args, **kwargs):
     """Factory function to create and return a language model instance based on the provided model_id.
 
     This function supports three types of language models:
     1. DummyLLM: A mock LLM for testing purposes.
-    2. LocalLLM: For running models locally (identified by 'local' in the model_id).
-    3. APILLM: For API-based models (default if not matching other types).
+    2. LocalLLM: For running models locally.
+    3. VLLM: For running models using the vLLM library.
+    4. APILLM: For API-based models (default if not matching other types).
 
     Args:
         model_id (str): Identifier for the model to use. Special cases:
                         - "dummy" for DummyLLM
                         - "local-{model_name}" for LocalLLM
+                        - "vllm-{model_name}" for VLLM
                         - Any other string for APILLM
         *args: Variable length argument list passed to the LLM constructor.
         **kwargs: Arbitrary keyword arguments passed to the LLM constructor.
@@ -29,4 +32,7 @@ def get_llm(model_id: str, *args, **kwargs):
     if "local" in model_id:
         model_id = "-".join(model_id.split("-")[1:])
         return LocalLLM(model_id, *args, **kwargs)
+    if "vllm" in model_id:
+        model_id = "-".join(model_id.split("-")[1:])
+        return VLLM(model_id, *args, **kwargs)
     return APILLM(model_id, *args, **kwargs)
diff --git a/promptolution/llms/api_llm.py b/promptolution/llms/api_llm.py
@@ -13,6 +13,8 @@
 from langchain_core.messages import HumanMessage
 from langchain_openai import ChatOpenAI
 
+from promptolution.llms.base_llm import BaseLLM
+
 logger = Logger(__name__)
 logger.setLevel(INFO)
 
@@ -46,7 +48,7 @@ async def invoke_model(prompt, model, semaphore):
                 await asyncio.sleep(delay)
 
 
-class APILLM:
+class APILLM(BaseLLM):
     """A class to interface with various language models through their respective APIs.
 
     This class supports Claude (Anthropic), GPT (OpenAI), and LLaMA (DeepInfra) models.
diff --git a/promptolution/llms/local_llm.py b/promptolution/llms/local_llm.py
@@ -8,8 +8,10 @@
     logger = logging.getLogger(__name__)
     logger.warning(f"Could not import torch or transformers in local_llm.py: {e}")
 
+from promptolution.llms.base_llm import BaseLLM
 
-class LocalLLM:
+
+class LocalLLM(BaseLLM):
     """A class for running language models locally using the Hugging Face Transformers library.
 
     This class sets up a text generation pipeline with specified model parameters
diff --git a/promptolution/llms/vllm.py b/promptolution/llms/vllm.py
@@ -0,0 +1,135 @@
+"""Module for running language models locally using the vLLM library."""
+
+
+from logging import INFO, Logger
+
+try:
+    import torch
+    from transformers import AutoTokenizer
+    from vllm import LLM, SamplingParams
+except ImportError as e:
+    import logging
+
+    logger = logging.getLogger(__name__)
+    logger.warning(f"Could not import vllm, torch or transformers in vllm.py: {e}")
+
+from promptolution.llms.base_llm import BaseLLM
+
+logger = Logger(__name__)
+logger.setLevel(INFO)
+
+
+class VLLM(BaseLLM):
+    """A class for running language models using the vLLM library.
+
+    This class sets up a vLLM inference engine with specified model parameters
+    and provides a method to generate responses for given prompts.
+
+    Attributes:
+        llm (vllm.LLM): The vLLM inference engine.
+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
+        sampling_params (vllm.SamplingParams): Parameters for text generation.
+
+    Methods:
+        get_response: Generate responses for a list of prompts.
+    """
+
+    def __init__(
+        self,
+        model_id: str,
+        batch_size: int = 64,
+        max_generated_tokens: int = 256,
+        temperature: float = 0.1,
+        top_p: float = 0.9,
+        model_storage_path: str = None,
+        token: str = None,
+        dtype: str = "auto",
+        tensor_parallel_size: int = 1,
+        gpu_memory_utilization: float = 0.95,
+        max_model_len: int = 2048,
+        trust_remote_code: bool = False,
+    ):
+        """Initialize the VLLM with a specific model.
+
+        Args:
+            model_id (str): The identifier of the model to use.
+            batch_size (int, optional): The batch size for text generation. Defaults to 64.
+            max_generated_tokens (int, optional): Maximum number of tokens to generate. Defaults to 256.
+            temperature (float, optional): Sampling temperature. Defaults to 0.1.
+            top_p (float, optional): Top-p sampling parameter. Defaults to 0.9.
+            model_storage_path (str, optional): Directory to store the model. Defaults to None.
+            token (str, optional): Token for accessing the model - not used in implementation yet.
+            dtype (str, optional): Data type for model weights. Defaults to "auto".
+            tensor_parallel_size (int, optional): Number of GPUs for tensor parallelism. Defaults to 1.
+            gpu_memory_utilization (float, optional): Fraction of GPU memory to use. Defaults to 0.95.
+            max_model_len (int, optional): Maximum sequence length for the model. Defaults to 2048.
+            trust_remote_code (bool, optional): Whether to trust remote code. Defaults to False.
+
+        Note:
+            This method sets up a vLLM engine with specified parameters for efficient inference.
+        """
+        self.batch_size = batch_size
+        self.dtype = dtype
+        self.tensor_parallel_size = tensor_parallel_size
+        self.gpu_memory_utilization = gpu_memory_utilization
+        self.max_model_len = max_model_len
+        self.trust_remote_code = trust_remote_code
+
+        # Configure sampling parameters
+        self.sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_generated_tokens)
+
+        # Initialize the vLLM engine
+        self.llm = LLM(
+            model=model_id,
+            tokenizer=model_id,
+            dtype=self.dtype,
+            tensor_parallel_size=self.tensor_parallel_size,
+            gpu_memory_utilization=self.gpu_memory_utilization,
+            max_model_len=self.max_model_len,
+            download_dir=model_storage_path,
+            trust_remote_code=self.trust_remote_code,
+        )
+
+        # Initialize tokenizer separately for potential pre-processing
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+    def get_response(self, inputs: list[str]):
+        """Generate responses for a list of prompts using the vLLM engine.
+
+        Args:
+            inputs (list[str]): A list of input prompts.
+
+        Returns:
+            list[str]: A list of generated responses corresponding to the input prompts.
+
+        Note:
+            This method uses vLLM's batched generation capabilities for efficient inference.
+        """
+        prompts = [
+            self.tokenizer.apply_chat_template(
+                [
+                    {
+                        "role": "system",
+                        "content": "You are a helpful assistant.",
+                    },
+                    {"role": "user", "content": input},
+                ],
+                tokenize=False,
+            )
+            for input in inputs
+        ]
+
+        # generate responses for self.batch_size prompts at the same time
+        all_responses = []
+        for i in range(0, len(prompts), self.batch_size):
+            batch = prompts[i : i + self.batch_size]
+            outputs = self.llm.generate(batch, self.sampling_params)
+            responses = [output.outputs[0].text for output in outputs]
+            all_responses.extend(responses)
+
+        return all_responses
+
+    def __del__(self):
+        """Cleanup method to delete the LLM instance and free up GPU memory."""
+        del self.llm
+        torch.cuda.empty_cache()
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "promptolution"
-version = "1.1.1"
+version = "1.2.0"
 description = ""
 authors = ["Tom Zehle, Moritz Schlager, Timo Heiß"]
 readme = "README.md"
@@ -15,6 +15,7 @@ langchain-community = "^0.2.12"
 pandas = "^2.2.2"
 tqdm = "^4.66.5"
 scikit-learn = "^1.5.2"
+vllm = "^0.7.3"
 
 [tool.poetry.group.dev.dependencies]
 matplotlib = "^3.9.2"