Commit b698b58

feat: add anthropic support and make vllm optional

1 parent e49ab16

7 files changed: +159 -36 lines

README.md (+41 -29)

````diff
@@ -9,14 +9,18 @@
 ## 🚀 Installation
 
 ```bash
-pip install repoqa
+# without vLLM (can run openai, anthropic, and huggingface backends)
+pip install --upgrade repoqa
+# with vLLM
+pip install --upgrade "repoqa[vllm]"
 ```
 
 <details><summary>⏬ Install nightly version <i>:: click to expand ::</i></summary>
 <div>
 
 ```bash
-pip install "git+https://github.com/evalplus/repoqa.git" --upgrade
+pip install --upgrade "git+https://github.com/evalplus/repoqa.git" # without vLLM
+pip install --upgrade "repoqa[vllm] @ git+https://github.com/evalplus/repoqa@main" # with vLLM
 ```
 
 </div>
@@ -35,46 +39,55 @@ pip install -r requirements.txt
 </div>
 </details>
 
-
 ## 🏁 Search Needle Function
 
-### Inference with vLLM
+### Inference with OpenAI Compatible Servers
 
 ```bash
-repoqa.search_needle_function --model "Qwen/CodeQwen1.5-7B-Chat" --caching --backend vllm
+repoqa.search_needle_function --model "gpt4-turbo" --caching --backend openai
+# 💡 If you use a customized server such as vLLM:
+# repoqa.search_needle_function --base-url "http://url.to.vllm.server/v1" \
+#   --model "gpt4-turbo" --caching --backend openai
 ```
 
-### Inference with OpenAI Compatible Servers
+### Inference with Anthropic Compatible Servers
+
+```bash
+repoqa.search_needle_function --model "claude-3-haiku-20240307" --caching --backend anthropic
+```
+
+### Inference with vLLM
 
 ```bash
-repoqa.search_needle_function --base-url "http://api.openai.com/v1" \
-    --model "gpt4-turbo" --caching --backend openai
+repoqa.search_needle_function --model "Qwen/CodeQwen1.5-7B-Chat" \
+    --caching --backend vllm
 ```
 
 ### Inference with HuggingFace transformers
 
 ```bash
-repoqa.search_needle_function --model "gpt2" --caching --backend hf
+repoqa.search_needle_function --model "Qwen/CodeQwen1.5-7B-Chat" \
+    --caching --backend hf --trust-remote-code
 ```
 
 ### Usage
 
 > [!Tip]
 >
-> * **Input**:
->   * `--model`: Hugging-Face model ID, such as `ise-uiuc/Magicoder-S-DS-6.7B`
->   * `--backend`: `vllm` (default) or `openai`
->   * `--base-url`: OpenAI API base URL
->   * `--code-context-size` (default: 16384): Number of tokens (using the DeepSeekCoder tokenizer) of code in the long context
->   * `--caching` (default: False): if enabled, the tokenization and chunking results are cached to accelerate subsequent runs
->   * `--max-new-tokens` (default: 1024): Maximum number of new tokens to generate
->   * `--system-message` (default: None): if given, the model uses a system message (but note some models don't support system messages)
->   * `--tensor-parallel-size`: Degree of tensor parallelism (only for vLLM)
->   * `--languages` (default: None): List of languages to evaluate (None means all)
->   * `--result-dir` (default: "results"): Directory to save the model outputs and evaluation results
-> * **Output**:
->   * `results/ntoken_{code-context-size}/{model}.jsonl`: Model-generated outputs
->   * `results/ntoken_{code-context-size}/{model}-SCORE.json`: Evaluation scores (also see [Compute Scores](#compute-scores))
+> - **Input**:
+>   - `--model`: Hugging-Face model ID, such as `ise-uiuc/Magicoder-S-DS-6.7B`
+>   - `--backend`: `vllm` (default), `openai`, `anthropic`, or `hf`
+>   - `--base-url`: OpenAI API base URL
+>   - `--code-context-size` (default: 16384): Number of tokens (using the DeepSeekCoder tokenizer) of code in the long context
+>   - `--caching` (default: False): if enabled, the tokenization and chunking results are cached to accelerate subsequent runs
+>   - `--max-new-tokens` (default: 1024): Maximum number of new tokens to generate
+>   - `--system-message` (default: None): if given, the model uses a system message (but note some models don't support system messages)
+>   - `--tensor-parallel-size`: Degree of tensor parallelism (only for vLLM)
+>   - `--languages` (default: None): List of languages to evaluate (None means all)
+>   - `--result-dir` (default: "results"): Directory to save the model outputs and evaluation results
+> - **Output**:
+>   - `results/ntoken_{code-context-size}/{model}.jsonl`: Model-generated outputs
+>   - `results/ntoken_{code-context-size}/{model}-SCORE.json`: Evaluation scores (also see [Compute Scores](#compute-scores))
 
 ### Compute Scores
 
@@ -87,12 +100,11 @@ repoqa.compute_score --model-output-path={model-output}.jsonl
 
 > [!Tip]
 >
-> * **Input**: Path to the model-generated outputs.
-> * **Output**: The evaluation scores will be stored in `{model-output}-SCORES.json`
-
+> - **Input**: Path to the model-generated outputs.
+> - **Output**: The evaluation scores will be stored in `{model-output}-SCORES.json`
 
 ## 📚 Read More
 
-* [RepoQA Homepage](https://evalplus.github.io/repoqa.html)
-* [RepoQA Dataset Curation](docs/curate_dataset.md)
-* [RepoQA Development Notes](docs/dev_note.md)
+- [RepoQA Homepage](https://evalplus.github.io/repoqa.html)
+- [RepoQA Dataset Curation](docs/curate_dataset.md)
+- [RepoQA Development Notes](docs/dev_note.md)
````

repoqa/provider/__init__.py (+0 -3)

```diff
@@ -3,6 +3,3 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from repoqa.provider.base import BaseProvider
-from repoqa.provider.hf import HfProvider
-from repoqa.provider.openai import OpenAIProvider
-from repoqa.provider.vllm import VllmProvider
```
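
Dropping these re-exports is what actually makes vLLM optional: previously, `import repoqa.provider` eagerly imported `repoqa.provider.vllm`, which in turn imports `vllm` and fails when that extra is not installed. Callers now import each provider from its submodule on demand (see `search_needle_function.py` below). A minimal sketch of the idea, with a hypothetical `pick_provider` helper used purely for illustration:

```python
# Illustrative sketch (not repo code): defer the heavy import until the
# backend is actually chosen, so plain `import repoqa.provider` stays cheap.
def pick_provider(backend: str, model: str):
    if backend == "vllm":
        # Only users who asked for vLLM pay the `import vllm` cost,
        # and only they need it installed.
        from repoqa.provider.vllm import VllmProvider

        return VllmProvider(model)
    from repoqa.provider.openai import OpenAIProvider

    return OpenAIProvider(model)
```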

repoqa/provider/anthropic.py (new file, +35)

```python
# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
#
# SPDX-License-Identifier: Apache-2.0

import os
from typing import List

from anthropic import Client

from repoqa.provider.base import BaseProvider
from repoqa.provider.request.anthropic import make_auto_request


class AnthropicProvider(BaseProvider):
    def __init__(self, model):
        self.model = model
        self.client = Client(api_key=os.getenv("ANTHROPIC_KEY"))

    def generate_reply(
        self, question, n=1, max_tokens=1024, temperature=0, system_msg=None
    ) -> List[str]:
        assert temperature != 0 or n == 1, "n must be 1 when temperature is 0"
        replies = []
        for _ in range(n):
            reply = make_auto_request(
                self.client,
                message=question,
                model=self.model,
                temperature=temperature,
                max_tokens=max_tokens,
                system_msg=system_msg,
            )
            replies.append(reply.content[0].text)

        return replies
```
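
Note that the provider reads its credential from the `ANTHROPIC_KEY` environment variable rather than the `ANTHROPIC_API_KEY` name the `anthropic` SDK looks up on its own. A minimal usage sketch, assuming the `anthropic` package is installed and a valid key is set (the key value below is a placeholder):

```python
import os

from repoqa.provider.anthropic import AnthropicProvider

# Placeholder credential; the provider reads ANTHROPIC_KEY in __init__ above.
os.environ["ANTHROPIC_KEY"] = "sk-ant-..."

engine = AnthropicProvider("claude-3-haiku-20240307")
# With the default temperature=0, the assert in generate_reply requires n=1.
replies = engine.generate_reply("What does a needle function test?", n=1)
print(replies[0])
```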

repoqa/provider/request/anthropic.py (new file, +71)

```python
# SPDX-FileCopyrightText: (c) 2024 EvalPlus Team
#
# SPDX-License-Identifier: Apache-2.0

import signal
import time

import anthropic
from anthropic.types import Message

from repoqa.provider.request import construct_message_list


def make_request(
    client: anthropic.Client,
    message: str,
    model: str,
    max_tokens: int = 512,
    temperature: float = 1,
    system_msg="You are a helpful assistant good at coding.",
    **kwargs,
) -> Message:
    return client.messages.create(
        model=model,
        messages=construct_message_list(message, system_message=system_msg),
        max_tokens=max_tokens,
        temperature=temperature,
        **kwargs,
    )


def handler(signum, frame):
    # swallow signum and frame
    raise Exception("end of time")


def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message:
    ret = None
    while ret is None:
        try:
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(100)
            ret = make_request(client, *args, **kwargs)
            signal.alarm(0)
        except anthropic.RateLimitError:
            print("Rate limit exceeded. Waiting...")
            signal.alarm(0)
            time.sleep(5)
        except anthropic.APIConnectionError:
            print("API connection error. Waiting...")
            signal.alarm(0)
            time.sleep(5)
        except anthropic.InternalServerError:
            print("Internal server error. Waiting...")
            signal.alarm(0)
            time.sleep(5)
        except anthropic.APIError as e:
            print("Unknown API error")
            print(e)
            if (
                e.body["error"]["message"]
                == "Output blocked by content filtering policy"
            ):
                raise Exception("Content filtering policy blocked output")
            signal.alarm(0)
        except Exception as e:
            print("Unknown error. Waiting...")
            print(e)
            signal.alarm(0)
            time.sleep(1)
    return ret
```
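
`make_auto_request` pairs a `SIGALRM` watchdog with per-exception backoff: the alarm turns a hung request into an exception after 100 seconds, and every handler path disarms the alarm before sleeping and retrying. A self-contained sketch of just that timeout-and-retry skeleton (the `flaky_call` argument and the timeout value are illustrative, not repo code); note `SIGALRM` is Unix-only, so this pattern does not work on Windows:

```python
import signal
import time


def _on_alarm(signum, frame):
    # Fired by the watchdog; turns a hung call into a retryable exception.
    raise TimeoutError("attempt exceeded its time budget")


def call_with_retry(flaky_call, timeout_s=100, backoff_s=5):
    """Retry `flaky_call` until it returns, bounding each attempt's runtime."""
    result = None
    while result is None:
        try:
            signal.signal(signal.SIGALRM, _on_alarm)
            signal.alarm(timeout_s)  # arm the watchdog for this attempt
            result = flaky_call()
            signal.alarm(0)  # success: disarm
        except Exception as exc:
            signal.alarm(0)  # always disarm before backing off
            print(f"retrying after error: {exc}")
            time.sleep(backoff_s)
    return result
```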

repoqa/search_needle_function.py (+7 -3)

```diff
@@ -302,20 +302,24 @@ def evaluate_model(
         return
 
     if backend == "openai":
-        from repoqa.provider import OpenAIProvider
+        from repoqa.provider.openai import OpenAIProvider
 
         engine = OpenAIProvider(model, base_url=base_url)
     elif backend == "vllm":
-        from repoqa.provider import VllmProvider
+        from repoqa.provider.vllm import VllmProvider
 
         engine = VllmProvider(
             model,
             tensor_parallel_size=tensor_parallel_size,
             max_model_len=int(code_context_size * 1.25),  # Magic number
             trust_remote_code=trust_remote_code,
         )
+    elif backend == "anthropic":
+        from repoqa.provider.anthropic import AnthropicProvider
+
+        engine = AnthropicProvider(model)
     elif backend == "hf":
-        from repoqa.provider import HfProvider
+        from repoqa.provider.hf import HfProvider
 
         engine = HfProvider(model, trust_remote_code=trust_remote_code)
 
```
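
Because each branch imports its provider lazily, `--backend anthropic` only needs the `anthropic` package, while `--backend vllm` raises `ImportError` unless the `repoqa[vllm]` extra is installed. A hypothetical guard a caller could wrap around the vLLM branch to surface friendlier install guidance (illustrative, not part of this commit):

```python
# Illustrative only: translate the lazy-import failure into install guidance.
try:
    from repoqa.provider.vllm import VllmProvider
except ImportError as exc:
    raise SystemExit(
        'backend "vllm" requested but vllm is not installed; '
        'run: pip install --upgrade "repoqa[vllm]"'
    ) from exc
```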

requirements.txt (+1 -0)

```diff
@@ -6,3 +6,4 @@ rich
 vllm
 numpy
 tree_sitter_languages
+anthropic
```

setup.cfg (+4 -1)

```diff
@@ -22,11 +22,14 @@ install_requires =
     openai>=1.23.2
     nltk>=3.8.1
     rich>=13.5.2
-    vllm>=0.3.3
    tree_sitter_languages>=1.10.2
     numpy>=1.25.2
+    anthropic>=0.25.6
 
 [options.entry_points]
 console_scripts =
     repoqa.search_needle_function = repoqa.search_needle_function:main
     repoqa.compute_score = repoqa.compute_score:main
+
+[options.extras_require]
+vllm = vllm>=0.3.3
```
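
With `vllm` demoted from a hard requirement to an `[options.extras_require]` extra, code that wants to adapt to either installation can probe for the package without importing it. A small standard-library sketch (the default-backend rule is hypothetical, for illustration only):

```python
import importlib.util

# find_spec locates the package without importing it, so probing for the
# optional extra stays cheap even when vllm is installed.
HAS_VLLM = importlib.util.find_spec("vllm") is not None

# Hypothetical policy: prefer local vLLM inference when available.
DEFAULT_BACKEND = "vllm" if HAS_VLLM else "openai"
print(f"default backend: {DEFAULT_BACKEND}")
```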
