2 changes: 2 additions & 0 deletions docker/api.Dockerfile
@@ -10,6 +10,8 @@ apt-get clean && \
apt-get autoremove && \
rm -rf /var/lib/apt/lists/* && \
pip install uv && \
+ ln -s $(which aclocal) /usr/local/bin/aclocal-1.16 && \
+ ln -s $(which automake) /usr/local/bin/automake-1.16 && \
uv sync

EXPOSE 8080
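The two new symlinks work around autotools' versioned wrapper names: when a native dependency is compiled during uv sync, its pre-generated build scripts may invoke aclocal-1.16 and automake-1.16 specifically, while the base image only ships the unversioned binaries. A quick way to confirm the links resolve inside the built image (the image tag here is hypothetical):

    docker build -f docker/api.Dockerfile -t api:dev .
    docker run --rm --entrypoint sh api:dev -c "aclocal-1.16 --version && automake-1.16 --version"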
1 change: 1 addition & 0 deletions docker/compose/docker-compose.deepseek-14b-gpu.yml
@@ -24,6 +24,7 @@ services:
--model deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
--gpu-memory-utilization 0.39
--max-model-len 10000
+ --max-num-batched-tokens 10000
--tensor-parallel-size 1
--uvicorn-log-level warning
environment:
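--max-num-batched-tokens bounds how many tokens the vLLM scheduler packs into a single batch; pinning it to the same value as --max-model-len keeps a full-length prompt schedulable and avoids relying on defaults that have shifted between vLLM releases. The same knobs exist on the offline Python API; a minimal sketch, assuming the vllm package is installed and the model is available locally:

    from vllm import LLM

    # mirrors the compose flags above (dashes become underscores in vLLM's EngineArgs)
    llm = LLM(
        model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        gpu_memory_utilization=0.39,
        max_model_len=10_000,
        max_num_batched_tokens=10_000,
        tensor_parallel_size=1,
    )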
1 change: 1 addition & 0 deletions docker/compose/docker-compose.dolphin-8b-gpu.yml
@@ -24,6 +24,7 @@ services:
--model cognitivecomputations/Dolphin3.0-Llama3.1-8B
--gpu-memory-utilization 0.21
--max-model-len 10000
+ --max-num-batched-tokens 10000
--tensor-parallel-size 1
--enable-auto-tool-choice
--tool-call-parser llama3_json
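Together, --enable-auto-tool-choice and --tool-call-parser llama3_json let the server translate the model's JSON tool-call output into OpenAI-style tool_calls, which is what tests/e2e/test_openai.py exercises further down. A minimal client-side sketch (endpoint and API key are placeholders):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")
    resp = client.chat.completions.create(
        model="cognitivecomputations/Dolphin3.0-Llama3.1-8B",
        messages=[{"role": "user", "content": "What is the weather in Paris?"}],
        tools=[{
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Get the current weather for a location",
                "parameters": {
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                    "required": ["location"],
                },
            },
        }],
    )
    print(resp.choices[0].message.tool_calls)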
4 changes: 2 additions & 2 deletions docker/compose/docker-compose.llama-1b-cpu.yml
@@ -10,8 +10,8 @@ services:
condition: service_healthy
command: >
--model meta-llama/Llama-3.2-1B-Instruct
- --gpu-memory-utilization 0.5
--max-model-len 30000
+ --max-num-batched-tokens 30000
--tensor-parallel-size 1
--enable-auto-tool-choice
--tool-call-parser llama3_json
@@ -23,7 +23,7 @@ services:
- ETCD_PORT=2379
- TOOL_SUPPORT=true
volumes:
- - hugging_face_models:/root/.cache/huggingface # cache models
+ - hugging_face_models:/root/.cache/huggingface
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s
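For context, this healthcheck is what dependent services key on: the compose files in this set gate the API container on vLLM becoming healthy via condition: service_healthy. A condensed sketch of the pattern (service names are illustrative):

    services:
      vllm:
        healthcheck:
          test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
          interval: 30s
      api:
        depends_on:
          vllm:
            condition: service_healthy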
3 changes: 3 additions & 0 deletions docker/compose/docker-compose.llama-1b-gpu.ci.yml
@@ -21,6 +21,9 @@ services:
condition: service_healthy
command: >
--model meta-llama/Llama-3.2-1B-Instruct
+ --gpu-memory-utilization 0.5
+ --max-model-len 30000
+ --max-num-batched-tokens 30000
--tensor-parallel-size 1
--enable-auto-tool-choice
--tool-call-parser llama3_json
1 change: 1 addition & 0 deletions docker/compose/docker-compose.llama-1b-gpu.yml
@@ -23,6 +23,7 @@ services:
--model meta-llama/Llama-3.2-1B-Instruct
--gpu-memory-utilization 0.5
--max-model-len 30000
+ --max-num-batched-tokens 30000
--tensor-parallel-size 1
--enable-auto-tool-choice
--tool-call-parser llama3_json
1 change: 1 addition & 0 deletions docker/compose/docker-compose.llama-3b-gpu.yml
@@ -22,6 +22,7 @@ services:
--model meta-llama/Llama-3.2-3B-Instruct
--gpu-memory-utilization 0.5
--max-model-len 30000
+ --max-num-batched-tokens 30000
--tensor-parallel-size 1
--enable-auto-tool-choice
--tool-call-parser llama3_json
1 change: 1 addition & 0 deletions docker/compose/docker-compose.llama-70b-gpu.yml
@@ -22,6 +22,7 @@ services:
--model hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
--gpu-memory-utilization 0.95
--max-model-len 60000
+ --max-num-batched-tokens 60000
--tensor-parallel-size 1
--enable-auto-tool-choice
--tool-call-parser llama3_json
1 change: 1 addition & 0 deletions docker/compose/docker-compose.llama-8b-gpu.yml
@@ -22,6 +22,7 @@ services:
--model meta-llama/Llama-3.1-8B-Instruct
--gpu-memory-utilization 0.21
--max-model-len 10000
+ --max-num-batched-tokens 10000
--tensor-parallel-size 1
--enable-auto-tool-choice
--tool-call-parser llama3_json
2 changes: 1 addition & 1 deletion docker/vllm.Dockerfile
@@ -1,4 +1,4 @@
- FROM vllm/vllm-openai:v0.7.3
+ FROM vllm/vllm-openai:v0.10.1

# # Specify model name and path during build
# ARG MODEL_NAME=llama_1b_cpu
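The base-image bump from v0.7.3 to v0.10.1 spans several minor releases of vLLM whose scheduler defaults (including the batched-token budget) changed along the way, which is consistent with the compose files above now pinning --max-num-batched-tokens explicitly. A quick smoke test of the rebuilt image (the tag is hypothetical):

    docker build -f docker/vllm.Dockerfile -t vllm-openai:local .
    docker run --rm --entrypoint python3 vllm-openai:local -c "import vllm; print(vllm.__version__)"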
14 changes: 8 additions & 6 deletions tests/e2e/test_openai.py
@@ -328,14 +328,16 @@ def test_function_calling(client, model):

assert len(tool_calls) > 0, f"Tool calls array is empty for {model}"

# Validate the first tool call
first_call = tool_calls[0]
- assert first_call.function.name == "get_weather", (
-     "Function name should be get_weather"
- )
+ first_call_dict = first_call.model_dump()

+ function_name = first_call_dict["function"]["name"]
+ function_args = first_call_dict["function"]["arguments"]

+ assert function_name == "get_weather", "Function name should be get_weather"

# Parse arguments and check for location
- args = json.loads(first_call.function.arguments)
+ args = json.loads(function_args)
assert "location" in args, "Arguments should contain location"
assert "paris" in args["location"].lower(), "Location should be Paris"

@@ -363,7 +365,7 @@ def test_function_calling(client, model):
"type": "function",
"function": {
"name": "get_weather",
"arguments": first_call.function.arguments,
"arguments": function_args,
},
}
],