
Commit 75a7c7a

#191: Add simple ollama chat example (#336)
1 parent 638bf75 commit 75a7c7a

File tree: 6 files changed, +157 −0 lines changed

examples/ollama-chat/.gitignore

Lines changed: 1 addition & 0 deletions

```
volumes/*
```

examples/ollama-chat/Cargo.toml

Lines changed: 10 additions & 0 deletions

```toml
[package]
name = "ollama-chat"
version = "0.1.0"
edition = "2021"
publish = false

[dependencies]
async-openai = {path = "../../async-openai"}
serde_json = "1.0.135"
tokio = { version = "1.43.0", features = ["full"] }
```

examples/ollama-chat/README.md

Lines changed: 38 additions & 0 deletions

## Setup

A docker compose file is provided to run a dockerized version of Ollama and download a default model. The Ollama container must be up and running _before_ you can run the Rust example code.

You can check the container status with `docker ps` or follow the container's logs with `docker container logs {CONTAINER NAME} -f`, e.g. `docker container logs ollama -f`.

## Running the Example

```sh
# Bring ollama up with the model and wait for it to be healthy.
docker compose up -d

# Once the model is downloaded and Ollama is up, run the Rust code.
cargo run
```

## Docker Notes

- Since Ollama requires you to pull a model before first use, a custom entrypoint script is used. See this [Stack Overflow discussion](https://stackoverflow.com/a/78501628).
- The model is cached in the `volumes/` directory.
- Depending on your network connection, the healthcheck may need to be adjusted to allow more time for the model to download.
- [llama3.2:1b](https://ollama.com/library/llama3.2:1b) is used in the example because it is a smaller model and downloads more quickly than larger models.
  - A larger model will provide better responses but will be slower to download.
  - With the default CPU inference, smaller models also achieve better tokens-per-second performance.
- The GPU mapping is written but commented out, so the example defaults to CPU inference, which is slower but should run without any additional setup.
  - If you have a GPU and the proper container support, feel free to uncomment and adapt it.

## Ollama OpenAI Compatibility

**NOTE: an API key parameter is used for compatibility with OpenAI's API spec, but it is ignored by Ollama (it can be any value).**

See the [Ollama OpenAI Compatibility docs](https://github.com/ollama/ollama/blob/main/docs/openai.md) for more details on what Ollama supports.

## Response

> Response:
>
> 0: Role: assistant Content: Some("The 2020 World Series was played at Globe Life Field in Arlington, Texas, as part of Major League Baseball's (MLB) move to play its season without spectators due to the COVID-19 pandemic. The Dodgers defeated the Tampa Bay Rays in 6 games.")
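
As the README notes, Ollama ignores the API key, so any placeholder value works against its OpenAI-compatible endpoint. The sketch below is not one of the files in this commit; it shows one way to confirm the endpoint is reachable and the model has been pulled before running the chat example, assuming async-openai's `models().list()` helper and Ollama's `/v1/models` route behave as described in the compatibility docs linked above.

```rust
use async_openai::{config::OpenAIConfig, Client};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Same base URL and placeholder API key as src/main.rs below;
    // Ollama accepts any key value.
    let client = Client::with_config(
        OpenAIConfig::new()
            .with_api_key("ollama")
            .with_api_base("http://localhost:11434/v1"),
    );

    // Ollama exposes pulled models through the OpenAI-compatible
    // GET /v1/models route, so llama3.2:1b should appear here once
    // the container's healthcheck passes.
    let models = client.models().list().await?;
    for model in models.data {
        println!("available: {}", model.id);
    }

    Ok(())
}
```

If the list comes back empty, the entrypoint script in the compose setup below has likely not finished pulling the model yet.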
examples/ollama-chat/docker-compose.yml

Lines changed: 27 additions & 0 deletions

```yaml
services:
  ollama:
    container_name: ollama
    image: ollama/ollama:0.5.12
    entrypoint: ["/usr/bin/bash", "/ollama_entrypoint.sh"]
    environment:
      MODEL: "llama3.2:1b"
    volumes:
      - ./volumes/ollama:/root/.ollama
      - ./ollama_entrypoint.sh:/ollama_entrypoint.sh
    restart: unless-stopped
    ports:
      - "11434:11434"
    healthcheck:
      test: ["CMD", "bash", "-c", "ollama list | grep -q llama3.2:1b"]
      interval: 15s
      retries: 30
      start_period: 5s
      timeout: 5s
    # Uncomment if you have NVIDIA container toolkit, CUDA, etc.
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - capabilities: [gpu]
    #           driver: nvidia
    #           count: all
```
examples/ollama-chat/ollama_entrypoint.sh

Lines changed: 16 additions & 0 deletions

```bash
#!/usr/bin/env bash

# Start Ollama in the background.
/bin/ollama serve &
# Record the process ID.
pid=$!

# Pause for Ollama to start.
sleep 5

echo "Retrieving model $MODEL..."
ollama pull $MODEL
echo "Done!"

# Wait for the Ollama process to finish.
wait $pid
```
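
The entrypoint script only pulls the model after the server starts, and the compose healthcheck above greps `ollama list` until that pull completes. If you would rather wait from the Rust side, here is a rough sketch under the same assumptions as the earlier snippet (async-openai's models API against Ollama's `/v1/models`); the 15-second interval and attempt count simply mirror the healthcheck values rather than anything required by Ollama.

```rust
use std::time::Duration;

use async_openai::{config::OpenAIConfig, Client};

/// Poll Ollama's OpenAI-compatible model list until the target model shows
/// up, mirroring the compose healthcheck's `ollama list | grep -q ...`.
async fn wait_for_model(
    client: &Client<OpenAIConfig>,
    model: &str,
    attempts: u32,
) -> Result<(), Box<dyn std::error::Error>> {
    for _ in 0..attempts {
        if let Ok(models) = client.models().list().await {
            if models.data.iter().any(|m| m.id == model) {
                return Ok(());
            }
        }
        // Same cadence as the compose healthcheck interval.
        tokio::time::sleep(Duration::from_secs(15)).await;
    }
    Err(format!("model {model} was not available after {attempts} attempts").into())
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let client = Client::with_config(
        OpenAIConfig::new()
            .with_api_key("ollama")
            .with_api_base("http://localhost:11434/v1"),
    );
    wait_for_model(&client, "llama3.2:1b", 30).await?;
    println!("llama3.2:1b is ready");
    Ok(())
}
```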

examples/ollama-chat/src/main.rs

Lines changed: 65 additions & 0 deletions

```rust
use std::error::Error;

use async_openai::{
    config::OpenAIConfig,
    types::{
        ChatCompletionRequestAssistantMessageArgs, ChatCompletionRequestSystemMessageArgs,
        ChatCompletionRequestUserMessageArgs, CreateChatCompletionRequestArgs,
    },
    Client,
};

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    // This is the default host:port for Ollama's OpenAI endpoint.
    // Should match the config in docker-compose.yml.
    let api_base = "http://localhost:11434/v1";
    // Required but ignored
    let api_key = "ollama";

    let client = Client::with_config(
        OpenAIConfig::new()
            .with_api_key(api_key)
            .with_api_base(api_base),
    );

    // This should match whatever model is downloaded in Ollama docker container.
    let model = "llama3.2:1b";

    let request = CreateChatCompletionRequestArgs::default()
        .max_tokens(512u32)
        .model(model)
        .messages([
            ChatCompletionRequestSystemMessageArgs::default()
                .content("You are a helpful assistant.")
                .build()?
                .into(),
            ChatCompletionRequestUserMessageArgs::default()
                .content("Who won the world series in 2020?")
                .build()?
                .into(),
            ChatCompletionRequestAssistantMessageArgs::default()
                .content("The Los Angeles Dodgers won the World Series in 2020.")
                .build()?
                .into(),
            ChatCompletionRequestUserMessageArgs::default()
                .content("Where was it played?")
                .build()?
                .into(),
        ])
        .build()?;

    println!("{}", serde_json::to_string(&request).unwrap());

    let response = client.chat().create(request).await?;

    println!("\nResponse:\n");
    for choice in response.choices {
        println!(
            "{}: Role: {} Content: {:?}",
            choice.index, choice.message.role, choice.message.content
        );
    }

    Ok(())
}
```
