
Commit 75a7c7a

#191: Add simple ollama chat example (#336)
1 parent 638bf75 commit 75a7c7a

File tree: 6 files changed, +157 −0 lines changed

examples/ollama-chat/.gitignore

Lines changed: 1 addition & 0 deletions

```
volumes/*
```

examples/ollama-chat/Cargo.toml

Lines changed: 10 additions & 0 deletions

```toml
[package]
name = "ollama-chat"
version = "0.1.0"
edition = "2021"
publish = false

[dependencies]
async-openai = {path = "../../async-openai"}
serde_json = "1.0.135"
tokio = { version = "1.43.0", features = ["full"] }
```

examples/ollama-chat/README.md

Lines changed: 38 additions & 0 deletions

## Setup

A docker compose file is provided to run a dockerized version of Ollama and download a default model. The Ollama container must be up and running _before_ you can run the Rust example code.

You can check the container status with `docker ps` or follow the container's logs with `docker container logs {CONTAINER NAME} -f`, e.g. `docker container logs ollama -f`.

## Running the Example

```sh
# Bring ollama up with the model and wait for it to be healthy.
docker compose up -d

# Once the model is downloaded and Ollama is up, run the Rust code.
cargo run
```

## Docker Notes

- Since Ollama requires you to pull a model before first use, a custom entrypoint script is used. See this [Stack Overflow discussion](https://stackoverflow.com/a/78501628).
- The model is cached in the `volumes/` directory.
- Depending on your network connection, the healthcheck may need to be adjusted to allow more time for the model to download.
- [llama3.2:1b](https://ollama.com/library/llama3.2:1b) is used in the example because it is a smaller model and downloads more quickly than larger models.
  - A larger model will provide better responses but will be slower to download.
  - With the default CPU inference, smaller models also achieve better tokens-per-second performance.
- The GPU mapping is written but commented out, so the example defaults to CPU inference, which is slower but should run without any additional setup.
  - If you have a GPU and the proper container support, feel free to uncomment and adapt it.

## Ollama OpenAI Compatibility

**NOTE: an API key parameter is used for compatibility with OpenAI's API spec, but it is ignored by Ollama (it can be any value).**

See the [Ollama OpenAI Compatibility docs](https://github.com/ollama/ollama/blob/main/docs/openai.md) for more details on what Ollama supports.

## Response

> Response:
>
> 0: Role: assistant Content: Some("The 2020 World Series was played at Globe Life Field in Arlington, Texas, as part of Major League Baseball's (MLB) move to play its season without spectators due to the COVID-19 pandemic. The Dodgers defeated the Tampa Bay Rays in 6 games.")
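
As the README notes, Ollama ignores the API key, so any placeholder value works against its OpenAI-compatible endpoint. The sketch below is not one of the files in this commit; it shows one way to confirm the endpoint is reachable and the model has been pulled before running the chat example, assuming async-openai's `models().list()` helper and Ollama's `/v1/models` route behave as described in the compatibility docs linked above.

```rust
use async_openai::{config::OpenAIConfig, Client};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Same base URL and placeholder API key as src/main.rs below;
    // Ollama accepts any key value.
    let client = Client::with_config(
        OpenAIConfig::new()
            .with_api_key("ollama")
            .with_api_base("http://localhost:11434/v1"),
    );

    // Ollama exposes pulled models through the OpenAI-compatible
    // GET /v1/models route, so llama3.2:1b should appear here once
    // the container's healthcheck passes.
    let models = client.models().list().await?;
    for model in models.data {
        println!("available: {}", model.id);
    }

    Ok(())
}
```

If the list comes back empty, the entrypoint script in the compose setup below has likely not finished pulling the model yet.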
examples/ollama-chat/docker-compose.yml

Lines changed: 27 additions & 0 deletions

```yaml
services:
  ollama:
    container_name: ollama
    image: ollama/ollama:0.5.12
    entrypoint: ["/usr/bin/bash", "/ollama_entrypoint.sh"]
    environment:
      MODEL: "llama3.2:1b"
    volumes:
      - ./volumes/ollama:/root/.ollama
      - ./ollama_entrypoint.sh:/ollama_entrypoint.sh
    restart: unless-stopped
    ports:
      - "11434:11434"
    healthcheck:
      test: ["CMD", "bash", "-c", "ollama list | grep -q llama3.2:1b"]
      interval: 15s
      retries: 30
      start_period: 5s
      timeout: 5s
    # Uncomment if you have NVIDIA container toolkit, CUDA, etc.
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - capabilities: [gpu]
    #           driver: nvidia
    #           count: all
```
examples/ollama-chat/ollama_entrypoint.sh

Lines changed: 16 additions & 0 deletions

```bash
#!/usr/bin/env bash

# Start Ollama in the background.
/bin/ollama serve &
# Record the process ID.
pid=$!

# Pause for Ollama to start.
sleep 5

echo "Retrieving model $MODEL..."
ollama pull $MODEL
echo "Done!"

# Wait for the Ollama process to finish.
wait $pid
```
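
The entrypoint script only pulls the model after the server starts, and the compose healthcheck above greps `ollama list` until that pull completes. If you would rather wait from the Rust side, here is a rough sketch under the same assumptions as the earlier snippet (async-openai's models API against Ollama's `/v1/models`); the 15-second interval and attempt count simply mirror the healthcheck values rather than anything required by Ollama.

```rust
use std::time::Duration;

use async_openai::{config::OpenAIConfig, Client};

/// Poll Ollama's OpenAI-compatible model list until the target model shows
/// up, mirroring the compose healthcheck's `ollama list | grep -q ...`.
async fn wait_for_model(
    client: &Client<OpenAIConfig>,
    model: &str,
    attempts: u32,
) -> Result<(), Box<dyn std::error::Error>> {
    for _ in 0..attempts {
        if let Ok(models) = client.models().list().await {
            if models.data.iter().any(|m| m.id == model) {
                return Ok(());
            }
        }
        // Same cadence as the compose healthcheck interval.
        tokio::time::sleep(Duration::from_secs(15)).await;
    }
    Err(format!("model {model} was not available after {attempts} attempts").into())
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let client = Client::with_config(
        OpenAIConfig::new()
            .with_api_key("ollama")
            .with_api_base("http://localhost:11434/v1"),
    );
    wait_for_model(&client, "llama3.2:1b", 30).await?;
    println!("llama3.2:1b is ready");
    Ok(())
}
```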

examples/ollama-chat/src/main.rs

Lines changed: 65 additions & 0 deletions

```rust
use std::error::Error;

use async_openai::{
    config::OpenAIConfig,
    types::{
        ChatCompletionRequestAssistantMessageArgs, ChatCompletionRequestSystemMessageArgs,
        ChatCompletionRequestUserMessageArgs, CreateChatCompletionRequestArgs,
    },
    Client,
};

#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    // This is the default host:port for Ollama's OpenAI endpoint.
    // Should match the config in docker-compose.yml.
    let api_base = "http://localhost:11434/v1";
    // Required but ignored
    let api_key = "ollama";

    let client = Client::with_config(
        OpenAIConfig::new()
            .with_api_key(api_key)
            .with_api_base(api_base),
    );

    // This should match whatever model is downloaded in Ollama docker container.
    let model = "llama3.2:1b";

    let request = CreateChatCompletionRequestArgs::default()
        .max_tokens(512u32)
        .model(model)
        .messages([
            ChatCompletionRequestSystemMessageArgs::default()
                .content("You are a helpful assistant.")
                .build()?
                .into(),
            ChatCompletionRequestUserMessageArgs::default()
                .content("Who won the world series in 2020?")
                .build()?
                .into(),
            ChatCompletionRequestAssistantMessageArgs::default()
                .content("The Los Angeles Dodgers won the World Series in 2020.")
                .build()?
                .into(),
            ChatCompletionRequestUserMessageArgs::default()
                .content("Where was it played?")
                .build()?
                .into(),
        ])
        .build()?;

    println!("{}", serde_json::to_string(&request).unwrap());

    let response = client.chat().create(request).await?;

    println!("\nResponse:\n");
    for choice in response.choices {
        println!(
            "{}: Role: {} Content: {:?}",
            choice.index, choice.message.role, choice.message.content
        );
    }

    Ok(())
}
```
