strands-agents · pgrayy · Jul 7, 2025 · Jul 10, 2025 · Jul 10, 2025 · Jul 11, 2025
diff --git a/src/strands/models/llamaapi.py b/src/strands/models/llamaapi.py
@@ -11,7 +11,6 @@
 from typing import Any, AsyncGenerator, Optional, Type, TypeVar, Union, cast
 
 import llama_api_client
-from llama_api_client import LlamaAPIClient
 from pydantic import BaseModel
 from typing_extensions import TypedDict, Unpack, override
 
@@ -63,10 +62,8 @@ def __init__(
         self.config = LlamaAPIModel.LlamaConfig(**model_config)
         logger.debug("config=<%s> | initializing", self.config)
 
-        if not client_args:
-            self.client = LlamaAPIClient()
-        else:
-            self.client = LlamaAPIClient(**client_args)
+        client_args = client_args or {}
+        self.client = llama_api_client.AsyncLlamaAPIClient(**client_args)
 
     @override
     def update_config(self, **model_config: Unpack[LlamaConfig]) -> None:  # type: ignore
@@ -349,7 +346,7 @@ async def stream(
 
         logger.debug("invoking model")
         try:
-            response = self.client.chat.completions.create(**request)
+            response = await self.client.chat.completions.create(**request)
         except llama_api_client.RateLimitError as e:
             raise ModelThrottledException(str(e)) from e
 
@@ -361,7 +358,7 @@ async def stream(
         curr_tool_call_id = None
 
         metrics_event = None
-        for chunk in response:
+        async for chunk in response:
             if chunk.event.event_type == "start":
                 yield self.format_chunk({"chunk_type": "content_start", "data_type": "text"})
             elif chunk.event.event_type in ["progress", "complete"] and chunk.event.delta.type == "text":

diff --git a/tests/strands/models/test_llamaapi.py b/tests/strands/models/test_llamaapi.py
@@ -9,7 +9,7 @@
 
 @pytest.fixture
 def llamaapi_client():
-    with unittest.mock.patch.object(strands.models.llamaapi, "LlamaAPIClient") as mock_client_cls:
+    with unittest.mock.patch.object(strands.models.llamaapi.llama_api_client, "AsyncLlamaAPIClient") as mock_client_cls:
         yield mock_client_cls.return_value
 
 
@@ -361,3 +361,40 @@ def test_format_chunk_other(model):
 
     with pytest.raises(RuntimeError, match="chunk_type=<other> | unknown type"):
         model.format_chunk(event)
+
+
+@pytest.mark.asyncio
+async def test_stream(llamaapi_client, model, agenerator, alist):
+    """Stream a mocked two-chunk text reply through the model.
+
+    Checks both the events yielded by ``model.stream`` and the request sent to the client.
+    """
+    # First chunk is a "start" event; the second carries the text delta and the stop reason.
+    start_event = unittest.mock.Mock(event_type="start", stop_reason=None)
+    text_delta = unittest.mock.Mock(text="test stream", type="text")
+    complete_event = unittest.mock.Mock(delta=text_delta, event_type="complete", stop_reason="end_turn")
+    chunks = [unittest.mock.Mock(event=start_event), unittest.mock.Mock(event=complete_event)]
+
+    # create() is awaited by the model and its result consumed with `async for`, so the mock
+    # must be an AsyncMock whose awaited result supports `async for`.
+    create_mock = unittest.mock.AsyncMock(return_value=agenerator(chunks))
+    llamaapi_client.chat.completions.create = create_mock
+
+    user_turn = {"role": "user", "content": [{"text": "calculate 2+2"}]}
+    tru_events = await alist(model.stream([user_turn]))
+
+    exp_events = [
+        {"messageStart": {"role": "assistant"}},
+        {"contentBlockStart": {"start": {}}},
+        {"contentBlockDelta": {"delta": {"text": "test stream"}}},
+        {"contentBlockStop": {}},
+        {"messageStop": {"stopReason": "end_turn"}},
+    ]
+    assert tru_events == exp_events
+
+    create_mock.assert_called_once_with(
+        model="Llama-4-Maverick-17B-128E-Instruct-FP8",
+        messages=[{"role": "user", "content": [{"text": "calculate 2+2", "type": "text"}]}],
+        stream=True,
+        tools=[],
+    )
diff --git a/tests_integ/models/test_model_llamaapi.py b/tests_integ/models/test_model_llamaapi.py
@@ -40,8 +40,28 @@ def agent(model, tools):
     return Agent(model=model, tools=tools)
 
 
-def test_agent(agent):
+def test_agent_invoke(agent):
     result = agent("What is the time and weather in New York?")
     text = result.message["content"][0]["text"].lower()
 
     assert all(string in text for string in ["12:00", "sunny"])
+
+
+@pytest.mark.asyncio
+async def test_agent_invoke_async(agent):
+    """Await one agent invocation and check the reply text contains "12:00" and "sunny"."""
+    response = await agent.invoke_async("What is the time and weather in New York?")
+    reply = response.message["content"][0]["text"].lower()
+    assert "12:00" in reply and "sunny" in reply
+
+
+@pytest.mark.asyncio
+async def test_agent_stream_async(agent):
+    """Drain the agent's async event stream and check the text of the final event's result."""
+    event = None  # stays None when the stream yields nothing, so the failure below is explicit
+    async for event in agent.stream_async("What is the time and weather in New York?"):
+        pass
+
+    assert event is not None, "stream_async yielded no events"
+    text = event["result"].message["content"][0]["text"].lower()
+    assert all(string in text for string in ["12:00", "sunny"])