+ import json
+ import asyncio
from typing import Annotated, Any
from uuid import UUID

- from fastapi import BackgroundTasks, Depends, Header
+ from fastapi import BackgroundTasks, Depends, Header, HTTPException, status
+ from starlette.background import BackgroundTask
+ from fastapi.responses import StreamingResponse
from starlette.status import HTTP_201_CREATED
from uuid_extensions import uuid7

from .metrics import total_tokens_per_user
from .render import render_chat_input
from .router import router

COMPUTER_USE_BETA_FLAG = "computer-use-2024-10-22"


+ async def wait_for_tasks(tasks: list[asyncio.Task]) -> None:
+     """Wait for all background tasks to complete."""
+     if tasks:
+         await asyncio.gather(*tasks, return_exceptions=True)
+
+
@router.post(
    "/sessions/{session_id}/chat",
    status_code=HTTP_201_CREATED,
    tags=["sessions", "chat"],
+     response_model=None,
)
async def chat(
    developer: Annotated[Developer, Depends(get_developer_data)],
@@ -39,7 +51,7 @@ async def chat(
    background_tasks: BackgroundTasks,
    x_custom_api_key: str | None = Header(None, alias="X-Custom-Api-Key"),
    connection_pool: Any = None,  # FIXME: Placeholder that should be removed
- ) -> ChatResponse:
+ ) -> ChatResponse | StreamingResponse:
    """
    Initiates a chat session.

@@ -73,16 +85,31 @@ async def chat(
        "user": str(developer.id),
        "tags": developer.tags,
        "custom_api_key": x_custom_api_key,
+         "stream": chat_input.stream,  # Enable streaming if requested
    }
-     evaluator = ToolCallsEvaluator(
-         tool_types={"system"}, developer_id=developer.id, completion_func=litellm.acompletion
-     )
-     model_response = await evaluator.completion(**{
-         **settings,
-         **params,
-     })
+     payload = {**settings, **params}
+
+     try:
+         evaluator = ToolCallsEvaluator(
+             tool_types={"system"}, developer_id=developer.id, completion_func=litellm.acompletion
+         )
+         model_response = await evaluator.completion(**payload)
+     except Exception as e:
+         import logging
+
+         logging.error(f"LLM completion error: {e!s}")
+         # Create basic error response
+         return ChatResponse(
+             id=uuid7(),
+             created_at=utcnow(),
+             jobs=[],
+             docs=doc_references,
+             usage=None,
+             choices=[],
+             error=f"Error getting model completion: {e!s}",
+         )

-     # Save the input and the response to the session history
+     # Save the input messages to the session history
    if chat_input.save:
        new_entries = [
            CreateEntryRequest.from_model_input(
@@ -93,21 +120,33 @@ async def chat(
            for msg in new_messages
        ]

-         # Add the response to the new entries
-         # FIXME: We need to save all the choices
-         new_entries.append(
-             CreateEntryRequest.from_model_input(
-                 model=settings["model"],
-                 **model_response.choices[0].model_dump()["message"],
-                 source="api_response",
-             ),
-         )
-         background_tasks.add_task(
-             create_entries,
-             developer_id=developer.id,
-             session_id=session_id,
-             data=new_entries,
-         )
+         # For non-streaming responses, add the response to the new entries immediately
+         if not chat_input.stream:
+             # FIXME: We need to save all the choices
+             new_entries.append(
+                 CreateEntryRequest.from_model_input(
+                     model=settings["model"],
+                     **model_response.choices[0].model_dump()["message"],
+                     source="api_response",
+                 ),
+             )
+             background_tasks.add_task(
+                 create_entries,
+                 developer_id=developer.id,
+                 session_id=session_id,
+                 data=new_entries,
+             )
+         else:
+             # For streaming, we need to collect all chunks and save at the end
+             # For now, just save the input messages and handle the response separately
+             background_tasks.add_task(
+                 create_entries,
+                 developer_id=developer.id,
+                 session_id=session_id,
+                 data=new_entries,
+             )
+             # The complete streamed response will be saved in the stream_chat_response function
+             # using a separate background task to avoid blocking the stream

    # Adaptive context handling
    jobs = []
@@ -120,9 +159,147 @@ async def chat(
        raise NotImplementedError(msg)

    # Return the response
-     # FIXME: Implement streaming for chat
-     chat_response_class = ChunkChatResponse if chat_input.stream else MessageChatResponse
+     # Handle streaming response if requested
+     stream_tasks: list[asyncio.Task] = []
+
+     if chat_input.stream:
+         # For streaming, we'll use an async generator to yield chunks
+         async def stream_chat_response():
+             """Stream chat response chunks to the client."""
+             # Create initial response with metadata
+             response_id = uuid7()
+             created_at = utcnow()
+
+             # Collect full response for metrics and optional saving
+             content_so_far = ""
+             final_usage = None
+             has_content = False
+
+             nonlocal stream_tasks
+
+             try:
+                 # Stream chunks from the model_response (CustomStreamWrapper from litellm)
+                 async for chunk in model_response:
+                     # Process a single chunk of the streaming response
+                     try:
+                         # Extract usage metrics if available
+                         if hasattr(chunk, "usage") and chunk.usage:
+                             final_usage = chunk.usage.model_dump()

+                         # Check if chunk has valid choices
+                         has_choices = (
+                             hasattr(chunk, "choices")
+                             and chunk.choices
+                             and len(chunk.choices) > 0
+                         )
+
+                         # Update metrics when we detect the final chunk
+                         if final_usage and has_choices and chunk.choices[0].finish_reason:
+                             # This is the last chunk with the finish reason
+                             total_tokens = final_usage.get("total_tokens", 0)
+                             total_tokens_per_user.labels(str(developer.id)).inc(
+                                 amount=total_tokens
+                             )
+
+                         # Collect content for the full response
+                         if has_choices and hasattr(chunk.choices[0], "delta"):
+                             delta = chunk.choices[0].delta
+                             if hasattr(delta, "content") and delta.content:
+                                 content_so_far += delta.content
+                                 has_content = True
+
+                         # Prepare the response chunk
+                         choices_to_send = []
+                         if has_choices:
+                             chunk_data = chunk.choices[0].model_dump()
+
+                             # Ensure delta always contains a role field
+                             if "delta" in chunk_data and "role" not in chunk_data["delta"]:
+                                 chunk_data["delta"]["role"] = "assistant"
+
+                             choices_to_send = [chunk_data]
+
+                         # Create and send the chunk response
+                         chunk_response = ChunkChatResponse(
+                             id=response_id,
+                             created_at=created_at,
+                             jobs=jobs,
+                             docs=doc_references,
+                             usage=final_usage,
+                             choices=choices_to_send,
+                         )
+                         yield chunk_response.model_dump_json() + "\n"
+
+                     except Exception as e:
+                         # Log error details for debugging but send a generic message to client
+                         import logging
+
+                         logging.error(f"Error processing chunk: {e!s}")
+
+                         error_response = {
+                             "id": str(response_id),
+                             "created_at": created_at.isoformat(),
+                             "error": "An error occurred while processing the response chunk.",
+                         }
+                         yield json.dumps(error_response) + "\n"
+                         # Continue processing remaining chunks
+                         continue
+
+                 # Save complete response to history if needed
+                 if chat_input.save and has_content:
+                     try:
+                         # Create entry for the complete response
+                         complete_entry = CreateEntryRequest.from_model_input(
+                             model=settings["model"],
+                             role="assistant",
+                             content=content_so_far,
+                             source="api_response",
+                         )
+                         # Create a task to save the entry without blocking the stream
+                         ref = asyncio.create_task(
+                             create_entries(
+                                 developer_id=developer.id,
+                                 session_id=session_id,
+                                 data=[complete_entry],
+                             )
+                         )
+                         stream_tasks.append(ref)
+                     except Exception as e:
+                         # Log the full error for debugging purposes
+                         import logging
+
+                         logging.error(f"Failed to save streamed response: {e!s}")
+
+                         # Send a minimal error message to the client
+                         error_response = {
+                             "id": str(response_id),
+                             "created_at": created_at.isoformat(),
+                             "error": "Failed to save response history.",
+                         }
+                         yield json.dumps(error_response) + "\n"
+             except Exception as e:
+                 # Log the detailed error for system debugging
+                 import logging
+
+                 logging.error(f"Streaming error: {e!s}")
+
+                 # Send a user-friendly error message to the client
+                 error_response = {
+                     "id": str(response_id),
+                     "created_at": created_at.isoformat(),
+                     "error": "An error occurred during the streaming response.",
+                 }
+                 yield json.dumps(error_response) + "\n"
+
+         # Return a streaming response with a background task to wait for all entry saving tasks
+         return StreamingResponse(
+             stream_chat_response(),
+             media_type="application/json",
+             background=BackgroundTask(wait_for_tasks, stream_tasks),
+         )
+
+     # For non-streaming, return the complete response
+     chat_response_class = MessageChatResponse
    chat_response: ChatResponse = chat_response_class(
        id=uuid7(),
        created_at=utcnow(),
@@ -132,8 +309,13 @@ async def chat(
        choices=[choice.model_dump() for choice in model_response.choices],
    )

-     total_tokens_per_user.labels(str(developer.id)).inc(
-         amount=chat_response.usage.total_tokens if chat_response.usage is not None else 0,
-     )
+     # For non-streaming responses, update metrics and return the response
+     if not chat_input.stream:
+         total_tokens_per_user.labels(str(developer.id)).inc(
+             amount=chat_response.usage.total_tokens if chat_response.usage is not None else 0,
+         )
+         return chat_response

-     return chat_response
+     # Note: For streaming responses, we've already returned the StreamingResponse above
+     # This code is unreachable for streaming responses
+     return None
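
For context on how the new streaming path is consumed: the endpoint emits newline-delimited JSON, where each line is a serialized `ChunkChatResponse` (or a small ad-hoc error object with an `error` key). The sketch below is illustrative only and not part of this diff; it assumes a running server, a valid `session_id`, a request body shaped like the session chat input (`messages`, `stream`, `save`), and that the `X-Custom-Api-Key` header plus whatever credentials `get_developer_data` expects are enough to authenticate.

```python
# Minimal client sketch (not part of the diff). Base URL, session id, payload
# fields, and header values are assumptions for illustration.
import asyncio
import json

import httpx


async def consume_chat_stream(base_url: str, session_id: str, api_key: str) -> str:
    payload = {"messages": [{"role": "user", "content": "Hello"}], "stream": True}
    content = ""
    async with httpx.AsyncClient(base_url=base_url, timeout=None) as client:
        async with client.stream(
            "POST",
            f"/sessions/{session_id}/chat",
            json=payload,
            headers={"X-Custom-Api-Key": api_key},
        ) as response:
            # Each non-empty line of the body is one JSON document.
            async for line in response.aiter_lines():
                if not line.strip():
                    continue
                chunk = json.loads(line)
                if "error" in chunk:
                    raise RuntimeError(chunk["error"])
                for choice in chunk.get("choices", []):
                    content += (choice.get("delta") or {}).get("content") or ""
    return content


if __name__ == "__main__":
    # Hypothetical values; replace with a real base URL, session id, and key.
    print(asyncio.run(consume_chat_stream("http://localhost:8080", "SESSION_ID", "KEY")))
```

Note that the stream is declared with `media_type="application/json"` even though the body is effectively newline-delimited JSON, so clients should split on newlines themselves rather than relying on the content type.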