@@ -1557,6 +1557,198 @@ def get_response(
         total_time = time.time() - start_time
         logging.debug(f"get_response completed in {total_time:.2f} seconds")
 
+    def get_response_stream(
+        self,
+        prompt: Union[str, List[Dict]],
+        system_prompt: Optional[str] = None,
+        chat_history: Optional[List[Dict]] = None,
+        temperature: float = 0.2,
+        tools: Optional[List[Any]] = None,
+        output_json: Optional[BaseModel] = None,
+        output_pydantic: Optional[BaseModel] = None,
+        verbose: bool = False,  # Default to non-verbose for streaming
+        markdown: bool = True,
+        agent_name: Optional[str] = None,
+        agent_role: Optional[str] = None,
+        agent_tools: Optional[List[str]] = None,
+        task_name: Optional[str] = None,
+        task_description: Optional[str] = None,
+        task_id: Optional[str] = None,
+        execute_tool_fn: Optional[Callable] = None,
+        **kwargs
+    ):
1580+ """Generator that yields real-time response chunks from the LLM.
1581+
1582+ This method provides true streaming by yielding content chunks as they
1583+ are received from the underlying LLM, enabling real-time display of
1584+ responses without waiting for the complete response.
1585+
1586+ Args:
1587+ prompt: The prompt to send to the LLM
1588+ system_prompt: Optional system prompt
1589+ chat_history: Optional chat history
1590+ temperature: Sampling temperature
1591+ tools: Optional list of tools for function calling
1592+ output_json: Optional JSON schema for structured output
1593+ output_pydantic: Optional Pydantic model for structured output
1594+ verbose: Whether to enable verbose logging (default False for streaming)
1595+ markdown: Whether to enable markdown processing
1596+ agent_name: Optional agent name for logging
1597+ agent_role: Optional agent role for logging
1598+ agent_tools: Optional list of agent tools for logging
1599+ task_name: Optional task name for logging
1600+ task_description: Optional task description for logging
1601+ task_id: Optional task ID for logging
1602+ execute_tool_fn: Optional function for executing tools
1603+ **kwargs: Additional parameters
1604+
1605+ Yields:
1606+ str: Individual content chunks as they are received from the LLM
1607+
1608+ Raises:
1609+ Exception: If streaming fails or LLM call encounters an error
1610+ """
+        try:
+            import litellm
+
+            # Build messages using existing logic
+            messages, original_prompt = self._build_messages(
+                prompt=prompt,
+                system_prompt=system_prompt,
+                chat_history=chat_history,
+                output_json=output_json,
+                output_pydantic=output_pydantic
+            )
+
+            # Format tools for litellm
+            formatted_tools = self._format_tools_for_litellm(tools)
+
+            # Determine if we should use streaming based on tool support
+            use_streaming = True
+            if formatted_tools and not self._supports_streaming_tools():
+                # Provider doesn't support streaming with tools, fall back to non-streaming
+                use_streaming = False
+
+            if use_streaming:
+                # Real-time streaming approach with tool call support
+                try:
+                    tool_calls = []
+                    response_text = ""
+
+                    for chunk in litellm.completion(
+                        **self._build_completion_params(
+                            messages=messages,
+                            tools=formatted_tools,
+                            temperature=temperature,
+                            stream=True,
+                            output_json=output_json,
+                            output_pydantic=output_pydantic,
+                            **kwargs
+                        )
+                    ):
+                        if chunk and chunk.choices and chunk.choices[0].delta:
+                            delta = chunk.choices[0].delta
+
+                            # Process both content and tool calls using existing helper
+                            response_text, tool_calls = self._process_stream_delta(
+                                delta, response_text, tool_calls, formatted_tools
+                            )
+
+                            # Yield content chunks in real-time as they arrive
+                            if delta.content:
+                                yield delta.content
+
+                    # After streaming completes, handle tool calls if present
+                    if tool_calls and execute_tool_fn:
+                        # Add assistant message with tool calls to conversation
+                        if self._is_ollama_provider():
+                            messages.append({
+                                "role": "assistant",
+                                "content": response_text
+                            })
+                        else:
+                            serializable_tool_calls = self._serialize_tool_calls(tool_calls)
+                            messages.append({
+                                "role": "assistant",
+                                "content": response_text,
+                                "tool_calls": serializable_tool_calls
+                            })
+
+                        # Execute tool calls and add results to conversation
+                        for tool_call in tool_calls:
+                            is_ollama = self._is_ollama_provider()
+                            function_name, arguments, tool_call_id = self._extract_tool_call_info(tool_call, is_ollama)
+
+                            try:
+                                # Execute the tool
+                                tool_result = execute_tool_fn(function_name, arguments)
+
+                                # Add tool result to messages
+                                tool_message = self._create_tool_message(function_name, tool_result, tool_call_id, is_ollama)
+                                messages.append(tool_message)
+
+                            except Exception as e:
+                                logging.error(f"Tool execution error for {function_name}: {e}")
+                                # Add error message to conversation
+                                error_message = self._create_tool_message(
+                                    function_name, f"Error executing tool: {e}", tool_call_id, is_ollama
+                                )
+                                messages.append(error_message)
+
+                        # Continue conversation after tool execution - get follow-up response
+                        try:
+                            follow_up_response = litellm.completion(
+                                **self._build_completion_params(
+                                    messages=messages,
+                                    tools=formatted_tools,
+                                    temperature=temperature,
+                                    stream=False,
+                                    **kwargs
+                                )
+                            )
+
+                            if follow_up_response and follow_up_response.choices:
+                                follow_up_content = follow_up_response.choices[0].message.content
+                                if follow_up_content:
+                                    # Yield the follow-up response after tool execution
+                                    yield follow_up_content
+                        except Exception as e:
+                            logging.error(f"Follow-up response failed: {e}")
+
+                except Exception as e:
+                    logging.error(f"Streaming failed: {e}")
+                    # Fall back to non-streaming if streaming fails
+                    use_streaming = False
+
+            if not use_streaming:
+                # Fall back to non-streaming and yield the complete response
+                try:
+                    response = litellm.completion(
+                        **self._build_completion_params(
+                            messages=messages,
+                            tools=formatted_tools,
+                            temperature=temperature,
+                            stream=False,
+                            output_json=output_json,
+                            output_pydantic=output_pydantic,
+                            **kwargs
+                        )
+                    )
+
+                    if response and response.choices:
+                        content = response.choices[0].message.content
+                        if content:
+                            # Yield the complete response as a single chunk
+                            yield content
+
+                except Exception as e:
+                    logging.error(f"Non-streaming fallback failed: {e}")
+                    raise
+
+        except Exception as e:
+            logging.error(f"Error in get_response_stream: {e}")
+            raise
+
     def _is_gemini_model(self) -> bool:
         """Check if the model is a Gemini model."""
         if not self.model:
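
A minimal consumption sketch of the new generator (hypothetical caller code, not part of this diff; the constructor arguments, the example tool, and the assumption that plain callables are accepted for `tools` are illustrative only):

    # Assumes `LLM` is the class this method is added to; import path omitted.
    llm = LLM(model="gpt-4o-mini")  # hypothetical constructor arguments

    def get_weather(city: str) -> str:
        """Illustrative tool; assumes callables are accepted by _format_tools_for_litellm."""
        return f"Sunny in {city}"

    def execute_tool(function_name, arguments):
        # Matches the execute_tool_fn(function_name, arguments) call made after streaming.
        if function_name == "get_weather":
            return get_weather(**arguments)
        raise ValueError(f"Unknown tool: {function_name}")

    for chunk in llm.get_response_stream(
        prompt="What's the weather in Paris?",
        tools=[get_weather],
        execute_tool_fn=execute_tool,
    ):
        # Content arrives incrementally; any follow-up after tool execution is yielded last.
        print(chunk, end="", flush=True)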