Skip to content

Commit 7deee3b

Browse files
committed
Replace old instruction parsing code with new Instruction class
1 parent 3e5a3fb commit 7deee3b

File tree

18 files changed

+59
-208
lines changed

18 files changed

+59
-208
lines changed

agents-core/vision_agents/core/agents/agents.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
)
2727
from ..edge.types import Connection, Participant, PcmData, User, OutputAudioTrack
2828
from ..events.manager import EventManager
29+
from ..instructions import Instructions
2930
from ..llm import events as llm_events
3031
from ..llm.events import (
3132
LLMResponseChunkEvent,
@@ -139,7 +140,7 @@ def __init__(
139140
# audio incoming is enqueued to self._incoming_audio_queue (eg. human audio)
140141
self._incoming_audio_queue: AudioQueue = AudioQueue(buffer_limit_ms=8000)
141142

142-
self.instructions = instructions
143+
self.instructions = Instructions(input_text=instructions)
143144
self.edge = edge
144145
self.agent_user = agent_user
145146
self._agent_user_initialized = False
@@ -506,7 +507,7 @@ async def join(
506507

507508
# Setup chat and connect it to transcript events (we'll wait at the end)
508509
create_conversation_coro = self.edge.create_conversation(
509-
call, self.agent_user, self.instructions
510+
call, self.agent_user, self.instructions.full_reference
510511
)
511512

512513
try:

agents-core/vision_agents/core/llm/llm.py

Lines changed: 17 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -4,33 +4,34 @@
44
import asyncio
55
import json
66
from typing import (
7-
Optional,
87
TYPE_CHECKING,
9-
Tuple,
10-
List,
11-
Dict,
128
Any,
13-
TypeVar,
149
Callable,
10+
Dict,
1511
Generic,
12+
List,
13+
Optional,
14+
Tuple,
15+
TypeVar,
1616
)
1717

1818
import aiortc
19+
from vision_agents.core.instructions import Instructions
1920
from vision_agents.core.llm import events
20-
from vision_agents.core.llm.events import ToolStartEvent, ToolEndEvent
21+
from vision_agents.core.llm.events import ToolEndEvent, ToolStartEvent
2122

2223
if TYPE_CHECKING:
2324
from vision_agents.core.agents import Agent
2425
from vision_agents.core.agents.conversation import Conversation
2526

26-
from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import Participant
2727
from getstream.video.rtc import PcmData
28-
from vision_agents.core.processors import Processor
29-
from vision_agents.core.utils.utils import Instructions, parse_instructions
28+
from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import Participant
3029
from vision_agents.core.events.manager import EventManager
31-
from .function_registry import FunctionRegistry
32-
from .llm_types import ToolSchema, NormalizedToolCallItem
30+
from vision_agents.core.processors import Processor
31+
3332
from ..utils.video_forwarder import VideoForwarder
33+
from .function_registry import FunctionRegistry
34+
from .llm_types import NormalizedToolCallItem, ToolSchema
3435

3536
T = TypeVar("T")
3637

@@ -58,8 +59,8 @@ def __init__(self):
5859
self.events = EventManager()
5960
self.events.register_events_from_module(events)
6061
self.function_registry = FunctionRegistry()
61-
self.instructions: Optional[str] = None
62-
self.parsed_instructions: Optional[Instructions] = None
62+
# LLM instructions. Provided by the Agent via `set_instructions` method
63+
self._instructions: str = ""
6364
self._conversation: Optional[Conversation] = None
6465

6566
async def warmup(self) -> None:
@@ -80,34 +81,6 @@ async def simple_response(
8081
) -> LLMResponseEvent[Any]:
8182
raise NotImplementedError
8283

83-
def _build_enhanced_instructions(self) -> Optional[str]:
84-
"""
85-
Build enhanced instructions by combining the original instructions with markdown file contents.
86-
87-
Returns:
88-
Enhanced instructions string with markdown file contents included, or None if no parsed instructions
89-
"""
90-
if not hasattr(self, "parsed_instructions") or not self.parsed_instructions:
91-
return None
92-
93-
parsed = self.parsed_instructions
94-
enhanced_instructions = [parsed.input_text]
95-
96-
# Add markdown file contents if any exist
97-
if parsed.markdown_contents:
98-
enhanced_instructions.append("\n\n## Referenced Documentation:")
99-
for filename, content in parsed.markdown_contents.items():
100-
if content: # Only include non-empty content
101-
enhanced_instructions.append(f"\n### {filename}")
102-
enhanced_instructions.append(content)
103-
else:
104-
enhanced_instructions.append(f"\n### {filename}")
105-
enhanced_instructions.append(
106-
"*(File not found or could not be read)*"
107-
)
108-
109-
return "\n".join(enhanced_instructions)
110-
11184
def _get_tools_for_provider(self) -> List[Dict[str, Any]]:
11285
"""
11386
Get tools in provider-specific format.
@@ -189,7 +162,7 @@ def _attach_agent(self, agent: Agent):
189162
Attach agent to the llm
190163
"""
191164
self.agent = agent
192-
self._set_instructions(agent.instructions)
165+
self.set_instructions(agent.instructions)
193166

194167
def set_conversation(self, conversation: Conversation):
195168
"""
@@ -203,11 +176,8 @@ def set_conversation(self, conversation: Conversation):
203176
"""
204177
self._conversation = conversation
205178

206-
def _set_instructions(self, instructions: str):
207-
self.instructions = instructions
208-
209-
# Parse instructions to extract @ mentioned markdown files
210-
self.parsed_instructions = parse_instructions(instructions)
179+
def set_instructions(self, instructions: Instructions):
180+
self._instructions = instructions.full_reference
211181

212182
def register_function(
213183
self, name: Optional[str] = None, description: Optional[str] = None

agents-core/vision_agents/core/utils/utils.py

Lines changed: 0 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@
22
import importlib.metadata
33
import logging
44
import os
5-
import re
6-
from dataclasses import dataclass
75
from typing import Dict, Optional
86

97
import httpx
@@ -28,107 +26,6 @@ def _load_version() -> str:
2826

2927
_VISION_AGENTS_VERSION = _load_version()
3028

31-
# Cache current working directory at module load time
32-
_INITIAL_CWD = os.getcwd()
33-
34-
35-
@dataclass
36-
class Instructions:
37-
"""Container for parsed instructions with input text and markdown files."""
38-
39-
input_text: str
40-
markdown_contents: MarkdownFileContents # Maps filename to file content
41-
base_dir: str = "" # Base directory for file search, defaults to empty string
42-
43-
44-
def _read_markdown_file_sync(file_path: str) -> str:
45-
"""Synchronous helper to read a markdown file."""
46-
try:
47-
if os.path.isfile(file_path):
48-
with open(file_path, "r", encoding="utf-8") as f:
49-
return f.read()
50-
else:
51-
return ""
52-
except (OSError, IOError, UnicodeDecodeError):
53-
return ""
54-
55-
56-
async def parse_instructions_async(
57-
text: str, base_dir: Optional[str] = None
58-
) -> Instructions:
59-
"""
60-
Async version: Parse instructions from a string, extracting @ mentioned markdown files and their contents.
61-
62-
Args:
63-
text: Input text that may contain @ mentions of markdown files
64-
base_dir: Base directory to search for markdown files. If None, uses cached working directory.
65-
66-
Returns:
67-
Instructions object containing the input text and file contents
68-
"""
69-
# Find all @ mentions that look like markdown files
70-
markdown_pattern = r"@([^\s@]+\.md)"
71-
matches = re.findall(markdown_pattern, text)
72-
73-
# Create a dictionary mapping filename to file content
74-
markdown_contents = {}
75-
76-
# Set base directory for file search
77-
if base_dir is None:
78-
base_dir = _INITIAL_CWD
79-
80-
for match in matches:
81-
# Try to read the markdown file content
82-
file_path = os.path.join(base_dir, match)
83-
# Run blocking I/O in thread pool
84-
content = await asyncio.to_thread(_read_markdown_file_sync, file_path)
85-
markdown_contents[match] = content
86-
87-
return Instructions(
88-
input_text=text, markdown_contents=markdown_contents, base_dir=base_dir
89-
)
90-
91-
92-
def parse_instructions(text: str, base_dir: Optional[str] = None) -> Instructions:
93-
"""
94-
Parse instructions from a string, extracting @ mentioned markdown files and their contents.
95-
96-
Args:
97-
text: Input text that may contain @ mentions of markdown files
98-
base_dir: Base directory to search for markdown files. If None, uses cached working directory.
99-
100-
Returns:
101-
Instructions object containing the input text and file contents
102-
103-
Example:
104-
>>> text = "Please read @file1.md and @file2.md for context"
105-
>>> result = parse_instructions(text)
106-
>>> result.input_text
107-
"Please read @file1.md and @file2.md for context"
108-
>>> result.markdown_contents
109-
{"file1.md": "# File 1 content...", "file2.md": "# File 2 content..."}
110-
"""
111-
# Find all @ mentions that look like markdown files
112-
# Pattern matches @ followed by filename with .md extension
113-
markdown_pattern = r"@([^\s@]+\.md)"
114-
matches = re.findall(markdown_pattern, text)
115-
116-
# Create a dictionary mapping filename to file content
117-
markdown_contents = {}
118-
119-
# Set base directory for file search
120-
if base_dir is None:
121-
base_dir = _INITIAL_CWD
122-
123-
for match in matches:
124-
# Try to read the markdown file content
125-
file_path = os.path.join(base_dir, match)
126-
markdown_contents[match] = _read_markdown_file_sync(file_path)
127-
128-
return Instructions(
129-
input_text=text, markdown_contents=markdown_contents, base_dir=base_dir
130-
)
131-
13229

13330
def get_vision_agents_version() -> Optional[str]:
13431
"""

docs/ai/instructions/ai-llm.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,8 @@ class MyLLM(LLM):
2222

2323
# some details to get right here...
2424
# ensure conversation history is maintained. typically by passing it ie:
25-
enhanced_instructions = self._build_enhanced_instructions()
26-
if enhanced_instructions:
27-
kwargs["system"] = [{"text": enhanced_instructions}]
25+
if self._instructions:
26+
kwargs["system"] = [{"text": self._instructions}]
2827

2928
response_iterator = await self.client.mynativemethod(self, *args, **kwargs)
3029

docs/ai/instructions/ai-realtime-llm.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,8 @@ class MyRealtime(realtime.Realtime):
4646

4747
# some details to get right here...
4848
# ensure conversation history is maintained. typically by passing it ie:
49-
enhanced_instructions = self._build_enhanced_instructions()
50-
if enhanced_instructions:
51-
kwargs["system"] = [{"text": enhanced_instructions}]
49+
if self._instructions:
50+
kwargs["system"] = [{"text": self._instructions}]
5251

5352
response_iterator = await self.client.mynativemethod(self, *args, **kwargs)
5453

plugins/aws/tests/test_aws.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ async def test_instruction_following(self, llm: BedrockLLM):
146146
model="qwen.qwen3-32b-v1:0",
147147
region_name="us-east-1",
148148
)
149-
llm._set_instructions("only reply in 2 letter country shortcuts")
149+
llm.set_instructions("only reply in 2 letter country shortcuts")
150150

151151
response = await llm.simple_response(
152152
text="Which country is rainy, protected from water with dikes and below sea level?",

plugins/aws/tests/test_aws_realtime.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ async def realtime(self):
2424
model="amazon.nova-sonic-v1:0",
2525
region_name="us-east-1",
2626
)
27-
realtime._set_instructions(
27+
realtime.set_instructions(
2828
"you're a kind assistant, always be friendly please."
2929
)
3030
try:
@@ -36,7 +36,7 @@ async def realtime(self):
3636
async def test_simple_response_flow(self, realtime):
3737
# unlike other realtime LLMs, AWS doesn't reply if you only send text
3838
events = []
39-
realtime._set_instructions(
39+
realtime.set_instructions(
4040
"whenever you reply mention a fun fact about The Netherlands"
4141
)
4242

plugins/aws/vision_agents/plugins/aws/aws_llm.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,8 @@ async def converse(self, *args, **kwargs) -> LLMResponseEvent[Any]:
127127
kwargs["toolConfig"] = {"tools": converted_tools}
128128

129129
# Combine original instructions with markdown file contents
130-
enhanced_instructions = self._build_enhanced_instructions()
131-
if enhanced_instructions:
132-
kwargs["system"] = [{"text": enhanced_instructions}]
130+
if self._instructions:
131+
kwargs["system"] = [{"text": self._instructions}]
133132

134133
# Ensure the AI remembers the past conversation
135134
new_messages = kwargs.get("messages", [])
@@ -354,10 +353,8 @@ async def converse_stream(self, *args, **kwargs) -> LLMResponseEvent[Any]:
354353
for msg in normalized_messages:
355354
self._conversation.messages.append(msg)
356355

357-
# Combine original instructions with markdown file contents
358-
enhanced_instructions = self._build_enhanced_instructions()
359-
if enhanced_instructions:
360-
kwargs["system"] = [{"text": enhanced_instructions}]
356+
if self._instructions:
357+
kwargs["system"] = [{"text": self._instructions}]
361358

362359
try:
363360
system_param = kwargs.get("system")

plugins/aws/vision_agents/plugins/aws/aws_realtime.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -229,12 +229,11 @@ async def connect(self):
229229
await asyncio.sleep(0.1)
230230

231231
# next send system instructions
232-
system_instructions = self._build_enhanced_instructions()
233-
if not system_instructions:
232+
if not self._instructions:
234233
raise Exception(
235234
"AWS Bedrock requires system instructions before sending regular user input"
236235
)
237-
await self.content_input(system_instructions, "SYSTEM")
236+
await self.content_input(self._instructions, "SYSTEM")
238237

239238
logger.info("AWS Bedrock connection established")
240239

plugins/gemini/tests/test_gemini_llm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ async def test_instruction_following(self):
8484
llm = GeminiLLM(model="gemini-2.0-flash-exp")
8585
llm.set_conversation(InMemoryConversation("be friendly", []))
8686

87-
llm._set_instructions("only reply in 2 letter country shortcuts")
87+
llm.set_instructions("only reply in 2 letter country shortcuts")
8888

8989
response = await llm.simple_response(
9090
text="Which country is rainy, protected from water with dikes and below sea level?",

0 commit comments

Comments
 (0)