
Release 2.7.0 #1540

Merged: 70 commits, Dec 12, 2024
Changes from 62 commits

Commits
987f285
feat: hunyuan_video
anuragts Dec 9, 2024
9092897
fix: lint
anuragts Dec 9, 2024
d9951ab
fix: create a fal tool
anuragts Dec 9, 2024
37b577e
fix: remove print
anuragts Dec 9, 2024
c9895bb
Update cookbook/agents/42_generate_fal_video.py
anuragts Dec 9, 2024
87c1afb
Update cookbook/agents/42_generate_fal_video.py
anuragts Dec 9, 2024
3243c5b
fix: move to fal tools
anuragts Dec 9, 2024
a59d392
fix: enum type
anuragts Dec 9, 2024
31f86fc
fix: name
anuragts Dec 9, 2024
7cf736d
fix: improve instructions
anuragts Dec 9, 2024
d04acf6
fix: return video/image url to agent
anuragts Dec 9, 2024
1b9513c
fix: add fal video agent to playground
anuragts Dec 9, 2024
a50da31
fix: send as schema
anuragts Dec 9, 2024
cf895cc
fix: return as mp4
anuragts Dec 9, 2024
c939f7e
fix: add enum
anuragts Dec 9, 2024
d0cb754
fix: data
anuragts Dec 9, 2024
45e9e86
fix: instruction for video model
anuragts Dec 9, 2024
dd75ea2
fix: more instruction
anuragts Dec 9, 2024
710aa30
Add replicate toolkit
dirkvolter Dec 9, 2024
22a07bb
Add replicate toolkit
dirkvolter Dec 9, 2024
b3bf340
Ignore missing imports
dirkvolter Dec 9, 2024
0a739bc
luma lab video generation
Ayush0054 Dec 9, 2024
11adf3e
Fix cookbook for replicate
dirkvolter Dec 10, 2024
2522dd7
Update image/video serialization
dirkvolter Dec 10, 2024
d438e3a
added image to video functionality, fixed formatting and mypy errors
Ayush0054 Dec 10, 2024
74bde7f
Fix style
dirkvolter Dec 10, 2024
9a10b50
Merge branch 'feature/replicate-toolkit' of github.com:phidatahq/phi…
Ayush0054 Dec 10, 2024
d3a5543
Merge remote-tracking branch 'origin/feature/replicate-toolkit' into …
anuragts Dec 10, 2024
4f16362
fix: improvements
anuragts Dec 10, 2024
a8b3c61
fix: rename file
anuragts Dec 10, 2024
935b680
fix: instruction update
anuragts Dec 10, 2024
e85e388
updated according to comments/review
Ayush0054 Dec 10, 2024
47f59f1
formatting
Ayush0054 Dec 10, 2024
c0b018c
Fix typo
dirkvolter Dec 10, 2024
5740cf8
fix: modal labs type mismatch
anuragts Dec 10, 2024
dff1e02
Bump version
dirkvolter Dec 10, 2024
d03c89f
Merge branch 'release/2.7.0' of https://github.com/phidatahq/phidata …
dirkvolter Dec 10, 2024
eccea9a
Add image cookbook
dirkvolter Dec 10, 2024
2443c27
fix: send gif in image
anuragts Dec 10, 2024
a643f21
Update
dirkvolter Dec 10, 2024
72605b4
Merge branch 'release/2.7.0' of https://github.com/phidatahq/phidata …
dirkvolter Dec 10, 2024
499693e
Merge branch 'feature/replicate-toolkit' of https://github.com/phidat…
dirkvolter Dec 10, 2024
312895b
Fix FAL interface
dirkvolter Dec 10, 2024
7b1f696
Merge
dirkvolter Dec 10, 2024
398e227
Fix FAL_KEY
dirkvolter Dec 10, 2024
5e3d3e2
Add modellabs gif to playground app
dirkvolter Dec 10, 2024
7ddc56c
Update name of replicate tools
dirkvolter Dec 10, 2024
a5c4681
Fix style
dirkvolter Dec 10, 2024
0d8c2b4
fix: remove duplicate
anuragts Dec 10, 2024
0e98701
Fix gemini reference
dirkvolter Dec 10, 2024
ad2023b
Merge branch 'hunyuan_video' of https://github.com/phidatahq/phidata …
dirkvolter Dec 10, 2024
8620e57
fix: remove correct
anuragts Dec 10, 2024
2ce1fe6
Fix mypy
dirkvolter Dec 10, 2024
b608861
Merge branch 'hunyuan_video' of https://github.com/phidatahq/phidata …
dirkvolter Dec 10, 2024
6ca39dd
Merge branch 'feature/replicate-toolkit' of https://github.com/phidat…
dirkvolter Dec 10, 2024
795ee52
Update lumalabs to work with new interface
dirkvolter Dec 10, 2024
d280dfa
Improve instructions
dirkvolter Dec 11, 2024
9cefcb7
Merge pull request #1526 from phidatahq/hunyuan_video
dirkbrnd Dec 11, 2024
6a60da1
Fix typo
dirkvolter Dec 11, 2024
2fd27e1
Merge branch 'release/2.7.0' of https://github.com/phidatahq/phidata …
dirkvolter Dec 11, 2024
d10bc9c
Fix style
dirkvolter Dec 11, 2024
51e8ac0
Merge pull request #1532 from phidatahq/lumalabs-video-generation
dirkbrnd Dec 11, 2024
1c3d341
Merge branch 'main' into release/2.7.0
anuragts Dec 11, 2024
09dd849
Update
dirkvolter Dec 11, 2024
629b5de
Merge branch 'release/2.7.0' of https://github.com/phidatahq/phidata …
dirkvolter Dec 11, 2024
2ef0ab5
Merge branch 'main' into release/2.7.0
anuragts Dec 12, 2024
f2d0d71
use-case-example-recipe-creator (#1511)
unnati914 Dec 12, 2024
6d72f5e
Update PR template (#1538)
saajann Dec 12, 2024
5980aee
Merge branch 'main' of https://github.com/phidatahq/phidata into rele…
dirkvolter Dec 12, 2024
08fc407
Pull in main
dirkvolter Dec 12, 2024
2 changes: 1 addition & 1 deletion cookbook/agents/15_generate_video.py
@@ -7,7 +7,7 @@
tools=[ModelsLabs()],
description="You are an AI agent that can generate videos using the ModelsLabs API.",
instructions=[
"When the user asks you to create a video, use the `create_video` tool to create the video.",
"When the user asks you to create a video, use the `generate_media` tool to create the video.",
"The video will be displayed in the UI automatically below your response, so you don't need to show the video URL in your response.",
"Politely and courteously let the user know that the video has been generated and will be displayed below as soon as its ready.",
],
24 changes: 24 additions & 0 deletions cookbook/agents/43_generate_replicate_video.py
@@ -0,0 +1,24 @@
from phi.agent import Agent
from phi.model.openai import OpenAIChat
from phi.tools.replicate import ReplicateTools

"""Create an agent specialized for Replicate AI content generation"""

video_agent = Agent(
name="Video Generator Agent",
model=OpenAIChat(id="gpt-4o"),
tools=[
ReplicateTools(model="tencent/hunyuan-video:847dfa8b01e739637fc76f480ede0c1d76408e1d694b830b5dfb8e547bf98405")
],
description="You are an AI agent that can generate videos using the Replicate API.",
instructions=[
"When the user asks you to create a video, use the `generate_media` tool to create the video.",
"Return the URL as raw to the user.",
"Don't convert video URL to markdown or anything else.",
],
markdown=True,
debug_mode=True,
show_tool_calls=True,
)

video_agent.print_response("Generate a video of a horse in the desert.")
22 changes: 22 additions & 0 deletions cookbook/agents/44_generate_replicate_image.py
@@ -0,0 +1,22 @@
from phi.agent import Agent
from phi.model.openai import OpenAIChat
from phi.tools.replicate import ReplicateTools

"""Create an agent specialized for Replicate AI content generation"""

image_agent = Agent(
name="Image Generator Agent",
model=OpenAIChat(id="gpt-4o"),
tools=[ReplicateTools(model="luma/photon-flash")],
description="You are an AI agent that can generate images using the Replicate API.",
instructions=[
"When the user asks you to create an image, use the `generate_media` tool to create the image.",
"Return the URL as raw to the user.",
"Don't convert image URL to markdown or anything else.",
],
markdown=True,
debug_mode=True,
show_tool_calls=True,
)

image_agent.print_response("Generate an image of a horse in the desert.")
20 changes: 20 additions & 0 deletions cookbook/agents/45_generate_fal_video.py
@@ -0,0 +1,20 @@
from phi.agent import Agent
from phi.model.openai import OpenAIChat
from phi.tools.fal_tools import FalTools

fal_agent = Agent(
name="Fal Video Generator Agent",
model=OpenAIChat(id="gpt-4o"),
tools=[FalTools("fal-ai/hunyuan-video")],
description="You are an AI agent that can generate videos using the Fal API.",
instructions=[
"When the user asks you to create a video, use the `generate_media` tool to create the video.",
"Return the URL as raw to the user.",
"Don't convert video URL to markdown or anything else.",
],
markdown=True,
debug_mode=True,
show_tool_calls=True,
)

fal_agent.print_response("Generate a video of a balloon in the ocean")
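
Note: this cookbook assumes Fal credentials are already configured in the environment. A minimal, hypothetical setup sketch (the FAL_KEY variable name comes from the "Fix FAL_KEY" commit in this release; the value is a placeholder):

import os

# Hypothetical setup for the Fal cookbook above; the fal client is assumed to
# read its API key from the FAL_KEY environment variable.
os.environ.setdefault("FAL_KEY", "your-fal-api-key")  # placeholder, not a real key
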
2 changes: 1 addition & 1 deletion cookbook/assistants/llms/vertexai/samples/multimodal.py
Reviewer comment (Contributor): I am not sure about making changes to assistants folder as we are trying to phase it out
@@ -11,7 +11,7 @@ def multimodal_example(project: Optional[str], location: Optional[str]) -> str:
# Load the model
multimodal_model = GenerativeModel("gemini-1.0-pro-vision")
# Query the model
response = multimodal_model.generate_content(
response = multimodal_model.generate_media(
[
# Add an example image
Part.from_uri("gs://generativeai-downloads/images/scones.jpg", mime_type="image/jpeg"),
2 changes: 1 addition & 1 deletion cookbook/assistants/llms/vertexai/samples/text_stream.py
@@ -11,7 +11,7 @@ def generate(project: Optional[str], location: Optional[str]) -> None:
# Load the model
model = GenerativeModel("gemini-1.0-pro-vision")
# Query the model
responses: Iterable[GenerationResponse] = model.generate_content("Who are you?", stream=True)
responses: Iterable[GenerationResponse] = model.generate_media("Who are you?", stream=True)
# Process the response
for response in responses:
print(response.text, end="")
61 changes: 48 additions & 13 deletions cookbook/playground/multimodal_agent.py
@@ -10,48 +10,83 @@
from phi.model.openai import OpenAIChat
from phi.tools.dalle import Dalle
from phi.tools.models_labs import ModelsLabs
from phi.model.response import FileType
from phi.playground import Playground, serve_playground_app
from phi.storage.agent.sqlite import SqlAgentStorage
from phi.tools.fal_tools import FalTools

image_agent_storage_file: str = "tmp/image_agent.db"

image_agent = Agent(
name="Image Agent",
name="DALL-E Image Agent",
agent_id="image_agent",
model=OpenAIChat(id="gpt-4o"),
tools=[Dalle()],
description="You are an AI agent that can generate images using DALL-E.",
instructions=[
"When the user asks you to create an image, use the `create_image` tool to create the image.",
"The image will be displayed in the UI automatically below your response, so you don't need to show the image URL in your response.",
"Politely and courteously let the user know that the image has been generated and will be displayed below as soon as its ready.",
"Don't provide the URL of the image in the response. Only describe what image was generated.",
],
markdown=True,
debug_mode=True,
add_history_to_messages=True,
add_datetime_to_instructions=True,
storage=SqlAgentStorage(table_name="image_agent", db_file="tmp/image_agent.db"),
storage=SqlAgentStorage(table_name="image_agent", db_file=image_agent_storage_file),
Reviewer comment (Contributor): Let's keep it in tmp folder as we have it in .gitignore
)

video_agent = Agent(
name="Video Agent",
agent_id="video_agent",
ml_gif_agent = Agent(
name="ModelsLab GIF Agent",
agent_id="ml_gif_agent",
model=OpenAIChat(id="gpt-4o"),
tools=[ModelsLabs(wait_for_completion=True)],
tools=[ModelsLabs(wait_for_completion=True, file_type=FileType.GIF)],
description="You are an AI agent that can generate gifs using the ModelsLabs API.",
instructions=[
"When the user asks you to create an image, use the `generate_media` tool to create the image.",
"Don't provide the URL of the image in the response. Only describe what image was generated.",
],
markdown=True,
debug_mode=True,
add_history_to_messages=True,
add_datetime_to_instructions=True,
storage=SqlAgentStorage(table_name="ml_gif_agent", db_file=image_agent_storage_file),
)

ml_video_agent = Agent(
name="ModelsLab Video Agent",
agent_id="ml_video_agent",
model=OpenAIChat(id="gpt-4o"),
tools=[ModelsLabs(wait_for_completion=True, file_type=FileType.MP4)],
description="You are an AI agent that can generate videos using the ModelsLabs API.",
instructions=[
"When the user asks you to create a video, use the `create_video` tool to create the video.",
"The video will be displayed in the UI automatically below your response, so you don't need to show the video URL in your response.",
"Politely and courteously let the user know that the video has been generated and will be displayed below as soon as its ready.",
"When the user asks you to create a video, use the `generate_media` tool to create the video.",
"Don't provide the URL of the video in the response. Only describe what video was generated.",
],
markdown=True,
debug_mode=True,
add_history_to_messages=True,
add_datetime_to_instructions=True,
storage=SqlAgentStorage(table_name="video_agent", db_file="tmp/video_agent.db"),
storage=SqlAgentStorage(table_name="ml_video_agent", db_file=image_agent_storage_file),
)

app = Playground(agents=[image_agent, video_agent]).get_app()
fal_agent = Agent(
name="Fal Video Agent",
agent_id="fal_agent",
model=OpenAIChat(id="gpt-4o"),
tools=[FalTools("fal-ai/hunyuan-video")],
description="You are an AI agent that can generate videos using the Fal API.",
instructions=[
"When the user asks you to create a video, use the `generate_media` tool to create the video.",
"Don't provide the URL of the video in the response. Only describe what video was generated.",
],
markdown=True,
debug_mode=True,
add_history_to_messages=True,
add_datetime_to_instructions=True,
storage=SqlAgentStorage(table_name="fal_agent", db_file=image_agent_storage_file),
)


app = Playground(agents=[image_agent, ml_gif_agent, ml_video_agent, fal_agent]).get_app(use_async=False)

if __name__ == "__main__":
serve_playground_app("multimodal_agent:app", reload=True)
45 changes: 45 additions & 0 deletions cookbook/tools/lumalabs_tool.py
@@ -0,0 +1,45 @@
from phi.agent import Agent
from phi.llm.openai import OpenAIChat
from phi.tools.lumalab import LumaLabTools

"""Create an agent specialized for Luma AI video generation"""

luma_agent = Agent(
name="Luma Video Agent",
agent_id="luma-video-agent",
llm=OpenAIChat(model="gpt-4o"),
tools=[LumaLabTools()], # Using the LumaLab tool we created
Reviewer comment (Contributor): unnecessary comment here
markdown=True,
debug_mode=True,
show_tool_calls=True,
instructions=[
"You are an agent designed to generate videos using the Luma AI API.",
"You can generate videos in two ways:",
"1. Text-to-Video Generation:",
" - Use the generate_video function for creating videos from text prompts",
" - Default parameters: loop=False, aspect_ratio='16:9', keyframes=None",
"2. Image-to-Video Generation:",
" - Use the image_to_video function when starting from one or two images",
" - Required parameters: prompt, start_image_url",
" - Optional parameters: end_image_url, loop=False, aspect_ratio='16:9'",
" - The image URLs must be publicly accessible",
"Choose the appropriate function based on whether the user provides image URLs or just a text prompt.",
"The video will be displayed in the UI automatically below your response, so you don't need to show the video URL in your response.",
"Politely and courteously let the user know that the video has been generated and will be displayed below as soon as its ready.",
"After generating any video, if generation is async (wait_for_completion=False), inform about the generation ID",
],
system_message=(
"Use generate_video for text-to-video requests and image_to_video for image-based "
"generation. Don't modify default parameters unless specifically requested. "
"Always provide clear feedback about the video generation status."
),
)

luma_agent.run("Generate a video of a car in the sky")
# luma_agent.run("Transform this image into a video of a tiger walking: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Walking_tiger_female.jpg/1920px-Walking_tiger_female.jpg")
# luma_agent.run("""
# Create a transition video between these two images:
# Start: https://img.freepik.com/premium-photo/car-driving-dark-forest-generative-ai_634053-6661.jpg?w=1380
# End: https://img.freepik.com/free-photo/front-view-black-luxury-sedan-road_114579-5030.jpg?t=st=1733821884~exp=1733825484~hmac=735ca584a9b985c53875fc1ad343c3fd394e1de4db49e5ab1a9ab37ac5f91a36&w=1380
# Make it a smooth, natural movement
# """)
22 changes: 11 additions & 11 deletions phi/agent/agent.py
@@ -28,6 +28,7 @@

from phi.document import Document
from phi.agent.session import AgentSession
from phi.model.content import Image, Video
from phi.reasoning.step import ReasoningStep, ReasoningSteps, NextAction
from phi.run.response import RunEvent, RunResponse, RunResponseExtraData
from phi.knowledge.agent import AgentKnowledge
@@ -57,9 +58,9 @@ class Agent(BaseModel):

# -*- Agent Data
# Images associated with this agent
images: Optional[List[Union[str, Dict[str, Any]]]] = None
images: Optional[List[Image]] = None
# Videos associated with this agent
videos: Optional[List[Union[str, Dict[str, Any]]]] = None
videos: Optional[List[Video]] = None

# Data associated with this agent
# name, model, images and videos are automatically added to the agent_data
@@ -573,9 +574,9 @@ def get_agent_data(self) -> Dict[str, Any]:
if self.model is not None:
agent_data["model"] = self.model.to_dict()
if self.images is not None:
agent_data["images"] = self.images
agent_data["images"] = [img.model_dump() for img in self.images]
if self.videos is not None:
agent_data["videos"] = self.videos
agent_data["videos"] = [vid.model_dump() for vid in self.videos]
return agent_data

def get_session_data(self) -> Dict[str, Any]:
@@ -588,7 +589,6 @@ def get_session_data(self) -> Dict[str, Any]:

def get_agent_session(self) -> AgentSession:
"""Get an AgentSession object, which can be saved to the database"""

return AgentSession(
session_id=self.session_id,
agent_id=self.agent_id,
@@ -632,13 +632,13 @@ def from_agent_session(self, session: AgentSession):
if "images" in session.agent_data:
images_from_db = session.agent_data.get("images")
if self.images is not None and isinstance(self.images, list):
self.images.extend(images_from_db) # type: ignore
self.images.extend([Image.model_validate(img) for img in images_from_db])
else:
self.images = images_from_db
if "videos" in session.agent_data:
videos_from_db = session.agent_data.get("videos")
if self.videos is not None and isinstance(self.videos, list):
self.videos.extend(videos_from_db) # type: ignore
self.videos.extend([Video.model_validate(vid) for vid in videos_from_db])
else:
self.videos = videos_from_db

@@ -2433,7 +2433,7 @@ def delete_session(self, session_id: str):
# Handle images and videos
###########################################################################

def add_image(self, image: Union[str, Dict]) -> None:
def add_image(self, image: Image) -> None:
if self.images is None:
self.images = []
self.images.append(image)
@@ -2442,7 +2442,7 @@ def add_image(self, image: Union[str, Dict]) -> None:
self.run_response.images = []
self.run_response.images.append(image)

def add_video(self, video: Union[str, Dict]) -> None:
def add_video(self, video: Video) -> None:
if self.videos is None:
self.videos = []
self.videos.append(video)
@@ -2451,10 +2451,10 @@ def add_video(self, video: Union[str, Dict]) -> None:
self.run_response.videos = []
self.run_response.videos.append(video)

def get_images(self) -> Optional[List[Union[str, Dict]]]:
def get_images(self) -> Optional[List[Image]]:
return self.images

def get_videos(self) -> Optional[List[Union[str, Dict]]]:
def get_videos(self) -> Optional[List[Video]]:
return self.videos

###########################################################################
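
The add_image/add_video and get_images/get_videos methods now take and return the typed Image and Video models from phi.model.content (defined later in this diff). A minimal usage sketch, with illustrative values only:

from phi.agent import Agent
from phi.model.content import Video

# Illustrative only: attach a typed Video to an agent and read it back.
agent = Agent(name="Demo Agent")
agent.add_video(Video(id="vid_1", url="https://example.com/clip.mp4", original_prompt="a horse in the desert"))
for video in agent.get_videos() or []:
    print(video.id, video.url)
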
4 changes: 3 additions & 1 deletion phi/llm/openai/chat.py
@@ -181,7 +181,9 @@ def to_dict(self) -> Dict[str, Any]:
if self.presence_penalty:
_dict["presence_penalty"] = self.presence_penalty
if self.response_format:
_dict["response_format"] = self.response_format
_dict["response_format"] = (
self.response_format if isinstance(self.response_format, dict) else str(self.response_format)
)
if self.seed is not None:
_dict["seed"] = self.seed
if self.stop:
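
The same guard appears in phi/model/openai/chat.py below. As far as the diff shows, the intent is to keep to_dict() JSON-serializable when response_format is not a plain dict (for example a Pydantic model class used for structured output). A small sketch under that assumption, using a hypothetical schema:

from pydantic import BaseModel

class MovieScript(BaseModel):  # hypothetical structured-output schema
    title: str

# Mirrors the guard above: dicts pass through, anything else is stringified.
for response_format in ({"type": "json_object"}, MovieScript):
    serialized = response_format if isinstance(response_format, dict) else str(response_format)
    print(serialized)
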
18 changes: 18 additions & 0 deletions phi/model/content.py
@@ -0,0 +1,18 @@
from typing import Optional

from pydantic import BaseModel


class Video(BaseModel):
id: str
url: str
original_prompt: Optional[str] = None
revised_prompt: Optional[str] = None
eta: Optional[str] = None


class Image(BaseModel):
id: str
url: str
original_prompt: Optional[str] = None
revised_prompt: Optional[str] = None
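
A short sketch of how these models round-trip through agent storage, matching the get_agent_data and from_agent_session changes in phi/agent/agent.py above (all values are illustrative):

from phi.model.content import Image, Video

image = Image(id="img_1", url="https://example.com/scone.png", original_prompt="a scone on a table")
video = Video(id="vid_1", url="https://example.com/clip.mp4", eta="30")

# get_agent_data() dumps the models to plain dicts for storage...
stored = {"images": [image.model_dump()], "videos": [video.model_dump()]}

# ...and from_agent_session() validates them back into typed objects.
restored_images = [Image.model_validate(img) for img in stored["images"]]
print(restored_images[0].url)
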
4 changes: 3 additions & 1 deletion phi/model/openai/chat.py
@@ -255,7 +255,9 @@ def to_dict(self) -> Dict[str, Any]:
if self.presence_penalty is not None:
model_dict["presence_penalty"] = self.presence_penalty
if self.response_format is not None:
model_dict["response_format"] = self.response_format
model_dict["response_format"] = (
self.response_format if isinstance(self.response_format, dict) else str(self.response_format)
)
if self.seed is not None:
model_dict["seed"] = self.seed
if self.stop is not None:
5 changes: 5 additions & 0 deletions phi/model/response.py
@@ -23,3 +23,8 @@ class ModelResponse:
tool_call: Optional[Dict[str, Any]] = None
event: str = ModelResponseEvent.assistant_response.value
created_at: int = int(time())


class FileType(str, Enum):
MP4 = "mp4"
GIF = "gif"
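
The playground cookbook above already parameterises ModelsLabs with this enum; a brief illustration of the two values:

from phi.model.response import FileType
from phi.tools.models_labs import ModelsLabs

# As in cookbook/playground/multimodal_agent.py: choose the output format per agent.
gif_tools = ModelsLabs(wait_for_completion=True, file_type=FileType.GIF)
video_tools = ModelsLabs(wait_for_completion=True, file_type=FileType.MP4)
print(FileType.MP4.value, FileType.GIF.value)  # -> mp4 gif
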
7 changes: 4 additions & 3 deletions phi/run/response.py
@@ -1,9 +1,10 @@
from time import time
from enum import Enum
from typing import Optional, Any, Dict, List, Union
from typing import Optional, Any, Dict, List

from pydantic import BaseModel, ConfigDict, Field

from phi.model.content import Video, Image
from phi.reasoning.step import ReasoningStep
from phi.model.message import Message, MessageReferences

@@ -48,8 +49,8 @@ class RunResponse(BaseModel):
session_id: Optional[str] = None
workflow_id: Optional[str] = None
tools: Optional[List[Dict[str, Any]]] = None
images: Optional[List[Union[str, Dict[str, Any]]]] = None
videos: Optional[List[Union[str, Dict[str, Any]]]] = None
images: Optional[List[Image]] = None
videos: Optional[List[Video]] = None
audio: Optional[Dict] = None
extra_data: Optional[RunResponseExtraData] = None
created_at: int = Field(default_factory=lambda: int(time()))
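
With RunResponse.images and RunResponse.videos now typed, downstream code can read media attributes directly. A hedged sketch reusing the Replicate video agent from the cookbook above:

# Illustrative only: run a cookbook agent and inspect the typed media on the response.
response = video_agent.run("Generate a video of a horse in the desert.")
for video in response.videos or []:
    print(video.id, video.url)
for image in response.images or []:
    print(image.id, image.url)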