Merge pull request #1532 from phidatahq/lumalabs-video-generation
Add LumaLabs Video Generation
dirkbrnd authored Dec 11, 2024
2 parents 6a60da1 + d10bc9c commit 51e8ac0
Showing 5 changed files with 223 additions and 7 deletions.
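At a glance, the new toolkit plugs into an agent like any other phi tool. Here is a minimal sketch, condensed from the cookbook example added below; it assumes the `lumaai` package is installed and `LUMAAI_API_KEY` is set:

from phi.agent import Agent
from phi.llm.openai import OpenAIChat
from phi.tools.lumalab import LumaLabTools

# Minimal wiring; see cookbook/tools/lumalabs_tool.py below for the full version.
luma_agent = Agent(
    llm=OpenAIChat(model="gpt-4o"),
    tools=[LumaLabTools()],  # exposes generate_video and image_to_video
    show_tool_calls=True,
)
luma_agent.run("Generate a video of a car in a sky")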
9 changes: 4 additions & 5 deletions cookbook/playground/multimodal_agent.py
@@ -14,7 +14,6 @@
 from phi.playground import Playground, serve_playground_app
 from phi.storage.agent.sqlite import SqlAgentStorage
 from phi.tools.fal_tools import FalTools
-from pydantic import BaseModel, Field

image_agent_storage_file: str = "tmp/image_agent.db"

@@ -26,7 +25,7 @@
     description="You are an AI agent that can generate images using DALL-E.",
     instructions=[
         "When the user asks you to create an image, use the `create_image` tool to create the image.",
-        "Don't provide the URL of the image in the response. Only describe what image was generated."
+        "Don't provide the URL of the image in the response. Only describe what image was generated.",
     ],
     markdown=True,
     debug_mode=True,
@@ -43,7 +42,7 @@
     description="You are an AI agent that can generate gifs using the ModelsLabs API.",
     instructions=[
         "When the user asks you to create an image, use the `generate_media` tool to create the image.",
-        "Don't provide the URL of the image in the response. Only describe what image was generated."
+        "Don't provide the URL of the image in the response. Only describe what image was generated.",
     ],
     markdown=True,
     debug_mode=True,
@@ -60,7 +59,7 @@
     description="You are an AI agent that can generate videos using the ModelsLabs API.",
     instructions=[
         "When the user asks you to create a video, use the `generate_media` tool to create the video.",
-        "Don't provide the URL of the video in the response. Only describe what video was generated."
+        "Don't provide the URL of the video in the response. Only describe what video was generated.",
     ],
     markdown=True,
     debug_mode=True,
@@ -77,7 +76,7 @@
     description="You are an AI agent that can generate videos using the Fal API.",
     instructions=[
         "When the user asks you to create a video, use the `generate_media` tool to create the video.",
-        "Don't provide the URL of the video in the response. Only describe what video was generated."
+        "Don't provide the URL of the video in the response. Only describe what video was generated.",
     ],
     markdown=True,
     debug_mode=True,
45 changes: 45 additions & 0 deletions cookbook/tools/lumalabs_tool.py
@@ -0,0 +1,45 @@
from phi.agent import Agent
from phi.llm.openai import OpenAIChat
from phi.tools.lumalab import LumaLabTools

"""Create an agent specialized for Luma AI video generation"""

luma_agent = Agent(
name="Luma Video Agent",
agent_id="luma-video-agent",
llm=OpenAIChat(model="gpt-4o"),
tools=[LumaLabTools()], # Using the LumaLab tool we created
markdown=True,
debug_mode=True,
show_tool_calls=True,
instructions=[
"You are an agent designed to generate videos using the Luma AI API.",
"You can generate videos in two ways:",
"1. Text-to-Video Generation:",
" - Use the generate_video function for creating videos from text prompts",
" - Default parameters: loop=False, aspect_ratio='16:9', keyframes=None",
"2. Image-to-Video Generation:",
" - Use the image_to_video function when starting from one or two images",
" - Required parameters: prompt, start_image_url",
" - Optional parameters: end_image_url, loop=False, aspect_ratio='16:9'",
" - The image URLs must be publicly accessible",
"Choose the appropriate function based on whether the user provides image URLs or just a text prompt.",
"The video will be displayed in the UI automatically below your response, so you don't need to show the video URL in your response.",
"Politely and courteously let the user know that the video has been generated and will be displayed below as soon as its ready.",
"After generating any video, if generation is async (wait_for_completion=False), inform about the generation ID",
],
system_message=(
"Use generate_video for text-to-video requests and image_to_video for image-based "
"generation. Don't modify default parameters unless specifically requested. "
"Always provide clear feedback about the video generation status."
),
)

luma_agent.run("Generate a video of a car in a sky")
# luma_agent.run("Transform this image into a video of a tiger walking: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/Walking_tiger_female.jpg/1920px-Walking_tiger_female.jpg")
# luma_agent.run("""
# Create a transition video between these two images:
# Start: https://img.freepik.com/premium-photo/car-driving-dark-forest-generative-ai_634053-6661.jpg?w=1380
# End: https://img.freepik.com/free-photo/front-view-black-luxury-sedan-road_114579-5030.jpg?t=st=1733821884~exp=1733825484~hmac=735ca584a9b985c53875fc1ad343c3fd394e1de4db49e5ab1a9ab37ac5f91a36&w=1380
# Make it a smooth, natural movement
# """)
4 changes: 3 additions & 1 deletion phi/llm/openai/chat.py
@@ -181,7 +181,9 @@ def to_dict(self) -> Dict[str, Any]:
         if self.presence_penalty:
             _dict["presence_penalty"] = self.presence_penalty
         if self.response_format:
-            _dict["response_format"] = self.response_format if isinstance(self.response_format, dict) else str(self.response_format)
+            _dict["response_format"] = (
+                self.response_format if isinstance(self.response_format, dict) else str(self.response_format)
+            )
         if self.seed is not None:
             _dict["seed"] = self.seed
         if self.stop:
4 changes: 3 additions & 1 deletion phi/model/openai/chat.py
@@ -255,7 +255,9 @@ def to_dict(self) -> Dict[str, Any]:
         if self.presence_penalty is not None:
             model_dict["presence_penalty"] = self.presence_penalty
         if self.response_format is not None:
-            model_dict["response_format"] = self.response_format if isinstance(self.response_format, dict) else str(self.response_format)
+            model_dict["response_format"] = (
+                self.response_format if isinstance(self.response_format, dict) else str(self.response_format)
+            )
         if self.seed is not None:
             model_dict["seed"] = self.seed
         if self.stop is not None:
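Both chat.py hunks are formatting-only: the response_format serialization is wrapped across lines without changing behavior. The branch they reformat passes a dict through as-is and stringifies anything else; a quick illustration (assuming the field is settable via the constructor, as with phi's other pydantic-based models):

from phi.llm.openai import OpenAIChat

# A dict response_format is passed through unchanged by to_dict();
# any non-dict value would be stringified instead.
llm = OpenAIChat(model="gpt-4o", response_format={"type": "json_object"})
print(llm.to_dict()["response_format"])  # {'type': 'json_object'}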
168 changes: 168 additions & 0 deletions phi/tools/lumalab.py
@@ -0,0 +1,168 @@
import time
import uuid
from os import getenv
from typing import Optional, Dict, Any, Literal, TypedDict

from phi.agent import Agent
from phi.tools import Toolkit
from phi.utils.log import logger
from phi.model.content import Video

try:
from lumaai import LumaAI # type: ignore
except ImportError:
raise ImportError("`lumaai` not installed. Please install using `pip install lumaai`")


# Define types for keyframe structure
class KeyframeImage(TypedDict):
type: Literal["image"]
url: str


Keyframes = Dict[str, KeyframeImage]


class LumaLabTools(Toolkit):
    def __init__(
        self,
        api_key: Optional[str] = None,
        wait_for_completion: bool = True,
        poll_interval: int = 3,
        max_wait_time: int = 300,  # 5 minutes
    ):
        super().__init__(name="luma_lab")

        self.wait_for_completion = wait_for_completion
        self.poll_interval = poll_interval
        self.max_wait_time = max_wait_time
        self.api_key = api_key or getenv("LUMAAI_API_KEY")

        if not self.api_key:
            logger.error("LUMAAI_API_KEY not set. Please set the LUMAAI_API_KEY environment variable.")

        self.client = LumaAI(auth_token=self.api_key)
        self.register(self.generate_video)
        self.register(self.image_to_video)

    def image_to_video(
        self,
        agent: Agent,
        prompt: str,
        start_image_url: str,
        end_image_url: Optional[str] = None,
        loop: bool = False,
        aspect_ratio: Literal["1:1", "16:9", "9:16", "4:3", "3:4", "21:9", "9:21"] = "16:9",
    ) -> str:
        """Generate a video from one or two images with a prompt.

        Args:
            agent: The agent instance
            prompt: Text description of the desired video
            start_image_url: URL of the starting image
            end_image_url: Optional URL of the ending image
            loop: Whether the video should loop
            aspect_ratio: Aspect ratio of the output video

        Returns:
            str: Status message or error
        """

        try:
            # Construct keyframes
            keyframes: Dict[str, Dict[str, str]] = {"frame0": {"type": "image", "url": start_image_url}}

            # Add end image if provided
            if end_image_url:
                keyframes["frame1"] = {"type": "image", "url": end_image_url}

            # Create generation with keyframes
            generation = self.client.generations.create(
                prompt=prompt,
                loop=loop,
                aspect_ratio=aspect_ratio,
                keyframes=keyframes,  # type: ignore
            )

            video_id = str(uuid.uuid4())

            if not self.wait_for_completion:
                return "Async generation unsupported"

            # Poll for completion
            seconds_waited = 0
            while seconds_waited < self.max_wait_time:
                if not generation or not generation.id:
                    return "Failed to get generation ID"

                generation = self.client.generations.get(generation.id)

                if generation.state == "completed" and generation.assets:
                    video_url = generation.assets.video
                    if video_url:
                        agent.add_video(Video(id=video_id, url=video_url, eta="completed"))
                        return f"Video generated successfully: {video_url}"
                elif generation.state == "failed":
                    return f"Generation failed: {generation.failure_reason}"

                logger.info(f"Generation in progress... State: {generation.state}")
                time.sleep(self.poll_interval)
                seconds_waited += self.poll_interval

            return f"Video generation timed out after {self.max_wait_time} seconds"

        except Exception as e:
            logger.error(f"Failed to generate video: {e}")
            return f"Error: {e}"

    def generate_video(
        self,
        agent: Agent,
        prompt: str,
        loop: bool = False,
        aspect_ratio: Literal["1:1", "16:9", "9:16", "4:3", "3:4", "21:9", "9:21"] = "16:9",
        keyframes: Optional[Dict[str, Dict[str, str]]] = None,
    ) -> str:
        """Use this function to generate a video given a prompt."""

        try:
            generation_params: Dict[str, Any] = {
                "prompt": prompt,
                "loop": loop,
                "aspect_ratio": aspect_ratio,
            }

            if keyframes is not None:
                generation_params["keyframes"] = keyframes

            generation = self.client.generations.create(**generation_params)  # type: ignore

            video_id = str(uuid.uuid4())
            if not self.wait_for_completion:
                return "Async generation unsupported"

            # Poll for completion
            seconds_waited = 0
            while seconds_waited < self.max_wait_time:
                if not generation or not generation.id:
                    return "Failed to get generation ID"

                generation = self.client.generations.get(generation.id)

                if generation.state == "completed" and generation.assets:
                    video_url = generation.assets.video
                    if video_url:
                        agent.add_video(Video(id=video_id, url=video_url, state="completed"))
                        return f"Video generated successfully: {video_url}"
                elif generation.state == "failed":
                    return f"Generation failed: {generation.failure_reason}"

                logger.info(f"Generation in progress... State: {generation.state}")
                time.sleep(self.poll_interval)
                seconds_waited += self.poll_interval

            return f"Video generation timed out after {self.max_wait_time} seconds"

        except Exception as e:
            logger.error(f"Failed to generate video: {e}")
            return f"Error: {e}"
