Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multimodality #967

Merged
merged 19 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add: Check support for image uri and url
  • Loading branch information
Pingdred committed Nov 6, 2024
commit e1bf0dd22cfd6898e7a4f797b5c9d23745eb3fa4
72 changes: 48 additions & 24 deletions core/cat/looking_glass/cheshire_cat.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import time
import base64
from typing import List, Dict
from typing_extensions import Protocol

Expand Down Expand Up @@ -128,45 +129,64 @@ def load_language_model(self) -> BaseLanguageModel:
"""

selected_llm = crud.get_setting_by_name(name="llm_selected")
self._llm_modalities = {"image": False, "audio": False}
self._llm_modalities = {"image_uri": False, "image_url": False, "audio": False}
Pingdred marked this conversation as resolved.
Show resolved Hide resolved

def _get_black_pixel_data() -> str:
    """Return the base64 data URI for a black pixel PNG image.

    The URI is assembled from adjacent string literals so that it contains
    no line breaks or indentation: a triple-quoted literal would embed the
    source file's newlines in the base64 payload.

    Returns:
        A ``data:image/png;base64,...`` string without any whitespace.
    """
    return (
        "data:image/png;base64,"
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAA"
        "AfFcSJAAAADUlEQVQIW2NgYGD4DwABBAEAwS2OU"
        "AAAAABJRU5ErkJggg=="
    )
def _load_test_image(path: str = "cat/loading_cat.jpg") -> str:
    """Return the base64 data for the test image.

    Args:
        path: Location of the image file to encode. The default is a
            relative path, so it is resolved against the process working
            directory.

    Returns:
        The file's bytes encoded as base64 text (no ``data:`` prefix).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")

def _check_image_support(llm, image_type: str, image_value: str) -> None:
    """Test the LLM to check if it supports image input of a specified type.

    A message made of one image part and the text ``Respond with `MEOW`.``
    is sent to the model. If the call completes without raising, the
    modality is marked as supported in ``self._llm_modalities``.

    NOTE(review): ``self``, ``selected_llm``, ``crud``, ``log`` and
    ``HumanMessage`` are not defined in this function; they are presumably
    provided by the enclosing scope / module - confirm against the full file.

    Args:
        llm: Model object exposing ``invoke``.
        image_type: ``"image_uri"`` (``image_value`` is a base64 payload that
            gets wrapped in a JPEG data URI) or ``"image_url"``
            (``image_value`` is used as the URL as-is).
        image_value: Base64 payload or URL, depending on ``image_type``.

    Raises:
        ValueError: If ``image_type`` is not one of the two supported values.
    """
    # Reject unknown types up front instead of failing later on an
    # unbound local when the message content is assembled.
    if image_type == "image_uri":
        url = f"data:image/jpeg;base64,{image_value}"
    elif image_type == "image_url":
        url = image_value
    else:
        raise ValueError(
            f"Unsupported image_type {image_type!r}; "
            "expected 'image_uri' or 'image_url'."
        )

    content = [
        {
            "type": "image_url",
            "image_url": {"url": url},
        },
        {
            "type": "text",
            "text": "Respond with `MEOW`.",
        },
    ]
    message = HumanMessage(content=content)

    # Retrieve model information (only used for the log message below)
    selected_llm_class = selected_llm["value"]["name"]
    selected_llm_config = crud.get_setting_by_name(name=selected_llm_class)
    config_values = selected_llm_config["value"]
    model_name = config_values.get("model_name") or config_values.get("model")

    # Perform the image support check: any failure of the call is taken
    # to mean the modality is not supported.
    try:
        llm.invoke([message])
    except Exception as e:
        log.warning(f"The LLM '{model_name}' does not support {image_type} as input images.")
        log.debug(e)
    else:
        self._llm_modalities[image_type] = True

# Wrapper functions for checking each type of image support
def _check_image_uri_support(llm) -> None:
    """Test LLM support for base64-encoded image input.

    Runs the ``"image_uri"`` probe of ``_check_image_support`` using the
    payload produced by ``_load_test_image``. Nothing is returned.

    Args:
        llm: Model object forwarded unchanged to ``_check_image_support``.
    """
    _check_image_support(llm, "image_uri", _load_test_image())

def _check_image_url_support(
    llm,
    image_url: str = "https://raw.githubusercontent.com/cheshire-cat-ai/core/refs/heads/main/readme/cheshire-cat.jpeg",
) -> None:
    """Test LLM support for URL-based image input.

    Runs the ``"image_url"`` probe of ``_check_image_support`` with a
    remote image address. Nothing is returned.

    Args:
        llm: Model object forwarded unchanged to ``_check_image_support``.
        image_url: Address of the image used for the probe. The default
            points at a remote host, so the probe presumably needs outbound
            network access from the model provider or the client - confirm.
    """
    _check_image_support(llm, "image_url", image_url)


def _initialize_llm(selected_llm):
Pingdred marked this conversation as resolved.
Show resolved Hide resolved
"""Initialize the LLM based on the selected settings."""
Expand All @@ -180,9 +200,13 @@ def _initialize_llm(selected_llm):

# Obtain configuration and instantiate LLM
selected_llm_config = crud.get_setting_by_name(name=selected_llm_class)
model_name = selected_llm_config["value"].get("model_name") or selected_llm_config["value"].get("model") or None
try:
llm = FactoryClass.get_llm_from_config(selected_llm_config["value"])
_check_image_suppot(llm)
_check_image_uri_support(llm)
_check_image_url_support(llm)
log.info(f"LLM {model_name} Supported modalities:")
log.info(self._llm_modalities)
return llm
except Exception:
import traceback
Expand Down
1 change: 1 addition & 0 deletions core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ dependencies = [
"langchain-openai==0.1.7",
"langchain-anthropic==0.1.23",
"langchain-google-genai==1.0.8",
"langchain-google-genai[images]",
Pingdred marked this conversation as resolved.
Show resolved Hide resolved
"langchain-cohere==0.1.5",
"huggingface-hub==0.20.3",
"beautifulsoup4==4.12.3",
Expand Down