Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/my-website/docs/caching/all_caches.md
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ litellm.cache = Cache(
similarity_threshold=0.7, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity
qdrant_quantization_config ="binary", # can be one of 'binary', 'product' or 'scalar' quantizations that is supported by qdrant
qdrant_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
qdrant_semantic_cache_vector_size=1536, # vector size for the embedding model, must match the dimensionality of the embedding model used
)

response1 = completion(
Expand Down Expand Up @@ -635,6 +636,7 @@ def __init__(
qdrant_quantization_config: Optional[str] = None,
qdrant_semantic_cache_embedding_model="text-embedding-ada-002",

qdrant_semantic_cache_vector_size: Optional[int] = None,
**kwargs
):
```
Expand Down
1 change: 1 addition & 0 deletions docs/my-website/docs/proxy/caching.md
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ litellm_settings:
qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
qdrant_collection_name: test_collection
qdrant_quantization_config: binary
qdrant_semantic_cache_vector_size: 1536 # vector size must match embedding model dimensionality
similarity_threshold: 0.8 # similarity threshold for semantic cache
```

Expand Down
1 change: 1 addition & 0 deletions docs/my-website/docs/proxy/config_settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ litellm_settings:
qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
qdrant_collection_name: test_collection
qdrant_quantization_config: binary
qdrant_semantic_cache_vector_size: 1536 # vector size must match embedding model dimensionality
similarity_threshold: 0.8 # similarity threshold for semantic cache

# Optional - S3 Cache Settings
Expand Down
2 changes: 2 additions & 0 deletions litellm/caching/caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def __init__(
qdrant_collection_name: Optional[str] = None,
qdrant_quantization_config: Optional[str] = None,
qdrant_semantic_cache_embedding_model: str = "text-embedding-ada-002",
qdrant_semantic_cache_vector_size: Optional[int] = None,
# GCP IAM authentication parameters
gcp_service_account: Optional[str] = None,
gcp_ssl_ca_certs: Optional[str] = None,
Expand Down Expand Up @@ -207,6 +208,7 @@ def __init__(
similarity_threshold=similarity_threshold,
quantization_config=qdrant_quantization_config,
embedding_model=qdrant_semantic_cache_embedding_model,
vector_size=qdrant_semantic_cache_vector_size,
)
elif type == LiteLLMCacheType.LOCAL:
self.cache = InMemoryCache()
Expand Down
4 changes: 3 additions & 1 deletion litellm/caching/qdrant_semantic_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__( # noqa: PLR0915
quantization_config=None,
embedding_model="text-embedding-ada-002",
host_type=None,
vector_size=None,
):
import os

Expand All @@ -53,6 +54,7 @@ def __init__( # noqa: PLR0915
raise Exception("similarity_threshold must be provided, passed None")
self.similarity_threshold = similarity_threshold
self.embedding_model = embedding_model
self.vector_size = vector_size if vector_size is not None else QDRANT_VECTOR_SIZE
headers = {}

# check if defined as os.environ/ variable
Expand Down Expand Up @@ -138,7 +140,7 @@ def __init__( # noqa: PLR0915
new_collection_status = self.sync_client.put(
url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
json={
"vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"},
"vectors": {"size": self.vector_size, "distance": "Cosine"},
"quantization_config": quantization_params,
},
headers=self.headers,
Expand Down
129 changes: 128 additions & 1 deletion tests/test_litellm/caching/test_qdrant_semantic_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,4 +408,131 @@ async def test_qdrant_semantic_cache_async_set_cache():
)

# Verify async upsert was called
qdrant_cache.async_client.put.assert_called()
qdrant_cache.async_client.put.assert_called()

def test_qdrant_semantic_cache_custom_vector_size():
    """
    Test that QdrantSemanticCache uses a custom vector_size when creating a new collection.

    The vector size passed to the constructor (768 here) must appear in the
    Qdrant collection-creation payload instead of the default 1536.
    """
    with patch("litellm.llms.custom_httpx.http_handler._get_httpx_client") as sync_client_patch, \
            patch("litellm.llms.custom_httpx.http_handler.get_async_httpx_client") as async_client_patch:

        # Existence check reports the collection is missing, forcing a create.
        exists_resp = MagicMock()
        exists_resp.status_code = 200
        exists_resp.json.return_value = {"result": {"exists": False}}

        # Successful collection-creation response.
        create_resp = MagicMock()
        create_resp.status_code = 200
        create_resp.json.return_value = {"result": True}

        # Details lookup performed after creation.
        details_resp = MagicMock()
        details_resp.status_code = 200
        details_resp.json.return_value = {"result": {"status": "ok"}}

        sync_http = MagicMock()
        sync_http.get.side_effect = [exists_resp, details_resp]
        sync_http.put.return_value = create_resp
        sync_client_patch.return_value = sync_http

        from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache

        # Construct the cache with a non-default vector size of 768.
        cache = QdrantSemanticCache(
            collection_name="test_collection_768",
            qdrant_api_base="http://test.qdrant.local",
            qdrant_api_key="test_key",
            similarity_threshold=0.8,
            vector_size=768,
        )

        # The constructor stores the custom size on the instance ...
        assert cache.vector_size == 768

        # ... and forwards it in the PUT payload that creates the collection.
        put_call = sync_http.put.call_args
        assert put_call is not None
        payload = put_call.kwargs.get("json") or put_call[1].get("json")
        assert payload["vectors"]["size"] == 768
        assert payload["vectors"]["distance"] == "Cosine"


def test_qdrant_semantic_cache_default_vector_size():
    """
    Test that QdrantSemanticCache falls back to the QDRANT_VECTOR_SIZE constant
    (1536) when no vector_size argument is supplied, storing it as self.vector_size.
    """
    with patch("litellm.llms.custom_httpx.http_handler._get_httpx_client") as sync_client_patch, \
            patch("litellm.llms.custom_httpx.http_handler.get_async_httpx_client") as async_client_patch:

        # The collection already exists, so no creation PUT should be needed.
        exists_resp = MagicMock()
        exists_resp.status_code = 200
        exists_resp.json.return_value = {"result": {"exists": True}}

        sync_http = MagicMock()
        sync_http.get.return_value = exists_resp
        sync_client_patch.return_value = sync_http

        from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache
        from litellm.constants import QDRANT_VECTOR_SIZE

        # Construct without an explicit vector_size argument.
        cache = QdrantSemanticCache(
            collection_name="test_collection",
            qdrant_api_base="http://test.qdrant.local",
            qdrant_api_key="test_key",
            similarity_threshold=0.8,
        )

        # The default constant must be used when nothing was passed.
        assert cache.vector_size == QDRANT_VECTOR_SIZE


def test_qdrant_semantic_cache_large_vector_size():
    """
    Test that QdrantSemanticCache supports large embedding dimensions (e.g. 4096, 8192)
    for models like Stella, bge-en-icl, etc.

    Mirrors test_qdrant_semantic_cache_custom_vector_size: the collection does not
    exist yet, so the cache must create it with the requested vector size.
    """
    with patch("litellm.llms.custom_httpx.http_handler._get_httpx_client") as mock_sync_client, \
            patch("litellm.llms.custom_httpx.http_handler.get_async_httpx_client") as mock_async_client:

        # Mock the collection does NOT exist (so it will be created)
        mock_exists_response = MagicMock()
        mock_exists_response.status_code = 200
        mock_exists_response.json.return_value = {"result": {"exists": False}}

        # Mock the successful collection-creation response
        mock_create_response = MagicMock()
        mock_create_response.status_code = 200
        mock_create_response.json.return_value = {"result": True}

        # Mock the collection details response after creation
        mock_details_response = MagicMock()
        mock_details_response.status_code = 200
        mock_details_response.json.return_value = {"result": {"status": "ok"}}

        mock_sync_client_instance = MagicMock()
        mock_sync_client_instance.get.side_effect = [mock_exists_response, mock_details_response]
        mock_sync_client_instance.put.return_value = mock_create_response
        mock_sync_client.return_value = mock_sync_client_instance

        from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache

        # Initialize with a large vector_size of 4096
        qdrant_cache = QdrantSemanticCache(
            collection_name="test_collection_4096",
            qdrant_api_base="http://test.qdrant.local",
            qdrant_api_key="test_key",
            similarity_threshold=0.8,
            vector_size=4096,
        )

        assert qdrant_cache.vector_size == 4096

        # Verify the collection was created with 4096.
        put_call = mock_sync_client_instance.put.call_args
        # Guard before dereferencing: a missing PUT should fail as a clean
        # assertion, not a TypeError on None (matches the custom-size test).
        assert put_call is not None
        create_payload = put_call.kwargs.get("json") or put_call[1].get("json")
        assert create_payload["vectors"]["size"] == 4096
        # Distance metric must still be Cosine regardless of vector size.
        assert create_payload["vectors"]["distance"] == "Cosine"
Loading