Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/my-website/docs/caching/all_caches.md
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ litellm.cache = Cache(
similarity_threshold=0.7, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity
qdrant_quantization_config ="binary", # can be one of 'binary', 'product' or 'scalar' quantizations that is supported by qdrant
qdrant_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here
qdrant_semantic_cache_vector_size=1536, # vector size for the embedding model, must match the dimensionality of the embedding model used
)

response1 = completion(
Expand Down Expand Up @@ -635,6 +636,7 @@ def __init__(
qdrant_quantization_config: Optional[str] = None,
qdrant_semantic_cache_embedding_model="text-embedding-ada-002",

qdrant_semantic_cache_vector_size: Optional[int] = None,
**kwargs
):
```
Expand Down
1 change: 1 addition & 0 deletions docs/my-website/docs/proxy/caching.md
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,7 @@ litellm_settings:
qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
qdrant_collection_name: test_collection
qdrant_quantization_config: binary
qdrant_semantic_cache_vector_size: 1536 # vector size must match embedding model dimensionality
similarity_threshold: 0.8 # similarity threshold for semantic cache
```

Expand Down
1 change: 1 addition & 0 deletions docs/my-website/docs/proxy/config_settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ litellm_settings:
qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list
qdrant_collection_name: test_collection
qdrant_quantization_config: binary
qdrant_semantic_cache_vector_size: 1536 # vector size must match embedding model dimensionality
similarity_threshold: 0.8 # similarity threshold for semantic cache

# Optional - S3 Cache Settings
Expand Down
2 changes: 2 additions & 0 deletions litellm/caching/caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def __init__(
qdrant_collection_name: Optional[str] = None,
qdrant_quantization_config: Optional[str] = None,
qdrant_semantic_cache_embedding_model: str = "text-embedding-ada-002",
qdrant_semantic_cache_vector_size: Optional[int] = None,
# GCP IAM authentication parameters
gcp_service_account: Optional[str] = None,
gcp_ssl_ca_certs: Optional[str] = None,
Expand Down Expand Up @@ -207,6 +208,7 @@ def __init__(
similarity_threshold=similarity_threshold,
quantization_config=qdrant_quantization_config,
embedding_model=qdrant_semantic_cache_embedding_model,
vector_size=qdrant_semantic_cache_vector_size,
)
elif type == LiteLLMCacheType.LOCAL:
self.cache = InMemoryCache()
Expand Down
4 changes: 3 additions & 1 deletion litellm/caching/qdrant_semantic_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__( # noqa: PLR0915
quantization_config=None,
embedding_model="text-embedding-ada-002",
host_type=None,
vector_size=None,
):
import os

Expand All @@ -53,6 +54,7 @@ def __init__( # noqa: PLR0915
raise Exception("similarity_threshold must be provided, passed None")
self.similarity_threshold = similarity_threshold
self.embedding_model = embedding_model
self.vector_size = vector_size if vector_size is not None else QDRANT_VECTOR_SIZE
headers = {}

# check if defined as os.environ/ variable
Expand Down Expand Up @@ -138,7 +140,7 @@ def __init__( # noqa: PLR0915
new_collection_status = self.sync_client.put(
url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
json={
"vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"},
"vectors": {"size": self.vector_size, "distance": "Cosine"},
"quantization_config": quantization_params,
},
headers=self.headers,
Expand Down
129 changes: 128 additions & 1 deletion tests/test_litellm/caching/test_qdrant_semantic_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,4 +408,131 @@ async def test_qdrant_semantic_cache_async_set_cache():
)

# Verify async upsert was called
qdrant_cache.async_client.put.assert_called()
qdrant_cache.async_client.put.assert_called()

def test_qdrant_semantic_cache_custom_vector_size():
    """
    Test that QdrantSemanticCache uses a custom vector_size when creating a new collection.

    The vector size passed to the constructor (768 here) must appear in the
    Qdrant collection-creation payload instead of the default 1536.
    """
    with patch("litellm.llms.custom_httpx.http_handler._get_httpx_client") as sync_client_patch, \
            patch("litellm.llms.custom_httpx.http_handler.get_async_httpx_client") as async_client_patch:

        # Existence check reports the collection is missing, forcing a create.
        exists_resp = MagicMock()
        exists_resp.status_code = 200
        exists_resp.json.return_value = {"result": {"exists": False}}

        # Successful collection-creation response.
        create_resp = MagicMock()
        create_resp.status_code = 200
        create_resp.json.return_value = {"result": True}

        # Details lookup performed after creation.
        details_resp = MagicMock()
        details_resp.status_code = 200
        details_resp.json.return_value = {"result": {"status": "ok"}}

        sync_http = MagicMock()
        sync_http.get.side_effect = [exists_resp, details_resp]
        sync_http.put.return_value = create_resp
        sync_client_patch.return_value = sync_http

        from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache

        # Construct the cache with a non-default vector size of 768.
        cache = QdrantSemanticCache(
            collection_name="test_collection_768",
            qdrant_api_base="http://test.qdrant.local",
            qdrant_api_key="test_key",
            similarity_threshold=0.8,
            vector_size=768,
        )

        # The constructor stores the custom size on the instance ...
        assert cache.vector_size == 768

        # ... and forwards it in the PUT payload that creates the collection.
        put_call = sync_http.put.call_args
        assert put_call is not None
        payload = put_call.kwargs.get("json") or put_call[1].get("json")
        assert payload["vectors"]["size"] == 768
        assert payload["vectors"]["distance"] == "Cosine"


def test_qdrant_semantic_cache_default_vector_size():
    """
    Test that QdrantSemanticCache falls back to the QDRANT_VECTOR_SIZE constant
    (1536) when no vector_size argument is supplied, storing it as self.vector_size.
    """
    with patch("litellm.llms.custom_httpx.http_handler._get_httpx_client") as sync_client_patch, \
            patch("litellm.llms.custom_httpx.http_handler.get_async_httpx_client") as async_client_patch:

        # The collection already exists, so no creation PUT should be needed.
        exists_resp = MagicMock()
        exists_resp.status_code = 200
        exists_resp.json.return_value = {"result": {"exists": True}}

        sync_http = MagicMock()
        sync_http.get.return_value = exists_resp
        sync_client_patch.return_value = sync_http

        from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache
        from litellm.constants import QDRANT_VECTOR_SIZE

        # Construct without an explicit vector_size argument.
        cache = QdrantSemanticCache(
            collection_name="test_collection",
            qdrant_api_base="http://test.qdrant.local",
            qdrant_api_key="test_key",
            similarity_threshold=0.8,
        )

        # The default constant must be used when nothing was passed.
        assert cache.vector_size == QDRANT_VECTOR_SIZE


def test_qdrant_semantic_cache_large_vector_size():
    """
    Test that QdrantSemanticCache supports large embedding dimensions (e.g. 4096, 8192)
    for models like Stella, bge-en-icl, etc.

    Mirrors test_qdrant_semantic_cache_custom_vector_size: the collection does not
    exist yet, so the cache must create it with the requested vector size.
    """
    with patch("litellm.llms.custom_httpx.http_handler._get_httpx_client") as mock_sync_client, \
            patch("litellm.llms.custom_httpx.http_handler.get_async_httpx_client") as mock_async_client:

        # Mock the collection does NOT exist (so it will be created)
        mock_exists_response = MagicMock()
        mock_exists_response.status_code = 200
        mock_exists_response.json.return_value = {"result": {"exists": False}}

        # Mock the successful collection-creation response
        mock_create_response = MagicMock()
        mock_create_response.status_code = 200
        mock_create_response.json.return_value = {"result": True}

        # Mock the collection details response after creation
        mock_details_response = MagicMock()
        mock_details_response.status_code = 200
        mock_details_response.json.return_value = {"result": {"status": "ok"}}

        mock_sync_client_instance = MagicMock()
        mock_sync_client_instance.get.side_effect = [mock_exists_response, mock_details_response]
        mock_sync_client_instance.put.return_value = mock_create_response
        mock_sync_client.return_value = mock_sync_client_instance

        from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache

        # Initialize with a large vector_size of 4096
        qdrant_cache = QdrantSemanticCache(
            collection_name="test_collection_4096",
            qdrant_api_base="http://test.qdrant.local",
            qdrant_api_key="test_key",
            similarity_threshold=0.8,
            vector_size=4096,
        )

        assert qdrant_cache.vector_size == 4096

        # Verify the collection was created with 4096.
        put_call = mock_sync_client_instance.put.call_args
        # Guard before dereferencing: a missing PUT should fail as a clean
        # assertion, not a TypeError on None (matches the custom-size test).
        assert put_call is not None
        create_payload = put_call.kwargs.get("json") or put_call[1].get("json")
        assert create_payload["vectors"]["size"] == 4096
        # Distance metric must still be Cosine regardless of vector size.
        assert create_payload["vectors"]["distance"] == "Cosine"
Loading