Merged
6 changes: 6 additions & 0 deletions .env.example
@@ -312,6 +312,12 @@ REFRAG_DECODER_MODE=prompt # prompt|soft
# Set to 0 to use Docker CPU-only server (default, stable)
USE_GPU_DECODER=0

+# Llama.cpp decoder service configuration
+# Default: ghcr.io/ggml-org/llama.cpp:server (multi-arch)
+# CUDA support: ghcr.io/ggml-org/llama.cpp:server-cuda (for NVIDIA GPUs)
+# Alternative: local builds or custom images

Review comment: The comment says server-cuda is “ARM64 specific”, but CUDA-tagged images are typically for NVIDIA/CUDA (often x86_64) and may not be ARM64/multi-arch; this could mislead users into selecting an incompatible image.

Collaborator (Author): Updated comment.

+# LLAMACPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server

REFRAG_SOFT_SCALE=1.0

# Llama.cpp runtime tuning
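For anyone trying the new knob, a minimal sketch of an override, assuming a standard Compose workflow from the repo root and an NVIDIA host for the CUDA tag:

```bash
# Sketch: opt into the CUDA image (NVIDIA hosts only).
# LLAMACPP_IMAGE is the variable introduced above; the tag follows the
# ghcr.io/ggml-org/llama.cpp registry naming shown in the comments.
echo 'LLAMACPP_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda' >> .env
docker compose up -d llamacpp
```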
2 changes: 1 addition & 1 deletion Dockerfile.mcp
@@ -10,7 +10,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
# Python deps: reuse shared requirements file for consistency across services
# Create cache/rerank directories in same layer
COPY requirements.txt /tmp/requirements.txt
-RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt \
+RUN pip install --no-cache-dir --upgrade --timeout 300 --retries 3 -r /tmp/requirements.txt \
&& mkdir -p /tmp/cache && chmod 755 /tmp/cache \
&& mkdir -p /tmp/rerank_events /tmp/rerank_weights \
&& chmod 777 /tmp/rerank_events /tmp/rerank_weights
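The new flags guard the pip step against slow or flaky networks. A hedged sketch of the same settings expressed through pip's environment variables, e.g. when reproducing the install outside the Dockerfile (PIP_DEFAULT_TIMEOUT and PIP_RETRIES are pip's standard option mapping for --timeout and --retries):

```bash
# Mirror the Dockerfile's timeout/retry behaviour in a local shell.
export PIP_DEFAULT_TIMEOUT=300
export PIP_RETRIES=3
pip install --no-cache-dir --upgrade -r requirements.txt
```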
10 changes: 5 additions & 5 deletions docker-compose.yml
@@ -66,11 +66,11 @@ services:
container_name: redis-cache
ports:
- "6379:6379"
command: ["redis-server", "--appendonly", "yes", "--maxmemory", "256mb", "--maxmemory-policy", "allkeys-lru"]
command: [ "redis-server", "--appendonly", "yes", "--maxmemory", "256mb", "--maxmemory-policy", "allkeys-lru" ]
volumes:
- redis_data:/data
healthcheck:
test: ["CMD", "redis-cli", "ping"]
test: [ "CMD", "redis-cli", "ping" ]
interval: 10s
timeout: 5s
retries: 5
@@ -436,7 +436,7 @@ services:

# Llama.cpp decoder service - same as base compose
llamacpp:
-image: ghcr.io/ggml-org/llama.cpp:server
+image: ${LLAMACPP_IMAGE:-ghcr.io/ggml-org/llama.cpp:server}
container_name: llama-decoder-dev-remote
environment:
- LLAMA_ARG_MODEL=/models/model.gguf
@@ -518,8 +518,8 @@ services:
volumes:
- workspace_pvc:/work:rw
- codebase_pvc:/work/.codebase:rw
entrypoint: ["sh", "-c", "mkdir -p /tmp/logs /tmp/huggingface/hub /tmp/huggingface/transformers /tmp/huggingface/fastembed && /app/scripts/wait-for-qdrant.sh && cd /app && python /app/scripts/ingest_code.py --root /work"]
restart: "no" # Run once on startup, do not restart after completion
entrypoint: [ "sh", "-c", "mkdir -p /tmp/logs /tmp/huggingface/hub /tmp/huggingface/transformers /tmp/huggingface/fastembed && /app/scripts/wait-for-qdrant.sh && cd /app && python /app/scripts/ingest_code.py --root /work" ]
restart: "no" # Run once on startup, do not restart after completion
cpus: 4.0
networks:
- dev-remote-network
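Because `${LLAMACPP_IMAGE:-...}` falls back to the stock server tag when the variable is unset, the substitution is easy to sanity-check; a sketch, run from the repo root:

```bash
# With no override set, Compose's ${VAR:-default} substitution should
# resolve the llamacpp service to the default multi-arch server image.
unset LLAMACPP_IMAGE
docker compose config | grep 'ghcr.io/ggml-org/llama.cpp'
# expected: image: ghcr.io/ggml-org/llama.cpp:server
```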
3 changes: 2 additions & 1 deletion tests/test_ingest_schema_mode.py
@@ -30,14 +30,15 @@ def get_collection(self, name):
payload_schema=self.payload_schema,
)

-def create_collection(self, collection_name, vectors_config, sparse_vectors_config=None, hnsw_config=None, quantization_config=None):
+def create_collection(self, collection_name, vectors_config, sparse_vectors_config=None, hnsw_config=None, quantization_config=None, on_disk_payload=None):
self.create_calls.append(
{
"collection_name": collection_name,
"vectors_config": vectors_config,
"sparse_vectors_config": sparse_vectors_config,
"hnsw_config": hnsw_config,
"quantization_config": quantization_config,
"on_disk_payload": on_disk_payload,
}
)
self.collection_exists = True
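The widened fake mirrors a create_collection call that now forwards on_disk_payload. For context, a hedged sketch of the equivalent request against Qdrant's REST API; the collection name and vector size here are hypothetical, not taken from this repo:

```bash
# Qdrant's create-collection endpoint accepts on_disk_payload, which keeps
# point payloads on disk instead of RAM; this is the setting the new kwarg threads through.
curl -X PUT 'http://localhost:6333/collections/example_chunks' \
  -H 'Content-Type: application/json' \
  -d '{"vectors": {"size": 384, "distance": "Cosine"}, "on_disk_payload": true}'
```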