
Intense CPU underutilization on Windows using Docker #5459

Open
@amadou-6e

Description


The CPU is underutilized and response times are very slow.

LocalAI version:
localai/localai:latest-cpu

Environment, CPU architecture, OS, and Version:
12th Gen Intel(R) Core(TM) i7, no GPU used (integrated Intel Iris graphics disabled for this test)
1TB SSD NVMe
Windows 11 latest version

Describe the bug
Host CPU stays far below full utilization during inference (roughly 15-17% on average, with no core saturated), and a 50-token completion takes over seven minutes.
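
Host-wide sampling with psutil (as in the reproduction script below) can hide what the container itself is doing, so the container's own CPU usage can be cross-checked through the Docker stats API while a request is in flight. A rough sketch, assuming the test-localai container from the script below is already running with the model loaded:

#!/usr/bin/env python3
"""Sample container-level CPU usage while a completion request is in flight.
Sketch only; assumes the 'test-localai' container from the repro script is up."""

import threading
import time

import docker
import requests

CONTAINER_NAME = "test-localai"  # same name as in the repro script
URL = "http://localhost:8080/v1/completions"
PAYLOAD = {"model": "gemma-3-1b-it",
           "prompt": "What is the capital of France? Answer briefly.",
           "max_tokens": 50}


def container_cpu_percent(container, interval=2.0):
    """Approximate container CPU % from two consecutive stats snapshots."""
    s1 = container.stats(stream=False)
    time.sleep(interval)
    s2 = container.stats(stream=False)
    cpu_delta = (s2["cpu_stats"]["cpu_usage"]["total_usage"]
                 - s1["cpu_stats"]["cpu_usage"]["total_usage"])
    sys_delta = (s2["cpu_stats"]["system_cpu_usage"]
                 - s1["cpu_stats"]["system_cpu_usage"])
    ncpus = s2["cpu_stats"].get("online_cpus") or len(
        s2["cpu_stats"]["cpu_usage"].get("percpu_usage", [1]))
    return (cpu_delta / sys_delta) * ncpus * 100 if sys_delta else 0.0


def fire_request():
    # No timeout: CPU-only inference takes several minutes here (see logs below)
    requests.post(URL, json=PAYLOAD)


if __name__ == "__main__":
    container = docker.from_env().containers.get(CONTAINER_NAME)
    worker = threading.Thread(target=fire_request, daemon=True)
    worker.start()
    while worker.is_alive():
        print(f"container CPU: {container_cpu_percent(container):.1f}%")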

To Reproduce

#!/usr/bin/env python3
"""
Minimal LocalAI test script - no classes, just sequential steps.
Designed for debugging CPU underutilization issues.
"""

import docker
import requests
import time
import json
import psutil
import os
from pathlib import Path

# Configuration
HOST = "localhost"
PORT = 8080
IMAGE = "localai/localai:latest-cpu"  # or try latest-aio-cpu for preloaded models
CONTAINER_NAME = "test-localai"

# Get optimal thread count (physical cores)
PHYSICAL_CORES = psutil.cpu_count(logical=False)
LOGICAL_CORES = psutil.cpu_count(logical=True)
print(f"System info: {PHYSICAL_CORES} physical cores, {LOGICAL_CORES} logical cores")

# Use physical cores for better performance
THREADS = PHYSICAL_CORES or 4
MEMORY_LIMIT = "4g"

# Model to test with (small and fast)
MODEL_ID = "localai@gemma-3-1b-it"
MODEL_NAME = "gemma-3-1b-it"

# Create directories
MODELS_DIR = Path("./test_models")
CACHE_DIR = Path("./test_cache")
MODELS_DIR.mkdir(exist_ok=True)
CACHE_DIR.mkdir(exist_ok=True)

print(f"Using {THREADS} threads, {MEMORY_LIMIT} memory limit")
print(f"Models dir: {MODELS_DIR.absolute()}")
print(f"Cache dir: {CACHE_DIR.absolute()}")


def check_docker():
    """Step 1: Check if Docker is running"""
    print("\n=== STEP 1: Checking Docker ===")
    try:
        client = docker.from_env()
        client.ping()
        print(" Docker is running")
        return client
    except Exception as e:
        print(f"Docker error: {e}")
        print("Make sure Docker Desktop is running")
        exit(1)


def cleanup_container(client):
    """Step 2: Clean up any existing container"""
    print("\n=== STEP 2: Cleaning up existing container ===")
    try:
        existing = client.containers.get(CONTAINER_NAME)
        print(f"Found existing container: {existing.status}")
        existing.remove(force=True)
        print(" Removed existing container")
    except docker.errors.NotFound:
        print(" No existing container to remove")


def pull_image(client):
    """Step 3: Ensure image is available"""
    print("\n=== STEP 3: Checking/pulling image ===")
    try:
        image = client.images.get(IMAGE)
        print(f"Image {IMAGE} already available")
        return image
    except docker.errors.NotFound:
        print(f"Pulling {IMAGE}...")
        image = client.images.pull(IMAGE)
        print(" Image pulled successfully")
        return image


def create_container(client):
    """Step 4: Create optimized container"""
    print("\n=== STEP 4: Creating container ===")

    # Mount points
    mounts = [
        docker.types.Mount(target="/build/models", source=str(MODELS_DIR.absolute()), type="bind"),
        docker.types.Mount(target="/tmp/generated", source=str(CACHE_DIR.absolute()), type="bind")
    ]

    # Optimized environment variables for CPU performance
    env_vars = {
        # Core settings
        "LOCALAI_THREADS": str(THREADS),
        "LOCALAI_CONTEXT_SIZE": "2048",

        # Performance optimizations
        "LOCALAI_PARALLEL_REQUESTS": "true",
        "LOCALAI_F16": "true",  # Enable F16 for better performance
        "LOCALAI_SINGLE_ACTIVE_BACKEND": "false",  # Allow multiple backends

        # CPU-specific optimizations
        "OMP_NUM_THREADS": str(THREADS),  # OpenMP threads
        "GOMAXPROCS": str(THREADS),  # Go runtime threads
        "MKL_NUM_THREADS": str(THREADS),  # Intel MKL threads

        # Disable debug for performance
        "DEBUG": "false",
        "LOCALAI_LOG_LEVEL": "info",

        # Memory optimizations
        "LOCALAI_WATCHDOG_IDLE": "true",
        "LOCALAI_WATCHDOG_IDLE_TIMEOUT": "10m",

        # Backend-specific optimizations
        "REBUILD": "false",  # Don't rebuild, use precompiled
        "LLAMACPP_PARALLEL": str(max(1, THREADS // 2)),  # Parallel llama.cpp workers
    }

    ports = {"8080/tcp": PORT}

    container = client.containers.create(
        image=IMAGE,
        name=CONTAINER_NAME,
        mounts=mounts,
        ports=ports,
        environment=env_vars,
        detach=True,
        # Resource limits for Windows
        mem_limit=MEMORY_LIMIT,
        # Allow container to use all CPU cores but don't overwhelm system
        cpu_period=100000,
        cpu_quota=int(100000 * THREADS * 0.9),  # Use 90% of available CPU
        # Remove default CPU shares limit
        cpu_shares=1024 * THREADS,
    )

    print(f"Container created with optimizations:")
    print(f" - {THREADS} threads")
    print(f" - {MEMORY_LIMIT} memory limit")
    print(f" - CPU quota: {int(100000 * THREADS * 0.9)} (90% of {THREADS} cores)")
    print(f" - Parallel requests enabled")

    return container


def start_container(container):
    """Step 5: Start container"""
    print("\n=== STEP 5: Starting container ===")
    container.start()
    print(" Container started")

    # Show initial logs
    time.sleep(2)
    logs = container.logs(tail=10).decode('utf-8', errors='ignore')
    print("Initial container logs:")
    print("---")
    print(logs)
    print("---")


def wait_for_ready():
    """Step 6: Wait for LocalAI to be ready"""
    print("\n=== STEP 6: Waiting for LocalAI to be ready ===")
    start_time = time.time()
    timeout = 600  # 10 minutes

    while time.time() - start_time < timeout:
        try:
            response = requests.get(f"http://{HOST}:{PORT}/readyz", timeout=10)
            if response.status_code == 200:
                elapsed = time.time() - start_time
                print(f"LocalAI ready in {elapsed:.1f} seconds")
                return True
        except requests.RequestException:
            pass

        elapsed = time.time() - start_time
        print(f"Waiting... ({elapsed:.1f}s)")
        time.sleep(5)

    print(" LocalAI not ready within timeout")
    return False


def show_system_status():
    """Show system resource usage"""
    print("\n=== SYSTEM STATUS ===")
    cpu_percent = psutil.cpu_percent(interval=1, percpu=True)
    memory = psutil.virtual_memory()

    print(f"CPU usage per core: {[f'{x:.1f}%' for x in cpu_percent]}")
    print(f"Average CPU: {sum(cpu_percent)/len(cpu_percent):.1f}%")
    print(
        f"Memory: {memory.percent:.1f}% ({memory.used//1024//1024}MB/{memory.total//1024//1024}MB)")


def check_models():
    """Step 7: Check what models are available"""
    print("\n=== STEP 7: Checking available models ===")

    # Check loaded models
    try:
        response = requests.get(f"http://{HOST}:{PORT}/v1/models", timeout=10)
        if response.status_code == 200:
            models = response.json().get('data', [])
            print(f"Currently loaded models: {len(models)}")
            for model in models[:5]:  # Show first 5
                print(f" - {model.get('id', 'unknown')}")
        else:
            print(f"Failed to get models: {response.status_code}")
    except Exception as e:
        print(f"Error checking models: {e}")

    # Check available models for download
    try:
        response = requests.get(f"http://{HOST}:{PORT}/models/available", timeout=30)
        if response.status_code == 200:
            available = response.json()
            print(f"Available for download: {len(available)} models")
            # Find our target model
            target_found = any(MODEL_ID.split('@')[1] in str(model) for model in available)
            print(f"Target model '{MODEL_ID}' available: {target_found}")
        else:
            print(f"Failed to get available models: {response.status_code}")
    except Exception as e:
        print(f"Error checking available models: {e}")


def install_model():
    """Step 8: Install model"""
    print(f"\n=== STEP 8: Installing model {MODEL_ID} ===")

    # Check if already loaded
    try:
        response = requests.get(f"http://{HOST}:{PORT}/v1/models", timeout=10)
        if response.status_code == 200:
            models = response.json().get('data', [])
            if any(model.get('id') == MODEL_NAME for model in models):
                print(f"Model {MODEL_NAME} already loaded")
                return True
    except Exception as e:
        print(f"Warning: Could not check existing models: {e}")

    # Install model
    install_data = {"id": MODEL_ID, "name": MODEL_NAME}

    print(f"Installing model: {install_data}")

    try:
        response = requests.post(f"http://{HOST}:{PORT}/models/apply",
                                 json=install_data,
                                 timeout=30)

        if response.status_code != 200:
            print(f"Install request failed: {response.status_code}")
            print(f"Response: {response.text}")
            return False

        result = response.json()
        job_id = result.get('uuid')

        if not job_id:
            print(f"No job ID returned: {result}")
            return False

        print(f"Installation started, job ID: {job_id}")

        # Monitor job progress
        start_time = time.time()
        timeout = 600  # 10 minutes for model download

        while time.time() - start_time < timeout:
            try:
                response = requests.get(f"http://{HOST}:{PORT}/models/jobs/{job_id}", timeout=10)
                if response.status_code == 200:
                    status = response.json()

                    if status.get('processed', False):
                        if status.get('error'):
                            print(f"Installation failed: {status['error']}")
                            return False
                        else:
                            elapsed = time.time() - start_time
                            print(f"Model installed successfully in {elapsed:.1f} seconds")
                            return True
                    else:
                        elapsed = time.time() - start_time
                        message = status.get('message', 'processing')
                        print(f"Installing... ({elapsed:.1f}s) - {message}")

                        # Show system status during download
                        if int(elapsed) % 30 == 0:  # Every 30 seconds
                            show_system_status()

                        time.sleep(10)
                else:
                    print(f"Warning: Job status check failed: {response.status_code}")
                    time.sleep(5)

            except Exception as e:
                print(f"Warning: Error checking job status: {e}")
                time.sleep(5)

        print(f"Model installation timed out after {timeout} seconds")
        return False

    except Exception as e:
        print(f"Installation error: {e}")
        return False


def test_query():
    """Step 9: Test model query"""
    print(f"\n=== STEP 9: Testing model query ===")

    query_data = {
        "model": MODEL_NAME,
        "prompt": "What is the capital of France? Answer briefly.",
        "max_tokens": 50,
        "temperature": 0.7
    }

    print(f"Query: {query_data}")
    print("Measuring CPU usage during inference...")

    # Measure CPU before query
    cpu_before = psutil.cpu_percent(interval=1)

    try:
        start_time = time.time()
        # Request timeout left unset: CPU-only inference can take several minutes
        response = requests.post(f"http://{HOST}:{PORT}/v1/completions", json=query_data)
        elapsed = time.time() - start_time

        # Measure CPU during/after query
        cpu_after = psutil.cpu_percent(interval=1)

        if response.status_code == 200:
            result = response.json()
            if 'choices' in result and len(result['choices']) > 0:
                answer = result['choices'][0]['text'].strip()
                usage = result.get('usage', {})

                print(f"Query successful in {elapsed:.1f} seconds")
                print(f"Answer: {answer}")
                print(f"Usage: {usage}")
                print(f"CPU before query: {cpu_before:.1f}%")
                print(f"CPU during query: {cpu_after:.1f}%")

                return True
            else:
                print(f"Invalid response format: {result}")
        else:
            print(f"Query failed: {response.status_code}")
            print(f"Response: {response.text}")

    except Exception as e:
        print(f"Query error: {e}")

    return False


def show_final_status(client):
    """Step 10: Show final status and cleanup info"""
    print("\n=== STEP 10: Final Status ===")

    try:
        container = client.containers.get(CONTAINER_NAME)
        container.reload()
        print(f"Container status: {container.status}")

        # Show final logs
        logs = container.logs(tail=20).decode('utf-8', errors='ignore')
        print("\nFinal container logs:")
        print("---")
        print(logs)
        print("---")

    except Exception as e:
        print(f"Error getting container status: {e}")

    show_system_status()

    print(f"\nTo stop container: docker stop {CONTAINER_NAME}")
    print(f"To remove container: docker rm {CONTAINER_NAME}")
    print(f"To view logs: docker logs {CONTAINER_NAME}")


def main():
    """Main execution"""
    print("=== MINIMAL LOCALAI TEST SCRIPT ===")
    print(f"Target: Install {MODEL_ID} and run a test query")

    try:
        # Sequential steps
        client = check_docker()
        cleanup_container(client)
        pull_image(client)
        container = create_container(client)
        start_container(container)

        if not wait_for_ready():
            print("\n LocalAI failed to start properly")
            # Show logs for debugging
            logs = container.logs().decode('utf-8', errors='ignore')
            print("Container logs for debugging:")
            print("=" * 50)
            print(logs)
            print("=" * 50)
            return False

        check_models()

        if not install_model():
            print("\n Model installation failed")
            return False

        if not test_query():
            print("\n Model query failed")
            return False

        show_final_status(client)
        print("\n ALL TESTS PASSED!")
        return True

    except KeyboardInterrupt:
        print("\n  Test interrupted by user")
        return False
    except Exception as e:
        print(f"\n Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        return False


if __name__ == "__main__":
    success = main()
    exit(0 if success else 1)

Expected behavior
Substantially higher CPU utilization during inference (the container is allowed up to 90% of 16 cores) and correspondingly faster responses.
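
Since LOCALAI_PARALLEL_REQUESTS is enabled in the container, one way to check whether the low utilization is a per-request limit or a global one is to fire a few identical completions concurrently and see whether aggregate CPU usage climbs. A rough sketch, assuming the same container and model as above:

#!/usr/bin/env python3
"""Issue several identical completions in parallel and report host CPU usage.
Sketch only; assumes the container from the repro script is running with
gemma-3-1b-it already loaded."""

import time
from concurrent.futures import ThreadPoolExecutor

import psutil
import requests

URL = "http://localhost:8080/v1/completions"
PAYLOAD = {
    "model": "gemma-3-1b-it",
    "prompt": "What is the capital of France? Answer briefly.",
    "max_tokens": 50,
    "temperature": 0.7,
}
CONCURRENCY = 4  # arbitrary choice for the experiment


def one_request(i):
    start = time.time()
    # No timeout: CPU-only inference takes several minutes per request here
    resp = requests.post(URL, json=PAYLOAD)
    return i, resp.status_code, time.time() - start


if __name__ == "__main__":
    psutil.cpu_percent(interval=None)  # prime the counter so the next call covers the run
    with ThreadPoolExecutor(max_workers=CONCURRENCY) as pool:
        futures = [pool.submit(one_request, i) for i in range(CONCURRENCY)]
        results = [f.result() for f in futures]
    print(f"host CPU averaged over the run: {psutil.cpu_percent(interval=None):.1f}%")
    for i, status, elapsed in results:
        print(f"request {i}: HTTP {status} in {elapsed:.1f}s")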

Logs

=== MINIMAL LOCALAI TEST SCRIPT ===
Target: Install localai@gemma-3-1b-it and run a test query

=== STEP 1: Checking Docker ===
✓ Docker is running

=== STEP 2: Cleaning up existing container ===
Found existing container: running
✓ Removed existing container

=== STEP 3: Checking/pulling image ===
✓ Image localai/localai:latest-cpu already available

=== STEP 4: Creating container ===
✓ Container created with optimizations:
  - 16 threads
  - 4g memory limit
  - CPU quota: 1440000 (90% of 16 cores)
  - Parallel requests enabled

=== STEP 5: Starting container ===
✓ Container started
Initial container logs:
---
CPU:    AVX2   found OK
CPU: no AVX512 found
@@@@@
6:48AM INF env file found, loading environment variables from file envFile=.env
6:48AM INF Setting logging to info
6:48AM INF Starting LocalAI using 16 threads, with models path: /build/models
6:48AM INF LocalAI version: v2.29.0 (fd17a3312c4c1f5688152eff227e27d9b7bce365)
WARNING: failed to determine nodes: open /sys/devices/system/node: no such file or directory
WARNING: failed to read int from file: open /sys/class/drm/card0/device/numa_node: no such file or directory  
WARNING: failed to determine nodes: open /sys/devices/system/node: no such file or directory

---

=== STEP 6: Waiting for LocalAI to be ready ===
Waiting... (0.0s)
Waiting... (5.0s)
Waiting... (10.1s)
Waiting... (15.1s)
Waiting... (20.1s)
Waiting... (25.1s)
Waiting... (30.1s)
Waiting... (35.2s)
Waiting... (40.2s)
Waiting... (45.2s)
Waiting... (50.2s)
Waiting... (55.2s)
Waiting... (60.2s)
Waiting... (65.3s)
Waiting... (70.3s)
Waiting... (75.3s)
Waiting... (80.3s)
Waiting... (85.3s)
Waiting... (90.4s)
Waiting... (95.4s)
Waiting... (100.4s)
Waiting... (105.4s)
Waiting... (110.4s)
Waiting... (115.4s)
Waiting... (120.5s)
Waiting... (125.5s)
Waiting... (130.5s)
Waiting... (135.5s)
Waiting... (140.5s)
Waiting... (145.5s)
Waiting... (150.5s)
Waiting... (155.6s)
Waiting... (160.6s)
Waiting... (165.6s)
Waiting... (170.6s)
Waiting... (175.6s)
Waiting... (180.6s)
Waiting... (185.6s)
Waiting... (190.6s)
Waiting... (195.7s)
Waiting... (200.7s)
Waiting... (205.7s)
Waiting... (210.7s)
Waiting... (215.7s)
Waiting... (220.7s)
Waiting... (225.8s)
Waiting... (230.8s)
Waiting... (235.8s)
Waiting... (240.8s)
Waiting... (245.8s)
Waiting... (250.8s)
Waiting... (255.9s)
Waiting... (260.9s)
Waiting... (265.9s)
Waiting... (270.9s)
Waiting... (275.9s)
Waiting... (281.0s)
Waiting... (286.0s)
Waiting... (291.0s)
Waiting... (296.0s)
Waiting... (301.0s)
Waiting... (306.0s)
Waiting... (311.1s)
Waiting... (316.1s)
Waiting... (321.1s)
Waiting... (326.1s)
Waiting... (331.1s)
Waiting... (336.1s)
✓ LocalAI ready in 341.1 seconds

=== STEP 7: Checking available models ===
✓ Currently loaded models: 1
  - gemma-3-1b-it
✓ Available for download: 999 models
✓ Target model 'localai@gemma-3-1b-it' available: True

=== STEP 8: Installing model localai@gemma-3-1b-it ===
✓ Model gemma-3-1b-it already loaded

=== STEP 9: Testing model query ===
Query: {'model': 'gemma-3-1b-it', 'prompt': 'What is the capital of France? Answer briefly.', 'max_tokens': 50, 'temperature': 0.7}
Measuring CPU usage during inference...
✓ Query successful in 427.1 seconds
Answer: The capital of France is Paris.

Final Answer: Paris
Usage: {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}
CPU before query: 31.0%
CPU during query: 14.8%

=== STEP 10: Final Status ===
Container status: running

Final container logs:
---
WARNING: failed to determine nodes: open /sys/devices/system/node: no such file or directory
WARNING: failed to determine nodes: open /sys/devices/system/node: no such file or directory
WARNING: failed to read int from file: open /sys/class/drm/card0/device/numa_node: no such file or directory  
WARNING: failed to determine nodes: open /sys/devices/system/node: no such file or directory
7:00AM INF Trying to load the model 'gemma-3-1b-it' with the backend '[llama-cpp llama-cpp-fallback piper silero-vad stablediffusion-ggml whisper bark-cpp huggingface /build/backend/python/exllama2/run.sh /build/backend/python/vllm/run.sh /build/backend/python/faster-whisper/run.sh /build/backend/python/coqui/run.sh /build/backend/python/transformers/run.sh /build/backend/python/rerankers/run.sh /build/backend/python/diffusers/run.sh /build/backend/python/kokoro/run.sh /build/backend/python/bark/run.sh]'
7:00AM INF [llama-cpp] Attempting to load
7:00AM INF BackendLoader starting backend=llama-cpp modelID=gemma-3-1b-it o.model=gemma-3-1b-it-Q4_K_M.gguf   
WARNING: failed to determine nodes: open /sys/devices/system/node: no such file or directory
WARNING: failed to read int from file: open /sys/class/drm/card0/device/numa_node: no such file or directory  
WARNING: failed to determine nodes: open /sys/devices/system/node: no such file or directory
WARNING: failed to determine nodes: open /sys/devices/system/node: no such file or directory
WARNING: failed to read int from file: open /sys/class/drm/card0/device/numa_node: no such file or directory  
WARNING: failed to determine nodes: open /sys/devices/system/node: no such file or directory
WARNING: failed to determine nodes: open /sys/devices/system/node: no such file or directory
WARNING: failed to read int from file: open /sys/class/drm/card0/device/numa_node: no such file or directory  
WARNING: failed to determine nodes: open /sys/devices/system/node: no such file or directory
7:00AM INF [llama-cpp] attempting to load with AVX2 variant
7:00AM INF Success ip=127.0.0.1 latency="29.632µs" method=GET status=200 url=/readyz
7:01AM INF [llama-cpp] Loads OK
7:01AM INF Success ip=172.17.0.1 latency=7m7.078412191s method=POST status=200 url=/v1/completions

---

=== SYSTEM STATUS ===
CPU usage per core: ['3.1%', '6.2%', '4.7%', '6.2%', '4.6%', '1.6%', '4.7%', '1.6%', '7.7%', '17.2%', '3.1%', '1.6%', '20.3%', '15.4%', '1.6%', '3.1%', '42.3%', '27.7%', '40.6%', '36.9%', '36.9%', '48.5%', '45.3%', '34.4%']
Average CPU: 17.3%
Memory: 46.3% (30156MB/65201MB)

To stop container: docker stop test-localai
To remove container: docker rm test-localai
To view logs: docker logs test-localai

✅ ALL TESTS PASSED!

Additional context
