86 changes: 86 additions & 0 deletions .dockerignore
@@ -0,0 +1,86 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual environments
venv/
env/
ENV/
.venv

# IDE
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/
.hypothesis/

# Data directories (will be mounted as volumes)
data/
local/
models/
*.faiss
*.index

# Logs
*.log
logs/

# Environment files
.env
.env.local
.env.*.local

# Git
.git/
.gitignore
.gitattributes

# Documentation
*.md
docs/

# Docker
Dockerfile
docker-compose.yml
.dockerignore

# CI/CD
.github/
.gitlab-ci.yml

# Temporary files
tmp/
temp/
*.tmp

# OS
Thumbs.db
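
A quick way to sanity-check what actually survives these exclusions is to build a throwaway image that copies the whole context and lists it. This is a generic sketch, not part of the PR; the Dockerfile.context-check filename is made up here, and COPY honors .dockerignore, so the listing reflects the filtered context:

    # write a minimal context-dumping Dockerfile
    cat > Dockerfile.context-check <<'EOF'
    FROM busybox
    COPY . /ctx
    RUN find /ctx -type f | sort
    EOF
    # --progress=plain makes BuildKit print the RUN output
    docker build --progress=plain --no-cache -f Dockerfile.context-check .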

86 changes: 86 additions & 0 deletions .env.example
@@ -0,0 +1,86 @@
# ============================================
# LLM API Configuration
# ============================================
# Set up your LLM API configuration
# For OpenAI: https://platform.openai.com/docs/api-reference
# For DeepSeek: https://api-docs.deepseek.com/
# For other providers: check their documentation

OPENAI_API_KEY=sk-xxxxxx
OPENAI_BASE_URL=https://api.openai.com/v1

# Choose the device to run AI models on (cpu, cuda:0, cuda:1, etc.)
DEVICE=xxx

# Embedding model, e.g. Qwen/Qwen3-Embedding-0.6B or BAAI/bge-large-zh-v1.5
EMBEDDING_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B

# If you use Azure OpenAI, uncomment the lines below and fill in your details
# API_VERSION=2025-01-01-preview
# OPENAI_PROVIDER=azure

# ============================================
# PostgreSQL Database Configuration
# ============================================
POSTGRES_HOST=localhost
POSTGRES_PORT=5555
POSTGRES_USER=postgres
POSTGRES_PASSWORD=123
POSTGRES_DB=rag_arc

# ============================================
# Redis Cache Configuration
# ============================================
REDIS_HOST=localhost
REDIS_PORT=6379
REDIS_DB=0
REDIS_PASSWORD=

# ============================================
# Application Configuration
# ============================================
# JWT secret key - generate one with: openssl rand -hex 32
JWT_SECRET_KEY=your-secret-key-change-this-in-production

# HuggingFace Token (optional, for downloading models)
HF_TOKEN=

# Log Level (DEBUG, INFO, WARNING, ERROR)
LOG_LEVEL=INFO

# ============================================
# File Storage Configuration
# ============================================
# Unified parser output directory (all parsers output to subdirectories here)
# Structure: {PARSER_OUTPUT_DIR}/native/, {PARSER_OUTPUT_DIR}/dots_ocr/, {PARSER_OUTPUT_DIR}/vlm_ocr/
PARSER_OUTPUT_DIR=./data/parsed_files

# Local file storage path
LOCAL_FILE_STORAGE_PATH=./local/files

# ============================================
# Neo4j Graph Database Configuration
# ============================================
# Neo4j connection (auto-configured by Docker deployment)
NEO4J_URL=bolt://localhost:7687
NEO4J_USERNAME=neo4j
NEO4J_PASSWORD=12345678
NEO4J_DATABASE=neo4j

# Docker: Expose Neo4j ports to host (default: false for security)
# Set to true if you want to access Neo4j Browser at http://localhost:7474
EXPOSE_NEO4J=false

# Docker: Neo4j port mapping (only used if EXPOSE_NEO4J=true)
NEO4J_HTTP_PORT=7474
NEO4J_BOLT_PORT=7687

# ============================================
# Optional: MinIO Object Storage
# ============================================
# Uncomment if you want to use MinIO
# MINIO_ENDPOINT=localhost:9000
# MINIO_USERNAME=ROOTNAME
# MINIO_PASSWORD=CHANGEME123
# MINIO_BUCKET=rag-arc-bucket
# MINIO_SECURE=false
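
A typical first-run sequence with this template might look like the sketch below. The sed edit is just one way to apply the openssl suggestion from the JWT_SECRET_KEY comment (GNU sed shown; macOS sed needs -i ''):

    cp .env.example .env
    # replace the placeholder JWT secret with a real one, per the comment above
    sed -i "s|^JWT_SECRET_KEY=.*|JWT_SECRET_KEY=$(openssl rand -hex 32)|" .env
    # then fill in OPENAI_API_KEY, DEVICE, and the database passwords by hand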
9 changes: 9 additions & 0 deletions .gitignore
@@ -136,6 +136,7 @@ celerybeat.pid

# Environments
.env
!.env.example
.envrc
.venv
env/
@@ -205,3 +206,11 @@
marimo/_static/
marimo/_lsp/
__marimo__/
__pycache__/
*.py[cod]
.DS_Store

data/
local/
models/
benchmark/
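
The negation pattern is the subtle part here: .env must stay ignored while .env.example stays tracked. git check-ignore verifies both; the data/ and models/ file names below are hypothetical examples of paths the new patterns should catch:

    git check-ignore .env data/index.faiss models/weights.bin && echo "ignored as expected"
    # exit status 1 means the path is NOT ignored, which is what we want here
    git check-ignore -q .env.example || echo ".env.example stays tracked"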
67 changes: 67 additions & 0 deletions Dockerfile
@@ -0,0 +1,67 @@
# 1. Use an official Python base image
# This version uses CPU-only dependencies and calls external LLM APIs
FROM python:3.11-slim

# 2. Build arguments for region-specific configuration
ARG UV_INSTALL_URL=https://astral.sh/uv/install.sh
ARG UV_INDEX_URL=https://pypi.org/simple

# 3. Set environment variables
ENV PYTHONUNBUFFERED=1 \
DEBIAN_FRONTEND=noninteractive \
UV_HTTP_TIMEOUT=300 \
UV_INDEX_URL=${UV_INDEX_URL}

# 4. Install system dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
curl \
git \
libpq-dev \
tzdata \
&& apt-get clean && \
rm -rf /var/lib/apt/lists/*

# 5. Install uv
RUN curl -LsSf ${UV_INSTALL_URL} | sh && \
mv /root/.local/bin/uv /usr/local/bin/uv && \
chmod +x /usr/local/bin/uv

# 6. Create and set working directory
WORKDIR /rag_arc

# 7. Copy only dependency files first (for better caching)
COPY pyproject.toml uv.lock /rag_arc/

# 8. Install Python dependencies using uv sync with virtual environment
# Use --no-cache and configure index URL based on region
RUN uv sync --no-cache --index-url ${UV_INDEX_URL}

# 9. Copy the rest of the project files (after dependencies are installed)
COPY . /rag_arc/

# 10. Reinstall the package in editable mode using uv sync
RUN uv sync --no-cache --index-url ${UV_INDEX_URL}

# 11. Create necessary directories
RUN mkdir -p \
/rag_arc/data/parsed_files \
/rag_arc/data/file_store \
/rag_arc/data/chunk_store \
/rag_arc/data/faiss_index \
/rag_arc/data/unified_faiss_index \
/rag_arc/data/unified_bm25_index \
/rag_arc/data/graph_index \
/rag_arc/data/graph_index_neo4j \
/rag_arc/local/files \
/rag_arc/models

# 12. Expose application port
EXPOSE 8000

# 13. Set PATH to include virtual environment
ENV PATH="/rag_arc/.venv/bin:$PATH"

# 14. Start the application
CMD ["uv", "run", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
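
Building and running the CPU image might look like this; the rag-arc:cpu tag is illustrative and the mirror URL is just one example, but UV_INDEX_URL is the build arg declared at the top of this Dockerfile:

    docker build -t rag-arc:cpu .
    # or point uv at a regional PyPI mirror via the declared build arg
    docker build -t rag-arc:cpu \
      --build-arg UV_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple .
    docker run --env-file .env -p 8000:8000 rag-arc:cpu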
119 changes: 119 additions & 0 deletions Dockerfile.gpu
@@ -0,0 +1,119 @@
# 1. Use NVIDIA CUDA base image for GPU support
FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04

# 2. Build arguments for region-specific configuration
ARG UV_INSTALL_URL=https://astral.sh/uv/install.sh
ARG UV_INDEX_URL=https://pypi.org/simple
ARG APT_GET_URL=
ARG PYTORCH_INDEX_URL=https://download.pytorch.org/whl/cu121

# 3. Set environment variables
ENV PYTHONUNBUFFERED=1 \
DEBIAN_FRONTEND=noninteractive \
NVIDIA_VISIBLE_DEVICES=all \
NVIDIA_DRIVER_CAPABILITIES=compute,utility \
UV_HTTP_TIMEOUT=300 \
UV_INDEX_URL=${UV_INDEX_URL}

# 4. Install Python 3.11 and system dependencies
# Fix GPG signature issues when using a mirror by forcing http instead of https
RUN MIRROR_URL="${APT_GET_URL:-http://archive.ubuntu.com}" && \
# Force convert https:// to http:// to avoid GPG signature issues
MIRROR_URL=$(echo "$MIRROR_URL" | sed 's|^https://|http://|') && \
echo "Configuring apt mirror: $MIRROR_URL..." && \
# Use http instead of https for mirrors to avoid GPG issues
sed -i "s|http://archive.ubuntu.com|${MIRROR_URL}|g" /etc/apt/sources.list && \
sed -i "s|https://archive.ubuntu.com|${MIRROR_URL}|g" /etc/apt/sources.list || true && \
sed -i "s|http://security.ubuntu.com|${MIRROR_URL}|g" /etc/apt/sources.list && \
sed -i "s|https://security.ubuntu.com|${MIRROR_URL}|g" /etc/apt/sources.list || true && \
# Configure apt timeout and retry settings
echo 'Acquire::http::Timeout "600";' > /etc/apt/apt.conf.d/99timeout && \
echo 'Acquire::ftp::Timeout "600";' >> /etc/apt/apt.conf.d/99timeout && \
echo 'Acquire::Retries "10";' >> /etc/apt/apt.conf.d/99timeout && \
echo 'Acquire::http::MaxParallelDownloads "4";' >> /etc/apt/apt.conf.d/99timeout && \
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common \
ca-certificates \
gnupg2 \
&& echo "Checking if Python 3.11 is available in official Ubuntu repository..." && \
(apt-cache show python3.11 2>/dev/null | grep -q "Version:" && \
echo "Python 3.11 found in official repository, using it instead of deadsnakes PPA" || \
(echo "Python 3.11 not in official repo, adding deadsnakes PPA..." && \
add-apt-repository -y ppa:deadsnakes/ppa && \
if [ -n "$APT_GET_URL" ]; then \
echo "Replacing PPA with USTC Launchpad proxy for faster downloads in Mainland China..." && \
find /etc/apt/sources.list.d/ -type f -name "*deadsnakes*.list" \
-exec sed -i.bak -r 's#deb(-src)?\s*https?://ppa\.launchpadcontent\.net#deb\1 http://launchpad.proxy.ustclug.org#ig' {} \; && \
find /etc/apt/sources.list.d/ -type f -name "*deadsnakes*.list" \
-exec sed -i.bak -r 's#deb(-src)?\s*https?://ppa\.launchpad\.net#deb\1 http://launchpad.proxy.ustclug.org#ig' {} \; || true; \
fi && \
echo "Updating package lists (this may take a while for PPA)..." && \
apt-get update)) && \
echo "Installing Python 3.11 and dependencies..." && \
apt-get install -y --no-install-recommends \
python3.11 \
python3.11-dev \
python3.11-distutils \
build-essential \
curl \
git \
libpq-dev \
tzdata \
&& apt-get clean && \
rm -rf /var/lib/apt/lists/*

# 5. Set Python 3.11 as default
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1

# 6. Install uv
RUN curl -LsSf ${UV_INSTALL_URL} | sh && \
mv /root/.local/bin/uv /usr/local/bin/uv && \
chmod +x /usr/local/bin/uv

# 7. Create and set working directory
WORKDIR /rag_arc

# 8. Copy only dependency files first (for better caching)
COPY pyproject.toml uv.lock /rag_arc/

# 9. Install Python dependencies using uv sync with virtual environment
# Use --no-cache and configure index URL based on region
# First install all dependencies except PyTorch
RUN uv sync --no-cache --index-url ${UV_INDEX_URL} && \
# Install PyTorch with CUDA support from PyTorch index
# This is an exception as PyTorch CUDA packages are typically not on PyPI
# Use uv pip install only for PyTorch as it requires a specific index
uv pip install --no-cache --index-url ${PYTORCH_INDEX_URL} \
torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 || \
uv pip install --no-cache --index-url ${UV_INDEX_URL} \
torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1

# 10. Copy the rest of the project files (after dependencies are installed)
COPY . /rag_arc/

# 11. Reinstall the package in editable mode using uv sync
RUN uv sync --no-cache --index-url ${UV_INDEX_URL}

# 12. Create necessary directories
RUN mkdir -p \
/rag_arc/data/parsed_files \
/rag_arc/data/file_store \
/rag_arc/data/chunk_store \
/rag_arc/data/faiss_index \
/rag_arc/data/unified_faiss_index \
/rag_arc/data/unified_bm25_index \
/rag_arc/data/graph_index \
/rag_arc/data/graph_index_neo4j \
/rag_arc/local/files \
/rag_arc/models

# 13. Expose application port
EXPOSE 8000

# 14. Set PATH to include virtual environment
ENV PATH="/rag_arc/.venv/bin:$PATH"

# 15. Start the application
CMD ["uv", "run", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
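
The GPU variant requires the NVIDIA Container Toolkit on the host so that --gpus works. A sketch of a build and run, with an illustrative tag and example mirror URLs matching the build args this Dockerfile declares:

    docker build -f Dockerfile.gpu -t rag-arc:gpu .
    # mainland-China mirrors via the declared args (URLs are examples)
    docker build -f Dockerfile.gpu -t rag-arc:gpu \
      --build-arg APT_GET_URL=http://mirrors.ustc.edu.cn \
      --build-arg UV_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple .
    docker run --gpus all --env-file .env -p 8000:8000 rag-arc:gpu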