llm-d · kfirtoledo · Nov 28, 2025 · Nov 23, 2025 · Nov 23, 2025 · Nov 23, 2025
@@ -0,0 +1,46 @@
+# Copyright 2025 The llm-d Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+BasedOnStyle: Google
+UseTab: Never
+IndentWidth: 4
+ColumnLimit: 140
+
+# Disable argument and parameter bin-packing for consistent multi-line formatting
+BinPackArguments: false
+BinPackParameters: false
+AllowAllArgumentsOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+
+# Force pointers to the type for C++.
+DerivePointerAlignment: false
+PointerAlignment: Left
+
+# Reordering #include statements can (and currently will) introduce errors
+SortIncludes: false
+
+# Style choices
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+IndentPPDirectives: BeforeHash
+
+# Include grouping rules: each #include is matched against the regexes below
+# and assigned the Priority of the matching category; lower Priority values
+# are placed first when clang-format sorts includes.
+# NOTE(review): SortIncludes is set to false earlier in this file, so these
+# categories presumably have no effect unless include sorting is re-enabled
+# -- confirm.
+# NOTE(review): the "qoda" prefix below does not match any path visible in
+# this change; verify it is intentional and not left over from another project.
+IncludeCategories:
+  - Regex:           '^<'
+    Priority:        4
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
+    Priority:        3
+  - Regex:           '^"(qoda|\.\.)/'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        1
@@ -25,7 +25,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo add-apt-repository ppa:deadsnakes/ppa -y
-          sudo apt-get install -y libzmq3-dev pkg-config python3.12 python3.12-dev python3.12-venv
+          sudo apt-get install -y libzmq3-dev pkg-config python3.12 python3.12-dev python3.12-venv clang-format
 
       - name: Sanity check repo contents
         run: ls -la
@@ -66,10 +66,12 @@ jobs:
           export CGO_CFLAGS="$(python3.12-config --cflags) -I./lib"
           export CGO_LDFLAGS="$(python3.12-config --ldflags --embed) -L./lib -ltokenizers -ldl -lm"
           export CGO_ENABLED=1
-          
+
           # Now run the linting command from the Makefile
           make precommit
 
+      - name: Run C/C++/CUDA formatting check
+        run: make clang-check
       - name: Run make build
         shell: bash
         run: |

@@ -22,6 +22,25 @@ __pycache__/
 *.pyd
 *.python-version
 
+# Python build artifacts
+*.egg-info/
+build/
+dist/
+
+# C++/CUDA build outputs
+*.o
+*.so
+*.d
+*.a
+
+# Ninja build files
+build.ninja
+.ninja_log
+.ninja_deps
+
+# Temporary pip build directories
+*.egg-info/
+
 # Go workspace file
 go.work
 go.work.sum

@@ -36,6 +36,8 @@ header:
     - "**/not_a_yaml"
     - "**/go.mod"
     - "**/go.sum"
+    - "**/*.cu"
+    - "**/*.cuh"
     - ".gitignore"
     - "LICENSE"
     - "NOTICE"

@@ -149,6 +149,15 @@ copr-fix:
 	@echo "Adding copyright headers..."
 	@docker run -i --rm -v $(shell pwd):/github/workspace apache/skywalking-eyes header fix
 
+# find(1) expression selecting every C/C++/CUDA source and header under
+# kv_connectors. Shared by the formatting targets below so that the set of
+# files that gets formatted and the set that gets checked cannot drift apart.
+CLANG_FORMAT_FIND := find kv_connectors -type f \( \
+    -name "*.cu" -o -name "*.cuh" -o \
+    -name "*.cc" -o -name "*.cpp" -o \
+    -name "*.hpp" -o -name "*.h" \)
+
+.PHONY: clang
+clang: ## Rewrite C/C++/CUDA sources in place using the repository .clang-format style
+	@echo "Running clang-format..."
+	@$(CLANG_FORMAT_FIND) -exec clang-format -i {} +
+
+# Non-mutating counterpart of `clang`, invoked by CI as `make clang-check`.
+# --dry-run reports the changes clang-format would make without writing them,
+# and --Werror turns those reports into a non-zero exit status so the job fails.
+.PHONY: clang-check
+clang-check: ## Fail if any C/C++/CUDA source is not clang-format clean
+	@echo "Checking clang-format compliance..."
+	@$(CLANG_FORMAT_FIND) -exec clang-format --dry-run --Werror {} +
+
+
 ##@ Development
 
 # Common environment variables for Go tests and builds

@@ -0,0 +1,98 @@
+# llmd-fs-backend README
+
+## Overview
+The llmd-fs-backend extends the native [vLLM Offloading Connector](#offloading-connector-docs) to support a file system backend.
+This backend provides a shared-storage offloading layer for vLLM. It moves KV-cache blocks between GPU and shared storage efficiently using:
+
+- Async CUDA copies or GPU kernels
+- Pinned memory pools
+- Multi-threaded I/O workers
+- NUMA-aware CPU affinity
+- Atomic file writes and zero-copy reads
+
+The fs connector (llmd_fs_backend) is used for shared storage but it can also work with local disk.
+
+For architectural clarity, the fs connector is not responsible for cleanup. Storage systems should manage this.
+For simple setups, see the **Storage Cleanup** section.
+
+<img src="./docs/images/fs_connector.png" width="400" />
+
+## System Requirements
+- vLLM version 0.11.0 or above, which includes the Offloading Connector
+
+## Installation
+
+```bash
+apt-get update && apt-get install -y libnuma-dev
+pip install git+https://github.com/llm-d/llm-d-kv-cache-manager.git#subdirectory=kv_connectors/llmd_fs_backend
+```
+
+This installs:
+- Python module `llmd_fs_backend`
+- CUDA extension `storage_offload.so`
+
+## Configuration Flags
+
+### Connector parameters
+
+- `shared_storage_path`: filesystem path used to store and load the KV files.
+- `block_size`: number of GPU blocks grouped into each file (must be a multiple of the GPU block size)
+- `threads_per_gpu`: number of I/O threads per GPU
+- `max_pinned_memory_gb`: total pinned memory limit
+
+### Environment variables
+- `STORAGE_CONNECTOR_DEBUG`: enable debug logs
+- `USE_KERNEL_COPY_WRITE`: enable GPU-kernel writes (default 0)
+- `USE_KERNEL_COPY_READ`: enable GPU-kernel reads (default 1)
+
+## Example vLLM YAML
+
+To load the fs connector:
+
+```yaml
+--kv-transfer-config '{
+  "kv_connector": "OffloadingConnector",
+  "kv_role": "kv_both",
+  "kv_connector_extra_config": {
+    "spec_name": "SharedStorageOffloadingSpec",
+    "spec_module_path": "llmd_fs_backend.spec",
+    "shared_storage_path": "/mnt/files-storage/kv-cache/",
+    "block_size": 256,
+    "threads_per_gpu": "64"
+  }
+}'
+--distributed_executor_backend "mp"
+```
+
+A full deployment example can be found in the [`docs`](./docs/deployment) folder.
+
+It is recommended to use multiprocess mode by setting:
+`--distributed_executor_backend "mp"`
+
+To configure environment variables:
+
+```yaml
+env:
+- name: STORAGE_CONNECTOR_DEBUG
+  value: "1"
+```
+
+## Storage Cleanup
+TBD
+
+## Troubleshooting
+
+### Missing `numa.h`
+Install the required package:
+
+```bash
+apt-get install -y libnuma-dev
+```
+
+---
+
+## Link Aliases
+
+- **Offloading Connector Docs**
+  <a name="offloading-connector-docs"></a>
+  https://docs.vllm.ai/en/stable/features/disagg_prefill/#usage-example:~:text=backends%22%3A%5B%22UCX%22%2C%20%22GDS%22%5D%7D%7D%27-,OffloadingConnector,-%3A%20enable%20offloading%20of
@@ -0,0 +1,39 @@
+# Build-time requirements. The accompanying setup.py imports
+# torch.utils.cpp_extension, so torch must be importable while the extension
+# is being built.
+# NOTE(review): torch is unpinned here while the runtime dependency below is
+# "torch>=2.1"; under pip build isolation the extension may be compiled against
+# a different torch than the one installed at runtime -- confirm this is safe.
+[build-system]
+requires = [
+    "setuptools>=65",
+    "wheel",
+    "torch",
+    "ninja"
+]
+build-backend = "setuptools.build_meta"
+
+# Distribution metadata.
+# NOTE(review): setup.py passes name="storage_offload" to setup(), which
+# differs from the name declared here -- confirm which distribution name is
+# intended.
+[project]
+name = "llmd_fs_connector"
+version = "0.1.0"
+description = "Standalone llm-d fs storage connector"
+readme = "README.md"
+authors = [
+    { name = "Kfir", email = "kfir.toledo@ibm.com" }
+]
+maintainers = [
+    { name = "llm-d community" }
+]
+requires-python = ">=3.9"
+dependencies = [
+    "torch>=2.1",
+]
+
+# The single Python package, llmd_fs_backend, is located under the src/
+# directory.
+[tool.setuptools]
+packages = ["llmd_fs_backend"]
+package-dir = {"" = "src"}
+
+# Include shared objects (*.so) found inside the package directory in the
+# built distribution.
+[tool.setuptools.package-data]
+llmd_fs_backend = ["*.so"]
+
+# Optional "dev" extra: the packages listed below are installed only when the
+# extra is requested.
+[project.optional-dependencies]
+dev = [
+    "vllm",
+    "pytest",
+    "black",
+    "ruff",
+]
@@ -0,0 +1,40 @@
+# Copyright 2025 The llm-d Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from setuptools import setup, find_packages
+from torch.utils.cpp_extension import CUDAExtension, BuildExtension
+
+setup(
+    name="storage_offload",
+    packages=find_packages("src"),
+    package_dir={"": "src"},
+    ext_modules=[
+        CUDAExtension(
+            "storage_offload",
+             sources=[
+                "src/csrc/storage/storage_offload.cu",
+                "src/csrc/storage/buffer.cpp",
+                "src/csrc/storage/file_io.cpp",
+                "src/csrc/storage/thread_pool.cpp",
+                "src/csrc/storage/tensor_copy.cu",
+            ],
+            libraries=['numa', 'cuda'],
+            extra_compile_args={
+                "cxx": ["-O3", "-std=c++17", "-fopenmp"],
+                "nvcc": ["-O3", "-std=c++17", "-Xcompiler", "-std=c++17","-Xcompiler", "-fopenmp"]
+            }
+        ),
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)