modelplaneai · dennis-upbound · May 13, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/design/modelcache/design.md b/design/modelcache/design.md
diff --git a/design/modelcache/examples/01-basic-weights.yaml b/design/modelcache/examples/01-basic-weights.yaml
@@ -0,0 +1,62 @@
+# Basic weights cache — single cluster, single node.
+#
+# The most common case: a single-node deployment (TP only) that wants to
+# avoid redownloading Llama 70B every time the engine pod restarts.
+#
+# ModelCache pulls weights from HuggingFace to a per-cluster RWX PVC;
+# the engine mounts it read-only and starts with --model pointing at
+# the local path.
+#
+# Speedup (rough): 140 GB at ~50 MB/s = ~45 min cold pull. ModelCache
+# turns this into one pull per cluster; every replica restart after
+# that downloads ~0 bytes.
+
+apiVersion: modelplane.ai/v1alpha1
+kind: ModelCache
+metadata:
+  namespace: ml-team
+  name: llama-3-3-70b  # referenced by name from the deployment's caches list
+spec:
+  replication: AllMatchingClusters
+  clusterSelector:
+    matchLabels:
+      modelplane.ai/tier: prod  # same label the deployment selects on
+  artifact:
+    kind: Weights
+    source:
+      huggingFace:
+        secretRef: { key: token, name: hf-token }  # credential for the HF pull
+        repo: meta-llama/Llama-3.3-70B-Instruct
+  storage:
+    backend: PVC
+    pvc:
+      storageClassName: filestore-rwx
+  mount:
+    path: /mnt/model  # same path the engine passes to --model
+
+---
+apiVersion: modelplane.ai/v1alpha1
+kind: ModelDeployment
+metadata:
+  namespace: ml-team
+  name: llama-3-3-70b
+spec:
+  caches:
+    - name: llama-3-3-70b  # matches the ModelCache's metadata.name
+  clusterSelector:
+    matchLabels:
+      modelplane.ai/tier: prod  # identical to the ModelCache selector
+  replicas: 2
+  engine:
+    image: vllm/vllm-openai:v0.11.0
+    args:
+      - --model=/mnt/model  # the cache's mount.path
+      - --tensor-parallel-size=8  # equals workers.topology.tensor
+      - --max-model-len=131072
+  workers:
+    resources:
+      memory: '64Gi'
+      cpu: '16'
+    topology:
+      tensor: 8
+      strategy: Tensor
diff --git a/design/modelcache/examples/02-multi-node-llama-405b.yaml b/design/modelcache/examples/02-multi-node-llama-405b.yaml
@@ -0,0 +1,68 @@
+# Multi-node serving — TensorPipeline gang, shared RWX weights.
+#
+# Llama 3.1 405B doesn't fit on one node. Two nodes of 8 GPUs each
+# (TP=8, PP=2) form one LWS gang. Both pods need the same 810 GB of
+# weights. Without ModelCache each pod would download them independently
+# (impractical); KServe explicitly requires a shared RWX volume here.
+#
+# v0.1's PVC backend handles this directly: one RWX PVC populated once
+# by a Job, mounted by both leader and worker pods.
+#
+# Speedup (rough): without sharing, 810 GB × 2 pods = ~5 hr per gang
+# restart and KServe's init container OOMs. Shared RWX PVC: one ~80 min
+# pull, both pods mount it instantly.
+
+apiVersion: modelplane.ai/v1alpha1
+kind: ModelCache
+metadata:
+  namespace: ml-team
+  name: llama-3-1-405b  # referenced by name from the deployment's caches list
+spec:
+  replication: AllMatchingClusters
+  clusterSelector:
+    matchLabels:
+      modelplane.ai/rdma: 'true'  # quoted so the label value stays a string
+      modelplane.ai/tier: prod
+  artifact:
+    kind: Weights
+    source:
+      huggingFace:
+        secretRef: { key: token, name: hf-token }  # credential for the HF pull
+        repo: meta-llama/Llama-3.1-405B-Instruct
+  storage:
+    backend: PVC
+    pvc:
+      sizeGiB: 900  # headroom over the ~810 GB of weights
+      storageClassName: filestore-rwx
+  mount:
+    path: /mnt/model  # same path the engine passes to --model
+
+---
+apiVersion: modelplane.ai/v1alpha1
+kind: ModelDeployment
+metadata:
+  namespace: ml-team
+  name: llama-3-1-405b
+spec:
+  caches:
+    - name: llama-3-1-405b  # matches the ModelCache's metadata.name
+  clusterSelector:
+    matchLabels:
+      modelplane.ai/rdma: 'true'  # identical to the ModelCache selector
+      modelplane.ai/tier: prod
+  replicas: 1
+  engine:
+    image: vllm/vllm-openai:v0.11.0
+    args:
+      - --model=/mnt/model  # the cache's mount.path
+      - --tensor-parallel-size=8  # equals workers.topology.tensor
+      - --pipeline-parallel-size=2  # equals workers.topology.pipeline
+      - --max-model-len=131072
+  workers:
+    resources:
+      memory: '256Gi'
+      cpu: '32'
+    topology:
+      pipeline: 2
+      tensor: 8
+      strategy: TensorPipeline
diff --git a/design/modelcache/examples/03-multi-cluster-replication.yaml b/design/modelcache/examples/03-multi-cluster-replication.yaml
@@ -0,0 +1,61 @@
+# Multi-cluster replication — same model staged to every cluster the
+# deployment may land on.
+#
+# Production deployment of Qwen3-32B fans out to 4 replicas across
+# multiple production clusters (us-east, us-west, eu-west). ModelCache
+# pre-stages the weights to every matching cluster so any cluster the
+# scheduler picks already has the bytes ready.
+#
+# Engine starts from the local PVC; no HF traffic at replica scale-up.
+#
+# Speedup (rough): up to 4N redundant 65 GB pulls without ModelCache
+# (N = regions). With ModelCache: one ~25 min pull per cluster, then ~0
+# per replica. Wins when N is 3+ and replicas churn.
+
+apiVersion: modelplane.ai/v1alpha1
+kind: ModelCache
+metadata:
+  namespace: ml-team
+  name: qwen3-32b  # referenced by name from the deployment's caches list
+spec:
+  replication: AllMatchingClusters
+  clusterSelector:
+    matchLabels:
+      modelplane.ai/tier: prod  # same label the deployment selects on
+  artifact:
+    kind: Weights
+    source:
+      huggingFace:
+        repo: Qwen/Qwen3-32B  # no secretRef is set for this source
+  storage:
+    backend: PVC
+    pvc:
+      storageClassName: filestore-rwx
+  mount:
+    path: /mnt/model  # same path the engine passes to --model
+
+---
+apiVersion: modelplane.ai/v1alpha1
+kind: ModelDeployment
+metadata:
+  namespace: ml-team
+  name: qwen3-32b
+spec:
+  caches:
+    - name: qwen3-32b  # matches the ModelCache's metadata.name
+  clusterSelector:
+    matchLabels:
+      modelplane.ai/tier: prod  # identical to the ModelCache selector
+  replicas: 4
+  engine:
+    image: vllm/vllm-openai:v0.11.0
+    args:
+      - --model=/mnt/model  # the cache's mount.path
+      - --tensor-parallel-size=4  # equals workers.topology.tensor
+  workers:
+    resources:
+      memory: '64Gi'
+      cpu: '16'
+    topology:
+      tensor: 4
+      strategy: Tensor
diff --git a/design/modelcache/examples/04-separate-tokenizer.yaml b/design/modelcache/examples/04-separate-tokenizer.yaml
@@ -0,0 +1,93 @@
+# Separate Tokenizer cache — multiple ModelCaches referenced from one
+# ModelDeployment.
+#
+# Some workflows want the tokenizer staged separately — e.g. a custom
+# tokenizer different from the model's bundled one, or a tokenizer
+# update independent of the weights. v0.1 keeps one artifact per
+# ModelCache; the deployment references multiple caches by name.
+#
+# Both caches and the deployment select the same clusters
+# (modelplane.ai/tier: prod), so every cluster the deployment can land
+# on already has the weights and the tokenizer staged.
+#
+# Speedup (rough): tokenizer is tiny (~1-10 MB) so direct download is
+# already fast. The win is operational: tokenizer updates roll
+# independently of weights, useful when fixing a tokenizer bug without
+# triggering a full weight re-stage.
+
+apiVersion: modelplane.ai/v1alpha1
+kind: ModelCache
+metadata:
+  name: mistral-small-weights
+  namespace: ml-team
+spec:
+  artifact:
+    kind: Weights
+    source:
+      huggingFace:
+        repo: mistralai/Mistral-Small-3.2-24B-Instruct-2506
+  mount:
+    path: /mnt/model  # passed to the engine as --model
+  storage:
+    backend: PVC
+    pvc:
+      storageClassName: filestore-rwx
+  clusterSelector:
+    matchLabels:
+      modelplane.ai/tier: prod
+  replication: AllMatchingClusters
+
+---
+apiVersion: modelplane.ai/v1alpha1
+kind: ModelCache
+metadata:
+  name: custom-mistral-tokenizer
+  namespace: ml-team
+spec:
+  artifact:
+    kind: Tokenizer
+    source:
+      s3:
+        uri: s3://ml-team-artifacts/tokenizers/mistral-custom-v2/
+        secretRef: { name: aws-creds }
+  mount:
+    path: /mnt/tokenizer  # passed to the engine as --tokenizer
+  storage:
+    backend: PVC
+    pvc:
+      storageClassName: filestore-rwx
+      sizeGiB: 1
+  clusterSelector:
+    matchLabels:
+      modelplane.ai/tier: prod
+  replication: AllMatchingClusters
+
+---
+apiVersion: modelplane.ai/v1alpha1
+kind: ModelDeployment
+metadata:
+  name: mistral-small
+  namespace: ml-team
+spec:
+  replicas: 2
+  # Same selector as both ModelCaches, so the deployment targets only the
+  # clusters those caches are replicated to.
+  clusterSelector:
+    matchLabels:
+      modelplane.ai/tier: prod
+  workers:
+    topology:
+      strategy: Tensor
+      tensor: 2
+    resources:
+      cpu: "8"
+      memory: "32Gi"
+  engine:
+    image: vllm/vllm-openai:v0.11.0
+    args:
+      - --model=/mnt/model
+      - --tokenizer=/mnt/tokenizer
+      - --tensor-parallel-size=2
+  caches:
+    - name: mistral-small-weights
+    - name: custom-mistral-tokenizer
diff --git a/design/modelcache/examples/05-private-s3-source.yaml b/design/modelcache/examples/05-private-s3-source.yaml
@@ -0,0 +1,63 @@
+# Private S3 source with credentials — air-gapped / regulated case.
+#
+# Some teams can't pull from HuggingFace (data residency, custom
+# fine-tunes, compliance). The platform team mirrors approved model
+# weights into an internal S3 bucket; ModelCache pulls from there.
+#
+# Cluster selector restricts the cache to clusters in a compliant
+# region. The Secret in this namespace holds the S3 credentials.
+#
+# Speedup (rough): intra-region S3 ~1 GB/s vs HF ~50 MB/s. 140 GB =
+# ~3 min vs ~45 min. Plus the bytes never leave a compliant region.
+
+apiVersion: modelplane.ai/v1alpha1
+kind: ModelCache
+metadata:
+  namespace: ml-team
+  name: company-finetune-llama-70b  # referenced from the deployment's caches list
+spec:
+  replication: AllMatchingClusters
+  clusterSelector:
+    matchLabels:
+      modelplane.ai/compliance: gdpr
+      modelplane.ai/region: eu-west-1  # same region as the S3 source below
+  artifact:
+    kind: Weights
+    source:
+      s3:
+        region: eu-west-1
+        secretRef: { name: company-s3-readonly }  # Secret with the S3 credentials
+        uri: s3://company-ml-artifacts/finetunes/llama-70b-support-v17/
+  storage:
+    backend: PVC
+    pvc:
+      storageClassName: efs-rwx
+  mount:
+    path: /mnt/model  # same path the engine passes to --model
+
+---
+apiVersion: modelplane.ai/v1alpha1
+kind: ModelDeployment
+metadata:
+  namespace: ml-team
+  name: support-llm
+spec:
+  caches:
+    - name: company-finetune-llama-70b  # matches the ModelCache's metadata.name
+  clusterSelector:
+    matchLabels:
+      modelplane.ai/compliance: gdpr  # identical to the ModelCache selector
+      modelplane.ai/region: eu-west-1
+  replicas: 3
+  engine:
+    image: vllm/vllm-openai:v0.11.0
+    args:
+      - --model=/mnt/model  # the cache's mount.path
+      - --tensor-parallel-size=8  # equals workers.topology.tensor
+  workers:
+    resources:
+      memory: '64Gi'
+      cpu: '16'
+    topology:
+      tensor: 8
+      strategy: Tensor