Merged
6 changes: 6 additions & 0 deletions Cargo.toml
```diff
@@ -70,6 +70,7 @@ libc = "0.2"
 num_cpus = "1.16"
 sysinfo = "0.30"
 raw-cpuid = { version = "11.0", optional = true }
+hostname = "0.4"
 
 # Compression
 zstd = "0.13"
@@ -186,6 +187,11 @@ path = "src/bin/schema.rs"
 name = "search"
 path = "src/bin/search.rs"
 
+[[bin]]
+name = "roboflow"
+path = "src/bin/roboflow.rs"
+required-features = ["distributed"]
+
 # Benchmarks
 [[bench]]
 name = "profiler"
```
8 changes: 5 additions & 3 deletions crates/roboflow-distributed/src/scanner.rs
```diff
@@ -506,13 +506,15 @@ impl Scanner {
         if let Err(e) = self.tikv.batch_put(job_pairs).await {
             tracing::error!(
                 pod_id = %self.pod_id,
+                batch_size = chunk.len(),
                 error = %e,
-                "Failed to create batch of jobs"
+                "Failed to create batch of jobs - scan cycle incomplete, files skipped"
             );
             self.metrics.inc_scan_errors();
-        } else {
-            jobs_created += chunk.len() as u64;
+            // Return error to fail the entire scan cycle - continuing would skip files
+            return Err(e);
         }
+        jobs_created += chunk.len() as u64;
     }
     self.metrics.inc_jobs_created(jobs_created);
 
```
9 changes: 8 additions & 1 deletion crates/roboflow-distributed/src/worker.rs
```diff
@@ -1082,7 +1082,14 @@ impl Worker {
                 "Shutdown requested, not processing new job"
             );
             // Release the job back to Pending
-            let _ = self.release_job(&job_id).await;
+            if let Err(e) = self.release_job(&job_id).await {
+                tracing::error!(
+                    pod_id = %self.pod_id,
+                    job_id = %job_id,
+                    error = %e,
+                    "CRITICAL: Failed to release job during shutdown - job may be stuck in Processing state"
+                );
+            }
             break;
         }
 
```
251 changes: 251 additions & 0 deletions deploy/k8s/README.md
# Roboflow Kubernetes Deployment

This directory contains Kubernetes manifests for deploying Roboflow as long-running worker pods.

## Architecture

```
┌─────────────────────────────────────────────────────────────┐
│ roboflow-worker Deployment │
│ ┌───────────────────────────────────────────────────────┐ │
│ │ Worker Container │ │
│ │ - Claims jobs from TiKV queue │ │
│ │ - Processes bag/MCAP to LeRobot datasets │ │
│ │ - Sends heartbeats to TiKV │ │
│ │ - Health server on :8080 │ │
│ └───────────────────────────────────────────────────────┘ │
│ ┌───────────────────────────────────────────────────────┐ │
│ │ Scanner Sidecar │ │
│ │ - Discovers files in storage │ │
│ │ - Creates jobs in TiKV │ │
│ │ - Leader election via TiKV locks │ │
│ │ - Health server on :8081 │ │
│ └───────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────┘
┌───────────────┐
│ TiKV Cluster │
│ (Coordination)│
└───────────────┘
┌───────────────┐
│ S3/OSS Storage│
│ (Input/Output)│
└───────────────┘
```

## Manifests

| File | Description |
|------|-------------|
| `namespace.yaml` | Roboflow namespace |
| `configmap.yaml` | Configuration (TiKV endpoints, timeouts, paths) |
| `secrets.yaml` | Secret template for cloud storage credentials |
| `deployment.yaml` | Worker deployment with scanner sidecar |
| `scanner-standalone.yaml` | Optional standalone scanner deployment |
| `service.yaml` | Service (defined inline in `deployment.yaml`) |
| `hpa.yaml` | HorizontalPodAutoscaler for auto-scaling |
| `pdb.yaml` | PodDisruptionBudget for graceful updates |
| `servicemonitor.yaml` | Prometheus ServiceMonitor for metrics |

## Quick Start

### 1. Create Namespace

```bash
kubectl apply -f deploy/k8s/namespace.yaml
```

### 2. Create ConfigMap

```bash
kubectl apply -f deploy/k8s/configmap.yaml
```

### 3. Create Secret (for cloud storage)

```bash
kubectl create secret generic roboflow-secrets \
--from-literal=AWS_ACCESS_KEY_ID=your_key_id \
--from-literal=AWS_SECRET_ACCESS_KEY=your_secret \
--from-literal=AWS_REGION=us-east-1 \
--namespace=roboflow
```

Or use IRSA (IAM Roles for Service Accounts) for AWS, which is recommended for production.
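
With IRSA, credentials come from an annotated ServiceAccount instead of a Secret. A minimal sketch (the role ARN and ServiceAccount name below are placeholders, not taken from this repo; the ServiceAccount must match `serviceAccountName` in `deployment.yaml`):

```yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: roboflow-worker            # placeholder; match serviceAccountName in deployment.yaml
  namespace: roboflow
  annotations:
    # Hypothetical role ARN - create the IAM role with S3 access
    # and an OIDC trust policy for this ServiceAccount first
    eks.amazonaws.com/role-arn: arn:aws:iam::123456789012:role/roboflow-s3-access
```

With IRSA in place, the `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY` entries can be dropped; the AWS SDK picks up credentials from the projected token automatically.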

### 4. Deploy Workers

```bash
kubectl apply -f deploy/k8s/deployment.yaml
```

### 5. Deploy HPA and PDB

```bash
kubectl apply -f deploy/k8s/hpa.yaml
kubectl apply -f deploy/k8s/pdb.yaml
```

### 6. (Optional) Deploy ServiceMonitor

```bash
kubectl apply -f deploy/k8s/servicemonitor.yaml
```

## Configuration

### Environment Variables

Configuration is managed via the `roboflow-config` ConfigMap. Key variables:

| Variable | Default | Description |
|----------|---------|-------------|
| `TIKV_PD_ENDPOINTS` | `127.0.0.1:2379` | TiKV placement driver endpoints |
| `STORAGE_URL` | - | S3/OSS storage URL |
| `WORKER_MAX_CONCURRENT_JOBS` | `1` | Max jobs per worker |
| `WORKER_POLL_INTERVAL_SECS` | `5` | Job poll interval |
| `HEALTH_PORT` | `8080` | Health server port |
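
These variables typically reach the containers via `envFrom`. A sketch of the wiring (this assumes `deployment.yaml` references the ConfigMap and Secret this way; check the manifest for the actual fields):

```yaml
# Fragment of a pod spec - not a complete manifest
containers:
  - name: worker
    envFrom:
      - configMapRef:
          name: roboflow-config
      - secretRef:
          name: roboflow-secrets
          optional: true   # absent when using IRSA instead of static credentials
```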

### Resource Requirements

Default resource requests/limits per pod:

| Container | CPU Request | CPU Limit | Memory Request | Memory Limit |
|-----------|-------------|----------|----------------|--------------|
| Worker | 4 | 8 | 16Gi | 32Gi |
| Scanner | 500m | 1 | 512Mi | 1Gi |

### GPU Support

Uncomment the GPU resource limit in `deployment.yaml`:

```yaml
resources:
  limits:
    nvidia.com/gpu: "1"
```

## Health Probes

Both containers expose HTTP health endpoints (worker on port 8080, scanner sidecar on port 8081):

- `/health/live` - Liveness probe (always returns 200 if process running)
- `/health/ready` - Readiness probe (200 when connected to TiKV)
- `/health` - Basic health check

Test from within the cluster:

```bash
kubectl exec -n roboflow deployment/roboflow-worker -c worker -- \
curl http://localhost:8080/health
```

## Scaling

### Manual Scaling

```bash
kubectl scale deployment/roboflow-worker --replicas=10 -n roboflow
```

### Auto-Scaling

The HPA is configured to scale based on CPU and memory utilization:

```bash
kubectl get hpa -n roboflow
```

For custom metrics (e.g., pending jobs in TiKV), configure Prometheus Adapter and update `hpa.yaml`.
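
As a sketch, an external-metrics block for `hpa.yaml` might look like this once Prometheus Adapter exposes a queue-depth metric (the metric name `roboflow_pending_jobs` is hypothetical and must match whatever the adapter is configured to serve):

```yaml
# Hypothetical addition to hpa.yaml - requires Prometheus Adapter
# serving the external.metrics.k8s.io API
metrics:
  - type: External
    external:
      metric:
        name: roboflow_pending_jobs   # hypothetical adapter-exposed metric
      target:
        type: AverageValue
        averageValue: "10"            # scale up beyond ~10 pending jobs per worker
```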

## Monitoring

Metrics are exposed in Prometheus format on `/metrics` (port 8080).

Available metrics:
- `roboflow_jobs_claimed_total` - Total jobs claimed
- `roboflow_jobs_completed_total` - Total jobs completed
- `roboflow_jobs_failed_total` - Total jobs failed
- `roboflow_active_jobs` - Current active jobs
- `roboflow_scanner_files_discovered_total` - Files discovered
- `roboflow_scanner_jobs_created_total` - Jobs created
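
If the Prometheus Operator is installed (the ServiceMonitor assumes it), these counters support simple alerting rules. An illustrative sketch using only metric names listed above (the rule name and threshold are examples, not shipped in `deploy/k8s/`):

```yaml
# Sketch of a PrometheusRule - names and thresholds are illustrative
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: roboflow-alerts
  namespace: roboflow
spec:
  groups:
    - name: roboflow
      rules:
        - alert: RoboflowJobFailures
          expr: rate(roboflow_jobs_failed_total[10m]) > 0
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "Roboflow workers are failing jobs"
```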

## Logs

View logs for a specific pod:

```bash
kubectl logs -n roboflow deployment/roboflow-worker -c worker -f
```

View scanner logs:

```bash
kubectl logs -n roboflow deployment/roboflow-worker -c scanner -f
```

## Troubleshooting

### Worker not claiming jobs

1. Check TiKV connectivity:

   ```bash
   kubectl exec -n roboflow deployment/roboflow-worker -c worker -- \
     nc -zv tikv.tikv.svc.cluster.local 2379
   ```

2. Check worker logs for errors:

   ```bash
   kubectl logs -n roboflow deployment/roboflow-worker -c worker --tail=100
   ```

3. Verify jobs exist in TiKV (requires TiKV CLI)

### Scanner not creating jobs

1. Check scanner logs:

   ```bash
   kubectl logs -n roboflow deployment/roboflow-worker -c scanner --tail=100
   ```

2. Verify storage accessibility and `SCANNER_INPUT_PREFIX`

3. Check whether the scanner holds the leader lock (only the leader creates jobs)

### Pod failing readiness probe

1. Check if TiKV is reachable from the pod
2. Verify `TIKV_PD_ENDPOINTS` in ConfigMap
3. Check network policies allow pod-to-TiKV communication

## Upgrading

Rolling updates are configured with 25% max unavailable and 25% max surge:

```bash
kubectl set image deployment/roboflow-worker \
worker=roboflow:v2.0.0 \
scanner=roboflow:v2.0.0 \
-n roboflow
```
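
The 25%/25% strategy above corresponds to this block in `deployment.yaml` (shown for reference; these values are also the Kubernetes defaults for a RollingUpdate Deployment):

```yaml
strategy:
  type: RollingUpdate
  rollingUpdate:
    maxUnavailable: 25%
    maxSurge: 25%
```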

Watch the rollout:

```bash
kubectl rollout status deployment/roboflow-worker -n roboflow
```

## Cleanup

Delete all resources:

```bash
kubectl delete namespace roboflow
```

Or delete individual resources:

```bash
kubectl delete -f deploy/k8s/
```
46 changes: 46 additions & 0 deletions deploy/k8s/configmap.yaml
```yaml
# SPDX-FileCopyrightText: 2026 ArcheBase
#
# SPDX-License-Identifier: MulanPSL-2.0

---
apiVersion: v1
kind: ConfigMap
metadata:
  name: roboflow-config
  namespace: roboflow
data:
  # TiKV Configuration
  TIKV_PD_ENDPOINTS: "tikv.tikv.svc.cluster.local:2379"
  TIKV_CONNECTION_TIMEOUT_SECS: "10"
  TIKV_OPERATION_TIMEOUT_SECS: "30"
  TIKV_TRANSACTION_TIMEOUT_SECS: "60"

  # Storage Configuration
  # OSS credentials should be provided via Secret, not ConfigMap
  STORAGE_URL: "s3://roboflow-data"

  # Worker Configuration
  WORKER_POLL_INTERVAL_SECS: "5"
  WORKER_MAX_CONCURRENT_JOBS: "1"
  WORKER_MAX_ATTEMPTS: "3"
  WORKER_JOB_TIMEOUT_SECS: "3600"
  WORKER_HEARTBEAT_INTERVAL_SECS: "30"
  WORKER_CHECKPOINT_INTERVAL_FRAMES: "100"
  WORKER_CHECKPOINT_INTERVAL_SECS: "10"
  WORKER_STORAGE_PREFIX: "input/"
  WORKER_OUTPUT_PREFIX: "output/"

  # Scanner Configuration
  SCANNER_INPUT_PREFIX: "input/"
  SCANNER_SCAN_INTERVAL_SECS: "60"
  SCANNER_OUTPUT_PREFIX: "output/"
  # SCANNER_FILE_PATTERN: "*.mcap"   # Optional: filter files by glob pattern

  # Health Server Configuration
  HEALTH_HOST: "0.0.0.0"
  HEALTH_PORT: "8080"

  # Logging Configuration
  LOG_FORMAT: "json"
  LOG_LEVEL: "info"
  RUST_LOG: "roboflow=info,roboflow_distributed=info"
```