Skip to content

Commit 5089bba

Browse files
committed
feat(supervisor): require metadata URL when compute snapshots enabled
1 parent 4332743 commit 5089bba

File tree

1 file changed

+157
-143
lines changed

1 file changed

+157
-143
lines changed

apps/supervisor/src/env.ts

Lines changed: 157 additions & 143 deletions
Original file line number | Diff line number | Diff line change
@@ -3,148 +3,162 @@ import { env as stdEnv } from "std-env";
33
import { z } from "zod";
44
import { AdditionalEnvVars, BoolEnv } from "./envUtil.js";
55

6-
// Supervisor environment schema as it stood before this commit (the removed
// side of the diff). Each variable is validated on its own; this version has
// no cross-field check linking COMPUTE_SNAPSHOTS_ENABLED to TRIGGER_METADATA_URL.
const Env = z.object({
7-
// This will come from `spec.nodeName` in k8s
8-
TRIGGER_WORKER_INSTANCE_NAME: z.string().default(randomUUID()),
9-
TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().default(30),
10-
11-
// Required settings
12-
TRIGGER_API_URL: z.string().url(),
13-
TRIGGER_WORKER_TOKEN: z.string(), // accepts file:// path to read from a file
14-
MANAGED_WORKER_SECRET: z.string(),
15-
OTEL_EXPORTER_OTLP_ENDPOINT: z.string().url(), // set on the runners
16-
17-
// Workload API settings (coordinator mode) - the workload API is what the run controller connects to
18-
TRIGGER_WORKLOAD_API_ENABLED: BoolEnv.default(true),
19-
// The value is lowercased before being checked against "http" / "https", so matching is case-insensitive
TRIGGER_WORKLOAD_API_PROTOCOL: z
20-
.string()
21-
.transform((s) => z.enum(["http", "https"]).parse(s.toLowerCase()))
22-
.default("http"),
23-
TRIGGER_WORKLOAD_API_DOMAIN: z.string().optional(), // If unset, will use orchestrator-specific default
24-
TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"),
25-
TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on
26-
TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller
27-
28-
// Runner settings
29-
RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(),
30-
RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().optional(),
31-
RUNNER_ADDITIONAL_ENV_VARS: AdditionalEnvVars, // optional (csv)
32-
RUNNER_PRETTY_LOGS: BoolEnv.default(false),
33-
34-
// Dequeue settings (provider mode)
35-
TRIGGER_DEQUEUE_ENABLED: BoolEnv.default(true),
36-
TRIGGER_DEQUEUE_INTERVAL_MS: z.coerce.number().int().default(250),
37-
TRIGGER_DEQUEUE_IDLE_INTERVAL_MS: z.coerce.number().int().default(1000),
38-
TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(1),
39-
TRIGGER_DEQUEUE_MIN_CONSUMER_COUNT: z.coerce.number().int().default(1),
40-
TRIGGER_DEQUEUE_MAX_CONSUMER_COUNT: z.coerce.number().int().default(10),
41-
TRIGGER_DEQUEUE_SCALING_STRATEGY: z.enum(["none", "smooth", "aggressive"]).default("none"),
42-
TRIGGER_DEQUEUE_SCALING_UP_COOLDOWN_MS: z.coerce.number().int().default(5000), // 5 seconds
43-
TRIGGER_DEQUEUE_SCALING_DOWN_COOLDOWN_MS: z.coerce.number().int().default(30000), // 30 seconds
44-
TRIGGER_DEQUEUE_SCALING_TARGET_RATIO: z.coerce.number().default(1.0), // Target ratio of queue items to consumers (1.0 = 1 item per consumer)
45-
TRIGGER_DEQUEUE_SCALING_EWMA_ALPHA: z.coerce.number().min(0).max(1).default(0.3), // Smooths queue length measurements (0=historical, 1=current)
46-
TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS: z.coerce.number().int().positive().default(1000), // Batch window for metrics processing (ms)
47-
TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR: z.coerce.number().min(0).max(1).default(0.7), // Smooths consumer count changes after EWMA (0=no scaling, 1=immediate)
48-
49-
// Optional services
50-
TRIGGER_WARM_START_URL: z.string().optional(),
51-
TRIGGER_CHECKPOINT_URL: z.string().optional(),
52-
TRIGGER_METADATA_URL: z.string().optional(),
53-
54-
// Used by the resource monitor
55-
RESOURCE_MONITOR_ENABLED: BoolEnv.default(false),
56-
RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL: z.coerce.number().optional(),
57-
RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB: z.coerce.number().optional(),
58-
59-
// Docker settings
60-
DOCKER_API_VERSION: z.string().optional(),
61-
DOCKER_PLATFORM: z.string().optional(), // e.g. linux/amd64, linux/arm64
62-
DOCKER_STRIP_IMAGE_DIGEST: BoolEnv.default(true),
63-
DOCKER_REGISTRY_USERNAME: z.string().optional(),
64-
DOCKER_REGISTRY_PASSWORD: z.string().optional(),
65-
DOCKER_REGISTRY_URL: z.string().optional(), // e.g. https://index.docker.io/v1
66-
DOCKER_ENFORCE_MACHINE_PRESETS: BoolEnv.default(true),
67-
DOCKER_AUTOREMOVE_EXITED_CONTAINERS: BoolEnv.default(true),
68-
/**
69-
* Network mode to use for all runners. Supported standard values are: `bridge`, `host`, `none`, and `container:<name|id>`.
70-
* Any other value is taken as a custom network's name to which all runners should connect to.
71-
*
72-
* Accepts a list of comma-separated values to attach to multiple networks. Additional networks are interpreted as network names and will be attached after container creation.
73-
*
74-
* **WARNING**: Specifying multiple networks will slightly increase startup times.
75-
*
76-
* @default "host"
77-
*/
78-
DOCKER_RUNNER_NETWORKS: z.string().default("host"),
79-
80-
// Compute settings
81-
COMPUTE_GATEWAY_URL: z.string().url().optional(),
82-
COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(),
83-
COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000),
84-
COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false),
85-
86-
// Kubernetes settings
87-
KUBERNETES_FORCE_ENABLED: BoolEnv.default(false),
88-
KUBERNETES_NAMESPACE: z.string().default("default"),
89-
KUBERNETES_WORKER_NODETYPE_LABEL: z.string().default("v4-worker"),
90-
KUBERNETES_IMAGE_PULL_SECRETS: z.string().optional(), // csv
91-
KUBERNETES_EPHEMERAL_STORAGE_SIZE_LIMIT: z.string().default("10Gi"),
92-
KUBERNETES_EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"),
93-
KUBERNETES_STRIP_IMAGE_DIGEST: BoolEnv.default(false),
94-
KUBERNETES_CPU_REQUEST_MIN_CORES: z.coerce.number().min(0).default(0),
95-
KUBERNETES_CPU_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(0.75), // Ratio of CPU limit, so 0.75 = 75% of CPU limit
96-
KUBERNETES_MEMORY_REQUEST_MIN_GB: z.coerce.number().min(0).default(0),
97-
KUBERNETES_MEMORY_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(1), // Ratio of memory limit, so 1 = 100% of memory limit
98-
99-
// Per-preset overrides of the global KUBERNETES_CPU_REQUEST_RATIO
100-
KUBERNETES_CPU_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(),
101-
KUBERNETES_CPU_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(),
102-
KUBERNETES_CPU_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(),
103-
KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(),
104-
KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(),
105-
KUBERNETES_CPU_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(),
106-
KUBERNETES_CPU_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(),
107-
108-
// Per-preset overrides of the global KUBERNETES_MEMORY_REQUEST_RATIO
109-
KUBERNETES_MEMORY_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(),
110-
KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(),
111-
KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(),
112-
KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(),
113-
KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(),
114-
KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(),
115-
KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(),
116-
117-
KUBERNETES_MEMORY_OVERHEAD_GB: z.coerce.number().min(0).optional(), // Optional memory overhead to add to the limit in GB
118-
KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods
119-
KUBERNETES_LARGE_MACHINE_POOL_LABEL: z.string().optional(), // if set, large-* presets affinity for machinepool=<value>
120-
121-
// Project affinity settings - pods from the same project prefer the same node
122-
KUBERNETES_PROJECT_AFFINITY_ENABLED: BoolEnv.default(false),
123-
KUBERNETES_PROJECT_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(50),
124-
KUBERNETES_PROJECT_AFFINITY_TOPOLOGY_KEY: z.string().trim().min(1).default("kubernetes.io/hostname"),
125-
126-
// Placement tags settings
127-
PLACEMENT_TAGS_ENABLED: BoolEnv.default(false),
128-
PLACEMENT_TAGS_PREFIX: z.string().default("node.cluster.x-k8s.io"),
129-
130-
// Metrics
131-
METRICS_ENABLED: BoolEnv.default(true),
132-
METRICS_COLLECT_DEFAULTS: BoolEnv.default(true),
133-
METRICS_HOST: z.string().default("127.0.0.1"),
134-
METRICS_PORT: z.coerce.number().int().default(9090),
135-
136-
// Pod cleaner
137-
POD_CLEANER_ENABLED: BoolEnv.default(true),
138-
POD_CLEANER_INTERVAL_MS: z.coerce.number().int().default(10000),
139-
POD_CLEANER_BATCH_SIZE: z.coerce.number().int().default(500),
140-
141-
// Failed pod handler
142-
FAILED_POD_HANDLER_ENABLED: BoolEnv.default(true),
143-
FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS: z.coerce.number().int().default(1000),
144-
145-
// Debug
146-
DEBUG: BoolEnv.default(false),
147-
SEND_RUN_DEBUG_LOGS: BoolEnv.default(false),
148-
});
6+
/**
 * Schema for the supervisor's environment variables.
 *
 * Each variable is validated (and coerced or defaulted where declared) on its
 * own by the object schema. A cross-field check then requires
 * TRIGGER_METADATA_URL to be set to a non-blank value whenever
 * COMPUTE_SNAPSHOTS_ENABLED is true.
 */
const Env = z
  .object({
    // This will come from `spec.nodeName` in k8s
    // Note: randomUUID() runs once when this schema is built, so the fallback name is fixed for the process
    TRIGGER_WORKER_INSTANCE_NAME: z.string().default(randomUUID()),
    TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().default(30),

    // Required settings
    TRIGGER_API_URL: z.string().url(),
    TRIGGER_WORKER_TOKEN: z.string(), // accepts file:// path to read from a file
    MANAGED_WORKER_SECRET: z.string(),
    OTEL_EXPORTER_OTLP_ENDPOINT: z.string().url(), // set on the runners

    // Workload API settings (coordinator mode) - the workload API is what the run controller connects to
    TRIGGER_WORKLOAD_API_ENABLED: BoolEnv.default(true),
    // The value is lowercased before being checked against "http" / "https", so matching is case-insensitive
    TRIGGER_WORKLOAD_API_PROTOCOL: z
      .string()
      .transform((s) => z.enum(["http", "https"]).parse(s.toLowerCase()))
      .default("http"),
    TRIGGER_WORKLOAD_API_DOMAIN: z.string().optional(), // If unset, will use orchestrator-specific default
    TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"),
    TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on
    TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller

    // Runner settings
    RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(),
    RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().optional(),
    RUNNER_ADDITIONAL_ENV_VARS: AdditionalEnvVars, // optional (csv)
    RUNNER_PRETTY_LOGS: BoolEnv.default(false),

    // Dequeue settings (provider mode)
    TRIGGER_DEQUEUE_ENABLED: BoolEnv.default(true),
    TRIGGER_DEQUEUE_INTERVAL_MS: z.coerce.number().int().default(250),
    TRIGGER_DEQUEUE_IDLE_INTERVAL_MS: z.coerce.number().int().default(1000),
    TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(1),
    TRIGGER_DEQUEUE_MIN_CONSUMER_COUNT: z.coerce.number().int().default(1),
    TRIGGER_DEQUEUE_MAX_CONSUMER_COUNT: z.coerce.number().int().default(10),
    TRIGGER_DEQUEUE_SCALING_STRATEGY: z.enum(["none", "smooth", "aggressive"]).default("none"),
    TRIGGER_DEQUEUE_SCALING_UP_COOLDOWN_MS: z.coerce.number().int().default(5000), // 5 seconds
    TRIGGER_DEQUEUE_SCALING_DOWN_COOLDOWN_MS: z.coerce.number().int().default(30000), // 30 seconds
    TRIGGER_DEQUEUE_SCALING_TARGET_RATIO: z.coerce.number().default(1.0), // Target ratio of queue items to consumers (1.0 = 1 item per consumer)
    TRIGGER_DEQUEUE_SCALING_EWMA_ALPHA: z.coerce.number().min(0).max(1).default(0.3), // Smooths queue length measurements (0=historical, 1=current)
    TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS: z.coerce.number().int().positive().default(1000), // Batch window for metrics processing (ms)
    TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR: z.coerce.number().min(0).max(1).default(0.7), // Smooths consumer count changes after EWMA (0=no scaling, 1=immediate)

    // Optional services
    TRIGGER_WARM_START_URL: z.string().optional(),
    TRIGGER_CHECKPOINT_URL: z.string().optional(),
    TRIGGER_METADATA_URL: z.string().optional(), // becomes required when COMPUTE_SNAPSHOTS_ENABLED is true (enforced in superRefine)

    // Used by the resource monitor
    RESOURCE_MONITOR_ENABLED: BoolEnv.default(false),
    RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL: z.coerce.number().optional(),
    RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB: z.coerce.number().optional(),

    // Docker settings
    DOCKER_API_VERSION: z.string().optional(),
    DOCKER_PLATFORM: z.string().optional(), // e.g. linux/amd64, linux/arm64
    DOCKER_STRIP_IMAGE_DIGEST: BoolEnv.default(true),
    DOCKER_REGISTRY_USERNAME: z.string().optional(),
    DOCKER_REGISTRY_PASSWORD: z.string().optional(),
    DOCKER_REGISTRY_URL: z.string().optional(), // e.g. https://index.docker.io/v1
    DOCKER_ENFORCE_MACHINE_PRESETS: BoolEnv.default(true),
    DOCKER_AUTOREMOVE_EXITED_CONTAINERS: BoolEnv.default(true),
    /**
     * Network mode to use for all runners. Supported standard values are: `bridge`, `host`, `none`, and `container:<name|id>`.
     * Any other value is taken as a custom network's name to which all runners should connect to.
     *
     * Accepts a list of comma-separated values to attach to multiple networks. Additional networks are interpreted as network names and will be attached after container creation.
     *
     * **WARNING**: Specifying multiple networks will slightly increase startup times.
     *
     * @default "host"
     */
    DOCKER_RUNNER_NETWORKS: z.string().default("host"),

    // Compute settings
    COMPUTE_GATEWAY_URL: z.string().url().optional(),
    COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(),
    COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000),
    COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false),

    // Kubernetes settings
    KUBERNETES_FORCE_ENABLED: BoolEnv.default(false),
    KUBERNETES_NAMESPACE: z.string().default("default"),
    KUBERNETES_WORKER_NODETYPE_LABEL: z.string().default("v4-worker"),
    KUBERNETES_IMAGE_PULL_SECRETS: z.string().optional(), // csv
    KUBERNETES_EPHEMERAL_STORAGE_SIZE_LIMIT: z.string().default("10Gi"),
    KUBERNETES_EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"),
    KUBERNETES_STRIP_IMAGE_DIGEST: BoolEnv.default(false),
    KUBERNETES_CPU_REQUEST_MIN_CORES: z.coerce.number().min(0).default(0),
    KUBERNETES_CPU_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(0.75), // Ratio of CPU limit, so 0.75 = 75% of CPU limit
    KUBERNETES_MEMORY_REQUEST_MIN_GB: z.coerce.number().min(0).default(0),
    KUBERNETES_MEMORY_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(1), // Ratio of memory limit, so 1 = 100% of memory limit

    // Per-preset overrides of the global KUBERNETES_CPU_REQUEST_RATIO
    KUBERNETES_CPU_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(),
    KUBERNETES_CPU_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(),
    KUBERNETES_CPU_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(),
    KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(),
    KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(),
    KUBERNETES_CPU_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(),
    KUBERNETES_CPU_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(),

    // Per-preset overrides of the global KUBERNETES_MEMORY_REQUEST_RATIO
    KUBERNETES_MEMORY_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(),
    KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(),
    KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(),
    KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(),
    KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(),
    KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(),
    KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(),

    KUBERNETES_MEMORY_OVERHEAD_GB: z.coerce.number().min(0).optional(), // Optional memory overhead to add to the limit in GB
    KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods
    KUBERNETES_LARGE_MACHINE_POOL_LABEL: z.string().optional(), // if set, large-* presets affinity for machinepool=<value>

    // Project affinity settings - pods from the same project prefer the same node
    KUBERNETES_PROJECT_AFFINITY_ENABLED: BoolEnv.default(false),
    KUBERNETES_PROJECT_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(50),
    KUBERNETES_PROJECT_AFFINITY_TOPOLOGY_KEY: z
      .string()
      .trim()
      .min(1)
      .default("kubernetes.io/hostname"),

    // Placement tags settings
    PLACEMENT_TAGS_ENABLED: BoolEnv.default(false),
    PLACEMENT_TAGS_PREFIX: z.string().default("node.cluster.x-k8s.io"),

    // Metrics
    METRICS_ENABLED: BoolEnv.default(true),
    METRICS_COLLECT_DEFAULTS: BoolEnv.default(true),
    METRICS_HOST: z.string().default("127.0.0.1"),
    METRICS_PORT: z.coerce.number().int().default(9090),

    // Pod cleaner
    POD_CLEANER_ENABLED: BoolEnv.default(true),
    POD_CLEANER_INTERVAL_MS: z.coerce.number().int().default(10000),
    POD_CLEANER_BATCH_SIZE: z.coerce.number().int().default(500),

    // Failed pod handler
    FAILED_POD_HANDLER_ENABLED: BoolEnv.default(true),
    FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS: z.coerce.number().int().default(1000),

    // Debug
    DEBUG: BoolEnv.default(false),
    SEND_RUN_DEBUG_LOGS: BoolEnv.default(false),
  })
  .superRefine((data, ctx) => {
    // A blank or whitespace-only value is as unusable as a missing one, so
    // trim before testing; otherwise `TRIGGER_METADATA_URL=" "` would slip
    // through this check and only fail later, when the URL is actually used.
    const metadataUrl = data.TRIGGER_METADATA_URL?.trim();

    if (data.COMPUTE_SNAPSHOTS_ENABLED && !metadataUrl) {
      ctx.addIssue({
        code: z.ZodIssueCode.custom,
        message: "TRIGGER_METADATA_URL is required when COMPUTE_SNAPSHOTS_ENABLED is true",
        // Attach the issue to the missing variable so the error points at the right key
        path: ["TRIGGER_METADATA_URL"],
      });
    }
  });
149163

150164
// Validated environment for the supervisor, parsed once when this module is loaded.
// `Env.parse` throws if validation fails, so importing this module fails on invalid configuration.
export const env = Env.parse(stdEnv);

0 commit comments

Comments
 (0)