Skip to content

Commit 3f97cf4

Browse files
authored
Managed run controller revamp (#1927)
* update nypm to support text-based bun lockfiles * fix retry spans * only download debug logs if admin * add nypm changeset * pull out env override logic * use runner env gather helper * handle dev flushing failures gracefully * fix path normalization for init.ts * add logger * add execution heartbeat service * add snapshot poller service * fix poller * add changesets * create socket in constructor * enable strictPropertyInitialization * deprecate dequeue from version * start is not async * dependency injection in prep for tests * add warm start count to all controller logs * add restore count * pull out run execution logic * temp disable pre * add a controller log when starting an execution * refactor execution and squash some bugs * cleanup completed docker containers by default * execution fixes and logging improvements * don't throw after abort cleanup * poller should use private interval * rename heartbeat service file * rename HeartbeatService to IntervalService * restore old heartbeat service but deprecate it * use the new interval service everywhere * Revert "temp disable pre" This reverts commit e03f417. * add changeset * replace all run engine find uniques with find first
1 parent 0b2eb34 commit 3f97cf4

File tree

35 files changed

+2213
-1768
lines changed

35 files changed

+2213
-1768
lines changed

.changeset/tricky-houses-invite.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
"trigger.dev": patch
3+
"@trigger.dev/core": patch
4+
---
5+
6+
Managed run controller performance and reliability improvements

.configs/tsconfig.base.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
"strict": true,
1212
"alwaysStrict": true,
13-
"strictPropertyInitialization": false,
13+
"strictPropertyInitialization": true,
1414
"skipLibCheck": true,
1515
"forceConsistentCasingInFileNames": true,
1616
"noUnusedLocals": false,

apps/supervisor/src/env.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ const Env = z.object({
2727
RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(),
2828
RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().optional(),
2929
RUNNER_ADDITIONAL_ENV_VARS: AdditionalEnvVars, // optional (csv)
30+
RUNNER_DOCKER_AUTOREMOVE: BoolEnv.default(true),
3031

3132
// Dequeue settings (provider mode)
3233
TRIGGER_DEQUEUE_ENABLED: BoolEnv.default("true"),

apps/supervisor/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class ManagedSupervisor {
6666
heartbeatIntervalSeconds: env.RUNNER_HEARTBEAT_INTERVAL_SECONDS,
6767
snapshotPollIntervalSeconds: env.RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS,
6868
additionalEnvVars: env.RUNNER_ADDITIONAL_ENV_VARS,
69+
dockerAutoremove: env.RUNNER_DOCKER_AUTOREMOVE,
6970
} satisfies WorkloadManagerOptions;
7071

7172
if (this.isKubernetes) {

apps/supervisor/src/services/podCleaner.ts

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger";
22
import { K8sApi } from "../clients/kubernetes.js";
33
import { createK8sApi } from "../clients/kubernetes.js";
4-
import { HeartbeatService } from "@trigger.dev/core/v3";
4+
import { IntervalService } from "@trigger.dev/core/v3";
55
import { Counter, Gauge, Registry } from "prom-client";
66
import { register } from "../metrics.js";
77

@@ -19,7 +19,7 @@ export class PodCleaner {
1919
private readonly namespace: string;
2020

2121
private readonly batchSize: number;
22-
private readonly deletionHeartbeat: HeartbeatService;
22+
private readonly deletionInterval: IntervalService;
2323

2424
// Metrics
2525
private readonly register: Registry;
@@ -32,10 +32,10 @@ export class PodCleaner {
3232
this.namespace = opts.namespace;
3333
this.batchSize = opts.batchSize ?? 500;
3434

35-
this.deletionHeartbeat = new HeartbeatService({
35+
this.deletionInterval = new IntervalService({
3636
intervalMs: opts.intervalMs ?? 10000,
3737
leadingEdge: true,
38-
heartbeat: this.deleteCompletedPods.bind(this),
38+
onInterval: this.deleteCompletedPods.bind(this),
3939
});
4040

4141
// Initialize metrics
@@ -57,11 +57,11 @@ export class PodCleaner {
5757
}
5858

5959
async start() {
60-
this.deletionHeartbeat.start();
60+
this.deletionInterval.start();
6161
}
6262

6363
async stop() {
64-
this.deletionHeartbeat.stop();
64+
this.deletionInterval.stop();
6565
}
6666

6767
private async deleteCompletedPods() {

apps/supervisor/src/workloadManager/docker.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ export class DockerWorkloadManager implements WorkloadManager {
4343
`--name=${runnerId}`,
4444
];
4545

46+
if (this.opts.dockerAutoremove) {
47+
runArgs.push("--rm");
48+
}
49+
4650
if (this.opts.warmStartUrl) {
4751
runArgs.push(`--env=TRIGGER_WARM_START_URL=${this.opts.warmStartUrl}`);
4852
}

apps/supervisor/src/workloadManager/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ export interface WorkloadManagerOptions {
1010
heartbeatIntervalSeconds?: number;
1111
snapshotPollIntervalSeconds?: number;
1212
additionalEnvVars?: Record<string, string>;
13+
dockerAutoremove?: boolean;
1314
}
1415

1516
export interface WorkloadManager {

apps/supervisor/src/workloadServer/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,7 @@ export class WorkloadServer extends EventEmitter<WorkloadServerEvents> {
452452
logger.debug("runConnected", { ...getSocketMetadata() });
453453

454454
// If there's already a run ID set, we should "disconnect" it from this socket
455-
if (socket.data.runFriendlyId) {
455+
if (socket.data.runFriendlyId && socket.data.runFriendlyId !== friendlyId) {
456456
logger.debug("runConnected: disconnecting existing run", {
457457
...getSocketMetadata(),
458458
newRunId: friendlyId,

apps/webapp/app/v3/authenticatedSocketConnection.server.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import {
22
clientWebsocketMessages,
3-
HeartbeatService,
3+
IntervalService,
44
serverWebsocketMessages,
55
} from "@trigger.dev/core/v3";
66
import { ZodMessageHandler, ZodMessageSender } from "@trigger.dev/core/v3/zodMessageHandler";
@@ -19,7 +19,7 @@ export class AuthenticatedSocketConnection {
1919
private _sender: ZodMessageSender<typeof serverWebsocketMessages>;
2020
private _consumer: DevQueueConsumer;
2121
private _messageHandler: ZodMessageHandler<typeof clientWebsocketMessages>;
22-
private _pingService: HeartbeatService;
22+
private _pingService: IntervalService;
2323

2424
constructor(
2525
public ws: WebSocket,
@@ -75,8 +75,8 @@ export class AuthenticatedSocketConnection {
7575
// });
7676
});
7777

78-
this._pingService = new HeartbeatService({
79-
heartbeat: async () => {
78+
this._pingService = new IntervalService({
79+
onInterval: async () => {
8080
if (ws.readyState !== WebSocket.OPEN) {
8181
logger.debug("[AuthenticatedSocketConnection] Websocket not open, skipping ping");
8282
return;

internal-packages/run-engine/src/engine/db/worker.ts

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ export async function getWorkerDeploymentFromWorker(
193193
prisma: PrismaClientOrTransaction,
194194
workerId: string
195195
): Promise<WorkerDeploymentWithWorkerTasks | null> {
196-
const worker = await prisma.backgroundWorker.findUnique({
196+
const worker = await prisma.backgroundWorker.findFirst({
197197
where: {
198198
id: workerId,
199199
},
@@ -264,12 +264,10 @@ export async function getManagedWorkerFromCurrentlyPromotedDeployment(
264264
prisma: PrismaClientOrTransaction,
265265
environmentId: string
266266
): Promise<WorkerDeploymentWithWorkerTasks | null> {
267-
const promotion = await prisma.workerDeploymentPromotion.findUnique({
267+
const promotion = await prisma.workerDeploymentPromotion.findFirst({
268268
where: {
269-
environmentId_label: {
270-
environmentId,
271-
label: CURRENT_DEPLOYMENT_LABEL,
272-
},
269+
environmentId,
270+
label: CURRENT_DEPLOYMENT_LABEL,
273271
},
274272
include: {
275273
deployment: {

0 commit comments

Comments
 (0)