Nomad autoscaler deployment loses canary status and gets stuck #20295
Description
Nomad version
1.7.3
Operating system and Environment details
Ubuntu 22.04.2 LTS
Issue
From time to time auto scaling results in a deployment that partially fails. The deployment never completes and gets hung in a state that seems corrupt.
In the example below, the autoscaler is dropping an instance count from 5 to 2. We came across this instance an hour after it started.
Given that the deployment is auto promoting and auto reverting, and that there is a tight progress deadline of 31 seconds, I don't understand why the deployment is still running and stuck hours later. The deployment starts at 08:54:13 but with events and deadlines smeared across about 4 minutes also doesn't make sense to me. I'm not sure what it was doing.
The UI overview tab shows a summary that does not include canary information:
The UI deployment tab does show canary information:
The chart showing auto scaling behavior:
The change recorded for the deployment:
A request to the API to get the deployment returns:
{
"ID": "fdf207df-adb5-ec23-52ce-fbada3fe495a",
"Namespace": "default",
"JobID": "invoice",
"JobVersion": 12847,
"JobModifyIndex": 23689764,
"JobSpecModifyIndex": 23689764,
"JobCreateIndex": 6028898,
"IsMultiregion": false,
"TaskGroups": {
"app": {
"AutoRevert": true,
"AutoPromote": true,
"ProgressDeadline": 31000000000,
"RequireProgressBy": "2024-04-04T08:58:57.851083477-05:00",
"Promoted": false,
"PlacedCanaries": [
"638d875b-b842-10cb-34b2-3439e9753929"
],
"DesiredCanaries": 1,
"DesiredTotal": 2,
"PlacedAllocs": 3,
"HealthyAllocs": 2,
"UnhealthyAllocs": 1
}
},
"Status": "running",
"StatusDescription": "Deployment is running pending automatic promotion",
"EvalPriority": 50,
"CreateIndex": 23689767,
"ModifyIndex": 23693324
}
A request to the API to get the canary allocation returns:
{
"ID": "638d875b-b842-10cb-34b2-3439e9753929",
"EvalID": "41fff724-6644-dfb8-5fce-43fc3b04c4c2",
"Name": "invoice.app[0]",
"Namespace": "default",
"NodeID": "7098fdeb-2541-4122-eee4-ae393c4b5d38",
"NodeName": "p-mwsv-nc04",
"JobID": "invoice",
"JobType": "service",
"JobVersion": 12847,
"TaskGroup": "app",
"AllocatedResources": null,
"DesiredStatus": "run",
"DesiredDescription": "",
"ClientStatus": "running",
"ClientDescription": "Tasks are running",
"DesiredTransition": {
"Migrate": null,
"Reschedule": true,
"ForceReschedule": null,
"NoShutdownDelay": null
},
"TaskStates": {
"sidecar-check": {
"State": "dead",
"Failed": false,
"Restarts": 0,
"LastRestart": null,
"StartedAt": "2024-04-04T13:55:20.546726093Z",
"FinishedAt": "2024-04-04T13:55:53.799774611Z",
"Events": [
{
"Type": "Received",
"Time": 1712238853931674722,
"Message": "",
"DisplayMessage": "Task received by client",
"Details": {},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
},
{
"Type": "Task Setup",
"Time": 1712238920329798253,
"Message": "Building Task Directory",
"DisplayMessage": "Building Task Directory",
"Details": {
"message": "Building Task Directory"
},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
},
{
"Type": "Started",
"Time": 1712238920546681038,
"Message": "",
"DisplayMessage": "Task started by client",
"Details": {},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
},
{
"Type": "Terminated",
"Time": 1712238953778944828,
"Message": "",
"DisplayMessage": "Exit Code: 0",
"Details": {
"signal": "0",
"oom_killed": "false",
"exit_code": "0"
},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
}
],
"TaskHandle": null
},
"service-invoice": {
"State": "running",
"Failed": false,
"Restarts": 0,
"LastRestart": null,
"StartedAt": "2024-04-04T13:55:54.06487478Z",
"FinishedAt": null,
"Events": [
{
"Type": "Received",
"Time": 1712238853932189345,
"Message": "",
"DisplayMessage": "Task received by client",
"Details": {},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
},
{
"Type": "Task Setup",
"Time": 1712238953802419893,
"Message": "Building Task Directory",
"DisplayMessage": "Building Task Directory",
"Details": {
"message": "Building Task Directory"
},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
},
{
"Type": "Started",
"Time": 1712238954064811992,
"Message": "",
"DisplayMessage": "Task started by client",
"Details": {},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
}
],
"TaskHandle": null
},
"connect-proxy-invoice": {
"State": "running",
"Failed": false,
"Restarts": 0,
"LastRestart": null,
"StartedAt": "2024-04-04T13:55:53.086854811Z",
"FinishedAt": null,
"Events": [
{
"Type": "Received",
"Time": 1712238853932586579,
"Message": "",
"DisplayMessage": "Task received by client",
"Details": {},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
},
{
"Type": "Task Setup",
"Time": 1712238920328470417,
"Message": "Building Task Directory",
"DisplayMessage": "Building Task Directory",
"Details": {
"message": "Building Task Directory"
},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
},
{
"Type": "Started",
"Time": 1712238953086798075,
"Message": "",
"DisplayMessage": "Task started by client",
"Details": {},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
}
],
"TaskHandle": null
}
},
"DeploymentStatus": {
"Healthy": false,
"Timestamp": "2024-04-04T08:54:43.935741087-05:00",
"Canary": true,
"ModifyIndex": 23690280
},
"FollowupEvalID": "",
"NextAllocation": "",
"RescheduleTracker": null,
"PreemptedAllocations": null,
"PreemptedByAllocation": "",
"CreateIndex": 23689767,
"ModifyIndex": 23691359,
"CreateTime": 1712238853910271848,
"ModifyTime": 1712238954658221350
}
Consul shows all 3 allocations are healthy.
Reproduction steps
We have 100s of successful auto scaling deployments every day across many services. I haven't been able to recreate on demand.
Expected Result
- Ignoring the failure since we are scaling down.
- I'm not sure why a canary was involved in a scaling down operation. Based on spot checking, it seems like this usually isn't the case?
- After the progress deadline has been passed, revert the deployment and mark it as failed.
Actual Result
- A deployment that is stuck as running indefinitely blocking out future autoscaler requests and interfering with agent drains.
Nomad Client logs
Consul logs for canary alloc:
Apr 04 08:55:20 p-mwsv-nc04 consul[1051]: {"@level":"error","@message":"Request error","@module":"agent.http","@timestamp":"2024-04-04T08:55:20.485001-05:00","error":"unknown service ID: _nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy","from":"@","method":"GET","url":"/v1/agent/service/_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy"}
Apr 04 08:55:22 p-mwsv-nc04 consul[1051]: {"@level":"error","@message":"Request error","@module":"agent.http","@timestamp":"2024-04-04T08:55:22.019648-05:00","error":"unknown service ID: _nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy","from":"@","method":"GET","url":"/v1/agent/service/_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy"}
Apr 04 08:55:24 p-mwsv-nc04 consul[1051]: {"@level":"error","@message":"Request error","@module":"agent.http","@timestamp":"2024-04-04T08:55:24.198084-05:00","error":"unknown service ID: _nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy","from":"@","method":"GET","url":"/v1/agent/service/_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy"}
Apr 04 08:55:28 p-mwsv-nc04 consul[1051]: {"@level":"error","@message":"Request error","@module":"agent.http","@timestamp":"2024-04-04T08:55:28.583497-05:00","error":"unknown service ID: _nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy","from":"@","method":"GET","url":"/v1/agent/service/_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy"}
Apr 04 08:55:36 p-mwsv-nc04 consul[1051]: {"@level":"error","@message":"Request error","@module":"agent.http","@timestamp":"2024-04-04T08:55:36.674509-05:00","error":"unknown service ID: _nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy","from":"@","method":"GET","url":"/v1/agent/service/_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy"}
Apr 04 08:55:40 p-mwsv-nc04 consul[1051]: {"@level":"info","@message":"Synced service","@module":"agent","@timestamp":"2024-04-04T08:55:40.296370-05:00","service":"_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000"}
Apr 04 08:55:40 p-mwsv-nc04 consul[1051]: {"@level":"info","@message":"Synced service","@module":"agent","@timestamp":"2024-04-04T08:55:40.309195-05:00","service":"_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy"}
Apr 04 08:55:40 p-mwsv-nc04 consul[1051]: {"@level":"info","@message":"Synced check","@module":"agent","@timestamp":"2024-04-04T08:55:40.316722-05:00","check":"service:_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy:1"}
Apr 04 08:55:41 p-mwsv-nc04 consul[1051]: {"@level":"info","@message":"Synced check","@module":"agent","@timestamp":"2024-04-04T08:55:41.695784-05:00","check":"service:_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy:1"}
Apr 04 08:55:44 p-mwsv-nc04 consul[1051]: {"@level":"warn","@message":"Check TCP connection failed","@module":"agent","@timestamp":"2024-04-04T08:55:44.736246-05:00","check":"service:_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy:2","error":"dial tcp 10.20.50.159:22623: connect: connection refused"}
Apr 04 08:55:44 p-mwsv-nc04 consul[1051]: {"@level":"warn","@message":"Check is now critical","@module":"agent","@timestamp":"2024-04-04T08:55:44.736293-05:00","check":"service:_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy:2"}
Apr 04 08:55:44 p-mwsv-nc04 consul[1051]: {"@level":"info","@message":"Synced check","@module":"agent","@timestamp":"2024-04-04T08:55:44.790898-05:00","check":"service:_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy:2"}
Apr 04 08:55:55 p-mwsv-nc04 consul[1051]: {"@level":"info","@message":"Synced check","@module":"agent","@timestamp":"2024-04-04T08:55:55.348619-05:00","check":"service:_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy:1"}
Apr 04 08:55:55 p-mwsv-nc04 consul[1051]: {"@level":"info","@message":"Synced check","@module":"agent","@timestamp":"2024-04-04T08:55:55.506208-05:00","check":"service:_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy:1"}
Apr 04 08:55:57 p-mwsv-nc04 consul[1051]: {"@level":"info","@message":"Synced check","@module":"agent","@timestamp":"2024-04-04T08:55:57.284271-05:00","check":"service:_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy:2"}
Apr 04 08:59:25 p-mwsv-nc04 consul[1051]: {"@level":"info","@message":"Synced service","@module":"agent","@timestamp":"2024-04-04T08:59:25.440198-05:00","service":"_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy"}
Apr 04 09:01:22 p-mwsv-nc04 consul[1051]: {"@level":"info","@message":"Synced service","@module":"agent","@timestamp":"2024-04-04T09:01:22.973376-05:00","service":"_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy"}
Apr 04 09:03:23 p-mwsv-nc04 consul[1051]: {"@level":"info","@message":"Synced service","@module":"agent","@timestamp":"2024-04-04T09:03:23.474654-05:00","service":"_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy"}
Apr 04 09:06:15 p-mwsv-nc04 consul[1051]: {"@level":"info","@message":"Synced service","@module":"agent","@timestamp":"2024-04-04T09:06:15.079710-05:00","service":"_nomad-task-638d875b-b842-10cb-34b2-3439e9753929-group-app-invoice-8000-sidecar-proxy"}
...
Nomad logs don't turn up anything interesting on that alloc.
Metadata
Assignees
Type
Projects
Status
Needs Roadmapping