Skip to content

Commit 7a76e51

Browse files
authored
add configurable final-sleep time for basic lifecycler (#5517)
1 parent b91a24d commit 7a76e51

File tree

10 files changed

+43
-0
lines changed

10 files changed

+43
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
* [ENHANCEMENT] DDBKV: Change metric name from dynamodb_kv_read_capacity_total to dynamodb_kv_consumed_capacity_total and include Delete, Put, Batch dimension. #5481
5252
* [ENHANCEMENT] Compactor: allow unregisteronshutdown to be configurable. #5503
5353
* [ENHANCEMENT] Store Gateway: add metric `cortex_bucket_store_chunk_refetches_total` for number of chunk refetches. #5532
54+
* [ENHANCEMENT] BasicLifeCycler: allow final-sleep during shutdown #5517
5455
* [BUGFIX] Ruler: Validate if rule group can be safely converted back to rule group yaml from protobuf message #5265
5556
* [BUGFIX] Querier: Convert gRPC `ResourceExhausted` status code from store gateway to 422 limit error. #5286
5657
* [BUGFIX] Alertmanager: Route web-ui requests to the alertmanager distributor when sharding is enabled. #5293

docs/blocks-storage/store-gateway.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,11 @@ store_gateway:
309309
# CLI flag: -store-gateway.sharding-ring.wait-stability-max-duration
310310
[wait_stability_max_duration: <duration> | default = 5m]
311311

312+
# The sleep seconds when store-gateway is shutting down. Need to be close to
313+
# or larger than KV Store information propagation delay
314+
# CLI flag: -store-gateway.sharding-ring.final-sleep
315+
[final_sleep: <duration> | default = 0s]
316+
312317
# Name of network interface to read address from.
313318
# CLI flag: -store-gateway.sharding-ring.instance-interface-names
314319
[instance_interface_names: <list of string> | default = [eth0 en0]]

docs/configuration/config-file-reference.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,11 @@ sharding_ring:
369369
# CLI flag: -alertmanager.sharding-ring.zone-awareness-enabled
370370
[zone_awareness_enabled: <boolean> | default = false]
371371
372+
# The sleep seconds when alertmanager is shutting down. Need to be close to or
373+
# larger than KV Store information propagation delay
374+
# CLI flag: -alertmanager.sharding-ring.final-sleep
375+
[final_sleep: <duration> | default = 0s]
376+
372377
# Name of network interface to read address from.
373378
# CLI flag: -alertmanager.sharding-ring.instance-interface-names
374379
[instance_interface_names: <list of string> | default = [eth0 en0]]
@@ -3945,6 +3950,11 @@ ring:
39453950
# CLI flag: -ruler.ring.num-tokens
39463951
[num_tokens: <int> | default = 128]
39473952
3953+
# The sleep seconds when ruler is shutting down. Need to be close to or larger
3954+
# than KV Store information propagation delay
3955+
# CLI flag: -ruler.ring.final-sleep
3956+
[final_sleep: <duration> | default = 0s]
3957+
39483958
# Period with which to attempt to flush rule groups.
39493959
# CLI flag: -ruler.flush-period
39503960
[flush_period: <duration> | default = 1m]
@@ -4836,6 +4846,11 @@ sharding_ring:
48364846
# CLI flag: -store-gateway.sharding-ring.wait-stability-max-duration
48374847
[wait_stability_max_duration: <duration> | default = 5m]
48384848
4849+
# The sleep seconds when store-gateway is shutting down. Need to be close to
4850+
# or larger than KV Store information propagation delay
4851+
# CLI flag: -store-gateway.sharding-ring.final-sleep
4852+
[final_sleep: <duration> | default = 0s]
4853+
48394854
# Name of network interface to read address from.
48404855
# CLI flag: -store-gateway.sharding-ring.instance-interface-names
48414856
[instance_interface_names: <list of string> | default = [eth0 en0]]

docs/configuration/v1-guarantees.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,3 +108,7 @@ Currently experimental features are:
108108
- Store Gateway Zone Stable Shuffle Sharding
109109
- `-store-gateway.sharding-ring.zone-stable-shuffle-sharding` CLI flag
110110
- `zone_stable_shuffle_sharding` (boolean) field in config file
111+
- Basic Lifecycler (Store Gateway, Alertmanager, Ruler) Final Sleep on shutdown, which tells the pod to wait before shutting down, allowing time for ring changes to propagate.
112+
- `-ruler.ring.final-sleep` (duration) CLI flag
113+
- `-store-gateway.sharding-ring.final-sleep` (duration) CLI flag
114+
- `-alertmanager.sharding-ring.final-sleep` (duration) CLI flag

pkg/alertmanager/alertmanager_ring.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ type RingConfig struct {
4949
ReplicationFactor int `yaml:"replication_factor"`
5050
ZoneAwarenessEnabled bool `yaml:"zone_awareness_enabled"`
5151

52+
FinalSleep time.Duration `yaml:"final_sleep"`
53+
5254
// Instance details
5355
InstanceID string `yaml:"instance_id" doc:"hidden"`
5456
InstanceInterfaceNames []string `yaml:"instance_interface_names"`
@@ -79,6 +81,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
7981
cfg.KVStore.RegisterFlagsWithPrefix(rfprefix, "alertmanagers/", f)
8082
f.DurationVar(&cfg.HeartbeatPeriod, rfprefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.")
8183
f.DurationVar(&cfg.HeartbeatTimeout, rfprefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which alertmanagers are considered unhealthy within the ring. 0 = never (timeout disabled).")
84+
f.DurationVar(&cfg.FinalSleep, rfprefix+"final-sleep", 0*time.Second, "The sleep seconds when alertmanager is shutting down. Need to be close to or larger than KV Store information propagation delay")
8285
f.IntVar(&cfg.ReplicationFactor, rfprefix+"replication-factor", 3, "The replication factor to use when sharding the alertmanager.")
8386
f.BoolVar(&cfg.ZoneAwarenessEnabled, rfprefix+"zone-awareness-enabled", false, "True to enable zone-awareness and replicate alerts across different availability zones.")
8487

@@ -110,6 +113,7 @@ func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecycl
110113
TokensObservePeriod: 0,
111114
Zone: cfg.InstanceZone,
112115
NumTokens: RingNumTokens,
116+
FinalSleep: cfg.FinalSleep,
113117
}, nil
114118
}
115119

pkg/alertmanager/multitenant_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ func mockAlertmanagerConfig(t *testing.T) *MultitenantAlertmanagerConfig {
8282
cfg.ShardingRing.InstanceID = "test"
8383
cfg.ShardingRing.InstanceAddr = "127.0.0.1"
8484
cfg.PollInterval = time.Minute
85+
cfg.ShardingRing.FinalSleep = 0
8586

8687
return cfg
8788
}

pkg/ring/basic_lifecycler.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@ type BasicLifecyclerConfig struct {
5555
// If true lifecycler doesn't unregister instance from the ring when it's stopping. Default value is false,
5656
// which means unregistering.
5757
KeepInstanceInTheRingOnShutdown bool
58+
59+
FinalSleep time.Duration
5860
}
5961

6062
// BasicLifecycler is a basic ring lifecycler which allows to hook custom
@@ -251,6 +253,7 @@ heartbeatLoop:
251253
level.Info(l.logger).Log("msg", "instance removed from the ring", "ring", l.ringName)
252254
}
253255

256+
time.Sleep(l.cfg.FinalSleep)
254257
return nil
255258
}
256259

pkg/ruler/ruler_ring.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ type RingConfig struct {
4242
InstanceAddr string `yaml:"instance_addr" doc:"hidden"`
4343
NumTokens int `yaml:"num_tokens"`
4444

45+
FinalSleep time.Duration `yaml:"final_sleep"`
46+
4547
// Injected internally
4648
ListenPort int `yaml:"-"`
4749

@@ -60,6 +62,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
6062
cfg.KVStore.RegisterFlagsWithPrefix("ruler.ring.", "rulers/", f)
6163
f.DurationVar(&cfg.HeartbeatPeriod, "ruler.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring. 0 = disabled.")
6264
f.DurationVar(&cfg.HeartbeatTimeout, "ruler.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which rulers are considered unhealthy within the ring. 0 = never (timeout disabled).")
65+
f.DurationVar(&cfg.FinalSleep, "ruler.ring.final-sleep", 0*time.Second, "The sleep seconds when ruler is shutting down. Need to be close to or larger than KV Store information propagation delay")
6366

6467
// Instance flags
6568
cfg.InstanceInterfaceNames = []string{"eth0", "en0"}
@@ -86,6 +89,7 @@ func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecycl
8689
HeartbeatPeriod: cfg.HeartbeatPeriod,
8790
TokensObservePeriod: 0,
8891
NumTokens: cfg.NumTokens,
92+
FinalSleep: cfg.FinalSleep,
8993
}, nil
9094
}
9195

pkg/ruler/ruler_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ func defaultRulerConfig(t testing.TB) Config {
7474
cfg.Ring.ListenPort = 0
7575
cfg.Ring.InstanceAddr = "localhost"
7676
cfg.Ring.InstanceID = "localhost"
77+
cfg.Ring.FinalSleep = 0
7778
cfg.EnableQueryStats = false
7879

7980
return cfg

pkg/storegateway/gateway_ring.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ type RingConfig struct {
7373
WaitStabilityMinDuration time.Duration `yaml:"wait_stability_min_duration"`
7474
WaitStabilityMaxDuration time.Duration `yaml:"wait_stability_max_duration"`
7575

76+
FinalSleep time.Duration `yaml:"final_sleep"`
77+
7678
// Instance details
7779
InstanceID string `yaml:"instance_id" doc:"hidden"`
7880
InstanceInterfaceNames []string `yaml:"instance_interface_names"`
@@ -109,6 +111,8 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
109111
f.DurationVar(&cfg.WaitStabilityMinDuration, ringFlagsPrefix+"wait-stability-min-duration", time.Minute, "Minimum time to wait for ring stability at startup. 0 to disable.")
110112
f.DurationVar(&cfg.WaitStabilityMaxDuration, ringFlagsPrefix+"wait-stability-max-duration", 5*time.Minute, "Maximum time to wait for ring stability at startup. If the store-gateway ring keeps changing after this period of time, the store-gateway will start anyway.")
111113

114+
f.DurationVar(&cfg.FinalSleep, ringFlagsPrefix+"final-sleep", 0*time.Second, "The sleep seconds when store-gateway is shutting down. Need to be close to or larger than KV Store information propagation delay")
115+
112116
// Instance flags
113117
cfg.InstanceInterfaceNames = []string{"eth0", "en0"}
114118
f.Var((*flagext.StringSlice)(&cfg.InstanceInterfaceNames), ringFlagsPrefix+"instance-interface-names", "Name of network interface to read address from.")
@@ -150,5 +154,6 @@ func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecycl
150154
TokensObservePeriod: 0,
151155
NumTokens: RingNumTokens,
152156
KeepInstanceInTheRingOnShutdown: cfg.KeepInstanceInTheRingOnShutdown,
157+
FinalSleep: cfg.FinalSleep,
153158
}, nil
154159
}

0 commit comments

Comments
 (0)