Skip to content

add context timeout for waitInstanceState call for alertmanager and s… #5581

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
* [BUGFIX] DDBKV: When no change is detected in the ring, retry the CAS until there is a change. #5502
* [BUGFIX] Fix bug on objstore when configured to use S3 fips endpoints. #5540
* [BUGFIX] Ruler: Fix bug on ruler where a failure to load a single RuleGroup would prevent rulers from syncing all RuleGroups. #5563
* [BUGFIX] Store-Gateway and AlertManager: Add a `wait_instance_state_timeout` to WaitInstanceState context to avoid waiting forever. #5581

## 1.15.1 2023-04-26

Expand Down
4 changes: 4 additions & 0 deletions docs/blocks-storage/store-gateway.md
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,10 @@ store_gateway:
# CLI flag: -store-gateway.sharding-ring.wait-stability-max-duration
[wait_stability_max_duration: <duration> | default = 5m]

# Timeout for waiting on store-gateway to become desired state in the ring.
# CLI flag: -store-gateway.sharding-ring.wait-instance-state-timeout
[wait_instance_state_timeout: <duration> | default = 10m]

# The sleep seconds when store-gateway is shutting down. Need to be close to
# or larger than KV Store information propagation delay
# CLI flag: -store-gateway.sharding-ring.final-sleep
Expand Down
8 changes: 8 additions & 0 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,10 @@ sharding_ring:
# CLI flag: -alertmanager.sharding-ring.final-sleep
[final_sleep: <duration> | default = 0s]

# Timeout for waiting on alertmanager to become desired state in the ring.
# CLI flag: -alertmanager.sharding-ring.wait-instance-state-timeout
[wait_instance_state_timeout: <duration> | default = 10m]

# Name of network interface to read address from.
# CLI flag: -alertmanager.sharding-ring.instance-interface-names
[instance_interface_names: <list of string> | default = [eth0 en0]]
Expand Down Expand Up @@ -4867,6 +4871,10 @@ sharding_ring:
# CLI flag: -store-gateway.sharding-ring.wait-stability-max-duration
[wait_stability_max_duration: <duration> | default = 5m]

# Timeout for waiting on store-gateway to become desired state in the ring.
# CLI flag: -store-gateway.sharding-ring.wait-instance-state-timeout
[wait_instance_state_timeout: <duration> | default = 10m]

# The sleep seconds when store-gateway is shutting down. Need to be close to
# or larger than KV Store information propagation delay
# CLI flag: -store-gateway.sharding-ring.final-sleep
Expand Down
6 changes: 5 additions & 1 deletion pkg/alertmanager/alertmanager_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ type RingConfig struct {
ReplicationFactor int `yaml:"replication_factor"`
ZoneAwarenessEnabled bool `yaml:"zone_awareness_enabled"`

FinalSleep time.Duration `yaml:"final_sleep"`
FinalSleep time.Duration `yaml:"final_sleep"`
WaitInstanceStateTimeout time.Duration `yaml:"wait_instance_state_timeout"`

// Instance details
InstanceID string `yaml:"instance_id" doc:"hidden"`
Expand Down Expand Up @@ -94,6 +95,9 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
f.StringVar(&cfg.InstanceZone, rfprefix+"instance-availability-zone", "", "The availability zone where this instance is running. Required if zone-awareness is enabled.")

cfg.RingCheckPeriod = 5 * time.Second

// Timeout durations
f.DurationVar(&cfg.WaitInstanceStateTimeout, rfprefix+"wait-instance-state-timeout", 10*time.Minute, "Timeout for waiting on alertmanager to become desired state in the ring.")
}

// ToLifecyclerConfig returns a LifecyclerConfig based on the alertmanager
Expand Down
10 changes: 8 additions & 2 deletions pkg/alertmanager/multitenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,10 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) {

// We wait until the instance is in the JOINING state, once it does we know that tokens are assigned to this instance and we'll be ready to perform an initial sync of configs.
level.Info(am.logger).Log("msg", "waiting until alertmanager is JOINING in the ring")
if err = ring.WaitInstanceState(ctx, am.ring, am.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil {
ctxWithTimeout, cancel := context.WithTimeout(ctx, am.cfg.ShardingRing.WaitInstanceStateTimeout)
defer cancel()
if err = ring.WaitInstanceState(ctxWithTimeout, am.ring, am.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil {
level.Error(am.logger).Log("msg", "alertmanager failed to become JOINING in the ring", "err", err)
return err
}
level.Info(am.logger).Log("msg", "alertmanager is JOINING in the ring")
Expand Down Expand Up @@ -519,7 +522,10 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) {

// Wait until the ring client detected this instance in the ACTIVE state.
level.Info(am.logger).Log("msg", "waiting until alertmanager is ACTIVE in the ring")
if err := ring.WaitInstanceState(ctx, am.ring, am.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil {
ctxWithTimeout, cancel := context.WithTimeout(ctx, am.cfg.ShardingRing.WaitInstanceStateTimeout)
defer cancel()
if err := ring.WaitInstanceState(ctxWithTimeout, am.ring, am.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil {
level.Error(am.logger).Log("msg", "alertmanager failed to become ACTIVE in the ring", "err", err)
return err
}
level.Info(am.logger).Log("msg", "alertmanager is ACTIVE in the ring")
Expand Down
10 changes: 8 additions & 2 deletions pkg/storegateway/gateway.go
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,10 @@ func (g *StoreGateway) starting(ctx context.Context) (err error) {
// make sure that when we'll run the initial sync we already know the tokens
// assigned to this instance.
level.Info(g.logger).Log("msg", "waiting until store-gateway is JOINING in the ring")
if err := ring.WaitInstanceState(ctx, g.ring, g.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil {
ctxWithTimeout, cancel := context.WithTimeout(ctx, g.gatewayCfg.ShardingRing.WaitInstanceStateTimeout)
defer cancel()
if err := ring.WaitInstanceState(ctxWithTimeout, g.ring, g.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil {
level.Error(g.logger).Log("msg", "store-gateway failed to become JOINING in the ring", "err", err)
return err
}
level.Info(g.logger).Log("msg", "store-gateway is JOINING in the ring")
Expand Down Expand Up @@ -285,7 +288,10 @@ func (g *StoreGateway) starting(ctx context.Context) (err error) {
// make sure that when we'll run the loop it won't be detected as a ring
// topology change.
level.Info(g.logger).Log("msg", "waiting until store-gateway is ACTIVE in the ring")
if err := ring.WaitInstanceState(ctx, g.ring, g.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil {
ctxWithTimeout, cancel := context.WithTimeout(ctx, g.gatewayCfg.ShardingRing.WaitInstanceStateTimeout)
defer cancel()
if err := ring.WaitInstanceState(ctxWithTimeout, g.ring, g.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil {
level.Error(g.logger).Log("msg", "store-gateway failed to become ACTIVE in the ring", "err", err)
return err
}
level.Info(g.logger).Log("msg", "store-gateway is ACTIVE in the ring")
Expand Down
4 changes: 4 additions & 0 deletions pkg/storegateway/gateway_ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ type RingConfig struct {
// Wait ring stability.
WaitStabilityMinDuration time.Duration `yaml:"wait_stability_min_duration"`
WaitStabilityMaxDuration time.Duration `yaml:"wait_stability_max_duration"`
WaitInstanceStateTimeout time.Duration `yaml:"wait_instance_state_timeout"`

FinalSleep time.Duration `yaml:"final_sleep"`

Expand Down Expand Up @@ -123,6 +124,9 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {

// Defaults for internal settings.
cfg.RingCheckPeriod = 5 * time.Second

// Timeout durations
f.DurationVar(&cfg.WaitInstanceStateTimeout, ringFlagsPrefix+"wait-instance-state-timeout", 10*time.Minute, "Timeout for waiting on store-gateway to become desired state in the ring.")
}

func (cfg *RingConfig) ToRingConfig() ring.Config {
Expand Down