Skip to content

Set default conn timeout of 5 sec for storeGateway and alertmanager clients #6603

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Feb 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Changelog

## master / unreleased
* [CHANGE] StoreGateway/Alertmanager: Add default 5s connection timeout on client. #6603
* [FEATURE] Query Frontend: Add dynamic interval size for query splitting. This is enabled by configuring experimental flags `querier.max-shards-per-query` and/or `querier.max-fetched-data-duration-per-query`. The split interval size is dynamically increased to maintain a number of shards and total duration fetched below the configured values. #6458
* [FEATURE] Querier/Ruler: Add `query_partial_data` and `rules_partial_data` limits to allow queries/rules to be evaluated with data from a single zone, if other zones are not available. #6526
* [FEATURE] Update prometheus alertmanager version to v0.28.0 and add new integration msteamsv2, jira, and rocketchat. #6590
Expand Down
5 changes: 5 additions & 0 deletions docs/blocks-storage/querier.md
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,11 @@ querier:
# CLI flag: -querier.store-gateway-client.healthcheck.timeout
[timeout: <duration> | default = 1s]

# The maximum amount of time to establish a connection. A value of 0 means
# using default gRPC client connect timeout 20s.
# CLI flag: -querier.store-gateway-client.connect-timeout
[connect_timeout: <duration> | default = 5s]

# If enabled, store gateway query stats will be logged using `info` log level.
# CLI flag: -querier.store-gateway-query-stats-enabled
[store_gateway_query_stats: <boolean> | default = true]
Expand Down
10 changes: 10 additions & 0 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,11 @@ alertmanager_client:
# CLI flag: -alertmanager.alertmanager-client.grpc-max-send-msg-size
[max_send_msg_size: <int> | default = 4194304]

# The maximum amount of time to establish a connection. A value of 0 means
# using default gRPC client connect timeout 20s.
# CLI flag: -alertmanager.alertmanager-client.connect-timeout
[connect_timeout: <duration> | default = 5s]

# The interval between persisting the current alertmanager state (notification
# log and silences) to object storage. This is only used when sharding is
# enabled. This state is read when all replicas for a shard can not be
Expand Down Expand Up @@ -4100,6 +4105,11 @@ store_gateway_client:
# CLI flag: -querier.store-gateway-client.healthcheck.timeout
[timeout: <duration> | default = 1s]

# The maximum amount of time to establish a connection. A value of 0 means
# using default gRPC client connect timeout 20s.
# CLI flag: -querier.store-gateway-client.connect-timeout
[connect_timeout: <duration> | default = 5s]

# If enabled, store gateway query stats will be logged using `info` log level.
# CLI flag: -querier.store-gateway-query-stats-enabled
[store_gateway_query_stats: <boolean> | default = true]
Expand Down
3 changes: 3 additions & 0 deletions pkg/alertmanager/alertmanager_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ type ClientConfig struct {
GRPCCompression string `yaml:"grpc_compression"`
MaxRecvMsgSize int `yaml:"max_recv_msg_size"`
MaxSendMsgSize int `yaml:"max_send_msg_size"`
ConnectTimeout time.Duration `yaml:"connect_timeout"`
}

// RegisterFlagsWithPrefix registers flags with prefix.
Expand All @@ -50,6 +51,7 @@ func (cfg *ClientConfig) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet)
cfg.TLS.RegisterFlagsWithPrefix(prefix, f)
f.IntVar(&cfg.MaxRecvMsgSize, prefix+".grpc-max-recv-msg-size", 16*1024*1024, "gRPC client max receive message size (bytes).")
f.IntVar(&cfg.MaxSendMsgSize, prefix+".grpc-max-send-msg-size", 4*1024*1024, "gRPC client max send message size (bytes).")
f.DurationVar(&cfg.ConnectTimeout, prefix+".connect-timeout", 5*time.Second, "The maximum amount of time to establish a connection. A value of 0 means using default gRPC client connect timeout 5s.")
}

type alertmanagerClientsPool struct {
Expand All @@ -67,6 +69,7 @@ func newAlertmanagerClientsPool(discovery client.PoolServiceDiscovery, amClientC
BackoffOnRatelimits: false,
TLSEnabled: amClientCfg.TLSEnabled,
TLS: amClientCfg.TLS,
ConnectTimeout: amClientCfg.ConnectTimeout,
}

requestDuration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
Expand Down
3 changes: 3 additions & 0 deletions pkg/querier/store_gateway_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ func newStoreGatewayClientPool(discovery client.PoolServiceDiscovery, clientConf
BackoffOnRatelimits: false,
TLSEnabled: clientConfig.TLSEnabled,
TLS: clientConfig.TLS,
ConnectTimeout: clientConfig.ConnectTimeout,
},
HealthCheckConfig: clientConfig.HealthCheckConfig,
}
Expand All @@ -103,11 +104,13 @@ type ClientConfig struct {
TLS tls.ClientConfig `yaml:",inline"`
GRPCCompression string `yaml:"grpc_compression"`
HealthCheckConfig grpcclient.HealthCheckConfig `yaml:"healthcheck_config" doc:"description=EXPERIMENTAL: If enabled, gRPC clients perform health checks for each target and fail the request if the target is marked as unhealthy."`
ConnectTimeout time.Duration `yaml:"connect_timeout"`
}

// RegisterFlagsWithPrefix registers the store-gateway client flags on f.
// Every flag name is built by prepending prefix (for example
// prefix+".connect-timeout"). The nested TLS and health-check configs
// register their own flags under the same prefix.
func (cfg *ClientConfig) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
	f.BoolVar(&cfg.TLSEnabled, prefix+".tls-enabled", cfg.TLSEnabled, "Enable TLS for gRPC client connecting to store-gateway.")
	f.StringVar(&cfg.GRPCCompression, prefix+".grpc-compression", "", "Use compression when sending messages. Supported values are: 'gzip', 'snappy' and '' (disable compression)")
	// The flag defaults to 5s. The help text describes the fallback used when
	// the value is 0: gRPC's own default minimum connect timeout, which the
	// gRPC connection backoff spec sets to 20s (not 5s).
	// NOTE(review): confirm the shared gRPC client config skips the connect
	// params when the timeout is 0, so the gRPC default really applies.
	f.DurationVar(&cfg.ConnectTimeout, prefix+".connect-timeout", 5*time.Second, "The maximum amount of time to establish a connection. A value of 0 means using default gRPC client connect timeout 20s.")
	cfg.TLS.RegisterFlagsWithPrefix(prefix, f)
	cfg.HealthCheckConfig.RegisterFlagsWithPrefix(prefix, f)
}
Loading