From 758f7bf76c65272775c49ca9b48304f26a2d8e02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacob=20Baung=C3=A5rd=20Hansen?=
Date: Fri, 2 Jun 2023 10:34:48 +0200
Subject: [PATCH] Receive: allow unlimited head_series_limit tenants (#6406)

With this commit we now allow configuring tenants with an unlimited
active series limit by setting the limit to `0`. Prior to this commit,
setting a per-tenant limit to `0` would cause the tenant to be unable
to write any metrics at all.

This fixes: https://github.com/thanos-io/thanos/issues/6393

Signed-off-by: Jacob Baungard Hansen
---
 CHANGELOG.md                       |  1 +
 docs/components/receive.md         |  2 +-
 pkg/receive/head_series_limiter.go |  5 +++
 test/e2e/e2ethanos/services.go     |  8 ++++-
 test/e2e/receive_test.go           | 49 +++++++++++++++++++++++++-----
 5 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1b84e3f867..83f7b07f30 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -71,6 +71,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re
 - [#5548](https://github.com/thanos-io/thanos/pull/5548) Query: Add experimental support for load balancing across multiple Store endpoints.
 - [#6148](https://github.com/thanos-io/thanos/pull/6148) Query-frontend: Add `traceID` to slow query detected log line.
 - [#6153](https://github.com/thanos-io/thanos/pull/6153) Query-frontend: Add `remote_user` (from http basic auth) and `remote_addr` to slow query detected log line.
+- [#6406](https://github.com/thanos-io/thanos/pull/6406) Receive: Allow tenants to be configured with unlimited active series by setting head_series_limit to 0.
 
 ### Fixed
 
diff --git a/docs/components/receive.md b/docs/components/receive.md
index fbf08e4b63..2e2a74e1f5 100644
--- a/docs/components/receive.md
+++ b/docs/components/receive.md
@@ -241,7 +241,7 @@ Under `global`:
 - `meta_monitoring_http_client`: Optional YAML field specifying HTTP client config for meta-monitoring.
 
 Under `default` and per `tenant`:
-- `head_series_limit`: Specifies the total number of active (head) series for any tenant, across all replicas (including data replication), allowed by Thanos Receive.
+- `head_series_limit`: Specifies the total number of active (head) series for any tenant, across all replicas (including data replication), allowed by Thanos Receive. Set to 0 for unlimited.
 
 NOTE:
 - It is possible that Receive ingests more active series than the specified limit, as it relies on meta-monitoring, which may not have the latest data for current number of active series of a tenant at all times.
diff --git a/pkg/receive/head_series_limiter.go b/pkg/receive/head_series_limiter.go
index c088dc6495..f0e4512386 100644
--- a/pkg/receive/head_series_limiter.go
+++ b/pkg/receive/head_series_limiter.go
@@ -155,6 +155,11 @@ func (h *headSeriesLimit) isUnderLimit(tenant string) (bool, error) {
 		limit = h.defaultLimit
 	}
 
+	// If the tenant limit is 0, we treat it as unlimited.
+	if limit == 0 {
+		return true, nil
+	}
+
 	if v >= float64(limit) {
 		level.Error(h.logger).Log("msg", "tenant above limit", "tenant", tenant, "currentSeries", v, "limit", limit)
 		h.limitedRequests.WithLabelValues(tenant).Inc()
diff --git a/test/e2e/e2ethanos/services.go b/test/e2e/e2ethanos/services.go
index 18e111332a..2053421ef6 100644
--- a/test/e2e/e2ethanos/services.go
+++ b/test/e2e/e2ethanos/services.go
@@ -500,6 +500,7 @@ type ReceiveBuilder struct {
 	maxExemplars        int
 	ingestion           bool
 	limit               int
+	tenantsLimits       receive.TenantsWriteLimitsConfig
 	metaMonitoring      string
 	metaMonitoringQuery string
 	hashringConfigs     []receive.HashringConfig
@@ -554,9 +555,10 @@ func (r *ReceiveBuilder) WithRelabelConfigs(relabelConfigs []*relabel.Config) *R
 	return r
 }
 
-func (r *ReceiveBuilder) WithValidationEnabled(limit int, metaMonitoring string, query ...string) *ReceiveBuilder {
+func (r *ReceiveBuilder) WithValidationEnabled(limit int, metaMonitoring string, tenantsLimits receive.TenantsWriteLimitsConfig, query ...string) *ReceiveBuilder {
 	r.limit = limit
 	r.metaMonitoring = metaMonitoring
+	r.tenantsLimits = tenantsLimits
 	if len(query) > 0 {
 		r.metaMonitoringQuery = query[0]
 	}
@@ -611,6 +613,10 @@ func (r *ReceiveBuilder) Init() *e2emon.InstrumentedRunnable {
 		},
 	}
 
+	if r.tenantsLimits != nil {
+		cfg.WriteLimits.TenantsLimits = r.tenantsLimits
+	}
+
 	b, err := yaml.Marshal(cfg)
 	if err != nil {
 		return &e2emon.InstrumentedRunnable{Runnable: e2e.NewFailedRunnable(r.Name(), errors.Wrapf(err, "generate limiting file: %v", hashring))}
diff --git a/test/e2e/receive_test.go b/test/e2e/receive_test.go
index 4f729b7c32..117a4e4c1c 100644
--- a/test/e2e/receive_test.go
+++ b/test/e2e/receive_test.go
@@ -813,9 +813,13 @@ test_metric{a="2", b="2"} 1`)
 		},
 	}
 
-	i1Runnable := ingestor1.WithRouting(1, h).WithValidationEnabled(10, "http://"+meta.GetMonitoringRunnable().InternalEndpoint(e2edb.AccessPortName)).Init()
-	i2Runnable := ingestor2.WithRouting(1, h).WithValidationEnabled(10, "http://"+meta.GetMonitoringRunnable().InternalEndpoint(e2edb.AccessPortName)).Init()
-	i3Runnable := ingestor3.WithRouting(1, h).WithValidationEnabled(10, "http://"+meta.GetMonitoringRunnable().InternalEndpoint(e2edb.AccessPortName)).Init()
+	tenantsLimits := receive.TenantsWriteLimitsConfig{
+		"unlimited-tenant": receive.NewEmptyWriteLimitConfig().SetHeadSeriesLimit(0),
+	}
+
+	i1Runnable := ingestor1.WithRouting(1, h).WithValidationEnabled(10, "http://"+meta.GetMonitoringRunnable().InternalEndpoint(e2edb.AccessPortName), tenantsLimits).Init()
+	i2Runnable := ingestor2.WithRouting(1, h).WithValidationEnabled(10, "http://"+meta.GetMonitoringRunnable().InternalEndpoint(e2edb.AccessPortName), tenantsLimits).Init()
+	i3Runnable := ingestor3.WithRouting(1, h).WithValidationEnabled(10, "http://"+meta.GetMonitoringRunnable().InternalEndpoint(e2edb.AccessPortName), tenantsLimits).Init()
 
 	testutil.Ok(t, e2e.StartAndWaitReady(i1Runnable, i2Runnable, i3Runnable))
 
@@ -824,7 +828,7 @@ test_metric{a="2", b="2"} 1`)
 
 	testutil.Ok(t, querier.WaitSumMetricsWithOptions(e2emon.Equals(3), []string{"thanos_store_nodes_grpc_connections"}, e2emon.WaitMissingMetrics()))
 
-	// We run two avalanches, one tenant which exceeds the limit, and one tenant which remains under it.
+	// We run three avalanches: one for a tenant which exceeds the limit, one for a tenant which remains under it, and one for the unlimited tenant.
 	// Avalanche in this configuration, would send 5 requests each with 10 new timeseries.
 	// One request always fails due to TSDB not being ready for new tenant.
@@ -864,7 +868,26 @@ test_metric{a="2", b="2"} 1`)
 			TenantID: "under-tenant",
 		})
 
-	testutil.Ok(t, e2e.StartAndWaitReady(avalanche1, avalanche2))
+	// Avalanche in this configuration would send 5 requests each with 10 new timeseries.
+	// One request always fails due to TSDB not being ready for new tenant.
+	// So without limiting we end up with 40 timeseries and 40 samples.
+	avalanche3 := e2ethanos.NewAvalanche(e, "avalanche-3",
+		e2ethanos.AvalancheOptions{
+			MetricCount:    "10",
+			SeriesCount:    "1",
+			MetricInterval: "30",
+			SeriesInterval: "3600",
+			ValueInterval:  "3600",
+
+			RemoteURL:           e2ethanos.RemoteWriteEndpoint(ingestor1.InternalEndpoint("remote-write")),
+			RemoteWriteInterval: "30s",
+			RemoteBatchSize:     "10",
+			RemoteRequestCount:  "5",
+
+			TenantID: "unlimited-tenant",
+		})
+
+	testutil.Ok(t, e2e.StartAndWaitReady(avalanche1, avalanche2, avalanche3))
 
 	// Here, 3/5 requests are failed due to limiting, as one request fails due to TSDB readiness and we ingest one initial request.
 	// 3 limited requests belong to the exceed-tenant.
@@ -876,7 +899,7 @@ test_metric{a="2", b="2"} 1`)
 	ingestor1Name := e.Name() + "-" + ingestor1.Name()
 	// Here for exceed-tenant we go above limit by 10, which results in 0 value.
 	queryWaitAndAssert(t, ctx, meta.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string {
-		return fmt.Sprintf("sum(prometheus_tsdb_head_series{tenant=\"exceed-tenant\"}) - on() thanos_receive_head_series_limit{instance=\"%s:8080\", job=\"receive-i1\"}", ingestor1Name)
+		return fmt.Sprintf("sum(prometheus_tsdb_head_series{tenant=\"exceed-tenant\"}) - on() thanos_receive_head_series_limit{instance=\"%s:8080\", job=\"receive-i1\", tenant=\"\"}", ingestor1Name)
 	}, time.Now, promclient.QueryOptions{
 		Deduplicate: true,
 	}, model.Vector{
@@ -888,7 +911,7 @@ test_metric{a="2", b="2"} 1`)
 
 	// For under-tenant we stay at -5, as we have only pushed 5 series.
 	queryWaitAndAssert(t, ctx, meta.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string {
-		return fmt.Sprintf("sum(prometheus_tsdb_head_series{tenant=\"under-tenant\"}) - on() thanos_receive_head_series_limit{instance=\"%s:8080\", job=\"receive-i1\"}", ingestor1Name)
+		return fmt.Sprintf("sum(prometheus_tsdb_head_series{tenant=\"under-tenant\"}) - on() thanos_receive_head_series_limit{instance=\"%s:8080\", job=\"receive-i1\", tenant=\"\"}", ingestor1Name)
 	}, time.Now, promclient.QueryOptions{
 		Deduplicate: true,
 	}, model.Vector{
@@ -918,6 +941,18 @@ test_metric{a="2", b="2"} 1`)
 		},
 	})
 
+	// Query meta-monitoring solution to assert that we have ingested some number of timeseries.
+	// Avalanche sometimes misses some requests due to TSDB readiness etc. In this case, as the
+	// limit is set to `0`, we just want to make sure some timeseries are ingested.
+	queryWaitAndAssert(t, ctx, meta.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { return "sum(prometheus_tsdb_head_series{tenant=\"unlimited-tenant\"}) >=bool 10" }, time.Now, promclient.QueryOptions{
+		Deduplicate: true,
+	}, model.Vector{
+		&model.Sample{
+			Metric: model.Metric{},
+			Value:  model.SampleValue(1),
+		},
+	})
+
 	// Query meta-monitoring solution to assert that 3 requests were limited for exceed-tenant and none for under-tenant.
 	queryWaitAndAssert(t, ctx, meta.GetMonitoringRunnable().Endpoint(e2edb.AccessPortName), func() string { return "thanos_receive_head_series_limited_requests_total" }, time.Now, promclient.QueryOptions{
 		Deduplicate: true,
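
A hedged sketch of what a receive limits file could look like with this change: the tenant names and numeric values below are purely illustrative, and the exact layout of the `global`/`default`/`tenants` sections should be checked against docs/components/receive.md for the Thanos version in use. The point is only that an explicit `head_series_limit: 0` on a tenant now means "no active series limit" rather than "reject every write", because the limiter returns early before comparing the current series count against the limit.

```yaml
write:
  default:
    # Tenants without an explicit entry fall back to this limit.
    head_series_limit: 1000
  tenants:
    # Hypothetical tenant names, for illustration only.
    unlimited-tenant:
      # With this patch, 0 disables the active series limit for this tenant.
      head_series_limit: 0
    strict-tenant:
      # Still enforced via meta-monitoring as before.
      head_series_limit: 10
```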