Skip to content

Commit

Permalink
fix(sidecar): Allow sidecar to not crash on bucket initialization tha…
Browse files Browse the repository at this point in the history
…nos-io#7585

This commit allows sidecar to continue to serve prometheus read path if objstore is not available at startup.
Bucket creation will be attempted again on next upload.

This commit brings a new metric to alert in case of bucket initialization crash: thanos_sidecar_shipper_up

Signed-off-by: Amaury Decrême <amaury.decreme@gmail.com>
  • Loading branch information
amaury-d committed Sep 10, 2024
1 parent 27412d2 commit 10349bf
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 22 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.envrc
.bin

/prometheus
/thanos
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re

### Fixed

- [#7585](https://github.com/thanos-io/thanos/pull/7511) fix(sidecar): Allow sidecar to not crash on startup if objstore is not available
- [#7511](https://github.com/thanos-io/thanos/pull/7511) Query Frontend: fix doubled gzip compression for response body.
- [#7592](https://github.com/thanos-io/thanos/pull/7592) Ruler: Only increment `thanos_rule_evaluation_with_warnings_total` metric for non PromQL warnings.
- [#7614](https://github.com/thanos-io/thanos/pull/7614) *: fix debug log formatting.
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<p align="center"><img src="docs/img/Thanos-logo_fullmedium.png" alt="Thanos Logo"></p>

[![Latest Release](https://img.shields.io/github/release/thanos-io/thanos.svg?style=flat-square)](https://github.com/thanos-io/thanos/releases/latest) [![Go Report Card](https://goreportcard.com/badge/github.com/thanos-io/thanos)](https://goreportcard.com/report/github.com/thanos-io/thanos) [![Go Code reference](https://img.shields.io/badge/code%20reference-go.dev-darkblue.svg)](https://pkg.go.dev/github.com/thanos-io/thanos?tab=subdirectories) [![Slack](https://img.shields.io/badge/join%20slack-%23thanos-brightgreen.svg)](https://slack.cncf.io/) [![Netlify Status](https://api.netlify.com/api/v1/badges/664a5091-934c-4b0e-a7b6-bc12f822a590/deploy-status)](https://app.netlify.com/sites/thanos-io/deploys) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/3048/badge)](https://bestpractices.coreinfrastructure.org/projects/3048)
[![Latest Release](https://img.shields.io/github/release/thanos-io/thanos.svg?style=flat-square)](https://github.com/thanos-io/thanos/releases/latest) [![Go Report Card](https://goreportcard.com/badge/github.com/thanos-io/thanos)](https://goreportcard.com/report/github.com/thanos-io/thanos) [![Go Code reference](https://img.shields.io/badge/code%20reference-go.dev-darkblue.svg)](https://pkg.go.dev/github.com/thanos-io/thanos?tab=subdirectories) [![Slack](https://img.shields.io/badge/join%20slack-%23thanos-brightgreen.svg)](https://slack.cncf.io/) [![Netlify Status](https://api.netlify.com/api/v1/badges/664a5091-934c-4b0e-a7b6-bc12f822a590/deploy-status)](https://app.netlify.com/sites/thanos-io/deploys) [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/3048/badge)](https://www.bestpractices.dev/en/projects/3048)

[![CI](https://github.com/thanos-io/thanos/workflows/CI/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3ACI) [![CI](https://circleci.com/gh/thanos-io/thanos.svg?style=svg)](https://circleci.com/gh/thanos-io/thanos) [![go](https://github.com/thanos-io/thanos/workflows/go/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3Ago) [![react](https://github.com/thanos-io/thanos/workflows/react/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3Areact) [![docs](https://github.com/thanos-io/thanos/workflows/docs/badge.svg)](https://github.com/thanos-io/thanos/actions?query=workflow%3Adocs) [![Gitpod ready-to-code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/thanos-io/thanos) [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=109162639)

Expand Down
50 changes: 29 additions & 21 deletions cmd/thanos/sidecar.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ func runSidecar(

limitMinTime: conf.limitMinTime,
client: promclient.NewWithTracingClient(logger, httpClient, "thanos-sidecar"),
shipper: nil,
}

confContentYaml, err := conf.objStore.Content()
Expand Down Expand Up @@ -365,29 +366,18 @@ func runSidecar(
})
}
if uploads {
// The background shipper continuously scans the data directory and uploads
// new blocks to Google Cloud Storage or an S3-compatible storage service.
bkt, err := client.NewBucket(logger, confContentYaml, component.Sidecar.String())
if err != nil {
return err
}
bkt = objstoretracing.WrapWithTraces(objstore.WrapWithMetrics(bkt, extprom.WrapRegistererWithPrefix("thanos_", reg), bkt.Name()))

// Ensure we close up everything properly.
defer func() {
if err != nil {
runutil.CloseWithLogOnErr(logger, bkt, "bucket client")
}
}()
shipperUp := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
Name: "thanos_sidecar_shipper_up",
Help: "Boolean indicator whether the sidecar shipper is running.",
})
shipperUp.Set(0)

if err := promclient.IsWALDirAccessible(conf.tsdb.path); err != nil {
level.Error(logger).Log("err", err)
}

ctx, cancel := context.WithCancel(context.Background())
g.Add(func() error {
defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client")

promReadyTimeout := conf.prometheus.readyTimeout
extLabelsCtx, cancel := context.WithTimeout(ctx, promReadyTimeout)
defer cancel()
Expand All @@ -402,15 +392,32 @@ func runSidecar(
}

uploadCompactedFunc := func() bool { return conf.shipper.uploadCompacted }
s := shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource,
uploadCompactedFunc, conf.shipper.allowOutOfOrderUpload, metadata.HashFunc(conf.shipper.hashFunc), conf.shipper.metaFileName)

return runutil.Repeat(30*time.Second, ctx.Done(), func() error {
if uploaded, err := s.Sync(ctx); err != nil {
if m.shipper == nil {
bkt, err := client.NewBucket(logger, confContentYaml, component.Sidecar.String())
if err != nil {
level.Warn(logger).Log("err", err, "msg", "Failed to create bucket. Sidecar will start without upload feature and will retry later.")
return nil // Allow sidecar to start without bucket.
}
bkt = objstoretracing.WrapWithTraces(objstore.WrapWithMetrics(bkt, extprom.WrapRegistererWithPrefix("thanos_", reg), bkt.Name()))
shipperUp.Set(1)
level.Debug(logger).Log("msg", "Bucket created")

defer runutil.CloseWithLogOnErr(logger, bkt, "bucket client")

m.shipper = shipper.New(logger, reg, conf.tsdb.path, bkt, m.Labels, metadata.SidecarSource,
uploadCompactedFunc, conf.shipper.allowOutOfOrderUpload, metadata.HashFunc(conf.shipper.hashFunc), conf.shipper.metaFileName)
level.Debug(logger).Log("msg", "Shipper created")
}

// The background shipper continuously scans the data directory and uploads
// new blocks to Google Cloud Storage or an S3-compatible storage service.
if uploaded, err := m.shipper.Sync(ctx); err != nil {
level.Warn(logger).Log("err", err, "uploaded", uploaded)
}

minTime, _, err := s.Timestamps()
minTime, _, err := m.shipper.Timestamps()
if err != nil {
level.Warn(logger).Log("msg", "reading timestamps failed", "err", err)
return nil
Expand Down Expand Up @@ -474,7 +481,8 @@ type promMetadata struct {
promVersion string
limitMinTime thanosmodel.TimeOrDurationValue

client *promclient.Client
client *promclient.Client
shipper *shipper.Shipper
}

func (s *promMetadata) UpdateLabels(ctx context.Context) error {
Expand Down

0 comments on commit 10349bf

Please sign in to comment.