Sidecar: wait for prometheus on startup (#7323)
Signed-off-by: Michael Hoffmann <mhoffm@posteo.de>
MichaHoffmann authored May 3, 2024
1 parent f0b1753 commit c94b34c
Showing 2 changed files with 63 additions and 38 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -12,6 +12,8 @@ We use *breaking :warning:* to mark changes that are not backward compatible (re

### Fixed

- [#7323](https://github.com/thanos-io/thanos/pull/7323) Sidecar: wait for prometheus on startup

### Added

- [#7317](https://github.com/thanos-io/thanos/pull/7317) Tracing: allow specifying resource attributes for the OTLP configuration.
99 changes: 61 additions & 38 deletions cmd/thanos/sidecar.go
@@ -172,64 +172,87 @@ func runSidecar(
         Help: "Boolean indicator whether the sidecar can reach its Prometheus peer.",
     })
 
-    ctx, cancel := context.WithCancel(context.Background())
-    g.Add(func() error {
-        // Only check Prometheus's flags when upload is enabled.
-        if uploads {
-            // Check prometheus's flags to ensure same sidecar flags.
-            if err := validatePrometheus(ctx, m.client, logger, conf.shipper.ignoreBlockSize, m); err != nil {
-                return errors.Wrap(err, "validate Prometheus flags")
-            }
-        }
+    ctx := context.Background()
+    // Only check Prometheus's flags when upload is enabled.
+    if uploads {
+        // Check prometheus's flags to ensure same sidecar flags.
+        // We retry infinitely until we validated prometheus flags
+        err := runutil.Retry(conf.prometheus.getConfigInterval, ctx.Done(), func() error {
+            iterCtx, iterCancel := context.WithTimeout(context.Background(), conf.prometheus.getConfigTimeout)
+            defer iterCancel()
 
-        // We retry infinitely until we reach and fetch BuildVersion from our Prometheus.
-        err := runutil.Retry(2*time.Second, ctx.Done(), func() error {
-            if err := m.BuildVersion(ctx); err != nil {
+            if err := validatePrometheus(iterCtx, m.client, logger, conf.shipper.ignoreBlockSize, m); err != nil {
                 level.Warn(logger).Log(
-                    "msg", "failed to fetch prometheus version. Is Prometheus running? Retrying",
+                    "msg", "failed to validate prometheus flags. Is Prometheus running? Retrying",
                     "err", err,
                 )
                 return err
             }
 
             level.Info(logger).Log(
-                "msg", "successfully loaded prometheus version",
+                "msg", "successfully validated prometheus flags",
             )
             return nil
         })
         if err != nil {
-            return errors.Wrap(err, "failed to get prometheus version")
+            return errors.Wrap(err, "failed to validate prometheus flags")
         }
+    }
 
-        // Blocking query of external labels before joining as a Source Peer into gossip.
-        // We retry infinitely until we reach and fetch labels from our Prometheus.
-        err = runutil.Retry(2*time.Second, ctx.Done(), func() error {
-            if err := m.UpdateLabels(ctx); err != nil {
-                level.Warn(logger).Log(
-                    "msg", "failed to fetch initial external labels. Is Prometheus running? Retrying",
-                    "err", err,
-                )
-                promUp.Set(0)
-                statusProber.NotReady(err)
-                return err
-            }
+    // We retry infinitely until we reach and fetch BuildVersion from our Prometheus.
+    err := runutil.Retry(conf.prometheus.getConfigInterval, ctx.Done(), func() error {
+        iterCtx, iterCancel := context.WithTimeout(context.Background(), conf.prometheus.getConfigTimeout)
+        defer iterCancel()
 
-            level.Info(logger).Log(
-                "msg", "successfully loaded prometheus external labels",
-                "external_labels", m.Labels().String(),
+        if err := m.BuildVersion(iterCtx); err != nil {
+            level.Warn(logger).Log(
+                "msg", "failed to fetch prometheus version. Is Prometheus running? Retrying",
+                "err", err,
             )
-            promUp.Set(1)
-            statusProber.Ready()
-            return nil
-        })
-        if err != nil {
-            return errors.Wrap(err, "initial external labels query")
+            return err
         }
 
-        if len(m.Labels()) == 0 {
-            return errors.New("no external labels configured on Prometheus server, uniquely identifying external labels must be configured; see https://thanos.io/tip/thanos/storage.md#external-labels for details.")
+        level.Info(logger).Log(
+            "msg", "successfully loaded prometheus version",
+        )
+        return nil
+    })
+    if err != nil {
+        return errors.Wrap(err, "failed to get prometheus version")
     }
 
+    // Blocking query of external labels before joining as a Source Peer into gossip.
+    // We retry infinitely until we reach and fetch labels from our Prometheus.
+    err = runutil.Retry(conf.prometheus.getConfigInterval, ctx.Done(), func() error {
+        iterCtx, iterCancel := context.WithTimeout(context.Background(), conf.prometheus.getConfigTimeout)
+        defer iterCancel()
+
+        if err := m.UpdateLabels(iterCtx); err != nil {
+            level.Warn(logger).Log(
+                "msg", "failed to fetch initial external labels. Is Prometheus running? Retrying",
+                "err", err,
+            )
+            return err
+        }
+
+        level.Info(logger).Log(
+            "msg", "successfully loaded prometheus external labels",
+            "external_labels", m.Labels().String(),
+        )
+        return nil
+    })
+    if err != nil {
+        return errors.Wrap(err, "initial external labels query")
+    }
+
+    if len(m.Labels()) == 0 {
+        return errors.New("no external labels configured on Prometheus server, uniquely identifying external labels must be configured; see https://thanos.io/tip/thanos/storage.md#external-labels for details.")
+    }
+    promUp.Set(1)
+    statusProber.Ready()
+
+    ctx, cancel := context.WithCancel(context.Background())
+    g.Add(func() error {
         // Periodically query the Prometheus config. We use this as a heartbeat as well as for updating
         // the external labels we apply.
         return runutil.Repeat(conf.prometheus.getConfigInterval, ctx.Done(), func() error {
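The change boils down to: retry each startup check against Prometheus (flag validation, build version, external labels) with a per-attempt timeout instead of failing once, and only then mark the sidecar ready. For readers who want that pattern in isolation, here is a minimal, standalone sketch using only the Go standard library. The readiness URL, interval, and timeout values are illustrative assumptions, not values from the commit; the sidecar itself uses runutil.Retry with conf.prometheus.getConfigInterval and conf.prometheus.getConfigTimeout, as shown in the diff above.

```go
package main

import (
	"context"
	"fmt"
	"log"
	"net/http"
	"time"
)

// waitForPrometheus polls url until it answers HTTP 200 or ctx is cancelled.
// Each attempt gets its own short timeout, mirroring the per-iteration
// iterCtx/iterCancel pattern used in the sidecar change above.
func waitForPrometheus(ctx context.Context, url string, interval, perTryTimeout time.Duration) error {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		err := func() error {
			attemptCtx, cancel := context.WithTimeout(ctx, perTryTimeout)
			defer cancel()

			req, err := http.NewRequestWithContext(attemptCtx, http.MethodGet, url, nil)
			if err != nil {
				return err
			}
			resp, err := http.DefaultClient.Do(req)
			if err != nil {
				return err
			}
			defer resp.Body.Close()
			if resp.StatusCode != http.StatusOK {
				return fmt.Errorf("unexpected status %d", resp.StatusCode)
			}
			return nil
		}()
		if err == nil {
			return nil
		}
		log.Printf("prometheus not ready yet, retrying: %v", err)

		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
		}
	}
}

func main() {
	// /-/ready is Prometheus's standard readiness endpoint; the URL, interval,
	// and timeout here are illustrative, not taken from the commit.
	if err := waitForPrometheus(context.Background(), "http://localhost:9090/-/ready", 2*time.Second, 5*time.Second); err != nil {
		log.Fatalf("prometheus never became ready: %v", err)
	}
	fmt.Println("prometheus is up; a sidecar could now mark itself ready")
}
```

The per-attempt context is the important detail: a single long-lived context would let one hung call block startup indefinitely, whereas a fresh timeout per iteration keeps every probe bounded while the outer loop retries until Prometheus is reachable.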
