
Add flush job mode to ingester. #1885


Closed
wants to merge 1 commit
pkg/cortex/cortex.go: 19 additions & 1 deletion
@@ -143,6 +143,8 @@ func (c *Config) Validate() error {

// Cortex is the root datastructure for Cortex.
type Cortex struct {
cfg Config

target moduleName
httpAuthMiddleware middleware.Interface

@@ -173,6 +175,7 @@ func New(cfg Config) (*Cortex, error) {
}

cortex := &Cortex{
cfg: cfg,
target: cfg.Target,
}

@@ -240,7 +243,22 @@ func (t *Cortex) initModule(cfg *Config, m moduleName) error {

// Run starts Cortex running, and blocks until a signal is received.
func (t *Cortex) Run() error {
return t.server.Run()
var err error
done := make(chan struct{})
Contributor: We are not waiting for this anywhere (I will create a follow-up PR to your branch, just a note to myself)

go func() {
err = t.server.Run()

close(done)
}()

// Initiate shutdown if it's a job to flush data.
if t.cfg.Ingester.IsFlushJob {
_ = t.Stop()
Contributor: We need to make sure that calling t.Stop() twice is not a problem. Currently, main.go would also call this and try stopping the modules again.

}

<-done

return err
}

// Stop gracefully stops a Cortex.
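A note on the reviewer's concern above: one way to make t.Stop() safe to call both from Run() (in flush-job mode) and from main.go is to guard it with sync.Once. The following is only a sketch under that assumption, not the actual Cortex code; stopModules is a hypothetical helper standing in for the existing module-shutdown logic, and "sync" would need to be imported in pkg/cortex/cortex.go.

type Cortex struct {
	// ...existing fields elided...
	stopOnce sync.Once
	stopErr  error
}

// Stop gracefully stops a Cortex. Safe to call more than once: only the
// first call runs the shutdown, later calls return the same result.
func (t *Cortex) Stop() error {
	t.stopOnce.Do(func() {
		t.stopErr = t.stopModules()
	})
	return t.stopErr
}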
pkg/ingester/ingester.go: 16 additions & 0 deletions
@@ -118,6 +118,10 @@ type Config struct {

RateUpdatePeriod time.Duration

// Controls whether this is a job that solely flushes and exits or not.
// Useful when paired with WAL to flush WAL data immediately in case of incidents.
IsFlushJob bool

// Use tsdb block storage
TSDBEnabled bool `yaml:"-"`
TSDBConfig tsdb.Config `yaml:"-"`
@@ -145,6 +149,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
f.BoolVar(&cfg.SpreadFlushes, "ingester.spread-flushes", false, "If true, spread series flushes across the whole period of MaxChunkAge")
f.IntVar(&cfg.ConcurrentFlushes, "ingester.concurrent-flushes", 50, "Number of concurrent goroutines flushing to dynamodb.")
f.DurationVar(&cfg.RateUpdatePeriod, "ingester.rate-update-period", 15*time.Second, "Period with which to update the per-user ingestion rates.")
f.BoolVar(&cfg.IsFlushJob, "ingester.is-flush-job", false, "Enables flush mode where the ingester just flushes and exits.")
}

// Ingester deals with "in flight" chunks. Based on Prometheus 1.x
@@ -204,6 +209,12 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c
flushQueues: make([]*util.PriorityQueue, cfg.ConcurrentFlushes, cfg.ConcurrentFlushes),
}

// If it's a flush job then transfers need to be disabled.
if cfg.IsFlushJob {
cfg.LifecyclerConfig.NumTokens = 0
cfg.MaxTransferRetries = -1 // Disables transfers.
}

var err error
i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", ring.IngesterRingKey)
if err != nil {
@@ -214,6 +225,11 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c
i.limiter = NewSeriesLimiter(limits, i.lifecycler, cfg.LifecyclerConfig.RingConfig.ReplicationFactor, cfg.ShardByAllLabels)
i.userStates = newUserStates(i.limiter, cfg)

// If it's a flush job, the ingester should not participate in the transfer or ingestion logic. It already has 0 tokens,
// and setting the initial state to ACTIVE means it won't be picked as a transfer target.
if cfg.IsFlushJob {
i.lifecycler.SetState(ring.ACTIVE)
}
// Now that user states have been created, we can start the lifecycler
i.lifecycler.Start()

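Taken together, the ingester-side changes mean a flush-job ingester joins the ring with zero tokens, reports itself ACTIVE, and never accepts or initiates a hand-off. A rough sketch of the effective configuration, simplified from the diff above and not actual Cortex code:

// What New() effectively does when -ingester.is-flush-job is set.
cfg := ingester.Config{IsFlushJob: true}
cfg.LifecyclerConfig.NumTokens = 0 // owns no tokens, so no series are routed to it
cfg.MaxTransferRetries = -1        // TransferOut returns ring.ErrTransferDisabled

// The lifecycler then starts in ACTIVE state, so other ingesters never pick
// this instance as a transfer target. On shutdown, with transfers disabled,
// the ingester is expected to fall back to flushing its in-memory chunks
// (e.g. data replayed from the WAL) to the chunk store before exiting.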
pkg/ingester/transfer.go: 1 addition & 1 deletion
@@ -396,7 +396,7 @@ func fromWireChunks(wireChunks []client.Chunk) ([]*desc, error) {
// TransferOut finds an ingester in PENDING state and transfers our chunks to it.
// Called as part of the ingester shutdown process.
func (i *Ingester) TransferOut(ctx context.Context) error {
if i.cfg.MaxTransferRetries <= 0 {
if i.cfg.MaxTransferRetries <= 0 || i.cfg.IsFlushJob {
return ring.ErrTransferDisabled
}
backoff := util.NewBackoff(ctx, util.BackoffConfig{
pkg/ring/lifecycler.go: 8 additions & 5 deletions
@@ -236,7 +239,9 @@ func (i *Lifecycler) GetState() IngesterState {
return i.state
}

func (i *Lifecycler) setState(state IngesterState) {
// SetState sets the lifecycler's in-memory state. It does not change the state stored in the ring;
// use ChangeState when the ring entry should be updated as well.
func (i *Lifecycler) SetState(state IngesterState) {
i.stateMtx.Lock()
defer i.stateMtx.Unlock()
i.state = state
@@ -484,7 +486,7 @@ func (i *Lifecycler) initRing(ctx context.Context) error {
if len(tokensFromFile) > 0 {
level.Info(util.Logger).Log("msg", "adding tokens from file", "num_tokens", len(tokensFromFile))
if len(tokensFromFile) >= i.cfg.NumTokens {
i.setState(ACTIVE)
i.SetState(ACTIVE)
}
ringDesc.AddIngester(i.ID, i.Addr, tokensFromFile, i.GetState())
i.setTokens(tokensFromFile)
@@ -498,7 +500,7 @@ func (i *Lifecycler) initRing(ctx context.Context) error {
}

// We exist in the ring, so assume the ring is right and copy out tokens & state out of there.
i.setState(ingesterDesc.State)
i.SetState(ingesterDesc.State)
tokens, _ := ringDesc.TokensFor(i.ID)
i.setTokens(tokens)

@@ -598,7 +600,8 @@ func (i *Lifecycler) autoJoin(ctx context.Context, targetState IngesterState) er
}

newTokens := GenerateTokens(i.cfg.NumTokens-len(myTokens), takenTokens)
i.setState(targetState)
i.SetState(targetState)
ringDesc.AddIngester(i.ID, i.Addr, newTokens, i.GetState())

myTokens = append(myTokens, newTokens...)
sort.Sort(myTokens)
@@ -666,7 +669,7 @@ func (i *Lifecycler) changeState(ctx context.Context, state IngesterState) error
}

level.Info(util.Logger).Log("msg", "changing instance state from", "old_state", currState, "new_state", state, "ring", i.RingName)
i.setState(state)
i.SetState(state)
return i.updateConsul(ctx)
}

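For context on the newly exported SetState: it only mutates the lifecycler's in-memory state, while ChangeState also writes the new state to this instance's entry in the ring. A small sketch of the distinction, assuming a *Lifecycler named l and a context ctx:

// In-memory only; the ring entry is not updated by this call.
l.SetState(ring.ACTIVE)

// In-memory state plus an update of the ring entry
// (updateConsul in the diff above).
if err := l.ChangeState(ctx, ring.ACTIVE); err != nil {
	level.Error(util.Logger).Log("msg", "failed to change state", "err", err)
}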