
Add flush job mode to ingester. #1885


Closed
wants to merge 1 commit
pkg/cortex/cortex.go: 19 additions & 1 deletion
@@ -143,6 +143,8 @@ func (c *Config) Validate() error {

// Cortex is the root datastructure for Cortex.
type Cortex struct {
cfg Config

target moduleName
httpAuthMiddleware middleware.Interface

@@ -173,6 +175,7 @@ func New(cfg Config) (*Cortex, error) {
}

cortex := &Cortex{
cfg: cfg,
target: cfg.Target,
}

@@ -240,7 +243,22 @@ func (t *Cortex) initModule(cfg *Config, m moduleName) error {

// Run starts Cortex running, and blocks until a signal is received.
func (t *Cortex) Run() error {
return t.server.Run()
var err error
done := make(chan struct{})
Contributor: We are not waiting for this anywhere (I will create a follow-up PR to your branch, just a note to myself)

go func() {
err = t.server.Run()

close(done)
}()

// Initiate shutdown if it's a job to flush data.
if t.cfg.Ingester.IsFlushJob {
_ = t.Stop()
Contributor: We need to make sure that calling t.Stop() twice is not a problem. Currently, main.go would also call this and try stopping the modules again.

}

<-done

return err
}

// Stop gracefully stops a Cortex.
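A note on the reviewer's concern above: one way to make t.Stop() safe to call both from Run() (in flush-job mode) and from main.go is to guard it with sync.Once. The following is only a sketch under that assumption, not the actual Cortex code; stopModules is a hypothetical helper standing in for the existing module-shutdown logic, and "sync" would need to be imported in pkg/cortex/cortex.go.

type Cortex struct {
	// ...existing fields elided...
	stopOnce sync.Once
	stopErr  error
}

// Stop gracefully stops a Cortex. Safe to call more than once: only the
// first call runs the shutdown, later calls return the same result.
func (t *Cortex) Stop() error {
	t.stopOnce.Do(func() {
		t.stopErr = t.stopModules()
	})
	return t.stopErr
}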
pkg/ingester/ingester.go: 16 additions & 0 deletions
@@ -118,6 +118,10 @@ type Config struct {

RateUpdatePeriod time.Duration

// Controls whether this is a job that solely flushes and exits or not.
// Useful when paired with WAL to flush WAL data immediately in case of incidents.
IsFlushJob bool

// Use tsdb block storage
TSDBEnabled bool `yaml:"-"`
TSDBConfig tsdb.Config `yaml:"-"`
@@ -145,6 +149,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
f.BoolVar(&cfg.SpreadFlushes, "ingester.spread-flushes", false, "If true, spread series flushes across the whole period of MaxChunkAge")
f.IntVar(&cfg.ConcurrentFlushes, "ingester.concurrent-flushes", 50, "Number of concurrent goroutines flushing to dynamodb.")
f.DurationVar(&cfg.RateUpdatePeriod, "ingester.rate-update-period", 15*time.Second, "Period with which to update the per-user ingestion rates.")
f.BoolVar(&cfg.IsFlushJob, "ingester.is-flush-job", false, "Enables flush mode where the ingester just flushes and exits.")
}

// Ingester deals with "in flight" chunks. Based on Prometheus 1.x
@@ -204,6 +209,12 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c
flushQueues: make([]*util.PriorityQueue, cfg.ConcurrentFlushes, cfg.ConcurrentFlushes),
}

// If it's a flush job then transfers need to be disabled.
if cfg.IsFlushJob {
cfg.LifecyclerConfig.NumTokens = 0
cfg.MaxTransferRetries = -1 // Disables transfers.
}

var err error
i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", ring.IngesterRingKey)
if err != nil {
@@ -214,6 +225,11 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c
i.limiter = NewSeriesLimiter(limits, i.lifecycler, cfg.LifecyclerConfig.RingConfig.ReplicationFactor, cfg.ShardByAllLabels)
i.userStates = newUserStates(i.limiter, cfg)

// If it's a flush job, the ingester should not participate in the transfer or ingestion logic. It already has 0 tokens,
// and setting the initial state to ACTIVE means it won't be picked as a transfer target.
if cfg.IsFlushJob {
i.lifecycler.SetState(ring.ACTIVE)
}
// Now that user states have been created, we can start the lifecycler
i.lifecycler.Start()

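Taken together, the ingester-side changes mean a flush-job ingester joins the ring with zero tokens, reports itself ACTIVE, and never accepts or initiates a hand-off. A rough sketch of the effective configuration, simplified from the diff above and not actual Cortex code:

// What New() effectively does when -ingester.is-flush-job is set.
cfg := ingester.Config{IsFlushJob: true}
cfg.LifecyclerConfig.NumTokens = 0 // owns no tokens, so no series are routed to it
cfg.MaxTransferRetries = -1        // TransferOut returns ring.ErrTransferDisabled

// The lifecycler then starts in ACTIVE state, so other ingesters never pick
// this instance as a transfer target. On shutdown, with transfers disabled,
// the ingester is expected to fall back to flushing its in-memory chunks
// (e.g. data replayed from the WAL) to the chunk store before exiting.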
pkg/ingester/transfer.go: 1 addition & 1 deletion
@@ -396,7 +396,7 @@ func fromWireChunks(wireChunks []client.Chunk) ([]*desc, error) {
// TransferOut finds an ingester in PENDING state and transfers our chunks to it.
// Called as part of the ingester shutdown process.
func (i *Ingester) TransferOut(ctx context.Context) error {
if i.cfg.MaxTransferRetries <= 0 {
if i.cfg.MaxTransferRetries <= 0 || i.cfg.IsFlushJob {
return ring.ErrTransferDisabled
}
backoff := util.NewBackoff(ctx, util.BackoffConfig{
pkg/ring/lifecycler.go: 8 additions & 5 deletions
@@ -236,7 +239,9 @@ func (i *Lifecycler) GetState() IngesterState {
return i.state
}

func (i *Lifecycler) setState(state IngesterState) {
// SetState sets the lifecycler's in-memory state. It does not change the state stored in the ring;
// use ChangeState when the ring entry should be updated as well.
func (i *Lifecycler) SetState(state IngesterState) {
i.stateMtx.Lock()
defer i.stateMtx.Unlock()
i.state = state
@@ -484,7 +486,7 @@ func (i *Lifecycler) initRing(ctx context.Context) error {
if len(tokensFromFile) > 0 {
level.Info(util.Logger).Log("msg", "adding tokens from file", "num_tokens", len(tokensFromFile))
if len(tokensFromFile) >= i.cfg.NumTokens {
i.setState(ACTIVE)
i.SetState(ACTIVE)
}
ringDesc.AddIngester(i.ID, i.Addr, tokensFromFile, i.GetState())
i.setTokens(tokensFromFile)
@@ -498,7 +500,7 @@ func (i *Lifecycler) initRing(ctx context.Context) error {
}

// We exist in the ring, so assume the ring is right and copy out tokens & state out of there.
i.setState(ingesterDesc.State)
i.SetState(ingesterDesc.State)
tokens, _ := ringDesc.TokensFor(i.ID)
i.setTokens(tokens)

@@ -598,7 +600,8 @@ func (i *Lifecycler) autoJoin(ctx context.Context, targetState IngesterState) er
}

newTokens := GenerateTokens(i.cfg.NumTokens-len(myTokens), takenTokens)
i.setState(targetState)
i.SetState(targetState)
ringDesc.AddIngester(i.ID, i.Addr, newTokens, i.GetState())

myTokens = append(myTokens, newTokens...)
sort.Sort(myTokens)
@@ -666,7 +669,7 @@ func (i *Lifecycler) changeState(ctx context.Context, state IngesterState) error
}

level.Info(util.Logger).Log("msg", "changing instance state from", "old_state", currState, "new_state", state, "ring", i.RingName)
i.setState(state)
i.SetState(state)
return i.updateConsul(ctx)
}

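For context on the newly exported SetState: it only mutates the lifecycler's in-memory state, while ChangeState also writes the new state to this instance's entry in the ring. A small sketch of the distinction, assuming a *Lifecycler named l and a context ctx:

// In-memory only; the ring entry is not updated by this call.
l.SetState(ring.ACTIVE)

// In-memory state plus an update of the ring entry
// (updateConsul in the diff above).
if err := l.ChangeState(ctx, ring.ACTIVE); err != nil {
	level.Error(util.Logger).Log("msg", "failed to change state", "err", err)
}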