Skip to content

Commit d015aad

Browse files
authored
BugFix: 1.0 Ingesters having availability zone value erased by pre-1.0 ingesters during rolling upgrade. (#2404)
* When executing updateConsul for an existing ingester, ensure to write the zone on the updated record Signed-off-by: Ken Haines <khaines@microsoft.com> * adding unit test checking that the zone value is updated when an external actor clears it Signed-off-by: Ken Haines <khaines@microsoft.com> * updating CHANGELOG.md Signed-off-by: Ken Haines <khaines@microsoft.com> * removing internal default 'no-zone' value of lifecyler.id and leaving a blank string Signed-off-by: Ken Haines <khaines@microsoft.com>
1 parent 8f59b14 commit d015aad

File tree

3 files changed

+57
-4
lines changed

3 files changed

+57
-4
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
* [BUGFIX] Experimental TSDB: fixed chunk data corruption when querying back series using the experimental blocks storage. #2400
1818
* [BUGFIX] Cassandra Storage: Fix endpoint TLS host verification. #2109
1919
* [BUGFIX] Experimental TSDB: fixed response status code from `422` to `500` when an error occurs while iterating chunks with the experimental blocks storage. #2402
20+
* [BUGFIX] Ring: Fixed a situation where upgrading from pre-1.0 cortex with a rolling strategy caused new 1.0 ingesters to lose their zone value in the ring until manually forced to re-register. #2404
2021

2122
## 1.0.0 / 2020-04-02
2223

pkg/ring/lifecycler.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,10 +162,6 @@ func NewLifecycler(cfg LifecyclerConfig, flushTransferer FlushTransferer, ringNa
162162
util.WarnExperimentalUse("Zone aware replication")
163163
}
164164

165-
if zone == "" {
166-
zone = cfg.ID
167-
}
168-
169165
// We do allow a nil FlushTransferer, but to keep the ring logic easier we assume
170166
// it's always set, so we use a noop FlushTransferer
171167
if flushTransferer == nil {
@@ -667,6 +663,7 @@ func (i *Lifecycler) updateConsul(ctx context.Context) error {
667663
ingesterDesc.Timestamp = time.Now().Unix()
668664
ingesterDesc.State = i.GetState()
669665
ingesterDesc.Addr = i.Addr
666+
ingesterDesc.Zone = i.Zone
670667
ringDesc.Ingesters[i.ID] = ingesterDesc
671668
}
672669

pkg/ring/lifecycler_test.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ func testLifecyclerConfig(ringConfig Config, id string) LifecyclerConfig {
3737
lifecyclerConfig.RingConfig = ringConfig
3838
lifecyclerConfig.NumTokens = 1
3939
lifecyclerConfig.ID = id
40+
lifecyclerConfig.Zone = "zone1"
4041
lifecyclerConfig.FinalSleep = 0
4142
lifecyclerConfig.HeartbeatPeriod = 100 * time.Millisecond
4243

@@ -390,3 +391,57 @@ func TestJoinInLeavingState(t *testing.T) {
390391
len(desc.Ingesters["ing2"].Tokens) == 2
391392
})
392393
}
394+
395+
func TestRestoreOfZoneWhenOverwritten(t *testing.T) {
396+
// This test is simulating a case during upgrade of pre 1.0 cortex where
397+
// older ingesters do not have the zone field in their ring structs
398+
// so it gets removed. The current version of the lifecylcer should
399+
// write it back on update during its next heartbeat.
400+
401+
var ringConfig Config
402+
flagext.DefaultValues(&ringConfig)
403+
codec := GetCodec()
404+
ringConfig.KVStore.Mock = consul.NewInMemoryClient(codec)
405+
406+
r, err := New(ringConfig, "ingester", IngesterRingKey)
407+
require.NoError(t, err)
408+
require.NoError(t, services.StartAndAwaitRunning(context.Background(), r))
409+
defer services.StopAndAwaitTerminated(context.Background(), r) //nolint:errcheck
410+
411+
cfg := testLifecyclerConfig(ringConfig, "ing1")
412+
413+
// Set ing1 to not have a zone
414+
err = r.KVClient.CAS(context.Background(), IngesterRingKey, func(in interface{}) (interface{}, bool, error) {
415+
r := &Desc{
416+
Ingesters: map[string]IngesterDesc{
417+
"ing1": {
418+
State: ACTIVE,
419+
Addr: "0.0.0.0",
420+
Tokens: []uint32{1, 4},
421+
},
422+
"ing2": {
423+
Tokens: []uint32{2, 3},
424+
},
425+
},
426+
}
427+
428+
return r, true, nil
429+
})
430+
require.NoError(t, err)
431+
432+
l1, err := NewLifecycler(cfg, &nopFlushTransferer{}, "ingester", IngesterRingKey, true)
433+
require.NoError(t, err)
434+
require.NoError(t, services.StartAndAwaitRunning(context.Background(), l1))
435+
436+
// Check that the lifecycler was able to reset the zone value to the expected setting
437+
test.Poll(t, 1000*time.Millisecond, true, func() interface{} {
438+
d, err := r.KVClient.Get(context.Background(), IngesterRingKey)
439+
require.NoError(t, err)
440+
desc, ok := d.(*Desc)
441+
return ok &&
442+
len(desc.Ingesters) == 2 &&
443+
desc.Ingesters["ing1"].Zone == l1.Zone &&
444+
desc.Ingesters["ing2"].Zone == ""
445+
446+
})
447+
}

0 commit comments

Comments
 (0)