Skip to content

Commit d74af59

Browse files
pracuccigouthamve
authored andcommitted
Fixed HA Tracker jitter causing unnecessary CAS operations (#1861)
* Fixed HA Tracker jitter causing unnecessary CAS operations Signed-off-by: Marco Pracucci <marco@pracucci.com> * Added entry to changelog Signed-off-by: Marco Pracucci <marco@pracucci.com> * Improved HATrackerConfig.Validate() Signed-off-by: Marco Pracucci <marco@pracucci.com> * Cleaner checkReplicaTimestamp() test function Signed-off-by: Marco Pracucci <marco@pracucci.com>
1 parent 4998875 commit d74af59

File tree

3 files changed

+206
-90
lines changed

3 files changed

+206
-90
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
## master / unreleased
44

5+
* [BUGFIX] Fixed unnecessary CAS operations done by the HA tracker when the jitter is enabled. #1861
6+
57
## 0.4.0 / 2019-12-02
68

79
* [CHANGE] The frontend component has been refactored to be easier to re-use. When upgrading the frontend, cache entries will be discarded and re-created with the new protobuf schema. #1734

pkg/distributor/ha_tracker.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package distributor
22

33
import (
44
"context"
5+
"errors"
56
"flag"
67
"fmt"
78
"math/rand"
@@ -48,6 +49,9 @@ var (
4849
Name: "ha_tracker_kv_store_cas_total",
4950
Help: "The total number of CAS calls to the KV store for a user ID/cluster.",
5051
}, []string{"user", "cluster"})
52+
53+
errNegativeUpdateTimeoutJitterMax = errors.New("HA tracker max update timeout jitter shouldn't be negative")
54+
errInvalidFailoverTimeout = "HA Tracker failover timeout (%v) must be at least 1s greater than update timeout - max jitter (%v)"
5155
)
5256

5357
// ProtoReplicaDescFactory makes new InstanceDescs
@@ -117,10 +121,15 @@ func (cfg *HATrackerConfig) RegisterFlags(f *flag.FlagSet) {
117121

118122
// Validate config and returns error on failure
119123
func (cfg *HATrackerConfig) Validate() error {
120-
if cfg.FailoverTimeout < cfg.UpdateTimeout+cfg.UpdateTimeoutJitterMax+time.Second {
121-
return fmt.Errorf("HA Tracker failover timeout (%v) must be at least 1s greater than update timeout (%v)",
122-
cfg.FailoverTimeout, cfg.UpdateTimeout+cfg.UpdateTimeoutJitterMax+time.Second)
124+
if cfg.UpdateTimeoutJitterMax < 0 {
125+
return errNegativeUpdateTimeoutJitterMax
126+
}
127+
128+
minFailureTimeout := cfg.UpdateTimeout + cfg.UpdateTimeoutJitterMax + time.Second
129+
if cfg.FailoverTimeout < minFailureTimeout {
130+
return fmt.Errorf(errInvalidFailoverTimeout, cfg.FailoverTimeout, minFailureTimeout)
123131
}
132+
124133
return nil
125134
}
126135

@@ -207,7 +216,7 @@ func (c *haTracker) checkReplica(ctx context.Context, userID, cluster, replica s
207216
c.electedLock.RLock()
208217
entry, ok := c.elected[key]
209218
c.electedLock.RUnlock()
210-
if ok && now.Sub(timestamp.Time(entry.ReceivedAt)) < c.cfg.UpdateTimeout {
219+
if ok && now.Sub(timestamp.Time(entry.ReceivedAt)) < c.cfg.UpdateTimeout+c.updateTimeoutJitter {
211220
if entry.Replica != replica {
212221
return replicasNotMatchError(replica, entry.Replica)
213222
}

0 commit comments

Comments
 (0)