Commit 87418c3

Merge pull request #9942 from wenjiaswe/automated-cherry-pick-of-#9761-upstream-release-3.2
Automated cherry pick of #9761
2 parents cad3cf7 + 8c9fd1b commit 87418c3

8 files changed, 66 insertions(+), 8 deletions(-)

etcdserver/metrics.go  (+14)

@@ -41,6 +41,18 @@ var (
         Name: "leader_changes_seen_total",
         Help: "The number of leader changes seen.",
     })
+    heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{
+        Namespace: "etcd",
+        Subsystem: "server",
+        Name: "heartbeat_send_failures_total",
+        Help: "The total number of leader heartbeat send failures (likely overloaded from slow disk).",
+    })
+    slowApplies = prometheus.NewCounter(prometheus.CounterOpts{
+        Namespace: "etcd",
+        Subsystem: "server",
+        Name: "slow_apply_total",
+        Help: "The total number of slow apply requests (likely overloaded from slow disk).",
+    })
     proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{
         Namespace: "etcd",
         Subsystem: "server",
@@ -96,6 +108,8 @@ func init() {
     prometheus.MustRegister(hasLeader)
     prometheus.MustRegister(isLeader)
     prometheus.MustRegister(leaderChanges)
+    prometheus.MustRegister(heartbeatSendFailures)
+    prometheus.MustRegister(slowApplies)
     prometheus.MustRegister(proposalsCommitted)
     prometheus.MustRegister(proposalsApplied)
     prometheus.MustRegister(proposalsPending)
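
Note: the two new counters follow the standard Prometheus client pattern: define the counter, register it once, and let whatever handler serves the registry export it (etcd exposes its registry on its /metrics endpoint). A minimal, self-contained sketch of the same pattern, assuming a standalone program with an illustrative handler and port that are not part of this commit:

package main

import (
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

// Mirrors the heartbeatSendFailures counter added above; namespace,
// subsystem, and name combine into the exported series
// etcd_server_heartbeat_send_failures_total.
var heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{
    Namespace: "etcd",
    Subsystem: "server",
    Name:      "heartbeat_send_failures_total",
    Help:      "The total number of leader heartbeat send failures (likely overloaded from slow disk).",
})

func main() {
    prometheus.MustRegister(heartbeatSendFailures) // panics on duplicate registration
    heartbeatSendFailures.Inc()                    // what raft.go does when a heartbeat goes out late

    // Illustrative only: serve the default registry and scrape
    // http://localhost:8080/metrics.
    http.Handle("/metrics", promhttp.Handler())
    http.ListenAndServe(":8080", nil)
}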

etcdserver/raft.go  (+1)

@@ -346,6 +346,7 @@ func (r *raftNode) processMessages(ms []raftpb.Message) []raftpb.Message {
             // TODO: limit request rate.
             plog.Warningf("failed to send out heartbeat on time (exceeded the %v timeout for %v)", r.heartbeat, exceed)
             plog.Warningf("server is likely overloaded")
+            heartbeatSendFailures.Inc()
         }
     }
 }

etcdserver/util.go  (+1)

@@ -146,6 +146,7 @@ func warnOfExpensiveGenericRequest(now time.Time, reqStringer fmt.Stringer, pref
             result = resp
         }
         plog.Warningf("%srequest %q with result %q took too long (%v) to execute", prefix, reqStringer.String(), result, d)
+        slowApplies.Inc()
     }
 }

mvcc/backend/backend.go  (+5)

@@ -288,6 +288,8 @@ func (b *backend) Defrag() error {
 }

 func (b *backend) defrag() error {
+    now := time.Now()
+
     // TODO: make this non-blocking?
     // lock batchTx to ensure nobody is using previous tx, and then
     // close previous ongoing tx.
@@ -351,6 +353,9 @@ func (b *backend) defrag() error {
     atomic.StoreInt64(&b.size, size)
     atomic.StoreInt64(&b.sizeInUse, size-(int64(db.Stats().FreePageN)*int64(db.Info().PageSize)))

+    took := time.Since(now)
+    defragDurations.Observe(took.Seconds())
+
     return nil
 }

mvcc/backend/batch_tx.go  (+7 −5)

@@ -141,15 +141,15 @@ func unsafeForEach(tx *bolt.Tx, bucket []byte, visitor func(k, v []byte) error)
 // Commit commits a previous tx and begins a new writable one.
 func (t *batchTx) Commit() {
     t.Lock()
-    defer t.Unlock()
     t.commit(false)
+    t.Unlock()
 }

 // CommitAndStop commits the previous tx and does not create a new one.
 func (t *batchTx) CommitAndStop() {
     t.Lock()
-    defer t.Unlock()
     t.commit(true)
+    t.Unlock()
 }

 func (t *batchTx) Unlock() {
@@ -167,9 +167,11 @@ func (t *batchTx) commit(stop bool) {
     }

     start := time.Now()
+
     // gofail: var beforeCommit struct{}
     err := t.tx.Commit()
     // gofail: var afterCommit struct{}
+
     commitDurations.Observe(time.Since(start).Seconds())
     atomic.AddInt64(&t.backend.commits, 1)

@@ -214,21 +216,21 @@ func (t *batchTxBuffered) Unlock() {

 func (t *batchTxBuffered) Commit() {
     t.Lock()
-    defer t.Unlock()
     t.commit(false)
+    t.Unlock()
 }

 func (t *batchTxBuffered) CommitAndStop() {
     t.Lock()
-    defer t.Unlock()
     t.commit(true)
+    t.Unlock()
 }

 func (t *batchTxBuffered) commit(stop bool) {
     // all read txs must be closed to acquire boltdb commit rwlock
     t.backend.readTx.mu.Lock()
-    defer t.backend.readTx.mu.Unlock()
     t.unsafeCommit(stop)
+    t.backend.readTx.mu.Unlock()
 }

 func (t *batchTxBuffered) unsafeCommit(stop bool) {
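
The lock-handling change above swaps `defer t.Unlock()` for an explicit `t.Unlock()` after the commit call. As a generic Go sketch (plain sync.Mutex, not etcd's batchTx types), the two forms release the lock at the same point when commit returns normally; the explicit call simply skips the deferred-call setup and, unlike defer, would not run if commit panicked:

package main

import "sync"

type tx struct{ mu sync.Mutex }

func (t *tx) commit() { /* write batch to disk */ }

// Before: the unlock is deferred, so it also runs if commit panics.
func (t *tx) CommitDeferred() {
    t.mu.Lock()
    defer t.mu.Unlock()
    t.commit()
}

// After (the shape used in this diff): unlock explicitly once commit
// returns; no deferred call is set up.
func (t *tx) CommitExplicit() {
    t.mu.Lock()
    t.commit()
    t.mu.Unlock()
}

func main() {
    t := &tx{}
    t.CommitDeferred()
    t.CommitExplicit()
}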

mvcc/backend/metrics.go  (+20 −2)

@@ -22,20 +22,38 @@ var (
         Subsystem: "disk",
         Name: "backend_commit_duration_seconds",
         Help: "The latency distributions of commit called by backend.",
-        Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
+
+        // lowest bucket start of upper bound 0.001 sec (1 ms) with factor 2
+        // highest bucket start of 0.001 sec * 2^13 == 8.192 sec
+        Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
+    })
+
+    defragDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
+        Namespace: "etcd",
+        Subsystem: "disk",
+        Name: "backend_defrag_duration_seconds",
+        Help: "The latency distribution of backend defragmentation.",
+
+        // 100 MB usually takes 1 sec, so start with 10 MB of 100 ms
+        // lowest bucket start of upper bound 0.1 sec (100 ms) with factor 2
+        // highest bucket start of 0.1 sec * 2^12 == 409.6 sec
+        Buckets: prometheus.ExponentialBuckets(.1, 2, 13),
     })

     snapshotDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
         Namespace: "etcd",
         Subsystem: "disk",
         Name: "backend_snapshot_duration_seconds",
         Help: "The latency distribution of backend snapshots.",
-        // 10 ms -> 655 seconds
+
+        // lowest bucket start of upper bound 0.01 sec (10 ms) with factor 2
+        // highest bucket start of 0.01 sec * 2^16 == 655.36 sec
         Buckets: prometheus.ExponentialBuckets(.01, 2, 17),
     })
 )

 func init() {
     prometheus.MustRegister(commitDurations)
+    prometheus.MustRegister(defragDurations)
     prometheus.MustRegister(snapshotDurations)
 }
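
The bucket comments above can be checked directly: prometheus.ExponentialBuckets(start, factor, count) returns count upper bounds, starting at start and multiplying by factor. A tiny sketch (not part of the commit) that prints the bounds used by these three histograms:

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
)

func main() {
    // Commit histogram: 14 buckets, 0.001 s up to 0.001 * 2^13 = 8.192 s.
    fmt.Println(prometheus.ExponentialBuckets(0.001, 2, 14))
    // Defrag histogram: 13 buckets, 0.1 s up to 0.1 * 2^12 = 409.6 s.
    fmt.Println(prometheus.ExponentialBuckets(.1, 2, 13))
    // Snapshot histogram: 17 buckets, 0.01 s up to 0.01 * 2^16 = 655.36 s.
    fmt.Println(prometheus.ExponentialBuckets(.01, 2, 17))
}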

mvcc/kvstore.go  (+4)

@@ -150,8 +150,12 @@ func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) {
 }

 func (s *store) Hash() (hash uint32, revision int64, err error) {
+    start := time.Now()
+
     s.b.ForceCommit()
     h, err := s.b.Hash(DefaultIgnores)
+
+    hashDurations.Observe(time.Since(start).Seconds())
     return h, s.currentRev, err
 }

mvcc/metrics.go  (+14 −1)

@@ -173,7 +173,19 @@ var (
     )
     // overridden by mvcc initialization
     reportDbTotalSizeInUseInBytesMu sync.RWMutex
-    reportDbTotalSizeInUseInBytes = func() float64 { return 0 }
+    reportDbTotalSizeInUseInBytes func() float64 = func() float64 { return 0 }
+
+    hashDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
+        Namespace: "etcd",
+        Subsystem: "mvcc",
+        Name: "hash_duration_seconds",
+        Help: "The latency distribution of storage hash operation.",
+
+        // 100 MB usually takes 100 ms, so start with 10 MB of 10 ms
+        // lowest bucket start of upper bound 0.01 sec (10 ms) with factor 2
+        // highest bucket start of 0.01 sec * 2^14 == 163.84 sec
+        Buckets: prometheus.ExponentialBuckets(.01, 2, 15),
+    })
 )

 func init() {
@@ -193,6 +205,7 @@ func init() {
     prometheus.MustRegister(dbTotalSizeDebugging)
     prometheus.MustRegister(dbTotalSize)
     prometheus.MustRegister(dbTotalSizeInUse)
+    prometheus.MustRegister(hashDurations)
 }

 // ReportEventReceived reports that an event is received.
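
The same measure-and-observe pattern backs the new defrag and hash timings: take a timestamp before the operation and feed the elapsed seconds into the histogram. A compact sketch of that pattern; the helper function and its wiring are illustrative, not code from this commit:

package main

import (
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

// Stand-in for the hashDurations histogram registered above.
var hashDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
    Namespace: "etcd",
    Subsystem: "mvcc",
    Name:      "hash_duration_seconds",
    Help:      "The latency distribution of storage hash operation.",
    Buckets:   prometheus.ExponentialBuckets(.01, 2, 15),
})

// observeDuration is a hypothetical helper showing the pattern applied to
// store.Hash() and backend.defrag(): record the start time, run the
// operation, then observe the elapsed time in seconds.
func observeDuration(h prometheus.Histogram, op func()) {
    start := time.Now()
    op()
    h.Observe(time.Since(start).Seconds())
}

func main() {
    prometheus.MustRegister(hashDurations)
    observeDuration(hashDurations, func() { time.Sleep(20 * time.Millisecond) })
}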

0 commit comments