Skip to content

Commit 9845a5e

Browse files
author
Ganesh Vernekar
committed
Add metrics for checkpoint and name changes
Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
1 parent 787fbb8 commit 9845a5e

File tree

1 file changed

+59
-15
lines changed

1 file changed

+59
-15
lines changed

pkg/ingester/wal.go

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -55,58 +55,89 @@ func (noop) Log(*Record) error {
5555
// Stop any background WAL processes. For noop the method body is empty.
5656
func (noop) Stop() {}
5757

58-
type wrapper struct {
58+
// walWrapper is the WAL implementation returned by newWAL when the WAL is
// enabled. It holds the underlying *wal.WAL, the state used to stop the
// background run goroutine, and the checkpoint counters.
type walWrapper struct {
5959
cfg WALConfig // configuration passed to newWAL
6060
ingester *Ingester // ingester passed to newWAL
6161
quit chan struct{} // closed by Stop; isStopped reports true once it is closed
6262
wait sync.WaitGroup // tracks the run goroutine; Stop waits on it
6363

6464
lastWalSegment int // NOTE(review): not used in the visible hunks; presumably the last WAL segment handled, confirm
6565
wal *wal.WAL // created by wal.New in newWAL; Log writes to it and Stop closes it
66+
67+
// Checkpoint metrics: attempts and failures for checkpoint creation and
// checkpoint deletion.
68+
checkpointDeleteFail prometheus.Counter
69+
checkpointDeleteTotal prometheus.Counter
70+
checkpointCreationFail prometheus.Counter
71+
checkpointCreationTotal prometheus.Counter
6672
}
6773

6874
// newWAL builds the WAL implementation selected by cfg.
//
// When cfg.enabled is false it returns a noop WAL and does nothing else.
// Otherwise it calls wal.New on the "wal" subdirectory of cfg.dir, creates the
// four checkpoint counters, registers them when cfg.metricsRegisterer is
// non-nil, and starts the background run goroutine (tracked by w.wait) before
// returning the walWrapper.
//
// An error from wal.New is returned unchanged together with a nil WAL.
func newWAL(cfg WALConfig, ingester *Ingester) (WAL, error) {
6975
if !cfg.enabled {
7076
return &noop{}, nil
7177
}
7278

73-
var samplesRegistry prometheus.Registerer
79+
// walRegistry stays nil when no registerer is configured; otherwise it wraps
// cfg.metricsRegisterer with the label kind="wal".
var walRegistry prometheus.Registerer
7480
if cfg.metricsRegisterer != nil {
75-
samplesRegistry = prometheus.WrapRegistererWith(prometheus.Labels{"kind": "samples"}, cfg.metricsRegisterer)
81+
walRegistry = prometheus.WrapRegistererWith(prometheus.Labels{"kind": "wal"}, cfg.metricsRegisterer)
7682
}
77-
samples, err := wal.New(util.Logger, samplesRegistry, path.Join(cfg.dir, "samples"), true)
83+
tsdbWAL, err := wal.New(util.Logger, walRegistry, path.Join(cfg.dir, "wal"), true)
7884
if err != nil {
7985
return nil, err
8086
}
8187

82-
w := &wrapper{
88+
w := &walWrapper{
8389
cfg: cfg,
8490
ingester: ingester,
8591
quit: make(chan struct{}),
86-
wal: samples,
92+
wal: tsdbWAL,
93+
}
94+
95+
w.checkpointDeleteFail = prometheus.NewCounter(prometheus.CounterOpts{
96+
Name: "ingester_checkpoint_deletions_failed_total",
97+
Help: "Total number of checkpoint deletions that failed.",
98+
})
99+
w.checkpointDeleteTotal = prometheus.NewCounter(prometheus.CounterOpts{
100+
Name: "ingester_checkpoint_deletions_total",
101+
Help: "Total number of checkpoint deletions attempted.",
102+
})
103+
w.checkpointCreationFail = prometheus.NewCounter(prometheus.CounterOpts{
104+
Name: "ingester_checkpoint_creations_failed_total",
105+
Help: "Total number of checkpoint creations that failed.",
106+
})
107+
w.checkpointCreationTotal = prometheus.NewCounter(prometheus.CounterOpts{
108+
Name: "ingester_checkpoint_creations_total",
109+
Help: "Total number of checkpoint creations attempted.",
110+
})
111+
if cfg.metricsRegisterer != nil {
112+
// NOTE(review): the counters are registered on the unwrapped registerer,
// not on walRegistry, so they do not go through the kind="wal" wrapper.
// MustRegister panics if registration fails (e.g. a duplicate collector);
// confirm newWAL runs only once per registerer.
cfg.metricsRegisterer.MustRegister(
113+
w.checkpointDeleteFail,
114+
w.checkpointDeleteTotal,
115+
w.checkpointCreationFail,
116+
w.checkpointCreationTotal,
117+
)
87118
}
88119

89120
// Account for the run goroutine before starting it so Stop can wait for it.
w.wait.Add(1)
90121
go w.run()
91122
return w, nil
92123
}
93124

94-
func (w *wrapper) Stop() {
125+
// Stop closes the quit channel, waits for the run goroutine to finish, and
// then closes the underlying WAL.
//
// NOTE(review): closing w.quit a second time would panic, so Stop must be
// called at most once. The error returned by w.wal.Close is discarded.
func (w *walWrapper) Stop() {
95126
close(w.quit)
96127
w.wait.Wait()
97128

98129
w.wal.Close()
99130
}
100131

101-
func (w *wrapper) Log(record *Record) error {
132+
// Log serializes record with proto.Marshal and passes the resulting bytes to
// the underlying WAL's Log method. It returns the marshalling error if
// serialization fails, otherwise whatever w.wal.Log returns.
func (w *walWrapper) Log(record *Record) error {
102133
buf, err := proto.Marshal(record)
103134
if err != nil {
104135
return err
105136
}
106137
return w.wal.Log(buf)
107138
}
108139

109-
func (w *wrapper) run() {
140+
func (w *walWrapper) run() {
110141
defer w.wait.Done()
111142

112143
for !w.isStopped() {
@@ -123,7 +154,7 @@ func (w *wrapper) run() {
123154
}
124155
}
125156

126-
func (w *wrapper) isStopped() bool {
157+
func (w *walWrapper) isStopped() bool {
127158
select {
128159
case <-w.quit:
129160
return true
@@ -134,7 +165,13 @@ func (w *wrapper) isStopped() bool {
134165

135166
const checkpointPrefix = "checkpoint."
136167

137-
func (w *wrapper) checkpoint() error {
168+
func (w *walWrapper) checkpoint() (err error) {
169+
w.checkpointCreationTotal.Inc()
170+
defer func() {
171+
if err != nil {
172+
w.checkpointCreationFail.Inc()
173+
}
174+
}()
138175
_, last, err := w.lastCheckpoint()
139176
if err != nil {
140177
return err
@@ -201,7 +238,7 @@ func (w *wrapper) checkpoint() error {
201238

202239
// lastCheckpoint returns the directory name and index of the most recent checkpoint.
203240
// If the WAL directory does not contain any checkpoints, -1 is returned as index.
204-
func (w *wrapper) lastCheckpoint() (string, int, error) {
241+
func (w *walWrapper) lastCheckpoint() (string, int, error) {
205242
files, err := ioutil.ReadDir(w.wal.Dir())
206243
if err != nil {
207244
return "", -1, err
@@ -226,7 +263,14 @@ func (w *wrapper) lastCheckpoint() (string, int, error) {
226263
}
227264

228265
// deleteCheckpoints deletes all checkpoints in the WAL directory below a given index.
229-
func (w *wrapper) deleteCheckpoints(maxIndex int) error {
266+
func (w *walWrapper) deleteCheckpoints(maxIndex int) (err error) {
267+
w.checkpointDeleteTotal.Inc()
268+
defer func() {
269+
if err != nil {
270+
w.checkpointDeleteFail.Inc()
271+
}
272+
}()
273+
230274
var errs tsdb_errors.MultiError
231275

232276
files, err := ioutil.ReadDir(w.wal.Dir())
@@ -248,7 +292,7 @@ func (w *wrapper) deleteCheckpoints(maxIndex int) error {
248292
return errs.Err()
249293
}
250294

251-
func (w *wrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Fingerprint, series *memorySeries) error {
295+
func (w *walWrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Fingerprint, series *memorySeries) error {
252296
wireChunks, err := toWireChunks(series.chunkDescs)
253297
if err != nil {
254298
return err
@@ -268,7 +312,7 @@ func (w *wrapper) checkpointSeries(cp *wal.WAL, userID string, fp model.Fingerpr
268312
}
269313

270314
// truncateSamples removes the WAL from before the checkpoint.
271-
func (w *wrapper) truncateSamples() error {
315+
func (w *walWrapper) truncateSamples() error {
272316
_, last, err := w.wal.Segments()
273317
if err != nil {
274318
return err

0 commit comments

Comments
 (0)