Skip to content

Commit

Permalink
Ruler: log rule evaluation detail (grafana#9206)
Browse files Browse the repository at this point in the history
**What this PR does / why we need it**:
This logs some extra detail around rule evaluation to enable more
effective troubleshooting when a rule is not behaving as expected.
  • Loading branch information
Danny Kopping authored Apr 20, 2023
1 parent 90a1d45 commit af896eb
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
* [8918](https://github.com/grafana/loki/pull/8918) **salvacorts** Introduce limit to require at least a number label matchers on metric and log queries.
* [8909](https://github.com/grafana/loki/pull/8909) **salvacorts** Requests to `/loki/api/v1/index/stats` are split in 24h intervals.
* [8732](https://github.com/grafana/loki/pull/8732) **abaguas**: azure: respect retry config before cancelling the context
* [9206](https://github.com/grafana/loki/pull/9206) **dannykopping**: Ruler: log rule evaluation detail.

##### Fixes
* [8971](https://github.com/grafana/loki/pull/8971) **dannykopping**: Stats: fix `Cache.Chunk.BytesSent` statistic and loki_chunk_fetcher_fetched_size_bytes metric with correct chunk size.
Expand Down
14 changes: 12 additions & 2 deletions pkg/ruler/compat.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"time"

"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
Expand All @@ -22,6 +23,7 @@ import (
"github.com/prometheus/prometheus/template"
"github.com/weaveworks/common/user"

"github.com/grafana/loki/pkg/logql"
"github.com/grafana/loki/pkg/logql/syntax"
ruler "github.com/grafana/loki/pkg/ruler/base"
"github.com/grafana/loki/pkg/ruler/rulespb"
Expand Down Expand Up @@ -55,8 +57,14 @@ type RulesLimits interface {

// queryFunc returns a new query function using the rules.EngineQueryFunc function
// and passing an altered timestamp.
func queryFunc(evaluator Evaluator, overrides RulesLimits, checker readyChecker, userID string) rules.QueryFunc {
func queryFunc(evaluator Evaluator, overrides RulesLimits, checker readyChecker, userID string, logger log.Logger) rules.QueryFunc {
return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
hash := logql.HashedQuery(qs)
detail := rules.FromOriginContext(ctx)
detailLog := log.With(logger, "rule_name", detail.Name, "rule_type", detail.Kind, "query", qs, "query_hash", hash)

level.Info(detailLog).Log("msg", "evaluating rule")

// check if storage instance is ready; if not, fail the rule evaluation;
// we do this to prevent an attempt to append new samples before the WAL appender is ready
if !checker.isReady(userID) {
Expand All @@ -67,6 +75,7 @@ func queryFunc(evaluator Evaluator, overrides RulesLimits, checker readyChecker,
res, err := evaluator.Eval(ctx, qs, adjusted)

if err != nil {
level.Error(detailLog).Log("msg", "rule evaluation failed", "err", err)
return nil, fmt.Errorf("rule evaluation failed: %w", err)
}
switch v := res.Data.(type) {
Expand All @@ -78,6 +87,7 @@ func queryFunc(evaluator Evaluator, overrides RulesLimits, checker readyChecker,
Metric: labels.Labels{},
}}, nil
default:
level.Error(detailLog).Log("msg", "rule result is not a vector or scalar", "err", err)
return nil, errors.New("rule result is not a vector or scalar")
}
}
Expand Down Expand Up @@ -134,7 +144,7 @@ func MultiTenantRuleManager(cfg Config, evaluator Evaluator, overrides RulesLimi
registry.configureTenantStorage(userID)

logger = log.With(logger, "user", userID)
queryFn := queryFunc(evaluator, overrides, registry, userID)
queryFn := queryFunc(evaluator, overrides, registry, userID, logger)
memStore := NewMemStore(userID, queryFn, newMemstoreMetrics(reg), 5*time.Minute, log.With(logger, "subcomponent", "MemStore"))

// GroupLoader builds a cache of the rules as they're loaded by the
Expand Down
7 changes: 4 additions & 3 deletions pkg/ruler/compat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,12 @@ func TestNonMetricQuery(t *testing.T) {
overrides, err := validation.NewOverrides(validation.Limits{}, nil)
require.Nil(t, err)

engine := logql.NewEngine(logql.EngineOpts{}, &FakeQuerier{}, overrides, log.Logger)
eval, err := NewLocalEvaluator(engine, log.Logger)
log := log.Logger
engine := logql.NewEngine(logql.EngineOpts{}, &FakeQuerier{}, overrides, log)
eval, err := NewLocalEvaluator(engine, log)
require.NoError(t, err)

queryFunc := queryFunc(eval, overrides, fakeChecker{}, "fake")
queryFunc := queryFunc(eval, overrides, fakeChecker{}, "fake", log)

_, err = queryFunc(context.TODO(), `{job="nginx"}`, time.Now())
require.Error(t, err, "rule result is not a vector or scalar")
Expand Down

0 comments on commit af896eb

Please sign in to comment.