Add support to log slow queries in Frontend (#1744)

achilles42 · gouthamve · commit e421794bb286 · 2019-11-07T16:05:55.000+01:00
* Add support for logging slow queries

Signed-off-by: Praveen Shukla <praveen.shukla.c42@gmail.com>

* Update orgId, URL and time-taken field in query logging

Signed-off-by: Praveen Shukla <praveen.shukla.c42@gmail.com>

* Set the default value to 0 so that slow query logging is disabled by default

Signed-off-by: Praveen Shukla <praveen.shukla.c42@gmail.com>

* Add slow log feature in changelog

Signed-off-by: Praveen Shukla <praveen.shukla.c42@gmail.com>

* Update slow query CLI flag to use frontend scope instead of querier

Signed-off-by: Praveen Shukla <praveen.shukla.c42@gmail.com>

* Update org_id in slow query logs and add more info in flag description

Signed-off-by: Praveen Shukla <praveen.shukla.c42@gmail.com>

* Add info log for slow query

Signed-off-by: Praveen Shukla <praveen.shukla.c42@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@
   * `-ingester.max-global-series-per-metric`
 * [FEATURE] Flush chunks with stale markers early with `ingester.max-stale-chunk-idle`. #1759
 * [FEATURE] EXPERIMENTAL: Added new KV Store backend based on memberlist library. Components can gossip about tokens and ingester states, instead of using Consul or Etcd. #1721
+* [FEATURE] Allow Query Frontend to log slow queries with `frontend.log-queries-longer-than`. #1744
 * [ENHANCEMENT] Allocation improvements in adding samples to Chunk. #1706
 * [ENHANCEMENT] Consul client now follows recommended practices for blocking queries wrt returned Index value. #1708
 * [ENHANCEMENT] Consul client can optionally rate-limit itself during Watch (used e.g. by ring watchers) and WatchPrefix (used by HA feature) operations. Rate limiting is disabled by default. New flags added: `--consul.watch-rate-limit`, and `--consul.watch-burst-size`. #1708
diff --git a/docs/prometheus-frontend.yml b/docs/prometheus-frontend.yml
@@ -16,6 +16,7 @@ server:
   http_listen_port: 9091
 
 frontend:
+  log_queries_longer_than: 1s
   split_queries_by_day: true
   align_queries_with_step: true
   cache_results: true
diff --git a/pkg/querier/frontend/frontend.go b/pkg/querier/frontend/frontend.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"context"
 	"flag"
+	"fmt"
 	"io"
 	"io/ioutil"
 	"math/rand"
@@ -15,6 +16,7 @@ import (
 
 	"github.com/NYTimes/gziphandler"
 	"github.com/go-kit/kit/log"
+	"github.com/go-kit/kit/log/level"
 	opentracing "github.com/opentracing/opentracing-go"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
@@ -42,16 +44,18 @@ var (
 
 // Config for a Frontend.
 type Config struct {
-	MaxOutstandingPerTenant int    `yaml:"max_outstanding_per_tenant"`
-	CompressResponses       bool   `yaml:"compress_responses"`
-	DownstreamURL           string `yaml:"downstream"`
+	MaxOutstandingPerTenant int           `yaml:"max_outstanding_per_tenant"`
+	CompressResponses       bool          `yaml:"compress_responses"`
+	DownstreamURL           string        `yaml:"downstream"`
+	LogQueriesLongerThan    time.Duration `yaml:"log_queries_longer_than"`
 }
 
 // RegisterFlags adds the flags required to config this to the given FlagSet.
 func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
 	f.IntVar(&cfg.MaxOutstandingPerTenant, "querier.max-outstanding-requests-per-tenant", 100, "Maximum number of outstanding requests per tenant per frontend; requests beyond this error with HTTP 429.")
 	f.BoolVar(&cfg.CompressResponses, "querier.compress-http-responses", false, "Compress HTTP responses.")
 	f.StringVar(&cfg.DownstreamURL, "frontend.downstream-url", "", "URL of downstream Prometheus.")
+	f.DurationVar(&cfg.LogQueriesLongerThan, "frontend.log-queries-longer-than", 0, "Log queries that are slower than the specified duration. 0 to disable.")
 }
 
 // Frontend queues HTTP requests, dispatches them to backends, and handles retries
@@ -139,7 +143,20 @@ func (f *Frontend) Handler() http.Handler {
 }
 
 func (f *Frontend) handle(w http.ResponseWriter, r *http.Request) {
+	userID, err := user.ExtractOrgID(r.Context())
+	if err != nil {
+		server.WriteError(w, err)
+		return
+	}
+
+	startTime := time.Now()
 	resp, err := f.roundTripper.RoundTrip(r)
+	queryResponseTime := time.Now().Sub(startTime)
+
+	if f.cfg.LogQueriesLongerThan > 0 && queryResponseTime > f.cfg.LogQueriesLongerThan {
+		level.Info(f.log).Log("msg", "slow query", "org_id", userID, "url", fmt.Sprintf("http://%s", r.Host+r.RequestURI), "time_taken", queryResponseTime.String())
+	}
+
 	if err != nil {
 		server.WriteError(w, err)
 		return
@@ -187,7 +204,7 @@ func (c *httpgrpcHeadersCarrier) Set(key, val string) {
 	})
 }
 
-// RoundTripGRPC round trips a proto (instread of a HTTP request).
+// RoundTripGRPC round trips a proto (instead of a HTTP request).
 func (f *Frontend) RoundTripGRPC(ctx context.Context, req *ProcessRequest) (*ProcessResponse, error) {
 	// Propagate trace context in gRPC too - this will be ignored if using HTTP.
 	tracer, span := opentracing.GlobalTracer(), opentracing.SpanFromContext(ctx)