Executing child queries on queriers #1730

Closed: wants to merge 97 commits

Commits
edc8851
ast mapping
owen-d Oct 8, 2019
cecd6ed
wrapper types for querier roundtripping and in memory series set
owen-d Oct 8, 2019
42c1640
query sharding middleware
owen-d Oct 9, 2019
f9bdf07
embeds all selectors for distribution to queriers
owen-d Oct 9, 2019
8ac7205
shardsummer in middleware uses vectorsquasher
owen-d Oct 9, 2019
6eb66c1
value conversion unit tests
owen-d Oct 9, 2019
49caa6a
downstreamSeries.Seek method & adds unit tests
owen-d Oct 9, 2019
43dbbc0
corrects embedded queryable label matching/dispatching & adds unit tests
owen-d Oct 10, 2019
0e12f39
Series{Iterator,Set} expects to call Next() before At() initially (as…
owen-d Oct 10, 2019
8b13494
middleware embeds engine + unit tests
owen-d Oct 10, 2019
e1d7d97
exports ConcreteSeries{,Set}
owen-d Oct 10, 2019
c942382
removes custom Series{,Set,Iterator} impls in favor of queriers concr…
owen-d Oct 10, 2019
ff1a385
concreteSeries moved to its own pkg
owen-d Oct 10, 2019
a509b57
wiring for multiple sharding configs
owen-d Oct 16, 2019
1ec82a0
NodeMapper ASTMapper adapter which requires less verbose impls
owen-d Oct 19, 2019
cd35d76
adds NodeMapper ifc to simplify ASTMapper impls
owen-d Oct 19, 2019
c27c58d
queries assume milliseconds
owen-d Oct 21, 2019
5e9c0d4
dont shard requests across multiple shard configs
owen-d Oct 21, 2019
a344f48
lazy queryable and chunkstore to their own packages
owen-d Oct 21, 2019
039eefc
querysharding uses a lazy querier
owen-d Oct 21, 2019
4f00f9e
linter fixes
owen-d Oct 21, 2019
bc50779
__name__ label via constant
owen-d Oct 21, 2019
8879c6a
corrects error source
owen-d Oct 21, 2019
a936b6b
linting
owen-d Oct 21, 2019
197d902
ast unit test cases, removes use of unkeyed fields
owen-d Oct 22, 2019
455cdef
shard aware storage filtering
owen-d Oct 22, 2019
bc29fd3
s/FilterIndexQueries/FilterReadQueries/ for consistency
owen-d Oct 22, 2019
675b08e
add promql test to verify splitting
cyriltovena Oct 23, 2019
d148470
more promql equivalence tests, removes avg as we do not parallelize it
owen-d Oct 23, 2019
235d2ff
corrects astmapping sums behavior
owen-d Oct 23, 2019
cc6403a
adds ShardAnnotation type for enclosing __cortex_shard__ logic
owen-d Oct 24, 2019
91f4485
injects __cortex_shard__ labels for chunks in series_store
owen-d Oct 24, 2019
e40606a
exports LabelsSeriesID from pkg/chunk
owen-d Oct 24, 2019
95a6c16
shard aware ingester queries
owen-d Oct 24, 2019
c085be8
allows all functions as parallel
owen-d Oct 24, 2019
0dce5c9
parallel functions improvements/tests
owen-d Oct 25, 2019
6219d4d
fix missing rebased files
cyriltovena Oct 25, 2019
9253c50
Implement the new queryrange.Handler
cyriltovena Oct 25, 2019
3cb3953
Add missing comment
cyriltovena Oct 25, 2019
f44bafe
Add the new sum shard middleware to the query range
cyriltovena Oct 25, 2019
0a9e32f
moves querysharding into queryrange package
owen-d Oct 27, 2019
ac21eb8
approximates rate test
owen-d Oct 27, 2019
98657c5
approximates more floating point calculation combinations
owen-d Oct 27, 2019
99a2d37
populates chunkstore test shard factor via validation
owen-d Oct 27, 2019
5a11bcb
Adds some trace tags and logs
cyriltovena Oct 30, 2019
1a186b7
logging mapped queries
owen-d Oct 30, 2019
40dfdb7
Adds a warning if no shard exists in config
cyriltovena Oct 30, 2019
242095d
Improve error formatting
cyriltovena Oct 30, 2019
09cf55d
Fixes empty matchers and add some logs
cyriltovena Oct 30, 2019
5587446
simplifies shard-aware series index searching
owen-d Oct 30, 2019
9118a79
Fixes inject shard label
cyriltovena Oct 30, 2019
b86b601
Add more logging in the ingester code
cyriltovena Oct 31, 2019
43097cc
fixes copy/pasted comment
owen-d Oct 31, 2019
d5f51c8
sorts memorySeries metrics for deterministic hashing in ingester
owen-d Oct 31, 2019
7aceecc
Don't change the inMemorySeries metric
cyriltovena Oct 31, 2019
d18dc62
Removes shard label in ingester.QueryStream
cyriltovena Oct 31, 2019
9c89ba2
clean up logs
cyriltovena Nov 1, 2019
6f3d3d4
shardsumming reduces shard result dimensions when possible
owen-d Nov 1, 2019
6319051
astmapper squashes children together to support concat instead of OR
owen-d Nov 4, 2019
856321f
Codec ifc for query sharding, concats instead of ORing when possible
owen-d Nov 4, 2019
1229a3c
logs filtered queries
owen-d Nov 5, 2019
b34dbbd
instruments chunk cache decoding
owen-d Nov 5, 2019
a666624
only sorts metrics for an existing series [ingester]
owen-d Nov 6, 2019
1d0a20a
further instruments cachingIndexClient and chunkstore
owen-d Nov 6, 2019
c48ab5b
addtl instrumentation
owen-d Nov 7, 2019
13fdc4a
fixes now-exported LabelsSeriesID call that was missed in rebase
owen-d Nov 11, 2019
146d505
querier.Chunkstore moved to new pkg, update refs
owen-d Nov 11, 2019
35d3606
user_state shard testing ensures shards are filtered correctly by che…
owen-d Nov 11, 2019
1b69011
changes embeddedSeparator to <|> in order to minimize collision chance
owen-d Nov 13, 2019
6beed96
removes defaultfilterReadQueries indirection
owen-d Nov 13, 2019
3ebf479
computes rowshards in one fn
owen-d Nov 13, 2019
f032cb1
fixes issue with computed rowshards not being injected
owen-d Nov 13, 2019
e9a8b80
middle shard filter test
owen-d Nov 14, 2019
7222772
isolates shard-aware logic in ingesters
owen-d Nov 14, 2019
7bda990
minor function doc improvements
owen-d Nov 14, 2019
eedf0f7
json codec instead of hex to support arbitrary embedded queries, impr…
owen-d Nov 15, 2019
6a4e151
shardannotation methods where possible
owen-d Nov 15, 2019
718051f
exports OrSquasher
owen-d Nov 15, 2019
db6b721
reuses astmapper.ShardFromMatchers
owen-d Nov 15, 2019
03e8070
benchmarks OR nodes vs Concat
owen-d Nov 15, 2019
0fda17b
linting fixes
owen-d Nov 15, 2019
b7e94a2
adds querier.sum-shards to changelog
owen-d Nov 15, 2019
b099b6a
does not attempt to map query AST when conf.RowShards < 2
owen-d Nov 20, 2019
a92927c
benchmarks sharding queries
owen-d Nov 20, 2019
2c83b66
benchmark both linear time queryable and constant time queryables
owen-d Nov 26, 2019
1adf00a
removes OR node combining benchmark
owen-d Nov 27, 2019
0a1cb1c
moves MockShardedQueryable test helper to queryrange pkg
owen-d Nov 27, 2019
a3d812b
adds astmapping test for parallelizing legs of histogram_quantile fn
owen-d Nov 27, 2019
aa08334
cache friendly query sharding
owen-d Nov 27, 2019
c257909
moves querysharding to unreleased/master in changelog
owen-d Dec 2, 2019
a111d1e
subtreeMapper for better query delegation
owen-d Dec 2, 2019
fb40fe3
middleware correctness tests
owen-d Dec 3, 2019
952a04f
gofmt linting
owen-d Dec 3, 2019
df3c38a
subtreeFolder should not fold leaf scalars/string nodes
owen-d Dec 4, 2019
4ea09a4
introduces query-audit, a query correctness tool
owen-d Dec 4, 2019
a4a28e3
increase logging verbosity for query-audit reports
owen-d Dec 4, 2019
b3efc15
query sharding in arguments.md
owen-d Dec 4, 2019
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,16 @@

## master / unreleased

* [FEATURE] Fan out parallelizable queries to backend queriers concurrently.
* `-querier.sum-shards` (bool)
* Requires a shard-compatible schema (v10+)
* This causes the number of traces to increase accordingly.
* The query-frontend now requires a schema config to determine how and when to shard queries, either from a file or from flags (i.e. via the `config-yaml` CLI flag). This is the same schema config the queriers consume.
* It's advised to increase downstream concurrency controls as well:
* `querier.max-outstanding-requests-per-tenant`
* `querier.max-query-parallelism`
* `querier.max-concurrent`
* `server.grpc-max-concurrent-streams` (for both query-frontends and queriers)
* [BUGFIX] Fixed unnecessary CAS operations done by the HA tracker when the jitter is enabled. #1861

## 0.4.0 / 2019-12-02
22 changes: 22 additions & 0 deletions docs/arguments.md
@@ -63,6 +63,28 @@ The ingester query API was improved over time, but defaults to the old behaviour

## Query Frontend

- `-querier.sum-shards`

If set to true, will cause the query frontend to mutate incoming queries when possible by turning `sum` operations into sharded `sum` operations. This requires a shard-compatible schema (v10+). An abridged example:
`sum by (foo) (rate(bar{baz="blip"}[1m]))` ->
```
sum by (foo) (
  sum by (foo) (rate(bar{baz="blip",__cortex_shard__="0of16"}[1m])) or
  sum by (foo) (rate(bar{baz="blip",__cortex_shard__="1of16"}[1m])) or
  ...
  sum by (foo) (rate(bar{baz="blip",__cortex_shard__="15of16"}[1m]))
)
```
When enabled, the query-frontend requires a schema config to determine how and when to shard queries, either from a file or from flags (i.e. via the `config-yaml` CLI flag). This is the same schema config the queriers consume.
It's also advised to increase downstream concurrency controls to account for the larger number of smaller queries:

- `querier.max-outstanding-requests-per-tenant`
- `querier.max-query-parallelism`
- `querier.max-concurrent`
- `server.grpc-max-concurrent-streams`

Instrumentation (traces) also scales with the number of sharded queries, so it's suggested to account for the increased throughput there as well.
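
The rewrite above is safe because `sum` is associative: every shard computes a partial sum over its own series and the outer `sum` combines the partials. A minimal, self-contained sketch of that idea (hypothetical values, plain Go, not Cortex code):

```go
package main

import "fmt"

// Why `sum` can be sharded: summing per-shard partial sums gives the same
// result as summing every series in one place.
func main() {
	// Hypothetical per-series rates, split across 4 shards by series hash.
	shards := [][]float64{
		{1.5, 2.0},      // __cortex_shard__="0of4"
		{0.25},          // __cortex_shard__="1of4"
		{3.0, 1.0, 0.5}, // __cortex_shard__="2of4"
		{},              // __cortex_shard__="3of4"
	}

	var total float64
	for _, shard := range shards {
		var partial float64 // the inner, shard-scoped `sum by (foo)`
		for _, v := range shard {
			partial += v
		}
		total += partial // the outer `sum by (foo)` over the sharded legs
	}
	fmt.Println(total) // 8.25, identical to a single unsharded sum
}
```

Non-associative aggregations such as `avg` are not rewritten this way, which is why the commit history above removes `avg` from the parallelized set.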

- `-querier.align-querier-with-step`

If set to true, will cause the query frontend to mutate incoming queries and align their start and end parameters to the step parameter of the query. This improves the cacheability of the query results.
1 change: 1 addition & 0 deletions go.mod
@@ -79,6 +79,7 @@ require (
google.golang.org/api v0.11.0
google.golang.org/grpc v1.25.1
gopkg.in/yaml.v2 v2.2.2
sigs.k8s.io/yaml v1.1.0
)

// Override since git.apache.org is down. The docs say to fetch from github.
6 changes: 6 additions & 0 deletions pkg/chunk/chunk_store.go
@@ -436,6 +436,9 @@ func (c *store) lookupChunksByMetricName(ctx context.Context, userID string, fro
}

func (c *store) lookupEntriesByQueries(ctx context.Context, queries []IndexQuery) ([]IndexEntry, error) {
log, ctx := spanlogger.New(ctx, "store.lookupEntriesByQueries")
defer log.Span.Finish()

var lock sync.Mutex
var entries []IndexEntry
err := c.index.QueryPages(ctx, queries, func(query IndexQuery, resp ReadBatch) bool {
@@ -459,6 +462,9 @@ func (c *store) lookupEntriesByQueries(ctx context.Context, queries []IndexQuery
}

func (c *store) parseIndexEntries(ctx context.Context, entries []IndexEntry, matcher *labels.Matcher) ([]string, error) {
log, ctx := spanlogger.New(ctx, "store.parseIndexEntries")
defer log.Span.Finish()

result := make([]string, 0, len(entries))
for _, entry := range entries {
chunkKey, labelValue, _, err := parseChunkTimeRangeValue(entry.RangeValue, entry.Value)
2 changes: 2 additions & 0 deletions pkg/chunk/chunk_store_test.go
@@ -77,6 +77,8 @@ func newTestChunkStoreConfig(t require.TestingT, schemaName string, storeCfg Sto
tbmConfig TableManagerConfig
schemaCfg = DefaultSchemaConfig("", schemaName, 0)
)
err := schemaCfg.Validate()
require.NoError(t, err)
flagext.DefaultValues(&tbmConfig)
storage := NewMockStorage()
tableManager, err := NewTableManager(tbmConfig, schemaCfg, maxChunkAge, storage, nil)
17 changes: 14 additions & 3 deletions pkg/chunk/chunk_store_utils.go
@@ -10,6 +10,7 @@ import (
"github.com/prometheus/prometheus/promql"

"github.com/cortexproject/cortex/pkg/chunk/cache"
"github.com/cortexproject/cortex/pkg/querier/astmapper"
"github.com/cortexproject/cortex/pkg/util"
"github.com/cortexproject/cortex/pkg/util/spanlogger"
)
@@ -146,13 +147,13 @@ func (c *Fetcher) worker() {
// FetchChunks fetches a set of chunks from cache and store. Note that the keys passed in must be
// lexicographically sorted, while the returned chunks are not in the same order as the passed in chunks.
func (c *Fetcher) FetchChunks(ctx context.Context, chunks []Chunk, keys []string) ([]Chunk, error) {
log, ctx := spanlogger.New(ctx, "ChunkStore.fetchChunks")
log, ctx := spanlogger.New(ctx, "ChunkStore.FetchChunks")
defer log.Span.Finish()

// Now fetch the actual chunk data from Memcache / S3
cacheHits, cacheBufs, _ := c.cache.Fetch(ctx, keys)

fromCache, missing, err := c.processCacheResponse(chunks, cacheHits, cacheBufs)
fromCache, missing, err := c.processCacheResponse(ctx, chunks, cacheHits, cacheBufs)
if err != nil {
level.Warn(log).Log("msg", "error fetching from cache", "err", err)
}
@@ -199,12 +200,14 @@ func (c *Fetcher) writeBackCache(ctx context.Context, chunks []Chunk) error {

// ProcessCacheResponse decodes the chunks coming back from the cache, separating
// hits and misses.
func (c *Fetcher) processCacheResponse(chunks []Chunk, keys []string, bufs [][]byte) ([]Chunk, []Chunk, error) {
func (c *Fetcher) processCacheResponse(ctx context.Context, chunks []Chunk, keys []string, bufs [][]byte) ([]Chunk, []Chunk, error) {
var (
requests = make([]decodeRequest, 0, len(keys))
responses = make(chan decodeResponse)
missing []Chunk
)
log, ctx := spanlogger.New(ctx, "Fetcher.processCacheResponse")
defer log.Span.Finish()

i, j := 0, 0
for i < len(chunks) && j < len(keys) {
@@ -229,6 +232,7 @@ func (c *Fetcher) processCacheResponse(chunks []Chunk, keys []string, bufs [][]b
for ; i < len(chunks); i++ {
missing = append(missing, chunks[i])
}
level.Debug(log).Log("chunks", len(chunks), "decodeRequests", len(requests), "missing", len(missing))

go func() {
for _, request := range requests {
@@ -252,3 +256,10 @@ func (c *Fetcher) processCacheResponse(chunks []Chunk, keys []string, bufs [][]b
}
return found, missing, err
}

// injectShardLabels appends the shard's synthetic __cortex_shard__ label to
// each chunk's metric so sharded queries can match on it downstream.
func injectShardLabels(chunks []Chunk, shard astmapper.ShardAnnotation) {
for i, chunk := range chunks {
chunk.Metric = append(chunk.Metric, shard.Label())
chunks[i] = chunk
}
}
63 changes: 53 additions & 10 deletions pkg/chunk/schema.go
@@ -7,7 +7,11 @@ import (
"fmt"
"strings"

"strconv"

jsoniter "github.com/json-iterator/go"

"github.com/cortexproject/cortex/pkg/querier/astmapper"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/pkg/labels"
)
@@ -46,6 +50,7 @@ type Schema interface {
GetReadQueriesForMetric(from, through model.Time, userID string, metricName string) ([]IndexQuery, error)
GetReadQueriesForMetricLabel(from, through model.Time, userID string, metricName string, labelName string) ([]IndexQuery, error)
GetReadQueriesForMetricLabelValue(from, through model.Time, userID string, metricName string, labelName string, labelValue string) ([]IndexQuery, error)
FilterReadQueries(queries []IndexQuery, shard *astmapper.ShardAnnotation) []IndexQuery

// If the query resulted in series IDs, use this method to find chunks.
GetChunksForSeries(from, through model.Time, userID string, seriesID []byte) ([]IndexQuery, error)
@@ -114,7 +119,7 @@ func (s schema) GetCacheKeysAndLabelWriteEntries(from, through model.Time, userI
key := strings.Join([]string{
bucket.tableName,
bucket.hashKey,
string(labelsSeriesID(labels)),
string(LabelsSeriesID(labels)),
},
"-",
)
@@ -216,6 +221,10 @@ func (s schema) GetLabelNamesForSeries(from, through model.Time, userID string,
return result, nil
}

func (s schema) FilterReadQueries(queries []IndexQuery, shard *astmapper.ShardAnnotation) []IndexQuery {
return s.entries.FilterReadQueries(queries, shard)
}

type entries interface {
GetWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error)
GetLabelWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error)
@@ -226,13 +235,23 @@ type entries interface {
GetReadMetricLabelValueQueries(bucket Bucket, metricName string, labelName string, labelValue string) ([]IndexQuery, error)
GetChunksForSeries(bucket Bucket, seriesID []byte) ([]IndexQuery, error)
GetLabelNamesForSeries(bucket Bucket, seriesID []byte) ([]IndexQuery, error)
FilterReadQueries(queries []IndexQuery, shard *astmapper.ShardAnnotation) []IndexQuery
}

// noops is a placeholder which can be embedded to provide default implementations
type noops struct{}

func (n noops) FilterReadQueries(queries []IndexQuery, shard *astmapper.ShardAnnotation) []IndexQuery {
return queries
}

// original entries:
// - hash key: <userid>:<bucket>:<metric name>
// - range key: <label name>\0<label value>\0<chunk name>

type originalEntries struct{}
type originalEntries struct {
noops
}

func (originalEntries) GetWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error) {
chunkIDBytes := []byte(chunkID)
@@ -349,7 +368,9 @@ func (base64Entries) GetReadMetricLabelValueQueries(bucket Bucket, metricName st
// - range key: \0<base64(label value)>\0<chunk name>\0<version 2>
// 2) - hash key: <userid>:<hour bucket>:<metric name>
// - range key: \0\0<chunk name>\0<version 3>
type labelNameInHashKeyEntries struct{}
type labelNameInHashKeyEntries struct {
noops
}

func (labelNameInHashKeyEntries) GetWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error) {
chunkIDBytes := []byte(chunkID)
@@ -423,7 +444,9 @@ func (labelNameInHashKeyEntries) GetLabelNamesForSeries(_ Bucket, _ []byte) ([]I
// v5 schema is an extension of v4, with the chunk end time in the
// range key to improve query latency. However, it did it wrong
// so the chunk end times are ignored.
type v5Entries struct{}
type v5Entries struct {
noops
}

func (v5Entries) GetWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error) {
chunkIDBytes := []byte(chunkID)
@@ -496,7 +519,9 @@ func (v5Entries) GetLabelNamesForSeries(_ Bucket, _ []byte) ([]IndexQuery, error

// v6Entries fixes issues with v5 time encoding being wrong (see #337), and
// moves label value out of range key (see #199).
type v6Entries struct{}
type v6Entries struct {
noops
}

func (v6Entries) GetWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error) {
chunkIDBytes := []byte(chunkID)
@@ -576,14 +601,15 @@ func (v6Entries) GetLabelNamesForSeries(_ Bucket, _ []byte) ([]IndexQuery, error

// v9Entries adds a layer of indirection between labels -> series -> chunks.
type v9Entries struct {
noops
}

func (v9Entries) GetWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error) {
return nil, ErrNotSupported
}

func (v9Entries) GetLabelWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error) {
seriesID := labelsSeriesID(labels)
seriesID := LabelsSeriesID(labels)

entries := []IndexEntry{
// Entry for metricName -> seriesID
Expand Down Expand Up @@ -613,7 +639,7 @@ func (v9Entries) GetLabelWriteEntries(bucket Bucket, metricName string, labels l
}

func (v9Entries) GetChunkWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error) {
seriesID := labelsSeriesID(labels)
seriesID := LabelsSeriesID(labels)
encodedThroughBytes := encodeTime(bucket.through)

entries := []IndexEntry{
Expand Down Expand Up @@ -683,7 +709,7 @@ func (v10Entries) GetWriteEntries(bucket Bucket, metricName string, labels label
}

func (s v10Entries) GetLabelWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error) {
seriesID := labelsSeriesID(labels)
seriesID := LabelsSeriesID(labels)

// read first 32 bits of the hash and use this to calculate the shard
shard := binary.BigEndian.Uint32(seriesID) % s.rowShards
@@ -716,7 +742,7 @@ func (s v10Entries) GetLabelWriteEntries(bucket Bucket, metricName string, label
}

func (v10Entries) GetChunkWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error) {
seriesID := labelsSeriesID(labels)
seriesID := LabelsSeriesID(labels)
encodedThroughBytes := encodeTime(bucket.through)

entries := []IndexEntry{
Expand Down Expand Up @@ -782,13 +808,29 @@ func (v10Entries) GetLabelNamesForSeries(_ Bucket, _ []byte) ([]IndexQuery, erro
return nil, ErrNotSupported
}

// FilterReadQueries will return only queries that match a certain shard
func (v10Entries) FilterReadQueries(queries []IndexQuery, shard *astmapper.ShardAnnotation) (matches []IndexQuery) {
if shard == nil {
return queries
}

for _, query := range queries {
s := strings.Split(query.HashValue, ":")[0]
n, err := strconv.Atoi(s)
if err == nil && n == shard.Shard {
matches = append(matches, query)
}
}
return matches
}
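
For context, a small self-contained sketch of the filter above (illustrative only; it assumes v10-style hash keys whose first `:`-separated component is the row shard, which is what the `strings.Split` relies on, and the example keys are hypothetical):

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// filterByShard keeps only the hash keys whose leading component matches the
// requested shard, mirroring what v10Entries.FilterReadQueries does for
// IndexQuery.HashValue.
func filterByShard(hashValues []string, shard int) []string {
	var matches []string
	for _, hv := range hashValues {
		prefix := strings.Split(hv, ":")[0]
		if n, err := strconv.Atoi(prefix); err == nil && n == shard {
			matches = append(matches, hv)
		}
	}
	return matches
}

func main() {
	keys := []string{
		"0:userA:d18500:http_requests_total",
		"7:userA:d18500:http_requests_total",
		"15:userA:d18500:http_requests_total",
	}
	// Only the index query for shard 7 survives; the other shards' rows are
	// never read, which is what makes shard-aware querying cheaper.
	fmt.Println(filterByShard(keys, 7))
}
```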

// v11Entries builds on v10 but adds index entries for each series to store respective labels.
type v11Entries struct {
v10Entries
}

func (s v11Entries) GetLabelWriteEntries(bucket Bucket, metricName string, labels labels.Labels, chunkID string) ([]IndexEntry, error) {
seriesID := labelsSeriesID(labels)
seriesID := LabelsSeriesID(labels)

// read first 32 bits of the hash and use this to calculate the shard
shard := binary.BigEndian.Uint32(seriesID) % s.rowShards
@@ -845,4 +887,5 @@ func (v11Entries) GetLabelNamesForSeries(bucket Bucket, seriesID []byte) ([]Inde
HashValue: string(seriesID),
},
}, nil

}
20 changes: 17 additions & 3 deletions pkg/chunk/schema_config.go
@@ -192,15 +192,29 @@ func (cfg *SchemaConfig) loadFromFile() error {
// Validate the schema config and returns an error if the validation
// doesn't pass
func (cfg *SchemaConfig) Validate() error {
for _, periodCfg := range cfg.Configs {
for i, periodCfg := range cfg.Configs {
if err := periodCfg.validate(); err != nil {
return err
}
}

// apply default row shards
if periodCfg.RowShards == 0 {
periodCfg.RowShards = defaultRowShards(periodCfg.Schema)
cfg.Configs[i] = periodCfg
}
}
return nil
}

func defaultRowShards(schema string) uint32 {
switch schema {
case "v1", "v2", "v3", "v4", "v5", "v6", "v9":
return 0
default:
return 16
}
}
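
As a quick illustration of the defaults above (a standalone sketch that simply mirrors `defaultRowShards`, no Cortex imports): pre-v10 schemas keep a zero shard factor, so `Validate` leaves them unsharded, while v10 and later default to 16 row shards unless `RowShards` is set explicitly.

```go
package main

import "fmt"

// Mirrors defaultRowShards above: schemas without row sharding default to 0,
// shard-compatible schemas (v10+) default to 16.
func defaultRowShards(schema string) uint32 {
	switch schema {
	case "v1", "v2", "v3", "v4", "v5", "v6", "v9":
		return 0
	default:
		return 16
	}
}

func main() {
	for _, schema := range []string{"v9", "v10", "v11"} {
		fmt.Printf("%s: %d row shards\n", schema, defaultRowShards(schema))
	}
}
```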

// ForEachAfter will call f() on every entry after t, splitting
// entries if necessary so there is an entry starting at t
func (cfg *SchemaConfig) ForEachAfter(t model.Time, f func(config *PeriodConfig)) {
@@ -219,7 +233,7 @@ func (cfg *SchemaConfig) ForEachAfter(t model.Time, f func(config *PeriodConfig)

// CreateSchema returns the schema defined by the PeriodConfig
func (cfg PeriodConfig) CreateSchema() Schema {
rowShards := uint32(16)
rowShards := defaultRowShards(cfg.Schema)
if cfg.RowShards > 0 {
rowShards = cfg.RowShards
}