
Commit 58ef607

Add support to horizontally scale blocks compactor (cortexproject#2113)
* Added sharding support to compactor

Signed-off-by: Marco Pracucci <marco@pracucci.com>
1 parent 5292538 commit 58ef607

12 files changed, +648 -44 lines changed


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@
 * [FEATURE] Added user sub rings to distribute users to a subset of ingesters. #1947
   * `--experimental.distributor.user-subring-size`
 * [FEATURE] Added flag `-experimental.ruler.enable-api` to enable the ruler api which implements the Prometheus API `/api/v1/rules` and `/api/v1/alerts` endpoints under the configured `-http.prefix`. #1999
+* [FEATURE] Added sharding support to compactor when using the experimental TSDB blocks storage. #2113
 * [ENHANCEMENT] Experimental TSDB: Export TSDB Syncer metrics from Compactor component, they are prefixed with `cortex_compactor_`. #2023
 * [ENHANCEMENT] Experimental TSDB: Added dedicated flag `-experimental.tsdb.bucket-store.tenant-sync-concurrency` to configure the maximum number of concurrent tenants for which blocks are synched. #2026
 * [ENHANCEMENT] Experimental TSDB: Expose metrics for objstore operations (prefixed with `cortex_<component>_thanos_objstore_`, component being one of `ingester`, `querier` and `compactor`). #2027

development/tsdb-blocks-storage-s3/config/cortex.yaml

Lines changed: 11 additions & 0 deletions
@@ -55,3 +55,14 @@ ruler:
 
 storage:
   engine: tsdb
+
+compactor:
+  compaction_interval: 30s
+  data_dir: /tmp/cortex-compactor
+  consistency_delay: 1m
+  sharding_enabled: true
+  sharding_ring:
+    kvstore:
+      store: consul
+      consul:
+        host: consul:8500
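
For reference, the same compactor settings can also be supplied as CLI flags instead of YAML. Below is a minimal sketch, assuming the `compactor` Go package from this commit is importable; it only uses flag names that appear in this diff (`-compactor.sharding-enabled`, `-compactor.ring.store`, `-compactor.compaction-interval`, `-compactor.data-dir`) and omits the Consul host, whose flag name is not shown here.

```go
package main

import (
    "flag"
    "fmt"

    "github.com/cortexproject/cortex/pkg/compactor"
)

func main() {
    var cfg compactor.Config

    // RegisterFlags registers the compactor flags and, via ShardingRing,
    // the -compactor.ring.* flags added by this commit.
    fs := flag.NewFlagSet("compactor", flag.ExitOnError)
    cfg.RegisterFlags(fs)

    // Rough CLI equivalent of the YAML block above (Consul host omitted,
    // since its flag name is not shown in this diff).
    _ = fs.Parse([]string{
        "-compactor.compaction-interval=30s",
        "-compactor.data-dir=/tmp/cortex-compactor",
        "-compactor.sharding-enabled=true",
        "-compactor.ring.store=consul",
    })

    fmt.Println("sharding enabled:", cfg.ShardingEnabled)
}
```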

development/tsdb-blocks-storage-s3/docker-compose.yml

Lines changed: 14 additions & 0 deletions
@@ -94,3 +94,17 @@ services:
       - 8005:8005
     volumes:
       - ./config:/cortex/config
+
+  compactor:
+    build:
+      context: .
+      dockerfile: dev.dockerfile
+    image: cortex
+    command: ["sh", "-c", "sleep 3 && exec ./cortex -config.file=./config/cortex.yaml -target=compactor -server.http-listen-port=8006 -server.grpc-listen-port=9006"]
+    depends_on:
+      - consul
+      - minio
+    ports:
+      - 8006:8006
+    volumes:
+      - ./config:/cortex/config

docs/operations/blocks-storage.md

Lines changed: 69 additions & 5 deletions
@@ -41,7 +41,7 @@ The blocks chunks and the entire index is never fully downloaded by the queriers
 
 The index header is also stored to the local disk, in order to avoid to re-download it on subsequent restarts of a querier. For this reason, it's recommended - but not required - to run the querier with a persistent local disk. For example, if you're running the Cortex cluster in Kubernetes, you may use a StatefulSet with a persistent volume claim for the queriers.
 
-### Sharding and Replication
+### Series sharding and replication
 
 The series sharding and replication doesn't change based on the storage engine, so the general overview provided by the "[Cortex architecture](../architecture.md)" documentation applies to the blocks storage as well.
 
@@ -60,6 +60,17 @@ The **horizontal compaction** triggers after the vertical compaction and compact
 
 The compactor is **stateless**.
 
+#### Compactor sharding
+
+The compactor optionally supports sharding. When sharding is enabled, multiple compactor instances can coordinate to split the workload and shard blocks by tenant. All the blocks of a tenant are processed by a single compactor instance at any given time, but compaction for different tenants may run simultaneously on different compactor instances.
+
+Whenever the pool of compactors increases or decreases (i.e. following a scale up/down), tenants are resharded across the available compactor instances without any manual intervention. Compactors coordinate via the Cortex [hash ring](../architecture.md#the-hash-ring).
+
+#### Compactor HTTP endpoints
+
+- `GET /compactor_ring`<br />
+  Displays the status of the compactors ring, including the tokens owned by each compactor and an option to remove (forget) instances from the ring.
+
 ## Configuration
 
 The general [configuration documentation](../configuration/_index.md) also applied to a Cortex cluster running the blocks storage, with few differences:
@@ -251,14 +262,67 @@ compactor:
   # interval.
   # CLI flag: -compactor.compaction-retries
   [compaction_retries: <int> | default = 3]
+
+  # Shard tenants across multiple compactor instances. Sharding is required if
+  # you run multiple compactor instances, in order to coordinate compactions
+  # and avoid race conditions leading to the same tenant blocks simultaneously
+  # compacted by different instances.
+  # CLI flag: -compactor.sharding-enabled
+  [sharding_enabled: <bool> | default = false]
+
+  # Configures the ring used when sharding is enabled.
+  sharding_ring:
+    kvstore:
+      # Backend storage to use for the ring. Supported values are: consul, etcd,
+      # inmemory, multi, memberlist (experimental).
+      # CLI flag: -compactor.ring.store
+      [store: <string> | default = "consul"]
+
+      # The prefix for the keys in the store. Should end with a /.
+      # CLI flag: -compactor.ring.prefix
+      [prefix: <string> | default = "collectors/"]
+
+      # The consul_config configures the consul client.
+      # The CLI flags prefix for this block config is: compactor.ring
+      [consul: <consul_config>]
+
+      # The etcd_config configures the etcd client.
+      # The CLI flags prefix for this block config is: compactor.ring
+      [etcd: <etcd_config>]
+
+      # The memberlist_config configures the Gossip memberlist.
+      # The CLI flags prefix for this block config is: compactor.ring
+      [memberlist: <memberlist_config>]
+
+      multi:
+        # Primary backend storage used by multi-client.
+        # CLI flag: -compactor.ring.multi.primary
+        [primary: <string> | default = ""]
+
+        # Secondary backend storage used by multi-client.
+        # CLI flag: -compactor.ring.multi.secondary
+        [secondary: <string> | default = ""]
+
+        # Mirror writes to secondary store.
+        # CLI flag: -compactor.ring.multi.mirror-enabled
+        [mirror_enabled: <boolean> | default = false]
+
+        # Timeout for storing value to secondary store.
+        # CLI flag: -compactor.ring.multi.mirror-timeout
+        [mirror_timeout: <duration> | default = 2s]
+
+    # Period at which to heartbeat to the ring.
+    # CLI flag: -compactor.ring.heartbeat-period
+    [heartbeat_period: <duration> | default = 5m]
+
+    # The heartbeat timeout after which compactors are considered unhealthy
+    # within the ring.
+    # CLI flag: -compactor.ring.heartbeat-timeout
+    [heartbeat_timeout: <duration> | default = 1m]
 ```
 
 ## Known issues
 
-### Horizontal scalability
-
-The compactor currently doesn't support horizontal scalability and only 1 replica of the compactor should run at any given time within a Cortex cluster.
-
 ### Migrating from the chunks to the blocks storage
 
 Currently, no smooth migration path is provided to migrate from chunks to blocks storage. For this reason, the blocks storage can only be enabled in new Cortex clusters.
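
The compactor sharding described in the docs above boils down to consistent hashing of tenant IDs onto a ring of compactor instances. The following is a deliberately simplified, self-contained sketch of that idea: it uses the same FNV-32a hash as the compactor's `ownUser`, but the one-token-per-instance ring and the `ownerOf` lookup are illustrative stand-ins, not the actual Cortex ring implementation (which handles multiple tokens per instance, instance states and heartbeats).

```go
package main

import (
    "fmt"
    "hash/fnv"
    "sort"
)

// instanceToken is a simplified stand-in for a ring entry: one token per
// compactor instance (the real ring registers many tokens per instance).
type instanceToken struct {
    addr  string
    token uint32
}

// ownerOf returns the instance owning the first token at or after the hash,
// wrapping around the ring. It mimics consistent hashing only; it does not
// model instance states, heartbeats or the Cortex ring API.
func ownerOf(ring []instanceToken, hash uint32) string {
    i := sort.Search(len(ring), func(i int) bool { return ring[i].token >= hash })
    if i == len(ring) {
        i = 0
    }
    return ring[i].addr
}

// hashTenant hashes a tenant ID with FNV-32a, as the compactor does in ownUser.
func hashTenant(userID string) uint32 {
    h := fnv.New32a()
    _, _ = h.Write([]byte(userID))
    return h.Sum32()
}

func main() {
    ring := []instanceToken{
        {addr: "compactor-1:9006", token: 1 << 30},
        {addr: "compactor-2:9006", token: 3 << 30},
    }
    sort.Slice(ring, func(i, j int) bool { return ring[i].token < ring[j].token })

    // Each tenant maps to exactly one instance; adding or removing an
    // instance only moves the tenants whose hashes fall in the affected range.
    for _, tenant := range []string{"tenant-a", "tenant-b", "tenant-c"} {
        fmt.Printf("%s -> %s\n", tenant, ownerOf(ring, hashTenant(tenant)))
    }
}
```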

pkg/compactor/compactor.go

Lines changed: 116 additions & 5 deletions
@@ -4,6 +4,7 @@ import (
     "context"
     "flag"
     "fmt"
+    "hash/fnv"
     "path"
     "strings"
     "sync"
@@ -19,6 +20,7 @@ import (
     "github.com/thanos-io/thanos/pkg/compact/downsample"
     "github.com/thanos-io/thanos/pkg/objstore"
 
+    "github.com/cortexproject/cortex/pkg/ring"
     cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb"
     "github.com/cortexproject/cortex/pkg/util"
 )
@@ -33,6 +35,10 @@ type Config struct {
     CompactionInterval time.Duration `yaml:"compaction_interval"`
     CompactionRetries  int           `yaml:"compaction_retries"`
 
+    // Compactors sharding.
+    ShardingEnabled bool       `yaml:"sharding_enabled"`
+    ShardingRing    RingConfig `yaml:"sharding_ring"`
+
     // No need to add options to customize the retry backoff,
     // given the defaults should be fine, but allow to override
     // it in tests.
@@ -42,6 +48,8 @@ type Config struct {
 
 // RegisterFlags registers the Compactor flags.
 func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
+    cfg.ShardingRing.RegisterFlags(f)
+
     cfg.BlockRanges = cortex_tsdb.DurationList{2 * time.Hour, 12 * time.Hour, 24 * time.Hour}
     cfg.retryMinBackoff = 10 * time.Second
     cfg.retryMaxBackoff = time.Minute
@@ -53,6 +61,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
     f.StringVar(&cfg.DataDir, "compactor.data-dir", "./data", "Data directory in which to cache blocks and process compactions")
     f.DurationVar(&cfg.CompactionInterval, "compactor.compaction-interval", time.Hour, "The frequency at which the compaction runs")
     f.IntVar(&cfg.CompactionRetries, "compactor.compaction-retries", 3, "How many times to retry a failed compaction during a single compaction interval")
+    f.BoolVar(&cfg.ShardingEnabled, "compactor.sharding-enabled", false, "Shard tenants across multiple compactor instances. Sharding is required if you run multiple compactor instances, in order to coordinate compactions and avoid race conditions leading to the same tenant blocks simultaneously compacted by different instances.")
 }
 
 // Compactor is a multi-tenant TSDB blocks compactor based on Thanos.
@@ -75,6 +84,10 @@ type Compactor struct {
     ctx       context.Context
     cancelCtx context.CancelFunc
 
+    // Ring used for sharding compactions.
+    ringLifecycler *ring.Lifecycler
+    ring           *ring.Ring
+
     // Metrics.
     compactionRunsStarted   prometheus.Counter
     compactionRunsCompleted prometheus.Counter
@@ -104,7 +117,13 @@ func NewCompactor(compactorCfg Config, storageCfg cortex_tsdb.Config, logger log
         return nil, errors.Wrap(err, "failed to create TSDB compactor")
     }
 
-    return newCompactor(ctx, cancelCtx, compactorCfg, storageCfg, bucketClient, tsdbCompactor, logger, registerer)
+    cortexCompactor, err := newCompactor(ctx, cancelCtx, compactorCfg, storageCfg, bucketClient, tsdbCompactor, logger, registerer)
+    if err != nil {
+        cancelCtx()
+        return nil, errors.Wrap(err, "failed to create Cortex blocks compactor")
+    }
+
+    return cortexCompactor, nil
 }
 
 func newCompactor(
@@ -139,29 +158,70 @@ func newCompactor(
         }),
     }
 
+    // Initialize the compactors ring if sharding is enabled.
+    if compactorCfg.ShardingEnabled {
+        lifecyclerCfg := compactorCfg.ShardingRing.ToLifecyclerConfig()
+        lifecycler, err := ring.NewLifecycler(lifecyclerCfg, ring.NewNoopFlushTransferer(), "compactor", ring.CompactorRingKey, false)
+        if err != nil {
+            return nil, errors.Wrap(err, "unable to initialize compactor ring lifecycler")
+        }
+
+        lifecycler.Start()
+        c.ringLifecycler = lifecycler
+
+        ring, err := ring.New(lifecyclerCfg.RingConfig, "compactor", ring.CompactorRingKey)
+        if err != nil {
+            return nil, errors.Wrap(err, "unable to initialize compactor ring")
+        }
+
+        c.ring = ring
+    }
+
     // Register metrics.
     if registerer != nil {
         registerer.MustRegister(c.compactionRunsStarted, c.compactionRunsCompleted, c.compactionRunsFailed)
         c.syncerMetrics = newSyncerMetrics(registerer)
     }
 
+    return c, nil
+}
+
+// Start the compactor.
+func (c *Compactor) Start() {
     // Start the compactor loop.
     c.runner.Add(1)
     go c.run()
-
-    return c, nil
 }
 
-// Shutdown the compactor and waits until done. This may take some time
+// Stop the compactor and waits until done. This may take some time
 // if there's a on-going compaction.
-func (c *Compactor) Shutdown() {
+func (c *Compactor) Stop() {
     c.cancelCtx()
     c.runner.Wait()
+
+    // Shutdown the ring lifecycler (if any)
+    if c.ringLifecycler != nil {
+        c.ringLifecycler.Shutdown()
+    }
+
+    if c.ring != nil {
+        c.ring.Stop()
+    }
 }
 
 func (c *Compactor) run() {
     defer c.runner.Done()
 
+    // If sharding is enabled we should wait until this instance is
+    // ACTIVE within the ring.
+    if c.compactorCfg.ShardingEnabled {
+        level.Info(c.logger).Log("msg", "waiting until compactor is ACTIVE in the ring")
+        if err := c.waitRingActive(); err != nil {
+            return
+        }
+        level.Info(c.logger).Log("msg", "compactor is ACTIVE in the ring")
+    }
+
     // Run an initial compaction before starting the interval.
     c.compactUsersWithRetries(c.ctx)
 
@@ -215,6 +275,17 @@ func (c *Compactor) compactUsers(ctx context.Context) bool {
             return false
         }
 
+        // If sharding is enabled, ensure the user ID belongs to our shard.
+        if c.compactorCfg.ShardingEnabled {
+            if owned, err := c.ownUser(userID); err != nil {
+                level.Warn(c.logger).Log("msg", "unable to check if user is owned by this shard", "user", userID, "err", err)
+                continue
+            } else if !owned {
+                level.Debug(c.logger).Log("msg", "skipping user because not owned by this shard", "user", userID)
+                continue
+            }
+        }
+
         level.Info(c.logger).Log("msg", "starting compaction of user blocks", "user", userID)
 
         if err = c.compactUser(ctx, userID); err != nil {
@@ -290,3 +361,43 @@ func (c *Compactor) discoverUsers(ctx context.Context) ([]string, error) {
 
     return users, err
 }
+
+func (c *Compactor) ownUser(userID string) (bool, error) {
+    // Hash the user ID.
+    hasher := fnv.New32a()
+    _, _ = hasher.Write([]byte(userID))
+    userHash := hasher.Sum32()
+
+    // Check whether this compactor instance owns the user.
+    rs, err := c.ring.Get(userHash, ring.Read, []ring.IngesterDesc{})
+    if err != nil {
+        return false, err
+    }
+
+    if len(rs.Ingesters) != 1 {
+        return false, fmt.Errorf("unexpected number of compactors in the shard (expected 1, got %d)", len(rs.Ingesters))
+    }
+
+    return rs.Ingesters[0].Addr == c.ringLifecycler.Addr, nil
+}
+
+func (c *Compactor) waitRingActive() error {
+    for {
+        // Check if the ingester is ACTIVE in the ring and our ring client
+        // has detected it.
+        if rs, err := c.ring.GetAll(); err == nil {
+            for _, i := range rs.Ingesters {
+                if i.GetAddr() == c.ringLifecycler.Addr && i.GetState() == ring.ACTIVE {
+                    return nil
+                }
+            }
+        }
+
+        select {
+        case <-time.After(time.Second):
+            // Nothing to do
+        case <-c.ctx.Done():
+            return c.ctx.Err()
+        }
+    }
+}
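
With this commit the compactor gains an explicit `Start`/`Stop` lifecycle (previously the loop started in the constructor and was stopped via `Shutdown`). Below is a minimal usage sketch; note that in Cortex itself the compactor is normally wired up by running `-target=compactor`, and the trailing `prometheus.Registerer` argument of `NewCompactor` is an assumption here, since the hunk header above truncates the signature. The zero-value configs are placeholders for values loaded from YAML/flags.

```go
package main

import (
    "github.com/go-kit/kit/log"
    "github.com/prometheus/client_golang/prometheus"

    "github.com/cortexproject/cortex/pkg/compactor"
    cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb"
)

func main() {
    var (
        compactorCfg compactor.Config   // placeholder: normally loaded from YAML/flags
        storageCfg   cortex_tsdb.Config // placeholder: bucket/backend settings
    )

    // NewCompactor now wraps construction errors and cancels its internal
    // context if newCompactor fails.
    c, err := compactor.NewCompactor(compactorCfg, storageCfg, log.NewNopLogger(), prometheus.DefaultRegisterer)
    if err != nil {
        panic(err)
    }

    // Start launches the compaction loop; when sharding is enabled the loop
    // first waits until this instance is ACTIVE in the ring.
    c.Start()

    // Stop cancels the loop, waits for it to finish and, when sharding is
    // enabled, shuts down the ring lifecycler and the ring client.
    defer c.Stop()
}
```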

pkg/compactor/compactor_http.go

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+package compactor
+
+import (
+    "net/http"
+
+    "github.com/go-kit/kit/log/level"
+)
+
+const (
+    shardingDisabledPage = `
+        <!DOCTYPE html>
+        <html>
+            <head>
+                <meta charset="UTF-8">
+                <title>Cortex Compactor Ring</title>
+            </head>
+            <body>
+                <h1>Cortex Compactor Ring</h1>
+                <p>Compactor has no ring because sharding is disabled.</p>
+            </body>
+        </html>
+`
+)
+
+func (c *Compactor) RingHandler(w http.ResponseWriter, req *http.Request) {
+    if c.compactorCfg.ShardingEnabled {
+        c.ring.ServeHTTP(w, req)
+        return
+    }
+
+    w.WriteHeader(http.StatusOK)
+    if _, err := w.Write([]byte(shardingDisabledPage)); err != nil {
+        level.Error(c.logger).Log("msg", "unable to serve compactor ring page", "err", err)
+    }
+}
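
The `GET /compactor_ring` endpoint documented earlier is backed by `RingHandler`. Here is a sketch of how the handler could be mounted on a plain `net/http` mux (port 8006 matches the docker-compose example above); the Cortex module wiring that actually registers this route is not part of this diff, and `newConfiguredCompactor` is a hypothetical helper standing in for the `NewCompactor` call from the previous sketch.

```go
package main

import (
    "net/http"

    "github.com/cortexproject/cortex/pkg/compactor"
)

// newConfiguredCompactor is a hypothetical stand-in for building the compactor
// via compactor.NewCompactor, as in the previous sketch.
func newConfiguredCompactor() *compactor.Compactor {
    panic("elided: call compactor.NewCompactor with real configs")
}

func main() {
    c := newConfiguredCompactor()

    mux := http.NewServeMux()
    // RingHandler serves the ring status page when sharding is enabled, or the
    // static "sharding is disabled" page from compactor_http.go otherwise.
    mux.HandleFunc("/compactor_ring", c.RingHandler)

    _ = http.ListenAndServe(":8006", mux)
}
```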
