
Added flag to explicitly enable zone-awareness replication and added store-gateway support #3200

Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -7,11 +7,13 @@
* `-experimental.distributor.user-subring-size` flag renamed to `-distributor.ingestion-tenant-shard-size`
* `user_subring_size` limit YAML config option renamed to `ingestion_tenant_shard_size`
* [CHANGE] Dropped "blank Alertmanager configuration; using fallback" message from Info to Debug level. #3205
* [CHANGE] Zone-aware replication for time-series must now be explicitly enabled in the distributor via the `-distributor.zone-awareness-enabled` CLI flag (or its respective YAML config option). Previously, zone-aware replication was implicitly enabled if a zone was set on ingesters. #3200
* [FEATURE] Added support for shuffle-sharding queriers in the query-frontend. When configured (`-frontend.max-queriers-per-user` globally, or using the per-user limit `max_queriers_per_user`), each user's requests will be handled by a different set of queriers. #3113
* [ENHANCEMENT] Added `cortex_query_frontend_connected_clients` metric to show the number of workers currently connected to the frontend. #3207
* [ENHANCEMENT] Shuffle sharding: improved shuffle sharding in the write path. Shuffle sharding must now be explicitly enabled via the `-distributor.sharding-strategy` CLI flag (or its respective YAML config option) and guarantees stability, consistency, shuffling and balanced zone-awareness properties. #3090
* [ENHANCEMENT] Ingester: added new metric `cortex_ingester_active_series` to track active series more accurately. Also added options to control whether active series tracking is enabled (`-ingester.active-series-enabled`, defaults to false), how often this metric is updated (`-ingester.active-series-update-period`), and the max idle time for series to be considered inactive (`-ingester.active-series-idle-timeout`). #3153
* [ENHANCEMENT] Blocksconvert – Builder: download plan file locally before processing it. #3209
* [ENHANCEMENT] Store-gateway: added zone-aware replication support for blocks replication. #3200
* [BUGFIX] No-longer-needed ingester operations for queries triggered by queriers and rulers are now canceled. #3178
* [BUGFIX] Ruler: directories in the configured `rules-path` will be removed on startup and shutdown in order to ensure they don't persist between runs. #3195
* [BUGFIX] Handle hash-collisions in the query path. #3192
20 changes: 20 additions & 0 deletions docs/blocks-storage/store-gateway.md
@@ -56,6 +56,16 @@ To protect from this, when an healthy store-gateway instance finds another insta

This feature is called **auto-forget** and is built into the store-gateway.

### Zone-awareness

The store-gateway replication optionally supports [zone-awareness](../guides/zone-replication.md). When zone-aware replication is enabled and the blocks replication factor is > 1, each block is guaranteed to be replicated across store-gateway instances running in different availability zones.

**To enable** zone-aware replication for the store-gateways you should:

1. Configure the availability zone for each store-gateway via the `-store-gateway.sharding-ring.instance-availability-zone` CLI flag (or its respective YAML config option)
2. Enable blocks zone-aware replication via the `-store-gateway.sharding-ring.zone-awareness-enabled` CLI flag (or its respective YAML config option). Please be aware this configuration option should be set on store-gateways, queriers and rulers.
3. Roll out store-gateways, queriers and rulers to apply the new configuration (see the example below)
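
For illustration, a minimal YAML sketch of the relevant store-gateway options (the zone name `zone-a` is an assumed example, not part of this change; the same `zone_awareness_enabled` setting also needs to be applied to queriers and rulers):

```yaml
store_gateway:
  sharding_ring:
    # Must also be enabled on queriers and rulers.
    zone_awareness_enabled: true
    # Set to the zone this store-gateway actually runs in,
    # e.g. "zone-b" / "zone-c" for the instances in the other zones.
    instance_availability_zone: "zone-a"
```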

## Caching

The store-gateway supports the following caches:
@@ -207,6 +217,16 @@ store_gateway:
# CLI flag: -store-gateway.sharding-ring.tokens-file-path
[tokens_file_path: <string> | default = ""]

# True to enable zone-awareness and replicate blocks across different
# availability zones.
# CLI flag: -store-gateway.sharding-ring.zone-awareness-enabled
[zone_awareness_enabled: <boolean> | default = false]

# The availability zone where this instance is running. Required if
# zone-awareness is enabled.
# CLI flag: -store-gateway.sharding-ring.instance-availability-zone
[instance_availability_zone: <string> | default = ""]

# The sharding strategy to use. Supported values are: default,
# shuffle-sharding.
# CLI flag: -store-gateway.sharding-strategy
10 changes: 10 additions & 0 deletions docs/blocks-storage/store-gateway.template
@@ -56,6 +56,16 @@ To protect from this, when an healthy store-gateway instance finds another insta

This feature is called **auto-forget** and is built into the store-gateway.

### Zone-awareness

The store-gateway replication optionally supports [zone-awareness](../guides/zone-replication.md). When zone-aware replication is enabled and the blocks replication factor is > 1, each block is guaranteed to be replicated across store-gateway instances running in different availability zones.

**To enable** zone-aware replication for the store-gateways you should:

1. Configure the availability zone for each store-gateway via the `-store-gateway.sharding-ring.instance-availability-zone` CLI flag (or its respective YAML config option)
2. Enable blocks zone-aware replication via the `-store-gateway.sharding-ring.zone-awareness-enabled` CLI flag (or its respective YAML config option). Please be aware this configuration option should be set on store-gateways, queriers and rulers.
3. Roll out store-gateways, queriers and rulers to apply the new configuration

## Caching

The store-gateway supports the following caches:
18 changes: 16 additions & 2 deletions docs/configuration/config-file-reference.md
@@ -523,6 +523,11 @@ lifecycler:
# CLI flag: -distributor.replication-factor
[replication_factor: <int> | default = 3]

# True to enable the zone-awareness and replicate ingested samples across
# different availability zones.
# CLI flag: -distributor.zone-awareness-enabled
[zone_awareness_enabled: <boolean> | default = false]

# Number of tokens for each ingester.
# CLI flag: -ingester.num-tokens
[num_tokens: <int> | default = 128]
@@ -559,8 +564,7 @@ lifecycler:
# CLI flag: -ingester.tokens-file-path
[tokens_file_path: <string> | default = ""]

# The availability zone of the host, this instance is running on. Default is
# an empty string, which disables zone awareness for writes.
# The availability zone where this instance is running.
# CLI flag: -ingester.availability-zone
[availability_zone: <string> | default = ""]

@@ -3663,6 +3667,16 @@ sharding_ring:
# CLI flag: -store-gateway.sharding-ring.tokens-file-path
[tokens_file_path: <string> | default = ""]

# True to enable zone-awareness and replicate blocks across different
# availability zones.
# CLI flag: -store-gateway.sharding-ring.zone-awareness-enabled
[zone_awareness_enabled: <boolean> | default = false]

# The availability zone where this instance is running. Required if
# zone-awareness is enabled.
# CLI flag: -store-gateway.sharding-ring.instance-availability-zone
[instance_availability_zone: <string> | default = ""]

# The sharding strategy to use. Supported values are: default, shuffle-sharding.
# CLI flag: -store-gateway.sharding-strategy
[sharding_strategy: <string> | default = "default"]
41 changes: 27 additions & 14 deletions docs/guides/zone-replication.md
@@ -5,26 +5,39 @@ weight: 5
slug: zone-aware-replication
---

In a default configuration, time-series written to ingesters are replicated based on the container/pod name of the ingester instances. It is completely possible that all the replicas for the given time-series are held with in the same availability zone, even if the cortex infrastructure spans multiple zones within the region. Storing multiple replicas for a given time-series poses a risk for data loss if there is an outage affecting various nodes within a zone or a total outage.
Cortex supports data replication for different services. By default, data is transparently replicated across the whole pool of service instances, regardless of whether these instances are all running within the same availability zone (or data center, or rack) or in different ones.

## Configuration
It is entirely possible that all the replicas of given data are held within the same availability zone, even if the Cortex cluster spans multiple zones. Storing multiple replicas of given data within the same availability zone poses a risk of data loss if there is an outage affecting various nodes within a zone or a full zone outage.

Cortex can be configured to consider an availability zone value in its replication system. Doing so mitigates risks associated with losing multiple nodes within the same availability zone. The availability zone for an ingester can be defined on the command line of the ingester using the `ingester.availability-zone` flag or using the yaml configuration:
For this reason, Cortex optionally supports zone-aware replication. When zone-aware replication is **enabled**, replicas for the given data are guaranteed to span across different availability zones. This requires the Cortex cluster to run across at least as many availability zones as the configured replication factor.

```yaml
ingester:
lifecycler:
availability_zone: "zone-3"
```
The Cortex services supporting **zone-aware replication** are:

## Zone Replication Considerations
- **[Distributors and Ingesters](#distributors-and-ingesters-time-series-replication)**
- **[Store-gateways](#store-gateways-blocks-replication)** ([blocks storage](../blocks-storage/_index.md) only)

Enabling availability zone awareness helps mitigate risks regarding data loss within a single zone, some items need consideration by an operator if they are thinking of enabling this feature.
## Distributors / Ingesters: time-series replication

### Minimum number of Zones
The Cortex time-series replication is used to hold multiple (typically 3) replicas of each time series in the **ingesters**.

For cortex to function correctly, there must be at least the same number of availability zones as there is replica count. So by default, a cortex cluster should be spread over 3 zones as the default replica count is 3. It is safe to have more zones than the replica count, but it cannot be less. Having fewer availability zones than replica count causes a replica write to be missed, and in some cases, the write fails if the availability zone count is too low.
**To enable** zone-aware replication for the ingesters you should:

### Cost
1. Configure the availability zone for each ingester via the `-ingester.availability-zone` CLI flag (or its respective YAML config option)
2. Roll out ingesters to apply the configured zone
3. Enable time-series zone-aware replication via the `-distributor.zone-awareness-enabled` CLI flag (or its respective YAML config option). Please be aware this configuration option should be set on distributors, queriers and rulers (see the sketch below).
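
As an illustration of steps 1 and 3, a minimal sketch (the zone name `zone-a` is an assumed example; each ingester should advertise the zone it actually runs in):

```yaml
# On each ingester (step 1): advertise the availability zone it runs in.
ingester:
  lifecycler:
    availability_zone: "zone-a"

# On distributors, queriers and rulers (step 3), enable zone-aware replication,
# e.g. via the CLI flag:
#   -distributor.zone-awareness-enabled=true
```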

Depending on the existing cortex infrastructure being used, this may cause an increase in running costs as most cloud providers charge for cross availability zone traffic. The most significant change would be for a cortex cluster currently running in a singular zone.
## Store-gateways: blocks replication

The Cortex [store-gateway](../blocks-storage/store-gateway.md) (used only when Cortex is running with the [blocks storage](../blocks-storage/_index.md)) supports blocks sharding, used to horizontally scale blocks in a large cluster without hitting any vertical scalability limit.

To enable the zone-aware replication for the store-gateways, please refer to the [store-gateway](../blocks-storage/store-gateway.md#zone-awareness) documentation.

## Minimum number of zones

For Cortex to function correctly, there must be at least the same number of availability zones as the replication factor. For example, if the replication factor is configured to 3 (default for time-series replication), the Cortex cluster should be spread at least over 3 availability zones.

It is safe to have more zones than the replication factor, but not fewer. Having fewer availability zones than the replication factor causes a replica write to be missed and, in some cases, the write to fail if the availability zone count is too low.

## Impact on costs

Depending on the underlying infrastructure, deploying Cortex across multiple availability zones may increase running costs, as most cloud providers charge for inter-availability-zone networking. The most significant change would be for a Cortex cluster currently running in a single zone.
2 changes: 1 addition & 1 deletion pkg/distributor/distributor_test.go
@@ -963,7 +963,7 @@ func prepare(t *testing.T, cfg prepConfig) ([]*Distributor, []mockIngester, *rin
addr := fmt.Sprintf("%d", i)
ingesterDescs[addr] = ring.IngesterDesc{
Addr: addr,
Zone: addr,
Zone: "",
State: ring.ACTIVE,
Timestamp: time.Now().Unix(),
Tokens: []uint32{uint32((math.MaxUint32 / cfg.numIngesters) * i)},
2 changes: 1 addition & 1 deletion pkg/ring/lifecycler.go
@@ -101,7 +101,7 @@ func (cfg *LifecyclerConfig) RegisterFlagsWithPrefix(prefix string, f *flag.Flag
f.StringVar(&cfg.Addr, prefix+"lifecycler.addr", "", "IP address to advertise in consul.")
f.IntVar(&cfg.Port, prefix+"lifecycler.port", 0, "port to advertise in consul (defaults to server.grpc-listen-port).")
f.StringVar(&cfg.ID, prefix+"lifecycler.ID", hostname, "ID to register into consul.")
f.StringVar(&cfg.Zone, prefix+"availability-zone", "", "The availability zone of the host, this instance is running on. Default is an empty string, which disables zone awareness for writes.")
f.StringVar(&cfg.Zone, prefix+"availability-zone", "", "The availability zone where this instance is running.")
}

// Lifecycler is responsible for managing the lifecycle of entries in the ring.
14 changes: 10 additions & 4 deletions pkg/ring/replication_strategy.go
@@ -9,7 +9,7 @@ type ReplicationStrategy interface {
// Filters out unhealthy instances and checks if there are enough instances
// for an operation to succeed. Returns an error if there are not enough
// instances.
Filter(instances []IngesterDesc, op Operation, replicationFactor int, heartbeatTimeout time.Duration) (healthy []IngesterDesc, maxFailures int, err error)
Filter(instances []IngesterDesc, op Operation, replicationFactor int, heartbeatTimeout time.Duration, zoneAwarenessEnabled bool) (healthy []IngesterDesc, maxFailures int, err error)

// ShouldExtendReplicaSet returns true if given an instance that's going to be
// added to the replica set, the replica set size should be extended by 1
@@ -25,7 +25,7 @@ type DefaultReplicationStrategy struct{}
// - Filters out dead ingesters so that we don't even try to write to them.
// - Checks there are enough ingesters for an operation to succeed.
// The ingesters argument may be overwritten.
func (s *DefaultReplicationStrategy) Filter(ingesters []IngesterDesc, op Operation, replicationFactor int, heartbeatTimeout time.Duration) ([]IngesterDesc, int, error) {
func (s *DefaultReplicationStrategy) Filter(ingesters []IngesterDesc, op Operation, replicationFactor int, heartbeatTimeout time.Duration, zoneAwarenessEnabled bool) ([]IngesterDesc, int, error) {
// We need a response from a quorum of ingesters, which is n/2 + 1. In the
// case of a node joining/leaving, the actual replica set might be bigger
// than the replication factor, so use the bigger of the two.
@@ -49,8 +49,14 @@ func (s *DefaultReplicationStrategy) Filter(ingesters []IngesterDesc, op Operati
// This is just a shortcut - if there are not minSuccess available ingesters,
// after filtering out dead ones, don't even bother trying.
if len(ingesters) < minSuccess {
err := fmt.Errorf("at least %d live replicas required, could only find %d",
minSuccess, len(ingesters))
var err error

if zoneAwarenessEnabled {
err = fmt.Errorf("at least %d live replicas required across different availability zones, could only find %d", minSuccess, len(ingesters))
} else {
err = fmt.Errorf("at least %d live replicas required, could only find %d", minSuccess, len(ingesters))
}

return nil, 0, err
}

2 changes: 1 addition & 1 deletion pkg/ring/replication_strategy_test.go
@@ -91,7 +91,7 @@ func TestRingReplicationStrategy(t *testing.T) {

t.Run(fmt.Sprintf("[%d]", i), func(t *testing.T) {
strategy := &DefaultReplicationStrategy{}
liveIngesters, maxFailure, err := strategy.Filter(ingesters, tc.op, tc.RF, 100*time.Second)
liveIngesters, maxFailure, err := strategy.Filter(ingesters, tc.op, tc.RF, 100*time.Second, false)
if tc.ExpectedError == "" {
assert.NoError(t, err)
assert.Equal(t, tc.LiveIngesters, len(liveIngesters))