Skip to content

Commit 29cfdf6

Browse files
authored
Limit number of retries when we get ProvisionedThroughputExceededException (#256)
* Limit number of retries when we get ProvisionedThroughputExceededException, include table name in the DynamoDB error metric. * Review feedback
1 parent 631f77c commit 29cfdf6

File tree

1 file changed

+26
-11
lines changed

1 file changed

+26
-11
lines changed

chunk/dynamodb_client.go

Lines changed: 26 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
package chunk
22

33
import (
4+
"fmt"
45
"math/rand"
56
"net/url"
67
"strings"
@@ -18,16 +19,15 @@ import (
1819

1920
const (
2021
// For dynamodb errors
22+
tableNameLabel = "table"
2123
errorReasonLabel = "error"
2224
otherError = "other"
2325

2426
// Backoff for dynamoDB requests, to match AWS lib - see:
2527
// https://github.com/aws/aws-sdk-go/blob/master/service/dynamodb/customizations.go
2628
minBackoff = 50 * time.Millisecond
2729
maxBackoff = 50 * time.Second
28-
29-
// Number of synchronous dynamodb requests
30-
numDynamoRequests = 25
30+
maxRetries = 20
3131

3232
// See http://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Limits.html.
3333
dynamoMaxBatchSize = 25
@@ -54,7 +54,7 @@ var (
5454
Namespace: "cortex",
5555
Name: "dynamo_failures_total",
5656
Help: "The total number of errors while storing chunks to the chunk store.",
57-
}, []string{errorReasonLabel})
57+
}, []string{tableNameLabel, errorReasonLabel})
5858
dynamoUnprocessedItems = prometheus.NewCounter(prometheus.CounterOpts{
5959
Namespace: "cortex",
6060
Name: "dynamo_unprocessed_items_total",
@@ -69,11 +69,11 @@ func init() {
6969
prometheus.MustRegister(dynamoUnprocessedItems)
7070
}
7171

72-
func recordDynamoError(err error) {
72+
func recordDynamoError(tableName string, err error) {
7373
if awsErr, ok := err.(awserr.Error); ok {
74-
dynamoFailures.WithLabelValues(awsErr.Code()).Add(float64(1))
74+
dynamoFailures.WithLabelValues(tableName, awsErr.Code()).Add(float64(1))
7575
} else {
76-
dynamoFailures.WithLabelValues(otherError).Add(float64(1))
76+
dynamoFailures.WithLabelValues(tableName, otherError).Add(float64(1))
7777
}
7878
}
7979

@@ -215,9 +215,17 @@ func (c *dynamoDBBackoffClient) batchWriteDynamo(ctx context.Context, reqs map[s
215215
}
216216
}
217217

218+
tableNames := func(reqs map[string][]*dynamodb.WriteRequest) []string {
219+
result := []string{}
220+
for tableName := range reqs {
221+
result = append(result, tableName)
222+
}
223+
return result
224+
}
225+
218226
outstanding, unprocessed := reqs, map[string][]*dynamodb.WriteRequest{}
219-
backoff := minBackoff
220-
for dictLen(outstanding)+dictLen(unprocessed) > 0 {
227+
backoff, numRetries := minBackoff, 0
228+
for dictLen(outstanding)+dictLen(unprocessed) > 0 && numRetries < maxRetries {
221229
reqs := map[string][]*dynamodb.WriteRequest{}
222230
fillReq(unprocessed, reqs)
223231
fillReq(outstanding, reqs)
@@ -237,7 +245,9 @@ func (c *dynamoDBBackoffClient) batchWriteDynamo(ctx context.Context, reqs map[s
237245
}
238246

239247
if err != nil {
240-
recordDynamoError(err)
248+
for _, tableName := range tableNames(reqs) {
249+
recordDynamoError(tableName, err)
250+
}
241251
}
242252

243253
// If there are unprocessed items, backoff and retry those items.
@@ -254,6 +264,7 @@ func (c *dynamoDBBackoffClient) batchWriteDynamo(ctx context.Context, reqs map[s
254264
copyUnprocessed(reqs, unprocessed)
255265
time.Sleep(backoff)
256266
backoff = nextBackoff(backoff)
267+
numRetries++
257268
continue
258269
}
259270

@@ -263,8 +274,12 @@ func (c *dynamoDBBackoffClient) batchWriteDynamo(ctx context.Context, reqs map[s
263274
}
264275

265276
backoff = minBackoff
277+
numRetries = 0
266278
}
267279

280+
if valuesLeft := dictLen(outstanding) + dictLen(unprocessed); valuesLeft > 0 {
281+
return fmt.Errorf("failed to write chunk after %d retries, %d values remaining", numRetries, valuesLeft)
282+
}
268283
return nil
269284
}
270285

@@ -283,7 +298,7 @@ func (c *dynamoDBBackoffClient) queryPages(ctx context.Context, input *dynamodb.
283298
}
284299

285300
if err != nil {
286-
recordDynamoError(err)
301+
recordDynamoError(*input.TableName, err)
287302

288303
if awsErr, ok := err.(awserr.Error); ok && awsErr.Code() == provisionedThroughputExceededException {
289304
time.Sleep(backoff)

0 commit comments

Comments
 (0)