Skip to content

Commit 4bb8f24

Browse files
authored
Follow up of 4532 - Return data fetched from a subset of store-gateways instead of returning error if a single store-gateway fails (#4839)
* isRetryableError on stream
  Signed-off-by: Alan Protasio <approtas@amazon.com>
* Update Changelog
  Signed-off-by: Alan Protasio <approtas@amazon.com>

Signed-off-by: Alan Protasio <approtas@amazon.com>
1 parent c815b3c commit 4bb8f24

File tree

3 files changed

+45
-4
lines changed

3 files changed

+45
-4
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
* [CHANGE] Remove support for alertmanager and ruler legacy store configuration. Before upgrading, you need to convert your configuration to use the `alertmanager-storage` and `ruler-storage` configuration on the version that you're already running, then upgrade.
4141
* [CHANGE] Disables TSDB isolation. #4825
4242
* [CHANGE] Drops support for the Prometheus 1.x rule format on configdb. #4826
43-
* [ENHANCEMENT] Querier/Ruler: Retry store-gateway in case of unexpected failure, instead of failing the query. #4532
43+
* [ENHANCEMENT] Querier/Ruler: Retry store-gateway in case of unexpected failure, instead of failing the query. #4532 #4839
4444
* [ENHANCEMENT] Ring: DoBatch prioritize 4xx errors when failing. #4783
4545
* [ENHANCEMENT] Cortex now built with Go 1.18. #4829
4646
* [FEATURE] Compactor: Added `-compactor.block-files-concurrency` to configure the number of goroutines used to download/upload block files during compaction. #4784

pkg/querier/blocks_store_queryable.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,12 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(
624624
if err == io.EOF {
625625
break
626626
}
627+
628+
if isRetryableError(err) {
629+
level.Warn(spanLog).Log("err", errors.Wrapf(err, "failed to receive series from %s due to retryable error", c.RemoteAddress()))
630+
return nil
631+
}
632+
627633
if err != nil {
628634
return errors.Wrapf(err, "failed to receive series from %s", c.RemoteAddress())
629635
}

pkg/querier/blocks_store_queryable_test.go

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,38 @@ func TestBlocksStoreQuerier_Select(t *testing.T) {
616616
},
617617
},
618618
},
619+
"multiple store-gateways has the block, but one of them fails to return on stream": {
620+
finderResult: bucketindex.Blocks{
621+
{ID: block1},
622+
},
623+
storeSetResponses: []interface{}{
624+
map[BlocksStoreClient][]ulid.ULID{
625+
&storeGatewayClientMock{
626+
remoteAddr: "1.1.1.1",
627+
mockedSeriesStreamErr: status.Error(codes.Unavailable, "unavailable"),
628+
mockedSeriesResponses: []*storepb.SeriesResponse{
629+
mockSeriesResponse(labels.Labels{metricNameLabel, series1Label}, minT, 2),
630+
mockHintsResponse(block1),
631+
}}: {block1},
632+
},
633+
map[BlocksStoreClient][]ulid.ULID{
634+
&storeGatewayClientMock{remoteAddr: "2.2.2.2", mockedSeriesResponses: []*storepb.SeriesResponse{
635+
mockSeriesResponse(labels.Labels{metricNameLabel, series1Label}, minT, 2),
636+
mockHintsResponse(block1),
637+
}}: {block1},
638+
},
639+
},
640+
limits: &blocksStoreLimitsMock{},
641+
queryLimiter: noOpQueryLimiter,
642+
expectedSeries: []seriesResult{
643+
{
644+
lbls: labels.New(metricNameLabel, series1Label),
645+
values: []valueResult{
646+
{t: minT, v: 2},
647+
},
648+
},
649+
},
650+
},
619651
}
620652

621653
for testName, testData := range tests {
@@ -1429,14 +1461,16 @@ type storeGatewayClientMock struct {
14291461
remoteAddr string
14301462
mockedSeriesResponses []*storepb.SeriesResponse
14311463
mockedSeriesErr error
1464+
mockedSeriesStreamErr error
14321465
mockedLabelNamesResponse *storepb.LabelNamesResponse
14331466
mockedLabelValuesResponse *storepb.LabelValuesResponse
14341467
mockedLabelValuesErr error
14351468
}
14361469

14371470
func (m *storeGatewayClientMock) Series(ctx context.Context, in *storepb.SeriesRequest, opts ...grpc.CallOption) (storegatewaypb.StoreGateway_SeriesClient, error) {
14381471
seriesClient := &storeGatewaySeriesClientMock{
1439-
mockedResponses: m.mockedSeriesResponses,
1472+
mockedResponses: m.mockedSeriesResponses,
1473+
mockedSeriesStreamErr: m.mockedSeriesStreamErr,
14401474
}
14411475

14421476
return seriesClient, m.mockedSeriesErr
@@ -1457,7 +1491,8 @@ func (m *storeGatewayClientMock) RemoteAddress() string {
14571491
type storeGatewaySeriesClientMock struct {
14581492
grpc.ClientStream
14591493

1460-
mockedResponses []*storepb.SeriesResponse
1494+
mockedResponses []*storepb.SeriesResponse
1495+
mockedSeriesStreamErr error
14611496
}
14621497

14631498
func (m *storeGatewaySeriesClientMock) Recv() (*storepb.SeriesResponse, error) {
@@ -1470,7 +1505,7 @@ func (m *storeGatewaySeriesClientMock) Recv() (*storepb.SeriesResponse, error) {
14701505

14711506
res := m.mockedResponses[0]
14721507
m.mockedResponses = m.mockedResponses[1:]
1473-
return res, nil
1508+
return res, m.mockedSeriesStreamErr
14741509
}
14751510

14761511
type blocksStoreLimitsMock struct {

0 commit comments

Comments
 (0)