Skip to content

Follow up of 4532 - Return data fetched from a subset of store-gateways instead of returning error if a single store-gateway fails #4839

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
* [CHANGE] Remove support for alertmanager and ruler legacy store configuration. Before upgrading, you need to convert your configuration to use the `alertmanager-storage` and `ruler-storage` configuration on the version that you're already running, then upgrade.
* [CHANGE] Disables TSDB isolation. #4825
* [CHANGE] Drops support for Prometheus 1.x rule format on configdb. #4826
* [ENHANCEMENT] Querier/Ruler: Retry store-gateway in case of unexpected failure, instead of failing the query. #4532
* [ENHANCEMENT] Querier/Ruler: Retry store-gateway in case of unexpected failure, instead of failing the query. #4532 #4839
* [ENHANCEMENT] Ring: DoBatch prioritizes 4xx errors when failing. #4783
* [ENHANCEMENT] Cortex now built with Go 1.18. #4829
* [FEATURE] Compactor: Added `-compactor.block-files-concurrency`, which configures the number of goroutines used to download/upload block files during compaction. #4784
Expand Down
6 changes: 6 additions & 0 deletions pkg/querier/blocks_store_queryable.go
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,12 @@ func (q *blocksStoreQuerier) fetchSeriesFromStores(
if err == io.EOF {
break
}

if isRetryableError(err) {
level.Warn(spanLog).Log("err", errors.Wrapf(err, "failed to receive series from %s due to retryable error", c.RemoteAddress()))
return nil
}

if err != nil {
return errors.Wrapf(err, "failed to receive series from %s", c.RemoteAddress())
}
Expand Down
41 changes: 38 additions & 3 deletions pkg/querier/blocks_store_queryable_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,38 @@ func TestBlocksStoreQuerier_Select(t *testing.T) {
},
},
},
"multiple store-gateways has the block, but one of them fails to return on stream": {
finderResult: bucketindex.Blocks{
{ID: block1},
},
storeSetResponses: []interface{}{
map[BlocksStoreClient][]ulid.ULID{
&storeGatewayClientMock{
remoteAddr: "1.1.1.1",
mockedSeriesStreamErr: status.Error(codes.Unavailable, "unavailable"),
mockedSeriesResponses: []*storepb.SeriesResponse{
mockSeriesResponse(labels.Labels{metricNameLabel, series1Label}, minT, 2),
mockHintsResponse(block1),
}}: {block1},
},
map[BlocksStoreClient][]ulid.ULID{
&storeGatewayClientMock{remoteAddr: "2.2.2.2", mockedSeriesResponses: []*storepb.SeriesResponse{
mockSeriesResponse(labels.Labels{metricNameLabel, series1Label}, minT, 2),
mockHintsResponse(block1),
}}: {block1},
},
},
limits: &blocksStoreLimitsMock{},
queryLimiter: noOpQueryLimiter,
expectedSeries: []seriesResult{
{
lbls: labels.New(metricNameLabel, series1Label),
values: []valueResult{
{t: minT, v: 2},
},
},
},
},
}

for testName, testData := range tests {
Expand Down Expand Up @@ -1429,14 +1461,16 @@ type storeGatewayClientMock struct {
remoteAddr string
mockedSeriesResponses []*storepb.SeriesResponse
mockedSeriesErr error
mockedSeriesStreamErr error
mockedLabelNamesResponse *storepb.LabelNamesResponse
mockedLabelValuesResponse *storepb.LabelValuesResponse
mockedLabelValuesErr error
}

func (m *storeGatewayClientMock) Series(ctx context.Context, in *storepb.SeriesRequest, opts ...grpc.CallOption) (storegatewaypb.StoreGateway_SeriesClient, error) {
seriesClient := &storeGatewaySeriesClientMock{
mockedResponses: m.mockedSeriesResponses,
mockedResponses: m.mockedSeriesResponses,
mockedSeriesStreamErr: m.mockedSeriesStreamErr,
}

return seriesClient, m.mockedSeriesErr
Expand All @@ -1457,7 +1491,8 @@ func (m *storeGatewayClientMock) RemoteAddress() string {
type storeGatewaySeriesClientMock struct {
grpc.ClientStream

mockedResponses []*storepb.SeriesResponse
mockedResponses []*storepb.SeriesResponse
mockedSeriesStreamErr error
}

func (m *storeGatewaySeriesClientMock) Recv() (*storepb.SeriesResponse, error) {
Expand All @@ -1470,7 +1505,7 @@ func (m *storeGatewaySeriesClientMock) Recv() (*storepb.SeriesResponse, error) {

res := m.mockedResponses[0]
m.mockedResponses = m.mockedResponses[1:]
return res, nil
return res, m.mockedSeriesStreamErr
}

type blocksStoreLimitsMock struct {
Expand Down