Skip to content

Commit c5e8173

Browse files
authored
Stricter failure handling in TransportGetSnapshotsAction (#107191)
Today if there's a failure during a multi-repo get-snapshots request then we record a per-repository failure but allow the rest of the request to proceed. This is trappy for clients, it means that they must always remember to check the `failures` response field or else risk missing some results. It's also a pain for the implementation because it means we have to collect the per-repository results separately first before adding them to the final results set just in case the last one triggers a failure. This commit drops this leniency and bubbles all failures straight up to the top level.
1 parent 69ee6d7 commit c5e8173

File tree

6 files changed

+47
-44
lines changed

6 files changed

+47
-44
lines changed

docs/changelog/107191.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
pr: 107191
2+
summary: Stricter failure handling in multi-repo get-snapshots request handling
3+
area: Snapshot/Restore
4+
type: bug
5+
issues: []
6+
highlight:
7+
title: Stricter failure handling in multi-repo get-snapshots request handling
8+
body: |
9+
If a multi-repo get-snapshots request encounters a failure in one of the
10+
targeted repositories then earlier versions of Elasticsearch would proceed
11+
as if the faulty repository did not exist, except for a per-repository
12+
failure report in a separate section of the response body. This makes it
13+
impossible to paginate the results properly in the presence of failures. In
14+
versions 8.15.0 and later this API's failure handling behaviour has been
15+
made stricter, reporting an overall failure if any targeted repository's
16+
contents cannot be listed.
17+
notable: true

server/src/internalClusterTest/java/org/elasticsearch/snapshots/GetSnapshotsIT.java

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,8 @@
3131

3232
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
3333
import static org.hamcrest.Matchers.empty;
34-
import static org.hamcrest.Matchers.equalTo;
3534
import static org.hamcrest.Matchers.hasSize;
3635
import static org.hamcrest.Matchers.in;
37-
import static org.hamcrest.Matchers.instanceOf;
3836
import static org.hamcrest.Matchers.is;
3937

4038
public class GetSnapshotsIT extends AbstractSnapshotIntegTestCase {
@@ -314,6 +312,7 @@ public void testExcludePatterns() throws Exception {
314312
assertThat(
315313
clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, matchAllPattern())
316314
.setSnapshots("non-existing*", otherPrefixSnapshot1, "-o*")
315+
.setIgnoreUnavailable(true)
317316
.get()
318317
.getSnapshots(),
319318
empty()
@@ -586,12 +585,17 @@ public void testRetrievingSnapshotsWhenRepositoryIsMissing() throws Exception {
586585
final List<String> snapshotNames = createNSnapshots(repoName, randomIntBetween(1, 10));
587586
snapshotNames.sort(String::compareTo);
588587

589-
final GetSnapshotsResponse response = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, repoName, missingRepoName)
588+
final var oneRepoFuture = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, repoName, missingRepoName)
590589
.setSort(SnapshotSortKey.NAME)
591-
.get();
592-
assertThat(response.getSnapshots().stream().map(info -> info.snapshotId().getName()).toList(), equalTo(snapshotNames));
593-
assertTrue(response.getFailures().containsKey(missingRepoName));
594-
assertThat(response.getFailures().get(missingRepoName), instanceOf(RepositoryMissingException.class));
590+
.setIgnoreUnavailable(randomBoolean())
591+
.execute();
592+
expectThrows(RepositoryMissingException.class, oneRepoFuture::actionGet);
593+
594+
final var multiRepoFuture = clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, repoName, missingRepoName)
595+
.setSort(SnapshotSortKey.NAME)
596+
.setIgnoreUnavailable(randomBoolean())
597+
.execute();
598+
expectThrows(RepositoryMissingException.class, multiRepoFuture::actionGet);
595599
}
596600

597601
// Create a snapshot that is guaranteed to have a unique start time and duration for tests around ordering by either.

server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotStatusApisIT.java

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@
5252
import static org.hamcrest.Matchers.greaterThan;
5353
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
5454
import static org.hamcrest.Matchers.hasSize;
55-
import static org.hamcrest.Matchers.instanceOf;
5655
import static org.hamcrest.Matchers.is;
5756
import static org.hamcrest.Matchers.not;
5857
import static org.hamcrest.Matchers.oneOf;
@@ -395,16 +394,13 @@ public void testGetSnapshotsMultipleRepos() throws Exception {
395394
}
396395

397396
logger.info("--> specify all snapshot names with ignoreUnavailable=false");
398-
GetSnapshotsResponse getSnapshotsResponse2 = client.admin()
397+
final var failingFuture = client.admin()
399398
.cluster()
400399
.prepareGetSnapshots(TEST_REQUEST_TIMEOUT, randomFrom("_all", "repo*"))
401400
.setIgnoreUnavailable(false)
402401
.setSnapshots(snapshotList.toArray(new String[0]))
403-
.get();
404-
405-
for (String repo : repoList) {
406-
assertThat(getSnapshotsResponse2.getFailures().get(repo), instanceOf(SnapshotMissingException.class));
407-
}
402+
.execute();
403+
expectThrows(SnapshotMissingException.class, failingFuture::actionGet);
408404

409405
logger.info("--> specify all snapshot names with ignoreUnavailable=true");
410406
GetSnapshotsResponse getSnapshotsResponse3 = client.admin()

server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/get/GetSnapshotsRequest.java

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
import org.elasticsearch.common.Strings;
1616
import org.elasticsearch.common.io.stream.StreamInput;
1717
import org.elasticsearch.common.io.stream.StreamOutput;
18-
import org.elasticsearch.common.regex.Regex;
1918
import org.elasticsearch.core.Nullable;
2019
import org.elasticsearch.core.TimeValue;
2120
import org.elasticsearch.search.sort.SortOrder;
@@ -220,13 +219,6 @@ public String[] policies() {
220219
return policies;
221220
}
222221

223-
public boolean isSingleRepositoryRequest() {
224-
return repositories.length == 1
225-
&& repositories[0] != null
226-
&& "_all".equals(repositories[0]) == false
227-
&& Regex.isSimpleMatchPattern(repositories[0]) == false;
228-
}
229-
230222
/**
231223
* Returns the names of the snapshots.
232224
*

server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/get/GetSnapshotsResponse.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import org.elasticsearch.common.io.stream.StreamOutput;
1717
import org.elasticsearch.common.xcontent.ChunkedToXContentObject;
1818
import org.elasticsearch.core.Nullable;
19+
import org.elasticsearch.core.UpdateForV9;
1920
import org.elasticsearch.snapshots.SnapshotInfo;
2021
import org.elasticsearch.xcontent.ToXContent;
2122

@@ -33,6 +34,7 @@ public class GetSnapshotsResponse extends ActionResponse implements ChunkedToXCo
3334

3435
private final List<SnapshotInfo> snapshots;
3536

37+
@UpdateForV9 // always empty, can be dropped
3638
private final Map<String, ElasticsearchException> failures;
3739

3840
@Nullable

server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/get/TransportGetSnapshotsAction.java

Lines changed: 14 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
package org.elasticsearch.action.admin.cluster.snapshots.get;
1010

11-
import org.elasticsearch.ElasticsearchException;
1211
import org.elasticsearch.action.ActionListener;
1312
import org.elasticsearch.action.ActionType;
1413
import org.elasticsearch.action.support.ActionFilters;
@@ -120,10 +119,14 @@ protected void masterOperation(
120119
) {
121120
assert task instanceof CancellableTask : task + " not cancellable";
122121

122+
final var resolvedRepositories = ResolvedRepositories.resolve(state, request.repositories());
123+
if (resolvedRepositories.hasMissingRepositories()) {
124+
throw new RepositoryMissingException(String.join(", ", resolvedRepositories.missing()));
125+
}
126+
123127
new GetSnapshotsOperation(
124128
(CancellableTask) task,
125-
ResolvedRepositories.resolve(state, request.repositories()),
126-
request.isSingleRepositoryRequest() == false,
129+
resolvedRepositories.repositoryMetadata(),
127130
request.snapshots(),
128131
request.ignoreUnavailable(),
129132
request.policies(),
@@ -151,7 +154,6 @@ private class GetSnapshotsOperation {
151154

152155
// repositories
153156
private final List<RepositoryMetadata> repositories;
154-
private final boolean isMultiRepoRequest;
155157

156158
// snapshots selection
157159
private final SnapshotNamePredicate snapshotNamePredicate;
@@ -179,7 +181,6 @@ private class GetSnapshotsOperation {
179181
private final GetSnapshotInfoExecutor getSnapshotInfoExecutor;
180182

181183
// results
182-
private final Map<String, ElasticsearchException> failuresByRepository = ConcurrentCollections.newConcurrentMap();
183184
private final Queue<List<SnapshotInfo>> allSnapshotInfos = ConcurrentCollections.newQueue();
184185

185186
/**
@@ -195,8 +196,7 @@ private class GetSnapshotsOperation {
195196

196197
GetSnapshotsOperation(
197198
CancellableTask cancellableTask,
198-
ResolvedRepositories resolvedRepositories,
199-
boolean isMultiRepoRequest,
199+
List<RepositoryMetadata> repositories,
200200
String[] snapshots,
201201
boolean ignoreUnavailable,
202202
String[] policies,
@@ -211,8 +211,7 @@ private class GetSnapshotsOperation {
211211
boolean indices
212212
) {
213213
this.cancellableTask = cancellableTask;
214-
this.repositories = resolvedRepositories.repositoryMetadata();
215-
this.isMultiRepoRequest = isMultiRepoRequest;
214+
this.repositories = repositories;
216215
this.ignoreUnavailable = ignoreUnavailable;
217216
this.sortBy = sortBy;
218217
this.order = order;
@@ -232,10 +231,6 @@ private class GetSnapshotsOperation {
232231
threadPool.info(ThreadPool.Names.SNAPSHOT_META).getMax(),
233232
cancellableTask::isCancelled
234233
);
235-
236-
for (final var missingRepo : resolvedRepositories.missing()) {
237-
failuresByRepository.put(missingRepo, new RepositoryMissingException(missingRepo));
238-
}
239234
}
240235

241236
void getMultipleReposSnapshotInfo(ActionListener<GetSnapshotsResponse> listener) {
@@ -249,6 +244,10 @@ void getMultipleReposSnapshotInfo(ActionListener<GetSnapshotsResponse> listener)
249244
continue;
250245
}
251246

247+
if (listeners.isFailing()) {
248+
return;
249+
}
250+
252251
SubscribableListener
253252

254253
.<RepositoryData>newForked(repositoryDataListener -> {
@@ -261,14 +260,7 @@ void getMultipleReposSnapshotInfo(ActionListener<GetSnapshotsResponse> listener)
261260

262261
.<Void>andThen((l, repositoryData) -> loadSnapshotInfos(repoName, repositoryData, l))
263262

264-
.addListener(listeners.acquire().delegateResponse((l, e) -> {
265-
if (isMultiRepoRequest && e instanceof ElasticsearchException elasticsearchException) {
266-
failuresByRepository.put(repoName, elasticsearchException);
267-
l.onResponse(null);
268-
} else {
269-
l.onFailure(e);
270-
}
271-
}));
263+
.addListener(listeners.acquire());
272264
}
273265
}
274266
})
@@ -503,7 +495,7 @@ private GetSnapshotsResponse buildResponse() {
503495
}
504496
return new GetSnapshotsResponse(
505497
snapshotInfos,
506-
failuresByRepository,
498+
null,
507499
remaining > 0 ? sortBy.encodeAfterQueryParam(snapshotInfos.get(snapshotInfos.size() - 1)) : null,
508500
totalCount.get(),
509501
remaining

0 commit comments

Comments
 (0)