Skip to content

Manage retention of failed snapshots in SLM #47617

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.snapshots.SnapshotInfo;
import org.elasticsearch.snapshots.SnapshotState;

import java.io.IOException;
import java.util.Comparator;
Expand All @@ -29,6 +30,7 @@
import java.util.function.LongSupplier;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class SnapshotRetentionConfiguration implements ToXContentObject, Writeable {

Expand Down Expand Up @@ -113,33 +115,48 @@ public Integer getMaximumSnapshotCount() {
* @param allSnapshots a list of all snapshot pertaining to this SLM policy and repository
*/
public Predicate<SnapshotInfo> getSnapshotDeletionPredicate(final List<SnapshotInfo> allSnapshots) {
final int snapCount = allSnapshots.size();
List<SnapshotInfo> sortedSnapshots = allSnapshots.stream()
final int totalSnapshotCount = allSnapshots.size();
final List<SnapshotInfo> sortedSnapshots = allSnapshots.stream()
.sorted(Comparator.comparingLong(SnapshotInfo::startTime))
.collect(Collectors.toList());
final long successfulSnapshotCount = allSnapshots.stream()
.filter(snap -> SnapshotState.SUCCESS.equals(snap.state()))
.count();
final long newestSuccessfulTimestamp = allSnapshots.stream()
.filter(snap -> SnapshotState.SUCCESS.equals(snap.state()))
.mapToLong(SnapshotInfo::startTime)
.max()
.orElse(Long.MIN_VALUE);

return si -> {
final String snapName = si.snapshotId().getName();

// First, enforce the maximum count, if the size is over the maximum number of
// First, if there's no expire_after and a more recent successful snapshot, we can delete all the failed ones
if (this.expireAfter == null && SnapshotState.FAILED.equals(si.state()) && newestSuccessfulTimestamp > si.startTime()) {
// There's no expire_after and there's a more recent successful snapshot, delete this failed one
logger.trace("[{}]: ELIGIBLE as it is FAILED and there is a more recent successful snapshot", snapName);
return true;
}

// Next, enforce the maximum count, if the size is over the maximum number of
// snapshots, then allow the oldest N (where N is the number over the maximum snapshot
// count) snapshots to be eligible for deletion
if (this.maximumSnapshotCount != null) {
if (allSnapshots.size() > this.maximumSnapshotCount) {
int snapsToDelete = allSnapshots.size() - this.maximumSnapshotCount;
boolean eligible = sortedSnapshots.stream()
if (successfulSnapshotCount > this.maximumSnapshotCount) {
final long snapsToDelete = successfulSnapshotCount - this.maximumSnapshotCount;
final boolean eligible = sortedSnapshots.stream()
.limit(snapsToDelete)
.anyMatch(s -> s.equals(si));

if (eligible) {
logger.trace("[{}]: ELIGIBLE as it is one of the {} oldest snapshots with " +
"{} total snapshots, over the limit of {} maximum snapshots",
snapName, snapsToDelete, snapCount, this.maximumSnapshotCount);
"{} non-failed snapshots ({} total), over the limit of {} maximum snapshots",
snapName, snapsToDelete, successfulSnapshotCount, totalSnapshotCount, this.maximumSnapshotCount);
return true;
} else {
logger.trace("[{}]: INELIGIBLE as it is not one of the {} oldest snapshots with " +
"{} total snapshots, over the limit of {} maximum snapshots",
snapName, snapsToDelete, snapCount, this.maximumSnapshotCount);
"{} non-failed snapshots ({} total), over the limit of {} maximum snapshots",
snapName, snapsToDelete, successfulSnapshotCount, totalSnapshotCount, this.maximumSnapshotCount);
return false;
}
}
Expand All @@ -149,25 +166,34 @@ public Predicate<SnapshotInfo> getSnapshotDeletionPredicate(final List<SnapshotI
// if we haven't hit the minimum then we need to keep the snapshot regardless of
// expiration time
if (this.minimumSnapshotCount != null) {
if (allSnapshots.size() <= this.minimumSnapshotCount) {
logger.trace("[{}]: INELIGIBLE as there are {} snapshots and {} minimum snapshots needed",
snapName, snapCount, this.minimumSnapshotCount);
return false;
}
if (successfulSnapshotCount <= this.minimumSnapshotCount)
if (SnapshotState.FAILED.equals(si.state()) == false) {
logger.trace("[{}]: INELIGIBLE as there are {} non-failed snapshots ({} total) and {} minimum snapshots needed",
snapName, successfulSnapshotCount, totalSnapshotCount, this.minimumSnapshotCount);
return false;
} else {
logger.trace("[{}]: SKIPPING minimum snapshot count check as this snapshot is {} and not counted " +
"towards the minimum snapshot count.", snapName, SnapshotState.FAILED);
}
}

// Finally, check the expiration time of the snapshot, if it is past, then it is
// eligible for deletion
if (this.expireAfter != null) {
TimeValue snapshotAge = new TimeValue(nowSupplier.getAsLong() - si.startTime());
final TimeValue snapshotAge = new TimeValue(nowSupplier.getAsLong() - si.startTime());

if (this.minimumSnapshotCount != null) {
int eligibleForExpiration = snapCount - minimumSnapshotCount;
final long eligibleForExpiration = successfulSnapshotCount - minimumSnapshotCount;

// Only the oldest N snapshots are actually eligible, since if we went below this we
// would fall below the configured minimum number of snapshots to keep
Set<SnapshotInfo> snapsEligibleForExpiration = sortedSnapshots.stream()
.limit(eligibleForExpiration)
final Stream<SnapshotInfo> successfulSnapsEligibleForExpiration = sortedSnapshots.stream()
.filter(snap -> SnapshotState.SUCCESS.equals(snap.state()))
.limit(eligibleForExpiration);
final Stream<SnapshotInfo> failedSnaps = sortedSnapshots.stream()
.filter(snap -> SnapshotState.FAILED.equals(snap.state()));

final Set<SnapshotInfo> snapsEligibleForExpiration = Stream.concat(successfulSnapsEligibleForExpiration, failedSnaps)
.collect(Collectors.toSet());

if (snapsEligibleForExpiration.contains(si) == false) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
package org.elasticsearch.xpack.slm;

import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.snapshots.SnapshotId;
import org.elasticsearch.snapshots.SnapshotInfo;
import org.elasticsearch.snapshots.SnapshotShardFailure;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.xpack.core.slm.SnapshotLifecyclePolicy;
import org.elasticsearch.xpack.core.slm.SnapshotRetentionConfiguration;
Expand Down Expand Up @@ -100,10 +102,137 @@ public void testMaximum() {
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s9), equalTo(false));
}

public void testFailuresDeletedIfExpired() {
SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(
() -> TimeValue.timeValueDays(1).millis() + 1,
TimeValue.timeValueDays(1), null, null);
SnapshotInfo oldInfo = makeFailureInfo(0);
assertThat(conf.getSnapshotDeletionPredicate(Collections.singletonList(oldInfo)).test(oldInfo), equalTo(true));

SnapshotInfo newInfo = makeFailureInfo(1);
assertThat(conf.getSnapshotDeletionPredicate(Collections.singletonList(newInfo)).test(newInfo), equalTo(false));

List<SnapshotInfo> infos = new ArrayList<>();
infos.add(newInfo);
infos.add(oldInfo);
assertThat(conf.getSnapshotDeletionPredicate(infos).test(newInfo), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(oldInfo), equalTo(true));
}

public void testFailuresDeletedIfNoExpiryAndMoreRecentSuccessExists() {
SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> 1, null, 2, 5);
SnapshotInfo s1 = makeInfo(1);
SnapshotInfo s2 = makeInfo(2);
SnapshotInfo s3 = makeFailureInfo(3);
SnapshotInfo s4 = makeInfo(4);

List<SnapshotInfo> infos = Arrays.asList(s1 , s2, s3, s4);
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s1), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s2), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s3), equalTo(true));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s4), equalTo(false));
}

public void testFailuresKeptIfNoExpiryAndNoMoreRecentSuccess() {
// Also tests that failures are not counted towards the maximum
SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> 1, null, 2, 3);
SnapshotInfo s1 = makeInfo(1);
SnapshotInfo s2 = makeInfo(2);
SnapshotInfo s3 = makeInfo(3);
SnapshotInfo s4 = makeFailureInfo(4);

List<SnapshotInfo> infos = Arrays.asList(s1 , s2, s3, s4);
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s1), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s2), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s3), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s4), equalTo(false));
}

public void testFailuresNotCountedTowardsMaximum() {
SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> 1, TimeValue.timeValueDays(1), 2, 2);
SnapshotInfo s1 = makeInfo(1);
SnapshotInfo s2 = makeFailureInfo(2);
SnapshotInfo s3 = makeFailureInfo(3);
SnapshotInfo s4 = makeFailureInfo(4);
SnapshotInfo s5 = makeInfo(5);

List<SnapshotInfo> infos = Arrays.asList(s1 , s2, s3, s4, s5);
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s1), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s2), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s3), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s4), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s5), equalTo(false));
}

public void testFailuresNotCountedTowardsMinimum() {
SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> TimeValue.timeValueDays(1).millis() + 1,
TimeValue.timeValueDays(1), 2, null);
SnapshotInfo oldInfo = makeInfo(0);
SnapshotInfo failureInfo = makeFailureInfo( 1);
SnapshotInfo newInfo = makeInfo(2);

List<SnapshotInfo> infos = new ArrayList<>();
infos.add(newInfo);
infos.add(failureInfo);
infos.add(oldInfo);
assertThat(conf.getSnapshotDeletionPredicate(infos).test(newInfo), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(failureInfo), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(oldInfo), equalTo(false));

conf = new SnapshotRetentionConfiguration(() -> TimeValue.timeValueDays(1).millis() + 2,
TimeValue.timeValueDays(1), 1, null);
assertThat(conf.getSnapshotDeletionPredicate(infos).test(newInfo), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(failureInfo), equalTo(true));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(oldInfo), equalTo(true));
}

public void testMostRecentSuccessfulTimestampIsUsed() {
SnapshotRetentionConfiguration conf = new SnapshotRetentionConfiguration(() -> 1, null, 2, 2);
SnapshotInfo s1 = makeInfo(1);
SnapshotInfo s2 = makeInfo(2);
SnapshotInfo s3 = makeFailureInfo(3);
SnapshotInfo s4 = makeFailureInfo(4);

List<SnapshotInfo> infos = Arrays.asList(s1 , s2, s3, s4);
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s1), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s2), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s3), equalTo(false));
assertThat(conf.getSnapshotDeletionPredicate(infos).test(s4), equalTo(false));
}

private SnapshotInfo makeInfo(long startTime) {
final Map<String, Object> meta = new HashMap<>();
meta.put(SnapshotLifecyclePolicy.POLICY_ID_METADATA_FIELD, REPO);
final int totalShards = between(1,20);
return new SnapshotInfo(new SnapshotId("snap-" + randomAlphaOfLength(3), "uuid"),
Collections.singletonList("foo"), startTime, false, meta);
Collections.singletonList("foo"),
startTime,
null,
startTime + between(1,10000),
totalShards,
new ArrayList<>(),
false,
meta);
}

private SnapshotInfo makeFailureInfo(long startTime) {
final Map<String, Object> meta = new HashMap<>();
meta.put(SnapshotLifecyclePolicy.POLICY_ID_METADATA_FIELD, REPO);
final int totalShards = between(1,20);
final List<SnapshotShardFailure> failures = new ArrayList<>();
final int failureCount = between(1,totalShards);
for (int i = 0; i < failureCount; i++) {
failures.add(new SnapshotShardFailure("nodeId", new ShardId("index-name", "index-uuid", i), "failed"));
}
assert failureCount == failures.size();
return new SnapshotInfo(new SnapshotId("snap-fail-" + randomAlphaOfLength(3), "uuid-fail"),
Collections.singletonList("foo-fail"),
startTime,
"forced-failure",
startTime + between(1,10000),
totalShards,
failures,
randomBoolean(),
meta);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ public void triggered(SchedulerEngine.Event event) {
// Finally, asynchronously retrieve all the snapshots, deleting them serially,
// before updating the cluster state with the new metrics and setting 'running'
// back to false
getAllSuccessfulSnapshots(repositioriesToFetch, new ActionListener<>() {
getAllRetainableSnapshots(repositioriesToFetch, new ActionListener<>() {
@Override
public void onResponse(Map<String, List<SnapshotInfo>> allSnapshots) {
try {
Expand Down Expand Up @@ -222,7 +222,7 @@ static boolean snapshotEligibleForDeletion(SnapshotInfo snapshot, Map<String, Li
return eligible;
}

void getAllSuccessfulSnapshots(Collection<String> repositories, ActionListener<Map<String, List<SnapshotInfo>>> listener,
void getAllRetainableSnapshots(Collection<String> repositories, ActionListener<Map<String, List<SnapshotInfo>>> listener,
Consumer<Exception> errorHandler) {
if (repositories.isEmpty()) {
// Skip retrieving anything if there are no repositories to fetch
Expand All @@ -236,11 +236,12 @@ void getAllSuccessfulSnapshots(Collection<String> repositories, ActionListener<M
@Override
public void onResponse(final GetSnapshotsResponse resp) {
Map<String, List<SnapshotInfo>> snapshots = new HashMap<>();
final Set<SnapshotState> retainableStates = Set.of(SnapshotState.SUCCESS, SnapshotState.FAILED);
repositories.forEach(repo -> {
snapshots.put(repo,
// Only return snapshots in the SUCCESS state
resp.getSnapshots(repo).stream()
.filter(info -> info.state() == SnapshotState.SUCCESS)
.filter(info -> retainableStates.contains(info.state()))
.collect(Collectors.toList()));
});
listener.onResponse(snapshots);
Expand Down
Loading