Skip to content

Commit 16cbbab

Browse files
authored
[ILM] fix retry so it picks up latest policy and executes async action (#35406)
Before, moving to a failed step would only change the step info to be that of the failed step. This means two things. 1. Async Steps would never be triggered to execute 2. If there are inherent problems with the action definition that can be fixed with a policy update, these changes were not being reflected by the new execution info. Changes now 1. Async steps are executed after the move to the failed step in cluster state 2. the lifecycle execution info's phase definition is updated from the current latest policy definition, even though the index isn't moving to a new phase. Closes #35397.
1 parent 40ca62c commit 16cbbab

File tree

7 files changed

+127
-25
lines changed

7 files changed

+127
-25
lines changed

x-pack/plugin/ilm/qa/multi-node/src/test/java/org/elasticsearch/xpack/indexlifecycle/TimeSeriesLifecycleActionsIT.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.elasticsearch.xpack.core.indexlifecycle.ReadOnlyAction;
3131
import org.elasticsearch.xpack.core.indexlifecycle.RolloverAction;
3232
import org.elasticsearch.xpack.core.indexlifecycle.ShrinkAction;
33+
import org.elasticsearch.xpack.core.indexlifecycle.ShrinkStep;
3334
import org.elasticsearch.xpack.core.indexlifecycle.Step.StepKey;
3435
import org.elasticsearch.xpack.core.indexlifecycle.TerminalPolicyStep;
3536
import org.junit.Before;
@@ -178,6 +179,41 @@ public void testMoveToRolloverStep() throws Exception {
178179
assertBusy(() -> assertFalse(indexExists(shrunkenOriginalIndex)));
179180
}
180181

182+
public void testRetryFailedShrinkAction() throws Exception {
183+
int numShards = 6;
184+
int divisor = randomFrom(2, 3, 6);
185+
int expectedFinalShards = numShards / divisor;
186+
String shrunkenIndex = ShrinkAction.SHRUNKEN_INDEX_PREFIX + index;
187+
createIndexWithSettings(index, Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, numShards)
188+
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0));
189+
createNewSingletonPolicy("warm", new ShrinkAction(numShards + randomIntBetween(1, numShards)));
190+
updatePolicy(index, policy);
191+
assertBusy(() -> {
192+
String failedStep = getFailedStepForIndex(index);
193+
assertThat(failedStep, equalTo(ShrinkStep.NAME));
194+
});
195+
196+
// update policy to be correct
197+
createNewSingletonPolicy("warm", new ShrinkAction(expectedFinalShards));
198+
updatePolicy(index, policy);
199+
200+
// retry step
201+
Request retryRequest = new Request("POST", index + "/_ilm/retry");
202+
assertOK(client().performRequest(retryRequest));
203+
204+
// assert corrected policy is picked up and index is shrunken
205+
assertBusy(() -> {
206+
logger.error(explainIndex(index));
207+
assertTrue(indexExists(shrunkenIndex));
208+
assertTrue(aliasExists(shrunkenIndex, index));
209+
Map<String, Object> settings = getOnlyIndexSettings(shrunkenIndex);
210+
assertThat(getStepKeyForIndex(shrunkenIndex), equalTo(TerminalPolicyStep.KEY));
211+
assertThat(settings.get(IndexMetaData.SETTING_NUMBER_OF_SHARDS), equalTo(String.valueOf(expectedFinalShards)));
212+
assertThat(settings.get(IndexMetaData.INDEX_BLOCKS_WRITE_SETTING.getKey()), equalTo("true"));
213+
});
214+
expectThrows(ResponseException.class, this::indexDocument);
215+
}
216+
181217
public void testRolloverAction() throws Exception {
182218
String originalIndex = index + "-000001";
183219
String secondIndex = index + "-000002";

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/indexlifecycle/ExecuteStepsUpdateTask.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ public ClusterState execute(final ClusterState currentState) throws IOException
101101
return state;
102102
} else {
103103
state = IndexLifecycleRunner.moveClusterStateToNextStep(index, state, currentStep.getKey(),
104-
currentStep.getNextStepKey(), nowSupplier);
104+
currentStep.getNextStepKey(), nowSupplier, false);
105105
}
106106
} else {
107107
// cluster state wait step so evaluate the
@@ -125,7 +125,7 @@ public ClusterState execute(final ClusterState currentState) throws IOException
125125
return state;
126126
} else {
127127
state = IndexLifecycleRunner.moveClusterStateToNextStep(index, state, currentStep.getKey(),
128-
currentStep.getNextStepKey(), nowSupplier);
128+
currentStep.getNextStepKey(), nowSupplier, false);
129129
}
130130
} else {
131131
logger.trace("[{}] condition not met ({}) [{}], returning existing state",

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/indexlifecycle/IndexLifecycleRunner.java

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -271,11 +271,13 @@ static Step getCurrentStep(PolicyStepsRegistry stepRegistry, String policy, Inde
271271
* @param nextStepKey The next step to move the index into
272272
* @param nowSupplier The current-time supplier for updating when steps changed
273273
* @param stepRegistry The steps registry to check a step-key's existence in the index's current policy
274+
* @param forcePhaseDefinitionRefresh When true, step information will be recompiled from the latest version of the
275+
* policy. Otherwise, existing phase definition is used.
274276
* @return The updated cluster state where the index moved to <code>nextStepKey</code>
275277
*/
276278
static ClusterState moveClusterStateToStep(String indexName, ClusterState currentState, StepKey currentStepKey,
277279
StepKey nextStepKey, LongSupplier nowSupplier,
278-
PolicyStepsRegistry stepRegistry) {
280+
PolicyStepsRegistry stepRegistry, boolean forcePhaseDefinitionRefresh) {
279281
IndexMetaData idxMeta = currentState.getMetaData().index(indexName);
280282
Settings indexSettings = idxMeta.getSettings();
281283
String indexPolicySetting = LifecycleSettings.LIFECYCLE_NAME_SETTING.get(indexSettings);
@@ -295,18 +297,19 @@ static ClusterState moveClusterStateToStep(String indexName, ClusterState curren
295297
"] with policy [" + indexPolicySetting + "] does not exist");
296298
}
297299

298-
return IndexLifecycleRunner.moveClusterStateToNextStep(idxMeta.getIndex(), currentState, currentStepKey, nextStepKey, nowSupplier);
300+
return IndexLifecycleRunner.moveClusterStateToNextStep(idxMeta.getIndex(), currentState, currentStepKey,
301+
nextStepKey, nowSupplier, forcePhaseDefinitionRefresh);
299302
}
300303

301304
static ClusterState moveClusterStateToNextStep(Index index, ClusterState clusterState, StepKey currentStep, StepKey nextStep,
302-
LongSupplier nowSupplier) {
305+
LongSupplier nowSupplier, boolean forcePhaseDefinitionRefresh) {
303306
IndexMetaData idxMeta = clusterState.getMetaData().index(index);
304307
IndexLifecycleMetadata ilmMeta = clusterState.metaData().custom(IndexLifecycleMetadata.TYPE);
305308
LifecyclePolicyMetadata policyMetadata = ilmMeta.getPolicyMetadatas()
306309
.get(LifecycleSettings.LIFECYCLE_NAME_SETTING.get(idxMeta.getSettings()));
307310
LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(idxMeta);
308311
LifecycleExecutionState newLifecycleState = moveExecutionStateToNextStep(policyMetadata,
309-
lifecycleState, currentStep, nextStep, nowSupplier);
312+
lifecycleState, currentStep, nextStep, nowSupplier, forcePhaseDefinitionRefresh);
310313
ClusterState.Builder newClusterStateBuilder = newClusterStateWithLifecycleState(index, clusterState, newLifecycleState);
311314

312315
return newClusterStateBuilder.build();
@@ -324,7 +327,7 @@ static ClusterState moveClusterStateToErrorStep(Index index, ClusterState cluste
324327
causeXContentBuilder.endObject();
325328
LifecycleExecutionState nextStepState = moveExecutionStateToNextStep(policyMetadata,
326329
LifecycleExecutionState.fromIndexMetadata(idxMeta), currentStep, new StepKey(currentStep.getPhase(),
327-
currentStep.getAction(), ErrorStep.NAME), nowSupplier);
330+
currentStep.getAction(), ErrorStep.NAME), nowSupplier, false);
328331
LifecycleExecutionState.Builder failedState = LifecycleExecutionState.builder(nextStepState);
329332
failedState.setFailedStep(currentStep.getName());
330333
failedState.setStepInfo(BytesReference.bytes(causeXContentBuilder).utf8ToString());
@@ -343,9 +346,9 @@ ClusterState moveClusterStateToFailedStep(ClusterState currentState, String[] in
343346
StepKey currentStepKey = IndexLifecycleRunner.getCurrentStepKey(lifecycleState);
344347
String failedStep = lifecycleState.getFailedStep();
345348
if (currentStepKey != null && ErrorStep.NAME.equals(currentStepKey.getName())
346-
&& Strings.isNullOrEmpty(failedStep) == false) {
349+
&& Strings.isNullOrEmpty(failedStep) == false) {
347350
StepKey nextStepKey = new StepKey(currentStepKey.getPhase(), currentStepKey.getAction(), failedStep);
348-
newState = moveClusterStateToStep(index, currentState, currentStepKey, nextStepKey, nowSupplier, stepRegistry);
351+
newState = moveClusterStateToStep(index, currentState, currentStepKey, nextStepKey, nowSupplier, stepRegistry, true);
349352
} else {
350353
throw new IllegalArgumentException("cannot retry an action for an index ["
351354
+ index + "] that has not encountered an error when running a Lifecycle Policy");
@@ -357,7 +360,8 @@ ClusterState moveClusterStateToFailedStep(ClusterState currentState, String[] in
357360
private static LifecycleExecutionState moveExecutionStateToNextStep(LifecyclePolicyMetadata policyMetadata,
358361
LifecycleExecutionState existingState,
359362
StepKey currentStep, StepKey nextStep,
360-
LongSupplier nowSupplier) {
363+
LongSupplier nowSupplier,
364+
boolean forcePhaseDefinitionRefresh) {
361365
long nowAsMillis = nowSupplier.getAsLong();
362366
LifecycleExecutionState.Builder updatedState = LifecycleExecutionState.builder(existingState);
363367
updatedState.setPhase(nextStep.getPhase());
@@ -369,7 +373,7 @@ private static LifecycleExecutionState moveExecutionStateToNextStep(LifecyclePol
369373
updatedState.setFailedStep(null);
370374
updatedState.setStepInfo(null);
371375

372-
if (currentStep.getPhase().equals(nextStep.getPhase()) == false) {
376+
if (currentStep.getPhase().equals(nextStep.getPhase()) == false || forcePhaseDefinitionRefresh) {
373377
final String newPhaseDefinition;
374378
final Phase nextPhase;
375379
if ("new".equals(nextStep.getPhase()) || TerminalPolicyStep.KEY.equals(nextStep)) {

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/indexlifecycle/IndexLifecycleService.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ public void maybeRunAsyncAction(ClusterState clusterState, IndexMetaData indexMe
8484

8585
public ClusterState moveClusterStateToStep(ClusterState currentState, String indexName, StepKey currentStepKey, StepKey nextStepKey) {
8686
return IndexLifecycleRunner.moveClusterStateToStep(indexName, currentState, currentStepKey, nextStepKey,
87-
nowSupplier, policyRegistry);
87+
nowSupplier, policyRegistry, false);
8888
}
8989

9090
public ClusterState moveClusterStateToFailedStep(ClusterState currentState, String[] indices) {

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/indexlifecycle/MoveToNextStepUpdateTask.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ public ClusterState execute(ClusterState currentState) {
6868
if (policy.equals(LifecycleSettings.LIFECYCLE_NAME_SETTING.get(indexSettings))
6969
&& currentStepKey.equals(IndexLifecycleRunner.getCurrentStepKey(indexILMData))) {
7070
logger.trace("moving [{}] to next step ({})", index.getName(), nextStepKey);
71-
return IndexLifecycleRunner.moveClusterStateToNextStep(index, currentState, currentStepKey, nextStepKey, nowSupplier);
71+
return IndexLifecycleRunner.moveClusterStateToNextStep(index, currentState, currentStepKey, nextStepKey, nowSupplier, false);
7272
} else {
7373
// either the policy has changed or the step is now
7474
// not the same as when we submitted the update task. In

x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/indexlifecycle/action/TransportRetryAction.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,14 @@
1313
import org.elasticsearch.cluster.ClusterState;
1414
import org.elasticsearch.cluster.block.ClusterBlockException;
1515
import org.elasticsearch.cluster.block.ClusterBlockLevel;
16+
import org.elasticsearch.cluster.metadata.IndexMetaData;
1617
import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
1718
import org.elasticsearch.cluster.service.ClusterService;
1819
import org.elasticsearch.common.inject.Inject;
1920
import org.elasticsearch.threadpool.ThreadPool;
2021
import org.elasticsearch.transport.TransportService;
22+
import org.elasticsearch.xpack.core.indexlifecycle.LifecycleExecutionState;
23+
import org.elasticsearch.xpack.core.indexlifecycle.Step.StepKey;
2124
import org.elasticsearch.xpack.core.indexlifecycle.action.RetryAction;
2225
import org.elasticsearch.xpack.core.indexlifecycle.action.RetryAction.Request;
2326
import org.elasticsearch.xpack.core.indexlifecycle.action.RetryAction.Response;
@@ -55,6 +58,22 @@ public ClusterState execute(ClusterState currentState) {
5558
return indexLifecycleService.moveClusterStateToFailedStep(currentState, request.indices());
5659
}
5760

61+
@Override
62+
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
63+
for (String index : request.indices()) {
64+
IndexMetaData idxMeta = newState.metaData().index(index);
65+
LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(idxMeta);
66+
StepKey retryStep = new StepKey(lifecycleState.getPhase(), lifecycleState.getAction(), lifecycleState.getStep());
67+
if (idxMeta == null) {
68+
// The index has somehow been deleted - there shouldn't be any opportunity for this to happen, but just in case.
69+
logger.debug("index [" + index + "] has been deleted after moving to step [" +
70+
lifecycleState.getStep() + "], skipping async action check");
71+
return;
72+
}
73+
indexLifecycleService.maybeRunAsyncAction(newState, idxMeta, retryStep);
74+
}
75+
}
76+
5877
@Override
5978
protected Response newResponse(boolean acknowledged) {
6079
return new Response(acknowledged);

0 commit comments

Comments
 (0)