164
164
import java .util .stream .StreamSupport ;
165
165
166
166
import static org .elasticsearch .index .mapper .SourceToParse .source ;
167
- import static org .elasticsearch .index .seqno .SequenceNumbers .NO_OPS_PERFORMED ;
168
167
import static org .elasticsearch .index .seqno .SequenceNumbers .UNASSIGNED_SEQ_NO ;
169
168
170
169
public class IndexShard extends AbstractIndexShardComponent implements IndicesClusterStateService .Shard {
@@ -1307,16 +1306,18 @@ public Engine.Result applyTranslogOperation(Translog.Operation operation, Engine
1307
1306
return result ;
1308
1307
}
1309
1308
1310
- // package-private for testing
1311
- int runTranslogRecovery (Engine engine , Translog .Snapshot snapshot ) throws IOException {
1312
- recoveryState .getTranslog ().totalOperations (snapshot .totalOperations ());
1313
- recoveryState .getTranslog ().totalOperationsOnStart (snapshot .totalOperations ());
1309
+ /**
1310
+ * Replays translog operations from the provided translog {@code snapshot} to the current engine using the given {@code origin}.
1311
+ * The callback {@code onOperationRecovered} is notified after each translog operation is replayed successfully.
1312
+ */
1313
+ int runTranslogRecovery (Engine engine , Translog .Snapshot snapshot , Engine .Operation .Origin origin ,
1314
+ Runnable onOperationRecovered ) throws IOException {
1314
1315
int opsRecovered = 0 ;
1315
1316
Translog .Operation operation ;
1316
1317
while ((operation = snapshot .next ()) != null ) {
1317
1318
try {
1318
1319
logger .trace ("[translog] recover op {}" , operation );
1319
- Engine .Result result = applyTranslogOperation (operation , Engine . Operation . Origin . LOCAL_TRANSLOG_RECOVERY );
1320
+ Engine .Result result = applyTranslogOperation (operation , origin );
1320
1321
switch (result .getResultType ()) {
1321
1322
case FAILURE :
1322
1323
throw result .getFailure ();
@@ -1329,7 +1330,7 @@ int runTranslogRecovery(Engine engine, Translog.Snapshot snapshot) throws IOExce
1329
1330
}
1330
1331
1331
1332
opsRecovered ++;
1332
- recoveryState . getTranslog (). incrementRecoveredOperations ();
1333
+ onOperationRecovered . run ();
1333
1334
} catch (Exception e ) {
1334
1335
if (ExceptionsHelper .status (e ) == RestStatus .BAD_REQUEST ) {
1335
1336
// mainly for MapperParsingException and Failure to detect xcontent
@@ -1347,8 +1348,15 @@ int runTranslogRecovery(Engine engine, Translog.Snapshot snapshot) throws IOExce
1347
1348
* Operations from the translog will be replayed to bring lucene up to date.
1348
1349
**/
1349
1350
public void openEngineAndRecoverFromTranslog () throws IOException {
1351
+ final RecoveryState .Translog translogRecoveryStats = recoveryState .getTranslog ();
1352
+ final Engine .TranslogRecoveryRunner translogRecoveryRunner = (engine , snapshot ) -> { // runner handed to the engine below; also feeds the shard's translog recovery stats
1353
+ translogRecoveryStats .totalOperations (snapshot .totalOperations ()); // record the snapshot's operation count as the total to recover
1354
+ translogRecoveryStats .totalOperationsOnStart (snapshot .totalOperations ()); // same count is recorded as the "on start" total
1355
+ return runTranslogRecovery (engine , snapshot , Engine .Operation .Origin .LOCAL_TRANSLOG_RECOVERY , // replay with the local-translog-recovery origin
1356
+ translogRecoveryStats ::incrementRecoveredOperations ); // callback: bump the recovered-operations counter after each replayed operation
1357
+ };
1350
1358
innerOpenEngineAndTranslog (); // the engine is opened here, before the replay is requested
1351
- getEngine ().recoverFromTranslog (this :: runTranslogRecovery , Long .MAX_VALUE );
1359
+ getEngine ().recoverFromTranslog (translogRecoveryRunner , Long .MAX_VALUE ); // NOTE(review): Long.MAX_VALUE presumably means "no upper bound" on what is replayed - confirm against Engine#recoverFromTranslog
1352
1360
}
1353
1361
1354
1362
/**
@@ -1386,11 +1394,7 @@ private void innerOpenEngineAndTranslog() throws IOException {
1386
1394
final String translogUUID = store .readLastCommittedSegmentsInfo ().getUserData ().get (Translog .TRANSLOG_UUID_KEY );
1387
1395
final long globalCheckpoint = Translog .readGlobalCheckpoint (translogConfig .getTranslogPath (), translogUUID );
1388
1396
replicationTracker .updateGlobalCheckpointOnReplica (globalCheckpoint , "read from translog checkpoint" );
1389
-
1390
- assertMaxUnsafeAutoIdInCommit ();
1391
-
1392
- final long minRetainedTranslogGen = Translog .readMinTranslogGeneration (translogConfig .getTranslogPath (), translogUUID );
1393
- store .trimUnsafeCommits (globalCheckpoint , minRetainedTranslogGen , config .getIndexSettings ().getIndexVersionCreated ());
1397
+ trimUnsafeCommits ();
1394
1398
1395
1399
createNewEngine (config );
1396
1400
verifyNotClosed ();
@@ -1401,6 +1405,15 @@ private void innerOpenEngineAndTranslog() throws IOException {
1401
1405
assert recoveryState .getStage () == RecoveryState .Stage .TRANSLOG : "TRANSLOG stage expected but was: " + recoveryState .getStage ();
1402
1406
}
1403
1407
1408
+ private void trimUnsafeCommits () throws IOException { // delegates to store.trimUnsafeCommits using values read from the last commit and the translog directory
1409
+ assert currentEngineReference .get () == null : "engine is running" ; // precondition: must only be called while no engine is referenced
1410
+ final String translogUUID = store .readLastCommittedSegmentsInfo ().getUserData ().get (Translog .TRANSLOG_UUID_KEY ); // translog UUID comes from the user data of the last committed segments info
1411
+ final long globalCheckpoint = Translog .readGlobalCheckpoint (translogConfig .getTranslogPath (), translogUUID ); // read from the translog path, not from the in-memory tracker
1412
+ final long minRetainedTranslogGen = Translog .readMinTranslogGeneration (translogConfig .getTranslogPath (), translogUUID );
1413
+ assertMaxUnsafeAutoIdInCommit (); // runs before the store is trimmed
1414
+ store .trimUnsafeCommits (globalCheckpoint , minRetainedTranslogGen , indexSettings .getIndexVersionCreated ());
1415
+ }
1416
+
1404
1417
private boolean assertSequenceNumbersInCommit () throws IOException {
1405
1418
final Map <String , String > userData = SegmentInfos .readLatestCommit (store .directory ()).getUserData ();
1406
1419
assert userData .containsKey (SequenceNumbers .LOCAL_CHECKPOINT_KEY ) : "commit point doesn't contains a local checkpoint" ;
@@ -1501,7 +1514,7 @@ private void ensureWriteAllowed(Engine.Operation.Origin origin) throws IllegalIn
1501
1514
if (origin == Engine .Operation .Origin .PRIMARY ) {
1502
1515
assert assertPrimaryMode ();
1503
1516
} else {
1504
- assert origin == Engine .Operation .Origin .REPLICA ;
1517
+ assert origin == Engine .Operation .Origin .REPLICA || origin == Engine . Operation . Origin . LOCAL_RESET ;
1505
1518
assert assertReplicationTarget ();
1506
1519
}
1507
1520
if (writeAllowedStates .contains (state ) == false ) {
@@ -2207,9 +2220,7 @@ public void onFailedEngine(String reason, @Nullable Exception failure) {
2207
2220
2208
2221
private Engine createNewEngine (EngineConfig config ) {
2209
2222
synchronized (mutex ) {
2210
- if (state == IndexShardState .CLOSED ) {
2211
- throw new AlreadyClosedException (shardId + " can't create engine - shard is closed" );
2212
- }
2223
+ verifyNotClosed ();
2213
2224
assert this .currentEngineReference .get () == null ;
2214
2225
Engine engine = newEngine (config );
2215
2226
onNewEngine (engine ); // call this before we pass the memory barrier otherwise actions that happen
@@ -2355,19 +2366,14 @@ public void acquireReplicaOperationPermit(final long opPrimaryTerm, final long g
2355
2366
bumpPrimaryTerm (opPrimaryTerm , () -> {
2356
2367
updateGlobalCheckpointOnReplica (globalCheckpoint , "primary term transition" );
2357
2368
final long currentGlobalCheckpoint = getGlobalCheckpoint ();
2358
- final long localCheckpoint ;
2359
- if (currentGlobalCheckpoint == UNASSIGNED_SEQ_NO ) {
2360
- localCheckpoint = NO_OPS_PERFORMED ;
2369
+ final long maxSeqNo = seqNoStats ().getMaxSeqNo ();
2370
+ logger .info ("detected new primary with primary term [{}], global checkpoint [{}], max_seq_no [{}]" ,
2371
+ opPrimaryTerm , currentGlobalCheckpoint , maxSeqNo );
2372
+ if (currentGlobalCheckpoint < maxSeqNo ) {
2373
+ resetEngineToGlobalCheckpoint ();
2361
2374
} else {
2362
- localCheckpoint = currentGlobalCheckpoint ;
2375
+ getEngine (). rollTranslogGeneration () ;
2363
2376
}
2364
- logger .trace (
2365
- "detected new primary with primary term [{}], resetting local checkpoint from [{}] to [{}]" ,
2366
- opPrimaryTerm ,
2367
- getLocalCheckpoint (),
2368
- localCheckpoint );
2369
- getEngine ().resetLocalCheckpoint (localCheckpoint );
2370
- getEngine ().rollTranslogGeneration ();
2371
2377
});
2372
2378
}
2373
2379
}
@@ -2663,4 +2669,26 @@ public ParsedDocument newNoopTombstoneDoc(String reason) {
2663
2669
}
2664
2670
};
2665
2671
}
2672
+
2673
+ /**
2674
+ * Rolls back the current engine to the safe commit, then replays the local translog up to the global checkpoint. Expects no ongoing operations (asserted); closes the current engine, trims unsafe commits and creates a new engine under the shard mutex, then replays the translog with origin {@code LOCAL_RESET} outside the mutex.
2675
+ */
2676
+ void resetEngineToGlobalCheckpoint () throws IOException {
2677
+ assert getActiveOperationsCount () == 0 : "Ongoing writes [" + getActiveOperations () + "]" ;
2678
+ sync (); // persist the global checkpoint to disk
2679
+ final long globalCheckpoint = getGlobalCheckpoint (); // captured before the engine is swapped; passed to recoverFromTranslog below
2680
+ final Engine newEngine ;
2681
+ synchronized (mutex ) {
2682
+ verifyNotClosed ();
2683
+ IOUtils .close (currentEngineReference .getAndSet (null )); // close the old engine and clear the reference; trimUnsafeCommits asserts that it is null
2684
+ trimUnsafeCommits ();
2685
+ newEngine = createNewEngine (newEngineConfig ());
2686
+ active .set (true );
2687
+ }
2688
+ final Engine .TranslogRecoveryRunner translogRunner = (engine , snapshot ) -> runTranslogRecovery (
2689
+ engine , snapshot , Engine .Operation .Origin .LOCAL_RESET , () -> {
2690
+ // TODO: add dedicated recovery stats for the reset translog (the per-operation callback is intentionally a no-op for now)
2691
+ });
2692
+ newEngine .recoverFromTranslog (translogRunner , globalCheckpoint ); // replay runs after the synchronized block has been left
2693
+ }
2666
2694
}
0 commit comments