Skip to content

Commit d008d24

Browse files
committed
HBASE-22193 Add backoff when region failed open too many times
1 parent 494a8ef commit d008d24

File tree

2 files changed

+24
-11
lines changed

2 files changed

+24
-11
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,9 @@ public class AssignmentManager {
131131
"hbase.assignment.maximum.attempts";
132132
private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE;
133133

134+
public static final String FAILED_OPEN_ATTEMPTS = "hbase.assignment.failed.open.attempts";
135+
private static final int DEFAULT_FAILED_OPEN_ATTEMPTS = 3;
136+
134137
/** Region in Transition metrics threshold time */
135138
public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD =
136139
"hbase.metrics.rit.stuck.warning.threshold";
@@ -151,6 +154,7 @@ public class AssignmentManager {
151154
private final int assignDispatchWaitQueueMaxSize;
152155
private final int assignDispatchWaitMillis;
153156
private final int assignMaxAttempts;
157+
private final int failedOpenAttempts;
154158

155159
private final Object checkIfShouldMoveSystemRegionLock = new Object();
156160

@@ -179,6 +183,7 @@ public AssignmentManager(final MasterServices master) {
179183

180184
this.assignMaxAttempts = Math.max(1, conf.getInt(ASSIGN_MAX_ATTEMPTS,
181185
DEFAULT_ASSIGN_MAX_ATTEMPTS));
186+
this.failedOpenAttempts = conf.getInt(FAILED_OPEN_ATTEMPTS, DEFAULT_FAILED_OPEN_ATTEMPTS);
182187

183188
int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY,
184189
DEFAULT_RIT_CHORE_INTERVAL_MSEC);
@@ -308,6 +313,10 @@ int getAssignMaxAttempts() {
308313
return assignMaxAttempts;
309314
}
310315

316+
int getFailedOpenAttempts() {
317+
return failedOpenAttempts;
318+
}
319+
311320
public RegionStates getRegionStates() {
312321
return regionStates;
313322
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -226,20 +226,32 @@ private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode)
226226
return Flow.HAS_MORE_STATE;
227227
}
228228

229-
if (incrementAndCheckMaxAttempts(env, regionNode)) {
229+
int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
230+
.incrementAndGetRetries();
231+
int maxAttempts = env.getAssignmentManager().getAssignMaxAttempts();
232+
LOG.info("Retry={} of max={}; {}; {}", retries, maxAttempts, this, regionNode.toShortString());
233+
234+
if (retries >= maxAttempts) {
230235
env.getAssignmentManager().regionFailedOpen(regionNode, true);
231236
setFailure(getClass().getSimpleName(), new RetriesExhaustedException(
232237
"Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded"));
233238
regionNode.unsetProcedure(this);
234239
return Flow.NO_MORE_STATE;
235240
}
241+
236242
env.getAssignmentManager().regionFailedOpen(regionNode, false);
237243
// we failed to assign the region, force a new plan
238244
forceNewPlan = true;
239245
regionNode.setRegionLocation(null);
240246
setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
241-
// Here we do not throw exception because we want to the region to be online ASAP
242-
return Flow.HAS_MORE_STATE;
247+
248+
if (retries > env.getAssignmentManager().getFailedOpenAttempts()) {
249+
// Throw an exception to back off and retry when the region has failed to open too many times
250+
throw new HBaseIOException("Failed to open region");
251+
} else {
252+
// Here we do not throw an exception because we want the region to be online ASAP
253+
return Flow.HAS_MORE_STATE;
254+
}
243255
}
244256

245257
private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) throws IOException {
@@ -400,14 +412,6 @@ void unattachRemoteProc(RegionRemoteProcedureBase proc) {
400412
this.remoteProc = null;
401413
}
402414

403-
private boolean incrementAndCheckMaxAttempts(MasterProcedureEnv env, RegionStateNode regionNode) {
404-
int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
405-
.incrementAndGetRetries();
406-
int max = env.getAssignmentManager().getAssignMaxAttempts();
407-
LOG.info("Retry={} of max={}; {}; {}", retries, max, this, regionNode.toShortString());
408-
return retries >= max;
409-
}
410-
411415
@Override
412416
protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state)
413417
throws IOException, InterruptedException {

0 commit comments

Comments
 (0)