Skip to content

HBASE-22193 Add backoff when region failed open too many times #133

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 13, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@ public class AssignmentManager {
"hbase.assignment.maximum.attempts";
private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE;

/**
 * Configuration key for how many failed region opens are retried immediately. The value is
 * read once in the constructor; callers compare the failed-open retry count against it and,
 * once the count is greater, throw so that the retry backs off instead of running right away.
 */
public static final String ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS =
"hbase.assignment.retry.immediately.maximum.attempts";
/** Default for {@link #ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS} when the key is not configured. */
private static final int DEFAULT_ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS = 3;

/** Region in Transition metrics threshold time */
public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD =
"hbase.metrics.rit.stuck.warning.threshold";
Expand All @@ -151,6 +155,7 @@ public class AssignmentManager {
private final int assignDispatchWaitQueueMaxSize;
private final int assignDispatchWaitMillis;
// Maximum number of assign attempts; read from ASSIGN_MAX_ATTEMPTS and clamped to at least 1.
private final int assignMaxAttempts;
// Value of ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS (default 3), read once in the constructor.
private final int assignRetryImmediatelyMaxAttempts;

private final Object checkIfShouldMoveSystemRegionLock = new Object();

Expand Down Expand Up @@ -179,6 +184,8 @@ public AssignmentManager(final MasterServices master) {

this.assignMaxAttempts = Math.max(1, conf.getInt(ASSIGN_MAX_ATTEMPTS,
DEFAULT_ASSIGN_MAX_ATTEMPTS));
this.assignRetryImmediatelyMaxAttempts = conf.getInt(ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS,
DEFAULT_ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS);

int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY,
DEFAULT_RIT_CHORE_INTERVAL_MSEC);
Expand Down Expand Up @@ -308,6 +315,10 @@ int getAssignMaxAttempts() {
return assignMaxAttempts;
}

/**
 * Returns the configured number of failed opens that may be retried immediately.
 *
 * @return the value read from {@link #ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS} at construction
 *         time (no lower bound is applied to the configured value)
 */
int getAssignRetryImmediatelyMaxAttempts() {
return assignRetryImmediatelyMaxAttempts;
}

/**
 * Returns the {@code regionStates} instance held by this manager.
 *
 * @return the {@link RegionStates} field, exposed directly (not a copy)
 */
public RegionStates getRegionStates() {
return regionStates;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -226,20 +226,32 @@ private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode)
return Flow.HAS_MORE_STATE;
}

if (incrementAndCheckMaxAttempts(env, regionNode)) {
int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
.incrementAndGetRetries();
int maxAttempts = env.getAssignmentManager().getAssignMaxAttempts();
LOG.info("Retry={} of max={}; {}; {}", retries, maxAttempts, this, regionNode.toShortString());

if (retries >= maxAttempts) {
env.getAssignmentManager().regionFailedOpen(regionNode, true);
setFailure(getClass().getSimpleName(), new RetriesExhaustedException(
"Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded"));
regionNode.unsetProcedure(this);
return Flow.NO_MORE_STATE;
}

env.getAssignmentManager().regionFailedOpen(regionNode, false);
// we failed to assign the region, force a new plan
forceNewPlan = true;
regionNode.setRegionLocation(null);
setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
// Here we do not throw exception because we want the region to be online ASAP
return Flow.HAS_MORE_STATE;

if (retries > env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) {
// Throw exception to backoff and retry when failed open too many times
throw new HBaseIOException("Failed to open region");
} else {
// Here we do not throw exception because we want the region to be online ASAP
return Flow.HAS_MORE_STATE;
}
}

private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) throws IOException {
Expand Down Expand Up @@ -400,14 +412,6 @@ void unattachRemoteProc(RegionRemoteProcedureBase proc) {
this.remoteProc = null;
}

/**
 * Registers one more failed open for the given region and tells the caller whether the
 * configured attempt limit has now been reached.
 *
 * @param env procedure environment used to reach the assignment manager
 * @param regionNode the region whose failed-open retry counter is incremented
 * @return {@code true} if the incremented retry count is greater than or equal to the
 *         assignment manager's maximum attempts, {@code false} otherwise
 */
private boolean incrementAndCheckMaxAttempts(MasterProcedureEnv env, RegionStateNode regionNode) {
  // Bump the counter first so the logged value already includes the current failure.
  final int failedOpenCount =
    env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode).incrementAndGetRetries();
  final int attemptLimit = env.getAssignmentManager().getAssignMaxAttempts();
  LOG.info("Retry={} of max={}; {}; {}", failedOpenCount, attemptLimit, this,
    regionNode.toShortString());
  final boolean limitReached = failedOpenCount >= attemptLimit;
  return limitReached;
}

@Override
protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state)
throws IOException, InterruptedException {
Expand Down