-
Notifications
You must be signed in to change notification settings - Fork 3.4k
HBASE-28192 Master should recover if meta region state is inconsistent #5513
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -316,6 +316,15 @@ public class HMaster extends HBaseServerBase<MasterRpcServices> implements Maste | |
// instance into a web context !! AND OTHER PLACES !! | ||
public static final String MASTER = "master"; | ||
|
||
// for meta/namespace region, if SCP is scheduled by master, num of retries to perform | ||
// before giving up. | ||
private static final String HBASE_MASTER_REGION_SCHEDULE_RECOVERY_WAIT_RETRIES = | ||
"hbase.master.region.schedule.recovery.wait.retries"; | ||
// for meta/namespace region, if SCP is scheduled by master, retry interval in ms to wait | ||
// for before retrying again. | ||
private static final String HBASE_MASTER_REGION_SCHEDULE_RECOVERY_WAIT_INTERVAL_MS = | ||
"hbase.master.region.schedule.recovery.wait.interval.ms"; | ||
|
||
// Manager and zk listener for master election | ||
private final ActiveMasterManager activeMasterManager; | ||
// Region server tracker | ||
|
@@ -1399,23 +1408,75 @@ private void createMissingCFsInMetaDuringUpgrade(TableDescriptor metaDescriptor) | |
* Check hbase:meta is up and ready for reading. For use during Master startup only. | ||
* @return True if meta is UP and online and startup can progress. Otherwise, meta is not online | ||
* and we will hold here until operator intervention. | ||
* @throws IOException If the master restart is required. | ||
*/ | ||
@InterfaceAudience.Private | ||
public boolean waitForMetaOnline() { | ||
public boolean waitForMetaOnline() throws IOException { | ||
return isRegionOnline(RegionInfoBuilder.FIRST_META_REGIONINFO); | ||
} | ||
|
||
/** | ||
* Wait until the region is reported online on a live regionserver. | ||
* @param ri Region info. | ||
* @return True if region is online and scannable else false if an error or shutdown (Otherwise we | ||
* just block in here holding up all forward-progess). | ||
* @throws IOException If the master restart is required. | ||
*/ | ||
private boolean isRegionOnline(RegionInfo ri) { | ||
private boolean isRegionOnline(RegionInfo ri) throws IOException { | ||
RetryCounter rc = null; | ||
while (!isStopped()) { | ||
RegionState rs = this.assignmentManager.getRegionStates().getRegionState(ri); | ||
if (rs != null && rs.isOpened()) { | ||
if (this.getServerManager().isServerOnline(rs.getServerName())) { | ||
return true; | ||
} else { | ||
ServerName serverNameForRegion = rs.getServerName(); | ||
Optional<Procedure<MasterProcedureEnv>> scpForServer = | ||
this.procedureExecutor.getProcedures().stream() | ||
.filter(p -> p instanceof ServerCrashProcedure | ||
&& serverNameForRegion.equals(((ServerCrashProcedure) p).getServerName())) | ||
.findFirst(); | ||
if (!scpForServer.isPresent()) { | ||
LOG.info("{} has state {} but the server {} is not online, scheduling recovery.", | ||
ri.getRegionNameAsString(), rs, rs.getServerName()); | ||
this.getServerManager().expireServer(rs.getServerName(), true); | ||
int numRetries = this.getConfiguration() | ||
.getInt(HBASE_MASTER_REGION_SCHEDULE_RECOVERY_WAIT_RETRIES, 20); | ||
int sleepInterval = this.getConfiguration() | ||
.getInt(HBASE_MASTER_REGION_SCHEDULE_RECOVERY_WAIT_INTERVAL_MS, 2000); | ||
while (numRetries > 0) { | ||
scpForServer = this.procedureExecutor.getProcedures().stream() | ||
.filter(p -> p instanceof ServerCrashProcedure | ||
&& serverNameForRegion.equals(((ServerCrashProcedure) p).getServerName())) | ||
.findFirst(); | ||
if (scpForServer.isPresent()) { | ||
ServerCrashProcedure proc = (ServerCrashProcedure) scpForServer.get(); | ||
if (proc.isFinished() || proc.isSuccess()) { | ||
rs = this.assignmentManager.getRegionStates().getRegionState(ri); | ||
if (rs != null && rs.isOpened()) { | ||
if (this.getServerManager().isServerOnline(rs.getServerName())) { | ||
return true; | ||
} | ||
} | ||
} | ||
} | ||
Threads.sleep(sleepInterval); | ||
numRetries--; | ||
} | ||
if (numRetries == 0) { | ||
rs = this.assignmentManager.getRegionStates().getRegionState(ri); | ||
if (rs != null && rs.isOpened()) { | ||
if (this.getServerManager().isServerOnline(rs.getServerName())) { | ||
return true; | ||
} | ||
} | ||
throw new PleaseRestartMasterException("Scheduled SCP for old server for region " | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the solution here is to "crash" master once we never find an SCP for this region's RS? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here we crash only because even after scheduling SCP for the region (meta or namespace), we still find the region not online. |
||
+ ri.getRegionNameAsString() + " could not be completed"); | ||
} | ||
} else { | ||
LOG.info("{} has state {} but the server {} is not online. Wait for SCP {} to complete", | ||
ri.getRegionNameAsString(), rs, rs.getServerName(), scpForServer.get()); | ||
} | ||
} | ||
} | ||
// Region is not OPEN. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What if an SCP is triggered and finished successfully whilst we are sleeping? Won't we miss the update and keep retrying?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That is also fine because eventually this will be successful:
So if the region (meta/namespace) was assigned successfully, we will return true from here eventually.