Skip to content

HBASE-21521 Expose master startup status via web UI #3667

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,8 @@ AssignmentManager assignmentManager = master.getAssignmentManager();
<%if HBaseConfiguration.isShowConfInServlet()%>
<li><a href="/conf">HBase Configuration</a></li>
</%if>
<li><a href="/startupProgress.jsp">Startup Progress</a></li>

</ul>
</div><!--/.nav-collapse -->
</div>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@
import org.apache.hadoop.hbase.master.replication.UpdatePeerConfigProcedure;
import org.apache.hadoop.hbase.master.slowlog.SlowLogMasterService;
import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
import org.apache.hadoop.hbase.master.startupprogress.Phase;
import org.apache.hadoop.hbase.master.startupprogress.StartupProgressView;
import org.apache.hadoop.hbase.master.zksyncer.MasterAddressSyncer;
import org.apache.hadoop.hbase.master.zksyncer.MetaLocationSyncer;
import org.apache.hadoop.hbase.mob.MobFileCleanerChore;
Expand Down Expand Up @@ -427,6 +429,7 @@ public class HMaster extends HBaseServerBase<MasterRpcServices> implements Maste

// Cached clusterId on stand by masters to serve clusterID requests from clients.
private final CachedClusterId cachedClusterId;
private MonitoredTask startupStatusTask;

/**
* Initializes the HMaster. The steps are as follows:
Expand All @@ -437,7 +440,7 @@ public class HMaster extends HBaseServerBase<MasterRpcServices> implements Maste
* </ol>
* <p>
* Remaining steps of initialization occur in
* {@link #finishActiveMasterInitialization(MonitoredTask)} after the master becomes the
* {@link #finishActiveMasterInitialization()} after the master becomes the
* active one.
*/
public HMaster(final Configuration conf) throws IOException {
Expand Down Expand Up @@ -855,12 +858,12 @@ private void tryMigrateMetaLocationsFromZooKeeper() throws IOException, KeeperEx
* Notice that now we will not schedule a special procedure to make meta online(unless the first
* time where meta has not been created yet), we will rely on SCP to bring meta online.
*/
private void finishActiveMasterInitialization(MonitoredTask status) throws IOException,
private void finishActiveMasterInitialization() throws IOException,
InterruptedException, KeeperException, ReplicationException {
/*
* We are active master now... go initialize components we need to run.
*/
status.setStatus("Initializing Master file system");
startupStatusTask.setStatus("Initializing Master file system");

this.masterActiveTime = EnvironmentEdgeManager.currentTime();
// TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring.
Expand All @@ -873,15 +876,15 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc

// warm-up HTDs cache on master initialization
if (preLoadTableDescriptors) {
status.setStatus("Pre-loading table descriptors");
startupStatusTask.setStatus("Pre-loading table descriptors");
this.tableDescriptors.getAll();
}

// Publish cluster ID; set it in Master too. The superclass RegionServer does this later but
// only after it has checked in with the Master. At least a few tests ask Master for clusterId
// before it has called its run method and before RegionServer has done the reportForDuty.
ClusterId clusterId = fileSystemManager.getClusterId();
status.setStatus("Publishing Cluster ID " + clusterId + " in ZooKeeper");
startupStatusTask.setStatus("Publishing Cluster ID " + clusterId + " in ZooKeeper");
ZKClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId());
this.clusterId = clusterId.toString();

Expand All @@ -900,7 +903,7 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
}
}

status.setStatus("Initialize ServerManager and schedule SCP for crash servers");
startupStatusTask.setStatus("Initialize ServerManager");
// The below two managers must be created before loading procedures, as they will be used during
// loading.
this.serverManager = createServerManager(this);
Expand All @@ -913,6 +916,7 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
// initialize master local region
masterRegion = MasterRegionFactory.create(this);

startupStatusTask.setStatus("Migrate Meta Locations From Zookeeper");
tryMigrateMetaLocationsFromZooKeeper();

createProcedureExecutor();
Expand All @@ -921,6 +925,7 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
.collect(Collectors.groupingBy(p -> p.getClass()));

// Create Assignment Manager
startupStatusTask.setStatus("Create Assignment Manager");
this.assignmentManager = createAssignmentManager(this, masterRegion);
this.assignmentManager.start();
// TODO: TRSP can perform as the sub procedure for other procedures, so even if it is marked as
Expand All @@ -937,16 +942,17 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
// filesystem that COULD BE 'alive' (we'll schedule SCPs for each and let SCP figure it out).
// We also pass dirs that are already 'splitting'... so we can do some checks down in tracker.
// TODO: Generate the splitting and live Set in one pass instead of two as we currently do.
startupStatusTask.setStatus("Schedule SCP for crash servers");
this.regionServerTracker.upgrade(
procsByType.getOrDefault(ServerCrashProcedure.class, Collections.emptyList()).stream()
.map(p -> (ServerCrashProcedure) p).map(p -> p.getServerName()).collect(Collectors.toSet()),
walManager.getLiveServersFromWALDir(), walManager.getSplittingServersFromWALDir());
// This manager must be accessed AFTER hbase:meta is confirmed on line..
this.tableStateManager = new TableStateManager(this);

status.setStatus("Initializing ZK system trackers");
startupStatusTask.setStatus("Initializing ZK system trackers");
initializeZKBasedSystemTrackers();
status.setStatus("Loading last flushed sequence id of regions");
startupStatusTask.setStatus("Loading last flushed sequence id of regions");
try {
this.serverManager.loadLastFlushedSequenceIds();
} catch (IOException e) {
Expand All @@ -969,7 +975,7 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
updateConfigurationForQuotasObserver(conf);
}
// initialize master side coprocessors before we start handling requests
status.setStatus("Initializing master coprocessors");
startupStatusTask.setStatus("Initializing master coprocessors");
this.cpHost = new MasterCoprocessorHost(this, this.conf);
} else {
// start an in process region server for carrying system regions
Expand All @@ -979,7 +985,7 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
}

// Checking if meta needs initializing.
status.setStatus("Initializing meta table if this is a new deploy");
startupStatusTask.setStatus("Initializing meta table if this is a new deploy");
InitMetaProcedure initMetaProc = null;
// Print out state of hbase:meta on startup; helps debugging.
if (!this.assignmentManager.getRegionStates().hasTableRegionStates(TableName.META_TABLE_NAME)) {
Expand All @@ -999,7 +1005,7 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
this.balancer.updateClusterMetrics(getClusterMetricsWithoutCoprocessor());

// start up all service threads.
status.setStatus("Initializing master service threads");
startupStatusTask.setStatus("Initializing master service threads");
startServiceThreads();
// wait meta to be initialized after we start procedure executor
if (initMetaProc != null) {
Expand All @@ -1012,16 +1018,15 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
// With this as part of master initialization, it precludes our being able to start a single
// server that is both Master and RegionServer. Needs more thought. TODO.
String statusStr = "Wait for region servers to report in";
status.setStatus(statusStr);
LOG.info(Objects.toString(status));
waitForRegionServers(status);
LOG.info(Objects.toString(startupStatusTask));
waitForRegionServers(startupStatusTask);

// Check if master is shutting down because issue initializing regionservers or balancer.
if (isStopped()) {
return;
}

status.setStatus("Starting assignment manager");
startupStatusTask.setStatus("Starting assignment manager");
// FIRST HBASE:META READ!!!!
// The below cannot make progress w/o hbase:meta being online.
// This is the FIRST attempt at going to hbase:meta. Meta on-lining is going on in background
Expand Down Expand Up @@ -1087,7 +1092,7 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
this.balancer.updateClusterMetrics(getClusterMetricsWithoutCoprocessor());

// Start balancer and meta catalog janitor after meta and regions have been assigned.
status.setStatus("Starting balancer and catalog janitor");
startupStatusTask.setStatus("Starting balancer and catalog janitor");
this.clusterStatusChore = new ClusterStatusChore(this, balancer);
getChoreService().scheduleChore(clusterStatusChore);
this.balancerChore = new BalancerChore(this);
Expand All @@ -1105,7 +1110,7 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
if (!waitForNamespaceOnline()) {
return;
}
status.setStatus("Starting cluster schema service");
startupStatusTask.setStatus("Starting cluster schema service");
try {
initClusterSchemaService();
} catch (IllegalStateException e) {
Expand All @@ -1126,7 +1131,7 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
}
}

status.markComplete("Initialization successful");
startupStatusTask.setStatus("Initialization successful");
LOG.info(String.format("Master has completed initialization %.3fsec",
(EnvironmentEdgeManager.currentTime() - masterActiveTime) / 1000.0f));
this.masterFinishedInitializationTime = EnvironmentEdgeManager.currentTime();
Expand Down Expand Up @@ -1167,7 +1172,7 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
}

assignmentManager.checkIfShouldMoveSystemRegionAsync();
status.setStatus("Starting quota manager");
startupStatusTask.setStatus("Starting quota manager");
initQuotaManager();
if (QuotaUtil.isQuotaEnabled(conf)) {
// Create the quota snapshot notifier
Expand All @@ -1190,13 +1195,13 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
this.serverManager.clearDeadServersWithSameHostNameAndPortOfOnlineServer();

// Check and set the znode ACLs if needed in case we are overtaking a non-secure configuration
status.setStatus("Checking ZNode ACLs");
startupStatusTask.setStatus("Checking ZNode ACLs");
zooKeeper.checkAndSetZNodeAcls();

status.setStatus("Initializing MOB Cleaner");
startupStatusTask.setStatus("Initializing MOB Cleaner");
initMobCleaner();

status.setStatus("Calling postStartMaster coprocessors");
startupStatusTask.setStatus("Calling postStartMaster coprocessors");
if (this.cpHost != null) {
// don't let cp initialization errors kill the master
try {
Expand All @@ -1212,12 +1217,26 @@ private void finishActiveMasterInitialization(MonitoredTask status) throws IOExc
* After master has started up, lets do balancer post startup initialization. Since this runs
* in activeMasterManager thread, it should be fine.
*/
startupStatusTask.setStatus("Calling postMasterStartup Balancer");
long start = EnvironmentEdgeManager.currentTime();
this.balancer.postMasterStartupInitialize();
if (LOG.isDebugEnabled()) {
LOG.debug("Balancer post startup initialization complete, took " + (
(EnvironmentEdgeManager.currentTime() - start) / 1000) + " seconds");
}
startupStatusTask.markComplete("Startup Complete");
}

/**
 * Returns the startup phases that are completed or currently running.
 *
 * @return the phase list produced by a {@link StartupProgressView} built over the startup status
 *         task, or an empty list when that task has not been created yet
 */
public List<Phase> getStartupPhases() {
  if (startupStatusTask == null) {
    // Startup hasn't started yet.
    return Collections.emptyList();
  }
  return new StartupProgressView(startupStatusTask).getPhaseStatus();
}

private void createMissingCFsInMetaDuringUpgrade(
Expand Down Expand Up @@ -2271,14 +2290,16 @@ private void startActiveMasterManager(int infoPort) throws KeeperException {
Threads.sleep(timeout);
}
}
MonitoredTask status = TaskMonitor.get().createStatus("Master startup");
status.setDescription("Master startup");
startupStatusTask = TaskMonitor.get().createStatus("Master startup");
startupStatusTask.setDescription("Master startup");
// Enabling status journal for Startup Progress UI
startupStatusTask.enableStatusJournal(false);
try {
if (activeMasterManager.blockUntilBecomingActiveMaster(timeout, status)) {
finishActiveMasterInitialization(status);
if (activeMasterManager.blockUntilBecomingActiveMaster(timeout, startupStatusTask)) {
finishActiveMasterInitialization();
}
} catch (Throwable t) {
status.setStatus("Failed to become active: " + t.getMessage());
startupStatusTask.setStatus("Failed to become active: " + t.getMessage());
LOG.error(HBaseMarkers.FATAL, "Failed to become active master", t);
// HBASE-5680: Likely hadoop23 vs hadoop 20.x/1.x incompatibility
if (t instanceof NoClassDefFoundError && t.getMessage().
Expand All @@ -2291,7 +2312,7 @@ private void startActiveMasterManager(int infoPort) throws KeeperException {
abort("Unhandled exception. Starting shutdown.", t);
}
} finally {
status.cleanup();
startupStatusTask.cleanup();
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master.startupprogress;

import org.apache.yetus.audience.InterfaceAudience;

/**
* This stores all the startup progress details that are required on the UI page.
*/
@InterfaceAudience.Private
public class Phase {
  /** Name of the phase; assigned once in the constructor and never changed afterwards. */
  private final String name;
  /** Status text of the phase; {@code null} until {@link #setStatus(String)} is called. */
  private String status;
  /** Start time as display text; {@code null} until {@link #setStartTime(String)} is called. */
  private String startTime;
  /** End time as display text; {@code null} until {@link #setEndTime(String)} is called. */
  private String endTime;
  /** Elapsed time as display text; {@code null} until {@link #setElapsedTime(String)} is called. */
  private String elapsedTime;

  /**
   * Creates a phase with the given name. All other details start out as {@code null} and are
   * filled in through the setters.
   *
   * @param name the phase name, stored as given (no validation is performed)
   */
  public Phase(String name) {
    this.name = name;
  }

  /**
   * @return the name passed to the constructor
   */
  public String getName() {
    return name;
  }

  /**
   * @return the last status set, or {@code null} if none was set
   */
  public String getStatus() {
    return status;
  }

  /**
   * @param status the status text to store
   */
  public void setStatus(String status) {
    this.status = status;
  }

  /**
   * @return the last start time set, or {@code null} if none was set
   */
  public String getStartTime() {
    return startTime;
  }

  /**
   * @param startTime the start time text to store; the format is decided by the caller
   */
  public void setStartTime(String startTime) {
    this.startTime = startTime;
  }

  /**
   * @return the last end time set, or {@code null} if none was set
   */
  public String getEndTime() {
    return endTime;
  }

  /**
   * @param endTime the end time text to store; the format is decided by the caller
   */
  public void setEndTime(String endTime) {
    this.endTime = endTime;
  }

  /**
   * @return the last elapsed time set, or {@code null} if none was set
   */
  public String getElapsedTime() {
    return elapsedTime;
  }

  /**
   * @param elapsedTime the elapsed time text to store; the format is decided by the caller
   */
  public void setElapsedTime(String elapsedTime) {
    this.elapsedTime = elapsedTime;
  }
}
Loading