sunhelly
diff --git a/‎hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
Lines changed: 1 addition & 0 deletions b/‎hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/MasterStatusTmpl.jamon
Lines changed: 1 addition & 0 deletions
diff --git a/‎hbase-server/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
Lines changed: 9 additions & 6 deletions b/‎hbase-server/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
Lines changed: 9 additions & 6 deletions
diff --git a/‎hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
Lines changed: 44 additions & 27 deletions b/‎hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
Lines changed: 44 additions & 27 deletions
diff --git a/‎hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/TakeSnapshotHandler.java
Lines changed: 2 additions & 2 deletions b/‎hbase-server/src/main/java/org/apache/hadoop/hbase/master/snapshot/TakeSnapshotHandler.java
Lines changed: 2 additions & 2 deletions
@@ -167,6 +167,7 @@ AssignmentManager assignmentManager = master.getAssignmentManager();
                 <%if HBaseConfiguration.isShowConfInServlet()%>
                 <li><a href="/conf">HBase Configuration</a></li>
                 </%if>
+                <li><a href="/startupProgress.jsp">Startup Progress</a></li>
                </ul>
             </div><!--/.nav-collapse -->
         </div>
 
@@ -27,6 +27,7 @@
 import org.apache.hadoop.hbase.ZNodeClearer;
 import org.apache.hadoop.hbase.exceptions.DeserializationException;
 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
+import org.apache.hadoop.hbase.monitoring.TaskGroup;
 import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
 import org.apache.hadoop.hbase.zookeeper.ZKListener;
 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
@@ -200,16 +201,18 @@ private void handleMasterNodeChange() {
    * Block until becoming the active master. Method blocks until there is not another active master
    * and our attempt to become the new active master is successful. This also makes sure that we are
    * watching the master znode so will be notified if another master dies.
-   * @param checkInterval the interval to check if the master is stopped
-   * @param startupStatus the monitor status to track the progress
+   * @param checkInterval    the interval to check if the master is stopped
+   * @param startupTaskGroup the task group for master startup to track the progress
    * @return True if no issue becoming active master else false if another master was running or if
    *         some other problem (zookeeper, stop flag has been set on this Master)
    */
-  boolean blockUntilBecomingActiveMaster(int checkInterval, MonitoredTask startupStatus) {
+  boolean blockUntilBecomingActiveMaster(int checkInterval, TaskGroup startupTaskGroup) {
+    MonitoredTask blockUntilActive =
+      startupTaskGroup.addTask("Blocking until becoming active master");
     String backupZNode = ZNodePaths
       .joinZNode(this.watcher.getZNodePaths().backupMasterAddressesZNode, this.sn.toString());
     while (!(master.isAborted() || master.isStopped())) {
-      startupStatus.setStatus("Trying to register in ZK as active master");
+      blockUntilActive.setStatus("Trying to register in ZK as active master");
       // Try to become the active master, watch if there is another master.
       // Write out our ServerName as versioned bytes.
       try {
@@ -228,7 +231,7 @@ boolean blockUntilBecomingActiveMaster(int checkInterval, MonitoredTask startupS
           ZNodeClearer.writeMyEphemeralNodeOnDisk(this.sn.toString());
 
           // We are the master, return
-          startupStatus.setStatus("Successfully registered as active master.");
+          blockUntilActive.setStatus("Successfully registered as active master.");
           this.clusterHasActiveMaster.set(true);
           activeMasterServerName = sn;
           LOG.info("Registered as active master=" + this.sn);
@@ -273,7 +276,7 @@ boolean blockUntilBecomingActiveMaster(int checkInterval, MonitoredTask startupS
           }
         }
         LOG.info(msg);
-        startupStatus.setStatus(msg);
+        blockUntilActive.setStatus(msg);
       } catch (KeeperException ke) {
         master.abort("Received an unexpected KeeperException, aborting", ke);
         return false;
 
@@ -154,6 +154,7 @@
 import org.apache.hadoop.hbase.mob.MobConstants;
 import org.apache.hadoop.hbase.monitoring.MemoryBoundedLogMessageBuffer;
 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
+import org.apache.hadoop.hbase.monitoring.TaskGroup;
 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
 import org.apache.hadoop.hbase.procedure.MasterProcedureManagerHost;
 import org.apache.hadoop.hbase.procedure.flush.MasterFlushTableProcedureManager;
@@ -409,6 +410,11 @@ public class HMaster extends HRegionServer implements MasterServices {
   // Cached clusterId on stand by masters to serve clusterID requests from clients.
   private final CachedClusterId cachedClusterId;
 
+  public static final String WARMUP_BEFORE_MOVE = "hbase.master.warmup.before.move";
+  private static final boolean DEFAULT_WARMUP_BEFORE_MOVE = true;
+
+  private TaskGroup startupTaskGroup;
+
   /**
    * Initializes the HMaster. The steps are as follows:
    * <p>
@@ -417,9 +423,8 @@ public class HMaster extends HRegionServer implements MasterServices {
    * <li>Start the ActiveMasterManager.
    * </ol>
    * <p>
-   * Remaining steps of initialization occur in
-   * {@link #finishActiveMasterInitialization(MonitoredTask)} after the master becomes the active
-   * one.
+   * Remaining steps of initialization occur in {@link #finishActiveMasterInitialization()} after
+   * the master becomes the active one.
    */
   public HMaster(final Configuration conf) throws IOException {
     super(conf);
@@ -792,12 +797,13 @@ protected AssignmentManager createAssignmentManager(MasterServices master) {
    * Notice that now we will not schedule a special procedure to make meta online(unless the first
    * time where meta has not been created yet), we will rely on SCP to bring meta online.
    */
-  private void finishActiveMasterInitialization(MonitoredTask status)
+
+  private void finishActiveMasterInitialization()
     throws IOException, InterruptedException, KeeperException, ReplicationException {
     /*
      * We are active master now... go initialize components we need to run.
      */
-    status.setStatus("Initializing Master file system");
+    startupTaskGroup.addTask("Initializing Master file system");
 
     this.masterActiveTime = System.currentTimeMillis();
     // TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring.
@@ -810,15 +816,15 @@ private void finishActiveMasterInitialization(MonitoredTask status)
 
     // warm-up HTDs cache on master initialization
     if (preLoadTableDescriptors) {
-      status.setStatus("Pre-loading table descriptors");
+      startupTaskGroup.addTask("Pre-loading table descriptors");
       this.tableDescriptors.getAll();
     }
 
     // Publish cluster ID; set it in Master too. The superclass RegionServer does this later but
     // only after it has checked in with the Master. At least a few tests ask Master for clusterId
     // before it has called its run method and before RegionServer has done the reportForDuty.
     ClusterId clusterId = fileSystemManager.getClusterId();
-    status.setStatus("Publishing Cluster ID " + clusterId + " in ZooKeeper");
+    startupTaskGroup.addTask("Publishing Cluster ID " + clusterId + " in ZooKeeper");
     ZKClusterId.setClusterId(this.zooKeeper, fileSystemManager.getClusterId());
     this.clusterId = clusterId.toString();
 
@@ -837,7 +843,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
       }
     }
 
-    status.setStatus("Initialize ServerManager and schedule SCP for crash servers");
+    startupTaskGroup.addTask("Initialize ServerManager and schedule SCP for crash servers");
     this.serverManager = createServerManager(this);
     if (
       !conf.getBoolean(HBASE_SPLIT_WAL_COORDINATED_BY_ZK, DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK)
@@ -881,8 +887,9 @@ private void finishActiveMasterInitialization(MonitoredTask status)
         ? new MirroringTableStateManager(this)
         : new TableStateManager(this);
 
-    status.setStatus("Initializing ZK system trackers");
+    startupTaskGroup.addTask("Initializing ZK system trackers");
     initializeZKBasedSystemTrackers();
+
     // Set ourselves as active Master now our claim has succeeded up in zk.
     this.activeMaster = true;
 
@@ -894,19 +901,19 @@ private void finishActiveMasterInitialization(MonitoredTask status)
 
     // This is for backwards compatibility
     // See HBASE-11393
-    status.setStatus("Update TableCFs node in ZNode");
+    startupTaskGroup.addTask("Update TableCFs node in ZNode");
     ReplicationPeerConfigUpgrader tableCFsUpdater =
       new ReplicationPeerConfigUpgrader(zooKeeper, conf);
     tableCFsUpdater.copyTableCFs();
 
     if (!maintenanceMode) {
-      status.setStatus("Initializing master coprocessors");
+      startupTaskGroup.addTask("Initializing master coprocessors");
       setQuotasObserver(conf);
       initializeCoprocessorHost(conf);
     }
 
     // Checking if meta needs initializing.
-    status.setStatus("Initializing meta table if this is a new deploy");
+    startupTaskGroup.addTask("Initializing meta table if this is a new deploy");
     InitMetaProcedure initMetaProc = null;
     // Print out state of hbase:meta on startup; helps debugging.
     if (!this.assignmentManager.getRegionStates().hasTableRegionStates(TableName.META_TABLE_NAME)) {
@@ -929,7 +936,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
     this.balancer.initialize();
 
     // start up all service threads.
-    status.setStatus("Initializing master service threads");
+    startupTaskGroup.addTask("Initializing master service threads");
     startServiceThreads();
     // wait meta to be initialized after we start procedure executor
     if (initMetaProc != null) {
@@ -942,16 +949,16 @@ private void finishActiveMasterInitialization(MonitoredTask status)
     // With this as part of master initialization, it precludes our being able to start a single
     // server that is both Master and RegionServer. Needs more thought. TODO.
     String statusStr = "Wait for region servers to report in";
-    status.setStatus(statusStr);
-    LOG.info(Objects.toString(status));
-    waitForRegionServers(status);
+    MonitoredTask waitRegionServer = startupTaskGroup.addTask(statusStr);
+    LOG.info(Objects.toString(waitRegionServer));
+    waitForRegionServers(waitRegionServer);
 
     // Check if master is shutting down because issue initializing regionservers or balancer.
     if (isStopped()) {
       return;
     }
 
-    status.setStatus("Starting assignment manager");
+    startupTaskGroup.addTask("Starting assignment manager");
     // FIRST HBASE:META READ!!!!
     // The below cannot make progress w/o hbase:meta being online.
     // This is the FIRST attempt at going to hbase:meta. Meta on-lining is going on in background
@@ -1028,7 +1035,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
     this.balancer.setClusterMetrics(getClusterMetricsWithoutCoprocessor());
 
     // Start balancer and meta catalog janitor after meta and regions have been assigned.
-    status.setStatus("Starting balancer and catalog janitor");
+    startupTaskGroup.addTask("Starting balancer and catalog janitor");
     this.clusterStatusChore = new ClusterStatusChore(this, balancer);
     getChoreService().scheduleChore(clusterStatusChore);
     this.balancerChore = new BalancerChore(this);
@@ -1050,7 +1057,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
     if (!waitForNamespaceOnline()) {
       return;
     }
-    status.setStatus("Starting cluster schema service");
+    startupTaskGroup.addTask("Starting cluster schema service");
     try {
       initClusterSchemaService();
     } catch (IllegalStateException e) {
@@ -1073,7 +1080,6 @@ private void finishActiveMasterInitialization(MonitoredTask status)
       }
     }
 
-    status.markComplete("Initialization successful");
     LOG.info(String.format("Master has completed initialization %.3fsec",
       (System.currentTimeMillis() - masterActiveTime) / 1000.0f));
     this.masterFinishedInitializationTime = System.currentTimeMillis();
@@ -1085,6 +1091,9 @@ private void finishActiveMasterInitialization(MonitoredTask status)
     configurationManager.registerObserver(this.regionsRecoveryConfigManager);
     // Set master as 'initialized'.
     setInitialized(true);
+    startupTaskGroup.markComplete("Initialization successful");
+    MonitoredTask status =
+      TaskMonitor.get().createStatus("Progress after master initialized", false, true);
 
     if (tableFamilyDesc == null && replBarrierFamilyDesc == null) {
       // create missing CFs in meta table after master is set to 'initialized'.
@@ -1166,6 +1175,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
       LOG.debug("Balancer post startup initialization complete, took "
         + ((System.currentTimeMillis() - start) / 1000) + " seconds");
     }
+    status.markComplete("Progress after master initialized complete");
   }
 
   private void createMissingCFsInMetaDuringUpgrade(TableDescriptor metaDescriptor)
@@ -2171,14 +2181,19 @@ private void startActiveMasterManager(int infoPort) throws KeeperException {
         Threads.sleep(timeout);
       }
     }
-    MonitoredTask status = TaskMonitor.get().createStatus("Master startup");
-    status.setDescription("Master startup");
+
+    // Here for the master startup process, we use TaskGroup to monitor the whole progress.
+    // The UI is similar to how Hadoop designed the startup page for the NameNode.
+    // See HBASE-21521 for more details.
+    // We do not cleanup the startupTaskGroup, let the startup progress information
+    // be permanent in the MEM.
+    startupTaskGroup = TaskMonitor.createTaskGroup(true, "Master startup");
     try {
-      if (activeMasterManager.blockUntilBecomingActiveMaster(timeout, status)) {
-        finishActiveMasterInitialization(status);
+      if (activeMasterManager.blockUntilBecomingActiveMaster(timeout, startupTaskGroup)) {
+        finishActiveMasterInitialization();
       }
     } catch (Throwable t) {
-      status.setStatus("Failed to become active: " + t.getMessage());
+      startupTaskGroup.abort("Failed to become active master due to:" + t.getMessage());
       LOG.error(HBaseMarkers.FATAL, "Failed to become active master", t);
       // HBASE-5680: Likely hadoop23 vs hadoop 20.x/1.x incompatibility
       if (
@@ -2192,8 +2207,6 @@ private void startActiveMasterManager(int infoPort) throws KeeperException {
       } else {
         abort("Unhandled exception. Starting shutdown.", t);
       }
-    } finally {
-      status.cleanup();
     }
   }
 
@@ -2756,6 +2769,10 @@ public MemoryBoundedLogMessageBuffer getRegionServerFatalLogBuffer() {
     return rsFatals;
   }
 
+  public TaskGroup getStartupProgress() {
+    return startupTaskGroup;
+  }
+
   /**
    * Shutdown the cluster. Master runs a coordinated stop of all RegionServers and then itself.
    */
 
@@ -129,8 +129,8 @@ public TakeSnapshotHandler(SnapshotDescription snapshot, final MasterServices ma
     // prepare the verify
     this.verifier = new MasterSnapshotVerifier(masterServices, snapshot, workingDirFs);
     // update the running tasks
-    this.status = TaskMonitor.get()
-      .createStatus("Taking " + snapshot.getType() + " snapshot on table: " + snapshotTable, true);
+    this.status = TaskMonitor.get().createStatus(
+      "Taking " + snapshot.getType() + " snapshot on table: " + snapshotTable, false, true);
     this.snapshotManifest =
       SnapshotManifest.create(conf, rootFs, workingDir, snapshot, monitor, status);
   }