154
154
import org .apache .hadoop .hbase .mob .MobConstants ;
155
155
import org .apache .hadoop .hbase .monitoring .MemoryBoundedLogMessageBuffer ;
156
156
import org .apache .hadoop .hbase .monitoring .MonitoredTask ;
157
+ import org .apache .hadoop .hbase .monitoring .TaskGroup ;
157
158
import org .apache .hadoop .hbase .monitoring .TaskMonitor ;
158
159
import org .apache .hadoop .hbase .procedure .MasterProcedureManagerHost ;
159
160
import org .apache .hadoop .hbase .procedure .flush .MasterFlushTableProcedureManager ;
@@ -409,6 +410,11 @@ public class HMaster extends HRegionServer implements MasterServices {
409
410
// Cached clusterId on stand by masters to serve clusterID requests from clients.
410
411
private final CachedClusterId cachedClusterId ;
411
412
413
+ public static final String WARMUP_BEFORE_MOVE = "hbase.master.warmup.before.move" ;
414
+ private static final boolean DEFAULT_WARMUP_BEFORE_MOVE = true ;
415
+
416
+ private TaskGroup startupTaskGroup ;
417
+
412
418
/**
413
419
* Initializes the HMaster. The steps are as follows:
414
420
* <p>
@@ -417,9 +423,8 @@ public class HMaster extends HRegionServer implements MasterServices {
417
423
* <li>Start the ActiveMasterManager.
418
424
* </ol>
419
425
* <p>
420
- * Remaining steps of initialization occur in
421
- * {@link #finishActiveMasterInitialization(MonitoredTask)} after the master becomes the active
422
- * one.
426
+ * Remaining steps of initialization occur in {@link #finishActiveMasterInitialization()} after
427
+ * the master becomes the active one.
423
428
*/
424
429
public HMaster (final Configuration conf ) throws IOException {
425
430
super (conf );
@@ -792,12 +797,13 @@ protected AssignmentManager createAssignmentManager(MasterServices master) {
792
797
* Notice that now we will not schedule a special procedure to make meta online(unless the first
793
798
* time where meta has not been created yet), we will rely on SCP to bring meta online.
794
799
*/
795
- private void finishActiveMasterInitialization (MonitoredTask status )
800
+
801
+ private void finishActiveMasterInitialization ()
796
802
throws IOException , InterruptedException , KeeperException , ReplicationException {
797
803
/*
798
804
* We are active master now... go initialize components we need to run.
799
805
*/
800
- status . setStatus ("Initializing Master file system" );
806
+ startupTaskGroup . addTask ("Initializing Master file system" );
801
807
802
808
this .masterActiveTime = System .currentTimeMillis ();
803
809
// TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring.
@@ -810,15 +816,15 @@ private void finishActiveMasterInitialization(MonitoredTask status)
810
816
811
817
// warm-up HTDs cache on master initialization
812
818
if (preLoadTableDescriptors ) {
813
- status . setStatus ("Pre-loading table descriptors" );
819
+ startupTaskGroup . addTask ("Pre-loading table descriptors" );
814
820
this .tableDescriptors .getAll ();
815
821
}
816
822
817
823
// Publish cluster ID; set it in Master too. The superclass RegionServer does this later but
818
824
// only after it has checked in with the Master. At least a few tests ask Master for clusterId
819
825
// before it has called its run method and before RegionServer has done the reportForDuty.
820
826
ClusterId clusterId = fileSystemManager .getClusterId ();
821
- status . setStatus ("Publishing Cluster ID " + clusterId + " in ZooKeeper" );
827
+ startupTaskGroup . addTask ("Publishing Cluster ID " + clusterId + " in ZooKeeper" );
822
828
ZKClusterId .setClusterId (this .zooKeeper , fileSystemManager .getClusterId ());
823
829
this .clusterId = clusterId .toString ();
824
830
@@ -837,7 +843,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
837
843
}
838
844
}
839
845
840
- status . setStatus ("Initialize ServerManager and schedule SCP for crash servers" );
846
+ startupTaskGroup . addTask ("Initialize ServerManager and schedule SCP for crash servers" );
841
847
this .serverManager = createServerManager (this );
842
848
if (
843
849
!conf .getBoolean (HBASE_SPLIT_WAL_COORDINATED_BY_ZK , DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK )
@@ -881,8 +887,9 @@ private void finishActiveMasterInitialization(MonitoredTask status)
881
887
? new MirroringTableStateManager (this )
882
888
: new TableStateManager (this );
883
889
884
- status . setStatus ("Initializing ZK system trackers" );
890
+ startupTaskGroup . addTask ("Initializing ZK system trackers" );
885
891
initializeZKBasedSystemTrackers ();
892
+
886
893
// Set ourselves as active Master now our claim has succeeded up in zk.
887
894
this .activeMaster = true ;
888
895
@@ -894,19 +901,19 @@ private void finishActiveMasterInitialization(MonitoredTask status)
894
901
895
902
// This is for backwards compatibility
896
903
// See HBASE-11393
897
- status . setStatus ("Update TableCFs node in ZNode" );
904
+ startupTaskGroup . addTask ("Update TableCFs node in ZNode" );
898
905
ReplicationPeerConfigUpgrader tableCFsUpdater =
899
906
new ReplicationPeerConfigUpgrader (zooKeeper , conf );
900
907
tableCFsUpdater .copyTableCFs ();
901
908
902
909
if (!maintenanceMode ) {
903
- status . setStatus ("Initializing master coprocessors" );
910
+ startupTaskGroup . addTask ("Initializing master coprocessors" );
904
911
setQuotasObserver (conf );
905
912
initializeCoprocessorHost (conf );
906
913
}
907
914
908
915
// Checking if meta needs initializing.
909
- status . setStatus ("Initializing meta table if this is a new deploy" );
916
+ startupTaskGroup . addTask ("Initializing meta table if this is a new deploy" );
910
917
InitMetaProcedure initMetaProc = null ;
911
918
// Print out state of hbase:meta on startup; helps debugging.
912
919
if (!this .assignmentManager .getRegionStates ().hasTableRegionStates (TableName .META_TABLE_NAME )) {
@@ -929,7 +936,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
929
936
this .balancer .initialize ();
930
937
931
938
// start up all service threads.
932
- status . setStatus ("Initializing master service threads" );
939
+ startupTaskGroup . addTask ("Initializing master service threads" );
933
940
startServiceThreads ();
934
941
// wait meta to be initialized after we start procedure executor
935
942
if (initMetaProc != null ) {
@@ -942,16 +949,16 @@ private void finishActiveMasterInitialization(MonitoredTask status)
942
949
// With this as part of master initialization, it precludes our being able to start a single
943
950
// server that is both Master and RegionServer. Needs more thought. TODO.
944
951
String statusStr = "Wait for region servers to report in" ;
945
- status . setStatus (statusStr );
946
- LOG .info (Objects .toString (status ));
947
- waitForRegionServers (status );
952
+ MonitoredTask waitRegionServer = startupTaskGroup . addTask (statusStr );
953
+ LOG .info (Objects .toString (waitRegionServer ));
954
+ waitForRegionServers (waitRegionServer );
948
955
949
956
// Check if master is shutting down because issue initializing regionservers or balancer.
950
957
if (isStopped ()) {
951
958
return ;
952
959
}
953
960
954
- status . setStatus ("Starting assignment manager" );
961
+ startupTaskGroup . addTask ("Starting assignment manager" );
955
962
// FIRST HBASE:META READ!!!!
956
963
// The below cannot make progress w/o hbase:meta being online.
957
964
// This is the FIRST attempt at going to hbase:meta. Meta on-lining is going on in background
@@ -1028,7 +1035,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
1028
1035
this .balancer .setClusterMetrics (getClusterMetricsWithoutCoprocessor ());
1029
1036
1030
1037
// Start balancer and meta catalog janitor after meta and regions have been assigned.
1031
- status . setStatus ("Starting balancer and catalog janitor" );
1038
+ startupTaskGroup . addTask ("Starting balancer and catalog janitor" );
1032
1039
this .clusterStatusChore = new ClusterStatusChore (this , balancer );
1033
1040
getChoreService ().scheduleChore (clusterStatusChore );
1034
1041
this .balancerChore = new BalancerChore (this );
@@ -1050,7 +1057,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
1050
1057
if (!waitForNamespaceOnline ()) {
1051
1058
return ;
1052
1059
}
1053
- status . setStatus ("Starting cluster schema service" );
1060
+ startupTaskGroup . addTask ("Starting cluster schema service" );
1054
1061
try {
1055
1062
initClusterSchemaService ();
1056
1063
} catch (IllegalStateException e ) {
@@ -1073,7 +1080,6 @@ private void finishActiveMasterInitialization(MonitoredTask status)
1073
1080
}
1074
1081
}
1075
1082
1076
- status .markComplete ("Initialization successful" );
1077
1083
LOG .info (String .format ("Master has completed initialization %.3fsec" ,
1078
1084
(System .currentTimeMillis () - masterActiveTime ) / 1000.0f ));
1079
1085
this .masterFinishedInitializationTime = System .currentTimeMillis ();
@@ -1085,6 +1091,9 @@ private void finishActiveMasterInitialization(MonitoredTask status)
1085
1091
configurationManager .registerObserver (this .regionsRecoveryConfigManager );
1086
1092
// Set master as 'initialized'.
1087
1093
setInitialized (true );
1094
+ startupTaskGroup .markComplete ("Initialization successful" );
1095
+ MonitoredTask status =
1096
+ TaskMonitor .get ().createStatus ("Progress after master initialized" , false , true );
1088
1097
1089
1098
if (tableFamilyDesc == null && replBarrierFamilyDesc == null ) {
1090
1099
// create missing CFs in meta table after master is set to 'initialized'.
@@ -1166,6 +1175,7 @@ private void finishActiveMasterInitialization(MonitoredTask status)
1166
1175
LOG .debug ("Balancer post startup initialization complete, took "
1167
1176
+ ((System .currentTimeMillis () - start ) / 1000 ) + " seconds" );
1168
1177
}
1178
+ status .markComplete ("Progress after master initialized complete" );
1169
1179
}
1170
1180
1171
1181
private void createMissingCFsInMetaDuringUpgrade (TableDescriptor metaDescriptor )
@@ -2171,14 +2181,19 @@ private void startActiveMasterManager(int infoPort) throws KeeperException {
2171
2181
Threads .sleep (timeout );
2172
2182
}
2173
2183
}
2174
- MonitoredTask status = TaskMonitor .get ().createStatus ("Master startup" );
2175
- status .setDescription ("Master startup" );
2184
+
2185
+ // Here for the master startup process, we use TaskGroup to monitor the whole progress.
2186
+ // The UI is similar to how Hadoop designed the startup page for the NameNode.
2187
+ // See HBASE-21521 for more details.
2188
+ // We do not clean up the startupTaskGroup; the startup progress information
2189
+ // is kept permanently in memory.
2190
+ startupTaskGroup = TaskMonitor .createTaskGroup (true , "Master startup" );
2176
2191
try {
2177
- if (activeMasterManager .blockUntilBecomingActiveMaster (timeout , status )) {
2178
- finishActiveMasterInitialization (status );
2192
+ if (activeMasterManager .blockUntilBecomingActiveMaster (timeout , startupTaskGroup )) {
2193
+ finishActiveMasterInitialization ();
2179
2194
}
2180
2195
} catch (Throwable t ) {
2181
- status . setStatus ("Failed to become active: " + t .getMessage ());
2196
+ startupTaskGroup . abort ("Failed to become active master due to: " + t .getMessage ());
2182
2197
LOG .error (HBaseMarkers .FATAL , "Failed to become active master" , t );
2183
2198
// HBASE-5680: Likely hadoop23 vs hadoop 20.x/1.x incompatibility
2184
2199
if (
@@ -2192,8 +2207,6 @@ private void startActiveMasterManager(int infoPort) throws KeeperException {
2192
2207
} else {
2193
2208
abort ("Unhandled exception. Starting shutdown." , t );
2194
2209
}
2195
- } finally {
2196
- status .cleanup ();
2197
2210
}
2198
2211
}
2199
2212
@@ -2756,6 +2769,10 @@ public MemoryBoundedLogMessageBuffer getRegionServerFatalLogBuffer() {
2756
2769
return rsFatals ;
2757
2770
}
2758
2771
2772
/**
 * Returns the {@link TaskGroup} held in {@code startupTaskGroup}, the group created with the
 * description "Master startup" to track master startup progress.
 * NOTE(review): the field is declared without an initializer and is assigned in
 * startActiveMasterManager, so this presumably returns null before that point - confirm
 * against callers.
 */
+ public TaskGroup getStartupProgress () {
2773
+ return startupTaskGroup ;
2774
+ }
2775
+
2759
2776
/**
2760
2777
* Shutdown the cluster. Master runs a coordinated stop of all RegionServers and then itself.
2761
2778
*/
0 commit comments