Skip to content

Commit 75c81c7

Browse files
committed
HBASE-26245 Store region server list in master local region (#4136)
Signed-off-by: Andrew Purtell <apurtell@apache.org> (cherry picked from commit bb1bbdd) (cherry picked from commit 2711142)
1 parent 08ecae9 commit 75c81c7

17 files changed

+368
-44
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,8 @@ public class HMaster extends HRegionServer implements MasterServices {
402402
// the master local storage to store procedure data, meta region locations, etc.
403403
private MasterRegion masterRegion;
404404

405+
private RegionServerList rsListStorage;
406+
405407
// handle table states
406408
private TableStateManager tableStateManager;
407409

@@ -896,14 +898,19 @@ private void finishActiveMasterInitialization(MonitoredTask status)
896898
}
897899

898900
status.setStatus("Initialize ServerManager and schedule SCP for crash servers");
899-
this.serverManager = createServerManager(this);
901+
// The below two managers must be created before loading procedures, as they will be used during
902+
// loading.
903+
// initialize master local region
904+
masterRegion = MasterRegionFactory.create(this);
905+
rsListStorage = new MasterRegionServerList(masterRegion, this);
906+
907+
this.serverManager = createServerManager(this, rsListStorage);
900908
if (!conf.getBoolean(HBASE_SPLIT_WAL_COORDINATED_BY_ZK,
901909
DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK)) {
902910
this.splitWALManager = new SplitWALManager(this);
903911
}
904912

905-
// initialize master local region
906-
masterRegion = MasterRegionFactory.create(this);
913+
907914

908915
tryMigrateMetaLocationsFromZooKeeper();
909916

@@ -932,7 +939,8 @@ private void finishActiveMasterInitialization(MonitoredTask status)
932939
this.regionServerTracker.upgrade(
933940
procsByType.getOrDefault(ServerCrashProcedure.class, Collections.emptyList()).stream()
934941
.map(p -> (ServerCrashProcedure) p).map(p -> p.getServerName()).collect(Collectors.toSet()),
935-
walManager.getLiveServersFromWALDir(), walManager.getSplittingServersFromWALDir());
942+
Sets.union(rsListStorage.getAll(), walManager.getLiveServersFromWALDir()),
943+
walManager.getSplittingServersFromWALDir());
936944
// This manager will be started AFTER hbase:meta is confirmed on line.
937945
// hbase.mirror.table.state.to.zookeeper is so hbase1 clients can connect. They read table
938946
// state from zookeeper while hbase2 reads it from hbase:meta. Disable if no hbase1 clients.
@@ -1376,11 +1384,12 @@ private void initMobCleaner() {
13761384
* </p>
13771385
*/
13781386
@InterfaceAudience.Private
1379-
protected ServerManager createServerManager(final MasterServices master) throws IOException {
1387+
protected ServerManager createServerManager(MasterServices master,
1388+
RegionServerList storage) throws IOException {
13801389
// We put this out here in a method so can do a Mockito.spy and stub it out
13811390
// w/ a mocked up ServerManager.
13821391
setupClusterConnection();
1383-
return new ServerManager(master);
1392+
return new ServerManager(master, storage);
13841393
}
13851394

13861395
private void waitForRegionServers(final MonitoredTask status)
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master;
19+
20+
import java.io.IOException;
21+
import java.io.UncheckedIOException;
22+
import java.util.HashSet;
23+
import java.util.Set;
24+
import org.apache.hadoop.hbase.Abortable;
25+
import org.apache.hadoop.hbase.HConstants;
26+
import org.apache.hadoop.hbase.ServerName;
27+
import org.apache.hadoop.hbase.client.Delete;
28+
import org.apache.hadoop.hbase.client.Put;
29+
import org.apache.hadoop.hbase.client.Result;
30+
import org.apache.hadoop.hbase.client.ResultScanner;
31+
import org.apache.hadoop.hbase.client.Scan;
32+
import org.apache.hadoop.hbase.log.HBaseMarkers;
33+
import org.apache.hadoop.hbase.master.assignment.ServerState;
34+
import org.apache.hadoop.hbase.master.region.MasterRegion;
35+
import org.apache.hadoop.hbase.master.region.MasterRegionFactory;
36+
import org.apache.hadoop.hbase.util.Bytes;
37+
import org.apache.yetus.audience.InterfaceAudience;
38+
import org.slf4j.Logger;
39+
import org.slf4j.LoggerFactory;
40+
41+
/**
42+
* {@link MasterRegion} based {@link RegionServerList}.
43+
* <p/>
44+
* This is useful when we want to restart a cluster with only the data on file system, as when
45+
* restarting, we need to get the previous live region servers for scheduling SCP. Before we had
46+
* this class, we needed to scan the WAL directory on WAL file system to find out the previous live
47+
* region servers, which meant we could not restart a cluster without the previous WAL file system,
48+
* even if we had flushed all the data.
49+
* <p/>
50+
* Each region server is stored as one row keyed by its server name string, in the
51+
* {@link MasterRegionFactory#REGION_SERVER_FAMILY} family.
* <p/>
* Please see HBASE-26245 for more details.
*/
52+
@InterfaceAudience.Private
53+
public class MasterRegionServerList implements RegionServerList {
54+
55+
private static final Logger LOG = LoggerFactory.getLogger(MasterRegionServerList.class);
56+
57+
// The master local region that the region server rows are written to and scanned from.
private final MasterRegion region;
58+
59+
// Asked to abort when recording a started/expired server fails.
private final Abortable abortable;
60+
61+
/**
* @param region the master local region used as the backing store
* @param abortable the component to abort when an update to {@code region} fails
*/
public MasterRegionServerList(MasterRegion region, Abortable abortable) {
62+
this.region = region;
63+
this.abortable = abortable;
64+
}
65+
66+
/**
* Records the given region server by writing a row keyed by its server name string, with the
* state qualifier set to the name of {@link ServerState#ONLINE}.
* <p/>
* If the update fails with an {@link IOException}, the failure is logged as FATAL, the
* {@link Abortable} is asked to abort, and the exception is rethrown wrapped in an
* {@link UncheckedIOException}.
*/
@Override
67+
public void started(ServerName sn) {
68+
Put put =
69+
new Put(Bytes.toBytes(sn.getServerName())).addColumn(MasterRegionFactory.REGION_SERVER_FAMILY,
70+
HConstants.STATE_QUALIFIER, Bytes.toBytes(ServerState.ONLINE.name()));
71+
try {
72+
region.update(r -> r.put(put));
73+
} catch (IOException e) {
74+
LOG.error(HBaseMarkers.FATAL, "Failed to record region server {} as started, aborting...", sn,
75+
e);
76+
abortable.abort("Failed to record region server as started");
77+
throw new UncheckedIOException(e);
78+
}
79+
}
80+
81+
/**
* Removes the given region server from the stored list by deleting the whole
* {@link MasterRegionFactory#REGION_SERVER_FAMILY} family of the row keyed by its server name
* string.
* <p/>
* Failure handling mirrors {@link #started(ServerName)}: log as FATAL, abort, then rethrow as
* {@link UncheckedIOException}.
*/
@Override
82+
public void expired(ServerName sn) {
83+
Delete delete = new Delete(Bytes.toBytes(sn.getServerName()))
84+
.addFamily(MasterRegionFactory.REGION_SERVER_FAMILY);
85+
try {
86+
region.update(r -> r.delete(delete));
87+
} catch (IOException e) {
88+
LOG.error(HBaseMarkers.FATAL, "Failed to record region server {} as expired, aborting...", sn,
89+
e);
90+
abortable.abort("Failed to record region server as expired");
91+
throw new UncheckedIOException(e);
92+
}
93+
}
94+
95+
/**
* Scans the {@link MasterRegionFactory#REGION_SERVER_FAMILY} family and parses every row key
* back into a {@link ServerName}.
* @return a newly created set holding one entry per stored row
* @throws IOException if scanning the master local region fails
*/
@Override
96+
public Set<ServerName> getAll() throws IOException {
97+
Set<ServerName> rsList = new HashSet<>();
98+
try (ResultScanner scanner =
99+
region.getScanner(new Scan().addFamily(MasterRegionFactory.REGION_SERVER_FAMILY))) {
100+
for (;;) {
101+
Result result = scanner.next();
102+
// a null result marks the end of the scan
if (result == null) {
103+
break;
104+
}
105+
// the row key is the server name string written by started()
rsList.add(ServerName.valueOf(Bytes.toString(result.getRow())));
106+
}
107+
}
108+
return rsList;
109+
}
110+
111+
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -157,8 +157,6 @@ private boolean checkFileSystem() {
157157

158158
/**
159159
* Get Servernames which are currently splitting; paths have a '-splitting' suffix.
160-
* @return ServerName
161-
* @throws IOException IOException
162160
*/
163161
public Set<ServerName> getSplittingServersFromWALDir() throws IOException {
164162
return getServerNamesFromWALDirPath(
@@ -168,8 +166,6 @@ public Set<ServerName> getSplittingServersFromWALDir() throws IOException {
168166
/**
169167
* Get Servernames that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these
170168
* are already being split -- they cannot be 'alive'.
171-
* @return ServerName
172-
* @throws IOException IOException
173169
*/
174170
public Set<ServerName> getLiveServersFromWALDir() throws IOException {
175171
return getServerNamesFromWALDirPath(
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master;
19+
20+
import java.io.IOException;
21+
import java.util.Set;
22+
import org.apache.hadoop.hbase.ServerName;
23+
import org.apache.yetus.audience.InterfaceAudience;
24+
25+
/**
26+
* For storing the region server list.
27+
* <p/>
28+
* Mainly used when restarting the master, to load the previous active region server list.
29+
*/
30+
@InterfaceAudience.Private
31+
public interface RegionServerList {
32+
33+
/**
34+
* Called when a region server joins the cluster.
* @param sn the region server that has started
35+
*/
36+
void started(ServerName sn);
37+
38+
/**
39+
* Called when a region server is dead.
* @param sn the region server that has expired
40+
*/
41+
void expired(ServerName sn);
42+
43+
/**
44+
* Get all live region servers.
* @return the region servers currently held by this list
* @throws IOException if the list can not be loaded from the underlying storage
45+
*/
46+
Set<ServerName> getAll() throws IOException;
47+
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionServerTracker.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -115,22 +115,22 @@ private RegionServerInfo getServerInfo(ServerName serverName)
115115
* {@link ServerManager#findDeadServersAndProcess(Set, Set)}, we call it here under the lock
116116
* protection to prevent concurrency issues with server expiration operation.
117117
* @param deadServersFromPE the region servers which already have SCP associated.
118-
* @param liveServersFromWALDir the live region servers from wal directory.
118+
* @param liveServersBeforeRestart the live region servers we recorded before master restarts.
119119
* @param splittingServersFromWALDir Servers whose WALs are being actively 'split'.
120120
*/
121-
public void upgrade(Set<ServerName> deadServersFromPE, Set<ServerName> liveServersFromWALDir,
121+
public void upgrade(Set<ServerName> deadServersFromPE, Set<ServerName> liveServersBeforeRestart,
122122
Set<ServerName> splittingServersFromWALDir) throws KeeperException, IOException {
123123
LOG.info(
124124
"Upgrading RegionServerTracker to active master mode; {} have existing" +
125125
"ServerCrashProcedures, {} possibly 'live' servers, and {} 'splitting'.",
126-
deadServersFromPE.size(), liveServersFromWALDir.size(), splittingServersFromWALDir.size());
126+
deadServersFromPE.size(), liveServersBeforeRestart.size(), splittingServersFromWALDir.size());
127127
// deadServersFromPE is made from a list of outstanding ServerCrashProcedures.
128128
// splittingServersFromWALDir are being actively split -- the directory in the FS ends in
129129
// '-SPLITTING'. Each splitting server should have a corresponding SCP. Log if not.
130130
splittingServersFromWALDir.stream().filter(s -> !deadServersFromPE.contains(s)).
131131
forEach(s -> LOG.error("{} has no matching ServerCrashProcedure", s));
132132
// create ServerNode for all servers that were possibly live before the master restarted
133-
liveServersFromWALDir
133+
liveServersBeforeRestart
134134
.forEach(sn -> server.getAssignmentManager().getRegionStates().getOrCreateServer(sn));
135135
ServerManager serverManager = server.getServerManager();
136136
synchronized (this) {
@@ -142,7 +142,7 @@ public void upgrade(Set<ServerName> deadServersFromPE, Set<ServerName> liveServe
142142
info.getVersionInfo().getVersion()) : ServerMetricsBuilder.of(serverName);
143143
serverManager.checkAndRecordNewServer(serverName, serverMetrics);
144144
}
145-
serverManager.findDeadServersAndProcess(deadServersFromPE, liveServersFromWALDir);
145+
serverManager.findDeadServersAndProcess(deadServersFromPE, liveServersBeforeRestart);
146146
active = true;
147147
}
148148
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ public class ServerManager {
139139

140140
private final MasterServices master;
141141
private final ClusterConnection connection;
142+
private final RegionServerList storage;
142143

143144
private final DeadServer deadservers = new DeadServer();
144145

@@ -153,8 +154,9 @@ public class ServerManager {
153154
/**
154155
* Constructor.
155156
*/
156-
public ServerManager(final MasterServices master) {
157+
public ServerManager(final MasterServices master, RegionServerList storage) {
157158
this.master = master;
159+
this.storage = storage;
158160
Configuration c = master.getConfiguration();
159161
maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
160162
warningSkew = c.getLong("hbase.master.warningclockskew", 10000);
@@ -185,7 +187,6 @@ public boolean unregisterListener(final ServerListener listener) {
185187
* @param version the version of the new regionserver, could contain strings like "SNAPSHOT"
186188
* @param ia the InetAddress from which request is received
187189
* @return The ServerName we know this server as.
188-
* @throws IOException
189190
*/
190191
ServerName regionServerStartup(RegionServerStartupRequest request, int versionNumber,
191192
String version, InetAddress ia) throws IOException {
@@ -206,13 +207,12 @@ ServerName regionServerStartup(RegionServerStartupRequest request, int versionNu
206207
LOG.warn(
207208
"THIS SHOULD NOT HAPPEN, RegionServerStartup" + " could not record the server: " + sn);
208209
}
210+
storage.started(sn);
209211
return sn;
210212
}
211213

212214
/**
213215
* Updates last flushed sequence Ids for the regions on server sn
214-
* @param sn
215-
* @param hsl
216216
*/
217217
private void updateLastFlushedSequenceIds(ServerName sn, ServerMetrics hsl) {
218218
for (Entry<byte[], RegionMetrics> entry : hsl.getRegionMetrics().entrySet()) {
@@ -581,6 +581,7 @@ synchronized long expireServer(final ServerName serverName, boolean force) {
581581
}
582582
LOG.info("Processing expiration of " + serverName + " on " + this.master.getServerName());
583583
long pid = master.getAssignmentManager().submitServerCrash(serverName, true, force);
584+
storage.expired(serverName);
584585
// Tell our listeners that a server was removed
585586
if (!this.listeners.isEmpty()) {
586587
this.listeners.stream().forEach(l -> l.serverRemoved(serverName));

hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegionFactory.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,16 @@ public final class MasterRegionFactory {
8787

8888
public static final byte[] PROC_FAMILY = Bytes.toBytes("proc");
8989

90+
public static final byte[] REGION_SERVER_FAMILY = Bytes.toBytes("rs");
91+
9092
private static final TableDescriptor TABLE_DESC = TableDescriptorBuilder.newBuilder(TABLE_NAME)
9193
.setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(HConstants.CATALOG_FAMILY)
9294
.setMaxVersions(HConstants.DEFAULT_HBASE_META_VERSIONS).setInMemory(true)
9395
.setBlocksize(HConstants.DEFAULT_HBASE_META_BLOCK_SIZE).setBloomFilterType(BloomType.ROWCOL)
9496
.setDataBlockEncoding(DataBlockEncoding.ROW_INDEX_V1).build())
95-
.setColumnFamily(ColumnFamilyDescriptorBuilder.of(PROC_FAMILY)).build();
97+
.setColumnFamily(ColumnFamilyDescriptorBuilder.of(PROC_FAMILY))
98+
.setColumnFamily(ColumnFamilyDescriptorBuilder.of(REGION_SERVER_FAMILY))
99+
.build();
96100

97101
private static TableDescriptor withTrackerConfigs(Configuration conf) {
98102
String trackerImpl = conf.get(TRACKER_IMPL, conf.get(StoreFileTrackerFactory.TRACKER_IMPL,
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master;
19+
20+
import java.io.IOException;
21+
import java.util.Collections;
22+
import java.util.Set;
23+
import org.apache.hadoop.hbase.ServerName;
24+
25+
/**
* A {@link RegionServerList} that stores nothing: {@link #started(ServerName)} and
* {@link #expired(ServerName)} do nothing, and {@link #getAll()} always returns an empty set.
* <p/>
* Can be passed wherever a {@link RegionServerList} is required but no persistence is wanted,
* e.g. in tests.
*/
public class DummyRegionServerList implements RegionServerList {
26+
27+
@Override
28+
public void started(ServerName sn) {
// intentionally a no-op
29+
}
30+
31+
@Override
32+
public void expired(ServerName sn) {
// intentionally a no-op
33+
}
34+
35+
/**
* @return always an empty, immutable set
*/
@Override
36+
public Set<ServerName> getAll() throws IOException {
37+
return Collections.emptySet();
38+
}
39+
40+
}

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestClockSkewDetection.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ public ClusterConnection getClusterConnection() {
5959
when(conn.getRpcControllerFactory()).thenReturn(mock(RpcControllerFactory.class));
6060
return conn;
6161
}
62-
});
62+
}, new DummyRegionServerList());
6363

6464
LOG.debug("regionServerStartup 1");
6565
InetAddress ia1 = InetAddress.getLocalHost();

0 commit comments

Comments
 (0)