Skip to content

Commit ddc96b5

Browse files
committed
HBASE-23275: Track active master's address in ActiveMasterManager (apache#812)
Currently we only track whether an active master exists. It also helps to track the address of the active master on every master, so that any master can answer client RPC requests asking which master is active. Signed-off-by: Nick Dimiduk <ndimiduk@apache.org> Signed-off-by: Andrew Purtell <apurtell@apache.org> (cherry picked from commit efebb84)
1 parent 5a51c5d commit ddc96b5

File tree

3 files changed

+64
-14
lines changed

3 files changed

+64
-14
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java

Lines changed: 50 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/**
1+
/*
22
*
33
* Licensed to the Apache Software Foundation (ASF) under one
44
* or more contributor license agreements. See the NOTICE file
@@ -17,25 +17,24 @@
1717
* limitations under the License.
1818
*/
1919
package org.apache.hadoop.hbase.master;
20-
2120
import java.io.IOException;
21+
import java.util.Optional;
2222
import java.util.concurrent.atomic.AtomicBoolean;
23-
24-
import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
25-
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
26-
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
27-
import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
28-
import org.apache.yetus.audience.InterfaceAudience;
2923
import org.apache.hadoop.hbase.Server;
3024
import org.apache.hadoop.hbase.ServerName;
3125
import org.apache.hadoop.hbase.ZNodeClearer;
3226
import org.apache.hadoop.hbase.exceptions.DeserializationException;
3327
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
34-
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
28+
import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
3529
import org.apache.hadoop.hbase.zookeeper.ZKListener;
30+
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
31+
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
32+
import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
33+
import org.apache.yetus.audience.InterfaceAudience;
3634
import org.apache.zookeeper.KeeperException;
3735
import org.slf4j.Logger;
3836
import org.slf4j.LoggerFactory;
37+
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
3938

4039
/**
4140
* Handles everything on master-side related to master election.
@@ -57,12 +56,18 @@ public class ActiveMasterManager extends ZKListener {
5756
final AtomicBoolean clusterHasActiveMaster = new AtomicBoolean(false);
5857
final AtomicBoolean clusterShutDown = new AtomicBoolean(false);
5958

59+
// This server's information.
6060
private final ServerName sn;
6161
private int infoPort;
6262
private final Server master;
6363

64+
// Active master's server name. Invalidated anytime active master changes (based on ZK
65+
// notifications) and lazily fetched on-demand.
66+
// ServerName is immutable, so we don't need heavy synchronization around it.
67+
private volatile ServerName activeMasterServerName;
68+
6469
/**
65-
* @param watcher
70+
* @param watcher ZK watcher
6671
* @param sn ServerName
6772
* @param master An instance of a Master.
6873
*/
@@ -106,6 +111,30 @@ void handle(final String path) {
106111
}
107112
}
108113

114+
/**
115+
* Fetches the active master's ServerName from zookeeper.
116+
*/
117+
private void fetchAndSetActiveMasterServerName() {
118+
LOG.debug("Attempting to fetch active master sn from zk");
119+
try {
120+
activeMasterServerName = MasterAddressTracker.getMasterAddress(watcher);
121+
} catch (IOException | KeeperException e) {
122+
// Log and ignore for now and re-fetch later if needed.
123+
LOG.error("Error fetching active master information", e);
124+
}
125+
}
126+
127+
public Optional<ServerName> getActiveMasterServerName() {
128+
if (!clusterHasActiveMaster.get()) {
129+
return Optional.empty();
130+
}
131+
if (activeMasterServerName == null) {
132+
fetchAndSetActiveMasterServerName();
133+
}
134+
// It could still be null, but return whatever we have.
135+
return Optional.ofNullable(activeMasterServerName);
136+
}
137+
109138
/**
110139
* Handle a change in the master node. Doesn't matter whether this was called
111140
* from a nodeCreated or nodeDeleted event because there are no guarantees
@@ -134,6 +163,9 @@ private void handleMasterNodeChange() {
134163
// Notify any thread waiting to become the active master
135164
clusterHasActiveMaster.notifyAll();
136165
}
166+
// Reset the active master sn. Will be re-fetched later if needed.
167+
// We don't want to make a synchronous RPC under a monitor.
168+
activeMasterServerName = null;
137169
}
138170
} catch (KeeperException ke) {
139171
master.abort("Received an unexpected KeeperException, aborting", ke);
@@ -151,8 +183,8 @@ private void handleMasterNodeChange() {
151183
* @param checkInterval the interval to check if the master is stopped
152184
* @param startupStatus the monitor status to track the progress
153185
* @return True if no issue becoming active master else false if another
154-
* master was running or if some other problem (zookeeper, stop flag has been
155-
* set on this Master)
186+
* master was running or if some other problem (zookeeper, stop flag has been
187+
* set on this Master)
156188
*/
157189
boolean blockUntilBecomingActiveMaster(
158190
int checkInterval, MonitoredTask startupStatus) {
@@ -178,10 +210,14 @@ boolean blockUntilBecomingActiveMaster(
178210
// We are the master, return
179211
startupStatus.setStatus("Successfully registered as active master.");
180212
this.clusterHasActiveMaster.set(true);
213+
activeMasterServerName = sn;
181214
LOG.info("Registered as active master=" + this.sn);
182215
return true;
183216
}
184217

218+
// Invalidate the active master name so that subsequent requests do not get any stale
219+
// master information. Will be re-fetched if needed.
220+
activeMasterServerName = null;
185221
// There is another active master running elsewhere or this is a restart
186222
// and the master ephemeral node has not expired yet.
187223
this.clusterHasActiveMaster.set(true);
@@ -208,7 +244,8 @@ boolean blockUntilBecomingActiveMaster(
208244
ZKUtil.deleteNode(this.watcher, this.watcher.getZNodePaths().masterAddressZNode);
209245

210246
// We may have failed to delete the znode at the previous step, but
211-
// we delete the file anyway: a second attempt to delete the znode is likely to fail again.
247+
// we delete the file anyway: a second attempt to delete the znode is likely to fail
248+
// again.
212249
ZNodeClearer.deleteMyEphemeralNodeOnDisk();
213250
} else {
214251
msg = "Another master is the active master, " + currentMaster +

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3784,7 +3784,10 @@ public HbckChore getHbckChore() {
37843784
return this.hbckChore;
37853785
}
37863786

3787-
@Override
3787+
public Optional<ServerName> getActiveMaster() {
3788+
return activeMasterManager.getActiveMasterServerName();
3789+
}
3790+
37883791
public void runReplicationBarrierCleaner() {
37893792
ReplicationBarrierCleaner rbc = this.replicationBarrierCleaner;
37903793
if (rbc != null) {

hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestActiveMasterManager.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
*/
1818
package org.apache.hadoop.hbase.master;
1919

20+
import static org.junit.Assert.assertEquals;
2021
import static org.junit.Assert.assertFalse;
2122
import static org.junit.Assert.assertNotNull;
2223
import static org.junit.Assert.assertTrue;
@@ -91,6 +92,7 @@ public static void tearDownAfterClass() throws Exception {
9192
ActiveMasterManager activeMasterManager =
9293
dummyMaster.getActiveMasterManager();
9394
assertFalse(activeMasterManager.clusterHasActiveMaster.get());
95+
assertFalse(activeMasterManager.getActiveMasterServerName().isPresent());
9496

9597
// First test becoming the active master uninterrupted
9698
MonitoredTask status = Mockito.mock(MonitoredTask.class);
@@ -99,6 +101,7 @@ public static void tearDownAfterClass() throws Exception {
99101
activeMasterManager.blockUntilBecomingActiveMaster(100, status);
100102
assertTrue(activeMasterManager.clusterHasActiveMaster.get());
101103
assertMaster(zk, master);
104+
assertMaster(zk, activeMasterManager.getActiveMasterServerName().get());
102105

103106
// Now pretend master restart
104107
DummyMaster secondDummyMaster = new DummyMaster(zk,master);
@@ -108,6 +111,8 @@ public static void tearDownAfterClass() throws Exception {
108111
activeMasterManager.blockUntilBecomingActiveMaster(100, status);
109112
assertTrue(activeMasterManager.clusterHasActiveMaster.get());
110113
assertMaster(zk, master);
114+
assertMaster(zk, activeMasterManager.getActiveMasterServerName().get());
115+
assertMaster(zk, secondActiveMasterManager.getActiveMasterServerName().get());
111116
}
112117

113118
/**
@@ -135,6 +140,7 @@ public void testActiveMasterManagerFromZK() throws Exception {
135140
ActiveMasterManager activeMasterManager =
136141
ms1.getActiveMasterManager();
137142
assertFalse(activeMasterManager.clusterHasActiveMaster.get());
143+
assertFalse(activeMasterManager.getActiveMasterServerName().isPresent());
138144

139145
// First test becoming the active master uninterrupted
140146
ClusterStatusTracker clusterStatusTracker =
@@ -144,6 +150,7 @@ public void testActiveMasterManagerFromZK() throws Exception {
144150
Mockito.mock(MonitoredTask.class));
145151
assertTrue(activeMasterManager.clusterHasActiveMaster.get());
146152
assertMaster(zk, firstMasterAddress);
153+
assertMaster(zk, activeMasterManager.getActiveMasterServerName().get());
147154

148155
// New manager will now try to become the active master in another thread
149156
WaitToBeMasterThread t = new WaitToBeMasterThread(zk, secondMasterAddress);
@@ -161,6 +168,8 @@ public void testActiveMasterManagerFromZK() throws Exception {
161168
assertTrue(t.manager.clusterHasActiveMaster.get());
162169
// But secondary one should not be the active master
163170
assertFalse(t.isActiveMaster);
171+
// Verify the active master ServerName is populated in standby master.
172+
assertEquals(firstMasterAddress, t.manager.getActiveMasterServerName().get());
164173

165174
// Close the first server and delete its master node
166175
ms1.stop("stopping first server");
@@ -189,6 +198,7 @@ public void testActiveMasterManagerFromZK() throws Exception {
189198

190199
assertTrue(t.manager.clusterHasActiveMaster.get());
191200
assertTrue(t.isActiveMaster);
201+
assertEquals(secondMasterAddress, t.manager.getActiveMasterServerName().get());
192202

193203
LOG.info("Deleting master node");
194204

0 commit comments

Comments
 (0)