Skip to content

Commit 9df62ab

Browse files
HBASE-28690 Aborting Active HMaster is not rejecting reportRegionStateTransition if procedure is initialised by next Active master (#6136)
Added masterActiveTime as a fencing token for remote procedures. Signed-off-by: Duo Zhang <zhangduo@apache.org>. Reviewed-by: Aman Poonia <aman.poonia.29@gmail.com>.
1 parent 18ddef1 commit 9df62ab

30 files changed

+190
-75
lines changed

hbase-client/src/main/java/org/apache/hadoop/hbase/shaded/protobuf/ProtobufUtil.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3054,10 +3054,12 @@ public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte
30543054
}
30553055

30563056
public static CloseRegionRequest buildCloseRegionRequest(ServerName server, byte[] regionName,
3057-
ServerName destinationServer, long closeProcId, boolean evictCache) {
3057+
ServerName destinationServer, long closeProcId, boolean evictCache,
3058+
long initiatingMasterActiveTime) {
30583059
CloseRegionRequest.Builder builder =
30593060
getBuilder(server, regionName, destinationServer, closeProcId);
30603061
builder.setEvictCache(evictCache);
3062+
builder.setInitiatingMasterActiveTime(initiatingMasterActiveTime);
30613063
return builder.build();
30623064
}
30633065

hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,14 +222,22 @@ protected final void submitTask(Runnable task, long delay, TimeUnit unit) {
222222
*/
223223
public static abstract class RemoteOperation {
224224
private final RemoteProcedure remoteProcedure;
225+
// active time of the master that sent this request, used for fencing
226+
private final long initiatingMasterActiveTime;
225227

226-
protected RemoteOperation(final RemoteProcedure remoteProcedure) {
228+
protected RemoteOperation(final RemoteProcedure remoteProcedure,
229+
long initiatingMasterActiveTime) {
227230
this.remoteProcedure = remoteProcedure;
231+
this.initiatingMasterActiveTime = initiatingMasterActiveTime;
228232
}
229233

230234
public RemoteProcedure getRemoteProcedure() {
231235
return remoteProcedure;
232236
}
237+
238+
public long getInitiatingMasterActiveTime() {
239+
return initiatingMasterActiveTime;
240+
}
233241
}
234242

235243
/**

hbase-protocol-shaded/src/main/protobuf/Admin.proto

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ message OpenRegionRequest {
8080
repeated RegionOpenInfo open_info = 1;
8181
// the intended server for this RPC.
8282
optional uint64 serverStartCode = 2;
83+
// Master active time as fencing token
84+
optional int64 initiating_master_active_time = 3;
8385
// wall clock time from master
8486
optional uint64 master_system_time = 5;
8587

@@ -123,6 +125,8 @@ message CloseRegionRequest {
123125
optional uint64 serverStartCode = 5;
124126
optional int64 close_proc_id = 6 [default = -1];
125127
optional bool evict_cache = 7 [default = false];
128+
// Master active time as fencing token
129+
optional int64 initiating_master_active_time = 8;
126130
}
127131

128132
message CloseRegionResponse {
@@ -272,6 +276,8 @@ message RemoteProcedureRequest {
272276
required uint64 proc_id = 1;
273277
required string proc_class = 2;
274278
optional bytes proc_data = 3;
279+
// Master active time as fencing token
280+
optional int64 initiating_master_active_time = 4;
275281
}
276282

277283
message ExecuteProceduresRequest {

hbase-protocol-shaded/src/main/protobuf/RegionServerStatus.proto

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ message RegionStateTransition {
9797
optional uint64 open_seq_num = 3;
9898

9999
repeated int64 proc_id = 4;
100+
101+
// Master active time as fencing token
102+
optional int64 initiating_master_active_time = 5;
100103
enum TransitionCode {
101104
OPENED = 0;
102105
FAILED_OPEN = 1;
@@ -155,6 +158,8 @@ message RemoteProcedureResult {
155158
}
156159
required Status status = 2;
157160
optional ForeignExceptionMessage error = 3;
161+
// Master active time as fencing token
162+
optional int64 initiating_master_active_time = 4;
158163
}
159164
message ReportProcedureDoneRequest {
160165
repeated RemoteProcedureResult result = 1;

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3097,6 +3097,7 @@ public long getMasterStartTime() {
30973097
}
30983098

30993099
/** Returns timestamp in millis when HMaster became the active master. */
3100+
@Override
31003101
public long getMasterActiveTime() {
31013102
return masterActiveTime;
31023103
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
import org.apache.hadoop.hbase.ClusterMetricsBuilder;
4545
import org.apache.hadoop.hbase.DoNotRetryIOException;
4646
import org.apache.hadoop.hbase.HConstants;
47+
import org.apache.hadoop.hbase.MasterNotRunningException;
4748
import org.apache.hadoop.hbase.MetaTableAccessor;
4849
import org.apache.hadoop.hbase.NamespaceDescriptor;
4950
import org.apache.hadoop.hbase.Server;
@@ -72,7 +73,6 @@
7273
import org.apache.hadoop.hbase.ipc.RpcServer.BlockingServiceAndInterface;
7374
import org.apache.hadoop.hbase.ipc.RpcServerFactory;
7475
import org.apache.hadoop.hbase.ipc.RpcServerInterface;
75-
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
7676
import org.apache.hadoop.hbase.ipc.ServerRpcController;
7777
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
7878
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
@@ -338,6 +338,7 @@
338338
import org.apache.hadoop.hbase.shaded.protobuf.generated.QuotaProtos.GetSpaceQuotaRegionSizesResponse;
339339
import org.apache.hadoop.hbase.shaded.protobuf.generated.QuotaProtos.GetSpaceQuotaRegionSizesResponse.RegionSizes;
340340
import org.apache.hadoop.hbase.shaded.protobuf.generated.RecentLogs;
341+
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos;
341342
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.FileArchiveNotificationRequest;
342343
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.FileArchiveNotificationResponse;
343344
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.GetLastFlushedSequenceIdRequest;
@@ -1794,6 +1795,15 @@ public ReportRegionStateTransitionResponse reportRegionStateTransition(RpcContro
17941795
ReportRegionStateTransitionRequest req) throws ServiceException {
17951796
try {
17961797
master.checkServiceStarted();
1798+
for (RegionServerStatusProtos.RegionStateTransition transition : req.getTransitionList()) {
1799+
long procId =
1800+
transition.getProcIdCount() > 0 ? transition.getProcId(0) : Procedure.NO_PROC_ID;
1801+
// -1 is less than any possible master active time, so a report without the field is never rejected
1802+
long initiatingMasterActiveTime = transition.hasInitiatingMasterActiveTime()
1803+
? transition.getInitiatingMasterActiveTime()
1804+
: -1;
1805+
throwOnOldMaster(procId, initiatingMasterActiveTime);
1806+
}
17971807
return master.getAssignmentManager().reportRegionStateTransition(req);
17981808
} catch (IOException ioe) {
17991809
throw new ServiceException(ioe);
@@ -2544,8 +2554,14 @@ public ReportProcedureDoneResponse reportProcedureDone(RpcController controller,
25442554
// Check Masters is up and ready for duty before progressing. Remote side will keep trying.
25452555
try {
25462556
this.master.checkServiceStarted();
2547-
} catch (ServerNotRunningYetException snrye) {
2548-
throw new ServiceException(snrye);
2557+
for (RemoteProcedureResult result : request.getResultList()) {
2558+
// -1 is less than any possible master active time, so a report without the field is never rejected
2559+
long initiatingMasterActiveTime =
2560+
result.hasInitiatingMasterActiveTime() ? result.getInitiatingMasterActiveTime() : -1;
2561+
throwOnOldMaster(result.getProcId(), initiatingMasterActiveTime);
2562+
}
2563+
} catch (IOException ioe) {
2564+
throw new ServiceException(ioe);
25492565
}
25502566
request.getResultList().forEach(result -> {
25512567
if (result.getStatus() == RemoteProcedureResult.Status.SUCCESS) {
@@ -2558,6 +2574,18 @@ public ReportProcedureDoneResponse reportProcedureDone(RpcController controller,
25582574
return ReportProcedureDoneResponse.getDefaultInstance();
25592575
}
25602576

2577+
private void throwOnOldMaster(long procId, long initiatingMasterActiveTime)
2578+
throws MasterNotRunningException {
2579+
if (initiatingMasterActiveTime > master.getMasterActiveTime()) {
2580+
// procedure is initiated by new active master but report received on master with older active
2581+
// time
2582+
LOG.warn(
2583+
"Report for procId: {} and initiatingMasterAT {} received on master with activeTime {}",
2584+
procId, initiatingMasterActiveTime, master.getMasterActiveTime());
2585+
throw new MasterNotRunningException("Another master is active");
2586+
}
2587+
}
2588+
25612589
// HBCK Services
25622590

25632591
@Override

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,9 @@ long splitRegion(final RegionInfo regionInfo, final byte[] splitRow, final long
261261
/** Returns true if master is the active one */
262262
boolean isActiveMaster();
263263

264+
/** Returns timestamp in millis when this master became the active one. */
265+
long getMasterActiveTime();
266+
264267
/** Returns true if master is initialized */
265268
boolean isInitialized();
266269

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,9 @@ public TableOperationType getTableOperationType() {
6464
}
6565

6666
@Override
67-
public RemoteOperation newRemoteOperation() {
68-
return new RegionCloseOperation(this, region, getProcId(), assignCandidate, evictCache);
67+
public RemoteOperation newRemoteOperation(MasterProcedureEnv env) {
68+
return new RegionCloseOperation(this, region, getProcId(), assignCandidate, evictCache,
69+
env.getMasterServices().getMasterActiveTime());
6970
}
7071

7172
@Override

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,9 @@ public TableOperationType getTableOperationType() {
5757
}
5858

5959
@Override
60-
public RemoteOperation newRemoteOperation() {
61-
return new RegionOpenOperation(this, region, getProcId());
60+
public RemoteOperation newRemoteOperation(MasterProcedureEnv env) {
61+
return new RegionOpenOperation(this, region, getProcId(),
62+
env.getMasterServices().getMasterActiveTime());
6263
}
6364

6465
@Override

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,11 @@ public Optional<RemoteProcedureDispatcher.RemoteOperation> remoteCallBuild(Maste
9292
if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED) {
9393
return Optional.empty();
9494
}
95-
return Optional.of(newRemoteOperation());
95+
return Optional.of(newRemoteOperation(env));
9696
}
9797

98-
protected abstract RemoteProcedureDispatcher.RemoteOperation newRemoteOperation();
98+
protected abstract RemoteProcedureDispatcher.RemoteOperation
99+
newRemoteOperation(MasterProcedureEnv env);
99100

100101
@Override
101102
public void remoteOperationCompleted(MasterProcedureEnv env) {

0 commit comments

Comments
 (0)