Skip to content

Commit dc30ca5

Browse files
authored
HBASE-27277 TestRaceBetweenSCPAndTRSP fails in pre commit (apache#5248)
Signed-off-by: GeorryHuang <huangzhuoyue@apache.org>
1 parent e4e7917 commit dc30ca5

File tree

2 files changed

+25
-1
lines changed

2 files changed

+25
-1
lines changed

hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/RemoteProcedureDispatcher.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
*/
1818
package org.apache.hadoop.hbase.procedure2;
1919

20+
import com.google.errorprone.annotations.RestrictedApi;
2021
import java.io.IOException;
2122
import java.lang.Thread.UncaughtExceptionHandler;
2223
import java.util.HashSet;
@@ -296,6 +297,12 @@ protected <T extends RemoteOperation> List<T> fetchType(
296297
return (List<T>) requestByType.removeAll(type);
297298
}
298299

300+
@RestrictedApi(explanation = "Should only be called in tests", link = "",
301+
allowedOnPath = ".*/src/test/.*")
302+
public boolean hasNode(TRemote key) {
303+
return nodeMap.containsKey(key);
304+
}
305+
299306
// ============================================================================================
300307
// Timeout Helpers
301308
// ============================================================================================

hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRaceBetweenSCPAndTRSP.java

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import org.apache.hadoop.hbase.master.HMaster;
3232
import org.apache.hadoop.hbase.master.MasterServices;
3333
import org.apache.hadoop.hbase.master.RegionPlan;
34+
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
3435
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
3536
import org.apache.hadoop.hbase.master.region.MasterRegion;
3637
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
@@ -147,16 +148,32 @@ public void test() throws Exception {
147148
Future<byte[]> moveFuture = am.moveAsync(new RegionPlan(region, sn, sn));
148149
arriveRegionOpening.await();
149150

151+
// Kill the region server and trigger a SCP
150152
UTIL.getMiniHBaseCluster().killRegionServer(sn);
153+
// Wait until the SCP reaches the getRegionsOnServer call
151154
arriveGetRegionsOnServer.await();
155+
RSProcedureDispatcher remoteDispatcher = UTIL.getMiniHBaseCluster().getMaster()
156+
.getMasterProcedureExecutor().getEnvironment().getRemoteDispatcher();
157+
// This is necessary for making the UT stable. The problem here is that, in
158+
// ServerManager.expireServer, we will submit the SCP and then the SCP will be executed in
159+
// another thread(the PEWorker), so when we reach the above getRegionsOnServer call in SCP, it
160+
// is still possible that the expireServer call has not finished, so the remote dispatcher
161+
// still thinks it can dispatch the TRSP, in this way we will be in deadlock as the TRSP will
162+
// not schedule a new ORP since it relies on SCP to wake it up after everything is OK. This is
163+
// not what we want to test in this UT so we need to wait here to prevent this from happening.
164+
// See HBASE-27277 for more detailed analysis.
165+
UTIL.waitFor(15000, () -> !remoteDispatcher.hasNode(sn));
166+
167+
// Resume the TRSP, it should be able to finish
152168
RESUME_REGION_OPENING.countDown();
153-
154169
moveFuture.get();
170+
155171
ProcedureExecutor<?> procExec =
156172
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
157173
long scpProcId =
158174
procExec.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure)
159175
.map(p -> (ServerCrashProcedure) p).findAny().get().getProcId();
176+
// Resume the SCP and make sure it can finish too
160177
RESUME_GET_REGIONS_ON_SERVER.countDown();
161178
UTIL.waitFor(60000, () -> procExec.isFinished(scpProcId));
162179
}

0 commit comments

Comments
 (0)