|
31 | 31 | import org.apache.hadoop.hbase.master.HMaster; |
32 | 32 | import org.apache.hadoop.hbase.master.MasterServices; |
33 | 33 | import org.apache.hadoop.hbase.master.RegionPlan; |
| 34 | +import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher; |
34 | 35 | import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; |
35 | 36 | import org.apache.hadoop.hbase.master.region.MasterRegion; |
36 | 37 | import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; |
@@ -147,16 +148,32 @@ public void test() throws Exception { |
147 | 148 | Future<byte[]> moveFuture = am.moveAsync(new RegionPlan(region, sn, sn)); |
148 | 149 | arriveRegionOpening.await(); |
149 | 150 |
|
| 151 | + // Kill the region server and trigger an SCP |
150 | 152 | UTIL.getMiniHBaseCluster().killRegionServer(sn); |
| 153 | + // Wait until the SCP reaches the getRegionsOnServer call |
151 | 154 | arriveGetRegionsOnServer.await(); |
| 155 | + RSProcedureDispatcher remoteDispatcher = UTIL.getMiniHBaseCluster().getMaster() |
| 156 | + .getMasterProcedureExecutor().getEnvironment().getRemoteDispatcher(); |
| 157 | + // This is necessary for making the UT stable. The problem here is that, in |
| 158 | + // ServerManager.expireServer, we submit the SCP and the SCP is then executed in |
| 159 | + // another thread (the PEWorker), so when we reach the above getRegionsOnServer call in SCP, it |
| 160 | + // is still possible that the expireServer call has not finished, so the remote dispatcher |
| 161 | + // still thinks it can dispatch the TRSP. In that case we will deadlock, as the TRSP will |
| 162 | + // not schedule a new ORP since it relies on SCP to wake it up after everything is OK. This is |
| 163 | + // not what we want to test in this UT, so we need to wait here to prevent this from happening. |
| 164 | + // See HBASE-27277 for a more detailed analysis. |
| 165 | + UTIL.waitFor(15000, () -> !remoteDispatcher.hasNode(sn)); |
| 166 | + |
| 167 | + // Resume the TRSP, it should be able to finish |
152 | 168 | RESUME_REGION_OPENING.countDown(); |
153 | | - |
154 | 169 | moveFuture.get(); |
| 170 | + |
155 | 171 | ProcedureExecutor<?> procExec = |
156 | 172 | UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(); |
157 | 173 | long scpProcId = |
158 | 174 | procExec.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure) |
159 | 175 | .map(p -> (ServerCrashProcedure) p).findAny().get().getProcId(); |
| 176 | + // Resume the SCP and make sure it can finish too |
160 | 177 | RESUME_GET_REGIONS_ON_SERVER.countDown(); |
161 | 178 | UTIL.waitFor(60000, () -> procExec.isFinished(scpProcId)); |
162 | 179 | } |
|
0 commit comments