Commit 8920ca3

HBASE-22343 Make procedure retry interval configurable in test
1 parent 8ed1791 commit 8920ca3

File tree: 13 files changed, +165 -147 lines changed

hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureUtil.java

Lines changed: 33 additions & 14 deletions
@@ -21,9 +21,13 @@
 import java.io.InputStream;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.Modifier;
-import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.util.NonceKey;
+import org.apache.hadoop.hbase.util.RetryCounter;
+import org.apache.hadoop.hbase.util.RetryCounter.ExponentialBackoffPolicyWithLimit;
+import org.apache.hadoop.hbase.util.RetryCounter.RetryConfig;
 import org.apache.yetus.audience.InterfaceAudience;
 
 import com.xiaomi.infra.thirdparty.com.google.common.base.Preconditions;
@@ -335,20 +339,35 @@ public static LockServiceProtos.LockedResource convertToProtoLockedResource(
     return builder.build();
   }
 
+  public static final String PROCEDURE_RETRY_SLEEP_INTERVAL_MS =
+    "hbase.procedure.retry.sleep.interval.ms";
+
+  // default to 1 second
+  public static final long DEFAULT_PROCEDURE_RETRY_SLEEP_INTERVAL_MS = 1000;
+
+  public static final String PROCEDURE_RETRY_MAX_SLEEP_TIME_MS =
+    "hbase.procedure.retry.max.sleep.time.ms";
+
+  // default to 10 minutes
+  public static final long DEFAULT_PROCEDURE_RETRY_MAX_SLEEP_TIME_MS =
+    TimeUnit.MINUTES.toMillis(10);
+
   /**
-   * Get an exponential backoff time, in milliseconds. The base unit is 1 second, and the max
-   * backoff time is 10 minutes. This is the general backoff policy for most procedure
-   * implementation.
+   * Get a retry counter for getting the backoff time. We will use the
+   * {@link ExponentialBackoffPolicyWithLimit} policy, and the base unit is 1 second, max sleep time
+   * is 10 minutes by default.
+   * <p/>
+   * For UTs, you can set the {@link #PROCEDURE_RETRY_SLEEP_INTERVAL_MS} and
+   * {@link #PROCEDURE_RETRY_MAX_SLEEP_TIME_MS} to make more frequent retry so your UT will not
+   * timeout.
    */
-  public static long getBackoffTimeMs(int attempts) {
-    long maxBackoffTime = 10L * 60 * 1000; // Ten minutes, hard coded for now.
-    // avoid overflow
-    if (attempts >= 30) {
-      return maxBackoffTime;
-    }
-    long backoffTimeMs = Math.min((long) (1000 * Math.pow(2, attempts)), maxBackoffTime);
-    // 1% possible jitter
-    long jitter = (long) (backoffTimeMs * ThreadLocalRandom.current().nextFloat() * 0.01f);
-    return backoffTimeMs + jitter;
+  public static RetryCounter createRetryCounter(Configuration conf) {
+    long sleepIntervalMs =
+      conf.getLong(PROCEDURE_RETRY_SLEEP_INTERVAL_MS, DEFAULT_PROCEDURE_RETRY_SLEEP_INTERVAL_MS);
+    long maxSleepTimeMs =
+      conf.getLong(PROCEDURE_RETRY_MAX_SLEEP_TIME_MS, DEFAULT_PROCEDURE_RETRY_MAX_SLEEP_TIME_MS);
+    RetryConfig retryConfig = new RetryConfig().setSleepInterval(sleepIntervalMs)
+      .setMaxSleepTime(maxSleepTimeMs).setBackoffPolicy(new ExponentialBackoffPolicyWithLimit());
+    return new RetryCounter(retryConfig);
   }
 }

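The two new keys above are what a test can tune down. A minimal sketch of how a mini-cluster test might shorten the procedure backoff before starting the master; the key names, defaults and ProcedureUtil.createRetryCounter come from the diff above, while the concrete values and the example class itself are hypothetical:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.procedure2.ProcedureUtil;

public class FastProcedureRetryExample {
  public static void main(String[] args) throws Exception {
    HBaseTestingUtility util = new HBaseTestingUtility();
    Configuration conf = util.getConfiguration();
    // Hypothetical test values: retry every 100 ms (default 1 s) and cap the sleep
    // at 1 s (default 10 minutes), so failed procedure steps are retried quickly.
    conf.setLong(ProcedureUtil.PROCEDURE_RETRY_SLEEP_INTERVAL_MS, 100);
    conf.setLong(ProcedureUtil.PROCEDURE_RETRY_MAX_SLEEP_TIME_MS, 1000);
    util.startMiniCluster();
    try {
      // ... exercise master procedures against util.getConnection() ...
    } finally {
      util.shutdownMiniCluster();
    }
  }
}

Every procedure touched by this commit builds its schedule through ProcedureUtil.createRetryCounter(conf), so the two settings apply uniformly.
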
hbase-procedure/src/test/java/org/apache/hadoop/hbase/procedure2/TestProcedureUtil.java

Lines changed: 0 additions & 20 deletions
@@ -18,9 +18,7 @@
 package org.apache.hadoop.hbase.procedure2;
 
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
 
-import java.util.concurrent.TimeUnit;
 import org.apache.hadoop.hbase.HBaseClassTestRule;
 import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility.TestProcedure;
 import org.apache.hadoop.hbase.testclassification.MasterTests;
@@ -59,24 +57,6 @@ public void testConvert() throws Exception {
     assertEquals("Procedure protobuf does not match", proto1, proto2);
   }
 
-  @Test
-  public void testGetBackoffTimeMs() {
-    for (int i = 30; i < 1000; i++) {
-      assertEquals(TimeUnit.MINUTES.toMillis(10), ProcedureUtil.getBackoffTimeMs(30));
-    }
-    long backoffTimeMs = ProcedureUtil.getBackoffTimeMs(0);
-    assertTrue(backoffTimeMs >= 1000);
-    assertTrue(backoffTimeMs <= 1000 * 1.01f);
-
-    backoffTimeMs = ProcedureUtil.getBackoffTimeMs(1);
-    assertTrue(backoffTimeMs >= 2000);
-    assertTrue(backoffTimeMs <= 2000 * 1.01f);
-
-    backoffTimeMs = ProcedureUtil.getBackoffTimeMs(5);
-    assertTrue(backoffTimeMs >= 32000);
-    assertTrue(backoffTimeMs <= 32000 * 1.01f);
-  }
-
   public static class TestProcedureNoDefaultConstructor extends TestProcedure {
     public TestProcedureNoDefaultConstructor(int x) {}
   }

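The deleted testGetBackoffTimeMs covered the old static helper. A replacement exercising the new factory could look roughly like the sketch below (hypothetical, not part of this commit); the bounds are deliberately loose and assume only what the javadoc above states, namely that the backoff starts at the configured sleep interval and is capped at the configured max sleep time:

  // Hypothetical addition to TestProcedureUtil; needs org.junit.Assert.assertTrue,
  // org.apache.hadoop.hbase.HBaseConfiguration and org.apache.hadoop.hbase.util.RetryCounter.
  @Test
  public void testCreateRetryCounter() {
    Configuration conf = HBaseConfiguration.create();
    conf.setLong(ProcedureUtil.PROCEDURE_RETRY_SLEEP_INTERVAL_MS, 100);
    conf.setLong(ProcedureUtil.PROCEDURE_RETRY_MAX_SLEEP_TIME_MS, 1000);
    RetryCounter retryCounter = ProcedureUtil.createRetryCounter(conf);
    for (int i = 0; i < 10; i++) {
      long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
      // Every backoff should stay inside the configured window.
      assertTrue(backoff >= 100);
      assertTrue(backoff <= 1000);
    }
  }
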
hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java

Lines changed: 6 additions & 2 deletions
@@ -34,6 +34,7 @@
 import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
 import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure;
 import org.apache.hadoop.hbase.procedure2.RemoteProcedureException;
+import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -68,7 +69,7 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur
 
   private long seqId;
 
-  private int attempt;
+  private RetryCounter retryCounter;
 
   protected RegionRemoteProcedureBase() {
   }
@@ -268,7 +269,10 @@ protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
           throw new IllegalStateException("Unknown state: " + state);
       }
     } catch (IOException e) {
-      long backoff = ProcedureUtil.getBackoffTimeMs(this.attempt++);
+      if (retryCounter == null) {
+        retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
+      }
+      long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
       LOG.warn("Failed updating meta, suspend {}secs {}; {};", backoff / 1000, this, regionNode, e);
       setTimeout(Math.toIntExact(backoff));
       setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java

Lines changed: 9 additions & 5 deletions
@@ -36,6 +36,7 @@
 import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
 import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
 import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
+import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -115,7 +116,7 @@ public class TransitRegionStateProcedure
 
   private boolean forceNewPlan;
 
-  private int attempt;
+  private RetryCounter retryCounter;
 
   private RegionRemoteProcedureBase remoteProc;
 
@@ -210,7 +211,7 @@ private void openRegion(MasterProcedureEnv env, RegionStateNode regionNode) thro
   private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode)
       throws IOException {
     if (regionNode.isInState(State.OPEN)) {
-      attempt = 0;
+      retryCounter = null;
       if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED) {
         // we are the last state, finish
         regionNode.unsetProcedure(this);
@@ -271,7 +272,7 @@ private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) thr
   private Flow confirmClosed(MasterProcedureEnv env, RegionStateNode regionNode)
       throws IOException {
     if (regionNode.isInState(State.CLOSED)) {
-      attempt = 0;
+      retryCounter = null;
       if (lastState == RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_CLOSED) {
         // we are the last state, finish
         regionNode.unsetProcedure(this);
@@ -300,7 +301,7 @@ private Flow confirmClosed(MasterProcedureEnv env, RegionStateNode regionNode)
       regionNode.unsetProcedure(this);
       return Flow.NO_MORE_STATE;
     }
-    attempt = 0;
+    retryCounter = null;
     setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
     return Flow.HAS_MORE_STATE;
   }
@@ -347,7 +348,10 @@ protected Flow executeFromState(MasterProcedureEnv env, RegionStateTransitionSta
         throw new UnsupportedOperationException("unhandled state=" + state);
      }
     } catch (IOException e) {
-      long backoff = ProcedureUtil.getBackoffTimeMs(this.attempt++);
+      if (retryCounter == null) {
+        retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
+      }
+      long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
       LOG.warn(
         "Failed transition, suspend {}secs {}; {}; waiting on rectified condition fixed " +
           "by other Procedure or operator intervention",

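The remaining server-side files all repeat the idiom shown above: the RetryCounter is created lazily on the first failure and dropped once a step succeeds, so the backoff restarts from the base sleep interval instead of carrying over to unrelated failures. A standalone sketch of that pattern, with hypothetical helper names:

  private RetryCounter retryCounter;

  private long backoffOnFailure(Configuration conf) {
    if (retryCounter == null) {
      // Created on demand, so a procedure that never fails never builds a counter.
      retryCounter = ProcedureUtil.createRetryCounter(conf);
    }
    return retryCounter.getBackoffTimeAndIncrementAttempts();
  }

  private void onStepSuccess() {
    // Equivalent of the old "attempt = 0": the next failure starts again
    // from the configured sleep interval.
    retryCounter = null;
  }
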
hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/InitMetaProcedure.java

Lines changed: 6 additions & 2 deletions
@@ -32,6 +32,7 @@
 import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
 import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
 import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
+import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -51,7 +52,7 @@ public class InitMetaProcedure extends AbstractStateMachineTableProcedure<InitMe
 
   private CountDownLatch latch = new CountDownLatch(1);
 
-  private int attempts;
+  private RetryCounter retryCounter;
 
   @Override
   public TableName getTableName() {
@@ -85,7 +86,10 @@ protected Flow executeFromState(MasterProcedureEnv env, InitMetaState state)
       insertNamespaceToMeta(env.getMasterServices().getConnection(), DEFAULT_NAMESPACE);
       insertNamespaceToMeta(env.getMasterServices().getConnection(), SYSTEM_NAMESPACE);
     } catch (IOException e) {
-      long backoff = ProcedureUtil.getBackoffTimeMs(this.attempts++);
+      if (retryCounter == null) {
+        retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
+      }
+      long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
       LOG.warn("Failed to init default and system namespaces, suspend {}secs", backoff, e);
       setTimeout(Math.toIntExact(backoff));
       setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);

hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ReopenTableRegionsProcedure.java

Lines changed: 7 additions & 3 deletions
@@ -29,6 +29,7 @@
 import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
 import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
 import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
+import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -51,7 +52,7 @@ public class ReopenTableRegionsProcedure
 
   private List<HRegionLocation> regions = Collections.emptyList();
 
-  private int attempt;
+  private RetryCounter retryCounter;
 
   public ReopenTableRegionsProcedure() {
   }
@@ -125,13 +126,16 @@ protected Flow executeFromState(MasterProcedureEnv env, ReopenTableRegionsState
          return Flow.NO_MORE_STATE;
        }
        if (regions.stream().anyMatch(loc -> canSchedule(env, loc))) {
-          attempt = 0;
+          retryCounter = null;
          setNextState(ReopenTableRegionsState.REOPEN_TABLE_REGIONS_REOPEN_REGIONS);
          return Flow.HAS_MORE_STATE;
        }
        // We can not schedule TRSP for all the regions need to reopen, wait for a while and retry
        // again.
-        long backoff = ProcedureUtil.getBackoffTimeMs(this.attempt++);
+        if (retryCounter == null) {
+          retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
+        }
+        long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
        LOG.info(
          "There are still {} region(s) which need to be reopened for table {} are in " +
            "OPENING state, suspend {}secs and try again later",

hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SplitWALProcedure.java

Lines changed: 11 additions & 14 deletions
@@ -28,6 +28,7 @@
 import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
 import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
 import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
+import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
@@ -53,7 +54,7 @@ public class SplitWALProcedure
   private String walPath;
   private ServerName worker;
   private ServerName crashedServer;
-  private int attempts = 0;
+  private RetryCounter retryCounter;
 
   public SplitWALProcedure() {
   }
@@ -82,11 +83,16 @@ protected Flow executeFromState(MasterProcedureEnv env, MasterProcedureProtos.Sp
        try {
          finished = splitWALManager.isSplitWALFinished(walPath);
        } catch (IOException ioe) {
-          long backoff = ProcedureUtil.getBackoffTimeMs(attempts++);
-          LOG.warn(
-            "Failed to check whether splitting wal {} success, wait {} seconds to retry",
+          if (retryCounter == null) {
+            retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
+          }
+          long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
+          LOG.warn("Failed to check whether splitting wal {} success, wait {} seconds to retry",
            walPath, backoff / 1000, ioe);
-          throw suspend(backoff);
+          setTimeout(Math.toIntExact(backoff));
+          setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
+          skipPersistence();
+          throw new ProcedureSuspendedException();
        }
        splitWALManager.releaseSplitWALWorker(worker, env.getProcedureScheduler());
        if (!finished) {
@@ -157,15 +163,6 @@ protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
     return false;
   }
 
-  protected final ProcedureSuspendedException suspend(long backoff)
-      throws ProcedureSuspendedException {
-    attempts++;
-    setTimeout(Math.toIntExact(backoff));
-    setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
-    skipPersistence();
-    throw new ProcedureSuspendedException();
-  }
-
   public String getWAL() {
     return walPath;
   }

hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/SwitchRpcThrottleProcedure.java

Lines changed: 10 additions & 6 deletions
@@ -25,6 +25,7 @@
 import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
 import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
 import org.apache.hadoop.hbase.quotas.RpcThrottleStorage;
+import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -43,11 +44,11 @@ public class SwitchRpcThrottleProcedure
 
   private static Logger LOG = LoggerFactory.getLogger(SwitchRpcThrottleProcedure.class);
 
-  RpcThrottleStorage rpcThrottleStorage;
-  boolean rpcThrottleEnabled;
-  ProcedurePrepareLatch syncLatch;
-  ServerName serverName;
-  int attempts;
+  private RpcThrottleStorage rpcThrottleStorage;
+  private boolean rpcThrottleEnabled;
+  private ProcedurePrepareLatch syncLatch;
+  private ServerName serverName;
+  private RetryCounter retryCounter;
 
   public SwitchRpcThrottleProcedure() {
   }
@@ -68,7 +69,10 @@ protected Flow executeFromState(MasterProcedureEnv env, SwitchRpcThrottleState s
        try {
          switchThrottleState(env, rpcThrottleEnabled);
        } catch (IOException e) {
-          long backoff = ProcedureUtil.getBackoffTimeMs(this.attempts++);
+          if (retryCounter == null) {
+            retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
+          }
+          long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
          LOG.warn("Failed to store rpc throttle value {}, sleep {} secs and retry",
            rpcThrottleEnabled, backoff / 1000, e);
          setTimeout(Math.toIntExact(backoff));

hbase-server/src/main/java/org/apache/hadoop/hbase/master/replication/AbstractPeerNoLockProcedure.java

Lines changed: 16 additions & 4 deletions
@@ -18,11 +18,15 @@
 package org.apache.hadoop.hbase.master.replication;
 
 import java.io.IOException;
+import java.util.function.LongConsumer;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
 import org.apache.hadoop.hbase.master.procedure.PeerProcedureInterface;
 import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
 import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
+import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
 import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
+import org.apache.hadoop.hbase.util.RetryCounter;
 import org.apache.yetus.audience.InterfaceAudience;
 
 import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.PeerProcedureStateData;
@@ -38,7 +42,7 @@ public abstract class AbstractPeerNoLockProcedure<TState>
 
   protected String peerId;
 
-  protected int attempts;
+  private RetryCounter retryCounter;
 
   protected AbstractPeerNoLockProcedure() {
   }
@@ -87,12 +91,20 @@ protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
     return false;
   }
 
-  protected final ProcedureSuspendedException suspend(long backoff)
-      throws ProcedureSuspendedException {
-    attempts++;
+  protected final ProcedureSuspendedException suspend(Configuration conf,
+      LongConsumer backoffConsumer) throws ProcedureSuspendedException {
+    if (retryCounter == null) {
+      retryCounter = ProcedureUtil.createRetryCounter(conf);
+    }
+    long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
+    backoffConsumer.accept(backoff);
     setTimeout(Math.toIntExact(backoff));
     setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
     skipPersistence();
     throw new ProcedureSuspendedException();
   }
+
+  protected final void resetRetry() {
+    retryCounter = null;
+  }
 }

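Subclasses of AbstractPeerNoLockProcedure now hand the configuration and a logging callback to the reworked suspend helper instead of computing the backoff themselves, and call resetRetry() once the step succeeds. A hypothetical call site; the exception type, log message and updatePeerStorage step are illustrative only:

    // Inside executeFromState of a peer procedure subclass:
    try {
      updatePeerStorage(env); // illustrative step that may fail
      resetRetry();           // success: next failure starts from the base interval
    } catch (IOException e) {
      throw suspend(env.getMasterConfiguration(),
        backoff -> LOG.warn("Failed to update storage for peer {}, suspend {}secs",
          peerId, backoff / 1000, e));
    }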