Skip to content

Commit 591f781

Browse files
authored
HBASE-26552 Introduce retry to logroller to avoid abort (#4038)
Signed-off-by: Andrew Purtell <apurtell@apache.org>
1 parent f3a48d1 commit 591f781

File tree

1 file changed

+49
-10
lines changed

1 file changed

+49
-10
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractWALRoller.java

Lines changed: 49 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,29 @@ public abstract class AbstractWALRoller<T extends Abortable> extends Thread
6060

6161
protected static final String WAL_ROLL_PERIOD_KEY = "hbase.regionserver.logroll.period";
6262

63+
/**
64+
* Configuration key for the timeout, in milliseconds, of log rolling retries.
65+
*/
66+
protected static final String WAL_ROLL_WAIT_TIMEOUT = "hbase.regionserver.logroll.wait.timeout.ms";
67+
68+
/**
69+
* Configuration key for the maximum number of log rolling retries.
70+
* The real retry count is also limited by the timeout of log rolling
71+
* via {@link #WAL_ROLL_WAIT_TIMEOUT}
72+
*/
73+
protected static final String WAL_ROLL_RETRIES = "hbase.regionserver.logroll.retries";
74+
6375
protected final ConcurrentMap<WAL, RollController> wals = new ConcurrentHashMap<>();
6476
protected final T abortable;
6577
// Period to roll log.
6678
private final long rollPeriod;
6779
private final int threadWakeFrequency;
6880
// The interval to check low replication on hlog's pipeline
6981
private final long checkLowReplicationInterval;
82+
// Maximum total time, in milliseconds, to keep retrying a failed log roll
83+
private final long rollWaitTimeout;
84+
// Maximum number of retries for a failed log roll
85+
private final int maxRollRetry;
7086

7187
private volatile boolean running = true;
7288

@@ -114,6 +130,9 @@ protected AbstractWALRoller(String name, Configuration conf, T abortable) {
114130
this.threadWakeFrequency = conf.getInt(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
115131
this.checkLowReplicationInterval =
116132
conf.getLong("hbase.regionserver.hlog.check.lowreplication.interval", 30 * 1000);
133+
this.rollWaitTimeout = conf.getLong(WAL_ROLL_WAIT_TIMEOUT, 30000);
134+
// retry rolling does not have to be the default behavior, so the default value is 0 here
135+
this.maxRollRetry = conf.getInt(WAL_ROLL_RETRIES, 0);
117136
}
118137

119138
/**
@@ -184,18 +203,38 @@ public void run() {
184203
} else {
185204
continue;
186205
}
187-
try {
188-
// Force the roll if the logroll.period is elapsed or if a roll was requested.
189-
// The returned value is an collection of actual region and family names.
190-
Map<byte[], List<byte[]>> regionsToFlush = controller.rollWal(now);
191-
if (regionsToFlush != null) {
192-
for (Map.Entry<byte[], List<byte[]>> r : regionsToFlush.entrySet()) {
193-
scheduleFlush(Bytes.toString(r.getKey()), r.getValue());
206+
Map<byte[], List<byte[]>> regionsToFlush = null;
207+
int nAttempts = 0;
208+
long startWaiting = EnvironmentEdgeManager.currentTime();
209+
do {
210+
try {
211+
// Force the roll if the logroll.period is elapsed or if a roll was requested.
212+
// The returned value is a collection of actual region and family names.
213+
regionsToFlush = controller.rollWal(EnvironmentEdgeManager.currentTime());
214+
break;
215+
} catch (IOException ioe) {
216+
if (ioe instanceof WALClosedException) {
217+
LOG.warn("WAL has been closed. Skipping rolling of writer and just remove it", ioe);
218+
iter.remove();
219+
break;
194220
}
221+
long waitingTime = EnvironmentEdgeManager.currentTime() - startWaiting;
222+
if (waitingTime < rollWaitTimeout && nAttempts < maxRollRetry) {
223+
nAttempts++;
224+
LOG.warn("Retry to roll log, nAttempts={}, waiting time={}ms, sleeping 1s to retry,"
225+
+ " last excepiton= {}", nAttempts, waitingTime,
226+
ioe.getCause().getClass().getSimpleName());
227+
sleep(1000);
228+
} else {
229+
LOG.error("Roll wal failed and waiting timeout, will not retry", ioe);
230+
throw ioe;
231+
}
232+
}
233+
} while (EnvironmentEdgeManager.currentTime() - startWaiting < rollWaitTimeout);
234+
if (regionsToFlush != null) {
235+
for (Map.Entry<byte[], List<byte[]>> r : regionsToFlush.entrySet()) {
236+
scheduleFlush(Bytes.toString(r.getKey()), r.getValue());
195237
}
196-
} catch (WALClosedException e) {
197-
LOG.warn("WAL has been closed. Skipping rolling of writer and just remove it", e);
198-
iter.remove();
199238
}
200239
}
201240
} catch (FailedLogCloseException | ConnectException e) {

0 commit comments

Comments
 (0)