Skip to content

Commit 77dbf7a

Browse files
authored
HBASE-26552 Introduce retry to logroller to avoid abort (#4171)
Signed-off-by: Andrew Purtell <apurtell@apache.org>
1 parent 45801a7 commit 77dbf7a

File tree

1 file changed

+44
-3
lines changed

1 file changed

+44
-3
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractWALRoller.java

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException;
3636
import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
3737
import org.apache.hadoop.hbase.util.Bytes;
38+
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
3839
import org.apache.hadoop.ipc.RemoteException;
3940
import org.apache.yetus.audience.InterfaceAudience;
4041
import org.slf4j.Logger;
@@ -58,13 +59,30 @@ public abstract class AbstractWALRoller<T extends Abortable> extends Thread
5859

5960
protected static final String WAL_ROLL_PERIOD_KEY = "hbase.regionserver.logroll.period";
6061

62+
/**
63+
* Configure for the timeout of log rolling retry.
64+
*/
65+
protected static final String WAL_ROLL_WAIT_TIMEOUT =
66+
"hbase.regionserver.logroll.wait.timeout.ms";
67+
68+
/**
69+
* Configure for the max count of log rolling retry.
70+
* The real retry count is also limited by the timeout of log rolling
71+
* via {@link #WAL_ROLL_WAIT_TIMEOUT}
72+
*/
73+
protected static final String WAL_ROLL_RETRIES = "hbase.regionserver.logroll.retries";
74+
6175
protected final ConcurrentMap<WAL, RollController> wals = new ConcurrentHashMap<>();
6276
protected final T abortable;
6377
// Period to roll log.
6478
private final long rollPeriod;
6579
private final int threadWakeFrequency;
6680
// The interval to check low replication on hlog's pipeline
6781
private final long checkLowReplicationInterval;
82+
// Wait period for roll log
83+
private final long rollWaitTimeout;
84+
// Max retry for roll log
85+
private final int maxRollRetry;
6886

6987
private volatile boolean running = true;
7088

@@ -112,6 +130,9 @@ protected AbstractWALRoller(String name, Configuration conf, T abortable) {
112130
this.threadWakeFrequency = conf.getInt(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
113131
this.checkLowReplicationInterval =
114132
conf.getLong("hbase.regionserver.hlog.check.lowreplication.interval", 30 * 1000);
133+
this.rollWaitTimeout = conf.getLong(WAL_ROLL_WAIT_TIMEOUT, 30000);
134+
// retry rolling does not have to be the default behavior, so the default value is 0 here
135+
this.maxRollRetry = conf.getInt(WAL_ROLL_RETRIES, 0);
115136
}
116137

117138
/**
@@ -182,9 +203,29 @@ public void run() {
182203
} else {
183204
continue;
184205
}
185-
// Force the roll if the logroll.period is elapsed or if a roll was requested.
186-
// The returned value is an collection of actual region and family names.
187-
Map<byte[], List<byte[]>> regionsToFlush = controller.rollWal(now);
206+
Map<byte[], List<byte[]>> regionsToFlush = null;
207+
int nAttempts = 0;
208+
long startWaiting = System.currentTimeMillis();
209+
do {
210+
try {
211+
// Force the roll if the logroll.period is elapsed or if a roll was requested.
212+
// The returned value is an collection of actual region and family names.
213+
regionsToFlush = controller.rollWal(System.currentTimeMillis());
214+
break;
215+
} catch (IOException ioe) {
216+
long waitingTime = System.currentTimeMillis() - startWaiting;
217+
if (waitingTime < rollWaitTimeout && nAttempts < maxRollRetry) {
218+
nAttempts++;
219+
LOG.warn("Retry to roll log, nAttempts={}, waiting time={}ms, sleeping 1s to retry,"
220+
+ " last excepiton= {}", nAttempts, waitingTime,
221+
ioe.getCause().getClass().getSimpleName());
222+
sleep(1000);
223+
} else {
224+
LOG.error("Roll wal failed and waiting timeout, will not retry", ioe);
225+
throw ioe;
226+
}
227+
}
228+
} while (EnvironmentEdgeManager.currentTime() - startWaiting < rollWaitTimeout);
188229
if (regionsToFlush != null) {
189230
for (Map.Entry<byte[], List<byte[]>> r : regionsToFlush.entrySet()) {
190231
scheduleFlush(Bytes.toString(r.getKey()), r.getValue());

0 commit comments

Comments
 (0)