Skip to content

Commit 37651ee

Browse files
authored
HBASE-27321 The ReplicationLogCleaner is not thread safe but can be called from different threads at the same time (#4730)
Signed-off-by: Xin Sun <ddupgs@gmail.com>
1 parent 06728e5 commit 37651ee

File tree

7 files changed

+243
-56
lines changed

7 files changed

+243
-56
lines changed

hbase-common/src/main/java/org/apache/hadoop/hbase/ChoreService.java

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -182,13 +182,14 @@ public boolean scheduleChore(ScheduledChore chore) {
182182
* @param chore The Chore to be rescheduled. If the chore is not scheduled with this ChoreService
183183
* yet then this call is equivalent to a call to scheduleChore.
184184
*/
185-
private void rescheduleChore(ScheduledChore chore) {
185+
private void rescheduleChore(ScheduledChore chore, boolean immediately) {
186186
if (scheduledChores.containsKey(chore)) {
187187
ScheduledFuture<?> future = scheduledChores.get(chore);
188188
future.cancel(false);
189189
}
190-
ScheduledFuture<?> future = scheduler.scheduleAtFixedRate(chore, chore.getInitialDelay(),
191-
chore.getPeriod(), chore.getTimeUnit());
190+
// if immediately is true, set the initial delay to 0 so the chore runs right away; otherwise wait one full period
191+
ScheduledFuture<?> future = scheduler.scheduleAtFixedRate(chore,
192+
immediately ? 0 : chore.getPeriod(), chore.getPeriod(), chore.getTimeUnit());
192193
scheduledChores.put(chore, future);
193194
}
194195

@@ -244,7 +245,7 @@ public synchronized boolean isChoreScheduled(ScheduledChore chore) {
244245
allowedOnPath = ".*/org/apache/hadoop/hbase/ScheduledChore.java")
245246
synchronized void triggerNow(ScheduledChore chore) {
246247
assert chore.getChoreService() == this;
247-
rescheduleChore(chore);
248+
rescheduleChore(chore, true);
248249
}
249250

250251
/** Returns number of chores that this service currently has scheduled */
@@ -343,7 +344,7 @@ synchronized void onChoreMissedStartTime(ScheduledChore chore) {
343344
// the chore is NOT rescheduled, future executions of this chore will be delayed more and
344345
// more on each iteration. This hurts us because the ScheduledThreadPoolExecutor allocates
345346
// idle threads to chores based on how delayed they are.
346-
rescheduleChore(chore);
347+
rescheduleChore(chore, false);
347348
printChoreDetails("onChoreMissedStartTime", chore);
348349
}
349350

hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterRpcServices.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
import java.util.Map;
3030
import java.util.Map.Entry;
3131
import java.util.Set;
32+
import java.util.concurrent.CompletableFuture;
33+
import java.util.concurrent.ExecutionException;
3234
import java.util.concurrent.ThreadLocalRandom;
3335
import java.util.stream.Collectors;
3436
import org.apache.hadoop.conf.Configuration;
@@ -1632,8 +1634,16 @@ public RunCatalogScanResponse runCatalogScan(RpcController c, RunCatalogScanRequ
16321634
public RunCleanerChoreResponse runCleanerChore(RpcController c, RunCleanerChoreRequest req)
16331635
throws ServiceException {
16341636
rpcPreCheck("runCleanerChore");
1635-
boolean result = server.getHFileCleaner().runCleaner() && server.getLogCleaner().runCleaner();
1636-
return ResponseConverter.buildRunCleanerChoreResponse(result);
1637+
try {
1638+
CompletableFuture<Boolean> fileCleanerFuture = server.getHFileCleaner().triggerCleanerNow();
1639+
CompletableFuture<Boolean> logCleanerFuture = server.getLogCleaner().triggerCleanerNow();
1640+
boolean result = fileCleanerFuture.get() && logCleanerFuture.get();
1641+
return ResponseConverter.buildRunCleanerChoreResponse(result);
1642+
} catch (InterruptedException e) {
1643+
throw new ServiceException(e);
1644+
} catch (ExecutionException e) {
1645+
throw new ServiceException(e.getCause());
1646+
}
16371647
}
16381648

16391649
@Override

hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/CleanerChore.java

Lines changed: 71 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import java.util.Arrays;
2424
import java.util.Comparator;
2525
import java.util.HashMap;
26-
import java.util.LinkedList;
2726
import java.util.List;
2827
import java.util.Map;
2928
import java.util.concurrent.CompletableFuture;
@@ -81,6 +80,8 @@ public abstract class CleanerChore<T extends FileCleanerDelegate> extends Schedu
8180
private final AtomicBoolean enabled = new AtomicBoolean(true);
8281
protected List<T> cleanersChain;
8382
protected List<String> excludeDirs;
83+
private CompletableFuture<Boolean> future;
84+
private boolean forceRun;
8485

8586
public CleanerChore(String name, final int sleepPeriod, final Stoppable s, Configuration conf,
8687
FileSystem fs, Path oldFileDir, String confKey, DirScanPool pool) {
@@ -168,10 +169,10 @@ static int calculatePoolSize(String poolSize) {
168169
* @param confKey key to get the file cleaner classes from the configuration
169170
*/
170171
private void initCleanerChain(String confKey) {
171-
this.cleanersChain = new LinkedList<>();
172-
String[] logCleaners = conf.getStrings(confKey);
173-
if (logCleaners != null) {
174-
for (String className : logCleaners) {
172+
this.cleanersChain = new ArrayList<>();
173+
String[] cleaners = conf.getStrings(confKey);
174+
if (cleaners != null) {
175+
for (String className : cleaners) {
175176
className = className.trim();
176177
if (className.isEmpty()) {
177178
continue;
@@ -208,42 +209,87 @@ private T newFileCleaner(String className, Configuration conf) {
208209
}
209210
}
210211

212+
@Override
213+
protected boolean initialChore() {
214+
synchronized (this) {
215+
if (forceRun) {
216+
// wake up the threads waiting in triggerCleanerNow, as a triggerNow may trigger the first
217+
// loop where we will only call initialChore. We need to trigger another run immediately.
218+
forceRun = false;
219+
notifyAll();
220+
}
221+
}
222+
return true;
223+
}
224+
211225
@Override
212226
protected void chore() {
213-
if (getEnabled()) {
214-
try {
215-
pool.latchCountUp();
216-
if (runCleaner()) {
217-
LOG.trace("Cleaned all WALs under {}", oldFileDir);
227+
CompletableFuture<Boolean> f;
228+
synchronized (this) {
229+
if (!enabled.get()) {
230+
if (!forceRun) {
231+
LOG.trace("Cleaner chore {} disabled! Not cleaning.", getName());
232+
return;
218233
} else {
219-
LOG.trace("WALs outstanding under {}", oldFileDir);
234+
LOG.info("Force executing cleaner chore {} when disabled", getName());
220235
}
221-
} finally {
222-
pool.latchCountDown();
223236
}
237+
if (future != null) {
238+
LOG.warn("A cleaner chore {}'s run is in progress, give up running", getName());
239+
return;
240+
}
241+
f = new CompletableFuture<>();
242+
future = f;
243+
notifyAll();
244+
}
245+
pool.latchCountUp();
246+
try {
247+
preRunCleaner();
248+
pool.execute(() -> traverseAndDelete(oldFileDir, true, f));
249+
if (f.get()) {
250+
LOG.trace("Cleaned all files under {}", oldFileDir);
251+
} else {
252+
LOG.trace("Files outstanding under {}", oldFileDir);
253+
}
254+
} catch (Exception e) {
255+
LOG.info("Failed to traverse and delete the dir: {}", oldFileDir, e);
256+
} finally {
257+
postRunCleaner();
258+
synchronized (this) {
259+
future = null;
260+
forceRun = false;
261+
}
262+
pool.latchCountDown();
224263
// After each cleaner chore, checks if received reconfigure notification while cleaning.
225264
// First in cleaner turns off notification, to avoid another cleaner updating pool again.
226265
// This cleaner is waiting for other cleaners finishing their jobs.
227266
// To avoid missing next chore, only wait 0.8 * period, then shutdown.
228267
pool.tryUpdatePoolSize((long) (0.8 * getTimeUnit().toMillis(getPeriod())));
229-
} else {
230-
LOG.trace("Cleaner chore disabled! Not cleaning.");
231268
}
232269
}
233270

234271
private void preRunCleaner() {
235272
cleanersChain.forEach(FileCleanerDelegate::preClean);
236273
}
237274

238-
public boolean runCleaner() {
239-
preRunCleaner();
240-
try {
241-
CompletableFuture<Boolean> future = new CompletableFuture<>();
242-
pool.execute(() -> traverseAndDelete(oldFileDir, true, future));
243-
return future.get();
244-
} catch (Exception e) {
245-
LOG.info("Failed to traverse and delete the dir: {}", oldFileDir, e);
246-
return false;
275+
private void postRunCleaner() {
276+
cleanersChain.forEach(FileCleanerDelegate::postClean);
277+
}
278+
279+
/**
280+
* Trigger the cleaner immediately and return a CompletableFuture for getting the result. Return
281+
* {@code true} means all the old files have been deleted, otherwise {@code false}.
282+
*/
283+
public synchronized CompletableFuture<Boolean> triggerCleanerNow() throws InterruptedException {
284+
for (;;) {
285+
if (future != null) {
286+
return future;
287+
}
288+
forceRun = true;
289+
if (!triggerNow()) {
290+
return CompletableFuture.completedFuture(false);
291+
}
292+
wait();
247293
}
248294
}
249295

@@ -396,9 +442,6 @@ int getChorePoolSize() {
396442
return pool.getSize();
397443
}
398444

399-
/**
400-
* n
401-
*/
402445
public boolean setEnabled(final boolean enabled) {
403446
return this.enabled.getAndSet(enabled);
404447
}
@@ -449,7 +492,7 @@ private void traverseAndDelete(Path dir, boolean root, CompletableFuture<Boolean
449492
CompletableFuture.allOf(futures.toArray(new CompletableFuture[futures.size()])),
450493
(voidObj, e) -> {
451494
if (e != null) {
452-
result.completeExceptionally(e);
495+
result.completeExceptionally(FutureUtils.unwrapCompletionException(e));
453496
return;
454497
}
455498
try {

hbase-server/src/main/java/org/apache/hadoop/hbase/master/cleaner/FileCleanerDelegate.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ public interface FileCleanerDelegate extends Configurable, Stoppable {
4949
default void preClean() {
5050
}
5151

52+
/**
53+
* Used to do some cleanup work
54+
*/
55+
default void postClean() {
56+
}
57+
5258
/**
5359
* Check if an empty directory with no subdirs or subfiles can be deleted
5460
* @param dir Path of the directory

hbase-server/src/main/java/org/apache/hadoop/hbase/replication/master/ReplicationLogCleaner.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,12 @@ public void preClean() {
6666
}
6767
}
6868

69+
@Override
70+
public void postClean() {
71+
// release memory
72+
wals = null;
73+
}
74+
6975
@Override
7076
public Iterable<FileStatus> getDeletableFiles(Iterable<FileStatus> files) {
7177
// all members of this class are null if replication is disabled,

hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestTableSnapshotScanner.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -449,7 +449,7 @@ public void testMergeRegion() throws Exception {
449449
// set file modify time and then run cleaner
450450
long time = EnvironmentEdgeManager.currentTime() - TimeToLiveHFileCleaner.DEFAULT_TTL * 1000;
451451
traverseAndSetFileTime(HFileArchiveUtil.getArchivePath(conf), time);
452-
UTIL.getMiniHBaseCluster().getMaster().getHFileCleaner().runCleaner();
452+
UTIL.getMiniHBaseCluster().getMaster().getHFileCleaner().triggerCleanerNow().get();
453453
// scan snapshot
454454
try (TableSnapshotScanner scanner =
455455
new TableSnapshotScanner(conf, UTIL.getDataTestDirOnTestFS(snapshotName), snapshotName,

0 commit comments

Comments
 (0)