Skip to content

Commit 7ed0dc9

Browse files
committed
HBASE-28170 Put the cached time at the beginning of the block; run cache validation in the background when retrieving the persistent cache (#5471)
Signed-off-by: Peter Somogyi <psomogyi@apache.org> (cherry-picked from commit 41057bb)
1 parent 1641a4a commit 7ed0dc9

File tree

7 files changed

+266
-70
lines changed

7 files changed

+266
-70
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePreadReader.java

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@ public HFilePreadReader(ReaderContext context, HFileInfo fileInfo, CacheConfig c
4040
Configuration conf) throws IOException {
4141
super(context, fileInfo, cacheConf, conf);
4242
final MutableBoolean fileAlreadyCached = new MutableBoolean(false);
43-
BucketCache.getBuckedCacheFromCacheConfig(cacheConf).ifPresent(bc -> fileAlreadyCached
43+
Optional<BucketCache> bucketCacheOptional =
44+
BucketCache.getBucketCacheFromCacheConfig(cacheConf);
45+
bucketCacheOptional.ifPresent(bc -> fileAlreadyCached
4446
.setValue(bc.getFullyCachedFiles().get(path.getName()) == null ? false : true));
4547
// Prefetch file blocks upon open if requested
4648
if (
@@ -65,8 +67,6 @@ public void run() {
6567
if (LOG.isTraceEnabled()) {
6668
LOG.trace("Prefetch start " + getPathOffsetEndStr(path, offset, end));
6769
}
68-
Optional<BucketCache> bucketCacheOptional =
69-
BucketCache.getBuckedCacheFromCacheConfig(cacheConf);
7070
// Don't use BlockIterator here, because it's designed to read load-on-open section.
7171
long onDiskSizeOfNextBlock = -1;
7272
while (offset < end) {
@@ -78,21 +78,24 @@ public void run() {
7878
// to the next block without actually going read all the way to the cache.
7979
if (bucketCacheOptional.isPresent()) {
8080
BucketCache cache = bucketCacheOptional.get();
81-
BlockCacheKey cacheKey = new BlockCacheKey(name, offset);
82-
BucketEntry entry = cache.getBackingMap().get(cacheKey);
83-
if (entry != null) {
84-
cacheKey = new BlockCacheKey(name, offset);
85-
entry = cache.getBackingMap().get(cacheKey);
86-
if (entry == null) {
87-
LOG.debug("No cache key {}, we'll read and cache it", cacheKey);
81+
if (cache.getBackingMapValidated().get()) {
82+
BlockCacheKey cacheKey = new BlockCacheKey(name, offset);
83+
BucketEntry entry = cache.getBackingMap().get(cacheKey);
84+
if (entry != null) {
85+
cacheKey = new BlockCacheKey(name, offset);
86+
entry = cache.getBackingMap().get(cacheKey);
87+
if (entry == null) {
88+
LOG.debug("No cache key {}, we'll read and cache it", cacheKey);
89+
} else {
90+
offset += entry.getOnDiskSizeWithHeader();
91+
LOG.debug(
92+
"Found cache key {}. Skipping prefetch, the block is already cached.",
93+
cacheKey);
94+
continue;
95+
}
8896
} else {
89-
offset += entry.getOnDiskSizeWithHeader();
90-
LOG.debug("Found cache key {}. Skipping prefetch, the block is already cached.",
91-
cacheKey);
92-
continue;
97+
LOG.debug("No entry in the backing map for cache key {}", cacheKey);
9398
}
94-
} else {
95-
LOG.debug("No entry in the backing map for cache key {}", cacheKey);
9699
}
97100
}
98101
// Perhaps we got our block from cache? Unlikely as this may be, if it happens, then
@@ -111,9 +114,7 @@ public void run() {
111114
block.release();
112115
}
113116
}
114-
BucketCache.getBuckedCacheFromCacheConfig(cacheConf)
115-
.ifPresent(bc -> bc.fileCacheCompleted(path.getName()));
116-
117+
bucketCacheOptional.ifPresent(bc -> bc.fileCacheCompleted(path.getName()));
117118
} catch (IOException e) {
118119
// IOExceptions are probably due to region closes (relocation, etc.)
119120
if (LOG.isTraceEnabled()) {

hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCache.java

Lines changed: 74 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,8 @@ public class BucketCache implements BlockCache, HeapSize {
144144
// In this map, store the block's meta data like offset, length
145145
transient Map<BlockCacheKey, BucketEntry> backingMap;
146146

147+
private AtomicBoolean backingMapValidated = new AtomicBoolean(false);
148+
147149
/** Set of files for which prefetch is completed */
148150
final Map<String, Boolean> fullyCachedFiles = new ConcurrentHashMap<>();
149151

@@ -301,7 +303,6 @@ public BucketCache(String ioEngineName, long capacity, int blockSize, int[] buck
301303

302304
this.allocFailLogPrevTs = 0;
303305

304-
bucketAllocator = new BucketAllocator(capacity, bucketSizes);
305306
for (int i = 0; i < writerThreads.length; ++i) {
306307
writerQueues.add(new ArrayBlockingQueue<>(writerQLen));
307308
}
@@ -318,10 +319,14 @@ public BucketCache(String ioEngineName, long capacity, int blockSize, int[] buck
318319
try {
319320
retrieveFromFile(bucketSizes);
320321
} catch (IOException ioex) {
322+
LOG.error("Can't restore from file[{}] because of ", persistencePath, ioex);
321323
backingMap.clear();
322324
fullyCachedFiles.clear();
323-
LOG.error("Can't restore from file[" + persistencePath + "] because of ", ioex);
325+
backingMapValidated.set(true);
326+
bucketAllocator = new BucketAllocator(capacity, bucketSizes);
324327
}
328+
} else {
329+
bucketAllocator = new BucketAllocator(capacity, bucketSizes);
325330
}
326331
final String threadName = Thread.currentThread().getName();
327332
this.cacheEnabled = true;
@@ -374,6 +379,7 @@ protected void startWriterThreads() {
374379
}
375380

376381
void startBucketCachePersisterThread() {
382+
LOG.info("Starting BucketCachePersisterThread");
377383
cachePersister = new BucketCachePersister(this, bucketcachePersistInterval);
378384
cachePersister.setDaemon(true);
379385
cachePersister.start();
@@ -529,6 +535,7 @@ protected void cacheBlockWithWaitInternal(BlockCacheKey cacheKey, Cacheable cach
529535
} else {
530536
this.blockNumber.increment();
531537
this.heapSize.add(cachedItem.heapSize());
538+
blocksByHFile.add(cacheKey);
532539
}
533540
}
534541

@@ -589,6 +596,7 @@ public Cacheable getBlock(BlockCacheKey key, boolean caching, boolean repeat,
589596
// the cache map state might differ from the actual cache. If we reach this block,
590597
// we should remove the cache key entry from the backing map
591598
backingMap.remove(key);
599+
fullyCachedFiles.remove(key.getHfileName());
592600
LOG.debug("Failed to fetch block for cache key: {}.", key, hioex);
593601
} catch (IOException ioex) {
594602
LOG.error("Failed reading block " + key + " from bucket cache", ioex);
@@ -684,6 +692,7 @@ private boolean doEvictBlock(BlockCacheKey cacheKey, BucketEntry bucketEntry,
684692
} else {
685693
return bucketEntryToUse.withWriteLock(offsetLock, () -> {
686694
if (backingMap.remove(cacheKey, bucketEntryToUse)) {
695+
LOG.debug("removed key {} from back map in the evict process", cacheKey);
687696
blockEvicted(cacheKey, bucketEntryToUse, !existedInRamCache, evictedByEvictionProcess);
688697
return true;
689698
}
@@ -1255,30 +1264,43 @@ static List<RAMQueueEntry> getRAMQueueEntries(BlockingQueue<RAMQueueEntry> q,
12551264
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "OBL_UNSATISFIED_OBLIGATION",
12561265
justification = "false positive, try-with-resources ensures close is called.")
12571266
void persistToFile() throws IOException {
1267+
LOG.debug("Thread {} started persisting bucket cache to file",
1268+
Thread.currentThread().getName());
12581269
if (!isCachePersistent()) {
12591270
throw new IOException("Attempt to persist non-persistent cache mappings!");
12601271
}
12611272
File tempPersistencePath = new File(persistencePath + EnvironmentEdgeManager.currentTime());
12621273
try (FileOutputStream fos = new FileOutputStream(tempPersistencePath, false)) {
12631274
fos.write(ProtobufMagic.PB_MAGIC);
12641275
BucketProtoUtils.toPB(this).writeDelimitedTo(fos);
1276+
} catch (IOException e) {
1277+
LOG.error("Failed to persist bucket cache to file", e);
1278+
throw e;
12651279
}
1280+
LOG.debug("Thread {} finished persisting bucket cache to file, renaming",
1281+
Thread.currentThread().getName());
12661282
if (!tempPersistencePath.renameTo(new File(persistencePath))) {
12671283
LOG.warn("Failed to commit cache persistent file. We might lose cached blocks if "
12681284
+ "RS crashes/restarts before we successfully checkpoint again.");
12691285
}
12701286
}
12711287

1272-
private boolean isCachePersistent() {
1288+
public boolean isCachePersistent() {
12731289
return ioEngine.isPersistent() && persistencePath != null;
12741290
}
12751291

12761292
/**
12771293
* @see #persistToFile()
12781294
*/
12791295
private void retrieveFromFile(int[] bucketSizes) throws IOException {
1296+
LOG.info("Started retrieving bucket cache from file");
12801297
File persistenceFile = new File(persistencePath);
12811298
if (!persistenceFile.exists()) {
1299+
LOG.warn("Persistence file missing! "
1300+
+ "It's ok if it's first run after enabling persistent cache.");
1301+
bucketAllocator = new BucketAllocator(cacheCapacity, bucketSizes, backingMap, realCacheSize);
1302+
blockNumber.add(backingMap.size());
1303+
backingMapValidated.set(true);
12821304
return;
12831305
}
12841306
assert !cacheEnabled;
@@ -1296,6 +1318,7 @@ private void retrieveFromFile(int[] bucketSizes) throws IOException {
12961318
parsePB(BucketCacheProtos.BucketCacheEntry.parseDelimitedFrom(in));
12971319
bucketAllocator = new BucketAllocator(cacheCapacity, bucketSizes, backingMap, realCacheSize);
12981320
blockNumber.add(backingMap.size());
1321+
LOG.info("Bucket cache retrieved from file successfully");
12991322
}
13001323
}
13011324

@@ -1368,27 +1391,43 @@ private void parsePB(BucketCacheProtos.BucketCacheEntry proto) throws IOExceptio
13681391
try {
13691392
((PersistentIOEngine) ioEngine).verifyFileIntegrity(proto.getChecksum().toByteArray(),
13701393
algorithm);
1394+
backingMapValidated.set(true);
13711395
} catch (IOException e) {
13721396
LOG.warn("Checksum for cache file failed. "
1373-
+ "We need to validate each cache key in the backing map. This may take some time...");
1374-
long startTime = EnvironmentEdgeManager.currentTime();
1375-
int totalKeysOriginally = backingMap.size();
1376-
for (Map.Entry<BlockCacheKey, BucketEntry> keyEntry : backingMap.entrySet()) {
1377-
try {
1378-
((FileIOEngine) ioEngine).checkCacheTime(keyEntry.getValue());
1379-
} catch (IOException e1) {
1380-
LOG.debug("Check for key {} failed. Removing it from map.", keyEntry.getKey());
1381-
backingMap.remove(keyEntry.getKey());
1382-
fullyCachedFiles.remove(keyEntry.getKey().getHfileName());
1397+
+ "We need to validate each cache key in the backing map. "
1398+
+ "This may take some time, so we'll do it in a background thread,");
1399+
Runnable cacheValidator = () -> {
1400+
while (bucketAllocator == null) {
1401+
try {
1402+
Thread.sleep(50);
1403+
} catch (InterruptedException ex) {
1404+
throw new RuntimeException(ex);
1405+
}
13831406
}
1384-
}
1385-
LOG.info("Finished validating {} keys in the backing map. Recovered: {}. This took {}ms.",
1386-
totalKeysOriginally, backingMap.size(),
1387-
(EnvironmentEdgeManager.currentTime() - startTime));
1407+
long startTime = EnvironmentEdgeManager.currentTime();
1408+
int totalKeysOriginally = backingMap.size();
1409+
for (Map.Entry<BlockCacheKey, BucketEntry> keyEntry : backingMap.entrySet()) {
1410+
try {
1411+
((FileIOEngine) ioEngine).checkCacheTime(keyEntry.getValue());
1412+
} catch (IOException e1) {
1413+
LOG.debug("Check for key {} failed. Evicting.", keyEntry.getKey());
1414+
evictBlock(keyEntry.getKey());
1415+
fullyCachedFiles.remove(keyEntry.getKey().getHfileName());
1416+
}
1417+
}
1418+
backingMapValidated.set(true);
1419+
LOG.info("Finished validating {} keys in the backing map. Recovered: {}. This took {}ms.",
1420+
totalKeysOriginally, backingMap.size(),
1421+
(EnvironmentEdgeManager.currentTime() - startTime));
1422+
};
1423+
Thread t = new Thread(cacheValidator);
1424+
t.setDaemon(true);
1425+
t.start();
13881426
}
13891427
} else {
13901428
// if has not checksum, it means the persistence file is old format
13911429
LOG.info("Persistent file is old format, it does not support verifying file integrity!");
1430+
backingMapValidated.set(true);
13921431
}
13931432
verifyCapacityAndClasses(proto.getCacheCapacity(), proto.getIoClass(), proto.getMapClass());
13941433
}
@@ -1417,6 +1456,7 @@ private void checkIOErrorIsTolerated() {
14171456
*/
14181457
private void disableCache() {
14191458
if (!cacheEnabled) return;
1459+
LOG.info("Disabling cache");
14201460
cacheEnabled = false;
14211461
ioEngine.shutdown();
14221462
this.scheduleThreadPool.shutdown();
@@ -1441,11 +1481,15 @@ public void shutdown() {
14411481
LOG.info("Shutdown bucket cache: IO persistent=" + ioEngine.isPersistent() + "; path to write="
14421482
+ persistencePath);
14431483
if (ioEngine.isPersistent() && persistencePath != null) {
1444-
if (cachePersister != null) {
1445-
cachePersister.interrupt();
1446-
}
14471484
try {
14481485
join();
1486+
if (cachePersister != null) {
1487+
LOG.info("Shutting down cache persister thread.");
1488+
cachePersister.shutdown();
1489+
while (cachePersister.isAlive()) {
1490+
Thread.sleep(10);
1491+
}
1492+
}
14491493
persistToFile();
14501494
} catch (IOException ex) {
14511495
LOG.error("Unable to persist data on exit: " + ex.toString(), ex);
@@ -1650,17 +1694,17 @@ public BucketEntry writeToCache(final IOEngine ioEngine, final BucketAllocator a
16501694
HFileBlock block = (HFileBlock) data;
16511695
ByteBuff sliceBuf = block.getBufferReadOnly();
16521696
block.getMetaData(metaBuff);
1653-
ioEngine.write(sliceBuf, offset);
1654-
// adds the cache time after the block and metadata part
1697+
// adds the cache time prior to the block and metadata part
16551698
if (isCachePersistent) {
1656-
ioEngine.write(metaBuff, offset + len - metaBuff.limit() - Long.BYTES);
16571699
ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES);
16581700
buffer.putLong(bucketEntry.getCachedTime());
16591701
buffer.rewind();
1660-
ioEngine.write(buffer, (offset + len - Long.BYTES));
1702+
ioEngine.write(buffer, offset);
1703+
ioEngine.write(sliceBuf, (offset + Long.BYTES));
16611704
} else {
1662-
ioEngine.write(metaBuff, offset + len - metaBuff.limit());
1705+
ioEngine.write(sliceBuf, offset);
16631706
}
1707+
ioEngine.write(metaBuff, offset + len - metaBuff.limit());
16641708
} else {
16651709
// Only used for testing.
16661710
ByteBuffer bb = ByteBuffer.allocate(len);
@@ -1902,11 +1946,15 @@ public Map<BlockCacheKey, BucketEntry> getBackingMap() {
19021946
return backingMap;
19031947
}
19041948

1949+
public AtomicBoolean getBackingMapValidated() {
1950+
return backingMapValidated;
1951+
}
1952+
19051953
public Map<String, Boolean> getFullyCachedFiles() {
19061954
return fullyCachedFiles;
19071955
}
19081956

1909-
public static Optional<BucketCache> getBuckedCacheFromCacheConfig(CacheConfig cacheConf) {
1957+
public static Optional<BucketCache> getBucketCacheFromCacheConfig(CacheConfig cacheConf) {
19101958
if (cacheConf.getBlockCache().isPresent()) {
19111959
BlockCache bc = cacheConf.getBlockCache().get();
19121960
if (bc instanceof CombinedBlockCache) {

hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/bucket/BucketCachePersister.java

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package org.apache.hadoop.hbase.io.hfile.bucket;
1919

2020
import java.io.IOException;
21+
import java.util.concurrent.atomic.AtomicBoolean;
2122
import org.apache.yetus.audience.InterfaceAudience;
2223
import org.slf4j.Logger;
2324
import org.slf4j.LoggerFactory;
@@ -28,6 +29,8 @@ public class BucketCachePersister extends Thread {
2829
private final long intervalMillis;
2930
private static final Logger LOG = LoggerFactory.getLogger(BucketCachePersister.class);
3031

32+
private AtomicBoolean shutdown = new AtomicBoolean(false);
33+
3134
public BucketCachePersister(BucketCache cache, long intervalMillis) {
3235
super("bucket-cache-persister");
3336
this.cache = cache;
@@ -36,20 +39,34 @@ public BucketCachePersister(BucketCache cache, long intervalMillis) {
3639
}
3740

3841
public void run() {
39-
while (true) {
40-
try {
41-
Thread.sleep(intervalMillis);
42-
if (cache.isCacheInconsistent()) {
43-
LOG.debug("Cache is inconsistent, persisting to disk");
44-
cache.persistToFile();
45-
cache.setCacheInconsistent(false);
42+
try {
43+
while (true) {
44+
try {
45+
Thread.sleep(intervalMillis);
46+
if (cache.isCacheInconsistent()) {
47+
LOG.debug("Cache is inconsistent, persisting to disk");
48+
cache.persistToFile();
49+
cache.setCacheInconsistent(false);
50+
}
51+
// Thread.interrupt may cause an InterruptException inside util method used for checksum
52+
// calculation in persistToFile. This util currently swallows the exception, causing this
53+
// thread to net get interrupt, so we added this flag to indicate the persister thread
54+
// should stop.
55+
if (shutdown.get()) {
56+
break;
57+
}
58+
} catch (IOException e) {
59+
LOG.warn("Exception in BucketCachePersister.", e);
4660
}
47-
} catch (IOException e) {
48-
LOG.warn("IOException in BucketCachePersister {} ", e.getMessage());
49-
} catch (InterruptedException iex) {
50-
LOG.warn("InterruptedException in BucketCachePersister {} ", iex.getMessage());
51-
break;
5261
}
62+
LOG.info("Finishing cache persister thread.");
63+
} catch (InterruptedException e) {
64+
LOG.warn("Interrupting BucketCachePersister thread.", e);
5365
}
5466
}
67+
68+
public void shutdown() {
69+
this.shutdown.set(true);
70+
this.interrupt();
71+
}
5572
}

0 commit comments

Comments
 (0)