Skip to content

Commit

Permalink
HDFS-16550. Allow JN edit cache size to be set as a fraction of heap …
Browse files Browse the repository at this point in the history
…memory (#4209)
  • Loading branch information
tomscut authored Nov 30, 2022
1 parent 7786600 commit 2067fcb
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1424,7 +1424,10 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final long DFS_JOURNALNODE_SYNC_INTERVAL_DEFAULT = 2*60*1000L;
public static final String DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY =
"dfs.journalnode.edit-cache-size.bytes";
public static final int DFS_JOURNALNODE_EDIT_CACHE_SIZE_DEFAULT = 1024 * 1024;

// Fraction of the JVM's maximum memory used to derive the JournalNode edit
// cache capacity when DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY is not set.
// JournaledEditsCache rejects values that are not in the open range (0, 1.0).
public static final String DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY =
"dfs.journalnode.edit-cache-size.fraction";
// Default fraction: half of the JVM's maximum memory.
public static final float DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_DEFAULT = 0.5f;

// Journal-node related configs for the client side.
public static final String DFS_QJOURNAL_QUEUE_SIZE_LIMIT_KEY = "dfs.qjournal.queued-edits.limit.mb";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp;
import org.apache.hadoop.util.AutoCloseableLock;
import org.apache.hadoop.util.Preconditions;

/**
* An in-memory cache of edits in their serialized form. This is used to serve
Expand Down Expand Up @@ -121,12 +122,18 @@ class JournaledEditsCache {
// ** End lock-protected fields **

JournaledEditsCache(Configuration conf) {
float fraction = conf.getFloat(DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY,
DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_DEFAULT);
Preconditions.checkArgument((fraction > 0 && fraction < 1.0f),
String.format("Cache config %s is set at %f, it should be a positive float value, " +
"less than 1.0. The recommended value is less than 0.9.",
DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY, fraction));
capacity = conf.getInt(DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY,
DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_DEFAULT);
(int) (Runtime.getRuntime().maxMemory() * fraction));
if (capacity > 0.9 * Runtime.getRuntime().maxMemory()) {
Journal.LOG.warn(String.format("Cache capacity is set at %d bytes but " +
"maximum JVM memory is only %d bytes. It is recommended that you " +
"decrease the cache size or increase the heap size.",
"decrease the cache size/fraction or increase the heap size.",
capacity, Runtime.getRuntime().maxMemory()));
}
Journal.LOG.info("Enabling the journaled edits cache with a capacity " +
Expand Down Expand Up @@ -277,11 +284,12 @@ void storeEdits(byte[] inputData, long newStartTxn, long newEndTxn,
initialize(INVALID_TXN_ID);
Journal.LOG.warn(String.format("A single batch of edits was too " +
"large to fit into the cache: startTxn = %d, endTxn = %d, " +
"input length = %d. The capacity of the cache (%s) must be " +
"input length = %d. The cache size (%s) or cache fraction (%s) must be " +
"increased for it to work properly (current capacity %d)." +
"Cache is now empty.",
newStartTxn, newEndTxn, inputData.length,
DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY, capacity));
DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY,
DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY, capacity));
return;
}
if (dataMap.isEmpty()) {
Expand Down Expand Up @@ -388,10 +396,11 @@ private CacheMissException getCacheMissException(long requestedTxnId) {
} else {
return new CacheMissException(lowestTxnId - requestedTxnId,
"Oldest txn ID available in the cache is %d, but requested txns " +
"starting at %d. The cache size (%s) may need to be increased " +
"to hold more transactions (currently %d bytes containing %d " +
"starting at %d. The cache size (%s) or cache fraction (%s) may need to be " +
"increased to hold more transactions (currently %d bytes containing %d " +
"transactions)", lowestTxnId, requestedTxnId,
DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY, capacity,
DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY,
DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY, capacity,
highestTxnId - lowestTxnId + 1);
}
}
Expand All @@ -414,4 +423,9 @@ long getCacheMissAmount() {

}

/**
 * Exposes the capacity this cache was configured with so that tests can
 * verify how it was derived from the configuration.
 *
 * @return the cache capacity, in bytes
 */
@VisibleForTesting
int getCapacity() {
  return this.capacity;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -4945,7 +4945,7 @@

<property>
<name>dfs.journalnode.edit-cache-size.bytes</name>
<value>1048576</value>
<value></value>
<description>
The size, in bytes, of the in-memory cache of edits to keep on the
JournalNode. This cache is used to serve edits for tailing via the RPC-based
Expand All @@ -4955,6 +4955,22 @@
</description>
</property>

<property>
<name>dfs.journalnode.edit-cache-size.fraction</name>
<value>0.5f</value>
<description>
The fraction of the JVM's maximum memory used to calculate the size of the
edits cache that is kept in the JournalNode's memory. This config is an
alternative to dfs.journalnode.edit-cache-size.bytes. The cache is used to serve
edits for tailing via the RPC-based mechanism, and is only enabled when
dfs.ha.tail-edits.in-progress is true. Transactions range in size but are around
200 bytes on average, so every 1MB of cache can store around 5000 transactions;
choose a value that is reasonable for the configured maximum memory. The value
must be greater than 0 and less than 1.0; the recommended value is less than 0.9.
If dfs.journalnode.edit-cache-size.bytes is set, this parameter does not take effect.
</description>
</property>

<property>
<name>dfs.journalnode.kerberos.internal.spnego.principal</name>
<value></value>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,16 @@ lag time will be much longer. The relevant configurations are:
the oldest data in the cache was at transaction ID 20, a value of 10 would be added to the
average.

* **dfs.journalnode.edit-cache-size.fraction** - The fraction of the JVM's maximum
memory used to calculate the size of the edits cache that is kept in the
JournalNode's memory. This config is an alternative to
dfs.journalnode.edit-cache-size.bytes. The cache is used to serve edits for tailing via
the RPC-based mechanism, and is only enabled when dfs.ha.tail-edits.in-progress is true.
Transactions range in size but are around 200 bytes on average, so every 1MB of cache
can store around 5000 transactions; choose a value that is reasonable for the configured
maximum memory. The value must be greater than 0 and less than 1.0; the recommended value
is less than 0.9. If dfs.journalnode.edit-cache-size.bytes is set, this parameter does
not take effect.

This feature is primarily useful in conjunction with the Standby/Observer Read feature. Using this
feature, read requests can be serviced from non-active NameNodes; thus tailing in-progress edits
provides these nodes with the ability to serve requests with data which is much more fresh. See the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,24 @@ few configurations to your **hdfs-site.xml**:
<value>1048576</value>
</property>

* **dfs.journalnode.edit-cache-size.fraction** - the fraction of
the maximum memory of the JVM that is used for the edits cache.

This fraction is used to calculate the size of the edits cache that
is kept in the JournalNode's memory.
This config is an alternative to dfs.journalnode.edit-cache-size.bytes.
The cache is used to serve edits for tailing via the RPC-based mechanism, and is only
enabled when dfs.ha.tail-edits.in-progress is true. Transactions range in size but
are around 200 bytes on average, so every 1MB of cache can store around 5000 transactions;
choose a value that is reasonable for the configured maximum memory. The value must be
greater than 0 and less than 1.0; the recommended value is less than 0.9.
If dfs.journalnode.edit-cache-size.bytes is set, this parameter does not take effect.

<property>
<name>dfs.journalnode.edit-cache-size.fraction</name>
<value>0.5f</value>
</property>

* **dfs.namenode.accesstime.precision** -- whether to enable access
time for HDFS file.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,27 @@ public void testCacheMalformedInput() throws Exception {
cache.retrieveEdits(-1, 10, new ArrayList<>());
}

/**
 * Checks how the cache capacity is chosen: the default fraction of the JVM's
 * maximum memory when nothing is configured, the explicit byte size when it
 * is set (even if a fraction is also set), and the configured fraction when
 * only the fraction is set.
 */
@Test
public void testCacheSizeConfigs() {
  // Case 1: nothing configured, so the default fraction (0.5) applies.
  Configuration defaultConf = new Configuration();
  cache = new JournaledEditsCache(defaultConf);
  int expectedDefault = (int) (Runtime.getRuntime().maxMemory() * 0.5f);
  assertEquals(expectedDefault, cache.getCapacity());

  // Case 2: explicit byte size wins over the fraction.
  Configuration sizeAndFractionConf = new Configuration();
  sizeAndFractionConf.setInt(DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY, 1);
  sizeAndFractionConf.setFloat(
      DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY, 0.1f);
  cache = new JournaledEditsCache(sizeAndFractionConf);
  assertEquals(1, cache.getCapacity());

  // Case 3: only the fraction is set, so capacity is that share of max memory.
  Configuration fractionOnlyConf = new Configuration();
  fractionOnlyConf.setFloat(
      DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_FRACTION_KEY, 0.1f);
  cache = new JournaledEditsCache(fractionOnlyConf);
  int expectedFromFraction = (int) (Runtime.getRuntime().maxMemory() * 0.1f);
  assertEquals(expectedFromFraction, cache.getCapacity());
}

private void storeEdits(int startTxn, int endTxn) throws Exception {
cache.storeEdits(createTxnData(startTxn, endTxn - startTxn + 1), startTxn,
endTxn, NameNodeLayoutVersion.CURRENT_LAYOUT_VERSION);
Expand Down

0 comments on commit 2067fcb

Please sign in to comment.