Skip to content

Commit 43fac73

Browse files
fengnanli authored and aajisaka committed
HDFS-15810. RBF: RBFMetrics's TotalCapacity out of bounds (#2910)
Reviewed-by: Inigo Goiri <inigoiri@apache.org> Signed-off-by: Akira Ajisaka <aajisaka@apache.org> (cherry picked from commit 6e525ab)
1 parent 670205c commit 43fac73

File tree

6 files changed

+115
-9
lines changed

6 files changed

+115
-9
lines changed

hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -507,9 +507,12 @@ RBFMetrics shows the metrics which are the aggregated values of sub-clusters' in
507507
| `NumInMaintenanceLiveDataNodes` | Number of live Datanodes which are in maintenance state |
508508
| `NumInMaintenanceDeadDataNodes` | Number of dead Datanodes which are in maintenance state |
509509
| `NumEnteringMaintenanceDataNodes` | Number of Datanodes that are entering the maintenance state |
510-
| `TotalCapacity` | Current raw capacity of DataNodes in bytes |
511-
| `UsedCapacity` | Current used capacity across all DataNodes in bytes |
512-
| `RemainingCapacity` | Current remaining capacity in bytes |
510+
| `TotalCapacity` | Current raw capacity of DataNodes in bytes (long primitive, may overflow) |
511+
| `UsedCapacity` | Current used capacity across all DataNodes in bytes (long primitive, may overflow) |
512+
| `RemainingCapacity` | Current remaining capacity in bytes (long primitive, may overflow) |
513+
| `TotalCapacityBigInt` | Current raw capacity of DataNodes in bytes (using BigInteger) |
514+
| `UsedCapacityBigInt` | Current used capacity across all DataNodes in bytes (using BigInteger) |
515+
| `RemainingCapacityBigInt` | Current remaining capacity in bytes (using BigInteger) |
513516
| `NumOfMissingBlocks` | Current number of missing blocks |
514517
| `NumLiveNodes` | Number of datanodes which are currently live |
515518
| `NumDeadNodes` | Number of datanodes which are currently dead |

hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationMBean.java

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,8 @@
1717
*/
1818
package org.apache.hadoop.hdfs.server.federation.metrics;
1919

20+
import java.math.BigInteger;
21+
2022
import org.apache.hadoop.classification.InterfaceAudience;
2123
import org.apache.hadoop.classification.InterfaceStability;
2224

@@ -54,22 +56,46 @@ public interface FederationMBean {
5456

5557
/**
5658
* Get the total capacity of the federated cluster.
59+
* The number could overflow if too big. In that case use
60+
* {@link #getTotalCapacityBigInt()} instead.
5761
* @return Total capacity of the federated cluster.
5862
*/
5963
long getTotalCapacity();
6064

6165
/**
6266
* Get the used capacity of the federated cluster.
67+
* The number could overflow if too big. In that case use
68+
* {@link #getUsedCapacityBigInt()} instead.
6369
* @return Used capacity of the federated cluster.
6470
*/
6571
long getUsedCapacity();
6672

6773
/**
6874
* Get the remaining capacity of the federated cluster.
75+
* The number could overflow if too big. In that case use
76+
* {@link #getRemainingCapacityBigInt()} instead.
6977
* @return Remaining capacity of the federated cluster.
7078
*/
7179
long getRemainingCapacity();
7280

81+
/**
82+
* Get the total capacity (big integer) of the federated cluster.
83+
* @return Total capacity of the federated cluster.
84+
*/
85+
BigInteger getTotalCapacityBigInt();
86+
87+
/**
88+
* Get the used capacity (big integer) of the federated cluster.
89+
* @return Used capacity of the federated cluster.
90+
*/
91+
BigInteger getUsedCapacityBigInt();
92+
93+
/**
94+
* Get the remaining capacity (big integer) of the federated cluster.
95+
* @return Remaining capacity of the federated cluster.
96+
*/
97+
BigInteger getRemainingCapacityBigInt();
98+
7399
/**
74100
* Get the total remote storage capacity mounted in the federated cluster.
75101
* @return Remote capacity of the federated cluster.

hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/RBFMetrics.java

Lines changed: 34 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121

2222
import java.io.IOException;
2323
import java.lang.reflect.Method;
24+
import java.math.BigInteger;
2425
import java.net.InetAddress;
2526
import java.net.InetSocketAddress;
2627
import java.net.UnknownHostException;
@@ -377,14 +378,29 @@ public long getRemainingCapacity() {
377378
return getNameserviceAggregatedLong(MembershipStats::getAvailableSpace);
378379
}
379380

381+
@Override
382+
public long getUsedCapacity() {
383+
return getTotalCapacity() - getRemainingCapacity();
384+
}
385+
386+
@Override
387+
public BigInteger getTotalCapacityBigInt() {
388+
return getNameserviceAggregatedBigInt(MembershipStats::getTotalSpace);
389+
}
390+
391+
@Override
392+
public BigInteger getRemainingCapacityBigInt() {
393+
return getNameserviceAggregatedBigInt(MembershipStats::getAvailableSpace);
394+
}
395+
380396
@Override
381397
public long getProvidedSpace() {
382398
return getNameserviceAggregatedLong(MembershipStats::getProvidedSpace);
383399
}
384400

385401
@Override
386-
public long getUsedCapacity() {
387-
return getTotalCapacity() - getRemainingCapacity();
402+
public BigInteger getUsedCapacityBigInt() {
403+
return getTotalCapacityBigInt().subtract(getRemainingCapacityBigInt());
388404
}
389405

390406
@Override
@@ -730,6 +746,22 @@ private long getNameserviceAggregatedLong(ToLongFunction<MembershipStats> f) {
730746
}
731747
}
732748

749+
private BigInteger getNameserviceAggregatedBigInt(
750+
ToLongFunction<MembershipStats> f) {
751+
try {
752+
List<MembershipState> states = getActiveNamenodeRegistrations();
753+
BigInteger sum = BigInteger.valueOf(0);
754+
for (MembershipState state : states) {
755+
long lvalue = f.applyAsLong(state.getStats());
756+
sum = sum.add(BigInteger.valueOf(lvalue));
757+
}
758+
return sum;
759+
} catch (IOException e) {
760+
LOG.error("Unable to extract metrics: {}", e.getMessage());
761+
return new BigInteger("0");
762+
}
763+
}
764+
733765
/**
734766
* Fetches the most active namenode memberships for all known nameservices.
735767
* The fetched membership may not or may not be active. Excludes expired

hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/webapps/router/federationhealth.html

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -113,9 +113,9 @@
113113
<p>Non Heap Memory used {used|fmt_bytes} of {committed|fmt_bytes} Commited Non Heap Memory. Max Non Heap Memory is {@eq key=max value="-1" type="number"}&ltunbounded&gt{:else}{max|fmt_bytes}{/eq}.</p>
114114
{/mem.NonHeapMemoryUsage}
115115
<table class="table table-bordered table-striped">
116-
<tr><th>Total capacity</th><td>{TotalCapacity|fmt_bytes}</td></tr>
117-
<tr><th>Used capacity</th><td>{UsedCapacity|fmt_bytes}</td></tr>
118-
<tr><th>Remaining capacity</th><td>{RemainingCapacity|fmt_bytes}</td></tr>
116+
<tr><th>Total capacity</th><td>{TotalCapacityBigInt|fmt_bytes}</td></tr>
117+
<tr><th>Used capacity</th><td>{UsedCapacityBigInt|fmt_bytes}</td></tr>
118+
<tr><th>Remaining capacity</th><td>{RemainingCapacityBigInt|fmt_bytes}</td></tr>
119119
<tr><th>Nameservices</th><td>{NumNameservices}</td></tr>
120120
<tr><th>Namenodes</th><td>{NumNamenodes}</td></tr>
121121
<tr>

hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestMetricsBase.java

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -259,4 +259,15 @@ private MembershipState createRegistration(String ns, String nn,
259259
assertTrue(response.getResult());
260260
return record;
261261
}
262+
263+
// refresh namenode registration for new attributes
264+
public boolean refreshNamenodeRegistration(NamenodeHeartbeatRequest request)
265+
throws IOException {
266+
boolean result = membershipStore.namenodeHeartbeat(request).getResult();
267+
membershipStore.loadCache(true);
268+
MembershipNamenodeResolver resolver =
269+
(MembershipNamenodeResolver) router.getNamenodeResolver();
270+
resolver.loadCache(true);
271+
return result;
272+
}
262273
}

hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/metrics/TestRBFMetrics.java

Lines changed: 35 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,18 +19,21 @@
1919

2020
import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.getBean;
2121
import static org.junit.Assert.assertEquals;
22+
import static org.junit.Assert.assertFalse;
23+
import static org.junit.Assert.assertNotEquals;
2224
import static org.junit.Assert.assertNotNull;
2325
import static org.junit.Assert.assertTrue;
24-
import static org.junit.Assert.assertFalse;
2526

2627
import java.io.IOException;
28+
import java.math.BigInteger;
2729
import java.util.Iterator;
2830
import java.util.List;
2931

3032
import javax.management.MalformedObjectNameException;
3133

3234
import org.apache.commons.collections.ListUtils;
3335
import org.apache.hadoop.hdfs.server.federation.router.Router;
36+
import org.apache.hadoop.hdfs.server.federation.store.protocol.NamenodeHeartbeatRequest;
3437
import org.apache.hadoop.hdfs.server.federation.store.records.MembershipState;
3538
import org.apache.hadoop.hdfs.server.federation.store.records.MembershipStats;
3639
import org.apache.hadoop.hdfs.server.federation.store.records.MountTable;
@@ -58,6 +61,7 @@ public void testClusterStatsJMX()
5861
FederationMBean federationBean = getBean(FEDERATION_BEAN,
5962
FederationMBean.class);
6063
validateClusterStatsFederationBean(federationBean);
64+
testCapacity(federationBean);
6165
RouterMBean routerBean = getBean(ROUTER_BEAN, RouterMBean.class);
6266
validateClusterStatsRouterBean(routerBean);
6367
}
@@ -326,4 +330,34 @@ private void validateClusterStatsRouterBean(RouterMBean bean) {
326330
assertTrue(bean.getHostAndPort().length() > 0);
327331
assertFalse(bean.isSecurityEnabled());
328332
}
333+
334+
private void testCapacity(FederationMBean bean) throws IOException {
335+
List<MembershipState> memberships = getActiveMemberships();
336+
assertTrue(memberships.size() > 1);
337+
338+
BigInteger availableCapacity = BigInteger.valueOf(0);
339+
BigInteger totalCapacity = BigInteger.valueOf(0);
340+
BigInteger unitCapacity = BigInteger.valueOf(Long.MAX_VALUE);
341+
for (MembershipState mock : memberships) {
342+
MembershipStats stats = mock.getStats();
343+
stats.setTotalSpace(Long.MAX_VALUE);
344+
stats.setAvailableSpace(Long.MAX_VALUE);
345+
// reset stats to make the new value persistent
346+
mock.setStats(stats);
347+
// write back the new namenode information to state store
348+
assertTrue(refreshNamenodeRegistration(
349+
NamenodeHeartbeatRequest.newInstance(mock)));
350+
totalCapacity = totalCapacity.add(unitCapacity);
351+
availableCapacity = availableCapacity.add(unitCapacity);
352+
}
353+
354+
// for local cache update
355+
assertEquals(totalCapacity, bean.getTotalCapacityBigInt());
356+
// not equal since overflow happened.
357+
assertNotEquals(totalCapacity, BigInteger.valueOf(bean.getTotalCapacity()));
358+
assertEquals(availableCapacity, bean.getRemainingCapacityBigInt());
359+
// not equal since overflow happened.
360+
assertNotEquals(availableCapacity,
361+
BigInteger.valueOf(bean.getRemainingCapacity()));
362+
}
329363
}

0 commit comments

Comments (0)