Skip to content

HDFS-15810. RBF: RBFMetrics's TotalCapacity out of bounds #2910

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -527,9 +527,12 @@ RBFMetrics shows the metrics which are the aggregated values of sub-clusters' in
| `NumInMaintenanceLiveDataNodes` | Number of live Datanodes which are in maintenance state |
| `NumInMaintenanceDeadDataNodes` | Number of dead Datanodes which are in maintenance state |
| `NumEnteringMaintenanceDataNodes` | Number of Datanodes that are entering the maintenance state |
| `TotalCapacity` | Current raw capacity of DataNodes in bytes |
| `UsedCapacity` | Current used capacity across all DataNodes in bytes |
| `RemainingCapacity` | Current remaining capacity in bytes |
| `TotalCapacity` | Current raw capacity of DataNodes in bytes (long primitive, may overflow) |
| `UsedCapacity` | Current used capacity across all DataNodes in bytes (long primitive, may overflow) |
| `RemainingCapacity` | Current remaining capacity in bytes (long primitive, may overflow) |
| `TotalCapacityBigInt` | Current raw capacity of DataNodes in bytes (using BigInteger) |
| `UsedCapacityBigInt` | Current used capacity across all DataNodes in bytes (using BigInteger) |
| `RemainingCapacityBigInt` | Current remaining capacity in bytes (using BigInteger) |
| `NumOfMissingBlocks` | Current number of missing blocks |
| `NumLiveNodes` | Number of datanodes which are currently live |
| `NumDeadNodes` | Number of datanodes which are currently dead |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
*/
package org.apache.hadoop.hdfs.server.federation.metrics;

import java.math.BigInteger;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;

Expand Down Expand Up @@ -54,22 +56,46 @@ public interface FederationMBean {

/**
 * Get the total capacity of the federated cluster as a {@code long}.
 * The number could overflow if too big. In that case use
 * {@link #getTotalCapacityBigInt()} instead.
 * @return Total capacity of the federated cluster; may have wrapped around
 *         if the real value does not fit in a {@code long}.
 */
long getTotalCapacity();

/**
 * Get the used capacity of the federated cluster as a {@code long}.
 * The number could overflow if too big. In that case use
 * {@link #getUsedCapacityBigInt()} instead.
 * @return Used capacity of the federated cluster; may have wrapped around
 *         if the real value does not fit in a {@code long}.
 */
long getUsedCapacity();

/**
 * Get the remaining capacity of the federated cluster as a {@code long}.
 * The number could overflow if too big. In that case use
 * {@link #getRemainingCapacityBigInt()} instead.
 * @return Remaining capacity of the federated cluster; may have wrapped
 *         around if the real value does not fit in a {@code long}.
 */
long getRemainingCapacity();

/**
 * Get the total capacity of the federated cluster as a {@link BigInteger}.
 * Counterpart of {@link #getTotalCapacity()} whose return type is not
 * bounded by the range of a {@code long}.
 * @return Total capacity of the federated cluster.
 */
BigInteger getTotalCapacityBigInt();

/**
 * Get the used capacity of the federated cluster as a {@link BigInteger}.
 * Counterpart of {@link #getUsedCapacity()} whose return type is not
 * bounded by the range of a {@code long}.
 * @return Used capacity of the federated cluster.
 */
BigInteger getUsedCapacityBigInt();

/**
 * Get the remaining capacity of the federated cluster as a
 * {@link BigInteger}. Counterpart of {@link #getRemainingCapacity()} whose
 * return type is not bounded by the range of a {@code long}.
 * @return Remaining capacity of the federated cluster.
 */
BigInteger getRemainingCapacityBigInt();

/**
* Get the total remote storage capacity mounted in the federated cluster.
* @return Remote capacity of the federated cluster.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import java.io.IOException;
import java.lang.reflect.Method;
import java.math.BigInteger;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
Expand Down Expand Up @@ -380,14 +381,29 @@ public long getRemainingCapacity() {
return getNameserviceAggregatedLong(MembershipStats::getAvailableSpace);
}

@Override
public long getUsedCapacity() {
  // Used space is derived, not aggregated: total minus remaining.
  // Plain long arithmetic, so the result is subject to the same
  // wrap-around as the two values it is computed from.
  final long total = getTotalCapacity();
  final long remaining = getRemainingCapacity();
  return total - remaining;
}

@Override
public BigInteger getTotalCapacityBigInt() {
  // Aggregate the per-nameservice total space into a BigInteger sum.
  return getNameserviceAggregatedBigInt(
      stats -> stats.getTotalSpace());
}

@Override
public BigInteger getRemainingCapacityBigInt() {
  // Aggregate the per-nameservice available space into a BigInteger sum.
  return getNameserviceAggregatedBigInt(
      stats -> stats.getAvailableSpace());
}

@Override
public long getProvidedSpace() {
  // Aggregate the per-nameservice provided space as a long sum.
  return getNameserviceAggregatedLong(
      stats -> stats.getProvidedSpace());
}

@Override
public long getUsedCapacity() {
return getTotalCapacity() - getRemainingCapacity();
public BigInteger getUsedCapacityBigInt() {
return getTotalCapacityBigInt().subtract(getRemainingCapacityBigInt());
}

@Override
Expand Down Expand Up @@ -783,6 +799,22 @@ private long getNameserviceAggregatedLong(ToLongFunction<MembershipStats> f) {
}
}

/**
 * Sum a long-valued statistic over the active namenode registrations,
 * accumulating in a {@link BigInteger} so the total cannot overflow.
 *
 * @param f Extracts the long value to aggregate from a membership's stats.
 * @return Sum of the extracted values; {@link BigInteger#ZERO} if the
 *         registrations could not be fetched.
 */
private BigInteger getNameserviceAggregatedBigInt(
    ToLongFunction<MembershipStats> f) {
  try {
    BigInteger sum = BigInteger.ZERO;
    for (MembershipState state : getActiveNamenodeRegistrations()) {
      // Each individual value still fits in a long; only the sum may not.
      sum = sum.add(BigInteger.valueOf(f.applyAsLong(state.getStats())));
    }
    return sum;
  } catch (IOException e) {
    LOG.error("Unable to extract metrics: {}", e.getMessage());
    // Use the shared constant instead of parsing "0" on every failure.
    return BigInteger.ZERO;
  }
}

/**
* Fetches the most active namenode memberships for all known nameservices.
* The fetched membership may or may not be active. Excludes expired
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,9 @@
<p>Non Heap Memory used {used|fmt_bytes} of {committed|fmt_bytes} Commited Non Heap Memory. Max Non Heap Memory is {@eq key=max value="-1" type="number"}&ltunbounded&gt{:else}{max|fmt_bytes}{/eq}.</p>
{/mem.NonHeapMemoryUsage}
<table class="table table-bordered table-striped">
<tr><th>Total capacity</th><td>{TotalCapacity|fmt_bytes}</td></tr>
<tr><th>Used capacity</th><td>{UsedCapacity|fmt_bytes}</td></tr>
<tr><th>Remaining capacity</th><td>{RemainingCapacity|fmt_bytes}</td></tr>
<tr><th>Total capacity</th><td>{TotalCapacityBigInt|fmt_bytes}</td></tr>
<tr><th>Used capacity</th><td>{UsedCapacityBigInt|fmt_bytes}</td></tr>
<tr><th>Remaining capacity</th><td>{RemainingCapacityBigInt|fmt_bytes}</td></tr>
<tr><th>Nameservices</th><td>{NumNameservices}</td></tr>
<tr><th>Namenodes</th><td>{NumNamenodes}</td></tr>
<tr>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,4 +259,15 @@ private MembershipState createRegistration(String ns, String nn,
assertTrue(response.getResult());
return record;
}

/**
 * Refresh a namenode registration so that new attributes become visible.
 * Sends the heartbeat to the membership store, then reloads the membership
 * store cache and the router's namenode resolver cache.
 *
 * @param request Heartbeat carrying the namenode registration to write.
 * @return Result reported by the membership store for the heartbeat.
 * @throws IOException If the heartbeat or a cache reload fails.
 */
public boolean refreshNamenodeRegistration(NamenodeHeartbeatRequest request)
    throws IOException {
  final boolean accepted =
      membershipStore.namenodeHeartbeat(request).getResult();
  // Reload the store cache first, then the resolver cache.
  membershipStore.loadCache(true);
  ((MembershipNamenodeResolver) router.getNamenodeResolver())
      .loadCache(true);
  return accepted;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,21 @@

import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.getBean;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertFalse;

import java.io.IOException;
import java.math.BigInteger;
import java.util.Iterator;
import java.util.List;

import javax.management.MalformedObjectNameException;

import org.apache.commons.collections.ListUtils;
import org.apache.hadoop.hdfs.server.federation.router.Router;
import org.apache.hadoop.hdfs.server.federation.store.protocol.NamenodeHeartbeatRequest;
import org.apache.hadoop.hdfs.server.federation.store.records.MembershipState;
import org.apache.hadoop.hdfs.server.federation.store.records.MembershipStats;
import org.apache.hadoop.hdfs.server.federation.store.records.MountTable;
Expand Down Expand Up @@ -58,6 +61,7 @@ public void testClusterStatsJMX()
FederationMBean federationBean = getBean(FEDERATION_BEAN,
FederationMBean.class);
validateClusterStatsFederationBean(federationBean);
testCapacity(federationBean);
RouterMBean routerBean = getBean(ROUTER_BEAN, RouterMBean.class);
validateClusterStatsRouterBean(routerBean);
}
Expand Down Expand Up @@ -348,4 +352,34 @@ private void validateClusterStatsRouterBean(RouterMBean bean) {
assertTrue(bean.getHostAndPort().length() > 0);
assertFalse(bean.isSecurityEnabled());
}

/**
 * Check the capacity metrics when the aggregated values exceed the range of
 * a long: every active membership is rewritten to report Long.MAX_VALUE for
 * both total and available space, then the BigInteger getters are expected
 * to match the exact sums while the long getters are expected not to.
 *
 * @param bean Federation bean whose capacity metrics are verified.
 * @throws IOException If the memberships cannot be read or refreshed.
 */
private void testCapacity(FederationMBean bean) throws IOException {
  List<MembershipState> activeMemberships = getActiveMemberships();
  // At least two memberships are required for the long sum to overflow.
  assertTrue(activeMemberships.size() > 1);

  final BigInteger perNamenode = BigInteger.valueOf(Long.MAX_VALUE);
  BigInteger expectedTotal = BigInteger.ZERO;
  BigInteger expectedAvailable = BigInteger.ZERO;
  for (MembershipState membership : activeMemberships) {
    MembershipStats stats = membership.getStats();
    stats.setTotalSpace(Long.MAX_VALUE);
    stats.setAvailableSpace(Long.MAX_VALUE);
    // reset stats to make the new value persistent
    membership.setStats(stats);
    // write back the new namenode information to state store,
    // which also updates the local caches
    NamenodeHeartbeatRequest heartbeat =
        NamenodeHeartbeatRequest.newInstance(membership);
    assertTrue(refreshNamenodeRegistration(heartbeat));
    expectedTotal = expectedTotal.add(perNamenode);
    expectedAvailable = expectedAvailable.add(perNamenode);
  }

  // the BigInteger metric carries the exact sum
  assertEquals(expectedTotal, bean.getTotalCapacityBigInt());
  // not equal since overflow happened.
  assertNotEquals(expectedTotal,
      BigInteger.valueOf(bean.getTotalCapacity()));
  assertEquals(expectedAvailable, bean.getRemainingCapacityBigInt());
  // not equal since overflow happened.
  assertNotEquals(expectedAvailable,
      BigInteger.valueOf(bean.getRemainingCapacity()));
}
}