Skip to content

HDDS-2199 In SCMNodeManager dnsToUuidMap cannot track multiple DNs on the same host #1551

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
Original file line number Diff line number Diff line change
Expand Up @@ -192,11 +192,11 @@ void processNodeReport(DatanodeDetails datanodeDetails,
DatanodeDetails getNodeByUuid(String uuid);

/**
* Given datanode address(Ipaddress or hostname), returns the DatanodeDetails
* for the node.
* Given datanode address(Ipaddress or hostname), returns a list of
* DatanodeDetails for the datanodes running at that address.
*
* @param address datanode address
* @return the given datanode, or null if not found
* @return the given datanode, or empty list if none found
*/
DatanodeDetails getNodeByAddress(String address);
List<DatanodeDetails> getNodesByAddress(String address);
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.LinkedList;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ScheduledFuture;
import java.util.stream.Collectors;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState;
Expand Down Expand Up @@ -98,7 +100,7 @@ public class SCMNodeManager implements NodeManager {
private final NetworkTopology clusterMap;
private final DNSToSwitchMapping dnsToSwitchMapping;
private final boolean useHostname;
private final ConcurrentHashMap<String, String> dnsToUuidMap =
private final ConcurrentHashMap<String, Set<String>> dnsToUuidMap =
new ConcurrentHashMap<>();

/**
Expand Down Expand Up @@ -260,7 +262,7 @@ public RegisteredCommand register(
}
nodeStateManager.addNode(datanodeDetails);
clusterMap.add(datanodeDetails);
dnsToUuidMap.put(dnsName, datanodeDetails.getUuidString());
addEntryTodnsToUuidMap(dnsName, datanodeDetails.getUuidString());
// Updating Node Report, as registration is successful
processNodeReport(datanodeDetails, nodeReport);
LOG.info("Registered Data node : {}", datanodeDetails);
Expand All @@ -275,6 +277,26 @@ public RegisteredCommand register(
.build();
}

/**
* Add an entry to the dnsToUuidMap, which maps hostname / IP to the DNs
* running on that host. As each address can have many DNs running on it,
* this is a one to many mapping.
* @param dnsName String representing the hostname or IP of the node
* @param uuid String representing the UUID of the registered node.
*/
@SuppressFBWarnings(value="AT_OPERATION_SEQUENCE_ON_CONCURRENT_ABSTRACTION",
justification="The method is synchronized and this is the only place "+
"dnsToUuidMap is modified")
private synchronized void addEntryTodnsToUuidMap(
String dnsName, String uuid) {
Set<String> dnList = dnsToUuidMap.get(dnsName);
if (dnList == null) {
dnList = ConcurrentHashMap.newKeySet();
dnsToUuidMap.put(dnsName, dnList);
}
dnList.add(uuid);
}

/**
* Send heartbeat to indicate the datanode is alive and doing well.
*
Expand Down Expand Up @@ -584,29 +606,34 @@ public DatanodeDetails getNodeByUuid(String uuid) {
}

/**
* Given datanode address(Ipaddress or hostname), returns the DatanodeDetails
* for the node.
* Given datanode address(Ipaddress or hostname), return a list of
* DatanodeDetails for the datanodes registered on that address.
*
* @param address datanode address
* @return the given datanode, or null if not found
* @return the given datanode, or empty list if none found
*/
@Override
public DatanodeDetails getNodeByAddress(String address) {
public List<DatanodeDetails> getNodesByAddress(String address) {
List<DatanodeDetails> results = new LinkedList<>();
if (Strings.isNullOrEmpty(address)) {
LOG.warn("address is null");
return null;
return results;
}
String uuid = dnsToUuidMap.get(address);
if (uuid != null) {
Set<String> uuids = dnsToUuidMap.get(address);
if (uuids == null) {
LOG.warn("Cannot find node for address {}", address);
return results;
}

for (String uuid : uuids) {
DatanodeDetails temp = DatanodeDetails.newBuilder().setUuid(uuid).build();
try {
return nodeStateManager.getNode(temp);
results.add(nodeStateManager.getNode(temp));
} catch (NodeNotFoundException e) {
LOG.warn("Cannot find node for uuid {}", uuid);
}
}
LOG.warn("Cannot find node for address {}", address);
return null;
return results;
}

private String nodeResolve(String hostname) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,12 @@ public List<DatanodeDetails> sortDatanodes(List<String> nodes,
boolean auditSuccess = true;
try{
NodeManager nodeManager = scm.getScmNodeManager();
Node client = nodeManager.getNodeByAddress(clientMachine);
Copy link
Member

@elek elek Oct 2, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am trying to understand why this big block is not just as simple:

      Node client = null;
      List<DatanodeDetails> possibleClients =
          nodeManager.getNodesByAddress(clientMachine);
      if (possibleClients.size()>0){
        client = possibleClients.get(0);
      }

It seems to be a logic to find a datanode which is on the same host as the client. I am not sure if we need this tricky randomization (or choosing the first possible datanodes): if client is null, we don't need sort (handled by the sort method below), if there are multiple datanodes on the same client we can choose the first one as in the topology sort it doesn't matter which one is chosen.

But please fix me if I am wrong.

Copy link
Contributor Author

@sodonnel sodonnel Oct 2, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not certain how this sortDatanodes() call is used. Is it on the read path or write path? I was assuming it was on the read path, but write path may be different if all the cluster DNs are passed into the method - then you would always get a match.

A list of DNs (UUIDs) are passed into the method, and then we retrieve a list of DatanodeDetails running on the client machine. The client machine can then be set to one of those DatanodeDetails, but it is not guaranteed that the first in the list will match on of the UUIDs passed into the method.

Eg this is passed in:

DN0, DN5, DN10, DN15

On the client machine is:

DN1, DN6, DN10 and DN16

So only DN10 is a match with one that is passed it. If we just picked the first one (DN1) it would look like there is no DN on the client machine and then when the list and client machine are passed into sortByDistanceCost() at line 355, it would not give the expected result.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at where sortDatanodes() is used, it seems to be from the OM when performing lookup file or lookup key. So that suggests it is only used in the read path, and hence at most 3 DNs should be passed in along with one client address.

The code could be simplified a little, but I think we do need to filter the list of returned nodes down to only the nodes it cares about due to what I said in the comment above.

However, thinking about this some more, I think we can avoid the random selection. In the case where there is only 1 DN per host, the DN matching the client would always be sorted first, so we don't really need to randomize the first node returned if all nodes are on the same host. I will refactor this and see how it looks then.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Eg this is passed in:

DN0, DN5, DN10, DN15

On the client machine is:

DN1, DN6, DN10 and DN16

Not sure If I understand well. I think there are two parameters

  1. The host name of the client (eg. DN10)
  2. List of datanode UUIDs (DN1, DN6, DN10)

The task is to sort the list of datanode UUIDs, with prioritizing the client machine.

First the client hostname is converted to UUID. This is the part which is replaced by this complex block. If it could not been converted it can be null. After that the topology logic sorts the list. sortByDistanceCost.

IMHO this functionality can be implemented my 4 lines of code, but fix me If I am wrong.

Copy link
Contributor Author

@sodonnel sodonnel Oct 2, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When there are multiple DNs on a given host (host1), you pass in the client as host1.

We do a lookup for host1 and we get DN UUIDs (actually DatanodeDetails objects) DN1, DN6, DN10 and DN16 which are on this host.

The pipeline passed in is DN4, DN5 and DN10. Ie only one of these hosts is on the client machine, but the client machine has other nodes not involved in this pipeline.

If it was just the hostname/IP used for later comparison your logic would be fine. However ...

The matched client DatanodeDetails object is passed to sortByDistanceCost() later in the same method, which calls this for each pipeline node:

NetworkTopologyImpl.getDistanceCost(reader, nodes.get(i)); 

Where reader is the client Node object we found.

In get distance by cost, the first few lines do this:

if ((node1 != null && node2 != null && node1.equals(node2)) ||
        (node1 == null && node2 == null))  {
      return 0;
    }

Ie, it both parameters are non-null and are the same object returns a distance of zero, otherwise it goes through more logic to calculate a distance.

Going back to the example above - your logic would return DN1 which would not give a zero distance cost in getDistanceByCost() when compared with any of the pipeline nodes.

My more complex logic would return DN10 which would return a zero cost when compared with pipeline node DN10, as they are the same object.

So after a rather long example the summary is that if the cost calculation was based on hostname your logic would be fine, but as it compares the actual node objects I think we need the more complex logic, unfortunately! I have not followed the rest of the logic in getDistanceByCost to see if a cost of zero would fall out in the end, but I suspect it will be some small non-zero value as both nodes will be at the same level.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reflecting on this issue some more, I think the simplified logic you have suggested is better and the problem is better solved in getDistanceByCost - rather than comparing just the node objects are the same, we should test if they are the same hostname and if so treat that as a zero distance match too.

Unfortunately, as that method takes Node objects rather than DatanodeDetails, this is not trivial to do.

The code path under question here is only relevant for clusters with more than one datanode on the same host, and by definition that is a non-production setup. The only consequence of the change you have suggested over my original code, is that the client may get the wrong 'cost to reach a datanode' sometimes on test clusters - nothing will fail, so the impact of this issue is very low.

Therefore if you are happy, I think we should commit the latest version (which has your simplified logic) and create a followup Jira to look into fixing getDistanceByCost somehow.

Node client = null;
List<DatanodeDetails> possibleClients =
nodeManager.getNodesByAddress(clientMachine);
if (possibleClients.size()>0){
client = possibleClients.get(0);
}
List<Node> nodeList = new ArrayList();
nodes.stream().forEach(uuid -> {
DatanodeDetails node = nodeManager.getNodeByUuid(uuid);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ public class MockNodeManager implements NodeManager {
private final Node2PipelineMap node2PipelineMap;
private final Node2ContainerMap node2ContainerMap;
private NetworkTopology clusterMap;
private ConcurrentHashMap<String, String> dnsToUuidMap;
private ConcurrentHashMap<String, Set<String>> dnsToUuidMap;

public MockNodeManager(boolean initializeFakeNodes, int nodeCount) {
this.healthyNodes = new LinkedList<>();
Expand Down Expand Up @@ -386,7 +386,7 @@ public RegisteredCommand register(DatanodeDetails datanodeDetails,
try {
node2ContainerMap.insertNewDatanode(datanodeDetails.getUuid(),
Collections.emptySet());
dnsToUuidMap.put(datanodeDetails.getIpAddress(),
addEntryTodnsToUuidMap(datanodeDetails.getIpAddress(),
datanodeDetails.getUuidString());
if (clusterMap != null) {
datanodeDetails.setNetworkName(datanodeDetails.getUuidString());
Expand All @@ -398,6 +398,23 @@ public RegisteredCommand register(DatanodeDetails datanodeDetails,
return null;
}

/**
* Add an entry to the dnsToUuidMap, which maps hostname / IP to the DNs
* running on that host. As each address can have many DNs running on it,
* this is a one to many mapping.
* @param dnsName String representing the hostname or IP of the node
* @param uuid String representing the UUID of the registered node.
*/
private synchronized void addEntryTodnsToUuidMap(
String dnsName, String uuid) {
Set<String> dnList = dnsToUuidMap.get(dnsName);
if (dnList == null) {
dnList = ConcurrentHashMap.newKeySet();
dnsToUuidMap.put(dnsName, dnList);
}
dnList.add(uuid);
}

/**
* Send heartbeat to indicate the datanode is alive and doing well.
*
Expand Down Expand Up @@ -484,8 +501,19 @@ public DatanodeDetails getNodeByUuid(String uuid) {
}

@Override
public DatanodeDetails getNodeByAddress(String address) {
return getNodeByUuid(dnsToUuidMap.get(address));
public List<DatanodeDetails> getNodesByAddress(String address) {
List<DatanodeDetails> results = new LinkedList<>();
Set<String> uuids = dnsToUuidMap.get(address);
if (uuids == null) {
return results;
}
for(String uuid : uuids) {
DatanodeDetails dn = getNodeByUuid(uuid);
if (dn != null) {
results.add(dn);
}
}
return results;
}

public void setNetworkTopology(NetworkTopology topology) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1064,6 +1064,25 @@ public void testScmRegisterNodeWithHostname()
testScmRegisterNodeWithNetworkTopology(true);
}

/**
* Test getNodesByAddress when using IPs.
*
*/
@Test
public void testgetNodesByAddressWithIpAddress()
throws IOException, InterruptedException, AuthenticationException {
testGetNodesByAddress(false);
}

/**
* Test getNodesByAddress when using hostnames.
*/
@Test
public void testgetNodesByAddressWithHostname()
throws IOException, InterruptedException, AuthenticationException {
testGetNodesByAddress(true);
}

/**
* Test add node into a 4-layer network topology during node register.
*/
Expand Down Expand Up @@ -1152,11 +1171,55 @@ private void testScmRegisterNodeWithNetworkTopology(boolean useHostname)
// test get node
if (useHostname) {
Arrays.stream(hostNames).forEach(hostname ->
Assert.assertNotNull(nodeManager.getNodeByAddress(hostname)));
Assert.assertNotEquals(0, nodeManager.getNodesByAddress(hostname)
.size()));
} else {
Arrays.stream(ipAddress).forEach(ip ->
Assert.assertNotNull(nodeManager.getNodeByAddress(ip)));
Assert.assertNotEquals(0, nodeManager.getNodesByAddress(ip)
.size()));
}
}
}

/**
* Test add node into a 4-layer network topology during node register.
*/
private void testGetNodesByAddress(boolean useHostname)
throws IOException, InterruptedException, AuthenticationException {
OzoneConfiguration conf = getConf();
conf.setTimeDuration(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL, 1000,
MILLISECONDS);

// create a set of hosts - note two hosts on "host1"
String[] hostNames = {"host1", "host1", "host2", "host3", "host4"};
String[] ipAddress =
{"1.2.3.4", "1.2.3.4", "2.3.4.5", "3.4.5.6", "4.5.6.7"};

if (useHostname) {
conf.set(DFSConfigKeys.DFS_DATANODE_USE_DN_HOSTNAME, "true");
}
final int nodeCount = hostNames.length;
try (SCMNodeManager nodeManager = createNodeManager(conf)) {
DatanodeDetails[] nodes = new DatanodeDetails[nodeCount];
for (int i = 0; i < nodeCount; i++) {
DatanodeDetails node = TestUtils.createDatanodeDetails(
UUID.randomUUID().toString(), hostNames[i], ipAddress[i], null);
nodeManager.register(node, null, null);
}
// test get node
Assert.assertEquals(0, nodeManager.getNodesByAddress(null).size());
if (useHostname) {
Assert.assertEquals(2,
nodeManager.getNodesByAddress("host1").size());
Assert.assertEquals(1, nodeManager.getNodesByAddress("host2").size());
Assert.assertEquals(0, nodeManager.getNodesByAddress("unknown").size());
} else {
Assert.assertEquals(2,
nodeManager.getNodesByAddress("1.2.3.4").size());
Assert.assertEquals(1, nodeManager.getNodesByAddress("2.3.4.5").size());
Assert.assertEquals(0, nodeManager.getNodesByAddress("1.9.8.7").size());
}
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.LinkedList;

/**
* A Node Manager to test replication.
Expand Down Expand Up @@ -323,7 +324,7 @@ public DatanodeDetails getNodeByUuid(String address) {
}

@Override
public DatanodeDetails getNodeByAddress(String address) {
return null;
public List<DatanodeDetails> getNodesByAddress(String address) {
return new LinkedList<>();
}
}