From b8c6ba60463cf835ec20f99244a827a3d3056165 Mon Sep 17 00:00:00 2001
From: Takanobu Asanuma
Date: Thu, 14 Apr 2022 11:23:38 +0900
Subject: [PATCH] HDFS-16479. EC: NameNode should not send a reconstruction
 work when the source datanodes are insufficient (#4138)

(cherry picked from commit 2efab92959ca0a68c52ed6a9c721704e57afbcc7)
---
 .../server/blockmanagement/BlockManager.java  | 10 ++
 .../blockmanagement/TestBlockManager.java     | 96 +++++++++++++++++++
 2 files changed, 106 insertions(+)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index 25a47cd433ea5..9546be16d754d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -2060,6 +2060,16 @@ BlockReconstructionWork scheduleReconstruction(BlockInfo block,
       return null;
     }
 
+    // skip if there are not enough source datanodes for reconstructing the EC block
+    if (block.isStriped()) {
+      BlockInfoStriped stripedBlock = (BlockInfoStriped) block;
+      if (stripedBlock.getRealDataBlockNum() > srcNodes.length) {
+        LOG.debug("Block {} cannot be reconstructed due to shortage of source datanodes", block);
+        NameNode.getNameNodeMetrics().incNumTimesReReplicationNotScheduled();
+        return null;
+      }
+    }
+
     // liveReplicaNodes can include READ_ONLY_SHARED replicas which are
     // not included in the numReplicas.liveReplicas() count
     assert liveReplicaNodes.size() >= numReplicas.liveReplicas();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
index d42d2d98e7293..65e6c356620cd 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
@@ -842,6 +842,102 @@ public void testChooseSrcDNWithDupECInDecommissioningNode() throws Exception {
         0, numReplicas.redundantInternalBlocks());
   }
 
+  @Test
+  public void testSkipReconstructionWithManyBusyNodes() {
+    long blockId = -9223372036854775776L; // real EC block id
+    // RS-3-2 EC policy
+    ErasureCodingPolicy ecPolicy =
+        SystemErasureCodingPolicies.getPolicies().get(1);
+
+    // create an EC block group: 3 data blocks + 2 parity blocks
+    Block aBlockGroup = new Block(blockId, ecPolicy.getCellSize() * ecPolicy.getNumDataUnits(), 0);
+    BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlockGroup, ecPolicy);
+
+    // create 4 storageInfos, which means 1 block is missing
+    DatanodeStorageInfo ds1 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage1", "1.1.1.1", "rack1", "host1");
+    DatanodeStorageInfo ds2 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage2", "2.2.2.2", "rack2", "host2");
+    DatanodeStorageInfo ds3 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage3", "3.3.3.3", "rack3", "host3");
+    DatanodeStorageInfo ds4 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage4", "4.4.4.4", "rack4", "host4");
+
+    // link the block with the storages
+    aBlockInfoStriped.addStorage(ds1, aBlockGroup);
+    aBlockInfoStriped.addStorage(ds2, new Block(blockId + 1, 0, 0));
+    aBlockInfoStriped.addStorage(ds3, new Block(blockId + 2, 0, 0));
+    aBlockInfoStriped.addStorage(ds4, new Block(blockId + 3, 0, 0));
+
+    addEcBlockToBM(blockId, ecPolicy);
+    aBlockInfoStriped.setBlockCollectionId(mockINodeId);
+
+    // reconstruction should be scheduled
+    BlockReconstructionWork work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNotNull(work);
+
+    // simulate that 2 nodes reach maxReplicationStreams
+    for (int i = 0; i < bm.maxReplicationStreams; i++) {
+      ds3.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
+      ds4.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
+    }
+
+    // reconstruction should be skipped since the number of non-busy nodes is not enough
+    work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNull(work);
+  }
+
+  @Test
+  public void testSkipReconstructionWithManyBusyNodes2() {
+    long blockId = -9223372036854775776L; // real EC block id
+    // RS-3-2 EC policy
+    ErasureCodingPolicy ecPolicy =
+        SystemErasureCodingPolicies.getPolicies().get(1);
+
+    // create an EC block group: 2 data blocks + 2 parity blocks
+    Block aBlockGroup = new Block(blockId,
+        ecPolicy.getCellSize() * (ecPolicy.getNumDataUnits() - 1), 0);
+    BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlockGroup, ecPolicy);
+
+    // create 3 storageInfos, which means 1 block is missing
+    DatanodeStorageInfo ds1 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage1", "1.1.1.1", "rack1", "host1");
+    DatanodeStorageInfo ds2 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage2", "2.2.2.2", "rack2", "host2");
+    DatanodeStorageInfo ds3 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage3", "3.3.3.3", "rack3", "host3");
+
+    // link the block with the storages
+    aBlockInfoStriped.addStorage(ds1, aBlockGroup);
+    aBlockInfoStriped.addStorage(ds2, new Block(blockId + 1, 0, 0));
+    aBlockInfoStriped.addStorage(ds3, new Block(blockId + 2, 0, 0));
+
+    addEcBlockToBM(blockId, ecPolicy);
+    aBlockInfoStriped.setBlockCollectionId(mockINodeId);
+
+    // reconstruction should be scheduled
+    BlockReconstructionWork work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNotNull(work);
+
+    // simulate that 1 node reaches maxReplicationStreams
+    for (int i = 0; i < bm.maxReplicationStreams; i++) {
+      ds2.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
+    }
+
+    // reconstruction should still be scheduled since 2 source nodes remain for the 2 data blocks
+    work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNotNull(work);
+
+    // simulate that 1 more node reaches maxReplicationStreams
+    for (int i = 0; i < bm.maxReplicationStreams; i++) {
+      ds3.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
+    }
+
+    // reconstruction should be skipped since the number of non-busy nodes is not enough
+    work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNull(work);
+  }
+
   @Test
   public void testFavorDecomUntilHardLimit() throws Exception {
     bm.maxReplicationStreams = 0;