From b8c6ba60463cf835ec20f99244a827a3d3056165 Mon Sep 17 00:00:00 2001
From: Takanobu Asanuma
Date: Thu, 14 Apr 2022 11:23:38 +0900
Subject: [PATCH] HDFS-16479. EC: NameNode should not send a reconstruction
 work when the source datanodes are insufficient (#4138)

(cherry picked from commit 2efab92959ca0a68c52ed6a9c721704e57afbcc7)
---
 .../server/blockmanagement/BlockManager.java  | 10 ++
 .../blockmanagement/TestBlockManager.java     | 96 +++++++++++++++++++
 2 files changed, 106 insertions(+)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index 25a47cd433ea5..9546be16d754d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -2060,6 +2060,16 @@ BlockReconstructionWork scheduleReconstruction(BlockInfo block,
       return null;
     }
 
+    // skip if there are not enough source datanodes for reconstructing the EC block
+    if (block.isStriped()) {
+      BlockInfoStriped stripedBlock = (BlockInfoStriped) block;
+      if (stripedBlock.getRealDataBlockNum() > srcNodes.length) {
+        LOG.debug("Block {} cannot be reconstructed due to shortage of source datanodes", block);
+        NameNode.getNameNodeMetrics().incNumTimesReReplicationNotScheduled();
+        return null;
+      }
+    }
+
     // liveReplicaNodes can include READ_ONLY_SHARED replicas which are
     // not included in the numReplicas.liveReplicas() count
     assert liveReplicaNodes.size() >= numReplicas.liveReplicas();
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
index d42d2d98e7293..65e6c356620cd 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java
@@ -842,6 +842,102 @@ public void testChooseSrcDNWithDupECInDecommissioningNode() throws Exception {
         0, numReplicas.redundantInternalBlocks());
   }
 
+  @Test
+  public void testSkipReconstructionWithManyBusyNodes() {
+    long blockId = -9223372036854775776L; // real EC block id
+    // RS-3-2 EC policy
+    ErasureCodingPolicy ecPolicy =
+        SystemErasureCodingPolicies.getPolicies().get(1);
+
+    // create an EC block group: 3 data blocks + 2 parity blocks
+    Block aBlockGroup = new Block(blockId, ecPolicy.getCellSize() * ecPolicy.getNumDataUnits(), 0);
+    BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlockGroup, ecPolicy);
+
+    // create 4 storageInfos, which means 1 block is missing
+    DatanodeStorageInfo ds1 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage1", "1.1.1.1", "rack1", "host1");
+    DatanodeStorageInfo ds2 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage2", "2.2.2.2", "rack2", "host2");
+    DatanodeStorageInfo ds3 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage3", "3.3.3.3", "rack3", "host3");
+    DatanodeStorageInfo ds4 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage4", "4.4.4.4", "rack4", "host4");
+
+    // link the block with the storages
+    aBlockInfoStriped.addStorage(ds1, aBlockGroup);
+    aBlockInfoStriped.addStorage(ds2, new Block(blockId + 1, 0, 0));
+    aBlockInfoStriped.addStorage(ds3, new Block(blockId + 2, 0, 0));
+    aBlockInfoStriped.addStorage(ds4, new Block(blockId + 3, 0, 0));
+
+    addEcBlockToBM(blockId, ecPolicy);
+    aBlockInfoStriped.setBlockCollectionId(mockINodeId);
+
+    // reconstruction should be scheduled
+    BlockReconstructionWork work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNotNull(work);
+
+    // simulate that 2 nodes reach maxReplicationStreams
+    for (int i = 0; i < bm.maxReplicationStreams; i++) {
+      ds3.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
+      ds4.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
+    }
+
+    // reconstruction should be skipped since the number of non-busy nodes is not enough
+    work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNull(work);
+  }
+
+  @Test
+  public void testSkipReconstructionWithManyBusyNodes2() {
+    long blockId = -9223372036854775776L; // real EC block id
+    // RS-3-2 EC policy
+    ErasureCodingPolicy ecPolicy =
+        SystemErasureCodingPolicies.getPolicies().get(1);
+
+    // create an EC block group: 2 data blocks + 2 parity blocks
+    Block aBlockGroup = new Block(blockId,
+        ecPolicy.getCellSize() * (ecPolicy.getNumDataUnits() - 1), 0);
+    BlockInfoStriped aBlockInfoStriped = new BlockInfoStriped(aBlockGroup, ecPolicy);
+
+    // create 3 storageInfos, which means 1 block is missing
+    DatanodeStorageInfo ds1 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage1", "1.1.1.1", "rack1", "host1");
+    DatanodeStorageInfo ds2 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage2", "2.2.2.2", "rack2", "host2");
+    DatanodeStorageInfo ds3 = DFSTestUtil.createDatanodeStorageInfo(
+        "storage3", "3.3.3.3", "rack3", "host3");
+
+    // link the block with the storages
+    aBlockInfoStriped.addStorage(ds1, aBlockGroup);
+    aBlockInfoStriped.addStorage(ds2, new Block(blockId + 1, 0, 0));
+    aBlockInfoStriped.addStorage(ds3, new Block(blockId + 2, 0, 0));
+
+    addEcBlockToBM(blockId, ecPolicy);
+    aBlockInfoStriped.setBlockCollectionId(mockINodeId);
+
+    // reconstruction should be scheduled
+    BlockReconstructionWork work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNotNull(work);
+
+    // simulate that 1 node reaches maxReplicationStreams
+    for (int i = 0; i < bm.maxReplicationStreams; i++) {
+      ds2.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
+    }
+
+    // reconstruction should still be scheduled since 2 source nodes remain for the 2 data blocks
+    work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNotNull(work);
+
+    // simulate that 1 more node reaches maxReplicationStreams
+    for (int i = 0; i < bm.maxReplicationStreams; i++) {
+      ds3.getDatanodeDescriptor().incrementPendingReplicationWithoutTargets();
+    }
+
+    // reconstruction should be skipped since the number of non-busy nodes is not enough
+    work = bm.scheduleReconstruction(aBlockInfoStriped, 3);
+    assertNull(work);
+  }
+
   @Test
   public void testFavorDecomUntilHardLimit() throws Exception {
     bm.maxReplicationStreams = 0;