Skip to content

Commit 56d2497

Browse files
authored
HDFS-13671. Namenode deletes large dir slowly caused by FoldedTreeSet#removeAndGet (#3065)
1 parent 9e7c7ad commit 56d2497

37 files changed

+621
-2620
lines changed

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -291,18 +291,6 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
291291
public static final int DFS_NAMENODE_REPLICATION_MAX_STREAMS_DEFAULT = 2;
292292
public static final String DFS_NAMENODE_REPLICATION_STREAMS_HARD_LIMIT_KEY = "dfs.namenode.replication.max-streams-hard-limit";
293293
public static final int DFS_NAMENODE_REPLICATION_STREAMS_HARD_LIMIT_DEFAULT = 4;
294-
public static final String DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_INTERVAL_MS_KEY
295-
= "dfs.namenode.storageinfo.defragment.interval.ms";
296-
public static final int
297-
DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_INTERVAL_MS_DEFAULT = 10 * 60 * 1000;
298-
public static final String DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_TIMEOUT_MS_KEY
299-
= "dfs.namenode.storageinfo.defragment.timeout.ms";
300-
public static final int
301-
DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_TIMEOUT_MS_DEFAULT = 4;
302-
public static final String DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_RATIO_KEY
303-
= "dfs.namenode.storageinfo.defragment.ratio";
304-
public static final double
305-
DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_RATIO_DEFAULT = 0.75;
306294
public static final String DFS_NAMENODE_BLOCKREPORT_QUEUE_SIZE_KEY
307295
= "dfs.namenode.blockreport.queue.size";
308296
public static final int DFS_NAMENODE_BLOCKREPORT_QUEUE_SIZE_DEFAULT

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocolPB/PBHelper.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -967,8 +967,8 @@ public static JournalInfoProto convert(JournalInfo j) {
967967

968968

969969
public static BlockReportContext convert(BlockReportContextProto proto) {
970-
return new BlockReportContext(proto.getTotalRpcs(), proto.getCurRpc(),
971-
proto.getId(), proto.getLeaseId(), proto.getSorted());
970+
return new BlockReportContext(proto.getTotalRpcs(),
971+
proto.getCurRpc(), proto.getId(), proto.getLeaseId());
972972
}
973973

974974
public static BlockReportContextProto convert(BlockReportContext context) {
@@ -977,7 +977,6 @@ public static BlockReportContextProto convert(BlockReportContext context) {
977977
setCurRpc(context.getCurRpc()).
978978
setId(context.getReportId()).
979979
setLeaseId(context.getLeaseId()).
980-
setSorted(context.isSorted()).
981980
build();
982981
}
983982

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfo.java

Lines changed: 152 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919

2020
import java.io.IOException;
2121
import java.util.Iterator;
22+
import java.util.LinkedList;
2223
import java.util.List;
23-
import java.util.NoSuchElementException;
2424

2525
import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions;
2626
import org.apache.hadoop.classification.InterfaceAudience;
@@ -57,9 +57,19 @@ public abstract class BlockInfo extends Block
5757
/** For implementing {@link LightWeightGSet.LinkedElement} interface. */
5858
private LightWeightGSet.LinkedElement nextLinkedElement;
5959

60-
61-
// Storages this block is replicated on
62-
protected DatanodeStorageInfo[] storages;
60+
/**
61+
* This array contains triplets of references. For the i-th storage that the
62+
* block belongs to, triplets[3*i] is the reference to the
63+
* {@link DatanodeStorageInfo} and triplets[3*i+1] and triplets[3*i+2] are
64+
* references to the previous and the next blocks, respectively, in the list
65+
* of blocks belonging to this storage.
66+
*
67+
* Using previous and next in Object triplets is done instead of a
68+
* {@link LinkedList} list to efficiently use memory. With LinkedList the cost
69+
* per replica is 42 bytes (LinkedList#Entry object per replica) versus 16
70+
* bytes using the triplets.
71+
*/
72+
protected Object[] triplets;
6373

6474
private BlockUnderConstructionFeature uc;
6575

@@ -69,14 +79,14 @@ public abstract class BlockInfo extends Block
6979
* in the block group
7080
*/
7181
public BlockInfo(short size) {
72-
this.storages = new DatanodeStorageInfo[size];
82+
this.triplets = new Object[3 * size];
7383
this.bcId = INVALID_INODE_ID;
7484
this.replication = isStriped() ? 0 : size;
7585
}
7686

7787
public BlockInfo(Block blk, short size) {
7888
super(blk);
79-
this.storages = new DatanodeStorageInfo[size];
89+
this.triplets = new Object[3*size];
8090
this.bcId = INVALID_INODE_ID;
8191
this.replication = isStriped() ? 0 : size;
8292
}
@@ -106,31 +116,7 @@ public boolean isDeleted() {
106116
}
107117

108118
public Iterator<DatanodeStorageInfo> getStorageInfos() {
109-
return new Iterator<DatanodeStorageInfo>() {
110-
111-
private int index = 0;
112-
113-
@Override
114-
public boolean hasNext() {
115-
while (index < storages.length && storages[index] == null) {
116-
index++;
117-
}
118-
return index < storages.length;
119-
}
120-
121-
@Override
122-
public DatanodeStorageInfo next() {
123-
if (!hasNext()) {
124-
throw new NoSuchElementException();
125-
}
126-
return storages[index++];
127-
}
128-
129-
@Override
130-
public void remove() {
131-
throw new UnsupportedOperationException("Sorry. can't remove.");
132-
}
133-
};
119+
return new BlocksMap.StorageIterator(this);
134120
}
135121

136122
public DatanodeDescriptor getDatanode(int index) {
@@ -139,18 +125,73 @@ public DatanodeDescriptor getDatanode(int index) {
139125
}
140126

141127
DatanodeStorageInfo getStorageInfo(int index) {
142-
assert this.storages != null : "BlockInfo is not initialized";
143-
return storages[index];
128+
assert this.triplets != null : "BlockInfo is not initialized";
129+
assert index >= 0 && index*3 < triplets.length : "Index is out of bound";
130+
return (DatanodeStorageInfo)triplets[index*3];
131+
}
132+
133+
BlockInfo getPrevious(int index) {
134+
assert this.triplets != null : "BlockInfo is not initialized";
135+
assert index >= 0 && index*3+1 < triplets.length : "Index is out of bound";
136+
BlockInfo info = (BlockInfo)triplets[index*3+1];
137+
assert info == null ||
138+
info.getClass().getName().startsWith(BlockInfo.class.getName()) :
139+
"BlockInfo is expected at " + index*3;
140+
return info;
141+
}
142+
143+
BlockInfo getNext(int index) {
144+
assert this.triplets != null : "BlockInfo is not initialized";
145+
assert index >= 0 && index*3+2 < triplets.length : "Index is out of bound";
146+
BlockInfo info = (BlockInfo)triplets[index*3+2];
147+
assert info == null || info.getClass().getName().startsWith(
148+
BlockInfo.class.getName()) :
149+
"BlockInfo is expected at " + index*3;
150+
return info;
144151
}
145152

146153
void setStorageInfo(int index, DatanodeStorageInfo storage) {
147-
assert this.storages != null : "BlockInfo is not initialized";
148-
this.storages[index] = storage;
154+
assert this.triplets != null : "BlockInfo is not initialized";
155+
assert index >= 0 && index*3 < triplets.length : "Index is out of bound";
156+
triplets[index*3] = storage;
157+
}
158+
159+
/**
160+
* Return the previous block on the block list for the datanode at
161+
* position index. Set the previous block on the list to "to".
162+
*
163+
* @param index - the datanode index
164+
* @param to - block to be set to previous on the list of blocks
165+
* @return current previous block on the list of blocks
166+
*/
167+
BlockInfo setPrevious(int index, BlockInfo to) {
168+
assert this.triplets != null : "BlockInfo is not initialized";
169+
assert index >= 0 && index*3+1 < triplets.length : "Index is out of bound";
170+
BlockInfo info = (BlockInfo) triplets[index*3+1];
171+
triplets[index*3+1] = to;
172+
return info;
173+
}
174+
175+
/**
176+
* Return the next block on the block list for the datanode at
177+
* position index. Set the next block on the list to "to".
178+
*
179+
* @param index - the datanode index
180+
* @param to - block to be set to next on the list of blocks
181+
* @return current next block on the list of blocks
182+
*/
183+
BlockInfo setNext(int index, BlockInfo to) {
184+
assert this.triplets != null : "BlockInfo is not initialized";
185+
assert index >= 0 && index*3+2 < triplets.length : "Index is out of bound";
186+
BlockInfo info = (BlockInfo) triplets[index*3+2];
187+
triplets[index*3+2] = to;
188+
return info;
149189
}
150190

151191
public int getCapacity() {
152-
assert this.storages != null : "BlockInfo is not initialized";
153-
return storages.length;
192+
assert this.triplets != null : "BlockInfo is not initialized";
193+
assert triplets.length % 3 == 0 : "Malformed BlockInfo";
194+
return triplets.length / 3;
154195
}
155196

156197
/**
@@ -227,6 +268,80 @@ int findStorageInfo(DatanodeStorageInfo storageInfo) {
227268
return -1;
228269
}
229270

271+
/**
272+
* Insert this block into the head of the list of blocks
273+
* related to the specified DatanodeStorageInfo.
274+
* If the head is null then form a new list.
275+
* @return current block as the new head of the list.
276+
*/
277+
BlockInfo listInsert(BlockInfo head, DatanodeStorageInfo storage) {
278+
int dnIndex = this.findStorageInfo(storage);
279+
assert dnIndex >= 0 : "Data node is not found: current";
280+
assert getPrevious(dnIndex) == null && getNext(dnIndex) == null :
281+
"Block is already in the list and cannot be inserted.";
282+
this.setPrevious(dnIndex, null);
283+
this.setNext(dnIndex, head);
284+
if (head != null) {
285+
head.setPrevious(head.findStorageInfo(storage), this);
286+
}
287+
return this;
288+
}
289+
290+
/**
291+
* Remove this block from the list of blocks
292+
* related to the specified DatanodeStorageInfo.
293+
* If this block is the head of the list then return the next block as
294+
* the new head.
295+
* @return the new head of the list or null if the list becomes
296+
* empty after deletion.
297+
*/
298+
BlockInfo listRemove(BlockInfo head, DatanodeStorageInfo storage) {
299+
if (head == null) {
300+
return null;
301+
}
302+
int dnIndex = this.findStorageInfo(storage);
303+
if (dnIndex < 0) { // this block is not on the data-node list
304+
return head;
305+
}
306+
307+
BlockInfo next = this.getNext(dnIndex);
308+
BlockInfo prev = this.getPrevious(dnIndex);
309+
this.setNext(dnIndex, null);
310+
this.setPrevious(dnIndex, null);
311+
if (prev != null) {
312+
prev.setNext(prev.findStorageInfo(storage), next);
313+
}
314+
if (next != null) {
315+
next.setPrevious(next.findStorageInfo(storage), prev);
316+
}
317+
if (this == head) { // removing the head
318+
head = next;
319+
}
320+
return head;
321+
}
322+
323+
/**
324+
* Remove this block from the list of blocks related to the specified
325+
* DatanodeDescriptor. Insert it into the head of the list of blocks.
326+
*
327+
* @return the new head of the list.
328+
*/
329+
public BlockInfo moveBlockToHead(BlockInfo head, DatanodeStorageInfo storage,
330+
int curIndex, int headIndex) {
331+
if (head == this) {
332+
return this;
333+
}
334+
BlockInfo next = this.setNext(curIndex, head);
335+
BlockInfo prev = this.setPrevious(curIndex, null);
336+
337+
head.setPrevious(headIndex, this);
338+
prev.setNext(prev.findStorageInfo(storage), next);
339+
if (next != null) {
340+
next.setPrevious(next.findStorageInfo(storage), prev);
341+
}
342+
return this;
343+
}
344+
230345
@Override
231346
public int hashCode() {
232347
// Super implementation is sufficient

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockInfoContiguous.java

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,20 +38,20 @@ public BlockInfoContiguous(Block blk, short size) {
3838
}
3939

4040
/**
41-
* Ensure that there is enough space to include num more storages.
42-
* @return first free storage index.
41+
* Ensure that there is enough space to include num more triplets.
42+
* @return first free triplet index.
4343
*/
4444
private int ensureCapacity(int num) {
45-
assert this.storages != null : "BlockInfo is not initialized";
45+
assert this.triplets != null : "BlockInfo is not initialized";
4646
int last = numNodes();
47-
if (storages.length >= (last+num)) {
47+
if (triplets.length >= (last+num)*3) {
4848
return last;
4949
}
5050
/* Not enough space left. Create a new array. Should normally
5151
* happen only when replication is manually increased by the user. */
52-
DatanodeStorageInfo[] old = storages;
53-
storages = new DatanodeStorageInfo[(last+num)];
54-
System.arraycopy(old, 0, storages, 0, last);
52+
Object[] old = triplets;
53+
triplets = new Object[(last+num)*3];
54+
System.arraycopy(old, 0, triplets, 0, last * 3);
5555
return last;
5656
}
5757

@@ -63,6 +63,8 @@ boolean addStorage(DatanodeStorageInfo storage, Block reportedBlock) {
6363
// find the last null node
6464
int lastNode = ensureCapacity(1);
6565
setStorageInfo(lastNode, storage);
66+
setNext(lastNode, null);
67+
setPrevious(lastNode, null);
6668
return true;
6769
}
6870

@@ -72,12 +74,18 @@ boolean removeStorage(DatanodeStorageInfo storage) {
7274
if (dnIndex < 0) { // the node is not found
7375
return false;
7476
}
77+
assert getPrevious(dnIndex) == null && getNext(dnIndex) == null :
78+
"Block is still in the list and must be removed first.";
7579
// find the last not null node
7680
int lastNode = numNodes()-1;
77-
// replace current node entry by the lastNode one
81+
// replace current node triplet by the lastNode one
7882
setStorageInfo(dnIndex, getStorageInfo(lastNode));
79-
// set the last entry to null
83+
setNext(dnIndex, getNext(lastNode));
84+
setPrevious(dnIndex, getPrevious(lastNode));
85+
// set the last triplet to null
8086
setStorageInfo(lastNode, null);
87+
setNext(lastNode, null);
88+
setPrevious(lastNode, null);
8189
return true;
8290
}
8391

@@ -96,7 +104,8 @@ boolean isProvided() {
96104

97105
@Override
98106
public int numNodes() {
99-
assert this.storages != null : "BlockInfo is not initialized";
107+
assert this.triplets != null : "BlockInfo is not initialized";
108+
assert triplets.length % 3 == 0 : "Malformed BlockInfo";
100109

101110
for (int idx = getCapacity()-1; idx >= 0; idx--) {
102111
if (getDatanode(idx) != null) {

0 commit comments

Comments
 (0)