Skip to content

Commit 2764236

Browse files
YARN-9011. Race condition during decommissioning. Contributed by Peter Bacsko
1 parent 7f81172 commit 2764236

File tree

4 files changed

+140
-12
lines changed

4 files changed

+140
-12
lines changed

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/HostsFileReader.java

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ public class HostsFileReader {
5252
.class);
5353

5454
private final AtomicReference<HostDetails> current;
55+
private final AtomicReference<HostDetails> lazyLoaded =
56+
new AtomicReference<>();
5557

5658
public HostsFileReader(String inFile,
5759
String exFile) throws IOException {
@@ -187,7 +189,18 @@ static String readFirstTagValue(Element e, String tag) {
187189

188190
public void refresh(String includesFile, String excludesFile)
189191
throws IOException {
190-
LOG.info("Refreshing hosts (include/exclude) list");
192+
refreshInternal(includesFile, excludesFile, false);
193+
}
194+
195+
public void lazyRefresh(String includesFile, String excludesFile)
196+
throws IOException {
197+
refreshInternal(includesFile, excludesFile, true);
198+
}
199+
200+
private void refreshInternal(String includesFile, String excludesFile,
201+
boolean lazy) throws IOException {
202+
LOG.info("Refreshing hosts (include/exclude) list (lazy refresh = {})",
203+
lazy);
191204
HostDetails oldDetails = current.get();
192205
Set<String> newIncludes = oldDetails.includes;
193206
Map<String, Integer> newExcludes = oldDetails.excludes;
@@ -203,7 +216,21 @@ public void refresh(String includesFile, String excludesFile)
203216
}
204217
HostDetails newDetails = new HostDetails(includesFile, newIncludes,
205218
excludesFile, newExcludes);
206-
current.set(newDetails);
219+
220+
if (lazy) {
221+
lazyLoaded.set(newDetails);
222+
} else {
223+
current.set(newDetails);
224+
}
225+
}
226+
227+
public void finishRefresh() {
228+
if (lazyLoaded.get() == null) {
229+
throw new IllegalStateException(
230+
"Cannot finish refresh - call lazyRefresh() first");
231+
}
232+
current.set(lazyLoaded.get());
233+
lazyLoaded.set(null);
207234
}
208235

209236
@Private
@@ -279,6 +306,10 @@ public HostDetails getHostDetails() {
279306
return current.get();
280307
}
281308

309+
public HostDetails getLazyLoadedHostDetails() {
310+
return lazyLoaded.get();
311+
}
312+
282313
public void setIncludesFile(String includesFile) {
283314
LOG.info("Setting the includes file to " + includesFile);
284315
HostDetails oldDetails = current.get();

hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/util/TestHostsFileReader.java

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import java.io.File;
2121
import java.io.FileWriter;
22+
import java.io.IOException;
2223
import java.nio.file.NoSuchFileException;
2324
import java.util.Map;
2425

@@ -347,4 +348,62 @@ public void testHostFileReaderWithTimeout() throws Exception {
347348
assertTrue(excludes.get("host5") == 1800);
348349
assertTrue(excludes.get("host6") == 1800);
349350
}
350-
}
351+
352+
@Test
353+
public void testLazyRefresh() throws IOException {
354+
FileWriter efw = new FileWriter(excludesFile);
355+
FileWriter ifw = new FileWriter(includesFile);
356+
357+
efw.write("host1\n");
358+
efw.write("host2\n");
359+
efw.close();
360+
ifw.write("host3\n");
361+
ifw.write("host4\n");
362+
ifw.close();
363+
364+
HostsFileReader hfp = new HostsFileReader(includesFile, excludesFile);
365+
366+
ifw = new FileWriter(includesFile);
367+
ifw.close();
368+
369+
efw = new FileWriter(excludesFile, true);
370+
efw.write("host3\n");
371+
efw.write("host4\n");
372+
efw.close();
373+
374+
hfp.lazyRefresh(includesFile, excludesFile);
375+
376+
HostDetails details = hfp.getHostDetails();
377+
HostDetails lazyDetails = hfp.getLazyLoadedHostDetails();
378+
379+
assertEquals("Details: no. of excluded hosts", 2,
380+
details.getExcludedHosts().size());
381+
assertEquals("Details: no. of included hosts", 2,
382+
details.getIncludedHosts().size());
383+
assertEquals("LazyDetails: no. of excluded hosts", 4,
384+
lazyDetails.getExcludedHosts().size());
385+
assertEquals("LayDetails: no. of included hosts", 0,
386+
lazyDetails.getIncludedHosts().size());
387+
388+
hfp.finishRefresh();
389+
390+
details = hfp.getHostDetails();
391+
assertEquals("Details: no. of excluded hosts", 4,
392+
details.getExcludedHosts().size());
393+
assertEquals("Details: no. of included hosts", 0,
394+
details.getIncludedHosts().size());
395+
assertNull("Lazy host details should be null",
396+
hfp.getLazyLoadedHostDetails());
397+
}
398+
399+
@Test(expected = IllegalStateException.class)
400+
public void testFinishRefreshWithoutLazyRefresh() throws IOException {
401+
FileWriter efw = new FileWriter(excludesFile);
402+
FileWriter ifw = new FileWriter(includesFile);
403+
efw.close();
404+
ifw.close();
405+
406+
HostsFileReader hfp = new HostsFileReader(includesFile, excludesFile);
407+
hfp.finishRefresh();
408+
}
409+
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/NodesListManager.java

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,12 @@ public class NodesListManager extends CompositeService implements
8484
private Resolver resolver;
8585
private Timer removalTimer;
8686
private int nodeRemovalCheckInterval;
87+
private Set<RMNode> gracefulDecommissionableNodes;
8788

8889
public NodesListManager(RMContext rmContext) {
8990
super(NodesListManager.class.getName());
9091
this.rmContext = rmContext;
92+
this.gracefulDecommissionableNodes = ConcurrentHashMap.newKeySet();
9193
}
9294

9395
@Override
@@ -115,7 +117,7 @@ protected void serviceInit(Configuration conf) throws Exception {
115117
this.hostsReader =
116118
createHostsFileReader(this.includesFile, this.excludesFile);
117119
setDecommissionedNMs();
118-
printConfiguredHosts();
120+
printConfiguredHosts(false);
119121
} catch (YarnException ex) {
120122
disableHostsFileReader(ex);
121123
} catch (IOException ioe) {
@@ -187,7 +189,7 @@ public void serviceStop() {
187189
removalTimer.cancel();
188190
}
189191

190-
private void printConfiguredHosts() {
192+
private void printConfiguredHosts(boolean graceful) {
191193
if (!LOG.isDebugEnabled()) {
192194
return;
193195
}
@@ -198,7 +200,12 @@ private void printConfiguredHosts() {
198200
conf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
199201
YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH));
200202

201-
HostDetails hostDetails = hostsReader.getHostDetails();
203+
HostDetails hostDetails;
204+
if (graceful) {
205+
hostDetails = hostsReader.getLazyLoadedHostDetails();
206+
} else {
207+
hostDetails = hostsReader.getHostDetails();
208+
}
202209
for (String include : hostDetails.getIncludedHosts()) {
203210
LOG.debug("include: " + include);
204211
}
@@ -235,8 +242,15 @@ private void refreshHostsReader(
235242
yarnConf.get(YarnConfiguration.RM_NODES_EXCLUDE_FILE_PATH,
236243
YarnConfiguration.DEFAULT_RM_NODES_EXCLUDE_FILE_PATH);
237244
LOG.info("refreshNodes excludesFile " + excludesFile);
238-
hostsReader.refresh(includesFile, excludesFile);
239-
printConfiguredHosts();
245+
246+
if (graceful) {
247+
// update hosts, but don't make it visible just yet
248+
hostsReader.lazyRefresh(includesFile, excludesFile);
249+
} else {
250+
hostsReader.refresh(includesFile, excludesFile);
251+
}
252+
253+
printConfiguredHosts(graceful);
240254

241255
LOG.info("hostsReader include:{" +
242256
StringUtils.join(",", hostsReader.getHosts()) +
@@ -270,7 +284,14 @@ private void handleExcludeNodeList(boolean graceful, int timeout) {
270284
// Nodes need to be decommissioned (graceful or forceful);
271285
List<RMNode> nodesToDecom = new ArrayList<RMNode>();
272286

273-
HostDetails hostDetails = hostsReader.getHostDetails();
287+
HostDetails hostDetails;
288+
gracefulDecommissionableNodes.clear();
289+
if (graceful) {
290+
hostDetails = hostsReader.getLazyLoadedHostDetails();
291+
} else {
292+
hostDetails = hostsReader.getHostDetails();
293+
}
294+
274295
Set<String> includes = hostDetails.getIncludedHosts();
275296
Map<String, Integer> excludes = hostDetails.getExcludedMap();
276297

@@ -298,11 +319,13 @@ private void handleExcludeNodeList(boolean graceful, int timeout) {
298319
s != NodeState.DECOMMISSIONING) {
299320
LOG.info("Gracefully decommission " + nodeStr);
300321
nodesToDecom.add(n);
322+
gracefulDecommissionableNodes.add(n);
301323
} else if (s == NodeState.DECOMMISSIONING &&
302324
!Objects.equals(n.getDecommissioningTimeout(),
303325
timeoutToUse)) {
304326
LOG.info("Update " + nodeStr + " timeout to be " + timeoutToUse);
305327
nodesToDecom.add(n);
328+
gracefulDecommissionableNodes.add(n);
306329
} else {
307330
LOG.info("No action for " + nodeStr);
308331
}
@@ -315,6 +338,10 @@ private void handleExcludeNodeList(boolean graceful, int timeout) {
315338
}
316339
}
317340

341+
if (graceful) {
342+
hostsReader.finishRefresh();
343+
}
344+
318345
for (RMNode n : nodesToRecom) {
319346
RMNodeEvent e = new RMNodeEvent(
320347
n.getNodeID(), RMNodeEventType.RECOMMISSION);
@@ -466,6 +493,10 @@ public boolean isValidNode(String hostName) {
466493
hostDetails.getExcludedHosts());
467494
}
468495

496+
boolean isGracefullyDecommissionableNode(RMNode node) {
497+
return gracefulDecommissionableNodes.contains(node);
498+
}
499+
469500
private boolean isValidNode(
470501
String hostName, Set<String> hostsList, Set<String> excludeList) {
471502
String ip = resolver.resolve(hostName);

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceTrackerService.java

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -836,10 +836,17 @@ private void updateAppCollectorsMap(NodeHeartbeatRequest request) {
836836
*/
837837
private boolean isNodeInDecommissioning(NodeId nodeId) {
838838
RMNode rmNode = this.rmContext.getRMNodes().get(nodeId);
839-
if (rmNode != null &&
840-
rmNode.getState().equals(NodeState.DECOMMISSIONING)) {
841-
return true;
839+
840+
if (rmNode != null) {
841+
NodeState state = rmNode.getState();
842+
843+
if (state == NodeState.DECOMMISSIONING ||
844+
(state == NodeState.RUNNING &&
845+
this.nodesListManager.isGracefullyDecommissionableNode(rmNode))) {
846+
return true;
847+
}
842848
}
849+
843850
return false;
844851
}
845852

0 commit comments

Comments
 (0)