Skip to content

Commit 0780710

Browse files
authored
YARN-11567 - Aggregate container launch debug artifacts on error (#6053)
1 parent cc66683 commit 0780710

File tree

4 files changed

+95
-6
lines changed

4 files changed

+95
-6
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,11 @@ private static void addDeprecatedKeys() {
150150
public static final String NM_LOG_CONTAINER_DEBUG_INFO =
151151
YarnConfiguration.NM_PREFIX + "log-container-debug-info.enabled";
152152

153+
public static final String NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR =
154+
YarnConfiguration.NM_PREFIX + "log-container-debug-info-on-error.enabled";
155+
153156
public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO = true;
157+
public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR = false;
154158

155159
////////////////////////////////
156160
// IPC Configs

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1656,6 +1656,21 @@
16561656
<value>true</value>
16571657
</property>
16581658

1659+
<property>
1660+
<description>Generate additional logs about container launches,
1661+
if the container returned with a non-zero exit code.
1662+
Currently, this creates a copy of the launch script and lists the
1663+
directory contents of the container work dir. When listing directory
1664+
contents, we follow symlinks to a max-depth of 5 (including symlinks
1665+
which point outside the container work dir), which may lead to
1666+
slowness in launching containers.
1667+
If yarn.nodemanager.log-container-debug-info.enabled is true,
1668+
this property has no effect on the behavior.
1669+
</description>
1670+
<name>yarn.nodemanager.log-container-debug-info-on-error.enabled</name>
1671+
<value>false</value>
1672+
</property>
1673+
16591674
<property>
16601675
<description>Amount of physical memory, in MB, that can be allocated
16611676
for containers. If set to -1 and

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ public abstract class ContainerExecutor implements Configurable {
102102
private String[] whitelistVars;
103103
private int exitCodeFileTimeout =
104104
YarnConfiguration.DEFAULT_NM_CONTAINER_EXECUTOR_EXIT_FILE_TIMEOUT;
105+
private int containerExitCode;
105106

106107
@Override
107108
public void setConf(Configuration conf) {
@@ -303,7 +304,7 @@ public int reacquireContainer(ContainerReacquisitionContext ctx)
303304

304305
if (pidPath == null) {
305306
LOG.warn("{} is not active, returning terminated error", containerId);
306-
307+
containerExitCode = ExitCode.TERMINATED.getExitCode();
307308
return ExitCode.TERMINATED.getExitCode();
308309
}
309310

@@ -335,7 +336,7 @@ public int reacquireContainer(ContainerReacquisitionContext ctx)
335336
while (!file.exists() && msecLeft >= 0) {
336337
if (!isContainerActive(containerId)) {
337338
LOG.info("{} was deactivated", containerId);
338-
339+
containerExitCode = ExitCode.TERMINATED.getExitCode();
339340
return ExitCode.TERMINATED.getExitCode();
340341
}
341342

@@ -350,7 +351,9 @@ public int reacquireContainer(ContainerReacquisitionContext ctx)
350351
}
351352

352353
try {
353-
return Integer.parseInt(FileUtils.readFileToString(file, StandardCharsets.UTF_8).trim());
354+
containerExitCode = Integer.parseInt(
355+
FileUtils.readFileToString(file, StandardCharsets.UTF_8).trim());
356+
return containerExitCode;
354357
} catch (NumberFormatException e) {
355358
throw new IOException("Error parsing exit code from pid " + pid, e);
356359
}
@@ -453,9 +456,7 @@ public void writeLaunchEnv(OutputStream out, Map<String, String> environment,
453456
}
454457

455458
// dump debugging information if configured
456-
if (getConf() != null &&
457-
getConf().getBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO,
458-
YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO)) {
459+
if (shouldWriteDebugInformation(getConf())) {
459460
sb.echo("Copying debugging information");
460461
sb.copyDebugInformation(new Path(outFilename),
461462
new Path(logDir, outFilename));
@@ -488,6 +489,18 @@ protected File[] readDirAsUser(String user, Path dir) {
488489
return new File(dir.toString()).listFiles();
489490
}
490491

492+
private boolean shouldWriteDebugInformation(Configuration config) {
493+
return config != null && (
494+
config.getBoolean(
495+
YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO,
496+
YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO
497+
) || (
498+
config.getBoolean(
499+
YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR,
500+
YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR
501+
) && containerExitCode != 0));
502+
}
503+
491504
/**
492505
* The container exit code.
493506
*/

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1844,6 +1844,63 @@ public void testDebuggingInformation() throws IOException {
18441844
}
18451845
}
18461846

1847+
@Test
1848+
public void testDebuggingInformationOnError() throws IOException {
1849+
File shellFile = null;
1850+
File tempFile = null;
1851+
Configuration conf = new YarnConfiguration();
1852+
try {
1853+
shellFile = Shell.appendScriptExtension(tmpDir, "hello");
1854+
tempFile = Shell.appendScriptExtension(tmpDir, "temp");
1855+
String testCommand = Shell.WINDOWS ? "@echo \"hello\"" : "echo \"hello\"";
1856+
PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
1857+
FileUtil.setExecutable(shellFile, true);
1858+
writer.println(testCommand);
1859+
writer.close();
1860+
Map<Path, List<String>> resources = new HashMap<>();
1861+
Map<String, String> env = new HashMap<>();
1862+
List<String> commands = new ArrayList<>();
1863+
if (Shell.WINDOWS) {
1864+
commands.add("cmd");
1865+
commands.add("/c");
1866+
commands.add("\"" + shellFile.getAbsolutePath() + "\"");
1867+
} else {
1868+
commands.add("/bin/sh \\\"" + shellFile.getAbsolutePath() + "\\\"");
1869+
}
1870+
conf.setBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO, false);
1871+
conf.setBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR, true);
1872+
FileOutputStream fos = new FileOutputStream(tempFile);
1873+
ContainerExecutor exec = new DefaultContainerExecutor();
1874+
exec.setConf(conf);
1875+
LinkedHashSet<String> nmVars = new LinkedHashSet<>();
1876+
exec.writeLaunchEnv(fos, env, resources, commands,
1877+
new Path(localLogDir.getAbsolutePath()), "user",
1878+
tempFile.getName(), nmVars);
1879+
fos.flush();
1880+
fos.close();
1881+
FileUtil.setExecutable(tempFile, true);
1882+
Shell.ShellCommandExecutor shexc = new Shell.ShellCommandExecutor(
1883+
new String[]{tempFile.getAbsolutePath()}, tmpDir);
1884+
shexc.execute();
1885+
assertThat(shexc.getExitCode()).isZero();
1886+
File directorInfo =
1887+
new File(localLogDir, ContainerExecutor.DIRECTORY_CONTENTS);
1888+
File scriptCopy = new File(localLogDir, tempFile.getName());
1889+
Assert.assertFalse("Directory info file missing",
1890+
directorInfo.exists());
1891+
Assert.assertFalse("Copy of launch script missing",
1892+
scriptCopy.exists());
1893+
} finally {
1894+
// cleanup
1895+
if (shellFile != null && shellFile.exists()) {
1896+
shellFile.delete();
1897+
}
1898+
if (tempFile != null && tempFile.exists()) {
1899+
tempFile.delete();
1900+
}
1901+
}
1902+
}
1903+
18471904
/**
18481905
* Test container launch fault.
18491906
* @throws Exception

0 commit comments

Comments
 (0)