Skip to content

Commit eff6cd6

Browse files
committed
Handle cgroups v2 in OsProbe (#76883)
Closes #76812. `OsProbe` was only capable of handling cgroup data in the v1 format. However, Debian 11 uses cgroups v2 by default, so on such systems Elasticsearch wasn't capable of reporting any cgroup information. Therefore, add support for the v2 layout.
1 parent 6c7f692 commit eff6cd6

File tree

4 files changed

+299
-98
lines changed

4 files changed

+299
-98
lines changed

qa/os/src/test/java/org/elasticsearch/packaging/test/DockerTests.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -838,7 +838,6 @@ public void test131InitProcessHasCorrectPID() {
838838
/**
839839
* Check that Elasticsearch reports per-node cgroup information.
840840
*/
841-
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/76812")
842841
public void test140CgroupOsStatsAreAvailable() throws Exception {
843842
waitForElasticsearch(installation);
844843

server/src/main/java/org/elasticsearch/monitor/os/OsProbe.java

Lines changed: 215 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -30,25 +30,31 @@
3030
import java.util.regex.Matcher;
3131
import java.util.regex.Pattern;
3232
import java.util.stream.Collectors;
33+
import java.util.stream.Stream;
3334

3435
/**
3536
* The {@link OsProbe} class retrieves information about the physical and swap size of the machine
3637
* memory, as well as the system load average and cpu load.
3738
*
38-
* In some exceptional cases, it's possible the underlying native methods used by
39+
* <p>In some exceptional cases, it's possible the underlying native methods used by
3940
* {@link #getFreePhysicalMemorySize()}, {@link #getTotalPhysicalMemorySize()},
4041
* {@link #getFreeSwapSpaceSize()}, and {@link #getTotalSwapSpaceSize()} can return a
4142
* negative value. Because of this, we prevent those methods from returning negative values,
4243
* returning 0 instead.
4344
*
44-
* The OS can report a negative number in a number of cases:
45-
* - Non-supported OSes (HP-UX, or AIX)
46-
* - A failure of macOS to initialize host statistics
47-
* - An OS that does not support the {@code _SC_PHYS_PAGES} or {@code _SC_PAGE_SIZE} flags for the {@code sysconf()} linux kernel call
48-
* - An overflow of the product of {@code _SC_PHYS_PAGES} and {@code _SC_PAGE_SIZE}
49-
* - An error case retrieving these values from a linux kernel
50-
* - A non-standard libc implementation not implementing the required values
51-
* For a more exhaustive explanation, see https://github.com/elastic/elasticsearch/pull/42725
45+
* <p>The OS can report a negative number in a number of cases:
46+
*
47+
* <ul>
48+
* <li>Non-supported OSes (HP-UX, or AIX)
49+
* <li>A failure of macOS to initialize host statistics
50+
* <li>An OS that does not support the {@code _SC_PHYS_PAGES} or {@code _SC_PAGE_SIZE} flags for the {@code sysconf()} linux kernel call
51+
* <li>An overflow of the product of {@code _SC_PHYS_PAGES} and {@code _SC_PAGE_SIZE}
52+
* <li>An error case retrieving these values from a linux kernel
53+
* <li>A non-standard libc implementation not implementing the required values
54+
* </ul>
55+
*
56+
* <p>For a more exhaustive explanation, see <a href="https://github.com/elastic/elasticsearch/pull/42725"
57+
* >https://github.com/elastic/elasticsearch/pull/42725</a>
5258
*/
5359
public class OsProbe {
5460

@@ -178,7 +184,7 @@ final double[] getSystemLoadAverage() {
178184
final String procLoadAvg = readProcLoadavg();
179185
assert procLoadAvg.matches("(\\d+\\.\\d+\\s+){3}\\d+/\\d+\\s+\\d+");
180186
final String[] fields = procLoadAvg.split("\\s+");
181-
return new double[]{Double.parseDouble(fields[0]), Double.parseDouble(fields[1]), Double.parseDouble(fields[2])};
187+
return new double[] { Double.parseDouble(fields[0]), Double.parseDouble(fields[1]), Double.parseDouble(fields[2]) };
182188
} catch (final IOException e) {
183189
if (logger.isDebugEnabled()) {
184190
logger.debug("error reading /proc/loadavg", e);
@@ -192,7 +198,7 @@ final double[] getSystemLoadAverage() {
192198
}
193199
try {
194200
final double oneMinuteLoadAverage = (double) getSystemLoadAverage.invoke(osMxBean);
195-
return new double[]{oneMinuteLoadAverage >= 0 ? oneMinuteLoadAverage : -1, -1, -1};
201+
return new double[] { oneMinuteLoadAverage >= 0 ? oneMinuteLoadAverage : -1, -1, -1 };
196202
} catch (IllegalAccessException | InvocationTargetException e) {
197203
if (logger.isDebugEnabled()) {
198204
logger.debug("error reading one minute load average from operating system", e);
@@ -318,6 +324,23 @@ String readSysFsCgroupCpuAcctCpuAcctUsage(final String controlGroup) throws IOEx
318324
return readSingleLine(PathUtils.get("/sys/fs/cgroup/cpuacct", controlGroup, "cpuacct.usage"));
319325
}
320326

327+
private long[] getCgroupV2CpuLimit(String controlGroup) throws IOException {
328+
String entry = readCgroupV2CpuLimit(controlGroup);
329+
String[] parts = entry.split("\\s+");
330+
assert parts.length == 2 : "Expected 2 fields in [cpu.max]";
331+
332+
long[] values = new long[2];
333+
334+
values[0] = "max".equals(parts[0]) ? -1L : Long.parseLong(parts[0]);
335+
values[1] = Long.parseLong(parts[1]);
336+
return values;
337+
}
338+
339+
@SuppressForbidden(reason = "access /sys/fs/cgroup/cpu.max")
340+
String readCgroupV2CpuLimit(String controlGroup) throws IOException {
341+
return readSingleLine(PathUtils.get("/sys/fs/cgroup/", controlGroup, "cpu.max"));
342+
}
343+
321344
/**
322345
* The total period of time in microseconds for how frequently the Elasticsearch control group's access to CPU resources will be
323346
* reallocated.
@@ -454,6 +477,35 @@ String readSysFsCgroupMemoryLimitInBytes(final String controlGroup) throws IOExc
454477
return readSingleLine(PathUtils.get("/sys/fs/cgroup/memory", controlGroup, "memory.limit_in_bytes"));
455478
}
456479

480+
/**
481+
* The maximum amount of user memory (including file cache).
482+
* If there is no limit then some Linux versions return the maximum value that can be stored in an
483+
* unsigned 64 bit number, and this will overflow a long, hence the result type is <code>String</code>.
484+
* (The alternative would have been <code>BigInteger</code> but then it would not be possible to index
485+
* the OS stats document into Elasticsearch without losing information, as <code>BigInteger</code> is
486+
* not a supported Elasticsearch type.)
487+
*
488+
* @param controlGroup the control group for the Elasticsearch process for the {@code memory} subsystem
489+
* @return the maximum amount of user memory (including file cache)
490+
* @throws IOException if an I/O exception occurs reading {@code memory.max} for the control group
491+
*/
492+
private String getCgroupV2MemoryLimitInBytes(final String controlGroup) throws IOException {
493+
return readSysFsCgroupV2MemoryLimitInBytes(controlGroup);
494+
}
495+
496+
/**
497+
* Returns the line from {@code memory.max} for the control group to which the Elasticsearch process belongs for the
498+
* {@code memory} subsystem. This line represents the maximum amount of user memory (including file cache).
499+
*
500+
* @param controlGroup the control group to which the Elasticsearch process belongs for the {@code memory} subsystem
501+
* @return the line from {@code memory.max}
502+
* @throws IOException if an I/O exception occurs reading {@code memory.max} for the control group
503+
*/
504+
@SuppressForbidden(reason = "access /sys/fs/cgroup/memory.max")
505+
String readSysFsCgroupV2MemoryLimitInBytes(final String controlGroup) throws IOException {
506+
return readSingleLine(PathUtils.get("/sys/fs/cgroup/", controlGroup, "memory.max"));
507+
}
508+
457509
/**
458510
* The total current memory usage by processes in the cgroup (in bytes).
459511
* If there is no limit then some Linux versions return the maximum value that can be stored in an
@@ -483,27 +535,94 @@ String readSysFsCgroupMemoryUsageInBytes(final String controlGroup) throws IOExc
483535
return readSingleLine(PathUtils.get("/sys/fs/cgroup/memory", controlGroup, "memory.usage_in_bytes"));
484536
}
485537

538+
/**
539+
* The total current memory usage by processes in the cgroup (in bytes).
540+
* If there is no limit then some Linux versions return the maximum value that can be stored in an
541+
* unsigned 64 bit number, and this will overflow a long, hence the result type is <code>String</code>.
542+
* (The alternative would have been <code>BigInteger</code> but then it would not be possible to index
543+
* the OS stats document into Elasticsearch without losing information, as <code>BigInteger</code> is
544+
* not a supported Elasticsearch type.)
545+
*
546+
* @param controlGroup the control group for the Elasticsearch process for the {@code memory} subsystem
547+
* @return the total current memory usage by processes in the cgroup (in bytes)
548+
* @throws IOException if an I/O exception occurs reading {@code memory.current} for the control group
549+
*/
550+
private String getCgroupV2MemoryUsageInBytes(final String controlGroup) throws IOException {
551+
return readSysFsCgroupV2MemoryUsageInBytes(controlGroup);
552+
}
553+
554+
/**
555+
* Returns the line from {@code memory.current} for the control group to which the Elasticsearch process belongs for the
556+
* {@code memory} subsystem. This line represents the total current memory usage by processes in the cgroup (in bytes).
557+
*
558+
* @param controlGroup the control group to which the Elasticsearch process belongs for the {@code memory} subsystem
559+
* @return the line from {@code memory.current}
560+
* @throws IOException if an I/O exception occurs reading {@code memory.current} for the control group
561+
*/
562+
@SuppressForbidden(reason = "access /sys/fs/cgroup/memory.current")
563+
String readSysFsCgroupV2MemoryUsageInBytes(final String controlGroup) throws IOException {
564+
return readSingleLine(PathUtils.get("/sys/fs/cgroup/", controlGroup, "memory.current"));
565+
}
566+
486567
/**
487568
* Checks if cgroup stats are available by checking for the existence of {@code /proc/self/cgroup}, {@code /sys/fs/cgroup/cpu},
488569
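* {@code /sys/fs/cgroup/cpuacct} and {@code /sys/fs/cgroup/memory}; on cgroup v2, {@code /sys/fs/cgroup/cpu.stat} and {@code /sys/fs/cgroup/memory.stat} are checked in place of the latter three.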
489570
*
490571
* @return {@code true} if the stats are available, otherwise {@code false}
491572
*/
492573
@SuppressForbidden(reason = "access /proc/self/cgroup, /sys/fs/cgroup/cpu, /sys/fs/cgroup/cpuacct and /sys/fs/cgroup/memory")
493-
boolean areCgroupStatsAvailable() {
574+
boolean areCgroupStatsAvailable() throws IOException {
494575
if (Files.exists(PathUtils.get("/proc/self/cgroup")) == false) {
495576
return false;
496577
}
497-
if (Files.exists(PathUtils.get("/sys/fs/cgroup/cpu")) == false) {
498-
return false;
499-
}
500-
if (Files.exists(PathUtils.get("/sys/fs/cgroup/cpuacct")) == false) {
501-
return false;
578+
579+
List<String> lines = readProcSelfCgroup();
580+
581+
// cgroup v2
582+
if (lines.size() == 1 && lines.get(0).startsWith("0::")) {
583+
return Stream.of("/sys/fs/cgroup/cpu.stat", "/sys/fs/cgroup/memory.stat").allMatch(path -> Files.exists(PathUtils.get(path)));
502584
}
503-
if (Files.exists(PathUtils.get("/sys/fs/cgroup/memory")) == false) {
504-
return false;
585+
586+
return Stream.of("/sys/fs/cgroup/cpu", "/sys/fs/cgroup/cpuacct", "/sys/fs/cgroup/memory")
587+
.allMatch(path -> Files.exists(PathUtils.get(path)));
588+
}
589+
590+
/**
591+
* The CPU statistics for all tasks in the Elasticsearch control group.
592+
*
593+
* @param controlGroup the control group to which the Elasticsearch process belongs for the {@code cpu} subsystem
594+
* @return the CPU statistics
595+
* @throws IOException if an I/O exception occurs reading {@code cpu.stat} for the control group
596+
*/
597+
private Map<String, Long> getCgroupV2CpuStats(String controlGroup) throws IOException {
598+
final List<String> lines = readCgroupV2CpuStats(controlGroup);
599+
final Map<String, Long> stats = new HashMap<>();
600+
601+
for (String line : lines) {
602+
String[] parts = line.split("\\s+");
603+
assert parts.length == 2 : "Corrupt cpu.stat line: [" + line + "]";
604+
stats.put(parts[0], Long.parseLong(parts[1]));
505605
}
506-
return true;
606+
607+
final List<String> expectedKeys = org.elasticsearch.core.List.of(
608+
"nr_periods",
609+
"nr_throttled",
610+
"system_usec",
611+
"throttled_usec",
612+
"usage_usec",
613+
"user_usec"
614+
);
615+
expectedKeys.forEach(key -> {
616+
assert stats.containsKey(key) : key;
617+
assert stats.get(key) != -1 : stats.get(key);
618+
});
619+
620+
return stats;
621+
}
622+
623+
@SuppressForbidden(reason = "access /sys/fs/cgroup/cpu.stat")
624+
List<String> readCgroupV2CpuStats(final String controlGroup) throws IOException {
625+
return Files.readAllLines(PathUtils.get("/sys/fs/cgroup", controlGroup, "cpu.stat"));
507626
}
508627

509628
/**
@@ -515,45 +634,79 @@ private OsStats.Cgroup getCgroup() {
515634
try {
516635
if (areCgroupStatsAvailable() == false) {
517636
return null;
518-
} else {
519-
final Map<String, String> controllerMap = getControlGroups();
520-
assert controllerMap.isEmpty() == false;
637+
}
638+
639+
final Map<String, String> controllerMap = getControlGroups();
640+
assert controllerMap.isEmpty() == false;
521641

522-
final String cpuAcctControlGroup = controllerMap.get("cpuacct");
642+
final String cpuAcctControlGroup;
643+
final long cgroupCpuAcctUsageNanos;
644+
final long cgroupCpuAcctCpuCfsPeriodMicros;
645+
final long cgroupCpuAcctCpuCfsQuotaMicros;
646+
final String cpuControlGroup;
647+
final OsStats.Cgroup.CpuStat cpuStat;
648+
final String memoryControlGroup;
649+
final String cgroupMemoryLimitInBytes;
650+
final String cgroupMemoryUsageInBytes;
651+
652+
if (controllerMap.size() == 1 && controllerMap.containsKey("")) {
653+
// There's a single hierarchy for all controllers
654+
cpuControlGroup = cpuAcctControlGroup = memoryControlGroup = controllerMap.get("");
655+
656+
// `cpuacct` was merged with `cpu` in v2
657+
final Map<String, Long> cpuStatsMap = getCgroupV2CpuStats(cpuControlGroup);
658+
659+
cgroupCpuAcctUsageNanos = cpuStatsMap.get("usage_usec");
660+
661+
long[] cpuLimits = getCgroupV2CpuLimit(cpuControlGroup);
662+
cgroupCpuAcctCpuCfsQuotaMicros = cpuLimits[0];
663+
cgroupCpuAcctCpuCfsPeriodMicros = cpuLimits[1];
664+
665+
cpuStat = new OsStats.Cgroup.CpuStat(
666+
cpuStatsMap.get("nr_periods"),
667+
cpuStatsMap.get("nr_throttled"),
668+
cpuStatsMap.get("throttled_usec")
669+
);
670+
671+
cgroupMemoryLimitInBytes = getCgroupV2MemoryLimitInBytes(memoryControlGroup);
672+
cgroupMemoryUsageInBytes = getCgroupV2MemoryUsageInBytes(memoryControlGroup);
673+
} else {
674+
cpuAcctControlGroup = controllerMap.get("cpuacct");
523675
if (cpuAcctControlGroup == null) {
524676
logger.debug("no [cpuacct] data found in cgroup stats");
525677
return null;
526678
}
527-
final long cgroupCpuAcctUsageNanos = getCgroupCpuAcctUsageNanos(cpuAcctControlGroup);
679+
cgroupCpuAcctUsageNanos = getCgroupCpuAcctUsageNanos(cpuAcctControlGroup);
528680

529-
final String cpuControlGroup = controllerMap.get("cpu");
681+
cpuControlGroup = controllerMap.get("cpu");
530682
if (cpuControlGroup == null) {
531683
logger.debug("no [cpu] data found in cgroup stats");
532684
return null;
533685
}
534-
final long cgroupCpuAcctCpuCfsPeriodMicros = getCgroupCpuAcctCpuCfsPeriodMicros(cpuControlGroup);
535-
final long cgroupCpuAcctCpuCfsQuotaMicros = getCgroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup);
536-
final OsStats.Cgroup.CpuStat cpuStat = getCgroupCpuAcctCpuStat(cpuControlGroup);
686+
cgroupCpuAcctCpuCfsPeriodMicros = getCgroupCpuAcctCpuCfsPeriodMicros(cpuControlGroup);
687+
cgroupCpuAcctCpuCfsQuotaMicros = getCgroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup);
688+
cpuStat = getCgroupCpuAcctCpuStat(cpuControlGroup);
537689

538-
final String memoryControlGroup = controllerMap.get("memory");
690+
memoryControlGroup = controllerMap.get("memory");
539691
if (memoryControlGroup == null) {
540692
logger.debug("no [memory] data found in cgroup stats");
541693
return null;
542694
}
543-
final String cgroupMemoryLimitInBytes = getCgroupMemoryLimitInBytes(memoryControlGroup);
544-
final String cgroupMemoryUsageInBytes = getCgroupMemoryUsageInBytes(memoryControlGroup);
545-
546-
return new OsStats.Cgroup(
547-
cpuAcctControlGroup,
548-
cgroupCpuAcctUsageNanos,
549-
cpuControlGroup,
550-
cgroupCpuAcctCpuCfsPeriodMicros,
551-
cgroupCpuAcctCpuCfsQuotaMicros,
552-
cpuStat,
553-
memoryControlGroup,
554-
cgroupMemoryLimitInBytes,
555-
cgroupMemoryUsageInBytes);
695+
cgroupMemoryLimitInBytes = getCgroupMemoryLimitInBytes(memoryControlGroup);
696+
cgroupMemoryUsageInBytes = getCgroupMemoryUsageInBytes(memoryControlGroup);
556697
}
698+
699+
return new OsStats.Cgroup(
700+
cpuAcctControlGroup,
701+
cgroupCpuAcctUsageNanos,
702+
cpuControlGroup,
703+
cgroupCpuAcctCpuCfsPeriodMicros,
704+
cgroupCpuAcctCpuCfsQuotaMicros,
705+
cpuStat,
706+
memoryControlGroup,
707+
cgroupMemoryLimitInBytes,
708+
cgroupMemoryUsageInBytes
709+
);
557710
} catch (final IOException e) {
558711
logger.debug("error reading control group stats", e);
559712
return null;
@@ -576,13 +729,14 @@ public static OsProbe getInstance() {
576729

577730
OsInfo osInfo(long refreshInterval, int allocatedProcessors) throws IOException {
578731
return new OsInfo(
579-
refreshInterval,
580-
Runtime.getRuntime().availableProcessors(),
581-
allocatedProcessors,
582-
Constants.OS_NAME,
583-
getPrettyName(),
584-
Constants.OS_ARCH,
585-
Constants.OS_VERSION);
732+
refreshInterval,
733+
Runtime.getRuntime().availableProcessors(),
734+
allocatedProcessors,
735+
Constants.OS_NAME,
736+
getPrettyName(),
737+
Constants.OS_ARCH,
738+
Constants.OS_VERSION
739+
);
586740
}
587741

588742
private String getPrettyName() throws IOException {
@@ -594,11 +748,13 @@ private String getPrettyName() throws IOException {
594748
* wrapped in single- or double-quotes.
595749
*/
596750
final List<String> etcOsReleaseLines = readOsRelease();
597-
final List<String> prettyNameLines =
598-
etcOsReleaseLines.stream().filter(line -> line.startsWith("PRETTY_NAME")).collect(Collectors.toList());
751+
final List<String> prettyNameLines = etcOsReleaseLines.stream()
752+
.filter(line -> line.startsWith("PRETTY_NAME"))
753+
.collect(Collectors.toList());
599754
assert prettyNameLines.size() <= 1 : prettyNameLines;
600-
final Optional<String> maybePrettyNameLine =
601-
prettyNameLines.size() == 1 ? Optional.of(prettyNameLines.get(0)) : Optional.empty();
755+
final Optional<String> maybePrettyNameLine = prettyNameLines.size() == 1
756+
? Optional.of(prettyNameLines.get(0))
757+
: Optional.empty();
602758
if (maybePrettyNameLine.isPresent()) {
603759
// we trim since some OS contain trailing space, for example, Oracle Linux Server 6.9 has a trailing space after the quote
604760
final String trimmedPrettyNameLine = maybePrettyNameLine.get().trim();
@@ -695,11 +851,15 @@ boolean isDebian8() throws IOException {
695851
return Constants.LINUX && getPrettyName().equals("Debian GNU/Linux 8 (jessie)");
696852
}
697853

854+
OsStats.Cgroup getCgroup(boolean isLinux) {
855+
return isLinux ? getCgroup() : null;
856+
}
857+
698858
public OsStats osStats() {
699859
final OsStats.Cpu cpu = new OsStats.Cpu(getSystemCpuPercent(), getSystemLoadAverage());
700860
final OsStats.Mem mem = new OsStats.Mem(getTotalPhysicalMemorySize(), getFreePhysicalMemorySize());
701861
final OsStats.Swap swap = new OsStats.Swap(getTotalSwapSpaceSize(), getFreeSwapSpaceSize());
702-
final OsStats.Cgroup cgroup = Constants.LINUX ? getCgroup() : null;
862+
final OsStats.Cgroup cgroup = getCgroup(Constants.LINUX);
703863
return new OsStats(System.currentTimeMillis(), cpu, mem, swap, cgroup);
704864
}
705865

0 commit comments

Comments
 (0)