Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(omitting children) to the new group, and leave the CLOS orphaned after
container exit. (#4827)

### Fixed
* Container processes will no longer inherit the CPU affinity of runc by
default. Instead, the default CPU affinity of container processes will be
the largest set of CPUs permitted by the container's cpuset cgroup and any
other system restrictions (such as isolated CPUs). (#4041, #4815, #4858)

## [1.3.0] - 2025-04-30

> Mr. President, we must not allow a mine shaft gap!
Expand Down
51 changes: 50 additions & 1 deletion libcontainer/process_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,46 @@ type setnsProcess struct {
initProcessPid int
}

// tryResetCPUAffinity makes a best-effort attempt to widen the CPU affinity
// of the process identified by pid so that it covers every possible CPU
// (notwithstanding cgroup cpuset restrictions and isolated CPUs). A failure is
// only logged as a warning; nothing is returned to the caller.
func tryResetCPUAffinity(pid int) {
	// The affinity should follow the configured cgroup cpuset (or the default
	// set of all CPUs when no cpuset is configured) instead of some narrower
	// affinity we happened to be spawned in (for instance one inherited from
	// systemd). Joining a cpuset cgroup used to reconfigure the cpumask
	// automatically, but kcommit da019032819a ("sched: Enforce user requested
	// affinity") changed this behaviour in Linux 6.2.
	//
	// Parsing cpuset.cpus.effective would be quite inefficient (and sources
	// like /proc/stat would be wrong for most nested containers). Luckily,
	// sched_setaffinity(2) implicitly:
	//
	//  * clamps the cpumask to the current number of CPUs on the system, and
	//  * masks out any CPUs that are not a member of the target task's
	//    configured cgroup cpuset.
	//
	// Passing a very large mask with every bit set is therefore sufficient:
	// the kernel silently and cheaply converts it to the correct value.

	// Filling the array with 0xFF...FF directly would be ideal, but the
	// element size depends on the architecture and the element type is a
	// private newtype, so neither (^0) nor generics work because both need
	// the type to be nameable. Underflowing the zero value gets us there.
	// TODO: Once <https://golang.org/cl/698015> is merged, switch to that.
	var allCPUs unix.CPUSet
	for idx := range allCPUs {
		allCPUs[idx]-- // zero value wraps around to 0xFF..FF
	}

	err := unix.SchedSetaffinity(pid, &allCPUs)
	if err == nil {
		return
	}
	syscallErr := os.NewSyscallError("sched_setaffinity", err)
	logrus.WithError(syscallErr).
		Warnf("resetting the CPU affinity of pid %d failed -- the container process may inherit runc's CPU affinity", pid)
}

// Starts setns process with specified initial CPU affinity.
func (p *setnsProcess) startWithCPUAffinity() error {
aff := p.config.CPUAffinity
Expand Down Expand Up @@ -189,7 +229,13 @@ func (p *setnsProcess) startWithCPUAffinity() error {

func (p *setnsProcess) setFinalCPUAffinity() error {
aff := p.config.CPUAffinity
if aff == nil || aff.Final == nil {
// If there was no affinity configured at all, we want to reset
// the affinity to make sure we don't inherit an unexpected one.
if aff == nil || aff.Final == nil && aff.Initial == nil {
tryResetCPUAffinity(p.pid())
return nil
}
if aff.Final == nil {
return nil
}
if err := unix.SchedSetaffinity(p.pid(), aff.Final); err != nil {
Expand Down Expand Up @@ -615,6 +661,9 @@ func (p *initProcess) start() (retErr error) {
return fmt.Errorf("unable to apply cgroup configuration: %w", err)
}
}
// Reset the CPU affinity after cgroups are configured to make sure it
// matches any configured cpuset.
tryResetCPUAffinity(p.pid())
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)
Expand Down
109 changes: 109 additions & 0 deletions tests/integration/cpu_affinity.bats
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,14 @@

load helpers

# CPU list this process is allowed to run on, taken from the second field of
# the Cpus_allowed_list line in /proc/self/status when this file is evaluated.
# The "should reset" tests below compare the container's affinity against it.
INITIAL_CPU_MASK="$(grep -F Cpus_allowed_list: /proc/self/status | awk '{ print $2 }')"

# Per-test setup: checks the smp and cgroups_cpuset requirements, prepares the
# busybox bundle, and writes the initial CPU mask to stderr for debugging.
function setup() {
	requires smp cgroups_cpuset
	setup_busybox

	# Both debug lines go to stderr through a single redirection.
	{
		echo "Initial CPU mask: $INITIAL_CPU_MASK"
		echo "---"
	} >&2
}

function teardown() {
Expand Down Expand Up @@ -99,3 +104,107 @@ function cpus_to_mask() {
[[ "$output" == *"nsexec"*": affinity: $mask"* ]]
[[ "$output" == *"Cpus_allowed_list: $final"* ]] # Mind the literal tab.
}

# Checks that a container started without any cpuset does not keep the
# single-CPU affinity that runc itself was launched with (via taskset), and
# instead reports the mask this test file started with.
@test "runc run [CPU affinity should reset]" {
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
# bash function (which is what runc and __runc are).
setup_runc_cmdline

first="$(first_cpu)"

# Running without cpuset should result in an affinity for all CPUs.
update_config '.process.args = [ "/bin/grep", "-F", "Cpus_allowed_list:", "/proc/self/status" ]'
update_config 'del(.linux.resources.cpu)'
# Launch runc pinned to a single CPU so an inherited affinity is detectable.
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run ctr
[ "$status" -eq 0 ]
# The container must not report just the pinned CPU...
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
# ...but the mask captured in INITIAL_CPU_MASK.
[[ "$output" == $'Cpus_allowed_list:\t'"$INITIAL_CPU_MASK" ]]
}

# Checks that a container started with a configured cpuset ends up with an
# affinity matching that cpuset, even though runc itself was launched pinned
# to a single CPU via taskset. Two cpusets are tried: one that contains the
# pinned CPU and one that does not overlap with it at all.
@test "runc run [CPU affinity should reset to cgroup cpuset]" {
# When not running as root, the rootless_cgroup requirement must hold.
[ $EUID -ne 0 ] && requires rootless_cgroup
set_cgroups_path

# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
# bash function (which is what runc and __runc are).
setup_runc_cmdline

first="$(first_cpu)"
second="$((first + 1))" # Hacky; might not work in all environments.

# Running with a cpuset should result in an affinity that matches.
update_config '.process.args = [ "/bin/grep", "-F", "Cpus_allowed_list:", "/proc/self/status" ]'
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$first-$second"'"}'
# Launch runc pinned to $first only; the cpuset spans $first-$second.
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run ctr
[ "$status" -eq 0 ]
# The container must not report just the pinned CPU.
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
# XXX: For some reason, systemd-cgroup leads to us using the all-set
# cpumask rather than the cpuset we configured?
# The exact-match check is therefore skipped when RUNC_USE_SYSTEMD is set.
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$first-$second" ]]

# Ditto for a cpuset that has no overlap with the original cpumask.
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$second"'"}'
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run ctr
[ "$status" -eq 0 ]
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
# XXX: For some reason, systemd-cgroup leads to us using the all-set
# cpumask rather than the cpuset we configured?
# The exact-match check is therefore skipped when RUNC_USE_SYSTEMD is set.
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$second" ]]
}

# Checks that a process started with "runc exec" in a container without any
# cpuset does not keep the single-CPU affinity that runc was launched with
# (via taskset), and instead reports the mask this test file started with.
@test "runc exec [default CPU affinity should reset]" {
# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
# bash function (which is what runc and __runc are).
setup_runc_cmdline

first="$(first_cpu)"

# Running without cpuset should result in an affinity for all CPUs.
update_config '.process.args = [ "/bin/sleep", "infinity" ]'
update_config 'del(.linux.resources.cpu)'
# Start a detached, long-running container to exec into.
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run -d --console-socket "$CONSOLE_SOCKET" ctr3
[ "$status" -eq 0 ]
# The exec invocation is pinned to a single CPU as well.
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" exec ctr3 grep -F Cpus_allowed_list: /proc/self/status
[ "$status" -eq 0 ]
# The exec'd process must not report just the pinned CPU...
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
# ...but the mask captured in INITIAL_CPU_MASK.
[[ "$output" == $'Cpus_allowed_list:\t'"$INITIAL_CPU_MASK" ]]
}

# Checks that a process started with "runc exec" in a container with a
# configured cpuset ends up with an affinity matching that cpuset, even though
# runc was launched pinned to a single CPU via taskset. Two cpusets are tried:
# one that contains the pinned CPU and one that does not overlap with it.
@test "runc exec [default CPU affinity should reset to cgroup cpuset]" {
# When not running as root, the rootless_cgroup requirement must hold.
[ $EUID -ne 0 ] && requires rootless_cgroup
set_cgroups_path

# We need to use RUNC_CMDLINE since taskset requires a proper binary, not a
# bash function (which is what runc and __runc are).
setup_runc_cmdline

first="$(first_cpu)"
second="$((first + 1))" # Hacky; might not work in all environments.

# Running with a cpuset should result in an affinity that matches.
update_config '.process.args = [ "/bin/sleep", "infinity" ]'
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$first-$second"'"}'
# Start a detached, long-running container to exec into.
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run -d --console-socket "$CONSOLE_SOCKET" ctr
[ "$status" -eq 0 ]
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" exec ctr grep -F Cpus_allowed_list: /proc/self/status
[ "$status" -eq 0 ]
# The exec'd process must not report just the pinned CPU.
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
# XXX: For some reason, systemd-cgroup leads to us using the all-set
# cpumask rather than the cpuset we configured?
# The exact-match check is therefore skipped when RUNC_USE_SYSTEMD is set.
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$first-$second" ]]

# Stop the container so we can reconfigure it.
runc delete -f ctr
[ "$status" -eq 0 ]

# Ditto for a cpuset that has no overlap with the original cpumask.
update_config '.linux.resources.cpu = {"mems": "0", "cpus": "'"$second"'"}'
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" run -d --console-socket "$CONSOLE_SOCKET" ctr
[ "$status" -eq 0 ]
sane_run taskset -c "$first" "${RUNC_CMDLINE[@]}" exec ctr grep -F Cpus_allowed_list: /proc/self/status
[ "$status" -eq 0 ]
[[ "$output" != $'Cpus_allowed_list:\t'"$first" ]]
# XXX: For some reason, systemd-cgroup leads to us using the all-set
# cpumask rather than the cpuset we configured?
# The exact-match check is therefore skipped when RUNC_USE_SYSTEMD is set.
[ -v RUNC_USE_SYSTEMD ] || [[ "$output" == $'Cpus_allowed_list:\t'"$second" ]]
}
28 changes: 22 additions & 6 deletions tests/integration/helpers.bash
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,38 @@ ARCH=$(uname -m)
# Seccomp agent socket.
SECCCOMP_AGENT_SOCKET="$BATS_TMPDIR/seccomp-agent.sock"

# Wrapper for runc.
function runc() {
run __runc "$@"
# Wrapper around "run" that logs output to make tests easier to debug.
function sane_run() {
local cmd="$1"
local cmdname="${CMDNAME:-$(basename "$cmd")}"
shift

run "$cmd" "$@"

# Some debug information to make life easier. bats will only print it if the
# test failed, in which case the output is useful.
# shellcheck disable=SC2154
echo "$(basename "$RUNC") $* (status=$status):" >&2
echo "$cmdname $* (status=$status)" >&2
# shellcheck disable=SC2154
echo "$output" >&2
}

# Wrapper for runc.
# Runs __runc through sane_run with the given arguments. CMDNAME is set for
# this one call to the basename of $RUNC, so the debug line printed by
# sane_run names the runc binary rather than "__runc".
function runc() {
CMDNAME="$(basename "$RUNC")" sane_run __runc "$@"
}

# Populates the RUNC_CMDLINE array with the runc binary followed by the global
# flags derived from the environment:
#   --systemd-cgroup       when RUNC_USE_SYSTEMD is set (even if empty);
#   --root "$ROOT/state"   when ROOT is set and non-empty.
function setup_runc_cmdline() {
	RUNC_CMDLINE=("$RUNC")

	if [[ -v RUNC_USE_SYSTEMD ]]; then
		RUNC_CMDLINE+=("--systemd-cgroup")
	fi

	if [[ -n "${ROOT:-}" ]]; then
		RUNC_CMDLINE+=("--root" "$ROOT/state")
	fi

	export RUNC_CMDLINE
}

# Raw wrapper for runc.
function __runc() {
"$RUNC" ${RUNC_USE_SYSTEMD+--systemd-cgroup} \
${ROOT:+--root "$ROOT/state"} "$@"
setup_runc_cmdline
"${RUNC_CMDLINE[@]}" "$@"
}

# Wrapper for runc spec.
Expand Down