Skip to content

Performance addon operator code base move to NTO #322

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
51 changes: 40 additions & 11 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,10 @@ TUNED_COMMIT:=682c47c0a9eb5596c2d396b6d0dae4e297414c50
TUNED_DIR:=daemon

# API-related variables
API_TYPES_DIR:=./pkg/apis/tuned/v1
API_TYPES:=$(wildcard $(API_TYPES_DIR)/*_types.go)
API_TYPES_DIR:=pkg/apis
API_TYPES:=$(shell find $(API_TYPES_DIR) -name \*_types.go)
API_ZZ_GENERATED:=zz_generated.deepcopy
API_TYPES_GENERATED:=$(API_TYPES_DIR)/$(API_ZZ_GENERATED).go
API_GO_HEADER_FILE:=pkg/apis/header.go.txt
API_GO_HEADER_FILE:=$(API_TYPES_DIR)/header.go.txt

# Container image-related variables
IMAGE_BUILD_CMD=podman build --no-cache
Expand All @@ -33,6 +32,10 @@ ORG=openshift
TAG=$(shell git rev-parse --abbrev-ref HEAD)
IMAGE=$(REGISTRY)/$(ORG)/origin-cluster-node-tuning-operator:$(TAG)

# PAO variables
CLUSTER ?= "ci"
PAO_CRD_APIS :=$(addprefix ./$(API_TYPES_DIR)/pao/,v2 v1 v1alpha1)

all: build

# Do not put any includes above the "all" target. We want the default target to build
Expand All @@ -59,25 +62,25 @@ $(BINDATA): $(GOBINDATA_BIN) $(ASSETS)

pkg/generated: $(API_TYPES)
$(GO) run k8s.io/code-generator/cmd/deepcopy-gen \
--input-dirs $(PACKAGE)/pkg/apis/tuned/v1 \
--input-dirs $(PACKAGE)/$(API_TYPES_DIR)/tuned/v1,$(PACKAGE)/$(API_TYPES_DIR)/pao/v1alpha1,$(PACKAGE)/$(API_TYPES_DIR)/pao/v1,$(PACKAGE)/$(API_TYPES_DIR)/pao/v2 \
-O $(API_ZZ_GENERATED) \
--go-header-file $(API_GO_HEADER_FILE) \
--bounding-dirs $(PACKAGE)/pkg/apis \
--bounding-dirs $(PACKAGE)/$(API_TYPES_DIR) \
--output-base tmp
$(GO) run k8s.io/code-generator/cmd/client-gen \
--clientset-name versioned \
--input-base '' \
--input $(PACKAGE)/pkg/apis/tuned/v1 \
--input $(PACKAGE)/$(API_TYPES_DIR)/tuned/v1 \
--go-header-file $(API_GO_HEADER_FILE) \
--output-package $(PACKAGE)/pkg/generated/clientset \
--output-base tmp
$(GO) run k8s.io/code-generator/cmd/lister-gen \
--input-dirs $(PACKAGE)/pkg/apis/tuned/v1 \
--input-dirs $(PACKAGE)/$(API_TYPES_DIR)/tuned/v1 \
--go-header-file $(API_GO_HEADER_FILE) \
--output-package $(PACKAGE)/pkg/generated/listers \
--output-base tmp
$(GO) run k8s.io/code-generator/cmd/informer-gen \
--input-dirs $(PACKAGE)/pkg/apis/tuned/v1 \
--input-dirs $(PACKAGE)/$(API_TYPES_DIR)/tuned/v1 \
--versioned-clientset-package $(PACKAGE)/pkg/generated/clientset/versioned \
--listers-package $(PACKAGE)/pkg/generated/listers \
--go-header-file $(API_GO_HEADER_FILE) \
Expand All @@ -86,7 +89,6 @@ pkg/generated: $(API_TYPES)
tar c tmp | tar x --strip-components=4
touch $@


$(GOBINDATA_BIN):
$(GO) build -o $(GOBINDATA_BIN) ./vendor/github.com/kevinburke/go-bindata/go-bindata

Expand Down Expand Up @@ -130,7 +132,8 @@ local-image-push:
# $2 - apis
# $3 - manifests
# $4 - output
$(call add-crd-gen,tuned,$(API_TYPES_DIR),./manifests,./manifests)
$(call add-crd-gen,tuned,./$(API_TYPES_DIR)/tuned/v1,./manifests,./manifests)
$(call add-crd-gen,pao,$(PAO_CRD_APIS),./manifests,./manifests)

# This will include additional actions on the update and verify targets to ensure that profile patches are applied
# to manifest files
Expand All @@ -141,3 +144,29 @@ $(call add-crd-gen,tuned,$(API_TYPES_DIR),./manifests,./manifests)
$(call add-profile-manifests,manifests,./profile-patches,./manifests)

.PHONY: all build deepcopy crd-schema-gen test-e2e verify verify-gofmt clean local-image local-image-push

# PAO (performance addon operator) helper targets.
# None of these targets produce a file; each one only drives scripts under hack/.
.PHONY: cluster-deploy-pao cluster-label-worker-cnf pao-functests pao-functests-only cluster-clean-pao

# Runs the deploy script, handing it the CLUSTER value through its environment.
cluster-deploy-pao:
	@echo "Deploying PAO artifacts"
	CLUSTER=$(CLUSTER) hack/deploy.sh

# Runs the script that adds the worker-cnf label to worker nodes.
cluster-label-worker-cnf:
	@echo "Adding worker-cnf label to worker nodes"
	hack/label-worker-cnf.sh

# Labels the worker nodes first, then runs the functional tests.
pao-functests: cluster-label-worker-cnf pao-functests-only

# Prints the cluster version and runs the functional tests without labelling nodes.
pao-functests-only:
	@echo "Cluster Version"
	hack/show-cluster-version.sh
	hack/run-functests.sh

# Runs the cleanup script for the performance addons artifacts.
cluster-clean-pao:
	@echo "Cleaning up performance addons artifacts"
	hack/clean-deploy.sh
19 changes: 19 additions & 0 deletions assets/pao/assets.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package assets

import (
"embed"
)

var (
// Configs contains all files that placed under the configs directory
//go:embed configs
Configs embed.FS

// Scripts contains all files that placed under the scripts directory
//go:embed scripts
Scripts embed.FS

// Tuned contains all files that placed under the tuned directory
//go:embed tuned
Tuned embed.FS
)
11 changes: 11 additions & 0 deletions assets/pao/configs/99-low-latency-hooks.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"version": "1.0.0",
"hook": {
"path": "/usr/local/bin/low-latency-hooks.sh",
"args": ["low-latency-hooks.sh", "{{.RPSMask}}"]
},
"when": {
"always": true
},
"stages": ["prestart"]
}
1 change: 1 addition & 0 deletions assets/pao/configs/99-netdev-rps.rules
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# For every "add" event in the "net" subsystem whose DEVPATH does not match
# /devices/virtual/net/veth*, tag the device for systemd and ask systemd to
# start the update-rps@ template unit instantiated with the device's kernel
# name (%k).
SUBSYSTEM=="net", ACTION=="add", ENV{DEVPATH}!="/devices/virtual/net/veth*", TAG+="systemd", ENV{SYSTEMD_WANTS}="update-rps@%k.service"
20 changes: 20 additions & 0 deletions assets/pao/configs/99-runtimes.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{{if .ReservedCpus}}
[crio.runtime]
infra_ctr_cpuset = "{{.ReservedCpus}}"
{{end}}

# The default runtime must be repeated here because this snippet overrides the whole runtimes section
[crio.runtime.runtimes.runc]
runtime_path = ""
runtime_type = "oci"
runtime_root = "/run/runc"

# CRI-O checks the allowed_annotations listed under the runtime handler and applies the high-performance hooks
# when one of the high-performance annotations is present on the workload.
# The runtime_path must be set explicitly because we want to re-use the runc binary: there is no
# "high-performance" binary in $PATH for CRI-O to resolve.
[crio.runtime.runtimes.high-performance]
runtime_path = "/bin/runc"
runtime_type = "oci"
runtime_root = "/run/runc"
allowed_annotations = ["cpu-load-balancing.crio.io", "cpu-quota.crio.io", "irq-load-balancing.crio.io"]
26 changes: 26 additions & 0 deletions assets/pao/scripts/hugepages-allocation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
#
# Writes the requested number of huge pages to the nr_hugepages file of one
# NUMA node and keeps retrying until the file reads back the same number.
#
# Environment:
#   NUMA_NODE        node index used in the sysfs path
#   HUGEPAGES_SIZE   huge page size in kB, used in the sysfs path
#   HUGEPAGES_COUNT  number of pages to request
#
# Exits 1 when the sysfs file does not exist or when the requested count is
# still not reported after the retry budget is used up.

set -euo pipefail

hugepages_file="/sys/devices/system/node/node${NUMA_NODE}/hugepages/hugepages-${HUGEPAGES_SIZE}kB/nr_hugepages"

[ -f "${hugepages_file}" ] || {
    echo "ERROR: ${hugepages_file} does not exist"
    exit 1
}

max_wait=60      # retry budget, counted in poll intervals
poll_interval=1  # seconds slept between two attempts
elapsed=0

# The numeric test is kept in `[ ... -ne ... ]` form on purpose: its exit
# status also ends the loop when either side is not an integer.
while [ "$(cat "${hugepages_file}")" -ne "${HUGEPAGES_COUNT}" ]; do
    echo "${HUGEPAGES_COUNT}" >"${hugepages_file}"

    elapsed=$((elapsed + poll_interval))
    if ((elapsed > max_wait)); then
        echo "ERROR: ${hugepages_file} does not have the expected number of hugepages ${HUGEPAGES_COUNT}"
        exit 1
    fi

    sleep "${poll_interval}"
done
35 changes: 35 additions & 0 deletions assets/pao/scripts/low-latency-hooks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
#
# Container hook that applies an RPS CPU mask to the network devices of a
# container.
#
# Usage: low-latency-hooks.sh <rps-mask>
#   A JSON document is read from stdin; its ".pid" field identifies the
#   process whose network namespace is updated.
#
# Exit status: every failure before the final remount is only logged (via
# logger) and the script exits 0. The script exits 1 only when /sys cannot be
# switched back to read-only.

mask="${1}"
[ -n "${mask}" ] || { logger "${0}: The rps-mask parameter is missing" ; exit 0; }

pid=$(jq '.pid' /dev/stdin 2>&1)
[[ $? -eq 0 && -n "${pid}" ]] || { logger "${0}: Failed to extract the pid: ${pid}"; exit 0; }

ns=$(ip netns identify "${pid}" 2>&1)
[[ $? -eq 0 && -n "${ns}" ]] || { logger "${0} Failed to identify the namespace: ${ns}"; exit 0; }

# Updates the container veth RPS mask on the node: every link inside the
# namespace that reports a link_index is looked up by that index among the
# links visible outside the namespace.
netns_link_indexes=$(ip netns exec "${ns}" ip -j link | jq ".[] | select(.link_index != null) | .link_index")
for link_index in ${netns_link_indexes}; do
    container_veth=$(ip -j link | jq ".[] | select(.ifindex == ${link_index}) | .ifname" | tr -d '"')
    echo "${mask}" > "/sys/devices/virtual/net/${container_veth}/queues/rx-0/rps_cpus"
done

# Updates the RPS mask for the interface inside of the container network namespace.
# Remember whether /sys was writable so it can be restored afterwards. The
# previous "$(... && echo rw || echo ro)" form always left $? at 0, so the
# error check that followed it could never fire and has been removed.
if ip netns exec "${ns}" [ -w /sys ]; then
    mode="rw"
else
    mode="ro"
fi

if [ "${mode}" = "ro" ]; then
    res=$(ip netns exec "${ns}" mount -o remount,rw /sys 2>&1)
    [ $? -eq 0 ] || { logger "${0}: Failed to remount /sys as rw: ${res}"; exit 0; }
fi

# /sys/class/net can't be used recursively to find the rps_cpus file, use /sys/devices instead.
# The mask and the file name are passed to the inner shell as positional
# parameters rather than pasted into its command string, so neither value is
# ever parsed as shell code. The "| cat" indirection is kept from the original.
res=$(ip netns exec "${ns}" find /sys/devices -type f -name rps_cpus -exec sh -c 'echo "$1" | cat > "$2"' sh "${mask}" {} \; 2>&1)
[[ $? -eq 0 && -z "${res}" ]] || logger "${0}: Failed to apply the RPS mask: ${res}"

if [ "${mode}" = "ro" ]; then
    # Error out so the pod will not start with a writable /sys
    ip netns exec "${ns}" mount -o remount,ro /sys || exit 1
fi
36 changes: 36 additions & 0 deletions assets/pao/scripts/set-rps-mask.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
#
# Writes an RPS CPU mask to every rps_cpus file under the queues directory of
# a network device.
#
# Usage: set-rps-mask.sh <device> <mask>
#
# Exit status: 1 when an argument is missing; 0 when the device directory
# cannot be found (treated as "interface disappeared"); otherwise the status
# of the final find command.

dev=$1
[ -n "${dev}" ] || { echo "The device argument is missing" >&2 ; exit 1; }

mask=$2
[ -n "${mask}" ] || { echo "The mask argument is missing" >&2 ; exit 1; }

dev_dir="/sys/class/net/${dev}"

# Scans the systemd net device units for one whose SysFSPath ends in the
# requested device name while the unit itself is named after a different
# device. On a match, points the global dev_dir at /sys/class/net/<new name>.
# All scratch variables are local; dev_dir is the only intended output.
function find_dev_dir {
    local systemd_devs systemd_dev dev_sysfs dev_orig_name dev_name

    systemd_devs=$(systemctl list-units -t device | grep sys-subsystem-net-devices | cut -d' ' -f1)

    for systemd_dev in ${systemd_devs}; do
        dev_sysfs=$(systemctl show "${systemd_dev}" -p SysFSPath --value)

        dev_orig_name="${dev_sysfs##*/}"
        if [ "${dev_orig_name}" = "${dev}" ]; then
            # Unit names end in "-<name>.device"; strip both ends to get the name.
            dev_name="${systemd_dev##*-}"
            dev_name="${dev_name%%.device}"
            if [ "${dev_name}" = "${dev}" ]; then # disregard the original device unit
                continue
            fi

            echo "${dev} device was renamed to $dev_name"
            dev_dir="/sys/class/net/${dev_name}"
            break
        fi
    done
}

[ -d "${dev_dir}" ] || find_dev_dir # the net device was renamed, find the new name
[ -d "${dev_dir}" ] || { sleep 5; find_dev_dir; } # search failed, wait a little and try again
[ -d "${dev_dir}" ] || { echo "${dev_dir}" directory not found >&2 ; exit 0; } # the interface disappeared, not an error

# The mask and the file name are handed to the inner shell as positional
# parameters instead of being pasted into its command string, so neither value
# is ever parsed as shell code. The "| cat" indirection is kept from the original.
find "${dev_dir}"/queues -type f -name rps_cpus -exec sh -c 'echo "$1" | cat > "$2"' sh "${mask}" {} \;
132 changes: 132 additions & 0 deletions assets/pao/tuned/openshift-node-performance
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
[main]
summary=Openshift node optimized for deterministic performance at the cost of increased power consumption, focused on low latency network performance. Based on Tuned 2.11 and Cluster node tuning (oc 4.5)
include=openshift-node,cpu-partitioning

# Inheritance of base profiles legend:
# cpu-partitioning -> network-latency -> latency-performance
# https://github.com/redhat-performance/tuned/blob/master/profiles/latency-performance/tuned.conf
# https://github.com/redhat-performance/tuned/blob/master/profiles/network-latency/tuned.conf
# https://github.com/redhat-performance/tuned/blob/master/profiles/cpu-partitioning/tuned.conf

# All values are mapped with a comment where a parent profile contains them.
# Different values will override the original values in parent profiles.

[variables]
#> isolated_cores take a list of ranges; e.g. isolated_cores=2,4-7
{{if .IsolatedCpus}}
isolated_cores={{.IsolatedCpus}}
{{end}}

not_isolated_cores_expanded=${f:cpulist_invert:${isolated_cores_expanded}}

[cpu]
#> latency-performance
#> (override)
force_latency=cstate.id:1|3
governor=performance
energy_perf_bias=performance
min_perf_pct=100

[service]
service.stalld=start,enable

[vm]
#> network-latency
transparent_hugepages=never

{{if not .GloballyDisableIrqLoadBalancing}}
[irqbalance]
#> Override the value set by cpu-partitioning with an empty one
banned_cpus=""
{{end}}

[scheduler]
runtime=0
group.ksoftirqd=0:f:11:*:ksoftirqd.*
group.rcuc=0:f:11:*:rcuc.*
sched_rt_runtime_us=-1
sched_min_granularity_ns=10000000
sched_migration_cost_ns=5000000
numa_balancing=0
{{if not .GloballyDisableIrqLoadBalancing}}
default_irq_smp_affinity = ignore
{{end}}

[sysctl]
#> cpu-partitioning #realtime
kernel.hung_task_timeout_secs = 600
#> cpu-partitioning #realtime
kernel.nmi_watchdog = 0
#> realtime
kernel.sched_rt_runtime_us = -1
# cpu-partitioning and realtime for RHEL disable it (= 0)
# OCP is too dynamic when partitioning and needs to evacuate
#> scheduled timers when starting a guaranteed workload (= 1)
kernel.timer_migration = 1
#> network-latency
kernel.numa_balancing=0
net.core.busy_read=50
net.core.busy_poll=50
net.ipv4.tcp_fastopen=3
#> cpu-partitioning #realtime
vm.stat_interval = 10

# ktune sysctl settings for rhel6 servers, maximizing i/o throughput
#
# Minimal preemption granularity for CPU-bound tasks:
# (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
#> latency-performance
kernel.sched_min_granularity_ns=10000000

# If a workload mostly uses anonymous memory and it hits this limit, the entire
# working set is buffered for I/O, and any more write buffering would require
# swapping, so it's time to throttle writes until I/O can catch up. Workloads
# that mostly use file mappings may be able to use even higher values.
#
# The generator of dirty data starts writeback at this percentage (system default
# is 20%)
#> latency-performance
vm.dirty_ratio=10

# Start background writeback (via writeback threads) at this percentage (system
# default is 10%)
#> latency-performance
vm.dirty_background_ratio=3

# The swappiness parameter controls the tendency of the kernel to move
# processes out of physical memory and onto the swap disk.
# 0 tells the kernel to avoid swapping processes out of physical memory
# for as long as possible
# 100 tells the kernel to aggressively swap processes out of physical memory
# and move them to swap cache
#> latency-performance
vm.swappiness=10

# The total time the scheduler will consider a migrated process
# "cache hot" and thus less likely to be re-migrated
# (system default is 500000, i.e. 0.5 ms)
#> latency-performance
kernel.sched_migration_cost_ns=5000000

[selinux]
#> Custom (atomic host)
avc_cache_threshold=8192

{{if .NetDevices}}
{{.NetDevices}}
{{end}}

[bootloader]
# set empty values to disable RHEL initrd setting in cpu-partitioning
initrd_remove_dir=
initrd_dst_img=
initrd_add_dir=
# overrides cpu-partitioning cmdline
cmdline_cpu_part=+nohz=on rcu_nocbs=${isolated_cores} tuned.non_isolcpus=${not_isolated_cpumask} intel_pstate=disable nosoftlockup
{{if .StaticIsolation}}
cmdline_realtime=+tsc=nowatchdog intel_iommu=on iommu=pt isolcpus=domain,managed_irq,${isolated_cores} systemd.cpu_affinity=${not_isolated_cores_expanded}
{{else}}
cmdline_realtime=+tsc=nowatchdog intel_iommu=on iommu=pt isolcpus=managed_irq,${isolated_cores} systemd.cpu_affinity=${not_isolated_cores_expanded}
{{end}}
cmdline_hugepages=+{{if .DefaultHugepagesSize}} default_hugepagesz={{.DefaultHugepagesSize}} {{end}} {{if .Hugepages}} {{.Hugepages}} {{end}}
cmdline_additionalArg=+{{if .AdditionalArgs}} {{.AdditionalArgs}} {{end}}
Loading