Skip to content

Commit

Permalink
chore: nvidia-persistenced as an extension service
Browse files Browse the repository at this point in the history
Run `nvidia-persistenced` as a Talos Extension Service

Bump nvidia drivers to 510.68.02
pkgs repo bumped [here](siderolabs/pkgs#470)

Use the patch from https://gitlab.com/nvidia/container-toolkit/libnvidia-container/-/merge_requests/165

Signed-off-by: Noel Georgi <git@frezbo.dev>
  • Loading branch information
frezbo committed May 12, 2022
1 parent 3ccc1b5 commit 7cf2843
Show file tree
Hide file tree
Showing 12 changed files with 183 additions and 17 deletions.
3 changes: 1 addition & 2 deletions nonfree/nvidia-container-toolkit/DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@ This is used to create the required NVIDIA device files under `/dev`. This requi
## Updating the nvidia driver version

- Update the driver version in `pkgs` repo [here](https://github.com/siderolabs/pkgs/blob/master/nonfree/kmod-nvidia/pkg.yaml)
- Update the driver version [here](../Pkgfile) and [here](./manifest.yaml)
- Update the driver checksums [here](./nvidia-pkgs/pkg.yaml)
- Update the driver version [here](../vars.yaml)

## Updating the nvidia-container-toolkit version

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ index d73d0f1..c28e982 100644
return (-1);

- argv = (char * []){cnt->cfg.ldconfig, cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
+ argv = (char * []){cnt->cfg.ldconfig, cnt->cfg.libs_dir, cnt->cfg.libs32_dir, "-C", "/etc/ld.so.cache", NULL};
+ argv = (char * []){cnt->cfg.ldconfig, "-f", "/etc/ld.so.conf", "-C", "/etc/ld.so.cache", cnt->cfg.libs_dir, cnt->cfg.libs32_dir, NULL};
if (*argv[0] == '@') {
/*
* We treat this path specially to be relative to the host filesystem.
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,3 @@ ACTION=="add", DEVPATH=="/bus/pci/drivers/nvidia", RUN+="/usr/local/bin/ub-devic

# Create the device node for the nvidia-uvm module
ACTION=="add", DEVPATH=="/module/nvidia_uvm", SUBSYSTEM=="module", RUN+="/usr/local/bin/ub-device-create"

# https://download.nvidia.com/XFree86/Linux-x86_64/510.60.02/README/nvidia-persistenced.html
ACTION=="add", DEVPATH=="/bus/pci/drivers/nvidia", RUN+="/usr/local/bin/nvidia-persistenced --no-persistence-mode --verbose"
12 changes: 6 additions & 6 deletions nonfree/nvidia-container-toolkit/nvidia-device-create/pkg.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# https://download.nvidia.com/XFree86/Linux-x86_64/510.60.02/README/faq.html#devicenodes
# https://download.nvidia.com/XFree86/Linux-x86_64/510.68.02/README/faq.html#devicenodes
# check the section under NVIDIA-INSTALLER -> How and when are the NVIDIA device files created?
name: nvidia-device-create
variant: scratch
Expand All @@ -7,11 +7,11 @@ dependencies:
shell: /bin/bash
steps:
- sources:
# https://github.com/tseliot/nvidia-graphics-drivers/commit/0a622008df6a81cf96a7740f958654fafc23f79d
- url: https://github.com/tseliot/nvidia-graphics-drivers/archive/0a622008df6a81cf96a7740f958654fafc23f79d.tar.gz
# https://github.com/tseliot/nvidia-graphics-drivers/commit/4f9e4b3b5859b7058b0c961762ce1a89df3150ca
- url: https://github.com/tseliot/nvidia-graphics-drivers/archive/4f9e4b3b5859b7058b0c961762ce1a89df3150ca.tar.gz
destination: nvidia-graphics-drivers-build.tar.gz
sha256: 5fce931c2a0e67e00d2786f66f412f87be4a7b2103dda98cd5400675a85eda63
sha512: f043d0f09272fa99130b22c5b3fa4974d60c6f6a258ea45caa0145ea0737d3d2f43eb6caf40f901dd18e20670fff9d1827adb95f89acde53022eb6bb7ca46371
sha256: d59c376a8371d829fb733901b71e163f263648ee7506bd92f7bf2a06e3495455
sha512: 949c6cff617de1cc62364dc781fa195d1b545d868945f323a4fc6de9b2b0467dc6e7a93f32ba1206ddf71d95cc3adca749e7a73d37a9155149e00d139beec9a4
env:
DEBIAN_FRONTEND: noninteractive
prepare:
Expand All @@ -22,7 +22,7 @@ steps:
libkmod-dev \
build-essential
# https://download.nvidia.com/XFree86/Linux-x86_64/510.60.02/README/faq.html#devicenodes
# https://download.nvidia.com/XFree86/Linux-x86_64/510.68.02/README/faq.html#devicenodes
# https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#runfile-verifications
mkdir nvidia-graphics-drivers-build
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
module nvidia-persistenced-wrapper

go 1.18

require golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6 h1:nonptSpoQ4vQjyraW20DXPAglgQfVnM9ZC6MmNLMR60=
golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
package main

import (
	"errors"
	"io/ioutil"
	"log"
	"os"
	"os/exec"
	"os/signal"
	"strconv"
	"strings"

	"golang.org/x/sys/unix"
)

const (
	// stateFolder is the directory holding the nvidia-persistenced state;
	// main removes it entirely before launching a fresh instance.
	stateFolder = "/var/run/nvidia-persistenced"
	// pidFile is the file inside stateFolder from which the pid of a
	// previously started instance is read.
	pidFile = stateFolder + "/nvidia-persistenced.pid"
)

// main replaces any nvidia-persistenced instance recorded in the pid file
// with a freshly started one and relays termination signals to it.
//
// The sequence is:
//  1. If pidFile exists, read the pid from it, kill that process and remove
//     stateFolder. A missing pid file is fine; any other stat error is fatal.
//  2. Start /usr/local/bin/nvidia-persistenced with this process's stdout and
//     stderr.
//  3. Block until SIGINT or SIGTERM is received, forward that signal to the
//     started process and wait for it to exit.
//
// Every failure terminates the wrapper through log.Fatalf.
func main() {
	if _, err := os.Stat(pidFile); err != nil {
		if !errors.Is(err, os.ErrNotExist) {
			log.Fatalf("nvidia-persistenced-wrapper: failed to stat pid file %s: %v\n", pidFile, err)
		}
	} else {
		// a pid file is present: stop whatever it points at before starting anew
		pid, err := getProcessId()
		if err != nil {
			log.Fatalf("nvidia-persistenced-wrapper: error reading pid file %s: %v\n", pidFile, err)
		}

		if err := killProcess(pid); err != nil {
			log.Fatalf("nvidia-persistenced-wrapper: error killing process %d: %v\n", pid, err)
		}

		// now we can remove the state directory
		if err := os.RemoveAll(stateFolder); err != nil {
			log.Fatalf("nvidia-persistenced-wrapper: error removing state directory %s: %v\n", stateFolder, err)
		}
	}

	cmd := exec.Command(
		"/usr/local/bin/nvidia-persistenced",
		"--no-persistence-mode",
		"--verbose",
	)

	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	if err := cmd.Start(); err != nil {
		log.Fatalf("nvidia-persistenced-wrapper: error starting nvidia-persistenced: %v\n", err)
	}

	ch := make(chan os.Signal, 1)
	signal.Notify(ch, unix.SIGINT, unix.SIGTERM)

	// Nothing waits on the started process until a signal arrives: the
	// wrapper stays blocked on the channel receive below even if that
	// process has already exited.
	if err := cmd.Process.Signal(<-ch); err != nil {
		log.Fatalf("nvidia-persistenced-wrapper: error sending signal to nvidia-persistenced: %v\n", err)
	}

	if _, err := cmd.Process.Wait(); err != nil {
		log.Fatalf("nvidia-persistenced-wrapper: error waiting for nvidia-persistenced to exit: %v\n", err)
	}
}

// getProcessId reads pidFile and returns the process id stored in it.
//
// Surrounding whitespace, such as the newline that commonly terminates a pid
// file, is stripped before parsing because strconv.Atoi rejects it.
//
// It returns 0 and an error when the file cannot be read or its content is
// not a decimal integer.
func getProcessId() (int, error) {
	pidData, err := ioutil.ReadFile(pidFile)
	if err != nil {
		return 0, err
	}

	pid, err := strconv.Atoi(strings.TrimSpace(string(pidData)))
	if err != nil {
		return 0, err
	}

	return pid, nil
}

// killProcess kills the process identified by pid.
//
// A process that has already finished (os.ErrProcessDone) counts as success;
// any other error from looking up or killing the process is returned as is.
func killProcess(pid int) error {
	proc, findErr := os.FindProcess(pid)
	if findErr != nil {
		return findErr
	}

	killErr := proc.Kill()
	if errors.Is(killErr, os.ErrProcessDone) {
		// the process is already gone, nothing left to do
		return nil
	}

	return killErr
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: nvidia-persistenced-wrapper
variant: scratch
shell: /toolchain/bin/bash
dependencies:
- stage: base
steps:
- build:
- |
export PATH=${PATH}:${TOOLCHAIN}/go/bin
cp -r /pkg/* .
CGO_ENABLED=0 go build -o nvidia-persistenced-wrapper main.go
install:
- |
mkdir -p /rootfs/usr/local/bin
mkdir -p /rootfs/usr/lib/containers/nvidia-persistenced
cp nvidia-persistenced-wrapper /rootfs/usr/local/bin/nvidia-persistenced-wrapper
finalize:
- from: /rootfs
to: /rootfs
48 changes: 48 additions & 0 deletions nonfree/nvidia-container-toolkit/nvidia-persistenced.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# https://download.nvidia.com/XFree86/Linux-x86_64/510.68.02/README/nvidia-persistenced.html
name: nvidia-persistenced
container:
entrypoint: /usr/local/bin/nvidia-persistenced-wrapper
mounts:
# device files
- source: /dev
destination: /dev
type: bind
options:
- rshared
- rbind
- rw
# shared libraries
- source: /usr/local/glibc
destination: /usr/local/glibc
type: bind
options:
- bind
- ro
# shared libraries
- source: /usr/local/lib
destination: /usr/local/lib
type: bind
options:
- bind
- ro
# service state file
- source: /var/run/nvidia-persistenced
destination: /var/run/nvidia-persistenced
type: bind
options:
- rshared
- rbind
- rw
# binaries
- source: /usr/local/bin
destination: /usr/local/bin
type: bind
options:
- bind
- ro
depends:
- service: cri
# we need to depend on udevd so that the nvidia device files are created
- service: udevd
- path: /sys/bus/pci/drivers/nvidia
restart: always
8 changes: 4 additions & 4 deletions nonfree/nvidia-container-toolkit/nvidia-pkgs/pkg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@ steps:
# {{ if eq .ARCH "aarch64" }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr
- url: https://download.nvidia.com/XFree86/Linux-aarch64/{{ .NVIDIA_DRIVER_VERSION_MAJOR }}.{{ .NVIDIA_DRIVER_VERSION_MINOR }}/NVIDIA-Linux-aarch64-{{ .NVIDIA_DRIVER_VERSION_MAJOR }}.{{ .NVIDIA_DRIVER_VERSION_MINOR }}.run
destination: nvidia.run
sha256: 931521e4fc8175411f2a232e2d3704f8369c21e530283b4fdc4cacb323acc568
sha512: fca54ba6abff197dbce55761a4755e98aebd16a851b54ba072c2a10296eadc7924adc102be6599d16052d94d9a0a4e260a0d63a098e039afe46210c65dfb3b32
sha256: 6a4aa4418813dd691b8a1cc7e4f24d82175e7be07c38563dffa1485c6be56562
sha512: b075e20b8457a1fe16a0ac1f34ff9a94d739673858a9a973c361856e07ab25a56e9ff2856a828866fea00407c5e4a4394d3c7aa6728a9a31bad0905e1d60f002
# {{ else }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr
- url: https://download.nvidia.com/XFree86/Linux-x86_64/{{ .NVIDIA_DRIVER_VERSION_MAJOR }}.{{ .NVIDIA_DRIVER_VERSION_MINOR }}/NVIDIA-Linux-x86_64-{{ .NVIDIA_DRIVER_VERSION_MAJOR }}.{{ .NVIDIA_DRIVER_VERSION_MINOR }}.run
destination: nvidia.run
sha256: a800dfc0549078fd8c6e8e6780efb8eee87872e6055c7f5f386a4768ce07e003
sha512: ccc459bdf5f89a37f79a1831bac8c03980deaa13082b516d5e9c74a49e1aea7f1f6b03304705a95564a390bf0ca38df10b8c8b73e3470a31444dd5ebfd981cfd
sha256: bd2c344ac92b2fc12b06043590a4fe8d4eb0ccb74d0c49352f004cf2d299f4c5
sha512: eb31ed729555075bcc307acc576cb6fdfdd7e397c9e47dd80fc2f55cac6902c3924b69bb91036e5ded1001e81d4b81082ba093dd63d6d97bc313fe78e510131b
# {{ end }} This in fact is YAML comment, but Go templating instruction is evaluated by bldr
env:
DEBIAN_FRONTEND: noninteractive
Expand Down
3 changes: 3 additions & 0 deletions nonfree/nvidia-container-toolkit/pkg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ dependencies:
- stage: nvidia-container-cli
- stage: nvidia-container-runtime
- stage: nvidia-device-create
- stage: nvidia-persistenced-wrapper
steps:
- prepare:
- |
Expand All @@ -16,5 +17,7 @@ steps:
finalize:
- from: /rootfs
to: /rootfs
- from: /pkg/nvidia-persistenced.yaml
to: /rootfs/usr/local/etc/containers/nvidia-persistenced.yaml
- from: /pkg/manifest.yaml
to: /
2 changes: 1 addition & 1 deletion nonfree/vars.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
NVIDIA_DRIVER_VERSION_MAJOR: 510
NVIDIA_DRIVER_VERSION_MINOR: 60.02
NVIDIA_DRIVER_VERSION_MINOR: 68.02
NVIDIA_CONTAINER_TOOLKIT_VERSION: v1.9.0

0 comments on commit 7cf2843

Please sign in to comment.