From 76ea5dc465860a6d43dabe7eb04385d09844d751 Mon Sep 17 00:00:00 2001 From: Noel Georgi Date: Tue, 7 Apr 2026 12:32:47 +0800 Subject: [PATCH] feat: nvidia extension rework Rework and cleanup the NVIDIA extensions so that it uses standard paths and is easier to maintain. Signed-off-by: Noel Georgi (cherry picked from commit fdd02585f835159c1266f1b141f67423e3450d2a) --- internal/base/pkg.yaml | 2 +- misc/glibc/ld.so.conf | 2 +- misc/glibc/pkg.yaml | 39 ++++++++----------- .../lts/nvidia-cdi-gen.yaml | 32 ++++++++++++--- .../lts/nvidia-persistenced.yaml | 6 ++- .../nvidia-container-runtime/pkg.yaml | 5 ++- .../nvidia-pkgs/lts/pkg.yaml | 21 +++++++--- .../nvidia-pkgs/production/pkg.yaml | 21 +++++++--- .../production/nvidia-cdi-gen.yaml | 31 ++++++++++++--- .../production/nvidia-persistenced.yaml | 6 ++- .../lts/nvidia-fabricmanager.yaml | 13 ++++++- .../production/nvidia-fabricmanager.yaml | 13 ++++++- tools/nvme-cli/pkg.yaml | 4 ++ 13 files changed, 138 insertions(+), 57 deletions(-) diff --git a/internal/base/pkg.yaml b/internal/base/pkg.yaml index d58e87f..00754af 100644 --- a/internal/base/pkg.yaml +++ b/internal/base/pkg.yaml @@ -3,7 +3,7 @@ variant: scratch shell: /bin/bash dependencies: - image: "{{ .BUILD_ARG_TOOLS_PREFIX }}/tools:{{ .BUILD_ARG_TOOLS }}" - - image: ghcr.io/siderolabs/extensions-validator:52d35f8 + - image: ghcr.io/siderolabs/extensions-validator:84cecb0 finalize: - from: / to: / diff --git a/misc/glibc/ld.so.conf b/misc/glibc/ld.so.conf index 7b8c1c9..e8e4bdc 100644 --- a/misc/glibc/ld.so.conf +++ b/misc/glibc/ld.so.conf @@ -1 +1 @@ -/usr/local/glibc/usr/lib +/usr/local/lib diff --git a/misc/glibc/pkg.yaml b/misc/glibc/pkg.yaml index 069e7f3..cfb10e4 100644 --- a/misc/glibc/pkg.yaml +++ b/misc/glibc/pkg.yaml @@ -28,10 +28,11 @@ steps: mkdir build cd build + # setting `--disable-sanity-checks` is fine in this case since glibc is an extension and base talos uses musl ../configure \ - --prefix=/usr/local/glibc \ - --libdir=/usr/local/glibc/usr/lib \ - --libexecdir=/usr/local/glibc/usr/lib \ + --prefix=/usr/local \ + --sysconfdir=/etc \ + --disable-sanity-checks \ --enable-stack-protection=strong \ --disable-werror build: @@ -40,34 +41,26 @@ steps: make -j $(nproc) install: - | - mkdir -p \ - /rootfs/usr/local/glibc/usr/bin \ - /rootfs/usr/local/glibc/usr/lib \ - /rootfs/usr/local/glibc/usr/lib32 \ - /rootfs/usr/local/glibc/usr/sbin - ln -s usr/bin/ /rootfs/usr/local/glibc/bin - ln -s usr/lib/ /rootfs/usr/local/glibc/lib - ln -s usr/lib/ /rootfs/usr/local/glibc/lib64 - ln -s usr/sbin/ /rootfs/usr/local/glibc/sbin - ln -s lib/ /rootfs/usr/local/glibc/usr/lib64 - cd build - make install DESTDIR=/rootfs - - cp /pkg/ld.so.conf /rootfs/usr/local/glibc/etc/ld.so.conf + make install DESTDIR=/rootfs -j $(nproc) ARCH_UPPER="${ARCH^^}" LD_LINUX_PATH="LD_LINUX_${ARCH_UPPER}" export LD_LINUX_PATH="${!LD_LINUX_PATH}" - mkdir -p /rootfs/usr/lib /rootfs/usr/bin - ln -s /usr/local/glibc/usr/lib/${LD_LINUX_PATH} /rootfs/usr/lib/${LD_LINUX_PATH} - ln -s /usr/local/glibc/usr/sbin/ldconfig /rootfs/usr/bin/ldconfig + mkdir -p /rootfs/{etc,usr/{bin,lib}} + + cp /pkg/ld.so.conf /rootfs/etc/ld.so.conf + + ln -s /usr/local/lib/${LD_LINUX_PATH} /rootfs/usr/lib/${LD_LINUX_PATH} + ln -s /usr/local/sbin/ldconfig /rootfs/usr/bin/ldconfig + ln -s lib/ /rootfs/usr/local/lib64 # cleanup - rm -rf /rootfs/usr/local/glibc/include - rm -rf /rootfs/usr/local/glibc/share - rm -rf /rootfs/usr/local/glibc/var + rm -rf /rootfs/usr/local/include + rm -rf /rootfs/usr/local/share + rm -rf /rootfs/usr/local/var + rm -f /rootfs/etc/rpc sbom: outputPath: /rootfs/usr/local/share/spdx/glibc.spdx.json version: {{ .GLIBC_VERSION }} diff --git a/nvidia-gpu/nvidia-container-toolkit/lts/nvidia-cdi-gen.yaml b/nvidia-gpu/nvidia-container-toolkit/lts/nvidia-cdi-gen.yaml index 2ae5e09..92c6f79 100644 --- a/nvidia-gpu/nvidia-container-toolkit/lts/nvidia-cdi-gen.yaml +++ b/nvidia-gpu/nvidia-container-toolkit/lts/nvidia-cdi-gen.yaml @@ -1,12 +1,9 @@ - name: nvidia-cdi-gen container: entrypoint: /usr/local/bin/nvidia-ctk args: - cdi - generate - - --library-search-path - - /usr/local/glibc/usr/lib - --output - /run/cdi/nvidia.yaml mounts: @@ -19,6 +16,7 @@ container: - rbind - ro # shared libraries + # for amd64 it's /lib64/ld-linux-x86-64.so.2 - source: /lib64 destination: /lib64 type: bind @@ -26,15 +24,37 @@ container: - bind - ro # shared libraries + # for arm64 it's /lib/ld-linux-aarch64.so.1 - source: /lib destination: /lib type: bind options: - bind - ro - # shared libraries - - source: /usr/local/glibc - destination: /usr/local/glibc + # shared libraries /usr/local/lib, nvidia files in /usr/local/share + - source: /usr/local + destination: /usr/local + type: bind + options: + - bind + - ro + # nvidia files in /usr/share + - source: /usr/share + destination: /usr/share + type: bind + options: + - bind + - ro + # firmware files + - source: /lib/firmware + destination: /lib/firmware + type: bind + options: + - bind + - ro + # glibc etc files, nvidia etc files + - source: /etc + destination: /etc type: bind options: - bind diff --git a/nvidia-gpu/nvidia-container-toolkit/lts/nvidia-persistenced.yaml b/nvidia-gpu/nvidia-container-toolkit/lts/nvidia-persistenced.yaml index 7ae72a1..0f868ab 100644 --- a/nvidia-gpu/nvidia-container-toolkit/lts/nvidia-persistenced.yaml +++ b/nvidia-gpu/nvidia-container-toolkit/lts/nvidia-persistenced.yaml @@ -12,6 +12,7 @@ container: - rbind - rw # shared libraries + # for amd64 it's /lib64/ld-linux-x86-64.so.2 - source: /lib64 destination: /lib64 type: bind @@ -19,6 +20,7 @@ container: - bind - ro # shared libraries + # for arm64 it's /lib/ld-linux-aarch64.so.1 - source: /lib destination: /lib type: bind @@ -26,8 +28,8 @@ container: - bind - ro # shared libraries - - source: /usr/local/glibc - destination: /usr/local/glibc + - source: /usr/local/lib + destination: /usr/local/lib type: bind options: - bind diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/pkg.yaml index 9727b07..faec673 100644 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/pkg.yaml +++ b/nvidia-gpu/nvidia-container-toolkit/nvidia-container-runtime/pkg.yaml @@ -30,11 +30,14 @@ steps: make cmds install: - | - mkdir -p /rootfs/usr/local/bin + mkdir -p /rootfs/usr/{bin,local/bin} cd container-toolkit find . -maxdepth 1 -type f -executable -exec cp {} /rootfs/usr/local/bin/ \; + + ln -s /usr/local/bin/nvidia-ctk /rootfs/usr/bin/nvidia-ctk + ln -s /usr/local/bin/nvidia-cdi-hook /rootfs/usr/bin/nvidia-cdi-hook - | mkdir -p /rootfs/etc/cri/conf.d cp /pkg/10-nvidia-container-runtime.part /rootfs/etc/cri/conf.d/10-nvidia-container-runtime.part diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/pkg.yaml index 049c0a9..3cd8545 100644 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/pkg.yaml +++ b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/lts/pkg.yaml @@ -59,15 +59,11 @@ steps: ./nvidia-installer --silent \ --opengl-prefix=/rootfs/usr/local \ --utility-prefix=/rootfs/usr/local \ - --utility-libdir=glibc/usr/lib \ --documentation-prefix=/rootfs/usr/local \ --glvnd-egl-config-path=/rootfs/usr/share/glvnd/egl_vendor.d \ --egl-external-platform-config-path=/rootfs/usr/share/egl/egl_external_platform.d \ --x-prefix=/rootfs/usr/local \ - --x-library-path=glibc/usr/lib \ - --x-module-path=/rootfs/usr/local/glibc/usr/lib/xorg/modules \ - --gbm-backend-dir=glibc/usr/lib/gbm \ - --opengl-libdir=glibc/usr/lib \ + --x-module-path=/rootfs/usr/local/lib/xorg/modules \ --no-rpms \ --no-kernel-modules \ --log-file-name=/tmp/nvidia-installer.log \ @@ -88,8 +84,12 @@ steps: # TODO: should we allow this in extension spec, cdi doesn't seem to check this files # --override-file-type-destination=CUDA_ICD:/rootfs/etc/OpenCL/vendors \ + ln -s /usr/local/bin/nvidia-smi /rootfs/usr/bin/nvidia-smi + ln -s /usr/local/bin/nvidia-modprobe /rootfs/usr/bin/nvidia-modprobe + ln -s /usr/local/bin/nvidia-pcc /rootfs/usr/bin/nvidia-pcc + # run ldconfig to update the cache - /rootfs/usr/local/glibc/sbin/ldconfig -r /rootfs + /rootfs/usr/local/sbin/ldconfig -r /rootfs mkdir -p /rootfs/usr/local/lib/containers/nvidia-persistenced \ /rootfs/usr/local/etc/containers \ @@ -97,6 +97,15 @@ steps: # copy udev rule cp /pkg/files/15-nvidia-device.rules /rootfs/usr/lib/udev/rules.d + - | + # Fix symlinks whose targets are rooted under /rootfs + # currently this is only one file that I noticed but better to be safe and fix all of them + find /rootfs -type l | while read -r link; do + target="$(readlink "${link}")" + if [[ "${target}" == /rootfs/* ]]; then + ln -sfn "${target#/rootfs}" "${link}" + fi + done finalize: - from: /rootfs to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/pkg.yaml b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/pkg.yaml index 7917129..7392771 100644 --- a/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/pkg.yaml +++ b/nvidia-gpu/nvidia-container-toolkit/nvidia-pkgs/production/pkg.yaml @@ -58,15 +58,11 @@ steps: ./nvidia-installer --silent \ --opengl-prefix=/rootfs/usr/local \ --utility-prefix=/rootfs/usr/local \ - --utility-libdir=glibc/usr/lib \ --documentation-prefix=/rootfs/usr/local \ --glvnd-egl-config-path=/rootfs/usr/share/glvnd/egl_vendor.d \ --egl-external-platform-config-path=/rootfs/usr/share/egl/egl_external_platform.d \ --x-prefix=/rootfs/usr/local \ - --x-library-path=glibc/usr/lib \ - --x-module-path=/rootfs/usr/local/glibc/usr/lib/xorg/modules \ - --gbm-backend-dir=glibc/usr/lib/gbm \ - --opengl-libdir=glibc/usr/lib \ + --x-module-path=/rootfs/usr/local/lib/xorg/modules \ --no-rpms \ --no-kernel-modules \ --log-file-name=/tmp/nvidia-installer.log \ @@ -87,8 +83,12 @@ steps: # TODO: should we allow this in extension spec, cdi doesn't seem to check this files # --override-file-type-destination=CUDA_ICD:/rootfs/etc/OpenCL/vendors \ + ln -s /usr/local/bin/nvidia-smi /rootfs/usr/bin/nvidia-smi + ln -s /usr/local/bin/nvidia-modprobe /rootfs/usr/bin/nvidia-modprobe + ln -s /usr/local/bin/nvidia-pcc /rootfs/usr/bin/nvidia-pcc + # run ldconfig to update the cache - /rootfs/usr/local/glibc/sbin/ldconfig -r /rootfs + /rootfs/usr/local/sbin/ldconfig -r /rootfs mkdir -p /rootfs/usr/local/lib/containers/nvidia-persistenced \ /rootfs/usr/local/etc/containers \ @@ -96,6 +96,15 @@ steps: # copy udev rule cp /pkg/files/15-nvidia-device.rules /rootfs/usr/lib/udev/rules.d + - | + # Fix symlinks whose targets are rooted under /rootfs + # currently this is only one file that I noticed but better to be safe and fix all of them + find /rootfs -type l | while read -r link; do + target="$(readlink "${link}")" + if [[ "${target}" == /rootfs/* ]]; then + ln -sfn "${target#/rootfs}" "${link}" + fi + done finalize: - from: /rootfs to: /rootfs diff --git a/nvidia-gpu/nvidia-container-toolkit/production/nvidia-cdi-gen.yaml b/nvidia-gpu/nvidia-container-toolkit/production/nvidia-cdi-gen.yaml index 2af0565..92c6f79 100644 --- a/nvidia-gpu/nvidia-container-toolkit/production/nvidia-cdi-gen.yaml +++ b/nvidia-gpu/nvidia-container-toolkit/production/nvidia-cdi-gen.yaml @@ -4,8 +4,6 @@ container: args: - cdi - generate - - --library-search-path - - /usr/local/glibc/usr/lib - --output - /run/cdi/nvidia.yaml mounts: @@ -18,6 +16,7 @@ container: - rbind - ro # shared libraries + # for amd64 it's /lib64/ld-linux-x86-64.so.2 - source: /lib64 destination: /lib64 type: bind @@ -25,15 +24,37 @@ container: - bind - ro # shared libraries + # for arm64 it's /lib/ld-linux-aarch64.so.1 - source: /lib destination: /lib type: bind options: - bind - ro - # shared libraries - - source: /usr/local/glibc - destination: /usr/local/glibc + # shared libraries /usr/local/lib, nvidia files in /usr/local/share + - source: /usr/local + destination: /usr/local + type: bind + options: + - bind + - ro + # nvidia files in /usr/share + - source: /usr/share + destination: /usr/share + type: bind + options: + - bind + - ro + # firmware files + - source: /lib/firmware + destination: /lib/firmware + type: bind + options: + - bind + - ro + # glibc etc files, nvidia etc files + - source: /etc + destination: /etc type: bind options: - bind diff --git a/nvidia-gpu/nvidia-container-toolkit/production/nvidia-persistenced.yaml b/nvidia-gpu/nvidia-container-toolkit/production/nvidia-persistenced.yaml index 7ae72a1..0f868ab 100644 --- a/nvidia-gpu/nvidia-container-toolkit/production/nvidia-persistenced.yaml +++ b/nvidia-gpu/nvidia-container-toolkit/production/nvidia-persistenced.yaml @@ -12,6 +12,7 @@ container: - rbind - rw # shared libraries + # for amd64 it's /lib64/ld-linux-x86-64.so.2 - source: /lib64 destination: /lib64 type: bind @@ -19,6 +20,7 @@ container: - bind - ro # shared libraries + # for arm64 it's /lib/ld-linux-aarch64.so.1 - source: /lib destination: /lib type: bind @@ -26,8 +28,8 @@ container: - bind - ro # shared libraries - - source: /usr/local/glibc - destination: /usr/local/glibc + - source: /usr/local/lib + destination: /usr/local/lib type: bind options: - bind diff --git a/nvidia-gpu/nvidia-fabricmanager/lts/nvidia-fabricmanager.yaml b/nvidia-gpu/nvidia-fabricmanager/lts/nvidia-fabricmanager.yaml index 3643519..82c2376 100644 --- a/nvidia-gpu/nvidia-fabricmanager/lts/nvidia-fabricmanager.yaml +++ b/nvidia-gpu/nvidia-fabricmanager/lts/nvidia-fabricmanager.yaml @@ -12,6 +12,7 @@ container: - rbind - rw # shared libraries + # for amd64 it's /lib64/ld-linux-x86-64.so.2 - source: /lib64 destination: /lib64 type: bind @@ -19,8 +20,16 @@ container: - bind - ro # shared libraries - - source: /usr/local/glibc - destination: /usr/local/glibc + # for arm64 it's /lib/ld-linux-aarch64.so.1 + - source: /lib + destination: /lib + type: bind + options: + - bind + - ro + # shared libraries + - source: /usr/local/lib + destination: /usr/local/lib type: bind options: - bind diff --git a/nvidia-gpu/nvidia-fabricmanager/production/nvidia-fabricmanager.yaml b/nvidia-gpu/nvidia-fabricmanager/production/nvidia-fabricmanager.yaml index 3643519..82c2376 100644 --- a/nvidia-gpu/nvidia-fabricmanager/production/nvidia-fabricmanager.yaml +++ b/nvidia-gpu/nvidia-fabricmanager/production/nvidia-fabricmanager.yaml @@ -12,6 +12,7 @@ container: - rbind - rw # shared libraries + # for amd64 it's /lib64/ld-linux-x86-64.so.2 - source: /lib64 destination: /lib64 type: bind @@ -19,8 +20,16 @@ container: - bind - ro # shared libraries - - source: /usr/local/glibc - destination: /usr/local/glibc + # for arm64 it's /lib/ld-linux-aarch64.so.1 + - source: /lib + destination: /lib + type: bind + options: + - bind + - ro + # shared libraries + - source: /usr/local/lib + destination: /usr/local/lib type: bind options: - bind diff --git a/tools/nvme-cli/pkg.yaml b/tools/nvme-cli/pkg.yaml index e58e208..d6d0684 100644 --- a/tools/nvme-cli/pkg.yaml +++ b/tools/nvme-cli/pkg.yaml @@ -33,8 +33,12 @@ steps: meson compile -C .build install: - | + mkdir -p /rootfs/usr/bin + DESTDIR=/rootfs meson install -C .build rm -rf /rootfs/{etc/nvme/discovery.conf,usr/local/{include,lib/{dracut,systemd},share}} + + ln -s /usr/local/sbin/nvme /rootfs/usr/bin/nvme test: - | mkdir -p /extensions-validator-rootfs