From 3c4c7c6cb04a818d62f29c849f24713b6223300c Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Fri, 14 Feb 2025 14:55:06 +0100 Subject: [PATCH 1/9] setup-nvidia: Support aarch64 installer Use `uname -m` to fetch the correct driver installer for aarch64 or x86_64. Signed-off-by: Jeremi Piotrowski --- .../x11-drivers/nvidia-drivers/files/bin/setup-nvidia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia index 75f6fcf5ee..c63b624df9 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia @@ -7,7 +7,7 @@ set -euo pipefail [ -f /etc/flatcar/nvidia-metadata ] && . /etc/flatcar/nvidia-metadata NVIDIA_DOWNLOAD_BASEURL="https://us.download.nvidia.com/${NVIDIA_PRODUCT_TYPE}/" -NVIDIA_DRIVER_BASENAME="NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}" +NVIDIA_DRIVER_BASENAME="NVIDIA-Linux-$(uname -m)-${NVIDIA_DRIVER_VERSION}" NVIDIA_WORKDIR='nvidia-workdir' FLATCAR_DEVELOPER_CONTAINER="flatcar_developer_container-${FLATCAR_RELEASE_VERSION}.bin" From 03925e1ca3047c071d98cf619df36eb4a928ba58 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Fri, 14 Feb 2025 15:00:46 +0100 Subject: [PATCH 2/9] setup-nvidia: Make "current" symlink logic more robust Users have reported that in some cases the nvidia.service fails because /opt/nvidia/current is a directory and the symbolic link gets created inside it. 
I have no idea how we get there, but to make the service robust in the face of this kind of issue: - remove the directory if it exists - use `-T` with ln to ensure that symbolic link creation fails if `current` is a directory Signed-off-by: Jeremi Piotrowski --- .../x11-drivers/nvidia-drivers/files/bin/setup-nvidia | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia index c63b624df9..5cf7a851ac 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia @@ -88,7 +88,11 @@ SYSEXT_LEVEL=1.0 EOF pushd /opt/nvidia - ln -sfn "${NVIDIA_FLATCAR_VERSION_PAIR}" current + if [[ -d "${NVIDIA_CURRENT_INSTALLATION}" ]] + then + rm -rf "${NVIDIA_CURRENT_INSTALLATION}" + fi + ln -sfn -T "${NVIDIA_FLATCAR_VERSION_PAIR}" "${NVIDIA_CURRENT_INSTALLATION}" popd } From f8b8b8ce40e9e7a70c14fda16cb7f00bfffa4ab2 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Mon, 24 Feb 2025 16:56:31 +0100 Subject: [PATCH 3/9] setup-nvidia: Keep devcontainer image sparse This saves space at runtime. 
Signed-off-by: Jeremi Piotrowski --- .../x11-drivers/nvidia-drivers/files/bin/setup-nvidia | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia index 5cf7a851ac..15fc458465 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia @@ -38,8 +38,8 @@ function download_flatcar_developer_container() { fi curl -L --fail "${FLATCAR_DEVELOPER_CONTAINER_URL}" -o "${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}.bz2" - lbzip2 -d "${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}.bz2" - + cp --sparse=always <(lbzcat "${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}.bz2") "${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}" + rm "${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}.bz2" fi return 0 From b048140f97e65bb5312be88d53cf3043a960dfc1 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Mon, 24 Feb 2025 20:42:16 +0100 Subject: [PATCH 4/9] coreos-modules: Cross-compile module build tools Signed-off-by: Jeremi Piotrowski --- .../coreos-overlay/eclass/coreos-kernel.eclass | 2 +- .../coreos-modules/coreos-modules-6.6.82.ebuild | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sdk_container/src/third_party/coreos-overlay/eclass/coreos-kernel.eclass b/sdk_container/src/third_party/coreos-overlay/eclass/coreos-kernel.eclass index 0ea1ee111b..d5f1346637 100644 --- a/sdk_container/src/third_party/coreos-overlay/eclass/coreos-kernel.eclass +++ b/sdk_container/src/third_party/coreos-overlay/eclass/coreos-kernel.eclass @@ -42,7 +42,7 @@ DEPEND="=sys-kernel/coreos-sources-${COREOS_SOURCE_VERSION}" RESTRICT="binchecks strip" # The build tools are OK and shouldn't trip up 
multilib-strict. -QA_MULTILIB_PATHS="usr/lib/modules/.*/build/scripts/.*" +QA_MULTILIB_PATHS="usr/lib/modules/.*/build/scripts/kconfig/.*" # Use source installed by coreos-sources # KERNEL_DIR must find the kernel source tree under /usr/src/linux-*-coreos, diff --git a/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-modules/coreos-modules-6.6.82.ebuild b/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-modules/coreos-modules-6.6.82.ebuild index d68540c22a..73c33b970e 100644 --- a/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-modules/coreos-modules-6.6.82.ebuild +++ b/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-modules/coreos-modules-6.6.82.ebuild @@ -54,6 +54,14 @@ src_install() { # Clean up the build tree shred_keys kmake clean + + # TODO: ensure that fixdep and kbuild tools shipped inside the image + # are native (we previously shipped amd64 binaries on arm64). + # Upstream has a new script from v6.12 that we might be able to use: + # scripts/package/install-extmod-build + kmake HOSTLD=$(tc-getLD) HOSTCC=$(tc-getCC) cmd_and_fixdep='$(cmd)' modules_prepare + kmake clean + find "build/" -type d -empty -delete || die rm "build/.config.old" || die From bfde33bcc37a12818f1db1f26062777e7a9693b5 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Tue, 25 Feb 2025 15:03:32 +0100 Subject: [PATCH 5/9] install-nvidia: Force building proprietary kernel module Installers for 570 sometimes default to Open drivers, which we can't support properly at this time. Force proprietary drivers. There are also additional options that suppress certain worrisome error strings - enable those if supported too. 
Signed-off-by: Jeremi Piotrowski --- .../nvidia-drivers/files/bin/install-nvidia | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/install-nvidia b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/install-nvidia index 803934b190..4166ff93d0 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/install-nvidia +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/install-nvidia @@ -4,11 +4,35 @@ NVIDIA_DRIVER_BASENAME="$1" KERNEL_NAME="$(ls /lib/modules)" +option_supported() { + local opt="$1" + ./nvidia-installer -A -h | grep -qe "--$opt" +} + cd "/nvidia/${NVIDIA_DRIVER_BASENAME}" + +EXTRA_OPTS=() +if option_supported no-rebuild-initramfs ; then + EXTRA_OPTS+=( --no-rebuild-initramfs ) +fi +if option_supported skip-module-load ; then + EXTRA_OPTS+=( --skip-module-load ) +fi + +# TODO: open requires firmware loading. +# Can we load the module from the nspawn container? 
+if option_supported kernel-module-type ; then + EXTRA_OPTS+=( --kernel-module-type=proprietary ) +elif option_supported kernel-module-build-directory ; then + EXTRA_OPTS+=( --kernel-module-build-directory=kernel ) +fi + ./nvidia-installer -s -n \ --no-check-for-alternate-installs \ --no-kernel-module-source \ --kernel-name="${KERNEL_NAME}" \ + "${EXTRA_OPTS[@]}" \ + --no-x-check \ --no-opengl-files \ --no-distro-scripts \ --no-systemd \ @@ -16,6 +40,9 @@ cd "/nvidia/${NVIDIA_DRIVER_BASENAME}" --kernel-install-path="${PWD}/install-mod" \ --log-file-name="${PWD}/nvidia-installer.log" || true +echo "Last 50 lines of nvidia-installer.log:" +tail -n50 nvidia-installer.log || true + mkdir -p /lib/modules/${KERNEL_NAME}/video mkdir -p "${PWD}"/install-mod cp "${PWD}"/kernel/*.ko "${PWD}"/install-mod From 48e42f8ab685fd7e5ae7b7899ed31eb306474607 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Tue, 25 Feb 2025 16:33:47 +0100 Subject: [PATCH 6/9] setup-nvidia: Keep systemd unit when running nspawn container The nspawn container runs in its own scope, and its journal output is then associated with that scope. By passing `--keep-unit` we can guarantee that all log output will stay associated with the nvidia.service and can be viewed by running `journalctl -u nvidia.service`. 
Signed-off-by: Jeremi Piotrowski --- .../x11-drivers/nvidia-drivers/files/bin/setup-nvidia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia index 15fc458465..dea42857af 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia @@ -67,7 +67,7 @@ function extract_nvidia_installer() { function run_nspawn_container() { echo Spawn system-nspawn container to install the NVIDIA drivers - sudo systemd-nspawn --read-only --volatile=overlay --image="${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}" --bind="${FLATCAR_ROOT_WORKDIR}/${NVIDIA_WORKDIR}":/nvidia --bind=/usr/lib/nvidia/bin:/app/bin/ /app/bin/install-nvidia "$NVIDIA_DRIVER_BASENAME" + sudo systemd-nspawn --keep-unit --register=no --read-only --volatile=overlay --image="${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}" --bind="${FLATCAR_ROOT_WORKDIR}/${NVIDIA_WORKDIR}":/nvidia --bind=/usr/lib/nvidia/bin:/app/bin/ /app/bin/install-nvidia "$NVIDIA_DRIVER_BASENAME" } function copy_nvidia_build_artifacts() { From 73cbffea56cf21c371a5ab1590b7b42b977ef50f Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Wed, 5 Mar 2025 16:12:20 +0100 Subject: [PATCH 7/9] setup-nvidia: Overlay host /lib/modules into devcontainer So that we can pick up kmods contained in sysexts (like zfs) and generate complete module dependency information. I thought we could skip running depmod for nvidia drivers because we manually insmod them, but nvidia's GPU operator driver validation expects to be able to run modprobe - so we have to generate that dependency information. 
Signed-off-by: Jeremi Piotrowski --- .../x11-drivers/nvidia-drivers/files/bin/setup-nvidia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia index dea42857af..dd936f6aff 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia @@ -67,7 +67,7 @@ function extract_nvidia_installer() { function run_nspawn_container() { echo Spawn system-nspawn container to install the NVIDIA drivers - sudo systemd-nspawn --keep-unit --register=no --read-only --volatile=overlay --image="${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}" --bind="${FLATCAR_ROOT_WORKDIR}/${NVIDIA_WORKDIR}":/nvidia --bind=/usr/lib/nvidia/bin:/app/bin/ /app/bin/install-nvidia "$NVIDIA_DRIVER_BASENAME" + sudo systemd-nspawn --keep-unit --register=no --read-only --volatile=overlay --image="${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}" --overlay=/usr/lib/modules::/usr/lib/modules --bind="${FLATCAR_ROOT_WORKDIR}/${NVIDIA_WORKDIR}":/nvidia --bind=/usr/lib/nvidia/bin:/app/bin/ /app/bin/install-nvidia "$NVIDIA_DRIVER_BASENAME" } function copy_nvidia_build_artifacts() { From f61987b23ccbf56f698e625d684a43afce092c81 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Fri, 7 Mar 2025 11:40:14 +0000 Subject: [PATCH 8/9] nvidia-drivers: Split amd64 and arm64 driver version The R535 driver branch, which is LTS, does not compile on arm64 with GCC 14/kernel 6.6. Keep amd64 on R535 and switch arm64 to R570 by default. R570 is the first driver version that I found that is currently supported and works for arm64. 
Signed-off-by: Jeremi Piotrowski --- .../nvidia-drivers/files/nvidia-metadata | 2 +- ...build => nvidia-drivers-535.230.02.ebuild} | 6 ++-- .../nvidia-drivers-570.86.15.ebuild | 30 +++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) rename sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/{nvidia-drivers-535.216.01.ebuild => nvidia-drivers-535.230.02.ebuild} (78%) create mode 100644 sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-570.86.15.ebuild diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/nvidia-metadata b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/nvidia-metadata index 6ec82b56bd..34b387e45c 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/nvidia-metadata +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/nvidia-metadata @@ -1,2 +1,2 @@ -NVIDIA_DRIVER_VERSION=535.216.01 +NVIDIA_DRIVER_VERSION=@PV@ NVIDIA_PRODUCT_TYPE=tesla diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-535.216.01.ebuild b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-535.230.02.ebuild similarity index 78% rename from sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-535.216.01.ebuild rename to sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-535.230.02.ebuild index 53465ec289..63b6d962e9 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-535.216.01.ebuild +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-535.230.02.ebuild @@ -11,7 +11,7 @@ SRC_URI="" LICENSE="Apache-2.0" SLOT="0" -KEYWORDS="amd64 arm64" +KEYWORDS="amd64" IUSE="" # no source directory @@ -23,6 +23,8 @@ src_install() { exeinto 
"/usr/lib/nvidia/bin" doexe "${FILESDIR}/bin/install-nvidia" doexe "${FILESDIR}/bin/setup-nvidia" + cp "${FILESDIR}/nvidia-metadata" nvidia-metadata || die "cp failed" + sed -i -e "s/@PV@/${PV}/" nvidia-metadata insinto "/usr/share/flatcar" - doins "${FILESDIR}/nvidia-metadata" + doins nvidia-metadata } diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-570.86.15.ebuild b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-570.86.15.ebuild new file mode 100644 index 0000000000..0e9ef08f4c --- /dev/null +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-570.86.15.ebuild @@ -0,0 +1,30 @@ +# Copyright (c) 2020 Kinvolk GmbH. All rights reserved. +# Distributed under the terms of the GNU General Public License v2 + +EAPI=7 + +inherit systemd + +DESCRIPTION="NVIDIA drivers" +HOMEPAGE="" +SRC_URI="" + +LICENSE="Apache-2.0" +SLOT="0" +KEYWORDS="arm64" +IUSE="" + +# no source directory +S="${WORKDIR}" + +src_install() { + systemd_dounit "${FILESDIR}/units/nvidia.service" + systemd_enable_service multi-user.target nvidia.service + exeinto "/usr/lib/nvidia/bin" + doexe "${FILESDIR}/bin/install-nvidia" + doexe "${FILESDIR}/bin/setup-nvidia" + cp "${FILESDIR}/nvidia-metadata" nvidia-metadata || die "cp failed" + sed -i -e "s/@PV@/${PV}/" nvidia-metadata + insinto "/usr/share/flatcar" + doins nvidia-metadata +} From e313934fea1a19a91c8729a0eebb43d4b90172f9 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Fri, 14 Mar 2025 10:46:34 +0100 Subject: [PATCH 9/9] changelog: Add entries for nvidia.service changes Signed-off-by: Jeremi Piotrowski --- .../bugfixes/2025-03-07-coreos-modules-cross-compilation.md | 1 + changelog/bugfixes/2025-03-07-nvidia-driver-type.md | 1 + changelog/changes/2025-03-07-nvidia-arm64-support.md | 1 + changelog/updates/2025-03-07-nvidia.md | 2 ++ 4 files changed, 5 insertions(+) create mode 100644 
changelog/bugfixes/2025-03-07-coreos-modules-cross-compilation.md create mode 100644 changelog/bugfixes/2025-03-07-nvidia-driver-type.md create mode 100644 changelog/changes/2025-03-07-nvidia-arm64-support.md create mode 100644 changelog/updates/2025-03-07-nvidia.md diff --git a/changelog/bugfixes/2025-03-07-coreos-modules-cross-compilation.md b/changelog/bugfixes/2025-03-07-coreos-modules-cross-compilation.md new file mode 100644 index 0000000000..f9e46f714a --- /dev/null +++ b/changelog/bugfixes/2025-03-07-coreos-modules-cross-compilation.md @@ -0,0 +1 @@ +- The kernel module build directory now contains native binaries in arm64 images instead of the previous amd64 binaries ([scripts#2694](https://github.com/flatcar/scripts/pull/2694)) \ No newline at end of file diff --git a/changelog/bugfixes/2025-03-07-nvidia-driver-type.md b/changelog/bugfixes/2025-03-07-nvidia-driver-type.md new file mode 100644 index 0000000000..e717994468 --- /dev/null +++ b/changelog/bugfixes/2025-03-07-nvidia-driver-type.md @@ -0,0 +1 @@ +- Nvidia driver installer service now supports the 570 driver branch by forcing the use of the proprietary kernel module. The 570 branch defaults to the kernel-open driver which requires loading firmware, which is not yet supported on Flatcar. 
([scripts#2694](https://github.com/flatcar/scripts/pull/2694)) \ No newline at end of file diff --git a/changelog/changes/2025-03-07-nvidia-arm64-support.md b/changelog/changes/2025-03-07-nvidia-arm64-support.md new file mode 100644 index 0000000000..65957180eb --- /dev/null +++ b/changelog/changes/2025-03-07-nvidia-arm64-support.md @@ -0,0 +1 @@ +- Added support for ARM64 architecture in the NVIDIA driver installer service ([scripts#2694](https://github.com/flatcar/scripts/pull/2694)) \ No newline at end of file diff --git a/changelog/updates/2025-03-07-nvidia.md b/changelog/updates/2025-03-07-nvidia.md new file mode 100644 index 0000000000..9c89b70c6c --- /dev/null +++ b/changelog/updates/2025-03-07-nvidia.md @@ -0,0 +1,2 @@ +- AMD64: nvidia-drivers ([535.230.02](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-535-230-02/index.html)) +- ARM64: nvidia-drivers ([570.86.15](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-570-86-15/index.html))