From b2c608769d2c6ee6c3e4b7a1a5477039314e972f Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Fri, 14 Feb 2025 14:55:06 +0100 Subject: [PATCH 1/9] setup-nvidia: Support aarch64 installer Use `uname -m` to fetch the correct driver installer for aarch64 or x86_64. Signed-off-by: Jeremi Piotrowski --- .../x11-drivers/nvidia-drivers/files/bin/setup-nvidia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia index 6ecb16d334..eab3cab06a 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia @@ -7,7 +7,7 @@ set -euo pipefail [ -f /etc/flatcar/nvidia-metadata ] && . /etc/flatcar/nvidia-metadata NVIDIA_DOWNLOAD_BASEURL="https://us.download.nvidia.com/${NVIDIA_PRODUCT_TYPE}/" -NVIDIA_DRIVER_BASENAME="NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}" +NVIDIA_DRIVER_BASENAME="NVIDIA-Linux-$(uname -m)-${NVIDIA_DRIVER_VERSION}" NVIDIA_WORKDIR='nvidia-workdir' FLATCAR_DEVELOPER_CONTAINER="flatcar_developer_container-${FLATCAR_RELEASE_VERSION}.bin" From 418f26ae6b1f4a3f9c0a90e26197438d6cf0731f Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Fri, 14 Feb 2025 15:00:46 +0100 Subject: [PATCH 2/9] setup-nvidia: Make "current" symlink logic more robust Users have reported that in some cases the nvidia.service fails because /opt/nvidia/current is a directory and the symbolic link gets created inside it. 
I have no idea how we get there, but to make the service robust in the face of this kind of issue: - remove the directory if it exists - use `-T` with ln to ensure that symbolic link creation fails if `current` is a directory Signed-off-by: Jeremi Piotrowski --- .../x11-drivers/nvidia-drivers/files/bin/setup-nvidia | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia index eab3cab06a..80d58ad050 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia @@ -88,7 +88,11 @@ SYSEXT_LEVEL=1.0 EOF pushd /opt/nvidia - ln -sfn "${NVIDIA_FLATCAR_VERSION_PAIR}" current + if [[ -d "${NVIDIA_CURRENT_INSTALLATION}" ]] + then + rm -rf "${NVIDIA_CURRENT_INSTALLATION}" + fi + ln -sfn -T "${NVIDIA_FLATCAR_VERSION_PAIR}" "${NVIDIA_CURRENT_INSTALLATION}" popd } From 2381ea1f99c7d6a430d5bf515f660e01396f6694 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Mon, 24 Feb 2025 16:56:31 +0100 Subject: [PATCH 3/9] setup-nvidia: Keep devcontainer image sparse This saves space at runtime. 
Signed-off-by: Jeremi Piotrowski --- .../x11-drivers/nvidia-drivers/files/bin/setup-nvidia | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia index 80d58ad050..625efaeaa9 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia @@ -38,8 +38,8 @@ function download_flatcar_developer_container() { fi curl -L --fail "${FLATCAR_DEVELOPER_CONTAINER_URL}" -o "${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}.bz2" - lbzip2 -d "${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}.bz2" - + cp --sparse=always <(lbzcat "${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}.bz2") "${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}" + rm "${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}.bz2" fi return 0 From 6a92c7cce81de674ce62df08aef7433c1da2f0ef Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Mon, 24 Feb 2025 20:42:16 +0100 Subject: [PATCH 4/9] coreos-modules: Cross-compile module build tools Signed-off-by: Jeremi Piotrowski --- .../coreos-overlay/eclass/coreos-kernel.eclass | 2 +- .../coreos-modules/coreos-modules-6.6.82.ebuild | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sdk_container/src/third_party/coreos-overlay/eclass/coreos-kernel.eclass b/sdk_container/src/third_party/coreos-overlay/eclass/coreos-kernel.eclass index 0ea1ee111b..d5f1346637 100644 --- a/sdk_container/src/third_party/coreos-overlay/eclass/coreos-kernel.eclass +++ b/sdk_container/src/third_party/coreos-overlay/eclass/coreos-kernel.eclass @@ -42,7 +42,7 @@ DEPEND="=sys-kernel/coreos-sources-${COREOS_SOURCE_VERSION}" RESTRICT="binchecks strip" # The build tools are OK and shouldn't trip up 
multilib-strict. -QA_MULTILIB_PATHS="usr/lib/modules/.*/build/scripts/.*" +QA_MULTILIB_PATHS="usr/lib/modules/.*/build/scripts/kconfig/.*" # Use source installed by coreos-sources # KERNEL_DIR must find the kernel source tree under /usr/src/linux-*-coreos, diff --git a/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-modules/coreos-modules-6.6.82.ebuild b/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-modules/coreos-modules-6.6.82.ebuild index d68540c22a..73c33b970e 100644 --- a/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-modules/coreos-modules-6.6.82.ebuild +++ b/sdk_container/src/third_party/coreos-overlay/sys-kernel/coreos-modules/coreos-modules-6.6.82.ebuild @@ -54,6 +54,14 @@ src_install() { # Clean up the build tree shred_keys kmake clean + + # TODO: ensure that fixdep and kbuild tools shipped inside the image + # are native (we previously shipped amd64 binaries on arm64). + # Upstream has a new script from v6.12 that we might be able to use: + # scripts/package/install-extmod-build + kmake HOSTLD=$(tc-getLD) HOSTCC=$(tc-getCC) cmd_and_fixdep='$(cmd)' modules_prepare + kmake clean + find "build/" -type d -empty -delete || die rm "build/.config.old" || die From 066fb3ffd4f4b01c4bf789885cf30612d529451a Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Tue, 25 Feb 2025 15:03:32 +0100 Subject: [PATCH 5/9] install-nvidia: Force building proprietary kernel module Installers for 570 sometimes default to Open drivers, which we can't support properly at this time. Force proprietary drivers. There are also additional options that suppress certain worrisome error strings - enable those if supported too. 
Signed-off-by: Jeremi Piotrowski --- .../nvidia-drivers/files/bin/install-nvidia | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/install-nvidia b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/install-nvidia index 803934b190..4166ff93d0 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/install-nvidia +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/install-nvidia @@ -4,11 +4,35 @@ NVIDIA_DRIVER_BASENAME="$1" KERNEL_NAME="$(ls /lib/modules)" +option_supported() { + local opt="$1" + ./nvidia-installer -A -h | grep -qe "--$opt" +} + cd "/nvidia/${NVIDIA_DRIVER_BASENAME}" + +EXTRA_OPTS=() +if option_supported no-rebuild-initramfs ; then + EXTRA_OPTS+=( --no-rebuild-initramfs ) +fi +if option_supported skip-module-load ; then + EXTRA_OPTS+=( --skip-module-load ) +fi + +# TODO: open requires firmware loading. +# Can we load the module from the nspawn container? 
+if option_supported kernel-module-type ; then + EXTRA_OPTS+=( --kernel-module-type=proprietary ) +elif option_supported kernel-module-build-directory ; then + EXTRA_OPTS+=( --kernel-module-build-directory=kernel ) +fi + ./nvidia-installer -s -n \ --no-check-for-alternate-installs \ --no-kernel-module-source \ --kernel-name="${KERNEL_NAME}" \ + "${EXTRA_OPTS[@]}" \ + --no-x-check \ --no-opengl-files \ --no-distro-scripts \ --no-systemd \ @@ -16,6 +40,9 @@ cd "/nvidia/${NVIDIA_DRIVER_BASENAME}" --kernel-install-path="${PWD}/install-mod" \ --log-file-name="${PWD}/nvidia-installer.log" || true +echo "Last 50 lines of nvidia-installer.log:" +tail -n50 nvidia-installer.log || true + mkdir -p /lib/modules/${KERNEL_NAME}/video mkdir -p "${PWD}"/install-mod cp "${PWD}"/kernel/*.ko "${PWD}"/install-mod From ab519ab4d266dc232088c9dbc2165794670c14d1 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Tue, 25 Feb 2025 16:33:47 +0100 Subject: [PATCH 6/9] setup-nvidia: Keep systemd unit when running nspawn container The nspawn container runs in its own scope, and its journal output is then associated with that scope. By passing `--keep-unit` we can guarantee that all log output will stay associated with the nvidia.service and can be viewed by running `journalctl -u nvidia.service`. 
Signed-off-by: Jeremi Piotrowski --- .../x11-drivers/nvidia-drivers/files/bin/setup-nvidia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia index 625efaeaa9..9ec365c611 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia @@ -67,7 +67,7 @@ function extract_nvidia_installer() { function run_nspawn_container() { echo Spawn system-nspawn container to install the NVIDIA drivers - sudo systemd-nspawn --read-only --volatile=overlay --image="${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}" --bind="${FLATCAR_ROOT_WORKDIR}/${NVIDIA_WORKDIR}":/nvidia --bind=/usr/lib/nvidia/bin:/app/bin/ /app/bin/install-nvidia "$NVIDIA_DRIVER_BASENAME" + sudo systemd-nspawn --keep-unit --register=no --read-only --volatile=overlay --image="${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}" --bind="${FLATCAR_ROOT_WORKDIR}/${NVIDIA_WORKDIR}":/nvidia --bind=/usr/lib/nvidia/bin:/app/bin/ /app/bin/install-nvidia "$NVIDIA_DRIVER_BASENAME" } function copy_nvidia_build_artifacts() { From 5d4b6697c91e3e93f88a94eaf4b5826879e78edd Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Wed, 5 Mar 2025 16:12:20 +0100 Subject: [PATCH 7/9] setup-nvidia: Overlay host /lib/modules into devcontainer So that we can pick up kmods contained in sysexts (like zfs) and generate complete module dependency information. I thought we could skip running depmod for nvidia drivers because we manually insmod them, but nvidia's GPU operator driver validation expects to be able to run modprobe - so we have to generate the dependency information after all. 
Signed-off-by: Jeremi Piotrowski --- .../x11-drivers/nvidia-drivers/files/bin/setup-nvidia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia index 9ec365c611..a1568988be 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/bin/setup-nvidia @@ -67,7 +67,7 @@ function extract_nvidia_installer() { function run_nspawn_container() { echo Spawn system-nspawn container to install the NVIDIA drivers - sudo systemd-nspawn --keep-unit --register=no --read-only --volatile=overlay --image="${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}" --bind="${FLATCAR_ROOT_WORKDIR}/${NVIDIA_WORKDIR}":/nvidia --bind=/usr/lib/nvidia/bin:/app/bin/ /app/bin/install-nvidia "$NVIDIA_DRIVER_BASENAME" + sudo systemd-nspawn --keep-unit --register=no --read-only --volatile=overlay --image="${FLATCAR_ROOT_WORKDIR}/${FLATCAR_DEVELOPER_CONTAINER}" --overlay=/usr/lib/modules::/usr/lib/modules --bind="${FLATCAR_ROOT_WORKDIR}/${NVIDIA_WORKDIR}":/nvidia --bind=/usr/lib/nvidia/bin:/app/bin/ /app/bin/install-nvidia "$NVIDIA_DRIVER_BASENAME" } function copy_nvidia_build_artifacts() { From d355ecf1da7cf6e31c8df87d78b6d6c9986c0863 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Fri, 7 Mar 2025 11:40:14 +0000 Subject: [PATCH 8/9] nvidia-drivers: Split amd64 and arm64 driver version The R535 driver branch, which is LTS, does not compile on arm64 with GCC 14/kernel 6.6. Keep amd64 on R535 and switch arm64 to R570 by default. R570 is the first driver version that I found that is currently supported and works for arm64. 
Signed-off-by: Jeremi Piotrowski --- .../nvidia-drivers/files/nvidia-metadata | 2 +- ...build => nvidia-drivers-535.230.02.ebuild} | 6 ++-- .../nvidia-drivers-570.86.15.ebuild | 30 +++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) rename sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/{nvidia-drivers-535.216.01.ebuild => nvidia-drivers-535.230.02.ebuild} (78%) create mode 100644 sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-570.86.15.ebuild diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/nvidia-metadata b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/nvidia-metadata index 6ec82b56bd..34b387e45c 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/nvidia-metadata +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/files/nvidia-metadata @@ -1,2 +1,2 @@ -NVIDIA_DRIVER_VERSION=535.216.01 +NVIDIA_DRIVER_VERSION=@PV@ NVIDIA_PRODUCT_TYPE=tesla diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-535.216.01.ebuild b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-535.230.02.ebuild similarity index 78% rename from sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-535.216.01.ebuild rename to sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-535.230.02.ebuild index 53465ec289..63b6d962e9 100644 --- a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-535.216.01.ebuild +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-535.230.02.ebuild @@ -11,7 +11,7 @@ SRC_URI="" LICENSE="Apache-2.0" SLOT="0" -KEYWORDS="amd64 arm64" +KEYWORDS="amd64" IUSE="" # no source directory @@ -23,6 +23,8 @@ src_install() { exeinto 
"/usr/lib/nvidia/bin" doexe "${FILESDIR}/bin/install-nvidia" doexe "${FILESDIR}/bin/setup-nvidia" + cp "${FILESDIR}/nvidia-metadata" nvidia-metadata || die "cp failed" + sed -i -e "s/@PV@/${PV}/" nvidia-metadata insinto "/usr/share/flatcar" - doins "${FILESDIR}/nvidia-metadata" + doins nvidia-metadata } diff --git a/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-570.86.15.ebuild b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-570.86.15.ebuild new file mode 100644 index 0000000000..0e9ef08f4c --- /dev/null +++ b/sdk_container/src/third_party/coreos-overlay/x11-drivers/nvidia-drivers/nvidia-drivers-570.86.15.ebuild @@ -0,0 +1,30 @@ +# Copyright (c) 2020 Kinvolk GmbH. All rights reserved. +# Distributed under the terms of the GNU General Public License v2 + +EAPI=7 + +inherit systemd + +DESCRIPTION="NVIDIA drivers" +HOMEPAGE="" +SRC_URI="" + +LICENSE="Apache-2.0" +SLOT="0" +KEYWORDS="arm64" +IUSE="" + +# no source directory +S="${WORKDIR}" + +src_install() { + systemd_dounit "${FILESDIR}/units/nvidia.service" + systemd_enable_service multi-user.target nvidia.service + exeinto "/usr/lib/nvidia/bin" + doexe "${FILESDIR}/bin/install-nvidia" + doexe "${FILESDIR}/bin/setup-nvidia" + cp "${FILESDIR}/nvidia-metadata" nvidia-metadata || die "cp failed" + sed -i -e "s/@PV@/${PV}/" nvidia-metadata + insinto "/usr/share/flatcar" + doins nvidia-metadata +} From 1348abd68501f89bf12e8332565f3f1019c01e87 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Fri, 14 Mar 2025 10:46:34 +0100 Subject: [PATCH 9/9] changelog: Add entries for nvidia.service changes Signed-off-by: Jeremi Piotrowski --- .../bugfixes/2025-03-07-coreos-modules-cross-compilation.md | 1 + changelog/bugfixes/2025-03-07-nvidia-driver-type.md | 1 + changelog/changes/2025-03-07-nvidia-arm64-support.md | 1 + changelog/updates/2025-03-07-nvidia.md | 2 ++ 4 files changed, 5 insertions(+) create mode 100644 
changelog/bugfixes/2025-03-07-coreos-modules-cross-compilation.md create mode 100644 changelog/bugfixes/2025-03-07-nvidia-driver-type.md create mode 100644 changelog/changes/2025-03-07-nvidia-arm64-support.md create mode 100644 changelog/updates/2025-03-07-nvidia.md diff --git a/changelog/bugfixes/2025-03-07-coreos-modules-cross-compilation.md b/changelog/bugfixes/2025-03-07-coreos-modules-cross-compilation.md new file mode 100644 index 0000000000..f9e46f714a --- /dev/null +++ b/changelog/bugfixes/2025-03-07-coreos-modules-cross-compilation.md @@ -0,0 +1 @@ +- The kernel module build directory now contains native binaries in arm64 images instead of the previous amd64 binaries ([scripts#2694](https://github.com/flatcar/scripts/pull/2694)) \ No newline at end of file diff --git a/changelog/bugfixes/2025-03-07-nvidia-driver-type.md b/changelog/bugfixes/2025-03-07-nvidia-driver-type.md new file mode 100644 index 0000000000..e717994468 --- /dev/null +++ b/changelog/bugfixes/2025-03-07-nvidia-driver-type.md @@ -0,0 +1 @@ +- Nvidia driver installer service now supports the 570 driver branch by forcing the use of the proprietary kernel module. The 570 branch defaults to the kernel-open driver which requires loading firmware, which is not yet supported on Flatcar. 
([scripts#2694](https://github.com/flatcar/scripts/pull/2694)) \ No newline at end of file diff --git a/changelog/changes/2025-03-07-nvidia-arm64-support.md b/changelog/changes/2025-03-07-nvidia-arm64-support.md new file mode 100644 index 0000000000..65957180eb --- /dev/null +++ b/changelog/changes/2025-03-07-nvidia-arm64-support.md @@ -0,0 +1 @@ +- Added support for ARM64 architecture in the NVIDIA driver installer service ([scripts#2694](https://github.com/flatcar/scripts/pull/2694)) \ No newline at end of file diff --git a/changelog/updates/2025-03-07-nvidia.md b/changelog/updates/2025-03-07-nvidia.md new file mode 100644 index 0000000000..9c89b70c6c --- /dev/null +++ b/changelog/updates/2025-03-07-nvidia.md @@ -0,0 +1,2 @@ +- AMD64: nvidia-drivers ([535.230.02](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-535-230-02/index.html)) +- ARM64: nvidia-drivers ([570.86.15](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-570-86-15/index.html))