From 450b30d5a986563869efdbaa074e82d612f6f2ef Mon Sep 17 00:00:00 2001 From: Noel Georgi Date: Mon, 21 Jul 2025 15:42:23 +0530 Subject: [PATCH] chore(ci): add more nvidia test matrix Add more NVIDIA tests covering all supported OSS and Proprietary LTS and Production driver versions. Fixes: #11398 Signed-off-by: Noel Georgi --- .github/renovate.json | 3 +- .github/workflows/ci.yaml | 330 +++++++++++++++++- ...egration-aws-nvidia-nonfree-lts-cron.yaml} | 15 +- ...on-aws-nvidia-nonfree-production-cron.yaml | 156 +++++++++ .../integration-aws-nvidia-oss-lts-cron.yaml | 156 +++++++++ ...ation-aws-nvidia-oss-production-cron.yaml} | 15 +- .github/workflows/slack-notify.yaml | 8 +- .kres.yaml | 237 +++++++++++-- hack/test/e2e-aws-prepare.sh | 14 +- internal/integration/api/constants.go | 16 + internal/integration/api/extensions_nvidia.go | 98 +----- .../api/testdata/nvidia-device-plugin.yaml | 1 + .../talos-guides/configuration/nvidia-gpu.md | 24 +- 13 files changed, 906 insertions(+), 167 deletions(-) rename .github/workflows/{integration-aws-nvidia-nonfree-cron.yaml => integration-aws-nvidia-nonfree-lts-cron.yaml} (92%) create mode 100644 .github/workflows/integration-aws-nvidia-nonfree-production-cron.yaml create mode 100644 .github/workflows/integration-aws-nvidia-oss-lts-cron.yaml rename .github/workflows/{integration-aws-nvidia-oss-cron.yaml => integration-aws-nvidia-oss-production-cron.yaml} (92%) create mode 100644 internal/integration/api/constants.go create mode 100644 internal/integration/api/testdata/nvidia-device-plugin.yaml diff --git a/.github/renovate.json b/.github/renovate.json index f02b7c67a..2f4768ab2 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -33,7 +33,8 @@ "customType": "regex", "versioningTemplate": "{{#if versioning}}{{versioning}}{{else}}semver{{/if}}", "managerFilePatterns": [ - "/internal/integration/k8s/constants.go/" + "/internal/integration/k8s/constants.go/", + "/internal/integration/api/constants.go/" ], 
"matchStrings": [ "\\/\\/\\s+renovate: datasource=(?.*?)(?:\\s+extractVersion=(?.+?))?(?:\\s+versioning=(?.+?))?\\s+depName=(?.+?)?(?:\\s+registryUrl=(?.+?))?\\s.*Version\\s+=\\s+\\\"(?.+?)\\\"" diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b06eb5f8a..6c840f031 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2025-07-21T14:04:07Z by kres b869533. +# Generated on 2025-07-22T04:25:57Z by kres b869533. concurrency: group: ${{ github.head_ref || github.run_id }} @@ -704,7 +704,7 @@ jobs: TF_SCRIPT_DIR: _out/contrib run: | make e2e-cloud-tf - integration-aws-nvidia-nonfree: + integration-aws-nvidia-nonfree-lts: permissions: actions: read contents: write @@ -714,7 +714,7 @@ jobs: runs-on: - self-hosted - generic - if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree') + if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree-lts') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws') needs: - default steps: @@ -797,13 +797,6 @@ jobs: if: github.event_name == 'schedule' run: | make talosctl-cni-bundle - - name: images-essential - if: github.event_name == 'schedule' - env: - IMAGE_REGISTRY: registry.dev.siderolabs.io - PLATFORM: linux/amd64,linux/arm64 - run: | - make images-essential - name: image-aws env: IMAGE_REGISTRY: registry.dev.siderolabs.io @@ -828,7 +821,7 @@ jobs: make nvidia-container-toolkit-lts nonfree-kmod-nvidia-lts extensions-metadata -C _out/extensions - name: e2e-aws-prepare env: - E2E_AWS_TARGET: nvidia-nonfree + E2E_AWS_TARGET: nvidia-nonfree-lts EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata IMAGE_REGISTRY: registry.dev.siderolabs.io run: | @@ -850,7 
+843,7 @@ jobs: TF_SCRIPT_DIR: _out/contrib run: | make e2e-cloud-tf - - name: e2e-aws-nvidia-nonfree + - name: e2e-aws-nvidia-nonfree-lts env: EXTRA_TEST_ARGS: -talos.extensions.nvidia run: | @@ -864,7 +857,7 @@ jobs: TF_SCRIPT_DIR: _out/contrib run: | make e2e-cloud-tf - integration-aws-nvidia-oss: + integration-aws-nvidia-nonfree-production: permissions: actions: read contents: write @@ -874,7 +867,7 @@ jobs: runs-on: - self-hosted - generic - if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss') + if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree-production') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws') needs: - default steps: @@ -957,13 +950,312 @@ jobs: if: github.event_name == 'schedule' run: | make talosctl-cni-bundle - - name: images-essential - if: github.event_name == 'schedule' + - name: image-aws env: IMAGE_REGISTRY: registry.dev.siderolabs.io PLATFORM: linux/amd64,linux/arm64 run: | - make images-essential + make image-aws + - name: checkout extensions + uses: actions/checkout@v4 + with: + path: _out/extensions + ref: main + repository: siderolabs/extensions + - name: set variables + run: | + cat _out/talos-metadata >> "$GITHUB_ENV" + - name: build extensions + env: + PLATFORM: linux/amd64 + PUSH: "true" + REGISTRY: registry.dev.siderolabs.io + run: | + make nvidia-container-toolkit-production nonfree-kmod-nvidia-production extensions-metadata -C _out/extensions + - name: e2e-aws-prepare + env: + E2E_AWS_TARGET: nvidia-nonfree-production + EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata + IMAGE_REGISTRY: registry.dev.siderolabs.io + run: | + make e2e-aws-prepare + - name: checkout contrib + uses: actions/checkout@v4 + with: + path: _out/contrib + ref: main + repository: 
siderolabs/contrib + - name: setup tf + uses: hashicorp/setup-terraform@v3 + with: + terraform_wrapper: "false" + - name: tf apply + env: + TF_E2E_ACTION: apply + TF_E2E_TEST_TYPE: aws + TF_SCRIPT_DIR: _out/contrib + run: | + make e2e-cloud-tf + - name: e2e-aws-nvidia-nonfree-production + env: + EXTRA_TEST_ARGS: -talos.extensions.nvidia + run: | + make e2e-aws + - name: tf destroy + if: always() + env: + TF_E2E_ACTION: destroy + TF_E2E_REFRESH_ON_DESTROY: "false" + TF_E2E_TEST_TYPE: aws + TF_SCRIPT_DIR: _out/contrib + run: | + make e2e-cloud-tf + integration-aws-nvidia-oss-lts: + permissions: + actions: read + contents: write + issues: read + packages: write + pull-requests: read + runs-on: + - self-hosted + - generic + if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss-lts') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws') + needs: + - default + steps: + - name: gather-system-info + id: system-info + uses: kenchan0130/actions-system-info@v1.3.1 + continue-on-error: true + - name: print-system-info + run: | + MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024)) + + OUTPUTS=( + "CPU Core: ${{ steps.system-info.outputs.cpu-core }}" + "CPU Model: ${{ steps.system-info.outputs.cpu-model }}" + "Hostname: ${{ steps.system-info.outputs.hostname }}" + "NodeName: ${NODE_NAME}" + "Kernel release: ${{ steps.system-info.outputs.kernel-release }}" + "Kernel version: ${{ steps.system-info.outputs.kernel-version }}" + "Name: ${{ steps.system-info.outputs.name }}" + "Platform: ${{ steps.system-info.outputs.platform }}" + "Release: ${{ steps.system-info.outputs.release }}" + "Total memory: ${MEMORY_GB} GB" + ) + + for OUTPUT in "${OUTPUTS[@]}";do + echo "${OUTPUT}" + done + continue-on-error: true + - name: checkout + uses: actions/checkout@v4 + - name: 
Unshallow + run: | + git fetch --prune --unshallow + - name: Set up Docker Buildx + id: setup-buildx + uses: docker/setup-buildx-action@v3 + with: + driver: remote + endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234 + timeout-minutes: 10 + - name: Mask secrets + run: | + echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')" + - name: Set secrets for job + run: | + sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV" + - name: Download artifacts + if: github.event_name != 'schedule' + uses: actions/download-artifact@v4 + with: + name: talos-artifacts + path: _out + - name: Fix artifact permissions + if: github.event_name != 'schedule' + run: | + xargs -a _out/executable-artifacts -I {} chmod +x {} + - name: ci-temp-release-tag + if: github.event_name != 'schedule' + run: | + make ci-temp-release-tag + - name: generate + if: github.event_name == 'schedule' + run: | + make generate + - name: uki-certs + if: github.event_name == 'schedule' + env: + PLATFORM: linux/amd64 + run: | + make uki-certs + - name: build + if: github.event_name == 'schedule' + env: + IMAGE_REGISTRY: registry.dev.siderolabs.io + PLATFORM: linux/amd64,linux/arm64 + PUSH: "true" + run: | + make talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64 + - name: talosctl-cni-bundle + if: github.event_name == 'schedule' + run: | + make talosctl-cni-bundle + - name: image-aws + env: + IMAGE_REGISTRY: registry.dev.siderolabs.io + PLATFORM: linux/amd64,linux/arm64 + run: | + make image-aws + - name: checkout extensions + uses: actions/checkout@v4 + with: + path: _out/extensions + ref: main + repository: siderolabs/extensions + - name: set variables + run: | + cat _out/talos-metadata >> "$GITHUB_ENV" + - name: build extensions + env: + PLATFORM: linux/amd64 + PUSH: "true" + REGISTRY: registry.dev.siderolabs.io + run: | + make nvidia-container-toolkit-lts 
nvidia-open-gpu-kernel-modules-lts zfs extensions-metadata -C _out/extensions + - name: e2e-aws-prepare + env: + E2E_AWS_TARGET: nvidia-oss-lts + EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata + IMAGE_REGISTRY: registry.dev.siderolabs.io + run: | + make e2e-aws-prepare + - name: checkout contrib + uses: actions/checkout@v4 + with: + path: _out/contrib + ref: main + repository: siderolabs/contrib + - name: setup tf + uses: hashicorp/setup-terraform@v3 + with: + terraform_wrapper: "false" + - name: tf apply + env: + TF_E2E_ACTION: apply + TF_E2E_TEST_TYPE: aws + TF_SCRIPT_DIR: _out/contrib + run: | + make e2e-cloud-tf + - name: e2e-aws-nvidia-oss-lts + env: + EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false + run: | + make e2e-aws + - name: tf destroy + if: always() + env: + TF_E2E_ACTION: destroy + TF_E2E_REFRESH_ON_DESTROY: "false" + TF_E2E_TEST_TYPE: aws + TF_SCRIPT_DIR: _out/contrib + run: | + make e2e-cloud-tf + integration-aws-nvidia-oss-production: + permissions: + actions: read + contents: write + issues: read + packages: write + pull-requests: read + runs-on: + - self-hosted + - generic + if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss-production') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws') + needs: + - default + steps: + - name: gather-system-info + id: system-info + uses: kenchan0130/actions-system-info@v1.3.1 + continue-on-error: true + - name: print-system-info + run: | + MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024)) + + OUTPUTS=( + "CPU Core: ${{ steps.system-info.outputs.cpu-core }}" + "CPU Model: ${{ steps.system-info.outputs.cpu-model }}" + "Hostname: ${{ steps.system-info.outputs.hostname }}" + "NodeName: ${NODE_NAME}" + "Kernel release: ${{ 
steps.system-info.outputs.kernel-release }}" + "Kernel version: ${{ steps.system-info.outputs.kernel-version }}" + "Name: ${{ steps.system-info.outputs.name }}" + "Platform: ${{ steps.system-info.outputs.platform }}" + "Release: ${{ steps.system-info.outputs.release }}" + "Total memory: ${MEMORY_GB} GB" + ) + + for OUTPUT in "${OUTPUTS[@]}";do + echo "${OUTPUT}" + done + continue-on-error: true + - name: checkout + uses: actions/checkout@v4 + - name: Unshallow + run: | + git fetch --prune --unshallow + - name: Set up Docker Buildx + id: setup-buildx + uses: docker/setup-buildx-action@v3 + with: + driver: remote + endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234 + timeout-minutes: 10 + - name: Mask secrets + run: | + echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')" + - name: Set secrets for job + run: | + sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV" + - name: Download artifacts + if: github.event_name != 'schedule' + uses: actions/download-artifact@v4 + with: + name: talos-artifacts + path: _out + - name: Fix artifact permissions + if: github.event_name != 'schedule' + run: | + xargs -a _out/executable-artifacts -I {} chmod +x {} + - name: ci-temp-release-tag + if: github.event_name != 'schedule' + run: | + make ci-temp-release-tag + - name: generate + if: github.event_name == 'schedule' + run: | + make generate + - name: uki-certs + if: github.event_name == 'schedule' + env: + PLATFORM: linux/amd64 + run: | + make uki-certs + - name: build + if: github.event_name == 'schedule' + env: + IMAGE_REGISTRY: registry.dev.siderolabs.io + PLATFORM: linux/amd64,linux/arm64 + PUSH: "true" + run: | + make talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64 + - name: talosctl-cni-bundle + if: github.event_name == 'schedule' + run: | + make talosctl-cni-bundle - name: image-aws env: IMAGE_REGISTRY: 
registry.dev.siderolabs.io @@ -988,7 +1280,7 @@ jobs: make nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production zfs extensions-metadata -C _out/extensions - name: e2e-aws-prepare env: - E2E_AWS_TARGET: nvidia-oss + E2E_AWS_TARGET: nvidia-oss-production EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata IMAGE_REGISTRY: registry.dev.siderolabs.io run: | @@ -1010,7 +1302,7 @@ jobs: TF_SCRIPT_DIR: _out/contrib run: | make e2e-cloud-tf - - name: e2e-aws-nvidia-oss + - name: e2e-aws-nvidia-oss-production env: EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false run: | diff --git a/.github/workflows/integration-aws-nvidia-nonfree-cron.yaml b/.github/workflows/integration-aws-nvidia-nonfree-lts-cron.yaml similarity index 92% rename from .github/workflows/integration-aws-nvidia-nonfree-cron.yaml rename to .github/workflows/integration-aws-nvidia-nonfree-lts-cron.yaml index 3542c06c5..85366bf3d 100644 --- a/.github/workflows/integration-aws-nvidia-nonfree-cron.yaml +++ b/.github/workflows/integration-aws-nvidia-nonfree-lts-cron.yaml @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2025-07-21T09:52:07Z by kres b869533. +# Generated on 2025-07-22T04:25:57Z by kres b869533. 
concurrency: group: ${{ github.head_ref || github.run_id }} @@ -8,7 +8,7 @@ concurrency: "on": schedule: - cron: 30 7 * * * -name: integration-aws-nvidia-nonfree-cron +name: integration-aws-nvidia-nonfree-lts-cron jobs: default: runs-on: @@ -94,13 +94,6 @@ jobs: if: github.event_name == 'schedule' run: | make talosctl-cni-bundle - - name: images-essential - if: github.event_name == 'schedule' - env: - IMAGE_REGISTRY: registry.dev.siderolabs.io - PLATFORM: linux/amd64,linux/arm64 - run: | - make images-essential - name: image-aws env: IMAGE_REGISTRY: registry.dev.siderolabs.io @@ -125,7 +118,7 @@ jobs: make nvidia-container-toolkit-lts nonfree-kmod-nvidia-lts extensions-metadata -C _out/extensions - name: e2e-aws-prepare env: - E2E_AWS_TARGET: nvidia-nonfree + E2E_AWS_TARGET: nvidia-nonfree-lts EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata IMAGE_REGISTRY: registry.dev.siderolabs.io run: | @@ -147,7 +140,7 @@ jobs: TF_SCRIPT_DIR: _out/contrib run: | make e2e-cloud-tf - - name: e2e-aws-nvidia-nonfree + - name: e2e-aws-nvidia-nonfree-lts env: EXTRA_TEST_ARGS: -talos.extensions.nvidia run: | diff --git a/.github/workflows/integration-aws-nvidia-nonfree-production-cron.yaml b/.github/workflows/integration-aws-nvidia-nonfree-production-cron.yaml new file mode 100644 index 000000000..4e29fed93 --- /dev/null +++ b/.github/workflows/integration-aws-nvidia-nonfree-production-cron.yaml @@ -0,0 +1,156 @@ +# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. +# +# Generated on 2025-07-22T04:25:57Z by kres b869533. 
+ +concurrency: + group: ${{ github.head_ref || github.run_id }} + cancel-in-progress: true +"on": + schedule: + - cron: 30 7 * * * +name: integration-aws-nvidia-nonfree-production-cron +jobs: + default: + runs-on: + - self-hosted + - generic + steps: + - name: gather-system-info + id: system-info + uses: kenchan0130/actions-system-info@v1.3.1 + continue-on-error: true + - name: print-system-info + run: | + MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024)) + + OUTPUTS=( + "CPU Core: ${{ steps.system-info.outputs.cpu-core }}" + "CPU Model: ${{ steps.system-info.outputs.cpu-model }}" + "Hostname: ${{ steps.system-info.outputs.hostname }}" + "NodeName: ${NODE_NAME}" + "Kernel release: ${{ steps.system-info.outputs.kernel-release }}" + "Kernel version: ${{ steps.system-info.outputs.kernel-version }}" + "Name: ${{ steps.system-info.outputs.name }}" + "Platform: ${{ steps.system-info.outputs.platform }}" + "Release: ${{ steps.system-info.outputs.release }}" + "Total memory: ${MEMORY_GB} GB" + ) + + for OUTPUT in "${OUTPUTS[@]}";do + echo "${OUTPUT}" + done + continue-on-error: true + - name: checkout + uses: actions/checkout@v4 + - name: Unshallow + run: | + git fetch --prune --unshallow + - name: Set up Docker Buildx + id: setup-buildx + uses: docker/setup-buildx-action@v3 + with: + driver: remote + endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234 + timeout-minutes: 10 + - name: Mask secrets + run: | + echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')" + - name: Set secrets for job + run: | + sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV" + - name: Download artifacts + if: github.event_name != 'schedule' + uses: actions/download-artifact@v4 + with: + name: talos-artifacts + path: _out + - name: Fix artifact permissions + if: github.event_name != 'schedule' + run: | + xargs -a _out/executable-artifacts -I {} chmod +x {} + - name: ci-temp-release-tag + 
if: github.event_name != 'schedule' + run: | + make ci-temp-release-tag + - name: generate + if: github.event_name == 'schedule' + run: | + make generate + - name: uki-certs + if: github.event_name == 'schedule' + env: + PLATFORM: linux/amd64 + run: | + make uki-certs + - name: build + if: github.event_name == 'schedule' + env: + IMAGE_REGISTRY: registry.dev.siderolabs.io + PLATFORM: linux/amd64,linux/arm64 + PUSH: "true" + run: | + make talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64 + - name: talosctl-cni-bundle + if: github.event_name == 'schedule' + run: | + make talosctl-cni-bundle + - name: image-aws + env: + IMAGE_REGISTRY: registry.dev.siderolabs.io + PLATFORM: linux/amd64,linux/arm64 + run: | + make image-aws + - name: checkout extensions + uses: actions/checkout@v4 + with: + path: _out/extensions + ref: main + repository: siderolabs/extensions + - name: set variables + run: | + cat _out/talos-metadata >> "$GITHUB_ENV" + - name: build extensions + env: + PLATFORM: linux/amd64 + PUSH: "true" + REGISTRY: registry.dev.siderolabs.io + run: | + make nvidia-container-toolkit-production nonfree-kmod-nvidia-production extensions-metadata -C _out/extensions + - name: e2e-aws-prepare + env: + E2E_AWS_TARGET: nvidia-nonfree-production + EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata + IMAGE_REGISTRY: registry.dev.siderolabs.io + run: | + make e2e-aws-prepare + - name: checkout contrib + uses: actions/checkout@v4 + with: + path: _out/contrib + ref: main + repository: siderolabs/contrib + - name: setup tf + uses: hashicorp/setup-terraform@v3 + with: + terraform_wrapper: "false" + - name: tf apply + env: + TF_E2E_ACTION: apply + TF_E2E_TEST_TYPE: aws + TF_SCRIPT_DIR: _out/contrib + run: | + make e2e-cloud-tf + - name: e2e-aws-nvidia-nonfree-production + env: + EXTRA_TEST_ARGS: -talos.extensions.nvidia + run: | + make e2e-aws + - name: tf destroy + if: always() + env: + TF_E2E_ACTION: 
destroy + TF_E2E_REFRESH_ON_DESTROY: "false" + TF_E2E_TEST_TYPE: aws + TF_SCRIPT_DIR: _out/contrib + run: | + make e2e-cloud-tf diff --git a/.github/workflows/integration-aws-nvidia-oss-lts-cron.yaml b/.github/workflows/integration-aws-nvidia-oss-lts-cron.yaml new file mode 100644 index 000000000..2c170f286 --- /dev/null +++ b/.github/workflows/integration-aws-nvidia-oss-lts-cron.yaml @@ -0,0 +1,156 @@ +# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. +# +# Generated on 2025-07-22T04:25:57Z by kres b869533. + +concurrency: + group: ${{ github.head_ref || github.run_id }} + cancel-in-progress: true +"on": + schedule: + - cron: 30 5 * * * +name: integration-aws-nvidia-oss-lts-cron +jobs: + default: + runs-on: + - self-hosted + - generic + steps: + - name: gather-system-info + id: system-info + uses: kenchan0130/actions-system-info@v1.3.1 + continue-on-error: true + - name: print-system-info + run: | + MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024)) + + OUTPUTS=( + "CPU Core: ${{ steps.system-info.outputs.cpu-core }}" + "CPU Model: ${{ steps.system-info.outputs.cpu-model }}" + "Hostname: ${{ steps.system-info.outputs.hostname }}" + "NodeName: ${NODE_NAME}" + "Kernel release: ${{ steps.system-info.outputs.kernel-release }}" + "Kernel version: ${{ steps.system-info.outputs.kernel-version }}" + "Name: ${{ steps.system-info.outputs.name }}" + "Platform: ${{ steps.system-info.outputs.platform }}" + "Release: ${{ steps.system-info.outputs.release }}" + "Total memory: ${MEMORY_GB} GB" + ) + + for OUTPUT in "${OUTPUTS[@]}";do + echo "${OUTPUT}" + done + continue-on-error: true + - name: checkout + uses: actions/checkout@v4 + - name: Unshallow + run: | + git fetch --prune --unshallow + - name: Set up Docker Buildx + id: setup-buildx + uses: docker/setup-buildx-action@v3 + with: + driver: remote + endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234 + timeout-minutes: 10 + - name: Mask secrets + run: | + echo "$(sops -d .secrets.yaml | yq 
-e '.secrets | to_entries[] | "::add-mask::" + .value')" + - name: Set secrets for job + run: | + sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV" + - name: Download artifacts + if: github.event_name != 'schedule' + uses: actions/download-artifact@v4 + with: + name: talos-artifacts + path: _out + - name: Fix artifact permissions + if: github.event_name != 'schedule' + run: | + xargs -a _out/executable-artifacts -I {} chmod +x {} + - name: ci-temp-release-tag + if: github.event_name != 'schedule' + run: | + make ci-temp-release-tag + - name: generate + if: github.event_name == 'schedule' + run: | + make generate + - name: uki-certs + if: github.event_name == 'schedule' + env: + PLATFORM: linux/amd64 + run: | + make uki-certs + - name: build + if: github.event_name == 'schedule' + env: + IMAGE_REGISTRY: registry.dev.siderolabs.io + PLATFORM: linux/amd64,linux/arm64 + PUSH: "true" + run: | + make talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64 + - name: talosctl-cni-bundle + if: github.event_name == 'schedule' + run: | + make talosctl-cni-bundle + - name: image-aws + env: + IMAGE_REGISTRY: registry.dev.siderolabs.io + PLATFORM: linux/amd64,linux/arm64 + run: | + make image-aws + - name: checkout extensions + uses: actions/checkout@v4 + with: + path: _out/extensions + ref: main + repository: siderolabs/extensions + - name: set variables + run: | + cat _out/talos-metadata >> "$GITHUB_ENV" + - name: build extensions + env: + PLATFORM: linux/amd64 + PUSH: "true" + REGISTRY: registry.dev.siderolabs.io + run: | + make nvidia-container-toolkit-lts nvidia-open-gpu-kernel-modules-lts zfs extensions-metadata -C _out/extensions + - name: e2e-aws-prepare + env: + E2E_AWS_TARGET: nvidia-oss-lts + EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata + IMAGE_REGISTRY: registry.dev.siderolabs.io + run: | + make e2e-aws-prepare + - name: checkout contrib + 
uses: actions/checkout@v4 + with: + path: _out/contrib + ref: main + repository: siderolabs/contrib + - name: setup tf + uses: hashicorp/setup-terraform@v3 + with: + terraform_wrapper: "false" + - name: tf apply + env: + TF_E2E_ACTION: apply + TF_E2E_TEST_TYPE: aws + TF_SCRIPT_DIR: _out/contrib + run: | + make e2e-cloud-tf + - name: e2e-aws-nvidia-oss-lts + env: + EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false + run: | + make e2e-aws + - name: tf destroy + if: always() + env: + TF_E2E_ACTION: destroy + TF_E2E_REFRESH_ON_DESTROY: "false" + TF_E2E_TEST_TYPE: aws + TF_SCRIPT_DIR: _out/contrib + run: | + make e2e-cloud-tf diff --git a/.github/workflows/integration-aws-nvidia-oss-cron.yaml b/.github/workflows/integration-aws-nvidia-oss-production-cron.yaml similarity index 92% rename from .github/workflows/integration-aws-nvidia-oss-cron.yaml rename to .github/workflows/integration-aws-nvidia-oss-production-cron.yaml index b48ba0b53..35330d621 100644 --- a/.github/workflows/integration-aws-nvidia-oss-cron.yaml +++ b/.github/workflows/integration-aws-nvidia-oss-production-cron.yaml @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2025-07-21T09:52:07Z by kres b869533. +# Generated on 2025-07-22T04:25:57Z by kres b869533. 
concurrency: group: ${{ github.head_ref || github.run_id }} @@ -8,7 +8,7 @@ concurrency: "on": schedule: - cron: 30 5 * * * -name: integration-aws-nvidia-oss-cron +name: integration-aws-nvidia-oss-production-cron jobs: default: runs-on: @@ -94,13 +94,6 @@ jobs: if: github.event_name == 'schedule' run: | make talosctl-cni-bundle - - name: images-essential - if: github.event_name == 'schedule' - env: - IMAGE_REGISTRY: registry.dev.siderolabs.io - PLATFORM: linux/amd64,linux/arm64 - run: | - make images-essential - name: image-aws env: IMAGE_REGISTRY: registry.dev.siderolabs.io @@ -125,7 +118,7 @@ jobs: make nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production zfs extensions-metadata -C _out/extensions - name: e2e-aws-prepare env: - E2E_AWS_TARGET: nvidia-oss + E2E_AWS_TARGET: nvidia-oss-production EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata IMAGE_REGISTRY: registry.dev.siderolabs.io run: | @@ -147,7 +140,7 @@ jobs: TF_SCRIPT_DIR: _out/contrib run: | make e2e-cloud-tf - - name: e2e-aws-nvidia-oss + - name: e2e-aws-nvidia-oss-production env: EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false run: | diff --git a/.github/workflows/slack-notify.yaml b/.github/workflows/slack-notify.yaml index 3febe583d..384092952 100644 --- a/.github/workflows/slack-notify.yaml +++ b/.github/workflows/slack-notify.yaml @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2025-07-21T16:10:17Z by kres b869533. +# Generated on 2025-07-22T04:25:57Z by kres b869533. 
"on": workflow_run: @@ -36,8 +36,10 @@ - integration-image-cache-cron - integration-image-factory-cron - integration-aws-cron - - integration-aws-nvidia-oss-cron - - integration-aws-nvidia-nonfree-cron + - integration-aws-nvidia-oss-lts-cron + - integration-aws-nvidia-oss-production-cron + - integration-aws-nvidia-nonfree-lts-cron + - integration-aws-nvidia-nonfree-production-cron - integration-gcp-cron types: - completed diff --git a/.kres.yaml b/.kres.yaml index 8952c9d67..fcb641079 100644 --- a/.kres.yaml +++ b/.kres.yaml @@ -85,8 +85,10 @@ spec: - integration-image-cache - integration-image-factory - integration-aws - - integration-aws-nvidia-oss - - integration-aws-nvidia-nonfree + - integration-aws-nvidia-oss-lts + - integration-aws-nvidia-oss-production + - integration-aws-nvidia-nonfree-lts + - integration-aws-nvidia-nonfree-production - integration-gcp --- kind: common.GHWorkflow @@ -2602,7 +2604,7 @@ spec: TF_E2E_TEST_TYPE: aws TF_E2E_ACTION: destroy TF_E2E_REFRESH_ON_DESTROY: false - - name: integration-aws-nvidia-oss + - name: integration-aws-nvidia-oss-lts buildxOptions: enabled: true sops: true @@ -2614,7 +2616,10 @@ spec: crons: - '30 5 * * *' triggerLabels: + - integration/aws-nvidia-oss-lts - integration/aws-nvidia-oss + - integration/aws-nvidia + - integration/aws steps: - name: download-artifacts conditions: @@ -2645,12 +2650,6 @@ spec: - name: talosctl-cni-bundle conditions: - only-on-schedule - - name: images-essential - conditions: - - only-on-schedule - environment: - PLATFORM: linux/amd64,linux/arm64 - IMAGE_REGISTRY: registry.dev.siderolabs.io - name: image-aws environment: PLATFORM: linux/amd64,linux/arm64 @@ -2664,7 +2663,9 @@ spec: nonMakeStep: true command: cat _out/talos-metadata >> "$GITHUB_ENV" - name: build extensions - command: nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production zfs extensions-metadata + # zfs is only added since it uses libtirpc from musl and nvidia needs libtirpc from glibc + # this 
verifies that both libtirpc can co-exist together + command: nvidia-container-toolkit-lts nvidia-open-gpu-kernel-modules-lts zfs extensions-metadata arguments: - -C - _out/extensions @@ -2675,7 +2676,7 @@ spec: - name: e2e-aws-prepare environment: EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata - E2E_AWS_TARGET: nvidia-oss + E2E_AWS_TARGET: nvidia-oss-lts IMAGE_REGISTRY: registry.dev.siderolabs.io - name: checkout contrib checkoutStep: @@ -2690,7 +2691,7 @@ spec: TF_SCRIPT_DIR: _out/contrib TF_E2E_TEST_TYPE: aws TF_E2E_ACTION: apply - - name: e2e-aws-nvidia-oss + - name: e2e-aws-nvidia-oss-lts command: e2e-aws environment: EXTRA_TEST_ARGS: "-talos.extensions.nvidia -talos.verifyukibooted=false" @@ -2703,7 +2704,7 @@ spec: TF_E2E_TEST_TYPE: aws TF_E2E_ACTION: destroy TF_E2E_REFRESH_ON_DESTROY: false - - name: integration-aws-nvidia-nonfree + - name: integration-aws-nvidia-oss-production buildxOptions: enabled: true sops: true @@ -2713,9 +2714,12 @@ spec: - self-hosted - generic # we can use generic here since the tests run against a remote talos cluster crons: - - '30 7 * * *' + - '30 5 * * *' triggerLabels: - - integration/aws-nvidia-nonfree + - integration/aws-nvidia-oss-production + - integration/aws-nvidia-oss + - integration/aws-nvidia + - integration/aws steps: - name: download-artifacts conditions: @@ -2746,12 +2750,106 @@ spec: - name: talosctl-cni-bundle conditions: - only-on-schedule - - name: images-essential - conditions: - - only-on-schedule + - name: image-aws environment: PLATFORM: linux/amd64,linux/arm64 IMAGE_REGISTRY: registry.dev.siderolabs.io + - name: checkout extensions + checkoutStep: + repository: siderolabs/extensions + ref: main + path: _out/extensions + - name: set variables + nonMakeStep: true + command: cat _out/talos-metadata >> "$GITHUB_ENV" + - name: build extensions + # zfs is only added since it uses libtirpc from musl and nvidia needs libtirpc from glibc + # this verifies that both libtirpc can co-exist together 
+ command: nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production zfs extensions-metadata + arguments: + - -C + - _out/extensions + environment: + PLATFORM: linux/amd64 + PUSH: true + REGISTRY: registry.dev.siderolabs.io + - name: e2e-aws-prepare + environment: + EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata + E2E_AWS_TARGET: nvidia-oss-production + IMAGE_REGISTRY: registry.dev.siderolabs.io + - name: checkout contrib + checkoutStep: + repository: siderolabs/contrib + ref: main + path: _out/contrib + - name: setup tf + terraformStep: true + - name: tf apply + command: e2e-cloud-tf + environment: + TF_SCRIPT_DIR: _out/contrib + TF_E2E_TEST_TYPE: aws + TF_E2E_ACTION: apply + - name: e2e-aws-nvidia-oss-production + command: e2e-aws + environment: + EXTRA_TEST_ARGS: "-talos.extensions.nvidia -talos.verifyukibooted=false" + - name: tf destroy + command: e2e-cloud-tf + conditions: + - always + environment: + TF_SCRIPT_DIR: _out/contrib + TF_E2E_TEST_TYPE: aws + TF_E2E_ACTION: destroy + TF_E2E_REFRESH_ON_DESTROY: false + - name: integration-aws-nvidia-nonfree-lts + buildxOptions: + enabled: true + sops: true + depends: + - default + runners: + - self-hosted + - generic # we can use generic here since the tests run against a remote talos cluster + crons: + - '30 7 * * *' + triggerLabels: + - integration/aws-nvidia-nonfree-lts + - integration/aws-nvidia-nonfree + - integration/aws-nvidia + - integration/aws + steps: + - name: download-artifacts + conditions: + - not-on-schedule + artifactStep: + type: download + artifactName: talos-artifacts + artifactPath: _out + - name: ci-temp-release-tag + conditions: + - not-on-schedule + - name: generate + conditions: + - only-on-schedule + - name: uki-certs + conditions: + - only-on-schedule + environment: + PLATFORM: linux/amd64 + - name: build + conditions: + - only-on-schedule + command: talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos 
_out/integration-test-linux-amd64 + environment: + PLATFORM: linux/amd64,linux/arm64 + IMAGE_REGISTRY: registry.dev.siderolabs.io + PUSH: true + - name: talosctl-cni-bundle + conditions: + - only-on-schedule - name: image-aws environment: PLATFORM: linux/amd64,linux/arm64 @@ -2776,7 +2874,7 @@ spec: - name: e2e-aws-prepare environment: EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata - E2E_AWS_TARGET: nvidia-nonfree + E2E_AWS_TARGET: nvidia-nonfree-lts IMAGE_REGISTRY: registry.dev.siderolabs.io - name: checkout contrib checkoutStep: @@ -2791,7 +2889,105 @@ spec: TF_SCRIPT_DIR: _out/contrib TF_E2E_TEST_TYPE: aws TF_E2E_ACTION: apply - - name: e2e-aws-nvidia-nonfree + - name: e2e-aws-nvidia-nonfree-lts + command: e2e-aws + environment: + EXTRA_TEST_ARGS: -talos.extensions.nvidia + - name: tf destroy + command: e2e-cloud-tf + conditions: + - always + environment: + TF_SCRIPT_DIR: _out/contrib + TF_E2E_TEST_TYPE: aws + TF_E2E_ACTION: destroy + TF_E2E_REFRESH_ON_DESTROY: false + - name: integration-aws-nvidia-nonfree-production + buildxOptions: + enabled: true + sops: true + depends: + - default + runners: + - self-hosted + - generic # we can use generic here since the tests run against a remote talos cluster + crons: + - '30 7 * * *' + triggerLabels: + - integration/aws-nvidia-nonfree-production + - integration/aws-nvidia-nonfree + - integration/aws-nvidia + - integration/aws + steps: + - name: download-artifacts + conditions: + - not-on-schedule + artifactStep: + type: download + artifactName: talos-artifacts + artifactPath: _out + - name: ci-temp-release-tag + conditions: + - not-on-schedule + - name: generate + conditions: + - only-on-schedule + - name: uki-certs + conditions: + - only-on-schedule + environment: + PLATFORM: linux/amd64 + - name: build + conditions: + - only-on-schedule + command: talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64 + environment: + PLATFORM: 
linux/amd64,linux/arm64 + IMAGE_REGISTRY: registry.dev.siderolabs.io + PUSH: true + - name: talosctl-cni-bundle + conditions: + - only-on-schedule + - name: image-aws + environment: + PLATFORM: linux/amd64,linux/arm64 + IMAGE_REGISTRY: registry.dev.siderolabs.io + - name: checkout extensions + checkoutStep: + repository: siderolabs/extensions + ref: main + path: _out/extensions + - name: set variables + nonMakeStep: true + command: cat _out/talos-metadata >> "$GITHUB_ENV" + - name: build extensions + command: nvidia-container-toolkit-production nonfree-kmod-nvidia-production extensions-metadata + arguments: + - -C + - _out/extensions + environment: + PLATFORM: linux/amd64 + PUSH: true + REGISTRY: registry.dev.siderolabs.io + - name: e2e-aws-prepare + environment: + EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata + E2E_AWS_TARGET: nvidia-nonfree-production + IMAGE_REGISTRY: registry.dev.siderolabs.io + - name: checkout contrib + checkoutStep: + repository: siderolabs/contrib + ref: main + path: _out/contrib + - name: setup tf + terraformStep: true + - name: tf apply + command: e2e-cloud-tf + environment: + TF_SCRIPT_DIR: _out/contrib + TF_E2E_TEST_TYPE: aws + TF_E2E_ACTION: apply + - name: e2e-aws-nvidia-nonfree-production command: e2e-aws environment: EXTRA_TEST_ARGS: -talos.extensions.nvidia @@ -2898,6 +3094,7 @@ spec: - customType: regex managerFilePatterns: - internal/integration/k8s/constants.go + - internal/integration/api/constants.go matchStrings: - '\/\/\s+renovate: datasource=(?.*?)(?:\s+extractVersion=(?.+?))?(?:\s+versioning=(?.+?))?\s+depName=(?.+?)?(?:\s+registryUrl=(?.+?))?\s.*Version\s+=\s+\"(?.+?)\"' versioningTemplate: "{{#if versioning}}{{versioning}}{{else}}semver{{/if}}" diff --git a/hack/test/e2e-aws-prepare.sh b/hack/test/e2e-aws-prepare.sh index cbdce9841..48f5e381f 100755 --- a/hack/test/e2e-aws-prepare.sh +++ b/hack/test/e2e-aws-prepare.sh @@ -10,7 +10,7 @@ function cloud_image_upload() { 
CLOUD_IMAGES_EXTRA_ARGS=("--name-prefix=${1}" "--target-clouds=aws" "--architectures=amd64" "--aws-regions=${REGION}") case "${1}" in - talos-e2e-nvidia-oss) + talos-e2e-nvidia-oss-*) CLOUD_IMAGES_EXTRA_ARGS+=("--aws-force-bios") ;; esac @@ -24,15 +24,21 @@ function get_ami_id() { function cloud_image_upload_with_extensions() { case "${1}" in - nvidia-oss) + nvidia-oss-lts) + EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nvidia-open-gpu-kernel-modules-lts") or contains("nvidia-container-toolkit-lts") or contains("zfs")) and (contains("nvidia-fabricmanager") or contains("nonfree-kmod-nvidia") | not))) | .[] |= "--system-extension-image=" + . | join(" ")') + ;; + nvidia-oss-production) EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nvidia-open-gpu-kernel-modules-production") or contains("nvidia-container-toolkit-production") or contains("zfs")) and (contains("nvidia-fabricmanager") or contains("nonfree-kmod-nvidia") | not))) | .[] |= "--system-extension-image=" + . | join(" ")') ;; nvidia-oss-fabricmanager) EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nvidia-open-gpu-kernel-modules-production") or contains("nvidia-container-toolkit-production")) and (contains("nonfree-kmod-nvidia") | not))) | .[] |= "--system-extension-image=" + . | join(" ")') ;; - nvidia-nonfree) + nvidia-nonfree-lts) EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nonfree-kmod-nvidia-lts") or contains("nvidia-container-toolkit-lts")) and (contains("nvidia-fabricmanager") or contains("nvidia-open-gpu-kernel-modules") | not))) | .[] |= "--system-extension-image=" + . | join(" ")') ;; + nvidia-nonfree-production) + EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. 
| (contains("nonfree-kmod-nvidia-production") or contains("nvidia-container-toolkit-production")) and (contains("nvidia-fabricmanager") or contains("nvidia-open-gpu-kernel-modules") | not))) | .[] |= "--system-extension-image=" + . | join(" ")') + ;; nvidia-nonfree-fabricmanager) EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nonfree-kmod-nvidia-lts") or contains("nvidia-container-toolkit-lts")) and (contains("nvidia-open-gpu-kernel-modules") | not))) | .[] |= "--system-extension-image=" + . | join(" ")') ;; @@ -65,7 +71,7 @@ esac mkdir -p "${ARTIFACTS}/e2e-aws-generated" -NAME_PREFIX="talos-e2e-${SHA}-aws-${E2E_AWS_TARGET}" +NAME_PREFIX="${SHA}-${E2E_AWS_TARGET}" jq --null-input \ --arg WORKER_GROUP "${WORKER_GROUP}" \ diff --git a/internal/integration/api/constants.go b/internal/integration/api/constants.go new file mode 100644 index 000000000..5d934a841 --- /dev/null +++ b/internal/integration/api/constants.go @@ -0,0 +1,16 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +//go:build integration_api + +package api + +const ( + // NvidiaDevicePluginChartVersion is the version of the NVIDIA device plugin chart to use. + // renovate: datasource=helm versioning=helm depName=nvidia-device-plugin registryUrl=https://nvidia.github.io/k8s-device-plugin + NvidiaDevicePluginChartVersion = "v0.17.2" + // NvidiaCUDATestImageVersion is the version of the NVIDIA CUDA test image to use. + // renovate: datasource=docker depName=nvcr.io/nvidia/k8s/cuda-sample + NvidiaCUDATestImageVersion = "vectoradd-cuda12.5.0" +) diff --git a/internal/integration/api/extensions_nvidia.go b/internal/integration/api/extensions_nvidia.go index 035a4b8bc..0ce0211a4 100644 --- a/internal/integration/api/extensions_nvidia.go +++ b/internal/integration/api/extensions_nvidia.go @@ -8,12 +8,13 @@ package api import ( "context" + _ "embed" + "fmt" "io" "time" "github.com/siderolabs/go-pointer" "github.com/siderolabs/go-retry/retry" - appsv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" nodev1 "k8s.io/api/node/v1" @@ -22,6 +23,9 @@ import ( "github.com/siderolabs/talos/internal/integration/base" ) +//go:embed testdata/nvidia-device-plugin.yaml +var nvidiaDevicePluginHelmChartValues []byte + // ExtensionsSuiteNVIDIA verifies Talos is securebooted.
type ExtensionsSuiteNVIDIA struct { base.K8sSuite @@ -97,13 +101,18 @@ func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() { suite.Require().NoError(err) - _, err = suite.Clientset.AppsV1().DaemonSets("kube-system").Create(suite.ctx, nvidiaDevicePluginDaemonSetSpec(), metav1.CreateOptions{}) - defer suite.Clientset.AppsV1().DaemonSets("kube-system").Delete(suite.ctx, "nvidia-device-plugin", metav1.DeleteOptions{}) //nolint:errcheck - - suite.Require().NoError(err) + suite.Require().NoError(suite.HelmInstall( + suite.ctx, + "kube-system", + "https://nvidia.github.io/k8s-device-plugin", + NvidiaDevicePluginChartVersion, + "nvidia-device-plugin", + "nvidia-device-plugin", + nvidiaDevicePluginHelmChartValues, + )) // now we can create a cuda test job - _, err = suite.Clientset.BatchV1().Jobs("default").Create(suite.ctx, nvidiaCUDATestJob("nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1"), metav1.CreateOptions{}) + _, err = suite.Clientset.BatchV1().Jobs("default").Create(suite.ctx, nvidiaCUDATestJob(), metav1.CreateOptions{}) defer suite.Clientset.BatchV1().Jobs("default").Delete(suite.ctx, "cuda-test", metav1.DeleteOptions{}) //nolint:errcheck suite.Require().NoError(err) @@ -199,80 +208,7 @@ func (suite *ExtensionsSuiteNVIDIA) getNVIDIANodes(labelQuery string) []string { return nodeList } -func nvidiaDevicePluginDaemonSetSpec() *appsv1.DaemonSet { - return &appsv1.DaemonSet{ - ObjectMeta: metav1.ObjectMeta{ - Name: "nvidia-device-plugin", - }, - Spec: appsv1.DaemonSetSpec{ - Selector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "app.kubernetes.io/name": "nvidia-device-plugin", - }, - }, - UpdateStrategy: appsv1.DaemonSetUpdateStrategy{ - Type: appsv1.RollingUpdateDaemonSetStrategyType, - }, - Template: corev1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - "app.kubernetes.io/name": "nvidia-device-plugin", - }, - }, - Spec: corev1.PodSpec{ - PriorityClassName: "system-node-critical", - RuntimeClassName: 
pointer.To("nvidia"), - Containers: []corev1.Container{ - { - Name: "nvidia-device-plugin-ctr", - Image: "nvcr.io/nvidia/k8s-device-plugin:v0.14.1", - Env: []corev1.EnvVar{ - { - Name: "NVIDIA_MIG_MONITOR_DEVICES", - Value: "all", - }, - }, - SecurityContext: &corev1.SecurityContext{ - Capabilities: &corev1.Capabilities{ - Add: []corev1.Capability{"SYS_ADMIN"}, - }, - }, - VolumeMounts: []corev1.VolumeMount{ - { - Name: "device-plugin", - MountPath: "/var/lib/kubelet/device-plugins", - }, - }, - }, - }, - Volumes: []corev1.Volume{ - { - Name: "device-plugin", - VolumeSource: corev1.VolumeSource{ - HostPath: &corev1.HostPathVolumeSource{ - Path: "/var/lib/kubelet/device-plugins", - }, - }, - }, - }, - Tolerations: []corev1.Toleration{ - { - Key: "CriticalAddonsOnly", - Operator: corev1.TolerationOpExists, - }, - { - Effect: corev1.TaintEffectNoSchedule, - Key: "nvidia.com/gpu", - Operator: corev1.TolerationOpExists, - }, - }, - }, - }, - }, - } -} - -func nvidiaCUDATestJob(image string) *batchv1.Job { +func nvidiaCUDATestJob() *batchv1.Job { return &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: "cuda-test", @@ -290,7 +226,7 @@ func nvidiaCUDATestJob(image string) *batchv1.Job { Containers: []corev1.Container{ { Name: "cuda-test", - Image: image, + Image: fmt.Sprintf("nvcr.io/nvidia/k8s/cuda-sample:%s", NvidiaCUDATestImageVersion), }, }, Affinity: &corev1.Affinity{ diff --git a/internal/integration/api/testdata/nvidia-device-plugin.yaml b/internal/integration/api/testdata/nvidia-device-plugin.yaml new file mode 100644 index 000000000..dacd2694c --- /dev/null +++ b/internal/integration/api/testdata/nvidia-device-plugin.yaml @@ -0,0 +1 @@ +runtimeClassName: nvidia diff --git a/website/content/v1.11/talos-guides/configuration/nvidia-gpu.md b/website/content/v1.11/talos-guides/configuration/nvidia-gpu.md index 4a34b7ec4..fbcd4c5a7 100644 --- a/website/content/v1.11/talos-guides/configuration/nvidia-gpu.md +++ 
b/website/content/v1.11/talos-guides/configuration/nvidia-gpu.md @@ -57,16 +57,17 @@ The NVIDIA modules should be loaded and the system extension should be installed This can be confirmed by running: ```bash -talosctl read /proc/modules +talosctl get modules ``` which should produce an output similar to below: ```text -nvidia_uvm 1146880 - - Live 0xffffffffc2733000 (PO) -nvidia_drm 69632 - - Live 0xffffffffc2721000 (PO) -nvidia_modeset 1142784 - - Live 0xffffffffc25ea000 (PO) -nvidia 39047168 - - Live 0xffffffffc00ac000 (PO) +NODE NAMESPACE TYPE ID VERSION STATE +10.5.0.3 runtime LoadedKernelModule nvidia_uvm 1 Live +10.5.0.3 runtime LoadedKernelModule nvidia_drm 1 Live +10.5.0.3 runtime LoadedKernelModule nvidia_modeset 1 Live +10.5.0.3 runtime LoadedKernelModule nvidia 1 Live ``` ```bash @@ -81,17 +82,6 @@ NODE NAMESPACE TYPE ID 172.31.41.27 runtime ExtensionStatus 000.ghcr.io-siderolabs-nvidia-open-gpu-kernel-modules-515.65.01-v1.2.0 1 nvidia-open-gpu-kernel-modules 515.65.01-v1.2.0 ``` -```bash -talosctl read /proc/driver/nvidia/version -``` - -which should produce an output similar to below: - -```text -NVRM version: NVIDIA UNIX x86_64 Kernel Module 515.65.01 Wed Mar 16 11:24:05 UTC 2022 -GCC version: gcc version 12.2.0 (GCC) -``` - ## Deploying NVIDIA device plugin First we need to create the `RuntimeClass` @@ -151,7 +141,7 @@ kubectl run \ nvidia-test \ --restart=Never \ -ti --rm \ - --image nvcr.io/nvidia/cuda:12.5.0-base-ubuntu22.04 \ + --image nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0 \ --overrides '{"spec": {"runtimeClassName": "nvidia"}}' \ nvidia-smi ```