chore(ci): add more nvidia test matrix

Add more NVIDIA tests covering all supported OSS and proprietary driver versions (both LTS and Production release branches).

Fixes: #11398

Signed-off-by: Noel Georgi <git@frezbo.dev>
This commit is contained in:
Noel Georgi 2025-07-21 15:42:23 +05:30
parent 451c2c4c39
commit 450b30d5a9
No known key found for this signature in database
GPG Key ID: 21A9F444075C9E36
13 changed files with 906 additions and 167 deletions

View File

@ -33,7 +33,8 @@
"customType": "regex",
"versioningTemplate": "{{#if versioning}}{{versioning}}{{else}}semver{{/if}}",
"managerFilePatterns": [
"/internal/integration/k8s/constants.go/"
"/internal/integration/k8s/constants.go/",
"/internal/integration/api/constants.go/"
],
"matchStrings": [
"\\/\\/\\s+renovate: datasource=(?<datasource>.*?)(?:\\s+extractVersion=(?<extractVersion>.+?))?(?:\\s+versioning=(?<versioning>.+?))?\\s+depName=(?<depName>.+?)?(?:\\s+registryUrl=(?<registryUrl>.+?))?\\s.*Version\\s+=\\s+\\\"(?<currentValue>.+?)\\\""

View File

@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2025-07-21T14:04:07Z by kres b869533.
# Generated on 2025-07-22T04:25:57Z by kres b869533.
concurrency:
group: ${{ github.head_ref || github.run_id }}
@ -704,7 +704,7 @@ jobs:
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
integration-aws-nvidia-nonfree:
integration-aws-nvidia-nonfree-lts:
permissions:
actions: read
contents: write
@ -714,7 +714,7 @@ jobs:
runs-on:
- self-hosted
- generic
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree')
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree-lts') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws')
needs:
- default
steps:
@ -797,13 +797,6 @@ jobs:
if: github.event_name == 'schedule'
run: |
make talosctl-cni-bundle
- name: images-essential
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/amd64,linux/arm64
run: |
make images-essential
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
@ -828,7 +821,7 @@ jobs:
make nvidia-container-toolkit-lts nonfree-kmod-nvidia-lts extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-nonfree
E2E_AWS_TARGET: nvidia-nonfree-lts
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
run: |
@ -850,7 +843,7 @@ jobs:
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-nonfree
- name: e2e-aws-nvidia-nonfree-lts
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia
run: |
@ -864,7 +857,7 @@ jobs:
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
integration-aws-nvidia-oss:
integration-aws-nvidia-nonfree-production:
permissions:
actions: read
contents: write
@ -874,7 +867,7 @@ jobs:
runs-on:
- self-hosted
- generic
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss')
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree-production') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws')
needs:
- default
steps:
@ -957,13 +950,312 @@ jobs:
if: github.event_name == 'schedule'
run: |
make talosctl-cni-bundle
- name: images-essential
if: github.event_name == 'schedule'
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/amd64,linux/arm64
run: |
make images-essential
make image-aws
- name: checkout extensions
uses: actions/checkout@v4
with:
path: _out/extensions
ref: main
repository: siderolabs/extensions
- name: set variables
run: |
cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
env:
PLATFORM: linux/amd64
PUSH: "true"
REGISTRY: registry.dev.siderolabs.io
run: |
make nvidia-container-toolkit-production nonfree-kmod-nvidia-production extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-nonfree-production
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
run: |
make e2e-aws-prepare
- name: checkout contrib
uses: actions/checkout@v4
with:
path: _out/contrib
ref: main
repository: siderolabs/contrib
- name: setup tf
uses: hashicorp/setup-terraform@v3
with:
terraform_wrapper: "false"
- name: tf apply
env:
TF_E2E_ACTION: apply
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-nonfree-production
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia
run: |
make e2e-aws
- name: tf destroy
if: always()
env:
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: "false"
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
integration-aws-nvidia-oss-lts:
permissions:
actions: read
contents: write
issues: read
packages: write
pull-requests: read
runs-on:
- self-hosted
- generic
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss-lts') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws')
needs:
- default
steps:
- name: gather-system-info
id: system-info
uses: kenchan0130/actions-system-info@v1.3.1
continue-on-error: true
- name: print-system-info
run: |
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
OUTPUTS=(
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
"Hostname: ${{ steps.system-info.outputs.hostname }}"
"NodeName: ${NODE_NAME}"
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
"Name: ${{ steps.system-info.outputs.name }}"
"Platform: ${{ steps.system-info.outputs.platform }}"
"Release: ${{ steps.system-info.outputs.release }}"
"Total memory: ${MEMORY_GB} GB"
)
for OUTPUT in "${OUTPUTS[@]}";do
echo "${OUTPUT}"
done
continue-on-error: true
- name: checkout
uses: actions/checkout@v4
- name: Unshallow
run: |
git fetch --prune --unshallow
- name: Set up Docker Buildx
id: setup-buildx
uses: docker/setup-buildx-action@v3
with:
driver: remote
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
timeout-minutes: 10
- name: Mask secrets
run: |
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
- name: Set secrets for job
run: |
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
- name: Download artifacts
if: github.event_name != 'schedule'
uses: actions/download-artifact@v4
with:
name: talos-artifacts
path: _out
- name: Fix artifact permissions
if: github.event_name != 'schedule'
run: |
xargs -a _out/executable-artifacts -I {} chmod +x {}
- name: ci-temp-release-tag
if: github.event_name != 'schedule'
run: |
make ci-temp-release-tag
- name: generate
if: github.event_name == 'schedule'
run: |
make generate
- name: uki-certs
if: github.event_name == 'schedule'
env:
PLATFORM: linux/amd64
run: |
make uki-certs
- name: build
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/amd64,linux/arm64
PUSH: "true"
run: |
make talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64
- name: talosctl-cni-bundle
if: github.event_name == 'schedule'
run: |
make talosctl-cni-bundle
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/amd64,linux/arm64
run: |
make image-aws
- name: checkout extensions
uses: actions/checkout@v4
with:
path: _out/extensions
ref: main
repository: siderolabs/extensions
- name: set variables
run: |
cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
env:
PLATFORM: linux/amd64
PUSH: "true"
REGISTRY: registry.dev.siderolabs.io
run: |
make nvidia-container-toolkit-lts nvidia-open-gpu-kernel-modules-lts zfs extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-oss-lts
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
run: |
make e2e-aws-prepare
- name: checkout contrib
uses: actions/checkout@v4
with:
path: _out/contrib
ref: main
repository: siderolabs/contrib
- name: setup tf
uses: hashicorp/setup-terraform@v3
with:
terraform_wrapper: "false"
- name: tf apply
env:
TF_E2E_ACTION: apply
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-oss-lts
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
run: |
make e2e-aws
- name: tf destroy
if: always()
env:
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: "false"
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
integration-aws-nvidia-oss-production:
permissions:
actions: read
contents: write
issues: read
packages: write
pull-requests: read
runs-on:
- self-hosted
- generic
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss-production') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws')
needs:
- default
steps:
- name: gather-system-info
id: system-info
uses: kenchan0130/actions-system-info@v1.3.1
continue-on-error: true
- name: print-system-info
run: |
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
OUTPUTS=(
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
"Hostname: ${{ steps.system-info.outputs.hostname }}"
"NodeName: ${NODE_NAME}"
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
"Name: ${{ steps.system-info.outputs.name }}"
"Platform: ${{ steps.system-info.outputs.platform }}"
"Release: ${{ steps.system-info.outputs.release }}"
"Total memory: ${MEMORY_GB} GB"
)
for OUTPUT in "${OUTPUTS[@]}";do
echo "${OUTPUT}"
done
continue-on-error: true
- name: checkout
uses: actions/checkout@v4
- name: Unshallow
run: |
git fetch --prune --unshallow
- name: Set up Docker Buildx
id: setup-buildx
uses: docker/setup-buildx-action@v3
with:
driver: remote
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
timeout-minutes: 10
- name: Mask secrets
run: |
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
- name: Set secrets for job
run: |
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
- name: Download artifacts
if: github.event_name != 'schedule'
uses: actions/download-artifact@v4
with:
name: talos-artifacts
path: _out
- name: Fix artifact permissions
if: github.event_name != 'schedule'
run: |
xargs -a _out/executable-artifacts -I {} chmod +x {}
- name: ci-temp-release-tag
if: github.event_name != 'schedule'
run: |
make ci-temp-release-tag
- name: generate
if: github.event_name == 'schedule'
run: |
make generate
- name: uki-certs
if: github.event_name == 'schedule'
env:
PLATFORM: linux/amd64
run: |
make uki-certs
- name: build
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/amd64,linux/arm64
PUSH: "true"
run: |
make talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64
- name: talosctl-cni-bundle
if: github.event_name == 'schedule'
run: |
make talosctl-cni-bundle
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
@ -988,7 +1280,7 @@ jobs:
make nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production zfs extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-oss
E2E_AWS_TARGET: nvidia-oss-production
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
run: |
@ -1010,7 +1302,7 @@ jobs:
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-oss
- name: e2e-aws-nvidia-oss-production
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
run: |

View File

@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2025-07-21T09:52:07Z by kres b869533.
# Generated on 2025-07-22T04:25:57Z by kres b869533.
concurrency:
group: ${{ github.head_ref || github.run_id }}
@ -8,7 +8,7 @@ concurrency:
"on":
schedule:
- cron: 30 7 * * *
name: integration-aws-nvidia-nonfree-cron
name: integration-aws-nvidia-nonfree-lts-cron
jobs:
default:
runs-on:
@ -94,13 +94,6 @@ jobs:
if: github.event_name == 'schedule'
run: |
make talosctl-cni-bundle
- name: images-essential
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/amd64,linux/arm64
run: |
make images-essential
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
@ -125,7 +118,7 @@ jobs:
make nvidia-container-toolkit-lts nonfree-kmod-nvidia-lts extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-nonfree
E2E_AWS_TARGET: nvidia-nonfree-lts
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
run: |
@ -147,7 +140,7 @@ jobs:
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-nonfree
- name: e2e-aws-nvidia-nonfree-lts
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia
run: |

View File

@ -0,0 +1,156 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2025-07-22T04:25:57Z by kres b869533.
concurrency:
group: ${{ github.head_ref || github.run_id }}
cancel-in-progress: true
"on":
schedule:
- cron: 30 7 * * *
name: integration-aws-nvidia-nonfree-production-cron
jobs:
default:
runs-on:
- self-hosted
- generic
steps:
- name: gather-system-info
id: system-info
uses: kenchan0130/actions-system-info@v1.3.1
continue-on-error: true
- name: print-system-info
run: |
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
OUTPUTS=(
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
"Hostname: ${{ steps.system-info.outputs.hostname }}"
"NodeName: ${NODE_NAME}"
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
"Name: ${{ steps.system-info.outputs.name }}"
"Platform: ${{ steps.system-info.outputs.platform }}"
"Release: ${{ steps.system-info.outputs.release }}"
"Total memory: ${MEMORY_GB} GB"
)
for OUTPUT in "${OUTPUTS[@]}";do
echo "${OUTPUT}"
done
continue-on-error: true
- name: checkout
uses: actions/checkout@v4
- name: Unshallow
run: |
git fetch --prune --unshallow
- name: Set up Docker Buildx
id: setup-buildx
uses: docker/setup-buildx-action@v3
with:
driver: remote
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
timeout-minutes: 10
- name: Mask secrets
run: |
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
- name: Set secrets for job
run: |
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
- name: Download artifacts
if: github.event_name != 'schedule'
uses: actions/download-artifact@v4
with:
name: talos-artifacts
path: _out
- name: Fix artifact permissions
if: github.event_name != 'schedule'
run: |
xargs -a _out/executable-artifacts -I {} chmod +x {}
- name: ci-temp-release-tag
if: github.event_name != 'schedule'
run: |
make ci-temp-release-tag
- name: generate
if: github.event_name == 'schedule'
run: |
make generate
- name: uki-certs
if: github.event_name == 'schedule'
env:
PLATFORM: linux/amd64
run: |
make uki-certs
- name: build
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/amd64,linux/arm64
PUSH: "true"
run: |
make talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64
- name: talosctl-cni-bundle
if: github.event_name == 'schedule'
run: |
make talosctl-cni-bundle
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/amd64,linux/arm64
run: |
make image-aws
- name: checkout extensions
uses: actions/checkout@v4
with:
path: _out/extensions
ref: main
repository: siderolabs/extensions
- name: set variables
run: |
cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
env:
PLATFORM: linux/amd64
PUSH: "true"
REGISTRY: registry.dev.siderolabs.io
run: |
make nvidia-container-toolkit-production nonfree-kmod-nvidia-production extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-nonfree-production
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
run: |
make e2e-aws-prepare
- name: checkout contrib
uses: actions/checkout@v4
with:
path: _out/contrib
ref: main
repository: siderolabs/contrib
- name: setup tf
uses: hashicorp/setup-terraform@v3
with:
terraform_wrapper: "false"
- name: tf apply
env:
TF_E2E_ACTION: apply
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-nonfree-production
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia
run: |
make e2e-aws
- name: tf destroy
if: always()
env:
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: "false"
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf

View File

@ -0,0 +1,156 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2025-07-22T04:25:57Z by kres b869533.
concurrency:
group: ${{ github.head_ref || github.run_id }}
cancel-in-progress: true
"on":
schedule:
- cron: 30 5 * * *
name: integration-aws-nvidia-oss-lts-cron
jobs:
default:
runs-on:
- self-hosted
- generic
steps:
- name: gather-system-info
id: system-info
uses: kenchan0130/actions-system-info@v1.3.1
continue-on-error: true
- name: print-system-info
run: |
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
OUTPUTS=(
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
"Hostname: ${{ steps.system-info.outputs.hostname }}"
"NodeName: ${NODE_NAME}"
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
"Name: ${{ steps.system-info.outputs.name }}"
"Platform: ${{ steps.system-info.outputs.platform }}"
"Release: ${{ steps.system-info.outputs.release }}"
"Total memory: ${MEMORY_GB} GB"
)
for OUTPUT in "${OUTPUTS[@]}";do
echo "${OUTPUT}"
done
continue-on-error: true
- name: checkout
uses: actions/checkout@v4
- name: Unshallow
run: |
git fetch --prune --unshallow
- name: Set up Docker Buildx
id: setup-buildx
uses: docker/setup-buildx-action@v3
with:
driver: remote
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
timeout-minutes: 10
- name: Mask secrets
run: |
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
- name: Set secrets for job
run: |
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
- name: Download artifacts
if: github.event_name != 'schedule'
uses: actions/download-artifact@v4
with:
name: talos-artifacts
path: _out
- name: Fix artifact permissions
if: github.event_name != 'schedule'
run: |
xargs -a _out/executable-artifacts -I {} chmod +x {}
- name: ci-temp-release-tag
if: github.event_name != 'schedule'
run: |
make ci-temp-release-tag
- name: generate
if: github.event_name == 'schedule'
run: |
make generate
- name: uki-certs
if: github.event_name == 'schedule'
env:
PLATFORM: linux/amd64
run: |
make uki-certs
- name: build
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/amd64,linux/arm64
PUSH: "true"
run: |
make talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64
- name: talosctl-cni-bundle
if: github.event_name == 'schedule'
run: |
make talosctl-cni-bundle
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/amd64,linux/arm64
run: |
make image-aws
- name: checkout extensions
uses: actions/checkout@v4
with:
path: _out/extensions
ref: main
repository: siderolabs/extensions
- name: set variables
run: |
cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
env:
PLATFORM: linux/amd64
PUSH: "true"
REGISTRY: registry.dev.siderolabs.io
run: |
make nvidia-container-toolkit-lts nvidia-open-gpu-kernel-modules-lts zfs extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-oss-lts
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
run: |
make e2e-aws-prepare
- name: checkout contrib
uses: actions/checkout@v4
with:
path: _out/contrib
ref: main
repository: siderolabs/contrib
- name: setup tf
uses: hashicorp/setup-terraform@v3
with:
terraform_wrapper: "false"
- name: tf apply
env:
TF_E2E_ACTION: apply
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-oss-lts
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
run: |
make e2e-aws
- name: tf destroy
if: always()
env:
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: "false"
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf

View File

@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2025-07-21T09:52:07Z by kres b869533.
# Generated on 2025-07-22T04:25:57Z by kres b869533.
concurrency:
group: ${{ github.head_ref || github.run_id }}
@ -8,7 +8,7 @@ concurrency:
"on":
schedule:
- cron: 30 5 * * *
name: integration-aws-nvidia-oss-cron
name: integration-aws-nvidia-oss-production-cron
jobs:
default:
runs-on:
@ -94,13 +94,6 @@ jobs:
if: github.event_name == 'schedule'
run: |
make talosctl-cni-bundle
- name: images-essential
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/amd64,linux/arm64
run: |
make images-essential
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
@ -125,7 +118,7 @@ jobs:
make nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production zfs extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-oss
E2E_AWS_TARGET: nvidia-oss-production
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
run: |
@ -147,7 +140,7 @@ jobs:
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-oss
- name: e2e-aws-nvidia-oss-production
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
run: |

View File

@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2025-07-21T16:10:17Z by kres b869533.
# Generated on 2025-07-22T04:25:57Z by kres b869533.
"on":
workflow_run:
@ -36,8 +36,10 @@
- integration-image-cache-cron
- integration-image-factory-cron
- integration-aws-cron
- integration-aws-nvidia-oss-cron
- integration-aws-nvidia-nonfree-cron
- integration-aws-nvidia-oss-lts-cron
- integration-aws-nvidia-oss-production-cron
- integration-aws-nvidia-nonfree-lts-cron
- integration-aws-nvidia-nonfree-production-cron
- integration-gcp-cron
types:
- completed

View File

@ -85,8 +85,10 @@ spec:
- integration-image-cache
- integration-image-factory
- integration-aws
- integration-aws-nvidia-oss
- integration-aws-nvidia-nonfree
- integration-aws-nvidia-oss-lts
- integration-aws-nvidia-oss-production
- integration-aws-nvidia-nonfree-lts
- integration-aws-nvidia-nonfree-production
- integration-gcp
---
kind: common.GHWorkflow
@ -2602,7 +2604,7 @@ spec:
TF_E2E_TEST_TYPE: aws
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: false
- name: integration-aws-nvidia-oss
- name: integration-aws-nvidia-oss-lts
buildxOptions:
enabled: true
sops: true
@ -2614,7 +2616,10 @@ spec:
crons:
- '30 5 * * *'
triggerLabels:
- integration/aws-nvidia-oss-lts
- integration/aws-nvidia-oss
- integration/aws-nvidia
- integration/aws
steps:
- name: download-artifacts
conditions:
@ -2645,12 +2650,6 @@ spec:
- name: talosctl-cni-bundle
conditions:
- only-on-schedule
- name: images-essential
conditions:
- only-on-schedule
environment:
PLATFORM: linux/amd64,linux/arm64
IMAGE_REGISTRY: registry.dev.siderolabs.io
- name: image-aws
environment:
PLATFORM: linux/amd64,linux/arm64
@ -2664,7 +2663,9 @@ spec:
nonMakeStep: true
command: cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
command: nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production zfs extensions-metadata
# zfs is only added since it uses libtirpc from musl and nvidia needs libtirpc from glibc
# this verifies that both libtirpc variants can coexist
command: nvidia-container-toolkit-lts nvidia-open-gpu-kernel-modules-lts zfs extensions-metadata
arguments:
- -C
- _out/extensions
@ -2675,7 +2676,7 @@ spec:
- name: e2e-aws-prepare
environment:
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
E2E_AWS_TARGET: nvidia-oss
E2E_AWS_TARGET: nvidia-oss-lts
IMAGE_REGISTRY: registry.dev.siderolabs.io
- name: checkout contrib
checkoutStep:
@ -2690,7 +2691,7 @@ spec:
TF_SCRIPT_DIR: _out/contrib
TF_E2E_TEST_TYPE: aws
TF_E2E_ACTION: apply
- name: e2e-aws-nvidia-oss
- name: e2e-aws-nvidia-oss-lts
command: e2e-aws
environment:
EXTRA_TEST_ARGS: "-talos.extensions.nvidia -talos.verifyukibooted=false"
@ -2703,7 +2704,7 @@ spec:
TF_E2E_TEST_TYPE: aws
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: false
- name: integration-aws-nvidia-nonfree
- name: integration-aws-nvidia-oss-production
buildxOptions:
enabled: true
sops: true
@ -2713,9 +2714,12 @@ spec:
- self-hosted
- generic # we can use generic here since the tests run against a remote talos cluster
crons:
- '30 7 * * *'
- '30 5 * * *'
triggerLabels:
- integration/aws-nvidia-nonfree
- integration/aws-nvidia-oss-production
- integration/aws-nvidia-oss
- integration/aws-nvidia
- integration/aws
steps:
- name: download-artifacts
conditions:
@ -2746,12 +2750,106 @@ spec:
- name: talosctl-cni-bundle
conditions:
- only-on-schedule
- name: images-essential
conditions:
- only-on-schedule
- name: image-aws
environment:
PLATFORM: linux/amd64,linux/arm64
IMAGE_REGISTRY: registry.dev.siderolabs.io
- name: checkout extensions
checkoutStep:
repository: siderolabs/extensions
ref: main
path: _out/extensions
- name: set variables
nonMakeStep: true
command: cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
# zfs is only added since it uses libtirpc from musl and nvidia needs libtirpc from glibc
# this verifies that both libtirpc variants can coexist
command: nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production zfs extensions-metadata
arguments:
- -C
- _out/extensions
environment:
PLATFORM: linux/amd64
PUSH: true
REGISTRY: registry.dev.siderolabs.io
- name: e2e-aws-prepare
environment:
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
E2E_AWS_TARGET: nvidia-oss-production
IMAGE_REGISTRY: registry.dev.siderolabs.io
- name: checkout contrib
checkoutStep:
repository: siderolabs/contrib
ref: main
path: _out/contrib
- name: setup tf
terraformStep: true
- name: tf apply
command: e2e-cloud-tf
environment:
TF_SCRIPT_DIR: _out/contrib
TF_E2E_TEST_TYPE: aws
TF_E2E_ACTION: apply
- name: e2e-aws-nvidia-oss-production
command: e2e-aws
environment:
EXTRA_TEST_ARGS: "-talos.extensions.nvidia -talos.verifyukibooted=false"
- name: tf destroy
command: e2e-cloud-tf
conditions:
- always
environment:
TF_SCRIPT_DIR: _out/contrib
TF_E2E_TEST_TYPE: aws
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: false
- name: integration-aws-nvidia-nonfree-lts
buildxOptions:
enabled: true
sops: true
depends:
- default
runners:
- self-hosted
- generic # we can use generic here since the tests run against a remote talos cluster
crons:
- '30 7 * * *'
triggerLabels:
- integration/aws-nvidia-nonfree-lts
- integration/aws-nvidia-nonfree
- integration/aws-nvidia
- integration/aws
steps:
- name: download-artifacts
conditions:
- not-on-schedule
artifactStep:
type: download
artifactName: talos-artifacts
artifactPath: _out
- name: ci-temp-release-tag
conditions:
- not-on-schedule
- name: generate
conditions:
- only-on-schedule
- name: uki-certs
conditions:
- only-on-schedule
environment:
PLATFORM: linux/amd64
- name: build
conditions:
- only-on-schedule
command: talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64
environment:
PLATFORM: linux/amd64,linux/arm64
IMAGE_REGISTRY: registry.dev.siderolabs.io
PUSH: true
- name: talosctl-cni-bundle
conditions:
- only-on-schedule
- name: image-aws
environment:
PLATFORM: linux/amd64,linux/arm64
@ -2776,7 +2874,7 @@ spec:
- name: e2e-aws-prepare
environment:
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
E2E_AWS_TARGET: nvidia-nonfree
E2E_AWS_TARGET: nvidia-nonfree-lts
IMAGE_REGISTRY: registry.dev.siderolabs.io
- name: checkout contrib
checkoutStep:
@ -2791,7 +2889,105 @@ spec:
TF_SCRIPT_DIR: _out/contrib
TF_E2E_TEST_TYPE: aws
TF_E2E_ACTION: apply
- name: e2e-aws-nvidia-nonfree
- name: e2e-aws-nvidia-nonfree-lts
command: e2e-aws
environment:
EXTRA_TEST_ARGS: -talos.extensions.nvidia
- name: tf destroy
command: e2e-cloud-tf
conditions:
- always
environment:
TF_SCRIPT_DIR: _out/contrib
TF_E2E_TEST_TYPE: aws
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: false
- name: integration-aws-nvidia-nonfree-production
buildxOptions:
enabled: true
sops: true
depends:
- default
runners:
- self-hosted
- generic # we can use generic here since the tests run against a remote talos cluster
crons:
- '30 7 * * *'
triggerLabels:
- integration/aws-nvidia-nonfree-production
- integration/aws-nvidia-nonfree
- integration/aws-nvidia
- integration/aws
steps:
- name: download-artifacts
conditions:
- not-on-schedule
artifactStep:
type: download
artifactName: talos-artifacts
artifactPath: _out
- name: ci-temp-release-tag
conditions:
- not-on-schedule
- name: generate
conditions:
- only-on-schedule
- name: uki-certs
conditions:
- only-on-schedule
environment:
PLATFORM: linux/amd64
- name: build
conditions:
- only-on-schedule
command: talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64
environment:
PLATFORM: linux/amd64,linux/arm64
IMAGE_REGISTRY: registry.dev.siderolabs.io
PUSH: true
- name: talosctl-cni-bundle
conditions:
- only-on-schedule
- name: image-aws
environment:
PLATFORM: linux/amd64,linux/arm64
IMAGE_REGISTRY: registry.dev.siderolabs.io
- name: checkout extensions
checkoutStep:
repository: siderolabs/extensions
ref: main
path: _out/extensions
- name: set variables
nonMakeStep: true
command: cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
command: nvidia-container-toolkit-production nonfree-kmod-nvidia-production extensions-metadata
arguments:
- -C
- _out/extensions
environment:
PLATFORM: linux/amd64
PUSH: true
REGISTRY: registry.dev.siderolabs.io
- name: e2e-aws-prepare
environment:
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
E2E_AWS_TARGET: nvidia-nonfree-production
IMAGE_REGISTRY: registry.dev.siderolabs.io
- name: checkout contrib
checkoutStep:
repository: siderolabs/contrib
ref: main
path: _out/contrib
- name: setup tf
terraformStep: true
- name: tf apply
command: e2e-cloud-tf
environment:
TF_SCRIPT_DIR: _out/contrib
TF_E2E_TEST_TYPE: aws
TF_E2E_ACTION: apply
- name: e2e-aws-nvidia-nonfree-production
command: e2e-aws
environment:
EXTRA_TEST_ARGS: -talos.extensions.nvidia
@ -2898,6 +3094,7 @@ spec:
- customType: regex
managerFilePatterns:
- internal/integration/k8s/constants.go
- internal/integration/api/constants.go
matchStrings:
- '\/\/\s+renovate: datasource=(?<datasource>.*?)(?:\s+extractVersion=(?<extractVersion>.+?))?(?:\s+versioning=(?<versioning>.+?))?\s+depName=(?<depName>.+?)?(?:\s+registryUrl=(?<registryUrl>.+?))?\s.*Version\s+=\s+\"(?<currentValue>.+?)\"'
versioningTemplate: "{{#if versioning}}{{versioning}}{{else}}semver{{/if}}"

View File

@ -10,7 +10,7 @@ function cloud_image_upload() {
CLOUD_IMAGES_EXTRA_ARGS=("--name-prefix=${1}" "--target-clouds=aws" "--architectures=amd64" "--aws-regions=${REGION}")
case "${1}" in
talos-e2e-nvidia-oss)
talos-e2e-nvidia-oss-*)
CLOUD_IMAGES_EXTRA_ARGS+=("--aws-force-bios")
;;
esac
@ -24,15 +24,21 @@ function get_ami_id() {
function cloud_image_upload_with_extensions() {
case "${1}" in
nvidia-oss)
nvidia-oss-lts)
EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nvidia-open-gpu-kernel-modules-lts") or contains("nvidia-container-toolkit-lts") or contains("zfs")) and (contains("nvidia-fabricmanager") or contains("nonfree-kmod-nvidia") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
;;
nvidia-oss-production)
EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nvidia-open-gpu-kernel-modules-production") or contains("nvidia-container-toolkit-production") or contains("zfs")) and (contains("nvidia-fabricmanager") or contains("nonfree-kmod-nvidia") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
;;
nvidia-oss-fabricmanager)
EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nvidia-open-gpu-kernel-modules-production") or contains("nvidia-container-toolkit-production")) and (contains("nonfree-kmod-nvidia") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
;;
nvidia-nonfree)
nvidia-nonfree-lts)
EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nonfree-kmod-nvidia-lts") or contains("nvidia-container-toolkit-lts")) and (contains("nvidia-fabricmanager") or contains("nvidia-open-gpu-kernel-modules") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
;;
nvidia-nonfree-production)
EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nonfree-kmod-nvidia-production") or contains("nvidia-container-toolkit-production")) and (contains("nvidia-fabricmanager") or contains("nvidia-open-gpu-kernel-modules") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
;;
nvidia-nonfree-fabricmanager)
EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nonfree-kmod-nvidia-lts") or contains("nvidia-container-toolkit-lts")) and (contains("nvidia-open-gpu-kernel-modules") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
;;
@ -65,7 +71,7 @@ esac
mkdir -p "${ARTIFACTS}/e2e-aws-generated"
NAME_PREFIX="talos-e2e-${SHA}-aws-${E2E_AWS_TARGET}"
NAME_PREFIX="${SHA}-${E2E_AWS_TARGET}"
jq --null-input \
--arg WORKER_GROUP "${WORKER_GROUP}" \

View File

@ -0,0 +1,16 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
//go:build integration_api
package api
const (
	// NvidiaDevicePluginChartVersion is the version of the NVIDIA device plugin chart to use.
	// renovate: datasource=helm versioning=helm depName=nvidia-device-plugin registryUrl=https://nvidia.github.io/k8s-device-plugin
	NvidiaDevicePluginChartVersion = "v0.17.2"
	// NvidiaCUDATestImageVersion is the version (image tag) of the NVIDIA CUDA test image to use.
	// renovate: datasource=docker depName=nvcr.io/nvidia/k8s/cuda-sample
	NvidiaCUDATestImageVersion = "vectoradd-cuda12.5.0"
)

View File

@ -8,12 +8,13 @@ package api
import (
"context"
_ "embed"
"fmt"
"io"
"time"
"github.com/siderolabs/go-pointer"
"github.com/siderolabs/go-retry/retry"
appsv1 "k8s.io/api/apps/v1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
nodev1 "k8s.io/api/node/v1"
@ -22,6 +23,9 @@ import (
"github.com/siderolabs/talos/internal/integration/base"
)
//go:embed testdata/nvidia-device-plugin.yaml
var nvidiaDevicePluginHelmChartValues []byte
// ExtensionsSuiteNVIDIA verifies Talos is securebooted.
type ExtensionsSuiteNVIDIA struct {
base.K8sSuite
@ -97,13 +101,18 @@ func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() {
suite.Require().NoError(err)
_, err = suite.Clientset.AppsV1().DaemonSets("kube-system").Create(suite.ctx, nvidiaDevicePluginDaemonSetSpec(), metav1.CreateOptions{})
defer suite.Clientset.AppsV1().DaemonSets("kube-system").Delete(suite.ctx, "nvidia-device-plugin", metav1.DeleteOptions{}) //nolint:errcheck
suite.Require().NoError(err)
suite.Require().NoError(suite.HelmInstall(
suite.ctx,
"kube-system",
"https://nvidia.github.io/k8s-device-plugin",
NvidiaDevicePluginChartVersion,
"nvidia-device-plugin",
"nvidia-device-plugin",
nvidiaDevicePluginHelmChartValues,
))
// now we can create a cuda test job
_, err = suite.Clientset.BatchV1().Jobs("default").Create(suite.ctx, nvidiaCUDATestJob("nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1"), metav1.CreateOptions{})
_, err = suite.Clientset.BatchV1().Jobs("default").Create(suite.ctx, nvidiaCUDATestJob(), metav1.CreateOptions{})
defer suite.Clientset.BatchV1().Jobs("default").Delete(suite.ctx, "cuda-test", metav1.DeleteOptions{}) //nolint:errcheck
suite.Require().NoError(err)
@ -199,80 +208,7 @@ func (suite *ExtensionsSuiteNVIDIA) getNVIDIANodes(labelQuery string) []string {
return nodeList
}
// nvidiaDevicePluginDaemonSetSpec builds the DaemonSet manifest for the NVIDIA
// device plugin container, running under the "nvidia" runtime class with
// host access to the kubelet device-plugin socket directory.
func nvidiaDevicePluginDaemonSetSpec() *appsv1.DaemonSet {
	// The selector and the pod template must carry the same label set.
	labels := map[string]string{
		"app.kubernetes.io/name": "nvidia-device-plugin",
	}

	pluginContainer := corev1.Container{
		Name:  "nvidia-device-plugin-ctr",
		Image: "nvcr.io/nvidia/k8s-device-plugin:v0.14.1",
		Env: []corev1.EnvVar{
			{
				Name:  "NVIDIA_MIG_MONITOR_DEVICES",
				Value: "all",
			},
		},
		// SYS_ADMIN is needed for MIG device monitoring.
		SecurityContext: &corev1.SecurityContext{
			Capabilities: &corev1.Capabilities{
				Add: []corev1.Capability{"SYS_ADMIN"},
			},
		},
		VolumeMounts: []corev1.VolumeMount{
			{
				Name:      "device-plugin",
				MountPath: "/var/lib/kubelet/device-plugins",
			},
		},
	}

	podSpec := corev1.PodSpec{
		PriorityClassName: "system-node-critical",
		RuntimeClassName:  pointer.To("nvidia"),
		Containers:        []corev1.Container{pluginContainer},
		Volumes: []corev1.Volume{
			{
				Name: "device-plugin",
				VolumeSource: corev1.VolumeSource{
					HostPath: &corev1.HostPathVolumeSource{
						Path: "/var/lib/kubelet/device-plugins",
					},
				},
			},
		},
		// Tolerate critical-addon and GPU taints so the plugin lands on GPU nodes.
		Tolerations: []corev1.Toleration{
			{
				Key:      "CriticalAddonsOnly",
				Operator: corev1.TolerationOpExists,
			},
			{
				Effect:   corev1.TaintEffectNoSchedule,
				Key:      "nvidia.com/gpu",
				Operator: corev1.TolerationOpExists,
			},
		},
	}

	return &appsv1.DaemonSet{
		ObjectMeta: metav1.ObjectMeta{
			Name: "nvidia-device-plugin",
		},
		Spec: appsv1.DaemonSetSpec{
			Selector: &metav1.LabelSelector{
				MatchLabels: labels,
			},
			UpdateStrategy: appsv1.DaemonSetUpdateStrategy{
				Type: appsv1.RollingUpdateDaemonSetStrategyType,
			},
			Template: corev1.PodTemplateSpec{
				ObjectMeta: metav1.ObjectMeta{
					Labels: labels,
				},
				Spec: podSpec,
			},
		},
	}
}
func nvidiaCUDATestJob(image string) *batchv1.Job {
func nvidiaCUDATestJob() *batchv1.Job {
return &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: "cuda-test",
@ -290,7 +226,7 @@ func nvidiaCUDATestJob(image string) *batchv1.Job {
Containers: []corev1.Container{
{
Name: "cuda-test",
Image: image,
Image: fmt.Sprintf("nvcr.io/nvidia/k8s/cuda-sample:%s", NvidiaCUDATestImageVersion),
},
},
Affinity: &corev1.Affinity{

View File

@ -0,0 +1 @@
runtimeClassName: nvidia

View File

@ -57,16 +57,17 @@ The NVIDIA modules should be loaded and the system extension should be installed
This can be confirmed by running:
```bash
talosctl read /proc/modules
talosctl get modules
```
which should produce an output similar to below:
```text
nvidia_uvm 1146880 - - Live 0xffffffffc2733000 (PO)
nvidia_drm 69632 - - Live 0xffffffffc2721000 (PO)
nvidia_modeset 1142784 - - Live 0xffffffffc25ea000 (PO)
nvidia 39047168 - - Live 0xffffffffc00ac000 (PO)
NODE NAMESPACE TYPE ID VERSION STATE
10.5.0.3 runtime LoadedKernelModule nvidia_uvm 1 Live
10.5.0.3 runtime LoadedKernelModule nvidia_drm 1 Live
10.5.0.3 runtime LoadedKernelModule nvidia_modeset 1 Live
10.5.0.3 runtime LoadedKernelModule nvidia 1 Live
```
```bash
@ -81,17 +82,6 @@ NODE NAMESPACE TYPE ID
172.31.41.27 runtime ExtensionStatus 000.ghcr.io-siderolabs-nvidia-open-gpu-kernel-modules-515.65.01-v1.2.0 1 nvidia-open-gpu-kernel-modules 515.65.01-v1.2.0
```
```bash
talosctl read /proc/driver/nvidia/version
```
which should produce an output similar to below:
```text
NVRM version: NVIDIA UNIX x86_64 Kernel Module 515.65.01 Wed Mar 16 11:24:05 UTC 2022
GCC version: gcc version 12.2.0 (GCC)
```
## Deploying NVIDIA device plugin
First we need to create the `RuntimeClass`
@ -151,7 +141,7 @@ kubectl run \
nvidia-test \
--restart=Never \
-ti --rm \
--image nvcr.io/nvidia/cuda:12.5.0-base-ubuntu22.04 \
--image nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0 \
--overrides '{"spec": {"runtimeClassName": "nvidia"}}' \
nvidia-smi
```