feat(ci): add nvidia arm64 matrix

Add NVIDIA arm64 test matrix.

Also ensure we have a known baseline for the NVIDIA CDI files,
so that if upstream adds more files, or we fail to install them to the
right location, the test will fail.

Signed-off-by: Noel Georgi <git@frezbo.dev>
(cherry picked from commit 6a3ab87c54f83f70869a2e298e6ed7722cf4afad)
This commit is contained in:
Noel Georgi 2026-04-08 09:52:13 +08:00 committed by Andrey Smirnov
parent cd73b4a822
commit 67a34a6eb3
No known key found for this signature in database
GPG Key ID: 322C6F63F594CE7C
13 changed files with 1745 additions and 15 deletions

View File

@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2026-04-15T14:42:53Z by kres b6d29bf.
# Generated on 2026-04-15T14:54:12Z by kres b6d29bf.
concurrency:
group: ${{ github.head_ref || github.run_id }}
@ -972,6 +972,156 @@ jobs:
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
integration-aws-nvidia-nonfree-lts-arm64:
permissions:
actions: read
contents: write
issues: read
packages: write
pull-requests: read
runs-on:
group: generic
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree-lts-arm64') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia')
needs:
- default
steps:
- name: gather-system-info
id: system-info
uses: kenchan0130/actions-system-info@59699597e84e80085a750998045983daa49274c4 # version: v1.4.0
continue-on-error: true
- name: print-system-info
run: |
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
OUTPUTS=(
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
"Hostname: ${{ steps.system-info.outputs.hostname }}"
"NodeName: ${NODE_NAME}"
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
"Name: ${{ steps.system-info.outputs.name }}"
"Platform: ${{ steps.system-info.outputs.platform }}"
"Release: ${{ steps.system-info.outputs.release }}"
"Total memory: ${MEMORY_GB} GB"
)
for OUTPUT in "${OUTPUTS[@]}";do
echo "${OUTPUT}"
done
continue-on-error: true
- name: checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
- name: Unshallow
run: |
git fetch --prune --unshallow
- name: Set up Docker Buildx
id: setup-buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # version: v4.0.0
with:
driver: remote
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
timeout-minutes: 10
- name: Mask secrets
run: |
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
- name: Set secrets for job
run: |
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
- name: Download artifacts
if: github.event_name != 'schedule'
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # version: v8.0.1
with:
name: talos-artifacts
path: _out
- name: Fix artifact permissions
if: github.event_name != 'schedule'
run: |
xargs -a _out/executable-artifacts -I {} chmod +x {}
- name: ci-temp-release-tag
if: github.event_name != 'schedule'
run: |
make ci-temp-release-tag
- name: generate
if: github.event_name == 'schedule'
run: |
make generate
- name: uki-certs
if: github.event_name == 'schedule'
env:
PLATFORM: linux/amd64
run: |
make uki-certs
- name: build
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
PUSH: "true"
run: |
make installer-base imager _out/integration-test-linux-amd64
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
run: |
make image-aws
- name: checkout extensions
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/extensions
ref: main
repository: siderolabs/extensions
- name: set variables
run: |
cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
env:
PLATFORM: linux/arm64
PUSH: "true"
REGISTRY: registry.dev.siderolabs.io
run: |
make nvidia-container-toolkit-lts nonfree-kmod-nvidia-lts extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-nonfree-lts
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
TARGET_ARCH: arm64
run: |
make e2e-aws-prepare
- name: checkout contrib
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/contrib
ref: main
repository: siderolabs/contrib
- name: setup tf
uses: hashicorp/setup-terraform@5e8dbf3c6d9deaf4193ca7a8fb23f2ac83bb6c85 # version: v4.0.0
with:
terraform_wrapper: "false"
- name: tf apply
env:
TF_E2E_ACTION: apply
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-nonfree-lts
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
INTEGRATION_TEST_RUN: TestIntegration/api.ExtensionsSuiteNVIDIA
run: |
make e2e-aws
- name: tf destroy
if: always()
env:
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: "false"
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
integration-aws-nvidia-nonfree-production:
permissions:
actions: read
@ -1125,6 +1275,156 @@ jobs:
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
integration-aws-nvidia-nonfree-production-arm64:
permissions:
actions: read
contents: write
issues: read
packages: write
pull-requests: read
runs-on:
group: generic
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree-production-arm64') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia')
needs:
- default
steps:
- name: gather-system-info
id: system-info
uses: kenchan0130/actions-system-info@59699597e84e80085a750998045983daa49274c4 # version: v1.4.0
continue-on-error: true
- name: print-system-info
run: |
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
OUTPUTS=(
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
"Hostname: ${{ steps.system-info.outputs.hostname }}"
"NodeName: ${NODE_NAME}"
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
"Name: ${{ steps.system-info.outputs.name }}"
"Platform: ${{ steps.system-info.outputs.platform }}"
"Release: ${{ steps.system-info.outputs.release }}"
"Total memory: ${MEMORY_GB} GB"
)
for OUTPUT in "${OUTPUTS[@]}";do
echo "${OUTPUT}"
done
continue-on-error: true
- name: checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
- name: Unshallow
run: |
git fetch --prune --unshallow
- name: Set up Docker Buildx
id: setup-buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # version: v4.0.0
with:
driver: remote
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
timeout-minutes: 10
- name: Mask secrets
run: |
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
- name: Set secrets for job
run: |
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
- name: Download artifacts
if: github.event_name != 'schedule'
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # version: v8.0.1
with:
name: talos-artifacts
path: _out
- name: Fix artifact permissions
if: github.event_name != 'schedule'
run: |
xargs -a _out/executable-artifacts -I {} chmod +x {}
- name: ci-temp-release-tag
if: github.event_name != 'schedule'
run: |
make ci-temp-release-tag
- name: generate
if: github.event_name == 'schedule'
run: |
make generate
- name: uki-certs
if: github.event_name == 'schedule'
env:
PLATFORM: linux/amd64
run: |
make uki-certs
- name: build
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
PUSH: "true"
run: |
make installer-base imager _out/integration-test-linux-amd64
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
run: |
make image-aws
- name: checkout extensions
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/extensions
ref: main
repository: siderolabs/extensions
- name: set variables
run: |
cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
env:
PLATFORM: linux/arm64
PUSH: "true"
REGISTRY: registry.dev.siderolabs.io
run: |
make nvidia-container-toolkit-production nonfree-kmod-nvidia-production extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-nonfree-production
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
TARGET_ARCH: arm64
run: |
make e2e-aws-prepare
- name: checkout contrib
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/contrib
ref: main
repository: siderolabs/contrib
- name: setup tf
uses: hashicorp/setup-terraform@5e8dbf3c6d9deaf4193ca7a8fb23f2ac83bb6c85 # version: v4.0.0
with:
terraform_wrapper: "false"
- name: tf apply
env:
TF_E2E_ACTION: apply
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-nonfree-production
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
INTEGRATION_TEST_RUN: TestIntegration/api.ExtensionsSuiteNVIDIA
run: |
make e2e-aws
- name: tf destroy
if: always()
env:
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: "false"
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
integration-aws-nvidia-oss-lts:
permissions:
actions: read
@ -1278,6 +1578,156 @@ jobs:
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
integration-aws-nvidia-oss-lts-arm64:
permissions:
actions: read
contents: write
issues: read
packages: write
pull-requests: read
runs-on:
group: generic
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss-lts-arm64') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia')
needs:
- default
steps:
- name: gather-system-info
id: system-info
uses: kenchan0130/actions-system-info@59699597e84e80085a750998045983daa49274c4 # version: v1.4.0
continue-on-error: true
- name: print-system-info
run: |
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
OUTPUTS=(
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
"Hostname: ${{ steps.system-info.outputs.hostname }}"
"NodeName: ${NODE_NAME}"
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
"Name: ${{ steps.system-info.outputs.name }}"
"Platform: ${{ steps.system-info.outputs.platform }}"
"Release: ${{ steps.system-info.outputs.release }}"
"Total memory: ${MEMORY_GB} GB"
)
for OUTPUT in "${OUTPUTS[@]}";do
echo "${OUTPUT}"
done
continue-on-error: true
- name: checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
- name: Unshallow
run: |
git fetch --prune --unshallow
- name: Set up Docker Buildx
id: setup-buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # version: v4.0.0
with:
driver: remote
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
timeout-minutes: 10
- name: Mask secrets
run: |
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
- name: Set secrets for job
run: |
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
- name: Download artifacts
if: github.event_name != 'schedule'
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # version: v8.0.1
with:
name: talos-artifacts
path: _out
- name: Fix artifact permissions
if: github.event_name != 'schedule'
run: |
xargs -a _out/executable-artifacts -I {} chmod +x {}
- name: ci-temp-release-tag
if: github.event_name != 'schedule'
run: |
make ci-temp-release-tag
- name: generate
if: github.event_name == 'schedule'
run: |
make generate
- name: uki-certs
if: github.event_name == 'schedule'
env:
PLATFORM: linux/amd64
run: |
make uki-certs
- name: build
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
PUSH: "true"
run: |
make installer-base imager _out/integration-test-linux-amd64
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
run: |
make image-aws
- name: checkout extensions
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/extensions
ref: main
repository: siderolabs/extensions
- name: set variables
run: |
cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
env:
PLATFORM: linux/arm64
PUSH: "true"
REGISTRY: registry.dev.siderolabs.io
run: |
make nvidia-container-toolkit-lts nvidia-open-gpu-kernel-modules-lts extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-oss-lts
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
TARGET_ARCH: arm64
run: |
make e2e-aws-prepare
- name: checkout contrib
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/contrib
ref: main
repository: siderolabs/contrib
- name: setup tf
uses: hashicorp/setup-terraform@5e8dbf3c6d9deaf4193ca7a8fb23f2ac83bb6c85 # version: v4.0.0
with:
terraform_wrapper: "false"
- name: tf apply
env:
TF_E2E_ACTION: apply
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-oss-lts
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
INTEGRATION_TEST_RUN: TestIntegration/api.ExtensionsSuiteNVIDIA
run: |
make e2e-aws
- name: tf destroy
if: always()
env:
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: "false"
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
integration-aws-nvidia-oss-production:
permissions:
actions: read
@ -1431,6 +1881,156 @@ jobs:
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
integration-aws-nvidia-oss-production-arm64:
permissions:
actions: read
contents: write
issues: read
packages: write
pull-requests: read
runs-on:
group: generic
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss-production-arm64') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia')
needs:
- default
steps:
- name: gather-system-info
id: system-info
uses: kenchan0130/actions-system-info@59699597e84e80085a750998045983daa49274c4 # version: v1.4.0
continue-on-error: true
- name: print-system-info
run: |
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
OUTPUTS=(
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
"Hostname: ${{ steps.system-info.outputs.hostname }}"
"NodeName: ${NODE_NAME}"
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
"Name: ${{ steps.system-info.outputs.name }}"
"Platform: ${{ steps.system-info.outputs.platform }}"
"Release: ${{ steps.system-info.outputs.release }}"
"Total memory: ${MEMORY_GB} GB"
)
for OUTPUT in "${OUTPUTS[@]}";do
echo "${OUTPUT}"
done
continue-on-error: true
- name: checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
- name: Unshallow
run: |
git fetch --prune --unshallow
- name: Set up Docker Buildx
id: setup-buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # version: v4.0.0
with:
driver: remote
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
timeout-minutes: 10
- name: Mask secrets
run: |
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
- name: Set secrets for job
run: |
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
- name: Download artifacts
if: github.event_name != 'schedule'
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # version: v8.0.1
with:
name: talos-artifacts
path: _out
- name: Fix artifact permissions
if: github.event_name != 'schedule'
run: |
xargs -a _out/executable-artifacts -I {} chmod +x {}
- name: ci-temp-release-tag
if: github.event_name != 'schedule'
run: |
make ci-temp-release-tag
- name: generate
if: github.event_name == 'schedule'
run: |
make generate
- name: uki-certs
if: github.event_name == 'schedule'
env:
PLATFORM: linux/amd64
run: |
make uki-certs
- name: build
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
PUSH: "true"
run: |
make installer-base imager _out/integration-test-linux-amd64
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
run: |
make image-aws
- name: checkout extensions
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/extensions
ref: main
repository: siderolabs/extensions
- name: set variables
run: |
cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
env:
PLATFORM: linux/arm64
PUSH: "true"
REGISTRY: registry.dev.siderolabs.io
run: |
make nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-oss-production
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
TARGET_ARCH: arm64
run: |
make e2e-aws-prepare
- name: checkout contrib
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/contrib
ref: main
repository: siderolabs/contrib
- name: setup tf
uses: hashicorp/setup-terraform@5e8dbf3c6d9deaf4193ca7a8fb23f2ac83bb6c85 # version: v4.0.0
with:
terraform_wrapper: "false"
- name: tf apply
env:
TF_E2E_ACTION: apply
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-oss-production
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
INTEGRATION_TEST_RUN: TestIntegration/api.ExtensionsSuiteNVIDIA
run: |
make e2e-aws
- name: tf destroy
if: always()
env:
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: "false"
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
integration-cilium:
permissions:
actions: read

View File

@ -0,0 +1,153 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2026-04-11T06:20:55Z by kres b6d29bf.
concurrency:
group: ${{ github.head_ref || github.run_id }}
cancel-in-progress: true
"on":
schedule:
- cron: 30 7 * * *
name: integration-aws-nvidia-nonfree-lts-arm64-cron
jobs:
default:
runs-on:
group: generic
steps:
- name: gather-system-info
id: system-info
uses: kenchan0130/actions-system-info@59699597e84e80085a750998045983daa49274c4 # version: v1.4.0
continue-on-error: true
- name: print-system-info
run: |
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
OUTPUTS=(
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
"Hostname: ${{ steps.system-info.outputs.hostname }}"
"NodeName: ${NODE_NAME}"
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
"Name: ${{ steps.system-info.outputs.name }}"
"Platform: ${{ steps.system-info.outputs.platform }}"
"Release: ${{ steps.system-info.outputs.release }}"
"Total memory: ${MEMORY_GB} GB"
)
for OUTPUT in "${OUTPUTS[@]}";do
echo "${OUTPUT}"
done
continue-on-error: true
- name: checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
- name: Unshallow
run: |
git fetch --prune --unshallow
- name: Set up Docker Buildx
id: setup-buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # version: v4.0.0
with:
driver: remote
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
timeout-minutes: 10
- name: Mask secrets
run: |
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
- name: Set secrets for job
run: |
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
- name: Download artifacts
if: github.event_name != 'schedule'
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # version: v8.0.1
with:
name: talos-artifacts
path: _out
- name: Fix artifact permissions
if: github.event_name != 'schedule'
run: |
xargs -a _out/executable-artifacts -I {} chmod +x {}
- name: ci-temp-release-tag
if: github.event_name != 'schedule'
run: |
make ci-temp-release-tag
- name: generate
if: github.event_name == 'schedule'
run: |
make generate
- name: uki-certs
if: github.event_name == 'schedule'
env:
PLATFORM: linux/amd64
run: |
make uki-certs
- name: build
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
PUSH: "true"
run: |
make installer-base imager _out/integration-test-linux-amd64
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
run: |
make image-aws
- name: checkout extensions
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/extensions
ref: main
repository: siderolabs/extensions
- name: set variables
run: |
cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
env:
PLATFORM: linux/arm64
PUSH: "true"
REGISTRY: registry.dev.siderolabs.io
run: |
make nvidia-container-toolkit-lts nonfree-kmod-nvidia-lts extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-nonfree-lts
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
TARGET_ARCH: arm64
run: |
make e2e-aws-prepare
- name: checkout contrib
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/contrib
ref: main
repository: siderolabs/contrib
- name: setup tf
uses: hashicorp/setup-terraform@5e8dbf3c6d9deaf4193ca7a8fb23f2ac83bb6c85 # version: v4.0.0
with:
terraform_wrapper: "false"
- name: tf apply
env:
TF_E2E_ACTION: apply
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-nonfree-lts
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
INTEGRATION_TEST_RUN: TestIntegration/api.ExtensionsSuiteNVIDIA
run: |
make e2e-aws
- name: tf destroy
if: always()
env:
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: "false"
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf

View File

@ -0,0 +1,153 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2026-04-11T06:20:55Z by kres b6d29bf.
concurrency:
group: ${{ github.head_ref || github.run_id }}
cancel-in-progress: true
"on":
schedule:
- cron: 30 7 * * *
name: integration-aws-nvidia-nonfree-production-arm64-cron
jobs:
default:
runs-on:
group: generic
steps:
- name: gather-system-info
id: system-info
uses: kenchan0130/actions-system-info@59699597e84e80085a750998045983daa49274c4 # version: v1.4.0
continue-on-error: true
- name: print-system-info
run: |
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
OUTPUTS=(
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
"Hostname: ${{ steps.system-info.outputs.hostname }}"
"NodeName: ${NODE_NAME}"
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
"Name: ${{ steps.system-info.outputs.name }}"
"Platform: ${{ steps.system-info.outputs.platform }}"
"Release: ${{ steps.system-info.outputs.release }}"
"Total memory: ${MEMORY_GB} GB"
)
for OUTPUT in "${OUTPUTS[@]}";do
echo "${OUTPUT}"
done
continue-on-error: true
- name: checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
- name: Unshallow
run: |
git fetch --prune --unshallow
- name: Set up Docker Buildx
id: setup-buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # version: v4.0.0
with:
driver: remote
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
timeout-minutes: 10
- name: Mask secrets
run: |
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
- name: Set secrets for job
run: |
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
- name: Download artifacts
if: github.event_name != 'schedule'
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # version: v8.0.1
with:
name: talos-artifacts
path: _out
- name: Fix artifact permissions
if: github.event_name != 'schedule'
run: |
xargs -a _out/executable-artifacts -I {} chmod +x {}
- name: ci-temp-release-tag
if: github.event_name != 'schedule'
run: |
make ci-temp-release-tag
- name: generate
if: github.event_name == 'schedule'
run: |
make generate
- name: uki-certs
if: github.event_name == 'schedule'
env:
PLATFORM: linux/amd64
run: |
make uki-certs
- name: build
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
PUSH: "true"
run: |
make installer-base imager _out/integration-test-linux-amd64
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
run: |
make image-aws
- name: checkout extensions
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/extensions
ref: main
repository: siderolabs/extensions
- name: set variables
run: |
cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
env:
PLATFORM: linux/arm64
PUSH: "true"
REGISTRY: registry.dev.siderolabs.io
run: |
make nvidia-container-toolkit-production nonfree-kmod-nvidia-production extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-nonfree-production
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
TARGET_ARCH: arm64
run: |
make e2e-aws-prepare
- name: checkout contrib
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/contrib
ref: main
repository: siderolabs/contrib
- name: setup tf
uses: hashicorp/setup-terraform@5e8dbf3c6d9deaf4193ca7a8fb23f2ac83bb6c85 # version: v4.0.0
with:
terraform_wrapper: "false"
- name: tf apply
env:
TF_E2E_ACTION: apply
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-nonfree-production
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
INTEGRATION_TEST_RUN: TestIntegration/api.ExtensionsSuiteNVIDIA
run: |
make e2e-aws
- name: tf destroy
if: always()
env:
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: "false"
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf

View File

@ -0,0 +1,153 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2026-04-08T12:37:05Z by kres b6d29bf.
concurrency:
group: ${{ github.head_ref || github.run_id }}
cancel-in-progress: true
"on":
schedule:
- cron: 30 7 * * *
name: integration-aws-nvidia-oss-lts-arm64-cron
jobs:
default:
runs-on:
group: generic
steps:
- name: gather-system-info
id: system-info
uses: kenchan0130/actions-system-info@59699597e84e80085a750998045983daa49274c4 # version: v1.4.0
continue-on-error: true
- name: print-system-info
run: |
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
OUTPUTS=(
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
"Hostname: ${{ steps.system-info.outputs.hostname }}"
"NodeName: ${NODE_NAME}"
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
"Name: ${{ steps.system-info.outputs.name }}"
"Platform: ${{ steps.system-info.outputs.platform }}"
"Release: ${{ steps.system-info.outputs.release }}"
"Total memory: ${MEMORY_GB} GB"
)
for OUTPUT in "${OUTPUTS[@]}";do
echo "${OUTPUT}"
done
continue-on-error: true
- name: checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
- name: Unshallow
run: |
git fetch --prune --unshallow
- name: Set up Docker Buildx
id: setup-buildx
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # version: v4.0.0
with:
driver: remote
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
timeout-minutes: 10
- name: Mask secrets
run: |
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
- name: Set secrets for job
run: |
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
- name: Download artifacts
if: github.event_name != 'schedule'
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # version: v8.0.1
with:
name: talos-artifacts
path: _out
- name: Fix artifact permissions
if: github.event_name != 'schedule'
run: |
xargs -a _out/executable-artifacts -I {} chmod +x {}
- name: ci-temp-release-tag
if: github.event_name != 'schedule'
run: |
make ci-temp-release-tag
- name: generate
if: github.event_name == 'schedule'
run: |
make generate
- name: uki-certs
if: github.event_name == 'schedule'
env:
PLATFORM: linux/amd64
run: |
make uki-certs
- name: build
if: github.event_name == 'schedule'
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
PUSH: "true"
run: |
make installer-base imager _out/integration-test-linux-amd64
- name: image-aws
env:
IMAGE_REGISTRY: registry.dev.siderolabs.io
PLATFORM: linux/arm64
run: |
make image-aws
- name: checkout extensions
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/extensions
ref: main
repository: siderolabs/extensions
- name: set variables
run: |
cat _out/talos-metadata >> "$GITHUB_ENV"
- name: build extensions
env:
PLATFORM: linux/arm64
PUSH: "true"
REGISTRY: registry.dev.siderolabs.io
run: |
make nvidia-container-toolkit-lts nvidia-open-gpu-kernel-modules-lts extensions-metadata -C _out/extensions
- name: e2e-aws-prepare
env:
E2E_AWS_TARGET: nvidia-oss-lts
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
IMAGE_REGISTRY: registry.dev.siderolabs.io
TARGET_ARCH: arm64
run: |
make e2e-aws-prepare
- name: checkout contrib
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
with:
path: _out/contrib
ref: main
repository: siderolabs/contrib
- name: setup tf
uses: hashicorp/setup-terraform@5e8dbf3c6d9deaf4193ca7a8fb23f2ac83bb6c85 # version: v4.0.0
with:
terraform_wrapper: "false"
- name: tf apply
env:
TF_E2E_ACTION: apply
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf
- name: e2e-aws-nvidia-oss-lts
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
INTEGRATION_TEST_RUN: TestIntegration/api.ExtensionsSuiteNVIDIA
run: |
make e2e-aws
- name: tf destroy
if: always()
env:
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: "false"
TF_E2E_TEST_TYPE: aws
TF_SCRIPT_DIR: _out/contrib
run: |
make e2e-cloud-tf

View File

@ -0,0 +1,153 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2026-04-11T06:20:55Z by kres b6d29bf.

# Cron-only workflow: exercises the NVIDIA OSS production-branch e2e matrix on
# arm64 by building arm64 artifacts, standing up an AWS Talos cluster via
# Terraform, and running the NVIDIA extensions integration suite against it.
concurrency:
  group: ${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
"on":
  schedule:
    - cron: 30 7 * * *
name: integration-aws-nvidia-oss-production-arm64-cron
jobs:
  default:
    runs-on:
      group: generic
    steps:
      - name: gather-system-info
        id: system-info
        uses: kenchan0130/actions-system-info@59699597e84e80085a750998045983daa49274c4 # version: v1.4.0
        continue-on-error: true
      - name: print-system-info
        run: |
          MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))

          OUTPUTS=(
            "CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
            "CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
            "Hostname: ${{ steps.system-info.outputs.hostname }}"
            "NodeName: ${NODE_NAME}"
            "Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
            "Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
            "Name: ${{ steps.system-info.outputs.name }}"
            "Platform: ${{ steps.system-info.outputs.platform }}"
            "Release: ${{ steps.system-info.outputs.release }}"
            "Total memory: ${MEMORY_GB} GB"
          )

          for OUTPUT in "${OUTPUTS[@]}";do
            echo "${OUTPUT}"
          done
        continue-on-error: true
      - name: checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
      - name: Unshallow
        run: |
          git fetch --prune --unshallow
      # NOTE(review): the build steps below target linux/arm64 but this uses the
      # amd64 buildkit endpoint — presumably cross-building via buildx; confirm
      # an arm64 endpoint is not required here.
      - name: Set up Docker Buildx
        id: setup-buildx
        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # version: v4.0.0
        with:
          driver: remote
          endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
        timeout-minutes: 10
      - name: Mask secrets
        run: |
          echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
      - name: Set secrets for job
        run: |
          sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
      # The only trigger of this workflow is `schedule`, so the three steps
      # guarded by `github.event_name != 'schedule'` below never execute here;
      # they come from the generator template shared with the label-triggered
      # variant and are harmless no-ops in this file.
      - name: Download artifacts
        if: github.event_name != 'schedule'
        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # version: v8.0.1
        with:
          name: talos-artifacts
          path: _out
      - name: Fix artifact permissions
        if: github.event_name != 'schedule'
        run: |
          xargs -a _out/executable-artifacts -I {} chmod +x {}
      - name: ci-temp-release-tag
        if: github.event_name != 'schedule'
        run: |
          make ci-temp-release-tag
      - name: generate
        if: github.event_name == 'schedule'
        run: |
          make generate
      - name: uki-certs
        if: github.event_name == 'schedule'
        env:
          PLATFORM: linux/amd64
        run: |
          make uki-certs
      # Scheduled runs rebuild the installer/imager for arm64 and push them to
      # the dev registry; the integration-test binary itself stays amd64 since
      # it runs on the (amd64) CI runner, not on the cluster.
      - name: build
        if: github.event_name == 'schedule'
        env:
          IMAGE_REGISTRY: registry.dev.siderolabs.io
          PLATFORM: linux/arm64
          PUSH: "true"
        run: |
          make installer-base imager _out/integration-test-linux-amd64
      - name: image-aws
        env:
          IMAGE_REGISTRY: registry.dev.siderolabs.io
          PLATFORM: linux/arm64
        run: |
          make image-aws
      - name: checkout extensions
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
        with:
          path: _out/extensions
          ref: main
          repository: siderolabs/extensions
      # talos-metadata (produced by the build) is sourced into the job env so
      # later steps see the Talos version variables.
      - name: set variables
        run: |
          cat _out/talos-metadata >> "$GITHUB_ENV"
      # Build and push the arm64 production-branch NVIDIA OSS extensions.
      - name: build extensions
        env:
          PLATFORM: linux/arm64
          PUSH: "true"
          REGISTRY: registry.dev.siderolabs.io
        run: |
          make nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production extensions-metadata -C _out/extensions
      - name: e2e-aws-prepare
        env:
          E2E_AWS_TARGET: nvidia-oss-production
          EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
          IMAGE_REGISTRY: registry.dev.siderolabs.io
          TARGET_ARCH: arm64
        run: |
          make e2e-aws-prepare
      - name: checkout contrib
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # version: v6.0.2
        with:
          path: _out/contrib
          ref: main
          repository: siderolabs/contrib
      - name: setup tf
        uses: hashicorp/setup-terraform@5e8dbf3c6d9deaf4193ca7a8fb23f2ac83bb6c85 # version: v4.0.0
        with:
          terraform_wrapper: "false"
      - name: tf apply
        env:
          TF_E2E_ACTION: apply
          TF_E2E_TEST_TYPE: aws
          TF_SCRIPT_DIR: _out/contrib
        run: |
          make e2e-cloud-tf
      - name: e2e-aws-nvidia-oss-production
        env:
          EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
          INTEGRATION_TEST_RUN: TestIntegration/api.ExtensionsSuiteNVIDIA
        run: |
          make e2e-aws
      # Tear the AWS cluster down unconditionally so a failed e2e step cannot
      # leak cloud resources.
      - name: tf destroy
        if: always()
        env:
          TF_E2E_ACTION: destroy
          TF_E2E_REFRESH_ON_DESTROY: "false"
          TF_E2E_TEST_TYPE: aws
          TF_SCRIPT_DIR: _out/contrib
        run: |
          make e2e-cloud-tf

View File

@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2026-04-15T14:42:53Z by kres b6d29bf.
# Generated on 2026-04-15T14:54:12Z by kres b6d29bf.
"on":
workflow_run:
@ -40,9 +40,13 @@
- integration-image-factory-cron
- integration-aws-cron
- integration-aws-nvidia-oss-lts-cron
- integration-aws-nvidia-oss-lts-arm64-cron
- integration-aws-nvidia-oss-production-cron
- integration-aws-nvidia-oss-production-arm64-cron
- integration-aws-nvidia-nonfree-lts-cron
- integration-aws-nvidia-nonfree-lts-arm64-cron
- integration-aws-nvidia-nonfree-production-cron
- integration-aws-nvidia-nonfree-production-arm64-cron
- integration-gcp-cron
types:
- completed

View File

@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2026-04-15T14:42:53Z by kres b6d29bf.
# Generated on 2026-04-15T14:54:12Z by kres b6d29bf.
"on":
workflow_run:
@ -40,9 +40,13 @@
- integration-image-factory-cron
- integration-aws-cron
- integration-aws-nvidia-oss-lts-cron
- integration-aws-nvidia-oss-lts-arm64-cron
- integration-aws-nvidia-oss-production-cron
- integration-aws-nvidia-oss-production-arm64-cron
- integration-aws-nvidia-nonfree-lts-cron
- integration-aws-nvidia-nonfree-lts-arm64-cron
- integration-aws-nvidia-nonfree-production-cron
- integration-aws-nvidia-nonfree-production-arm64-cron
- integration-gcp-cron
types:
- completed

View File

@ -89,6 +89,7 @@ spec:
- integration-image-factory
- integration-aws
- integration-aws-nvidia-oss-lts
- integration-aws-nvidia-oss-lts-arm64
- integration-aws-nvidia-oss-production
- integration-aws-nvidia-nonfree-lts
- integration-aws-nvidia-nonfree-production
@ -2865,6 +2866,100 @@ spec:
TF_E2E_TEST_TYPE: aws
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: false
# Kres job definition: NVIDIA open-source kernel-modules, LTS driver branch, on
# arm64. Generates the corresponding label-triggered and cron GitHub workflows.
- name: integration-aws-nvidia-oss-lts-arm64
  buildxOptions:
    enabled: true
  sops: true
  depends:
    - default
  runnerGroup: generic # we can use generic here since the tests run against a remote talos cluster
  crons:
    - '30 7 * * *'
  # Any of these PR labels triggers the job (broadest label last).
  triggerLabels:
    - integration/aws-nvidia-oss-lts-arm64
    - integration/aws-nvidia-oss
    - integration/aws-nvidia
  steps:
    # Label-triggered runs reuse artifacts from the default job; scheduled runs
    # rebuild from scratch via the only-on-schedule steps below.
    - name: download-artifacts
      conditions:
        - not-on-schedule
      artifactStep:
        type: download
        artifactName: talos-artifacts
        artifactPath: _out
    - name: ci-temp-release-tag
      conditions:
        - not-on-schedule
    - name: generate
      conditions:
        - only-on-schedule
    - name: uki-certs
      conditions:
        - only-on-schedule
      environment:
        PLATFORM: linux/amd64
    - name: build
      conditions:
        - only-on-schedule
      command: installer-base imager _out/integration-test-linux-amd64
      environment:
        PLATFORM: linux/arm64
        IMAGE_REGISTRY: registry.dev.siderolabs.io
        PUSH: true
    - name: image-aws
      environment:
        PLATFORM: linux/arm64
        IMAGE_REGISTRY: registry.dev.siderolabs.io
    - name: checkout extensions
      checkoutStep:
        repository: siderolabs/extensions
        ref: main
        path: _out/extensions
    - name: set variables
      nonMakeStep: true
      command: cat _out/talos-metadata >> "$GITHUB_ENV"
    # Build the arm64 LTS-branch OSS extensions in the extensions checkout.
    - name: build extensions
      command: nvidia-container-toolkit-lts nvidia-open-gpu-kernel-modules-lts extensions-metadata
      arguments:
        - -C
        - _out/extensions
      environment:
        PLATFORM: linux/arm64
        PUSH: true
        REGISTRY: registry.dev.siderolabs.io
    - name: e2e-aws-prepare
      environment:
        EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
        E2E_AWS_TARGET: nvidia-oss-lts
        IMAGE_REGISTRY: registry.dev.siderolabs.io
        TARGET_ARCH: arm64
    - name: checkout contrib
      checkoutStep:
        repository: siderolabs/contrib
        ref: main
        path: _out/contrib
    - name: setup tf
      terraformStep: true
    - name: tf apply
      command: e2e-cloud-tf
      environment:
        TF_SCRIPT_DIR: _out/contrib
        TF_E2E_TEST_TYPE: aws
        TF_E2E_ACTION: apply
    - name: e2e-aws-nvidia-oss-lts
      command: e2e-aws
      environment:
        EXTRA_TEST_ARGS: "-talos.extensions.nvidia -talos.verifyukibooted=false"
        INTEGRATION_TEST_RUN: TestIntegration/api.ExtensionsSuiteNVIDIA
    # Always destroy the cluster, even when the e2e step failed.
    - name: tf destroy
      command: e2e-cloud-tf
      conditions:
        - always
      environment:
        TF_SCRIPT_DIR: _out/contrib
        TF_E2E_TEST_TYPE: aws
        TF_E2E_ACTION: destroy
        TF_E2E_REFRESH_ON_DESTROY: false
- name: integration-aws-nvidia-oss-production
buildxOptions:
enabled: true
@ -2961,6 +3056,100 @@ spec:
TF_E2E_TEST_TYPE: aws
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: false
# Kres job definition: NVIDIA open-source kernel-modules, production driver
# branch, on arm64. Mirrors the -lts-arm64 job with production extensions.
- name: integration-aws-nvidia-oss-production-arm64
  buildxOptions:
    enabled: true
  sops: true
  depends:
    - default
  runnerGroup: generic # we can use generic here since the tests run against a remote talos cluster
  crons:
    - '30 7 * * *'
  triggerLabels:
    - integration/aws-nvidia-oss-production-arm64
    - integration/aws-nvidia-oss
    - integration/aws-nvidia
  steps:
    # Label-triggered runs reuse artifacts from the default job; scheduled runs
    # rebuild from scratch via the only-on-schedule steps below.
    - name: download-artifacts
      conditions:
        - not-on-schedule
      artifactStep:
        type: download
        artifactName: talos-artifacts
        artifactPath: _out
    - name: ci-temp-release-tag
      conditions:
        - not-on-schedule
    - name: generate
      conditions:
        - only-on-schedule
    - name: uki-certs
      conditions:
        - only-on-schedule
      environment:
        PLATFORM: linux/amd64
    - name: build
      conditions:
        - only-on-schedule
      command: installer-base imager _out/integration-test-linux-amd64
      environment:
        PLATFORM: linux/arm64
        IMAGE_REGISTRY: registry.dev.siderolabs.io
        PUSH: true
    - name: image-aws
      environment:
        PLATFORM: linux/arm64
        IMAGE_REGISTRY: registry.dev.siderolabs.io
    - name: checkout extensions
      checkoutStep:
        repository: siderolabs/extensions
        ref: main
        path: _out/extensions
    - name: set variables
      nonMakeStep: true
      command: cat _out/talos-metadata >> "$GITHUB_ENV"
    # Build the arm64 production-branch OSS extensions in the extensions checkout.
    - name: build extensions
      command: nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production extensions-metadata
      arguments:
        - -C
        - _out/extensions
      environment:
        PLATFORM: linux/arm64
        PUSH: true
        REGISTRY: registry.dev.siderolabs.io
    - name: e2e-aws-prepare
      environment:
        EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
        E2E_AWS_TARGET: nvidia-oss-production
        IMAGE_REGISTRY: registry.dev.siderolabs.io
        TARGET_ARCH: arm64
    - name: checkout contrib
      checkoutStep:
        repository: siderolabs/contrib
        ref: main
        path: _out/contrib
    - name: setup tf
      terraformStep: true
    - name: tf apply
      command: e2e-cloud-tf
      environment:
        TF_SCRIPT_DIR: _out/contrib
        TF_E2E_TEST_TYPE: aws
        TF_E2E_ACTION: apply
    - name: e2e-aws-nvidia-oss-production
      command: e2e-aws
      environment:
        EXTRA_TEST_ARGS: "-talos.extensions.nvidia -talos.verifyukibooted=false"
        INTEGRATION_TEST_RUN: TestIntegration/api.ExtensionsSuiteNVIDIA
    # Always destroy the cluster, even when the e2e step failed.
    - name: tf destroy
      command: e2e-cloud-tf
      conditions:
        - always
      environment:
        TF_SCRIPT_DIR: _out/contrib
        TF_E2E_TEST_TYPE: aws
        TF_E2E_ACTION: destroy
        TF_E2E_REFRESH_ON_DESTROY: false
- name: integration-aws-nvidia-nonfree-lts
buildxOptions:
enabled: true
@ -3057,6 +3246,100 @@ spec:
TF_E2E_TEST_TYPE: aws
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: false
# Kres job definition: NVIDIA proprietary (non-free) kernel module, LTS driver
# branch, on arm64. Differs from the OSS jobs only in the extensions built and
# the e2e target/labels.
- name: integration-aws-nvidia-nonfree-lts-arm64
  buildxOptions:
    enabled: true
  sops: true
  depends:
    - default
  runnerGroup: generic # we can use generic here since the tests run against a remote talos cluster
  crons:
    - '30 7 * * *'
  triggerLabels:
    - integration/aws-nvidia-nonfree-lts-arm64
    - integration/aws-nvidia-nonfree
    - integration/aws-nvidia
  steps:
    # Label-triggered runs reuse artifacts from the default job; scheduled runs
    # rebuild from scratch via the only-on-schedule steps below.
    - name: download-artifacts
      conditions:
        - not-on-schedule
      artifactStep:
        type: download
        artifactName: talos-artifacts
        artifactPath: _out
    - name: ci-temp-release-tag
      conditions:
        - not-on-schedule
    - name: generate
      conditions:
        - only-on-schedule
    - name: uki-certs
      conditions:
        - only-on-schedule
      environment:
        PLATFORM: linux/amd64
    - name: build
      conditions:
        - only-on-schedule
      command: installer-base imager _out/integration-test-linux-amd64
      environment:
        PLATFORM: linux/arm64
        IMAGE_REGISTRY: registry.dev.siderolabs.io
        PUSH: true
    - name: image-aws
      environment:
        PLATFORM: linux/arm64
        IMAGE_REGISTRY: registry.dev.siderolabs.io
    - name: checkout extensions
      checkoutStep:
        repository: siderolabs/extensions
        ref: main
        path: _out/extensions
    - name: set variables
      nonMakeStep: true
      command: cat _out/talos-metadata >> "$GITHUB_ENV"
    # Build the arm64 LTS-branch proprietary-kmod extensions in the extensions checkout.
    - name: build extensions
      command: nvidia-container-toolkit-lts nonfree-kmod-nvidia-lts extensions-metadata
      arguments:
        - -C
        - _out/extensions
      environment:
        PLATFORM: linux/arm64
        PUSH: true
        REGISTRY: registry.dev.siderolabs.io
    - name: e2e-aws-prepare
      environment:
        EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
        E2E_AWS_TARGET: nvidia-nonfree-lts
        IMAGE_REGISTRY: registry.dev.siderolabs.io
        TARGET_ARCH: arm64
    - name: checkout contrib
      checkoutStep:
        repository: siderolabs/contrib
        ref: main
        path: _out/contrib
    - name: setup tf
      terraformStep: true
    - name: tf apply
      command: e2e-cloud-tf
      environment:
        TF_SCRIPT_DIR: _out/contrib
        TF_E2E_TEST_TYPE: aws
        TF_E2E_ACTION: apply
    - name: e2e-aws-nvidia-nonfree-lts
      command: e2e-aws
      environment:
        EXTRA_TEST_ARGS: "-talos.extensions.nvidia -talos.verifyukibooted=false"
        INTEGRATION_TEST_RUN: TestIntegration/api.ExtensionsSuiteNVIDIA
    # Always destroy the cluster, even when the e2e step failed.
    - name: tf destroy
      command: e2e-cloud-tf
      conditions:
        - always
      environment:
        TF_SCRIPT_DIR: _out/contrib
        TF_E2E_TEST_TYPE: aws
        TF_E2E_ACTION: destroy
        TF_E2E_REFRESH_ON_DESTROY: false
- name: integration-aws-nvidia-nonfree-production
buildxOptions:
enabled: true
@ -3153,6 +3436,100 @@ spec:
TF_E2E_TEST_TYPE: aws
TF_E2E_ACTION: destroy
TF_E2E_REFRESH_ON_DESTROY: false
# Kres job definition: NVIDIA proprietary (non-free) kernel module, production
# driver branch, on arm64. Mirrors integration-aws-nvidia-nonfree-lts-arm64
# with the production variants of the toolkit/kmod extensions.
- name: integration-aws-nvidia-nonfree-production-arm64
  buildxOptions:
    enabled: true
  sops: true
  depends:
    - default
  runnerGroup: generic # we can use generic here since the tests run against a remote talos cluster
  crons:
    - '30 7 * * *'
  triggerLabels:
    - integration/aws-nvidia-nonfree-production-arm64
    - integration/aws-nvidia-nonfree
    - integration/aws-nvidia
  steps:
    # Label-triggered runs reuse artifacts from the default job; scheduled runs
    # rebuild from scratch via the only-on-schedule steps below.
    - name: download-artifacts
      conditions:
        - not-on-schedule
      artifactStep:
        type: download
        artifactName: talos-artifacts
        artifactPath: _out
    - name: ci-temp-release-tag
      conditions:
        - not-on-schedule
    - name: generate
      conditions:
        - only-on-schedule
    - name: uki-certs
      conditions:
        - only-on-schedule
      environment:
        PLATFORM: linux/amd64
    - name: build
      conditions:
        - only-on-schedule
      command: installer-base imager _out/integration-test-linux-amd64
      environment:
        PLATFORM: linux/arm64
        IMAGE_REGISTRY: registry.dev.siderolabs.io
        PUSH: true
    - name: image-aws
      environment:
        PLATFORM: linux/arm64
        IMAGE_REGISTRY: registry.dev.siderolabs.io
    - name: checkout extensions
      checkoutStep:
        repository: siderolabs/extensions
        ref: main
        path: _out/extensions
    - name: set variables
      nonMakeStep: true
      command: cat _out/talos-metadata >> "$GITHUB_ENV"
    # Build the arm64 production-branch proprietary-kmod extensions in the extensions checkout.
    - name: build extensions
      command: nvidia-container-toolkit-production nonfree-kmod-nvidia-production extensions-metadata
      arguments:
        - -C
        - _out/extensions
      environment:
        PLATFORM: linux/arm64
        PUSH: true
        REGISTRY: registry.dev.siderolabs.io
    - name: e2e-aws-prepare
      environment:
        EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
        E2E_AWS_TARGET: nvidia-nonfree-production
        IMAGE_REGISTRY: registry.dev.siderolabs.io
        TARGET_ARCH: arm64
    - name: checkout contrib
      checkoutStep:
        repository: siderolabs/contrib
        ref: main
        path: _out/contrib
    - name: setup tf
      terraformStep: true
    - name: tf apply
      command: e2e-cloud-tf
      environment:
        TF_SCRIPT_DIR: _out/contrib
        TF_E2E_TEST_TYPE: aws
        TF_E2E_ACTION: apply
    # Fixed: was "e2e-aws-nvidia-oss-production" — a copy/paste from the OSS
    # matrix; every sibling job names this step after its own target (cf. the
    # nonfree-lts job's "e2e-aws-nvidia-nonfree-lts" and this job's
    # E2E_AWS_TARGET above). Display-name only; the command is unchanged.
    - name: e2e-aws-nvidia-nonfree-production
      command: e2e-aws
      environment:
        EXTRA_TEST_ARGS: "-talos.extensions.nvidia -talos.verifyukibooted=false"
        INTEGRATION_TEST_RUN: TestIntegration/api.ExtensionsSuiteNVIDIA
    # Always destroy the cluster, even when the e2e step failed.
    - name: tf destroy
      command: e2e-cloud-tf
      conditions:
        - always
      environment:
        TF_SCRIPT_DIR: _out/contrib
        TF_E2E_TEST_TYPE: aws
        TF_E2E_ACTION: destroy
        TF_E2E_REFRESH_ON_DESTROY: false
- name: integration-gcp
buildxOptions:
enabled: true

View File

@ -6,10 +6,12 @@ source ./hack/test/e2e.sh
REGION="us-east-1"
ARCH="${TARGET_ARCH:-amd64}"
function cloud_image_upload() {
RANDOM_SUFFIX=$(openssl rand -hex 4)
CLOUD_IMAGES_EXTRA_ARGS=("--name-prefix=${1}-${RANDOM_SUFFIX}" "--target-clouds=aws" "--architectures=amd64" "--aws-regions=${REGION}")
CLOUD_IMAGES_EXTRA_ARGS=("--name-prefix=${1}-${RANDOM_SUFFIX}" "--target-clouds=aws" "--architectures=${ARCH}" "--aws-regions=${REGION}")
case "${1}" in
talos-e2e-nvidia-oss-*)
@ -21,7 +23,7 @@ function cloud_image_upload() {
}
function get_ami_id() {
jq -r ".[] | select(.cloud == \"aws\") | select(.region == \"${REGION}\") | select (.arch == \"amd64\") | .id" "${ARTIFACTS}/cloud-images.json"
jq -r ".[] | select(.cloud == \"aws\") | select(.region == \"${REGION}\") | select (.arch == \"${ARCH}\") | .id" "${ARTIFACTS}/cloud-images.json"
}
function cloud_image_upload_with_extensions() {
@ -48,7 +50,7 @@ function cloud_image_upload_with_extensions() {
;;
esac
make image-aws IMAGER_ARGS="${EXTENSIONS}" PLATFORM=linux/amd64
make image-aws IMAGER_ARGS="${EXTENSIONS}" PLATFORM="linux/${ARCH}"
cloud_image_upload "talos-e2e-${1}"
}
@ -73,7 +75,9 @@ esac
mkdir -p "${ARTIFACTS}/e2e-aws-generated"
NAME_PREFIX="${SHA}-${E2E_AWS_TARGET}"
NAME_PREFIX="${SHA}-${E2E_AWS_TARGET}-${ARCH}"
AWS_JQ_TEMPLATE="aws-${ARCH}.jq"
jq --null-input \
--arg WORKER_GROUP "${WORKER_GROUP}" \
@ -90,6 +94,6 @@ jq --null-input \
talos_version_contract: $TALOS_VERSION_CONTRACT,
kubernetes_version: $KUBERNETES_VERSION
}' \
| jq -f hack/test/tfvars/aws.jq > "${ARTIFACTS}/e2e-aws-generated/vars.json"
| jq -f "hack/test/tfvars/${AWS_JQ_TEMPLATE}" > "${ARTIFACTS}/e2e-aws-generated/vars.json"
cp hack/test/tfvars/*.yaml "${ARTIFACTS}/e2e-aws-generated"

View File

@ -0,0 +1,35 @@
# tfvars template for arm64 AWS e2e clusters (applied with `jq -f` by the
# e2e-aws-prepare script, which selects this file when ARCH=arm64).
#
# Input object keys read: .cluster_name, .ami_id, .nvidia_ami_id,
# .worker_group, .talos_version_contract, .kubernetes_version.
{
  "cluster_name": .cluster_name,
  "ccm": true,
  "talos_version_contract": .talos_version_contract,
  "kubernetes_version": .kubernetes_version,
  "control_plane": {
    "ami_id": .ami_id,
    # arm64 (Graviton) control-plane instance type.
    "instance_type": "t4g.large"
  },
  # NVIDIA runs get a single GPU worker; all other runs get three plain workers.
  "worker_groups": (if .worker_group == "nvidia" then [
    {
      # NOTE(review): g5g.xlarge matches the instance-type selectors used by the
      # NVIDIA e2e test suite; the "nvidia-t4" group name presumably mirrors the
      # amd64 template — confirm it matches the GPU actually fitted on g5g.
      "name": "nvidia-t4",
      "ami_id": .nvidia_ami_id,
      "instance_type": "g5g.xlarge",
      "config_patch_files": [
        "nvidia.yaml"
      ],
      "tags": {
        "Type": "nvidia-t4"
      }
    }
  ] else [
    {
      "name": "default",
      "num_instances": 3,
      "ami_id": .ami_id,
      "instance_type": "t4g.large"
    }
  ] end),
  # Tags applied to every resource for CI cost/cleanup tracking.
  "extra_tags": {
    "ClusterName": .cluster_name,
    "Project": "talos-e2e-ci",
    "Environment": "ci"
  }
}

View File

@ -7,12 +7,15 @@
package api
import (
"bytes"
"context"
_ "embed"
"fmt"
"io"
"strings"
"time"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/siderolabs/go-retry/retry"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
@ -20,6 +23,10 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"github.com/siderolabs/talos/internal/integration/base"
"github.com/siderolabs/talos/pkg/machinery/api/common"
"github.com/siderolabs/talos/pkg/machinery/client"
"github.com/siderolabs/talos/pkg/machinery/constants"
"github.com/siderolabs/talos/pkg/machinery/resources/runtime"
)
//go:embed testdata/nvidia-gpu-operator.yaml
@ -69,13 +76,11 @@ func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() {
// if we're testing NVIDIA stuff we need to get the nodes having NVIDIA GPUs
// we query k8s to get the nodes having the label node.kubernetes.io/instance-type.
// this label is set by the cloud provider and it's value is the instance type.
// the nvidia e2e-aws tests creates gpu nodes one with g4dn.xlarge and another
// with p4d.24xlarge
for _, nvidiaNode := range suite.getNVIDIANodes("node.kubernetes.io/instance-type in (g4dn.xlarge, p4d.24xlarge)") {
for _, nvidiaNode := range suite.getNVIDIANodes("node.kubernetes.io/instance-type in (g4dn.xlarge, p4d.24xlarge, g5g.xlarge)") {
suite.AssertExpectedModules(suite.ctx, nvidiaNode, expectedModulesModDep)
}
nodes := suite.getNVIDIANodes("node.kubernetes.io/instance-type=g4dn.xlarge")
nodes := suite.getNVIDIANodes("node.kubernetes.io/instance-type in (g4dn.xlarge, p4d.24xlarge, g5g.xlarge)")
for _, node := range nodes {
suite.AssertServicesRunning(suite.ctx, node, map[string]string{
"ext-nvidia-persistenced": "Running",
@ -83,6 +88,95 @@ func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() {
})
}
missingCDIFilesData := map[string]map[string]int{
"amd64": {
"nvidia-open-gpu-kernel-modules-production": 13,
"nvidia-open-gpu-kernel-modules-lts": 9,
"nonfree-kmod-nvidia-production": 13,
"nonfree-kmod-nvidia-lts": 9,
},
"arm64": {
"nvidia-open-gpu-kernel-modules-production": 11,
"nvidia-open-gpu-kernel-modules-lts": 9,
"nonfree-kmod-nvidia-production": 11,
"nonfree-kmod-nvidia-lts": 9,
},
}
for _, node := range nodes {
nodeCtx := client.WithNode(suite.ctx, node)
versionInfo, err := suite.Client.Version(nodeCtx)
suite.Require().NoError(err)
suite.Require().NotNil(versionInfo.GetMessages(), "version info messages should not be nil")
extInfo := missingCDIFilesData[versionInfo.GetMessages()[0].Version.Arch]
list, err := safe.StateListAll[*runtime.ExtensionStatus](nodeCtx, suite.Client.COSI)
suite.Require().NoError(err)
extensionsList := safe.ToSlice(list, func(info *runtime.ExtensionStatus) string {
return info.TypedSpec().Metadata.Name
})
var expectedCount int
for _, name := range extensionsList {
if count, exists := extInfo[name]; exists {
expectedCount = count
break
}
}
suite.Require().NotZero(expectedCount, "did not find any matching nvidia extension in the list of extensions: %v", extensionsList)
logsStream, err := suite.Client.Logs(
nodeCtx,
constants.SystemContainerdNamespace,
common.ContainerDriver_CONTAINERD,
"ext-nvidia-cdi-gen",
false,
-1,
)
suite.Require().NoError(err)
logReader, err := client.ReadStream(logsStream)
suite.Require().NoError(err)
defer logReader.Close() //nolint:errcheck
var buffer bytes.Buffer
_, err = io.Copy(&buffer, logReader)
suite.Require().NoError(err)
logData := buffer.String()
// we know as baseline we have different number of missing files that are not present in the extension
// and manually verified, if some new files are not found we want to fix the extension
// Adding an example of the current log message for reference:
// talosctl -n 172.16.15.116 logs ext-nvidia-cdi-gen | grep "Could not"
// msg="Could not locate libnvidia-vulkan-producer.so.580.126.20: libnvidia-vulkan-producer.so.580.126.20: not found\nlibnvidia-vulkan-producer.so.580.126.20: not found"
// msg="Could not locate X11/xorg.conf.d/10-nvidia.conf: X11/xorg.conf.d/10-nvidia.conf: not found"
// msg="Could not locate X11/xorg.conf.d/nvidia-drm-outputclass.conf: X11/xorg.conf.d/nvidia-drm-outputclass.conf: not found"
// msg="Could not locate vulkan/implicit_layer.d/nvidia_layers.json: vulkan/implicit_layer.d/nvidia_layers.json: not found\nvulkan/implicit_layer.d/nvidia_layers.json: not found"
// msg="Could not locate vulkan/icd.d/nvidia_icd.x86_64.json: vulkan/icd.d/nvidia_icd.x86_64.json: not found\nvulkan/icd.d/nvidia_icd.x86_64.json: not found"
// msg="Could not locate /nvidia-fabricmanager/socket: /nvidia-fabricmanager/socket: not found"
// msg="Could not locate /tmp/nvidia-mps: /tmp/nvidia-mps: not found"
// msg="Could not locate nvidia-imex: nvidia-imex: not found"
// msg="Could not locate nvidia-imex-ctl: nvidia-imex-ctl: not found"
suite.Assert().Equal(
expectedCount,
strings.Count(logData, "Could not locate"),
"expected exactly %d 'Could not locate' in the logs, got %d. Logs:\n%s",
expectedCount,
strings.Count(logData, "Could not"),
logData,
)
}
// nodes = suite.getNVIDIANodes("node.kubernetes.io/instance-type=p4d.24xlarge")
// for _, node := range nodes {
// suite.testServicesRunning(node, map[string]string{
@ -301,7 +395,7 @@ func nvidiaCUDATestJob() *batchv1.Job {
{
Key: "node.kubernetes.io/instance-type",
Operator: corev1.NodeSelectorOpIn,
Values: []string{"g4dn.xlarge", "p4d.24xlarge"},
Values: []string{"g4dn.xlarge", "p4d.24xlarge", "g5g.xlarge"},
},
},
},
@ -354,7 +448,7 @@ func nvidiaCDITestJob() *batchv1.Job {
{
Key: "node.kubernetes.io/instance-type",
Operator: corev1.NodeSelectorOpIn,
Values: []string{"g4dn.xlarge", "p4d.24xlarge"},
Values: []string{"g4dn.xlarge", "p4d.24xlarge", "g5g.xlarge"},
},
},
},

View File

@ -3,4 +3,4 @@ driver:
toolkit:
enabled: false
hostPaths:
driverInstallDir: /usr/local/glibc/usr
driverInstallDir: /usr/local/lib