mirror of
https://github.com/siderolabs/talos.git
synced 2026-05-04 20:06:18 +02:00
chore(ci): add more nvidia test matrix
Add more NVIDIA tests covering all supported OSS and Proprietary LTS and Production driver versions. Fixes: #11398 Signed-off-by: Noel Georgi <git@frezbo.dev>
This commit is contained in:
parent
451c2c4c39
commit
450b30d5a9
3
.github/renovate.json
vendored
3
.github/renovate.json
vendored
@ -33,7 +33,8 @@
|
||||
"customType": "regex",
|
||||
"versioningTemplate": "{{#if versioning}}{{versioning}}{{else}}semver{{/if}}",
|
||||
"managerFilePatterns": [
|
||||
"/internal/integration/k8s/constants.go/"
|
||||
"/internal/integration/k8s/constants.go/",
|
||||
"/internal/integration/api/constants.go/"
|
||||
],
|
||||
"matchStrings": [
|
||||
"\\/\\/\\s+renovate: datasource=(?<datasource>.*?)(?:\\s+extractVersion=(?<extractVersion>.+?))?(?:\\s+versioning=(?<versioning>.+?))?\\s+depName=(?<depName>.+?)?(?:\\s+registryUrl=(?<registryUrl>.+?))?\\s.*Version\\s+=\\s+\\\"(?<currentValue>.+?)\\\""
|
||||
|
||||
330
.github/workflows/ci.yaml
vendored
330
.github/workflows/ci.yaml
vendored
@ -1,6 +1,6 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
|
||||
#
|
||||
# Generated on 2025-07-21T14:04:07Z by kres b869533.
|
||||
# Generated on 2025-07-22T04:25:57Z by kres b869533.
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.head_ref || github.run_id }}
|
||||
@ -704,7 +704,7 @@ jobs:
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
integration-aws-nvidia-nonfree:
|
||||
integration-aws-nvidia-nonfree-lts:
|
||||
permissions:
|
||||
actions: read
|
||||
contents: write
|
||||
@ -714,7 +714,7 @@ jobs:
|
||||
runs-on:
|
||||
- self-hosted
|
||||
- generic
|
||||
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree')
|
||||
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree-lts') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws')
|
||||
needs:
|
||||
- default
|
||||
steps:
|
||||
@ -797,13 +797,6 @@ jobs:
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make talosctl-cni-bundle
|
||||
- name: images-essential
|
||||
if: github.event_name == 'schedule'
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
run: |
|
||||
make images-essential
|
||||
- name: image-aws
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
@ -828,7 +821,7 @@ jobs:
|
||||
make nvidia-container-toolkit-lts nonfree-kmod-nvidia-lts extensions-metadata -C _out/extensions
|
||||
- name: e2e-aws-prepare
|
||||
env:
|
||||
E2E_AWS_TARGET: nvidia-nonfree
|
||||
E2E_AWS_TARGET: nvidia-nonfree-lts
|
||||
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
run: |
|
||||
@ -850,7 +843,7 @@ jobs:
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
- name: e2e-aws-nvidia-nonfree
|
||||
- name: e2e-aws-nvidia-nonfree-lts
|
||||
env:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia
|
||||
run: |
|
||||
@ -864,7 +857,7 @@ jobs:
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
integration-aws-nvidia-oss:
|
||||
integration-aws-nvidia-nonfree-production:
|
||||
permissions:
|
||||
actions: read
|
||||
contents: write
|
||||
@ -874,7 +867,7 @@ jobs:
|
||||
runs-on:
|
||||
- self-hosted
|
||||
- generic
|
||||
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss')
|
||||
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree-production') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-nonfree') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws')
|
||||
needs:
|
||||
- default
|
||||
steps:
|
||||
@ -957,13 +950,312 @@ jobs:
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make talosctl-cni-bundle
|
||||
- name: images-essential
|
||||
if: github.event_name == 'schedule'
|
||||
- name: image-aws
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
run: |
|
||||
make images-essential
|
||||
make image-aws
|
||||
- name: checkout extensions
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: _out/extensions
|
||||
ref: main
|
||||
repository: siderolabs/extensions
|
||||
- name: set variables
|
||||
run: |
|
||||
cat _out/talos-metadata >> "$GITHUB_ENV"
|
||||
- name: build extensions
|
||||
env:
|
||||
PLATFORM: linux/amd64
|
||||
PUSH: "true"
|
||||
REGISTRY: registry.dev.siderolabs.io
|
||||
run: |
|
||||
make nvidia-container-toolkit-production nonfree-kmod-nvidia-production extensions-metadata -C _out/extensions
|
||||
- name: e2e-aws-prepare
|
||||
env:
|
||||
E2E_AWS_TARGET: nvidia-nonfree-production
|
||||
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
run: |
|
||||
make e2e-aws-prepare
|
||||
- name: checkout contrib
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: _out/contrib
|
||||
ref: main
|
||||
repository: siderolabs/contrib
|
||||
- name: setup tf
|
||||
uses: hashicorp/setup-terraform@v3
|
||||
with:
|
||||
terraform_wrapper: "false"
|
||||
- name: tf apply
|
||||
env:
|
||||
TF_E2E_ACTION: apply
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
- name: e2e-aws-nvidia-nonfree-production
|
||||
env:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia
|
||||
run: |
|
||||
make e2e-aws
|
||||
- name: tf destroy
|
||||
if: always()
|
||||
env:
|
||||
TF_E2E_ACTION: destroy
|
||||
TF_E2E_REFRESH_ON_DESTROY: "false"
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
integration-aws-nvidia-oss-lts:
|
||||
permissions:
|
||||
actions: read
|
||||
contents: write
|
||||
issues: read
|
||||
packages: write
|
||||
pull-requests: read
|
||||
runs-on:
|
||||
- self-hosted
|
||||
- generic
|
||||
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss-lts') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws')
|
||||
needs:
|
||||
- default
|
||||
steps:
|
||||
- name: gather-system-info
|
||||
id: system-info
|
||||
uses: kenchan0130/actions-system-info@v1.3.1
|
||||
continue-on-error: true
|
||||
- name: print-system-info
|
||||
run: |
|
||||
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
|
||||
|
||||
OUTPUTS=(
|
||||
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
|
||||
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
|
||||
"Hostname: ${{ steps.system-info.outputs.hostname }}"
|
||||
"NodeName: ${NODE_NAME}"
|
||||
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
|
||||
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
|
||||
"Name: ${{ steps.system-info.outputs.name }}"
|
||||
"Platform: ${{ steps.system-info.outputs.platform }}"
|
||||
"Release: ${{ steps.system-info.outputs.release }}"
|
||||
"Total memory: ${MEMORY_GB} GB"
|
||||
)
|
||||
|
||||
for OUTPUT in "${OUTPUTS[@]}";do
|
||||
echo "${OUTPUT}"
|
||||
done
|
||||
continue-on-error: true
|
||||
- name: checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Unshallow
|
||||
run: |
|
||||
git fetch --prune --unshallow
|
||||
- name: Set up Docker Buildx
|
||||
id: setup-buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
driver: remote
|
||||
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
|
||||
timeout-minutes: 10
|
||||
- name: Mask secrets
|
||||
run: |
|
||||
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
|
||||
- name: Set secrets for job
|
||||
run: |
|
||||
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
|
||||
- name: Download artifacts
|
||||
if: github.event_name != 'schedule'
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: talos-artifacts
|
||||
path: _out
|
||||
- name: Fix artifact permissions
|
||||
if: github.event_name != 'schedule'
|
||||
run: |
|
||||
xargs -a _out/executable-artifacts -I {} chmod +x {}
|
||||
- name: ci-temp-release-tag
|
||||
if: github.event_name != 'schedule'
|
||||
run: |
|
||||
make ci-temp-release-tag
|
||||
- name: generate
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make generate
|
||||
- name: uki-certs
|
||||
if: github.event_name == 'schedule'
|
||||
env:
|
||||
PLATFORM: linux/amd64
|
||||
run: |
|
||||
make uki-certs
|
||||
- name: build
|
||||
if: github.event_name == 'schedule'
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
PUSH: "true"
|
||||
run: |
|
||||
make talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64
|
||||
- name: talosctl-cni-bundle
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make talosctl-cni-bundle
|
||||
- name: image-aws
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
run: |
|
||||
make image-aws
|
||||
- name: checkout extensions
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: _out/extensions
|
||||
ref: main
|
||||
repository: siderolabs/extensions
|
||||
- name: set variables
|
||||
run: |
|
||||
cat _out/talos-metadata >> "$GITHUB_ENV"
|
||||
- name: build extensions
|
||||
env:
|
||||
PLATFORM: linux/amd64
|
||||
PUSH: "true"
|
||||
REGISTRY: registry.dev.siderolabs.io
|
||||
run: |
|
||||
make nvidia-container-toolkit-lts nvidia-open-gpu-kernel-modules-lts zfs extensions-metadata -C _out/extensions
|
||||
- name: e2e-aws-prepare
|
||||
env:
|
||||
E2E_AWS_TARGET: nvidia-oss-lts
|
||||
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
run: |
|
||||
make e2e-aws-prepare
|
||||
- name: checkout contrib
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: _out/contrib
|
||||
ref: main
|
||||
repository: siderolabs/contrib
|
||||
- name: setup tf
|
||||
uses: hashicorp/setup-terraform@v3
|
||||
with:
|
||||
terraform_wrapper: "false"
|
||||
- name: tf apply
|
||||
env:
|
||||
TF_E2E_ACTION: apply
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
- name: e2e-aws-nvidia-oss-lts
|
||||
env:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
|
||||
run: |
|
||||
make e2e-aws
|
||||
- name: tf destroy
|
||||
if: always()
|
||||
env:
|
||||
TF_E2E_ACTION: destroy
|
||||
TF_E2E_REFRESH_ON_DESTROY: "false"
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
integration-aws-nvidia-oss-production:
|
||||
permissions:
|
||||
actions: read
|
||||
contents: write
|
||||
issues: read
|
||||
packages: write
|
||||
pull-requests: read
|
||||
runs-on:
|
||||
- self-hosted
|
||||
- generic
|
||||
if: contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss-production') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia-oss') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws-nvidia') || contains(fromJSON(needs.default.outputs.labels), 'integration/aws')
|
||||
needs:
|
||||
- default
|
||||
steps:
|
||||
- name: gather-system-info
|
||||
id: system-info
|
||||
uses: kenchan0130/actions-system-info@v1.3.1
|
||||
continue-on-error: true
|
||||
- name: print-system-info
|
||||
run: |
|
||||
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
|
||||
|
||||
OUTPUTS=(
|
||||
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
|
||||
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
|
||||
"Hostname: ${{ steps.system-info.outputs.hostname }}"
|
||||
"NodeName: ${NODE_NAME}"
|
||||
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
|
||||
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
|
||||
"Name: ${{ steps.system-info.outputs.name }}"
|
||||
"Platform: ${{ steps.system-info.outputs.platform }}"
|
||||
"Release: ${{ steps.system-info.outputs.release }}"
|
||||
"Total memory: ${MEMORY_GB} GB"
|
||||
)
|
||||
|
||||
for OUTPUT in "${OUTPUTS[@]}";do
|
||||
echo "${OUTPUT}"
|
||||
done
|
||||
continue-on-error: true
|
||||
- name: checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Unshallow
|
||||
run: |
|
||||
git fetch --prune --unshallow
|
||||
- name: Set up Docker Buildx
|
||||
id: setup-buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
driver: remote
|
||||
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
|
||||
timeout-minutes: 10
|
||||
- name: Mask secrets
|
||||
run: |
|
||||
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
|
||||
- name: Set secrets for job
|
||||
run: |
|
||||
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
|
||||
- name: Download artifacts
|
||||
if: github.event_name != 'schedule'
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: talos-artifacts
|
||||
path: _out
|
||||
- name: Fix artifact permissions
|
||||
if: github.event_name != 'schedule'
|
||||
run: |
|
||||
xargs -a _out/executable-artifacts -I {} chmod +x {}
|
||||
- name: ci-temp-release-tag
|
||||
if: github.event_name != 'schedule'
|
||||
run: |
|
||||
make ci-temp-release-tag
|
||||
- name: generate
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make generate
|
||||
- name: uki-certs
|
||||
if: github.event_name == 'schedule'
|
||||
env:
|
||||
PLATFORM: linux/amd64
|
||||
run: |
|
||||
make uki-certs
|
||||
- name: build
|
||||
if: github.event_name == 'schedule'
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
PUSH: "true"
|
||||
run: |
|
||||
make talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64
|
||||
- name: talosctl-cni-bundle
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make talosctl-cni-bundle
|
||||
- name: image-aws
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
@ -988,7 +1280,7 @@ jobs:
|
||||
make nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production zfs extensions-metadata -C _out/extensions
|
||||
- name: e2e-aws-prepare
|
||||
env:
|
||||
E2E_AWS_TARGET: nvidia-oss
|
||||
E2E_AWS_TARGET: nvidia-oss-production
|
||||
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
run: |
|
||||
@ -1010,7 +1302,7 @@ jobs:
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
- name: e2e-aws-nvidia-oss
|
||||
- name: e2e-aws-nvidia-oss-production
|
||||
env:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
|
||||
run: |
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
|
||||
#
|
||||
# Generated on 2025-07-21T09:52:07Z by kres b869533.
|
||||
# Generated on 2025-07-22T04:25:57Z by kres b869533.
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.head_ref || github.run_id }}
|
||||
@ -8,7 +8,7 @@ concurrency:
|
||||
"on":
|
||||
schedule:
|
||||
- cron: 30 7 * * *
|
||||
name: integration-aws-nvidia-nonfree-cron
|
||||
name: integration-aws-nvidia-nonfree-lts-cron
|
||||
jobs:
|
||||
default:
|
||||
runs-on:
|
||||
@ -94,13 +94,6 @@ jobs:
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make talosctl-cni-bundle
|
||||
- name: images-essential
|
||||
if: github.event_name == 'schedule'
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
run: |
|
||||
make images-essential
|
||||
- name: image-aws
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
@ -125,7 +118,7 @@ jobs:
|
||||
make nvidia-container-toolkit-lts nonfree-kmod-nvidia-lts extensions-metadata -C _out/extensions
|
||||
- name: e2e-aws-prepare
|
||||
env:
|
||||
E2E_AWS_TARGET: nvidia-nonfree
|
||||
E2E_AWS_TARGET: nvidia-nonfree-lts
|
||||
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
run: |
|
||||
@ -147,7 +140,7 @@ jobs:
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
- name: e2e-aws-nvidia-nonfree
|
||||
- name: e2e-aws-nvidia-nonfree-lts
|
||||
env:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia
|
||||
run: |
|
||||
156
.github/workflows/integration-aws-nvidia-nonfree-production-cron.yaml
vendored
Normal file
156
.github/workflows/integration-aws-nvidia-nonfree-production-cron.yaml
vendored
Normal file
@ -0,0 +1,156 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
|
||||
#
|
||||
# Generated on 2025-07-22T04:25:57Z by kres b869533.
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
"on":
|
||||
schedule:
|
||||
- cron: 30 7 * * *
|
||||
name: integration-aws-nvidia-nonfree-production-cron
|
||||
jobs:
|
||||
default:
|
||||
runs-on:
|
||||
- self-hosted
|
||||
- generic
|
||||
steps:
|
||||
- name: gather-system-info
|
||||
id: system-info
|
||||
uses: kenchan0130/actions-system-info@v1.3.1
|
||||
continue-on-error: true
|
||||
- name: print-system-info
|
||||
run: |
|
||||
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
|
||||
|
||||
OUTPUTS=(
|
||||
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
|
||||
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
|
||||
"Hostname: ${{ steps.system-info.outputs.hostname }}"
|
||||
"NodeName: ${NODE_NAME}"
|
||||
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
|
||||
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
|
||||
"Name: ${{ steps.system-info.outputs.name }}"
|
||||
"Platform: ${{ steps.system-info.outputs.platform }}"
|
||||
"Release: ${{ steps.system-info.outputs.release }}"
|
||||
"Total memory: ${MEMORY_GB} GB"
|
||||
)
|
||||
|
||||
for OUTPUT in "${OUTPUTS[@]}";do
|
||||
echo "${OUTPUT}"
|
||||
done
|
||||
continue-on-error: true
|
||||
- name: checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Unshallow
|
||||
run: |
|
||||
git fetch --prune --unshallow
|
||||
- name: Set up Docker Buildx
|
||||
id: setup-buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
driver: remote
|
||||
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
|
||||
timeout-minutes: 10
|
||||
- name: Mask secrets
|
||||
run: |
|
||||
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
|
||||
- name: Set secrets for job
|
||||
run: |
|
||||
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
|
||||
- name: Download artifacts
|
||||
if: github.event_name != 'schedule'
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: talos-artifacts
|
||||
path: _out
|
||||
- name: Fix artifact permissions
|
||||
if: github.event_name != 'schedule'
|
||||
run: |
|
||||
xargs -a _out/executable-artifacts -I {} chmod +x {}
|
||||
- name: ci-temp-release-tag
|
||||
if: github.event_name != 'schedule'
|
||||
run: |
|
||||
make ci-temp-release-tag
|
||||
- name: generate
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make generate
|
||||
- name: uki-certs
|
||||
if: github.event_name == 'schedule'
|
||||
env:
|
||||
PLATFORM: linux/amd64
|
||||
run: |
|
||||
make uki-certs
|
||||
- name: build
|
||||
if: github.event_name == 'schedule'
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
PUSH: "true"
|
||||
run: |
|
||||
make talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64
|
||||
- name: talosctl-cni-bundle
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make talosctl-cni-bundle
|
||||
- name: image-aws
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
run: |
|
||||
make image-aws
|
||||
- name: checkout extensions
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: _out/extensions
|
||||
ref: main
|
||||
repository: siderolabs/extensions
|
||||
- name: set variables
|
||||
run: |
|
||||
cat _out/talos-metadata >> "$GITHUB_ENV"
|
||||
- name: build extensions
|
||||
env:
|
||||
PLATFORM: linux/amd64
|
||||
PUSH: "true"
|
||||
REGISTRY: registry.dev.siderolabs.io
|
||||
run: |
|
||||
make nvidia-container-toolkit-production nonfree-kmod-nvidia-production extensions-metadata -C _out/extensions
|
||||
- name: e2e-aws-prepare
|
||||
env:
|
||||
E2E_AWS_TARGET: nvidia-nonfree-production
|
||||
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
run: |
|
||||
make e2e-aws-prepare
|
||||
- name: checkout contrib
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: _out/contrib
|
||||
ref: main
|
||||
repository: siderolabs/contrib
|
||||
- name: setup tf
|
||||
uses: hashicorp/setup-terraform@v3
|
||||
with:
|
||||
terraform_wrapper: "false"
|
||||
- name: tf apply
|
||||
env:
|
||||
TF_E2E_ACTION: apply
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
- name: e2e-aws-nvidia-nonfree-production
|
||||
env:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia
|
||||
run: |
|
||||
make e2e-aws
|
||||
- name: tf destroy
|
||||
if: always()
|
||||
env:
|
||||
TF_E2E_ACTION: destroy
|
||||
TF_E2E_REFRESH_ON_DESTROY: "false"
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
156
.github/workflows/integration-aws-nvidia-oss-lts-cron.yaml
vendored
Normal file
156
.github/workflows/integration-aws-nvidia-oss-lts-cron.yaml
vendored
Normal file
@ -0,0 +1,156 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
|
||||
#
|
||||
# Generated on 2025-07-22T04:25:57Z by kres b869533.
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
"on":
|
||||
schedule:
|
||||
- cron: 30 5 * * *
|
||||
name: integration-aws-nvidia-oss-lts-cron
|
||||
jobs:
|
||||
default:
|
||||
runs-on:
|
||||
- self-hosted
|
||||
- generic
|
||||
steps:
|
||||
- name: gather-system-info
|
||||
id: system-info
|
||||
uses: kenchan0130/actions-system-info@v1.3.1
|
||||
continue-on-error: true
|
||||
- name: print-system-info
|
||||
run: |
|
||||
MEMORY_GB=$((${{ steps.system-info.outputs.totalmem }}/1024/1024/1024))
|
||||
|
||||
OUTPUTS=(
|
||||
"CPU Core: ${{ steps.system-info.outputs.cpu-core }}"
|
||||
"CPU Model: ${{ steps.system-info.outputs.cpu-model }}"
|
||||
"Hostname: ${{ steps.system-info.outputs.hostname }}"
|
||||
"NodeName: ${NODE_NAME}"
|
||||
"Kernel release: ${{ steps.system-info.outputs.kernel-release }}"
|
||||
"Kernel version: ${{ steps.system-info.outputs.kernel-version }}"
|
||||
"Name: ${{ steps.system-info.outputs.name }}"
|
||||
"Platform: ${{ steps.system-info.outputs.platform }}"
|
||||
"Release: ${{ steps.system-info.outputs.release }}"
|
||||
"Total memory: ${MEMORY_GB} GB"
|
||||
)
|
||||
|
||||
for OUTPUT in "${OUTPUTS[@]}";do
|
||||
echo "${OUTPUT}"
|
||||
done
|
||||
continue-on-error: true
|
||||
- name: checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Unshallow
|
||||
run: |
|
||||
git fetch --prune --unshallow
|
||||
- name: Set up Docker Buildx
|
||||
id: setup-buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
driver: remote
|
||||
endpoint: tcp://buildkit-amd64.ci.svc.cluster.local:1234
|
||||
timeout-minutes: 10
|
||||
- name: Mask secrets
|
||||
run: |
|
||||
echo "$(sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | "::add-mask::" + .value')"
|
||||
- name: Set secrets for job
|
||||
run: |
|
||||
sops -d .secrets.yaml | yq -e '.secrets | to_entries[] | .key + "=" + .value' >> "$GITHUB_ENV"
|
||||
- name: Download artifacts
|
||||
if: github.event_name != 'schedule'
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: talos-artifacts
|
||||
path: _out
|
||||
- name: Fix artifact permissions
|
||||
if: github.event_name != 'schedule'
|
||||
run: |
|
||||
xargs -a _out/executable-artifacts -I {} chmod +x {}
|
||||
- name: ci-temp-release-tag
|
||||
if: github.event_name != 'schedule'
|
||||
run: |
|
||||
make ci-temp-release-tag
|
||||
- name: generate
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make generate
|
||||
- name: uki-certs
|
||||
if: github.event_name == 'schedule'
|
||||
env:
|
||||
PLATFORM: linux/amd64
|
||||
run: |
|
||||
make uki-certs
|
||||
- name: build
|
||||
if: github.event_name == 'schedule'
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
PUSH: "true"
|
||||
run: |
|
||||
make talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64
|
||||
- name: talosctl-cni-bundle
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make talosctl-cni-bundle
|
||||
- name: image-aws
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
run: |
|
||||
make image-aws
|
||||
- name: checkout extensions
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: _out/extensions
|
||||
ref: main
|
||||
repository: siderolabs/extensions
|
||||
- name: set variables
|
||||
run: |
|
||||
cat _out/talos-metadata >> "$GITHUB_ENV"
|
||||
- name: build extensions
|
||||
env:
|
||||
PLATFORM: linux/amd64
|
||||
PUSH: "true"
|
||||
REGISTRY: registry.dev.siderolabs.io
|
||||
run: |
|
||||
make nvidia-container-toolkit-lts nvidia-open-gpu-kernel-modules-lts zfs extensions-metadata -C _out/extensions
|
||||
- name: e2e-aws-prepare
|
||||
env:
|
||||
E2E_AWS_TARGET: nvidia-oss-lts
|
||||
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
run: |
|
||||
make e2e-aws-prepare
|
||||
- name: checkout contrib
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
path: _out/contrib
|
||||
ref: main
|
||||
repository: siderolabs/contrib
|
||||
- name: setup tf
|
||||
uses: hashicorp/setup-terraform@v3
|
||||
with:
|
||||
terraform_wrapper: "false"
|
||||
- name: tf apply
|
||||
env:
|
||||
TF_E2E_ACTION: apply
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
- name: e2e-aws-nvidia-oss-lts
|
||||
env:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
|
||||
run: |
|
||||
make e2e-aws
|
||||
- name: tf destroy
|
||||
if: always()
|
||||
env:
|
||||
TF_E2E_ACTION: destroy
|
||||
TF_E2E_REFRESH_ON_DESTROY: "false"
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
@ -1,6 +1,6 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
|
||||
#
|
||||
# Generated on 2025-07-21T09:52:07Z by kres b869533.
|
||||
# Generated on 2025-07-22T04:25:57Z by kres b869533.
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.head_ref || github.run_id }}
|
||||
@ -8,7 +8,7 @@ concurrency:
|
||||
"on":
|
||||
schedule:
|
||||
- cron: 30 5 * * *
|
||||
name: integration-aws-nvidia-oss-cron
|
||||
name: integration-aws-nvidia-oss-production-cron
|
||||
jobs:
|
||||
default:
|
||||
runs-on:
|
||||
@ -94,13 +94,6 @@ jobs:
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
make talosctl-cni-bundle
|
||||
- name: images-essential
|
||||
if: github.event_name == 'schedule'
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
run: |
|
||||
make images-essential
|
||||
- name: image-aws
|
||||
env:
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
@ -125,7 +118,7 @@ jobs:
|
||||
make nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production zfs extensions-metadata -C _out/extensions
|
||||
- name: e2e-aws-prepare
|
||||
env:
|
||||
E2E_AWS_TARGET: nvidia-oss
|
||||
E2E_AWS_TARGET: nvidia-oss-production
|
||||
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
run: |
|
||||
@ -147,7 +140,7 @@ jobs:
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
run: |
|
||||
make e2e-cloud-tf
|
||||
- name: e2e-aws-nvidia-oss
|
||||
- name: e2e-aws-nvidia-oss-production
|
||||
env:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
|
||||
run: |
|
||||
8
.github/workflows/slack-notify.yaml
vendored
8
.github/workflows/slack-notify.yaml
vendored
@ -1,6 +1,6 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
|
||||
#
|
||||
# Generated on 2025-07-21T16:10:17Z by kres b869533.
|
||||
# Generated on 2025-07-22T04:25:57Z by kres b869533.
|
||||
|
||||
"on":
|
||||
workflow_run:
|
||||
@ -36,8 +36,10 @@
|
||||
- integration-image-cache-cron
|
||||
- integration-image-factory-cron
|
||||
- integration-aws-cron
|
||||
- integration-aws-nvidia-oss-cron
|
||||
- integration-aws-nvidia-nonfree-cron
|
||||
- integration-aws-nvidia-oss-lts-cron
|
||||
- integration-aws-nvidia-oss-production-cron
|
||||
- integration-aws-nvidia-nonfree-lts-cron
|
||||
- integration-aws-nvidia-nonfree-production-cron
|
||||
- integration-gcp-cron
|
||||
types:
|
||||
- completed
|
||||
|
||||
237
.kres.yaml
237
.kres.yaml
@ -85,8 +85,10 @@ spec:
|
||||
- integration-image-cache
|
||||
- integration-image-factory
|
||||
- integration-aws
|
||||
- integration-aws-nvidia-oss
|
||||
- integration-aws-nvidia-nonfree
|
||||
- integration-aws-nvidia-oss-lts
|
||||
- integration-aws-nvidia-oss-production
|
||||
- integration-aws-nvidia-nonfree-lts
|
||||
- integration-aws-nvidia-nonfree-production
|
||||
- integration-gcp
|
||||
---
|
||||
kind: common.GHWorkflow
|
||||
@ -2602,7 +2604,7 @@ spec:
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_E2E_ACTION: destroy
|
||||
TF_E2E_REFRESH_ON_DESTROY: false
|
||||
- name: integration-aws-nvidia-oss
|
||||
- name: integration-aws-nvidia-oss-lts
|
||||
buildxOptions:
|
||||
enabled: true
|
||||
sops: true
|
||||
@ -2614,7 +2616,10 @@ spec:
|
||||
crons:
|
||||
- '30 5 * * *'
|
||||
triggerLabels:
|
||||
- integration/aws-nvidia-oss-lts
|
||||
- integration/aws-nvidia-oss
|
||||
- integration/aws-nvidia
|
||||
- integration/aws
|
||||
steps:
|
||||
- name: download-artifacts
|
||||
conditions:
|
||||
@ -2645,12 +2650,6 @@ spec:
|
||||
- name: talosctl-cni-bundle
|
||||
conditions:
|
||||
- only-on-schedule
|
||||
- name: images-essential
|
||||
conditions:
|
||||
- only-on-schedule
|
||||
environment:
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
- name: image-aws
|
||||
environment:
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
@ -2664,7 +2663,9 @@ spec:
|
||||
nonMakeStep: true
|
||||
command: cat _out/talos-metadata >> "$GITHUB_ENV"
|
||||
- name: build extensions
|
||||
command: nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production zfs extensions-metadata
|
||||
# zfs is only added since it uses libtirpc from musl and nvidia needs libtirpc from glibc
|
||||
# this verifies that both libtirpc can co-exist together
|
||||
command: nvidia-container-toolkit-lts nvidia-open-gpu-kernel-modules-lts zfs extensions-metadata
|
||||
arguments:
|
||||
- -C
|
||||
- _out/extensions
|
||||
@ -2675,7 +2676,7 @@ spec:
|
||||
- name: e2e-aws-prepare
|
||||
environment:
|
||||
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
|
||||
E2E_AWS_TARGET: nvidia-oss
|
||||
E2E_AWS_TARGET: nvidia-oss-lts
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
- name: checkout contrib
|
||||
checkoutStep:
|
||||
@ -2690,7 +2691,7 @@ spec:
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_E2E_ACTION: apply
|
||||
- name: e2e-aws-nvidia-oss
|
||||
- name: e2e-aws-nvidia-oss-lts
|
||||
command: e2e-aws
|
||||
environment:
|
||||
EXTRA_TEST_ARGS: "-talos.extensions.nvidia -talos.verifyukibooted=false"
|
||||
@ -2703,7 +2704,7 @@ spec:
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_E2E_ACTION: destroy
|
||||
TF_E2E_REFRESH_ON_DESTROY: false
|
||||
- name: integration-aws-nvidia-nonfree
|
||||
- name: integration-aws-nvidia-oss-production
|
||||
buildxOptions:
|
||||
enabled: true
|
||||
sops: true
|
||||
@ -2713,9 +2714,12 @@ spec:
|
||||
- self-hosted
|
||||
- generic # we can use generic here since the tests run against a remote talos cluster
|
||||
crons:
|
||||
- '30 7 * * *'
|
||||
- '30 5 * * *'
|
||||
triggerLabels:
|
||||
- integration/aws-nvidia-nonfree
|
||||
- integration/aws-nvidia-oss-production
|
||||
- integration/aws-nvidia-oss
|
||||
- integration/aws-nvidia
|
||||
- integration/aws
|
||||
steps:
|
||||
- name: download-artifacts
|
||||
conditions:
|
||||
@ -2746,12 +2750,106 @@ spec:
|
||||
- name: talosctl-cni-bundle
|
||||
conditions:
|
||||
- only-on-schedule
|
||||
- name: images-essential
|
||||
conditions:
|
||||
- only-on-schedule
|
||||
- name: image-aws
|
||||
environment:
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
- name: checkout extensions
|
||||
checkoutStep:
|
||||
repository: siderolabs/extensions
|
||||
ref: main
|
||||
path: _out/extensions
|
||||
- name: set variables
|
||||
nonMakeStep: true
|
||||
command: cat _out/talos-metadata >> "$GITHUB_ENV"
|
||||
- name: build extensions
|
||||
# zfs is only added since it uses libtirpc from musl and nvidia needs libtirpc from glibc
|
||||
# this verifies that both libtirpc builds can co-exist
|
||||
command: nvidia-container-toolkit-production nvidia-open-gpu-kernel-modules-production zfs extensions-metadata
|
||||
arguments:
|
||||
- -C
|
||||
- _out/extensions
|
||||
environment:
|
||||
PLATFORM: linux/amd64
|
||||
PUSH: true
|
||||
REGISTRY: registry.dev.siderolabs.io
|
||||
- name: e2e-aws-prepare
|
||||
environment:
|
||||
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
|
||||
E2E_AWS_TARGET: nvidia-oss-production
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
- name: checkout contrib
|
||||
checkoutStep:
|
||||
repository: siderolabs/contrib
|
||||
ref: main
|
||||
path: _out/contrib
|
||||
- name: setup tf
|
||||
terraformStep: true
|
||||
- name: tf apply
|
||||
command: e2e-cloud-tf
|
||||
environment:
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_E2E_ACTION: apply
|
||||
- name: e2e-aws-nvidia-oss-production
|
||||
command: e2e-aws
|
||||
environment:
|
||||
EXTRA_TEST_ARGS: "-talos.extensions.nvidia -talos.verifyukibooted=false"
|
||||
- name: tf destroy
|
||||
command: e2e-cloud-tf
|
||||
conditions:
|
||||
- always
|
||||
environment:
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_E2E_ACTION: destroy
|
||||
TF_E2E_REFRESH_ON_DESTROY: false
|
||||
- name: integration-aws-nvidia-nonfree-lts
|
||||
buildxOptions:
|
||||
enabled: true
|
||||
sops: true
|
||||
depends:
|
||||
- default
|
||||
runners:
|
||||
- self-hosted
|
||||
- generic # we can use generic here since the tests run against a remote talos cluster
|
||||
crons:
|
||||
- '30 7 * * *'
|
||||
triggerLabels:
|
||||
- integration/aws-nvidia-nonfree-lts
|
||||
- integration/aws-nvidia-nonfree
|
||||
- integration/aws-nvidia
|
||||
- integration/aws
|
||||
steps:
|
||||
- name: download-artifacts
|
||||
conditions:
|
||||
- not-on-schedule
|
||||
artifactStep:
|
||||
type: download
|
||||
artifactName: talos-artifacts
|
||||
artifactPath: _out
|
||||
- name: ci-temp-release-tag
|
||||
conditions:
|
||||
- not-on-schedule
|
||||
- name: generate
|
||||
conditions:
|
||||
- only-on-schedule
|
||||
- name: uki-certs
|
||||
conditions:
|
||||
- only-on-schedule
|
||||
environment:
|
||||
PLATFORM: linux/amd64
|
||||
- name: build
|
||||
conditions:
|
||||
- only-on-schedule
|
||||
command: talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64
|
||||
environment:
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
PUSH: true
|
||||
- name: talosctl-cni-bundle
|
||||
conditions:
|
||||
- only-on-schedule
|
||||
- name: image-aws
|
||||
environment:
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
@ -2776,7 +2874,7 @@ spec:
|
||||
- name: e2e-aws-prepare
|
||||
environment:
|
||||
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
|
||||
E2E_AWS_TARGET: nvidia-nonfree
|
||||
E2E_AWS_TARGET: nvidia-nonfree-lts
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
- name: checkout contrib
|
||||
checkoutStep:
|
||||
@ -2791,7 +2889,105 @@ spec:
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_E2E_ACTION: apply
|
||||
- name: e2e-aws-nvidia-nonfree
|
||||
- name: e2e-aws-nvidia-nonfree-lts
|
||||
command: e2e-aws
|
||||
environment:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia
|
||||
- name: tf destroy
|
||||
command: e2e-cloud-tf
|
||||
conditions:
|
||||
- always
|
||||
environment:
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_E2E_ACTION: destroy
|
||||
TF_E2E_REFRESH_ON_DESTROY: false
|
||||
- name: integration-aws-nvidia-nonfree-production
|
||||
buildxOptions:
|
||||
enabled: true
|
||||
sops: true
|
||||
depends:
|
||||
- default
|
||||
runners:
|
||||
- self-hosted
|
||||
- generic # we can use generic here since the tests run against a remote talos cluster
|
||||
crons:
|
||||
- '30 7 * * *'
|
||||
triggerLabels:
|
||||
- integration/aws-nvidia-nonfree-production
|
||||
- integration/aws-nvidia-nonfree
|
||||
- integration/aws-nvidia
|
||||
- integration/aws
|
||||
steps:
|
||||
- name: download-artifacts
|
||||
conditions:
|
||||
- not-on-schedule
|
||||
artifactStep:
|
||||
type: download
|
||||
artifactName: talos-artifacts
|
||||
artifactPath: _out
|
||||
- name: ci-temp-release-tag
|
||||
conditions:
|
||||
- not-on-schedule
|
||||
- name: generate
|
||||
conditions:
|
||||
- only-on-schedule
|
||||
- name: uki-certs
|
||||
conditions:
|
||||
- only-on-schedule
|
||||
environment:
|
||||
PLATFORM: linux/amd64
|
||||
- name: build
|
||||
conditions:
|
||||
- only-on-schedule
|
||||
command: talosctl-linux-amd64 kernel sd-boot sd-stub initramfs installer-base imager talos _out/integration-test-linux-amd64
|
||||
environment:
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
PUSH: true
|
||||
- name: talosctl-cni-bundle
|
||||
conditions:
|
||||
- only-on-schedule
|
||||
- name: image-aws
|
||||
environment:
|
||||
PLATFORM: linux/amd64,linux/arm64
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
- name: checkout extensions
|
||||
checkoutStep:
|
||||
repository: siderolabs/extensions
|
||||
ref: main
|
||||
path: _out/extensions
|
||||
- name: set variables
|
||||
nonMakeStep: true
|
||||
command: cat _out/talos-metadata >> "$GITHUB_ENV"
|
||||
- name: build extensions
|
||||
command: nvidia-container-toolkit-production nonfree-kmod-nvidia-production extensions-metadata
|
||||
arguments:
|
||||
- -C
|
||||
- _out/extensions
|
||||
environment:
|
||||
PLATFORM: linux/amd64
|
||||
PUSH: true
|
||||
REGISTRY: registry.dev.siderolabs.io
|
||||
- name: e2e-aws-prepare
|
||||
environment:
|
||||
EXTENSIONS_METADATA_FILE: _out/extensions/_out/extensions-metadata
|
||||
E2E_AWS_TARGET: nvidia-nonfree-production
|
||||
IMAGE_REGISTRY: registry.dev.siderolabs.io
|
||||
- name: checkout contrib
|
||||
checkoutStep:
|
||||
repository: siderolabs/contrib
|
||||
ref: main
|
||||
path: _out/contrib
|
||||
- name: setup tf
|
||||
terraformStep: true
|
||||
- name: tf apply
|
||||
command: e2e-cloud-tf
|
||||
environment:
|
||||
TF_SCRIPT_DIR: _out/contrib
|
||||
TF_E2E_TEST_TYPE: aws
|
||||
TF_E2E_ACTION: apply
|
||||
- name: e2e-aws-nvidia-nonfree-production
|
||||
command: e2e-aws
|
||||
environment:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia
|
||||
@ -2898,6 +3094,7 @@ spec:
|
||||
- customType: regex
|
||||
managerFilePatterns:
|
||||
- internal/integration/k8s/constants.go
|
||||
- internal/integration/api/constants.go
|
||||
matchStrings:
|
||||
- '\/\/\s+renovate: datasource=(?<datasource>.*?)(?:\s+extractVersion=(?<extractVersion>.+?))?(?:\s+versioning=(?<versioning>.+?))?\s+depName=(?<depName>.+?)?(?:\s+registryUrl=(?<registryUrl>.+?))?\s.*Version\s+=\s+\"(?<currentValue>.+?)\"'
|
||||
versioningTemplate: "{{#if versioning}}{{versioning}}{{else}}semver{{/if}}"
|
||||
|
||||
@ -10,7 +10,7 @@ function cloud_image_upload() {
|
||||
CLOUD_IMAGES_EXTRA_ARGS=("--name-prefix=${1}" "--target-clouds=aws" "--architectures=amd64" "--aws-regions=${REGION}")
|
||||
|
||||
case "${1}" in
|
||||
talos-e2e-nvidia-oss)
|
||||
talos-e2e-nvidia-oss-*)
|
||||
CLOUD_IMAGES_EXTRA_ARGS+=("--aws-force-bios")
|
||||
;;
|
||||
esac
|
||||
@ -24,15 +24,21 @@ function get_ami_id() {
|
||||
|
||||
function cloud_image_upload_with_extensions() {
|
||||
case "${1}" in
|
||||
nvidia-oss)
|
||||
nvidia-oss-lts)
|
||||
EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nvidia-open-gpu-kernel-modules-lts") or contains("nvidia-container-toolkit-lts") or contains("zfs")) and (contains("nvidia-fabricmanager") or contains("nonfree-kmod-nvidia") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
|
||||
;;
|
||||
nvidia-oss-production)
|
||||
EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nvidia-open-gpu-kernel-modules-production") or contains("nvidia-container-toolkit-production") or contains("zfs")) and (contains("nvidia-fabricmanager") or contains("nonfree-kmod-nvidia") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
|
||||
;;
|
||||
nvidia-oss-fabricmanager)
|
||||
EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nvidia-open-gpu-kernel-modules-production") or contains("nvidia-container-toolkit-production")) and (contains("nonfree-kmod-nvidia") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
|
||||
;;
|
||||
nvidia-nonfree)
|
||||
nvidia-nonfree-lts)
|
||||
EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nonfree-kmod-nvidia-lts") or contains("nvidia-container-toolkit-lts")) and (contains("nvidia-fabricmanager") or contains("nvidia-open-gpu-kernel-modules") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
|
||||
;;
|
||||
nvidia-nonfree-production)
|
||||
EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nonfree-kmod-nvidia-production") or contains("nvidia-container-toolkit-production")) and (contains("nvidia-fabricmanager") or contains("nvidia-open-gpu-kernel-modules") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
|
||||
;;
|
||||
nvidia-nonfree-fabricmanager)
|
||||
EXTENSIONS=$(jq -R < "${EXTENSIONS_METADATA_FILE}" | jq -rs 'map(select(. | (contains("nonfree-kmod-nvidia-lts") or contains("nvidia-container-toolkit-lts")) and (contains("nvidia-open-gpu-kernel-modules") | not))) | .[] |= "--system-extension-image=" + . | join(" ")')
|
||||
;;
|
||||
@ -65,7 +71,7 @@ esac
|
||||
|
||||
mkdir -p "${ARTIFACTS}/e2e-aws-generated"
|
||||
|
||||
NAME_PREFIX="talos-e2e-${SHA}-aws-${E2E_AWS_TARGET}"
|
||||
NAME_PREFIX="${SHA}-${E2E_AWS_TARGET}"
|
||||
|
||||
jq --null-input \
|
||||
--arg WORKER_GROUP "${WORKER_GROUP}" \
|
||||
|
||||
16
internal/integration/api/constants.go
Normal file
16
internal/integration/api/constants.go
Normal file
@ -0,0 +1,16 @@
|
||||
// This Source Code Form is subject to the terms of the Mozilla Public
|
||||
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
//go:build integration_api
|
||||
|
||||
package api
|
||||
|
||||
const (
|
||||
// NvidiaDevicePluginChartVersion is the version of the NVIDIA device plugin chart to use
|
||||
// renovate: datasource=helm versioning=helm depName=nvidia-device-plugin registryUrl=https://nvidia.github.io/k8s-device-plugin
|
||||
NvidiaDevicePluginChartVersion = "v0.17.2"
|
||||
// NvidiaCUDATestImageVersion is the version of the NVIDIA CUDA test image to use
|
||||
// renovate: datasource=docker depName=nvcr.io/nvidia/k8s/cuda-sample
|
||||
NvidiaCUDATestImageVersion = "vectoradd-cuda12.5.0"
|
||||
)
|
||||
@ -8,12 +8,13 @@ package api
|
||||
|
||||
import (
|
||||
"context"
|
||||
_ "embed"
|
||||
"fmt"
|
||||
"io"
|
||||
"time"
|
||||
|
||||
"github.com/siderolabs/go-pointer"
|
||||
"github.com/siderolabs/go-retry/retry"
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
batchv1 "k8s.io/api/batch/v1"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
nodev1 "k8s.io/api/node/v1"
|
||||
@ -22,6 +23,9 @@ import (
|
||||
"github.com/siderolabs/talos/internal/integration/base"
|
||||
)
|
||||
|
||||
//go:embed testdata/nvidia-device-plugin.yaml
|
||||
var nvidiaDevicePluginHelmChartValues []byte
|
||||
|
||||
// ExtensionsSuiteNVIDIA verifies that the NVIDIA system extensions work on Talos.
|
||||
type ExtensionsSuiteNVIDIA struct {
|
||||
base.K8sSuite
|
||||
@ -97,13 +101,18 @@ func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() {
|
||||
|
||||
suite.Require().NoError(err)
|
||||
|
||||
_, err = suite.Clientset.AppsV1().DaemonSets("kube-system").Create(suite.ctx, nvidiaDevicePluginDaemonSetSpec(), metav1.CreateOptions{})
|
||||
defer suite.Clientset.AppsV1().DaemonSets("kube-system").Delete(suite.ctx, "nvidia-device-plugin", metav1.DeleteOptions{}) //nolint:errcheck
|
||||
|
||||
suite.Require().NoError(err)
|
||||
suite.Require().NoError(suite.HelmInstall(
|
||||
suite.ctx,
|
||||
"kube-system",
|
||||
"https://nvidia.github.io/k8s-device-plugin",
|
||||
NvidiaDevicePluginChartVersion,
|
||||
"nvidia-device-plugin",
|
||||
"nvidia-device-plugin",
|
||||
nvidiaDevicePluginHelmChartValues,
|
||||
))
|
||||
|
||||
// now we can create a cuda test job
|
||||
_, err = suite.Clientset.BatchV1().Jobs("default").Create(suite.ctx, nvidiaCUDATestJob("nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1"), metav1.CreateOptions{})
|
||||
_, err = suite.Clientset.BatchV1().Jobs("default").Create(suite.ctx, nvidiaCUDATestJob(), metav1.CreateOptions{})
|
||||
defer suite.Clientset.BatchV1().Jobs("default").Delete(suite.ctx, "cuda-test", metav1.DeleteOptions{}) //nolint:errcheck
|
||||
|
||||
suite.Require().NoError(err)
|
||||
@ -199,80 +208,7 @@ func (suite *ExtensionsSuiteNVIDIA) getNVIDIANodes(labelQuery string) []string {
|
||||
return nodeList
|
||||
}
|
||||
|
||||
func nvidiaDevicePluginDaemonSetSpec() *appsv1.DaemonSet {
|
||||
return &appsv1.DaemonSet{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "nvidia-device-plugin",
|
||||
},
|
||||
Spec: appsv1.DaemonSetSpec{
|
||||
Selector: &metav1.LabelSelector{
|
||||
MatchLabels: map[string]string{
|
||||
"app.kubernetes.io/name": "nvidia-device-plugin",
|
||||
},
|
||||
},
|
||||
UpdateStrategy: appsv1.DaemonSetUpdateStrategy{
|
||||
Type: appsv1.RollingUpdateDaemonSetStrategyType,
|
||||
},
|
||||
Template: corev1.PodTemplateSpec{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Labels: map[string]string{
|
||||
"app.kubernetes.io/name": "nvidia-device-plugin",
|
||||
},
|
||||
},
|
||||
Spec: corev1.PodSpec{
|
||||
PriorityClassName: "system-node-critical",
|
||||
RuntimeClassName: pointer.To("nvidia"),
|
||||
Containers: []corev1.Container{
|
||||
{
|
||||
Name: "nvidia-device-plugin-ctr",
|
||||
Image: "nvcr.io/nvidia/k8s-device-plugin:v0.14.1",
|
||||
Env: []corev1.EnvVar{
|
||||
{
|
||||
Name: "NVIDIA_MIG_MONITOR_DEVICES",
|
||||
Value: "all",
|
||||
},
|
||||
},
|
||||
SecurityContext: &corev1.SecurityContext{
|
||||
Capabilities: &corev1.Capabilities{
|
||||
Add: []corev1.Capability{"SYS_ADMIN"},
|
||||
},
|
||||
},
|
||||
VolumeMounts: []corev1.VolumeMount{
|
||||
{
|
||||
Name: "device-plugin",
|
||||
MountPath: "/var/lib/kubelet/device-plugins",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Volumes: []corev1.Volume{
|
||||
{
|
||||
Name: "device-plugin",
|
||||
VolumeSource: corev1.VolumeSource{
|
||||
HostPath: &corev1.HostPathVolumeSource{
|
||||
Path: "/var/lib/kubelet/device-plugins",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Tolerations: []corev1.Toleration{
|
||||
{
|
||||
Key: "CriticalAddonsOnly",
|
||||
Operator: corev1.TolerationOpExists,
|
||||
},
|
||||
{
|
||||
Effect: corev1.TaintEffectNoSchedule,
|
||||
Key: "nvidia.com/gpu",
|
||||
Operator: corev1.TolerationOpExists,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func nvidiaCUDATestJob(image string) *batchv1.Job {
|
||||
func nvidiaCUDATestJob() *batchv1.Job {
|
||||
return &batchv1.Job{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "cuda-test",
|
||||
@ -290,7 +226,7 @@ func nvidiaCUDATestJob(image string) *batchv1.Job {
|
||||
Containers: []corev1.Container{
|
||||
{
|
||||
Name: "cuda-test",
|
||||
Image: image,
|
||||
Image: fmt.Sprintf("nvcr.io/nvidia/k8s/cuda-sample:%s", NvidiaCUDATestImageVersion),
|
||||
},
|
||||
},
|
||||
Affinity: &corev1.Affinity{
|
||||
|
||||
1
internal/integration/api/testdata/nvidia-device-plugin.yaml
vendored
Normal file
1
internal/integration/api/testdata/nvidia-device-plugin.yaml
vendored
Normal file
@ -0,0 +1 @@
|
||||
runtimeClassName: nvidia
|
||||
@ -57,16 +57,17 @@ The NVIDIA modules should be loaded and the system extension should be installed
|
||||
This can be confirmed by running:
|
||||
|
||||
```bash
|
||||
talosctl read /proc/modules
|
||||
talosctl get modules
|
||||
```
|
||||
|
||||
which should produce output similar to the following:
|
||||
|
||||
```text
|
||||
nvidia_uvm 1146880 - - Live 0xffffffffc2733000 (PO)
|
||||
nvidia_drm 69632 - - Live 0xffffffffc2721000 (PO)
|
||||
nvidia_modeset 1142784 - - Live 0xffffffffc25ea000 (PO)
|
||||
nvidia 39047168 - - Live 0xffffffffc00ac000 (PO)
|
||||
NODE NAMESPACE TYPE ID VERSION STATE
|
||||
10.5.0.3 runtime LoadedKernelModule nvidia_uvm 1 Live
|
||||
10.5.0.3 runtime LoadedKernelModule nvidia_drm 1 Live
|
||||
10.5.0.3 runtime LoadedKernelModule nvidia_modeset 1 Live
|
||||
10.5.0.3 runtime LoadedKernelModule nvidia 1 Live
|
||||
```
|
||||
|
||||
```bash
|
||||
@ -81,17 +82,6 @@ NODE NAMESPACE TYPE ID
|
||||
172.31.41.27 runtime ExtensionStatus 000.ghcr.io-siderolabs-nvidia-open-gpu-kernel-modules-515.65.01-v1.2.0 1 nvidia-open-gpu-kernel-modules 515.65.01-v1.2.0
|
||||
```
|
||||
|
||||
```bash
|
||||
talosctl read /proc/driver/nvidia/version
|
||||
```
|
||||
|
||||
which should produce an output similar to below:
|
||||
|
||||
```text
|
||||
NVRM version: NVIDIA UNIX x86_64 Kernel Module 515.65.01 Wed Mar 16 11:24:05 UTC 2022
|
||||
GCC version: gcc version 12.2.0 (GCC)
|
||||
```
|
||||
|
||||
## Deploying NVIDIA device plugin
|
||||
|
||||
First we need to create the `RuntimeClass`
|
||||
@ -151,7 +141,7 @@ kubectl run \
|
||||
nvidia-test \
|
||||
--restart=Never \
|
||||
-ti --rm \
|
||||
--image nvcr.io/nvidia/cuda:12.5.0-base-ubuntu22.04 \
|
||||
--image nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0 \
|
||||
--overrides '{"spec": {"runtimeClassName": "nvidia"}}' \
|
||||
nvidia-smi
|
||||
```
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user