test: fix NVIDIA OSS tests

Add more logging output.

Force non-UEFI boot.

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
This commit is contained in:
Andrey Smirnov 2025-04-11 18:08:32 +04:00
parent 62f2d27cd4
commit 8cd3c8dc77
No known key found for this signature in database
GPG Key ID: FE042E3D4085A811
10 changed files with 48 additions and 26 deletions

View File

@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2025-04-10T08:42:48Z by kres d903dae.
# Generated on 2025-04-15T15:57:38Z by kres fd5cab0.
name: default
concurrency:
@ -634,7 +634,6 @@ jobs:
- name: e2e-aws-nvidia-nonfree
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia
TEST_NUM_NODES: "4"
run: |
make e2e-aws
- name: tf destroy
@ -788,8 +787,7 @@ jobs:
make e2e-cloud-tf
- name: e2e-aws-nvidia-oss
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia
TEST_NUM_NODES: "4"
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
run: |
make e2e-aws
- name: tf destroy

View File

@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2025-04-01T08:14:24Z by kres d903dae.
# Generated on 2025-04-15T15:57:38Z by kres fd5cab0.
name: integration-aws-nvidia-nonfree-cron
concurrency:
@ -144,7 +144,6 @@ jobs:
- name: e2e-aws-nvidia-nonfree
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia
TEST_NUM_NODES: "4"
run: |
make e2e-aws
- name: tf destroy

View File

@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2025-04-04T11:30:51Z by kres d903dae.
# Generated on 2025-04-15T15:57:38Z by kres fd5cab0.
name: integration-aws-nvidia-oss-cron
concurrency:
@ -143,8 +143,7 @@ jobs:
make e2e-cloud-tf
- name: e2e-aws-nvidia-oss
env:
EXTRA_TEST_ARGS: -talos.extensions.nvidia
TEST_NUM_NODES: "4"
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
run: |
make e2e-aws
- name: tf destroy

View File

@ -2562,8 +2562,7 @@ spec:
- name: e2e-aws-nvidia-oss
command: e2e-aws
environment:
TEST_NUM_NODES: 4
EXTRA_TEST_ARGS: -talos.extensions.nvidia
EXTRA_TEST_ARGS: "-talos.extensions.nvidia -talos.verifyukibooted=false"
- name: tf destroy
command: e2e-cloud-tf
conditions:
@ -2660,7 +2659,6 @@ spec:
- name: e2e-aws-nvidia-nonfree
command: e2e-aws
environment:
TEST_NUM_NODES: 4
EXTRA_TEST_ARGS: -talos.extensions.nvidia
- name: tf destroy
command: e2e-cloud-tf

View File

@ -384,7 +384,7 @@ func (au *AWSUploader) registerAMIArch(ctx context.Context, region string, svc *
log.Printf("aws: import into %s/%s, deregistered image ID %q", region, arch, *image.ImageId)
}
registerResp, err := svc.RegisterImage(ctx, &ec2.RegisterImageInput{
registerReq := &ec2.RegisterImageInput{
Name: aws.String(imageName),
BlockDeviceMappings: []types.BlockDeviceMapping{
{
@ -404,8 +404,13 @@ func (au *AWSUploader) registerAMIArch(ctx context.Context, region string, svc *
Description: pointer.To(fmt.Sprintf("Talos AMI %s %s %s", au.Options.Tag, arch, region)),
Architecture: awsArchitectures[arch],
ImdsSupport: types.ImdsSupportValuesV20,
BootMode: types.BootModeValuesUefiPreferred,
})
}
if !au.Options.AWSForceBIOS {
registerReq.BootMode = types.BootModeValuesUefiPreferred
}
registerResp, err := svc.RegisterImage(ctx, registerReq)
if err != nil {
return err
}

View File

@ -64,6 +64,7 @@ func run() error {
pflag.StringVar(&DefaultOptions.NamePrefix, "name-prefix", DefaultOptions.NamePrefix, "prefix for the name of the uploaded image")
pflag.StringSliceVar(&DefaultOptions.AWSRegions, "aws-regions", DefaultOptions.AWSRegions, "list of AWS regions to upload to")
pflag.BoolVar(&DefaultOptions.AWSForceBIOS, "aws-force-bios", DefaultOptions.AWSForceBIOS, "force BIOS boot mode for AWS images")
pflag.Parse()

View File

@ -18,7 +18,8 @@ type Options struct {
TargetClouds []string
// AWS options.
AWSRegions []string
AWSRegions []string
AWSForceBIOS bool
}
// DefaultOptions used throughout the cli.

View File

@ -9,6 +9,12 @@ REGION="us-east-1"
# cloud_image_upload runs `make cloud-images` to upload an amd64 AWS image
# into ${REGION}, using the first argument as the image name prefix.
#
# Arguments:
#   $1 - name prefix passed via --name-prefix; the prefix
#        "talos-e2e-nvidia-oss" additionally gets --aws-force-bios.
#
# Note: CLOUD_IMAGES_EXTRA_ARGS is intentionally left as a global (non-local)
# variable, and is flattened into a single space-separated string for make.
function cloud_image_upload() {
  CLOUD_IMAGES_EXTRA_ARGS=(
    "--name-prefix=${1}"
    "--target-clouds=aws"
    "--architectures=amd64"
    "--aws-regions=${REGION}"
  )

  # Only the NVIDIA OSS e2e image is registered with the BIOS boot override.
  if [[ "${1}" == "talos-e2e-nvidia-oss" ]]; then
    CLOUD_IMAGES_EXTRA_ARGS+=("--aws-force-bios")
  fi

  make cloud-images CLOUD_IMAGES_EXTRA_ARGS="${CLOUD_IMAGES_EXTRA_ARGS[*]}"
}

View File

@ -37,7 +37,6 @@ export KUBERNETES_VERSION=${KUBERNETES_VERSION:-1.33.0-rc.0}
export NAME_PREFIX="talos-e2e-${SHA}-${PLATFORM}"
export TIMEOUT=1200
export NUM_NODES=${TEST_NUM_NODES:-6}
# default values, overridden by talosctl cluster create tests
PROVISIONER=

View File

@ -53,6 +53,8 @@ func (suite *ExtensionsSuiteNVIDIA) TearDownTest() {
}
// TestExtensionsNVIDIA verifies that a cuda workload can be run.
//
//nolint:gocyclo
func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() {
expectedModulesModDep := map[string]string{
"nvidia": "nvidia.ko",
@ -126,6 +128,14 @@ func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() {
return retry.ExpectedErrorf("error getting pod: %s", listErr)
}
for _, pod := range podList.Items {
if pod.Status.Phase == corev1.PodFailed {
logData := suite.getPodLogs("default", pod.Name)
suite.T().Logf("pod %s logs:\n%s", pod.Name, logData)
}
}
if len(podList.Items) != 1 {
return retry.ExpectedErrorf("expected 1 pod, got %d", len(podList.Items))
}
@ -147,19 +157,25 @@ func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() {
suite.Require().Len(podList.Items, 1)
for _, pod := range podList.Items {
res := suite.Clientset.CoreV1().Pods("default").GetLogs(pod.Name, &corev1.PodLogOptions{})
stream, err := res.Stream(suite.ctx)
suite.Require().NoError(err)
logData := suite.getPodLogs("default", pod.Name)
defer stream.Close() //nolint:errcheck
logData, err := io.ReadAll(stream)
suite.Require().NoError(err)
suite.Require().Contains(string(logData), "Test PASSED")
suite.Require().Contains(logData, "Test PASSED")
}
}
// getPodLogs returns the full log output of the pod identified by namespace
// and name as a string.
//
// The logs are requested with default PodLogOptions and read to completion.
// Any error while opening or reading the log stream fails the test via
// suite.Require().
func (suite *ExtensionsSuiteNVIDIA) getPodLogs(namespace, name string) string {
	logRequest := suite.Clientset.CoreV1().Pods(namespace).GetLogs(name, &corev1.PodLogOptions{})

	logStream, openErr := logRequest.Stream(suite.ctx)
	suite.Require().NoError(openErr)

	// Close errors are deliberately ignored: the stream is read-only.
	defer logStream.Close() //nolint:errcheck

	contents, readErr := io.ReadAll(logStream)
	suite.Require().NoError(readErr)

	return string(contents)
}
func (suite *ExtensionsSuiteNVIDIA) getNVIDIANodes(labelQuery string) []string {
nodes, err := suite.Clientset.CoreV1().Nodes().List(suite.ctx, metav1.ListOptions{
LabelSelector: labelQuery,