diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4fc9bfb28..522443509 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2025-04-10T08:42:48Z by kres d903dae. +# Generated on 2025-04-15T15:57:38Z by kres fd5cab0. name: default concurrency: @@ -634,7 +634,6 @@ jobs: - name: e2e-aws-nvidia-nonfree env: EXTRA_TEST_ARGS: -talos.extensions.nvidia - TEST_NUM_NODES: "4" run: | make e2e-aws - name: tf destroy @@ -788,8 +787,7 @@ jobs: make e2e-cloud-tf - name: e2e-aws-nvidia-oss env: - EXTRA_TEST_ARGS: -talos.extensions.nvidia - TEST_NUM_NODES: "4" + EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false run: | make e2e-aws - name: tf destroy diff --git a/.github/workflows/integration-aws-nvidia-nonfree-cron.yaml b/.github/workflows/integration-aws-nvidia-nonfree-cron.yaml index f7cdb8f95..70ce27e96 100644 --- a/.github/workflows/integration-aws-nvidia-nonfree-cron.yaml +++ b/.github/workflows/integration-aws-nvidia-nonfree-cron.yaml @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2025-04-01T08:14:24Z by kres d903dae. +# Generated on 2025-04-15T15:57:38Z by kres fd5cab0. name: integration-aws-nvidia-nonfree-cron concurrency: @@ -144,7 +144,6 @@ jobs: - name: e2e-aws-nvidia-nonfree env: EXTRA_TEST_ARGS: -talos.extensions.nvidia - TEST_NUM_NODES: "4" run: | make e2e-aws - name: tf destroy diff --git a/.github/workflows/integration-aws-nvidia-oss-cron.yaml b/.github/workflows/integration-aws-nvidia-oss-cron.yaml index a1175ab1f..3a2ebe635 100644 --- a/.github/workflows/integration-aws-nvidia-oss-cron.yaml +++ b/.github/workflows/integration-aws-nvidia-oss-cron.yaml @@ -1,6 +1,6 @@ # THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT. # -# Generated on 2025-04-04T11:30:51Z by kres d903dae. +# Generated on 2025-04-15T15:57:38Z by kres fd5cab0. name: integration-aws-nvidia-oss-cron concurrency: @@ -143,8 +143,7 @@ jobs: make e2e-cloud-tf - name: e2e-aws-nvidia-oss env: - EXTRA_TEST_ARGS: -talos.extensions.nvidia - TEST_NUM_NODES: "4" + EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false run: | make e2e-aws - name: tf destroy diff --git a/.kres.yaml b/.kres.yaml index a541c009c..65095aba8 100644 --- a/.kres.yaml +++ b/.kres.yaml @@ -2562,8 +2562,7 @@ spec: - name: e2e-aws-nvidia-oss command: e2e-aws environment: - TEST_NUM_NODES: 4 - EXTRA_TEST_ARGS: -talos.extensions.nvidia + EXTRA_TEST_ARGS: "-talos.extensions.nvidia -talos.verifyukibooted=false" - name: tf destroy command: e2e-cloud-tf conditions: @@ -2660,7 +2659,6 @@ spec: - name: e2e-aws-nvidia-nonfree command: e2e-aws environment: - TEST_NUM_NODES: 4 EXTRA_TEST_ARGS: -talos.extensions.nvidia - name: tf destroy command: e2e-cloud-tf diff --git a/hack/cloud-image-uploader/aws.go b/hack/cloud-image-uploader/aws.go index 5b2f9e508..c206f0847 100644 --- a/hack/cloud-image-uploader/aws.go +++ b/hack/cloud-image-uploader/aws.go @@ -384,7 +384,7 @@ func (au *AWSUploader) registerAMIArch(ctx context.Context, region string, svc * log.Printf("aws: import into %s/%s, deregistered image ID %q", region, arch, *image.ImageId) } - registerResp, err := svc.RegisterImage(ctx, &ec2.RegisterImageInput{ + registerReq := &ec2.RegisterImageInput{ Name: aws.String(imageName), BlockDeviceMappings: []types.BlockDeviceMapping{ { @@ -404,8 +404,13 @@ func (au *AWSUploader) registerAMIArch(ctx context.Context, region string, svc * Description: pointer.To(fmt.Sprintf("Talos AMI %s %s %s", au.Options.Tag, arch, region)), Architecture: awsArchitectures[arch], ImdsSupport: types.ImdsSupportValuesV20, - BootMode: types.BootModeValuesUefiPreferred, - }) + } + + if !au.Options.AWSForceBIOS { + registerReq.BootMode = types.BootModeValuesUefiPreferred + } + + registerResp, err := svc.RegisterImage(ctx, registerReq) if err != nil { return err } diff --git a/hack/cloud-image-uploader/main.go b/hack/cloud-image-uploader/main.go index 3b56c7914..f00d3e0ce 100644 --- a/hack/cloud-image-uploader/main.go +++ b/hack/cloud-image-uploader/main.go @@ -64,6 +64,7 @@ func run() error { pflag.StringVar(&DefaultOptions.NamePrefix, "name-prefix", DefaultOptions.NamePrefix, "prefix for the name of the uploaded image") pflag.StringSliceVar(&DefaultOptions.AWSRegions, "aws-regions", DefaultOptions.AWSRegions, "list of AWS regions to upload to") + pflag.BoolVar(&DefaultOptions.AWSForceBIOS, "aws-force-bios", DefaultOptions.AWSForceBIOS, "force BIOS boot mode for AWS images") pflag.Parse() diff --git a/hack/cloud-image-uploader/options.go b/hack/cloud-image-uploader/options.go index 00e709689..ae4c337ad 100644 --- a/hack/cloud-image-uploader/options.go +++ b/hack/cloud-image-uploader/options.go @@ -18,7 +18,8 @@ type Options struct { TargetClouds []string // AWS options. - AWSRegions []string + AWSRegions []string + AWSForceBIOS bool } // DefaultOptions used throughout the cli. diff --git a/hack/test/e2e-aws-prepare.sh b/hack/test/e2e-aws-prepare.sh index f37300114..cbdce9841 100755 --- a/hack/test/e2e-aws-prepare.sh +++ b/hack/test/e2e-aws-prepare.sh @@ -9,6 +9,12 @@ REGION="us-east-1" function cloud_image_upload() { CLOUD_IMAGES_EXTRA_ARGS=("--name-prefix=${1}" "--target-clouds=aws" "--architectures=amd64" "--aws-regions=${REGION}") + case "${1}" in + talos-e2e-nvidia-oss) + CLOUD_IMAGES_EXTRA_ARGS+=("--aws-force-bios") + ;; + esac + make cloud-images CLOUD_IMAGES_EXTRA_ARGS="${CLOUD_IMAGES_EXTRA_ARGS[*]}" } diff --git a/hack/test/e2e.sh b/hack/test/e2e.sh index cfbb7ef0a..d38160290 100755 --- a/hack/test/e2e.sh +++ b/hack/test/e2e.sh @@ -37,7 +37,6 @@ export KUBERNETES_VERSION=${KUBERNETES_VERSION:-1.33.0-rc.0} export NAME_PREFIX="talos-e2e-${SHA}-${PLATFORM}" export TIMEOUT=1200 -export NUM_NODES=${TEST_NUM_NODES:-6} # default values, overridden by talosctl cluster create tests PROVISIONER= diff --git a/internal/integration/api/extensions_nvidia.go b/internal/integration/api/extensions_nvidia.go index 71b222978..035a4b8bc 100644 --- a/internal/integration/api/extensions_nvidia.go +++ b/internal/integration/api/extensions_nvidia.go @@ -53,6 +53,8 @@ func (suite *ExtensionsSuiteNVIDIA) TearDownTest() { } // TestExtensionsNVIDIA verifies that a cuda workload can be run. +// +//nolint:gocyclo func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() { expectedModulesModDep := map[string]string{ "nvidia": "nvidia.ko", @@ -126,6 +128,14 @@ func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() { return retry.ExpectedErrorf("error getting pod: %s", listErr) } + for _, pod := range podList.Items { + if pod.Status.Phase == corev1.PodFailed { + logData := suite.getPodLogs("default", pod.Name) + + suite.T().Logf("pod %s logs:\n%s", pod.Name, logData) + } + } + if len(podList.Items) != 1 { return retry.ExpectedErrorf("expected 1 pod, got %d", len(podList.Items)) } @@ -147,19 +157,25 @@ func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() { suite.Require().Len(podList.Items, 1) for _, pod := range podList.Items { - res := suite.Clientset.CoreV1().Pods("default").GetLogs(pod.Name, &corev1.PodLogOptions{}) - stream, err := res.Stream(suite.ctx) - suite.Require().NoError(err) + logData := suite.getPodLogs("default", pod.Name) - defer stream.Close() //nolint:errcheck - - logData, err := io.ReadAll(stream) - suite.Require().NoError(err) - - suite.Require().Contains(string(logData), "Test PASSED") + suite.Require().Contains(logData, "Test PASSED") } } +func (suite *ExtensionsSuiteNVIDIA) getPodLogs(namespace, name string) string { + res := suite.Clientset.CoreV1().Pods(namespace).GetLogs(name, &corev1.PodLogOptions{}) + stream, err := res.Stream(suite.ctx) + suite.Require().NoError(err) + + defer stream.Close() //nolint:errcheck + + logData, err := io.ReadAll(stream) + suite.Require().NoError(err) + + return string(logData) +} + func (suite *ExtensionsSuiteNVIDIA) getNVIDIANodes(labelQuery string) []string { nodes, err := suite.Clientset.CoreV1().Nodes().List(suite.ctx, metav1.ListOptions{ LabelSelector: labelQuery,