mirror of
https://github.com/siderolabs/talos.git
synced 2026-05-05 04:16:21 +02:00
test: fix NVIDIA OSS tests
Add more logging output. Force non-UEFI boot. Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
This commit is contained in:
parent
62f2d27cd4
commit
8cd3c8dc77
6
.github/workflows/ci.yaml
vendored
6
.github/workflows/ci.yaml
vendored
@ -1,6 +1,6 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
|
||||
#
|
||||
# Generated on 2025-04-10T08:42:48Z by kres d903dae.
|
||||
# Generated on 2025-04-15T15:57:38Z by kres fd5cab0.
|
||||
|
||||
name: default
|
||||
concurrency:
|
||||
@ -634,7 +634,6 @@ jobs:
|
||||
- name: e2e-aws-nvidia-nonfree
|
||||
env:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia
|
||||
TEST_NUM_NODES: "4"
|
||||
run: |
|
||||
make e2e-aws
|
||||
- name: tf destroy
|
||||
@ -788,8 +787,7 @@ jobs:
|
||||
make e2e-cloud-tf
|
||||
- name: e2e-aws-nvidia-oss
|
||||
env:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia
|
||||
TEST_NUM_NODES: "4"
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
|
||||
run: |
|
||||
make e2e-aws
|
||||
- name: tf destroy
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
|
||||
#
|
||||
# Generated on 2025-04-01T08:14:24Z by kres d903dae.
|
||||
# Generated on 2025-04-15T15:57:38Z by kres fd5cab0.
|
||||
|
||||
name: integration-aws-nvidia-nonfree-cron
|
||||
concurrency:
|
||||
@ -144,7 +144,6 @@ jobs:
|
||||
- name: e2e-aws-nvidia-nonfree
|
||||
env:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia
|
||||
TEST_NUM_NODES: "4"
|
||||
run: |
|
||||
make e2e-aws
|
||||
- name: tf destroy
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
|
||||
#
|
||||
# Generated on 2025-04-04T11:30:51Z by kres d903dae.
|
||||
# Generated on 2025-04-15T15:57:38Z by kres fd5cab0.
|
||||
|
||||
name: integration-aws-nvidia-oss-cron
|
||||
concurrency:
|
||||
@ -143,8 +143,7 @@ jobs:
|
||||
make e2e-cloud-tf
|
||||
- name: e2e-aws-nvidia-oss
|
||||
env:
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia
|
||||
TEST_NUM_NODES: "4"
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia -talos.verifyukibooted=false
|
||||
run: |
|
||||
make e2e-aws
|
||||
- name: tf destroy
|
||||
|
||||
@ -2562,8 +2562,7 @@ spec:
|
||||
- name: e2e-aws-nvidia-oss
|
||||
command: e2e-aws
|
||||
environment:
|
||||
TEST_NUM_NODES: 4
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia
|
||||
EXTRA_TEST_ARGS: "-talos.extensions.nvidia -talos.verifyukibooted=false"
|
||||
- name: tf destroy
|
||||
command: e2e-cloud-tf
|
||||
conditions:
|
||||
@ -2660,7 +2659,6 @@ spec:
|
||||
- name: e2e-aws-nvidia-nonfree
|
||||
command: e2e-aws
|
||||
environment:
|
||||
TEST_NUM_NODES: 4
|
||||
EXTRA_TEST_ARGS: -talos.extensions.nvidia
|
||||
- name: tf destroy
|
||||
command: e2e-cloud-tf
|
||||
|
||||
@ -384,7 +384,7 @@ func (au *AWSUploader) registerAMIArch(ctx context.Context, region string, svc *
|
||||
log.Printf("aws: import into %s/%s, deregistered image ID %q", region, arch, *image.ImageId)
|
||||
}
|
||||
|
||||
registerResp, err := svc.RegisterImage(ctx, &ec2.RegisterImageInput{
|
||||
registerReq := &ec2.RegisterImageInput{
|
||||
Name: aws.String(imageName),
|
||||
BlockDeviceMappings: []types.BlockDeviceMapping{
|
||||
{
|
||||
@ -404,8 +404,13 @@ func (au *AWSUploader) registerAMIArch(ctx context.Context, region string, svc *
|
||||
Description: pointer.To(fmt.Sprintf("Talos AMI %s %s %s", au.Options.Tag, arch, region)),
|
||||
Architecture: awsArchitectures[arch],
|
||||
ImdsSupport: types.ImdsSupportValuesV20,
|
||||
BootMode: types.BootModeValuesUefiPreferred,
|
||||
})
|
||||
}
|
||||
|
||||
if !au.Options.AWSForceBIOS {
|
||||
registerReq.BootMode = types.BootModeValuesUefiPreferred
|
||||
}
|
||||
|
||||
registerResp, err := svc.RegisterImage(ctx, registerReq)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@ -64,6 +64,7 @@ func run() error {
|
||||
pflag.StringVar(&DefaultOptions.NamePrefix, "name-prefix", DefaultOptions.NamePrefix, "prefix for the name of the uploaded image")
|
||||
|
||||
pflag.StringSliceVar(&DefaultOptions.AWSRegions, "aws-regions", DefaultOptions.AWSRegions, "list of AWS regions to upload to")
|
||||
pflag.BoolVar(&DefaultOptions.AWSForceBIOS, "aws-force-bios", DefaultOptions.AWSForceBIOS, "force BIOS boot mode for AWS images")
|
||||
|
||||
pflag.Parse()
|
||||
|
||||
|
||||
@ -18,7 +18,8 @@ type Options struct {
|
||||
TargetClouds []string
|
||||
|
||||
// AWS options.
|
||||
AWSRegions []string
|
||||
AWSRegions []string
|
||||
AWSForceBIOS bool
|
||||
}
|
||||
|
||||
// DefaultOptions used throughout the cli.
|
||||
|
||||
@ -9,6 +9,12 @@ REGION="us-east-1"
|
||||
# cloud_image_upload invokes `make cloud-images` for AWS/amd64 in ${REGION}.
#
# Arguments:
#   $1 - name prefix passed as --name-prefix; when it is exactly
#        "talos-e2e-nvidia-oss", --aws-force-bios is appended as well.
#
# Note: CLOUD_IMAGES_EXTRA_ARGS is intentionally left as a global array,
# and is flattened into a single space-separated make variable.
function cloud_image_upload() {
  CLOUD_IMAGES_EXTRA_ARGS=(
    "--name-prefix=${1}"
    "--target-clouds=aws"
    "--architectures=amd64"
    "--aws-regions=${REGION}"
  )

  if [[ "${1}" == "talos-e2e-nvidia-oss" ]]; then
    CLOUD_IMAGES_EXTRA_ARGS+=("--aws-force-bios")
  fi

  make cloud-images CLOUD_IMAGES_EXTRA_ARGS="${CLOUD_IMAGES_EXTRA_ARGS[*]}"
}
|
||||
|
||||
|
||||
@ -37,7 +37,6 @@ export KUBERNETES_VERSION=${KUBERNETES_VERSION:-1.33.0-rc.0}
|
||||
|
||||
export NAME_PREFIX="talos-e2e-${SHA}-${PLATFORM}"
|
||||
export TIMEOUT=1200
|
||||
export NUM_NODES=${TEST_NUM_NODES:-6}
|
||||
|
||||
# default values, overridden by talosctl cluster create tests
|
||||
PROVISIONER=
|
||||
|
||||
@ -53,6 +53,8 @@ func (suite *ExtensionsSuiteNVIDIA) TearDownTest() {
|
||||
}
|
||||
|
||||
// TestExtensionsNVIDIA verifies that a cuda workload can be run.
|
||||
//
|
||||
//nolint:gocyclo
|
||||
func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() {
|
||||
expectedModulesModDep := map[string]string{
|
||||
"nvidia": "nvidia.ko",
|
||||
@ -126,6 +128,14 @@ func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() {
|
||||
return retry.ExpectedErrorf("error getting pod: %s", listErr)
|
||||
}
|
||||
|
||||
for _, pod := range podList.Items {
|
||||
if pod.Status.Phase == corev1.PodFailed {
|
||||
logData := suite.getPodLogs("default", pod.Name)
|
||||
|
||||
suite.T().Logf("pod %s logs:\n%s", pod.Name, logData)
|
||||
}
|
||||
}
|
||||
|
||||
if len(podList.Items) != 1 {
|
||||
return retry.ExpectedErrorf("expected 1 pod, got %d", len(podList.Items))
|
||||
}
|
||||
@ -147,19 +157,25 @@ func (suite *ExtensionsSuiteNVIDIA) TestExtensionsNVIDIA() {
|
||||
suite.Require().Len(podList.Items, 1)
|
||||
|
||||
for _, pod := range podList.Items {
|
||||
res := suite.Clientset.CoreV1().Pods("default").GetLogs(pod.Name, &corev1.PodLogOptions{})
|
||||
stream, err := res.Stream(suite.ctx)
|
||||
suite.Require().NoError(err)
|
||||
logData := suite.getPodLogs("default", pod.Name)
|
||||
|
||||
defer stream.Close() //nolint:errcheck
|
||||
|
||||
logData, err := io.ReadAll(stream)
|
||||
suite.Require().NoError(err)
|
||||
|
||||
suite.Require().Contains(string(logData), "Test PASSED")
|
||||
suite.Require().Contains(logData, "Test PASSED")
|
||||
}
|
||||
}
|
||||
|
||||
func (suite *ExtensionsSuiteNVIDIA) getPodLogs(namespace, name string) string {
|
||||
res := suite.Clientset.CoreV1().Pods(namespace).GetLogs(name, &corev1.PodLogOptions{})
|
||||
stream, err := res.Stream(suite.ctx)
|
||||
suite.Require().NoError(err)
|
||||
|
||||
defer stream.Close() //nolint:errcheck
|
||||
|
||||
logData, err := io.ReadAll(stream)
|
||||
suite.Require().NoError(err)
|
||||
|
||||
return string(logData)
|
||||
}
|
||||
|
||||
func (suite *ExtensionsSuiteNVIDIA) getNVIDIANodes(labelQuery string) []string {
|
||||
nodes, err := suite.Clientset.CoreV1().Nodes().List(suite.ctx, metav1.ListOptions{
|
||||
LabelSelector: labelQuery,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user