From 329926cd8ab198aa025e2dca47bce2bdc3c419c1 Mon Sep 17 00:00:00 2001 From: Utku Ozdemir Date: Fri, 30 Jan 2026 19:43:03 +0100 Subject: [PATCH] fix: fix schematic generation for machines in agent mode We had an issue with bare metal provider where two different schematic IDs would fight each other, causing machine to get installed with a wrong schematic ID, only to be upgraded to the correct one immediately, and in some cases, go into an upgrade loop between a correct and an incorrect schematic. The cause: Omni treated schematics it observed when the machine in agent mode dialed in, and stored the information it received (like kernel args and initial schematic info). This was wrong, as agent mode information essentially meaningless. Fix this by changing the simple check of "was the schematic info for machine X ever observed" to be "is the schematic info for machine X ready". The readiness check involves schematic being populated and machine not being in agent mode. This change caused `SchematicConfiguration` resource to not be generated before the machine leaves the agent mode, and caused a side effect: `InfraMachineController` would not receive Talos version from it and would not populate it on the `InfraMachine` resource. And this would cause BM provider to never get notified about the fact that the machine is allocated to a cluster, and would not power it on (to PXE boot it to "regular" Talos, for it to receive the "install" call to Omni). Change that controller to get the Talos version info directly from the Cluster resource. Signed-off-by: Utku Ozdemir (cherry picked from commit c319d7bcf27966494dde99f62aa3c6eb636640cd) --- client/api/omni/specs/machine_status.go | 9 ++ internal/backend/grpc/management.go | 7 +- internal/backend/kernelargs/kernelargs.go | 5 +- .../omni/controllers/omni/infra_machine.go | 67 +++++++--- .../controllers/omni/infra_machine_test.go | 9 +- .../omni/internal/talos/schematic.go | 34 +---- .../controllers/omni/kernelargs/status.go | 4 +- .../omni/kernelargs/status_test.go | 4 +- .../omni/machine_config_gen_options.go | 2 +- .../omni/controllers/omni/machine_status.go | 121 ++++++++++-------- .../controllers/omni/machine_status_test.go | 6 +- .../machineconfig/reconciliation_context.go | 26 ++-- .../controllers/omni/machineconfig/status.go | 2 +- .../controllers/omni/machineupgrade/status.go | 5 +- .../omni/machineupgrade/status_test.go | 1 + .../omni/controllers/omni/omni_test.go | 1 + .../omni/schematic_configuration.go | 10 +- .../omni/schematic_configuration_test.go | 2 + .../controllers/omni/talos_upgrade_status.go | 5 +- .../omni/controllers/testutils/rmock/rmock.go | 1 + internal/integration/integration_test.go | 2 +- 21 files changed, 178 insertions(+), 145 deletions(-) create mode 100644 client/api/omni/specs/machine_status.go diff --git a/client/api/omni/specs/machine_status.go b/client/api/omni/specs/machine_status.go new file mode 100644 index 00000000..caf76523 --- /dev/null +++ b/client/api/omni/specs/machine_status.go @@ -0,0 +1,9 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. + +package specs + +func (spec *MachineStatusSpec) SchematicReady() bool { + return spec.Schematic != nil && !spec.Schematic.InAgentMode && spec.Schematic.Id != "" && spec.Schematic.FullId != "" +} diff --git a/internal/backend/grpc/management.go b/internal/backend/grpc/management.go index c1cce48d..e2fe08d3 100644 --- a/internal/backend/grpc/management.go +++ b/internal/backend/grpc/management.go @@ -708,9 +708,8 @@ func (s *managementServer) MaintenanceUpgrade(ctx context.Context, req *manageme return nil, status.Error(codes.FailedPrecondition, "machine is not in maintenance mode") } - schematic := machineStatus.TypedSpec().Value.Schematic - if schematic == nil || schematic.FullId == "" { - return nil, status.Error(codes.FailedPrecondition, "machine schematic is not known yet") + if !machineStatus.TypedSpec().Value.SchematicReady() { + return nil, status.Error(codes.FailedPrecondition, "machine schematic is not ready yet") } platform := machineStatus.TypedSpec().Value.GetPlatformMetadata().GetPlatform() @@ -725,7 +724,7 @@ func (s *managementServer) MaintenanceUpgrade(ctx context.Context, req *manageme installImage := &specs.MachineConfigGenOptionsSpec_InstallImage{ TalosVersion: req.Version, - SchematicId: schematic.FullId, + SchematicId: machineStatus.TypedSpec().Value.Schematic.FullId, SchematicInitialized: true, Platform: platform, SecurityState: securityState, diff --git a/internal/backend/kernelargs/kernelargs.go b/internal/backend/kernelargs/kernelargs.go index d3edb5cb..648b805b 100644 --- a/internal/backend/kernelargs/kernelargs.go +++ b/internal/backend/kernelargs/kernelargs.go @@ -103,8 +103,7 @@ func UpdateSupported(machineStatus *omni.MachineStatus, getClusterMachineConfig } func Calculate(machineStatus *omni.MachineStatus, kernelArgs *omni.KernelArgs) (args []string, initialized bool, err error) { - schematicConfig := machineStatus.TypedSpec().Value.Schematic - if schematicConfig == nil { + if !machineStatus.TypedSpec().Value.SchematicReady() { return nil, false, nil } @@ -118,7 +117,7 @@ func Calculate(machineStatus *omni.MachineStatus, kernelArgs *omni.KernelArgs) ( extraArgs = kernelArgs.TypedSpec().Value.Args } - baseArgs := xslices.Filter(schematicConfig.KernelArgs, isProtected) + baseArgs := xslices.Filter(machineStatus.TypedSpec().Value.Schematic.KernelArgs, isProtected) return slices.Concat(baseArgs, extraArgs), true, nil } diff --git a/internal/backend/runtime/omni/controllers/omni/infra_machine.go b/internal/backend/runtime/omni/controllers/omni/infra_machine.go index bcecb889..2ba3cd75 100644 --- a/internal/backend/runtime/omni/controllers/omni/infra_machine.go +++ b/internal/backend/runtime/omni/controllers/omni/infra_machine.go @@ -74,11 +74,6 @@ func (ctrl *InfraMachineController) Settings() controller.QSettings { Type: omni.InfraMachineConfigType, Kind: controller.InputQMapped, }, - { - Namespace: resources.DefaultNamespace, - Type: omni.SchematicConfigurationType, - Kind: controller.InputQMapped, - }, { Namespace: resources.DefaultNamespace, Type: omni.MachineExtensionsType, @@ -104,6 +99,11 @@ func (ctrl *InfraMachineController) Settings() controller.QSettings { Type: infra.ProviderType, Kind: controller.InputQMapped, }, + { + Namespace: resources.DefaultNamespace, + Type: omni.ClusterType, + Kind: controller.InputQMapped, + }, }, Outputs: []controller.Output{ { @@ -127,7 +127,7 @@ func (ctrl *InfraMachineController) Settings() controller.QSettings { } // Reconcile implements the controller.QController interface. -func (ctrl *InfraMachineController) Reconcile(ctx context.Context, _ *zap.Logger, r controller.QRuntime, ptr resource.Pointer) error { +func (ctrl *InfraMachineController) Reconcile(ctx context.Context, logger *zap.Logger, r controller.QRuntime, ptr resource.Pointer) error { link, err := safe.ReaderGet[*siderolink.Link](ctx, r, ptr) if err != nil { if !state.IsNotFoundError(err) { @@ -143,7 +143,7 @@ func (ctrl *InfraMachineController) Reconcile(ctx context.Context, _ *zap.Logger return ctrl.reconcileTearingDown(ctx, r, link) } - return ctrl.reconcileRunning(ctx, r, link) + return ctrl.reconcileRunning(ctx, r, link, logger) } func (ctrl *InfraMachineController) reconcileTearingDown(ctx context.Context, r controller.QRuntime, link *siderolink.Link) error { @@ -214,7 +214,7 @@ func (ctrl *InfraMachineController) handleInfraProviderDeletion(ctx context.Cont }, controller.WithExpectedPhaseAny()) } -func (ctrl *InfraMachineController) reconcileRunning(ctx context.Context, r controller.QRuntime, link *siderolink.Link) error { +func (ctrl *InfraMachineController) reconcileRunning(ctx context.Context, r controller.QRuntime, link *siderolink.Link, logger *zap.Logger) error { config, err := safe.ReaderGetByID[*omni.InfraMachineConfig](ctx, r, link.Metadata().ID()) if err != nil && !state.IsNotFoundError(err) { return err @@ -260,6 +260,7 @@ func (ctrl *InfraMachineController) reconcileRunning(ctx context.Context, r cont machineInfoCollected: machineInfoCollected, providerID: providerID, controllerName: ctrl.Name(), + logger: logger, } return safe.WriterModify[*infra.Machine](ctx, r, infra.NewMachine(link.Metadata().ID()), func(res *infra.Machine) error { @@ -273,6 +274,7 @@ type infraMachineControllerHelper struct { machineExts *omni.MachineExtensions link *siderolink.Link nodeUniqueToken *siderolink.NodeUniqueToken + logger *zap.Logger providerID string controllerName string machineInfoCollected bool @@ -333,16 +335,6 @@ func (helper *infraMachineControllerHelper) modify(ctx context.Context, infraMac return err } - schematicConfig, err := safe.ReaderGetByID[*omni.SchematicConfiguration](ctx, helper.runtime, helper.link.Metadata().ID()) - if err != nil { - if state.IsNotFoundError(err) { - // the schema configuration is not created yet, skip the cluster information collection - return nil - } - - return err - } - var extensions []string if helper.machineExts != nil { @@ -351,12 +343,37 @@ func (helper *infraMachineControllerHelper) modify(ctx context.Context, infraMac // set the cluster allocation information - infraMachine.TypedSpec().Value.ClusterTalosVersion = schematicConfig.TypedSpec().Value.TalosVersion + clusterTalosVersion, err := helper.getClusterTalosVersion(ctx, helper.runtime, clusterMachine) + if err != nil { + return err + } + + infraMachine.TypedSpec().Value.ClusterTalosVersion = clusterTalosVersion infraMachine.TypedSpec().Value.Extensions = extensions return nil } +func (helper *infraMachineControllerHelper) getClusterTalosVersion(ctx context.Context, r controller.Reader, clusterMachine *omni.ClusterMachine) (string, error) { + clusterID, ok := clusterMachine.Metadata().Labels().Get(omni.LabelCluster) + if !ok { + return "", fmt.Errorf("cluster machine %q is missing cluster label", clusterMachine.Metadata().ID()) + } + + cluster, err := safe.ReaderGetByID[*omni.Cluster](ctx, r, clusterID) + if err != nil { + if state.IsNotFoundError(err) { + helper.logger.Info("cluster not yet found for cluster machine", zap.String("cluster_machine_id", clusterMachine.Metadata().ID()), zap.String("cluster_id", clusterID)) + + return "", nil + } + + return "", fmt.Errorf("failed to get cluster %q: %w", clusterID, err) + } + + return cluster.TypedSpec().Value.TalosVersion, nil +} + // MapInput implements the controller.QController interface. func (ctrl *InfraMachineController) MapInput(ctx context.Context, _ *zap.Logger, runtime controller.QRuntime, ptr controller.ReducedResourceMetadata) ([]resource.Pointer, error) { switch ptr.Type() { @@ -369,6 +386,18 @@ func (ctrl *InfraMachineController) MapInput(ctx context.Context, _ *zap.Logger, omni.ClusterMachineType, omni.MachineStatusType: return []resource.Pointer{siderolink.NewLink(ptr.ID(), nil).Metadata()}, nil + case omni.ClusterType: + clusterMachineList, err := safe.ReaderListAll[*omni.ClusterMachine](ctx, runtime, state.WithLabelQuery(resource.LabelEqual(omni.LabelCluster, ptr.ID()))) + if err != nil { + return nil, err + } + + linkPtrs := make([]resource.Pointer, 0, clusterMachineList.Len()) + for clusterMachine := range clusterMachineList.All() { + linkPtrs = append(linkPtrs, siderolink.NewLink(clusterMachine.Metadata().ID(), nil).Metadata()) + } + + return linkPtrs, nil case infra.InfraProviderStatusType, infra.ProviderType: linkList, err := safe.ReaderListAll[*siderolink.Link](ctx, runtime, state.WithLabelQuery(resource.LabelEqual(omni.LabelInfraProviderID, ptr.ID()))) if err != nil { diff --git a/internal/backend/runtime/omni/controllers/omni/infra_machine_test.go b/internal/backend/runtime/omni/controllers/omni/infra_machine_test.go index 5494463b..db5ed868 100644 --- a/internal/backend/runtime/omni/controllers/omni/infra_machine_test.go +++ b/internal/backend/runtime/omni/controllers/omni/infra_machine_test.go @@ -89,12 +89,11 @@ func (suite *InfraMachineControllerSuite) TestReconcile() { suite.Require().NoError(suite.state.Create(suite.ctx, clusterMachine)) - // create schematic configuration - schematicConfig := omni.NewSchematicConfiguration("machine-1") + // create cluster with talos version + cluster := omni.NewCluster("test-cluster") + cluster.TypedSpec().Value.TalosVersion = "v42.0.0" - schematicConfig.TypedSpec().Value.TalosVersion = "v42.0.0" - - suite.Require().NoError(suite.state.Create(suite.ctx, schematicConfig)) + suite.Require().NoError(suite.state.Create(suite.ctx, cluster)) // assert that the cluster machine has the correct talos version assertResource[*infra.Machine](&suite.OmniSuite, clusterMachine.Metadata(), func(r *infra.Machine, assertion *assert.Assertions) { diff --git a/internal/backend/runtime/omni/controllers/omni/internal/talos/schematic.go b/internal/backend/runtime/omni/controllers/omni/internal/talos/schematic.go index 29868ff6..0833df1d 100644 --- a/internal/backend/runtime/omni/controllers/omni/internal/talos/schematic.go +++ b/internal/backend/runtime/omni/controllers/omni/internal/talos/schematic.go @@ -11,7 +11,6 @@ import ( "strings" "github.com/cosi-project/runtime/pkg/safe" - "github.com/siderolabs/go-pointer" "github.com/siderolabs/image-factory/pkg/constants" "github.com/siderolabs/image-factory/pkg/schematic" "github.com/siderolabs/talos/pkg/machinery/client" @@ -52,15 +51,12 @@ func GetSchematicInfo(ctx context.Context, c *client.Client, fallbackKernelArgs fullID string rawSchematic = &schematic.Schematic{} manifest string - inAgentMode bool ) - err = items.ForEachErr(func(status *runtime.ExtensionStatus) error { + for status := range items.All() { name := status.TypedSpec().Metadata.Name if name == extensions.MetalAgentExtensionName { - inAgentMode = true - - return nil + return SchematicInfo{InAgentMode: true}, nil } if name == constants.SchematicIDExtensionName { // skip the meta extension @@ -69,14 +65,16 @@ func GetSchematicInfo(ctx context.Context, c *client.Client, fallbackKernelArgs if status.TypedSpec().Metadata.ExtraInfo != "" { manifest = status.TypedSpec().Metadata.ExtraInfo - return yaml.Unmarshal([]byte(manifest), rawSchematic) + if err = yaml.Unmarshal([]byte(manifest), rawSchematic); err != nil { + return SchematicInfo{}, fmt.Errorf("failed to unmarshal schematic manifest: %w", err) + } } - return nil + continue } if name == "modules.dep" { // ignore the virtual extension used for kernel modules dependencies - return nil + continue } if !strings.HasPrefix(name, extensions.OfficialPrefix) { @@ -84,24 +82,6 @@ func GetSchematicInfo(ctx context.Context, c *client.Client, fallbackKernelArgs } exts = append(exts, name) - - return nil - }) - if err != nil { - return SchematicInfo{}, err - } - - if inAgentMode { - id, idErr := pointer.To(schematic.Schematic{}).ID() - if idErr != nil { - return SchematicInfo{}, fmt.Errorf("failed to calculate extensions schematic ID: %w", idErr) - } - - return SchematicInfo{ - ID: id, - FullID: id, - InAgentMode: true, - }, nil } exts = extensions.MapNames(exts) diff --git a/internal/backend/runtime/omni/controllers/omni/kernelargs/status.go b/internal/backend/runtime/omni/controllers/omni/kernelargs/status.go index 40729ead..46bf6ec5 100644 --- a/internal/backend/runtime/omni/controllers/omni/kernelargs/status.go +++ b/internal/backend/runtime/omni/controllers/omni/kernelargs/status.go @@ -40,8 +40,8 @@ func NewStatusController() *StatusController { status.TypedSpec().Value.UnmetConditions = nil status.TypedSpec().Value.CurrentCmdline = ms.TypedSpec().Value.KernelCmdline - if ms.TypedSpec().Value.Schematic == nil { - status.TypedSpec().Value.UnmetConditions = append(status.TypedSpec().Value.UnmetConditions, "Schematic information is not yet known") + if !ms.TypedSpec().Value.SchematicReady() { + status.TypedSpec().Value.UnmetConditions = append(status.TypedSpec().Value.UnmetConditions, "Schematic information is not yet ready") } else { status.TypedSpec().Value.CurrentArgs = kernelargs.FilterExtras(ms.TypedSpec().Value.Schematic.KernelArgs) } diff --git a/internal/backend/runtime/omni/controllers/omni/kernelargs/status_test.go b/internal/backend/runtime/omni/controllers/omni/kernelargs/status_test.go index 59bdc9b8..9cdc8d7f 100644 --- a/internal/backend/runtime/omni/controllers/omni/kernelargs/status_test.go +++ b/internal/backend/runtime/omni/controllers/omni/kernelargs/status_test.go @@ -38,7 +38,7 @@ func TestReconcile(t *testing.T) { rtestutils.AssertResource(ctx, t, testContext.State, id, func(res *omni.KernelArgsStatus, assertion *assert.Assertions) { assertion.Equal([]string{ - "Schematic information is not yet known", + "Schematic information is not yet ready", "Talos is not installed, kernel args cannot be updated yet", "Cannot determine if kernel args update is supported: SecurityState and TalosVersion are not yet set", }, res.TypedSpec().Value.UnmetConditions) @@ -59,6 +59,8 @@ func TestReconcile(t *testing.T) { } res.TypedSpec().Value.Schematic = &specs.MachineStatusSpec_Schematic{ + Id: "test-id", + FullId: "test-full-id", KernelArgs: []string{"arg-1", "arg-2"}, } diff --git a/internal/backend/runtime/omni/controllers/omni/machine_config_gen_options.go b/internal/backend/runtime/omni/controllers/omni/machine_config_gen_options.go index 27e0991e..541b692e 100644 --- a/internal/backend/runtime/omni/controllers/omni/machine_config_gen_options.go +++ b/internal/backend/runtime/omni/controllers/omni/machine_config_gen_options.go @@ -69,7 +69,7 @@ func GenInstallConfig(machineStatus *omni.MachineStatus, clusterMachineTalosVers genOptions.TypedSpec().Value.InstallImage.SchematicId = clusterMachineTalosVersion.TypedSpec().Value.SchematicId genOptions.TypedSpec().Value.InstallImage.TalosVersion = clusterMachineTalosVersion.TypedSpec().Value.TalosVersion - genOptions.TypedSpec().Value.InstallImage.SchematicInitialized = machineStatus.TypedSpec().Value.Schematic != nil + genOptions.TypedSpec().Value.InstallImage.SchematicInitialized = machineStatus.TypedSpec().Value.SchematicReady() if genOptions.TypedSpec().Value.InstallImage.SchematicInitialized { genOptions.TypedSpec().Value.InstallImage.SchematicInvalid = machineStatus.TypedSpec().Value.GetSchematic().GetInvalid() diff --git a/internal/backend/runtime/omni/controllers/omni/machine_status.go b/internal/backend/runtime/omni/controllers/omni/machine_status.go index 3badbafc..a0fe7417 100644 --- a/internal/backend/runtime/omni/controllers/omni/machine_status.go +++ b/internal/backend/runtime/omni/controllers/omni/machine_status.go @@ -482,57 +482,8 @@ func (ctrl *MachineStatusController) handleNotification(ctx context.Context, r c } if event.Schematic != nil { - if spec.Schematic == nil { - spec.Schematic = &specs.MachineStatusSpec_Schematic{} - } - - spec.Schematic.Raw = event.Schematic.Raw - spec.Schematic.Id = event.Schematic.ID - spec.Schematic.FullId = event.Schematic.FullID - spec.Schematic.Extensions = event.Schematic.Extensions - spec.Schematic.Invalid = event.Schematic.Invalid - spec.Schematic.KernelArgs = event.Schematic.KernelArgs - - spec.Schematic.MetaValues = xslices.Map(event.Schematic.MetaValues, func(value schematic.MetaValue) *specs.MetaValue { - return &specs.MetaValue{ - Key: uint32(value.Key), - Value: value.Value, - } - }) - - if event.Schematic.Overlay.Name != "" { - spec.Schematic.Overlay = &specs.Overlay{ - Name: event.Schematic.Overlay.Name, - Image: event.Schematic.Overlay.Image, - } - } - - if spec.Schematic.InitialSchematic == "" { - spec.Schematic.InitialSchematic = spec.Schematic.FullId - } - - spec.Schematic.InAgentMode = event.Schematic.InAgentMode - - // We populate the initial state on a best-effort basis: we might have missed the moment to capture it, but it is acceptable. - if spec.Schematic.InitialState == nil { - spec.Schematic.InitialState = &specs.MachineStatusSpec_Schematic_InitialState{ - Extensions: event.Schematic.Extensions, - } - } - - // if the schematic is invalid or the machine is in agent mode, we reset the initial schematic information - if spec.Schematic.Invalid || spec.Schematic.InAgentMode { - spec.Schematic.InitialSchematic = "" - spec.Schematic.InitialState = &specs.MachineStatusSpec_Schematic_InitialState{} // reset to be empty but leave it initialized - } - - _, kernelArgsInitialized := m.Metadata().Annotations().Get(omni.KernelArgsInitialized) - if !kernelArgsInitialized { - if err := ctrl.kernelArgsInitializer.Init(ctx, m.Metadata().ID(), event.Schematic.KernelArgs); err != nil { - return fmt.Errorf("failed to initialize extra kernel args: %w", err) - } - - m.Metadata().Annotations().Set(omni.KernelArgsInitialized, "") + if err := ctrl.handleEventSchematic(ctx, m, event); err != nil { + return fmt.Errorf("failed to handle schematic event: %w", err) } } @@ -565,6 +516,74 @@ func (ctrl *MachineStatusController) handleNotification(ctx context.Context, r c return nil } +func (ctrl *MachineStatusController) handleEventSchematic(ctx context.Context, ms *omni.MachineStatus, event machinetask.Info) error { + spec := ms.TypedSpec().Value + + // If the machine is in agent mode, reset the schematic information as it were never observed. + // At that stage, anything on it (extensions, kernel args, etc.) is irrelevant and even wrong for Omni to process. + if event.Schematic.InAgentMode { + spec.Schematic = &specs.MachineStatusSpec_Schematic{ + InAgentMode: true, + } + + return nil + } + + if spec.Schematic == nil { + spec.Schematic = &specs.MachineStatusSpec_Schematic{} + } + + spec.Schematic.Raw = event.Schematic.Raw + spec.Schematic.Id = event.Schematic.ID + spec.Schematic.FullId = event.Schematic.FullID + spec.Schematic.Extensions = event.Schematic.Extensions + spec.Schematic.Invalid = event.Schematic.Invalid + spec.Schematic.KernelArgs = event.Schematic.KernelArgs + spec.Schematic.InAgentMode = event.Schematic.InAgentMode + + spec.Schematic.MetaValues = xslices.Map(event.Schematic.MetaValues, func(value schematic.MetaValue) *specs.MetaValue { + return &specs.MetaValue{ + Key: uint32(value.Key), + Value: value.Value, + } + }) + + if event.Schematic.Overlay.Name != "" { + spec.Schematic.Overlay = &specs.Overlay{ + Name: event.Schematic.Overlay.Name, + Image: event.Schematic.Overlay.Image, + } + } + + if spec.Schematic.InitialSchematic == "" { + spec.Schematic.InitialSchematic = spec.Schematic.FullId + } + + // We populate the initial state on a best-effort basis: we might have missed the moment to capture it, but it is acceptable. + if spec.Schematic.InitialState == nil { + spec.Schematic.InitialState = &specs.MachineStatusSpec_Schematic_InitialState{ + Extensions: event.Schematic.Extensions, + } + } + + // If the schematic is invalid, reset the initial schematic information. + if spec.Schematic.Invalid { + spec.Schematic.InitialSchematic = "" + spec.Schematic.InitialState = &specs.MachineStatusSpec_Schematic_InitialState{} // reset to be empty but leave it initialized + } + + _, kernelArgsInitialized := ms.Metadata().Annotations().Get(omni.KernelArgsInitialized) + if !kernelArgsInitialized { + if err := ctrl.kernelArgsInitializer.Init(ctx, ms.Metadata().ID(), event.Schematic.KernelArgs); err != nil { + return fmt.Errorf("failed to initialize extra kernel args: %w", err) + } + + ms.Metadata().Annotations().Set(omni.KernelArgsInitialized, "") + } + + return nil +} + // reconcileLabels is a wrapper around omni.MachineStatusReconcileLabels, but it overrides the "installed" label // based on a matching infra.MachineStatus for that machine. // diff --git a/internal/backend/runtime/omni/controllers/omni/machine_status_test.go b/internal/backend/runtime/omni/controllers/omni/machine_status_test.go index 0bccbc1a..ec77b68c 100644 --- a/internal/backend/runtime/omni/controllers/omni/machine_status_test.go +++ b/internal/backend/runtime/omni/controllers/omni/machine_status_test.go @@ -440,11 +440,7 @@ func (suite *MachineStatusSuite) TestMachineSchematic() { }, }, expected: &specs.MachineStatusSpec_Schematic{ - Id: defaultSchematic, - InitialSchematic: "", - FullId: defaultSchematic, - InAgentMode: true, - InitialState: &specs.MachineStatusSpec_Schematic_InitialState{}, + InAgentMode: true, }, }, } { diff --git a/internal/backend/runtime/omni/controllers/omni/machineconfig/reconciliation_context.go b/internal/backend/runtime/omni/controllers/omni/machineconfig/reconciliation_context.go index 34c0da43..4ab009e0 100644 --- a/internal/backend/runtime/omni/controllers/omni/machineconfig/reconciliation_context.go +++ b/internal/backend/runtime/omni/controllers/omni/machineconfig/reconciliation_context.go @@ -69,15 +69,11 @@ func checkClusterReady(ctx context.Context, r controller.Reader, machineConfig * func checkMachineStatus(ctx context.Context, r controller.Reader, machineStatus *omni.MachineStatus) error { if !machineStatus.TypedSpec().Value.Connected { - return xerrors.NewTaggedf[qtransform.SkipReconcileTag]("'%s' machine is not connected", machineStatus.Metadata().ID()) + return xerrors.NewTaggedf[qtransform.SkipReconcileTag]("machine %q is not connected", machineStatus.Metadata().ID()) } - if machineStatus.TypedSpec().Value.Schematic == nil { - return xerrors.NewTagged[qtransform.SkipReconcileTag](fmt.Errorf("machine status '%s' does not have schematic information", machineStatus.Metadata().ID())) - } - - if machineStatus.TypedSpec().Value.Schematic.InAgentMode { - return xerrors.NewTagged[qtransform.SkipReconcileTag](fmt.Errorf("machine status '%s' schematic is in agent mode", machineStatus.Metadata().ID())) + if !machineStatus.TypedSpec().Value.SchematicReady() { + return xerrors.NewTaggedf[qtransform.SkipReconcileTag]("machine %q schematic is not ready", machineStatus.Metadata().ID()) } // if the machine is managed by a static infra provider, we need to ensure that the infra machine is ready to use @@ -90,7 +86,7 @@ func checkMachineStatus(ctx context.Context, r controller.Reader, machineStatus return xerrors.NewTaggedf[qtransform.SkipReconcileTag]("machine is managed by a static infra provider but the infra machine status is not found: %w", err) } - return fmt.Errorf("failed to get infra machine status '%s': %w", machineStatus.Metadata().ID(), err) + return fmt.Errorf("failed to get infra machine status %q: %w", machineStatus.Metadata().ID(), err) } if !infraMachineStatus.TypedSpec().Value.ReadyToUse { @@ -130,7 +126,7 @@ func BuildReconciliationContext(ctx context.Context, r controller.Reader, machineSetNode, err := safe.ReaderGetByID[*omni.MachineSetNode](ctx, r, machineConfig.Metadata().ID()) if err != nil { if state.IsNotFoundError(err) { - return nil, xerrors.NewTaggedf[qtransform.SkipReconcileTag]("'%s' machine set node not found: %w", machineConfig.Metadata().ID(), err) + return nil, xerrors.NewTaggedf[qtransform.SkipReconcileTag]("%q machine set node not found: %w", machineConfig.Metadata().ID(), err) } return nil, err @@ -172,19 +168,19 @@ func BuildReconciliationContext(ctx context.Context, r controller.Reader, ) if err != nil { if state.IsNotFoundError(err) { - return nil, xerrors.NewTaggedf[qtransform.SkipReconcileTag]("'%s' machine status snapshot not found: %w", machineConfig.Metadata().ID(), err) + return nil, xerrors.NewTaggedf[qtransform.SkipReconcileTag]("%q machine status snapshot not found: %w", machineConfig.Metadata().ID(), err) } - return nil, fmt.Errorf("failed to get machine status snapshot '%s': %w", machineConfig.Metadata().ID(), err) + return nil, fmt.Errorf("failed to get machine status snapshot %q: %w", machineConfig.Metadata().ID(), err) } rc.machineStatus, err = safe.ReaderGetByID[*omni.MachineStatus](ctx, r, machineConfig.Metadata().ID()) if err != nil { if state.IsNotFoundError(err) { - return nil, xerrors.NewTaggedf[qtransform.SkipReconcileTag]("'%s' machine status not found: %w", machineConfig.Metadata().ID(), err) + return nil, xerrors.NewTaggedf[qtransform.SkipReconcileTag]("%q machine status not found: %w", machineConfig.Metadata().ID(), err) } - return nil, fmt.Errorf("failed to get machine status '%s': %w", machineConfig.Metadata().ID(), err) + return nil, fmt.Errorf("failed to get machine status %q: %w", machineConfig.Metadata().ID(), err) } if err = checkMachineStatus(ctx, r, rc.machineStatus); err != nil { @@ -194,10 +190,10 @@ func BuildReconciliationContext(ctx context.Context, r controller.Reader, genOptions, err := safe.ReaderGetByID[*omni.MachineConfigGenOptions](ctx, r, machineConfig.Metadata().ID()) if err != nil { if state.IsNotFoundError(err) { - return nil, xerrors.NewTaggedf[qtransform.SkipReconcileTag]("'%s' machine config gen options not found: %w", machineConfig.Metadata().ID(), err) + return nil, xerrors.NewTaggedf[qtransform.SkipReconcileTag]("%q machine config gen options not found: %w", machineConfig.Metadata().ID(), err) } - return nil, fmt.Errorf("failed to get install image '%s': %w", machineConfig.Metadata().ID(), err) + return nil, fmt.Errorf("failed to get install image %q: %w", machineConfig.Metadata().ID(), err) } rc.installImage = genOptions.TypedSpec().Value.InstallImage diff --git a/internal/backend/runtime/omni/controllers/omni/machineconfig/status.go b/internal/backend/runtime/omni/controllers/omni/machineconfig/status.go index c602633b..5b9f8f9a 100644 --- a/internal/backend/runtime/omni/controllers/omni/machineconfig/status.go +++ b/internal/backend/runtime/omni/controllers/omni/machineconfig/status.go @@ -771,7 +771,7 @@ func (ctrl *ClusterMachineConfigStatusController) shouldResetGraceful( machineSetStatus.TypedSpec().Value.Phase != specs.MachineSetPhase_Destroying, nil } -func (h *ClusterMachineConfigStatusController) gracefulEtcdLeave(ctx context.Context, c *client.Client, id string) error { +func (ctrl *ClusterMachineConfigStatusController) gracefulEtcdLeave(ctx context.Context, c *client.Client, id string) error { _, err := c.EtcdForfeitLeadership(ctx, &machineapi.EtcdForfeitLeadershipRequest{}) if err != nil { return fmt.Errorf("failed to forfeit leadership, node %q: %w", id, err) diff --git a/internal/backend/runtime/omni/controllers/omni/machineupgrade/status.go b/internal/backend/runtime/omni/controllers/omni/machineupgrade/status.go index 61be3f3e..6e43920a 100644 --- a/internal/backend/runtime/omni/controllers/omni/machineupgrade/status.go +++ b/internal/backend/runtime/omni/controllers/omni/machineupgrade/status.go @@ -69,8 +69,7 @@ func (ctrl *StatusController) transform(ctx context.Context, r controller.Reader status.TypedSpec().Value.IsMaintenance = ms.TypedSpec().Value.Maintenance - schematicSpec := ms.TypedSpec().Value.Schematic - if schematicSpec == nil { + if !ms.TypedSpec().Value.SchematicReady() { status.TypedSpec().Value.Phase = specs.MachineUpgradeStatusSpec_Unknown status.TypedSpec().Value.Status = "schematic info is not available" status.TypedSpec().Value.Error = "" @@ -78,6 +77,8 @@ func (ctrl *StatusController) transform(ctx context.Context, r controller.Reader return nil } + schematicSpec := ms.TypedSpec().Value.Schematic + status.TypedSpec().Value.CurrentSchematicId = schematicSpec.FullId if schematicSpec.Raw == "" { diff --git a/internal/backend/runtime/omni/controllers/omni/machineupgrade/status_test.go b/internal/backend/runtime/omni/controllers/omni/machineupgrade/status_test.go index 4d058081..c0f2238f 100644 --- a/internal/backend/runtime/omni/controllers/omni/machineupgrade/status_test.go +++ b/internal/backend/runtime/omni/controllers/omni/machineupgrade/status_test.go @@ -150,6 +150,7 @@ func TestReconcile(t *testing.T) { {Key: 1, Value: "value1"}, {Key: 2, Value: "value2"}, }, + Id: "test-id", FullId: currentSchematicID, Raw: string(currentSchematicRaw), InitialState: &specs.MachineStatusSpec_Schematic_InitialState{ diff --git a/internal/backend/runtime/omni/controllers/omni/omni_test.go b/internal/backend/runtime/omni/controllers/omni/omni_test.go index ed3bd0d8..45a04b80 100644 --- a/internal/backend/runtime/omni/controllers/omni/omni_test.go +++ b/internal/backend/runtime/omni/controllers/omni/omni_test.go @@ -513,6 +513,7 @@ func (suite *OmniSuite) createClusterWithTalosVersion(clusterName string, contro machineStatus.TypedSpec().Value.ManagementAddress = suite.socketConnectionString machineStatus.TypedSpec().Value.Schematic = &specs.MachineStatusSpec_Schematic{ Id: defaultSchematic, + FullId: defaultSchematic, InitialState: &specs.MachineStatusSpec_Schematic_InitialState{}, } machineStatus.TypedSpec().Value.InitialTalosVersion = cluster.TypedSpec().Value.TalosVersion diff --git a/internal/backend/runtime/omni/controllers/omni/schematic_configuration.go b/internal/backend/runtime/omni/controllers/omni/schematic_configuration.go index 6c4234aa..6832e4c1 100644 --- a/internal/backend/runtime/omni/controllers/omni/schematic_configuration.go +++ b/internal/backend/runtime/omni/controllers/omni/schematic_configuration.go @@ -143,8 +143,8 @@ func (helper *schematicConfigurationHelper) reconcile( ms *omni.MachineStatus, schematicConfiguration *omni.SchematicConfiguration, ) (*omni.MachineExtensionsStatus, error) { - if ms.TypedSpec().Value.Schematic == nil { - return nil, xerrors.NewTaggedf[qtransform.SkipReconcileTag]("schematic information is not yet available") + if !ms.TypedSpec().Value.SchematicReady() { + return nil, xerrors.NewTaggedf[qtransform.SkipReconcileTag]("schematic is not yet ready") } securityState := ms.TypedSpec().Value.SecurityState @@ -368,8 +368,8 @@ func determineKernelArgs(ctx context.Context, machineStatus *omni.MachineStatus, } if !updateSupported { - if machineStatus.TypedSpec().Value.Schematic == nil { - return nil, xerrors.NewTaggedf[qtransform.SkipReconcileTag]("machine schematic is not yet initialized") + if !machineStatus.TypedSpec().Value.SchematicReady() { + return nil, xerrors.NewTaggedf[qtransform.SkipReconcileTag]("machine schematic is not yet ready") } // keep them unchanged - take the current args as-is @@ -437,7 +437,7 @@ func computeMachineExtensionsStatus(ms *omni.MachineStatus, customization *machi requestedExtensions set.Set[string] ) - if ms.TypedSpec().Value.Schematic != nil { + if ms.TypedSpec().Value.SchematicReady() { installedExtensions = xslices.ToSet(ms.TypedSpec().Value.Schematic.Extensions) } diff --git a/internal/backend/runtime/omni/controllers/omni/schematic_configuration_test.go b/internal/backend/runtime/omni/controllers/omni/schematic_configuration_test.go index 9ed2b9d3..90c3c828 100644 --- a/internal/backend/runtime/omni/controllers/omni/schematic_configuration_test.go +++ b/internal/backend/runtime/omni/controllers/omni/schematic_configuration_test.go @@ -72,6 +72,8 @@ func (suite *SchematicConfigurationSuite) TestReconcile() { machineStatus.TypedSpec().Value.TalosVersion = talosVersion machineStatus.TypedSpec().Value.Schematic = &specs.MachineStatusSpec_Schematic{ + Id: "test-id", + FullId: "test-full-id", Extensions: []string{"siderolabs/hello-world-service"}, InitialSchematic: expectedSchematic, InitialState: &specs.MachineStatusSpec_Schematic_InitialState{ diff --git a/internal/backend/runtime/omni/controllers/omni/talos_upgrade_status.go b/internal/backend/runtime/omni/controllers/omni/talos_upgrade_status.go index dac12121..47c46222 100644 --- a/internal/backend/runtime/omni/controllers/omni/talos_upgrade_status.go +++ b/internal/backend/runtime/omni/controllers/omni/talos_upgrade_status.go @@ -555,8 +555,7 @@ func populateEmptySchematics(ctx context.Context, r controller.ReaderWriter, clu return err } - schematic := machineStatus.TypedSpec().Value.Schematic - if schematic == nil { + if !machineStatus.TypedSpec().Value.SchematicReady() { return nil } @@ -566,7 +565,7 @@ func populateEmptySchematics(ctx context.Context, r controller.ReaderWriter, clu } return safe.WriterModify(ctx, r, clusterMachineTalosVersion, func(res *omni.ClusterMachineTalosVersion) error { - res.TypedSpec().Value.SchematicId = schematic.InitialSchematic + res.TypedSpec().Value.SchematicId = machineStatus.TypedSpec().Value.Schematic.InitialSchematic return nil }) diff --git a/internal/backend/runtime/omni/controllers/testutils/rmock/rmock.go b/internal/backend/runtime/omni/controllers/testutils/rmock/rmock.go index 0cd705b5..71e4f0de 100644 --- a/internal/backend/runtime/omni/controllers/testutils/rmock/rmock.go +++ b/internal/backend/runtime/omni/controllers/testutils/rmock/rmock.go @@ -143,6 +143,7 @@ func init() { res.TypedSpec().Value.Schematic = &specs.MachineStatusSpec_Schematic{ Id: defaultSchematic, + FullId: defaultSchematic, InitialState: &specs.MachineStatusSpec_Schematic_InitialState{}, } diff --git a/internal/integration/integration_test.go b/internal/integration/integration_test.go index 978f2daa..0a1d8b21 100644 --- a/internal/integration/integration_test.go +++ b/internal/integration/integration_test.go @@ -356,7 +356,7 @@ func preRunHooks(t *testing.T, options *TestOptions) { func postRunHooks(t *testing.T, options *TestOptions) { if t.Failed() { - t.Logf("there are failed tests, save support bundle for all cluster") + t.Logf("there are failed tests, save support bundle for all clusters") saveAllSupportBundles(t, options.omniClient, options.OutputDir)