fix: skip allocating nodes for deleted/tearing down MachineRequests

Without the fix `MachineRequestStatus` cleanup controller deletes the
`MachineSetNode` and the `MachineSetNode` controller might allocate it
back immediately.

Signed-off-by: Artem Chernyshev <artem.chernyshev@talos-systems.com>
This commit is contained in:
Artem Chernyshev 2026-04-24 20:02:05 +03:00
parent f9dd849153
commit 65af568b34
No known key found for this signature in database
GPG Key ID: 9BAC0E08F5067BB8
2 changed files with 87 additions and 0 deletions

View File

@ -536,6 +536,21 @@ func (ctrl *MachineSetNodeController) createNodes(
var machineVersion semver.Version
machineRequestID, ok := machine.Metadata().Labels().Get(omni.LabelMachineRequest)
if ok {
var machineRequest *infra.MachineRequest
machineRequest, err = safe.ReaderGetByID[*infra.MachineRequest](ctx, r, machineRequestID)
if err != nil && !state.IsNotFoundError(err) {
return err
}
// machineRequest is being torn down, skip it
if machineRequest == nil || machineRequest.Metadata().Phase() == resource.PhaseTearingDown {
continue
}
}
version, ok := machine.Metadata().Labels().Get(omni.MachineStatusLabelTalosVersion)
if !ok {
continue

View File

@ -264,6 +264,78 @@ func (suite *MachineSetNodeSuite) TestReconcile() {
rtestutils.Destroy[*omni.MachineSet](ctx, suite.T(), suite.state, []string{machineSet.Metadata().ID()})
}
func (suite *MachineSetNodeSuite) TestNoRaceBetweenCleanupAndMachineSetNodeController() {
suite.startRuntime()
ctx, cancel := context.WithTimeout(suite.ctx, time.Second*10)
defer cancel()
suite.Require().NoError(suite.runtime.RegisterQController(&omnictrl.MachineSetNodeController{}))
suite.Require().NoError(suite.runtime.RegisterQController(omnictrl.NewLabelsExtractorController[*omni.MachineStatus]()))
suite.Require().NoError(suite.runtime.RegisterController(omnictrl.NewMachineRequestStatusCleanupController()))
cluster := omni.NewCluster("cluster-race")
cluster.TypedSpec().Value.TalosVersion = "1.6.4"
suite.Require().NoError(suite.state.Create(ctx, cluster))
machineClass := newMachineClass(fmt.Sprintf("%s==amd64", omni.MachineStatusLabelArch))
suite.Require().NoError(suite.state.Create(ctx, machineClass))
machineRequest := infra.NewMachineRequest("request-race")
suite.Require().NoError(suite.state.Create(ctx, machineRequest))
machines := suite.createMachines(map[string]string{
omni.MachineStatusLabelArch: "amd64",
omni.MachineStatusLabelAvailable: "",
omni.MachineStatusLabelConnected: "",
omni.MachineStatusLabelReadyToUse: "",
omni.MachineStatusLabelReportingEvents: "",
omni.LabelMachineRequest: machineRequest.Metadata().ID(),
})
machineID := machines[0].Metadata().ID()
machineRequestStatus := infra.NewMachineRequestStatus(machineRequest.Metadata().ID())
machineRequestStatus.TypedSpec().Value.Id = machineID
suite.Require().NoError(suite.state.Create(ctx, machineRequestStatus))
machineSet := omni.NewMachineSet("set-race")
machineSet.Metadata().Labels().Set(omni.LabelCluster, cluster.Metadata().ID())
machineSet.Metadata().Labels().Set(omni.LabelWorkerRole, "")
machineSet.TypedSpec().Value.MachineAllocation = &specs.MachineSetSpec_MachineAllocation{
Name: machineClass.Metadata().ID(),
MachineCount: 1,
}
suite.Require().NoError(suite.state.Create(ctx, machineSet))
rtestutils.AssertResources(ctx, suite.T(), suite.state, []string{machineID},
func(n *omni.MachineSetNode, assert *assert.Assertions) {},
)
initialMSN, err := safe.StateGetByID[*omni.MachineSetNode](ctx, suite.state, machineID)
suite.Require().NoError(err)
initialCreated := initialMSN.Metadata().Created()
_, err = suite.state.Teardown(ctx, machineRequest.Metadata())
suite.Require().NoError(err)
_, err = suite.state.Teardown(ctx, machineRequestStatus.Metadata())
suite.Require().NoError(err)
// Under the bug, the cleanup handler destroys the MSN and MachineSetNodeController
// immediately creates a new one for the same machine (new Created() timestamp).
// Either keeping the original MSN or leaving it destroyed is acceptable — recreating is not.
suite.Require().Never(func() bool {
msn, getErr := safe.StateGetByID[*omni.MachineSetNode](ctx, suite.state, machineID)
if getErr != nil {
return false
}
return msn.Metadata().Created().After(initialCreated)
}, time.Second*2, time.Millisecond*50,
"MachineSetNode should not be recreated by MachineSetNodeController after MachineRequestStatus cleanup destroyed it")
}
func TestSortFunction(t *testing.T) {
machineStatuses := map[resource.ID]*system.ResourceLabels[*omni.MachineStatus]{}
machineSetNodes := make([]*omni.MachineSetNode, 0, 10)