diff --git a/app/cluster-api-provider-sidero/config/rbac/role.yaml b/app/cluster-api-provider-sidero/config/rbac/role.yaml
index 4530c08e..29d4884d 100644
--- a/app/cluster-api-provider-sidero/config/rbac/role.yaml
+++ b/app/cluster-api-provider-sidero/config/rbac/role.yaml
@@ -12,6 +12,7 @@ rules:
   - events
   verbs:
   - create
+  - patch
 - apiGroups:
   - ""
   resources:
diff --git a/app/cluster-api-provider-sidero/controllers/metalmachine_controller.go b/app/cluster-api-provider-sidero/controllers/metalmachine_controller.go
index c8abf0f5..3cfbd87b 100644
--- a/app/cluster-api-provider-sidero/controllers/metalmachine_controller.go
+++ b/app/cluster-api-provider-sidero/controllers/metalmachine_controller.go
@@ -49,7 +49,7 @@ type MetalMachineReconciler struct {
 // +kubebuilder:rbac:groups=metal.sidero.dev,resources=servers,verbs=get;list;watch;
 // +kubebuilder:rbac:groups=metal.sidero.dev,resources=servers/status,verbs=get;update;patch
 // +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch
-// +kubebuilder:rbac:groups="",resources=events,verbs=create
+// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch

 func (r *MetalMachineReconciler) Reconcile(req ctrl.Request) (_ ctrl.Result, err error) {
 	ctx := context.Background()
@@ -211,7 +211,7 @@ func (r *MetalMachineReconciler) Reconcile(req ctrl.Request) (_ ctrl.Result, err
 		}

 		if !mgmtClient.IsFake() {
-			r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", "Server powered on.")
+			r.Recorder.Event(serverRef, corev1.EventTypeNormal, "Server Management", "Server powered on.")
 		}
 	}
@@ -260,7 +260,7 @@ func (r *MetalMachineReconciler) reconcileDelete(ctx context.Context, metalMachi
 			return ctrl.Result{}, err
 		}

-		r.Recorder.Event(ref, corev1.EventTypeNormal, "Server Allocation", "Server marked as unallocated")
+		r.Recorder.Event(ref, corev1.EventTypeNormal, "Server Allocation", "Server marked as unallocated.")
 	}

 	// Machine is deleted so remove the finalizer.
@@ -397,52 +397,36 @@ func (r *MetalMachineReconciler) patchProviderID(ctx context.Context, cluster *c

 // patchServerInUse updates a server to mark it as "in use".
 func (r *MetalMachineReconciler) patchServerInUse(ctx context.Context, serverClass *metalv1alpha1.ServerClass, serverObj *metalv1alpha1.Server, metalMachine *infrav1.MetalMachine) error {
-	ref, err := reference.GetReference(r.Scheme, serverObj)
-	if err != nil {
-		return err
-	}
+	if !serverObj.Status.InUse || serverObj.Status.IsClean {
+		ref, err := reference.GetReference(r.Scheme, serverObj)
+		if err != nil {
+			return err
+		}

-	serverObj.Status.InUse = true
-	serverObj.Status.IsClean = false
+		serverObj.Status.InUse = true
+		serverObj.Status.IsClean = false

-	// nb: we update status and then update the object separately b/c statuses don't seem to get
-	// updated when doing the whole object below.
-	if err := r.Status().Update(ctx, serverObj); err != nil {
-		return err
-	}
+		// nb: we update status and then update the object separately b/c statuses don't seem to get
+		// updated when doing the whole object below.
+		if err := r.Status().Update(ctx, serverObj); err != nil {
+			return err
+		}

-	r.Recorder.Event(ref, corev1.EventTypeNormal, "Server Allocation", fmt.Sprintf("Server marked as allocated for metalMachine %q", metalMachine.Name))
-
-	rollback := func() {
-		// update failed, roll back Status changes
-		serverObj.Status.InUse = false
-
-		r.Status().Update(ctx, serverObj) //nolint: errcheck
+		r.Recorder.Event(ref, corev1.EventTypeNormal, "Server Allocation", fmt.Sprintf("Server marked as allocated for metalMachine %q", metalMachine.Name))
 	}

 	if serverClass != nil {
-		for {
-			serverObj.OwnerReferences = []metav1.OwnerReference{
-				*metav1.NewControllerRef(serverClass, metalv1alpha1.GroupVersion.WithKind("ServerClass")),
-			}
+		patchHelper, err := patch.NewHelper(serverObj, r)
+		if err != nil {
+			return err
+		}

-			err := r.Update(ctx, serverObj)
-			if err == nil {
-				return nil
-			}
+		serverObj.OwnerReferences = []metav1.OwnerReference{
+			*metav1.NewControllerRef(serverClass, metalv1alpha1.GroupVersion.WithKind("ServerClass")),
+		}

-			if !apierrors.IsConflict(err) {
-				rollback()
-
-				return err
-			}
-
-			// conflict happened, retry
-			if err = r.Get(ctx, types.NamespacedName{Namespace: serverObj.Namespace, Name: serverObj.Name}, serverObj); err != nil {
-				rollback()
-
-				return err
-			}
+		if err := patchHelper.Patch(ctx, serverObj); err != nil {
+			return err
 		}
 	}
diff --git a/app/metal-controller-manager/api/v1alpha1/server_types.go b/app/metal-controller-manager/api/v1alpha1/server_types.go
index 94871e0a..1414da20 100644
--- a/app/metal-controller-manager/api/v1alpha1/server_types.go
+++ b/app/metal-controller-manager/api/v1alpha1/server_types.go
@@ -10,6 +10,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	apiextensions "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
 )

 // EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
@@ -100,12 +101,16 @@ type ServerSpec struct {
 	Accepted bool `json:"accepted"`
 }

+// ConditionPowerCycle is used to control the powercycle flow.
+const ConditionPowerCycle clusterv1.ConditionType = "PowerCycle"
+
 // ServerStatus defines the observed state of Server.
 type ServerStatus struct {
-	Ready     bool                 `json:"ready"`
-	InUse     bool                 `json:"inUse"`
-	IsClean   bool                 `json:"isClean"`
-	Addresses []corev1.NodeAddress `json:"addresses,omitempty"`
+	Ready      bool                  `json:"ready"`
+	InUse      bool                  `json:"inUse"`
+	IsClean    bool                  `json:"isClean"`
+	Conditions []clusterv1.Condition `json:"conditions,omitempty"`
+	Addresses  []corev1.NodeAddress  `json:"addresses,omitempty"`
 }

 // +kubebuilder:object:root=true
@@ -125,6 +130,14 @@ type Server struct {
 	Status ServerStatus `json:"status,omitempty"`
 }

+func (s *Server) GetConditions() clusterv1.Conditions {
+	return s.Status.Conditions
+}
+
+func (s *Server) SetConditions(conditions clusterv1.Conditions) {
+	s.Status.Conditions = conditions
+}
+
 // +kubebuilder:object:root=true

 // ServerList contains a list of Server.
diff --git a/app/metal-controller-manager/api/v1alpha1/zz_generated.deepcopy.go b/app/metal-controller-manager/api/v1alpha1/zz_generated.deepcopy.go
index d665a086..e7308a10 100644
--- a/app/metal-controller-manager/api/v1alpha1/zz_generated.deepcopy.go
+++ b/app/metal-controller-manager/api/v1alpha1/zz_generated.deepcopy.go
@@ -11,6 +11,7 @@ package v1alpha1
 import (
 	v1 "k8s.io/api/core/v1"
 	runtime "k8s.io/apimachinery/pkg/runtime"
+	"sigs.k8s.io/cluster-api/api/v1alpha3"
 )

 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
@@ -490,6 +491,13 @@ func (in *ServerSpec) DeepCopy() *ServerSpec {
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *ServerStatus) DeepCopyInto(out *ServerStatus) {
 	*out = *in
+	if in.Conditions != nil {
+		in, out := &in.Conditions, &out.Conditions
+		*out = make([]v1alpha3.Condition, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
 	if in.Addresses != nil {
 		in, out := &in.Addresses, &out.Addresses
 		*out = make([]v1.NodeAddress, len(*in))
diff --git a/app/metal-controller-manager/cmd/agent/main.go b/app/metal-controller-manager/cmd/agent/main.go
index 4598f5d7..d3e174d0 100644
--- a/app/metal-controller-manager/cmd/agent/main.go
+++ b/app/metal-controller-manager/cmd/agent/main.go
@@ -15,6 +15,7 @@ import (
 	"github.com/talos-systems/go-blockdevice/blockdevice/probe"
 	"github.com/talos-systems/go-procfs/procfs"
+	"github.com/talos-systems/go-retry/retry"
 	"github.com/talos-systems/go-smbios/smbios"
 	talosnet "github.com/talos-systems/net"
 	"golang.org/x/sys/unix"
@@ -69,18 +70,7 @@ func setup() error {
 	return nil
 }

-func create(endpoint string, s *smbios.Smbios) (*api.CreateServerResponse, error) {
-	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
-	defer cancel()
-
-	conn, err := grpc.DialContext(ctx, endpoint, grpc.WithInsecure(), grpc.WithBlock())
-	if err != nil {
-		return nil, err
-	}
-	defer conn.Close()
-
-	c := api.NewAgentClient(conn)
-
+func create(ctx context.Context, client api.AgentClient, s *smbios.Smbios) (*api.CreateServerResponse, error) {
 	uuid, err := s.SystemInformation().UUID()
 	if err != nil {
 		return nil, err
 	}
@@ -109,56 +99,48 @@ func create(endpoint string, s *smbios.Smbios) (*api.CreateServerResponse, error
 		req.Hostname = hostname
 	}

-	resp, err := c.CreateServer(ctx, req)
-	if err != nil {
-		return nil, err
-	}
+	var resp *api.CreateServerResponse

-	return resp, nil
+	err = retry.Constant(5*time.Minute, retry.WithUnits(30*time.Second)).Retry(func() error {
+		ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+		defer cancel()
+
+		resp, err = client.CreateServer(ctx, req)
+		if err != nil {
+			return retry.ExpectedError(err)
+		}
+
+		return nil
+	})
+
+	return resp, err
 }

-func wipe(endpoint string, s *smbios.Smbios) error {
+func wipe(ctx context.Context, client api.AgentClient, s *smbios.Smbios) error {
 	uuid, err := s.SystemInformation().UUID()
 	if err != nil {
 		return err
 	}

-	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
-	defer cancel()
+	return retry.Constant(5*time.Minute, retry.WithUnits(30*time.Second)).Retry(func() error {
+		ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+		defer cancel()

-	conn, err := grpc.DialContext(ctx, endpoint, grpc.WithInsecure(), grpc.WithBlock())
-	if err != nil {
-		return err
-	}
-	defer conn.Close()
+		_, err = client.MarkServerAsWiped(ctx, &api.MarkServerAsWipedRequest{Uuid: uuid.String()})
+		if err != nil {
+			return retry.ExpectedError(err)
+		}

-	c := api.NewAgentClient(conn)
-
-	_, err = c.MarkServerAsWiped(ctx, &api.MarkServerAsWipedRequest{Uuid: uuid.String()})
-	if err != nil {
-		return err
-	}
-
-	return nil
+		return nil
+	})
 }

-func reconcileIPs(endpoint string, s *smbios.Smbios, ips []net.IP) error {
+func reconcileIPs(ctx context.Context, client api.AgentClient, s *smbios.Smbios, ips []net.IP) error {
 	uuid, err := s.SystemInformation().UUID()
 	if err != nil {
 		return err
 	}

-	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
-	defer cancel()
-
-	conn, err := grpc.DialContext(ctx, endpoint, grpc.WithInsecure(), grpc.WithBlock())
-	if err != nil {
-		return err
-	}
-	defer conn.Close()
-
-	c := api.NewAgentClient(conn)
-
 	addresses := make([]*api.Address, len(ips))
 	for i := range addresses {
 		addresses[i] = &api.Address{
@@ -167,15 +149,20 @@ func reconcileIPs(endpoint string, s *smbios.Smbios, ips []net.IP) error {
 		}
 	}

-	_, err = c.ReconcileServerAddresses(ctx, &api.ReconcileServerAddressesRequest{
-		Uuid:    uuid.String(),
-		Address: addresses,
-	})
-	if err != nil {
-		return err
-	}
+	return retry.Constant(5*time.Minute, retry.WithUnits(30*time.Second)).Retry(func() error {
+		ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+		defer cancel()

-	return nil
+		_, err = client.ReconcileServerAddresses(ctx, &api.ReconcileServerAddressesRequest{
+			Uuid:    uuid.String(),
+			Address: addresses,
+		})
+		if err != nil {
+			return retry.ExpectedError(err)
+		}
+
+		return nil
+	})
 }

 func shutdown(err error) {
@@ -186,6 +173,12 @@
 			log.Printf("rebooting in %d seconds\n", i)
 			time.Sleep(1 * time.Second)
 		}
+
+		if unix.Reboot(unix.LINUX_REBOOT_CMD_RESTART) == nil {
+			select {}
+		}
+
+		os.Exit(1)
 	}

 	if unix.Reboot(unix.LINUX_REBOOT_CMD_POWER_OFF) == nil {
@@ -196,35 +189,65 @@
 	os.Exit(1)
 }

-func main() {
+func connect(ctx context.Context, endpoint string) (*grpc.ClientConn, error) {
+	ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+	defer cancel()
+
+	return grpc.DialContext(ctx, endpoint, grpc.WithInsecure())
+}
+
+func mainFunc() error {
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
 	if err := setup(); err != nil {
-		shutdown(err)
+		return err
 	}

 	var endpoint string
 	if found := procfs.ProcCmdline().Get(constants.AgentEndpointArg).First(); found != nil {
 		endpoint = *found
 	} else {
-		shutdown(fmt.Errorf("no endpoint found"))
+		return fmt.Errorf("no endpoint found")
 	}

 	log.Printf("Using %q as API endpoint", endpoint)

+	conn, err := connect(ctx, endpoint)
+	if err != nil {
+		return err
+	}
+
+	defer conn.Close()
+
+	client := api.NewAgentClient(conn)
+
 	log.Println("Reading SMBIOS")

 	s, err := smbios.New()
 	if err != nil {
-		shutdown(err)
+		return err
 	}

-	resp, err := create(endpoint, s)
+	createResp, err := create(ctx, client, s)
 	if err != nil {
-		shutdown(err)
+		return err
 	}

 	log.Println("Registration complete")

-	if resp.GetWipe() {
+	ips, err := talosnet.IPAddrs()
+	if err != nil {
+		log.Println("failed to discover IPs")
+	} else {
+		if err = reconcileIPs(ctx, client, s, ips); err != nil {
+			shutdown(err)
+		}
+
+		log.Printf("Reconciled IPs")
+	}
+
+	if createResp.GetWipe() {
 		bds, err := probe.All()
 		if err != nil {
 			shutdown(err)
@@ -247,23 +270,16 @@
 			}
 		}

-		if err := wipe(endpoint, s); err != nil {
+		if err := wipe(ctx, client, s); err != nil {
 			shutdown(err)
 		}

 		log.Println("Wipe complete")
 	}

-	ips, err := talosnet.IPAddrs()
-	if err != nil {
-		log.Println("failed to discover IPs")
-	} else {
-		if err := reconcileIPs(endpoint, s, ips); err != nil {
-			shutdown(err)
-		}
-
-		log.Printf("Reconciled IPs")
-	}
-
-	shutdown(nil)
+	return nil
+}
+
+func main() {
+	shutdown(mainFunc())
 }
diff --git a/app/metal-controller-manager/config/crd/bases/metal.sidero.dev_servers.yaml b/app/metal-controller-manager/config/crd/bases/metal.sidero.dev_servers.yaml
index be15ed4c..bc4e4b84 100644
--- a/app/metal-controller-manager/config/crd/bases/metal.sidero.dev_servers.yaml
+++ b/app/metal-controller-manager/config/crd/bases/metal.sidero.dev_servers.yaml
@@ -197,6 +197,49 @@ spec:
               - type
               type: object
             type: array
+          conditions:
+            items:
+              description: Condition defines an observation of a Cluster API resource
+                operational state.
+              properties:
+                lastTransitionTime:
+                  description: Last time the condition transitioned from one status
+                    to another. This should be when the underlying condition changed.
+                    If that is not known, then using the time when the API field
+                    changed is acceptable.
+                  format: date-time
+                  type: string
+                message:
+                  description: A human readable message indicating details about
+                    the transition. This field may be empty.
+                  type: string
+                reason:
+                  description: The reason for the condition's last transition
+                    in CamelCase. The specific API may choose whether or not this
+                    field is considered a guaranteed API. This field may not be
+                    empty.
+                  type: string
+                severity:
+                  description: Severity provides an explicit classification of
+                    Reason code, so the users or machines can immediately understand
+                    the current situation and act accordingly. The Severity field
+                    MUST be set only when Status=False.
+                  type: string
+                status:
+                  description: Status of the condition, one of True, False, Unknown.
+                  type: string
+                type:
+                  description: Type of condition in CamelCase or in foo.example.com/CamelCase.
+                    Many .condition.type values are consistent across resources
+                    like Available, but because arbitrary conditions can be useful
+                    (see .node.status.conditions), the ability to deconflict is
+                    important.
+                  type: string
+              required:
+              - status
+              - type
+              type: object
+            type: array
           inUse:
             type: boolean
           isClean:
diff --git a/app/metal-controller-manager/config/rbac/role.yaml b/app/metal-controller-manager/config/rbac/role.yaml
index db9ca290..de24a397 100644
--- a/app/metal-controller-manager/config/rbac/role.yaml
+++ b/app/metal-controller-manager/config/rbac/role.yaml
@@ -12,6 +12,7 @@ rules:
   - events
   verbs:
   - create
+  - patch
 - apiGroups:
   - metal.sidero.dev
   resources:
diff --git a/app/metal-controller-manager/controllers/server_controller.go b/app/metal-controller-manager/controllers/server_controller.go
index 287d9be7..81404e34 100644
--- a/app/metal-controller-manager/controllers/server_controller.go
+++ b/app/metal-controller-manager/controllers/server_controller.go
@@ -13,6 +13,8 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/client-go/tools/record"
 	"k8s.io/client-go/tools/reference"
+	clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
+	"sigs.k8s.io/cluster-api/util/conditions"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/controller"
@@ -31,7 +33,7 @@ type ServerReconciler struct {
 // +kubebuilder:rbac:groups=metal.sidero.dev,resources=servers,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=metal.sidero.dev,resources=servers/status,verbs=get;update;patch
-// +kubebuilder:rbac:groups="",resources=events,verbs=create
+// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch

 func (r *ServerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
 	ctx := context.Background()
@@ -94,7 +96,7 @@ func (r *ServerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
 		}

 		if !mgmtClient.IsFake() {
-			r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", "Server powered off.")
+			r.Recorder.Event(serverRef, corev1.EventTypeNormal, "Server Management", "Server powered off.")
 		}
 	}
@@ -102,6 +104,10 @@ func (r *ServerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
 	case s.Status.InUse && !s.Status.IsClean:
 		return f(true, false)
 	case !s.Status.InUse && !s.Status.IsClean:
+		if conditions.Has(&s, metalv1alpha1.ConditionPowerCycle) && conditions.IsFalse(&s, metalv1alpha1.ConditionPowerCycle) {
+			return f(false, false)
+		}
+
 		mgmtClient, err := metal.NewManagementClient(&s.Spec)
 		if err != nil {
 			log.Error(err, "failed to create management client")
@@ -110,7 +116,7 @@ func (r *ServerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
 			return f(false, true)
 		}

-		_, err = mgmtClient.IsPoweredOn()
+		poweredOn, err := mgmtClient.IsPoweredOn()
 		if err != nil {
 			log.Error(err, "failed to check power state")
 			r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to determine power status: %s.", err))
@@ -126,16 +132,32 @@ func (r *ServerReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
 			return f(false, true)
 		}

-		err = mgmtClient.PowerCycle()
-		if err != nil {
-			log.Error(err, "failed to power cycle")
-			r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to power cycle: %s.", err))
+		if poweredOn {
+			err = mgmtClient.PowerCycle()
+			if err != nil {
+				log.Error(err, "failed to power cycle")
+				r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to power cycle: %s.", err))

-			return f(false, true)
+				return f(false, true)
+			}
+		} else {
+			err = mgmtClient.PowerOn()
+			if err != nil {
+				log.Error(err, "failed to power on")
+				r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to power on: %s.", err))
+
+				return f(false, true)
+			}
 		}

 		if !mgmtClient.IsFake() {
-			r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", "Server power cycled.")
+			if poweredOn {
+				r.Recorder.Event(serverRef, corev1.EventTypeNormal, "Server Management", "Server power cycled and set to PXE boot once.")
+			} else {
+				r.Recorder.Event(serverRef, corev1.EventTypeNormal, "Server Management", "Server powered on and set to PXE boot once.")
+			}
+
+			conditions.MarkFalse(&s, metalv1alpha1.ConditionPowerCycle, "InProgress", clusterv1.ConditionSeverityInfo, "Server power cycled for wiping.")
 		}

 		return f(false, false)
diff --git a/app/metal-controller-manager/internal/server/server.go b/app/metal-controller-manager/internal/server/server.go
index 67def938..ca86ed1d 100644
--- a/app/metal-controller-manager/internal/server/server.go
+++ b/app/metal-controller-manager/internal/server/server.go
@@ -23,6 +23,7 @@ import (
 	"k8s.io/client-go/rest"
 	"k8s.io/client-go/tools/record"
 	"k8s.io/client-go/tools/reference"
+	"sigs.k8s.io/cluster-api/util/conditions"
 	"sigs.k8s.io/cluster-api/util/patch"
 	controllerclient "sigs.k8s.io/controller-runtime/pkg/client"
@@ -124,6 +125,8 @@ func (s *server) MarkServerAsWiped(ctx context.Context, in *api.MarkServerAsWipe

 	obj.Status.IsClean = true

+	conditions.MarkTrue(obj, metalv1alpha1.ConditionPowerCycle)
+
 	if err := patchHelper.Patch(ctx, obj); err != nil {
 		return nil, err
 	}
diff --git a/go.mod b/go.mod
index 89a791c9..1852aa1d 100644
--- a/go.mod
+++ b/go.mod
@@ -17,6 +17,7 @@
 	github.com/talos-systems/cluster-api-control-plane-provider-talos v0.1.0-alpha.4
 	github.com/talos-systems/go-blockdevice v0.1.0
 	github.com/talos-systems/go-procfs v0.0.0-20200219015357-57c7311fdd45
+	github.com/talos-systems/go-retry v0.1.1-0.20200922131245-752f081252cf
 	github.com/talos-systems/go-smbios v0.0.0-20200219201045-94b8c4e489ee
 	github.com/talos-systems/net v0.2.0
 	github.com/talos-systems/talos v0.7.0-alpha.4
diff --git a/internal/pkg/api/api.go b/internal/pkg/api/api.go
index 4820564e..1aef12e6 100644
--- a/internal/pkg/api/api.go
+++ b/internal/pkg/api/api.go
@@ -69,11 +69,7 @@ func (c *Client) PowerOff() error {

 // PowerCycle will power cycle a given machine.
 func (c *Client) PowerCycle() error {
-	if err := c.postRequest("/poweroff"); err != nil {
-		return err
-	}
-
-	return c.postRequest("/poweron")
+	return c.postRequest("/reboot")
 }

 // SetPXE makes sure the node will pxe boot next time.
diff --git a/sfyra/pkg/tests/management_cluster.go b/sfyra/pkg/tests/management_cluster.go
index 6f136b6f..6c4928e9 100644
--- a/sfyra/pkg/tests/management_cluster.go
+++ b/sfyra/pkg/tests/management_cluster.go
@@ -116,7 +116,9 @@ func TestManagementCluster(ctx context.Context, metalClient client.Client, clust
 		obj := obj

-		_, err = dr.Create(ctx, &obj, metav1.CreateOptions{})
+		_, err = dr.Create(ctx, &obj, metav1.CreateOptions{
+			FieldManager: "sfyra",
+		})
 		if err != nil {
 			if apierrors.IsAlreadyExists(err) {
 				_, err = dr.Patch(ctx, obj.GetName(), types.ApplyPatchType, data, metav1.PatchOptions{
diff --git a/sfyra/pkg/tests/reconcile.go b/sfyra/pkg/tests/reconcile.go
new file mode 100644
index 00000000..ca4a971c
--- /dev/null
+++ b/sfyra/pkg/tests/reconcile.go
@@ -0,0 +1,88 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+package tests
+
+import (
+	"context"
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+	"github.com/talos-systems/go-retry/retry"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/cluster-api/api/v1alpha3"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// TestMachineDeploymentReconcile verifies that the machine deployment can reconcile deleted machines.
+func TestMachineDeploymentReconcile(ctx context.Context, metalClient client.Client) TestFunc {
+	return func(t *testing.T) {
+		var machineDeployment v1alpha3.MachineDeployment
+
+		const machineDeploymentName = "management-cluster-workers"
+
+		err := metalClient.Get(ctx, types.NamespacedName{Namespace: "default", Name: machineDeploymentName}, &machineDeployment)
+		require.NoError(t, err)
+
+		var machines v1alpha3.MachineList
+
+		labelSelector, err := labels.Parse(machineDeployment.Status.Selector)
+		require.NoError(t, err)
+
+		err = metalClient.List(ctx, &machines, client.MatchingLabelsSelector{Selector: labelSelector})
+		require.NoError(t, err)
+
+		replicas := int32(len(machines.Items))
+
+		for _, machine := range machines.Items {
+			machine := machine
+
+			err = metalClient.Delete(ctx, &machine)
+			require.NoError(t, err)
+		}
+
+		// first, controller should pick up the fact that some replicas are missing
+		err = retry.Constant(1*time.Minute, retry.WithUnits(100*time.Millisecond)).Retry(func() error {
+			var machineDeployment v1alpha3.MachineDeployment
+
+			err = metalClient.Get(ctx, types.NamespacedName{Namespace: "default", Name: machineDeploymentName}, &machineDeployment)
+			if err != nil {
+				return retry.UnexpectedError(err)
+			}
+
+			if machineDeployment.Status.UnavailableReplicas != replicas {
+				return retry.ExpectedError(fmt.Errorf("expected %d unavailable replicas, got %d", replicas, machineDeployment.Status.UnavailableReplicas))
+			}
+
+			return nil
+		})
+		require.NoError(t, err)
+
+		// next, check that replicas get reconciled
+		err = retry.Constant(10*time.Minute, retry.WithUnits(10*time.Second)).Retry(func() error {
+			err = metalClient.Get(ctx, types.NamespacedName{Namespace: "default", Name: machineDeploymentName}, &machineDeployment)
+			if err != nil {
+				return retry.UnexpectedError(err)
+			}
+
+			if v1alpha3.MachineDeploymentPhase(machineDeployment.Status.Phase) != v1alpha3.MachineDeploymentPhaseRunning {
+				return retry.ExpectedError(fmt.Errorf("expected %s phase, got %s", v1alpha3.MachineDeploymentPhaseRunning, machineDeployment.Status.Phase))
+			}
+
+			if machineDeployment.Status.Replicas != replicas {
+				return retry.ExpectedError(fmt.Errorf("expected %d replicas, got %d", replicas, machineDeployment.Status.Replicas))
+			}
+
+			if machineDeployment.Status.ReadyReplicas != replicas {
+				return retry.ExpectedError(fmt.Errorf("expected %d ready replicas, got %d", replicas, machineDeployment.Status.ReadyReplicas))
+			}
+
+			return nil
+		})
+		require.NoError(t, err)
+	}
+}
diff --git a/sfyra/pkg/tests/reset.go b/sfyra/pkg/tests/reset.go
index 8446608e..300e6a18 100644
--- a/sfyra/pkg/tests/reset.go
+++ b/sfyra/pkg/tests/reset.go
@@ -13,16 +13,14 @@ import (
 	"github.com/stretchr/testify/require"
 	"github.com/talos-systems/go-retry/retry"
 	"k8s.io/apimachinery/pkg/labels"
-	"sigs.k8s.io/cluster-api/util"
 	"sigs.k8s.io/controller-runtime/pkg/client"

 	metal "github.com/talos-systems/sidero/app/cluster-api-provider-sidero/api/v1alpha3"
 	sidero "github.com/talos-systems/sidero/app/metal-controller-manager/api/v1alpha1"
-	"github.com/talos-systems/sidero/sfyra/pkg/vm"
 )

 // TestServerReset verifies that all the servers got reset.
-func TestServerReset(ctx context.Context, metalClient client.Client, vmSet *vm.Set) TestFunc {
+func TestServerReset(ctx context.Context, metalClient client.Client) TestFunc {
 	return func(t *testing.T) {
 		var machines metal.MetalMachineList
@@ -40,14 +38,11 @@ func TestServerReset(ctx context.Context, metalClient client.Client, vmSet *vm.S
 			}

 			serverNamesToCheck = append(serverNamesToCheck, machines.Items[i].Spec.ServerRef.Name)
-
-			ownerMachine, err := util.GetOwnerMachine(ctx, metalClient, machines.Items[i].ObjectMeta)
-			require.NoError(t, err)
-
-			err = metalClient.Delete(ctx, ownerMachine)
-			require.NoError(t, err)
 		}

+		err = scaleWorkers(ctx, metalClient, 0)
+		require.NoError(t, err)
+
 		err = retry.Constant(5*time.Minute, retry.WithUnits(10*time.Second)).Retry(func() error {
 			var servers sidero.ServerList
@@ -79,8 +74,6 @@ func TestServerReset(ctx context.Context, metalClient client.Client, vmSet *vm.S
 			return nil
 		})

-		// TODO: Wait for machinedeployment to reconcile deleted machine.
-
 		require.NoError(t, err)
 	}
 }
diff --git a/sfyra/pkg/tests/scale.go b/sfyra/pkg/tests/scale.go
index 315de6d3..a29891e0 100644
--- a/sfyra/pkg/tests/scale.go
+++ b/sfyra/pkg/tests/scale.go
@@ -102,6 +102,10 @@ func scaleWorkers(ctx context.Context, metalClient client.Client, replicas int32
 			return fmt.Errorf("expected %s phase, got %s", v1alpha3.MachineDeploymentPhaseRunning, o.Status.Phase)
 		}

+		if o.Status.Replicas != replicas {
+			return fmt.Errorf("expected %d replicas, got %d", replicas, o.Status.Replicas)
+		}
+
 		if o.Status.ReadyReplicas != replicas {
 			return fmt.Errorf("expected %d ready replicas, got %d", replicas, o.Status.ReadyReplicas)
 		}
@@ -123,6 +127,8 @@ func scaleWorkers(ctx context.Context, metalClient client.Client, replicas int32
 }

 func scale(ctx context.Context, metalClient client.Client, name string, obj runtime.Object, set, verify ScaleCallBack) error {
+	cleanObj := obj.DeepCopyObject()
+
 	err := metalClient.Get(ctx, types.NamespacedName{Namespace: "default", Name: name}, obj)
 	if err != nil {
 		return err
@@ -133,12 +139,16 @@ func scale(ctx context.Context, metalClient client.Client, name string, obj runt
 		return err
 	}

-	err = metalClient.Update(ctx, obj)
+	err = metalClient.Update(ctx, obj, &client.UpdateOptions{
+		FieldManager: "sfyra",
+	})
 	if err != nil {
 		return err
 	}

 	err = retry.Constant(10*time.Minute, retry.WithUnits(10*time.Second)).Retry(func() error {
+		obj = cleanObj.DeepCopyObject()
+
 		err := metalClient.Get(ctx, types.NamespacedName{Namespace: "default", Name: name}, obj)
 		if err != nil {
 			return retry.UnexpectedError(err)
diff --git a/sfyra/pkg/tests/tests.go b/sfyra/pkg/tests/tests.go
index 864b7b77..57aa27f2 100644
--- a/sfyra/pkg/tests/tests.go
+++ b/sfyra/pkg/tests/tests.go
@@ -99,9 +99,13 @@ func Run(ctx context.Context, cluster talos.Cluster, vmSet *vm.Set, capiManager
 			"TestScaleControlPlaneDown",
 			TestScaleControlPlaneDown(ctx, metalClient, vmSet),
 		},
+		{
+			"TestMachineDeploymentReconcile",
+			TestMachineDeploymentReconcile(ctx, metalClient),
+		},
 		{
 			"TestServerReset",
-			TestServerReset(ctx, metalClient, vmSet),
+			TestServerReset(ctx, metalClient),
 		},
 	}