sidero/app/caps-controller-manager/controllers/metalmachine_controller.go
Commit 01981eb1c6 by Andrey Smirnov, 2023-12-15 17:59:33 +04:00

feat: update Talos to 1.6.0

Also update CAPI to 1.6.0.

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>


// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

package controllers

import (
    "context"
    "errors"
    "fmt"

    "github.com/go-logr/logr"
    "github.com/siderolabs/go-pointer"
    corev1 "k8s.io/api/core/v1"
    apierrors "k8s.io/apimachinery/pkg/api/errors"
    "k8s.io/apimachinery/pkg/fields"
    "k8s.io/apimachinery/pkg/runtime"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/client-go/tools/record"
    "k8s.io/client-go/tools/reference"
    capiv1 "sigs.k8s.io/cluster-api/api/v1beta1"
    "sigs.k8s.io/cluster-api/controllers/remote"
    "sigs.k8s.io/cluster-api/util"
    "sigs.k8s.io/cluster-api/util/conditions"
    "sigs.k8s.io/cluster-api/util/patch"
    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/client"
    "sigs.k8s.io/controller-runtime/pkg/controller"
    "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
    "sigs.k8s.io/controller-runtime/pkg/handler"
    "sigs.k8s.io/controller-runtime/pkg/reconcile"

    infrav1 "github.com/siderolabs/sidero/app/caps-controller-manager/api/v1alpha3"
    "github.com/siderolabs/sidero/app/caps-controller-manager/pkg/constants"
    metalv1 "github.com/siderolabs/sidero/app/sidero-controller-manager/api/v1alpha2"
)
var ErrNoServersInServerClass = errors.New("no servers available in serverclass")

// MetalMachineReconciler reconciles a MetalMachine object.
type MetalMachineReconciler struct {
    client.Client
    Log      logr.Logger
    Scheme   *runtime.Scheme
    Recorder record.EventRecorder
    Tracker  *remote.ClusterCacheTracker
}
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=metalmachines,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=metalmachines/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch
// +kubebuilder:rbac:groups=metal.sidero.dev,resources=serverclasses,verbs=get;list;watch;
// +kubebuilder:rbac:groups=metal.sidero.dev,resources=serverclasses/status,verbs=get;list;watch;
// +kubebuilder:rbac:groups=metal.sidero.dev,resources=servers,verbs=get;list;watch;
// +kubebuilder:rbac:groups=metal.sidero.dev,resources=servers/status,verbs=get;update;patch
// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch
func (r *MetalMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Result, err error) {
    logger := r.Log.WithValues("metalmachine", req.NamespacedName)

    // Fetch the metalMachine instance.
    metalMachine := &infrav1.MetalMachine{}

    err = r.Get(ctx, req.NamespacedName, metalMachine)
    if apierrors.IsNotFound(err) {
        return ctrl.Result{}, nil
    }

    if err != nil {
        return ctrl.Result{}, err
    }

    machine, err := util.GetOwnerMachine(ctx, r.Client, metalMachine.ObjectMeta)
    if err != nil {
        r.Log.Error(err, "Failed to get machine")

        return ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter}, nil
    }

    if machine == nil {
        logger.Info("No ownerref for metalmachine")

        return ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter}, nil
    }

    logger = logger.WithName(fmt.Sprintf("machine=%s", machine.Name))

    cluster, err := util.GetClusterFromMetadata(ctx, r.Client, machine.ObjectMeta)
    if err != nil {
        return ctrl.Result{}, fmt.Errorf("no cluster label or cluster does not exist: %w", err)
    }

    logger = logger.WithName(fmt.Sprintf("cluster=%s", cluster.Name))

    if !cluster.Status.InfrastructureReady {
        logger.Info("Cluster infrastructure is not ready", "cluster", cluster.Name)

        return ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter}, nil
    }

    if machine.Spec.Bootstrap.DataSecretName == nil {
        logger.Info("Bootstrap secret is not available yet")

        return ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter}, nil
    }

    // Initialize the patch helper.
    patchHelper, err := patch.NewHelper(metalMachine, r.Client)
    if err != nil {
        return ctrl.Result{}, err
    }

    // Always attempt to patch the MetalMachine object and status after each reconciliation.
    defer func() {
        if e := patchHelper.Patch(ctx, metalMachine); e != nil {
            logger.Error(e, "failed to patch metalMachine")

            if err == nil {
                err = e
            }
        }
    }()

    // Handle deleted machines.
    if !metalMachine.ObjectMeta.DeletionTimestamp.IsZero() {
        logger.Info("deleting metalmachine")

        return r.reconcileDelete(ctx, metalMachine)
    }

    controllerutil.AddFinalizer(metalMachine, infrav1.MachineFinalizer)

    // TODO (smira):
    // It is really weird that .Spec.ServerRef is used both to set a manual link to the server by the user
    // and as a way to store the current binding when the server is picked up automatically.
    //
    // This should be refactored to keep the _current_ binding ref in .Status, while
    // .Spec should be reserved for a manually chosen server only.
    // This opens the question of what to do if `.Status` is lost after pivoting: how to reconcile it correctly?
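    // A possible shape for that refactoring (purely illustrative, not part of the
    // current v1alpha3 API) would keep the live binding in the status:
    //
    //  // Hypothetical field, sketched only to make the TODO concrete:
    //  type MetalMachineStatus struct {
    //      // BoundServerRef would record the server currently bound to this machine,
    //      // leaving Spec.ServerRef for user-pinned servers only.
    //      BoundServerRef *corev1.ObjectReference `json:"boundServerRef,omitempty"`
    //  }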
    // If the server binding is missing, we need to pick up a server.
    if metalMachine.Spec.ServerRef == nil {
        if metalMachine.Spec.ServerClassRef == nil {
            return ctrl.Result{}, fmt.Errorf("either a server or serverclass ref must be supplied")
        }

        serverResource, err := r.fetchServerFromClass(ctx, logger, metalMachine.Spec.ServerClassRef, metalMachine)
        if err != nil {
            if errors.Is(err, ErrNoServersInServerClass) {
                return ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter}, nil
            }

            return ctrl.Result{}, err
        }

        metalMachine.Spec.ServerRef = &corev1.ObjectReference{
            Kind: serverResource.Kind,
            Name: serverResource.Name,
        }
    } else {
        // If the server ref is set, it could have been set by "us", or by the user.
        // Either way, check whether we need to create a server binding.
        var serverBinding infrav1.ServerBinding

        err = r.Get(ctx, types.NamespacedName{Namespace: metalMachine.Spec.ServerRef.Namespace, Name: metalMachine.Spec.ServerRef.Name}, &serverBinding)
        if err != nil {
            if apierrors.IsNotFound(err) {
                var serverObj metalv1.Server

                // Servers are cluster-scoped, so the namespace is left empty.
                namespacedName := types.NamespacedName{
                    Namespace: "",
                    Name:      metalMachine.Spec.ServerRef.Name,
                }

                if err := r.Get(ctx, namespacedName, &serverObj); err != nil {
                    return ctrl.Result{}, err
                }

                serverRef, err := reference.GetReference(r.Scheme, &serverObj)
                if err != nil {
                    return ctrl.Result{}, err
                }

                if err = r.createServerBinding(ctx, nil, &serverObj, metalMachine); err != nil {
                    return ctrl.Result{}, err
                }

                r.Recorder.Event(serverRef, corev1.EventTypeNormal, "Server Assignment", fmt.Sprintf("Server was assigned via serverRef for metal machine %q.", metalMachine.Name))
            }

            // Returning the original error (NotFound on the first pass) forces a requeue,
            // so the freshly created binding is picked up on the next reconcile.
            return ctrl.Result{}, err
        }
    }

    // Set the providerID, as it's required by upstream CAPI for the machine lifecycle.
    metalMachine.Spec.ProviderID = pointer.To(fmt.Sprintf("%s://%s", constants.ProviderID, metalMachine.Spec.ServerRef.Name))
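    // The resulting ID has the form "<scheme>://<server-UUID>", e.g.
    // "sidero://00000000-0000-0000-0000-000000000000" (assuming constants.ProviderID
    // carries the "sidero" scheme; server names are the hardware UUIDs).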
    // Copy over statuses from the ServerBinding to the MetalMachine.
    if metalMachine.Spec.ServerRef != nil {
        var serverBinding infrav1.ServerBinding

        err = r.Get(ctx, types.NamespacedName{Namespace: metalMachine.Spec.ServerRef.Namespace, Name: metalMachine.Spec.ServerRef.Name}, &serverBinding)
        if err != nil {
            if apierrors.IsNotFound(err) {
                return ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter}, nil
            }

            return ctrl.Result{}, err
        }

        addresses := make([]capiv1.MachineAddress, 0, len(serverBinding.Spec.Addresses))

        for _, addr := range serverBinding.Spec.Addresses {
            addresses = append(addresses, capiv1.MachineAddress{
                Type:    capiv1.MachineInternalIP,
                Address: addr,
            })
        }

        if serverBinding.Spec.Hostname != "" {
            addresses = append(addresses, capiv1.MachineAddress{
                Type:    capiv1.MachineHostName,
                Address: serverBinding.Spec.Hostname,
            })
        }

        metalMachine.Status.Addresses = addresses
        metalMachine.Status.Ready = true

        // Copy conditions from the server binding.
        for _, condition := range serverBinding.GetConditions() {
            conditions.Set(metalMachine, &condition)
        }
    }

    err = r.patchProviderID(ctx, cluster, metalMachine)
    if err != nil {
        logger.Info("Failed to set provider ID", "error", err)

        conditions.MarkFalse(metalMachine, infrav1.ProviderSetCondition, infrav1.ProviderUpdateFailedReason, capiv1.ConditionSeverityWarning, err.Error())

        return ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter}, nil
    }

    conditions.MarkTrue(metalMachine, infrav1.ProviderSetCondition)

    return ctrl.Result{}, nil
}
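// reconcileDelete releases the machine's server by deleting its ServerBinding
// (requeueing until the binding is gone) and then removes the finalizer.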
func (r *MetalMachineReconciler) reconcileDelete(ctx context.Context, metalMachine *infrav1.MetalMachine) (ctrl.Result, error) {
    if metalMachine.Spec.ServerRef != nil {
        var serverBinding infrav1.ServerBinding

        err := r.Get(ctx, types.NamespacedName{Namespace: metalMachine.Spec.ServerRef.Namespace, Name: metalMachine.Spec.ServerRef.Name}, &serverBinding)
        if err == nil {
            // The binding still exists: delete it and requeue to verify it is gone.
            return ctrl.Result{Requeue: true}, r.Delete(ctx, &serverBinding)
        }

        if !apierrors.IsNotFound(err) {
            return ctrl.Result{}, err
        }
    }

    metalMachine.Spec.ServerRef = nil

    controllerutil.RemoveFinalizer(metalMachine, infrav1.MachineFinalizer)

    return ctrl.Result{}, nil
}
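// SetupWithManager registers the controller with the manager, adds a field index on
// ServerBinding.Spec.MetalMachineRef.Name, and watches ServerBindings so that binding
// changes re-trigger reconciliation of the owning MetalMachine.
//
// A minimal wiring sketch from a hypothetical main (names are illustrative only):
//
//  if err := (&MetalMachineReconciler{
//      Client: mgr.GetClient(),
//      Scheme: mgr.GetScheme(),
//  }).SetupWithManager(ctx, mgr, controller.Options{}); err != nil {
//      setupLog.Error(err, "unable to create controller", "controller", "MetalMachine")
//  }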
func (r *MetalMachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
    if err := mgr.GetFieldIndexer().IndexField(ctx, &infrav1.ServerBinding{}, infrav1.ServerBindingMetalMachineRefField, func(rawObj client.Object) []string {
        serverBinding := rawObj.(*infrav1.ServerBinding)

        return []string{serverBinding.Spec.MetalMachineRef.Name}
    }); err != nil {
        return err
    }

    mapRequests := func(ctx context.Context, a client.Object) []reconcile.Request {
        serverBinding := &infrav1.ServerBinding{}

        if err := r.Get(ctx, types.NamespacedName{Namespace: a.GetNamespace(), Name: a.GetName()}, serverBinding); err != nil {
            return nil
        }

        return []reconcile.Request{
            {
                NamespacedName: types.NamespacedName{
                    Name:      serverBinding.Spec.MetalMachineRef.Name,
                    Namespace: serverBinding.Spec.MetalMachineRef.Namespace,
                },
            },
        }
    }

    return ctrl.NewControllerManagedBy(mgr).
        WithOptions(options).
        For(&infrav1.MetalMachine{}).
        Watches(
            &infrav1.ServerBinding{},
            handler.EnqueueRequestsFromMapFunc(mapRequests),
        ).
        Complete(r)
}
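// fetchServerFromClass returns a Server for the given MetalMachine: an already-bound
// server if a ServerBinding exists, otherwise the first clean, unused server from the
// referenced ServerClass; it returns ErrNoServersInServerClass when none qualify.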
func (r *MetalMachineReconciler) fetchServerFromClass(ctx context.Context, logger logr.Logger, classRef *corev1.ObjectReference, metalMachine *infrav1.MetalMachine) (*metalv1.Server, error) {
    // First, check if there is already an existing serverBinding for this metalMachine.
    var serverBindingList infrav1.ServerBindingList

    if err := r.List(ctx, &serverBindingList, client.MatchingFields(fields.Set{infrav1.ServerBindingMetalMachineRefField: metalMachine.Name})); err != nil {
        return nil, err
    }

    for _, serverBinding := range serverBindingList.Items {
        if serverBinding.Spec.MetalMachineRef.Namespace == metalMachine.Namespace && serverBinding.Spec.MetalMachineRef.Name == metalMachine.Name {
            // Found an existing serverBinding for this metalMachine; a ServerBinding
            // shares its name with the Server it binds.
            var server metalv1.Server

            if err := r.Get(ctx, types.NamespacedName{Namespace: serverBinding.Namespace, Name: serverBinding.Name}, &server); err != nil {
                return nil, err
            }

            logger.Info("reconciled missing server ref", "metalmachine", metalMachine.Name, "server", server.Name)

            return &server, nil
        }
    }

    // Grab the server class and validate that we have servers available.
    serverClassResource, err := r.fetchServerClass(ctx, classRef)
    if err != nil {
        return nil, err
    }

    if len(serverClassResource.Status.ServersAvailable) == 0 {
        return nil, ErrNoServersInServerClass
    }

    serverClassRef, err := reference.GetReference(r.Scheme, serverClassResource)
    if err != nil {
        return nil, err
    }

    // Fetch a server from the available list.
    // NB: we added this loop to double-check that an available server isn't "in use" because
    // we saw raciness between server selection and its removal from the ServersAvailable list.
    for _, availServer := range serverClassResource.Status.ServersAvailable {
        serverObj := &metalv1.Server{}

        namespacedName := types.NamespacedName{
            Namespace: "",
            Name:      availServer,
        }

        if err := r.Get(ctx, namespacedName, serverObj); err != nil {
            return nil, err
        }

        serverRef, err := reference.GetReference(r.Scheme, serverObj)
        if err != nil {
            return nil, err
        }

        if serverObj.Status.InUse {
            continue
        }

        if !serverObj.Status.IsClean {
            continue
        }

        if err := r.createServerBinding(ctx, serverClassRef, serverObj, metalMachine); err != nil {
            // The server we picked was claimed by another metalMachine before we finished;
            // move on to the next one.
            if apierrors.IsAlreadyExists(err) {
                continue
            }

            return nil, err
        }

        r.Recorder.Event(serverRef, corev1.EventTypeNormal, "Server Allocation", fmt.Sprintf("Server is allocated via serverclass %q for metal machine %q.", serverClassResource.Name, metalMachine.Name))

        logger.Info("allocated new server", "metalmachine", metalMachine.Name, "server", serverObj.Name, "serverclass", serverClassResource.Name)

        return serverObj, nil
    }

    return nil, ErrNoServersInServerClass
}
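// patchProviderID locates the Node in the workload cluster via its
// "metal.sidero.dev/uuid" label (which carries the server UUID) and sets
// node.Spec.ProviderID to match the MetalMachine's provider ID.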
func (r *MetalMachineReconciler) patchProviderID(ctx context.Context, cluster *capiv1.Cluster, metalMachine *infrav1.MetalMachine) error {
    workloadClient, err := r.Tracker.GetClient(ctx, client.ObjectKeyFromObject(cluster))
    if err != nil {
        return err
    }

    var nodes corev1.NodeList

    if err = workloadClient.List(ctx, &nodes, client.MatchingLabels{"metal.sidero.dev/uuid": metalMachine.Spec.ServerRef.Name}); err != nil {
        return err
    }

    if len(nodes.Items) == 0 {
        return fmt.Errorf("no matching nodes found")
    }

    if len(nodes.Items) > 1 {
        return fmt.Errorf("multiple nodes found with same uuid label")
    }

    providerID := fmt.Sprintf("%s://%s", constants.ProviderID, metalMachine.Spec.ServerRef.Name)

    node := nodes.Items[0]

    if node.Spec.ProviderID == providerID {
        return nil
    }

    patchHelper, err := patch.NewHelper(&node, workloadClient)
    if err != nil {
        return err
    }

    r.Log.Info("Setting provider ID", "id", providerID)

    node.Spec.ProviderID = providerID

    return patchHelper.Patch(ctx, &node)
}
// createServerBinding marks a server as "in use" by creating a ServerBinding
// resource named after the server.
func (r *MetalMachineReconciler) createServerBinding(ctx context.Context, serverClassRef *corev1.ObjectReference, serverObj *metalv1.Server, metalMachine *infrav1.MetalMachine) error {
    var serverBinding infrav1.ServerBinding

    serverBinding.Namespace = serverObj.Namespace
    serverBinding.Name = serverObj.Name
    serverBinding.Labels = make(map[string]string)
    serverBinding.Spec.MetalMachineRef = corev1.ObjectReference{
        Kind:      metalMachine.Kind,
        UID:       metalMachine.UID,
        Namespace: metalMachine.Namespace,
        Name:      metalMachine.Name,
    }

    // serverClassRef is nil when the server was pinned directly via serverRef;
    // the generated DeepCopy is nil-safe, so no guard is needed here.
    serverBinding.Spec.ServerClassRef = serverClassRef.DeepCopy()

    for label, value := range metalMachine.Labels {
        serverBinding.Labels[label] = value
    }

    return r.Create(ctx, &serverBinding)
}
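// fetchServerClass retrieves the ServerClass referenced by the MetalMachine's spec.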
func (r *MetalMachineReconciler) fetchServerClass(ctx context.Context, classRef *corev1.ObjectReference) (*metalv1.ServerClass, error) {
    serverClassResource := &metalv1.ServerClass{}

    namespacedName := types.NamespacedName{
        Namespace: classRef.Namespace,
        Name:      classRef.Name,
    }

    if err := r.Get(ctx, namespacedName, serverClassResource); err != nil {
        return nil, err
    }

    return serverClassResource, nil
}