mirror of
https://github.com/siderolabs/sidero.git
synced 2025-08-06 06:37:04 +02:00
Rename to siderolabs, bump dependencies, controller-runtime, get rid of netaddr, new SideroLink API, etc. Use bootstrap cluster with a control plane + worker to avoid nasty restarts when host-mode SideroLink IP pops up. Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
350 lines
12 KiB
Go
350 lines
12 KiB
Go
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
package controllers
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/go-logr/logr"
|
|
"github.com/pkg/errors"
|
|
corev1 "k8s.io/api/core/v1"
|
|
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
|
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/runtime"
|
|
"k8s.io/apimachinery/pkg/types"
|
|
"k8s.io/client-go/tools/record"
|
|
"k8s.io/client-go/tools/reference"
|
|
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
|
|
"sigs.k8s.io/cluster-api/util/conditions"
|
|
"sigs.k8s.io/cluster-api/util/patch"
|
|
ctrl "sigs.k8s.io/controller-runtime"
|
|
"sigs.k8s.io/controller-runtime/pkg/client"
|
|
"sigs.k8s.io/controller-runtime/pkg/controller"
|
|
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
|
|
"sigs.k8s.io/controller-runtime/pkg/handler"
|
|
"sigs.k8s.io/controller-runtime/pkg/reconcile"
|
|
"sigs.k8s.io/controller-runtime/pkg/source"
|
|
|
|
infrav1 "github.com/siderolabs/sidero/app/caps-controller-manager/api/v1alpha3"
|
|
metalv1 "github.com/siderolabs/sidero/app/sidero-controller-manager/api/v1alpha2"
|
|
"github.com/siderolabs/sidero/app/sidero-controller-manager/internal/power"
|
|
"github.com/siderolabs/sidero/app/sidero-controller-manager/pkg/constants"
|
|
siderotypes "github.com/siderolabs/sidero/app/sidero-controller-manager/pkg/types"
|
|
)
|
|
|
|
// serverBindingFinalizer is the finalizer the Server controller places on
// Server objects; it is only removed once a deleted Server is no longer
// allocated.
const serverBindingFinalizer = "storage.finalizers.server.k8s.io"
|
|
|
|
// ServerReconciler reconciles a Server object.
|
|
type ServerReconciler struct {
|
|
client.Client
|
|
Log logr.Logger
|
|
Scheme *runtime.Scheme
|
|
APIReader client.Reader
|
|
Recorder record.EventRecorder
|
|
|
|
RebootTimeout time.Duration
|
|
PXEMode siderotypes.PXEMode
|
|
}
|
|
|
|
// +kubebuilder:rbac:groups=metal.sidero.dev,resources=servers,verbs=get;list;watch;create;update;patch;delete
|
|
// +kubebuilder:rbac:groups=metal.sidero.dev,resources=servers/status,verbs=get;update;patch
|
|
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=serverbindings,verbs=get;list;watch
|
|
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=serverbindings/status,verbs=get
|
|
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=metalmachines,verbs=get;list;watch
|
|
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=metalmachines/status,verbs=get
|
|
// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch
|
|
// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;create;update;patch;delete
|
|
|
|
//nolint:maintidx
|
|
func (r *ServerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
|
|
log := r.Log.WithValues("server", req.NamespacedName)
|
|
|
|
s := metalv1.Server{}
|
|
|
|
if err := r.APIReader.Get(ctx, req.NamespacedName, &s); err != nil {
|
|
return ctrl.Result{}, client.IgnoreNotFound(err)
|
|
}
|
|
|
|
patchHelper, err := patch.NewHelper(&s, r.Client)
|
|
if err != nil {
|
|
return ctrl.Result{}, err
|
|
}
|
|
|
|
serverRef, err := reference.GetReference(r.Scheme, &s)
|
|
if err != nil {
|
|
return ctrl.Result{}, err
|
|
}
|
|
|
|
mgmtClient, err := power.NewManagementClient(ctx, r.Client, &s.Spec)
|
|
if err != nil {
|
|
log.Error(err, "failed to create management client")
|
|
r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to initialize management client: %s.", err))
|
|
|
|
return ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter}, err
|
|
}
|
|
|
|
defer mgmtClient.Close() //nolint:errcheck
|
|
|
|
s.Status.Power = "off"
|
|
|
|
poweredOn, powerErr := mgmtClient.IsPoweredOn()
|
|
if powerErr != nil {
|
|
s.Status.Power = "unknown"
|
|
}
|
|
|
|
if poweredOn {
|
|
s.Status.Power = "on"
|
|
}
|
|
|
|
pxeMode := r.PXEMode
|
|
if s.Spec.PXEMode != "" {
|
|
pxeMode = s.Spec.PXEMode
|
|
}
|
|
|
|
f := func(ready bool, result ctrl.Result) (ctrl.Result, error) {
|
|
s.Status.Ready = ready
|
|
|
|
if err := patchHelper.Patch(ctx, &s, patch.WithOwnedConditions{
|
|
Conditions: []clusterv1.ConditionType{metalv1.ConditionPowerCycle, metalv1.ConditionPXEBooted},
|
|
}); err != nil {
|
|
return result, errors.WithStack(err)
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
allocated, serverBinding, err := r.getServerBinding(ctx, req)
|
|
if err != nil {
|
|
return ctrl.Result{}, err
|
|
}
|
|
|
|
if !allocated {
|
|
if s.Status.InUse {
|
|
// transitioning to false
|
|
r.Recorder.Event(serverRef, corev1.EventTypeNormal, "Server Allocation", "Server marked as unallocated.")
|
|
}
|
|
|
|
s.Status.InUse = false
|
|
|
|
conditions.Delete(&s, metalv1.ConditionPXEBooted)
|
|
} else {
|
|
s.Status.InUse = true
|
|
s.Status.IsClean = false
|
|
|
|
if serverBinding != nil {
|
|
// clear any leftover ownerreferences, they were transferred by serverbinding controller
|
|
s.OwnerReferences = []v1.OwnerReference{}
|
|
|
|
// Talos installation was successful, so mark the server as PXE booted.
|
|
if conditions.IsTrue(serverBinding, infrav1.TalosInstalledCondition) {
|
|
conditions.MarkTrue(serverBinding, metalv1.ConditionPXEBooted)
|
|
}
|
|
}
|
|
}
|
|
|
|
hasFinalizer := controllerutil.ContainsFinalizer(&s, serverBindingFinalizer)
|
|
|
|
if s.ObjectMeta.DeletionTimestamp.IsZero() {
|
|
if !hasFinalizer {
|
|
controllerutil.AddFinalizer(&s, serverBindingFinalizer)
|
|
|
|
if err := patchHelper.Patch(ctx, &s); err != nil {
|
|
return ctrl.Result{}, errors.WithStack(err)
|
|
}
|
|
}
|
|
} else {
|
|
// remove the finalizer from the server if it is not allocated
|
|
if hasFinalizer && !allocated {
|
|
controllerutil.RemoveFinalizer(&s, serverBindingFinalizer)
|
|
}
|
|
}
|
|
|
|
switch {
|
|
case !s.Spec.Accepted:
|
|
// if server is not accepted, Sidero doesn't control server lifecycle, so we can't assume that server is (still) clean
|
|
s.Status.IsClean = false
|
|
|
|
return f(false, ctrl.Result{})
|
|
case s.Status.InUse && s.Status.IsClean:
|
|
log.Error(fmt.Errorf("server cannot be in use and clean"), "server is in an impossible state", "inUse", s.Status.InUse, "isClean", s.Status.IsClean)
|
|
|
|
return f(false, ctrl.Result{})
|
|
case !s.Status.InUse && s.Status.IsClean:
|
|
if powerErr != nil {
|
|
log.Error(powerErr, "failed to check power state")
|
|
r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to determine power status: %s.", powerErr))
|
|
|
|
return f(false, ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter})
|
|
}
|
|
|
|
if poweredOn {
|
|
err = mgmtClient.PowerOff()
|
|
if err != nil {
|
|
log.Error(err, "failed to power off")
|
|
r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to power off: %s.", err))
|
|
|
|
return f(false, ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter})
|
|
}
|
|
|
|
if !mgmtClient.IsFake() {
|
|
r.Recorder.Event(serverRef, corev1.EventTypeNormal, "Server Management", "Server powered off.")
|
|
}
|
|
}
|
|
|
|
return f(true, ctrl.Result{})
|
|
case s.Status.InUse && !s.Status.IsClean:
|
|
if powerErr != nil {
|
|
log.Error(powerErr, "failed to check power state")
|
|
r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to determine power status: %s.", powerErr))
|
|
|
|
return f(false, ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter})
|
|
}
|
|
|
|
if !poweredOn {
|
|
// it's safe to set server to PXE boot even if it's already installed, as PXE server makes sure server is PXE booted only once
|
|
err = mgmtClient.SetPXE(pxeMode)
|
|
if err != nil {
|
|
log.Error(err, "failed to set PXE")
|
|
r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to set to PXE boot once: %s.", err))
|
|
|
|
return f(false, ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter})
|
|
}
|
|
|
|
err = mgmtClient.PowerOn()
|
|
if err != nil {
|
|
log.Error(err, "failed to power on")
|
|
r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to power on: %s.", err))
|
|
|
|
return f(false, ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter})
|
|
}
|
|
|
|
if !mgmtClient.IsFake() {
|
|
r.Recorder.Event(serverRef, corev1.EventTypeNormal, "Server Management", "Server powered on and set PXE boot once into the environment.")
|
|
}
|
|
}
|
|
|
|
// keep checking power state from time to time, as sometimes IPMI lies about the power state
|
|
return f(true, ctrl.Result{RequeueAfter: constants.PowerCheckPeriod})
|
|
case !s.Status.InUse && !s.Status.IsClean:
|
|
// when server is set to PXE boot to be wiped, ConditionPowerCycle is set to mark server
|
|
// as power cycled to avoid duplicate reboot attempts from subsequent Reconciles
|
|
//
|
|
// we check LastTransitionTime to see if the server is in the wiping state for too long and
|
|
// it's time to retry the IPMI sequence
|
|
if conditions.Has(&s, metalv1.ConditionPowerCycle) &&
|
|
conditions.IsFalse(&s, metalv1.ConditionPowerCycle) &&
|
|
time.Since(conditions.GetLastTransitionTime(&s, metalv1.ConditionPowerCycle).Time) < r.RebootTimeout {
|
|
// already powercycled, reboot/heartbeat timeout not elapsed, wait more
|
|
return f(false, ctrl.Result{RequeueAfter: r.RebootTimeout / 3})
|
|
}
|
|
|
|
if powerErr != nil {
|
|
log.Error(powerErr, "failed to check power state")
|
|
r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to determine power status: %s.", powerErr))
|
|
|
|
return f(false, ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter})
|
|
}
|
|
|
|
err = mgmtClient.SetPXE(pxeMode)
|
|
if err != nil {
|
|
log.Error(err, "failed to set PXE")
|
|
r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to set to PXE boot once: %s.", err))
|
|
|
|
return f(false, ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter})
|
|
}
|
|
|
|
if poweredOn {
|
|
err = mgmtClient.PowerCycle()
|
|
if err != nil {
|
|
log.Error(err, "failed to power cycle")
|
|
r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to power cycle: %s.", err))
|
|
|
|
return f(false, ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter})
|
|
}
|
|
} else {
|
|
err = mgmtClient.PowerOn()
|
|
if err != nil {
|
|
log.Error(err, "failed to power on")
|
|
r.Recorder.Event(serverRef, corev1.EventTypeWarning, "Server Management", fmt.Sprintf("Failed to power on: %s.", err))
|
|
|
|
return f(false, ctrl.Result{RequeueAfter: constants.DefaultRequeueAfter})
|
|
}
|
|
}
|
|
|
|
if !mgmtClient.IsFake() {
|
|
if poweredOn {
|
|
r.Recorder.Event(serverRef, corev1.EventTypeNormal, "Server Management", "Server power cycled and set to PXE boot once.")
|
|
} else {
|
|
r.Recorder.Event(serverRef, corev1.EventTypeNormal, "Server Management", "Server powered on and set to PXE boot once.")
|
|
}
|
|
|
|
// make sure message is updated in case condition was already set to make sure LastTransitionTime will be updated
|
|
conditions.MarkFalse(&s, metalv1.ConditionPowerCycle, "InProgress", clusterv1.ConditionSeverityInfo, fmt.Sprintf("Server power cycled for wiping at %s.", time.Now().Format(time.RFC3339)))
|
|
}
|
|
|
|
// requeue to check for wipe timeout
|
|
return f(false, ctrl.Result{RequeueAfter: r.RebootTimeout / 3})
|
|
}
|
|
|
|
return f(false, ctrl.Result{})
|
|
}
|
|
|
|
func (r *ServerReconciler) getServerBinding(ctx context.Context, req ctrl.Request) (bool, *infrav1.ServerBinding, error) {
|
|
var (
|
|
serverBinding infrav1.ServerBinding
|
|
err error
|
|
)
|
|
|
|
err = r.Get(ctx, req.NamespacedName, &serverBinding)
|
|
if err == nil {
|
|
return true, &serverBinding, nil
|
|
}
|
|
|
|
if apierrors.IsNotFound(err) {
|
|
return false, nil, nil
|
|
}
|
|
|
|
return false, nil, err
|
|
}
|
|
|
|
func (r *ServerReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
|
|
if err := mgr.GetFieldIndexer().IndexField(ctx, &infrav1.MetalMachine{}, infrav1.MetalMachineServerRefField, func(rawObj client.Object) []string {
|
|
metalMachine := rawObj.(*infrav1.MetalMachine)
|
|
|
|
if metalMachine.Spec.ServerRef == nil {
|
|
return nil
|
|
}
|
|
|
|
return []string{metalMachine.Spec.ServerRef.Name}
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
|
|
mapRequests := func(a client.Object) []reconcile.Request {
|
|
// servers and serverbindings always have matching names
|
|
return []reconcile.Request{
|
|
{
|
|
NamespacedName: types.NamespacedName{
|
|
Name: a.GetName(),
|
|
Namespace: a.GetNamespace(),
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
return ctrl.NewControllerManagedBy(mgr).
|
|
WithOptions(options).
|
|
For(&metalv1.Server{}).
|
|
Watches(
|
|
&source.Kind{Type: &infrav1.ServerBinding{}},
|
|
handler.EnqueueRequestsFromMapFunc(mapRequests),
|
|
).
|
|
Complete(r)
|
|
}
|