From e44f350d08ec51c8cfd091fc3ef28f5e61eae8a1 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Tue, 1 Feb 2022 23:20:09 +0300 Subject: [PATCH] fix: use controller-runtime standard healthz endpoints Fixes #717 `caps-controller-manager` has proper webhook support, so use that for readiness/liveness checks (standard CAPI way). `sidero-controller-manager` doesn't have webhooks (we should fix it eventually!), so its readiness/liveness checks probe the iPXE endpoint instead. Also: * bump Talos to 0.14.1 * use Talos-provided default kernel arguments for the agent environment (as the agent is running the Talos kernel, it makes sense). Signed-off-by: Andrey Smirnov --- Makefile | 2 +- .../config/manager/manager.yaml | 12 +++++ app/caps-controller-manager/main.go | 33 +++++++++---- .../config/manager/manager.yaml | 11 +++-- .../internal/healthz/healthz.go | 19 -------- .../internal/ipxe/ipxe_server.go | 48 ++++++++++++++----- app/sidero-controller-manager/main.go | 42 ++++++++++------ go.mod | 2 +- go.sum | 3 +- sfyra/go.mod | 4 +- sfyra/go.sum | 7 +-- 11 files changed, 117 insertions(+), 66 deletions(-) delete mode 100644 app/sidero-controller-manager/internal/healthz/healthz.go diff --git a/Makefile b/Makefile index fd97de34..8078654a 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ MODULE := $(shell head -1 go.mod | cut -d' ' -f2) ARTIFACTS := _out TEST_PKGS ?= ./... 
-TALOS_RELEASE ?= v0.14.0-alpha.2 +TALOS_RELEASE ?= v0.14.1 PREVIOUS_TALOS_RELEASE ?= v0.13.4 DEFAULT_K8S_VERSION ?= v1.22.3 diff --git a/app/caps-controller-manager/config/manager/manager.yaml b/app/caps-controller-manager/config/manager/manager.yaml index 44bc8cba..ae34784f 100644 --- a/app/caps-controller-manager/config/manager/manager.yaml +++ b/app/caps-controller-manager/config/manager/manager.yaml @@ -30,4 +30,16 @@ spec: requests: cpu: 100m memory: 128Mi + ports: + - containerPort: 9440 + name: healthz + protocol: TCP + readinessProbe: + httpGet: + path: /readyz + port: healthz + livenessProbe: + httpGet: + path: /healthz + port: healthz terminationGracePeriodSeconds: 10 diff --git a/app/caps-controller-manager/main.go b/app/caps-controller-manager/main.go index 7d159c64..4a53b13e 100644 --- a/app/caps-controller-manager/main.go +++ b/app/caps-controller-manager/main.go @@ -51,11 +51,13 @@ func init() { func main() { var ( metricsAddr string + healthAddr string enableLeaderElection bool webhookPort int ) flag.StringVar(&metricsAddr, "metrics-bind-addr", ":8080", "The address the metric endpoint binds to.") + flag.StringVar(&healthAddr, "health-addr", ":9440", "The address the health endpoint binds to.") flag.BoolVar(&enableLeaderElection, "enable-leader-election", true, "Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.") flag.IntVar(&webhookPort, "webhook-port", 9443, "Webhook Server port, disabled by default. 
When enabled, the manager will only work as webhook server, no reconcilers are installed.") @@ -82,12 +84,13 @@ func main() { }) mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ - Scheme: scheme, - MetricsBindAddress: metricsAddr, - LeaderElection: enableLeaderElection, - LeaderElectionID: "controller-leader-election-capm", - Port: webhookPort, - EventBroadcaster: broadcaster, + Scheme: scheme, + MetricsBindAddress: metricsAddr, + LeaderElection: enableLeaderElection, + LeaderElectionID: "controller-leader-election-capm", + Port: webhookPort, + EventBroadcaster: broadcaster, + HealthProbeBindAddress: healthAddr, }) if err != nil { setupLog.Error(err, "unable to start manager") @@ -110,7 +113,7 @@ func main() { mgr.GetScheme(), corev1.EventSource{Component: "caps-controller-manager"}) - ctx := context.Background() + ctx := ctrl.SetupSignalHandler() if err = (&controllers.MetalClusterReconciler{ Client: mgr.GetClient(), @@ -162,10 +165,24 @@ func main() { } // +kubebuilder:scaffold:builder + setupChecks(mgr) + setupLog.Info("starting manager") - if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + if err := mgr.Start(ctx); err != nil { setupLog.Error(err, "problem running manager") os.Exit(1) } } + +func setupChecks(mgr ctrl.Manager) { + if err := mgr.AddReadyzCheck("webhook", mgr.GetWebhookServer().StartedChecker()); err != nil { + setupLog.Error(err, "unable to create ready check") + os.Exit(1) + } + + if err := mgr.AddHealthzCheck("webhook", mgr.GetWebhookServer().StartedChecker()); err != nil { + setupLog.Error(err, "unable to create health check") + os.Exit(1) + } +} diff --git a/app/sidero-controller-manager/config/manager/manager.yaml b/app/sidero-controller-manager/config/manager/manager.yaml index 45a119f5..62f556d1 100644 --- a/app/sidero-controller-manager/config/manager/manager.yaml +++ b/app/sidero-controller-manager/config/manager/manager.yaml @@ -82,6 +82,9 @@ spec: - name: http containerPort: 
${SIDERO_CONTROLLER_MANAGER_CONTAINER_API_PORT:=8081} protocol: TCP + - containerPort: 9440 + name: healthz + protocol: TCP env: - name: API_ENDPOINT valueFrom: @@ -96,14 +99,12 @@ spec: memory: 128Mi readinessProbe: httpGet: - path: /healthz - port: http - initialDelaySeconds: 15 + path: /readyz + port: healthz livenessProbe: httpGet: path: /healthz - port: http - initialDelaySeconds: 15 + port: healthz - command: - /siderolink-manager args: diff --git a/app/sidero-controller-manager/internal/healthz/healthz.go b/app/sidero-controller-manager/internal/healthz/healthz.go deleted file mode 100644 index d924aa72..00000000 --- a/app/sidero-controller-manager/internal/healthz/healthz.go +++ /dev/null @@ -1,19 +0,0 @@ -// This Source Code Form is subject to the terms of the Mozilla Public -// License, v. 2.0. If a copy of the MPL was not distributed with this -// file, You can obtain one at http://mozilla.org/MPL/2.0/. - -package healthz - -import ( - "net/http" -) - -func RegisterServer(mux *http.ServeMux) error { - mux.HandleFunc("/healthz", healthzHandler) - - return nil -} - -func healthzHandler(w http.ResponseWriter, req *http.Request) { - // do nothing, consider to be healthy always -} diff --git a/app/sidero-controller-manager/internal/ipxe/ipxe_server.go b/app/sidero-controller-manager/internal/ipxe/ipxe_server.go index 17170227..f70dfd3e 100644 --- a/app/sidero-controller-manager/internal/ipxe/ipxe_server.go +++ b/app/sidero-controller-manager/internal/ipxe/ipxe_server.go @@ -9,12 +9,14 @@ import ( "context" "errors" "fmt" + "io" "log" "net" "net/http" "strconv" "strings" "text/template" + "time" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -23,9 +25,11 @@ import ( "sigs.k8s.io/cluster-api/util/conditions" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/healthz" "github.com/talos-systems/go-procfs/procfs" talosconstants 
"github.com/talos-systems/talos/pkg/machinery/constants" + "github.com/talos-systems/talos/pkg/machinery/kernel" infrav1 "github.com/talos-systems/sidero/app/caps-controller-manager/api/v1alpha3" metalv1alpha1 "github.com/talos-systems/sidero/app/sidero-controller-manager/api/v1alpha1" @@ -369,23 +373,15 @@ func newEnvironment(server *metalv1alpha1.Server, serverBinding *infrav1.ServerB } func newAgentEnvironment(arch string) *metalv1alpha1.Environment { - args := []string{ + args := append([]string(nil), kernel.DefaultArgs...) + args = append(args, "console=tty0", "console=ttyS0", - "ima_appraise=fix", - "ima_hash=sha512", - "ima_template=ima-ng", "initrd=initramfs.xz", "ip=dhcp", - "page_poison=1", "panic=30", - "printk.devkmsg=on", - "pti=on", - "random.trust_cpu=on", - "slab_nomerge=", - "slub_debug=P", fmt.Sprintf("%s=%s:%d", constants.AgentEndpointArg, apiEndpoint, apiPort), - } + ) cmdline := procfs.NewCmdline(strings.Join(args, " ")) extra := procfs.NewCmdline(extraAgentKernelArgs) @@ -513,3 +509,33 @@ func markAsPXEBooted(server *metalv1alpha1.Server) error { Conditions: []clusterv1.ConditionType{metalv1alpha1.ConditionPXEBooted}, }) } + +func Check(addr string) healthz.Checker { + return func(_ *http.Request) error { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://%s/boot.ipxe", addr), nil) + if err != nil { + return err + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return err + } + + defer func() { + if resp.Body != nil { + io.Copy(io.Discard, resp.Body) //nolint:errcheck + resp.Body.Close() //nolint:errcheck + } + }() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("unexpected code %d", resp.StatusCode) + } + + return nil + } +} diff --git a/app/sidero-controller-manager/main.go b/app/sidero-controller-manager/main.go index 91108683..649276da 100644 --- a/app/sidero-controller-manager/main.go 
+++ b/app/sidero-controller-manager/main.go @@ -32,7 +32,6 @@ import ( infrav1 "github.com/talos-systems/sidero/app/caps-controller-manager/api/v1alpha3" metalv1alpha1 "github.com/talos-systems/sidero/app/sidero-controller-manager/api/v1alpha1" "github.com/talos-systems/sidero/app/sidero-controller-manager/controllers" - "github.com/talos-systems/sidero/app/sidero-controller-manager/internal/healthz" "github.com/talos-systems/sidero/app/sidero-controller-manager/internal/ipxe" "github.com/talos-systems/sidero/app/sidero-controller-manager/internal/metadata" "github.com/talos-systems/sidero/app/sidero-controller-manager/internal/power/api" @@ -71,6 +70,7 @@ func init() { func main() { var ( metricsAddr string + healthAddr string apiEndpoint string apiPort int httpPort int @@ -91,6 +91,7 @@ func main() { flag.IntVar(&apiPort, "api-port", 8081, "The TCP port Sidero components can be reached at from the servers.") flag.IntVar(&httpPort, "http-port", 8081, "The TCP port Sidero controller manager HTTP server is running.") flag.StringVar(&metricsAddr, "metrics-bind-addr", ":8081", "The address the metric endpoint binds to.") + flag.StringVar(&healthAddr, "health-addr", ":9440", "The address the health endpoint binds to.") flag.StringVar(&extraAgentKernelArgs, "extra-agent-kernel-args", "", "A list of Linux kernel command line arguments to add to the agent environment kernel parameters (e.g. 'console=tty1 console=ttyS1').") flag.StringVar(&bootFromDiskMethod, "boot-from-disk-method", string(ipxe.BootIPXEExit), "Default method to use to boot server from disk if it hits iPXE endpoint after install.") flag.BoolVar(&enableLeaderElection, "enable-leader-election", true, "Enable leader election for controller manager. 
Enabling this will ensure there is only one active controller manager.") @@ -150,11 +151,12 @@ func main() { api.DefaultDice = api.NewFailureDice(testPowerSimulatedExplicitFailureProb, testPowerSimulatedSilentFailureProb) mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ - Scheme: scheme, - MetricsBindAddress: metricsAddr, - LeaderElection: enableLeaderElection, - LeaderElectionID: "controller-leader-election-sidero-controller-manager", - Port: 9443, + Scheme: scheme, + MetricsBindAddress: metricsAddr, + LeaderElection: enableLeaderElection, + LeaderElectionID: "controller-leader-election-sidero-controller-manager", + Port: 9443, + HealthProbeBindAddress: healthAddr, }) if err != nil { setupLog.Error(err, "unable to start manager") @@ -177,7 +179,7 @@ func main() { mgr.GetScheme(), corev1.EventSource{Component: "sidero-controller-manager"}) - ctx := context.Background() + ctx := ctrl.SetupSignalHandler() if err = (&controllers.EnvironmentReconciler{ Client: mgr.GetClient(), @@ -212,6 +214,9 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "ServerClass") os.Exit(1) } + + setupChecks(mgr, httpPort) + // +kubebuilder:scaffold:builder errCh := make(chan error) @@ -242,13 +247,6 @@ func main() { os.Exit(1) } - setupLog.Info("starting healthz server") - - if err := healthz.RegisterServer(httpMux); err != nil { - setupLog.Error(err, "unable to start healthz server", "controller", "Environment") - os.Exit(1) - } - setupLog.Info("starting internal API server") apiRecorder := eventBroadcaster.NewRecorder( @@ -283,7 +281,7 @@ func main() { setupLog.Info("starting manager and HTTP server") go func() { - err := mgr.Start(ctrl.SetupSignalHandler()) + err := mgr.Start(ctx) if err != nil { setupLog.Error(err, "problem running manager") } @@ -326,3 +324,17 @@ func main() { } } } + +func setupChecks(mgr ctrl.Manager, httpPort int) { + addr := fmt.Sprintf("127.0.0.1:%d", httpPort) + + if err := mgr.AddReadyzCheck("ipxe", 
ipxe.Check(addr)); err != nil { + setupLog.Error(err, "unable to create ready check") + os.Exit(1) + } + + if err := mgr.AddHealthzCheck("webhook", ipxe.Check(addr)); err != nil { + setupLog.Error(err, "unable to create health check") + os.Exit(1) + } +} diff --git a/go.mod b/go.mod index d94cd388..139aabaf 100644 --- a/go.mod +++ b/go.mod @@ -27,7 +27,7 @@ require ( github.com/talos-systems/grpc-proxy v0.2.0 github.com/talos-systems/net v0.3.1 github.com/talos-systems/siderolink v0.1.1-0.20211130121818-9902ad2774f0 - github.com/talos-systems/talos/pkg/machinery v0.14.0 + github.com/talos-systems/talos/pkg/machinery v0.14.1 go.uber.org/zap v1.20.0 golang.org/x/net v0.0.0-20220114011407-0dd24b26b47d golang.org/x/sync v0.0.0-20210220032951-036812b2e83c diff --git a/go.sum b/go.sum index e3265fbf..61de92c9 100644 --- a/go.sum +++ b/go.sum @@ -663,8 +663,9 @@ github.com/talos-systems/siderolink v0.1.1-0.20211130121818-9902ad2774f0/go.mod github.com/talos-systems/talos/pkg/machinery v0.12.3/go.mod h1:qX77JMZawrDTQaJucqecdlFsHy+dbnZ9YL8Kw4qL7d4= github.com/talos-systems/talos/pkg/machinery v0.13.0/go.mod h1:fQx1FlvFLSexSOYL1DSl0EjtazujlzNmVDCt2yRoLJ4= github.com/talos-systems/talos/pkg/machinery v0.14.0-alpha.1.0.20211118180932-1ffa8e048008/go.mod h1:D8NT4Aj+X2OpA6yK6RAtpw1wcgkDS7oD23vqOQWRiP8= -github.com/talos-systems/talos/pkg/machinery v0.14.0 h1:UKk33z236rMWHsSMhu6ExlG1uB5dF7jws3qRDP+yycA= github.com/talos-systems/talos/pkg/machinery v0.14.0/go.mod h1:ctbMKkPJv8aiGfXT2NuWaoHch7fx62GaU81OVOyNVbc= +github.com/talos-systems/talos/pkg/machinery v0.14.1 h1:ecvzW8OMlWxfdGsiL6cVwtEOd4IwIYTIgRaEEFxyuTc= +github.com/talos-systems/talos/pkg/machinery v0.14.1/go.mod h1:ctbMKkPJv8aiGfXT2NuWaoHch7fx62GaU81OVOyNVbc= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= 
github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= diff --git a/sfyra/go.mod b/sfyra/go.mod index d614a3af..b284fa5b 100644 --- a/sfyra/go.mod +++ b/sfyra/go.mod @@ -22,8 +22,8 @@ require ( github.com/talos-systems/go-retry v0.3.1 github.com/talos-systems/net v0.3.1 github.com/talos-systems/sidero v0.0.0-00010101000000-000000000000 - github.com/talos-systems/talos v0.14.0 - github.com/talos-systems/talos/pkg/machinery v0.14.0 + github.com/talos-systems/talos v0.14.1 + github.com/talos-systems/talos/pkg/machinery v0.14.1 google.golang.org/grpc v1.43.0 gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b k8s.io/api v0.23.1 diff --git a/sfyra/go.sum b/sfyra/go.sum index c6518c81..0f2e4dd0 100644 --- a/sfyra/go.sum +++ b/sfyra/go.sum @@ -1125,13 +1125,14 @@ github.com/talos-systems/net v0.3.1 h1:F9mlDgKE4XVfgpoRmacVUTEAMAeQ5xuOaeCl+A9a0 github.com/talos-systems/net v0.3.1/go.mod h1:zhcGixNJz9dgwFiUwc7gkkAqdVqXagU1SNNoIVXYKGo= github.com/talos-systems/siderolink v0.1.0/go.mod h1:bEGwDYl9QgC3oZ4kdnJTuR2HX/XlUhxZjx/QAakKuBc= github.com/talos-systems/siderolink v0.1.1-0.20211130121818-9902ad2774f0/go.mod h1:bEGwDYl9QgC3oZ4kdnJTuR2HX/XlUhxZjx/QAakKuBc= -github.com/talos-systems/talos v0.14.0 h1:3RyC7FgpQ5ZWjiyfz57u4qG/l0+pR52hDSJnGPxBgZg= -github.com/talos-systems/talos v0.14.0/go.mod h1:UWuLwoQGsUHEddTqwfvxNOyYTOzd2n9KiV+pyfIJT5M= +github.com/talos-systems/talos v0.14.1 h1:z6pr4mvs32DiG8M46tb88v2d9jBNLEBq6zxWgSlTC2U= +github.com/talos-systems/talos v0.14.1/go.mod h1:GavvnvQzEp4A00+R8ecpd26TpXWVroRhKLavIK3P2fI= github.com/talos-systems/talos/pkg/machinery v0.12.3/go.mod h1:qX77JMZawrDTQaJucqecdlFsHy+dbnZ9YL8Kw4qL7d4= github.com/talos-systems/talos/pkg/machinery v0.13.0/go.mod h1:fQx1FlvFLSexSOYL1DSl0EjtazujlzNmVDCt2yRoLJ4= github.com/talos-systems/talos/pkg/machinery v0.14.0-alpha.1.0.20211118180932-1ffa8e048008/go.mod h1:D8NT4Aj+X2OpA6yK6RAtpw1wcgkDS7oD23vqOQWRiP8= -github.com/talos-systems/talos/pkg/machinery v0.14.0 
h1:UKk33z236rMWHsSMhu6ExlG1uB5dF7jws3qRDP+yycA= github.com/talos-systems/talos/pkg/machinery v0.14.0/go.mod h1:ctbMKkPJv8aiGfXT2NuWaoHch7fx62GaU81OVOyNVbc= +github.com/talos-systems/talos/pkg/machinery v0.14.1 h1:ecvzW8OMlWxfdGsiL6cVwtEOd4IwIYTIgRaEEFxyuTc= +github.com/talos-systems/talos/pkg/machinery v0.14.1/go.mod h1:ctbMKkPJv8aiGfXT2NuWaoHch7fx62GaU81OVOyNVbc= github.com/tchap/go-patricia v2.2.6+incompatible/go.mod h1:bmLyhP68RS6kStMGxByiQ23RP/odRBOTVjwp2cDyi6I= github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=