Handling of multiple endpoints has already been implemented in #2094. This PR enables the round-robin policy so that gRPC picks a new endpoint for each call (instead of sending every request to the first control plane node). The endpoint list is randomized to handle cases when only one request is going to be sent, so that it doesn't always go to the first node in the list. gRPC handles dead/unresponsive nodes automatically for us. `talosctl cluster create` and the provision tests were switched to use the client-side load balancer for the Talos API.

Additional improvements we got:

* `talosctl` now reports the correct node IP when commands are used without `-n`, rather than the load balancer IP (when using multiple endpoints, of course)
* a load balancer can't provide reliable error handling when an upstream server is unresponsive or no upstreams are available; gRPC returns much more helpful errors

Fixes #1641

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
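As background, here is a minimal sketch of the client-side load-balancing idea described above — not the actual Talos client code. The `dialRoundRobin` helper and the `"talos"` scheme name are illustrative assumptions; the sketch shuffles the endpoint list, feeds it to a manual gRPC resolver, and requests the `round_robin` pick policy via the default service config:

```go
package example

import (
	"math/rand"

	"google.golang.org/grpc"
	"google.golang.org/grpc/resolver"
	"google.golang.org/grpc/resolver/manual"
)

// dialRoundRobin is illustrative only: it dials a randomized set of
// endpoints through a manual resolver and asks gRPC for the round_robin
// load-balancing policy, so each call may land on a different node.
func dialRoundRobin(endpoints []string) (*grpc.ClientConn, error) {
	// Shuffle so that a single one-shot request doesn't always hit the
	// first node in the list.
	rand.Shuffle(len(endpoints), func(i, j int) {
		endpoints[i], endpoints[j] = endpoints[j], endpoints[i]
	})

	r := manual.NewBuilderWithScheme("talos")

	addrs := make([]resolver.Address, len(endpoints))
	for i, e := range endpoints {
		addrs[i] = resolver.Address{Addr: e}
	}

	r.InitialState(resolver.State{Addresses: addrs})

	return grpc.Dial(
		r.Scheme()+":///",
		grpc.WithResolvers(r),
		grpc.WithDefaultServiceConfig(`{"loadBalancingConfig": [{"round_robin":{}}]}`),
		grpc.WithInsecure(), // illustration only; the real client uses TLS credentials
	)
}
```

With this setup, gRPC also handles dead or unresponsive endpoints for us: failed subchannels are taken out of the pick rotation and surface as descriptive status errors rather than opaque connection resets from an intermediate load balancer.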
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Package check provides a set of checks to verify cluster readiness.
package check

import (
	"context"
	"fmt"
	"sort"

	"github.com/hashicorp/go-multierror"

	"github.com/talos-systems/talos/internal/app/machined/pkg/runtime"
	"github.com/talos-systems/talos/pkg/client"
)

// ErrServiceNotFound is an error that indicates that a service was not found.
var ErrServiceNotFound = fmt.Errorf("service not found")

// ServiceStateAssertion checks whether a service has reached one of the specified states.
//
//nolint: gocyclo
func ServiceStateAssertion(ctx context.Context, cluster ClusterInfo, service string, states ...string) error {
	cli, err := cluster.Client()
	if err != nil {
		return err
	}

	// By default, we check all control plane nodes. If some nodes don't have that service running,
	// it won't be returned in the response.
	nodes := append(cluster.NodesByType(runtime.MachineTypeInit), cluster.NodesByType(runtime.MachineTypeControlPlane)...)
	nodesCtx := client.WithNodes(ctx, nodes...)

	servicesInfo, err := cli.ServiceInfo(nodesCtx, service)
	if err != nil {
		return err
	}

	if len(servicesInfo) == 0 {
		return ErrServiceNotFound
	}

	acceptedStates := map[string]struct{}{}
	for _, state := range states {
		acceptedStates[state] = struct{}{}
	}

	var multiErr *multierror.Error

	for _, serviceInfo := range servicesInfo {
		node := serviceInfo.Metadata.GetHostname()

		if len(serviceInfo.Service.Events.Events) == 0 {
			multiErr = multierror.Append(multiErr, fmt.Errorf("%s: no events recorded yet for service %q", node, service))

			continue
		}

		// The most recent event reflects the service's current state.
		lastEvent := serviceInfo.Service.Events.Events[len(serviceInfo.Service.Events.Events)-1]
		if _, ok := acceptedStates[lastEvent.State]; !ok {
			multiErr = multierror.Append(multiErr, fmt.Errorf("%s: service %q not in expected state %q: current state [%s] %s", node, service, states, lastEvent.State, lastEvent.Msg))
		}
	}

	return multiErr.ErrorOrNil()
}

// ServiceHealthAssertion checks that a service has reached the Running state and reports healthy.
//
//nolint: gocyclo
func ServiceHealthAssertion(ctx context.Context, cluster ClusterInfo, service string, setters ...Option) error {
	opts := DefaultOptions()

	for _, setter := range setters {
		if err := setter(opts); err != nil {
			return err
		}
	}

	cli, err := cluster.Client()
	if err != nil {
		return err
	}

	var nodes []string

	if len(opts.Types) > 0 {
		for _, t := range opts.Types {
			nodes = append(nodes, cluster.NodesByType(t)...)
		}
	} else {
		nodes = cluster.Nodes()
	}

	count := len(nodes)

	nodesCtx := client.WithNodes(ctx, nodes...)

	servicesInfo, err := cli.ServiceInfo(nodesCtx, service)
	if err != nil {
		return err
	}

	// Unlike ServiceStateAssertion, every queried node must report back here.
	if len(servicesInfo) != count {
		return fmt.Errorf("expected a response with %d node(s), got %d", count, len(servicesInfo))
	}

	var multiErr *multierror.Error

	// Sort the service info list so that errors returned are consistent.
	sort.Slice(servicesInfo, func(i, j int) bool {
		return servicesInfo[i].Metadata.GetHostname() < servicesInfo[j].Metadata.GetHostname()
	})

	for _, serviceInfo := range servicesInfo {
		node := serviceInfo.Metadata.GetHostname()

		if len(serviceInfo.Service.Events.Events) == 0 {
			multiErr = multierror.Append(multiErr, fmt.Errorf("%s: no events recorded yet for service %q", node, service))

			continue
		}

		lastEvent := serviceInfo.Service.Events.Events[len(serviceInfo.Service.Events.Events)-1]
		if lastEvent.State != "Running" {
			multiErr = multierror.Append(multiErr, fmt.Errorf("%s: service %q not in expected state %q: current state [%s] %s", node, service, "Running", lastEvent.State, lastEvent.Msg))

			continue
		}

		if !serviceInfo.Service.GetHealth().GetHealthy() {
			multiErr = multierror.Append(multiErr, fmt.Errorf("%s: service %q is not healthy", node, service))

			continue
		}
	}

	return multiErr.ErrorOrNil()
}
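For context, a hedged sketch of how a caller might drive these assertions. The import path, the `waitForControlPlane` helper, and the service names/states are assumptions for illustration; `clusterInfo` is any implementation of the package's `ClusterInfo` interface (not shown in this file):

```go
package example

import (
	"context"
	"time"

	// Import path assumed for illustration; the package above is "check".
	"github.com/talos-systems/talos/internal/pkg/cluster/check"
)

// waitForControlPlane is a hypothetical caller of the assertions above.
// Note that each assertion is a single-shot check; any retry loop would
// live at a higher layer.
func waitForControlPlane(clusterInfo check.ClusterInfo) error {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
	defer cancel()

	// Check that etcd is Running and healthy on the nodes selected by
	// the default options.
	if err := check.ServiceHealthAssertion(ctx, clusterInfo, "etcd"); err != nil {
		return err
	}

	// Assert that a one-shot service has reached one of its terminal states.
	return check.ServiceStateAssertion(ctx, clusterInfo, "bootkube", "Finished", "Skipped")
}
```

Because `ServiceStateAssertion` accepts a variadic list of states, a single call can cover several acceptable outcomes, while `ServiceHealthAssertion` is stricter: every queried node must respond, report the Running state, and pass its health check.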