Andrey Smirnov aa14993539
feat: introduce network probes
Network probes are configured via specs and report their results
as a status.

At the moment only platform code can configure network probes.

If any network probes are configured, they affect the 'Connectivity'
flag of network.Status.

For example, create a probe:

```
talosctl -n 172.20.0.3 meta write 0xa '{"probes": [{"interval": "1s", "tcp": {"endpoint": "google.com:80", "timeout": "10s"}}]}'
```

Watch probe status:

```
$ talosctl -n 172.20.0.3 get probe
NODE         NAMESPACE   TYPE          ID                  VERSION   SUCCESS
172.20.0.3   network     ProbeStatus   tcp:google.com:80   5         true
```

With failing probes:

```
$ talosctl -n 172.20.0.3 get probe
NODE         NAMESPACE   TYPE          ID                  VERSION   SUCCESS
172.20.0.3   network     ProbeStatus   tcp:google.com:80   4         true
172.20.0.3   network     ProbeStatus   tcp:google.com:81   1         false
$ talosctl -n 172.20.0.3 get networkstatus
NODE         NAMESPACE   TYPE            ID       VERSION   ADDRESS   CONNECTIVITY   HOSTNAME   ETC
172.20.0.3   network     NetworkStatus   status   5         true      true           true       true

```

Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
2023-03-31 15:20:21 +04:00

157 lines
4.4 KiB
Go

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
package network
import (
"context"
"fmt"
"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/resource"
"github.com/cosi-project/runtime/pkg/safe"
"github.com/cosi-project/runtime/pkg/state"
"go.uber.org/zap"
"github.com/siderolabs/talos/internal/app/machined/pkg/controllers/network/internal/probe"
"github.com/siderolabs/talos/pkg/machinery/resources/network"
)
// ProbeController runs network probes configured with ProbeSpecs and outputs ProbeStatuses.
type ProbeController struct {
	// runners holds the probe runners currently managed by the controller,
	// keyed by the ID of the ProbeSpec each one was started from.
	// The map is (re)created at the start of Run and maintained by reconcileRunners.
	runners map[string]*probe.Runner
}
// Name implements controller.Controller interface.
//
// It returns the fixed name of this controller.
func (ctrl *ProbeController) Name() string {
	const controllerName = "network.ProbeController"

	return controllerName
}
// Inputs implements controller.Controller interface.
//
// The controller declares a single weak input: resources of type
// network.ProbeSpecType in the network namespace.
func (ctrl *ProbeController) Inputs() []controller.Input {
	probeSpecs := controller.Input{
		Namespace: network.NamespaceName,
		Type:      network.ProbeSpecType,
		Kind:      controller.InputWeak,
	}

	return []controller.Input{probeSpecs}
}
// Outputs implements controller.Controller interface.
//
// The controller declares a single exclusive output: resources of type
// network.ProbeStatusType.
func (ctrl *ProbeController) Outputs() []controller.Output {
	probeStatuses := controller.Output{
		Type: network.ProbeStatusType,
		Kind: controller.OutputExclusive,
	}

	return []controller.Output{probeStatuses}
}
// Run implements controller.Controller interface.
//
// It starts with an empty set of runners and then loops over two event sources:
// runtime events, which trigger reconcileRunners, and probe notifications,
// which trigger reconcileOutputs. After every successfully handled event the
// restart backoff is reset.
//
// Run returns nil when ctx is done, or the first error reported by either
// reconcile step. In both cases every runner still registered is stopped
// before Run returns.
func (ctrl *ProbeController) Run(ctx context.Context, r controller.Runtime, logger *zap.Logger) error {
	ctrl.runners = map[string]*probe.Runner{}
	notifications := make(chan probe.Notification)

	// Stop whatever runners are still registered at the moment Run exits.
	defer func() {
		for id := range ctrl.runners {
			ctrl.runners[id].Stop()
		}
	}()

	var err error

	for {
		select {
		case <-ctx.Done():
			return nil
		case <-r.EventCh():
			err = ctrl.reconcileRunners(ctx, r, logger, notifications)
		case notification := <-notifications:
			err = ctrl.reconcileOutputs(ctx, r, notification)
		}

		if err != nil {
			return err
		}

		r.ResetRestartBackoff()
	}
}
// reconcileRunners brings the set of running probes in line with the ProbeSpecs
// currently present in the network namespace.
//
// Steps, in order:
//  1. list the ProbeSpecs and index their specs by resource ID;
//  2. stop and forget runners whose spec is gone or differs from the one they run with;
//  3. start a runner for every spec ID that has no runner (this also restarts
//     the runners dropped in step 2 because their spec changed);
//  4. destroy ProbeStatus resources whose ID has no matching ProbeSpec.
//
// Notifications from started runners are delivered to notifyCh. An error is
// returned if listing specs or statuses fails, or if destroying a status fails
// for a reason other than the status not being found.
//
//nolint:gocyclo
func (ctrl *ProbeController) reconcileRunners(ctx context.Context, r controller.Runtime, logger *zap.Logger, notifyCh chan<- probe.Notification) error {
	specs, err := safe.ReaderList[*network.ProbeSpec](ctx, r, resource.NewMetadata(network.NamespaceName, network.ProbeSpecType, "", resource.VersionUndefined))
	if err != nil {
		return fmt.Errorf("error listing probe specs: %w", err)
	}

	// desired maps probe ID -> spec the probe should be running with
	desired := make(map[string]network.ProbeSpecSpec)

	for it := safe.IteratorFromList(specs); it.Next(); {
		spec := it.Value()
		desired[spec.Metadata().ID()] = *spec.TypedSpec()
	}

	// drop runners which are no longer wanted, or whose spec has changed
	for id, runner := range ctrl.runners {
		wanted, found := desired[id]

		switch {
		case !found:
			logger.Debug("stopping probe", zap.String("probe", id))
		case !wanted.Equal(runner.Spec):
			logger.Debug("replacing probe", zap.String("probe", id))
		default:
			// runner is up to date, leave it alone
			continue
		}

		runner.Stop()
		delete(ctrl.runners, id)
	}

	// launch runners for every desired probe which has none
	for id, spec := range desired {
		if _, running := ctrl.runners[id]; running {
			continue
		}

		runner := &probe.Runner{
			ID:   id,
			Spec: spec,
		}
		ctrl.runners[id] = runner

		logger.Debug("starting probe", zap.String("probe", id))
		runner.Start(ctx, notifyCh, logger)
	}

	// remove statuses which have no corresponding spec anymore
	statuses, err := safe.ReaderList[*network.ProbeStatus](ctx, r, resource.NewMetadata(network.NamespaceName, network.ProbeStatusType, "", resource.VersionUndefined))
	if err != nil {
		return fmt.Errorf("error listing probe statuses: %w", err)
	}

	for it := safe.IteratorFromList(statuses); it.Next(); {
		md := it.Value().Metadata()

		if _, keep := desired[md.ID()]; keep {
			continue
		}

		// a status which is already gone is not an error
		if err = r.Destroy(ctx, md); err != nil && !state.IsNotFoundError(err) {
			return fmt.Errorf("error destroying probe status: %w", err)
		}
	}

	return nil
}
// reconcileOutputs records a probe notification as a ProbeStatus resource.
//
// Notifications whose ID has no registered runner are dropped: the probe was
// already removed and the notification arrived late. Otherwise the ProbeStatus
// with the notification's ID in the network namespace is modified so that its
// spec equals the status carried by the notification. The error from the
// write, if any, is returned as-is.
//
//nolint:gocyclo,cyclop
func (ctrl *ProbeController) reconcileOutputs(ctx context.Context, r controller.Runtime, ev probe.Notification) error {
	if _, active := ctrl.runners[ev.ID]; !active {
		// late notification for a probe which is no longer tracked
		return nil
	}

	target := network.NewProbeStatus(network.NamespaceName, ev.ID)

	overwrite := func(res *network.ProbeStatus) error {
		*res.TypedSpec() = ev.Status

		return nil
	}

	return safe.WriterModify(ctx, r, target, overwrite)
}