Mirror of https://github.com/siderolabs/omni.git
There was a problem with the node resolution (a.k.a. DNS) cache of the nodes. When a machine is in maintenance mode, there is a corresponding `MachineStatus` resource for it, but no `ClusterMachineIdentity`. Both of these resource types trigger updates in the node resolution cache. When a machine was never part of a cluster, the only source is `MachineStatus`, and the cache updates driven by it did not populate the machine ID in the cache. This caused the gRPC router to pick the wrong destination.

Furthermore, we did not remove the cluster and node name information from the cache when a machine was removed from a cluster. The cache then contained obsolete cluster information, causing the Talos gRPC proxy to route requests incorrectly after a machine was removed from a cluster.

Co-authored-by: Artem Chernyshev <artem.chernyshev@talos-systems.com>
Signed-off-by: Utku Ozdemir <utku.ozdemir@siderolabs.com>
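
For context, a minimal, hypothetical usage sketch of the service defined in the file below (it assumes `omniState` (a COSI `state.State`), `logger` (a `*zap.Logger`), and `ctx` already exist; the cluster and node names are made-up examples, and error handling is elided):

    svc := dns.NewService(omniState, logger)

    // Start blocks, keeping the node resolution cache up to date from
    // MachineStatus and ClusterMachineIdentity resources.
    go svc.Start(ctx) //nolint:errcheck

    // Look a node up by cluster plus node name, address, or machine UUID.
    info := svc.Resolve("talos-default", "worker-1")
    endpoint := info.GetAddress() // falls back to the management endpoint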
// Copyright (c) 2025 Sidero Labs, Inc.
//
// Use of this software is governed by the Business Source License
// included in the LICENSE file.

// Package dns provides node name to node IP lookups, similar to a DNS service.
package dns

import (
	"context"
	"errors"
	"fmt"
	"slices"
	"sync"

	"github.com/cosi-project/runtime/pkg/resource"
	"github.com/cosi-project/runtime/pkg/state"
	"go.uber.org/zap"

	"github.com/siderolabs/omni/client/pkg/omni/resources"
	"github.com/siderolabs/omni/client/pkg/omni/resources/omni"
)

type record struct {
	cluster string
	name    string
}

// Info contains information about a node.
type Info struct {
	Cluster      string
	ID           string
	Name         string
	TalosVersion string

	address            string
	managementEndpoint string

	Ambiguous bool
}

// NewInfo creates an Info, setting its unexported fields from outside the package.
func NewInfo(cluster, id, name, address string) Info {
	return Info{
		Cluster: cluster,
		ID:      id,
		Name:    name,
		address: address,
	}
}

// GetAddress reads the node address from the DNS info, falling back to the
// management endpoint when the node has no in-cluster address (e.g., a machine
// in maintenance mode).
func (i Info) GetAddress() string {
	if i.address != "" {
		return i.address
	}

	return i.managementEndpoint
}
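
// resolverMap maps a node name or address to the set of machine IDs it is
// known under; a key that maps to more than one machine is ambiguous.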
type resolverMap map[string][]resource.ID

func (m resolverMap) get(key string) []resource.ID {
	return m[key]
}

func (m resolverMap) add(key string, id resource.ID) {
	if slices.Index(m[key], id) != -1 {
		return
	}

	m[key] = append(m[key], id)
}

func (m resolverMap) remove(key string, id resource.ID) {
	ids, ok := m[key]
	if !ok {
		return
	}

	ids = slices.DeleteFunc(ids, func(item string) bool { return item == id })
	if len(ids) == 0 {
		delete(m, key)
	} else {
		m[key] = ids
	}
}

// Service is the DNS service.
type Service struct {
	omniState state.State
	logger    *zap.Logger

	recordToMachineID map[record]resource.ID
	machineIDToInfo   map[resource.ID]Info
	addressToID       resolverMap
	nodenameToID      resolverMap

	lock sync.Mutex
}

// NewService creates a new DNS service. It needs to be started before use.
func NewService(omniState state.State, logger *zap.Logger) *Service {
	return &Service{
		omniState:         omniState,
		logger:            logger,
		recordToMachineID: make(map[record]resource.ID),
		machineIDToInfo:   make(map[resource.ID]Info),
		addressToID:       make(resolverMap),
		nodenameToID:      make(resolverMap),
	}
}

// Start starts the DNS service.
func (d *Service) Start(ctx context.Context) error {
	ch := make(chan state.Event)

	// Watch both resource types. With bootstrap contents enabled, the existing
	// resources are delivered first, followed by a Bootstrapped event and then
	// live updates.
	if err := d.omniState.WatchKind(ctx,
		omni.NewClusterMachineIdentity(resources.DefaultNamespace, "").Metadata(), ch,
		state.WithBootstrapContents(true),
	); err != nil {
		return err
	}

	if err := d.omniState.WatchKind(ctx,
		omni.NewMachineStatus(resources.DefaultNamespace, "").Metadata(), ch,
		state.WithBootstrapContents(true),
	); err != nil {
		return err
	}

	for {
		select {
		case <-ctx.Done():
			if errors.Is(ctx.Err(), context.Canceled) {
				d.logger.Debug("stopping DNS service")

				return nil
			}

			return fmt.Errorf("dns service context error: %w", ctx.Err())
		case ev := <-ch:
			switch ev.Type {
			case state.Errored:
				return fmt.Errorf("dns service received an error event: %w", ev.Error)
			case state.Bootstrapped, state.Noop:
				// ignore
			case state.Destroyed:
				if ev.Resource == nil {
					d.logger.Warn("dns service received a destroyed event without a resource")

					continue
				}

				switch r := ev.Resource.(type) {
				case *omni.ClusterMachineIdentity:
					d.deleteIdentityMappings(r.Metadata().ID())
				case *omni.MachineStatus:
					d.deleteMachineMappings(r.Metadata().ID())
				}
			case state.Created, state.Updated:
				switch r := ev.Resource.(type) {
				case *omni.ClusterMachineIdentity:
					d.updateEntryByIdentity(r)
				case *omni.MachineStatus:
					d.updateEntryByMachineStatus(r)
				default:
					d.logger.Warn(
						"dns service received an event with an unexpected resource type",
						zap.String("type", fmt.Sprintf("%T", r)),
					)

					continue
				}
			}
		}
	}
}

func (d *Service) updateEntryByIdentity(res *omni.ClusterMachineIdentity) {
	nodeName := res.TypedSpec().Value.Nodename
	if nodeName == "" {
		d.logger.Warn("received cluster machine identity without a node name", zap.String("id", res.Metadata().ID()))

		return
	}

	clusterName, ok := res.Metadata().Labels().Get(omni.LabelCluster)
	if !ok {
		d.logger.Warn("received cluster machine identity without cluster label", zap.String("id", res.Metadata().ID()))

		return
	}

	d.lock.Lock()
	defer d.lock.Unlock()

	// create or update info
	info := d.machineIDToInfo[res.Metadata().ID()]

	info.Cluster = clusterName
	info.ID = res.Metadata().ID()

	previousAddress := info.address
	previousNodename := info.Name

	info.Name = nodeName

	nodeIPs := res.TypedSpec().Value.NodeIps
	if len(nodeIPs) == 0 {
		info.address = ""
	} else {
		info.address = nodeIPs[0]
	}

	d.machineIDToInfo[res.Metadata().ID()] = info

	// create entry by node name
	d.recordToMachineID[record{
		cluster: clusterName,
		name:    nodeName,
	}] = res.Metadata().ID()

	// create entry by machine ID
	d.recordToMachineID[record{
		cluster: clusterName,
		name:    res.Metadata().ID(),
	}] = res.Metadata().ID()

	d.nodenameToID.add(nodeName, res.Metadata().ID())

	// create entry by address
	if info.address != "" {
		d.recordToMachineID[record{
			cluster: clusterName,
			name:    info.address,
		}] = res.Metadata().ID()

		d.addressToID.add(info.address, res.Metadata().ID())
	}

	// clean up the old entry by address
	if previousAddress != "" && previousAddress != info.address {
		delete(d.recordToMachineID, record{
			cluster: clusterName,
			name:    previousAddress,
		})

		d.addressToID.remove(previousAddress, res.Metadata().ID())
	}

	// clean up the old entry by node name
	if previousNodename != "" && previousNodename != info.Name {
		delete(d.recordToMachineID, record{
			cluster: clusterName,
			name:    previousNodename,
		})

		d.nodenameToID.remove(previousNodename, res.Metadata().ID())
	}

	d.logger.Debug(
		"set node DNS entry",
		zap.String("id", res.Metadata().ID()),
		zap.String("cluster", clusterName),
		zap.String("node_name", nodeName),
		zap.String("address", info.address),
	)
}

func (d *Service) updateEntryByMachineStatus(res *omni.MachineStatus) {
	version := res.TypedSpec().Value.TalosVersion
	if version == "" {
		d.logger.Warn("no Talos version in the machine status", zap.String("id", res.Metadata().ID()))

		return
	}

	d.lock.Lock()
	defer d.lock.Unlock()

	info := d.machineIDToInfo[res.Metadata().ID()]

	// populate the machine ID here as well: for machines that were never part of
	// a cluster (e.g., machines in maintenance mode), MachineStatus is the only
	// source of cache updates
	info.ID = res.Metadata().ID()
	info.TalosVersion = version
	info.managementEndpoint = res.TypedSpec().Value.ManagementAddress

	d.machineIDToInfo[res.Metadata().ID()] = info

	d.logger.Debug(
		"update machine id -> address mapping",
		zap.String("id", res.Metadata().ID()),
		zap.String("talos_version", version),
	)
}

func (d *Service) deleteIdentityMappings(id resource.ID) {
	d.lock.Lock()
	defer d.lock.Unlock()

	info, infoOk := d.machineIDToInfo[id]
	if infoOk {
		delete(d.recordToMachineID, record{
			cluster: info.Cluster,
			name:    info.ID,
		})
		delete(d.recordToMachineID, record{
			cluster: info.Cluster,
			name:    info.Name,
		})
		delete(d.recordToMachineID, record{
			cluster: info.Cluster,
			name:    info.address,
		})

		d.addressToID.remove(info.address, id)
		d.nodenameToID.remove(info.Name, id)
	}

	// log before clearing the fields, so that the removed values are reported
	d.logger.Debug(
		"deleted node identity DNS entry",
		zap.String("id", id),
		zap.String("cluster", info.Cluster),
		zap.String("node_name", info.Name),
		zap.String("address", info.address),
	)

	// clear the cluster and node name information, so that stale entries do not
	// cause requests to be proxied to the wrong destination after a machine is
	// removed from a cluster
	info.Cluster = ""
	info.Name = ""
	info.address = ""

	d.machineIDToInfo[id] = info
}

func (d *Service) deleteMachineMappings(id resource.ID) {
	d.lock.Lock()
	defer d.lock.Unlock()

	info, infoOk := d.machineIDToInfo[id]
	if !infoOk {
		return
	}

	delete(d.machineIDToInfo, id)

	d.logger.Debug(
		"deleted node machine status DNS entry",
		zap.String("id", id),
		zap.String("cluster", info.Cluster),
		zap.String("node_name", info.Name),
		zap.String("address", info.address),
	)
}

func (d *Service) resolveByAddressOrNodename(name string) (Info, bool) {
	for _, resolver := range []resolverMap{
		d.addressToID,
		d.nodenameToID,
	} {
		ids := resolver.get(name)
		if len(ids) > 1 {
			// the same address or node name maps to more than one machine, so the
			// lookup cannot be resolved to a single node
			return Info{
				Name:      name,
				Ambiguous: true,
			}, true
		}

		if len(ids) > 0 {
			return d.machineIDToInfo[ids[0]], true
		}
	}

	return Info{}, false
}

// Resolve returns the dns.Info for the given node name, address or machine UUID.
func (d *Service) Resolve(clusterName, name string) Info {
	d.lock.Lock()
	defer d.lock.Unlock()

	nodeID, ok := d.recordToMachineID[record{
		cluster: clusterName,
		name:    name,
	}]
	if !ok {
		if info, found := d.resolveByAddressOrNodename(name); found {
			return info
		}

		// fall back to treating the name as a machine ID
		return d.machineIDToInfo[name]
	}

	return d.machineIDToInfo[nodeID]
}