fix: provide proper AWS platform metadata

1. Explicitly enable DHCPv4 on v4 instances.
2. Run DHCPv6 if the primary interface has IPv6 addresses.
3. Support v6-only environments.
4. Add DNS for v6.

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
This commit is contained in:
Andrey Smirnov 2026-05-01 19:49:59 +04:00
parent 4f11f021de
commit 1371596d75
No known key found for this signature in database
GPG Key ID: 322C6F63F594CE7C
9 changed files with 528 additions and 43 deletions

View File

@ -12,7 +12,9 @@ import (
"log"
"net/netip"
"strings"
"time"
"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/feature/ec2/imds"
"github.com/cosi-project/runtime/pkg/state"
@ -20,7 +22,7 @@ import (
"github.com/siderolabs/go-retry/retry"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/v1alpha1/platform/errors"
platformerrors "github.com/siderolabs/talos/internal/app/machined/pkg/runtime/v1alpha1/platform/errors"
"github.com/siderolabs/talos/internal/app/machined/pkg/runtime/v1alpha1/platform/internal/netutils"
"github.com/siderolabs/talos/pkg/machinery/constants"
"github.com/siderolabs/talos/pkg/machinery/imager/quirks"
@ -28,26 +30,122 @@ import (
runtimeres "github.com/siderolabs/talos/pkg/machinery/resources/runtime"
)
// awsInterfaceName is the name of the (single) network interface configured by the AWS platform.
//
// It is used both as the name of the link that is brought up and as the link name
// the DHCPv4/DHCPv6 operators are attached to.
const awsInterfaceName = "eth0"
// awsIPv6DNSServer is the unique local (ULA, fd00::/8) IPv6 address of the Amazon-provided DNS resolver inside a VPC.
//
// See https://docs.aws.amazon.com/vpc/latest/userguide/AmazonDNS-concepts.html
//
// AWS distributes IPv6 addresses via DHCPv6 (stateful) and the IPv6 default route via Router Advertisements
// (the kernel installs it automatically when accept_ra is enabled). DHCPv6 in EC2 does not advertise
// resolvers, so we configure the well-known DNS address explicitly to make IPv6-only instances usable.
const awsIPv6DNSServer = "fd00:ec2::253"
// AWS is the concrete type that implements the runtime.Platform interface.
type AWS struct {
metadataClient *imds.Client
cfg aws.Config
}
// NewAWS initializes AWS platform building the IMDS client.
// NewAWS initializes AWS platform.
//
// The IMDS client is built lazily on first use because the IMDS endpoint
// (IPv4 vs IPv6) can only be determined once the network is reachable —
// AWS supports IPv4-only, IPv6-only, and dual-stack instances and the SDK
// will not auto-fall back between the two endpoints.
func NewAWS() (*AWS, error) {
a := &AWS{}
cfg, err := config.LoadDefaultConfig(context.TODO())
if err != nil {
return nil, fmt.Errorf("error initializing AWS default config: %w", err)
}
a.metadataClient = imds.NewFromConfig(cfg)
return &AWS{cfg: cfg}, nil
}
return a, nil
// buildIMDSClient returns an IMDS client bound to whichever endpoint (IPv4 or
// IPv6) answers a probe.
//
// Each attempt delegates to probeIMDS. A failed attempt is reported to the
// retryer as an expected error so that it is tried again; the retryer is a
// constant-interval one configured with a 30 second duration and 2 second
// units, because the network stack may not be fully up on the first attempt.
//
// The returned error wraps the retryer's error when no attempt succeeds or
// ctx is done first.
func (a *AWS) buildIMDSClient(ctx context.Context) (*imds.Client, error) {
	var client *imds.Client

	// attempt performs a single probe and records the winning client.
	attempt := func(ctx context.Context) error {
		probed, err := a.probeIMDS(ctx)
		if err != nil {
			return retry.ExpectedError(err)
		}

		client = probed

		return nil
	}

	retryer := retry.Constant(
		30*time.Second,
		retry.WithUnits(2*time.Second),
		retry.WithErrorLogging(true),
	)

	if err := retryer.RetryWithContext(ctx, attempt); err != nil {
		return nil, fmt.Errorf("failed to reach IMDS on IPv4 or IPv6: %w", err)
	}

	return client, nil
}
// probeIMDS races a metadata request against the IPv4 and IPv6 IMDS endpoints
// and returns a client bound to whichever one responds successfully first.
//
// Both requests share a 5 second timeout derived from ctx. The result channel
// is buffered for every candidate, so a probe goroutine can always deliver its
// result and exit even after this function has returned.
//
// If every endpoint fails, the returned error wraps the failure of each
// endpoint, labelled with the address family. If ctx is done before a result
// arrives, ctx.Err() is returned.
func (a *AWS) probeIMDS(ctx context.Context) (*imds.Client, error) {
	probeCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
	defer cancel()

	type result struct {
		client *imds.Client
		name   string
		err    error
	}

	candidates := []struct {
		mode imds.EndpointModeState
		name string
	}{
		{imds.EndpointModeStateIPv4, "IPv4"},
		{imds.EndpointModeStateIPv6, "IPv6"},
	}

	ch := make(chan result, len(candidates))

	for _, c := range candidates {
		client := imds.NewFromConfig(a.cfg, func(o *imds.Options) {
			o.EndpointMode = c.mode
		})

		go func(client *imds.Client, name string) {
			out, err := client.GetMetadata(probeCtx, &imds.GetMetadataInput{Path: "instance-id"})
			if err == nil && out != nil && out.Content != nil {
				// Only reachability matters here; release the response body
				// instead of leaking it.
				out.Content.Close() //nolint:errcheck
			}

			ch <- result{client: client, name: name, err: err}
		}(client, c.name)
	}

	var probeErr error

	for range candidates {
		select {
		case r := <-ch:
			if r.err == nil {
				log.Printf("AWS IMDS reachable via %s endpoint", r.name)

				return r.client, nil
			}

			// Keep every endpoint's failure so the caller sees why both
			// address families were unreachable, not just the slower one.
			failure := fmt.Errorf("%s endpoint: %w", r.name, r.err)
			if probeErr == nil {
				probeErr = failure
			} else {
				probeErr = fmt.Errorf("%w; %w", probeErr, failure)
			}
		case <-ctx.Done():
			return nil, ctx.Err()
		}
	}

	return nil, probeErr
}
// ParseMetadata converts AWS platform metadata into platform network config.
//
//nolint:gocyclo
func (a *AWS) ParseMetadata(metadata *MetadataConfig) (*runtime.PlatformNetworkConfig, error) {
networkConfig := &runtime.PlatformNetworkConfig{
TimeServers: []network.TimeServerSpecSpec{
@ -77,18 +175,64 @@ func (a *AWS) ParseMetadata(metadata *MetadataConfig) (*runtime.PlatformNetworkC
networkConfig.Hostnames = append(networkConfig.Hostnames, hostnameSpec)
}
var publicIPs []string
// Configure the primary interface based on which address families are present.
//
// AWS supports IPv4-only, IPv6-only, and dual-stack instances. We detect IPv6-only
// by the absence of any IPv4 address (neither private nor public) and skip DHCPv4
// in that case so the network comes up cleanly without a doomed DHCPv4 client.
//
// The IPv6 default gateway is delivered via Router Advertisements (the kernel adds
// it when accept_ra is on) — DHCPv6 only hands out addresses — so we don't add any
// static route for IPv6 here.
if iface := metadata.PrimaryInterface; iface != nil {
networkConfig.Links = append(networkConfig.Links, network.LinkSpecSpec{
Name: awsInterfaceName,
Up: true,
ConfigLayer: network.ConfigPlatform,
})
hasIPv4 := len(iface.LocalIPv4s) > 0 || metadata.PublicIPv4 != ""
hasIPv6 := len(iface.IPv6s) > 0
// Default to IPv4 if the metadata is ambiguous (e.g. neither list populated).
if !hasIPv4 && !hasIPv6 {
hasIPv4 = true
}
if hasIPv4 {
networkConfig.Operators = append(networkConfig.Operators, network.OperatorSpecSpec{
Operator: network.OperatorDHCP4,
LinkName: awsInterfaceName,
RequireUp: true,
DHCP4: network.DHCP4OperatorSpec{
RouteMetric: network.DefaultRouteMetric,
},
ConfigLayer: network.ConfigPlatform,
})
}
if hasIPv6 {
networkConfig.Operators = append(networkConfig.Operators, network.OperatorSpecSpec{
Operator: network.OperatorDHCP6,
LinkName: awsInterfaceName,
RequireUp: true,
DHCP6: network.DHCP6OperatorSpec{
RouteMetric: network.DefaultRouteMetric,
},
ConfigLayer: network.ConfigPlatform,
})
dns, _ := netip.ParseAddr(awsIPv6DNSServer) //nolint:errcheck
networkConfig.Resolvers = append(networkConfig.Resolvers, network.ResolverSpecSpec{
DNSServers: []netip.Addr{dns},
ConfigLayer: network.ConfigPlatform,
})
}
}
if metadata.PublicIPv4 != "" {
publicIPs = append(publicIPs, metadata.PublicIPv4)
}
if metadata.PublicIPv6 != "" {
publicIPs = append(publicIPs, metadata.PublicIPv6)
}
for _, ipStr := range publicIPs {
if ip, err := netip.ParseAddr(ipStr); err == nil {
if ip, err := netip.ParseAddr(metadata.PublicIPv4); err == nil {
networkConfig.ExternalIPs = append(networkConfig.ExternalIPs, ip)
}
}
@ -123,23 +267,30 @@ func (a *AWS) Configuration(ctx context.Context, r state.State) ([]byte, error)
log.Printf("fetching machine config from AWS")
userdata, err := netutils.RetryFetch(ctx, a.fetchConfiguration)
client, err := a.buildIMDSClient(ctx)
if err != nil {
return nil, err
}
userdata, err := netutils.RetryFetch(ctx, func(ctx context.Context) (string, error) {
return fetchConfiguration(ctx, client)
})
if err != nil {
return nil, err
}
if strings.TrimSpace(userdata) == "" {
return nil, errors.ErrNoConfigSource
return nil, platformerrors.ErrNoConfigSource
}
return []byte(userdata), nil
}
func (a *AWS) fetchConfiguration(ctx context.Context) (string, error) {
resp, err := a.metadataClient.GetUserData(ctx, &imds.GetUserDataInput{})
func fetchConfiguration(ctx context.Context, client *imds.Client) (string, error) {
resp, err := client.GetUserData(ctx, &imds.GetUserDataInput{})
if err != nil {
if isNotFoundError(err) {
return "", errors.ErrNoConfigSource
return "", platformerrors.ErrNoConfigSource
}
return "", retry.ExpectedErrorf("failed to fetch EC2 userdata: %w", err)
@ -167,9 +318,29 @@ func (a *AWS) KernelArgs(string, quirks.Quirks) procfs.Parameters {
// NetworkConfiguration implements the runtime.Platform interface.
func (a *AWS) NetworkConfiguration(ctx context.Context, _ state.State, ch chan<- *runtime.PlatformNetworkConfig) error {
// Emit a bootstrap config before fetching IMDS. In IPv6-only deployments
// (subnets with IPv4 disabled) the IMDS endpoint at [fd00:ec2::254] is only
// reachable from a non-link-local IPv6 address — which we get from DHCPv6 —
// but the DHCPv6 operator only starts once the platform publishes a config
// asking for it. Without this step the platform deadlocks: IMDS is
// unreachable, NetworkConfiguration never returns, and DHCPv6 never runs.
// The bootstrap brings up eth0 and starts both DHCPv4 and DHCPv6 so either
// family can come up; the post-IMDS config below replaces it with the
// family that actually applies.
select {
case ch <- bootstrapNetworkConfig():
case <-ctx.Done():
return ctx.Err()
}
log.Printf("fetching aws instance config")
metadata, err := a.getMetadata(ctx)
client, err := a.buildIMDSClient(ctx)
if err != nil {
return err
}
metadata, err := a.getMetadata(ctx, client)
if err != nil {
return err
}
@ -187,3 +358,39 @@ func (a *AWS) NetworkConfiguration(ctx context.Context, _ state.State, ch chan<-
return nil
}
// bootstrapNetworkConfig builds the platform network config that is emitted
// before any IMDS metadata is available.
//
// The config brings up the primary interface and attaches both a DHCPv4 and a
// DHCPv6 operator to it, so the instance can obtain an address — and reach
// IMDS — regardless of which address family the VPC exposes.
func bootstrapNetworkConfig() *runtime.PlatformNetworkConfig {
	link := network.LinkSpecSpec{
		Name:        awsInterfaceName,
		Up:          true,
		ConfigLayer: network.ConfigPlatform,
	}

	dhcp4 := network.OperatorSpecSpec{
		Operator:  network.OperatorDHCP4,
		LinkName:  awsInterfaceName,
		RequireUp: true,
		DHCP4: network.DHCP4OperatorSpec{
			RouteMetric: network.DefaultRouteMetric,
		},
		ConfigLayer: network.ConfigPlatform,
	}

	dhcp6 := network.OperatorSpecSpec{
		Operator:  network.OperatorDHCP6,
		LinkName:  awsInterfaceName,
		RequireUp: true,
		DHCP6: network.DHCP6OperatorSpec{
			RouteMetric: network.DefaultRouteMetric,
		},
		ConfigLayer: network.ConfigPlatform,
	}

	return &runtime.PlatformNetworkConfig{
		Links:     []network.LinkSpecSpec{link},
		Operators: []network.OperatorSpecSpec{dhcp4, dhcp6},
	}
}

View File

@ -19,21 +19,57 @@ import (
//go:embed testdata/metadata.json
var rawMetadata []byte
//go:embed testdata/metadata-v6.json
var rawMetadataV6 []byte
//go:embed testdata/metadata-v6only.json
var rawMetadataV6Only []byte
//go:embed testdata/expected.yaml
var expectedNetworkConfig string
func TestEmpty(t *testing.T) {
p := &aws.AWS{}
//go:embed testdata/expected-v6.yaml
var expectedNetworkConfigV6 string
var metadata aws.MetadataConfig
//go:embed testdata/expected-v6only.yaml
var expectedNetworkConfigV6Only string
require.NoError(t, json.Unmarshal(rawMetadata, &metadata))
func TestParseMetadata(t *testing.T) {
for _, tt := range []struct {
name string
raw []byte
expected string
}{
{
name: "IPv4 only",
raw: rawMetadata,
expected: expectedNetworkConfig,
},
{
name: "dual stack",
raw: rawMetadataV6,
expected: expectedNetworkConfigV6,
},
{
name: "IPv6 only",
raw: rawMetadataV6Only,
expected: expectedNetworkConfigV6Only,
},
} {
t.Run(tt.name, func(t *testing.T) {
p := &aws.AWS{}
networkConfig, err := p.ParseMetadata(&metadata)
require.NoError(t, err)
var metadata aws.MetadataConfig
marshaled, err := yaml.Marshal(networkConfig)
require.NoError(t, err)
require.NoError(t, json.Unmarshal(tt.raw, &metadata))
assert.Equal(t, expectedNetworkConfig, string(marshaled))
networkConfig, err := p.ParseMetadata(&metadata)
require.NoError(t, err)
marshaled, err := yaml.Marshal(networkConfig)
require.NoError(t, err)
assert.Equal(t, tt.expected, string(marshaled))
})
}
}

View File

@ -23,19 +23,31 @@ type MetadataConfig struct {
InstanceType string `json:"instance-type,omitempty"`
InstanceLifeCycle string `json:"instance-life-cycle,omitempty"`
PublicIPv4 string `json:"public-ipv4,omitempty"`
PublicIPv6 string `json:"ipv6,omitempty"`
InternalDNS string `json:"local-hostname,omitempty"`
ExternalDNS string `json:"public-hostname,omitempty"`
Region string `json:"region,omitempty"`
Zone string `json:"zone,omitempty"`
Tags map[string]string `json:"tags,omitempty"`
// PrimaryInterface holds metadata for the primary network interface.
//
// Talos only supports a single NIC on AWS, so secondary interfaces are ignored.
PrimaryInterface *InterfaceConfig `json:"primary-interface,omitempty"`
}
// InterfaceConfig holds the IMDS metadata for a single network interface.
//
// The values are read from the network/interfaces/macs/<mac>/ subtree of IMDS.
type InterfaceConfig struct {
// MAC is the interface's MAC address as listed under network/interfaces/macs/.
MAC string `json:"mac,omitempty"`
// DeviceNumber is the whitespace-trimmed value of the interface's device-number key.
DeviceNumber string `json:"device-number,omitempty"`
// LocalIPv4s lists the addresses reported by the interface's local-ipv4s key.
LocalIPv4s []string `json:"local-ipv4s,omitempty"`
// IPv6s lists the addresses reported by the interface's ipv6s key.
IPv6s []string `json:"ipv6s,omitempty"`
}
//nolint:gocyclo
func (a *AWS) getMetadata(ctx context.Context) (*MetadataConfig, error) {
func (a *AWS) getMetadata(ctx context.Context, client *imds.Client) (*MetadataConfig, error) {
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
getMetadataKey := func(key string) (string, error) {
resp, err := a.metadataClient.GetMetadata(ctx, &imds.GetMetadataInput{
resp, err := client.GetMetadata(ctx, &imds.GetMetadataInput{
Path: key,
})
if err != nil {
@ -77,10 +89,6 @@ func (a *AWS) getMetadata(ctx context.Context) (*MetadataConfig, error) {
return nil, err
}
if metadata.PublicIPv6, err = getMetadataKey("ipv6"); err != nil {
return nil, err
}
if metadata.InternalDNS, err = getMetadataKey("local-hostname"); err != nil {
return nil, err
}
@ -107,9 +115,98 @@ func (a *AWS) getMetadata(ctx context.Context) (*MetadataConfig, error) {
}
}
if metadata.PrimaryInterface, err = a.getPrimaryInterface(getMetadataKey); err != nil {
return nil, err
}
return &metadata, nil
}
// getPrimaryInterface returns metadata for the primary network interface.
//
// IMDS lists every NIC under network/interfaces/macs/, but only one interface
// is reported here. The first listed MAC whose device-number is "0" is taken
// as the primary; if no listed MAC reports device-number 0, the first listed
// MAC is used instead.
//
// getMetadataKey fetches the value of a single IMDS key. Any error it returns
// aborts the lookup and is passed back unchanged. When IMDS lists no MAC
// addresses, a nil interface and a nil error are returned.
//
//nolint:gocyclo
func (a *AWS) getPrimaryInterface(getMetadataKey func(string) (string, error)) (*InterfaceConfig, error) {
	macsList, err := getMetadataKey("network/interfaces/macs/")
	if err != nil {
		return nil, err
	}

	var macs []string

	for line := range strings.Lines(macsList) {
		// One entry per line; surrounding whitespace and a trailing slash are stripped.
		mac := strings.TrimSuffix(strings.TrimSpace(line), "/")
		if mac == "" {
			continue
		}

		macs = append(macs, mac)
	}

	if len(macs) == 0 {
		return nil, nil
	}

	// Start with the fallback (first listed MAC) and upgrade to the MAC with
	// device-number 0 if one is found.
	iface := &InterfaceConfig{MAC: macs[0]}

	for i, mac := range macs {
		deviceNumber, err := getMetadataKey(fmt.Sprintf("network/interfaces/macs/%s/device-number", mac))
		if err != nil {
			return nil, err
		}

		deviceNumber = strings.TrimSpace(deviceNumber)

		// Remember the value already fetched for the selected MAC so it does
		// not have to be requested from IMDS a second time.
		if i == 0 {
			iface.DeviceNumber = deviceNumber
		}

		if deviceNumber == "0" {
			iface.MAC, iface.DeviceNumber = mac, deviceNumber

			break
		}
	}

	if iface.LocalIPv4s, err = fetchAddressList(getMetadataKey, fmt.Sprintf("network/interfaces/macs/%s/local-ipv4s", iface.MAC)); err != nil {
		return nil, err
	}

	if iface.IPv6s, err = fetchAddressList(getMetadataKey, fmt.Sprintf("network/interfaces/macs/%s/ipv6s", iface.MAC)); err != nil {
		return nil, err
	}

	return iface, nil
}
// fetchAddressList reads a newline-separated list of addresses from IMDS.
//
// The value stored at path is fetched with getMetadataKey and split into
// lines; each line is trimmed of surrounding whitespace and blank lines are
// dropped. The result is nil when no addresses are present. An error from
// getMetadataKey is returned unchanged together with a nil list.
func fetchAddressList(getMetadataKey func(string) (string, error), path string) ([]string, error) {
	raw, err := getMetadataKey(path)
	if err != nil {
		return nil, err
	}

	var addrs []string

	for _, entry := range strings.Split(raw, "\n") {
		if trimmed := strings.TrimSpace(entry); trimmed != "" {
			addrs = append(addrs, trimmed)
		}
	}

	return addrs, nil
}
func isNotFoundError(err error) bool {
var awsErr *smithyhttp.ResponseError
if errors.As(err, &awsErr) {

View File

@ -0,0 +1,49 @@
addresses: []
links:
- name: eth0
logical: false
up: true
mtu: 0
kind: ""
type: netrom
layer: platform
routes: []
hostnames:
- hostname: talos
domainname: ""
layer: platform
resolvers:
- dnsServers:
- fd00:ec2::253
layer: platform
timeServers:
- timeServers:
- 169.254.169.123
- fd00:ec2::123
layer: platform
operators:
- operator: dhcp4
linkName: eth0
requireUp: true
dhcp4:
routeMetric: 1024
layer: platform
- operator: dhcp6
linkName: eth0
requireUp: true
dhcp6:
routeMetric: 1024
layer: platform
externalIPs:
- 1.2.3.4
metadata:
platform: aws
hostname: talos
region: us-east-1
zone: us-east-1a
instanceType: t3.micro
instanceId: i-0a0a0a0a0a0a0a0a0
providerId: aws:///us-east-1a/i-0a0a0a0a0a0a0a0a0
spot: true
internalDNS: ip-10-0-0-10.ec2.internal
externalDNS: ec2-1-2-3-4.compute-1.amazonaws.com

View File

@ -0,0 +1,40 @@
addresses: []
links:
- name: eth0
logical: false
up: true
mtu: 0
kind: ""
type: netrom
layer: platform
routes: []
hostnames:
- hostname: talos
domainname: ""
layer: platform
resolvers:
- dnsServers:
- fd00:ec2::253
layer: platform
timeServers:
- timeServers:
- 169.254.169.123
- fd00:ec2::123
layer: platform
operators:
- operator: dhcp6
linkName: eth0
requireUp: true
dhcp6:
routeMetric: 1024
layer: platform
externalIPs: []
metadata:
platform: aws
hostname: talos
region: us-east-1
zone: us-east-1a
instanceType: t3.micro
instanceId: i-0a0a0a0a0a0a0a0a0
providerId: aws:///us-east-1a/i-0a0a0a0a0a0a0a0a0
internalDNS: ip-10-0-0-10.ec2.internal

View File

@ -1,5 +1,12 @@
addresses: []
links: []
links:
- name: eth0
logical: false
up: true
mtu: 0
kind: ""
type: netrom
layer: platform
routes: []
hostnames:
- hostname: talos
@ -11,7 +18,13 @@ timeServers:
- 169.254.169.123
- fd00:ec2::123
layer: platform
operators: []
operators:
- operator: dhcp4
linkName: eth0
requireUp: true
dhcp4:
routeMetric: 1024
layer: platform
externalIPs:
- 1.2.3.4
metadata:
@ -19,7 +32,10 @@ metadata:
hostname: talos
region: us-east-1
zone: us-east-1a
instanceType: t3.micro
instanceId: i-0a0a0a0a0a0a0a0a0
providerId: aws:///us-east-1a/i-0a0a0a0a0a0a0a0a0
internalDNS: ip-10-0-0-10.ec2.internal
externalDNS: ec2-1-2-3-4.compute-1.amazonaws.com
tags:
cluster: mycluster

View File

@ -0,0 +1,17 @@
{
"hostname": "talos",
"instance-id": "i-0a0a0a0a0a0a0a0a0",
"instance-type": "t3.micro",
"instance-life-cycle": "spot",
"public-ipv4": "1.2.3.4",
"local-hostname": "ip-10-0-0-10.ec2.internal",
"public-hostname": "ec2-1-2-3-4.compute-1.amazonaws.com",
"region": "us-east-1",
"zone": "us-east-1a",
"primary-interface": {
"mac": "06:17:04:d7:f0:6b",
"device-number": "0",
"local-ipv4s": ["10.0.0.10"],
"ipv6s": ["2001:db8::1"]
}
}

View File

@ -0,0 +1,14 @@
{
"hostname": "talos",
"instance-id": "i-0a0a0a0a0a0a0a0a0",
"instance-type": "t3.micro",
"instance-life-cycle": "on-demand",
"local-hostname": "ip-10-0-0-10.ec2.internal",
"region": "us-east-1",
"zone": "us-east-1a",
"primary-interface": {
"mac": "06:17:04:d7:f0:6b",
"device-number": "0",
"ipv6s": ["2001:db8::1"]
}
}

View File

@ -1,8 +1,17 @@
{
"hostname": "talos",
"instance-id": "i-0a0a0a0a0a0a0a0a0",
"instance-id": "i-0a0a0a0a0a0a0a0a0",
"instance-type": "t3.micro",
"instance-life-cycle": "on-demand",
"public-ipv4": "1.2.3.4",
"local-hostname": "ip-10-0-0-10.ec2.internal",
"public-hostname": "ec2-1-2-3-4.compute-1.amazonaws.com",
"region": "us-east-1",
"zone": "us-east-1a",
"tags": {"cluster": "mycluster"}
"tags": {"cluster": "mycluster"},
"primary-interface": {
"mac": "06:17:04:d7:f0:6b",
"device-number": "0",
"local-ipv4s": ["10.0.0.10"]
}
}