net/udprelay: XDP PoC

do not merge

Updates tailscale/corp#34849

Signed-off-by: Jordan Whited <jordan@tailscale.com>
This commit is contained in:
Jordan Whited 2025-12-08 14:51:13 -08:00
parent 6a44990b09
commit c3f9d1c22e
9 changed files with 819 additions and 6 deletions

View File

@ -25,6 +25,7 @@ import (
"golang.org/x/crypto/blake2s" "golang.org/x/crypto/blake2s"
"golang.org/x/net/ipv6" "golang.org/x/net/ipv6"
"tailscale.com/disco" "tailscale.com/disco"
"tailscale.com/envknob"
"tailscale.com/net/batching" "tailscale.com/net/batching"
"tailscale.com/net/netaddr" "tailscale.com/net/netaddr"
"tailscale.com/net/netcheck" "tailscale.com/net/netcheck"
@ -34,6 +35,7 @@ import (
"tailscale.com/net/stun" "tailscale.com/net/stun"
"tailscale.com/net/udprelay/endpoint" "tailscale.com/net/udprelay/endpoint"
"tailscale.com/net/udprelay/status" "tailscale.com/net/udprelay/status"
"tailscale.com/net/udprelay/xdp"
"tailscale.com/tailcfg" "tailscale.com/tailcfg"
"tailscale.com/tstime" "tailscale.com/tstime"
"tailscale.com/types/key" "tailscale.com/types/key"
@ -75,6 +77,7 @@ type Server struct {
wg sync.WaitGroup wg sync.WaitGroup
closeCh chan struct{} closeCh chan struct{}
netChecker *netcheck.Client netChecker *netcheck.Client
fib xdp.FIB
mu sync.Mutex // guards the following fields mu sync.Mutex // guards the following fields
macSecrets [][blake2s.Size]byte // [0] is most recent, max 2 elements macSecrets [][blake2s.Size]byte // [0] is most recent, max 2 elements
@ -140,7 +143,7 @@ func blakeMACFromBindMsg(blakeKey [blake2s.Size]byte, src netip.AddrPort, msg di
return out, nil return out, nil
} }
func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex int, discoMsg disco.Message, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) { func (e *serverEndpoint) handleDiscoControlMsg(logf logger.Logf, fib xdp.FIB, from netip.AddrPort, senderIndex int, discoMsg disco.Message, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) {
if senderIndex != 0 && senderIndex != 1 { if senderIndex != 0 && senderIndex != 1 {
return nil, netip.AddrPort{} return nil, netip.AddrPort{}
} }
@ -218,6 +221,12 @@ func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex
e.boundAddrPorts[senderIndex] = from e.boundAddrPorts[senderIndex] = from
e.lastSeen[senderIndex] = time.Now() // record last seen as bound time e.lastSeen[senderIndex] = time.Now() // record last seen as bound time
e.inProgressGeneration[senderIndex] = 0 // reset to zero, which indicates there is no in-progress handshake e.inProgressGeneration[senderIndex] = 0 // reset to zero, which indicates there is no in-progress handshake
if fib != nil && e.boundAddrPorts[0].IsValid() && e.boundAddrPorts[1].IsValid() {
err = fib.Upsert(e.vni, e.boundAddrPorts)
if err != nil {
logf("error upserting fib: %v", err)
}
}
return nil, netip.AddrPort{} return nil, netip.AddrPort{}
} }
} }
@ -229,7 +238,7 @@ func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex
} }
} }
func (e *serverEndpoint) handleSealedDiscoControlMsg(from netip.AddrPort, b []byte, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) { func (e *serverEndpoint) handleSealedDiscoControlMsg(logf logger.Logf, fib xdp.FIB, from netip.AddrPort, b []byte, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) {
senderRaw, isDiscoMsg := disco.Source(b) senderRaw, isDiscoMsg := disco.Source(b)
if !isDiscoMsg { if !isDiscoMsg {
// Not a Disco message // Not a Disco message
@ -260,7 +269,7 @@ func (e *serverEndpoint) handleSealedDiscoControlMsg(from netip.AddrPort, b []by
return nil, netip.AddrPort{} return nil, netip.AddrPort{}
} }
return e.handleDiscoControlMsg(from, senderIndex, discoMsg, serverDisco, macSecrets) return e.handleDiscoControlMsg(logf, fib, from, senderIndex, discoMsg, serverDisco, macSecrets)
} }
func (e *serverEndpoint) handleDataPacket(from netip.AddrPort, b []byte, now time.Time) (write []byte, to netip.AddrPort) { func (e *serverEndpoint) handleDataPacket(from netip.AddrPort, b []byte, now time.Time) (write []byte, to netip.AddrPort) {
@ -323,6 +332,17 @@ func NewServer(logf logger.Logf, port uint16, onlyStaticAddrPorts bool) (s *Serv
byVNI: make(map[uint32]*serverEndpoint), byVNI: make(map[uint32]*serverEndpoint),
} }
s.discoPublic = s.disco.Public() s.discoPublic = s.disco.Public()
xdpDev := envknob.String("TS_PEER_RELAY_XDP_DEVICE")
if xdpDev != "" {
s.fib, err = xdp.NewFIB(&xdp.FIBConfig{
DstPort: port,
DeviceName: xdpDev,
})
}
if err != nil {
return nil, err
}
// TODO(creachadair): Find a way to plumb this in during initialization. // TODO(creachadair): Find a way to plumb this in during initialization.
// As-written, messages published here will not be seen by other components // As-written, messages published here will not be seen by other components
@ -547,11 +567,11 @@ func trySetUDPSocketOptions(pconn nettype.PacketConn, logf logger.Logf) {
func (s *Server) bindSockets(desiredPort uint16) error { func (s *Server) bindSockets(desiredPort uint16) error {
// maxSocketsPerAF is a conservative starting point, but is somewhat // maxSocketsPerAF is a conservative starting point, but is somewhat
// arbitrary. // arbitrary.
maxSocketsPerAF := min(16, runtime.NumCPU()) maxSocketsPerAF := min(128, runtime.NumCPU())
listenConfig := &net.ListenConfig{ listenConfig := &net.ListenConfig{
Control: listenControl, Control: listenControl,
} }
for _, network := range []string{"udp4", "udp6"} { for _, network := range []string{"udp4"} { //, "udp6"} {
SocketsLoop: SocketsLoop:
for i := range maxSocketsPerAF { for i := range maxSocketsPerAF {
if i > 0 { if i > 0 {
@ -626,6 +646,9 @@ func (s *Server) bindSocketTo(listenConfig *net.ListenConfig, network string, po
// Close closes the server. // Close closes the server.
func (s *Server) Close() error { func (s *Server) Close() error {
s.closeOnce.Do(func() { s.closeOnce.Do(func() {
if s.fib != nil {
s.fib.Close()
}
for _, uc4 := range s.uc4 { for _, uc4 := range s.uc4 {
uc4.Close() uc4.Close()
} }
@ -662,6 +685,15 @@ func (s *Server) endpointGCLoop() {
if v.isExpired(now, s.bindLifetime, s.steadyStateLifetime) { if v.isExpired(now, s.bindLifetime, s.steadyStateLifetime) {
delete(s.byDisco, k) delete(s.byDisco, k)
delete(s.byVNI, v.vni) delete(s.byVNI, v.vni)
// TODO: isExpired only considers userspace counters/liveliness
// TODO: this is a syscall per VNI to delete while holding s.mu,
// consider batch delete
if s.fib != nil {
err := s.fib.Delete(v.vni)
if err != nil {
s.logf("failed to delete fib entry: %v", err)
}
}
} }
} }
} }
@ -708,7 +740,7 @@ func (s *Server) handlePacket(from netip.AddrPort, b []byte) (write []byte, to n
} }
msg := b[packet.GeneveFixedHeaderLength:] msg := b[packet.GeneveFixedHeaderLength:]
s.maybeRotateMACSecretLocked(now) s.maybeRotateMACSecretLocked(now)
return e.handleSealedDiscoControlMsg(from, msg, s.discoPublic, s.macSecrets) return e.handleSealedDiscoControlMsg(s.logf, s.fib, from, msg, s.discoPublic, s.macSecrets)
} }
return e.handleDataPacket(from, b, now) return e.handleDataPacket(from, b, now)
} }

View File

@ -0,0 +1,131 @@
// Code generated by bpf2go; DO NOT EDIT.
//go:build mips || mips64 || ppc64 || s390x
package xdp
import (
"bytes"
_ "embed"
"fmt"
"io"
"github.com/cilium/ebpf"
)
// bpfConfig is the Go mirror of the eBPF program's `struct config`.
type bpfConfig struct {
	// DstPort is the UDP destination port the program compares incoming
	// packets against.
	DstPort uint16
}
// bpfEndpoint is the Go mirror of the eBPF program's `struct endpoint`. Index
// 0 and 1 of every field describe the two participants of one endpoint.
type bpfEndpoint struct {
// ParticipantAddrs holds each participant's address as four 32-bit words.
ParticipantAddrs [2][4]uint32
// ParticipantPorts holds each participant's UDP port.
ParticipantPorts [2]uint16
// ParticipantIsIpv6 is non-zero for a participant with an IPv6 address.
ParticipantIsIpv6 [2]uint8
// Unnamed trailing padding bytes.
_ [2]byte
}
// loadBpf parses the embedded object file (_BpfBytes) and returns the
// resulting CollectionSpec for bpf.
func loadBpf() (*ebpf.CollectionSpec, error) {
	spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(_BpfBytes))
	if err != nil {
		return nil, fmt.Errorf("can't load bpf: %w", err)
	}
	return spec, nil
}
// loadBpfObjects loads the embedded bpf collection and assigns its programs
// and maps to obj.
//
// obj may be any of:
//
//	*bpfObjects
//	*bpfPrograms
//	*bpfMaps
//
// See ebpf.CollectionSpec.LoadAndAssign documentation for details.
func loadBpfObjects(obj interface{}, opts *ebpf.CollectionOptions) error {
	spec, err := loadBpf()
	if err == nil {
		err = spec.LoadAndAssign(obj, opts)
	}
	return err
}
// bpfSpecs contains maps and programs before they are loaded into the kernel.
// It embeds bpfProgramSpecs and bpfMapSpecs, so their fields are promoted.
//
// It can be passed ebpf.CollectionSpec.Assign.
type bpfSpecs struct {
bpfProgramSpecs
bpfMapSpecs
}
// bpfProgramSpecs contains programs before they are loaded into the kernel.
//
// It can be passed ebpf.CollectionSpec.Assign.
type bpfProgramSpecs struct {
// XdpProgFunc is the spec tagged with the program name "xdp_prog_func".
XdpProgFunc *ebpf.ProgramSpec `ebpf:"xdp_prog_func"`
}
// bpfMapSpecs contains maps before they are loaded into the kernel.
//
// It can be passed ebpf.CollectionSpec.Assign.
type bpfMapSpecs struct {
// ConfigMap is the spec tagged with the map name "config_map".
ConfigMap *ebpf.MapSpec `ebpf:"config_map"`
// EndpointMap is the spec tagged with the map name "endpoint_map".
EndpointMap *ebpf.MapSpec `ebpf:"endpoint_map"`
}
// bpfObjects bundles every program and map handle once they have been loaded
// into the kernel.
//
// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
type bpfObjects struct {
	bpfPrograms
	bpfMaps
}

// Close closes the embedded programs and maps by handing both to _BpfClose.
func (o *bpfObjects) Close() error {
	return _BpfClose(&o.bpfPrograms, &o.bpfMaps)
}
// bpfMaps holds the handles of all maps once they have been loaded into the
// kernel.
//
// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
type bpfMaps struct {
	ConfigMap   *ebpf.Map `ebpf:"config_map"`
	EndpointMap *ebpf.Map `ebpf:"endpoint_map"`
}

// Close closes ConfigMap and EndpointMap by handing both to _BpfClose.
func (m *bpfMaps) Close() error {
	return _BpfClose(m.ConfigMap, m.EndpointMap)
}
// bpfPrograms holds the handles of all programs once they have been loaded
// into the kernel.
//
// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
type bpfPrograms struct {
	XdpProgFunc *ebpf.Program `ebpf:"xdp_prog_func"`
}

// Close closes XdpProgFunc by handing it to _BpfClose.
func (p *bpfPrograms) Close() error {
	return _BpfClose(p.XdpProgFunc)
}
// _BpfClose closes every closer in order and returns the first error
// encountered, or nil if all of them closed cleanly.
//
// A failing closer does not stop the iteration: the remaining closers are
// still closed so that one failure cannot leak the other kernel objects.
func _BpfClose(closers ...io.Closer) error {
	var firstErr error
	for _, closer := range closers {
		if err := closer.Close(); err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return firstErr
}
// _BpfBytes holds the contents of bpf_bpfeb.o, embedded by the go:embed
// directive below. loadBpf reads it to build the CollectionSpec.
//
// Do not access this directly.
//
//go:embed bpf_bpfeb.o
var _BpfBytes []byte

Binary file not shown.

View File

@ -0,0 +1,131 @@
// Code generated by bpf2go; DO NOT EDIT.
//go:build 386 || amd64 || arm || arm64 || loong64 || mips64le || mipsle || ppc64le || riscv64
package xdp
import (
"bytes"
_ "embed"
"fmt"
"io"
"github.com/cilium/ebpf"
)
// bpfConfig is the Go mirror of the eBPF program's `struct config`.
type bpfConfig struct {
	// DstPort is the UDP destination port the program compares incoming
	// packets against.
	DstPort uint16
}
// bpfEndpoint is the Go mirror of the eBPF program's `struct endpoint`. Index
// 0 and 1 of every field describe the two participants of one endpoint.
type bpfEndpoint struct {
// ParticipantAddrs holds each participant's address as four 32-bit words.
ParticipantAddrs [2][4]uint32
// ParticipantPorts holds each participant's UDP port.
ParticipantPorts [2]uint16
// ParticipantIsIpv6 is non-zero for a participant with an IPv6 address.
ParticipantIsIpv6 [2]uint8
// Unnamed trailing padding bytes.
_ [2]byte
}
// loadBpf parses the embedded object file (_BpfBytes) and returns the
// resulting CollectionSpec for bpf.
func loadBpf() (*ebpf.CollectionSpec, error) {
	spec, err := ebpf.LoadCollectionSpecFromReader(bytes.NewReader(_BpfBytes))
	if err != nil {
		return nil, fmt.Errorf("can't load bpf: %w", err)
	}
	return spec, nil
}
// loadBpfObjects loads the embedded bpf collection and assigns its programs
// and maps to obj.
//
// obj may be any of:
//
//	*bpfObjects
//	*bpfPrograms
//	*bpfMaps
//
// See ebpf.CollectionSpec.LoadAndAssign documentation for details.
func loadBpfObjects(obj interface{}, opts *ebpf.CollectionOptions) error {
	spec, err := loadBpf()
	if err == nil {
		err = spec.LoadAndAssign(obj, opts)
	}
	return err
}
// bpfSpecs contains maps and programs before they are loaded into the kernel.
// It embeds bpfProgramSpecs and bpfMapSpecs, so their fields are promoted.
//
// It can be passed ebpf.CollectionSpec.Assign.
type bpfSpecs struct {
bpfProgramSpecs
bpfMapSpecs
}
// bpfProgramSpecs contains programs before they are loaded into the kernel.
//
// It can be passed ebpf.CollectionSpec.Assign.
type bpfProgramSpecs struct {
// XdpProgFunc is the spec tagged with the program name "xdp_prog_func".
XdpProgFunc *ebpf.ProgramSpec `ebpf:"xdp_prog_func"`
}
// bpfMapSpecs contains maps before they are loaded into the kernel.
//
// It can be passed ebpf.CollectionSpec.Assign.
type bpfMapSpecs struct {
// ConfigMap is the spec tagged with the map name "config_map".
ConfigMap *ebpf.MapSpec `ebpf:"config_map"`
// EndpointMap is the spec tagged with the map name "endpoint_map".
EndpointMap *ebpf.MapSpec `ebpf:"endpoint_map"`
}
// bpfObjects bundles every program and map handle once they have been loaded
// into the kernel.
//
// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
type bpfObjects struct {
	bpfPrograms
	bpfMaps
}

// Close closes the embedded programs and maps by handing both to _BpfClose.
func (o *bpfObjects) Close() error {
	return _BpfClose(&o.bpfPrograms, &o.bpfMaps)
}
// bpfMaps holds the handles of all maps once they have been loaded into the
// kernel.
//
// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
type bpfMaps struct {
	ConfigMap   *ebpf.Map `ebpf:"config_map"`
	EndpointMap *ebpf.Map `ebpf:"endpoint_map"`
}

// Close closes ConfigMap and EndpointMap by handing both to _BpfClose.
func (m *bpfMaps) Close() error {
	return _BpfClose(m.ConfigMap, m.EndpointMap)
}
// bpfPrograms holds the handles of all programs once they have been loaded
// into the kernel.
//
// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
type bpfPrograms struct {
	XdpProgFunc *ebpf.Program `ebpf:"xdp_prog_func"`
}

// Close closes XdpProgFunc by handing it to _BpfClose.
func (p *bpfPrograms) Close() error {
	return _BpfClose(p.XdpProgFunc)
}
// _BpfClose closes every closer in order and returns the first error
// encountered, or nil if all of them closed cleanly.
//
// A failing closer does not stop the iteration: the remaining closers are
// still closed so that one failure cannot leak the other kernel objects.
func _BpfClose(closers ...io.Closer) error {
	var firstErr error
	for _, closer := range closers {
		if err := closer.Close(); err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return firstErr
}
// _BpfBytes holds the contents of bpf_bpfel.o, embedded by the go:embed
// directive below. loadBpf reads it to build the CollectionSpec.
//
// Do not access this directly.
//
//go:embed bpf_bpfel.o
var _BpfBytes []byte

Binary file not shown.

350
net/udprelay/xdp/xdp.c Normal file
View File

@ -0,0 +1,350 @@
//go:build ignore
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/udp.h>
#include <bpf_endian.h>
#include <bpf_helpers.h>
// License string for the object, placed in the "license" ELF section. The
// array holds the three characters of "GPL" plus the terminating NUL.
char _license[4] SEC("license") = "GPL";
// struct config is the value type of config_map: program-wide settings.
struct config {
// UDP destination port to handle. xdp_prog_func compares it against
// bpf_ntohs(udp->dest), so it is stored in host byte order.
__u16 dst_port;
};
struct config *unused_config __attribute__((unused)); // required by bpf2go -type
// config_map is a single-entry array map holding one struct config.
// xdp_prog_func reads it at key 0.
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(struct config));
__uint(max_entries, 1);
} config_map SEC(".maps");
// struct endpoint is the value type of endpoint_map. It describes the two
// participants of one endpoint; index 0 and 1 of each array belong together.
struct endpoint {
// Participant addresses as four 32-bit words each. For an IPv4 participant
// xdp_prog_func compares word [3] against the IPv4 header address.
__be32 participant_addrs[2][4];
// Participant UDP ports in host byte order (compared against
// bpf_ntohs(udp->source)).
__u16 participant_ports[2];
// Non-zero when the corresponding participant is IPv6.
__u8 participant_is_ipv6[2];
};
struct endpoint *unused_endpoint __attribute__((unused)); // required by bpf2go -type
// MAX_GENEVE_VNI is the largest value a 24-bit Geneve VNI can take. The
// expansion is fully parenthesized so the macro stays correct when it is used
// inside a larger expression (e.g. `x * MAX_GENEVE_VNI`).
#define MAX_GENEVE_VNI ((1 << 24) - 1)

// endpoint_map maps a Geneve VNI to the struct endpoint describing the two
// participants bound to it. It is a per-CPU hash, so user space must supply
// one value per possible CPU when writing an entry.
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
	__uint(key_size, sizeof(__u32)); // key is Geneve VNI
	__uint(value_size, sizeof(struct endpoint));
	__uint(max_entries, MAX_GENEVE_VNI);
} endpoint_map SEC(".maps");
// Constant upper bounds passed to csum_const_size as max_size when validating
// the UDP checksum of IPv4 and IPv6 packets respectively.
// NOTE(review): presumably 1500-byte MTU minus the 20-byte IPv4 / 40-byte IPv6
// header; confirm.
#define MAX_UDP_LEN_IPV4 1480
#define MAX_UDP_LEN_IPV6 1460
// Masks for the IPv4 frag_off field (host byte order; xdp_prog_func applies
// bpf_htons before use): "more fragments" flag and fragment offset bits.
#define IP_MF 0x2000
#define IP_OFFSET 0x1fff
/*
Geneve Header:
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |Ver|  Opt Len  |O|C|    Rsvd.  |          Protocol Type        |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |        Virtual Network Identifier (VNI)       |    Reserved   |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 |                                                               |
 ~                    Variable-Length Options                    ~
 |                                                               |
 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/
// struct geneve_header overlays the fixed 8-byte part of the header above.
struct geneve_header {
// Version (2 bits) followed by option length (6 bits).
__u8 first;
// Control bit, critical bit, then 6 reserved bits.
__u8 second;
// Protocol type, network byte order.
__be16 protocol;
// 24-bit VNI followed by one reserved byte, network byte order.
__be32 vni;
};
// csum_fold folds a 32-bit one's-complement accumulator down to 16 bits by
// adding the carries back in twice. The result is NOT complemented.
static __always_inline __u16 csum_fold(__u32 csum) {
	__u32 folded = (csum & 0xffff) + (csum >> 16); // at most 0x1fffe
	folded += folded >> 16;                        // absorb the final carry
	return (__u16)folded;
}
// csum_fold_flip folds a 32-bit one's-complement accumulator down to 16 bits
// and returns its bitwise complement, i.e. the value that goes into (or, for a
// valid packet, the zero that comes out of) a checksum field.
static __always_inline __u16 csum_fold_flip(__u32 csum) {
	return (__u16)~csum_fold(csum);
}
// pseudo_sum_ipv6 returns the unfolded one's-complement sum of the IPv6
// pseudo-header: source and destination addresses, next-header value, and
// udp_len. udp_len is added as given, so the caller passes it in network byte
// order like the rest of the summed words.
static __always_inline __u32 pseudo_sum_ipv6(struct ipv6hdr* ip6, __u16 udp_len) {
	// TODO(jwhited): __u64 for intermediate checksum values to reduce number of ops
	__u32 acc = 0;
	acc += bpf_htons(ip6->nexthdr);
	acc += udp_len;
	for (int word = 0; word < 8; word++) {
		acc += ip6->saddr.in6_u.u6_addr16[word];
		acc += ip6->daddr.in6_u.u6_addr16[word];
	}
	return acc;
}
// pseudo_sum_ipv4 returns the unfolded one's-complement sum of the IPv4
// pseudo-header: both halves of the source and destination addresses, the
// protocol value, and udp_len. udp_len is added as given, so the caller passes
// it in network byte order like the rest of the summed words.
static __always_inline __u32 pseudo_sum_ipv4(struct iphdr* ip, __u16 udp_len) {
	__u32 acc = 0;
	acc += ip->saddr & 0xffff;
	acc += ip->saddr >> 16;
	acc += ip->daddr & 0xffff;
	acc += ip->daddr >> 16;
	acc += bpf_htons(ip->protocol);
	acc += udp_len;
	return acc;
}
// csum_const_size is an alternative to bpf_csum_diff. It's a verifier
// workaround for when we are forced to use a constant max_size + bounds
// checking. The alternative being passing a dynamic length to bpf_csum_diff
// {from,to}_size arguments, which the verifier can't follow. For further info
// see: https://github.com/iovisor/bcc/issues/2463#issuecomment-512503958
//
// Starting from seed, it adds 16-bit words read from `from` until either
// max_size bytes have been visited or the next word would cross data_end, then
// adds one trailing byte if one remains, and returns the folded, complemented
// result.
static __always_inline __u16 csum_const_size(__u32 seed, void* from, void* data_end, int max_size) {
__u16 *buf = from;
for (int i = 0; i < max_size; i += 2) {
// Stop as soon as a whole 16-bit word no longer fits before data_end.
if ((void *)(buf + 1) > data_end) {
break;
}
seed += *buf;
buf++;
}
// Odd-length payload: fold in the single remaining byte.
// NOTE(review): the byte is added as the low 8 bits, which matches the
// in-memory word layout on little-endian targets; confirm whether big-endian
// builds need it shifted into the high byte. Also confirm the behavior when the
// loop ends because max_size was reached while data remains.
if ((void *)buf + 1 <= data_end) {
seed += *(__u8 *)buf;
}
return csum_fold_flip(seed);
}
#ifndef AF_INET
#define AF_INET 2 // address family value passed to bpf_fib_lookup
#endif

// xdp_prog_func forwards Geneve-encapsulated UDP packets between the two
// participants of an endpoint without leaving the driver.
//
// A packet is only handled here when ALL of the following hold; in every
// other case it is handed to the regular network stack with XDP_PASS and is
// left unmodified:
//   - it is an unfragmented IPv4/UDP packet without IP options (IPv6 is parsed
//     and validated, but forwarding is still TODO),
//   - lengths and checksums are consistent,
//   - the UDP destination port equals config_map[0].dst_port,
//   - the Geneve header has version 0, no options, no control/critical bits
//     and zeroed reserved bits,
//   - the VNI has an entry in endpoint_map and the packet source matches one
//     of its participants,
//   - the other participant is also IPv4 and the kernel FIB can resolve a
//     route and neighbour for it.
//
// On success the packet is rewritten in place (addresses and ports changed,
// checksums incrementally updated, Ethernet addresses taken from the FIB
// lookup) and XDP_TX is returned.
//
// NOTE(review): XDP_TX transmits on the ingress device. The egress ifindex
// returned by bpf_fib_lookup is not compared against ctx->ingress_ifindex;
// confirm whether multi-homed hosts need bpf_redirect instead.
SEC("xdp")
int xdp_prog_func(struct xdp_md *ctx) {
	void *data_end = (void *)(long)ctx->data_end;
	void *data = (void *)(long)ctx->data;

	struct ethhdr *eth = data;
	if ((void *)(eth + 1) > data_end) {
		return XDP_PASS;
	}

	struct iphdr *ip;
	struct ipv6hdr *ip6;
	struct udphdr *udp;

	int validate_udp_csum = 0;
	int is_ipv6 = 0;
	if (eth->h_proto == bpf_htons(ETH_P_IP)) {
		ip = (void *)(eth + 1);
		if ((void *)(ip + 1) > data_end) {
			return XDP_PASS;
		}

		if (ip->ihl != 5 ||
			ip->version != 4 ||
			ip->protocol != IPPROTO_UDP ||
			(ip->frag_off & bpf_htons(IP_MF | IP_OFFSET)) != 0) {
			return XDP_PASS;
		}

		// validate ipv4 header checksum
		__u32 cs_unfolded = bpf_csum_diff(0, 0, (void *)ip, sizeof(*ip), 0);
		__u16 cs = csum_fold_flip(cs_unfolded);
		if (cs != 0) {
			return XDP_PASS;
		}

		if (bpf_ntohs(ip->tot_len) != data_end - (void *)ip) {
			return XDP_PASS;
		}

		udp = (void *)(ip + 1);
		if ((void *)(udp + 1) > data_end) {
			return XDP_PASS;
		}

		if (udp->check != 0) {
			// https://datatracker.ietf.org/doc/html/rfc768#page-3
			// If the computed checksum is zero, it is transmitted as all
			// ones (the equivalent in one's complement arithmetic). An all
			// zero transmitted checksum value means that the transmitter
			// generated no checksum (for debugging or for higher level
			// protocols that don't care).
			validate_udp_csum = 1;
		}
	} else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) {
		ip6 = (void *)(eth + 1);
		if ((void *)(ip6 + 1) > data_end) {
			return XDP_PASS;
		}

		if (ip6->version != 6 || ip6->nexthdr != IPPROTO_UDP) {
			return XDP_PASS;
		}

		udp = (void *)(ip6 + 1);
		if ((void *)(udp + 1) > data_end) {
			return XDP_PASS;
		}

		if (bpf_ntohs(ip6->payload_len) != data_end - (void *)udp) {
			return XDP_PASS;
		}

		// https://datatracker.ietf.org/doc/html/rfc8200#page-28
		// Unlike IPv4, the default behavior when UDP packets are
		// originated by an IPv6 node is that the UDP checksum is not
		// optional. That is, whenever originating a UDP packet, an IPv6
		// node must compute a UDP checksum over the packet and the
		// pseudo-header, and, if that computation yields a result of
		// zero, it must be changed to hex FFFF for placement in the UDP
		// header. IPv6 receivers must discard UDP packets containing a
		// zero checksum and should log the error.
		validate_udp_csum = 1;
		is_ipv6 = 1;
	} else {
		return XDP_PASS;
	}

	__u32 config_key = 0;
	struct config *c = bpf_map_lookup_elem(&config_map, &config_key);
	if (!c) {
		return XDP_PASS;
	}

	if (bpf_ntohs(udp->len) != data_end - (void *)udp) {
		return XDP_PASS;
	}

	if (bpf_ntohs(udp->dest) != c->dst_port) {
		return XDP_PASS;
	}

	if (validate_udp_csum) {
		__u16 cs;
		__u32 pseudo_sum;
		if (is_ipv6) {
			pseudo_sum = pseudo_sum_ipv6(ip6, udp->len);
			cs = csum_const_size(pseudo_sum, udp, data_end, MAX_UDP_LEN_IPV6);
		} else {
			pseudo_sum = pseudo_sum_ipv4(ip, udp->len);
			cs = csum_const_size(pseudo_sum, udp, data_end, MAX_UDP_LEN_IPV4);
		}
		if (cs != 0) {
			return XDP_PASS;
		}
	}

	struct geneve_header *geneve = (void *)(udp + 1);
	if ((void *)(geneve + 1) > data_end) {
		return XDP_PASS;
	}

	if (geneve->first != 0) {
		// first 2 bits are version, must be zero
		// next 6 bits are opt len, must be zero
		return XDP_PASS;
	}

	if (geneve->second != 0) {
		// first bit is control, must be zero
		// next bit is critical (options), must be zero
		// next 6 bits are reserved, must be zero
		return XDP_PASS;
	}

	// geneve->vni is in network byte order, so the mask has to be converted
	// too; a bare 0x000000FF would select the most significant VNI byte on
	// little-endian targets instead of the reserved byte.
	if ((geneve->vni & bpf_htonl(0x000000FF)) != 0) {
		// last byte is reserved, must be zero
		return XDP_PASS;
	}

	__u32 vni_key = bpf_ntohl(geneve->vni) >> 8;
	struct endpoint *e = bpf_map_lookup_elem(&endpoint_map, &vni_key);
	if (!e) {
		return XDP_PASS;
	}

	int out_participant_index = -1; // -1 = unmatched
	if (is_ipv6) {
		// TODO
	} else {
		for (int i = 0; i < 2; i++) {
			if (e->participant_is_ipv6[i] == 0 &&
				e->participant_addrs[i][3] == ip->saddr &&
				e->participant_ports[i] == bpf_ntohs(udp->source))
			{
				// The packet came from participant i; it goes to the other one.
				if (i == 0) {
					out_participant_index = 1;
				} else {
					out_participant_index = 0;
				}
				break;
			}
		}
	}
	if (out_participant_index == -1) {
		return XDP_PASS;
	}

	if (e->participant_is_ipv6[out_participant_index] == is_ipv6) {
		// matching in/out address family
		if (is_ipv6) {
			// TODO: in ipv6, out ipv6
		} else {
			// in ipv4, out ipv4
			__be32 p_addr = e->participant_addrs[out_participant_index][3];
			__u16 p_port = e->participant_ports[out_participant_index];

			// Resolve the egress path BEFORE touching the packet, so that a
			// failed lookup (no route, unresolved neighbour, ...) can still
			// hand the untouched packet to the stack instead of dropping it.
			// The outgoing source is our current destination address.
			struct bpf_fib_lookup fib_params = {};
			fib_params.family = AF_INET;
			fib_params.tos = ip->tos;
			fib_params.l4_protocol = ip->protocol;
			fib_params.sport = 0;
			fib_params.dport = 0;
			fib_params.tot_len = bpf_ntohs(ip->tot_len);
			fib_params.ipv4_src = ip->daddr;
			fib_params.ipv4_dst = p_addr;
			fib_params.ifindex = ctx->ingress_ifindex;
			int rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), BPF_FIB_LOOKUP_DIRECT);
			if (rc != BPF_FIB_LKUP_RET_SUCCESS) {
				return XDP_PASS;
			}

			// Incrementally update the IPv4 header checksum. The address pair
			// changes from (saddr, daddr) to (daddr, p_addr); since the sum is
			// order independent this equals replacing saddr with p_addr.
			__u32 ip_csum = ~(__u32)ip->check;
			ip_csum = bpf_csum_diff(&ip->saddr, 4, &p_addr, 4, ip_csum);

			// Incrementally update the UDP checksum, but only if the sender
			// generated one: for IPv4 a zero checksum means "none" and has to
			// stay zero. The port pair changes from (source, dest) to
			// (dest, p_port), i.e. source is replaced by p_port.
			if (udp->check != 0) {
				__u32 old_udp_port = (__u32)udp->source;
				__u32 new_udp_port = (__u32)bpf_htons(p_port);
				__u32 udp_csum = ~(__u32)udp->check;
				udp_csum = bpf_csum_diff(&ip->saddr, 4, &p_addr, 4, udp_csum);
				udp_csum = bpf_csum_diff(&old_udp_port, 4, &new_udp_port, 4, udp_csum);
				__u16 new_check = csum_fold_flip(udp_csum);
				if (new_check == 0) {
					// RFC 768: a computed zero is transmitted as all ones.
					new_check = 0xffff;
				}
				udp->check = new_check;
			}

			// Update IPv4 header
			ip->check = csum_fold_flip(ip_csum);
			ip->saddr = ip->daddr;
			ip->daddr = p_addr;

			// Rewrite ethernet header source and destination address.
			__builtin_memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
			__builtin_memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);

			// Update UDP header
			udp->source = udp->dest;
			udp->dest = bpf_htons(p_port);

			return XDP_TX;
		}
	} else if (e->participant_is_ipv6[out_participant_index] == 0) {
		// TODO: in ipv4, out ipv6
	} else {
		// TODO: in ipv6, out ipv4
	}

	return XDP_PASS;
}

48
net/udprelay/xdp/xdp.go Normal file
View File

@ -0,0 +1,48 @@
package xdp
import "net/netip"
// XDPAttachFlags represents how an XDP program will be attached to an
// interface. It mirrors cilium/ebpf/link.XDPAttachFlags without pulling that
// package in for non-Linux builds.
type XDPAttachFlags uint32
const (
// XDPDriverFallbackGenericMode attempts XDPDriverMode, and falls back to
// XDPGenericMode if the driver does not support XDP.
//
// It is declared as an untyped constant with value 0, which is also the
// zero value of FIBConfig.AttachFlags.
XDPDriverFallbackGenericMode = 0
)
// Explicit attach modes. The iota expression starts at 1<<1, so the values
// are 2, 4 and 8 in declaration order.
const (
// XDPGenericMode (SKB) links XDP BPF program for drivers which do
// not yet support native XDP.
XDPGenericMode XDPAttachFlags = 1 << (iota + 1)
// XDPDriverMode links XDP BPF program into the drivers receive path.
XDPDriverMode
// XDPOffloadMode offloads the entire XDP BPF program into hardware.
XDPOffloadMode
)
// FIBConfig holds the settings NewFIB uses to create a FIB.
type FIBConfig struct {
// DeviceName is the name of the network interface to attach to.
DeviceName string
// TODO: DstPort is singular, but udp4 and udp6 can be independent ports if
// the user supplied a zero port value.
//
// DstPort is the UDP destination port whose traffic the FIB handles.
DstPort uint16
// AttachFlags selects how the XDP program is attached; see XDPAttachFlags.
AttachFlags XDPAttachFlags
}
// validate reports whether f is usable. There are currently no constraints,
// so it always returns nil.
func (f FIBConfig) validate() error {
	return nil
}
// FIBOption is an optional setting accepted by NewFIB. The unexported apply
// method means only this package can implement it.
type FIBOption interface {
// apply records the option in the given fibOptions.
apply(*fibOptions)
}
// fibOptions collects the settings applied by FIBOption values.
type fibOptions struct {
// noAttach, when true, asks for a FIB that is not attached to a device.
noAttach bool
}
// FIB is a forwarding table keyed by Geneve VNI, where each entry names the
// two participants of an endpoint.
type FIB interface {
// Delete removes the entry for vni.
Delete(vni uint32) error
// Upsert inserts or replaces the entry for vni with the given participants.
Upsert(vni uint32, participants [2]netip.AddrPort) error
// Close releases the resources held by the FIB.
Close() error
}

View File

@ -0,0 +1,103 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
//go:build linux
package xdp
import (
"encoding/binary"
"errors"
"fmt"
"net"
"net/netip"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/link"
)
//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -type config -type endpoint bpf xdp.c -- -I ../../../derp/xdp/headers

// NewFIB loads the XDP program and its maps into the kernel, stores
// config.DstPort in the program's config map and, unless the noAttach option
// is set, attaches the program to the interface named config.DeviceName.
//
// On any failure everything that was already loaded is closed again before
// the error is returned, so a failed call leaves nothing behind. On success
// the caller owns the returned FIB and must Close it.
func NewFIB(config *FIBConfig, opts ...FIBOption) (FIB, error) {
	if config == nil {
		return nil, errors.New("invalid config: nil")
	}
	o := &fibOptions{}
	for _, opt := range opts {
		opt.apply(o)
	}
	if err := config.validate(); err != nil {
		return nil, fmt.Errorf("invalid config: %w", err)
	}

	objs := new(bpfObjects)
	if err := loadBpfObjects(objs, nil); err != nil {
		var ve *ebpf.VerifierError
		if errors.As(err, &ve) {
			// %+v prints the full verifier log rather than its summary.
			err = fmt.Errorf("verifier error: %+v", ve)
		}
		return nil, fmt.Errorf("error loading XDP program: %w", err)
	}
	// From here on the programs and maps are live kernel objects; release
	// them on every early return.
	ok := false
	defer func() {
		if !ok {
			objs.Close()
		}
	}()

	f := &linuxFIB{
		objs:    objs,
		dstPort: config.DstPort,
	}

	var key uint32 // config_map has a single entry at index 0
	xdpConfig := &bpfConfig{
		DstPort: config.DstPort,
	}
	if err := objs.ConfigMap.Put(key, xdpConfig); err != nil {
		return nil, fmt.Errorf("error loading config in eBPF map: %w", err)
	}

	if o.noAttach {
		ok = true
		return f, nil
	}

	iface, err := net.InterfaceByName(config.DeviceName)
	if err != nil {
		return nil, fmt.Errorf("error finding device: %w", err)
	}
	xdpLink, err := link.AttachXDP(link.XDPOptions{
		Program:   objs.XdpProgFunc,
		Interface: iface.Index,
		Flags:     link.XDPAttachFlags(config.AttachFlags),
	})
	if err != nil {
		return nil, fmt.Errorf("error attaching XDP program to dev: %w", err)
	}
	f.link = xdpLink
	ok = true
	return f, nil
}
// linuxFIB is the FIB implementation backed by an XDP program and its eBPF
// maps.
type linuxFIB struct {
// objs holds the loaded program and maps.
objs *bpfObjects
// dstPort is the port NewFIB wrote into the config map.
dstPort uint16
// link is the attachment of the program to the device; NewFIB leaves it nil
// when the noAttach option is set.
link link.Link
}
// Delete removes the entry for vni from the endpoint map.
//
// Deleting a VNI that has no entry is not an error: callers garbage-collect
// endpoints that may never have been inserted (Upsert only happens once both
// participants are bound), so a missing key is treated as already deleted.
func (l *linuxFIB) Delete(vni uint32) error {
	err := l.objs.EndpointMap.Delete(&vni)
	if errors.Is(err, ebpf.ErrKeyNotExist) {
		return nil
	}
	return err
}
// Upsert inserts or replaces the endpoint map entry for vni so that it
// describes the two given participants.
//
// Both participants must be valid. IPv4-mapped IPv6 addresses are unmapped
// first so that they are recorded as IPv4, matching how such peers appear on
// the wire. Addresses are stored as their 16-byte form split into four words
// that preserve the in-memory (network) byte order; ports are stored in host
// byte order.
//
// The endpoint map is a per-CPU map, so the same value is written for every
// possible CPU.
func (l *linuxFIB) Upsert(vni uint32, participants [2]netip.AddrPort) error {
	var endpoint bpfEndpoint
	for i, participant := range participants {
		if !participant.IsValid() {
			return fmt.Errorf("participant %d has an invalid address: %v", i, participant)
		}
		addr := participant.Addr().Unmap()
		as16 := addr.As16()
		for j := 0; j < 4; j++ {
			endpoint.ParticipantAddrs[i][j] = binary.NativeEndian.Uint32(as16[j*4:])
		}
		endpoint.ParticipantPorts[i] = participant.Port()
		if addr.Is6() {
			endpoint.ParticipantIsIpv6[i] = 1
		}
	}

	numCPU, err := ebpf.PossibleCPU()
	if err != nil {
		return err
	}
	vals := make([]bpfEndpoint, numCPU)
	for i := range vals {
		vals[i] = endpoint
	}
	return l.objs.EndpointMap.Put(&vni, vals)
}
// Close detaches the XDP program from the device (if it was attached) and
// then closes the loaded program and maps. Both steps are always attempted;
// their errors, if any, are joined into the returned error.
func (l *linuxFIB) Close() error {
	var errs []error
	if l.link != nil {
		// Detach first so the program stops seeing packets before its maps go away.
		errs = append(errs, l.link.Close())
	}
	if l.objs != nil {
		errs = append(errs, l.objs.Close())
	}
	return errors.Join(errs...)
}

View File

@ -0,0 +1,18 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
//go:build !linux
package xdp
import "net/netip"
type noopFIB struct{}
func (noopFIB) Delete(vni uint32) error { return nil }
func (noopFIB) Upsert(vni uint32, participants [2]netip.AddrPort) error { return nil }
func (noopFIB) Close(vni uint32, participants [2]netip.AddrPort) error { return nil }
func NewFIB(config FIBConfig, opts ...FIBOption) (FIB, error) {
return noopFIB{}
}