diff --git a/net/udprelay/server.go b/net/udprelay/server.go
index 26b27bb7f..48de1dfc1 100644
--- a/net/udprelay/server.go
+++ b/net/udprelay/server.go
@@ -25,6 +25,7 @@ import (
 	"golang.org/x/crypto/blake2s"
 	"golang.org/x/net/ipv6"
 	"tailscale.com/disco"
+	"tailscale.com/envknob"
 	"tailscale.com/net/batching"
 	"tailscale.com/net/netaddr"
 	"tailscale.com/net/netcheck"
@@ -34,6 +35,7 @@ import (
 	"tailscale.com/net/stun"
 	"tailscale.com/net/udprelay/endpoint"
 	"tailscale.com/net/udprelay/status"
+	"tailscale.com/net/udprelay/xdp"
 	"tailscale.com/tailcfg"
 	"tailscale.com/tstime"
 	"tailscale.com/types/key"
@@ -75,6 +77,7 @@ type Server struct {
 	wg         sync.WaitGroup
 	closeCh    chan struct{}
 	netChecker *netcheck.Client
+	fib        xdp.FIB
 
 	mu         sync.Mutex           // guards the following fields
 	macSecrets [][blake2s.Size]byte // [0] is most recent, max 2 elements
@@ -140,7 +143,7 @@ func blakeMACFromBindMsg(blakeKey [blake2s.Size]byte, src netip.AddrPort, msg di
 	return out, nil
 }
 
-func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex int, discoMsg disco.Message, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) {
+func (e *serverEndpoint) handleDiscoControlMsg(logf logger.Logf, fib xdp.FIB, from netip.AddrPort, senderIndex int, discoMsg disco.Message, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) {
 	if senderIndex != 0 && senderIndex != 1 {
 		return nil, netip.AddrPort{}
 	}
@@ -218,6 +221,12 @@ func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex
 		e.boundAddrPorts[senderIndex] = from
 		e.lastSeen[senderIndex] = time.Now()    // record last seen as bound time
 		e.inProgressGeneration[senderIndex] = 0 // reset to zero, which indicates there is no in-progress handshake
+		if fib != nil && e.boundAddrPorts[0].IsValid() && e.boundAddrPorts[1].IsValid() {
+			err = fib.Upsert(e.vni, e.boundAddrPorts)
+			if err != nil {
+				logf("error upserting fib: %v", err)
+			}
+		}
 		return nil, netip.AddrPort{}
 	}
 }
@@ -229,7 +238,7 @@ func (e *serverEndpoint) handleDiscoControlMsg(from netip.AddrPort, senderIndex
 	}
 }
 
-func (e *serverEndpoint) handleSealedDiscoControlMsg(from netip.AddrPort, b []byte, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) {
+func (e *serverEndpoint) handleSealedDiscoControlMsg(logf logger.Logf, fib xdp.FIB, from netip.AddrPort, b []byte, serverDisco key.DiscoPublic, macSecrets [][blake2s.Size]byte) (write []byte, to netip.AddrPort) {
 	senderRaw, isDiscoMsg := disco.Source(b)
 	if !isDiscoMsg {
 		// Not a Disco message
@@ -260,7 +269,7 @@
 		return nil, netip.AddrPort{}
 	}
 
-	return e.handleDiscoControlMsg(from, senderIndex, discoMsg, serverDisco, macSecrets)
+	return e.handleDiscoControlMsg(logf, fib, from, senderIndex, discoMsg, serverDisco, macSecrets)
 }
 
 func (e *serverEndpoint) handleDataPacket(from netip.AddrPort, b []byte, now time.Time) (write []byte, to netip.AddrPort) {
@@ -323,6 +332,17 @@ func NewServer(logf logger.Logf, port uint16, onlyStaticAddrPorts bool) (s *Serv
 		byVNI:   make(map[uint32]*serverEndpoint),
 	}
 	s.discoPublic = s.disco.Public()
+	xdpDev := envknob.String("TS_PEER_RELAY_XDP_DEVICE")
+	if xdpDev != "" {
+		s.fib, err = xdp.NewFIB(&xdp.FIBConfig{
+			DstPort:    port,
+			DeviceName: xdpDev,
+		})
+	}
+
+	if err != nil {
+		return nil, err
+	}
 
 	// TODO(creachadair): Find a way to plumb this in during initialization.
 	// As-written, messages published here will not be seen by other components
@@ -547,11 +567,11 @@ func trySetUDPSocketOptions(pconn nettype.PacketConn, logf logger.Logf) {
 func (s *Server) bindSockets(desiredPort uint16) error {
 	// maxSocketsPerAF is a conservative starting point, but is somewhat
 	// arbitrary.
-	maxSocketsPerAF := min(16, runtime.NumCPU())
+	maxSocketsPerAF := min(128, runtime.NumCPU())
 	listenConfig := &net.ListenConfig{
 		Control: listenControl,
 	}
-	for _, network := range []string{"udp4", "udp6"} {
+	for _, network := range []string{"udp4"} { //, "udp6"} {
 	SocketsLoop:
 		for i := range maxSocketsPerAF {
 			if i > 0 {
@@ -626,6 +646,9 @@ func (s *Server) bindSocketTo(listenConfig *net.ListenConfig, network string, po
 // Close closes the server.
 func (s *Server) Close() error {
 	s.closeOnce.Do(func() {
+		if s.fib != nil {
+			s.fib.Close()
+		}
 		for _, uc4 := range s.uc4 {
 			uc4.Close()
 		}
@@ -662,6 +685,15 @@ func (s *Server) endpointGCLoop() {
 			if v.isExpired(now, s.bindLifetime, s.steadyStateLifetime) {
 				delete(s.byDisco, k)
 				delete(s.byVNI, v.vni)
+				// TODO: isExpired only considers userspace counters/liveliness
+				// TODO: this is a syscall per VNI to delete while holding s.mu,
+				// consider batch delete
+				if s.fib != nil {
+					err := s.fib.Delete(v.vni)
+					if err != nil {
+						s.logf("failed to delete fib entry: %v", err)
+					}
+				}
 			}
 		}
 	}
@@ -708,7 +740,7 @@ func (s *Server) handlePacket(from netip.AddrPort, b []byte) (write []byte, to n
 		}
 		msg := b[packet.GeneveFixedHeaderLength:]
 		s.maybeRotateMACSecretLocked(now)
-		return e.handleSealedDiscoControlMsg(from, msg, s.discoPublic, s.macSecrets)
+		return e.handleSealedDiscoControlMsg(s.logf, s.fib, from, msg, s.discoPublic, s.macSecrets)
 	}
 	return e.handleDataPacket(from, b, now)
 }
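Before the generated artifacts below, a minimal sketch of how the server changes above are expected to drive the FIB, using only the surface this patch exports from net/udprelay/xdp (NewFIB, Upsert, Delete, Close). The device name, port, VNI, and participant addresses are illustrative stand-ins, not defaults; in the server the device comes from TS_PEER_RELAY_XDP_DEVICE and the destination port from NewServer's port argument.

	package main

	import (
		"log"
		"net/netip"

		"tailscale.com/net/udprelay/xdp"
	)

	func main() {
		// Illustrative values; the relay server reads the device from
		// TS_PEER_RELAY_XDP_DEVICE and uses its own UDP port.
		fib, err := xdp.NewFIB(&xdp.FIBConfig{
			DeviceName: "eth0",
			DstPort:    7867,
		})
		if err != nil {
			log.Fatalf("NewFIB: %v", err)
		}
		defer fib.Close()

		// Once both participants of a VNI are bound, program the kernel fast path.
		participants := [2]netip.AddrPort{
			netip.MustParseAddrPort("192.0.2.1:41641"),
			netip.MustParseAddrPort("198.51.100.2:41641"),
		}
		if err := fib.Upsert(1, participants); err != nil {
			log.Printf("fib.Upsert: %v", err)
		}

		// When the endpoint expires, drop the kernel entry as well.
		if err := fib.Delete(1); err != nil {
			log.Printf("fib.Delete: %v", err)
		}
	}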
diff --git a/net/udprelay/xdp/bpf_bpfeb.go b/net/udprelay/xdp/bpf_bpfeb.go
new file mode 100644
index 000000000..dce7dd177
--- /dev/null
+++ b/net/udprelay/xdp/bpf_bpfeb.go
@@ -0,0 +1,131 @@
+// Code generated by bpf2go; DO NOT EDIT.
+//go:build mips || mips64 || ppc64 || s390x
+
+package xdp
+
+import (
+	"bytes"
+	_ "embed"
+	"fmt"
+	"io"
+
+	"github.com/cilium/ebpf"
+)
+
+type bpfConfig struct{ DstPort uint16 }
+
+type bpfEndpoint struct {
+	ParticipantAddrs  [2][4]uint32
+	ParticipantPorts  [2]uint16
+	ParticipantIsIpv6 [2]uint8
+	_                 [2]byte
+}
+
+// loadBpf returns the embedded CollectionSpec for bpf.
+func loadBpf() (*ebpf.CollectionSpec, error) {
+	reader := bytes.NewReader(_BpfBytes)
+	spec, err := ebpf.LoadCollectionSpecFromReader(reader)
+	if err != nil {
+		return nil, fmt.Errorf("can't load bpf: %w", err)
+	}
+
+	return spec, err
+}
+
+// loadBpfObjects loads bpf and converts it into a struct.
+//
+// The following types are suitable as obj argument:
+//
+//	*bpfObjects
+//	*bpfPrograms
+//	*bpfMaps
+//
+// See ebpf.CollectionSpec.LoadAndAssign documentation for details.
+func loadBpfObjects(obj interface{}, opts *ebpf.CollectionOptions) error {
+	spec, err := loadBpf()
+	if err != nil {
+		return err
+	}
+
+	return spec.LoadAndAssign(obj, opts)
+}
+
+// bpfSpecs contains maps and programs before they are loaded into the kernel.
+//
+// It can be passed ebpf.CollectionSpec.Assign.
+type bpfSpecs struct {
+	bpfProgramSpecs
+	bpfMapSpecs
+}
+
+// bpfSpecs contains programs before they are loaded into the kernel.
+//
+// It can be passed ebpf.CollectionSpec.Assign.
+type bpfProgramSpecs struct {
+	XdpProgFunc *ebpf.ProgramSpec `ebpf:"xdp_prog_func"`
+}
+
+// bpfMapSpecs contains maps before they are loaded into the kernel.
+//
+// It can be passed ebpf.CollectionSpec.Assign.
+type bpfMapSpecs struct {
+	ConfigMap   *ebpf.MapSpec `ebpf:"config_map"`
+	EndpointMap *ebpf.MapSpec `ebpf:"endpoint_map"`
+}
+
+// bpfObjects contains all objects after they have been loaded into the kernel.
+//
+// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
+type bpfObjects struct {
+	bpfPrograms
+	bpfMaps
+}
+
+func (o *bpfObjects) Close() error {
+	return _BpfClose(
+		&o.bpfPrograms,
+		&o.bpfMaps,
+	)
+}
+
+// bpfMaps contains all maps after they have been loaded into the kernel.
+//
+// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
+type bpfMaps struct {
+	ConfigMap   *ebpf.Map `ebpf:"config_map"`
+	EndpointMap *ebpf.Map `ebpf:"endpoint_map"`
+}
+
+func (m *bpfMaps) Close() error {
+	return _BpfClose(
+		m.ConfigMap,
+		m.EndpointMap,
+	)
+}
+
+// bpfPrograms contains all programs after they have been loaded into the kernel.
+//
+// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
+type bpfPrograms struct {
+	XdpProgFunc *ebpf.Program `ebpf:"xdp_prog_func"`
+}
+
+func (p *bpfPrograms) Close() error {
+	return _BpfClose(
+		p.XdpProgFunc,
+	)
+}
+
+func _BpfClose(closers ...io.Closer) error {
+	for _, closer := range closers {
+		if err := closer.Close(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Do not access this directly.
+//
+//go:embed bpf_bpfeb.o
+var _BpfBytes []byte
diff --git a/net/udprelay/xdp/bpf_bpfeb.o b/net/udprelay/xdp/bpf_bpfeb.o
new file mode 100644
index 000000000..9b035f983
Binary files /dev/null and b/net/udprelay/xdp/bpf_bpfeb.o differ
diff --git a/net/udprelay/xdp/bpf_bpfel.go b/net/udprelay/xdp/bpf_bpfel.go
new file mode 100644
index 000000000..b6599db04
--- /dev/null
+++ b/net/udprelay/xdp/bpf_bpfel.go
@@ -0,0 +1,131 @@
+// Code generated by bpf2go; DO NOT EDIT.
+//go:build 386 || amd64 || arm || arm64 || loong64 || mips64le || mipsle || ppc64le || riscv64
+
+package xdp
+
+import (
+	"bytes"
+	_ "embed"
+	"fmt"
+	"io"
+
+	"github.com/cilium/ebpf"
+)
+
+type bpfConfig struct{ DstPort uint16 }
+
+type bpfEndpoint struct {
+	ParticipantAddrs  [2][4]uint32
+	ParticipantPorts  [2]uint16
+	ParticipantIsIpv6 [2]uint8
+	_                 [2]byte
+}
+
+// loadBpf returns the embedded CollectionSpec for bpf.
+func loadBpf() (*ebpf.CollectionSpec, error) {
+	reader := bytes.NewReader(_BpfBytes)
+	spec, err := ebpf.LoadCollectionSpecFromReader(reader)
+	if err != nil {
+		return nil, fmt.Errorf("can't load bpf: %w", err)
+	}
+
+	return spec, err
+}
+
+// loadBpfObjects loads bpf and converts it into a struct.
+//
+// The following types are suitable as obj argument:
+//
+//	*bpfObjects
+//	*bpfPrograms
+//	*bpfMaps
+//
+// See ebpf.CollectionSpec.LoadAndAssign documentation for details.
+func loadBpfObjects(obj interface{}, opts *ebpf.CollectionOptions) error {
+	spec, err := loadBpf()
+	if err != nil {
+		return err
+	}
+
+	return spec.LoadAndAssign(obj, opts)
+}
+
+// bpfSpecs contains maps and programs before they are loaded into the kernel.
+//
+// It can be passed ebpf.CollectionSpec.Assign.
+type bpfSpecs struct {
+	bpfProgramSpecs
+	bpfMapSpecs
+}
+
+// bpfSpecs contains programs before they are loaded into the kernel.
+//
+// It can be passed ebpf.CollectionSpec.Assign.
+type bpfProgramSpecs struct {
+	XdpProgFunc *ebpf.ProgramSpec `ebpf:"xdp_prog_func"`
+}
+
+// bpfMapSpecs contains maps before they are loaded into the kernel.
+//
+// It can be passed ebpf.CollectionSpec.Assign.
+type bpfMapSpecs struct {
+	ConfigMap   *ebpf.MapSpec `ebpf:"config_map"`
+	EndpointMap *ebpf.MapSpec `ebpf:"endpoint_map"`
+}
+
+// bpfObjects contains all objects after they have been loaded into the kernel.
+//
+// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
+type bpfObjects struct {
+	bpfPrograms
+	bpfMaps
+}
+
+func (o *bpfObjects) Close() error {
+	return _BpfClose(
+		&o.bpfPrograms,
+		&o.bpfMaps,
+	)
+}
+
+// bpfMaps contains all maps after they have been loaded into the kernel.
+//
+// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
+type bpfMaps struct {
+	ConfigMap   *ebpf.Map `ebpf:"config_map"`
+	EndpointMap *ebpf.Map `ebpf:"endpoint_map"`
+}
+
+func (m *bpfMaps) Close() error {
+	return _BpfClose(
+		m.ConfigMap,
+		m.EndpointMap,
+	)
+}
+
+// bpfPrograms contains all programs after they have been loaded into the kernel.
+//
+// It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign.
+type bpfPrograms struct {
+	XdpProgFunc *ebpf.Program `ebpf:"xdp_prog_func"`
+}
+
+func (p *bpfPrograms) Close() error {
+	return _BpfClose(
+		p.XdpProgFunc,
+	)
+}
+
+func _BpfClose(closers ...io.Closer) error {
+	for _, closer := range closers {
+		if err := closer.Close(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Do not access this directly.
+//
+//go:embed bpf_bpfel.o
+var _BpfBytes []byte
diff --git a/net/udprelay/xdp/bpf_bpfel.o b/net/udprelay/xdp/bpf_bpfel.o
new file mode 100644
index 000000000..c72c4cf84
Binary files /dev/null and b/net/udprelay/xdp/bpf_bpfel.o differ
diff --git a/net/udprelay/xdp/xdp.c b/net/udprelay/xdp/xdp.c
new file mode 100644
index 000000000..386712975
--- /dev/null
+++ b/net/udprelay/xdp/xdp.c
@@ -0,0 +1,350 @@
+//go:build ignore
+
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+char _license[4] SEC("license") = "GPL";
+
+struct config {
+	__u16 dst_port;
+};
+struct config *unused_config __attribute__((unused)); // required by bpf2go -type
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct config));
+	__uint(max_entries, 1);
+} config_map SEC(".maps");
+
+struct endpoint {
+	__be32 participant_addrs[2][4];
+	__u16 participant_ports[2];
+	__u8 participant_is_ipv6[2];
+};
+struct endpoint *unused_endpoint __attribute__((unused)); // required by bpf2go -type
+
+#define MAX_GENEVE_VNI ((1 << 24) - 1)
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+	__uint(key_size, sizeof(__u32)); // key is Geneve VNI
+	__uint(value_size, sizeof(struct endpoint));
+	__uint(max_entries, MAX_GENEVE_VNI);
+} endpoint_map SEC(".maps");
+
+#define MAX_UDP_LEN_IPV4 1480
+
+#define MAX_UDP_LEN_IPV6 1460
+
+#define IP_MF 0x2000
+#define IP_OFFSET 0x1fff
+
+/*
+Geneve Header:
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |Ver|  Opt Len  |O|C|    Rsvd.    |         Protocol Type         |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |        Virtual Network Identifier (VNI)       |    Reserved    |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ |                                                               |
+ ~                    Variable-Length Options                    ~
+ |                                                               |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+struct geneve_header {
+	__u8 first;
+	__u8 second;
+	__be16 protocol;
+	__be32 vni;
+};
+
+static __always_inline __u16 csum_fold(__u32 csum) {
+	__u32 sum;
+	sum = (csum >> 16) + (csum & 0xffff); // maximum value 0x1fffe
+	sum += (sum >> 16);                   // maximum value 0xffff
+	return sum;
+}
+
+static __always_inline __u16 csum_fold_flip(__u32 csum) {
+	__u32 sum;
+	sum = (csum >> 16) + (csum & 0xffff); // maximum value 0x1fffe
+	sum += (sum >> 16);                   // maximum value 0xffff
+	return ~sum;
+}
+
+static __always_inline __u32 pseudo_sum_ipv6(struct ipv6hdr* ip6, __u16 udp_len) {
+	__u32 pseudo = 0; // TODO(jwhited): __u64 for intermediate checksum values to reduce number of ops
+	for (int i = 0; i < 8; i++) {
+		pseudo += ip6->saddr.in6_u.u6_addr16[i];
+		pseudo += ip6->daddr.in6_u.u6_addr16[i];
+	}
+	pseudo += bpf_htons(ip6->nexthdr);
+	pseudo += udp_len;
+	return pseudo;
+}
+
+static __always_inline __u32 pseudo_sum_ipv4(struct iphdr* ip, __u16 udp_len) {
+	__u32 pseudo = (__u16)ip->saddr;
+	pseudo += (__u16)(ip->saddr >> 16);
+	pseudo += (__u16)ip->daddr;
+	pseudo += (__u16)(ip->daddr >> 16);
+	pseudo += bpf_htons(ip->protocol);
+	pseudo += udp_len;
+	return pseudo;
+}
+
+// csum_const_size is an alternative to bpf_csum_diff. It's a verifier
+// workaround for when we are forced to use a constant max_size + bounds
+// checking. The alternative being passing a dynamic length to bpf_csum_diff
+// {from,to}_size arguments, which the verifier can't follow. For further info
+// see: https://github.com/iovisor/bcc/issues/2463#issuecomment-512503958
+static __always_inline __u16 csum_const_size(__u32 seed, void* from, void* data_end, int max_size) {
+	__u16 *buf = from;
+	for (int i = 0; i < max_size; i += 2) {
+		if ((void *)(buf + 1) > data_end) {
+			break;
+		}
+		seed += *buf;
+		buf++;
+	}
+	if ((void *)buf + 1 <= data_end) {
+		seed += *(__u8 *)buf;
+	}
+	return csum_fold_flip(seed);
+}
+SEC("xdp")
+int xdp_prog_func(struct xdp_md *ctx) {
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+
+	struct ethhdr *eth = data;
+	if ((void *)(eth + 1) > data_end) {
+		return XDP_PASS;
+	}
+
+	struct iphdr *ip;
+	struct ipv6hdr *ip6;
+	struct udphdr *udp;
+
+	int validate_udp_csum = 0;
+	int is_ipv6 = 0;
+	if (eth->h_proto == bpf_htons(ETH_P_IP)) {
+		ip = (void *)(eth + 1);
+		if ((void *)(ip + 1) > data_end) {
+			return XDP_PASS;
+		}
+
+		if (ip->ihl != 5 ||
+		    ip->version != 4 ||
+		    ip->protocol != IPPROTO_UDP ||
+		    (ip->frag_off & bpf_htons(IP_MF | IP_OFFSET)) != 0) {
+			return XDP_PASS;
+		}
+
+		// validate ipv4 header checksum
+		__u32 cs_unfolded = bpf_csum_diff(0, 0, (void *)ip, sizeof(*ip), 0);
+		__u16 cs = csum_fold_flip(cs_unfolded);
+		if (cs != 0) {
+			return XDP_PASS;
+		}
+
+		if (bpf_ntohs(ip->tot_len) != data_end - (void *)ip) {
+			return XDP_PASS;
+		}
+
+		udp = (void *)(ip + 1);
+		if ((void *)(udp + 1) > data_end) {
+			return XDP_PASS;
+		}
+
+		if (udp->check != 0) {
+			// https://datatracker.ietf.org/doc/html/rfc768#page-3
+			// If the computed checksum is zero, it is transmitted as all
+			// ones (the equivalent in one's complement arithmetic). An all
+			// zero transmitted checksum value means that the transmitter
+			// generated no checksum (for debugging or for higher level
+			// protocols that don't care).
+			validate_udp_csum = 1;
+		}
+	} else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) {
+		ip6 = (void *)(eth + 1);
+		if ((void *)(ip6 + 1) > data_end) {
+			return XDP_PASS;
+		}
+
+		if (ip6->version != 6 || ip6->nexthdr != IPPROTO_UDP) {
+			return XDP_PASS;
+		}
+
+		udp = (void *)(ip6 + 1);
+		if ((void *)(udp + 1) > data_end) {
+			return XDP_PASS;
+		}
+
+		if (bpf_ntohs(ip6->payload_len) != data_end - (void *)udp) {
+			return XDP_PASS;
+		}
+
+		// https://datatracker.ietf.org/doc/html/rfc8200#page-28
+		// Unlike IPv4, the default behavior when UDP packets are
+		// originated by an IPv6 node is that the UDP checksum is not
+		// optional. That is, whenever originating a UDP packet, an IPv6
+		// node must compute a UDP checksum over the packet and the
+		// pseudo-header, and, if that computation yields a result of
+		// zero, it must be changed to hex FFFF for placement in the UDP
+		// header. IPv6 receivers must discard UDP packets containing a
+		// zero checksum and should log the error.
+		validate_udp_csum = 1;
+		is_ipv6 = 1;
+	} else {
+		return XDP_PASS;
+	}
+
+	__u32 config_key = 0;
+	struct config *c = bpf_map_lookup_elem(&config_map, &config_key);
+	if (!c) {
+		return XDP_PASS;
+	}
+
+	if (bpf_ntohs(udp->len) != data_end - (void *)udp) {
+		return XDP_PASS;
+	}
+
+	if (bpf_ntohs(udp->dest) != c->dst_port) {
+		return XDP_PASS;
+	}
+
+	if (validate_udp_csum) {
+		__u16 cs;
+		__u32 pseudo_sum;
+		if (is_ipv6) {
+			pseudo_sum = pseudo_sum_ipv6(ip6, udp->len);
+			cs = csum_const_size(pseudo_sum, udp, data_end, MAX_UDP_LEN_IPV6);
+		} else {
+			pseudo_sum = pseudo_sum_ipv4(ip, udp->len);
+			cs = csum_const_size(pseudo_sum, udp, data_end, MAX_UDP_LEN_IPV4);
+		}
+		if (cs != 0) {
+			return XDP_PASS;
+		}
+	}
+
+	struct geneve_header *geneve = (void *)(udp + 1);
+	if ((void *)(geneve + 1) > data_end) {
+		return XDP_PASS;
+	}
+
+	if (geneve->first != 0) {
+		// first 2 bits are version, must be zero
+		// next 6 bits are opt len, must be zero
+		return XDP_PASS;
+	}
+
+	if (geneve->second != 0) {
+		// first bit is control, must be zero
+		// next bit is critical (options), must be zero
+		// next 6 bits are reserved, must be zero
+		return XDP_PASS;
+	}
+
+	if ((bpf_ntohl(geneve->vni) & 0x000000FF) != 0) {
+		// last byte is reserved, must be zero
+		return XDP_PASS;
+	}
+
+	__u32 vni_key = bpf_ntohl(geneve->vni) >> 8;
+	struct endpoint *e = bpf_map_lookup_elem(&endpoint_map, &vni_key);
+	if (!e) {
+		return XDP_PASS;
+	}
+
+	int out_participant_index = -1; // -1 = unmatched
+	if (is_ipv6) {
+		// TODO
+	} else {
+		for (int i = 0; i < 2; i++) {
+			if (e->participant_is_ipv6[i] == 0 &&
+			    e->participant_addrs[i][3] == ip->saddr &&
+			    e->participant_ports[i] == bpf_ntohs(udp->source))
+			{
+				if (i == 0) {
+					out_participant_index = 1;
+				} else {
+					out_participant_index = 0;
+				}
+				break;
+			}
+		}
+	}
+	if (out_participant_index == -1) {
+		return XDP_PASS;
+	}
+
+	if (e->participant_is_ipv6[out_participant_index] == is_ipv6) {
+		// matching in/out address family
+		if (is_ipv6) {
+			// TODO: in ipv6, out ipv6
+		} else {
+			// TODO: in ipv4, out ipv4
+
+			// Update IPv4 header
+			__be32 p_addr = e->participant_addrs[out_participant_index][3];
+			__u32 ip_csum = ~(__u32)ip->check;
+			__u32 udp_csum = ~(__u32)udp->check;
+			ip_csum = bpf_csum_diff(&ip->saddr, 4, &p_addr, 4, ip_csum);
+			udp_csum = bpf_csum_diff(&ip->saddr, 4, &p_addr, 4, udp_csum);
+			ip->check = csum_fold_flip(ip_csum);
+			ip->saddr = ip->daddr;
+			ip->daddr = p_addr;
+
+			#define AF_INET 2
+			struct bpf_fib_lookup fib_params = {};
+			fib_params.family = AF_INET;
+			fib_params.tos = ip->tos;
+			fib_params.l4_protocol = ip->protocol;
+			fib_params.sport = 0;
+			fib_params.dport = 0;
+			fib_params.tot_len = bpf_ntohs(ip->tot_len);
+			fib_params.ipv4_src = ip->saddr;
+			fib_params.ipv4_dst = ip->daddr;
+			fib_params.ifindex = ctx->ingress_ifindex;
+
+			int rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), BPF_FIB_LOOKUP_DIRECT);
+			if (rc != BPF_FIB_LKUP_RET_SUCCESS) {
+				return XDP_ABORTED;
+			}
+
+			// Rewrite ethernet header source and destination address.
+			__builtin_memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
+			__builtin_memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
+
+			// Update UDP header
+			__u32 old_udp_port = (__u32)udp->source;
+			__u32 new_udp_port = (__u32)bpf_htons(e->participant_ports[out_participant_index]);
+			udp_csum = bpf_csum_diff(&old_udp_port, 4, &new_udp_port, 4, udp_csum);
+			udp->check = csum_fold_flip(udp_csum);
+			udp->source = udp->dest;
+			udp->dest = bpf_htons(e->participant_ports[out_participant_index]);
+			udp = (void *)(ip + 1);
+			if ((void *)(udp + 1) > data_end) {
+				return XDP_ABORTED;
+			}
+
+			return XDP_TX;
+		}
+	} else if (e->participant_is_ipv6[out_participant_index] == 0) {
+		// TODO: in ipv4, out ipv6
+	} else {
+		// TODO: in ipv6, out ipv4
+	}
+
+	return XDP_PASS;
+}
\ No newline at end of file
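Following the XDP program, a small userspace Go model of its Geneve fixed-header checks, which may help cross-check the BPF logic (in particular the reserved-byte test and the VNI extraction) from an ordinary Go test. The function name and the 8-byte fixed-header constant are illustrative assumptions, not part of this patch.

	package xdp

	import "encoding/binary"

	// geneveVNI mirrors the Geneve checks in xdp_prog_func: the Ver/Opt Len
	// byte, the O/C/Rsvd byte, and the trailing reserved byte must all be
	// zero, and the VNI is the upper 24 bits of the final 32-bit word of the
	// fixed header.
	func geneveVNI(pkt []byte) (vni uint32, ok bool) {
		const geneveFixedHeaderLength = 8
		if len(pkt) < geneveFixedHeaderLength {
			return 0, false
		}
		if pkt[0] != 0 || pkt[1] != 0 { // version/opt-len and O/C/reserved bits
			return 0, false
		}
		word := binary.BigEndian.Uint32(pkt[4:8])
		if word&0xFF != 0 { // low byte is reserved
			return 0, false
		}
		return word >> 8, true
	}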
diff --git a/net/udprelay/xdp/xdp.go b/net/udprelay/xdp/xdp.go
new file mode 100644
index 000000000..14c361879
--- /dev/null
+++ b/net/udprelay/xdp/xdp.go
@@ -0,0 +1,48 @@
+package xdp
+
+import "net/netip"
+
+// XDPAttachFlags represents how an XDP program will be attached to an
+// interface. This is a mirror of cilium/ebpf/link.AttachFlags, without
+// pulling that package in for non-Linux builds.
+type XDPAttachFlags uint32
+
+const (
+	// XDPDriverFallbackGenericMode attempts XDPDriverMode, and falls back to
+	// XDPGenericMode if the driver does not support XDP.
+	XDPDriverFallbackGenericMode = 0
+)
+
+const (
+	// XDPGenericMode (SKB) links XDP BPF program for drivers which do
+	// not yet support native XDP.
+	XDPGenericMode XDPAttachFlags = 1 << (iota + 1)
+	// XDPDriverMode links XDP BPF program into the driver’s receive path.
+	XDPDriverMode
+	// XDPOffloadMode offloads the entire XDP BPF program into hardware.
+	XDPOffloadMode
+)
+
+type FIBConfig struct {
+	DeviceName string
+	// TODO: DstPort is singular, but udp4 and udp6 can be independent ports if
+	// the user supplied a zero port value.
+	DstPort     uint16
+	AttachFlags XDPAttachFlags
+}
+
+func (f FIBConfig) validate() error { return nil }
+
+type FIBOption interface {
+	apply(*fibOptions)
+}
+
+type fibOptions struct {
+	noAttach bool
+}
+
+type FIB interface {
+	Delete(vni uint32) error
+	Upsert(vni uint32, participants [2]netip.AddrPort) error
+	Close() error
+}
diff --git a/net/udprelay/xdp/xdp_linux.go b/net/udprelay/xdp/xdp_linux.go
new file mode 100644
index 000000000..9a61942e9
--- /dev/null
+++ b/net/udprelay/xdp/xdp_linux.go
@@ -0,0 +1,109 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+//go:build linux
+
+package xdp
+
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"net"
+	"net/netip"
+
+	"github.com/cilium/ebpf"
+	"github.com/cilium/ebpf/link"
+)
+
+//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -type config -type endpoint bpf xdp.c -- -I ../../../derp/xdp/headers
+
+func NewFIB(config *FIBConfig, opts ...FIBOption) (FIB, error) {
+	o := &fibOptions{}
+	for _, opt := range opts {
+		opt.apply(o)
+	}
+	err := config.validate()
+	if err != nil {
+		return nil, fmt.Errorf("invalid config: %v", err)
+	}
+	objs := new(bpfObjects)
+	err = loadBpfObjects(objs, nil)
+	if err != nil {
+		var ve *ebpf.VerifierError
+		if errors.As(err, &ve) {
+			err = fmt.Errorf("verifier error: %+v", ve)
+		}
+		return nil, fmt.Errorf("error loading XDP program: %w", err)
+	}
+	f := &linuxFIB{
+		objs:    objs,
+		dstPort: config.DstPort,
+	}
+	var key uint32
+	xdpConfig := &bpfConfig{
+		DstPort: config.DstPort,
+	}
+	err = objs.ConfigMap.Put(key, xdpConfig)
+	if err != nil {
+		return nil, fmt.Errorf("error loading config in eBPF map: %w", err)
+	}
+	if o.noAttach {
+		return f, nil
+	}
+	iface, err := net.InterfaceByName(config.DeviceName)
+	if err != nil {
+		return nil, fmt.Errorf("error finding device: %w", err)
+	}
+	link, err := link.AttachXDP(link.XDPOptions{
+		Program:   objs.XdpProgFunc,
+		Interface: iface.Index,
+		Flags:     link.XDPAttachFlags(config.AttachFlags),
+	})
+	if err != nil {
+		return nil, fmt.Errorf("error attaching XDP program to dev: %w", err)
+	}
+	f.link = link
+	return f, nil
+}
+
+type linuxFIB struct {
+	objs    *bpfObjects
+	dstPort uint16
+	link    link.Link
+}
+
+func (l *linuxFIB) Delete(vni uint32) error {
+	return l.objs.EndpointMap.Delete(&vni)
+}
+
+func (l *linuxFIB) Upsert(vni uint32, participants [2]netip.AddrPort) error {
+	endpoint := bpfEndpoint{}
+	for i, participant := range participants {
+		as16 := participant.Addr().As16()
+		for j := 0; j < 4; j++ {
+			endpoint.ParticipantAddrs[i][j] = binary.NativeEndian.Uint32(as16[j*4:])
+		}
+		endpoint.ParticipantPorts[i] = participant.Port()
+		if participant.Addr().Is6() {
+			endpoint.ParticipantIsIpv6[i] = 1
+		}
+	}
+	numCPU, err := ebpf.PossibleCPU()
+	if err != nil {
+		return err
+	}
+	vals := make([]bpfEndpoint, numCPU)
+	for i := range vals {
+		vals[i] = endpoint
+	}
+	return l.objs.EndpointMap.Put(&vni, vals)
+}
+
+// Close detaches the XDP program (if attached) and releases the loaded
+// eBPF objects.
+func (l *linuxFIB) Close() error {
+	var err error
+	if l.link != nil {
+		err = l.link.Close()
+	}
+	return errors.Join(err, l.objs.Close())
+}
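A note on the Upsert encoding above, with a small self-contained sketch (the helper name is hypothetical, intended for tests): As16 puts an IPv4 participant's bytes in the last 4 bytes of the 16-byte form, so after the native-endian split the address lands in word index 3 — the same participant_addrs[i][3] that xdp.c compares against ip->saddr — and the whole value is replicated once per possible CPU because endpoint_map is a BPF_MAP_TYPE_PERCPU_HASH.

	package xdp

	import (
		"encoding/binary"
		"net/netip"
	)

	// encodeParticipantAddr mirrors how Upsert packs one participant address
	// into the [4]uint32 expected by the eBPF endpoint value.
	func encodeParticipantAddr(addr netip.Addr) (words [4]uint32) {
		as16 := addr.As16() // IPv4 occupies the last 4 bytes (v4-mapped form)
		for j := 0; j < 4; j++ {
			words[j] = binary.NativeEndian.Uint32(as16[j*4:])
		}
		return words
	}

Because the bytes are read and written with the same native endianness, words[3] holds the IPv4 address in wire byte order, which is what the __be32 comparison in the XDP program relies on.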
diff --git a/net/udprelay/xdp/xdp_notlinux.go b/net/udprelay/xdp/xdp_notlinux.go
new file mode 100644
index 000000000..ba1466e94
--- /dev/null
+++ b/net/udprelay/xdp/xdp_notlinux.go
@@ -0,0 +1,18 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+//go:build !linux
+
+package xdp
+
+import "net/netip"
+
+type noopFIB struct{}
+
+func (noopFIB) Delete(vni uint32) error                                 { return nil }
+func (noopFIB) Upsert(vni uint32, participants [2]netip.AddrPort) error { return nil }
+func (noopFIB) Close() error                                            { return nil }
+
+func NewFIB(config *FIBConfig, opts ...FIBOption) (FIB, error) {
+	return noopFIB{}, nil
+}