diff --git a/internal/app/machined/pkg/controllers/kubespan/manager.go b/internal/app/machined/pkg/controllers/kubespan/manager.go index 754e0576d..dfdb94b4e 100644 --- a/internal/app/machined/pkg/controllers/kubespan/manager.go +++ b/internal/app/machined/pkg/controllers/kubespan/manager.go @@ -56,10 +56,10 @@ type WireguardClient interface { } // RulesManagerFactory allows mocking RulesManager. -type RulesManagerFactory func(targetTable, internalMark int) RulesManager +type RulesManagerFactory func(targetTable, internalMark, markMask int) RulesManager // NfTablesManagerFactory allows mocking NfTablesManager. -type NfTablesManagerFactory func(externalMark, internalMark uint32) NfTablesManager +type NfTablesManagerFactory func(externalMark, internalMark, markMask uint32) NfTablesManager // Inputs implements controller.Controller interface. func (ctrl *ManagerController) Inputs() []controller.Input { @@ -221,7 +221,7 @@ func (ctrl *ManagerController) Run(ctx context.Context, r controller.Runtime, lo } if rulesMgr == nil { - rulesMgr = ctrl.RulesManagerFactory(constants.KubeSpanDefaultRoutingTable, constants.KubeSpanDefaultForceFirewallMark) + rulesMgr = ctrl.RulesManagerFactory(constants.KubeSpanDefaultRoutingTable, constants.KubeSpanDefaultForceFirewallMark, constants.KubeSpanDefaultFirewallMask) if err = rulesMgr.Install(); err != nil { return fmt.Errorf("failed setting up routing rules: %w", err) @@ -229,7 +229,7 @@ func (ctrl *ManagerController) Run(ctx context.Context, r controller.Runtime, lo } if nfTablesMgr == nil { - nfTablesMgr = ctrl.NfTablesManagerFactory(constants.KubeSpanDefaultFirewallMark, constants.KubeSpanDefaultForceFirewallMark) + nfTablesMgr = ctrl.NfTablesManagerFactory(constants.KubeSpanDefaultFirewallMark, constants.KubeSpanDefaultForceFirewallMark, constants.KubeSpanDefaultFirewallMask) } cfgSpec := cfg.(*kubespan.Config).TypedSpec() diff --git a/internal/app/machined/pkg/controllers/kubespan/manager_test.go 
b/internal/app/machined/pkg/controllers/kubespan/manager_test.go index 4aeefacc9..f8fa94009 100644 --- a/internal/app/machined/pkg/controllers/kubespan/manager_test.go +++ b/internal/app/machined/pkg/controllers/kubespan/manager_test.go @@ -122,10 +122,10 @@ func (suite *ManagerSuite) TestReconcile() { WireguardClientFactory: func() (kubespanctrl.WireguardClient, error) { return mockWireguard, nil }, - RulesManagerFactory: func(_, _ int) kubespanctrl.RulesManager { + RulesManagerFactory: func(_, _, _ int) kubespanctrl.RulesManager { return mockRulesManager{} }, - NfTablesManagerFactory: func(_, _ uint32) kubespanctrl.NfTablesManager { + NfTablesManagerFactory: func(_, _, _ uint32) kubespanctrl.NfTablesManager { return mockNfTables }, PeerReconcileInterval: time.Second, diff --git a/internal/app/machined/pkg/controllers/kubespan/nftables.go b/internal/app/machined/pkg/controllers/kubespan/nftables.go index 3ae9ba73f..28df291cc 100644 --- a/internal/app/machined/pkg/controllers/kubespan/nftables.go +++ b/internal/app/machined/pkg/controllers/kubespan/nftables.go @@ -20,7 +20,7 @@ type NfTablesManager interface { } // NewNfTablesManager initializes NfTablesManager. 
-func NewNfTablesManager(externalMark, internalMark uint32) NfTablesManager { +func NewNfTablesManager(externalMark, internalMark, markMask uint32) NfTablesManager { nfTable := &nftables.Table{ Family: nftables.TableFamilyINet, Name: "talos_kubespan", @@ -29,6 +29,7 @@ func NewNfTablesManager(externalMark, internalMark uint32) NfTablesManager { return &nfTablesManager{ ExternalMark: externalMark, InternalMark: internalMark, + MarkMask: markMask, nfTable: nfTable, targetSet4: &nftables.Set{ @@ -50,6 +51,7 @@ func NewNfTablesManager(externalMark, internalMark uint32) NfTablesManager { type nfTablesManager struct { InternalMark uint32 ExternalMark uint32 + MarkMask uint32 currentSet *netaddr.IPSet @@ -173,25 +175,40 @@ func (m *nfTablesManager) setNFTable(ips *netaddr.IPSet) error { return fmt.Errorf("failed to add IPv6 set: %w", err) } + // meta mark & 0x00000060 == 0x00000020 accept + ruleExpr := []expr.Any{ + // Load the firewall mark into register 1 + &expr.Meta{ + Key: expr.MetaKeyMARK, + Register: 1, + }, + // Mask the mark with the configured mask: + // R1 = R1 & mask + &expr.Bitwise{ + SourceRegister: 1, + DestRegister: 1, + Len: 4, + Xor: binaryutil.NativeEndian.PutUint32(0), + Mask: binaryutil.NativeEndian.PutUint32(m.MarkMask), + }, + // Compare the masked firewall mark with expected value + &expr.Cmp{ + Op: expr.CmpOpEq, + Register: 1, + Data: binaryutil.NativeEndian.PutUint32(m.ExternalMark), + }, + // Accept the packet to stop the ruleset processing + &expr.Verdict{ + Kind: expr.VerdictAccept, + }, + } + // match fwmark of Wireguard interface (not kubespan mark) // accept and return without modifying the table or mark c.AddRule(&nftables.Rule{ Table: m.nfTable, Chain: preChain, - Exprs: []expr.Any{ - &expr.Meta{ - Key: expr.MetaKeyMARK, - Register: 1, - }, - &expr.Cmp{ - Op: expr.CmpOpEq, - Register: 1, - Data: binaryutil.NativeEndian.PutUint32(m.ExternalMark), - }, - &expr.Verdict{ - Kind: expr.VerdictAccept, - }, - }, + Exprs: ruleExpr, }) // match 
fwmark of Wireguard interface (not kubespan mark) @@ -199,44 +216,31 @@ func (m *nfTablesManager) setNFTable(ips *netaddr.IPSet) error { c.AddRule(&nftables.Rule{ Table: m.nfTable, Chain: outChain, - Exprs: []expr.Any{ - &expr.Meta{ - Key: expr.MetaKeyMARK, - Register: 1, - }, - &expr.Cmp{ - Op: expr.CmpOpEq, - Register: 1, - Data: binaryutil.NativeEndian.PutUint32(m.ExternalMark), - }, - &expr.Verdict{ - Kind: expr.VerdictAccept, - }, - }, + Exprs: ruleExpr, }) c.AddRule(&nftables.Rule{ Table: m.nfTable, Chain: preChain, - Exprs: matchIPv4Set(m.targetSet4, m.InternalMark), + Exprs: matchIPv4Set(m.targetSet4, m.InternalMark, m.MarkMask), }) c.AddRule(&nftables.Rule{ Table: m.nfTable, Chain: preChain, - Exprs: matchIPv6Set(m.targetSet6, m.InternalMark), + Exprs: matchIPv6Set(m.targetSet6, m.InternalMark, m.MarkMask), }) c.AddRule(&nftables.Rule{ Table: m.nfTable, Chain: outChain, - Exprs: matchIPv4Set(m.targetSet4, m.InternalMark), + Exprs: matchIPv4Set(m.targetSet4, m.InternalMark, m.MarkMask), }) c.AddRule(&nftables.Rule{ Table: m.nfTable, Chain: outChain, - Exprs: matchIPv6Set(m.targetSet6, m.InternalMark), + Exprs: matchIPv6Set(m.targetSet6, m.InternalMark, m.MarkMask), }) if err := c.Flush(); err != nil { @@ -246,15 +250,15 @@ func (m *nfTablesManager) setNFTable(ips *netaddr.IPSet) error { return nil } -func matchIPv4Set(set *nftables.Set, mark uint32) []expr.Any { - return matchIPSet(set, mark, nftables.TableFamilyIPv4) +func matchIPv4Set(set *nftables.Set, mark, mask uint32) []expr.Any { + return matchIPSet(set, mark, mask, nftables.TableFamilyIPv4) } -func matchIPv6Set(set *nftables.Set, mark uint32) []expr.Any { - return matchIPSet(set, mark, nftables.TableFamilyIPv6) +func matchIPv6Set(set *nftables.Set, mark, mask uint32) []expr.Any { + return matchIPSet(set, mark, mask, nftables.TableFamilyIPv6) } -func matchIPSet(set *nftables.Set, mark uint32, family nftables.TableFamily) []expr.Any { +func matchIPSet(set *nftables.Set, mark, mask uint32, family 
nftables.TableFamily) []expr.Any { var ( offset uint32 = 16 length uint32 = 4 @@ -265,6 +269,7 @@ func matchIPSet(set *nftables.Set, mark uint32, family nftables.TableFamily) []e length = 16 } + // ip daddr @kubespan_targets_ipv4 meta mark set meta mark & 0xffffffdf | 0x00000040 accept return []expr.Any{ // Store protocol type to register 1 &expr.Meta{ @@ -290,17 +295,29 @@ func matchIPSet(set *nftables.Set, mark uint32, family nftables.TableFamily) []e SetName: set.Name, SetID: set.ID, }, - // Store Firewall Force mark to register 1 - &expr.Immediate{ + // Load the current packet mark into register 1 + &expr.Meta{ + Key: expr.MetaKeyMARK, Register: 1, - Data: binaryutil.NativeEndian.PutUint32(mark), }, - // Set firewall mark + // This bitwise is equivalent to: R1 = (R1 & ^mask) | mark, provided mark only uses bits covered by mask + // + // The NFTables backend bitwise operation is R3 = (R2 & MASK) ^ XOR, + // so we need to do a bit of a trick to do what we need: R1 = (R1 & ^mask) ^ mark + &expr.Bitwise{ + SourceRegister: 1, + DestRegister: 1, + Len: 4, + Xor: binaryutil.NativeEndian.PutUint32(mark), + Mask: binaryutil.NativeEndian.PutUint32(^mask), + }, + // Set firewall mark to the value computed in register 1 &expr.Meta{ Key: expr.MetaKeyMARK, SourceRegister: true, Register: 1, }, + // Accept the packet to stop the ruleset processing &expr.Verdict{ Kind: expr.VerdictAccept, }, diff --git a/internal/app/machined/pkg/controllers/kubespan/nftables_test.go b/internal/app/machined/pkg/controllers/kubespan/nftables_test.go index 1fddde1a4..c776f3aae 100644 --- a/internal/app/machined/pkg/controllers/kubespan/nftables_test.go +++ b/internal/app/machined/pkg/controllers/kubespan/nftables_test.go @@ -16,7 +16,7 @@ import ( func TestNfTables(t *testing.T) { // use a different mark to avoid conflicts with running kubespan - mgr := kubespan.NewNfTablesManager(constants.KubeSpanDefaultFirewallMark+10, constants.KubeSpanDefaultForceFirewallMark+10) + mgr := 
kubespan.NewNfTablesManager(constants.KubeSpanDefaultFirewallMark+10, constants.KubeSpanDefaultForceFirewallMark<<1, constants.KubeSpanDefaultFirewallMask<<1) // cleanup should be fine if nothing is installed assert.NoError(t, mgr.Cleanup()) diff --git a/internal/app/machined/pkg/controllers/kubespan/routing_rules.go b/internal/app/machined/pkg/controllers/kubespan/routing_rules.go index 0d60288bc..09d0d2433 100644 --- a/internal/app/machined/pkg/controllers/kubespan/routing_rules.go +++ b/internal/app/machined/pkg/controllers/kubespan/routing_rules.go @@ -23,16 +23,18 @@ type RulesManager interface { } // NewRulesManager initializes new RulesManager. -func NewRulesManager(targetTable, internalMark int) RulesManager { +func NewRulesManager(targetTable, internalMark, markMask int) RulesManager { return &rulesManager{ TargetTable: targetTable, InternalMark: internalMark, + MarkMask: markMask, } } type rulesManager struct { TargetTable int InternalMark int + MarkMask int } // Install routing rules. 
@@ -49,7 +51,7 @@ func (m *rulesManager) Install() error { Family: unix.AF_INET, Table: m.TargetTable, Mark: m.InternalMark, - Mask: -1, + Mask: m.MarkMask, Goto: -1, Flow: -1, SuppressIfgroup: -1, @@ -65,7 +67,7 @@ func (m *rulesManager) Install() error { Family: unix.AF_INET6, Table: m.TargetTable, Mark: m.InternalMark, - Mask: -1, + Mask: m.MarkMask, Goto: -1, Flow: -1, SuppressIfgroup: -1, diff --git a/internal/app/machined/pkg/controllers/kubespan/routing_rules_test.go b/internal/app/machined/pkg/controllers/kubespan/routing_rules_test.go index 12aecc92c..edeb8186e 100644 --- a/internal/app/machined/pkg/controllers/kubespan/routing_rules_test.go +++ b/internal/app/machined/pkg/controllers/kubespan/routing_rules_test.go @@ -14,7 +14,7 @@ import ( func TestRoutingRules(t *testing.T) { // use a different table/mark to avoid conflicts with running kubespan - mgr := kubespan.NewRulesManager(constants.KubeSpanDefaultRoutingTable+10, constants.KubeSpanDefaultForceFirewallMark+10) + mgr := kubespan.NewRulesManager(constants.KubeSpanDefaultRoutingTable+10, constants.KubeSpanDefaultForceFirewallMark<<1, constants.KubeSpanDefaultFirewallMask<<1) // cleanup should be fine if nothing is installed assert.NoError(t, mgr.Cleanup()) diff --git a/pkg/machinery/constants/constants.go b/pkg/machinery/constants/constants.go index 004b67cc1..91c89f133 100644 --- a/pkg/machinery/constants/constants.go +++ b/pkg/machinery/constants/constants.go @@ -619,12 +619,17 @@ const ( // KubeSpanDefaultFirewallMark is the default firewall mark to use for Wireguard encrypted egress packets. // // Normal Wireguard configurations will NOT use this firewall mark. - KubeSpanDefaultFirewallMark = 0x51820 + KubeSpanDefaultFirewallMark = 0x20 // KubeSpanDefaultForceFirewallMark is the default firewall mark to use for packets destined to IPs serviced by KubeSpan. // // It is used to signal that matching packets should be forced into the Wireguard interface. 
- KubeSpanDefaultForceFirewallMark = 0x51821 + KubeSpanDefaultForceFirewallMark = 0x40 + + // KubeSpanDefaultFirewallMask is the mask applied to the packet mark when matching and setting the mark. + // + // This mask signals the bits of the firewall mark used by KubeSpan. + KubeSpanDefaultFirewallMask = KubeSpanDefaultFirewallMark | KubeSpanDefaultForceFirewallMark // KubeSpanDefaultPeerKeepalive is the interval at which Wireguard Peer Keepalives should be sent. KubeSpanDefaultPeerKeepalive = 25 * time.Second diff --git a/website/content/v1.2/learn-more/kubespan.md b/website/content/v1.2/learn-more/kubespan.md index f0f53647c..1a6bf803d 100644 --- a/website/content/v1.2/learn-more/kubespan.md +++ b/website/content/v1.2/learn-more/kubespan.md @@ -98,3 +98,113 @@ So in summary, we: - send anything which is sent to that routing table through the WireGuard interface This gives us an isolated, resilient, tolerant, and non-invasive way to route Kubernetes traffic safely, automatically, and transparently through WireGuard across almost any set of network topologies. + +## Design Decisions + +### Routing + +Routing for Wireguard is a touch complicated when the set of possible peer +endpoints includes at least one member of the set of _destinations_. +That is, packets from Wireguard to a peer endpoint should not be sent to +Wireguard, lest a loop be created. + +In order to handle this situation, Wireguard provides the ability to mark +packets which it generates, so their routing can be handled separately. + +In our case, though, we actually want the inverse of this: we want to route +Wireguard packets however the normal networking routes and rules say they should +be routed, while packets destined for the other side of Wireguard Peers should +be forced into Wireguard interfaces. + +While IP Rules allow you to invert matches, they do not support matching based +on IP sets. 
+That means, to use simple rules, we would have to add a rule for +each destination, which could reach into hundreds or thousands of rules to +manage. +This is not really much of a performance issue, but it is a management +issue, since it is expected that we would not be the only manager of rules in +the system, and rules offer no facility to tag for ownership. + +IP Sets are supported by IPTables, and we could integrate there. +However, IPTables exists in a global namespace, which makes it fragile when +multiple parties manipulate it. +The newer NFTables replacement for IPTables, though, allows users to +independently hook into various points of Netfilter, keeping all such rules and +sets independent. +This means that regardless of what CNIs or other user-side routing rules may do, +our KubeSpan setup will not be messed up. + +Therefore, we utilise NFTables (which natively supports IP sets and owner +grouping) instead, to mark matching traffic which should be sent to the +Wireguard interface. +This way, we can keep all our KubeSpan set logic in one place, allowing us to +simply use a single `ip rule` match +for our fwmark, sending those matched packets to a separate routing table +with one rule: default to the Wireguard interface. + +So we have three components: + + 1. A routing table for Wireguard-destined packets + 2. An NFTables table which defines the set of destinations; packets to these will + be marked with our firewall mark. + - Hook into PreRouting (type Filter) + - Hook into Outgoing (type Route) + 3. One IP Rule which sends packets marked with our firewall mark to our Wireguard + routing table. + +### Routing Table + +The routing table (number 180 by default) is simple, containing a single route for each family: send everything through the Wireguard interface. + +### NFTables + +The logic inside NFTables is fairly simple. +First, everything is compiled into a single table: `talos_kubespan`. 
+ +Next, two chains are set up: one for the `prerouting` hook (`kubespan_prerouting`) +and the other for the `outgoing` hook (`kubespan_outgoing`). + +We define two sets of target IP prefixes: one for IPv6 (`kubespan_targets_ipv6`) +and the other for IPv4 (`kubespan_targets_ipv4`). + +Last, we add rules to each chain which basically specify: + + 1. If the packet is marked as _from_ Wireguard, just accept it and terminate + the chain. + 2. If the packet matches an IP in either of the target IP sets, mark that + packet with the _to_ Wireguard mark. + +### Rules + +There are two route rules defined: one to match IPv6 packets and the other to +match IPv4 packets. + +These rules say the same thing for each: if the packet is marked as one that should +go _to_ Wireguard, send it to the Wireguard +routing table. + +### Firewall Mark + +KubeSpan uses only two bits of the firewall mark, selected by the mask `0x00000060`. + +> Note: if other software on the node is using the bits `0x60` of the firewall mark, this +> might cause conflicts and break KubeSpan. +> +> At the time of writing, it was confirmed that Calico CNI uses bits `0xffff0000` and +> Cilium CNI uses bits `0xf00`, so KubeSpan is compatible with both. +> Flannel CNI doesn't use the firewall mark at all. + +In the routing rules table, we match on the mark `0x40` with the mask `0x60`: + +```text +32500: from all fwmark 0x40/0x60 lookup 180 +``` + +In the NFTables table, we match with the same mask `0x60` and we set the mark by only modifying +bits from the `0x60` mask: + +```text +meta mark & 0x00000060 == 0x00000020 accept +ip daddr @kubespan_targets_ipv4 meta mark set meta mark & 0xffffffdf | 0x00000040 accept +ip6 daddr @kubespan_targets_ipv6 meta mark set meta mark & 0xffffffdf | 0x00000040 accept +```