mirror of
https://github.com/siderolabs/talos.git
synced 2026-05-05 04:16:21 +02:00
fix: use masks and different firewall mark for KubeSpan
Fixes #4836 Firewall mark is `uint32` attached to the packet in the Linux kernel (it's not transmitted on the wire). This is a shared value for all networking software, so multiple components might attempt to set and match on the firewall mark. Cilium and Calico CNIs are using firewall marks internally, but they touch only some bits of the firewall mark. The way KubeSpan was implemented before this PR, it was doing direct match on the firewall mark, and setting the whole `uint32`, so it comes into conflict with any other networking component using firewall marks. The other problem was that firewall mark 0x51820 (0x51821) was too "wide" touching random bits of the 32-bit value for no good reason. So this change contains two fixes: * make firewall mark exactly a single bit (we use bits `0x20` and `0x40` now) * match and mark packets with the mask (don't touch bits outside of the mask when setting the mark and ignore bits outside of the mask when matching on the mark). This was tested successfully with both Cilium CNI (default config + `ipam.mode=kubernetes`) and Calico CNI (default config). One thing to note is that for KubeSpan and Talos it's important to make sure that `podSubnets` in the machine config match CNI setting for `podCIDRs`. Signed-off-by: Andrey Smirnov <andrey.smirnov@talos-systems.com>
This commit is contained in:
parent
80444a43d9
commit
644e803adf
@ -56,10 +56,10 @@ type WireguardClient interface {
|
||||
}
|
||||
|
||||
// RulesManagerFactory allows mocking RulesManager.
|
||||
type RulesManagerFactory func(targetTable, internalMark int) RulesManager
|
||||
type RulesManagerFactory func(targetTable, internalMark, markMask int) RulesManager
|
||||
|
||||
// NfTablesManagerFactory allows mocking NfTablesManager.
|
||||
type NfTablesManagerFactory func(externalMark, internalMark uint32) NfTablesManager
|
||||
type NfTablesManagerFactory func(externalMark, internalMark, markMask uint32) NfTablesManager
|
||||
|
||||
// Inputs implements controller.Controller interface.
|
||||
func (ctrl *ManagerController) Inputs() []controller.Input {
|
||||
@ -221,7 +221,7 @@ func (ctrl *ManagerController) Run(ctx context.Context, r controller.Runtime, lo
|
||||
}
|
||||
|
||||
if rulesMgr == nil {
|
||||
rulesMgr = ctrl.RulesManagerFactory(constants.KubeSpanDefaultRoutingTable, constants.KubeSpanDefaultForceFirewallMark)
|
||||
rulesMgr = ctrl.RulesManagerFactory(constants.KubeSpanDefaultRoutingTable, constants.KubeSpanDefaultForceFirewallMark, constants.KubeSpanDefaultFirewallMask)
|
||||
|
||||
if err = rulesMgr.Install(); err != nil {
|
||||
return fmt.Errorf("failed setting up routing rules: %w", err)
|
||||
@ -229,7 +229,7 @@ func (ctrl *ManagerController) Run(ctx context.Context, r controller.Runtime, lo
|
||||
}
|
||||
|
||||
if nfTablesMgr == nil {
|
||||
nfTablesMgr = ctrl.NfTablesManagerFactory(constants.KubeSpanDefaultFirewallMark, constants.KubeSpanDefaultForceFirewallMark)
|
||||
nfTablesMgr = ctrl.NfTablesManagerFactory(constants.KubeSpanDefaultFirewallMark, constants.KubeSpanDefaultForceFirewallMark, constants.KubeSpanDefaultFirewallMask)
|
||||
}
|
||||
|
||||
cfgSpec := cfg.(*kubespan.Config).TypedSpec()
|
||||
|
||||
@ -122,10 +122,10 @@ func (suite *ManagerSuite) TestReconcile() {
|
||||
WireguardClientFactory: func() (kubespanctrl.WireguardClient, error) {
|
||||
return mockWireguard, nil
|
||||
},
|
||||
RulesManagerFactory: func(_, _ int) kubespanctrl.RulesManager {
|
||||
RulesManagerFactory: func(_, _, _ int) kubespanctrl.RulesManager {
|
||||
return mockRulesManager{}
|
||||
},
|
||||
NfTablesManagerFactory: func(_, _ uint32) kubespanctrl.NfTablesManager {
|
||||
NfTablesManagerFactory: func(_, _, _ uint32) kubespanctrl.NfTablesManager {
|
||||
return mockNfTables
|
||||
},
|
||||
PeerReconcileInterval: time.Second,
|
||||
|
||||
@ -20,7 +20,7 @@ type NfTablesManager interface {
|
||||
}
|
||||
|
||||
// NewNfTablesManager initializes NfTablesManager.
|
||||
func NewNfTablesManager(externalMark, internalMark uint32) NfTablesManager {
|
||||
func NewNfTablesManager(externalMark, internalMark, markMask uint32) NfTablesManager {
|
||||
nfTable := &nftables.Table{
|
||||
Family: nftables.TableFamilyINet,
|
||||
Name: "talos_kubespan",
|
||||
@ -29,6 +29,7 @@ func NewNfTablesManager(externalMark, internalMark uint32) NfTablesManager {
|
||||
return &nfTablesManager{
|
||||
ExternalMark: externalMark,
|
||||
InternalMark: internalMark,
|
||||
MarkMask: markMask,
|
||||
|
||||
nfTable: nfTable,
|
||||
targetSet4: &nftables.Set{
|
||||
@ -50,6 +51,7 @@ func NewNfTablesManager(externalMark, internalMark uint32) NfTablesManager {
|
||||
type nfTablesManager struct {
|
||||
InternalMark uint32
|
||||
ExternalMark uint32
|
||||
MarkMask uint32
|
||||
|
||||
currentSet *netaddr.IPSet
|
||||
|
||||
@ -173,25 +175,40 @@ func (m *nfTablesManager) setNFTable(ips *netaddr.IPSet) error {
|
||||
return fmt.Errorf("failed to add IPv6 set: %w", err)
|
||||
}
|
||||
|
||||
// meta mark & 0x00000060 == 0x00000020 accept
|
||||
ruleExpr := []expr.Any{
|
||||
// Load the firewall mark into register 1
|
||||
&expr.Meta{
|
||||
Key: expr.MetaKeyMARK,
|
||||
Register: 1,
|
||||
},
|
||||
// Mask the mark with the configured mask:
|
||||
// R1 = R1 & mask
|
||||
&expr.Bitwise{
|
||||
SourceRegister: 1,
|
||||
DestRegister: 1,
|
||||
Len: 4,
|
||||
Xor: binaryutil.NativeEndian.PutUint32(0),
|
||||
Mask: binaryutil.NativeEndian.PutUint32(m.MarkMask),
|
||||
},
|
||||
// Compare the masked firewall mark with expected value
|
||||
&expr.Cmp{
|
||||
Op: expr.CmpOpEq,
|
||||
Register: 1,
|
||||
Data: binaryutil.NativeEndian.PutUint32(m.ExternalMark),
|
||||
},
|
||||
// Accept the packet to stop the ruleset processing
|
||||
&expr.Verdict{
|
||||
Kind: expr.VerdictAccept,
|
||||
},
|
||||
}
|
||||
|
||||
// match fwmark of Wireguard interface (not kubespan mark)
|
||||
// accept and return without modifying the table or mark
|
||||
c.AddRule(&nftables.Rule{
|
||||
Table: m.nfTable,
|
||||
Chain: preChain,
|
||||
Exprs: []expr.Any{
|
||||
&expr.Meta{
|
||||
Key: expr.MetaKeyMARK,
|
||||
Register: 1,
|
||||
},
|
||||
&expr.Cmp{
|
||||
Op: expr.CmpOpEq,
|
||||
Register: 1,
|
||||
Data: binaryutil.NativeEndian.PutUint32(m.ExternalMark),
|
||||
},
|
||||
&expr.Verdict{
|
||||
Kind: expr.VerdictAccept,
|
||||
},
|
||||
},
|
||||
Exprs: ruleExpr,
|
||||
})
|
||||
|
||||
// match fwmark of Wireguard interface (not kubespan mark)
|
||||
@ -199,44 +216,31 @@ func (m *nfTablesManager) setNFTable(ips *netaddr.IPSet) error {
|
||||
c.AddRule(&nftables.Rule{
|
||||
Table: m.nfTable,
|
||||
Chain: outChain,
|
||||
Exprs: []expr.Any{
|
||||
&expr.Meta{
|
||||
Key: expr.MetaKeyMARK,
|
||||
Register: 1,
|
||||
},
|
||||
&expr.Cmp{
|
||||
Op: expr.CmpOpEq,
|
||||
Register: 1,
|
||||
Data: binaryutil.NativeEndian.PutUint32(m.ExternalMark),
|
||||
},
|
||||
&expr.Verdict{
|
||||
Kind: expr.VerdictAccept,
|
||||
},
|
||||
},
|
||||
Exprs: ruleExpr,
|
||||
})
|
||||
|
||||
c.AddRule(&nftables.Rule{
|
||||
Table: m.nfTable,
|
||||
Chain: preChain,
|
||||
Exprs: matchIPv4Set(m.targetSet4, m.InternalMark),
|
||||
Exprs: matchIPv4Set(m.targetSet4, m.InternalMark, m.MarkMask),
|
||||
})
|
||||
|
||||
c.AddRule(&nftables.Rule{
|
||||
Table: m.nfTable,
|
||||
Chain: preChain,
|
||||
Exprs: matchIPv6Set(m.targetSet6, m.InternalMark),
|
||||
Exprs: matchIPv6Set(m.targetSet6, m.InternalMark, m.MarkMask),
|
||||
})
|
||||
|
||||
c.AddRule(&nftables.Rule{
|
||||
Table: m.nfTable,
|
||||
Chain: outChain,
|
||||
Exprs: matchIPv4Set(m.targetSet4, m.InternalMark),
|
||||
Exprs: matchIPv4Set(m.targetSet4, m.InternalMark, m.MarkMask),
|
||||
})
|
||||
|
||||
c.AddRule(&nftables.Rule{
|
||||
Table: m.nfTable,
|
||||
Chain: outChain,
|
||||
Exprs: matchIPv6Set(m.targetSet6, m.InternalMark),
|
||||
Exprs: matchIPv6Set(m.targetSet6, m.InternalMark, m.MarkMask),
|
||||
})
|
||||
|
||||
if err := c.Flush(); err != nil {
|
||||
@ -246,15 +250,15 @@ func (m *nfTablesManager) setNFTable(ips *netaddr.IPSet) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func matchIPv4Set(set *nftables.Set, mark uint32) []expr.Any {
|
||||
return matchIPSet(set, mark, nftables.TableFamilyIPv4)
|
||||
func matchIPv4Set(set *nftables.Set, mark, mask uint32) []expr.Any {
|
||||
return matchIPSet(set, mark, mask, nftables.TableFamilyIPv4)
|
||||
}
|
||||
|
||||
func matchIPv6Set(set *nftables.Set, mark uint32) []expr.Any {
|
||||
return matchIPSet(set, mark, nftables.TableFamilyIPv6)
|
||||
func matchIPv6Set(set *nftables.Set, mark, mask uint32) []expr.Any {
|
||||
return matchIPSet(set, mark, mask, nftables.TableFamilyIPv6)
|
||||
}
|
||||
|
||||
func matchIPSet(set *nftables.Set, mark uint32, family nftables.TableFamily) []expr.Any {
|
||||
func matchIPSet(set *nftables.Set, mark, mask uint32, family nftables.TableFamily) []expr.Any {
|
||||
var (
|
||||
offset uint32 = 16
|
||||
length uint32 = 4
|
||||
@ -265,6 +269,7 @@ func matchIPSet(set *nftables.Set, mark uint32, family nftables.TableFamily) []e
|
||||
length = 16
|
||||
}
|
||||
|
||||
// ip daddr @kubespan_targets_ipv4 meta mark set meta mark & 0xffffffdf | 0x00000040 accept
|
||||
return []expr.Any{
|
||||
// Store protocol type to register 1
|
||||
&expr.Meta{
|
||||
@ -290,17 +295,29 @@ func matchIPSet(set *nftables.Set, mark uint32, family nftables.TableFamily) []e
|
||||
SetName: set.Name,
|
||||
SetID: set.ID,
|
||||
},
|
||||
// Store Firewall Force mark to register 1
|
||||
&expr.Immediate{
|
||||
// Load the current packet mark into register 1
|
||||
&expr.Meta{
|
||||
Key: expr.MetaKeyMARK,
|
||||
Register: 1,
|
||||
Data: binaryutil.NativeEndian.PutUint32(mark),
|
||||
},
|
||||
// Set firewall mark
|
||||
// This bitwise is equivalent to: R1 = (R1 & ^mask) | mark, provided mark only uses bits covered by mask
|
||||
//
|
||||
// The NFTables backend bitwise operation is R3 = R2 & MASK ^ XOR,
|
||||
// so we need to do a bit of a trick to do what we need: R1 = R1 & ^mask ^ mark
|
||||
&expr.Bitwise{
|
||||
SourceRegister: 1,
|
||||
DestRegister: 1,
|
||||
Len: 4,
|
||||
Xor: binaryutil.NativeEndian.PutUint32(mark),
|
||||
Mask: binaryutil.NativeEndian.PutUint32(^mask),
|
||||
},
|
||||
// Set firewall mark to the value computed in register 1
|
||||
&expr.Meta{
|
||||
Key: expr.MetaKeyMARK,
|
||||
SourceRegister: true,
|
||||
Register: 1,
|
||||
},
|
||||
// Accept the packet to stop the ruleset processing
|
||||
&expr.Verdict{
|
||||
Kind: expr.VerdictAccept,
|
||||
},
|
||||
|
||||
@ -16,7 +16,7 @@ import (
|
||||
|
||||
func TestNfTables(t *testing.T) {
|
||||
// use a different mark to avoid conflicts with running kubespan
|
||||
mgr := kubespan.NewNfTablesManager(constants.KubeSpanDefaultFirewallMark+10, constants.KubeSpanDefaultForceFirewallMark+10)
|
||||
mgr := kubespan.NewNfTablesManager(constants.KubeSpanDefaultFirewallMark+10, constants.KubeSpanDefaultForceFirewallMark<<1, constants.KubeSpanDefaultFirewallMask<<1)
|
||||
|
||||
// cleanup should be fine if nothing is installed
|
||||
assert.NoError(t, mgr.Cleanup())
|
||||
|
||||
@ -23,16 +23,18 @@ type RulesManager interface {
|
||||
}
|
||||
|
||||
// NewRulesManager initializes new RulesManager.
|
||||
func NewRulesManager(targetTable, internalMark int) RulesManager {
|
||||
func NewRulesManager(targetTable, internalMark, markMask int) RulesManager {
|
||||
return &rulesManager{
|
||||
TargetTable: targetTable,
|
||||
InternalMark: internalMark,
|
||||
MarkMask: markMask,
|
||||
}
|
||||
}
|
||||
|
||||
type rulesManager struct {
|
||||
TargetTable int
|
||||
InternalMark int
|
||||
MarkMask int
|
||||
}
|
||||
|
||||
// Install routing rules.
|
||||
@ -49,7 +51,7 @@ func (m *rulesManager) Install() error {
|
||||
Family: unix.AF_INET,
|
||||
Table: m.TargetTable,
|
||||
Mark: m.InternalMark,
|
||||
Mask: -1,
|
||||
Mask: m.MarkMask,
|
||||
Goto: -1,
|
||||
Flow: -1,
|
||||
SuppressIfgroup: -1,
|
||||
@ -65,7 +67,7 @@ func (m *rulesManager) Install() error {
|
||||
Family: unix.AF_INET6,
|
||||
Table: m.TargetTable,
|
||||
Mark: m.InternalMark,
|
||||
Mask: -1,
|
||||
Mask: m.MarkMask,
|
||||
Goto: -1,
|
||||
Flow: -1,
|
||||
SuppressIfgroup: -1,
|
||||
|
||||
@ -14,7 +14,7 @@ import (
|
||||
|
||||
func TestRoutingRules(t *testing.T) {
|
||||
// use a different table/mark to avoid conflicts with running kubespan
|
||||
mgr := kubespan.NewRulesManager(constants.KubeSpanDefaultRoutingTable+10, constants.KubeSpanDefaultForceFirewallMark+10)
|
||||
mgr := kubespan.NewRulesManager(constants.KubeSpanDefaultRoutingTable+10, constants.KubeSpanDefaultForceFirewallMark<<1, constants.KubeSpanDefaultFirewallMask<<1)
|
||||
|
||||
// cleanup should be fine if nothing is installed
|
||||
assert.NoError(t, mgr.Cleanup())
|
||||
|
||||
@ -619,12 +619,17 @@ const (
|
||||
// KubeSpanDefaultFirewallMark is the default firewall mark to use for Wireguard encrypted egress packets.
|
||||
//
|
||||
// Normal Wireguard configurations will NOT use this firewall mark.
|
||||
KubeSpanDefaultFirewallMark = 0x51820
|
||||
KubeSpanDefaultFirewallMark = 0x20
|
||||
|
||||
// KubeSpanDefaultForceFirewallMark is the default firewall mark to use for packets destined to IPs serviced by KubeSpan.
|
||||
//
|
||||
// It is used to signal that matching packets should be forced into the Wireguard interface.
|
||||
KubeSpanDefaultForceFirewallMark = 0x51821
|
||||
KubeSpanDefaultForceFirewallMark = 0x40
|
||||
|
||||
// KubeSpanDefaultFirewallMask is the mask applied to the packet mark when matching and setting the mark.
|
||||
//
|
||||
// This mask signals the bits of the firewall mark used by KubeSpan.
|
||||
KubeSpanDefaultFirewallMask = KubeSpanDefaultFirewallMark | KubeSpanDefaultForceFirewallMark
|
||||
|
||||
// KubeSpanDefaultPeerKeepalive is the interval at which Wireguard Peer Keepalives should be sent.
|
||||
KubeSpanDefaultPeerKeepalive = 25 * time.Second
|
||||
|
||||
@ -98,3 +98,113 @@ So in summary, we:
|
||||
- send anything which is sent to that routing table through the WireGuard interface
|
||||
|
||||
This gives us an isolated, resilient, tolerant, and non-invasive way to route Kubernetes traffic safely, automatically, and transparently through WireGuard across almost any set of network topologies.
|
||||
|
||||
## Design Decisions
|
||||
|
||||
### Routing
|
||||
|
||||
Routing for Wireguard is a touch complicated when the set of possible peer
|
||||
endpoints includes at least one member of the set of _destinations_.
|
||||
That is, packets from Wireguard to a peer endpoint should not be sent to
|
||||
Wireguard, lest a loop be created.
|
||||
|
||||
In order to handle this situation, Wireguard provides the ability to mark
|
||||
packets which it generates, so their routing can be handled separately.
|
||||
|
||||
In our case, though, we actually want the inverse of this: we want to route
|
||||
Wireguard packets however the normal networking routes and rules say they should
|
||||
be routed, while packets destined for the other side of Wireguard Peers should
|
||||
be forced into Wireguard interfaces.
|
||||
|
||||
While IP Rules allow you to invert matches, they do not support matching based
|
||||
on IP sets.
|
||||
That means, to use simple rules, we would have to add a rule for
|
||||
each destination, which could reach into hundreds or thousands of rules to
|
||||
manage.
|
||||
This is not really much of a performance issue, but it is a management
|
||||
issue, since it is expected that we would not be the only manager of rules in
|
||||
the system, and rules offer no facility to tag for ownership.
|
||||
|
||||
IP Sets are supported by IPTables, and we could integrate there.
|
||||
However, IPTables exists in a global namespace, which makes it fragile having
|
||||
multiple parties manipulating it.
|
||||
The newer NFTables replacement for IPTables, though, allows users to
|
||||
independently hook into various points of XTables, keeping all such rules and
|
||||
sets independent.
|
||||
This means that regardless of what CNIs or other user-side routing rules may do,
|
||||
our KubeSpan setup will not be messed up.
|
||||
|
||||
Therefore, we utilise NFTables (which natively supports IP sets and owner
|
||||
grouping) instead, to mark matching traffic which should be sent to the
|
||||
Wireguard interface.
|
||||
This way, we can keep all our KubeSpan set logic in one place, allowing us to
|
||||
simply use a single `ip rule` match
|
||||
for our fwmark, and sending those matched packets to a separate routing table
|
||||
with one rule: default to the wireguard interface.
|
||||
|
||||
So we have three components:
|
||||
|
||||
1. A routing table for Wireguard-destined packets
|
||||
2. An NFTables table which defines the set of destinations; packets sent to those destinations will
|
||||
be marked with our firewall mark.
|
||||
- Hook into PreRouting (type Filter)
|
||||
- Hook into Outgoing (type Route)
|
||||
3. One IP Rule which sends packets marked with our firewall mark to our Wireguard
|
||||
routing table.
|
||||
|
||||
### Routing Table
|
||||
|
||||
The routing table (number 180 by default) is simple, containing a single route for each family: send everything through the Wireguard interface.
|
||||
|
||||
### NFTables
|
||||
|
||||
The logic inside NFTables is fairly simple.
|
||||
First, everything is compiled into a single table: `talos_kubespan`.
|
||||
|
||||
Next, two chains are set up: one for the `prerouting` hook (`kubespan_prerouting`)
|
||||
and the other for the `output` hook (`kubespan_outgoing`).
|
||||
|
||||
We define two sets of target IP prefixes: one for IPv6 (`kubespan_targets_ipv6`)
|
||||
and the other for IPv4 (`kubespan_targets_ipv4`).
|
||||
|
||||
Last, we add rules to each chain which basically specify:
|
||||
|
||||
1. If the packet is marked as _from_ Wireguard, just accept it and terminate
|
||||
the chain.
|
||||
2. If the packet matches an IP in either of the target IP sets, mark that
|
||||
packet with the _to_ Wireguard mark.
|
||||
|
||||
### Rules
|
||||
|
||||
There are two route rules defined: one to match IPv6 packets and the other to
|
||||
match IPv4 packets.
|
||||
|
||||
These rules say the same thing for each: if the packet is marked that it should
|
||||
go _to_ Wireguard, send it to the Wireguard
|
||||
routing table.
|
||||
|
||||
### Firewall Mark
|
||||
|
||||
KubeSpan is using only two bits of the firewall mark with the mask `0x00000060`.
|
||||
|
||||
> Note: if other software on the node is using the bits `0x60` of the firewall mark, this
|
||||
> might cause conflicts and break KubeSpan.
|
||||
>
|
||||
> At the time of writing, it was confirmed that Calico CNI is using bits `0xffff0000` and
|
||||
> Cilium CNI is using bits `0xf00`, so KubeSpan is compatible with both.
|
||||
> Flannel CNI doesn't use firewall mark at all.
|
||||
|
||||
In the routing rules table, we match on the mark `0x40` with the mask `0x60`:
|
||||
|
||||
```text
|
||||
32500: from all fwmark 0x40/0x60 lookup 180
|
||||
```
|
||||
|
||||
In the NFTables table, we match with the same mask `0x60` and we set the mark by only modifying
|
||||
bits from the `0x60` mask:
|
||||
|
||||
```text
|
||||
meta mark & 0x00000060 == 0x00000020 accept
|
||||
ip daddr @kubespan_targets_ipv4 meta mark set meta mark & 0xffffffdf | 0x00000040 accept
|
||||
ip6 daddr @kubespan_targets_ipv6 meta mark set meta mark & 0xffffffdf | 0x00000040 accept
|
||||
```
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user