wgengine/router: rely on events for deleted IP rules (#16744)

Adds the eventbus to the router subsystem.

The event is currently only used on linux.

Also includes facilities to inject events into the bus.

Updates #15160

Signed-off-by: Claus Lensbøl <claus@tailscale.com>
This commit is contained in:
Claus Lensbøl 2025-08-05 08:31:51 -04:00 committed by GitHub
parent b0018f1e7d
commit 5bb42e3018
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 132 additions and 82 deletions

View File

@ -800,7 +800,7 @@ func tryEngine(logf logger.Logf, sys *tsd.System, name string) (onlyNetstack boo
sys.NetMon.Get().SetTailscaleInterfaceName(devName)
}
r, err := router.New(logf, dev, sys.NetMon.Get(), sys.HealthTracker())
r, err := router.New(logf, dev, sys.NetMon.Get(), sys.HealthTracker(), sys.Bus.Get())
if err != nil {
dev.Close()
return false, fmt.Errorf("creating router: %w", err)

View File

@ -66,7 +66,6 @@ type Monitor struct {
mu sync.Mutex // guards all following fields
cbs set.HandleSet[ChangeFunc]
ruleDelCB set.HandleSet[RuleDeleteCallback]
ifState *State
gwValid bool // whether gw and gwSelfIP are valid
gw netip.Addr // our gateway's IP
@ -224,29 +223,6 @@ func (m *Monitor) RegisterChangeCallback(callback ChangeFunc) (unregister func()
}
}
// RuleDeleteCallback is the signature of a callback that is invoked when a
// Linux IP policy routing rule is deleted.
//
// table is the routing table number of the deleted rule. It is a uint8, so it
// is always in the range 0-255 (for example 52, 253 or 254).
// priority is the priority order number of the deleted rule (for Tailscale
// rules currently: 5210, 5230, 5250, 5270).
type RuleDeleteCallback func(table uint8, priority uint32)
// RegisterRuleDeleteCallback registers callback so that it is invoked (in its
// own goroutine) whenever a Linux ip rule is deleted.
//
// The returned unregister func removes the callback again; closing the monitor
// is the other way to stop receiving notifications. When m.static is set,
// nothing is registered and the returned func is a no-op.
func (m *Monitor) RegisterRuleDeleteCallback(callback RuleDeleteCallback) (unregister func()) {
	unregister = func() {}
	if !m.static {
		m.mu.Lock()
		defer m.mu.Unlock()

		h := m.ruleDelCB.Add(callback)
		unregister = func() {
			// Runs later, from the caller's goroutine, so it takes the
			// lock itself.
			m.mu.Lock()
			defer m.mu.Unlock()
			delete(m.ruleDelCB, h)
		}
	}
	return unregister
}
// Start starts the monitor.
// A monitor can only be started & closed once.
func (m *Monitor) Start() {
@ -359,10 +335,6 @@ func (m *Monitor) pump() {
time.Sleep(time.Second)
continue
}
if rdm, ok := msg.(ipRuleDeletedMessage); ok {
m.notifyRuleDeleted(rdm)
continue
}
if msg.ignore() {
continue
}
@ -370,14 +342,6 @@ func (m *Monitor) pump() {
}
}
// notifyRuleDeleted fans a rule-deletion message out to every callback
// currently registered in m.ruleDelCB. Each callback is started in its own
// goroutine, so this method does not wait for any of them to finish.
func (m *Monitor) notifyRuleDeleted(rdm ipRuleDeletedMessage) {
	table, prio := rdm.table, rdm.priority

	m.mu.Lock()
	defer m.mu.Unlock()
	for _, notify := range m.ruleDelCB {
		go notify(table, prio)
	}
}
// isInterestingInterface reports whether the provided interface should be
// considered when checking for network state changes.
// The ips parameter should be the IPs of the provided interface.
@ -624,10 +588,3 @@ func (m *Monitor) checkWallTimeAdvanceLocked() bool {
// resetTimeJumpedLocked clears the m.timeJumped flag.
//
// NOTE(review): the "Locked" suffix suggests the caller must already hold
// m.mu; confirm against the callers.
func (m *Monitor) resetTimeJumpedLocked() {
m.timeJumped = false
}
// ipRuleDeletedMessage is the message value that carries the table and
// priority of a deleted IP rule.
type ipRuleDeletedMessage struct {
// table is the routing table number of the deleted rule.
table uint8
// priority is the priority of the deleted rule.
priority uint32
}
// ignore always reports true for an ipRuleDeletedMessage.
func (ipRuleDeletedMessage) ignore() bool { return true }

View File

@ -241,18 +241,15 @@ func (c *nlConn) Receive() (message, error) {
// On `ip -4 rule del pref 5210 table main`, logs:
// monitor: ip rule deleted: {Family:2 DstLength:0 SrcLength:0 Tos:0 Table:254 Protocol:0 Scope:0 Type:1 Flags:0 Attributes:{Dst:<nil> Src:<nil> Gateway:<nil> OutIface:0 Priority:5210 Table:254 Mark:4294967295 Expires:<nil> Metrics:<nil> Multipath:[]}}
}
c.rulesDeleted.Publish(RuleDeleted{
rd := RuleDeleted{
Table: rmsg.Table,
Priority: rmsg.Attributes.Priority,
})
rdm := ipRuleDeletedMessage{
table: rmsg.Table,
priority: rmsg.Attributes.Priority,
}
c.rulesDeleted.Publish(rd)
if debugNetlinkMessages() {
c.logf("%+v", rdm)
c.logf("%+v", rd)
}
return rdm, nil
return ignoreMessage{}, nil
case unix.RTM_NEWLINK, unix.RTM_DELLINK:
// This is an unhandled message, but don't print an error.
// See https://github.com/tailscale/tailscale/issues/6806

View File

@ -119,7 +119,7 @@ func Subscribe[T any](c *Client) *Subscriber[T] {
return s
}
// Publisher returns a publisher for event type T using the given
// Publish returns a publisher for event type T using the given
// client.
func Publish[T any](c *Client) *Publisher[T] {
p := newPublisher[T](c)

View File

@ -21,7 +21,7 @@ func NewBus(t *testing.T) *eventbus.Bus {
return bus
}
// NewTestWatcher constructs a [Watcher] that can be used to check the stream of
// NewWatcher constructs a [Watcher] that can be used to check the stream of
// events generated by code under test. After construction the caller may use
// [Expect] and [ExpectExactly] to verify that the desired events were captured.
func NewWatcher(t *testing.T, bus *eventbus.Bus) *Watcher {
@ -201,3 +201,39 @@ func eventFilter(f any) filter {
return fixup(fv.Call([]reflect.Value{args[0].Elem()}))
}).Interface().(filter)
}
// Injector holds an [eventbus.Client] together with the [eventbus.Publisher]
// values created from it, for testing purposes. Publishers are kept per event
// type so that they can be looked up again by that type.
type Injector struct {
// client is the bus client that the publishers are created from.
client *eventbus.Client
// publishers is keyed by event type. The value for a key is an
// *eventbus.Publisher[T] for the corresponding type T; it is stored as any
// because the publisher type differs per event type.
publishers map[reflect.Type]any
}
// NewInjector returns an [Injector] for injecting events into the stream of
// events seen by the code under test. Use [Inject] on the result to put
// events onto the bus.
//
// The injector's bus client is named after the running test and is closed via
// t.Cleanup.
func NewInjector(t *testing.T, b *eventbus.Bus) *Injector {
	client := b.Client(t.Name())
	t.Cleanup(client.Close)
	return &Injector{
		client:     client,
		publishers: map[reflect.Type]any{},
	}
}
// Inject publishes event, of type T, onto the [eventbus.Bus] that inj was
// created for. The [eventbus.Publisher] for T is created lazily on first use
// and cached in inj for later calls.
//
// Inject calls Publish directly, so it only returns once Publish has
// returned. The publisher cache is read and written without any locking.
func Inject[T any](inj *Injector, event T) {
	key := reflect.TypeFor[T]()

	var pub *eventbus.Publisher[T]
	if cached, found := inj.publishers[key]; found {
		pub = cached.(*eventbus.Publisher[T])
	} else {
		pub = eventbus.Publish[T](inj.client)
		inj.publishers[key] = pub
	}
	pub.Publish(event)
}

View File

@ -14,6 +14,7 @@
"tailscale.com/net/netmon"
"tailscale.com/types/logger"
"tailscale.com/types/preftype"
"tailscale.com/util/eventbus"
)
// Router is responsible for managing the system network stack.
@ -45,9 +46,11 @@ type Router interface {
//
// If netMon is nil, it's not used. It's currently (2021-07-20) only
// used on Linux in some situations.
func New(logf logger.Logf, tundev tun.Device, netMon *netmon.Monitor, health *health.Tracker) (Router, error) {
func New(logf logger.Logf, tundev tun.Device, netMon *netmon.Monitor,
health *health.Tracker, bus *eventbus.Bus,
) (Router, error) {
logf = logger.WithPrefix(logf, "router: ")
return newUserspaceRouter(logf, tundev, netMon, health)
return newUserspaceRouter(logf, tundev, netMon, health, bus)
}
// CleanUp restores the system network configuration to its original state

View File

@ -10,9 +10,10 @@
"tailscale.com/health"
"tailscale.com/net/netmon"
"tailscale.com/types/logger"
"tailscale.com/util/eventbus"
)
func newUserspaceRouter(logf logger.Logf, tunDev tun.Device, netMon *netmon.Monitor, health *health.Tracker) (Router, error) {
func newUserspaceRouter(logf logger.Logf, tunDev tun.Device, netMon *netmon.Monitor, health *health.Tracker, _ *eventbus.Bus) (Router, error) {
// Note, this codepath is _not_ used when building the android app
// from github.com/tailscale/tailscale-android. The android app
// constructs its own wgengine with a custom router implementation

View File

@ -8,9 +8,10 @@
"tailscale.com/health"
"tailscale.com/net/netmon"
"tailscale.com/types/logger"
"tailscale.com/util/eventbus"
)
func newUserspaceRouter(logf logger.Logf, tundev tun.Device, netMon *netmon.Monitor, health *health.Tracker) (Router, error) {
func newUserspaceRouter(logf logger.Logf, tundev tun.Device, netMon *netmon.Monitor, health *health.Tracker, bus *eventbus.Bus) (Router, error) {
return newUserspaceBSDRouter(logf, tundev, netMon, health)
}

View File

@ -13,9 +13,10 @@
"tailscale.com/health"
"tailscale.com/net/netmon"
"tailscale.com/types/logger"
"tailscale.com/util/eventbus"
)
func newUserspaceRouter(logf logger.Logf, tunDev tun.Device, netMon *netmon.Monitor, health *health.Tracker) (Router, error) {
func newUserspaceRouter(logf logger.Logf, tunDev tun.Device, netMon *netmon.Monitor, health *health.Tracker, _ *eventbus.Bus) (Router, error) {
return nil, fmt.Errorf("unsupported OS %q", runtime.GOOS)
}

View File

@ -8,6 +8,7 @@
"tailscale.com/health"
"tailscale.com/net/netmon"
"tailscale.com/types/logger"
"tailscale.com/util/eventbus"
)
// For now this router only supports the userspace WireGuard implementations.
@ -15,7 +16,7 @@
// Work is currently underway for an in-kernel FreeBSD implementation of wireguard
// https://svnweb.freebsd.org/base?view=revision&revision=357986
func newUserspaceRouter(logf logger.Logf, tundev tun.Device, netMon *netmon.Monitor, health *health.Tracker) (Router, error) {
func newUserspaceRouter(logf logger.Logf, tundev tun.Device, netMon *netmon.Monitor, health *health.Tracker, bus *eventbus.Bus) (Router, error) {
return newUserspaceBSDRouter(logf, tundev, netMon, health)
}

View File

@ -29,6 +29,7 @@
"tailscale.com/types/logger"
"tailscale.com/types/opt"
"tailscale.com/types/preftype"
"tailscale.com/util/eventbus"
"tailscale.com/util/linuxfw"
"tailscale.com/util/multierr"
"tailscale.com/version/distro"
@ -48,6 +49,9 @@ type linuxRouter struct {
tunname string
netMon *netmon.Monitor
health *health.Tracker
eventClient *eventbus.Client
ruleDeletedSub *eventbus.Subscriber[netmon.RuleDeleted]
rulesAddedPub *eventbus.Publisher[AddIPRules]
unregNetMon func()
addrs map[netip.Prefix]bool
routes map[netip.Prefix]bool
@ -77,7 +81,7 @@ type linuxRouter struct {
magicsockPortV6 uint16
}
func newUserspaceRouter(logf logger.Logf, tunDev tun.Device, netMon *netmon.Monitor, health *health.Tracker) (Router, error) {
func newUserspaceRouter(logf logger.Logf, tunDev tun.Device, netMon *netmon.Monitor, health *health.Tracker, bus *eventbus.Bus) (Router, error) {
tunname, err := tunDev.Name()
if err != nil {
return nil, err
@ -87,15 +91,16 @@ func newUserspaceRouter(logf logger.Logf, tunDev tun.Device, netMon *netmon.Moni
ambientCapNetAdmin: useAmbientCaps(),
}
return newUserspaceRouterAdvanced(logf, tunname, netMon, cmd, health)
return newUserspaceRouterAdvanced(logf, tunname, netMon, cmd, health, bus)
}
func newUserspaceRouterAdvanced(logf logger.Logf, tunname string, netMon *netmon.Monitor, cmd commandRunner, health *health.Tracker) (Router, error) {
func newUserspaceRouterAdvanced(logf logger.Logf, tunname string, netMon *netmon.Monitor, cmd commandRunner, health *health.Tracker, bus *eventbus.Bus) (Router, error) {
r := &linuxRouter{
logf: logf,
tunname: tunname,
netfilterMode: netfilterOff,
netMon: netMon,
eventClient: bus.Client("router-linux"),
health: health,
cmd: cmd,
@ -103,6 +108,10 @@ func newUserspaceRouterAdvanced(logf logger.Logf, tunname string, netMon *netmon
ipRuleFixLimiter: rate.NewLimiter(rate.Every(5*time.Second), 10),
ipPolicyPrefBase: 5200,
}
r.ruleDeletedSub = eventbus.Subscribe[netmon.RuleDeleted](r.eventClient)
r.rulesAddedPub = eventbus.Publish[AddIPRules](r.eventClient)
go r.consumeEventbusTopics()
if r.useIPCommand() {
r.ipRuleAvailable = (cmd.run("ip", "rule") == nil)
} else {
@ -145,6 +154,24 @@ func newUserspaceRouterAdvanced(logf logger.Logf, tunname string, netMon *netmon
return r, nil
}
// consumeEventbusTopics consumes [netmon.RuleDeleted] events from
// r.ruleDeletedSub and passes each one to onIPRuleDeleted. Events are always
// handled in the order they are received, i.e. the next event is not read
// until the previous event's handler has returned.
//
// It returns when the ruleDeletedSub subscriber is done, which is interpreted
// to be the same as r.eventClient closing. It is meant to run in its own
// goroutine for the lifetime of the router.
func (r *linuxRouter) consumeEventbusTopics() {
	for {
		select {
		case <-r.ruleDeletedSub.Done():
			return
		case rulesDeleted := <-r.ruleDeletedSub.Events():
			r.onIPRuleDeleted(rulesDeleted.Table, rulesDeleted.Priority)
		}
	}
}
// ipCmdSupportsFwmask returns true if the system 'ip' binary supports using a
// fwmark stanza with a mask specified. To our knowledge, everything except busybox
// pre-1.33 supports this.
@ -276,6 +303,10 @@ func (r *linuxRouter) fwmaskWorks() bool {
return v
}
// AddIPRules is an event signal related to re-adding Tailscale's IP rules
// after a rule deletion. It carries no data.
//
// It is added to aid testing, but could be extended if there's a reason for it.
//
// NOTE(review): in onIPRuleDeleted the event appears to be published just
// before the delayed restore is scheduled, i.e. it may signal "restore
// scheduled" rather than "rules already added"; confirm the intended meaning.
type AddIPRules struct{}
// onIPRuleDeleted handles the deletion of an IP policy rule, as reported by
// the network monitor. See Issue 1591.
//
@ -303,6 +334,9 @@ func (r *linuxRouter) onIPRuleDeleted(table uint8, priority uint32) {
r.ruleRestorePending.Swap(false)
return
}
r.rulesAddedPub.Publish(AddIPRules{})
time.AfterFunc(rr.Delay()+250*time.Millisecond, func() {
if r.ruleRestorePending.Swap(false) && !r.closed.Load() {
r.logf("somebody (likely systemd-networkd) deleted ip rules; restoring Tailscale's")
@ -312,9 +346,6 @@ func (r *linuxRouter) onIPRuleDeleted(table uint8, priority uint32) {
}
func (r *linuxRouter) Up() error {
if r.unregNetMon == nil && r.netMon != nil {
r.unregNetMon = r.netMon.RegisterRuleDeleteCallback(r.onIPRuleDeleted)
}
if err := r.setNetfilterMode(netfilterOff); err != nil {
return fmt.Errorf("setting netfilter mode: %w", err)
}
@ -333,6 +364,7 @@ func (r *linuxRouter) Close() error {
if r.unregNetMon != nil {
r.unregNetMon()
}
r.eventClient.Close()
if err := r.downInterface(); err != nil {
return err
}
@ -1276,7 +1308,6 @@ func (r *linuxRouter) justAddIPRules() error {
}
var errAcc error
for _, family := range r.addrFamilies() {
for _, ru := range ipRules() {
// Note: ru is a value type here; safe to mutate it.
ru.Family = family.netlinkInt()

View File

@ -28,6 +28,7 @@
"tailscale.com/tstest"
"tailscale.com/types/logger"
"tailscale.com/util/eventbus"
"tailscale.com/util/eventbus/eventbustest"
"tailscale.com/util/linuxfw"
"tailscale.com/version/distro"
)
@ -375,7 +376,7 @@ func TestRouterStates(t *testing.T) {
fake := NewFakeOS(t)
ht := new(health.Tracker)
router, err := newUserspaceRouterAdvanced(t.Logf, "tailscale0", mon, fake, ht)
router, err := newUserspaceRouterAdvanced(t.Logf, "tailscale0", mon, fake, ht, bus)
router.(*linuxRouter).nfr = fake.nfr
if err != nil {
t.Fatalf("failed to create router: %v", err)
@ -414,7 +415,7 @@ type fakeIPTablesRunner struct {
t *testing.T
ipt4 map[string][]string
ipt6 map[string][]string
//we always assume ipv6 and ipv6 nat are enabled when testing
// we always assume ipv6 and ipv6 nat are enabled when testing
}
func newIPTablesRunner(t *testing.T) linuxfw.NetfilterRunner {
@ -541,6 +542,7 @@ func (n *fakeIPTablesRunner) EnsureSNATForDst(src, dst netip.Addr) error {
// DNATNonTailscaleTraffic is a stub on the fake runner: it ignores its
// arguments and always returns a "not implemented" error.
func (n *fakeIPTablesRunner) DNATNonTailscaleTraffic(exemptInterface string, dst netip.Addr) error {
return errors.New("not implemented")
}
// EnsurePortMapRuleForSvc is a stub on the fake runner: it ignores its
// arguments and always returns a "not implemented" error.
func (n *fakeIPTablesRunner) EnsurePortMapRuleForSvc(svc, tun string, targetIP netip.Addr, pm linuxfw.PortMap) error {
return errors.New("not implemented")
}
@ -781,8 +783,8 @@ type fakeOS struct {
ips []string
routes []string
rules []string
//This test tests on the router level, so we will not bother
//with using iptables or nftables, chose the simpler one.
// This test tests on the router level, so we will not bother
// with using iptables or nftables, chose the simpler one.
nfr linuxfw.NetfilterRunner
}
@ -974,7 +976,7 @@ func (lt *linuxTest) Close() error {
return nil
}
func newLinuxRootTest(t *testing.T) *linuxTest {
func newLinuxRootTest(t *testing.T) (*linuxTest, *eventbus.Bus) {
if os.Getuid() != 0 {
t.Skip("test requires root")
}
@ -984,8 +986,7 @@ func newLinuxRootTest(t *testing.T) *linuxTest {
logf := lt.logOutput.Logf
bus := eventbus.New()
defer bus.Close()
bus := eventbustest.NewBus(t)
mon, err := netmon.New(bus, logger.Discard)
if err != nil {
@ -995,7 +996,7 @@ func newLinuxRootTest(t *testing.T) *linuxTest {
mon.Start()
lt.mon = mon
r, err := newUserspaceRouter(logf, lt.tun, mon, nil)
r, err := newUserspaceRouter(logf, lt.tun, mon, nil, bus)
if err != nil {
lt.Close()
t.Fatal(err)
@ -1006,11 +1007,31 @@ func newLinuxRootTest(t *testing.T) *linuxTest {
t.Fatal(err)
}
lt.r = lr
return lt
return lt, bus
}
// TestRuleDeletedEvent verifies that a [netmon.RuleDeleted] event injected on
// the bus reaches the router and causes an [AddIPRules] event to be
// published. It uses newLinuxRootTest, which skips the test unless it runs as
// root.
func TestRuleDeletedEvent(t *testing.T) {
	fake := NewFakeOS(t)
	lt, bus := newLinuxRootTest(t)
	defer lt.Close()
	lt.r.nfr = fake.nfr

	event := netmon.RuleDeleted{
		Table:    52,
		Priority: 5210,
	}

	// Attach the watcher before injecting, so that the event published in
	// response to the injected one is observed.
	tw := eventbustest.NewWatcher(t, bus)

	t.Logf("Value before: %t", lt.r.ruleRestorePending.Load())
	if lt.r.ruleRestorePending.Load() {
		t.Errorf("rule deletion already ongoing")
	}

	injector := eventbustest.NewInjector(t, bus)
	eventbustest.Inject(injector, event)

	// Expect reports a missing event through its return value; without this
	// check the test could never fail on a missing AddIPRules event.
	if err := eventbustest.Expect(tw, eventbustest.Type[AddIPRules]()); err != nil {
		t.Errorf("expected AddIPRules event after rule deletion: %v", err)
	}
}
func TestDelRouteIdempotent(t *testing.T) {
lt := newLinuxRootTest(t)
lt, _ := newLinuxRootTest(t)
defer lt.Close()
for _, s := range []string{
@ -1036,7 +1057,7 @@ func TestDelRouteIdempotent(t *testing.T) {
}
func TestAddRemoveRules(t *testing.T) {
lt := newLinuxRootTest(t)
lt, _ := newLinuxRootTest(t)
defer lt.Close()
r := lt.r
@ -1054,14 +1075,12 @@ func TestAddRemoveRules(t *testing.T) {
t.Logf("Rule: %+v", r)
}
}
}
step("init_del_and_add", r.addIPRules)
step("dup_add", r.justAddIPRules)
step("del", r.delIPRules)
step("dup_del", r.delIPRules)
}
func TestDebugListLinks(t *testing.T) {

View File

@ -15,6 +15,7 @@
"tailscale.com/health"
"tailscale.com/net/netmon"
"tailscale.com/types/logger"
"tailscale.com/util/eventbus"
"tailscale.com/util/set"
)
@ -31,7 +32,7 @@ type openbsdRouter struct {
routes set.Set[netip.Prefix]
}
func newUserspaceRouter(logf logger.Logf, tundev tun.Device, netMon *netmon.Monitor, health *health.Tracker) (Router, error) {
func newUserspaceRouter(logf logger.Logf, tundev tun.Device, netMon *netmon.Monitor, health *health.Tracker, bus *eventbus.Bus) (Router, error) {
tunname, err := tundev.Name()
if err != nil {
return nil, err

View File

@ -15,9 +15,10 @@
"tailscale.com/health"
"tailscale.com/net/netmon"
"tailscale.com/types/logger"
"tailscale.com/util/eventbus"
)
func newUserspaceRouter(logf logger.Logf, tundev tun.Device, netMon *netmon.Monitor, health *health.Tracker) (Router, error) {
func newUserspaceRouter(logf logger.Logf, tundev tun.Device, netMon *netmon.Monitor, health *health.Tracker, bus *eventbus.Bus) (Router, error) {
r := &plan9Router{
logf: logf,
tundev: tundev,

View File

@ -27,6 +27,7 @@
"tailscale.com/net/dns"
"tailscale.com/net/netmon"
"tailscale.com/types/logger"
"tailscale.com/util/eventbus"
)
type winRouter struct {
@ -38,7 +39,7 @@ type winRouter struct {
firewall *firewallTweaker
}
func newUserspaceRouter(logf logger.Logf, tundev tun.Device, netMon *netmon.Monitor, health *health.Tracker) (Router, error) {
func newUserspaceRouter(logf logger.Logf, tundev tun.Device, netMon *netmon.Monitor, health *health.Tracker, bus *eventbus.Bus) (Router, error) {
nativeTun := tundev.(*tun.NativeTun)
luid := winipcfg.LUID(nativeTun.LUID())
guid, err := luid.GUID()