From be7cce74ba97f2f31820168aed0ea4c8e73e3367 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Claus=20Lensb=C3=B8l?= Date: Wed, 29 Apr 2026 13:23:04 -0400 Subject: [PATCH] wgengine/userspace: do not fall back to old key on tsmpLearned mismatch (#19575) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mismatch behaviour of falling back to a previous key could end up breaking connections when the netmap update took longer than the 2 seconds allowed in controlClient.auto for netmap updates, or if the controlClient context was canceled. This could end up breaking legitimate updates to the netmap for disco keys coming from control. Instead, log the event, and let the connection be reset to that of the key as that is safer. Issue found by @bradfitz. Updates #19574 Signed-off-by: Claus Lensbøl --- wgengine/userspace.go | 35 +++++++++++++++-------------------- wgengine/userspace_test.go | 5 +++-- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/wgengine/userspace.go b/wgengine/userspace.go index 76261d4d4..c2ac0c7dc 100644 --- a/wgengine/userspace.go +++ b/wgengine/userspace.go @@ -1176,31 +1176,26 @@ func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config, // wireguard config as the new key was received over an existing wireguard // connection. if discoTSMP, okTSMP := e.tsmpLearnedDisco[p.PublicKey]; okTSMP { + // Key matches, remove entry from map. + delete(e.tsmpLearnedDisco, p.PublicKey) if discoTSMP == p.DiscoKey { - // Key matches, remove entry from map. e.logf("wgengine: Skipping reconfig (TSMP key): %s changed from %q to %q", pub.ShortString(), old, p.DiscoKey) - delete(e.tsmpLearnedDisco, p.PublicKey) - } else { - // The new disco key does not match what we received via - // TSMP for this peer. This is unexpected, so log it. - // If it does happen, overwrite the previously-saved - // disco key with the new one for now: We expect another - // update must be pending in that case, so keep the map - // entry. - // The reason why this should never happen is that only a single - // request is coming through the netmap pipeline at a time, and there - // should realistically ever only be a single entry in the map. This - // is really a belt and suspenders solution to find usage that is - // inconsistent with our expectations. - e.logf("wgengine: [unexpected] Reconfig: using TSMP key for %s (control stale): tsmp=%q control=%q old=%q", - pub.ShortString(), discoTSMP, p.DiscoKey, old) - metricTSMPLearnedKeyMismatch.Add(1) - p.DiscoKey = discoTSMP + // Skip session clear. + continue } - // Skip session clear no matter what. - continue + // The new disco key does not match what we received via + // TSMP for this peer. This is unexpected, though possible + // if processing a change in a large netmap ends up taking + // longer than the 2 second timeout in + // [controlClient.mapRoutineState.UpdateNetmapDelta], or if + // the context is cancelled mid update. Log the event, and reset + // the connection as it is possibly a stale entry in the map + // instead of a TSMP disco key update that led us here. + e.logf("wgengine: [unexpected] Reconfig: using TSMP key for %s (control stale): tsmp=%q control=%q old=%q", + pub.ShortString(), discoTSMP, p.DiscoKey, old) + metricTSMPLearnedKeyMismatch.Add(1) } discoChanged[pub] = true diff --git a/wgengine/userspace_test.go b/wgengine/userspace_test.go index d5364a029..c874a778a 100644 --- a/wgengine/userspace_test.go +++ b/wgengine/userspace_test.go @@ -264,8 +264,9 @@ func TestUserspaceEngineTSMPLearnedMismatch(t *testing.T) { wrongKey bool }{ {tsmp: false, inMap: false, wrongKey: false}, - {tsmp: true, inMap: false, wrongKey: true}, - {tsmp: false, inMap: false, wrongKey: false}, + {tsmp: true, inMap: false, wrongKey: false}, + {tsmp: true, inMap: true, wrongKey: true}, + {tsmp: false, inMap: true, wrongKey: false}, } nkHex := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"