wgengine/userspace: do not fall back to old key on tsmpLearned mismatch (#19575)

The mismatch behaviour of falling back to a previous key could end up
breaking connections when the netmap update took longer than the 2
seconds allowed in controlClient.auto for netmap updates, or if the
controlClient context was canceled. This could end up breaking
legitimate updates to the netmap for disco keys coming from control.

Instead, log the event, and let the connection be reset to that of the
key as that is safer.

Issue found by @bradfitz.

Updates #19574

Signed-off-by: Claus Lensbøl <claus@tailscale.com>
This commit is contained in:
Claus Lensbøl 2026-04-29 13:23:04 -04:00 committed by GitHub
parent fd6ae2fad4
commit be7cce74ba
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 18 additions and 22 deletions

View File

@ -1176,31 +1176,26 @@ func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config,
// wireguard config as the new key was received over an existing wireguard
// connection.
if discoTSMP, okTSMP := e.tsmpLearnedDisco[p.PublicKey]; okTSMP {
// Key matches, remove entry from map.
delete(e.tsmpLearnedDisco, p.PublicKey)
if discoTSMP == p.DiscoKey {
// Key matches, remove entry from map.
e.logf("wgengine: Skipping reconfig (TSMP key): %s changed from %q to %q",
pub.ShortString(), old, p.DiscoKey)
delete(e.tsmpLearnedDisco, p.PublicKey)
} else {
// The new disco key does not match what we received via
// TSMP for this peer. This is unexpected, so log it.
// If it does happen, overwrite the previously-saved
// disco key with the new one for now: We expect another
// update must be pending in that case, so keep the map
// entry.
// The reason why this should never happen is that only a single
// request is coming through the netmap pipeline at a time, and there
// should realistically ever only be a single entry in the map. This
// is really a belt and suspenders solution to find usage that is
// inconsistent with our expectations.
e.logf("wgengine: [unexpected] Reconfig: using TSMP key for %s (control stale): tsmp=%q control=%q old=%q",
pub.ShortString(), discoTSMP, p.DiscoKey, old)
metricTSMPLearnedKeyMismatch.Add(1)
p.DiscoKey = discoTSMP
// Skip session clear.
continue
}
// Skip session clear no matter what.
continue
// The new disco key does not match what we received via
// TSMP for this peer. This is unexpected, though possible
// if processing a change in a large netmap ends up taking
// longer than the 2 second timeout in
// [controlClient.mapRoutineState.UpdateNetmapDelta], or if
// the context is cancelled mid update. Log the event, and reset
// the connection as it is possibly a stale entry in the map
// instead of a TSMP disco key update that led us here.
e.logf("wgengine: [unexpected] Reconfig: using TSMP key for %s (control stale): tsmp=%q control=%q old=%q",
pub.ShortString(), discoTSMP, p.DiscoKey, old)
metricTSMPLearnedKeyMismatch.Add(1)
}
discoChanged[pub] = true

View File

@ -264,8 +264,9 @@ func TestUserspaceEngineTSMPLearnedMismatch(t *testing.T) {
wrongKey bool
}{
{tsmp: false, inMap: false, wrongKey: false},
{tsmp: true, inMap: false, wrongKey: true},
{tsmp: false, inMap: false, wrongKey: false},
{tsmp: true, inMap: false, wrongKey: false},
{tsmp: true, inMap: true, wrongKey: true},
{tsmp: false, inMap: true, wrongKey: false},
}
nkHex := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"