ipn/ipnlocal, tka: compact TKA state after every sync

Previously a TKA compaction would only run when a node started, which meant a long-running node could accumulate an ever-increasing amount of TKA state and use unbounded storage. This patch changes TKA so that it runs a compaction after every sync, bounding local state growth.

Updates https://github.com/tailscale/corp/issues/33537

Change-Id: I91df887ea0c5a5b00cb6caced85aeffa2a4b24ee
Signed-off-by: Alex Chan <alexc@tailscale.com>
This commit is contained in:
Alex Chan 2025-11-17 16:38:57 +00:00 committed by Alex Chan
parent 38ccdbe35c
commit e1dd9222d4
10 changed files with 276 additions and 31 deletions

View File

@ -360,6 +360,13 @@ func (b *LocalBackend) tkaSyncIfNeeded(nm *netmap.NetworkMap, prefs ipn.PrefsVie
if err := b.tkaSyncLocked(ourNodeKey); err != nil {
return fmt.Errorf("tka sync: %w", err)
}
// Try to compact the TKA state, to avoid unbounded storage on nodes.
//
// We run this on every sync so that clients compact consistently. In many
// cases this will be a no-op.
if err := b.tka.authority.Compact(b.tka.storage, tkaCompactionDefaults); err != nil {
return fmt.Errorf("tka compact: %w", err)
}
}
return nil
@ -508,7 +515,7 @@ func (b *LocalBackend) tkaBootstrapFromGenesisLocked(g tkatype.MarshaledAUM, per
if root == "" {
b.health.SetUnhealthy(noNetworkLockStateDirWarnable, nil)
b.logf("network-lock using in-memory storage; no state directory")
storage = &tka.Mem{}
storage = tka.ChonkMem()
} else {
chonkDir := b.chonkPathLocked()
chonk, err := tka.ChonkDir(chonkDir)
@ -686,7 +693,7 @@ func (b *LocalBackend) NetworkLockInit(keys []tka.Key, disablementValues [][]byt
// We use an in-memory tailchonk because we don't want to commit to
// the filesystem until we've finished the initialization sequence,
// just in case something goes wrong.
_, genesisAUM, err := tka.Create(&tka.Mem{}, tka.State{
_, genesisAUM, err := tka.Create(tka.ChonkMem(), tka.State{
Keys: keys,
// TODO(tom): s/tka.State.DisablementSecrets/tka.State.DisablementValues
// This will center on consistent nomenclature:

View File

@ -17,6 +17,7 @@ import (
"path/filepath"
"reflect"
"testing"
"time"
go4mem "go4.org/mem"
@ -31,6 +32,7 @@ import (
"tailscale.com/tailcfg"
"tailscale.com/tka"
"tailscale.com/tsd"
"tailscale.com/tstest"
"tailscale.com/types/key"
"tailscale.com/types/netmap"
"tailscale.com/types/persist"
@ -89,7 +91,7 @@ func TestTKAEnablementFlow(t *testing.T) {
// our mock server can communicate.
nlPriv := key.NewNLPrivate()
key := tka.Key{Kind: tka.Key25519, Public: nlPriv.Public().Verifier(), Votes: 2}
a1, genesisAUM, err := tka.Create(&tka.Mem{}, tka.State{
a1, genesisAUM, err := tka.Create(tka.ChonkMem(), tka.State{
Keys: []tka.Key{key},
DisablementSecrets: [][]byte{bytes.Repeat([]byte{0xa5}, 32)},
}, nlPriv)
@ -399,7 +401,7 @@ func TestTKASync(t *testing.T) {
// Setup the tka authority on the control plane.
key := tka.Key{Kind: tka.Key25519, Public: nlPriv.Public().Verifier(), Votes: 2}
controlStorage := &tka.Mem{}
controlStorage := tka.ChonkMem()
controlAuthority, bootstrap, err := tka.Create(controlStorage, tka.State{
Keys: []tka.Key{key, someKey},
DisablementSecrets: [][]byte{tka.DisablementKDF(disablementSecret)},
@ -548,10 +550,226 @@ func TestTKASync(t *testing.T) {
}
}
// Whenever we run a TKA sync and get new state from control, we compact the
// local state.
//
// This test builds a long AUM chain on a fake control plane, syncs a node
// against it twice via tkaSyncIfNeeded, and then checks the node retains
// fewer AUMs than control — i.e. that the sync triggered a compaction.
func TestTKASyncTriggersCompact(t *testing.T) {
	// An extra key that we repeatedly remove/re-add below to generate AUMs.
	someKeyPriv := key.NewNLPrivate()
	someKey := tka.Key{Kind: tka.Key25519, Public: someKeyPriv.Public().Verifier(), Votes: 1}

	disablementSecret := bytes.Repeat([]byte{0xa5}, 32)

	// Set up a profile with node + network-lock keys so the backend can sign.
	nodePriv := key.NewNode()
	nlPriv := key.NewNLPrivate()
	pm := must.Get(newProfileManager(new(mem.Store), t.Logf, health.NewTracker(eventbustest.NewBus(t))))
	must.Do(pm.SetPrefs((&ipn.Prefs{
		Persist: &persist.Persist{
			PrivateNodeKey: nodePriv,
			NetworkLockKey: nlPriv,
		},
	}).View(), ipn.NetworkProfile{}))

	// Create a clock, and roll it back by 30 days.
	//
	// Our compaction algorithm preserves AUMs received in the last 14 days, so
	// we need to backdate the commit times to make the AUMs eligible for compaction.
	clock := tstest.NewClock(tstest.ClockOpts{})
	clock.Advance(-30 * 24 * time.Hour)

	// Set up the TKA authority on the control plane.
	key := tka.Key{Kind: tka.Key25519, Public: nlPriv.Public().Verifier(), Votes: 2}
	controlStorage := tka.ChonkMem()
	controlStorage.SetClock(clock)
	controlAuthority, bootstrap, err := tka.Create(controlStorage, tka.State{
		Keys:               []tka.Key{key, someKey},
		DisablementSecrets: [][]byte{tka.DisablementKDF(disablementSecret)},
	}, nlPriv)
	if err != nil {
		t.Fatalf("tka.Create() failed: %v", err)
	}

	// Fill the control plane TKA authority with a lot of AUMs, enough so that:
	//
	//  1. the chain of AUMs includes some checkpoints
	//  2. the chain is long enough it would be trimmed if we ran the compaction
	//     algorithm with the defaults
	for range 100 {
		// Each remove+add pair produces AUMs without changing the final state.
		upd := controlAuthority.NewUpdater(nlPriv)
		if err := upd.RemoveKey(someKey.MustID()); err != nil {
			t.Fatalf("RemoveKey: %v", err)
		}
		if err := upd.AddKey(someKey); err != nil {
			t.Fatalf("AddKey: %v", err)
		}
		aums, err := upd.Finalize(controlStorage)
		if err != nil {
			t.Fatalf("Finalize: %v", err)
		}
		if err := controlAuthority.Inform(controlStorage, aums); err != nil {
			t.Fatalf("controlAuthority.Inform() failed: %v", err)
		}
	}

	// Set up the TKA authority on the node.
	nodeStorage := tka.ChonkMem()
	nodeStorage.SetClock(clock)
	nodeAuthority, err := tka.Bootstrap(nodeStorage, bootstrap)
	if err != nil {
		t.Fatalf("tka.Bootstrap() failed: %v", err)
	}

	// Make a mock control server that answers the two TKA sync endpoints by
	// delegating to controlAuthority, translating between the wire types
	// (tailcfg) and the tka package types via toSyncOffer/fromSyncOffer.
	ts, client := fakeNoiseServer(t, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		defer r.Body.Close()
		switch r.URL.Path {
		case "/machine/tka/sync/offer":
			body := new(tailcfg.TKASyncOfferRequest)
			if err := json.NewDecoder(r.Body).Decode(body); err != nil {
				t.Fatal(err)
			}
			t.Logf("got sync offer:\n%+v", body)
			nodeOffer, err := toSyncOffer(body.Head, body.Ancestors)
			if err != nil {
				t.Fatal(err)
			}
			controlOffer, err := controlAuthority.SyncOffer(controlStorage)
			if err != nil {
				t.Fatal(err)
			}
			// Compute which AUMs the node is missing relative to control.
			sendAUMs, err := controlAuthority.MissingAUMs(controlStorage, nodeOffer)
			if err != nil {
				t.Fatal(err)
			}
			head, ancestors, err := fromSyncOffer(controlOffer)
			if err != nil {
				t.Fatal(err)
			}
			resp := tailcfg.TKASyncOfferResponse{
				Head:        head,
				Ancestors:   ancestors,
				MissingAUMs: make([]tkatype.MarshaledAUM, len(sendAUMs)),
			}
			for i, a := range sendAUMs {
				resp.MissingAUMs[i] = a.Serialize()
			}
			t.Logf("responding to sync offer with:\n%+v", resp)
			w.WriteHeader(200)
			if err := json.NewEncoder(w).Encode(resp); err != nil {
				t.Fatal(err)
			}
		case "/machine/tka/sync/send":
			body := new(tailcfg.TKASyncSendRequest)
			if err := json.NewDecoder(r.Body).Decode(body); err != nil {
				t.Fatal(err)
			}
			t.Logf("got sync send:\n%+v", body)
			var remoteHead tka.AUMHash
			if err := remoteHead.UnmarshalText([]byte(body.Head)); err != nil {
				t.Fatalf("head unmarshal: %v", err)
			}
			// Apply any AUMs the node sent us to control's authority.
			toApply := make([]tka.AUM, len(body.MissingAUMs))
			for i, a := range body.MissingAUMs {
				if err := toApply[i].Unserialize(a); err != nil {
					t.Fatalf("decoding missingAUM[%d]: %v", i, err)
				}
			}
			if len(toApply) > 0 {
				if err := controlAuthority.Inform(controlStorage, toApply); err != nil {
					t.Fatalf("control.Inform(%+v) failed: %v", toApply, err)
				}
			}
			head, err := controlAuthority.Head().MarshalText()
			if err != nil {
				t.Fatal(err)
			}
			w.WriteHeader(200)
			if err := json.NewEncoder(w).Encode(tailcfg.TKASyncSendResponse{
				Head: string(head),
			}); err != nil {
				t.Fatal(err)
			}
		default:
			t.Errorf("unhandled endpoint path: %v", r.URL.Path)
			w.WriteHeader(404)
		}
	}))
	defer ts.Close()

	// Setup the client.
	cc, _ := fakeControlClient(t, client)
	b := LocalBackend{
		cc:     cc,
		ccAuto: cc,
		logf:   t.Logf,
		pm:     pm,
		store:  pm.Store(),
		tka: &tkaState{
			authority: nodeAuthority,
			storage:   nodeStorage,
		},
	}

	// Trigger a sync.
	err = b.tkaSyncIfNeeded(&netmap.NetworkMap{
		TKAEnabled: true,
		TKAHead:    controlAuthority.Head(),
	}, pm.CurrentPrefs())
	if err != nil {
		t.Errorf("tkaSyncIfNeeded() failed: %v", err)
	}

	// Add a new AUM in control, so the second sync below has fresh state to
	// fetch (and therefore runs the post-sync compaction).
	upd := controlAuthority.NewUpdater(nlPriv)
	if err := upd.RemoveKey(someKey.MustID()); err != nil {
		t.Fatalf("RemoveKey: %v", err)
	}
	aums, err := upd.Finalize(controlStorage)
	if err != nil {
		t.Fatalf("Finalize: %v", err)
	}
	if err := controlAuthority.Inform(controlStorage, aums); err != nil {
		t.Fatalf("controlAuthority.Inform() failed: %v", err)
	}

	// Run a second sync, which should trigger a compaction.
	err = b.tkaSyncIfNeeded(&netmap.NetworkMap{
		TKAEnabled: true,
		TKAHead:    controlAuthority.Head(),
	}, pm.CurrentPrefs())
	if err != nil {
		t.Errorf("tkaSyncIfNeeded() failed: %v", err)
	}

	// Check that the node and control plane are in sync.
	if nodeHead, controlHead := b.tka.authority.Head(), controlAuthority.Head(); nodeHead != controlHead {
		t.Errorf("node head = %v, want %v", nodeHead, controlHead)
	}

	// Check the node has compacted away some of its AUMs; that it has purged some AUMs which
	// are still kept in the control plane.
	nodeAUMs, err := b.tka.storage.AllAUMs()
	if err != nil {
		t.Errorf("AllAUMs() for node failed: %v", err)
	}
	controlAUMS, err := controlStorage.AllAUMs()
	if err != nil {
		t.Errorf("AllAUMs() for control failed: %v", err)
	}
	if len(nodeAUMs) == len(controlAUMS) {
		t.Errorf("node has not compacted; it has the same number of AUMs as control (node = control = %d)", len(nodeAUMs))
	}
}
func TestTKAFilterNetmap(t *testing.T) {
nlPriv := key.NewNLPrivate()
nlKey := tka.Key{Kind: tka.Key25519, Public: nlPriv.Public().Verifier(), Votes: 2}
storage := &tka.Mem{}
storage := tka.ChonkMem()
authority, _, err := tka.Create(storage, tka.State{
Keys: []tka.Key{nlKey},
DisablementSecrets: [][]byte{bytes.Repeat([]byte{0xa5}, 32)},

View File

@ -28,7 +28,7 @@ func TestAuthorityBuilderAddKey(t *testing.T) {
pub, priv := testingKey25519(t, 1)
key := Key{Kind: Key25519, Public: pub, Votes: 2}
storage := &Mem{}
storage := ChonkMem()
a, _, err := Create(storage, State{
Keys: []Key{key},
DisablementSecrets: [][]byte{DisablementKDF([]byte{1, 2, 3})},
@ -62,7 +62,7 @@ func TestAuthorityBuilderMaxKey(t *testing.T) {
pub, priv := testingKey25519(t, 1)
key := Key{Kind: Key25519, Public: pub, Votes: 2}
storage := &Mem{}
storage := ChonkMem()
a, _, err := Create(storage, State{
Keys: []Key{key},
DisablementSecrets: [][]byte{DisablementKDF([]byte{1, 2, 3})},
@ -109,7 +109,7 @@ func TestAuthorityBuilderRemoveKey(t *testing.T) {
pub2, _ := testingKey25519(t, 2)
key2 := Key{Kind: Key25519, Public: pub2, Votes: 1}
storage := &Mem{}
storage := ChonkMem()
a, _, err := Create(storage, State{
Keys: []Key{key, key2},
DisablementSecrets: [][]byte{DisablementKDF([]byte{1, 2, 3})},
@ -155,7 +155,7 @@ func TestAuthorityBuilderSetKeyVote(t *testing.T) {
pub, priv := testingKey25519(t, 1)
key := Key{Kind: Key25519, Public: pub, Votes: 2}
storage := &Mem{}
storage := ChonkMem()
a, _, err := Create(storage, State{
Keys: []Key{key},
DisablementSecrets: [][]byte{DisablementKDF([]byte{1, 2, 3})},
@ -191,7 +191,7 @@ func TestAuthorityBuilderSetKeyMeta(t *testing.T) {
pub, priv := testingKey25519(t, 1)
key := Key{Kind: Key25519, Public: pub, Votes: 2, Meta: map[string]string{"a": "b"}}
storage := &Mem{}
storage := ChonkMem()
a, _, err := Create(storage, State{
Keys: []Key{key},
DisablementSecrets: [][]byte{DisablementKDF([]byte{1, 2, 3})},
@ -227,7 +227,7 @@ func TestAuthorityBuilderMultiple(t *testing.T) {
pub, priv := testingKey25519(t, 1)
key := Key{Kind: Key25519, Public: pub, Votes: 2}
storage := &Mem{}
storage := ChonkMem()
a, _, err := Create(storage, State{
Keys: []Key{key},
DisablementSecrets: [][]byte{DisablementKDF([]byte{1, 2, 3})},
@ -275,7 +275,7 @@ func TestAuthorityBuilderCheckpointsAfterXUpdates(t *testing.T) {
pub, priv := testingKey25519(t, 1)
key := Key{Kind: Key25519, Public: pub, Votes: 2}
storage := &Mem{}
storage := ChonkMem()
a, _, err := Create(storage, State{
Keys: []Key{key},
DisablementSecrets: [][]byte{DisablementKDF([]byte{1, 2, 3})},

View File

@ -285,25 +285,25 @@ func (c *testChain) makeAUM(v *testchainNode) AUM {
// Chonk returns a tailchonk containing all AUMs.
func (c *testChain) Chonk() Chonk {
var out Mem
out := ChonkMem()
for _, update := range c.AUMs {
if err := out.CommitVerifiedAUMs([]AUM{update}); err != nil {
panic(err)
}
}
return &out
return out
}
// ChonkWith returns a tailchonk containing the named AUMs.
func (c *testChain) ChonkWith(names ...string) Chonk {
var out Mem
out := ChonkMem()
for _, name := range names {
update := c.AUMs[name]
if err := out.CommitVerifiedAUMs([]AUM{update}); err != nil {
panic(err)
}
}
return &out
return out
}
type testchainOpt struct {

View File

@ -72,7 +72,7 @@ func TestNLPrivate(t *testing.T) {
// Test that key.NLPrivate implements Signer by making a new
// authority.
k := Key{Kind: Key25519, Public: pub.Verifier(), Votes: 1}
_, aum, err := Create(&Mem{}, State{
_, aum, err := Create(ChonkMem(), State{
Keys: []Key{k},
DisablementSecrets: [][]byte{bytes.Repeat([]byte{1}, 32)},
}, p)

View File

@ -346,7 +346,7 @@ func TestSyncSimpleE2E(t *testing.T) {
optKey("key", key, priv),
optSignAllUsing("key"))
nodeStorage := &Mem{}
nodeStorage := ChonkMem()
node, err := Bootstrap(nodeStorage, c.AUMs["G1"])
if err != nil {
t.Fatalf("node Bootstrap() failed: %v", err)

View File

@ -19,6 +19,8 @@ import (
"github.com/fxamacker/cbor/v2"
"tailscale.com/atomicfile"
"tailscale.com/tstime"
"tailscale.com/util/testenv"
)
// Chonk implementations provide durable storage for AUMs and other
@ -92,6 +94,7 @@ type Mem struct {
mu sync.RWMutex
aums map[AUMHash]AUM
commitTimes map[AUMHash]time.Time
clock tstime.Clock
// parentIndex is a map of AUMs to the AUMs for which they are
// the parent.
@ -103,6 +106,23 @@ type Mem struct {
lastActiveAncestor *AUMHash
}
// ChonkMem returns an in-memory [Chonk] implementation, suitable for
// holding TKA state that should not be persisted to disk.
func ChonkMem() *Mem {
	m := new(Mem)
	m.clock = tstime.DefaultClock{}
	return m
}
// SetClock overrides the clock that [Mem] uses when recording AUM commit
// times. This is only for use in tests, and will panic if called from
// non-test code.
func (c *Mem) SetClock(clock tstime.Clock) {
	inTest := testenv.InTest()
	if !inTest {
		panic("used SetClock in non-test code")
	}
	c.clock = clock
}
func (c *Mem) SetLastActiveAncestor(hash AUMHash) error {
c.mu.Lock()
defer c.mu.Unlock()
@ -173,7 +193,7 @@ updateLoop:
for _, aum := range updates {
aumHash := aum.Hash()
c.aums[aumHash] = aum
c.commitTimes[aumHash] = time.Now()
c.commitTimes[aumHash] = c.clock.Now()
parent, ok := aum.Parent()
if ok {

View File

@ -35,7 +35,7 @@ func randHash(t *testing.T, seed int64) [blake2s.Size]byte {
}
func TestImplementsChonk(t *testing.T) {
impls := []Chonk{&Mem{}, &FS{}}
impls := []Chonk{ChonkMem(), &FS{}}
t.Logf("chonks: %v", impls)
}
@ -229,7 +229,7 @@ func TestMarkActiveChain(t *testing.T) {
verdict := make(map[AUMHash]retainState, len(tc.chain))
// Build the state of the tailchonk for tests.
storage := &Mem{}
storage := ChonkMem()
var prev AUMHash
for i := range tc.chain {
if !prev.IsZero() {
@ -608,7 +608,7 @@ func TestCompactLongButYoung(t *testing.T) {
ourKey := Key{Kind: Key25519, Public: ourPriv.Public().Verifier(), Votes: 1}
someOtherKey := Key{Kind: Key25519, Public: key.NewNLPrivate().Public().Verifier(), Votes: 1}
storage := &Mem{}
storage := ChonkMem()
auth, _, err := Create(storage, State{
Keys: []Key{ourKey, someOtherKey},
DisablementSecrets: [][]byte{DisablementKDF(bytes.Repeat([]byte{0xa5}, 32))},

View File

@ -253,7 +253,7 @@ func TestOpenAuthority(t *testing.T) {
}
// Construct the state of durable storage.
chonk := &Mem{}
chonk := ChonkMem()
err := chonk.CommitVerifiedAUMs([]AUM{g1, i1, l1, i2, i3, l2, l3, g2, l4})
if err != nil {
t.Fatal(err)
@ -275,7 +275,7 @@ func TestOpenAuthority(t *testing.T) {
}
func TestOpenAuthority_EmptyErrors(t *testing.T) {
_, err := Open(&Mem{})
_, err := Open(ChonkMem())
if err == nil {
t.Error("Expected an error initializing an empty authority, got nil")
}
@ -319,7 +319,7 @@ func TestCreateBootstrapAuthority(t *testing.T) {
pub, priv := testingKey25519(t, 1)
key := Key{Kind: Key25519, Public: pub, Votes: 2}
a1, genesisAUM, err := Create(&Mem{}, State{
a1, genesisAUM, err := Create(ChonkMem(), State{
Keys: []Key{key},
DisablementSecrets: [][]byte{DisablementKDF([]byte{1, 2, 3})},
}, signer25519(priv))
@ -327,7 +327,7 @@ func TestCreateBootstrapAuthority(t *testing.T) {
t.Fatalf("Create() failed: %v", err)
}
a2, err := Bootstrap(&Mem{}, genesisAUM)
a2, err := Bootstrap(ChonkMem(), genesisAUM)
if err != nil {
t.Fatalf("Bootstrap() failed: %v", err)
}
@ -366,7 +366,7 @@ func TestAuthorityInformNonLinear(t *testing.T) {
optKey("key", key, priv),
optSignAllUsing("key"))
storage := &Mem{}
storage := ChonkMem()
a, err := Bootstrap(storage, c.AUMs["G1"])
if err != nil {
t.Fatalf("Bootstrap() failed: %v", err)
@ -411,7 +411,7 @@ func TestAuthorityInformLinear(t *testing.T) {
optKey("key", key, priv),
optSignAllUsing("key"))
storage := &Mem{}
storage := ChonkMem()
a, err := Bootstrap(storage, c.AUMs["G1"])
if err != nil {
t.Fatalf("Bootstrap() failed: %v", err)
@ -444,7 +444,7 @@ func TestInteropWithNLKey(t *testing.T) {
pub2 := key.NewNLPrivate().Public()
pub3 := key.NewNLPrivate().Public()
a, _, err := Create(&Mem{}, State{
a, _, err := Create(ChonkMem(), State{
Keys: []Key{
{
Kind: Key25519,

View File

@ -18,7 +18,7 @@ func TestImplementsChonk(t *testing.T) {
{
name: "Mem",
newChonk: func(t *testing.T) tka.Chonk {
return &tka.Mem{}
return tka.ChonkMem()
},
},
{
@ -42,7 +42,7 @@ func TestImplementsCompactableChonk(t *testing.T) {
{
name: "Mem",
newChonk: func(t *testing.T) tka.CompactableChonk {
return &tka.Mem{}
return tka.ChonkMem()
},
},
{