mirror of
https://github.com/tailscale/tailscale.git
synced 2025-12-03 08:22:00 +01:00
cmd/tailscaled,ipn: show a health warning when state store fails to open (#17883)
With the introduction of node sealing, store.New fails in some cases because the TPM device has been reset or is unavailable. Currently this makes tailscaled crash at startup, which is not obvious to the user until they check the logs.

Instead of crashing at startup, tailscaled now starts with an in-memory store and sets a health warning about state initialization, with a link to (future) docs on what to do. While this health warning is set, any login attempts are also blocked, to avoid masking the problem with an ephemeral node registration.

Updates #15830
Updates #17654

Signed-off-by: Andrew Lytvynov <awly@tailscale.com>
This commit is contained in:
parent
de8ed203e0
commit
c679aaba32
@ -69,7 +69,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
|
||||
tailscale.com/ipn/ipnstate from tailscale.com/control/controlclient+
|
||||
tailscale.com/ipn/localapi from tailscale.com/ipn/ipnserver
|
||||
tailscale.com/ipn/store from tailscale.com/cmd/tailscaled
|
||||
tailscale.com/ipn/store/mem from tailscale.com/ipn/store
|
||||
tailscale.com/ipn/store/mem from tailscale.com/ipn/store+
|
||||
tailscale.com/kube/kubetypes from tailscale.com/envknob
|
||||
tailscale.com/log/filelogger from tailscale.com/logpolicy
|
||||
tailscale.com/log/sockstatlog from tailscale.com/ipn/ipnlocal
|
||||
|
||||
@ -92,7 +92,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
|
||||
tailscale.com/ipn/ipnstate from tailscale.com/control/controlclient+
|
||||
tailscale.com/ipn/localapi from tailscale.com/ipn/ipnserver
|
||||
tailscale.com/ipn/store from tailscale.com/cmd/tailscaled
|
||||
tailscale.com/ipn/store/mem from tailscale.com/ipn/store
|
||||
tailscale.com/ipn/store/mem from tailscale.com/ipn/store+
|
||||
tailscale.com/kube/kubetypes from tailscale.com/envknob
|
||||
tailscale.com/licenses from tailscale.com/cmd/tailscale/cli
|
||||
tailscale.com/log/filelogger from tailscale.com/logpolicy
|
||||
|
||||
@ -33,12 +33,14 @@ import (
|
||||
"tailscale.com/feature"
|
||||
"tailscale.com/feature/buildfeatures"
|
||||
_ "tailscale.com/feature/condregister"
|
||||
"tailscale.com/health"
|
||||
"tailscale.com/hostinfo"
|
||||
"tailscale.com/ipn"
|
||||
"tailscale.com/ipn/conffile"
|
||||
"tailscale.com/ipn/ipnlocal"
|
||||
"tailscale.com/ipn/ipnserver"
|
||||
"tailscale.com/ipn/store"
|
||||
"tailscale.com/ipn/store/mem"
|
||||
"tailscale.com/logpolicy"
|
||||
"tailscale.com/logtail"
|
||||
"tailscale.com/net/dns"
|
||||
@ -644,7 +646,16 @@ func getLocalBackend(ctx context.Context, logf logger.Logf, logID logid.PublicID
|
||||
|
||||
store, err := store.New(logf, statePathOrDefault())
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("store.New: %w", err)
|
||||
// If we can't create the store (for example if it's TPM-sealed and the
|
||||
// TPM is reset), create a dummy in-memory store to propagate the error
|
||||
// to the user.
|
||||
ht, ok := sys.HealthTracker.GetOK()
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("store.New: %w", err)
|
||||
}
|
||||
logf("store.New failed: %v; starting with in-memory store with a health warning", err)
|
||||
store = new(mem.Store)
|
||||
ht.SetUnhealthy(ipn.StateStoreHealth, health.Args{health.ArgError: err.Error()})
|
||||
}
|
||||
sys.Set(store)
|
||||
|
||||
|
||||
@ -4,9 +4,17 @@
|
||||
package main // import "tailscale.com/cmd/tailscaled"
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"tailscale.com/envknob"
|
||||
"tailscale.com/ipn"
|
||||
"tailscale.com/net/netmon"
|
||||
"tailscale.com/tsd"
|
||||
"tailscale.com/tstest/deptest"
|
||||
"tailscale.com/types/logid"
|
||||
"tailscale.com/util/must"
|
||||
)
|
||||
|
||||
func TestNothing(t *testing.T) {
|
||||
@ -38,3 +46,45 @@ func TestDeps(t *testing.T) {
|
||||
},
|
||||
}.Check(t)
|
||||
}
|
||||
|
||||
func TestStateStoreError(t *testing.T) {
|
||||
logID, err := logid.NewPrivateID()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// Don't upload any logs from tests.
|
||||
envknob.SetNoLogsNoSupport()
|
||||
|
||||
args.statedir = t.TempDir()
|
||||
args.tunname = "userspace-networking"
|
||||
|
||||
t.Run("new state", func(t *testing.T) {
|
||||
sys := tsd.NewSystem()
|
||||
sys.NetMon.Set(must.Get(netmon.New(sys.Bus.Get(), t.Logf)))
|
||||
lb, err := getLocalBackend(t.Context(), t.Logf, logID.Public(), sys)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer lb.Shutdown()
|
||||
if lb.HealthTracker().IsUnhealthy(ipn.StateStoreHealth) {
|
||||
t.Errorf("StateStoreHealth is unhealthy on fresh LocalBackend:\n%s", strings.Join(lb.HealthTracker().Strings(), "\n"))
|
||||
}
|
||||
})
|
||||
t.Run("corrupt state", func(t *testing.T) {
|
||||
sys := tsd.NewSystem()
|
||||
sys.NetMon.Set(must.Get(netmon.New(sys.Bus.Get(), t.Logf)))
|
||||
// Populate the state file with something that will fail to parse to
|
||||
// trigger an error from store.New.
|
||||
if err := os.WriteFile(statePathOrDefault(), []byte("bad json"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
lb, err := getLocalBackend(t.Context(), t.Logf, logID.Public(), sys)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer lb.Shutdown()
|
||||
if !lb.HealthTracker().IsUnhealthy(ipn.StateStoreHealth) {
|
||||
t.Errorf("StateStoreHealth is healthy when state file is corrupt")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@ -3747,6 +3747,9 @@ func (b *LocalBackend) StartLoginInteractive(ctx context.Context) error {
|
||||
// the control plane sends us one. Otherwise, the notification will be delivered to all
|
||||
// active [watchSession]s.
|
||||
func (b *LocalBackend) StartLoginInteractiveAs(ctx context.Context, user ipnauth.Actor) error {
|
||||
if b.health.IsUnhealthy(ipn.StateStoreHealth) {
|
||||
return errors.New("cannot log in when state store is unhealthy")
|
||||
}
|
||||
b.mu.Lock()
|
||||
defer b.mu.Unlock()
|
||||
if b.cc == nil {
|
||||
@ -5677,6 +5680,9 @@ func (b *LocalBackend) NodeKey() key.NodePublic {
|
||||
//
|
||||
// b.mu must be held
|
||||
func (b *LocalBackend) nextStateLocked() ipn.State {
|
||||
if b.health.IsUnhealthy(ipn.StateStoreHealth) {
|
||||
return ipn.NoState
|
||||
}
|
||||
var (
|
||||
cc = b.cc
|
||||
cn = b.currentNode()
|
||||
@ -6936,6 +6942,9 @@ func (b *LocalBackend) CurrentProfile() ipn.LoginProfileView {
|
||||
|
||||
// NewProfile creates and switches to the new profile.
|
||||
func (b *LocalBackend) NewProfile() error {
|
||||
if b.health.IsUnhealthy(ipn.StateStoreHealth) {
|
||||
return errors.New("cannot log in when state store is unhealthy")
|
||||
}
|
||||
b.mu.Lock()
|
||||
defer b.mu.Unlock()
|
||||
|
||||
|
||||
@ -930,7 +930,10 @@ func (h *Handler) serveLoginInteractive(w http.ResponseWriter, r *http.Request)
|
||||
http.Error(w, "want POST", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
h.b.StartLoginInteractiveAs(r.Context(), h.Actor)
|
||||
if err := h.b.StartLoginInteractiveAs(r.Context(), h.Actor); err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
return
|
||||
}
|
||||
@ -949,6 +952,11 @@ func (h *Handler) serveStart(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
if h.b.HealthTracker().IsUnhealthy(ipn.StateStoreHealth) {
|
||||
http.Error(w, "cannot start backend when state store is unhealthy", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
err := h.b.Start(o)
|
||||
if err != nil {
|
||||
// TODO(bradfitz): map error to a good HTTP error
|
||||
|
||||
@ -25,9 +25,11 @@ import (
|
||||
"testing"
|
||||
|
||||
"tailscale.com/client/tailscale/apitype"
|
||||
"tailscale.com/health"
|
||||
"tailscale.com/ipn"
|
||||
"tailscale.com/ipn/ipnauth"
|
||||
"tailscale.com/ipn/ipnlocal"
|
||||
"tailscale.com/ipn/ipnstate"
|
||||
"tailscale.com/ipn/store/mem"
|
||||
"tailscale.com/tailcfg"
|
||||
"tailscale.com/tsd"
|
||||
@ -428,3 +430,73 @@ func TestKeepItSorted(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestServeWithUnhealthyState(t *testing.T) {
|
||||
tstest.Replace(t, &validLocalHostForTesting, true)
|
||||
h := &Handler{
|
||||
PermitRead: true,
|
||||
PermitWrite: true,
|
||||
b: newTestLocalBackend(t),
|
||||
logf: t.Logf,
|
||||
}
|
||||
h.b.HealthTracker().SetUnhealthy(ipn.StateStoreHealth, health.Args{health.ArgError: "testing"})
|
||||
if err := h.b.Start(ipn.Options{}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
check500Body := func(wantResp string) func(t *testing.T, code int, resp []byte) {
|
||||
return func(t *testing.T, code int, resp []byte) {
|
||||
if code != http.StatusInternalServerError {
|
||||
t.Errorf("got code: %v, want %v\nresponse: %q", code, http.StatusInternalServerError, resp)
|
||||
}
|
||||
if got := strings.TrimSpace(string(resp)); got != wantResp {
|
||||
t.Errorf("got response: %q, want %q", got, wantResp)
|
||||
}
|
||||
}
|
||||
}
|
||||
tests := []struct {
|
||||
desc string
|
||||
req *http.Request
|
||||
check func(t *testing.T, code int, resp []byte)
|
||||
}{
|
||||
{
|
||||
desc: "status",
|
||||
req: httptest.NewRequest("GET", "http://localhost:1234/localapi/v0/status", nil),
|
||||
check: func(t *testing.T, code int, resp []byte) {
|
||||
if code != http.StatusOK {
|
||||
t.Errorf("got code: %v, want %v\nresponse: %q", code, http.StatusOK, resp)
|
||||
}
|
||||
var status ipnstate.Status
|
||||
if err := json.Unmarshal(resp, &status); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if status.BackendState != "NoState" {
|
||||
t.Errorf("got backend state: %q, want %q", status.BackendState, "NoState")
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "login-interactive",
|
||||
req: httptest.NewRequest("POST", "http://localhost:1234/localapi/v0/login-interactive", nil),
|
||||
check: check500Body("cannot log in when state store is unhealthy"),
|
||||
},
|
||||
{
|
||||
desc: "start",
|
||||
req: httptest.NewRequest("POST", "http://localhost:1234/localapi/v0/start", strings.NewReader("{}")),
|
||||
check: check500Body("cannot start backend when state store is unhealthy"),
|
||||
},
|
||||
{
|
||||
desc: "new-profile",
|
||||
req: httptest.NewRequest("PUT", "http://localhost:1234/localapi/v0/profiles/", nil),
|
||||
check: check500Body("cannot log in when state store is unhealthy"),
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.desc, func(t *testing.T) {
|
||||
resp := httptest.NewRecorder()
|
||||
h.ServeHTTP(resp, tt.req)
|
||||
tt.check(t, resp.Code, resp.Body.Bytes())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
15
ipn/store.go
15
ipn/store.go
@ -10,6 +10,8 @@ import (
|
||||
"fmt"
|
||||
"net"
|
||||
"strconv"
|
||||
|
||||
"tailscale.com/health"
|
||||
)
|
||||
|
||||
// ErrStateNotExist is returned by StateStore.ReadState when the
|
||||
@ -60,6 +62,19 @@ const (
|
||||
TaildropReceivedKey = StateKey("_taildrop-received")
|
||||
)
|
||||
|
||||
// StateStoreHealth is a Warnable set when store.New fails at startup. If
|
||||
// unhealthy, we block all login attempts and return a health message in status
|
||||
// responses.
|
||||
var StateStoreHealth = health.Register(&health.Warnable{
|
||||
Code: "state-store-health",
|
||||
Severity: health.SeverityHigh,
|
||||
Title: "Tailscale state store failed to initialize",
|
||||
Text: func(args health.Args) string {
|
||||
return fmt.Sprintf("State store failed to initialize, Tailscale will not work until this is resolved. See https://tailscale.com/s/state-store-init-error. Error: %s", args[health.ArgError])
|
||||
},
|
||||
ImpactsConnectivity: true,
|
||||
})
|
||||
|
||||
// CurrentProfileID returns the StateKey that stores the
|
||||
// current profile ID. The value is a JSON-encoded LoginProfile.
|
||||
// If the userID is empty, the key returned is CurrentProfileStateKey,
|
||||
|
||||
@ -22,6 +22,7 @@ import (
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
@ -36,6 +37,7 @@ import (
|
||||
"tailscale.com/cmd/testwrapper/flakytest"
|
||||
"tailscale.com/feature"
|
||||
_ "tailscale.com/feature/clientupdate"
|
||||
"tailscale.com/health"
|
||||
"tailscale.com/hostinfo"
|
||||
"tailscale.com/ipn"
|
||||
"tailscale.com/net/tsaddr"
|
||||
@ -2246,3 +2248,38 @@ func TestNetworkLock(t *testing.T) {
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestNodeWithBadStateFile(t *testing.T) {
|
||||
tstest.Shard(t)
|
||||
tstest.Parallel(t)
|
||||
env := NewTestEnv(t)
|
||||
n1 := NewTestNode(t, env)
|
||||
if err := os.WriteFile(n1.stateFile, []byte("bad json"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
d1 := n1.StartDaemon()
|
||||
n1.AwaitResponding()
|
||||
|
||||
// Make sure the health message shows up in status output.
|
||||
n1.AwaitBackendState("NoState")
|
||||
st := n1.MustStatus()
|
||||
wantHealth := ipn.StateStoreHealth.Text(health.Args{health.ArgError: ""})
|
||||
if !slices.ContainsFunc(st.Health, func(m string) bool { return strings.HasPrefix(m, wantHealth) }) {
|
||||
t.Errorf("Status does not contain expected health message %q\ngot health messages: %q", wantHealth, st.Health)
|
||||
}
|
||||
|
||||
// Make sure login attempts are rejected.
|
||||
cmd := n1.Tailscale("up", "--login-server="+n1.env.ControlURL())
|
||||
t.Logf("Running %v ...", cmd)
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err == nil {
|
||||
t.Fatalf("up succeeded with output %q", out)
|
||||
}
|
||||
wantOut := "cannot start backend when state store is unhealthy"
|
||||
if !strings.Contains(string(out), wantOut) {
|
||||
t.Fatalf("got up output:\n%s\nwant:\n%s", string(out), wantOut)
|
||||
}
|
||||
|
||||
d1.MustCleanShutdown(t)
|
||||
}
|
||||
|
||||
@ -27,6 +27,7 @@ import (
|
||||
_ "tailscale.com/ipn/ipnlocal"
|
||||
_ "tailscale.com/ipn/ipnserver"
|
||||
_ "tailscale.com/ipn/store"
|
||||
_ "tailscale.com/ipn/store/mem"
|
||||
_ "tailscale.com/logpolicy"
|
||||
_ "tailscale.com/logtail"
|
||||
_ "tailscale.com/net/dns"
|
||||
|
||||
@ -27,6 +27,7 @@ import (
|
||||
_ "tailscale.com/ipn/ipnlocal"
|
||||
_ "tailscale.com/ipn/ipnserver"
|
||||
_ "tailscale.com/ipn/store"
|
||||
_ "tailscale.com/ipn/store/mem"
|
||||
_ "tailscale.com/logpolicy"
|
||||
_ "tailscale.com/logtail"
|
||||
_ "tailscale.com/net/dns"
|
||||
|
||||
@ -27,6 +27,7 @@ import (
|
||||
_ "tailscale.com/ipn/ipnlocal"
|
||||
_ "tailscale.com/ipn/ipnserver"
|
||||
_ "tailscale.com/ipn/store"
|
||||
_ "tailscale.com/ipn/store/mem"
|
||||
_ "tailscale.com/logpolicy"
|
||||
_ "tailscale.com/logtail"
|
||||
_ "tailscale.com/net/dns"
|
||||
|
||||
@ -27,6 +27,7 @@ import (
|
||||
_ "tailscale.com/ipn/ipnlocal"
|
||||
_ "tailscale.com/ipn/ipnserver"
|
||||
_ "tailscale.com/ipn/store"
|
||||
_ "tailscale.com/ipn/store/mem"
|
||||
_ "tailscale.com/logpolicy"
|
||||
_ "tailscale.com/logtail"
|
||||
_ "tailscale.com/net/dns"
|
||||
|
||||
@ -37,6 +37,7 @@ import (
|
||||
_ "tailscale.com/ipn/ipnlocal"
|
||||
_ "tailscale.com/ipn/ipnserver"
|
||||
_ "tailscale.com/ipn/store"
|
||||
_ "tailscale.com/ipn/store/mem"
|
||||
_ "tailscale.com/logpolicy"
|
||||
_ "tailscale.com/logtail"
|
||||
_ "tailscale.com/net/dns"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user