Backport Fix seal rewrap running unnecessarily into release/1.20.x+ent (#9899) (#9956)

Authored by Vault Automation on 2025-10-07 17:49:41 -04:00, committed by GitHub
parent 68dde4028d
commit cb2f69ee02
5 changed files with 101 additions and 67 deletions

changelog/_9466.txt Normal file

@@ -0,0 +1,3 @@
```release-note:bug
core (enterprise): Avoid duplicate seal rewrapping, and ensure that cluster secondaries rewrap after a seal migration.
```
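The core of this backport, visible in the Core hunks further down, replaces the old uint32 `sealMigrationDone` flag with an `atomic.Pointer[time.Time]` set via `SetSealMigrationDone`. Below is a minimal standalone sketch of that pattern; `doneFlag`, `markDone`, and `isDone` are illustrative names, not Vault APIs.

```go
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// doneFlag mimics the pattern adopted in this change: instead of a 0/1
// uint32, the completion time is stored behind an atomic pointer, so readers
// can both test whether the step finished and see when it happened.
type doneFlag struct {
	done atomic.Pointer[time.Time]
}

func (f *doneFlag) markDone() {
	now := time.Now()
	f.done.Store(&now)
}

func (f *doneFlag) isDone() bool {
	t := f.done.Load()
	return t != nil && !t.IsZero()
}

func main() {
	var f doneFlag
	fmt.Println(f.isDone()) // false
	f.markDone()
	fmt.Println(f.isDone()) // true
}
```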


@@ -10,7 +10,6 @@ import (
"io"
"os"
"path/filepath"
"strings"
"testing"
"time"
@@ -211,20 +210,13 @@ type TestLogger struct {
}
func NewTestLogger(t testing.TB) *TestLogger {
return NewTestLoggerWithSuffix(t, "")
}
func NewTestLoggerWithSuffix(t testing.TB, logFileSuffix string) *TestLogger {
var logFile *os.File
var logPath string
output := os.Stderr
logDir := os.Getenv("VAULT_TEST_LOG_DIR")
if logDir != "" {
if logFileSuffix != "" && !strings.HasPrefix(logFileSuffix, "_") {
logFileSuffix = "_" + logFileSuffix
}
logPath = filepath.Join(logDir, t.Name()+logFileSuffix+".log")
logPath = filepath.Join(logDir, t.Name()+".log")
// t.Name may include slashes.
dir, _ := filepath.Split(logPath)
err := os.MkdirAll(dir, 0o755)


@@ -33,7 +33,6 @@ import (
"time"
"github.com/docker/docker/api/types"
"github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/volume"
docker "github.com/docker/docker/client"
"github.com/hashicorp/go-cleanhttp"
@@ -79,6 +78,7 @@ type DockerCluster struct {
storage testcluster.ClusterStorage
disableMlock bool
disableTLS bool
}
func (dc *DockerCluster) NamedLogger(s string) log.Logger {
@@ -134,6 +134,9 @@ func (dc *DockerCluster) SetRecoveryKeys(keys [][]byte) {
}
func (dc *DockerCluster) GetCACertPEMFile() string {
if dc.disableTLS {
return ""
}
return testcluster.DefaultCAFile
}
@@ -449,6 +452,7 @@ func NewDockerCluster(ctx context.Context, opts *DockerClusterOptions) (*DockerC
CA: opts.CA,
storage: opts.Storage,
disableMlock: opts.DisableMlock,
disableTLS: opts.DisableTLS,
}
if err := dc.setupDockerCluster(ctx, opts); err != nil {
@@ -778,12 +782,23 @@ func (n *DockerClusterNode) Start(ctx context.Context, opts *DockerClusterOption
}
n.Logger.Trace(s)
}
// If a container gets restarted, and we reissue a ContainerLogs call using
// Follow=true, we'll see all the logs already consumed before we start seeing
// new ones. Use lastTS to avoid duplication.
lastTS := ""
logStdout := &LogConsumerWriter{logConsumer}
logStderr := &LogConsumerWriter{func(s string) {
if seenLogs.CAS(false, true) {
wg.Done()
}
testcluster.JSONLogNoTimestamp(n.Logger, s)
d := json.NewDecoder(strings.NewReader(s))
m := map[string]any{}
if err := d.Decode(&m); err != nil {
n.Logger.Error("failed to decode json output from dev vault", "error", err, "input", s)
return
}
lastTS = testcluster.JSONLogNoTimestampFromMap(n.Logger, lastTS, m)
}}
postStartFunc := func(containerID string, realIP string) error {
@@ -920,7 +935,7 @@ func (n *DockerClusterNode) Pause(ctx context.Context) error {
func (n *DockerClusterNode) Restart(ctx context.Context) error {
timeout := 5
err := n.DockerAPI.ContainerRestart(ctx, n.Container.ID, container.StopOptions{Timeout: &timeout})
err := n.runner.RestartContainerWithTimeout(ctx, n.Container.ID, timeout)
if err != nil {
return err
}


@@ -17,6 +17,18 @@ func JSONLogNoTimestamp(outlog hclog.Logger, text string) {
outlog.Error("failed to decode json output from dev vault", "error", err, "input", text)
return
}
JSONLogNoTimestampFromMap(outlog, "", m)
}
func JSONLogNoTimestampFromMap(outlog hclog.Logger, min string, m map[string]any) string {
timeStamp := ""
if ts, ok := m["@timestamp"]; ok {
timeStamp = ts.(string)
// This works because the time format is ISO8601.
if timeStamp < min {
return min
}
}
delete(m, "@timestamp")
message := m["@message"].(string)
@@ -34,4 +46,5 @@ func JSONLogNoTimestamp(outlog hclog.Logger, text string) {
}
outlog.Log(hclog.LevelFromString(level), message, pairs...)
return timeStamp
}
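The dedup above works because ISO8601/RFC3339 timestamps have fixed-width, most-significant-first fields, so comparing them as plain strings matches chronological order and any entry older than the last one already emitted can be skipped. A minimal standalone sketch of that comparison follows; it is illustrative only, not Vault code, and assumes all timestamps share the same format and timezone.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// RFC3339 strings compare lexicographically in chronological order
	// when the format and timezone are consistent.
	older := time.Date(2025, 10, 7, 17, 0, 0, 0, time.UTC).Format(time.RFC3339)
	newer := time.Date(2025, 10, 7, 18, 0, 0, 0, time.UTC).Format(time.RFC3339)

	lastSeen := newer // e.g. the last timestamp emitted before a container restart
	for _, ts := range []string{older, newer} {
		if ts < lastSeen {
			fmt.Println("skip, already consumed:", ts)
			continue
		}
		fmt.Println("emit:", ts)
		lastSeen = ts
	}
}
```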


@@ -323,7 +323,7 @@ type Core struct {
// migrationInfo will be populated, which on enterprise may be necessary for
// seal rewrap.
migrationInfo *migrationInformation
sealMigrationDone *uint32
sealMigrationDone atomic.Pointer[time.Time]
// barrier is the security barrier wrapping the physical backend
barrier SecurityBarrier
@@ -1058,7 +1058,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
authLock: authLock,
router: NewRouter(),
sealed: new(uint32),
sealMigrationDone: new(uint32),
sealMigrationDone: atomic.Pointer[time.Time]{},
standby: true,
standbyStopCh: new(atomic.Value),
baseLogger: conf.Logger,
@@ -1930,7 +1930,8 @@ func (c *Core) getUnsealKey(ctx context.Context, seal Seal) ([]byte, error) {
// if the preceding conditions are true but we cannot decrypt the master key
// in storage using the configured seal.
func (c *Core) sealMigrated(ctx context.Context) (bool, error) {
if atomic.LoadUint32(c.sealMigrationDone) == 1 {
sealMigDone := c.sealMigrationDone.Load()
if sealMigDone != nil && !sealMigDone.IsZero() {
return true, nil
}
@@ -2072,7 +2073,7 @@ func (c *Core) migrateSeal(ctx context.Context) error {
}
// Flag migration performed for seal-rewrap later
atomic.StoreUint32(c.sealMigrationDone, 1)
c.SetSealMigrationDone()
c.logger.Info("seal migration complete")
return nil
@@ -2569,10 +2570,11 @@ func (s standardUnsealStrategy) unseal(ctx context.Context, logger log.Logger, c
if err := c.runUnsealSetupForPrimary(ctx, logger); err != nil {
return err
}
} else if c.IsMultisealEnabled() {
sealGenInfo := c.SealAccess().GetAccess().GetSealGenerationInfo()
if sealGenInfo != nil && !sealGenInfo.IsRewrapped() {
atomic.StoreUint32(c.sealMigrationDone, 1)
}
if c.IsMultisealEnabled() {
if err := c.handleMultisealRewrapping(ctx, logger); err != nil {
return err
}
}
@@ -2584,7 +2586,6 @@ func (s standardUnsealStrategy) unseal(ctx context.Context, logger log.Logger, c
if err := c.startForwarding(ctx); err != nil {
return err
}
}
c.clusterParamsLock.Lock()
@@ -2742,59 +2743,66 @@ func runUnsealSetupFunctions(ctx context.Context, setupFunctions []func(context.
// runUnsealSetupForPrimary runs some setup code specific to clusters that are
// in the primary role (as defined by the (*Core).isPrimary method).
func (c *Core) runUnsealSetupForPrimary(ctx context.Context, logger log.Logger) error {
if err := c.setupPluginReload(); err != nil {
return c.setupPluginReload()
}
func (c *Core) handleMultisealRewrapping(ctx context.Context, logger log.Logger) error {
// Retrieve the seal generation information from storage
existingGenerationInfo, err := PhysicalSealGenInfo(ctx, c.physical)
if err != nil {
logger.Error("cannot read existing seal generation info from storage", "error", err)
return err
}
if c.IsMultisealEnabled() {
// Retrieve the seal generation information from storage
existingGenerationInfo, err := PhysicalSealGenInfo(ctx, c.physical)
if err != nil {
logger.Error("cannot read existing seal generation info from storage", "error", err)
sealGenerationInfo := c.seal.GetAccess().GetSealGenerationInfo()
shouldRewrap := !sealGenerationInfo.IsRewrapped()
var reason string
switch {
case existingGenerationInfo == nil:
reason = "First time storing seal generation information"
fallthrough
case existingGenerationInfo.Generation < sealGenerationInfo.Generation:
if reason == "" {
reason = fmt.Sprintf("Seal generation incremented from %d to %d",
existingGenerationInfo.Generation, sealGenerationInfo.Generation)
}
fallthrough
case !existingGenerationInfo.Enabled:
if reason == "" {
reason = fmt.Sprintf("Seal just become enabled again after previously being disabled")
}
if err := c.SetPhysicalSealGenInfo(ctx, sealGenerationInfo); err != nil {
logger.Error("failed to store seal generation info", "error", err)
return err
}
sealGenerationInfo := c.seal.GetAccess().GetSealGenerationInfo()
shouldRewrap := !sealGenerationInfo.IsRewrapped()
switch {
case existingGenerationInfo == nil:
// This is the first time we store seal generation information
fallthrough
case existingGenerationInfo.Generation < sealGenerationInfo.Generation || !existingGenerationInfo.Enabled:
// We have incremented the seal generation or we've just become enabled again after previously being disabled,
// trust the operator in the latter case
shouldRewrap = true
case existingGenerationInfo.Generation == sealGenerationInfo.Generation:
// Same generation, update the rewrapped flag in case the previous active node
// changed its value. In other words, a rewrap may have happened, or a rewrap may have been
// started but not completed.
c.seal.GetAccess().GetSealGenerationInfo().SetRewrapped(existingGenerationInfo.IsRewrapped())
if !existingGenerationInfo.Enabled {
// Weren't enabled but are now, persist the flag
if err := c.SetPhysicalSealGenInfo(ctx, sealGenerationInfo); err != nil {
logger.Error("failed to store seal generation info", "error", err)
return err
}
shouldRewrap = true
case existingGenerationInfo.Generation == sealGenerationInfo.Generation:
// Same generation, update the rewrapped flag in case the previous active node
// changed its value. In other words, a rewrap may have happened, or a rewrap may have been
// started but not completed.
c.seal.GetAccess().GetSealGenerationInfo().SetRewrapped(existingGenerationInfo.IsRewrapped())
if !existingGenerationInfo.Enabled {
// Weren't enabled but are now, persist the flag
if err := c.SetPhysicalSealGenInfo(ctx, sealGenerationInfo); err != nil {
logger.Error("failed to store seal generation info", "error", err)
return err
}
}
shouldRewrap = !existingGenerationInfo.IsRewrapped()
case existingGenerationInfo.Generation > sealGenerationInfo.Generation:
// Our seal information is out of date. The previous active node used a newer generation.
logger.Error("A newer seal generation was found in storage. The seal configuration in this node should be updated to match that of the previous active node, and this node should be restarted.")
return errors.New("newer seal generation found in storage, in memory seal configuration is out of date")
}
if shouldRewrap {
// Set the migration done flag so that a seal-rewrap gets triggered later.
// Note that in the case where multi seal is not supported, Core.migrateSeal() takes care of
// triggering the rewrap when necessary.
logger.Trace("seal generation information indicates that a seal-rewrap is needed", "generation", sealGenerationInfo.Generation)
atomic.StoreUint32(c.sealMigrationDone, 1)
}
startPartialSealRewrapping(c)
reason = "A rewrap may have happened, or a rewrap may have been started but not completed"
shouldRewrap = !existingGenerationInfo.IsRewrapped()
case existingGenerationInfo.Generation > sealGenerationInfo.Generation:
// Our seal information is out of date. The previous active node used a newer generation.
logger.Error("A newer seal generation was found in storage. The seal configuration in this node should be updated to match that of the previous active node, and this node should be restarted.")
return errors.New("newer seal generation found in storage, in memory seal configuration is out of date")
}
if shouldRewrap {
// Set the migration done flag so that a seal-rewrap gets triggered later.
// Note that in the case where multi seal is not supported, Core.migrateSeal() takes care of
// triggering the rewrap when necessary.
logger.Info("seal generation information indicates that a seal-rewrap is needed", "generation", sealGenerationInfo.Generation, "reason", reason)
c.SetSealMigrationDone()
}
startPartialSealRewrapping(c)
return nil
}
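The new handleMultisealRewrapping leans on Go's fallthrough: the nil-info, incremented-generation, and re-enabled cases all reach the shared body that persists the seal generation info, and because fallthrough enters the next case body without evaluating its condition, the later bodies only assign a reason when none was chosen yet. A standalone sketch of that control flow follows; `classify` and its parameters are illustrative, not Vault code.

```go
package main

import "fmt"

// classify illustrates the fallthrough pattern: the first two cases record a
// reason and fall through so all three share the final body (where the real
// code calls SetPhysicalSealGenInfo). Note that fallthrough skips evaluating
// the next case's condition, so "*existing < current" is never dereferenced
// when existing is nil.
func classify(existing *int, current int, enabled bool) string {
	reason := ""
	switch {
	case existing == nil:
		reason = "first time storing seal generation information"
		fallthrough
	case *existing < current: // not evaluated when existing is nil
		if reason == "" {
			reason = fmt.Sprintf("generation incremented to %d", current)
		}
		fallthrough
	case !enabled:
		if reason == "" {
			reason = "seal re-enabled after previously being disabled"
		}
		// shared step in the real code: persist the seal generation info
	}
	return reason
}

func main() {
	gen := 3
	fmt.Println(classify(nil, 4, true))   // first time storing seal generation information
	fmt.Println(classify(&gen, 4, true))  // generation incremented to 4
	fmt.Println(classify(&gen, 3, false)) // seal re-enabled after previously being disabled
}
```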
@@ -2902,10 +2910,8 @@ func (c *Core) postUnseal(ctx context.Context, ctxCancelFunc context.CancelFunc,
close(jobs)
}
if atomic.LoadUint32(c.sealMigrationDone) == 1 {
if err := c.postSealMigration(ctx); err != nil {
c.logger.Warn("post-unseal post seal migration failed", "error", err)
}
if err := c.postSealMigration(ctx); err != nil {
c.logger.Warn("post-unseal post seal migration failed", "error", err)
}
if os.Getenv(EnvVaultDisableLocalAuthMountEntities) != "" {
@@ -4758,4 +4764,9 @@ func (c *Core) shutdownRemovedNode() {
}()
}
func (c *Core) SetSealMigrationDone() {
now := time.Now()
c.sealMigrationDone.Store(&now)
}
var errRemovedHANode = errors.New("node has been removed from the HA cluster")