Mirror of https://github.com/hashicorp/vault.git (synced 2025-08-22 07:01:09 +02:00)
This commit introduces two new adaptive concurrency limiters in Vault, which should handle overloading of the server during periods of untenable request rates. The limiter adjusts the number of allowable in-flight requests based on latency measurements performed across the request duration. This approach allows us to reject entire requests prior to doing any work and prevents clients from exceeding server capacity.

The limiters intentionally target two separate vectors that have been proven to lead to server over-utilization:

- Back pressure from the storage backend, resulting in bufferbloat in the WAL system. (enterprise)
- Back pressure from CPU over-utilization via PKI issue requests (specifically for RSA keys), resulting in failed heartbeats.

Storage constraints can be accounted for by limiting logical requests according to their http.Method. We only limit requests with write-based methods, since these result in storage Puts and exhibit the aforementioned bufferbloat.

CPU constraints are accounted for using the same underlying library and technique; however, they require special treatment. The maximum number of concurrent pki/issue requests found in testing (again, specifically for RSA keys) is far lower than the minimum tolerable write request rate. Without separate limiting, we would artificially impose limits on tolerable request rates for non-PKI requests. To specifically target PKI issue requests, we add a new PathsSpecial field, called limited, allowing backends to specify a list of paths which should get special-case request limiting.

For the sake of code cleanliness and future extensibility, we introduce the concept of a LimiterRegistry. The registry proposed in this PR has two entries, corresponding to the two vectors above. Each Limiter entry has its own maximum and minimum concurrency, allowing them to react to latency deviation independently and handle high volumes of requests to targeted bottlenecks (CPU and storage). In both cases, utilization is effectively throttled before Vault reaches any degraded state.

The resulting 503 - Service Unavailable is a retryable HTTP response code, which can be handled to gracefully retry and eventually succeed. Clients should handle this by retrying with jitter and exponential backoff, as sketched below. This is done within Vault's API, using the go-retryablehttp library.

Limiter testing was performed via benchmarks of mixed workloads and across a deployment of agent pods with great success.
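The client-side behavior described above can be reproduced with go-retryablehttp directly; its default retry policy already treats 503 (and other 5xx responses, except 501) as retryable. The following is a minimal, illustrative sketch only: the Vault address, mount path, role name, token handling, and the specific retry settings are assumptions for the example, not part of this change.

package main

import (
	"fmt"
	"log"
	"strings"
	"time"

	retryablehttp "github.com/hashicorp/go-retryablehttp"
)

func main() {
	// Illustrative client settings; real values depend on the workload.
	client := retryablehttp.NewClient()
	client.RetryMax = 5                                // give the limiter time to shed load
	client.RetryWaitMin = 250 * time.Millisecond
	client.RetryWaitMax = 5 * time.Second
	client.Backoff = retryablehttp.LinearJitterBackoff // jittered backoff between attempts

	// A request rejected by the limiter with 503 is retried by the default
	// retry policy rather than surfaced to the caller immediately.
	body := strings.NewReader(`{"common_name": "example.com"}`)
	req, err := retryablehttp.NewRequest("PUT",
		"https://vault.example.com:8200/v1/pki/issue/example-role", body)
	if err != nil {
		log.Fatal(err)
	}
	req.Header.Set("X-Vault-Token", "...") // placeholder; use real auth in practice

	resp, err := client.Do(req)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.Status)
}

Because the 503s are retried transparently with jitter, bursty clients back off instead of compounding the very overload the limiter is trying to relieve.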
807 lines · 24 KiB · Go
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package pki

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/vault/helper/metricsutil"
	"github.com/hashicorp/vault/helper/namespace"
	"github.com/hashicorp/vault/sdk/framework"
	"github.com/hashicorp/vault/sdk/helper/consts"
	"github.com/hashicorp/vault/sdk/helper/errutil"
	"github.com/hashicorp/vault/sdk/logical"

	"github.com/hashicorp/vault/builtin/logical/pki/issuing"
	"github.com/hashicorp/vault/builtin/logical/pki/managed_key"
	"github.com/hashicorp/vault/builtin/logical/pki/pki_backend"
)

const (
	operationPrefixPKI        = "pki"
	operationPrefixPKIIssuer  = "pki-issuer"
	operationPrefixPKIIssuers = "pki-issuers"
	operationPrefixPKIRoot    = "pki-root"

	noRole       = 0
	roleOptional = 1
	roleRequired = 2
)

/*
 * PKI requests are a bit special to keep up with the various failure and load issues.
 *
 * Any requests to write/delete shared data (such as roles, issuers, keys, and configuration)
 * are always forwarded to the Primary cluster's active node to write and send the key
 * material/config globally across all clusters. Reads should be handled locally, to give a
 * sense of where this cluster's replication state is at.
 *
 * CRL/Revocation and Fetch Certificate APIs are handled by the active node within the cluster
 * in which they originate. This means, if a request comes into a performance secondary cluster,
 * the writes will be forwarded to that cluster's active node and not go all the way up to the
 * performance primary's active node.
 *
 * If a certificate issue request has a role in which no_store is set to true, that node itself
 * will issue the certificate and not forward the request to the active node, as this does not
 * need to write to storage.
 *
 * Following the same pattern, if a managed key is involved to sign an issued certificate request
 * and the local node does not have access to it for some reason, the request will be forwarded to
 * the active node within the cluster only.
 *
 * To make sense of what goes where, the following bits need to be analyzed within the codebase:
 *
 * 1. The backend LocalStorage paths determine what storage paths will remain within a
 *    cluster and not be forwarded to a performance primary.
 * 2. Within each path's OperationHandler definition, check to see if the ForwardPerformanceStandby &
 *    ForwardPerformanceSecondary flags are set to short-circuit the request to a given active node.
 * 3. Within the managed key util class in pki, an initialization failure could cause the request
 *    to be forwarded to an active node if not already on it.
 */
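
// For illustration only: point 2 above refers to the per-operation forwarding
// flags on framework.PathOperation. A hypothetical write operation that must
// always run on the cluster's active node could be declared roughly as follows
// (the handler name is invented for this sketch):
//
//	Operations: map[logical.Operation]framework.OperationHandler{
//		logical.UpdateOperation: &framework.PathOperation{
//			Callback:                    b.pathExampleWrite,
//			ForwardPerformanceStandby:   true,
//			ForwardPerformanceSecondary: true,
//		},
//	},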

// Factory creates a new backend implementing the logical.Backend interface
func Factory(ctx context.Context, conf *logical.BackendConfig) (logical.Backend, error) {
	b := Backend(conf)
	if err := b.Setup(ctx, conf); err != nil {
		return nil, err
	}
	return b, nil
}

// Backend returns a new Backend framework struct
func Backend(conf *logical.BackendConfig) *backend {
	var b backend
	b.Backend = &framework.Backend{
		Help: strings.TrimSpace(backendHelp),

		PathsSpecial: &logical.Paths{
			Unauthenticated: []string{
				"cert/*",
				"ca/pem",
				"ca_chain",
				"ca",
				"crl/delta",
				"crl/delta/pem",
				"crl/pem",
				"crl",
				"issuer/+/crl/der",
				"issuer/+/crl/pem",
				"issuer/+/crl",
				"issuer/+/crl/delta/der",
				"issuer/+/crl/delta/pem",
				"issuer/+/crl/delta",
				"issuer/+/unified-crl/der",
				"issuer/+/unified-crl/pem",
				"issuer/+/unified-crl",
				"issuer/+/unified-crl/delta/der",
				"issuer/+/unified-crl/delta/pem",
				"issuer/+/unified-crl/delta",
				"issuer/+/pem",
				"issuer/+/der",
				"issuer/+/json",
				"issuers/", // LIST operations append a '/' to the requested path
				"ocsp",     // OCSP POST
				"ocsp/*",   // OCSP GET
				"unified-crl/delta",
				"unified-crl/delta/pem",
				"unified-crl/pem",
				"unified-crl",
				"unified-ocsp",   // Unified OCSP POST
				"unified-ocsp/*", // Unified OCSP GET

				// ACME paths are added below
			},

			LocalStorage: []string{
				revokedPath,
				localDeltaWALPath,
				legacyCRLPath,
				clusterConfigPath,
				"crls/",
				"certs/",
				acmePathPrefix,
			},

			Root: []string{
				"root",
				"root/sign-self-issued",
			},

			SealWrapStorage: []string{
				legacyCertBundlePath,
				legacyCertBundleBackupPath,
				keyPrefix,
			},

			WriteForwardedStorage: []string{
				crossRevocationPath,
				unifiedRevocationWritePathPrefix,
				unifiedDeltaWALPath,
			},

			Limited: []string{
				"issue",
				"issue/*",
			},

			Binary: []string{
				"ocsp",           // OCSP POST
				"ocsp/*",         // OCSP GET
				"unified-ocsp",   // Unified OCSP POST
				"unified-ocsp/*", // Unified OCSP GET
			},
		},

		Paths: []*framework.Path{
			pathListRoles(&b),
			pathRoles(&b),
			pathGenerateRoot(&b),
			pathSignIntermediate(&b),
			pathSignSelfIssued(&b),
			pathDeleteRoot(&b),
			pathGenerateIntermediate(&b),
			pathSetSignedIntermediate(&b),
			pathConfigCA(&b),
			pathConfigCRL(&b),
			pathConfigURLs(&b),
			pathConfigCluster(&b),
			pathSignVerbatim(&b),
			pathSign(&b),
			pathIssue(&b),
			pathRotateCRL(&b),
			pathRotateDeltaCRL(&b),
			pathRevoke(&b),
			pathRevokeWithKey(&b),
			pathListCertsRevoked(&b),
			pathTidy(&b),
			pathTidyCancel(&b),
			pathTidyStatus(&b),
			pathConfigAutoTidy(&b),

			// Issuer APIs
			pathListIssuers(&b),
			pathGetIssuer(&b),
			pathGetUnauthedIssuer(&b),
			pathGetIssuerCRL(&b),
			pathImportIssuer(&b),
			pathIssuerIssue(&b),
			pathIssuerSign(&b),
			pathIssuerSignIntermediate(&b),
			pathIssuerSignSelfIssued(&b),
			pathIssuerSignVerbatim(&b),
			pathIssuerGenerateRoot(&b),
			pathRotateRoot(&b),
			pathIssuerGenerateIntermediate(&b),
			pathCrossSignIntermediate(&b),
			pathConfigIssuers(&b),
			pathReplaceRoot(&b),
			pathRevokeIssuer(&b),

			// Key APIs
			pathListKeys(&b),
			pathKey(&b),
			pathGenerateKey(&b),
			pathImportKey(&b),
			pathConfigKeys(&b),

			// Fetch APIs have been lowered to favor the newer issuer API endpoints
			pathFetchCA(&b),
			pathFetchCAChain(&b),
			pathFetchCRL(&b),
			pathFetchCRLViaCertPath(&b),
			pathFetchValidRaw(&b),
			pathFetchValid(&b),
			pathFetchListCerts(&b),

			// OCSP APIs
			buildPathOcspGet(&b),
			buildPathOcspPost(&b),

			// CRL Signing
			pathResignCrls(&b),
			pathSignRevocationList(&b),

			// ACME
			pathAcmeConfig(&b),
			pathAcmeEabList(&b),
			pathAcmeEabDelete(&b),
		},

		Secrets: []*framework.Secret{
			secretCerts(&b),
		},

		BackendType:    logical.TypeLogical,
		InitializeFunc: b.initialize,
		Invalidate:     b.invalidate,
		PeriodicFunc:   b.periodicFunc,
		Clean:          b.cleanup,
	}

	// Add ACME paths to backend
	for _, prefix := range []struct {
		acmePrefix   string
		unauthPrefix string
		opts         acmeWrapperOpts
	}{
		{
			"acme",
			"acme",
			acmeWrapperOpts{true, false},
		},
		{
			"roles/" + framework.GenericNameRegex("role") + "/acme",
			"roles/+/acme",
			acmeWrapperOpts{},
		},
		{
			"issuer/" + framework.GenericNameRegex(issuerRefParam) + "/acme",
			"issuer/+/acme",
			acmeWrapperOpts{},
		},
		{
			"issuer/" + framework.GenericNameRegex(issuerRefParam) + "/roles/" + framework.GenericNameRegex("role") + "/acme",
			"issuer/+/roles/+/acme",
			acmeWrapperOpts{},
		},
	} {
		setupAcmeDirectory(&b, prefix.acmePrefix, prefix.unauthPrefix, prefix.opts)
	}

	b.tidyCASGuard = new(uint32)
	b.tidyCancelCAS = new(uint32)
	b.tidyStatus = &tidyStatus{state: tidyStatusInactive}
	b.storage = conf.StorageView
	b.backendUUID = conf.BackendUUID

	b.pkiStorageVersion.Store(0)

	// b isn't yet initialized with SystemView state; calling b.System() will
	// result in a nil pointer dereference. Instead query BackendConfig's
	// copy of SystemView.
	cannotRebuildCRLs := conf.System.ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
		conf.System.ReplicationState().HasState(consts.ReplicationDRSecondary)
	b.crlBuilder = newCRLBuilder(!cannotRebuildCRLs)

	// Delay the first tidy until after we've started up.
	b.lastTidy = time.Now()

	b.unifiedTransferStatus = newUnifiedTransferStatus()

	b.acmeState = NewACMEState()
	b.certificateCounter = NewCertificateCounter(b.backendUUID)

	// It is important that we call SetupEnt at the very end as
	// some ENT backends need access to the member vars initialized above.
	b.SetupEnt()
	return &b
}

type backend struct {
	*framework.Backend
	entBackend

	backendUUID       string
	storage           logical.Storage
	revokeStorageLock sync.RWMutex
	tidyCASGuard      *uint32
	tidyCancelCAS     *uint32

	tidyStatusLock sync.RWMutex
	tidyStatus     *tidyStatus
	lastTidy       time.Time

	unifiedTransferStatus *UnifiedTransferStatus

	certificateCounter *CertificateCounter

	pkiStorageVersion atomic.Value
	crlBuilder        *CrlBuilder

	// Write lock around issuers and keys.
	issuersLock sync.RWMutex

	// Context around ACME operations
	acmeState       *acmeState
	acmeAccountLock sync.RWMutex // (Write) Locked on Tidy, (Read) Locked on Account Creation
}

// BackendOps is a bridge/legacy interface until we can further
// separate out backend things into distinct packages.
type BackendOps interface {
	managed_key.PkiManagedKeyView
	pki_backend.SystemViewGetter
	pki_backend.MountInfo
	pki_backend.Logger
	UseLegacyBundleCaStorage() bool
	CrlBuilder() *CrlBuilder
	GetRevokeStorageLock() *sync.RWMutex
	GetUnifiedTransferStatus() *UnifiedTransferStatus
	GetAcmeState() *acmeState
	GetRole(ctx context.Context, s logical.Storage, n string) (*issuing.RoleEntry, error)
	GetCertificateCounter() *CertificateCounter
}

var _ BackendOps = &backend{}

type roleOperation func(ctx context.Context, req *logical.Request, data *framework.FieldData, role *issuing.RoleEntry) (*logical.Response, error)

const backendHelp = `
The PKI backend dynamically generates X509 server and client certificates.

After mounting this backend, configure the CA using the "pem_bundle" endpoint within
the "config/" path.
`
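
// For illustration only: the backendHelp above points at the "config/ca"
// endpoint. A minimal sketch of submitting a CA bundle through the Vault API
// client might look like the following; the mount path, file name, and error
// handling are assumptions for the example, not taken from this file.
//
//	client, _ := api.NewClient(api.DefaultConfig())
//	pem, _ := os.ReadFile("ca_bundle.pem")
//	_, err := client.Logical().Write("pki/config/ca", map[string]interface{}{
//		"pem_bundle": string(pem),
//	})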

func metricsKey(req *logical.Request, extra ...string) []string {
	if req == nil || req.MountPoint == "" {
		return extra
	}
	key := make([]string, len(extra)+1)
	key[0] = req.MountPoint[:len(req.MountPoint)-1]
	copy(key[1:], extra)
	return key
}

func (b *backend) metricsWrap(callType string, roleMode int, ofunc roleOperation) framework.OperationFunc {
	return func(ctx context.Context, req *logical.Request, data *framework.FieldData) (*logical.Response, error) {
		key := metricsKey(req, callType)
		var role *issuing.RoleEntry
		var labels []metrics.Label
		var err error

		var roleName string
		switch roleMode {
		case roleRequired:
			roleName = data.Get("role").(string)
		case roleOptional:
			r, ok := data.GetOk("role")
			if ok {
				roleName = r.(string)
			}
		}
		if roleMode > noRole {
			// Get the role
			role, err = b.GetRole(ctx, req.Storage, roleName)
			if err != nil {
				return nil, err
			}
			if role == nil && (roleMode == roleRequired || len(roleName) > 0) {
				return logical.ErrorResponse(fmt.Sprintf("unknown role: %s", roleName)), nil
			}
			labels = []metrics.Label{{"role", roleName}}
		}

		ns, err := namespace.FromContext(ctx)
		if err == nil {
			labels = append(labels, metricsutil.NamespaceLabel(ns))
		}

		start := time.Now()
		defer metrics.MeasureSinceWithLabels(key, start, labels)
		resp, err := ofunc(ctx, req, data, role)

		if err != nil || resp.IsError() {
			metrics.IncrCounterWithLabels(append(key, "failure"), 1.0, labels)
		} else {
			metrics.IncrCounterWithLabels(key, 1.0, labels)
		}
		return resp, err
	}
}

// initialize is used to perform a possible PKI storage migration if needed
func (b *backend) initialize(ctx context.Context, ir *logical.InitializationRequest) error {
	sc := b.makeStorageContext(ctx, b.storage)
	if err := b.CrlBuilder().reloadConfigIfRequired(sc); err != nil {
		return err
	}

	err := b.initializePKIIssuersStorage(ctx)
	if err != nil {
		return err
	}

	err = b.GetAcmeState().Initialize(b, sc)
	if err != nil {
		return err
	}

	// Initialize also needs to populate our certificate and revoked certificate count
	err = b.initializeStoredCertificateCounts(ctx)
	if err != nil {
		// Don't block/err initialize/startup for metrics. Context on this call can time out due to number of certificates.
		b.Logger().Error("Could not initialize stored certificate counts", "error", err)
		b.GetCertificateCounter().SetError(err)
	}

	return b.initializeEnt(sc, ir)
}

func (b *backend) cleanup(ctx context.Context) {
	sc := b.makeStorageContext(ctx, b.storage)

	b.GetAcmeState().Shutdown(b)

	b.cleanupEnt(sc)
}

func (b *backend) initializePKIIssuersStorage(ctx context.Context) error {
	// Grab the lock prior to updating the storage version, preventing us from flipping
	// the storage flag midway through the request stream of other requests.
	b.issuersLock.Lock()
	defer b.issuersLock.Unlock()

	// Load up our current pki storage state, no matter the host type we are on.
	b.updatePkiStorageVersion(ctx, false)

	// Early exit if not a primary cluster or performance secondary with a local mount.
	if b.System().ReplicationState().HasState(consts.ReplicationDRSecondary|consts.ReplicationPerformanceStandby) ||
		(!b.System().LocalMount() && b.System().ReplicationState().HasState(consts.ReplicationPerformanceSecondary)) {
		b.Logger().Debug("skipping PKI migration as we are not on primary or secondary with a local mount")
		return nil
	}

	if err := migrateStorage(ctx, b, b.storage); err != nil {
		b.Logger().Error("Error during migration of PKI mount: " + err.Error())
		return err
	}

	b.updatePkiStorageVersion(ctx, false)

	return nil
}

func (b *backend) BackendUUID() string {
	return b.backendUUID
}

func (b *backend) CrlBuilder() *CrlBuilder {
	return b.crlBuilder
}

func (b *backend) GetRevokeStorageLock() *sync.RWMutex {
	return &b.revokeStorageLock
}

func (b *backend) GetUnifiedTransferStatus() *UnifiedTransferStatus {
	return b.unifiedTransferStatus
}

func (b *backend) GetAcmeState() *acmeState {
	return b.acmeState
}

func (b *backend) GetCertificateCounter() *CertificateCounter {
	return b.certificateCounter
}

func (b *backend) UseLegacyBundleCaStorage() bool {
	// This helper function is here to choose whether or not we use the newer
	// issuer/key storage format or the older legacy ca bundle format.
	//
	// This happens because we might've upgraded secondary PR clusters to
	// newer vault code versions. We still want to be able to service requests
	// with the old bundle format (e.g., issuing and revoking certs), until
	// the primary cluster's active node is upgraded to the newer Vault version
	// and the storage is migrated to the new format.
	version := b.pkiStorageVersion.Load()
	return version == nil || version == 0
}

func (b *backend) IsSecondaryNode() bool {
	return b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby)
}

func (b *backend) GetManagedKeyView() (logical.ManagedKeySystemView, error) {
	managedKeyView, ok := b.System().(logical.ManagedKeySystemView)
	if !ok {
		return nil, errutil.InternalError{Err: fmt.Sprintf("unsupported system view")}
	}
	return managedKeyView, nil
}

func (b *backend) updatePkiStorageVersion(ctx context.Context, grabIssuersLock bool) {
	info, err := getMigrationInfo(ctx, b.storage)
	if err != nil {
		b.Logger().Error(fmt.Sprintf("Failed loading PKI migration status, staying in legacy mode: %v", err))
		return
	}

	// If this method is called outside the initialize function, like say an
	// invalidate func on a performance replica cluster, we should be grabbing
	// the issuers lock to offer a consistent view of the storage version while
	// other events are processing things. It's unknown what might happen during
	// a single event if one part thinks we are in legacy mode, and then later
	// on we aren't.
	if grabIssuersLock {
		b.issuersLock.Lock()
		defer b.issuersLock.Unlock()
	}

	if info.isRequired {
		b.pkiStorageVersion.Store(0)
	} else {
		b.pkiStorageVersion.Store(1)
	}
}

func (b *backend) invalidate(ctx context.Context, key string) {
	isNotPerfPrimary := b.System().ReplicationState().HasState(consts.ReplicationDRSecondary|consts.ReplicationPerformanceStandby) ||
		(!b.System().LocalMount() && b.System().ReplicationState().HasState(consts.ReplicationPerformanceSecondary))

	switch {
	case strings.HasPrefix(key, legacyMigrationBundleLogKey):
		// This is for a secondary cluster to pick up that the migration has completed,
		// reset its compatibility mode, and rebuild the CRL locally. Kick it off
		// as a goroutine to not block this call due to the lock grabbing
		// within updatePkiStorageVersion.
		go func() {
			b.Logger().Info("Detected a migration completed, resetting pki storage version")
			b.updatePkiStorageVersion(ctx, true)
			b.CrlBuilder().requestRebuildIfActiveNode(b)
		}()
	case strings.HasPrefix(key, issuerPrefix):
		if !b.UseLegacyBundleCaStorage() {
			// See note in updateDefaultIssuerId about why this is necessary.
			// We do this ahead of CRL rebuilding just so we know that things
			// are stale.
			b.CrlBuilder().invalidateCRLBuildTime()

			// If an issuer has changed on the primary, we need to schedule an update of our CRL;
			// the primary cluster would have done it already, but the CRL is cluster specific, so
			// force a rebuild of ours.
			b.CrlBuilder().requestRebuildIfActiveNode(b)
		} else {
			b.Logger().Debug("Ignoring invalidation updates for issuer as the PKI migration has yet to complete.")
		}
	case key == "config/crl":
		// We may need to reload our OCSP status flag
		b.CrlBuilder().markConfigDirty()
	case key == storageAcmeConfig:
		b.GetAcmeState().markConfigDirty()
	case key == storageIssuerConfig:
		b.CrlBuilder().invalidateCRLBuildTime()
	case strings.HasPrefix(key, crossRevocationPrefix):
		split := strings.Split(key, "/")

		if !strings.HasSuffix(key, "/confirmed") {
			cluster := split[len(split)-2]
			serial := split[len(split)-1]
			b.CrlBuilder().addCertForRevocationCheck(cluster, serial)
		} else {
			if len(split) >= 3 {
				cluster := split[len(split)-3]
				serial := split[len(split)-2]
				// Only process confirmations on the perf primary. The
				// performance secondaries cannot remove other clusters'
				// entries, and so do not need to track them (only to
				// ignore them). On performance primary nodes though,
				// we do want to track them to remove them.
				if !isNotPerfPrimary {
					b.CrlBuilder().addCertForRevocationRemoval(cluster, serial)
				}
			}
		}
	case strings.HasPrefix(key, unifiedRevocationReadPathPrefix):
		// Three parts to this key: prefix, cluster, and serial.
		split := strings.Split(key, "/")
		cluster := split[len(split)-2]
		serial := split[len(split)-1]
		b.CrlBuilder().addCertFromCrossRevocation(cluster, serial)
	}

	b.invalidateEnt(ctx, key)
}

func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) error {
	sc := b.makeStorageContext(ctx, request.Storage)

	doCRL := func() error {
		// First attempt to reload the CRL configuration.
		if err := b.CrlBuilder().reloadConfigIfRequired(sc); err != nil {
			return err
		}

		// As we're (below) modifying the backing storage, we need to ensure
		// we're not on a standby/secondary node.
		if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
			b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
			return nil
		}

		// First handle any global revocation queue entries.
		if err := b.CrlBuilder().processRevocationQueue(sc); err != nil {
			return err
		}

		// Then handle any unified cross-cluster revocations.
		if err := b.CrlBuilder().processCrossClusterRevocations(sc); err != nil {
			return err
		}

		// Check if we're set to auto rebuild and a CRL is set to expire.
		if err := b.CrlBuilder().checkForAutoRebuild(sc); err != nil {
			return err
		}

		// Then attempt to rebuild the CRLs if required.
		warnings, err := b.CrlBuilder().rebuildIfForced(sc)
		if err != nil {
			return err
		}
		if len(warnings) > 0 {
			msg := "During rebuild of complete CRL, got the following warnings:"
			for index, warning := range warnings {
				msg = fmt.Sprintf("%v\n %d. %v", msg, index+1, warning)
			}
			b.Logger().Warn(msg)
		}

		// If a delta CRL was rebuilt above as part of the complete CRL rebuild,
		// this will be a no-op. However, if we do need to rebuild delta CRLs,
		// this would cause us to do so.
		warnings, err = b.CrlBuilder().rebuildDeltaCRLsIfForced(sc, false)
		if err != nil {
			return err
		}
		if len(warnings) > 0 {
			msg := "During rebuild of delta CRL, got the following warnings:"
			for index, warning := range warnings {
				msg = fmt.Sprintf("%v\n %d. %v", msg, index+1, warning)
			}
			b.Logger().Warn(msg)
		}

		return nil
	}

	doAutoTidy := func() error {
		// As we're (below) modifying the backing storage, we need to ensure
		// we're not on a standby/secondary node.
		if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
			b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
			return nil
		}

		config, err := sc.getAutoTidyConfig()
		if err != nil {
			return err
		}

		if !config.Enabled || config.Interval <= 0*time.Second {
			return nil
		}

		// Check if we should run another tidy...
		now := time.Now()
		b.tidyStatusLock.RLock()
		nextOp := b.lastTidy.Add(config.Interval)
		b.tidyStatusLock.RUnlock()
		if now.Before(nextOp) {
			return nil
		}

		// Ensure a tidy isn't already running... If it is, we'll trigger
		// again when the running one finishes.
		if !atomic.CompareAndSwapUint32(b.tidyCASGuard, 0, 1) {
			return nil
		}

		// Prevent ourselves from starting another tidy operation while
		// this one is still running. This operation runs in the background
		// and has a separate error reporting mechanism.
		b.tidyStatusLock.Lock()
		b.lastTidy = now
		b.tidyStatusLock.Unlock()

		// Because the request from the parent storage will be cleared at
		// some point (and potentially reused) -- due to tidy executing in
		// a background goroutine -- we need to copy the storage entry off
		// of the backend instead.
		backendReq := &logical.Request{
			Storage: b.storage,
		}

		b.startTidyOperation(backendReq, config)
		return nil
	}

	// First tidy any ACME nonces to free memory.
	b.GetAcmeState().DoTidyNonces()

	// Then run unified transfer.
	backgroundSc := b.makeStorageContext(context.Background(), b.storage)
	go runUnifiedTransfer(backgroundSc)

	// Then run the CRL rebuild and tidy operation.
	crlErr := doCRL()
	tidyErr := doAutoTidy()

	// Periodically re-emit gauges so that they don't disappear/go stale
	b.GetCertificateCounter().EmitCertStoreMetrics()

	var errors error
	if crlErr != nil {
		errors = multierror.Append(errors, fmt.Errorf("Error building CRLs:\n - %w\n", crlErr))
	}

	if tidyErr != nil {
		errors = multierror.Append(errors, fmt.Errorf("Error running auto-tidy:\n - %w\n", tidyErr))
	}

	if errors != nil {
		return errors
	}

	// Check if the CRL was invalidated due to issuer swap and update
	// accordingly.
	if err := b.CrlBuilder().flushCRLBuildTimeInvalidation(sc); err != nil {
		return err
	}

	// All good!
	return b.periodicFuncEnt(backgroundSc, request)
}

func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
	// For performance reasons, we can't lock on issuance/storage of certs until a list operation completes,
	// but we want to limit possible miscounts / double-counts to over-counting, so we take the tidy lock which
	// prevents (most) deletions - in particular we take a read lock (sufficient to block the write lock in
	// tidyStatusStart while allowing tidy to still acquire a read lock to report via its endpoint)
	b.tidyStatusLock.RLock()
	defer b.tidyStatusLock.RUnlock()
	sc := b.makeStorageContext(ctx, b.storage)
	config, err := sc.getAutoTidyConfig()
	if err != nil {
		return err
	}

	certCounter := b.GetCertificateCounter()
	isEnabled := certCounter.ReconfigureWithTidyConfig(config)
	if !isEnabled {
		return nil
	}

	entries, err := b.storage.List(ctx, "certs/")
	if err != nil {
		return err
	}

	revokedEntries, err := b.storage.List(ctx, "revoked/")
	if err != nil {
		return err
	}

	certCounter.InitializeCountsFromStorage(entries, revokedEntries)
	return nil
}