vault/builtin/logical/pki/backend.go
Alexander Scheel 76d89fd45f
Add ability to cancel PKI tidy operations, pause between tidying certs (#16958)
* Allow tidy operations to be cancelled

When tidy operations take a long time to execute (and especially when
executing them automatically), having the ability to cancel them becomes
useful to reduce strain on Vault clusters (and let them be rescheduled
at a later time).

To this end, we add the /tidy-cancel write endpoint.

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add missing auto-tidy synopsis / description

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add a pause duration between tidying certificates

By setting pause_duration, operators can have a little control over the
resource utilization of a tidy operation. While the list of certificates
remain in memory throughout the entire operation, a pause is added
between processing certificates and the revocation lock is released.
This allows other operations to occur during this gap and potentially
allows the tidy operation to consume less resources per unit of time
(due to the sleep -- though obviously consumes the same resources over
the time of the operation).

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add tests for cancellation, pause

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add API docs on pause_duration, /tidy-cancel

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add changelog entry

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add lock releasing around tidy pause

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Reset cancel guard, return errors

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>
2022-08-31 11:36:12 -07:00

529 lines
15 KiB
Go

package pki
import (
"context"
"fmt"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/armon/go-metrics"
"github.com/hashicorp/vault/helper/metricsutil"
"github.com/hashicorp/vault/helper/namespace"
"github.com/hashicorp/vault/sdk/framework"
"github.com/hashicorp/vault/sdk/logical"
)
const (
noRole = 0
roleOptional = 1
roleRequired = 2
)
/*
* PKI requests are a bit special to keep up with the various failure and load issues.
*
* Any requests to write/delete shared data (such as roles, issuers, keys, and configuration)
* are always forwarded to the Primary cluster's active node to write and send the key
* material/config globally across all clusters. Reads should be handled locally, to give a
* sense of where this cluster's replication state is at.
*
* CRL/Revocation and Fetch Certificate APIs are handled by the active node within the cluster
* they originate. This means, if a request comes into a performance secondary cluster, the writes
* will be forwarded to that cluster's active node and not go all the way up to the performance primary's
* active node.
*
* If a certificate issue request has a role in which no_store is set to true, that node itself
* will issue the certificate and not forward the request to the active node, as this does not
* need to write to storage.
*
* Following the same pattern, if a managed key is involved to sign an issued certificate request
* and the local node does not have access for some reason to it, the request will be forwarded to
* the active node within the cluster only.
*
* To make sense of what goes where the following bits need to be analyzed within the codebase.
*
* 1. The backend LocalStorage paths determine what storage paths will remain within a
* cluster and not be forwarded to a performance primary
* 2. Within each path's OperationHandler definition, check to see if ForwardPerformanceStandby &
* ForwardPerformanceSecondary flags are set to short-circuit the request to a given active node
* 3. Within the managed key util class in pki, an initialization failure could cause the request
* to be forwarded to an active node if not already on it.
*/
// Factory creates a new backend implementing the logical.Backend interface
func Factory(ctx context.Context, conf *logical.BackendConfig) (logical.Backend, error) {
b := Backend(conf)
if err := b.Setup(ctx, conf); err != nil {
return nil, err
}
return b, nil
}
// Backend returns a new Backend framework struct
func Backend(conf *logical.BackendConfig) *backend {
var b backend
b.Backend = &framework.Backend{
Help: strings.TrimSpace(backendHelp),
PathsSpecial: &logical.Paths{
Unauthenticated: []string{
"cert/*",
"ca/pem",
"ca_chain",
"ca",
"crl/delta",
"crl/delta/pem",
"crl/pem",
"crl",
"issuer/+/crl/der",
"issuer/+/crl/pem",
"issuer/+/crl",
"issuer/+/crl/delta/der",
"issuer/+/crl/delta/pem",
"issuer/+/crl/delta",
"issuer/+/pem",
"issuer/+/der",
"issuer/+/json",
"issuers/", // LIST operations append a '/' to the requested path
"ocsp", // OCSP POST
"ocsp/*", // OCSP GET
},
LocalStorage: []string{
revokedPath,
deltaWALPath,
legacyCRLPath,
"crls/",
"certs/",
},
Root: []string{
"root",
"root/sign-self-issued",
},
SealWrapStorage: []string{
legacyCertBundlePath,
keyPrefix,
},
},
Paths: []*framework.Path{
pathListRoles(&b),
pathRoles(&b),
pathGenerateRoot(&b),
pathSignIntermediate(&b),
pathSignSelfIssued(&b),
pathDeleteRoot(&b),
pathGenerateIntermediate(&b),
pathSetSignedIntermediate(&b),
pathConfigCA(&b),
pathConfigCRL(&b),
pathConfigURLs(&b),
pathSignVerbatim(&b),
pathSign(&b),
pathIssue(&b),
pathRotateCRL(&b),
pathRevoke(&b),
pathRevokeWithKey(&b),
pathTidy(&b),
pathTidyCancel(&b),
pathTidyStatus(&b),
pathConfigAutoTidy(&b),
// Issuer APIs
pathListIssuers(&b),
pathGetIssuer(&b),
pathGetIssuerCRL(&b),
pathImportIssuer(&b),
pathIssuerIssue(&b),
pathIssuerSign(&b),
pathIssuerSignIntermediate(&b),
pathIssuerSignSelfIssued(&b),
pathIssuerSignVerbatim(&b),
pathIssuerGenerateRoot(&b),
pathRotateRoot(&b),
pathIssuerGenerateIntermediate(&b),
pathCrossSignIntermediate(&b),
pathConfigIssuers(&b),
pathReplaceRoot(&b),
pathRevokeIssuer(&b),
// Key APIs
pathListKeys(&b),
pathKey(&b),
pathGenerateKey(&b),
pathImportKey(&b),
pathConfigKeys(&b),
// Fetch APIs have been lowered to favor the newer issuer API endpoints
pathFetchCA(&b),
pathFetchCAChain(&b),
pathFetchCRL(&b),
pathFetchCRLViaCertPath(&b),
pathFetchValidRaw(&b),
pathFetchValid(&b),
pathFetchListCerts(&b),
// OCSP APIs
buildPathOcspGet(&b),
buildPathOcspPost(&b),
},
Secrets: []*framework.Secret{
secretCerts(&b),
},
BackendType: logical.TypeLogical,
InitializeFunc: b.initialize,
Invalidate: b.invalidate,
PeriodicFunc: b.periodicFunc,
}
b.tidyCASGuard = new(uint32)
b.tidyCancelCAS = new(uint32)
b.tidyStatus = &tidyStatus{state: tidyStatusInactive}
b.storage = conf.StorageView
b.backendUUID = conf.BackendUUID
b.pkiStorageVersion.Store(0)
b.crlBuilder = newCRLBuilder()
// Delay the first tidy until after we've started up.
b.lastTidy = time.Now()
return &b
}
type backend struct {
*framework.Backend
backendUUID string
storage logical.Storage
revokeStorageLock sync.RWMutex
tidyCASGuard *uint32
tidyCancelCAS *uint32
tidyStatusLock sync.RWMutex
tidyStatus *tidyStatus
lastTidy time.Time
pkiStorageVersion atomic.Value
crlBuilder *crlBuilder
// Write lock around issuers and keys.
issuersLock sync.RWMutex
}
type (
tidyStatusState int
roleOperation func(ctx context.Context, req *logical.Request, data *framework.FieldData, role *roleEntry) (*logical.Response, error)
)
const (
tidyStatusInactive tidyStatusState = iota
tidyStatusStarted = iota
tidyStatusFinished = iota
tidyStatusError = iota
tidyStatusCancelling = iota
tidyStatusCancelled = iota
)
type tidyStatus struct {
// Parameters used to initiate the operation
safetyBuffer int
tidyCertStore bool
tidyRevokedCerts bool
tidyRevokedAssocs bool
pauseDuration string
// Status
state tidyStatusState
err error
timeStarted time.Time
timeFinished time.Time
message string
certStoreDeletedCount uint
revokedCertDeletedCount uint
missingIssuerCertCount uint
}
const backendHelp = `
The PKI backend dynamically generates X509 server and client certificates.
After mounting this backend, configure the CA using the "pem_bundle" endpoint within
the "config/" path.
`
func metricsKey(req *logical.Request, extra ...string) []string {
if req == nil || req.MountPoint == "" {
return extra
}
key := make([]string, len(extra)+1)
key[0] = req.MountPoint[:len(req.MountPoint)-1]
copy(key[1:], extra)
return key
}
func (b *backend) metricsWrap(callType string, roleMode int, ofunc roleOperation) framework.OperationFunc {
return func(ctx context.Context, req *logical.Request, data *framework.FieldData) (*logical.Response, error) {
key := metricsKey(req, callType)
var role *roleEntry
var labels []metrics.Label
var err error
var roleName string
switch roleMode {
case roleRequired:
roleName = data.Get("role").(string)
case roleOptional:
r, ok := data.GetOk("role")
if ok {
roleName = r.(string)
}
}
if roleMode > noRole {
// Get the role
role, err = b.getRole(ctx, req.Storage, roleName)
if err != nil {
return nil, err
}
if role == nil && (roleMode == roleRequired || len(roleName) > 0) {
return logical.ErrorResponse(fmt.Sprintf("unknown role: %s", roleName)), nil
}
labels = []metrics.Label{{"role", roleName}}
}
ns, err := namespace.FromContext(ctx)
if err == nil {
labels = append(labels, metricsutil.NamespaceLabel(ns))
}
start := time.Now()
defer metrics.MeasureSinceWithLabels(key, start, labels)
resp, err := ofunc(ctx, req, data, role)
if err != nil || resp.IsError() {
metrics.IncrCounterWithLabels(append(key, "failure"), 1.0, labels)
} else {
metrics.IncrCounterWithLabels(key, 1.0, labels)
}
return resp, err
}
}
// initialize is used to perform a possible PKI storage migration if needed
func (b *backend) initialize(ctx context.Context, _ *logical.InitializationRequest) error {
sc := b.makeStorageContext(ctx, b.storage)
if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil {
return err
}
// Grab the lock prior to the updating of the storage lock preventing us flipping
// the storage flag midway through the request stream of other requests.
b.issuersLock.Lock()
defer b.issuersLock.Unlock()
// Load up our current pki storage state, no matter the host type we are on.
b.updatePkiStorageVersion(ctx, false)
// Early exit if not a primary cluster or performance secondary with a local mount.
if b.System().ReplicationState().HasState(consts.ReplicationDRSecondary|consts.ReplicationPerformanceStandby) ||
(!b.System().LocalMount() && b.System().ReplicationState().HasState(consts.ReplicationPerformanceSecondary)) {
b.Logger().Debug("skipping PKI migration as we are not on primary or secondary with a local mount")
return nil
}
if err := migrateStorage(ctx, b, b.storage); err != nil {
b.Logger().Error("Error during migration of PKI mount: " + err.Error())
return err
}
b.updatePkiStorageVersion(ctx, false)
return nil
}
func (b *backend) useLegacyBundleCaStorage() bool {
// This helper function is here to choose whether or not we use the newer
// issuer/key storage format or the older legacy ca bundle format.
//
// This happens because we might've upgraded secondary PR clusters to
// newer vault code versions. We still want to be able to service requests
// with the old bundle format (e.g., issuing and revoking certs), until
// the primary cluster's active node is upgraded to the newer Vault version
// and the storage is migrated to the new format.
version := b.pkiStorageVersion.Load()
return version == nil || version == 0
}
func (b *backend) updatePkiStorageVersion(ctx context.Context, grabIssuersLock bool) {
info, err := getMigrationInfo(ctx, b.storage)
if err != nil {
b.Logger().Error(fmt.Sprintf("Failed loading PKI migration status, staying in legacy mode: %v", err))
return
}
if grabIssuersLock {
b.issuersLock.Lock()
defer b.issuersLock.Unlock()
}
if info.isRequired {
b.pkiStorageVersion.Store(0)
} else {
b.pkiStorageVersion.Store(1)
}
}
func (b *backend) invalidate(ctx context.Context, key string) {
switch {
case strings.HasPrefix(key, legacyMigrationBundleLogKey):
// This is for a secondary cluster to pick up that the migration has completed
// and reset its compatibility mode and rebuild the CRL locally. Kick it off
// as a go routine to not block this call due to the lock grabbing
// within updatePkiStorageVersion.
go func() {
b.Logger().Info("Detected a migration completed, resetting pki storage version")
b.updatePkiStorageVersion(ctx, true)
b.crlBuilder.requestRebuildIfActiveNode(b)
}()
case strings.HasPrefix(key, issuerPrefix):
if !b.useLegacyBundleCaStorage() {
// See note in updateDefaultIssuerId about why this is necessary.
// We do this ahead of CRL rebuilding just so we know that things
// are stale.
b.crlBuilder.invalidateCRLBuildTime()
// If an issuer has changed on the primary, we need to schedule an update of our CRL,
// the primary cluster would have done it already, but the CRL is cluster specific so
// force a rebuild of ours.
b.crlBuilder.requestRebuildIfActiveNode(b)
} else {
b.Logger().Debug("Ignoring invalidation updates for issuer as the PKI migration has yet to complete.")
}
case key == "config/crl":
// We may need to reload our OCSP status flag
b.crlBuilder.markConfigDirty()
case key == storageIssuerConfig:
b.crlBuilder.invalidateCRLBuildTime()
}
}
func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) error {
sc := b.makeStorageContext(ctx, request.Storage)
doCRL := func() error {
// First attempt to reload the CRL configuration.
if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil {
return err
}
// As we're (below) modifying the backing storage, we need to ensure
// we're not on a standby/secondary node.
if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
return nil
}
// Check if we're set to auto rebuild and a CRL is set to expire.
if err := b.crlBuilder.checkForAutoRebuild(sc); err != nil {
return err
}
// Then attempt to rebuild the CRLs if required.
if err := b.crlBuilder.rebuildIfForced(ctx, b, request); err != nil {
return err
}
// If a delta CRL was rebuilt above as part of the complete CRL rebuild,
// this will be a no-op. However, if we do need to rebuild delta CRLs,
// this would cause us to do so.
if err := b.crlBuilder.rebuildDeltaCRLsIfForced(sc); err != nil {
return err
}
return nil
}
doAutoTidy := func() error {
// As we're (below) modifying the backing storage, we need to ensure
// we're not on a standby/secondary node.
if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
return nil
}
config, err := sc.getAutoTidyConfig()
if err != nil {
return err
}
if !config.Enabled || config.Interval <= 0*time.Second {
return nil
}
// Check if we should run another tidy...
now := time.Now()
b.tidyStatusLock.RLock()
nextOp := b.lastTidy.Add(config.Interval)
b.tidyStatusLock.RUnlock()
if now.Before(nextOp) {
return nil
}
// Ensure a tidy isn't already running... If it is, we'll trigger
// again when the running one finishes.
if !atomic.CompareAndSwapUint32(b.tidyCASGuard, 0, 1) {
return nil
}
// Prevent ourselves from starting another tidy operation while
// this one is still running. This operation runs in the background
// and has a separate error reporting mechanism.
b.tidyStatusLock.Lock()
b.lastTidy = now
b.tidyStatusLock.Unlock()
// Because the request from the parent storage will be cleared at
// some point (and potentially reused) -- due to tidy executing in
// a background goroutine -- we need to copy the storage entry off
// of the backend instead.
backendReq := &logical.Request{
Storage: b.storage,
}
b.startTidyOperation(backendReq, config)
return nil
}
crlErr := doCRL()
tidyErr := doAutoTidy()
if crlErr != nil && tidyErr != nil {
return fmt.Errorf("Error building CRLs:\n - %v\n\nError running auto-tidy:\n - %v\n", crlErr, tidyErr)
}
if crlErr != nil {
return fmt.Errorf("Error building CRLs:\n - %v\n", crlErr)
}
if tidyErr != nil {
return fmt.Errorf("Error running auto-tidy:\n - %v\n", tidyErr)
}
// Check if the CRL was invalidated due to issuer swap and update
// accordingly.
if err := b.crlBuilder.flushCRLBuildTimeInvalidation(sc); err != nil {
return err
}
// All good!
return nil
}