From 42fa6ab32a85566ea7842beefccaaa5e34d49af5 Mon Sep 17 00:00:00 2001 From: Vault Automation Date: Tue, 13 Jan 2026 10:46:02 -0800 Subject: [PATCH] KV HWM Metrics (#11596) (#11725) * Added kv HWM metrics and a local test * Added go docs and some helper functions * Just added kv secret hwm to endpoint for tracking * Fixed some small things and added one more test * Fix a race test * Added require Co-authored-by: divyaac --- vault/billing/billing_counts.go | 10 ++- vault/billing/billing_counts_test.go | 21 +++++ vault/consumption_billing.go | 15 +++- vault/consumption_billing_util.go | 78 +++++++++++++++++ vault/consumption_billing_util_test.go | 102 +++++++++++++++++++++++ vault/core_metrics.go | 55 ++++++++++++ vault/logical_system_use_case_billing.go | 23 +++++ 7 files changed, 298 insertions(+), 6 deletions(-) create mode 100644 vault/billing/billing_counts_test.go diff --git a/vault/billing/billing_counts.go b/vault/billing/billing_counts.go index 28f2088910..c206b4cd23 100644 --- a/vault/billing/billing_counts.go +++ b/vault/billing/billing_counts.go @@ -13,11 +13,12 @@ const ( BillingSubPath = "billing/" ReplicatedPrefix = "replicated/" RoleHWMCountsHWM = "maxRoleCounts/" + KvHWMCountsHWM = "maxKvCounts/" LocalPrefix = "local/" BillingWriteInterval = 10 * time.Minute ) -var BillingMonthStorageFormat = "%s/%d/%02d/%s" +var BillingMonthStorageFormat = "%s%d/%02d/%s" // e.g. replicated/2026/01/maxKvCounts/ type ConsumptionBilling struct { // BillingStorageLock controls access to the billing storage paths @@ -32,7 +33,10 @@ type BillingConfig struct { } func GetMonthlyBillingPath(localPrefix string, now time.Time, billingMetric string) string { + // BillingMonthStorageFormat adds no "/" after the prefix, since our prefixes already include a trailing "/" (this avoids double slashes). 
+ // Example: localPrefix="replicated/", billingMetric="maxKvCounts/" => + // "replicated/2026/01/maxKvCounts/" year := now.Year() - month := now.Month() - return fmt.Sprintf(localPrefix, month, year, billingMetric) + month := int(now.Month()) + return fmt.Sprintf(BillingMonthStorageFormat, localPrefix, year, month, billingMetric) } diff --git a/vault/billing/billing_counts_test.go b/vault/billing/billing_counts_test.go new file mode 100644 index 0000000000..bc0a0123f4 --- /dev/null +++ b/vault/billing/billing_counts_test.go @@ -0,0 +1,21 @@ +// Copyright IBM Corp. 2016, 2025 +// SPDX-License-Identifier: MPL-2.0 + +package billing + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +// TestGetMonthlyBillingPath verifies the GetMonthlyBillingPath function +// returns the correct billing path for the given product area and month +func TestGetMonthlyBillingPath(t *testing.T) { + ts := time.Date(2026, time.January, 5, 12, 0, 0, 0, time.UTC) + + got := GetMonthlyBillingPath(ReplicatedPrefix, ts, KvHWMCountsHWM) + want := "replicated/2026/01/maxKvCounts/" + require.Equal(t, got, want) +} diff --git a/vault/consumption_billing.go b/vault/consumption_billing.go index 60d97d3220..b2955f645f 100644 --- a/vault/consumption_billing.go +++ b/vault/consumption_billing.go @@ -84,16 +84,25 @@ func (c *Core) UpdateReplicatedHWMMetrics(ctx context.Context, currentMonth time } else { c.logger.Info("updated replicated hwm role counts", "prefix", billing.ReplicatedPrefix, "currentMonth", currentMonth) } + if _, err = c.UpdateMaxKvCounts(ctx, billing.ReplicatedPrefix, currentMonth); err != nil { + // We won't return an error. 
Instead we will log the errors and attempt to continue + c.logger.Error("error updating replicated max kv counts", "error", err) + } else { + c.logger.Info("updated replicated max kv counts", "prefix", billing.ReplicatedPrefix, "currentMonth", currentMonth) + } return nil } func (c *Core) UpdateLocalHWMMetrics(ctx context.Context, currentMonth time.Time) error { - _, err := c.UpdateMaxRoleCounts(ctx, billing.LocalPrefix, currentMonth) - if err != nil { + if _, err := c.UpdateMaxRoleCounts(ctx, billing.LocalPrefix, currentMonth); err != nil { c.logger.Error("error updating local max role counts", "error", err) - // We won't return an error. Instead we will log the errors and attempt to continue } else { c.logger.Info("updated local max role counts", "prefix", billing.LocalPrefix, "currentMonth", currentMonth) } + if _, err := c.UpdateMaxKvCounts(ctx, billing.LocalPrefix, currentMonth); err != nil { + c.logger.Error("error updating local max kv counts", "error", err) + } else { + c.logger.Info("updated local max kv counts", "prefix", billing.LocalPrefix, "currentMonth", currentMonth) + } return nil } diff --git a/vault/consumption_billing_util.go b/vault/consumption_billing_util.go index bc9326f029..b673f5d5ec 100644 --- a/vault/consumption_billing_util.go +++ b/vault/consumption_billing_util.go @@ -5,6 +5,7 @@ package vault import ( "context" + "strconv" "time" "github.com/hashicorp/vault/sdk/logical" @@ -37,6 +38,83 @@ func combineRoleCounts(ctx context.Context, a, b *RoleCounts) *RoleCounts { } } +// storeMaxKvCountsLocked must be called with BillingStorageLock held +func (c *Core) storeMaxKvCountsLocked(ctx context.Context, maxKvCounts int, localPathPrefix string, month time.Time) error { + billingPath := billing.GetMonthlyBillingPath(localPathPrefix, month, billing.KvHWMCountsHWM) + entry := &logical.StorageEntry{ + Key: billingPath, + Value: []byte(strconv.Itoa(maxKvCounts)), + } + return c.GetBillingSubView().Put(ctx, entry) +} + +// getStoredMaxKvCountsLocked 
must be called with BillingStorageLock held +func (c *Core) getStoredMaxKvCountsLocked(ctx context.Context, localPathPrefix string, month time.Time) (int, error) { + billingPath := billing.GetMonthlyBillingPath(localPathPrefix, month, billing.KvHWMCountsHWM) + entry, err := c.GetBillingSubView().Get(ctx, billingPath) + if err != nil { + return 0, err + } + if entry == nil { + return 0, nil + } + maxKvCounts, err := strconv.Atoi(string(entry.Value)) + if err != nil { + return 0, err + } + return maxKvCounts, nil +} + +func (c *Core) GetStoredHWMKvCounts(ctx context.Context, localPathPrefix string, month time.Time) (int, error) { + c.consumptionBilling.BillingStorageLock.RLock() + defer c.consumptionBilling.BillingStorageLock.RUnlock() + return c.getStoredMaxKvCountsLocked(ctx, localPathPrefix, month) +} + +// UpdateMaxKvCounts updates the HWM kv counts for the given month, and returns the value that was stored. +func (c *Core) UpdateMaxKvCounts(ctx context.Context, localPathPrefix string, currentMonth time.Time) (int, error) { + c.consumptionBilling.BillingStorageLock.Lock() + defer c.consumptionBilling.BillingStorageLock.Unlock() + + local := localPathPrefix == billing.LocalPrefix + + // Get the current count of kv version 1 secrets + currentKvCounts, err := c.GetKvUsageMetricsByNamespace(ctx, "1", "", local, !local) + if err != nil { + c.logger.Error("error getting count of kv version 1 secrets", "error", err) + return 0, err + } + totalKvCounts := getTotalSecretsAcrossAllNamespaces(currentKvCounts) + + // Get the current count of kv version 2 secrets + currentKvCounts, err = c.GetKvUsageMetricsByNamespace(ctx, "2", "", local, !local) + if err != nil { + c.logger.Error("error getting current count of kv version 2 secrets", "error", err) + return 0, err + } + totalKvCounts += getTotalSecretsAcrossAllNamespaces(currentKvCounts) + + // Get the stored max kv counts + maxKvCounts, err := c.getStoredMaxKvCountsLocked(ctx, localPathPrefix, currentMonth) + if err != nil { 
+ c.logger.Error("error getting stored max kv counts", "error", err) + return 0, err + } + if maxKvCounts == 0 { + maxKvCounts = totalKvCounts + } + if totalKvCounts > maxKvCounts { + c.logger.Info("updating max kv counts", "totalKvCounts", totalKvCounts, "maxKvCounts", maxKvCounts) + maxKvCounts = totalKvCounts + } + err = c.storeMaxKvCountsLocked(ctx, maxKvCounts, localPathPrefix, currentMonth) + if err != nil { + c.logger.Error("error storing max kv counts", "error", err) + return 0, err + } + return maxKvCounts, nil +} + // storeMaxRoleCountsLocked must be called with BillingStorageLock held func (c *Core) storeMaxRoleCountsLocked(ctx context.Context, maxRoleCounts *RoleCounts, localPathPrefix string, month time.Time) error { billingPath := billing.GetMonthlyBillingPath(localPathPrefix, month, billing.RoleHWMCountsHWM) diff --git a/vault/consumption_billing_util_test.go b/vault/consumption_billing_util_test.go index 0e11cd7987..1bdf42566f 100644 --- a/vault/consumption_billing_util_test.go +++ b/vault/consumption_billing_util_test.go @@ -203,6 +203,9 @@ func TestHWMRoleCounts(t *testing.T) { }, } + // Sleep to prevent race conditions during the role initialization + time.Sleep(1 * time.Second) + core.mountsLock.RLock() defer core.mountsLock.RUnlock() for _, tc := range testCases { @@ -387,11 +390,74 @@ func TestHWMRoleCounts(t *testing.T) { }, counts) } +// TestHWMKvSecretsCounts tests that we correctly store and track the HWM kv counts +// for both kv-v1 and kv-v2 mounts. 
+func TestHWMKvSecretsCounts(t *testing.T) { + coreConfig := &CoreConfig{ + LogicalBackends: roleLogicalBackends, + BillingConfig: billing.BillingConfig{ + MetricsUpdateCadence: 3 * time.Second, + }, + } + core, _, root := TestCoreUnsealedWithConfig(t, coreConfig) + + // Add 1 kv-v1 mount and 1 kv-v2 mount in the root namespace + for _, mount := range []string{"kv-v1", "kv-v2"} { + req := logical.TestRequest(t, logical.CreateOperation, fmt.Sprintf("sys/mounts/%v", mount)) + req.Data["type"] = mount + req.ClientToken = root + ctx := namespace.RootContext(context.Background()) + + _, err := core.HandleRequest(ctx, req) + require.NoError(t, err) + } + + // Add two secrets to each mount + for _, mount := range []string{"kv-v1", "kv-v2"} { + for i := 0; i < 2; i++ { + secretName := fmt.Sprintf("secret-%d", i) + addKvSecretToStorage(t, namespace.RootContext(context.Background()), core, mount, root, secretName, mount) + } + } + + // Verify that the max kv counts are as expected + timer := time.NewTimer(3 * time.Second) + _ = <-timer.C + counts, err := core.GetStoredHWMKvCounts(context.Background(), billing.ReplicatedPrefix, time.Now()) + require.NoError(t, err) + require.Equal(t, 4, counts) + + // Add one more secret to the kv-v1 mount + addKvSecretToStorage(t, namespace.RootContext(context.Background()), core, "kv-v1", root, "secret-3", "kv-v1") + + // Wait for the metrics update + timer = time.NewTimer(3 * time.Second) + _ = <-timer.C + + // Verify that the max kv counts are updated + counts, err = core.GetStoredHWMKvCounts(context.Background(), billing.ReplicatedPrefix, time.Now()) + require.NoError(t, err) + require.Equal(t, 5, counts) + + // Now delete one secret from the kv-v2 mount + deleteKvSecretFromStorage(t, namespace.RootContext(context.Background()), core, "kv-v2", root, "secret-1", "kv-v2") + + // Wait for any metrics updates to complete + timer = time.NewTimer(3 * time.Second) + _ = <-timer.C + + // Verify that the max kv counts are still the same + counts, 
err = core.GetStoredHWMKvCounts(context.Background(), billing.ReplicatedPrefix, time.Now()) + require.NoError(t, err) + require.Equal(t, 5, counts) +} + func addRoleToStorage(t *testing.T, core *Core, mount string, key string, numberOfKeys int) { raw, ok := core.router.root.Get(mount + "/") if !ok { return } + require.NotNil(t, raw) re := raw.(*routeEntry) storageView := re.storageView @@ -435,3 +501,39 @@ func deleteAllRolesFromStorage(t *testing.T, core *Core, mount string, key strin require.NoError(t, err) require.Len(t, list, 0) } + +func addKvSecretToStorage(t *testing.T, ctx context.Context, core *Core, mount string, token string, secretName string, kvVersion string) { + var req *logical.Request + switch kvVersion { + case "kv-v2": + // KV v2 expects writes to /data/ with a nested "data" payload + req = logical.TestRequest(t, logical.UpdateOperation, fmt.Sprintf("%v/data/%s", mount, secretName)) + req.Data["data"] = map[string]interface{}{ + "foo": "bar", + } + case "kv-v1": + // KV v1 expects writes directly to / with a flat payload + req = logical.TestRequest(t, logical.UpdateOperation, fmt.Sprintf("%v/%s", mount, secretName)) + req.Data["foo"] = "bar" + default: + t.Fatalf("invalid kv version: %s", kvVersion) + } + req.ClientToken = token + _, err := core.HandleRequest(ctx, req) + require.NoError(t, err) +} + +func deleteKvSecretFromStorage(t *testing.T, ctx context.Context, core *Core, mount string, token string, secretName string, kvVersion string) { + var req *logical.Request + switch kvVersion { + case "kv-v2": + req = logical.TestRequest(t, logical.DeleteOperation, fmt.Sprintf("%v/data/%s", mount, secretName)) + case "kv-v1": + req = logical.TestRequest(t, logical.DeleteOperation, fmt.Sprintf("%v/%s", mount, secretName)) + default: + t.Fatalf("invalid kv version: %s", kvVersion) + } + req.ClientToken = token + _, err := core.HandleRequest(ctx, req) + require.NoError(t, err) +} diff --git a/vault/core_metrics.go b/vault/core_metrics.go index 
f685793e63..4d9fd53fa1 100644 --- a/vault/core_metrics.go +++ b/vault/core_metrics.go @@ -870,3 +870,58 @@ func (c *Core) GetRoleCounts() *RoleCounts { func (c *Core) GetRoleCountsForCluster() *RoleCounts { return c.getRoleCountsInternal(true, c.isPrimary()) } + +// GetKvUsageMetrics returns a map of namespace paths to KV secret counts across all namespaces, counting both local and replicated mounts of the given KV version. +func (c *Core) GetKvUsageMetrics(ctx context.Context, kvVersion string) (map[string]int, error) { + return c.GetKvUsageMetricsByNamespace(ctx, kvVersion, "", true, true) +} + +// GetKvUsageMetricsByNamespace returns a map of namespace paths to KV secret counts for mounts of the given KV version ("0" means all versions) whose namespace path starts with nsPath (an empty nsPath matches every namespace); includeLocal and includeReplicated select whether local and non-local mounts are counted. +func (c *Core) GetKvUsageMetricsByNamespace(ctx context.Context, kvVersion string, nsPath string, includeLocal bool, includeReplicated bool) (map[string]int, error) { + mounts := c.findKvMounts() + results := make(map[string]int) + + if kvVersion == "1" || kvVersion == "2" { + var newMounts []*kvMount + for _, mount := range mounts { + if mount.Version == kvVersion { + newMounts = append(newMounts, mount) + } + } + mounts = newMounts + } else if kvVersion != "0" { + return results, fmt.Errorf("kv version %s not supported, must be 0, 1, or 2", kvVersion) + } + + for _, m := range mounts { + if !includeLocal && m.Local { + continue + } + if !includeReplicated && !m.Local { + continue + } + + if nsPath != "" && !strings.HasPrefix(m.Namespace.Path, nsPath) { + continue + } + + select { + case <-ctx.Done(): + return nil, fmt.Errorf("context expired") + default: + break + } + + c.walkKvMountSecrets(ctx, m) + + _, ok := results[m.Namespace.Path] + if ok { + // we need to add, not overwrite + results[m.Namespace.Path] += m.NumSecrets + } else { + results[m.Namespace.Path] = m.NumSecrets + } + } + + return results, nil +} diff --git a/vault/logical_system_use_case_billing.go b/vault/logical_system_use_case_billing.go index b64fcbacfa..df43446e8e 100644 --- a/vault/logical_system_use_case_billing.go +++ b/vault/logical_system_use_case_billing.go @@ -52,6 
+52,7 @@ func (b *SystemBackend) useCaseConsumptionBillingPaths() []*framework.Path { func (b *SystemBackend) handleUseCaseConsumption(ctx context.Context, req *logical.Request, data *framework.FieldData) (*logical.Response, error) { // Get HWM role counts replicatedMaxRoleCounts := &RoleCounts{} + replicatedKvHWMCounts := 0 var err error currentMonth := time.Now() previousMonth := timeutil.StartOfPreviousMonth(currentMonth) @@ -63,6 +64,10 @@ func (b *SystemBackend) handleUseCaseConsumption(ctx context.Context, req *logic if err != nil { return nil, fmt.Errorf("error retrieving replicated max role counts: %w", err) } + replicatedKvHWMCounts, err = b.Core.UpdateMaxKvCounts(ctx, billing.ReplicatedPrefix, currentMonth) + if err != nil { + return nil, fmt.Errorf("error retrieving replicated max kv counts: %w", err) + } } // We always want to get the local max role counts @@ -71,32 +76,50 @@ func (b *SystemBackend) handleUseCaseConsumption(ctx context.Context, req *logic if err != nil { return nil, fmt.Errorf("error retrieving local max role counts: %w", err) } + localKvHWMCounts, err := b.Core.UpdateMaxKvCounts(ctx, billing.LocalPrefix, currentMonth) + if err != nil { + return nil, fmt.Errorf("error retrieving local max kv counts: %w", err) + } // If we are the primary, then combine the replicated and local max role counts. Else just output the local // max role counts. replicatedMaxRoleCounts will be empty if we are not a primary, so this is taken care of for us. 
combinedMaxRoleCounts := combineRoleCounts(ctx, replicatedMaxRoleCounts, localMaxRoleCounts) + combinedMaxKvCounts := replicatedKvHWMCounts + localKvHWMCounts var replicatedPreviousMonthRoleCounts *RoleCounts + replicatedPreviousMonthKvHWMCounts := 0 if b.Core.isPrimary() { replicatedPreviousMonthRoleCounts, err = b.Core.GetStoredHWMRoleCounts(ctx, billing.ReplicatedPrefix, previousMonth) if err != nil { return nil, fmt.Errorf("error retrieving replicated max role counts for previous month: %w", err) } + replicatedPreviousMonthKvHWMCounts, err = b.Core.GetStoredHWMKvCounts(ctx, billing.ReplicatedPrefix, previousMonth) + if err != nil { + return nil, fmt.Errorf("error retrieving replicated max kv counts for previous month: %w", err) + } } localPreviousMonthRoleCounts, err := b.Core.GetStoredHWMRoleCounts(ctx, billing.LocalPrefix, previousMonth) if err != nil { return nil, fmt.Errorf("error retrieving local max role counts for previous month: %w", err) } + localPreviousMonthKvHWMCounts, err := b.Core.GetStoredHWMKvCounts(ctx, billing.LocalPrefix, previousMonth) + if err != nil { + return nil, fmt.Errorf("error retrieving local max kv counts for previous month: %w", err) + } + combinedPreviousMonthRoleCounts := combineRoleCounts(ctx, replicatedPreviousMonthRoleCounts, localPreviousMonthRoleCounts) + combinedPreviousMonthKvHWMCounts := replicatedPreviousMonthKvHWMCounts + localPreviousMonthKvHWMCounts resp := map[string]interface{}{ "current_month": map[string]interface{}{ "timestamp": timeutil.StartOfMonth(currentMonth), "maximum_role_counts": combinedMaxRoleCounts, + "maximum_kv_counts": combinedMaxKvCounts, }, "previous_month": map[string]interface{}{ "timestamp": previousMonth, "maximum_role_counts": combinedPreviousMonthRoleCounts, + "maximum_kv_counts": combinedPreviousMonthKvHWMCounts, }, }