From 34a1796d038ee5cba3bca235bf8efd8ef4d86f12 Mon Sep 17 00:00:00 2001 From: Yoko Hyakuna Date: Fri, 9 Aug 2024 14:21:41 -0700 Subject: [PATCH] [Docs] Create 'Troubleshoot' section (#28028) * Create 'Troubleshoot' section * Remove extra spaces * Update redirects.js * Remove extra comma * Change the title * Update website/content/docs/troubleshoot/generate-root-token.mdx Co-authored-by: Brian Shumate * Update website/content/docs/troubleshoot/generate-root-token.mdx Co-authored-by: Brian Shumate * Update website/content/docs/troubleshoot/generate-root-token.mdx Co-authored-by: Brian Shumate * Update website/content/docs/troubleshoot/generate-root-token.mdx Co-authored-by: Brian Shumate * Update website/content/docs/troubleshoot/generate-root-token.mdx Co-authored-by: Brian Shumate * Update website/content/docs/troubleshoot/generate-root-token.mdx Co-authored-by: Brian Shumate * Update website/content/docs/troubleshoot/generate-root-token.mdx Co-authored-by: Brian Shumate * Update website/content/docs/troubleshoot/generate-root-token.mdx Co-authored-by: Brian Shumate * Update website/content/docs/troubleshoot/generate-root-token.mdx Co-authored-by: Brian Shumate * Update website/content/docs/troubleshoot/generate-root-token.mdx Co-authored-by: Brian Shumate * edit suggestions (#28047) * Fix the relative path - add missing '/' * Fix a typo --------- Co-authored-by: Brian Shumate Co-authored-by: Sarah Chavis <62406755+schavis@users.noreply.github.com> --- website/content/api-docs/auth/userpass.mdx | 14 + website/content/docs/commands/write.mdx | 9 + .../concepts/lease-count-quota-exceeded.mdx | 41 -- .../docs/concepts/lease-explosions.mdx | 379 ------------------ .../create-lease-count-quota.mdx | 185 +++++++++ .../prevent-lease-explosions.mdx | 105 +++++ .../docs/troubleshoot/generate-root-token.mdx | 161 ++++++++ .../docs/troubleshoot/lease-issues.mdx | 122 ++++++ website/data/docs-nav-data.json | 33 +- website/redirects.js | 10 + 10 files changed, 629 insertions(+), 
430 deletions(-) delete mode 100644 website/content/docs/concepts/lease-count-quota-exceeded.mdx delete mode 100644 website/content/docs/concepts/lease-explosions.mdx create mode 100644 website/content/docs/configuration/create-lease-count-quota.mdx create mode 100644 website/content/docs/configuration/prevent-lease-explosions.mdx create mode 100644 website/content/docs/troubleshoot/generate-root-token.mdx create mode 100644 website/content/docs/troubleshoot/lease-issues.mdx diff --git a/website/content/api-docs/auth/userpass.mdx b/website/content/api-docs/auth/userpass.mdx index fce3f2160c..6625f9528c 100644 --- a/website/content/api-docs/auth/userpass.mdx +++ b/website/content/api-docs/auth/userpass.mdx @@ -52,6 +52,20 @@ $ curl \ http://127.0.0.1:8200/v1/auth/userpass/users/mitchellh ``` +### Examples + +Set role-level TTL values for a user named "alice" so the generated lease has a +default TTL of 8 hours (28800 seconds) and maximum TTL of 12 hours +(43200 seconds): + +```shell-session +$ curl \ + --header "X-Vault-Token: $VAULT_TOKEN" \ + --request POST \ + --data '{"token_ttl":"8h","token_max_ttl":"12h"}' \ + $VAULT_ADDR/v1/auth/userpass/users/alice +``` + ## Read user Reads the properties of an existing username. 
diff --git a/website/content/docs/commands/write.mdx b/website/content/docs/commands/write.mdx index b73a77a47d..03614c8041 100644 --- a/website/content/docs/commands/write.mdx +++ b/website/content/docs/commands/write.mdx @@ -57,6 +57,15 @@ Configure access to Consul by providing an access token: $ echo $MY_TOKEN | vault write consul/config/access token=- ``` +Set role-level TTL values for a user named "alice" so the generated lease has a +default TTL of 8 hours (28800 seconds) and maximum TTL of 12 hours +(43200 seconds): + +```shell-session +$ VAULT_TOKEN=$VAULT_TOKEN vault write /auth/userpass/users/alice \ + token_ttl="8h" token_max_ttl="12h" +``` + ### API versus CLI Create a token with TTL set to 8 hours, limited to 3 uses, and attach `admin` diff --git a/website/content/docs/concepts/lease-count-quota-exceeded.mdx b/website/content/docs/concepts/lease-count-quota-exceeded.mdx deleted file mode 100644 index 5e2d8463ae..0000000000 --- a/website/content/docs/concepts/lease-count-quota-exceeded.mdx +++ /dev/null @@ -1,41 +0,0 @@ ---- -layout: docs -page_title: Lease count quota exceeded -description: |- - Vault Enterprise error when quota limit is reached. ---- - -# Lease count quota exceeded - -Vault returns a `429 - Too Many Requests` response if a new lease request -violates the quota limit, to guard against [lease -explosions](/vault/docs/concepts/lease-explosions): - - -``` -Error making API request. - -URL: PUT https://127.0.0.1:61555/v1/auth/userpass/login/foo -Code: 429. Errors: - -* 1 error occurred: - * request path "auth/userpass/login/foo": lease count quota exceeded -``` - -To resolve the error: - -1. Check for client-side errors that result in excessive lease creation and - correct them. -1. Tune your lease count quota to accommodate any expected increases in lease - creation. For example, due to new feature releases or an increase in users. 
- -## Tutorial - -Refer to [Protecting Vault with Resource -Quotas](/vault/tutorials/operations/resource-quotas) for a -step-by-step tutorial. - -## API - -Lease count quotas can be managed over the HTTP API. Please see -[Lease Count Quotas API](/vault/api-docs/system/lease-count-quotas) for more details. diff --git a/website/content/docs/concepts/lease-explosions.mdx b/website/content/docs/concepts/lease-explosions.mdx deleted file mode 100644 index 48934d1172..0000000000 --- a/website/content/docs/concepts/lease-explosions.mdx +++ /dev/null @@ -1,379 +0,0 @@ ---- -layout: docs -page_title: 'Lease Explosions' -description: >- - Learn about lease explosions and how you can prevent them. ---- - -# Lease Explosions - -As your Vault environment scales to meet deployment needs, it is important to avoid over-subscription. A lease explosion can occur when operators reach over-subscription and clients create leases much faster than Vault is set to revoke them. If this continues unchecked, the active node can run out of memory. Once a lease explosion occurs, mitigation is time consuming and resource intensive. - -This document shows you how to prevent lease explosions, mitigate when a lease explosion occurs, and clean up your environment after a lease explosion. - -Applications and users can overwhelm system resources through consistent and high-volume API requests, resulting in denial-of-service issues in some Vault nodes or even the entire Vault cluster. Review [Vault resource quotas](/vault/docs/concepts/resource-quotas) to learn more about enabling rate-limit quotas and lease-count quotas to protect against requests which could trigger lease explosions. - -These are common observations and behaviors operators experience as their Vault deployment matures: - -- TTL values for dynamic secret leases or authentication tokens could be too high, resulting in unused leases consuming storage space while waiting to expire. 
- -- Rapid lease count growth disproportionate to the number of clients is a sign of misconfiguration or potential anti-patterns in client usage. - -- Lease revocation is failing. This can be caused by failures in an external service in the case of dynamic secrets. - -- Valid credentials which have already been leased are not being reused when possible. e.g. a badly behaving app requests new credentials from Vault every time it starts instead of caching ones it previously requested and using them again. This encourages a build up of leases associated with otherwise unused credentials. - -- The Vault server is not processing lease revocations as quickly as they're expiring. Usually, this is due to insufficient IOPS for the storage backend. - -You can approach lease explosions in three phases: - -- Preventing lease explosions - -- Mitigating lease explosions - -- Cleaning up after lease explosions - -## Preventing lease explosions - -Prevention is the best tool against lease explosion. The following are three important areas you can focus on to prevent lease explosion in your Vault environment. - -Although no technical maximum exists, high lease counts can cause degradation in system performance. We recommend short default time-to-live (TTL) values on tokens and leases to avoid a large backlog of unexpired leases or many simultaneous expirations. Review [Vault lease limits](/vault/docs/internals/limits#lease-limits) to learn more. - -### Client best practices - -Ensure clients using Vault adhere to best practices for their authentication and secret retrieval, and do not make excessive dynamic secrets requests or service token authentications. Review [Lease Concepts](/vault/docs/concepts/lease) and [Auth Concepts](/vault/docs/concepts/auth) to learn more. - -You should avoid these client behavior anti-patterns: -Long TTLs configured, leading to a slow build over-subscription. -Acute aberrant client behavior leading to rapid over-subscription. -A combination of both. 
- -#### AppRole - -As Vault matures in your environment, it's important to review and ensure client behavior best practices around machine-based authentication as it can have more impact on lease explosion than human-based authentication typically does. - -- [Recommended pattern for Vault AppRole use](/vault/tutorials/recommended-patterns/pattern-approle) - -- [How and why to use AppRole correctly in HashiCorp Vault](https://www.hashicorp.com/blog/how-and-why-to-use-approle-correctly-in-hashicorp-vault) - -### Monitoring key metrics - -Proactive monitoring is key to identifying behavior and usage patterns before they become problematic. Review the following resources for more details: - -- [Vault key metrics](/well-architected-framework/reliability/reliability-vault-monitoring-key-metrics) - -- [Vault anti-patterns poor metrics](/well-architected-framework/operational-excellence/security-vault-anti-patterns#poor-metrics-or-no-telemetry-data) - -### Implementation guardrails - -You can choose the appropriate token type for your use case, and use resource quotas as guardrails against lease explosion in your implementation. - -#### TTLs - -| TTL type | Notes | -| -------- |------ | -| [System-wide maximum TTL](/vault/docs/configuration#default_lease_ttl) and [system-wide default TTL](/vault/docs/configuration#max_lease_ttl) | TTL values which you specify in the Vault server configuration file; they are the last used values by Vault in terms of precedence after mount TTLs and high granularity TTLs | -| [Mount maximum TTL](/vault/api-docs/system/mounts#default_lease_ttl-1) and [mount default TTL](/vault/api-docs/system/mounts#max_lease_ttl-1) | TTL values specified on a per mount instance of auth method or secrets engine. In terms of precedence, these TTL values override system-wide TTLs, but are overridden by highly granular TTLs. 
| -| Highly granular TTLs, for example: [Database secrets engine role default TTL](/vault/api-docs/secret/databases#default_ttl) and [Database secrets engine role maximum TTL](/vault/api-docs/secret/databases#max_ttl) | These TTLs are specified on a role, group, or user level, and their values override both mount and system-wide TTL values. | - -More details are available in the [Token Time-To-Live, periodic tokens, and explicit max TTLs](/vault/docs/concepts/tokens#token-time-to-live-periodic-tokens-and-explicit-max-ttls) and [Lease limits](/vault/docs/internals/limits#lease-limits) documentation. - -You should also review the details in the Vault anti-patterns guide: [not adjusting the default lease time](/well-architected-framework/operational-excellence/security-vault-anti-patterns#not-adjusting-the-default-lease-time) for a clear explanation of the issue and solution. - -The following are examples for setting default and maximum TTL values using the Vault API and CLI, which you can reference when setting values for your implementation. - - - -Adjusting TTL values is not a retroactive operation, and affects just those leases or tokens issued after you make the changes. - - - -Update the default TTL to 8 hours and maximum TTL to 12 hours on a username and password auth method user named "alice". The value of `$VAULT_TOKEN` should be that of a token with capabilities to perform the operations. - - - - - -```shell-session -$ curl \ - --header "X-Vault-Token: $VAULT_TOKEN" \ - --request POST \ - --data '{"token_ttl":"8h","token_max_ttl":"12h"}' \ - $VAULT_ADDR/v1/auth/userpass/users/alice -``` - -This command is not expected to produce output, but you can read the user to confirm the settings. 
- -```shell-session -$ curl \ - --header "X-Vault-Token: $VAULT_TOKEN" \ - --request GET \ - --silent \ - $VAULT_ADDR/v1/auth/userpass/users/alice \ - | jq -``` - -Example output: - - - -```json -{ - "request_id": "4cfc0293-a3f3-9b3b-b668-82aea63ced91", - "lease_id": "", - "renewable": false, - "lease_duration": 0, - "data": { - "token_bound_cidrs": [], - "token_explicit_max_ttl": 0, - "token_max_ttl": 43200, - "token_no_default_policy": false, - "token_num_uses": 0, - "token_period": 0, - "token_policies": [], - "token_ttl": 28800, - "token_type": "default" - }, - "wrap_info": null, - "warnings": null, - "auth": null -} -``` - - - -When Alice authenticates with Vault and gets a token, its default TTL value is set to 28800 seconds (8 hours) and the maximum TTL value is 43200 seconds (12 hours). - - - - - -```shell-session -$ VAULT_TOKEN=$VAULT_TOKEN vault write /auth/userpass/users/alice \ - token_ttl="8h" token_max_ttl="12h" -``` - -Example output: - - - -```plaintext -Success! Data written to: auth/userpass/users/alice -``` - - - -You can read the user to confirm the settings. - -```shell-session -$ VAULT_TOKEN=$VAULT_TOKEN vault read /auth/userpass/users/alice -``` - -Example output: - - - -```plaintext -Key Value ---- ----- -token_bound_cidrs [] -token_explicit_max_ttl 0s -token_max_ttl 12h -token_no_default_policy false -token_num_uses 0 -token_period 0s -token_policies [] -token_ttl 8h -token_type default -``` - - - -When Alice next authenticates with Vault and gets a token, its default TTL value is set to 8 hours and the maximum TTL value is 12 hours. - - - - - -#### Resource Quotas -You can use quotas to control Vault resource usage in the form of API rate limiting quotas and [lease count quotas](/vault/tutorials/operations/resource-quotas#lease-count-quotas). For the purposes of this overview, lease count quotas are most relevant as you can cap the maximum number of leases generated on a per-mount basis. 
- -Use this feature for use cases where a hard limit to the number of leases makes sense. Also, be sure to [monitor Vault audit device logs](/vault/tutorials/monitoring/monitor-telemetry-audit-splunk) where Vault emits messages about failures related to exceeding the quota. - -The following examples demonstrate creating a lease count quota on an instance of the Approle auth method, for the role named "webapp" to restrict leases to no more than 100. The value of `$VAULT_TOKEN` should be that of a token capable of performing the operations. - - - - - -1. Create a payload file containing the lease quota parameters. - - ```shell-session - $ cat > payload.json << EOF - { - "path": "auth/approle", - "role": "webapp", - "max_leases": 100 - } - EOF - ``` - -1. Write the webapp-tokens lease count quota. - - ```shell-session - $ curl \ - --request POST \ - --header "X-Vault-Token: $VAULT_TOKEN" \ - --data @payload.json \ - $VAULT_ADDR/v1/sys/quotas/lease-count/webapp-tokens - ``` - - This command is not expected to produce output, but you can read the user to confirm the settings. - -1. Confirm settings. - - ```shell-session - $ curl \ - --header "X-Vault-Token: $VAULT_TOKEN" \ - --request GET \ - --silent \ - $VAULT_ADDR/v1/sys/quotas/lease-count/webapp-tokens \ - | jq - ``` - - Example output: - - - - ```json - { - "request_id": "188e22f1-dc1a-251a-a0a1-005e256fe70f", - "lease_id": "", - "renewable": false, - "lease_duration": 0, - "data": { - "counter": 0, - "inheritable": true, - "max_leases": 100, - "name": "webapp-tokens", - "path": "auth/approle/", - "role": "webapp", - "type": "lease-count" - }, - "wrap_info": null, - "warnings": null, - "auth": null - } - ``` - - - - - - - -Write the webapp-tokens lease count quota. - -```shell-session -$ vault write sys/quotas/lease-count/webapp-tokens \ - max_leases=100 \ - path="auth/approle" \ - role="webapp" -``` - -Example output: - - - -```plaintext -Success! 
Data written to: sys/quotas/lease-count/webapp-tokens -``` - - - -Confirm the setting. - -```shell-session -$ vault read sys/quotas/lease-count/webapp-tokens -``` - -Example output: - - - -```plaintext -Key Value ---- ----- -counter 0 -inheritable true -max_leases 100 -name webapp-tokens -path auth/approle/ -role webapp -type lease-count -``` - - - - - - - -The limit is set to 100 leases for the AppRole auth method role named webapp. - - - -Enabling the rate limit audit logging may have an impact on the Vault performance if the volume of rejected requests is large. - - - -Review these resources for a deeper dive into controlling Vault resources: - -- [Vault resource quotas](/vault/docs/concepts/resource-quotas) - -- [Vault Enterprise lease count quotas](/vault/docs/enterprise/lease-count-quotas) - -- [Query audit device logs](/vault/tutorials/monitoring/query-audit-device-logs) - -#### Token type - -In some use cases, batch tokens can be a better fit than service tokens with respect to lease explosion. Review the following resources for help deciding when to use batch tokens and when to use service tokens: - -- [Vault service tokens vs batch tokens](/vault/tutorials/tokens/batch-tokens#service-tokens-vs-batch-tokens) - -- [Service vs batch token lease handling](/vault/docs/concepts/tokens#service-vs-batch-token-lease-handling) - -## Mitigating lease explosions - -Ultimately, the number of leases a system can handle is unique to the Vault deployment and environment. - -### Increase resources - -Increasing available resources in your Vault cluster can help mitigate lease explosion and allow for cluster recovery. Review [hardware sizing](/well-architected-framework/zero-trust-security/raft-reference-architecture#hardware-sizing-for-vault-servers), and focus on increasing available RAM. - -#### Within Vault - -Use the information from the Implementation guardrails section to adjust TTL values from the default values according to your use case needs. 
- -#### External to Vault - -You can use firewalls or load balancers to limit API calls to Vault from aberrant clients.) - -[Knowledge base article around load balancing](https://support.hashicorp.com/hc/en-us/articles/14496042865427-Vault-Global-Load-Balancing-Patterns) -[Vault & load balancing](/vault/tutorials/day-one-raft/raft-reference-architecture#load-balancer-recommendations) - -## Cleaning up environment after lease explosions - -Once the acute event subsides, the Vault active node will continue to purge leases. Sometimes, the explosion is so great, you will need to manually intervene to revoke [leases](/vault/api-docs/system/leases). If you are running a version of Vault prior to 1.13.0, this lease revocation can cause further performance degradation. - -Revoking or forcefully revoking leases is potentially a dangerous operation. You should ensure that you have recent valid snapshots of the cluster. Users of Vault versions prior to 1.13.0 on integrated storage must also perform freelist compaction. Vault Enterprise customers should consider proactively contacting the [Customer Support team](https://support.hashicorp.com) for help with this process. - -## Additional resources - -Proactive monitoring and periodic usage analysis are some of the best practices for Vault operators. Review the following resources for more details. 
- - [Vault key metrics for common health checks](/well-architected-framework/reliability/reliability-vault-monitoring-key-metrics) - - [Troubleshoot irrevocable leases](/vault/tutorials/monitoring/troubleshoot-irrevocable-leases) - - [Troubleshooting Vault](/vault/tutorials/monitoring/troubleshooting-vault) diff --git a/website/content/docs/configuration/create-lease-count-quota.mdx b/website/content/docs/configuration/create-lease-count-quota.mdx new file mode 100644 index 0000000000..dc03264fc1 --- /dev/null +++ b/website/content/docs/configuration/create-lease-count-quota.mdx @@ -0,0 +1,185 @@ +--- +layout: docs +page_title: Create a lease count quota +description: >- + Step-by-step instructions for creating lease count quotas for an + authentication plugin +--- + +# Create a lease count quota + +Use lease count quotas to limit the number of leases generated on a per-mount +basis and control resource consumption for your Vault instance where hard +limits make sense. + +## Before you start + +- **Confirm you have access to the root or administration namespace for your + Vault instance**. Modifying lease count quotas is a restricted activity. + + +## Step 1: Determine the appropriate granularity + +The granularity of your lease limits can affect the performance of your Vault +cluster. In particular, if your lease limits cause the number of rejected +requests to increase dramatically, the increased audit logging may impact Vault +performance. + +Review past system behavior to identify whether the quota limits should be +inheritable or limited to a specific role. 
+ +## Step 2: Apply the count quota + + + + + +Use `vault write` and the `sys/quotas/lease-count/{quota-name}` mount path to +create a new lease count quota: + +```shell-session +$ vault write \ + sys/quotas/lease-count/ \ + name="" \ + path="" \ + role="" \ + max_leases= +``` + +For example, to create a targeted quota limit called **webapp-tokens** on the +`webapp` role for the `approle` plugin at the default mount path: + +```shell-session +$ vault write \ + sys/quotas/lease-count/webapp-tokens \ + name="webapp-tokens" \ + path="auth/approle" \ + role="webapp" \ + max_leases=100 + +Success! Data written to: sys/quotas/lease-count/webapp-tokens +``` + + + + +1. Create a payload file with your quota settings. + + ```json + { + "name": "", + "path": "", + "role": "", + "max_leases": + } + ``` + + For example, to create a targeted quota limit called **webapp-tokens** on the + `webapp` role for the `approle` plugin at the default mount path: + + ```json + { + "name": "webapp-tokens", + "path": "auth/approle", + "role": "webapp", + "max_leases": 100 + } + ``` + +1. Call the `/sys/quotas/lease-count/{quota-name}` endpoint to apply the lease + count quota. For example, to apply the `webapp-tokens` quota: + + ```shell-session + $ curl \ + --request POST \ + --header "X-Vault-Token: ${VAULT_TOKEN}" \ + --data @payload.json \ + ${VAULT_ADDR}/v1/sys/quotas/lease-count/webapp-tokens + ``` + + + + The `/sys/quotas/lease-count/{quota-name}` endpoint succeeds silently. 
+ + + + + + + +## Step 3: Confirm the quota settings + + + + + +Use `vault read` and the `sys/quotas/lease-count/{quota-name}` mount path to +display the lease count quota details: + +```shell-session +$ vault read sys/quotas/lease-count/ +``` + +For example, to read the **webapp-tokens** quota details: + +```shell-session +$ vault read sys/quotas/lease-count/webapp-tokens + +Key Value +--- ----- +counter 0 +inheritable true +max_leases 100 +name webapp-tokens +path auth/approle/ +role webapp +type lease-count +``` + + + + + +Call the `sys/quotas/lease-count/{quota-name}` endpoint to display the lease +count quota details. For example, to read the **webapp-tokens** quota details: + +```shell-session +$ curl \ + --header "X-Vault-Token: ${VAULT_TOKEN}" \ + --request GET \ + --silent \ + ${VAULT_ADDR}/v1/sys/quotas/lease-count/webapp-tokens | jq + +{ + "request_id": "188e22f1-dc1a-251a-a0a1-005e256fe70f", + "lease_id": "", + "renewable": false, + "lease_duration": 0, + "data": { + "counter": 0, + "inheritable": false, + "max_leases": 100, + "name": "webapp-tokens", + "path": "auth/approle/", + "role": "webapp", + "type": "lease-count" + }, + "wrap_info": null, + "warnings": null, + "auth": null +} +``` + + + + + +## Next steps + +Proactive monitoring and periodic usage analysis can help you identify potential +problems before they escalate. + +- Brush up on [general Vault resource quotas](/vault/docs/concepts/resource-quotas). +- Learn about [lease count quotas for Vault Enterprise](/vault/docs/enterprise/lease-count-quotas). +- Learn how to [query audit device logs](/vault/tutorials/monitoring/query-audit-device-logs). +- Review [key Vault metrics for common health checks](/well-architected-framework/reliability/reliability-vault-monitoring-key-metrics). 
\ No newline at end of file diff --git a/website/content/docs/configuration/prevent-lease-explosions.mdx b/website/content/docs/configuration/prevent-lease-explosions.mdx new file mode 100644 index 0000000000..3029c7ed45 --- /dev/null +++ b/website/content/docs/configuration/prevent-lease-explosions.mdx @@ -0,0 +1,105 @@ +--- +layout: docs +page_title: Prevent lease explosions +description: >- + Learn how to prevent lease explosions in Vault. +--- + +# Prevent lease explosions + +As your Vault environment scales to meet deployment needs, you run the risk of +lease explosions. Lease explosions can occur when a Vault cluster is +over-subscribed and clients overwhelm system resources with consistent, +high-volume API requests. + +Unchecked lease explosions create a memory drain on the active node, which can +cascade to other nodes and result in denial-of-service issues for the entire +cluster. + +## Look for early warning signs + +Cleaning up after a lease explosion is time consuming and resource intensive, so +we strongly recommend monitoring your Vault instance for signals that your +Vault deployment has matured and requires tuning: + +Issue | Possible cause +-------------------------------------------------------------------------------- | -------------- +Unused leases consume storage space for extended periods while waiting to expire | The TTL values for dynamic secret leases or authentication tokens may be too high +Lease revocation fails frequently | Failures in an external service (e.g., for dynamic secrets) +Build up of leases associated with unused credentials | Clients are not reusing valid, existing leases +Lease revocation is slow | Insufficient IOPS for the storage backend +Rapid lease count growth disproportionate to the number of clients | Misconfiguration or anti-patterns in client usage + + +## Enforce client best practices + +High lease counts can degrade system performance: + +- Use the smallest default time-to-live (TTL) possible for tokens and 
leases to + avoid excessive unexpired lease backlogs and high-volume, simultaneous + expirations. +- Review telemetry for aberrant client behavior that might lead to rapid + over-subscription. +- Limit the number of simultaneous dynamic secret requests and service token + authentication requests. +- Ensure that machine clients adhere to [recommended AppRole patterns](/vault/tutorials/recommended-patterns/pattern-approle). +- Review [AppRole best practices](https://www.hashicorp.com/blog/how-and-why-to-use-approle-correctly-in-hashicorp-vault). + +## Set reasonable TTL guardrails + +Choose appropriate defaults for your situation and use resource quotas as +guardrails against lease explosion. You can set default and maximum TTLs +globally, in the mount configuration for a specific authN or secrets plugin, and +at the role-level (e.g., database credential roles). + +Vault prioritizes TTL values by granularity: + +- Global values act as the default. +- Plugin TTL values override global values. +- Role, group, and user level TTL values override plugin and global values. + + + + Leases and tokens keep the TTL value in effect during their creation. When you + adjust TTL values, the new limits only apply to leases and tokens issued after + you deploy the changes. + + + +## Monitor key metrics and logs + +Proactive monitoring is key to finding problematic behavior and usage patterns +before they escalate: + +- Review [key Vault metrics](/well-architected-framework/reliability/reliability-vault-monitoring-key-metrics) +- Understand [metric anti-patterns](/well-architected-framework/operational-excellence/security-vault-anti-patterns#poor-metrics-or-no-telemetry-data) +- Monitor [Vault audit device logs](/vault/tutorials/monitoring/monitor-telemetry-audit-splunk) for quota-related failures. 
+ +## Control resource usage with quotas + +Use API rate limiting quotas and +[lease count quotas](/vault/tutorials/operations/resource-quotas#lease-count-quotas) +to limit the number of leases generated on a per-mount basis and control +resource consumption for your Vault instance where hard limits make sense. + +## Consider batch tokens + +If your environment inherently leads to a large number of lease requests, +consider using batch tokens over service tokens. + +The following resources can help you decide if batch tokens are reasonable for +your situation: + +- [Vault service tokens vs batch tokens](/vault/tutorials/tokens/batch-tokens#service-tokens-vs-batch-tokens) +- [Service vs batch token lease handling](/vault/docs/concepts/tokens#service-vs-batch-token-lease-handling) + +## Next steps + +Proactive monitoring and periodic usage analysis can help you identify potential +problems before they escalate. + +- Brush up on [general Vault resource quotas](/vault/docs/concepts/resource-quotas). +- Learn about [lease count quotas for Vault Enterprise](/vault/docs/enterprise/lease-count-quotas). +- Learn how to [query audit device logs](/vault/tutorials/monitoring/query-audit-device-logs). +- Review [recommended Vault lease limits](/vault/docs/internals/limits#lease-limits). +- Review [lease anti-patterns](/well-architected-framework/operational-excellence/security-vault-anti-patterns#not-adjusting-the-default-lease-time) for a clear explanation of the issue and solution. diff --git a/website/content/docs/troubleshoot/generate-root-token.mdx b/website/content/docs/troubleshoot/generate-root-token.mdx new file mode 100644 index 0000000000..8f39d6463b --- /dev/null +++ b/website/content/docs/troubleshoot/generate-root-token.mdx @@ -0,0 +1,161 @@ +--- +layout: docs +page_title: Regenerate a Vault root token +description: >- + Regenerate a lost or revoked root token. 
+--- + +# Regenerate a Vault root token + +Your Vault root token is a special token that gives you access to **all** Vault +operations. Best practice is to enable an appropriate authentication method for +Vault admins once the server is running and revoke the root token. + +For emergency situations where you require a root token, you can use the +[`operator generate-root`](/vault/docs/commands/operator/generate-root) CLI +command and a one-time password (OTP) or Pretty Good Privacy (PGP) to generate +a new root token. + +## Before you start + +- **You need your Vault keys**. If you use auto-unseal, you need your + [recovery](/vault/docs/concepts/seal#recovery-key) keys, otherwise you need + your unseal keys. +- **Identify current key holders**. You must distribute the token nonce to your + unseal/recovery key holders during root token generation. + +## Step 1: Create a root token nonce + +1. Generate a token nonce for your new root token: + + + + + **You need the returned OTP value to decode the new root token**. + + ```shell-session + $ vault operator generate-root -init + + A One-Time-Password has been generated for you and is shown in the OTP field. + You will need this value to decode the resulting root token, so keep it safe. + Nonce 15565c79-cc9e-5e64-b986-8506e7bd1918 + Started true + Progress 0/1 + Complete false + OTP 5JFQaH76Ky2TIuSt4SPvO1CGkx + OTP Length 26 + ``` + + + + + Use the `-pgp-key` option to provide a path to your PGP public key or Keybase + username to encrypt the new root token. **You will need the returned PGP + value to decode the new root token**. + + ```shell-session + $ vault operator generate-root -init -pgp-key=keybase:sethvargo + + Nonce e24dec5e-f1ea-2dfe-ecce-604022006976 + Started true + Progress 0/5 + Complete false + PGP Fingerprint e2f8e2974623ba2a0e933a59c921994f9c27e0ff + ``` + + + + +1. Distribute the nonce to each of your unseal/recovery key holders. 
+ +## Step 2: Establish key quorum with the token nonce + + + + If you use a TTY, the `operator generate-root` command prompts for your key + and automatically completes the nonce value. + + + +1. Have each unseal/recovery key holder run `operator generate-root` with their + key and the distributed nonce value: + + ```shell-session + $ echo ${UNSEAL_OR_RECOVERY_KEY} | vault operator generate-root -nonce=${NONCE_VALUE} - + + Root generation operation nonce: f67f4da3-4ae4-68fb-4716-91da6b609c3e + Unseal Key (will be hidden): + ``` + +1. Vault returns the new, encoded root token to the user who triggers quorum: + + + + + ```shell-session + Nonce f67f4da3-4ae4-68fb-4716-91da6b609c3e + Started true + Progress 5/5 + Complete true + Encoded Token IxJpyqxn3YafOGhqhvP6cQ== + ``` + + + + + + ```shell-session + Nonce e24dec5e-f1ea-2dfe-ecce-604022006976 + Started true + Progress 1/1 + Complete true + PGP Fingerprint e2f8e2974623ba2a0e933a59c921994f9c27e0ff + Encoded Token wcFMA0RVkFtoqzRlARAAI3Ux8kdSpfgXdF9mg... + ``` + + + + +## Step 3: Decode the new root token + +Decode the new root token using OTP or PGP. + + + + +Use `operator generate-root` and the OTP value from nonce generation to decode +the new root token: + +```shell-session +$ vault operator generate-root \ + -decode=${ENCODED_TOKEN} \ + -otp=${NONCE_OTP} + +hvs.XXXXXXXXXXXXXXXXXXXXXXXX +``` + + + + + +Use your PGP credentials and `gpg` or `keybase` to decrypt the new root token.
+ + +**`gpg`**: + +```shell-session +$ echo ${ENCODED_TOKEN} | base64 --decode | gpg --decrypt + +hvs.XXXXXXXXXXXXXXXXXXXXXXXX +``` + +**`keybase`**: + +```shell-session +$ echo ${ENCODED_TOKEN} | base64 --decode | keybase pgp decrypt + +hvs.XXXXXXXXXXXXXXXXXXXXXXXX +``` + + + diff --git a/website/content/docs/troubleshoot/lease-issues.mdx b/website/content/docs/troubleshoot/lease-issues.mdx new file mode 100644 index 0000000000..4c39a34f84 --- /dev/null +++ b/website/content/docs/troubleshoot/lease-issues.mdx @@ -0,0 +1,122 @@ +--- +layout: docs +page_title: Lease problems +description: >- + Troubleshoot lease problems in Vault. +--- + +# Troubleshoot lease problems + +Explanations, workarounds, and solutions for common lease problems in Vault. + +## `429 - Too Many Requests` + +### Problem + +Vault returns a `429 - Too Many Requests` response when users try to +authenticate. For example: + + + +```text +Error making API request. + +URL: PUT https://127.0.0.1:61555/v1/auth/userpass/login/foo +Code: 429. Errors: + +* 1 error occurred: + * request path "auth/userpass/login/foo": lease count quota exceeded +``` + + + +### Cause + +Vault returns a `429 - Too Many Requests` response if a new lease request +violates the configured lease quota limit. + +To guard against [lease explosions](/vault/docs/configuration/prevent-lease-explosions), +Vault rejects authentication requests if completing the request would violate +the configured lease quota limit. + +### Solution + +1. Correct any client-side errors that may cause excessive lease creation. +1. Determine if your resource needs have changed and complete the + [Protecting Vault with Resource Quotas](/vault/tutorials/operations/resource-quotas) + tutorial to determine new, appropriate defaults. +1. Use the [`vault lease`](/vault/docs/commands/lease) CLI command or + [lease count quota endpoint](/vault/api-docs/system/lease-count-quotas) to + tune your lease count quota.
+ + + Consider making short-term changes to your lease quotas when you expect a + significant increase in lease creation. For example, when you release a new + feature or complete a marketing push to increase your user base. + + + +## Lease explosion (degraded performance) + +### Problem + +Your Vault nodes are out of memory and unresponsive to new lease requests. + +### Cause + +Clients have caused a lease explosion with consistent, high-volume API requests. + + + + Unchecked lease explosions create cascading denial-of-service issues for the + active node that can result in denial-of-service issues for the entire + cluster. + + + +### Solution + +To resolve a lease explosion, you need to mitigate the problem to stabilize +Vault and provide space for cluster recovery, then clean up your Vault +environment. + +1. Mitigate resource stress by adjusting TTL values for your Vault instance: + + Config level | Parameter | Precedence + -------------------- | ---------------------- | ----------- + Database plugin | `ttl` or `default_ttl` | first + Database plugin | `max_ttl` | first + AuthN/secrets plugin | `ttl` or `default_ttl` | second + AuthN/secrets plugin | `max_ttl` | second + Vault | `default_lease_ttl` | last + Vault | `max_lease_ttl` | last + + **Granular TTLs on a role, group, or user level always override plugin and + system-wide TTL values**. + +1. Use firewalls or load balancers to limit API calls to Vault from aberrant + clients and reduce load on the struggling cluster. + +1. Once the cluster stabilizes, check the active node to determine if you can + wait for it to purge leases automatically or if you need to speed up the + process by manually revoking leases. + +1. If the cluster requires manual intervention, confirm you have a recent, valid + snapshot of the cluster. + +1. Once you confirm a valid snapshot of the cluster exists, use + [`vault lease revoke`](/vault/docs/commands/lease/revoke) to manually revoke + the offending leases.
+ + + + Revoking or forcefully revoking leases is potentially a dangerous operation. + Do not proceed without a valid snapshot. If you have a valid Vault + Enterprise license, consider contacting the + [HashiCorp Customer Support team](https://support.hashicorp.com/) for help. + + + +### Related tutorials + +- [Troubleshoot irrevocable leases](/vault/tutorials/monitoring/troubleshoot-irrevocable-leases) diff --git a/website/data/docs-nav-data.json b/website/data/docs-nav-data.json index f3d3656149..f6007840af 100644 --- a/website/data/docs-nav-data.json +++ b/website/data/docs-nav-data.json @@ -180,14 +180,6 @@ "title": "Lease, Renew, and Revoke", "path": "concepts/lease" }, - { - "title": "Lease Explosions", - "path": "concepts/lease-explosions" - }, - { - "title": "Lease count quota exceeded", - "path": "concepts/lease-count-quota-exceeded" - }, { "title": "Authentication", "path": "concepts/auth" @@ -371,6 +363,14 @@ "title": "Manage resources programmatically", "path": "configuration/programmatic-management" }, + { + "title": "Prevent lease explosions", + "path": "configuration/prevent-lease-explosions" + }, + { + "title": "Create a lease count quota", + "path": "configuration/create-lease-count-quota" + }, { "title": "listener", "routes": [ @@ -1999,8 +1999,21 @@ "path": "interoperability-matrix" }, { - "title": "Troubleshoot", - "href": "https://learn.hashicorp.com/tutorials/vault/troubleshooting-vault" + "title": "Troubleshoot Vault", + "routes": [ + { + "title": "Generate a root token", + "path": "troubleshoot/generate-root-token" + }, + { + "title": "Troubleshoot lease errors", + "path": "troubleshoot/lease-issues" + }, + { + "title": "Troubleshooting tutorials", + "href": "https://learn.hashicorp.com/tutorials/vault/troubleshooting-vault" + } + ] }, { "divider": true diff --git a/website/redirects.js b/website/redirects.js index e5ee5fb250..d4bcfd1c3e 100644 --- a/website/redirects.js +++ b/website/redirects.js @@ -114,5 +114,15 @@ module.exports = [ 
source: '/vault/docs/deprecation/faq', destination: '/vault/docs/deprecation', permanent: true, + }, + { + source: '/vault/docs/concepts/lease-explosions', + destination: '/vault/docs/troubleshoot/lease-explosions', + permanent: true, + }, + { + source: '/vault/docs/concepts/lease-count-quota-exceeded', + destination: '/vault/docs/troubleshoot/lease-count-quota-exceeded', + permanent: true, } ]