tsdb: Deprecate retention flags; add tsdb.retention runtime configuration (#17026)

* Move storage retention settings from command-line flags to the config file

Signed-off-by: pipiland2612 <nguyen.t.dang.minh@gmail.com>

* Fix .md

Signed-off-by: pipiland2612 <nguyen.t.dang.minh@gmail.com>

* run make cli-documentation

Signed-off-by: pipiland2612 <nguyen.t.dang.minh@gmail.com>

* fix

Signed-off-by: pipiland2612 <nguyen.t.dang.minh@gmail.com>

* run make cli-documentation

Signed-off-by: pipiland2612 <nguyen.t.dang.minh@gmail.com>

* nit_fixed

Signed-off-by: pipiland2612 <nguyen.t.dang.minh@gmail.com>

* fix

Signed-off-by: pipiland2612 <nguyen.t.dang.minh@gmail.com>

* add test and update configuration.md

Signed-off-by: pipiland2612 <nguyen.t.dang.minh@gmail.com>

* fix lint

Signed-off-by: pipiland2612 <nguyen.t.dang.minh@gmail.com>

---------

Signed-off-by: pipiland2612 <nguyen.t.dang.minh@gmail.com>
This commit is contained in:
Minh Nguyen 2025-10-27 16:51:33 +02:00 committed by GitHub
parent 6bb367970e
commit ad4b59c504
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 156 additions and 8 deletions

View File

@ -447,10 +447,10 @@ func main() {
"Size at which to split the tsdb WAL segment files. Example: 100MB"). "Size at which to split the tsdb WAL segment files. Example: 100MB").
Hidden().PlaceHolder("<bytes>").BytesVar(&cfg.tsdb.WALSegmentSize) Hidden().PlaceHolder("<bytes>").BytesVar(&cfg.tsdb.WALSegmentSize)
serverOnlyFlag(a, "storage.tsdb.retention.time", "How long to retain samples in storage. If neither this flag nor \"storage.tsdb.retention.size\" is set, the retention time defaults to "+defaultRetentionString+". Units Supported: y, w, d, h, m, s, ms."). serverOnlyFlag(a, "storage.tsdb.retention.time", "[DEPRECATED] How long to retain samples in storage. If neither this flag nor \"storage.tsdb.retention.size\" is set, the retention time defaults to "+defaultRetentionString+". Units Supported: y, w, d, h, m, s, ms. This flag has been deprecated, use the storage.tsdb.retention.time field in the config file instead.").
SetValue(&cfg.tsdb.RetentionDuration) SetValue(&cfg.tsdb.RetentionDuration)
serverOnlyFlag(a, "storage.tsdb.retention.size", "Maximum number of bytes that can be stored for blocks. A unit is required, supported units: B, KB, MB, GB, TB, PB, EB. Ex: \"512MB\". Based on powers-of-2, so 1KB is 1024B."). serverOnlyFlag(a, "storage.tsdb.retention.size", "[DEPRECATED] Maximum number of bytes that can be stored for blocks. A unit is required, supported units: B, KB, MB, GB, TB, PB, EB. Ex: \"512MB\". Based on powers-of-2, so 1KB is 1024B. This flag has been deprecated, use the storage.tsdb.retention.size field in the config file instead.").
BytesVar(&cfg.tsdb.MaxBytes) BytesVar(&cfg.tsdb.MaxBytes)
serverOnlyFlag(a, "storage.tsdb.no-lockfile", "Do not create lockfile in data directory."). serverOnlyFlag(a, "storage.tsdb.no-lockfile", "Do not create lockfile in data directory.").
@ -671,6 +671,14 @@ func main() {
} }
if cfgFile.StorageConfig.TSDBConfig != nil { if cfgFile.StorageConfig.TSDBConfig != nil {
cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
if cfgFile.StorageConfig.TSDBConfig.Retention != nil {
if cfgFile.StorageConfig.TSDBConfig.Retention.Time > 0 {
cfg.tsdb.RetentionDuration = cfgFile.StorageConfig.TSDBConfig.Retention.Time
}
if cfgFile.StorageConfig.TSDBConfig.Retention.Size > 0 {
cfg.tsdb.MaxBytes = cfgFile.StorageConfig.TSDBConfig.Retention.Size
}
}
} }
// Set Go runtime parameters before we get too far into initialization. // Set Go runtime parameters before we get too far into initialization.

View File

@ -1051,6 +1051,15 @@ type StorageConfig struct {
ExemplarsConfig *ExemplarsConfig `yaml:"exemplars,omitempty"` ExemplarsConfig *ExemplarsConfig `yaml:"exemplars,omitempty"`
} }
// TSDBRetentionConfig holds the retention settings for data in TSDB storage.
// It is read from the `retention` key of the TSDB section of the config file.
// Consumers only apply a field when its value is greater than zero, so a zero
// value leaves the corresponding setting untouched.
type TSDBRetentionConfig struct {
	// Time is how long to retain samples in storage.
	Time model.Duration `yaml:"time,omitempty"`
	// Size is the maximum number of bytes that can be stored for blocks.
	Size units.Base2Bytes `yaml:"size,omitempty"`
}
// TSDBConfig configures runtime reloadable configuration options. // TSDBConfig configures runtime reloadable configuration options.
type TSDBConfig struct { type TSDBConfig struct {
// OutOfOrderTimeWindow sets how long back in time an out-of-order sample can be inserted // OutOfOrderTimeWindow sets how long back in time an out-of-order sample can be inserted
@ -1063,6 +1072,8 @@ type TSDBConfig struct {
// During unmarshall, this is converted into milliseconds and stored in OutOfOrderTimeWindow. // During unmarshall, this is converted into milliseconds and stored in OutOfOrderTimeWindow.
// This should not be used directly and must be converted into OutOfOrderTimeWindow. // This should not be used directly and must be converted into OutOfOrderTimeWindow.
OutOfOrderTimeWindowFlag model.Duration `yaml:"out_of_order_time_window,omitempty"` OutOfOrderTimeWindowFlag model.Duration `yaml:"out_of_order_time_window,omitempty"`
Retention *TSDBRetentionConfig `yaml:"retention,omitempty"`
} }
// UnmarshalYAML implements the yaml.Unmarshaler interface. // UnmarshalYAML implements the yaml.Unmarshaler interface.

View File

@ -1706,6 +1706,10 @@ var expectedConf = &Config{
TSDBConfig: &TSDBConfig{ TSDBConfig: &TSDBConfig{
OutOfOrderTimeWindow: 30 * time.Minute.Milliseconds(), OutOfOrderTimeWindow: 30 * time.Minute.Milliseconds(),
OutOfOrderTimeWindowFlag: model.Duration(30 * time.Minute), OutOfOrderTimeWindowFlag: model.Duration(30 * time.Minute),
Retention: &TSDBRetentionConfig{
Time: model.Duration(24 * time.Hour),
Size: 1 * units.GiB,
},
}, },
}, },
TracingConfig: TracingConfig{ TracingConfig: TracingConfig{

View File

@ -453,6 +453,9 @@ alerting:
storage: storage:
tsdb: tsdb:
out_of_order_time_window: 30m out_of_order_time_window: 30m
retention:
time: 1d
size: 1GB
tracing: tracing:
endpoint: "localhost:4317" endpoint: "localhost:4317"

View File

@ -34,8 +34,8 @@ The Prometheus monitoring server
| <code class="text-nowrap">--web.page-title</code> | Document title of Prometheus instance. | `Prometheus Time Series Collection and Processing Server` | | <code class="text-nowrap">--web.page-title</code> | Document title of Prometheus instance. | `Prometheus Time Series Collection and Processing Server` |
| <code class="text-nowrap">--web.cors.origin</code> | Regex for CORS origin. It is fully anchored. Example: 'https?://(domain1\|domain2)\.com' | `.*` | | <code class="text-nowrap">--web.cors.origin</code> | Regex for CORS origin. It is fully anchored. Example: 'https?://(domain1\|domain2)\.com' | `.*` |
| <code class="text-nowrap">--storage.tsdb.path</code> | Base path for metrics storage. Use with server mode only. | `data/` | | <code class="text-nowrap">--storage.tsdb.path</code> | Base path for metrics storage. Use with server mode only. | `data/` |
| <code class="text-nowrap">--storage.tsdb.retention.time</code> | How long to retain samples in storage. If neither this flag nor "storage.tsdb.retention.size" is set, the retention time defaults to 15d. Units Supported: y, w, d, h, m, s, ms. Use with server mode only. | | | <code class="text-nowrap">--storage.tsdb.retention.time</code> | [DEPRECATED] How long to retain samples in storage. If neither this flag nor "storage.tsdb.retention.size" is set, the retention time defaults to 15d. Units Supported: y, w, d, h, m, s, ms. This flag has been deprecated, use the storage.tsdb.retention.time field in the config file instead. Use with server mode only. | |
| <code class="text-nowrap">--storage.tsdb.retention.size</code> | Maximum number of bytes that can be stored for blocks. A unit is required, supported units: B, KB, MB, GB, TB, PB, EB. Ex: "512MB". Based on powers-of-2, so 1KB is 1024B. Use with server mode only. | | | <code class="text-nowrap">--storage.tsdb.retention.size</code> | [DEPRECATED] Maximum number of bytes that can be stored for blocks. A unit is required, supported units: B, KB, MB, GB, TB, PB, EB. Ex: "512MB". Based on powers-of-2, so 1KB is 1024B. This flag has been deprecated, use the storage.tsdb.retention.size field in the config file instead. Use with server mode only. | |
| <code class="text-nowrap">--storage.tsdb.no-lockfile</code> | Do not create lockfile in data directory. Use with server mode only. | `false` | | <code class="text-nowrap">--storage.tsdb.no-lockfile</code> | Do not create lockfile in data directory. Use with server mode only. | `false` |
| <code class="text-nowrap">--storage.tsdb.head-chunks-write-queue-size</code> | Size of the queue through which head chunks are written to the disk to be m-mapped, 0 disables the queue completely. Experimental. Use with server mode only. | `0` | | <code class="text-nowrap">--storage.tsdb.head-chunks-write-queue-size</code> | Size of the queue through which head chunks are written to the disk to be m-mapped, 0 disables the queue completely. Experimental. Use with server mode only. | `0` |
| <code class="text-nowrap">--storage.agent.path</code> | Base path for metrics storage. Use with agent mode only. | `data-agent/` | | <code class="text-nowrap">--storage.agent.path</code> | Base path for metrics storage. Use with agent mode only. | `data-agent/` |

View File

@ -3229,6 +3229,26 @@ with this feature.
# the agent's WAL to accept out-of-order samples that fall within the specified time window relative # the agent's WAL to accept out-of-order samples that fall within the specified time window relative
# to the timestamp of the last appended sample for the same series. # to the timestamp of the last appended sample for the same series.
[ out_of_order_time_window: <duration> | default = 0s ] [ out_of_order_time_window: <duration> | default = 0s ]
# Configures data retention settings for TSDB.
#
# Note: When retention is changed at runtime, the retention
# settings are updated immediately, but block deletion based on the new retention policy
# occurs during the next block reload cycle. This happens automatically within 1 minute
# or when a compaction completes, whichever comes first.
[ retention: <retention> ] :
# How long to retain samples in storage. If neither this option nor the size option
# is set, the retention time defaults to 15d. Units Supported: y, w, d, h, m, s, ms.
# This option takes precedence over the deprecated command-line flag --storage.tsdb.retention.time.
[ time: <duration> | default = 15d ]
# Maximum number of bytes that can be stored for blocks. A unit is required,
# supported units: B, KB, MB, GB, TB, PB, EB. Ex: "512MB". Based on powers-of-2, so 1KB is 1024B.
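# If set to 0 or not set, this option is ignored and any previously effective limit
# (for example from the deprecated command-line flag) is kept; if no limit is in
# effect, size-based retention is disabled.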
# This option takes precedence over the deprecated command-line flag --storage.tsdb.retention.size.
[ size: <size> | default = 0 ]
``` ```
### `<exemplars>` ### `<exemplars>`

View File

@ -264,6 +264,10 @@ type DB struct {
autoCompactMtx sync.Mutex autoCompactMtx sync.Mutex
autoCompact bool autoCompact bool
// retentionMtx protects access to retention configuration values that can
// be updated at runtime through config file changes.
retentionMtx sync.RWMutex
// Cancel a running compaction when a shutdown is initiated. // Cancel a running compaction when a shutdown is initiated.
compactCancel context.CancelFunc compactCancel context.CancelFunc
@ -1153,6 +1157,20 @@ func (db *DB) ApplyConfig(conf *config.Config) error {
oooTimeWindow := int64(0) oooTimeWindow := int64(0)
if conf.StorageConfig.TSDBConfig != nil { if conf.StorageConfig.TSDBConfig != nil {
oooTimeWindow = conf.StorageConfig.TSDBConfig.OutOfOrderTimeWindow oooTimeWindow = conf.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
// Update retention configuration if provided.
if conf.StorageConfig.TSDBConfig.Retention != nil {
db.retentionMtx.Lock()
if conf.StorageConfig.TSDBConfig.Retention.Time > 0 {
db.opts.RetentionDuration = int64(conf.StorageConfig.TSDBConfig.Retention.Time)
db.metrics.retentionDuration.Set((time.Duration(db.opts.RetentionDuration) * time.Millisecond).Seconds())
}
if conf.StorageConfig.TSDBConfig.Retention.Size > 0 {
db.opts.MaxBytes = int64(conf.StorageConfig.TSDBConfig.Retention.Size)
db.metrics.maxBytes.Set(float64(db.opts.MaxBytes))
}
db.retentionMtx.Unlock()
}
} }
if oooTimeWindow < 0 { if oooTimeWindow < 0 {
oooTimeWindow = 0 oooTimeWindow = 0
@ -1187,6 +1205,20 @@ func (db *DB) ApplyConfig(conf *config.Config) error {
return nil return nil
} }
// getRetentionDuration reports the retention duration currently held in
// db.opts. The value is read while holding the read side of retentionMtx so
// that it does not race with the updates ApplyConfig performs under the
// write lock.
func (db *DB) getRetentionDuration() int64 {
	db.retentionMtx.RLock()
	retention := db.opts.RetentionDuration
	db.retentionMtx.RUnlock()
	return retention
}
// getMaxBytes reports the size-based retention limit currently held in
// db.opts. The value is read while holding the read side of retentionMtx so
// that it does not race with the updates ApplyConfig performs under the
// write lock.
func (db *DB) getMaxBytes() int64 {
	db.retentionMtx.RLock()
	limit := db.opts.MaxBytes
	db.retentionMtx.RUnlock()
	return limit
}
// dbAppender wraps the DB's head appender and triggers compactions on commit // dbAppender wraps the DB's head appender and triggers compactions on commit
// if necessary. // if necessary.
type dbAppender struct { type dbAppender struct {
@ -1734,7 +1766,8 @@ func deletableBlocks(db *DB, blocks []*Block) map[ulid.ULID]struct{} {
// set in the db options. // set in the db options.
func BeyondTimeRetention(db *DB, blocks []*Block) (deletable map[ulid.ULID]struct{}) { func BeyondTimeRetention(db *DB, blocks []*Block) (deletable map[ulid.ULID]struct{}) {
// Time retention is disabled or no blocks to work with. // Time retention is disabled or no blocks to work with.
if len(blocks) == 0 || db.opts.RetentionDuration == 0 { retentionDuration := db.getRetentionDuration()
if len(blocks) == 0 || retentionDuration == 0 {
return return
} }
@ -1742,7 +1775,7 @@ func BeyondTimeRetention(db *DB, blocks []*Block) (deletable map[ulid.ULID]struc
for i, block := range blocks { for i, block := range blocks {
// The difference between the first block and this block is greater than or equal to // The difference between the first block and this block is greater than or equal to
// the retention period so any blocks after that are added as deletable. // the retention period so any blocks after that are added as deletable.
if i > 0 && blocks[0].Meta().MaxTime-block.Meta().MaxTime >= db.opts.RetentionDuration { if i > 0 && blocks[0].Meta().MaxTime-block.Meta().MaxTime >= retentionDuration {
for _, b := range blocks[i:] { for _, b := range blocks[i:] {
deletable[b.meta.ULID] = struct{}{} deletable[b.meta.ULID] = struct{}{}
} }
@ -1757,7 +1790,8 @@ func BeyondTimeRetention(db *DB, blocks []*Block) (deletable map[ulid.ULID]struc
// set in the db options. // set in the db options.
func BeyondSizeRetention(db *DB, blocks []*Block) (deletable map[ulid.ULID]struct{}) { func BeyondSizeRetention(db *DB, blocks []*Block) (deletable map[ulid.ULID]struct{}) {
// Size retention is disabled or no blocks to work with. // Size retention is disabled or no blocks to work with.
if len(blocks) == 0 || db.opts.MaxBytes <= 0 { maxBytes := db.getMaxBytes()
if len(blocks) == 0 || maxBytes <= 0 {
return return
} }
@ -1768,7 +1802,7 @@ func BeyondSizeRetention(db *DB, blocks []*Block) (deletable map[ulid.ULID]struc
blocksSize := db.Head().Size() blocksSize := db.Head().Size()
for i, block := range blocks { for i, block := range blocks {
blocksSize += block.Size() blocksSize += block.Size()
if blocksSize > db.opts.MaxBytes { if blocksSize > maxBytes {
// Add this and all following blocks for deletion. // Add this and all following blocks for deletion.
for _, b := range blocks[i:] { for _, b := range blocks[i:] {
deletable[b.meta.ULID] = struct{}{} deletable[b.meta.ULID] = struct{}{}

View File

@ -42,6 +42,7 @@ import (
"github.com/oklog/ulid/v2" "github.com/oklog/ulid/v2"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
prom_testutil "github.com/prometheus/client_golang/prometheus/testutil" prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/prometheus/common/model"
"github.com/prometheus/common/promslog" "github.com/prometheus/common/promslog"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"go.uber.org/atomic" "go.uber.org/atomic"
@ -1719,6 +1720,73 @@ func TestSizeRetentionMetric(t *testing.T) {
} }
} }
// TestRuntimeRetentionConfigChange checks that a shorter time-based retention
// supplied through ApplyConfig is stored on the DB, is reflected in the
// retention duration metric, and causes blocks that fall outside the new
// window to be dropped on the next block reload.
func TestRuntimeRetentionConfigChange(t *testing.T) {
	// All values below are expressed in milliseconds.
	var (
		initialRetentionMs = (10 * time.Hour).Milliseconds()
		shorterRetentionMs = time.Hour.Milliseconds()
		nineHoursMs        = (9 * time.Hour).Milliseconds()
		nineAndHalfHoursMs = (9*time.Hour + 30*time.Minute).Milliseconds()
	)

	db := openTestDB(t, &Options{RetentionDuration: initialRetentionMs}, []int64{100})
	defer func() { require.NoError(t, db.Close()) }()

	// Two blocks near time zero (far outside a 1h window measured from the
	// newest block) followed by two blocks covering the final hour.
	ranges := []struct{ mint, maxt int64 }{
		{0, 100},
		{100, 200},
		{nineHoursMs, nineAndHalfHoursMs},
		{nineAndHalfHoursMs, initialRetentionMs},
	}
	for _, r := range ranges {
		createBlock(t, db.Dir(), genSeries(10, 10, r.mint, r.maxt))
	}

	// With the initial 10h retention every block must survive a reload.
	require.NoError(t, db.reloadBlocks())
	require.Len(t, db.Blocks(), len(ranges), "expected all blocks to be loaded initially")

	// NOTE(review): the millisecond count is passed as a raw model.Duration,
	// which matches the plain int64 conversion done in ApplyConfig. If
	// model.Duration is nanosecond-based like time.Duration, a value parsed
	// from a config file (e.g. "1h") would not be in milliseconds - confirm
	// the unit handling in ApplyConfig.
	retentionCfg := &config.TSDBRetentionConfig{Time: model.Duration(shorterRetentionMs)}
	newConf := &config.Config{
		StorageConfig: config.StorageConfig{
			TSDBConfig: &config.TSDBConfig{Retention: retentionCfg},
		},
	}
	require.NoError(t, db.ApplyConfig(newConf), "ApplyConfig should succeed")

	// The new value must be visible through the accessor and the metric.
	require.Equal(t, shorterRetentionMs, db.getRetentionDuration(), "retention duration should be updated")

	wantSeconds := (time.Duration(shorterRetentionMs) * time.Millisecond).Seconds()
	gotSeconds := prom_testutil.ToFloat64(db.metrics.retentionDuration)
	require.Equal(t, wantSeconds, gotSeconds, "retention duration metric should be updated")

	// Retention is enforced on reload: only the two newest blocks remain.
	require.NoError(t, db.reloadBlocks())
	remaining := db.Blocks()
	require.Len(t, remaining, 2, "expected old blocks to be deleted after retention change")

	oldestKept, newestKept := remaining[0], remaining[1]
	require.Equal(t, nineHoursMs, oldestKept.meta.MinTime, "first remaining block should be within retention")
	require.Equal(t, initialRetentionMs, newestKept.meta.MaxTime, "last remaining block should be the newest")

	deletions := int(prom_testutil.ToFloat64(db.metrics.timeRetentionCount))
	require.Positive(t, deletions, "time retention count should be incremented")
}
func TestNotMatcherSelectsLabelsUnsetSeries(t *testing.T) { func TestNotMatcherSelectsLabelsUnsetSeries(t *testing.T) {
db := openTestDB(t, nil, nil) db := openTestDB(t, nil, nil)
defer func() { defer func() {