diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d12a4bef3d..6247e1927d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -142,9 +142,7 @@ jobs: enable_npm: false # NOTE: Those tests are based on https://github.com/prometheus/compliance and # are executed against the ./cmd/prometheus main package. - - run: go test -skip ${SKIP_TESTS} -v --tags=compliance ./compliance/... - env: - SKIP_TESTS: "TestRemoteWriteSender/prometheus/samples/rw2/start_timestamp*" # TODO(bwplotka): PROM-60 + - run: go test -v --tags=compliance ./compliance/... build: name: Build Prometheus for common architectures diff --git a/RELEASE.md b/RELEASE.md index 5a8f8601ab..5c29b0a522 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -20,7 +20,8 @@ Please see [the v2.55 RELEASE.md](https://github.com/prometheus/prometheus/blob/ | v3.8 | 2025-11-06 | Jan Fajerski (GitHub: @jan--f) | | v3.9 | 2025-12-18 | Bryan Boreham (GitHub: @bboreham) | | v3.10 | 2026-02-05 | Ganesh Vernekar (Github: @codesome) | -| v3.11 | 2026-03-19 | **volunteer welcome** | +| v3.11 | 2026-03-25 | Julien Pivotto (GitHub: @roidelapluie) | +| v3.12 | 2026-05-06 | **volunteer welcome** | If you are interested in volunteering please create a pull request against the [prometheus/prometheus](https://github.com/prometheus/prometheus) repository and propose yourself for the release series of your choice. diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index f77426d9ea..f46f1fa64d 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -271,6 +271,7 @@ func (c *flagConfig) setFeatureListOptions(logger *slog.Logger) error { case "created-timestamp-zero-ingestion": // NOTE(bwplotka): Once AppendableV1 is removed, there will be only the TSDB and agent flags. 
c.scrape.EnableStartTimestampZeroIngestion = true + c.scrape.ParseST = true c.web.STZeroIngestionEnabled = true c.tsdb.EnableSTAsZeroSample = true c.agent.EnableSTAsZeroSample = true @@ -279,16 +280,19 @@ func (c *flagConfig) setFeatureListOptions(logger *slog.Logger) error { // This is to widen the ST support surface. config.DefaultConfig.GlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols config.DefaultGlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols - logger.Info("Experimental start timestamp zero ingestion enabled. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) + logger.Info("Experimental start timestamp zero ingestion enabled. OpenMetrics 1.0 parsing will parse _created metrics as ST instead of normal sample. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) + case "xor2-encoding": + c.tsdb.EnableXOR2Encoding = true + logger.Info("Experimental XOR2 chunk encoding enabled.") case "st-storage": - // TODO(bwplotka): Implement ST Storage as per PROM-60 and document this hidden feature flag. + c.scrape.ParseST = true c.tsdb.EnableSTStorage = true c.agent.EnableSTStorage = true // Change relevant global variables. Hacky, but it's hard to pass a new option or default to unmarshallers. This is to widen the ST support surface. config.DefaultConfig.GlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols config.DefaultGlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols - logger.Info("Experimental start timestamp storage enabled. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) + logger.Info("Experimental start timestamp storage enabled. 
OpenMetrics 1.0 parsing will parse _created metrics as ST instead of normal sample. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) case "delayed-compaction": c.tsdb.EnableDelayedCompaction = true logger.Info("Experimental delayed compaction is enabled.") @@ -601,7 +605,7 @@ func main() { a.Flag("scrape.discovery-reload-interval", "Interval used by scrape manager to throttle target groups updates."). Hidden().Default("5s").SetValue(&cfg.scrape.DiscoveryReloadInterval) - a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details."). + a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, st-storage, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers, xor2-encoding. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details."). Default("").StringsVar(&cfg.featureList) a.Flag("agent", "Run Prometheus in 'Agent mode'.").BoolVar(&agentMode) @@ -671,6 +675,18 @@ func main() { os.Exit(2) } + // Set TSDB retention defaults from CLI flags before any config file is loaded. 
+ // This makes CLI flags act as the default when no retention section is present. + cliRetentionDuration := cfg.tsdb.RetentionDuration + cliMaxBytes := cfg.tsdb.MaxBytes + if cliRetentionDuration == 0 && cliMaxBytes == 0 { + cliRetentionDuration = defaultRetentionDuration + } + config.DefaultTSDBRetentionConfig = config.TSDBRetentionConfig{ + Time: cliRetentionDuration, + Size: cliMaxBytes, + } + // Throw error for invalid config before starting other components. var cfgFile *config.Config if cfgFile, err = config.LoadFile(cfg.configFile, agentMode, promslog.NewNopLogger()); err != nil { @@ -712,21 +728,11 @@ func main() { logger.Warn("The option --storage.tsdb.block-reload-interval is set to a value less than 1s. Setting it to 1s to avoid overload.") cfg.tsdb.BlockReloadInterval = model.Duration(1 * time.Second) } - if cfgFile.StorageConfig.TSDBConfig != nil { - cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow - cfg.tsdb.StaleSeriesCompactionThreshold = cfgFile.StorageConfig.TSDBConfig.StaleSeriesCompactionThreshold - if cfgFile.StorageConfig.TSDBConfig.Retention != nil { - if cfgFile.StorageConfig.TSDBConfig.Retention.Time > 0 { - cfg.tsdb.RetentionDuration = cfgFile.StorageConfig.TSDBConfig.Retention.Time - } - if cfgFile.StorageConfig.TSDBConfig.Retention.Size > 0 { - cfg.tsdb.MaxBytes = cfgFile.StorageConfig.TSDBConfig.Retention.Size - } - if cfgFile.StorageConfig.TSDBConfig.Retention.Percentage > 0 { - cfg.tsdb.MaxPercentage = cfgFile.StorageConfig.TSDBConfig.Retention.Percentage - } - } - } + cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow + cfg.tsdb.StaleSeriesCompactionThreshold = cfgFile.StorageConfig.TSDBConfig.StaleSeriesCompactionThreshold + cfg.tsdb.RetentionDuration = cfgFile.StorageConfig.TSDBConfig.Retention.Time + cfg.tsdb.MaxBytes = cfgFile.StorageConfig.TSDBConfig.Retention.Size + cfg.tsdb.MaxPercentage = cfgFile.StorageConfig.TSDBConfig.Retention.Percentage // Set Go 
runtime parameters before we get too far into initialization. updateGoGC(cfgFile, logger) @@ -778,11 +784,6 @@ func main() { cfg.web.RoutePrefix = "/" + strings.Trim(cfg.web.RoutePrefix, "/") if !agentMode { - if cfg.tsdb.RetentionDuration == 0 && cfg.tsdb.MaxBytes == 0 && cfg.tsdb.MaxPercentage == 0 { - cfg.tsdb.RetentionDuration = defaultRetentionDuration - logger.Info("No time, size or percentage retention was set so using the default time retention", "duration", defaultRetentionDuration) - } - // Check for overflows. This limits our max retention to 100y. if cfg.tsdb.RetentionDuration < 0 { y, err := model.ParseDuration("100y") @@ -1031,8 +1032,29 @@ func main() { reloaders := []reloader{ { - name: "db_storage", - reloader: localStorage.ApplyConfig, + name: "db_storage", + reloader: func() func(*config.Config) error { + lastTSDBRetention := config.TSDBRetentionConfig{} + return func(cfg *config.Config) error { + err := localStorage.ApplyConfig(cfg) + if err != nil || agentMode || cfg.StorageConfig.TSDBConfig == nil || cfg.StorageConfig.TSDBConfig.Retention == nil { + return err + } + + curr := cfg.StorageConfig.TSDBConfig.Retention + if *curr == lastTSDBRetention { + return nil + } + + logger.Info("TSDB retention updated", + "duration", curr.Time, + "size", curr.Size, + "percentage", curr.Percentage, + ) + lastTSDBRetention = *curr + return nil + } + }(), }, { name: "remote_storage", reloader: remoteStorage.ApplyConfig, @@ -2009,6 +2031,7 @@ type tsdbOptions struct { BlockReloadInterval model.Duration EnableSTAsZeroSample bool EnableSTStorage bool + EnableXOR2Encoding bool StaleSeriesCompactionThreshold float64 } @@ -2039,6 +2062,7 @@ func (opts tsdbOptions) ToTSDBOptions() tsdb.Options { FeatureRegistry: features.DefaultRegistry, EnableSTAsZeroSample: opts.EnableSTAsZeroSample, EnableSTStorage: opts.EnableSTStorage, + EnableXOR2Encoding: opts.EnableXOR2Encoding, StaleSeriesCompactionThreshold: opts.StaleSeriesCompactionThreshold, } } diff --git 
a/cmd/prometheus/testdata/features.json b/cmd/prometheus/testdata/features.json index 60e6b65b40..d30b3b382f 100644 --- a/cmd/prometheus/testdata/features.json +++ b/cmd/prometheus/testdata/features.json @@ -251,6 +251,8 @@ "exemplar_storage": false, "isolation": true, "native_histograms": true, + "st_storage": false, + "xor2_encoding": false, "use_uncached_io": false }, "ui": { diff --git a/compliance/go.mod b/compliance/go.mod index cd0ad49cf8..81c5450ff3 100644 --- a/compliance/go.mod +++ b/compliance/go.mod @@ -2,7 +2,7 @@ module compliance go 1.25.0 -require github.com/prometheus/compliance/remotewrite v0.0.0-20260220101514-bccaa3a70275 +require github.com/prometheus/compliance/remotewrite v0.0.0-20260223092825-818283e1171e require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect diff --git a/compliance/go.sum b/compliance/go.sum index 6f273f49bd..799748d81d 100644 --- a/compliance/go.sum +++ b/compliance/go.sum @@ -30,8 +30,8 @@ github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNw github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.67.2 h1:PcBAckGFTIHt2+L3I33uNRTlKTplNzFctXcWhPyAEN8= github.com/prometheus/common v0.67.2/go.mod h1:63W3KZb1JOKgcjlIr64WW/LvFGAqKPj0atm+knVGEko= -github.com/prometheus/compliance/remotewrite v0.0.0-20260220101514-bccaa3a70275 h1:NLTtFqM00EuqtisYX9P+hQkjoxNxsR2oUQWDluyD2Xw= -github.com/prometheus/compliance/remotewrite v0.0.0-20260220101514-bccaa3a70275/go.mod h1:VEPZGvpSBbzTKc5acnBj9ng4gfo1DZ4qBsCQnoNFiSc= +github.com/prometheus/compliance/remotewrite v0.0.0-20260223092825-818283e1171e h1:tT/KBv0aSFq4AElo/bSVvUd+yNKj72hkRsyiKU45nIQ= +github.com/prometheus/compliance/remotewrite v0.0.0-20260223092825-818283e1171e/go.mod h1:VEPZGvpSBbzTKc5acnBj9ng4gfo1DZ4qBsCQnoNFiSc= github.com/prometheus/prometheus v0.307.4-0.20251119130332-1174b0ce4f1f h1:ERPCnBglv9Z4IjkEBTNbcHmZPlryMldXVWLkk7TeBIY= github.com/prometheus/prometheus 
v0.307.4-0.20251119130332-1174b0ce4f1f/go.mod h1:7hcXiGf9AXIKW2ehWWzxkvRYJTGmc2StUIJ8mprfxjg= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= diff --git a/compliance/remote_write_sender_test.go b/compliance/remote_write_sender_test.go index 6840132bd3..9822e2d3e6 100644 --- a/compliance/remote_write_sender_test.go +++ b/compliance/remote_write_sender_test.go @@ -53,7 +53,9 @@ scrape_configs: var scrapeConfigTmpl = template.Must(template.New("config").Parse(scrapeConfigTemplate)) -type internalPrometheus struct{} +type internalPrometheus struct { + agentMode bool +} func (p internalPrometheus) Name() string { return "internal-prometheus" } @@ -74,20 +76,33 @@ func (p internalPrometheus) Run(ctx context.Context, opts sender.Options) error } defer os.RemoveAll(dir) - return sender.RunCommand(ctx, "../cmd/prometheus", nil, - "go", "run", ".", + args := []string{ + "run", ".", "--web.listen-address=0.0.0.0:0", - fmt.Sprintf("--storage.tsdb.path=%v", dir), fmt.Sprintf("--config.file=%s", configFile), // Set important flags for the full remote write compliance: "--enable-feature=st-storage", - ) + } + if p.agentMode { + args = append(args, fmt.Sprintf("--storage.agent.path=%v", dir), "--agent") + } else { + args = append(args, fmt.Sprintf("--storage.tsdb.path=%v", dir)) + } + return sender.RunCommand(ctx, "../cmd/prometheus", nil, "go", args...) } var _ sender.Sender = internalPrometheus{} // TestRemoteWriteSender runs remote write sender compliance tests defined in -// https://github.com/prometheus/compliance/tree/main/remotewrite/sender +// https://github.com/prometheus/compliance/tree/main/remotewrite/sender against +// both agent and server modes. 
func TestRemoteWriteSender(t *testing.T) { - sender.RunTests(t, internalPrometheus{}, sender.ComplianceTests()) + t.Run("mode=server", func(t *testing.T) { + t.Parallel() + sender.RunTests(t, internalPrometheus{}, sender.ComplianceTests()) + }) + t.Run("mode=agent", func(t *testing.T) { + t.Parallel() + sender.RunTests(t, internalPrometheus{agentMode: true}, sender.ComplianceTests()) + }) } diff --git a/config/config.go b/config/config.go index b390a4a629..2082743b0d 100644 --- a/config/config.go +++ b/config/config.go @@ -83,6 +83,13 @@ func Load(s string, logger *slog.Logger) (*Config, error) { return nil, err } + // When the config body is empty, UnmarshalYAML is never called, so + // TSDBConfig may still be nil. + if cfg.StorageConfig.TSDBConfig == nil { + retention := DefaultTSDBRetentionConfig + cfg.StorageConfig.TSDBConfig = &TSDBConfig{Retention: &retention} + } + b := labels.NewScratchBuilder(0) cfg.GlobalConfig.ExternalLabels.Range(func(v labels.Label) { newV := os.Expand(v.Value, func(s string) string { @@ -276,6 +283,9 @@ var ( // For backwards compatibility. LabelNamePreserveMultipleUnderscores: true, } + + // DefaultTSDBRetentionConfig is the default TSDB retention configuration. + DefaultTSDBRetentionConfig TSDBRetentionConfig ) // Config is the top-level configuration for Prometheus's config files. @@ -405,6 +415,13 @@ func (c *Config) UnmarshalYAML(unmarshal func(any) error) error { c.Runtime = DefaultRuntimeConfig } + // If no storage.tsdb section is present, TSDBConfig is nil and its + // UnmarshalYAML never runs. Inject the default retention here. 
+ if c.StorageConfig.TSDBConfig == nil { + retention := DefaultTSDBRetentionConfig + c.StorageConfig.TSDBConfig = &TSDBConfig{Retention: &retention} + } + for _, rf := range c.RuleFiles { if !patRulePath.MatchString(rf) { return fmt.Errorf("invalid rule file path %q", rf) @@ -1097,6 +1114,22 @@ type TSDBRetentionConfig struct { Percentage uint `yaml:"percentage,omitempty"` } +// UnmarshalYAML implements the yaml.Unmarshaler interface. +func (t *TSDBRetentionConfig) UnmarshalYAML(unmarshal func(any) error) error { + *t = TSDBRetentionConfig{} + type plain TSDBRetentionConfig + if err := unmarshal((*plain)(t)); err != nil { + return err + } + if t.Size < 0 { + return fmt.Errorf("'storage.tsdb.retention.size' must be greater than or equal to 0, got %v", t.Size) + } + if t.Percentage > 100 { + return fmt.Errorf("'storage.tsdb.retention.percentage' must be in the range [0, 100], got %v", t.Percentage) + } + return nil +} + // TSDBConfig configures runtime reloadable configuration options. type TSDBConfig struct { // OutOfOrderTimeWindow sets how long back in time an out-of-order sample can be inserted @@ -1127,6 +1160,11 @@ func (t *TSDBConfig) UnmarshalYAML(unmarshal func(any) error) error { t.OutOfOrderTimeWindow = time.Duration(t.OutOfOrderTimeWindowFlag).Milliseconds() + if t.Retention == nil { + retention := DefaultTSDBRetentionConfig + t.Retention = &retention + } + return nil } diff --git a/config/config_default_test.go b/config/config_default_test.go index 91c290ae4e..ec7a112824 100644 --- a/config/config_default_test.go +++ b/config/config_default_test.go @@ -20,9 +20,10 @@ const ruleFilesConfigFile = "testdata/rules_abs_path.good.yml" var ruleFilesExpectedConf = &Config{ loaded: true, - GlobalConfig: DefaultGlobalConfig, - Runtime: DefaultRuntimeConfig, - OTLPConfig: DefaultOTLPConfig, + GlobalConfig: DefaultGlobalConfig, + Runtime: DefaultRuntimeConfig, + OTLPConfig: DefaultOTLPConfig, + StorageConfig: StorageConfig{TSDBConfig: &TSDBConfig{Retention: 
&TSDBRetentionConfig{}}}, RuleFiles: []string{ "testdata/first.rules", "testdata/rules/second.rules", diff --git a/config/config_test.go b/config/config_test.go index 7001283443..8d4df86be6 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -2626,6 +2626,22 @@ var expectedErrors = []struct { filename: "stackit_endpoint.bad.yml", errMsg: "invalid endpoint", }, + { + filename: "tsdb_retention_time.bad.yml", + errMsg: `not a valid duration string: "-1h"`, + }, + { + filename: "tsdb_retention_size.bad.yml", + errMsg: `'storage.tsdb.retention.size' must be greater than or equal to 0`, + }, + { + filename: "tsdb_retention_percentage.bad.yml", + errMsg: `'storage.tsdb.retention.percentage' must be in the range [0, 100]`, + }, + { + filename: "tsdb_retention_percentage_negative.bad.yml", + errMsg: "cannot unmarshal !!int `-1` into uint", + }, } func TestBadConfigs(t *testing.T) { @@ -2649,6 +2665,8 @@ func TestEmptyConfig(t *testing.T) { require.NoError(t, err) exp := DefaultConfig exp.loaded = true + retention := DefaultTSDBRetentionConfig + exp.StorageConfig.TSDBConfig = &TSDBConfig{Retention: &retention} require.Equal(t, exp, *c) require.Equal(t, 75, c.Runtime.GoGC) } @@ -2700,6 +2718,10 @@ func TestGlobalConfig(t *testing.T) { require.NoError(t, err) exp := DefaultConfig exp.loaded = true + // TSDBConfig is always injected by Config.UnmarshalYAML even when no + // storage.tsdb section is present, so the expected config must include it. 
+ retention := DefaultTSDBRetentionConfig + exp.StorageConfig.TSDBConfig = &TSDBConfig{Retention: &retention} require.Equal(t, exp, *c) }) diff --git a/config/config_windows_test.go b/config/config_windows_test.go index 72a56ff41a..e7627f562a 100644 --- a/config/config_windows_test.go +++ b/config/config_windows_test.go @@ -18,8 +18,9 @@ const ruleFilesConfigFile = "testdata/rules_abs_path_windows.good.yml" var ruleFilesExpectedConf = &Config{ loaded: true, - GlobalConfig: DefaultGlobalConfig, - Runtime: DefaultRuntimeConfig, + GlobalConfig: DefaultGlobalConfig, + Runtime: DefaultRuntimeConfig, + StorageConfig: StorageConfig{TSDBConfig: &TSDBConfig{Retention: &TSDBRetentionConfig{}}}, RuleFiles: []string{ "testdata\\first.rules", "testdata\\rules\\second.rules", diff --git a/config/testdata/tsdb_retention_percentage.bad.yml b/config/testdata/tsdb_retention_percentage.bad.yml new file mode 100644 index 0000000000..cb57abe0c0 --- /dev/null +++ b/config/testdata/tsdb_retention_percentage.bad.yml @@ -0,0 +1,4 @@ +storage: + tsdb: + retention: + percentage: 101 diff --git a/config/testdata/tsdb_retention_percentage_negative.bad.yml b/config/testdata/tsdb_retention_percentage_negative.bad.yml new file mode 100644 index 0000000000..2eeb60c091 --- /dev/null +++ b/config/testdata/tsdb_retention_percentage_negative.bad.yml @@ -0,0 +1,4 @@ +storage: + tsdb: + retention: + percentage: -1 diff --git a/config/testdata/tsdb_retention_size.bad.yml b/config/testdata/tsdb_retention_size.bad.yml new file mode 100644 index 0000000000..ecae64aae6 --- /dev/null +++ b/config/testdata/tsdb_retention_size.bad.yml @@ -0,0 +1,4 @@ +storage: + tsdb: + retention: + size: -1GB diff --git a/config/testdata/tsdb_retention_time.bad.yml b/config/testdata/tsdb_retention_time.bad.yml new file mode 100644 index 0000000000..465b3cf5da --- /dev/null +++ b/config/testdata/tsdb_retention_time.bad.yml @@ -0,0 +1,4 @@ +storage: + tsdb: + retention: + time: -1h diff --git a/discovery/azure/azure.go 
b/discovery/azure/azure.go index 834eaf1f29..0ac9a9af4e 100644 --- a/discovery/azure/azure.go +++ b/discovery/azure/azure.go @@ -298,7 +298,10 @@ func newCredential(cfg SDConfig, policyClientOptions policy.ClientOptions) (azco } credential = azcore.TokenCredential(workloadIdentityCredential) case authMethodManagedIdentity: - options := &azidentity.ManagedIdentityCredentialOptions{ClientOptions: policyClientOptions, ID: azidentity.ClientID(cfg.ClientID)} + options := &azidentity.ManagedIdentityCredentialOptions{ClientOptions: policyClientOptions} + if cfg.ClientID != "" { + options.ID = azidentity.ClientID(cfg.ClientID) + } managedIdentityCredential, err := azidentity.NewManagedIdentityCredential(options) if err != nil { return nil, err diff --git a/discovery/azure/azure_test.go b/discovery/azure/azure_test.go index 23c120ac6b..dd2eeb0a3f 100644 --- a/discovery/azure/azure_test.go +++ b/discovery/azure/azure_test.go @@ -24,6 +24,7 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" azfake "github.com/Azure/azure-sdk-for-go/sdk/azcore/fake" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" fake "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5/fake" @@ -490,6 +491,27 @@ func TestNewAzureResourceFromID(t *testing.T) { } } +func TestNewCredentialManagedIdentity(t *testing.T) { + // Test that system-assigned managed identity (empty ClientID) creates + // a valid credential. Previously, an empty ClientID was passed as + // azidentity.ClientID("") which is not nil and caused Azure SDK to + // look up a non-existent user-assigned identity instead of falling + // back to system-assigned identity. 
+ cfg := SDConfig{ + AuthenticationMethod: authMethodManagedIdentity, + ClientID: "", + } + cred, err := newCredential(cfg, policy.ClientOptions{}) + require.NoError(t, err) + require.NotNil(t, cred) + + // Test that user-assigned managed identity (non-empty ClientID) also works. + cfg.ClientID = "00000000-0000-0000-0000-000000000000" + cred, err = newCredential(cfg, policy.ClientOptions{}) + require.NoError(t, err) + require.NotNil(t, cred) +} + func TestAzureRefresh(t *testing.T) { tests := []struct { scenario string diff --git a/discovery/hetzner/hcloud.go b/discovery/hetzner/hcloud.go index 7fe55ffded..c28bfd2a1f 100644 --- a/discovery/hetzner/hcloud.go +++ b/discovery/hetzner/hcloud.go @@ -38,8 +38,10 @@ const ( hetznerLabelHcloudImageOSVersion = hetznerHcloudLabelPrefix + "image_os_version" hetznerLabelHcloudImageOSFlavor = hetznerHcloudLabelPrefix + "image_os_flavor" hetznerLabelHcloudPrivateIPv4 = hetznerHcloudLabelPrefix + "private_ipv4_" - hetznerLabelHcloudDatacenterLocation = hetznerHcloudLabelPrefix + "datacenter_location" - hetznerLabelHcloudDatacenterLocationNetworkZone = hetznerHcloudLabelPrefix + "datacenter_location_network_zone" + hetznerLabelHcloudLocation = hetznerHcloudLabelPrefix + "location" + hetznerLabelHcloudLocationNetworkZone = hetznerHcloudLabelPrefix + "location_network_zone" + hetznerLabelHcloudDatacenterLocation = hetznerHcloudLabelPrefix + "datacenter_location" // Label name kept for backward compatibility + hetznerLabelHcloudDatacenterLocationNetworkZone = hetznerHcloudLabelPrefix + "datacenter_location_network_zone" // Label name kept for backward compatibility hetznerLabelHcloudCPUCores = hetznerHcloudLabelPrefix + "cpu_cores" hetznerLabelHcloudCPUType = hetznerHcloudLabelPrefix + "cpu_type" hetznerLabelHcloudMemoryGB = hetznerHcloudLabelPrefix + "memory_size_gb" @@ -98,13 +100,14 @@ func (d *hcloudDiscovery) refresh(ctx context.Context) ([]*targetgroup.Group, er hetznerLabelRole: model.LabelValue(HetznerRoleHcloud), 
hetznerLabelServerID: model.LabelValue(strconv.FormatInt(server.ID, 10)), hetznerLabelServerName: model.LabelValue(server.Name), - hetznerLabelDatacenter: model.LabelValue(server.Datacenter.Name), //nolint:staticcheck // server.Datacenter is deprecated but kept for backwards compatibility until the next minor release hetznerLabelPublicIPv4: model.LabelValue(server.PublicNet.IPv4.IP.String()), hetznerLabelPublicIPv6Network: model.LabelValue(server.PublicNet.IPv6.Network.String()), hetznerLabelServerStatus: model.LabelValue(server.Status), - hetznerLabelHcloudDatacenterLocation: model.LabelValue(server.Datacenter.Location.Name), //nolint:staticcheck // server.Datacenter is deprecated but kept for backwards compatibility until the next minor release - hetznerLabelHcloudDatacenterLocationNetworkZone: model.LabelValue(server.Datacenter.Location.NetworkZone), //nolint:staticcheck // server.Datacenter is deprecated but kept for backwards compatibility until the next minor release + hetznerLabelHcloudLocation: model.LabelValue(server.Location.Name), + hetznerLabelHcloudLocationNetworkZone: model.LabelValue(server.Location.NetworkZone), + hetznerLabelHcloudDatacenterLocation: model.LabelValue(server.Location.Name), // Label name kept for backward compatibility + hetznerLabelHcloudDatacenterLocationNetworkZone: model.LabelValue(server.Location.NetworkZone), // Label name kept for backward compatibility hetznerLabelHcloudType: model.LabelValue(server.ServerType.Name), hetznerLabelHcloudCPUCores: model.LabelValue(strconv.Itoa(server.ServerType.Cores)), hetznerLabelHcloudCPUType: model.LabelValue(server.ServerType.CPUType), @@ -114,6 +117,12 @@ func (d *hcloudDiscovery) refresh(ctx context.Context) ([]*targetgroup.Group, er model.AddressLabel: model.LabelValue(net.JoinHostPort(server.PublicNet.IPv4.IP.String(), strconv.FormatUint(uint64(d.port), 10))), } + // [hcloud.Server.Datacenter] is deprecated and will be removed after 1 July 2026. 
+ // See https://docs.hetzner.cloud/changelog#2025-12-16-phasing-out-datacenters + if server.Datacenter != nil { //nolint:staticcheck + labels[hetznerLabelDatacenter] = model.LabelValue(server.Datacenter.Name) //nolint:staticcheck + } + if server.Image != nil { labels[hetznerLabelHcloudImageName] = model.LabelValue(server.Image.Name) labels[hetznerLabelHcloudImageDescription] = model.LabelValue(server.Image.Description) diff --git a/discovery/hetzner/hcloud_test.go b/discovery/hetzner/hcloud_test.go index 3f20bcb86c..e7a11608c5 100644 --- a/discovery/hetzner/hcloud_test.go +++ b/discovery/hetzner/hcloud_test.go @@ -69,6 +69,8 @@ func TestHCloudSDRefresh(t *testing.T) { "__meta_hetzner_hcloud_image_description": model.LabelValue("Ubuntu 20.04 Standard 64 bit"), "__meta_hetzner_hcloud_image_os_flavor": model.LabelValue("ubuntu"), "__meta_hetzner_hcloud_image_os_version": model.LabelValue("20.04"), + "__meta_hetzner_hcloud_location": model.LabelValue("fsn1"), + "__meta_hetzner_hcloud_location_network_zone": model.LabelValue("eu-central"), "__meta_hetzner_hcloud_datacenter_location": model.LabelValue("fsn1"), "__meta_hetzner_hcloud_datacenter_location_network_zone": model.LabelValue("eu-central"), "__meta_hetzner_hcloud_cpu_cores": model.LabelValue("1"), @@ -93,6 +95,8 @@ func TestHCloudSDRefresh(t *testing.T) { "__meta_hetzner_hcloud_image_description": model.LabelValue("Ubuntu 20.04 Standard 64 bit"), "__meta_hetzner_hcloud_image_os_flavor": model.LabelValue("ubuntu"), "__meta_hetzner_hcloud_image_os_version": model.LabelValue("20.04"), + "__meta_hetzner_hcloud_location": model.LabelValue("fsn1"), + "__meta_hetzner_hcloud_location_network_zone": model.LabelValue("eu-central"), "__meta_hetzner_hcloud_datacenter_location": model.LabelValue("fsn1"), "__meta_hetzner_hcloud_datacenter_location_network_zone": model.LabelValue("eu-central"), "__meta_hetzner_hcloud_cpu_cores": model.LabelValue("2"), @@ -114,6 +118,8 @@ func TestHCloudSDRefresh(t *testing.T) { 
"__meta_hetzner_datacenter": model.LabelValue("fsn1-dc14"), "__meta_hetzner_public_ipv4": model.LabelValue("1.2.3.6"), "__meta_hetzner_public_ipv6_network": model.LabelValue("2001:db7::/64"), + "__meta_hetzner_hcloud_location": model.LabelValue("fsn1"), + "__meta_hetzner_hcloud_location_network_zone": model.LabelValue("eu-central"), "__meta_hetzner_hcloud_datacenter_location": model.LabelValue("fsn1"), "__meta_hetzner_hcloud_datacenter_location_network_zone": model.LabelValue("eu-central"), "__meta_hetzner_hcloud_cpu_cores": model.LabelValue("2"), diff --git a/discovery/hetzner/hetzner.go b/discovery/hetzner/hetzner.go index 932cfc8c93..3b7349e896 100644 --- a/discovery/hetzner/hetzner.go +++ b/discovery/hetzner/hetzner.go @@ -36,7 +36,7 @@ const ( hetznerLabelServerID = hetznerLabelPrefix + "server_id" hetznerLabelServerName = hetznerLabelPrefix + "server_name" hetznerLabelServerStatus = hetznerLabelPrefix + "server_status" - hetznerLabelDatacenter = hetznerLabelPrefix + "datacenter" + hetznerLabelDatacenter = hetznerLabelPrefix + "datacenter" // Label name kept for backward compatibility hetznerLabelPublicIPv4 = hetznerLabelPrefix + "public_ipv4" hetznerLabelPublicIPv6Network = hetznerLabelPrefix + "public_ipv6_network" ) diff --git a/discovery/hetzner/mock_test.go b/discovery/hetzner/mock_test.go index 5f1e9c036b..fb69a76b04 100644 --- a/discovery/hetzner/mock_test.go +++ b/discovery/hetzner/mock_test.go @@ -124,6 +124,16 @@ func (m *SDMock) HandleHcloudServers() { "storage_type": "local", "cpu_type": "shared" }, + "location": { + "id": 1, + "name": "fsn1", + "description": "Falkenstein DC Park 1", + "country": "DE", + "city": "Falkenstein", + "latitude": 50.47612, + "longitude": 12.370071, + "network_zone": "eu-central" + }, "datacenter": { "id": 1, "name": "fsn1-dc8", @@ -244,6 +254,16 @@ func (m *SDMock) HandleHcloudServers() { "storage_type": "local", "cpu_type": "shared" }, + "location": { + "id": 1, + "name": "fsn1", + "description": "Falkenstein DC Park 
1", + "country": "DE", + "city": "Falkenstein", + "latitude": 50.47612, + "longitude": 12.370071, + "network_zone": "eu-central" + }, "datacenter": { "id": 2, "name": "fsn1-dc14", @@ -365,6 +385,16 @@ func (m *SDMock) HandleHcloudServers() { "storage_type": "local", "cpu_type": "shared" }, + "location": { + "id": 1, + "name": "fsn1", + "description": "Falkenstein DC Park 1", + "country": "DE", + "city": "Falkenstein", + "latitude": 50.47612, + "longitude": 12.370071, + "network_zone": "eu-central" + }, "datacenter": { "id": 2, "name": "fsn1-dc14", diff --git a/discovery/hetzner/robot.go b/discovery/hetzner/robot.go index c112d5549a..5b1c149ccb 100644 --- a/discovery/hetzner/robot.go +++ b/discovery/hetzner/robot.go @@ -34,9 +34,10 @@ import ( ) const ( - hetznerRobotLabelPrefix = hetznerLabelPrefix + "robot_" - hetznerLabelRobotProduct = hetznerRobotLabelPrefix + "product" - hetznerLabelRobotCancelled = hetznerRobotLabelPrefix + "cancelled" + hetznerRobotLabelPrefix = hetznerLabelPrefix + "robot_" + hetznerLabelRobotDatacenter = hetznerRobotLabelPrefix + "datacenter" + hetznerLabelRobotProduct = hetznerRobotLabelPrefix + "product" + hetznerLabelRobotCancelled = hetznerRobotLabelPrefix + "cancelled" ) var userAgent = version.PrometheusUserAgent() @@ -105,14 +106,15 @@ func (d *robotDiscovery) refresh(context.Context) ([]*targetgroup.Group, error) targets := make([]model.LabelSet, len(servers)) for i, server := range servers { labels := model.LabelSet{ - hetznerLabelRole: model.LabelValue(HetznerRoleRobot), - hetznerLabelServerID: model.LabelValue(strconv.Itoa(server.Server.ServerNumber)), - hetznerLabelServerName: model.LabelValue(server.Server.ServerName), - hetznerLabelDatacenter: model.LabelValue(strings.ToLower(server.Server.Dc)), - hetznerLabelPublicIPv4: model.LabelValue(server.Server.ServerIP), - hetznerLabelServerStatus: model.LabelValue(server.Server.Status), - hetznerLabelRobotProduct: model.LabelValue(server.Server.Product), - hetznerLabelRobotCancelled: 
model.LabelValue(strconv.FormatBool(server.Server.Canceled)), + hetznerLabelRole: model.LabelValue(HetznerRoleRobot), + hetznerLabelServerID: model.LabelValue(strconv.Itoa(server.Server.ServerNumber)), + hetznerLabelServerName: model.LabelValue(server.Server.ServerName), + hetznerLabelDatacenter: model.LabelValue(strings.ToLower(server.Server.Dc)), // Label name kept for backward compatibility + hetznerLabelPublicIPv4: model.LabelValue(server.Server.ServerIP), + hetznerLabelServerStatus: model.LabelValue(server.Server.Status), + hetznerLabelRobotDatacenter: model.LabelValue(strings.ToLower(server.Server.Dc)), + hetznerLabelRobotProduct: model.LabelValue(server.Server.Product), + hetznerLabelRobotCancelled: model.LabelValue(strconv.FormatBool(server.Server.Canceled)), model.AddressLabel: model.LabelValue(net.JoinHostPort(server.Server.ServerIP, strconv.FormatUint(uint64(d.port), 10))), } diff --git a/discovery/hetzner/robot_test.go b/discovery/hetzner/robot_test.go index 0e8b7954cc..56f9978858 100644 --- a/discovery/hetzner/robot_test.go +++ b/discovery/hetzner/robot_test.go @@ -64,19 +64,21 @@ func TestRobotSDRefresh(t *testing.T) { "__meta_hetzner_public_ipv4": model.LabelValue("123.123.123.123"), "__meta_hetzner_public_ipv6_network": model.LabelValue("2a01:4f8:111:4221::/64"), "__meta_hetzner_datacenter": model.LabelValue("nbg1-dc1"), + "__meta_hetzner_robot_datacenter": model.LabelValue("nbg1-dc1"), "__meta_hetzner_robot_product": model.LabelValue("DS 3000"), "__meta_hetzner_robot_cancelled": model.LabelValue("false"), }, { - "__address__": model.LabelValue("123.123.123.124:80"), - "__meta_hetzner_role": model.LabelValue("robot"), - "__meta_hetzner_server_id": model.LabelValue("421"), - "__meta_hetzner_server_name": model.LabelValue("server2"), - "__meta_hetzner_server_status": model.LabelValue("in process"), - "__meta_hetzner_public_ipv4": model.LabelValue("123.123.123.124"), - "__meta_hetzner_datacenter": model.LabelValue("fsn1-dc10"), - 
"__meta_hetzner_robot_product": model.LabelValue("X5"), - "__meta_hetzner_robot_cancelled": model.LabelValue("true"), + "__address__": model.LabelValue("123.123.123.124:80"), + "__meta_hetzner_role": model.LabelValue("robot"), + "__meta_hetzner_server_id": model.LabelValue("421"), + "__meta_hetzner_server_name": model.LabelValue("server2"), + "__meta_hetzner_server_status": model.LabelValue("in process"), + "__meta_hetzner_public_ipv4": model.LabelValue("123.123.123.124"), + "__meta_hetzner_datacenter": model.LabelValue("fsn1-dc10"), + "__meta_hetzner_robot_datacenter": model.LabelValue("fsn1-dc10"), + "__meta_hetzner_robot_product": model.LabelValue("X5"), + "__meta_hetzner_robot_cancelled": model.LabelValue("true"), }, } { t.Run(fmt.Sprintf("item %d", i), func(t *testing.T) { diff --git a/docs/command-line/prometheus.md b/docs/command-line/prometheus.md index 251fdfd6a4..1c2a08c39c 100644 --- a/docs/command-line/prometheus.md +++ b/docs/command-line/prometheus.md @@ -59,7 +59,7 @@ The Prometheus monitoring server | --query.timeout | Maximum time a query may take before being aborted. Use with server mode only. | `2m` | | --query.max-concurrency | Maximum number of queries executed concurrently. Use with server mode only. | `20` | | --query.max-samples | Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return. Use with server mode only. | `50000000` | -| --enable-feature ... | Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers. 
See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | | +| --enable-feature ... | Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, st-storage, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers, xor2-encoding. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | | | --agent | Run Prometheus in 'Agent mode'. | | | --log.level | Only log messages with the given severity or above. One of: [debug, info, warn, error] | `info` | | --log.format | Output format of log messages. One of: [logfmt, json] | `logfmt` | diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md index 334c5da490..3682348e67 100644 --- a/docs/configuration/configuration.md +++ b/docs/configuration/configuration.md @@ -2238,7 +2238,10 @@ The following meta labels are available on all targets during [relabeling](#rela * `__meta_hetzner_server_status`: the status of the server * `__meta_hetzner_public_ipv4`: the public ipv4 address of the server * `__meta_hetzner_public_ipv6_network`: the public ipv6 network (/64) of the server -* `__meta_hetzner_datacenter`: the datacenter of the server + +Note that the `__meta_hetzner_datacenter` label is deprecated for both roles `robot` and `hcloud`: +- For the `robot` role, the replacement label is `__meta_hetzner_robot_datacenter`. +- For the `hcloud` role, the label will be removed after 1 July 2026. For more details, see the [changelog](https://docs.hetzner.cloud/changelog#2025-12-16-phasing-out-datacenters). 
The labels below are only available for targets with `role` set to `hcloud`: @@ -2246,8 +2249,10 @@ The labels below are only available for targets with `role` set to `hcloud`: * `__meta_hetzner_hcloud_image_description`: the description of the server image * `__meta_hetzner_hcloud_image_os_flavor`: the OS flavor of the server image * `__meta_hetzner_hcloud_image_os_version`: the OS version of the server image -* `__meta_hetzner_hcloud_datacenter_location`: the location of the server -* `__meta_hetzner_hcloud_datacenter_location_network_zone`: the network zone of the server +* `__meta_hetzner_hcloud_location`: the location of the server +* `__meta_hetzner_hcloud_location_network_zone`: the network zone of the server +* `__meta_hetzner_hcloud_datacenter_location`: the location of the server (deprecated in favor of `__meta_hetzner_hcloud_location`) +* `__meta_hetzner_hcloud_datacenter_location_network_zone`: the network zone of the server (deprecated in favor of `__meta_hetzner_hcloud_location_network_zone`) * `__meta_hetzner_hcloud_server_type`: the type of the server * `__meta_hetzner_hcloud_cpu_cores`: the CPU cores count of the server * `__meta_hetzner_hcloud_cpu_type`: the CPU type of the server (shared or dedicated) @@ -2259,6 +2264,7 @@ The labels below are only available for targets with `role` set to `hcloud`: The labels below are only available for targets with `role` set to `robot`: +* `__meta_hetzner_robot_datacenter`: the datacenter of the server * `__meta_hetzner_robot_product`: the product of the server * `__meta_hetzner_robot_cancelled`: the server cancellation status @@ -3871,9 +3877,9 @@ with this feature. # or when a compaction completes, whichever comes first. [ retention: ] : # How long to retain samples in storage. If neither this option nor the size option - # is set, the retention time defaults to 15d. Units Supported: y, w, d, h, m, s, ms. + # is set, the retention time defaults to 15d. Setting this to 0 disables time-based retention. 
# This option takes precedence over the deprecated command-line flag --storage.tsdb.retention.time. - [ time: | default = 15d ] + [ time: ] # Maximum number of bytes that can be stored for blocks. A unit is required, # supported units: B, KB, MB, GB, TB, PB, EB. Ex: "512MB". Based on powers-of-2, so 1KB is 1024B. diff --git a/docs/feature_flags.md b/docs/feature_flags.md index 45d14b72db..ccc3a2bcde 100644 --- a/docs/feature_flags.md +++ b/docs/feature_flags.md @@ -77,6 +77,30 @@ Therefore, when `created-timestamp-zero-ingestion` is enabled Prometheus changes Besides enabling this feature in Prometheus, start timestamps need to be exposed by the application being scraped. +## Start timestamp (ST) native storage + +`--enable-feature=st-storage` + +Enables the storage of start timestamps (ST) per sample, through WAL, TSDB/Agent and Remote-Write 2.0. This option +allows preserving the exact ST value as it was presented from scrape and receive protocols. In the future this feature +is meant to be a replacement of `created-timestamp-zero-ingestion` which injects synthetic 0 samples. + +Currently, Prometheus supports start timestamps on: + +* `PrometheusProto` +* `OpenMetrics1.0.0` + +`PrometheusProto` is recommended, due to efficiency of ST passing. + +Besides enabling this feature in Prometheus, start timestamps need to be exposed by the application being scraped. + +> NOTE: This is an experimental feature with known limitations until fully implemented. +> * It introduces new WAL record type (SamplesV2) that can only be replayed with Prometheus 3.11 or later versions. +> * For persistent storage support (TSDB blocks), you need to manually opt-in for XOR2 chunk format ([`xor2-encoding` flag](#xor2-chunk-encoding)). +> This might change later once we finish experimentation phase with XOR2. +> * ST for native histograms and NHCBs are not yet implemented (see [#18315](https://github.com/prometheus/prometheus/issues/18315)). 
+> * PromQL use of ST is out of scope of this feature. + ## Concurrent evaluation of independent rules `--enable-feature=concurrent-rule-eval` @@ -306,6 +330,17 @@ This is currently implemented using direct I/O. For more details, see the [proposal](https://github.com/prometheus/proposals/pull/45). +## XOR2 chunk encoding + +`--enable-feature=xor2-encoding` + +> WARNING: This is a highly experimental and risky setting: +> * Chunks encoded with XOR2 **cannot be read by older Prometheus versions** that do not support the encoding. Once enabled and data is written, you need to **manually delete blocks from the disk**, otherwise Prometheus will return an error on all queries. +> * We are still experimenting on the final encoding. As of now this encoding can change in any Prometheus version. All your persistent block data will be lost between versions. +> * This encoding is new, meaning downstream tools and LTS systems might not support it yet (e.g. Thanos sidecar uploaded blocks). + +This setting enables the new XOR2 chunk encoding for float samples, which provides better disk compression than the default XOR encoding for typical Prometheus workloads. This format also allows storing Start Timestamp (ST). + ## Extended Range Selectors `--enable-feature=promql-extended-range-selectors` diff --git a/docs/querying/functions.md b/docs/querying/functions.md index 68a003359d..64e172000f 100644 --- a/docs/querying/functions.md +++ b/docs/querying/functions.md @@ -219,7 +219,7 @@ to their original value. Histogram samples in the input vector are ignored silen ## `histogram_avg()` `histogram_avg(v instant-vector)` returns the arithmetic average of observed -values stored in each histogram sample in `v`. Float samples are ignored and do +values stored in each native histogram sample in `v`. Float samples are ignored and do not show up in the returned vector.
Use `histogram_avg` as demonstrated below to compute the average request duration @@ -236,11 +236,11 @@ Which is equivalent to the following query: ## `histogram_count()` and `histogram_sum()` `histogram_count(v instant-vector)` returns the count of observations stored in -each histogram sample in `v`. Float samples are ignored and do not show up in +each native histogram sample in `v`. Float samples are ignored and do not show up in the returned vector. Similarly, `histogram_sum(v instant-vector)` returns the sum of observations -stored in each histogram sample. +stored in each native histogram sample. Use `histogram_count` in the following way to calculate a rate of observations (in this case corresponding to “requests per second”) from a series of @@ -453,14 +453,14 @@ histogram_quantiles(sum(rate(foo[1m])), "quantile", 0.9, 0.99) ## `histogram_stddev()` and `histogram_stdvar()` `histogram_stddev(v instant-vector)` returns the estimated standard deviation -of observations for each histogram sample in `v`. For this estimation, all observations +of observations for each native histogram sample in `v`. For this estimation, all observations in a bucket are assumed to have the value of the mean of the bucket boundaries. For the zero bucket and for buckets with custom boundaries, the arithmetic mean is used. For the usual exponential buckets, the geometric mean is used. Float samples are ignored and do not show up in the returned vector. Similarly, `histogram_stdvar(v instant-vector)` returns the estimated standard -variance of observations for each histogram sample in `v`. +variance of observations for each native histogram sample in `v`. 
## `hour()` diff --git a/scrape/helpers_test.go b/scrape/helpers_test.go index 45c89ad8d7..5119a4c66b 100644 --- a/scrape/helpers_test.go +++ b/scrape/helpers_test.go @@ -105,7 +105,9 @@ func newTestScrapeLoop(t testing.TB, opts ...func(sl *scrapeLoop)) (_ *scrapeLoo enableCompression: true, validationScheme: model.UTF8Validation, symbolTable: labels.NewSymbolTable(), - appendMetadataToWAL: true, // Tests assumes it's enabled, unless explicitly turned off. + // Tests assume those features are enabled, unless explicitly turned off. + appendMetadataToWAL: true, + parseST: true, } for _, o := range opts { o(sl) diff --git a/scrape/manager.go b/scrape/manager.go index a79dff6f77..5cd6824033 100644 --- a/scrape/manager.go +++ b/scrape/manager.go @@ -115,8 +115,26 @@ type Options struct { // Option to enable the ingestion of the created timestamp as a synthetic zero sample. // See: https://github.com/prometheus/proposals/blob/main/proposals/2023-06-13_created-timestamp.md + // + // NOTE: This option has no effect for AppenderV2 and will be removed with the AppenderV1 + // removal. EnableStartTimestampZeroIngestion bool + // ParseST controls if ST should be parsed and appended from the scrape formats. + // This should be by default true, but it's opt-in for OpenMetrics (OM) 1.0 reasons and might be moved + // to OM 1.0 only flow. + // + // Specifically for OpenMetrics 1.0 flow, it can have some additional effects that might not be desired for non-ST users: + // + // * OpenMetrics 1.0 _created series will be parsed as ST instead of normal sample. Could be breaking + // if downstream user depends on _created metric. TODO(bwplotka): Add "preserveOMLines" hidden option? + // * Add relatively small (but still) overhead. + // * Can yield wrong ST values in rare edge cases (unknown metadata and metric name collisions). + // + // This only applies to AppenderV2 flow (Prometheus default). + // TODO: Move this option to OM1 parser and use only on OM1 flow. 
+ ParseST bool + // EnableTypeAndUnitLabels represents type-and-unit-labels feature flag. EnableTypeAndUnitLabels bool diff --git a/scrape/manager_test.go b/scrape/manager_test.go index 3dc05db011..37570c2f90 100644 --- a/scrape/manager_test.go +++ b/scrape/manager_test.go @@ -768,6 +768,7 @@ func TestManagerSTZeroIngestion(t *testing.T) { app := teststorage.NewAppendable() discoveryManager, scrapeManager := runManagers(t, ctx, &Options{ EnableStartTimestampZeroIngestion: testSTZeroIngest, + ParseST: testSTZeroIngest, skipJitterOffsetting: true, }, app, nil) defer scrapeManager.Stop() @@ -954,6 +955,7 @@ func TestManagerSTZeroIngestionHistogram(t *testing.T) { app := teststorage.NewAppendable() discoveryManager, scrapeManager := runManagers(t, ctx, &Options{ EnableStartTimestampZeroIngestion: tc.enableSTZeroIngestion, + ParseST: tc.enableSTZeroIngestion, skipJitterOffsetting: true, }, app, nil) defer scrapeManager.Stop() @@ -1066,6 +1068,7 @@ func TestNHCBAndSTZeroIngestion(t *testing.T) { app := teststorage.NewAppendable() discoveryManager, scrapeManager := runManagers(t, ctx, &Options{ EnableStartTimestampZeroIngestion: true, + ParseST: true, skipJitterOffsetting: true, }, app, nil) defer scrapeManager.Stop() diff --git a/scrape/scrape.go b/scrape/scrape.go index 55d0eaf70b..9b37a356cf 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -870,6 +870,7 @@ type scrapeLoop struct { // Options from scrape.Options. enableSTZeroIngestion bool + parseST bool // Used by AppenderV2 only. enableTypeAndUnitLabels bool reportExtraMetrics bool appendMetadataToWAL bool @@ -1224,7 +1225,12 @@ func newScrapeLoop(opts scrapeLoopOptions) *scrapeLoop { validationScheme: opts.sp.config.MetricNameValidationScheme, // scrape.Options. - enableSTZeroIngestion: opts.sp.options.EnableStartTimestampZeroIngestion, + enableSTZeroIngestion: opts.sp.options.EnableStartTimestampZeroIngestion, + // parseST was added recently. 
Before EnableStartTimestampZeroIngestion + // was enabling parsing ST. For non-Prometheus users of the scrape + // manager, we ensure appenderV2 parseST is set on EnableStartTimestampZeroIngestion + // This will be removed when EnableStartTimestampZeroIngestion is removed. + parseST: opts.sp.options.ParseST || opts.sp.options.EnableStartTimestampZeroIngestion, enableTypeAndUnitLabels: opts.sp.options.EnableTypeAndUnitLabels, appendMetadataToWAL: opts.sp.options.AppendMetadata, passMetadataInContext: opts.sp.options.PassMetadataInContext, @@ -1253,9 +1259,8 @@ func (sl *scrapeLoop) getScrapeOffset() time.Duration { func (sl *scrapeLoop) run(errc chan<- error) { var ( - last time.Time - alignedScrapeTime = time.Now().Round(0) - ticker = time.NewTicker(sl.interval) + last time.Time + ticker = time.NewTicker(sl.interval) ) defer func() { if sl.scrapeOnShutdown { @@ -1282,6 +1287,10 @@ func (sl *scrapeLoop) run(errc chan<- error) { } } + // Reset the ticker so target scrape times are aligned to the offset+intervals. + ticker.Reset(sl.interval) + alignedScrapeTime := time.Now().Round(0) + for { select { case <-sl.ctx.Done(): diff --git a/scrape/scrape_append_v2.go b/scrape/scrape_append_v2.go index 64969707e1..825e56f9df 100644 --- a/scrape/scrape_append_v2.go +++ b/scrape/scrape_append_v2.go @@ -102,7 +102,7 @@ func (sl *scrapeLoopAppenderV2) append(b []byte, contentType string, ts time.Tim IgnoreNativeHistograms: !sl.enableNativeHistogramScraping, ConvertClassicHistogramsToNHCB: sl.convertClassicHistToNHCB, KeepClassicOnClassicAndNativeHistograms: sl.alwaysScrapeClassicHist, - OpenMetricsSkipSTSeries: sl.enableSTZeroIngestion, + OpenMetricsSkipSTSeries: sl.parseST, FallbackContentType: sl.fallbackScrapeProtocol, }) if p == nil { @@ -254,7 +254,7 @@ loop: } st := int64(0) - if sl.enableSTZeroIngestion { + if sl.parseST { // p.StartTimestamp() tend to be expensive (e.g. OM1). Do it only if we care. 
st = p.StartTimestamp() } diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index 63547869be..d5cd765a5a 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -24,6 +24,7 @@ import ( "log/slog" "maps" "math" + "net" "net/http" "net/http/httptest" "net/url" @@ -51,6 +52,7 @@ import ( sdktrace "go.opentelemetry.io/otel/sdk/trace" "go.uber.org/atomic" "go.uber.org/goleak" + "go.yaml.in/yaml/v2" "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/discovery" @@ -69,6 +71,7 @@ import ( "github.com/prometheus/prometheus/util/pool" "github.com/prometheus/prometheus/util/teststorage" "github.com/prometheus/prometheus/util/testutil" + "github.com/prometheus/prometheus/util/testutil/synctest" ) func TestMain(m *testing.M) { @@ -1546,6 +1549,14 @@ func TestPromTextToProto(t *testing.T) { require.Equal(t, "promhttp_metric_handler_requests_total", got[236]) } +func seriesPerHistogramFor100HistsWithExemplars(appV2 bool) int { + if appV2 { + // AppenderV2 with parseST enabled, uses _created lines for ST instead of samples. + return 23 + } + return 24 +} + // TestScrapeLoopAppend_WithStorage tests appends and storage integration for the // large input files that are also used in benchmarks. func TestScrapeLoopAppend_WithStorage(t *testing.T) { @@ -1631,8 +1642,13 @@ func TestScrapeLoopAppend_WithStorage(t *testing.T) { name: "100HistsWithExemplars", parsableText: makeTestHistogramsWithExemplars(100), - expectedSamplesLen: 24 * 100, + expectedSamplesLen: seriesPerHistogramFor100HistsWithExemplars(appV2) * 100, testAppendedSamples: func(t *testing.T, committed []sample) { + st := int64(0) + if appV2 { + st = 1726839813016 + } + // Verify a few samples. 
m := metadata.Metadata{Type: model.MetricTypeHistogram, Help: "RPC latency distributions."} testutil.RequireEqual(t, sample{ @@ -1642,7 +1658,7 @@ func TestScrapeLoopAppend_WithStorage(t *testing.T) { } return "rpc_durations_histogram0_seconds" }(), - M: m, L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram0_seconds_bucket", "le", "0.0003100000000000002"), V: 15, T: timestamp.FromTime(ts), + M: m, L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram0_seconds_bucket", "le", "0.0003100000000000002"), V: 15, ST: st, T: timestamp.FromTime(ts), ES: []exemplar.Exemplar{ {Labels: labels.FromStrings("dummyID", "9818"), Value: 0.0002791130914009552, Ts: 1726839814982, HasTs: true}, }, @@ -1654,17 +1670,24 @@ func TestScrapeLoopAppend_WithStorage(t *testing.T) { } return "rpc_durations_histogram49_seconds" }(), - M: m, L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram49_seconds_sum"), V: -8.452185437166741e-05, T: timestamp.FromTime(ts), - }, committed[24*50-3]) + M: m, L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram49_seconds_sum"), V: -8.452185437166741e-05, ST: st, T: timestamp.FromTime(ts), + }, committed[seriesPerHistogramFor100HistsWithExemplars(appV2)*49+21]) - // This series does not have metadata, nor metric family, because of isSeriesPartOfFamily bug and OpenMetric 1.0 limitations around _created series. - // TODO(bwplotka): Fix with https://github.com/prometheus/prometheus/issues/17900 - testutil.RequireEqual(t, sample{ - L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram99_seconds_created"), V: 1.726839813016302e+09, T: timestamp.FromTime(ts), - }, committed[len(committed)-1]) + if !appV2 { + // This series does not have metadata, nor metric family, because of isSeriesPartOfFamily bug and OpenMetric 1.0 limitations around _created series. 
+ // TODO(bwplotka): Fix with https://github.com/prometheus/prometheus/issues/17900 + testutil.RequireEqual(t, sample{ + L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram99_seconds_created"), V: 1.726839813016302e+09, T: timestamp.FromTime(ts), + }, committed[len(committed)-1]) + } else { + testutil.RequireEqual(t, sample{ + MF: "rpc_durations_histogram99_seconds", + M: m, L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram99_seconds_count"), V: 15, ST: st, T: timestamp.FromTime(ts), + }, committed[len(committed)-1]) + } }, testExemplars: func(t *testing.T, er []exemplar.QueryResult) { - // 12 out of 24 histogram series have exemplars. + // 12 out of 23/24 histogram series have exemplars. require.Len(t, er, 12*100) testutil.RequireEqual(t, exemplar.QueryResult{ SeriesLabels: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram0_seconds_bucket", "le", "0.0003100000000000002"), @@ -2901,6 +2924,11 @@ func TestScrapeLoopAppend(t *testing.T) { } func testScrapeLoopAppend(t *testing.T, appV2 bool) { + st := int64(0) + if appV2 { + st = 111111001 + } + for _, test := range []struct { title string alwaysScrapeClassicHist bool @@ -2953,6 +2981,32 @@ func testScrapeLoopAppend(t *testing.T, appV2 bool) { ES: []exemplar.Exemplar{{Labels: labels.FromStrings("a", "abc"), Value: 1, Ts: 10000000, HasTs: true}}, }}, }, + { + title: "Metric with ST", + scrapeText: `# TYPE metric counter +metric_total{n="1"} 1.1 +metric_created{n="1"} 9999.999 +# EOF`, + contentType: "application/openmetrics-text", + samples: func() []sample { + if !appV2 { + return []sample{ + { + L: labels.FromStrings("__name__", "metric_total", "n", "1"), + V: 1.1, + }, + { + L: labels.FromStrings("__name__", "metric_created", "n", "1"), + V: 9999.999, + }, + } + } + return []sample{{ + L: labels.FromStrings("__name__", "metric_total", "n", "1"), + ST: 9999999, V: 1.1, + }} + }(), + }, { title: "Two metrics and exemplars", scrapeText: `metric_total{n="1"} 1 # 
{t="1"} 1.0 10000 @@ -2970,7 +3024,7 @@ metric_total{n="2"} 2 # {t="2"} 2.0 20000 }}, }, { - title: "Native histogram with three exemplars from classic buckets", + title: "Native histogram with ST and three exemplars from classic buckets", enableNativeHistogramsIngestion: true, scrapeText: `name: "test_histogram" @@ -2978,6 +3032,10 @@ help: "Test histogram with many buckets removed to keep it manageable in size." type: HISTOGRAM metric: < histogram: < + created_timestamp: < + seconds: 111111 + nanos: 1000000 + > sample_count: 175 sample_sum: 0.0008280461746287094 bucket: < @@ -3060,8 +3118,9 @@ metric: < `, contentType: "application/vnd.google.protobuf", samples: []sample{{ - T: 1234568, - L: labels.FromStrings("__name__", "test_histogram"), + T: 1234568, + ST: st, + L: labels.FromStrings("__name__", "test_histogram"), H: &histogram.Histogram{ Count: 175, ZeroCount: 2, @@ -3087,7 +3146,7 @@ metric: < }}, }, { - title: "Native histogram with three exemplars scraped as classic histogram", + title: "Native histogram with ST and three exemplars scraped as classic histogram", enableNativeHistogramsIngestion: true, scrapeText: `name: "test_histogram" @@ -3095,6 +3154,10 @@ help: "Test histogram with many buckets removed to keep it manageable in size." 
type: HISTOGRAM metric: < histogram: < + created_timestamp: < + seconds: 111111 + nanos: 1000000 + > sample_count: 175 sample_sum: 0.0008280461746287094 bucket: < @@ -3179,8 +3242,9 @@ metric: < contentType: "application/vnd.google.protobuf", samples: []sample{ { - T: 1234568, - L: labels.FromStrings("__name__", "test_histogram"), + T: 1234568, + ST: st, + L: labels.FromStrings("__name__", "test_histogram"), H: &histogram.Histogram{ Count: 175, ZeroCount: 2, @@ -3205,26 +3269,26 @@ metric: < {Labels: labels.FromStrings("dummyID", "59727"), Value: -0.00039, Ts: 1625851155146, HasTs: true}, }, }, - {L: labels.FromStrings("__name__", "test_histogram_count"), T: 1234568, V: 175}, - {L: labels.FromStrings("__name__", "test_histogram_sum"), T: 1234568, V: 0.0008280461746287094}, - {L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0004899999999999998"), T: 1234568, V: 2}, + {L: labels.FromStrings("__name__", "test_histogram_count"), ST: st, T: 1234568, V: 175}, + {L: labels.FromStrings("__name__", "test_histogram_sum"), ST: st, T: 1234568, V: 0.0008280461746287094}, + {L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0004899999999999998"), ST: st, T: 1234568, V: 2}, { - L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0003899999999999998"), T: 1234568, V: 4, + L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0003899999999999998"), ST: st, T: 1234568, V: 4, ES: []exemplar.Exemplar{{Labels: labels.FromStrings("dummyID", "59727"), Value: -0.00039, Ts: 1625851155146, HasTs: true}}, }, { - L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0002899999999999998"), T: 1234568, V: 16, + L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0002899999999999998"), ST: st, T: 1234568, V: 16, ES: []exemplar.Exemplar{{Labels: labels.FromStrings("dummyID", "5617"), Value: -0.00029, Ts: 1234568, HasTs: false}}, }, { - L: labels.FromStrings("__name__", "test_histogram_bucket", 
"le", "-0.0001899999999999998"), T: 1234568, V: 32, + L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0001899999999999998"), ST: st, T: 1234568, V: 32, ES: []exemplar.Exemplar{{Labels: labels.FromStrings("dummyID", "58215"), Value: -0.00019, Ts: 1625851055146, HasTs: true}}, }, - {L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "+Inf"), T: 1234568, V: 175}, + {L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "+Inf"), ST: st, T: 1234568, V: 175}, }, }, { - title: "Native histogram with exemplars and no classic buckets", + title: "Native histogram with ST, exemplars and no classic buckets", contentType: "application/vnd.google.protobuf", enableNativeHistogramsIngestion: true, scrapeText: `name: "test_histogram" @@ -3232,6 +3296,10 @@ help: "Test histogram." type: HISTOGRAM metric: < histogram: < + created_timestamp: < + seconds: 111111 + nanos: 1000000 + > sample_count: 175 sample_sum: 0.0008280461746287094 schema: 3 @@ -3297,8 +3365,9 @@ metric: < `, samples: []sample{{ - T: 1234568, - L: labels.FromStrings("__name__", "test_histogram"), + T: 1234568, + ST: st, + L: labels.FromStrings("__name__", "test_histogram"), H: &histogram.Histogram{ Count: 175, ZeroCount: 2, @@ -3324,7 +3393,7 @@ metric: < }}, }, { - title: "Native histogram with exemplars but ingestion disabled", + title: "Native histogram with ST, exemplars but ingestion disabled", contentType: "application/vnd.google.protobuf", enableNativeHistogramsIngestion: false, scrapeText: `name: "test_histogram" @@ -3332,6 +3401,10 @@ help: "Test histogram." 
type: HISTOGRAM metric: < histogram: < + created_timestamp: < + seconds: 111111 + nanos: 1000000 + > sample_count: 175 sample_sum: 0.0008280461746287094 schema: 3 @@ -3397,9 +3470,9 @@ metric: < `, samples: []sample{ - {L: labels.FromStrings("__name__", "test_histogram_count"), T: 1234568, V: 175}, - {L: labels.FromStrings("__name__", "test_histogram_sum"), T: 1234568, V: 0.0008280461746287094}, - {L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "+Inf"), T: 1234568, V: 175}, + {L: labels.FromStrings("__name__", "test_histogram_count"), ST: st, T: 1234568, V: 175}, + {L: labels.FromStrings("__name__", "test_histogram_sum"), ST: st, T: 1234568, V: 0.0008280461746287094}, + {L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "+Inf"), ST: st, T: 1234568, V: 175}, }, }, } { @@ -3421,7 +3494,7 @@ metric: < // This test does not care about metadata. // Having this true would mean we need to add metadata to sample // expectations. - // TODO(bwplotka): Add cases for append metadata to WAL and pass metadata + // TODO(bwplotka): Add cases for append metadata to WAL and pass metadata. 
sl.appendMetadataToWAL = false }) app := sl.appender() @@ -6786,3 +6859,100 @@ func TestScrapePoolSetScrapeFailureLoggerRace(t *testing.T) { wg.Wait() } + +func TestScrapeOffsetDistribution(t *testing.T) { + interval := 5 * time.Second + + synctest.Test(t, func(t *testing.T) { + startTime := time.Now() + + listener := newPipeListener() + + var mu sync.Mutex + scrapeTimes := make(map[string][]time.Duration) + + handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + select { + case <-r.Context().Done(): + return + default: + mu.Lock() + target := r.URL.Path + scrapeTimes[target] = append(scrapeTimes[target], time.Since(startTime)) + mu.Unlock() + + w.Header().Set("Content-Type", "text/plain; version=0.0.4") + fmt.Fprintln(w, "expected_metric 1") + } + }) + + srv := httptest.NewUnstartedServer(handler) + srv.Listener = listener + srv.Start() + t.Cleanup(srv.Close) + + app := teststorage.NewAppendable() + opts := &Options{ + HTTPClientOptions: []config_util.HTTPClientOption{ + config_util.WithDialContextFunc(func(ctx context.Context, _, _ string) (net.Conn, error) { + srvConn, cliConn := net.Pipe() + select { + case listener.conns <- srvConn: + return cliConn, nil + case <-listener.closed: + return nil, net.ErrClosed + case <-ctx.Done(): + return nil, ctx.Err() + } + }), + }, + } + scrapeManager, err := NewManager(opts, promslog.NewNopLogger(), nil, app, nil, prometheus.NewRegistry()) + scrapeManager.offsetSeed = 1 // Set a fixed offset seed for deterministic testing. 
+ require.NoError(t, err) + + var targets []model.LabelSet + for i := range 5 { + targets = append(targets, model.LabelSet{ + model.SchemeLabel: "http", + model.AddressLabel: model.LabelValue(fmt.Sprintf("target-%d.local", i)), + model.MetricsPathLabel: model.LabelValue(fmt.Sprintf("/metrics/%d", i)), + }) + } + + scrapeManager.updateTsets(map[string][]*targetgroup.Group{ + "test": {{Targets: targets}}, + }) + + cfg := &config.Config{ + GlobalConfig: config.GlobalConfig{ + ScrapeInterval: model.Duration(interval), + ScrapeTimeout: model.Duration(interval), + ScrapeProtocols: []config.ScrapeProtocol{config.PrometheusProto}, + }, + ScrapeConfigs: []*config.ScrapeConfig{{JobName: "test"}}, + } + cfgText, err := yaml.Marshal(*cfg) + require.NoError(t, err) + cfg = loadConfiguration(t, string(cfgText)) + require.NoError(t, scrapeManager.ApplyConfig(cfg)) + + scrapeManager.reload() + + numScrapes := 4 + time.Sleep((time.Duration(numScrapes) * interval) + time.Second) + synctest.Wait() + + scrapeManager.Stop() + + for i := range numScrapes { + uniqueTimes := make(map[time.Duration]struct{}) + for _, times := range scrapeTimes { + if i < len(times) { + uniqueTimes[times[i]] = struct{}{} + } + } + require.Greater(t, len(uniqueTimes), 2, "Expected targets to be scraped at staggered offsets rather than simultaneously at scrape index %d", i) + } + }) +} diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go index e650b0b5fd..a42cf0c932 100644 --- a/storage/remote/queue_manager.go +++ b/storage/remote/queue_manager.go @@ -761,11 +761,12 @@ outer: default: } if t.shards.enqueue(s.Ref, timeSeries{ - seriesLabels: lbls, - metadata: meta, - timestamp: s.T, - value: s.V, - sType: tSample, + seriesLabels: lbls, + metadata: meta, + startTimestamp: s.ST, + timestamp: s.T, + value: s.V, + sType: tSample, }) { continue outer } @@ -883,9 +884,10 @@ outer: if t.shards.enqueue(h.Ref, timeSeries{ seriesLabels: lbls, metadata: meta, - timestamp: h.T, - histogram: h.H, 
- sType: tHistogram, + // TODO(bwplotka): Populate ST once histogram Ref has it. + timestamp: h.T, + histogram: h.H, + sType: tHistogram, }) { continue outer } @@ -942,8 +944,9 @@ outer: default: } if t.shards.enqueue(h.Ref, timeSeries{ - seriesLabels: lbls, - metadata: meta, + seriesLabels: lbls, + metadata: meta, + // TODO(bwplotka): Populate ST once histogram Ref has it. timestamp: h.T, floatHistogram: h.FH, sType: tFloatHistogram, @@ -1397,13 +1400,13 @@ type queue struct { } type timeSeries struct { - seriesLabels labels.Labels - value float64 - histogram *histogram.Histogram - floatHistogram *histogram.FloatHistogram - metadata *metadata.Metadata - timestamp int64 - exemplarLabels labels.Labels + seriesLabels labels.Labels + value float64 + histogram *histogram.Histogram + floatHistogram *histogram.FloatHistogram + metadata *metadata.Metadata + startTimestamp, timestamp int64 + exemplarLabels labels.Labels // The type of series: sample, exemplar, or histogram. sType seriesType } @@ -1994,8 +1997,9 @@ func populateV2TimeSeries(symbolTable *writev2.SymbolsTable, batch []timeSeries, switch d.sType { case tSample: pendingData[nPending].Samples = append(pendingData[nPending].Samples, writev2.Sample{ - Value: d.value, - Timestamp: d.timestamp, + Value: d.value, + Timestamp: d.timestamp, + StartTimestamp: d.startTimestamp, }) nPendingSamples++ case tExemplar: @@ -2006,9 +2010,11 @@ func populateV2TimeSeries(symbolTable *writev2.SymbolsTable, batch []timeSeries, }) nPendingExemplars++ case tHistogram: + // TODO(bwplotka): Extend with ST once histograms populate it. pendingData[nPending].Histograms = append(pendingData[nPending].Histograms, writev2.FromIntHistogram(d.timestamp, d.histogram)) nPendingHistograms++ case tFloatHistogram: + // TODO(bwplotka): Extend with ST once histograms populate it. 
pendingData[nPending].Histograms = append(pendingData[nPending].Histograms, writev2.FromFloatHistogram(d.timestamp, d.floatHistogram)) nPendingHistograms++ case tMetadata: diff --git a/storage/remote/queue_manager_test.go b/storage/remote/queue_manager_test.go index ed8415d36c..b0a5627e2f 100644 --- a/storage/remote/queue_manager_test.go +++ b/storage/remote/queue_manager_test.go @@ -143,7 +143,10 @@ func TestBasicContentNegotiation(t *testing.T) { s := NewStorage(nil, nil, nil, dir, defaultFlushDeadline, nil, false) defer s.Close() - recs := testwal.GenerateRecords(recCase{Series: 1, SamplesPerSeries: 1}) + recs := testwal.GenerateRecords(recCase{ + NoST: tc.senderProtoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + Series: 1, SamplesPerSeries: 1, + }) conf.RemoteWriteConfigs[0].ProtobufMessage = tc.senderProtoMsg require.NoError(t, s.ApplyConfig(conf)) @@ -225,6 +228,7 @@ func TestSampleDelivery(t *testing.T) { s := NewStorage(nil, nil, nil, dir, defaultFlushDeadline, nil, false) defer s.Close() + rc.NoST = protoMsg == remoteapi.WriteV1MessageType // RW1 does not support ST. recs := testwal.GenerateRecords(rc) var ( @@ -388,7 +392,10 @@ func TestSampleDeliveryTimeout(t *testing.T) { t.Parallel() for _, protoMsg := range []remoteapi.WriteMessageType{remoteapi.WriteV1MessageType, remoteapi.WriteV2MessageType} { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { - recs := testwal.GenerateRecords(recCase{Series: 10, SamplesPerSeries: 10}) + recs := testwal.GenerateRecords(recCase{ + NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. 
+ Series: 10, SamplesPerSeries: 10, + }) cfg := testDefaultQueueConfig() mcfg := config.DefaultMetadataConfig cfg.MaxShards = 1 @@ -417,7 +424,10 @@ func TestSampleDeliveryOrder(t *testing.T) { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { ts := 10 n := config.DefaultQueueConfig.MaxSamplesPerSend * ts - recs := testwal.GenerateRecords(recCase{Series: n, SamplesPerSeries: 1}) + recs := testwal.GenerateRecords(recCase{ + NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + Series: n, SamplesPerSeries: 1, + }) c, m := newTestClientAndQueueManager(t, defaultFlushDeadline, protoMsg) c.expectSamples(recs.Samples, recs.Series) @@ -446,7 +456,10 @@ func TestShutdown(t *testing.T) { m := newTestQueueManager(t, cfg, mcfg, deadline, c, protoMsg) // Send 2x batch size, so we know it will need at least two sends. n := 2 * config.DefaultQueueConfig.MaxSamplesPerSend - recs := testwal.GenerateRecords(recCase{Series: n / 1000, SamplesPerSeries: 1000}) + recs := testwal.GenerateRecords(recCase{ + NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + Series: n / 1000, SamplesPerSeries: 1000, + }) m.StoreSeries(recs.Series, 0) m.Start() @@ -515,7 +528,10 @@ func TestReshard(t *testing.T) { size := 10 // Make bigger to find more races. nSeries := 6 samplesPerSeries := config.DefaultQueueConfig.Capacity * size - recs := testwal.GenerateRecords(recCase{Series: nSeries, SamplesPerSeries: samplesPerSeries}) + recs := testwal.GenerateRecords(recCase{ + NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. 
+ Series: nSeries, SamplesPerSeries: samplesPerSeries, + }) t.Logf("about to send %v samples", len(recs.Samples)) cfg := config.DefaultQueueConfig @@ -591,7 +607,10 @@ func TestReshardPartialBatch(t *testing.T) { t.Parallel() for _, protoMsg := range []remoteapi.WriteMessageType{remoteapi.WriteV1MessageType, remoteapi.WriteV2MessageType} { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { - recs := testwal.GenerateRecords(recCase{Series: 1, SamplesPerSeries: 10}) + recs := testwal.GenerateRecords(recCase{ + NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + Series: 1, SamplesPerSeries: 10, + }) c := NewTestBlockedWriteClient() @@ -636,7 +655,10 @@ func TestReshardPartialBatch(t *testing.T) { func TestQueueFilledDeadlock(t *testing.T) { for _, protoMsg := range []remoteapi.WriteMessageType{remoteapi.WriteV1MessageType, remoteapi.WriteV2MessageType} { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { - recs := testwal.GenerateRecords(recCase{Series: 50, SamplesPerSeries: 1}) + recs := testwal.GenerateRecords(recCase{ + NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + Series: 50, SamplesPerSeries: 1, + }) c := NewNopWriteClient() @@ -835,8 +857,8 @@ func getSeriesIDFromRef(r record.RefSeries) string { // TestWriteClient represents write client which does not call remote storage, // but instead re-implements fake WriteHandler for test purposes. type TestWriteClient struct { - receivedSamples map[string][]prompb.Sample - expectedSamples map[string][]prompb.Sample + receivedSamples map[string][]writev2.Sample + expectedSamples map[string][]writev2.Sample receivedExemplars map[string][]prompb.Exemplar expectedExemplars map[string][]prompb.Exemplar receivedHistograms map[string][]prompb.Histogram @@ -860,8 +882,8 @@ type TestWriteClient struct { // NewTestWriteClient creates a new testing write client. 
func NewTestWriteClient(protoMsg remoteapi.WriteMessageType) *TestWriteClient { return &TestWriteClient{ - receivedSamples: map[string][]prompb.Sample{}, - expectedSamples: map[string][]prompb.Sample{}, + receivedSamples: map[string][]writev2.Sample{}, + expectedSamples: map[string][]writev2.Sample{}, receivedMetadata: map[string][]prompb.MetricMetadata{}, expectedMetadata: map[string][]prompb.MetricMetadata{}, protoMsg: protoMsg, @@ -876,18 +898,20 @@ func (c *TestWriteClient) injectErrors(injectedErrs []error) { c.retry = false } +// expectSamples injects samples that will be expected on waitForExpectedData. func (c *TestWriteClient) expectSamples(ss []record.RefSample, series []record.RefSeries) { c.mtx.Lock() defer c.mtx.Unlock() - c.expectedSamples = map[string][]prompb.Sample{} - c.receivedSamples = map[string][]prompb.Sample{} + c.expectedSamples = map[string][]writev2.Sample{} + c.receivedSamples = map[string][]writev2.Sample{} for _, s := range ss { tsID := getSeriesIDFromRef(series[s.Ref]) - c.expectedSamples[tsID] = append(c.expectedSamples[tsID], prompb.Sample{ - Timestamp: s.T, - Value: s.V, + c.expectedSamples[tsID] = append(c.expectedSamples[tsID], writev2.Sample{ + StartTimestamp: s.ST, + Timestamp: s.T, + Value: s.V, }) } } @@ -1065,7 +1089,10 @@ func (c *TestWriteClient) Store(_ context.Context, req []byte, _ int) (WriteResp } } - var reqProto *prompb.WriteRequest + var ( + reqProto *prompb.WriteRequest + reqProtoV2 *writev2.Request + ) switch c.protoMsg { case remoteapi.WriteV1MessageType: reqProto = &prompb.WriteRequest{} @@ -1073,10 +1100,10 @@ func (c *TestWriteClient) Store(_ context.Context, req []byte, _ int) (WriteResp case remoteapi.WriteV2MessageType: // NOTE(bwplotka): v1 msg can be unmarshaled to v2 sometimes, without // errors. 
- var reqProtoV2 writev2.Request - err = proto.Unmarshal(reqBuf, &reqProtoV2) + reqProtoV2 = &writev2.Request{} + err = proto.Unmarshal(reqBuf, reqProtoV2) if err == nil { - reqProto, err = v2RequestToWriteRequest(&reqProtoV2) + reqProto, err = v2RequestToWriteRequest(reqProtoV2) } } if err != nil { @@ -1085,11 +1112,21 @@ func (c *TestWriteClient) Store(_ context.Context, req []byte, _ int) (WriteResp rs := WriteResponseStats{} b := labels.NewScratchBuilder(0) - for _, ts := range reqProto.Timeseries { + for i, ts := range reqProto.Timeseries { labels := ts.ToLabels(&b, nil) tsID := labels.String() - if len(ts.Samples) > 0 { - c.receivedSamples[tsID] = append(c.receivedSamples[tsID], ts.Samples...) + for j, s := range ts.Samples { + st := int64(0) + if reqProtoV2 != nil { + // TODO(bwplotka): Refactor queue manager TestWriteClient for tighter validation + // and native support for new RW2 features. For now we inject STs in RW2 case to the existing test suite. + st = reqProtoV2.Timeseries[i].Samples[j].StartTimestamp + } + c.receivedSamples[tsID] = append(c.receivedSamples[tsID], writev2.Sample{ + StartTimestamp: st, + Timestamp: s.Timestamp, + Value: s.Value, + }) } rs.Samples += len(ts.Samples) @@ -1265,6 +1302,13 @@ var extraLabels []labels.Label = []labels.Label{ {Name: "pod_name", Value: "some-other-name-5j8s8"}, } +// Recommended CLI invocation(s): +/* + export bench=sampleSend && go test ./storage/remote/... \ + -run '^$' -bench '^BenchmarkSampleSend' \ + -benchtime 1s -count 6 -cpu 2 -timeout 999m -benchmem \ + | tee ${bench}.txt +*/ func BenchmarkSampleSend(b *testing.B) { // Send one sample per series, which is the typical remote_write case const numSamples = 1 @@ -1771,6 +1815,13 @@ func createDummyTimeSeries(instances int) []timeSeries { return result } +// Recommended CLI invocation(s): +/* + export bench=buildWriteRequest && go test ./storage/remote/... 
\ + -run '^$' -bench '^BenchmarkBuildWriteRequest' \ + -benchtime 1s -count 6 -cpu 2 -timeout 999m -benchmem \ + | tee ${bench}.txt +*/ func BenchmarkBuildWriteRequest(b *testing.B) { noopLogger := promslog.NewNopLogger() bench := func(b *testing.B, batch []timeSeries) { @@ -1811,6 +1862,13 @@ func BenchmarkBuildWriteRequest(b *testing.B) { }) } +// Recommended CLI invocation(s): +/* + export bench=buildV2WriteRequest && go test ./storage/remote/... \ + -run '^$' -bench '^BenchmarkBuildV2WriteRequest' \ + -benchtime 1s -count 6 -cpu 2 -timeout 999m -benchmem \ + | tee ${bench}.txt +*/ func BenchmarkBuildV2WriteRequest(b *testing.B) { noopLogger := promslog.NewNopLogger() bench := func(b *testing.B, batch []timeSeries) { @@ -1860,7 +1918,9 @@ func TestDropOldTimeSeries(t *testing.T) { size := 10 nSeries := 6 nSamples := config.DefaultQueueConfig.Capacity * size + noST := protoMsg == remoteapi.WriteV1MessageType // RW1 pastRecs := testwal.GenerateRecords(recCase{ + NoST: noST, Series: nSeries, SamplesPerSeries: (nSamples / nSeries) / 2, // Half data is past. TsFn: func(_, j int) int64 { @@ -1869,6 +1929,7 @@ func TestDropOldTimeSeries(t *testing.T) { }, }) newRecs := testwal.GenerateRecords(recCase{ + NoST: noST, Series: nSeries, SamplesPerSeries: (nSamples / nSeries) / 2, // Half data is past. TsFn: func(_, j int) int64 { @@ -1943,6 +2004,7 @@ func TestSendSamplesWithBackoffWithSampleAgeLimit(t *testing.T) { r := rand.New(rand.NewSource(99)) recs := testwal.GenerateRecords(recCase{ + NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. 
Series: numberOfSeries, SamplesPerSeries: 1, TsFn: func(_, _ int) int64 { @@ -1967,9 +2029,10 @@ func TestSendSamplesWithBackoffWithSampleAgeLimit(t *testing.T) { if !shouldBeDropped { for _, s := range recs.Samples { tsID := getSeriesIDFromRef(recs.Series[s.Ref]) - c.expectedSamples[tsID] = append(c.expectedSamples[tsID], prompb.Sample{ - Timestamp: s.T, - Value: s.V, + c.expectedSamples[tsID] = append(c.expectedSamples[tsID], writev2.Sample{ + StartTimestamp: s.ST, + Timestamp: s.T, + Value: s.V, }) } } @@ -2490,7 +2553,10 @@ func TestHighestTimestampOnAppend(t *testing.T) { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { nSamples := 11 * config.DefaultQueueConfig.Capacity nSeries := 3 - recs := testwal.GenerateRecords(recCase{Series: nSeries, SamplesPerSeries: nSamples / nSeries}) + recs := testwal.GenerateRecords(recCase{ + NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + Series: nSeries, SamplesPerSeries: nSamples / nSeries, + }) _, m := newTestClientAndQueueManager(t, defaultFlushDeadline, protoMsg) m.Start() diff --git a/storage/series.go b/storage/series.go index bf6df7db3e..e51f8cfd96 100644 --- a/storage/series.go +++ b/storage/series.go @@ -341,11 +341,14 @@ func (s *seriesToChunkEncoder) Iterator(it chunks.Iterator) chunks.Iterator { i := 0 seriesIter := s.Series.Iterator(nil) lastType := chunkenc.ValNone + lastHadST := false for typ := seriesIter.Next(); typ != chunkenc.ValNone; typ = seriesIter.Next() { - if typ != lastType || i >= seriesToChunkEncoderSplit { + st := seriesIter.AtST() + hasST := st != 0 + if typ != lastType || lastHadST != hasST || i >= seriesToChunkEncoderSplit { // Create a new chunk if the sample type changed or too many samples in the current one. 
chks = appendChunk(chks, mint, maxt, chk) - chk, err = chunkenc.NewEmptyChunk(typ.ChunkEncoding()) + chk, err = typ.NewChunk(hasST) if err != nil { return errChunksIterator{err: err} } @@ -358,21 +361,20 @@ func (s *seriesToChunkEncoder) Iterator(it chunks.Iterator) chunks.Iterator { i = 0 } lastType = typ + lastHadST = hasST var ( - st, t int64 - v float64 - h *histogram.Histogram - fh *histogram.FloatHistogram + t int64 + v float64 + h *histogram.Histogram + fh *histogram.FloatHistogram ) switch typ { case chunkenc.ValFloat: t, v = seriesIter.At() - st = seriesIter.AtST() app.Append(st, t, v) case chunkenc.ValHistogram: t, h = seriesIter.AtHistogram(nil) - st = seriesIter.AtST() newChk, recoded, app, err = app.AppendHistogram(nil, st, t, h, false) if err != nil { return errChunksIterator{err: err} @@ -388,7 +390,6 @@ func (s *seriesToChunkEncoder) Iterator(it chunks.Iterator) chunks.Iterator { } case chunkenc.ValFloatHistogram: t, fh = seriesIter.AtFloatHistogram(nil) - st = seriesIter.AtST() newChk, recoded, app, err = app.AppendFloatHistogram(nil, st, t, fh, false) if err != nil { return errChunksIterator{err: err} diff --git a/tsdb/agent/db.go b/tsdb/agent/db.go index caa494415a..664805ea11 100644 --- a/tsdb/agent/db.go +++ b/tsdb/agent/db.go @@ -95,7 +95,9 @@ type Options struct { // EnableSTStorage determines whether agent DB should write a Start Timestamp (ST) // per sample to WAL. - // TODO(bwplotka): Implement this option as per PROM-60, currently it's noop. + // Controlled by the `--enable-feature=st-storage` CLI flag; when enabled, ST is + // persisted to the WAL for samples that include a non-zero start timestamp in + // supported record types. 
EnableSTStorage bool } @@ -490,7 +492,7 @@ func (db *DB) loadWAL(r *wlog.Reader, duplicateRefToValidRef map[chunks.HeadSeri return } decoded <- series - case record.Samples: + case record.Samples, record.SamplesV2: samples := db.walReplaySamplesPool.Get()[:0] samples, err = dec.Samples(rec, samples) if err != nil { @@ -750,7 +752,7 @@ func (db *DB) truncate(mint int64) error { db.metrics.checkpointCreationTotal.Inc() - if _, err = wlog.Checkpoint(db.logger, db.wal, first, last, db.keepSeriesInWALCheckpointFn(last), mint); err != nil { + if _, err = wlog.Checkpoint(db.logger, db.wal, first, last, db.keepSeriesInWALCheckpointFn(last), mint, db.opts.EnableSTStorage); err != nil { db.metrics.checkpointCreationFail.Inc() var cerr *wlog.CorruptionErr if errors.As(err, &cerr) { @@ -1196,7 +1198,7 @@ func (a *appenderBase) log() error { a.mtx.RLock() defer a.mtx.RUnlock() - var encoder record.Encoder + encoder := record.Encoder{EnableSTStorage: a.opts.EnableSTStorage} buf := a.bufPool.Get().([]byte) defer func() { a.bufPool.Put(buf) //nolint:staticcheck @@ -1320,7 +1322,7 @@ func (a *appenderBase) logSeries() error { a.bufPool.Put(buf) //nolint:staticcheck }() - var encoder record.Encoder + encoder := record.Encoder{EnableSTStorage: a.opts.EnableSTStorage} buf = encoder.Series(a.pendingSeries, buf) if err := a.wal.Log(buf); err != nil { return err diff --git a/tsdb/agent/db_append_v2.go b/tsdb/agent/db_append_v2.go index bb2601e1e3..b963608637 100644 --- a/tsdb/agent/db_append_v2.go +++ b/tsdb/agent/db_append_v2.go @@ -72,7 +72,6 @@ func (a *appenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t int64 lastTS := s.lastTs s.Unlock() - // TODO(bwplotka): Handle ST natively (as per PROM-60). 
if a.opts.EnableSTAsZeroSample && st != 0 { a.bestEffortAppendSTZeroSample(s, ls, lastTS, st, t, h, fh) } @@ -86,6 +85,7 @@ func (a *appenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t int64 case fh != nil: isStale = value.IsStaleNaN(fh.Sum) // NOTE: always modify pendingFloatHistograms and floatHistogramSeries together + // TODO(krajorama,ywwg,bwplotka): Pass ST when available in WAL. a.pendingFloatHistograms = append(a.pendingFloatHistograms, record.RefFloatHistogramSample{ Ref: s.ref, T: t, @@ -95,6 +95,7 @@ func (a *appenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t int64 case h != nil: isStale = value.IsStaleNaN(h.Sum) // NOTE: always modify pendingHistograms and histogramSeries together + // TODO(krajorama,ywwg,bwplotka): Pass ST when available in WAL. a.pendingHistograms = append(a.pendingHistograms, record.RefHistogramSample{ Ref: s.ref, T: t, @@ -107,6 +108,7 @@ func (a *appenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t int64 // NOTE: always modify pendingSamples and sampleSeries together. a.pendingSamples = append(a.pendingSamples, record.RefSample{ Ref: s.ref, + ST: st, T: t, V: v, }) diff --git a/tsdb/agent/db_append_v2_test.go b/tsdb/agent/db_append_v2_test.go index 3e10a1163b..92a5bb8f35 100644 --- a/tsdb/agent/db_append_v2_test.go +++ b/tsdb/agent/db_append_v2_test.go @@ -18,6 +18,7 @@ import ( "fmt" "math" "path/filepath" + "strconv" "testing" "time" @@ -89,278 +90,301 @@ func TestDB_InvalidSeries_AppendV2(t *testing.T) { }) } +// TestCommit_AppendV2 tests Appender commit. +// TODO(bwplotka): Rewrite this so Refs are generated, then appended, then expected so we test the +// exact data durability. 
func TestCommit_AppendV2(t *testing.T) { const ( numDatapoints = 1000 numHistograms = 100 numSeries = 8 ) + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + opts := DefaultOptions() + opts.EnableSTStorage = enableSTStorage + s := createTestAgentDB(t, nil, opts) - s := createTestAgentDB(t, nil, DefaultOptions()) - app := s.AppenderV2(context.TODO()) + var ( + expectedSampleSTs []int64 + gotSampleSTs []int64 + ) + if enableSTStorage { + expectedSampleSTs = make([]int64, 0, numSeries*numDatapoints) + gotSampleSTs = make([]int64, 0, numSeries*numDatapoints) + } - lbls := labelsForTest(t.Name(), numSeries) - for _, l := range lbls { - lset := labels.New(l...) + app := s.AppenderV2(t.Context()) + lbls := labelsForTest(t.Name(), numSeries) + for _, l := range lbls { + lset := labels.New(l...) - for i := range numDatapoints { - sample := chunks.GenerateSamples(0, 1) - _, err := app.Append(0, lset, 0, sample[0].T(), sample[0].F(), nil, nil, storage.AOptions{ - Exemplars: []exemplar.Exemplar{{ - Labels: lset, - Ts: sample[0].T() + int64(i), - Value: sample[0].F(), - HasTs: true, - }}, - }) + for i := range numDatapoints { + sample := chunks.GenerateSamples(0, 1) + st := int64(i + 1234) + _, err := app.Append(0, lset, st, sample[0].T()+2000, sample[0].F(), nil, nil, storage.AOptions{ + Exemplars: []exemplar.Exemplar{{ + Labels: lset, + Ts: sample[0].T() + int64(i) + 2000, + Value: sample[0].F(), + HasTs: true, + }}, + }) + require.NoError(t, err) + if enableSTStorage { + expectedSampleSTs = append(expectedSampleSTs, st) + } + } + } + + lbls = labelsForTest(t.Name()+"_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) 
+ + histograms := tsdbutil.GenerateTestHistograms(numHistograms) + + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i+2234), int64(i+2000), 0, histograms[i], nil, storage.AOptions{}) + require.NoError(t, err) + } + } + + lbls = labelsForTest(t.Name()+"_custom_buckets_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) + + customBucketHistograms := tsdbutil.GenerateTestCustomBucketsHistograms(numHistograms) + + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i+3234), int64(i+2000), 0, customBucketHistograms[i], nil, storage.AOptions{}) + require.NoError(t, err) + } + } + + lbls = labelsForTest(t.Name()+"_float_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) + + floatHistograms := tsdbutil.GenerateTestFloatHistograms(numHistograms) + + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i+4234), int64(i+2000), 0, nil, floatHistograms[i], storage.AOptions{}) + require.NoError(t, err) + } + } + + lbls = labelsForTest(t.Name()+"_custom_buckets_float_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) + + customBucketFloatHistograms := tsdbutil.GenerateTestCustomBucketsFloatHistograms(numHistograms) + + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i+5234), int64(i+2000), 0, nil, customBucketFloatHistograms[i], storage.AOptions{}) + require.NoError(t, err) + } + } + + require.NoError(t, app.Commit()) + require.NoError(t, s.Close()) + + sr, err := wlog.NewSegmentsReader(s.wal.Dir()) require.NoError(t, err) - } + defer func() { + require.NoError(t, sr.Close()) + }() + + // Read records from WAL and check for expected count of series, samples, and exemplars. 
+ var ( + r = wlog.NewReader(sr) + dec = record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) + + walSeriesCount, walSamplesCount, walExemplarsCount, walHistogramCount, walFloatHistogramCount int + ) + for r.Next() { + rec := r.Record() + switch dec.Type(rec) { + case record.Series: + var series []record.RefSeries + series, err = dec.Series(rec, series) + require.NoError(t, err) + walSeriesCount += len(series) + + case record.Samples: + if enableSTStorage { + t.Errorf("Got V1 Samples when ST enabled") + } + var samples []record.RefSample + samples, err = dec.Samples(rec, samples) + require.NoError(t, err) + walSamplesCount += len(samples) + case record.SamplesV2: + if !enableSTStorage { + t.Errorf("Got V2 Samples when ST disabled") + } + var samples []record.RefSample + samples, err = dec.Samples(rec, samples) + require.NoError(t, err) + + for _, s := range samples { + gotSampleSTs = append(gotSampleSTs, s.ST) + } + walSamplesCount += len(samples) + + case record.HistogramSamples, record.CustomBucketsHistogramSamples: + var histograms []record.RefHistogramSample + histograms, err = dec.HistogramSamples(rec, histograms) + require.NoError(t, err) + walHistogramCount += len(histograms) + + case record.FloatHistogramSamples, record.CustomBucketsFloatHistogramSamples: + var floatHistograms []record.RefFloatHistogramSample + floatHistograms, err = dec.FloatHistogramSamples(rec, floatHistograms) + require.NoError(t, err) + walFloatHistogramCount += len(floatHistograms) + + case record.Exemplars: + var exemplars []record.RefExemplar + exemplars, err = dec.Exemplars(rec, exemplars) + require.NoError(t, err) + walExemplarsCount += len(exemplars) + + default: + } + } + + // Check that the WAL contained the same number of committed series/samples/exemplars. 
+ require.Equal(t, numSeries*5, walSeriesCount, "unexpected number of series") + require.Equal(t, numSeries*numDatapoints, walSamplesCount, "unexpected number of samples") + require.Equal(t, expectedSampleSTs, gotSampleSTs, "unexpected STs received") + require.Equal(t, numSeries*numDatapoints, walExemplarsCount, "unexpected number of exemplars") + require.Equal(t, numSeries*numHistograms*2, walHistogramCount, "unexpected number of histograms") + require.Equal(t, numSeries*numHistograms*2, walFloatHistogramCount, "unexpected number of float histograms") + + // Check that we can still create both kinds of Appender. + // Regression test against https://github.com/prometheus/prometheus/issues/17800. + _ = s.Appender(t.Context()) + _ = s.AppenderV2(t.Context()) + }) } - - lbls = labelsForTest(t.Name()+"_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) - - histograms := tsdbutil.GenerateTestHistograms(numHistograms) - - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, histograms[i], nil, storage.AOptions{}) - require.NoError(t, err) - } - } - - lbls = labelsForTest(t.Name()+"_custom_buckets_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) - - customBucketHistograms := tsdbutil.GenerateTestCustomBucketsHistograms(numHistograms) - - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, customBucketHistograms[i], nil, storage.AOptions{}) - require.NoError(t, err) - } - } - - lbls = labelsForTest(t.Name()+"_float_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) - - floatHistograms := tsdbutil.GenerateTestFloatHistograms(numHistograms) - - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, nil, floatHistograms[i], storage.AOptions{}) - require.NoError(t, err) - } - } - - lbls = labelsForTest(t.Name()+"_custom_buckets_float_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) 
- - customBucketFloatHistograms := tsdbutil.GenerateTestCustomBucketsFloatHistograms(numHistograms) - - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, nil, customBucketFloatHistograms[i], storage.AOptions{}) - require.NoError(t, err) - } - } - - require.NoError(t, app.Commit()) - require.NoError(t, s.Close()) - - sr, err := wlog.NewSegmentsReader(s.wal.Dir()) - require.NoError(t, err) - defer func() { - require.NoError(t, sr.Close()) - }() - - // Read records from WAL and check for expected count of series, samples, and exemplars. - var ( - r = wlog.NewReader(sr) - dec = record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) - - walSeriesCount, walSamplesCount, walExemplarsCount, walHistogramCount, walFloatHistogramCount int - ) - for r.Next() { - rec := r.Record() - switch dec.Type(rec) { - case record.Series: - var series []record.RefSeries - series, err = dec.Series(rec, series) - require.NoError(t, err) - walSeriesCount += len(series) - - case record.Samples: - var samples []record.RefSample - samples, err = dec.Samples(rec, samples) - require.NoError(t, err) - walSamplesCount += len(samples) - - case record.HistogramSamples, record.CustomBucketsHistogramSamples: - var histograms []record.RefHistogramSample - histograms, err = dec.HistogramSamples(rec, histograms) - require.NoError(t, err) - walHistogramCount += len(histograms) - - case record.FloatHistogramSamples, record.CustomBucketsFloatHistogramSamples: - var floatHistograms []record.RefFloatHistogramSample - floatHistograms, err = dec.FloatHistogramSamples(rec, floatHistograms) - require.NoError(t, err) - walFloatHistogramCount += len(floatHistograms) - - case record.Exemplars: - var exemplars []record.RefExemplar - exemplars, err = dec.Exemplars(rec, exemplars) - require.NoError(t, err) - walExemplarsCount += len(exemplars) - - default: - } - } - - // Check that the WAL contained the same number of committed series/samples/exemplars. 
- require.Equal(t, numSeries*5, walSeriesCount, "unexpected number of series") - require.Equal(t, numSeries*numDatapoints, walSamplesCount, "unexpected number of samples") - require.Equal(t, numSeries*numDatapoints, walExemplarsCount, "unexpected number of exemplars") - require.Equal(t, numSeries*numHistograms*2, walHistogramCount, "unexpected number of histograms") - require.Equal(t, numSeries*numHistograms*2, walFloatHistogramCount, "unexpected number of float histograms") - - // Check that we can still create both kinds of Appender - see https://github.com/prometheus/prometheus/issues/17800. - _ = s.Appender(context.TODO()) - _ = s.AppenderV2(context.TODO()) } -func TestRollback_AppendV2(t *testing.T) { +func TestRollbackAppendV2(t *testing.T) { const ( numDatapoints = 1000 numHistograms = 100 numSeries = 8 ) - s := createTestAgentDB(t, nil, DefaultOptions()) - app := s.AppenderV2(context.TODO()) + for _, enableSTStorage := range []bool{false, true} { + opts := DefaultOptions() + opts.EnableSTStorage = enableSTStorage + s := createTestAgentDB(t, nil, opts) + app := s.AppenderV2(context.TODO()) - lbls := labelsForTest(t.Name(), numSeries) - for _, l := range lbls { - lset := labels.New(l...) + lbls := labelsForTest(t.Name(), numSeries) + for _, l := range lbls { + lset := labels.New(l...) - for range numDatapoints { - sample := chunks.GenerateSamples(0, 1) - _, err := app.Append(0, lset, 0, sample[0].T(), sample[0].F(), nil, nil, storage.AOptions{}) - require.NoError(t, err) + for i := range numDatapoints { + sample := chunks.GenerateSamples(0, 1) + _, err := app.Append(0, lset, int64(i), sample[0].T()+2000, sample[0].F(), nil, nil, storage.AOptions{}) + require.NoError(t, err) + } } - } - lbls = labelsForTest(t.Name()+"_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) + lbls = labelsForTest(t.Name()+"_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) 
- histograms := tsdbutil.GenerateTestHistograms(numHistograms) + histograms := tsdbutil.GenerateTestHistograms(numHistograms) - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, histograms[i], nil, storage.AOptions{}) - require.NoError(t, err) + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, histograms[i], nil, storage.AOptions{}) + require.NoError(t, err) + } } - } - lbls = labelsForTest(t.Name()+"_custom_buckets_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) + lbls = labelsForTest(t.Name()+"_custom_buckets_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) - histograms := tsdbutil.GenerateTestCustomBucketsHistograms(numHistograms) + histograms := tsdbutil.GenerateTestCustomBucketsHistograms(numHistograms) - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, histograms[i], nil, storage.AOptions{}) - require.NoError(t, err) + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, histograms[i], nil, storage.AOptions{}) + require.NoError(t, err) + } } - } - lbls = labelsForTest(t.Name()+"_float_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) + lbls = labelsForTest(t.Name()+"_float_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) - floatHistograms := tsdbutil.GenerateTestFloatHistograms(numHistograms) + floatHistograms := tsdbutil.GenerateTestFloatHistograms(numHistograms) - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, nil, floatHistograms[i], storage.AOptions{}) - require.NoError(t, err) + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, nil, floatHistograms[i], storage.AOptions{}) + require.NoError(t, err) + } } - } - lbls = labelsForTest(t.Name()+"_custom_buckets_float_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) 
+ lbls = labelsForTest(t.Name()+"_custom_buckets_float_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) - floatHistograms := tsdbutil.GenerateTestCustomBucketsFloatHistograms(numHistograms) + floatHistograms := tsdbutil.GenerateTestCustomBucketsFloatHistograms(numHistograms) - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, nil, floatHistograms[i], storage.AOptions{}) - require.NoError(t, err) + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, nil, floatHistograms[i], storage.AOptions{}) + require.NoError(t, err) + } } - } - // Do a rollback, which should clear uncommitted data. A followup call to - // commit should persist nothing to the WAL. - require.NoError(t, app.Rollback()) - require.NoError(t, app.Commit()) - require.NoError(t, s.Close()) + // Do a rollback, which should clear uncommitted data. A followup call to + // commit should persist nothing to the WAL. + require.NoError(t, app.Rollback()) + require.NoError(t, app.Commit()) + require.NoError(t, s.Close()) - sr, err := wlog.NewSegmentsReader(s.wal.Dir()) - require.NoError(t, err) - defer func() { - require.NoError(t, sr.Close()) - }() + sr, err := wlog.NewSegmentsReader(s.wal.Dir()) + require.NoError(t, err) + defer func() { + require.NoError(t, sr.Close()) + }() - // Read records from WAL and check for expected count of series and samples. - var ( - r = wlog.NewReader(sr) - dec = record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) + // Read records from WAL and check for expected count of series and samples. 
+ var ( + r = wlog.NewReader(sr) + dec = record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) - walSeriesCount, walSamplesCount, walHistogramCount, walFloatHistogramCount, walExemplarsCount int - ) - for r.Next() { - rec := r.Record() - switch dec.Type(rec) { - case record.Series: - var series []record.RefSeries - series, err = dec.Series(rec, series) - require.NoError(t, err) - walSeriesCount += len(series) + walSeriesCount int + ) + for r.Next() { + rec := r.Record() + switch dec.Type(rec) { + case record.Series: + var series []record.RefSeries + series, err = dec.Series(rec, series) + require.NoError(t, err) + walSeriesCount += len(series) - case record.Samples: - var samples []record.RefSample - samples, err = dec.Samples(rec, samples) - require.NoError(t, err) - walSamplesCount += len(samples) + case record.Samples, record.SamplesV2: + t.Errorf("should not have found samples") - case record.Exemplars: - var exemplars []record.RefExemplar - exemplars, err = dec.Exemplars(rec, exemplars) - require.NoError(t, err) - walExemplarsCount += len(exemplars) + case record.Exemplars: + t.Errorf("should not have found exemplars") - case record.HistogramSamples, record.CustomBucketsHistogramSamples: - var histograms []record.RefHistogramSample - histograms, err = dec.HistogramSamples(rec, histograms) - require.NoError(t, err) - walHistogramCount += len(histograms) + case record.HistogramSamples, record.CustomBucketsHistogramSamples, record.FloatHistogramSamples, record.CustomBucketsFloatHistogramSamples: + t.Errorf("should not have found histograms") - case record.FloatHistogramSamples, record.CustomBucketsFloatHistogramSamples: - var floatHistograms []record.RefFloatHistogramSample - floatHistograms, err = dec.FloatHistogramSamples(rec, floatHistograms) - require.NoError(t, err) - walFloatHistogramCount += len(floatHistograms) - - default: + default: + } } - } - // Check that only series get stored after calling Rollback. 
- require.Equal(t, numSeries*5, walSeriesCount, "series should have been written to WAL") - require.Equal(t, 0, walSamplesCount, "samples should not have been written to WAL") - require.Equal(t, 0, walExemplarsCount, "exemplars should not have been written to WAL") - require.Equal(t, 0, walHistogramCount, "histograms should not have been written to WAL") - require.Equal(t, 0, walFloatHistogramCount, "float histograms should not have been written to WAL") + // Check that only series get stored after calling Rollback. + require.Equal(t, numSeries*5, walSeriesCount, "series should have been written to WAL") + } } func TestFullTruncateWAL_AppendV2(t *testing.T) { diff --git a/tsdb/agent/db_test.go b/tsdb/agent/db_test.go index e6b8cadc22..4450565674 100644 --- a/tsdb/agent/db_test.go +++ b/tsdb/agent/db_test.go @@ -226,7 +226,7 @@ func TestCommit(t *testing.T) { require.NoError(t, err) walSeriesCount += len(series) - case record.Samples: + case record.Samples, record.SamplesV2: var samples []record.RefSample samples, err = dec.Samples(rec, samples) require.NoError(t, err) @@ -362,7 +362,7 @@ func TestRollback(t *testing.T) { require.NoError(t, err) walSeriesCount += len(series) - case record.Samples: + case record.Samples, record.SamplesV2: var samples []record.RefSample samples, err = dec.Samples(rec, samples) require.NoError(t, err) @@ -1425,7 +1425,7 @@ func readWALSamples(t *testing.T, walDir string) []walSample { series, err := dec.Series(rec, nil) require.NoError(t, err) lastSeries = series[0] - case record.Samples: + case record.Samples, record.SamplesV2: samples, err = dec.Samples(rec, samples[:0]) require.NoError(t, err) for _, s := range samples { diff --git a/tsdb/chunkenc/benchmark_test.go b/tsdb/chunkenc/benchmark_test.go new file mode 100644 index 0000000000..3f77b14ca3 --- /dev/null +++ b/tsdb/chunkenc/benchmark_test.go @@ -0,0 +1,343 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may 
not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package chunkenc + +import ( + "errors" + "fmt" + "io" + "math" + "math/rand" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/stretchr/testify/require" + + "github.com/prometheus/prometheus/model/timestamp" +) + +type sampleCase struct { + name string + samples []triple +} + +type fmtCase struct { + name string + newChunkFn func() Chunk + stUnsupported bool +} + +func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampleCase)) { + const nSamples = 120 // Same as tsdb.DefaultSamplesPerChunk. + + d, err := time.Parse(time.DateTime, "2025-11-04 10:01:05") + require.NoError(b, err) + + var ( + r = rand.New(rand.NewSource(1)) // Fixed seed for reproducibility. + initST = timestamp.FromTime(d) // Use realistic timestamp. + initT = initST + 15000 // 15s after initST. + initV = 1243535.123 + rInts = make([]int64, 2*nSamples) // Random ints for timestamps and STs. + rFloats = make([]float64, nSamples) + ) + + // Pre-generate random numbers so that adding/removing cases does not change + // the generated samples. + for i := range nSamples { + rInts[i] = int64(r.Intn(100)) + rInts[nSamples+i] = int64(r.Intn(100)) + rFloats[i] = float64(r.Intn(100)) + } + + // tPatterns control how the regular timestamp advances. + type tPattern struct { + name string + next func(t int64, i int) int64 + } + // vPatterns control how the value advances. 
+ type vPattern struct { + name string + next func(v float64, i int) float64 + } + // stPatterns compute the start timestamp from the previous t (before the + // step), the new t (after the step), and the sample index. + type stPattern struct { + name string + compute func(prevT, newT int64, i int) int64 + } + + tPatterns := []tPattern{ + { + name: "t=constant", + next: func(t int64, _ int) int64 { return t + 15000 }, + }, + { + // 15 seconds ± up to 100ms of jitter. + name: "t=jitter", + next: func(t int64, i int) int64 { return t + rInts[i] - 50 + 15000 }, + }, + { + // First 10 samples at constant 60s, then one 10-interval gap (600s), + // then 60s ± 30ms jitter. The gap triggers XOR18111 full mode via + // multiplier encoding (dod=540000 = 9×60000). Subsequent small-jitter + // delta-of-deltas (≤30ms) use XOR18111's 7-bit full-mode code (9 bits + // total) vs XOR compact's minimum 14-bit code (16 bits total). + name: "t=gap-jitter", + next: func(t int64, i int) int64 { + if i < 10 { + return t + 60000 + } + if i == 10 { + return t + 10*60000 // 10-interval gap; triggers XOR18111 full mode. + } + return t + 60000 + rInts[i]%61 - 30 // 60s ± 30ms jitter. + }, + }, + } + vPatterns := []vPattern{ + { + name: "v=constant", + next: func(v float64, _ int) float64 { return v }, + }, + // We are not interested in float compression we're not changing it. + // { + // // Varying from -50 to +50 in 100 discrete steps. + // name: "v=rand-steps", + // next: func(v float64, i int) float64 { return v + rFloats[i] - 50 }, + // }, + // { + // // Random increment between 0 and 1.0. + // name: "v=rand0-1", + // next: func(v float64, i int) float64 { return v + rFloats[i]/100.0 }, + // }, + // { + // // Random decrement between 0 and -1.0. Tests negative varint encoding; + // // see https://victoriametrics.com/blog/go-protobuf/. 
+ // name: "v=nrand0-1", + // next: func(v float64, i int) float64 { return v - rFloats[i]/100.0 }, + // }, + } + stPatterns := []stPattern{ + { + name: "st=0", + compute: func(_, _ int64, _ int) int64 { return 0 }, + }, + { + // Constant ST throughout the chunk, typical for long-running counters. + name: "st=cumulative", + compute: func(_, _ int64, _ int) int64 { return initST }, + }, + { + // ST is just after the previous sample's t: tight delta interval. + name: "st=delta-excl", + compute: func(prevT, _ int64, _ int) int64 { return prevT + 1 }, + }, + { + // ST equals the previous sample's t: inclusive delta interval. + name: "st=delta-incl", + compute: func(prevT, _ int64, _ int) int64 { return prevT }, + }, + { + // ST equals the current sample's t. + name: "st=t", + compute: func(_, newT int64, _ int) int64 { return newT }, + }, + { + // ST is equal to the previous t plus up to 100ms of jitter. + name: "st=delta-jitter", + compute: func(prevT, _ int64, i int) int64 { return prevT + rInts[nSamples+i] }, + }, + { + // Cumulative ST with periodic resets 10s before the current t. + name: "st=cum-resets", + compute: func(_, newT int64, i int) int64 { + if i%6 == 5 { + return newT - 10000 + } + return initST + }, + }, + { + // Cumulative ST with periodic zero resets. 
+ name: "st=cum-zeros", + compute: func(_, _ int64, i int) int64 { + if i%6 == 5 { + return 0 + } + return initST + }, + }, + } + + var sampleCases []sampleCase + for _, tp := range tPatterns { + for _, vp := range vPatterns { + for _, sp := range stPatterns { + samples := make([]triple, 0, nSamples) + t, v := initT, initV + for i := range nSamples { + prevT := t + t = tp.next(t, i) + v = vp.next(v, i) + st := sp.compute(prevT, t, i) + samples = append(samples, triple{st: st, t: t, v: v}) + } + sampleCases = append(sampleCases, sampleCase{ + name: tp.name + "/" + vp.name + "/" + sp.name, + samples: samples, + }) + } + } + } + + for _, f := range []fmtCase{ + {name: "XOR", newChunkFn: func() Chunk { return NewXORChunk() }, stUnsupported: true}, + {name: "XOR2", newChunkFn: func() Chunk { return NewXOR2Chunk() }}, + } { + for _, s := range sampleCases { + b.Run(fmt.Sprintf("fmt=%s/%s", f.name, s.name), func(b *testing.B) { + fn(b, f, s) + }) + } + } +} + +/* + export bench=bw.bench/append.v2 && go test \ + -run '^$' -bench '^BenchmarkAppender' \ + -benchtime 1s -count 6 -cpu 2 -timeout 999m \ + | tee ${bench}.txt + +For profiles: + + export bench=bw.bench/appendprof && go test \ + -run '^$' -bench '^BenchmarkAppender' \ + -benchtime 1s -count 1 -cpu 2 -timeout 999m \ + -cpuprofile=${bench}.cpu.pprof \ + | tee ${bench}.txt +*/ +func BenchmarkAppender(b *testing.B) { + foreachFmtSampleCase(b, func(b *testing.B, f fmtCase, s sampleCase) { + b.ReportAllocs() + + for b.Loop() { + c := f.newChunkFn() + + a, err := c.Appender() + if err != nil { + b.Fatalf("get appender: %s", err) + } + for _, p := range s.samples { + a.Append(p.st, p.t, p.v) + } + // NOTE: Some buffered implementations only encode on Bytes(). 
+ b.ReportMetric(float64(len(c.Bytes())), "B/chunk") + + require.Equal(b, len(s.samples), c.NumSamples()) + } + }) +} + +/* + export bench=bw.bench/iter && go test \ + -run '^$' -bench '^BenchmarkIterator' \ + -benchtime 1s -count 6 -cpu 2 -timeout 999m \ + | tee ${bench}.txt + +For profiles: + + export bench=bw.bench/iterprof && go test \ + -run '^$' -bench '^BenchmarkIterator' \ + -benchtime 1000000x -count 1 -cpu 2 -timeout 999m \ + -cpuprofile=${bench}.cpu.pprof \ + | tee ${bench}.txt + export bench=bw.bench/iterprof && go test \ + -run '^$' -bench '^BenchmarkIterator' \ + -benchtime 1000000x -count 1 -cpu 2 -timeout 999m \ + -memprofile=${bench}.mem.pprof \ + | tee ${bench}.txt +*/ +func BenchmarkIterator(b *testing.B) { + foreachFmtSampleCase(b, func(b *testing.B, f fmtCase, s sampleCase) { + floatEquals := func(a, b float64) bool { + return a == b + } + if f.name == "ALPBuffered" { + // Hack as ALP loses precision. + floatEquals = func(a, b float64) bool { + return math.Abs(a-b) < 1e-6 + } + } + b.ReportAllocs() + + c := f.newChunkFn() + a, err := c.Appender() + if err != nil { + b.Fatalf("get appender: %s", err) + } + for _, p := range s.samples { + a.Append(p.st, p.t, p.v) + } + + // Some chunk implementations might be buffered. Reset to ensure we don't reuse + // appending buffers. + c.Reset(c.Bytes()) + + // While we are at it, test if encoding/decoding works. + it := c.Iterator(nil) + require.Equal(b, len(s.samples), c.NumSamples()) + var got []triple + for i := 0; it.Next() == ValFloat; i++ { + t, v := it.At() + got = append(got, triple{st: it.AtST(), t: t, v: v}) + } + if err := it.Err(); err != nil && !errors.Is(err, io.EOF) { + require.NoError(b, err) + } + expectedSamples := s.samples + if f.stUnsupported { + // If the format does not support ST, zero them out for comparison. 
+ expectedSamples = make([]triple, len(s.samples)) + copy(expectedSamples, s.samples) + for i := range s.samples { + expectedSamples[i].st = 0 + } + } + if diff := cmp.Diff(expectedSamples, got, cmp.AllowUnexported(triple{}), cmp.Comparer(floatEquals)); diff != "" { + b.Fatalf("mismatch (-want +got):\n%s", diff) + } + + var sink float64 + // Measure decoding efficiency. + for i := 0; b.Loop(); { + // Some chunk implementations might be buffered. Reset to ensure we don't reuse + // previous decoded data. + c.Reset(c.Bytes()) + b.ReportMetric(float64(len(c.Bytes())), "B/chunk") + + it := c.Iterator(it) + for it.Next() == ValFloat { + _, v := it.At() + sink = v + i++ + } + if err := it.Err(); err != nil && !errors.Is(err, io.EOF) { + require.NoError(b, err) + } + _ = sink + } + }) +} diff --git a/tsdb/chunkenc/bstream.go b/tsdb/chunkenc/bstream.go index abf6e4dbef..4fd37a140f 100644 --- a/tsdb/chunkenc/bstream.go +++ b/tsdb/chunkenc/bstream.go @@ -101,6 +101,7 @@ func (b *bstream) writeByte(byt byte) { // writeBits writes the nbits right-most bits of u to the stream // in left-to-right order. +// TODO: Once XOR2 stabilizes, replace writeBits with the writeBitsFast implementation and remove writeBitsFast. func (b *bstream) writeBits(u uint64, nbits int) { u <<= 64 - uint(nbits) for nbits >= 8 { @@ -117,6 +118,40 @@ func (b *bstream) writeBits(u uint64, nbits int) { } } +// writeBitsFast is like writeBits but handles the partial last byte inline to +// avoid per-byte writeByte calls, and writes complete bytes directly to the +// stream slice. +func (b *bstream) writeBitsFast(u uint64, nbits int) { + u <<= 64 - uint(nbits) + + // If the last byte is partial, fill its remaining bits first. 
+ if b.count > 0 { + free := int(b.count) + last := len(b.stream) - 1 + b.stream[last] |= byte(u >> uint(64-free)) + if nbits < free { + b.count = uint8(free - nbits) + return + } + u <<= uint(free) + nbits -= free + b.count = 0 + } + + // Write complete bytes directly, avoiding per-byte function call overhead. + for nbits >= 8 { + b.stream = append(b.stream, byte(u>>56)) + u <<= 8 + nbits -= 8 + } + + // Write any remaining bits as a partial final byte. + if nbits > 0 { + b.stream = append(b.stream, byte(u>>56)) + b.count = uint8(8 - nbits) + } +} + type bstreamReader struct { stream []byte streamOffset int // The offset from which read the next byte from the stream. @@ -215,6 +250,156 @@ func (b *bstreamReader) ReadByte() (byte, error) { return byte(v), nil } +// readXOR2ControlFast is like readXOR2Control but returns false when the +// internal buffer has fewer than 4 valid bits, or when the control prefix +// indicates cases 4 or 5 (top4 == 0xf). The caller should retry with +// readXOR2Control. This function must be kept small and a leaf in order to +// help the compiler inlining it and further improve performance. +func (b *bstreamReader) readXOR2ControlFast() (uint8, bool) { + if b.valid < 4 { + return 0, false + } + top4 := uint8((b.buffer >> (b.valid - 4)) & 0xf) + if top4 < 8 { // '0xxx': dod=0, val=0 (case 0). + b.valid-- + return 0, true + } + if top4 < 12 { // '10xx': dod=0, val changed (case 1). + b.valid -= 2 + return 1, true + } + if top4 < 14 { // '110x': small dod (case 2). + b.valid -= 3 + return 2, true + } + if top4 == 14 { // '1110': medium dod (case 3). 
+ b.valid -= 4 + return 3, true + } + return 0, false +} + +// readXOR2Control reads the XOR2 variable-length joint control prefix +// and returns 0-5 mapping to the six encoding cases: +// +// 0 → '0' dod=0, val=0 (1 bit consumed) +// 1 → '10' dod=0, val≠0 (2 bits consumed) +// 2 → '110' dod≠0, 13-bit signed dod (3 bits consumed) +// 3 → '1110' dod≠0, 20-bit signed dod (4 bits consumed) +// 4 → '11110' dod≠0, 64-bit escape (5 bits consumed) +// 5 → '11111' dod=0, stale NaN (5 bits consumed) +// +// The fast path peeks at 4 bits from the internal buffer; for the '1111' +// prefix a fifth bit is read to distinguish cases 4 and 5. +func (b *bstreamReader) readXOR2Control() (uint8, error) { + if b.valid >= 4 { + top4 := uint8((b.buffer >> (b.valid - 4)) & 0xf) + if top4 < 8 { // '0xxx' → case 0. + b.valid-- + return 0, nil + } + if top4 < 12 { // '10xx' → case 1. + b.valid -= 2 + return 1, nil + } + if top4 < 14 { // '110x' → case 2. + b.valid -= 3 + return 2, nil + } + if top4 == 14 { // '1110' → case 3. + b.valid -= 4 + return 3, nil + } + // '1111': need fifth bit to distinguish cases 4 and 5. + if b.valid >= 5 { + bit4 := uint8((b.buffer >> (b.valid - 5)) & 1) + b.valid -= 5 + return 4 + bit4, nil + } + // Fifth bit spans a buffer boundary; consume the four known bits + // and read the fifth from the stream. + b.valid -= 4 + bit4, err := b.readBit() + if err != nil { + return 0, err + } + if bit4 == zero { + return 4, nil + } + return 5, nil + } + + // Slow path: bits may span buffer boundaries, read one at a time. 
+ bit0, err := b.readBit() + if err != nil { + return 0, err + } + if bit0 == zero { + return 0, nil + } + bit1, err := b.readBit() + if err != nil { + return 0, err + } + if bit1 == zero { + return 1, nil + } + bit2, err := b.readBit() + if err != nil { + return 0, err + } + if bit2 == zero { + return 2, nil + } + bit3, err := b.readBit() + if err != nil { + return 0, err + } + if bit3 == zero { + return 3, nil + } + bit4, err := b.readBit() + if err != nil { + return 0, err + } + if bit4 == zero { + return 4, nil + } + return 5, nil +} + +// readUvarint decodes a varint-encoded uint64 using direct method calls, +// avoiding the io.ByteReader interface dispatch used by binary.ReadUvarint, +// which causes the receiver to escape to the heap. +func (b *bstreamReader) readUvarint() (uint64, error) { + var x uint64 + var s uint + for range binary.MaxVarintLen64 { + byt, err := b.ReadByte() + if err != nil { + return x, err + } + if byt < 0x80 { + return x | uint64(byt)<> 1) + if ux&1 != 0 { + x = ^x + } + return x, err +} + // loadNextBuffer loads the next bytes from the stream into the internal buffer. // The input nbits is the minimum number of bits that must be read, but the implementation // can read more (if possible) to improve performances. diff --git a/tsdb/chunkenc/bstream_test.go b/tsdb/chunkenc/bstream_test.go index 3098be5945..0b6a0e9b35 100644 --- a/tsdb/chunkenc/bstream_test.go +++ b/tsdb/chunkenc/bstream_test.go @@ -14,6 +14,7 @@ package chunkenc import ( + "fmt" "testing" "github.com/stretchr/testify/require" @@ -32,6 +33,44 @@ func TestBstream_Reset(t *testing.T) { }, bs) } +// BenchmarkWriteBits benchmarks writeBits for various bit widths. 
+func BenchmarkWriteBits(b *testing.B) { + sizes := []int{1, 8, 17, 32, 52, 64} + for _, nbits := range sizes { + b.Run(fmt.Sprintf("nbits=%d", nbits), func(b *testing.B) { + b.ReportAllocs() + var bs bstream + bs.stream = make([]byte, 0, 1024) + for range b.N { + bs.stream = bs.stream[:0] + bs.count = 0 + for j := range 100 { + bs.writeBits(uint64(j), nbits) + } + } + }) + } +} + +// BenchmarkWriteBitsFast benchmarks writeBitsFast for various bit widths. +func BenchmarkWriteBitsFast(b *testing.B) { + sizes := []int{1, 8, 17, 32, 52, 64} + for _, nbits := range sizes { + b.Run(fmt.Sprintf("nbits=%d", nbits), func(b *testing.B) { + b.ReportAllocs() + var bs bstream + bs.stream = make([]byte, 0, 1024) + for range b.N { + bs.stream = bs.stream[:0] + bs.count = 0 + for j := range 100 { + bs.writeBitsFast(uint64(j), nbits) + } + } + }) + } +} + func TestBstreamReader(t *testing.T) { // Write to the bit stream. w := bstream{} diff --git a/tsdb/chunkenc/chunk.go b/tsdb/chunkenc/chunk.go index 711966ec39..3a405e8cf7 100644 --- a/tsdb/chunkenc/chunk.go +++ b/tsdb/chunkenc/chunk.go @@ -30,6 +30,7 @@ const ( EncXOR EncHistogram EncFloatHistogram + EncXOR2 ) func (e Encoding) String() string { @@ -42,13 +43,15 @@ func (e Encoding) String() string { return "histogram" case EncFloatHistogram: return "floathistogram" + case EncXOR2: + return "XOR2" } return "" } // IsValidEncoding returns true for supported encodings. func IsValidEncoding(e Encoding) bool { - return e == EncXOR || e == EncHistogram || e == EncFloatHistogram + return e == EncXOR || e == EncHistogram || e == EncFloatHistogram || e == EncXOR2 } const ( @@ -73,6 +76,8 @@ type Chunk interface { Bytes() []byte // Encoding returns the encoding type of the chunk. + // If the chunk is capable of storing ST (start timestamps), it should + // return the appropriate encoding type (e.g., EncXOR2). Encoding() Encoding // Appender returns an appender to append samples to the chunk. 
@@ -186,9 +191,12 @@ func (v ValueType) String() string { } } -func (v ValueType) ChunkEncoding() Encoding { +func (v ValueType) ChunkEncoding(useXOR2 bool) Encoding { switch v { case ValFloat: + if useXOR2 { + return EncXOR2 + } return EncXOR case ValHistogram: return EncHistogram @@ -199,17 +207,9 @@ func (v ValueType) ChunkEncoding() Encoding { } } -func (v ValueType) NewChunk() (Chunk, error) { - switch v { - case ValFloat: - return NewXORChunk(), nil - case ValHistogram: - return NewHistogramChunk(), nil - case ValFloatHistogram: - return NewFloatHistogramChunk(), nil - default: - return nil, fmt.Errorf("value type %v unsupported", v) - } +// NewChunk returns a new empty chunk for the given value type. +func (v ValueType) NewChunk(useXOR2 bool) (Chunk, error) { + return NewEmptyChunk(v.ChunkEncoding(useXOR2)) } // MockSeriesIterator returns an iterator for a mock series with custom @@ -299,6 +299,7 @@ type pool struct { xor sync.Pool histogram sync.Pool floatHistogram sync.Pool + xo2 sync.Pool } // NewPool returns a new pool. 
@@ -319,6 +320,11 @@ func NewPool() Pool { return &FloatHistogramChunk{b: bstream{}} }, }, + xo2: sync.Pool{ + New: func() any { + return &XOR2Chunk{b: bstream{}} + }, + }, } } @@ -331,6 +337,8 @@ func (p *pool) Get(e Encoding, b []byte) (Chunk, error) { c = p.histogram.Get().(*HistogramChunk) case EncFloatHistogram: c = p.floatHistogram.Get().(*FloatHistogramChunk) + case EncXOR2: + c = p.xo2.Get().(*XOR2Chunk) default: return nil, fmt.Errorf("invalid chunk encoding %q", e) } @@ -352,6 +360,9 @@ func (p *pool) Put(c Chunk) error { case EncFloatHistogram: _, ok = c.(*FloatHistogramChunk) sp = &p.floatHistogram + case EncXOR2: + _, ok = c.(*XOR2Chunk) + sp = &p.xo2 default: return fmt.Errorf("invalid chunk encoding %q", c.Encoding()) } @@ -378,6 +389,8 @@ func FromData(e Encoding, d []byte) (Chunk, error) { return &HistogramChunk{b: bstream{count: 0, stream: d}}, nil case EncFloatHistogram: return &FloatHistogramChunk{b: bstream{count: 0, stream: d}}, nil + case EncXOR2: + return &XOR2Chunk{b: bstream{count: 0, stream: d}}, nil } return nil, fmt.Errorf("invalid chunk encoding %q", e) } @@ -391,6 +404,8 @@ func NewEmptyChunk(e Encoding) (Chunk, error) { return NewHistogramChunk(), nil case EncFloatHistogram: return NewFloatHistogramChunk(), nil + case EncXOR2: + return NewXOR2Chunk(), nil } return nil, fmt.Errorf("invalid chunk encoding %q", e) } diff --git a/tsdb/chunkenc/chunk_test.go b/tsdb/chunkenc/chunk_test.go index 41bb23ddd1..4e19f15b42 100644 --- a/tsdb/chunkenc/chunk_test.go +++ b/tsdb/chunkenc/chunk_test.go @@ -16,36 +16,41 @@ package chunkenc import ( "errors" "fmt" - "io" "math/rand" "testing" "github.com/stretchr/testify/require" ) -type pair struct { - t int64 - v float64 +type triple struct { + st, t int64 + v float64 } func TestChunk(t *testing.T) { - for enc, nc := range map[Encoding]func() Chunk{ - EncXOR: func() Chunk { return NewXORChunk() }, - } { - t.Run(fmt.Sprintf("%v", enc), func(t *testing.T) { + testcases := []struct { + encoding Encoding 
+ supportsST bool + factory func() Chunk + }{ + {encoding: EncXOR, supportsST: false, factory: func() Chunk { return NewXORChunk() }}, + {encoding: EncXOR2, supportsST: true, factory: func() Chunk { return NewXOR2Chunk() }}, + } + for _, tc := range testcases { + t.Run(fmt.Sprintf("%v", tc.encoding), func(t *testing.T) { for range make([]struct{}, 1) { - c := nc() - testChunk(t, c) + c := tc.factory() + testChunk(t, c, tc.supportsST) } }) } } -func testChunk(t *testing.T, c Chunk) { +func testChunk(t *testing.T, c Chunk, supportsST bool) { app, err := c.Appender() require.NoError(t, err) - var exp []pair + var exp []triple var ( ts = int64(1234123324) v = 1243535.123 @@ -65,26 +70,30 @@ func testChunk(t *testing.T, c Chunk) { require.NoError(t, err) } - app.Append(0, ts, v) - exp = append(exp, pair{t: ts, v: v}) + app.Append(ts-100, ts, v) + expST := int64(0) + if supportsST { + expST = ts - 100 + } + exp = append(exp, triple{st: expST, t: ts, v: v}) } // 1. Expand iterator in simple case. it1 := c.Iterator(nil) - var res1 []pair + var res1 []triple for it1.Next() == ValFloat { ts, v := it1.At() - res1 = append(res1, pair{t: ts, v: v}) + res1 = append(res1, triple{st: it1.AtST(), t: ts, v: v}) } require.NoError(t, it1.Err()) require.Equal(t, exp, res1) // 2. Expand second iterator while reusing first one. it2 := c.Iterator(it1) - var res2 []pair + var res2 []triple for it2.Next() == ValFloat { ts, v := it2.At() - res2 = append(res2, pair{t: ts, v: v}) + res2 = append(res2, triple{st: it2.AtST(), t: ts, v: v}) } require.NoError(t, it2.Err()) require.Equal(t, exp, res2) @@ -93,18 +102,22 @@ func testChunk(t *testing.T, c Chunk) { mid := len(exp) / 2 it3 := c.Iterator(nil) - var res3 []pair + var res3 []triple require.Equal(t, ValFloat, it3.Seek(exp[mid].t)) // Below ones should not matter. 
require.Equal(t, ValFloat, it3.Seek(exp[mid].t)) require.Equal(t, ValFloat, it3.Seek(exp[mid].t)) ts, v = it3.At() - res3 = append(res3, pair{t: ts, v: v}) + res3 = append(res3, triple{st: it3.AtST(), t: ts, v: v}) + lastTs := ts for it3.Next() == ValFloat { ts, v := it3.At() - res3 = append(res3, pair{t: ts, v: v}) + lastTs = ts + res3 = append(res3, triple{st: it3.AtST(), t: ts, v: v}) } + // Seeking to last timestamp should work and it is a no-op. + require.Equal(t, ValFloat, it3.Seek(lastTs)) require.NoError(t, it3.Err()) require.Equal(t, exp[mid:], res3) require.Equal(t, ValNone, it3.Seek(exp[len(exp)-1].t+1)) @@ -129,6 +142,10 @@ func TestPool(t *testing.T) { name: "float histogram", encoding: EncFloatHistogram, }, + { + name: "xor opt st", + encoding: EncXOR2, + }, { name: "invalid encoding", encoding: EncNone, @@ -150,6 +167,8 @@ func TestPool(t *testing.T) { b = &c.(*HistogramChunk).b case EncFloatHistogram: b = &c.(*FloatHistogramChunk).b + case EncXOR2: + b = &c.(*XOR2Chunk).b default: b = &c.(*XORChunk).b } @@ -199,111 +218,3 @@ func (c fakeChunk) Encoding() Encoding { func (c fakeChunk) Reset([]byte) { c.t.Fatal("Reset should not be called") } - -func benchmarkIterator(b *testing.B, newChunk func() Chunk) { - const samplesPerChunk = 250 - var ( - t = int64(1234123324) - v = 1243535.123 - exp []pair - ) - for range samplesPerChunk { - // t += int64(rand.Intn(10000) + 1) - t += int64(1000) - // v = rand.Float64() - v += float64(100) - exp = append(exp, pair{t: t, v: v}) - } - - chunk := newChunk() - { - a, err := chunk.Appender() - if err != nil { - b.Fatalf("get appender: %s", err) - } - j := 0 - for _, p := range exp { - if j > 250 { - break - } - a.Append(0, p.t, p.v) - j++ - } - } - - b.ReportAllocs() - - var res float64 - var it Iterator - for i := 0; b.Loop(); { - it := chunk.Iterator(it) - - for it.Next() == ValFloat { - _, v := it.At() - res = v - i++ - } - if err := it.Err(); err != nil && !errors.Is(err, io.EOF) { - require.NoError(b, err) - } 
- _ = res - } -} - -func newXORChunk() Chunk { - return NewXORChunk() -} - -func BenchmarkXORIterator(b *testing.B) { - benchmarkIterator(b, newXORChunk) -} - -func BenchmarkXORAppender(b *testing.B) { - r := rand.New(rand.NewSource(1)) - b.Run("constant", func(b *testing.B) { - benchmarkAppender(b, func() (int64, float64) { - return 1000, 0 - }, newXORChunk) - }) - b.Run("random steps", func(b *testing.B) { - benchmarkAppender(b, func() (int64, float64) { - return int64(r.Intn(100) - 50 + 15000), // 15 seconds +- up to 100ms of jitter. - float64(r.Intn(100) - 50) // Varying from -50 to +50 in 100 discrete steps. - }, newXORChunk) - }) - b.Run("random 0-1", func(b *testing.B) { - benchmarkAppender(b, func() (int64, float64) { - return int64(r.Intn(100) - 50 + 15000), // 15 seconds +- up to 100ms of jitter. - r.Float64() // Random between 0 and 1.0. - }, newXORChunk) - }) -} - -func benchmarkAppender(b *testing.B, deltas func() (int64, float64), newChunk func() Chunk) { - var ( - t = int64(1234123324) - v = 1243535.123 - ) - const nSamples = 120 // Same as tsdb.DefaultSamplesPerChunk. - var exp []pair - for range nSamples { - dt, dv := deltas() - t += dt - v += dv - exp = append(exp, pair{t: t, v: v}) - } - - b.ReportAllocs() - - for b.Loop() { - c := newChunk() - - a, err := c.Appender() - if err != nil { - b.Fatalf("get appender: %s", err) - } - for _, p := range exp { - a.Append(0, p.t, p.v) - } - } -} diff --git a/tsdb/chunkenc/st_helper_test.go b/tsdb/chunkenc/st_helper_test.go new file mode 100644 index 0000000000..7cc4e9f119 --- /dev/null +++ b/tsdb/chunkenc/st_helper_test.go @@ -0,0 +1,156 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package chunkenc + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/prometheus/prometheus/model/histogram" +) + +// testChunkSTHandling tests handling of start times in chunks. +// It uses 0-4 samples with timestamp 1000,2000,3000,4000 and monotonically +// increasing start times that are chosen from 0-(ts-500) for each sample. +// All combinations of start times are tested for each number of samples. +func testChunkSTHandling(t *testing.T, vt ValueType, chunkFactory func() Chunk) { + sampleAppend := func(app Appender, vt ValueType, st, ts int64, v float64) { + switch vt { + case ValFloat: + app.Append(st, ts, v) + case ValHistogram: + _, recoded, _, err := app.AppendHistogram(nil, st, ts, &histogram.Histogram{Sum: v, Count: uint64(v * 10)}, false) + require.NoError(t, err) + require.False(t, recoded) + case ValFloatHistogram: + _, recoded, _, err := app.AppendFloatHistogram(nil, st, ts, &histogram.FloatHistogram{Sum: v, Count: v * 10}, false) + require.NoError(t, err) + require.False(t, recoded) + default: + t.Fatalf("unsupported value type %v", vt) + } + } + + get := func(it Iterator, vt ValueType) (int64, int64, float64) { + switch vt { + case ValFloat: + ts, v := it.At() + return it.AtST(), ts, v + case ValHistogram: + ts, h := it.AtHistogram(nil) + return it.AtST(), ts, float64(h.Sum) + case ValFloatHistogram: + ts, fh := it.AtFloatHistogram(nil) + return it.AtST(), ts, fh.Sum + default: + t.Fatalf("unsupported value type %v", vt) + return 0, 0, 0 + } + } + + runCase := func(t *testing.T, samples 
[]triple) { + chunk := chunkFactory() + app, err := chunk.Appender() + require.NoError(t, err) + var clone []byte + for i, s := range samples { + if i == len(samples)-1 { + clone = append(clone, chunk.Bytes()...) + } + sampleAppend(app, vt, s.st, s.t, s.v) + } + chunksToTest := []Chunk{chunk} + + if len(samples) > 0 { + // If there are samples, also test that appending to a chunk cloned from the original chunk works correctly. + // This tests resuming the appender from a previous chunk. + cloneChunk := chunkFactory() + cloneChunk.Reset(clone) + cloneApp, err := cloneChunk.Appender() + require.NoError(t, err) + sampleAppend(cloneApp, vt, samples[len(samples)-1].st, samples[len(samples)-1].t, samples[len(samples)-1].v) + chunksToTest = append(chunksToTest, cloneChunk) + } + + printChunkName := func(i int) string { + if i == 0 { + return "original" + } + return "cloned" + } + + for ci, chk := range chunksToTest { + require.Equal(t, len(samples), chk.NumSamples(), "%s chunk: number of samples mismatch", printChunkName(ci)) + it := chk.Iterator(nil) + for i, s := range samples { + require.Equal(t, vt, it.Next(), "%s[%d]: value type mismatch", printChunkName(ci), i) + st, ts, f := get(it, vt) + require.Equal(t, s.t, ts, "%s[%d]: timestamp mismatch", printChunkName(ci), i) + require.Equal(t, s.st, st, "%s[%d]: start time mismatch", printChunkName(ci), i) + require.InDelta(t, s.v, f, 1e-9, "%s[%d]: value mismatch", printChunkName(ci), i) + } + require.Equal(t, ValNone, it.Next()) + require.NoError(t, it.Err()) + } + } + + t.Run("manual for debugging", func(t *testing.T) { + samples := []triple{ + {st: 0, t: 1000, v: 1.5}, + {st: 0, t: 2000, v: 2.5}, + {st: 0, t: 3000, v: 3.5}, + {st: 0, t: 4000, v: 4.5}, + } + runCase(t, samples) + }) + + stTimes := []int64{0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000} + + ts := func(j int) int64 { + return int64(1000 * (j + 1)) + } + + for numberOfSamples := range 5 { + samples := make([]triple, numberOfSamples) + sampleSTidx := 
make([]int, numberOfSamples) + for { + for j := range numberOfSamples { + samples[j] = triple{ + st: stTimes[sampleSTidx[j]], + t: ts(j), + v: float64(j) + 0.5, + } + } + + t.Run(fmt.Sprintf("%v", samples), func(t *testing.T) { + runCase(t, samples) + }) + + exhausted := true + for j := numberOfSamples - 1; j >= 0; j-- { + if stTimes[sampleSTidx[j]] < ts(j) { + sampleSTidx[j]++ + exhausted = false + break + } + sampleSTidx[j] = 0 + } + if exhausted { + break + } + } + } +} diff --git a/tsdb/chunkenc/xor2.go b/tsdb/chunkenc/xor2.go new file mode 100644 index 0000000000..defe1e8102 --- /dev/null +++ b/tsdb/chunkenc/xor2.go @@ -0,0 +1,889 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// XOR2Chunk implements XOR encoding with joint timestamp+value control bits +// and byte-packed dod encoding for efficient appending. It also has an extra +// header byte after the sample count to allow for optionally encoding start +// timestamp (ST). 
+// +// Control prefix for samples >= 2: +// +// 0 → dod=0 AND value unchanged (1 bit) +// 10 → dod=0, value changed (2 bits, then value encoding) +// 110 → dod≠0, 13-bit signed [-4096, 4095] (prefix+dod packed into 2 bytes) +// 1110 → dod≠0, 20-bit signed [-524288, 524287] (prefix+dod packed into 3 bytes) +// 11110 → dod≠0, 64-bit escape (5+64 bits, then value encoding) +// 11111 → dod=0, stale NaN (5 bits, no value field) +// +// The dod bins are widened so that prefix+dod aligns to byte boundaries, +// replacing writeBit calls with writeByte for common cases. +// +// Value encoding for the dod≠0 cases (``): +// +// 0 → value unchanged +// 10 → reuse previous leading/trailing window +// 110 → new leading/trailing window +// 111 → stale NaN +// +// Value encoding for the dod=0, value-changed case (``): +// +// 0 → reuse previous leading/trailing window +// 1 → new leading/trailing window +// +// Start timestamp (ST) encoding: +// +// 1-byte ST header (at b[chunkHeaderSize]) layout: +// +// bit 7 (0x80): firstSTKnown — ST for the first sample is present in the stream +// bits 6-0: firstSTChangeOn — sample index where the first ST change begins +// +// When no ST is provided (st == 0 always), the header stays 0x00 and the +// chunk has no additional bits in it. +// +// When ST is present, the ST delta (prevT - st) is appended after each +// sample's joint timestamp+value encoding using putVarbitInt. + +package chunkenc + +import ( + "encoding/binary" + "math" + "math/bits" + + "github.com/prometheus/prometheus/model/histogram" + "github.com/prometheus/prometheus/model/value" +) + +const ( + chunkSTHeaderSize = 1 + maxFirstSTChangeOn = 0x7F +) + +func writeHeaderFirstSTKnown(b []byte) { + b[0] = 0x80 +} + +func writeHeaderFirstSTChangeOn(b []byte, firstSTChangeOn uint16) { + // First bit indicates the initial ST value. 
+ // Here we save the sample number from where the first change occurs in the + // rest of the byte (7 bits) + + if firstSTChangeOn > maxFirstSTChangeOn { + // This should never happen, would cause corruption (ST already skipped but shouldn't). + return + } + b[0] |= uint8(firstSTChangeOn) +} + +func readSTHeader(b []byte) (firstSTKnown bool, firstSTChangeOn uint8) { + if b[0] == 0x00 { + return false, 0 + } + if b[0] == 0x80 { + return true, 0 + } + mask := byte(0x80) + if b[0]&mask != 0 { + firstSTKnown = true + } + mask = 0x7F + return firstSTKnown, b[0] & mask +} + +// XOR2Chunk holds XOR2 encoded samples with optional start +// timestamp per chunk or per sample. +type XOR2Chunk struct { + b bstream +} + +// NewXOR2Chunk returns a new chunk with XOR2 encoding. +func NewXOR2Chunk() *XOR2Chunk { + b := make([]byte, chunkHeaderSize+chunkSTHeaderSize, chunkAllocationSize) + return &XOR2Chunk{b: bstream{stream: b, count: 0}} +} + +func (c *XOR2Chunk) Reset(stream []byte) { + c.b.Reset(stream) +} + +// Encoding returns the encoding type. +func (*XOR2Chunk) Encoding() Encoding { + return EncXOR2 +} + +// Bytes returns the underlying byte slice of the chunk. +func (c *XOR2Chunk) Bytes() []byte { + return c.b.bytes() +} + +// NumSamples returns the number of samples in the chunk. +func (c *XOR2Chunk) NumSamples() int { + return int(binary.BigEndian.Uint16(c.Bytes())) +} + +// Compact implements the Chunk interface. +func (c *XOR2Chunk) Compact() { + if l := len(c.b.stream); cap(c.b.stream) > l+chunkCompactCapacityThreshold { + buf := make([]byte, l) + copy(buf, c.b.stream) + c.b.stream = buf + } +} + +// Appender implements the Chunk interface. 
+func (c *XOR2Chunk) Appender() (Appender, error) { + if len(c.b.stream) == chunkHeaderSize+chunkSTHeaderSize { + return &xor2Appender{ + b: &c.b, + t: math.MinInt64, + leading: 0xff, + }, nil + } + it := c.iterator(nil) + + for it.Next() != ValNone { + } + if err := it.Err(); err != nil { + return nil, err + } + + // Set the bit position for continuing writes. The iterator's reader tracks + // how many bits remain unread in the last byte. + c.b.count = it.br.valid + + a := &xor2Appender{ + b: &c.b, + st: it.st, + t: it.t, + v: it.baselineV, + tDelta: it.tDelta, + stDiff: it.stDiff, + leading: it.leading, + trailing: it.trailing, + numTotal: binary.BigEndian.Uint16(c.b.bytes()), + firstSTKnown: it.firstSTKnown, + firstSTChangeOn: uint16(it.firstSTChangeOn), + } + return a, nil +} + +func (c *XOR2Chunk) iterator(it Iterator) *xor2Iterator { + if iter, ok := it.(*xor2Iterator); ok { + iter.Reset(c.b.bytes()) + return iter + } + iter := &xor2Iterator{} + iter.Reset(c.b.bytes()) + return iter +} + +// Iterator implements the Chunk interface. +func (c *XOR2Chunk) Iterator(it Iterator) Iterator { + return c.iterator(it) +} + +// xor2Appender appends samples with optional start timestamps using +// the XOR2 joint control bit encoding for regular timestamp and value, +// and putVarbitInt for the start timestamp delta. +type xor2Appender struct { + b *bstream + + st int64 + t int64 + v float64 + tDelta uint64 + stDiff int64 // prevT - st for the previous sample. 
+ + leading uint8 + trailing uint8 + + numTotal uint16 + firstSTChangeOn uint16 + firstSTKnown bool +} + +func (a *xor2Appender) Append(st, t int64, v float64) { + var ( + tDelta uint64 + stDiff int64 + ) + + switch a.numTotal { + case 0: + buf := make([]byte, binary.MaxVarintLen64) + for _, b := range buf[:binary.PutVarint(buf, t)] { + a.b.writeByte(b) + } + a.b.writeBitsFast(math.Float64bits(v), 64) + + if st != 0 { + for _, b := range buf[:binary.PutVarint(buf, t-st)] { + a.b.writeByte(b) + } + a.firstSTKnown = true + writeHeaderFirstSTKnown(a.b.bytes()[chunkHeaderSize:]) + } + + case 1: + tDelta = uint64(t - a.t) + + buf := make([]byte, binary.MaxVarintLen64) + for _, b := range buf[:binary.PutUvarint(buf, tDelta)] { + a.b.writeByte(b) + } + + a.writeVDelta(v) + + if st != a.st { + stDiff = a.t - st + a.firstSTChangeOn = 1 + writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], 1) + putVarbitInt(a.b, stDiff) + } + + default: + tDelta = uint64(t - a.t) + dod := int64(tDelta - a.tDelta) + + // Fast path: no ST involvement at all. + if st == 0 && a.numTotal != maxFirstSTChangeOn && a.firstSTChangeOn == 0 && !a.firstSTKnown { + a.encodeJoint(dod, v) + a.t = t + if !value.IsStaleNaN(v) { + a.v = v + } + a.tDelta = tDelta + a.numTotal++ + binary.BigEndian.PutUint16(a.b.bytes(), a.numTotal) + return + } + + // Slow path: ST may be involved. + a.encodeJoint(dod, v) + + if a.firstSTChangeOn == 0 { + if st != a.st || a.numTotal == maxFirstSTChangeOn { + // First ST change: record prevT - st. 
+ stDiff = a.t - st + a.firstSTChangeOn = a.numTotal + writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], a.numTotal) + putVarbitInt(a.b, stDiff) + } + } else { + stDiff = a.t - st + putVarbitInt(a.b, stDiff-a.stDiff) + } + } + + a.st = st + a.t = t + if !value.IsStaleNaN(v) { + a.v = v + } + a.tDelta = tDelta + a.stDiff = stDiff + a.numTotal++ + binary.BigEndian.PutUint16(a.b.bytes(), a.numTotal) +} + +// encodeJoint writes the XOR2 joint timestamp+value control sequence for +// samples >= 2. +func (a *xor2Appender) encodeJoint(dod int64, v float64) { + if dod == 0 { + if value.IsStaleNaN(v) { + a.b.writeBitsFast(0b11111, 5) + return + } + vbits := math.Float64bits(v) ^ math.Float64bits(a.v) + if vbits == 0 { + a.b.writeBit(zero) + return + } + a.b.writeBitsFast(0b10, 2) + a.writeVDeltaKnownNonZero(vbits) + return + } + + switch { + case dod >= -(1<<12) && dod <= (1<<12)-1: + // 13-bit dod: prefix `110` packed with top 5 bits → 2 bytes total. + a.b.writeByte(0b110_00000 | byte(uint64(dod)>>8)&0x1F) + a.b.writeByte(byte(uint64(dod))) + case dod >= -(1<<19) && dod <= (1<<19)-1: + // 20-bit dod: prefix `1110` packed with top 4 bits → 3 bytes total. + a.b.writeByte(0b1110_0000 | byte(uint64(dod)>>16)&0x0F) + a.b.writeByte(byte(uint64(dod) >> 8)) + a.b.writeByte(byte(uint64(dod))) + default: + // 64-bit escape (rare): `11110`. + a.b.writeBitsFast(0b11110, 5) + a.b.writeBitsFast(uint64(dod), 64) + } + a.writeVDelta(v) +} + +// writeVDelta encodes the value delta for the dod≠0 case. 
+func (a *xor2Appender) writeVDelta(v float64) { + if value.IsStaleNaN(v) { + a.b.writeBitsFast(0b111, 3) + return + } + + delta := math.Float64bits(v) ^ math.Float64bits(a.v) + + if delta == 0 { + a.b.writeBit(zero) + return + } + + newLeading := uint8(bits.LeadingZeros64(delta)) + newTrailing := uint8(bits.TrailingZeros64(delta)) + + if newLeading >= 32 { + newLeading = 31 + } + + if a.leading != 0xff && newLeading >= a.leading && newTrailing >= a.trailing { + a.b.writeBitsFast(0b10, 2) + a.b.writeBitsFast(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) + return + } + + a.leading, a.trailing = newLeading, newTrailing + + a.b.writeBitsFast(0b110, 3) + a.b.writeBitsFast(uint64(newLeading), 5) + + sigbits := 64 - newLeading - newTrailing + a.b.writeBitsFast(uint64(sigbits), 6) + a.b.writeBitsFast(delta>>newTrailing, int(sigbits)) +} + +// writeVDeltaKnownNonZero encodes a precomputed value XOR delta for the +// dod=0, value-changed case. delta must be non-zero or staleNaN. Stale NaN with dod=0 is +// handled at the joint control level (`11111`) and never reaches this function. 
+// +// Encoding: +// +// `0` → reuse previous leading/trailing window +// `1` → new leading/trailing window +func (a *xor2Appender) writeVDeltaKnownNonZero(delta uint64) { + newLeading := uint8(bits.LeadingZeros64(delta)) + newTrailing := uint8(bits.TrailingZeros64(delta)) + + if newLeading >= 32 { + newLeading = 31 + } + + if a.leading != 0xff && newLeading >= a.leading && newTrailing >= a.trailing { + a.b.writeBit(zero) + a.b.writeBitsFast(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) + return + } + + a.leading, a.trailing = newLeading, newTrailing + + a.b.writeBit(one) + a.b.writeBitsFast(uint64(newLeading), 5) + + sigbits := 64 - newLeading - newTrailing + a.b.writeBitsFast(uint64(sigbits), 6) + a.b.writeBitsFast(delta>>newTrailing, int(sigbits)) +} + +func (*xor2Appender) AppendHistogram(*HistogramAppender, int64, int64, *histogram.Histogram, bool) (Chunk, bool, Appender, error) { + panic("appended a histogram sample to a float chunk") +} + +func (*xor2Appender) AppendFloatHistogram(*FloatHistogramAppender, int64, int64, *histogram.FloatHistogram, bool) (Chunk, bool, Appender, error) { + panic("appended a float histogram sample to a float chunk") +} + +// xor2Iterator decodes XOR2 chunks. +type xor2Iterator struct { + br bstreamReader + numTotal uint16 + numRead uint16 + + firstSTKnown bool + firstSTChangeOn uint8 + + leading uint8 + trailing uint8 + + st int64 + t int64 + val float64 + + tDelta uint64 + stDiff int64 // Accumulated prevT - st. + err error + + baselineV float64 // Last non-stale value for XOR baseline. 
+} + +func (it *xor2Iterator) Seek(t int64) ValueType { + if it.err != nil { + return ValNone + } + + for t > it.t || it.numRead == 0 { + if it.Next() == ValNone { + return ValNone + } + } + return ValFloat +} + +func (it *xor2Iterator) At() (int64, float64) { + return it.t, it.val +} + +func (*xor2Iterator) AtHistogram(*histogram.Histogram) (int64, *histogram.Histogram) { + panic("cannot call xor2Iterator.AtHistogram") +} + +func (*xor2Iterator) AtFloatHistogram(*histogram.FloatHistogram) (int64, *histogram.FloatHistogram) { + panic("cannot call xor2Iterator.AtFloatHistogram") +} + +func (it *xor2Iterator) AtT() int64 { + return it.t +} + +func (it *xor2Iterator) AtST() int64 { + return it.st +} + +func (it *xor2Iterator) Err() error { + return it.err +} + +func (it *xor2Iterator) Reset(b []byte) { + it.br = newBReader(b[chunkHeaderSize+chunkSTHeaderSize:]) + it.numTotal = binary.BigEndian.Uint16(b) + it.firstSTKnown, it.firstSTChangeOn = readSTHeader(b[chunkHeaderSize:]) + + it.numRead = 0 + it.st = 0 + it.t = 0 + it.val = 0 + it.leading = 0 + it.trailing = 0 + it.tDelta = 0 + it.stDiff = 0 + it.baselineV = 0 + it.err = nil +} + +func (it *xor2Iterator) Next() ValueType { + if it.err != nil || it.numRead == it.numTotal { + return ValNone + } + + if it.numRead == 0 { + t, err := it.br.readVarint() + if err != nil { + it.err = err + return ValNone + } + v, err := it.br.readBits(64) + if err != nil { + it.err = err + return ValNone + } + it.t = t + it.val = math.Float64frombits(v) + if !value.IsStaleNaN(it.val) { + it.baselineV = it.val + } + + // Optional ST for sample 0. 
+ if it.firstSTKnown { + stDiff, err := it.br.readVarint() + if err != nil { + it.err = err + return ValNone + } + it.st = t - stDiff + } + + it.numRead++ + return ValFloat + } + + if it.numRead == 1 { + tDelta, err := it.br.readUvarint() + if err != nil { + it.err = err + return ValNone + } + prevT := it.t + it.tDelta = tDelta + it.t += int64(it.tDelta) + + if err := it.decodeValue(); err != nil { + it.err = err + return ValNone + } + + // Optional ST delta for sample 1. + if it.firstSTChangeOn == 1 { + sdod, err := readVarbitInt(&it.br) + if err != nil { + it.err = err + return ValNone + } + it.stDiff = sdod + it.st = prevT - sdod + } + + it.numRead++ + return ValFloat + } + + // Sample N >= 2: read joint XOR2 control, then optional ST data. + prevT := it.t + savedNumRead := it.numRead + + ctrl, ok := it.br.readXOR2ControlFast() + if !ok { + var err error + ctrl, err = it.br.readXOR2Control() + if err != nil { + it.err = err + return ValNone + } + } + + switch ctrl { + case 0: + // dod=0, value unchanged. + it.t += int64(it.tDelta) + it.val = it.baselineV + case 1: + // dod=0, value changed. + it.t += int64(it.tDelta) + if err := it.decodeValueKnownNonZero(); err != nil { + it.err = err + return ValNone + } + case 2: + // 13-bit dod. + if err := it.readDod(13); err != nil { + it.err = err + return ValNone + } + if err := it.decodeValue(); err != nil { + it.err = err + return ValNone + } + case 3: + // 20-bit dod. + if err := it.readDod(20); err != nil { + it.err = err + return ValNone + } + if err := it.decodeValue(); err != nil { + it.err = err + return ValNone + } + case 4: + // 64-bit escape. + if err := it.readDod(64); err != nil { + it.err = err + return ValNone + } + if err := it.decodeValue(); err != nil { + it.err = err + return ValNone + } + default: + // dod=0, stale NaN. + it.t += int64(it.tDelta) + it.val = math.Float64frombits(value.StaleNaN) + } + + // Optional ST data, appended after the joint timestamp+value encoding. 
+ // The ST delta was encoded as (prevT - st), using the PREVIOUS sample's t. + if it.firstSTChangeOn > 0 && savedNumRead >= uint16(it.firstSTChangeOn) { + sdod, err := readVarbitInt(&it.br) + if err != nil { + it.err = err + return ValNone + } + if savedNumRead == uint16(it.firstSTChangeOn) { + it.stDiff = sdod + } else { + it.stDiff += sdod + } + it.st = prevT - it.stDiff + } + + it.numRead++ + return ValFloat +} + +// readDod reads a signed dod of width w bits and updates it.tDelta and it.t. +func (it *xor2Iterator) readDod(w uint8) error { + var b uint64 + if it.br.valid >= w { + it.br.valid -= w + b = (it.br.buffer >> it.br.valid) & ((uint64(1) << w) - 1) + } else { + var err error + b, err = it.br.readBits(w) + if err != nil { + return err + } + } + + if w < 64 && b >= (1<<(w-1)) { + b -= 1 << w + } + + it.tDelta = uint64(int64(it.tDelta) + int64(b)) + it.t += int64(it.tDelta) + return nil +} + +// decodeValue reads the XOR2 value encoding for the dod≠0 case: +// +// `0` → value unchanged +// `10` → reuse previous leading/trailing window +// `110` → new leading/trailing window +// `111` → stale NaN +func (it *xor2Iterator) decodeValue() error { + // Fast path: 3 bits available — read the full control prefix in one shot. + // Encoding: `0`=unchanged, `10`=reuse window, `110`=new window, `111`=stale NaN. + if it.br.valid >= 3 { + ctrl := (it.br.buffer >> (it.br.valid - 3)) & 0x7 + if ctrl&0x4 == 0 { + // `0xx`: value unchanged, consume 1 bit. + it.br.valid-- + it.val = it.baselineV + return nil + } + if ctrl&0x6 == 0x4 { + // `10x`: reuse previous leading/trailing window, consume 2 bits. 
+ it.br.valid -= 2 + sz := uint8(64 - int(it.leading) - int(it.trailing)) + var valueBits uint64 + if it.br.valid >= sz { + it.br.valid -= sz + valueBits = (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1) + } else { + var err error + valueBits, err = it.br.readBits(sz) + if err != nil { + return err + } + } + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil + } + // `11x`: consume 3 bits. + it.br.valid -= 3 + if ctrl == 0x6 { + // `110`: new leading/trailing window. + return it.decodeNewLeadingTrailing() + } + // `111`: stale NaN. + it.val = math.Float64frombits(value.StaleNaN) + return nil + } + + // Slow path: fewer than 3 bits buffered (rare, only near buffer refills). + var bit bit + if it.br.valid > 0 { + it.br.valid-- + bit = (it.br.buffer & (uint64(1) << it.br.valid)) != 0 + } else { + var err error + bit, err = it.br.readBit() + if err != nil { + return err + } + } + + if bit == zero { + // `0` → value unchanged. + it.val = it.baselineV + return nil + } + + if it.br.valid > 0 { + it.br.valid-- + bit = (it.br.buffer & (uint64(1) << it.br.valid)) != 0 + } else { + var err error + bit, err = it.br.readBit() + if err != nil { + return err + } + } + + if bit == zero { + // `10` → reuse previous leading/trailing window. 
+ sz := uint8(64 - int(it.leading) - int(it.trailing)) + var valueBits uint64 + if it.br.valid >= sz { + it.br.valid -= sz + valueBits = (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1) + } else { + var err error + valueBits, err = it.br.readBits(sz) + if err != nil { + return err + } + } + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil + } + + if it.br.valid > 0 { + it.br.valid-- + bit = (it.br.buffer & (uint64(1) << it.br.valid)) != 0 + } else { + var err error + bit, err = it.br.readBit() + if err != nil { + return err + } + } + + if bit == zero { + // `110` → new leading/trailing window. + return it.decodeNewLeadingTrailing() + } + + // `111` → stale NaN. + it.val = math.Float64frombits(value.StaleNaN) + return nil +} + +// decodeValueKnownNonZero reads the XOR2 value encoding for the dod=0, +// value-changed case: +// +// `0` → reuse previous leading/trailing window +// `1` → new leading/trailing window +func (it *xor2Iterator) decodeValueKnownNonZero() error { + sz := uint8(64 - int(it.leading) - int(it.trailing)) + // Fast path: combine the 1-bit reuse/new-window control read with the + // sz-bit value read into a single buffer operation. + if it.br.valid >= 1+sz { + ctrlBit := (it.br.buffer >> (it.br.valid - 1)) & 1 + if ctrlBit == 0 { // `0`: reuse previous leading/trailing window. + it.br.valid -= 1 + sz + valueBits := (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1) + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil + } + // `1`: new leading/trailing window. + it.br.valid-- + return it.decodeNewLeadingTrailing() + } + + // Slow path: read control bit then value bits separately. 
+ var bit bit + if it.br.valid > 0 { + it.br.valid-- + bit = (it.br.buffer & (uint64(1) << it.br.valid)) != 0 + } else { + var err error + bit, err = it.br.readBit() + if err != nil { + return err + } + } + + if bit == zero { + // `0` → reuse previous leading/trailing window. + var valueBits uint64 + if it.br.valid >= sz { + it.br.valid -= sz + valueBits = (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1) + } else { + var err error + valueBits, err = it.br.readBits(sz) + if err != nil { + return err + } + } + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil + } + + // `1` → new leading/trailing window. + return it.decodeNewLeadingTrailing() +} + +// decodeNewLeadingTrailing reads a new leading/sigbits/value triple and +// updates it.leading, it.trailing, it.val, and it.baselineV. +func (it *xor2Iterator) decodeNewLeadingTrailing() error { + var newLeading, sigbits uint64 + // Fast path: read leading (5 bits) and sigbits (6 bits) together as 11 bits. 
+ if it.br.valid >= 11 { + val := (it.br.buffer >> (it.br.valid - 11)) & 0x7ff + it.br.valid -= 11 + newLeading = val >> 6 + sigbits = val & 0x3f + } else { + var err error + newLeading, err = it.br.readBits(5) + if err != nil { + return err + } + sigbits, err = it.br.readBits(6) + if err != nil { + return err + } + } + + it.leading = uint8(newLeading) + if sigbits == 0 { + sigbits = 64 + } + it.trailing = 64 - it.leading - uint8(sigbits) + + n := uint8(sigbits) + var valueBits uint64 + if it.br.valid >= n { + it.br.valid -= n + valueBits = (it.br.buffer >> it.br.valid) & ((uint64(1) << n) - 1) + } else { + var err error + valueBits, err = it.br.readBits(n) + if err != nil { + return err + } + } + + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil +} diff --git a/tsdb/chunkenc/xor2_test.go b/tsdb/chunkenc/xor2_test.go new file mode 100644 index 0000000000..c0c1af8a1b --- /dev/null +++ b/tsdb/chunkenc/xor2_test.go @@ -0,0 +1,527 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package chunkenc + +import ( + "fmt" + "math" + "math/bits" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/prometheus/prometheus/model/value" +) + +func newXOR2IteratorForPayload(t *testing.T, padding int, payload func(*bstream), setup func(*xor2Iterator)) *xor2Iterator { + t.Helper() + + var bs bstream + if padding > 0 { + bs.writeBitsFast(0, padding) + } + payload(&bs) + // Add tail bytes so the reader initially fills a full 64-bit buffer. + bs.writeBitsFast(0, 64) + + it := &xor2Iterator{} + if setup != nil { + setup(it) + } + it.br = newBReader(bs.bytes()) + + if padding > 0 { + _, err := it.br.readBits(uint8(padding)) + require.NoError(t, err) + } + + return it +} + +func writeXOR2NewWindowPayload(bs *bstream, delta uint64) (leading, trailing uint8) { + leading, trailing, sigbits := xor2DeltaWindow(delta) + encodedSigbits := sigbits + if sigbits == 64 { + encodedSigbits = 0 + } + + bs.writeBitsFast(uint64(leading), 5) + bs.writeBitsFast(uint64(encodedSigbits), 6) + bs.writeBitsFast(delta>>trailing, int(sigbits)) + + return leading, trailing +} + +func xor2DeltaWindow(delta uint64) (leading, trailing, sigbits uint8) { + leading = uint8(bits.LeadingZeros64(delta)) + trailing = uint8(bits.TrailingZeros64(delta)) + if leading >= 32 { + leading = 31 + } + + return leading, trailing, 64 - leading - trailing +} + +func BenchmarkXor2Write(b *testing.B) { + samples := make([]struct { + t int64 + v float64 + }, 120) + for i := range samples { + samples[i].t = int64(i) * 1000 + samples[i].v = float64(i) + float64(i)/10 + float64(i)/100 + float64(i)/1000 + } + + b.ReportAllocs() + + for b.Loop() { + c := NewXOR2Chunk() + app, _ := c.Appender() + for _, s := range samples { + app.Append(0, s.t, s.v) + } + } +} + +func BenchmarkXor2Read(b *testing.B) { + c := NewXOR2Chunk() + app, err := c.Appender() + require.NoError(b, err) + for i := int64(0); i < 120*1000; i += 1000 { + app.Append(0, i, float64(i)+float64(i)/10+float64(i)/100+float64(i)/1000) + 
} + + b.ReportAllocs() + + var it Iterator + for b.Loop() { + var ts int64 + var v float64 + it = c.Iterator(it) + for it.Next() != ValNone { + ts, v = it.At() + } + _, _ = ts, v + } +} + +func TestXOR2Basic(t *testing.T) { + c := NewXOR2Chunk() + app, err := c.Appender() + require.NoError(t, err) + + samples := []struct { + t int64 + v float64 + }{ + {1000, 1.0}, + {2000, 2.0}, + {3000, 3.0}, + {4000, 4.0}, + {5000, 5.0}, + } + + for _, s := range samples { + app.Append(0, s.t, s.v) + } + + it := c.Iterator(nil) + for _, expected := range samples { + require.Equal(t, ValFloat, it.Next()) + ts, v := it.At() + require.Equal(t, expected.t, ts) + require.Equal(t, expected.v, v) + } + require.Equal(t, ValNone, it.Next()) +} + +func TestXOR2WithStaleness(t *testing.T) { + c := NewXOR2Chunk() + app, err := c.Appender() + require.NoError(t, err) + + samples := []struct { + t int64 + v float64 + stale bool + }{ + {1000, 1.0, false}, + {2000, 2.0, false}, + {3000, math.Float64frombits(value.StaleNaN), true}, + {4000, 4.0, false}, + {5000, math.Float64frombits(value.StaleNaN), true}, + {6000, 6.0, false}, + } + + for _, s := range samples { + app.Append(0, s.t, s.v) + } + + it := c.Iterator(nil) + for _, expected := range samples { + require.Equal(t, ValFloat, it.Next()) + ts, v := it.At() + require.Equal(t, expected.t, ts) + if expected.stale { + require.True(t, value.IsStaleNaN(v), "Expected stale NaN at ts=%d", ts) + } else { + require.Equal(t, expected.v, v) + } + } + require.Equal(t, ValNone, it.Next()) +} + +func TestXOR2StaleWithDodNonZero(t *testing.T) { + c := NewXOR2Chunk() + app, err := c.Appender() + require.NoError(t, err) + + // Stale NaN samples where the timestamp dod is non-zero, exercising the + // `111` value encoding path inside writeVDelta. + samples := []struct { + t int64 + v float64 + stale bool + }{ + {1000, 1.0, false}, + {2000, 2.0, false}, + // dod = (1050 - 1000) - (2000 - 1000) = 50 - 1000 = -950: stale with dod≠0. 
+ {3050, math.Float64frombits(value.StaleNaN), true}, + {4050, 4.0, false}, + {5050, 5.0, false}, + } + + for _, s := range samples { + app.Append(0, s.t, s.v) + } + + it := c.Iterator(nil) + for _, expected := range samples { + require.Equal(t, ValFloat, it.Next()) + ts, v := it.At() + require.Equal(t, expected.t, ts) + if expected.stale { + require.True(t, value.IsStaleNaN(v), "Expected stale NaN at ts=%d", ts) + } else { + require.Equal(t, expected.v, v) + } + } + require.Equal(t, ValNone, it.Next()) +} + +func TestXOR2IrregularTimestamps(t *testing.T) { + c := NewXOR2Chunk() + app, err := c.Appender() + require.NoError(t, err) + + // Timestamps with dod values spanning multiple encoding ranges. + timestamps := []int64{ + 1000, 2000, 3000, + // dod in 13-bit range. + 3050, 4050, 5050, + // dod in 20-bit range (large jitter). + 5050 + 100000, 5050 + 200000, 5050 + 300000, + // Back to regular. + 5050 + 301000, + } + for _, ts := range timestamps { + app.Append(0, ts, 1.0) + } + + it := c.Iterator(nil) + for _, expected := range timestamps { + require.Equal(t, ValFloat, it.Next()) + ts, _ := it.At() + require.Equal(t, expected, ts) + } + require.Equal(t, ValNone, it.Next()) +} + +func TestXOR2LargeDod(t *testing.T) { + c := NewXOR2Chunk() + app, err := c.Appender() + require.NoError(t, err) + + // Force the 64-bit escape path with a very large dod. 
+ timestamps := []int64{0, 1000, 2000, 2000 + (1 << 20)} + for _, ts := range timestamps { + app.Append(0, ts, 1.0) + } + + it := c.Iterator(nil) + for _, expected := range timestamps { + require.Equal(t, ValFloat, it.Next()) + ts, _ := it.At() + require.Equal(t, expected, ts) + } + require.Equal(t, ValNone, it.Next()) +} + +func TestXOR2ChunkST(t *testing.T) { + testChunkSTHandling(t, ValFloat, func() Chunk { + return NewXOR2Chunk() + }) +} + +func TestXOR2Chunk_MoreThan127Samples(t *testing.T) { + const afterMax = maxFirstSTChangeOn + 3 + t.Run("zero ST", func(t *testing.T) { + chunk := NewXOR2Chunk() + app, err := chunk.Appender() + require.NoError(t, err) + for i := range afterMax { + app.Append(0, int64(i*10+1), float64(i)*1.5) + } + + it := chunk.Iterator(nil) + for i := range afterMax { + require.Equal(t, ValFloat, it.Next()) + st := it.AtST() + ts, v := it.At() + require.Equal(t, int64(0), st) + require.Equal(t, int64(i*10+1), ts) + require.Equal(t, float64(i)*1.5, v) + } + + require.Equal(t, ValNone, it.Next()) + require.NoError(t, it.Err()) + }) + + t.Run("non-zero ST after 127", func(t *testing.T) { + chunk := NewXOR2Chunk() + app, err := chunk.Appender() + require.NoError(t, err) + for i := range afterMax { + st := int64(0) + if i == afterMax-1 { + st = int64((afterMax - 1) * 10) + } + app.Append(st, int64(i*10+1), float64(i)*1.5) + } + + it := chunk.Iterator(nil) + for i := range afterMax { + require.Equal(t, ValFloat, it.Next()) + st := it.AtST() + ts, v := it.At() + if i == afterMax-1 { + require.Equal(t, int64((afterMax-1)*10), st) + } else { + require.Equal(t, int64(0), st) + } + require.Equal(t, int64(i*10+1), ts) + require.Equal(t, float64(i)*1.5, v) + } + + require.Equal(t, ValNone, it.Next()) + require.NoError(t, it.Err()) + }) +} + +// TestXOR2DecodeFunctionsAcrossPadding exercises decodeValue, +// decodeValueKnownNonZero, and decodeNewLeadingTrailing across all logical +// cases × all 64 bit-buffer alignments (padding 0..63). 
Padding controls the +// number of bits that precede the payload in the stream, which determines +// how many bits remain in the 64-bit read buffer when the decode function is +// called. This Cartesian product ensures both the fast path (enough bits +// buffered for a single-shot read) and the slow path (bits span a buffer +// refill) are exercised for every case. +func TestXOR2DecodeFunctionsAcrossPadding(t *testing.T) { + const baseline = 1234.5 + + type testCase struct { + name string + payload func(*bstream) + setup func(*xor2Iterator) + assert func(*testing.T, *xor2Iterator) + } + + runCases := func(t *testing.T, cases []testCase, fn func(*xor2Iterator) error) { + t.Helper() + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + for padding := range 64 { + t.Run(fmt.Sprintf("padding=%d", padding), func(t *testing.T) { + it := newXOR2IteratorForPayload(t, padding, tc.payload, tc.setup) + require.NoError(t, fn(it)) + tc.assert(t, it) + }) + } + }) + } + } + + // decodeValue: `0`=unchanged, `10`=reuse window, `110`=new window, `111`=stale NaN. + t.Run("decodeValue", func(t *testing.T) { + reuseD := uint64(0x000ABCDE000000) + rL, rT, rS := xor2DeltaWindow(reuseD) + + // Two new-window variants: full-width sigbits (encoded as 0) and small + // sigbits, to cover both value-bits read paths inside decodeNewLeadingTrailing. 
+ newDFull := uint64(0xFEDCBA9876543211) + nLFull, nTFull, _ := xor2DeltaWindow(newDFull) + newDSmall := uint64(0x000ABCDE000000) + nLSmall, nTSmall, _ := xor2DeltaWindow(newDSmall) + + runCases(t, []testCase{ + { + name: "unchanged", + payload: func(bs *bstream) { bs.writeBit(zero) }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + require.Equal(t, baseline, it.val) + require.Equal(t, baseline, it.baselineV) + }, + }, + { + name: "reuse_window", + payload: func(bs *bstream) { + bs.writeBitsFast(0b10, 2) + bs.writeBitsFast(reuseD>>rT, int(rS)) + }, + setup: func(it *xor2Iterator) { + it.baselineV = baseline + it.leading, it.trailing = rL, rT + }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ reuseD) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + require.Equal(t, rL, it.leading) + require.Equal(t, rT, it.trailing) + }, + }, + { + name: "new_window_full_sigbits", + payload: func(bs *bstream) { + bs.writeBitsFast(0b110, 3) + writeXOR2NewWindowPayload(bs, newDFull) + }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ newDFull) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + require.Equal(t, nLFull, it.leading) + require.Equal(t, nTFull, it.trailing) + }, + }, + { + name: "new_window_small_sigbits", + payload: func(bs *bstream) { + bs.writeBitsFast(0b110, 3) + writeXOR2NewWindowPayload(bs, newDSmall) + }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ newDSmall) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + require.Equal(t, nLSmall, it.leading) + require.Equal(t, nTSmall, it.trailing) + }, 
+ }, + { + name: "stale_nan", + payload: func(bs *bstream) { bs.writeBitsFast(0b111, 3) }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + require.True(t, value.IsStaleNaN(it.val)) + require.Equal(t, baseline, it.baselineV) + }, + }, + }, (*xor2Iterator).decodeValue) + }) + + // decodeValueKnownNonZero: `0`=reuse window, `1`=new window. + // The new_window case uses real leading/trailing (not 0xff) so that sz is + // small enough for the fast path (valid >= 1+sz) to be reached with ctrlBit=1. + t.Run("decodeValueKnownNonZero", func(t *testing.T) { + delta := uint64(0x000ABCDE000000) + dL, dT, dS := xor2DeltaWindow(delta) + + runCases(t, []testCase{ + { + name: "reuse_window", + payload: func(bs *bstream) { + bs.writeBit(zero) + bs.writeBitsFast(delta>>dT, int(dS)) + }, + setup: func(it *xor2Iterator) { + it.baselineV = baseline + it.leading, it.trailing = dL, dT + }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ delta) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + }, + }, + { + name: "new_window", + payload: func(bs *bstream) { + bs.writeBit(one) + writeXOR2NewWindowPayload(bs, delta) + }, + setup: func(it *xor2Iterator) { + it.baselineV = baseline + it.leading, it.trailing = dL, dT + }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ delta) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + require.Equal(t, dL, it.leading) + require.Equal(t, dT, it.trailing) + }, + }, + }, (*xor2Iterator).decodeValueKnownNonZero) + }) + + // decodeNewLeadingTrailing: exercises the 11-bit header fast path, the + // value-bits fast path (small sigbits), and full-width sigbits (encoded as 0). 
+ t.Run("decodeNewLeadingTrailing", func(t *testing.T) { + smallD := uint64(0x000ABCDE000000) + sL, sT, _ := xor2DeltaWindow(smallD) + fullD := uint64(0xFEDCBA9876543211) + fL, fT, _ := xor2DeltaWindow(fullD) + + runCases(t, []testCase{ + { + name: "small_sigbits", + payload: func(bs *bstream) { writeXOR2NewWindowPayload(bs, smallD) }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + require.Equal(t, sL, it.leading) + require.Equal(t, sT, it.trailing) + expected := math.Float64frombits(math.Float64bits(baseline) ^ smallD) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + }, + }, + { + name: "full_width_sigbits", + payload: func(bs *bstream) { writeXOR2NewWindowPayload(bs, fullD) }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + require.Equal(t, fL, it.leading) + require.Equal(t, fT, it.trailing) + expected := math.Float64frombits(math.Float64bits(baseline) ^ fullD) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + }, + }, + }, (*xor2Iterator).decodeNewLeadingTrailing) + }) +} diff --git a/tsdb/chunks/chunks.go b/tsdb/chunks/chunks.go index 9b4e011562..6084f7148e 100644 --- a/tsdb/chunks/chunks.go +++ b/tsdb/chunks/chunks.go @@ -135,7 +135,9 @@ type Meta struct { } // ChunkFromSamples requires all samples to have the same type. -// TODO(krajorama): test with ST when chunk formats support it. +// It is not efficient and meant for testing purposes only. +// It scans the samples to determine whether any sample has ST set and +// creates a chunk accordingly. 
func ChunkFromSamples(s []Sample) (Meta, error) { return ChunkFromSamplesGeneric(SampleSlice(s)) } @@ -154,7 +156,17 @@ func ChunkFromSamplesGeneric(s Samples) (Meta, error) { } sampleType := s.Get(0).Type() - c, err := chunkenc.NewEmptyChunk(sampleType.ChunkEncoding()) + + hasST := false + for i := range s.Len() { + if s.Get(i).ST() != 0 { + hasST = true + break + } + } + + // Request storing ST in the chunk if available. + c, err := sampleType.NewChunk(hasST) if err != nil { return Meta{}, err } diff --git a/tsdb/chunks/chunks_test.go b/tsdb/chunks/chunks_test.go index db45fdf712..01104103a8 100644 --- a/tsdb/chunks/chunks_test.go +++ b/tsdb/chunks/chunks_test.go @@ -19,6 +19,7 @@ import ( "github.com/stretchr/testify/require" + "github.com/prometheus/prometheus/tsdb/chunkenc" "github.com/prometheus/prometheus/tsdb/tsdbutil" ) @@ -60,3 +61,35 @@ func TestWriterWithDefaultSegmentSize(t *testing.T) { require.NoError(t, err) require.Len(t, d, 1, "expected only one segment to be created to hold both chunks") } + +func TestChunkFromSamplesWithST(t *testing.T) { + // Create samples with explicit ST (source timestamp) values. + samples := []Sample{ + sample{t: 10, f: 11, st: 5}, + sample{t: 20, f: 12, st: 15}, + sample{t: 30, f: 13, st: 25}, + } + + chk, err := ChunkFromSamples(samples) + require.NoError(t, err) + require.NotNil(t, chk.Chunk) + + // Verify MinTime and MaxTime. + require.Equal(t, int64(10), chk.MinTime) + require.Equal(t, int64(30), chk.MaxTime) + + // Iterate over the chunk and verify ST values are preserved. 
+ it := chk.Chunk.Iterator(nil) + idx := 0 + for vt := it.Next(); vt != chunkenc.ValNone; vt = it.Next() { + require.Equal(t, chunkenc.ValFloat, vt) + ts, v := it.At() + st := it.AtST() + require.Equal(t, samples[idx].ST(), st, "ST mismatch at index %d", idx) + require.Equal(t, samples[idx].T(), ts, "T mismatch at index %d", idx) + require.Equal(t, samples[idx].F(), v, "F mismatch at index %d", idx) + idx++ + } + require.NoError(t, it.Err()) + require.Equal(t, len(samples), idx, "expected all samples to be iterated") +} diff --git a/tsdb/compression/compression.go b/tsdb/compression/compression.go new file mode 100644 index 0000000000..147a526f7e --- /dev/null +++ b/tsdb/compression/compression.go @@ -0,0 +1,130 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package compression + +import ( + "errors" + "fmt" + + "github.com/golang/snappy" + "github.com/klauspost/compress/zstd" +) + +// Type represents the compression type used for encoding and decoding data. +type Type string + +const ( + // None represents no compression case. + // None it's a default when Type is empty. + None Type = "none" + // Snappy represents snappy block format. + Snappy Type = "snappy" + // Zstd represents zstd compression. + Zstd Type = "zstd" +) + +// Encoder provides compression encoding functionality for supported compression +// types. It is agnostic to the content being compressed, operating on byte +// slices of serialized data streams. 
The encoder maintains internal state for +// Zstd compression and can handle multiple compression types including None, +// Snappy, and Zstd. +type Encoder struct { + w *zstd.Encoder +} + +// NewEncoder creates a new Encoder. Returns an error if the zstd encoder cannot +// be initialized. +func NewEncoder() (*Encoder, error) { + e := &Encoder{} + w, err := zstd.NewWriter(nil) + if err != nil { + return nil, err + } + e.w = w + return e, nil +} + +// Encode returns the encoded form of src for the given compression type. It also +// returns the indicator if the compression was performed. Encode may skip +// compressing for None type, but also when src is too large e.g. for Snappy block format. +// +// The buf is used as a buffer for returned encoding, and it must not overlap with +// src. It is valid to pass a nil buf. +func (e *Encoder) Encode(t Type, src, buf []byte) (_ []byte, compressed bool, err error) { + switch { + case len(src) == 0, t == "", t == None: + return src, false, nil + case t == Snappy: + // If MaxEncodedLen is less than 0 the record is too large to be compressed. + if snappy.MaxEncodedLen(len(src)) < 0 { + return src, false, nil + } + + // The snappy library uses `len` to calculate if we need a new buffer. + // In order to allocate as few buffers as possible make the length + // equal to the capacity. + buf = buf[:cap(buf)] + return snappy.Encode(buf, src), true, nil + case t == Zstd: + if e == nil { + return nil, false, errors.New("zstd requested but encoder was not initialized with NewEncoder()") + } + return e.w.EncodeAll(src, buf[:0]), true, nil + default: + return nil, false, fmt.Errorf("unsupported compression type: %s", t) + } +} + +// Decoder provides decompression functionality for supported compression types. +// It is agnostic to the content being decompressed, operating on byte slices of +// serialized data streams. 
The decoder maintains internal state for Zstd +// decompression and can handle multiple compression types including None, +// Snappy, and Zstd. +type Decoder struct { + r *zstd.Decoder +} + +// NewDecoder creates a new Decoder. +func NewDecoder() *Decoder { + d := &Decoder{} + + // Calling zstd.NewReader with a nil io.Reader and no options cannot return an error. + r, _ := zstd.NewReader(nil) + d.r = r + return d +} + +// Decode returns the decoded form of src or error, given expected compression type. +// +// The buf is used as a buffer for the returned decoded entry, and it must not +// overlap with src. It is valid to pass a nil buf. +func (d *Decoder) Decode(t Type, src, buf []byte) (_ []byte, err error) { + switch { + case len(src) == 0, t == "", t == None: + return src, nil + case t == Snappy: + // The snappy library uses `len` to calculate if we need a new buffer. + // In order to allocate as few buffers as possible make the length + // equal to the capacity. + buf = buf[:cap(buf)] + return snappy.Decode(buf, src) + case t == Zstd: + if d == nil { + return nil, errors.New("zstd requested but Decoder was not initialized with NewDecoder()") + } + return d.r.DecodeAll(src, buf[:0]) + default: + return nil, fmt.Errorf("unsupported compression type: %s", t) + } +} diff --git a/tsdb/db.go b/tsdb/db.go index a5abc8fed9..2ca1bccf0d 100644 --- a/tsdb/db.go +++ b/tsdb/db.go @@ -240,6 +240,11 @@ type Options struct { // is implemented. EnableSTAsZeroSample bool + // EnableXOR2Encoding enables the XOR2 chunk encoding for float samples. + // XOR2 provides better compression than XOR, especially for stale markers. + // Automatically set to true when EnableSTStorage is true. + EnableXOR2Encoding bool + // EnableSTStorage determines whether TSDB should write a Start Timestamp (ST) // per sample to WAL. // TODO(bwplotka): Implement this option as per PROM-60, currently it's noop. 
@@ -868,6 +873,8 @@ func Open(dir string, l *slog.Logger, r prometheus.Registerer, opts *Options, st opts.FeatureRegistry.Set(features.TSDB, "isolation", !opts.IsolationDisabled) opts.FeatureRegistry.Set(features.TSDB, "use_uncached_io", opts.UseUncachedIO) opts.FeatureRegistry.Enable(features.TSDB, "native_histograms") + opts.FeatureRegistry.Set(features.TSDB, "st_storage", opts.EnableSTStorage) + opts.FeatureRegistry.Set(features.TSDB, "xor2_encoding", opts.EnableXOR2Encoding) } return open(dir, l, r, opts, rngs, stats) @@ -1074,6 +1081,8 @@ func open(dir string, l *slog.Logger, r prometheus.Registerer, opts *Options, rn headOpts.OutOfOrderCapMax.Store(opts.OutOfOrderCapMax) headOpts.EnableSharding = opts.EnableSharding headOpts.EnableSTAsZeroSample = opts.EnableSTAsZeroSample + headOpts.EnableSTStorage.Store(opts.EnableSTStorage) + headOpts.EnableXOR2Encoding.Store(opts.EnableXOR2Encoding) headOpts.EnableMetadataWALRecords = opts.EnableMetadataWALRecords if opts.WALReplayConcurrency > 0 { headOpts.WALReplayConcurrency = opts.WALReplayConcurrency @@ -1277,18 +1286,12 @@ func (db *DB) ApplyConfig(conf *config.Config) error { // Update retention configuration if provided. 
if conf.StorageConfig.TSDBConfig.Retention != nil { db.retentionMtx.Lock() - if conf.StorageConfig.TSDBConfig.Retention.Time > 0 { - db.opts.RetentionDuration = int64(conf.StorageConfig.TSDBConfig.Retention.Time) - db.metrics.retentionDuration.Set((time.Duration(db.opts.RetentionDuration) * time.Millisecond).Seconds()) - } - if conf.StorageConfig.TSDBConfig.Retention.Size > 0 { - db.opts.MaxBytes = int64(conf.StorageConfig.TSDBConfig.Retention.Size) - db.metrics.maxBytes.Set(float64(db.opts.MaxBytes)) - } - if conf.StorageConfig.TSDBConfig.Retention.Percentage > 0 { - db.opts.MaxPercentage = conf.StorageConfig.TSDBConfig.Retention.Percentage - db.metrics.maxPercentage.Set(float64(db.opts.MaxPercentage)) - } + db.opts.RetentionDuration = int64(time.Duration(conf.StorageConfig.TSDBConfig.Retention.Time) / time.Millisecond) + db.metrics.retentionDuration.Set((time.Duration(db.opts.RetentionDuration) * time.Millisecond).Seconds()) + db.opts.MaxBytes = int64(conf.StorageConfig.TSDBConfig.Retention.Size) + db.metrics.maxBytes.Set(float64(db.opts.MaxBytes)) + db.opts.MaxPercentage = conf.StorageConfig.TSDBConfig.Retention.Percentage + db.metrics.maxPercentage.Set(float64(db.opts.MaxPercentage)) db.retentionMtx.Unlock() } } else { diff --git a/tsdb/db_append_v2_test.go b/tsdb/db_append_v2_test.go index 8083829537..0bb1763f3d 100644 --- a/tsdb/db_append_v2_test.go +++ b/tsdb/db_append_v2_test.go @@ -193,7 +193,7 @@ func TestDataNotAvailableAfterRollback_AppendV2(t *testing.T) { require.NoError(t, err) walSeriesCount += len(series) - case record.Samples: + case record.Samples, record.SamplesV2: var samples []record.RefSample samples, err = dec.Samples(rec, samples) require.NoError(t, err) @@ -968,16 +968,18 @@ func TestWALReplayRaceOnSamplesLoggedBeforeSeries_AppendV2(t *testing.T) { // We test both with few and many samples appended after series creation. 
If samples are < 120 then there's no // mmap-ed chunk, otherwise there's at least 1 mmap-ed chunk when replaying the WAL. - for _, numSamplesAfterSeriesCreation := range []int{1, 1000} { - for run := 1; run <= numRuns; run++ { - t.Run(fmt.Sprintf("samples after series creation = %d, run = %d", numSamplesAfterSeriesCreation, run), func(t *testing.T) { - testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation) - }) + for _, enableSTStorage := range []bool{false, true} { + for _, numSamplesAfterSeriesCreation := range []int{1, 1000} { + for run := 1; run <= numRuns; run++ { + t.Run(fmt.Sprintf("samples after series creation = %d, run = %d, stStorage = %v", numSamplesAfterSeriesCreation, run, enableSTStorage), func(t *testing.T) { + testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation, enableSTStorage) + }) + } } } } -func testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t *testing.T, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation int) { +func testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t *testing.T, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation int, enableSTStorage bool) { const numSeries = 1000 db := newTestDB(t) @@ -985,7 +987,7 @@ func testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t *testing.T, numSampl for seriesRef := 1; seriesRef <= numSeries; seriesRef++ { // Log samples before the series is logged to the WAL. 
- var enc record.Encoder + enc := record.Encoder{EnableSTStorage: enableSTStorage} var samples []record.RefSample for ts := range numSamplesBeforeSeriesCreation { @@ -1176,139 +1178,143 @@ func TestTombstoneCleanResultEmptyBlock_AppendV2(t *testing.T) { func TestSizeRetention_AppendV2(t *testing.T) { t.Parallel() - opts := DefaultOptions() - opts.OutOfOrderTimeWindow = 100 - db := newTestDB(t, withOpts(opts), withRngs(100)) + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + opts := DefaultOptions() + opts.OutOfOrderTimeWindow = 100 + db := newTestDB(t, withOpts(opts), withRngs(100)) - blocks := []*BlockMeta{ - {MinTime: 100, MaxTime: 200}, // Oldest block - {MinTime: 200, MaxTime: 300}, - {MinTime: 300, MaxTime: 400}, - {MinTime: 400, MaxTime: 500}, - {MinTime: 500, MaxTime: 600}, // Newest Block - } + blocks := []*BlockMeta{ + {MinTime: 100, MaxTime: 200}, // Oldest block + {MinTime: 200, MaxTime: 300}, + {MinTime: 300, MaxTime: 400}, + {MinTime: 400, MaxTime: 500}, + {MinTime: 500, MaxTime: 600}, // Newest Block + } - for _, m := range blocks { - createBlock(t, db.Dir(), genSeries(100, 10, m.MinTime, m.MaxTime)) - } + for _, m := range blocks { + createBlock(t, db.Dir(), genSeries(100, 10, m.MinTime, m.MaxTime)) + } - headBlocks := []*BlockMeta{ - {MinTime: 700, MaxTime: 800}, - } + headBlocks := []*BlockMeta{ + {MinTime: 700, MaxTime: 800}, + } - // Add some data to the WAL. - headApp := db.Head().AppenderV2(context.Background()) - var aSeries labels.Labels - var it chunkenc.Iterator - for _, m := range headBlocks { - series := genSeries(100, 10, m.MinTime, m.MaxTime+1) - for _, s := range series { - aSeries = s.Labels() - it = s.Iterator(it) - for it.Next() == chunkenc.ValFloat { - tim, v := it.At() - _, err := headApp.Append(0, s.Labels(), 0, tim, v, nil, nil, storage.AOptions{}) + // Add some data to the WAL. 
+ headApp := db.Head().AppenderV2(context.Background()) + var aSeries labels.Labels + var it chunkenc.Iterator + for _, m := range headBlocks { + series := genSeries(100, 10, m.MinTime, m.MaxTime+1) + for _, s := range series { + aSeries = s.Labels() + it = s.Iterator(it) + for it.Next() == chunkenc.ValFloat { + tim, v := it.At() + _, err := headApp.Append(0, s.Labels(), 0, tim, v, nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, it.Err()) + } + } + require.NoError(t, headApp.Commit()) + db.Head().mmapHeadChunks() + + require.Eventually(t, func() bool { + return db.Head().chunkDiskMapper.IsQueueEmpty() + }, 2*time.Second, 100*time.Millisecond) + + // Test that registered size matches the actual disk size. + require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. + require.Len(t, db.Blocks(), len(blocks)) // Ensure all blocks are registered. + blockSize := int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. + walSize, err := db.Head().wal.Size() + require.NoError(t, err) + cdmSize, err := db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + // Expected size should take into account block size + WAL size + Head + // chunks size + expSize := blockSize + walSize + cdmSize + actSize, err := fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Create a WAL checkpoint, and compare sizes. + first, last, err := wlog.Segments(db.Head().wal.Dir()) + require.NoError(t, err) + _, err = wlog.Checkpoint(promslog.NewNopLogger(), db.Head().wal, first, last-1, func(chunks.HeadSeriesRef) bool { return false }, 0, enableSTStorage) + require.NoError(t, err) + blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. 
+ walSize, err = db.Head().wal.Size() + require.NoError(t, err) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + expSize = blockSize + walSize + cdmSize + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Truncate Chunk Disk Mapper and compare sizes. + require.NoError(t, db.Head().chunkDiskMapper.Truncate(900)) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + expSize = blockSize + walSize + cdmSize + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Add some out of order samples to check the size of WBL. + headApp = db.Head().AppenderV2(context.Background()) + for ts := int64(750); ts < 800; ts++ { + _, err := headApp.Append(0, aSeries, 0, ts, float64(ts), nil, nil, storage.AOptions{}) require.NoError(t, err) } - require.NoError(t, it.Err()) - } + require.NoError(t, headApp.Commit()) + + walSize, err = db.Head().wal.Size() + require.NoError(t, err) + wblSize, err := db.Head().wbl.Size() + require.NoError(t, err) + require.NotZero(t, wblSize) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + expSize = blockSize + walSize + wblSize + cdmSize + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Decrease the max bytes limit so that a delete is triggered. + // Check total size, total count and check that the oldest block was deleted. + firstBlockSize := db.Blocks()[0].Size() + sizeLimit := actSize - firstBlockSize + db.opts.MaxBytes = sizeLimit // Set the new db size limit one block smaller that the actual size. + require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. 
+ + expBlocks := blocks[1:] + actBlocks := db.Blocks() + blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) + walSize, err = db.Head().wal.Size() + require.NoError(t, err) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + // Expected size should take into account block size + WAL size + WBL size + expSize = blockSize + walSize + wblSize + cdmSize + actRetentionCount := int(prom_testutil.ToFloat64(db.metrics.sizeRetentionCount)) + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + + require.Equal(t, 1, actRetentionCount, "metric retention count mismatch") + require.Equal(t, expSize, actSize, "metric db size doesn't match actual disk size") + require.LessOrEqual(t, expSize, sizeLimit, "actual size (%v) is expected to be less than or equal to limit (%v)", expSize, sizeLimit) + require.Len(t, actBlocks, len(blocks)-1, "new block count should be decreased from:%v to:%v", len(blocks), len(blocks)-1) + require.Equal(t, expBlocks[0].MaxTime, actBlocks[0].meta.MaxTime, "maxT mismatch of the first block") + require.Equal(t, expBlocks[len(expBlocks)-1].MaxTime, actBlocks[len(actBlocks)-1].meta.MaxTime, "maxT mismatch of the last block") + }) } - require.NoError(t, headApp.Commit()) - db.Head().mmapHeadChunks() - - require.Eventually(t, func() bool { - return db.Head().chunkDiskMapper.IsQueueEmpty() - }, 2*time.Second, 100*time.Millisecond) - - // Test that registered size matches the actual disk size. - require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. - require.Len(t, db.Blocks(), len(blocks)) // Ensure all blocks are registered. - blockSize := int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. 
- walSize, err := db.Head().wal.Size() - require.NoError(t, err) - cdmSize, err := db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - // Expected size should take into account block size + WAL size + Head - // chunks size - expSize := blockSize + walSize + cdmSize - actSize, err := fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Create a WAL checkpoint, and compare sizes. - first, last, err := wlog.Segments(db.Head().wal.Dir()) - require.NoError(t, err) - _, err = wlog.Checkpoint(promslog.NewNopLogger(), db.Head().wal, first, last-1, func(chunks.HeadSeriesRef) bool { return false }, 0) - require.NoError(t, err) - blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. - walSize, err = db.Head().wal.Size() - require.NoError(t, err) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - expSize = blockSize + walSize + cdmSize - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Truncate Chunk Disk Mapper and compare sizes. - require.NoError(t, db.Head().chunkDiskMapper.Truncate(900)) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - expSize = blockSize + walSize + cdmSize - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Add some out of order samples to check the size of WBL. 
- headApp = db.Head().AppenderV2(context.Background()) - for ts := int64(750); ts < 800; ts++ { - _, err := headApp.Append(0, aSeries, 0, ts, float64(ts), nil, nil, storage.AOptions{}) - require.NoError(t, err) - } - require.NoError(t, headApp.Commit()) - - walSize, err = db.Head().wal.Size() - require.NoError(t, err) - wblSize, err := db.Head().wbl.Size() - require.NoError(t, err) - require.NotZero(t, wblSize) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - expSize = blockSize + walSize + wblSize + cdmSize - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Decrease the max bytes limit so that a delete is triggered. - // Check total size, total count and check that the oldest block was deleted. - firstBlockSize := db.Blocks()[0].Size() - sizeLimit := actSize - firstBlockSize - db.opts.MaxBytes = sizeLimit // Set the new db size limit one block smaller that the actual size. - require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. 
- - expBlocks := blocks[1:] - actBlocks := db.Blocks() - blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) - walSize, err = db.Head().wal.Size() - require.NoError(t, err) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - // Expected size should take into account block size + WAL size + WBL size - expSize = blockSize + walSize + wblSize + cdmSize - actRetentionCount := int(prom_testutil.ToFloat64(db.metrics.sizeRetentionCount)) - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - - require.Equal(t, 1, actRetentionCount, "metric retention count mismatch") - require.Equal(t, expSize, actSize, "metric db size doesn't match actual disk size") - require.LessOrEqual(t, expSize, sizeLimit, "actual size (%v) is expected to be less than or equal to limit (%v)", expSize, sizeLimit) - require.Len(t, actBlocks, len(blocks)-1, "new block count should be decreased from:%v to:%v", len(blocks), len(blocks)-1) - require.Equal(t, expBlocks[0].MaxTime, actBlocks[0].meta.MaxTime, "maxT mismatch of the first block") - require.Equal(t, expBlocks[len(expBlocks)-1].MaxTime, actBlocks[len(actBlocks)-1].meta.MaxTime, "maxT mismatch of the last block") } func TestNotMatcherSelectsLabelsUnsetSeries_AppendV2(t *testing.T) { @@ -1499,33 +1505,36 @@ func TestInitializeHeadTimestamp_AppendV2(t *testing.T) { require.Equal(t, int64(1000), db.head.MaxTime()) require.True(t, db.head.initialized()) }) - t.Run("wal-only", func(t *testing.T) { - dir := t.TempDir() - require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) - w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) - require.NoError(t, err) + for _, enableSTStorage := range []bool{false, true} { + t.Run("wal-only,stStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + dir := t.TempDir() - var enc record.Encoder - err = w.Log( - enc.Series([]record.RefSeries{ - {Ref: 123, Labels: labels.FromStrings("a", "1")}, - 
{Ref: 124, Labels: labels.FromStrings("a", "2")}, - }, nil), - enc.Samples([]record.RefSample{ - {Ref: 123, T: 5000, V: 1}, - {Ref: 124, T: 15000, V: 1}, - }, nil), - ) - require.NoError(t, err) - require.NoError(t, w.Close()) + require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) + w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) + require.NoError(t, err) - db := newTestDB(t, withDir(dir)) + enc := record.Encoder{EnableSTStorage: enableSTStorage} + err = w.Log( + enc.Series([]record.RefSeries{ + {Ref: 123, Labels: labels.FromStrings("a", "1")}, + {Ref: 124, Labels: labels.FromStrings("a", "2")}, + }, nil), + enc.Samples([]record.RefSample{ + {Ref: 123, T: 5000, V: 1}, + {Ref: 124, T: 15000, V: 1}, + }, nil), + ) + require.NoError(t, err) + require.NoError(t, w.Close()) - require.Equal(t, int64(5000), db.head.MinTime()) - require.Equal(t, int64(15000), db.head.MaxTime()) - require.True(t, db.head.initialized()) - }) + db := newTestDB(t, withDir(dir)) + + require.Equal(t, int64(5000), db.head.MinTime()) + require.Equal(t, int64(15000), db.head.MaxTime()) + require.True(t, db.head.initialized()) + }) + } t.Run("existing-block", func(t *testing.T) { dir := t.TempDir() @@ -1537,37 +1546,39 @@ func TestInitializeHeadTimestamp_AppendV2(t *testing.T) { require.Equal(t, int64(2000), db.head.MaxTime()) require.True(t, db.head.initialized()) }) - t.Run("existing-block-and-wal", func(t *testing.T) { - dir := t.TempDir() + for _, enableSTStorage := range []bool{false, true} { + t.Run("existing-block-and-wal,stStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + dir := t.TempDir() - createBlock(t, dir, genSeries(1, 1, 1000, 6000)) + createBlock(t, dir, genSeries(1, 1, 1000, 6000)) - require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) - w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) - require.NoError(t, err) + require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) + w, err := wlog.New(nil, 
nil, path.Join(dir, "wal"), compression.None) + require.NoError(t, err) - var enc record.Encoder - err = w.Log( - enc.Series([]record.RefSeries{ - {Ref: 123, Labels: labels.FromStrings("a", "1")}, - {Ref: 124, Labels: labels.FromStrings("a", "2")}, - }, nil), - enc.Samples([]record.RefSample{ - {Ref: 123, T: 5000, V: 1}, - {Ref: 124, T: 15000, V: 1}, - }, nil), - ) - require.NoError(t, err) - require.NoError(t, w.Close()) + enc := record.Encoder{EnableSTStorage: enableSTStorage} + err = w.Log( + enc.Series([]record.RefSeries{ + {Ref: 123, Labels: labels.FromStrings("a", "1")}, + {Ref: 124, Labels: labels.FromStrings("a", "2")}, + }, nil), + enc.Samples([]record.RefSample{ + {Ref: 123, T: 5000, V: 1}, + {Ref: 124, T: 15000, V: 1}, + }, nil), + ) + require.NoError(t, err) + require.NoError(t, w.Close()) - db := newTestDB(t, withDir(dir)) + db := newTestDB(t, withDir(dir)) - require.Equal(t, int64(6000), db.head.MinTime()) - require.Equal(t, int64(15000), db.head.MaxTime()) - require.True(t, db.head.initialized()) - // Check that old series has been GCed. - require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.series)) - }) + require.Equal(t, int64(6000), db.head.MinTime()) + require.Equal(t, int64(15000), db.head.MaxTime()) + require.True(t, db.head.initialized()) + // Check that old series has been GCed. 
+ require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.series)) + }) + } } func TestNoEmptyBlocks_AppendV2(t *testing.T) { @@ -3265,7 +3276,7 @@ func testOOOWALWriteAppendV2(t *testing.T, series, err := dec.Series(rec, nil) require.NoError(t, err) records = append(records, series) - case record.Samples: + case record.Samples, record.SamplesV2: samples, err := dec.Samples(rec, nil) require.NoError(t, err) records = append(records, samples) @@ -3422,112 +3433,116 @@ func TestMetadataInWAL_AppenderV2(t *testing.T) { } func TestMetadataCheckpointingOnlyKeepsLatestEntry_AppendV2(t *testing.T) { - ctx := context.Background() - numSamples := 10000 - hb, w := newTestHead(t, int64(numSamples)*10, compression.None, false) - hb.opts.EnableMetadataWALRecords = true + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + ctx := context.Background() + numSamples := 10000 + hb, w := newTestHead(t, int64(numSamples)*10, compression.None, false) + hb.opts.EnableMetadataWALRecords = true - // Add some series so we can append metadata to them. - s1 := labels.FromStrings("a", "b") - s2 := labels.FromStrings("c", "d") - s3 := labels.FromStrings("e", "f") - s4 := labels.FromStrings("g", "h") + // Add some series so we can append metadata to them. 
+ s1 := labels.FromStrings("a", "b") + s2 := labels.FromStrings("c", "d") + s3 := labels.FromStrings("e", "f") + s4 := labels.FromStrings("g", "h") - m1 := metadata.Metadata{Type: "gauge", Unit: "unit_1", Help: "help_1"} - m2 := metadata.Metadata{Type: "gauge", Unit: "unit_2", Help: "help_2"} - m3 := metadata.Metadata{Type: "gauge", Unit: "unit_3", Help: "help_3"} - m4 := metadata.Metadata{Type: "gauge", Unit: "unit_4", Help: "help_4"} + m1 := metadata.Metadata{Type: "gauge", Unit: "unit_1", Help: "help_1"} + m2 := metadata.Metadata{Type: "gauge", Unit: "unit_2", Help: "help_2"} + m3 := metadata.Metadata{Type: "gauge", Unit: "unit_3", Help: "help_3"} + m4 := metadata.Metadata{Type: "gauge", Unit: "unit_4", Help: "help_4"} - app := hb.AppenderV2(ctx) - ts := int64(0) - _, err := app.Append(0, s1, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m1}) - require.NoError(t, err) - _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m2}) - require.NoError(t, err) - _, err = app.Append(0, s3, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m3}) - require.NoError(t, err) - _, err = app.Append(0, s4, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m4}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + app := hb.AppenderV2(ctx) + ts := int64(0) + _, err := app.Append(0, s1, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m1}) + require.NoError(t, err) + _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m2}) + require.NoError(t, err) + _, err = app.Append(0, s3, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m3}) + require.NoError(t, err) + _, err = app.Append(0, s4, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m4}) + require.NoError(t, err) + require.NoError(t, app.Commit()) - // Update metadata for first series. 
- m5 := metadata.Metadata{Type: "counter", Unit: "unit_5", Help: "help_5"} - app = hb.AppenderV2(ctx) - ts++ - _, err = app.Append(0, s1, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m5}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + // Update metadata for first series. + m5 := metadata.Metadata{Type: "counter", Unit: "unit_5", Help: "help_5"} + app = hb.AppenderV2(ctx) + ts++ + _, err = app.Append(0, s1, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m5}) + require.NoError(t, err) + require.NoError(t, app.Commit()) - // Switch back-and-forth metadata for second series. - // Since it ended on a new metadata record, we expect a single new entry. - m6 := metadata.Metadata{Type: "counter", Unit: "unit_6", Help: "help_6"} + // Switch back-and-forth metadata for second series. + // Since it ended on a new metadata record, we expect a single new entry. + m6 := metadata.Metadata{Type: "counter", Unit: "unit_6", Help: "help_6"} - app = hb.AppenderV2(ctx) - ts++ - _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m6}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + app = hb.AppenderV2(ctx) + ts++ + _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m6}) + require.NoError(t, err) + require.NoError(t, app.Commit()) - app = hb.AppenderV2(ctx) - ts++ - _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m2}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + app = hb.AppenderV2(ctx) + ts++ + _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m2}) + require.NoError(t, err) + require.NoError(t, app.Commit()) - app = hb.AppenderV2(ctx) - ts++ - _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m6}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + app = hb.AppenderV2(ctx) + ts++ + _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m6}) + require.NoError(t, err) + require.NoError(t, 
app.Commit()) - app = hb.AppenderV2(ctx) - ts++ - _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m2}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + app = hb.AppenderV2(ctx) + ts++ + _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m2}) + require.NoError(t, err) + require.NoError(t, app.Commit()) - app = hb.AppenderV2(ctx) - ts++ - _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m6}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + app = hb.AppenderV2(ctx) + ts++ + _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m6}) + require.NoError(t, err) + require.NoError(t, app.Commit()) - // Let's create a checkpoint. - first, last, err := wlog.Segments(w.Dir()) - require.NoError(t, err) - keep := func(id chunks.HeadSeriesRef) bool { - return id != 3 + // Let's create a checkpoint. + first, last, err := wlog.Segments(w.Dir()) + require.NoError(t, err) + keep := func(id chunks.HeadSeriesRef) bool { + return id != 3 + } + _, err = wlog.Checkpoint(promslog.NewNopLogger(), w, first, last-1, keep, 0, enableSTStorage) + require.NoError(t, err) + + // Confirm there's been a checkpoint. + cdir, _, err := wlog.LastCheckpoint(w.Dir()) + require.NoError(t, err) + + // Read in checkpoint and WAL. + recs := readTestWAL(t, cdir) + var gotMetadataBlocks [][]record.RefMetadata + for _, rec := range recs { + if mr, ok := rec.([]record.RefMetadata); ok { + gotMetadataBlocks = append(gotMetadataBlocks, mr) + } + } + + // There should only be 1 metadata block present, with only the latest + // metadata kept around. 
+ wantMetadata := []record.RefMetadata{ + {Ref: 1, Type: record.GetMetricType(m5.Type), Unit: m5.Unit, Help: m5.Help}, + {Ref: 2, Type: record.GetMetricType(m6.Type), Unit: m6.Unit, Help: m6.Help}, + {Ref: 4, Type: record.GetMetricType(m4.Type), Unit: m4.Unit, Help: m4.Help}, + } + require.Len(t, gotMetadataBlocks, 1) + require.Len(t, gotMetadataBlocks[0], 3) + gotMetadataBlock := gotMetadataBlocks[0] + + sort.Slice(gotMetadataBlock, func(i, j int) bool { return gotMetadataBlock[i].Ref < gotMetadataBlock[j].Ref }) + require.Equal(t, wantMetadata, gotMetadataBlock) + require.NoError(t, hb.Close()) + }) } - _, err = wlog.Checkpoint(promslog.NewNopLogger(), w, first, last-1, keep, 0) - require.NoError(t, err) - - // Confirm there's been a checkpoint. - cdir, _, err := wlog.LastCheckpoint(w.Dir()) - require.NoError(t, err) - - // Read in checkpoint and WAL. - recs := readTestWAL(t, cdir) - var gotMetadataBlocks [][]record.RefMetadata - for _, rec := range recs { - if mr, ok := rec.([]record.RefMetadata); ok { - gotMetadataBlocks = append(gotMetadataBlocks, mr) - } - } - - // There should only be 1 metadata block present, with only the latest - // metadata kept around. 
- wantMetadata := []record.RefMetadata{ - {Ref: 1, Type: record.GetMetricType(m5.Type), Unit: m5.Unit, Help: m5.Help}, - {Ref: 2, Type: record.GetMetricType(m6.Type), Unit: m6.Unit, Help: m6.Help}, - {Ref: 4, Type: record.GetMetricType(m4.Type), Unit: m4.Unit, Help: m4.Help}, - } - require.Len(t, gotMetadataBlocks, 1) - require.Len(t, gotMetadataBlocks[0], 3) - gotMetadataBlock := gotMetadataBlocks[0] - - sort.Slice(gotMetadataBlock, func(i, j int) bool { return gotMetadataBlock[i].Ref < gotMetadataBlock[j].Ref }) - require.Equal(t, wantMetadata, gotMetadataBlock) - require.NoError(t, hb.Close()) } func TestMetadataAssertInMemoryData_AppendV2(t *testing.T) { @@ -7489,6 +7504,65 @@ func TestAbortBlockCompactions_AppendV2(t *testing.T) { require.Equal(t, 4, compactions, "expected 4 compactions to be completed") } +// TestCompactHeadWithSTStorage_AppendV2 ensures that when EnableSTStorage is true, +// compacted blocks contain chunks with EncXOR2 encoding for float samples. +func TestCompactHeadWithSTStorage_AppendV2(t *testing.T) { + t.Parallel() + + opts := &Options{ + RetentionDuration: int64(time.Hour * 24 * 15 / time.Millisecond), + NoLockfile: true, + MinBlockDuration: int64(time.Hour * 2 / time.Millisecond), + MaxBlockDuration: int64(time.Hour * 2 / time.Millisecond), + WALCompression: compression.Snappy, + EnableSTStorage: true, + EnableXOR2Encoding: true, + } + db := newTestDB(t, withOpts(opts)) + ctx := context.Background() + app := db.AppenderV2(ctx) + + mint := 100 + maxt := 200 + for i := mint; i < maxt; i++ { + _, err := app.Append(0, labels.FromStrings("a", "b"), 50, int64(i), float64(i), nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + + require.NoError(t, db.CompactHead(NewRangeHead(db.Head(), int64(mint), int64(maxt)-1))) + require.Len(t, db.Blocks(), 1) + b := db.Blocks()[0] + + chunkr, err := b.Chunks() + require.NoError(t, err) + defer chunkr.Close() + + indexr, err := b.Index() + require.NoError(t, 
err) + defer indexr.Close() + + p, err := indexr.Postings(ctx, "a", "b") + require.NoError(t, err) + + chunkCount := 0 + for p.Next() { + var builder labels.ScratchBuilder + var chks []chunks.Meta + require.NoError(t, indexr.Series(p.At(), &builder, &chks)) + + for _, chk := range chks { + c, _, err := chunkr.ChunkOrIterable(chk) + require.NoError(t, err) + require.Equal(t, chunkenc.EncXOR2, c.Encoding(), + "unexpected chunk encoding, got %s", c.Encoding()) + chunkCount++ + } + } + require.NoError(t, p.Err()) + require.Positive(t, chunkCount, "expected at least one chunk") +} + func TestNewCompactorFunc_AppendV2(t *testing.T) { opts := DefaultOptions() block1 := ulid.MustNew(1, nil) @@ -7520,3 +7594,111 @@ func TestNewCompactorFunc_AppendV2(t *testing.T) { require.Len(t, ulids, 1) require.Equal(t, block2, ulids[0]) } + +// TestDBAppenderV2_STStorage_OutOfOrder verifies that ST storage works correctly +// when samples are appended out of order and can be queried using ChunkQuerier. +func TestDBAppenderV2_STStorage_OutOfOrder(t *testing.T) { + testHistogram := tsdbutil.GenerateTestHistogram(1) + testHistogram.CounterResetHint = histogram.NotCounterReset + + testCases := []struct { + name string + appendSamples []chunks.Sample + expectedSamples []chunks.Sample + }{ + { + name: "Float samples out of order", + appendSamples: []chunks.Sample{ + newSample(20, 200, 2.0, nil, nil), // Append second sample first. + newSample(10, 100, 1.0, nil, nil), // Append first sample second (OOO). + newSample(30, 300, 3.0, nil, nil), // Append third sample last. + newSample(25, 250, 2.5, nil, nil), // Append middle sample (OOO). + }, + expectedSamples: []chunks.Sample{ + newSample(10, 100, 1.0, nil, nil), + newSample(20, 200, 2.0, nil, nil), + newSample(25, 250, 2.5, nil, nil), + newSample(30, 300, 3.0, nil, nil), + }, + }, + { + name: "Histogram samples out of order", + appendSamples: []chunks.Sample{ + newSample(30, 300, 0, testHistogram, nil), // Append third sample first. 
+ newSample(10, 100, 0, testHistogram, nil), // Append first sample second (OOO). + newSample(20, 200, 0, testHistogram, nil), // Append second sample last (OOO). + }, + // Histograms don't support ST storage yet, should return 0 for ST. + expectedSamples: []chunks.Sample{ + newSample(0, 100, 0, testHistogram, nil), + newSample(0, 200, 0, testHistogram, nil), + newSample(0, 300, 0, testHistogram, nil), + }, + }, + { + name: "Mixed float samples with same ST", + appendSamples: []chunks.Sample{ + newSample(10, 200, 2.0, nil, nil), + newSample(10, 100, 1.0, nil, nil), // OOO with same ST. + newSample(10, 300, 3.0, nil, nil), + }, + expectedSamples: []chunks.Sample{ + newSample(10, 100, 1.0, nil, nil), + newSample(10, 200, 2.0, nil, nil), + newSample(10, 300, 3.0, nil, nil), + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + opts := DefaultOptions() + opts.OutOfOrderTimeWindow = 300 * time.Minute.Milliseconds() + opts.EnableSTStorage = true + opts.EnableXOR2Encoding = true + db := newTestDB(t, withOpts(opts)) + db.DisableCompactions() + + lbls := labels.FromStrings("foo", "bar") + + for _, s := range tc.appendSamples { + app := db.AppenderV2(context.Background()) + _, err := app.Append(0, lbls, s.ST(), s.T(), s.F(), s.H(), s.FH(), storage.AOptions{}) + require.NoError(t, err, "Appending OOO sample with ST should succeed") + require.NoError(t, app.Commit(), "Committing OOO sample with ST should succeed") + } + + querier, err := db.ChunkQuerier(math.MinInt64, math.MaxInt64) + require.NoError(t, err) + defer querier.Close() + + ss := querier.Select(context.Background(), false, nil, labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")) + require.True(t, ss.Next(), "Should have series") + series := ss.At() + require.NoError(t, ss.Err()) + require.False(t, ss.Next(), "Should have only one series") + + chunkIt := series.Iterator(nil) + var actualSamples []chunks.Sample + + for chunkIt.Next() { + chk := chunkIt.At() + it := 
chk.Chunk.Iterator(nil) + samples, err := storage.ExpandSamples(it, newSample) + require.NoError(t, err) + actualSamples = append(actualSamples, samples...) + } + require.NoError(t, chunkIt.Err()) + + // Use requireEqualSamplesIgnoreCounterResets to ignore histogram counter reset hints. + requireEqualSamples(t, lbls.String(), tc.expectedSamples, actualSamples, requireEqualSamplesIgnoreCounterResets) + + // Additionally verify ST values match expectations. + require.Len(t, actualSamples, len(tc.expectedSamples)) + for i, expected := range tc.expectedSamples { + actual := actualSamples[i] + require.Equal(t, expected.ST(), actual.ST(), "Sample %d: ST should match", i) + } + }) + } +} diff --git a/tsdb/db_test.go b/tsdb/db_test.go index 7cab2a0f55..21b2c08124 100644 --- a/tsdb/db_test.go +++ b/tsdb/db_test.go @@ -395,7 +395,7 @@ func TestDataNotAvailableAfterRollback(t *testing.T) { require.NoError(t, err) walSeriesCount += len(series) - case record.Samples: + case record.Samples, record.SamplesV2: var samples []record.RefSample samples, err = dec.Samples(rec, samples) require.NoError(t, err) @@ -1170,24 +1170,25 @@ func TestWALReplayRaceOnSamplesLoggedBeforeSeries(t *testing.T) { // We test both with few and many samples appended after series creation. If samples are < 120 then there's no // mmap-ed chunk, otherwise there's at least 1 mmap-ed chunk when replaying the WAL. 
- for _, numSamplesAfterSeriesCreation := range []int{1, 1000} { - for run := 1; run <= numRuns; run++ { - t.Run(fmt.Sprintf("samples after series creation = %d, run = %d", numSamplesAfterSeriesCreation, run), func(t *testing.T) { - testWALReplayRaceOnSamplesLoggedBeforeSeries(t, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation) - }) + for _, enableSTStorage := range []bool{false, true} { + for _, numSamplesAfterSeriesCreation := range []int{1, 1000} { + for run := 1; run <= numRuns; run++ { + t.Run(fmt.Sprintf("samples after series creation = %d, run = %d, stStorage=%v", numSamplesAfterSeriesCreation, run, enableSTStorage), func(t *testing.T) { + testWALReplayRaceOnSamplesLoggedBeforeSeries(t, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation, enableSTStorage) + }) + } } } } -func testWALReplayRaceOnSamplesLoggedBeforeSeries(t *testing.T, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation int) { +func testWALReplayRaceOnSamplesLoggedBeforeSeries(t *testing.T, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation int, enableSTStorage bool) { const numSeries = 1000 - db := newTestDB(t) db.DisableCompactions() for seriesRef := 1; seriesRef <= numSeries; seriesRef++ { // Log samples before the series is logged to the WAL. 
- var enc record.Encoder + enc := record.Encoder{EnableSTStorage: enableSTStorage} var samples []record.RefSample for ts := range numSamplesBeforeSeriesCreation { @@ -1551,139 +1552,143 @@ func TestRetentionDurationMetric(t *testing.T) { func TestSizeRetention(t *testing.T) { t.Parallel() - opts := DefaultOptions() - opts.OutOfOrderTimeWindow = 100 - db := newTestDB(t, withOpts(opts), withRngs(100)) + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + opts := DefaultOptions() + opts.OutOfOrderTimeWindow = 100 + db := newTestDB(t, withOpts(opts), withRngs(100)) - blocks := []*BlockMeta{ - {MinTime: 100, MaxTime: 200}, // Oldest block - {MinTime: 200, MaxTime: 300}, - {MinTime: 300, MaxTime: 400}, - {MinTime: 400, MaxTime: 500}, - {MinTime: 500, MaxTime: 600}, // Newest Block - } + blocks := []*BlockMeta{ + {MinTime: 100, MaxTime: 200}, // Oldest block + {MinTime: 200, MaxTime: 300}, + {MinTime: 300, MaxTime: 400}, + {MinTime: 400, MaxTime: 500}, + {MinTime: 500, MaxTime: 600}, // Newest Block + } - for _, m := range blocks { - createBlock(t, db.Dir(), genSeries(100, 10, m.MinTime, m.MaxTime)) - } + for _, m := range blocks { + createBlock(t, db.Dir(), genSeries(100, 10, m.MinTime, m.MaxTime)) + } - headBlocks := []*BlockMeta{ - {MinTime: 700, MaxTime: 800}, - } + headBlocks := []*BlockMeta{ + {MinTime: 700, MaxTime: 800}, + } - // Add some data to the WAL. - headApp := db.Head().Appender(context.Background()) - var aSeries labels.Labels - var it chunkenc.Iterator - for _, m := range headBlocks { - series := genSeries(100, 10, m.MinTime, m.MaxTime+1) - for _, s := range series { - aSeries = s.Labels() - it = s.Iterator(it) - for it.Next() == chunkenc.ValFloat { - tim, v := it.At() - _, err := headApp.Append(0, s.Labels(), tim, v) + // Add some data to the WAL. 
+ headApp := db.Head().Appender(context.Background()) + var aSeries labels.Labels + var it chunkenc.Iterator + for _, m := range headBlocks { + series := genSeries(100, 10, m.MinTime, m.MaxTime+1) + for _, s := range series { + aSeries = s.Labels() + it = s.Iterator(it) + for it.Next() == chunkenc.ValFloat { + tim, v := it.At() + _, err := headApp.Append(0, s.Labels(), tim, v) + require.NoError(t, err) + } + require.NoError(t, it.Err()) + } + } + require.NoError(t, headApp.Commit()) + db.Head().mmapHeadChunks() + + require.Eventually(t, func() bool { + return db.Head().chunkDiskMapper.IsQueueEmpty() + }, 2*time.Second, 100*time.Millisecond) + + // Test that registered size matches the actual disk size. + require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. + require.Len(t, db.Blocks(), len(blocks)) // Ensure all blocks are registered. + blockSize := int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. + walSize, err := db.Head().wal.Size() + require.NoError(t, err) + cdmSize, err := db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + // Expected size should take into account block size + WAL size + Head + // chunks size + expSize := blockSize + walSize + cdmSize + actSize, err := fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Create a WAL checkpoint, and compare sizes. + first, last, err := wlog.Segments(db.Head().wal.Dir()) + require.NoError(t, err) + _, err = wlog.Checkpoint(promslog.NewNopLogger(), db.Head().wal, first, last-1, func(chunks.HeadSeriesRef) bool { return false }, 0, enableSTStorage) + require.NoError(t, err) + blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. 
+ walSize, err = db.Head().wal.Size() + require.NoError(t, err) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + expSize = blockSize + walSize + cdmSize + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Truncate Chunk Disk Mapper and compare sizes. + require.NoError(t, db.Head().chunkDiskMapper.Truncate(900)) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + expSize = blockSize + walSize + cdmSize + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Add some out of order samples to check the size of WBL. + headApp = db.Head().Appender(context.Background()) + for ts := int64(750); ts < 800; ts++ { + _, err := headApp.Append(0, aSeries, ts, float64(ts)) require.NoError(t, err) } - require.NoError(t, it.Err()) - } + require.NoError(t, headApp.Commit()) + + walSize, err = db.Head().wal.Size() + require.NoError(t, err) + wblSize, err := db.Head().wbl.Size() + require.NoError(t, err) + require.NotZero(t, wblSize) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + expSize = blockSize + walSize + wblSize + cdmSize + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Decrease the max bytes limit so that a delete is triggered. + // Check total size, total count and check that the oldest block was deleted. + firstBlockSize := db.Blocks()[0].Size() + sizeLimit := actSize - firstBlockSize + db.opts.MaxBytes = sizeLimit // Set the new db size limit one block smaller that the actual size. + require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. 
+ + expBlocks := blocks[1:] + actBlocks := db.Blocks() + blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) + walSize, err = db.Head().wal.Size() + require.NoError(t, err) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + // Expected size should take into account block size + WAL size + WBL size + expSize = blockSize + walSize + wblSize + cdmSize + actRetentionCount := int(prom_testutil.ToFloat64(db.metrics.sizeRetentionCount)) + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + + require.Equal(t, 1, actRetentionCount, "metric retention count mismatch") + require.Equal(t, expSize, actSize, "metric db size doesn't match actual disk size") + require.LessOrEqual(t, expSize, sizeLimit, "actual size (%v) is expected to be less than or equal to limit (%v)", expSize, sizeLimit) + require.Len(t, actBlocks, len(blocks)-1, "new block count should be decreased from:%v to:%v", len(blocks), len(blocks)-1) + require.Equal(t, expBlocks[0].MaxTime, actBlocks[0].meta.MaxTime, "maxT mismatch of the first block") + require.Equal(t, expBlocks[len(expBlocks)-1].MaxTime, actBlocks[len(actBlocks)-1].meta.MaxTime, "maxT mismatch of the last block") + }) } - require.NoError(t, headApp.Commit()) - db.Head().mmapHeadChunks() - - require.Eventually(t, func() bool { - return db.Head().chunkDiskMapper.IsQueueEmpty() - }, 2*time.Second, 100*time.Millisecond) - - // Test that registered size matches the actual disk size. - require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. - require.Len(t, db.Blocks(), len(blocks)) // Ensure all blocks are registered. - blockSize := int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. 
- walSize, err := db.Head().wal.Size() - require.NoError(t, err) - cdmSize, err := db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - // Expected size should take into account block size + WAL size + Head - // chunks size - expSize := blockSize + walSize + cdmSize - actSize, err := fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Create a WAL checkpoint, and compare sizes. - first, last, err := wlog.Segments(db.Head().wal.Dir()) - require.NoError(t, err) - _, err = wlog.Checkpoint(promslog.NewNopLogger(), db.Head().wal, first, last-1, func(chunks.HeadSeriesRef) bool { return false }, 0) - require.NoError(t, err) - blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. - walSize, err = db.Head().wal.Size() - require.NoError(t, err) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - expSize = blockSize + walSize + cdmSize - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Truncate Chunk Disk Mapper and compare sizes. - require.NoError(t, db.Head().chunkDiskMapper.Truncate(900)) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - expSize = blockSize + walSize + cdmSize - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Add some out of order samples to check the size of WBL. 
- headApp = db.Head().Appender(context.Background()) - for ts := int64(750); ts < 800; ts++ { - _, err := headApp.Append(0, aSeries, ts, float64(ts)) - require.NoError(t, err) - } - require.NoError(t, headApp.Commit()) - - walSize, err = db.Head().wal.Size() - require.NoError(t, err) - wblSize, err := db.Head().wbl.Size() - require.NoError(t, err) - require.NotZero(t, wblSize) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - expSize = blockSize + walSize + wblSize + cdmSize - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Decrease the max bytes limit so that a delete is triggered. - // Check total size, total count and check that the oldest block was deleted. - firstBlockSize := db.Blocks()[0].Size() - sizeLimit := actSize - firstBlockSize - db.opts.MaxBytes = sizeLimit // Set the new db size limit one block smaller that the actual size. - require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. 
- - expBlocks := blocks[1:] - actBlocks := db.Blocks() - blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) - walSize, err = db.Head().wal.Size() - require.NoError(t, err) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - // Expected size should take into account block size + WAL size + WBL size - expSize = blockSize + walSize + wblSize + cdmSize - actRetentionCount := int(prom_testutil.ToFloat64(db.metrics.sizeRetentionCount)) - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - - require.Equal(t, 1, actRetentionCount, "metric retention count mismatch") - require.Equal(t, expSize, actSize, "metric db size doesn't match actual disk size") - require.LessOrEqual(t, expSize, sizeLimit, "actual size (%v) is expected to be less than or equal to limit (%v)", expSize, sizeLimit) - require.Len(t, actBlocks, len(blocks)-1, "new block count should be decreased from:%v to:%v", len(blocks), len(blocks)-1) - require.Equal(t, expBlocks[0].MaxTime, actBlocks[0].meta.MaxTime, "maxT mismatch of the first block") - require.Equal(t, expBlocks[len(expBlocks)-1].MaxTime, actBlocks[len(actBlocks)-1].meta.MaxTime, "maxT mismatch of the last block") } func TestSizeRetentionMetric(t *testing.T) { @@ -1743,7 +1748,7 @@ func TestRuntimeRetentionConfigChange(t *testing.T) { StorageConfig: config.StorageConfig{ TSDBConfig: &config.TSDBConfig{ Retention: &config.TSDBRetentionConfig{ - Time: model.Duration(shorterRetentionDuration), + Time: model.Duration(time.Duration(shorterRetentionDuration) * time.Millisecond), }, }, }, @@ -1772,6 +1777,31 @@ func TestRuntimeRetentionConfigChange(t *testing.T) { require.Positive(t, int(prom_testutil.ToFloat64(db.metrics.timeRetentionCount)), "time retention count should be incremented") } +// TestApplyConfigRetentionDurationMetricUnit verifies that after a config +// reload the prometheus_tsdb_retention_limit_seconds metric reports the +// retention in seconds. 
+func TestApplyConfigRetentionDurationMetricUnit(t *testing.T) { + oneHourMs := int64(time.Hour / time.Millisecond) + db := newTestDB(t, withOpts(&Options{RetentionDuration: oneHourMs})) + + cfg := &config.Config{ + StorageConfig: config.StorageConfig{ + TSDBConfig: &config.TSDBConfig{ + Retention: &config.TSDBRetentionConfig{ + Time: model.Duration(time.Hour), + }, + }, + }, + } + require.NoError(t, db.ApplyConfig(cfg)) + + require.Equal(t, oneHourMs, db.getRetentionDuration()) + + gotSeconds := prom_testutil.ToFloat64(db.metrics.retentionDuration) + wantSeconds := time.Hour.Seconds() + require.Equal(t, wantSeconds, gotSeconds) +} + func TestNotMatcherSelectsLabelsUnsetSeries(t *testing.T) { db := newTestDB(t) @@ -2072,33 +2102,36 @@ func TestInitializeHeadTimestamp(t *testing.T) { require.Equal(t, int64(1000), db.head.MaxTime()) require.True(t, db.head.initialized()) }) - t.Run("wal-only", func(t *testing.T) { - dir := t.TempDir() - require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) - w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) - require.NoError(t, err) + for _, enableSTStorage := range []bool{false, true} { + t.Run("wal-only-st-"+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + dir := t.TempDir() - var enc record.Encoder - err = w.Log( - enc.Series([]record.RefSeries{ - {Ref: 123, Labels: labels.FromStrings("a", "1")}, - {Ref: 124, Labels: labels.FromStrings("a", "2")}, - }, nil), - enc.Samples([]record.RefSample{ - {Ref: 123, T: 5000, V: 1}, - {Ref: 124, T: 15000, V: 1}, - }, nil), - ) - require.NoError(t, err) - require.NoError(t, w.Close()) + require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) + w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) + require.NoError(t, err) - db := newTestDB(t, withDir(dir)) + enc := record.Encoder{EnableSTStorage: enableSTStorage} + err = w.Log( + enc.Series([]record.RefSeries{ + {Ref: 123, Labels: labels.FromStrings("a", "1")}, + {Ref: 124, Labels: 
labels.FromStrings("a", "2")}, + }, nil), + enc.Samples([]record.RefSample{ + {Ref: 123, T: 5000, V: 1}, + {Ref: 124, T: 15000, V: 1}, + }, nil), + ) + require.NoError(t, err) + require.NoError(t, w.Close()) - require.Equal(t, int64(5000), db.head.MinTime()) - require.Equal(t, int64(15000), db.head.MaxTime()) - require.True(t, db.head.initialized()) - }) + db := newTestDB(t, withDir(dir)) + + require.Equal(t, int64(5000), db.head.MinTime()) + require.Equal(t, int64(15000), db.head.MaxTime()) + require.True(t, db.head.initialized()) + }) + } t.Run("existing-block", func(t *testing.T) { dir := t.TempDir() @@ -2110,37 +2143,40 @@ func TestInitializeHeadTimestamp(t *testing.T) { require.Equal(t, int64(2000), db.head.MaxTime()) require.True(t, db.head.initialized()) }) - t.Run("existing-block-and-wal", func(t *testing.T) { - dir := t.TempDir() - createBlock(t, dir, genSeries(1, 1, 1000, 6000)) + for _, enableSTStorage := range []bool{false, true} { + t.Run("existing-block-and-wal,enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + dir := t.TempDir() - require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) - w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) - require.NoError(t, err) + createBlock(t, dir, genSeries(1, 1, 1000, 6000)) - var enc record.Encoder - err = w.Log( - enc.Series([]record.RefSeries{ - {Ref: 123, Labels: labels.FromStrings("a", "1")}, - {Ref: 124, Labels: labels.FromStrings("a", "2")}, - }, nil), - enc.Samples([]record.RefSample{ - {Ref: 123, T: 5000, V: 1}, - {Ref: 124, T: 15000, V: 1}, - }, nil), - ) - require.NoError(t, err) - require.NoError(t, w.Close()) + require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) + w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) + require.NoError(t, err) - db := newTestDB(t, withDir(dir)) + enc := record.Encoder{EnableSTStorage: enableSTStorage} + err = w.Log( + enc.Series([]record.RefSeries{ + {Ref: 123, Labels: 
labels.FromStrings("a", "1")}, + {Ref: 124, Labels: labels.FromStrings("a", "2")}, + }, nil), + enc.Samples([]record.RefSample{ + {Ref: 123, T: 5000, V: 1}, + {Ref: 124, T: 15000, V: 1}, + }, nil), + ) + require.NoError(t, err) + require.NoError(t, w.Close()) - require.Equal(t, int64(6000), db.head.MinTime()) - require.Equal(t, int64(15000), db.head.MaxTime()) - require.True(t, db.head.initialized()) - // Check that old series has been GCed. - require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.series)) - }) + db := newTestDB(t, withDir(dir)) + + require.Equal(t, int64(6000), db.head.MinTime()) + require.Equal(t, int64(15000), db.head.MaxTime()) + require.True(t, db.head.initialized()) + // Check that old series has been GCed. + require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.series)) + }) + } } func TestNoEmptyBlocks(t *testing.T) { @@ -4523,7 +4559,7 @@ func testOOOWALWrite(t *testing.T, series, err := dec.Series(rec, nil) require.NoError(t, err) records = append(records, series) - case record.Samples: + case record.Samples, record.SamplesV2: samples, err := dec.Samples(rec, nil) require.NoError(t, err) records = append(records, samples) @@ -4684,102 +4720,106 @@ func TestMetadataCheckpointingOnlyKeepsLatestEntry(t *testing.T) { require.NoError(t, err) } - ctx := context.Background() - numSamples := 10000 - hb, w := newTestHead(t, int64(numSamples)*10, compression.None, false) + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + ctx := context.Background() + numSamples := 10000 + hb, w := newTestHead(t, int64(numSamples)*10, compression.None, false) - // Add some series so we can append metadata to them. - app := hb.Appender(ctx) - s1 := labels.FromStrings("a", "b") - s2 := labels.FromStrings("c", "d") - s3 := labels.FromStrings("e", "f") - s4 := labels.FromStrings("g", "h") + // Add some series so we can append metadata to them. 
+ app := hb.Appender(ctx) + s1 := labels.FromStrings("a", "b") + s2 := labels.FromStrings("c", "d") + s3 := labels.FromStrings("e", "f") + s4 := labels.FromStrings("g", "h") - for _, s := range []labels.Labels{s1, s2, s3, s4} { - _, err := app.Append(0, s, 0, 0) - require.NoError(t, err) + for _, s := range []labels.Labels{s1, s2, s3, s4} { + _, err := app.Append(0, s, 0, 0) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + + // Add a first round of metadata to the first three series. + // Re-take the Appender, as the previous Commit will have it closed. + m1 := metadata.Metadata{Type: "gauge", Unit: "unit_1", Help: "help_1"} + m2 := metadata.Metadata{Type: "gauge", Unit: "unit_2", Help: "help_2"} + m3 := metadata.Metadata{Type: "gauge", Unit: "unit_3", Help: "help_3"} + m4 := metadata.Metadata{Type: "gauge", Unit: "unit_4", Help: "help_4"} + app = hb.Appender(ctx) + updateMetadata(t, app, s1, m1) + updateMetadata(t, app, s2, m2) + updateMetadata(t, app, s3, m3) + updateMetadata(t, app, s4, m4) + require.NoError(t, app.Commit()) + + // Update metadata for first series. + m5 := metadata.Metadata{Type: "counter", Unit: "unit_5", Help: "help_5"} + app = hb.Appender(ctx) + updateMetadata(t, app, s1, m5) + require.NoError(t, app.Commit()) + + // Switch back-and-forth metadata for second series. + // Since it ended on a new metadata record, we expect a single new entry. + m6 := metadata.Metadata{Type: "counter", Unit: "unit_6", Help: "help_6"} + + app = hb.Appender(ctx) + updateMetadata(t, app, s2, m6) + require.NoError(t, app.Commit()) + + app = hb.Appender(ctx) + updateMetadata(t, app, s2, m2) + require.NoError(t, app.Commit()) + + app = hb.Appender(ctx) + updateMetadata(t, app, s2, m6) + require.NoError(t, app.Commit()) + + app = hb.Appender(ctx) + updateMetadata(t, app, s2, m2) + require.NoError(t, app.Commit()) + + app = hb.Appender(ctx) + updateMetadata(t, app, s2, m6) + require.NoError(t, app.Commit()) + + // Let's create a checkpoint. 
+ first, last, err := wlog.Segments(w.Dir()) + require.NoError(t, err) + keep := func(id chunks.HeadSeriesRef) bool { + return id != 3 + } + _, err = wlog.Checkpoint(promslog.NewNopLogger(), w, first, last-1, keep, 0, enableSTStorage) + require.NoError(t, err) + + // Confirm there's been a checkpoint. + cdir, _, err := wlog.LastCheckpoint(w.Dir()) + require.NoError(t, err) + + // Read in checkpoint and WAL. + recs := readTestWAL(t, cdir) + var gotMetadataBlocks [][]record.RefMetadata + for _, rec := range recs { + if mr, ok := rec.([]record.RefMetadata); ok { + gotMetadataBlocks = append(gotMetadataBlocks, mr) + } + } + + // There should only be 1 metadata block present, with only the latest + // metadata kept around. + wantMetadata := []record.RefMetadata{ + {Ref: 1, Type: record.GetMetricType(m5.Type), Unit: m5.Unit, Help: m5.Help}, + {Ref: 2, Type: record.GetMetricType(m6.Type), Unit: m6.Unit, Help: m6.Help}, + {Ref: 4, Type: record.GetMetricType(m4.Type), Unit: m4.Unit, Help: m4.Help}, + } + require.Len(t, gotMetadataBlocks, 1) + require.Len(t, gotMetadataBlocks[0], 3) + gotMetadataBlock := gotMetadataBlocks[0] + + sort.Slice(gotMetadataBlock, func(i, j int) bool { return gotMetadataBlock[i].Ref < gotMetadataBlock[j].Ref }) + require.Equal(t, wantMetadata, gotMetadataBlock) + require.NoError(t, hb.Close()) + }) } - require.NoError(t, app.Commit()) - - // Add a first round of metadata to the first three series. - // Re-take the Appender, as the previous Commit will have it closed. 
- m1 := metadata.Metadata{Type: "gauge", Unit: "unit_1", Help: "help_1"} - m2 := metadata.Metadata{Type: "gauge", Unit: "unit_2", Help: "help_2"} - m3 := metadata.Metadata{Type: "gauge", Unit: "unit_3", Help: "help_3"} - m4 := metadata.Metadata{Type: "gauge", Unit: "unit_4", Help: "help_4"} - app = hb.Appender(ctx) - updateMetadata(t, app, s1, m1) - updateMetadata(t, app, s2, m2) - updateMetadata(t, app, s3, m3) - updateMetadata(t, app, s4, m4) - require.NoError(t, app.Commit()) - - // Update metadata for first series. - m5 := metadata.Metadata{Type: "counter", Unit: "unit_5", Help: "help_5"} - app = hb.Appender(ctx) - updateMetadata(t, app, s1, m5) - require.NoError(t, app.Commit()) - - // Switch back-and-forth metadata for second series. - // Since it ended on a new metadata record, we expect a single new entry. - m6 := metadata.Metadata{Type: "counter", Unit: "unit_6", Help: "help_6"} - - app = hb.Appender(ctx) - updateMetadata(t, app, s2, m6) - require.NoError(t, app.Commit()) - - app = hb.Appender(ctx) - updateMetadata(t, app, s2, m2) - require.NoError(t, app.Commit()) - - app = hb.Appender(ctx) - updateMetadata(t, app, s2, m6) - require.NoError(t, app.Commit()) - - app = hb.Appender(ctx) - updateMetadata(t, app, s2, m2) - require.NoError(t, app.Commit()) - - app = hb.Appender(ctx) - updateMetadata(t, app, s2, m6) - require.NoError(t, app.Commit()) - - // Let's create a checkpoint. - first, last, err := wlog.Segments(w.Dir()) - require.NoError(t, err) - keep := func(id chunks.HeadSeriesRef) bool { - return id != 3 - } - _, err = wlog.Checkpoint(promslog.NewNopLogger(), w, first, last-1, keep, 0) - require.NoError(t, err) - - // Confirm there's been a checkpoint. - cdir, _, err := wlog.LastCheckpoint(w.Dir()) - require.NoError(t, err) - - // Read in checkpoint and WAL. 
- recs := readTestWAL(t, cdir) - var gotMetadataBlocks [][]record.RefMetadata - for _, rec := range recs { - if mr, ok := rec.([]record.RefMetadata); ok { - gotMetadataBlocks = append(gotMetadataBlocks, mr) - } - } - - // There should only be 1 metadata block present, with only the latest - // metadata kept around. - wantMetadata := []record.RefMetadata{ - {Ref: 1, Type: record.GetMetricType(m5.Type), Unit: m5.Unit, Help: m5.Help}, - {Ref: 2, Type: record.GetMetricType(m6.Type), Unit: m6.Unit, Help: m6.Help}, - {Ref: 4, Type: record.GetMetricType(m4.Type), Unit: m4.Unit, Help: m4.Help}, - } - require.Len(t, gotMetadataBlocks, 1) - require.Len(t, gotMetadataBlocks[0], 3) - gotMetadataBlock := gotMetadataBlocks[0] - - sort.Slice(gotMetadataBlock, func(i, j int) bool { return gotMetadataBlock[i].Ref < gotMetadataBlock[j].Ref }) - require.Equal(t, wantMetadata, gotMetadataBlock) - require.NoError(t, hb.Close()) } func TestMetadataAssertInMemoryData(t *testing.T) { diff --git a/tsdb/docs/format/chunks.md b/tsdb/docs/format/chunks.md index a604c9ea55..32538d436b 100644 --- a/tsdb/docs/format/chunks.md +++ b/tsdb/docs/format/chunks.md @@ -65,6 +65,96 @@ Notes: * `padding` of 0 to 7 bits so that the whole chunk data is byte-aligned. * The chunk can have as few as one sample, i.e. `ts_1`, `v_1`, etc. are optional. +## XOR2 chunk data + +XOR2 uses the same structure as XOR for samples 0 and 1. Starting from sample 2, +a joint control prefix encodes both the timestamp delta-of-delta (dod) and whether +the value changed, with common dod cases byte-aligned for efficient writing. + +XOR2 can encode start timestamp (ST) as well optionally, see details further +down. 
+ + +``` +┌──────────────────────┬───────────────────┬───────────────┬───────────────┬────────────────┬─- +│ num_samples │ st_header | ts_0 │ v_0 │ ?st_0 | +└──────────────────────┴───────────────────┴───────────────┴───────────────┴────────────────┴─- + +-─────────────────────┬───────────────────────┬─────────────────────────┬─- + ts_1_delta │ v_1_xor │ ?st_1_delta | +-─────────────────────┴───────────────────────┴─────────────────────────┴─- + +-─────────────────────────┬───────────────────────┬─────┬─- + sample_2 │ ?st_2_dod | ... │ +-─────────────────────────┴───────────────────────┴─────┴─- + +-─────────────────────────┬───────────────────────┬──────────────────┐ + sample_n │ ?st_n_dod | padding │ +-─────────────────────────┴───────────────────────┴──────────────────┘ + +``` + +### Joint sample encoding for n >= 2 (``): + +Each sample starts with a variable-length control prefix that jointly encodes the +dod and value change status: + +| Control prefix | dod | Value encoding that follows | +|---|---|---| +| `0` | 0 | (none, value unchanged) | +| `10` | 0 | `` (value known non-zero and non-stale) | +| `110DDDDD` `DDDDDDDD` | 13-bit signed [-4096, 4095] | `` | +| `1110DDDD` `DDDDDDDD` `DDDDDDDD` | 20-bit signed [-524288, 524287] | `` | +| `11110` + 64-bit dod | exact | `` | +| `11111` | 0 | (none, stale NaN — no value field) | + +The `110` and `1110` cases pack the prefix and the most-significant dod bits into +the first byte, making the full dod field byte-aligned. + +### Value delta encoding (``): + +Used after the dod≠0 control prefixes. 
The XOR of the current and previous value is encoded as: + +| Prefix | Meaning | +|---|---| +| `0` | XOR = 0 (value unchanged) | +| `10` | Reuse previous leading/trailing window; `sigbits` value bits follow | +| `110` + leading(5) + sigbits(6) + value(sigbits) | New leading/trailing window | +| `111` | Stale NaN marker (3 bits) | + +### Value delta encoding, known non-zero (`<value_delta_known_nonzero>`): + +Used after the `10` control prefix (dod=0, value known to have changed and be non-stale). +The delta=0 check is skipped, saving one bit on the reuse path: + +| Prefix | Meaning | +|---|---| +| `0` | Reuse previous leading/trailing window; `sigbits` value bits follow | +| `1` + leading(5) + sigbits(6) + value(sigbits) | New leading/trailing window | + +### Start timestamp encoding + +* We use `st_i_dod` and `st_i` interchangeably when `i>1` in these notes. +* `st_header` is one byte: + ``` + ┌───────────────────────┬───────────────────────┐ + │ first_st_known<1 bit> | st_changed_on<7 bits> │ + └───────────────────────┴───────────────────────┘ + ``` + where the highest bit `first_st_known` indicates if `st_0` is present or not. + If the lower 7 bits `st_changed_on` is 0, no `st_i (i>0)` is present. + Otherwise `st_i (i>=st_changed_on)` is present, while each present + `st_i (i>1)` is encoded as a `varbit_ts` "delta of delta" from + `st_i-1` (or from 0 if `st_i-1` is not present). + ## Histogram chunk data ``` diff --git a/tsdb/head.go b/tsdb/head.go index 33a32cad8a..838b4bb699 100644 --- a/tsdb/head.go +++ b/tsdb/head.go @@ -161,6 +161,15 @@ type HeadOptions struct { OutOfOrderTimeWindow atomic.Int64 OutOfOrderCapMax atomic.Int64 + // EnableSTStorage determines whether databases (WAL/WBL, tsdb, + // agent) should set a Start Time value per sample. + // Represents 'st-storage' feature flag. + EnableSTStorage atomic.Bool + + // EnableXOR2Encoding enables XOR2 chunk encoding for float samples. + // Represents 'xor2-encoding' feature flag. 
+ EnableXOR2Encoding atomic.Bool + ChunkRange int64 // ChunkDirRoot is the parent directory of the chunks directory. ChunkDirRoot string @@ -1382,7 +1391,7 @@ func (h *Head) truncateWAL(mint int64) error { } h.metrics.checkpointCreationTotal.Inc() - if _, err = wlog.Checkpoint(h.logger, h.wal, first, last, h.keepSeriesInWALCheckpointFn(mint), mint); err != nil { + if _, err = wlog.Checkpoint(h.logger, h.wal, first, last, h.keepSeriesInWALCheckpointFn(mint), mint, h.opts.EnableSTStorage.Load()); err != nil { h.metrics.checkpointCreationFail.Inc() var cerr *chunks.CorruptionErr if errors.As(err, &cerr) { @@ -1676,7 +1685,7 @@ func (h *Head) Delete(ctx context.Context, mint, maxt int64, ms ...*labels.Match } if h.wal != nil { - var enc record.Encoder + enc := record.Encoder{EnableSTStorage: h.opts.EnableSTStorage.Load()} if err := h.wal.Log(enc.Tombstones(stones, nil)); err != nil { return err } diff --git a/tsdb/head_append.go b/tsdb/head_append.go index e6c9f2828a..c7143d8d96 100644 --- a/tsdb/head_append.go +++ b/tsdb/head_append.go @@ -185,6 +185,8 @@ func (h *Head) appender() *headAppender { typesInBatch: h.getTypeMap(), appendID: appendID, cleanupAppendIDsBelow: cleanupAppendIDsBelow, + storeST: h.opts.EnableSTStorage.Load(), + useXOR2: h.opts.EnableXOR2Encoding.Load(), }, } } @@ -412,6 +414,8 @@ type headAppenderBase struct { appendID, cleanupAppendIDsBelow uint64 closed bool + storeST bool + useXOR2 bool } type headAppender struct { headAppenderBase @@ -1059,7 +1063,7 @@ func (a *headAppenderBase) log() error { defer func() { a.head.putBytesBuffer(buf) }() var rec []byte - var enc record.Encoder + enc := record.Encoder{EnableSTStorage: a.storeST} if len(a.seriesRefs) > 0 { rec = enc.Series(a.seriesRefs, buf) @@ -1168,6 +1172,7 @@ type appenderCommitContext struct { histoOOBRejected int inOrderMint int64 inOrderMaxt int64 + appendChunkOpts chunkOpts oooMinT int64 oooMaxT int64 wblSamples []record.RefSample @@ -1177,8 +1182,7 @@ type appenderCommitContext struct 
{ oooMmapMarkersCount int oooRecords [][]byte oooCapMax int64 - appendChunkOpts chunkOpts - enc record.Encoder + oooEnc record.Encoder } // commitExemplars adds all exemplars from the provided batch to the head's exemplar storage. @@ -1228,31 +1232,31 @@ func (acc *appenderCommitContext) collectOOORecords(a *headAppenderBase) { }) } } - r := acc.enc.MmapMarkers(markers, a.head.getBytesBuffer()) + r := acc.oooEnc.MmapMarkers(markers, a.head.getBytesBuffer()) acc.oooRecords = append(acc.oooRecords, r) } if len(acc.wblSamples) > 0 { - r := acc.enc.Samples(acc.wblSamples, a.head.getBytesBuffer()) + r := acc.oooEnc.Samples(acc.wblSamples, a.head.getBytesBuffer()) acc.oooRecords = append(acc.oooRecords, r) } if len(acc.wblHistograms) > 0 { - r, customBucketsHistograms := acc.enc.HistogramSamples(acc.wblHistograms, a.head.getBytesBuffer()) + r, customBucketsHistograms := acc.oooEnc.HistogramSamples(acc.wblHistograms, a.head.getBytesBuffer()) if len(r) > 0 { acc.oooRecords = append(acc.oooRecords, r) } if len(customBucketsHistograms) > 0 { - r := acc.enc.CustomBucketsHistogramSamples(customBucketsHistograms, a.head.getBytesBuffer()) + r := acc.oooEnc.CustomBucketsHistogramSamples(customBucketsHistograms, a.head.getBytesBuffer()) acc.oooRecords = append(acc.oooRecords, r) } } if len(acc.wblFloatHistograms) > 0 { - r, customBucketsFloatHistograms := acc.enc.FloatHistogramSamples(acc.wblFloatHistograms, a.head.getBytesBuffer()) + r, customBucketsFloatHistograms := acc.oooEnc.FloatHistogramSamples(acc.wblFloatHistograms, a.head.getBytesBuffer()) if len(r) > 0 { acc.oooRecords = append(acc.oooRecords, r) } if len(customBucketsFloatHistograms) > 0 { - r := acc.enc.CustomBucketsFloatHistogramSamples(customBucketsFloatHistograms, a.head.getBytesBuffer()) + r := acc.oooEnc.CustomBucketsFloatHistogramSamples(customBucketsFloatHistograms, a.head.getBytesBuffer()) acc.oooRecords = append(acc.oooRecords, r) } } @@ -1387,7 +1391,7 @@ func (a *headAppenderBase) commitFloats(b 
*appendBatch, acc *appenderCommitConte // Sample is OOO and OOO handling is enabled // and the delta is within the OOO tolerance. var mmapRefs []chunks.ChunkDiskMapperRef - ok, chunkCreated, mmapRefs = series.insert(s.T, s.V, nil, nil, a.head.chunkDiskMapper, acc.oooCapMax, a.head.logger) + ok, chunkCreated, mmapRefs = series.insert(s.ST, s.T, s.V, nil, nil, acc.appendChunkOpts, acc.oooCapMax, a.head.logger) if chunkCreated { r, ok := acc.oooMmapMarkers[series.ref] if !ok || r != nil { @@ -1431,7 +1435,7 @@ func (a *headAppenderBase) commitFloats(b *appendBatch, acc *appenderCommitConte default: newlyStale := !value.IsStaleNaN(series.lastValue) && value.IsStaleNaN(s.V) staleToNonStale := value.IsStaleNaN(series.lastValue) && !value.IsStaleNaN(s.V) - ok, chunkCreated = series.append(s.T, s.V, a.appendID, acc.appendChunkOpts) + ok, chunkCreated = series.append(s.ST, s.T, s.V, a.appendID, acc.appendChunkOpts) if ok { if s.T < acc.inOrderMint { acc.inOrderMint = s.T @@ -1492,7 +1496,8 @@ func (a *headAppenderBase) commitHistograms(b *appendBatch, acc *appenderCommitC // Sample is OOO and OOO handling is enabled // and the delta is within the OOO tolerance. var mmapRefs []chunks.ChunkDiskMapperRef - ok, chunkCreated, mmapRefs = series.insert(s.T, 0, s.H, nil, a.head.chunkDiskMapper, acc.oooCapMax, a.head.logger) + // TODO(krajorama,ywwg): Pass ST when available in WAL. 
+ ok, chunkCreated, mmapRefs = series.insert(0, s.T, 0, s.H, nil, acc.appendChunkOpts, acc.oooCapMax, a.head.logger) if chunkCreated { r, ok := acc.oooMmapMarkers[series.ref] if !ok || r != nil { @@ -1540,7 +1545,8 @@ func (a *headAppenderBase) commitHistograms(b *appendBatch, acc *appenderCommitC newlyStale = newlyStale && !value.IsStaleNaN(series.lastHistogramValue.Sum) staleToNonStale = value.IsStaleNaN(series.lastHistogramValue.Sum) && !value.IsStaleNaN(s.H.Sum) } - ok, chunkCreated = series.appendHistogram(s.T, s.H, a.appendID, acc.appendChunkOpts) + // TODO(krajorama,ywwg): pass ST when available in WAL. + ok, chunkCreated = series.appendHistogram(0, s.T, s.H, a.appendID, acc.appendChunkOpts) if ok { if s.T < acc.inOrderMint { acc.inOrderMint = s.T @@ -1601,7 +1607,8 @@ func (a *headAppenderBase) commitFloatHistograms(b *appendBatch, acc *appenderCo // Sample is OOO and OOO handling is enabled // and the delta is within the OOO tolerance. var mmapRefs []chunks.ChunkDiskMapperRef - ok, chunkCreated, mmapRefs = series.insert(s.T, 0, nil, s.FH, a.head.chunkDiskMapper, acc.oooCapMax, a.head.logger) + // TODO(krajorama,ywwg): Pass ST when available in WAL. + ok, chunkCreated, mmapRefs = series.insert(0, s.T, 0, nil, s.FH, acc.appendChunkOpts, acc.oooCapMax, a.head.logger) if chunkCreated { r, ok := acc.oooMmapMarkers[series.ref] if !ok || r != nil { @@ -1649,7 +1656,8 @@ func (a *headAppenderBase) commitFloatHistograms(b *appendBatch, acc *appenderCo newlyStale = newlyStale && !value.IsStaleNaN(series.lastFloatHistogramValue.Sum) staleToNonStale = value.IsStaleNaN(series.lastFloatHistogramValue.Sum) && !value.IsStaleNaN(s.FH.Sum) } - ok, chunkCreated = series.appendFloatHistogram(s.T, s.FH, a.appendID, acc.appendChunkOpts) + // TODO(krajorama,ywwg): pass ST when available in WAL. 
+ ok, chunkCreated = series.appendFloatHistogram(0, s.T, s.FH, a.appendID, acc.appendChunkOpts) if ok { if s.T < acc.inOrderMint { acc.inOrderMint = s.T @@ -1741,6 +1749,10 @@ func (a *headAppenderBase) Commit() (err error) { chunkDiskMapper: h.chunkDiskMapper, chunkRange: h.chunkRange.Load(), samplesPerChunk: h.opts.SamplesPerChunk, + useXOR2: a.useXOR2, + }, + oooEnc: record.Encoder{ + EnableSTStorage: a.storeST, }, } @@ -1796,18 +1808,18 @@ func (a *headAppenderBase) Commit() (err error) { } // insert is like append, except it inserts. Used for OOO samples. -func (s *memSeries) insert(t int64, v float64, h *histogram.Histogram, fh *histogram.FloatHistogram, chunkDiskMapper *chunks.ChunkDiskMapper, oooCapMax int64, logger *slog.Logger) (inserted, chunkCreated bool, mmapRefs []chunks.ChunkDiskMapperRef) { +func (s *memSeries) insert(st, t int64, v float64, h *histogram.Histogram, fh *histogram.FloatHistogram, o chunkOpts, oooCapMax int64, logger *slog.Logger) (inserted, chunkCreated bool, mmapRefs []chunks.ChunkDiskMapperRef) { if s.ooo == nil { s.ooo = &memSeriesOOOFields{} } c := s.ooo.oooHeadChunk if c == nil || c.chunk.NumSamples() == int(oooCapMax) { // Note: If no new samples come in then we rely on compaction to clean up stale in-memory OOO chunks. - c, mmapRefs = s.cutNewOOOHeadChunk(t, chunkDiskMapper, logger) + c, mmapRefs = s.cutNewOOOHeadChunk(t, o, logger) chunkCreated = true } - ok := c.chunk.Insert(t, v, h, fh) + ok := c.chunk.Insert(st, t, v, h, fh) if ok { if chunkCreated || t < c.minTime { c.minTime = t @@ -1824,19 +1836,19 @@ type chunkOpts struct { chunkDiskMapper *chunks.ChunkDiskMapper chunkRange int64 samplesPerChunk int + useXOR2 bool // Selects XOR2 encoding for float chunks. } // append adds the sample (t, v) to the series. The caller also has to provide // the appendID for isolation. (The appendID can be zero, which results in no // isolation for this append.) // Series lock must be held when calling. 
-func (s *memSeries) append(t int64, v float64, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) { - c, sampleInOrder, chunkCreated := s.appendPreprocessor(t, chunkenc.EncXOR, o) +func (s *memSeries) append(st, t int64, v float64, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) { + c, sampleInOrder, chunkCreated := s.appendPreprocessor(t, chunkenc.ValFloat.ChunkEncoding(o.useXOR2), o) if !sampleInOrder { return sampleInOrder, chunkCreated } - // TODO(krajorama): pass ST. - s.app.Append(0, t, v) + s.app.Append(st, t, v) c.maxTime = t @@ -1856,14 +1868,14 @@ func (s *memSeries) append(t int64, v float64, appendID uint64, o chunkOpts) (sa // In case of recoding the existing chunk, a new chunk is allocated and the old chunk is dropped. // To keep the meaning of prometheus_tsdb_head_chunks and prometheus_tsdb_head_chunks_created_total // consistent, we return chunkCreated=false in this case. -func (s *memSeries) appendHistogram(t int64, h *histogram.Histogram, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) { +func (s *memSeries) appendHistogram(st, t int64, h *histogram.Histogram, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) { // Head controls the execution of recoding, so that we own the proper // chunk reference afterwards and mmap used up chunks. // Ignoring ok is ok, since we don't want to compare to the wrong previous appender anyway. prevApp, _ := s.app.(*chunkenc.HistogramAppender) - c, sampleInOrder, chunkCreated := s.histogramsAppendPreprocessor(t, chunkenc.EncHistogram, o) + c, sampleInOrder, chunkCreated := s.histogramsAppendPreprocessor(t, chunkenc.ValHistogram.ChunkEncoding(o.useXOR2), o) if !sampleInOrder { return sampleInOrder, chunkCreated } @@ -1878,8 +1890,7 @@ func (s *memSeries) appendHistogram(t int64, h *histogram.Histogram, appendID ui prevApp = nil } - // TODO(krajorama): pass ST. 
- newChunk, recoded, s.app, _ = s.app.AppendHistogram(prevApp, 0, t, h, false) // false=request a new chunk if needed + newChunk, recoded, s.app, _ = s.app.AppendHistogram(prevApp, st, t, h, false) // false=request a new chunk if needed s.lastHistogramValue = h s.lastFloatHistogramValue = nil @@ -1914,14 +1925,14 @@ func (s *memSeries) appendHistogram(t int64, h *histogram.Histogram, appendID ui // In case of recoding the existing chunk, a new chunk is allocated and the old chunk is dropped. // To keep the meaning of prometheus_tsdb_head_chunks and prometheus_tsdb_head_chunks_created_total // consistent, we return chunkCreated=false in this case. -func (s *memSeries) appendFloatHistogram(t int64, fh *histogram.FloatHistogram, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) { +func (s *memSeries) appendFloatHistogram(st, t int64, fh *histogram.FloatHistogram, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) { // Head controls the execution of recoding, so that we own the proper // chunk reference afterwards and mmap used up chunks. // Ignoring ok is ok, since we don't want to compare to the wrong previous appender anyway. prevApp, _ := s.app.(*chunkenc.FloatHistogramAppender) - c, sampleInOrder, chunkCreated := s.histogramsAppendPreprocessor(t, chunkenc.EncFloatHistogram, o) + c, sampleInOrder, chunkCreated := s.histogramsAppendPreprocessor(t, chunkenc.ValFloatHistogram.ChunkEncoding(o.useXOR2), o) if !sampleInOrder { return sampleInOrder, chunkCreated } @@ -1936,8 +1947,7 @@ func (s *memSeries) appendFloatHistogram(t int64, fh *histogram.FloatHistogram, prevApp = nil } - // TODO(krajorama): pass ST. - newChunk, recoded, s.app, _ = s.app.AppendFloatHistogram(prevApp, 0, t, fh, false) // False means request a new chunk if needed. + newChunk, recoded, s.app, _ = s.app.AppendFloatHistogram(prevApp, st, t, fh, false) // False means request a new chunk if needed. 
s.lastHistogramValue = nil s.lastFloatHistogramValue = fh @@ -2161,8 +2171,8 @@ func (s *memSeries) cutNewHeadChunk(mint int64, e chunkenc.Encoding, chunkRange // cutNewOOOHeadChunk cuts a new OOO chunk and m-maps the old chunk. // The caller must ensure that s is locked and s.ooo is not nil. -func (s *memSeries) cutNewOOOHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDiskMapper, logger *slog.Logger) (*oooHeadChunk, []chunks.ChunkDiskMapperRef) { - ref := s.mmapCurrentOOOHeadChunk(chunkDiskMapper, logger) +func (s *memSeries) cutNewOOOHeadChunk(mint int64, o chunkOpts, logger *slog.Logger) (*oooHeadChunk, []chunks.ChunkDiskMapperRef) { + ref := s.mmapCurrentOOOHeadChunk(o, logger) s.ooo.oooHeadChunk = &oooHeadChunk{ chunk: NewOOOChunk(), @@ -2174,12 +2184,12 @@ func (s *memSeries) cutNewOOOHeadChunk(mint int64, chunkDiskMapper *chunks.Chunk } // s must be locked when calling. -func (s *memSeries) mmapCurrentOOOHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper, logger *slog.Logger) []chunks.ChunkDiskMapperRef { +func (s *memSeries) mmapCurrentOOOHeadChunk(o chunkOpts, logger *slog.Logger) []chunks.ChunkDiskMapperRef { if s.ooo == nil || s.ooo.oooHeadChunk == nil { // OOO is not enabled or there is no head chunk, so nothing to m-map here. 
return nil } - chks, err := s.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64) + chks, err := s.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, o.useXOR2) if err != nil { handleChunkWriteError(err) return nil @@ -2190,7 +2200,7 @@ func (s *memSeries) mmapCurrentOOOHeadChunk(chunkDiskMapper *chunks.ChunkDiskMap logger.Error("Too many OOO chunks, dropping data", "series", s.lset.String()) break } - chunkRef := chunkDiskMapper.WriteChunk(s.ref, memchunk.minTime, memchunk.maxTime, memchunk.chunk, true, handleChunkWriteError) + chunkRef := o.chunkDiskMapper.WriteChunk(s.ref, memchunk.minTime, memchunk.maxTime, memchunk.chunk, true, handleChunkWriteError) chunkRefs = append(chunkRefs, chunkRef) s.ooo.oooMmappedChunks = append(s.ooo.oooMmappedChunks, &mmappedChunk{ ref: chunkRef, diff --git a/tsdb/head_append_v2.go b/tsdb/head_append_v2.go index 87b62df536..29e19f4265 100644 --- a/tsdb/head_append_v2.go +++ b/tsdb/head_append_v2.go @@ -95,6 +95,8 @@ func (h *Head) appenderV2() *headAppenderV2 { typesInBatch: h.getTypeMap(), appendID: appendID, cleanupAppendIDsBelow: cleanupAppendIDsBelow, + storeST: h.opts.EnableSTStorage.Load(), + useXOR2: h.opts.EnableXOR2Encoding.Load(), }, } } @@ -140,7 +142,6 @@ func (a *headAppenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t i } } - // TODO(bwplotka): Handle ST natively (as per PROM-60). if a.head.opts.EnableSTAsZeroSample && st != 0 { a.bestEffortAppendSTZeroSample(s, ls, st, t, h, fh) } @@ -177,7 +178,7 @@ func (a *headAppenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t i // we do not need to check for the difference between "unknown // series" and "known series with stNone". } - appErr = a.appendFloat(s, t, v, opts.RejectOutOfOrder) + appErr = a.appendFloat(s, st, t, v, opts.RejectOutOfOrder) } // Handle append error, if any. 
if appErr != nil { @@ -218,7 +219,7 @@ func (a *headAppenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t i return storage.SeriesRef(s.ref), partialErr } -func (a *headAppenderV2) appendFloat(s *memSeries, t int64, v float64, fastRejectOOO bool) error { +func (a *headAppenderV2) appendFloat(s *memSeries, st, t int64, v float64, fastRejectOOO bool) error { s.Lock() // TODO(codesome): If we definitely know at this point that the sample is ooo, then optimise // to skip that sample from the WAL and write only in the WBL. @@ -239,7 +240,7 @@ func (a *headAppenderV2) appendFloat(s *memSeries, t int64, v float64, fastRejec } b := a.getCurrentBatch(stFloat, s.ref) - b.floats = append(b.floats, record.RefSample{Ref: s.ref, T: t, V: v}) + b.floats = append(b.floats, record.RefSample{Ref: s.ref, ST: st, T: t, V: v}) b.floatSeries = append(b.floatSeries, s) return nil } @@ -366,7 +367,7 @@ func (a *headAppenderV2) bestEffortAppendSTZeroSample(s *memSeries, ls labels.La } err = a.appendHistogram(s, st, zeroHistogram, true) default: - err = a.appendFloat(s, st, 0, true) + err = a.appendFloat(s, 0, st, 0, true) } if err != nil { diff --git a/tsdb/head_append_v2_test.go b/tsdb/head_append_v2_test.go index 61b2eecf4e..9464a9ef66 100644 --- a/tsdb/head_append_v2_test.go +++ b/tsdb/head_append_v2_test.go @@ -1865,296 +1865,300 @@ func TestHistogramInWALAndMmapChunk_AppenderV2(t *testing.T) { } func TestChunkSnapshot_AppenderV2(t *testing.T) { - head, _ := newTestHead(t, 120*4, compression.None, false) - defer func() { - head.opts.EnableMemorySnapshotOnShutdown = false - require.NoError(t, head.Close()) - }() + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + head, _ := newTestHead(t, 120*4, compression.None, false) + defer func() { + head.opts.EnableMemorySnapshotOnShutdown = false + require.NoError(t, head.Close()) + }() - type ex struct { - seriesLabels labels.Labels - e 
exemplar.Exemplar - } - - numSeries := 10 - expSeries := make(map[string][]chunks.Sample) - expHist := make(map[string][]chunks.Sample) - expFloatHist := make(map[string][]chunks.Sample) - expTombstones := make(map[storage.SeriesRef]tombstones.Intervals) - expExemplars := make([]ex, 0) - histograms := tsdbutil.GenerateTestGaugeHistograms(481) - floatHistogram := tsdbutil.GenerateTestGaugeFloatHistograms(481) - - newExemplar := func(lbls labels.Labels, ts int64) exemplar.Exemplar { - e := ex{ - seriesLabels: lbls, - e: exemplar.Exemplar{ - Labels: labels.FromStrings("trace_id", strconv.Itoa(rand.Int())), - Value: rand.Float64(), - Ts: ts, - }, - } - expExemplars = append(expExemplars, e) - return e.e - } - - checkSamples := func() { - q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) - require.Equal(t, expSeries, series) - } - checkHistograms := func() { - q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "hist", "baz.*")) - require.Equal(t, expHist, series) - } - checkFloatHistograms := func() { - q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "floathist", "bat.*")) - require.Equal(t, expFloatHist, series) - } - checkTombstones := func() { - tr, err := head.Tombstones() - require.NoError(t, err) - actTombstones := make(map[storage.SeriesRef]tombstones.Intervals) - require.NoError(t, tr.Iter(func(ref storage.SeriesRef, itvs tombstones.Intervals) error { - for _, itv := range itvs { - actTombstones[ref].Add(itv) + type ex struct { + seriesLabels labels.Labels + e exemplar.Exemplar + } + + numSeries := 10 + expSeries := make(map[string][]chunks.Sample) + expHist := make(map[string][]chunks.Sample) + expFloatHist := 
make(map[string][]chunks.Sample) + expTombstones := make(map[storage.SeriesRef]tombstones.Intervals) + expExemplars := make([]ex, 0) + histograms := tsdbutil.GenerateTestGaugeHistograms(481) + floatHistogram := tsdbutil.GenerateTestGaugeFloatHistograms(481) + + newExemplar := func(lbls labels.Labels, ts int64) exemplar.Exemplar { + e := ex{ + seriesLabels: lbls, + e: exemplar.Exemplar{ + Labels: labels.FromStrings("trace_id", strconv.Itoa(rand.Int())), + Value: rand.Float64(), + Ts: ts, + }, + } + expExemplars = append(expExemplars, e) + return e.e + } + + checkSamples := func() { + q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) + require.Equal(t, expSeries, series) + } + checkHistograms := func() { + q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "hist", "baz.*")) + require.Equal(t, expHist, series) + } + checkFloatHistograms := func() { + q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "floathist", "bat.*")) + require.Equal(t, expFloatHist, series) + } + checkTombstones := func() { + tr, err := head.Tombstones() + require.NoError(t, err) + actTombstones := make(map[storage.SeriesRef]tombstones.Intervals) + require.NoError(t, tr.Iter(func(ref storage.SeriesRef, itvs tombstones.Intervals) error { + for _, itv := range itvs { + actTombstones[ref].Add(itv) + } + return nil + })) + require.Equal(t, expTombstones, actTombstones) + } + checkExemplars := func() { + actExemplars := make([]ex, 0, len(expExemplars)) + err := head.exemplars.IterateExemplars(func(seriesLabels labels.Labels, e exemplar.Exemplar) error { + actExemplars = append(actExemplars, ex{ + seriesLabels: seriesLabels, + e: e, + }) + return nil + }) + require.NoError(t, err) 
+ // Verifies both existence of right exemplars and order of exemplars in the buffer. + testutil.RequireEqualWithOptions(t, expExemplars, actExemplars, []cmp.Option{cmp.AllowUnexported(ex{})}) + } + + var ( + wlast, woffset int + err error + ) + + closeHeadAndCheckSnapshot := func() { + require.NoError(t, head.Close()) + + _, sidx, soffset, err := LastChunkSnapshot(head.opts.ChunkDirRoot) + require.NoError(t, err) + require.Equal(t, wlast, sidx) + require.Equal(t, woffset, soffset) + } + + openHeadAndCheckReplay := func() { + w, err := wlog.NewSize(nil, nil, head.wal.Dir(), 32768, compression.None) + require.NoError(t, err) + head, err = NewHead(nil, nil, w, nil, head.opts, nil) + require.NoError(t, err) + require.NoError(t, head.Init(math.MinInt64)) + + checkSamples() + checkHistograms() + checkFloatHistograms() + checkTombstones() + checkExemplars() + } + + { // Initial data that goes into snapshot. + // Add some initial samples with >=1 m-map chunk. + app := head.AppenderV2(context.Background()) + for i := 1; i <= numSeries; i++ { + lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) + lblStr := lbls.String() + lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) + lblsHistStr := lblsHist.String() + lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) + lblsFloatHistStr := lblsFloatHist.String() + + // 240 samples should m-map at least 1 chunk. + for ts := int64(1); ts <= 240; ts++ { + // Add an exemplar, but only to float sample. 
+ aOpts := storage.AOptions{} + if ts%10 == 0 { + aOpts.Exemplars = []exemplar.Exemplar{newExemplar(lbls, ts)} + } + val := rand.Float64() + expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) + _, err := app.Append(0, lbls, 0, ts, val, nil, nil, aOpts) + require.NoError(t, err) + + hist := histograms[int(ts)] + expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) + _, err = app.Append(0, lblsHist, 0, ts, 0, hist, nil, storage.AOptions{}) + require.NoError(t, err) + + floatHist := floatHistogram[int(ts)] + expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) + _, err = app.Append(0, lblsFloatHist, 0, ts, 0, nil, floatHist, storage.AOptions{}) + require.NoError(t, err) + + // Create multiple WAL records (commit). + if ts%10 == 0 { + require.NoError(t, app.Commit()) + app = head.AppenderV2(context.Background()) + } + } + } + require.NoError(t, app.Commit()) + + // Add some tombstones. + enc := record.Encoder{EnableSTStorage: enableSTStorage} + for i := 1; i <= numSeries; i++ { + ref := storage.SeriesRef(i) + itvs := tombstones.Intervals{ + {Mint: 1234, Maxt: 2345}, + {Mint: 3456, Maxt: 4567}, + } + for _, itv := range itvs { + expTombstones[ref].Add(itv) + } + head.tombstones.AddInterval(ref, itvs...) + err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ + {Ref: ref, Intervals: itvs}, + }, nil)) + require.NoError(t, err) + } + } + + // These references should be the ones used for the snapshot. + wlast, woffset, err = head.wal.LastSegmentAndOffset() + require.NoError(t, err) + if woffset != 0 && woffset < 32*1024 { + // The page is always filled before taking the snapshot. + woffset = 32 * 1024 + } + + { + // Creating snapshot and verifying it. + head.opts.EnableMemorySnapshotOnShutdown = true + closeHeadAndCheckSnapshot() // This will create a snapshot. + + // Test the replay of snapshot. 
+ openHeadAndCheckReplay() + } + + { // Additional data to only include in WAL and m-mapped chunks and not snapshot. This mimics having an old snapshot on disk. + // Add more samples. + app := head.AppenderV2(context.Background()) + for i := 1; i <= numSeries; i++ { + lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) + lblStr := lbls.String() + lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) + lblsHistStr := lblsHist.String() + lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) + lblsFloatHistStr := lblsFloatHist.String() + + // 240 samples should m-map at least 1 chunk. + for ts := int64(241); ts <= 480; ts++ { + // Add an exemplar, but only to float sample. + aOpts := storage.AOptions{} + if ts%10 == 0 { + aOpts.Exemplars = []exemplar.Exemplar{newExemplar(lbls, ts)} + } + val := rand.Float64() + expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) + _, err := app.Append(0, lbls, 0, ts, val, nil, nil, aOpts) + require.NoError(t, err) + + hist := histograms[int(ts)] + expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) + _, err = app.Append(0, lblsHist, 0, ts, 0, hist, nil, storage.AOptions{}) + require.NoError(t, err) + + floatHist := floatHistogram[int(ts)] + expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) + _, err = app.Append(0, lblsFloatHist, 0, ts, 0, nil, floatHist, storage.AOptions{}) + require.NoError(t, err) + + // Create multiple WAL records (commit). + if ts%10 == 0 { + require.NoError(t, app.Commit()) + app = head.AppenderV2(context.Background()) + } + } + } + require.NoError(t, app.Commit()) + + // Add more tombstones. 
+ enc := record.Encoder{EnableSTStorage: enableSTStorage} + for i := 1; i <= numSeries; i++ { + ref := storage.SeriesRef(i) + itvs := tombstones.Intervals{ + {Mint: 12345, Maxt: 23456}, + {Mint: 34567, Maxt: 45678}, + } + for _, itv := range itvs { + expTombstones[ref].Add(itv) + } + head.tombstones.AddInterval(ref, itvs...) + err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ + {Ref: ref, Intervals: itvs}, + }, nil)) + require.NoError(t, err) + } + } + { + // Close Head and verify that new snapshot was not created. + head.opts.EnableMemorySnapshotOnShutdown = false + closeHeadAndCheckSnapshot() // This should not create a snapshot. + + // Test the replay of snapshot, m-map chunks, and WAL. + head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. + openHeadAndCheckReplay() + } + + // Creating another snapshot should delete the older snapshot and replay still works fine. + wlast, woffset, err = head.wal.LastSegmentAndOffset() + require.NoError(t, err) + if woffset != 0 && woffset < 32*1024 { + // The page is always filled before taking the snapshot. + woffset = 32 * 1024 + } + + { + // Close Head and verify that new snapshot was created. + closeHeadAndCheckSnapshot() + + // Verify that there is only 1 snapshot. + files, err := os.ReadDir(head.opts.ChunkDirRoot) + require.NoError(t, err) + snapshots := 0 + for i := len(files) - 1; i >= 0; i-- { + fi := files[i] + if strings.HasPrefix(fi.Name(), chunkSnapshotPrefix) { + snapshots++ + require.Equal(t, chunkSnapshotDir(wlast, woffset), fi.Name()) + } + } + require.Equal(t, 1, snapshots) + + // Test the replay of snapshot. + head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. + + // Disabling exemplars to check that it does not hard fail replay + // https://github.com/prometheus/prometheus/issues/9437#issuecomment-933285870. 
+ head.opts.EnableExemplarStorage = false + head.opts.MaxExemplars.Store(0) + expExemplars = expExemplars[:0] + + openHeadAndCheckReplay() + + require.Equal(t, 0.0, prom_testutil.ToFloat64(head.metrics.snapshotReplayErrorTotal)) } - return nil - })) - require.Equal(t, expTombstones, actTombstones) - } - checkExemplars := func() { - actExemplars := make([]ex, 0, len(expExemplars)) - err := head.exemplars.IterateExemplars(func(seriesLabels labels.Labels, e exemplar.Exemplar) error { - actExemplars = append(actExemplars, ex{ - seriesLabels: seriesLabels, - e: e, - }) - return nil }) - require.NoError(t, err) - // Verifies both existence of right exemplars and order of exemplars in the buffer. - testutil.RequireEqualWithOptions(t, expExemplars, actExemplars, []cmp.Option{cmp.AllowUnexported(ex{})}) - } - - var ( - wlast, woffset int - err error - ) - - closeHeadAndCheckSnapshot := func() { - require.NoError(t, head.Close()) - - _, sidx, soffset, err := LastChunkSnapshot(head.opts.ChunkDirRoot) - require.NoError(t, err) - require.Equal(t, wlast, sidx) - require.Equal(t, woffset, soffset) - } - - openHeadAndCheckReplay := func() { - w, err := wlog.NewSize(nil, nil, head.wal.Dir(), 32768, compression.None) - require.NoError(t, err) - head, err = NewHead(nil, nil, w, nil, head.opts, nil) - require.NoError(t, err) - require.NoError(t, head.Init(math.MinInt64)) - - checkSamples() - checkHistograms() - checkFloatHistograms() - checkTombstones() - checkExemplars() - } - - { // Initial data that goes into snapshot. - // Add some initial samples with >=1 m-map chunk. 
- app := head.AppenderV2(context.Background()) - for i := 1; i <= numSeries; i++ { - lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) - lblStr := lbls.String() - lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) - lblsHistStr := lblsHist.String() - lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) - lblsFloatHistStr := lblsFloatHist.String() - - // 240 samples should m-map at least 1 chunk. - for ts := int64(1); ts <= 240; ts++ { - // Add an exemplar, but only to float sample. - aOpts := storage.AOptions{} - if ts%10 == 0 { - aOpts.Exemplars = []exemplar.Exemplar{newExemplar(lbls, ts)} - } - val := rand.Float64() - expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) - _, err := app.Append(0, lbls, 0, ts, val, nil, nil, aOpts) - require.NoError(t, err) - - hist := histograms[int(ts)] - expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) - _, err = app.Append(0, lblsHist, 0, ts, 0, hist, nil, storage.AOptions{}) - require.NoError(t, err) - - floatHist := floatHistogram[int(ts)] - expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) - _, err = app.Append(0, lblsFloatHist, 0, ts, 0, nil, floatHist, storage.AOptions{}) - require.NoError(t, err) - - // Create multiple WAL records (commit). - if ts%10 == 0 { - require.NoError(t, app.Commit()) - app = head.AppenderV2(context.Background()) - } - } - } - require.NoError(t, app.Commit()) - - // Add some tombstones. - var enc record.Encoder - for i := 1; i <= numSeries; i++ { - ref := storage.SeriesRef(i) - itvs := tombstones.Intervals{ - {Mint: 1234, Maxt: 2345}, - {Mint: 3456, Maxt: 4567}, - } - for _, itv := range itvs { - expTombstones[ref].Add(itv) - } - head.tombstones.AddInterval(ref, itvs...) 
- err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ - {Ref: ref, Intervals: itvs}, - }, nil)) - require.NoError(t, err) - } - } - - // These references should be the ones used for the snapshot. - wlast, woffset, err = head.wal.LastSegmentAndOffset() - require.NoError(t, err) - if woffset != 0 && woffset < 32*1024 { - // The page is always filled before taking the snapshot. - woffset = 32 * 1024 - } - - { - // Creating snapshot and verifying it. - head.opts.EnableMemorySnapshotOnShutdown = true - closeHeadAndCheckSnapshot() // This will create a snapshot. - - // Test the replay of snapshot. - openHeadAndCheckReplay() - } - - { // Additional data to only include in WAL and m-mapped chunks and not snapshot. This mimics having an old snapshot on disk. - // Add more samples. - app := head.AppenderV2(context.Background()) - for i := 1; i <= numSeries; i++ { - lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) - lblStr := lbls.String() - lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) - lblsHistStr := lblsHist.String() - lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) - lblsFloatHistStr := lblsFloatHist.String() - - // 240 samples should m-map at least 1 chunk. - for ts := int64(241); ts <= 480; ts++ { - // Add an exemplar, but only to float sample. 
- aOpts := storage.AOptions{} - if ts%10 == 0 { - aOpts.Exemplars = []exemplar.Exemplar{newExemplar(lbls, ts)} - } - val := rand.Float64() - expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) - _, err := app.Append(0, lbls, 0, ts, val, nil, nil, aOpts) - require.NoError(t, err) - - hist := histograms[int(ts)] - expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) - _, err = app.Append(0, lblsHist, 0, ts, 0, hist, nil, storage.AOptions{}) - require.NoError(t, err) - - floatHist := floatHistogram[int(ts)] - expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) - _, err = app.Append(0, lblsFloatHist, 0, ts, 0, nil, floatHist, storage.AOptions{}) - require.NoError(t, err) - - // Create multiple WAL records (commit). - if ts%10 == 0 { - require.NoError(t, app.Commit()) - app = head.AppenderV2(context.Background()) - } - } - } - require.NoError(t, app.Commit()) - - // Add more tombstones. - var enc record.Encoder - for i := 1; i <= numSeries; i++ { - ref := storage.SeriesRef(i) - itvs := tombstones.Intervals{ - {Mint: 12345, Maxt: 23456}, - {Mint: 34567, Maxt: 45678}, - } - for _, itv := range itvs { - expTombstones[ref].Add(itv) - } - head.tombstones.AddInterval(ref, itvs...) - err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ - {Ref: ref, Intervals: itvs}, - }, nil)) - require.NoError(t, err) - } - } - { - // Close Head and verify that new snapshot was not created. - head.opts.EnableMemorySnapshotOnShutdown = false - closeHeadAndCheckSnapshot() // This should not create a snapshot. - - // Test the replay of snapshot, m-map chunks, and WAL. - head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. - openHeadAndCheckReplay() - } - - // Creating another snapshot should delete the older snapshot and replay still works fine. 
- wlast, woffset, err = head.wal.LastSegmentAndOffset() - require.NoError(t, err) - if woffset != 0 && woffset < 32*1024 { - // The page is always filled before taking the snapshot. - woffset = 32 * 1024 - } - - { - // Close Head and verify that new snapshot was created. - closeHeadAndCheckSnapshot() - - // Verify that there is only 1 snapshot. - files, err := os.ReadDir(head.opts.ChunkDirRoot) - require.NoError(t, err) - snapshots := 0 - for i := len(files) - 1; i >= 0; i-- { - fi := files[i] - if strings.HasPrefix(fi.Name(), chunkSnapshotPrefix) { - snapshots++ - require.Equal(t, chunkSnapshotDir(wlast, woffset), fi.Name()) - } - } - require.Equal(t, 1, snapshots) - - // Test the replay of snapshot. - head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. - - // Disabling exemplars to check that it does not hard fail replay - // https://github.com/prometheus/prometheus/issues/9437#issuecomment-933285870. - head.opts.EnableExemplarStorage = false - head.opts.MaxExemplars.Store(0) - expExemplars = expExemplars[:0] - - openHeadAndCheckReplay() - - require.Equal(t, 0.0, prom_testutil.ToFloat64(head.metrics.snapshotReplayErrorTotal)) } } @@ -2919,13 +2923,15 @@ func TestChunkSnapshotTakenAfterIncompleteSnapshot_AppenderV2(t *testing.T) { // TestWBLReplay checks the replay at a low level. 
func TestWBLReplay_AppenderV2(t *testing.T) { for name, scenario := range sampleTypeScenarios { - t.Run(name, func(t *testing.T) { - testWBLReplayAppenderV2(t, scenario) - }) + for _, enableSTstorage := range []bool{false, true} { + t.Run(fmt.Sprintf("%s/st-storage=%v", name, enableSTstorage), func(t *testing.T) { + testWBLReplayAppenderV2(t, scenario, enableSTstorage) + }) + } } } -func testWBLReplayAppenderV2(t *testing.T, scenario sampleTypeScenario) { +func testWBLReplayAppenderV2(t *testing.T, scenario sampleTypeScenario, enableSTstorage bool) { dir := t.TempDir() wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compression.Snappy) require.NoError(t, err) @@ -2936,6 +2942,8 @@ func testWBLReplayAppenderV2(t *testing.T, scenario sampleTypeScenario) { opts.ChunkRange = 1000 opts.ChunkDirRoot = dir opts.OutOfOrderTimeWindow.Store(30 * time.Minute.Milliseconds()) + opts.EnableSTStorage.Store(enableSTstorage) + opts.EnableXOR2Encoding.Store(enableSTstorage) h, err := NewHead(nil, nil, wal, oooWlog, opts, nil) require.NoError(t, err) @@ -2987,7 +2995,7 @@ func testWBLReplayAppenderV2(t *testing.T, scenario sampleTypeScenario) { require.False(t, ok) require.NotNil(t, ms) - chks, err := ms.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64) + chks, err := ms.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, h.opts.EnableXOR2Encoding.Load()) require.NoError(t, err) require.Len(t, chks, 1) @@ -4748,3 +4756,135 @@ func TestHeadAppenderV2_Append_HistogramStalenessConversionMetrics(t *testing.T) }) } } + +// TestHeadAppender_STStorage verifies that when EnableSTStorage is true, +// start timestamps are properly stored in chunks and returned by queries. +// This test uses AppenderV2 which has native ST support. 
+func TestHeadAppenderV2_STStorage(t *testing.T) { + testHistogram := tsdbutil.GenerateTestHistogram(1) + testHistogram.CounterResetHint = histogram.NotCounterReset + + type sampleData struct { + st int64 + ts int64 + fSample float64 + h *histogram.Histogram + } + + testCases := []struct { + name string + samples []sampleData + expectedSTs []int64 + isHistogram bool + }{ + { + name: "Float samples with ST", + samples: []sampleData{ + {st: 10, ts: 100, fSample: 1.0}, + {st: 20, ts: 200, fSample: 2.0}, + {st: 30, ts: 300, fSample: 3.0}, + }, + expectedSTs: []int64{10, 20, 30}, + isHistogram: false, + }, + { + name: "Float samples with varying ST", + samples: []sampleData{ + {st: 5, ts: 100, fSample: 1.0}, + {st: 5, ts: 200, fSample: 2.0}, + {st: 150, ts: 300, fSample: 3.0}, + }, + expectedSTs: []int64{5, 5, 150}, + isHistogram: false, + }, + { + name: "Histogram samples", + samples: []sampleData{ + {st: 10, ts: 100, h: testHistogram}, + {st: 20, ts: 200, h: testHistogram}, + {st: 30, ts: 300, h: testHistogram}, + }, + // Histograms don't support ST storage yet, should return 0. + expectedSTs: []int64{0, 0, 0}, + isHistogram: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) + opts.EnableSTStorage.Store(true) + opts.EnableXOR2Encoding.Store(true) + h, _ := newTestHeadWithOptions(t, compression.None, opts) + + lbls := labels.FromStrings("foo", "bar") + + a := h.AppenderV2(context.Background()) + for _, s := range tc.samples { + _, err := a.Append(0, lbls, s.st, s.ts, s.fSample, s.h, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, a.Commit()) + + // Verify ST values are stored in chunks. 
+ ctx := context.Background() + idxReader, err := h.Index() + require.NoError(t, err) + defer idxReader.Close() + + chkReader, err := h.Chunks() + require.NoError(t, err) + defer chkReader.Close() + + p, err := idxReader.Postings(ctx, "foo", "bar") + require.NoError(t, err) + + var lblBuilder labels.ScratchBuilder + require.True(t, p.Next()) + sRef := p.At() + + var chkMetas []chunks.Meta + require.NoError(t, idxReader.Series(sRef, &lblBuilder, &chkMetas)) + + var actualSTs []int64 + for _, meta := range chkMetas { + chk, iterable, err := chkReader.ChunkOrIterable(meta) + require.NoError(t, err) + require.Nil(t, iterable) + + it := chk.Iterator(nil) + for it.Next() != chunkenc.ValNone { + st := it.AtST() + actualSTs = append(actualSTs, st) + } + require.NoError(t, it.Err()) + } + + if tc.isHistogram { + require.Equal(t, tc.expectedSTs, actualSTs, "Histogram samples should return 0 for ST") + } else { + require.Equal(t, tc.expectedSTs, actualSTs, "Float samples should have ST stored") + } + + // Also verify via querier. 
+ q, err := NewBlockQuerier(h, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + defer q.Close() + + ss := q.Select(ctx, false, nil, labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")) + require.True(t, ss.Next()) + series := ss.At() + require.NoError(t, ss.Err()) + + seriesIt := series.Iterator(nil) + var queriedSTs []int64 + for seriesIt.Next() != chunkenc.ValNone { + st := seriesIt.AtST() + queriedSTs = append(queriedSTs, st) + } + require.NoError(t, seriesIt.Err()) + + require.Equal(t, tc.expectedSTs, queriedSTs, "Querier should return same ST values as chunk iterator") + }) + } +} diff --git a/tsdb/head_read_test.go b/tsdb/head_read_test.go index cf55973a01..0849c257b5 100644 --- a/tsdb/head_read_test.go +++ b/tsdb/head_read_test.go @@ -33,7 +33,7 @@ func TestMemSeries_chunk(t *testing.T) { appendSamples := func(t *testing.T, s *memSeries, start, end int64, cdm *chunks.ChunkDiskMapper) { for i := start; i < end; i += chunkStep { - ok, _ := s.append(i, float64(i), 0, chunkOpts{ + ok, _ := s.append(0, i, float64(i), 0, chunkOpts{ chunkDiskMapper: cdm, chunkRange: chunkRange, samplesPerChunk: DefaultSamplesPerChunk, diff --git a/tsdb/head_test.go b/tsdb/head_test.go index 56f3b70f5e..c04cd51278 100644 --- a/tsdb/head_test.go +++ b/tsdb/head_test.go @@ -112,8 +112,8 @@ func BenchmarkCreateSeries(b *testing.B) { } } -func populateTestWL(t testing.TB, w *wlog.WL, recs []any, buf []byte) []byte { - var enc record.Encoder +func populateTestWL(t testing.TB, w *wlog.WL, recs []any, buf []byte, enableSTStorage bool) []byte { + enc := record.Encoder{EnableSTStorage: enableSTStorage} for _, r := range recs { buf = buf[:0] switch v := r.(type) { @@ -159,7 +159,7 @@ func readTestWAL(t testing.TB, dir string) (recs []any) { series, err := dec.Series(rec, nil) require.NoError(t, err) recs = append(recs, series) - case record.Samples: + case record.Samples, record.SamplesV2: samples, err := dec.Samples(rec, nil) require.NoError(t, err) recs = append(recs, samples) 
@@ -256,177 +256,179 @@ func BenchmarkLoadWLs(b *testing.B) { // Rough estimates of most common % of samples that have an exemplar for each scrape. exemplarsPercentages := []float64{0, 0.5, 1, 5} lastExemplarsPerSeries := -1 - for _, c := range cases { - missingSeriesPercentages := []float64{0, 0.1} - for _, missingSeriesPct := range missingSeriesPercentages { - for _, p := range exemplarsPercentages { - exemplarsPerSeries := int(math.RoundToEven(float64(c.samplesPerSeries) * p / 100)) - // For tests with low samplesPerSeries we could end up testing with 0 exemplarsPerSeries - // multiple times without this check. - if exemplarsPerSeries == lastExemplarsPerSeries { - continue - } - lastExemplarsPerSeries = exemplarsPerSeries - b.Run(fmt.Sprintf("batches=%d,seriesPerBatch=%d,samplesPerSeries=%d,exemplarsPerSeries=%d,mmappedChunkT=%d,oooSeriesPct=%.3f,oooSamplesPct=%.3f,oooCapMax=%d,missingSeriesPct=%.3f", c.batches, c.seriesPerBatch, c.samplesPerSeries, exemplarsPerSeries, c.mmappedChunkT, c.oooSeriesPct, c.oooSamplesPct, c.oooCapMax, missingSeriesPct), - func(b *testing.B) { - dir := b.TempDir() + for _, enableSTStorage := range []bool{false, true} { + for _, c := range cases { + missingSeriesPercentages := []float64{0, 0.1} + for _, missingSeriesPct := range missingSeriesPercentages { + for _, p := range exemplarsPercentages { + exemplarsPerSeries := int(math.RoundToEven(float64(c.samplesPerSeries) * p / 100)) + // For tests with low samplesPerSeries we could end up testing with 0 exemplarsPerSeries + // multiple times without this check. 
+ if exemplarsPerSeries == lastExemplarsPerSeries { + continue + } + lastExemplarsPerSeries = exemplarsPerSeries + b.Run(fmt.Sprintf("batches=%d,seriesPerBatch=%d,samplesPerSeries=%d,exemplarsPerSeries=%d,mmappedChunkT=%d,oooSeriesPct=%.3f,oooSamplesPct=%.3f,oooCapMax=%d,missingSeriesPct=%.3f,stStorage=%v", c.batches, c.seriesPerBatch, c.samplesPerSeries, exemplarsPerSeries, c.mmappedChunkT, c.oooSeriesPct, c.oooSamplesPct, c.oooCapMax, missingSeriesPct, enableSTStorage), + func(b *testing.B) { + dir := b.TempDir() - wal, err := wlog.New(nil, nil, dir, compression.None) - require.NoError(b, err) - var wbl *wlog.WL - if c.oooSeriesPct != 0 { - wbl, err = wlog.New(nil, nil, dir, compression.None) + wal, err := wlog.New(nil, nil, dir, compression.None) require.NoError(b, err) - } - - // Write series. - refSeries := make([]record.RefSeries, 0, c.seriesPerBatch) - var buf []byte - builder := labels.NewBuilder(labels.EmptyLabels()) - for j := 1; j < labelsPerSeries; j++ { - builder.Set(defaultLabelName+strconv.Itoa(j), defaultLabelValue+strconv.Itoa(j)) - } - for k := 0; k < c.batches; k++ { - refSeries = refSeries[:0] - for i := k * c.seriesPerBatch; i < (k+1)*c.seriesPerBatch; i++ { - builder.Set(defaultLabelName, strconv.Itoa(i)) - refSeries = append(refSeries, record.RefSeries{Ref: chunks.HeadSeriesRef(i) * 101, Labels: builder.Labels()}) + var wbl *wlog.WL + if c.oooSeriesPct != 0 { + wbl, err = wlog.New(nil, nil, dir, compression.None) + require.NoError(b, err) } - writeSeries := refSeries - if missingSeriesPct > 0 { - newWriteSeries := make([]record.RefSeries, 0, int(float64(len(refSeries))*(1.0-missingSeriesPct))) - keepRatio := 1.0 - missingSeriesPct - // Keep approximately every 1/keepRatio series. - for i, s := range refSeries { - if int(float64(i)*keepRatio) != int(float64(i+1)*keepRatio) { - newWriteSeries = append(newWriteSeries, s) + // Write series. 
+ refSeries := make([]record.RefSeries, 0, c.seriesPerBatch) + var buf []byte + builder := labels.NewBuilder(labels.EmptyLabels()) + for j := 1; j < labelsPerSeries; j++ { + builder.Set(defaultLabelName+strconv.Itoa(j), defaultLabelValue+strconv.Itoa(j)) + } + for k := 0; k < c.batches; k++ { + refSeries = refSeries[:0] + for i := k * c.seriesPerBatch; i < (k+1)*c.seriesPerBatch; i++ { + builder.Set(defaultLabelName, strconv.Itoa(i)) + refSeries = append(refSeries, record.RefSeries{Ref: chunks.HeadSeriesRef(i) * 101, Labels: builder.Labels()}) + } + + writeSeries := refSeries + if missingSeriesPct > 0 { + newWriteSeries := make([]record.RefSeries, 0, int(float64(len(refSeries))*(1.0-missingSeriesPct))) + keepRatio := 1.0 - missingSeriesPct + // Keep approximately every 1/keepRatio series. + for i, s := range refSeries { + if int(float64(i)*keepRatio) != int(float64(i+1)*keepRatio) { + newWriteSeries = append(newWriteSeries, s) + } } + writeSeries = newWriteSeries } - writeSeries = newWriteSeries + + buf = populateTestWL(b, wal, []any{writeSeries}, buf, enableSTStorage) } - buf = populateTestWL(b, wal, []any{writeSeries}, buf) - } + // Write samples. + refSamples := make([]record.RefSample, 0, c.seriesPerBatch) - // Write samples. - refSamples := make([]record.RefSample, 0, c.seriesPerBatch) + oooSeriesPerBatch := int(float64(c.seriesPerBatch) * c.oooSeriesPct) + oooSamplesPerSeries := int(float64(c.samplesPerSeries) * c.oooSamplesPct) - oooSeriesPerBatch := int(float64(c.seriesPerBatch) * c.oooSeriesPct) - oooSamplesPerSeries := int(float64(c.samplesPerSeries) * c.oooSamplesPct) + for i := 0; i < c.samplesPerSeries; i++ { + for j := 0; j < c.batches; j++ { + refSamples = refSamples[:0] - for i := 0; i < c.samplesPerSeries; i++ { - for j := 0; j < c.batches; j++ { - refSamples = refSamples[:0] - - k := j * c.seriesPerBatch - // Skip appending the first oooSamplesPerSeries samples for the series in the batch that - // should have OOO samples. 
OOO samples are appended after all the in-order samples. - if i < oooSamplesPerSeries { - k += oooSeriesPerBatch + k := j * c.seriesPerBatch + // Skip appending the first oooSamplesPerSeries samples for the series in the batch that + // should have OOO samples. OOO samples are appended after all the in-order samples. + if i < oooSamplesPerSeries { + k += oooSeriesPerBatch + } + for ; k < (j+1)*c.seriesPerBatch; k++ { + refSamples = append(refSamples, record.RefSample{ + Ref: chunks.HeadSeriesRef(k) * 101, + T: int64(i) * 10, + V: float64(i) * 100, + }) + } + buf = populateTestWL(b, wal, []any{refSamples}, buf, enableSTStorage) } - for ; k < (j+1)*c.seriesPerBatch; k++ { - refSamples = append(refSamples, record.RefSample{ - Ref: chunks.HeadSeriesRef(k) * 101, - T: int64(i) * 10, - V: float64(i) * 100, - }) + } + + // Write mmapped chunks. + if c.mmappedChunkT != 0 { + chunkDiskMapper, err := chunks.NewChunkDiskMapper(nil, mmappedChunksDir(dir), chunkenc.NewPool(), chunks.DefaultWriteBufferSize, chunks.DefaultWriteQueueSize) + require.NoError(b, err) + cOpts := chunkOpts{ + chunkDiskMapper: chunkDiskMapper, + chunkRange: c.mmappedChunkT, + samplesPerChunk: DefaultSamplesPerChunk, } - buf = populateTestWL(b, wal, []any{refSamples}, buf) - } - } - - // Write mmapped chunks. - if c.mmappedChunkT != 0 { - chunkDiskMapper, err := chunks.NewChunkDiskMapper(nil, mmappedChunksDir(dir), chunkenc.NewPool(), chunks.DefaultWriteBufferSize, chunks.DefaultWriteQueueSize) - require.NoError(b, err) - cOpts := chunkOpts{ - chunkDiskMapper: chunkDiskMapper, - chunkRange: c.mmappedChunkT, - samplesPerChunk: DefaultSamplesPerChunk, - } - for k := 0; k < c.batches*c.seriesPerBatch; k++ { - // Create one mmapped chunk per series, with one sample at the given time. - s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, 0, defaultIsolationDisabled, false) - s.append(c.mmappedChunkT, 42, 0, cOpts) - // There's only one head chunk because only a single sample is appended. 
mmapChunks() - // ignores the latest chunk, so we need to cut a new head chunk to guarantee the chunk with - // the sample at c.mmappedChunkT is mmapped. - s.cutNewHeadChunk(c.mmappedChunkT, chunkenc.EncXOR, c.mmappedChunkT) - s.mmapChunks(chunkDiskMapper) - } - require.NoError(b, chunkDiskMapper.Close()) - } - - // Write exemplars. - refExemplars := make([]record.RefExemplar, 0, c.seriesPerBatch) - for i := range exemplarsPerSeries { - for j := 0; j < c.batches; j++ { - refExemplars = refExemplars[:0] - for k := j * c.seriesPerBatch; k < (j+1)*c.seriesPerBatch; k++ { - refExemplars = append(refExemplars, record.RefExemplar{ - Ref: chunks.HeadSeriesRef(k) * 101, - T: int64(i) * 10, - V: float64(i) * 100, - Labels: labels.FromStrings("trace_id", fmt.Sprintf("trace-%d", i)), - }) + for k := 0; k < c.batches*c.seriesPerBatch; k++ { + // Create one mmapped chunk per series, with one sample at the given time. + s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, 0, defaultIsolationDisabled, false) + s.append(0, c.mmappedChunkT, 42, 0, cOpts) + // There's only one head chunk because only a single sample is appended. mmapChunks() + // ignores the latest chunk, so we need to cut a new head chunk to guarantee the chunk with + // the sample at c.mmappedChunkT is mmapped. + s.cutNewHeadChunk(c.mmappedChunkT, chunkenc.EncXOR, c.mmappedChunkT) + s.mmapChunks(chunkDiskMapper) } - buf = populateTestWL(b, wal, []any{refExemplars}, buf) + require.NoError(b, chunkDiskMapper.Close()) } - } - // Write OOO samples and mmap markers. - refMarkers := make([]record.RefMmapMarker, 0, oooSeriesPerBatch) - refSamples = make([]record.RefSample, 0, oooSeriesPerBatch) - for i := range oooSamplesPerSeries { - shouldAddMarkers := c.oooCapMax != 0 && i != 0 && int64(i)%c.oooCapMax == 0 - - for j := 0; j < c.batches; j++ { - refSamples = refSamples[:0] - if shouldAddMarkers { - refMarkers = refMarkers[:0] + // Write exemplars. 
+ refExemplars := make([]record.RefExemplar, 0, c.seriesPerBatch) + for i := range exemplarsPerSeries { + for j := 0; j < c.batches; j++ { + refExemplars = refExemplars[:0] + for k := j * c.seriesPerBatch; k < (j+1)*c.seriesPerBatch; k++ { + refExemplars = append(refExemplars, record.RefExemplar{ + Ref: chunks.HeadSeriesRef(k) * 101, + T: int64(i) * 10, + V: float64(i) * 100, + Labels: labels.FromStrings("trace_id", fmt.Sprintf("trace-%d", i)), + }) + } + buf = populateTestWL(b, wal, []any{refExemplars}, buf, enableSTStorage) } - for k := j * c.seriesPerBatch; k < (j*c.seriesPerBatch)+oooSeriesPerBatch; k++ { - ref := chunks.HeadSeriesRef(k) * 101 + } + + // Write OOO samples and mmap markers. + refMarkers := make([]record.RefMmapMarker, 0, oooSeriesPerBatch) + refSamples = make([]record.RefSample, 0, oooSeriesPerBatch) + for i := range oooSamplesPerSeries { + shouldAddMarkers := c.oooCapMax != 0 && i != 0 && int64(i)%c.oooCapMax == 0 + + for j := 0; j < c.batches; j++ { + refSamples = refSamples[:0] if shouldAddMarkers { - // loadWBL() checks that the marker's MmapRef is less than or equal to the ref - // for the last mmap chunk. Setting MmapRef to 0 to always pass that check. - refMarkers = append(refMarkers, record.RefMmapMarker{Ref: ref, MmapRef: 0}) + refMarkers = refMarkers[:0] } - refSamples = append(refSamples, record.RefSample{ - Ref: ref, - T: int64(i) * 10, - V: float64(i) * 100, - }) + for k := j * c.seriesPerBatch; k < (j*c.seriesPerBatch)+oooSeriesPerBatch; k++ { + ref := chunks.HeadSeriesRef(k) * 101 + if shouldAddMarkers { + // loadWBL() checks that the marker's MmapRef is less than or equal to the ref + // for the last mmap chunk. Setting MmapRef to 0 to always pass that check. 
+ refMarkers = append(refMarkers, record.RefMmapMarker{Ref: ref, MmapRef: 0}) + } + refSamples = append(refSamples, record.RefSample{ + Ref: ref, + T: int64(i) * 10, + V: float64(i) * 100, + }) + } + if shouldAddMarkers { + populateTestWL(b, wbl, []any{refMarkers}, buf, enableSTStorage) + } + buf = populateTestWL(b, wal, []any{refSamples}, buf, enableSTStorage) + buf = populateTestWL(b, wbl, []any{refSamples}, buf, enableSTStorage) } - if shouldAddMarkers { - populateTestWL(b, wbl, []any{refMarkers}, buf) + } + + b.ResetTimer() + + // Load the WAL. + for b.Loop() { + opts := DefaultHeadOptions() + opts.ChunkRange = 1000 + opts.ChunkDirRoot = dir + if c.oooCapMax > 0 { + opts.OutOfOrderCapMax.Store(c.oooCapMax) } - buf = populateTestWL(b, wal, []any{refSamples}, buf) - buf = populateTestWL(b, wbl, []any{refSamples}, buf) + h, err := NewHead(nil, nil, wal, wbl, opts, nil) + require.NoError(b, err) + h.Init(0) } - } - - b.ResetTimer() - - // Load the WAL. - for b.Loop() { - opts := DefaultHeadOptions() - opts.ChunkRange = 1000 - opts.ChunkDirRoot = dir - if c.oooCapMax > 0 { - opts.OutOfOrderCapMax.Store(c.oooCapMax) + b.StopTimer() + wal.Close() + if wbl != nil { + wbl.Close() } - h, err := NewHead(nil, nil, wal, wbl, opts, nil) - require.NoError(b, err) - h.Init(0) - } - b.StopTimer() - wal.Close() - if wbl != nil { - wbl.Close() - } - }) + }) + } } } } @@ -711,124 +713,126 @@ func TestHead_HighConcurrencyReadAndWrite(t *testing.T) { } func TestHead_ReadWAL(t *testing.T) { - for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - entries := []any{ - []record.RefSeries{ - {Ref: 10, Labels: labels.FromStrings("a", "1")}, - {Ref: 11, Labels: labels.FromStrings("a", "2")}, - {Ref: 100, Labels: labels.FromStrings("a", "3")}, - }, - []record.RefSample{ - {Ref: 0, T: 99, V: 1}, - {Ref: 10, T: 100, V: 2}, - {Ref: 100, T: 100, V: 3}, - }, - []record.RefSeries{ - 
{Ref: 50, Labels: labels.FromStrings("a", "4")}, - // This series has two refs pointing to it. - {Ref: 101, Labels: labels.FromStrings("a", "3")}, - }, - []record.RefSample{ - {Ref: 10, T: 101, V: 5}, - {Ref: 50, T: 101, V: 6}, - // Sample for duplicate series record. - {Ref: 101, T: 101, V: 7}, - }, - []tombstones.Stone{ - {Ref: 0, Intervals: []tombstones.Interval{{Mint: 99, Maxt: 101}}}, - // Tombstone for duplicate series record. - {Ref: 101, Intervals: []tombstones.Interval{{Mint: 0, Maxt: 100}}}, - }, - []record.RefExemplar{ - {Ref: 10, T: 100, V: 1, Labels: labels.FromStrings("trace_id", "asdf")}, - // Exemplar for duplicate series record. - {Ref: 101, T: 101, V: 7, Labels: labels.FromStrings("trace_id", "zxcv")}, - }, - []record.RefMetadata{ - // Metadata for duplicate series record. - {Ref: 101, Type: uint8(record.Counter), Unit: "foo", Help: "total foo"}, - }, - } - - head, w := newTestHead(t, 1000, compress, false) - - populateTestWL(t, w, entries, nil) - - require.NoError(t, head.Init(math.MinInt64)) - require.Equal(t, uint64(101), head.lastSeriesID.Load()) - - s10 := head.series.getByID(10) - s11 := head.series.getByID(11) - s50 := head.series.getByID(50) - s100 := head.series.getByID(100) - s101 := head.series.getByID(101) - - testutil.RequireEqual(t, labels.FromStrings("a", "1"), s10.lset) - require.Nil(t, s11) // Series without samples should be garbage collected at head.Init(). - testutil.RequireEqual(t, labels.FromStrings("a", "4"), s50.lset) - testutil.RequireEqual(t, labels.FromStrings("a", "3"), s100.lset) - - // Duplicate series record should not be written to the head. - require.Nil(t, s101) - // But it should have a WAL expiry set. - keepUntil, ok := head.getWALExpiry(101) - require.True(t, ok) - require.Equal(t, int64(101), keepUntil) - // Only the duplicate series record should have a WAL expiry set. 
- _, ok = head.getWALExpiry(50) - require.False(t, ok) - - expandChunk := func(c chunkenc.Iterator) (x []sample) { - for c.Next() == chunkenc.ValFloat { - t, v := c.At() - x = append(x, sample{t: t, f: v}) + for _, enableSTStorage := range []bool{false, true} { + for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { + entries := []any{ + []record.RefSeries{ + {Ref: 10, Labels: labels.FromStrings("a", "1")}, + {Ref: 11, Labels: labels.FromStrings("a", "2")}, + {Ref: 100, Labels: labels.FromStrings("a", "3")}, + }, + []record.RefSample{ + {Ref: 0, T: 99, V: 1}, + {Ref: 10, T: 100, V: 2}, + {Ref: 100, T: 100, V: 3}, + }, + []record.RefSeries{ + {Ref: 50, Labels: labels.FromStrings("a", "4")}, + // This series has two refs pointing to it. + {Ref: 101, Labels: labels.FromStrings("a", "3")}, + }, + []record.RefSample{ + {Ref: 10, T: 101, V: 5}, + {Ref: 50, T: 101, V: 6}, + // Sample for duplicate series record. + {Ref: 101, T: 101, V: 7}, + }, + []tombstones.Stone{ + {Ref: 0, Intervals: []tombstones.Interval{{Mint: 99, Maxt: 101}}}, + // Tombstone for duplicate series record. + {Ref: 101, Intervals: []tombstones.Interval{{Mint: 0, Maxt: 100}}}, + }, + []record.RefExemplar{ + {Ref: 10, T: 100, V: 1, Labels: labels.FromStrings("trace_id", "asdf")}, + // Exemplar for duplicate series record. + {Ref: 101, T: 101, V: 7, Labels: labels.FromStrings("trace_id", "zxcv")}, + }, + []record.RefMetadata{ + // Metadata for duplicate series record. + {Ref: 101, Type: uint8(record.Counter), Unit: "foo", Help: "total foo"}, + }, } - require.NoError(t, c.Err()) - return x - } - // Verify samples and exemplar for series 10. 
- c, _, _, err := s10.chunk(0, head.chunkDiskMapper, &head.memChunkPool) - require.NoError(t, err) - require.Equal(t, []sample{{0, 100, 2, nil, nil}, {0, 101, 5, nil, nil}}, expandChunk(c.chunk.Iterator(nil))) + head, w := newTestHead(t, 1000, compress, false) - q, err := head.ExemplarQuerier(context.Background()) - require.NoError(t, err) - e, err := q.Select(0, 1000, []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "a", "1")}) - require.NoError(t, err) - require.NotEmpty(t, e) - require.NotEmpty(t, e[0].Exemplars) - require.True(t, exemplar.Exemplar{Ts: 100, Value: 1, Labels: labels.FromStrings("trace_id", "asdf")}.Equals(e[0].Exemplars[0])) + populateTestWL(t, w, entries, nil, enableSTStorage) - // Verify samples for series 50 - c, _, _, err = s50.chunk(0, head.chunkDiskMapper, &head.memChunkPool) - require.NoError(t, err) - require.Equal(t, []sample{{0, 101, 6, nil, nil}}, expandChunk(c.chunk.Iterator(nil))) + require.NoError(t, head.Init(math.MinInt64)) + require.Equal(t, uint64(101), head.lastSeriesID.Load()) - // Verify records for series 100 and its duplicate, series 101. - // The samples before the new series record should be discarded since a duplicate record - // is only possible when old samples were compacted. 
- c, _, _, err = s100.chunk(0, head.chunkDiskMapper, &head.memChunkPool) - require.NoError(t, err) - require.Equal(t, []sample{{0, 101, 7, nil, nil}}, expandChunk(c.chunk.Iterator(nil))) + s10 := head.series.getByID(10) + s11 := head.series.getByID(11) + s50 := head.series.getByID(50) + s100 := head.series.getByID(100) + s101 := head.series.getByID(101) - q, err = head.ExemplarQuerier(context.Background()) - require.NoError(t, err) - e, err = q.Select(0, 1000, []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "a", "3")}) - require.NoError(t, err) - require.NotEmpty(t, e) - require.NotEmpty(t, e[0].Exemplars) - require.True(t, exemplar.Exemplar{Ts: 101, Value: 7, Labels: labels.FromStrings("trace_id", "zxcv")}.Equals(e[0].Exemplars[0])) + testutil.RequireEqual(t, labels.FromStrings("a", "1"), s10.lset) + require.Nil(t, s11) // Series without samples should be garbage collected at head.Init(). + testutil.RequireEqual(t, labels.FromStrings("a", "4"), s50.lset) + testutil.RequireEqual(t, labels.FromStrings("a", "3"), s100.lset) - require.NotNil(t, s100.meta) - require.Equal(t, "foo", s100.meta.Unit) - require.Equal(t, "total foo", s100.meta.Help) + // Duplicate series record should not be written to the head. + require.Nil(t, s101) + // But it should have a WAL expiry set. + keepUntil, ok := head.getWALExpiry(101) + require.True(t, ok) + require.Equal(t, int64(101), keepUntil) + // Only the duplicate series record should have a WAL expiry set. + _, ok = head.getWALExpiry(50) + require.False(t, ok) - intervals, err := head.tombstones.Get(storage.SeriesRef(s100.ref)) - require.NoError(t, err) - require.Equal(t, tombstones.Intervals{{Mint: 0, Maxt: 100}}, intervals) - }) + expandChunk := func(c chunkenc.Iterator) (x []sample) { + for c.Next() == chunkenc.ValFloat { + t, v := c.At() + x = append(x, sample{t: t, f: v}) + } + require.NoError(t, c.Err()) + return x + } + + // Verify samples and exemplar for series 10. 
+ c, _, _, err := s10.chunk(0, head.chunkDiskMapper, &head.memChunkPool) + require.NoError(t, err) + require.Equal(t, []sample{{0, 100, 2, nil, nil}, {0, 101, 5, nil, nil}}, expandChunk(c.chunk.Iterator(nil))) + + q, err := head.ExemplarQuerier(context.Background()) + require.NoError(t, err) + e, err := q.Select(0, 1000, []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "a", "1")}) + require.NoError(t, err) + require.NotEmpty(t, e) + require.NotEmpty(t, e[0].Exemplars) + require.True(t, exemplar.Exemplar{Ts: 100, Value: 1, Labels: labels.FromStrings("trace_id", "asdf")}.Equals(e[0].Exemplars[0])) + + // Verify samples for series 50 + c, _, _, err = s50.chunk(0, head.chunkDiskMapper, &head.memChunkPool) + require.NoError(t, err) + require.Equal(t, []sample{{0, 101, 6, nil, nil}}, expandChunk(c.chunk.Iterator(nil))) + + // Verify records for series 100 and its duplicate, series 101. + // The samples before the new series record should be discarded since a duplicate record + // is only possible when old samples were compacted. 
+ c, _, _, err = s100.chunk(0, head.chunkDiskMapper, &head.memChunkPool) + require.NoError(t, err) + require.Equal(t, []sample{{0, 101, 7, nil, nil}}, expandChunk(c.chunk.Iterator(nil))) + + q, err = head.ExemplarQuerier(context.Background()) + require.NoError(t, err) + e, err = q.Select(0, 1000, []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "a", "3")}) + require.NoError(t, err) + require.NotEmpty(t, e) + require.NotEmpty(t, e[0].Exemplars) + require.True(t, exemplar.Exemplar{Ts: 101, Value: 7, Labels: labels.FromStrings("trace_id", "zxcv")}.Equals(e[0].Exemplars[0])) + + require.NotNil(t, s100.meta) + require.Equal(t, "foo", s100.meta.Unit) + require.Equal(t, "total foo", s100.meta.Help) + + intervals, err := head.tombstones.Get(storage.SeriesRef(s100.ref)) + require.NoError(t, err) + require.Equal(t, tombstones.Intervals{{Mint: 0, Maxt: 100}}, intervals) + }) + } } } @@ -1099,42 +1103,43 @@ func TestHead_WALCheckpointMultiRef(t *testing.T) { }, } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - h, w := newTestHead(t, 1000, compression.None, false) - - populateTestWL(t, w, tc.walEntries, nil) - first, _, err := wlog.Segments(w.Dir()) - require.NoError(t, err) - - require.NoError(t, h.Init(0)) - - keepUntil, ok := h.getWALExpiry(2) - require.True(t, ok) - require.Equal(t, tc.expectedWalExpiry, keepUntil) - - // Each truncation creates a new segment, so attempt truncations until a checkpoint is created - for { - h.lastWALTruncationTime.Store(0) // Reset so that it's always time to truncate the WAL - err := h.truncateWAL(tc.walTruncateMinT) + for _, enableSTStorage := range []bool{false, true} { + for _, tc := range cases { + t.Run(tc.name+",stStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + h, w := newTestHead(t, 1000, compression.None, false) + populateTestWL(t, w, tc.walEntries, nil, enableSTStorage) + first, _, err := wlog.Segments(w.Dir()) require.NoError(t, err) - f, _, err := wlog.Segments(w.Dir()) - 
require.NoError(t, err) - if f > first { - break + + require.NoError(t, h.Init(0)) + + keepUntil, ok := h.getWALExpiry(2) + require.True(t, ok) + require.Equal(t, tc.expectedWalExpiry, keepUntil) + + // Each truncation creates a new segment, so attempt truncations until a checkpoint is created + for { + h.lastWALTruncationTime.Store(0) // Reset so that it's always time to truncate the WAL + err := h.truncateWAL(tc.walTruncateMinT) + require.NoError(t, err) + f, _, err := wlog.Segments(w.Dir()) + require.NoError(t, err) + if f > first { + break + } } - } - // Read test WAL , checkpoint first - checkpointDir, _, err := wlog.LastCheckpoint(w.Dir()) - require.NoError(t, err) - cprecs := readTestWAL(t, checkpointDir) - recs := readTestWAL(t, w.Dir()) - recs = append(cprecs, recs...) + // Read test WAL , checkpoint first + checkpointDir, _, err := wlog.LastCheckpoint(w.Dir()) + require.NoError(t, err) + cprecs := readTestWAL(t, checkpointDir) + recs := readTestWAL(t, w.Dir()) + recs = append(cprecs, recs...) 
- // Use testutil.RequireEqual which handles labels properly with dedupelabels - testutil.RequireEqual(t, tc.expectedWalEntries, recs) - }) + // Use testutil.RequireEqual which handles labels properly with dedupelabels + testutil.RequireEqual(t, tc.expectedWalEntries, recs) + }) + } } } @@ -1487,7 +1492,7 @@ func TestMemSeries_truncateChunks(t *testing.T) { s := newMemSeries(labels.FromStrings("a", "b"), 1, 0, defaultIsolationDisabled, false) for i := 0; i < 4000; i += 5 { - ok, _ := s.append(int64(i), float64(i), 0, cOpts) + ok, _ := s.append(0, int64(i), float64(i), 0, cOpts) require.True(t, ok, "sample append failed") } s.mmapChunks(chunkDiskMapper) @@ -1637,7 +1642,7 @@ func TestMemSeries_truncateChunks_scenarios(t *testing.T) { if tc.mmappedChunks > 0 { headStart = (tc.mmappedChunks + 1) * chunkRange for i := 0; i < (tc.mmappedChunks+1)*chunkRange; i += chunkStep { - ok, _ := series.append(int64(i), float64(i), 0, cOpts) + ok, _ := series.append(0, int64(i), float64(i), 0, cOpts) require.True(t, ok, "sample append failed") } series.mmapChunks(chunkDiskMapper) @@ -1647,7 +1652,7 @@ func TestMemSeries_truncateChunks_scenarios(t *testing.T) { series.headChunks = nil } else { for i := headStart; i < chunkRange*(tc.mmappedChunks+tc.headChunks); i += chunkStep { - ok, _ := series.append(int64(i), float64(i), 0, cOpts) + ok, _ := series.append(0, int64(i), float64(i), 0, cOpts) require.True(t, ok, "sample append failed: %d", i) } } @@ -1685,29 +1690,31 @@ func TestMemSeries_truncateChunks_scenarios(t *testing.T) { } func TestHeadDeleteSeriesWithoutSamples(t *testing.T) { - for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - entries := []any{ - []record.RefSeries{ - {Ref: 10, Labels: labels.FromStrings("a", "1")}, - }, - []record.RefSample{}, - []record.RefSeries{ - {Ref: 50, Labels: labels.FromStrings("a", "2")}, - }, - []record.RefSample{ - {Ref: 
50, T: 80, V: 1}, - {Ref: 50, T: 90, V: 1}, - }, - } - head, w := newTestHead(t, 1000, compress, false) + for _, enableSTStorage := range []bool{false, true} { + for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { + entries := []any{ + []record.RefSeries{ + {Ref: 10, Labels: labels.FromStrings("a", "1")}, + }, + []record.RefSample{}, + []record.RefSeries{ + {Ref: 50, Labels: labels.FromStrings("a", "2")}, + }, + []record.RefSample{ + {Ref: 50, T: 80, V: 1}, + {Ref: 50, T: 90, V: 1}, + }, + } + head, w := newTestHead(t, 1000, compress, false) - populateTestWL(t, w, entries, nil) + populateTestWL(t, w, entries, nil, enableSTStorage) - require.NoError(t, head.Init(math.MinInt64)) + require.NoError(t, head.Init(math.MinInt64)) - require.NoError(t, head.Delete(context.Background(), 0, 100, labels.MustNewMatcher(labels.MatchEqual, "a", "1"))) - }) + require.NoError(t, head.Delete(context.Background(), 0, 100, labels.MustNewMatcher(labels.MatchEqual, "a", "1"))) + }) + } } } @@ -2176,7 +2183,47 @@ func TestComputeChunkEndTime(t *testing.T) { } } +// TestMemSeries_append tests float appending with various useXOR2/st combinations. 
func TestMemSeries_append(t *testing.T) { + scenarios := []struct { + name string + useXOR2 bool + stFunc func(ts int64) int64 // Function to compute st from ts + }{ + { + name: "useXOR2=false st=0", + useXOR2: false, + stFunc: func(_ int64) int64 { return 0 }, + }, + { + name: "useXOR2=true st=0", + useXOR2: true, + stFunc: func(_ int64) int64 { return 0 }, + }, + { + name: "useXOR2=true st=ts", + useXOR2: true, + stFunc: func(ts int64) int64 { return ts }, + }, + { + name: "useXOR2=true st=ts-100", + useXOR2: true, + stFunc: func(ts int64) int64 { return ts - 100 }, + }, + { + name: "useXOR2=false st=ts (st ignored)", + useXOR2: false, + stFunc: func(ts int64) int64 { return ts }, + }, + } + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + testMemSeriesAppend(t, scenario.useXOR2, scenario.stFunc) + }) + } +} + +func testMemSeriesAppend(t *testing.T, useXOR2 bool, stFunc func(ts int64) int64) { dir := t.TempDir() // This is usually taken from the Head, but passing manually here. chunkDiskMapper, err := chunks.NewChunkDiskMapper(nil, dir, chunkenc.NewPool(), chunks.DefaultWriteBufferSize, chunks.DefaultWriteQueueSize) @@ -2188,6 +2235,7 @@ func TestMemSeries_append(t *testing.T) { chunkDiskMapper: chunkDiskMapper, chunkRange: 500, samplesPerChunk: DefaultSamplesPerChunk, + useXOR2: useXOR2, } s := newMemSeries(labels.Labels{}, 1, 0, defaultIsolationDisabled, false) @@ -2195,20 +2243,20 @@ func TestMemSeries_append(t *testing.T) { // Add first two samples at the very end of a chunk range and the next two // on and after it. // New chunk must correctly be cut at 1000. 
- ok, chunkCreated := s.append(998, 1, 0, cOpts) + ok, chunkCreated := s.append(stFunc(998), 998, 1, 0, cOpts) require.True(t, ok, "append failed") require.True(t, chunkCreated, "first sample created chunk") - ok, chunkCreated = s.append(999, 2, 0, cOpts) + ok, chunkCreated = s.append(stFunc(999), 999, 2, 0, cOpts) require.True(t, ok, "append failed") require.False(t, chunkCreated, "second sample should use same chunk") s.mmapChunks(chunkDiskMapper) - ok, chunkCreated = s.append(1000, 3, 0, cOpts) + ok, chunkCreated = s.append(stFunc(1000), 1000, 3, 0, cOpts) require.True(t, ok, "append failed") require.True(t, chunkCreated, "expected new chunk on boundary") - ok, chunkCreated = s.append(1001, 4, 0, cOpts) + ok, chunkCreated = s.append(stFunc(1001), 1001, 4, 0, cOpts) require.True(t, ok, "append failed") require.False(t, chunkCreated, "second sample should use same chunk") @@ -2222,7 +2270,8 @@ func TestMemSeries_append(t *testing.T) { // Fill the range [1000,2000) with many samples. Intermediate chunks should be cut // at approximately 120 samples per chunk. for i := 1; i < 1000; i++ { - ok, _ := s.append(1001+int64(i), float64(i), 0, cOpts) + ts := 1001 + int64(i) + ok, _ := s.append(stFunc(ts), ts, float64(i), 0, cOpts) require.True(t, ok, "append failed") } s.mmapChunks(chunkDiskMapper) @@ -2237,7 +2286,47 @@ func TestMemSeries_append(t *testing.T) { } } +// TestMemSeries_appendHistogram tests histogram appending with various useXOR2/st combinations. 
func TestMemSeries_appendHistogram(t *testing.T) { + scenarios := []struct { + name string + useXOR2 bool + stFunc func(ts int64) int64 // Function to compute st from ts + }{ + { + name: "useXOR2=false st=0", + useXOR2: false, + stFunc: func(_ int64) int64 { return 0 }, + }, + { + name: "useXOR2=true st=0", + useXOR2: true, + stFunc: func(_ int64) int64 { return 0 }, + }, + { + name: "useXOR2=true st=ts", + useXOR2: true, + stFunc: func(ts int64) int64 { return ts }, + }, + { + name: "useXOR2=true st=ts-100", + useXOR2: true, + stFunc: func(ts int64) int64 { return ts - 100 }, + }, + { + name: "useXOR2=false st=ts (st ignored)", + useXOR2: false, + stFunc: func(ts int64) int64 { return ts }, + }, + } + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + testMemSeriesAppendHistogram(t, scenario.useXOR2, scenario.stFunc) + }) + } +} + +func testMemSeriesAppendHistogram(t *testing.T, useXOR2 bool, stFunc func(ts int64) int64) { dir := t.TempDir() // This is usually taken from the Head, but passing manually here. chunkDiskMapper, err := chunks.NewChunkDiskMapper(nil, dir, chunkenc.NewPool(), chunks.DefaultWriteBufferSize, chunks.DefaultWriteQueueSize) @@ -2249,6 +2338,7 @@ func TestMemSeries_appendHistogram(t *testing.T) { chunkDiskMapper: chunkDiskMapper, chunkRange: int64(1000), samplesPerChunk: DefaultSamplesPerChunk, + useXOR2: useXOR2, } s := newMemSeries(labels.Labels{}, 1, 0, defaultIsolationDisabled, false) @@ -2263,19 +2353,19 @@ func TestMemSeries_appendHistogram(t *testing.T) { // Add first two samples at the very end of a chunk range and the next two // on and after it. // New chunk must correctly be cut at 1000. 
- ok, chunkCreated := s.appendHistogram(998, histograms[0], 0, cOpts) + ok, chunkCreated := s.appendHistogram(stFunc(998), 998, histograms[0], 0, cOpts) require.True(t, ok, "append failed") require.True(t, chunkCreated, "first sample created chunk") - ok, chunkCreated = s.appendHistogram(999, histograms[1], 0, cOpts) + ok, chunkCreated = s.appendHistogram(stFunc(999), 999, histograms[1], 0, cOpts) require.True(t, ok, "append failed") require.False(t, chunkCreated, "second sample should use same chunk") - ok, chunkCreated = s.appendHistogram(1000, histograms[2], 0, cOpts) + ok, chunkCreated = s.appendHistogram(stFunc(1000), 1000, histograms[2], 0, cOpts) require.True(t, ok, "append failed") require.True(t, chunkCreated, "expected new chunk on boundary") - ok, chunkCreated = s.appendHistogram(1001, histograms[3], 0, cOpts) + ok, chunkCreated = s.appendHistogram(stFunc(1001), 1001, histograms[3], 0, cOpts) require.True(t, ok, "append failed") require.False(t, chunkCreated, "second sample should use same chunk") @@ -2286,7 +2376,7 @@ func TestMemSeries_appendHistogram(t *testing.T) { require.Equal(t, int64(1000), s.headChunks.minTime, "wrong chunk range") require.Equal(t, int64(1001), s.headChunks.maxTime, "wrong chunk range") - ok, chunkCreated = s.appendHistogram(1002, histogramWithOneMoreBucket, 0, cOpts) + ok, chunkCreated = s.appendHistogram(stFunc(1002), 1002, histogramWithOneMoreBucket, 0, cOpts) require.True(t, ok, "append failed") require.False(t, chunkCreated, "third sample should trigger a re-encoded chunk") @@ -2321,7 +2411,7 @@ func TestMemSeries_append_atVariableRate(t *testing.T) { var nextTs int64 var totalAppendedSamples int for i := range samplesPerChunk / 4 { - ok, _ := s.append(nextTs, float64(i), 0, cOpts) + ok, _ := s.append(0, nextTs, float64(i), 0, cOpts) require.Truef(t, ok, "slow sample %d was not appended", i) nextTs += slowRate totalAppendedSamples++ @@ -2330,12 +2420,12 @@ func TestMemSeries_append_atVariableRate(t *testing.T) { // 
Suddenly, the rate increases and we receive a sample every millisecond. for i := range math.MaxUint16 { - ok, _ := s.append(nextTs, float64(i), 0, cOpts) + ok, _ := s.append(0, nextTs, float64(i), 0, cOpts) require.Truef(t, ok, "quick sample %d was not appended", i) nextTs++ totalAppendedSamples++ } - ok, chunkCreated := s.append(DefaultBlockDuration, float64(0), 0, cOpts) + ok, chunkCreated := s.append(0, DefaultBlockDuration, float64(0), 0, cOpts) require.True(t, ok, "new chunk sample was not appended") require.True(t, chunkCreated, "sample at block duration timestamp should create a new chunk") @@ -2364,43 +2454,43 @@ func TestGCChunkAccess(t *testing.T) { s, _, _ := h.getOrCreate(1, labels.FromStrings("a", "1"), false) // Appending 2 samples for the first chunk. - ok, chunkCreated := s.append(0, 0, 0, cOpts) + ok, chunkCreated := s.append(0, 0, 0, 0, cOpts) require.True(t, ok, "series append failed") require.True(t, chunkCreated, "chunks was not created") - ok, chunkCreated = s.append(999, 999, 0, cOpts) + ok, chunkCreated = s.append(0, 999, 999, 0, cOpts) require.True(t, ok, "series append failed") require.False(t, chunkCreated, "chunks was created") // A new chunks should be created here as it's beyond the chunk range. 
- ok, chunkCreated = s.append(1000, 1000, 0, cOpts) + ok, chunkCreated = s.append(0, 1000, 1000, 0, cOpts) require.True(t, ok, "series append failed") require.True(t, chunkCreated, "chunks was not created") - ok, chunkCreated = s.append(1999, 1999, 0, cOpts) + ok, chunkCreated = s.append(0, 1999, 1999, 0, cOpts) require.True(t, ok, "series append failed") require.False(t, chunkCreated, "chunks was created") idx := h.indexRange(0, 1500) var ( - chunks []chunks.Meta + chnks []chunks.Meta builder labels.ScratchBuilder ) - require.NoError(t, idx.Series(1, &builder, &chunks)) + require.NoError(t, idx.Series(1, &builder, &chnks)) require.Equal(t, labels.FromStrings("a", "1"), builder.Labels()) - require.Len(t, chunks, 2) + require.Len(t, chnks, 2) cr, err := h.chunksRange(0, 1500, nil) require.NoError(t, err) - _, _, err = cr.ChunkOrIterable(chunks[0]) + _, _, err = cr.ChunkOrIterable(chnks[0]) require.NoError(t, err) - _, _, err = cr.ChunkOrIterable(chunks[1]) + _, _, err = cr.ChunkOrIterable(chnks[1]) require.NoError(t, err) require.NoError(t, h.Truncate(1500)) // Remove a chunk. - _, _, err = cr.ChunkOrIterable(chunks[0]) + _, _, err = cr.ChunkOrIterable(chnks[0]) require.Equal(t, storage.ErrNotFound, err) - _, _, err = cr.ChunkOrIterable(chunks[1]) + _, _, err = cr.ChunkOrIterable(chnks[1]) require.NoError(t, err) } @@ -2420,18 +2510,18 @@ func TestGCSeriesAccess(t *testing.T) { s, _, _ := h.getOrCreate(1, labels.FromStrings("a", "1"), false) // Appending 2 samples for the first chunk. - ok, chunkCreated := s.append(0, 0, 0, cOpts) + ok, chunkCreated := s.append(0, 0, 0, 0, cOpts) require.True(t, ok, "series append failed") require.True(t, chunkCreated, "chunks was not created") - ok, chunkCreated = s.append(999, 999, 0, cOpts) + ok, chunkCreated = s.append(0, 999, 999, 0, cOpts) require.True(t, ok, "series append failed") require.False(t, chunkCreated, "chunks was created") // A new chunks should be created here as it's beyond the chunk range. 
- ok, chunkCreated = s.append(1000, 1000, 0, cOpts) + ok, chunkCreated = s.append(0, 1000, 1000, 0, cOpts) require.True(t, ok, "series append failed") require.True(t, chunkCreated, "chunks was not created") - ok, chunkCreated = s.append(1999, 1999, 0, cOpts) + ok, chunkCreated = s.append(0, 1999, 1999, 0, cOpts) require.True(t, ok, "series append failed") require.False(t, chunkCreated, "chunks was created") @@ -2568,94 +2658,96 @@ func TestHead_ReturnsSortedLabelValues(t *testing.T) { // TestWalRepair_DecodingError ensures that a repair is run for an error // when decoding a record. func TestWalRepair_DecodingError(t *testing.T) { - var enc record.Encoder - for name, test := range map[string]struct { - corrFunc func(rec []byte) []byte // Func that applies the corruption to a record. - rec []byte - totalRecs int - expRecs int - }{ - "decode_series": { - func(rec []byte) []byte { - return rec[:3] + for _, enableSTStorage := range []bool{false, true} { + enc := record.Encoder{EnableSTStorage: enableSTStorage} + for name, test := range map[string]struct { + corrFunc func(rec []byte) []byte // Func that applies the corruption to a record. 
+ rec []byte + totalRecs int + expRecs int + }{ + "decode_series": { + func(rec []byte) []byte { + return rec[:3] + }, + enc.Series([]record.RefSeries{{Ref: 1, Labels: labels.FromStrings("a", "b")}}, []byte{}), + 9, + 5, }, - enc.Series([]record.RefSeries{{Ref: 1, Labels: labels.FromStrings("a", "b")}}, []byte{}), - 9, - 5, - }, - "decode_samples": { - func(rec []byte) []byte { - return rec[:3] + "decode_samples": { + func(rec []byte) []byte { + return rec[:3] + }, + enc.Samples([]record.RefSample{{Ref: 0, T: 99, V: 1}}, []byte{}), + 9, + 5, }, - enc.Samples([]record.RefSample{{Ref: 0, T: 99, V: 1}}, []byte{}), - 9, - 5, - }, - "decode_tombstone": { - func(rec []byte) []byte { - return rec[:3] + "decode_tombstone": { + func(rec []byte) []byte { + return rec[:3] + }, + enc.Tombstones([]tombstones.Stone{{Ref: 1, Intervals: tombstones.Intervals{}}}, []byte{}), + 9, + 5, }, - enc.Tombstones([]tombstones.Stone{{Ref: 1, Intervals: tombstones.Intervals{}}}, []byte{}), - 9, - 5, - }, - } { - for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { - t.Run(fmt.Sprintf("%s,compress=%s", name, compress), func(t *testing.T) { - dir := t.TempDir() + } { + for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { + t.Run(fmt.Sprintf("%s,compress=%s,stStorage=%v", name, compress, enableSTStorage), func(t *testing.T) { + dir := t.TempDir() - // Fill the wal and corrupt it. - { - w, err := wlog.New(nil, nil, filepath.Join(dir, "wal"), compress) - require.NoError(t, err) + // Fill the wal and corrupt it. + { + w, err := wlog.New(nil, nil, filepath.Join(dir, "wal"), compress) + require.NoError(t, err) - for i := 1; i <= test.totalRecs; i++ { - // At this point insert a corrupted record. - if i-1 == test.expRecs { - require.NoError(t, w.Log(test.corrFunc(test.rec))) - continue + for i := 1; i <= test.totalRecs; i++ { + // At this point insert a corrupted record. 
+ if i-1 == test.expRecs { + require.NoError(t, w.Log(test.corrFunc(test.rec))) + continue + } + require.NoError(t, w.Log(test.rec)) } - require.NoError(t, w.Log(test.rec)) + + opts := DefaultHeadOptions() + opts.ChunkRange = 1 + opts.ChunkDirRoot = w.Dir() + h, err := NewHead(nil, nil, w, nil, opts, nil) + require.NoError(t, err) + require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal)) + initErr := h.Init(math.MinInt64) + + var cerr *wlog.CorruptionErr + require.ErrorAs(t, initErr, &cerr, "reading the wal didn't return corruption error") + require.NoError(t, h.Close()) // Head will close the wal as well. } - opts := DefaultHeadOptions() - opts.ChunkRange = 1 - opts.ChunkDirRoot = w.Dir() - h, err := NewHead(nil, nil, w, nil, opts, nil) - require.NoError(t, err) - require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal)) - initErr := h.Init(math.MinInt64) - - var cerr *wlog.CorruptionErr - require.ErrorAs(t, initErr, &cerr, "reading the wal didn't return corruption error") - require.NoError(t, h.Close()) // Head will close the wal as well. - } - - // Open the db to trigger a repair. - { - db, err := Open(dir, nil, nil, DefaultOptions(), nil) - require.NoError(t, err) - defer func() { - require.NoError(t, db.Close()) - }() - require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.walCorruptionsTotal)) - } - - // Read the wal content after the repair. - { - sr, err := wlog.NewSegmentsReader(filepath.Join(dir, "wal")) - require.NoError(t, err) - defer sr.Close() - r := wlog.NewReader(sr) - - var actRec int - for r.Next() { - actRec++ + // Open the db to trigger a repair. 
+ { + db, err := Open(dir, nil, nil, DefaultOptions(), nil) + require.NoError(t, err) + defer func() { + require.NoError(t, db.Close()) + }() + require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.walCorruptionsTotal)) } - require.NoError(t, r.Err()) - require.Equal(t, test.expRecs, actRec, "Wrong number of intact records") - } - }) + + // Read the wal content after the repair. + { + sr, err := wlog.NewSegmentsReader(filepath.Join(dir, "wal")) + require.NoError(t, err) + defer sr.Close() + r := wlog.NewReader(sr) + + var actRec int + for r.Next() { + actRec++ + } + require.NoError(t, r.Err()) + require.Equal(t, test.expRecs, actRec, "Wrong number of intact records") + } + }) + } } } } @@ -2663,72 +2755,76 @@ func TestWalRepair_DecodingError(t *testing.T) { // TestWblRepair_DecodingError ensures that a repair is run for an error // when decoding a record. func TestWblRepair_DecodingError(t *testing.T) { - var enc record.Encoder - corrFunc := func(rec []byte) []byte { - return rec[:3] - } - rec := enc.Samples([]record.RefSample{{Ref: 0, T: 99, V: 1}}, []byte{}) - totalRecs := 9 - expRecs := 5 - dir := t.TempDir() - - // Fill the wbl and corrupt it. - { - wal, err := wlog.New(nil, nil, filepath.Join(dir, "wal"), compression.None) - require.NoError(t, err) - wbl, err := wlog.New(nil, nil, filepath.Join(dir, "wbl"), compression.None) - require.NoError(t, err) - - for i := 1; i <= totalRecs; i++ { - // At this point insert a corrupted record. 
- if i-1 == expRecs { - require.NoError(t, wbl.Log(corrFunc(rec))) - continue + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + enc := record.Encoder{EnableSTStorage: enableSTStorage} + corrFunc := func(rec []byte) []byte { + return rec[:3] } - require.NoError(t, wbl.Log(rec)) - } + rec := enc.Samples([]record.RefSample{{Ref: 0, T: 99, V: 1}}, []byte{}) + totalRecs := 9 + expRecs := 5 + dir := t.TempDir() - opts := DefaultHeadOptions() - opts.ChunkRange = 1 - opts.ChunkDirRoot = wal.Dir() - opts.OutOfOrderCapMax.Store(30) - opts.OutOfOrderTimeWindow.Store(1000 * time.Minute.Milliseconds()) - h, err := NewHead(nil, nil, wal, wbl, opts, nil) - require.NoError(t, err) - require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal)) - initErr := h.Init(math.MinInt64) + // Fill the wbl and corrupt it. + { + wal, err := wlog.New(nil, nil, filepath.Join(dir, "wal"), compression.None) + require.NoError(t, err) + wbl, err := wlog.New(nil, nil, filepath.Join(dir, "wbl"), compression.None) + require.NoError(t, err) - var elb *errLoadWbl - require.ErrorAs(t, initErr, &elb) // Wbl errors are wrapped into errLoadWbl, make sure we can unwrap it. + for i := 1; i <= totalRecs; i++ { + // At this point insert a corrupted record. + if i-1 == expRecs { + require.NoError(t, wbl.Log(corrFunc(rec))) + continue + } + require.NoError(t, wbl.Log(rec)) + } - var cerr *wlog.CorruptionErr - require.ErrorAs(t, initErr, &cerr, "reading the wal didn't return corruption error") - require.NoError(t, h.Close()) // Head will close the wal as well. 
- } + opts := DefaultHeadOptions() + opts.ChunkRange = 1 + opts.ChunkDirRoot = wal.Dir() + opts.OutOfOrderCapMax.Store(30) + opts.OutOfOrderTimeWindow.Store(1000 * time.Minute.Milliseconds()) + h, err := NewHead(nil, nil, wal, wbl, opts, nil) + require.NoError(t, err) + require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal)) + initErr := h.Init(math.MinInt64) - // Open the db to trigger a repair. - { - db, err := Open(dir, nil, nil, DefaultOptions(), nil) - require.NoError(t, err) - defer func() { - require.NoError(t, db.Close()) - }() - require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.walCorruptionsTotal)) - } + var elb *errLoadWbl + require.ErrorAs(t, initErr, &elb) // Wbl errors are wrapped into errLoadWbl, make sure we can unwrap it. - // Read the wbl content after the repair. - { - sr, err := wlog.NewSegmentsReader(filepath.Join(dir, "wbl")) - require.NoError(t, err) - defer sr.Close() - r := wlog.NewReader(sr) + var cerr *wlog.CorruptionErr + require.ErrorAs(t, initErr, &cerr, "reading the wal didn't return corruption error") + require.NoError(t, h.Close()) // Head will close the wal as well. + } - var actRec int - for r.Next() { - actRec++ - } - require.NoError(t, r.Err()) - require.Equal(t, expRecs, actRec, "Wrong number of intact records") + // Open the db to trigger a repair. + { + db, err := Open(dir, nil, nil, DefaultOptions(), nil) + require.NoError(t, err) + defer func() { + require.NoError(t, db.Close()) + }() + require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.walCorruptionsTotal)) + } + + // Read the wbl content after the repair. 
+ { + sr, err := wlog.NewSegmentsReader(filepath.Join(dir, "wbl")) + require.NoError(t, err) + defer sr.Close() + r := wlog.NewReader(sr) + + var actRec int + for r.Next() { + actRec++ + } + require.NoError(t, r.Err()) + require.Equal(t, expRecs, actRec, "Wrong number of intact records") + } + }) } } @@ -2762,10 +2858,10 @@ func TestHeadReadWriterRepair(t *testing.T) { require.True(t, created, "series was not created") for i := range 7 { - ok, chunkCreated := s.append(int64(i*chunkRange), float64(i*chunkRange), 0, cOpts) + ok, chunkCreated := s.append(0, int64(i*chunkRange), float64(i*chunkRange), 0, cOpts) require.True(t, ok, "series append failed") require.True(t, chunkCreated, "chunk was not created") - ok, chunkCreated = s.append(int64(i*chunkRange)+chunkRange-1, float64(i*chunkRange), 0, cOpts) + ok, chunkCreated = s.append(0, int64(i*chunkRange)+chunkRange-1, float64(i*chunkRange), 0, cOpts) require.True(t, ok, "series append failed") require.False(t, chunkCreated, "chunk was created") h.chunkDiskMapper.CutNewFile() @@ -3105,7 +3201,7 @@ func TestIsolationAppendIDZeroIsNoop(t *testing.T) { s, _, _ := h.getOrCreate(1, labels.FromStrings("a", "1"), false) - ok, _ := s.append(0, 0, 0, cOpts) + ok, _ := s.append(0, 0, 0, 0, cOpts) require.True(t, ok, "Series append failed.") require.Equal(t, 0, int(s.txs.txIDCount), "Series should not have an appendID after append with appendID=0.") } @@ -3663,7 +3759,7 @@ func TestIteratorSeekIntoBuffer(t *testing.T) { s := newMemSeries(labels.Labels{}, 1, 0, defaultIsolationDisabled, false) for i := range 7 { - ok, _ := s.append(int64(i), float64(i), 0, cOpts) + ok, _ := s.append(0, int64(i), float64(i), 0, cOpts) require.True(t, ok, "sample append failed") } @@ -4359,289 +4455,293 @@ func TestHistogramInWALAndMmapChunk(t *testing.T) { } func TestChunkSnapshot(t *testing.T) { - head, _ := newTestHead(t, 120*4, compression.None, false) - defer func() { - head.opts.EnableMemorySnapshotOnShutdown = false - require.NoError(t, 
head.Close()) - }() + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + head, _ := newTestHead(t, 120*4, compression.None, false) + defer func() { + head.opts.EnableMemorySnapshotOnShutdown = false + require.NoError(t, head.Close()) + }() - type ex struct { - seriesLabels labels.Labels - e exemplar.Exemplar - } - - numSeries := 10 - expSeries := make(map[string][]chunks.Sample) - expHist := make(map[string][]chunks.Sample) - expFloatHist := make(map[string][]chunks.Sample) - expTombstones := make(map[storage.SeriesRef]tombstones.Intervals) - expExemplars := make([]ex, 0) - histograms := tsdbutil.GenerateTestGaugeHistograms(481) - floatHistogram := tsdbutil.GenerateTestGaugeFloatHistograms(481) - - addExemplar := func(app storage.Appender, ref storage.SeriesRef, lbls labels.Labels, ts int64) { - e := ex{ - seriesLabels: lbls, - e: exemplar.Exemplar{ - Labels: labels.FromStrings("trace_id", strconv.Itoa(rand.Int())), - Value: rand.Float64(), - Ts: ts, - }, - } - expExemplars = append(expExemplars, e) - _, err := app.AppendExemplar(ref, e.seriesLabels, e.e) - require.NoError(t, err) - } - - checkSamples := func() { - q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) - require.Equal(t, expSeries, series) - } - checkHistograms := func() { - q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "hist", "baz.*")) - require.Equal(t, expHist, series) - } - checkFloatHistograms := func() { - q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "floathist", "bat.*")) - require.Equal(t, expFloatHist, series) - } - checkTombstones := func() { - tr, err := head.Tombstones() 
- require.NoError(t, err) - actTombstones := make(map[storage.SeriesRef]tombstones.Intervals) - require.NoError(t, tr.Iter(func(ref storage.SeriesRef, itvs tombstones.Intervals) error { - for _, itv := range itvs { - actTombstones[ref].Add(itv) + type ex struct { + seriesLabels labels.Labels + e exemplar.Exemplar + } + + numSeries := 10 + expSeries := make(map[string][]chunks.Sample) + expHist := make(map[string][]chunks.Sample) + expFloatHist := make(map[string][]chunks.Sample) + expTombstones := make(map[storage.SeriesRef]tombstones.Intervals) + expExemplars := make([]ex, 0) + histograms := tsdbutil.GenerateTestGaugeHistograms(481) + floatHistogram := tsdbutil.GenerateTestGaugeFloatHistograms(481) + + addExemplar := func(app storage.Appender, ref storage.SeriesRef, lbls labels.Labels, ts int64) { + e := ex{ + seriesLabels: lbls, + e: exemplar.Exemplar{ + Labels: labels.FromStrings("trace_id", strconv.Itoa(rand.Int())), + Value: rand.Float64(), + Ts: ts, + }, + } + expExemplars = append(expExemplars, e) + _, err := app.AppendExemplar(ref, e.seriesLabels, e.e) + require.NoError(t, err) + } + + checkSamples := func() { + q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) + require.Equal(t, expSeries, series) + } + checkHistograms := func() { + q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "hist", "baz.*")) + require.Equal(t, expHist, series) + } + checkFloatHistograms := func() { + q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "floathist", "bat.*")) + require.Equal(t, expFloatHist, series) + } + checkTombstones := func() { + tr, err := head.Tombstones() + require.NoError(t, err) + actTombstones := 
make(map[storage.SeriesRef]tombstones.Intervals) + require.NoError(t, tr.Iter(func(ref storage.SeriesRef, itvs tombstones.Intervals) error { + for _, itv := range itvs { + actTombstones[ref].Add(itv) + } + return nil + })) + require.Equal(t, expTombstones, actTombstones) + } + checkExemplars := func() { + actExemplars := make([]ex, 0, len(expExemplars)) + err := head.exemplars.IterateExemplars(func(seriesLabels labels.Labels, e exemplar.Exemplar) error { + actExemplars = append(actExemplars, ex{ + seriesLabels: seriesLabels, + e: e, + }) + return nil + }) + require.NoError(t, err) + // Verifies both existence of right exemplars and order of exemplars in the buffer. + testutil.RequireEqualWithOptions(t, expExemplars, actExemplars, []cmp.Option{cmp.AllowUnexported(ex{})}) + } + + var ( + wlast, woffset int + err error + ) + + closeHeadAndCheckSnapshot := func() { + require.NoError(t, head.Close()) + + _, sidx, soffset, err := LastChunkSnapshot(head.opts.ChunkDirRoot) + require.NoError(t, err) + require.Equal(t, wlast, sidx) + require.Equal(t, woffset, soffset) + } + + openHeadAndCheckReplay := func() { + w, err := wlog.NewSize(nil, nil, head.wal.Dir(), 32768, compression.None) + require.NoError(t, err) + head, err = NewHead(nil, nil, w, nil, head.opts, nil) + require.NoError(t, err) + require.NoError(t, head.Init(math.MinInt64)) + + checkSamples() + checkHistograms() + checkFloatHistograms() + checkTombstones() + checkExemplars() + } + + { // Initial data that goes into snapshot. + // Add some initial samples with >=1 m-map chunk. + app := head.Appender(context.Background()) + for i := 1; i <= numSeries; i++ { + lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) + lblStr := lbls.String() + lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) + lblsHistStr := lblsHist.String() + lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) + lblsFloatHistStr := lblsFloatHist.String() + + // 240 samples should m-map at least 1 chunk. 
+ for ts := int64(1); ts <= 240; ts++ { + val := rand.Float64() + expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) + ref, err := app.Append(0, lbls, ts, val) + require.NoError(t, err) + + hist := histograms[int(ts)] + expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) + _, err = app.AppendHistogram(0, lblsHist, ts, hist, nil) + require.NoError(t, err) + + floatHist := floatHistogram[int(ts)] + expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) + _, err = app.AppendHistogram(0, lblsFloatHist, ts, nil, floatHist) + require.NoError(t, err) + + // Add an exemplar and to create multiple WAL records. + if ts%10 == 0 { + addExemplar(app, ref, lbls, ts) + require.NoError(t, app.Commit()) + app = head.Appender(context.Background()) + } + } + } + require.NoError(t, app.Commit()) + + // Add some tombstones. + enc := record.Encoder{EnableSTStorage: enableSTStorage} + for i := 1; i <= numSeries; i++ { + ref := storage.SeriesRef(i) + itvs := tombstones.Intervals{ + {Mint: 1234, Maxt: 2345}, + {Mint: 3456, Maxt: 4567}, + } + for _, itv := range itvs { + expTombstones[ref].Add(itv) + } + head.tombstones.AddInterval(ref, itvs...) + err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ + {Ref: ref, Intervals: itvs}, + }, nil)) + require.NoError(t, err) + } + } + + // These references should be the ones used for the snapshot. + wlast, woffset, err = head.wal.LastSegmentAndOffset() + require.NoError(t, err) + if woffset != 0 && woffset < 32*1024 { + // The page is always filled before taking the snapshot. + woffset = 32 * 1024 + } + + { + // Creating snapshot and verifying it. + head.opts.EnableMemorySnapshotOnShutdown = true + closeHeadAndCheckSnapshot() // This will create a snapshot. + + // Test the replay of snapshot. + openHeadAndCheckReplay() + } + + { // Additional data to only include in WAL and m-mapped chunks and not snapshot. 
This mimics having an old snapshot on disk. + // Add more samples. + app := head.Appender(context.Background()) + for i := 1; i <= numSeries; i++ { + lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) + lblStr := lbls.String() + lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) + lblsHistStr := lblsHist.String() + lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) + lblsFloatHistStr := lblsFloatHist.String() + + // 240 samples should m-map at least 1 chunk. + for ts := int64(241); ts <= 480; ts++ { + val := rand.Float64() + expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) + ref, err := app.Append(0, lbls, ts, val) + require.NoError(t, err) + + hist := histograms[int(ts)] + expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) + _, err = app.AppendHistogram(0, lblsHist, ts, hist, nil) + require.NoError(t, err) + + floatHist := floatHistogram[int(ts)] + expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) + _, err = app.AppendHistogram(0, lblsFloatHist, ts, nil, floatHist) + require.NoError(t, err) + + // Add an exemplar and to create multiple WAL records. + if ts%10 == 0 { + addExemplar(app, ref, lbls, ts) + require.NoError(t, app.Commit()) + app = head.Appender(context.Background()) + } + } + } + require.NoError(t, app.Commit()) + + // Add more tombstones. + enc := record.Encoder{EnableSTStorage: enableSTStorage} + for i := 1; i <= numSeries; i++ { + ref := storage.SeriesRef(i) + itvs := tombstones.Intervals{ + {Mint: 12345, Maxt: 23456}, + {Mint: 34567, Maxt: 45678}, + } + for _, itv := range itvs { + expTombstones[ref].Add(itv) + } + head.tombstones.AddInterval(ref, itvs...) + err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ + {Ref: ref, Intervals: itvs}, + }, nil)) + require.NoError(t, err) + } + } + { + // Close Head and verify that new snapshot was not created. 
+ head.opts.EnableMemorySnapshotOnShutdown = false + closeHeadAndCheckSnapshot() // This should not create a snapshot. + + // Test the replay of snapshot, m-map chunks, and WAL. + head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. + openHeadAndCheckReplay() + } + + // Creating another snapshot should delete the older snapshot and replay still works fine. + wlast, woffset, err = head.wal.LastSegmentAndOffset() + require.NoError(t, err) + if woffset != 0 && woffset < 32*1024 { + // The page is always filled before taking the snapshot. + woffset = 32 * 1024 + } + + { + // Close Head and verify that new snapshot was created. + closeHeadAndCheckSnapshot() + + // Verify that there is only 1 snapshot. + files, err := os.ReadDir(head.opts.ChunkDirRoot) + require.NoError(t, err) + snapshots := 0 + for i := len(files) - 1; i >= 0; i-- { + fi := files[i] + if strings.HasPrefix(fi.Name(), chunkSnapshotPrefix) { + snapshots++ + require.Equal(t, chunkSnapshotDir(wlast, woffset), fi.Name()) + } + } + require.Equal(t, 1, snapshots) + + // Test the replay of snapshot. + head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. + + // Disabling exemplars to check that it does not hard fail replay + // https://github.com/prometheus/prometheus/issues/9437#issuecomment-933285870. 
+ head.opts.EnableExemplarStorage = false + head.opts.MaxExemplars.Store(0) + expExemplars = expExemplars[:0] + + openHeadAndCheckReplay() + + require.Equal(t, 0.0, prom_testutil.ToFloat64(head.metrics.snapshotReplayErrorTotal)) } - return nil - })) - require.Equal(t, expTombstones, actTombstones) - } - checkExemplars := func() { - actExemplars := make([]ex, 0, len(expExemplars)) - err := head.exemplars.IterateExemplars(func(seriesLabels labels.Labels, e exemplar.Exemplar) error { - actExemplars = append(actExemplars, ex{ - seriesLabels: seriesLabels, - e: e, - }) - return nil }) - require.NoError(t, err) - // Verifies both existence of right exemplars and order of exemplars in the buffer. - testutil.RequireEqualWithOptions(t, expExemplars, actExemplars, []cmp.Option{cmp.AllowUnexported(ex{})}) - } - - var ( - wlast, woffset int - err error - ) - - closeHeadAndCheckSnapshot := func() { - require.NoError(t, head.Close()) - - _, sidx, soffset, err := LastChunkSnapshot(head.opts.ChunkDirRoot) - require.NoError(t, err) - require.Equal(t, wlast, sidx) - require.Equal(t, woffset, soffset) - } - - openHeadAndCheckReplay := func() { - w, err := wlog.NewSize(nil, nil, head.wal.Dir(), 32768, compression.None) - require.NoError(t, err) - head, err = NewHead(nil, nil, w, nil, head.opts, nil) - require.NoError(t, err) - require.NoError(t, head.Init(math.MinInt64)) - - checkSamples() - checkHistograms() - checkFloatHistograms() - checkTombstones() - checkExemplars() - } - - { // Initial data that goes into snapshot. - // Add some initial samples with >=1 m-map chunk. 
- app := head.Appender(context.Background()) - for i := 1; i <= numSeries; i++ { - lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) - lblStr := lbls.String() - lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) - lblsHistStr := lblsHist.String() - lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) - lblsFloatHistStr := lblsFloatHist.String() - - // 240 samples should m-map at least 1 chunk. - for ts := int64(1); ts <= 240; ts++ { - val := rand.Float64() - expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) - ref, err := app.Append(0, lbls, ts, val) - require.NoError(t, err) - - hist := histograms[int(ts)] - expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) - _, err = app.AppendHistogram(0, lblsHist, ts, hist, nil) - require.NoError(t, err) - - floatHist := floatHistogram[int(ts)] - expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) - _, err = app.AppendHistogram(0, lblsFloatHist, ts, nil, floatHist) - require.NoError(t, err) - - // Add an exemplar and to create multiple WAL records. - if ts%10 == 0 { - addExemplar(app, ref, lbls, ts) - require.NoError(t, app.Commit()) - app = head.Appender(context.Background()) - } - } - } - require.NoError(t, app.Commit()) - - // Add some tombstones. - var enc record.Encoder - for i := 1; i <= numSeries; i++ { - ref := storage.SeriesRef(i) - itvs := tombstones.Intervals{ - {Mint: 1234, Maxt: 2345}, - {Mint: 3456, Maxt: 4567}, - } - for _, itv := range itvs { - expTombstones[ref].Add(itv) - } - head.tombstones.AddInterval(ref, itvs...) - err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ - {Ref: ref, Intervals: itvs}, - }, nil)) - require.NoError(t, err) - } - } - - // These references should be the ones used for the snapshot. 
- wlast, woffset, err = head.wal.LastSegmentAndOffset() - require.NoError(t, err) - if woffset != 0 && woffset < 32*1024 { - // The page is always filled before taking the snapshot. - woffset = 32 * 1024 - } - - { - // Creating snapshot and verifying it. - head.opts.EnableMemorySnapshotOnShutdown = true - closeHeadAndCheckSnapshot() // This will create a snapshot. - - // Test the replay of snapshot. - openHeadAndCheckReplay() - } - - { // Additional data to only include in WAL and m-mapped chunks and not snapshot. This mimics having an old snapshot on disk. - // Add more samples. - app := head.Appender(context.Background()) - for i := 1; i <= numSeries; i++ { - lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) - lblStr := lbls.String() - lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) - lblsHistStr := lblsHist.String() - lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) - lblsFloatHistStr := lblsFloatHist.String() - - // 240 samples should m-map at least 1 chunk. - for ts := int64(241); ts <= 480; ts++ { - val := rand.Float64() - expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) - ref, err := app.Append(0, lbls, ts, val) - require.NoError(t, err) - - hist := histograms[int(ts)] - expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) - _, err = app.AppendHistogram(0, lblsHist, ts, hist, nil) - require.NoError(t, err) - - floatHist := floatHistogram[int(ts)] - expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) - _, err = app.AppendHistogram(0, lblsFloatHist, ts, nil, floatHist) - require.NoError(t, err) - - // Add an exemplar and to create multiple WAL records. - if ts%10 == 0 { - addExemplar(app, ref, lbls, ts) - require.NoError(t, app.Commit()) - app = head.Appender(context.Background()) - } - } - } - require.NoError(t, app.Commit()) - - // Add more tombstones. 
- var enc record.Encoder - for i := 1; i <= numSeries; i++ { - ref := storage.SeriesRef(i) - itvs := tombstones.Intervals{ - {Mint: 12345, Maxt: 23456}, - {Mint: 34567, Maxt: 45678}, - } - for _, itv := range itvs { - expTombstones[ref].Add(itv) - } - head.tombstones.AddInterval(ref, itvs...) - err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ - {Ref: ref, Intervals: itvs}, - }, nil)) - require.NoError(t, err) - } - } - { - // Close Head and verify that new snapshot was not created. - head.opts.EnableMemorySnapshotOnShutdown = false - closeHeadAndCheckSnapshot() // This should not create a snapshot. - - // Test the replay of snapshot, m-map chunks, and WAL. - head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. - openHeadAndCheckReplay() - } - - // Creating another snapshot should delete the older snapshot and replay still works fine. - wlast, woffset, err = head.wal.LastSegmentAndOffset() - require.NoError(t, err) - if woffset != 0 && woffset < 32*1024 { - // The page is always filled before taking the snapshot. - woffset = 32 * 1024 - } - - { - // Close Head and verify that new snapshot was created. - closeHeadAndCheckSnapshot() - - // Verify that there is only 1 snapshot. - files, err := os.ReadDir(head.opts.ChunkDirRoot) - require.NoError(t, err) - snapshots := 0 - for i := len(files) - 1; i >= 0; i-- { - fi := files[i] - if strings.HasPrefix(fi.Name(), chunkSnapshotPrefix) { - snapshots++ - require.Equal(t, chunkSnapshotDir(wlast, woffset), fi.Name()) - } - } - require.Equal(t, 1, snapshots) - - // Test the replay of snapshot. - head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. - - // Disabling exemplars to check that it does not hard fail replay - // https://github.com/prometheus/prometheus/issues/9437#issuecomment-933285870. 
- head.opts.EnableExemplarStorage = false - head.opts.MaxExemplars.Store(0) - expExemplars = expExemplars[:0] - - openHeadAndCheckReplay() - - require.Equal(t, 0.0, prom_testutil.ToFloat64(head.metrics.snapshotReplayErrorTotal)) } } @@ -5369,70 +5469,74 @@ func TestAppendingDifferentEncodingToSameSeries(t *testing.T) { // Tests https://github.com/prometheus/prometheus/issues/9725. func TestChunkSnapshotReplayBug(t *testing.T) { - dir := t.TempDir() - wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compression.Snappy) - require.NoError(t, err) + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + dir := t.TempDir() + wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compression.Snappy) + require.NoError(t, err) - // Write few series records and samples such that the series references are not in order in the WAL - // for status_code="200". - var buf []byte - for i := 1; i <= 1000; i++ { - var ref chunks.HeadSeriesRef - if i <= 500 { - ref = chunks.HeadSeriesRef(i * 100) - } else { - ref = chunks.HeadSeriesRef((i - 500) * 50) - } - seriesRec := record.RefSeries{ - Ref: ref, - Labels: labels.FromStrings( - "__name__", "request_duration", - "status_code", "200", - "foo", fmt.Sprintf("baz%d", rand.Int()), - ), - } - // Add a sample so that the series is not garbage collected. - samplesRec := record.RefSample{Ref: ref, T: 1000, V: 1000} - var enc record.Encoder + // Write few series records and samples such that the series references are not in order in the WAL + // for status_code="200". 
+ var buf []byte + for i := 1; i <= 1000; i++ { + var ref chunks.HeadSeriesRef + if i <= 500 { + ref = chunks.HeadSeriesRef(i * 100) + } else { + ref = chunks.HeadSeriesRef((i - 500) * 50) + } + seriesRec := record.RefSeries{ + Ref: ref, + Labels: labels.FromStrings( + "__name__", "request_duration", + "status_code", "200", + "foo", fmt.Sprintf("baz%d", rand.Int()), + ), + } + // Add a sample so that the series is not garbage collected. + samplesRec := record.RefSample{Ref: ref, T: 1000, V: 1000} + enc := record.Encoder{EnableSTStorage: enableSTStorage} - rec := enc.Series([]record.RefSeries{seriesRec}, buf) - buf = rec[:0] - require.NoError(t, wal.Log(rec)) - rec = enc.Samples([]record.RefSample{samplesRec}, buf) - buf = rec[:0] - require.NoError(t, wal.Log(rec)) + rec := enc.Series([]record.RefSeries{seriesRec}, buf) + buf = rec[:0] + require.NoError(t, wal.Log(rec)) + rec = enc.Samples([]record.RefSample{samplesRec}, buf) + buf = rec[:0] + require.NoError(t, wal.Log(rec)) + } + + // Write a corrupt snapshot to fail the replay on startup. + snapshotName := chunkSnapshotDir(0, 100) + cpdir := filepath.Join(dir, snapshotName) + require.NoError(t, os.MkdirAll(cpdir, 0o777)) + + err = os.WriteFile(filepath.Join(cpdir, "00000000"), []byte{1, 5, 3, 5, 6, 7, 4, 2, 2}, 0o777) + require.NoError(t, err) + + opts := DefaultHeadOptions() + opts.ChunkDirRoot = dir + opts.EnableMemorySnapshotOnShutdown = true + head, err := NewHead(nil, nil, wal, nil, opts, nil) + require.NoError(t, err) + require.NoError(t, head.Init(math.MinInt64)) + defer func() { + require.NoError(t, head.Close()) + }() + + // Snapshot replay should error out. + require.Equal(t, 1.0, prom_testutil.ToFloat64(head.metrics.snapshotReplayErrorTotal)) + + // Querying `request_duration{status_code!="200"}` should return no series since all of + // them have status_code="200". 
+ q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, + labels.MustNewMatcher(labels.MatchEqual, "__name__", "request_duration"), + labels.MustNewMatcher(labels.MatchNotEqual, "status_code", "200"), + ) + require.Empty(t, series, "there should be no series found") + }) } - - // Write a corrupt snapshot to fail the replay on startup. - snapshotName := chunkSnapshotDir(0, 100) - cpdir := filepath.Join(dir, snapshotName) - require.NoError(t, os.MkdirAll(cpdir, 0o777)) - - err = os.WriteFile(filepath.Join(cpdir, "00000000"), []byte{1, 5, 3, 5, 6, 7, 4, 2, 2}, 0o777) - require.NoError(t, err) - - opts := DefaultHeadOptions() - opts.ChunkDirRoot = dir - opts.EnableMemorySnapshotOnShutdown = true - head, err := NewHead(nil, nil, wal, nil, opts, nil) - require.NoError(t, err) - require.NoError(t, head.Init(math.MinInt64)) - defer func() { - require.NoError(t, head.Close()) - }() - - // Snapshot replay should error out. - require.Equal(t, 1.0, prom_testutil.ToFloat64(head.metrics.snapshotReplayErrorTotal)) - - // Querying `request_duration{status_code!="200"}` should return no series since all of - // them have status_code="200". 
- q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, - labels.MustNewMatcher(labels.MatchEqual, "__name__", "request_duration"), - labels.MustNewMatcher(labels.MatchNotEqual, "status_code", "200"), - ) - require.Empty(t, series, "there should be no series found") } func TestChunkSnapshotTakenAfterIncompleteSnapshot(t *testing.T) { @@ -5542,7 +5646,7 @@ func testWBLReplay(t *testing.T, scenario sampleTypeScenario) { require.False(t, ok) require.NotNil(t, ms) - chks, err := ms.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64) + chks, err := ms.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, false) require.NoError(t, err) require.Len(t, chks, 1) @@ -7328,6 +7432,372 @@ func TestHistogramStalenessConversionMetrics(t *testing.T) { } } +// TestHeadAppender_WALEncoder_EnableSTStorage verifies that when EnableSTStorage +// is true the WAL encoder writes SamplesV2 records, and when false it writes +// plain Samples (V1) records. The bug was that log() always created a zero-value +// record.Encoder (EnableSTStorage=false), ignoring the head option. +func TestHeadAppender_WALEncoder_EnableSTStorage(t *testing.T) { + for _, enableST := range []bool{false, true} { + t.Run(fmt.Sprintf("enableSTStorage=%v", enableST), func(t *testing.T) { + opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) + opts.EnableSTStorage.Store(enableST) + opts.EnableXOR2Encoding.Store(enableST) + h, w := newTestHeadWithOptions(t, compression.None, opts) + + lbls := labels.FromStrings("foo", "bar") + app := h.AppenderV2(context.Background()) + for ts := int64(100); ts < 110; ts++ { + _, err := app.Append(0, lbls, 0, ts, float64(ts), nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + require.NoError(t, h.Close()) + + // Read WAL segments directly and check the sample record type. 
+ sr, err := wlog.NewSegmentsReader(w.Dir()) + require.NoError(t, err) + defer func() { require.NoError(t, sr.Close()) }() + + dec := record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) + r := wlog.NewReader(sr) + + var foundSampleRecord bool + for r.Next() { + rt := dec.Type(r.Record()) + switch rt { + case record.Samples: + require.False(t, enableST, "WAL contains Samples (V1) record but EnableSTStorage=true, expected SamplesV2") + foundSampleRecord = true + case record.SamplesV2: + require.True(t, enableST, "WAL contains SamplesV2 record but EnableSTStorage=false, expected Samples (V1)") + foundSampleRecord = true + } + } + require.NoError(t, r.Err()) + require.True(t, foundSampleRecord, "no sample record found in WAL") + }) + } +} + +// TestHeadAppender_WBLEncoder_EnableSTStorage verifies that when EnableSTStorage +// is true the WBL encoder writes SamplesV2 records for out-of-order samples, and +// when false it writes plain Samples (V1) records. The bug was that collectOOORecords() +// always created record.Encoder{EnableSTStorage: false}, ignoring the head option. 
+func TestHeadAppender_WBLEncoder_EnableSTStorage(t *testing.T) { + for _, enableST := range []bool{false, true} { + t.Run(fmt.Sprintf("enableSTStorage=%v", enableST), func(t *testing.T) { + dir := t.TempDir() + wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compression.None) + require.NoError(t, err) + wbl, err := wlog.NewSize(nil, nil, filepath.Join(dir, wlog.WblDirName), 32768, compression.None) + require.NoError(t, err) + + opts := DefaultHeadOptions() + opts.ChunkRange = DefaultBlockDuration + opts.ChunkDirRoot = dir + opts.OutOfOrderTimeWindow.Store(60 * time.Minute.Milliseconds()) + opts.EnableSTStorage.Store(enableST) + opts.EnableXOR2Encoding.Store(enableST) + + h, err := NewHead(nil, nil, wal, wbl, opts, nil) + require.NoError(t, err) + require.NoError(t, h.Init(0)) + t.Cleanup(func() { _ = h.Close() }) + + lbls := labels.FromStrings("foo", "bar") + + // Append an in-order sample to establish head maxt. + app := h.AppenderV2(context.Background()) + _, err = app.Append(0, lbls, 0, 200, 200, nil, nil, storage.AOptions{}) + require.NoError(t, err) + require.NoError(t, app.Commit()) + + // Append OOO samples; these are written to the WBL. + app = h.AppenderV2(context.Background()) + for ts := int64(100); ts < 110; ts++ { + _, err = app.Append(0, lbls, 0, ts, float64(ts), nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + + require.NoError(t, h.Close()) + + // Read WBL segments directly and check the sample record type. 
+ sr, err := wlog.NewSegmentsReader(filepath.Join(dir, wlog.WblDirName)) + require.NoError(t, err) + defer func() { require.NoError(t, sr.Close()) }() + + dec := record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) + r := wlog.NewReader(sr) + + var foundSampleRecord bool + for r.Next() { + rt := dec.Type(r.Record()) + switch rt { + case record.Samples: + require.False(t, enableST, "WBL contains Samples (V1) record but EnableSTStorage=true, expected SamplesV2") + foundSampleRecord = true + case record.SamplesV2: + require.True(t, enableST, "WBL contains SamplesV2 record but EnableSTStorage=false, expected Samples (V1)") + foundSampleRecord = true + } + } + require.NoError(t, r.Err()) + require.True(t, foundSampleRecord, "no sample record found in WBL") + }) + } +} + +// TestHeadAppender_STStorage_Disabled verifies that when EnableSTStorage is false, +// start timestamps are NOT stored in chunks (AtST returns 0). +func TestHeadAppender_STStorage_Disabled(t *testing.T) { + type sampleData struct { + st int64 + ts int64 + fSample float64 + } + + samples := []sampleData{ + {st: 10, ts: 100, fSample: 1.0}, + {st: 20, ts: 200, fSample: 2.0}, + {st: 30, ts: 300, fSample: 3.0}, + } + + opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) + opts.EnableSTStorage.Store(false) // Explicitly disable ST storage. 
+ h, _ := newTestHeadWithOptions(t, compression.None, opts) + + lbls := labels.FromStrings("foo", "bar") + + a := h.AppenderV2(context.Background()) + for _, s := range samples { + _, err := a.Append(0, lbls, s.st, s.ts, s.fSample, nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, a.Commit()) + + ctx := context.Background() + idxReader, err := h.Index() + require.NoError(t, err) + defer idxReader.Close() + + chkReader, err := h.Chunks() + require.NoError(t, err) + defer chkReader.Close() + + p, err := idxReader.Postings(ctx, "foo", "bar") + require.NoError(t, err) + + var lblBuilder labels.ScratchBuilder + require.True(t, p.Next()) + sRef := p.At() + + var chkMetas []chunks.Meta + require.NoError(t, idxReader.Series(sRef, &lblBuilder, &chkMetas)) + + for _, meta := range chkMetas { + chk, iterable, err := chkReader.ChunkOrIterable(meta) + require.NoError(t, err) + require.Nil(t, iterable) + + it := chk.Iterator(nil) + for it.Next() != chunkenc.ValNone { + st := it.AtST() + require.Equal(t, int64(0), st, "ST should be 0 when EnableSTStorage is false") + } + require.NoError(t, it.Err()) + } +} + +// TestHeadAppender_STStorage_WALReplay verifies that ST values are preserved +// across a WAL replay when EnableSTStorage is true. The bug was that Commit() +// hardcoded EnableSTStorage=false in the WAL encoder, so ST values were written +// as V1 records (without ST) and lost on replay. 
+func TestHeadAppender_STStorage_WALReplay(t *testing.T) { + opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) + opts.EnableSTStorage.Store(true) + opts.EnableXOR2Encoding.Store(true) + h, w := newTestHeadWithOptions(t, compression.None, opts) + + lbls := labels.FromStrings("foo", "bar") + const st = int64(50) + + a := h.AppenderV2(context.Background()) + for ts := int64(100); ts < 200; ts++ { + _, err := a.Append(0, lbls, st, ts, float64(ts), nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, a.Commit()) + require.NoError(t, h.Close()) + + // Reopen the head, triggering WAL replay. + w, err := wlog.New(nil, nil, w.Dir(), compression.None) + require.NoError(t, err) + opts.ChunkDirRoot = h.opts.ChunkDirRoot + h2, err := NewHead(nil, nil, w, nil, opts, nil) + require.NoError(t, err) + t.Cleanup(func() { _ = h2.Close() }) + require.NoError(t, h2.Init(0)) + + // Query and verify ST values survived the WAL replay. + q, err := NewBlockQuerier(h2, 100, 199) + require.NoError(t, err) + got := query(t, q, labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")) + + var expected []chunks.Sample + for ts := int64(100); ts < 200; ts++ { + expected = append(expected, sample{st, ts, float64(ts), nil, nil}) + } + require.Equal(t, map[string][]chunks.Sample{`{foo="bar"}`: expected}, got) +} + +// TestHeadAppender_STStorage_WBLReplay verifies that ST values are preserved +// across a WBL replay for out-of-order samples when EnableSTStorage is true. +// The bug was that collectOOORecords() hardcoded EnableSTStorage=false in the +// WBL encoder (acc.enc), so OOO sample ST values were written as V1 records +// (without ST) and lost on WBL replay. 
+func TestHeadAppender_STStorage_WBLReplay(t *testing.T) { + dir := t.TempDir() + wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compression.None) + require.NoError(t, err) + wbl, err := wlog.NewSize(nil, nil, filepath.Join(dir, wlog.WblDirName), 32768, compression.None) + require.NoError(t, err) + + opts := DefaultHeadOptions() + opts.ChunkRange = DefaultBlockDuration + opts.ChunkDirRoot = dir + opts.OutOfOrderTimeWindow.Store(60 * time.Minute.Milliseconds()) + opts.EnableSTStorage.Store(true) + opts.EnableXOR2Encoding.Store(true) + + h, err := NewHead(nil, nil, wal, wbl, opts, nil) + require.NoError(t, err) + require.NoError(t, h.Init(0)) + + lbls := labels.FromStrings("foo", "bar") + const st = int64(50) + + // Append an in-order sample to establish the head's maxt. + app := h.AppenderV2(context.Background()) + _, err = app.Append(0, lbls, st, 200, 200, nil, nil, storage.AOptions{}) + require.NoError(t, err) + require.NoError(t, app.Commit()) + + // Append OOO samples with non-zero ST; these go to the WBL. + // Use fewer than DefaultOutOfOrderCapMax (32) samples so they all stay in the + // OOO head chunk (not mmap'd) and are exclusively recovered via WBL replay. + app = h.AppenderV2(context.Background()) + for ts := int64(100); ts < 120; ts++ { + _, err = app.Append(0, lbls, st, ts, float64(ts), nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + + require.NoError(t, h.Close()) + + // Reopen the head, triggering WBL replay. + wal, err = wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compression.None) + require.NoError(t, err) + wbl, err = wlog.NewSize(nil, nil, filepath.Join(dir, wlog.WblDirName), 32768, compression.None) + require.NoError(t, err) + h2, err := NewHead(nil, nil, wal, wbl, opts, nil) + require.NoError(t, err) + t.Cleanup(func() { _ = h2.Close() }) + require.NoError(t, h2.Init(0)) + + // Access the OOO head chunk directly and verify ST values survived WBL replay. 
+ ms, created, err := h2.getOrCreate(lbls.Hash(), lbls, false) + require.NoError(t, err) + require.False(t, created) + require.NotNil(t, ms.ooo) + require.NotNil(t, ms.ooo.oooHeadChunk) + + chks, err := ms.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, true) + require.NoError(t, err) + require.Len(t, chks, 1) + + it := chks[0].chunk.Iterator(nil) + var got []chunks.Sample + for it.Next() != chunkenc.ValNone { + t2, v := it.At() + got = append(got, sample{it.AtST(), t2, v, nil, nil}) + } + require.NoError(t, it.Err()) + + var expected []chunks.Sample + for ts := int64(100); ts < 120; ts++ { + expected = append(expected, sample{st, ts, float64(ts), nil, nil}) + } + require.Equal(t, expected, got) +} + +// TestHeadAppender_STStorage_ChunkEncoding verifies that the correct chunk encoding +// is used based on EnableSTStorage setting. +func TestHeadAppender_STStorage_ChunkEncoding(t *testing.T) { + samples := []struct { + st int64 + ts int64 + fSample float64 + }{ + {st: 10, ts: 100, fSample: 1.0}, + {st: 20, ts: 200, fSample: 2.0}, + } + + for _, enableST := range []bool{false, true} { + t.Run(fmt.Sprintf("EnableSTStorage=%t", enableST), func(t *testing.T) { + opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) + opts.EnableSTStorage.Store(enableST) + opts.EnableXOR2Encoding.Store(enableST) // ST storage implies XOR2 encoding. 
+ h, _ := newTestHeadWithOptions(t, compression.None, opts) + + lbls := labels.FromStrings("foo", "bar") + a := h.Appender(context.Background()) + for _, s := range samples { + _, err := a.AppendSTZeroSample(0, lbls, s.ts, s.st) + require.NoError(t, err) + _, err = a.Append(0, lbls, s.ts, s.fSample) + require.NoError(t, err) + } + require.NoError(t, a.Commit()) + + ctx := context.Background() + idxReader, err := h.Index() + require.NoError(t, err) + defer idxReader.Close() + + chkReader, err := h.Chunks() + require.NoError(t, err) + defer chkReader.Close() + + p, err := idxReader.Postings(ctx, "foo", "bar") + require.NoError(t, err) + + var lblBuilder labels.ScratchBuilder + require.True(t, p.Next()) + sRef := p.At() + + var chkMetas []chunks.Meta + require.NoError(t, idxReader.Series(sRef, &lblBuilder, &chkMetas)) + require.NotEmpty(t, chkMetas) + + for _, meta := range chkMetas { + chk, iterable, err := chkReader.ChunkOrIterable(meta) + require.NoError(t, err) + require.Nil(t, iterable) + + encoding := chk.Encoding() + if enableST { + require.Equal(t, chunkenc.EncXOR2, encoding, + "Expected ST-capable encoding when EnableSTStorage is true") + } else { + require.Equal(t, chunkenc.EncXOR, encoding, + "Expected regular XOR encoding when EnableSTStorage is false") + } + } + }) + } +} + // TestWALReplayRaceWithStaleSeriesCompaction verifies that deleteSeriesByID correctly locks the // hash shard (not only the ref shard) when deleting from the hashes map. 
// The race only occurs when Prometheus restarts after having done a stale series compaction because
diff --git a/tsdb/head_wal.go b/tsdb/head_wal.go
index d39ee6b4f4..b8837a3aa9 100644
--- a/tsdb/head_wal.go
+++ b/tsdb/head_wal.go
@@ -169,7 +169,7 @@ func (h *Head) loadWAL(r *wlog.Reader, syms *labels.SymbolTable, multiRef map[ch
 return
 }
 decoded <- series
- case record.Samples:
+ case record.Samples, record.SamplesV2:
 samples := h.wlReplaySamplesPool.Get()[:0]
 samples, err = dec.Samples(r.Record(), samples)
 if err != nil {
@@ -646,6 +646,7 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmapp
 chunkDiskMapper: h.chunkDiskMapper,
 chunkRange: h.chunkRange.Load(),
 samplesPerChunk: h.opts.SamplesPerChunk,
+ useXOR2: h.opts.EnableXOR2Encoding.Load(),
 }
 
 for in := range wp.input {
@@ -676,7 +677,7 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmapp
 h.numStaleSeries.Dec()
 }
 
- if _, chunkCreated := ms.append(s.T, s.V, 0, appendChunkOpts); chunkCreated {
+ if _, chunkCreated := ms.append(s.ST, s.T, s.V, 0, appendChunkOpts); chunkCreated {
 h.metrics.chunksCreated.Inc()
 h.metrics.chunks.Inc()
 _ = ms.mmapChunks(h.chunkDiskMapper)
@@ -713,14 +714,16 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmapp
 newlyStale = newlyStale && !value.IsStaleNaN(ms.lastHistogramValue.Sum)
 staleToNonStale = value.IsStaleNaN(ms.lastHistogramValue.Sum) && !value.IsStaleNaN(s.h.Sum)
 }
- _, chunkCreated = ms.appendHistogram(s.t, s.h, 0, appendChunkOpts)
+ // TODO(krajorama,ywwg): Pass ST when available in WAL. 
+ _, chunkCreated = ms.appendHistogram(0, s.t, s.h, 0, appendChunkOpts)
 } else {
 newlyStale = value.IsStaleNaN(s.fh.Sum)
 if ms.lastFloatHistogramValue != nil {
 newlyStale = newlyStale && !value.IsStaleNaN(ms.lastFloatHistogramValue.Sum)
 staleToNonStale = value.IsStaleNaN(ms.lastFloatHistogramValue.Sum) && !value.IsStaleNaN(s.fh.Sum)
 }
- _, chunkCreated = ms.appendFloatHistogram(s.t, s.fh, 0, appendChunkOpts)
+ // TODO(krajorama,ywwg): Pass ST when available in WAL.
+ _, chunkCreated = ms.appendFloatHistogram(0, s.t, s.fh, 0, appendChunkOpts)
 }
 if newlyStale {
 h.numStaleSeries.Inc()
@@ -809,7 +812,7 @@ func (h *Head) loadWBL(r *wlog.Reader, syms *labels.SymbolTable, multiRef map[ch
 var err error
 rec := r.Record()
 switch dec.Type(rec) {
- case record.Samples:
+ case record.Samples, record.SamplesV2:
 samples := h.wlReplaySamplesPool.Get()[:0]
 samples, err = dec.Samples(rec, samples)
 if err != nil {
@@ -1090,6 +1093,12 @@ func (wp *wblSubsetProcessor) processWBLSamples(h *Head) (map[chunks.HeadSeriesR
 var unknownSampleRefs, unknownHistogramRefs uint64
 oooCapMax := h.opts.OutOfOrderCapMax.Load()
+ appendChunkOpts := chunkOpts{
+ chunkDiskMapper: h.chunkDiskMapper,
+ chunkRange: h.chunkRange.Load(),
+ samplesPerChunk: h.opts.SamplesPerChunk,
+ useXOR2: h.opts.EnableXOR2Encoding.Load(),
+ }
 // We don't check for minValidTime for ooo samples. 
mint, maxt := int64(math.MaxInt64), int64(math.MinInt64) for in := range wp.input { @@ -1109,7 +1118,7 @@ func (wp *wblSubsetProcessor) processWBLSamples(h *Head) (map[chunks.HeadSeriesR missingSeries[s.Ref] = struct{}{} continue } - ok, chunkCreated, _ := ms.insert(s.T, s.V, nil, nil, h.chunkDiskMapper, oooCapMax, h.logger) + ok, chunkCreated, _ := ms.insert(s.ST, s.T, s.V, nil, nil, appendChunkOpts, oooCapMax, h.logger) if chunkCreated { h.metrics.chunksCreated.Inc() h.metrics.chunks.Inc() @@ -1137,9 +1146,11 @@ func (wp *wblSubsetProcessor) processWBLSamples(h *Head) (map[chunks.HeadSeriesR var chunkCreated bool var ok bool if s.h != nil { - ok, chunkCreated, _ = ms.insert(s.t, 0, s.h, nil, h.chunkDiskMapper, oooCapMax, h.logger) + // TODO(krajorama,ywwg): Pass ST when available in WBL. + ok, chunkCreated, _ = ms.insert(0, s.t, 0, s.h, nil, appendChunkOpts, oooCapMax, h.logger) } else { - ok, chunkCreated, _ = ms.insert(s.t, 0, nil, s.fh, h.chunkDiskMapper, oooCapMax, h.logger) + // TODO(krajorama,ywwg): Pass ST when available in WBL. + ok, chunkCreated, _ = ms.insert(0, s.t, 0, nil, s.fh, appendChunkOpts, oooCapMax, h.logger) } if chunkCreated { h.metrics.chunksCreated.Inc() @@ -1253,7 +1264,7 @@ func decodeSeriesFromChunkSnapshot(d *record.Decoder, b []byte) (csr chunkSnapsh csr.mc.chunk = chk switch enc { - case chunkenc.EncXOR: + case chunkenc.EncXOR, chunkenc.EncXOR2: // Backwards-compatibility for old sampleBuf which had last 4 samples. for range 3 { _ = dec.Be64int64() @@ -1413,7 +1424,7 @@ func (h *Head) ChunkSnapshot() (*ChunkSnapshotStats, error) { // Assuming 100 bytes (overestimate) per exemplar, that's ~1MB. 
maxExemplarsPerRecord := 10000 batch := make([]record.RefExemplar, 0, maxExemplarsPerRecord) - enc := record.Encoder{} + enc := record.Encoder{EnableSTStorage: h.opts.EnableSTStorage.Load()} flushExemplars := func() error { if len(batch) == 0 { return nil diff --git a/tsdb/ooo_head.go b/tsdb/ooo_head.go index f9746c4c61..60cee8d005 100644 --- a/tsdb/ooo_head.go +++ b/tsdb/ooo_head.go @@ -34,14 +34,13 @@ func NewOOOChunk() *OOOChunk { // Insert inserts the sample such that order is maintained. // Returns false if insert was not possible due to the same timestamp already existing. -func (o *OOOChunk) Insert(t int64, v float64, h *histogram.Histogram, fh *histogram.FloatHistogram) bool { +func (o *OOOChunk) Insert(st, t int64, v float64, h *histogram.Histogram, fh *histogram.FloatHistogram) bool { // Although out-of-order samples can be out-of-order amongst themselves, we // are opinionated and expect them to be usually in-order meaning we could // try to append at the end first if the new timestamp is higher than the // last known timestamp. if len(o.samples) == 0 || t > o.samples[len(o.samples)-1].t { - // TODO(krajorama): pass ST. - o.samples = append(o.samples, sample{0, t, v, h, fh}) + o.samples = append(o.samples, sample{st, t, v, h, fh}) return true } @@ -50,8 +49,7 @@ func (o *OOOChunk) Insert(t int64, v float64, h *histogram.Histogram, fh *histog if i >= len(o.samples) { // none found. append it at the end - // TODO(krajorama): pass ST. - o.samples = append(o.samples, sample{0, t, v, h, fh}) + o.samples = append(o.samples, sample{st, t, v, h, fh}) return true } @@ -63,8 +61,7 @@ func (o *OOOChunk) Insert(t int64, v float64, h *histogram.Histogram, fh *histog // Expand length by 1 to make room. use a zero sample, we will overwrite it anyway. o.samples = append(o.samples, sample{}) copy(o.samples[i+1:], o.samples[i:]) - // TODO(krajorama): pass ST. 
- o.samples[i] = sample{0, t, v, h, fh} + o.samples[i] = sample{st, t, v, h, fh} return true } @@ -76,7 +73,7 @@ func (o *OOOChunk) NumSamples() int { // ToEncodedChunks returns chunks with the samples in the OOOChunk. // //nolint:revive -func (o *OOOChunk) ToEncodedChunks(mint, maxt int64) (chks []memChunk, err error) { +func (o *OOOChunk) ToEncodedChunks(mint, maxt int64, useXOR2 bool) (chks []memChunk, err error) { if len(o.samples) == 0 { return nil, nil } @@ -96,10 +93,13 @@ func (o *OOOChunk) ToEncodedChunks(mint, maxt int64) (chks []memChunk, err error if s.t > maxt { break } - encoding := chunkenc.EncXOR - if s.h != nil { + encoding := chunkenc.ValFloat.ChunkEncoding(useXOR2) + switch { + case s.h != nil: + // TODO(krajorama): use ST capable histogram chunk. encoding = chunkenc.EncHistogram - } else if s.fh != nil { + case s.fh != nil: + // TODO(krajorama): use ST capable float histogram chunk. encoding = chunkenc.EncFloatHistogram } @@ -111,15 +111,11 @@ func (o *OOOChunk) ToEncodedChunks(mint, maxt int64) (chks []memChunk, err error chks = append(chks, memChunk{chunk, cmint, cmaxt, nil}) } cmint = s.t - switch encoding { - case chunkenc.EncXOR: - chunk = chunkenc.NewXORChunk() - case chunkenc.EncHistogram: - chunk = chunkenc.NewHistogramChunk() - case chunkenc.EncFloatHistogram: - chunk = chunkenc.NewFloatHistogramChunk() - default: - chunk = chunkenc.NewXORChunk() + chunk, err = chunkenc.NewEmptyChunk(encoding) + if err != nil { + // This should never happen. No point using a default type as + // calling the wrong append function would panic. + return chks, err } app, err = chunk.Appender() if err != nil { @@ -127,18 +123,17 @@ func (o *OOOChunk) ToEncodedChunks(mint, maxt int64) (chks []memChunk, err error } } switch encoding { - case chunkenc.EncXOR: - // TODO(krajorama): pass ST. 
- app.Append(0, s.t, s.f) + case chunkenc.EncXOR, chunkenc.EncXOR2: + app.Append(s.st, s.t, s.f) case chunkenc.EncHistogram: + // TODO(krajorama): handle ST capable histogram chunk. // Ignoring ok is ok, since we don't want to compare to the wrong previous appender anyway. prevHApp, _ := prevApp.(*chunkenc.HistogramAppender) var ( newChunk chunkenc.Chunk recoded bool ) - // TODO(krajorama): pass ST. - newChunk, recoded, app, _ = app.AppendHistogram(prevHApp, 0, s.t, s.h, false) + newChunk, recoded, app, _ = app.AppendHistogram(prevHApp, s.st, s.t, s.h, false) if newChunk != nil { // A new chunk was allocated. if !recoded { chks = append(chks, memChunk{chunk, cmint, cmaxt, nil}) @@ -147,14 +142,14 @@ func (o *OOOChunk) ToEncodedChunks(mint, maxt int64) (chks []memChunk, err error chunk = newChunk } case chunkenc.EncFloatHistogram: + // TODO(krajorama): handle ST capable float histogram chunk. // Ignoring ok is ok, since we don't want to compare to the wrong previous appender anyway. prevHApp, _ := prevApp.(*chunkenc.FloatHistogramAppender) var ( newChunk chunkenc.Chunk recoded bool ) - // TODO(krajorama): pass ST. - newChunk, recoded, app, _ = app.AppendFloatHistogram(prevHApp, 0, s.t, s.fh, false) + newChunk, recoded, app, _ = app.AppendFloatHistogram(prevHApp, s.st, s.t, s.fh, false) if newChunk != nil { // A new chunk was allocated. 
if !recoded { chks = append(chks, memChunk{chunk, cmint, cmaxt, nil}) diff --git a/tsdb/ooo_head_read.go b/tsdb/ooo_head_read.go index 5d2347c2d7..ed3e7baeb5 100644 --- a/tsdb/ooo_head_read.go +++ b/tsdb/ooo_head_read.go @@ -77,7 +77,7 @@ func (oh *HeadAndOOOIndexReader) Series(ref storage.SeriesRef, builder *labels.S *chks = (*chks)[:0] if s.ooo != nil { - return getOOOSeriesChunks(s, oh.mint, oh.maxt, oh.lastGarbageCollectedMmapRef, 0, true, oh.inoMint, chks) + return getOOOSeriesChunks(s, oh.head.opts.EnableXOR2Encoding.Load(), oh.mint, oh.maxt, oh.lastGarbageCollectedMmapRef, 0, true, oh.inoMint, chks) } *chks = appendSeriesChunks(s, oh.inoMint, oh.maxt, *chks) return nil @@ -88,7 +88,7 @@ func (oh *HeadAndOOOIndexReader) Series(ref storage.SeriesRef, builder *labels.S // // maxMmapRef tells upto what max m-map chunk that we can consider. If it is non-0, then // the oooHeadChunk will not be considered. -func getOOOSeriesChunks(s *memSeries, mint, maxt int64, lastGarbageCollectedMmapRef, maxMmapRef chunks.ChunkDiskMapperRef, includeInOrder bool, inoMint int64, chks *[]chunks.Meta) error { +func getOOOSeriesChunks(s *memSeries, useXOR2 bool, mint, maxt int64, lastGarbageCollectedMmapRef, maxMmapRef chunks.ChunkDiskMapperRef, includeInOrder bool, inoMint int64, chks *[]chunks.Meta) error { tmpChks := make([]chunks.Meta, 0, len(s.ooo.oooMmappedChunks)) addChunk := func(minT, maxT int64, ref chunks.ChunkRef, chunk chunkenc.Chunk) { @@ -106,7 +106,7 @@ func getOOOSeriesChunks(s *memSeries, mint, maxt int64, lastGarbageCollectedMmap if c.OverlapsClosedInterval(mint, maxt) && maxMmapRef == 0 { ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.ooo.oooMmappedChunks)))) if len(c.chunk.samples) > 0 { // Empty samples happens in tests, at least. 
- chks, err := s.ooo.oooHeadChunk.chunk.ToEncodedChunks(c.minTime, c.maxTime) + chks, err := s.ooo.oooHeadChunk.chunk.ToEncodedChunks(c.minTime, c.maxTime, useXOR2) if err != nil { handleChunkWriteError(err) return nil @@ -347,7 +347,7 @@ func NewOOOCompactionHead(ctx context.Context, head *Head) (*OOOCompactionHead, } var lastMmapRef chunks.ChunkDiskMapperRef - mmapRefs := ms.mmapCurrentOOOHeadChunk(head.chunkDiskMapper, head.logger) + mmapRefs := ms.mmapCurrentOOOHeadChunk(chunkOpts{chunkDiskMapper: head.chunkDiskMapper, useXOR2: head.opts.EnableXOR2Encoding.Load()}, head.logger) if len(mmapRefs) == 0 && len(ms.ooo.oooMmappedChunks) > 0 { // Nothing was m-mapped. So take the mmapRef from the existing slice if it exists. mmapRefs = []chunks.ChunkDiskMapperRef{ms.ooo.oooMmappedChunks[len(ms.ooo.oooMmappedChunks)-1].ref} @@ -481,7 +481,7 @@ func (ir *OOOCompactionHeadIndexReader) Series(ref storage.SeriesRef, builder *l return nil } - return getOOOSeriesChunks(s, ir.ch.mint, ir.ch.maxt, 0, ir.ch.lastMmapRef, false, 0, chks) + return getOOOSeriesChunks(s, ir.ch.head.opts.EnableXOR2Encoding.Load(), ir.ch.mint, ir.ch.maxt, 0, ir.ch.lastMmapRef, false, 0, chks) } func (*OOOCompactionHeadIndexReader) SortedLabelValues(_ context.Context, _ string, _ *storage.LabelHints, _ ...*labels.Matcher) ([]string, error) { diff --git a/tsdb/ooo_head_test.go b/tsdb/ooo_head_test.go index 99cd357a30..d410835571 100644 --- a/tsdb/ooo_head_test.go +++ b/tsdb/ooo_head_test.go @@ -31,10 +31,11 @@ const testMaxSize int = 32 func valEven(pos int) int64 { return int64(pos*2 + 2) } // s[0]=2, s[1]=4, s[2]=6, ..., s[31]=64 - Predictable pre-existing values func valOdd(pos int) int64 { return int64(pos*2 + 1) } // s[0]=1, s[1]=3, s[2]=5, ..., s[31]=63 - New values will interject at chosen position because they sort before the pre-existing vals. 
-func makeEvenSampleSlice(n int, sampleFunc func(ts int64) sample) []sample { +func makeEvenSampleSlice(n int, sampleFunc func(st, ts int64) sample) []sample { s := make([]sample, n) for i := range n { - s[i] = sampleFunc(valEven(i)) + ts := valEven(i) + s[i] = sampleFunc(ts, ts) // Use ts as st for consistency } return s } @@ -43,23 +44,50 @@ func makeEvenSampleSlice(n int, sampleFunc func(ts int64) sample) []sample { // - Number of pre-existing samples anywhere from 0 to testMaxSize-1. // - Insert new sample before first pre-existing samples, after the last, and anywhere in between. // - With a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves. +// - With st=0 and st!=0 to verify ordering is based on sample.t, not sample.st. func TestOOOInsert(t *testing.T) { scenarios := map[string]struct { - sampleFunc func(ts int64) sample + sampleFunc func(st, ts int64) sample }{ - "float": { - sampleFunc: func(ts int64) sample { - return sample{t: ts, f: float64(ts)} + "float st=0": { + sampleFunc: func(st, ts int64) sample { + return sample{st: 0, t: ts, f: float64(ts)} }, }, - "integer histogram": { - sampleFunc: func(ts int64) sample { - return sample{t: ts, h: tsdbutil.GenerateTestHistogram(ts)} + "float st=ts": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts, t: ts, f: float64(ts)} }, }, - "float histogram": { - sampleFunc: func(ts int64) sample { - return sample{t: ts, fh: tsdbutil.GenerateTestFloatHistogram(ts)} + "float st=ts-100": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts - 100, t: ts, f: float64(ts)} + }, + }, + "float st descending while t ascending": { + // st values go in opposite direction of t to ensure ordering is by t. 
+ sampleFunc: func(st, ts int64) sample { + return sample{st: 1000 - ts, t: ts, f: float64(ts)} + }, + }, + "integer histogram st=0": { + sampleFunc: func(st, ts int64) sample { + return sample{st: 0, t: ts, h: tsdbutil.GenerateTestHistogram(ts)} + }, + }, + "integer histogram st=ts": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts, t: ts, h: tsdbutil.GenerateTestHistogram(ts)} + }, + }, + "float histogram st=0": { + sampleFunc: func(st, ts int64) sample { + return sample{st: 0, t: ts, fh: tsdbutil.GenerateTestFloatHistogram(ts)} + }, + }, + "float histogram st=ts": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts, t: ts, fh: tsdbutil.GenerateTestFloatHistogram(ts)} }, }, } @@ -71,7 +99,7 @@ func TestOOOInsert(t *testing.T) { } func testOOOInsert(t *testing.T, - sampleFunc func(ts int64) sample, + sampleFunc func(st, ts int64) sample, ) { for numPreExisting := 0; numPreExisting <= testMaxSize; numPreExisting++ { // For example, if we have numPreExisting 2, then: @@ -84,19 +112,22 @@ func testOOOInsert(t *testing.T, chunk := NewOOOChunk() chunk.samples = make([]sample, numPreExisting) chunk.samples = makeEvenSampleSlice(numPreExisting, sampleFunc) - newSample := sampleFunc(valOdd(insertPos)) - chunk.Insert(newSample.t, newSample.f, newSample.h, newSample.fh) + ts := valOdd(insertPos) + newSample := sampleFunc(ts, ts) // Use ts as st for consistency + chunk.Insert(newSample.st, newSample.t, newSample.f, newSample.h, newSample.fh) var expSamples []sample // Our expected new samples slice, will be first the original samples. for i := 0; i < insertPos; i++ { - expSamples = append(expSamples, sampleFunc(valEven(i))) + ts := valEven(i) + expSamples = append(expSamples, sampleFunc(ts, ts)) } // Then the new sample. expSamples = append(expSamples, newSample) // Followed by any original samples that were pushed back by the new one. 
for i := insertPos; i < numPreExisting; i++ { - expSamples = append(expSamples, sampleFunc(valEven(i))) + ts := valEven(i) + expSamples = append(expSamples, sampleFunc(ts, ts)) } require.Equal(t, expSamples, chunk.samples, "numPreExisting %d, insertPos %d", numPreExisting, insertPos) @@ -107,23 +138,50 @@ func testOOOInsert(t *testing.T, // TestOOOInsertDuplicate tests the correct behavior when inserting a sample that is a duplicate of any // pre-existing samples, with between 1 and testMaxSize pre-existing samples and // with a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves. +// With st=0 and st!=0 to verify duplicate detection is based on sample.t, not sample.st. func TestOOOInsertDuplicate(t *testing.T) { scenarios := map[string]struct { - sampleFunc func(ts int64) sample + sampleFunc func(st, ts int64) sample }{ - "float": { - sampleFunc: func(ts int64) sample { - return sample{t: ts, f: float64(ts)} + "float st=0": { + sampleFunc: func(st, ts int64) sample { + return sample{st: 0, t: ts, f: float64(ts)} }, }, - "integer histogram": { - sampleFunc: func(ts int64) sample { - return sample{t: ts, h: tsdbutil.GenerateTestHistogram(ts)} + "float st=ts": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts, t: ts, f: float64(ts)} }, }, - "float histogram": { - sampleFunc: func(ts int64) sample { - return sample{t: ts, fh: tsdbutil.GenerateTestFloatHistogram(ts)} + "float st=ts-100": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts - 100, t: ts, f: float64(ts)} + }, + }, + "float st descending while t ascending": { + // st values go in opposite direction of t to ensure duplicate detection is by t. 
+ sampleFunc: func(st, ts int64) sample { + return sample{st: 1000 - ts, t: ts, f: float64(ts)} + }, + }, + "integer histogram st=0": { + sampleFunc: func(st, ts int64) sample { + return sample{st: 0, t: ts, h: tsdbutil.GenerateTestHistogram(ts)} + }, + }, + "integer histogram st=ts": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts, t: ts, h: tsdbutil.GenerateTestHistogram(ts)} + }, + }, + "float histogram st=0": { + sampleFunc: func(st, ts int64) sample { + return sample{st: 0, t: ts, fh: tsdbutil.GenerateTestFloatHistogram(ts)} + }, + }, + "float histogram st=ts": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts, t: ts, fh: tsdbutil.GenerateTestFloatHistogram(ts)} }, }, } @@ -135,7 +193,7 @@ func TestOOOInsertDuplicate(t *testing.T) { } func testOOOInsertDuplicate(t *testing.T, - sampleFunc func(ts int64) sample, + sampleFunc func(st, ts int64) sample, ) { for num := 1; num <= testMaxSize; num++ { for dupPos := 0; dupPos < num; dupPos++ { @@ -145,7 +203,7 @@ func testOOOInsertDuplicate(t *testing.T, dupSample := chunk.samples[dupPos] dupSample.f = 0.123 - ok := chunk.Insert(dupSample.t, dupSample.f, dupSample.h, dupSample.fh) + ok := chunk.Insert(dupSample.st, dupSample.t, dupSample.f, dupSample.h, dupSample.fh) expSamples := makeEvenSampleSlice(num, sampleFunc) // We expect no change. 
require.False(t, ok) @@ -252,17 +310,17 @@ func TestOOOChunks_ToEncodedChunks(t *testing.T) { for _, s := range tc.samples { switch s.Type() { case chunkenc.ValFloat: - oooChunk.Insert(s.t, s.f, nil, nil) + oooChunk.Insert(s.st, s.t, s.f, nil, nil) case chunkenc.ValHistogram: - oooChunk.Insert(s.t, 0, s.h.Copy(), nil) + oooChunk.Insert(s.st, s.t, 0, s.h.Copy(), nil) case chunkenc.ValFloatHistogram: - oooChunk.Insert(s.t, 0, nil, s.fh.Copy()) + oooChunk.Insert(s.st, s.t, 0, nil, s.fh.Copy()) default: t.Fatalf("unexpected sample type %d", s.Type()) } } - chunks, err := oooChunk.ToEncodedChunks(math.MinInt64, math.MaxInt64) + chunks, err := oooChunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, false) require.NoError(t, err) require.Len(t, chunks, len(tc.expectedChunks), "number of chunks") sampleIndex := 0 @@ -308,3 +366,87 @@ func TestOOOChunks_ToEncodedChunks(t *testing.T) { }) } } + +// TestOOOChunks_ToEncodedChunks_WithST tests ToEncodedChunks with useXOR2=true and useXOR2=false for float samples. +// When useXOR2=true, st values are preserved; when useXOR2=false, AtST() returns 0. +// TODO(@krajorama): Add histogram test cases once ST storage is implemented for histograms. 
+func TestOOOChunks_ToEncodedChunks_WithST(t *testing.T) { + testCases := map[string]struct { + samples []sample + }{ + "floats with st=0": { + samples: []sample{ + {st: 0, t: 1000, f: 43.0}, + {st: 0, t: 1100, f: 42.0}, + }, + }, + "floats with st=t": { + samples: []sample{ + {st: 1000, t: 1000, f: 43.0}, + {st: 1100, t: 1100, f: 42.0}, + }, + }, + "floats with st=t-100": { + samples: []sample{ + {st: 900, t: 1000, f: 43.0}, + {st: 1000, t: 1100, f: 42.0}, + }, + }, + "floats with varying st": { + samples: []sample{ + {st: 500, t: 1000, f: 43.0}, + {st: 1100, t: 1100, f: 42.0}, // st == t + {st: 0, t: 1200, f: 41.0}, // st == 0 + }, + }, + } + + storageScenarios := []struct { + name string + useXOR2 bool + expectedEncoding chunkenc.Encoding + }{ + {"useXOR2=true", true, chunkenc.EncXOR2}, + {"useXOR2=false", false, chunkenc.EncXOR}, + } + + for name, tc := range testCases { + for _, ss := range storageScenarios { + t.Run(name+"/"+ss.name, func(t *testing.T) { + oooChunk := OOOChunk{} + for _, s := range tc.samples { + oooChunk.Insert(s.st, s.t, s.f, nil, nil) + } + + chunks, err := oooChunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, ss.useXOR2) + require.NoError(t, err) + require.Len(t, chunks, 1, "number of chunks") + + c := chunks[0] + require.Equal(t, ss.expectedEncoding, c.chunk.Encoding(), "chunk encoding") + require.Equal(t, tc.samples[0].t, c.minTime, "chunk minTime") + require.Equal(t, tc.samples[len(tc.samples)-1].t, c.maxTime, "chunk maxTime") + + // Verify samples can be read back with correct st and t values. + it := c.chunk.Iterator(nil) + sampleIndex := 0 + for it.Next() == chunkenc.ValFloat { + gotT, gotF := it.At() + gotST := it.AtST() + + if ss.useXOR2 { + // When useXOR2=true, st values should be preserved. + require.Equal(t, tc.samples[sampleIndex].st, gotST, "sample %d st", sampleIndex) + } else { + // When useXOR2=false, AtST() should return 0. 
+ require.Equal(t, int64(0), gotST, "sample %d st should be 0 when useXOR2=false", sampleIndex) + } + require.Equal(t, tc.samples[sampleIndex].t, gotT, "sample %d t", sampleIndex) + require.Equal(t, tc.samples[sampleIndex].f, gotF, "sample %d f", sampleIndex) + sampleIndex++ + } + require.Equal(t, len(tc.samples), sampleIndex, "number of samples") + }) + } + } +} diff --git a/tsdb/querier.go b/tsdb/querier.go index ac7a14e1b3..6d0cf36db4 100644 --- a/tsdb/querier.go +++ b/tsdb/querier.go @@ -866,7 +866,6 @@ func (p *populateWithDelChunkSeriesIterator) Next() bool { // populateCurrForSingleChunk sets the fields within p.currMetaWithChunk. This // should be called if the samples in p.currDelIter only form one chunk. -// TODO(krajorama): test ST when chunks support it. func (p *populateWithDelChunkSeriesIterator) populateCurrForSingleChunk() bool { valueType := p.currDelIter.Next() if valueType == chunkenc.ValNone { @@ -885,60 +884,47 @@ func (p *populateWithDelChunkSeriesIterator) populateCurrForSingleChunk() bool { st, t int64 err error ) - switch valueType { - case chunkenc.ValHistogram: - newChunk = chunkenc.NewHistogramChunk() - if app, err = newChunk.Appender(); err != nil { + newChunk, err = chunkenc.NewEmptyChunk(p.currMeta.Chunk.Encoding()) + if err != nil { + p.err = fmt.Errorf("create new chunk while re-encoding: %w", err) + return false + } + app, err = newChunk.Appender() + if err != nil { + p.err = fmt.Errorf("create appender while re-encoding: %w", err) + return false + } + +loop: + for vt := valueType; vt != chunkenc.ValNone; vt = p.currDelIter.Next() { + if vt != valueType { + err = fmt.Errorf("found value type %v in chunk with %v", vt, valueType) break } - for vt := valueType; vt != chunkenc.ValNone; vt = p.currDelIter.Next() { - if vt != chunkenc.ValHistogram { - err = fmt.Errorf("found value type %v in histogram chunk", vt) - break - } - var h *histogram.Histogram - t, h = p.currDelIter.AtHistogram(nil) - st = p.currDelIter.AtST() - _, _, app, err 
= app.AppendHistogram(nil, st, t, h, true) - if err != nil { - break - } - } - case chunkenc.ValFloat: - newChunk = chunkenc.NewXORChunk() - if app, err = newChunk.Appender(); err != nil { - break - } - for vt := valueType; vt != chunkenc.ValNone; vt = p.currDelIter.Next() { - if vt != chunkenc.ValFloat { - err = fmt.Errorf("found value type %v in float chunk", vt) - break - } + st = p.currDelIter.AtST() + switch vt { + case chunkenc.ValFloat: var v float64 t, v = p.currDelIter.At() - st = p.currDelIter.AtST() app.Append(st, t, v) - } - case chunkenc.ValFloatHistogram: - newChunk = chunkenc.NewFloatHistogramChunk() - if app, err = newChunk.Appender(); err != nil { - break - } - for vt := valueType; vt != chunkenc.ValNone; vt = p.currDelIter.Next() { - if vt != chunkenc.ValFloatHistogram { - err = fmt.Errorf("found value type %v in histogram chunk", vt) - break + case chunkenc.ValHistogram: + var h *histogram.Histogram + t, h = p.currDelIter.AtHistogram(nil) + _, _, app, err = app.AppendHistogram(nil, st, t, h, true) + if err != nil { + break loop } + case chunkenc.ValFloatHistogram: var h *histogram.FloatHistogram t, h = p.currDelIter.AtFloatHistogram(nil) - st = p.currDelIter.AtST() _, _, app, err = app.AppendFloatHistogram(nil, st, t, h, true) if err != nil { - break + break loop } + default: + err = fmt.Errorf("populateCurrForSingleChunk: value type %v unsupported", valueType) + break loop } - default: - err = fmt.Errorf("populateCurrForSingleChunk: value type %v unsupported", valueType) } if err != nil { @@ -958,7 +944,6 @@ func (p *populateWithDelChunkSeriesIterator) populateCurrForSingleChunk() bool { // populateChunksFromIterable reads the samples from currDelIter to create // chunks for chunksFromIterable. It also sets p.currMetaWithChunk to the first // chunk. -// TODO(krajorama): test ST when chunks support it. 
func (p *populateWithDelChunkSeriesIterator) populateChunksFromIterable() bool { p.chunksFromIterable = p.chunksFromIterable[:0] p.chunksFromIterableIdx = -1 @@ -982,30 +967,37 @@ func (p *populateWithDelChunkSeriesIterator) populateChunksFromIterable() bool { app chunkenc.Appender - newChunk chunkenc.Chunk - recoded bool - err error ) prevValueType := chunkenc.ValNone + hasTS := false for currentValueType := firstValueType; currentValueType != chunkenc.ValNone; currentValueType = p.currDelIter.Next() { + var ( + newChunk chunkenc.Chunk + recoded bool + ) // Check if the encoding has changed (i.e. we need to create a new // chunk as chunks can't have multiple encoding types). // For the first sample, the following condition will always be true as // ValNone != ValFloat | ValHistogram | ValFloatHistogram. - if currentValueType != prevValueType { + // Also if we need to store start time (ST), but the current chunk is + // not capable. + st = p.currDelIter.AtST() + needTS := st != 0 + if currentValueType != prevValueType || !hasTS && needTS { if prevValueType != chunkenc.ValNone { p.chunksFromIterable = append(p.chunksFromIterable, chunks.Meta{Chunk: currentChunk, MinTime: cmint, MaxTime: cmaxt}) } cmint = p.currDelIter.AtT() - if currentChunk, err = currentValueType.NewChunk(); err != nil { + if currentChunk, err = currentValueType.NewChunk(needTS); err != nil { break } if app, err = currentChunk.Appender(); err != nil { break } + hasTS = needTS } switch currentValueType { @@ -1013,14 +1005,12 @@ func (p *populateWithDelChunkSeriesIterator) populateChunksFromIterable() bool { { var v float64 t, v = p.currDelIter.At() - st = p.currDelIter.AtST() app.Append(st, t, v) } case chunkenc.ValHistogram: { var v *histogram.Histogram t, v = p.currDelIter.AtHistogram(nil) - st = p.currDelIter.AtST() // No need to set prevApp as AppendHistogram will set the // counter reset header for the appender that's returned. 
newChunk, recoded, app, err = app.AppendHistogram(nil, st, t, v, false) @@ -1029,7 +1019,6 @@ func (p *populateWithDelChunkSeriesIterator) populateChunksFromIterable() bool { { var v *histogram.FloatHistogram t, v = p.currDelIter.AtFloatHistogram(nil) - st = p.currDelIter.AtST() // No need to set prevApp as AppendHistogram will set the // counter reset header for the appender that's returned. newChunk, recoded, app, err = app.AppendFloatHistogram(nil, st, t, v, false) diff --git a/tsdb/querier_test.go b/tsdb/querier_test.go index 4387635959..de96755e23 100644 --- a/tsdb/querier_test.go +++ b/tsdb/querier_test.go @@ -2025,6 +2025,207 @@ func TestPopulateWithDelSeriesIterator_NextWithMinTime(t *testing.T) { } } +// TestPopulateWithDelSeriesIterator_WithST tests that ST (start time) values are +// correctly preserved when iterating through chunks with ST support. +func TestPopulateWithDelSeriesIterator_WithST(t *testing.T) { + // Samples with non-zero ST values to test ST preservation. + samplesWithST := [][]chunks.Sample{ + { + sample{st: 100, t: 1000, f: 1.0}, + sample{st: 200, t: 2000, f: 2.0}, + sample{st: 300, t: 3000, f: 3.0}, + }, + } + + // Samples with varying ST patterns. 
+ samplesVaryingST := [][]chunks.Sample{ + { + sample{st: 0, t: 1000, f: 1.0}, // st=0 + sample{st: 1500, t: 1500, f: 1.5}, // st=t + sample{st: 1900, t: 2000, f: 2.0}, // st=t-100 + sample{st: 500, t: 3000, f: 3.0}, // st < t + }, + } + + cases := []struct { + name string + samples [][]chunks.Sample + expected []chunks.Sample + }{ + { + name: "all samples have non-zero ST", + samples: samplesWithST, + expected: []chunks.Sample{ + sample{st: 100, t: 1000, f: 1.0}, + sample{st: 200, t: 2000, f: 2.0}, + sample{st: 300, t: 3000, f: 3.0}, + }, + }, + { + name: "samples with varying ST patterns", + samples: samplesVaryingST, + expected: []chunks.Sample{ + sample{st: 0, t: 1000, f: 1.0}, + sample{st: 1500, t: 1500, f: 1.5}, + sample{st: 1900, t: 2000, f: 2.0}, + sample{st: 500, t: 3000, f: 3.0}, + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + // Test with chunks (not iterables). + t.Run("chunks", func(t *testing.T) { + f, chkMetas := createFakeReaderAndNotPopulatedChunks(tc.samples...) + it := &populateWithDelSeriesIterator{} + it.reset(ulid.ULID{}, f, chkMetas, nil) + + var result []chunks.Sample + for it.Next() != chunkenc.ValNone { + st := it.AtST() + ts, v := it.At() + result = append(result, sample{st: st, t: ts, f: v}) + } + require.NoError(t, it.Err()) + require.Equal(t, tc.expected, result) + }) + + // Test with iterables. + t.Run("iterables", func(t *testing.T) { + f, chkMetas := createFakeReaderAndIterables(tc.samples...) + it := &populateWithDelSeriesIterator{} + it.reset(ulid.ULID{}, f, chkMetas, nil) + + var result []chunks.Sample + for it.Next() != chunkenc.ValNone { + st := it.AtST() + ts, v := it.At() + result = append(result, sample{st: st, t: ts, f: v}) + } + require.NoError(t, it.Err()) + require.Equal(t, tc.expected, result) + }) + }) + } +} + +// TestPopulateWithDelChunkSeriesIterator_WithST tests that ST (start time) values are +// correctly preserved when re-encoding chunks with deletions. 
+func TestPopulateWithDelChunkSeriesIterator_WithST(t *testing.T) { + samplesWithST := []chunks.Sample{ + sample{st: 100, t: 1000, f: 1.0}, + sample{st: 200, t: 2000, f: 2.0}, + sample{st: 300, t: 3000, f: 3.0}, + sample{st: 400, t: 4000, f: 4.0}, + sample{st: 500, t: 5000, f: 5.0}, + } + samplesWithNoLeadingST := []chunks.Sample{ + sample{st: 0, t: 1000, f: 1.0}, + sample{st: 0, t: 2000, f: 2.0}, + sample{st: 0, t: 3000, f: 3.0}, + sample{st: 400, t: 4000, f: 4.0}, + sample{st: 500, t: 5000, f: 5.0}, + } + + cases := []struct { + name string + samples [][]chunks.Sample + intervals tombstones.Intervals + expected []chunks.Sample + }{ + { + name: "no deletions - ST preserved", + samples: [][]chunks.Sample{samplesWithST}, + intervals: nil, + expected: samplesWithST, + }, + { + name: "with deletions - ST preserved in remaining samples", + samples: [][]chunks.Sample{samplesWithST}, + // Delete samples at t=2000 and t=4000. + intervals: tombstones.Intervals{{Mint: 2000, Maxt: 2000}, {Mint: 4000, Maxt: 4000}}, + expected: []chunks.Sample{ + sample{st: 100, t: 1000, f: 1.0}, + sample{st: 300, t: 3000, f: 3.0}, + sample{st: 500, t: 5000, f: 5.0}, + }, + }, + { + name: "delete first sample - ST preserved", + samples: [][]chunks.Sample{samplesWithST}, + // Delete first sample. + intervals: tombstones.Intervals{{Mint: 1000, Maxt: 1000}}, + expected: []chunks.Sample{ + sample{st: 200, t: 2000, f: 2.0}, + sample{st: 300, t: 3000, f: 3.0}, + sample{st: 400, t: 4000, f: 4.0}, + sample{st: 500, t: 5000, f: 5.0}, + }, + }, + { + // This tests that populateCurrForSingleChunk can handle + // chunks that don't start with ST, but introduce ST later. + name: "delete first sample - ST late preserved", + samples: [][]chunks.Sample{samplesWithNoLeadingST}, + // Delete first sample. 
+ intervals: tombstones.Intervals{{Mint: 1000, Maxt: 1000}}, + expected: []chunks.Sample{ + sample{st: 0, t: 2000, f: 2.0}, + sample{st: 0, t: 3000, f: 3.0}, + sample{st: 400, t: 4000, f: 4.0}, + sample{st: 500, t: 5000, f: 5.0}, + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + // Test with chunks that need re-encoding due to deletions. + t.Run("chunks", func(t *testing.T) { + f, chkMetas := createFakeReaderAndNotPopulatedChunks(tc.samples...) + it := &populateWithDelChunkSeriesIterator{} + it.reset(ulid.ULID{}, f, chkMetas, tc.intervals) + + var result []chunks.Sample + for it.Next() { + meta := it.At() + chkIt := meta.Chunk.Iterator(nil) + for chkIt.Next() != chunkenc.ValNone { + st := chkIt.AtST() + ts, v := chkIt.At() + result = append(result, sample{st: st, t: ts, f: v}) + } + require.NoError(t, chkIt.Err()) + } + require.NoError(t, it.Err()) + require.Equal(t, tc.expected, result) + }) + + // Test with iterables. + t.Run("iterables", func(t *testing.T) { + f, chkMetas := createFakeReaderAndIterables(tc.samples...) + it := &populateWithDelChunkSeriesIterator{} + it.reset(ulid.ULID{}, f, chkMetas, tc.intervals) + + var result []chunks.Sample + for it.Next() { + meta := it.At() + chkIt := meta.Chunk.Iterator(nil) + for chkIt.Next() != chunkenc.ValNone { + st := chkIt.AtST() + ts, v := chkIt.At() + result = append(result, sample{st: st, t: ts, f: v}) + } + require.NoError(t, chkIt.Err()) + } + require.NoError(t, it.Err()) + require.Equal(t, tc.expected, result) + }) + }) + } +} + // Test the cost of merging series sets for different number of merged sets and their size. // The subset are all equivalent so this does not capture merging of partial or non-overlapping sets well. // TODO(bwplotka): Merge with storage merged series set benchmark. 
diff --git a/tsdb/record/bench_test.go b/tsdb/record/bench_test.go new file mode 100644 index 0000000000..1420fffc46 --- /dev/null +++ b/tsdb/record/bench_test.go @@ -0,0 +1,207 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package record_test + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/prometheus/prometheus/tsdb/compression" + "github.com/prometheus/prometheus/tsdb/record" + "github.com/prometheus/prometheus/util/testrecord" +) + +func zeroOutSTs(samples []record.RefSample) []record.RefSample { + out := make([]record.RefSample, len(samples)) + for i := range samples { + out[i] = samples[i] + out[i].ST = 0 + } + return out +} + +func TestEncodeDecode(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + for _, tcase := range []testrecord.RefSamplesCase{ + testrecord.Realistic1000Samples, + testrecord.Realistic1000WithVariableSTSamples, + testrecord.Realistic1000WithConstSTSamples, + testrecord.WorstCase1000, + testrecord.WorstCase1000WithSTSamples, + } { + var ( + dec record.Decoder + buf []byte + enc = record.Encoder{EnableSTStorage: enableSTStorage} + ) + + s := testrecord.GenTestRefSamplesCase(t, tcase) + + { + got, err := dec.Samples(enc.Samples(s, nil), nil) + require.NoError(t, err) + // if ST is off, we expect all STs to be zero + expected := s + if !enableSTStorage { + expected = zeroOutSTs(s) + } + + require.Equal(t, expected, got) + } + + // 
With byte buffer (append!) + { + buf = make([]byte, 10, 1e5) + got, err := dec.Samples(enc.Samples(s, buf)[10:], nil) + require.NoError(t, err) + + expected := s + if !enableSTStorage { + expected = zeroOutSTs(s) + } + require.Equal(t, expected, got) + } + + // With sample slice + { + samples := make([]record.RefSample, 0, len(s)+1) + got, err := dec.Samples(enc.Samples(s, nil), samples) + require.NoError(t, err) + expected := s + if !enableSTStorage { + expected = zeroOutSTs(s) + } + require.Equal(t, expected, got) + } + + // With compression. + { + buf := enc.Samples(s, nil) + + cEnc, err := compression.NewEncoder() + require.NoError(t, err) + buf, _, err = cEnc.Encode(compression.Zstd, buf, nil) + require.NoError(t, err) + + buf, err = compression.NewDecoder().Decode(compression.Zstd, buf, nil) + require.NoError(t, err) + + got, err := dec.Samples(buf, nil) + require.NoError(t, err) + expected := s + if !enableSTStorage { + expected = zeroOutSTs(s) + } + require.Equal(t, expected, got) + } + } + } +} + +var ( + compressions = []compression.Type{compression.None, compression.Snappy, compression.Zstd} + dataCases = []testrecord.RefSamplesCase{ + testrecord.Realistic1000Samples, + testrecord.Realistic1000WithVariableSTSamples, + testrecord.Realistic1000WithConstSTSamples, + testrecord.WorstCase1000, + testrecord.WorstCase1000WithSTSamples, + } + UseV2 = true +) + +/* + export bench=encode-v2 && go test ./tsdb/record/... \ + -run '^$' -bench '^BenchmarkEncode_Samples' \ + -benchtime 5s -count 6 -cpu 2 -timeout 999m \ + | tee ${bench}.txt +*/ +func BenchmarkEncode_Samples(b *testing.B) { + for _, compr := range compressions { + for _, data := range dataCases { + b.Run(fmt.Sprintf("compr=%v/data=%v", compr, data), func(b *testing.B) { + var ( + samples = testrecord.GenTestRefSamplesCase(b, data) + enc = record.Encoder{EnableSTStorage: UseV2} + buf []byte + cBuf []byte + ) + + cEnc, err := compression.NewEncoder() + require.NoError(b, err) + + // Warm up. 
+ buf = enc.Samples(samples, buf[:0]) + cBuf, _, err = cEnc.Encode(compr, buf, cBuf[:0]) + require.NoError(b, err) + + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + buf = enc.Samples(samples, buf[:0]) + b.ReportMetric(float64(len(buf)), "B/rec") + + cBuf, _, _ = cEnc.Encode(compr, buf, cBuf[:0]) + b.ReportMetric(float64(len(cBuf)), "B/compressed-rec") + } + }) + } + } +} + +/* + export bench=decode-v2 && go test ./tsdb/record/... \ + -run '^$' -bench '^BenchmarkDecode_Samples' \ + -benchtime 5s -count 6 -cpu 2 -timeout 999m \ + | tee ${bench}.txt +*/ +func BenchmarkDecode_Samples(b *testing.B) { + for _, compr := range compressions { + for _, data := range dataCases { + b.Run(fmt.Sprintf("compr=%v/data=%v", compr, data), func(b *testing.B) { + var ( + samples = testrecord.GenTestRefSamplesCase(b, data) + enc = record.Encoder{EnableSTStorage: UseV2} + dec record.Decoder + cDec = compression.NewDecoder() + cBuf []byte + samplesBuf []record.RefSample + ) + + buf := enc.Samples(samples, nil) + + cEnc, err := compression.NewEncoder() + require.NoError(b, err) + + buf, _, err = cEnc.Encode(compr, buf, nil) + require.NoError(b, err) + + // Warm up. + cBuf, err = cDec.Decode(compr, buf, cBuf[:0]) + require.NoError(b, err) + samplesBuf, err = dec.Samples(cBuf, samplesBuf[:0]) + require.NoError(b, err) + + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + cBuf, _ = cDec.Decode(compr, buf, cBuf[:0]) + samplesBuf, _ = dec.Samples(cBuf, samplesBuf[:0]) + } + }) + } + } +} diff --git a/tsdb/record/record.go b/tsdb/record/record.go index bf0e41b66b..2a4f45e490 100644 --- a/tsdb/record/record.go +++ b/tsdb/record/record.go @@ -58,6 +58,8 @@ const ( CustomBucketsHistogramSamples Type = 9 // CustomBucketsFloatHistogramSamples is used to match WAL records of type Float Histogram with custom buckets. 
CustomBucketsFloatHistogramSamples Type = 10 + // SamplesV2 is an enhanced sample record with an encoding scheme that allows storing float samples with timestamp and an optional ST per sample. + SamplesV2 Type = 11 ) func (rt Type) String() string { @@ -66,6 +68,8 @@ func (rt Type) String() string { return "series" case Samples: return "samples" + case SamplesV2: + return "samples-v2" case Tombstones: return "tombstones" case Exemplars: @@ -157,12 +161,12 @@ type RefSeries struct { Labels labels.Labels } -// RefSample is a timestamp/value pair associated with a reference to a series. +// RefSample is a timestamp/st/value struct associated with a reference to a series. // TODO(beorn7): Perhaps make this "polymorphic", including histogram and float-histogram pointers? Then get rid of RefHistogramSample. type RefSample struct { - Ref chunks.HeadSeriesRef - T int64 - V float64 + Ref chunks.HeadSeriesRef + ST, T int64 + V float64 } // RefMetadata is the metadata associated with a series ID. @@ -182,6 +186,7 @@ type RefExemplar struct { } // RefHistogramSample is a histogram. +// TODO(owilliams): Add support for ST. type RefHistogramSample struct { Ref chunks.HeadSeriesRef T int64 @@ -189,6 +194,7 @@ type RefHistogramSample struct { } // RefFloatHistogramSample is a float histogram. +// TODO(owilliams): Add support for ST. 
type RefFloatHistogramSample struct { Ref chunks.HeadSeriesRef T int64 @@ -220,7 +226,7 @@ func (*Decoder) Type(rec []byte) Type { return Unknown } switch t := Type(rec[0]); t { - case Series, Samples, Tombstones, Exemplars, MmapMarkers, Metadata, HistogramSamples, FloatHistogramSamples, CustomBucketsHistogramSamples, CustomBucketsFloatHistogramSamples: + case Series, Samples, SamplesV2, Tombstones, Exemplars, MmapMarkers, Metadata, HistogramSamples, FloatHistogramSamples, CustomBucketsHistogramSamples, CustomBucketsFloatHistogramSamples: return t } return Unknown @@ -311,12 +317,20 @@ func (d *Decoder) DecodeLabels(dec *encoding.Decbuf) labels.Labels { } // Samples appends samples in rec to the given slice. -func (*Decoder) Samples(rec []byte, samples []RefSample) ([]RefSample, error) { +func (d *Decoder) Samples(rec []byte, samples []RefSample) ([]RefSample, error) { dec := encoding.Decbuf{B: rec} - - if Type(dec.Byte()) != Samples { - return nil, errors.New("invalid record type") + switch typ := dec.Byte(); Type(typ) { + case Samples: + return d.samplesV1(&dec, samples) + case SamplesV2: + return d.samplesV2(&dec, samples) + default: + return nil, fmt.Errorf("invalid record type %v, expected Samples(2) or SamplesV2(11)", typ) } +} + +// samplesV1 appends samples in rec to the given slice, while ignoring ST information. +func (*Decoder) samplesV1(dec *encoding.Decbuf, samples []RefSample) ([]RefSample, error) { if dec.Len() == 0 { return samples, nil } @@ -349,6 +363,60 @@ func (*Decoder) Samples(rec []byte, samples []RefSample) ([]RefSample, error) { return samples, nil } +// samplesV2 appends samples in rec to the given slice using the V2 algorithm, +// which is more efficient and supports ST (See Encoder.samplesV2 definition). 
+func (*Decoder) samplesV2(dec *encoding.Decbuf, samples []RefSample) ([]RefSample, error) { + if dec.Len() == 0 { + return samples, nil + } + // Allow 1 byte for each varint and 8 for the value; the output slice must be at least that big. + if minSize := dec.Len() / (1 + 1 + 8); cap(samples) < minSize { + samples = make([]RefSample, 0, minSize) + } + var firstT, firstST int64 + for len(dec.B) > 0 && dec.Err() == nil { + var prev RefSample + var ref, t, ST int64 + var val uint64 + + if len(samples) == 0 { + ref = dec.Varint64() + firstT = dec.Varint64() + t = firstT + ST = dec.Varint64() + firstST = ST + } else { + prev = samples[len(samples)-1] + ref = int64(prev.Ref) + dec.Varint64() + t = firstT + dec.Varint64() + stMarker := dec.Byte() + switch stMarker { + case noST: + case sameST: + ST = prev.ST + default: + ST = firstST + dec.Varint64() + } + } + + val = dec.Be64() + samples = append(samples, RefSample{ + Ref: chunks.HeadSeriesRef(ref), + ST: ST, + T: t, + V: math.Float64frombits(val), + }) + } + + if dec.Err() != nil { + return nil, fmt.Errorf("decode error after %d samples: %w", len(samples), dec.Err()) + } + if len(dec.B) > 0 { + return nil, fmt.Errorf("unexpected %d bytes left in entry", len(dec.B)) + } + return samples, nil +} + // Tombstones appends tombstones in rec to the given slice. func (*Decoder) Tombstones(rec []byte, tstones []tombstones.Stone) ([]tombstones.Stone, error) { dec := encoding.Decbuf{B: rec} @@ -656,7 +724,11 @@ func DecodeFloatHistogram(buf *encoding.Decbuf, fh *histogram.FloatHistogram) { // Encoder encodes series, sample, and tombstones records. // The zero value is ready to use. -type Encoder struct{} +type Encoder struct { + // EnableSTStorage enables the SamplesV2 encoding, which is more efficient + // than V1 and supports start time per sample. + EnableSTStorage bool +} // Series appends the encoded series to b and returns the resulting slice. 
func (*Encoder) Series(series []RefSeries, b []byte) []byte { @@ -702,7 +774,16 @@ func EncodeLabels(buf *encoding.Encbuf, lbls labels.Labels) { } // Samples appends the encoded samples to b and returns the resulting slice. -func (*Encoder) Samples(samples []RefSample, b []byte) []byte { +// Depending on the ST existence it either writes a Samples or a SamplesV2 record. +func (e *Encoder) Samples(samples []RefSample, b []byte) []byte { + if e.EnableSTStorage { + return e.samplesV2(samples, b) + } + return e.samplesV1(samples, b) +} + +// samplesV1 appends the encoded samples to b and returns the resulting slice. +func (*Encoder) samplesV1(samples []RefSample, b []byte) []byte { buf := encoding.Encbuf{B: b} buf.PutByte(byte(Samples)) @@ -725,6 +806,56 @@ return buf.Get() } +const ( + // Start timestamp marker values for indicating trivial cases. + + noST byte = iota // Sample has no start time. + sameST // Sample start timestamp exists and is the same as the start time of the previous sample. + explicitST // Explicit start timestamp value, delta to first start time. +) + +// samplesV2 appends the encoded samples to b and returns the resulting slice +// using a more efficient per-sample delta encoding and allows for ST +// storage. +func (*Encoder) samplesV2(samples []RefSample, b []byte) []byte { + buf := encoding.Encbuf{B: b} + buf.PutByte(byte(SamplesV2)) + + if len(samples) == 0 { + return buf.Get() + } + + // Store first ref, timestamp, ST, and value. + first := samples[0] + buf.PutVarint64(int64(first.Ref)) + buf.PutVarint64(first.T) + buf.PutVarint64(first.ST) + buf.PutBE64(math.Float64bits(first.V)) + + // Subsequent values are delta to the immediate previous values, and in the + // case of start timestamp, use the marker byte to indicate what the value should + // be if it's one of the trivial cases.
+ for i := 1; i < len(samples); i++ { + s := samples[i] + prev := samples[i-1] + + buf.PutVarint64(int64(s.Ref) - int64(prev.Ref)) + buf.PutVarint64(s.T - first.T) + + switch s.ST { + case 0: + buf.PutByte(noST) + case prev.ST: + buf.PutByte(sameST) + default: + buf.PutByte(explicitST) + buf.PutVarint64(s.ST - first.ST) + } + buf.PutBE64(math.Float64bits(s.V)) + } + return buf.Get() +} + // Tombstones appends the encoded tombstones to b and returns the resulting slice. func (*Encoder) Tombstones(tstones []tombstones.Stone, b []byte) []byte { buf := encoding.Encbuf{B: b} diff --git a/tsdb/record/record_test.go b/tsdb/record/record_test.go index 8ebd805d4d..970930fbe5 100644 --- a/tsdb/record/record_test.go +++ b/tsdb/record/record_test.go @@ -76,15 +76,63 @@ func TestRecord_EncodeDecode(t *testing.T) { require.NoError(t, err) require.Equal(t, metadata, decMetadata) + // Without ST. samples := []RefSample{ {Ref: 0, T: 12423423, V: 1.2345}, {Ref: 123, T: -1231, V: -123}, {Ref: 2, T: 0, V: 99999}, } - decSamples, err := dec.Samples(enc.Samples(samples, nil), nil) + encoded := enc.Samples(samples, nil) + require.Equal(t, Samples, dec.Type(encoded)) + decSamples, err := dec.Samples(encoded, nil) require.NoError(t, err) require.Equal(t, samples, decSamples) + enc = Encoder{EnableSTStorage: true} + // Without ST again, but with V1 encoder that enables SamplesV2. + samples = []RefSample{ + {Ref: 0, T: 12423423, V: 1.2345}, + {Ref: 123, T: -1231, V: -123}, + {Ref: 2, T: 0, V: 99999}, + } + encoded = enc.Samples(samples, nil) + require.Equal(t, SamplesV2, dec.Type(encoded)) + decSamples, err = dec.Samples(encoded, nil) + require.NoError(t, err) + require.Equal(t, samples, decSamples) + + // With ST. 
+ samplesWithST := []RefSample{ + {Ref: 0, T: 12423423, ST: 14, V: 1.2345}, + {Ref: 123, T: -1231, ST: 14, V: -123}, + {Ref: 2, T: 0, ST: 14, V: 99999}, + } + encoded = enc.Samples(samplesWithST, nil) + require.Equal(t, SamplesV2, dec.Type(encoded)) + decSamples, err = dec.Samples(encoded, nil) + require.NoError(t, err) + require.Equal(t, samplesWithST, decSamples) + + // With ST (ST[i] == T[i-1]). + samplesWithSTDelta := []RefSample{ + {Ref: 0, T: 12423400, ST: 12423300, V: 1.2345}, + {Ref: 123, T: 12423500, ST: 12423400, V: -123}, + {Ref: 2, T: 12423600, ST: 12423500, V: 99999}, + } + decSamples, err = dec.Samples(enc.Samples(samplesWithSTDelta, nil), nil) + require.NoError(t, err) + require.Equal(t, samplesWithSTDelta, decSamples) + + // With ST (ST[i] == ST[i-1]). + samplesWithConstST := []RefSample{ + {Ref: 0, T: 12423400, ST: 12423300, V: 1.2345}, + {Ref: 123, T: 12423500, ST: 12423300, V: -123}, + {Ref: 2, T: 12423600, ST: 12423300, V: 99999}, + } + decSamples, err = dec.Samples(enc.Samples(samplesWithConstST, nil), nil) + require.NoError(t, err) + require.Equal(t, samplesWithConstST, decSamples) + // Intervals get split up into single entries. So we don't get back exactly // what we put in. 
tstones := []tombstones.Stone{ @@ -227,252 +275,262 @@ func TestRecord_EncodeDecode(t *testing.T) { } func TestRecord_DecodeInvalidHistogramSchema(t *testing.T) { - for _, schema := range []int32{-100, 100} { - t.Run(fmt.Sprintf("schema=%d", schema), func(t *testing.T) { - var enc Encoder + for _, enableSTStorage := range []bool{false, true} { + for _, schema := range []int32{-100, 100} { + t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableSTStorage), func(t *testing.T) { + enc := Encoder{EnableSTStorage: enableSTStorage} - var output bytes.Buffer - logger := promslog.New(&promslog.Config{Writer: &output}) - dec := NewDecoder(labels.NewSymbolTable(), logger) - histograms := []RefHistogramSample{ - { - Ref: 56, - T: 1234, - H: &histogram.Histogram{ - Count: 5, - ZeroCount: 2, - ZeroThreshold: 0.001, - Sum: 18.4 * rand.Float64(), - Schema: schema, - PositiveSpans: []histogram.Span{ - {Offset: 0, Length: 2}, - {Offset: 1, Length: 2}, + var output bytes.Buffer + logger := promslog.New(&promslog.Config{Writer: &output}) + dec := NewDecoder(labels.NewSymbolTable(), logger) + histograms := []RefHistogramSample{ + { + Ref: 56, + T: 1234, + H: &histogram.Histogram{ + Count: 5, + ZeroCount: 2, + ZeroThreshold: 0.001, + Sum: 18.4 * rand.Float64(), + Schema: schema, + PositiveSpans: []histogram.Span{ + {Offset: 0, Length: 2}, + {Offset: 1, Length: 2}, + }, + PositiveBuckets: []int64{1, 1, -1, 0}, }, - PositiveBuckets: []int64{1, 1, -1, 0}, }, - }, - } - histSamples, _ := enc.HistogramSamples(histograms, nil) - decHistograms, err := dec.HistogramSamples(histSamples, nil) - require.NoError(t, err) - require.Empty(t, decHistograms) - require.Contains(t, output.String(), "skipping histogram with unknown schema in WAL record") - }) + } + histSamples, _ := enc.HistogramSamples(histograms, nil) + decHistograms, err := dec.HistogramSamples(histSamples, nil) + require.NoError(t, err) + require.Empty(t, decHistograms) + require.Contains(t, output.String(), "skipping histogram 
with unknown schema in WAL record") + }) + } } } func TestRecord_DecodeInvalidFloatHistogramSchema(t *testing.T) { - for _, schema := range []int32{-100, 100} { - t.Run(fmt.Sprintf("schema=%d", schema), func(t *testing.T) { - var enc Encoder + for _, enableSTStorage := range []bool{false, true} { + for _, schema := range []int32{-100, 100} { + t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableSTStorage), func(t *testing.T) { + enc := Encoder{EnableSTStorage: enableSTStorage} - var output bytes.Buffer - logger := promslog.New(&promslog.Config{Writer: &output}) - dec := NewDecoder(labels.NewSymbolTable(), logger) - histograms := []RefFloatHistogramSample{ - { - Ref: 56, - T: 1234, - FH: &histogram.FloatHistogram{ - Count: 5, - ZeroCount: 2, - ZeroThreshold: 0.001, - Sum: 18.4 * rand.Float64(), - Schema: schema, - PositiveSpans: []histogram.Span{ - {Offset: 0, Length: 2}, - {Offset: 1, Length: 2}, + var output bytes.Buffer + logger := promslog.New(&promslog.Config{Writer: &output}) + dec := NewDecoder(labels.NewSymbolTable(), logger) + histograms := []RefFloatHistogramSample{ + { + Ref: 56, + T: 1234, + FH: &histogram.FloatHistogram{ + Count: 5, + ZeroCount: 2, + ZeroThreshold: 0.001, + Sum: 18.4 * rand.Float64(), + Schema: schema, + PositiveSpans: []histogram.Span{ + {Offset: 0, Length: 2}, + {Offset: 1, Length: 2}, + }, + PositiveBuckets: []float64{1, 1, -1, 0}, }, - PositiveBuckets: []float64{1, 1, -1, 0}, }, - }, - } - histSamples, _ := enc.FloatHistogramSamples(histograms, nil) - decHistograms, err := dec.FloatHistogramSamples(histSamples, nil) - require.NoError(t, err) - require.Empty(t, decHistograms) - require.Contains(t, output.String(), "skipping histogram with unknown schema in WAL record") - }) + } + histSamples, _ := enc.FloatHistogramSamples(histograms, nil) + decHistograms, err := dec.FloatHistogramSamples(histSamples, nil) + require.NoError(t, err) + require.Empty(t, decHistograms) + require.Contains(t, output.String(), "skipping histogram with 
unknown schema in WAL record") + }) + } } } func TestRecord_DecodeTooHighResolutionHistogramSchema(t *testing.T) { - for _, schema := range []int32{9, 52} { - t.Run(fmt.Sprintf("schema=%d", schema), func(t *testing.T) { - var enc Encoder + for _, enableSTStorage := range []bool{false, true} { + for _, schema := range []int32{9, 52} { + t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableSTStorage), func(t *testing.T) { + enc := Encoder{EnableSTStorage: enableSTStorage} - var output bytes.Buffer - logger := promslog.New(&promslog.Config{Writer: &output}) - dec := NewDecoder(labels.NewSymbolTable(), logger) - histograms := []RefHistogramSample{ - { - Ref: 56, - T: 1234, - H: &histogram.Histogram{ - Count: 5, - ZeroCount: 2, - ZeroThreshold: 0.001, - Sum: 18.4 * rand.Float64(), - Schema: schema, - PositiveSpans: []histogram.Span{ - {Offset: 0, Length: 2}, - {Offset: 1, Length: 2}, + var output bytes.Buffer + logger := promslog.New(&promslog.Config{Writer: &output}) + dec := NewDecoder(labels.NewSymbolTable(), logger) + histograms := []RefHistogramSample{ + { + Ref: 56, + T: 1234, + H: &histogram.Histogram{ + Count: 5, + ZeroCount: 2, + ZeroThreshold: 0.001, + Sum: 18.4 * rand.Float64(), + Schema: schema, + PositiveSpans: []histogram.Span{ + {Offset: 0, Length: 2}, + {Offset: 1, Length: 2}, + }, + PositiveBuckets: []int64{1, 1, -1, 0}, }, - PositiveBuckets: []int64{1, 1, -1, 0}, }, - }, - } - histSamples, _ := enc.HistogramSamples(histograms, nil) - decHistograms, err := dec.HistogramSamples(histSamples, nil) - require.NoError(t, err) - require.Len(t, decHistograms, 1) - require.Equal(t, histogram.ExponentialSchemaMax, decHistograms[0].H.Schema) - }) + } + histSamples, _ := enc.HistogramSamples(histograms, nil) + decHistograms, err := dec.HistogramSamples(histSamples, nil) + require.NoError(t, err) + require.Len(t, decHistograms, 1) + require.Equal(t, histogram.ExponentialSchemaMax, decHistograms[0].H.Schema) + }) + } } } func 
TestRecord_DecodeTooHighResolutionFloatHistogramSchema(t *testing.T) { - for _, schema := range []int32{9, 52} { - t.Run(fmt.Sprintf("schema=%d", schema), func(t *testing.T) { - var enc Encoder + for _, enableSTStorage := range []bool{false, true} { + for _, schema := range []int32{9, 52} { + t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableSTStorage), func(t *testing.T) { + enc := Encoder{EnableSTStorage: enableSTStorage} - var output bytes.Buffer - logger := promslog.New(&promslog.Config{Writer: &output}) - dec := NewDecoder(labels.NewSymbolTable(), logger) - histograms := []RefFloatHistogramSample{ - { - Ref: 56, - T: 1234, - FH: &histogram.FloatHistogram{ - Count: 5, - ZeroCount: 2, - ZeroThreshold: 0.001, - Sum: 18.4 * rand.Float64(), - Schema: schema, - PositiveSpans: []histogram.Span{ - {Offset: 0, Length: 2}, - {Offset: 1, Length: 2}, + var output bytes.Buffer + logger := promslog.New(&promslog.Config{Writer: &output}) + dec := NewDecoder(labels.NewSymbolTable(), logger) + histograms := []RefFloatHistogramSample{ + { + Ref: 56, + T: 1234, + FH: &histogram.FloatHistogram{ + Count: 5, + ZeroCount: 2, + ZeroThreshold: 0.001, + Sum: 18.4 * rand.Float64(), + Schema: schema, + PositiveSpans: []histogram.Span{ + {Offset: 0, Length: 2}, + {Offset: 1, Length: 2}, + }, + PositiveBuckets: []float64{1, 1, -1, 0}, }, - PositiveBuckets: []float64{1, 1, -1, 0}, }, - }, - } - histSamples, _ := enc.FloatHistogramSamples(histograms, nil) - decHistograms, err := dec.FloatHistogramSamples(histSamples, nil) - require.NoError(t, err) - require.Len(t, decHistograms, 1) - require.Equal(t, histogram.ExponentialSchemaMax, decHistograms[0].FH.Schema) - }) + } + histSamples, _ := enc.FloatHistogramSamples(histograms, nil) + decHistograms, err := dec.FloatHistogramSamples(histSamples, nil) + require.NoError(t, err) + require.Len(t, decHistograms, 1) + require.Equal(t, histogram.ExponentialSchemaMax, decHistograms[0].FH.Schema) + }) + } } } // TestRecord_Corrupted ensures that 
corrupted records return the correct error. // Bugfix check for pull/521 and pull/523. func TestRecord_Corrupted(t *testing.T) { - var enc Encoder - dec := NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) + for _, enableSTStorage := range []bool{false, true} { + enc := Encoder{EnableSTStorage: enableSTStorage} + dec := NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) - t.Run("Test corrupted series record", func(t *testing.T) { - series := []RefSeries{ - { - Ref: 100, - Labels: labels.FromStrings("abc", "def", "123", "456"), - }, - } - - corrupted := enc.Series(series, nil)[:8] - _, err := dec.Series(corrupted, nil) - require.Equal(t, err, encoding.ErrInvalidSize) - }) - - t.Run("Test corrupted sample record", func(t *testing.T) { - samples := []RefSample{ - {Ref: 0, T: 12423423, V: 1.2345}, - } - - corrupted := enc.Samples(samples, nil)[:8] - _, err := dec.Samples(corrupted, nil) - require.ErrorIs(t, err, encoding.ErrInvalidSize) - }) - - t.Run("Test corrupted tombstone record", func(t *testing.T) { - tstones := []tombstones.Stone{ - {Ref: 123, Intervals: tombstones.Intervals{ - {Mint: -1000, Maxt: 1231231}, - {Mint: 5000, Maxt: 0}, - }}, - } - - corrupted := enc.Tombstones(tstones, nil)[:8] - _, err := dec.Tombstones(corrupted, nil) - require.Equal(t, err, encoding.ErrInvalidSize) - }) - - t.Run("Test corrupted exemplar record", func(t *testing.T) { - exemplars := []RefExemplar{ - {Ref: 0, T: 12423423, V: 1.2345, Labels: labels.FromStrings("trace_id", "asdf")}, - } - - corrupted := enc.Exemplars(exemplars, nil)[:8] - _, err := dec.Exemplars(corrupted, nil) - require.ErrorIs(t, err, encoding.ErrInvalidSize) - }) - - t.Run("Test corrupted metadata record", func(t *testing.T) { - meta := []RefMetadata{ - {Ref: 147, Type: uint8(Counter), Unit: "unit", Help: "help"}, - } - - corrupted := enc.Metadata(meta, nil)[:8] - _, err := dec.Metadata(corrupted, nil) - require.ErrorIs(t, err, encoding.ErrInvalidSize) - }) - - t.Run("Test corrupted histogram 
record", func(t *testing.T) { - histograms := []RefHistogramSample{ - { - Ref: 56, - T: 1234, - H: &histogram.Histogram{ - Count: 5, - ZeroCount: 2, - ZeroThreshold: 0.001, - Sum: 18.4 * rand.Float64(), - Schema: 1, - PositiveSpans: []histogram.Span{ - {Offset: 0, Length: 2}, - {Offset: 1, Length: 2}, - }, - PositiveBuckets: []int64{1, 1, -1, 0}, + t.Run("Test corrupted series record", func(t *testing.T) { + series := []RefSeries{ + { + Ref: 100, + Labels: labels.FromStrings("abc", "def", "123", "456"), }, - }, - { - Ref: 67, - T: 5678, - H: &histogram.Histogram{ - Count: 8, - ZeroThreshold: 0.001, - Sum: 35.5, - Schema: -53, - PositiveSpans: []histogram.Span{ - {Offset: 0, Length: 2}, - {Offset: 2, Length: 2}, - }, - PositiveBuckets: []int64{2, -1, 2, 0}, - CustomValues: []float64{0, 2, 4, 6, 8}, - }, - }, - } + } - corruptedHists, customBucketsHists := enc.HistogramSamples(histograms, nil) - corruptedHists = corruptedHists[:8] - corruptedCustomBucketsHists := enc.CustomBucketsHistogramSamples(customBucketsHists, nil) - corruptedCustomBucketsHists = corruptedCustomBucketsHists[:8] - _, err := dec.HistogramSamples(corruptedHists, nil) - require.ErrorIs(t, err, encoding.ErrInvalidSize) - _, err = dec.HistogramSamples(corruptedCustomBucketsHists, nil) - require.ErrorIs(t, err, encoding.ErrInvalidSize) - }) + corrupted := enc.Series(series, nil)[:8] + _, err := dec.Series(corrupted, nil) + require.Equal(t, err, encoding.ErrInvalidSize) + }) + + t.Run("Test corrupted sample record", func(t *testing.T) { + samples := []RefSample{ + {Ref: 0, T: 12423423, V: 1.2345}, + } + + corrupted := enc.Samples(samples, nil)[:8] + _, err := dec.Samples(corrupted, nil) + require.ErrorIs(t, err, encoding.ErrInvalidSize) + }) + + t.Run("Test corrupted tombstone record", func(t *testing.T) { + tstones := []tombstones.Stone{ + {Ref: 123, Intervals: tombstones.Intervals{ + {Mint: -1000, Maxt: 1231231}, + {Mint: 5000, Maxt: 0}, + }}, + } + + corrupted := enc.Tombstones(tstones, nil)[:8] + 
_, err := dec.Tombstones(corrupted, nil) + require.Equal(t, err, encoding.ErrInvalidSize) + }) + + t.Run("Test corrupted exemplar record", func(t *testing.T) { + exemplars := []RefExemplar{ + {Ref: 0, T: 12423423, V: 1.2345, Labels: labels.FromStrings("trace_id", "asdf")}, + } + + corrupted := enc.Exemplars(exemplars, nil)[:8] + _, err := dec.Exemplars(corrupted, nil) + require.ErrorIs(t, err, encoding.ErrInvalidSize) + }) + + t.Run("Test corrupted metadata record", func(t *testing.T) { + meta := []RefMetadata{ + {Ref: 147, Type: uint8(Counter), Unit: "unit", Help: "help"}, + } + + corrupted := enc.Metadata(meta, nil)[:8] + _, err := dec.Metadata(corrupted, nil) + require.ErrorIs(t, err, encoding.ErrInvalidSize) + }) + + t.Run("Test corrupted histogram record", func(t *testing.T) { + histograms := []RefHistogramSample{ + { + Ref: 56, + T: 1234, + H: &histogram.Histogram{ + Count: 5, + ZeroCount: 2, + ZeroThreshold: 0.001, + Sum: 18.4 * rand.Float64(), + Schema: 1, + PositiveSpans: []histogram.Span{ + {Offset: 0, Length: 2}, + {Offset: 1, Length: 2}, + }, + PositiveBuckets: []int64{1, 1, -1, 0}, + }, + }, + { + Ref: 67, + T: 5678, + H: &histogram.Histogram{ + Count: 8, + ZeroThreshold: 0.001, + Sum: 35.5, + Schema: -53, + PositiveSpans: []histogram.Span{ + {Offset: 0, Length: 2}, + {Offset: 2, Length: 2}, + }, + PositiveBuckets: []int64{2, -1, 2, 0}, + CustomValues: []float64{0, 2, 4, 6, 8}, + }, + }, + } + + corruptedHists, customBucketsHists := enc.HistogramSamples(histograms, nil) + corruptedHists = corruptedHists[:8] + corruptedCustomBucketsHists := enc.CustomBucketsHistogramSamples(customBucketsHists, nil) + corruptedCustomBucketsHists = corruptedCustomBucketsHists[:8] + _, err := dec.HistogramSamples(corruptedHists, nil) + require.ErrorIs(t, err, encoding.ErrInvalidSize) + _, err = dec.HistogramSamples(corruptedCustomBucketsHists, nil) + require.ErrorIs(t, err, encoding.ErrInvalidSize) + }) + } } func TestRecord_Type(t *testing.T) { @@ -487,6 +545,16 @@ func 
TestRecord_Type(t *testing.T) { recordType = dec.Type(enc.Samples(samples, nil)) require.Equal(t, Samples, recordType) + // With EnableSTStorage set, all Samples are V2. + enc = Encoder{EnableSTStorage: true} + samples = []RefSample{{Ref: 123, T: 12345, V: 1.2345}} + recordType = dec.Type(enc.Samples(samples, nil)) + require.Equal(t, SamplesV2, recordType) + + samplesST := []RefSample{{Ref: 123, ST: 1, T: 12345, V: 1.2345}} + recordType = dec.Type(enc.Samples(samplesST, nil)) + require.Equal(t, SamplesV2, recordType) + tstones := []tombstones.Stone{{Ref: 1, Intervals: tombstones.Intervals{{Mint: 1, Maxt: 2}}}} recordType = dec.Type(enc.Tombstones(tstones, nil)) require.Equal(t, Tombstones, recordType) @@ -716,24 +784,26 @@ func BenchmarkWAL_HistogramEncoding(b *testing.B) { make: initNHCBRefs, }, } { - for _, labelCount := range []int{0, 10, 50} { - for _, histograms := range []int{10, 100, 1000} { - for _, buckets := range []int{0, 1, 10, 100} { - b.Run(fmt.Sprintf("type=%s/labels=%d/histograms=%d/buckets=%d", maker.name, labelCount, histograms, buckets), func(b *testing.B) { - series, samples, nhcbs := maker.make(labelCount, histograms, buckets) - enc := Encoder{} - for b.Loop() { - var buf []byte - enc.Series(series, buf) - enc.Samples(samples, buf) - var leftOver []RefHistogramSample - _, leftOver = enc.HistogramSamples(nhcbs, buf) - if len(leftOver) > 0 { - enc.CustomBucketsHistogramSamples(leftOver, buf) + for _, enableSTStorage := range []bool{false, true} { + for _, labelCount := range []int{0, 10, 50} { + for _, histograms := range []int{10, 100, 1000} { + for _, buckets := range []int{0, 1, 10, 100} { + b.Run(fmt.Sprintf("type=%s/labels=%d/histograms=%d/buckets=%d", maker.name, labelCount, histograms, buckets), func(b *testing.B) { + series, samples, nhcbs := maker.make(labelCount, histograms, buckets) + enc := Encoder{EnableSTStorage: enableSTStorage} + for b.Loop() { + var buf []byte + enc.Series(series, buf) + enc.Samples(samples, buf) + var leftOver 
[]RefHistogramSample + _, leftOver = enc.HistogramSamples(nhcbs, buf) + if len(leftOver) > 0 { + enc.CustomBucketsHistogramSamples(leftOver, buf) + } + b.ReportMetric(float64(len(buf)), "recordBytes/ops") } - b.ReportMetric(float64(len(buf)), "recordBytes/ops") - } - }) + }) + } } } } diff --git a/tsdb/wlog/checkpoint.go b/tsdb/wlog/checkpoint.go index 3a4e194fec..a41935044d 100644 --- a/tsdb/wlog/checkpoint.go +++ b/tsdb/wlog/checkpoint.go @@ -102,7 +102,7 @@ func DeleteTempCheckpoints(logger *slog.Logger, dir string) error { // segmented format as the original WAL itself. // This makes it easy to read it through the WAL package and concatenate // it with the original WAL. -func Checkpoint(logger *slog.Logger, w *WL, from, to int, keep func(id chunks.HeadSeriesRef) bool, mint int64) (*CheckpointStats, error) { +func Checkpoint(logger *slog.Logger, w *WL, from, to int, keep func(id chunks.HeadSeriesRef) bool, mint int64, enableSTStorage bool) (*CheckpointStats, error) { stats := &CheckpointStats{} var sgmReader io.ReadCloser @@ -166,7 +166,7 @@ func Checkpoint(logger *slog.Logger, w *WL, from, to int, keep func(id chunks.He metadata []record.RefMetadata st = labels.NewSymbolTable() // Needed for decoding; labels do not outlive this function. 
dec = record.NewDecoder(st, logger) - enc record.Encoder + enc = record.Encoder{EnableSTStorage: enableSTStorage} buf []byte recs [][]byte @@ -200,7 +200,7 @@ func Checkpoint(logger *slog.Logger, w *WL, from, to int, keep func(id chunks.He stats.TotalSeries += len(series) stats.DroppedSeries += len(series) - len(repl) - case record.Samples: + case record.Samples, record.SamplesV2: samples, err = dec.Samples(rec, samples) if err != nil { return nil, fmt.Errorf("decode samples: %w", err) diff --git a/tsdb/wlog/checkpoint_test.go b/tsdb/wlog/checkpoint_test.go index a348239ec7..9056aab70b 100644 --- a/tsdb/wlog/checkpoint_test.go +++ b/tsdb/wlog/checkpoint_test.go @@ -171,251 +171,257 @@ func TestCheckpoint(t *testing.T) { } } - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() + for _, enableSTStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { + dir := t.TempDir() - var enc record.Encoder - // Create a dummy segment to bump the initial number. - seg, err := CreateSegment(dir, 100) - require.NoError(t, err) - require.NoError(t, seg.Close()) - - // Manually create checkpoint for 99 and earlier. - w, err := New(nil, nil, filepath.Join(dir, "checkpoint.0099"), compress) - require.NoError(t, err) - - // Add some data we expect to be around later. - err = w.Log(enc.Series([]record.RefSeries{ - {Ref: 0, Labels: labels.FromStrings("a", "b", "c", "0")}, - {Ref: 1, Labels: labels.FromStrings("a", "b", "c", "1")}, - }, nil)) - require.NoError(t, err) - // Log an unknown record, that might have come from a future Prometheus version. - require.NoError(t, w.Log([]byte{255})) - require.NoError(t, w.Close()) - - // Start a WAL and write records to it as usual. 
- w, err = NewSize(nil, nil, dir, 128*1024, compress) - require.NoError(t, err) - - samplesInWAL, histogramsInWAL, floatHistogramsInWAL := 0, 0, 0 - var last int64 - for i := 0; ; i++ { - _, n, err := Segments(w.Dir()) + enc := record.Encoder{EnableSTStorage: enableSTStorage} + // Create a dummy segment to bump the initial number. + seg, err := CreateSegment(dir, 100) require.NoError(t, err) - if n >= 106 { - break - } - // Write some series initially. - if i == 0 { - b := enc.Series([]record.RefSeries{ - {Ref: 2, Labels: labels.FromStrings("a", "b", "c", "2")}, - {Ref: 3, Labels: labels.FromStrings("a", "b", "c", "3")}, - {Ref: 4, Labels: labels.FromStrings("a", "b", "c", "4")}, - {Ref: 5, Labels: labels.FromStrings("a", "b", "c", "5")}, + require.NoError(t, seg.Close()) + + // Manually create checkpoint for 99 and earlier. + w, err := New(nil, nil, filepath.Join(dir, "checkpoint.0099"), compress) + require.NoError(t, err) + + // Add some data we expect to be around later. + err = w.Log(enc.Series([]record.RefSeries{ + {Ref: 0, Labels: labels.FromStrings("a", "b", "c", "0")}, + {Ref: 1, Labels: labels.FromStrings("a", "b", "c", "1")}, + }, nil)) + require.NoError(t, err) + // Log an unknown record, that might have come from a future Prometheus version. + require.NoError(t, w.Log([]byte{255})) + require.NoError(t, w.Close()) + + // Start a WAL and write records to it as usual. + w, err = NewSize(nil, nil, dir, 128*1024, compress) + require.NoError(t, err) + + samplesInWAL, histogramsInWAL, floatHistogramsInWAL := 0, 0, 0 + var last int64 + for i := 0; ; i++ { + _, n, err := Segments(w.Dir()) + require.NoError(t, err) + if n >= 106 { + break + } + // Write some series initially. 
+ if i == 0 { + b := enc.Series([]record.RefSeries{ + {Ref: 2, Labels: labels.FromStrings("a", "b", "c", "2")}, + {Ref: 3, Labels: labels.FromStrings("a", "b", "c", "3")}, + {Ref: 4, Labels: labels.FromStrings("a", "b", "c", "4")}, + {Ref: 5, Labels: labels.FromStrings("a", "b", "c", "5")}, + }, nil) + require.NoError(t, w.Log(b)) + + b = enc.Metadata([]record.RefMetadata{ + {Ref: 2, Unit: "unit", Help: "help"}, + {Ref: 3, Unit: "unit", Help: "help"}, + {Ref: 4, Unit: "unit", Help: "help"}, + {Ref: 5, Unit: "unit", Help: "help"}, + }, nil) + require.NoError(t, w.Log(b)) + } + // Write samples until the WAL has enough segments. + // Make them have drifting timestamps within a record to see that they + // get filtered properly. + b := enc.Samples([]record.RefSample{ + {Ref: 0, T: last, V: float64(i)}, + {Ref: 1, T: last + 10000, V: float64(i)}, + {Ref: 2, T: last + 20000, V: float64(i)}, + {Ref: 3, T: last + 30000, V: float64(i)}, + }, nil) + require.NoError(t, w.Log(b)) + samplesInWAL += 4 + h := makeHistogram(i) + b, _ = enc.HistogramSamples([]record.RefHistogramSample{ + {Ref: 0, T: last, H: h}, + {Ref: 1, T: last + 10000, H: h}, + {Ref: 2, T: last + 20000, H: h}, + {Ref: 3, T: last + 30000, H: h}, + }, nil) + require.NoError(t, w.Log(b)) + histogramsInWAL += 4 + cbh := makeCustomBucketHistogram(i) + b = enc.CustomBucketsHistogramSamples([]record.RefHistogramSample{ + {Ref: 0, T: last, H: cbh}, + {Ref: 1, T: last + 10000, H: cbh}, + {Ref: 2, T: last + 20000, H: cbh}, + {Ref: 3, T: last + 30000, H: cbh}, + }, nil) + require.NoError(t, w.Log(b)) + histogramsInWAL += 4 + fh := makeFloatHistogram(i) + b, _ = enc.FloatHistogramSamples([]record.RefFloatHistogramSample{ + {Ref: 0, T: last, FH: fh}, + {Ref: 1, T: last + 10000, FH: fh}, + {Ref: 2, T: last + 20000, FH: fh}, + {Ref: 3, T: last + 30000, FH: fh}, + }, nil) + require.NoError(t, w.Log(b)) + floatHistogramsInWAL += 4 + cbfh := makeCustomBucketFloatHistogram(i) + b = 
enc.CustomBucketsFloatHistogramSamples([]record.RefFloatHistogramSample{ + {Ref: 0, T: last, FH: cbfh}, + {Ref: 1, T: last + 10000, FH: cbfh}, + {Ref: 2, T: last + 20000, FH: cbfh}, + {Ref: 3, T: last + 30000, FH: cbfh}, + }, nil) + require.NoError(t, w.Log(b)) + floatHistogramsInWAL += 4 + + b = enc.Exemplars([]record.RefExemplar{ + {Ref: 1, T: last, V: float64(i), Labels: labels.FromStrings("trace_id", fmt.Sprintf("trace-%d", i))}, }, nil) require.NoError(t, w.Log(b)) + // Write changing metadata for each series. In the end, only the latest + // version should end up in the checkpoint. b = enc.Metadata([]record.RefMetadata{ - {Ref: 2, Unit: "unit", Help: "help"}, - {Ref: 3, Unit: "unit", Help: "help"}, - {Ref: 4, Unit: "unit", Help: "help"}, - {Ref: 5, Unit: "unit", Help: "help"}, + {Ref: 0, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, + {Ref: 1, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, + {Ref: 2, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, + {Ref: 3, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, }, nil) require.NoError(t, w.Log(b)) + + last += 100 } - // Write samples until the WAL has enough segments. - // Make them have drifting timestamps within a record to see that they - // get filtered properly. 
- b := enc.Samples([]record.RefSample{ - {Ref: 0, T: last, V: float64(i)}, - {Ref: 1, T: last + 10000, V: float64(i)}, - {Ref: 2, T: last + 20000, V: float64(i)}, - {Ref: 3, T: last + 30000, V: float64(i)}, - }, nil) - require.NoError(t, w.Log(b)) - samplesInWAL += 4 - h := makeHistogram(i) - b, _ = enc.HistogramSamples([]record.RefHistogramSample{ - {Ref: 0, T: last, H: h}, - {Ref: 1, T: last + 10000, H: h}, - {Ref: 2, T: last + 20000, H: h}, - {Ref: 3, T: last + 30000, H: h}, - }, nil) - require.NoError(t, w.Log(b)) - histogramsInWAL += 4 - cbh := makeCustomBucketHistogram(i) - b = enc.CustomBucketsHistogramSamples([]record.RefHistogramSample{ - {Ref: 0, T: last, H: cbh}, - {Ref: 1, T: last + 10000, H: cbh}, - {Ref: 2, T: last + 20000, H: cbh}, - {Ref: 3, T: last + 30000, H: cbh}, - }, nil) - require.NoError(t, w.Log(b)) - histogramsInWAL += 4 - fh := makeFloatHistogram(i) - b, _ = enc.FloatHistogramSamples([]record.RefFloatHistogramSample{ - {Ref: 0, T: last, FH: fh}, - {Ref: 1, T: last + 10000, FH: fh}, - {Ref: 2, T: last + 20000, FH: fh}, - {Ref: 3, T: last + 30000, FH: fh}, - }, nil) - require.NoError(t, w.Log(b)) - floatHistogramsInWAL += 4 - cbfh := makeCustomBucketFloatHistogram(i) - b = enc.CustomBucketsFloatHistogramSamples([]record.RefFloatHistogramSample{ - {Ref: 0, T: last, FH: cbfh}, - {Ref: 1, T: last + 10000, FH: cbfh}, - {Ref: 2, T: last + 20000, FH: cbfh}, - {Ref: 3, T: last + 30000, FH: cbfh}, - }, nil) - require.NoError(t, w.Log(b)) - floatHistogramsInWAL += 4 + require.NoError(t, w.Close()) - b = enc.Exemplars([]record.RefExemplar{ - {Ref: 1, T: last, V: float64(i), Labels: labels.FromStrings("trace_id", fmt.Sprintf("trace-%d", i))}, - }, nil) - require.NoError(t, w.Log(b)) + stats, err := Checkpoint(promslog.NewNopLogger(), w, 100, 106, func(x chunks.HeadSeriesRef) bool { + return x%2 == 0 + }, last/2, enableSTStorage) + require.NoError(t, err) + require.NoError(t, w.Truncate(107)) + require.NoError(t, DeleteCheckpoints(w.Dir(), 106)) + 
require.Equal(t, histogramsInWAL+floatHistogramsInWAL+samplesInWAL, stats.TotalSamples) + require.Positive(t, stats.DroppedSamples) - // Write changing metadata for each series. In the end, only the latest - // version should end up in the checkpoint. - b = enc.Metadata([]record.RefMetadata{ - {Ref: 0, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, - {Ref: 1, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, - {Ref: 2, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, - {Ref: 3, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, - }, nil) - require.NoError(t, w.Log(b)) + // Only the new checkpoint should be left. + files, err := os.ReadDir(dir) + require.NoError(t, err) + require.Len(t, files, 1) + require.Equal(t, "checkpoint.00000106", files[0].Name()) - last += 100 - } - require.NoError(t, w.Close()) + sr, err := NewSegmentsReader(filepath.Join(dir, "checkpoint.00000106")) + require.NoError(t, err) + defer sr.Close() - stats, err := Checkpoint(promslog.NewNopLogger(), w, 100, 106, func(x chunks.HeadSeriesRef) bool { - return x%2 == 0 - }, last/2) - require.NoError(t, err) - require.NoError(t, w.Truncate(107)) - require.NoError(t, DeleteCheckpoints(w.Dir(), 106)) - require.Equal(t, histogramsInWAL+floatHistogramsInWAL+samplesInWAL, stats.TotalSamples) - require.Positive(t, stats.DroppedSamples) + dec := record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) + var series []record.RefSeries + var metadata []record.RefMetadata + r := NewReader(sr) - // Only the new checkpoint should be left. 
- files, err := os.ReadDir(dir) - require.NoError(t, err) - require.Len(t, files, 1) - require.Equal(t, "checkpoint.00000106", files[0].Name()) + samplesInCheckpoint, histogramsInCheckpoint, floatHistogramsInCheckpoint := 0, 0, 0 + for r.Next() { + rec := r.Record() - sr, err := NewSegmentsReader(filepath.Join(dir, "checkpoint.00000106")) - require.NoError(t, err) - defer sr.Close() - - dec := record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) - var series []record.RefSeries - var metadata []record.RefMetadata - r := NewReader(sr) - - samplesInCheckpoint, histogramsInCheckpoint, floatHistogramsInCheckpoint := 0, 0, 0 - for r.Next() { - rec := r.Record() - - switch dec.Type(rec) { - case record.Series: - series, err = dec.Series(rec, series) - require.NoError(t, err) - case record.Samples: - samples, err := dec.Samples(rec, nil) - require.NoError(t, err) - for _, s := range samples { - require.GreaterOrEqual(t, s.T, last/2, "sample with wrong timestamp") + switch dec.Type(rec) { + case record.Series: + series, err = dec.Series(rec, series) + require.NoError(t, err) + case record.Samples, record.SamplesV2: + samples, err := dec.Samples(rec, nil) + require.NoError(t, err) + for _, s := range samples { + require.GreaterOrEqual(t, s.T, last/2, "sample with wrong timestamp") + } + samplesInCheckpoint += len(samples) + case record.HistogramSamples, record.CustomBucketsHistogramSamples: + histograms, err := dec.HistogramSamples(rec, nil) + require.NoError(t, err) + for _, h := range histograms { + require.GreaterOrEqual(t, h.T, last/2, "histogram with wrong timestamp") + } + histogramsInCheckpoint += len(histograms) + case record.FloatHistogramSamples, record.CustomBucketsFloatHistogramSamples: + floatHistograms, err := dec.FloatHistogramSamples(rec, nil) + require.NoError(t, err) + for _, h := range floatHistograms { + require.GreaterOrEqual(t, h.T, last/2, "float histogram with wrong timestamp") + } + floatHistogramsInCheckpoint += len(floatHistograms) + 
case record.Exemplars: + exemplars, err := dec.Exemplars(rec, nil) + require.NoError(t, err) + for _, e := range exemplars { + require.GreaterOrEqual(t, e.T, last/2, "exemplar with wrong timestamp") + } + case record.Metadata: + metadata, err = dec.Metadata(rec, metadata) + require.NoError(t, err) } - samplesInCheckpoint += len(samples) - case record.HistogramSamples, record.CustomBucketsHistogramSamples: - histograms, err := dec.HistogramSamples(rec, nil) - require.NoError(t, err) - for _, h := range histograms { - require.GreaterOrEqual(t, h.T, last/2, "histogram with wrong timestamp") - } - histogramsInCheckpoint += len(histograms) - case record.FloatHistogramSamples, record.CustomBucketsFloatHistogramSamples: - floatHistograms, err := dec.FloatHistogramSamples(rec, nil) - require.NoError(t, err) - for _, h := range floatHistograms { - require.GreaterOrEqual(t, h.T, last/2, "float histogram with wrong timestamp") - } - floatHistogramsInCheckpoint += len(floatHistograms) - case record.Exemplars: - exemplars, err := dec.Exemplars(rec, nil) - require.NoError(t, err) - for _, e := range exemplars { - require.GreaterOrEqual(t, e.T, last/2, "exemplar with wrong timestamp") - } - case record.Metadata: - metadata, err = dec.Metadata(rec, metadata) - require.NoError(t, err) } - } - require.NoError(t, r.Err()) - // Making sure we replayed some samples. We expect >50% samples to be still present. - require.Greater(t, float64(samplesInCheckpoint)/float64(samplesInWAL), 0.5) - require.Less(t, float64(samplesInCheckpoint)/float64(samplesInWAL), 0.8) - require.Greater(t, float64(histogramsInCheckpoint)/float64(histogramsInWAL), 0.5) - require.Less(t, float64(histogramsInCheckpoint)/float64(histogramsInWAL), 0.8) - require.Greater(t, float64(floatHistogramsInCheckpoint)/float64(floatHistogramsInWAL), 0.5) - require.Less(t, float64(floatHistogramsInCheckpoint)/float64(floatHistogramsInWAL), 0.8) + require.NoError(t, r.Err()) + // Making sure we replayed some samples. 
We expect >50% samples to be still present. + require.Greater(t, float64(samplesInCheckpoint)/float64(samplesInWAL), 0.5) + require.Less(t, float64(samplesInCheckpoint)/float64(samplesInWAL), 0.8) + require.Greater(t, float64(histogramsInCheckpoint)/float64(histogramsInWAL), 0.5) + require.Less(t, float64(histogramsInCheckpoint)/float64(histogramsInWAL), 0.8) + require.Greater(t, float64(floatHistogramsInCheckpoint)/float64(floatHistogramsInWAL), 0.5) + require.Less(t, float64(floatHistogramsInCheckpoint)/float64(floatHistogramsInWAL), 0.8) - expectedRefSeries := []record.RefSeries{ - {Ref: 0, Labels: labels.FromStrings("a", "b", "c", "0")}, - {Ref: 2, Labels: labels.FromStrings("a", "b", "c", "2")}, - {Ref: 4, Labels: labels.FromStrings("a", "b", "c", "4")}, - } - testutil.RequireEqual(t, expectedRefSeries, series) + expectedRefSeries := []record.RefSeries{ + {Ref: 0, Labels: labels.FromStrings("a", "b", "c", "0")}, + {Ref: 2, Labels: labels.FromStrings("a", "b", "c", "2")}, + {Ref: 4, Labels: labels.FromStrings("a", "b", "c", "4")}, + } + testutil.RequireEqual(t, expectedRefSeries, series) - expectedRefMetadata := []record.RefMetadata{ - {Ref: 0, Unit: strconv.FormatInt(last-100, 10), Help: strconv.FormatInt(last-100, 10)}, - {Ref: 2, Unit: strconv.FormatInt(last-100, 10), Help: strconv.FormatInt(last-100, 10)}, - {Ref: 4, Unit: "unit", Help: "help"}, - } - sort.Slice(metadata, func(i, j int) bool { return metadata[i].Ref < metadata[j].Ref }) - require.Equal(t, expectedRefMetadata, metadata) - }) + expectedRefMetadata := []record.RefMetadata{ + {Ref: 0, Unit: strconv.FormatInt(last-100, 10), Help: strconv.FormatInt(last-100, 10)}, + {Ref: 2, Unit: strconv.FormatInt(last-100, 10), Help: strconv.FormatInt(last-100, 10)}, + {Ref: 4, Unit: "unit", Help: "help"}, + } + sort.Slice(metadata, func(i, j int) bool { return metadata[i].Ref < metadata[j].Ref }) + require.Equal(t, expectedRefMetadata, metadata) + }) + } } } func TestCheckpointNoTmpFolderAfterError(t 
*testing.T) { - // Create a new wlog with invalid data. - dir := t.TempDir() - w, err := NewSize(nil, nil, dir, 64*1024, compression.None) - require.NoError(t, err) - var enc record.Encoder - require.NoError(t, w.Log(enc.Series([]record.RefSeries{ - {Ref: 0, Labels: labels.FromStrings("a", "b", "c", "2")}, - }, nil))) - require.NoError(t, w.Close()) + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + // Create a new wlog with invalid data. + dir := t.TempDir() + w, err := NewSize(nil, nil, dir, 64*1024, compression.None) + require.NoError(t, err) + enc := record.Encoder{EnableSTStorage: enableSTStorage} + require.NoError(t, w.Log(enc.Series([]record.RefSeries{ + {Ref: 0, Labels: labels.FromStrings("a", "b", "c", "2")}, + }, nil))) + require.NoError(t, w.Close()) - // Corrupt data. - f, err := os.OpenFile(filepath.Join(w.Dir(), "00000000"), os.O_WRONLY, 0o666) - require.NoError(t, err) - _, err = f.WriteAt([]byte{42}, 1) - require.NoError(t, err) - require.NoError(t, f.Close()) + // Corrupt data. + f, err := os.OpenFile(filepath.Join(w.Dir(), "00000000"), os.O_WRONLY, 0o666) + require.NoError(t, err) + _, err = f.WriteAt([]byte{42}, 1) + require.NoError(t, err) + require.NoError(t, f.Close()) - // Run the checkpoint and since the wlog contains corrupt data this should return an error. - _, err = Checkpoint(promslog.NewNopLogger(), w, 0, 1, nil, 0) - require.Error(t, err) + // Run the checkpoint and since the wlog contains corrupt data this should return an error. + _, err = Checkpoint(promslog.NewNopLogger(), w, 0, 1, nil, 0, enableSTStorage) + require.Error(t, err) - // Walk the wlog dir to make sure there are no tmp folder left behind after the error. 
- err = filepath.Walk(w.Dir(), func(path string, info os.FileInfo, err error) error { - if err != nil { - return fmt.Errorf("access err %q: %w", path, err) - } - if info.IsDir() && strings.HasSuffix(info.Name(), ".tmp") { - return fmt.Errorf("wlog dir contains temporary folder:%s", info.Name()) - } - return nil - }) - require.NoError(t, err) + // Walk the wlog dir to make sure there are no tmp folder left behind after the error. + err = filepath.Walk(w.Dir(), func(path string, info os.FileInfo, err error) error { + if err != nil { + return fmt.Errorf("access err %q: %w", path, err) + } + if info.IsDir() && strings.HasSuffix(info.Name(), ".tmp") { + return fmt.Errorf("wlog dir contains temporary folder:%s", info.Name()) + } + return nil + }) + require.NoError(t, err) + }) + } } func TestCheckpointDeletesTemporaryCheckpoints(t *testing.T) { @@ -428,7 +434,7 @@ func TestCheckpointDeletesTemporaryCheckpoints(t *testing.T) { require.NoError(t, err) defer w.Close() - _, err = Checkpoint(promslog.NewNopLogger(), w, 0, 1000, func(_ chunks.HeadSeriesRef) bool { return true }, 1000) + _, err = Checkpoint(promslog.NewNopLogger(), w, 0, 1000, func(_ chunks.HeadSeriesRef) bool { return true }, 1000, false) require.NoError(t, err) files, err := os.ReadDir(dir) diff --git a/tsdb/wlog/watcher.go b/tsdb/wlog/watcher.go index 2eeaf0dd99..aedf16fecf 100644 --- a/tsdb/wlog/watcher.go +++ b/tsdb/wlog/watcher.go @@ -543,7 +543,7 @@ func (w *Watcher) readSegment(r *LiveReader, segmentNum int, tail bool) error { } w.writer.StoreSeries(series, segmentNum) - case record.Samples: + case record.Samples, record.SamplesV2: // If we're not tailing a segment we can ignore any samples records we see. // This speeds up replay of the WAL by > 10x. 
if !tail { diff --git a/tsdb/wlog/watcher_test.go b/tsdb/wlog/watcher_test.go index abf5187b65..6c82ec8dcb 100644 --- a/tsdb/wlog/watcher_test.go +++ b/tsdb/wlog/watcher_test.go @@ -17,6 +17,7 @@ import ( "math/rand" "os" "path" + "path/filepath" "runtime" "sync" "testing" @@ -192,143 +193,146 @@ func TestWatcher_Tail(t *testing.T) { seriesPerBatch = 100 exemplarsPerSeries = 2 ) + for _, enableSTStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s/stStorage=%v", compress, enableSTStorage), func(t *testing.T) { + var ( + now = time.Now() + dir = t.TempDir() + wdir = path.Join(dir, "wal") + enc = record.Encoder{EnableSTStorage: enableSTStorage} + ) + require.NoError(t, os.Mkdir(wdir, 0o777)) - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - var ( - now = time.Now() - dir = t.TempDir() - wdir = path.Join(dir, "wal") - enc record.Encoder - ) - require.NoError(t, os.Mkdir(wdir, 0o777)) + // Generate test records that represents batches of records data. + // "batch" simulates a single scrape or RW/OTLP receive message. + // Watcher does not inspect the data other than watching start timestamp, so records + // does not need any certain shape. + records := make([]testwal.Records, batches) + cbHistogramRecords := make([]testwal.Records, batches) + for i := range records { + tsFn := func(_, _ int) int64 { + return timestamp.FromTime(now.Add(1 * time.Second)) + } + records[i] = testwal.GenerateRecords(testwal.RecordsCase{ + NoST: !enableSTStorage, + RefPadding: i * seriesPerBatch, + TsFn: tsFn, - // Generate test records that represents batches of records data. - // "batch" simulates a single scrape or RW/OTLP receive message. - // Watcher does not inspect the data other than watching start timestamp, so records - // does not need any certain shape. 
- records := make([]testwal.Records, batches) - cbHistogramRecords := make([]testwal.Records, batches) - for i := range records { - tsFn := func(_, _ int) int64 { - return timestamp.FromTime(now.Add(1 * time.Second)) + Series: seriesPerBatch, + SamplesPerSeries: 10, + HistogramsPerSeries: 5, + FloatHistogramsPerSeries: 5, + ExemplarsPerSeries: exemplarsPerSeries, + }) + cbHistogramRecords[i] = testwal.GenerateRecords(testwal.RecordsCase{ + NoST: !enableSTStorage, + RefPadding: i * seriesPerBatch, + TsFn: tsFn, + + Series: seriesPerBatch, + HistogramsPerSeries: 5, + FloatHistogramsPerSeries: 5, + HistogramFn: func(ref int) *histogram.Histogram { + return &histogram.Histogram{ + Schema: -53, + ZeroThreshold: 1e-128, + ZeroCount: 0, + Count: 2, + Sum: 0, + PositiveSpans: []histogram.Span{{Offset: 0, Length: 1}}, + CustomValues: []float64{float64(ref) + 2}, + } + }, + }) } - records[i] = testwal.GenerateRecords(testwal.RecordsCase{ - RefPadding: i * seriesPerBatch, - TsFn: tsFn, - Series: seriesPerBatch, - SamplesPerSeries: 10, - HistogramsPerSeries: 5, - FloatHistogramsPerSeries: 5, - ExemplarsPerSeries: exemplarsPerSeries, + // Create WAL for writing. + w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) + require.NoError(t, err) + t.Cleanup(func() { + require.NoError(t, w.Close()) }) - cbHistogramRecords[i] = testwal.GenerateRecords(testwal.RecordsCase{ - RefPadding: i * seriesPerBatch, - TsFn: tsFn, - Series: seriesPerBatch, - HistogramsPerSeries: 5, - FloatHistogramsPerSeries: 5, - HistogramFn: func(ref int) *histogram.Histogram { - return &histogram.Histogram{ - Schema: -53, - ZeroThreshold: 1e-128, - ZeroCount: 0, - Count: 2, - Sum: 0, - PositiveSpans: []histogram.Span{{Offset: 0, Length: 1}}, - CustomValues: []float64{float64(ref) + 2}, - } - }, - }) - } + // Start watcher to that reads into a mock. 
+ wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "test", wt, dir, true, true, true, nil) + // Update the time because we just created samples around "now" time and watcher + // only starts watching after that time. + watcher.SetStartTime(now) + // Start spins up watcher loop in a go-routine. + watcher.Start() + t.Cleanup(watcher.Stop) - // Create WAL for writing. - w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) - require.NoError(t, err) - t.Cleanup(func() { - require.NoError(t, w.Close()) - }) + // Write to WAL like append commit would do, while watcher is tailing. - // Start watcher to that reads into a mock. - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "test", wt, dir, true, true, true, nil) - // Update the time because we just created samples around "now" time and watcher - // only starts watching after that time. - watcher.SetStartTime(now) - // Start spins up watcher loop in a go-routine. - watcher.Start() - t.Cleanup(watcher.Stop) + // Write first a few samples before the start time, we don't expect those to be appended. + require.NoError(t, w.Log(enc.Samples([]record.RefSample{ + {Ref: 1, T: timestamp.FromTime(now), V: 123}, + {Ref: 2, T: timestamp.FromTime(now), V: 123.1}, + }, nil))) - // Write to WAL like append commit would do, while watcher is tailing. + for i := range records { + // Similar order as tsdb/head_appender.go.headAppenderBase.log + // https://github.com/prometheus/prometheus/blob/1751685dd4f6430757ba3078a96cffeffcb2bb47/tsdb/head_append.go#L1053 + require.NoError(t, w.Log(enc.Series(records[i].Series, nil))) + require.NoError(t, w.Log(enc.Metadata(records[i].Metadata, nil))) + require.NoError(t, w.Log(enc.Samples(records[i].Samples, nil))) - // Write first a few samples before the start time, we don't expect those to be appended. 
- require.NoError(t, w.Log(enc.Samples([]record.RefSample{ - {Ref: 1, T: timestamp.FromTime(now), V: 123}, - {Ref: 2, T: timestamp.FromTime(now), V: 123.1}, - }, nil))) + hs, cbHs := enc.HistogramSamples(records[i].Histograms, nil) + require.Empty(t, cbHs) + require.NoError(t, w.Log(hs)) + fhs, cbFhs := enc.FloatHistogramSamples(records[i].FloatHistograms, nil) + require.Empty(t, cbFhs) + require.NoError(t, w.Log(fhs)) + require.NoError(t, w.Log(enc.CustomBucketsHistogramSamples(cbHistogramRecords[i].Histograms, nil))) + require.NoError(t, w.Log(enc.CustomBucketsFloatHistogramSamples(cbHistogramRecords[i].FloatHistograms, nil))) - for i := range records { - // Similar order as tsdb/head_appender.go.headAppenderBase.log - // https://github.com/prometheus/prometheus/blob/1751685dd4f6430757ba3078a96cffeffcb2bb47/tsdb/head_append.go#L1053 - require.NoError(t, w.Log(enc.Series(records[i].Series, nil))) - require.NoError(t, w.Log(enc.Metadata(records[i].Metadata, nil))) - require.NoError(t, w.Log(enc.Samples(records[i].Samples, nil))) + require.NoError(t, w.Log(enc.Exemplars(records[i].Exemplars, nil))) - hs, cbHs := enc.HistogramSamples(records[i].Histograms, nil) - require.Empty(t, cbHs) - require.NoError(t, w.Log(hs)) - fhs, cbFhs := enc.FloatHistogramSamples(records[i].FloatHistograms, nil) - require.Empty(t, cbFhs) - require.NoError(t, w.Log(fhs)) - require.NoError(t, w.Log(enc.CustomBucketsHistogramSamples(cbHistogramRecords[i].Histograms, nil))) - require.NoError(t, w.Log(enc.CustomBucketsFloatHistogramSamples(cbHistogramRecords[i].FloatHistograms, nil))) + // Ping watcher for faster test. Watcher is checking for segment changes or 15s timeout. + watcher.Notify() + } - require.NoError(t, w.Log(enc.Exemplars(records[i].Exemplars, nil))) + // Wait for watcher to lead all. + require.Eventually(t, func() bool { + wt.mu.Lock() + defer wt.mu.Unlock() - // Ping watcher for faster test. Watcher is checking for segment changes or 15s timeout. 
- watcher.Notify() - } + // Exemplars are logged as the last one, so assert on those. + return wt.exemplarAppends >= batches + }, 2*time.Minute, 1*time.Second) - // Wait for watcher to lead all. - require.Eventually(t, func() bool { wt.mu.Lock() defer wt.mu.Unlock() - // Exemplars are logged as the last one, so assert on those. - return wt.exemplarAppends >= batches - }, 2*time.Minute, 1*time.Second) + require.Equal(t, batches, wt.seriesStores) + require.Equal(t, batches, wt.metadataStores) + require.Equal(t, batches, wt.sampleAppends) + require.Equal(t, 2*batches, wt.histogramAppends) + require.Equal(t, 2*batches, wt.floatHistogramsAppends) + require.Equal(t, batches, wt.exemplarAppends) - wt.mu.Lock() - defer wt.mu.Unlock() + for i := range batches { + sector := len(records[i].Series) + testutil.RequireEqual(t, records[i].Series, wt.seriesStored[i*sector:(i+1)*sector], i) + sector = len(records[i].Metadata) + require.Equal(t, records[i].Metadata, wt.metadataStored[i*sector:(i+1)*sector], i) + sector = len(records[i].Samples) + require.Equal(t, records[i].Samples, wt.samplesAppended[i*sector:(i+1)*sector], i) - require.Equal(t, batches, wt.seriesStores) - require.Equal(t, batches, wt.metadataStores) - require.Equal(t, batches, wt.sampleAppends) - require.Equal(t, 2*batches, wt.histogramAppends) - require.Equal(t, 2*batches, wt.floatHistogramsAppends) - require.Equal(t, batches, wt.exemplarAppends) + sector = len(records[i].Histograms) + len(cbHistogramRecords[i].Histograms) + require.Equal(t, records[i].Histograms, wt.histogramsAppended[i*sector:i*sector+len(records[i].Histograms)], i) + require.Equal(t, cbHistogramRecords[i].Histograms, wt.histogramsAppended[i*sector+len(records[i].Histograms):(i+1)*sector]) + sector = len(records[i].FloatHistograms) + len(cbHistogramRecords[i].FloatHistograms) + require.Equal(t, records[i].FloatHistograms, wt.floatHistogramsAppended[i*sector:i*sector+len(records[i].FloatHistograms)]) + require.Equal(t, 
cbHistogramRecords[i].FloatHistograms, wt.floatHistogramsAppended[i*sector+len(records[i].FloatHistograms):(i+1)*sector]) - for i := range batches { - sector := len(records[i].Series) - testutil.RequireEqual(t, records[i].Series, wt.seriesStored[i*sector:(i+1)*sector], i) - sector = len(records[i].Metadata) - require.Equal(t, records[i].Metadata, wt.metadataStored[i*sector:(i+1)*sector], i) - sector = len(records[i].Samples) - require.Equal(t, records[i].Samples, wt.samplesAppended[i*sector:(i+1)*sector], i) - - sector = len(records[i].Histograms) + len(cbHistogramRecords[i].Histograms) - require.Equal(t, records[i].Histograms, wt.histogramsAppended[i*sector:i*sector+len(records[i].Histograms)], i) - require.Equal(t, cbHistogramRecords[i].Histograms, wt.histogramsAppended[i*sector+len(records[i].Histograms):(i+1)*sector]) - sector = len(records[i].FloatHistograms) + len(cbHistogramRecords[i].FloatHistograms) - require.Equal(t, records[i].FloatHistograms, wt.floatHistogramsAppended[i*sector:i*sector+len(records[i].FloatHistograms)]) - require.Equal(t, cbHistogramRecords[i].FloatHistograms, wt.floatHistogramsAppended[i*sector+len(records[i].FloatHistograms):(i+1)*sector]) - - sector = len(records[i].Exemplars) - testutil.RequireEqual(t, records[i].Exemplars, wt.exemplarsAppended[i*sector:(i+1)*sector]) - } - }) + sector = len(records[i].Exemplars) + testutil.RequireEqual(t, records[i].Exemplars, wt.exemplarsAppended[i*sector:(i+1)*sector]) + } + }) + } } } @@ -337,64 +341,66 @@ func TestReadToEndNoCheckpoint(t *testing.T) { const seriesCount = 10 const samplesCount = 250 - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) - require.NoError(t, err) + for _, enableSTStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, 
enableSTStorage), func(t *testing.T) { + dir := t.TempDir() + wdir := path.Join(dir, "wal") + err := os.Mkdir(wdir, 0o777) + require.NoError(t, err) - w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) - require.NoError(t, err) - defer func() { - require.NoError(t, w.Close()) - }() + w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) + require.NoError(t, err) + defer func() { + require.NoError(t, w.Close()) + }() - var recs [][]byte + var recs [][]byte - enc := record.Encoder{} + enc := record.Encoder{EnableSTStorage: enableSTStorage} - for i := range seriesCount { - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(i), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - recs = append(recs, series) - for j := range samplesCount { - sample := enc.Samples([]record.RefSample{ + for i := range seriesCount { + series := enc.Series([]record.RefSeries{ { - Ref: chunks.HeadSeriesRef(j), - T: int64(i), - V: float64(i), + Ref: chunks.HeadSeriesRef(i), + Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), }, }, nil) + recs = append(recs, series) + for j := range samplesCount { + sample := enc.Samples([]record.RefSample{ + { + Ref: chunks.HeadSeriesRef(j), + T: int64(i), + V: float64(i), + }, + }, nil) - recs = append(recs, sample) + recs = append(recs, sample) - // Randomly batch up records. - if rand.Intn(4) < 3 { - require.NoError(t, w.Log(recs...)) - recs = recs[:0] + // Randomly batch up records. 
+ if rand.Intn(4) < 3 { + require.NoError(t, w.Log(recs...)) + recs = recs[:0] + } } } - } - require.NoError(t, w.Log(recs...)) - overwriteReadTimeout(t, time.Second) - _, _, err = Segments(w.Dir()) - require.NoError(t, err) + require.NoError(t, w.Log(recs...)) + overwriteReadTimeout(t, time.Second) + _, _, err = Segments(w.Dir()) + require.NoError(t, err) - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) - go watcher.Start() + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + go watcher.Start() - expected := seriesCount - require.Eventually(t, func() bool { - return wt.checkNumSeries() == expected - }, 20*time.Second, 1*time.Second) - watcher.Stop() - }) + expected := seriesCount + require.Eventually(t, func() bool { + return wt.checkNumSeries() == expected + }, 20*time.Second, 1*time.Second) + watcher.Stop() + }) + } } } @@ -405,184 +411,119 @@ func TestReadToEndWithCheckpoint(t *testing.T) { const seriesCount = 10 const samplesCount = 250 - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() + for _, enableSTStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { + dir := t.TempDir() - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) - require.NoError(t, err) + wdir := path.Join(dir, "wal") + err := os.Mkdir(wdir, 0o777) + require.NoError(t, err) - enc := record.Encoder{} - w, err := NewSize(nil, nil, wdir, segmentSize, compress) - require.NoError(t, err) - defer func() { - require.NoError(t, w.Close()) - }() + enc := record.Encoder{EnableSTStorage: enableSTStorage} + w, err := NewSize(nil, nil, wdir, segmentSize, compress) + require.NoError(t, err) + defer func() { + require.NoError(t, w.Close()) + }() - // Write to the 
initial segment then checkpoint. - for i := range seriesCount { - ref := i + 100 - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(ref), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - require.NoError(t, w.Log(series)) - // Add in an unknown record type, which should be ignored. - require.NoError(t, w.Log([]byte{255})) - - for range samplesCount { - inner := rand.Intn(ref + 1) - sample := enc.Samples([]record.RefSample{ + // Write to the initial segment then checkpoint. + for i := range seriesCount { + ref := i + 100 + series := enc.Series([]record.RefSeries{ { - Ref: chunks.HeadSeriesRef(inner), - T: int64(i), - V: float64(i), + Ref: chunks.HeadSeriesRef(ref), + Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), }, }, nil) - require.NoError(t, w.Log(sample)) + require.NoError(t, w.Log(series)) + // Add in an unknown record type, which should be ignored. + require.NoError(t, w.Log([]byte{255})) + + for range samplesCount { + inner := rand.Intn(ref + 1) + sample := enc.Samples([]record.RefSample{ + { + Ref: chunks.HeadSeriesRef(inner), + T: int64(i), + V: float64(i), + }, + }, nil) + require.NoError(t, w.Log(sample)) + } } - } - Checkpoint(promslog.NewNopLogger(), w, 0, 1, func(chunks.HeadSeriesRef) bool { return true }, 0) - w.Truncate(1) + Checkpoint(promslog.NewNopLogger(), w, 0, 1, func(chunks.HeadSeriesRef) bool { return true }, 0, enableSTStorage) + w.Truncate(1) - // Write more records after checkpointing. - for i := range seriesCount { - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(i), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - require.NoError(t, w.Log(series)) - - for j := range samplesCount { - sample := enc.Samples([]record.RefSample{ + // Write more records after checkpointing. 
+ for i := range seriesCount { + series := enc.Series([]record.RefSeries{ { - Ref: chunks.HeadSeriesRef(j), - T: int64(i), - V: float64(i), + Ref: chunks.HeadSeriesRef(i), + Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), }, }, nil) - require.NoError(t, w.Log(sample)) + require.NoError(t, w.Log(series)) + + for j := range samplesCount { + sample := enc.Samples([]record.RefSample{ + { + Ref: chunks.HeadSeriesRef(j), + T: int64(i), + V: float64(i), + }, + }, nil) + require.NoError(t, w.Log(sample)) + } } - } - _, _, err = Segments(w.Dir()) - require.NoError(t, err) - overwriteReadTimeout(t, time.Second) - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) - go watcher.Start() + _, _, err = Segments(w.Dir()) + require.NoError(t, err) + overwriteReadTimeout(t, time.Second) + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + go watcher.Start() - expected := seriesCount * 2 + expected := seriesCount * 2 - require.Eventually(t, func() bool { - return wt.checkNumSeries() == expected - }, 10*time.Second, 1*time.Second) - watcher.Stop() - }) + require.Eventually(t, func() bool { + return wt.checkNumSeries() == expected + }, 10*time.Second, 1*time.Second) + watcher.Stop() + }) + } } } func TestReadCheckpoint(t *testing.T) { - t.Parallel() pageSize := 32 * 1024 const seriesCount = 10 const samplesCount = 250 - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() + for _, enableSTStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { + dir := t.TempDir() - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) - require.NoError(t, err) + wdir := path.Join(dir, "wal") + err := os.Mkdir(wdir, 0o777) + require.NoError(t, err) - f, 
err := os.Create(SegmentName(wdir, 30)) - require.NoError(t, err) - require.NoError(t, f.Close()) + f, err := os.Create(SegmentName(wdir, 30)) + require.NoError(t, err) + require.NoError(t, f.Close()) - enc := record.Encoder{} - w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) - require.NoError(t, err) - t.Cleanup(func() { - require.NoError(t, w.Close()) - }) + enc := record.Encoder{EnableSTStorage: enableSTStorage} + w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) + require.NoError(t, err) + t.Cleanup(func() { + require.NoError(t, w.Close()) + }) - // Write to the initial segment then checkpoint. - for i := range seriesCount { - ref := i + 100 - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(ref), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - require.NoError(t, w.Log(series)) - - for range samplesCount { - inner := rand.Intn(ref + 1) - sample := enc.Samples([]record.RefSample{ - { - Ref: chunks.HeadSeriesRef(inner), - T: int64(i), - V: float64(i), - }, - }, nil) - require.NoError(t, w.Log(sample)) - } - } - _, err = w.NextSegmentSync() - require.NoError(t, err) - _, err = Checkpoint(promslog.NewNopLogger(), w, 30, 31, func(chunks.HeadSeriesRef) bool { return true }, 0) - require.NoError(t, err) - require.NoError(t, w.Truncate(32)) - - // Start read after checkpoint, no more data written. 
- _, _, err = Segments(w.Dir()) - require.NoError(t, err) - - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) - go watcher.Start() - - expectedSeries := seriesCount - retry(t, defaultRetryInterval, defaultRetries, func() bool { - return wt.checkNumSeries() >= expectedSeries - }) - watcher.Stop() - require.Equal(t, expectedSeries, wt.checkNumSeries()) - }) - } -} - -func TestReadCheckpointMultipleSegments(t *testing.T) { - pageSize := 32 * 1024 - - const segments = 1 - const seriesCount = 20 - const samplesCount = 300 - - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() - - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) - require.NoError(t, err) - - enc := record.Encoder{} - w, err := NewSize(nil, nil, wdir, pageSize, compress) - require.NoError(t, err) - - // Write a bunch of data. - for i := range segments { - for j := range seriesCount { - ref := j + (i * 100) + // Write to the initial segment then checkpoint. + for i := range seriesCount { + ref := i + 100 series := enc.Series([]record.RefSeries{ { Ref: chunks.HeadSeriesRef(ref), @@ -603,57 +544,132 @@ func TestReadCheckpointMultipleSegments(t *testing.T) { require.NoError(t, w.Log(sample)) } } - } - require.NoError(t, w.Close()) - - // At this point we should have at least 6 segments, lets create a checkpoint dir of the first 5. 
- checkpointDir := dir + "/wal/checkpoint.000004" - err = os.Mkdir(checkpointDir, 0o777) - require.NoError(t, err) - for i := 0; i <= 4; i++ { - err := os.Rename(SegmentName(dir+"/wal", i), SegmentName(checkpointDir, i)) + _, err = w.NextSegmentSync() require.NoError(t, err) - } + _, err = Checkpoint(promslog.NewNopLogger(), w, 30, 31, func(chunks.HeadSeriesRef) bool { return true }, 0, enableSTStorage) + require.NoError(t, err) + require.NoError(t, w.Truncate(32)) - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) - watcher.MaxSegment = -1 + // Start read after checkpoint, no more data written. + _, _, err = Segments(w.Dir()) + require.NoError(t, err) - // Set the Watcher's metrics so they're not nil pointers. - watcher.SetMetrics() + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + go watcher.Start() - lastCheckpoint, _, err := LastCheckpoint(watcher.walDir) - require.NoError(t, err) + expectedSeries := seriesCount + retry(t, defaultRetryInterval, defaultRetries, func() bool { + return wt.checkNumSeries() >= expectedSeries + }) + watcher.Stop() + require.Equal(t, expectedSeries, wt.checkNumSeries()) + }) + } + } +} - err = watcher.readCheckpoint(lastCheckpoint, (*Watcher).readSegment) - require.NoError(t, err) - }) +func TestReadCheckpointMultipleSegments(t *testing.T) { + pageSize := 32 * 1024 + + const segments = 1 + const seriesCount = 40 + const samplesCount = 500 + + for _, enableSTStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { + dir := t.TempDir() + + wdir := path.Join(dir, "wal") + err := os.Mkdir(wdir, 0o777) + require.NoError(t, err) + + enc := record.Encoder{EnableSTStorage: enableSTStorage} + w, err := NewSize(nil, nil, wdir, pageSize, compress) + require.NoError(t, err) + + // Write a bunch of data. 
+ for i := range segments { + for j := range seriesCount { + ref := j + (i * 100) + series := enc.Series([]record.RefSeries{ + { + Ref: chunks.HeadSeriesRef(ref), + Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), + }, + }, nil) + require.NoError(t, w.Log(series)) + + for range samplesCount { + inner := rand.Intn(ref + 1) + sample := enc.Samples([]record.RefSample{ + { + Ref: chunks.HeadSeriesRef(inner), + T: int64(i), + V: float64(i), + }, + }, nil) + require.NoError(t, w.Log(sample)) + } + } + } + require.NoError(t, w.Close()) + + // At this point we should have at least 6 segments, lets create a checkpoint dir of the first 5. + checkpointDir := dir + "/wal/checkpoint.000004" + err = os.Mkdir(checkpointDir, 0o777) + require.NoError(t, err) + for i := 0; i <= 4; i++ { + err := os.Rename(SegmentName(dir+"/wal", i), SegmentName(checkpointDir, i)) + require.NoError(t, err) + } + + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + watcher.MaxSegment = -1 + + // Set the Watcher's metrics so they're not nil pointers. + watcher.SetMetrics() + + lastCheckpoint, _, err := LastCheckpoint(watcher.walDir) + require.NoError(t, err) + + err = watcher.readCheckpoint(lastCheckpoint, (*Watcher).readSegment) + require.NoError(t, err) + }) + } } } func TestCheckpointSeriesReset(t *testing.T) { - segmentSize := 32 * 1024 + segmentSize := 64 * 1024 // We need something similar to this # of series and samples // in order to get enough segments for us to checkpoint. 
- const seriesCount = 20 - const samplesCount = 350 + const seriesCount = 30 + const samplesCount = 700 testCases := []struct { - compress compression.Type - segments int + compress compression.Type + enableSTStorage bool + segments int }{ - {compress: compression.None, segments: 14}, - {compress: compression.Snappy, segments: 13}, + {compress: compression.None, enableSTStorage: false, segments: 24}, + {compress: compression.Snappy, enableSTStorage: false, segments: 23}, + {compress: compression.None, enableSTStorage: true, segments: 20}, + {compress: compression.Snappy, enableSTStorage: true, segments: 20}, } + dir := t.TempDir() for _, tc := range testCases { - t.Run(fmt.Sprintf("compress=%s", tc.compress), func(t *testing.T) { - dir := t.TempDir() - - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", tc.compress, tc.enableSTStorage), func(t *testing.T) { + subdir := filepath.Join(dir, fmt.Sprintf("%s-%v", tc.compress, tc.enableSTStorage)) + err := os.MkdirAll(subdir, 0o777) + require.NoError(t, err) + wdir := filepath.Join(subdir, "wal") + err = os.MkdirAll(wdir, 0o777) require.NoError(t, err) - enc := record.Encoder{} + enc := record.Encoder{EnableSTStorage: tc.enableSTStorage} w, err := NewSize(nil, nil, wdir, segmentSize, tc.compress) require.NoError(t, err) defer func() { @@ -689,7 +705,7 @@ func TestCheckpointSeriesReset(t *testing.T) { overwriteReadTimeout(t, time.Second) wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, subdir, false, false, false, nil) watcher.MaxSegment = -1 go watcher.Start() @@ -701,13 +717,13 @@ func TestCheckpointSeriesReset(t *testing.T) { return wt.checkNumSeries() == seriesCount }, 10*time.Second, 1*time.Second) - _, err = Checkpoint(promslog.NewNopLogger(), w, 2, 4, func(chunks.HeadSeriesRef) bool { return true }, 0) + _, err = Checkpoint(promslog.NewNopLogger(), 
w, 2, 4, func(chunks.HeadSeriesRef) bool { return true }, 0, tc.enableSTStorage) require.NoError(t, err) err = w.Truncate(5) require.NoError(t, err) - _, cpi, err := LastCheckpoint(path.Join(dir, "wal")) + _, cpi, err := LastCheckpoint(wdir) require.NoError(t, err) err = watcher.garbageCollectSeries(cpi + 1) require.NoError(t, err) @@ -724,66 +740,67 @@ } func TestRun_StartupTime(t *testing.T) { - t.Parallel() const pageSize = 32 * 1024 - const segments = 10 - const seriesCount = 20 - const samplesCount = 300 + const segments = 20 + const seriesCount = 40 + const samplesCount = 500 - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() + for _, enableSTStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { + dir := t.TempDir() - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) - require.NoError(t, err) + wdir := path.Join(dir, "wal") + err := os.Mkdir(wdir, 0o777) + require.NoError(t, err) - enc := record.Encoder{} - w, err := NewSize(nil, nil, wdir, pageSize, compress) - require.NoError(t, err) + enc := record.Encoder{EnableSTStorage: enableSTStorage} + w, err := NewSize(nil, nil, wdir, pageSize, compress) + require.NoError(t, err) - for i := range segments { - for j := range seriesCount { - ref := j + (i * 100) - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(ref), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - require.NoError(t, w.Log(series)) - - for range samplesCount { - inner := rand.Intn(ref + 1) - sample := enc.Samples([]record.RefSample{ + for i := range segments { + for j := range seriesCount { + ref := j + (i * 100) + series := enc.Series([]record.RefSeries{ + { + Ref: 
chunks.HeadSeriesRef(ref), + Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), }, }, nil) - require.NoError(t, w.Log(sample)) + require.NoError(t, w.Log(series)) + + for range samplesCount { + inner := rand.Intn(ref + 1) + sample := enc.Samples([]record.RefSample{ + { + Ref: chunks.HeadSeriesRef(inner), + T: int64(i), + V: float64(i), + }, + }, nil) + require.NoError(t, w.Log(sample)) + } } } - } - require.NoError(t, w.Close()) + require.NoError(t, w.Close()) - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) - watcher.MaxSegment = segments + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + watcher.MaxSegment = segments - watcher.SetMetrics() - startTime := time.Now() + watcher.SetMetrics() + startTime := time.Now() - err = watcher.Run() - require.Less(t, time.Since(startTime), readTimeout) - require.NoError(t, err) - }) + err = watcher.Run() + require.Less(t, time.Since(startTime), readTimeout) + require.NoError(t, err) + }) + } } } -func generateWALRecords(w *WL, segment, seriesCount, samplesCount int) error { - enc := record.Encoder{} +func generateWALRecords(w *WL, segment, seriesCount, samplesCount int, enableSTStorage bool) error { + enc := record.Encoder{EnableSTStorage: enableSTStorage} for j := range seriesCount { ref := j + (segment * 100) series := enc.Series([]record.RefSeries{ @@ -823,61 +840,63 @@ func TestRun_AvoidNotifyWhenBehind(t *testing.T) { const seriesCount = 10 const samplesCount = 50 - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() + for _, enableSTStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { + dir := t.TempDir() - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) - 
require.NoError(t, err) + wdir := path.Join(dir, "wal") + err := os.Mkdir(wdir, 0o777) + require.NoError(t, err) - w, err := NewSize(nil, nil, wdir, segmentSize, compress) - require.NoError(t, err) - // Write to 00000000, the watcher will read series from it. - require.NoError(t, generateWALRecords(w, 0, seriesCount, samplesCount)) - // Create 00000001, the watcher will tail it once started. - w.NextSegment() - - // Set up the watcher and run it in the background. - wt := newWriteToMock(time.Millisecond) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) - watcher.SetMetrics() - watcher.MaxSegment = segmentsToRead - - var g errgroup.Group - g.Go(func() error { - startTime := time.Now() - err = watcher.Run() - if err != nil { - return err - } - // If the watcher was to wait for readTicker to read every new segment, it would need readTimeout * segmentsToRead. - d := time.Since(startTime) - if d > readTimeout { - return fmt.Errorf("watcher ran for %s, it shouldn't rely on readTicker=%s to read the new segments", d, readTimeout) - } - return nil - }) - - // The watcher went through 00000000 and is tailing the next one. - retry(t, defaultRetryInterval, defaultRetries, func() bool { - return wt.checkNumSeries() == seriesCount - }) - - // In the meantime, add some new segments in bulk. - // We should end up with segmentsToWrite + 1 segments now. - for i := 1; i < segmentsToWrite; i++ { - require.NoError(t, generateWALRecords(w, i, seriesCount, samplesCount)) + w, err := NewSize(nil, nil, wdir, segmentSize, compress) + require.NoError(t, err) + // Write to 00000000, the watcher will read series from it. + require.NoError(t, generateWALRecords(w, 0, seriesCount, samplesCount, enableSTStorage)) + // Create 00000001, the watcher will tail it once started. w.NextSegment() - } - // Wait for the watcher. - require.NoError(t, g.Wait()) + // Set up the watcher and run it in the background. 
+ wt := newWriteToMock(time.Millisecond) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + watcher.SetMetrics() + watcher.MaxSegment = segmentsToRead - // All series and samples were read. - require.Equal(t, (segmentsToRead+1)*seriesCount, wt.checkNumSeries()) // Series from 00000000 are also read. - require.Len(t, wt.samplesAppended, segmentsToRead*seriesCount*samplesCount) - require.NoError(t, w.Close()) - }) + var g errgroup.Group + g.Go(func() error { + startTime := time.Now() + err = watcher.Run() + if err != nil { + return err + } + // If the watcher was to wait for readTicker to read every new segment, it would need readTimeout * segmentsToRead. + d := time.Since(startTime) + if d > readTimeout { + return fmt.Errorf("watcher ran for %s, it shouldn't rely on readTicker=%s to read the new segments", d, readTimeout) + } + return nil + }) + + // The watcher went through 00000000 and is tailing the next one. + retry(t, defaultRetryInterval, defaultRetries, func() bool { + return wt.checkNumSeries() == seriesCount + }) + + // In the meantime, add some new segments in bulk. + // We should end up with segmentsToWrite + 1 segments now. + for i := 1; i < segmentsToWrite; i++ { + require.NoError(t, generateWALRecords(w, i, seriesCount, samplesCount, enableSTStorage)) + w.NextSegment() + } + + // Wait for the watcher. + require.NoError(t, g.Wait()) + + // All series and samples were read. + require.Equal(t, (segmentsToRead+1)*seriesCount, wt.checkNumSeries()) // Series from 00000000 are also read. + require.Len(t, wt.samplesAppended, segmentsToRead*seriesCount*samplesCount) + require.NoError(t, w.Close()) + }) + } } } diff --git a/util/kahansum/kahansum.go b/util/kahansum/kahansum.go index d55defcb29..b9a02889b3 100644 --- a/util/kahansum/kahansum.go +++ b/util/kahansum/kahansum.go @@ -16,10 +16,21 @@ package kahansum import "math" // Inc performs addition of two floating-point numbers using the Kahan summation algorithm. 
-// We get incorrect results if this function is inlined; see https://github.com/prometheus/prometheus/issues/16714. -// -//go:noinline func Inc(inc, sum, c float64) (newSum, newC float64) { + // We've seen Kahan summation return less accurate results when Inc function is + // allowed to be inlined (see https://github.com/prometheus/prometheus/pull/16895). + // Go permits fusing float operations (e.g. using fused multiply-add, which allows + // calculating a*b+c without rounding the result of a*b to precision available in float64), + // and Kahan sum is sensitive to float rounding behavior. Instead of forbidding inlining + // (which only disallows fusing operations outside of Inc with operations happening inside) + // and eating the performance cost of non-inlined function calls, we forbid just the fusing + // across Inc call boundary. We can do that by explicitly requesting Inc arguments and results + // to be rounded to float64 precision, as documented in go spec (https://go.dev/ref/spec#Floating_point_operators). + // The following casts are not no-ops! + inc = float64(inc) + sum = float64(sum) + c = float64(c) + t := sum + inc switch { case math.IsInf(t, 0): @@ -31,6 +42,9 @@ func Inc(inc, sum, c float64) (newSum, newC float64) { default: c += (inc - t) + sum } + + t = float64(t) + c = float64(c) return t, c } diff --git a/util/testrecord/record.go b/util/testrecord/record.go new file mode 100644 index 0000000000..e5071d42c8 --- /dev/null +++ b/util/testrecord/record.go @@ -0,0 +1,96 @@ +// Copyright 2025 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testrecord + +import ( + "math" + "testing" + + "github.com/prometheus/prometheus/tsdb/chunks" + "github.com/prometheus/prometheus/tsdb/record" +) + +type RefSamplesCase string + +const ( + Realistic1000Samples RefSamplesCase = "real1000" + Realistic1000WithVariableSTSamples RefSamplesCase = "real1000-vst" + Realistic1000WithConstSTSamples RefSamplesCase = "real1000-cst" + WorstCase1000 RefSamplesCase = "worst1000" + WorstCase1000WithSTSamples RefSamplesCase = "worst1000-st" +) + +func GenTestRefSamplesCase(t testing.TB, c RefSamplesCase) []record.RefSample { + t.Helper() + + ret := make([]record.RefSample, 1e3) + switch c { + // Samples are across series, so likely all have the same timestamp. + case Realistic1000Samples: + for i := range ret { + ret[i].Ref = chunks.HeadSeriesRef(i) + ret[i].T = int64(12423423) + ret[i].V = highVarianceFloat(i) + } + // Likely the start times will all be the same with deltas. 
+ case Realistic1000WithConstSTSamples: + for i := range ret { + ret[i].Ref = chunks.HeadSeriesRef(i) + ret[i].ST = int64(12423423) + ret[i].T = int64(12423423 + 15) + ret[i].V = highVarianceFloat(i) + } + // Maybe series have different start times though + case Realistic1000WithVariableSTSamples: + for i := range ret { + ret[i].Ref = chunks.HeadSeriesRef(i) + ret[i].ST = int64((12423423 / 9) * (i % 10)) + ret[i].T = int64(12423423) + ret[i].V = highVarianceFloat(i) + } + case WorstCase1000: + for i := range ret { + ret[i].Ref = chunks.HeadSeriesRef(i) + ret[i].T = highVarianceInt(i) + ret[i].V = highVarianceFloat(i) + } + case WorstCase1000WithSTSamples: + for i := range ret { + ret[i].Ref = chunks.HeadSeriesRef(i) + + // Worst case is when the values are significantly different + // to each other which breaks delta encoding. + ret[i].ST = highVarianceInt(i+1) / 1024 // Make sure ST is not comparable to T + ret[i].T = highVarianceInt(i) + ret[i].V = highVarianceFloat(i) + } + default: + t.Fatal("unknown case", c) + } + return ret +} + +func highVarianceInt(i int) int64 { + if i%2 == 0 { + return math.MinInt32 + } + return math.MaxInt32 +} + +func highVarianceFloat(i int) float64 { + if i%2 == 0 { + return math.SmallestNonzeroFloat32 + } + return math.MaxFloat32 +} diff --git a/util/testwal/records.go b/util/testwal/records.go index 5f85e42c3c..1fe5938461 100644 --- a/util/testwal/records.go +++ b/util/testwal/records.go @@ -48,6 +48,8 @@ type RecordsCase struct { // HistogramFn source histogram for histogram and float histogram records. // By default, newTestHist is used (exponential bucketing) HistogramFn func(ref int) *histogram.Histogram + // NoST controls if ref samples should skip generating Start Timestamps. If true, ST is 0. + NoST bool } // Records represents batches of generated WAL records. 
@@ -118,10 +120,18 @@ func GenerateRecords(c RecordsCase) (ret Records) { Help: fmt.Sprintf("help text for %d", ref), } for j := range c.SamplesPerSeries { + ts := c.TsFn(ref, j) + // Keep ST simple for now; we don't test the exact semantics. + // We can improve later (e.g. STsFN). + sts := ts - 1 + if c.NoST { + sts = 0 + } + ret.Samples[i*c.SamplesPerSeries+j] = record.RefSample{ Ref: chunks.HeadSeriesRef(ref), - T: c.TsFn(ref, j), - V: float64(ref), + ST: sts, T: ts, + V: float64(ref), } } h := c.HistogramFn(ref) diff --git a/web/ui/mantine-ui/src/promql/functionDocs.tsx b/web/ui/mantine-ui/src/promql/functionDocs.tsx index c7f744ba6f..75a4a767a7 100644 --- a/web/ui/mantine-ui/src/promql/functionDocs.tsx +++ b/web/ui/mantine-ui/src/promql/functionDocs.tsx @@ -1257,7 +1257,7 @@ const funcDocs: Record = { <>

histogram_avg(v instant-vector) returns the arithmetic average of observed values stored in each - histogram sample in v. Float samples are ignored and do not show up in the returned vector. + native histogram sample in v. Float samples are ignored and do not show up in the returned vector.

@@ -1283,13 +1283,13 @@ const funcDocs: Record = { histogram_count: ( <>

- histogram_count(v instant-vector) returns the count of observations stored in each histogram sample - in v. Float samples are ignored and do not show up in the returned vector. + histogram_count(v instant-vector) returns the count of observations stored in each native histogram + sample in v. Float samples are ignored and do not show up in the returned vector.

- Similarly, histogram_sum(v instant-vector) returns the sum of observations stored in each histogram - sample. + Similarly, histogram_sum(v instant-vector) returns the sum of observations stored in each native + histogram sample.

@@ -1574,15 +1574,15 @@ const funcDocs: Record = { <>

histogram_stddev(v instant-vector) returns the estimated standard deviation of observations for - each histogram sample in v. For this estimation, all observations in a bucket are assumed to have - the value of the mean of the bucket boundaries. For the zero bucket and for buckets with custom boundaries, the - arithmetic mean is used. For the usual exponential buckets, the geometric mean is used. Float samples are + each native histogram sample in v. For this estimation, all observations in a bucket are assumed to + have the value of the mean of the bucket boundaries. For the zero bucket and for buckets with custom boundaries, + the arithmetic mean is used. For the usual exponential buckets, the geometric mean is used. Float samples are ignored and do not show up in the returned vector.

Similarly, histogram_stdvar(v instant-vector) returns the estimated standard variance of - observations for each histogram sample in v. + observations for each native histogram sample in v.

), @@ -1590,28 +1590,28 @@ const funcDocs: Record = { <>

histogram_stddev(v instant-vector) returns the estimated standard deviation of observations for - each histogram sample in v. For this estimation, all observations in a bucket are assumed to have - the value of the mean of the bucket boundaries. For the zero bucket and for buckets with custom boundaries, the - arithmetic mean is used. For the usual exponential buckets, the geometric mean is used. Float samples are + each native histogram sample in v. For this estimation, all observations in a bucket are assumed to + have the value of the mean of the bucket boundaries. For the zero bucket and for buckets with custom boundaries, + the arithmetic mean is used. For the usual exponential buckets, the geometric mean is used. Float samples are ignored and do not show up in the returned vector.

Similarly, histogram_stdvar(v instant-vector) returns the estimated standard variance of - observations for each histogram sample in v. + observations for each native histogram sample in v.

), histogram_sum: ( <>

- histogram_count(v instant-vector) returns the count of observations stored in each histogram sample - in v. Float samples are ignored and do not show up in the returned vector. + histogram_count(v instant-vector) returns the count of observations stored in each native histogram + sample in v. Float samples are ignored and do not show up in the returned vector.

- Similarly, histogram_sum(v instant-vector) returns the sum of observations stored in each histogram - sample. + Similarly, histogram_sum(v instant-vector) returns the sum of observations stored in each native + histogram sample.

diff --git a/web/ui/module/codemirror-promql/src/complete/promql.terms.ts b/web/ui/module/codemirror-promql/src/complete/promql.terms.ts index 645b507855..7fb89bf062 100644 --- a/web/ui/module/codemirror-promql/src/complete/promql.terms.ts +++ b/web/ui/module/codemirror-promql/src/complete/promql.terms.ts @@ -317,10 +317,16 @@ export const functionIdentifierTerms = [ info: 'Join together label values into new label', type: 'function', }, + { + label: 'first_over_time', + detail: 'function', + info: 'Return the value of the oldest sample in the specified interval', + type: 'function', + }, { label: 'last_over_time', detail: 'function', - info: 'The most recent point value in specified interval.', + info: 'Return the value of the most recent sample in the specified interval', type: 'function', }, { @@ -371,6 +377,12 @@ export const functionIdentifierTerms = [ info: 'Return the timestamp of the minimum value over time for input series', type: 'function', }, + { + label: 'ts_of_first_over_time', + detail: 'function', + info: 'Return the timestamp of the first value over time for input series', + type: 'function', + }, { label: 'ts_of_last_over_time', detail: 'function', diff --git a/web/web.go b/web/web.go index 90eaf13afe..c4fcfdb2c4 100644 --- a/web/web.go +++ b/web/web.go @@ -253,6 +253,11 @@ func (h *Handler) ApplyConfig(conf *config.Config) error { defer h.mtx.Unlock() h.config = conf + if conf.StorageConfig.TSDBConfig != nil && conf.StorageConfig.TSDBConfig.Retention != nil { + h.options.TSDBRetentionDuration = conf.StorageConfig.TSDBConfig.Retention.Time + h.options.TSDBMaxBytes = conf.StorageConfig.TSDBConfig.Retention.Size + h.options.TSDBMaxPercentage = conf.StorageConfig.TSDBConfig.Retention.Percentage + } return nil } @@ -866,20 +871,25 @@ func (h *Handler) runtimeInfo() (api_v1.RuntimeInfo, error) { status.Hostname = hostname status.ServerTime = time.Now().UTC() - if h.options.TSDBRetentionDuration != 0 { - status.StorageRetention = 
h.options.TSDBRetentionDuration.String() + h.mtx.RLock() + tsdbRetentionDuration := h.options.TSDBRetentionDuration + tsdbMaxBytes := h.options.TSDBMaxBytes + tsdbMaxPercentage := h.options.TSDBMaxPercentage + h.mtx.RUnlock() + if tsdbRetentionDuration != 0 { + status.StorageRetention = tsdbRetentionDuration.String() } - if h.options.TSDBMaxBytes != 0 { + if tsdbMaxBytes != 0 { if status.StorageRetention != "" { status.StorageRetention += " or " } - status.StorageRetention += h.options.TSDBMaxBytes.String() + status.StorageRetention += tsdbMaxBytes.String() } - if h.options.TSDBMaxPercentage != 0 { + if tsdbMaxPercentage != 0 { if status.StorageRetention != "" { status.StorageRetention += " or " } - status.StorageRetention = status.StorageRetention + strconv.FormatUint(uint64(h.options.TSDBMaxPercentage), 10) + "%" + status.StorageRetention = status.StorageRetention + strconv.FormatUint(uint64(tsdbMaxPercentage), 10) + "%" } metrics, err := h.gatherer.Gather()