From 4b8fb76d95fc9936a4d1e050b3e8310d31b05724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 21 Jan 2026 16:55:11 +0100 Subject: [PATCH 01/73] feat(tsdb/chunkenc): add float chunk format with start timestamp support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/benchmark_test.go | 342 ++++++++++++++ tsdb/chunkenc/chunk.go | 16 +- tsdb/chunkenc/chunk_test.go | 165 ++----- tsdb/chunkenc/st_helper_test.go | 124 +++++ tsdb/chunkenc/varbit_classic.go | 103 ++++ tsdb/chunkenc/varbit_classic_test.go | 57 +++ tsdb/chunkenc/xoroptst.go | 681 +++++++++++++++++++++++++++ tsdb/chunkenc/xoroptst_test.go | 108 +++++ 8 files changed, 1466 insertions(+), 130 deletions(-) create mode 100644 tsdb/chunkenc/benchmark_test.go create mode 100644 tsdb/chunkenc/st_helper_test.go create mode 100644 tsdb/chunkenc/varbit_classic.go create mode 100644 tsdb/chunkenc/varbit_classic_test.go create mode 100644 tsdb/chunkenc/xoroptst.go create mode 100644 tsdb/chunkenc/xoroptst_test.go diff --git a/tsdb/chunkenc/benchmark_test.go b/tsdb/chunkenc/benchmark_test.go new file mode 100644 index 0000000000..cc69725858 --- /dev/null +++ b/tsdb/chunkenc/benchmark_test.go @@ -0,0 +1,342 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package chunkenc + +import ( + "errors" + "fmt" + "io" + "math" + "math/rand" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/stretchr/testify/require" + + "github.com/prometheus/prometheus/model/timestamp" +) + +type sampleCase struct { + name string + samples []triple +} + +type fmtCase struct { + name string + newChunkFn func() Chunk +} + +func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampleCase)) { + const nSamples = 120 // Same as tsdb.DefaultSamplesPerChunk. + + d, err := time.Parse(time.DateTime, "2025-11-04 10:01:05") + require.NoError(b, err) + + var ( + r = rand.New(rand.NewSource(1)) + initST = timestamp.FromTime(d) // Use realistic timestamp. + initT = initST + 15000 // 15s after initST. + initV = 1243535.123 + ) + + sampleCases := []sampleCase{ + { + name: "vt=constant/st=0", + samples: func() (ret []triple) { + t, v := initT, initV + for range nSamples { + t += 15000 + ret = append(ret, triple{st: 0, t: t, v: v}) + } + return ret + }(), + }, + + { + // Cumulative with a constant ST through the whole chunk, typical case (e.g. long counting counter). + name: "vt=constant/st=cumulative", + samples: func() (ret []triple) { + t, v := initT, initV + for range nSamples { + t += 15000 + ret = append(ret, triple{st: initST, t: t, v: v}) + } + return ret + }(), + }, + { + // Delta simulates delta type or worst case for cumulatives, where ST + // is changing on every sample. + name: "vt=constant/st=delta", + samples: func() (ret []triple) { + t, v := initT, initV + for range nSamples { + st := t + 1 // ST is a tight interval after the last t+1ms. + t += 15000 + ret = append(ret, triple{st: st, t: t, v: v}) + } + return ret + }(), + }, + { + name: "vt=random steps/st=0", + samples: func() (ret []triple) { + t, v := initT, initV + for range nSamples { + t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. + v += float64(r.Intn(100) - 50) // Varying from -50 to +50 in 100 discrete steps. 
+ ret = append(ret, triple{st: 0, t: t, v: v}) + } + return ret + }(), + }, + { + name: "vt=random steps/st=cumulative", + samples: func() (ret []triple) { + t, v := initT, initV + for range nSamples { + t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. + v += float64(r.Intn(100) - 50) // Varying from -50 to +50 in 100 discrete steps. + ret = append(ret, triple{st: initST, t: t, v: v}) + } + return ret + }(), + }, + { + name: "vt=random steps/st=delta", + samples: func() (ret []triple) { + t, v := initT, initV + for range nSamples { + st := t + 1 // ST is a tight interval after the last t+1ms. + t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. + v += float64(r.Intn(100) - 50) // Varying from -50 to +50 in 100 discrete steps. + ret = append(ret, triple{st: st, t: t, v: v}) + } + return ret + }(), + }, + { + name: "vt=random 0-1/st=0", + samples: func() (ret []triple) { + t, v := initT, initV + for range nSamples { + t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. + v += r.Float64() // Random between 0 and 1.0. + ret = append(ret, triple{st: 0, t: t, v: v}) + } + return ret + }(), + }, + { + // Are we impacted by https://victoriametrics.com/blog/go-protobuf/ negative varint issue? (zig-zag needed?) + name: "vt=negrandom 0-1/st=0", + samples: func() (ret []triple) { + t, v := initT, initV + for range nSamples { + t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. + v -= r.Float64() // Random between 0 and 1.0. + ret = append(ret, triple{st: 0, t: t, v: v}) + } + return ret + }(), + }, + { + name: "vt=random 0-1/st=cumulative", + samples: func() (ret []triple) { + t, v := initT, initV + for range nSamples { + t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. + v += r.Float64() // Random between 0 and 1.0. 
+ ret = append(ret, triple{st: initST, t: t, v: v}) + } + return ret + }(), + }, + { + name: "vt=random 0-1/st=cumulative-periodic-resets", + samples: func() (ret []triple) { + t, v := initT, initV + for i := range nSamples { + t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. + v += r.Float64() // Random between 0 and 1.0. + st := initST + if i%6 == 5 { + st = t - 10000 // Reset of 10s before current t. + } + ret = append(ret, triple{st: st, t: t, v: v}) + } + return ret + }(), + }, + { + name: "vt=random 0-1/st=cumulative-periodic-zeros", + samples: func() (ret []triple) { + t, v := initT, initV + for i := range nSamples { + t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. + v += r.Float64() // Random between 0 and 1.0. + st := initST + if i%6 == 5 { + st = 0 + } + ret = append(ret, triple{st: st, t: t, v: v}) + } + return ret + }(), + }, + { + name: "vt=random 0-1/st=delta", + samples: func() (ret []triple) { + t, v := initT, initV + for range nSamples { + st := t + 1 // ST is a tight interval after the last t+1ms. + t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. + v += r.Float64() // Random between 0 and 1.0. 
+ ret = append(ret, triple{st: st, t: t, v: v}) + } + return ret + }(), + }, + } + + for _, f := range []fmtCase{ + {name: "XOR", newChunkFn: func() Chunk { return NewXORChunk() }}, + {name: "XOR_OPT_ST", newChunkFn: func() Chunk { return NewXOROptSTChunk() }}, + } { + for _, s := range sampleCases { + b.Run(fmt.Sprintf("fmt=%s/%s", f.name, s.name), func(b *testing.B) { + fn(b, f, s) + }) + } + } +} + +/* + export bench=bw.bench/append.v2 && go test \ + -run '^$' -bench '^BenchmarkAppender' \ + -benchtime 1s -count 6 -cpu 2 -timeout 999m \ + | tee ${bench}.txt + +For profiles: + + export bench=bw.bench/appendprof && go test \ + -run '^$' -bench '^BenchmarkAppender' \ + -benchtime 1s -count 1 -cpu 2 -timeout 999m \ + -cpuprofile=${bench}.cpu.pprof \ + | tee ${bench}.txt +*/ +func BenchmarkAppender(b *testing.B) { + foreachFmtSampleCase(b, func(b *testing.B, f fmtCase, s sampleCase) { + b.ReportAllocs() + + for b.Loop() { + c := f.newChunkFn() + + a, err := c.Appender() + if err != nil { + b.Fatalf("get appender: %s", err) + } + for _, p := range s.samples { + a.Append(p.st, p.t, p.v) + } + // NOTE: Some buffered implementations only encode on Bytes(). 
+ b.ReportMetric(float64(len(c.Bytes())), "B/chunk") + + require.Equal(b, len(s.samples), c.NumSamples()) + } + }) +} + +/* + export bench=bw.bench/iter && go test \ + -run '^$' -bench '^BenchmarkIterator' \ + -benchtime 1s -count 6 -cpu 2 -timeout 999m \ + | tee ${bench}.txt + +For profiles: + + export bench=bw.bench/iterprof && go test \ + -run '^$' -bench '^BenchmarkIterator' \ + -benchtime 1000000x -count 1 -cpu 2 -timeout 999m \ + -cpuprofile=${bench}.cpu.pprof \ + | tee ${bench}.txt + export bench=bw.bench/iterprof && go test \ + -run '^$' -bench '^BenchmarkIterator' \ + -benchtime 1000000x -count 1 -cpu 2 -timeout 999m \ + -memprofile=${bench}.mem.pprof \ + | tee ${bench}.txt +*/ +func BenchmarkIterator(b *testing.B) { + foreachFmtSampleCase(b, func(b *testing.B, f fmtCase, s sampleCase) { + floatEquals := func(a, b float64) bool { + return a == b + } + if f.name == "ALPBuffered" { + // Hack as ALP loses precision. + floatEquals = func(a, b float64) bool { + return math.Abs(a-b) < 1e-6 + } + } + b.ReportAllocs() + + c := f.newChunkFn() + a, err := c.Appender() + if err != nil { + b.Fatalf("get appender: %s", err) + } + for _, p := range s.samples { + a.Append(p.st, p.t, p.v) + } + + // Some chunk implementations might be buffered. Reset to ensure we don't reuse + // appending buffers. + c.Reset(c.Bytes()) + + // While we are at it, test if encoding/decoding works. + it := c.Iterator(nil) + require.Equal(b, len(s.samples), c.NumSamples()) + var got []triple + for i := 0; it.Next() == ValFloat; i++ { + t, v := it.At() + got = append(got, triple{st: it.AtST(), t: t, v: v}) + } + if err := it.Err(); err != nil && !errors.Is(err, io.EOF) { + require.NoError(b, err) + } + if diff := cmp.Diff(s.samples, got, cmp.AllowUnexported(triple{}), cmp.Comparer(floatEquals)); diff != "" { + b.Fatalf("mismatch (-want +got):\n%s", diff) + } + + var sink float64 + // Measure decoding efficiency. + for i := 0; b.Loop(); { + // Some chunk implementations might be buffered. 
Reset to ensure we don't reuse + // previous decoded data. + c.Reset(c.Bytes()) + b.ReportMetric(float64(len(c.Bytes())), "B/chunk") + + it := c.Iterator(it) + for it.Next() == ValFloat { + _, v := it.At() + sink = v + i++ + } + if err := it.Err(); err != nil && !errors.Is(err, io.EOF) { + require.NoError(b, err) + } + _ = sink + } + }) +} diff --git a/tsdb/chunkenc/chunk.go b/tsdb/chunkenc/chunk.go index 711966ec39..71f38e7a7e 100644 --- a/tsdb/chunkenc/chunk.go +++ b/tsdb/chunkenc/chunk.go @@ -30,6 +30,7 @@ const ( EncXOR EncHistogram EncFloatHistogram + EncXOROptST ) func (e Encoding) String() string { @@ -42,13 +43,15 @@ func (e Encoding) String() string { return "histogram" case EncFloatHistogram: return "floathistogram" + case EncXOROptST: + return "XOR-start-timestamp" } return "" } // IsValidEncoding returns true for supported encodings. func IsValidEncoding(e Encoding) bool { - return e == EncXOR || e == EncHistogram || e == EncFloatHistogram + return e == EncXOR || e == EncHistogram || e == EncFloatHistogram || e == EncXOROptST } const ( @@ -299,6 +302,7 @@ type pool struct { xor sync.Pool histogram sync.Pool floatHistogram sync.Pool + xoroptst sync.Pool } // NewPool returns a new pool. 
@@ -319,6 +323,11 @@ func NewPool() Pool { return &FloatHistogramChunk{b: bstream{}} }, }, + xoroptst: sync.Pool{ + New: func() any { + return &XorOptSTChunk{b: bstream{}} + }, + }, } } @@ -331,6 +340,8 @@ func (p *pool) Get(e Encoding, b []byte) (Chunk, error) { c = p.histogram.Get().(*HistogramChunk) case EncFloatHistogram: c = p.floatHistogram.Get().(*FloatHistogramChunk) + case EncXOROptST: + c = p.xoroptst.Get().(*XorOptSTChunk) default: return nil, fmt.Errorf("invalid chunk encoding %q", e) } @@ -352,6 +363,9 @@ func (p *pool) Put(c Chunk) error { case EncFloatHistogram: _, ok = c.(*FloatHistogramChunk) sp = &p.floatHistogram + case EncXOROptST: + _, ok = c.(*XorOptSTChunk) + sp = &p.xoroptst default: return fmt.Errorf("invalid chunk encoding %q", c.Encoding()) } diff --git a/tsdb/chunkenc/chunk_test.go b/tsdb/chunkenc/chunk_test.go index 41bb23ddd1..92fa3cab38 100644 --- a/tsdb/chunkenc/chunk_test.go +++ b/tsdb/chunkenc/chunk_test.go @@ -16,36 +16,41 @@ package chunkenc import ( "errors" "fmt" - "io" "math/rand" "testing" "github.com/stretchr/testify/require" ) -type pair struct { - t int64 - v float64 +type triple struct { + st, t int64 + v float64 } func TestChunk(t *testing.T) { - for enc, nc := range map[Encoding]func() Chunk{ - EncXOR: func() Chunk { return NewXORChunk() }, - } { - t.Run(fmt.Sprintf("%v", enc), func(t *testing.T) { + testcases := []struct { + encoding Encoding + supportsST bool + factory func() Chunk + }{ + {encoding: EncXOR, supportsST: false, factory: func() Chunk { return NewXORChunk() }}, + {encoding: EncXOROptST, supportsST: true, factory: func() Chunk { return NewXOROptSTChunk() }}, + } + for _, tc := range testcases { + t.Run(fmt.Sprintf("%v", tc.encoding), func(t *testing.T) { for range make([]struct{}, 1) { - c := nc() - testChunk(t, c) + c := tc.factory() + testChunk(t, c, tc.supportsST) } }) } } -func testChunk(t *testing.T, c Chunk) { +func testChunk(t *testing.T, c Chunk, supportsST bool) { app, err := c.Appender() 
require.NoError(t, err) - var exp []pair + var exp []triple var ( ts = int64(1234123324) v = 1243535.123 @@ -65,26 +70,30 @@ func testChunk(t *testing.T, c Chunk) { require.NoError(t, err) } - app.Append(0, ts, v) - exp = append(exp, pair{t: ts, v: v}) + app.Append(ts-100, ts, v) + expST := int64(0) + if supportsST { + expST = ts - 100 + } + exp = append(exp, triple{st: expST, t: ts, v: v}) } // 1. Expand iterator in simple case. it1 := c.Iterator(nil) - var res1 []pair + var res1 []triple for it1.Next() == ValFloat { ts, v := it1.At() - res1 = append(res1, pair{t: ts, v: v}) + res1 = append(res1, triple{st: it1.AtST(), t: ts, v: v}) } require.NoError(t, it1.Err()) require.Equal(t, exp, res1) // 2. Expand second iterator while reusing first one. it2 := c.Iterator(it1) - var res2 []pair + var res2 []triple for it2.Next() == ValFloat { ts, v := it2.At() - res2 = append(res2, pair{t: ts, v: v}) + res2 = append(res2, triple{st: it2.AtST(), t: ts, v: v}) } require.NoError(t, it2.Err()) require.Equal(t, exp, res2) @@ -93,17 +102,17 @@ func testChunk(t *testing.T, c Chunk) { mid := len(exp) / 2 it3 := c.Iterator(nil) - var res3 []pair + var res3 []triple require.Equal(t, ValFloat, it3.Seek(exp[mid].t)) // Below ones should not matter. 
require.Equal(t, ValFloat, it3.Seek(exp[mid].t)) require.Equal(t, ValFloat, it3.Seek(exp[mid].t)) ts, v = it3.At() - res3 = append(res3, pair{t: ts, v: v}) + res3 = append(res3, triple{st: it3.AtST(), t: ts, v: v}) for it3.Next() == ValFloat { ts, v := it3.At() - res3 = append(res3, pair{t: ts, v: v}) + res3 = append(res3, triple{st: it3.AtST(), t: ts, v: v}) } require.NoError(t, it3.Err()) require.Equal(t, exp[mid:], res3) @@ -129,6 +138,10 @@ func TestPool(t *testing.T) { name: "float histogram", encoding: EncFloatHistogram, }, + { + name: "xor opt st", + encoding: EncXOROptST, + }, { name: "invalid encoding", encoding: EncNone, @@ -150,6 +163,8 @@ func TestPool(t *testing.T) { b = &c.(*HistogramChunk).b case EncFloatHistogram: b = &c.(*FloatHistogramChunk).b + case EncXOROptST: + b = &c.(*XorOptSTChunk).b default: b = &c.(*XORChunk).b } @@ -199,111 +214,3 @@ func (c fakeChunk) Encoding() Encoding { func (c fakeChunk) Reset([]byte) { c.t.Fatal("Reset should not be called") } - -func benchmarkIterator(b *testing.B, newChunk func() Chunk) { - const samplesPerChunk = 250 - var ( - t = int64(1234123324) - v = 1243535.123 - exp []pair - ) - for range samplesPerChunk { - // t += int64(rand.Intn(10000) + 1) - t += int64(1000) - // v = rand.Float64() - v += float64(100) - exp = append(exp, pair{t: t, v: v}) - } - - chunk := newChunk() - { - a, err := chunk.Appender() - if err != nil { - b.Fatalf("get appender: %s", err) - } - j := 0 - for _, p := range exp { - if j > 250 { - break - } - a.Append(0, p.t, p.v) - j++ - } - } - - b.ReportAllocs() - - var res float64 - var it Iterator - for i := 0; b.Loop(); { - it := chunk.Iterator(it) - - for it.Next() == ValFloat { - _, v := it.At() - res = v - i++ - } - if err := it.Err(); err != nil && !errors.Is(err, io.EOF) { - require.NoError(b, err) - } - _ = res - } -} - -func newXORChunk() Chunk { - return NewXORChunk() -} - -func BenchmarkXORIterator(b *testing.B) { - benchmarkIterator(b, newXORChunk) -} - -func 
BenchmarkXORAppender(b *testing.B) { - r := rand.New(rand.NewSource(1)) - b.Run("constant", func(b *testing.B) { - benchmarkAppender(b, func() (int64, float64) { - return 1000, 0 - }, newXORChunk) - }) - b.Run("random steps", func(b *testing.B) { - benchmarkAppender(b, func() (int64, float64) { - return int64(r.Intn(100) - 50 + 15000), // 15 seconds +- up to 100ms of jitter. - float64(r.Intn(100) - 50) // Varying from -50 to +50 in 100 discrete steps. - }, newXORChunk) - }) - b.Run("random 0-1", func(b *testing.B) { - benchmarkAppender(b, func() (int64, float64) { - return int64(r.Intn(100) - 50 + 15000), // 15 seconds +- up to 100ms of jitter. - r.Float64() // Random between 0 and 1.0. - }, newXORChunk) - }) -} - -func benchmarkAppender(b *testing.B, deltas func() (int64, float64), newChunk func() Chunk) { - var ( - t = int64(1234123324) - v = 1243535.123 - ) - const nSamples = 120 // Same as tsdb.DefaultSamplesPerChunk. - var exp []pair - for range nSamples { - dt, dv := deltas() - t += dt - v += dv - exp = append(exp, pair{t: t, v: v}) - } - - b.ReportAllocs() - - for b.Loop() { - c := newChunk() - - a, err := c.Appender() - if err != nil { - b.Fatalf("get appender: %s", err) - } - for _, p := range exp { - a.Append(0, p.t, p.v) - } - } -} diff --git a/tsdb/chunkenc/st_helper_test.go b/tsdb/chunkenc/st_helper_test.go new file mode 100644 index 0000000000..7f657a4293 --- /dev/null +++ b/tsdb/chunkenc/st_helper_test.go @@ -0,0 +1,124 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package chunkenc + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/prometheus/prometheus/model/histogram" +) + +// testChunkSTHandling tests handling of start times in chunks. +// It uses 0-4 samples with timestamp 1000,2000,3000,4000 and monotonically +// increasing start times that are chosen from 0-(ts-500) for each sample. +// All combinations of start times are tested for each number of samples. +func testChunkSTHandling(t *testing.T, vt ValueType, chunkFactory func() Chunk) { + sampleAppend := func(app Appender, vt ValueType, st, ts int64, v float64) { + switch vt { + case ValFloat: + app.Append(st, ts, v) + case ValHistogram: + _, recoded, _, err := app.AppendHistogram(nil, st, ts, &histogram.Histogram{Sum: v, Count: uint64(v * 10)}, false) + require.NoError(t, err) + require.False(t, recoded) + case ValFloatHistogram: + _, recoded, _, err := app.AppendFloatHistogram(nil, st, ts, &histogram.FloatHistogram{Sum: v, Count: v * 10}, false) + require.NoError(t, err) + require.False(t, recoded) + default: + t.Fatalf("unsupported value type %v", vt) + } + } + + get := func(it Iterator, vt ValueType) (int64, int64, float64) { + switch vt { + case ValFloat: + ts, v := it.At() + return it.AtST(), ts, v + case ValHistogram: + ts, h := it.AtHistogram(nil) + return it.AtST(), ts, float64(h.Sum) + case ValFloatHistogram: + ts, fh := it.AtFloatHistogram(nil) + return it.AtST(), ts, fh.Sum + default: + t.Fatalf("unsupported value type %v", vt) + return 0, 0, 0 + } + } + + runCase := func(t *testing.T, samples []triple) { + chunk := chunkFactory() + app, err := chunk.Appender() + require.NoError(t, err) + for _, s := range samples { + sampleAppend(app, vt, s.st, s.t, s.v) + } + it := chunk.Iterator(nil) + for i, s := range samples { + require.Equal(t, vt, it.Next()) + st, ts, f := get(it, vt) + require.Equal(t, s.t, ts, "%d: 
timestamp mismatch", i) + require.Equal(t, s.st, st, "%d: start time mismatch", i) + require.InDelta(t, s.v, f, 1e-9, "%d: value mismatch", i) + } + require.Equal(t, ValNone, it.Next()) + require.NoError(t, it.Err()) + } + + t.Run("manual for debugging", func(t *testing.T) { + samples := []triple{ + {st: 0, t: 1000, v: 1.5}, + {st: 0, t: 2000, v: 2.5}, + {st: 0, t: 3000, v: 3.5}, + {st: 0, t: 4000, v: 4.5}, + } + runCase(t, samples) + }) + + stTimes := []int64{0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000} + for numberOfSamples := range 5 { + samples := make([]triple, numberOfSamples) + sampleSTidx := make([]int, numberOfSamples) + for { + for j := range numberOfSamples { + samples[j] = triple{ + st: stTimes[sampleSTidx[j]], + t: int64(1000 * (j + 1)), + v: float64(j) + 0.5, + } + } + + t.Run(fmt.Sprintf("%v", samples), func(t *testing.T) { + runCase(t, samples) + }) + + exhausted := true + for j := numberOfSamples - 1; j >= 0; j-- { + if sampleSTidx[j] < j+2 { + sampleSTidx[j]++ + exhausted = false + break + } + sampleSTidx[j] = 0 + } + if exhausted { + break + } + } + } +} diff --git a/tsdb/chunkenc/varbit_classic.go b/tsdb/chunkenc/varbit_classic.go new file mode 100644 index 0000000000..b8f293bc3f --- /dev/null +++ b/tsdb/chunkenc/varbit_classic.go @@ -0,0 +1,103 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package chunkenc + +// putClassicVarbitInt writes an int64 using varbit encoding with a bit bucketing +// as it was done for a long time in the initial XOR chunk format. +func putClassicVarbitInt(b *bstream, val int64) { + // Gorilla has a max resolution of seconds, Prometheus milliseconds. + // Thus we use higher value range steps with larger bit size. + // + // TODO(beorn7): This seems to needlessly jump to large bit + // sizes even for very small deviations from zero. Timestamp + // compression can probably benefit from some smaller bit + // buckets. See also what was done for histogram encoding in + // varbit.go. + switch { + case val == 0: + b.writeBit(zero) + case bitRange(val, 14): + b.writeByte(0b10<<6 | (uint8(val>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. + b.writeByte(uint8(val)) // Bottom 8 bits of dod. + case bitRange(val, 17): + b.writeBits(0b110, 3) + b.writeBits(uint64(val), 17) + case bitRange(val, 20): + b.writeBits(0b1110, 4) + b.writeBits(uint64(val), 20) + default: + b.writeBits(0b1111, 4) + b.writeBits(uint64(val), 64) + } +} + +// readClassicVarbitInt reads an int64 encoded with putClassicVarbitInt. +// This is copied into production code to make it inline. +func readClassicVarbitInt(b *bstreamReader) (int64, error) { + var d byte + // read delta-of-delta + for range 4 { + d <<= 1 + bit, err := b.readBitFast() + if err != nil { + bit, err = b.readBit() + if err != nil { + return 0, err + } + } + if bit == zero { + break + } + d |= 1 + } + var sz uint8 + var val int64 + switch d { + case 0b0: + // dod == 0 + case 0b10: + sz = 14 + case 0b110: + sz = 17 + case 0b1110: + sz = 20 + case 0b1111: + // Do not use fast because it's very unlikely it will succeed. 
+ bits, err := b.readBits(64) + if err != nil { + return 0, err + } + + val = int64(bits) + } + + if sz != 0 { + bits, err := b.readBitsFast(sz) + if err != nil { + bits, err = b.readBits(sz) + if err != nil { + return 0, err + } + } + + // Account for negative numbers, which come back as high unsigned numbers. + // See docs/bstream.md. + if bits > (1 << (sz - 1)) { + bits -= 1 << sz + } + val = int64(bits) + } + + return val, nil +} diff --git a/tsdb/chunkenc/varbit_classic_test.go b/tsdb/chunkenc/varbit_classic_test.go new file mode 100644 index 0000000000..f64d2ca9a9 --- /dev/null +++ b/tsdb/chunkenc/varbit_classic_test.go @@ -0,0 +1,57 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package chunkenc + +import ( + "math" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestClassicVarbitInt(t *testing.T) { + numbers := []int64{ + math.MinInt64, + -36028797018963968, -36028797018963967, + -16777216, -16777215, + -131072, -131071, + -2048, -2047, + -256, -255, + -32, -31, + -4, -3, + -1, 0, 1, + 4, 5, + 32, 33, + 256, 257, + 2048, 2049, + 131072, 131073, + 16777216, 16777217, + 36028797018963968, 36028797018963969, + math.MaxInt64, + } + + bs := bstream{} + + for _, n := range numbers { + putClassicVarbitInt(&bs, n) + } + + bsr := newBReader(bs.bytes()) + + for _, want := range numbers { + got, err := readClassicVarbitInt(&bsr) + require.NoError(t, err) + require.Equal(t, want, got) + } +} diff --git a/tsdb/chunkenc/xoroptst.go b/tsdb/chunkenc/xoroptst.go new file mode 100644 index 0000000000..6ac4122d73 --- /dev/null +++ b/tsdb/chunkenc/xoroptst.go @@ -0,0 +1,681 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package chunkenc + +import ( + "encoding/binary" + "math" + + "github.com/prometheus/prometheus/model/histogram" +) + +const ( + chunkSTHeaderSize = 1 + maxFirstSTChangeOn = 0x7F +) + +func writeHeaderFirstSTKnown(b []byte) { + b[0] = 0x80 +} + +func writeHeaderFirstSTChangeOn(b []byte, firstSTChangeOn uint16) { + // First bit indicates the initial ST value. 
+ // Here we save the sample number from where the first change occurs in the + // rest of the byte (7 bits) + + if firstSTChangeOn > maxFirstSTChangeOn { + // This should never happen, would cause corruption (ST already skipped but shouldn't). + return + } + b[0] |= uint8(firstSTChangeOn) +} + +func readSTHeader(b []byte) (firstSTKnown bool, firstSTChangeOn uint16) { + if b[0] == 0x00 { + return false, 0 + } + if b[0] == 0x80 { + return true, 0 + } + mask := byte(0x80) + if b[0]&mask != 0 { + firstSTKnown = true + } + mask = 0x7F + return firstSTKnown, uint16(b[0] & mask) +} + +// XorOptSTChunk holds encoded sample data: +// 2B(numSamples), 1B(stHeader), ?varint(st), varint(t), xor(v), ?varuint(stDelta), varuint(tDelta), xor(v), ?classicvarbitint(stDod), classicvarbitint(tDod), xor(v), ... +// stHeader: 1b(firstSTKnown), 7b(firstSTChangeOn). +type XorOptSTChunk struct { + b bstream +} + +// NewXOROptSTChunk returns a new chunk with XORv2 encoding. +func NewXOROptSTChunk() *XorOptSTChunk { + b := make([]byte, chunkHeaderSize+chunkSTHeaderSize, chunkAllocationSize) + return &XorOptSTChunk{b: bstream{stream: b, count: 0}} +} + +func (c *XorOptSTChunk) Reset(stream []byte) { + c.b.Reset(stream) +} + +// Encoding returns the encoding type. +func (*XorOptSTChunk) Encoding() Encoding { + return EncXOROptST +} + +// Bytes returns the underlying byte slice of the chunk. +func (c *XorOptSTChunk) Bytes() []byte { + return c.b.bytes() +} + +// NumSamples returns the number of samples in the chunk. +func (c *XorOptSTChunk) NumSamples() int { + return int(binary.BigEndian.Uint16(c.Bytes())) +} + +// Compact implements the Chunk interface. +func (c *XorOptSTChunk) Compact() { + if l := len(c.b.stream); cap(c.b.stream) > l+chunkCompactCapacityThreshold { + buf := make([]byte, l) + copy(buf, c.b.stream) + c.b.stream = buf + } +} + +// Appender implements the Chunk interface. 
+// It is not valid to call Appender() multiple times concurrently or to use multiple +// Appenders on the same chunk. +func (c *XorOptSTChunk) Appender() (Appender, error) { + if len(c.b.stream) == chunkHeaderSize+chunkSTHeaderSize { // Avoid allocating an Iterator when chunk is empty. + return &xorOptSTAppender{b: &c.b, t: math.MinInt64, leading: 0xff}, nil + } + it := c.iterator(nil) + + // To get an appender we must know the state it would have if we had + // appended all existing data from scratch. + // We iterate through the end and populate via the iterator's state. + for it.Next() != ValNone { + } + if err := it.Err(); err != nil { + return nil, err + } + + a := &xorOptSTAppender{ + b: &c.b, + st: it.st, + t: it.t, + v: it.val, + stDelta: it.stDelta, + tDelta: it.tDelta, + leading: it.leading, + trailing: it.trailing, + + numTotal: it.numTotal, + firstSTKnown: it.firstSTKnown, + firstSTChangeOn: it.firstSTChangeOn, + } + return a, nil +} + +func (c *XorOptSTChunk) iterator(it Iterator) *xorOptSTtIterator { + xorIter, ok := it.(*xorOptSTtIterator) + if !ok { + xorIter = &xorOptSTtIterator{} + } + + xorIter.Reset(c.b.bytes()) + return xorIter +} + +// Iterator implements the Chunk interface. +// Iterator() must not be called concurrently with any modifications to the chunk, +// but after it returns you can use an Iterator concurrently with an Appender or +// other Iterators. 
+func (c *XorOptSTChunk) Iterator(it Iterator) Iterator { + return c.iterator(it) +} + +type xorOptSTAppender struct { + b *bstream + numTotal uint16 + + firstSTKnown bool + firstSTChangeOn uint16 + + st, t int64 + v float64 + stDelta int64 + tDelta uint64 + + leading uint8 + trailing uint8 +} + +func (a *xorOptSTAppender) writeVDelta(v float64) { + xorWrite(a.b, v, a.v, &a.leading, &a.trailing) +} + +func (*xorOptSTAppender) AppendHistogram(*HistogramAppender, int64, int64, *histogram.Histogram, bool) (Chunk, bool, Appender, error) { + panic("appended a histogram sample to a float chunk") +} + +func (*xorOptSTAppender) AppendFloatHistogram(*FloatHistogramAppender, int64, int64, *histogram.FloatHistogram, bool) (Chunk, bool, Appender, error) { + panic("appended a float histogram sample to a float chunk") +} + +const ( + read0State uint8 = iota + read1State + readDoDMaybeSTState + readDoDNoSTState + readDoDState + + eofState uint8 = 1<<8 - 1 +) + +type xorOptSTtIterator struct { + br bstreamReader + numTotal uint16 + + firstSTKnown bool + firstSTChangeOn uint16 + + state uint8 + numRead uint16 + + st, t int64 + val float64 + + leading uint8 + trailing uint8 + + stDelta int64 + tDelta uint64 + err error +} + +func (it *xorOptSTtIterator) Seek(t int64) ValueType { + if it.state == eofState { + return ValNone + } + + for t > it.t || it.state == read0State { + if it.Next() == ValNone { + return ValNone + } + } + return ValFloat +} + +func (it *xorOptSTtIterator) At() (int64, float64) { + return it.t, it.val +} + +func (*xorOptSTtIterator) AtHistogram(*histogram.Histogram) (int64, *histogram.Histogram) { + panic("cannot call xorIterator.AtHistogram") +} + +func (*xorOptSTtIterator) AtFloatHistogram(*histogram.FloatHistogram) (int64, *histogram.FloatHistogram) { + panic("cannot call xorIterator.AtFloatHistogram") +} + +func (it *xorOptSTtIterator) AtT() int64 { + return it.t +} + +func (it *xorOptSTtIterator) AtST() int64 { + return it.st +} + +func (it 
*xorOptSTtIterator) Err() error { + return it.err +} + +func (it *xorOptSTtIterator) Reset(b []byte) { + // We skip initial headers for actual samples. + it.br = newBReader(b[chunkHeaderSize+chunkSTHeaderSize:]) + it.numTotal = binary.BigEndian.Uint16(b) + it.firstSTKnown, it.firstSTChangeOn = readSTHeader(b[chunkHeaderSize:]) + it.numRead = 0 + it.st = 0 + it.t = 0 + it.val = 0 + it.leading = 0 + it.trailing = 0 + it.stDelta = 0 + it.tDelta = 0 + it.err = nil + it.state = read0State + if it.numRead >= it.numTotal { + it.state = eofState + } +} + +func (a *xorOptSTAppender) Append(st, t int64, v float64) { + if st == 0 && a.numTotal != maxFirstSTChangeOn && a.firstSTChangeOn == 0 && !a.firstSTKnown { + // Fast path for no ST usage at all. + // Same as classic XOR chunk appender. + + var tDelta uint64 + + switch a.numTotal { + case 0: + buf := make([]byte, binary.MaxVarintLen64) + for _, b := range buf[:binary.PutVarint(buf, t)] { + a.b.writeByte(b) + } + a.b.writeBits(math.Float64bits(v), 64) + case 1: + buf := make([]byte, binary.MaxVarintLen64) + tDelta = uint64(t - a.t) + for _, b := range buf[:binary.PutUvarint(buf, tDelta)] { + a.b.writeByte(b) + } + a.writeVDelta(v) + default: + tDelta = uint64(t - a.t) + dod := int64(tDelta - a.tDelta) + + // Gorilla has a max resolution of seconds, Prometheus milliseconds. + // Thus we use higher value range steps with larger bit size. + // + // TODO(beorn7): This seems to needlessly jump to large bit + // sizes even for very small deviations from zero. Timestamp + // compression can probably benefit from some smaller bit + // buckets. See also what was done for histogram encoding in + // varbit.go. + switch { + case dod == 0: + a.b.writeBit(zero) + case bitRange(dod, 14): + a.b.writeByte(0b10<<6 | (uint8(dod>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. + a.b.writeByte(uint8(dod)) // Bottom 8 bits of dod. 
+ case bitRange(dod, 17): + a.b.writeBits(0b110, 3) + a.b.writeBits(uint64(dod), 17) + case bitRange(dod, 20): + a.b.writeBits(0b1110, 4) + a.b.writeBits(uint64(dod), 20) + default: + a.b.writeBits(0b1111, 4) + a.b.writeBits(uint64(dod), 64) + } + + a.writeVDelta(v) + } + + a.t = t + a.v = v + a.tDelta = tDelta + a.numTotal++ + binary.BigEndian.PutUint16(a.b.bytes(), a.numTotal) + + return + } + + var ( + stDelta int64 + tDelta uint64 + stChanged bool + ) + + // Slow path for ST usage. + switch a.numTotal { + case 0: + buf := make([]byte, binary.MaxVarintLen64) + + for _, b := range buf[:binary.PutVarint(buf, st)] { + a.b.writeByte(b) + } + writeHeaderFirstSTKnown(a.b.bytes()[chunkHeaderSize:]) + a.firstSTKnown = true + + for _, b := range buf[:binary.PutVarint(buf, t)] { + a.b.writeByte(b) + } + a.b.writeBits(math.Float64bits(v), 64) + case 1: + buf := make([]byte, binary.MaxVarintLen64) + stDelta = st - a.st + if stDelta != 0 { + stChanged = true + for _, b := range buf[:binary.PutVarint(buf, stDelta)] { + a.b.writeByte(b) + } + } + + tDelta = uint64(t - a.t) + for _, b := range buf[:binary.PutUvarint(buf, tDelta)] { + a.b.writeByte(b) + } + a.writeVDelta(v) + default: + if a.firstSTChangeOn == 0 && a.numTotal == maxFirstSTChangeOn { + // We are at the 127th sample. firstSTChangeOn can only fit 7 bits due to a + // single byte header constrain, which is fine, given typical 120 sample size. + a.firstSTChangeOn = a.numTotal + writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], a.firstSTChangeOn) + } + + stDelta = st - a.st + sdod := stDelta - a.stDelta + if sdod != 0 || a.firstSTChangeOn != 0 { + stChanged = true + // Gorilla has a max resolution of seconds, Prometheus milliseconds. + // Thus we use higher value range steps with larger bit size. + // + // TODO(beorn7): This seems to needlessly jump to large bit + // sizes even for very small deviations from zero. Timestamp + // compression can probably benefit from some smaller bit + // buckets. 
See also what was done for histogram encoding in + // varbit.go. + switch { + case sdod == 0: + a.b.writeBit(zero) + case bitRange(sdod, 14): + a.b.writeByte(0b10<<6 | (uint8(sdod>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. + a.b.writeByte(uint8(sdod)) // Bottom 8 bits of dod. + case bitRange(sdod, 17): + a.b.writeBits(0b110, 3) + a.b.writeBits(uint64(sdod), 17) + case bitRange(sdod, 20): + a.b.writeBits(0b1110, 4) + a.b.writeBits(uint64(sdod), 20) + default: + a.b.writeBits(0b1111, 4) + a.b.writeBits(uint64(sdod), 64) + } + // putClassicVarbitInt(a.b, sdod) + } + + tDelta = uint64(t - a.t) + dod := int64(tDelta - a.tDelta) + + // Gorilla has a max resolution of seconds, Prometheus milliseconds. + // Thus we use higher value range steps with larger bit size. + // + // TODO(beorn7): This seems to needlessly jump to large bit + // sizes even for very small deviations from zero. Timestamp + // compression can probably benefit from some smaller bit + // buckets. See also what was done for histogram encoding in + // varbit.go. + switch { + case dod == 0: + a.b.writeBit(zero) + case bitRange(dod, 14): + a.b.writeByte(0b10<<6 | (uint8(dod>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. + a.b.writeByte(uint8(dod)) // Bottom 8 bits of dod. + case bitRange(dod, 17): + a.b.writeBits(0b110, 3) + a.b.writeBits(uint64(dod), 17) + case bitRange(dod, 20): + a.b.writeBits(0b1110, 4) + a.b.writeBits(uint64(dod), 20) + default: + a.b.writeBits(0b1111, 4) + a.b.writeBits(uint64(dod), 64) + } + + a.writeVDelta(v) + } + + a.st = st + a.t = t + a.v = v + a.tDelta = tDelta + a.stDelta = stDelta + + a.numTotal++ + binary.BigEndian.PutUint16(a.b.bytes(), a.numTotal) + + // firstSTChangeOn == 0 indicates that we have one ST value (zero or not) + // for all STs in the appends until now. If we see a first "update" + // we are saving this number in the header and continue tracking all DoD (including zeros). 
+ if a.firstSTChangeOn == 0 && stChanged { + a.firstSTChangeOn = a.numTotal - 1 + writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], a.firstSTChangeOn) + } +} + +func (it *xorOptSTtIterator) retErr(err error) ValueType { + it.err = err + it.state = eofState + return ValNone +} + +func (it *xorOptSTtIterator) Next() ValueType { + switch it.state { + case eofState: + return ValNone + case read0State: + it.state++ + + // Optional ST read. + if it.firstSTKnown { + st, err := binary.ReadVarint(&it.br) + if err != nil { + return it.retErr(err) + } + it.st = st + } + + // TS. + t, err := binary.ReadVarint(&it.br) + if err != nil { + return it.retErr(err) + } + // Value. + v, err := it.br.readBits(64) + if err != nil { + return it.retErr(err) + } + + it.t = t + it.val = math.Float64frombits(v) + + // State EOF check. + it.numRead++ + if it.numRead >= it.numTotal { + it.state = eofState + } + return ValFloat + case read1State: + it.state++ + switch it.firstSTChangeOn { + case 0: + // This means we have same (zero or non-zero) ST value for the rest of + // chunk. We can simply use ~classic XOR chunk iterations. + it.state = readDoDNoSTState + case 1: + // We got early ST change on the second sample, likely delta. + // Continue ST rich flow from the next iteration. + it.state = readDoDState + + stDelta, err := binary.ReadVarint(&it.br) + if err != nil { + return it.retErr(err) + } + it.stDelta = stDelta + it.st += it.stDelta + } + // TS. + tDelta, err := binary.ReadUvarint(&it.br) + if err != nil { + return it.retErr(err) + } + it.tDelta = tDelta + it.t += int64(it.tDelta) + + // Value. + if err := xorRead(&it.br, &it.val, &it.leading, &it.trailing); err != nil { + return it.retErr(err) + } + + // State EOF check. + it.numRead++ + if it.numRead >= it.numTotal { + it.state = eofState + } + return ValFloat + case readDoDMaybeSTState: + if it.firstSTChangeOn == it.numRead { + // ST changes from this iteration, change state for future. 
+ it.state = readDoDState + return it.dodNext() + } + return it.dodNoSTNext() + case readDoDState: + return it.dodNext() + case readDoDNoSTState: + return it.dodNoSTNext() + default: + panic("xorOptSTtIterator: broken machine state") + } +} + +func (it *xorOptSTtIterator) dodNext() ValueType { + // Inlined readClassicVarbitInt(&it.br) + var d byte + // read delta-of-delta + for range 4 { + d <<= 1 + bit, err := it.br.readBitFast() + if err != nil { + bit, err = it.br.readBit() + if err != nil { + return it.retErr(err) + } + } + if bit == zero { + break + } + d |= 1 + } + var sz uint8 + var sdod int64 + switch d { + case 0b0: + // dod == 0 + case 0b10: + sz = 14 + case 0b110: + sz = 17 + case 0b1110: + sz = 20 + case 0b1111: + // Do not use fast because it's very unlikely it will succeed. + bits, err := it.br.readBits(64) + if err != nil { + return it.retErr(err) + } + + sdod = int64(bits) + } + + if sz != 0 { + bits, err := it.br.readBitsFast(sz) + if err != nil { + bits, err = it.br.readBits(sz) + if err != nil { + return it.retErr(err) + } + } + + // Account for negative numbers, which come back as high unsigned numbers. + // See docs/bstream.md. + if bits > (1 << (sz - 1)) { + bits -= 1 << sz + } + sdod = int64(bits) + } + + it.stDelta += sdod + it.st += it.stDelta + return it.dodNoSTNext() +} + +func (it *xorOptSTtIterator) dodNoSTNext() ValueType { + // Inlined readClassicVarbitInt(&it.br) + var d byte + // read delta-of-delta + for range 4 { + d <<= 1 + bit, err := it.br.readBitFast() + if err != nil { + bit, err = it.br.readBit() + if err != nil { + return it.retErr(err) + } + } + if bit == zero { + break + } + d |= 1 + } + var sz uint8 + var dod int64 + switch d { + case 0b0: + // dod == 0 + case 0b10: + sz = 14 + case 0b110: + sz = 17 + case 0b1110: + sz = 20 + case 0b1111: + // Do not use fast because it's very unlikely it will succeed. 
+ bits, err := it.br.readBits(64) + if err != nil { + return it.retErr(err) + } + + dod = int64(bits) + } + + if sz != 0 { + bits, err := it.br.readBitsFast(sz) + if err != nil { + bits, err = it.br.readBits(sz) + if err != nil { + return it.retErr(err) + } + } + + // Account for negative numbers, which come back as high unsigned numbers. + // See docs/bstream.md. + if bits > (1 << (sz - 1)) { + bits -= 1 << sz + } + dod = int64(bits) + } + + it.tDelta = uint64(int64(it.tDelta) + dod) + it.t += int64(it.tDelta) + // Value. + if err := xorRead(&it.br, &it.val, &it.leading, &it.trailing); err != nil { + return it.retErr(err) + } + + // State EOF check. + it.numRead++ + if it.numRead >= it.numTotal { + it.state = eofState + } + return ValFloat +} diff --git a/tsdb/chunkenc/xoroptst_test.go b/tsdb/chunkenc/xoroptst_test.go new file mode 100644 index 0000000000..fe41b751fc --- /dev/null +++ b/tsdb/chunkenc/xoroptst_test.go @@ -0,0 +1,108 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package chunkenc + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestXorOptSTChunk(t *testing.T) { + testChunkSTHandling(t, ValFloat, func() Chunk { + return NewXOROptSTChunk() + }, + ) +} + +func TestXorOptSTChunk_MoreThan127Samples(t *testing.T) { + const afterMax = maxFirstSTChangeOn + 3 + t.Run("zero ST", func(t *testing.T) { + chunk := NewXOROptSTChunk() + app, err := chunk.Appender() + require.NoError(t, err) + for i := range afterMax { + app.Append(0, int64(i*10+1), float64(i)*1.5) + } + + it := chunk.Iterator(nil) + for i := range afterMax { + require.Equal(t, ValFloat, it.Next()) + st := it.AtST() + ts, v := it.At() + require.Equal(t, int64(0), st) + require.Equal(t, int64(i*10+1), ts) + require.Equal(t, float64(i)*1.5, v) + } + + require.Equal(t, ValNone, it.Next()) + require.NoError(t, it.Err()) + }) + + t.Run("non-zero ST after 127", func(t *testing.T) { + chunk := NewXOROptSTChunk() + app, err := chunk.Appender() + require.NoError(t, err) + for i := range afterMax { + st := int64(0) + if i == afterMax-1 { + st = int64((afterMax - 1) * 10) + } + app.Append(st, int64(i*10+1), float64(i)*1.5) + } + + it := chunk.Iterator(nil) + for i := range afterMax { + require.Equal(t, ValFloat, it.Next()) + st := it.AtST() + ts, v := it.At() + if i == afterMax-1 { + require.Equal(t, int64((afterMax-1)*10), st) + } else { + require.Equal(t, int64(0), st) + } + require.Equal(t, int64(i*10+1), ts) + require.Equal(t, float64(i)*1.5, v) + } + + require.Equal(t, ValNone, it.Next()) + require.NoError(t, it.Err()) + }) +} + +func TestXorOptSTChunk_STHeader(t *testing.T) { + b := make([]byte, 1) + writeHeaderFirstSTKnown(b) + firstSTKnown, firstSTChangeOn := readSTHeader(b) + require.True(t, firstSTKnown) + require.Equal(t, uint16(0), firstSTChangeOn) + + b = make([]byte, 1) + firstSTKnown, firstSTChangeOn = readSTHeader(b) + require.False(t, firstSTKnown) + require.Equal(t, uint16(0), firstSTChangeOn) + + b = make([]byte, 1) + 
writeHeaderFirstSTChangeOn(b, 1) + firstSTKnown, firstSTChangeOn = readSTHeader(b) + require.False(t, firstSTKnown) + require.Equal(t, uint16(1), firstSTChangeOn) + + b = make([]byte, 1) + writeHeaderFirstSTKnown(b) + writeHeaderFirstSTChangeOn(b, 119) + firstSTKnown, firstSTChangeOn = readSTHeader(b) + require.True(t, firstSTKnown) + require.Equal(t, uint16(119), firstSTChangeOn) +} From 8c14b3c99c4740bdf617e1dd6f25632d5ca1c3dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 22 Jan 2026 14:03:53 +0100 Subject: [PATCH 02/73] optimize code path and layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/xoroptst.go | 66 +++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/tsdb/chunkenc/xoroptst.go b/tsdb/chunkenc/xoroptst.go index 6ac4122d73..5271398bd1 100644 --- a/tsdb/chunkenc/xoroptst.go +++ b/tsdb/chunkenc/xoroptst.go @@ -151,19 +151,16 @@ func (c *XorOptSTChunk) Iterator(it Iterator) Iterator { } type xorOptSTAppender struct { - b *bstream - numTotal uint16 - - firstSTKnown bool + b *bstream + numTotal uint16 firstSTChangeOn uint16 - - st, t int64 - v float64 - stDelta int64 - tDelta uint64 - - leading uint8 - trailing uint8 + leading uint8 + trailing uint8 + firstSTKnown bool + st, t int64 + v float64 + stDelta int64 + tDelta uint64 } func (a *xorOptSTAppender) writeVDelta(v float64) { @@ -329,9 +326,8 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { } var ( - stDelta int64 - tDelta uint64 - stChanged bool + stDelta int64 + tDelta uint64 ) // Slow path for ST usage. 
@@ -342,8 +338,8 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { for _, b := range buf[:binary.PutVarint(buf, st)] { a.b.writeByte(b) } - writeHeaderFirstSTKnown(a.b.bytes()[chunkHeaderSize:]) a.firstSTKnown = true + writeHeaderFirstSTKnown(a.b.bytes()[chunkHeaderSize:]) for _, b := range buf[:binary.PutVarint(buf, t)] { a.b.writeByte(b) @@ -353,7 +349,8 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { buf := make([]byte, binary.MaxVarintLen64) stDelta = st - a.st if stDelta != 0 { - stChanged = true + a.firstSTChangeOn = 1 + writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], 1) for _, b := range buf[:binary.PutVarint(buf, stDelta)] { a.b.writeByte(b) } @@ -365,17 +362,13 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { } a.writeVDelta(v) default: - if a.firstSTChangeOn == 0 && a.numTotal == maxFirstSTChangeOn { - // We are at the 127th sample. firstSTChangeOn can only fit 7 bits due to a - // single byte header constrain, which is fine, given typical 120 sample size. - a.firstSTChangeOn = a.numTotal - writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], a.firstSTChangeOn) - } - stDelta = st - a.st sdod := stDelta - a.stDelta - if sdod != 0 || a.firstSTChangeOn != 0 { - stChanged = true + if sdod != 0 { + if a.firstSTChangeOn == 0 { + a.firstSTChangeOn = a.numTotal + writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], a.numTotal) + } // Gorilla has a max resolution of seconds, Prometheus milliseconds. // Thus we use higher value range steps with larger bit size. // @@ -401,6 +394,19 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { a.b.writeBits(uint64(sdod), 64) } // putClassicVarbitInt(a.b, sdod) + } else { + if a.firstSTChangeOn == 0 { + if a.numTotal == maxFirstSTChangeOn { + // We are at the 127th sample. firstSTChangeOn can only fit + // 7 bits due to a single byte header constrain, which is fine, + // given typical 120 sample size. 
+ a.firstSTChangeOn = maxFirstSTChangeOn + writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], maxFirstSTChangeOn) + a.b.writeBit(zero) + } + } else { + a.b.writeBit(zero) + } } tDelta = uint64(t - a.t) @@ -442,14 +448,6 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { a.numTotal++ binary.BigEndian.PutUint16(a.b.bytes(), a.numTotal) - - // firstSTChangeOn == 0 indicates that we have one ST value (zero or not) - // for all STs in the appends until now. If we see a first "update" - // we are saving this number in the header and continue tracking all DoD (including zeros). - if a.firstSTChangeOn == 0 && stChanged { - a.firstSTChangeOn = a.numTotal - 1 - writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], a.firstSTChangeOn) - } } func (it *xorOptSTtIterator) retErr(err error) ValueType { From 091379dc44769a53203e33425ebedd0245c74e9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 22 Jan 2026 14:17:51 +0100 Subject: [PATCH 03/73] make new format usable in head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/chunk.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tsdb/chunkenc/chunk.go b/tsdb/chunkenc/chunk.go index 71f38e7a7e..6fb8de2a77 100644 --- a/tsdb/chunkenc/chunk.go +++ b/tsdb/chunkenc/chunk.go @@ -392,6 +392,8 @@ func FromData(e Encoding, d []byte) (Chunk, error) { return &HistogramChunk{b: bstream{count: 0, stream: d}}, nil case EncFloatHistogram: return &FloatHistogramChunk{b: bstream{count: 0, stream: d}}, nil + case EncXOROptST: + return &XorOptSTChunk{b: bstream{count: 0, stream: d}}, nil } return nil, fmt.Errorf("invalid chunk encoding %q", e) } @@ -405,6 +407,8 @@ func NewEmptyChunk(e Encoding) (Chunk, error) { return NewHistogramChunk(), nil case EncFloatHistogram: return NewFloatHistogramChunk(), nil + case EncXOROptST: + return NewXOROptSTChunk(), nil } return nil, fmt.Errorf("invalid chunk 
encoding %q", e) } From ca530d7f85ca38c1dfecb16e36dd72718ea552e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 22 Jan 2026 18:08:41 +0100 Subject: [PATCH 04/73] fix issue with seeking to last sample again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/chunk_test.go | 4 ++++ tsdb/chunkenc/xoroptst.go | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tsdb/chunkenc/chunk_test.go b/tsdb/chunkenc/chunk_test.go index 92fa3cab38..1717300288 100644 --- a/tsdb/chunkenc/chunk_test.go +++ b/tsdb/chunkenc/chunk_test.go @@ -110,10 +110,14 @@ func testChunk(t *testing.T, c Chunk, supportsST bool) { ts, v = it3.At() res3 = append(res3, triple{st: it3.AtST(), t: ts, v: v}) + lastTs := ts for it3.Next() == ValFloat { ts, v := it3.At() + lastTs = ts res3 = append(res3, triple{st: it3.AtST(), t: ts, v: v}) } + // Seeking to last timestamp should work and it is a no-op. 
+ require.Equal(t, ValFloat, it3.Seek(lastTs)) require.NoError(t, it3.Err()) require.Equal(t, exp[mid:], res3) require.Equal(t, ValNone, it3.Seek(exp[len(exp)-1].t+1)) diff --git a/tsdb/chunkenc/xoroptst.go b/tsdb/chunkenc/xoroptst.go index 5271398bd1..e61882fd84 100644 --- a/tsdb/chunkenc/xoroptst.go +++ b/tsdb/chunkenc/xoroptst.go @@ -207,7 +207,7 @@ type xorOptSTtIterator struct { } func (it *xorOptSTtIterator) Seek(t int64) ValueType { - if it.state == eofState { + if it.err != nil { return ValNone } From e55f09f460e121976274220a37cae49221704bb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Mon, 26 Jan 2026 10:27:24 +0100 Subject: [PATCH 05/73] fix iterator benchmark for chunks not supporting ST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/benchmark_test.go | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tsdb/chunkenc/benchmark_test.go b/tsdb/chunkenc/benchmark_test.go index cc69725858..b628be43c8 100644 --- a/tsdb/chunkenc/benchmark_test.go +++ b/tsdb/chunkenc/benchmark_test.go @@ -34,8 +34,9 @@ type sampleCase struct { } type fmtCase struct { - name string - newChunkFn func() Chunk + name string + newChunkFn func() Chunk + stUnsupported bool } func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampleCase)) { @@ -315,7 +316,16 @@ func BenchmarkIterator(b *testing.B) { if err := it.Err(); err != nil && !errors.Is(err, io.EOF) { require.NoError(b, err) } - if diff := cmp.Diff(s.samples, got, cmp.AllowUnexported(triple{}), cmp.Comparer(floatEquals)); diff != "" { + expectedSamples := s.samples + if f.stUnsupported { + // If the format does not support ST, zero them out for comparison. 
+ expectedSamples = make([]triple, len(s.samples)) + copy(expectedSamples, s.samples) + for i := range s.samples { + expectedSamples[i].st = 0 + } + } + if diff := cmp.Diff(expectedSamples, got, cmp.AllowUnexported(triple{}), cmp.Comparer(floatEquals)); diff != "" { b.Fatalf("mismatch (-want +got):\n%s", diff) } From 5d0f59d8fe2086ebbb288656588215d387925e37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Mon, 26 Jan 2026 11:07:52 +0100 Subject: [PATCH 06/73] reduce footprint of the xoroptst chunk iterator object MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It was 80 bytes with a lot of padding compared to the 56 bytes of the original xor chunk iterator. Made it 64 bytes, tightly packed. Signed-off-by: György Krajcsovits --- tsdb/chunkenc/st_helper_test.go | 2 +- tsdb/chunkenc/xoroptst.go | 231 ++++++++++++-------------------- tsdb/chunkenc/xoroptst_test.go | 8 +- 3 files changed, 90 insertions(+), 151 deletions(-) diff --git a/tsdb/chunkenc/st_helper_test.go b/tsdb/chunkenc/st_helper_test.go index 7f657a4293..662866de93 100644 --- a/tsdb/chunkenc/st_helper_test.go +++ b/tsdb/chunkenc/st_helper_test.go @@ -70,7 +70,7 @@ func testChunkSTHandling(t *testing.T, vt ValueType, chunkFactory func() Chunk) } it := chunk.Iterator(nil) for i, s := range samples { - require.Equal(t, vt, it.Next()) + require.Equal(t, vt, it.Next(), "%d: value type mismatch", i) st, ts, f := get(it, vt) require.Equal(t, s.t, ts, "%d: timestamp mismatch", i) require.Equal(t, s.st, st, "%d: start time mismatch", i) diff --git a/tsdb/chunkenc/xoroptst.go b/tsdb/chunkenc/xoroptst.go index e61882fd84..6f27e69f66 100644 --- a/tsdb/chunkenc/xoroptst.go +++ b/tsdb/chunkenc/xoroptst.go @@ -41,7 +41,7 @@ func writeHeaderFirstSTChangeOn(b []byte, firstSTChangeOn uint16) { b[0] |= uint8(firstSTChangeOn) } -func readSTHeader(b []byte) (firstSTKnown bool, firstSTChangeOn uint16) { +func readSTHeader(b []byte) (firstSTKnown bool, 
firstSTChangeOn uint8) { if b[0] == 0x00 { return false, 0 } @@ -53,7 +53,7 @@ func readSTHeader(b []byte) (firstSTKnown bool, firstSTChangeOn uint16) { firstSTKnown = true } mask = 0x7F - return firstSTKnown, uint16(b[0] & mask) + return firstSTKnown, b[0] & mask } // XorOptSTChunk holds encoded sample data: @@ -127,7 +127,7 @@ func (c *XorOptSTChunk) Appender() (Appender, error) { numTotal: it.numTotal, firstSTKnown: it.firstSTKnown, - firstSTChangeOn: it.firstSTChangeOn, + firstSTChangeOn: uint16(it.firstSTChangeOn), } return a, nil } @@ -175,32 +175,20 @@ func (*xorOptSTAppender) AppendFloatHistogram(*FloatHistogramAppender, int64, in panic("appended a float histogram sample to a float chunk") } -const ( - read0State uint8 = iota - read1State - readDoDMaybeSTState - readDoDNoSTState - readDoDState - - eofState uint8 = 1<<8 - 1 -) - type xorOptSTtIterator struct { br bstreamReader numTotal uint16 firstSTKnown bool - firstSTChangeOn uint16 + firstSTChangeOn uint8 + leading uint8 + trailing uint8 - state uint8 numRead uint16 st, t int64 val float64 - leading uint8 - trailing uint8 - stDelta int64 tDelta uint64 err error @@ -211,7 +199,7 @@ func (it *xorOptSTtIterator) Seek(t int64) ValueType { return ValNone } - for t > it.t || it.state == read0State { + for t > it.t || it.numRead == 0 { if it.Next() == ValNone { return ValNone } @@ -257,10 +245,6 @@ func (it *xorOptSTtIterator) Reset(b []byte) { it.stDelta = 0 it.tDelta = 0 it.err = nil - it.state = read0State - if it.numRead >= it.numTotal { - it.state = eofState - } } func (a *xorOptSTAppender) Append(st, t int64, v float64) { @@ -452,17 +436,15 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { func (it *xorOptSTtIterator) retErr(err error) ValueType { it.err = err - it.state = eofState return ValNone } func (it *xorOptSTtIterator) Next() ValueType { - switch it.state { - case eofState: + if it.err != nil || it.numRead == it.numTotal { return ValNone - case read0State: - it.state++ + } + if 
it.numRead == 0 { // Optional ST read. if it.firstSTKnown { st, err := binary.ReadVarint(&it.br) @@ -471,39 +453,24 @@ func (it *xorOptSTtIterator) Next() ValueType { } it.st = st } - - // TS. t, err := binary.ReadVarint(&it.br) if err != nil { return it.retErr(err) } - // Value. v, err := it.br.readBits(64) if err != nil { return it.retErr(err) } - it.t = t it.val = math.Float64frombits(v) - // State EOF check. it.numRead++ - if it.numRead >= it.numTotal { - it.state = eofState - } return ValFloat - case read1State: - it.state++ - switch it.firstSTChangeOn { - case 0: - // This means we have same (zero or non-zero) ST value for the rest of - // chunk. We can simply use ~classic XOR chunk iterations. - it.state = readDoDNoSTState - case 1: - // We got early ST change on the second sample, likely delta. - // Continue ST rich flow from the next iteration. - it.state = readDoDState + } + if it.numRead == 1 { + // Optional ST delta read. + if it.firstSTChangeOn == 1 { stDelta, err := binary.ReadVarint(&it.br) if err != nil { return it.retErr(err) @@ -511,7 +478,6 @@ func (it *xorOptSTtIterator) Next() ValueType { it.stDelta = stDelta it.st += it.stDelta } - // TS. tDelta, err := binary.ReadUvarint(&it.br) if err != nil { return it.retErr(err) @@ -519,35 +485,69 @@ func (it *xorOptSTtIterator) Next() ValueType { it.tDelta = tDelta it.t += int64(it.tDelta) - // Value. - if err := xorRead(&it.br, &it.val, &it.leading, &it.trailing); err != nil { - return it.retErr(err) - } - - // State EOF check. - it.numRead++ - if it.numRead >= it.numTotal { - it.state = eofState - } - return ValFloat - case readDoDMaybeSTState: - if it.firstSTChangeOn == it.numRead { - // ST changes from this iteration, change state for future. 
- it.state = readDoDState - return it.dodNext() - } - return it.dodNoSTNext() - case readDoDState: - return it.dodNext() - case readDoDNoSTState: - return it.dodNoSTNext() - default: - panic("xorOptSTtIterator: broken machine state") + return it.readValue() + } + + if it.firstSTChangeOn > 0 && it.numRead >= uint16(it.firstSTChangeOn) { + // Inlined readClassicVarbitInt(&it.br) + var d byte + // read delta-of-delta + for range 4 { + d <<= 1 + bit, err := it.br.readBitFast() + if err != nil { + bit, err = it.br.readBit() + if err != nil { + return it.retErr(err) + } + } + if bit == zero { + break + } + d |= 1 + } + var sz uint8 + var sdod int64 + switch d { + case 0b0: + // dod == 0 + case 0b10: + sz = 14 + case 0b110: + sz = 17 + case 0b1110: + sz = 20 + case 0b1111: + // Do not use fast because it's very unlikely it will succeed. + bits, err := it.br.readBits(64) + if err != nil { + return it.retErr(err) + } + + sdod = int64(bits) + } + + if sz != 0 { + bits, err := it.br.readBitsFast(sz) + if err != nil { + bits, err = it.br.readBits(sz) + if err != nil { + return it.retErr(err) + } + } + + // Account for negative numbers, which come back as high unsigned numbers. + // See docs/bstream.md. + if bits > (1 << (sz - 1)) { + bits -= 1 << sz + } + sdod = int64(bits) + } + + it.stDelta += sdod + it.st += it.stDelta } -} -func (it *xorOptSTtIterator) dodNext() ValueType { - // Inlined readClassicVarbitInt(&it.br) var d byte // read delta-of-delta for range 4 { @@ -555,71 +555,10 @@ func (it *xorOptSTtIterator) dodNext() ValueType { bit, err := it.br.readBitFast() if err != nil { bit, err = it.br.readBit() - if err != nil { - return it.retErr(err) - } } - if bit == zero { - break - } - d |= 1 - } - var sz uint8 - var sdod int64 - switch d { - case 0b0: - // dod == 0 - case 0b10: - sz = 14 - case 0b110: - sz = 17 - case 0b1110: - sz = 20 - case 0b1111: - // Do not use fast because it's very unlikely it will succeed. 
- bits, err := it.br.readBits(64) if err != nil { return it.retErr(err) } - - sdod = int64(bits) - } - - if sz != 0 { - bits, err := it.br.readBitsFast(sz) - if err != nil { - bits, err = it.br.readBits(sz) - if err != nil { - return it.retErr(err) - } - } - - // Account for negative numbers, which come back as high unsigned numbers. - // See docs/bstream.md. - if bits > (1 << (sz - 1)) { - bits -= 1 << sz - } - sdod = int64(bits) - } - - it.stDelta += sdod - it.st += it.stDelta - return it.dodNoSTNext() -} - -func (it *xorOptSTtIterator) dodNoSTNext() ValueType { - // Inlined readClassicVarbitInt(&it.br) - var d byte - // read delta-of-delta - for range 4 { - d <<= 1 - bit, err := it.br.readBitFast() - if err != nil { - bit, err = it.br.readBit() - if err != nil { - return it.retErr(err) - } - } if bit == zero { break } @@ -650,9 +589,9 @@ func (it *xorOptSTtIterator) dodNoSTNext() ValueType { bits, err := it.br.readBitsFast(sz) if err != nil { bits, err = it.br.readBits(sz) - if err != nil { - return it.retErr(err) - } + } + if err != nil { + return it.retErr(err) } // Account for negative numbers, which come back as high unsigned numbers. @@ -665,15 +604,15 @@ func (it *xorOptSTtIterator) dodNoSTNext() ValueType { it.tDelta = uint64(int64(it.tDelta) + dod) it.t += int64(it.tDelta) - // Value. - if err := xorRead(&it.br, &it.val, &it.leading, &it.trailing); err != nil { + + return it.readValue() +} + +func (it *xorOptSTtIterator) readValue() ValueType { + err := xorRead(&it.br, &it.val, &it.leading, &it.trailing) + if err != nil { return it.retErr(err) } - - // State EOF check. 
it.numRead++ - if it.numRead >= it.numTotal { - it.state = eofState - } return ValFloat } diff --git a/tsdb/chunkenc/xoroptst_test.go b/tsdb/chunkenc/xoroptst_test.go index fe41b751fc..15b87993de 100644 --- a/tsdb/chunkenc/xoroptst_test.go +++ b/tsdb/chunkenc/xoroptst_test.go @@ -86,23 +86,23 @@ func TestXorOptSTChunk_STHeader(t *testing.T) { writeHeaderFirstSTKnown(b) firstSTKnown, firstSTChangeOn := readSTHeader(b) require.True(t, firstSTKnown) - require.Equal(t, uint16(0), firstSTChangeOn) + require.Equal(t, uint8(0), firstSTChangeOn) b = make([]byte, 1) firstSTKnown, firstSTChangeOn = readSTHeader(b) require.False(t, firstSTKnown) - require.Equal(t, uint16(0), firstSTChangeOn) + require.Equal(t, uint8(0), firstSTChangeOn) b = make([]byte, 1) writeHeaderFirstSTChangeOn(b, 1) firstSTKnown, firstSTChangeOn = readSTHeader(b) require.False(t, firstSTKnown) - require.Equal(t, uint16(1), firstSTChangeOn) + require.Equal(t, uint8(1), firstSTChangeOn) b = make([]byte, 1) writeHeaderFirstSTKnown(b) writeHeaderFirstSTChangeOn(b, 119) firstSTKnown, firstSTChangeOn = readSTHeader(b) require.True(t, firstSTKnown) - require.Equal(t, uint16(119), firstSTChangeOn) + require.Equal(t, uint8(119), firstSTChangeOn) } From a0314d5539a14992fe46416fec8fde505871d50c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 30 Jan 2026 09:01:29 +0100 Subject: [PATCH 07/73] Fix benchmark expectations on ST in interator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/benchmark_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tsdb/chunkenc/benchmark_test.go b/tsdb/chunkenc/benchmark_test.go index b628be43c8..bf72f855ef 100644 --- a/tsdb/chunkenc/benchmark_test.go +++ b/tsdb/chunkenc/benchmark_test.go @@ -213,7 +213,7 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl } for _, f := range []fmtCase{ - {name: "XOR", 
newChunkFn: func() Chunk { return NewXORChunk() }}, + {name: "XOR", newChunkFn: func() Chunk { return NewXORChunk() }, stUnsupported: true}, {name: "XOR_OPT_ST", newChunkFn: func() Chunk { return NewXOROptSTChunk() }}, } { for _, s := range sampleCases { From 4ef81865752304973580fe8d8aad591454ddf00a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 30 Jan 2026 09:23:29 +0100 Subject: [PATCH 08/73] add inclusive delta test case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/benchmark_test.go | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/tsdb/chunkenc/benchmark_test.go b/tsdb/chunkenc/benchmark_test.go index bf72f855ef..1679158f16 100644 --- a/tsdb/chunkenc/benchmark_test.go +++ b/tsdb/chunkenc/benchmark_test.go @@ -80,7 +80,7 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl { // Delta simulates delta type or worst case for cumulatives, where ST // is changing on every sample. - name: "vt=constant/st=delta", + name: "vt=constant/st=delta-exclusive", samples: func() (ret []triple) { t, v := initT, initV for range nSamples { @@ -91,6 +91,20 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl return ret }(), }, + { + // Delta simulates delta type or worst case for cumulatives, where ST + // is changing on every sample. + name: "vt=constant/st=delta-inclusive", + samples: func() (ret []triple) { + t, v := initT, initV + for range nSamples { + st := t // ST is the same as the previous t. 
+ t += 15000 + ret = append(ret, triple{st: st, t: t, v: v}) + } + return ret + }(), + }, { name: "vt=random steps/st=0", samples: func() (ret []triple) { @@ -198,7 +212,7 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl }(), }, { - name: "vt=random 0-1/st=delta", + name: "vt=random 0-1/st=delta-exclusive", samples: func() (ret []triple) { t, v := initT, initV for range nSamples { @@ -210,6 +224,19 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl return ret }(), }, + { + name: "vt=random 0-1/st=delta-inclusive", + samples: func() (ret []triple) { + t, v := initT, initV + for range nSamples { + st := t // ST is the same as the previous t. + t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. + v += r.Float64() // Random between 0 and 1.0. + ret = append(ret, triple{st: st, t: t, v: v}) + } + return ret + }(), + }, } for _, f := range []fmtCase{ From e6a0193a27e36851cb137ee001de7b8a69c00065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 30 Jan 2026 09:27:13 +0100 Subject: [PATCH 09/73] make testcases independent of order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/benchmark_test.go | 79 ++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/tsdb/chunkenc/benchmark_test.go b/tsdb/chunkenc/benchmark_test.go index 1679158f16..c9d5ab46aa 100644 --- a/tsdb/chunkenc/benchmark_test.go +++ b/tsdb/chunkenc/benchmark_test.go @@ -46,12 +46,21 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl require.NoError(b, err) var ( - r = rand.New(rand.NewSource(1)) - initST = timestamp.FromTime(d) // Use realistic timestamp. - initT = initST + 15000 // 15s after initST. - initV = 1243535.123 + r = rand.New(rand.NewSource(1)) // Fixed seed for reproducibility. 
+ initST = timestamp.FromTime(d) // Use realistic timestamp. + initT = initST + 15000 // 15s after initST. + initV = 1243535.123 + rInts = make([]int64, nSamples) + rFloats = make([]float64, nSamples) ) + // Pre-generate random numbers so that adding/removing cases does not change + // the generated samples. + for i := range nSamples { + rInts[i] = int64(r.Intn(100)) + rFloats[i] = float64(r.Intn(100)) + } + sampleCases := []sampleCase{ { name: "vt=constant/st=0", @@ -109,9 +118,9 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl name: "vt=random steps/st=0", samples: func() (ret []triple) { t, v := initT, initV - for range nSamples { - t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. - v += float64(r.Intn(100) - 50) // Varying from -50 to +50 in 100 discrete steps. + for i := range nSamples { + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v += rFloats[i] - 50 // Varying from -50 to +50 in 100 discrete steps. ret = append(ret, triple{st: 0, t: t, v: v}) } return ret @@ -121,9 +130,9 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl name: "vt=random steps/st=cumulative", samples: func() (ret []triple) { t, v := initT, initV - for range nSamples { - t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. - v += float64(r.Intn(100) - 50) // Varying from -50 to +50 in 100 discrete steps. + for i := range nSamples { + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v += rFloats[i] - 50 // Varying from -50 to +50 in 100 discrete steps. ret = append(ret, triple{st: initST, t: t, v: v}) } return ret @@ -133,10 +142,10 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl name: "vt=random steps/st=delta", samples: func() (ret []triple) { t, v := initT, initV - for range nSamples { - st := t + 1 // ST is a tight interval after the last t+1ms. 
- t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. - v += float64(r.Intn(100) - 50) // Varying from -50 to +50 in 100 discrete steps. + for i := range nSamples { + st := t + 1 // ST is a tight interval after the last t+1ms. + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v += rFloats[i] - 50 // Varying from -50 to +50 in 100 discrete steps. ret = append(ret, triple{st: st, t: t, v: v}) } return ret @@ -146,9 +155,9 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl name: "vt=random 0-1/st=0", samples: func() (ret []triple) { t, v := initT, initV - for range nSamples { - t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. - v += r.Float64() // Random between 0 and 1.0. + for i := range nSamples { + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v += rFloats[i] / 100.0 // Random between 0 and 1.0. ret = append(ret, triple{st: 0, t: t, v: v}) } return ret @@ -159,9 +168,9 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl name: "vt=negrandom 0-1/st=0", samples: func() (ret []triple) { t, v := initT, initV - for range nSamples { - t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. - v -= r.Float64() // Random between 0 and 1.0. + for i := range nSamples { + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v -= rFloats[i] / 100.0 // Random between 0 and 1.0. ret = append(ret, triple{st: 0, t: t, v: v}) } return ret @@ -171,9 +180,9 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl name: "vt=random 0-1/st=cumulative", samples: func() (ret []triple) { t, v := initT, initV - for range nSamples { - t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. - v += r.Float64() // Random between 0 and 1.0. + for i := range nSamples { + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. 
+ v += rFloats[i] / 100.0 // Random between 0 and 1.0. ret = append(ret, triple{st: initST, t: t, v: v}) } return ret @@ -184,8 +193,8 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl samples: func() (ret []triple) { t, v := initT, initV for i := range nSamples { - t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. - v += r.Float64() // Random between 0 and 1.0. + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v += rFloats[i] / 100.0 // Random between 0 and 1.0. st := initST if i%6 == 5 { st = t - 10000 // Reset of 10s before current t. @@ -200,8 +209,8 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl samples: func() (ret []triple) { t, v := initT, initV for i := range nSamples { - t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. - v += r.Float64() // Random between 0 and 1.0. + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v += rFloats[i] / 100.0 // Random between 0 and 1.0. st := initST if i%6 == 5 { st = 0 @@ -215,10 +224,10 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl name: "vt=random 0-1/st=delta-exclusive", samples: func() (ret []triple) { t, v := initT, initV - for range nSamples { - st := t + 1 // ST is a tight interval after the last t+1ms. - t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. - v += r.Float64() // Random between 0 and 1.0. + for i := range nSamples { + st := t + 1 // ST is a tight interval after the last t+1ms. + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v += rFloats[i] / 100.0 // Random between 0 and 1.0. 
ret = append(ret, triple{st: st, t: t, v: v}) } return ret @@ -228,10 +237,10 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl name: "vt=random 0-1/st=delta-inclusive", samples: func() (ret []triple) { t, v := initT, initV - for range nSamples { - st := t // ST is the same as the previous t. - t += int64(r.Intn(100) - 50 + 15000) // 15 seconds +- up to 100ms of jitter. - v += r.Float64() // Random between 0 and 1.0. + for i := range nSamples { + st := t // ST is the same as the previous t. + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v += rFloats[i] / 100.0 // Random between 0 and 1.0. ret = append(ret, triple{st: st, t: t, v: v}) } return ret From 65b70bbe775c1aa38c1dba881e81a6e592eb8960 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 30 Jan 2026 10:39:50 +0100 Subject: [PATCH 10/73] drop unused code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/varbit_classic.go | 103 --------------------------- tsdb/chunkenc/varbit_classic_test.go | 57 --------------- 2 files changed, 160 deletions(-) delete mode 100644 tsdb/chunkenc/varbit_classic.go delete mode 100644 tsdb/chunkenc/varbit_classic_test.go diff --git a/tsdb/chunkenc/varbit_classic.go b/tsdb/chunkenc/varbit_classic.go deleted file mode 100644 index b8f293bc3f..0000000000 --- a/tsdb/chunkenc/varbit_classic.go +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package chunkenc - -// putClassicVarbitInt writes an int64 using varbit encoding with a bit bucketing -// as it was done for a long time in the initial XOR chunk format. -func putClassicVarbitInt(b *bstream, val int64) { - // Gorilla has a max resolution of seconds, Prometheus milliseconds. - // Thus we use higher value range steps with larger bit size. - // - // TODO(beorn7): This seems to needlessly jump to large bit - // sizes even for very small deviations from zero. Timestamp - // compression can probably benefit from some smaller bit - // buckets. See also what was done for histogram encoding in - // varbit.go. - switch { - case val == 0: - b.writeBit(zero) - case bitRange(val, 14): - b.writeByte(0b10<<6 | (uint8(val>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. - b.writeByte(uint8(val)) // Bottom 8 bits of dod. - case bitRange(val, 17): - b.writeBits(0b110, 3) - b.writeBits(uint64(val), 17) - case bitRange(val, 20): - b.writeBits(0b1110, 4) - b.writeBits(uint64(val), 20) - default: - b.writeBits(0b1111, 4) - b.writeBits(uint64(val), 64) - } -} - -// readClassicVarbitInt reads an int64 encoded with putClassicVarbitInt. -// This is copied into production code to make it inline. -func readClassicVarbitInt(b *bstreamReader) (int64, error) { - var d byte - // read delta-of-delta - for range 4 { - d <<= 1 - bit, err := b.readBitFast() - if err != nil { - bit, err = b.readBit() - if err != nil { - return 0, err - } - } - if bit == zero { - break - } - d |= 1 - } - var sz uint8 - var val int64 - switch d { - case 0b0: - // dod == 0 - case 0b10: - sz = 14 - case 0b110: - sz = 17 - case 0b1110: - sz = 20 - case 0b1111: - // Do not use fast because it's very unlikely it will succeed. 
- bits, err := b.readBits(64) - if err != nil { - return 0, err - } - - val = int64(bits) - } - - if sz != 0 { - bits, err := b.readBitsFast(sz) - if err != nil { - bits, err = b.readBits(sz) - if err != nil { - return 0, err - } - } - - // Account for negative numbers, which come back as high unsigned numbers. - // See docs/bstream.md. - if bits > (1 << (sz - 1)) { - bits -= 1 << sz - } - val = int64(bits) - } - - return val, nil -} diff --git a/tsdb/chunkenc/varbit_classic_test.go b/tsdb/chunkenc/varbit_classic_test.go deleted file mode 100644 index f64d2ca9a9..0000000000 --- a/tsdb/chunkenc/varbit_classic_test.go +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package chunkenc - -import ( - "math" - "testing" - - "github.com/stretchr/testify/require" -) - -func TestClassicVarbitInt(t *testing.T) { - numbers := []int64{ - math.MinInt64, - -36028797018963968, -36028797018963967, - -16777216, -16777215, - -131072, -131071, - -2048, -2047, - -256, -255, - -32, -31, - -4, -3, - -1, 0, 1, - 4, 5, - 32, 33, - 256, 257, - 2048, 2049, - 131072, 131073, - 16777216, 16777217, - 36028797018963968, 36028797018963969, - math.MaxInt64, - } - - bs := bstream{} - - for _, n := range numbers { - putClassicVarbitInt(&bs, n) - } - - bsr := newBReader(bs.bytes()) - - for _, want := range numbers { - got, err := readClassicVarbitInt(&bsr) - require.NoError(t, err) - require.Equal(t, want, got) - } -} From 475dfb5ea95fbef847ade47affccd52afc89d100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 30 Jan 2026 10:41:05 +0100 Subject: [PATCH 11/73] Drop commented out line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/xoroptst.go | 1 - 1 file changed, 1 deletion(-) diff --git a/tsdb/chunkenc/xoroptst.go b/tsdb/chunkenc/xoroptst.go index 6f27e69f66..d538e06dfa 100644 --- a/tsdb/chunkenc/xoroptst.go +++ b/tsdb/chunkenc/xoroptst.go @@ -377,7 +377,6 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { a.b.writeBits(0b1111, 4) a.b.writeBits(uint64(sdod), 64) } - // putClassicVarbitInt(a.b, sdod) } else { if a.firstSTChangeOn == 0 { if a.numTotal == maxFirstSTChangeOn { From 6133dd5f3e73b59ed25bc10041250dbaee9c79a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 30 Jan 2026 11:31:54 +0100 Subject: [PATCH 12/73] documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/xoroptst.go | 5 ++-- tsdb/docs/format/chunks.md | 51 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 
insertions(+), 3 deletions(-) diff --git a/tsdb/chunkenc/xoroptst.go b/tsdb/chunkenc/xoroptst.go index d538e06dfa..dcc8bf6c41 100644 --- a/tsdb/chunkenc/xoroptst.go +++ b/tsdb/chunkenc/xoroptst.go @@ -56,9 +56,8 @@ func readSTHeader(b []byte) (firstSTKnown bool, firstSTChangeOn uint8) { return firstSTKnown, b[0] & mask } -// XorOptSTChunk holds encoded sample data: -// 2B(numSamples), 1B(stHeader), ?varint(st), varint(t), xor(v), ?varuint(stDelta), varuint(tDelta), xor(v), ?classicvarbitint(stDod), classicvarbitint(tDod), xor(v), ... -// stHeader: 1b(firstSTKnown), 7b(firstSTChangeOn). +// XorOptSTChunk holds XOR enncoded samples with optional start time (ST) +// per chunk or per sample. See tsdb/docs/format/chunks.md for details. type XorOptSTChunk struct { b bstream } diff --git a/tsdb/docs/format/chunks.md b/tsdb/docs/format/chunks.md index a604c9ea55..0ef7b18d32 100644 --- a/tsdb/docs/format/chunks.md +++ b/tsdb/docs/format/chunks.md @@ -65,6 +65,57 @@ Notes: * `padding` of 0 to 7 bits so that the whole chunk data is byte-aligned. * The chunk can have as few as one sample, i.e. `ts_1`, `v_1`, etc. are optional. +## XOR chunk data with start timestamp + +This is experimental, related to supporting delta temporality metrics. +Subject to change. + +The format is similar to XOR chunk data, except there's an additional one byte +start time (ST) header and optional start time values, delta, delta of deltas. + +``` +┌──────────────────────┬───────────────────┬────────────────┬───────────────────────────────┬─- +│ num_samples │ st_header | ?st_0 | ts_0 │ v_0 │ +└──────────────────────┴───────────────────┴────────────────┴───────────────────────────────┴─- + +-──────────────────────┬──────────────────────┬──────────────────────┬─- + ?st_1_delta | ts_1_delta │ v_1_xor │ +-──────────────────────┴──────────────────────┴──────────────────────┴─- + +-──────────────────────┬──────────────────────┬──────────────────────┬─────┬─- + ?st_2_dod | ts_2_dod │ v_2_xor │ ... 
│ +-──────────────────────┴──────────────────────┴──────────────────────┴─────┴─- + +-──────────────────────┬──────────────────────┬──────────────────────┬──────────────────┐ + ?st_n_dod | ts_n_dod │ v_n_xor │ padding │ +-──────────────────────┴──────────────────────┴──────────────────────┴──────────────────┘ +``` + +### Notes + +In addition to the notes from [XOR chunk data](#xor-chunk-data). + +* We use `st_i_dod` and `st_i` interchangeably when `i>1` in these notes. +* `st_header` is one byte: + ``` + ┌───────────────────────┬───────────────────────┐ + │ first_st_known<1 bit> | st_changed_on<7 bits> │ + └───────────────────────┴───────────────────────┘ + ``` + where the highest bit `first_st_known` indicates if `st_0` is present or not. + If the lower 7bits `st_changed_on` is 0, no `st_i (i>0)` is present. + Otherwise `st_i (i>=st_changed_on>)` is present, while + `st_i (01)` is encoded as a `varbit_ts` "delta of delta" from + `st_i-1` (or from 0 if `st_i-1` is not present). + ## Histogram chunk data ``` From 1ac3b8d8ec79a216cb6e0398ba1cdfde8302654d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 30 Jan 2026 11:34:59 +0100 Subject: [PATCH 13/73] Small simplification in the doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/docs/format/chunks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tsdb/docs/format/chunks.md b/tsdb/docs/format/chunks.md index 0ef7b18d32..c126a4d8dd 100644 --- a/tsdb/docs/format/chunks.md +++ b/tsdb/docs/format/chunks.md @@ -71,7 +71,7 @@ This is experimental, related to supporting delta temporality metrics. Subject to change. The format is similar to XOR chunk data, except there's an additional one byte -start time (ST) header and optional start time values, delta, delta of deltas. +start time (ST) header and optional start time values. 
``` ┌──────────────────────┬───────────────────┬────────────────┬───────────────────────────────┬─- From 9fc9033ad8788af54aa6885a3a3739ae575dc2bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Tue, 3 Feb 2026 08:24:12 +0100 Subject: [PATCH 14/73] Add delta st inclusive test case for random vt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/benchmark_test.go | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tsdb/chunkenc/benchmark_test.go b/tsdb/chunkenc/benchmark_test.go index c9d5ab46aa..c1335d1e44 100644 --- a/tsdb/chunkenc/benchmark_test.go +++ b/tsdb/chunkenc/benchmark_test.go @@ -139,7 +139,7 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl }(), }, { - name: "vt=random steps/st=delta", + name: "vt=random steps/st=delta-exclusive", samples: func() (ret []triple) { t, v := initT, initV for i := range nSamples { @@ -151,6 +151,19 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl return ret }(), }, + { + name: "vt=random steps/st=delta-inclusive", + samples: func() (ret []triple) { + t, v := initT, initV + for i := range nSamples { + st := t // ST is equal to the previous t. + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v += rFloats[i] - 50 // Varying from -50 to +50 in 100 discrete steps. + ret = append(ret, triple{st: st, t: t, v: v}) + } + return ret + }(), + }, { name: "vt=random 0-1/st=0", samples: func() (ret []triple) { From 59a6706abb49ad2569cdc76261b2edc85abecd1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Tue, 3 Feb 2026 20:55:44 +0100 Subject: [PATCH 15/73] Switch to delta of difference of st to prev t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit from delta of delta of st. 
Signed-off-by: György Krajcsovits --- tsdb/chunkenc/xoroptst.go | 90 +++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 37 deletions(-) diff --git a/tsdb/chunkenc/xoroptst.go b/tsdb/chunkenc/xoroptst.go index dcc8bf6c41..ec912f1089 100644 --- a/tsdb/chunkenc/xoroptst.go +++ b/tsdb/chunkenc/xoroptst.go @@ -119,7 +119,7 @@ func (c *XorOptSTChunk) Appender() (Appender, error) { st: it.st, t: it.t, v: it.val, - stDelta: it.stDelta, + stDiff: it.stDiff, tDelta: it.tDelta, leading: it.leading, trailing: it.trailing, @@ -158,8 +158,8 @@ type xorOptSTAppender struct { firstSTKnown bool st, t int64 v float64 - stDelta int64 - tDelta uint64 + stDiff int64 // Difference between current ST and previous T. Undefined for first sample. + tDelta uint64 // Difference between current T and previous T. Undefined for first sample. } func (a *xorOptSTAppender) writeVDelta(v float64) { @@ -188,9 +188,9 @@ type xorOptSTtIterator struct { st, t int64 val float64 - stDelta int64 - tDelta uint64 - err error + stDiff int64 + tDelta uint64 + err error } func (it *xorOptSTtIterator) Seek(t int64) ValueType { @@ -241,7 +241,7 @@ func (it *xorOptSTtIterator) Reset(b []byte) { it.val = 0 it.leading = 0 it.trailing = 0 - it.stDelta = 0 + it.stDiff = 0 it.tDelta = 0 it.err = nil } @@ -309,8 +309,8 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { } var ( - stDelta int64 - tDelta uint64 + stDiff int64 // Difference between current ST and previous T. Undefined for first sample. + tDelta uint64 // Difference between current T and previous T. Undefined for first sample. ) // Slow path for ST usage. 
@@ -330,11 +330,11 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { a.b.writeBits(math.Float64bits(v), 64) case 1: buf := make([]byte, binary.MaxVarintLen64) - stDelta = st - a.st - if stDelta != 0 { + if st != a.st { + stDiff = a.t - st a.firstSTChangeOn = 1 writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], 1) - for _, b := range buf[:binary.PutVarint(buf, stDelta)] { + for _, b := range buf[:binary.PutVarint(buf, stDiff)] { a.b.writeByte(b) } } @@ -345,13 +345,40 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { } a.writeVDelta(v) default: - stDelta = st - a.st - sdod := stDelta - a.stDelta - if sdod != 0 { - if a.firstSTChangeOn == 0 { + if a.firstSTChangeOn == 0 { + if st != a.st || a.numTotal == maxFirstSTChangeOn { + stDiff = a.t - st a.firstSTChangeOn = a.numTotal writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], a.numTotal) + sdod := stDiff + // Gorilla has a max resolution of seconds, Prometheus milliseconds. + // Thus we use higher value range steps with larger bit size. + // + // TODO(beorn7): This seems to needlessly jump to large bit + // sizes even for very small deviations from zero. Timestamp + // compression can probably benefit from some smaller bit + // buckets. See also what was done for histogram encoding in + // varbit.go. + switch { + case sdod == 0: + a.b.writeBit(zero) + case bitRange(sdod, 14): + a.b.writeByte(0b10<<6 | (uint8(sdod>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. + a.b.writeByte(uint8(sdod)) // Bottom 8 bits of dod. + case bitRange(sdod, 17): + a.b.writeBits(0b110, 3) + a.b.writeBits(uint64(sdod), 17) + case bitRange(sdod, 20): + a.b.writeBits(0b1110, 4) + a.b.writeBits(uint64(sdod), 20) + default: + a.b.writeBits(0b1111, 4) + a.b.writeBits(uint64(sdod), 64) + } } + } else { + stDiff = a.t - st + sdod := stDiff - a.stDiff // Gorilla has a max resolution of seconds, Prometheus milliseconds. // Thus we use higher value range steps with larger bit size. 
// @@ -376,19 +403,6 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { a.b.writeBits(0b1111, 4) a.b.writeBits(uint64(sdod), 64) } - } else { - if a.firstSTChangeOn == 0 { - if a.numTotal == maxFirstSTChangeOn { - // We are at the 127th sample. firstSTChangeOn can only fit - // 7 bits due to a single byte header constrain, which is fine, - // given typical 120 sample size. - a.firstSTChangeOn = maxFirstSTChangeOn - writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], maxFirstSTChangeOn) - a.b.writeBit(zero) - } - } else { - a.b.writeBit(zero) - } } tDelta = uint64(t - a.t) @@ -426,7 +440,7 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { a.t = t a.v = v a.tDelta = tDelta - a.stDelta = stDelta + a.stDiff = stDiff a.numTotal++ binary.BigEndian.PutUint16(a.b.bytes(), a.numTotal) @@ -469,12 +483,12 @@ func (it *xorOptSTtIterator) Next() ValueType { if it.numRead == 1 { // Optional ST delta read. if it.firstSTChangeOn == 1 { - stDelta, err := binary.ReadVarint(&it.br) + stDiff, err := binary.ReadVarint(&it.br) if err != nil { return it.retErr(err) } - it.stDelta = stDelta - it.st += it.stDelta + it.stDiff = stDiff + it.st = it.t - stDiff } tDelta, err := binary.ReadUvarint(&it.br) if err != nil { @@ -487,7 +501,6 @@ func (it *xorOptSTtIterator) Next() ValueType { } if it.firstSTChangeOn > 0 && it.numRead >= uint16(it.firstSTChangeOn) { - // Inlined readClassicVarbitInt(&it.br) var d byte // read delta-of-delta for range 4 { @@ -541,9 +554,12 @@ func (it *xorOptSTtIterator) Next() ValueType { } sdod = int64(bits) } - - it.stDelta += sdod - it.st += it.stDelta + if it.numRead == uint16(it.firstSTChangeOn) { + it.stDiff = sdod + } else { + it.stDiff += sdod + } + it.st = it.t - it.stDiff } var d byte From bbc85a3a96003400db840825d8013be1d7638036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Tue, 3 Feb 2026 22:11:04 +0100 Subject: [PATCH 16/73] Write ST after T and V so we can write a single bit on the second 
sample MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/xoroptst.go | 347 ++++++++++++++++++++++++-------------- 1 file changed, 220 insertions(+), 127 deletions(-) diff --git a/tsdb/chunkenc/xoroptst.go b/tsdb/chunkenc/xoroptst.go index ec912f1089..95c40c3a92 100644 --- a/tsdb/chunkenc/xoroptst.go +++ b/tsdb/chunkenc/xoroptst.go @@ -318,33 +318,96 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { case 0: buf := make([]byte, binary.MaxVarintLen64) + // Write T. + for _, b := range buf[:binary.PutVarint(buf, t)] { + a.b.writeByte(b) + } + + // Write V. + a.b.writeBits(math.Float64bits(v), 64) + + // Write ST. for _, b := range buf[:binary.PutVarint(buf, st)] { a.b.writeByte(b) } a.firstSTKnown = true writeHeaderFirstSTKnown(a.b.bytes()[chunkHeaderSize:]) - for _, b := range buf[:binary.PutVarint(buf, t)] { - a.b.writeByte(b) - } - a.b.writeBits(math.Float64bits(v), 64) case 1: buf := make([]byte, binary.MaxVarintLen64) - if st != a.st { - stDiff = a.t - st - a.firstSTChangeOn = 1 - writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], 1) - for _, b := range buf[:binary.PutVarint(buf, stDiff)] { - a.b.writeByte(b) - } - } - tDelta = uint64(t - a.t) for _, b := range buf[:binary.PutUvarint(buf, tDelta)] { a.b.writeByte(b) } a.writeVDelta(v) + + if st == a.st { + break + } + + stDiff = a.t - st + a.firstSTChangeOn = 1 + writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], 1) + // for _, b := range buf[:binary.PutVarint(buf, stDiff)] { + // a.b.writeByte(b) + // } + sdod := stDiff + // Gorilla has a max resolution of seconds, Prometheus milliseconds. + // Thus we use higher value range steps with larger bit size. + // + // TODO(beorn7): This seems to needlessly jump to large bit + // sizes even for very small deviations from zero. Timestamp + // compression can probably benefit from some smaller bit + // buckets. 
See also what was done for histogram encoding in + // varbit.go. + switch { + case sdod == 0: + a.b.writeBit(zero) + case bitRange(sdod, 14): + a.b.writeByte(0b10<<6 | (uint8(sdod>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. + a.b.writeByte(uint8(sdod)) // Bottom 8 bits of dod. + case bitRange(sdod, 17): + a.b.writeBits(0b110, 3) + a.b.writeBits(uint64(sdod), 17) + case bitRange(sdod, 20): + a.b.writeBits(0b1110, 4) + a.b.writeBits(uint64(sdod), 20) + default: + a.b.writeBits(0b1111, 4) + a.b.writeBits(uint64(sdod), 64) + } + default: + tDelta = uint64(t - a.t) + dod := int64(tDelta - a.tDelta) + + // Gorilla has a max resolution of seconds, Prometheus milliseconds. + // Thus we use higher value range steps with larger bit size. + // + // TODO(beorn7): This seems to needlessly jump to large bit + // sizes even for very small deviations from zero. Timestamp + // compression can probably benefit from some smaller bit + // buckets. See also what was done for histogram encoding in + // varbit.go. + switch { + case dod == 0: + a.b.writeBit(zero) + case bitRange(dod, 14): + a.b.writeByte(0b10<<6 | (uint8(dod>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. + a.b.writeByte(uint8(dod)) // Bottom 8 bits of dod. + case bitRange(dod, 17): + a.b.writeBits(0b110, 3) + a.b.writeBits(uint64(dod), 17) + case bitRange(dod, 20): + a.b.writeBits(0b1110, 4) + a.b.writeBits(uint64(dod), 20) + default: + a.b.writeBits(0b1111, 4) + a.b.writeBits(uint64(dod), 64) + } + + a.writeVDelta(v) + if a.firstSTChangeOn == 0 { if st != a.st || a.numTotal == maxFirstSTChangeOn { stDiff = a.t - st @@ -404,36 +467,6 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { a.b.writeBits(uint64(sdod), 64) } } - - tDelta = uint64(t - a.t) - dod := int64(tDelta - a.tDelta) - - // Gorilla has a max resolution of seconds, Prometheus milliseconds. - // Thus we use higher value range steps with larger bit size. 
- // - // TODO(beorn7): This seems to needlessly jump to large bit - // sizes even for very small deviations from zero. Timestamp - // compression can probably benefit from some smaller bit - // buckets. See also what was done for histogram encoding in - // varbit.go. - switch { - case dod == 0: - a.b.writeBit(zero) - case bitRange(dod, 14): - a.b.writeByte(0b10<<6 | (uint8(dod>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. - a.b.writeByte(uint8(dod)) // Bottom 8 bits of dod. - case bitRange(dod, 17): - a.b.writeBits(0b110, 3) - a.b.writeBits(uint64(dod), 17) - case bitRange(dod, 20): - a.b.writeBits(0b1110, 4) - a.b.writeBits(uint64(dod), 20) - default: - a.b.writeBits(0b1111, 4) - a.b.writeBits(uint64(dod), 64) - } - - a.writeVDelta(v) } a.st = st @@ -457,6 +490,18 @@ func (it *xorOptSTtIterator) Next() ValueType { } if it.numRead == 0 { + t, err := binary.ReadVarint(&it.br) + if err != nil { + return it.retErr(err) + } + + v, err := it.br.readBits(64) + if err != nil { + return it.retErr(err) + } + it.t = t + it.val = math.Float64frombits(v) + // Optional ST read. if it.firstSTKnown { st, err := binary.ReadVarint(&it.br) @@ -465,39 +510,150 @@ func (it *xorOptSTtIterator) Next() ValueType { } it.st = st } - t, err := binary.ReadVarint(&it.br) - if err != nil { - return it.retErr(err) - } - v, err := it.br.readBits(64) - if err != nil { - return it.retErr(err) - } - it.t = t - it.val = math.Float64frombits(v) it.numRead++ return ValFloat } if it.numRead == 1 { - // Optional ST delta read. - if it.firstSTChangeOn == 1 { - stDiff, err := binary.ReadVarint(&it.br) - if err != nil { - return it.retErr(err) - } - it.stDiff = stDiff - it.st = it.t - stDiff - } tDelta, err := binary.ReadUvarint(&it.br) if err != nil { return it.retErr(err) } it.tDelta = tDelta - it.t += int64(it.tDelta) - return it.readValue() + if err := xorRead(&it.br, &it.val, &it.leading, &it.trailing); err != nil { + return it.retErr(err) + } + + // Optional ST delta read. 
+ if it.firstSTChangeOn == 1 { + // stDiff, err := binary.ReadVarint(&it.br) + // if err != nil { + // return it.retErr(err) + // } + // it.stDiff = stDiff + // it.st = it.t - stDiff + var d byte + // read delta-of-delta + for range 4 { + d <<= 1 + bit, err := it.br.readBitFast() + if err != nil { + bit, err = it.br.readBit() + if err != nil { + return it.retErr(err) + } + } + if bit == zero { + break + } + d |= 1 + } + var sz uint8 + var sdod int64 + switch d { + case 0b0: + // dod == 0 + case 0b10: + sz = 14 + case 0b110: + sz = 17 + case 0b1110: + sz = 20 + case 0b1111: + // Do not use fast because it's very unlikely it will succeed. + bits, err := it.br.readBits(64) + if err != nil { + return it.retErr(err) + } + + sdod = int64(bits) + } + + if sz != 0 { + bits, err := it.br.readBitsFast(sz) + if err != nil { + bits, err = it.br.readBits(sz) + if err != nil { + return it.retErr(err) + } + } + + // Account for negative numbers, which come back as high unsigned numbers. + // See docs/bstream.md. + if bits > (1 << (sz - 1)) { + bits -= 1 << sz + } + sdod = int64(bits) + } + it.stDiff = sdod + it.st = it.t - sdod + } + + it.t += int64(it.tDelta) + it.numRead++ + return ValFloat + } + + var d byte + // read delta-of-delta + for range 4 { + d <<= 1 + bit, err := it.br.readBitFast() + if err != nil { + bit, err = it.br.readBit() + } + if err != nil { + return it.retErr(err) + } + if bit == zero { + break + } + d |= 1 + } + var sz uint8 + var dod int64 + switch d { + case 0b0: + // dod == 0 + case 0b10: + sz = 14 + case 0b110: + sz = 17 + case 0b1110: + sz = 20 + case 0b1111: + // Do not use fast because it's very unlikely it will succeed. 
+ bits, err := it.br.readBits(64) + if err != nil { + return it.retErr(err) + } + + dod = int64(bits) + } + + if sz != 0 { + bits, err := it.br.readBitsFast(sz) + if err != nil { + bits, err = it.br.readBits(sz) + } + if err != nil { + return it.retErr(err) + } + + // Account for negative numbers, which come back as high unsigned numbers. + // See docs/bstream.md. + if bits > (1 << (sz - 1)) { + bits -= 1 << sz + } + dod = int64(bits) + } + + it.tDelta = uint64(int64(it.tDelta) + dod) + + if err := xorRead(&it.br, &it.val, &it.leading, &it.trailing); err != nil { + return it.retErr(err) } if it.firstSTChangeOn > 0 && it.numRead >= uint16(it.firstSTChangeOn) { @@ -562,71 +718,8 @@ func (it *xorOptSTtIterator) Next() ValueType { it.st = it.t - it.stDiff } - var d byte - // read delta-of-delta - for range 4 { - d <<= 1 - bit, err := it.br.readBitFast() - if err != nil { - bit, err = it.br.readBit() - } - if err != nil { - return it.retErr(err) - } - if bit == zero { - break - } - d |= 1 - } - var sz uint8 - var dod int64 - switch d { - case 0b0: - // dod == 0 - case 0b10: - sz = 14 - case 0b110: - sz = 17 - case 0b1110: - sz = 20 - case 0b1111: - // Do not use fast because it's very unlikely it will succeed. - bits, err := it.br.readBits(64) - if err != nil { - return it.retErr(err) - } - - dod = int64(bits) - } - - if sz != 0 { - bits, err := it.br.readBitsFast(sz) - if err != nil { - bits, err = it.br.readBits(sz) - } - if err != nil { - return it.retErr(err) - } - - // Account for negative numbers, which come back as high unsigned numbers. - // See docs/bstream.md. 
- if bits > (1 << (sz - 1)) { - bits -= 1 << sz - } - dod = int64(bits) - } - - it.tDelta = uint64(int64(it.tDelta) + dod) it.t += int64(it.tDelta) - return it.readValue() -} - -func (it *xorOptSTtIterator) readValue() ValueType { - err := xorRead(&it.br, &it.val, &it.leading, &it.trailing) - if err != nil { - return it.retErr(err) - } it.numRead++ return ValFloat } From e777d4077cd567f73133d0a60f5bc11eaec2ace6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 4 Feb 2026 19:48:15 +0100 Subject: [PATCH 17/73] verify chunk sample len function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/st_helper_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/tsdb/chunkenc/st_helper_test.go b/tsdb/chunkenc/st_helper_test.go index 662866de93..e3d64e31c0 100644 --- a/tsdb/chunkenc/st_helper_test.go +++ b/tsdb/chunkenc/st_helper_test.go @@ -68,6 +68,7 @@ func testChunkSTHandling(t *testing.T, vt ValueType, chunkFactory func() Chunk) for _, s := range samples { sampleAppend(app, vt, s.st, s.t, s.v) } + require.Equal(t, len(samples), chunk.NumSamples()) it := chunk.Iterator(nil) for i, s := range samples { require.Equal(t, vt, it.Next(), "%d: value type mismatch", i) From 59bd0a2a3018e98872c6c786822f73a2842e1fb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Wed, 4 Feb 2026 21:20:32 +0100 Subject: [PATCH 18/73] Reduce size of first st stored a little MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/xoroptst.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tsdb/chunkenc/xoroptst.go b/tsdb/chunkenc/xoroptst.go index 95c40c3a92..71bb195d0a 100644 --- a/tsdb/chunkenc/xoroptst.go +++ b/tsdb/chunkenc/xoroptst.go @@ -327,7 +327,7 @@ func (a *xorOptSTAppender) Append(st, t int64, v float64) { 
a.b.writeBits(math.Float64bits(v), 64) // Write ST. - for _, b := range buf[:binary.PutVarint(buf, st)] { + for _, b := range buf[:binary.PutVarint(buf, t-st)] { a.b.writeByte(b) } a.firstSTKnown = true @@ -508,7 +508,7 @@ func (it *xorOptSTtIterator) Next() ValueType { if err != nil { return it.retErr(err) } - it.st = st + it.st = t - st } it.numRead++ From 65efa8f95a7eab6485660e00a27ee836b10d66b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 5 Feb 2026 18:44:01 +0100 Subject: [PATCH 19/73] test the case where st equals the t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/st_helper_test.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tsdb/chunkenc/st_helper_test.go b/tsdb/chunkenc/st_helper_test.go index e3d64e31c0..fa9c9b451c 100644 --- a/tsdb/chunkenc/st_helper_test.go +++ b/tsdb/chunkenc/st_helper_test.go @@ -92,6 +92,11 @@ func testChunkSTHandling(t *testing.T, vt ValueType, chunkFactory func() Chunk) }) stTimes := []int64{0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000} + + ts := func(j int) int64 { + return int64(1000 * (j + 1)) + } + for numberOfSamples := range 5 { samples := make([]triple, numberOfSamples) sampleSTidx := make([]int, numberOfSamples) @@ -99,7 +104,7 @@ func testChunkSTHandling(t *testing.T, vt ValueType, chunkFactory func() Chunk) for j := range numberOfSamples { samples[j] = triple{ st: stTimes[sampleSTidx[j]], - t: int64(1000 * (j + 1)), + t: ts(j), v: float64(j) + 0.5, } } @@ -110,7 +115,7 @@ func testChunkSTHandling(t *testing.T, vt ValueType, chunkFactory func() Chunk) exhausted := true for j := numberOfSamples - 1; j >= 0; j-- { - if sampleSTidx[j] < j+2 { + if stTimes[sampleSTidx[j]] < ts(j) { sampleSTidx[j]++ exhausted = false break From de4a24325e1a02eb475f0bdb2d866e7c114254d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 5 Feb 2026 
18:52:37 +0100 Subject: [PATCH 20/73] add st equal t to benchmarks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/benchmark_test.go | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tsdb/chunkenc/benchmark_test.go b/tsdb/chunkenc/benchmark_test.go index c1335d1e44..5efa866e31 100644 --- a/tsdb/chunkenc/benchmark_test.go +++ b/tsdb/chunkenc/benchmark_test.go @@ -114,6 +114,17 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl return ret }(), }, + { + name: "vt=constant/st=t", + samples: func() (ret []triple) { + t, v := initT, initV + for range nSamples { + t += 15000 + ret = append(ret, triple{st: t, t: t, v: v}) + } + return ret + }(), + }, { name: "vt=random steps/st=0", samples: func() (ret []triple) { @@ -164,6 +175,18 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl return ret }(), }, + { + name: "vt=random steps/st=t", + samples: func() (ret []triple) { + t, v := initT, initV + for i := range nSamples { + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v += rFloats[i] - 50 // Varying from -50 to +50 in 100 discrete steps. + ret = append(ret, triple{st: t, t: t, v: v}) + } + return ret + }(), + }, { name: "vt=random 0-1/st=0", samples: func() (ret []triple) { @@ -259,6 +282,18 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl return ret }(), }, + { + name: "vt=random 0-1/st=t", + samples: func() (ret []triple) { + t, v := initT, initV + for i := range nSamples { + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v += rFloats[i] / 100.0 // Random between 0 and 1.0. 
+ ret = append(ret, triple{st: t, t: t, v: v}) + } + return ret + }(), + }, } for _, f := range []fmtCase{ From 07c543f02d75536bb7768db9be2ce33b032b289b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 6 Feb 2026 15:33:05 +0100 Subject: [PATCH 21/73] test(chunkenc): test that appender can continue chunks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test that initializing a chunk appender from an existing chunk works correctly. Signed-off-by: György Krajcsovits --- tsdb/chunkenc/st_helper_test.go | 48 +++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/tsdb/chunkenc/st_helper_test.go b/tsdb/chunkenc/st_helper_test.go index fa9c9b451c..7cc4e9f119 100644 --- a/tsdb/chunkenc/st_helper_test.go +++ b/tsdb/chunkenc/st_helper_test.go @@ -65,20 +65,46 @@ func testChunkSTHandling(t *testing.T, vt ValueType, chunkFactory func() Chunk) chunk := chunkFactory() app, err := chunk.Appender() require.NoError(t, err) - for _, s := range samples { + var clone []byte + for i, s := range samples { + if i == len(samples)-1 { + clone = append(clone, chunk.Bytes()...) + } sampleAppend(app, vt, s.st, s.t, s.v) } - require.Equal(t, len(samples), chunk.NumSamples()) - it := chunk.Iterator(nil) - for i, s := range samples { - require.Equal(t, vt, it.Next(), "%d: value type mismatch", i) - st, ts, f := get(it, vt) - require.Equal(t, s.t, ts, "%d: timestamp mismatch", i) - require.Equal(t, s.st, st, "%d: start time mismatch", i) - require.InDelta(t, s.v, f, 1e-9, "%d: value mismatch", i) + chunksToTest := []Chunk{chunk} + + if len(samples) > 0 { + // If there are samples, also test that appending to a chunk cloned from the original chunk works correctly. + // This tests resuming the appender from a previous chunk. 
+ cloneChunk := chunkFactory() + cloneChunk.Reset(clone) + cloneApp, err := cloneChunk.Appender() + require.NoError(t, err) + sampleAppend(cloneApp, vt, samples[len(samples)-1].st, samples[len(samples)-1].t, samples[len(samples)-1].v) + chunksToTest = append(chunksToTest, cloneChunk) + } + + printChunkName := func(i int) string { + if i == 0 { + return "original" + } + return "cloned" + } + + for ci, chk := range chunksToTest { + require.Equal(t, len(samples), chk.NumSamples(), "%s chunk: number of samples mismatch", printChunkName(ci)) + it := chk.Iterator(nil) + for i, s := range samples { + require.Equal(t, vt, it.Next(), "%s[%d]: value type mismatch", printChunkName(ci), i) + st, ts, f := get(it, vt) + require.Equal(t, s.t, ts, "%s[%d]: timestamp mismatch", printChunkName(ci), i) + require.Equal(t, s.st, st, "%s[%d]: start time mismatch", printChunkName(ci), i) + require.InDelta(t, s.v, f, 1e-9, "%s[%d]: value mismatch", printChunkName(ci), i) + } + require.Equal(t, ValNone, it.Next()) + require.NoError(t, it.Err()) } - require.Equal(t, ValNone, it.Next()) - require.NoError(t, it.Err()) } t.Run("manual for debugging", func(t *testing.T) { From b2491c7cf7797ad54a0be80744d178ba6759de08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 6 Feb 2026 15:34:56 +0100 Subject: [PATCH 22/73] fix(chunkenc): bug in initializing appender on existing chunk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/xoroptst.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tsdb/chunkenc/xoroptst.go b/tsdb/chunkenc/xoroptst.go index 71bb195d0a..b138ddbdf4 100644 --- a/tsdb/chunkenc/xoroptst.go +++ b/tsdb/chunkenc/xoroptst.go @@ -114,6 +114,10 @@ func (c *XorOptSTChunk) Appender() (Appender, error) { return nil, err } + // Set the bit position for continuing writes. + // The iterator's reader tracks how many bits remain unread in the last byte. 
+ c.b.count = it.br.valid + a := &xorOptSTAppender{ b: &c.b, st: it.st, From 64208c7e6284d39c117935f7fff867268e4df415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Sat, 7 Feb 2026 16:10:26 +0100 Subject: [PATCH 23/73] Add cases with jitter in the start time as well MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/benchmark_test.go | 43 ++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tsdb/chunkenc/benchmark_test.go b/tsdb/chunkenc/benchmark_test.go index 5efa866e31..702e3a95e2 100644 --- a/tsdb/chunkenc/benchmark_test.go +++ b/tsdb/chunkenc/benchmark_test.go @@ -50,7 +50,7 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl initST = timestamp.FromTime(d) // Use realistic timestamp. initT = initST + 15000 // 15s after initST. initV = 1243535.123 - rInts = make([]int64, nSamples) + rInts = make([]int64, 2*nSamples) // Random ints for timestamps and STs. rFloats = make([]float64, nSamples) ) @@ -58,6 +58,7 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl // the generated samples. for i := range nSamples { rInts[i] = int64(r.Intn(100)) + rInts[nSamples+i] = int64(r.Intn(100)) rFloats[i] = float64(r.Intn(100)) } @@ -125,6 +126,20 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl return ret }(), }, + { + // Delta simulates delta type or worst case for cumulatives, where ST + // is changing on every sample. + name: "vt=constant/st=delta-jitter", + samples: func() (ret []triple) { + t, v := initT, initV + for i := range nSamples { + st := t + rInts[nSamples+i] // ST is the same as the previous t + jitter of up to 100ms. 
+ t += 15000 + ret = append(ret, triple{st: st, t: t, v: v}) + } + return ret + }(), + }, { name: "vt=random steps/st=0", samples: func() (ret []triple) { @@ -187,6 +202,19 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl return ret }(), }, + { + name: "vt=random steps/st=delta-jittery", + samples: func() (ret []triple) { + t, v := initT, initV + for i := range nSamples { + st := t + rInts[nSamples+i] // ST is equal to the previous t + jitter of up to 100ms. + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v += rFloats[i] - 50 // Varying from -50 to +50 in 100 discrete steps. + ret = append(ret, triple{st: st, t: t, v: v}) + } + return ret + }(), + }, { name: "vt=random 0-1/st=0", samples: func() (ret []triple) { @@ -294,6 +322,19 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl return ret }(), }, + { + name: "vt=random 0-1/st=delta-jittery", + samples: func() (ret []triple) { + t, v := initT, initV + for i := range nSamples { + st := t + rInts[nSamples+i] // ST is equal to the previous t + jitter of up to 100ms. + t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. + v += rFloats[i] / 100.0 // Random between 0 and 1.0. + ret = append(ret, triple{st: st, t: t, v: v}) + } + return ret + }(), + }, } for _, f := range []fmtCase{ From b57f5b59b3d8b3c5fd558aca5a856b617b2738ce Mon Sep 17 00:00:00 2001 From: Owen Williams Date: Thu, 12 Feb 2026 13:17:50 -0500 Subject: [PATCH 24/73] tsdb: ST-in-WAL: Counter implementation and benchmarks (#17671) Initial implementation of https://github.com/prometheus/prometheus/issues/17790. Only implements ST-per-sample for Counters. Tests and benchmarks updated. Note: This increases the size of the RefSample object for all users, whether st-per-sample is turned on or not. 
Signed-off-by: Owen Williams --- storage/remote/queue_manager_test.go | 3 - tsdb/agent/db.go | 8 +- tsdb/agent/db_append_v2_test.go | 452 +++---- tsdb/agent/db_test.go | 6 +- tsdb/compression/compression.go | 130 +++ tsdb/db_append_v2_test.go | 571 ++++----- tsdb/db_test.go | 571 ++++----- tsdb/head.go | 9 +- tsdb/head_append.go | 3 + tsdb/head_append_v2_test.go | 578 ++++----- tsdb/head_test.go | 1619 +++++++++++++------------- tsdb/head_wal.go | 6 +- tsdb/record/bench_test.go | 207 ++++ tsdb/record/record.go | 153 ++- tsdb/record/record_test.go | 532 +++++---- tsdb/wlog/checkpoint.go | 6 +- tsdb/wlog/checkpoint_test.go | 444 +++---- tsdb/wlog/watcher.go | 2 +- tsdb/wlog/watcher_test.go | 936 +++++++-------- util/testrecord/record.go | 96 ++ 20 files changed, 3527 insertions(+), 2805 deletions(-) create mode 100644 tsdb/compression/compression.go create mode 100644 tsdb/record/bench_test.go create mode 100644 util/testrecord/record.go diff --git a/storage/remote/queue_manager_test.go b/storage/remote/queue_manager_test.go index a4b05d387a..1386a64aec 100644 --- a/storage/remote/queue_manager_test.go +++ b/storage/remote/queue_manager_test.go @@ -200,7 +200,6 @@ func TestBasicContentNegotiation(t *testing.T) { } func TestSampleDelivery(t *testing.T) { - t.Parallel() // Let's create an even number of send batches, so we don't run into the // batch timeout case. n := 3 @@ -409,7 +408,6 @@ func TestWALMetadataDelivery(t *testing.T) { } func TestSampleDeliveryTimeout(t *testing.T) { - t.Parallel() for _, protoMsg := range []remoteapi.WriteMessageType{remoteapi.WriteV1MessageType, remoteapi.WriteV2MessageType} { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { // Let's send one less sample than batch size, and wait the timeout duration @@ -2038,7 +2036,6 @@ func TestIsSampleOld(t *testing.T) { // Simulates scenario in which remote write endpoint is down and a subset of samples is dropped due to age limit while backoffing. 
func TestSendSamplesWithBackoffWithSampleAgeLimit(t *testing.T) { - t.Parallel() for _, protoMsg := range []remoteapi.WriteMessageType{remoteapi.WriteV1MessageType, remoteapi.WriteV2MessageType} { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { maxSamplesPerSend := 10 diff --git a/tsdb/agent/db.go b/tsdb/agent/db.go index 460ceb7c04..3f79d9176a 100644 --- a/tsdb/agent/db.go +++ b/tsdb/agent/db.go @@ -490,7 +490,7 @@ func (db *DB) loadWAL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.H return } decoded <- series - case record.Samples: + case record.Samples, record.SamplesV2: samples := db.walReplaySamplesPool.Get()[:0] samples, err = dec.Samples(rec, samples) if err != nil { @@ -710,7 +710,7 @@ func (db *DB) truncate(mint int64) error { db.metrics.checkpointCreationTotal.Inc() - if _, err = wlog.Checkpoint(db.logger, db.wal, first, last, db.keepSeriesInWALCheckpointFn(last), mint); err != nil { + if _, err = wlog.Checkpoint(db.logger, db.wal, first, last, db.keepSeriesInWALCheckpointFn(last), mint, db.opts.EnableSTStorage); err != nil { db.metrics.checkpointCreationFail.Inc() var cerr *wlog.CorruptionErr if errors.As(err, &cerr) { @@ -1156,7 +1156,7 @@ func (a *appenderBase) log() error { a.mtx.RLock() defer a.mtx.RUnlock() - var encoder record.Encoder + encoder := record.Encoder{EnableSTStorage: a.opts.EnableSTStorage} buf := a.bufPool.Get().([]byte) defer func() { a.bufPool.Put(buf) //nolint:staticcheck @@ -1280,7 +1280,7 @@ func (a *appenderBase) logSeries() error { a.bufPool.Put(buf) //nolint:staticcheck }() - var encoder record.Encoder + encoder := record.Encoder{EnableSTStorage: a.opts.EnableSTStorage} buf = encoder.Series(a.pendingSeries, buf) if err := a.wal.Log(buf); err != nil { return err diff --git a/tsdb/agent/db_append_v2_test.go b/tsdb/agent/db_append_v2_test.go index 3e10a1163b..139e7baa19 100644 --- a/tsdb/agent/db_append_v2_test.go +++ b/tsdb/agent/db_append_v2_test.go @@ -18,6 +18,7 @@ import ( "fmt" "math" "path/filepath" + "strconv" 
"testing" "time" @@ -95,272 +96,275 @@ func TestCommit_AppendV2(t *testing.T) { numHistograms = 100 numSeries = 8 ) + for _, enableStStorage := range []bool{false, true} { + t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + opts := DefaultOptions() + opts.EnableSTStorage = enableStStorage + s := createTestAgentDB(t, nil, opts) - s := createTestAgentDB(t, nil, DefaultOptions()) - app := s.AppenderV2(context.TODO()) + app := s.AppenderV2(context.TODO()) - lbls := labelsForTest(t.Name(), numSeries) - for _, l := range lbls { - lset := labels.New(l...) + lbls := labelsForTest(t.Name(), numSeries) + for _, l := range lbls { + lset := labels.New(l...) - for i := range numDatapoints { - sample := chunks.GenerateSamples(0, 1) - _, err := app.Append(0, lset, 0, sample[0].T(), sample[0].F(), nil, nil, storage.AOptions{ - Exemplars: []exemplar.Exemplar{{ - Labels: lset, - Ts: sample[0].T() + int64(i), - Value: sample[0].F(), - HasTs: true, - }}, - }) + for i := range numDatapoints { + sample := chunks.GenerateSamples(0, 1) + _, err := app.Append(0, lset, int64(i), sample[0].T()+2000, sample[0].F(), nil, nil, storage.AOptions{ + Exemplars: []exemplar.Exemplar{{ + Labels: lset, + Ts: sample[0].T() + int64(i) + 2000, + Value: sample[0].F(), + HasTs: true, + }}, + }) + require.NoError(t, err) + } + } + + lbls = labelsForTest(t.Name()+"_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) + + histograms := tsdbutil.GenerateTestHistograms(numHistograms) + + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, histograms[i], nil, storage.AOptions{}) + require.NoError(t, err) + } + } + + lbls = labelsForTest(t.Name()+"_custom_buckets_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) 
+ + customBucketHistograms := tsdbutil.GenerateTestCustomBucketsHistograms(numHistograms) + + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, customBucketHistograms[i], nil, storage.AOptions{}) + require.NoError(t, err) + } + } + + lbls = labelsForTest(t.Name()+"_float_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) + + floatHistograms := tsdbutil.GenerateTestFloatHistograms(numHistograms) + + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, nil, floatHistograms[i], storage.AOptions{}) + require.NoError(t, err) + } + } + + lbls = labelsForTest(t.Name()+"_custom_buckets_float_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) + + customBucketFloatHistograms := tsdbutil.GenerateTestCustomBucketsFloatHistograms(numHistograms) + + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, nil, customBucketFloatHistograms[i], storage.AOptions{}) + require.NoError(t, err) + } + } + + require.NoError(t, app.Commit()) + require.NoError(t, s.Close()) + + sr, err := wlog.NewSegmentsReader(s.wal.Dir()) require.NoError(t, err) - } + defer func() { + require.NoError(t, sr.Close()) + }() + + // Read records from WAL and check for expected count of series, samples, and exemplars. 
+ var ( + r = wlog.NewReader(sr) + dec = record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) + + walSeriesCount, walSamplesCount, walExemplarsCount, walHistogramCount, walFloatHistogramCount int + ) + for r.Next() { + rec := r.Record() + switch dec.Type(rec) { + case record.Series: + var series []record.RefSeries + series, err = dec.Series(rec, series) + require.NoError(t, err) + walSeriesCount += len(series) + + case record.Samples: + if enableStStorage { + t.Errorf("Got V1 Samples when ST enabled") + } + var samples []record.RefSample + samples, err = dec.Samples(rec, samples) + require.NoError(t, err) + walSamplesCount += len(samples) + + case record.SamplesV2: + if !enableStStorage { + t.Errorf("Got V2 Samples when ST disabled") + } + var samples []record.RefSample + samples, err = dec.Samples(rec, samples) + require.NoError(t, err) + walSamplesCount += len(samples) + + case record.HistogramSamples, record.CustomBucketsHistogramSamples: + var histograms []record.RefHistogramSample + histograms, err = dec.HistogramSamples(rec, histograms) + require.NoError(t, err) + walHistogramCount += len(histograms) + + case record.FloatHistogramSamples, record.CustomBucketsFloatHistogramSamples: + var floatHistograms []record.RefFloatHistogramSample + floatHistograms, err = dec.FloatHistogramSamples(rec, floatHistograms) + require.NoError(t, err) + walFloatHistogramCount += len(floatHistograms) + + case record.Exemplars: + var exemplars []record.RefExemplar + exemplars, err = dec.Exemplars(rec, exemplars) + require.NoError(t, err) + walExemplarsCount += len(exemplars) + + default: + } + } + + // Check that the WAL contained the same number of committed series/samples/exemplars. 
+ require.Equal(t, numSeries*5, walSeriesCount, "unexpected number of series") + require.Equal(t, numSeries*numDatapoints, walSamplesCount, "unexpected number of samples") + require.Equal(t, numSeries*numDatapoints, walExemplarsCount, "unexpected number of exemplars") + require.Equal(t, numSeries*numHistograms*2, walHistogramCount, "unexpected number of histograms") + require.Equal(t, numSeries*numHistograms*2, walFloatHistogramCount, "unexpected number of float histograms") + + // Check that we can still create both kinds of Appender - see https://github.com/prometheus/prometheus/issues/17800. + _ = s.Appender(context.TODO()) + _ = s.AppenderV2(context.TODO()) + }) } - - lbls = labelsForTest(t.Name()+"_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) - - histograms := tsdbutil.GenerateTestHistograms(numHistograms) - - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, histograms[i], nil, storage.AOptions{}) - require.NoError(t, err) - } - } - - lbls = labelsForTest(t.Name()+"_custom_buckets_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) - - customBucketHistograms := tsdbutil.GenerateTestCustomBucketsHistograms(numHistograms) - - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, customBucketHistograms[i], nil, storage.AOptions{}) - require.NoError(t, err) - } - } - - lbls = labelsForTest(t.Name()+"_float_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) - - floatHistograms := tsdbutil.GenerateTestFloatHistograms(numHistograms) - - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, nil, floatHistograms[i], storage.AOptions{}) - require.NoError(t, err) - } - } - - lbls = labelsForTest(t.Name()+"_custom_buckets_float_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) 
- - customBucketFloatHistograms := tsdbutil.GenerateTestCustomBucketsFloatHistograms(numHistograms) - - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, nil, customBucketFloatHistograms[i], storage.AOptions{}) - require.NoError(t, err) - } - } - - require.NoError(t, app.Commit()) - require.NoError(t, s.Close()) - - sr, err := wlog.NewSegmentsReader(s.wal.Dir()) - require.NoError(t, err) - defer func() { - require.NoError(t, sr.Close()) - }() - - // Read records from WAL and check for expected count of series, samples, and exemplars. - var ( - r = wlog.NewReader(sr) - dec = record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) - - walSeriesCount, walSamplesCount, walExemplarsCount, walHistogramCount, walFloatHistogramCount int - ) - for r.Next() { - rec := r.Record() - switch dec.Type(rec) { - case record.Series: - var series []record.RefSeries - series, err = dec.Series(rec, series) - require.NoError(t, err) - walSeriesCount += len(series) - - case record.Samples: - var samples []record.RefSample - samples, err = dec.Samples(rec, samples) - require.NoError(t, err) - walSamplesCount += len(samples) - - case record.HistogramSamples, record.CustomBucketsHistogramSamples: - var histograms []record.RefHistogramSample - histograms, err = dec.HistogramSamples(rec, histograms) - require.NoError(t, err) - walHistogramCount += len(histograms) - - case record.FloatHistogramSamples, record.CustomBucketsFloatHistogramSamples: - var floatHistograms []record.RefFloatHistogramSample - floatHistograms, err = dec.FloatHistogramSamples(rec, floatHistograms) - require.NoError(t, err) - walFloatHistogramCount += len(floatHistograms) - - case record.Exemplars: - var exemplars []record.RefExemplar - exemplars, err = dec.Exemplars(rec, exemplars) - require.NoError(t, err) - walExemplarsCount += len(exemplars) - - default: - } - } - - // Check that the WAL contained the same number of committed series/samples/exemplars. 
- require.Equal(t, numSeries*5, walSeriesCount, "unexpected number of series") - require.Equal(t, numSeries*numDatapoints, walSamplesCount, "unexpected number of samples") - require.Equal(t, numSeries*numDatapoints, walExemplarsCount, "unexpected number of exemplars") - require.Equal(t, numSeries*numHistograms*2, walHistogramCount, "unexpected number of histograms") - require.Equal(t, numSeries*numHistograms*2, walFloatHistogramCount, "unexpected number of float histograms") - - // Check that we can still create both kinds of Appender - see https://github.com/prometheus/prometheus/issues/17800. - _ = s.Appender(context.TODO()) - _ = s.AppenderV2(context.TODO()) } -func TestRollback_AppendV2(t *testing.T) { +func TestRollbackAppendV2(t *testing.T) { const ( numDatapoints = 1000 numHistograms = 100 numSeries = 8 ) - s := createTestAgentDB(t, nil, DefaultOptions()) - app := s.AppenderV2(context.TODO()) + for _, enableStStorage := range []bool{false, true} { + opts := DefaultOptions() + opts.EnableSTStorage = enableStStorage + s := createTestAgentDB(t, nil, opts) + app := s.AppenderV2(context.TODO()) - lbls := labelsForTest(t.Name(), numSeries) - for _, l := range lbls { - lset := labels.New(l...) + lbls := labelsForTest(t.Name(), numSeries) + for _, l := range lbls { + lset := labels.New(l...) - for range numDatapoints { - sample := chunks.GenerateSamples(0, 1) - _, err := app.Append(0, lset, 0, sample[0].T(), sample[0].F(), nil, nil, storage.AOptions{}) - require.NoError(t, err) + for i := range numDatapoints { + sample := chunks.GenerateSamples(0, 1) + _, err := app.Append(0, lset, int64(i), sample[0].T()+2000, sample[0].F(), nil, nil, storage.AOptions{}) + require.NoError(t, err) + } } - } - lbls = labelsForTest(t.Name()+"_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) + lbls = labelsForTest(t.Name()+"_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) 
- histograms := tsdbutil.GenerateTestHistograms(numHistograms) + histograms := tsdbutil.GenerateTestHistograms(numHistograms) - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, histograms[i], nil, storage.AOptions{}) - require.NoError(t, err) + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, histograms[i], nil, storage.AOptions{}) + require.NoError(t, err) + } } - } - lbls = labelsForTest(t.Name()+"_custom_buckets_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) + lbls = labelsForTest(t.Name()+"_custom_buckets_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) - histograms := tsdbutil.GenerateTestCustomBucketsHistograms(numHistograms) + histograms := tsdbutil.GenerateTestCustomBucketsHistograms(numHistograms) - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, histograms[i], nil, storage.AOptions{}) - require.NoError(t, err) + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, histograms[i], nil, storage.AOptions{}) + require.NoError(t, err) + } } - } - lbls = labelsForTest(t.Name()+"_float_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) + lbls = labelsForTest(t.Name()+"_float_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) - floatHistograms := tsdbutil.GenerateTestFloatHistograms(numHistograms) + floatHistograms := tsdbutil.GenerateTestFloatHistograms(numHistograms) - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, nil, floatHistograms[i], storage.AOptions{}) - require.NoError(t, err) + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, nil, floatHistograms[i], storage.AOptions{}) + require.NoError(t, err) + } } - } - lbls = labelsForTest(t.Name()+"_custom_buckets_float_histogram", numSeries) - for _, l := range lbls { - lset := labels.New(l...) 
+ lbls = labelsForTest(t.Name()+"_custom_buckets_float_histogram", numSeries) + for _, l := range lbls { + lset := labels.New(l...) - floatHistograms := tsdbutil.GenerateTestCustomBucketsFloatHistograms(numHistograms) + floatHistograms := tsdbutil.GenerateTestCustomBucketsFloatHistograms(numHistograms) - for i := range numHistograms { - _, err := app.Append(0, lset, 0, int64(i), 0, nil, floatHistograms[i], storage.AOptions{}) - require.NoError(t, err) + for i := range numHistograms { + _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, nil, floatHistograms[i], storage.AOptions{}) + require.NoError(t, err) + } } - } - // Do a rollback, which should clear uncommitted data. A followup call to - // commit should persist nothing to the WAL. - require.NoError(t, app.Rollback()) - require.NoError(t, app.Commit()) - require.NoError(t, s.Close()) + // Do a rollback, which should clear uncommitted data. A followup call to + // commit should persist nothing to the WAL. + require.NoError(t, app.Rollback()) + require.NoError(t, app.Commit()) + require.NoError(t, s.Close()) - sr, err := wlog.NewSegmentsReader(s.wal.Dir()) - require.NoError(t, err) - defer func() { - require.NoError(t, sr.Close()) - }() + sr, err := wlog.NewSegmentsReader(s.wal.Dir()) + require.NoError(t, err) + defer func() { + require.NoError(t, sr.Close()) + }() - // Read records from WAL and check for expected count of series and samples. - var ( - r = wlog.NewReader(sr) - dec = record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) + // Read records from WAL and check for expected count of series and samples. 
+ var ( + r = wlog.NewReader(sr) + dec = record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) - walSeriesCount, walSamplesCount, walHistogramCount, walFloatHistogramCount, walExemplarsCount int - ) - for r.Next() { - rec := r.Record() - switch dec.Type(rec) { - case record.Series: - var series []record.RefSeries - series, err = dec.Series(rec, series) - require.NoError(t, err) - walSeriesCount += len(series) + walSeriesCount int + ) + for r.Next() { + rec := r.Record() + switch dec.Type(rec) { + case record.Series: + var series []record.RefSeries + series, err = dec.Series(rec, series) + require.NoError(t, err) + walSeriesCount += len(series) - case record.Samples: - var samples []record.RefSample - samples, err = dec.Samples(rec, samples) - require.NoError(t, err) - walSamplesCount += len(samples) + case record.Samples, record.SamplesV2: + t.Errorf("should not have found samples") - case record.Exemplars: - var exemplars []record.RefExemplar - exemplars, err = dec.Exemplars(rec, exemplars) - require.NoError(t, err) - walExemplarsCount += len(exemplars) + case record.Exemplars: + t.Errorf("should not have found exemplars") - case record.HistogramSamples, record.CustomBucketsHistogramSamples: - var histograms []record.RefHistogramSample - histograms, err = dec.HistogramSamples(rec, histograms) - require.NoError(t, err) - walHistogramCount += len(histograms) + case record.HistogramSamples, record.CustomBucketsHistogramSamples, record.FloatHistogramSamples, record.CustomBucketsFloatHistogramSamples: + t.Errorf("should not have found histograms") - case record.FloatHistogramSamples, record.CustomBucketsFloatHistogramSamples: - var floatHistograms []record.RefFloatHistogramSample - floatHistograms, err = dec.FloatHistogramSamples(rec, floatHistograms) - require.NoError(t, err) - walFloatHistogramCount += len(floatHistograms) - - default: + default: + } } - } - // Check that only series get stored after calling Rollback. 
- require.Equal(t, numSeries*5, walSeriesCount, "series should have been written to WAL") - require.Equal(t, 0, walSamplesCount, "samples should not have been written to WAL") - require.Equal(t, 0, walExemplarsCount, "exemplars should not have been written to WAL") - require.Equal(t, 0, walHistogramCount, "histograms should not have been written to WAL") - require.Equal(t, 0, walFloatHistogramCount, "float histograms should not have been written to WAL") + // Check that only series get stored after calling Rollback. + require.Equal(t, numSeries*5, walSeriesCount, "series should have been written to WAL") + } } func TestFullTruncateWAL_AppendV2(t *testing.T) { diff --git a/tsdb/agent/db_test.go b/tsdb/agent/db_test.go index 31e309d3fd..2f8212ff7a 100644 --- a/tsdb/agent/db_test.go +++ b/tsdb/agent/db_test.go @@ -225,7 +225,7 @@ func TestCommit(t *testing.T) { require.NoError(t, err) walSeriesCount += len(series) - case record.Samples: + case record.Samples, record.SamplesV2: var samples []record.RefSample samples, err = dec.Samples(rec, samples) require.NoError(t, err) @@ -361,7 +361,7 @@ func TestRollback(t *testing.T) { require.NoError(t, err) walSeriesCount += len(series) - case record.Samples: + case record.Samples, record.SamplesV2: var samples []record.RefSample samples, err = dec.Samples(rec, samples) require.NoError(t, err) @@ -1344,7 +1344,7 @@ func readWALSamples(t *testing.T, walDir string) []walSample { series, err := dec.Series(rec, nil) require.NoError(t, err) lastSeries = series[0] - case record.Samples: + case record.Samples, record.SamplesV2: samples, err = dec.Samples(rec, samples[:0]) require.NoError(t, err) for _, s := range samples { diff --git a/tsdb/compression/compression.go b/tsdb/compression/compression.go new file mode 100644 index 0000000000..147a526f7e --- /dev/null +++ b/tsdb/compression/compression.go @@ -0,0 +1,130 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may 
not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package compression + +import ( + "errors" + "fmt" + + "github.com/golang/snappy" + "github.com/klauspost/compress/zstd" +) + +// Type represents the compression type used for encoding and decoding data. +type Type string + +const ( + // None represents no compression case. + // None it's a default when Type is empty. + None Type = "none" + // Snappy represents snappy block format. + Snappy Type = "snappy" + // Zstd represents zstd compression. + Zstd Type = "zstd" +) + +// Encoder provides compression encoding functionality for supported compression +// types. It is agnostic to the content being compressed, operating on byte +// slices of serialized data streams. The encoder maintains internal state for +// Zstd compression and can handle multiple compression types including None, +// Snappy, and Zstd. +type Encoder struct { + w *zstd.Encoder +} + +// NewEncoder creates a new Encoder. Returns an error if the zstd encoder cannot +// be initialized. +func NewEncoder() (*Encoder, error) { + e := &Encoder{} + w, err := zstd.NewWriter(nil) + if err != nil { + return nil, err + } + e.w = w + return e, nil +} + +// Encode returns the encoded form of src for the given compression type. It also +// returns the indicator if the compression was performed. Encode may skip +// compressing for None type, but also when src is too large e.g. for Snappy block format. +// +// The buf is used as a buffer for returned encoding, and it must not overlap with +// src. It is valid to pass a nil buf. 
+func (e *Encoder) Encode(t Type, src, buf []byte) (_ []byte, compressed bool, err error) { + switch { + case len(src) == 0, t == "", t == None: + return src, false, nil + case t == Snappy: + // If MaxEncodedLen is less than 0 the record is too large to be compressed. + if snappy.MaxEncodedLen(len(src)) < 0 { + return src, false, nil + } + + // The snappy library uses `len` to calculate if we need a new buffer. + // In order to allocate as few buffers as possible make the length + // equal to the capacity. + buf = buf[:cap(buf)] + return snappy.Encode(buf, src), true, nil + case t == Zstd: + if e == nil { + return nil, false, errors.New("zstd requested but encoder was not initialized with NewEncoder()") + } + return e.w.EncodeAll(src, buf[:0]), true, nil + default: + return nil, false, fmt.Errorf("unsupported compression type: %s", t) + } +} + +// Decoder provides decompression functionality for supported compression types. +// It is agnostic to the content being decompressed, operating on byte slices of +// serialized data streams. The decoder maintains internal state for Zstd +// decompression and can handle multiple compression types including None, +// Snappy, and Zstd. +type Decoder struct { + r *zstd.Decoder +} + +// NewDecoder creates a new Decoder. +func NewDecoder() *Decoder { + d := &Decoder{} + + // Calling zstd.NewReader with a nil io.Reader and no options cannot return an error. + r, _ := zstd.NewReader(nil) + d.r = r + return d +} + +// Decode returns the decoded form of src or error, given expected compression type. +// +// The buf is used as a buffer for the returned decoded entry, and it must not +// overlap with src. It is valid to pass a nil buf. +func (d *Decoder) Decode(t Type, src, buf []byte) (_ []byte, err error) { + switch { + case len(src) == 0, t == "", t == None: + return src, nil + case t == Snappy: + // The snappy library uses `len` to calculate if we need a new buffer. 
+ // In order to allocate as few buffers as possible make the length + // equal to the capacity. + buf = buf[:cap(buf)] + return snappy.Decode(buf, src) + case t == Zstd: + if d == nil { + return nil, errors.New("zstd requested but Decoder was not initialized with NewDecoder()") + } + return d.r.DecodeAll(src, buf[:0]) + default: + return nil, fmt.Errorf("unsupported compression type: %s", t) + } +} diff --git a/tsdb/db_append_v2_test.go b/tsdb/db_append_v2_test.go index e6bcfb696d..15201d3dc7 100644 --- a/tsdb/db_append_v2_test.go +++ b/tsdb/db_append_v2_test.go @@ -193,7 +193,7 @@ func TestDataNotAvailableAfterRollback_AppendV2(t *testing.T) { require.NoError(t, err) walSeriesCount += len(series) - case record.Samples: + case record.Samples, record.SamplesV2: var samples []record.RefSample samples, err = dec.Samples(rec, samples) require.NoError(t, err) @@ -968,16 +968,18 @@ func TestWALReplayRaceOnSamplesLoggedBeforeSeries_AppendV2(t *testing.T) { // We test both with few and many samples appended after series creation. If samples are < 120 then there's no // mmap-ed chunk, otherwise there's at least 1 mmap-ed chunk when replaying the WAL. 
- for _, numSamplesAfterSeriesCreation := range []int{1, 1000} { - for run := 1; run <= numRuns; run++ { - t.Run(fmt.Sprintf("samples after series creation = %d, run = %d", numSamplesAfterSeriesCreation, run), func(t *testing.T) { - testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation) - }) + for _, enableStStorage := range []bool{false, true} { + for _, numSamplesAfterSeriesCreation := range []int{1, 1000} { + for run := 1; run <= numRuns; run++ { + t.Run(fmt.Sprintf("samples after series creation = %d, run = %d, stStorage = %v", numSamplesAfterSeriesCreation, run, enableStStorage), func(t *testing.T) { + testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation, enableStStorage) + }) + } } } } -func testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t *testing.T, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation int) { +func testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t *testing.T, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation int, enableStStorage bool) { const numSeries = 1000 db := newTestDB(t) @@ -985,7 +987,7 @@ func testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t *testing.T, numSampl for seriesRef := 1; seriesRef <= numSeries; seriesRef++ { // Log samples before the series is logged to the WAL. 
- var enc record.Encoder + enc := record.Encoder{EnableSTStorage: enableStStorage} var samples []record.RefSample for ts := range numSamplesBeforeSeriesCreation { @@ -1176,139 +1178,143 @@ func TestTombstoneCleanResultEmptyBlock_AppendV2(t *testing.T) { func TestSizeRetention_AppendV2(t *testing.T) { t.Parallel() - opts := DefaultOptions() - opts.OutOfOrderTimeWindow = 100 - db := newTestDB(t, withOpts(opts), withRngs(100)) + for _, enableStStorage := range []bool{false, true} { + t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + opts := DefaultOptions() + opts.OutOfOrderTimeWindow = 100 + db := newTestDB(t, withOpts(opts), withRngs(100)) - blocks := []*BlockMeta{ - {MinTime: 100, MaxTime: 200}, // Oldest block - {MinTime: 200, MaxTime: 300}, - {MinTime: 300, MaxTime: 400}, - {MinTime: 400, MaxTime: 500}, - {MinTime: 500, MaxTime: 600}, // Newest Block - } + blocks := []*BlockMeta{ + {MinTime: 100, MaxTime: 200}, // Oldest block + {MinTime: 200, MaxTime: 300}, + {MinTime: 300, MaxTime: 400}, + {MinTime: 400, MaxTime: 500}, + {MinTime: 500, MaxTime: 600}, // Newest Block + } - for _, m := range blocks { - createBlock(t, db.Dir(), genSeries(100, 10, m.MinTime, m.MaxTime)) - } + for _, m := range blocks { + createBlock(t, db.Dir(), genSeries(100, 10, m.MinTime, m.MaxTime)) + } - headBlocks := []*BlockMeta{ - {MinTime: 700, MaxTime: 800}, - } + headBlocks := []*BlockMeta{ + {MinTime: 700, MaxTime: 800}, + } - // Add some data to the WAL. - headApp := db.Head().AppenderV2(context.Background()) - var aSeries labels.Labels - var it chunkenc.Iterator - for _, m := range headBlocks { - series := genSeries(100, 10, m.MinTime, m.MaxTime+1) - for _, s := range series { - aSeries = s.Labels() - it = s.Iterator(it) - for it.Next() == chunkenc.ValFloat { - tim, v := it.At() - _, err := headApp.Append(0, s.Labels(), 0, tim, v, nil, nil, storage.AOptions{}) + // Add some data to the WAL. 
+ headApp := db.Head().AppenderV2(context.Background()) + var aSeries labels.Labels + var it chunkenc.Iterator + for _, m := range headBlocks { + series := genSeries(100, 10, m.MinTime, m.MaxTime+1) + for _, s := range series { + aSeries = s.Labels() + it = s.Iterator(it) + for it.Next() == chunkenc.ValFloat { + tim, v := it.At() + _, err := headApp.Append(0, s.Labels(), 0, tim, v, nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, it.Err()) + } + } + require.NoError(t, headApp.Commit()) + db.Head().mmapHeadChunks() + + require.Eventually(t, func() bool { + return db.Head().chunkDiskMapper.IsQueueEmpty() + }, 2*time.Second, 100*time.Millisecond) + + // Test that registered size matches the actual disk size. + require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. + require.Len(t, db.Blocks(), len(blocks)) // Ensure all blocks are registered. + blockSize := int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. + walSize, err := db.Head().wal.Size() + require.NoError(t, err) + cdmSize, err := db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + // Expected size should take into account block size + WAL size + Head + // chunks size + expSize := blockSize + walSize + cdmSize + actSize, err := fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Create a WAL checkpoint, and compare sizes. + first, last, err := wlog.Segments(db.Head().wal.Dir()) + require.NoError(t, err) + _, err = wlog.Checkpoint(promslog.NewNopLogger(), db.Head().wal, first, last-1, func(chunks.HeadSeriesRef) bool { return false }, 0, enableStStorage) + require.NoError(t, err) + blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. 
+ walSize, err = db.Head().wal.Size() + require.NoError(t, err) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + expSize = blockSize + walSize + cdmSize + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Truncate Chunk Disk Mapper and compare sizes. + require.NoError(t, db.Head().chunkDiskMapper.Truncate(900)) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + expSize = blockSize + walSize + cdmSize + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Add some out of order samples to check the size of WBL. + headApp = db.Head().AppenderV2(context.Background()) + for ts := int64(750); ts < 800; ts++ { + _, err := headApp.Append(0, aSeries, 0, ts, float64(ts), nil, nil, storage.AOptions{}) require.NoError(t, err) } - require.NoError(t, it.Err()) - } + require.NoError(t, headApp.Commit()) + + walSize, err = db.Head().wal.Size() + require.NoError(t, err) + wblSize, err := db.Head().wbl.Size() + require.NoError(t, err) + require.NotZero(t, wblSize) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + expSize = blockSize + walSize + wblSize + cdmSize + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Decrease the max bytes limit so that a delete is triggered. + // Check total size, total count and check that the oldest block was deleted. + firstBlockSize := db.Blocks()[0].Size() + sizeLimit := actSize - firstBlockSize + db.opts.MaxBytes = sizeLimit // Set the new db size limit one block smaller that the actual size. + require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. 
+ + expBlocks := blocks[1:] + actBlocks := db.Blocks() + blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) + walSize, err = db.Head().wal.Size() + require.NoError(t, err) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + // Expected size should take into account block size + WAL size + WBL size + expSize = blockSize + walSize + wblSize + cdmSize + actRetentionCount := int(prom_testutil.ToFloat64(db.metrics.sizeRetentionCount)) + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + + require.Equal(t, 1, actRetentionCount, "metric retention count mismatch") + require.Equal(t, expSize, actSize, "metric db size doesn't match actual disk size") + require.LessOrEqual(t, expSize, sizeLimit, "actual size (%v) is expected to be less than or equal to limit (%v)", expSize, sizeLimit) + require.Len(t, actBlocks, len(blocks)-1, "new block count should be decreased from:%v to:%v", len(blocks), len(blocks)-1) + require.Equal(t, expBlocks[0].MaxTime, actBlocks[0].meta.MaxTime, "maxT mismatch of the first block") + require.Equal(t, expBlocks[len(expBlocks)-1].MaxTime, actBlocks[len(actBlocks)-1].meta.MaxTime, "maxT mismatch of the last block") + }) } - require.NoError(t, headApp.Commit()) - db.Head().mmapHeadChunks() - - require.Eventually(t, func() bool { - return db.Head().chunkDiskMapper.IsQueueEmpty() - }, 2*time.Second, 100*time.Millisecond) - - // Test that registered size matches the actual disk size. - require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. - require.Len(t, db.Blocks(), len(blocks)) // Ensure all blocks are registered. - blockSize := int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. 
- walSize, err := db.Head().wal.Size() - require.NoError(t, err) - cdmSize, err := db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - // Expected size should take into account block size + WAL size + Head - // chunks size - expSize := blockSize + walSize + cdmSize - actSize, err := fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Create a WAL checkpoint, and compare sizes. - first, last, err := wlog.Segments(db.Head().wal.Dir()) - require.NoError(t, err) - _, err = wlog.Checkpoint(promslog.NewNopLogger(), db.Head().wal, first, last-1, func(chunks.HeadSeriesRef) bool { return false }, 0) - require.NoError(t, err) - blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. - walSize, err = db.Head().wal.Size() - require.NoError(t, err) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - expSize = blockSize + walSize + cdmSize - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Truncate Chunk Disk Mapper and compare sizes. - require.NoError(t, db.Head().chunkDiskMapper.Truncate(900)) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - expSize = blockSize + walSize + cdmSize - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Add some out of order samples to check the size of WBL. 
- headApp = db.Head().AppenderV2(context.Background()) - for ts := int64(750); ts < 800; ts++ { - _, err := headApp.Append(0, aSeries, 0, ts, float64(ts), nil, nil, storage.AOptions{}) - require.NoError(t, err) - } - require.NoError(t, headApp.Commit()) - - walSize, err = db.Head().wal.Size() - require.NoError(t, err) - wblSize, err := db.Head().wbl.Size() - require.NoError(t, err) - require.NotZero(t, wblSize) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - expSize = blockSize + walSize + wblSize + cdmSize - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Decrease the max bytes limit so that a delete is triggered. - // Check total size, total count and check that the oldest block was deleted. - firstBlockSize := db.Blocks()[0].Size() - sizeLimit := actSize - firstBlockSize - db.opts.MaxBytes = sizeLimit // Set the new db size limit one block smaller that the actual size. - require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. 
- - expBlocks := blocks[1:] - actBlocks := db.Blocks() - blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) - walSize, err = db.Head().wal.Size() - require.NoError(t, err) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - // Expected size should take into account block size + WAL size + WBL size - expSize = blockSize + walSize + wblSize + cdmSize - actRetentionCount := int(prom_testutil.ToFloat64(db.metrics.sizeRetentionCount)) - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - - require.Equal(t, 1, actRetentionCount, "metric retention count mismatch") - require.Equal(t, expSize, actSize, "metric db size doesn't match actual disk size") - require.LessOrEqual(t, expSize, sizeLimit, "actual size (%v) is expected to be less than or equal to limit (%v)", expSize, sizeLimit) - require.Len(t, actBlocks, len(blocks)-1, "new block count should be decreased from:%v to:%v", len(blocks), len(blocks)-1) - require.Equal(t, expBlocks[0].MaxTime, actBlocks[0].meta.MaxTime, "maxT mismatch of the first block") - require.Equal(t, expBlocks[len(expBlocks)-1].MaxTime, actBlocks[len(actBlocks)-1].meta.MaxTime, "maxT mismatch of the last block") } func TestNotMatcherSelectsLabelsUnsetSeries_AppendV2(t *testing.T) { @@ -1499,33 +1505,36 @@ func TestInitializeHeadTimestamp_AppendV2(t *testing.T) { require.Equal(t, int64(1000), db.head.MaxTime()) require.True(t, db.head.initialized()) }) - t.Run("wal-only", func(t *testing.T) { - dir := t.TempDir() - require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) - w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) - require.NoError(t, err) + for _, enableStStorage := range []bool{false, true} { + t.Run("wal-only,stStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + dir := t.TempDir() - var enc record.Encoder - err = w.Log( - enc.Series([]record.RefSeries{ - {Ref: 123, Labels: labels.FromStrings("a", "1")}, - 
{Ref: 124, Labels: labels.FromStrings("a", "2")}, - }, nil), - enc.Samples([]record.RefSample{ - {Ref: 123, T: 5000, V: 1}, - {Ref: 124, T: 15000, V: 1}, - }, nil), - ) - require.NoError(t, err) - require.NoError(t, w.Close()) + require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) + w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) + require.NoError(t, err) - db := newTestDB(t, withDir(dir)) + enc := record.Encoder{EnableSTStorage: enableStStorage} + err = w.Log( + enc.Series([]record.RefSeries{ + {Ref: 123, Labels: labels.FromStrings("a", "1")}, + {Ref: 124, Labels: labels.FromStrings("a", "2")}, + }, nil), + enc.Samples([]record.RefSample{ + {Ref: 123, T: 5000, V: 1}, + {Ref: 124, T: 15000, V: 1}, + }, nil), + ) + require.NoError(t, err) + require.NoError(t, w.Close()) - require.Equal(t, int64(5000), db.head.MinTime()) - require.Equal(t, int64(15000), db.head.MaxTime()) - require.True(t, db.head.initialized()) - }) + db := newTestDB(t, withDir(dir)) + + require.Equal(t, int64(5000), db.head.MinTime()) + require.Equal(t, int64(15000), db.head.MaxTime()) + require.True(t, db.head.initialized()) + }) + } t.Run("existing-block", func(t *testing.T) { dir := t.TempDir() @@ -1537,37 +1546,39 @@ func TestInitializeHeadTimestamp_AppendV2(t *testing.T) { require.Equal(t, int64(2000), db.head.MaxTime()) require.True(t, db.head.initialized()) }) - t.Run("existing-block-and-wal", func(t *testing.T) { - dir := t.TempDir() + for _, enableStStorage := range []bool{false, true} { + t.Run("existing-block-and-wal,stStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + dir := t.TempDir() - createBlock(t, dir, genSeries(1, 1, 1000, 6000)) + createBlock(t, dir, genSeries(1, 1, 1000, 6000)) - require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) - w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) - require.NoError(t, err) + require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) + w, err := wlog.New(nil, 
nil, path.Join(dir, "wal"), compression.None) + require.NoError(t, err) - var enc record.Encoder - err = w.Log( - enc.Series([]record.RefSeries{ - {Ref: 123, Labels: labels.FromStrings("a", "1")}, - {Ref: 124, Labels: labels.FromStrings("a", "2")}, - }, nil), - enc.Samples([]record.RefSample{ - {Ref: 123, T: 5000, V: 1}, - {Ref: 124, T: 15000, V: 1}, - }, nil), - ) - require.NoError(t, err) - require.NoError(t, w.Close()) + enc := record.Encoder{EnableSTStorage: enableStStorage} + err = w.Log( + enc.Series([]record.RefSeries{ + {Ref: 123, Labels: labels.FromStrings("a", "1")}, + {Ref: 124, Labels: labels.FromStrings("a", "2")}, + }, nil), + enc.Samples([]record.RefSample{ + {Ref: 123, T: 5000, V: 1}, + {Ref: 124, T: 15000, V: 1}, + }, nil), + ) + require.NoError(t, err) + require.NoError(t, w.Close()) - db := newTestDB(t, withDir(dir)) + db := newTestDB(t, withDir(dir)) - require.Equal(t, int64(6000), db.head.MinTime()) - require.Equal(t, int64(15000), db.head.MaxTime()) - require.True(t, db.head.initialized()) - // Check that old series has been GCed. - require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.series)) - }) + require.Equal(t, int64(6000), db.head.MinTime()) + require.Equal(t, int64(15000), db.head.MaxTime()) + require.True(t, db.head.initialized()) + // Check that old series has been GCed. 
+ require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.series)) + }) + } } func TestNoEmptyBlocks_AppendV2(t *testing.T) { @@ -3273,7 +3284,7 @@ func testOOOWALWriteAppendV2(t *testing.T, series, err := dec.Series(rec, nil) require.NoError(t, err) records = append(records, series) - case record.Samples: + case record.Samples, record.SamplesV2: samples, err := dec.Samples(rec, nil) require.NoError(t, err) records = append(records, samples) @@ -3430,112 +3441,116 @@ func TestMetadataInWAL_AppenderV2(t *testing.T) { } func TestMetadataCheckpointingOnlyKeepsLatestEntry_AppendV2(t *testing.T) { - ctx := context.Background() - numSamples := 10000 - hb, w := newTestHead(t, int64(numSamples)*10, compression.None, false) - hb.opts.EnableMetadataWALRecords = true + for _, enableStStorage := range []bool{false, true} { + t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + ctx := context.Background() + numSamples := 10000 + hb, w := newTestHead(t, int64(numSamples)*10, compression.None, false) + hb.opts.EnableMetadataWALRecords = true - // Add some series so we can append metadata to them. - s1 := labels.FromStrings("a", "b") - s2 := labels.FromStrings("c", "d") - s3 := labels.FromStrings("e", "f") - s4 := labels.FromStrings("g", "h") + // Add some series so we can append metadata to them. 
+ s1 := labels.FromStrings("a", "b") + s2 := labels.FromStrings("c", "d") + s3 := labels.FromStrings("e", "f") + s4 := labels.FromStrings("g", "h") - m1 := metadata.Metadata{Type: "gauge", Unit: "unit_1", Help: "help_1"} - m2 := metadata.Metadata{Type: "gauge", Unit: "unit_2", Help: "help_2"} - m3 := metadata.Metadata{Type: "gauge", Unit: "unit_3", Help: "help_3"} - m4 := metadata.Metadata{Type: "gauge", Unit: "unit_4", Help: "help_4"} + m1 := metadata.Metadata{Type: "gauge", Unit: "unit_1", Help: "help_1"} + m2 := metadata.Metadata{Type: "gauge", Unit: "unit_2", Help: "help_2"} + m3 := metadata.Metadata{Type: "gauge", Unit: "unit_3", Help: "help_3"} + m4 := metadata.Metadata{Type: "gauge", Unit: "unit_4", Help: "help_4"} - app := hb.AppenderV2(ctx) - ts := int64(0) - _, err := app.Append(0, s1, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m1}) - require.NoError(t, err) - _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m2}) - require.NoError(t, err) - _, err = app.Append(0, s3, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m3}) - require.NoError(t, err) - _, err = app.Append(0, s4, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m4}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + app := hb.AppenderV2(ctx) + ts := int64(0) + _, err := app.Append(0, s1, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m1}) + require.NoError(t, err) + _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m2}) + require.NoError(t, err) + _, err = app.Append(0, s3, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m3}) + require.NoError(t, err) + _, err = app.Append(0, s4, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m4}) + require.NoError(t, err) + require.NoError(t, app.Commit()) - // Update metadata for first series. 
- m5 := metadata.Metadata{Type: "counter", Unit: "unit_5", Help: "help_5"} - app = hb.AppenderV2(ctx) - ts++ - _, err = app.Append(0, s1, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m5}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + // Update metadata for first series. + m5 := metadata.Metadata{Type: "counter", Unit: "unit_5", Help: "help_5"} + app = hb.AppenderV2(ctx) + ts++ + _, err = app.Append(0, s1, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m5}) + require.NoError(t, err) + require.NoError(t, app.Commit()) - // Switch back-and-forth metadata for second series. - // Since it ended on a new metadata record, we expect a single new entry. - m6 := metadata.Metadata{Type: "counter", Unit: "unit_6", Help: "help_6"} + // Switch back-and-forth metadata for second series. + // Since it ended on a new metadata record, we expect a single new entry. + m6 := metadata.Metadata{Type: "counter", Unit: "unit_6", Help: "help_6"} - app = hb.AppenderV2(ctx) - ts++ - _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m6}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + app = hb.AppenderV2(ctx) + ts++ + _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m6}) + require.NoError(t, err) + require.NoError(t, app.Commit()) - app = hb.AppenderV2(ctx) - ts++ - _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m2}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + app = hb.AppenderV2(ctx) + ts++ + _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m2}) + require.NoError(t, err) + require.NoError(t, app.Commit()) - app = hb.AppenderV2(ctx) - ts++ - _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m6}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + app = hb.AppenderV2(ctx) + ts++ + _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m6}) + require.NoError(t, err) + require.NoError(t, 
app.Commit()) - app = hb.AppenderV2(ctx) - ts++ - _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m2}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + app = hb.AppenderV2(ctx) + ts++ + _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m2}) + require.NoError(t, err) + require.NoError(t, app.Commit()) - app = hb.AppenderV2(ctx) - ts++ - _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m6}) - require.NoError(t, err) - require.NoError(t, app.Commit()) + app = hb.AppenderV2(ctx) + ts++ + _, err = app.Append(0, s2, 0, ts, 0, nil, nil, storage.AOptions{Metadata: m6}) + require.NoError(t, err) + require.NoError(t, app.Commit()) - // Let's create a checkpoint. - first, last, err := wlog.Segments(w.Dir()) - require.NoError(t, err) - keep := func(id chunks.HeadSeriesRef) bool { - return id != 3 + // Let's create a checkpoint. + first, last, err := wlog.Segments(w.Dir()) + require.NoError(t, err) + keep := func(id chunks.HeadSeriesRef) bool { + return id != 3 + } + _, err = wlog.Checkpoint(promslog.NewNopLogger(), w, first, last-1, keep, 0, enableStStorage) + require.NoError(t, err) + + // Confirm there's been a checkpoint. + cdir, _, err := wlog.LastCheckpoint(w.Dir()) + require.NoError(t, err) + + // Read in checkpoint and WAL. + recs := readTestWAL(t, cdir) + var gotMetadataBlocks [][]record.RefMetadata + for _, rec := range recs { + if mr, ok := rec.([]record.RefMetadata); ok { + gotMetadataBlocks = append(gotMetadataBlocks, mr) + } + } + + // There should only be 1 metadata block present, with only the latest + // metadata kept around. 
+ wantMetadata := []record.RefMetadata{ + {Ref: 1, Type: record.GetMetricType(m5.Type), Unit: m5.Unit, Help: m5.Help}, + {Ref: 2, Type: record.GetMetricType(m6.Type), Unit: m6.Unit, Help: m6.Help}, + {Ref: 4, Type: record.GetMetricType(m4.Type), Unit: m4.Unit, Help: m4.Help}, + } + require.Len(t, gotMetadataBlocks, 1) + require.Len(t, gotMetadataBlocks[0], 3) + gotMetadataBlock := gotMetadataBlocks[0] + + sort.Slice(gotMetadataBlock, func(i, j int) bool { return gotMetadataBlock[i].Ref < gotMetadataBlock[j].Ref }) + require.Equal(t, wantMetadata, gotMetadataBlock) + require.NoError(t, hb.Close()) + }) } - _, err = wlog.Checkpoint(promslog.NewNopLogger(), w, first, last-1, keep, 0) - require.NoError(t, err) - - // Confirm there's been a checkpoint. - cdir, _, err := wlog.LastCheckpoint(w.Dir()) - require.NoError(t, err) - - // Read in checkpoint and WAL. - recs := readTestWAL(t, cdir) - var gotMetadataBlocks [][]record.RefMetadata - for _, rec := range recs { - if mr, ok := rec.([]record.RefMetadata); ok { - gotMetadataBlocks = append(gotMetadataBlocks, mr) - } - } - - // There should only be 1 metadata block present, with only the latest - // metadata kept around. 
- wantMetadata := []record.RefMetadata{ - {Ref: 1, Type: record.GetMetricType(m5.Type), Unit: m5.Unit, Help: m5.Help}, - {Ref: 2, Type: record.GetMetricType(m6.Type), Unit: m6.Unit, Help: m6.Help}, - {Ref: 4, Type: record.GetMetricType(m4.Type), Unit: m4.Unit, Help: m4.Help}, - } - require.Len(t, gotMetadataBlocks, 1) - require.Len(t, gotMetadataBlocks[0], 3) - gotMetadataBlock := gotMetadataBlocks[0] - - sort.Slice(gotMetadataBlock, func(i, j int) bool { return gotMetadataBlock[i].Ref < gotMetadataBlock[j].Ref }) - require.Equal(t, wantMetadata, gotMetadataBlock) - require.NoError(t, hb.Close()) } func TestMetadataAssertInMemoryData_AppendV2(t *testing.T) { diff --git a/tsdb/db_test.go b/tsdb/db_test.go index 18e969f952..13c37eb219 100644 --- a/tsdb/db_test.go +++ b/tsdb/db_test.go @@ -395,7 +395,7 @@ func TestDataNotAvailableAfterRollback(t *testing.T) { require.NoError(t, err) walSeriesCount += len(series) - case record.Samples: + case record.Samples, record.SamplesV2: var samples []record.RefSample samples, err = dec.Samples(rec, samples) require.NoError(t, err) @@ -1170,24 +1170,25 @@ func TestWALReplayRaceOnSamplesLoggedBeforeSeries(t *testing.T) { // We test both with few and many samples appended after series creation. If samples are < 120 then there's no // mmap-ed chunk, otherwise there's at least 1 mmap-ed chunk when replaying the WAL. 
- for _, numSamplesAfterSeriesCreation := range []int{1, 1000} { - for run := 1; run <= numRuns; run++ { - t.Run(fmt.Sprintf("samples after series creation = %d, run = %d", numSamplesAfterSeriesCreation, run), func(t *testing.T) { - testWALReplayRaceOnSamplesLoggedBeforeSeries(t, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation) - }) + for _, enableStStorage := range []bool{false, true} { + for _, numSamplesAfterSeriesCreation := range []int{1, 1000} { + for run := 1; run <= numRuns; run++ { + t.Run(fmt.Sprintf("samples after series creation = %d, run = %d, stStorage=%v", numSamplesAfterSeriesCreation, run, enableStStorage), func(t *testing.T) { + testWALReplayRaceOnSamplesLoggedBeforeSeries(t, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation, enableStStorage) + }) + } } } } -func testWALReplayRaceOnSamplesLoggedBeforeSeries(t *testing.T, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation int) { +func testWALReplayRaceOnSamplesLoggedBeforeSeries(t *testing.T, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation int, enableStStorage bool) { const numSeries = 1000 - db := newTestDB(t) db.DisableCompactions() for seriesRef := 1; seriesRef <= numSeries; seriesRef++ { // Log samples before the series is logged to the WAL. 
- var enc record.Encoder + enc := record.Encoder{EnableSTStorage: enableStStorage} var samples []record.RefSample for ts := range numSamplesBeforeSeriesCreation { @@ -1551,139 +1552,143 @@ func TestRetentionDurationMetric(t *testing.T) { func TestSizeRetention(t *testing.T) { t.Parallel() - opts := DefaultOptions() - opts.OutOfOrderTimeWindow = 100 - db := newTestDB(t, withOpts(opts), withRngs(100)) + for _, enableStStorage := range []bool{false, true} { + t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + opts := DefaultOptions() + opts.OutOfOrderTimeWindow = 100 + db := newTestDB(t, withOpts(opts), withRngs(100)) - blocks := []*BlockMeta{ - {MinTime: 100, MaxTime: 200}, // Oldest block - {MinTime: 200, MaxTime: 300}, - {MinTime: 300, MaxTime: 400}, - {MinTime: 400, MaxTime: 500}, - {MinTime: 500, MaxTime: 600}, // Newest Block - } + blocks := []*BlockMeta{ + {MinTime: 100, MaxTime: 200}, // Oldest block + {MinTime: 200, MaxTime: 300}, + {MinTime: 300, MaxTime: 400}, + {MinTime: 400, MaxTime: 500}, + {MinTime: 500, MaxTime: 600}, // Newest Block + } - for _, m := range blocks { - createBlock(t, db.Dir(), genSeries(100, 10, m.MinTime, m.MaxTime)) - } + for _, m := range blocks { + createBlock(t, db.Dir(), genSeries(100, 10, m.MinTime, m.MaxTime)) + } - headBlocks := []*BlockMeta{ - {MinTime: 700, MaxTime: 800}, - } + headBlocks := []*BlockMeta{ + {MinTime: 700, MaxTime: 800}, + } - // Add some data to the WAL. - headApp := db.Head().Appender(context.Background()) - var aSeries labels.Labels - var it chunkenc.Iterator - for _, m := range headBlocks { - series := genSeries(100, 10, m.MinTime, m.MaxTime+1) - for _, s := range series { - aSeries = s.Labels() - it = s.Iterator(it) - for it.Next() == chunkenc.ValFloat { - tim, v := it.At() - _, err := headApp.Append(0, s.Labels(), tim, v) + // Add some data to the WAL. 
+ headApp := db.Head().Appender(context.Background()) + var aSeries labels.Labels + var it chunkenc.Iterator + for _, m := range headBlocks { + series := genSeries(100, 10, m.MinTime, m.MaxTime+1) + for _, s := range series { + aSeries = s.Labels() + it = s.Iterator(it) + for it.Next() == chunkenc.ValFloat { + tim, v := it.At() + _, err := headApp.Append(0, s.Labels(), tim, v) + require.NoError(t, err) + } + require.NoError(t, it.Err()) + } + } + require.NoError(t, headApp.Commit()) + db.Head().mmapHeadChunks() + + require.Eventually(t, func() bool { + return db.Head().chunkDiskMapper.IsQueueEmpty() + }, 2*time.Second, 100*time.Millisecond) + + // Test that registered size matches the actual disk size. + require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. + require.Len(t, db.Blocks(), len(blocks)) // Ensure all blocks are registered. + blockSize := int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. + walSize, err := db.Head().wal.Size() + require.NoError(t, err) + cdmSize, err := db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + // Expected size should take into account block size + WAL size + Head + // chunks size + expSize := blockSize + walSize + cdmSize + actSize, err := fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Create a WAL checkpoint, and compare sizes. + first, last, err := wlog.Segments(db.Head().wal.Dir()) + require.NoError(t, err) + _, err = wlog.Checkpoint(promslog.NewNopLogger(), db.Head().wal, first, last-1, func(chunks.HeadSeriesRef) bool { return false }, 0, enableStStorage) + require.NoError(t, err) + blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. 
+ walSize, err = db.Head().wal.Size() + require.NoError(t, err) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + expSize = blockSize + walSize + cdmSize + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Truncate Chunk Disk Mapper and compare sizes. + require.NoError(t, db.Head().chunkDiskMapper.Truncate(900)) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + expSize = blockSize + walSize + cdmSize + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Add some out of order samples to check the size of WBL. + headApp = db.Head().Appender(context.Background()) + for ts := int64(750); ts < 800; ts++ { + _, err := headApp.Append(0, aSeries, ts, float64(ts)) require.NoError(t, err) } - require.NoError(t, it.Err()) - } + require.NoError(t, headApp.Commit()) + + walSize, err = db.Head().wal.Size() + require.NoError(t, err) + wblSize, err := db.Head().wbl.Size() + require.NoError(t, err) + require.NotZero(t, wblSize) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + expSize = blockSize + walSize + wblSize + cdmSize + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + + // Decrease the max bytes limit so that a delete is triggered. + // Check total size, total count and check that the oldest block was deleted. + firstBlockSize := db.Blocks()[0].Size() + sizeLimit := actSize - firstBlockSize + db.opts.MaxBytes = sizeLimit // Set the new db size limit one block smaller that the actual size. + require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. 
+ + expBlocks := blocks[1:] + actBlocks := db.Blocks() + blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) + walSize, err = db.Head().wal.Size() + require.NoError(t, err) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + require.NotZero(t, cdmSize) + // Expected size should take into account block size + WAL size + WBL size + expSize = blockSize + walSize + wblSize + cdmSize + actRetentionCount := int(prom_testutil.ToFloat64(db.metrics.sizeRetentionCount)) + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + + require.Equal(t, 1, actRetentionCount, "metric retention count mismatch") + require.Equal(t, expSize, actSize, "metric db size doesn't match actual disk size") + require.LessOrEqual(t, expSize, sizeLimit, "actual size (%v) is expected to be less than or equal to limit (%v)", expSize, sizeLimit) + require.Len(t, actBlocks, len(blocks)-1, "new block count should be decreased from:%v to:%v", len(blocks), len(blocks)-1) + require.Equal(t, expBlocks[0].MaxTime, actBlocks[0].meta.MaxTime, "maxT mismatch of the first block") + require.Equal(t, expBlocks[len(expBlocks)-1].MaxTime, actBlocks[len(actBlocks)-1].meta.MaxTime, "maxT mismatch of the last block") + }) } - require.NoError(t, headApp.Commit()) - db.Head().mmapHeadChunks() - - require.Eventually(t, func() bool { - return db.Head().chunkDiskMapper.IsQueueEmpty() - }, 2*time.Second, 100*time.Millisecond) - - // Test that registered size matches the actual disk size. - require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. - require.Len(t, db.Blocks(), len(blocks)) // Ensure all blocks are registered. - blockSize := int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. 
- walSize, err := db.Head().wal.Size() - require.NoError(t, err) - cdmSize, err := db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - // Expected size should take into account block size + WAL size + Head - // chunks size - expSize := blockSize + walSize + cdmSize - actSize, err := fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Create a WAL checkpoint, and compare sizes. - first, last, err := wlog.Segments(db.Head().wal.Dir()) - require.NoError(t, err) - _, err = wlog.Checkpoint(promslog.NewNopLogger(), db.Head().wal, first, last-1, func(chunks.HeadSeriesRef) bool { return false }, 0) - require.NoError(t, err) - blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. - walSize, err = db.Head().wal.Size() - require.NoError(t, err) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - expSize = blockSize + walSize + cdmSize - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Truncate Chunk Disk Mapper and compare sizes. - require.NoError(t, db.Head().chunkDiskMapper.Truncate(900)) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - expSize = blockSize + walSize + cdmSize - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Add some out of order samples to check the size of WBL. 
- headApp = db.Head().Appender(context.Background()) - for ts := int64(750); ts < 800; ts++ { - _, err := headApp.Append(0, aSeries, ts, float64(ts)) - require.NoError(t, err) - } - require.NoError(t, headApp.Commit()) - - walSize, err = db.Head().wal.Size() - require.NoError(t, err) - wblSize, err := db.Head().wbl.Size() - require.NoError(t, err) - require.NotZero(t, wblSize) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - expSize = blockSize + walSize + wblSize + cdmSize - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") - - // Decrease the max bytes limit so that a delete is triggered. - // Check total size, total count and check that the oldest block was deleted. - firstBlockSize := db.Blocks()[0].Size() - sizeLimit := actSize - firstBlockSize - db.opts.MaxBytes = sizeLimit // Set the new db size limit one block smaller that the actual size. - require.NoError(t, db.reloadBlocks()) // Reload the db to register the new db size. 
- - expBlocks := blocks[1:] - actBlocks := db.Blocks() - blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) - walSize, err = db.Head().wal.Size() - require.NoError(t, err) - cdmSize, err = db.Head().chunkDiskMapper.Size() - require.NoError(t, err) - require.NotZero(t, cdmSize) - // Expected size should take into account block size + WAL size + WBL size - expSize = blockSize + walSize + wblSize + cdmSize - actRetentionCount := int(prom_testutil.ToFloat64(db.metrics.sizeRetentionCount)) - actSize, err = fileutil.DirSize(db.Dir()) - require.NoError(t, err) - - require.Equal(t, 1, actRetentionCount, "metric retention count mismatch") - require.Equal(t, expSize, actSize, "metric db size doesn't match actual disk size") - require.LessOrEqual(t, expSize, sizeLimit, "actual size (%v) is expected to be less than or equal to limit (%v)", expSize, sizeLimit) - require.Len(t, actBlocks, len(blocks)-1, "new block count should be decreased from:%v to:%v", len(blocks), len(blocks)-1) - require.Equal(t, expBlocks[0].MaxTime, actBlocks[0].meta.MaxTime, "maxT mismatch of the first block") - require.Equal(t, expBlocks[len(expBlocks)-1].MaxTime, actBlocks[len(actBlocks)-1].meta.MaxTime, "maxT mismatch of the last block") } func TestSizeRetentionMetric(t *testing.T) { @@ -2072,33 +2077,36 @@ func TestInitializeHeadTimestamp(t *testing.T) { require.Equal(t, int64(1000), db.head.MaxTime()) require.True(t, db.head.initialized()) }) - t.Run("wal-only", func(t *testing.T) { - dir := t.TempDir() - require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) - w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) - require.NoError(t, err) + for _, enableStStorage := range []bool{false, true} { + t.Run("wal-only-st-"+strconv.FormatBool(enableStStorage), func(t *testing.T) { + dir := t.TempDir() - var enc record.Encoder - err = w.Log( - enc.Series([]record.RefSeries{ - {Ref: 123, Labels: labels.FromStrings("a", "1")}, - {Ref: 124, Labels: labels.FromStrings("a", 
"2")}, - }, nil), - enc.Samples([]record.RefSample{ - {Ref: 123, T: 5000, V: 1}, - {Ref: 124, T: 15000, V: 1}, - }, nil), - ) - require.NoError(t, err) - require.NoError(t, w.Close()) + require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) + w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) + require.NoError(t, err) - db := newTestDB(t, withDir(dir)) + enc := record.Encoder{EnableSTStorage: enableStStorage} + err = w.Log( + enc.Series([]record.RefSeries{ + {Ref: 123, Labels: labels.FromStrings("a", "1")}, + {Ref: 124, Labels: labels.FromStrings("a", "2")}, + }, nil), + enc.Samples([]record.RefSample{ + {Ref: 123, T: 5000, V: 1}, + {Ref: 124, T: 15000, V: 1}, + }, nil), + ) + require.NoError(t, err) + require.NoError(t, w.Close()) - require.Equal(t, int64(5000), db.head.MinTime()) - require.Equal(t, int64(15000), db.head.MaxTime()) - require.True(t, db.head.initialized()) - }) + db := newTestDB(t, withDir(dir)) + + require.Equal(t, int64(5000), db.head.MinTime()) + require.Equal(t, int64(15000), db.head.MaxTime()) + require.True(t, db.head.initialized()) + }) + } t.Run("existing-block", func(t *testing.T) { dir := t.TempDir() @@ -2110,37 +2118,40 @@ func TestInitializeHeadTimestamp(t *testing.T) { require.Equal(t, int64(2000), db.head.MaxTime()) require.True(t, db.head.initialized()) }) - t.Run("existing-block-and-wal", func(t *testing.T) { - dir := t.TempDir() - createBlock(t, dir, genSeries(1, 1, 1000, 6000)) + for _, enableStStorage := range []bool{false, true} { + t.Run("existing-block-and-wal,enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + dir := t.TempDir() - require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) - w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) - require.NoError(t, err) + createBlock(t, dir, genSeries(1, 1, 1000, 6000)) - var enc record.Encoder - err = w.Log( - enc.Series([]record.RefSeries{ - {Ref: 123, Labels: labels.FromStrings("a", "1")}, - {Ref: 124, 
Labels: labels.FromStrings("a", "2")}, - }, nil), - enc.Samples([]record.RefSample{ - {Ref: 123, T: 5000, V: 1}, - {Ref: 124, T: 15000, V: 1}, - }, nil), - ) - require.NoError(t, err) - require.NoError(t, w.Close()) + require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) + w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) + require.NoError(t, err) - db := newTestDB(t, withDir(dir)) + enc := record.Encoder{EnableSTStorage: enableStStorage} + err = w.Log( + enc.Series([]record.RefSeries{ + {Ref: 123, Labels: labels.FromStrings("a", "1")}, + {Ref: 124, Labels: labels.FromStrings("a", "2")}, + }, nil), + enc.Samples([]record.RefSample{ + {Ref: 123, T: 5000, V: 1}, + {Ref: 124, T: 15000, V: 1}, + }, nil), + ) + require.NoError(t, err) + require.NoError(t, w.Close()) - require.Equal(t, int64(6000), db.head.MinTime()) - require.Equal(t, int64(15000), db.head.MaxTime()) - require.True(t, db.head.initialized()) - // Check that old series has been GCed. - require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.series)) - }) + db := newTestDB(t, withDir(dir)) + + require.Equal(t, int64(6000), db.head.MinTime()) + require.Equal(t, int64(15000), db.head.MaxTime()) + require.True(t, db.head.initialized()) + // Check that old series has been GCed. 
+ require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.series)) + }) + } } func TestNoEmptyBlocks(t *testing.T) { @@ -4531,7 +4542,7 @@ func testOOOWALWrite(t *testing.T, series, err := dec.Series(rec, nil) require.NoError(t, err) records = append(records, series) - case record.Samples: + case record.Samples, record.SamplesV2: samples, err := dec.Samples(rec, nil) require.NoError(t, err) records = append(records, samples) @@ -4692,102 +4703,106 @@ func TestMetadataCheckpointingOnlyKeepsLatestEntry(t *testing.T) { require.NoError(t, err) } - ctx := context.Background() - numSamples := 10000 - hb, w := newTestHead(t, int64(numSamples)*10, compression.None, false) + for _, enableStStorage := range []bool{false, true} { + t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + ctx := context.Background() + numSamples := 10000 + hb, w := newTestHead(t, int64(numSamples)*10, compression.None, false) - // Add some series so we can append metadata to them. - app := hb.Appender(ctx) - s1 := labels.FromStrings("a", "b") - s2 := labels.FromStrings("c", "d") - s3 := labels.FromStrings("e", "f") - s4 := labels.FromStrings("g", "h") + // Add some series so we can append metadata to them. + app := hb.Appender(ctx) + s1 := labels.FromStrings("a", "b") + s2 := labels.FromStrings("c", "d") + s3 := labels.FromStrings("e", "f") + s4 := labels.FromStrings("g", "h") - for _, s := range []labels.Labels{s1, s2, s3, s4} { - _, err := app.Append(0, s, 0, 0) - require.NoError(t, err) + for _, s := range []labels.Labels{s1, s2, s3, s4} { + _, err := app.Append(0, s, 0, 0) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + + // Add a first round of metadata to the first three series. + // Re-take the Appender, as the previous Commit will have it closed. 
+ m1 := metadata.Metadata{Type: "gauge", Unit: "unit_1", Help: "help_1"} + m2 := metadata.Metadata{Type: "gauge", Unit: "unit_2", Help: "help_2"} + m3 := metadata.Metadata{Type: "gauge", Unit: "unit_3", Help: "help_3"} + m4 := metadata.Metadata{Type: "gauge", Unit: "unit_4", Help: "help_4"} + app = hb.Appender(ctx) + updateMetadata(t, app, s1, m1) + updateMetadata(t, app, s2, m2) + updateMetadata(t, app, s3, m3) + updateMetadata(t, app, s4, m4) + require.NoError(t, app.Commit()) + + // Update metadata for first series. + m5 := metadata.Metadata{Type: "counter", Unit: "unit_5", Help: "help_5"} + app = hb.Appender(ctx) + updateMetadata(t, app, s1, m5) + require.NoError(t, app.Commit()) + + // Switch back-and-forth metadata for second series. + // Since it ended on a new metadata record, we expect a single new entry. + m6 := metadata.Metadata{Type: "counter", Unit: "unit_6", Help: "help_6"} + + app = hb.Appender(ctx) + updateMetadata(t, app, s2, m6) + require.NoError(t, app.Commit()) + + app = hb.Appender(ctx) + updateMetadata(t, app, s2, m2) + require.NoError(t, app.Commit()) + + app = hb.Appender(ctx) + updateMetadata(t, app, s2, m6) + require.NoError(t, app.Commit()) + + app = hb.Appender(ctx) + updateMetadata(t, app, s2, m2) + require.NoError(t, app.Commit()) + + app = hb.Appender(ctx) + updateMetadata(t, app, s2, m6) + require.NoError(t, app.Commit()) + + // Let's create a checkpoint. + first, last, err := wlog.Segments(w.Dir()) + require.NoError(t, err) + keep := func(id chunks.HeadSeriesRef) bool { + return id != 3 + } + _, err = wlog.Checkpoint(promslog.NewNopLogger(), w, first, last-1, keep, 0, enableStStorage) + require.NoError(t, err) + + // Confirm there's been a checkpoint. + cdir, _, err := wlog.LastCheckpoint(w.Dir()) + require.NoError(t, err) + + // Read in checkpoint and WAL. 
+ recs := readTestWAL(t, cdir) + var gotMetadataBlocks [][]record.RefMetadata + for _, rec := range recs { + if mr, ok := rec.([]record.RefMetadata); ok { + gotMetadataBlocks = append(gotMetadataBlocks, mr) + } + } + + // There should only be 1 metadata block present, with only the latest + // metadata kept around. + wantMetadata := []record.RefMetadata{ + {Ref: 1, Type: record.GetMetricType(m5.Type), Unit: m5.Unit, Help: m5.Help}, + {Ref: 2, Type: record.GetMetricType(m6.Type), Unit: m6.Unit, Help: m6.Help}, + {Ref: 4, Type: record.GetMetricType(m4.Type), Unit: m4.Unit, Help: m4.Help}, + } + require.Len(t, gotMetadataBlocks, 1) + require.Len(t, gotMetadataBlocks[0], 3) + gotMetadataBlock := gotMetadataBlocks[0] + + sort.Slice(gotMetadataBlock, func(i, j int) bool { return gotMetadataBlock[i].Ref < gotMetadataBlock[j].Ref }) + require.Equal(t, wantMetadata, gotMetadataBlock) + require.NoError(t, hb.Close()) + }) } - require.NoError(t, app.Commit()) - - // Add a first round of metadata to the first three series. - // Re-take the Appender, as the previous Commit will have it closed. - m1 := metadata.Metadata{Type: "gauge", Unit: "unit_1", Help: "help_1"} - m2 := metadata.Metadata{Type: "gauge", Unit: "unit_2", Help: "help_2"} - m3 := metadata.Metadata{Type: "gauge", Unit: "unit_3", Help: "help_3"} - m4 := metadata.Metadata{Type: "gauge", Unit: "unit_4", Help: "help_4"} - app = hb.Appender(ctx) - updateMetadata(t, app, s1, m1) - updateMetadata(t, app, s2, m2) - updateMetadata(t, app, s3, m3) - updateMetadata(t, app, s4, m4) - require.NoError(t, app.Commit()) - - // Update metadata for first series. - m5 := metadata.Metadata{Type: "counter", Unit: "unit_5", Help: "help_5"} - app = hb.Appender(ctx) - updateMetadata(t, app, s1, m5) - require.NoError(t, app.Commit()) - - // Switch back-and-forth metadata for second series. - // Since it ended on a new metadata record, we expect a single new entry. 
- m6 := metadata.Metadata{Type: "counter", Unit: "unit_6", Help: "help_6"} - - app = hb.Appender(ctx) - updateMetadata(t, app, s2, m6) - require.NoError(t, app.Commit()) - - app = hb.Appender(ctx) - updateMetadata(t, app, s2, m2) - require.NoError(t, app.Commit()) - - app = hb.Appender(ctx) - updateMetadata(t, app, s2, m6) - require.NoError(t, app.Commit()) - - app = hb.Appender(ctx) - updateMetadata(t, app, s2, m2) - require.NoError(t, app.Commit()) - - app = hb.Appender(ctx) - updateMetadata(t, app, s2, m6) - require.NoError(t, app.Commit()) - - // Let's create a checkpoint. - first, last, err := wlog.Segments(w.Dir()) - require.NoError(t, err) - keep := func(id chunks.HeadSeriesRef) bool { - return id != 3 - } - _, err = wlog.Checkpoint(promslog.NewNopLogger(), w, first, last-1, keep, 0) - require.NoError(t, err) - - // Confirm there's been a checkpoint. - cdir, _, err := wlog.LastCheckpoint(w.Dir()) - require.NoError(t, err) - - // Read in checkpoint and WAL. - recs := readTestWAL(t, cdir) - var gotMetadataBlocks [][]record.RefMetadata - for _, rec := range recs { - if mr, ok := rec.([]record.RefMetadata); ok { - gotMetadataBlocks = append(gotMetadataBlocks, mr) - } - } - - // There should only be 1 metadata block present, with only the latest - // metadata kept around. 
- wantMetadata := []record.RefMetadata{ - {Ref: 1, Type: record.GetMetricType(m5.Type), Unit: m5.Unit, Help: m5.Help}, - {Ref: 2, Type: record.GetMetricType(m6.Type), Unit: m6.Unit, Help: m6.Help}, - {Ref: 4, Type: record.GetMetricType(m4.Type), Unit: m4.Unit, Help: m4.Help}, - } - require.Len(t, gotMetadataBlocks, 1) - require.Len(t, gotMetadataBlocks[0], 3) - gotMetadataBlock := gotMetadataBlocks[0] - - sort.Slice(gotMetadataBlock, func(i, j int) bool { return gotMetadataBlock[i].Ref < gotMetadataBlock[j].Ref }) - require.Equal(t, wantMetadata, gotMetadataBlock) - require.NoError(t, hb.Close()) } func TestMetadataAssertInMemoryData(t *testing.T) { diff --git a/tsdb/head.go b/tsdb/head.go index 6fe42c8cf2..19c2538b12 100644 --- a/tsdb/head.go +++ b/tsdb/head.go @@ -200,6 +200,11 @@ type HeadOptions struct { // NOTE(bwplotka): This feature might be deprecated and removed once PROM-60 // is implemented. EnableMetadataWALRecords bool + + // EnableSTStorage determines whether agent DB should write a Start Timestamp (ST) + // per sample to WAL. + // TODO(bwplotka): Implement this option as per PROM-60, currently it's noop. 
+ EnableSTStorage bool } const ( @@ -1381,7 +1386,7 @@ func (h *Head) truncateWAL(mint int64) error { } h.metrics.checkpointCreationTotal.Inc() - if _, err = wlog.Checkpoint(h.logger, h.wal, first, last, h.keepSeriesInWALCheckpointFn(mint), mint); err != nil { + if _, err = wlog.Checkpoint(h.logger, h.wal, first, last, h.keepSeriesInWALCheckpointFn(mint), mint, h.opts.EnableSTStorage); err != nil { h.metrics.checkpointCreationFail.Inc() var cerr *chunks.CorruptionErr if errors.As(err, &cerr) { @@ -1675,7 +1680,7 @@ func (h *Head) Delete(ctx context.Context, mint, maxt int64, ms ...*labels.Match } if h.wal != nil { - var enc record.Encoder + enc := record.Encoder{EnableSTStorage: h.opts.EnableSTStorage} if err := h.wal.Log(enc.Tombstones(stones, nil)); err != nil { return err } diff --git a/tsdb/head_append.go b/tsdb/head_append.go index e6c9f2828a..846ad476e3 100644 --- a/tsdb/head_append.go +++ b/tsdb/head_append.go @@ -1742,6 +1742,9 @@ func (a *headAppenderBase) Commit() (err error) { chunkRange: h.chunkRange.Load(), samplesPerChunk: h.opts.SamplesPerChunk, }, + enc: record.Encoder{ + EnableSTStorage: false, + }, } for _, b := range a.batches { diff --git a/tsdb/head_append_v2_test.go b/tsdb/head_append_v2_test.go index 082d756e60..ba756f801f 100644 --- a/tsdb/head_append_v2_test.go +++ b/tsdb/head_append_v2_test.go @@ -1867,296 +1867,300 @@ func TestHistogramInWALAndMmapChunk_AppenderV2(t *testing.T) { } func TestChunkSnapshot_AppenderV2(t *testing.T) { - head, _ := newTestHead(t, 120*4, compression.None, false) - defer func() { - head.opts.EnableMemorySnapshotOnShutdown = false - require.NoError(t, head.Close()) - }() + for _, enableStStorage := range []bool{false, true} { + t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + head, _ := newTestHead(t, 120*4, compression.None, false) + defer func() { + head.opts.EnableMemorySnapshotOnShutdown = false + require.NoError(t, head.Close()) + }() - type ex struct { - seriesLabels 
labels.Labels - e exemplar.Exemplar - } - - numSeries := 10 - expSeries := make(map[string][]chunks.Sample) - expHist := make(map[string][]chunks.Sample) - expFloatHist := make(map[string][]chunks.Sample) - expTombstones := make(map[storage.SeriesRef]tombstones.Intervals) - expExemplars := make([]ex, 0) - histograms := tsdbutil.GenerateTestGaugeHistograms(481) - floatHistogram := tsdbutil.GenerateTestGaugeFloatHistograms(481) - - newExemplar := func(lbls labels.Labels, ts int64) exemplar.Exemplar { - e := ex{ - seriesLabels: lbls, - e: exemplar.Exemplar{ - Labels: labels.FromStrings("trace_id", strconv.Itoa(rand.Int())), - Value: rand.Float64(), - Ts: ts, - }, - } - expExemplars = append(expExemplars, e) - return e.e - } - - checkSamples := func() { - q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) - require.Equal(t, expSeries, series) - } - checkHistograms := func() { - q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "hist", "baz.*")) - require.Equal(t, expHist, series) - } - checkFloatHistograms := func() { - q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "floathist", "bat.*")) - require.Equal(t, expFloatHist, series) - } - checkTombstones := func() { - tr, err := head.Tombstones() - require.NoError(t, err) - actTombstones := make(map[storage.SeriesRef]tombstones.Intervals) - require.NoError(t, tr.Iter(func(ref storage.SeriesRef, itvs tombstones.Intervals) error { - for _, itv := range itvs { - actTombstones[ref].Add(itv) + type ex struct { + seriesLabels labels.Labels + e exemplar.Exemplar + } + + numSeries := 10 + expSeries := make(map[string][]chunks.Sample) + expHist := make(map[string][]chunks.Sample) + expFloatHist := 
make(map[string][]chunks.Sample) + expTombstones := make(map[storage.SeriesRef]tombstones.Intervals) + expExemplars := make([]ex, 0) + histograms := tsdbutil.GenerateTestGaugeHistograms(481) + floatHistogram := tsdbutil.GenerateTestGaugeFloatHistograms(481) + + newExemplar := func(lbls labels.Labels, ts int64) exemplar.Exemplar { + e := ex{ + seriesLabels: lbls, + e: exemplar.Exemplar{ + Labels: labels.FromStrings("trace_id", strconv.Itoa(rand.Int())), + Value: rand.Float64(), + Ts: ts, + }, + } + expExemplars = append(expExemplars, e) + return e.e + } + + checkSamples := func() { + q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) + require.Equal(t, expSeries, series) + } + checkHistograms := func() { + q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "hist", "baz.*")) + require.Equal(t, expHist, series) + } + checkFloatHistograms := func() { + q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "floathist", "bat.*")) + require.Equal(t, expFloatHist, series) + } + checkTombstones := func() { + tr, err := head.Tombstones() + require.NoError(t, err) + actTombstones := make(map[storage.SeriesRef]tombstones.Intervals) + require.NoError(t, tr.Iter(func(ref storage.SeriesRef, itvs tombstones.Intervals) error { + for _, itv := range itvs { + actTombstones[ref].Add(itv) + } + return nil + })) + require.Equal(t, expTombstones, actTombstones) + } + checkExemplars := func() { + actExemplars := make([]ex, 0, len(expExemplars)) + err := head.exemplars.IterateExemplars(func(seriesLabels labels.Labels, e exemplar.Exemplar) error { + actExemplars = append(actExemplars, ex{ + seriesLabels: seriesLabels, + e: e, + }) + return nil + }) + require.NoError(t, err) 
+ // Verifies both existence of right exemplars and order of exemplars in the buffer. + testutil.RequireEqualWithOptions(t, expExemplars, actExemplars, []cmp.Option{cmp.AllowUnexported(ex{})}) + } + + var ( + wlast, woffset int + err error + ) + + closeHeadAndCheckSnapshot := func() { + require.NoError(t, head.Close()) + + _, sidx, soffset, err := LastChunkSnapshot(head.opts.ChunkDirRoot) + require.NoError(t, err) + require.Equal(t, wlast, sidx) + require.Equal(t, woffset, soffset) + } + + openHeadAndCheckReplay := func() { + w, err := wlog.NewSize(nil, nil, head.wal.Dir(), 32768, compression.None) + require.NoError(t, err) + head, err = NewHead(nil, nil, w, nil, head.opts, nil) + require.NoError(t, err) + require.NoError(t, head.Init(math.MinInt64)) + + checkSamples() + checkHistograms() + checkFloatHistograms() + checkTombstones() + checkExemplars() + } + + { // Initial data that goes into snapshot. + // Add some initial samples with >=1 m-map chunk. + app := head.AppenderV2(context.Background()) + for i := 1; i <= numSeries; i++ { + lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) + lblStr := lbls.String() + lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) + lblsHistStr := lblsHist.String() + lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) + lblsFloatHistStr := lblsFloatHist.String() + + // 240 samples should m-map at least 1 chunk. + for ts := int64(1); ts <= 240; ts++ { + // Add an exemplar, but only to float sample. 
+ aOpts := storage.AOptions{} + if ts%10 == 0 { + aOpts.Exemplars = []exemplar.Exemplar{newExemplar(lbls, ts)} + } + val := rand.Float64() + expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) + _, err := app.Append(0, lbls, 0, ts, val, nil, nil, aOpts) + require.NoError(t, err) + + hist := histograms[int(ts)] + expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) + _, err = app.Append(0, lblsHist, 0, ts, 0, hist, nil, storage.AOptions{}) + require.NoError(t, err) + + floatHist := floatHistogram[int(ts)] + expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) + _, err = app.Append(0, lblsFloatHist, 0, ts, 0, nil, floatHist, storage.AOptions{}) + require.NoError(t, err) + + // Create multiple WAL records (commit). + if ts%10 == 0 { + require.NoError(t, app.Commit()) + app = head.AppenderV2(context.Background()) + } + } + } + require.NoError(t, app.Commit()) + + // Add some tombstones. + enc := record.Encoder{EnableSTStorage: enableStStorage} + for i := 1; i <= numSeries; i++ { + ref := storage.SeriesRef(i) + itvs := tombstones.Intervals{ + {Mint: 1234, Maxt: 2345}, + {Mint: 3456, Maxt: 4567}, + } + for _, itv := range itvs { + expTombstones[ref].Add(itv) + } + head.tombstones.AddInterval(ref, itvs...) + err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ + {Ref: ref, Intervals: itvs}, + }, nil)) + require.NoError(t, err) + } + } + + // These references should be the ones used for the snapshot. + wlast, woffset, err = head.wal.LastSegmentAndOffset() + require.NoError(t, err) + if woffset != 0 && woffset < 32*1024 { + // The page is always filled before taking the snapshot. + woffset = 32 * 1024 + } + + { + // Creating snapshot and verifying it. + head.opts.EnableMemorySnapshotOnShutdown = true + closeHeadAndCheckSnapshot() // This will create a snapshot. + + // Test the replay of snapshot. 
+ openHeadAndCheckReplay() + } + + { // Additional data to only include in WAL and m-mapped chunks and not snapshot. This mimics having an old snapshot on disk. + // Add more samples. + app := head.AppenderV2(context.Background()) + for i := 1; i <= numSeries; i++ { + lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) + lblStr := lbls.String() + lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) + lblsHistStr := lblsHist.String() + lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) + lblsFloatHistStr := lblsFloatHist.String() + + // 240 samples should m-map at least 1 chunk. + for ts := int64(241); ts <= 480; ts++ { + // Add an exemplar, but only to float sample. + aOpts := storage.AOptions{} + if ts%10 == 0 { + aOpts.Exemplars = []exemplar.Exemplar{newExemplar(lbls, ts)} + } + val := rand.Float64() + expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) + _, err := app.Append(0, lbls, 0, ts, val, nil, nil, aOpts) + require.NoError(t, err) + + hist := histograms[int(ts)] + expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) + _, err = app.Append(0, lblsHist, 0, ts, 0, hist, nil, storage.AOptions{}) + require.NoError(t, err) + + floatHist := floatHistogram[int(ts)] + expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) + _, err = app.Append(0, lblsFloatHist, 0, ts, 0, nil, floatHist, storage.AOptions{}) + require.NoError(t, err) + + // Create multiple WAL records (commit). + if ts%10 == 0 { + require.NoError(t, app.Commit()) + app = head.AppenderV2(context.Background()) + } + } + } + require.NoError(t, app.Commit()) + + // Add more tombstones. 
+ enc := record.Encoder{EnableSTStorage: enableStStorage} + for i := 1; i <= numSeries; i++ { + ref := storage.SeriesRef(i) + itvs := tombstones.Intervals{ + {Mint: 12345, Maxt: 23456}, + {Mint: 34567, Maxt: 45678}, + } + for _, itv := range itvs { + expTombstones[ref].Add(itv) + } + head.tombstones.AddInterval(ref, itvs...) + err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ + {Ref: ref, Intervals: itvs}, + }, nil)) + require.NoError(t, err) + } + } + { + // Close Head and verify that new snapshot was not created. + head.opts.EnableMemorySnapshotOnShutdown = false + closeHeadAndCheckSnapshot() // This should not create a snapshot. + + // Test the replay of snapshot, m-map chunks, and WAL. + head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. + openHeadAndCheckReplay() + } + + // Creating another snapshot should delete the older snapshot and replay still works fine. + wlast, woffset, err = head.wal.LastSegmentAndOffset() + require.NoError(t, err) + if woffset != 0 && woffset < 32*1024 { + // The page is always filled before taking the snapshot. + woffset = 32 * 1024 + } + + { + // Close Head and verify that new snapshot was created. + closeHeadAndCheckSnapshot() + + // Verify that there is only 1 snapshot. + files, err := os.ReadDir(head.opts.ChunkDirRoot) + require.NoError(t, err) + snapshots := 0 + for i := len(files) - 1; i >= 0; i-- { + fi := files[i] + if strings.HasPrefix(fi.Name(), chunkSnapshotPrefix) { + snapshots++ + require.Equal(t, chunkSnapshotDir(wlast, woffset), fi.Name()) + } + } + require.Equal(t, 1, snapshots) + + // Test the replay of snapshot. + head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. + + // Disabling exemplars to check that it does not hard fail replay + // https://github.com/prometheus/prometheus/issues/9437#issuecomment-933285870. 
+ head.opts.EnableExemplarStorage = false + head.opts.MaxExemplars.Store(0) + expExemplars = expExemplars[:0] + + openHeadAndCheckReplay() + + require.Equal(t, 0.0, prom_testutil.ToFloat64(head.metrics.snapshotReplayErrorTotal)) } - return nil - })) - require.Equal(t, expTombstones, actTombstones) - } - checkExemplars := func() { - actExemplars := make([]ex, 0, len(expExemplars)) - err := head.exemplars.IterateExemplars(func(seriesLabels labels.Labels, e exemplar.Exemplar) error { - actExemplars = append(actExemplars, ex{ - seriesLabels: seriesLabels, - e: e, - }) - return nil }) - require.NoError(t, err) - // Verifies both existence of right exemplars and order of exemplars in the buffer. - testutil.RequireEqualWithOptions(t, expExemplars, actExemplars, []cmp.Option{cmp.AllowUnexported(ex{})}) - } - - var ( - wlast, woffset int - err error - ) - - closeHeadAndCheckSnapshot := func() { - require.NoError(t, head.Close()) - - _, sidx, soffset, err := LastChunkSnapshot(head.opts.ChunkDirRoot) - require.NoError(t, err) - require.Equal(t, wlast, sidx) - require.Equal(t, woffset, soffset) - } - - openHeadAndCheckReplay := func() { - w, err := wlog.NewSize(nil, nil, head.wal.Dir(), 32768, compression.None) - require.NoError(t, err) - head, err = NewHead(nil, nil, w, nil, head.opts, nil) - require.NoError(t, err) - require.NoError(t, head.Init(math.MinInt64)) - - checkSamples() - checkHistograms() - checkFloatHistograms() - checkTombstones() - checkExemplars() - } - - { // Initial data that goes into snapshot. - // Add some initial samples with >=1 m-map chunk. 
- app := head.AppenderV2(context.Background()) - for i := 1; i <= numSeries; i++ { - lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) - lblStr := lbls.String() - lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) - lblsHistStr := lblsHist.String() - lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) - lblsFloatHistStr := lblsFloatHist.String() - - // 240 samples should m-map at least 1 chunk. - for ts := int64(1); ts <= 240; ts++ { - // Add an exemplar, but only to float sample. - aOpts := storage.AOptions{} - if ts%10 == 0 { - aOpts.Exemplars = []exemplar.Exemplar{newExemplar(lbls, ts)} - } - val := rand.Float64() - expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) - _, err := app.Append(0, lbls, 0, ts, val, nil, nil, aOpts) - require.NoError(t, err) - - hist := histograms[int(ts)] - expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) - _, err = app.Append(0, lblsHist, 0, ts, 0, hist, nil, storage.AOptions{}) - require.NoError(t, err) - - floatHist := floatHistogram[int(ts)] - expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) - _, err = app.Append(0, lblsFloatHist, 0, ts, 0, nil, floatHist, storage.AOptions{}) - require.NoError(t, err) - - // Create multiple WAL records (commit). - if ts%10 == 0 { - require.NoError(t, app.Commit()) - app = head.AppenderV2(context.Background()) - } - } - } - require.NoError(t, app.Commit()) - - // Add some tombstones. - var enc record.Encoder - for i := 1; i <= numSeries; i++ { - ref := storage.SeriesRef(i) - itvs := tombstones.Intervals{ - {Mint: 1234, Maxt: 2345}, - {Mint: 3456, Maxt: 4567}, - } - for _, itv := range itvs { - expTombstones[ref].Add(itv) - } - head.tombstones.AddInterval(ref, itvs...) 
- err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ - {Ref: ref, Intervals: itvs}, - }, nil)) - require.NoError(t, err) - } - } - - // These references should be the ones used for the snapshot. - wlast, woffset, err = head.wal.LastSegmentAndOffset() - require.NoError(t, err) - if woffset != 0 && woffset < 32*1024 { - // The page is always filled before taking the snapshot. - woffset = 32 * 1024 - } - - { - // Creating snapshot and verifying it. - head.opts.EnableMemorySnapshotOnShutdown = true - closeHeadAndCheckSnapshot() // This will create a snapshot. - - // Test the replay of snapshot. - openHeadAndCheckReplay() - } - - { // Additional data to only include in WAL and m-mapped chunks and not snapshot. This mimics having an old snapshot on disk. - // Add more samples. - app := head.AppenderV2(context.Background()) - for i := 1; i <= numSeries; i++ { - lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) - lblStr := lbls.String() - lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) - lblsHistStr := lblsHist.String() - lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) - lblsFloatHistStr := lblsFloatHist.String() - - // 240 samples should m-map at least 1 chunk. - for ts := int64(241); ts <= 480; ts++ { - // Add an exemplar, but only to float sample. 
- aOpts := storage.AOptions{} - if ts%10 == 0 { - aOpts.Exemplars = []exemplar.Exemplar{newExemplar(lbls, ts)} - } - val := rand.Float64() - expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) - _, err := app.Append(0, lbls, 0, ts, val, nil, nil, aOpts) - require.NoError(t, err) - - hist := histograms[int(ts)] - expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) - _, err = app.Append(0, lblsHist, 0, ts, 0, hist, nil, storage.AOptions{}) - require.NoError(t, err) - - floatHist := floatHistogram[int(ts)] - expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) - _, err = app.Append(0, lblsFloatHist, 0, ts, 0, nil, floatHist, storage.AOptions{}) - require.NoError(t, err) - - // Create multiple WAL records (commit). - if ts%10 == 0 { - require.NoError(t, app.Commit()) - app = head.AppenderV2(context.Background()) - } - } - } - require.NoError(t, app.Commit()) - - // Add more tombstones. - var enc record.Encoder - for i := 1; i <= numSeries; i++ { - ref := storage.SeriesRef(i) - itvs := tombstones.Intervals{ - {Mint: 12345, Maxt: 23456}, - {Mint: 34567, Maxt: 45678}, - } - for _, itv := range itvs { - expTombstones[ref].Add(itv) - } - head.tombstones.AddInterval(ref, itvs...) - err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ - {Ref: ref, Intervals: itvs}, - }, nil)) - require.NoError(t, err) - } - } - { - // Close Head and verify that new snapshot was not created. - head.opts.EnableMemorySnapshotOnShutdown = false - closeHeadAndCheckSnapshot() // This should not create a snapshot. - - // Test the replay of snapshot, m-map chunks, and WAL. - head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. - openHeadAndCheckReplay() - } - - // Creating another snapshot should delete the older snapshot and replay still works fine. 
- wlast, woffset, err = head.wal.LastSegmentAndOffset() - require.NoError(t, err) - if woffset != 0 && woffset < 32*1024 { - // The page is always filled before taking the snapshot. - woffset = 32 * 1024 - } - - { - // Close Head and verify that new snapshot was created. - closeHeadAndCheckSnapshot() - - // Verify that there is only 1 snapshot. - files, err := os.ReadDir(head.opts.ChunkDirRoot) - require.NoError(t, err) - snapshots := 0 - for i := len(files) - 1; i >= 0; i-- { - fi := files[i] - if strings.HasPrefix(fi.Name(), chunkSnapshotPrefix) { - snapshots++ - require.Equal(t, chunkSnapshotDir(wlast, woffset), fi.Name()) - } - } - require.Equal(t, 1, snapshots) - - // Test the replay of snapshot. - head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. - - // Disabling exemplars to check that it does not hard fail replay - // https://github.com/prometheus/prometheus/issues/9437#issuecomment-933285870. - head.opts.EnableExemplarStorage = false - head.opts.MaxExemplars.Store(0) - expExemplars = expExemplars[:0] - - openHeadAndCheckReplay() - - require.Equal(t, 0.0, prom_testutil.ToFloat64(head.metrics.snapshotReplayErrorTotal)) } } diff --git a/tsdb/head_test.go b/tsdb/head_test.go index 7b8ae0ecbd..81cb236801 100644 --- a/tsdb/head_test.go +++ b/tsdb/head_test.go @@ -112,8 +112,8 @@ func BenchmarkCreateSeries(b *testing.B) { } } -func populateTestWL(t testing.TB, w *wlog.WL, recs []any, buf []byte) []byte { - var enc record.Encoder +func populateTestWL(t testing.TB, w *wlog.WL, recs []any, buf []byte, enableSTStorage bool) []byte { + enc := record.Encoder{EnableSTStorage: enableSTStorage} for _, r := range recs { buf = buf[:0] switch v := r.(type) { @@ -159,7 +159,7 @@ func readTestWAL(t testing.TB, dir string) (recs []any) { series, err := dec.Series(rec, nil) require.NoError(t, err) recs = append(recs, series) - case record.Samples: + case record.Samples, record.SamplesV2: samples, err := dec.Samples(rec, nil) require.NoError(t, 
err) recs = append(recs, samples) @@ -256,177 +256,179 @@ func BenchmarkLoadWLs(b *testing.B) { // Rough estimates of most common % of samples that have an exemplar for each scrape. exemplarsPercentages := []float64{0, 0.5, 1, 5} lastExemplarsPerSeries := -1 - for _, c := range cases { - missingSeriesPercentages := []float64{0, 0.1} - for _, missingSeriesPct := range missingSeriesPercentages { - for _, p := range exemplarsPercentages { - exemplarsPerSeries := int(math.RoundToEven(float64(c.samplesPerSeries) * p / 100)) - // For tests with low samplesPerSeries we could end up testing with 0 exemplarsPerSeries - // multiple times without this check. - if exemplarsPerSeries == lastExemplarsPerSeries { - continue - } - lastExemplarsPerSeries = exemplarsPerSeries - b.Run(fmt.Sprintf("batches=%d,seriesPerBatch=%d,samplesPerSeries=%d,exemplarsPerSeries=%d,mmappedChunkT=%d,oooSeriesPct=%.3f,oooSamplesPct=%.3f,oooCapMax=%d,missingSeriesPct=%.3f", c.batches, c.seriesPerBatch, c.samplesPerSeries, exemplarsPerSeries, c.mmappedChunkT, c.oooSeriesPct, c.oooSamplesPct, c.oooCapMax, missingSeriesPct), - func(b *testing.B) { - dir := b.TempDir() + for _, enableStStorage := range []bool{false, true} { + for _, c := range cases { + missingSeriesPercentages := []float64{0, 0.1} + for _, missingSeriesPct := range missingSeriesPercentages { + for _, p := range exemplarsPercentages { + exemplarsPerSeries := int(math.RoundToEven(float64(c.samplesPerSeries) * p / 100)) + // For tests with low samplesPerSeries we could end up testing with 0 exemplarsPerSeries + // multiple times without this check. 
+ if exemplarsPerSeries == lastExemplarsPerSeries { + continue + } + lastExemplarsPerSeries = exemplarsPerSeries + b.Run(fmt.Sprintf("batches=%d,seriesPerBatch=%d,samplesPerSeries=%d,exemplarsPerSeries=%d,mmappedChunkT=%d,oooSeriesPct=%.3f,oooSamplesPct=%.3f,oooCapMax=%d,missingSeriesPct=%.3f,stStorage=%v", c.batches, c.seriesPerBatch, c.samplesPerSeries, exemplarsPerSeries, c.mmappedChunkT, c.oooSeriesPct, c.oooSamplesPct, c.oooCapMax, missingSeriesPct, enableStStorage), + func(b *testing.B) { + dir := b.TempDir() - wal, err := wlog.New(nil, nil, dir, compression.None) - require.NoError(b, err) - var wbl *wlog.WL - if c.oooSeriesPct != 0 { - wbl, err = wlog.New(nil, nil, dir, compression.None) + wal, err := wlog.New(nil, nil, dir, compression.None) require.NoError(b, err) - } - - // Write series. - refSeries := make([]record.RefSeries, 0, c.seriesPerBatch) - var buf []byte - builder := labels.NewBuilder(labels.EmptyLabels()) - for j := 1; j < labelsPerSeries; j++ { - builder.Set(defaultLabelName+strconv.Itoa(j), defaultLabelValue+strconv.Itoa(j)) - } - for k := 0; k < c.batches; k++ { - refSeries = refSeries[:0] - for i := k * c.seriesPerBatch; i < (k+1)*c.seriesPerBatch; i++ { - builder.Set(defaultLabelName, strconv.Itoa(i)) - refSeries = append(refSeries, record.RefSeries{Ref: chunks.HeadSeriesRef(i) * 101, Labels: builder.Labels()}) + var wbl *wlog.WL + if c.oooSeriesPct != 0 { + wbl, err = wlog.New(nil, nil, dir, compression.None) + require.NoError(b, err) } - writeSeries := refSeries - if missingSeriesPct > 0 { - newWriteSeries := make([]record.RefSeries, 0, int(float64(len(refSeries))*(1.0-missingSeriesPct))) - keepRatio := 1.0 - missingSeriesPct - // Keep approximately every 1/keepRatio series. - for i, s := range refSeries { - if int(float64(i)*keepRatio) != int(float64(i+1)*keepRatio) { - newWriteSeries = append(newWriteSeries, s) + // Write series. 
+ refSeries := make([]record.RefSeries, 0, c.seriesPerBatch) + var buf []byte + builder := labels.NewBuilder(labels.EmptyLabels()) + for j := 1; j < labelsPerSeries; j++ { + builder.Set(defaultLabelName+strconv.Itoa(j), defaultLabelValue+strconv.Itoa(j)) + } + for k := 0; k < c.batches; k++ { + refSeries = refSeries[:0] + for i := k * c.seriesPerBatch; i < (k+1)*c.seriesPerBatch; i++ { + builder.Set(defaultLabelName, strconv.Itoa(i)) + refSeries = append(refSeries, record.RefSeries{Ref: chunks.HeadSeriesRef(i) * 101, Labels: builder.Labels()}) + } + + writeSeries := refSeries + if missingSeriesPct > 0 { + newWriteSeries := make([]record.RefSeries, 0, int(float64(len(refSeries))*(1.0-missingSeriesPct))) + keepRatio := 1.0 - missingSeriesPct + // Keep approximately every 1/keepRatio series. + for i, s := range refSeries { + if int(float64(i)*keepRatio) != int(float64(i+1)*keepRatio) { + newWriteSeries = append(newWriteSeries, s) + } } + writeSeries = newWriteSeries } - writeSeries = newWriteSeries + + buf = populateTestWL(b, wal, []any{writeSeries}, buf, enableStStorage) } - buf = populateTestWL(b, wal, []any{writeSeries}, buf) - } + // Write samples. + refSamples := make([]record.RefSample, 0, c.seriesPerBatch) - // Write samples. - refSamples := make([]record.RefSample, 0, c.seriesPerBatch) + oooSeriesPerBatch := int(float64(c.seriesPerBatch) * c.oooSeriesPct) + oooSamplesPerSeries := int(float64(c.samplesPerSeries) * c.oooSamplesPct) - oooSeriesPerBatch := int(float64(c.seriesPerBatch) * c.oooSeriesPct) - oooSamplesPerSeries := int(float64(c.samplesPerSeries) * c.oooSamplesPct) + for i := 0; i < c.samplesPerSeries; i++ { + for j := 0; j < c.batches; j++ { + refSamples = refSamples[:0] - for i := 0; i < c.samplesPerSeries; i++ { - for j := 0; j < c.batches; j++ { - refSamples = refSamples[:0] - - k := j * c.seriesPerBatch - // Skip appending the first oooSamplesPerSeries samples for the series in the batch that - // should have OOO samples. 
OOO samples are appended after all the in-order samples. - if i < oooSamplesPerSeries { - k += oooSeriesPerBatch + k := j * c.seriesPerBatch + // Skip appending the first oooSamplesPerSeries samples for the series in the batch that + // should have OOO samples. OOO samples are appended after all the in-order samples. + if i < oooSamplesPerSeries { + k += oooSeriesPerBatch + } + for ; k < (j+1)*c.seriesPerBatch; k++ { + refSamples = append(refSamples, record.RefSample{ + Ref: chunks.HeadSeriesRef(k) * 101, + T: int64(i) * 10, + V: float64(i) * 100, + }) + } + buf = populateTestWL(b, wal, []any{refSamples}, buf, enableStStorage) } - for ; k < (j+1)*c.seriesPerBatch; k++ { - refSamples = append(refSamples, record.RefSample{ - Ref: chunks.HeadSeriesRef(k) * 101, - T: int64(i) * 10, - V: float64(i) * 100, - }) + } + + // Write mmapped chunks. + if c.mmappedChunkT != 0 { + chunkDiskMapper, err := chunks.NewChunkDiskMapper(nil, mmappedChunksDir(dir), chunkenc.NewPool(), chunks.DefaultWriteBufferSize, chunks.DefaultWriteQueueSize) + require.NoError(b, err) + cOpts := chunkOpts{ + chunkDiskMapper: chunkDiskMapper, + chunkRange: c.mmappedChunkT, + samplesPerChunk: DefaultSamplesPerChunk, } - buf = populateTestWL(b, wal, []any{refSamples}, buf) - } - } - - // Write mmapped chunks. - if c.mmappedChunkT != 0 { - chunkDiskMapper, err := chunks.NewChunkDiskMapper(nil, mmappedChunksDir(dir), chunkenc.NewPool(), chunks.DefaultWriteBufferSize, chunks.DefaultWriteQueueSize) - require.NoError(b, err) - cOpts := chunkOpts{ - chunkDiskMapper: chunkDiskMapper, - chunkRange: c.mmappedChunkT, - samplesPerChunk: DefaultSamplesPerChunk, - } - for k := 0; k < c.batches*c.seriesPerBatch; k++ { - // Create one mmapped chunk per series, with one sample at the given time. - s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, 0, defaultIsolationDisabled, false) - s.append(c.mmappedChunkT, 42, 0, cOpts) - // There's only one head chunk because only a single sample is appended. 
mmapChunks() - // ignores the latest chunk, so we need to cut a new head chunk to guarantee the chunk with - // the sample at c.mmappedChunkT is mmapped. - s.cutNewHeadChunk(c.mmappedChunkT, chunkenc.EncXOR, c.mmappedChunkT) - s.mmapChunks(chunkDiskMapper) - } - require.NoError(b, chunkDiskMapper.Close()) - } - - // Write exemplars. - refExemplars := make([]record.RefExemplar, 0, c.seriesPerBatch) - for i := range exemplarsPerSeries { - for j := 0; j < c.batches; j++ { - refExemplars = refExemplars[:0] - for k := j * c.seriesPerBatch; k < (j+1)*c.seriesPerBatch; k++ { - refExemplars = append(refExemplars, record.RefExemplar{ - Ref: chunks.HeadSeriesRef(k) * 101, - T: int64(i) * 10, - V: float64(i) * 100, - Labels: labels.FromStrings("trace_id", fmt.Sprintf("trace-%d", i)), - }) + for k := 0; k < c.batches*c.seriesPerBatch; k++ { + // Create one mmapped chunk per series, with one sample at the given time. + s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, 0, defaultIsolationDisabled, false) + s.append(c.mmappedChunkT, 42, 0, cOpts) + // There's only one head chunk because only a single sample is appended. mmapChunks() + // ignores the latest chunk, so we need to cut a new head chunk to guarantee the chunk with + // the sample at c.mmappedChunkT is mmapped. + s.cutNewHeadChunk(c.mmappedChunkT, chunkenc.EncXOR, c.mmappedChunkT) + s.mmapChunks(chunkDiskMapper) } - buf = populateTestWL(b, wal, []any{refExemplars}, buf) + require.NoError(b, chunkDiskMapper.Close()) } - } - // Write OOO samples and mmap markers. - refMarkers := make([]record.RefMmapMarker, 0, oooSeriesPerBatch) - refSamples = make([]record.RefSample, 0, oooSeriesPerBatch) - for i := range oooSamplesPerSeries { - shouldAddMarkers := c.oooCapMax != 0 && i != 0 && int64(i)%c.oooCapMax == 0 - - for j := 0; j < c.batches; j++ { - refSamples = refSamples[:0] - if shouldAddMarkers { - refMarkers = refMarkers[:0] + // Write exemplars. 
+ refExemplars := make([]record.RefExemplar, 0, c.seriesPerBatch) + for i := range exemplarsPerSeries { + for j := 0; j < c.batches; j++ { + refExemplars = refExemplars[:0] + for k := j * c.seriesPerBatch; k < (j+1)*c.seriesPerBatch; k++ { + refExemplars = append(refExemplars, record.RefExemplar{ + Ref: chunks.HeadSeriesRef(k) * 101, + T: int64(i) * 10, + V: float64(i) * 100, + Labels: labels.FromStrings("trace_id", fmt.Sprintf("trace-%d", i)), + }) + } + buf = populateTestWL(b, wal, []any{refExemplars}, buf, enableStStorage) } - for k := j * c.seriesPerBatch; k < (j*c.seriesPerBatch)+oooSeriesPerBatch; k++ { - ref := chunks.HeadSeriesRef(k) * 101 + } + + // Write OOO samples and mmap markers. + refMarkers := make([]record.RefMmapMarker, 0, oooSeriesPerBatch) + refSamples = make([]record.RefSample, 0, oooSeriesPerBatch) + for i := range oooSamplesPerSeries { + shouldAddMarkers := c.oooCapMax != 0 && i != 0 && int64(i)%c.oooCapMax == 0 + + for j := 0; j < c.batches; j++ { + refSamples = refSamples[:0] if shouldAddMarkers { - // loadWBL() checks that the marker's MmapRef is less than or equal to the ref - // for the last mmap chunk. Setting MmapRef to 0 to always pass that check. - refMarkers = append(refMarkers, record.RefMmapMarker{Ref: ref, MmapRef: 0}) + refMarkers = refMarkers[:0] } - refSamples = append(refSamples, record.RefSample{ - Ref: ref, - T: int64(i) * 10, - V: float64(i) * 100, - }) + for k := j * c.seriesPerBatch; k < (j*c.seriesPerBatch)+oooSeriesPerBatch; k++ { + ref := chunks.HeadSeriesRef(k) * 101 + if shouldAddMarkers { + // loadWBL() checks that the marker's MmapRef is less than or equal to the ref + // for the last mmap chunk. Setting MmapRef to 0 to always pass that check. 
+ refMarkers = append(refMarkers, record.RefMmapMarker{Ref: ref, MmapRef: 0}) + } + refSamples = append(refSamples, record.RefSample{ + Ref: ref, + T: int64(i) * 10, + V: float64(i) * 100, + }) + } + if shouldAddMarkers { + populateTestWL(b, wbl, []any{refMarkers}, buf, enableStStorage) + } + buf = populateTestWL(b, wal, []any{refSamples}, buf, enableStStorage) + buf = populateTestWL(b, wbl, []any{refSamples}, buf, enableStStorage) } - if shouldAddMarkers { - populateTestWL(b, wbl, []any{refMarkers}, buf) + } + + b.ResetTimer() + + // Load the WAL. + for b.Loop() { + opts := DefaultHeadOptions() + opts.ChunkRange = 1000 + opts.ChunkDirRoot = dir + if c.oooCapMax > 0 { + opts.OutOfOrderCapMax.Store(c.oooCapMax) } - buf = populateTestWL(b, wal, []any{refSamples}, buf) - buf = populateTestWL(b, wbl, []any{refSamples}, buf) + h, err := NewHead(nil, nil, wal, wbl, opts, nil) + require.NoError(b, err) + h.Init(0) } - } - - b.ResetTimer() - - // Load the WAL. - for b.Loop() { - opts := DefaultHeadOptions() - opts.ChunkRange = 1000 - opts.ChunkDirRoot = dir - if c.oooCapMax > 0 { - opts.OutOfOrderCapMax.Store(c.oooCapMax) + b.StopTimer() + wal.Close() + if wbl != nil { + wbl.Close() } - h, err := NewHead(nil, nil, wal, wbl, opts, nil) - require.NoError(b, err) - h.Init(0) - } - b.StopTimer() - wal.Close() - if wbl != nil { - wbl.Close() - } - }) + }) + } } } } @@ -711,124 +713,126 @@ func TestHead_HighConcurrencyReadAndWrite(t *testing.T) { } func TestHead_ReadWAL(t *testing.T) { - for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - entries := []any{ - []record.RefSeries{ - {Ref: 10, Labels: labels.FromStrings("a", "1")}, - {Ref: 11, Labels: labels.FromStrings("a", "2")}, - {Ref: 100, Labels: labels.FromStrings("a", "3")}, - }, - []record.RefSample{ - {Ref: 0, T: 99, V: 1}, - {Ref: 10, T: 100, V: 2}, - {Ref: 100, T: 100, V: 3}, - }, - []record.RefSeries{ - 
{Ref: 50, Labels: labels.FromStrings("a", "4")}, - // This series has two refs pointing to it. - {Ref: 101, Labels: labels.FromStrings("a", "3")}, - }, - []record.RefSample{ - {Ref: 10, T: 101, V: 5}, - {Ref: 50, T: 101, V: 6}, - // Sample for duplicate series record. - {Ref: 101, T: 101, V: 7}, - }, - []tombstones.Stone{ - {Ref: 0, Intervals: []tombstones.Interval{{Mint: 99, Maxt: 101}}}, - // Tombstone for duplicate series record. - {Ref: 101, Intervals: []tombstones.Interval{{Mint: 0, Maxt: 100}}}, - }, - []record.RefExemplar{ - {Ref: 10, T: 100, V: 1, Labels: labels.FromStrings("trace_id", "asdf")}, - // Exemplar for duplicate series record. - {Ref: 101, T: 101, V: 7, Labels: labels.FromStrings("trace_id", "zxcv")}, - }, - []record.RefMetadata{ - // Metadata for duplicate series record. - {Ref: 101, Type: uint8(record.Counter), Unit: "foo", Help: "total foo"}, - }, - } - - head, w := newTestHead(t, 1000, compress, false) - - populateTestWL(t, w, entries, nil) - - require.NoError(t, head.Init(math.MinInt64)) - require.Equal(t, uint64(101), head.lastSeriesID.Load()) - - s10 := head.series.getByID(10) - s11 := head.series.getByID(11) - s50 := head.series.getByID(50) - s100 := head.series.getByID(100) - s101 := head.series.getByID(101) - - testutil.RequireEqual(t, labels.FromStrings("a", "1"), s10.lset) - require.Nil(t, s11) // Series without samples should be garbage collected at head.Init(). - testutil.RequireEqual(t, labels.FromStrings("a", "4"), s50.lset) - testutil.RequireEqual(t, labels.FromStrings("a", "3"), s100.lset) - - // Duplicate series record should not be written to the head. - require.Nil(t, s101) - // But it should have a WAL expiry set. - keepUntil, ok := head.getWALExpiry(101) - require.True(t, ok) - require.Equal(t, int64(101), keepUntil) - // Only the duplicate series record should have a WAL expiry set. 
- _, ok = head.getWALExpiry(50) - require.False(t, ok) - - expandChunk := func(c chunkenc.Iterator) (x []sample) { - for c.Next() == chunkenc.ValFloat { - t, v := c.At() - x = append(x, sample{t: t, f: v}) + for _, enableStStorage := range []bool{false, true} { + for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + entries := []any{ + []record.RefSeries{ + {Ref: 10, Labels: labels.FromStrings("a", "1")}, + {Ref: 11, Labels: labels.FromStrings("a", "2")}, + {Ref: 100, Labels: labels.FromStrings("a", "3")}, + }, + []record.RefSample{ + {Ref: 0, T: 99, V: 1}, + {Ref: 10, T: 100, V: 2}, + {Ref: 100, T: 100, V: 3}, + }, + []record.RefSeries{ + {Ref: 50, Labels: labels.FromStrings("a", "4")}, + // This series has two refs pointing to it. + {Ref: 101, Labels: labels.FromStrings("a", "3")}, + }, + []record.RefSample{ + {Ref: 10, T: 101, V: 5}, + {Ref: 50, T: 101, V: 6}, + // Sample for duplicate series record. + {Ref: 101, T: 101, V: 7}, + }, + []tombstones.Stone{ + {Ref: 0, Intervals: []tombstones.Interval{{Mint: 99, Maxt: 101}}}, + // Tombstone for duplicate series record. + {Ref: 101, Intervals: []tombstones.Interval{{Mint: 0, Maxt: 100}}}, + }, + []record.RefExemplar{ + {Ref: 10, T: 100, V: 1, Labels: labels.FromStrings("trace_id", "asdf")}, + // Exemplar for duplicate series record. + {Ref: 101, T: 101, V: 7, Labels: labels.FromStrings("trace_id", "zxcv")}, + }, + []record.RefMetadata{ + // Metadata for duplicate series record. + {Ref: 101, Type: uint8(record.Counter), Unit: "foo", Help: "total foo"}, + }, } - require.NoError(t, c.Err()) - return x - } - // Verify samples and exemplar for series 10. 
- c, _, _, err := s10.chunk(0, head.chunkDiskMapper, &head.memChunkPool) - require.NoError(t, err) - require.Equal(t, []sample{{0, 100, 2, nil, nil}, {0, 101, 5, nil, nil}}, expandChunk(c.chunk.Iterator(nil))) + head, w := newTestHead(t, 1000, compress, false) - q, err := head.ExemplarQuerier(context.Background()) - require.NoError(t, err) - e, err := q.Select(0, 1000, []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "a", "1")}) - require.NoError(t, err) - require.NotEmpty(t, e) - require.NotEmpty(t, e[0].Exemplars) - require.True(t, exemplar.Exemplar{Ts: 100, Value: 1, Labels: labels.FromStrings("trace_id", "asdf")}.Equals(e[0].Exemplars[0])) + populateTestWL(t, w, entries, nil, enableStStorage) - // Verify samples for series 50 - c, _, _, err = s50.chunk(0, head.chunkDiskMapper, &head.memChunkPool) - require.NoError(t, err) - require.Equal(t, []sample{{0, 101, 6, nil, nil}}, expandChunk(c.chunk.Iterator(nil))) + require.NoError(t, head.Init(math.MinInt64)) + require.Equal(t, uint64(101), head.lastSeriesID.Load()) - // Verify records for series 100 and its duplicate, series 101. - // The samples before the new series record should be discarded since a duplicate record - // is only possible when old samples were compacted. 
- c, _, _, err = s100.chunk(0, head.chunkDiskMapper, &head.memChunkPool) - require.NoError(t, err) - require.Equal(t, []sample{{0, 101, 7, nil, nil}}, expandChunk(c.chunk.Iterator(nil))) + s10 := head.series.getByID(10) + s11 := head.series.getByID(11) + s50 := head.series.getByID(50) + s100 := head.series.getByID(100) + s101 := head.series.getByID(101) - q, err = head.ExemplarQuerier(context.Background()) - require.NoError(t, err) - e, err = q.Select(0, 1000, []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "a", "3")}) - require.NoError(t, err) - require.NotEmpty(t, e) - require.NotEmpty(t, e[0].Exemplars) - require.True(t, exemplar.Exemplar{Ts: 101, Value: 7, Labels: labels.FromStrings("trace_id", "zxcv")}.Equals(e[0].Exemplars[0])) + testutil.RequireEqual(t, labels.FromStrings("a", "1"), s10.lset) + require.Nil(t, s11) // Series without samples should be garbage collected at head.Init(). + testutil.RequireEqual(t, labels.FromStrings("a", "4"), s50.lset) + testutil.RequireEqual(t, labels.FromStrings("a", "3"), s100.lset) - require.NotNil(t, s100.meta) - require.Equal(t, "foo", s100.meta.Unit) - require.Equal(t, "total foo", s100.meta.Help) + // Duplicate series record should not be written to the head. + require.Nil(t, s101) + // But it should have a WAL expiry set. + keepUntil, ok := head.getWALExpiry(101) + require.True(t, ok) + require.Equal(t, int64(101), keepUntil) + // Only the duplicate series record should have a WAL expiry set. + _, ok = head.getWALExpiry(50) + require.False(t, ok) - intervals, err := head.tombstones.Get(storage.SeriesRef(s100.ref)) - require.NoError(t, err) - require.Equal(t, tombstones.Intervals{{Mint: 0, Maxt: 100}}, intervals) - }) + expandChunk := func(c chunkenc.Iterator) (x []sample) { + for c.Next() == chunkenc.ValFloat { + t, v := c.At() + x = append(x, sample{t: t, f: v}) + } + require.NoError(t, c.Err()) + return x + } + + // Verify samples and exemplar for series 10. 
+ c, _, _, err := s10.chunk(0, head.chunkDiskMapper, &head.memChunkPool) + require.NoError(t, err) + require.Equal(t, []sample{{0, 100, 2, nil, nil}, {0, 101, 5, nil, nil}}, expandChunk(c.chunk.Iterator(nil))) + + q, err := head.ExemplarQuerier(context.Background()) + require.NoError(t, err) + e, err := q.Select(0, 1000, []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "a", "1")}) + require.NoError(t, err) + require.NotEmpty(t, e) + require.NotEmpty(t, e[0].Exemplars) + require.True(t, exemplar.Exemplar{Ts: 100, Value: 1, Labels: labels.FromStrings("trace_id", "asdf")}.Equals(e[0].Exemplars[0])) + + // Verify samples for series 50 + c, _, _, err = s50.chunk(0, head.chunkDiskMapper, &head.memChunkPool) + require.NoError(t, err) + require.Equal(t, []sample{{0, 101, 6, nil, nil}}, expandChunk(c.chunk.Iterator(nil))) + + // Verify records for series 100 and its duplicate, series 101. + // The samples before the new series record should be discarded since a duplicate record + // is only possible when old samples were compacted. 
+ c, _, _, err = s100.chunk(0, head.chunkDiskMapper, &head.memChunkPool) + require.NoError(t, err) + require.Equal(t, []sample{{0, 101, 7, nil, nil}}, expandChunk(c.chunk.Iterator(nil))) + + q, err = head.ExemplarQuerier(context.Background()) + require.NoError(t, err) + e, err = q.Select(0, 1000, []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "a", "3")}) + require.NoError(t, err) + require.NotEmpty(t, e) + require.NotEmpty(t, e[0].Exemplars) + require.True(t, exemplar.Exemplar{Ts: 101, Value: 7, Labels: labels.FromStrings("trace_id", "zxcv")}.Equals(e[0].Exemplars[0])) + + require.NotNil(t, s100.meta) + require.Equal(t, "foo", s100.meta.Unit) + require.Equal(t, "total foo", s100.meta.Help) + + intervals, err := head.tombstones.Get(storage.SeriesRef(s100.ref)) + require.NoError(t, err) + require.Equal(t, tombstones.Intervals{{Mint: 0, Maxt: 100}}, intervals) + }) + } } } @@ -1099,42 +1103,43 @@ func TestHead_WALCheckpointMultiRef(t *testing.T) { }, } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - h, w := newTestHead(t, 1000, compression.None, false) - - populateTestWL(t, w, tc.walEntries, nil) - first, _, err := wlog.Segments(w.Dir()) - require.NoError(t, err) - - require.NoError(t, h.Init(0)) - - keepUntil, ok := h.getWALExpiry(2) - require.True(t, ok) - require.Equal(t, tc.expectedWalExpiry, keepUntil) - - // Each truncation creates a new segment, so attempt truncations until a checkpoint is created - for { - h.lastWALTruncationTime.Store(0) // Reset so that it's always time to truncate the WAL - err := h.truncateWAL(tc.walTruncateMinT) + for _, enableStStorage := range []bool{false, true} { + for _, tc := range cases { + t.Run(tc.name+",stStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + h, w := newTestHead(t, 1000, compression.None, false) + populateTestWL(t, w, tc.walEntries, nil, enableStStorage) + first, _, err := wlog.Segments(w.Dir()) require.NoError(t, err) - f, _, err := wlog.Segments(w.Dir()) - 
require.NoError(t, err) - if f > first { - break + + require.NoError(t, h.Init(0)) + + keepUntil, ok := h.getWALExpiry(2) + require.True(t, ok) + require.Equal(t, tc.expectedWalExpiry, keepUntil) + + // Each truncation creates a new segment, so attempt truncations until a checkpoint is created + for { + h.lastWALTruncationTime.Store(0) // Reset so that it's always time to truncate the WAL + err := h.truncateWAL(tc.walTruncateMinT) + require.NoError(t, err) + f, _, err := wlog.Segments(w.Dir()) + require.NoError(t, err) + if f > first { + break + } } - } - // Read test WAL , checkpoint first - checkpointDir, _, err := wlog.LastCheckpoint(w.Dir()) - require.NoError(t, err) - cprecs := readTestWAL(t, checkpointDir) - recs := readTestWAL(t, w.Dir()) - recs = append(cprecs, recs...) + // Read test WAL , checkpoint first + checkpointDir, _, err := wlog.LastCheckpoint(w.Dir()) + require.NoError(t, err) + cprecs := readTestWAL(t, checkpointDir) + recs := readTestWAL(t, w.Dir()) + recs = append(cprecs, recs...) 
- // Use testutil.RequireEqual which handles labels properly with dedupelabels - testutil.RequireEqual(t, tc.expectedWalEntries, recs) - }) + // Use testutil.RequireEqual which handles labels properly with dedupelabels + testutil.RequireEqual(t, tc.expectedWalEntries, recs) + }) + } } } @@ -1685,29 +1690,31 @@ func TestMemSeries_truncateChunks_scenarios(t *testing.T) { } func TestHeadDeleteSeriesWithoutSamples(t *testing.T) { - for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - entries := []any{ - []record.RefSeries{ - {Ref: 10, Labels: labels.FromStrings("a", "1")}, - }, - []record.RefSample{}, - []record.RefSeries{ - {Ref: 50, Labels: labels.FromStrings("a", "2")}, - }, - []record.RefSample{ - {Ref: 50, T: 80, V: 1}, - {Ref: 50, T: 90, V: 1}, - }, - } - head, w := newTestHead(t, 1000, compress, false) + for _, enableStStorage := range []bool{false, true} { + for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + entries := []any{ + []record.RefSeries{ + {Ref: 10, Labels: labels.FromStrings("a", "1")}, + }, + []record.RefSample{}, + []record.RefSeries{ + {Ref: 50, Labels: labels.FromStrings("a", "2")}, + }, + []record.RefSample{ + {Ref: 50, T: 80, V: 1}, + {Ref: 50, T: 90, V: 1}, + }, + } + head, w := newTestHead(t, 1000, compress, false) - populateTestWL(t, w, entries, nil) + populateTestWL(t, w, entries, nil, enableStStorage) - require.NoError(t, head.Init(math.MinInt64)) + require.NoError(t, head.Init(math.MinInt64)) - require.NoError(t, head.Delete(context.Background(), 0, 100, labels.MustNewMatcher(labels.MatchEqual, "a", "1"))) - }) + require.NoError(t, head.Delete(context.Background(), 0, 100, labels.MustNewMatcher(labels.MatchEqual, "a", "1"))) + }) + } } } @@ -2381,26 +2388,26 @@ func 
TestGCChunkAccess(t *testing.T) { idx := h.indexRange(0, 1500) var ( - chunks []chunks.Meta + chnks []chunks.Meta builder labels.ScratchBuilder ) - require.NoError(t, idx.Series(1, &builder, &chunks)) + require.NoError(t, idx.Series(1, &builder, &chnks)) require.Equal(t, labels.FromStrings("a", "1"), builder.Labels()) - require.Len(t, chunks, 2) + require.Len(t, chnks, 2) cr, err := h.chunksRange(0, 1500, nil) require.NoError(t, err) - _, _, err = cr.ChunkOrIterable(chunks[0]) + _, _, err = cr.ChunkOrIterable(chnks[0]) require.NoError(t, err) - _, _, err = cr.ChunkOrIterable(chunks[1]) + _, _, err = cr.ChunkOrIterable(chnks[1]) require.NoError(t, err) require.NoError(t, h.Truncate(1500)) // Remove a chunk. - _, _, err = cr.ChunkOrIterable(chunks[0]) + _, _, err = cr.ChunkOrIterable(chnks[0]) require.Equal(t, storage.ErrNotFound, err) - _, _, err = cr.ChunkOrIterable(chunks[1]) + _, _, err = cr.ChunkOrIterable(chnks[1]) require.NoError(t, err) } @@ -2568,94 +2575,96 @@ func TestHead_ReturnsSortedLabelValues(t *testing.T) { // TestWalRepair_DecodingError ensures that a repair is run for an error // when decoding a record. func TestWalRepair_DecodingError(t *testing.T) { - var enc record.Encoder - for name, test := range map[string]struct { - corrFunc func(rec []byte) []byte // Func that applies the corruption to a record. - rec []byte - totalRecs int - expRecs int - }{ - "decode_series": { - func(rec []byte) []byte { - return rec[:3] + for _, enableStStorage := range []bool{false, true} { + enc := record.Encoder{EnableSTStorage: enableStStorage} + for name, test := range map[string]struct { + corrFunc func(rec []byte) []byte // Func that applies the corruption to a record. 
+ rec []byte + totalRecs int + expRecs int + }{ + "decode_series": { + func(rec []byte) []byte { + return rec[:3] + }, + enc.Series([]record.RefSeries{{Ref: 1, Labels: labels.FromStrings("a", "b")}}, []byte{}), + 9, + 5, }, - enc.Series([]record.RefSeries{{Ref: 1, Labels: labels.FromStrings("a", "b")}}, []byte{}), - 9, - 5, - }, - "decode_samples": { - func(rec []byte) []byte { - return rec[:3] + "decode_samples": { + func(rec []byte) []byte { + return rec[:3] + }, + enc.Samples([]record.RefSample{{Ref: 0, T: 99, V: 1}}, []byte{}), + 9, + 5, }, - enc.Samples([]record.RefSample{{Ref: 0, T: 99, V: 1}}, []byte{}), - 9, - 5, - }, - "decode_tombstone": { - func(rec []byte) []byte { - return rec[:3] + "decode_tombstone": { + func(rec []byte) []byte { + return rec[:3] + }, + enc.Tombstones([]tombstones.Stone{{Ref: 1, Intervals: tombstones.Intervals{}}}, []byte{}), + 9, + 5, }, - enc.Tombstones([]tombstones.Stone{{Ref: 1, Intervals: tombstones.Intervals{}}}, []byte{}), - 9, - 5, - }, - } { - for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { - t.Run(fmt.Sprintf("%s,compress=%s", name, compress), func(t *testing.T) { - dir := t.TempDir() + } { + for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { + t.Run(fmt.Sprintf("%s,compress=%s,stStorage=%v", name, compress, enableStStorage), func(t *testing.T) { + dir := t.TempDir() - // Fill the wal and corrupt it. - { - w, err := wlog.New(nil, nil, filepath.Join(dir, "wal"), compress) - require.NoError(t, err) + // Fill the wal and corrupt it. + { + w, err := wlog.New(nil, nil, filepath.Join(dir, "wal"), compress) + require.NoError(t, err) - for i := 1; i <= test.totalRecs; i++ { - // At this point insert a corrupted record. - if i-1 == test.expRecs { - require.NoError(t, w.Log(test.corrFunc(test.rec))) - continue + for i := 1; i <= test.totalRecs; i++ { + // At this point insert a corrupted record. 
+ if i-1 == test.expRecs { + require.NoError(t, w.Log(test.corrFunc(test.rec))) + continue + } + require.NoError(t, w.Log(test.rec)) } - require.NoError(t, w.Log(test.rec)) + + opts := DefaultHeadOptions() + opts.ChunkRange = 1 + opts.ChunkDirRoot = w.Dir() + h, err := NewHead(nil, nil, w, nil, opts, nil) + require.NoError(t, err) + require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal)) + initErr := h.Init(math.MinInt64) + + var cerr *wlog.CorruptionErr + require.ErrorAs(t, initErr, &cerr, "reading the wal didn't return corruption error") + require.NoError(t, h.Close()) // Head will close the wal as well. } - opts := DefaultHeadOptions() - opts.ChunkRange = 1 - opts.ChunkDirRoot = w.Dir() - h, err := NewHead(nil, nil, w, nil, opts, nil) - require.NoError(t, err) - require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal)) - initErr := h.Init(math.MinInt64) - - var cerr *wlog.CorruptionErr - require.ErrorAs(t, initErr, &cerr, "reading the wal didn't return corruption error") - require.NoError(t, h.Close()) // Head will close the wal as well. - } - - // Open the db to trigger a repair. - { - db, err := Open(dir, nil, nil, DefaultOptions(), nil) - require.NoError(t, err) - defer func() { - require.NoError(t, db.Close()) - }() - require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.walCorruptionsTotal)) - } - - // Read the wal content after the repair. - { - sr, err := wlog.NewSegmentsReader(filepath.Join(dir, "wal")) - require.NoError(t, err) - defer sr.Close() - r := wlog.NewReader(sr) - - var actRec int - for r.Next() { - actRec++ + // Open the db to trigger a repair. 
+ { + db, err := Open(dir, nil, nil, DefaultOptions(), nil) + require.NoError(t, err) + defer func() { + require.NoError(t, db.Close()) + }() + require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.walCorruptionsTotal)) } - require.NoError(t, r.Err()) - require.Equal(t, test.expRecs, actRec, "Wrong number of intact records") - } - }) + + // Read the wal content after the repair. + { + sr, err := wlog.NewSegmentsReader(filepath.Join(dir, "wal")) + require.NoError(t, err) + defer sr.Close() + r := wlog.NewReader(sr) + + var actRec int + for r.Next() { + actRec++ + } + require.NoError(t, r.Err()) + require.Equal(t, test.expRecs, actRec, "Wrong number of intact records") + } + }) + } } } } @@ -2663,72 +2672,76 @@ func TestWalRepair_DecodingError(t *testing.T) { // TestWblRepair_DecodingError ensures that a repair is run for an error // when decoding a record. func TestWblRepair_DecodingError(t *testing.T) { - var enc record.Encoder - corrFunc := func(rec []byte) []byte { - return rec[:3] - } - rec := enc.Samples([]record.RefSample{{Ref: 0, T: 99, V: 1}}, []byte{}) - totalRecs := 9 - expRecs := 5 - dir := t.TempDir() - - // Fill the wbl and corrupt it. - { - wal, err := wlog.New(nil, nil, filepath.Join(dir, "wal"), compression.None) - require.NoError(t, err) - wbl, err := wlog.New(nil, nil, filepath.Join(dir, "wbl"), compression.None) - require.NoError(t, err) - - for i := 1; i <= totalRecs; i++ { - // At this point insert a corrupted record. 
- if i-1 == expRecs { - require.NoError(t, wbl.Log(corrFunc(rec))) - continue + for _, enableStStorage := range []bool{false, true} { + t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + enc := record.Encoder{EnableSTStorage: enableStStorage} + corrFunc := func(rec []byte) []byte { + return rec[:3] } - require.NoError(t, wbl.Log(rec)) - } + rec := enc.Samples([]record.RefSample{{Ref: 0, T: 99, V: 1}}, []byte{}) + totalRecs := 9 + expRecs := 5 + dir := t.TempDir() - opts := DefaultHeadOptions() - opts.ChunkRange = 1 - opts.ChunkDirRoot = wal.Dir() - opts.OutOfOrderCapMax.Store(30) - opts.OutOfOrderTimeWindow.Store(1000 * time.Minute.Milliseconds()) - h, err := NewHead(nil, nil, wal, wbl, opts, nil) - require.NoError(t, err) - require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal)) - initErr := h.Init(math.MinInt64) + // Fill the wbl and corrupt it. + { + wal, err := wlog.New(nil, nil, filepath.Join(dir, "wal"), compression.None) + require.NoError(t, err) + wbl, err := wlog.New(nil, nil, filepath.Join(dir, "wbl"), compression.None) + require.NoError(t, err) - var elb *errLoadWbl - require.ErrorAs(t, initErr, &elb) // Wbl errors are wrapped into errLoadWbl, make sure we can unwrap it. + for i := 1; i <= totalRecs; i++ { + // At this point insert a corrupted record. + if i-1 == expRecs { + require.NoError(t, wbl.Log(corrFunc(rec))) + continue + } + require.NoError(t, wbl.Log(rec)) + } - var cerr *wlog.CorruptionErr - require.ErrorAs(t, initErr, &cerr, "reading the wal didn't return corruption error") - require.NoError(t, h.Close()) // Head will close the wal as well. 
- } + opts := DefaultHeadOptions() + opts.ChunkRange = 1 + opts.ChunkDirRoot = wal.Dir() + opts.OutOfOrderCapMax.Store(30) + opts.OutOfOrderTimeWindow.Store(1000 * time.Minute.Milliseconds()) + h, err := NewHead(nil, nil, wal, wbl, opts, nil) + require.NoError(t, err) + require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal)) + initErr := h.Init(math.MinInt64) - // Open the db to trigger a repair. - { - db, err := Open(dir, nil, nil, DefaultOptions(), nil) - require.NoError(t, err) - defer func() { - require.NoError(t, db.Close()) - }() - require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.walCorruptionsTotal)) - } + var elb *errLoadWbl + require.ErrorAs(t, initErr, &elb) // Wbl errors are wrapped into errLoadWbl, make sure we can unwrap it. - // Read the wbl content after the repair. - { - sr, err := wlog.NewSegmentsReader(filepath.Join(dir, "wbl")) - require.NoError(t, err) - defer sr.Close() - r := wlog.NewReader(sr) + var cerr *wlog.CorruptionErr + require.ErrorAs(t, initErr, &cerr, "reading the wal didn't return corruption error") + require.NoError(t, h.Close()) // Head will close the wal as well. + } - var actRec int - for r.Next() { - actRec++ - } - require.NoError(t, r.Err()) - require.Equal(t, expRecs, actRec, "Wrong number of intact records") + // Open the db to trigger a repair. + { + db, err := Open(dir, nil, nil, DefaultOptions(), nil) + require.NoError(t, err) + defer func() { + require.NoError(t, db.Close()) + }() + require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.walCorruptionsTotal)) + } + + // Read the wbl content after the repair. 
+ { + sr, err := wlog.NewSegmentsReader(filepath.Join(dir, "wbl")) + require.NoError(t, err) + defer sr.Close() + r := wlog.NewReader(sr) + + var actRec int + for r.Next() { + actRec++ + } + require.NoError(t, r.Err()) + require.Equal(t, expRecs, actRec, "Wrong number of intact records") + } + }) } } @@ -4365,289 +4378,293 @@ func TestHistogramInWALAndMmapChunk(t *testing.T) { } func TestChunkSnapshot(t *testing.T) { - head, _ := newTestHead(t, 120*4, compression.None, false) - defer func() { - head.opts.EnableMemorySnapshotOnShutdown = false - require.NoError(t, head.Close()) - }() + for _, enableStStorage := range []bool{false, true} { + t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + head, _ := newTestHead(t, 120*4, compression.None, false) + defer func() { + head.opts.EnableMemorySnapshotOnShutdown = false + require.NoError(t, head.Close()) + }() - type ex struct { - seriesLabels labels.Labels - e exemplar.Exemplar - } - - numSeries := 10 - expSeries := make(map[string][]chunks.Sample) - expHist := make(map[string][]chunks.Sample) - expFloatHist := make(map[string][]chunks.Sample) - expTombstones := make(map[storage.SeriesRef]tombstones.Intervals) - expExemplars := make([]ex, 0) - histograms := tsdbutil.GenerateTestGaugeHistograms(481) - floatHistogram := tsdbutil.GenerateTestGaugeFloatHistograms(481) - - addExemplar := func(app storage.Appender, ref storage.SeriesRef, lbls labels.Labels, ts int64) { - e := ex{ - seriesLabels: lbls, - e: exemplar.Exemplar{ - Labels: labels.FromStrings("trace_id", strconv.Itoa(rand.Int())), - Value: rand.Float64(), - Ts: ts, - }, - } - expExemplars = append(expExemplars, e) - _, err := app.AppendExemplar(ref, e.seriesLabels, e.e) - require.NoError(t, err) - } - - checkSamples := func() { - q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) - require.Equal(t, expSeries, 
series) - } - checkHistograms := func() { - q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "hist", "baz.*")) - require.Equal(t, expHist, series) - } - checkFloatHistograms := func() { - q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "floathist", "bat.*")) - require.Equal(t, expFloatHist, series) - } - checkTombstones := func() { - tr, err := head.Tombstones() - require.NoError(t, err) - actTombstones := make(map[storage.SeriesRef]tombstones.Intervals) - require.NoError(t, tr.Iter(func(ref storage.SeriesRef, itvs tombstones.Intervals) error { - for _, itv := range itvs { - actTombstones[ref].Add(itv) + type ex struct { + seriesLabels labels.Labels + e exemplar.Exemplar + } + + numSeries := 10 + expSeries := make(map[string][]chunks.Sample) + expHist := make(map[string][]chunks.Sample) + expFloatHist := make(map[string][]chunks.Sample) + expTombstones := make(map[storage.SeriesRef]tombstones.Intervals) + expExemplars := make([]ex, 0) + histograms := tsdbutil.GenerateTestGaugeHistograms(481) + floatHistogram := tsdbutil.GenerateTestGaugeFloatHistograms(481) + + addExemplar := func(app storage.Appender, ref storage.SeriesRef, lbls labels.Labels, ts int64) { + e := ex{ + seriesLabels: lbls, + e: exemplar.Exemplar{ + Labels: labels.FromStrings("trace_id", strconv.Itoa(rand.Int())), + Value: rand.Float64(), + Ts: ts, + }, + } + expExemplars = append(expExemplars, e) + _, err := app.AppendExemplar(ref, e.seriesLabels, e.e) + require.NoError(t, err) + } + + checkSamples := func() { + q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) + require.Equal(t, expSeries, series) + } + checkHistograms := func() { + q, err := NewBlockQuerier(head, 
math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "hist", "baz.*")) + require.Equal(t, expHist, series) + } + checkFloatHistograms := func() { + q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "floathist", "bat.*")) + require.Equal(t, expFloatHist, series) + } + checkTombstones := func() { + tr, err := head.Tombstones() + require.NoError(t, err) + actTombstones := make(map[storage.SeriesRef]tombstones.Intervals) + require.NoError(t, tr.Iter(func(ref storage.SeriesRef, itvs tombstones.Intervals) error { + for _, itv := range itvs { + actTombstones[ref].Add(itv) + } + return nil + })) + require.Equal(t, expTombstones, actTombstones) + } + checkExemplars := func() { + actExemplars := make([]ex, 0, len(expExemplars)) + err := head.exemplars.IterateExemplars(func(seriesLabels labels.Labels, e exemplar.Exemplar) error { + actExemplars = append(actExemplars, ex{ + seriesLabels: seriesLabels, + e: e, + }) + return nil + }) + require.NoError(t, err) + // Verifies both existence of right exemplars and order of exemplars in the buffer. 
+ testutil.RequireEqualWithOptions(t, expExemplars, actExemplars, []cmp.Option{cmp.AllowUnexported(ex{})}) + } + + var ( + wlast, woffset int + err error + ) + + closeHeadAndCheckSnapshot := func() { + require.NoError(t, head.Close()) + + _, sidx, soffset, err := LastChunkSnapshot(head.opts.ChunkDirRoot) + require.NoError(t, err) + require.Equal(t, wlast, sidx) + require.Equal(t, woffset, soffset) + } + + openHeadAndCheckReplay := func() { + w, err := wlog.NewSize(nil, nil, head.wal.Dir(), 32768, compression.None) + require.NoError(t, err) + head, err = NewHead(nil, nil, w, nil, head.opts, nil) + require.NoError(t, err) + require.NoError(t, head.Init(math.MinInt64)) + + checkSamples() + checkHistograms() + checkFloatHistograms() + checkTombstones() + checkExemplars() + } + + { // Initial data that goes into snapshot. + // Add some initial samples with >=1 m-map chunk. + app := head.Appender(context.Background()) + for i := 1; i <= numSeries; i++ { + lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) + lblStr := lbls.String() + lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) + lblsHistStr := lblsHist.String() + lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) + lblsFloatHistStr := lblsFloatHist.String() + + // 240 samples should m-map at least 1 chunk. 
+ for ts := int64(1); ts <= 240; ts++ { + val := rand.Float64() + expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) + ref, err := app.Append(0, lbls, ts, val) + require.NoError(t, err) + + hist := histograms[int(ts)] + expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) + _, err = app.AppendHistogram(0, lblsHist, ts, hist, nil) + require.NoError(t, err) + + floatHist := floatHistogram[int(ts)] + expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) + _, err = app.AppendHistogram(0, lblsFloatHist, ts, nil, floatHist) + require.NoError(t, err) + + // Add an exemplar and to create multiple WAL records. + if ts%10 == 0 { + addExemplar(app, ref, lbls, ts) + require.NoError(t, app.Commit()) + app = head.Appender(context.Background()) + } + } + } + require.NoError(t, app.Commit()) + + // Add some tombstones. + enc := record.Encoder{EnableSTStorage: enableStStorage} + for i := 1; i <= numSeries; i++ { + ref := storage.SeriesRef(i) + itvs := tombstones.Intervals{ + {Mint: 1234, Maxt: 2345}, + {Mint: 3456, Maxt: 4567}, + } + for _, itv := range itvs { + expTombstones[ref].Add(itv) + } + head.tombstones.AddInterval(ref, itvs...) + err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ + {Ref: ref, Intervals: itvs}, + }, nil)) + require.NoError(t, err) + } + } + + // These references should be the ones used for the snapshot. + wlast, woffset, err = head.wal.LastSegmentAndOffset() + require.NoError(t, err) + if woffset != 0 && woffset < 32*1024 { + // The page is always filled before taking the snapshot. + woffset = 32 * 1024 + } + + { + // Creating snapshot and verifying it. + head.opts.EnableMemorySnapshotOnShutdown = true + closeHeadAndCheckSnapshot() // This will create a snapshot. + + // Test the replay of snapshot. + openHeadAndCheckReplay() + } + + { // Additional data to only include in WAL and m-mapped chunks and not snapshot. 
This mimics having an old snapshot on disk. + // Add more samples. + app := head.Appender(context.Background()) + for i := 1; i <= numSeries; i++ { + lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) + lblStr := lbls.String() + lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) + lblsHistStr := lblsHist.String() + lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) + lblsFloatHistStr := lblsFloatHist.String() + + // 240 samples should m-map at least 1 chunk. + for ts := int64(241); ts <= 480; ts++ { + val := rand.Float64() + expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) + ref, err := app.Append(0, lbls, ts, val) + require.NoError(t, err) + + hist := histograms[int(ts)] + expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) + _, err = app.AppendHistogram(0, lblsHist, ts, hist, nil) + require.NoError(t, err) + + floatHist := floatHistogram[int(ts)] + expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) + _, err = app.AppendHistogram(0, lblsFloatHist, ts, nil, floatHist) + require.NoError(t, err) + + // Add an exemplar and to create multiple WAL records. + if ts%10 == 0 { + addExemplar(app, ref, lbls, ts) + require.NoError(t, app.Commit()) + app = head.Appender(context.Background()) + } + } + } + require.NoError(t, app.Commit()) + + // Add more tombstones. + enc := record.Encoder{EnableSTStorage: enableStStorage} + for i := 1; i <= numSeries; i++ { + ref := storage.SeriesRef(i) + itvs := tombstones.Intervals{ + {Mint: 12345, Maxt: 23456}, + {Mint: 34567, Maxt: 45678}, + } + for _, itv := range itvs { + expTombstones[ref].Add(itv) + } + head.tombstones.AddInterval(ref, itvs...) + err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ + {Ref: ref, Intervals: itvs}, + }, nil)) + require.NoError(t, err) + } + } + { + // Close Head and verify that new snapshot was not created. 
+ head.opts.EnableMemorySnapshotOnShutdown = false + closeHeadAndCheckSnapshot() // This should not create a snapshot. + + // Test the replay of snapshot, m-map chunks, and WAL. + head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. + openHeadAndCheckReplay() + } + + // Creating another snapshot should delete the older snapshot and replay still works fine. + wlast, woffset, err = head.wal.LastSegmentAndOffset() + require.NoError(t, err) + if woffset != 0 && woffset < 32*1024 { + // The page is always filled before taking the snapshot. + woffset = 32 * 1024 + } + + { + // Close Head and verify that new snapshot was created. + closeHeadAndCheckSnapshot() + + // Verify that there is only 1 snapshot. + files, err := os.ReadDir(head.opts.ChunkDirRoot) + require.NoError(t, err) + snapshots := 0 + for i := len(files) - 1; i >= 0; i-- { + fi := files[i] + if strings.HasPrefix(fi.Name(), chunkSnapshotPrefix) { + snapshots++ + require.Equal(t, chunkSnapshotDir(wlast, woffset), fi.Name()) + } + } + require.Equal(t, 1, snapshots) + + // Test the replay of snapshot. + head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. + + // Disabling exemplars to check that it does not hard fail replay + // https://github.com/prometheus/prometheus/issues/9437#issuecomment-933285870. 
+ head.opts.EnableExemplarStorage = false + head.opts.MaxExemplars.Store(0) + expExemplars = expExemplars[:0] + + openHeadAndCheckReplay() + + require.Equal(t, 0.0, prom_testutil.ToFloat64(head.metrics.snapshotReplayErrorTotal)) } - return nil - })) - require.Equal(t, expTombstones, actTombstones) - } - checkExemplars := func() { - actExemplars := make([]ex, 0, len(expExemplars)) - err := head.exemplars.IterateExemplars(func(seriesLabels labels.Labels, e exemplar.Exemplar) error { - actExemplars = append(actExemplars, ex{ - seriesLabels: seriesLabels, - e: e, - }) - return nil }) - require.NoError(t, err) - // Verifies both existence of right exemplars and order of exemplars in the buffer. - testutil.RequireEqualWithOptions(t, expExemplars, actExemplars, []cmp.Option{cmp.AllowUnexported(ex{})}) - } - - var ( - wlast, woffset int - err error - ) - - closeHeadAndCheckSnapshot := func() { - require.NoError(t, head.Close()) - - _, sidx, soffset, err := LastChunkSnapshot(head.opts.ChunkDirRoot) - require.NoError(t, err) - require.Equal(t, wlast, sidx) - require.Equal(t, woffset, soffset) - } - - openHeadAndCheckReplay := func() { - w, err := wlog.NewSize(nil, nil, head.wal.Dir(), 32768, compression.None) - require.NoError(t, err) - head, err = NewHead(nil, nil, w, nil, head.opts, nil) - require.NoError(t, err) - require.NoError(t, head.Init(math.MinInt64)) - - checkSamples() - checkHistograms() - checkFloatHistograms() - checkTombstones() - checkExemplars() - } - - { // Initial data that goes into snapshot. - // Add some initial samples with >=1 m-map chunk. 
- app := head.Appender(context.Background()) - for i := 1; i <= numSeries; i++ { - lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) - lblStr := lbls.String() - lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) - lblsHistStr := lblsHist.String() - lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) - lblsFloatHistStr := lblsFloatHist.String() - - // 240 samples should m-map at least 1 chunk. - for ts := int64(1); ts <= 240; ts++ { - val := rand.Float64() - expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) - ref, err := app.Append(0, lbls, ts, val) - require.NoError(t, err) - - hist := histograms[int(ts)] - expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) - _, err = app.AppendHistogram(0, lblsHist, ts, hist, nil) - require.NoError(t, err) - - floatHist := floatHistogram[int(ts)] - expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) - _, err = app.AppendHistogram(0, lblsFloatHist, ts, nil, floatHist) - require.NoError(t, err) - - // Add an exemplar and to create multiple WAL records. - if ts%10 == 0 { - addExemplar(app, ref, lbls, ts) - require.NoError(t, app.Commit()) - app = head.Appender(context.Background()) - } - } - } - require.NoError(t, app.Commit()) - - // Add some tombstones. - var enc record.Encoder - for i := 1; i <= numSeries; i++ { - ref := storage.SeriesRef(i) - itvs := tombstones.Intervals{ - {Mint: 1234, Maxt: 2345}, - {Mint: 3456, Maxt: 4567}, - } - for _, itv := range itvs { - expTombstones[ref].Add(itv) - } - head.tombstones.AddInterval(ref, itvs...) - err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ - {Ref: ref, Intervals: itvs}, - }, nil)) - require.NoError(t, err) - } - } - - // These references should be the ones used for the snapshot. 
- wlast, woffset, err = head.wal.LastSegmentAndOffset() - require.NoError(t, err) - if woffset != 0 && woffset < 32*1024 { - // The page is always filled before taking the snapshot. - woffset = 32 * 1024 - } - - { - // Creating snapshot and verifying it. - head.opts.EnableMemorySnapshotOnShutdown = true - closeHeadAndCheckSnapshot() // This will create a snapshot. - - // Test the replay of snapshot. - openHeadAndCheckReplay() - } - - { // Additional data to only include in WAL and m-mapped chunks and not snapshot. This mimics having an old snapshot on disk. - // Add more samples. - app := head.Appender(context.Background()) - for i := 1; i <= numSeries; i++ { - lbls := labels.FromStrings("foo", fmt.Sprintf("bar%d", i)) - lblStr := lbls.String() - lblsHist := labels.FromStrings("hist", fmt.Sprintf("baz%d", i)) - lblsHistStr := lblsHist.String() - lblsFloatHist := labels.FromStrings("floathist", fmt.Sprintf("bat%d", i)) - lblsFloatHistStr := lblsFloatHist.String() - - // 240 samples should m-map at least 1 chunk. - for ts := int64(241); ts <= 480; ts++ { - val := rand.Float64() - expSeries[lblStr] = append(expSeries[lblStr], sample{0, ts, val, nil, nil}) - ref, err := app.Append(0, lbls, ts, val) - require.NoError(t, err) - - hist := histograms[int(ts)] - expHist[lblsHistStr] = append(expHist[lblsHistStr], sample{0, ts, 0, hist, nil}) - _, err = app.AppendHistogram(0, lblsHist, ts, hist, nil) - require.NoError(t, err) - - floatHist := floatHistogram[int(ts)] - expFloatHist[lblsFloatHistStr] = append(expFloatHist[lblsFloatHistStr], sample{0, ts, 0, nil, floatHist}) - _, err = app.AppendHistogram(0, lblsFloatHist, ts, nil, floatHist) - require.NoError(t, err) - - // Add an exemplar and to create multiple WAL records. - if ts%10 == 0 { - addExemplar(app, ref, lbls, ts) - require.NoError(t, app.Commit()) - app = head.Appender(context.Background()) - } - } - } - require.NoError(t, app.Commit()) - - // Add more tombstones. 
- var enc record.Encoder - for i := 1; i <= numSeries; i++ { - ref := storage.SeriesRef(i) - itvs := tombstones.Intervals{ - {Mint: 12345, Maxt: 23456}, - {Mint: 34567, Maxt: 45678}, - } - for _, itv := range itvs { - expTombstones[ref].Add(itv) - } - head.tombstones.AddInterval(ref, itvs...) - err := head.wal.Log(enc.Tombstones([]tombstones.Stone{ - {Ref: ref, Intervals: itvs}, - }, nil)) - require.NoError(t, err) - } - } - { - // Close Head and verify that new snapshot was not created. - head.opts.EnableMemorySnapshotOnShutdown = false - closeHeadAndCheckSnapshot() // This should not create a snapshot. - - // Test the replay of snapshot, m-map chunks, and WAL. - head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. - openHeadAndCheckReplay() - } - - // Creating another snapshot should delete the older snapshot and replay still works fine. - wlast, woffset, err = head.wal.LastSegmentAndOffset() - require.NoError(t, err) - if woffset != 0 && woffset < 32*1024 { - // The page is always filled before taking the snapshot. - woffset = 32 * 1024 - } - - { - // Close Head and verify that new snapshot was created. - closeHeadAndCheckSnapshot() - - // Verify that there is only 1 snapshot. - files, err := os.ReadDir(head.opts.ChunkDirRoot) - require.NoError(t, err) - snapshots := 0 - for i := len(files) - 1; i >= 0; i-- { - fi := files[i] - if strings.HasPrefix(fi.Name(), chunkSnapshotPrefix) { - snapshots++ - require.Equal(t, chunkSnapshotDir(wlast, woffset), fi.Name()) - } - } - require.Equal(t, 1, snapshots) - - // Test the replay of snapshot. - head.opts.EnableMemorySnapshotOnShutdown = true // Enabled to read from snapshot. - - // Disabling exemplars to check that it does not hard fail replay - // https://github.com/prometheus/prometheus/issues/9437#issuecomment-933285870. 
- head.opts.EnableExemplarStorage = false - head.opts.MaxExemplars.Store(0) - expExemplars = expExemplars[:0] - - openHeadAndCheckReplay() - - require.Equal(t, 0.0, prom_testutil.ToFloat64(head.metrics.snapshotReplayErrorTotal)) } } @@ -5375,70 +5392,74 @@ func TestAppendingDifferentEncodingToSameSeries(t *testing.T) { // Tests https://github.com/prometheus/prometheus/issues/9725. func TestChunkSnapshotReplayBug(t *testing.T) { - dir := t.TempDir() - wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compression.Snappy) - require.NoError(t, err) + for _, enableStStorage := range []bool{false, true} { + t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + dir := t.TempDir() + wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compression.Snappy) + require.NoError(t, err) - // Write few series records and samples such that the series references are not in order in the WAL - // for status_code="200". - var buf []byte - for i := 1; i <= 1000; i++ { - var ref chunks.HeadSeriesRef - if i <= 500 { - ref = chunks.HeadSeriesRef(i * 100) - } else { - ref = chunks.HeadSeriesRef((i - 500) * 50) - } - seriesRec := record.RefSeries{ - Ref: ref, - Labels: labels.FromStrings( - "__name__", "request_duration", - "status_code", "200", - "foo", fmt.Sprintf("baz%d", rand.Int()), - ), - } - // Add a sample so that the series is not garbage collected. - samplesRec := record.RefSample{Ref: ref, T: 1000, V: 1000} - var enc record.Encoder + // Write few series records and samples such that the series references are not in order in the WAL + // for status_code="200". 
+ var buf []byte + for i := 1; i <= 1000; i++ { + var ref chunks.HeadSeriesRef + if i <= 500 { + ref = chunks.HeadSeriesRef(i * 100) + } else { + ref = chunks.HeadSeriesRef((i - 500) * 50) + } + seriesRec := record.RefSeries{ + Ref: ref, + Labels: labels.FromStrings( + "__name__", "request_duration", + "status_code", "200", + "foo", fmt.Sprintf("baz%d", rand.Int()), + ), + } + // Add a sample so that the series is not garbage collected. + samplesRec := record.RefSample{Ref: ref, T: 1000, V: 1000} + enc := record.Encoder{EnableSTStorage: enableStStorage} - rec := enc.Series([]record.RefSeries{seriesRec}, buf) - buf = rec[:0] - require.NoError(t, wal.Log(rec)) - rec = enc.Samples([]record.RefSample{samplesRec}, buf) - buf = rec[:0] - require.NoError(t, wal.Log(rec)) + rec := enc.Series([]record.RefSeries{seriesRec}, buf) + buf = rec[:0] + require.NoError(t, wal.Log(rec)) + rec = enc.Samples([]record.RefSample{samplesRec}, buf) + buf = rec[:0] + require.NoError(t, wal.Log(rec)) + } + + // Write a corrupt snapshot to fail the replay on startup. + snapshotName := chunkSnapshotDir(0, 100) + cpdir := filepath.Join(dir, snapshotName) + require.NoError(t, os.MkdirAll(cpdir, 0o777)) + + err = os.WriteFile(filepath.Join(cpdir, "00000000"), []byte{1, 5, 3, 5, 6, 7, 4, 2, 2}, 0o777) + require.NoError(t, err) + + opts := DefaultHeadOptions() + opts.ChunkDirRoot = dir + opts.EnableMemorySnapshotOnShutdown = true + head, err := NewHead(nil, nil, wal, nil, opts, nil) + require.NoError(t, err) + require.NoError(t, head.Init(math.MinInt64)) + defer func() { + require.NoError(t, head.Close()) + }() + + // Snapshot replay should error out. + require.Equal(t, 1.0, prom_testutil.ToFloat64(head.metrics.snapshotReplayErrorTotal)) + + // Querying `request_duration{status_code!="200"}` should return no series since all of + // them have status_code="200". 
+ q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + series := query(t, q, + labels.MustNewMatcher(labels.MatchEqual, "__name__", "request_duration"), + labels.MustNewMatcher(labels.MatchNotEqual, "status_code", "200"), + ) + require.Empty(t, series, "there should be no series found") + }) } - - // Write a corrupt snapshot to fail the replay on startup. - snapshotName := chunkSnapshotDir(0, 100) - cpdir := filepath.Join(dir, snapshotName) - require.NoError(t, os.MkdirAll(cpdir, 0o777)) - - err = os.WriteFile(filepath.Join(cpdir, "00000000"), []byte{1, 5, 3, 5, 6, 7, 4, 2, 2}, 0o777) - require.NoError(t, err) - - opts := DefaultHeadOptions() - opts.ChunkDirRoot = dir - opts.EnableMemorySnapshotOnShutdown = true - head, err := NewHead(nil, nil, wal, nil, opts, nil) - require.NoError(t, err) - require.NoError(t, head.Init(math.MinInt64)) - defer func() { - require.NoError(t, head.Close()) - }() - - // Snapshot replay should error out. - require.Equal(t, 1.0, prom_testutil.ToFloat64(head.metrics.snapshotReplayErrorTotal)) - - // Querying `request_duration{status_code!="200"}` should return no series since all of - // them have status_code="200". 
- q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64) - require.NoError(t, err) - series := query(t, q, - labels.MustNewMatcher(labels.MatchEqual, "__name__", "request_duration"), - labels.MustNewMatcher(labels.MatchNotEqual, "status_code", "200"), - ) - require.Empty(t, series, "there should be no series found") } func TestChunkSnapshotTakenAfterIncompleteSnapshot(t *testing.T) { diff --git a/tsdb/head_wal.go b/tsdb/head_wal.go index 0581b9306e..6e9b80060c 100644 --- a/tsdb/head_wal.go +++ b/tsdb/head_wal.go @@ -169,7 +169,7 @@ func (h *Head) loadWAL(r *wlog.Reader, syms *labels.SymbolTable, multiRef map[ch return } decoded <- series - case record.Samples: + case record.Samples, record.SamplesV2: samples := h.wlReplaySamplesPool.Get()[:0] samples, err = dec.Samples(r.Record(), samples) if err != nil { @@ -798,7 +798,7 @@ func (h *Head) loadWBL(r *wlog.Reader, syms *labels.SymbolTable, multiRef map[ch var err error rec := r.Record() switch dec.Type(rec) { - case record.Samples: + case record.Samples, record.SamplesV2: samples := h.wlReplaySamplesPool.Get()[:0] samples, err = dec.Samples(rec, samples) if err != nil { @@ -1400,7 +1400,7 @@ func (h *Head) ChunkSnapshot() (*ChunkSnapshotStats, error) { // Assuming 100 bytes (overestimate) per exemplar, that's ~1MB. maxExemplarsPerRecord := 10000 batch := make([]record.RefExemplar, 0, maxExemplarsPerRecord) - enc := record.Encoder{} + enc := record.Encoder{EnableSTStorage: h.opts.EnableSTStorage} flushExemplars := func() error { if len(batch) == 0 { return nil diff --git a/tsdb/record/bench_test.go b/tsdb/record/bench_test.go new file mode 100644 index 0000000000..f65cb34ff3 --- /dev/null +++ b/tsdb/record/bench_test.go @@ -0,0 +1,207 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package record_test + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/prometheus/prometheus/tsdb/compression" + "github.com/prometheus/prometheus/tsdb/record" + "github.com/prometheus/prometheus/util/testrecord" +) + +func zeroOutSTs(samples []record.RefSample) []record.RefSample { + out := make([]record.RefSample, len(samples)) + for i := range samples { + out[i] = samples[i] + out[i].ST = 0 + } + return out +} + +func TestEncodeDecode(t *testing.T) { + for _, enableStStorage := range []bool{false, true} { + for _, tcase := range []testrecord.RefSamplesCase{ + testrecord.Realistic1000Samples, + testrecord.Realistic1000WithVariableSTSamples, + testrecord.Realistic1000WithConstSTSamples, + testrecord.WorstCase1000, + testrecord.WorstCase1000WithSTSamples, + } { + var ( + dec record.Decoder + buf []byte + enc = record.Encoder{EnableSTStorage: enableStStorage} + ) + + s := testrecord.GenTestRefSamplesCase(t, tcase) + + { + got, err := dec.Samples(enc.Samples(s, nil), nil) + require.NoError(t, err) + // if ST is off, we expect all STs to be zero + expected := s + if !enableStStorage { + expected = zeroOutSTs(s) + } + + require.Equal(t, expected, got) + } + + // With byte buffer (append!) 
+ { + buf = make([]byte, 10, 1e5) + got, err := dec.Samples(enc.Samples(s, buf)[10:], nil) + require.NoError(t, err) + + expected := s + if !enableStStorage { + expected = zeroOutSTs(s) + } + require.Equal(t, expected, got) + } + + // With sample slice + { + samples := make([]record.RefSample, 0, len(s)+1) + got, err := dec.Samples(enc.Samples(s, nil), samples) + require.NoError(t, err) + expected := s + if !enableStStorage { + expected = zeroOutSTs(s) + } + require.Equal(t, expected, got) + } + + // With compression. + { + buf := enc.Samples(s, nil) + + cEnc, err := compression.NewEncoder() + require.NoError(t, err) + buf, _, err = cEnc.Encode(compression.Zstd, buf, nil) + require.NoError(t, err) + + buf, err = compression.NewDecoder().Decode(compression.Zstd, buf, nil) + require.NoError(t, err) + + got, err := dec.Samples(buf, nil) + require.NoError(t, err) + expected := s + if !enableStStorage { + expected = zeroOutSTs(s) + } + require.Equal(t, expected, got) + } + } + } +} + +var ( + compressions = []compression.Type{compression.None, compression.Snappy, compression.Zstd} + dataCases = []testrecord.RefSamplesCase{ + testrecord.Realistic1000Samples, + testrecord.Realistic1000WithVariableSTSamples, + testrecord.Realistic1000WithConstSTSamples, + testrecord.WorstCase1000, + testrecord.WorstCase1000WithSTSamples, + } + UseV2 = true +) + +/* + export bench=encode-v2 && go test ./tsdb/record/... \ + -run '^$' -bench '^BenchmarkEncode_Samples' \ + -benchtime 5s -count 6 -cpu 2 -timeout 999m \ + | tee ${bench}.txt +*/ +func BenchmarkEncode_Samples(b *testing.B) { + for _, compr := range compressions { + for _, data := range dataCases { + b.Run(fmt.Sprintf("compr=%v/data=%v", compr, data), func(b *testing.B) { + var ( + samples = testrecord.GenTestRefSamplesCase(b, data) + enc = record.Encoder{EnableSTStorage: UseV2} + buf []byte + cBuf []byte + ) + + cEnc, err := compression.NewEncoder() + require.NoError(b, err) + + // Warm up. 
+ buf = enc.Samples(samples, buf[:0]) + cBuf, _, err = cEnc.Encode(compr, buf, cBuf[:0]) + require.NoError(b, err) + + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + buf = enc.Samples(samples, buf[:0]) + b.ReportMetric(float64(len(buf)), "B/rec") + + cBuf, _, _ = cEnc.Encode(compr, buf, cBuf[:0]) + b.ReportMetric(float64(len(cBuf)), "B/compressed-rec") + } + }) + } + } +} + +/* + export bench=decode-v2 && go test ./tsdb/record/... \ + -run '^$' -bench '^BenchmarkDecode_Samples' \ + -benchtime 5s -count 6 -cpu 2 -timeout 999m \ + | tee ${bench}.txt +*/ +func BenchmarkDecode_Samples(b *testing.B) { + for _, compr := range compressions { + for _, data := range dataCases { + b.Run(fmt.Sprintf("compr=%v/data=%v", compr, data), func(b *testing.B) { + var ( + samples = testrecord.GenTestRefSamplesCase(b, data) + enc = record.Encoder{EnableSTStorage: UseV2} + dec record.Decoder + cDec = compression.NewDecoder() + cBuf []byte + samplesBuf []record.RefSample + ) + + buf := enc.Samples(samples, nil) + + cEnc, err := compression.NewEncoder() + require.NoError(b, err) + + buf, _, err = cEnc.Encode(compr, buf, nil) + require.NoError(b, err) + + // Warm up. + cBuf, err = cDec.Decode(compr, buf, cBuf[:0]) + require.NoError(b, err) + samplesBuf, err = dec.Samples(cBuf, samplesBuf[:0]) + require.NoError(b, err) + + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + cBuf, _ = cDec.Decode(compr, buf, cBuf[:0]) + samplesBuf, _ = dec.Samples(cBuf, samplesBuf[:0]) + } + }) + } + } +} diff --git a/tsdb/record/record.go b/tsdb/record/record.go index 106b8e51bc..d03c8c62d3 100644 --- a/tsdb/record/record.go +++ b/tsdb/record/record.go @@ -58,6 +58,8 @@ const ( CustomBucketsHistogramSamples Type = 9 // CustomBucketsFloatHistogramSamples is used to match WAL records of type Float Histogram with custom buckets. 
CustomBucketsFloatHistogramSamples Type = 10 + // SamplesV2 is an enhanced sample record with an encoding scheme that allows storing float samples with timestamp and an optional ST per sample. + SamplesV2 Type = 11 ) func (rt Type) String() string { @@ -66,6 +68,8 @@ func (rt Type) String() string { return "series" case Samples: return "samples" + case SamplesV2: + return "samples-v2" case Tombstones: return "tombstones" case Exemplars: @@ -157,12 +161,12 @@ type RefSeries struct { Labels labels.Labels } -// RefSample is a timestamp/value pair associated with a reference to a series. +// RefSample is a timestamp/st/value struct associated with a reference to a series. // TODO(beorn7): Perhaps make this "polymorphic", including histogram and float-histogram pointers? Then get rid of RefHistogramSample. type RefSample struct { - Ref chunks.HeadSeriesRef - T int64 - V float64 + Ref chunks.HeadSeriesRef + ST, T int64 + V float64 } // RefMetadata is the metadata associated with a series ID. @@ -182,6 +186,7 @@ type RefExemplar struct { } // RefHistogramSample is a histogram. +// TODO(owilliams): Add support for ST. type RefHistogramSample struct { Ref chunks.HeadSeriesRef T int64 @@ -189,6 +194,7 @@ type RefHistogramSample struct { } // RefFloatHistogramSample is a float histogram. +// TODO(owilliams): Add support for ST. 
type RefFloatHistogramSample struct { Ref chunks.HeadSeriesRef T int64 @@ -220,7 +226,7 @@ func (*Decoder) Type(rec []byte) Type { return Unknown } switch t := Type(rec[0]); t { - case Series, Samples, Tombstones, Exemplars, MmapMarkers, Metadata, HistogramSamples, FloatHistogramSamples, CustomBucketsHistogramSamples, CustomBucketsFloatHistogramSamples: + case Series, Samples, SamplesV2, Tombstones, Exemplars, MmapMarkers, Metadata, HistogramSamples, FloatHistogramSamples, CustomBucketsHistogramSamples, CustomBucketsFloatHistogramSamples: return t } return Unknown @@ -311,12 +317,20 @@ func (d *Decoder) DecodeLabels(dec *encoding.Decbuf) labels.Labels { } // Samples appends samples in rec to the given slice. -func (*Decoder) Samples(rec []byte, samples []RefSample) ([]RefSample, error) { +func (d *Decoder) Samples(rec []byte, samples []RefSample) ([]RefSample, error) { dec := encoding.Decbuf{B: rec} - - if Type(dec.Byte()) != Samples { - return nil, errors.New("invalid record type") + switch typ := dec.Byte(); Type(typ) { + case Samples: + return d.samplesV1(&dec, samples) + case SamplesV2: + return d.samplesV2(&dec, samples) + default: + return nil, fmt.Errorf("invalid record type %v, expected Samples(2) or SamplesV2(11)", typ) } +} + +// samplesV1 appends samples in rec to the given slice, while ignoring ST information. +func (*Decoder) samplesV1(dec *encoding.Decbuf, samples []RefSample) ([]RefSample, error) { if dec.Len() == 0 { return samples, nil } @@ -349,6 +363,60 @@ func (*Decoder) Samples(rec []byte, samples []RefSample) ([]RefSample, error) { return samples, nil } +// samplesV2 appends samples in rec to the given slice using the V2 algorithm, +// which is more efficient and supports ST (See Encoder.samplesV2 definition). 
+func (*Decoder) samplesV2(dec *encoding.Decbuf, samples []RefSample) ([]RefSample, error) { + if dec.Len() == 0 { + return samples, nil + } + // Allow 1 byte for each varint and 8 for the value; the output slice must be at least that big. + if minSize := dec.Len() / (1 + 1 + 8); cap(samples) < minSize { + samples = make([]RefSample, 0, minSize) + } + var firstT, firstST int64 + for len(dec.B) > 0 && dec.Err() == nil { + var prev RefSample + var ref, t, ST int64 + var val uint64 + + if len(samples) == 0 { + ref = dec.Varint64() + firstT = dec.Varint64() + t = firstT + ST = dec.Varint64() + firstST = ST + } else { + prev = samples[len(samples)-1] + ref = int64(prev.Ref) + dec.Varint64() + t = firstT + dec.Varint64() + stMarker := dec.Byte() + switch stMarker { + case noST: + case sameST: + ST = prev.ST + default: + ST = firstST + dec.Varint64() + } + } + + val = dec.Be64() + samples = append(samples, RefSample{ + Ref: chunks.HeadSeriesRef(ref), + ST: ST, + T: t, + V: math.Float64frombits(val), + }) + } + + if dec.Err() != nil { + return nil, fmt.Errorf("decode error after %d samples: %w", len(samples), dec.Err()) + } + if len(dec.B) > 0 { + return nil, fmt.Errorf("unexpected %d bytes left in entry", len(dec.B)) + } + return samples, nil +} + // Tombstones appends tombstones in rec to the given slice. func (*Decoder) Tombstones(rec []byte, tstones []tombstones.Stone) ([]tombstones.Stone, error) { dec := encoding.Decbuf{B: rec} @@ -656,7 +724,11 @@ func DecodeFloatHistogram(buf *encoding.Decbuf, fh *histogram.FloatHistogram) { // Encoder encodes series, sample, and tombstones records. // The zero value is ready to use. -type Encoder struct{} +type Encoder struct { + // EnableSTStorage enables the SamplesV2 encoding, which is more efficient + // than V1 and supports start time per sample. + EnableSTStorage bool +} // Series appends the encoded series to b and returns the resulting slice. 
func (*Encoder) Series(series []RefSeries, b []byte) []byte { @@ -702,7 +774,16 @@ func EncodeLabels(buf *encoding.Encbuf, lbls labels.Labels) { } // Samples appends the encoded samples to b and returns the resulting slice. -func (*Encoder) Samples(samples []RefSample, b []byte) []byte { +// Depending on the ST existence it either writes Samples or SamplesWithST record. +func (e *Encoder) Samples(samples []RefSample, b []byte) []byte { + if e.EnableSTStorage { + return e.samplesV2(samples, b) + } + return e.samplesV1(samples, b) +} + +// Samples appends the encoded samples to b and returns the resulting slice. +func (*Encoder) samplesV1(samples []RefSample, b []byte) []byte { buf := encoding.Encbuf{B: b} buf.PutByte(byte(Samples)) @@ -725,6 +806,56 @@ func (*Encoder) Samples(samples []RefSample, b []byte) []byte { return buf.Get() } +const ( + // Start timestamp marker values for indicating trivial cases. + + noST byte = iota // Sample has no start time. + sameST // Sample timestamp exists and is the same as the start time of the previous series. + explicitST // Explicit start timestamp value, delta to first start time. +) + +// samplesV2 appends the encoded samples to b and returns the resulting slice +// using a more efficient per-sample delta encoding and allows for ST +// storage. +func (*Encoder) samplesV2(samples []RefSample, b []byte) []byte { + buf := encoding.Encbuf{B: b} + buf.PutByte(byte(SamplesV2)) + + if len(samples) == 0 { + return buf.Get() + } + + // Store first ref, timestamp, ST, and value. + first := samples[0] + buf.PutVarint64(int64(first.Ref)) + buf.PutVarint64(first.T) + buf.PutVarint64(first.ST) + buf.PutBE64(math.Float64bits(first.V)) + + // Subsequent values are delta to the immediate previous values, and in the + // case of start timestamp, use the marker byte to indicate what the value should + // be if it's one of the trivial cases. 
+ for i := 1; i < len(samples); i++ { + s := samples[i] + prev := samples[i-1] + + buf.PutVarint64(int64(s.Ref) - int64(prev.Ref)) + buf.PutVarint64(s.T - first.T) + + switch s.ST { + case 0: + buf.PutByte(noST) + case prev.ST: + buf.PutByte(sameST) + default: + buf.PutByte(explicitST) + buf.PutVarint64(s.ST - first.ST) + } + buf.PutBE64(math.Float64bits(s.V)) + } + return buf.Get() +} + // Tombstones appends the encoded tombstones to b and returns the resulting slice. func (*Encoder) Tombstones(tstones []tombstones.Stone, b []byte) []byte { buf := encoding.Encbuf{B: b} diff --git a/tsdb/record/record_test.go b/tsdb/record/record_test.go index 8ebd805d4d..ab4342c3a8 100644 --- a/tsdb/record/record_test.go +++ b/tsdb/record/record_test.go @@ -76,15 +76,63 @@ func TestRecord_EncodeDecode(t *testing.T) { require.NoError(t, err) require.Equal(t, metadata, decMetadata) + // Without ST. samples := []RefSample{ {Ref: 0, T: 12423423, V: 1.2345}, {Ref: 123, T: -1231, V: -123}, {Ref: 2, T: 0, V: 99999}, } - decSamples, err := dec.Samples(enc.Samples(samples, nil), nil) + encoded := enc.Samples(samples, nil) + require.Equal(t, Samples, dec.Type(encoded)) + decSamples, err := dec.Samples(encoded, nil) require.NoError(t, err) require.Equal(t, samples, decSamples) + enc = Encoder{EnableSTStorage: true} + // Without ST again, but with V1 encoder that enables SamplesV2. + samples = []RefSample{ + {Ref: 0, T: 12423423, V: 1.2345}, + {Ref: 123, T: -1231, V: -123}, + {Ref: 2, T: 0, V: 99999}, + } + encoded = enc.Samples(samples, nil) + require.Equal(t, SamplesV2, dec.Type(encoded)) + decSamples, err = dec.Samples(encoded, nil) + require.NoError(t, err) + require.Equal(t, samples, decSamples) + + // With ST. 
+ samplesWithST := []RefSample{ + {Ref: 0, T: 12423423, ST: 14, V: 1.2345}, + {Ref: 123, T: -1231, ST: 14, V: -123}, + {Ref: 2, T: 0, ST: 14, V: 99999}, + } + encoded = enc.Samples(samplesWithST, nil) + require.Equal(t, SamplesV2, dec.Type(encoded)) + decSamples, err = dec.Samples(encoded, nil) + require.NoError(t, err) + require.Equal(t, samplesWithST, decSamples) + + // With ST (ST[i] == T[i-1]). + samplesWithSTDelta := []RefSample{ + {Ref: 0, T: 12423400, ST: 12423300, V: 1.2345}, + {Ref: 123, T: 12423500, ST: 12423400, V: -123}, + {Ref: 2, T: 12423600, ST: 12423500, V: 99999}, + } + decSamples, err = dec.Samples(enc.Samples(samplesWithSTDelta, nil), nil) + require.NoError(t, err) + require.Equal(t, samplesWithSTDelta, decSamples) + + // With ST (ST[i] == ST[i-1]). + samplesWithConstST := []RefSample{ + {Ref: 0, T: 12423400, ST: 12423300, V: 1.2345}, + {Ref: 123, T: 12423500, ST: 12423300, V: -123}, + {Ref: 2, T: 12423600, ST: 12423300, V: 99999}, + } + decSamples, err = dec.Samples(enc.Samples(samplesWithConstST, nil), nil) + require.NoError(t, err) + require.Equal(t, samplesWithConstST, decSamples) + // Intervals get split up into single entries. So we don't get back exactly // what we put in. 
tstones := []tombstones.Stone{ @@ -227,252 +275,262 @@ func TestRecord_EncodeDecode(t *testing.T) { } func TestRecord_DecodeInvalidHistogramSchema(t *testing.T) { - for _, schema := range []int32{-100, 100} { - t.Run(fmt.Sprintf("schema=%d", schema), func(t *testing.T) { - var enc Encoder + for _, enableStStorage := range []bool{false, true} { + for _, schema := range []int32{-100, 100} { + t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableStStorage), func(t *testing.T) { + enc := Encoder{EnableSTStorage: enableStStorage} - var output bytes.Buffer - logger := promslog.New(&promslog.Config{Writer: &output}) - dec := NewDecoder(labels.NewSymbolTable(), logger) - histograms := []RefHistogramSample{ - { - Ref: 56, - T: 1234, - H: &histogram.Histogram{ - Count: 5, - ZeroCount: 2, - ZeroThreshold: 0.001, - Sum: 18.4 * rand.Float64(), - Schema: schema, - PositiveSpans: []histogram.Span{ - {Offset: 0, Length: 2}, - {Offset: 1, Length: 2}, + var output bytes.Buffer + logger := promslog.New(&promslog.Config{Writer: &output}) + dec := NewDecoder(labels.NewSymbolTable(), logger) + histograms := []RefHistogramSample{ + { + Ref: 56, + T: 1234, + H: &histogram.Histogram{ + Count: 5, + ZeroCount: 2, + ZeroThreshold: 0.001, + Sum: 18.4 * rand.Float64(), + Schema: schema, + PositiveSpans: []histogram.Span{ + {Offset: 0, Length: 2}, + {Offset: 1, Length: 2}, + }, + PositiveBuckets: []int64{1, 1, -1, 0}, }, - PositiveBuckets: []int64{1, 1, -1, 0}, }, - }, - } - histSamples, _ := enc.HistogramSamples(histograms, nil) - decHistograms, err := dec.HistogramSamples(histSamples, nil) - require.NoError(t, err) - require.Empty(t, decHistograms) - require.Contains(t, output.String(), "skipping histogram with unknown schema in WAL record") - }) + } + histSamples, _ := enc.HistogramSamples(histograms, nil) + decHistograms, err := dec.HistogramSamples(histSamples, nil) + require.NoError(t, err) + require.Empty(t, decHistograms) + require.Contains(t, output.String(), "skipping histogram 
with unknown schema in WAL record") + }) + } } } func TestRecord_DecodeInvalidFloatHistogramSchema(t *testing.T) { - for _, schema := range []int32{-100, 100} { - t.Run(fmt.Sprintf("schema=%d", schema), func(t *testing.T) { - var enc Encoder + for _, enableStStorage := range []bool{false, true} { + for _, schema := range []int32{-100, 100} { + t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableStStorage), func(t *testing.T) { + enc := Encoder{EnableSTStorage: enableStStorage} - var output bytes.Buffer - logger := promslog.New(&promslog.Config{Writer: &output}) - dec := NewDecoder(labels.NewSymbolTable(), logger) - histograms := []RefFloatHistogramSample{ - { - Ref: 56, - T: 1234, - FH: &histogram.FloatHistogram{ - Count: 5, - ZeroCount: 2, - ZeroThreshold: 0.001, - Sum: 18.4 * rand.Float64(), - Schema: schema, - PositiveSpans: []histogram.Span{ - {Offset: 0, Length: 2}, - {Offset: 1, Length: 2}, + var output bytes.Buffer + logger := promslog.New(&promslog.Config{Writer: &output}) + dec := NewDecoder(labels.NewSymbolTable(), logger) + histograms := []RefFloatHistogramSample{ + { + Ref: 56, + T: 1234, + FH: &histogram.FloatHistogram{ + Count: 5, + ZeroCount: 2, + ZeroThreshold: 0.001, + Sum: 18.4 * rand.Float64(), + Schema: schema, + PositiveSpans: []histogram.Span{ + {Offset: 0, Length: 2}, + {Offset: 1, Length: 2}, + }, + PositiveBuckets: []float64{1, 1, -1, 0}, }, - PositiveBuckets: []float64{1, 1, -1, 0}, }, - }, - } - histSamples, _ := enc.FloatHistogramSamples(histograms, nil) - decHistograms, err := dec.FloatHistogramSamples(histSamples, nil) - require.NoError(t, err) - require.Empty(t, decHistograms) - require.Contains(t, output.String(), "skipping histogram with unknown schema in WAL record") - }) + } + histSamples, _ := enc.FloatHistogramSamples(histograms, nil) + decHistograms, err := dec.FloatHistogramSamples(histSamples, nil) + require.NoError(t, err) + require.Empty(t, decHistograms) + require.Contains(t, output.String(), "skipping histogram with 
unknown schema in WAL record") + }) + } } } func TestRecord_DecodeTooHighResolutionHistogramSchema(t *testing.T) { - for _, schema := range []int32{9, 52} { - t.Run(fmt.Sprintf("schema=%d", schema), func(t *testing.T) { - var enc Encoder + for _, enableStStorage := range []bool{false, true} { + for _, schema := range []int32{9, 52} { + t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableStStorage), func(t *testing.T) { + enc := Encoder{EnableSTStorage: enableStStorage} - var output bytes.Buffer - logger := promslog.New(&promslog.Config{Writer: &output}) - dec := NewDecoder(labels.NewSymbolTable(), logger) - histograms := []RefHistogramSample{ - { - Ref: 56, - T: 1234, - H: &histogram.Histogram{ - Count: 5, - ZeroCount: 2, - ZeroThreshold: 0.001, - Sum: 18.4 * rand.Float64(), - Schema: schema, - PositiveSpans: []histogram.Span{ - {Offset: 0, Length: 2}, - {Offset: 1, Length: 2}, + var output bytes.Buffer + logger := promslog.New(&promslog.Config{Writer: &output}) + dec := NewDecoder(labels.NewSymbolTable(), logger) + histograms := []RefHistogramSample{ + { + Ref: 56, + T: 1234, + H: &histogram.Histogram{ + Count: 5, + ZeroCount: 2, + ZeroThreshold: 0.001, + Sum: 18.4 * rand.Float64(), + Schema: schema, + PositiveSpans: []histogram.Span{ + {Offset: 0, Length: 2}, + {Offset: 1, Length: 2}, + }, + PositiveBuckets: []int64{1, 1, -1, 0}, }, - PositiveBuckets: []int64{1, 1, -1, 0}, }, - }, - } - histSamples, _ := enc.HistogramSamples(histograms, nil) - decHistograms, err := dec.HistogramSamples(histSamples, nil) - require.NoError(t, err) - require.Len(t, decHistograms, 1) - require.Equal(t, histogram.ExponentialSchemaMax, decHistograms[0].H.Schema) - }) + } + histSamples, _ := enc.HistogramSamples(histograms, nil) + decHistograms, err := dec.HistogramSamples(histSamples, nil) + require.NoError(t, err) + require.Len(t, decHistograms, 1) + require.Equal(t, histogram.ExponentialSchemaMax, decHistograms[0].H.Schema) + }) + } } } func 
TestRecord_DecodeTooHighResolutionFloatHistogramSchema(t *testing.T) { - for _, schema := range []int32{9, 52} { - t.Run(fmt.Sprintf("schema=%d", schema), func(t *testing.T) { - var enc Encoder + for _, enableStStorage := range []bool{false, true} { + for _, schema := range []int32{9, 52} { + t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableStStorage), func(t *testing.T) { + enc := Encoder{EnableSTStorage: enableStStorage} - var output bytes.Buffer - logger := promslog.New(&promslog.Config{Writer: &output}) - dec := NewDecoder(labels.NewSymbolTable(), logger) - histograms := []RefFloatHistogramSample{ - { - Ref: 56, - T: 1234, - FH: &histogram.FloatHistogram{ - Count: 5, - ZeroCount: 2, - ZeroThreshold: 0.001, - Sum: 18.4 * rand.Float64(), - Schema: schema, - PositiveSpans: []histogram.Span{ - {Offset: 0, Length: 2}, - {Offset: 1, Length: 2}, + var output bytes.Buffer + logger := promslog.New(&promslog.Config{Writer: &output}) + dec := NewDecoder(labels.NewSymbolTable(), logger) + histograms := []RefFloatHistogramSample{ + { + Ref: 56, + T: 1234, + FH: &histogram.FloatHistogram{ + Count: 5, + ZeroCount: 2, + ZeroThreshold: 0.001, + Sum: 18.4 * rand.Float64(), + Schema: schema, + PositiveSpans: []histogram.Span{ + {Offset: 0, Length: 2}, + {Offset: 1, Length: 2}, + }, + PositiveBuckets: []float64{1, 1, -1, 0}, }, - PositiveBuckets: []float64{1, 1, -1, 0}, }, - }, - } - histSamples, _ := enc.FloatHistogramSamples(histograms, nil) - decHistograms, err := dec.FloatHistogramSamples(histSamples, nil) - require.NoError(t, err) - require.Len(t, decHistograms, 1) - require.Equal(t, histogram.ExponentialSchemaMax, decHistograms[0].FH.Schema) - }) + } + histSamples, _ := enc.FloatHistogramSamples(histograms, nil) + decHistograms, err := dec.FloatHistogramSamples(histSamples, nil) + require.NoError(t, err) + require.Len(t, decHistograms, 1) + require.Equal(t, histogram.ExponentialSchemaMax, decHistograms[0].FH.Schema) + }) + } } } // TestRecord_Corrupted ensures that 
corrupted records return the correct error. // Bugfix check for pull/521 and pull/523. func TestRecord_Corrupted(t *testing.T) { - var enc Encoder - dec := NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) + for _, enableStStorage := range []bool{false, true} { + enc := Encoder{EnableSTStorage: enableStStorage} + dec := NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) - t.Run("Test corrupted series record", func(t *testing.T) { - series := []RefSeries{ - { - Ref: 100, - Labels: labels.FromStrings("abc", "def", "123", "456"), - }, - } - - corrupted := enc.Series(series, nil)[:8] - _, err := dec.Series(corrupted, nil) - require.Equal(t, err, encoding.ErrInvalidSize) - }) - - t.Run("Test corrupted sample record", func(t *testing.T) { - samples := []RefSample{ - {Ref: 0, T: 12423423, V: 1.2345}, - } - - corrupted := enc.Samples(samples, nil)[:8] - _, err := dec.Samples(corrupted, nil) - require.ErrorIs(t, err, encoding.ErrInvalidSize) - }) - - t.Run("Test corrupted tombstone record", func(t *testing.T) { - tstones := []tombstones.Stone{ - {Ref: 123, Intervals: tombstones.Intervals{ - {Mint: -1000, Maxt: 1231231}, - {Mint: 5000, Maxt: 0}, - }}, - } - - corrupted := enc.Tombstones(tstones, nil)[:8] - _, err := dec.Tombstones(corrupted, nil) - require.Equal(t, err, encoding.ErrInvalidSize) - }) - - t.Run("Test corrupted exemplar record", func(t *testing.T) { - exemplars := []RefExemplar{ - {Ref: 0, T: 12423423, V: 1.2345, Labels: labels.FromStrings("trace_id", "asdf")}, - } - - corrupted := enc.Exemplars(exemplars, nil)[:8] - _, err := dec.Exemplars(corrupted, nil) - require.ErrorIs(t, err, encoding.ErrInvalidSize) - }) - - t.Run("Test corrupted metadata record", func(t *testing.T) { - meta := []RefMetadata{ - {Ref: 147, Type: uint8(Counter), Unit: "unit", Help: "help"}, - } - - corrupted := enc.Metadata(meta, nil)[:8] - _, err := dec.Metadata(corrupted, nil) - require.ErrorIs(t, err, encoding.ErrInvalidSize) - }) - - t.Run("Test corrupted histogram 
record", func(t *testing.T) { - histograms := []RefHistogramSample{ - { - Ref: 56, - T: 1234, - H: &histogram.Histogram{ - Count: 5, - ZeroCount: 2, - ZeroThreshold: 0.001, - Sum: 18.4 * rand.Float64(), - Schema: 1, - PositiveSpans: []histogram.Span{ - {Offset: 0, Length: 2}, - {Offset: 1, Length: 2}, - }, - PositiveBuckets: []int64{1, 1, -1, 0}, + t.Run("Test corrupted series record", func(t *testing.T) { + series := []RefSeries{ + { + Ref: 100, + Labels: labels.FromStrings("abc", "def", "123", "456"), }, - }, - { - Ref: 67, - T: 5678, - H: &histogram.Histogram{ - Count: 8, - ZeroThreshold: 0.001, - Sum: 35.5, - Schema: -53, - PositiveSpans: []histogram.Span{ - {Offset: 0, Length: 2}, - {Offset: 2, Length: 2}, - }, - PositiveBuckets: []int64{2, -1, 2, 0}, - CustomValues: []float64{0, 2, 4, 6, 8}, - }, - }, - } + } - corruptedHists, customBucketsHists := enc.HistogramSamples(histograms, nil) - corruptedHists = corruptedHists[:8] - corruptedCustomBucketsHists := enc.CustomBucketsHistogramSamples(customBucketsHists, nil) - corruptedCustomBucketsHists = corruptedCustomBucketsHists[:8] - _, err := dec.HistogramSamples(corruptedHists, nil) - require.ErrorIs(t, err, encoding.ErrInvalidSize) - _, err = dec.HistogramSamples(corruptedCustomBucketsHists, nil) - require.ErrorIs(t, err, encoding.ErrInvalidSize) - }) + corrupted := enc.Series(series, nil)[:8] + _, err := dec.Series(corrupted, nil) + require.Equal(t, err, encoding.ErrInvalidSize) + }) + + t.Run("Test corrupted sample record", func(t *testing.T) { + samples := []RefSample{ + {Ref: 0, T: 12423423, V: 1.2345}, + } + + corrupted := enc.Samples(samples, nil)[:8] + _, err := dec.Samples(corrupted, nil) + require.ErrorIs(t, err, encoding.ErrInvalidSize) + }) + + t.Run("Test corrupted tombstone record", func(t *testing.T) { + tstones := []tombstones.Stone{ + {Ref: 123, Intervals: tombstones.Intervals{ + {Mint: -1000, Maxt: 1231231}, + {Mint: 5000, Maxt: 0}, + }}, + } + + corrupted := enc.Tombstones(tstones, nil)[:8] + 
_, err := dec.Tombstones(corrupted, nil) + require.Equal(t, err, encoding.ErrInvalidSize) + }) + + t.Run("Test corrupted exemplar record", func(t *testing.T) { + exemplars := []RefExemplar{ + {Ref: 0, T: 12423423, V: 1.2345, Labels: labels.FromStrings("trace_id", "asdf")}, + } + + corrupted := enc.Exemplars(exemplars, nil)[:8] + _, err := dec.Exemplars(corrupted, nil) + require.ErrorIs(t, err, encoding.ErrInvalidSize) + }) + + t.Run("Test corrupted metadata record", func(t *testing.T) { + meta := []RefMetadata{ + {Ref: 147, Type: uint8(Counter), Unit: "unit", Help: "help"}, + } + + corrupted := enc.Metadata(meta, nil)[:8] + _, err := dec.Metadata(corrupted, nil) + require.ErrorIs(t, err, encoding.ErrInvalidSize) + }) + + t.Run("Test corrupted histogram record", func(t *testing.T) { + histograms := []RefHistogramSample{ + { + Ref: 56, + T: 1234, + H: &histogram.Histogram{ + Count: 5, + ZeroCount: 2, + ZeroThreshold: 0.001, + Sum: 18.4 * rand.Float64(), + Schema: 1, + PositiveSpans: []histogram.Span{ + {Offset: 0, Length: 2}, + {Offset: 1, Length: 2}, + }, + PositiveBuckets: []int64{1, 1, -1, 0}, + }, + }, + { + Ref: 67, + T: 5678, + H: &histogram.Histogram{ + Count: 8, + ZeroThreshold: 0.001, + Sum: 35.5, + Schema: -53, + PositiveSpans: []histogram.Span{ + {Offset: 0, Length: 2}, + {Offset: 2, Length: 2}, + }, + PositiveBuckets: []int64{2, -1, 2, 0}, + CustomValues: []float64{0, 2, 4, 6, 8}, + }, + }, + } + + corruptedHists, customBucketsHists := enc.HistogramSamples(histograms, nil) + corruptedHists = corruptedHists[:8] + corruptedCustomBucketsHists := enc.CustomBucketsHistogramSamples(customBucketsHists, nil) + corruptedCustomBucketsHists = corruptedCustomBucketsHists[:8] + _, err := dec.HistogramSamples(corruptedHists, nil) + require.ErrorIs(t, err, encoding.ErrInvalidSize) + _, err = dec.HistogramSamples(corruptedCustomBucketsHists, nil) + require.ErrorIs(t, err, encoding.ErrInvalidSize) + }) + } } func TestRecord_Type(t *testing.T) { @@ -487,6 +545,16 @@ func 
TestRecord_Type(t *testing.T) { recordType = dec.Type(enc.Samples(samples, nil)) require.Equal(t, Samples, recordType) + // With EnableSTStorage set, all Samples are V2 + enc = Encoder{EnableSTStorage: true} + samples = []RefSample{{Ref: 123, T: 12345, V: 1.2345}} + recordType = dec.Type(enc.Samples(samples, nil)) + require.Equal(t, SamplesV2, recordType) + + samplesST := []RefSample{{Ref: 123, ST: 1, T: 12345, V: 1.2345}} + recordType = dec.Type(enc.Samples(samplesST, nil)) + require.Equal(t, SamplesV2, recordType) + tstones := []tombstones.Stone{{Ref: 1, Intervals: tombstones.Intervals{{Mint: 1, Maxt: 2}}}} recordType = dec.Type(enc.Tombstones(tstones, nil)) require.Equal(t, Tombstones, recordType) @@ -716,24 +784,26 @@ func BenchmarkWAL_HistogramEncoding(b *testing.B) { make: initNHCBRefs, }, } { - for _, labelCount := range []int{0, 10, 50} { - for _, histograms := range []int{10, 100, 1000} { - for _, buckets := range []int{0, 1, 10, 100} { - b.Run(fmt.Sprintf("type=%s/labels=%d/histograms=%d/buckets=%d", maker.name, labelCount, histograms, buckets), func(b *testing.B) { - series, samples, nhcbs := maker.make(labelCount, histograms, buckets) - enc := Encoder{} - for b.Loop() { - var buf []byte - enc.Series(series, buf) - enc.Samples(samples, buf) - var leftOver []RefHistogramSample - _, leftOver = enc.HistogramSamples(nhcbs, buf) - if len(leftOver) > 0 { - enc.CustomBucketsHistogramSamples(leftOver, buf) + for _, enableStStorage := range []bool{false, true} { + for _, labelCount := range []int{0, 10, 50} { + for _, histograms := range []int{10, 100, 1000} { + for _, buckets := range []int{0, 1, 10, 100} { + b.Run(fmt.Sprintf("type=%s/labels=%d/histograms=%d/buckets=%d", maker.name, labelCount, histograms, buckets), func(b *testing.B) { + series, samples, nhcbs := maker.make(labelCount, histograms, buckets) + enc := Encoder{EnableSTStorage: enableStStorage} + for b.Loop() { + var buf []byte + enc.Series(series, buf) + enc.Samples(samples, buf) + var leftOver 
[]RefHistogramSample + _, leftOver = enc.HistogramSamples(nhcbs, buf) + if len(leftOver) > 0 { + enc.CustomBucketsHistogramSamples(leftOver, buf) + } + b.ReportMetric(float64(len(buf)), "recordBytes/ops") } - b.ReportMetric(float64(len(buf)), "recordBytes/ops") - } - }) + }) + } } } } diff --git a/tsdb/wlog/checkpoint.go b/tsdb/wlog/checkpoint.go index 6742141fbc..86a858e70a 100644 --- a/tsdb/wlog/checkpoint.go +++ b/tsdb/wlog/checkpoint.go @@ -92,7 +92,7 @@ const CheckpointPrefix = "checkpoint." // segmented format as the original WAL itself. // This makes it easy to read it through the WAL package and concatenate // it with the original WAL. -func Checkpoint(logger *slog.Logger, w *WL, from, to int, keep func(id chunks.HeadSeriesRef) bool, mint int64) (*CheckpointStats, error) { +func Checkpoint(logger *slog.Logger, w *WL, from, to int, keep func(id chunks.HeadSeriesRef) bool, mint int64, enableStStorage bool) (*CheckpointStats, error) { stats := &CheckpointStats{} var sgmReader io.ReadCloser @@ -156,7 +156,7 @@ func Checkpoint(logger *slog.Logger, w *WL, from, to int, keep func(id chunks.He metadata []record.RefMetadata st = labels.NewSymbolTable() // Needed for decoding; labels do not outlive this function. 
dec = record.NewDecoder(st, logger) - enc record.Encoder + enc = record.Encoder{EnableSTStorage: enableStStorage} buf []byte recs [][]byte @@ -190,7 +190,7 @@ func Checkpoint(logger *slog.Logger, w *WL, from, to int, keep func(id chunks.He stats.TotalSeries += len(series) stats.DroppedSeries += len(series) - len(repl) - case record.Samples: + case record.Samples, record.SamplesV2: samples, err = dec.Samples(rec, samples) if err != nil { return nil, fmt.Errorf("decode samples: %w", err) diff --git a/tsdb/wlog/checkpoint_test.go b/tsdb/wlog/checkpoint_test.go index 97ca2e768d..18a2c2d3dc 100644 --- a/tsdb/wlog/checkpoint_test.go +++ b/tsdb/wlog/checkpoint_test.go @@ -171,249 +171,255 @@ func TestCheckpoint(t *testing.T) { } } - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() + for _, enableStStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + dir := t.TempDir() - var enc record.Encoder - // Create a dummy segment to bump the initial number. - seg, err := CreateSegment(dir, 100) - require.NoError(t, err) - require.NoError(t, seg.Close()) - - // Manually create checkpoint for 99 and earlier. - w, err := New(nil, nil, filepath.Join(dir, "checkpoint.0099"), compress) - require.NoError(t, err) - - // Add some data we expect to be around later. - err = w.Log(enc.Series([]record.RefSeries{ - {Ref: 0, Labels: labels.FromStrings("a", "b", "c", "0")}, - {Ref: 1, Labels: labels.FromStrings("a", "b", "c", "1")}, - }, nil)) - require.NoError(t, err) - // Log an unknown record, that might have come from a future Prometheus version. - require.NoError(t, w.Log([]byte{255})) - require.NoError(t, w.Close()) - - // Start a WAL and write records to it as usual. 
- w, err = NewSize(nil, nil, dir, 128*1024, compress) - require.NoError(t, err) - - samplesInWAL, histogramsInWAL, floatHistogramsInWAL := 0, 0, 0 - var last int64 - for i := 0; ; i++ { - _, n, err := Segments(w.Dir()) + enc := record.Encoder{EnableSTStorage: enableStStorage} + // Create a dummy segment to bump the initial number. + seg, err := CreateSegment(dir, 100) require.NoError(t, err) - if n >= 106 { - break - } - // Write some series initially. - if i == 0 { - b := enc.Series([]record.RefSeries{ - {Ref: 2, Labels: labels.FromStrings("a", "b", "c", "2")}, - {Ref: 3, Labels: labels.FromStrings("a", "b", "c", "3")}, - {Ref: 4, Labels: labels.FromStrings("a", "b", "c", "4")}, - {Ref: 5, Labels: labels.FromStrings("a", "b", "c", "5")}, + require.NoError(t, seg.Close()) + + // Manually create checkpoint for 99 and earlier. + w, err := New(nil, nil, filepath.Join(dir, "checkpoint.0099"), compress) + require.NoError(t, err) + + // Add some data we expect to be around later. + err = w.Log(enc.Series([]record.RefSeries{ + {Ref: 0, Labels: labels.FromStrings("a", "b", "c", "0")}, + {Ref: 1, Labels: labels.FromStrings("a", "b", "c", "1")}, + }, nil)) + require.NoError(t, err) + // Log an unknown record, that might have come from a future Prometheus version. + require.NoError(t, w.Log([]byte{255})) + require.NoError(t, w.Close()) + + // Start a WAL and write records to it as usual. + w, err = NewSize(nil, nil, dir, 128*1024, compress) + require.NoError(t, err) + + samplesInWAL, histogramsInWAL, floatHistogramsInWAL := 0, 0, 0 + var last int64 + for i := 0; ; i++ { + _, n, err := Segments(w.Dir()) + require.NoError(t, err) + if n >= 106 { + break + } + // Write some series initially. 
+ if i == 0 { + b := enc.Series([]record.RefSeries{ + {Ref: 2, Labels: labels.FromStrings("a", "b", "c", "2")}, + {Ref: 3, Labels: labels.FromStrings("a", "b", "c", "3")}, + {Ref: 4, Labels: labels.FromStrings("a", "b", "c", "4")}, + {Ref: 5, Labels: labels.FromStrings("a", "b", "c", "5")}, + }, nil) + require.NoError(t, w.Log(b)) + + b = enc.Metadata([]record.RefMetadata{ + {Ref: 2, Unit: "unit", Help: "help"}, + {Ref: 3, Unit: "unit", Help: "help"}, + {Ref: 4, Unit: "unit", Help: "help"}, + {Ref: 5, Unit: "unit", Help: "help"}, + }, nil) + require.NoError(t, w.Log(b)) + } + // Write samples until the WAL has enough segments. + // Make them have drifting timestamps within a record to see that they + // get filtered properly. + b := enc.Samples([]record.RefSample{ + {Ref: 0, T: last, V: float64(i)}, + {Ref: 1, T: last + 10000, V: float64(i)}, + {Ref: 2, T: last + 20000, V: float64(i)}, + {Ref: 3, T: last + 30000, V: float64(i)}, + }, nil) + require.NoError(t, w.Log(b)) + samplesInWAL += 4 + h := makeHistogram(i) + b, _ = enc.HistogramSamples([]record.RefHistogramSample{ + {Ref: 0, T: last, H: h}, + {Ref: 1, T: last + 10000, H: h}, + {Ref: 2, T: last + 20000, H: h}, + {Ref: 3, T: last + 30000, H: h}, + }, nil) + require.NoError(t, w.Log(b)) + histogramsInWAL += 4 + cbh := makeCustomBucketHistogram(i) + b = enc.CustomBucketsHistogramSamples([]record.RefHistogramSample{ + {Ref: 0, T: last, H: cbh}, + {Ref: 1, T: last + 10000, H: cbh}, + {Ref: 2, T: last + 20000, H: cbh}, + {Ref: 3, T: last + 30000, H: cbh}, + }, nil) + require.NoError(t, w.Log(b)) + histogramsInWAL += 4 + fh := makeFloatHistogram(i) + b, _ = enc.FloatHistogramSamples([]record.RefFloatHistogramSample{ + {Ref: 0, T: last, FH: fh}, + {Ref: 1, T: last + 10000, FH: fh}, + {Ref: 2, T: last + 20000, FH: fh}, + {Ref: 3, T: last + 30000, FH: fh}, + }, nil) + require.NoError(t, w.Log(b)) + floatHistogramsInWAL += 4 + cbfh := makeCustomBucketFloatHistogram(i) + b = 
enc.CustomBucketsFloatHistogramSamples([]record.RefFloatHistogramSample{ + {Ref: 0, T: last, FH: cbfh}, + {Ref: 1, T: last + 10000, FH: cbfh}, + {Ref: 2, T: last + 20000, FH: cbfh}, + {Ref: 3, T: last + 30000, FH: cbfh}, + }, nil) + require.NoError(t, w.Log(b)) + floatHistogramsInWAL += 4 + + b = enc.Exemplars([]record.RefExemplar{ + {Ref: 1, T: last, V: float64(i), Labels: labels.FromStrings("trace_id", fmt.Sprintf("trace-%d", i))}, }, nil) require.NoError(t, w.Log(b)) + // Write changing metadata for each series. In the end, only the latest + // version should end up in the checkpoint. b = enc.Metadata([]record.RefMetadata{ - {Ref: 2, Unit: "unit", Help: "help"}, - {Ref: 3, Unit: "unit", Help: "help"}, - {Ref: 4, Unit: "unit", Help: "help"}, - {Ref: 5, Unit: "unit", Help: "help"}, + {Ref: 0, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, + {Ref: 1, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, + {Ref: 2, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, + {Ref: 3, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, }, nil) require.NoError(t, w.Log(b)) + + last += 100 } - // Write samples until the WAL has enough segments. - // Make them have drifting timestamps within a record to see that they - // get filtered properly. 
- b := enc.Samples([]record.RefSample{ - {Ref: 0, T: last, V: float64(i)}, - {Ref: 1, T: last + 10000, V: float64(i)}, - {Ref: 2, T: last + 20000, V: float64(i)}, - {Ref: 3, T: last + 30000, V: float64(i)}, - }, nil) - require.NoError(t, w.Log(b)) - samplesInWAL += 4 - h := makeHistogram(i) - b, _ = enc.HistogramSamples([]record.RefHistogramSample{ - {Ref: 0, T: last, H: h}, - {Ref: 1, T: last + 10000, H: h}, - {Ref: 2, T: last + 20000, H: h}, - {Ref: 3, T: last + 30000, H: h}, - }, nil) - require.NoError(t, w.Log(b)) - histogramsInWAL += 4 - cbh := makeCustomBucketHistogram(i) - b = enc.CustomBucketsHistogramSamples([]record.RefHistogramSample{ - {Ref: 0, T: last, H: cbh}, - {Ref: 1, T: last + 10000, H: cbh}, - {Ref: 2, T: last + 20000, H: cbh}, - {Ref: 3, T: last + 30000, H: cbh}, - }, nil) - require.NoError(t, w.Log(b)) - histogramsInWAL += 4 - fh := makeFloatHistogram(i) - b, _ = enc.FloatHistogramSamples([]record.RefFloatHistogramSample{ - {Ref: 0, T: last, FH: fh}, - {Ref: 1, T: last + 10000, FH: fh}, - {Ref: 2, T: last + 20000, FH: fh}, - {Ref: 3, T: last + 30000, FH: fh}, - }, nil) - require.NoError(t, w.Log(b)) - floatHistogramsInWAL += 4 - cbfh := makeCustomBucketFloatHistogram(i) - b = enc.CustomBucketsFloatHistogramSamples([]record.RefFloatHistogramSample{ - {Ref: 0, T: last, FH: cbfh}, - {Ref: 1, T: last + 10000, FH: cbfh}, - {Ref: 2, T: last + 20000, FH: cbfh}, - {Ref: 3, T: last + 30000, FH: cbfh}, - }, nil) - require.NoError(t, w.Log(b)) - floatHistogramsInWAL += 4 + require.NoError(t, w.Close()) - b = enc.Exemplars([]record.RefExemplar{ - {Ref: 1, T: last, V: float64(i), Labels: labels.FromStrings("trace_id", fmt.Sprintf("trace-%d", i))}, - }, nil) - require.NoError(t, w.Log(b)) + stats, err := Checkpoint(promslog.NewNopLogger(), w, 100, 106, func(x chunks.HeadSeriesRef) bool { + return x%2 == 0 + }, last/2, enableStStorage) + require.NoError(t, err) + require.NoError(t, w.Truncate(107)) + require.NoError(t, DeleteCheckpoints(w.Dir(), 106)) + 
require.Equal(t, histogramsInWAL+floatHistogramsInWAL+samplesInWAL, stats.TotalSamples) + require.Positive(t, stats.DroppedSamples) - // Write changing metadata for each series. In the end, only the latest - // version should end up in the checkpoint. - b = enc.Metadata([]record.RefMetadata{ - {Ref: 0, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, - {Ref: 1, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, - {Ref: 2, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, - {Ref: 3, Unit: strconv.FormatInt(last, 10), Help: strconv.FormatInt(last, 10)}, - }, nil) - require.NoError(t, w.Log(b)) + // Only the new checkpoint should be left. + files, err := os.ReadDir(dir) + require.NoError(t, err) + require.Len(t, files, 1) + require.Equal(t, "checkpoint.00000106", files[0].Name()) - last += 100 - } - require.NoError(t, w.Close()) + sr, err := NewSegmentsReader(filepath.Join(dir, "checkpoint.00000106")) + require.NoError(t, err) + defer sr.Close() - stats, err := Checkpoint(promslog.NewNopLogger(), w, 100, 106, func(x chunks.HeadSeriesRef) bool { - return x%2 == 0 - }, last/2) - require.NoError(t, err) - require.NoError(t, w.Truncate(107)) - require.NoError(t, DeleteCheckpoints(w.Dir(), 106)) - require.Equal(t, histogramsInWAL+floatHistogramsInWAL+samplesInWAL, stats.TotalSamples) - require.Positive(t, stats.DroppedSamples) + dec := record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) + var series []record.RefSeries + var metadata []record.RefMetadata + r := NewReader(sr) - // Only the new checkpoint should be left. 
- files, err := os.ReadDir(dir) - require.NoError(t, err) - require.Len(t, files, 1) - require.Equal(t, "checkpoint.00000106", files[0].Name()) + samplesInCheckpoint, histogramsInCheckpoint, floatHistogramsInCheckpoint := 0, 0, 0 + for r.Next() { + rec := r.Record() - sr, err := NewSegmentsReader(filepath.Join(dir, "checkpoint.00000106")) - require.NoError(t, err) - defer sr.Close() - - dec := record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) - var series []record.RefSeries - var metadata []record.RefMetadata - r := NewReader(sr) - - samplesInCheckpoint, histogramsInCheckpoint, floatHistogramsInCheckpoint := 0, 0, 0 - for r.Next() { - rec := r.Record() - - switch dec.Type(rec) { - case record.Series: - series, err = dec.Series(rec, series) - require.NoError(t, err) - case record.Samples: - samples, err := dec.Samples(rec, nil) - require.NoError(t, err) - for _, s := range samples { - require.GreaterOrEqual(t, s.T, last/2, "sample with wrong timestamp") + switch dec.Type(rec) { + case record.Series: + series, err = dec.Series(rec, series) + require.NoError(t, err) + case record.Samples, record.SamplesV2: + samples, err := dec.Samples(rec, nil) + require.NoError(t, err) + for _, s := range samples { + require.GreaterOrEqual(t, s.T, last/2, "sample with wrong timestamp") + } + samplesInCheckpoint += len(samples) + case record.HistogramSamples, record.CustomBucketsHistogramSamples: + histograms, err := dec.HistogramSamples(rec, nil) + require.NoError(t, err) + for _, h := range histograms { + require.GreaterOrEqual(t, h.T, last/2, "histogram with wrong timestamp") + } + histogramsInCheckpoint += len(histograms) + case record.FloatHistogramSamples, record.CustomBucketsFloatHistogramSamples: + floatHistograms, err := dec.FloatHistogramSamples(rec, nil) + require.NoError(t, err) + for _, h := range floatHistograms { + require.GreaterOrEqual(t, h.T, last/2, "float histogram with wrong timestamp") + } + floatHistogramsInCheckpoint += len(floatHistograms) + 
case record.Exemplars: + exemplars, err := dec.Exemplars(rec, nil) + require.NoError(t, err) + for _, e := range exemplars { + require.GreaterOrEqual(t, e.T, last/2, "exemplar with wrong timestamp") + } + case record.Metadata: + metadata, err = dec.Metadata(rec, metadata) + require.NoError(t, err) } - samplesInCheckpoint += len(samples) - case record.HistogramSamples, record.CustomBucketsHistogramSamples: - histograms, err := dec.HistogramSamples(rec, nil) - require.NoError(t, err) - for _, h := range histograms { - require.GreaterOrEqual(t, h.T, last/2, "histogram with wrong timestamp") - } - histogramsInCheckpoint += len(histograms) - case record.FloatHistogramSamples, record.CustomBucketsFloatHistogramSamples: - floatHistograms, err := dec.FloatHistogramSamples(rec, nil) - require.NoError(t, err) - for _, h := range floatHistograms { - require.GreaterOrEqual(t, h.T, last/2, "float histogram with wrong timestamp") - } - floatHistogramsInCheckpoint += len(floatHistograms) - case record.Exemplars: - exemplars, err := dec.Exemplars(rec, nil) - require.NoError(t, err) - for _, e := range exemplars { - require.GreaterOrEqual(t, e.T, last/2, "exemplar with wrong timestamp") - } - case record.Metadata: - metadata, err = dec.Metadata(rec, metadata) - require.NoError(t, err) } - } - require.NoError(t, r.Err()) - // Making sure we replayed some samples. We expect >50% samples to be still present. - require.Greater(t, float64(samplesInCheckpoint)/float64(samplesInWAL), 0.5) - require.Less(t, float64(samplesInCheckpoint)/float64(samplesInWAL), 0.8) - require.Greater(t, float64(histogramsInCheckpoint)/float64(histogramsInWAL), 0.5) - require.Less(t, float64(histogramsInCheckpoint)/float64(histogramsInWAL), 0.8) - require.Greater(t, float64(floatHistogramsInCheckpoint)/float64(floatHistogramsInWAL), 0.5) - require.Less(t, float64(floatHistogramsInCheckpoint)/float64(floatHistogramsInWAL), 0.8) + require.NoError(t, r.Err()) + // Making sure we replayed some samples. 
We expect >50% samples to be still present. + require.Greater(t, float64(samplesInCheckpoint)/float64(samplesInWAL), 0.5) + require.Less(t, float64(samplesInCheckpoint)/float64(samplesInWAL), 0.8) + require.Greater(t, float64(histogramsInCheckpoint)/float64(histogramsInWAL), 0.5) + require.Less(t, float64(histogramsInCheckpoint)/float64(histogramsInWAL), 0.8) + require.Greater(t, float64(floatHistogramsInCheckpoint)/float64(floatHistogramsInWAL), 0.5) + require.Less(t, float64(floatHistogramsInCheckpoint)/float64(floatHistogramsInWAL), 0.8) - expectedRefSeries := []record.RefSeries{ - {Ref: 0, Labels: labels.FromStrings("a", "b", "c", "0")}, - {Ref: 2, Labels: labels.FromStrings("a", "b", "c", "2")}, - {Ref: 4, Labels: labels.FromStrings("a", "b", "c", "4")}, - } - testutil.RequireEqual(t, expectedRefSeries, series) + expectedRefSeries := []record.RefSeries{ + {Ref: 0, Labels: labels.FromStrings("a", "b", "c", "0")}, + {Ref: 2, Labels: labels.FromStrings("a", "b", "c", "2")}, + {Ref: 4, Labels: labels.FromStrings("a", "b", "c", "4")}, + } + testutil.RequireEqual(t, expectedRefSeries, series) - expectedRefMetadata := []record.RefMetadata{ - {Ref: 0, Unit: strconv.FormatInt(last-100, 10), Help: strconv.FormatInt(last-100, 10)}, - {Ref: 2, Unit: strconv.FormatInt(last-100, 10), Help: strconv.FormatInt(last-100, 10)}, - {Ref: 4, Unit: "unit", Help: "help"}, - } - sort.Slice(metadata, func(i, j int) bool { return metadata[i].Ref < metadata[j].Ref }) - require.Equal(t, expectedRefMetadata, metadata) - }) + expectedRefMetadata := []record.RefMetadata{ + {Ref: 0, Unit: strconv.FormatInt(last-100, 10), Help: strconv.FormatInt(last-100, 10)}, + {Ref: 2, Unit: strconv.FormatInt(last-100, 10), Help: strconv.FormatInt(last-100, 10)}, + {Ref: 4, Unit: "unit", Help: "help"}, + } + sort.Slice(metadata, func(i, j int) bool { return metadata[i].Ref < metadata[j].Ref }) + require.Equal(t, expectedRefMetadata, metadata) + }) + } } } func TestCheckpointNoTmpFolderAfterError(t 
*testing.T) { - // Create a new wlog with invalid data. - dir := t.TempDir() - w, err := NewSize(nil, nil, dir, 64*1024, compression.None) - require.NoError(t, err) - var enc record.Encoder - require.NoError(t, w.Log(enc.Series([]record.RefSeries{ - {Ref: 0, Labels: labels.FromStrings("a", "b", "c", "2")}, - }, nil))) - require.NoError(t, w.Close()) + for _, enableStStorage := range []bool{false, true} { + t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + // Create a new wlog with invalid data. + dir := t.TempDir() + w, err := NewSize(nil, nil, dir, 64*1024, compression.None) + require.NoError(t, err) + enc := record.Encoder{EnableSTStorage: enableStStorage} + require.NoError(t, w.Log(enc.Series([]record.RefSeries{ + {Ref: 0, Labels: labels.FromStrings("a", "b", "c", "2")}, + }, nil))) + require.NoError(t, w.Close()) - // Corrupt data. - f, err := os.OpenFile(filepath.Join(w.Dir(), "00000000"), os.O_WRONLY, 0o666) - require.NoError(t, err) - _, err = f.WriteAt([]byte{42}, 1) - require.NoError(t, err) - require.NoError(t, f.Close()) + // Corrupt data. + f, err := os.OpenFile(filepath.Join(w.Dir(), "00000000"), os.O_WRONLY, 0o666) + require.NoError(t, err) + _, err = f.WriteAt([]byte{42}, 1) + require.NoError(t, err) + require.NoError(t, f.Close()) - // Run the checkpoint and since the wlog contains corrupt data this should return an error. - _, err = Checkpoint(promslog.NewNopLogger(), w, 0, 1, nil, 0) - require.Error(t, err) + // Run the checkpoint and since the wlog contains corrupt data this should return an error. + _, err = Checkpoint(promslog.NewNopLogger(), w, 0, 1, nil, 0, enableStStorage) + require.Error(t, err) - // Walk the wlog dir to make sure there are no tmp folder left behind after the error. 
- err = filepath.Walk(w.Dir(), func(path string, info os.FileInfo, err error) error { - if err != nil { - return fmt.Errorf("access err %q: %w", path, err) - } - if info.IsDir() && strings.HasSuffix(info.Name(), ".tmp") { - return fmt.Errorf("wlog dir contains temporary folder:%s", info.Name()) - } - return nil - }) - require.NoError(t, err) + // Walk the wlog dir to make sure there are no tmp folder left behind after the error. + err = filepath.Walk(w.Dir(), func(path string, info os.FileInfo, err error) error { + if err != nil { + return fmt.Errorf("access err %q: %w", path, err) + } + if info.IsDir() && strings.HasSuffix(info.Name(), ".tmp") { + return fmt.Errorf("wlog dir contains temporary folder:%s", info.Name()) + } + return nil + }) + require.NoError(t, err) + }) + } } diff --git a/tsdb/wlog/watcher.go b/tsdb/wlog/watcher.go index a841a44fc8..83453463eb 100644 --- a/tsdb/wlog/watcher.go +++ b/tsdb/wlog/watcher.go @@ -519,7 +519,7 @@ func (w *Watcher) readSegment(r *LiveReader, segmentNum int, tail bool) error { } w.writer.StoreSeries(series, segmentNum) - case record.Samples: + case record.Samples, record.SamplesV2: // If we're not tailing a segment we can ignore any samples records we see. // This speeds up replay of the WAL by > 10x. 
if !tail { diff --git a/tsdb/wlog/watcher_test.go b/tsdb/wlog/watcher_test.go index b9a6504298..e29aac4d47 100644 --- a/tsdb/wlog/watcher_test.go +++ b/tsdb/wlog/watcher_test.go @@ -17,6 +17,7 @@ import ( "math/rand" "os" "path" + "path/filepath" "runtime" "sync" "testing" @@ -144,145 +145,147 @@ func TestTailSamples(t *testing.T) { const exemplarsCount = 25 const histogramsCount = 50 for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - now := time.Now() + for _, enableStStorage := range []bool{false, true} { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + now := time.Now() - dir := t.TempDir() + dir := t.TempDir() - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) - require.NoError(t, err) - - enc := record.Encoder{} - w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) - require.NoError(t, err) - defer func() { - require.NoError(t, w.Close()) - }() - - // Write to the initial segment then checkpoint. 
- for i := range seriesCount { - ref := i + 100 - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(ref), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - require.NoError(t, w.Log(series)) - - for range samplesCount { - inner := rand.Intn(ref + 1) - sample := enc.Samples([]record.RefSample{ - { - Ref: chunks.HeadSeriesRef(inner), - T: now.UnixNano() + 1, - V: float64(i), - }, - }, nil) - require.NoError(t, w.Log(sample)) - } - - for range exemplarsCount { - inner := rand.Intn(ref + 1) - exemplar := enc.Exemplars([]record.RefExemplar{ - { - Ref: chunks.HeadSeriesRef(inner), - T: now.UnixNano() + 1, - V: float64(i), - Labels: labels.FromStrings("trace_id", fmt.Sprintf("trace-%d", inner)), - }, - }, nil) - require.NoError(t, w.Log(exemplar)) - } - - for range histogramsCount { - inner := rand.Intn(ref + 1) - hist := &histogram.Histogram{ - Schema: 2, - ZeroThreshold: 1e-128, - ZeroCount: 0, - Count: 2, - Sum: 0, - PositiveSpans: []histogram.Span{{Offset: 0, Length: 1}}, - PositiveBuckets: []int64{int64(i) + 1}, - NegativeSpans: []histogram.Span{{Offset: 0, Length: 1}}, - NegativeBuckets: []int64{int64(-i) - 1}, - } - - histograms, _ := enc.HistogramSamples([]record.RefHistogramSample{{ - Ref: chunks.HeadSeriesRef(inner), - T: now.UnixNano() + 1, - H: hist, - }}, nil) - require.NoError(t, w.Log(histograms)) - - customBucketHist := &histogram.Histogram{ - Schema: -53, - ZeroThreshold: 1e-128, - ZeroCount: 0, - Count: 2, - Sum: 0, - PositiveSpans: []histogram.Span{{Offset: 0, Length: 1}}, - CustomValues: []float64{float64(i) + 2}, - } - - customBucketHistograms := enc.CustomBucketsHistogramSamples([]record.RefHistogramSample{{ - Ref: chunks.HeadSeriesRef(inner), - T: now.UnixNano() + 1, - H: customBucketHist, - }}, nil) - require.NoError(t, w.Log(customBucketHistograms)) - - floatHistograms, _ := enc.FloatHistogramSamples([]record.RefFloatHistogramSample{{ - Ref: chunks.HeadSeriesRef(inner), - T: 
now.UnixNano() + 1, - FH: hist.ToFloat(nil), - }}, nil) - require.NoError(t, w.Log(floatHistograms)) - - customBucketFloatHistograms := enc.CustomBucketsFloatHistogramSamples([]record.RefFloatHistogramSample{{ - Ref: chunks.HeadSeriesRef(inner), - T: now.UnixNano() + 1, - FH: customBucketHist.ToFloat(nil), - }}, nil) - require.NoError(t, w.Log(customBucketFloatHistograms)) - } - } - - // Start read after checkpoint, no more data written. - first, last, err := Segments(w.Dir()) - require.NoError(t, err) - - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, true, true, true) - watcher.SetStartTime(now) - - // Set the Watcher's metrics so they're not nil pointers. - watcher.SetMetrics() - for i := first; i <= last; i++ { - segment, err := OpenReadSegment(SegmentName(watcher.walDir, i)) + wdir := path.Join(dir, "wal") + err := os.Mkdir(wdir, 0o777) require.NoError(t, err) - reader := NewLiveReader(nil, NewLiveReaderMetrics(nil), segment) - // Use tail true so we can ensure we got the right number of samples. - watcher.readSegment(reader, i, true) - require.NoError(t, segment.Close()) - } + enc := record.Encoder{EnableSTStorage: enableStStorage} + w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) + require.NoError(t, err) + defer func() { + require.NoError(t, w.Close()) + }() - expectedSeries := seriesCount - expectedSamples := seriesCount * samplesCount - expectedExemplars := seriesCount * exemplarsCount - expectedHistograms := seriesCount * histogramsCount * 2 - retry(t, defaultRetryInterval, defaultRetries, func() bool { - return wt.checkNumSeries() >= expectedSeries + // Write to the initial segment then checkpoint. 
+ for i := range seriesCount { + ref := i + 100 + series := enc.Series([]record.RefSeries{ + { + Ref: chunks.HeadSeriesRef(ref), + Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), + }, + }, nil) + require.NoError(t, w.Log(series)) + + for range samplesCount { + inner := rand.Intn(ref + 1) + sample := enc.Samples([]record.RefSample{ + { + Ref: chunks.HeadSeriesRef(inner), + T: now.UnixNano() + 1, + V: float64(i), + }, + }, nil) + require.NoError(t, w.Log(sample)) + } + + for range exemplarsCount { + inner := rand.Intn(ref + 1) + exemplar := enc.Exemplars([]record.RefExemplar{ + { + Ref: chunks.HeadSeriesRef(inner), + T: now.UnixNano() + 1, + V: float64(i), + Labels: labels.FromStrings("trace_id", fmt.Sprintf("trace-%d", inner)), + }, + }, nil) + require.NoError(t, w.Log(exemplar)) + } + + for range histogramsCount { + inner := rand.Intn(ref + 1) + hist := &histogram.Histogram{ + Schema: 2, + ZeroThreshold: 1e-128, + ZeroCount: 0, + Count: 2, + Sum: 0, + PositiveSpans: []histogram.Span{{Offset: 0, Length: 1}}, + PositiveBuckets: []int64{int64(i) + 1}, + NegativeSpans: []histogram.Span{{Offset: 0, Length: 1}}, + NegativeBuckets: []int64{int64(-i) - 1}, + } + + histograms, _ := enc.HistogramSamples([]record.RefHistogramSample{{ + Ref: chunks.HeadSeriesRef(inner), + T: now.UnixNano() + 1, + H: hist, + }}, nil) + require.NoError(t, w.Log(histograms)) + + customBucketHist := &histogram.Histogram{ + Schema: -53, + ZeroThreshold: 1e-128, + ZeroCount: 0, + Count: 2, + Sum: 0, + PositiveSpans: []histogram.Span{{Offset: 0, Length: 1}}, + CustomValues: []float64{float64(i) + 2}, + } + + customBucketHistograms := enc.CustomBucketsHistogramSamples([]record.RefHistogramSample{{ + Ref: chunks.HeadSeriesRef(inner), + T: now.UnixNano() + 1, + H: customBucketHist, + }}, nil) + require.NoError(t, w.Log(customBucketHistograms)) + + floatHistograms, _ := enc.FloatHistogramSamples([]record.RefFloatHistogramSample{{ + Ref: chunks.HeadSeriesRef(inner), + T: 
now.UnixNano() + 1, + FH: hist.ToFloat(nil), + }}, nil) + require.NoError(t, w.Log(floatHistograms)) + + customBucketFloatHistograms := enc.CustomBucketsFloatHistogramSamples([]record.RefFloatHistogramSample{{ + Ref: chunks.HeadSeriesRef(inner), + T: now.UnixNano() + 1, + FH: customBucketHist.ToFloat(nil), + }}, nil) + require.NoError(t, w.Log(customBucketFloatHistograms)) + } + } + + // Start read after checkpoint, no more data written. + first, last, err := Segments(w.Dir()) + require.NoError(t, err) + + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, true, true, true) + watcher.SetStartTime(now) + + // Set the Watcher's metrics so they're not nil pointers. + watcher.SetMetrics() + for i := first; i <= last; i++ { + segment, err := OpenReadSegment(SegmentName(watcher.walDir, i)) + require.NoError(t, err) + + reader := NewLiveReader(nil, NewLiveReaderMetrics(nil), segment) + // Use tail true so we can ensure we got the right number of samples. + watcher.readSegment(reader, i, true) + require.NoError(t, segment.Close()) + } + + expectedSeries := seriesCount + expectedSamples := seriesCount * samplesCount + expectedExemplars := seriesCount * exemplarsCount + expectedHistograms := seriesCount * histogramsCount * 2 + retry(t, defaultRetryInterval, defaultRetries, func() bool { + return wt.checkNumSeries() >= expectedSeries + }) + require.Equal(t, expectedSeries, wt.checkNumSeries(), "did not receive the expected number of series") + require.Equal(t, expectedSamples, wt.samplesAppended, "did not receive the expected number of samples") + require.Equal(t, expectedExemplars, wt.exemplarsAppended, "did not receive the expected number of exemplars") + require.Equal(t, expectedHistograms, wt.histogramsAppended, "did not receive the expected number of histograms") + require.Equal(t, expectedHistograms, wt.floatHistogramsAppended, "did not receive the expected number of float histograms") }) - require.Equal(t, expectedSeries, 
wt.checkNumSeries(), "did not receive the expected number of series") - require.Equal(t, expectedSamples, wt.samplesAppended, "did not receive the expected number of samples") - require.Equal(t, expectedExemplars, wt.exemplarsAppended, "did not receive the expected number of exemplars") - require.Equal(t, expectedHistograms, wt.histogramsAppended, "did not receive the expected number of histograms") - require.Equal(t, expectedHistograms, wt.floatHistogramsAppended, "did not receive the expected number of float histograms") - }) + } } } @@ -291,64 +294,66 @@ func TestReadToEndNoCheckpoint(t *testing.T) { const seriesCount = 10 const samplesCount = 250 - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) - require.NoError(t, err) + for _, enableStStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + dir := t.TempDir() + wdir := path.Join(dir, "wal") + err := os.Mkdir(wdir, 0o777) + require.NoError(t, err) - w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) - require.NoError(t, err) - defer func() { - require.NoError(t, w.Close()) - }() + w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) + require.NoError(t, err) + defer func() { + require.NoError(t, w.Close()) + }() - var recs [][]byte + var recs [][]byte - enc := record.Encoder{} + enc := record.Encoder{EnableSTStorage: enableStStorage} - for i := range seriesCount { - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(i), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - recs = append(recs, series) - for j := range samplesCount { - sample := enc.Samples([]record.RefSample{ + for i := range seriesCount { + series := enc.Series([]record.RefSeries{ { - Ref: 
chunks.HeadSeriesRef(j), - T: int64(i), - V: float64(i), + Ref: chunks.HeadSeriesRef(i), + Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), }, }, nil) + recs = append(recs, series) + for j := range samplesCount { + sample := enc.Samples([]record.RefSample{ + { + Ref: chunks.HeadSeriesRef(j), + T: int64(i), + V: float64(i), + }, + }, nil) - recs = append(recs, sample) + recs = append(recs, sample) - // Randomly batch up records. - if rand.Intn(4) < 3 { - require.NoError(t, w.Log(recs...)) - recs = recs[:0] + // Randomly batch up records. + if rand.Intn(4) < 3 { + require.NoError(t, w.Log(recs...)) + recs = recs[:0] + } } } - } - require.NoError(t, w.Log(recs...)) - overwriteReadTimeout(t, time.Second) - _, _, err = Segments(w.Dir()) - require.NoError(t, err) + require.NoError(t, w.Log(recs...)) + overwriteReadTimeout(t, time.Second) + _, _, err = Segments(w.Dir()) + require.NoError(t, err) - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false) - go watcher.Start() + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false) + go watcher.Start() - expected := seriesCount - require.Eventually(t, func() bool { - return wt.checkNumSeries() == expected - }, 20*time.Second, 1*time.Second) - watcher.Stop() - }) + expected := seriesCount + require.Eventually(t, func() bool { + return wt.checkNumSeries() == expected + }, 20*time.Second, 1*time.Second) + watcher.Stop() + }) + } } } @@ -359,184 +364,119 @@ func TestReadToEndWithCheckpoint(t *testing.T) { const seriesCount = 10 const samplesCount = 250 - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() + for _, enableStStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + dir := t.TempDir() - wdir := 
path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) - require.NoError(t, err) + wdir := path.Join(dir, "wal") + err := os.Mkdir(wdir, 0o777) + require.NoError(t, err) - enc := record.Encoder{} - w, err := NewSize(nil, nil, wdir, segmentSize, compress) - require.NoError(t, err) - defer func() { - require.NoError(t, w.Close()) - }() + enc := record.Encoder{EnableSTStorage: enableStStorage} + w, err := NewSize(nil, nil, wdir, segmentSize, compress) + require.NoError(t, err) + defer func() { + require.NoError(t, w.Close()) + }() - // Write to the initial segment then checkpoint. - for i := range seriesCount { - ref := i + 100 - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(ref), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - require.NoError(t, w.Log(series)) - // Add in an unknown record type, which should be ignored. - require.NoError(t, w.Log([]byte{255})) - - for range samplesCount { - inner := rand.Intn(ref + 1) - sample := enc.Samples([]record.RefSample{ + // Write to the initial segment then checkpoint. + for i := range seriesCount { + ref := i + 100 + series := enc.Series([]record.RefSeries{ { - Ref: chunks.HeadSeriesRef(inner), - T: int64(i), - V: float64(i), + Ref: chunks.HeadSeriesRef(ref), + Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), }, }, nil) - require.NoError(t, w.Log(sample)) + require.NoError(t, w.Log(series)) + // Add in an unknown record type, which should be ignored. 
+ require.NoError(t, w.Log([]byte{255})) + + for range samplesCount { + inner := rand.Intn(ref + 1) + sample := enc.Samples([]record.RefSample{ + { + Ref: chunks.HeadSeriesRef(inner), + T: int64(i), + V: float64(i), + }, + }, nil) + require.NoError(t, w.Log(sample)) + } } - } - Checkpoint(promslog.NewNopLogger(), w, 0, 1, func(chunks.HeadSeriesRef) bool { return true }, 0) - w.Truncate(1) + Checkpoint(promslog.NewNopLogger(), w, 0, 1, func(chunks.HeadSeriesRef) bool { return true }, 0, enableStStorage) + w.Truncate(1) - // Write more records after checkpointing. - for i := range seriesCount { - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(i), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - require.NoError(t, w.Log(series)) - - for j := range samplesCount { - sample := enc.Samples([]record.RefSample{ + // Write more records after checkpointing. + for i := range seriesCount { + series := enc.Series([]record.RefSeries{ { - Ref: chunks.HeadSeriesRef(j), - T: int64(i), - V: float64(i), + Ref: chunks.HeadSeriesRef(i), + Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), }, }, nil) - require.NoError(t, w.Log(sample)) + require.NoError(t, w.Log(series)) + + for j := range samplesCount { + sample := enc.Samples([]record.RefSample{ + { + Ref: chunks.HeadSeriesRef(j), + T: int64(i), + V: float64(i), + }, + }, nil) + require.NoError(t, w.Log(sample)) + } } - } - _, _, err = Segments(w.Dir()) - require.NoError(t, err) - overwriteReadTimeout(t, time.Second) - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false) - go watcher.Start() + _, _, err = Segments(w.Dir()) + require.NoError(t, err) + overwriteReadTimeout(t, time.Second) + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false) + go watcher.Start() - expected := seriesCount * 2 + expected := seriesCount * 2 - require.Eventually(t, func() bool { 
- return wt.checkNumSeries() == expected - }, 10*time.Second, 1*time.Second) - watcher.Stop() - }) + require.Eventually(t, func() bool { + return wt.checkNumSeries() == expected + }, 10*time.Second, 1*time.Second) + watcher.Stop() + }) + } } } func TestReadCheckpoint(t *testing.T) { - t.Parallel() pageSize := 32 * 1024 const seriesCount = 10 const samplesCount = 250 - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() + for _, enableStStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + dir := t.TempDir() - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) - require.NoError(t, err) + wdir := path.Join(dir, "wal") + err := os.Mkdir(wdir, 0o777) + require.NoError(t, err) - f, err := os.Create(SegmentName(wdir, 30)) - require.NoError(t, err) - require.NoError(t, f.Close()) + f, err := os.Create(SegmentName(wdir, 30)) + require.NoError(t, err) + require.NoError(t, f.Close()) - enc := record.Encoder{} - w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) - require.NoError(t, err) - t.Cleanup(func() { - require.NoError(t, w.Close()) - }) + enc := record.Encoder{EnableSTStorage: enableStStorage} + w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) + require.NoError(t, err) + t.Cleanup(func() { + require.NoError(t, w.Close()) + }) - // Write to the initial segment then checkpoint. 
- for i := range seriesCount { - ref := i + 100 - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(ref), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - require.NoError(t, w.Log(series)) - - for range samplesCount { - inner := rand.Intn(ref + 1) - sample := enc.Samples([]record.RefSample{ - { - Ref: chunks.HeadSeriesRef(inner), - T: int64(i), - V: float64(i), - }, - }, nil) - require.NoError(t, w.Log(sample)) - } - } - _, err = w.NextSegmentSync() - require.NoError(t, err) - _, err = Checkpoint(promslog.NewNopLogger(), w, 30, 31, func(chunks.HeadSeriesRef) bool { return true }, 0) - require.NoError(t, err) - require.NoError(t, w.Truncate(32)) - - // Start read after checkpoint, no more data written. - _, _, err = Segments(w.Dir()) - require.NoError(t, err) - - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false) - go watcher.Start() - - expectedSeries := seriesCount - retry(t, defaultRetryInterval, defaultRetries, func() bool { - return wt.checkNumSeries() >= expectedSeries - }) - watcher.Stop() - require.Equal(t, expectedSeries, wt.checkNumSeries()) - }) - } -} - -func TestReadCheckpointMultipleSegments(t *testing.T) { - pageSize := 32 * 1024 - - const segments = 1 - const seriesCount = 20 - const samplesCount = 300 - - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() - - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) - require.NoError(t, err) - - enc := record.Encoder{} - w, err := NewSize(nil, nil, wdir, pageSize, compress) - require.NoError(t, err) - - // Write a bunch of data. - for i := range segments { - for j := range seriesCount { - ref := j + (i * 100) + // Write to the initial segment then checkpoint. 
+ for i := range seriesCount { + ref := i + 100 series := enc.Series([]record.RefSeries{ { Ref: chunks.HeadSeriesRef(ref), @@ -557,57 +497,132 @@ func TestReadCheckpointMultipleSegments(t *testing.T) { require.NoError(t, w.Log(sample)) } } - } - require.NoError(t, w.Close()) - - // At this point we should have at least 6 segments, lets create a checkpoint dir of the first 5. - checkpointDir := dir + "/wal/checkpoint.000004" - err = os.Mkdir(checkpointDir, 0o777) - require.NoError(t, err) - for i := 0; i <= 4; i++ { - err := os.Rename(SegmentName(dir+"/wal", i), SegmentName(checkpointDir, i)) + _, err = w.NextSegmentSync() require.NoError(t, err) - } + _, err = Checkpoint(promslog.NewNopLogger(), w, 30, 31, func(chunks.HeadSeriesRef) bool { return true }, 0, enableStStorage) + require.NoError(t, err) + require.NoError(t, w.Truncate(32)) - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false) - watcher.MaxSegment = -1 + // Start read after checkpoint, no more data written. + _, _, err = Segments(w.Dir()) + require.NoError(t, err) - // Set the Watcher's metrics so they're not nil pointers. 
- watcher.SetMetrics() + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false) + go watcher.Start() - lastCheckpoint, _, err := LastCheckpoint(watcher.walDir) - require.NoError(t, err) + expectedSeries := seriesCount + retry(t, defaultRetryInterval, defaultRetries, func() bool { + return wt.checkNumSeries() >= expectedSeries + }) + watcher.Stop() + require.Equal(t, expectedSeries, wt.checkNumSeries()) + }) + } + } +} - err = watcher.readCheckpoint(lastCheckpoint, (*Watcher).readSegment) - require.NoError(t, err) - }) +func TestReadCheckpointMultipleSegments(t *testing.T) { + pageSize := 32 * 1024 + + const segments = 1 + const seriesCount = 40 + const samplesCount = 500 + + for _, enableStStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + dir := t.TempDir() + + wdir := path.Join(dir, "wal") + err := os.Mkdir(wdir, 0o777) + require.NoError(t, err) + + enc := record.Encoder{EnableSTStorage: enableStStorage} + w, err := NewSize(nil, nil, wdir, pageSize, compress) + require.NoError(t, err) + + // Write a bunch of data. + for i := range segments { + for j := range seriesCount { + ref := j + (i * 100) + series := enc.Series([]record.RefSeries{ + { + Ref: chunks.HeadSeriesRef(ref), + Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), + }, + }, nil) + require.NoError(t, w.Log(series)) + + for range samplesCount { + inner := rand.Intn(ref + 1) + sample := enc.Samples([]record.RefSample{ + { + Ref: chunks.HeadSeriesRef(inner), + T: int64(i), + V: float64(i), + }, + }, nil) + require.NoError(t, w.Log(sample)) + } + } + } + require.NoError(t, w.Close()) + + // At this point we should have at least 6 segments, lets create a checkpoint dir of the first 5. 
+ checkpointDir := dir + "/wal/checkpoint.000004" + err = os.Mkdir(checkpointDir, 0o777) + require.NoError(t, err) + for i := 0; i <= 4; i++ { + err := os.Rename(SegmentName(dir+"/wal", i), SegmentName(checkpointDir, i)) + require.NoError(t, err) + } + + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false) + watcher.MaxSegment = -1 + + // Set the Watcher's metrics so they're not nil pointers. + watcher.SetMetrics() + + lastCheckpoint, _, err := LastCheckpoint(watcher.walDir) + require.NoError(t, err) + + err = watcher.readCheckpoint(lastCheckpoint, (*Watcher).readSegment) + require.NoError(t, err) + }) + } } } func TestCheckpointSeriesReset(t *testing.T) { - segmentSize := 32 * 1024 + segmentSize := 64 * 1024 // We need something similar to this # of series and samples // in order to get enough segments for us to checkpoint. - const seriesCount = 20 - const samplesCount = 350 + const seriesCount = 30 + const samplesCount = 700 testCases := []struct { - compress compression.Type - segments int + compress compression.Type + enableStStorage bool + segments int }{ - {compress: compression.None, segments: 14}, - {compress: compression.Snappy, segments: 13}, + {compress: compression.None, enableStStorage: false, segments: 24}, + {compress: compression.Snappy, enableStStorage: false, segments: 23}, + {compress: compression.None, enableStStorage: true, segments: 20}, + {compress: compression.Snappy, enableStStorage: true, segments: 20}, } + dir := t.TempDir() for _, tc := range testCases { - t.Run(fmt.Sprintf("compress=%s", tc.compress), func(t *testing.T) { - dir := t.TempDir() - - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", tc.compress, tc.enableStStorage), func(t *testing.T) { + subdir := filepath.Join(dir, fmt.Sprintf("%s-%v", tc.compress, tc.enableStStorage)) + err := os.MkdirAll(subdir, 0o777) + require.NoError(t, err) + wdir := filepath.Join(subdir, 
"wal") + err = os.MkdirAll(wdir, 0o777) require.NoError(t, err) - enc := record.Encoder{} + enc := record.Encoder{EnableSTStorage: tc.enableStStorage} w, err := NewSize(nil, nil, wdir, segmentSize, tc.compress) require.NoError(t, err) defer func() { @@ -643,7 +658,7 @@ func TestCheckpointSeriesReset(t *testing.T) { overwriteReadTimeout(t, time.Second) wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, subdir, false, false, false) watcher.MaxSegment = -1 go watcher.Start() @@ -655,13 +670,13 @@ func TestCheckpointSeriesReset(t *testing.T) { return wt.checkNumSeries() == seriesCount }, 10*time.Second, 1*time.Second) - _, err = Checkpoint(promslog.NewNopLogger(), w, 2, 4, func(chunks.HeadSeriesRef) bool { return true }, 0) + _, err = Checkpoint(promslog.NewNopLogger(), w, 2, 4, func(chunks.HeadSeriesRef) bool { return true }, 0, true) require.NoError(t, err) err = w.Truncate(5) require.NoError(t, err) - _, cpi, err := LastCheckpoint(path.Join(dir, "wal")) + _, cpi, err := LastCheckpoint(wdir) require.NoError(t, err) err = watcher.garbageCollectSeries(cpi + 1) require.NoError(t, err) @@ -678,66 +693,67 @@ func TestCheckpointSeriesReset(t *testing.T) { } func TestRun_StartupTime(t *testing.T) { - t.Parallel() const pageSize = 32 * 1024 - const segments = 10 - const seriesCount = 20 - const samplesCount = 300 + const segments = 20 + const seriesCount = 40 + const samplesCount = 500 - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() + for _, enableStStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + dir := t.TempDir() - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) - require.NoError(t, err) + wdir := path.Join(dir, "wal") + err := 
os.Mkdir(wdir, 0o777) + require.NoError(t, err) - enc := record.Encoder{} - w, err := NewSize(nil, nil, wdir, pageSize, compress) - require.NoError(t, err) + enc := record.Encoder{EnableSTStorage: enableStStorage} + w, err := NewSize(nil, nil, wdir, pageSize, compress) + require.NoError(t, err) - for i := range segments { - for j := range seriesCount { - ref := j + (i * 100) - series := enc.Series([]record.RefSeries{ - { - Ref: chunks.HeadSeriesRef(ref), - Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), - }, - }, nil) - require.NoError(t, w.Log(series)) - - for range samplesCount { - inner := rand.Intn(ref + 1) - sample := enc.Samples([]record.RefSample{ + for i := range segments { + for j := range seriesCount { + ref := j + (i * 100) + series := enc.Series([]record.RefSeries{ { - Ref: chunks.HeadSeriesRef(inner), - T: int64(i), - V: float64(i), + Ref: chunks.HeadSeriesRef(ref), + Labels: labels.FromStrings("__name__", fmt.Sprintf("metric_%d", i)), }, }, nil) - require.NoError(t, w.Log(sample)) + require.NoError(t, w.Log(series)) + + for range samplesCount { + inner := rand.Intn(ref + 1) + sample := enc.Samples([]record.RefSample{ + { + Ref: chunks.HeadSeriesRef(inner), + T: int64(i), + V: float64(i), + }, + }, nil) + require.NoError(t, w.Log(sample)) + } } } - } - require.NoError(t, w.Close()) + require.NoError(t, w.Close()) - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false) - watcher.MaxSegment = segments + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false) + watcher.MaxSegment = segments - watcher.SetMetrics() - startTime := time.Now() + watcher.SetMetrics() + startTime := time.Now() - err = watcher.Run() - require.Less(t, time.Since(startTime), readTimeout) - require.NoError(t, err) - }) + err = watcher.Run() + require.Less(t, time.Since(startTime), readTimeout) + require.NoError(t, err) + }) + } } } -func generateWALRecords(w *WL, 
segment, seriesCount, samplesCount int) error { - enc := record.Encoder{} +func generateWALRecords(w *WL, segment, seriesCount, samplesCount int, enableStStorage bool) error { + enc := record.Encoder{EnableSTStorage: enableStStorage} for j := range seriesCount { ref := j + (segment * 100) series := enc.Series([]record.RefSeries{ @@ -777,61 +793,63 @@ func TestRun_AvoidNotifyWhenBehind(t *testing.T) { const seriesCount = 10 const samplesCount = 50 - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s", compress), func(t *testing.T) { - dir := t.TempDir() + for _, enableStStorage := range []bool{false, true} { + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + dir := t.TempDir() - wdir := path.Join(dir, "wal") - err := os.Mkdir(wdir, 0o777) - require.NoError(t, err) + wdir := path.Join(dir, "wal") + err := os.Mkdir(wdir, 0o777) + require.NoError(t, err) - w, err := NewSize(nil, nil, wdir, segmentSize, compress) - require.NoError(t, err) - // Write to 00000000, the watcher will read series from it. - require.NoError(t, generateWALRecords(w, 0, seriesCount, samplesCount)) - // Create 00000001, the watcher will tail it once started. - w.NextSegment() - - // Set up the watcher and run it in the background. - wt := newWriteToMock(time.Millisecond) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false) - watcher.SetMetrics() - watcher.MaxSegment = segmentsToRead - - var g errgroup.Group - g.Go(func() error { - startTime := time.Now() - err = watcher.Run() - if err != nil { - return err - } - // If the watcher was to wait for readTicker to read every new segment, it would need readTimeout * segmentsToRead. 
- d := time.Since(startTime) - if d > readTimeout { - return fmt.Errorf("watcher ran for %s, it shouldn't rely on readTicker=%s to read the new segments", d, readTimeout) - } - return nil - }) - - // The watcher went through 00000000 and is tailing the next one. - retry(t, defaultRetryInterval, defaultRetries, func() bool { - return wt.checkNumSeries() == seriesCount - }) - - // In the meantime, add some new segments in bulk. - // We should end up with segmentsToWrite + 1 segments now. - for i := 1; i < segmentsToWrite; i++ { - require.NoError(t, generateWALRecords(w, i, seriesCount, samplesCount)) + w, err := NewSize(nil, nil, wdir, segmentSize, compress) + require.NoError(t, err) + // Write to 00000000, the watcher will read series from it. + require.NoError(t, generateWALRecords(w, 0, seriesCount, samplesCount, enableStStorage)) + // Create 00000001, the watcher will tail it once started. w.NextSegment() - } - // Wait for the watcher. - require.NoError(t, g.Wait()) + // Set up the watcher and run it in the background. + wt := newWriteToMock(time.Millisecond) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false) + watcher.SetMetrics() + watcher.MaxSegment = segmentsToRead - // All series and samples were read. - require.Equal(t, (segmentsToRead+1)*seriesCount, wt.checkNumSeries()) // Series from 00000000 are also read. - require.Equal(t, segmentsToRead*seriesCount*samplesCount, wt.samplesAppended) - require.NoError(t, w.Close()) - }) + var g errgroup.Group + g.Go(func() error { + startTime := time.Now() + err = watcher.Run() + if err != nil { + return err + } + // If the watcher was to wait for readTicker to read every new segment, it would need readTimeout * segmentsToRead. + d := time.Since(startTime) + if d > readTimeout { + return fmt.Errorf("watcher ran for %s, it shouldn't rely on readTicker=%s to read the new segments", d, readTimeout) + } + return nil + }) + + // The watcher went through 00000000 and is tailing the next one. 
+ retry(t, defaultRetryInterval, defaultRetries, func() bool { + return wt.checkNumSeries() == seriesCount + }) + + // In the meantime, add some new segments in bulk. + // We should end up with segmentsToWrite + 1 segments now. + for i := 1; i < segmentsToWrite; i++ { + require.NoError(t, generateWALRecords(w, i, seriesCount, samplesCount, enableStStorage)) + w.NextSegment() + } + + // Wait for the watcher. + require.NoError(t, g.Wait()) + + // All series and samples were read. + require.Equal(t, (segmentsToRead+1)*seriesCount, wt.checkNumSeries()) // Series from 00000000 are also read. + require.Equal(t, segmentsToRead*seriesCount*samplesCount, wt.samplesAppended) + require.NoError(t, w.Close()) + }) + } } } diff --git a/util/testrecord/record.go b/util/testrecord/record.go new file mode 100644 index 0000000000..e5071d42c8 --- /dev/null +++ b/util/testrecord/record.go @@ -0,0 +1,96 @@ +// Copyright 2025 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package testrecord + +import ( + "math" + "testing" + + "github.com/prometheus/prometheus/tsdb/chunks" + "github.com/prometheus/prometheus/tsdb/record" +) + +type RefSamplesCase string + +const ( + Realistic1000Samples RefSamplesCase = "real1000" + Realistic1000WithVariableSTSamples RefSamplesCase = "real1000-vst" + Realistic1000WithConstSTSamples RefSamplesCase = "real1000-cst" + WorstCase1000 RefSamplesCase = "worst1000" + WorstCase1000WithSTSamples RefSamplesCase = "worst1000-st" +) + +func GenTestRefSamplesCase(t testing.TB, c RefSamplesCase) []record.RefSample { + t.Helper() + + ret := make([]record.RefSample, 1e3) + switch c { + // Samples are across series, so likely all have the same timestamp. + case Realistic1000Samples: + for i := range ret { + ret[i].Ref = chunks.HeadSeriesRef(i) + ret[i].T = int64(12423423) + ret[i].V = highVarianceFloat(i) + } + // Likely the start times will all be the same with deltas. + case Realistic1000WithConstSTSamples: + for i := range ret { + ret[i].Ref = chunks.HeadSeriesRef(i) + ret[i].ST = int64(12423423) + ret[i].T = int64(12423423 + 15) + ret[i].V = highVarianceFloat(i) + } + // Maybe series have different start times though + case Realistic1000WithVariableSTSamples: + for i := range ret { + ret[i].Ref = chunks.HeadSeriesRef(i) + ret[i].ST = int64((12423423 / 9) * (i % 10)) + ret[i].T = int64(12423423) + ret[i].V = highVarianceFloat(i) + } + case WorstCase1000: + for i := range ret { + ret[i].Ref = chunks.HeadSeriesRef(i) + ret[i].T = highVarianceInt(i) + ret[i].V = highVarianceFloat(i) + } + case WorstCase1000WithSTSamples: + for i := range ret { + ret[i].Ref = chunks.HeadSeriesRef(i) + + // Worst case is when the values are significantly different + // to each other which breaks delta encoding. 
+ ret[i].ST = highVarianceInt(i+1) / 1024 // Make sure ST is not comparable to T + ret[i].T = highVarianceInt(i) + ret[i].V = highVarianceFloat(i) + } + default: + t.Fatal("unknown case", c) + } + return ret +} + +func highVarianceInt(i int) int64 { + if i%2 == 0 { + return math.MinInt32 + } + return math.MaxInt32 +} + +func highVarianceFloat(i int) float64 { + if i%2 == 0 { + return math.SmallestNonzeroFloat32 + } + return math.MaxFloat32 +} From 5ac1080a60a2a6aaba8974c93c9d42bf0796afae Mon Sep 17 00:00:00 2001 From: bwplotka Date: Tue, 17 Feb 2026 11:11:46 +0000 Subject: [PATCH 25/73] refactor: sed enableStStorage/enableSTStorage Signed-off-by: bwplotka --- tsdb/agent/db_append_v2_test.go | 14 +++---- tsdb/db_append_v2_test.go | 34 ++++++++--------- tsdb/db_test.go | 34 ++++++++--------- tsdb/head_append_v2_test.go | 8 ++-- tsdb/head_test.go | 60 ++++++++++++++--------------- tsdb/record/bench_test.go | 12 +++--- tsdb/record/record_test.go | 32 ++++++++-------- tsdb/wlog/checkpoint.go | 4 +- tsdb/wlog/checkpoint_test.go | 16 ++++---- tsdb/wlog/watcher_test.go | 68 ++++++++++++++++----------------- 10 files changed, 141 insertions(+), 141 deletions(-) diff --git a/tsdb/agent/db_append_v2_test.go b/tsdb/agent/db_append_v2_test.go index 139e7baa19..cbe9b09374 100644 --- a/tsdb/agent/db_append_v2_test.go +++ b/tsdb/agent/db_append_v2_test.go @@ -96,10 +96,10 @@ func TestCommit_AppendV2(t *testing.T) { numHistograms = 100 numSeries = 8 ) - for _, enableStStorage := range []bool{false, true} { - t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { opts := DefaultOptions() - opts.EnableSTStorage = enableStStorage + opts.EnableSTStorage = enableSTStorage s := createTestAgentDB(t, nil, opts) app := s.AppenderV2(context.TODO()) @@ -196,7 +196,7 @@ func TestCommit_AppendV2(t *testing.T) { walSeriesCount += 
len(series) case record.Samples: - if enableStStorage { + if enableSTStorage { t.Errorf("Got V1 Samples when ST enabled") } var samples []record.RefSample @@ -205,7 +205,7 @@ func TestCommit_AppendV2(t *testing.T) { walSamplesCount += len(samples) case record.SamplesV2: - if !enableStStorage { + if !enableSTStorage { t.Errorf("Got V2 Samples when ST disabled") } var samples []record.RefSample @@ -256,9 +256,9 @@ func TestRollbackAppendV2(t *testing.T) { numSeries = 8 ) - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { opts := DefaultOptions() - opts.EnableSTStorage = enableStStorage + opts.EnableSTStorage = enableSTStorage s := createTestAgentDB(t, nil, opts) app := s.AppenderV2(context.TODO()) diff --git a/tsdb/db_append_v2_test.go b/tsdb/db_append_v2_test.go index 15201d3dc7..08e97d1113 100644 --- a/tsdb/db_append_v2_test.go +++ b/tsdb/db_append_v2_test.go @@ -968,18 +968,18 @@ func TestWALReplayRaceOnSamplesLoggedBeforeSeries_AppendV2(t *testing.T) { // We test both with few and many samples appended after series creation. If samples are < 120 then there's no // mmap-ed chunk, otherwise there's at least 1 mmap-ed chunk when replaying the WAL. 
- for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, numSamplesAfterSeriesCreation := range []int{1, 1000} { for run := 1; run <= numRuns; run++ { - t.Run(fmt.Sprintf("samples after series creation = %d, run = %d, stStorage = %v", numSamplesAfterSeriesCreation, run, enableStStorage), func(t *testing.T) { - testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation, enableStStorage) + t.Run(fmt.Sprintf("samples after series creation = %d, run = %d, stStorage = %v", numSamplesAfterSeriesCreation, run, enableSTStorage), func(t *testing.T) { + testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation, enableSTStorage) }) } } } } -func testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t *testing.T, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation int, enableStStorage bool) { +func testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t *testing.T, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation int, enableSTStorage bool) { const numSeries = 1000 db := newTestDB(t) @@ -987,7 +987,7 @@ func testWALReplayRaceOnSamplesLoggedBeforeSeriesAppendV2(t *testing.T, numSampl for seriesRef := 1; seriesRef <= numSeries; seriesRef++ { // Log samples before the series is logged to the WAL. 
- enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} var samples []record.RefSample for ts := range numSamplesBeforeSeriesCreation { @@ -1178,8 +1178,8 @@ func TestTombstoneCleanResultEmptyBlock_AppendV2(t *testing.T) { func TestSizeRetention_AppendV2(t *testing.T) { t.Parallel() - for _, enableStStorage := range []bool{false, true} { - t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { opts := DefaultOptions() opts.OutOfOrderTimeWindow = 100 db := newTestDB(t, withOpts(opts), withRngs(100)) @@ -1243,7 +1243,7 @@ func TestSizeRetention_AppendV2(t *testing.T) { // Create a WAL checkpoint, and compare sizes. first, last, err := wlog.Segments(db.Head().wal.Dir()) require.NoError(t, err) - _, err = wlog.Checkpoint(promslog.NewNopLogger(), db.Head().wal, first, last-1, func(chunks.HeadSeriesRef) bool { return false }, 0, enableStStorage) + _, err = wlog.Checkpoint(promslog.NewNopLogger(), db.Head().wal, first, last-1, func(chunks.HeadSeriesRef) bool { return false }, 0, enableSTStorage) require.NoError(t, err) blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. 
walSize, err = db.Head().wal.Size() @@ -1506,15 +1506,15 @@ func TestInitializeHeadTimestamp_AppendV2(t *testing.T) { require.True(t, db.head.initialized()) }) - for _, enableStStorage := range []bool{false, true} { - t.Run("wal-only,stStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + t.Run("wal-only,stStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { dir := t.TempDir() require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) require.NoError(t, err) - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} err = w.Log( enc.Series([]record.RefSeries{ {Ref: 123, Labels: labels.FromStrings("a", "1")}, @@ -1546,8 +1546,8 @@ func TestInitializeHeadTimestamp_AppendV2(t *testing.T) { require.Equal(t, int64(2000), db.head.MaxTime()) require.True(t, db.head.initialized()) }) - for _, enableStStorage := range []bool{false, true} { - t.Run("existing-block-and-wal,stStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + t.Run("existing-block-and-wal,stStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { dir := t.TempDir() createBlock(t, dir, genSeries(1, 1, 1000, 6000)) @@ -1556,7 +1556,7 @@ func TestInitializeHeadTimestamp_AppendV2(t *testing.T) { w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) require.NoError(t, err) - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} err = w.Log( enc.Series([]record.RefSeries{ {Ref: 123, Labels: labels.FromStrings("a", "1")}, @@ -3441,8 +3441,8 @@ func TestMetadataInWAL_AppenderV2(t *testing.T) { } func TestMetadataCheckpointingOnlyKeepsLatestEntry_AppendV2(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { - 
t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { ctx := context.Background() numSamples := 10000 hb, w := newTestHead(t, int64(numSamples)*10, compression.None, false) @@ -3519,7 +3519,7 @@ func TestMetadataCheckpointingOnlyKeepsLatestEntry_AppendV2(t *testing.T) { keep := func(id chunks.HeadSeriesRef) bool { return id != 3 } - _, err = wlog.Checkpoint(promslog.NewNopLogger(), w, first, last-1, keep, 0, enableStStorage) + _, err = wlog.Checkpoint(promslog.NewNopLogger(), w, first, last-1, keep, 0, enableSTStorage) require.NoError(t, err) // Confirm there's been a checkpoint. diff --git a/tsdb/db_test.go b/tsdb/db_test.go index 13c37eb219..13464c26e5 100644 --- a/tsdb/db_test.go +++ b/tsdb/db_test.go @@ -1170,25 +1170,25 @@ func TestWALReplayRaceOnSamplesLoggedBeforeSeries(t *testing.T) { // We test both with few and many samples appended after series creation. If samples are < 120 then there's no // mmap-ed chunk, otherwise there's at least 1 mmap-ed chunk when replaying the WAL. 
- for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, numSamplesAfterSeriesCreation := range []int{1, 1000} { for run := 1; run <= numRuns; run++ { - t.Run(fmt.Sprintf("samples after series creation = %d, run = %d, stStorage=%v", numSamplesAfterSeriesCreation, run, enableStStorage), func(t *testing.T) { - testWALReplayRaceOnSamplesLoggedBeforeSeries(t, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation, enableStStorage) + t.Run(fmt.Sprintf("samples after series creation = %d, run = %d, stStorage=%v", numSamplesAfterSeriesCreation, run, enableSTStorage), func(t *testing.T) { + testWALReplayRaceOnSamplesLoggedBeforeSeries(t, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation, enableSTStorage) }) } } } } -func testWALReplayRaceOnSamplesLoggedBeforeSeries(t *testing.T, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation int, enableStStorage bool) { +func testWALReplayRaceOnSamplesLoggedBeforeSeries(t *testing.T, numSamplesBeforeSeriesCreation, numSamplesAfterSeriesCreation int, enableSTStorage bool) { const numSeries = 1000 db := newTestDB(t) db.DisableCompactions() for seriesRef := 1; seriesRef <= numSeries; seriesRef++ { // Log samples before the series is logged to the WAL. 
- enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} var samples []record.RefSample for ts := range numSamplesBeforeSeriesCreation { @@ -1552,8 +1552,8 @@ func TestRetentionDurationMetric(t *testing.T) { func TestSizeRetention(t *testing.T) { t.Parallel() - for _, enableStStorage := range []bool{false, true} { - t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { opts := DefaultOptions() opts.OutOfOrderTimeWindow = 100 db := newTestDB(t, withOpts(opts), withRngs(100)) @@ -1617,7 +1617,7 @@ func TestSizeRetention(t *testing.T) { // Create a WAL checkpoint, and compare sizes. first, last, err := wlog.Segments(db.Head().wal.Dir()) require.NoError(t, err) - _, err = wlog.Checkpoint(promslog.NewNopLogger(), db.Head().wal, first, last-1, func(chunks.HeadSeriesRef) bool { return false }, 0, enableStStorage) + _, err = wlog.Checkpoint(promslog.NewNopLogger(), db.Head().wal, first, last-1, func(chunks.HeadSeriesRef) bool { return false }, 0, enableSTStorage) require.NoError(t, err) blockSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the actual internal metrics. 
walSize, err = db.Head().wal.Size() @@ -2078,15 +2078,15 @@ func TestInitializeHeadTimestamp(t *testing.T) { require.True(t, db.head.initialized()) }) - for _, enableStStorage := range []bool{false, true} { - t.Run("wal-only-st-"+strconv.FormatBool(enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + t.Run("wal-only-st-"+strconv.FormatBool(enableSTStorage), func(t *testing.T) { dir := t.TempDir() require.NoError(t, os.MkdirAll(path.Join(dir, "wal"), 0o777)) w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) require.NoError(t, err) - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} err = w.Log( enc.Series([]record.RefSeries{ {Ref: 123, Labels: labels.FromStrings("a", "1")}, @@ -2119,8 +2119,8 @@ func TestInitializeHeadTimestamp(t *testing.T) { require.True(t, db.head.initialized()) }) - for _, enableStStorage := range []bool{false, true} { - t.Run("existing-block-and-wal,enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + t.Run("existing-block-and-wal,enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { dir := t.TempDir() createBlock(t, dir, genSeries(1, 1, 1000, 6000)) @@ -2129,7 +2129,7 @@ func TestInitializeHeadTimestamp(t *testing.T) { w, err := wlog.New(nil, nil, path.Join(dir, "wal"), compression.None) require.NoError(t, err) - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} err = w.Log( enc.Series([]record.RefSeries{ {Ref: 123, Labels: labels.FromStrings("a", "1")}, @@ -4703,8 +4703,8 @@ func TestMetadataCheckpointingOnlyKeepsLatestEntry(t *testing.T) { require.NoError(t, err) } - for _, enableStStorage := range []bool{false, true} { - t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, 
true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { ctx := context.Background() numSamples := 10000 hb, w := newTestHead(t, int64(numSamples)*10, compression.None, false) @@ -4771,7 +4771,7 @@ func TestMetadataCheckpointingOnlyKeepsLatestEntry(t *testing.T) { keep := func(id chunks.HeadSeriesRef) bool { return id != 3 } - _, err = wlog.Checkpoint(promslog.NewNopLogger(), w, first, last-1, keep, 0, enableStStorage) + _, err = wlog.Checkpoint(promslog.NewNopLogger(), w, first, last-1, keep, 0, enableSTStorage) require.NoError(t, err) // Confirm there's been a checkpoint. diff --git a/tsdb/head_append_v2_test.go b/tsdb/head_append_v2_test.go index ba756f801f..539ac22fd7 100644 --- a/tsdb/head_append_v2_test.go +++ b/tsdb/head_append_v2_test.go @@ -1867,8 +1867,8 @@ func TestHistogramInWALAndMmapChunk_AppenderV2(t *testing.T) { } func TestChunkSnapshot_AppenderV2(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { - t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { head, _ := newTestHead(t, 120*4, compression.None, false) defer func() { head.opts.EnableMemorySnapshotOnShutdown = false @@ -2017,7 +2017,7 @@ func TestChunkSnapshot_AppenderV2(t *testing.T) { require.NoError(t, app.Commit()) // Add some tombstones. - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} for i := 1; i <= numSeries; i++ { ref := storage.SeriesRef(i) itvs := tombstones.Intervals{ @@ -2095,7 +2095,7 @@ func TestChunkSnapshot_AppenderV2(t *testing.T) { require.NoError(t, app.Commit()) // Add more tombstones. 
- enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} for i := 1; i <= numSeries; i++ { ref := storage.SeriesRef(i) itvs := tombstones.Intervals{ diff --git a/tsdb/head_test.go b/tsdb/head_test.go index 81cb236801..2cee989e40 100644 --- a/tsdb/head_test.go +++ b/tsdb/head_test.go @@ -256,7 +256,7 @@ func BenchmarkLoadWLs(b *testing.B) { // Rough estimates of most common % of samples that have an exemplar for each scrape. exemplarsPercentages := []float64{0, 0.5, 1, 5} lastExemplarsPerSeries := -1 - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, c := range cases { missingSeriesPercentages := []float64{0, 0.1} for _, missingSeriesPct := range missingSeriesPercentages { @@ -268,7 +268,7 @@ func BenchmarkLoadWLs(b *testing.B) { continue } lastExemplarsPerSeries = exemplarsPerSeries - b.Run(fmt.Sprintf("batches=%d,seriesPerBatch=%d,samplesPerSeries=%d,exemplarsPerSeries=%d,mmappedChunkT=%d,oooSeriesPct=%.3f,oooSamplesPct=%.3f,oooCapMax=%d,missingSeriesPct=%.3f,stStorage=%v", c.batches, c.seriesPerBatch, c.samplesPerSeries, exemplarsPerSeries, c.mmappedChunkT, c.oooSeriesPct, c.oooSamplesPct, c.oooCapMax, missingSeriesPct, enableStStorage), + b.Run(fmt.Sprintf("batches=%d,seriesPerBatch=%d,samplesPerSeries=%d,exemplarsPerSeries=%d,mmappedChunkT=%d,oooSeriesPct=%.3f,oooSamplesPct=%.3f,oooCapMax=%d,missingSeriesPct=%.3f,stStorage=%v", c.batches, c.seriesPerBatch, c.samplesPerSeries, exemplarsPerSeries, c.mmappedChunkT, c.oooSeriesPct, c.oooSamplesPct, c.oooCapMax, missingSeriesPct, enableSTStorage), func(b *testing.B) { dir := b.TempDir() @@ -307,7 +307,7 @@ func BenchmarkLoadWLs(b *testing.B) { writeSeries = newWriteSeries } - buf = populateTestWL(b, wal, []any{writeSeries}, buf, enableStStorage) + buf = populateTestWL(b, wal, []any{writeSeries}, buf, enableSTStorage) } // Write samples. 
@@ -333,7 +333,7 @@ func BenchmarkLoadWLs(b *testing.B) { V: float64(i) * 100, }) } - buf = populateTestWL(b, wal, []any{refSamples}, buf, enableStStorage) + buf = populateTestWL(b, wal, []any{refSamples}, buf, enableSTStorage) } } @@ -372,7 +372,7 @@ func BenchmarkLoadWLs(b *testing.B) { Labels: labels.FromStrings("trace_id", fmt.Sprintf("trace-%d", i)), }) } - buf = populateTestWL(b, wal, []any{refExemplars}, buf, enableStStorage) + buf = populateTestWL(b, wal, []any{refExemplars}, buf, enableSTStorage) } } @@ -401,10 +401,10 @@ func BenchmarkLoadWLs(b *testing.B) { }) } if shouldAddMarkers { - populateTestWL(b, wbl, []any{refMarkers}, buf, enableStStorage) + populateTestWL(b, wbl, []any{refMarkers}, buf, enableSTStorage) } - buf = populateTestWL(b, wal, []any{refSamples}, buf, enableStStorage) - buf = populateTestWL(b, wbl, []any{refSamples}, buf, enableStStorage) + buf = populateTestWL(b, wal, []any{refSamples}, buf, enableSTStorage) + buf = populateTestWL(b, wbl, []any{refSamples}, buf, enableSTStorage) } } @@ -713,9 +713,9 @@ func TestHead_HighConcurrencyReadAndWrite(t *testing.T) { } func TestHead_ReadWAL(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { - t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { entries := []any{ []record.RefSeries{ {Ref: 10, Labels: labels.FromStrings("a", "1")}, @@ -756,7 +756,7 @@ func TestHead_ReadWAL(t *testing.T) { head, w := newTestHead(t, 1000, compress, false) - populateTestWL(t, w, entries, nil, enableStStorage) + populateTestWL(t, w, entries, nil, enableSTStorage) require.NoError(t, head.Init(math.MinInt64)) require.Equal(t, uint64(101), head.lastSeriesID.Load()) @@ -1103,11 +1103,11 @@ func 
TestHead_WALCheckpointMultiRef(t *testing.T) { }, } - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, tc := range cases { - t.Run(tc.name+",stStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + t.Run(tc.name+",stStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { h, w := newTestHead(t, 1000, compression.None, false) - populateTestWL(t, w, tc.walEntries, nil, enableStStorage) + populateTestWL(t, w, tc.walEntries, nil, enableSTStorage) first, _, err := wlog.Segments(w.Dir()) require.NoError(t, err) @@ -1690,9 +1690,9 @@ func TestMemSeries_truncateChunks_scenarios(t *testing.T) { } func TestHeadDeleteSeriesWithoutSamples(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { - t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { entries := []any{ []record.RefSeries{ {Ref: 10, Labels: labels.FromStrings("a", "1")}, @@ -1708,7 +1708,7 @@ func TestHeadDeleteSeriesWithoutSamples(t *testing.T) { } head, w := newTestHead(t, 1000, compress, false) - populateTestWL(t, w, entries, nil, enableStStorage) + populateTestWL(t, w, entries, nil, enableSTStorage) require.NoError(t, head.Init(math.MinInt64)) @@ -2575,8 +2575,8 @@ func TestHead_ReturnsSortedLabelValues(t *testing.T) { // TestWalRepair_DecodingError ensures that a repair is run for an error // when decoding a record. 
func TestWalRepair_DecodingError(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { - enc := record.Encoder{EnableSTStorage: enableStStorage} + for _, enableSTStorage := range []bool{false, true} { + enc := record.Encoder{EnableSTStorage: enableSTStorage} for name, test := range map[string]struct { corrFunc func(rec []byte) []byte // Func that applies the corruption to a record. rec []byte @@ -2609,7 +2609,7 @@ func TestWalRepair_DecodingError(t *testing.T) { }, } { for _, compress := range []compression.Type{compression.None, compression.Snappy, compression.Zstd} { - t.Run(fmt.Sprintf("%s,compress=%s,stStorage=%v", name, compress, enableStStorage), func(t *testing.T) { + t.Run(fmt.Sprintf("%s,compress=%s,stStorage=%v", name, compress, enableSTStorage), func(t *testing.T) { dir := t.TempDir() // Fill the wal and corrupt it. @@ -2672,9 +2672,9 @@ func TestWalRepair_DecodingError(t *testing.T) { // TestWblRepair_DecodingError ensures that a repair is run for an error // when decoding a record. 
func TestWblRepair_DecodingError(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { - t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { - enc := record.Encoder{EnableSTStorage: enableStStorage} + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { + enc := record.Encoder{EnableSTStorage: enableSTStorage} corrFunc := func(rec []byte) []byte { return rec[:3] } @@ -4378,8 +4378,8 @@ func TestHistogramInWALAndMmapChunk(t *testing.T) { } func TestChunkSnapshot(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { - t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { head, _ := newTestHead(t, 120*4, compression.None, false) defer func() { head.opts.EnableMemorySnapshotOnShutdown = false @@ -4525,7 +4525,7 @@ func TestChunkSnapshot(t *testing.T) { require.NoError(t, app.Commit()) // Add some tombstones. - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} for i := 1; i <= numSeries; i++ { ref := storage.SeriesRef(i) itvs := tombstones.Intervals{ @@ -4599,7 +4599,7 @@ func TestChunkSnapshot(t *testing.T) { require.NoError(t, app.Commit()) // Add more tombstones. - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} for i := 1; i <= numSeries; i++ { ref := storage.SeriesRef(i) itvs := tombstones.Intervals{ @@ -5392,8 +5392,8 @@ func TestAppendingDifferentEncodingToSameSeries(t *testing.T) { // Tests https://github.com/prometheus/prometheus/issues/9725. 
func TestChunkSnapshotReplayBug(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { - t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { dir := t.TempDir() wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compression.Snappy) require.NoError(t, err) @@ -5418,7 +5418,7 @@ func TestChunkSnapshotReplayBug(t *testing.T) { } // Add a sample so that the series is not garbage collected. samplesRec := record.RefSample{Ref: ref, T: 1000, V: 1000} - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} rec := enc.Series([]record.RefSeries{seriesRec}, buf) buf = rec[:0] diff --git a/tsdb/record/bench_test.go b/tsdb/record/bench_test.go index f65cb34ff3..1420fffc46 100644 --- a/tsdb/record/bench_test.go +++ b/tsdb/record/bench_test.go @@ -34,7 +34,7 @@ func zeroOutSTs(samples []record.RefSample) []record.RefSample { } func TestEncodeDecode(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, tcase := range []testrecord.RefSamplesCase{ testrecord.Realistic1000Samples, testrecord.Realistic1000WithVariableSTSamples, @@ -45,7 +45,7 @@ func TestEncodeDecode(t *testing.T) { var ( dec record.Decoder buf []byte - enc = record.Encoder{EnableSTStorage: enableStStorage} + enc = record.Encoder{EnableSTStorage: enableSTStorage} ) s := testrecord.GenTestRefSamplesCase(t, tcase) @@ -55,7 +55,7 @@ func TestEncodeDecode(t *testing.T) { require.NoError(t, err) // if ST is off, we expect all STs to be zero expected := s - if !enableStStorage { + if !enableSTStorage { expected = zeroOutSTs(s) } @@ -69,7 +69,7 @@ func TestEncodeDecode(t *testing.T) { require.NoError(t, err) expected := s - if !enableStStorage { + if !enableSTStorage { expected = 
zeroOutSTs(s) } require.Equal(t, expected, got) @@ -81,7 +81,7 @@ func TestEncodeDecode(t *testing.T) { got, err := dec.Samples(enc.Samples(s, nil), samples) require.NoError(t, err) expected := s - if !enableStStorage { + if !enableSTStorage { expected = zeroOutSTs(s) } require.Equal(t, expected, got) @@ -102,7 +102,7 @@ func TestEncodeDecode(t *testing.T) { got, err := dec.Samples(buf, nil) require.NoError(t, err) expected := s - if !enableStStorage { + if !enableSTStorage { expected = zeroOutSTs(s) } require.Equal(t, expected, got) diff --git a/tsdb/record/record_test.go b/tsdb/record/record_test.go index ab4342c3a8..c15c9aa33c 100644 --- a/tsdb/record/record_test.go +++ b/tsdb/record/record_test.go @@ -275,10 +275,10 @@ func TestRecord_EncodeDecode(t *testing.T) { } func TestRecord_DecodeInvalidHistogramSchema(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, schema := range []int32{-100, 100} { - t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableStStorage), func(t *testing.T) { - enc := Encoder{EnableSTStorage: enableStStorage} + t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableSTStorage), func(t *testing.T) { + enc := Encoder{EnableSTStorage: enableSTStorage} var output bytes.Buffer logger := promslog.New(&promslog.Config{Writer: &output}) @@ -312,10 +312,10 @@ func TestRecord_DecodeInvalidHistogramSchema(t *testing.T) { } func TestRecord_DecodeInvalidFloatHistogramSchema(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, schema := range []int32{-100, 100} { - t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableStStorage), func(t *testing.T) { - enc := Encoder{EnableSTStorage: enableStStorage} + t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableSTStorage), func(t *testing.T) { + enc := Encoder{EnableSTStorage: enableSTStorage} var output bytes.Buffer logger := 
promslog.New(&promslog.Config{Writer: &output}) @@ -349,10 +349,10 @@ func TestRecord_DecodeInvalidFloatHistogramSchema(t *testing.T) { } func TestRecord_DecodeTooHighResolutionHistogramSchema(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, schema := range []int32{9, 52} { - t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableStStorage), func(t *testing.T) { - enc := Encoder{EnableSTStorage: enableStStorage} + t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableSTStorage), func(t *testing.T) { + enc := Encoder{EnableSTStorage: enableSTStorage} var output bytes.Buffer logger := promslog.New(&promslog.Config{Writer: &output}) @@ -386,10 +386,10 @@ func TestRecord_DecodeTooHighResolutionHistogramSchema(t *testing.T) { } func TestRecord_DecodeTooHighResolutionFloatHistogramSchema(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, schema := range []int32{9, 52} { - t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableStStorage), func(t *testing.T) { - enc := Encoder{EnableSTStorage: enableStStorage} + t.Run(fmt.Sprintf("schema=%d,stStorage=%v", schema, enableSTStorage), func(t *testing.T) { + enc := Encoder{EnableSTStorage: enableSTStorage} var output bytes.Buffer logger := promslog.New(&promslog.Config{Writer: &output}) @@ -425,8 +425,8 @@ func TestRecord_DecodeTooHighResolutionFloatHistogramSchema(t *testing.T) { // TestRecord_Corrupted ensures that corrupted records return the correct error. // Bugfix check for pull/521 and pull/523. 
func TestRecord_Corrupted(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { - enc := Encoder{EnableSTStorage: enableStStorage} + for _, enableSTStorage := range []bool{false, true} { + enc := Encoder{EnableSTStorage: enableSTStorage} dec := NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) t.Run("Test corrupted series record", func(t *testing.T) { @@ -784,13 +784,13 @@ func BenchmarkWAL_HistogramEncoding(b *testing.B) { make: initNHCBRefs, }, } { - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, labelCount := range []int{0, 10, 50} { for _, histograms := range []int{10, 100, 1000} { for _, buckets := range []int{0, 1, 10, 100} { b.Run(fmt.Sprintf("type=%s/labels=%d/histograms=%d/buckets=%d", maker.name, labelCount, histograms, buckets), func(b *testing.B) { series, samples, nhcbs := maker.make(labelCount, histograms, buckets) - enc := Encoder{EnableSTStorage: enableStStorage} + enc := Encoder{EnableSTStorage: enableSTStorage} for b.Loop() { var buf []byte enc.Series(series, buf) diff --git a/tsdb/wlog/checkpoint.go b/tsdb/wlog/checkpoint.go index 86a858e70a..4c4a53e1b4 100644 --- a/tsdb/wlog/checkpoint.go +++ b/tsdb/wlog/checkpoint.go @@ -92,7 +92,7 @@ const CheckpointPrefix = "checkpoint." // segmented format as the original WAL itself. // This makes it easy to read it through the WAL package and concatenate // it with the original WAL. 
-func Checkpoint(logger *slog.Logger, w *WL, from, to int, keep func(id chunks.HeadSeriesRef) bool, mint int64, enableStStorage bool) (*CheckpointStats, error) { +func Checkpoint(logger *slog.Logger, w *WL, from, to int, keep func(id chunks.HeadSeriesRef) bool, mint int64, enableSTStorage bool) (*CheckpointStats, error) { stats := &CheckpointStats{} var sgmReader io.ReadCloser @@ -156,7 +156,7 @@ func Checkpoint(logger *slog.Logger, w *WL, from, to int, keep func(id chunks.He metadata []record.RefMetadata st = labels.NewSymbolTable() // Needed for decoding; labels do not outlive this function. dec = record.NewDecoder(st, logger) - enc = record.Encoder{EnableSTStorage: enableStStorage} + enc = record.Encoder{EnableSTStorage: enableSTStorage} buf []byte recs [][]byte diff --git a/tsdb/wlog/checkpoint_test.go b/tsdb/wlog/checkpoint_test.go index 18a2c2d3dc..b491a27455 100644 --- a/tsdb/wlog/checkpoint_test.go +++ b/tsdb/wlog/checkpoint_test.go @@ -171,12 +171,12 @@ func TestCheckpoint(t *testing.T) { } } - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { dir := t.TempDir() - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} // Create a dummy segment to bump the initial number. 
seg, err := CreateSegment(dir, 100) require.NoError(t, err) @@ -295,7 +295,7 @@ func TestCheckpoint(t *testing.T) { stats, err := Checkpoint(promslog.NewNopLogger(), w, 100, 106, func(x chunks.HeadSeriesRef) bool { return x%2 == 0 - }, last/2, enableStStorage) + }, last/2, enableSTStorage) require.NoError(t, err) require.NoError(t, w.Truncate(107)) require.NoError(t, DeleteCheckpoints(w.Dir(), 106)) @@ -386,13 +386,13 @@ func TestCheckpoint(t *testing.T) { } func TestCheckpointNoTmpFolderAfterError(t *testing.T) { - for _, enableStStorage := range []bool{false, true} { - t.Run("enableStStorage="+strconv.FormatBool(enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + t.Run("enableSTStorage="+strconv.FormatBool(enableSTStorage), func(t *testing.T) { // Create a new wlog with invalid data. dir := t.TempDir() w, err := NewSize(nil, nil, dir, 64*1024, compression.None) require.NoError(t, err) - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} require.NoError(t, w.Log(enc.Series([]record.RefSeries{ {Ref: 0, Labels: labels.FromStrings("a", "b", "c", "2")}, }, nil))) @@ -406,7 +406,7 @@ func TestCheckpointNoTmpFolderAfterError(t *testing.T) { require.NoError(t, f.Close()) // Run the checkpoint and since the wlog contains corrupt data this should return an error. - _, err = Checkpoint(promslog.NewNopLogger(), w, 0, 1, nil, 0, enableStStorage) + _, err = Checkpoint(promslog.NewNopLogger(), w, 0, 1, nil, 0, enableSTStorage) require.Error(t, err) // Walk the wlog dir to make sure there are no tmp folder left behind after the error. 
diff --git a/tsdb/wlog/watcher_test.go b/tsdb/wlog/watcher_test.go index e29aac4d47..cc8bac4d75 100644 --- a/tsdb/wlog/watcher_test.go +++ b/tsdb/wlog/watcher_test.go @@ -145,8 +145,8 @@ func TestTailSamples(t *testing.T) { const exemplarsCount = 25 const histogramsCount = 50 for _, compress := range compression.Types() { - for _, enableStStorage := range []bool{false, true} { - t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + for _, enableSTStorage := range []bool{false, true} { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { now := time.Now() dir := t.TempDir() @@ -155,7 +155,7 @@ func TestTailSamples(t *testing.T) { err := os.Mkdir(wdir, 0o777) require.NoError(t, err) - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) require.NoError(t, err) defer func() { @@ -294,9 +294,9 @@ func TestReadToEndNoCheckpoint(t *testing.T) { const seriesCount = 10 const samplesCount = 250 - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { dir := t.TempDir() wdir := path.Join(dir, "wal") err := os.Mkdir(wdir, 0o777) @@ -310,7 +310,7 @@ func TestReadToEndNoCheckpoint(t *testing.T) { var recs [][]byte - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} for i := range seriesCount { series := enc.Series([]record.RefSeries{ @@ -364,16 +364,16 @@ func TestReadToEndWithCheckpoint(t *testing.T) { const seriesCount = 10 const samplesCount = 250 - for _, enableStStorage := range []bool{false, true} { + for _, 
enableSTStorage := range []bool{false, true} { for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { dir := t.TempDir() wdir := path.Join(dir, "wal") err := os.Mkdir(wdir, 0o777) require.NoError(t, err) - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} w, err := NewSize(nil, nil, wdir, segmentSize, compress) require.NoError(t, err) defer func() { @@ -406,7 +406,7 @@ func TestReadToEndWithCheckpoint(t *testing.T) { } } - Checkpoint(promslog.NewNopLogger(), w, 0, 1, func(chunks.HeadSeriesRef) bool { return true }, 0, enableStStorage) + Checkpoint(promslog.NewNopLogger(), w, 0, 1, func(chunks.HeadSeriesRef) bool { return true }, 0, enableSTStorage) w.Truncate(1) // Write more records after checkpointing. @@ -454,9 +454,9 @@ func TestReadCheckpoint(t *testing.T) { const seriesCount = 10 const samplesCount = 250 - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { dir := t.TempDir() wdir := path.Join(dir, "wal") @@ -467,7 +467,7 @@ func TestReadCheckpoint(t *testing.T) { require.NoError(t, err) require.NoError(t, f.Close()) - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) require.NoError(t, err) t.Cleanup(func() { @@ -499,7 +499,7 @@ func TestReadCheckpoint(t *testing.T) { } _, err = w.NextSegmentSync() require.NoError(t, err) - _, err = Checkpoint(promslog.NewNopLogger(), w, 30, 31, 
func(chunks.HeadSeriesRef) bool { return true }, 0, enableStStorage) + _, err = Checkpoint(promslog.NewNopLogger(), w, 30, 31, func(chunks.HeadSeriesRef) bool { return true }, 0, enableSTStorage) require.NoError(t, err) require.NoError(t, w.Truncate(32)) @@ -529,16 +529,16 @@ func TestReadCheckpointMultipleSegments(t *testing.T) { const seriesCount = 40 const samplesCount = 500 - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { dir := t.TempDir() wdir := path.Join(dir, "wal") err := os.Mkdir(wdir, 0o777) require.NoError(t, err) - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} w, err := NewSize(nil, nil, wdir, pageSize, compress) require.NoError(t, err) @@ -603,26 +603,26 @@ func TestCheckpointSeriesReset(t *testing.T) { const samplesCount = 700 testCases := []struct { compress compression.Type - enableStStorage bool + enableSTStorage bool segments int }{ - {compress: compression.None, enableStStorage: false, segments: 24}, - {compress: compression.Snappy, enableStStorage: false, segments: 23}, - {compress: compression.None, enableStStorage: true, segments: 20}, - {compress: compression.Snappy, enableStStorage: true, segments: 20}, + {compress: compression.None, enableSTStorage: false, segments: 24}, + {compress: compression.Snappy, enableSTStorage: false, segments: 23}, + {compress: compression.None, enableSTStorage: true, segments: 20}, + {compress: compression.Snappy, enableSTStorage: true, segments: 20}, } dir := t.TempDir() for _, tc := range testCases { - t.Run(fmt.Sprintf("compress=%s,stStorage=%v", tc.compress, tc.enableStStorage), func(t *testing.T) { - subdir := filepath.Join(dir, 
fmt.Sprintf("%s-%v", tc.compress, tc.enableStStorage)) + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", tc.compress, tc.enableSTStorage), func(t *testing.T) { + subdir := filepath.Join(dir, fmt.Sprintf("%s-%v", tc.compress, tc.enableSTStorage)) err := os.MkdirAll(subdir, 0o777) require.NoError(t, err) wdir := filepath.Join(subdir, "wal") err = os.MkdirAll(wdir, 0o777) require.NoError(t, err) - enc := record.Encoder{EnableSTStorage: tc.enableStStorage} + enc := record.Encoder{EnableSTStorage: tc.enableSTStorage} w, err := NewSize(nil, nil, wdir, segmentSize, tc.compress) require.NoError(t, err) defer func() { @@ -698,16 +698,16 @@ func TestRun_StartupTime(t *testing.T) { const seriesCount = 40 const samplesCount = 500 - for _, enableStStorage := range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { dir := t.TempDir() wdir := path.Join(dir, "wal") err := os.Mkdir(wdir, 0o777) require.NoError(t, err) - enc := record.Encoder{EnableSTStorage: enableStStorage} + enc := record.Encoder{EnableSTStorage: enableSTStorage} w, err := NewSize(nil, nil, wdir, pageSize, compress) require.NoError(t, err) @@ -752,8 +752,8 @@ func TestRun_StartupTime(t *testing.T) { } } -func generateWALRecords(w *WL, segment, seriesCount, samplesCount int, enableStStorage bool) error { - enc := record.Encoder{EnableSTStorage: enableStStorage} +func generateWALRecords(w *WL, segment, seriesCount, samplesCount int, enableSTStorage bool) error { + enc := record.Encoder{EnableSTStorage: enableSTStorage} for j := range seriesCount { ref := j + (segment * 100) series := enc.Series([]record.RefSeries{ @@ -793,9 +793,9 @@ func TestRun_AvoidNotifyWhenBehind(t *testing.T) { const seriesCount = 10 const samplesCount = 50 - for _, enableStStorage 
:= range []bool{false, true} { + for _, enableSTStorage := range []bool{false, true} { for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableStStorage), func(t *testing.T) { + t.Run(fmt.Sprintf("compress=%s,stStorage=%v", compress, enableSTStorage), func(t *testing.T) { dir := t.TempDir() wdir := path.Join(dir, "wal") @@ -805,7 +805,7 @@ func TestRun_AvoidNotifyWhenBehind(t *testing.T) { w, err := NewSize(nil, nil, wdir, segmentSize, compress) require.NoError(t, err) // Write to 00000000, the watcher will read series from it. - require.NoError(t, generateWALRecords(w, 0, seriesCount, samplesCount, enableStStorage)) + require.NoError(t, generateWALRecords(w, 0, seriesCount, samplesCount, enableSTStorage)) // Create 00000001, the watcher will tail it once started. w.NextSegment() @@ -838,7 +838,7 @@ func TestRun_AvoidNotifyWhenBehind(t *testing.T) { // In the meantime, add some new segments in bulk. // We should end up with segmentsToWrite + 1 segments now. 
for i := 1; i < segmentsToWrite; i++ { - require.NoError(t, generateWALRecords(w, i, seriesCount, samplesCount, enableStStorage)) + require.NoError(t, generateWALRecords(w, i, seriesCount, samplesCount, enableSTStorage)) w.NextSegment() } From 23d2ab447e3762a24f18ecf7fb915f255ae5c8e4 Mon Sep 17 00:00:00 2001 From: Bartlomiej Plotka Date: Wed, 18 Feb 2026 09:15:14 +0000 Subject: [PATCH 26/73] feat[scrape]: add ST parsing support to scrape AppenderV2 flow (#18103) Signed-off-by: bwplotka --- cmd/prometheus/main.go | 2 ++ scrape/manager.go | 11 +++++++++++ scrape/manager_test.go | 3 +++ scrape/scrape.go | 8 +++++++- scrape/scrape_append_v2.go | 4 ++-- 5 files changed, 25 insertions(+), 3 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 763911363b..ad9108a6df 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -269,6 +269,7 @@ func (c *flagConfig) setFeatureListOptions(logger *slog.Logger) error { case "created-timestamp-zero-ingestion": // NOTE(bwplotka): Once AppendableV1 is removed, there will be only the TSDB and agent flags. c.scrape.EnableStartTimestampZeroIngestion = true + c.scrape.ParseST = true c.web.STZeroIngestionEnabled = true c.tsdb.EnableSTAsZeroSample = true c.agent.EnableSTAsZeroSample = true @@ -280,6 +281,7 @@ func (c *flagConfig) setFeatureListOptions(logger *slog.Logger) error { logger.Info("Experimental start timestamp zero ingestion enabled. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) case "st-storage": // TODO(bwplotka): Implement ST Storage as per PROM-60 and document this hidden feature flag. 
+ c.scrape.ParseST = true c.tsdb.EnableSTStorage = true c.agent.EnableSTStorage = true diff --git a/scrape/manager.go b/scrape/manager.go index 24a63b056b..17f7b804a3 100644 --- a/scrape/manager.go +++ b/scrape/manager.go @@ -115,8 +115,19 @@ type Options struct { // Option to enable the ingestion of the created timestamp as a synthetic zero sample. // See: https://github.com/prometheus/proposals/blob/main/proposals/2023-06-13_created-timestamp.md + // + // NOTE: This option has no effect for AppenderV2 and will be removed with the AppenderV1 + // removal. EnableStartTimestampZeroIngestion bool + // ParseST controls if ST should be parsed and appended from the scrape format + // notably from the expensive OpenMetrics 1.0 _created line flow. This adds some + // overhead and can yield wrong ST values on OM1 edge cases. + // + // This only applies to AppenderV2 flow. + // TODO: Move this option to OM1 parser and use only on OM1 flow + ParseST bool + // EnableTypeAndUnitLabels represents type-and-unit-labels feature flag. 
EnableTypeAndUnitLabels bool diff --git a/scrape/manager_test.go b/scrape/manager_test.go index 395cc98a82..137596151b 100644 --- a/scrape/manager_test.go +++ b/scrape/manager_test.go @@ -767,6 +767,7 @@ func TestManagerSTZeroIngestion(t *testing.T) { app := teststorage.NewAppendable() discoveryManager, scrapeManager := runManagers(t, ctx, &Options{ EnableStartTimestampZeroIngestion: testSTZeroIngest, + ParseST: testSTZeroIngest, skipOffsetting: true, }, app, nil) defer scrapeManager.Stop() @@ -953,6 +954,7 @@ func TestManagerSTZeroIngestionHistogram(t *testing.T) { app := teststorage.NewAppendable() discoveryManager, scrapeManager := runManagers(t, ctx, &Options{ EnableStartTimestampZeroIngestion: tc.enableSTZeroIngestion, + ParseST: tc.enableSTZeroIngestion, skipOffsetting: true, }, app, nil) defer scrapeManager.Stop() @@ -1065,6 +1067,7 @@ func TestNHCBAndSTZeroIngestion(t *testing.T) { app := teststorage.NewAppendable() discoveryManager, scrapeManager := runManagers(t, ctx, &Options{ EnableStartTimestampZeroIngestion: true, + ParseST: true, skipOffsetting: true, }, app, nil) defer scrapeManager.Stop() diff --git a/scrape/scrape.go b/scrape/scrape.go index d5a9ba72b4..cd102a23ba 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -870,6 +870,7 @@ type scrapeLoop struct { // Options from scrape.Options. enableSTZeroIngestion bool + parseST bool // Used by AppenderV2 only. enableTypeAndUnitLabels bool reportExtraMetrics bool appendMetadataToWAL bool @@ -1223,7 +1224,12 @@ func newScrapeLoop(opts scrapeLoopOptions) *scrapeLoop { validationScheme: opts.sp.config.MetricNameValidationScheme, // scrape.Options. - enableSTZeroIngestion: opts.sp.options.EnableStartTimestampZeroIngestion, + enableSTZeroIngestion: opts.sp.options.EnableStartTimestampZeroIngestion, + // parseST was added recently. Before EnableStartTimestampZeroIngestion + // was enabling parsing ST. 
For non-Prometheus users of the scrape + // manager, we ensure appenderV2 parseST is set on EnableStartTimestampZeroIngestion + // This will be removed when EnableStartTimestampZeroIngestion is removed. + parseST: opts.sp.options.ParseST || opts.sp.options.EnableStartTimestampZeroIngestion, enableTypeAndUnitLabels: opts.sp.options.EnableTypeAndUnitLabels, appendMetadataToWAL: opts.sp.options.AppendMetadata, passMetadataInContext: opts.sp.options.PassMetadataInContext, diff --git a/scrape/scrape_append_v2.go b/scrape/scrape_append_v2.go index 64969707e1..825e56f9df 100644 --- a/scrape/scrape_append_v2.go +++ b/scrape/scrape_append_v2.go @@ -102,7 +102,7 @@ func (sl *scrapeLoopAppenderV2) append(b []byte, contentType string, ts time.Tim IgnoreNativeHistograms: !sl.enableNativeHistogramScraping, ConvertClassicHistogramsToNHCB: sl.convertClassicHistToNHCB, KeepClassicOnClassicAndNativeHistograms: sl.alwaysScrapeClassicHist, - OpenMetricsSkipSTSeries: sl.enableSTZeroIngestion, + OpenMetricsSkipSTSeries: sl.parseST, FallbackContentType: sl.fallbackScrapeProtocol, }) if p == nil { @@ -254,7 +254,7 @@ loop: } st := int64(0) - if sl.enableSTZeroIngestion { + if sl.parseST { // p.StartTimestamp() tend to be expensive (e.g. OM1). Do it only if we care. st = p.StartTimestamp() } From e25dc289be2a7c592653b9a11d43b25db4f1d4bf Mon Sep 17 00:00:00 2001 From: George Krajcsovits Date: Wed, 18 Feb 2026 10:56:03 +0100 Subject: [PATCH 27/73] feat(tsdb): change head opt EnableSTStorage to atomic (#18107) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In downstream projects this needs to be set dynamically per tenant. 
Signed-off-by: György Krajcsovits --- tsdb/head.go | 14 +++++++------- tsdb/head_wal.go | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tsdb/head.go b/tsdb/head.go index 19c2538b12..e88a5e0803 100644 --- a/tsdb/head.go +++ b/tsdb/head.go @@ -160,6 +160,11 @@ type HeadOptions struct { OutOfOrderTimeWindow atomic.Int64 OutOfOrderCapMax atomic.Int64 + // EnableSTStorage determines whether databases (WAL/WBL, tsdb, + // agent) should set a Start Time value per sample. Currently not + // user-settable and only set in tests. + EnableSTStorage atomic.Bool + ChunkRange int64 // ChunkDirRoot is the parent directory of the chunks directory. ChunkDirRoot string @@ -200,11 +205,6 @@ type HeadOptions struct { // NOTE(bwplotka): This feature might be deprecated and removed once PROM-60 // is implemented. EnableMetadataWALRecords bool - - // EnableSTStorage determines whether agent DB should write a Start Timestamp (ST) - // per sample to WAL. - // TODO(bwplotka): Implement this option as per PROM-60, currently it's noop. 
- EnableSTStorage bool } const ( @@ -1386,7 +1386,7 @@ func (h *Head) truncateWAL(mint int64) error { } h.metrics.checkpointCreationTotal.Inc() - if _, err = wlog.Checkpoint(h.logger, h.wal, first, last, h.keepSeriesInWALCheckpointFn(mint), mint, h.opts.EnableSTStorage); err != nil { + if _, err = wlog.Checkpoint(h.logger, h.wal, first, last, h.keepSeriesInWALCheckpointFn(mint), mint, h.opts.EnableSTStorage.Load()); err != nil { h.metrics.checkpointCreationFail.Inc() var cerr *chunks.CorruptionErr if errors.As(err, &cerr) { @@ -1680,7 +1680,7 @@ func (h *Head) Delete(ctx context.Context, mint, maxt int64, ms ...*labels.Match } if h.wal != nil { - enc := record.Encoder{EnableSTStorage: h.opts.EnableSTStorage} + enc := record.Encoder{EnableSTStorage: h.opts.EnableSTStorage.Load()} if err := h.wal.Log(enc.Tombstones(stones, nil)); err != nil { return err } diff --git a/tsdb/head_wal.go b/tsdb/head_wal.go index 6e9b80060c..0a54ae3878 100644 --- a/tsdb/head_wal.go +++ b/tsdb/head_wal.go @@ -1400,7 +1400,7 @@ func (h *Head) ChunkSnapshot() (*ChunkSnapshotStats, error) { // Assuming 100 bytes (overestimate) per exemplar, that's ~1MB. 
maxExemplarsPerRecord := 10000 batch := make([]record.RefExemplar, 0, maxExemplarsPerRecord) - enc := record.Encoder{EnableSTStorage: h.opts.EnableSTStorage} + enc := record.Encoder{EnableSTStorage: h.opts.EnableSTStorage.Load()} flushExemplars := func() error { if len(batch) == 0 { return nil From 8d8371244b77a096625b84229b812f1d4d9fac20 Mon Sep 17 00:00:00 2001 From: Bartlomiej Plotka Date: Wed, 18 Feb 2026 12:26:17 +0000 Subject: [PATCH 28/73] Merge pull request #18108 from prometheus/bwplotka/fix scrape: add tests for ST appending; add warnings for ST feature flag users around _created drop --- cmd/prometheus/main.go | 4 +- scrape/helpers_test.go | 4 +- scrape/manager.go | 17 ++++-- scrape/scrape_test.go | 132 +++++++++++++++++++++++++++++++---------- 4 files changed, 118 insertions(+), 39 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index ad9108a6df..3910991148 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -278,7 +278,7 @@ func (c *flagConfig) setFeatureListOptions(logger *slog.Logger) error { // This is to widen the ST support surface. config.DefaultConfig.GlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols config.DefaultGlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols - logger.Info("Experimental start timestamp zero ingestion enabled. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) + logger.Info("Experimental start timestamp zero ingestion enabled. OpenMetrics 1.0 parsing will parse _created metrics as ST instead of normal sample. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) case "st-storage": // TODO(bwplotka): Implement ST Storage as per PROM-60 and document this hidden feature flag. 
c.scrape.ParseST = true @@ -288,7 +288,7 @@ func (c *flagConfig) setFeatureListOptions(logger *slog.Logger) error { // Change relevant global variables. Hacky, but it's hard to pass a new option or default to unmarshallers. This is to widen the ST support surface. config.DefaultConfig.GlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols config.DefaultGlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols - logger.Info("Experimental start timestamp storage enabled. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) + logger.Info("Experimental start timestamp storage enabled. OpenMetrics 1.0 parsing will parse _created metrics as ST instead of normal sample. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) case "delayed-compaction": c.tsdb.EnableDelayedCompaction = true logger.Info("Experimental delayed compaction is enabled.") diff --git a/scrape/helpers_test.go b/scrape/helpers_test.go index 1db229561d..cfc0b92532 100644 --- a/scrape/helpers_test.go +++ b/scrape/helpers_test.go @@ -98,7 +98,9 @@ func newTestScrapeLoop(t testing.TB, opts ...func(sl *scrapeLoop)) (_ *scrapeLoo enableCompression: true, validationScheme: model.UTF8Validation, symbolTable: labels.NewSymbolTable(), - appendMetadataToWAL: true, // Tests assumes it's enabled, unless explicitly turned off. + // Tests assume those features are enabled, unless explicitly turned off. + appendMetadataToWAL: true, + parseST: true, } for _, o := range opts { o(sl) diff --git a/scrape/manager.go b/scrape/manager.go index 17f7b804a3..29b8a53d77 100644 --- a/scrape/manager.go +++ b/scrape/manager.go @@ -120,12 +120,19 @@ type Options struct { // removal. 
EnableStartTimestampZeroIngestion bool - // ParseST controls if ST should be parsed and appended from the scrape format - // notably from the expensive OpenMetrics 1.0 _created line flow. This adds some - // overhead and can yield wrong ST values on OM1 edge cases. + // ParseST controls if ST should be parsed and appended from the scrape formats. + // This should be by default true, but it's opt-in for OpenMetrics (OM) 1.0 reasons and might be moved + // to OM 1.0 only flow. // - // This only applies to AppenderV2 flow. - // TODO: Move this option to OM1 parser and use only on OM1 flow + // Specifically for OpenMetrics 1.0 flow, it can have some additional effects that might not be desired for non-ST users: + // + // * OpenMetrics 1.0 _created series will be parsed as ST instead of normal sample. Could be breaking + // if downstream user depends on _created metric. TODO(bwplotka): Add "preserveOMLines" hidden option? + // * Add relatively small (but still) overhead. + // * Can yield wrong ST values in rare edge cases (unknown metadata and metric name collisions). + // + // This only applies to AppenderV2 flow (Prometheus default). + // TODO: Move this option to OM1 parser and use only on OM1 flow. ParseST bool // EnableTypeAndUnitLabels represents type-and-unit-labels feature flag. diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index cab2b2918a..83c150a091 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -1545,6 +1545,14 @@ func TestPromTextToProto(t *testing.T) { require.Equal(t, "promhttp_metric_handler_requests_total", got[236]) } +func seriesPerHistogramFor100HistsWithExemplars(appV2 bool) int { + if appV2 { + // AppenderV2 with parseST enabled, uses _created lines for ST instead of samples. + return 23 + } + return 24 +} + // TestScrapeLoopAppend_WithStorage tests appends and storage integration for the // large input files that are also used in benchmarks. 
func TestScrapeLoopAppend_WithStorage(t *testing.T) { @@ -1630,8 +1638,13 @@ func TestScrapeLoopAppend_WithStorage(t *testing.T) { name: "100HistsWithExemplars", parsableText: makeTestHistogramsWithExemplars(100), - expectedSamplesLen: 24 * 100, + expectedSamplesLen: seriesPerHistogramFor100HistsWithExemplars(appV2) * 100, testAppendedSamples: func(t *testing.T, committed []sample) { + st := int64(0) + if appV2 { + st = 1726839813016 + } + // Verify a few samples. m := metadata.Metadata{Type: model.MetricTypeHistogram, Help: "RPC latency distributions."} testutil.RequireEqual(t, sample{ @@ -1641,7 +1654,7 @@ func TestScrapeLoopAppend_WithStorage(t *testing.T) { } return "rpc_durations_histogram0_seconds" }(), - M: m, L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram0_seconds_bucket", "le", "0.0003100000000000002"), V: 15, T: timestamp.FromTime(ts), + M: m, L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram0_seconds_bucket", "le", "0.0003100000000000002"), V: 15, ST: st, T: timestamp.FromTime(ts), ES: []exemplar.Exemplar{ {Labels: labels.FromStrings("dummyID", "9818"), Value: 0.0002791130914009552, Ts: 1726839814982, HasTs: true}, }, @@ -1653,17 +1666,24 @@ func TestScrapeLoopAppend_WithStorage(t *testing.T) { } return "rpc_durations_histogram49_seconds" }(), - M: m, L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram49_seconds_sum"), V: -8.452185437166741e-05, T: timestamp.FromTime(ts), - }, committed[24*50-3]) + M: m, L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram49_seconds_sum"), V: -8.452185437166741e-05, ST: st, T: timestamp.FromTime(ts), + }, committed[seriesPerHistogramFor100HistsWithExemplars(appV2)*49+21]) - // This series does not have metadata, nor metric family, because of isSeriesPartOfFamily bug and OpenMetric 1.0 limitations around _created series. 
- // TODO(bwplotka): Fix with https://github.com/prometheus/prometheus/issues/17900 - testutil.RequireEqual(t, sample{ - L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram99_seconds_created"), V: 1.726839813016302e+09, T: timestamp.FromTime(ts), - }, committed[len(committed)-1]) + if !appV2 { + // This series does not have metadata, nor metric family, because of isSeriesPartOfFamily bug and OpenMetric 1.0 limitations around _created series. + // TODO(bwplotka): Fix with https://github.com/prometheus/prometheus/issues/17900 + testutil.RequireEqual(t, sample{ + L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram99_seconds_created"), V: 1.726839813016302e+09, T: timestamp.FromTime(ts), + }, committed[len(committed)-1]) + } else { + testutil.RequireEqual(t, sample{ + MF: "rpc_durations_histogram99_seconds", + M: m, L: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram99_seconds_count"), V: 15, ST: st, T: timestamp.FromTime(ts), + }, committed[len(committed)-1]) + } }, testExemplars: func(t *testing.T, er []exemplar.QueryResult) { - // 12 out of 24 histogram series have exemplars. + // 12 out of 23/24 histogram series have exemplars. 
require.Len(t, er, 12*100) testutil.RequireEqual(t, exemplar.QueryResult{ SeriesLabels: labels.FromStrings(model.MetricNameLabel, "rpc_durations_histogram0_seconds_bucket", "le", "0.0003100000000000002"), @@ -2900,6 +2920,11 @@ func TestScrapeLoopAppend(t *testing.T) { } func testScrapeLoopAppend(t *testing.T, appV2 bool) { + st := int64(0) + if appV2 { + st = 111111001 + } + for _, test := range []struct { title string alwaysScrapeClassicHist bool @@ -2952,6 +2977,32 @@ func testScrapeLoopAppend(t *testing.T, appV2 bool) { ES: []exemplar.Exemplar{{Labels: labels.FromStrings("a", "abc"), Value: 1, Ts: 10000000, HasTs: true}}, }}, }, + { + title: "Metric with ST", + scrapeText: `# TYPE metric counter +metric_total{n="1"} 1.1 +metric_created{n="1"} 9999.999 +# EOF`, + contentType: "application/openmetrics-text", + samples: func() []sample { + if !appV2 { + return []sample{ + { + L: labels.FromStrings("__name__", "metric_total", "n", "1"), + V: 1.1, + }, + { + L: labels.FromStrings("__name__", "metric_created", "n", "1"), + V: 9999.999, + }, + } + } + return []sample{{ + L: labels.FromStrings("__name__", "metric_total", "n", "1"), + ST: 9999999, V: 1.1, + }} + }(), + }, { title: "Two metrics and exemplars", scrapeText: `metric_total{n="1"} 1 # {t="1"} 1.0 10000 @@ -2969,7 +3020,7 @@ metric_total{n="2"} 2 # {t="2"} 2.0 20000 }}, }, { - title: "Native histogram with three exemplars from classic buckets", + title: "Native histogram with ST and three exemplars from classic buckets", enableNativeHistogramsIngestion: true, scrapeText: `name: "test_histogram" @@ -2977,6 +3028,10 @@ help: "Test histogram with many buckets removed to keep it manageable in size." 
type: HISTOGRAM metric: < histogram: < + created_timestamp: < + seconds: 111111 + nanos: 1000000 + > sample_count: 175 sample_sum: 0.0008280461746287094 bucket: < @@ -3059,8 +3114,9 @@ metric: < `, contentType: "application/vnd.google.protobuf", samples: []sample{{ - T: 1234568, - L: labels.FromStrings("__name__", "test_histogram"), + T: 1234568, + ST: st, + L: labels.FromStrings("__name__", "test_histogram"), H: &histogram.Histogram{ Count: 175, ZeroCount: 2, @@ -3086,7 +3142,7 @@ metric: < }}, }, { - title: "Native histogram with three exemplars scraped as classic histogram", + title: "Native histogram with ST and three exemplars scraped as classic histogram", enableNativeHistogramsIngestion: true, scrapeText: `name: "test_histogram" @@ -3094,6 +3150,10 @@ help: "Test histogram with many buckets removed to keep it manageable in size." type: HISTOGRAM metric: < histogram: < + created_timestamp: < + seconds: 111111 + nanos: 1000000 + > sample_count: 175 sample_sum: 0.0008280461746287094 bucket: < @@ -3178,8 +3238,9 @@ metric: < contentType: "application/vnd.google.protobuf", samples: []sample{ { - T: 1234568, - L: labels.FromStrings("__name__", "test_histogram"), + T: 1234568, + ST: st, + L: labels.FromStrings("__name__", "test_histogram"), H: &histogram.Histogram{ Count: 175, ZeroCount: 2, @@ -3204,26 +3265,26 @@ metric: < {Labels: labels.FromStrings("dummyID", "59727"), Value: -0.00039, Ts: 1625851155146, HasTs: true}, }, }, - {L: labels.FromStrings("__name__", "test_histogram_count"), T: 1234568, V: 175}, - {L: labels.FromStrings("__name__", "test_histogram_sum"), T: 1234568, V: 0.0008280461746287094}, - {L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0004899999999999998"), T: 1234568, V: 2}, + {L: labels.FromStrings("__name__", "test_histogram_count"), ST: st, T: 1234568, V: 175}, + {L: labels.FromStrings("__name__", "test_histogram_sum"), ST: st, T: 1234568, V: 0.0008280461746287094}, + {L: labels.FromStrings("__name__", 
"test_histogram_bucket", "le", "-0.0004899999999999998"), ST: st, T: 1234568, V: 2}, { - L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0003899999999999998"), T: 1234568, V: 4, + L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0003899999999999998"), ST: st, T: 1234568, V: 4, ES: []exemplar.Exemplar{{Labels: labels.FromStrings("dummyID", "59727"), Value: -0.00039, Ts: 1625851155146, HasTs: true}}, }, { - L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0002899999999999998"), T: 1234568, V: 16, + L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0002899999999999998"), ST: st, T: 1234568, V: 16, ES: []exemplar.Exemplar{{Labels: labels.FromStrings("dummyID", "5617"), Value: -0.00029, Ts: 1234568, HasTs: false}}, }, { - L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0001899999999999998"), T: 1234568, V: 32, + L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "-0.0001899999999999998"), ST: st, T: 1234568, V: 32, ES: []exemplar.Exemplar{{Labels: labels.FromStrings("dummyID", "58215"), Value: -0.00019, Ts: 1625851055146, HasTs: true}}, }, - {L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "+Inf"), T: 1234568, V: 175}, + {L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "+Inf"), ST: st, T: 1234568, V: 175}, }, }, { - title: "Native histogram with exemplars and no classic buckets", + title: "Native histogram with ST, exemplars and no classic buckets", contentType: "application/vnd.google.protobuf", enableNativeHistogramsIngestion: true, scrapeText: `name: "test_histogram" @@ -3231,6 +3292,10 @@ help: "Test histogram." 
type: HISTOGRAM metric: < histogram: < + created_timestamp: < + seconds: 111111 + nanos: 1000000 + > sample_count: 175 sample_sum: 0.0008280461746287094 schema: 3 @@ -3296,8 +3361,9 @@ metric: < `, samples: []sample{{ - T: 1234568, - L: labels.FromStrings("__name__", "test_histogram"), + T: 1234568, + ST: st, + L: labels.FromStrings("__name__", "test_histogram"), H: &histogram.Histogram{ Count: 175, ZeroCount: 2, @@ -3323,7 +3389,7 @@ metric: < }}, }, { - title: "Native histogram with exemplars but ingestion disabled", + title: "Native histogram with ST, exemplars but ingestion disabled", contentType: "application/vnd.google.protobuf", enableNativeHistogramsIngestion: false, scrapeText: `name: "test_histogram" @@ -3331,6 +3397,10 @@ help: "Test histogram." type: HISTOGRAM metric: < histogram: < + created_timestamp: < + seconds: 111111 + nanos: 1000000 + > sample_count: 175 sample_sum: 0.0008280461746287094 schema: 3 @@ -3396,9 +3466,9 @@ metric: < `, samples: []sample{ - {L: labels.FromStrings("__name__", "test_histogram_count"), T: 1234568, V: 175}, - {L: labels.FromStrings("__name__", "test_histogram_sum"), T: 1234568, V: 0.0008280461746287094}, - {L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "+Inf"), T: 1234568, V: 175}, + {L: labels.FromStrings("__name__", "test_histogram_count"), ST: st, T: 1234568, V: 175}, + {L: labels.FromStrings("__name__", "test_histogram_sum"), ST: st, T: 1234568, V: 0.0008280461746287094}, + {L: labels.FromStrings("__name__", "test_histogram_bucket", "le", "+Inf"), ST: st, T: 1234568, V: 175}, }, }, } { @@ -3420,7 +3490,7 @@ metric: < // This test does not care about metadata. // Having this true would mean we need to add metadata to sample // expectations. - // TODO(bwplotka): Add cases for append metadata to WAL and pass metadata + // TODO(bwplotka): Add cases for append metadata to WAL and pass metadata. 
sl.appendMetadataToWAL = false }) app := sl.appender() From d1220defd3e149cf8b88042e81a59f7f3fea8a3b Mon Sep 17 00:00:00 2001 From: George Krajcsovits Date: Wed, 18 Feb 2026 15:23:37 +0100 Subject: [PATCH 29/73] refact(tsdb): trivial rename (#18109) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/head_append.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tsdb/head_append.go b/tsdb/head_append.go index 846ad476e3..17bb5de835 100644 --- a/tsdb/head_append.go +++ b/tsdb/head_append.go @@ -1178,7 +1178,7 @@ type appenderCommitContext struct { oooRecords [][]byte oooCapMax int64 appendChunkOpts chunkOpts - enc record.Encoder + oooEnc record.Encoder } // commitExemplars adds all exemplars from the provided batch to the head's exemplar storage. @@ -1228,31 +1228,31 @@ func (acc *appenderCommitContext) collectOOORecords(a *headAppenderBase) { }) } } - r := acc.enc.MmapMarkers(markers, a.head.getBytesBuffer()) + r := acc.oooEnc.MmapMarkers(markers, a.head.getBytesBuffer()) acc.oooRecords = append(acc.oooRecords, r) } if len(acc.wblSamples) > 0 { - r := acc.enc.Samples(acc.wblSamples, a.head.getBytesBuffer()) + r := acc.oooEnc.Samples(acc.wblSamples, a.head.getBytesBuffer()) acc.oooRecords = append(acc.oooRecords, r) } if len(acc.wblHistograms) > 0 { - r, customBucketsHistograms := acc.enc.HistogramSamples(acc.wblHistograms, a.head.getBytesBuffer()) + r, customBucketsHistograms := acc.oooEnc.HistogramSamples(acc.wblHistograms, a.head.getBytesBuffer()) if len(r) > 0 { acc.oooRecords = append(acc.oooRecords, r) } if len(customBucketsHistograms) > 0 { - r := acc.enc.CustomBucketsHistogramSamples(customBucketsHistograms, a.head.getBytesBuffer()) + r := acc.oooEnc.CustomBucketsHistogramSamples(customBucketsHistograms, a.head.getBytesBuffer()) acc.oooRecords = append(acc.oooRecords, r) } } if len(acc.wblFloatHistograms) > 0 { - r, customBucketsFloatHistograms 
:= acc.enc.FloatHistogramSamples(acc.wblFloatHistograms, a.head.getBytesBuffer()) + r, customBucketsFloatHistograms := acc.oooEnc.FloatHistogramSamples(acc.wblFloatHistograms, a.head.getBytesBuffer()) if len(r) > 0 { acc.oooRecords = append(acc.oooRecords, r) } if len(customBucketsFloatHistograms) > 0 { - r := acc.enc.CustomBucketsFloatHistogramSamples(customBucketsFloatHistograms, a.head.getBytesBuffer()) + r := acc.oooEnc.CustomBucketsFloatHistogramSamples(customBucketsFloatHistograms, a.head.getBytesBuffer()) acc.oooRecords = append(acc.oooRecords, r) } } @@ -1742,7 +1742,7 @@ func (a *headAppenderBase) Commit() (err error) { chunkRange: h.chunkRange.Load(), samplesPerChunk: h.opts.SamplesPerChunk, }, - enc: record.Encoder{ + oooEnc: record.Encoder{ EnableSTStorage: false, }, } From dc8613df5408bd6960649a36ef174761334ea4d6 Mon Sep 17 00:00:00 2001 From: George Krajcsovits Date: Wed, 18 Feb 2026 18:49:25 +0100 Subject: [PATCH 30/73] fix(tsdb): missing passing head option to wal/wbl write (#18113) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/head_append.go | 4 +- tsdb/head_test.go | 114 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 2 deletions(-) diff --git a/tsdb/head_append.go b/tsdb/head_append.go index 17bb5de835..e673466ccc 100644 --- a/tsdb/head_append.go +++ b/tsdb/head_append.go @@ -1059,7 +1059,7 @@ func (a *headAppenderBase) log() error { defer func() { a.head.putBytesBuffer(buf) }() var rec []byte - var enc record.Encoder + enc := record.Encoder{EnableSTStorage: a.head.opts.EnableSTStorage.Load()} if len(a.seriesRefs) > 0 { rec = enc.Series(a.seriesRefs, buf) @@ -1743,7 +1743,7 @@ func (a *headAppenderBase) Commit() (err error) { samplesPerChunk: h.opts.SamplesPerChunk, }, oooEnc: record.Encoder{ - EnableSTStorage: false, + EnableSTStorage: h.opts.EnableSTStorage.Load(), }, } diff --git a/tsdb/head_test.go b/tsdb/head_test.go index 
2cee989e40..91cd742330 100644 --- a/tsdb/head_test.go +++ b/tsdb/head_test.go @@ -7267,3 +7267,117 @@ func TestHistogramStalenessConversionMetrics(t *testing.T) { }) } } + +// TestHeadAppender_WALEncoder_EnableSTStorage verifies that when EnableSTStorage +// is true the WAL encoder writes SamplesV2 records, and when false it writes +// plain Samples (V1) records. The bug was that log() always created a zero-value +// record.Encoder (EnableSTStorage=false), ignoring the head option. +func TestHeadAppender_WALEncoder_EnableSTStorage(t *testing.T) { + for _, enableST := range []bool{false, true} { + t.Run(fmt.Sprintf("enableSTStorage=%v", enableST), func(t *testing.T) { + opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) + opts.EnableSTStorage.Store(enableST) + h, w := newTestHeadWithOptions(t, compression.None, opts) + + lbls := labels.FromStrings("foo", "bar") + app := h.AppenderV2(context.Background()) + for ts := int64(100); ts < 110; ts++ { + _, err := app.Append(0, lbls, 0, ts, float64(ts), nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + require.NoError(t, h.Close()) + + // Read WAL segments directly and check the sample record type. 
+ sr, err := wlog.NewSegmentsReader(w.Dir()) + require.NoError(t, err) + defer func() { require.NoError(t, sr.Close()) }() + + dec := record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) + r := wlog.NewReader(sr) + + var foundSampleRecord bool + for r.Next() { + rt := dec.Type(r.Record()) + switch rt { + case record.Samples: + require.False(t, enableST, "WAL contains Samples (V1) record but EnableSTStorage=true, expected SamplesV2") + foundSampleRecord = true + case record.SamplesV2: + require.True(t, enableST, "WAL contains SamplesV2 record but EnableSTStorage=false, expected Samples (V1)") + foundSampleRecord = true + } + } + require.NoError(t, r.Err()) + require.True(t, foundSampleRecord, "no sample record found in WAL") + }) + } +} + +// TestHeadAppender_WBLEncoder_EnableSTStorage verifies that when EnableSTStorage +// is true the WBL encoder writes SamplesV2 records for out-of-order samples, and +// when false it writes plain Samples (V1) records. The bug was that collectOOORecords() +// always created record.Encoder{EnableSTStorage: false}, ignoring the head option. 
+func TestHeadAppender_WBLEncoder_EnableSTStorage(t *testing.T) { + for _, enableST := range []bool{false, true} { + t.Run(fmt.Sprintf("enableSTStorage=%v", enableST), func(t *testing.T) { + dir := t.TempDir() + wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compression.None) + require.NoError(t, err) + wbl, err := wlog.NewSize(nil, nil, filepath.Join(dir, wlog.WblDirName), 32768, compression.None) + require.NoError(t, err) + + opts := DefaultHeadOptions() + opts.ChunkRange = DefaultBlockDuration + opts.ChunkDirRoot = dir + opts.OutOfOrderTimeWindow.Store(60 * time.Minute.Milliseconds()) + opts.EnableSTStorage.Store(enableST) + + h, err := NewHead(nil, nil, wal, wbl, opts, nil) + require.NoError(t, err) + require.NoError(t, h.Init(0)) + t.Cleanup(func() { _ = h.Close() }) + + lbls := labels.FromStrings("foo", "bar") + + // Append an in-order sample to establish head maxt. + app := h.AppenderV2(context.Background()) + _, err = app.Append(0, lbls, 0, 200, 200, nil, nil, storage.AOptions{}) + require.NoError(t, err) + require.NoError(t, app.Commit()) + + // Append OOO samples; these are written to the WBL. + app = h.AppenderV2(context.Background()) + for ts := int64(100); ts < 110; ts++ { + _, err = app.Append(0, lbls, 0, ts, float64(ts), nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + + require.NoError(t, h.Close()) + + // Read WBL segments directly and check the sample record type. 
+ sr, err := wlog.NewSegmentsReader(filepath.Join(dir, wlog.WblDirName)) + require.NoError(t, err) + defer func() { require.NoError(t, sr.Close()) }() + + dec := record.NewDecoder(labels.NewSymbolTable(), promslog.NewNopLogger()) + r := wlog.NewReader(sr) + + var foundSampleRecord bool + for r.Next() { + rt := dec.Type(r.Record()) + switch rt { + case record.Samples: + require.False(t, enableST, "WBL contains Samples (V1) record but EnableSTStorage=true, expected SamplesV2") + foundSampleRecord = true + case record.SamplesV2: + require.True(t, enableST, "WBL contains SamplesV2 record but EnableSTStorage=false, expected Samples (V1)") + foundSampleRecord = true + } + } + require.NoError(t, r.Err()) + require.True(t, foundSampleRecord, "no sample record found in WBL") + }) + } +} From 223f016c44d91947c30aed77718c71f796ad8017 Mon Sep 17 00:00:00 2001 From: George Krajcsovits Date: Fri, 20 Feb 2026 09:15:51 +0100 Subject: [PATCH 31/73] feat(tsdb): allow using ST capable XOR chunks - retain format on read (#18013) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(tsdb): allow appending to ST capable XOR chunk optionally Only for float samples as of now. Supports for in-order and out-of-order samples. Make sure that on readout the ST capable chunks are returned automatically. When the chunks are returned as is, this is trivially true. When a chunk needs to be re-coded due to deletion (tombstone) markers, we take the encoding of the original chunk. When a chunk needs to be created from overlapping chunks, we observe whether ST is zero or not and create the new chunk based on that. 
Signed-off-by: György Krajcsovits --- storage/series.go | 19 +- tsdb/chunkenc/chunk.go | 20 +- tsdb/chunks/chunks.go | 16 +- tsdb/chunks/chunks_test.go | 33 +++ tsdb/db.go | 1 + tsdb/db_append_v2_test.go | 165 +++++++++++++++ tsdb/head.go | 4 +- tsdb/head_append.go | 63 +++--- tsdb/head_append_v2.go | 10 +- tsdb/head_append_v2_test.go | 144 ++++++++++++- tsdb/head_read_test.go | 2 +- tsdb/head_test.go | 390 +++++++++++++++++++++++++++++++++--- tsdb/head_wal.go | 23 ++- tsdb/ooo_head.go | 49 ++--- tsdb/ooo_head_read.go | 10 +- tsdb/ooo_head_test.go | 208 ++++++++++++++++--- tsdb/querier.go | 93 ++++----- tsdb/querier_test.go | 201 +++++++++++++++++++ tsdb/record/record_test.go | 2 +- 19 files changed, 1234 insertions(+), 219 deletions(-) diff --git a/storage/series.go b/storage/series.go index bf6df7db3e..e51f8cfd96 100644 --- a/storage/series.go +++ b/storage/series.go @@ -341,11 +341,14 @@ func (s *seriesToChunkEncoder) Iterator(it chunks.Iterator) chunks.Iterator { i := 0 seriesIter := s.Series.Iterator(nil) lastType := chunkenc.ValNone + lastHadST := false for typ := seriesIter.Next(); typ != chunkenc.ValNone; typ = seriesIter.Next() { - if typ != lastType || i >= seriesToChunkEncoderSplit { + st := seriesIter.AtST() + hasST := st != 0 + if typ != lastType || lastHadST != hasST || i >= seriesToChunkEncoderSplit { // Create a new chunk if the sample type changed or too many samples in the current one. 
chks = appendChunk(chks, mint, maxt, chk) - chk, err = chunkenc.NewEmptyChunk(typ.ChunkEncoding()) + chk, err = typ.NewChunk(hasST) if err != nil { return errChunksIterator{err: err} } @@ -358,21 +361,20 @@ func (s *seriesToChunkEncoder) Iterator(it chunks.Iterator) chunks.Iterator { i = 0 } lastType = typ + lastHadST = hasST var ( - st, t int64 - v float64 - h *histogram.Histogram - fh *histogram.FloatHistogram + t int64 + v float64 + h *histogram.Histogram + fh *histogram.FloatHistogram ) switch typ { case chunkenc.ValFloat: t, v = seriesIter.At() - st = seriesIter.AtST() app.Append(st, t, v) case chunkenc.ValHistogram: t, h = seriesIter.AtHistogram(nil) - st = seriesIter.AtST() newChk, recoded, app, err = app.AppendHistogram(nil, st, t, h, false) if err != nil { return errChunksIterator{err: err} @@ -388,7 +390,6 @@ func (s *seriesToChunkEncoder) Iterator(it chunks.Iterator) chunks.Iterator { } case chunkenc.ValFloatHistogram: t, fh = seriesIter.AtFloatHistogram(nil) - st = seriesIter.AtST() newChk, recoded, app, err = app.AppendFloatHistogram(nil, st, t, fh, false) if err != nil { return errChunksIterator{err: err} diff --git a/tsdb/chunkenc/chunk.go b/tsdb/chunkenc/chunk.go index 6fb8de2a77..de5fa0c2de 100644 --- a/tsdb/chunkenc/chunk.go +++ b/tsdb/chunkenc/chunk.go @@ -76,6 +76,8 @@ type Chunk interface { Bytes() []byte // Encoding returns the encoding type of the chunk. + // If the chunk is capable of storing ST (start timestamps), it should + // return the appropriate encoding type (e.g., EncXOROptST). Encoding() Encoding // Appender returns an appender to append samples to the chunk. 
@@ -189,9 +191,12 @@ func (v ValueType) String() string { } } -func (v ValueType) ChunkEncoding() Encoding { +func (v ValueType) ChunkEncoding(storeST bool) Encoding { switch v { case ValFloat: + if storeST { + return EncXOROptST + } return EncXOR case ValHistogram: return EncHistogram @@ -202,17 +207,8 @@ func (v ValueType) ChunkEncoding() Encoding { } } -func (v ValueType) NewChunk() (Chunk, error) { - switch v { - case ValFloat: - return NewXORChunk(), nil - case ValHistogram: - return NewHistogramChunk(), nil - case ValFloatHistogram: - return NewFloatHistogramChunk(), nil - default: - return nil, fmt.Errorf("value type %v unsupported", v) - } +func (v ValueType) NewChunk(storeST bool) (Chunk, error) { + return NewEmptyChunk(v.ChunkEncoding(storeST)) } // MockSeriesIterator returns an iterator for a mock series with custom diff --git a/tsdb/chunks/chunks.go b/tsdb/chunks/chunks.go index 9b4e011562..6084f7148e 100644 --- a/tsdb/chunks/chunks.go +++ b/tsdb/chunks/chunks.go @@ -135,7 +135,9 @@ type Meta struct { } // ChunkFromSamples requires all samples to have the same type. -// TODO(krajorama): test with ST when chunk formats support it. +// It is not efficient and meant for testing purposes only. +// It scans the samples to determine whether any sample has ST set and +// creates a chunk accordingly. func ChunkFromSamples(s []Sample) (Meta, error) { return ChunkFromSamplesGeneric(SampleSlice(s)) } @@ -154,7 +156,17 @@ func ChunkFromSamplesGeneric(s Samples) (Meta, error) { } sampleType := s.Get(0).Type() - c, err := chunkenc.NewEmptyChunk(sampleType.ChunkEncoding()) + + hasST := false + for i := range s.Len() { + if s.Get(i).ST() != 0 { + hasST = true + break + } + } + + // Request storing ST in the chunk if available. 
+ c, err := sampleType.NewChunk(hasST) if err != nil { return Meta{}, err } diff --git a/tsdb/chunks/chunks_test.go b/tsdb/chunks/chunks_test.go index f40f996fde..827b11070e 100644 --- a/tsdb/chunks/chunks_test.go +++ b/tsdb/chunks/chunks_test.go @@ -19,6 +19,7 @@ import ( "github.com/stretchr/testify/require" + "github.com/prometheus/prometheus/tsdb/chunkenc" "github.com/prometheus/prometheus/tsdb/tsdbutil" ) @@ -58,3 +59,35 @@ func TestWriterWithDefaultSegmentSize(t *testing.T) { require.NoError(t, err) require.Len(t, d, 1, "expected only one segment to be created to hold both chunks") } + +func TestChunkFromSamplesWithST(t *testing.T) { + // Create samples with explicit ST (source timestamp) values. + samples := []Sample{ + sample{t: 10, f: 11, st: 5}, + sample{t: 20, f: 12, st: 15}, + sample{t: 30, f: 13, st: 25}, + } + + chk, err := ChunkFromSamples(samples) + require.NoError(t, err) + require.NotNil(t, chk.Chunk) + + // Verify MinTime and MaxTime. + require.Equal(t, int64(10), chk.MinTime) + require.Equal(t, int64(30), chk.MaxTime) + + // Iterate over the chunk and verify ST values are preserved. 
+ it := chk.Chunk.Iterator(nil) + idx := 0 + for vt := it.Next(); vt != chunkenc.ValNone; vt = it.Next() { + require.Equal(t, chunkenc.ValFloat, vt) + ts, v := it.At() + st := it.AtST() + require.Equal(t, samples[idx].ST(), st, "ST mismatch at index %d", idx) + require.Equal(t, samples[idx].T(), ts, "T mismatch at index %d", idx) + require.Equal(t, samples[idx].F(), v, "F mismatch at index %d", idx) + idx++ + } + require.NoError(t, it.Err()) + require.Equal(t, len(samples), idx, "expected all samples to be iterated") +} diff --git a/tsdb/db.go b/tsdb/db.go index 1d73628bfd..81c7a6c460 100644 --- a/tsdb/db.go +++ b/tsdb/db.go @@ -1044,6 +1044,7 @@ func open(dir string, l *slog.Logger, r prometheus.Registerer, opts *Options, rn headOpts.OutOfOrderCapMax.Store(opts.OutOfOrderCapMax) headOpts.EnableSharding = opts.EnableSharding headOpts.EnableSTAsZeroSample = opts.EnableSTAsZeroSample + headOpts.EnableSTStorage.Store(opts.EnableSTStorage) headOpts.EnableMetadataWALRecords = opts.EnableMetadataWALRecords if opts.WALReplayConcurrency > 0 { headOpts.WALReplayConcurrency = opts.WALReplayConcurrency diff --git a/tsdb/db_append_v2_test.go b/tsdb/db_append_v2_test.go index 08e97d1113..acd72a986f 100644 --- a/tsdb/db_append_v2_test.go +++ b/tsdb/db_append_v2_test.go @@ -7512,6 +7512,64 @@ func TestAbortBlockCompactions_AppendV2(t *testing.T) { require.Equal(t, 4, compactions, "expected 4 compactions to be completed") } +// TestCompactHeadWithSTStorage_AppendV2 ensures that when EnableSTStorage is true, +// compacted blocks contain chunks with EncXOROptST encoding for float samples. 
+func TestCompactHeadWithSTStorage_AppendV2(t *testing.T) { + t.Parallel() + + opts := &Options{ + RetentionDuration: int64(time.Hour * 24 * 15 / time.Millisecond), + NoLockfile: true, + MinBlockDuration: int64(time.Hour * 2 / time.Millisecond), + MaxBlockDuration: int64(time.Hour * 2 / time.Millisecond), + WALCompression: compression.Snappy, + EnableSTStorage: true, + } + db := newTestDB(t, withOpts(opts)) + ctx := context.Background() + app := db.AppenderV2(ctx) + + mint := 100 + maxt := 200 + for i := mint; i < maxt; i++ { + _, err := app.Append(0, labels.FromStrings("a", "b"), 50, int64(i), float64(i), nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + + require.NoError(t, db.CompactHead(NewRangeHead(db.Head(), int64(mint), int64(maxt)-1))) + require.Len(t, db.Blocks(), 1) + b := db.Blocks()[0] + + chunkr, err := b.Chunks() + require.NoError(t, err) + defer chunkr.Close() + + indexr, err := b.Index() + require.NoError(t, err) + defer indexr.Close() + + p, err := indexr.Postings(ctx, "a", "b") + require.NoError(t, err) + + chunkCount := 0 + for p.Next() { + var builder labels.ScratchBuilder + var chks []chunks.Meta + require.NoError(t, indexr.Series(p.At(), &builder, &chks)) + + for _, chk := range chks { + c, _, err := chunkr.ChunkOrIterable(chk) + require.NoError(t, err) + require.Equal(t, chunkenc.EncXOROptST, c.Encoding(), + "unexpected chunk encoding, got %s", c.Encoding()) + chunkCount++ + } + } + require.NoError(t, p.Err()) + require.Positive(t, chunkCount, "expected at least one chunk") +} + func TestNewCompactorFunc_AppendV2(t *testing.T) { opts := DefaultOptions() block1 := ulid.MustNew(1, nil) @@ -7543,3 +7601,110 @@ func TestNewCompactorFunc_AppendV2(t *testing.T) { require.Len(t, ulids, 1) require.Equal(t, block2, ulids[0]) } + +// TestDBAppenderV2_STStorage_OutOfOrder verifies that ST storage works correctly +// when samples are appended out of order and can be queried using ChunkQuerier. 
+func TestDBAppenderV2_STStorage_OutOfOrder(t *testing.T) { + testHistogram := tsdbutil.GenerateTestHistogram(1) + testHistogram.CounterResetHint = histogram.NotCounterReset + + testCases := []struct { + name string + appendSamples []chunks.Sample + expectedSamples []chunks.Sample + }{ + { + name: "Float samples out of order", + appendSamples: []chunks.Sample{ + newSample(20, 200, 2.0, nil, nil), // Append second sample first. + newSample(10, 100, 1.0, nil, nil), // Append first sample second (OOO). + newSample(30, 300, 3.0, nil, nil), // Append third sample last. + newSample(25, 250, 2.5, nil, nil), // Append middle sample (OOO). + }, + expectedSamples: []chunks.Sample{ + newSample(10, 100, 1.0, nil, nil), + newSample(20, 200, 2.0, nil, nil), + newSample(25, 250, 2.5, nil, nil), + newSample(30, 300, 3.0, nil, nil), + }, + }, + { + name: "Histogram samples out of order", + appendSamples: []chunks.Sample{ + newSample(30, 300, 0, testHistogram, nil), // Append third sample first. + newSample(10, 100, 0, testHistogram, nil), // Append first sample second (OOO). + newSample(20, 200, 0, testHistogram, nil), // Append second sample last (OOO). + }, + // Histograms don't support ST storage yet, should return 0 for ST. + expectedSamples: []chunks.Sample{ + newSample(0, 100, 0, testHistogram, nil), + newSample(0, 200, 0, testHistogram, nil), + newSample(0, 300, 0, testHistogram, nil), + }, + }, + { + name: "Mixed float samples with same ST", + appendSamples: []chunks.Sample{ + newSample(10, 200, 2.0, nil, nil), + newSample(10, 100, 1.0, nil, nil), // OOO with same ST. 
+ newSample(10, 300, 3.0, nil, nil), + }, + expectedSamples: []chunks.Sample{ + newSample(10, 100, 1.0, nil, nil), + newSample(10, 200, 2.0, nil, nil), + newSample(10, 300, 3.0, nil, nil), + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + opts := DefaultOptions() + opts.OutOfOrderTimeWindow = 300 * time.Minute.Milliseconds() + opts.EnableSTStorage = true + db := newTestDB(t, withOpts(opts)) + db.DisableCompactions() + + lbls := labels.FromStrings("foo", "bar") + + for _, s := range tc.appendSamples { + app := db.AppenderV2(context.Background()) + _, err := app.Append(0, lbls, s.ST(), s.T(), s.F(), s.H(), s.FH(), storage.AOptions{}) + require.NoError(t, err, "Appending OOO sample with ST should succeed") + require.NoError(t, app.Commit(), "Committing OOO sample with ST should succeed") + } + + querier, err := db.ChunkQuerier(math.MinInt64, math.MaxInt64) + require.NoError(t, err) + defer querier.Close() + + ss := querier.Select(context.Background(), false, nil, labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")) + require.True(t, ss.Next(), "Should have series") + series := ss.At() + require.NoError(t, ss.Err()) + require.False(t, ss.Next(), "Should have only one series") + + chunkIt := series.Iterator(nil) + var actualSamples []chunks.Sample + + for chunkIt.Next() { + chk := chunkIt.At() + it := chk.Chunk.Iterator(nil) + samples, err := storage.ExpandSamples(it, newSample) + require.NoError(t, err) + actualSamples = append(actualSamples, samples...) + } + require.NoError(t, chunkIt.Err()) + + // Use requireEqualSamplesIgnoreCounterResets to ignore histogram counter reset hints. + requireEqualSamples(t, lbls.String(), tc.expectedSamples, actualSamples, requireEqualSamplesIgnoreCounterResets) + + // Additionally verify ST values match expectations. 
+ require.Len(t, actualSamples, len(tc.expectedSamples)) + for i, expected := range tc.expectedSamples { + actual := actualSamples[i] + require.Equal(t, expected.ST(), actual.ST(), "Sample %d: ST should match", i) + } + }) + } +} diff --git a/tsdb/head.go b/tsdb/head.go index e88a5e0803..917bd666d3 100644 --- a/tsdb/head.go +++ b/tsdb/head.go @@ -161,8 +161,8 @@ type HeadOptions struct { OutOfOrderCapMax atomic.Int64 // EnableSTStorage determines whether databases (WAL/WBL, tsdb, - // agent) should set a Start Time value per sample. Currently not - // user-settable and only set in tests. + // agent) should set a Start Time value per sample. + // Represents 'st-storage' feature flag. EnableSTStorage atomic.Bool ChunkRange int64 diff --git a/tsdb/head_append.go b/tsdb/head_append.go index e673466ccc..adb11ec076 100644 --- a/tsdb/head_append.go +++ b/tsdb/head_append.go @@ -185,6 +185,7 @@ func (h *Head) appender() *headAppender { typesInBatch: h.getTypeMap(), appendID: appendID, cleanupAppendIDsBelow: cleanupAppendIDsBelow, + storeST: h.opts.EnableSTStorage.Load(), }, } } @@ -412,6 +413,7 @@ type headAppenderBase struct { appendID, cleanupAppendIDsBelow uint64 closed bool + storeST bool } type headAppender struct { headAppenderBase @@ -1059,7 +1061,7 @@ func (a *headAppenderBase) log() error { defer func() { a.head.putBytesBuffer(buf) }() var rec []byte - enc := record.Encoder{EnableSTStorage: a.head.opts.EnableSTStorage.Load()} + enc := record.Encoder{EnableSTStorage: a.storeST} if len(a.seriesRefs) > 0 { rec = enc.Series(a.seriesRefs, buf) @@ -1168,6 +1170,7 @@ type appenderCommitContext struct { histoOOBRejected int inOrderMint int64 inOrderMaxt int64 + appendChunkOpts chunkOpts oooMinT int64 oooMaxT int64 wblSamples []record.RefSample @@ -1177,7 +1180,6 @@ type appenderCommitContext struct { oooMmapMarkersCount int oooRecords [][]byte oooCapMax int64 - appendChunkOpts chunkOpts oooEnc record.Encoder } @@ -1387,7 +1389,7 @@ func (a *headAppenderBase) 
commitFloats(b *appendBatch, acc *appenderCommitConte // Sample is OOO and OOO handling is enabled // and the delta is within the OOO tolerance. var mmapRefs []chunks.ChunkDiskMapperRef - ok, chunkCreated, mmapRefs = series.insert(s.T, s.V, nil, nil, a.head.chunkDiskMapper, acc.oooCapMax, a.head.logger) + ok, chunkCreated, mmapRefs = series.insert(s.ST, s.T, s.V, nil, nil, acc.appendChunkOpts, acc.oooCapMax, a.head.logger) if chunkCreated { r, ok := acc.oooMmapMarkers[series.ref] if !ok || r != nil { @@ -1431,7 +1433,7 @@ func (a *headAppenderBase) commitFloats(b *appendBatch, acc *appenderCommitConte default: newlyStale := !value.IsStaleNaN(series.lastValue) && value.IsStaleNaN(s.V) staleToNonStale := value.IsStaleNaN(series.lastValue) && !value.IsStaleNaN(s.V) - ok, chunkCreated = series.append(s.T, s.V, a.appendID, acc.appendChunkOpts) + ok, chunkCreated = series.append(s.ST, s.T, s.V, a.appendID, acc.appendChunkOpts) if ok { if s.T < acc.inOrderMint { acc.inOrderMint = s.T @@ -1492,7 +1494,8 @@ func (a *headAppenderBase) commitHistograms(b *appendBatch, acc *appenderCommitC // Sample is OOO and OOO handling is enabled // and the delta is within the OOO tolerance. var mmapRefs []chunks.ChunkDiskMapperRef - ok, chunkCreated, mmapRefs = series.insert(s.T, 0, s.H, nil, a.head.chunkDiskMapper, acc.oooCapMax, a.head.logger) + // TODO(krajorama,ywwg): Pass ST when available in WAL. 
+ ok, chunkCreated, mmapRefs = series.insert(0, s.T, 0, s.H, nil, acc.appendChunkOpts, acc.oooCapMax, a.head.logger) if chunkCreated { r, ok := acc.oooMmapMarkers[series.ref] if !ok || r != nil { @@ -1540,7 +1543,8 @@ func (a *headAppenderBase) commitHistograms(b *appendBatch, acc *appenderCommitC newlyStale = newlyStale && !value.IsStaleNaN(series.lastHistogramValue.Sum) staleToNonStale = value.IsStaleNaN(series.lastHistogramValue.Sum) && !value.IsStaleNaN(s.H.Sum) } - ok, chunkCreated = series.appendHistogram(s.T, s.H, a.appendID, acc.appendChunkOpts) + // TODO(krajorama,ywwg): pass ST when available in WAL. + ok, chunkCreated = series.appendHistogram(0, s.T, s.H, a.appendID, acc.appendChunkOpts) if ok { if s.T < acc.inOrderMint { acc.inOrderMint = s.T @@ -1601,7 +1605,8 @@ func (a *headAppenderBase) commitFloatHistograms(b *appendBatch, acc *appenderCo // Sample is OOO and OOO handling is enabled // and the delta is within the OOO tolerance. var mmapRefs []chunks.ChunkDiskMapperRef - ok, chunkCreated, mmapRefs = series.insert(s.T, 0, nil, s.FH, a.head.chunkDiskMapper, acc.oooCapMax, a.head.logger) + // TODO(krajorama,ywwg): Pass ST when available in WAL. + ok, chunkCreated, mmapRefs = series.insert(0, s.T, 0, nil, s.FH, acc.appendChunkOpts, acc.oooCapMax, a.head.logger) if chunkCreated { r, ok := acc.oooMmapMarkers[series.ref] if !ok || r != nil { @@ -1649,7 +1654,8 @@ func (a *headAppenderBase) commitFloatHistograms(b *appendBatch, acc *appenderCo newlyStale = newlyStale && !value.IsStaleNaN(series.lastFloatHistogramValue.Sum) staleToNonStale = value.IsStaleNaN(series.lastFloatHistogramValue.Sum) && !value.IsStaleNaN(s.FH.Sum) } - ok, chunkCreated = series.appendFloatHistogram(s.T, s.FH, a.appendID, acc.appendChunkOpts) + // TODO(krajorama,ywwg): pass ST when available in WAL. 
+ ok, chunkCreated = series.appendFloatHistogram(0, s.T, s.FH, a.appendID, acc.appendChunkOpts) if ok { if s.T < acc.inOrderMint { acc.inOrderMint = s.T @@ -1741,9 +1747,10 @@ func (a *headAppenderBase) Commit() (err error) { chunkDiskMapper: h.chunkDiskMapper, chunkRange: h.chunkRange.Load(), samplesPerChunk: h.opts.SamplesPerChunk, + storeST: a.storeST, }, oooEnc: record.Encoder{ - EnableSTStorage: h.opts.EnableSTStorage.Load(), + EnableSTStorage: a.storeST, }, } @@ -1799,18 +1806,18 @@ func (a *headAppenderBase) Commit() (err error) { } // insert is like append, except it inserts. Used for OOO samples. -func (s *memSeries) insert(t int64, v float64, h *histogram.Histogram, fh *histogram.FloatHistogram, chunkDiskMapper *chunks.ChunkDiskMapper, oooCapMax int64, logger *slog.Logger) (inserted, chunkCreated bool, mmapRefs []chunks.ChunkDiskMapperRef) { +func (s *memSeries) insert(st, t int64, v float64, h *histogram.Histogram, fh *histogram.FloatHistogram, o chunkOpts, oooCapMax int64, logger *slog.Logger) (inserted, chunkCreated bool, mmapRefs []chunks.ChunkDiskMapperRef) { if s.ooo == nil { s.ooo = &memSeriesOOOFields{} } c := s.ooo.oooHeadChunk if c == nil || c.chunk.NumSamples() == int(oooCapMax) { // Note: If no new samples come in then we rely on compaction to clean up stale in-memory OOO chunks. - c, mmapRefs = s.cutNewOOOHeadChunk(t, chunkDiskMapper, logger) + c, mmapRefs = s.cutNewOOOHeadChunk(t, o, logger) chunkCreated = true } - ok := c.chunk.Insert(t, v, h, fh) + ok := c.chunk.Insert(st, t, v, h, fh) if ok { if chunkCreated || t < c.minTime { c.minTime = t @@ -1827,19 +1834,19 @@ type chunkOpts struct { chunkDiskMapper *chunks.ChunkDiskMapper chunkRange int64 samplesPerChunk int + storeST bool } // append adds the sample (t, v) to the series. The caller also has to provide // the appendID for isolation. (The appendID can be zero, which results in no // isolation for this append.) // Series lock must be held when calling. 
-func (s *memSeries) append(t int64, v float64, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) { - c, sampleInOrder, chunkCreated := s.appendPreprocessor(t, chunkenc.EncXOR, o) +func (s *memSeries) append(st, t int64, v float64, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) { + c, sampleInOrder, chunkCreated := s.appendPreprocessor(t, chunkenc.ValFloat.ChunkEncoding(o.storeST), o) if !sampleInOrder { return sampleInOrder, chunkCreated } - // TODO(krajorama): pass ST. - s.app.Append(0, t, v) + s.app.Append(st, t, v) c.maxTime = t @@ -1859,14 +1866,14 @@ func (s *memSeries) append(t int64, v float64, appendID uint64, o chunkOpts) (sa // In case of recoding the existing chunk, a new chunk is allocated and the old chunk is dropped. // To keep the meaning of prometheus_tsdb_head_chunks and prometheus_tsdb_head_chunks_created_total // consistent, we return chunkCreated=false in this case. -func (s *memSeries) appendHistogram(t int64, h *histogram.Histogram, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) { +func (s *memSeries) appendHistogram(st, t int64, h *histogram.Histogram, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) { // Head controls the execution of recoding, so that we own the proper // chunk reference afterwards and mmap used up chunks. // Ignoring ok is ok, since we don't want to compare to the wrong previous appender anyway. prevApp, _ := s.app.(*chunkenc.HistogramAppender) - c, sampleInOrder, chunkCreated := s.histogramsAppendPreprocessor(t, chunkenc.EncHistogram, o) + c, sampleInOrder, chunkCreated := s.histogramsAppendPreprocessor(t, chunkenc.ValHistogram.ChunkEncoding(o.storeST), o) if !sampleInOrder { return sampleInOrder, chunkCreated } @@ -1881,8 +1888,7 @@ func (s *memSeries) appendHistogram(t int64, h *histogram.Histogram, appendID ui prevApp = nil } - // TODO(krajorama): pass ST. 
- newChunk, recoded, s.app, _ = s.app.AppendHistogram(prevApp, 0, t, h, false) // false=request a new chunk if needed + newChunk, recoded, s.app, _ = s.app.AppendHistogram(prevApp, st, t, h, false) // false=request a new chunk if needed s.lastHistogramValue = h s.lastFloatHistogramValue = nil @@ -1917,14 +1923,14 @@ func (s *memSeries) appendHistogram(t int64, h *histogram.Histogram, appendID ui // In case of recoding the existing chunk, a new chunk is allocated and the old chunk is dropped. // To keep the meaning of prometheus_tsdb_head_chunks and prometheus_tsdb_head_chunks_created_total // consistent, we return chunkCreated=false in this case. -func (s *memSeries) appendFloatHistogram(t int64, fh *histogram.FloatHistogram, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) { +func (s *memSeries) appendFloatHistogram(st, t int64, fh *histogram.FloatHistogram, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) { // Head controls the execution of recoding, so that we own the proper // chunk reference afterwards and mmap used up chunks. // Ignoring ok is ok, since we don't want to compare to the wrong previous appender anyway. prevApp, _ := s.app.(*chunkenc.FloatHistogramAppender) - c, sampleInOrder, chunkCreated := s.histogramsAppendPreprocessor(t, chunkenc.EncFloatHistogram, o) + c, sampleInOrder, chunkCreated := s.histogramsAppendPreprocessor(t, chunkenc.ValFloatHistogram.ChunkEncoding(o.storeST), o) if !sampleInOrder { return sampleInOrder, chunkCreated } @@ -1939,8 +1945,7 @@ func (s *memSeries) appendFloatHistogram(t int64, fh *histogram.FloatHistogram, prevApp = nil } - // TODO(krajorama): pass ST. - newChunk, recoded, s.app, _ = s.app.AppendFloatHistogram(prevApp, 0, t, fh, false) // False means request a new chunk if needed. + newChunk, recoded, s.app, _ = s.app.AppendFloatHistogram(prevApp, st, t, fh, false) // False means request a new chunk if needed. 
s.lastHistogramValue = nil s.lastFloatHistogramValue = fh @@ -2164,8 +2169,8 @@ func (s *memSeries) cutNewHeadChunk(mint int64, e chunkenc.Encoding, chunkRange // cutNewOOOHeadChunk cuts a new OOO chunk and m-maps the old chunk. // The caller must ensure that s is locked and s.ooo is not nil. -func (s *memSeries) cutNewOOOHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDiskMapper, logger *slog.Logger) (*oooHeadChunk, []chunks.ChunkDiskMapperRef) { - ref := s.mmapCurrentOOOHeadChunk(chunkDiskMapper, logger) +func (s *memSeries) cutNewOOOHeadChunk(mint int64, o chunkOpts, logger *slog.Logger) (*oooHeadChunk, []chunks.ChunkDiskMapperRef) { + ref := s.mmapCurrentOOOHeadChunk(o, logger) s.ooo.oooHeadChunk = &oooHeadChunk{ chunk: NewOOOChunk(), @@ -2177,12 +2182,12 @@ func (s *memSeries) cutNewOOOHeadChunk(mint int64, chunkDiskMapper *chunks.Chunk } // s must be locked when calling. -func (s *memSeries) mmapCurrentOOOHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper, logger *slog.Logger) []chunks.ChunkDiskMapperRef { +func (s *memSeries) mmapCurrentOOOHeadChunk(o chunkOpts, logger *slog.Logger) []chunks.ChunkDiskMapperRef { if s.ooo == nil || s.ooo.oooHeadChunk == nil { // OOO is not enabled or there is no head chunk, so nothing to m-map here. 
return nil } - chks, err := s.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64) + chks, err := s.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, o.storeST) if err != nil { handleChunkWriteError(err) return nil @@ -2193,7 +2198,7 @@ func (s *memSeries) mmapCurrentOOOHeadChunk(chunkDiskMapper *chunks.ChunkDiskMap logger.Error("Too many OOO chunks, dropping data", "series", s.lset.String()) break } - chunkRef := chunkDiskMapper.WriteChunk(s.ref, memchunk.minTime, memchunk.maxTime, memchunk.chunk, true, handleChunkWriteError) + chunkRef := o.chunkDiskMapper.WriteChunk(s.ref, memchunk.minTime, memchunk.maxTime, memchunk.chunk, true, handleChunkWriteError) chunkRefs = append(chunkRefs, chunkRef) s.ooo.oooMmappedChunks = append(s.ooo.oooMmappedChunks, &mmappedChunk{ ref: chunkRef, diff --git a/tsdb/head_append_v2.go b/tsdb/head_append_v2.go index 87b62df536..40f5b0b102 100644 --- a/tsdb/head_append_v2.go +++ b/tsdb/head_append_v2.go @@ -95,6 +95,7 @@ func (h *Head) appenderV2() *headAppenderV2 { typesInBatch: h.getTypeMap(), appendID: appendID, cleanupAppendIDsBelow: cleanupAppendIDsBelow, + storeST: h.opts.EnableSTStorage.Load(), }, } } @@ -140,7 +141,6 @@ func (a *headAppenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t i } } - // TODO(bwplotka): Handle ST natively (as per PROM-60). if a.head.opts.EnableSTAsZeroSample && st != 0 { a.bestEffortAppendSTZeroSample(s, ls, st, t, h, fh) } @@ -177,7 +177,7 @@ func (a *headAppenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t i // we do not need to check for the difference between "unknown // series" and "known series with stNone". } - appErr = a.appendFloat(s, t, v, opts.RejectOutOfOrder) + appErr = a.appendFloat(s, st, t, v, opts.RejectOutOfOrder) } // Handle append error, if any. 
if appErr != nil { @@ -218,7 +218,7 @@ func (a *headAppenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t i return storage.SeriesRef(s.ref), partialErr } -func (a *headAppenderV2) appendFloat(s *memSeries, t int64, v float64, fastRejectOOO bool) error { +func (a *headAppenderV2) appendFloat(s *memSeries, st, t int64, v float64, fastRejectOOO bool) error { s.Lock() // TODO(codesome): If we definitely know at this point that the sample is ooo, then optimise // to skip that sample from the WAL and write only in the WBL. @@ -239,7 +239,7 @@ func (a *headAppenderV2) appendFloat(s *memSeries, t int64, v float64, fastRejec } b := a.getCurrentBatch(stFloat, s.ref) - b.floats = append(b.floats, record.RefSample{Ref: s.ref, T: t, V: v}) + b.floats = append(b.floats, record.RefSample{Ref: s.ref, ST: st, T: t, V: v}) b.floatSeries = append(b.floatSeries, s) return nil } @@ -366,7 +366,7 @@ func (a *headAppenderV2) bestEffortAppendSTZeroSample(s *memSeries, ls labels.La } err = a.appendHistogram(s, st, zeroHistogram, true) default: - err = a.appendFloat(s, st, 0, true) + err = a.appendFloat(s, 0, st, 0, true) } if err != nil { diff --git a/tsdb/head_append_v2_test.go b/tsdb/head_append_v2_test.go index 539ac22fd7..ccc75e18ed 100644 --- a/tsdb/head_append_v2_test.go +++ b/tsdb/head_append_v2_test.go @@ -2925,13 +2925,15 @@ func TestChunkSnapshotTakenAfterIncompleteSnapshot_AppenderV2(t *testing.T) { // TestWBLReplay checks the replay at a low level. 
func TestWBLReplay_AppenderV2(t *testing.T) { for name, scenario := range sampleTypeScenarios { - t.Run(name, func(t *testing.T) { - testWBLReplayAppenderV2(t, scenario) - }) + for _, enableSTstorage := range []bool{false, true} { + t.Run(fmt.Sprintf("%s/st-storage=%v", name, enableSTstorage), func(t *testing.T) { + testWBLReplayAppenderV2(t, scenario, enableSTstorage) + }) + } } } -func testWBLReplayAppenderV2(t *testing.T, scenario sampleTypeScenario) { +func testWBLReplayAppenderV2(t *testing.T, scenario sampleTypeScenario, enableSTstorage bool) { dir := t.TempDir() wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compression.Snappy) require.NoError(t, err) @@ -2942,6 +2944,7 @@ func testWBLReplayAppenderV2(t *testing.T, scenario sampleTypeScenario) { opts.ChunkRange = 1000 opts.ChunkDirRoot = dir opts.OutOfOrderTimeWindow.Store(30 * time.Minute.Milliseconds()) + opts.EnableSTStorage.Store(enableSTstorage) h, err := NewHead(nil, nil, wal, oooWlog, opts, nil) require.NoError(t, err) @@ -2993,7 +2996,7 @@ func testWBLReplayAppenderV2(t *testing.T, scenario sampleTypeScenario) { require.False(t, ok) require.NotNil(t, ms) - chks, err := ms.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64) + chks, err := ms.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, h.opts.EnableSTStorage.Load()) require.NoError(t, err) require.Len(t, chks, 1) @@ -4754,3 +4757,134 @@ func TestHeadAppenderV2_Append_HistogramStalenessConversionMetrics(t *testing.T) }) } } + +// TestHeadAppender_STStorage verifies that when EnableSTStorage is true, +// start timestamps are properly stored in chunks and returned by queries. +// This test uses AppenderV2 which has native ST support. 
+func TestHeadAppenderV2_STStorage(t *testing.T) { + testHistogram := tsdbutil.GenerateTestHistogram(1) + testHistogram.CounterResetHint = histogram.NotCounterReset + + type sampleData struct { + st int64 + ts int64 + fSample float64 + h *histogram.Histogram + } + + testCases := []struct { + name string + samples []sampleData + expectedSTs []int64 + isHistogram bool + }{ + { + name: "Float samples with ST", + samples: []sampleData{ + {st: 10, ts: 100, fSample: 1.0}, + {st: 20, ts: 200, fSample: 2.0}, + {st: 30, ts: 300, fSample: 3.0}, + }, + expectedSTs: []int64{10, 20, 30}, + isHistogram: false, + }, + { + name: "Float samples with varying ST", + samples: []sampleData{ + {st: 5, ts: 100, fSample: 1.0}, + {st: 5, ts: 200, fSample: 2.0}, + {st: 150, ts: 300, fSample: 3.0}, + }, + expectedSTs: []int64{5, 5, 150}, + isHistogram: false, + }, + { + name: "Histogram samples", + samples: []sampleData{ + {st: 10, ts: 100, h: testHistogram}, + {st: 20, ts: 200, h: testHistogram}, + {st: 30, ts: 300, h: testHistogram}, + }, + // Histograms don't support ST storage yet, should return 0. + expectedSTs: []int64{0, 0, 0}, + isHistogram: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) + opts.EnableSTStorage.Store(true) + h, _ := newTestHeadWithOptions(t, compression.None, opts) + + lbls := labels.FromStrings("foo", "bar") + + a := h.AppenderV2(context.Background()) + for _, s := range tc.samples { + _, err := a.Append(0, lbls, s.st, s.ts, s.fSample, s.h, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, a.Commit()) + + // Verify ST values are stored in chunks. 
+ ctx := context.Background() + idxReader, err := h.Index() + require.NoError(t, err) + defer idxReader.Close() + + chkReader, err := h.Chunks() + require.NoError(t, err) + defer chkReader.Close() + + p, err := idxReader.Postings(ctx, "foo", "bar") + require.NoError(t, err) + + var lblBuilder labels.ScratchBuilder + require.True(t, p.Next()) + sRef := p.At() + + var chkMetas []chunks.Meta + require.NoError(t, idxReader.Series(sRef, &lblBuilder, &chkMetas)) + + var actualSTs []int64 + for _, meta := range chkMetas { + chk, iterable, err := chkReader.ChunkOrIterable(meta) + require.NoError(t, err) + require.Nil(t, iterable) + + it := chk.Iterator(nil) + for it.Next() != chunkenc.ValNone { + st := it.AtST() + actualSTs = append(actualSTs, st) + } + require.NoError(t, it.Err()) + } + + if tc.isHistogram { + require.Equal(t, tc.expectedSTs, actualSTs, "Histogram samples should return 0 for ST") + } else { + require.Equal(t, tc.expectedSTs, actualSTs, "Float samples should have ST stored") + } + + // Also verify via querier. 
+ q, err := NewBlockQuerier(h, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + defer q.Close() + + ss := q.Select(ctx, false, nil, labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")) + require.True(t, ss.Next()) + series := ss.At() + require.NoError(t, ss.Err()) + + seriesIt := series.Iterator(nil) + var queriedSTs []int64 + for seriesIt.Next() != chunkenc.ValNone { + st := seriesIt.AtST() + queriedSTs = append(queriedSTs, st) + } + require.NoError(t, seriesIt.Err()) + + require.Equal(t, tc.expectedSTs, queriedSTs, "Querier should return same ST values as chunk iterator") + }) + } +} diff --git a/tsdb/head_read_test.go b/tsdb/head_read_test.go index cf55973a01..0849c257b5 100644 --- a/tsdb/head_read_test.go +++ b/tsdb/head_read_test.go @@ -33,7 +33,7 @@ func TestMemSeries_chunk(t *testing.T) { appendSamples := func(t *testing.T, s *memSeries, start, end int64, cdm *chunks.ChunkDiskMapper) { for i := start; i < end; i += chunkStep { - ok, _ := s.append(i, float64(i), 0, chunkOpts{ + ok, _ := s.append(0, i, float64(i), 0, chunkOpts{ chunkDiskMapper: cdm, chunkRange: chunkRange, samplesPerChunk: DefaultSamplesPerChunk, diff --git a/tsdb/head_test.go b/tsdb/head_test.go index 91cd742330..f03ec05572 100644 --- a/tsdb/head_test.go +++ b/tsdb/head_test.go @@ -349,7 +349,7 @@ func BenchmarkLoadWLs(b *testing.B) { for k := 0; k < c.batches*c.seriesPerBatch; k++ { // Create one mmapped chunk per series, with one sample at the given time. s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, 0, defaultIsolationDisabled, false) - s.append(c.mmappedChunkT, 42, 0, cOpts) + s.append(0, c.mmappedChunkT, 42, 0, cOpts) // There's only one head chunk because only a single sample is appended. mmapChunks() // ignores the latest chunk, so we need to cut a new head chunk to guarantee the chunk with // the sample at c.mmappedChunkT is mmapped. 
@@ -1492,7 +1492,7 @@ func TestMemSeries_truncateChunks(t *testing.T) { s := newMemSeries(labels.FromStrings("a", "b"), 1, 0, defaultIsolationDisabled, false) for i := 0; i < 4000; i += 5 { - ok, _ := s.append(int64(i), float64(i), 0, cOpts) + ok, _ := s.append(0, int64(i), float64(i), 0, cOpts) require.True(t, ok, "sample append failed") } s.mmapChunks(chunkDiskMapper) @@ -1642,7 +1642,7 @@ func TestMemSeries_truncateChunks_scenarios(t *testing.T) { if tc.mmappedChunks > 0 { headStart = (tc.mmappedChunks + 1) * chunkRange for i := 0; i < (tc.mmappedChunks+1)*chunkRange; i += chunkStep { - ok, _ := series.append(int64(i), float64(i), 0, cOpts) + ok, _ := series.append(0, int64(i), float64(i), 0, cOpts) require.True(t, ok, "sample append failed") } series.mmapChunks(chunkDiskMapper) @@ -1652,7 +1652,7 @@ func TestMemSeries_truncateChunks_scenarios(t *testing.T) { series.headChunks = nil } else { for i := headStart; i < chunkRange*(tc.mmappedChunks+tc.headChunks); i += chunkStep { - ok, _ := series.append(int64(i), float64(i), 0, cOpts) + ok, _ := series.append(0, int64(i), float64(i), 0, cOpts) require.True(t, ok, "sample append failed: %d", i) } } @@ -2183,7 +2183,47 @@ func TestComputeChunkEndTime(t *testing.T) { } } +// TestMemSeries_append tests float appending with various storeST/st combinations. 
func TestMemSeries_append(t *testing.T) { + scenarios := []struct { + name string + storeST bool + stFunc func(ts int64) int64 // Function to compute st from ts + }{ + { + name: "storeST=false st=0", + storeST: false, + stFunc: func(_ int64) int64 { return 0 }, + }, + { + name: "storeST=true st=0", + storeST: true, + stFunc: func(_ int64) int64 { return 0 }, + }, + { + name: "storeST=true st=ts", + storeST: true, + stFunc: func(ts int64) int64 { return ts }, + }, + { + name: "storeST=true st=ts-100", + storeST: true, + stFunc: func(ts int64) int64 { return ts - 100 }, + }, + { + name: "storeST=false st=ts (st ignored)", + storeST: false, + stFunc: func(ts int64) int64 { return ts }, + }, + } + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + testMemSeriesAppend(t, scenario.storeST, scenario.stFunc) + }) + } +} + +func testMemSeriesAppend(t *testing.T, storeST bool, stFunc func(ts int64) int64) { dir := t.TempDir() // This is usually taken from the Head, but passing manually here. chunkDiskMapper, err := chunks.NewChunkDiskMapper(nil, dir, chunkenc.NewPool(), chunks.DefaultWriteBufferSize, chunks.DefaultWriteQueueSize) @@ -2195,6 +2235,7 @@ func TestMemSeries_append(t *testing.T) { chunkDiskMapper: chunkDiskMapper, chunkRange: 500, samplesPerChunk: DefaultSamplesPerChunk, + storeST: storeST, } s := newMemSeries(labels.Labels{}, 1, 0, defaultIsolationDisabled, false) @@ -2202,20 +2243,20 @@ func TestMemSeries_append(t *testing.T) { // Add first two samples at the very end of a chunk range and the next two // on and after it. // New chunk must correctly be cut at 1000. 
- ok, chunkCreated := s.append(998, 1, 0, cOpts) + ok, chunkCreated := s.append(stFunc(998), 998, 1, 0, cOpts) require.True(t, ok, "append failed") require.True(t, chunkCreated, "first sample created chunk") - ok, chunkCreated = s.append(999, 2, 0, cOpts) + ok, chunkCreated = s.append(stFunc(999), 999, 2, 0, cOpts) require.True(t, ok, "append failed") require.False(t, chunkCreated, "second sample should use same chunk") s.mmapChunks(chunkDiskMapper) - ok, chunkCreated = s.append(1000, 3, 0, cOpts) + ok, chunkCreated = s.append(stFunc(1000), 1000, 3, 0, cOpts) require.True(t, ok, "append failed") require.True(t, chunkCreated, "expected new chunk on boundary") - ok, chunkCreated = s.append(1001, 4, 0, cOpts) + ok, chunkCreated = s.append(stFunc(1001), 1001, 4, 0, cOpts) require.True(t, ok, "append failed") require.False(t, chunkCreated, "second sample should use same chunk") @@ -2229,7 +2270,8 @@ func TestMemSeries_append(t *testing.T) { // Fill the range [1000,2000) with many samples. Intermediate chunks should be cut // at approximately 120 samples per chunk. for i := 1; i < 1000; i++ { - ok, _ := s.append(1001+int64(i), float64(i), 0, cOpts) + ts := 1001 + int64(i) + ok, _ := s.append(stFunc(ts), ts, float64(i), 0, cOpts) require.True(t, ok, "append failed") } s.mmapChunks(chunkDiskMapper) @@ -2244,7 +2286,47 @@ func TestMemSeries_append(t *testing.T) { } } +// TestMemSeries_appendHistogram tests histogram appending with various storeST/st combinations. 
func TestMemSeries_appendHistogram(t *testing.T) { + scenarios := []struct { + name string + storeST bool + stFunc func(ts int64) int64 // Function to compute st from ts + }{ + { + name: "storeST=false st=0", + storeST: false, + stFunc: func(_ int64) int64 { return 0 }, + }, + { + name: "storeST=true st=0", + storeST: true, + stFunc: func(_ int64) int64 { return 0 }, + }, + { + name: "storeST=true st=ts", + storeST: true, + stFunc: func(ts int64) int64 { return ts }, + }, + { + name: "storeST=true st=ts-100", + storeST: true, + stFunc: func(ts int64) int64 { return ts - 100 }, + }, + { + name: "storeST=false st=ts (st ignored)", + storeST: false, + stFunc: func(ts int64) int64 { return ts }, + }, + } + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + testMemSeriesAppendHistogram(t, scenario.storeST, scenario.stFunc) + }) + } +} + +func testMemSeriesAppendHistogram(t *testing.T, storeST bool, stFunc func(ts int64) int64) { dir := t.TempDir() // This is usually taken from the Head, but passing manually here. chunkDiskMapper, err := chunks.NewChunkDiskMapper(nil, dir, chunkenc.NewPool(), chunks.DefaultWriteBufferSize, chunks.DefaultWriteQueueSize) @@ -2256,6 +2338,7 @@ func TestMemSeries_appendHistogram(t *testing.T) { chunkDiskMapper: chunkDiskMapper, chunkRange: int64(1000), samplesPerChunk: DefaultSamplesPerChunk, + storeST: storeST, } s := newMemSeries(labels.Labels{}, 1, 0, defaultIsolationDisabled, false) @@ -2270,19 +2353,19 @@ func TestMemSeries_appendHistogram(t *testing.T) { // Add first two samples at the very end of a chunk range and the next two // on and after it. // New chunk must correctly be cut at 1000. 
- ok, chunkCreated := s.appendHistogram(998, histograms[0], 0, cOpts) + ok, chunkCreated := s.appendHistogram(stFunc(998), 998, histograms[0], 0, cOpts) require.True(t, ok, "append failed") require.True(t, chunkCreated, "first sample created chunk") - ok, chunkCreated = s.appendHistogram(999, histograms[1], 0, cOpts) + ok, chunkCreated = s.appendHistogram(stFunc(999), 999, histograms[1], 0, cOpts) require.True(t, ok, "append failed") require.False(t, chunkCreated, "second sample should use same chunk") - ok, chunkCreated = s.appendHistogram(1000, histograms[2], 0, cOpts) + ok, chunkCreated = s.appendHistogram(stFunc(1000), 1000, histograms[2], 0, cOpts) require.True(t, ok, "append failed") require.True(t, chunkCreated, "expected new chunk on boundary") - ok, chunkCreated = s.appendHistogram(1001, histograms[3], 0, cOpts) + ok, chunkCreated = s.appendHistogram(stFunc(1001), 1001, histograms[3], 0, cOpts) require.True(t, ok, "append failed") require.False(t, chunkCreated, "second sample should use same chunk") @@ -2293,7 +2376,7 @@ func TestMemSeries_appendHistogram(t *testing.T) { require.Equal(t, int64(1000), s.headChunks.minTime, "wrong chunk range") require.Equal(t, int64(1001), s.headChunks.maxTime, "wrong chunk range") - ok, chunkCreated = s.appendHistogram(1002, histogramWithOneMoreBucket, 0, cOpts) + ok, chunkCreated = s.appendHistogram(stFunc(1002), 1002, histogramWithOneMoreBucket, 0, cOpts) require.True(t, ok, "append failed") require.False(t, chunkCreated, "third sample should trigger a re-encoded chunk") @@ -2328,7 +2411,7 @@ func TestMemSeries_append_atVariableRate(t *testing.T) { var nextTs int64 var totalAppendedSamples int for i := range samplesPerChunk / 4 { - ok, _ := s.append(nextTs, float64(i), 0, cOpts) + ok, _ := s.append(0, nextTs, float64(i), 0, cOpts) require.Truef(t, ok, "slow sample %d was not appended", i) nextTs += slowRate totalAppendedSamples++ @@ -2337,12 +2420,12 @@ func TestMemSeries_append_atVariableRate(t *testing.T) { // 
Suddenly, the rate increases and we receive a sample every millisecond. for i := range math.MaxUint16 { - ok, _ := s.append(nextTs, float64(i), 0, cOpts) + ok, _ := s.append(0, nextTs, float64(i), 0, cOpts) require.Truef(t, ok, "quick sample %d was not appended", i) nextTs++ totalAppendedSamples++ } - ok, chunkCreated := s.append(DefaultBlockDuration, float64(0), 0, cOpts) + ok, chunkCreated := s.append(0, DefaultBlockDuration, float64(0), 0, cOpts) require.True(t, ok, "new chunk sample was not appended") require.True(t, chunkCreated, "sample at block duration timestamp should create a new chunk") @@ -2371,18 +2454,18 @@ func TestGCChunkAccess(t *testing.T) { s, _, _ := h.getOrCreate(1, labels.FromStrings("a", "1"), false) // Appending 2 samples for the first chunk. - ok, chunkCreated := s.append(0, 0, 0, cOpts) + ok, chunkCreated := s.append(0, 0, 0, 0, cOpts) require.True(t, ok, "series append failed") require.True(t, chunkCreated, "chunks was not created") - ok, chunkCreated = s.append(999, 999, 0, cOpts) + ok, chunkCreated = s.append(0, 999, 999, 0, cOpts) require.True(t, ok, "series append failed") require.False(t, chunkCreated, "chunks was created") // A new chunks should be created here as it's beyond the chunk range. - ok, chunkCreated = s.append(1000, 1000, 0, cOpts) + ok, chunkCreated = s.append(0, 1000, 1000, 0, cOpts) require.True(t, ok, "series append failed") require.True(t, chunkCreated, "chunks was not created") - ok, chunkCreated = s.append(1999, 1999, 0, cOpts) + ok, chunkCreated = s.append(0, 1999, 1999, 0, cOpts) require.True(t, ok, "series append failed") require.False(t, chunkCreated, "chunks was created") @@ -2427,18 +2510,18 @@ func TestGCSeriesAccess(t *testing.T) { s, _, _ := h.getOrCreate(1, labels.FromStrings("a", "1"), false) // Appending 2 samples for the first chunk. 
- ok, chunkCreated := s.append(0, 0, 0, cOpts) + ok, chunkCreated := s.append(0, 0, 0, 0, cOpts) require.True(t, ok, "series append failed") require.True(t, chunkCreated, "chunks was not created") - ok, chunkCreated = s.append(999, 999, 0, cOpts) + ok, chunkCreated = s.append(0, 999, 999, 0, cOpts) require.True(t, ok, "series append failed") require.False(t, chunkCreated, "chunks was created") // A new chunks should be created here as it's beyond the chunk range. - ok, chunkCreated = s.append(1000, 1000, 0, cOpts) + ok, chunkCreated = s.append(0, 1000, 1000, 0, cOpts) require.True(t, ok, "series append failed") require.True(t, chunkCreated, "chunks was not created") - ok, chunkCreated = s.append(1999, 1999, 0, cOpts) + ok, chunkCreated = s.append(0, 1999, 1999, 0, cOpts) require.True(t, ok, "series append failed") require.False(t, chunkCreated, "chunks was created") @@ -2775,10 +2858,10 @@ func TestHeadReadWriterRepair(t *testing.T) { require.True(t, created, "series was not created") for i := range 7 { - ok, chunkCreated := s.append(int64(i*chunkRange), float64(i*chunkRange), 0, cOpts) + ok, chunkCreated := s.append(0, int64(i*chunkRange), float64(i*chunkRange), 0, cOpts) require.True(t, ok, "series append failed") require.True(t, chunkCreated, "chunk was not created") - ok, chunkCreated = s.append(int64(i*chunkRange)+chunkRange-1, float64(i*chunkRange), 0, cOpts) + ok, chunkCreated = s.append(0, int64(i*chunkRange)+chunkRange-1, float64(i*chunkRange), 0, cOpts) require.True(t, ok, "series append failed") require.False(t, chunkCreated, "chunk was created") h.chunkDiskMapper.CutNewFile() @@ -3118,7 +3201,7 @@ func TestIsolationAppendIDZeroIsNoop(t *testing.T) { s, _, _ := h.getOrCreate(1, labels.FromStrings("a", "1"), false) - ok, _ := s.append(0, 0, 0, cOpts) + ok, _ := s.append(0, 0, 0, 0, cOpts) require.True(t, ok, "Series append failed.") require.Equal(t, 0, int(s.txs.txIDCount), "Series should not have an appendID after append with appendID=0.") } @@ -3678,7 
+3761,7 @@ func TestIteratorSeekIntoBuffer(t *testing.T) { s := newMemSeries(labels.Labels{}, 1, 0, defaultIsolationDisabled, false) for i := range 7 { - ok, _ := s.append(int64(i), float64(i), 0, cOpts) + ok, _ := s.append(0, int64(i), float64(i), 0, cOpts) require.True(t, ok, "sample append failed") } @@ -5569,7 +5652,7 @@ func testWBLReplay(t *testing.T, scenario sampleTypeScenario) { require.False(t, ok) require.NotNil(t, ms) - chks, err := ms.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64) + chks, err := ms.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, false) require.NoError(t, err) require.Len(t, chks, 1) @@ -7381,3 +7464,250 @@ func TestHeadAppender_WBLEncoder_EnableSTStorage(t *testing.T) { }) } } + +// TestHeadAppender_STStorage_Disabled verifies that when EnableSTStorage is false, +// start timestamps are NOT stored in chunks (AtST returns 0). +func TestHeadAppender_STStorage_Disabled(t *testing.T) { + type sampleData struct { + st int64 + ts int64 + fSample float64 + } + + samples := []sampleData{ + {st: 10, ts: 100, fSample: 1.0}, + {st: 20, ts: 200, fSample: 2.0}, + {st: 30, ts: 300, fSample: 3.0}, + } + + opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) + opts.EnableSTStorage.Store(false) // Explicitly disable ST storage. 
+ h, _ := newTestHeadWithOptions(t, compression.None, opts) + + lbls := labels.FromStrings("foo", "bar") + + a := h.AppenderV2(context.Background()) + for _, s := range samples { + _, err := a.Append(0, lbls, s.st, s.ts, s.fSample, nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, a.Commit()) + + ctx := context.Background() + idxReader, err := h.Index() + require.NoError(t, err) + defer idxReader.Close() + + chkReader, err := h.Chunks() + require.NoError(t, err) + defer chkReader.Close() + + p, err := idxReader.Postings(ctx, "foo", "bar") + require.NoError(t, err) + + var lblBuilder labels.ScratchBuilder + require.True(t, p.Next()) + sRef := p.At() + + var chkMetas []chunks.Meta + require.NoError(t, idxReader.Series(sRef, &lblBuilder, &chkMetas)) + + for _, meta := range chkMetas { + chk, iterable, err := chkReader.ChunkOrIterable(meta) + require.NoError(t, err) + require.Nil(t, iterable) + + it := chk.Iterator(nil) + for it.Next() != chunkenc.ValNone { + st := it.AtST() + require.Equal(t, int64(0), st, "ST should be 0 when EnableSTStorage is false") + } + require.NoError(t, it.Err()) + } +} + +// TestHeadAppender_STStorage_WALReplay verifies that ST values are preserved +// across a WAL replay when EnableSTStorage is true. The bug was that Commit() +// hardcoded EnableSTStorage=false in the WAL encoder, so ST values were written +// as V1 records (without ST) and lost on replay. 
+func TestHeadAppender_STStorage_WALReplay(t *testing.T) { + opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) + opts.EnableSTStorage.Store(true) + h, w := newTestHeadWithOptions(t, compression.None, opts) + + lbls := labels.FromStrings("foo", "bar") + const st = int64(50) + + a := h.AppenderV2(context.Background()) + for ts := int64(100); ts < 200; ts++ { + _, err := a.Append(0, lbls, st, ts, float64(ts), nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, a.Commit()) + require.NoError(t, h.Close()) + + // Reopen the head, triggering WAL replay. + w, err := wlog.New(nil, nil, w.Dir(), compression.None) + require.NoError(t, err) + opts.ChunkDirRoot = h.opts.ChunkDirRoot + h2, err := NewHead(nil, nil, w, nil, opts, nil) + require.NoError(t, err) + t.Cleanup(func() { _ = h2.Close() }) + require.NoError(t, h2.Init(0)) + + // Query and verify ST values survived the WAL replay. + q, err := NewBlockQuerier(h2, 100, 199) + require.NoError(t, err) + got := query(t, q, labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")) + + var expected []chunks.Sample + for ts := int64(100); ts < 200; ts++ { + expected = append(expected, sample{st, ts, float64(ts), nil, nil}) + } + require.Equal(t, map[string][]chunks.Sample{`{foo="bar"}`: expected}, got) +} + +// TestHeadAppender_STStorage_WBLReplay verifies that ST values are preserved +// across a WBL replay for out-of-order samples when EnableSTStorage is true. +// The bug was that collectOOORecords() hardcoded EnableSTStorage=false in the +// WBL encoder (acc.enc), so OOO sample ST values were written as V1 records +// (without ST) and lost on WBL replay. 
+func TestHeadAppender_STStorage_WBLReplay(t *testing.T) { + dir := t.TempDir() + wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compression.None) + require.NoError(t, err) + wbl, err := wlog.NewSize(nil, nil, filepath.Join(dir, wlog.WblDirName), 32768, compression.None) + require.NoError(t, err) + + opts := DefaultHeadOptions() + opts.ChunkRange = DefaultBlockDuration + opts.ChunkDirRoot = dir + opts.OutOfOrderTimeWindow.Store(60 * time.Minute.Milliseconds()) + opts.EnableSTStorage.Store(true) + + h, err := NewHead(nil, nil, wal, wbl, opts, nil) + require.NoError(t, err) + require.NoError(t, h.Init(0)) + + lbls := labels.FromStrings("foo", "bar") + const st = int64(50) + + // Append an in-order sample to establish the head's maxt. + app := h.AppenderV2(context.Background()) + _, err = app.Append(0, lbls, st, 200, 200, nil, nil, storage.AOptions{}) + require.NoError(t, err) + require.NoError(t, app.Commit()) + + // Append OOO samples with non-zero ST; these go to the WBL. + // Use fewer than DefaultOutOfOrderCapMax (32) samples so they all stay in the + // OOO head chunk (not mmap'd) and are exclusively recovered via WBL replay. + app = h.AppenderV2(context.Background()) + for ts := int64(100); ts < 120; ts++ { + _, err = app.Append(0, lbls, st, ts, float64(ts), nil, nil, storage.AOptions{}) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + + require.NoError(t, h.Close()) + + // Reopen the head, triggering WBL replay. + wal, err = wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compression.None) + require.NoError(t, err) + wbl, err = wlog.NewSize(nil, nil, filepath.Join(dir, wlog.WblDirName), 32768, compression.None) + require.NoError(t, err) + h2, err := NewHead(nil, nil, wal, wbl, opts, nil) + require.NoError(t, err) + t.Cleanup(func() { _ = h2.Close() }) + require.NoError(t, h2.Init(0)) + + // Access the OOO head chunk directly and verify ST values survived WBL replay. 
+ ms, created, err := h2.getOrCreate(lbls.Hash(), lbls, false) + require.NoError(t, err) + require.False(t, created) + require.NotNil(t, ms.ooo) + require.NotNil(t, ms.ooo.oooHeadChunk) + + chks, err := ms.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, true) + require.NoError(t, err) + require.Len(t, chks, 1) + + it := chks[0].chunk.Iterator(nil) + var got []chunks.Sample + for it.Next() != chunkenc.ValNone { + t2, v := it.At() + got = append(got, sample{it.AtST(), t2, v, nil, nil}) + } + require.NoError(t, it.Err()) + + var expected []chunks.Sample + for ts := int64(100); ts < 120; ts++ { + expected = append(expected, sample{st, ts, float64(ts), nil, nil}) + } + require.Equal(t, expected, got) +} + +// TestHeadAppender_STStorage_ChunkEncoding verifies that the correct chunk encoding +// is used based on EnableSTStorage setting. +func TestHeadAppender_STStorage_ChunkEncoding(t *testing.T) { + samples := []struct { + st int64 + ts int64 + fSample float64 + }{ + {st: 10, ts: 100, fSample: 1.0}, + {st: 20, ts: 200, fSample: 2.0}, + } + + for _, enableST := range []bool{false, true} { + t.Run(fmt.Sprintf("EnableSTStorage=%t", enableST), func(t *testing.T) { + opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) + opts.EnableSTStorage.Store(enableST) + h, _ := newTestHeadWithOptions(t, compression.None, opts) + + lbls := labels.FromStrings("foo", "bar") + a := h.Appender(context.Background()) + for _, s := range samples { + _, err := a.AppendSTZeroSample(0, lbls, s.ts, s.st) + require.NoError(t, err) + _, err = a.Append(0, lbls, s.ts, s.fSample) + require.NoError(t, err) + } + require.NoError(t, a.Commit()) + + ctx := context.Background() + idxReader, err := h.Index() + require.NoError(t, err) + defer idxReader.Close() + + chkReader, err := h.Chunks() + require.NoError(t, err) + defer chkReader.Close() + + p, err := idxReader.Postings(ctx, "foo", "bar") + require.NoError(t, err) + + var lblBuilder labels.ScratchBuilder + require.True(t, 
p.Next()) + sRef := p.At() + + var chkMetas []chunks.Meta + require.NoError(t, idxReader.Series(sRef, &lblBuilder, &chkMetas)) + require.NotEmpty(t, chkMetas) + + for _, meta := range chkMetas { + chk, iterable, err := chkReader.ChunkOrIterable(meta) + require.NoError(t, err) + require.Nil(t, iterable) + + encoding := chk.Encoding() + if enableST { + require.Equal(t, chunkenc.EncXOROptST, encoding, + "Expected ST-capable encoding when EnableSTStorage is true") + } else { + require.Equal(t, chunkenc.EncXOR, encoding, + "Expected regular XOR encoding when EnableSTStorage is false") + } + } + }) + } +} diff --git a/tsdb/head_wal.go b/tsdb/head_wal.go index 0a54ae3878..1851e99230 100644 --- a/tsdb/head_wal.go +++ b/tsdb/head_wal.go @@ -636,6 +636,7 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmapp chunkDiskMapper: h.chunkDiskMapper, chunkRange: h.chunkRange.Load(), samplesPerChunk: h.opts.SamplesPerChunk, + storeST: h.opts.EnableSTStorage.Load(), } for in := range wp.input { @@ -666,7 +667,7 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmapp h.numStaleSeries.Dec() } - if _, chunkCreated := ms.append(s.T, s.V, 0, appendChunkOpts); chunkCreated { + if _, chunkCreated := ms.append(s.ST, s.T, s.V, 0, appendChunkOpts); chunkCreated { h.metrics.chunksCreated.Inc() h.metrics.chunks.Inc() _ = ms.mmapChunks(h.chunkDiskMapper) @@ -703,14 +704,16 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmapp newlyStale = newlyStale && !value.IsStaleNaN(ms.lastHistogramValue.Sum) staleToNonStale = value.IsStaleNaN(ms.lastHistogramValue.Sum) && !value.IsStaleNaN(s.h.Sum) } - _, chunkCreated = ms.appendHistogram(s.t, s.h, 0, appendChunkOpts) + // TODO(krajorama,ywwg): Pass ST when available in WBL. 
+ _, chunkCreated = ms.appendHistogram(0, s.t, s.h, 0, appendChunkOpts) } else { newlyStale = value.IsStaleNaN(s.fh.Sum) if ms.lastFloatHistogramValue != nil { newlyStale = newlyStale && !value.IsStaleNaN(ms.lastFloatHistogramValue.Sum) staleToNonStale = value.IsStaleNaN(ms.lastFloatHistogramValue.Sum) && !value.IsStaleNaN(s.fh.Sum) } - _, chunkCreated = ms.appendFloatHistogram(s.t, s.fh, 0, appendChunkOpts) + // TODO(krajorama,ywwg): Pass ST when available in WBL. + _, chunkCreated = ms.appendFloatHistogram(0, s.t, s.fh, 0, appendChunkOpts) } if newlyStale { h.numStaleSeries.Inc() @@ -1077,6 +1080,12 @@ func (wp *wblSubsetProcessor) processWBLSamples(h *Head) (map[chunks.HeadSeriesR var unknownSampleRefs, unknownHistogramRefs uint64 oooCapMax := h.opts.OutOfOrderCapMax.Load() + appendChunkOpts := chunkOpts{ + chunkDiskMapper: h.chunkDiskMapper, + chunkRange: h.chunkRange.Load(), + samplesPerChunk: h.opts.SamplesPerChunk, + storeST: h.opts.EnableSTStorage.Load(), + } // We don't check for minValidTime for ooo samples. mint, maxt := int64(math.MaxInt64), int64(math.MinInt64) for in := range wp.input { @@ -1096,7 +1105,7 @@ func (wp *wblSubsetProcessor) processWBLSamples(h *Head) (map[chunks.HeadSeriesR missingSeries[s.Ref] = struct{}{} continue } - ok, chunkCreated, _ := ms.insert(s.T, s.V, nil, nil, h.chunkDiskMapper, oooCapMax, h.logger) + ok, chunkCreated, _ := ms.insert(s.ST, s.T, s.V, nil, nil, appendChunkOpts, oooCapMax, h.logger) if chunkCreated { h.metrics.chunksCreated.Inc() h.metrics.chunks.Inc() @@ -1124,9 +1133,11 @@ func (wp *wblSubsetProcessor) processWBLSamples(h *Head) (map[chunks.HeadSeriesR var chunkCreated bool var ok bool if s.h != nil { - ok, chunkCreated, _ = ms.insert(s.t, 0, s.h, nil, h.chunkDiskMapper, oooCapMax, h.logger) + // TODO(krajorama,ywwg): Pass ST when available in WBL. 
+ ok, chunkCreated, _ = ms.insert(0, s.t, 0, s.h, nil, appendChunkOpts, oooCapMax, h.logger) } else { - ok, chunkCreated, _ = ms.insert(s.t, 0, nil, s.fh, h.chunkDiskMapper, oooCapMax, h.logger) + // TODO(krajorama,ywwg): Pass ST when available in WBL. + ok, chunkCreated, _ = ms.insert(0, s.t, 0, nil, s.fh, appendChunkOpts, oooCapMax, h.logger) } if chunkCreated { h.metrics.chunksCreated.Inc() diff --git a/tsdb/ooo_head.go b/tsdb/ooo_head.go index f9746c4c61..04f859154f 100644 --- a/tsdb/ooo_head.go +++ b/tsdb/ooo_head.go @@ -34,14 +34,13 @@ func NewOOOChunk() *OOOChunk { // Insert inserts the sample such that order is maintained. // Returns false if insert was not possible due to the same timestamp already existing. -func (o *OOOChunk) Insert(t int64, v float64, h *histogram.Histogram, fh *histogram.FloatHistogram) bool { +func (o *OOOChunk) Insert(st, t int64, v float64, h *histogram.Histogram, fh *histogram.FloatHistogram) bool { // Although out-of-order samples can be out-of-order amongst themselves, we // are opinionated and expect them to be usually in-order meaning we could // try to append at the end first if the new timestamp is higher than the // last known timestamp. if len(o.samples) == 0 || t > o.samples[len(o.samples)-1].t { - // TODO(krajorama): pass ST. - o.samples = append(o.samples, sample{0, t, v, h, fh}) + o.samples = append(o.samples, sample{st, t, v, h, fh}) return true } @@ -50,8 +49,7 @@ func (o *OOOChunk) Insert(t int64, v float64, h *histogram.Histogram, fh *histog if i >= len(o.samples) { // none found. append it at the end - // TODO(krajorama): pass ST. - o.samples = append(o.samples, sample{0, t, v, h, fh}) + o.samples = append(o.samples, sample{st, t, v, h, fh}) return true } @@ -63,8 +61,7 @@ func (o *OOOChunk) Insert(t int64, v float64, h *histogram.Histogram, fh *histog // Expand length by 1 to make room. use a zero sample, we will overwrite it anyway. 
o.samples = append(o.samples, sample{}) copy(o.samples[i+1:], o.samples[i:]) - // TODO(krajorama): pass ST. - o.samples[i] = sample{0, t, v, h, fh} + o.samples[i] = sample{st, t, v, h, fh} return true } @@ -76,7 +73,7 @@ func (o *OOOChunk) NumSamples() int { // ToEncodedChunks returns chunks with the samples in the OOOChunk. // //nolint:revive -func (o *OOOChunk) ToEncodedChunks(mint, maxt int64) (chks []memChunk, err error) { +func (o *OOOChunk) ToEncodedChunks(mint, maxt int64, storeST bool) (chks []memChunk, err error) { if len(o.samples) == 0 { return nil, nil } @@ -96,10 +93,13 @@ func (o *OOOChunk) ToEncodedChunks(mint, maxt int64) (chks []memChunk, err error if s.t > maxt { break } - encoding := chunkenc.EncXOR - if s.h != nil { + encoding := chunkenc.ValFloat.ChunkEncoding(storeST) + switch { + case s.h != nil: + // TODO(krajorama): use ST capable histogram chunk. encoding = chunkenc.EncHistogram - } else if s.fh != nil { + case s.fh != nil: + // TODO(krajorama): use ST capable float histogram chunk. encoding = chunkenc.EncFloatHistogram } @@ -111,15 +111,11 @@ func (o *OOOChunk) ToEncodedChunks(mint, maxt int64) (chks []memChunk, err error chks = append(chks, memChunk{chunk, cmint, cmaxt, nil}) } cmint = s.t - switch encoding { - case chunkenc.EncXOR: - chunk = chunkenc.NewXORChunk() - case chunkenc.EncHistogram: - chunk = chunkenc.NewHistogramChunk() - case chunkenc.EncFloatHistogram: - chunk = chunkenc.NewFloatHistogramChunk() - default: - chunk = chunkenc.NewXORChunk() + chunk, err = chunkenc.NewEmptyChunk(encoding) + if err != nil { + // This should never happen. No point using a default type as + // calling the wrong append function would panic. + return chks, err } app, err = chunk.Appender() if err != nil { @@ -127,18 +123,17 @@ func (o *OOOChunk) ToEncodedChunks(mint, maxt int64) (chks []memChunk, err error } } switch encoding { - case chunkenc.EncXOR: - // TODO(krajorama): pass ST. 
- app.Append(0, s.t, s.f) + case chunkenc.EncXOR, chunkenc.EncXOROptST: + app.Append(s.st, s.t, s.f) case chunkenc.EncHistogram: + // TODO(krajorama): handle ST capable histogram chunk. // Ignoring ok is ok, since we don't want to compare to the wrong previous appender anyway. prevHApp, _ := prevApp.(*chunkenc.HistogramAppender) var ( newChunk chunkenc.Chunk recoded bool ) - // TODO(krajorama): pass ST. - newChunk, recoded, app, _ = app.AppendHistogram(prevHApp, 0, s.t, s.h, false) + newChunk, recoded, app, _ = app.AppendHistogram(prevHApp, s.st, s.t, s.h, false) if newChunk != nil { // A new chunk was allocated. if !recoded { chks = append(chks, memChunk{chunk, cmint, cmaxt, nil}) @@ -147,14 +142,14 @@ func (o *OOOChunk) ToEncodedChunks(mint, maxt int64) (chks []memChunk, err error chunk = newChunk } case chunkenc.EncFloatHistogram: + // TODO(krajorama): handle ST capable float histogram chunk. // Ignoring ok is ok, since we don't want to compare to the wrong previous appender anyway. prevHApp, _ := prevApp.(*chunkenc.FloatHistogramAppender) var ( newChunk chunkenc.Chunk recoded bool ) - // TODO(krajorama): pass ST. - newChunk, recoded, app, _ = app.AppendFloatHistogram(prevHApp, 0, s.t, s.fh, false) + newChunk, recoded, app, _ = app.AppendFloatHistogram(prevHApp, s.st, s.t, s.fh, false) if newChunk != nil { // A new chunk was allocated. 
if !recoded { chks = append(chks, memChunk{chunk, cmint, cmaxt, nil}) diff --git a/tsdb/ooo_head_read.go b/tsdb/ooo_head_read.go index 5d2347c2d7..86c64ff6e0 100644 --- a/tsdb/ooo_head_read.go +++ b/tsdb/ooo_head_read.go @@ -77,7 +77,7 @@ func (oh *HeadAndOOOIndexReader) Series(ref storage.SeriesRef, builder *labels.S *chks = (*chks)[:0] if s.ooo != nil { - return getOOOSeriesChunks(s, oh.mint, oh.maxt, oh.lastGarbageCollectedMmapRef, 0, true, oh.inoMint, chks) + return getOOOSeriesChunks(s, oh.head.opts.EnableSTStorage.Load(), oh.mint, oh.maxt, oh.lastGarbageCollectedMmapRef, 0, true, oh.inoMint, chks) } *chks = appendSeriesChunks(s, oh.inoMint, oh.maxt, *chks) return nil @@ -88,7 +88,7 @@ func (oh *HeadAndOOOIndexReader) Series(ref storage.SeriesRef, builder *labels.S // // maxMmapRef tells upto what max m-map chunk that we can consider. If it is non-0, then // the oooHeadChunk will not be considered. -func getOOOSeriesChunks(s *memSeries, mint, maxt int64, lastGarbageCollectedMmapRef, maxMmapRef chunks.ChunkDiskMapperRef, includeInOrder bool, inoMint int64, chks *[]chunks.Meta) error { +func getOOOSeriesChunks(s *memSeries, storeST bool, mint, maxt int64, lastGarbageCollectedMmapRef, maxMmapRef chunks.ChunkDiskMapperRef, includeInOrder bool, inoMint int64, chks *[]chunks.Meta) error { tmpChks := make([]chunks.Meta, 0, len(s.ooo.oooMmappedChunks)) addChunk := func(minT, maxT int64, ref chunks.ChunkRef, chunk chunkenc.Chunk) { @@ -106,7 +106,7 @@ func getOOOSeriesChunks(s *memSeries, mint, maxt int64, lastGarbageCollectedMmap if c.OverlapsClosedInterval(mint, maxt) && maxMmapRef == 0 { ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.ooo.oooMmappedChunks)))) if len(c.chunk.samples) > 0 { // Empty samples happens in tests, at least. 
- chks, err := s.ooo.oooHeadChunk.chunk.ToEncodedChunks(c.minTime, c.maxTime) + chks, err := s.ooo.oooHeadChunk.chunk.ToEncodedChunks(c.minTime, c.maxTime, storeST) if err != nil { handleChunkWriteError(err) return nil @@ -347,7 +347,7 @@ func NewOOOCompactionHead(ctx context.Context, head *Head) (*OOOCompactionHead, } var lastMmapRef chunks.ChunkDiskMapperRef - mmapRefs := ms.mmapCurrentOOOHeadChunk(head.chunkDiskMapper, head.logger) + mmapRefs := ms.mmapCurrentOOOHeadChunk(chunkOpts{chunkDiskMapper: head.chunkDiskMapper, storeST: head.opts.EnableSTStorage.Load()}, head.logger) if len(mmapRefs) == 0 && len(ms.ooo.oooMmappedChunks) > 0 { // Nothing was m-mapped. So take the mmapRef from the existing slice if it exists. mmapRefs = []chunks.ChunkDiskMapperRef{ms.ooo.oooMmappedChunks[len(ms.ooo.oooMmappedChunks)-1].ref} @@ -481,7 +481,7 @@ func (ir *OOOCompactionHeadIndexReader) Series(ref storage.SeriesRef, builder *l return nil } - return getOOOSeriesChunks(s, ir.ch.mint, ir.ch.maxt, 0, ir.ch.lastMmapRef, false, 0, chks) + return getOOOSeriesChunks(s, ir.ch.head.opts.EnableSTStorage.Load(), ir.ch.mint, ir.ch.maxt, 0, ir.ch.lastMmapRef, false, 0, chks) } func (*OOOCompactionHeadIndexReader) SortedLabelValues(_ context.Context, _ string, _ *storage.LabelHints, _ ...*labels.Matcher) ([]string, error) { diff --git a/tsdb/ooo_head_test.go b/tsdb/ooo_head_test.go index 99cd357a30..f7e73233fb 100644 --- a/tsdb/ooo_head_test.go +++ b/tsdb/ooo_head_test.go @@ -31,10 +31,11 @@ const testMaxSize int = 32 func valEven(pos int) int64 { return int64(pos*2 + 2) } // s[0]=2, s[1]=4, s[2]=6, ..., s[31]=64 - Predictable pre-existing values func valOdd(pos int) int64 { return int64(pos*2 + 1) } // s[0]=1, s[1]=3, s[2]=5, ..., s[31]=63 - New values will interject at chosen position because they sort before the pre-existing vals. 
-func makeEvenSampleSlice(n int, sampleFunc func(ts int64) sample) []sample { +func makeEvenSampleSlice(n int, sampleFunc func(st, ts int64) sample) []sample { s := make([]sample, n) for i := range n { - s[i] = sampleFunc(valEven(i)) + ts := valEven(i) + s[i] = sampleFunc(ts, ts) // Use ts as st for consistency } return s } @@ -43,23 +44,50 @@ func makeEvenSampleSlice(n int, sampleFunc func(ts int64) sample) []sample { // - Number of pre-existing samples anywhere from 0 to testMaxSize-1. // - Insert new sample before first pre-existing samples, after the last, and anywhere in between. // - With a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves. +// - With st=0 and st!=0 to verify ordering is based on sample.t, not sample.st. func TestOOOInsert(t *testing.T) { scenarios := map[string]struct { - sampleFunc func(ts int64) sample + sampleFunc func(st, ts int64) sample }{ - "float": { - sampleFunc: func(ts int64) sample { - return sample{t: ts, f: float64(ts)} + "float st=0": { + sampleFunc: func(st, ts int64) sample { + return sample{st: 0, t: ts, f: float64(ts)} }, }, - "integer histogram": { - sampleFunc: func(ts int64) sample { - return sample{t: ts, h: tsdbutil.GenerateTestHistogram(ts)} + "float st=ts": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts, t: ts, f: float64(ts)} }, }, - "float histogram": { - sampleFunc: func(ts int64) sample { - return sample{t: ts, fh: tsdbutil.GenerateTestFloatHistogram(ts)} + "float st=ts-100": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts - 100, t: ts, f: float64(ts)} + }, + }, + "float st descending while t ascending": { + // st values go in opposite direction of t to ensure ordering is by t. 
+ sampleFunc: func(st, ts int64) sample { + return sample{st: 1000 - ts, t: ts, f: float64(ts)} + }, + }, + "integer histogram st=0": { + sampleFunc: func(st, ts int64) sample { + return sample{st: 0, t: ts, h: tsdbutil.GenerateTestHistogram(ts)} + }, + }, + "integer histogram st=ts": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts, t: ts, h: tsdbutil.GenerateTestHistogram(ts)} + }, + }, + "float histogram st=0": { + sampleFunc: func(st, ts int64) sample { + return sample{st: 0, t: ts, fh: tsdbutil.GenerateTestFloatHistogram(ts)} + }, + }, + "float histogram st=ts": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts, t: ts, fh: tsdbutil.GenerateTestFloatHistogram(ts)} }, }, } @@ -71,7 +99,7 @@ func TestOOOInsert(t *testing.T) { } func testOOOInsert(t *testing.T, - sampleFunc func(ts int64) sample, + sampleFunc func(st, ts int64) sample, ) { for numPreExisting := 0; numPreExisting <= testMaxSize; numPreExisting++ { // For example, if we have numPreExisting 2, then: @@ -84,19 +112,22 @@ func testOOOInsert(t *testing.T, chunk := NewOOOChunk() chunk.samples = make([]sample, numPreExisting) chunk.samples = makeEvenSampleSlice(numPreExisting, sampleFunc) - newSample := sampleFunc(valOdd(insertPos)) - chunk.Insert(newSample.t, newSample.f, newSample.h, newSample.fh) + ts := valOdd(insertPos) + newSample := sampleFunc(ts, ts) // Use ts as st for consistency + chunk.Insert(newSample.st, newSample.t, newSample.f, newSample.h, newSample.fh) var expSamples []sample // Our expected new samples slice, will be first the original samples. for i := 0; i < insertPos; i++ { - expSamples = append(expSamples, sampleFunc(valEven(i))) + ts := valEven(i) + expSamples = append(expSamples, sampleFunc(ts, ts)) } // Then the new sample. expSamples = append(expSamples, newSample) // Followed by any original samples that were pushed back by the new one. 
for i := insertPos; i < numPreExisting; i++ { - expSamples = append(expSamples, sampleFunc(valEven(i))) + ts := valEven(i) + expSamples = append(expSamples, sampleFunc(ts, ts)) } require.Equal(t, expSamples, chunk.samples, "numPreExisting %d, insertPos %d", numPreExisting, insertPos) @@ -107,23 +138,50 @@ func testOOOInsert(t *testing.T, // TestOOOInsertDuplicate tests the correct behavior when inserting a sample that is a duplicate of any // pre-existing samples, with between 1 and testMaxSize pre-existing samples and // with a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves. +// With st=0 and st!=0 to verify duplicate detection is based on sample.t, not sample.st. func TestOOOInsertDuplicate(t *testing.T) { scenarios := map[string]struct { - sampleFunc func(ts int64) sample + sampleFunc func(st, ts int64) sample }{ - "float": { - sampleFunc: func(ts int64) sample { - return sample{t: ts, f: float64(ts)} + "float st=0": { + sampleFunc: func(st, ts int64) sample { + return sample{st: 0, t: ts, f: float64(ts)} }, }, - "integer histogram": { - sampleFunc: func(ts int64) sample { - return sample{t: ts, h: tsdbutil.GenerateTestHistogram(ts)} + "float st=ts": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts, t: ts, f: float64(ts)} }, }, - "float histogram": { - sampleFunc: func(ts int64) sample { - return sample{t: ts, fh: tsdbutil.GenerateTestFloatHistogram(ts)} + "float st=ts-100": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts - 100, t: ts, f: float64(ts)} + }, + }, + "float st descending while t ascending": { + // st values go in opposite direction of t to ensure duplicate detection is by t. 
+ sampleFunc: func(st, ts int64) sample { + return sample{st: 1000 - ts, t: ts, f: float64(ts)} + }, + }, + "integer histogram st=0": { + sampleFunc: func(st, ts int64) sample { + return sample{st: 0, t: ts, h: tsdbutil.GenerateTestHistogram(ts)} + }, + }, + "integer histogram st=ts": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts, t: ts, h: tsdbutil.GenerateTestHistogram(ts)} + }, + }, + "float histogram st=0": { + sampleFunc: func(st, ts int64) sample { + return sample{st: 0, t: ts, fh: tsdbutil.GenerateTestFloatHistogram(ts)} + }, + }, + "float histogram st=ts": { + sampleFunc: func(st, ts int64) sample { + return sample{st: ts, t: ts, fh: tsdbutil.GenerateTestFloatHistogram(ts)} }, }, } @@ -135,7 +193,7 @@ func TestOOOInsertDuplicate(t *testing.T) { } func testOOOInsertDuplicate(t *testing.T, - sampleFunc func(ts int64) sample, + sampleFunc func(st, ts int64) sample, ) { for num := 1; num <= testMaxSize; num++ { for dupPos := 0; dupPos < num; dupPos++ { @@ -145,7 +203,7 @@ func testOOOInsertDuplicate(t *testing.T, dupSample := chunk.samples[dupPos] dupSample.f = 0.123 - ok := chunk.Insert(dupSample.t, dupSample.f, dupSample.h, dupSample.fh) + ok := chunk.Insert(dupSample.st, dupSample.t, dupSample.f, dupSample.h, dupSample.fh) expSamples := makeEvenSampleSlice(num, sampleFunc) // We expect no change. 
require.False(t, ok) @@ -252,17 +310,17 @@ func TestOOOChunks_ToEncodedChunks(t *testing.T) { for _, s := range tc.samples { switch s.Type() { case chunkenc.ValFloat: - oooChunk.Insert(s.t, s.f, nil, nil) + oooChunk.Insert(s.st, s.t, s.f, nil, nil) case chunkenc.ValHistogram: - oooChunk.Insert(s.t, 0, s.h.Copy(), nil) + oooChunk.Insert(s.st, s.t, 0, s.h.Copy(), nil) case chunkenc.ValFloatHistogram: - oooChunk.Insert(s.t, 0, nil, s.fh.Copy()) + oooChunk.Insert(s.st, s.t, 0, nil, s.fh.Copy()) default: t.Fatalf("unexpected sample type %d", s.Type()) } } - chunks, err := oooChunk.ToEncodedChunks(math.MinInt64, math.MaxInt64) + chunks, err := oooChunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, false) require.NoError(t, err) require.Len(t, chunks, len(tc.expectedChunks), "number of chunks") sampleIndex := 0 @@ -308,3 +366,87 @@ func TestOOOChunks_ToEncodedChunks(t *testing.T) { }) } } + +// TestOOOChunks_ToEncodedChunks_WithST tests ToEncodedChunks with storeST=true and storeST=false for float samples. +// When storeST=true, st values are preserved; when storeST=false, AtST() returns 0. +// TODO(@krajorama): Add histogram test cases once ST storage is implemented for histograms. 
+func TestOOOChunks_ToEncodedChunks_WithST(t *testing.T) { + testCases := map[string]struct { + samples []sample + }{ + "floats with st=0": { + samples: []sample{ + {st: 0, t: 1000, f: 43.0}, + {st: 0, t: 1100, f: 42.0}, + }, + }, + "floats with st=t": { + samples: []sample{ + {st: 1000, t: 1000, f: 43.0}, + {st: 1100, t: 1100, f: 42.0}, + }, + }, + "floats with st=t-100": { + samples: []sample{ + {st: 900, t: 1000, f: 43.0}, + {st: 1000, t: 1100, f: 42.0}, + }, + }, + "floats with varying st": { + samples: []sample{ + {st: 500, t: 1000, f: 43.0}, + {st: 1100, t: 1100, f: 42.0}, // st == t + {st: 0, t: 1200, f: 41.0}, // st == 0 + }, + }, + } + + storageScenarios := []struct { + name string + storeST bool + expectedEncoding chunkenc.Encoding + }{ + {"storeST=true", true, chunkenc.EncXOROptST}, + {"storeST=false", false, chunkenc.EncXOR}, + } + + for name, tc := range testCases { + for _, ss := range storageScenarios { + t.Run(name+"/"+ss.name, func(t *testing.T) { + oooChunk := OOOChunk{} + for _, s := range tc.samples { + oooChunk.Insert(s.st, s.t, s.f, nil, nil) + } + + chunks, err := oooChunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, ss.storeST) + require.NoError(t, err) + require.Len(t, chunks, 1, "number of chunks") + + c := chunks[0] + require.Equal(t, ss.expectedEncoding, c.chunk.Encoding(), "chunk encoding") + require.Equal(t, tc.samples[0].t, c.minTime, "chunk minTime") + require.Equal(t, tc.samples[len(tc.samples)-1].t, c.maxTime, "chunk maxTime") + + // Verify samples can be read back with correct st and t values. + it := c.chunk.Iterator(nil) + sampleIndex := 0 + for it.Next() == chunkenc.ValFloat { + gotT, gotF := it.At() + gotST := it.AtST() + + if ss.storeST { + // When storeST=true, st values should be preserved. + require.Equal(t, tc.samples[sampleIndex].st, gotST, "sample %d st", sampleIndex) + } else { + // When storeST=false, AtST() should return 0. 
+ require.Equal(t, int64(0), gotST, "sample %d st should be 0 when storeST=false", sampleIndex) + } + require.Equal(t, tc.samples[sampleIndex].t, gotT, "sample %d t", sampleIndex) + require.Equal(t, tc.samples[sampleIndex].f, gotF, "sample %d f", sampleIndex) + sampleIndex++ + } + require.Equal(t, len(tc.samples), sampleIndex, "number of samples") + }) + } + } +} diff --git a/tsdb/querier.go b/tsdb/querier.go index ac7a14e1b3..6d0cf36db4 100644 --- a/tsdb/querier.go +++ b/tsdb/querier.go @@ -866,7 +866,6 @@ func (p *populateWithDelChunkSeriesIterator) Next() bool { // populateCurrForSingleChunk sets the fields within p.currMetaWithChunk. This // should be called if the samples in p.currDelIter only form one chunk. -// TODO(krajorama): test ST when chunks support it. func (p *populateWithDelChunkSeriesIterator) populateCurrForSingleChunk() bool { valueType := p.currDelIter.Next() if valueType == chunkenc.ValNone { @@ -885,60 +884,47 @@ func (p *populateWithDelChunkSeriesIterator) populateCurrForSingleChunk() bool { st, t int64 err error ) - switch valueType { - case chunkenc.ValHistogram: - newChunk = chunkenc.NewHistogramChunk() - if app, err = newChunk.Appender(); err != nil { + newChunk, err = chunkenc.NewEmptyChunk(p.currMeta.Chunk.Encoding()) + if err != nil { + p.err = fmt.Errorf("create new chunk while re-encoding: %w", err) + return false + } + app, err = newChunk.Appender() + if err != nil { + p.err = fmt.Errorf("create appender while re-encoding: %w", err) + return false + } + +loop: + for vt := valueType; vt != chunkenc.ValNone; vt = p.currDelIter.Next() { + if vt != valueType { + err = fmt.Errorf("found value type %v in chunk with %v", vt, valueType) break } - for vt := valueType; vt != chunkenc.ValNone; vt = p.currDelIter.Next() { - if vt != chunkenc.ValHistogram { - err = fmt.Errorf("found value type %v in histogram chunk", vt) - break - } - var h *histogram.Histogram - t, h = p.currDelIter.AtHistogram(nil) - st = p.currDelIter.AtST() - _, _, app, err 
= app.AppendHistogram(nil, st, t, h, true) - if err != nil { - break - } - } - case chunkenc.ValFloat: - newChunk = chunkenc.NewXORChunk() - if app, err = newChunk.Appender(); err != nil { - break - } - for vt := valueType; vt != chunkenc.ValNone; vt = p.currDelIter.Next() { - if vt != chunkenc.ValFloat { - err = fmt.Errorf("found value type %v in float chunk", vt) - break - } + st = p.currDelIter.AtST() + switch vt { + case chunkenc.ValFloat: var v float64 t, v = p.currDelIter.At() - st = p.currDelIter.AtST() app.Append(st, t, v) - } - case chunkenc.ValFloatHistogram: - newChunk = chunkenc.NewFloatHistogramChunk() - if app, err = newChunk.Appender(); err != nil { - break - } - for vt := valueType; vt != chunkenc.ValNone; vt = p.currDelIter.Next() { - if vt != chunkenc.ValFloatHistogram { - err = fmt.Errorf("found value type %v in histogram chunk", vt) - break + case chunkenc.ValHistogram: + var h *histogram.Histogram + t, h = p.currDelIter.AtHistogram(nil) + _, _, app, err = app.AppendHistogram(nil, st, t, h, true) + if err != nil { + break loop } + case chunkenc.ValFloatHistogram: var h *histogram.FloatHistogram t, h = p.currDelIter.AtFloatHistogram(nil) - st = p.currDelIter.AtST() _, _, app, err = app.AppendFloatHistogram(nil, st, t, h, true) if err != nil { - break + break loop } + default: + err = fmt.Errorf("populateCurrForSingleChunk: value type %v unsupported", valueType) + break loop } - default: - err = fmt.Errorf("populateCurrForSingleChunk: value type %v unsupported", valueType) } if err != nil { @@ -958,7 +944,6 @@ func (p *populateWithDelChunkSeriesIterator) populateCurrForSingleChunk() bool { // populateChunksFromIterable reads the samples from currDelIter to create // chunks for chunksFromIterable. It also sets p.currMetaWithChunk to the first // chunk. -// TODO(krajorama): test ST when chunks support it. 
func (p *populateWithDelChunkSeriesIterator) populateChunksFromIterable() bool { p.chunksFromIterable = p.chunksFromIterable[:0] p.chunksFromIterableIdx = -1 @@ -982,30 +967,37 @@ func (p *populateWithDelChunkSeriesIterator) populateChunksFromIterable() bool { app chunkenc.Appender - newChunk chunkenc.Chunk - recoded bool - err error ) prevValueType := chunkenc.ValNone + hasTS := false for currentValueType := firstValueType; currentValueType != chunkenc.ValNone; currentValueType = p.currDelIter.Next() { + var ( + newChunk chunkenc.Chunk + recoded bool + ) // Check if the encoding has changed (i.e. we need to create a new // chunk as chunks can't have multiple encoding types). // For the first sample, the following condition will always be true as // ValNone != ValFloat | ValHistogram | ValFloatHistogram. - if currentValueType != prevValueType { + // Also if we need to store start time (ST), but the current chunk is + // not capable. + st = p.currDelIter.AtST() + needTS := st != 0 + if currentValueType != prevValueType || !hasTS && needTS { if prevValueType != chunkenc.ValNone { p.chunksFromIterable = append(p.chunksFromIterable, chunks.Meta{Chunk: currentChunk, MinTime: cmint, MaxTime: cmaxt}) } cmint = p.currDelIter.AtT() - if currentChunk, err = currentValueType.NewChunk(); err != nil { + if currentChunk, err = currentValueType.NewChunk(needTS); err != nil { break } if app, err = currentChunk.Appender(); err != nil { break } + hasTS = needTS } switch currentValueType { @@ -1013,14 +1005,12 @@ func (p *populateWithDelChunkSeriesIterator) populateChunksFromIterable() bool { { var v float64 t, v = p.currDelIter.At() - st = p.currDelIter.AtST() app.Append(st, t, v) } case chunkenc.ValHistogram: { var v *histogram.Histogram t, v = p.currDelIter.AtHistogram(nil) - st = p.currDelIter.AtST() // No need to set prevApp as AppendHistogram will set the // counter reset header for the appender that's returned. 
newChunk, recoded, app, err = app.AppendHistogram(nil, st, t, v, false) @@ -1029,7 +1019,6 @@ func (p *populateWithDelChunkSeriesIterator) populateChunksFromIterable() bool { { var v *histogram.FloatHistogram t, v = p.currDelIter.AtFloatHistogram(nil) - st = p.currDelIter.AtST() // No need to set prevApp as AppendHistogram will set the // counter reset header for the appender that's returned. newChunk, recoded, app, err = app.AppendFloatHistogram(nil, st, t, v, false) diff --git a/tsdb/querier_test.go b/tsdb/querier_test.go index 4387635959..de96755e23 100644 --- a/tsdb/querier_test.go +++ b/tsdb/querier_test.go @@ -2025,6 +2025,207 @@ func TestPopulateWithDelSeriesIterator_NextWithMinTime(t *testing.T) { } } +// TestPopulateWithDelSeriesIterator_WithST tests that ST (start time) values are +// correctly preserved when iterating through chunks with ST support. +func TestPopulateWithDelSeriesIterator_WithST(t *testing.T) { + // Samples with non-zero ST values to test ST preservation. + samplesWithST := [][]chunks.Sample{ + { + sample{st: 100, t: 1000, f: 1.0}, + sample{st: 200, t: 2000, f: 2.0}, + sample{st: 300, t: 3000, f: 3.0}, + }, + } + + // Samples with varying ST patterns. 
+ samplesVaryingST := [][]chunks.Sample{ + { + sample{st: 0, t: 1000, f: 1.0}, // st=0 + sample{st: 1500, t: 1500, f: 1.5}, // st=t + sample{st: 1900, t: 2000, f: 2.0}, // st=t-100 + sample{st: 500, t: 3000, f: 3.0}, // st < t + }, + } + + cases := []struct { + name string + samples [][]chunks.Sample + expected []chunks.Sample + }{ + { + name: "all samples have non-zero ST", + samples: samplesWithST, + expected: []chunks.Sample{ + sample{st: 100, t: 1000, f: 1.0}, + sample{st: 200, t: 2000, f: 2.0}, + sample{st: 300, t: 3000, f: 3.0}, + }, + }, + { + name: "samples with varying ST patterns", + samples: samplesVaryingST, + expected: []chunks.Sample{ + sample{st: 0, t: 1000, f: 1.0}, + sample{st: 1500, t: 1500, f: 1.5}, + sample{st: 1900, t: 2000, f: 2.0}, + sample{st: 500, t: 3000, f: 3.0}, + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + // Test with chunks (not iterables). + t.Run("chunks", func(t *testing.T) { + f, chkMetas := createFakeReaderAndNotPopulatedChunks(tc.samples...) + it := &populateWithDelSeriesIterator{} + it.reset(ulid.ULID{}, f, chkMetas, nil) + + var result []chunks.Sample + for it.Next() != chunkenc.ValNone { + st := it.AtST() + ts, v := it.At() + result = append(result, sample{st: st, t: ts, f: v}) + } + require.NoError(t, it.Err()) + require.Equal(t, tc.expected, result) + }) + + // Test with iterables. + t.Run("iterables", func(t *testing.T) { + f, chkMetas := createFakeReaderAndIterables(tc.samples...) + it := &populateWithDelSeriesIterator{} + it.reset(ulid.ULID{}, f, chkMetas, nil) + + var result []chunks.Sample + for it.Next() != chunkenc.ValNone { + st := it.AtST() + ts, v := it.At() + result = append(result, sample{st: st, t: ts, f: v}) + } + require.NoError(t, it.Err()) + require.Equal(t, tc.expected, result) + }) + }) + } +} + +// TestPopulateWithDelChunkSeriesIterator_WithST tests that ST (start time) values are +// correctly preserved when re-encoding chunks with deletions. 
+func TestPopulateWithDelChunkSeriesIterator_WithST(t *testing.T) { + samplesWithST := []chunks.Sample{ + sample{st: 100, t: 1000, f: 1.0}, + sample{st: 200, t: 2000, f: 2.0}, + sample{st: 300, t: 3000, f: 3.0}, + sample{st: 400, t: 4000, f: 4.0}, + sample{st: 500, t: 5000, f: 5.0}, + } + samplesWithNoLeadingST := []chunks.Sample{ + sample{st: 0, t: 1000, f: 1.0}, + sample{st: 0, t: 2000, f: 2.0}, + sample{st: 0, t: 3000, f: 3.0}, + sample{st: 400, t: 4000, f: 4.0}, + sample{st: 500, t: 5000, f: 5.0}, + } + + cases := []struct { + name string + samples [][]chunks.Sample + intervals tombstones.Intervals + expected []chunks.Sample + }{ + { + name: "no deletions - ST preserved", + samples: [][]chunks.Sample{samplesWithST}, + intervals: nil, + expected: samplesWithST, + }, + { + name: "with deletions - ST preserved in remaining samples", + samples: [][]chunks.Sample{samplesWithST}, + // Delete samples at t=2000 and t=4000. + intervals: tombstones.Intervals{{Mint: 2000, Maxt: 2000}, {Mint: 4000, Maxt: 4000}}, + expected: []chunks.Sample{ + sample{st: 100, t: 1000, f: 1.0}, + sample{st: 300, t: 3000, f: 3.0}, + sample{st: 500, t: 5000, f: 5.0}, + }, + }, + { + name: "delete first sample - ST preserved", + samples: [][]chunks.Sample{samplesWithST}, + // Delete first sample. + intervals: tombstones.Intervals{{Mint: 1000, Maxt: 1000}}, + expected: []chunks.Sample{ + sample{st: 200, t: 2000, f: 2.0}, + sample{st: 300, t: 3000, f: 3.0}, + sample{st: 400, t: 4000, f: 4.0}, + sample{st: 500, t: 5000, f: 5.0}, + }, + }, + { + // This tests that populateCurrForSingleChunk can handle + // chunks that don't start with ST, but introduce ST later. + name: "delete first sample - ST late preserved", + samples: [][]chunks.Sample{samplesWithNoLeadingST}, + // Delete first sample. 
+ intervals: tombstones.Intervals{{Mint: 1000, Maxt: 1000}}, + expected: []chunks.Sample{ + sample{st: 0, t: 2000, f: 2.0}, + sample{st: 0, t: 3000, f: 3.0}, + sample{st: 400, t: 4000, f: 4.0}, + sample{st: 500, t: 5000, f: 5.0}, + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + // Test with chunks that need re-encoding due to deletions. + t.Run("chunks", func(t *testing.T) { + f, chkMetas := createFakeReaderAndNotPopulatedChunks(tc.samples...) + it := &populateWithDelChunkSeriesIterator{} + it.reset(ulid.ULID{}, f, chkMetas, tc.intervals) + + var result []chunks.Sample + for it.Next() { + meta := it.At() + chkIt := meta.Chunk.Iterator(nil) + for chkIt.Next() != chunkenc.ValNone { + st := chkIt.AtST() + ts, v := chkIt.At() + result = append(result, sample{st: st, t: ts, f: v}) + } + require.NoError(t, chkIt.Err()) + } + require.NoError(t, it.Err()) + require.Equal(t, tc.expected, result) + }) + + // Test with iterables. + t.Run("iterables", func(t *testing.T) { + f, chkMetas := createFakeReaderAndIterables(tc.samples...) + it := &populateWithDelChunkSeriesIterator{} + it.reset(ulid.ULID{}, f, chkMetas, tc.intervals) + + var result []chunks.Sample + for it.Next() { + meta := it.At() + chkIt := meta.Chunk.Iterator(nil) + for chkIt.Next() != chunkenc.ValNone { + st := chkIt.AtST() + ts, v := chkIt.At() + result = append(result, sample{st: st, t: ts, f: v}) + } + require.NoError(t, chkIt.Err()) + } + require.NoError(t, it.Err()) + require.Equal(t, tc.expected, result) + }) + }) + } +} + // Test the cost of merging series sets for different number of merged sets and their size. // The subset are all equivalent so this does not capture merging of partial or non-overlapping sets well. // TODO(bwplotka): Merge with storage merged series set benchmark. 
diff --git a/tsdb/record/record_test.go b/tsdb/record/record_test.go index c15c9aa33c..970930fbe5 100644 --- a/tsdb/record/record_test.go +++ b/tsdb/record/record_test.go @@ -545,7 +545,7 @@ func TestRecord_Type(t *testing.T) { recordType = dec.Type(enc.Samples(samples, nil)) require.Equal(t, Samples, recordType) - // With EnableSTStorage set, all Samples are V2 + // With EnableSTStorage set, all Samples are V2. enc = Encoder{EnableSTStorage: true} samples = []RefSample{{Ref: 123, T: 12345, V: 1.2345}} recordType = dec.Type(enc.Samples(samples, nil)) From d3f405301244667e2655c84a3700eb41e0e4d9b4 Mon Sep 17 00:00:00 2001 From: bwplotka Date: Mon, 23 Feb 2026 10:02:32 +0000 Subject: [PATCH 32/73] fix test after merge Signed-off-by: bwplotka --- tsdb/wlog/checkpoint_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tsdb/wlog/checkpoint_test.go b/tsdb/wlog/checkpoint_test.go index 18bf8fc2a2..9056aab70b 100644 --- a/tsdb/wlog/checkpoint_test.go +++ b/tsdb/wlog/checkpoint_test.go @@ -434,7 +434,7 @@ func TestCheckpointDeletesTemporaryCheckpoints(t *testing.T) { require.NoError(t, err) defer w.Close() - _, err = Checkpoint(promslog.NewNopLogger(), w, 0, 1000, func(_ chunks.HeadSeriesRef) bool { return true }, 1000) + _, err = Checkpoint(promslog.NewNopLogger(), w, 0, 1000, func(_ chunks.HeadSeriesRef) bool { return true }, 1000, false) require.NoError(t, err) files, err := os.ReadDir(dir) From ba1b87f51ff4ba71e90f0cade6f1f1c46ce66635 Mon Sep 17 00:00:00 2001 From: bwplotka Date: Fri, 20 Feb 2026 10:45:16 +0000 Subject: [PATCH 33/73] feat: RW2 sending ST support Signed-off-by: bwplotka tmp Signed-off-by: bwplotka --- .github/workflows/ci.yml | 4 +-- compliance/go.mod | 2 +- compliance/go.sum | 4 +-- storage/remote/queue_manager.go | 44 ++++++++++++++++------------ storage/remote/queue_manager_test.go | 5 +++- 5 files changed, 33 insertions(+), 26 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 
2482055fa2..6c712849e9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -147,9 +147,7 @@ jobs: enable_npm: false # NOTE: Those tests are based on https://github.com/prometheus/compliance and # are executed against the ./cmd/prometheus main package. - - run: go test -skip ${SKIP_TESTS} -v --tags=compliance ./compliance/... - env: - SKIP_TESTS: "TestRemoteWriteSender/prometheus/samples/rw2/start_timestamp*" # TODO(bwplotka): PROM-60 + - run: go test -v --tags=compliance ./compliance/... build: name: Build Prometheus for common architectures diff --git a/compliance/go.mod b/compliance/go.mod index 54adc20b6c..efc9342375 100644 --- a/compliance/go.mod +++ b/compliance/go.mod @@ -2,7 +2,7 @@ module compliance go 1.25.5 -require github.com/prometheus/compliance/remotewrite v0.0.0-20260220101514-bccaa3a70275 +require github.com/prometheus/compliance/remotewrite v0.0.0-20260223092825-818283e1171e require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect diff --git a/compliance/go.sum b/compliance/go.sum index 6f273f49bd..799748d81d 100644 --- a/compliance/go.sum +++ b/compliance/go.sum @@ -30,8 +30,8 @@ github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNw github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.67.2 h1:PcBAckGFTIHt2+L3I33uNRTlKTplNzFctXcWhPyAEN8= github.com/prometheus/common v0.67.2/go.mod h1:63W3KZb1JOKgcjlIr64WW/LvFGAqKPj0atm+knVGEko= -github.com/prometheus/compliance/remotewrite v0.0.0-20260220101514-bccaa3a70275 h1:NLTtFqM00EuqtisYX9P+hQkjoxNxsR2oUQWDluyD2Xw= -github.com/prometheus/compliance/remotewrite v0.0.0-20260220101514-bccaa3a70275/go.mod h1:VEPZGvpSBbzTKc5acnBj9ng4gfo1DZ4qBsCQnoNFiSc= +github.com/prometheus/compliance/remotewrite v0.0.0-20260223092825-818283e1171e h1:tT/KBv0aSFq4AElo/bSVvUd+yNKj72hkRsyiKU45nIQ= +github.com/prometheus/compliance/remotewrite v0.0.0-20260223092825-818283e1171e/go.mod 
h1:VEPZGvpSBbzTKc5acnBj9ng4gfo1DZ4qBsCQnoNFiSc= github.com/prometheus/prometheus v0.307.4-0.20251119130332-1174b0ce4f1f h1:ERPCnBglv9Z4IjkEBTNbcHmZPlryMldXVWLkk7TeBIY= github.com/prometheus/prometheus v0.307.4-0.20251119130332-1174b0ce4f1f/go.mod h1:7hcXiGf9AXIKW2ehWWzxkvRYJTGmc2StUIJ8mprfxjg= github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go index 63cdfb36f4..70afe6fbaf 100644 --- a/storage/remote/queue_manager.go +++ b/storage/remote/queue_manager.go @@ -760,11 +760,12 @@ outer: default: } if t.shards.enqueue(s.Ref, timeSeries{ - seriesLabels: lbls, - metadata: meta, - timestamp: s.T, - value: s.V, - sType: tSample, + seriesLabels: lbls, + metadata: meta, + startTimestamp: s.ST, + timestamp: s.T, + value: s.V, + sType: tSample, }) { continue outer } @@ -882,9 +883,10 @@ outer: if t.shards.enqueue(h.Ref, timeSeries{ seriesLabels: lbls, metadata: meta, - timestamp: h.T, - histogram: h.H, - sType: tHistogram, + // TODO(bwplotka): Populate ST once histogram Ref has it. + timestamp: h.T, + histogram: h.H, + sType: tHistogram, }) { continue outer } @@ -941,8 +943,9 @@ outer: default: } if t.shards.enqueue(h.Ref, timeSeries{ - seriesLabels: lbls, - metadata: meta, + seriesLabels: lbls, + metadata: meta, + // TODO(bwplotka): Populate ST once histogram Ref has it. 
timestamp: h.T, floatHistogram: h.FH, sType: tFloatHistogram, @@ -1396,13 +1399,13 @@ type queue struct { } type timeSeries struct { - seriesLabels labels.Labels - value float64 - histogram *histogram.Histogram - floatHistogram *histogram.FloatHistogram - metadata *metadata.Metadata - timestamp int64 - exemplarLabels labels.Labels + seriesLabels labels.Labels + value float64 + histogram *histogram.Histogram + floatHistogram *histogram.FloatHistogram + metadata *metadata.Metadata + startTimestamp, timestamp int64 + exemplarLabels labels.Labels // The type of series: sample, exemplar, or histogram. sType seriesType } @@ -1995,8 +1998,9 @@ func populateV2TimeSeries(symbolTable *writev2.SymbolsTable, batch []timeSeries, switch d.sType { case tSample: pendingData[nPending].Samples = append(pendingData[nPending].Samples, writev2.Sample{ - Value: d.value, - Timestamp: d.timestamp, + Value: d.value, + Timestamp: d.timestamp, + StartTimestamp: d.startTimestamp, }) nPendingSamples++ case tExemplar: @@ -2007,9 +2011,11 @@ func populateV2TimeSeries(symbolTable *writev2.SymbolsTable, batch []timeSeries, }) nPendingExemplars++ case tHistogram: + // TODO(bwplotka): Extend with ST once histograms populate it. pendingData[nPending].Histograms = append(pendingData[nPending].Histograms, writev2.FromIntHistogram(d.timestamp, d.histogram)) nPendingHistograms++ case tFloatHistogram: + // TODO(bwplotka): Extend with ST once histograms populate it. 
pendingData[nPending].Histograms = append(pendingData[nPending].Histograms, writev2.FromFloatHistogram(d.timestamp, d.floatHistogram)) nPendingHistograms++ case tMetadata: diff --git a/storage/remote/queue_manager_test.go b/storage/remote/queue_manager_test.go index e329b8d710..f97b0cd1e7 100644 --- a/storage/remote/queue_manager_test.go +++ b/storage/remote/queue_manager_test.go @@ -866,9 +866,12 @@ func generateRecords(c recCase) (ret records) { Help: "help text", } for j := range c.samplesPerSeries { + ts := c.tsFn(i, j) + st := ts - 1 // Keep ST simple for now; we don't need to + // test exact semantics. ret.samples[i*c.samplesPerSeries+j] = record.RefSample{ Ref: chunks.HeadSeriesRef(i), - T: c.tsFn(i, j), + T: st, V: float64(i), } } From c2eac549d5b787dfc732a33b2337afc29e95b4e5 Mon Sep 17 00:00:00 2001 From: bwplotka Date: Mon, 23 Feb 2026 16:43:07 +0000 Subject: [PATCH 34/73] tests: test ST in a cheapest way possible Signed-off-by: bwplotka --- storage/remote/queue_manager_test.go | 50 ++++++++++++++++++---------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/storage/remote/queue_manager_test.go b/storage/remote/queue_manager_test.go index f97b0cd1e7..d73e8c72b1 100644 --- a/storage/remote/queue_manager_test.go +++ b/storage/remote/queue_manager_test.go @@ -871,7 +871,8 @@ func generateRecords(c recCase) (ret records) { // test exact semantics. ret.samples[i*c.samplesPerSeries+j] = record.RefSample{ Ref: chunks.HeadSeriesRef(i), - T: st, + ST: st, + T: ts, V: float64(i), } } @@ -952,8 +953,8 @@ func getSeriesIDFromRef(r record.RefSeries) string { // TestWriteClient represents write client which does not call remote storage, // but instead re-implements fake WriteHandler for test purposes. 
type TestWriteClient struct { - receivedSamples map[string][]prompb.Sample - expectedSamples map[string][]prompb.Sample + receivedSamples map[string][]writev2.Sample + expectedSamples map[string][]writev2.Sample receivedExemplars map[string][]prompb.Exemplar expectedExemplars map[string][]prompb.Exemplar receivedHistograms map[string][]prompb.Histogram @@ -977,8 +978,8 @@ type TestWriteClient struct { // NewTestWriteClient creates a new testing write client. func NewTestWriteClient(protoMsg remoteapi.WriteMessageType) *TestWriteClient { return &TestWriteClient{ - receivedSamples: map[string][]prompb.Sample{}, - expectedSamples: map[string][]prompb.Sample{}, + receivedSamples: map[string][]writev2.Sample{}, + expectedSamples: map[string][]writev2.Sample{}, receivedMetadata: map[string][]prompb.MetricMetadata{}, expectedMetadata: map[string][]prompb.MetricMetadata{}, protoMsg: protoMsg, @@ -993,18 +994,20 @@ func (c *TestWriteClient) injectErrors(injectedErrs []error) { c.retry = false } +// expectSamples injects samples that will be expected on waitForExpectedData. 
func (c *TestWriteClient) expectSamples(ss []record.RefSample, series []record.RefSeries) { c.mtx.Lock() defer c.mtx.Unlock() - c.expectedSamples = map[string][]prompb.Sample{} - c.receivedSamples = map[string][]prompb.Sample{} + c.expectedSamples = map[string][]writev2.Sample{} + c.receivedSamples = map[string][]writev2.Sample{} for _, s := range ss { tsID := getSeriesIDFromRef(series[s.Ref]) - c.expectedSamples[tsID] = append(c.expectedSamples[tsID], prompb.Sample{ - Timestamp: s.T, - Value: s.V, + c.expectedSamples[tsID] = append(c.expectedSamples[tsID], writev2.Sample{ + StartTimestamp: s.ST, + Timestamp: s.T, + Value: s.V, }) } } @@ -1182,7 +1185,10 @@ func (c *TestWriteClient) Store(_ context.Context, req []byte, _ int) (WriteResp } } - var reqProto *prompb.WriteRequest + var ( + reqProto *prompb.WriteRequest + reqProtoV2 *writev2.Request + ) switch c.protoMsg { case remoteapi.WriteV1MessageType: reqProto = &prompb.WriteRequest{} @@ -1190,10 +1196,10 @@ func (c *TestWriteClient) Store(_ context.Context, req []byte, _ int) (WriteResp case remoteapi.WriteV2MessageType: // NOTE(bwplotka): v1 msg can be unmarshaled to v2 sometimes, without // errors. - var reqProtoV2 writev2.Request - err = proto.Unmarshal(reqBuf, &reqProtoV2) + reqProtoV2 = &writev2.Request{} + err = proto.Unmarshal(reqBuf, reqProtoV2) if err == nil { - reqProto, err = v2RequestToWriteRequest(&reqProtoV2) + reqProto, err = v2RequestToWriteRequest(reqProtoV2) } } if err != nil { @@ -1202,11 +1208,21 @@ func (c *TestWriteClient) Store(_ context.Context, req []byte, _ int) (WriteResp rs := WriteResponseStats{} b := labels.NewScratchBuilder(0) - for _, ts := range reqProto.Timeseries { + for i, ts := range reqProto.Timeseries { labels := ts.ToLabels(&b, nil) tsID := labels.String() - if len(ts.Samples) > 0 { - c.receivedSamples[tsID] = append(c.receivedSamples[tsID], ts.Samples...) 
+ for j, s := range ts.Samples { + st := int64(0) + if reqProtoV2 != nil { + // TODO(bwplotka): Refactor queue manager TestWriteClient for tighter validation + // and native support for new RW2 features. For now we inject STs in RW2 case to the existing test suite. + st = reqProtoV2.Timeseries[i].Samples[j].StartTimestamp + } + c.receivedSamples[tsID] = append(c.receivedSamples[tsID], writev2.Sample{ + StartTimestamp: st, + Timestamp: s.Timestamp, + Value: s.Value, + }) } rs.Samples += len(ts.Samples) From f27ca31bed5efa15f83bd3c2bbc1ef28fbfc1347 Mon Sep 17 00:00:00 2001 From: bwplotka Date: Mon, 23 Feb 2026 16:50:37 +0000 Subject: [PATCH 35/73] tests: add bench CLI recommended invokations Signed-off-by: bwplotka --- storage/remote/queue_manager_test.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/storage/remote/queue_manager_test.go b/storage/remote/queue_manager_test.go index d73e8c72b1..5a572e7deb 100644 --- a/storage/remote/queue_manager_test.go +++ b/storage/remote/queue_manager_test.go @@ -1398,6 +1398,13 @@ var extraLabels []labels.Label = []labels.Label{ {Name: "pod_name", Value: "some-other-name-5j8s8"}, } +// Recommended CLI invocation(s): +/* + export bench=sampleSend && go test ./storage/remote/... \ + -run '^$' -bench '^BenchmarkSampleSend' \ + -benchtime 1s -count 6 -cpu 2 -timeout 999m -benchmem \ + | tee ${bench}.txt +*/ func BenchmarkSampleSend(b *testing.B) { // Send one sample per series, which is the typical remote_write case const numSamples = 1 @@ -1904,6 +1911,13 @@ func createDummyTimeSeries(instances int) []timeSeries { return result } +// Recommended CLI invocation(s): +/* + export bench=buildWriteRequest && go test ./storage/remote/... 
\ + -run '^$' -bench '^BenchmarkBuildWriteRequest' \ + -benchtime 1s -count 6 -cpu 2 -timeout 999m -benchmem \ + | tee ${bench}.txt +*/ func BenchmarkBuildWriteRequest(b *testing.B) { noopLogger := promslog.NewNopLogger() bench := func(b *testing.B, batch []timeSeries) { @@ -1944,6 +1958,13 @@ func BenchmarkBuildWriteRequest(b *testing.B) { }) } +// Recommended CLI invocation(s): +/* + export bench=buildV2WriteRequest && go test ./storage/remote/... \ + -run '^$' -bench '^BenchmarkBuildV2WriteRequest' \ + -benchtime 1s -count 6 -cpu 2 -timeout 999m -benchmem \ + | tee ${bench}.txt +*/ func BenchmarkBuildV2WriteRequest(b *testing.B) { noopLogger := promslog.NewNopLogger() bench := func(b *testing.B, batch []timeSeries) { From 0ad8516ce09dba2367d003f4acdbbf792b692313 Mon Sep 17 00:00:00 2001 From: bwplotka Date: Wed, 25 Feb 2026 19:15:22 +0000 Subject: [PATCH 36/73] fixed tests after rebase Signed-off-by: bwplotka --- storage/remote/queue_manager_test.go | 62 ++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/storage/remote/queue_manager_test.go b/storage/remote/queue_manager_test.go index 5a572e7deb..e6b933bb78 100644 --- a/storage/remote/queue_manager_test.go +++ b/storage/remote/queue_manager_test.go @@ -139,7 +139,10 @@ func TestBasicContentNegotiation(t *testing.T) { s := NewStorage(nil, nil, nil, dir, defaultFlushDeadline, nil, false) defer s.Close() - recs := generateRecords(recCase{series: 1, samplesPerSeries: 1}) + recs := generateRecords(recCase{ + noST: tc.senderProtoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + series: 1, samplesPerSeries: 1, + }) conf.RemoteWriteConfigs[0].ProtobufMessage = tc.senderProtoMsg require.NoError(t, s.ApplyConfig(conf)) @@ -221,6 +224,7 @@ func TestSampleDelivery(t *testing.T) { s := NewStorage(nil, nil, nil, dir, defaultFlushDeadline, nil, false) defer s.Close() + rc.noST = protoMsg == remoteapi.WriteV1MessageType // RW1 does not support ST. 
recs := generateRecords(rc) var ( @@ -374,7 +378,10 @@ func TestWALMetadataDelivery(t *testing.T) { func TestSampleDeliveryTimeout(t *testing.T) { for _, protoMsg := range []remoteapi.WriteMessageType{remoteapi.WriteV1MessageType, remoteapi.WriteV2MessageType} { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { - recs := generateRecords(recCase{series: 10, samplesPerSeries: 10}) + recs := generateRecords(recCase{ + noST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + series: 10, samplesPerSeries: 10, + }) cfg := testDefaultQueueConfig() mcfg := config.DefaultMetadataConfig cfg.MaxShards = 1 @@ -403,7 +410,10 @@ func TestSampleDeliveryOrder(t *testing.T) { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { ts := 10 n := config.DefaultQueueConfig.MaxSamplesPerSend * ts - recs := generateRecords(recCase{series: n, samplesPerSeries: 1}) + recs := generateRecords(recCase{ + noST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + series: n, samplesPerSeries: 1, + }) c, m := newTestClientAndQueueManager(t, defaultFlushDeadline, protoMsg) c.expectSamples(recs.samples, recs.series) @@ -432,7 +442,10 @@ func TestShutdown(t *testing.T) { m := newTestQueueManager(t, cfg, mcfg, deadline, c, protoMsg) // Send 2x batch size, so we know it will need at least two sends. n := 2 * config.DefaultQueueConfig.MaxSamplesPerSend - recs := generateRecords(recCase{series: n / 1000, samplesPerSeries: 1000}) + recs := generateRecords(recCase{ + noST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + series: n / 1000, samplesPerSeries: 1000, + }) m.StoreSeries(recs.series, 0) m.Start() @@ -501,7 +514,10 @@ func TestReshard(t *testing.T) { size := 10 // Make bigger to find more races. 
nSeries := 6 samplesPerSeries := config.DefaultQueueConfig.Capacity * size - recs := generateRecords(recCase{series: nSeries, samplesPerSeries: samplesPerSeries}) + recs := generateRecords(recCase{ + noST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + series: nSeries, samplesPerSeries: samplesPerSeries, + }) t.Logf("about to send %v samples", len(recs.samples)) cfg := config.DefaultQueueConfig @@ -577,7 +593,10 @@ func TestReshardPartialBatch(t *testing.T) { t.Parallel() for _, protoMsg := range []remoteapi.WriteMessageType{remoteapi.WriteV1MessageType, remoteapi.WriteV2MessageType} { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { - recs := generateRecords(recCase{series: 1, samplesPerSeries: 10}) + recs := generateRecords(recCase{ + noST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + series: 1, samplesPerSeries: 10, + }) c := NewTestBlockedWriteClient() @@ -622,7 +641,10 @@ func TestReshardPartialBatch(t *testing.T) { func TestQueueFilledDeadlock(t *testing.T) { for _, protoMsg := range []remoteapi.WriteMessageType{remoteapi.WriteV1MessageType, remoteapi.WriteV2MessageType} { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { - recs := generateRecords(recCase{series: 50, samplesPerSeries: 1}) + recs := generateRecords(recCase{ + noST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + series: 50, samplesPerSeries: 1, + }) c := NewNopWriteClient() @@ -803,6 +825,8 @@ type recCase struct { labelsFn func(lb *labels.ScratchBuilder, i int) labels.Labels tsFn func(i, j int) int64 + + noST bool } type records struct { @@ -867,8 +891,12 @@ func generateRecords(c recCase) (ret records) { } for j := range c.samplesPerSeries { ts := c.tsFn(i, j) - st := ts - 1 // Keep ST simple for now; we don't need to - // test exact semantics. + st := int64(0) + if !c.noST { + // Keep ST simple for now; we don't test the exact semantics, just + // if RW passes this data. 
+ st = ts - 1 + } ret.samples[i*c.samplesPerSeries+j] = record.RefSample{ Ref: chunks.HeadSeriesRef(i), ST: st, @@ -2014,7 +2042,9 @@ func TestDropOldTimeSeries(t *testing.T) { size := 10 nSeries := 6 nSamples := config.DefaultQueueConfig.Capacity * size + noST := protoMsg == remoteapi.WriteV1MessageType // RW1 does not support ST. pastRecs := generateRecords(recCase{ + noST: noST, series: nSeries, samplesPerSeries: (nSamples / nSeries) / 2, // Half data is past. tsFn: func(_, j int) int64 { @@ -2023,6 +2053,7 @@ func TestDropOldTimeSeries(t *testing.T) { }, }) newRecs := generateRecords(recCase{ + noST: noST, series: nSeries, samplesPerSeries: (nSamples / nSeries) / 2, // Half data is past. tsFn: func(_, j int) int64 { @@ -2097,6 +2128,7 @@ func TestSendSamplesWithBackoffWithSampleAgeLimit(t *testing.T) { r := rand.New(rand.NewSource(99)) recs := generateRecords(recCase{ + noST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. series: numberOfSeries, samplesPerSeries: 1, tsFn: func(_, _ int) int64 { @@ -2121,9 +2153,10 @@ func TestSendSamplesWithBackoffWithSampleAgeLimit(t *testing.T) { if !shouldBeDropped { for _, s := range recs.samples { tsID := getSeriesIDFromRef(recs.series[s.Ref]) - c.expectedSamples[tsID] = append(c.expectedSamples[tsID], prompb.Sample{ - Timestamp: s.T, - Value: s.V, + c.expectedSamples[tsID] = append(c.expectedSamples[tsID], writev2.Sample{ + StartTimestamp: s.ST, + Timestamp: s.T, + Value: s.V, }) } } @@ -2644,7 +2677,10 @@ func TestHighestTimestampOnAppend(t *testing.T) { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { nSamples := 11 * config.DefaultQueueConfig.Capacity nSeries := 3 - recs := generateRecords(recCase{series: nSeries, samplesPerSeries: nSamples / nSeries}) + recs := generateRecords(recCase{ + noST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. 
+ series: nSeries, samplesPerSeries: nSamples / nSeries, + }) _, m := newTestClientAndQueueManager(t, defaultFlushDeadline, protoMsg) m.Start() From dcfa1b96c6f8c306286e1ec486aae1e8cfbdca34 Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Thu, 26 Feb 2026 15:16:09 +0100 Subject: [PATCH 37/73] config: validate TSDB retention settings during config parsing Move retention validation from tsdb/db.go into a TSDBRetentionConfig UnmarshalYAML method so that invalid values are rejected at config load/reload time rather than at apply time. - Reject negative retention size values. - Reject retention percentage values above 100. - Simplify ApplyConfig to assign retention values unconditionally, enabling setting a value back to 0 to disable it. Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> --- config/config.go | 16 ++++++++++++++++ config/config_test.go | 16 ++++++++++++++++ .../testdata/tsdb_retention_percentage.bad.yml | 4 ++++ .../tsdb_retention_percentage_negative.bad.yml | 4 ++++ config/testdata/tsdb_retention_size.bad.yml | 4 ++++ config/testdata/tsdb_retention_time.bad.yml | 4 ++++ tsdb/db.go | 18 ++++++------------ 7 files changed, 54 insertions(+), 12 deletions(-) create mode 100644 config/testdata/tsdb_retention_percentage.bad.yml create mode 100644 config/testdata/tsdb_retention_percentage_negative.bad.yml create mode 100644 config/testdata/tsdb_retention_size.bad.yml create mode 100644 config/testdata/tsdb_retention_time.bad.yml diff --git a/config/config.go b/config/config.go index 0ebebc26d5..9da37c48b0 100644 --- a/config/config.go +++ b/config/config.go @@ -1097,6 +1097,22 @@ type TSDBRetentionConfig struct { Percentage uint `yaml:"percentage,omitempty"` } +// UnmarshalYAML implements the yaml.Unmarshaler interface. 
+func (t *TSDBRetentionConfig) UnmarshalYAML(unmarshal func(any) error) error { + *t = TSDBRetentionConfig{} + type plain TSDBRetentionConfig + if err := unmarshal((*plain)(t)); err != nil { + return err + } + if t.Size < 0 { + return fmt.Errorf("'storage.tsdb.retention.size' must be greater than or equal to 0, got %v", t.Size) + } + if t.Percentage > 100 { + return fmt.Errorf("'storage.tsdb.retention.percentage' must be in the range [0, 100], got %v", t.Percentage) + } + return nil +} + // TSDBConfig configures runtime reloadable configuration options. type TSDBConfig struct { // OutOfOrderTimeWindow sets how long back in time an out-of-order sample can be inserted diff --git a/config/config_test.go b/config/config_test.go index 43c56a501f..91e29259ef 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -2626,6 +2626,22 @@ var expectedErrors = []struct { filename: "stackit_endpoint.bad.yml", errMsg: "invalid endpoint", }, + { + filename: "tsdb_retention_time.bad.yml", + errMsg: `not a valid duration string: "-1h"`, + }, + { + filename: "tsdb_retention_size.bad.yml", + errMsg: `'storage.tsdb.retention.size' must be greater than or equal to 0`, + }, + { + filename: "tsdb_retention_percentage.bad.yml", + errMsg: `'storage.tsdb.retention.percentage' must be in the range [0, 100]`, + }, + { + filename: "tsdb_retention_percentage_negative.bad.yml", + errMsg: "cannot unmarshal !!int `-1` into uint", + }, } func TestBadConfigs(t *testing.T) { diff --git a/config/testdata/tsdb_retention_percentage.bad.yml b/config/testdata/tsdb_retention_percentage.bad.yml new file mode 100644 index 0000000000..cb57abe0c0 --- /dev/null +++ b/config/testdata/tsdb_retention_percentage.bad.yml @@ -0,0 +1,4 @@ +storage: + tsdb: + retention: + percentage: 101 diff --git a/config/testdata/tsdb_retention_percentage_negative.bad.yml b/config/testdata/tsdb_retention_percentage_negative.bad.yml new file mode 100644 index 0000000000..2eeb60c091 --- /dev/null +++ 
b/config/testdata/tsdb_retention_percentage_negative.bad.yml @@ -0,0 +1,4 @@ +storage: + tsdb: + retention: + percentage: -1 diff --git a/config/testdata/tsdb_retention_size.bad.yml b/config/testdata/tsdb_retention_size.bad.yml new file mode 100644 index 0000000000..ecae64aae6 --- /dev/null +++ b/config/testdata/tsdb_retention_size.bad.yml @@ -0,0 +1,4 @@ +storage: + tsdb: + retention: + size: -1GB diff --git a/config/testdata/tsdb_retention_time.bad.yml b/config/testdata/tsdb_retention_time.bad.yml new file mode 100644 index 0000000000..465b3cf5da --- /dev/null +++ b/config/testdata/tsdb_retention_time.bad.yml @@ -0,0 +1,4 @@ +storage: + tsdb: + retention: + time: -1h diff --git a/tsdb/db.go b/tsdb/db.go index a5abc8fed9..ff1d6876d6 100644 --- a/tsdb/db.go +++ b/tsdb/db.go @@ -1277,18 +1277,12 @@ func (db *DB) ApplyConfig(conf *config.Config) error { // Update retention configuration if provided. if conf.StorageConfig.TSDBConfig.Retention != nil { db.retentionMtx.Lock() - if conf.StorageConfig.TSDBConfig.Retention.Time > 0 { - db.opts.RetentionDuration = int64(conf.StorageConfig.TSDBConfig.Retention.Time) - db.metrics.retentionDuration.Set((time.Duration(db.opts.RetentionDuration) * time.Millisecond).Seconds()) - } - if conf.StorageConfig.TSDBConfig.Retention.Size > 0 { - db.opts.MaxBytes = int64(conf.StorageConfig.TSDBConfig.Retention.Size) - db.metrics.maxBytes.Set(float64(db.opts.MaxBytes)) - } - if conf.StorageConfig.TSDBConfig.Retention.Percentage > 0 { - db.opts.MaxPercentage = conf.StorageConfig.TSDBConfig.Retention.Percentage - db.metrics.maxPercentage.Set(float64(db.opts.MaxPercentage)) - } + db.opts.RetentionDuration = int64(conf.StorageConfig.TSDBConfig.Retention.Time) + db.metrics.retentionDuration.Set((time.Duration(db.opts.RetentionDuration) * time.Millisecond).Seconds()) + db.opts.MaxBytes = int64(conf.StorageConfig.TSDBConfig.Retention.Size) + db.metrics.maxBytes.Set(float64(db.opts.MaxBytes)) + db.opts.MaxPercentage = 
conf.StorageConfig.TSDBConfig.Retention.Percentage + db.metrics.maxPercentage.Set(float64(db.opts.MaxPercentage)) db.retentionMtx.Unlock() } } else { From bf3c217bbdaec1163588384270d872e024f41738 Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Thu, 26 Feb 2026 16:36:05 +0100 Subject: [PATCH 38/73] config: apply retention CLI flags as defaults and update UI on reload Introduce DefaultTSDBRetentionConfig, populated from CLI flags before any config file is loaded, so that retention falls back to CLI flags when the config file has no storage.tsdb section. Config.UnmarshalYAML always injects a non-nil TSDBConfig with those defaults, removing the need for nil checks in main.go. ApplyConfig in web.go now propagates retention settings on each config reload so the runtime info endpoint stays up to date. Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> --- cmd/prometheus/main.go | 37 ++++++++++++++++------------------- config/config.go | 15 ++++++++++++++ config/config_default_test.go | 7 ++++--- config/config_test.go | 4 ++++ config/config_windows_test.go | 5 +++-- web/web.go | 22 +++++++++++++++------ 6 files changed, 59 insertions(+), 31 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index debff1f6af..dfafd0902a 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -666,6 +666,18 @@ func main() { os.Exit(2) } + // Set TSDB retention defaults from CLI flags before any config file is loaded. + // This makes CLI flags act as the default when no retention section is present. + cliRetentionDuration := cfg.tsdb.RetentionDuration + cliMaxBytes := cfg.tsdb.MaxBytes + if cliRetentionDuration == 0 && cliMaxBytes == 0 { + cliRetentionDuration = defaultRetentionDuration + } + config.DefaultTSDBRetentionConfig = config.TSDBRetentionConfig{ + Time: cliRetentionDuration, + Size: cliMaxBytes, + } + // Throw error for invalid config before starting other components. 
var cfgFile *config.Config if cfgFile, err = config.LoadFile(cfg.configFile, agentMode, promslog.NewNopLogger()); err != nil { @@ -707,21 +719,11 @@ func main() { logger.Warn("The option --storage.tsdb.block-reload-interval is set to a value less than 1s. Setting it to 1s to avoid overload.") cfg.tsdb.BlockReloadInterval = model.Duration(1 * time.Second) } - if cfgFile.StorageConfig.TSDBConfig != nil { - cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow - cfg.tsdb.StaleSeriesCompactionThreshold = cfgFile.StorageConfig.TSDBConfig.StaleSeriesCompactionThreshold - if cfgFile.StorageConfig.TSDBConfig.Retention != nil { - if cfgFile.StorageConfig.TSDBConfig.Retention.Time > 0 { - cfg.tsdb.RetentionDuration = cfgFile.StorageConfig.TSDBConfig.Retention.Time - } - if cfgFile.StorageConfig.TSDBConfig.Retention.Size > 0 { - cfg.tsdb.MaxBytes = cfgFile.StorageConfig.TSDBConfig.Retention.Size - } - if cfgFile.StorageConfig.TSDBConfig.Retention.Percentage > 0 { - cfg.tsdb.MaxPercentage = cfgFile.StorageConfig.TSDBConfig.Retention.Percentage - } - } - } + cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow + cfg.tsdb.StaleSeriesCompactionThreshold = cfgFile.StorageConfig.TSDBConfig.StaleSeriesCompactionThreshold + cfg.tsdb.RetentionDuration = cfgFile.StorageConfig.TSDBConfig.Retention.Time + cfg.tsdb.MaxBytes = cfgFile.StorageConfig.TSDBConfig.Retention.Size + cfg.tsdb.MaxPercentage = cfgFile.StorageConfig.TSDBConfig.Retention.Percentage // Set Go runtime parameters before we get too far into initialization. 
updateGoGC(cfgFile, logger) @@ -773,11 +775,6 @@ func main() { cfg.web.RoutePrefix = "/" + strings.Trim(cfg.web.RoutePrefix, "/") if !agentMode { - if cfg.tsdb.RetentionDuration == 0 && cfg.tsdb.MaxBytes == 0 && cfg.tsdb.MaxPercentage == 0 { - cfg.tsdb.RetentionDuration = defaultRetentionDuration - logger.Info("No time, size or percentage retention was set so using the default time retention", "duration", defaultRetentionDuration) - } - // Check for overflows. This limits our max retention to 100y. if cfg.tsdb.RetentionDuration < 0 { y, err := model.ParseDuration("100y") diff --git a/config/config.go b/config/config.go index 9da37c48b0..2ccca31b4f 100644 --- a/config/config.go +++ b/config/config.go @@ -278,6 +278,9 @@ var ( } ) +// DefaultTSDBRetentionConfig is the default TSDB retention configuration. +var DefaultTSDBRetentionConfig TSDBRetentionConfig + // Config is the top-level configuration for Prometheus's config files. type Config struct { GlobalConfig GlobalConfig `yaml:"global"` @@ -405,6 +408,13 @@ func (c *Config) UnmarshalYAML(unmarshal func(any) error) error { c.Runtime = DefaultRuntimeConfig } + // If no storage.tsdb section is present, TSDBConfig is nil and its + // UnmarshalYAML never runs. Inject the default retention here. 
+ if c.StorageConfig.TSDBConfig == nil { + retention := DefaultTSDBRetentionConfig + c.StorageConfig.TSDBConfig = &TSDBConfig{Retention: &retention} + } + for _, rf := range c.RuleFiles { if !patRulePath.MatchString(rf) { return fmt.Errorf("invalid rule file path %q", rf) @@ -1143,6 +1153,11 @@ func (t *TSDBConfig) UnmarshalYAML(unmarshal func(any) error) error { t.OutOfOrderTimeWindow = time.Duration(t.OutOfOrderTimeWindowFlag).Milliseconds() + if t.Retention == nil { + retention := DefaultTSDBRetentionConfig + t.Retention = &retention + } + return nil } diff --git a/config/config_default_test.go b/config/config_default_test.go index 91c290ae4e..ec7a112824 100644 --- a/config/config_default_test.go +++ b/config/config_default_test.go @@ -20,9 +20,10 @@ const ruleFilesConfigFile = "testdata/rules_abs_path.good.yml" var ruleFilesExpectedConf = &Config{ loaded: true, - GlobalConfig: DefaultGlobalConfig, - Runtime: DefaultRuntimeConfig, - OTLPConfig: DefaultOTLPConfig, + GlobalConfig: DefaultGlobalConfig, + Runtime: DefaultRuntimeConfig, + OTLPConfig: DefaultOTLPConfig, + StorageConfig: StorageConfig{TSDBConfig: &TSDBConfig{Retention: &TSDBRetentionConfig{}}}, RuleFiles: []string{ "testdata/first.rules", "testdata/rules/second.rules", diff --git a/config/config_test.go b/config/config_test.go index 91e29259ef..dbc221329d 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -2716,6 +2716,10 @@ func TestGlobalConfig(t *testing.T) { require.NoError(t, err) exp := DefaultConfig exp.loaded = true + // TSDBConfig is always injected by Config.UnmarshalYAML even when no + // storage.tsdb section is present, so the expected config must include it. 
+ retention := DefaultTSDBRetentionConfig + exp.StorageConfig.TSDBConfig = &TSDBConfig{Retention: &retention} require.Equal(t, exp, *c) }) diff --git a/config/config_windows_test.go b/config/config_windows_test.go index 72a56ff41a..e7627f562a 100644 --- a/config/config_windows_test.go +++ b/config/config_windows_test.go @@ -18,8 +18,9 @@ const ruleFilesConfigFile = "testdata/rules_abs_path_windows.good.yml" var ruleFilesExpectedConf = &Config{ loaded: true, - GlobalConfig: DefaultGlobalConfig, - Runtime: DefaultRuntimeConfig, + GlobalConfig: DefaultGlobalConfig, + Runtime: DefaultRuntimeConfig, + StorageConfig: StorageConfig{TSDBConfig: &TSDBConfig{Retention: &TSDBRetentionConfig{}}}, RuleFiles: []string{ "testdata\\first.rules", "testdata\\rules\\second.rules", diff --git a/web/web.go b/web/web.go index 90eaf13afe..c4fcfdb2c4 100644 --- a/web/web.go +++ b/web/web.go @@ -253,6 +253,11 @@ func (h *Handler) ApplyConfig(conf *config.Config) error { defer h.mtx.Unlock() h.config = conf + if conf.StorageConfig.TSDBConfig != nil && conf.StorageConfig.TSDBConfig.Retention != nil { + h.options.TSDBRetentionDuration = conf.StorageConfig.TSDBConfig.Retention.Time + h.options.TSDBMaxBytes = conf.StorageConfig.TSDBConfig.Retention.Size + h.options.TSDBMaxPercentage = conf.StorageConfig.TSDBConfig.Retention.Percentage + } return nil } @@ -866,20 +871,25 @@ func (h *Handler) runtimeInfo() (api_v1.RuntimeInfo, error) { status.Hostname = hostname status.ServerTime = time.Now().UTC() - if h.options.TSDBRetentionDuration != 0 { - status.StorageRetention = h.options.TSDBRetentionDuration.String() + h.mtx.RLock() + tsdbRetentionDuration := h.options.TSDBRetentionDuration + tsdbMaxBytes := h.options.TSDBMaxBytes + tsdbMaxPercentage := h.options.TSDBMaxPercentage + h.mtx.RUnlock() + if tsdbRetentionDuration != 0 { + status.StorageRetention = tsdbRetentionDuration.String() } - if h.options.TSDBMaxBytes != 0 { + if tsdbMaxBytes != 0 { if status.StorageRetention != "" { 
status.StorageRetention += " or " } - status.StorageRetention += h.options.TSDBMaxBytes.String() + status.StorageRetention += tsdbMaxBytes.String() } - if h.options.TSDBMaxPercentage != 0 { + if tsdbMaxPercentage != 0 { if status.StorageRetention != "" { status.StorageRetention += " or " } - status.StorageRetention = status.StorageRetention + strconv.FormatUint(uint64(h.options.TSDBMaxPercentage), 10) + "%" + status.StorageRetention = status.StorageRetention + strconv.FormatUint(uint64(tsdbMaxPercentage), 10) + "%" } metrics, err := h.gatherer.Gather() From 3675a5e56c86ef09f8bb35d93a05012c381b1445 Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Thu, 26 Feb 2026 16:41:34 +0100 Subject: [PATCH 39/73] tsdb: fix unit mismatch in retention duration on config reload conf.StorageConfig.TSDBConfig.Retention.Time is model.Duration which is type-aliased to time.Duration (nanoseconds), but RetentionDuration is int64 in milliseconds. The missing division by time.Millisecond caused the metric prometheus_tsdb_retention_limit_seconds to be reported 1e6 times too large after a config reload. Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> --- tsdb/db.go | 2 +- tsdb/db_test.go | 27 ++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/tsdb/db.go b/tsdb/db.go index ff1d6876d6..0e92c2b70e 100644 --- a/tsdb/db.go +++ b/tsdb/db.go @@ -1277,7 +1277,7 @@ func (db *DB) ApplyConfig(conf *config.Config) error { // Update retention configuration if provided. 
if conf.StorageConfig.TSDBConfig.Retention != nil { db.retentionMtx.Lock() - db.opts.RetentionDuration = int64(conf.StorageConfig.TSDBConfig.Retention.Time) + db.opts.RetentionDuration = int64(time.Duration(conf.StorageConfig.TSDBConfig.Retention.Time) / time.Millisecond) db.metrics.retentionDuration.Set((time.Duration(db.opts.RetentionDuration) * time.Millisecond).Seconds()) db.opts.MaxBytes = int64(conf.StorageConfig.TSDBConfig.Retention.Size) db.metrics.maxBytes.Set(float64(db.opts.MaxBytes)) diff --git a/tsdb/db_test.go b/tsdb/db_test.go index bd868d945a..be914cd87d 100644 --- a/tsdb/db_test.go +++ b/tsdb/db_test.go @@ -1743,7 +1743,7 @@ func TestRuntimeRetentionConfigChange(t *testing.T) { StorageConfig: config.StorageConfig{ TSDBConfig: &config.TSDBConfig{ Retention: &config.TSDBRetentionConfig{ - Time: model.Duration(shorterRetentionDuration), + Time: model.Duration(time.Duration(shorterRetentionDuration) * time.Millisecond), }, }, }, @@ -1772,6 +1772,31 @@ func TestRuntimeRetentionConfigChange(t *testing.T) { require.Positive(t, int(prom_testutil.ToFloat64(db.metrics.timeRetentionCount)), "time retention count should be incremented") } +// TestApplyConfigRetentionDurationMetricUnit verifies that after a config +// reload the prometheus_tsdb_retention_limit_seconds metric reports the +// retention in seconds. 
+func TestApplyConfigRetentionDurationMetricUnit(t *testing.T) { + oneHourMs := int64(time.Hour / time.Millisecond) + db := newTestDB(t, withOpts(&Options{RetentionDuration: oneHourMs})) + + cfg := &config.Config{ + StorageConfig: config.StorageConfig{ + TSDBConfig: &config.TSDBConfig{ + Retention: &config.TSDBRetentionConfig{ + Time: model.Duration(time.Hour), + }, + }, + }, + } + require.NoError(t, db.ApplyConfig(cfg)) + + require.Equal(t, oneHourMs, db.getRetentionDuration()) + + gotSeconds := prom_testutil.ToFloat64(db.metrics.retentionDuration) + wantSeconds := time.Hour.Seconds() + require.Equal(t, wantSeconds, gotSeconds) +} + func TestNotMatcherSelectsLabelsUnsetSeries(t *testing.T) { db := newTestDB(t) From 8edc676cbe73b348386ce5db0272dff72ccead76 Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Thu, 26 Feb 2026 16:59:21 +0100 Subject: [PATCH 40/73] config: inject TSDBConfig defaults in Load for empty config bodies When the config body is empty, UnmarshalYAML is never called, so the TSDBConfig nil injection added there never ran. Replicate the same guard in Load, which is the entry point that already handles this case for other defaults via DefaultConfig. Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> --- config/config.go | 7 +++++++ config/config_test.go | 2 ++ 2 files changed, 9 insertions(+) diff --git a/config/config.go b/config/config.go index 2ccca31b4f..cb45347e41 100644 --- a/config/config.go +++ b/config/config.go @@ -83,6 +83,13 @@ func Load(s string, logger *slog.Logger) (*Config, error) { return nil, err } + // When the config body is empty, UnmarshalYAML is never called, so + // TSDBConfig may still be nil. 
+ if cfg.StorageConfig.TSDBConfig == nil { + retention := DefaultTSDBRetentionConfig + cfg.StorageConfig.TSDBConfig = &TSDBConfig{Retention: &retention} + } + b := labels.NewScratchBuilder(0) cfg.GlobalConfig.ExternalLabels.Range(func(v labels.Label) { newV := os.Expand(v.Value, func(s string) string { diff --git a/config/config_test.go b/config/config_test.go index dbc221329d..a845fac719 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -2665,6 +2665,8 @@ func TestEmptyConfig(t *testing.T) { require.NoError(t, err) exp := DefaultConfig exp.loaded = true + retention := DefaultTSDBRetentionConfig + exp.StorageConfig.TSDBConfig = &TSDBConfig{Retention: &retention} require.Equal(t, exp, *c) require.Equal(t, 75, c.Runtime.GoGC) } From 5e5b14c04b52d3d6f293d95897177a50dbb872b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Fri, 6 Mar 2026 14:03:09 +0100 Subject: [PATCH 41/73] feat(chunkenc): replace xoroptst chunk encoding with xor2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XOR2 is based on https://github.com/prometheus/prometheus/pull/18238 With additional ST support. 
Signed-off-by: György Krajcsovits --- tsdb/chunkenc/benchmark_test.go | 386 +++++---------- tsdb/chunkenc/bstream.go | 89 ++++ tsdb/chunkenc/chunk.go | 28 +- tsdb/chunkenc/chunk_test.go | 8 +- tsdb/chunkenc/xor2.go | 822 ++++++++++++++++++++++++++++++++ tsdb/chunkenc/xor2_test.go | 279 +++++++++++ tsdb/chunkenc/xoroptst.go | 729 ---------------------------- tsdb/chunkenc/xoroptst_test.go | 108 ----- tsdb/db_append_v2_test.go | 2 +- tsdb/docs/format/chunks.md | 81 +++- tsdb/head_test.go | 2 +- tsdb/ooo_head.go | 2 +- tsdb/ooo_head_test.go | 2 +- 13 files changed, 1398 insertions(+), 1140 deletions(-) create mode 100644 tsdb/chunkenc/xor2.go create mode 100644 tsdb/chunkenc/xor2_test.go delete mode 100644 tsdb/chunkenc/xoroptst.go delete mode 100644 tsdb/chunkenc/xoroptst_test.go diff --git a/tsdb/chunkenc/benchmark_test.go b/tsdb/chunkenc/benchmark_test.go index 702e3a95e2..3f77b14ca3 100644 --- a/tsdb/chunkenc/benchmark_test.go +++ b/tsdb/chunkenc/benchmark_test.go @@ -62,284 +62,150 @@ func foreachFmtSampleCase(b *testing.B, fn func(b *testing.B, f fmtCase, s sampl rFloats[i] = float64(r.Intn(100)) } - sampleCases := []sampleCase{ - { - name: "vt=constant/st=0", - samples: func() (ret []triple) { - t, v := initT, initV - for range nSamples { - t += 15000 - ret = append(ret, triple{st: 0, t: t, v: v}) - } - return ret - }(), - }, + // tPatterns control how the regular timestamp advances. + type tPattern struct { + name string + next func(t int64, i int) int64 + } + // vPatterns control how the value advances. + type vPattern struct { + name string + next func(v float64, i int) float64 + } + // stPatterns compute the start timestamp from the previous t (before the + // step), the new t (after the step), and the sample index. + type stPattern struct { + name string + compute func(prevT, newT int64, i int) int64 + } + tPatterns := []tPattern{ { - // Cumulative with a constant ST through the whole chunk, typical case (e.g. long counting counter). 
- name: "vt=constant/st=cumulative", - samples: func() (ret []triple) { - t, v := initT, initV - for range nSamples { - t += 15000 - ret = append(ret, triple{st: initST, t: t, v: v}) - } - return ret - }(), + name: "t=constant", + next: func(t int64, _ int) int64 { return t + 15000 }, }, { - // Delta simulates delta type or worst case for cumulatives, where ST - // is changing on every sample. - name: "vt=constant/st=delta-exclusive", - samples: func() (ret []triple) { - t, v := initT, initV - for range nSamples { - st := t + 1 // ST is a tight interval after the last t+1ms. - t += 15000 - ret = append(ret, triple{st: st, t: t, v: v}) - } - return ret - }(), + // 15 seconds ± up to 100ms of jitter. + name: "t=jitter", + next: func(t int64, i int) int64 { return t + rInts[i] - 50 + 15000 }, }, { - // Delta simulates delta type or worst case for cumulatives, where ST - // is changing on every sample. - name: "vt=constant/st=delta-inclusive", - samples: func() (ret []triple) { - t, v := initT, initV - for range nSamples { - st := t // ST is the same as the previous t. - t += 15000 - ret = append(ret, triple{st: st, t: t, v: v}) + // First 10 samples at constant 60s, then one 10-interval gap (600s), + // then 60s ± 30ms jitter. The gap triggers XOR18111 full mode via + // multiplier encoding (dod=540000 = 9×60000). Subsequent small-jitter + // delta-of-deltas (≤30ms) use XOR18111's 7-bit full-mode code (9 bits + // total) vs XOR compact's minimum 14-bit code (16 bits total). + name: "t=gap-jitter", + next: func(t int64, i int) int64 { + if i < 10 { + return t + 60000 } - return ret - }(), + if i == 10 { + return t + 10*60000 // 10-interval gap; triggers XOR18111 full mode. + } + return t + 60000 + rInts[i]%61 - 30 // 60s ± 30ms jitter. + }, + }, + } + vPatterns := []vPattern{ + { + name: "v=constant", + next: func(v float64, _ int) float64 { return v }, + }, + // We are not interested in float compression we're not changing it. 
+ // { + // // Varying from -50 to +50 in 100 discrete steps. + // name: "v=rand-steps", + // next: func(v float64, i int) float64 { return v + rFloats[i] - 50 }, + // }, + // { + // // Random increment between 0 and 1.0. + // name: "v=rand0-1", + // next: func(v float64, i int) float64 { return v + rFloats[i]/100.0 }, + // }, + // { + // // Random decrement between 0 and -1.0. Tests negative varint encoding; + // // see https://victoriametrics.com/blog/go-protobuf/. + // name: "v=nrand0-1", + // next: func(v float64, i int) float64 { return v - rFloats[i]/100.0 }, + // }, + } + stPatterns := []stPattern{ + { + name: "st=0", + compute: func(_, _ int64, _ int) int64 { return 0 }, }, { - name: "vt=constant/st=t", - samples: func() (ret []triple) { - t, v := initT, initV - for range nSamples { - t += 15000 - ret = append(ret, triple{st: t, t: t, v: v}) - } - return ret - }(), + // Constant ST throughout the chunk, typical for long-running counters. + name: "st=cumulative", + compute: func(_, _ int64, _ int) int64 { return initST }, }, { - // Delta simulates delta type or worst case for cumulatives, where ST - // is changing on every sample. - name: "vt=constant/st=delta-jitter", - samples: func() (ret []triple) { + // ST is just after the previous sample's t: tight delta interval. + name: "st=delta-excl", + compute: func(prevT, _ int64, _ int) int64 { return prevT + 1 }, + }, + { + // ST equals the previous sample's t: inclusive delta interval. + name: "st=delta-incl", + compute: func(prevT, _ int64, _ int) int64 { return prevT }, + }, + { + // ST equals the current sample's t. + name: "st=t", + compute: func(_, newT int64, _ int) int64 { return newT }, + }, + { + // ST is equal to the previous t plus up to 100ms of jitter. + name: "st=delta-jitter", + compute: func(prevT, _ int64, i int) int64 { return prevT + rInts[nSamples+i] }, + }, + { + // Cumulative ST with periodic resets 10s before the current t. 
+ name: "st=cum-resets", + compute: func(_, newT int64, i int) int64 { + if i%6 == 5 { + return newT - 10000 + } + return initST + }, + }, + { + // Cumulative ST with periodic zero resets. + name: "st=cum-zeros", + compute: func(_, _ int64, i int) int64 { + if i%6 == 5 { + return 0 + } + return initST + }, + }, + } + + var sampleCases []sampleCase + for _, tp := range tPatterns { + for _, vp := range vPatterns { + for _, sp := range stPatterns { + samples := make([]triple, 0, nSamples) t, v := initT, initV for i := range nSamples { - st := t + rInts[nSamples+i] // ST is the same as the previous t + jitter of up to 100ms. - t += 15000 - ret = append(ret, triple{st: st, t: t, v: v}) + prevT := t + t = tp.next(t, i) + v = vp.next(v, i) + st := sp.compute(prevT, t, i) + samples = append(samples, triple{st: st, t: t, v: v}) } - return ret - }(), - }, - { - name: "vt=random steps/st=0", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] - 50 // Varying from -50 to +50 in 100 discrete steps. - ret = append(ret, triple{st: 0, t: t, v: v}) - } - return ret - }(), - }, - { - name: "vt=random steps/st=cumulative", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] - 50 // Varying from -50 to +50 in 100 discrete steps. - ret = append(ret, triple{st: initST, t: t, v: v}) - } - return ret - }(), - }, - { - name: "vt=random steps/st=delta-exclusive", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - st := t + 1 // ST is a tight interval after the last t+1ms. - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] - 50 // Varying from -50 to +50 in 100 discrete steps. 
- ret = append(ret, triple{st: st, t: t, v: v}) - } - return ret - }(), - }, - { - name: "vt=random steps/st=delta-inclusive", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - st := t // ST is equal to the previous t. - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] - 50 // Varying from -50 to +50 in 100 discrete steps. - ret = append(ret, triple{st: st, t: t, v: v}) - } - return ret - }(), - }, - { - name: "vt=random steps/st=t", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] - 50 // Varying from -50 to +50 in 100 discrete steps. - ret = append(ret, triple{st: t, t: t, v: v}) - } - return ret - }(), - }, - { - name: "vt=random steps/st=delta-jittery", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - st := t + rInts[nSamples+i] // ST is equal to the previous t + jitter of up to 100ms. - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] - 50 // Varying from -50 to +50 in 100 discrete steps. - ret = append(ret, triple{st: st, t: t, v: v}) - } - return ret - }(), - }, - { - name: "vt=random 0-1/st=0", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] / 100.0 // Random between 0 and 1.0. - ret = append(ret, triple{st: 0, t: t, v: v}) - } - return ret - }(), - }, - { - // Are we impacted by https://victoriametrics.com/blog/go-protobuf/ negative varint issue? (zig-zag needed?) - name: "vt=negrandom 0-1/st=0", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v -= rFloats[i] / 100.0 // Random between 0 and 1.0. 
- ret = append(ret, triple{st: 0, t: t, v: v}) - } - return ret - }(), - }, - { - name: "vt=random 0-1/st=cumulative", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] / 100.0 // Random between 0 and 1.0. - ret = append(ret, triple{st: initST, t: t, v: v}) - } - return ret - }(), - }, - { - name: "vt=random 0-1/st=cumulative-periodic-resets", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] / 100.0 // Random between 0 and 1.0. - st := initST - if i%6 == 5 { - st = t - 10000 // Reset of 10s before current t. - } - ret = append(ret, triple{st: st, t: t, v: v}) - } - return ret - }(), - }, - { - name: "vt=random 0-1/st=cumulative-periodic-zeros", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] / 100.0 // Random between 0 and 1.0. - st := initST - if i%6 == 5 { - st = 0 - } - ret = append(ret, triple{st: st, t: t, v: v}) - } - return ret - }(), - }, - { - name: "vt=random 0-1/st=delta-exclusive", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - st := t + 1 // ST is a tight interval after the last t+1ms. - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] / 100.0 // Random between 0 and 1.0. - ret = append(ret, triple{st: st, t: t, v: v}) - } - return ret - }(), - }, - { - name: "vt=random 0-1/st=delta-inclusive", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - st := t // ST is the same as the previous t. - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] / 100.0 // Random between 0 and 1.0. 
- ret = append(ret, triple{st: st, t: t, v: v}) - } - return ret - }(), - }, - { - name: "vt=random 0-1/st=t", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] / 100.0 // Random between 0 and 1.0. - ret = append(ret, triple{st: t, t: t, v: v}) - } - return ret - }(), - }, - { - name: "vt=random 0-1/st=delta-jittery", - samples: func() (ret []triple) { - t, v := initT, initV - for i := range nSamples { - st := t + rInts[nSamples+i] // ST is equal to the previous t + jitter of up to 100ms. - t += rInts[i] - 50 + 15000 // 15 seconds +- up to 100ms of jitter. - v += rFloats[i] / 100.0 // Random between 0 and 1.0. - ret = append(ret, triple{st: st, t: t, v: v}) - } - return ret - }(), - }, + sampleCases = append(sampleCases, sampleCase{ + name: tp.name + "/" + vp.name + "/" + sp.name, + samples: samples, + }) + } + } } for _, f := range []fmtCase{ {name: "XOR", newChunkFn: func() Chunk { return NewXORChunk() }, stUnsupported: true}, - {name: "XOR_OPT_ST", newChunkFn: func() Chunk { return NewXOROptSTChunk() }}, + {name: "XOR2", newChunkFn: func() Chunk { return NewXOR2Chunk() }}, } { for _, s := range sampleCases { b.Run(fmt.Sprintf("fmt=%s/%s", f.name, s.name), func(b *testing.B) { diff --git a/tsdb/chunkenc/bstream.go b/tsdb/chunkenc/bstream.go index abf6e4dbef..ecface3099 100644 --- a/tsdb/chunkenc/bstream.go +++ b/tsdb/chunkenc/bstream.go @@ -215,6 +215,95 @@ func (b *bstreamReader) ReadByte() (byte, error) { return byte(v), nil } +// readXOR2Control reads the XOR2 variable-length joint control prefix +// and returns 0-5 mapping to the six encoding cases: +// +// 0 → '0' dod=0, val=0 (1 bit consumed) +// 1 → '10' dod=0, val≠0 (2 bits consumed) +// 2 → '110' dod≠0, 13-bit signed dod (3 bits consumed) +// 3 → '1110' dod≠0, 20-bit signed dod (4 bits consumed) +// 4 → '11110' dod≠0, 64-bit escape (5 bits consumed) +// 5 → '11111' dod=0, stale NaN 
(5 bits consumed) +// +// The fast path peeks at 4 bits from the internal buffer; for the '1111' +// prefix a fifth bit is read to distinguish cases 4 and 5. +func (b *bstreamReader) readXOR2Control() (uint8, error) { + if b.valid >= 4 { + top4 := uint8((b.buffer >> (b.valid - 4)) & 0xf) + if top4 < 8 { // '0xxx' → case 0. + b.valid-- + return 0, nil + } + if top4 < 12 { // '10xx' → case 1. + b.valid -= 2 + return 1, nil + } + if top4 < 14 { // '110x' → case 2. + b.valid -= 3 + return 2, nil + } + if top4 == 14 { // '1110' → case 3. + b.valid -= 4 + return 3, nil + } + // '1111': need fifth bit to distinguish cases 4 and 5. + if b.valid >= 5 { + bit4 := uint8((b.buffer >> (b.valid - 5)) & 1) + b.valid -= 5 + return 4 + bit4, nil + } + // Fifth bit spans a buffer boundary; consume the four known bits + // and read the fifth from the stream. + b.valid -= 4 + bit4, err := b.readBit() + if err != nil { + return 0, err + } + if bit4 == zero { + return 4, nil + } + return 5, nil + } + + // Slow path: bits may span buffer boundaries, read one at a time. + bit0, err := b.readBit() + if err != nil { + return 0, err + } + if bit0 == zero { + return 0, nil + } + bit1, err := b.readBit() + if err != nil { + return 0, err + } + if bit1 == zero { + return 1, nil + } + bit2, err := b.readBit() + if err != nil { + return 0, err + } + if bit2 == zero { + return 2, nil + } + bit3, err := b.readBit() + if err != nil { + return 0, err + } + if bit3 == zero { + return 3, nil + } + bit4, err := b.readBit() + if err != nil { + return 0, err + } + if bit4 == zero { + return 4, nil + } + return 5, nil +} + // loadNextBuffer loads the next bytes from the stream into the internal buffer. // The input nbits is the minimum number of bits that must be read, but the implementation // can read more (if possible) to improve performances. 
diff --git a/tsdb/chunkenc/chunk.go b/tsdb/chunkenc/chunk.go index de5fa0c2de..b3b33df34b 100644 --- a/tsdb/chunkenc/chunk.go +++ b/tsdb/chunkenc/chunk.go @@ -30,7 +30,7 @@ const ( EncXOR EncHistogram EncFloatHistogram - EncXOROptST + EncXOR2 ) func (e Encoding) String() string { @@ -43,15 +43,15 @@ func (e Encoding) String() string { return "histogram" case EncFloatHistogram: return "floathistogram" - case EncXOROptST: - return "XOR-start-timestamp" + case EncXOR2: + return "XOR2" } return "" } // IsValidEncoding returns true for supported encodings. func IsValidEncoding(e Encoding) bool { - return e == EncXOR || e == EncHistogram || e == EncFloatHistogram || e == EncXOROptST + return e == EncXOR || e == EncHistogram || e == EncFloatHistogram || e == EncXOR2 } const ( @@ -195,7 +195,7 @@ func (v ValueType) ChunkEncoding(storeST bool) Encoding { switch v { case ValFloat: if storeST { - return EncXOROptST + return EncXOR2 } return EncXOR case ValHistogram: @@ -321,7 +321,7 @@ func NewPool() Pool { }, xoroptst: sync.Pool{ New: func() any { - return &XorOptSTChunk{b: bstream{}} + return &XOR2Chunk{b: bstream{}} }, }, } @@ -336,8 +336,8 @@ func (p *pool) Get(e Encoding, b []byte) (Chunk, error) { c = p.histogram.Get().(*HistogramChunk) case EncFloatHistogram: c = p.floatHistogram.Get().(*FloatHistogramChunk) - case EncXOROptST: - c = p.xoroptst.Get().(*XorOptSTChunk) + case EncXOR2: + c = p.xoroptst.Get().(*XOR2Chunk) default: return nil, fmt.Errorf("invalid chunk encoding %q", e) } @@ -359,8 +359,8 @@ func (p *pool) Put(c Chunk) error { case EncFloatHistogram: _, ok = c.(*FloatHistogramChunk) sp = &p.floatHistogram - case EncXOROptST: - _, ok = c.(*XorOptSTChunk) + case EncXOR2: + _, ok = c.(*XOR2Chunk) sp = &p.xoroptst default: return fmt.Errorf("invalid chunk encoding %q", c.Encoding()) @@ -388,8 +388,8 @@ func FromData(e Encoding, d []byte) (Chunk, error) { return &HistogramChunk{b: bstream{count: 0, stream: d}}, nil case EncFloatHistogram: return 
&FloatHistogramChunk{b: bstream{count: 0, stream: d}}, nil - case EncXOROptST: - return &XorOptSTChunk{b: bstream{count: 0, stream: d}}, nil + case EncXOR2: + return &XOR2Chunk{b: bstream{count: 0, stream: d}}, nil } return nil, fmt.Errorf("invalid chunk encoding %q", e) } @@ -403,8 +403,8 @@ func NewEmptyChunk(e Encoding) (Chunk, error) { return NewHistogramChunk(), nil case EncFloatHistogram: return NewFloatHistogramChunk(), nil - case EncXOROptST: - return NewXOROptSTChunk(), nil + case EncXOR2: + return NewXOR2Chunk(), nil } return nil, fmt.Errorf("invalid chunk encoding %q", e) } diff --git a/tsdb/chunkenc/chunk_test.go b/tsdb/chunkenc/chunk_test.go index 1717300288..4e19f15b42 100644 --- a/tsdb/chunkenc/chunk_test.go +++ b/tsdb/chunkenc/chunk_test.go @@ -34,7 +34,7 @@ func TestChunk(t *testing.T) { factory func() Chunk }{ {encoding: EncXOR, supportsST: false, factory: func() Chunk { return NewXORChunk() }}, - {encoding: EncXOROptST, supportsST: true, factory: func() Chunk { return NewXOROptSTChunk() }}, + {encoding: EncXOR2, supportsST: true, factory: func() Chunk { return NewXOR2Chunk() }}, } for _, tc := range testcases { t.Run(fmt.Sprintf("%v", tc.encoding), func(t *testing.T) { @@ -144,7 +144,7 @@ func TestPool(t *testing.T) { }, { name: "xor opt st", - encoding: EncXOROptST, + encoding: EncXOR2, }, { name: "invalid encoding", @@ -167,8 +167,8 @@ func TestPool(t *testing.T) { b = &c.(*HistogramChunk).b case EncFloatHistogram: b = &c.(*FloatHistogramChunk).b - case EncXOROptST: - b = &c.(*XorOptSTChunk).b + case EncXOR2: + b = &c.(*XOR2Chunk).b default: b = &c.(*XORChunk).b } diff --git a/tsdb/chunkenc/xor2.go b/tsdb/chunkenc/xor2.go new file mode 100644 index 0000000000..3dd3241180 --- /dev/null +++ b/tsdb/chunkenc/xor2.go @@ -0,0 +1,822 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// XOR2Chunk implements XOR encoding with joint timestamp+value control bits +// and byte-packed dod encoding for efficient appending. It also has an extra +// header byte after the sample count to allow for optionally encoding start +// timestamp (ST). +// +// Control prefix for samples >= 2: +// +// 0 → dod=0 AND value unchanged (1 bit) +// 10 → dod=0, value changed (2 bits, then value encoding) +// 110 → dod≠0, 13-bit signed [-4096, 4095] (prefix+dod packed into 2 bytes) +// 1110 → dod≠0, 20-bit signed [-524288, 524287] (prefix+dod packed into 3 bytes) +// 11110 → dod≠0, 64-bit escape (5+64 bits, then value encoding) +// 11111 → dod=0, stale NaN (5 bits, no value field) +// +// The dod bins are widened so that prefix+dod aligns to byte boundaries, +// replacing writeBit calls with writeByte for common cases. +// +// Value encoding for the dod≠0 cases (``): +// +// 0 → value unchanged +// 10 → reuse previous leading/trailing window +// 110 → new leading/trailing window +// 111 → stale NaN +// +// Value encoding for the dod=0, value-changed case (``): +// +// 0 → reuse previous leading/trailing window +// 1 → new leading/trailing window +// +// Start timestamp (ST) encoding: +// +// 1-byte ST header (at b[chunkHeaderSize]) layout: +// +// bit 7 (0x80): firstSTKnown — ST for the first sample is present in the stream +// bits 6-0: firstSTChangeOn — sample index where the first ST change begins +// +// When no ST is provided (st == 0 always), the header stays 0x00 and the +// chunk has no additional bits in it. 
+// +// When ST is present, the ST delta (prevT - st) is appended after each +// sample's joint timestamp+value encoding using putVarbitInt. + +package chunkenc + +import ( + "encoding/binary" + "math" + "math/bits" + + "github.com/prometheus/prometheus/model/histogram" + "github.com/prometheus/prometheus/model/value" +) + +const ( + chunkSTHeaderSize = 1 + maxFirstSTChangeOn = 0x7F +) + +func writeHeaderFirstSTKnown(b []byte) { + b[0] = 0x80 +} + +func writeHeaderFirstSTChangeOn(b []byte, firstSTChangeOn uint16) { + // First bit indicates the initial ST value. + // Here we save the sample number from where the first change occurs in the + // rest of the byte (7 bits) + + if firstSTChangeOn > maxFirstSTChangeOn { + // This should never happen, would cause corruption (ST already skipped but shouldn't). + return + } + b[0] |= uint8(firstSTChangeOn) +} + +func readSTHeader(b []byte) (firstSTKnown bool, firstSTChangeOn uint8) { + if b[0] == 0x00 { + return false, 0 + } + if b[0] == 0x80 { + return true, 0 + } + mask := byte(0x80) + if b[0]&mask != 0 { + firstSTKnown = true + } + mask = 0x7F + return firstSTKnown, b[0] & mask +} + +// XOR2Chunk holds XOR2 encoded samples with optional start +// timestamp per chunk or per sample. See the ST header format documented in the package comment above. +type XOR2Chunk struct { + b bstream +} + +// NewXOR2Chunk returns a new chunk with XOR2 encoding. +func NewXOR2Chunk() *XOR2Chunk { + b := make([]byte, chunkHeaderSize+chunkSTHeaderSize, chunkAllocationSize) + return &XOR2Chunk{b: bstream{stream: b, count: 0}} +} + +func (c *XOR2Chunk) Reset(stream []byte) { + c.b.Reset(stream) +} + +// Encoding returns the encoding type. +func (*XOR2Chunk) Encoding() Encoding { + return EncXOR2 +} + +// Bytes returns the underlying byte slice of the chunk. +func (c *XOR2Chunk) Bytes() []byte { + return c.b.bytes() +} + +// NumSamples returns the number of samples in the chunk. 
+func (c *XOR2Chunk) NumSamples() int { + return int(binary.BigEndian.Uint16(c.Bytes())) +} + +// Compact implements the Chunk interface. +func (c *XOR2Chunk) Compact() { + if l := len(c.b.stream); cap(c.b.stream) > l+chunkCompactCapacityThreshold { + buf := make([]byte, l) + copy(buf, c.b.stream) + c.b.stream = buf + } +} + +// Appender implements the Chunk interface. +func (c *XOR2Chunk) Appender() (Appender, error) { + if len(c.b.stream) == chunkHeaderSize+chunkSTHeaderSize { + return &xor2Appender{ + b: &c.b, + t: math.MinInt64, + leading: 0xff, + }, nil + } + it := c.iterator(nil) + + for it.Next() != ValNone { + } + if err := it.Err(); err != nil { + return nil, err + } + + // Set the bit position for continuing writes. The iterator's reader tracks + // how many bits remain unread in the last byte. + c.b.count = it.br.valid + + a := &xor2Appender{ + b: &c.b, + st: it.st, + t: it.t, + v: it.baselineV, + tDelta: it.tDelta, + stDiff: it.stDiff, + leading: it.leading, + trailing: it.trailing, + numTotal: binary.BigEndian.Uint16(c.b.bytes()), + firstSTKnown: it.firstSTKnown, + firstSTChangeOn: uint16(it.firstSTChangeOn), + } + return a, nil +} + +func (c *XOR2Chunk) iterator(it Iterator) *xor2Iterator { + if iter, ok := it.(*xor2Iterator); ok { + iter.Reset(c.b.bytes()) + return iter + } + iter := &xor2Iterator{} + iter.Reset(c.b.bytes()) + return iter +} + +// Iterator implements the Chunk interface. +func (c *XOR2Chunk) Iterator(it Iterator) Iterator { + return c.iterator(it) +} + +// xor2Appender appends samples with optional start timestamps using +// the XOR2 joint control bit encoding for regular timestamp and value, +// and putVarbitInt for the start timestamp delta. +type xor2Appender struct { + b *bstream + + st int64 + t int64 + v float64 + tDelta uint64 + stDiff int64 // prevT - st for the previous sample. 
+ + leading uint8 + trailing uint8 + + numTotal uint16 + firstSTChangeOn uint16 + firstSTKnown bool +} + +func (a *xor2Appender) Append(st, t int64, v float64) { + var ( + tDelta uint64 + stDiff int64 + ) + + switch a.numTotal { + case 0: + buf := make([]byte, binary.MaxVarintLen64) + for _, b := range buf[:binary.PutVarint(buf, t)] { + a.b.writeByte(b) + } + a.b.writeBits(math.Float64bits(v), 64) + + if st != 0 { + for _, b := range buf[:binary.PutVarint(buf, t-st)] { + a.b.writeByte(b) + } + a.firstSTKnown = true + writeHeaderFirstSTKnown(a.b.bytes()[chunkHeaderSize:]) + } + + case 1: + tDelta = uint64(t - a.t) + + buf := make([]byte, binary.MaxVarintLen64) + for _, b := range buf[:binary.PutUvarint(buf, tDelta)] { + a.b.writeByte(b) + } + + a.writeVDelta(v) + + if st != a.st { + stDiff = a.t - st + a.firstSTChangeOn = 1 + writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], 1) + putVarbitInt(a.b, stDiff) + } + + default: + tDelta = uint64(t - a.t) + dod := int64(tDelta - a.tDelta) + + // Fast path: no ST involvement at all. + if st == 0 && a.numTotal != maxFirstSTChangeOn && a.firstSTChangeOn == 0 && !a.firstSTKnown { + a.encodeJoint(dod, v) + a.t = t + if !value.IsStaleNaN(v) { + a.v = v + } + a.tDelta = tDelta + a.numTotal++ + binary.BigEndian.PutUint16(a.b.bytes(), a.numTotal) + return + } + + // Slow path: ST may be involved. + a.encodeJoint(dod, v) + + if a.firstSTChangeOn == 0 { + if st != a.st || a.numTotal == maxFirstSTChangeOn { + // First ST change: record prevT - st. 
+ stDiff = a.t - st + a.firstSTChangeOn = a.numTotal + writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], a.numTotal) + putVarbitInt(a.b, stDiff) + } + } else { + stDiff = a.t - st + putVarbitInt(a.b, stDiff-a.stDiff) + } + } + + a.st = st + a.t = t + if !value.IsStaleNaN(v) { + a.v = v + } + a.tDelta = tDelta + a.stDiff = stDiff + a.numTotal++ + binary.BigEndian.PutUint16(a.b.bytes(), a.numTotal) +} + +// encodeJoint writes the XOR2 joint timestamp+value control sequence for +// samples >= 2. +func (a *xor2Appender) encodeJoint(dod int64, v float64) { + if dod == 0 { + switch { + case value.IsStaleNaN(v): + a.b.writeBits(0b11111, 5) + case math.Float64bits(v)^math.Float64bits(a.v) == 0: + a.b.writeBit(zero) + default: + a.b.writeBits(0b10, 2) + a.writeVDeltaKnownNonZero(v) + } + return + } + + switch { + case dod >= -(1<<12) && dod <= (1<<12)-1: + // 13-bit dod: prefix `110` packed with top 5 bits → 2 bytes total. + a.b.writeByte(0b110_00000 | byte(uint64(dod)>>8)&0x1F) + a.b.writeByte(byte(uint64(dod))) + case dod >= -(1<<19) && dod <= (1<<19)-1: + // 20-bit dod: prefix `1110` packed with top 4 bits → 3 bytes total. + a.b.writeByte(0b1110_0000 | byte(uint64(dod)>>16)&0x0F) + a.b.writeByte(byte(uint64(dod) >> 8)) + a.b.writeByte(byte(uint64(dod))) + default: + // 64-bit escape (rare): `11110`. + a.b.writeBits(0b11110, 5) + a.b.writeBits(uint64(dod), 64) + } + a.writeVDelta(v) +} + +// writeVDelta encodes the value delta for the dod≠0 case. 
+func (a *xor2Appender) writeVDelta(v float64) { + if value.IsStaleNaN(v) { + a.b.writeBits(0b111, 3) + return + } + + delta := math.Float64bits(v) ^ math.Float64bits(a.v) + + if delta == 0 { + a.b.writeBit(zero) + return + } + + newLeading := uint8(bits.LeadingZeros64(delta)) + newTrailing := uint8(bits.TrailingZeros64(delta)) + + if newLeading >= 32 { + newLeading = 31 + } + + if a.leading != 0xff && newLeading >= a.leading && newTrailing >= a.trailing { + a.b.writeBits(0b10, 2) + a.b.writeBits(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) + return + } + + a.leading, a.trailing = newLeading, newTrailing + + a.b.writeBits(0b110, 3) + a.b.writeBits(uint64(newLeading), 5) + + sigbits := 64 - newLeading - newTrailing + a.b.writeBits(uint64(sigbits), 6) + a.b.writeBits(delta>>newTrailing, int(sigbits)) +} + +// writeVDeltaKnownNonZero encodes the value delta when it is known to be +// non-zero and non-stale (dod=0, value-changed case). +func (a *xor2Appender) writeVDeltaKnownNonZero(v float64) { + delta := math.Float64bits(v) ^ math.Float64bits(a.v) + + newLeading := uint8(bits.LeadingZeros64(delta)) + newTrailing := uint8(bits.TrailingZeros64(delta)) + + if newLeading >= 32 { + newLeading = 31 + } + + if a.leading != 0xff && newLeading >= a.leading && newTrailing >= a.trailing { + a.b.writeBit(zero) + a.b.writeBits(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) + return + } + + a.leading, a.trailing = newLeading, newTrailing + + a.b.writeBit(one) + a.b.writeBits(uint64(newLeading), 5) + + sigbits := 64 - newLeading - newTrailing + a.b.writeBits(uint64(sigbits), 6) + a.b.writeBits(delta>>newTrailing, int(sigbits)) +} + +func (*xor2Appender) AppendHistogram(*HistogramAppender, int64, int64, *histogram.Histogram, bool) (Chunk, bool, Appender, error) { + panic("appended a histogram sample to a float chunk") +} + +func (*xor2Appender) AppendFloatHistogram(*FloatHistogramAppender, int64, int64, *histogram.FloatHistogram, bool) (Chunk, bool, Appender, error) 
{ + panic("appended a float histogram sample to a float chunk") +} + +// xor2Iterator decodes XOR2 chunks. +type xor2Iterator struct { + br bstreamReader + numTotal uint16 + numRead uint16 + + firstSTKnown bool + firstSTChangeOn uint8 + + leading uint8 + trailing uint8 + + st int64 + t int64 + val float64 + + tDelta uint64 + stDiff int64 // Accumulated prevT - st. + err error + + baselineV float64 // Last non-stale value for XOR baseline. +} + +func (it *xor2Iterator) Seek(t int64) ValueType { + if it.err != nil { + return ValNone + } + + for t > it.t || it.numRead == 0 { + if it.Next() == ValNone { + return ValNone + } + } + return ValFloat +} + +func (it *xor2Iterator) At() (int64, float64) { + return it.t, it.val +} + +func (*xor2Iterator) AtHistogram(*histogram.Histogram) (int64, *histogram.Histogram) { + panic("cannot call xor2Iterator.AtHistogram") +} + +func (*xor2Iterator) AtFloatHistogram(*histogram.FloatHistogram) (int64, *histogram.FloatHistogram) { + panic("cannot call xor2Iterator.AtFloatHistogram") +} + +func (it *xor2Iterator) AtT() int64 { + return it.t +} + +func (it *xor2Iterator) AtST() int64 { + return it.st +} + +func (it *xor2Iterator) Err() error { + return it.err +} + +func (it *xor2Iterator) Reset(b []byte) { + it.br = newBReader(b[chunkHeaderSize+chunkSTHeaderSize:]) + it.numTotal = binary.BigEndian.Uint16(b) + it.firstSTKnown, it.firstSTChangeOn = readSTHeader(b[chunkHeaderSize:]) + + it.numRead = 0 + it.st = 0 + it.t = 0 + it.val = 0 + it.leading = 0 + it.trailing = 0 + it.tDelta = 0 + it.stDiff = 0 + it.baselineV = 0 + it.err = nil +} + +func (it *xor2Iterator) Next() ValueType { + if it.err != nil || it.numRead == it.numTotal { + return ValNone + } + + if it.numRead == 0 { + t, err := binary.ReadVarint(&it.br) + if err != nil { + it.err = err + return ValNone + } + v, err := it.br.readBits(64) + if err != nil { + it.err = err + return ValNone + } + it.t = t + it.val = math.Float64frombits(v) + if !value.IsStaleNaN(it.val) { + 
it.baselineV = it.val + } + + // Optional ST for sample 0. + if it.firstSTKnown { + stDiff, err := binary.ReadVarint(&it.br) + if err != nil { + it.err = err + return ValNone + } + it.st = t - stDiff + } + + it.numRead++ + return ValFloat + } + + if it.numRead == 1 { + tDelta, err := binary.ReadUvarint(&it.br) + if err != nil { + it.err = err + return ValNone + } + prevT := it.t + it.tDelta = tDelta + it.t += int64(it.tDelta) + + if err := it.decodeValue(); err != nil { + it.err = err + return ValNone + } + + // Optional ST delta for sample 1. + if it.firstSTChangeOn == 1 { + sdod, err := readVarbitInt(&it.br) + if err != nil { + it.err = err + return ValNone + } + it.stDiff = sdod + it.st = prevT - sdod + } + + it.numRead++ + return ValFloat + } + + // Sample N >= 2: read joint XOR2 control, then optional ST data. + prevT := it.t + savedNumRead := it.numRead + + ctrl, err := it.br.readXOR2Control() + if err != nil { + it.err = err + return ValNone + } + + switch ctrl { + case 0: + // dod=0, value unchanged. + it.t += int64(it.tDelta) + it.val = it.baselineV + case 1: + // dod=0, value changed. + it.t += int64(it.tDelta) + if err := it.decodeValueKnownNonZero(); err != nil { + it.err = err + return ValNone + } + case 2: + // 13-bit dod. + if err := it.readDod(13); err != nil { + it.err = err + return ValNone + } + if err := it.decodeValue(); err != nil { + it.err = err + return ValNone + } + case 3: + // 20-bit dod. + if err := it.readDod(20); err != nil { + it.err = err + return ValNone + } + if err := it.decodeValue(); err != nil { + it.err = err + return ValNone + } + case 4: + // 64-bit escape. + if err := it.readDod(64); err != nil { + it.err = err + return ValNone + } + if err := it.decodeValue(); err != nil { + it.err = err + return ValNone + } + default: + // dod=0, stale NaN. + it.t += int64(it.tDelta) + it.val = math.Float64frombits(value.StaleNaN) + } + + // Optional ST data, appended after the joint timestamp+value encoding. 
+ // The ST delta was encoded as (prevT - st), using the PREVIOUS sample's t. + if it.firstSTChangeOn > 0 && savedNumRead >= uint16(it.firstSTChangeOn) { + sdod, err := readVarbitInt(&it.br) + if err != nil { + it.err = err + return ValNone + } + if savedNumRead == uint16(it.firstSTChangeOn) { + it.stDiff = sdod + } else { + it.stDiff += sdod + } + it.st = prevT - it.stDiff + } + + it.numRead++ + return ValFloat +} + +// readDod reads a signed dod of width w bits and updates it.tDelta and it.t. +func (it *xor2Iterator) readDod(w uint8) error { + var b uint64 + if it.br.valid >= w { + it.br.valid -= w + b = (it.br.buffer >> it.br.valid) & ((uint64(1) << w) - 1) + } else { + var err error + b, err = it.br.readBits(w) + if err != nil { + return err + } + } + + if w < 64 && b >= (1<<(w-1)) { + b -= 1 << w + } + + it.tDelta = uint64(int64(it.tDelta) + int64(b)) + it.t += int64(it.tDelta) + return nil +} + +// decodeValue reads the XOR2 value encoding for the dod≠0 case: +// +// `0` → value unchanged +// `10` → reuse previous leading/trailing window +// `110` → new leading/trailing window +// `111` → stale NaN +func (it *xor2Iterator) decodeValue() error { + var bit bit + if it.br.valid > 0 { + it.br.valid-- + bit = (it.br.buffer & (uint64(1) << it.br.valid)) != 0 + } else { + var err error + bit, err = it.br.readBit() + if err != nil { + return err + } + } + + if bit == zero { + // `0` → value unchanged. + it.val = it.baselineV + return nil + } + + if it.br.valid > 0 { + it.br.valid-- + bit = (it.br.buffer & (uint64(1) << it.br.valid)) != 0 + } else { + var err error + bit, err = it.br.readBit() + if err != nil { + return err + } + } + + if bit == zero { + // `10` → reuse previous leading/trailing window. 
+ sz := uint8(64 - int(it.leading) - int(it.trailing)) + var valueBits uint64 + if it.br.valid >= sz { + it.br.valid -= sz + valueBits = (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1) + } else { + var err error + valueBits, err = it.br.readBits(sz) + if err != nil { + return err + } + } + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil + } + + if it.br.valid > 0 { + it.br.valid-- + bit = (it.br.buffer & (uint64(1) << it.br.valid)) != 0 + } else { + var err error + bit, err = it.br.readBit() + if err != nil { + return err + } + } + + if bit == zero { + // `110` → new leading/trailing window. + return it.decodeNewLeadingTrailing() + } + + // `111` → stale NaN. + it.val = math.Float64frombits(value.StaleNaN) + return nil +} + +// decodeValueKnownNonZero reads the XOR2 value encoding for the dod=0, +// value-changed case: +// +// `0` → reuse previous leading/trailing window +// `1` → new leading/trailing window +func (it *xor2Iterator) decodeValueKnownNonZero() error { + var bit bit + if it.br.valid > 0 { + it.br.valid-- + bit = (it.br.buffer & (uint64(1) << it.br.valid)) != 0 + } else { + var err error + bit, err = it.br.readBit() + if err != nil { + return err + } + } + + if bit == zero { + // `0` → reuse previous leading/trailing window. + sz := uint8(64 - int(it.leading) - int(it.trailing)) + var valueBits uint64 + if it.br.valid >= sz { + it.br.valid -= sz + valueBits = (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1) + } else { + var err error + valueBits, err = it.br.readBits(sz) + if err != nil { + return err + } + } + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil + } + + // `1` → new leading/trailing window. 
+ return it.decodeNewLeadingTrailing() +} + +// decodeNewLeadingTrailing reads a new leading/sigbits/value triple and +// updates it.leading, it.trailing, it.val, and it.baselineV. +func (it *xor2Iterator) decodeNewLeadingTrailing() error { + var newLeading uint64 + if it.br.valid >= 5 { + it.br.valid -= 5 + newLeading = (it.br.buffer >> it.br.valid) & 0x1f + } else { + var err error + newLeading, err = it.br.readBits(5) + if err != nil { + return err + } + } + + var sigbits uint64 + if it.br.valid >= 6 { + it.br.valid -= 6 + sigbits = (it.br.buffer >> it.br.valid) & 0x3f + } else { + var err error + sigbits, err = it.br.readBits(6) + if err != nil { + return err + } + } + + it.leading = uint8(newLeading) + if sigbits == 0 { + sigbits = 64 + } + it.trailing = 64 - it.leading - uint8(sigbits) + + n := uint8(sigbits) + var valueBits uint64 + if it.br.valid >= n { + it.br.valid -= n + valueBits = (it.br.buffer >> it.br.valid) & ((uint64(1) << n) - 1) + } else { + var err error + valueBits, err = it.br.readBits(n) + if err != nil { + return err + } + } + + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil +} diff --git a/tsdb/chunkenc/xor2_test.go b/tsdb/chunkenc/xor2_test.go new file mode 100644 index 0000000000..f6a344f598 --- /dev/null +++ b/tsdb/chunkenc/xor2_test.go @@ -0,0 +1,279 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package chunkenc + +import ( + "math" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/prometheus/prometheus/model/value" +) + +func BenchmarkXor2Write(b *testing.B) { + samples := make([]struct { + t int64 + v float64 + }, 120) + for i := range samples { + samples[i].t = int64(i) * 1000 + samples[i].v = float64(i) + float64(i)/10 + float64(i)/100 + float64(i)/1000 + } + + b.ReportAllocs() + + for b.Loop() { + c := NewXOR2Chunk() + app, _ := c.Appender() + for _, s := range samples { + app.Append(0, s.t, s.v) + } + } +} + +func BenchmarkXor2Read(b *testing.B) { + c := NewXOR2Chunk() + app, err := c.Appender() + require.NoError(b, err) + for i := int64(0); i < 120*1000; i += 1000 { + app.Append(0, i, float64(i)+float64(i)/10+float64(i)/100+float64(i)/1000) + } + + b.ReportAllocs() + + var it Iterator + for b.Loop() { + var ts int64 + var v float64 + it = c.Iterator(it) + for it.Next() != ValNone { + ts, v = it.At() + } + _, _ = ts, v + } +} + +func TestXOR2Basic(t *testing.T) { + c := NewXOR2Chunk() + app, err := c.Appender() + require.NoError(t, err) + + samples := []struct { + t int64 + v float64 + }{ + {1000, 1.0}, + {2000, 2.0}, + {3000, 3.0}, + {4000, 4.0}, + {5000, 5.0}, + } + + for _, s := range samples { + app.Append(0, s.t, s.v) + } + + it := c.Iterator(nil) + for _, expected := range samples { + require.Equal(t, ValFloat, it.Next()) + ts, v := it.At() + require.Equal(t, expected.t, ts) + require.Equal(t, expected.v, v) + } + require.Equal(t, ValNone, it.Next()) +} + +func TestXOR2WithStaleness(t *testing.T) { + c := NewXOR2Chunk() + app, err := c.Appender() + require.NoError(t, err) + + samples := []struct { + t int64 + v float64 + stale bool + }{ + {1000, 1.0, false}, + {2000, 2.0, false}, + {3000, math.Float64frombits(value.StaleNaN), true}, + {4000, 4.0, false}, + {5000, math.Float64frombits(value.StaleNaN), true}, + {6000, 
 6.0, false}, + } + + for _, s := range samples { + app.Append(0, s.t, s.v) + } + + it := c.Iterator(nil) + for _, expected := range samples { + require.Equal(t, ValFloat, it.Next()) + ts, v := it.At() + require.Equal(t, expected.t, ts) + if expected.stale { + require.True(t, value.IsStaleNaN(v), "Expected stale NaN at ts=%d", ts) + } else { + require.Equal(t, expected.v, v) + } + } + require.Equal(t, ValNone, it.Next()) +} + +func TestXOR2StaleWithDodNonZero(t *testing.T) { + c := NewXOR2Chunk() + app, err := c.Appender() + require.NoError(t, err) + + // Stale NaN samples where the timestamp dod is non-zero, exercising the + // `111` value encoding path inside writeVDelta. + samples := []struct { + t int64 + v float64 + stale bool + }{ + {1000, 1.0, false}, + {2000, 2.0, false}, + // dod = (3050 - 2000) - (2000 - 1000) = 1050 - 1000 = 50: stale with dod≠0. + {3050, math.Float64frombits(value.StaleNaN), true}, + {4050, 4.0, false}, + {5050, 5.0, false}, + } + + for _, s := range samples { + app.Append(0, s.t, s.v) + } + + it := c.Iterator(nil) + for _, expected := range samples { + require.Equal(t, ValFloat, it.Next()) + ts, v := it.At() + require.Equal(t, expected.t, ts) + if expected.stale { + require.True(t, value.IsStaleNaN(v), "Expected stale NaN at ts=%d", ts) + } else { + require.Equal(t, expected.v, v) + } + } + require.Equal(t, ValNone, it.Next()) +} + +func TestXOR2IrregularTimestamps(t *testing.T) { + c := NewXOR2Chunk() + app, err := c.Appender() + require.NoError(t, err) + + // Timestamps with dod values spanning multiple encoding ranges. + timestamps := []int64{ + 1000, 2000, 3000, + // dod in 13-bit range. + 3050, 4050, 5050, + // dod in 20-bit range (large jitter). + 5050 + 100000, 5050 + 200000, 5050 + 300000, + // Back to regular. 
+ 5050 + 301000, + } + for _, ts := range timestamps { + app.Append(0, ts, 1.0) + } + + it := c.Iterator(nil) + for _, expected := range timestamps { + require.Equal(t, ValFloat, it.Next()) + ts, _ := it.At() + require.Equal(t, expected, ts) + } + require.Equal(t, ValNone, it.Next()) +} + +func TestXOR2LargeDod(t *testing.T) { + c := NewXOR2Chunk() + app, err := c.Appender() + require.NoError(t, err) + + // Force the 64-bit escape path with a very large dod. + timestamps := []int64{0, 1000, 2000, 2000 + (1 << 20)} + for _, ts := range timestamps { + app.Append(0, ts, 1.0) + } + + it := c.Iterator(nil) + for _, expected := range timestamps { + require.Equal(t, ValFloat, it.Next()) + ts, _ := it.At() + require.Equal(t, expected, ts) + } + require.Equal(t, ValNone, it.Next()) +} + +func TestXOR2ChunkST(t *testing.T) { + testChunkSTHandling(t, ValFloat, func() Chunk { + return NewXOR2Chunk() + }) +} + +func TestXOR2Chunk_MoreThan127Samples(t *testing.T) { + const afterMax = maxFirstSTChangeOn + 3 + t.Run("zero ST", func(t *testing.T) { + chunk := NewXOR2Chunk() + app, err := chunk.Appender() + require.NoError(t, err) + for i := range afterMax { + app.Append(0, int64(i*10+1), float64(i)*1.5) + } + + it := chunk.Iterator(nil) + for i := range afterMax { + require.Equal(t, ValFloat, it.Next()) + st := it.AtST() + ts, v := it.At() + require.Equal(t, int64(0), st) + require.Equal(t, int64(i*10+1), ts) + require.Equal(t, float64(i)*1.5, v) + } + + require.Equal(t, ValNone, it.Next()) + require.NoError(t, it.Err()) + }) + + t.Run("non-zero ST after 127", func(t *testing.T) { + chunk := NewXOR2Chunk() + app, err := chunk.Appender() + require.NoError(t, err) + for i := range afterMax { + st := int64(0) + if i == afterMax-1 { + st = int64((afterMax - 1) * 10) + } + app.Append(st, int64(i*10+1), float64(i)*1.5) + } + + it := chunk.Iterator(nil) + for i := range afterMax { + require.Equal(t, ValFloat, it.Next()) + st := it.AtST() + ts, v := it.At() + if i == afterMax-1 { + 
require.Equal(t, int64((afterMax-1)*10), st) + } else { + require.Equal(t, int64(0), st) + } + require.Equal(t, int64(i*10+1), ts) + require.Equal(t, float64(i)*1.5, v) + } + + require.Equal(t, ValNone, it.Next()) + require.NoError(t, it.Err()) + }) +} diff --git a/tsdb/chunkenc/xoroptst.go b/tsdb/chunkenc/xoroptst.go deleted file mode 100644 index b138ddbdf4..0000000000 --- a/tsdb/chunkenc/xoroptst.go +++ /dev/null @@ -1,729 +0,0 @@ -// Copyright The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package chunkenc - -import ( - "encoding/binary" - "math" - - "github.com/prometheus/prometheus/model/histogram" -) - -const ( - chunkSTHeaderSize = 1 - maxFirstSTChangeOn = 0x7F -) - -func writeHeaderFirstSTKnown(b []byte) { - b[0] = 0x80 -} - -func writeHeaderFirstSTChangeOn(b []byte, firstSTChangeOn uint16) { - // First bit indicates the initial ST value. - // Here we save the sample number from where the first change occurs in the - // rest of the byte (7 bits) - - if firstSTChangeOn > maxFirstSTChangeOn { - // This should never happen, would cause corruption (ST already skipped but shouldn't). 
- return - } - b[0] |= uint8(firstSTChangeOn) -} - -func readSTHeader(b []byte) (firstSTKnown bool, firstSTChangeOn uint8) { - if b[0] == 0x00 { - return false, 0 - } - if b[0] == 0x80 { - return true, 0 - } - mask := byte(0x80) - if b[0]&mask != 0 { - firstSTKnown = true - } - mask = 0x7F - return firstSTKnown, b[0] & mask -} - -// XorOptSTChunk holds XOR enncoded samples with optional start time (ST) -// per chunk or per sample. See tsdb/docs/format/chunks.md for details. -type XorOptSTChunk struct { - b bstream -} - -// NewXOROptSTChunk returns a new chunk with XORv2 encoding. -func NewXOROptSTChunk() *XorOptSTChunk { - b := make([]byte, chunkHeaderSize+chunkSTHeaderSize, chunkAllocationSize) - return &XorOptSTChunk{b: bstream{stream: b, count: 0}} -} - -func (c *XorOptSTChunk) Reset(stream []byte) { - c.b.Reset(stream) -} - -// Encoding returns the encoding type. -func (*XorOptSTChunk) Encoding() Encoding { - return EncXOROptST -} - -// Bytes returns the underlying byte slice of the chunk. -func (c *XorOptSTChunk) Bytes() []byte { - return c.b.bytes() -} - -// NumSamples returns the number of samples in the chunk. -func (c *XorOptSTChunk) NumSamples() int { - return int(binary.BigEndian.Uint16(c.Bytes())) -} - -// Compact implements the Chunk interface. -func (c *XorOptSTChunk) Compact() { - if l := len(c.b.stream); cap(c.b.stream) > l+chunkCompactCapacityThreshold { - buf := make([]byte, l) - copy(buf, c.b.stream) - c.b.stream = buf - } -} - -// Appender implements the Chunk interface. -// It is not valid to call Appender() multiple times concurrently or to use multiple -// Appenders on the same chunk. -func (c *XorOptSTChunk) Appender() (Appender, error) { - if len(c.b.stream) == chunkHeaderSize+chunkSTHeaderSize { // Avoid allocating an Iterator when chunk is empty. 
- return &xorOptSTAppender{b: &c.b, t: math.MinInt64, leading: 0xff}, nil - } - it := c.iterator(nil) - - // To get an appender we must know the state it would have if we had - // appended all existing data from scratch. - // We iterate through the end and populate via the iterator's state. - for it.Next() != ValNone { - } - if err := it.Err(); err != nil { - return nil, err - } - - // Set the bit position for continuing writes. - // The iterator's reader tracks how many bits remain unread in the last byte. - c.b.count = it.br.valid - - a := &xorOptSTAppender{ - b: &c.b, - st: it.st, - t: it.t, - v: it.val, - stDiff: it.stDiff, - tDelta: it.tDelta, - leading: it.leading, - trailing: it.trailing, - - numTotal: it.numTotal, - firstSTKnown: it.firstSTKnown, - firstSTChangeOn: uint16(it.firstSTChangeOn), - } - return a, nil -} - -func (c *XorOptSTChunk) iterator(it Iterator) *xorOptSTtIterator { - xorIter, ok := it.(*xorOptSTtIterator) - if !ok { - xorIter = &xorOptSTtIterator{} - } - - xorIter.Reset(c.b.bytes()) - return xorIter -} - -// Iterator implements the Chunk interface. -// Iterator() must not be called concurrently with any modifications to the chunk, -// but after it returns you can use an Iterator concurrently with an Appender or -// other Iterators. -func (c *XorOptSTChunk) Iterator(it Iterator) Iterator { - return c.iterator(it) -} - -type xorOptSTAppender struct { - b *bstream - numTotal uint16 - firstSTChangeOn uint16 - leading uint8 - trailing uint8 - firstSTKnown bool - st, t int64 - v float64 - stDiff int64 // Difference between current ST and previous T. Undefined for first sample. - tDelta uint64 // Difference between current T and previous T. Undefined for first sample. 
-} - -func (a *xorOptSTAppender) writeVDelta(v float64) { - xorWrite(a.b, v, a.v, &a.leading, &a.trailing) -} - -func (*xorOptSTAppender) AppendHistogram(*HistogramAppender, int64, int64, *histogram.Histogram, bool) (Chunk, bool, Appender, error) { - panic("appended a histogram sample to a float chunk") -} - -func (*xorOptSTAppender) AppendFloatHistogram(*FloatHistogramAppender, int64, int64, *histogram.FloatHistogram, bool) (Chunk, bool, Appender, error) { - panic("appended a float histogram sample to a float chunk") -} - -type xorOptSTtIterator struct { - br bstreamReader - numTotal uint16 - - firstSTKnown bool - firstSTChangeOn uint8 - leading uint8 - trailing uint8 - - numRead uint16 - - st, t int64 - val float64 - - stDiff int64 - tDelta uint64 - err error -} - -func (it *xorOptSTtIterator) Seek(t int64) ValueType { - if it.err != nil { - return ValNone - } - - for t > it.t || it.numRead == 0 { - if it.Next() == ValNone { - return ValNone - } - } - return ValFloat -} - -func (it *xorOptSTtIterator) At() (int64, float64) { - return it.t, it.val -} - -func (*xorOptSTtIterator) AtHistogram(*histogram.Histogram) (int64, *histogram.Histogram) { - panic("cannot call xorIterator.AtHistogram") -} - -func (*xorOptSTtIterator) AtFloatHistogram(*histogram.FloatHistogram) (int64, *histogram.FloatHistogram) { - panic("cannot call xorIterator.AtFloatHistogram") -} - -func (it *xorOptSTtIterator) AtT() int64 { - return it.t -} - -func (it *xorOptSTtIterator) AtST() int64 { - return it.st -} - -func (it *xorOptSTtIterator) Err() error { - return it.err -} - -func (it *xorOptSTtIterator) Reset(b []byte) { - // We skip initial headers for actual samples. 
- it.br = newBReader(b[chunkHeaderSize+chunkSTHeaderSize:]) - it.numTotal = binary.BigEndian.Uint16(b) - it.firstSTKnown, it.firstSTChangeOn = readSTHeader(b[chunkHeaderSize:]) - it.numRead = 0 - it.st = 0 - it.t = 0 - it.val = 0 - it.leading = 0 - it.trailing = 0 - it.stDiff = 0 - it.tDelta = 0 - it.err = nil -} - -func (a *xorOptSTAppender) Append(st, t int64, v float64) { - if st == 0 && a.numTotal != maxFirstSTChangeOn && a.firstSTChangeOn == 0 && !a.firstSTKnown { - // Fast path for no ST usage at all. - // Same as classic XOR chunk appender. - - var tDelta uint64 - - switch a.numTotal { - case 0: - buf := make([]byte, binary.MaxVarintLen64) - for _, b := range buf[:binary.PutVarint(buf, t)] { - a.b.writeByte(b) - } - a.b.writeBits(math.Float64bits(v), 64) - case 1: - buf := make([]byte, binary.MaxVarintLen64) - tDelta = uint64(t - a.t) - for _, b := range buf[:binary.PutUvarint(buf, tDelta)] { - a.b.writeByte(b) - } - a.writeVDelta(v) - default: - tDelta = uint64(t - a.t) - dod := int64(tDelta - a.tDelta) - - // Gorilla has a max resolution of seconds, Prometheus milliseconds. - // Thus we use higher value range steps with larger bit size. - // - // TODO(beorn7): This seems to needlessly jump to large bit - // sizes even for very small deviations from zero. Timestamp - // compression can probably benefit from some smaller bit - // buckets. See also what was done for histogram encoding in - // varbit.go. - switch { - case dod == 0: - a.b.writeBit(zero) - case bitRange(dod, 14): - a.b.writeByte(0b10<<6 | (uint8(dod>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. - a.b.writeByte(uint8(dod)) // Bottom 8 bits of dod. 
- case bitRange(dod, 17): - a.b.writeBits(0b110, 3) - a.b.writeBits(uint64(dod), 17) - case bitRange(dod, 20): - a.b.writeBits(0b1110, 4) - a.b.writeBits(uint64(dod), 20) - default: - a.b.writeBits(0b1111, 4) - a.b.writeBits(uint64(dod), 64) - } - - a.writeVDelta(v) - } - - a.t = t - a.v = v - a.tDelta = tDelta - a.numTotal++ - binary.BigEndian.PutUint16(a.b.bytes(), a.numTotal) - - return - } - - var ( - stDiff int64 // Difference between current ST and previous T. Undefined for first sample. - tDelta uint64 // Difference between current T and previous T. Undefined for first sample. - ) - - // Slow path for ST usage. - switch a.numTotal { - case 0: - buf := make([]byte, binary.MaxVarintLen64) - - // Write T. - for _, b := range buf[:binary.PutVarint(buf, t)] { - a.b.writeByte(b) - } - - // Write V. - a.b.writeBits(math.Float64bits(v), 64) - - // Write ST. - for _, b := range buf[:binary.PutVarint(buf, t-st)] { - a.b.writeByte(b) - } - a.firstSTKnown = true - writeHeaderFirstSTKnown(a.b.bytes()[chunkHeaderSize:]) - - case 1: - buf := make([]byte, binary.MaxVarintLen64) - tDelta = uint64(t - a.t) - for _, b := range buf[:binary.PutUvarint(buf, tDelta)] { - a.b.writeByte(b) - } - a.writeVDelta(v) - - if st == a.st { - break - } - - stDiff = a.t - st - a.firstSTChangeOn = 1 - writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], 1) - // for _, b := range buf[:binary.PutVarint(buf, stDiff)] { - // a.b.writeByte(b) - // } - sdod := stDiff - // Gorilla has a max resolution of seconds, Prometheus milliseconds. - // Thus we use higher value range steps with larger bit size. - // - // TODO(beorn7): This seems to needlessly jump to large bit - // sizes even for very small deviations from zero. Timestamp - // compression can probably benefit from some smaller bit - // buckets. See also what was done for histogram encoding in - // varbit.go. 
- switch { - case sdod == 0: - a.b.writeBit(zero) - case bitRange(sdod, 14): - a.b.writeByte(0b10<<6 | (uint8(sdod>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. - a.b.writeByte(uint8(sdod)) // Bottom 8 bits of dod. - case bitRange(sdod, 17): - a.b.writeBits(0b110, 3) - a.b.writeBits(uint64(sdod), 17) - case bitRange(sdod, 20): - a.b.writeBits(0b1110, 4) - a.b.writeBits(uint64(sdod), 20) - default: - a.b.writeBits(0b1111, 4) - a.b.writeBits(uint64(sdod), 64) - } - - default: - tDelta = uint64(t - a.t) - dod := int64(tDelta - a.tDelta) - - // Gorilla has a max resolution of seconds, Prometheus milliseconds. - // Thus we use higher value range steps with larger bit size. - // - // TODO(beorn7): This seems to needlessly jump to large bit - // sizes even for very small deviations from zero. Timestamp - // compression can probably benefit from some smaller bit - // buckets. See also what was done for histogram encoding in - // varbit.go. - switch { - case dod == 0: - a.b.writeBit(zero) - case bitRange(dod, 14): - a.b.writeByte(0b10<<6 | (uint8(dod>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. - a.b.writeByte(uint8(dod)) // Bottom 8 bits of dod. - case bitRange(dod, 17): - a.b.writeBits(0b110, 3) - a.b.writeBits(uint64(dod), 17) - case bitRange(dod, 20): - a.b.writeBits(0b1110, 4) - a.b.writeBits(uint64(dod), 20) - default: - a.b.writeBits(0b1111, 4) - a.b.writeBits(uint64(dod), 64) - } - - a.writeVDelta(v) - - if a.firstSTChangeOn == 0 { - if st != a.st || a.numTotal == maxFirstSTChangeOn { - stDiff = a.t - st - a.firstSTChangeOn = a.numTotal - writeHeaderFirstSTChangeOn(a.b.bytes()[chunkHeaderSize:], a.numTotal) - sdod := stDiff - // Gorilla has a max resolution of seconds, Prometheus milliseconds. - // Thus we use higher value range steps with larger bit size. - // - // TODO(beorn7): This seems to needlessly jump to large bit - // sizes even for very small deviations from zero. 
Timestamp - // compression can probably benefit from some smaller bit - // buckets. See also what was done for histogram encoding in - // varbit.go. - switch { - case sdod == 0: - a.b.writeBit(zero) - case bitRange(sdod, 14): - a.b.writeByte(0b10<<6 | (uint8(sdod>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. - a.b.writeByte(uint8(sdod)) // Bottom 8 bits of dod. - case bitRange(sdod, 17): - a.b.writeBits(0b110, 3) - a.b.writeBits(uint64(sdod), 17) - case bitRange(sdod, 20): - a.b.writeBits(0b1110, 4) - a.b.writeBits(uint64(sdod), 20) - default: - a.b.writeBits(0b1111, 4) - a.b.writeBits(uint64(sdod), 64) - } - } - } else { - stDiff = a.t - st - sdod := stDiff - a.stDiff - // Gorilla has a max resolution of seconds, Prometheus milliseconds. - // Thus we use higher value range steps with larger bit size. - // - // TODO(beorn7): This seems to needlessly jump to large bit - // sizes even for very small deviations from zero. Timestamp - // compression can probably benefit from some smaller bit - // buckets. See also what was done for histogram encoding in - // varbit.go. - switch { - case sdod == 0: - a.b.writeBit(zero) - case bitRange(sdod, 14): - a.b.writeByte(0b10<<6 | (uint8(sdod>>8) & (1<<6 - 1))) // 0b10 size code combined with 6 bits of dod. - a.b.writeByte(uint8(sdod)) // Bottom 8 bits of dod. 
- case bitRange(sdod, 17): - a.b.writeBits(0b110, 3) - a.b.writeBits(uint64(sdod), 17) - case bitRange(sdod, 20): - a.b.writeBits(0b1110, 4) - a.b.writeBits(uint64(sdod), 20) - default: - a.b.writeBits(0b1111, 4) - a.b.writeBits(uint64(sdod), 64) - } - } - } - - a.st = st - a.t = t - a.v = v - a.tDelta = tDelta - a.stDiff = stDiff - - a.numTotal++ - binary.BigEndian.PutUint16(a.b.bytes(), a.numTotal) -} - -func (it *xorOptSTtIterator) retErr(err error) ValueType { - it.err = err - return ValNone -} - -func (it *xorOptSTtIterator) Next() ValueType { - if it.err != nil || it.numRead == it.numTotal { - return ValNone - } - - if it.numRead == 0 { - t, err := binary.ReadVarint(&it.br) - if err != nil { - return it.retErr(err) - } - - v, err := it.br.readBits(64) - if err != nil { - return it.retErr(err) - } - it.t = t - it.val = math.Float64frombits(v) - - // Optional ST read. - if it.firstSTKnown { - st, err := binary.ReadVarint(&it.br) - if err != nil { - return it.retErr(err) - } - it.st = t - st - } - - it.numRead++ - return ValFloat - } - - if it.numRead == 1 { - tDelta, err := binary.ReadUvarint(&it.br) - if err != nil { - return it.retErr(err) - } - it.tDelta = tDelta - - if err := xorRead(&it.br, &it.val, &it.leading, &it.trailing); err != nil { - return it.retErr(err) - } - - // Optional ST delta read. - if it.firstSTChangeOn == 1 { - // stDiff, err := binary.ReadVarint(&it.br) - // if err != nil { - // return it.retErr(err) - // } - // it.stDiff = stDiff - // it.st = it.t - stDiff - var d byte - // read delta-of-delta - for range 4 { - d <<= 1 - bit, err := it.br.readBitFast() - if err != nil { - bit, err = it.br.readBit() - if err != nil { - return it.retErr(err) - } - } - if bit == zero { - break - } - d |= 1 - } - var sz uint8 - var sdod int64 - switch d { - case 0b0: - // dod == 0 - case 0b10: - sz = 14 - case 0b110: - sz = 17 - case 0b1110: - sz = 20 - case 0b1111: - // Do not use fast because it's very unlikely it will succeed. 
- bits, err := it.br.readBits(64) - if err != nil { - return it.retErr(err) - } - - sdod = int64(bits) - } - - if sz != 0 { - bits, err := it.br.readBitsFast(sz) - if err != nil { - bits, err = it.br.readBits(sz) - if err != nil { - return it.retErr(err) - } - } - - // Account for negative numbers, which come back as high unsigned numbers. - // See docs/bstream.md. - if bits > (1 << (sz - 1)) { - bits -= 1 << sz - } - sdod = int64(bits) - } - it.stDiff = sdod - it.st = it.t - sdod - } - - it.t += int64(it.tDelta) - it.numRead++ - return ValFloat - } - - var d byte - // read delta-of-delta - for range 4 { - d <<= 1 - bit, err := it.br.readBitFast() - if err != nil { - bit, err = it.br.readBit() - } - if err != nil { - return it.retErr(err) - } - if bit == zero { - break - } - d |= 1 - } - var sz uint8 - var dod int64 - switch d { - case 0b0: - // dod == 0 - case 0b10: - sz = 14 - case 0b110: - sz = 17 - case 0b1110: - sz = 20 - case 0b1111: - // Do not use fast because it's very unlikely it will succeed. - bits, err := it.br.readBits(64) - if err != nil { - return it.retErr(err) - } - - dod = int64(bits) - } - - if sz != 0 { - bits, err := it.br.readBitsFast(sz) - if err != nil { - bits, err = it.br.readBits(sz) - } - if err != nil { - return it.retErr(err) - } - - // Account for negative numbers, which come back as high unsigned numbers. - // See docs/bstream.md. 
- if bits > (1 << (sz - 1)) { - bits -= 1 << sz - } - dod = int64(bits) - } - - it.tDelta = uint64(int64(it.tDelta) + dod) - - if err := xorRead(&it.br, &it.val, &it.leading, &it.trailing); err != nil { - return it.retErr(err) - } - - if it.firstSTChangeOn > 0 && it.numRead >= uint16(it.firstSTChangeOn) { - var d byte - // read delta-of-delta - for range 4 { - d <<= 1 - bit, err := it.br.readBitFast() - if err != nil { - bit, err = it.br.readBit() - if err != nil { - return it.retErr(err) - } - } - if bit == zero { - break - } - d |= 1 - } - var sz uint8 - var sdod int64 - switch d { - case 0b0: - // dod == 0 - case 0b10: - sz = 14 - case 0b110: - sz = 17 - case 0b1110: - sz = 20 - case 0b1111: - // Do not use fast because it's very unlikely it will succeed. - bits, err := it.br.readBits(64) - if err != nil { - return it.retErr(err) - } - - sdod = int64(bits) - } - - if sz != 0 { - bits, err := it.br.readBitsFast(sz) - if err != nil { - bits, err = it.br.readBits(sz) - if err != nil { - return it.retErr(err) - } - } - - // Account for negative numbers, which come back as high unsigned numbers. - // See docs/bstream.md. - if bits > (1 << (sz - 1)) { - bits -= 1 << sz - } - sdod = int64(bits) - } - if it.numRead == uint16(it.firstSTChangeOn) { - it.stDiff = sdod - } else { - it.stDiff += sdod - } - it.st = it.t - it.stDiff - } - - it.t += int64(it.tDelta) - - it.numRead++ - return ValFloat -} diff --git a/tsdb/chunkenc/xoroptst_test.go b/tsdb/chunkenc/xoroptst_test.go deleted file mode 100644 index 15b87993de..0000000000 --- a/tsdb/chunkenc/xoroptst_test.go +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package chunkenc - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestXorOptSTChunk(t *testing.T) { - testChunkSTHandling(t, ValFloat, func() Chunk { - return NewXOROptSTChunk() - }, - ) -} - -func TestXorOptSTChunk_MoreThan127Samples(t *testing.T) { - const afterMax = maxFirstSTChangeOn + 3 - t.Run("zero ST", func(t *testing.T) { - chunk := NewXOROptSTChunk() - app, err := chunk.Appender() - require.NoError(t, err) - for i := range afterMax { - app.Append(0, int64(i*10+1), float64(i)*1.5) - } - - it := chunk.Iterator(nil) - for i := range afterMax { - require.Equal(t, ValFloat, it.Next()) - st := it.AtST() - ts, v := it.At() - require.Equal(t, int64(0), st) - require.Equal(t, int64(i*10+1), ts) - require.Equal(t, float64(i)*1.5, v) - } - - require.Equal(t, ValNone, it.Next()) - require.NoError(t, it.Err()) - }) - - t.Run("non-zero ST after 127", func(t *testing.T) { - chunk := NewXOROptSTChunk() - app, err := chunk.Appender() - require.NoError(t, err) - for i := range afterMax { - st := int64(0) - if i == afterMax-1 { - st = int64((afterMax - 1) * 10) - } - app.Append(st, int64(i*10+1), float64(i)*1.5) - } - - it := chunk.Iterator(nil) - for i := range afterMax { - require.Equal(t, ValFloat, it.Next()) - st := it.AtST() - ts, v := it.At() - if i == afterMax-1 { - require.Equal(t, int64((afterMax-1)*10), st) - } else { - require.Equal(t, int64(0), st) - } - require.Equal(t, int64(i*10+1), ts) - require.Equal(t, float64(i)*1.5, v) - } - - require.Equal(t, ValNone, it.Next()) - require.NoError(t, it.Err()) - }) -} - 
-func TestXorOptSTChunk_STHeader(t *testing.T) { - b := make([]byte, 1) - writeHeaderFirstSTKnown(b) - firstSTKnown, firstSTChangeOn := readSTHeader(b) - require.True(t, firstSTKnown) - require.Equal(t, uint8(0), firstSTChangeOn) - - b = make([]byte, 1) - firstSTKnown, firstSTChangeOn = readSTHeader(b) - require.False(t, firstSTKnown) - require.Equal(t, uint8(0), firstSTChangeOn) - - b = make([]byte, 1) - writeHeaderFirstSTChangeOn(b, 1) - firstSTKnown, firstSTChangeOn = readSTHeader(b) - require.False(t, firstSTKnown) - require.Equal(t, uint8(1), firstSTChangeOn) - - b = make([]byte, 1) - writeHeaderFirstSTKnown(b) - writeHeaderFirstSTChangeOn(b, 119) - firstSTKnown, firstSTChangeOn = readSTHeader(b) - require.True(t, firstSTKnown) - require.Equal(t, uint8(119), firstSTChangeOn) -} diff --git a/tsdb/db_append_v2_test.go b/tsdb/db_append_v2_test.go index a3d74efefd..1b05a1280e 100644 --- a/tsdb/db_append_v2_test.go +++ b/tsdb/db_append_v2_test.go @@ -7553,7 +7553,7 @@ func TestCompactHeadWithSTStorage_AppendV2(t *testing.T) { for _, chk := range chks { c, _, err := chunkr.ChunkOrIterable(chk) require.NoError(t, err) - require.Equal(t, chunkenc.EncXOROptST, c.Encoding(), + require.Equal(t, chunkenc.EncXOR2, c.Encoding(), "unexpected chunk encoding, got %s", c.Encoding()) chunkCount++ } diff --git a/tsdb/docs/format/chunks.md b/tsdb/docs/format/chunks.md index c126a4d8dd..32538d436b 100644 --- a/tsdb/docs/format/chunks.md +++ b/tsdb/docs/format/chunks.md @@ -65,35 +65,74 @@ Notes: * `padding` of 0 to 7 bits so that the whole chunk data is byte-aligned. * The chunk can have as few as one sample, i.e. `ts_1`, `v_1`, etc. are optional. -## XOR chunk data with start timestamp +## XOR2 chunk data -This is experimental, related to supporting delta temporality metrics. -Subject to change. +XOR2 uses the same structure as XOR for samples 0 and 1. 
Starting from sample 2, +a joint control prefix encodes both the timestamp delta-of-delta (dod) and whether +the value changed, with common dod cases byte-aligned for efficient writing. + +XOR2 can encode start timestamp (ST) as well optionally, see details further +down. -The format is similar to XOR chunk data, except there's an additional one byte -start time (ST) header and optional start time values. ``` -┌──────────────────────┬───────────────────┬────────────────┬───────────────────────────────┬─- -│ num_samples │ st_header | ?st_0 | ts_0 │ v_0 │ -└──────────────────────┴───────────────────┴────────────────┴───────────────────────────────┴─- +┌──────────────────────┬───────────────────┬───────────────┬───────────────┬────────────────┬─- +│ num_samples │ st_header | ts_0 │ v_0 │ ?st_0 | +└──────────────────────┴───────────────────┴───────────────┴───────────────┴────────────────┴─- --──────────────────────┬──────────────────────┬──────────────────────┬─- - ?st_1_delta | ts_1_delta │ v_1_xor │ --──────────────────────┴──────────────────────┴──────────────────────┴─- +-─────────────────────┬───────────────────────┬─────────────────────────┬─- + ts_1_delta │ v_1_xor │ ?st_1_delta | +-─────────────────────┴───────────────────────┴─────────────────────────┴─- --──────────────────────┬──────────────────────┬──────────────────────┬─────┬─- - ?st_2_dod | ts_2_dod │ v_2_xor │ ... │ --──────────────────────┴──────────────────────┴──────────────────────┴─────┴─- +-─────────────────────────┬───────────────────────┬─────┬─- + sample_2 │ ?st_2_dod | ... 
│ +-─────────────────────────┴───────────────────────┴─────┴─- + +-─────────────────────────┬───────────────────────┬──────────────────┐ + sample_n │ ?st_n_dod | padding │ +-─────────────────────────┴───────────────────────┴──────────────────┘ --──────────────────────┬──────────────────────┬──────────────────────┬──────────────────┐ - ?st_n_dod | ts_n_dod │ v_n_xor │ padding │ --──────────────────────┴──────────────────────┴──────────────────────┴──────────────────┘ ``` -### Notes +### Joint sample encoding for n >= 2 (``): -In addition to the notes from [XOR chunk data](#xor-chunk-data). +Each sample starts with a variable-length control prefix that jointly encodes the +dod and value change status: + +| Control prefix | dod | Value encoding that follows | +|---|---|---| +| `0` | 0 | (none, value unchanged) | +| `10` | 0 | `` (value known non-zero and non-stale) | +| `110DDDDD` `DDDDDDDD` | 13-bit signed [-4096, 4095] | `` | +| `1110DDDD` `DDDDDDDD` `DDDDDDDD` | 20-bit signed [-524288, 524287] | `` | +| `11110` + 64-bit dod | exact | `` | +| `11111` | 0 | (none, stale NaN — no value field) | + +The `110` and `1110` cases pack the prefix and the most-significant dod bits into +the first byte, making the full dod field byte-aligned. + +### Value delta encoding (``): + +Used after the dod≠0 control prefixes. The XOR of the current and previous value is encoded as: + +| Prefix | Meaning | +|---|---| +| `0` | XOR = 0 (value unchanged) | +| `10` | Reuse previous leading/trailing window; `sigbits` value bits follow | +| `110` + leading(5) + sigbits(6) + value(sigbits) | New leading/trailing window | +| `111` | Stale NaN marker (3 bits) | + +### Value delta encoding, known non-zero (``): + +Used after the `10` control prefix (dod=0, value known to have changed and be non-stale). 
+The delta=0 check is skipped, saving one bit on the reuse path: + +| Prefix | Meaning | +|---|---| +| `0` | Reuse previous leading/trailing window; `sigbits` value bits follow | +| `1` + leading(5) + sigbits(6) + value(sigbits) | New leading/trailing window | + +### Start timestamp encoding * We use `st_i_dod` and `st_i` interchangeably when `i>1` in these notes. * `st_header` is one byte: @@ -111,8 +150,8 @@ In addition to the notes from [XOR chunk data](#xor-chunk-data). `st_changed_on` is set to 127 (0xEF) and the 127th and further samples will have `st_i` present. * `st_0` is encoded as a `varint` if present. -* `st_1` is encoded as a `varint` delta from `st_0` (or from 0 if `st_0` is not - present). +* `st_1` is encoded as a `varbit_ts` delta from `st_0` (or from 0 if `st_0` is + not present). * `st_i_dod` aka `st_i (i>1)` is encoded as a `varbit_ts` "delta of delta" from `st_i-1` (or from 0 if `st_i-1` is not present). diff --git a/tsdb/head_test.go b/tsdb/head_test.go index 7f0af8b41f..1e5858b02b 100644 --- a/tsdb/head_test.go +++ b/tsdb/head_test.go @@ -7695,7 +7695,7 @@ func TestHeadAppender_STStorage_ChunkEncoding(t *testing.T) { encoding := chk.Encoding() if enableST { - require.Equal(t, chunkenc.EncXOROptST, encoding, + require.Equal(t, chunkenc.EncXOR2, encoding, "Expected ST-capable encoding when EnableSTStorage is true") } else { require.Equal(t, chunkenc.EncXOR, encoding, diff --git a/tsdb/ooo_head.go b/tsdb/ooo_head.go index 04f859154f..20b225bc56 100644 --- a/tsdb/ooo_head.go +++ b/tsdb/ooo_head.go @@ -123,7 +123,7 @@ func (o *OOOChunk) ToEncodedChunks(mint, maxt int64, storeST bool) (chks []memCh } } switch encoding { - case chunkenc.EncXOR, chunkenc.EncXOROptST: + case chunkenc.EncXOR, chunkenc.EncXOR2: app.Append(s.st, s.t, s.f) case chunkenc.EncHistogram: // TODO(krajorama): handle ST capable histogram chunk. 
diff --git a/tsdb/ooo_head_test.go b/tsdb/ooo_head_test.go index f7e73233fb..857018f5c2 100644 --- a/tsdb/ooo_head_test.go +++ b/tsdb/ooo_head_test.go @@ -406,7 +406,7 @@ func TestOOOChunks_ToEncodedChunks_WithST(t *testing.T) { storeST bool expectedEncoding chunkenc.Encoding }{ - {"storeST=true", true, chunkenc.EncXOROptST}, + {"storeST=true", true, chunkenc.EncXOR2}, {"storeST=false", false, chunkenc.EncXOR}, } From 53e7b61692628ff402069043fb312cbecc1c0427 Mon Sep 17 00:00:00 2001 From: bwplotka Date: Tue, 10 Mar 2026 11:31:47 +0000 Subject: [PATCH 42/73] feat: add compliance RW sender test for agent Signed-off-by: bwplotka --- compliance/remote_write_sender_test.go | 29 +++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/compliance/remote_write_sender_test.go b/compliance/remote_write_sender_test.go index 6840132bd3..f6ddea1b9a 100644 --- a/compliance/remote_write_sender_test.go +++ b/compliance/remote_write_sender_test.go @@ -53,7 +53,9 @@ scrape_configs: var scrapeConfigTmpl = template.Must(template.New("config").Parse(scrapeConfigTemplate)) -type internalPrometheus struct{} +type internalPrometheus struct { + agentMode bool +} func (p internalPrometheus) Name() string { return "internal-prometheus" } @@ -74,20 +76,33 @@ func (p internalPrometheus) Run(ctx context.Context, opts sender.Options) error } defer os.RemoveAll(dir) - return sender.RunCommand(ctx, "../cmd/prometheus", nil, - "go", "run", ".", + args := []string{ + "run", ".", "--web.listen-address=0.0.0.0:0", - fmt.Sprintf("--storage.tsdb.path=%v", dir), fmt.Sprintf("--config.file=%s", configFile), // Set important flags for the full remote write compliance: "--enable-feature=st-storage", - ) + } + if p.agentMode { + args = append(args, fmt.Sprintf("--storage.agent.path=%v", dir), "--agent") + } else { + args = append(args, fmt.Sprintf("--storage.tsdb.path=%v", dir)) + } + return sender.RunCommand(ctx, "../cmd/prometheus", nil, "go", args...) 
} var _ sender.Sender = internalPrometheus{} // TestRemoteWriteSender runs remote write sender compliance tests defined in -// https://github.com/prometheus/compliance/tree/main/remotewrite/sender +// https://github.com/prometheus/compliance/tree/main/remotewrite/sender against +// both agent and sever modes. func TestRemoteWriteSender(t *testing.T) { - sender.RunTests(t, internalPrometheus{}, sender.ComplianceTests()) + t.Run("mode=server", func(t *testing.T) { + t.Parallel() + sender.RunTests(t, internalPrometheus{}, sender.ComplianceTests()) + }) + t.Run("mode=agent", func(t *testing.T) { + t.Parallel() + sender.RunTests(t, internalPrometheus{agentMode: true}, sender.ComplianceTests()) + }) } From 6ab5d8f9be91e914e698088803953396a099bc51 Mon Sep 17 00:00:00 2001 From: bwplotka Date: Tue, 10 Mar 2026 12:27:48 +0000 Subject: [PATCH 43/73] feat(agent): add support for appending ST Signed-off-by: bwplotka --- tsdb/agent/db.go | 2 +- tsdb/agent/db_append_v2.go | 4 +++- tsdb/agent/db_append_v2_test.go | 40 ++++++++++++++++++++++++--------- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/tsdb/agent/db.go b/tsdb/agent/db.go index 3f79d9176a..f1d0aff18f 100644 --- a/tsdb/agent/db.go +++ b/tsdb/agent/db.go @@ -95,7 +95,7 @@ type Options struct { // EnableSTStorage determines whether agent DB should write a Start Timestamp (ST) // per sample to WAL. - // TODO(bwplotka): Implement this option as per PROM-60, currently it's noop. + // Represents 'st-storage' feature flag. EnableSTStorage bool } diff --git a/tsdb/agent/db_append_v2.go b/tsdb/agent/db_append_v2.go index bb2601e1e3..b963608637 100644 --- a/tsdb/agent/db_append_v2.go +++ b/tsdb/agent/db_append_v2.go @@ -72,7 +72,6 @@ func (a *appenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t int64 lastTS := s.lastTs s.Unlock() - // TODO(bwplotka): Handle ST natively (as per PROM-60). 
if a.opts.EnableSTAsZeroSample && st != 0 { a.bestEffortAppendSTZeroSample(s, ls, lastTS, st, t, h, fh) } @@ -86,6 +85,7 @@ func (a *appenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t int64 case fh != nil: isStale = value.IsStaleNaN(fh.Sum) // NOTE: always modify pendingFloatHistograms and floatHistogramSeries together + // TODO(krajorama,ywwg,bwplotka): Pass ST when available in WAL. a.pendingFloatHistograms = append(a.pendingFloatHistograms, record.RefFloatHistogramSample{ Ref: s.ref, T: t, @@ -95,6 +95,7 @@ func (a *appenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t int64 case h != nil: isStale = value.IsStaleNaN(h.Sum) // NOTE: always modify pendingHistograms and histogramSeries together + // TODO(krajorama,ywwg,bwplotka): Pass ST when available in WAL. a.pendingHistograms = append(a.pendingHistograms, record.RefHistogramSample{ Ref: s.ref, T: t, @@ -107,6 +108,7 @@ func (a *appenderV2) Append(ref storage.SeriesRef, ls labels.Labels, st, t int64 // NOTE: always modify pendingSamples and sampleSeries together. a.pendingSamples = append(a.pendingSamples, record.RefSample{ Ref: s.ref, + ST: st, T: t, V: v, }) diff --git a/tsdb/agent/db_append_v2_test.go b/tsdb/agent/db_append_v2_test.go index cbe9b09374..92a5bb8f35 100644 --- a/tsdb/agent/db_append_v2_test.go +++ b/tsdb/agent/db_append_v2_test.go @@ -90,6 +90,9 @@ func TestDB_InvalidSeries_AppendV2(t *testing.T) { }) } +// TestCommit_AppendV2 tests Appender commit. +// TODO(bwplotka): Rewrite this so Refs are generated, then appended, then expected so we test the +// exact data durability. 
func TestCommit_AppendV2(t *testing.T) { const ( numDatapoints = 1000 @@ -102,15 +105,24 @@ func TestCommit_AppendV2(t *testing.T) { opts.EnableSTStorage = enableSTStorage s := createTestAgentDB(t, nil, opts) - app := s.AppenderV2(context.TODO()) + var ( + expectedSampleSTs []int64 + gotSampleSTs []int64 + ) + if enableSTStorage { + expectedSampleSTs = make([]int64, 0, numSeries*numDatapoints) + gotSampleSTs = make([]int64, 0, numSeries*numDatapoints) + } + app := s.AppenderV2(t.Context()) lbls := labelsForTest(t.Name(), numSeries) for _, l := range lbls { lset := labels.New(l...) for i := range numDatapoints { sample := chunks.GenerateSamples(0, 1) - _, err := app.Append(0, lset, int64(i), sample[0].T()+2000, sample[0].F(), nil, nil, storage.AOptions{ + st := int64(i + 1234) + _, err := app.Append(0, lset, st, sample[0].T()+2000, sample[0].F(), nil, nil, storage.AOptions{ Exemplars: []exemplar.Exemplar{{ Labels: lset, Ts: sample[0].T() + int64(i) + 2000, @@ -119,6 +131,9 @@ func TestCommit_AppendV2(t *testing.T) { }}, }) require.NoError(t, err) + if enableSTStorage { + expectedSampleSTs = append(expectedSampleSTs, st) + } } } @@ -129,7 +144,7 @@ func TestCommit_AppendV2(t *testing.T) { histograms := tsdbutil.GenerateTestHistograms(numHistograms) for i := range numHistograms { - _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, histograms[i], nil, storage.AOptions{}) + _, err := app.Append(0, lset, int64(i+2234), int64(i+2000), 0, histograms[i], nil, storage.AOptions{}) require.NoError(t, err) } } @@ -141,7 +156,7 @@ func TestCommit_AppendV2(t *testing.T) { customBucketHistograms := tsdbutil.GenerateTestCustomBucketsHistograms(numHistograms) for i := range numHistograms { - _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, customBucketHistograms[i], nil, storage.AOptions{}) + _, err := app.Append(0, lset, int64(i+3234), int64(i+2000), 0, customBucketHistograms[i], nil, storage.AOptions{}) require.NoError(t, err) } } @@ -153,7 +168,7 @@ func 
TestCommit_AppendV2(t *testing.T) { floatHistograms := tsdbutil.GenerateTestFloatHistograms(numHistograms) for i := range numHistograms { - _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, nil, floatHistograms[i], storage.AOptions{}) + _, err := app.Append(0, lset, int64(i+4234), int64(i+2000), 0, nil, floatHistograms[i], storage.AOptions{}) require.NoError(t, err) } } @@ -165,7 +180,7 @@ func TestCommit_AppendV2(t *testing.T) { customBucketFloatHistograms := tsdbutil.GenerateTestCustomBucketsFloatHistograms(numHistograms) for i := range numHistograms { - _, err := app.Append(0, lset, int64(i), int64(i+2000), 0, nil, customBucketFloatHistograms[i], storage.AOptions{}) + _, err := app.Append(0, lset, int64(i+5234), int64(i+2000), 0, nil, customBucketFloatHistograms[i], storage.AOptions{}) require.NoError(t, err) } } @@ -203,7 +218,6 @@ func TestCommit_AppendV2(t *testing.T) { samples, err = dec.Samples(rec, samples) require.NoError(t, err) walSamplesCount += len(samples) - case record.SamplesV2: if !enableSTStorage { t.Errorf("Got V2 Samples when ST disabled") @@ -211,6 +225,10 @@ func TestCommit_AppendV2(t *testing.T) { var samples []record.RefSample samples, err = dec.Samples(rec, samples) require.NoError(t, err) + + for _, s := range samples { + gotSampleSTs = append(gotSampleSTs, s.ST) + } walSamplesCount += len(samples) case record.HistogramSamples, record.CustomBucketsHistogramSamples: @@ -238,13 +256,15 @@ func TestCommit_AppendV2(t *testing.T) { // Check that the WAL contained the same number of committed series/samples/exemplars. 
require.Equal(t, numSeries*5, walSeriesCount, "unexpected number of series") require.Equal(t, numSeries*numDatapoints, walSamplesCount, "unexpected number of samples") + require.Equal(t, expectedSampleSTs, gotSampleSTs, "unexpected STs received") require.Equal(t, numSeries*numDatapoints, walExemplarsCount, "unexpected number of exemplars") require.Equal(t, numSeries*numHistograms*2, walHistogramCount, "unexpected number of histograms") require.Equal(t, numSeries*numHistograms*2, walFloatHistogramCount, "unexpected number of float histograms") - // Check that we can still create both kinds of Appender - see https://github.com/prometheus/prometheus/issues/17800. - _ = s.Appender(context.TODO()) - _ = s.AppenderV2(context.TODO()) + // Check that we can still create both kinds of Appender. + // Regression test against https://github.com/prometheus/prometheus/issues/17800. + _ = s.Appender(t.Context()) + _ = s.AppenderV2(t.Context()) }) } } From a773d3daad129e509faf00d408778b7750b72fa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Tue, 10 Mar 2026 16:23:08 +0100 Subject: [PATCH 44/73] replace stray xoroptst words MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: György Krajcsovits --- tsdb/chunkenc/chunk.go | 10 +++++----- tsdb/chunkenc/xor2.go | 2 +- tsdb/db_append_v2_test.go | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tsdb/chunkenc/chunk.go b/tsdb/chunkenc/chunk.go index b3b33df34b..d1b14568ef 100644 --- a/tsdb/chunkenc/chunk.go +++ b/tsdb/chunkenc/chunk.go @@ -77,7 +77,7 @@ type Chunk interface { // Encoding returns the encoding type of the chunk. // If the chunk is capable of storing ST (start timestamps), it should - // return the appropriate encoding type (e.g., EncXOROptST). + // return the appropriate encoding type (e.g., EncXOR2). Encoding() Encoding // Appender returns an appender to append samples to the chunk. 
@@ -298,7 +298,7 @@ type pool struct { xor sync.Pool histogram sync.Pool floatHistogram sync.Pool - xoroptst sync.Pool + xo2 sync.Pool } // NewPool returns a new pool. @@ -319,7 +319,7 @@ func NewPool() Pool { return &FloatHistogramChunk{b: bstream{}} }, }, - xoroptst: sync.Pool{ + xo2: sync.Pool{ New: func() any { return &XOR2Chunk{b: bstream{}} }, @@ -337,7 +337,7 @@ func (p *pool) Get(e Encoding, b []byte) (Chunk, error) { case EncFloatHistogram: c = p.floatHistogram.Get().(*FloatHistogramChunk) case EncXOR2: - c = p.xoroptst.Get().(*XOR2Chunk) + c = p.xo2.Get().(*XOR2Chunk) default: return nil, fmt.Errorf("invalid chunk encoding %q", e) } @@ -361,7 +361,7 @@ func (p *pool) Put(c Chunk) error { sp = &p.floatHistogram case EncXOR2: _, ok = c.(*XOR2Chunk) - sp = &p.xoroptst + sp = &p.xo2 default: return fmt.Errorf("invalid chunk encoding %q", c.Encoding()) } diff --git a/tsdb/chunkenc/xor2.go b/tsdb/chunkenc/xor2.go index 3dd3241180..2d5cc2b542 100644 --- a/tsdb/chunkenc/xor2.go +++ b/tsdb/chunkenc/xor2.go @@ -101,7 +101,7 @@ func readSTHeader(b []byte) (firstSTKnown bool, firstSTChangeOn uint8) { } // XOR2Chunk holds XOR2 encoded samples with optional start -// timestamp per chunk or per sample. See XOROptST for the ST header format. +// timestamp per chunk or per sample. type XOR2Chunk struct { b bstream } diff --git a/tsdb/db_append_v2_test.go b/tsdb/db_append_v2_test.go index 1b05a1280e..149331b438 100644 --- a/tsdb/db_append_v2_test.go +++ b/tsdb/db_append_v2_test.go @@ -7505,7 +7505,7 @@ func TestAbortBlockCompactions_AppendV2(t *testing.T) { } // TestCompactHeadWithSTStorage_AppendV2 ensures that when EnableSTStorage is true, -// compacted blocks contain chunks with EncXOROptST encoding for float samples. +// compacted blocks contain chunks with EncXOR2 encoding for float samples. 
func TestCompactHeadWithSTStorage_AppendV2(t *testing.T) { t.Parallel() From f7c60bf97ef3c305157751eafd3531e8f66ff755 Mon Sep 17 00:00:00 2001 From: Bartlomiej Plotka Date: Tue, 10 Mar 2026 15:55:40 +0000 Subject: [PATCH 45/73] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Bartlomiej Plotka --- compliance/remote_write_sender_test.go | 2 +- tsdb/agent/db.go | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/compliance/remote_write_sender_test.go b/compliance/remote_write_sender_test.go index f6ddea1b9a..9822e2d3e6 100644 --- a/compliance/remote_write_sender_test.go +++ b/compliance/remote_write_sender_test.go @@ -95,7 +95,7 @@ var _ sender.Sender = internalPrometheus{} // TestRemoteWriteSender runs remote write sender compliance tests defined in // https://github.com/prometheus/compliance/tree/main/remotewrite/sender against -// both agent and sever modes. +// both agent and server modes. func TestRemoteWriteSender(t *testing.T) { t.Run("mode=server", func(t *testing.T) { t.Parallel() diff --git a/tsdb/agent/db.go b/tsdb/agent/db.go index f1d0aff18f..a5d0879ed9 100644 --- a/tsdb/agent/db.go +++ b/tsdb/agent/db.go @@ -95,7 +95,9 @@ type Options struct { // EnableSTStorage determines whether agent DB should write a Start Timestamp (ST) // per sample to WAL. - // Represents 'st-storage' feature flag. + // Controlled by the `--enable-feature=st-storage` CLI flag; when enabled, ST is + // persisted to the WAL for samples that include a non-zero start timestamp in + // supported record types. 
EnableSTStorage bool } From 3cf43337dcb483cfa4cc7348f8e247a156101747 Mon Sep 17 00:00:00 2001 From: bwplotka Date: Thu, 12 Mar 2026 08:28:45 +0000 Subject: [PATCH 46/73] post merge conflict fixes Signed-off-by: bwplotka --- storage/remote/queue_manager_test.go | 18 +- tsdb/wlog/watcher_test.go | 297 ++++++++++++++------------- util/testwal/records.go | 14 +- 3 files changed, 170 insertions(+), 159 deletions(-) diff --git a/storage/remote/queue_manager_test.go b/storage/remote/queue_manager_test.go index 5c97a170f6..b0a5627e2f 100644 --- a/storage/remote/queue_manager_test.go +++ b/storage/remote/queue_manager_test.go @@ -608,7 +608,7 @@ func TestReshardPartialBatch(t *testing.T) { for _, protoMsg := range []remoteapi.WriteMessageType{remoteapi.WriteV1MessageType, remoteapi.WriteV2MessageType} { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { recs := testwal.GenerateRecords(recCase{ - NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. Series: 1, SamplesPerSeries: 10, }) @@ -656,8 +656,8 @@ func TestQueueFilledDeadlock(t *testing.T) { for _, protoMsg := range []remoteapi.WriteMessageType{remoteapi.WriteV1MessageType, remoteapi.WriteV2MessageType} { t.Run(fmt.Sprint(protoMsg), func(t *testing.T) { recs := testwal.GenerateRecords(recCase{ - NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. - Series: 50, SamplesPerSeries: 1 + NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + Series: 50, SamplesPerSeries: 1, }) c := NewNopWriteClient() @@ -1920,7 +1920,7 @@ func TestDropOldTimeSeries(t *testing.T) { nSamples := config.DefaultQueueConfig.Capacity * size noST := protoMsg == remoteapi.WriteV1MessageType // RW1 pastRecs := testwal.GenerateRecords(recCase{ - NoST: noST, + NoST: noST, Series: nSeries, SamplesPerSeries: (nSamples / nSeries) / 2, // Half data is past. 
TsFn: func(_, j int) int64 { @@ -1929,7 +1929,7 @@ func TestDropOldTimeSeries(t *testing.T) { }, }) newRecs := testwal.GenerateRecords(recCase{ - NoST: noST, + NoST: noST, Series: nSeries, SamplesPerSeries: (nSamples / nSeries) / 2, // Half data is past. TsFn: func(_, j int) int64 { @@ -2004,7 +2004,7 @@ func TestSendSamplesWithBackoffWithSampleAgeLimit(t *testing.T) { r := rand.New(rand.NewSource(99)) recs := testwal.GenerateRecords(recCase{ - NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. Series: numberOfSeries, SamplesPerSeries: 1, TsFn: func(_, _ int) int64 { @@ -2031,8 +2031,8 @@ func TestSendSamplesWithBackoffWithSampleAgeLimit(t *testing.T) { tsID := getSeriesIDFromRef(recs.Series[s.Ref]) c.expectedSamples[tsID] = append(c.expectedSamples[tsID], writev2.Sample{ StartTimestamp: s.ST, - Timestamp: s.T, - Value: s.V, + Timestamp: s.T, + Value: s.V, }) } } @@ -2554,7 +2554,7 @@ func TestHighestTimestampOnAppend(t *testing.T) { nSamples := 11 * config.DefaultQueueConfig.Capacity nSeries := 3 recs := testwal.GenerateRecords(recCase{ - NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. + NoST: protoMsg == remoteapi.WriteV1MessageType, // RW1 does not support ST. 
Series: nSeries, SamplesPerSeries: nSamples / nSeries, }) diff --git a/tsdb/wlog/watcher_test.go b/tsdb/wlog/watcher_test.go index 5c63fd3d92..6c82ec8dcb 100644 --- a/tsdb/wlog/watcher_test.go +++ b/tsdb/wlog/watcher_test.go @@ -194,145 +194,145 @@ func TestWatcher_Tail(t *testing.T) { exemplarsPerSeries = 2 ) for _, enableSTStorage := range []bool{false, true} { - for _, compress := range compression.Types() { - t.Run(fmt.Sprintf("compress=%s/stStorage=%v", compress, enableSTStorage), func(t *testing.T) { - var ( - now = time.Now() - dir = t.TempDir() - wdir = path.Join(dir, "wal") - enc = record.Encoder{EnableSTStorage: enableSTStorage} - ) - require.NoError(t, os.Mkdir(wdir, 0o777)) + for _, compress := range compression.Types() { + t.Run(fmt.Sprintf("compress=%s/stStorage=%v", compress, enableSTStorage), func(t *testing.T) { + var ( + now = time.Now() + dir = t.TempDir() + wdir = path.Join(dir, "wal") + enc = record.Encoder{EnableSTStorage: enableSTStorage} + ) + require.NoError(t, os.Mkdir(wdir, 0o777)) - // Generate test records that represents batches of records data. - // "batch" simulates a single scrape or RW/OTLP receive message. - // Watcher does not inspect the data other than watching start timestamp, so records - // does not need any certain shape. - records := make([]testwal.Records, batches) - cbHistogramRecords := make([]testwal.Records, batches) - for i := range records { - tsFn := func(_, _ int) int64 { - return timestamp.FromTime(now.Add(1 * time.Second)) + // Generate test records that represents batches of records data. + // "batch" simulates a single scrape or RW/OTLP receive message. + // Watcher does not inspect the data other than watching start timestamp, so records + // does not need any certain shape. 
+ records := make([]testwal.Records, batches) + cbHistogramRecords := make([]testwal.Records, batches) + for i := range records { + tsFn := func(_, _ int) int64 { + return timestamp.FromTime(now.Add(1 * time.Second)) + } + records[i] = testwal.GenerateRecords(testwal.RecordsCase{ + NoST: !enableSTStorage, + RefPadding: i * seriesPerBatch, + TsFn: tsFn, + + Series: seriesPerBatch, + SamplesPerSeries: 10, + HistogramsPerSeries: 5, + FloatHistogramsPerSeries: 5, + ExemplarsPerSeries: exemplarsPerSeries, + }) + cbHistogramRecords[i] = testwal.GenerateRecords(testwal.RecordsCase{ + NoST: !enableSTStorage, + RefPadding: i * seriesPerBatch, + TsFn: tsFn, + + Series: seriesPerBatch, + HistogramsPerSeries: 5, + FloatHistogramsPerSeries: 5, + HistogramFn: func(ref int) *histogram.Histogram { + return &histogram.Histogram{ + Schema: -53, + ZeroThreshold: 1e-128, + ZeroCount: 0, + Count: 2, + Sum: 0, + PositiveSpans: []histogram.Span{{Offset: 0, Length: 1}}, + CustomValues: []float64{float64(ref) + 2}, + } + }, + }) } - records[i] = testwal.GenerateRecords(testwal.RecordsCase{ - NoST: !enableSTStorage, - RefPadding: i * seriesPerBatch, - TsFn: tsFn, - Series: seriesPerBatch, - SamplesPerSeries: 10, - HistogramsPerSeries: 5, - FloatHistogramsPerSeries: 5, - ExemplarsPerSeries: exemplarsPerSeries, + // Create WAL for writing. 
+ w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) + require.NoError(t, err) + t.Cleanup(func() { + require.NoError(t, w.Close()) }) - cbHistogramRecords[i] = testwal.GenerateRecords(testwal.RecordsCase{ - NoST: !enableSTStorage, - RefPadding: i * seriesPerBatch, - TsFn: tsFn, - Series: seriesPerBatch, - HistogramsPerSeries: 5, - FloatHistogramsPerSeries: 5, - HistogramFn: func(ref int) *histogram.Histogram { - return &histogram.Histogram{ - Schema: -53, - ZeroThreshold: 1e-128, - ZeroCount: 0, - Count: 2, - Sum: 0, - PositiveSpans: []histogram.Span{{Offset: 0, Length: 1}}, - CustomValues: []float64{float64(ref) + 2}, - } - }, - }) - } + // Start a watcher that reads into a mock. + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "test", wt, dir, true, true, true, nil) + // Update the time because we just created samples around "now" time and watcher + // only starts watching after that time. + watcher.SetStartTime(now) + // Start spins up watcher loop in a go-routine. + watcher.Start() + t.Cleanup(watcher.Stop) - // Create WAL for writing. - w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) - require.NoError(t, err) - t.Cleanup(func() { - require.NoError(t, w.Close()) - }) + // Write to WAL like append commit would do, while watcher is tailing. - // Start watcher to that reads into a mock. - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "test", wt, dir, true, true, true, nil) - // Update the time because we just created samples around "now" time and watcher - // only starts watching after that time. - watcher.SetStartTime(now) - // Start spins up watcher loop in a go-routine. - watcher.Start() - t.Cleanup(watcher.Stop) + // Write first a few samples before the start time, we don't expect those to be appended.
+ require.NoError(t, w.Log(enc.Samples([]record.RefSample{ + {Ref: 1, T: timestamp.FromTime(now), V: 123}, + {Ref: 2, T: timestamp.FromTime(now), V: 123.1}, + }, nil))) - // Write to WAL like append commit would do, while watcher is tailing. + for i := range records { + // Similar order as tsdb/head_appender.go.headAppenderBase.log + // https://github.com/prometheus/prometheus/blob/1751685dd4f6430757ba3078a96cffeffcb2bb47/tsdb/head_append.go#L1053 + require.NoError(t, w.Log(enc.Series(records[i].Series, nil))) + require.NoError(t, w.Log(enc.Metadata(records[i].Metadata, nil))) + require.NoError(t, w.Log(enc.Samples(records[i].Samples, nil))) - // Write first a few samples before the start time, we don't expect those to be appended. - require.NoError(t, w.Log(enc.Samples([]record.RefSample{ - {Ref: 1, T: timestamp.FromTime(now), V: 123}, - {Ref: 2, T: timestamp.FromTime(now), V: 123.1}, - }, nil))) + hs, cbHs := enc.HistogramSamples(records[i].Histograms, nil) + require.Empty(t, cbHs) + require.NoError(t, w.Log(hs)) + fhs, cbFhs := enc.FloatHistogramSamples(records[i].FloatHistograms, nil) + require.Empty(t, cbFhs) + require.NoError(t, w.Log(fhs)) + require.NoError(t, w.Log(enc.CustomBucketsHistogramSamples(cbHistogramRecords[i].Histograms, nil))) + require.NoError(t, w.Log(enc.CustomBucketsFloatHistogramSamples(cbHistogramRecords[i].FloatHistograms, nil))) - for i := range records { - // Similar order as tsdb/head_appender.go.headAppenderBase.log - // https://github.com/prometheus/prometheus/blob/1751685dd4f6430757ba3078a96cffeffcb2bb47/tsdb/head_append.go#L1053 - require.NoError(t, w.Log(enc.Series(records[i].Series, nil))) - require.NoError(t, w.Log(enc.Metadata(records[i].Metadata, nil))) - require.NoError(t, w.Log(enc.Samples(records[i].Samples, nil))) + require.NoError(t, w.Log(enc.Exemplars(records[i].Exemplars, nil))) - hs, cbHs := enc.HistogramSamples(records[i].Histograms, nil) - require.Empty(t, cbHs) - require.NoError(t, w.Log(hs)) - fhs, cbFhs := 
enc.FloatHistogramSamples(records[i].FloatHistograms, nil) - require.Empty(t, cbFhs) - require.NoError(t, w.Log(fhs)) - require.NoError(t, w.Log(enc.CustomBucketsHistogramSamples(cbHistogramRecords[i].Histograms, nil))) - require.NoError(t, w.Log(enc.CustomBucketsFloatHistogramSamples(cbHistogramRecords[i].FloatHistograms, nil))) + // Ping watcher for faster test. Watcher is checking for segment changes or 15s timeout. + watcher.Notify() + } - require.NoError(t, w.Log(enc.Exemplars(records[i].Exemplars, nil))) + // Wait for the watcher to read all records. + require.Eventually(t, func() bool { + wt.mu.Lock() + defer wt.mu.Unlock() - // Ping watcher for faster test. Watcher is checking for segment changes or 15s timeout. - watcher.Notify() - } + // Exemplars are logged as the last one, so assert on those. + return wt.exemplarAppends >= batches + }, 2*time.Minute, 1*time.Second) - // Wait for watcher to lead all. - require.Eventually(t, func() bool { wt.mu.Lock() defer wt.mu.Unlock() - // Exemplars are logged as the last one,
- return wt.exemplarAppends >= batches - }, 2*time.Minute, 1*time.Second) + require.Equal(t, batches, wt.seriesStores) + require.Equal(t, batches, wt.metadataStores) + require.Equal(t, batches, wt.sampleAppends) + require.Equal(t, 2*batches, wt.histogramAppends) + require.Equal(t, 2*batches, wt.floatHistogramsAppends) + require.Equal(t, batches, wt.exemplarAppends) - wt.mu.Lock() - defer wt.mu.Unlock() + for i := range batches { + sector := len(records[i].Series) + testutil.RequireEqual(t, records[i].Series, wt.seriesStored[i*sector:(i+1)*sector], i) + sector = len(records[i].Metadata) + require.Equal(t, records[i].Metadata, wt.metadataStored[i*sector:(i+1)*sector], i) + sector = len(records[i].Samples) + require.Equal(t, records[i].Samples, wt.samplesAppended[i*sector:(i+1)*sector], i) - require.Equal(t, batches, wt.seriesStores) - require.Equal(t, batches, wt.metadataStores) - require.Equal(t, batches, wt.sampleAppends) - require.Equal(t, 2*batches, wt.histogramAppends) - require.Equal(t, 2*batches, wt.floatHistogramsAppends) - require.Equal(t, batches, wt.exemplarAppends) + sector = len(records[i].Histograms) + len(cbHistogramRecords[i].Histograms) + require.Equal(t, records[i].Histograms, wt.histogramsAppended[i*sector:i*sector+len(records[i].Histograms)], i) + require.Equal(t, cbHistogramRecords[i].Histograms, wt.histogramsAppended[i*sector+len(records[i].Histograms):(i+1)*sector]) + sector = len(records[i].FloatHistograms) + len(cbHistogramRecords[i].FloatHistograms) + require.Equal(t, records[i].FloatHistograms, wt.floatHistogramsAppended[i*sector:i*sector+len(records[i].FloatHistograms)]) + require.Equal(t, cbHistogramRecords[i].FloatHistograms, wt.floatHistogramsAppended[i*sector+len(records[i].FloatHistograms):(i+1)*sector]) - for i := range batches { - sector := len(records[i].Series) - testutil.RequireEqual(t, records[i].Series, wt.seriesStored[i*sector:(i+1)*sector], i) - sector = len(records[i].Metadata) - require.Equal(t, records[i].Metadata, 
wt.metadataStored[i*sector:(i+1)*sector], i) - sector = len(records[i].Samples) - require.Equal(t, records[i].Samples, wt.samplesAppended[i*sector:(i+1)*sector], i) - - sector = len(records[i].Histograms) + len(cbHistogramRecords[i].Histograms) - require.Equal(t, records[i].Histograms, wt.histogramsAppended[i*sector:i*sector+len(records[i].Histograms)], i) - require.Equal(t, cbHistogramRecords[i].Histograms, wt.histogramsAppended[i*sector+len(records[i].Histograms):(i+1)*sector]) - sector = len(records[i].FloatHistograms) + len(cbHistogramRecords[i].FloatHistograms) - require.Equal(t, records[i].FloatHistograms, wt.floatHistogramsAppended[i*sector:i*sector+len(records[i].FloatHistograms)]) - require.Equal(t, cbHistogramRecords[i].FloatHistograms, wt.floatHistogramsAppended[i*sector+len(records[i].FloatHistograms):(i+1)*sector]) - - sector = len(records[i].Exemplars) - testutil.RequireEqual(t, records[i].Exemplars, wt.exemplarsAppended[i*sector:(i+1)*sector]) - } - }) - } + sector = len(records[i].Exemplars) + testutil.RequireEqual(t, records[i].Exemplars, wt.exemplarsAppended[i*sector:(i+1)*sector]) + } + }) + } } } @@ -390,9 +390,9 @@ func TestReadToEndNoCheckpoint(t *testing.T) { _, _, err = Segments(w.Dir()) require.NoError(t, err) - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) - go watcher.Start() + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + go watcher.Start() expected := seriesCount require.Eventually(t, func() bool { @@ -478,12 +478,12 @@ func TestReadToEndWithCheckpoint(t *testing.T) { } } - _, _, err = Segments(w.Dir()) - require.NoError(t, err) - overwriteReadTimeout(t, time.Second) - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) - go watcher.Start() + _, _, err = Segments(w.Dir()) + require.NoError(t, err) + overwriteReadTimeout(t, time.Second) + wt := newWriteToMock(0) 
+ watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + go watcher.Start() expected := seriesCount * 2 @@ -554,9 +554,9 @@ func TestReadCheckpoint(t *testing.T) { _, _, err = Segments(w.Dir()) require.NoError(t, err) - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) - go watcher.Start() + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + go watcher.Start() expectedSeries := seriesCount retry(t, defaultRetryInterval, defaultRetries, func() bool { @@ -625,9 +625,9 @@ func TestReadCheckpointMultipleSegments(t *testing.T) { require.NoError(t, err) } - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) - watcher.MaxSegment = -1 + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + watcher.MaxSegment = -1 // Set the Watcher's metrics so they're not nil pointers. watcher.SetMetrics() @@ -705,7 +705,7 @@ func TestCheckpointSeriesReset(t *testing.T) { overwriteReadTimeout(t, time.Second) wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, subdir, false, false, false, nil) watcher.MaxSegment = -1 go watcher.Start() @@ -784,9 +784,9 @@ func TestRun_StartupTime(t *testing.T) { } require.NoError(t, w.Close()) - wt := newWriteToMock(0) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) - watcher.MaxSegment = segments + wt := newWriteToMock(0) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + watcher.MaxSegment = segments watcher.SetMetrics() startTime := time.Now() @@ -856,11 +856,11 @@ func TestRun_AvoidNotifyWhenBehind(t *testing.T) { // Create 00000001, the watcher will tail it once started. 
w.NextSegment() - // Set up the watcher and run it in the background. - wt := newWriteToMock(time.Millisecond) - watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) - watcher.SetMetrics() - watcher.MaxSegment = segmentsToRead + // Set up the watcher and run it in the background. + wt := newWriteToMock(time.Millisecond) + watcher := NewWatcher(wMetrics, nil, nil, "", wt, dir, false, false, false, nil) + watcher.SetMetrics() + watcher.MaxSegment = segmentsToRead var g errgroup.Group g.Go(func() error { @@ -892,10 +892,11 @@ func TestRun_AvoidNotifyWhenBehind(t *testing.T) { // Wait for the watcher. require.NoError(t, g.Wait()) - // All series and samples were read. - require.Equal(t, (segmentsToRead+1)*seriesCount, wt.checkNumSeries()) // Series from 00000000 are also read. - require.Len(t, wt.samplesAppended, segmentsToRead*seriesCount*samplesCount) - require.NoError(t, w.Close()) - }) + // All series and samples were read. + require.Equal(t, (segmentsToRead+1)*seriesCount, wt.checkNumSeries()) // Series from 00000000 are also read. + require.Len(t, wt.samplesAppended, segmentsToRead*seriesCount*samplesCount) + require.NoError(t, w.Close()) + }) + } } } diff --git a/util/testwal/records.go b/util/testwal/records.go index 5f85e42c3c..1fe5938461 100644 --- a/util/testwal/records.go +++ b/util/testwal/records.go @@ -48,6 +48,8 @@ type RecordsCase struct { // HistogramFn source histogram for histogram and float histogram records. // By default, newTestHist is used (exponential bucketing) HistogramFn func(ref int) *histogram.Histogram + // NoST controls if ref samples should skip generating Start Timestamps. If true, ST is 0. + NoST bool } // Records represents batches of generated WAL records. @@ -118,10 +120,18 @@ func GenerateRecords(c RecordsCase) (ret Records) { Help: fmt.Sprintf("help text for %d", ref), } for j := range c.SamplesPerSeries { + ts := c.TsFn(ref, j) + // Keep ST simple for now; we don't test the exact semantics. 
+ // We can improve later (e.g. STsFN). + sts := ts - 1 + if c.NoST { + sts = 0 + } + ret.Samples[i*c.SamplesPerSeries+j] = record.RefSample{ Ref: chunks.HeadSeriesRef(ref), - T: c.TsFn(ref, j), - V: float64(ref), + ST: sts, T: ts, + V: float64(ref), } } h := c.HistogramFn(ref) From 0dac72ee94f5bd35c84ba204009d36b06cc8b2e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 12 Mar 2026 16:01:04 +0100 Subject: [PATCH 47/73] feat(tsdb): register st_storage in feature API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Register the st-storage feature flag in the feature registry via the TSDB options, consistent with how other TSDB features like exemplar_storage and delayed_compaction are registered. Signed-off-by: György Krajcsovits Coded with Claude Sonnet 4.6. --- cmd/prometheus/testdata/features.json | 1 + tsdb/db.go | 1 + 2 files changed, 2 insertions(+) diff --git a/cmd/prometheus/testdata/features.json b/cmd/prometheus/testdata/features.json index 60e6b65b40..e68b7def1e 100644 --- a/cmd/prometheus/testdata/features.json +++ b/cmd/prometheus/testdata/features.json @@ -251,6 +251,7 @@ "exemplar_storage": false, "isolation": true, "native_histograms": true, + "st_storage": false, "use_uncached_io": false }, "ui": { diff --git a/tsdb/db.go b/tsdb/db.go index c793ace522..8c1fffbed4 100644 --- a/tsdb/db.go +++ b/tsdb/db.go @@ -868,6 +868,7 @@ func Open(dir string, l *slog.Logger, r prometheus.Registerer, opts *Options, st opts.FeatureRegistry.Set(features.TSDB, "isolation", !opts.IsolationDisabled) opts.FeatureRegistry.Set(features.TSDB, "use_uncached_io", opts.UseUncachedIO) opts.FeatureRegistry.Enable(features.TSDB, "native_histograms") + opts.FeatureRegistry.Set(features.TSDB, "st_storage", opts.EnableSTStorage) } return open(dir, l, r, opts, rngs, stats) From 7694434044567e443153b9d9975c7983646d86b4 Mon Sep 17 00:00:00 2001 From: Carrie Edwards Date: Wed, 11 Mar 2026 09:26:11 -0700 Subject: 
[PATCH 48/73] Document xor2-encoding feature flag Signed-off-by: Carrie Edwards --- cmd/prometheus/main.go | 10 ++++++++-- cmd/prometheus/testdata/features.json | 1 + docs/command-line/prometheus.md | 2 +- docs/feature_flags.md | 17 +++++++++++++++++ 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 2901707709..698618e929 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -281,16 +281,20 @@ func (c *flagConfig) setFeatureListOptions(logger *slog.Logger) error { config.DefaultConfig.GlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols config.DefaultGlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols logger.Info("Experimental start timestamp zero ingestion enabled. OpenMetrics 1.0 parsing will parse _created metrics as ST instead of normal sample. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) + case "xor2-encoding": + c.tsdb.EnableXOR2Encoding = true + logger.Info("Experimental XOR2 chunk encoding enabled.") case "st-storage": // TODO(bwplotka): Implement ST Storage as per PROM-60 and document this hidden feature flag. c.scrape.ParseST = true c.tsdb.EnableSTStorage = true + c.tsdb.EnableXOR2Encoding = true // Set chunk encoding type to XOR2 for samples with ST c.agent.EnableSTStorage = true // Change relevant global variables. Hacky, but it's hard to pass a new option or default to unmarshallers. This is to widen the ST support surface. config.DefaultConfig.GlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols config.DefaultGlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols - logger.Info("Experimental start timestamp storage enabled. OpenMetrics 1.0 parsing will parse _created metrics as ST instead of normal sample. 
Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) + logger.Info("Experimental start timestamp storage enabled. OpenMetrics 1.0 parsing will parse _created metrics as ST instead of normal sample. Changed default scrape_protocols to prefer PrometheusProto format. Changed chunk encoding type to XOR2.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) case "delayed-compaction": c.tsdb.EnableDelayedCompaction = true logger.Info("Experimental delayed compaction is enabled.") @@ -603,7 +607,7 @@ func main() { a.Flag("scrape.discovery-reload-interval", "Interval used by scrape manager to throttle target groups updates."). Hidden().Default("5s").SetValue(&cfg.scrape.DiscoveryReloadInterval) - a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details."). + a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers, xor2-encoding. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details."). 
Default("").StringsVar(&cfg.featureList) a.Flag("agent", "Run Prometheus in 'Agent mode'.").BoolVar(&agentMode) @@ -2011,6 +2015,7 @@ type tsdbOptions struct { BlockReloadInterval model.Duration EnableSTAsZeroSample bool EnableSTStorage bool + EnableXOR2Encoding bool StaleSeriesCompactionThreshold float64 } @@ -2041,6 +2046,7 @@ func (opts tsdbOptions) ToTSDBOptions() tsdb.Options { FeatureRegistry: features.DefaultRegistry, EnableSTAsZeroSample: opts.EnableSTAsZeroSample, EnableSTStorage: opts.EnableSTStorage, + EnableXOR2Encoding: opts.EnableXOR2Encoding, StaleSeriesCompactionThreshold: opts.StaleSeriesCompactionThreshold, } } diff --git a/cmd/prometheus/testdata/features.json b/cmd/prometheus/testdata/features.json index e68b7def1e..d30b3b382f 100644 --- a/cmd/prometheus/testdata/features.json +++ b/cmd/prometheus/testdata/features.json @@ -252,6 +252,7 @@ "isolation": true, "native_histograms": true, "st_storage": false, + "xor2_encoding": false, "use_uncached_io": false }, "ui": { diff --git a/docs/command-line/prometheus.md b/docs/command-line/prometheus.md index 251fdfd6a4..23c4ac4b59 100644 --- a/docs/command-line/prometheus.md +++ b/docs/command-line/prometheus.md @@ -59,7 +59,7 @@ The Prometheus monitoring server | --query.timeout | Maximum time a query may take before being aborted. Use with server mode only. | `2m` | | --query.max-concurrency | Maximum number of queries executed concurrently. Use with server mode only. | `20` | | --query.max-samples | Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return. Use with server mode only. | `50000000` | -| --enable-feature ... | Comma separated feature names to enable. 
Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | | +| --enable-feature ... | Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers, xor2-encoding. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | | | --agent | Run Prometheus in 'Agent mode'. | | | --log.level | Only log messages with the given severity or above. One of: [debug, info, warn, error] | `info` | | --log.format | Output format of log messages. One of: [logfmt, json] | `logfmt` | diff --git a/docs/feature_flags.md b/docs/feature_flags.md index 45d14b72db..d707cbec50 100644 --- a/docs/feature_flags.md +++ b/docs/feature_flags.md @@ -306,6 +306,23 @@ This is currently implemented using direct I/O. For more details, see the [proposal](https://github.com/prometheus/proposals/pull/45). +## XOR2 chunk encoding + +`--enable-feature=xor2-encoding` + +Enables the XOR2 chunk encoding for float samples, which provides better +disk compression than the default XOR encoding for typical Prometheus workloads. XOR2 reduces chunk storage size by +5-20% in typical Prometheus deployments through: + +1. 
**Staleness marker optimization**: When a scrape target disappears, Prometheus writes a special "staleness marker" to indicate the series has ended. Standard XOR encoding uses ~110 bits per marker; XOR2 uses only 13 bits—a **90% reduction**. For deployments with dynamic infrastructure (Kubernetes pods, auto-scaling, short-lived jobs), staleness markers can represent 10-40% of all samples. +2. **Adaptive timestamp encoding**: XOR2 automatically optimizes for both regular and irregular scrape intervals. Regular data gets the same compression as standard XOR (zero overhead), while irregular data benefits from improved encoding that handles larger timestamp variations more efficiently. + +Chunks encoded with XOR2 **cannot be read by older Prometheus versions** that do not support +the encoding. Once enabled and data is written, downgrading requires waiting for +all XOR2 chunks to be compacted out of retention. + +This feature is automatically enabled when `st-storage` is enabled. + ## Extended Range Selectors `--enable-feature=promql-extended-range-selectors` From a679ab5eb4ed91ba356301bf581c7ff144803498 Mon Sep 17 00:00:00 2001 From: Carrie Edwards Date: Wed, 11 Mar 2026 09:26:37 -0700 Subject: [PATCH 49/73] Add xor2-encoding feature flag Signed-off-by: Carrie Edwards --- tsdb/chunkenc/chunk.go | 10 +++--- tsdb/db.go | 10 ++++++ tsdb/head.go | 5 +++ tsdb/head_append.go | 14 ++++---- tsdb/head_append_v2.go | 1 + tsdb/head_append_v2_test.go | 4 ++- tsdb/head_test.go | 65 ++++++++++++++++++++----------------- tsdb/head_wal.go | 6 ++-- tsdb/ooo_head.go | 4 +-- tsdb/ooo_head_read.go | 10 +++--- tsdb/ooo_head_test.go | 20 ++++++------ 11 files changed, 88 insertions(+), 61 deletions(-) diff --git a/tsdb/chunkenc/chunk.go b/tsdb/chunkenc/chunk.go index d1b14568ef..22d4d160e6 100644 --- a/tsdb/chunkenc/chunk.go +++ b/tsdb/chunkenc/chunk.go @@ -191,10 +191,10 @@ func (v ValueType) String() string { } } -func (v ValueType) ChunkEncoding(storeST bool) Encoding { +func (v 
ValueType) ChunkEncoding(useXOR2 bool) Encoding { switch v { case ValFloat: - if storeST { + if useXOR2 { return EncXOR2 } return EncXOR @@ -207,10 +207,12 @@ func (v ValueType) ChunkEncoding(storeST bool) Encoding { } } -func (v ValueType) NewChunk(storeST bool) (Chunk, error) { - return NewEmptyChunk(v.ChunkEncoding(storeST)) +// NewChunk returns a new empty chunk for the given value type. +func (v ValueType) NewChunk(useXOR2 bool) (Chunk, error) { + return NewEmptyChunk(v.ChunkEncoding(useXOR2)) } + // MockSeriesIterator returns an iterator for a mock series with custom // start timestamp, timestamps, and values. // Start timestamps is optional, pass nil or empty slice to indicate no start diff --git a/tsdb/db.go b/tsdb/db.go index 8c1fffbed4..711f4e541c 100644 --- a/tsdb/db.go +++ b/tsdb/db.go @@ -240,6 +240,11 @@ type Options struct { // is implemented. EnableSTAsZeroSample bool + // EnableXOR2Encoding enables the XOR2 chunk encoding for float samples. + // XOR2 provides better compression than XOR, especially for stale markers. + // Automatically set to true when EnableSTStorage is true. + EnableXOR2Encoding bool + // EnableSTStorage determines whether TSDB should write a Start Timestamp (ST) // per sample to WAL. // TODO(bwplotka): Implement this option as per PROM-60, currently it's noop. 
@@ -869,6 +874,7 @@ func Open(dir string, l *slog.Logger, r prometheus.Registerer, opts *Options, st opts.FeatureRegistry.Set(features.TSDB, "use_uncached_io", opts.UseUncachedIO) opts.FeatureRegistry.Enable(features.TSDB, "native_histograms") opts.FeatureRegistry.Set(features.TSDB, "st_storage", opts.EnableSTStorage) + opts.FeatureRegistry.Set(features.TSDB, "xor2_encoding", opts.EnableXOR2Encoding) } return open(dir, l, r, opts, rngs, stats) @@ -1075,7 +1081,11 @@ func open(dir string, l *slog.Logger, r prometheus.Registerer, opts *Options, rn headOpts.OutOfOrderCapMax.Store(opts.OutOfOrderCapMax) headOpts.EnableSharding = opts.EnableSharding headOpts.EnableSTAsZeroSample = opts.EnableSTAsZeroSample + if opts.EnableSTStorage { + opts.EnableXOR2Encoding = true + } headOpts.EnableSTStorage.Store(opts.EnableSTStorage) + headOpts.EnableXOR2Encoding.Store(opts.EnableXOR2Encoding) headOpts.EnableMetadataWALRecords = opts.EnableMetadataWALRecords if opts.WALReplayConcurrency > 0 { headOpts.WALReplayConcurrency = opts.WALReplayConcurrency diff --git a/tsdb/head.go b/tsdb/head.go index ce95cbba77..0052ddd72c 100644 --- a/tsdb/head.go +++ b/tsdb/head.go @@ -166,6 +166,11 @@ type HeadOptions struct { // Represents 'st-storage' feature flag. EnableSTStorage atomic.Bool + // EnableXOR2Encoding enables XOR2 chunk encoding for float samples. + // Represents 'xor2-encoding' feature flag. Automatically true when + // EnableSTStorage is true. + EnableXOR2Encoding atomic.Bool + ChunkRange int64 // ChunkDirRoot is the parent directory of the chunks directory. 
ChunkDirRoot string diff --git a/tsdb/head_append.go b/tsdb/head_append.go index adb11ec076..c7143d8d96 100644 --- a/tsdb/head_append.go +++ b/tsdb/head_append.go @@ -186,6 +186,7 @@ func (h *Head) appender() *headAppender { appendID: appendID, cleanupAppendIDsBelow: cleanupAppendIDsBelow, storeST: h.opts.EnableSTStorage.Load(), + useXOR2: h.opts.EnableXOR2Encoding.Load(), }, } } @@ -414,6 +415,7 @@ type headAppenderBase struct { appendID, cleanupAppendIDsBelow uint64 closed bool storeST bool + useXOR2 bool } type headAppender struct { headAppenderBase @@ -1747,7 +1749,7 @@ func (a *headAppenderBase) Commit() (err error) { chunkDiskMapper: h.chunkDiskMapper, chunkRange: h.chunkRange.Load(), samplesPerChunk: h.opts.SamplesPerChunk, - storeST: a.storeST, + useXOR2: a.useXOR2, }, oooEnc: record.Encoder{ EnableSTStorage: a.storeST, @@ -1834,7 +1836,7 @@ type chunkOpts struct { chunkDiskMapper *chunks.ChunkDiskMapper chunkRange int64 samplesPerChunk int - storeST bool + useXOR2 bool // Selects XOR2 encoding for float chunks. } // append adds the sample (t, v) to the series. The caller also has to provide @@ -1842,7 +1844,7 @@ type chunkOpts struct { // isolation for this append.) // Series lock must be held when calling. func (s *memSeries) append(st, t int64, v float64, appendID uint64, o chunkOpts) (sampleInOrder, chunkCreated bool) { - c, sampleInOrder, chunkCreated := s.appendPreprocessor(t, chunkenc.ValFloat.ChunkEncoding(o.storeST), o) + c, sampleInOrder, chunkCreated := s.appendPreprocessor(t, chunkenc.ValFloat.ChunkEncoding(o.useXOR2), o) if !sampleInOrder { return sampleInOrder, chunkCreated } @@ -1873,7 +1875,7 @@ func (s *memSeries) appendHistogram(st, t int64, h *histogram.Histogram, appendI // Ignoring ok is ok, since we don't want to compare to the wrong previous appender anyway. 
prevApp, _ := s.app.(*chunkenc.HistogramAppender) - c, sampleInOrder, chunkCreated := s.histogramsAppendPreprocessor(t, chunkenc.ValHistogram.ChunkEncoding(o.storeST), o) + c, sampleInOrder, chunkCreated := s.histogramsAppendPreprocessor(t, chunkenc.ValHistogram.ChunkEncoding(o.useXOR2), o) if !sampleInOrder { return sampleInOrder, chunkCreated } @@ -1930,7 +1932,7 @@ func (s *memSeries) appendFloatHistogram(st, t int64, fh *histogram.FloatHistogr // Ignoring ok is ok, since we don't want to compare to the wrong previous appender anyway. prevApp, _ := s.app.(*chunkenc.FloatHistogramAppender) - c, sampleInOrder, chunkCreated := s.histogramsAppendPreprocessor(t, chunkenc.ValFloatHistogram.ChunkEncoding(o.storeST), o) + c, sampleInOrder, chunkCreated := s.histogramsAppendPreprocessor(t, chunkenc.ValFloatHistogram.ChunkEncoding(o.useXOR2), o) if !sampleInOrder { return sampleInOrder, chunkCreated } @@ -2187,7 +2189,7 @@ func (s *memSeries) mmapCurrentOOOHeadChunk(o chunkOpts, logger *slog.Logger) [] // OOO is not enabled or there is no head chunk, so nothing to m-map here. 
return nil } - chks, err := s.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, o.storeST) + chks, err := s.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, o.useXOR2) if err != nil { handleChunkWriteError(err) return nil diff --git a/tsdb/head_append_v2.go b/tsdb/head_append_v2.go index 40f5b0b102..29e19f4265 100644 --- a/tsdb/head_append_v2.go +++ b/tsdb/head_append_v2.go @@ -96,6 +96,7 @@ func (h *Head) appenderV2() *headAppenderV2 { appendID: appendID, cleanupAppendIDsBelow: cleanupAppendIDsBelow, storeST: h.opts.EnableSTStorage.Load(), + useXOR2: h.opts.EnableXOR2Encoding.Load(), }, } } diff --git a/tsdb/head_append_v2_test.go b/tsdb/head_append_v2_test.go index ec4f3f5857..9464a9ef66 100644 --- a/tsdb/head_append_v2_test.go +++ b/tsdb/head_append_v2_test.go @@ -2943,6 +2943,7 @@ func testWBLReplayAppenderV2(t *testing.T, scenario sampleTypeScenario, enableST opts.ChunkDirRoot = dir opts.OutOfOrderTimeWindow.Store(30 * time.Minute.Milliseconds()) opts.EnableSTStorage.Store(enableSTstorage) + opts.EnableXOR2Encoding.Store(enableSTstorage) h, err := NewHead(nil, nil, wal, oooWlog, opts, nil) require.NoError(t, err) @@ -2994,7 +2995,7 @@ func testWBLReplayAppenderV2(t *testing.T, scenario sampleTypeScenario, enableST require.False(t, ok) require.NotNil(t, ms) - chks, err := ms.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, h.opts.EnableSTStorage.Load()) + chks, err := ms.ooo.oooHeadChunk.chunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, h.opts.EnableXOR2Encoding.Load()) require.NoError(t, err) require.Len(t, chks, 1) @@ -4813,6 +4814,7 @@ func TestHeadAppenderV2_STStorage(t *testing.T) { t.Run(tc.name, func(t *testing.T) { opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) opts.EnableSTStorage.Store(true) + opts.EnableXOR2Encoding.Store(true) h, _ := newTestHeadWithOptions(t, compression.None, opts) lbls := labels.FromStrings("foo", "bar") diff --git a/tsdb/head_test.go b/tsdb/head_test.go 
index 1e5858b02b..e72a532cd7 100644 --- a/tsdb/head_test.go +++ b/tsdb/head_test.go @@ -2183,47 +2183,47 @@ func TestComputeChunkEndTime(t *testing.T) { } } -// TestMemSeries_append tests float appending with various storeST/st combinations. +// TestMemSeries_append tests float appending with various useXOR2/st combinations. func TestMemSeries_append(t *testing.T) { scenarios := []struct { name string - storeST bool + useXOR2 bool stFunc func(ts int64) int64 // Function to compute st from ts }{ { - name: "storeST=false st=0", - storeST: false, + name: "useXOR2=false st=0", + useXOR2: false, stFunc: func(_ int64) int64 { return 0 }, }, { - name: "storeST=true st=0", - storeST: true, + name: "useXOR2=true st=0", + useXOR2: true, stFunc: func(_ int64) int64 { return 0 }, }, { - name: "storeST=true st=ts", - storeST: true, + name: "useXOR2=true st=ts", + useXOR2: true, stFunc: func(ts int64) int64 { return ts }, }, { - name: "storeST=true st=ts-100", - storeST: true, + name: "useXOR2=true st=ts-100", + useXOR2: true, stFunc: func(ts int64) int64 { return ts - 100 }, }, { - name: "storeST=false st=ts (st ignored)", - storeST: false, + name: "useXOR2=false st=ts (st ignored)", + useXOR2: false, stFunc: func(ts int64) int64 { return ts }, }, } for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { - testMemSeriesAppend(t, scenario.storeST, scenario.stFunc) + testMemSeriesAppend(t, scenario.useXOR2, scenario.stFunc) }) } } -func testMemSeriesAppend(t *testing.T, storeST bool, stFunc func(ts int64) int64) { +func testMemSeriesAppend(t *testing.T, useXOR2 bool, stFunc func(ts int64) int64) { dir := t.TempDir() // This is usually taken from the Head, but passing manually here. 
chunkDiskMapper, err := chunks.NewChunkDiskMapper(nil, dir, chunkenc.NewPool(), chunks.DefaultWriteBufferSize, chunks.DefaultWriteQueueSize) @@ -2235,7 +2235,7 @@ func testMemSeriesAppend(t *testing.T, storeST bool, stFunc func(ts int64) int64 chunkDiskMapper: chunkDiskMapper, chunkRange: 500, samplesPerChunk: DefaultSamplesPerChunk, - storeST: storeST, + useXOR2: useXOR2, } s := newMemSeries(labels.Labels{}, 1, 0, defaultIsolationDisabled, false) @@ -2286,47 +2286,47 @@ func testMemSeriesAppend(t *testing.T, storeST bool, stFunc func(ts int64) int64 } } -// TestMemSeries_appendHistogram tests histogram appending with various storeST/st combinations. +// TestMemSeries_appendHistogram tests histogram appending with various useXOR2/st combinations. func TestMemSeries_appendHistogram(t *testing.T) { scenarios := []struct { name string - storeST bool + useXOR2 bool stFunc func(ts int64) int64 // Function to compute st from ts }{ { - name: "storeST=false st=0", - storeST: false, + name: "useXOR2=false st=0", + useXOR2: false, stFunc: func(_ int64) int64 { return 0 }, }, { - name: "storeST=true st=0", - storeST: true, + name: "useXOR2=true st=0", + useXOR2: true, stFunc: func(_ int64) int64 { return 0 }, }, { - name: "storeST=true st=ts", - storeST: true, + name: "useXOR2=true st=ts", + useXOR2: true, stFunc: func(ts int64) int64 { return ts }, }, { - name: "storeST=true st=ts-100", - storeST: true, + name: "useXOR2=true st=ts-100", + useXOR2: true, stFunc: func(ts int64) int64 { return ts - 100 }, }, { - name: "storeST=false st=ts (st ignored)", - storeST: false, + name: "useXOR2=false st=ts (st ignored)", + useXOR2: false, stFunc: func(ts int64) int64 { return ts }, }, } for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { - testMemSeriesAppendHistogram(t, scenario.storeST, scenario.stFunc) + testMemSeriesAppendHistogram(t, scenario.useXOR2, scenario.stFunc) }) } } -func testMemSeriesAppendHistogram(t *testing.T, storeST bool, stFunc func(ts 
int64) int64) { +func testMemSeriesAppendHistogram(t *testing.T, useXOR2 bool, stFunc func(ts int64) int64) { dir := t.TempDir() // This is usually taken from the Head, but passing manually here. chunkDiskMapper, err := chunks.NewChunkDiskMapper(nil, dir, chunkenc.NewPool(), chunks.DefaultWriteBufferSize, chunks.DefaultWriteQueueSize) @@ -2338,7 +2338,7 @@ func testMemSeriesAppendHistogram(t *testing.T, storeST bool, stFunc func(ts int chunkDiskMapper: chunkDiskMapper, chunkRange: int64(1000), samplesPerChunk: DefaultSamplesPerChunk, - storeST: storeST, + useXOR2: useXOR2, } s := newMemSeries(labels.Labels{}, 1, 0, defaultIsolationDisabled, false) @@ -7354,6 +7354,7 @@ func TestHeadAppender_WALEncoder_EnableSTStorage(t *testing.T) { t.Run(fmt.Sprintf("enableSTStorage=%v", enableST), func(t *testing.T) { opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) opts.EnableSTStorage.Store(enableST) + opts.EnableXOR2Encoding.Store(enableST) h, w := newTestHeadWithOptions(t, compression.None, opts) lbls := labels.FromStrings("foo", "bar") @@ -7409,6 +7410,7 @@ func TestHeadAppender_WBLEncoder_EnableSTStorage(t *testing.T) { opts.ChunkDirRoot = dir opts.OutOfOrderTimeWindow.Store(60 * time.Minute.Milliseconds()) opts.EnableSTStorage.Store(enableST) + opts.EnableXOR2Encoding.Store(enableST) h, err := NewHead(nil, nil, wal, wbl, opts, nil) require.NoError(t, err) @@ -7527,6 +7529,7 @@ func TestHeadAppender_STStorage_Disabled(t *testing.T) { func TestHeadAppender_STStorage_WALReplay(t *testing.T) { opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) opts.EnableSTStorage.Store(true) + opts.EnableXOR2Encoding.Store(true) h, w := newTestHeadWithOptions(t, compression.None, opts) lbls := labels.FromStrings("foo", "bar") @@ -7578,6 +7581,7 @@ func TestHeadAppender_STStorage_WBLReplay(t *testing.T) { opts.ChunkDirRoot = dir opts.OutOfOrderTimeWindow.Store(60 * time.Minute.Milliseconds()) opts.EnableSTStorage.Store(true) + opts.EnableXOR2Encoding.Store(true) h, 
err := NewHead(nil, nil, wal, wbl, opts, nil) require.NoError(t, err) @@ -7656,6 +7660,7 @@ func TestHeadAppender_STStorage_ChunkEncoding(t *testing.T) { t.Run(fmt.Sprintf("EnableSTStorage=%t", enableST), func(t *testing.T) { opts := newTestHeadDefaultOptions(DefaultBlockDuration, false) opts.EnableSTStorage.Store(enableST) + opts.EnableXOR2Encoding.Store(enableST) // ST storage implies XOR2 encoding. h, _ := newTestHeadWithOptions(t, compression.None, opts) lbls := labels.FromStrings("foo", "bar") diff --git a/tsdb/head_wal.go b/tsdb/head_wal.go index 1851e99230..0e5cba606c 100644 --- a/tsdb/head_wal.go +++ b/tsdb/head_wal.go @@ -636,7 +636,7 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmapp chunkDiskMapper: h.chunkDiskMapper, chunkRange: h.chunkRange.Load(), samplesPerChunk: h.opts.SamplesPerChunk, - storeST: h.opts.EnableSTStorage.Load(), + useXOR2: h.opts.EnableXOR2Encoding.Load(), } for in := range wp.input { @@ -1084,7 +1084,7 @@ func (wp *wblSubsetProcessor) processWBLSamples(h *Head) (map[chunks.HeadSeriesR chunkDiskMapper: h.chunkDiskMapper, chunkRange: h.chunkRange.Load(), samplesPerChunk: h.opts.SamplesPerChunk, - storeST: h.opts.EnableSTStorage.Load(), + useXOR2: h.opts.EnableXOR2Encoding.Load(), } // We don't check for minValidTime for ooo samples. mint, maxt := int64(math.MaxInt64), int64(math.MinInt64) @@ -1251,7 +1251,7 @@ func decodeSeriesFromChunkSnapshot(d *record.Decoder, b []byte) (csr chunkSnapsh csr.mc.chunk = chk switch enc { - case chunkenc.EncXOR: + case chunkenc.EncXOR, chunkenc.EncXOR2: // Backwards-compatibility for old sampleBuf which had last 4 samples. for range 3 { _ = dec.Be64int64() diff --git a/tsdb/ooo_head.go b/tsdb/ooo_head.go index 20b225bc56..60cee8d005 100644 --- a/tsdb/ooo_head.go +++ b/tsdb/ooo_head.go @@ -73,7 +73,7 @@ func (o *OOOChunk) NumSamples() int { // ToEncodedChunks returns chunks with the samples in the OOOChunk. 
// //nolint:revive -func (o *OOOChunk) ToEncodedChunks(mint, maxt int64, storeST bool) (chks []memChunk, err error) { +func (o *OOOChunk) ToEncodedChunks(mint, maxt int64, useXOR2 bool) (chks []memChunk, err error) { if len(o.samples) == 0 { return nil, nil } @@ -93,7 +93,7 @@ func (o *OOOChunk) ToEncodedChunks(mint, maxt int64, storeST bool) (chks []memCh if s.t > maxt { break } - encoding := chunkenc.ValFloat.ChunkEncoding(storeST) + encoding := chunkenc.ValFloat.ChunkEncoding(useXOR2) switch { case s.h != nil: // TODO(krajorama): use ST capable histogram chunk. diff --git a/tsdb/ooo_head_read.go b/tsdb/ooo_head_read.go index 86c64ff6e0..ed3e7baeb5 100644 --- a/tsdb/ooo_head_read.go +++ b/tsdb/ooo_head_read.go @@ -77,7 +77,7 @@ func (oh *HeadAndOOOIndexReader) Series(ref storage.SeriesRef, builder *labels.S *chks = (*chks)[:0] if s.ooo != nil { - return getOOOSeriesChunks(s, oh.head.opts.EnableSTStorage.Load(), oh.mint, oh.maxt, oh.lastGarbageCollectedMmapRef, 0, true, oh.inoMint, chks) + return getOOOSeriesChunks(s, oh.head.opts.EnableXOR2Encoding.Load(), oh.mint, oh.maxt, oh.lastGarbageCollectedMmapRef, 0, true, oh.inoMint, chks) } *chks = appendSeriesChunks(s, oh.inoMint, oh.maxt, *chks) return nil @@ -88,7 +88,7 @@ func (oh *HeadAndOOOIndexReader) Series(ref storage.SeriesRef, builder *labels.S // // maxMmapRef tells upto what max m-map chunk that we can consider. If it is non-0, then // the oooHeadChunk will not be considered. 
-func getOOOSeriesChunks(s *memSeries, storeST bool, mint, maxt int64, lastGarbageCollectedMmapRef, maxMmapRef chunks.ChunkDiskMapperRef, includeInOrder bool, inoMint int64, chks *[]chunks.Meta) error { +func getOOOSeriesChunks(s *memSeries, useXOR2 bool, mint, maxt int64, lastGarbageCollectedMmapRef, maxMmapRef chunks.ChunkDiskMapperRef, includeInOrder bool, inoMint int64, chks *[]chunks.Meta) error { tmpChks := make([]chunks.Meta, 0, len(s.ooo.oooMmappedChunks)) addChunk := func(minT, maxT int64, ref chunks.ChunkRef, chunk chunkenc.Chunk) { @@ -106,7 +106,7 @@ func getOOOSeriesChunks(s *memSeries, storeST bool, mint, maxt int64, lastGarbag if c.OverlapsClosedInterval(mint, maxt) && maxMmapRef == 0 { ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.ooo.oooMmappedChunks)))) if len(c.chunk.samples) > 0 { // Empty samples happens in tests, at least. - chks, err := s.ooo.oooHeadChunk.chunk.ToEncodedChunks(c.minTime, c.maxTime, storeST) + chks, err := s.ooo.oooHeadChunk.chunk.ToEncodedChunks(c.minTime, c.maxTime, useXOR2) if err != nil { handleChunkWriteError(err) return nil @@ -347,7 +347,7 @@ func NewOOOCompactionHead(ctx context.Context, head *Head) (*OOOCompactionHead, } var lastMmapRef chunks.ChunkDiskMapperRef - mmapRefs := ms.mmapCurrentOOOHeadChunk(chunkOpts{chunkDiskMapper: head.chunkDiskMapper, storeST: head.opts.EnableSTStorage.Load()}, head.logger) + mmapRefs := ms.mmapCurrentOOOHeadChunk(chunkOpts{chunkDiskMapper: head.chunkDiskMapper, useXOR2: head.opts.EnableXOR2Encoding.Load()}, head.logger) if len(mmapRefs) == 0 && len(ms.ooo.oooMmappedChunks) > 0 { // Nothing was m-mapped. So take the mmapRef from the existing slice if it exists. 
mmapRefs = []chunks.ChunkDiskMapperRef{ms.ooo.oooMmappedChunks[len(ms.ooo.oooMmappedChunks)-1].ref} @@ -481,7 +481,7 @@ func (ir *OOOCompactionHeadIndexReader) Series(ref storage.SeriesRef, builder *l return nil } - return getOOOSeriesChunks(s, ir.ch.head.opts.EnableSTStorage.Load(), ir.ch.mint, ir.ch.maxt, 0, ir.ch.lastMmapRef, false, 0, chks) + return getOOOSeriesChunks(s, ir.ch.head.opts.EnableXOR2Encoding.Load(), ir.ch.mint, ir.ch.maxt, 0, ir.ch.lastMmapRef, false, 0, chks) } func (*OOOCompactionHeadIndexReader) SortedLabelValues(_ context.Context, _ string, _ *storage.LabelHints, _ ...*labels.Matcher) ([]string, error) { diff --git a/tsdb/ooo_head_test.go b/tsdb/ooo_head_test.go index 857018f5c2..d410835571 100644 --- a/tsdb/ooo_head_test.go +++ b/tsdb/ooo_head_test.go @@ -367,8 +367,8 @@ func TestOOOChunks_ToEncodedChunks(t *testing.T) { } } -// TestOOOChunks_ToEncodedChunks_WithST tests ToEncodedChunks with storeST=true and storeST=false for float samples. -// When storeST=true, st values are preserved; when storeST=false, AtST() returns 0. +// TestOOOChunks_ToEncodedChunks_WithST tests ToEncodedChunks with useXOR2=true and useXOR2=false for float samples. +// When useXOR2=true, st values are preserved; when useXOR2=false, AtST() returns 0. // TODO(@krajorama): Add histogram test cases once ST storage is implemented for histograms. 
func TestOOOChunks_ToEncodedChunks_WithST(t *testing.T) { testCases := map[string]struct { @@ -403,11 +403,11 @@ func TestOOOChunks_ToEncodedChunks_WithST(t *testing.T) { storageScenarios := []struct { name string - storeST bool + useXOR2 bool expectedEncoding chunkenc.Encoding }{ - {"storeST=true", true, chunkenc.EncXOR2}, - {"storeST=false", false, chunkenc.EncXOR}, + {"useXOR2=true", true, chunkenc.EncXOR2}, + {"useXOR2=false", false, chunkenc.EncXOR}, } for name, tc := range testCases { @@ -418,7 +418,7 @@ func TestOOOChunks_ToEncodedChunks_WithST(t *testing.T) { oooChunk.Insert(s.st, s.t, s.f, nil, nil) } - chunks, err := oooChunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, ss.storeST) + chunks, err := oooChunk.ToEncodedChunks(math.MinInt64, math.MaxInt64, ss.useXOR2) require.NoError(t, err) require.Len(t, chunks, 1, "number of chunks") @@ -434,12 +434,12 @@ func TestOOOChunks_ToEncodedChunks_WithST(t *testing.T) { gotT, gotF := it.At() gotST := it.AtST() - if ss.storeST { - // When storeST=true, st values should be preserved. + if ss.useXOR2 { + // When useXOR2=true, st values should be preserved. require.Equal(t, tc.samples[sampleIndex].st, gotST, "sample %d st", sampleIndex) } else { - // When storeST=false, AtST() should return 0. - require.Equal(t, int64(0), gotST, "sample %d st should be 0 when storeST=false", sampleIndex) + // When useXOR2=false, AtST() should return 0. 
+ require.Equal(t, int64(0), gotST, "sample %d st should be 0 when useXOR2=false", sampleIndex) } require.Equal(t, tc.samples[sampleIndex].t, gotT, "sample %d t", sampleIndex) require.Equal(t, tc.samples[sampleIndex].f, gotF, "sample %d f", sampleIndex) From 750adfc819f354ad884898e67c626221a0485235 Mon Sep 17 00:00:00 2001 From: Carrie Edwards Date: Wed, 11 Mar 2026 09:30:28 -0700 Subject: [PATCH 50/73] Update CHANGELOG Signed-off-by: Carrie Edwards --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index eabcf6d9fa..265efe2905 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ User migrating from bind mounts might need to ajust permissions too, depending o - [FEATURE] Web: Add OpenAPI 3.2 specification for the HTTP API at `/api/v1/openapi.yaml`. #17825 - [FEATURE] Dockerfile: Add distroless image variant using UID/GID 65532 and no VOLUME declaration. Busybox image remains default. #17876 - [FEATURE] Web: Add on-demand wall time profiling under `/debug/pprof/fgprof`. #18027 +- [FEATURE] Add xor2-encoding feature flag. #18276 - [ENHANCEMENT] PromQL: Add more detail to histogram quantile monotonicity info annotations. #15578 - [ENHANCEMENT] Alerting: Independent alertmanager sendloops. #16355 - [ENHANCEMENT] TSDB: Experimental support for early compaction of stale series in the memory with configurable threshold `stale_series_compaction_threshold` in the config file. 
#16929 From c10abae45e147a49c97533101ce1e110746f0eac Mon Sep 17 00:00:00 2001 From: Carrie Edwards Date: Wed, 11 Mar 2026 10:08:18 -0700 Subject: [PATCH 51/73] Fix linting Signed-off-by: Carrie Edwards --- tsdb/chunkenc/chunk.go | 1 - 1 file changed, 1 deletion(-) diff --git a/tsdb/chunkenc/chunk.go b/tsdb/chunkenc/chunk.go index 22d4d160e6..3a405e8cf7 100644 --- a/tsdb/chunkenc/chunk.go +++ b/tsdb/chunkenc/chunk.go @@ -212,7 +212,6 @@ func (v ValueType) NewChunk(useXOR2 bool) (Chunk, error) { return NewEmptyChunk(v.ChunkEncoding(useXOR2)) } - // MockSeriesIterator returns an iterator for a mock series with custom // start timestamp, timestamps, and values. // Start timestamps is optional, pass nil or empty slice to indicate no start From a0d0a8efe8d06b2f518a6de45e601d886c1ab91f Mon Sep 17 00:00:00 2001 From: Carrie Edwards Date: Thu, 12 Mar 2026 11:08:33 -0700 Subject: [PATCH 52/73] Remove setting of xor2 encoding option in db open Signed-off-by: Carrie Edwards --- tsdb/db.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/tsdb/db.go b/tsdb/db.go index 711f4e541c..136d198750 100644 --- a/tsdb/db.go +++ b/tsdb/db.go @@ -1081,9 +1081,6 @@ func open(dir string, l *slog.Logger, r prometheus.Registerer, opts *Options, rn headOpts.OutOfOrderCapMax.Store(opts.OutOfOrderCapMax) headOpts.EnableSharding = opts.EnableSharding headOpts.EnableSTAsZeroSample = opts.EnableSTAsZeroSample - if opts.EnableSTStorage { - opts.EnableXOR2Encoding = true - } headOpts.EnableSTStorage.Store(opts.EnableSTStorage) headOpts.EnableXOR2Encoding.Store(opts.EnableXOR2Encoding) headOpts.EnableMetadataWALRecords = opts.EnableMetadataWALRecords From 8a02ae58d4be73bcdfc7f9f155385bc4a82a930e Mon Sep 17 00:00:00 2001 From: Carrie Edwards Date: Thu, 12 Mar 2026 12:20:24 -0700 Subject: [PATCH 53/73] Fix tests Signed-off-by: Carrie Edwards --- tsdb/db_append_v2_test.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tsdb/db_append_v2_test.go b/tsdb/db_append_v2_test.go index 
149331b438..5914cbcf97 100644 --- a/tsdb/db_append_v2_test.go +++ b/tsdb/db_append_v2_test.go @@ -7514,8 +7514,9 @@ func TestCompactHeadWithSTStorage_AppendV2(t *testing.T) { NoLockfile: true, MinBlockDuration: int64(time.Hour * 2 / time.Millisecond), MaxBlockDuration: int64(time.Hour * 2 / time.Millisecond), - WALCompression: compression.Snappy, - EnableSTStorage: true, + WALCompression: compression.Snappy, + EnableSTStorage: true, + EnableXOR2Encoding: true, } db := newTestDB(t, withOpts(opts)) ctx := context.Background() @@ -7654,6 +7655,7 @@ func TestDBAppenderV2_STStorage_OutOfOrder(t *testing.T) { opts := DefaultOptions() opts.OutOfOrderTimeWindow = 300 * time.Minute.Milliseconds() opts.EnableSTStorage = true + opts.EnableXOR2Encoding = true db := newTestDB(t, withOpts(opts)) db.DisableCompactions() From b575f5e28b461e25df660772a59da0c4142be6b1 Mon Sep 17 00:00:00 2001 From: Carrie Edwards Date: Thu, 12 Mar 2026 12:26:42 -0700 Subject: [PATCH 54/73] Fix linting Signed-off-by: Carrie Edwards --- tsdb/db_append_v2_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tsdb/db_append_v2_test.go b/tsdb/db_append_v2_test.go index 5914cbcf97..0bb1763f3d 100644 --- a/tsdb/db_append_v2_test.go +++ b/tsdb/db_append_v2_test.go @@ -7510,10 +7510,10 @@ func TestCompactHeadWithSTStorage_AppendV2(t *testing.T) { t.Parallel() opts := &Options{ - RetentionDuration: int64(time.Hour * 24 * 15 / time.Millisecond), - NoLockfile: true, - MinBlockDuration: int64(time.Hour * 2 / time.Millisecond), - MaxBlockDuration: int64(time.Hour * 2 / time.Millisecond), + RetentionDuration: int64(time.Hour * 24 * 15 / time.Millisecond), + NoLockfile: true, + MinBlockDuration: int64(time.Hour * 2 / time.Millisecond), + MaxBlockDuration: int64(time.Hour * 2 / time.Millisecond), WALCompression: compression.Snappy, EnableSTStorage: true, EnableXOR2Encoding: true, From b49ad5fc4bab17d80e4e021c2f3e90e2159f166f Mon Sep 17 00:00:00 2001 From: Carrie Edwards Date: Thu, 12 Mar 
2026 13:24:59 -0700 Subject: [PATCH 55/73] Update feature flag description Signed-off-by: Carrie Edwards --- docs/feature_flags.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/feature_flags.md b/docs/feature_flags.md index d707cbec50..cbff482379 100644 --- a/docs/feature_flags.md +++ b/docs/feature_flags.md @@ -317,12 +317,12 @@ disk compression than the default XOR encoding for typical Prometheus workloads. 1. **Staleness marker optimization**: When a scrape target disappears, Prometheus writes a special "staleness marker" to indicate the series has ended. Standard XOR encoding uses ~110 bits per marker; XOR2 uses only 13 bits—a **90% reduction**. For deployments with dynamic infrastructure (Kubernetes pods, auto-scaling, short-lived jobs), staleness markers can represent 10-40% of all samples. 2. **Adaptive timestamp encoding**: XOR2 automatically optimizes for both regular and irregular scrape intervals. Regular data gets the same compression as standard XOR (zero overhead), while irregular data benefits from improved encoding that handles larger timestamp variations more efficiently. +This feature also introduces the fields needed for encoding start timestamps. `xor2-encoding` is automatically enabled when `st-storage` is enabled. + Chunks encoded with XOR2 **cannot be read by older Prometheus versions** that do not support the encoding. Once enabled and data is written, downgrading requires waiting for all XOR2 chunks to be compacted out of retention. -This feature is automatically enabled when `st-storage` is enabled. 
- ## Extended Range Selectors `--enable-feature=promql-extended-range-selectors` From a4a17a77cd4246d57712c0aceda6a65da4012847 Mon Sep 17 00:00:00 2001 From: Carrie Edwards Date: Fri, 13 Mar 2026 07:43:28 -0700 Subject: [PATCH 56/73] Update comments and feature flag description Signed-off-by: Carrie Edwards --- CHANGELOG.md | 1 - docs/feature_flags.md | 16 +++++----------- tsdb/head.go | 3 +-- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 265efe2905..eabcf6d9fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,7 +27,6 @@ User migrating from bind mounts might need to ajust permissions too, depending o - [FEATURE] Web: Add OpenAPI 3.2 specification for the HTTP API at `/api/v1/openapi.yaml`. #17825 - [FEATURE] Dockerfile: Add distroless image variant using UID/GID 65532 and no VOLUME declaration. Busybox image remains default. #17876 - [FEATURE] Web: Add on-demand wall time profiling under `/debug/pprof/fgprof`. #18027 -- [FEATURE] Add xor2-encoding feature flag. #18276 - [ENHANCEMENT] PromQL: Add more detail to histogram quantile monotonicity info annotations. #15578 - [ENHANCEMENT] Alerting: Independent alertmanager sendloops. #16355 - [ENHANCEMENT] TSDB: Experimental support for early compaction of stale series in the memory with configurable threshold `stale_series_compaction_threshold` in the config file. #16929 diff --git a/docs/feature_flags.md b/docs/feature_flags.md index cbff482379..f355d39edb 100644 --- a/docs/feature_flags.md +++ b/docs/feature_flags.md @@ -310,18 +310,12 @@ For more details, see the [proposal](https://github.com/prometheus/proposals/pul `--enable-feature=xor2-encoding` -Enables the XOR2 chunk encoding for float samples, which provides better -disk compression than the default XOR encoding for typical Prometheus workloads. 
XOR2 reduces chunk storage size by -5-20% in typical Prometheus deployments through: +> WARNING: This is a highly experimental and risky setting: +> * Chunks encoded with XOR2 **cannot be read by older Prometheus versions** that do not support the encoding. Once enabled and data is written, you need to **manually delete blocks from the disk**, otherwise Prometheus will return errors on all queries. +> * We are still experimenting with the final encoding. As of now this encoding can change in any Prometheus version. All your persistent block data will be lost between versions. +> * This encoding is new, meaning downstream tools and LTS systems might not support it yet (e.g. Thanos sidecar uploaded blocks). -1. **Staleness marker optimization**: When a scrape target disappears, Prometheus writes a special "staleness marker" to indicate the series has ended. Standard XOR encoding uses ~110 bits per marker; XOR2 uses only 13 bits—a **90% reduction**. For deployments with dynamic infrastructure (Kubernetes pods, auto-scaling, short-lived jobs), staleness markers can represent 10-40% of all samples. -2. **Adaptive timestamp encoding**: XOR2 automatically optimizes for both regular and irregular scrape intervals. Regular data gets the same compression as standard XOR (zero overhead), while irregular data benefits from improved encoding that handles larger timestamp variations more efficiently. - -This feature also introduces the fields needed for encoding start timestamps. `xor2-encoding` is automatically enabled when `st-storage` is enabled. - -Chunks encoded with XOR2 **cannot be read by older Prometheus versions** that do not support -the encoding. Once enabled and data is written, downgrading requires waiting for -all XOR2 chunks to be compacted out of retention. +This setting enables the new XOR2 chunk encoding for float samples, which provides better disk compression than the default XOR encoding for typical Prometheus workloads.
This format also allows storing Start Timestamp (ST). ## Extended Range Selectors diff --git a/tsdb/head.go index 0052ddd72c..838b4bb699 100644 --- a/tsdb/head.go +++ b/tsdb/head.go @@ -167,8 +167,7 @@ type HeadOptions struct { EnableSTStorage atomic.Bool // EnableXOR2Encoding enables XOR2 chunk encoding for float samples. - // Represents 'xor2-encoding' feature flag. Automatically true when - // EnableSTStorage is true. + // Represents 'xor2-encoding' feature flag. EnableXOR2Encoding atomic.Bool ChunkRange int64 From 870fdf71f1e9efe76ef726024e75b9a502c62edc Mon Sep 17 00:00:00 2001 From: Carrie Edwards Date: Fri, 13 Mar 2026 13:34:42 -0700 Subject: [PATCH 57/73] Update documentation for st-storage feature Signed-off-by: Carrie Edwards --- cmd/prometheus/main.go | 5 ++--- docs/command-line/prometheus.md | 2 +- docs/feature_flags.md | 29 +++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 2901707709..35bd0f324f 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -282,7 +282,6 @@ func (c *flagConfig) setFeatureListOptions(logger *slog.Logger) error { config.DefaultGlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols logger.Info("Experimental start timestamp zero ingestion enabled. OpenMetrics 1.0 parsing will parse _created metrics as ST instead of normal sample. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) case "st-storage": - // TODO(bwplotka): Implement ST Storage as per PROM-60 and document this hidden feature flag. c.scrape.ParseST = true c.tsdb.EnableSTStorage = true c.agent.EnableSTStorage = true @@ -290,7 +289,7 @@ func (c *flagConfig) setFeatureListOptions(logger *slog.Logger) error { // Change relevant global variables. Hacky, but it's hard to pass a new option or default to unmarshallers.
This is to widen the ST support surface. config.DefaultConfig.GlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols config.DefaultGlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols - logger.Info("Experimental start timestamp storage enabled. OpenMetrics 1.0 parsing will parse _created metrics as ST instead of normal sample. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) + logger.Info("Experimental start timestamp storage enabled. OpenMetrics 1.0 parsing will parse _created metrics as ST instead of normal sample. Changed default scrape_protocols to prefer PrometheusProto format. Changed chunk encoding type to XOR2.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) case "delayed-compaction": c.tsdb.EnableDelayedCompaction = true logger.Info("Experimental delayed compaction is enabled.") @@ -603,7 +602,7 @@ func main() { a.Flag("scrape.discovery-reload-interval", "Interval used by scrape manager to throttle target groups updates."). Hidden().Default("5s").SetValue(&cfg.scrape.DiscoveryReloadInterval) - a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details."). + a.Flag("enable-feature", "Comma separated feature names to enable. 
Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, st-storage, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details."). Default("").StringsVar(&cfg.featureList) a.Flag("agent", "Run Prometheus in 'Agent mode'.").BoolVar(&agentMode) diff --git a/docs/command-line/prometheus.md b/docs/command-line/prometheus.md index 251fdfd6a4..481450b32f 100644 --- a/docs/command-line/prometheus.md +++ b/docs/command-line/prometheus.md @@ -59,7 +59,7 @@ The Prometheus monitoring server | --query.timeout | Maximum time a query may take before being aborted. Use with server mode only. | `2m` | | --query.max-concurrency | Maximum number of queries executed concurrently. Use with server mode only. | `20` | | --query.max-samples | Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return. Use with server mode only. | `50000000` | -| --enable-feature ... | Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | | +| --enable-feature ... | Comma separated feature names to enable. 
Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-per-step-stats, promql-experimental-functions, extra-scrape-metrics, auto-gomaxprocs, created-timestamp-zero-ingestion, st-storage, concurrent-rule-eval, delayed-compaction, old-ui, otlp-deltatocumulative, promql-duration-expr, use-uncached-io, promql-extended-range-selectors, promql-binop-fill-modifiers. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | | | --agent | Run Prometheus in 'Agent mode'. | | | --log.level | Only log messages with the given severity or above. One of: [debug, info, warn, error] | `info` | | --log.format | Output format of log messages. One of: [logfmt, json] | `logfmt` | diff --git a/docs/feature_flags.md b/docs/feature_flags.md index 45d14b72db..afd0c7b28c 100644 --- a/docs/feature_flags.md +++ b/docs/feature_flags.md @@ -77,6 +77,35 @@ Therefore, when `created-timestamp-zero-ingestion` is enabled Prometheus changes Besides enabling this feature in Prometheus, start timestamps need to be exposed by the application being scraped. +## Start timestamp (ST) native storage + +`--enable-feature=st-storage` + +> WARNING: This is a highly experimental and risky setting. +> * The new SamplesV2 WAL records cannot be replayed with Prometheus versions that do not support them. +> * This feature uses XOR2 encoded chunks, which cannot be read by older Prometheus versions that do not support the encoding. +> * XOR2 encoding is new, meaning downstream tools and LTS systems might now support it yet (e.g. Thanos sidecar uploaded blocks). + +> See [PROM-60](https://github.com/prometheus/proposals/pull/60) for the full +design proposal. + +Enables the storage of start timestamps (ST) natively per sample, instead of injecting synthetic 0 valued samples (as `created-timestamp-zero-ingestion` does). +Native storage of start timestamps preserves the exact ST values without adding extra samples. 
+ +Currently, native start timestamp storage is only supported for float samples; support for histograms will be added in the future. +Additionally, start timestamp values are not yet used by the PromQL engine for queries. + +Currently, Prometheus supports start timestamps on: + +* `PrometheusProto` +* `OpenMetrics1.0.0` + +`PrometheusProto` is recommended. + +Besides enabling this feature in Prometheus, start timestamps need to be exposed by the application being scraped. + +Enabling this feature flag automatically enables the xor2-encoding flag. + ## Concurrent evaluation of independent rules `--enable-feature=concurrent-rule-eval` From caa250a29c0dbdf926f4988cae92fedd8771644e Mon Sep 17 00:00:00 2001 From: Ridwan Sharif Date: Tue, 17 Mar 2026 17:43:14 +0000 Subject: [PATCH 58/73] scrape: reset ticker to align target scrape times with offset and intervals Signed-off-by: Ridwan Sharif --- scrape/scrape.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scrape/scrape.go b/scrape/scrape.go index 55d0eaf70b..2866a7fa61 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -1282,6 +1282,9 @@ func (sl *scrapeLoop) run(errc chan<- error) { } } + // Reset the ticker so target scrape times are aligned to the offset+intervals. 
+ ticker.Reset(sl.interval) + for { select { case <-sl.ctx.Done(): From 695db71c68252646586ba1d90c6f35bb850cc8f2 Mon Sep 17 00:00:00 2001 From: Ridwan Sharif Date: Tue, 17 Mar 2026 18:10:10 +0000 Subject: [PATCH 59/73] scrape: add test for distribution of scrapes Signed-off-by: Ridwan Sharif --- scrape/scrape_test.go | 106 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index 63547869be..432230219b 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -24,6 +24,7 @@ import ( "log/slog" "maps" "math" + "net" "net/http" "net/http/httptest" "net/url" @@ -51,6 +52,7 @@ import ( sdktrace "go.opentelemetry.io/otel/sdk/trace" "go.uber.org/atomic" "go.uber.org/goleak" + "go.yaml.in/yaml/v2" "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/discovery" @@ -69,6 +71,7 @@ import ( "github.com/prometheus/prometheus/util/pool" "github.com/prometheus/prometheus/util/teststorage" "github.com/prometheus/prometheus/util/testutil" + "github.com/prometheus/prometheus/util/testutil/synctest" ) func TestMain(m *testing.M) { @@ -6786,3 +6789,106 @@ func TestScrapePoolSetScrapeFailureLoggerRace(t *testing.T) { wg.Wait() } + +func TestScrapeOffsetDistribution(t *testing.T) { + interval := 5 * time.Second + + synctest.Test(t, func(t *testing.T) { + startTime := time.Now() + + listener := newPipeListener() + + var mu sync.Mutex + scrapeTimes := make(map[string][]time.Duration) + + handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + select { + case <-r.Context().Done(): + return + default: + mu.Lock() + target := r.URL.Path + scrapeTimes[target] = append(scrapeTimes[target], time.Since(startTime)) + mu.Unlock() + + w.Header().Set("Content-Type", "text/plain; version=0.0.4") + fmt.Fprintln(w, "expected_metric 1") + } + }) + + srv := httptest.NewUnstartedServer(handler) + srv.Listener = listener + srv.Start() + t.Cleanup(srv.Close) + + app := 
teststorage.NewAppendable() + opts := &Options{ + HTTPClientOptions: []config_util.HTTPClientOption{ + config_util.WithDialContextFunc(func(ctx context.Context, _, _ string) (net.Conn, error) { + srvConn, cliConn := net.Pipe() + select { + case listener.conns <- srvConn: + return cliConn, nil + case <-listener.closed: + return nil, net.ErrClosed + case <-ctx.Done(): + return nil, ctx.Err() + } + }), + }, + } + scrapeManager, err := NewManager(opts, promslog.NewNopLogger(), nil, app, nil, prometheus.NewRegistry()) + require.NoError(t, err) + + var targets []model.LabelSet + for i := range 5 { + targets = append(targets, model.LabelSet{ + model.SchemeLabel: "http", + model.AddressLabel: model.LabelValue(fmt.Sprintf("target-%d.local", i)), + model.MetricsPathLabel: model.LabelValue(fmt.Sprintf("/metrics/%d", i)), + }) + } + + scrapeManager.updateTsets(map[string][]*targetgroup.Group{ + "test": {{Targets: targets}}, + }) + + cfg := &config.Config{ + GlobalConfig: config.GlobalConfig{ + ScrapeInterval: model.Duration(interval), + ScrapeTimeout: model.Duration(interval), + ScrapeProtocols: []config.ScrapeProtocol{config.PrometheusProto}, + }, + ScrapeConfigs: []*config.ScrapeConfig{{JobName: "test"}}, + } + cfgText, err := yaml.Marshal(*cfg) + require.NoError(t, err) + cfg = loadConfiguration(t, string(cfgText)) + require.NoError(t, scrapeManager.ApplyConfig(cfg)) + + scrapeManager.reload() + + time.Sleep(22 * time.Second) + synctest.Wait() + + scrapeManager.Stop() + + maxScrapes := 0 + for _, times := range scrapeTimes { + if len(times) > maxScrapes { + maxScrapes = len(times) + } + } + require.Positive(t, maxScrapes, "Expected at least one scrape") + + for i := 0; i < maxScrapes; i++ { + uniqueTimes := make(map[time.Duration]struct{}) + for _, times := range scrapeTimes { + if i < len(times) { + uniqueTimes[times[i]] = struct{}{} + } + } + require.Greater(t, len(uniqueTimes), 2, "Expected targets to be scraped at staggered offsets rather than simultaneously at scrape 
index %d", i) + } + }) +} From 8e8cd480cbe90da79d07c80ec81a4dd6adffc63f Mon Sep 17 00:00:00 2001 From: Ridwan Sharif Date: Tue, 17 Mar 2026 20:15:56 +0000 Subject: [PATCH 60/73] scrape: Introduce an `offsetSeed` option for deterministic scrape offset calculation and utilize it in tests Signed-off-by: Ridwan Sharif --- scrape/manager.go | 8 +++++++- scrape/scrape_test.go | 14 ++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/scrape/manager.go b/scrape/manager.go index e632b015d7..7a4a4463d9 100644 --- a/scrape/manager.go +++ b/scrape/manager.go @@ -149,8 +149,9 @@ type Options struct { // because of an early startup scrape. InitialScrapeOffset time.Duration - // private option for testability. + // private options for testability. skipJitterOffsetting bool + offsetSeed uint64 } // Manager maintains a set of scrape pools and manages start/stop cycles @@ -269,6 +270,11 @@ func (m *Manager) reload() { // setOffsetSeed calculates a global offsetSeed per server relying on extra label set. 
func (m *Manager) setOffsetSeed(labels labels.Labels) error { + if m.opts.offsetSeed != 0 { + m.offsetSeed = m.opts.offsetSeed + return nil + } + h := fnv.New64a() hostname, err := osutil.GetFQDN() if err != nil { diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index 432230219b..9cb3adff45 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -6823,6 +6823,7 @@ func TestScrapeOffsetDistribution(t *testing.T) { app := teststorage.NewAppendable() opts := &Options{ + offsetSeed: 1, HTTPClientOptions: []config_util.HTTPClientOption{ config_util.WithDialContextFunc(func(ctx context.Context, _, _ string) (net.Conn, error) { srvConn, cliConn := net.Pipe() @@ -6868,20 +6869,13 @@ func TestScrapeOffsetDistribution(t *testing.T) { scrapeManager.reload() - time.Sleep(22 * time.Second) + numScrapes := 4 + time.Sleep((time.Duration(numScrapes) * interval) + time.Second) synctest.Wait() scrapeManager.Stop() - maxScrapes := 0 - for _, times := range scrapeTimes { - if len(times) > maxScrapes { - maxScrapes = len(times) - } - } - require.Positive(t, maxScrapes, "Expected at least one scrape") - - for i := 0; i < maxScrapes; i++ { + for i := range numScrapes { uniqueTimes := make(map[time.Duration]struct{}) for _, times := range scrapeTimes { if i < len(times) { From 364d70812f1635a20c3b8c881fa4d9a411944ba7 Mon Sep 17 00:00:00 2001 From: Bartlomiej Plotka Date: Thu, 19 Mar 2026 09:47:42 +0100 Subject: [PATCH 61/73] st: disconnect st-storage with xor2-encoding given planned experiments (#18316) * st: disconnect st-storage with xor2-encoding given planned experiments Signed-off-by: bwplotka * Update docs/feature_flags.md Co-authored-by: George Krajcsovits Signed-off-by: Bartlomiej Plotka * Update docs/feature_flags.md Co-authored-by: George Krajcsovits Signed-off-by: Bartlomiej Plotka * Update docs/feature_flags.md Co-authored-by: George Krajcsovits Signed-off-by: Bartlomiej Plotka * Update docs/feature_flags.md Co-authored-by: George Krajcsovits Signed-off-by: 
Bartlomiej Plotka --------- Signed-off-by: bwplotka Signed-off-by: Bartlomiej Plotka Co-authored-by: George Krajcsovits --- cmd/prometheus/main.go | 3 +-- docs/feature_flags.md | 25 ++++++++++--------------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 5a0c84c0cb..7db9c53171 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -287,13 +287,12 @@ func (c *flagConfig) setFeatureListOptions(logger *slog.Logger) error { case "st-storage": c.scrape.ParseST = true c.tsdb.EnableSTStorage = true - c.tsdb.EnableXOR2Encoding = true // Set chunk encoding type to XOR2 for samples with ST c.agent.EnableSTStorage = true // Change relevant global variables. Hacky, but it's hard to pass a new option or default to unmarshallers. This is to widen the ST support surface. config.DefaultConfig.GlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols config.DefaultGlobalConfig.ScrapeProtocols = config.DefaultProtoFirstScrapeProtocols - logger.Info("Experimental start timestamp storage enabled. OpenMetrics 1.0 parsing will parse _created metrics as ST instead of normal sample. Changed default scrape_protocols to prefer PrometheusProto format. Changed chunk encoding type to XOR2.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) + logger.Info("Experimental start timestamp storage enabled. OpenMetrics 1.0 parsing will parse _created metrics as ST instead of normal sample. 
Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) case "delayed-compaction": c.tsdb.EnableDelayedCompaction = true logger.Info("Experimental delayed compaction is enabled.") diff --git a/docs/feature_flags.md b/docs/feature_flags.md index c285826c94..ccc3a2bcde 100644 --- a/docs/feature_flags.md +++ b/docs/feature_flags.md @@ -81,30 +81,25 @@ Besides enabling this feature in Prometheus, start timestamps need to be exposed `--enable-feature=st-storage` -> WARNING: This is a highly experimental and risky setting. -> * The new SamplesV2 WAL records cannot be replayed with Prometheus versions that do not support them. -> * This feature uses XOR2 encoded chunks, which cannot be read by older Prometheus versions that do not support the encoding. -> * XOR2 encoding is new, meaning downstream tools and LTS systems might now support it yet (e.g. Thanos sidecar uploaded blocks). - -> See [PROM-60](https://github.com/prometheus/proposals/pull/60) for the full -design proposal. - -Enables the storage of start timestamps (ST) natively per sample, instead of injecting synthetic 0 valued samples (as `created-timestamp-zero-ingestion` does). -Native storage of start timestamps preserves the exact ST values without adding extra samples. - -Currently, native start timestamp storage is only supported for float samples; support for histograms will be added in the future. -Additionally, start timestamp values are not yet used by the PromQL engine for queries. +Enables the storage of start timestamps (ST) per sample, through WAL, TSDB/Agent and Remote-Write 2.0. This option +allows preserving the exact ST value as it was presented from scrape and receive protocols. In the future this feature +is meant to be a replacement of `created-timestamp-zero-ingestion` which injects synthetic 0 samples. 
Currently, Prometheus supports start timestamps on: * `PrometheusProto` * `OpenMetrics1.0.0` -`PrometheusProto` is recommended. +`PrometheusProto` is recommended, due to efficiency of ST passing. Besides enabling this feature in Prometheus, start timestamps need to be exposed by the application being scraped. -Enabling this feature flag automatically enables the xor2-encoding flag. +> NOTE: This is an experimental feature with known limitations until fully implemented. +> * It introduces new WAL record type (SamplesV2) that can only be replayed with Prometheus 3.11 or later versions. +> * For persistent storage support (TSDB blocks), you need to manually opt-in for XOR2 chunk format ([`xor2-encoding` flag](#xor2-chunk-encoding)). +> This might change later once we finish experimentation phase with XOR2. +> * ST for native histograms and NHCBs are not yet implemented (see [#18315](https://github.com/prometheus/prometheus/issues/18315)). +> * PromQL use of ST is out of scope of this feature. ## Concurrent evaluation of independent rules From a9d90952bac6ff117cd0d99171bf631fc8a3dce2 Mon Sep 17 00:00:00 2001 From: "Jonas L." Date: Thu, 19 Mar 2026 10:25:01 +0000 Subject: [PATCH 62/73] Deprecate Hetzner Cloud server datacenter labels (#17850) [hcloud.Server.Datacenter] is deprecated and will be removed after 1 July 2026. Use [hcloud.Server.Location] instead. See https://docs.hetzner.cloud/changelog#2025-12-16-phasing-out-datacenters Changes to Hetzner meta labels: - `__meta_hetzner_datacenter` - is deprecated for the role `robot` but kept for backward compatibility. Using `__meta_hetzner_robot_datacenter` is preferred. - is deprecated for the role `hcloud` and will stop working after the 1 July 2026. - `__meta_hetzner_hcloud_datacenter_location` label - is deprecated but kept for backward compatibility, the same data is available in the [`hcloud.Server.Location`](https://pkg.go.dev/github.com/hetznercloud/hcloud-go/v2/hcloud#Server) struct. 
- using `__meta_hetzner_hcloud_location` is preferred. - `__meta_hetzner_hcloud_datacenter_location_network_zone` - is deprecated but kept for backward compatibility, the same data is available in the [`hcloud.Server.Location`](https://pkg.go.dev/github.com/hetznercloud/hcloud-go/v2/hcloud#Server) struct. - using `__meta_hetzner_hcloud_location_network_zone` is preferred. - `__meta_hetzner_hcloud_location` - replacement label for `__meta_hetzner_hcloud_datacenter_location` - `__meta_hetzner_hcloud_location_network_zone` - replacement label for `__meta_hetzner_hcloud_datacenter_location_network_zone` - `__meta_hetzner_robot_datacenter` - replacement label for `__meta_hetzner_datacenter` with the role `robot`. Signed-off-by: Jonas Lammler --- discovery/hetzner/hcloud.go | 19 +++++++++++++----- discovery/hetzner/hcloud_test.go | 6 ++++++ discovery/hetzner/hetzner.go | 2 +- discovery/hetzner/mock_test.go | 30 +++++++++++++++++++++++++++++ discovery/hetzner/robot.go | 24 ++++++++++++----------- discovery/hetzner/robot_test.go | 20 ++++++++++--------- docs/configuration/configuration.md | 12 +++++++++--- 7 files changed, 84 insertions(+), 29 deletions(-) diff --git a/discovery/hetzner/hcloud.go b/discovery/hetzner/hcloud.go index 7fe55ffded..c28bfd2a1f 100644 --- a/discovery/hetzner/hcloud.go +++ b/discovery/hetzner/hcloud.go @@ -38,8 +38,10 @@ const ( hetznerLabelHcloudImageOSVersion = hetznerHcloudLabelPrefix + "image_os_version" hetznerLabelHcloudImageOSFlavor = hetznerHcloudLabelPrefix + "image_os_flavor" hetznerLabelHcloudPrivateIPv4 = hetznerHcloudLabelPrefix + "private_ipv4_" - hetznerLabelHcloudDatacenterLocation = hetznerHcloudLabelPrefix + "datacenter_location" - hetznerLabelHcloudDatacenterLocationNetworkZone = hetznerHcloudLabelPrefix + "datacenter_location_network_zone" + hetznerLabelHcloudLocation = hetznerHcloudLabelPrefix + "location" + hetznerLabelHcloudLocationNetworkZone = hetznerHcloudLabelPrefix + "location_network_zone" + 
hetznerLabelHcloudDatacenterLocation = hetznerHcloudLabelPrefix + "datacenter_location" // Label name kept for backward compatibility + hetznerLabelHcloudDatacenterLocationNetworkZone = hetznerHcloudLabelPrefix + "datacenter_location_network_zone" // Label name kept for backward compatibility hetznerLabelHcloudCPUCores = hetznerHcloudLabelPrefix + "cpu_cores" hetznerLabelHcloudCPUType = hetznerHcloudLabelPrefix + "cpu_type" hetznerLabelHcloudMemoryGB = hetznerHcloudLabelPrefix + "memory_size_gb" @@ -98,13 +100,14 @@ func (d *hcloudDiscovery) refresh(ctx context.Context) ([]*targetgroup.Group, er hetznerLabelRole: model.LabelValue(HetznerRoleHcloud), hetznerLabelServerID: model.LabelValue(strconv.FormatInt(server.ID, 10)), hetznerLabelServerName: model.LabelValue(server.Name), - hetznerLabelDatacenter: model.LabelValue(server.Datacenter.Name), //nolint:staticcheck // server.Datacenter is deprecated but kept for backwards compatibility until the next minor release hetznerLabelPublicIPv4: model.LabelValue(server.PublicNet.IPv4.IP.String()), hetznerLabelPublicIPv6Network: model.LabelValue(server.PublicNet.IPv6.Network.String()), hetznerLabelServerStatus: model.LabelValue(server.Status), - hetznerLabelHcloudDatacenterLocation: model.LabelValue(server.Datacenter.Location.Name), //nolint:staticcheck // server.Datacenter is deprecated but kept for backwards compatibility until the next minor release - hetznerLabelHcloudDatacenterLocationNetworkZone: model.LabelValue(server.Datacenter.Location.NetworkZone), //nolint:staticcheck // server.Datacenter is deprecated but kept for backwards compatibility until the next minor release + hetznerLabelHcloudLocation: model.LabelValue(server.Location.Name), + hetznerLabelHcloudLocationNetworkZone: model.LabelValue(server.Location.NetworkZone), + hetznerLabelHcloudDatacenterLocation: model.LabelValue(server.Location.Name), // Label name kept for backward compatibility + hetznerLabelHcloudDatacenterLocationNetworkZone: 
model.LabelValue(server.Location.NetworkZone), // Label name kept for backward compatibility hetznerLabelHcloudType: model.LabelValue(server.ServerType.Name), hetznerLabelHcloudCPUCores: model.LabelValue(strconv.Itoa(server.ServerType.Cores)), hetznerLabelHcloudCPUType: model.LabelValue(server.ServerType.CPUType), @@ -114,6 +117,12 @@ func (d *hcloudDiscovery) refresh(ctx context.Context) ([]*targetgroup.Group, er model.AddressLabel: model.LabelValue(net.JoinHostPort(server.PublicNet.IPv4.IP.String(), strconv.FormatUint(uint64(d.port), 10))), } + // [hcloud.Server.Datacenter] is deprecated and will be removed after 1 July 2026. + // See https://docs.hetzner.cloud/changelog#2025-12-16-phasing-out-datacenters + if server.Datacenter != nil { // nolint: staticcheck + labels[hetznerLabelDatacenter] = model.LabelValue(server.Datacenter.Name) // nolint: staticcheck + } + if server.Image != nil { labels[hetznerLabelHcloudImageName] = model.LabelValue(server.Image.Name) labels[hetznerLabelHcloudImageDescription] = model.LabelValue(server.Image.Description) diff --git a/discovery/hetzner/hcloud_test.go b/discovery/hetzner/hcloud_test.go index 3f20bcb86c..e7a11608c5 100644 --- a/discovery/hetzner/hcloud_test.go +++ b/discovery/hetzner/hcloud_test.go @@ -69,6 +69,8 @@ func TestHCloudSDRefresh(t *testing.T) { "__meta_hetzner_hcloud_image_description": model.LabelValue("Ubuntu 20.04 Standard 64 bit"), "__meta_hetzner_hcloud_image_os_flavor": model.LabelValue("ubuntu"), "__meta_hetzner_hcloud_image_os_version": model.LabelValue("20.04"), + "__meta_hetzner_hcloud_location": model.LabelValue("fsn1"), + "__meta_hetzner_hcloud_location_network_zone": model.LabelValue("eu-central"), "__meta_hetzner_hcloud_datacenter_location": model.LabelValue("fsn1"), "__meta_hetzner_hcloud_datacenter_location_network_zone": model.LabelValue("eu-central"), "__meta_hetzner_hcloud_cpu_cores": model.LabelValue("1"), @@ -93,6 +95,8 @@ func TestHCloudSDRefresh(t *testing.T) { 
"__meta_hetzner_hcloud_image_description": model.LabelValue("Ubuntu 20.04 Standard 64 bit"), "__meta_hetzner_hcloud_image_os_flavor": model.LabelValue("ubuntu"), "__meta_hetzner_hcloud_image_os_version": model.LabelValue("20.04"), + "__meta_hetzner_hcloud_location": model.LabelValue("fsn1"), + "__meta_hetzner_hcloud_location_network_zone": model.LabelValue("eu-central"), "__meta_hetzner_hcloud_datacenter_location": model.LabelValue("fsn1"), "__meta_hetzner_hcloud_datacenter_location_network_zone": model.LabelValue("eu-central"), "__meta_hetzner_hcloud_cpu_cores": model.LabelValue("2"), @@ -114,6 +118,8 @@ func TestHCloudSDRefresh(t *testing.T) { "__meta_hetzner_datacenter": model.LabelValue("fsn1-dc14"), "__meta_hetzner_public_ipv4": model.LabelValue("1.2.3.6"), "__meta_hetzner_public_ipv6_network": model.LabelValue("2001:db7::/64"), + "__meta_hetzner_hcloud_location": model.LabelValue("fsn1"), + "__meta_hetzner_hcloud_location_network_zone": model.LabelValue("eu-central"), "__meta_hetzner_hcloud_datacenter_location": model.LabelValue("fsn1"), "__meta_hetzner_hcloud_datacenter_location_network_zone": model.LabelValue("eu-central"), "__meta_hetzner_hcloud_cpu_cores": model.LabelValue("2"), diff --git a/discovery/hetzner/hetzner.go b/discovery/hetzner/hetzner.go index 932cfc8c93..3b7349e896 100644 --- a/discovery/hetzner/hetzner.go +++ b/discovery/hetzner/hetzner.go @@ -36,7 +36,7 @@ const ( hetznerLabelServerID = hetznerLabelPrefix + "server_id" hetznerLabelServerName = hetznerLabelPrefix + "server_name" hetznerLabelServerStatus = hetznerLabelPrefix + "server_status" - hetznerLabelDatacenter = hetznerLabelPrefix + "datacenter" + hetznerLabelDatacenter = hetznerLabelPrefix + "datacenter" // Label name kept for backward compatibility hetznerLabelPublicIPv4 = hetznerLabelPrefix + "public_ipv4" hetznerLabelPublicIPv6Network = hetznerLabelPrefix + "public_ipv6_network" ) diff --git a/discovery/hetzner/mock_test.go b/discovery/hetzner/mock_test.go index 
5f1e9c036b..fb69a76b04 100644 --- a/discovery/hetzner/mock_test.go +++ b/discovery/hetzner/mock_test.go @@ -124,6 +124,16 @@ func (m *SDMock) HandleHcloudServers() { "storage_type": "local", "cpu_type": "shared" }, + "location": { + "id": 1, + "name": "fsn1", + "description": "Falkenstein DC Park 1", + "country": "DE", + "city": "Falkenstein", + "latitude": 50.47612, + "longitude": 12.370071, + "network_zone": "eu-central" + }, "datacenter": { "id": 1, "name": "fsn1-dc8", @@ -244,6 +254,16 @@ func (m *SDMock) HandleHcloudServers() { "storage_type": "local", "cpu_type": "shared" }, + "location": { + "id": 1, + "name": "fsn1", + "description": "Falkenstein DC Park 1", + "country": "DE", + "city": "Falkenstein", + "latitude": 50.47612, + "longitude": 12.370071, + "network_zone": "eu-central" + }, "datacenter": { "id": 2, "name": "fsn1-dc14", @@ -365,6 +385,16 @@ func (m *SDMock) HandleHcloudServers() { "storage_type": "local", "cpu_type": "shared" }, + "location": { + "id": 1, + "name": "fsn1", + "description": "Falkenstein DC Park 1", + "country": "DE", + "city": "Falkenstein", + "latitude": 50.47612, + "longitude": 12.370071, + "network_zone": "eu-central" + }, "datacenter": { "id": 2, "name": "fsn1-dc14", diff --git a/discovery/hetzner/robot.go b/discovery/hetzner/robot.go index c112d5549a..5b1c149ccb 100644 --- a/discovery/hetzner/robot.go +++ b/discovery/hetzner/robot.go @@ -34,9 +34,10 @@ import ( ) const ( - hetznerRobotLabelPrefix = hetznerLabelPrefix + "robot_" - hetznerLabelRobotProduct = hetznerRobotLabelPrefix + "product" - hetznerLabelRobotCancelled = hetznerRobotLabelPrefix + "cancelled" + hetznerRobotLabelPrefix = hetznerLabelPrefix + "robot_" + hetznerLabelRobotDatacenter = hetznerRobotLabelPrefix + "datacenter" + hetznerLabelRobotProduct = hetznerRobotLabelPrefix + "product" + hetznerLabelRobotCancelled = hetznerRobotLabelPrefix + "cancelled" ) var userAgent = version.PrometheusUserAgent() @@ -105,14 +106,15 @@ func (d *robotDiscovery) 
refresh(context.Context) ([]*targetgroup.Group, error) targets := make([]model.LabelSet, len(servers)) for i, server := range servers { labels := model.LabelSet{ - hetznerLabelRole: model.LabelValue(HetznerRoleRobot), - hetznerLabelServerID: model.LabelValue(strconv.Itoa(server.Server.ServerNumber)), - hetznerLabelServerName: model.LabelValue(server.Server.ServerName), - hetznerLabelDatacenter: model.LabelValue(strings.ToLower(server.Server.Dc)), - hetznerLabelPublicIPv4: model.LabelValue(server.Server.ServerIP), - hetznerLabelServerStatus: model.LabelValue(server.Server.Status), - hetznerLabelRobotProduct: model.LabelValue(server.Server.Product), - hetznerLabelRobotCancelled: model.LabelValue(strconv.FormatBool(server.Server.Canceled)), + hetznerLabelRole: model.LabelValue(HetznerRoleRobot), + hetznerLabelServerID: model.LabelValue(strconv.Itoa(server.Server.ServerNumber)), + hetznerLabelServerName: model.LabelValue(server.Server.ServerName), + hetznerLabelDatacenter: model.LabelValue(strings.ToLower(server.Server.Dc)), // Label name kept for backward compatibility + hetznerLabelPublicIPv4: model.LabelValue(server.Server.ServerIP), + hetznerLabelServerStatus: model.LabelValue(server.Server.Status), + hetznerLabelRobotDatacenter: model.LabelValue(strings.ToLower(server.Server.Dc)), + hetznerLabelRobotProduct: model.LabelValue(server.Server.Product), + hetznerLabelRobotCancelled: model.LabelValue(strconv.FormatBool(server.Server.Canceled)), model.AddressLabel: model.LabelValue(net.JoinHostPort(server.Server.ServerIP, strconv.FormatUint(uint64(d.port), 10))), } diff --git a/discovery/hetzner/robot_test.go b/discovery/hetzner/robot_test.go index 0e8b7954cc..56f9978858 100644 --- a/discovery/hetzner/robot_test.go +++ b/discovery/hetzner/robot_test.go @@ -64,19 +64,21 @@ func TestRobotSDRefresh(t *testing.T) { "__meta_hetzner_public_ipv4": model.LabelValue("123.123.123.123"), "__meta_hetzner_public_ipv6_network": model.LabelValue("2a01:4f8:111:4221::/64"), 
"__meta_hetzner_datacenter": model.LabelValue("nbg1-dc1"), + "__meta_hetzner_robot_datacenter": model.LabelValue("nbg1-dc1"), "__meta_hetzner_robot_product": model.LabelValue("DS 3000"), "__meta_hetzner_robot_cancelled": model.LabelValue("false"), }, { - "__address__": model.LabelValue("123.123.123.124:80"), - "__meta_hetzner_role": model.LabelValue("robot"), - "__meta_hetzner_server_id": model.LabelValue("421"), - "__meta_hetzner_server_name": model.LabelValue("server2"), - "__meta_hetzner_server_status": model.LabelValue("in process"), - "__meta_hetzner_public_ipv4": model.LabelValue("123.123.123.124"), - "__meta_hetzner_datacenter": model.LabelValue("fsn1-dc10"), - "__meta_hetzner_robot_product": model.LabelValue("X5"), - "__meta_hetzner_robot_cancelled": model.LabelValue("true"), + "__address__": model.LabelValue("123.123.123.124:80"), + "__meta_hetzner_role": model.LabelValue("robot"), + "__meta_hetzner_server_id": model.LabelValue("421"), + "__meta_hetzner_server_name": model.LabelValue("server2"), + "__meta_hetzner_server_status": model.LabelValue("in process"), + "__meta_hetzner_public_ipv4": model.LabelValue("123.123.123.124"), + "__meta_hetzner_datacenter": model.LabelValue("fsn1-dc10"), + "__meta_hetzner_robot_datacenter": model.LabelValue("fsn1-dc10"), + "__meta_hetzner_robot_product": model.LabelValue("X5"), + "__meta_hetzner_robot_cancelled": model.LabelValue("true"), }, } { t.Run(fmt.Sprintf("item %d", i), func(t *testing.T) { diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md index 334c5da490..75b47e0d94 100644 --- a/docs/configuration/configuration.md +++ b/docs/configuration/configuration.md @@ -2238,7 +2238,10 @@ The following meta labels are available on all targets during [relabeling](#rela * `__meta_hetzner_server_status`: the status of the server * `__meta_hetzner_public_ipv4`: the public ipv4 address of the server * `__meta_hetzner_public_ipv6_network`: the public ipv6 network (/64) of the server -* 
`__meta_hetzner_datacenter`: the datacenter of the server + +Note that the `__meta_hetzner_datacenter` label is deprecated for both roles `robot` and `hcloud`: +- For the `robot` role, the replacement label is `__meta_hetzner_robot_datacenter`. +- For the `hcloud` role, the label will be removed after 1 July 2026. For more details, see the [changelog](https://docs.hetzner.cloud/changelog#2025-12-16-phasing-out-datacenters). The labels below are only available for targets with `role` set to `hcloud`: @@ -2246,8 +2249,10 @@ The labels below are only available for targets with `role` set to `hcloud`: * `__meta_hetzner_hcloud_image_description`: the description of the server image * `__meta_hetzner_hcloud_image_os_flavor`: the OS flavor of the server image * `__meta_hetzner_hcloud_image_os_version`: the OS version of the server image -* `__meta_hetzner_hcloud_datacenter_location`: the location of the server -* `__meta_hetzner_hcloud_datacenter_location_network_zone`: the network zone of the server +* `__meta_hetzner_hcloud_location`: the location of the server +* `__meta_hetzner_hcloud_location_network_zone`: the network zone of the server +* `__meta_hetzner_hcloud_datacenter_location`: the location of the server (deprecated in favor of `__meta_hetzner_hcloud_location`) +* `__meta_hetzner_hcloud_datacenter_location_network_zone`: the network zone of the server (deprecated in favor of `__meta_hetzner_hcloud_location_network_zone`) * `__meta_hetzner_hcloud_server_type`: the type of the server * `__meta_hetzner_hcloud_cpu_cores`: the CPU cores count of the server * `__meta_hetzner_hcloud_cpu_type`: the CPU type of the server (shared or dedicated) @@ -2259,6 +2264,7 @@ The labels below are only available for targets with `role` set to `hcloud`: The labels below are only available for targets with `role` set to `robot`: +* `__meta_hetzner_robot_datacenter`: the datacenter of the server * `__meta_hetzner_robot_product`: the product of the server * 
`__meta_hetzner_robot_cancelled`: the server cancellation status From 7bbff490a3bd72054c153940512b25d65b8bf514 Mon Sep 17 00:00:00 2001 From: Ogulcan Aydogan Date: Thu, 19 Mar 2026 10:49:17 +0000 Subject: [PATCH 63/73] discovery/azure: fix system managed identity when client_id is empty When using ManagedIdentity authentication with system-assigned identity, the client_id field is intentionally left empty. However, the current code unconditionally sets options.ID = azidentity.ClientID(cfg.ClientID), which passes an empty string instead of nil. The Azure SDK treats an empty ClientID as a request for a user-assigned identity with an empty client ID, rather than falling back to system-assigned identity. Fix by only setting options.ID when cfg.ClientID is non-empty, matching the pattern already used in storage/remote/azuread/azuread.go. Fixes #16634 Signed-off-by: Ogulcan Aydogan --- discovery/azure/azure.go | 5 ++++- discovery/azure/azure_test.go | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/discovery/azure/azure.go b/discovery/azure/azure.go index 834eaf1f29..0ac9a9af4e 100644 --- a/discovery/azure/azure.go +++ b/discovery/azure/azure.go @@ -298,7 +298,10 @@ func newCredential(cfg SDConfig, policyClientOptions policy.ClientOptions) (azco } credential = azcore.TokenCredential(workloadIdentityCredential) case authMethodManagedIdentity: - options := &azidentity.ManagedIdentityCredentialOptions{ClientOptions: policyClientOptions, ID: azidentity.ClientID(cfg.ClientID)} + options := &azidentity.ManagedIdentityCredentialOptions{ClientOptions: policyClientOptions} + if cfg.ClientID != "" { + options.ID = azidentity.ClientID(cfg.ClientID) + } managedIdentityCredential, err := azidentity.NewManagedIdentityCredential(options) if err != nil { return nil, err diff --git a/discovery/azure/azure_test.go b/discovery/azure/azure_test.go index 23c120ac6b..dd2eeb0a3f 100644 --- a/discovery/azure/azure_test.go +++ 
b/discovery/azure/azure_test.go @@ -24,6 +24,7 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm" azfake "github.com/Azure/azure-sdk-for-go/sdk/azcore/fake" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/policy" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" fake "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5/fake" @@ -490,6 +491,27 @@ func TestNewAzureResourceFromID(t *testing.T) { } } +func TestNewCredentialManagedIdentity(t *testing.T) { + // Test that system-assigned managed identity (empty ClientID) creates + // a valid credential. Previously, an empty ClientID was passed as + // azidentity.ClientID("") which is not nil and caused Azure SDK to + // look up a non-existent user-assigned identity instead of falling + // back to system-assigned identity. + cfg := SDConfig{ + AuthenticationMethod: authMethodManagedIdentity, + ClientID: "", + } + cred, err := newCredential(cfg, policy.ClientOptions{}) + require.NoError(t, err) + require.NotNil(t, cred) + + // Test that user-assigned managed identity (non-empty ClientID) also works. 
+ cfg.ClientID = "00000000-0000-0000-0000-000000000000" + cred, err = newCredential(cfg, policy.ClientOptions{}) + require.NoError(t, err) + require.NotNil(t, cred) +} + func TestAzureRefresh(t *testing.T) { tests := []struct { scenario string From 7176a6de916be45a173ca85d5c42062cfd912fdb Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Thu, 19 Mar 2026 12:14:34 +0100 Subject: [PATCH 64/73] tsdb/chunkenc: port XOR2 performance improvements to ST-aware encoding Port the following optimizations from the roidelapluie/xor2 branch to the ST-aware XOR2 implementation on main: bstream.go: - Add writeBitsFast() as a writeBits variant that handles the partial last byte inline to avoid per-byte writeByte calls and writes complete bytes directly to the stream slice; used only by XOR2 to leave the shared writeBits unchanged for other encoders - Add readXOR2ControlFast() for inlinable hot-path control decoding that avoids buffer refills for the common 4-bit cases - Add readUvarint()/readVarint() methods that use direct method calls instead of io.ByteReader interface dispatch, reducing GC pressure from interior pointer references in findObject xor2.go: - Switch all writeBits calls to writeBitsFast - Use readXOR2ControlFast() + readXOR2Control() fallback in Next() - Use it.br.readVarint()/readUvarint() instead of binary.ReadVarint/ ReadUvarint to avoid GC overhead from interface dispatch - Add 3-bit fast path in decodeValue() to read the full value control prefix in one buffer peek rather than up to three separate bit reads - Add combined 1+sz bit fast path in decodeValueKnownNonZero() to fold the control bit and value bits into a single buffer operation - Add 11-bit combined read in decodeNewLeadingTrailing() to read leading (5 bits) and sigbits (6 bits) together - Pre-compute the value XOR delta in encodeJoint() and pass it to writeVDeltaKnownNonZero(delta uint64) to avoid recomputation Signed-off-by: Julien Pivotto 
<291750+roidelapluie@users.noreply.github.com> --- tsdb/chunkenc/bstream.go | 95 +++++++++++++++++++++++++ tsdb/chunkenc/xor2.go | 149 ++++++++++++++++++++++++++++----------- 2 files changed, 202 insertions(+), 42 deletions(-) diff --git a/tsdb/chunkenc/bstream.go b/tsdb/chunkenc/bstream.go index ecface3099..b2e6e3f91f 100644 --- a/tsdb/chunkenc/bstream.go +++ b/tsdb/chunkenc/bstream.go @@ -117,6 +117,40 @@ func (b *bstream) writeBits(u uint64, nbits int) { } } +// writeBitsFast is like writeBits but handles the partial last byte inline to +// avoid per-byte writeByte calls, and writes complete bytes directly to the +// stream slice. +func (b *bstream) writeBitsFast(u uint64, nbits int) { + u <<= 64 - uint(nbits) + + // If the last byte is partial, fill its remaining bits first. + if b.count > 0 { + free := int(b.count) + last := len(b.stream) - 1 + b.stream[last] |= byte(u >> uint(64-free)) + if nbits < free { + b.count = uint8(free - nbits) + return + } + u <<= uint(free) + nbits -= free + b.count = 0 + } + + // Write complete bytes directly, avoiding per-byte function call overhead. + for nbits >= 8 { + b.stream = append(b.stream, byte(u>>56)) + u <<= 8 + nbits -= 8 + } + + // Write any remaining bits as a partial final byte. + if nbits > 0 { + b.stream = append(b.stream, byte(u>>56)) + b.count = uint8(8 - nbits) + } +} + type bstreamReader struct { stream []byte streamOffset int // The offset from which read the next byte from the stream. @@ -215,6 +249,35 @@ func (b *bstreamReader) ReadByte() (byte, error) { return byte(v), nil } +// readXOR2ControlFast is like readXOR2Control but returns io.EOF when the +// internal buffer has fewer than 4 valid bits, or when the control prefix +// indicates cases 4 or 5 (top4 == 0xf). The caller should retry with +// readXOR2Control. This function must be kept small and a leaf in order to +// help the compiler inlining it and further improve performance. 
+func (b *bstreamReader) readXOR2ControlFast() (uint8, error) { + if b.valid < 4 { + return 0, io.EOF + } + top4 := uint8((b.buffer >> (b.valid - 4)) & 0xf) + if top4 < 8 { // '0xxx': dod=0, val=0 (case 0). + b.valid-- + return 0, nil + } + if top4 < 12 { // '10xx': dod=0, val changed (case 1). + b.valid -= 2 + return 1, nil + } + if top4 < 14 { // '110x': small dod (case 2). + b.valid -= 3 + return 2, nil + } + if top4 == 14 { // '1110': medium dod (case 3). + b.valid -= 4 + return 3, nil + } + return 0, io.EOF +} + // readXOR2Control reads the XOR2 variable-length joint control prefix // and returns 0-5 mapping to the six encoding cases: // @@ -304,6 +367,38 @@ func (b *bstreamReader) readXOR2Control() (uint8, error) { return 5, nil } +// readUvarint decodes a varint-encoded uint64 using direct method calls, +// avoiding the io.ByteReader interface dispatch used by binary.ReadUvarint. +// This prevents interior pointer references on goroutine stacks that the GC +// must trace via findObject, reducing GC overhead. +func (b *bstreamReader) readUvarint() (uint64, error) { + var x uint64 + var s uint + for range binary.MaxVarintLen64 { + byt, err := b.ReadByte() + if err != nil { + return x, err + } + if byt < 0x80 { + return x | uint64(byt)<> 1) + if ux&1 != 0 { + x = ^x + } + return x, err +} + // loadNextBuffer loads the next bytes from the stream into the internal buffer. // The input nbits is the minimum number of bits that must be read, but the implementation // can read more (if possible) to improve performances. 
diff --git a/tsdb/chunkenc/xor2.go b/tsdb/chunkenc/xor2.go index 2d5cc2b542..ddfab67617 100644 --- a/tsdb/chunkenc/xor2.go +++ b/tsdb/chunkenc/xor2.go @@ -224,7 +224,7 @@ func (a *xor2Appender) Append(st, t int64, v float64) { for _, b := range buf[:binary.PutVarint(buf, t)] { a.b.writeByte(b) } - a.b.writeBits(math.Float64bits(v), 64) + a.b.writeBitsFast(math.Float64bits(v), 64) if st != 0 { for _, b := range buf[:binary.PutVarint(buf, t-st)] { @@ -300,14 +300,15 @@ func (a *xor2Appender) Append(st, t int64, v float64) { // samples >= 2. func (a *xor2Appender) encodeJoint(dod int64, v float64) { if dod == 0 { + vbits := math.Float64bits(v) ^ math.Float64bits(a.v) switch { case value.IsStaleNaN(v): - a.b.writeBits(0b11111, 5) - case math.Float64bits(v)^math.Float64bits(a.v) == 0: + a.b.writeBitsFast(0b11111, 5) + case vbits == 0: a.b.writeBit(zero) default: - a.b.writeBits(0b10, 2) - a.writeVDeltaKnownNonZero(v) + a.b.writeBitsFast(0b10, 2) + a.writeVDeltaKnownNonZero(vbits) } return } @@ -324,8 +325,8 @@ func (a *xor2Appender) encodeJoint(dod int64, v float64) { a.b.writeByte(byte(uint64(dod))) default: // 64-bit escape (rare): `11110`. - a.b.writeBits(0b11110, 5) - a.b.writeBits(uint64(dod), 64) + a.b.writeBitsFast(0b11110, 5) + a.b.writeBitsFast(uint64(dod), 64) } a.writeVDelta(v) } @@ -333,7 +334,7 @@ func (a *xor2Appender) encodeJoint(dod int64, v float64) { // writeVDelta encodes the value delta for the dod≠0 case. 
func (a *xor2Appender) writeVDelta(v float64) { if value.IsStaleNaN(v) { - a.b.writeBits(0b111, 3) + a.b.writeBitsFast(0b111, 3) return } @@ -352,26 +353,30 @@ func (a *xor2Appender) writeVDelta(v float64) { } if a.leading != 0xff && newLeading >= a.leading && newTrailing >= a.trailing { - a.b.writeBits(0b10, 2) - a.b.writeBits(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) + a.b.writeBitsFast(0b10, 2) + a.b.writeBitsFast(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) return } a.leading, a.trailing = newLeading, newTrailing - a.b.writeBits(0b110, 3) - a.b.writeBits(uint64(newLeading), 5) + a.b.writeBitsFast(0b110, 3) + a.b.writeBitsFast(uint64(newLeading), 5) sigbits := 64 - newLeading - newTrailing - a.b.writeBits(uint64(sigbits), 6) - a.b.writeBits(delta>>newTrailing, int(sigbits)) + a.b.writeBitsFast(uint64(sigbits), 6) + a.b.writeBitsFast(delta>>newTrailing, int(sigbits)) } -// writeVDeltaKnownNonZero encodes the value delta when it is known to be -// non-zero and non-stale (dod=0, value-changed case). -func (a *xor2Appender) writeVDeltaKnownNonZero(v float64) { - delta := math.Float64bits(v) ^ math.Float64bits(a.v) - +// writeVDeltaKnownNonZero encodes a precomputed value XOR delta for the +// dod=0, value-changed case. delta must be non-zero; stale NaN with dod=0 is +// handled at the joint control level (`11111`) and never reaches this function. 
+// +// Encoding: +// +// `0` → reuse previous leading/trailing window +// `1` → new leading/trailing window +func (a *xor2Appender) writeVDeltaKnownNonZero(delta uint64) { newLeading := uint8(bits.LeadingZeros64(delta)) newTrailing := uint8(bits.TrailingZeros64(delta)) @@ -381,18 +386,18 @@ func (a *xor2Appender) writeVDeltaKnownNonZero(v float64) { if a.leading != 0xff && newLeading >= a.leading && newTrailing >= a.trailing { a.b.writeBit(zero) - a.b.writeBits(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) + a.b.writeBitsFast(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) return } a.leading, a.trailing = newLeading, newTrailing a.b.writeBit(one) - a.b.writeBits(uint64(newLeading), 5) + a.b.writeBitsFast(uint64(newLeading), 5) sigbits := 64 - newLeading - newTrailing - a.b.writeBits(uint64(sigbits), 6) - a.b.writeBits(delta>>newTrailing, int(sigbits)) + a.b.writeBitsFast(uint64(sigbits), 6) + a.b.writeBitsFast(delta>>newTrailing, int(sigbits)) } func (*xor2Appender) AppendHistogram(*HistogramAppender, int64, int64, *histogram.Histogram, bool) (Chunk, bool, Appender, error) { @@ -486,7 +491,7 @@ func (it *xor2Iterator) Next() ValueType { } if it.numRead == 0 { - t, err := binary.ReadVarint(&it.br) + t, err := it.br.readVarint() if err != nil { it.err = err return ValNone @@ -504,7 +509,7 @@ func (it *xor2Iterator) Next() ValueType { // Optional ST for sample 0. 
if it.firstSTKnown { - stDiff, err := binary.ReadVarint(&it.br) + stDiff, err := it.br.readVarint() if err != nil { it.err = err return ValNone @@ -517,7 +522,7 @@ func (it *xor2Iterator) Next() ValueType { } if it.numRead == 1 { - tDelta, err := binary.ReadUvarint(&it.br) + tDelta, err := it.br.readUvarint() if err != nil { it.err = err return ValNone @@ -550,10 +555,13 @@ func (it *xor2Iterator) Next() ValueType { prevT := it.t savedNumRead := it.numRead - ctrl, err := it.br.readXOR2Control() + ctrl, err := it.br.readXOR2ControlFast() if err != nil { - it.err = err - return ValNone + ctrl, err = it.br.readXOR2Control() + if err != nil { + it.err = err + return ValNone + } } switch ctrl { @@ -654,6 +662,49 @@ func (it *xor2Iterator) readDod(w uint8) error { // `110` → new leading/trailing window // `111` → stale NaN func (it *xor2Iterator) decodeValue() error { + // Fast path: 3 bits available — read the full control prefix in one shot. + // Encoding: `0`=unchanged, `10`=reuse window, `110`=new window, `111`=stale NaN. + if it.br.valid >= 3 { + ctrl := (it.br.buffer >> (it.br.valid - 3)) & 0x7 + if ctrl&0x4 == 0 { + // `0xx`: value unchanged, consume 1 bit. + it.br.valid-- + it.val = it.baselineV + return nil + } + if ctrl&0x6 == 0x4 { + // `10x`: reuse previous leading/trailing window, consume 2 bits. + it.br.valid -= 2 + sz := uint8(64 - int(it.leading) - int(it.trailing)) + var valueBits uint64 + if it.br.valid >= sz { + it.br.valid -= sz + valueBits = (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1) + } else { + var err error + valueBits, err = it.br.readBits(sz) + if err != nil { + return err + } + } + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil + } + // `11x`: consume 3 bits. + it.br.valid -= 3 + if ctrl == 0x6 { + // `110`: new leading/trailing window. + return it.decodeNewLeadingTrailing() + } + // `111`: stale NaN. 
+ it.val = math.Float64frombits(value.StaleNaN) + return nil + } + + // Slow path: fewer than 3 bits buffered (rare, only near buffer refills). var bit bit if it.br.valid > 0 { it.br.valid-- @@ -731,6 +782,26 @@ func (it *xor2Iterator) decodeValue() error { // `0` → reuse previous leading/trailing window // `1` → new leading/trailing window func (it *xor2Iterator) decodeValueKnownNonZero() error { + sz := uint8(64 - int(it.leading) - int(it.trailing)) + // Fast path: combine the 1-bit reuse/new-window control read with the + // sz-bit value read into a single buffer operation. + if it.br.valid >= 1+sz { + ctrlBit := (it.br.buffer >> (it.br.valid - 1)) & 1 + if ctrlBit == 0 { // `0`: reuse previous leading/trailing window. + it.br.valid -= 1 + sz + valueBits := (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1) + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil + } + // `1`: new leading/trailing window. + it.br.valid-- + return it.decodeNewLeadingTrailing() + } + + // Slow path: read control bit then value bits separately. var bit bit if it.br.valid > 0 { it.br.valid-- @@ -745,7 +816,6 @@ func (it *xor2Iterator) decodeValueKnownNonZero() error { if bit == zero { // `0` → reuse previous leading/trailing window. - sz := uint8(64 - int(it.leading) - int(it.trailing)) var valueBits uint64 if it.br.valid >= sz { it.br.valid -= sz @@ -771,24 +841,19 @@ func (it *xor2Iterator) decodeValueKnownNonZero() error { // decodeNewLeadingTrailing reads a new leading/sigbits/value triple and // updates it.leading, it.trailing, it.val, and it.baselineV. func (it *xor2Iterator) decodeNewLeadingTrailing() error { - var newLeading uint64 - if it.br.valid >= 5 { - it.br.valid -= 5 - newLeading = (it.br.buffer >> it.br.valid) & 0x1f + var newLeading, sigbits uint64 + // Fast path: read leading (5 bits) and sigbits (6 bits) together as 11 bits. 
+ if it.br.valid >= 11 { + val := (it.br.buffer >> (it.br.valid - 11)) & 0x7ff + it.br.valid -= 11 + newLeading = val >> 6 + sigbits = val & 0x3f } else { var err error newLeading, err = it.br.readBits(5) if err != nil { return err } - } - - var sigbits uint64 - if it.br.valid >= 6 { - it.br.valid -= 6 - sigbits = (it.br.buffer >> it.br.valid) & 0x3f - } else { - var err error sigbits, err = it.br.readBits(6) if err != nil { return err From 549c6ffd548342081f894352d7429a75519e09f7 Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Thu, 19 Mar 2026 12:35:41 +0100 Subject: [PATCH 65/73] Shepherd for 3.11 Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> --- RELEASE.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index 5a8f8601ab..5c29b0a522 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -20,7 +20,8 @@ Please see [the v2.55 RELEASE.md](https://github.com/prometheus/prometheus/blob/ | v3.8 | 2025-11-06 | Jan Fajerski (GitHub: @jan--f) | | v3.9 | 2025-12-18 | Bryan Boreham (GitHub: @bboreham) | | v3.10 | 2026-02-05 | Ganesh Vernekar (Github: @codesome) | -| v3.11 | 2026-03-19 | **volunteer welcome** | +| v3.11 | 2026-03-25 | Julien Pivotto (GitHub: @roidelapluie) | +| v3.12 | 2026-05-06 | **volunteer welcome** | If you are interested in volunteering please create a pull request against the [prometheus/prometheus](https://github.com/prometheus/prometheus) repository and propose yourself for the release series of your choice. From 530c4bfcc9b340bc981df8205db3106b501d34cb Mon Sep 17 00:00:00 2001 From: Jeremy Rickards Date: Thu, 19 Mar 2026 14:28:18 +0100 Subject: [PATCH 66/73] docs: clarify that histogram_avg/count/sum/stddev/stdvar are native-histogram-only The docs for these functions previously described them as acting on "each histogram sample," which was ambiguous. 
Add "native" to clarify they only operate on native histogram samples, not classic histograms. This distinction was originally documented but lost when the experimental feature warnings were removed. Signed-off-by: Jeremy Rickards --- docs/querying/functions.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/querying/functions.md b/docs/querying/functions.md index 68a003359d..64e172000f 100644 --- a/docs/querying/functions.md +++ b/docs/querying/functions.md @@ -219,7 +219,7 @@ to their original value. Histogram samples in the input vector are ignored silen ## `histogram_avg()` `histogram_avg(v instant-vector)` returns the arithmetic average of observed -values stored in each histogram sample in `v`. Float samples are ignored and do +values stored in each native histogram sample in `v`. Float samples are ignored and do not show up in the returned vector. Use `histogram_avg` as demonstrated below to compute the average request duration @@ -236,11 +236,11 @@ Which is equivalent to the following query: ## `histogram_count()` and `histogram_sum()` `histogram_count(v instant-vector)` returns the count of observations stored in -each histogram sample in `v`. Float samples are ignored and do not show up in +each native histogram sample in `v`. Float samples are ignored and do not show up in the returned vector. Similarly, `histogram_sum(v instant-vector)` returns the sum of observations -stored in each histogram sample. +stored in each native histogram sample. Use `histogram_count` in the following way to calculate a rate of observations (in this case corresponding to “requests per second”) from a series of @@ -453,14 +453,14 @@ histogram_quantiles(sum(rate(foo[1m])), "quantile", 0.9, 0.99) ## `histogram_stddev()` and `histogram_stdvar()` `histogram_stddev(v instant-vector)` returns the estimated standard deviation -of observations for each histogram sample in `v`. 
For this estimation, all observations +of observations for each native histogram sample in `v`. For this estimation, all observations in a bucket are assumed to have the value of the mean of the bucket boundaries. For the zero bucket and for buckets with custom boundaries, the arithmetic mean is used. For the usual exponential buckets, the geometric mean is used. Float samples are ignored and do not show up in the returned vector. Similarly, `histogram_stdvar(v instant-vector)` returns the estimated standard -variance of observations for each histogram sample in `v`. +variance of observations for each native histogram sample in `v`. ## `hour()` From 7a44a2ddc443010d50749b9682b181f39a6b8ab6 Mon Sep 17 00:00:00 2001 From: Jeremy Rickards Date: Thu, 19 Mar 2026 15:03:31 +0100 Subject: [PATCH 67/73] docs: regenerate PromQL function docs for UI Signed-off-by: Jeremy Rickards --- web/ui/mantine-ui/src/promql/functionDocs.tsx | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/web/ui/mantine-ui/src/promql/functionDocs.tsx b/web/ui/mantine-ui/src/promql/functionDocs.tsx index c7f744ba6f..75a4a767a7 100644 --- a/web/ui/mantine-ui/src/promql/functionDocs.tsx +++ b/web/ui/mantine-ui/src/promql/functionDocs.tsx @@ -1257,7 +1257,7 @@ const funcDocs: Record = { <>

histogram_avg(v instant-vector) returns the arithmetic average of observed values stored in each - histogram sample in v. Float samples are ignored and do not show up in the returned vector. + native histogram sample in v. Float samples are ignored and do not show up in the returned vector.

@@ -1283,13 +1283,13 @@ const funcDocs: Record = { histogram_count: ( <>

- histogram_count(v instant-vector) returns the count of observations stored in each histogram sample - in v. Float samples are ignored and do not show up in the returned vector. + histogram_count(v instant-vector) returns the count of observations stored in each native histogram + sample in v. Float samples are ignored and do not show up in the returned vector.

- Similarly, histogram_sum(v instant-vector) returns the sum of observations stored in each histogram - sample. + Similarly, histogram_sum(v instant-vector) returns the sum of observations stored in each native + histogram sample.

@@ -1574,15 +1574,15 @@ const funcDocs: Record = { <>

histogram_stddev(v instant-vector) returns the estimated standard deviation of observations for - each histogram sample in v. For this estimation, all observations in a bucket are assumed to have - the value of the mean of the bucket boundaries. For the zero bucket and for buckets with custom boundaries, the - arithmetic mean is used. For the usual exponential buckets, the geometric mean is used. Float samples are + each native histogram sample in v. For this estimation, all observations in a bucket are assumed to + have the value of the mean of the bucket boundaries. For the zero bucket and for buckets with custom boundaries, + the arithmetic mean is used. For the usual exponential buckets, the geometric mean is used. Float samples are ignored and do not show up in the returned vector.

Similarly, histogram_stdvar(v instant-vector) returns the estimated standard variance of - observations for each histogram sample in v. + observations for each native histogram sample in v.

), @@ -1590,28 +1590,28 @@ const funcDocs: Record = { <>

histogram_stddev(v instant-vector) returns the estimated standard deviation of observations for - each histogram sample in v. For this estimation, all observations in a bucket are assumed to have - the value of the mean of the bucket boundaries. For the zero bucket and for buckets with custom boundaries, the - arithmetic mean is used. For the usual exponential buckets, the geometric mean is used. Float samples are + each native histogram sample in v. For this estimation, all observations in a bucket are assumed to + have the value of the mean of the bucket boundaries. For the zero bucket and for buckets with custom boundaries, + the arithmetic mean is used. For the usual exponential buckets, the geometric mean is used. Float samples are ignored and do not show up in the returned vector.

Similarly, histogram_stdvar(v instant-vector) returns the estimated standard variance of - observations for each histogram sample in v. + observations for each native histogram sample in v.

), histogram_sum: ( <>

- histogram_count(v instant-vector) returns the count of observations stored in each histogram sample - in v. Float samples are ignored and do not show up in the returned vector. + histogram_count(v instant-vector) returns the count of observations stored in each native histogram + sample in v. Float samples are ignored and do not show up in the returned vector.

- Similarly, histogram_sum(v instant-vector) returns the sum of observations stored in each histogram - sample. + Similarly, histogram_sum(v instant-vector) returns the sum of observations stored in each native + histogram sample.

From 4a400dc3df0f20c12f94f47eecaa72ffef0ae445 Mon Sep 17 00:00:00 2001 From: Linas Medziunas Date: Thu, 19 Mar 2026 16:10:23 +0200 Subject: [PATCH 68/73] fix(UI): autocomplete for first_over_time and ts_of_first_over_time Signed-off-by: Linas Medziunas --- .../codemirror-promql/src/complete/promql.terms.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/web/ui/module/codemirror-promql/src/complete/promql.terms.ts b/web/ui/module/codemirror-promql/src/complete/promql.terms.ts index 645b507855..7fb89bf062 100644 --- a/web/ui/module/codemirror-promql/src/complete/promql.terms.ts +++ b/web/ui/module/codemirror-promql/src/complete/promql.terms.ts @@ -317,10 +317,16 @@ export const functionIdentifierTerms = [ info: 'Join together label values into new label', type: 'function', }, + { + label: 'first_over_time', + detail: 'function', + info: 'Return the value of the oldest sample in the specified interval', + type: 'function', + }, { label: 'last_over_time', detail: 'function', - info: 'The most recent point value in specified interval.', + info: 'Return the value of the most recent sample in the specified interval', type: 'function', }, { @@ -371,6 +377,12 @@ export const functionIdentifierTerms = [ info: 'Return the timestamp of the minimum value over time for input series', type: 'function', }, + { + label: 'ts_of_first_over_time', + detail: 'function', + info: 'Return the timestamp of the first value over time for input series', + type: 'function', + }, { label: 'ts_of_last_over_time', detail: 'function', From 2129702dff404191c8fb07e916651430a9619dcf Mon Sep 17 00:00:00 2001 From: Domantas Date: Thu, 19 Mar 2026 16:31:20 +0200 Subject: [PATCH 69/73] perf(PromQL): allow inlining kahansum.Inc (#18319) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Domantas Jadenkus Signed-off-by: Domantas Co-authored-by: Linas Medžiūnas --- util/kahansum/kahansum.go | 20 +++++++++++++++++--- 1 
file changed, 17 insertions(+), 3 deletions(-) diff --git a/util/kahansum/kahansum.go b/util/kahansum/kahansum.go index d55defcb29..b9a02889b3 100644 --- a/util/kahansum/kahansum.go +++ b/util/kahansum/kahansum.go @@ -16,10 +16,21 @@ package kahansum import "math" // Inc performs addition of two floating-point numbers using the Kahan summation algorithm. -// We get incorrect results if this function is inlined; see https://github.com/prometheus/prometheus/issues/16714. -// -//go:noinline func Inc(inc, sum, c float64) (newSum, newC float64) { + // We've seen Kahan summation return less accurate results when Inc function is + // allowed to be inlined (see https://github.com/prometheus/prometheus/pull/16895). + // Go permits fusing float operations (e.g. using fused multiply-add, which allows + // calculating a*b+c without rounding the result of a*b to precision available in float64), + // and Kahan sum is sensitive to float rounding behavior. Instead of forbidding inlining + // (which only disallows fusing operations outside of Inc with operations happening inside) + // and eating the performance cost of non-inlined function calls, we forbid just the fusing + // across Inc call boundary. We can do that by explicitly requesting Inc arguments and results + // to be rounded to float64 precision, as documented in go spec (https://go.dev/ref/spec#Floating_point_operators). + // The following casts are not no-ops! 
+ inc = float64(inc) + sum = float64(sum) + c = float64(c) + t := sum + inc switch { case math.IsInf(t, 0): @@ -31,6 +42,9 @@ func Inc(inc, sum, c float64) (newSum, newC float64) { default: c += (inc - t) + sum } + + t = float64(t) + c = float64(c) return t, c } From e865bdd17298095b66319193e657a0725ae7c7d7 Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Thu, 19 Mar 2026 17:00:31 +0100 Subject: [PATCH 70/73] tsdb/chunkenc: avoid error allocation in readXOR2ControlFast and add decode tests Change readXOR2ControlFast to return (uint8, bool) instead of (uint8, error) to avoid allocating io.EOF on the fast path. Refactor encodeJoint to skip computing vbits when the value is a stale NaN. Add TestXOR2DecodeFunctionsAcrossPadding to exercise decodeValue, decodeValueKnownNonZero, and decodeNewLeadingTrailing across all 64 bit-buffer alignments. Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> --- tsdb/chunkenc/bstream.go | 16 +-- tsdb/chunkenc/xor2.go | 17 +-- tsdb/chunkenc/xor2_test.go | 248 +++++++++++++++++++++++++++++++++++++ 3 files changed, 265 insertions(+), 16 deletions(-) diff --git a/tsdb/chunkenc/bstream.go b/tsdb/chunkenc/bstream.go index b2e6e3f91f..2ac92b69c8 100644 --- a/tsdb/chunkenc/bstream.go +++ b/tsdb/chunkenc/bstream.go @@ -249,33 +249,33 @@ func (b *bstreamReader) ReadByte() (byte, error) { return byte(v), nil } -// readXOR2ControlFast is like readXOR2Control but returns io.EOF when the +// readXOR2ControlFast is like readXOR2Control but returns false when the // internal buffer has fewer than 4 valid bits, or when the control prefix // indicates cases 4 or 5 (top4 == 0xf). The caller should retry with // readXOR2Control. This function must be kept small and a leaf in order to // help the compiler inlining it and further improve performance. 
-func (b *bstreamReader) readXOR2ControlFast() (uint8, error) { +func (b *bstreamReader) readXOR2ControlFast() (uint8, bool) { if b.valid < 4 { - return 0, io.EOF + return 0, false } top4 := uint8((b.buffer >> (b.valid - 4)) & 0xf) if top4 < 8 { // '0xxx': dod=0, val=0 (case 0). b.valid-- - return 0, nil + return 0, true } if top4 < 12 { // '10xx': dod=0, val changed (case 1). b.valid -= 2 - return 1, nil + return 1, true } if top4 < 14 { // '110x': small dod (case 2). b.valid -= 3 - return 2, nil + return 2, true } if top4 == 14 { // '1110': medium dod (case 3). b.valid -= 4 - return 3, nil + return 3, true } - return 0, io.EOF + return 0, false } // readXOR2Control reads the XOR2 variable-length joint control prefix diff --git a/tsdb/chunkenc/xor2.go b/tsdb/chunkenc/xor2.go index ddfab67617..85db376ffb 100644 --- a/tsdb/chunkenc/xor2.go +++ b/tsdb/chunkenc/xor2.go @@ -300,13 +300,13 @@ func (a *xor2Appender) Append(st, t int64, v float64) { // samples >= 2. func (a *xor2Appender) encodeJoint(dod int64, v float64) { if dod == 0 { - vbits := math.Float64bits(v) ^ math.Float64bits(a.v) - switch { - case value.IsStaleNaN(v): + if value.IsStaleNaN(v) { a.b.writeBitsFast(0b11111, 5) - case vbits == 0: + return + } + if vbits := math.Float64bits(v) ^ math.Float64bits(a.v); vbits == 0 { a.b.writeBit(zero) - default: + } else { a.b.writeBitsFast(0b10, 2) a.writeVDeltaKnownNonZero(vbits) } @@ -369,7 +369,7 @@ func (a *xor2Appender) writeVDelta(v float64) { } // writeVDeltaKnownNonZero encodes a precomputed value XOR delta for the -// dod=0, value-changed case. delta must be non-zero; stale NaN with dod=0 is +// dod=0, value-changed case. delta must be non-zero. Stale NaN with dod=0 is // handled at the joint control level (`11111`) and never reaches this function. 
// // Encoding: @@ -555,8 +555,9 @@ func (it *xor2Iterator) Next() ValueType { prevT := it.t savedNumRead := it.numRead - ctrl, err := it.br.readXOR2ControlFast() - if err != nil { + ctrl, ok := it.br.readXOR2ControlFast() + if !ok { + var err error ctrl, err = it.br.readXOR2Control() if err != nil { it.err = err diff --git a/tsdb/chunkenc/xor2_test.go b/tsdb/chunkenc/xor2_test.go index f6a344f598..c0c1af8a1b 100644 --- a/tsdb/chunkenc/xor2_test.go +++ b/tsdb/chunkenc/xor2_test.go @@ -14,7 +14,9 @@ package chunkenc import ( + "fmt" "math" + "math/bits" "testing" "github.com/stretchr/testify/require" @@ -22,6 +24,55 @@ import ( "github.com/prometheus/prometheus/model/value" ) +func newXOR2IteratorForPayload(t *testing.T, padding int, payload func(*bstream), setup func(*xor2Iterator)) *xor2Iterator { + t.Helper() + + var bs bstream + if padding > 0 { + bs.writeBitsFast(0, padding) + } + payload(&bs) + // Add tail bytes so the reader initially fills a full 64-bit buffer. + bs.writeBitsFast(0, 64) + + it := &xor2Iterator{} + if setup != nil { + setup(it) + } + it.br = newBReader(bs.bytes()) + + if padding > 0 { + _, err := it.br.readBits(uint8(padding)) + require.NoError(t, err) + } + + return it +} + +func writeXOR2NewWindowPayload(bs *bstream, delta uint64) (leading, trailing uint8) { + leading, trailing, sigbits := xor2DeltaWindow(delta) + encodedSigbits := sigbits + if sigbits == 64 { + encodedSigbits = 0 + } + + bs.writeBitsFast(uint64(leading), 5) + bs.writeBitsFast(uint64(encodedSigbits), 6) + bs.writeBitsFast(delta>>trailing, int(sigbits)) + + return leading, trailing +} + +func xor2DeltaWindow(delta uint64) (leading, trailing, sigbits uint8) { + leading = uint8(bits.LeadingZeros64(delta)) + trailing = uint8(bits.TrailingZeros64(delta)) + if leading >= 32 { + leading = 31 + } + + return leading, trailing, 64 - leading - trailing +} + func BenchmarkXor2Write(b *testing.B) { samples := make([]struct { t int64 @@ -277,3 +328,200 @@ func 
TestXOR2Chunk_MoreThan127Samples(t *testing.T) { require.NoError(t, it.Err()) }) } + +// TestXOR2DecodeFunctionsAcrossPadding exercises decodeValue, +// decodeValueKnownNonZero, and decodeNewLeadingTrailing across all logical +// cases × all 64 bit-buffer alignments (padding 0..63). Padding controls the +// number of bits that precede the payload in the stream, which determines +// how many bits remain in the 64-bit read buffer when the decode function is +// called. This Cartesian product ensures both the fast path (enough bits +// buffered for a single-shot read) and the slow path (bits span a buffer +// refill) are exercised for every case. +func TestXOR2DecodeFunctionsAcrossPadding(t *testing.T) { + const baseline = 1234.5 + + type testCase struct { + name string + payload func(*bstream) + setup func(*xor2Iterator) + assert func(*testing.T, *xor2Iterator) + } + + runCases := func(t *testing.T, cases []testCase, fn func(*xor2Iterator) error) { + t.Helper() + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + for padding := range 64 { + t.Run(fmt.Sprintf("padding=%d", padding), func(t *testing.T) { + it := newXOR2IteratorForPayload(t, padding, tc.payload, tc.setup) + require.NoError(t, fn(it)) + tc.assert(t, it) + }) + } + }) + } + } + + // decodeValue: `0`=unchanged, `10`=reuse window, `110`=new window, `111`=stale NaN. + t.Run("decodeValue", func(t *testing.T) { + reuseD := uint64(0x000ABCDE000000) + rL, rT, rS := xor2DeltaWindow(reuseD) + + // Two new-window variants: full-width sigbits (encoded as 0) and small + // sigbits, to cover both value-bits read paths inside decodeNewLeadingTrailing. 
+ newDFull := uint64(0xFEDCBA9876543211) + nLFull, nTFull, _ := xor2DeltaWindow(newDFull) + newDSmall := uint64(0x000ABCDE000000) + nLSmall, nTSmall, _ := xor2DeltaWindow(newDSmall) + + runCases(t, []testCase{ + { + name: "unchanged", + payload: func(bs *bstream) { bs.writeBit(zero) }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + require.Equal(t, baseline, it.val) + require.Equal(t, baseline, it.baselineV) + }, + }, + { + name: "reuse_window", + payload: func(bs *bstream) { + bs.writeBitsFast(0b10, 2) + bs.writeBitsFast(reuseD>>rT, int(rS)) + }, + setup: func(it *xor2Iterator) { + it.baselineV = baseline + it.leading, it.trailing = rL, rT + }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ reuseD) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + require.Equal(t, rL, it.leading) + require.Equal(t, rT, it.trailing) + }, + }, + { + name: "new_window_full_sigbits", + payload: func(bs *bstream) { + bs.writeBitsFast(0b110, 3) + writeXOR2NewWindowPayload(bs, newDFull) + }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ newDFull) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + require.Equal(t, nLFull, it.leading) + require.Equal(t, nTFull, it.trailing) + }, + }, + { + name: "new_window_small_sigbits", + payload: func(bs *bstream) { + bs.writeBitsFast(0b110, 3) + writeXOR2NewWindowPayload(bs, newDSmall) + }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ newDSmall) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + require.Equal(t, nLSmall, it.leading) + require.Equal(t, nTSmall, it.trailing) + }, 
+ }, + { + name: "stale_nan", + payload: func(bs *bstream) { bs.writeBitsFast(0b111, 3) }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + require.True(t, value.IsStaleNaN(it.val)) + require.Equal(t, baseline, it.baselineV) + }, + }, + }, (*xor2Iterator).decodeValue) + }) + + // decodeValueKnownNonZero: `0`=reuse window, `1`=new window. + // The new_window case uses real leading/trailing (not 0xff) so that sz is + // small enough for the fast path (valid >= 1+sz) to be reached with ctrlBit=1. + t.Run("decodeValueKnownNonZero", func(t *testing.T) { + delta := uint64(0x000ABCDE000000) + dL, dT, dS := xor2DeltaWindow(delta) + + runCases(t, []testCase{ + { + name: "reuse_window", + payload: func(bs *bstream) { + bs.writeBit(zero) + bs.writeBitsFast(delta>>dT, int(dS)) + }, + setup: func(it *xor2Iterator) { + it.baselineV = baseline + it.leading, it.trailing = dL, dT + }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ delta) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + }, + }, + { + name: "new_window", + payload: func(bs *bstream) { + bs.writeBit(one) + writeXOR2NewWindowPayload(bs, delta) + }, + setup: func(it *xor2Iterator) { + it.baselineV = baseline + it.leading, it.trailing = dL, dT + }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ delta) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + require.Equal(t, dL, it.leading) + require.Equal(t, dT, it.trailing) + }, + }, + }, (*xor2Iterator).decodeValueKnownNonZero) + }) + + // decodeNewLeadingTrailing: exercises the 11-bit header fast path, the + // value-bits fast path (small sigbits), and full-width sigbits (encoded as 0). 
+ t.Run("decodeNewLeadingTrailing", func(t *testing.T) { + smallD := uint64(0x000ABCDE000000) + sL, sT, _ := xor2DeltaWindow(smallD) + fullD := uint64(0xFEDCBA9876543211) + fL, fT, _ := xor2DeltaWindow(fullD) + + runCases(t, []testCase{ + { + name: "small_sigbits", + payload: func(bs *bstream) { writeXOR2NewWindowPayload(bs, smallD) }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + require.Equal(t, sL, it.leading) + require.Equal(t, sT, it.trailing) + expected := math.Float64frombits(math.Float64bits(baseline) ^ smallD) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + }, + }, + { + name: "full_width_sigbits", + payload: func(bs *bstream) { writeXOR2NewWindowPayload(bs, fullD) }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + require.Equal(t, fL, it.leading) + require.Equal(t, fT, it.trailing) + expected := math.Float64frombits(math.Float64bits(baseline) ^ fullD) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + }, + }, + }, (*xor2Iterator).decodeNewLeadingTrailing) + }) +} From 1064c26da2177be7200584e7ab0548604c9d4d14 Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Fri, 27 Feb 2026 12:37:01 +0100 Subject: [PATCH 71/73] Log retention changes Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> --- cmd/prometheus/main.go | 25 +++++++++++++++++++++++-- config/config.go | 6 +++--- docs/configuration/configuration.md | 4 ++-- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index dfafd0902a..b036b0d316 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -1020,8 +1020,29 @@ func main() { reloaders := []reloader{ { - name: "db_storage", - reloader: localStorage.ApplyConfig, + name: "db_storage", + reloader: func() 
func(*config.Config) error { + lastTSDBRetention := config.TSDBRetentionConfig{} + return func(cfg *config.Config) error { + err := localStorage.ApplyConfig(cfg) + if err != nil || agentMode || cfg.StorageConfig.TSDBConfig == nil || cfg.StorageConfig.TSDBConfig.Retention == nil { + return err + } + + curr := cfg.StorageConfig.TSDBConfig.Retention + if *curr == lastTSDBRetention { + return nil + } + + logger.Info("TSDB retention updated", + "duration", curr.Time, + "size", curr.Size, + "percentage", curr.Percentage, + ) + lastTSDBRetention = *curr + return nil + } + }(), }, { name: "remote_storage", reloader: remoteStorage.ApplyConfig, diff --git a/config/config.go b/config/config.go index cb45347e41..469ffe10a6 100644 --- a/config/config.go +++ b/config/config.go @@ -283,10 +283,10 @@ var ( // For backwards compatibility. LabelNamePreserveMultipleUnderscores: true, } -) -// DefaultTSDBRetentionConfig is the default TSDB retention configuration. -var DefaultTSDBRetentionConfig TSDBRetentionConfig + // DefaultTSDBRetentionConfig is the default TSDB retention configuration. + DefaultTSDBRetentionConfig TSDBRetentionConfig +) // Config is the top-level configuration for Prometheus's config files. type Config struct { diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md index 060319ae92..4792d4fed2 100644 --- a/docs/configuration/configuration.md +++ b/docs/configuration/configuration.md @@ -3686,9 +3686,9 @@ with this feature. # or when a compaction completes, whichever comes first. [ retention: ] : # How long to retain samples in storage. If neither this option nor the size option - # is set, the retention time defaults to 15d. Units Supported: y, w, d, h, m, s, ms. + # is set, the retention time defaults to 15d. Setting this to 0 disables time-based retention. # This option takes precedence over the deprecated command-line flag --storage.tsdb.retention.time. 
- [ time: | default = 15d ] + [ time: ] # Maximum number of bytes that can be stored for blocks. A unit is required, # supported units: B, KB, MB, GB, TB, PB, EB. Ex: "512MB". Based on powers-of-2, so 1KB is 1024B. From 101ae7338030e5ee2e6b78fc60ebf4916e211f2b Mon Sep 17 00:00:00 2001 From: Ridwan Sharif Date: Fri, 20 Mar 2026 05:58:18 +0000 Subject: [PATCH 72/73] scrape: address comments on PR Signed-off-by: Ridwan Sharif --- scrape/manager.go | 8 +------- scrape/scrape.go | 6 +++--- scrape/scrape_test.go | 2 +- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/scrape/manager.go b/scrape/manager.go index 7a4a4463d9..e632b015d7 100644 --- a/scrape/manager.go +++ b/scrape/manager.go @@ -149,9 +149,8 @@ type Options struct { // because of an early startup scrape. InitialScrapeOffset time.Duration - // private options for testability. + // private option for testability. skipJitterOffsetting bool - offsetSeed uint64 } // Manager maintains a set of scrape pools and manages start/stop cycles @@ -270,11 +269,6 @@ func (m *Manager) reload() { // setOffsetSeed calculates a global offsetSeed per server relying on extra label set. func (m *Manager) setOffsetSeed(labels labels.Labels) error { - if m.opts.offsetSeed != 0 { - m.offsetSeed = m.opts.offsetSeed - return nil - } - h := fnv.New64a() hostname, err := osutil.GetFQDN() if err != nil { diff --git a/scrape/scrape.go b/scrape/scrape.go index 2866a7fa61..a0c5da10d6 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -1253,9 +1253,8 @@ func (sl *scrapeLoop) getScrapeOffset() time.Duration { func (sl *scrapeLoop) run(errc chan<- error) { var ( - last time.Time - alignedScrapeTime = time.Now().Round(0) - ticker = time.NewTicker(sl.interval) + last time.Time + ticker = time.NewTicker(sl.interval) ) defer func() { if sl.scrapeOnShutdown { @@ -1284,6 +1283,7 @@ func (sl *scrapeLoop) run(errc chan<- error) { // Reset the ticker so target scrape times are aligned to the offset+intervals. 
ticker.Reset(sl.interval) + alignedScrapeTime := time.Now().Round(0) for { select { diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index 9cb3adff45..37807cea98 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -6823,7 +6823,6 @@ func TestScrapeOffsetDistribution(t *testing.T) { app := teststorage.NewAppendable() opts := &Options{ - offsetSeed: 1, HTTPClientOptions: []config_util.HTTPClientOption{ config_util.WithDialContextFunc(func(ctx context.Context, _, _ string) (net.Conn, error) { srvConn, cliConn := net.Pipe() @@ -6839,6 +6838,7 @@ func TestScrapeOffsetDistribution(t *testing.T) { }, } scrapeManager, err := NewManager(opts, promslog.NewNopLogger(), nil, app, nil, prometheus.NewRegistry()) + scrapeManager.offsetSeed = 1 // Set a fixed offset seed for deterministic testing. require.NoError(t, err) var targets []model.LabelSet From 3b2b42f68104ad749eeece3ba4bd4e7133e2ce4d Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:42:03 +0100 Subject: [PATCH 73/73] tsdb/chunkenc: add writeBits benchmarks, clarify comments, and simplify encodeJoint Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> --- tsdb/chunkenc/bstream.go | 9 ++++---- tsdb/chunkenc/bstream_test.go | 39 +++++++++++++++++++++++++++++++++++ tsdb/chunkenc/xor2.go | 9 ++++---- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/tsdb/chunkenc/bstream.go b/tsdb/chunkenc/bstream.go index 2ac92b69c8..4fd37a140f 100644 --- a/tsdb/chunkenc/bstream.go +++ b/tsdb/chunkenc/bstream.go @@ -101,6 +101,7 @@ func (b *bstream) writeByte(byt byte) { // writeBits writes the nbits right-most bits of u to the stream // in left-to-right order. +// TODO: Once XOR2 stabilizes, replace writeBits with the writeBitsFast implementation and remove writeBitsFast. 
func (b *bstream) writeBits(u uint64, nbits int) { u <<= 64 - uint(nbits) for nbits >= 8 { @@ -368,9 +369,8 @@ func (b *bstreamReader) readXOR2Control() (uint8, error) { } // readUvarint decodes a varint-encoded uint64 using direct method calls, -// avoiding the io.ByteReader interface dispatch used by binary.ReadUvarint. -// This prevents interior pointer references on goroutine stacks that the GC -// must trace via findObject, reducing GC overhead. +// avoiding the io.ByteReader interface dispatch used by binary.ReadUvarint, +// which causes the receiver to escape to the heap. func (b *bstreamReader) readUvarint() (uint64, error) { var x uint64 var s uint @@ -389,7 +389,8 @@ func (b *bstreamReader) readUvarint() (uint64, error) { } // readVarint decodes a varint-encoded int64 using direct method calls, -// avoiding the io.ByteReader interface dispatch used by binary.ReadVarint. +// avoiding the io.ByteReader interface dispatch used by binary.ReadVarint, +// which causes the receiver to escape to the heap. func (b *bstreamReader) readVarint() (int64, error) { ux, err := b.readUvarint() x := int64(ux >> 1) diff --git a/tsdb/chunkenc/bstream_test.go b/tsdb/chunkenc/bstream_test.go index 3098be5945..0b6a0e9b35 100644 --- a/tsdb/chunkenc/bstream_test.go +++ b/tsdb/chunkenc/bstream_test.go @@ -14,6 +14,7 @@ package chunkenc import ( + "fmt" "testing" "github.com/stretchr/testify/require" @@ -32,6 +33,44 @@ func TestBstream_Reset(t *testing.T) { }, bs) } +// BenchmarkWriteBits benchmarks writeBits for various bit widths. 
+func BenchmarkWriteBits(b *testing.B) { + sizes := []int{1, 8, 17, 32, 52, 64} + for _, nbits := range sizes { + b.Run(fmt.Sprintf("nbits=%d", nbits), func(b *testing.B) { + b.ReportAllocs() + var bs bstream + bs.stream = make([]byte, 0, 1024) + for range b.N { + bs.stream = bs.stream[:0] + bs.count = 0 + for j := range 100 { + bs.writeBits(uint64(j), nbits) + } + } + }) + } +} + +// BenchmarkWriteBitsFast benchmarks writeBitsFast for various bit widths. +func BenchmarkWriteBitsFast(b *testing.B) { + sizes := []int{1, 8, 17, 32, 52, 64} + for _, nbits := range sizes { + b.Run(fmt.Sprintf("nbits=%d", nbits), func(b *testing.B) { + b.ReportAllocs() + var bs bstream + bs.stream = make([]byte, 0, 1024) + for range b.N { + bs.stream = bs.stream[:0] + bs.count = 0 + for j := range 100 { + bs.writeBitsFast(uint64(j), nbits) + } + } + }) + } +} + func TestBstreamReader(t *testing.T) { // Write to the bit stream. w := bstream{} diff --git a/tsdb/chunkenc/xor2.go b/tsdb/chunkenc/xor2.go index 85db376ffb..defe1e8102 100644 --- a/tsdb/chunkenc/xor2.go +++ b/tsdb/chunkenc/xor2.go @@ -304,12 +304,13 @@ func (a *xor2Appender) encodeJoint(dod int64, v float64) { a.b.writeBitsFast(0b11111, 5) return } - if vbits := math.Float64bits(v) ^ math.Float64bits(a.v); vbits == 0 { + vbits := math.Float64bits(v) ^ math.Float64bits(a.v) + if vbits == 0 { a.b.writeBit(zero) - } else { - a.b.writeBitsFast(0b10, 2) - a.writeVDeltaKnownNonZero(vbits) + return } + a.b.writeBitsFast(0b10, 2) + a.writeVDeltaKnownNonZero(vbits) return }