From 7176a6de916be45a173ca85d5c42062cfd912fdb Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Thu, 19 Mar 2026 12:14:34 +0100 Subject: [PATCH 1/3] tsdb/chunkenc: port XOR2 performance improvements to ST-aware encoding Port the following optimizations from the roidelapluie/xor2 branch to the ST-aware XOR2 implementation on main: bstream.go: - Add writeBitsFast() as a writeBits variant that handles the partial last byte inline to avoid per-byte writeByte calls and writes complete bytes directly to the stream slice; used only by XOR2 to leave the shared writeBits unchanged for other encoders - Add readXOR2ControlFast() for inlinable hot-path control decoding that avoids buffer refills for the common 4-bit cases - Add readUvarint()/readVarint() methods that use direct method calls instead of io.ByteReader interface dispatch, reducing GC pressure from interior pointer references in findObject xor2.go: - Switch all writeBits calls to writeBitsFast - Use readXOR2ControlFast() + readXOR2Control() fallback in Next() - Use it.br.readVarint()/readUvarint() instead of binary.ReadVarint/ ReadUvarint to avoid GC overhead from interface dispatch - Add 3-bit fast path in decodeValue() to read the full value control prefix in one buffer peek rather than up to three separate bit reads - Add combined 1+sz bit fast path in decodeValueKnownNonZero() to fold the control bit and value bits into a single buffer operation - Add 11-bit combined read in decodeNewLeadingTrailing() to read leading (5 bits) and sigbits (6 bits) together - Pre-compute the value XOR delta in encodeJoint() and pass it to writeVDeltaKnownNonZero(delta uint64) to avoid recomputation Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> --- tsdb/chunkenc/bstream.go | 95 +++++++++++++++++++++++++ tsdb/chunkenc/xor2.go | 149 ++++++++++++++++++++++++++++----------- 2 files changed, 202 insertions(+), 42 deletions(-) diff --git 
a/tsdb/chunkenc/bstream.go b/tsdb/chunkenc/bstream.go index ecface3099..b2e6e3f91f 100644 --- a/tsdb/chunkenc/bstream.go +++ b/tsdb/chunkenc/bstream.go @@ -117,6 +117,40 @@ func (b *bstream) writeBits(u uint64, nbits int) { } } +// writeBitsFast is like writeBits but handles the partial last byte inline to +// avoid per-byte writeByte calls, and writes complete bytes directly to the +// stream slice. +func (b *bstream) writeBitsFast(u uint64, nbits int) { + u <<= 64 - uint(nbits) + + // If the last byte is partial, fill its remaining bits first. + if b.count > 0 { + free := int(b.count) + last := len(b.stream) - 1 + b.stream[last] |= byte(u >> uint(64-free)) + if nbits < free { + b.count = uint8(free - nbits) + return + } + u <<= uint(free) + nbits -= free + b.count = 0 + } + + // Write complete bytes directly, avoiding per-byte function call overhead. + for nbits >= 8 { + b.stream = append(b.stream, byte(u>>56)) + u <<= 8 + nbits -= 8 + } + + // Write any remaining bits as a partial final byte. + if nbits > 0 { + b.stream = append(b.stream, byte(u>>56)) + b.count = uint8(8 - nbits) + } +} + type bstreamReader struct { stream []byte streamOffset int // The offset from which read the next byte from the stream. @@ -215,6 +249,35 @@ func (b *bstreamReader) ReadByte() (byte, error) { return byte(v), nil } +// readXOR2ControlFast is like readXOR2Control but returns io.EOF when the +// internal buffer has fewer than 4 valid bits, or when the control prefix +// indicates cases 4 or 5 (top4 == 0xf). The caller should retry with +// readXOR2Control. This function must be kept small and a leaf in order to +// help the compiler inlining it and further improve performance. +func (b *bstreamReader) readXOR2ControlFast() (uint8, error) { + if b.valid < 4 { + return 0, io.EOF + } + top4 := uint8((b.buffer >> (b.valid - 4)) & 0xf) + if top4 < 8 { // '0xxx': dod=0, val=0 (case 0). + b.valid-- + return 0, nil + } + if top4 < 12 { // '10xx': dod=0, val changed (case 1). 
+ b.valid -= 2 + return 1, nil + } + if top4 < 14 { // '110x': small dod (case 2). + b.valid -= 3 + return 2, nil + } + if top4 == 14 { // '1110': medium dod (case 3). + b.valid -= 4 + return 3, nil + } + return 0, io.EOF +} + // readXOR2Control reads the XOR2 variable-length joint control prefix // and returns 0-5 mapping to the six encoding cases: // @@ -304,6 +367,38 @@ func (b *bstreamReader) readXOR2Control() (uint8, error) { return 5, nil } +// readUvarint decodes a varint-encoded uint64 using direct method calls, +// avoiding the io.ByteReader interface dispatch used by binary.ReadUvarint. +// This prevents interior pointer references on goroutine stacks that the GC +// must trace via findObject, reducing GC overhead. +func (b *bstreamReader) readUvarint() (uint64, error) { + var x uint64 + var s uint + for range binary.MaxVarintLen64 { + byt, err := b.ReadByte() + if err != nil { + return x, err + } + if byt < 0x80 { + return x | uint64(byt)<<s, nil + } + x |= uint64(byt&0x7f) << s + s += 7 + } + return x, io.ErrUnexpectedEOF +} + +// readVarint decodes a varint-encoded int64 using direct method calls, +// avoiding the io.ByteReader interface dispatch used by binary.ReadVarint. +func (b *bstreamReader) readVarint() (int64, error) { + ux, err := b.readUvarint() + x := int64(ux >> 1) + if ux&1 != 0 { + x = ^x + } + return x, err +} + // loadNextBuffer loads the next bytes from the stream into the internal buffer. // The input nbits is the minimum number of bits that must be read, but the implementation // can read more (if possible) to improve performances. diff --git a/tsdb/chunkenc/xor2.go b/tsdb/chunkenc/xor2.go index 2d5cc2b542..ddfab67617 100644 --- a/tsdb/chunkenc/xor2.go +++ b/tsdb/chunkenc/xor2.go @@ -224,7 +224,7 @@ func (a *xor2Appender) Append(st, t int64, v float64) { for _, b := range buf[:binary.PutVarint(buf, t)] { a.b.writeByte(b) } - a.b.writeBits(math.Float64bits(v), 64) + a.b.writeBitsFast(math.Float64bits(v), 64) if st != 0 { for _, b := range buf[:binary.PutVarint(buf, t-st)] { @@ -300,14 +300,15 @@ func (a *xor2Appender) Append(st, t int64, v float64) { // samples >= 2. 
func (a *xor2Appender) encodeJoint(dod int64, v float64) { if dod == 0 { + vbits := math.Float64bits(v) ^ math.Float64bits(a.v) switch { case value.IsStaleNaN(v): - a.b.writeBits(0b11111, 5) - case math.Float64bits(v)^math.Float64bits(a.v) == 0: + a.b.writeBitsFast(0b11111, 5) + case vbits == 0: a.b.writeBit(zero) default: - a.b.writeBits(0b10, 2) - a.writeVDeltaKnownNonZero(v) + a.b.writeBitsFast(0b10, 2) + a.writeVDeltaKnownNonZero(vbits) } return } @@ -324,8 +325,8 @@ func (a *xor2Appender) encodeJoint(dod int64, v float64) { a.b.writeByte(byte(uint64(dod))) default: // 64-bit escape (rare): `11110`. - a.b.writeBits(0b11110, 5) - a.b.writeBits(uint64(dod), 64) + a.b.writeBitsFast(0b11110, 5) + a.b.writeBitsFast(uint64(dod), 64) } a.writeVDelta(v) } @@ -333,7 +334,7 @@ func (a *xor2Appender) encodeJoint(dod int64, v float64) { // writeVDelta encodes the value delta for the dod≠0 case. func (a *xor2Appender) writeVDelta(v float64) { if value.IsStaleNaN(v) { - a.b.writeBits(0b111, 3) + a.b.writeBitsFast(0b111, 3) return } @@ -352,26 +353,30 @@ func (a *xor2Appender) writeVDelta(v float64) { } if a.leading != 0xff && newLeading >= a.leading && newTrailing >= a.trailing { - a.b.writeBits(0b10, 2) - a.b.writeBits(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) + a.b.writeBitsFast(0b10, 2) + a.b.writeBitsFast(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) return } a.leading, a.trailing = newLeading, newTrailing - a.b.writeBits(0b110, 3) - a.b.writeBits(uint64(newLeading), 5) + a.b.writeBitsFast(0b110, 3) + a.b.writeBitsFast(uint64(newLeading), 5) sigbits := 64 - newLeading - newTrailing - a.b.writeBits(uint64(sigbits), 6) - a.b.writeBits(delta>>newTrailing, int(sigbits)) + a.b.writeBitsFast(uint64(sigbits), 6) + a.b.writeBitsFast(delta>>newTrailing, int(sigbits)) } -// writeVDeltaKnownNonZero encodes the value delta when it is known to be -// non-zero and non-stale (dod=0, value-changed case). 
-func (a *xor2Appender) writeVDeltaKnownNonZero(v float64) { - delta := math.Float64bits(v) ^ math.Float64bits(a.v) - +// writeVDeltaKnownNonZero encodes a precomputed value XOR delta for the +// dod=0, value-changed case. delta must be non-zero; stale NaN with dod=0 is +// handled at the joint control level (`11111`) and never reaches this function. +// +// Encoding: +// +// `0` → reuse previous leading/trailing window +// `1` → new leading/trailing window +func (a *xor2Appender) writeVDeltaKnownNonZero(delta uint64) { newLeading := uint8(bits.LeadingZeros64(delta)) newTrailing := uint8(bits.TrailingZeros64(delta)) @@ -381,18 +386,18 @@ func (a *xor2Appender) writeVDeltaKnownNonZero(v float64) { if a.leading != 0xff && newLeading >= a.leading && newTrailing >= a.trailing { a.b.writeBit(zero) - a.b.writeBits(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) + a.b.writeBitsFast(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) return } a.leading, a.trailing = newLeading, newTrailing a.b.writeBit(one) - a.b.writeBits(uint64(newLeading), 5) + a.b.writeBitsFast(uint64(newLeading), 5) sigbits := 64 - newLeading - newTrailing - a.b.writeBits(uint64(sigbits), 6) - a.b.writeBits(delta>>newTrailing, int(sigbits)) + a.b.writeBitsFast(uint64(sigbits), 6) + a.b.writeBitsFast(delta>>newTrailing, int(sigbits)) } func (*xor2Appender) AppendHistogram(*HistogramAppender, int64, int64, *histogram.Histogram, bool) (Chunk, bool, Appender, error) { @@ -486,7 +491,7 @@ func (it *xor2Iterator) Next() ValueType { } if it.numRead == 0 { - t, err := binary.ReadVarint(&it.br) + t, err := it.br.readVarint() if err != nil { it.err = err return ValNone @@ -504,7 +509,7 @@ func (it *xor2Iterator) Next() ValueType { // Optional ST for sample 0. 
if it.firstSTKnown { - stDiff, err := binary.ReadVarint(&it.br) + stDiff, err := it.br.readVarint() if err != nil { it.err = err return ValNone @@ -517,7 +522,7 @@ func (it *xor2Iterator) Next() ValueType { } if it.numRead == 1 { - tDelta, err := binary.ReadUvarint(&it.br) + tDelta, err := it.br.readUvarint() if err != nil { it.err = err return ValNone @@ -550,10 +555,13 @@ func (it *xor2Iterator) Next() ValueType { prevT := it.t savedNumRead := it.numRead - ctrl, err := it.br.readXOR2Control() + ctrl, err := it.br.readXOR2ControlFast() if err != nil { - it.err = err - return ValNone + ctrl, err = it.br.readXOR2Control() + if err != nil { + it.err = err + return ValNone + } } switch ctrl { @@ -654,6 +662,49 @@ func (it *xor2Iterator) readDod(w uint8) error { // `110` → new leading/trailing window // `111` → stale NaN func (it *xor2Iterator) decodeValue() error { + // Fast path: 3 bits available — read the full control prefix in one shot. + // Encoding: `0`=unchanged, `10`=reuse window, `110`=new window, `111`=stale NaN. + if it.br.valid >= 3 { + ctrl := (it.br.buffer >> (it.br.valid - 3)) & 0x7 + if ctrl&0x4 == 0 { + // `0xx`: value unchanged, consume 1 bit. + it.br.valid-- + it.val = it.baselineV + return nil + } + if ctrl&0x6 == 0x4 { + // `10x`: reuse previous leading/trailing window, consume 2 bits. + it.br.valid -= 2 + sz := uint8(64 - int(it.leading) - int(it.trailing)) + var valueBits uint64 + if it.br.valid >= sz { + it.br.valid -= sz + valueBits = (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1) + } else { + var err error + valueBits, err = it.br.readBits(sz) + if err != nil { + return err + } + } + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil + } + // `11x`: consume 3 bits. + it.br.valid -= 3 + if ctrl == 0x6 { + // `110`: new leading/trailing window. + return it.decodeNewLeadingTrailing() + } + // `111`: stale NaN. 
+ it.val = math.Float64frombits(value.StaleNaN) + return nil + } + + // Slow path: fewer than 3 bits buffered (rare, only near buffer refills). var bit bit if it.br.valid > 0 { it.br.valid-- @@ -731,6 +782,26 @@ func (it *xor2Iterator) decodeValue() error { // `0` → reuse previous leading/trailing window // `1` → new leading/trailing window func (it *xor2Iterator) decodeValueKnownNonZero() error { + sz := uint8(64 - int(it.leading) - int(it.trailing)) + // Fast path: combine the 1-bit reuse/new-window control read with the + // sz-bit value read into a single buffer operation. + if it.br.valid >= 1+sz { + ctrlBit := (it.br.buffer >> (it.br.valid - 1)) & 1 + if ctrlBit == 0 { // `0`: reuse previous leading/trailing window. + it.br.valid -= 1 + sz + valueBits := (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1) + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil + } + // `1`: new leading/trailing window. + it.br.valid-- + return it.decodeNewLeadingTrailing() + } + + // Slow path: read control bit then value bits separately. var bit bit if it.br.valid > 0 { it.br.valid-- @@ -745,7 +816,6 @@ func (it *xor2Iterator) decodeValueKnownNonZero() error { if bit == zero { // `0` → reuse previous leading/trailing window. - sz := uint8(64 - int(it.leading) - int(it.trailing)) var valueBits uint64 if it.br.valid >= sz { it.br.valid -= sz @@ -771,24 +841,19 @@ func (it *xor2Iterator) decodeValueKnownNonZero() error { // decodeNewLeadingTrailing reads a new leading/sigbits/value triple and // updates it.leading, it.trailing, it.val, and it.baselineV. func (it *xor2Iterator) decodeNewLeadingTrailing() error { - var newLeading uint64 - if it.br.valid >= 5 { - it.br.valid -= 5 - newLeading = (it.br.buffer >> it.br.valid) & 0x1f + var newLeading, sigbits uint64 + // Fast path: read leading (5 bits) and sigbits (6 bits) together as 11 bits. 
+ if it.br.valid >= 11 { + val := (it.br.buffer >> (it.br.valid - 11)) & 0x7ff + it.br.valid -= 11 + newLeading = val >> 6 + sigbits = val & 0x3f } else { var err error newLeading, err = it.br.readBits(5) if err != nil { return err } - } - - var sigbits uint64 - if it.br.valid >= 6 { - it.br.valid -= 6 - sigbits = (it.br.buffer >> it.br.valid) & 0x3f - } else { - var err error sigbits, err = it.br.readBits(6) if err != nil { return err From e865bdd17298095b66319193e657a0725ae7c7d7 Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Thu, 19 Mar 2026 17:00:31 +0100 Subject: [PATCH 2/3] tsdb/chunkenc: avoid error allocation in readXOR2ControlFast and add decode tests Change readXOR2ControlFast to return (uint8, bool) instead of (uint8, error) to avoid allocating io.EOF on the fast path. Refactor encodeJoint to skip computing vbits when the value is a stale NaN. Add TestXOR2DecodeFunctionsAcrossPadding to exercise decodeValue, decodeValueKnownNonZero, and decodeNewLeadingTrailing across all 64 bit-buffer alignments. Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> --- tsdb/chunkenc/bstream.go | 16 +-- tsdb/chunkenc/xor2.go | 17 +-- tsdb/chunkenc/xor2_test.go | 248 +++++++++++++++++++++++++++++++++++++ 3 files changed, 265 insertions(+), 16 deletions(-) diff --git a/tsdb/chunkenc/bstream.go b/tsdb/chunkenc/bstream.go index b2e6e3f91f..2ac92b69c8 100644 --- a/tsdb/chunkenc/bstream.go +++ b/tsdb/chunkenc/bstream.go @@ -249,33 +249,33 @@ func (b *bstreamReader) ReadByte() (byte, error) { return byte(v), nil } -// readXOR2ControlFast is like readXOR2Control but returns io.EOF when the +// readXOR2ControlFast is like readXOR2Control but returns false when the // internal buffer has fewer than 4 valid bits, or when the control prefix // indicates cases 4 or 5 (top4 == 0xf). The caller should retry with // readXOR2Control. 
This function must be kept small and a leaf in order to // help the compiler inlining it and further improve performance. -func (b *bstreamReader) readXOR2ControlFast() (uint8, error) { +func (b *bstreamReader) readXOR2ControlFast() (uint8, bool) { if b.valid < 4 { - return 0, io.EOF + return 0, false } top4 := uint8((b.buffer >> (b.valid - 4)) & 0xf) if top4 < 8 { // '0xxx': dod=0, val=0 (case 0). b.valid-- - return 0, nil + return 0, true } if top4 < 12 { // '10xx': dod=0, val changed (case 1). b.valid -= 2 - return 1, nil + return 1, true } if top4 < 14 { // '110x': small dod (case 2). b.valid -= 3 - return 2, nil + return 2, true } if top4 == 14 { // '1110': medium dod (case 3). b.valid -= 4 - return 3, nil + return 3, true } - return 0, io.EOF + return 0, false } // readXOR2Control reads the XOR2 variable-length joint control prefix diff --git a/tsdb/chunkenc/xor2.go b/tsdb/chunkenc/xor2.go index ddfab67617..85db376ffb 100644 --- a/tsdb/chunkenc/xor2.go +++ b/tsdb/chunkenc/xor2.go @@ -300,13 +300,13 @@ func (a *xor2Appender) Append(st, t int64, v float64) { // samples >= 2. func (a *xor2Appender) encodeJoint(dod int64, v float64) { if dod == 0 { - vbits := math.Float64bits(v) ^ math.Float64bits(a.v) - switch { - case value.IsStaleNaN(v): + if value.IsStaleNaN(v) { a.b.writeBitsFast(0b11111, 5) - case vbits == 0: + return + } + if vbits := math.Float64bits(v) ^ math.Float64bits(a.v); vbits == 0 { a.b.writeBit(zero) - default: + } else { a.b.writeBitsFast(0b10, 2) a.writeVDeltaKnownNonZero(vbits) } @@ -369,7 +369,7 @@ func (a *xor2Appender) writeVDelta(v float64) { } // writeVDeltaKnownNonZero encodes a precomputed value XOR delta for the -// dod=0, value-changed case. delta must be non-zero; stale NaN with dod=0 is +// dod=0, value-changed case. delta must be non-zero or staleNaN. Stale NaN with dod=0 is // handled at the joint control level (`11111`) and never reaches this function. 
// // Encoding: @@ -555,8 +555,9 @@ func (it *xor2Iterator) Next() ValueType { prevT := it.t savedNumRead := it.numRead - ctrl, err := it.br.readXOR2ControlFast() - if err != nil { + ctrl, ok := it.br.readXOR2ControlFast() + if !ok { + var err error ctrl, err = it.br.readXOR2Control() if err != nil { it.err = err diff --git a/tsdb/chunkenc/xor2_test.go b/tsdb/chunkenc/xor2_test.go index f6a344f598..c0c1af8a1b 100644 --- a/tsdb/chunkenc/xor2_test.go +++ b/tsdb/chunkenc/xor2_test.go @@ -14,7 +14,9 @@ package chunkenc import ( + "fmt" "math" + "math/bits" "testing" "github.com/stretchr/testify/require" @@ -22,6 +24,55 @@ import ( "github.com/prometheus/prometheus/model/value" ) +func newXOR2IteratorForPayload(t *testing.T, padding int, payload func(*bstream), setup func(*xor2Iterator)) *xor2Iterator { + t.Helper() + + var bs bstream + if padding > 0 { + bs.writeBitsFast(0, padding) + } + payload(&bs) + // Add tail bytes so the reader initially fills a full 64-bit buffer. + bs.writeBitsFast(0, 64) + + it := &xor2Iterator{} + if setup != nil { + setup(it) + } + it.br = newBReader(bs.bytes()) + + if padding > 0 { + _, err := it.br.readBits(uint8(padding)) + require.NoError(t, err) + } + + return it +} + +func writeXOR2NewWindowPayload(bs *bstream, delta uint64) (leading, trailing uint8) { + leading, trailing, sigbits := xor2DeltaWindow(delta) + encodedSigbits := sigbits + if sigbits == 64 { + encodedSigbits = 0 + } + + bs.writeBitsFast(uint64(leading), 5) + bs.writeBitsFast(uint64(encodedSigbits), 6) + bs.writeBitsFast(delta>>trailing, int(sigbits)) + + return leading, trailing +} + +func xor2DeltaWindow(delta uint64) (leading, trailing, sigbits uint8) { + leading = uint8(bits.LeadingZeros64(delta)) + trailing = uint8(bits.TrailingZeros64(delta)) + if leading >= 32 { + leading = 31 + } + + return leading, trailing, 64 - leading - trailing +} + func BenchmarkXor2Write(b *testing.B) { samples := make([]struct { t int64 @@ -277,3 +328,200 @@ func 
TestXOR2Chunk_MoreThan127Samples(t *testing.T) { require.NoError(t, it.Err()) }) } + +// TestXOR2DecodeFunctionsAcrossPadding exercises decodeValue, +// decodeValueKnownNonZero, and decodeNewLeadingTrailing across all logical +// cases × all 64 bit-buffer alignments (padding 0..63). Padding controls the +// number of bits that precede the payload in the stream, which determines +// how many bits remain in the 64-bit read buffer when the decode function is +// called. This Cartesian product ensures both the fast path (enough bits +// buffered for a single-shot read) and the slow path (bits span a buffer +// refill) are exercised for every case. +func TestXOR2DecodeFunctionsAcrossPadding(t *testing.T) { + const baseline = 1234.5 + + type testCase struct { + name string + payload func(*bstream) + setup func(*xor2Iterator) + assert func(*testing.T, *xor2Iterator) + } + + runCases := func(t *testing.T, cases []testCase, fn func(*xor2Iterator) error) { + t.Helper() + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + for padding := range 64 { + t.Run(fmt.Sprintf("padding=%d", padding), func(t *testing.T) { + it := newXOR2IteratorForPayload(t, padding, tc.payload, tc.setup) + require.NoError(t, fn(it)) + tc.assert(t, it) + }) + } + }) + } + } + + // decodeValue: `0`=unchanged, `10`=reuse window, `110`=new window, `111`=stale NaN. + t.Run("decodeValue", func(t *testing.T) { + reuseD := uint64(0x000ABCDE000000) + rL, rT, rS := xor2DeltaWindow(reuseD) + + // Two new-window variants: full-width sigbits (encoded as 0) and small + // sigbits, to cover both value-bits read paths inside decodeNewLeadingTrailing. 
+ newDFull := uint64(0xFEDCBA9876543211) + nLFull, nTFull, _ := xor2DeltaWindow(newDFull) + newDSmall := uint64(0x000ABCDE000000) + nLSmall, nTSmall, _ := xor2DeltaWindow(newDSmall) + + runCases(t, []testCase{ + { + name: "unchanged", + payload: func(bs *bstream) { bs.writeBit(zero) }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + require.Equal(t, baseline, it.val) + require.Equal(t, baseline, it.baselineV) + }, + }, + { + name: "reuse_window", + payload: func(bs *bstream) { + bs.writeBitsFast(0b10, 2) + bs.writeBitsFast(reuseD>>rT, int(rS)) + }, + setup: func(it *xor2Iterator) { + it.baselineV = baseline + it.leading, it.trailing = rL, rT + }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ reuseD) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + require.Equal(t, rL, it.leading) + require.Equal(t, rT, it.trailing) + }, + }, + { + name: "new_window_full_sigbits", + payload: func(bs *bstream) { + bs.writeBitsFast(0b110, 3) + writeXOR2NewWindowPayload(bs, newDFull) + }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ newDFull) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + require.Equal(t, nLFull, it.leading) + require.Equal(t, nTFull, it.trailing) + }, + }, + { + name: "new_window_small_sigbits", + payload: func(bs *bstream) { + bs.writeBitsFast(0b110, 3) + writeXOR2NewWindowPayload(bs, newDSmall) + }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ newDSmall) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + require.Equal(t, nLSmall, it.leading) + require.Equal(t, nTSmall, it.trailing) + }, 
+ }, + { + name: "stale_nan", + payload: func(bs *bstream) { bs.writeBitsFast(0b111, 3) }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + require.True(t, value.IsStaleNaN(it.val)) + require.Equal(t, baseline, it.baselineV) + }, + }, + }, (*xor2Iterator).decodeValue) + }) + + // decodeValueKnownNonZero: `0`=reuse window, `1`=new window. + // The new_window case uses real leading/trailing (not 0xff) so that sz is + // small enough for the fast path (valid >= 1+sz) to be reached with ctrlBit=1. + t.Run("decodeValueKnownNonZero", func(t *testing.T) { + delta := uint64(0x000ABCDE000000) + dL, dT, dS := xor2DeltaWindow(delta) + + runCases(t, []testCase{ + { + name: "reuse_window", + payload: func(bs *bstream) { + bs.writeBit(zero) + bs.writeBitsFast(delta>>dT, int(dS)) + }, + setup: func(it *xor2Iterator) { + it.baselineV = baseline + it.leading, it.trailing = dL, dT + }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ delta) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + }, + }, + { + name: "new_window", + payload: func(bs *bstream) { + bs.writeBit(one) + writeXOR2NewWindowPayload(bs, delta) + }, + setup: func(it *xor2Iterator) { + it.baselineV = baseline + it.leading, it.trailing = dL, dT + }, + assert: func(t *testing.T, it *xor2Iterator) { + expected := math.Float64frombits(math.Float64bits(baseline) ^ delta) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + require.Equal(t, dL, it.leading) + require.Equal(t, dT, it.trailing) + }, + }, + }, (*xor2Iterator).decodeValueKnownNonZero) + }) + + // decodeNewLeadingTrailing: exercises the 11-bit header fast path, the + // value-bits fast path (small sigbits), and full-width sigbits (encoded as 0). 
+ t.Run("decodeNewLeadingTrailing", func(t *testing.T) { + smallD := uint64(0x000ABCDE000000) + sL, sT, _ := xor2DeltaWindow(smallD) + fullD := uint64(0xFEDCBA9876543211) + fL, fT, _ := xor2DeltaWindow(fullD) + + runCases(t, []testCase{ + { + name: "small_sigbits", + payload: func(bs *bstream) { writeXOR2NewWindowPayload(bs, smallD) }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + require.Equal(t, sL, it.leading) + require.Equal(t, sT, it.trailing) + expected := math.Float64frombits(math.Float64bits(baseline) ^ smallD) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + }, + }, + { + name: "full_width_sigbits", + payload: func(bs *bstream) { writeXOR2NewWindowPayload(bs, fullD) }, + setup: func(it *xor2Iterator) { it.baselineV = baseline }, + assert: func(t *testing.T, it *xor2Iterator) { + require.Equal(t, fL, it.leading) + require.Equal(t, fT, it.trailing) + expected := math.Float64frombits(math.Float64bits(baseline) ^ fullD) + require.Equal(t, expected, it.val) + require.Equal(t, expected, it.baselineV) + }, + }, + }, (*xor2Iterator).decodeNewLeadingTrailing) + }) +} From 3b2b42f68104ad749eeece3ba4bd4e7133e2ce4d Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:42:03 +0100 Subject: [PATCH 3/3] tsdb/chunkenc: add writeBits benchmarks, clarify comments, and simplify encodeJoint Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> --- tsdb/chunkenc/bstream.go | 9 ++++---- tsdb/chunkenc/bstream_test.go | 39 +++++++++++++++++++++++++++++++++++ tsdb/chunkenc/xor2.go | 9 ++++---- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/tsdb/chunkenc/bstream.go b/tsdb/chunkenc/bstream.go index 2ac92b69c8..4fd37a140f 100644 --- a/tsdb/chunkenc/bstream.go +++ b/tsdb/chunkenc/bstream.go @@ -101,6 +101,7 @@ func (b *bstream) writeByte(byt byte) { // writeBits writes the nbits 
right-most bits of u to the stream // in left-to-right order. +// TODO: Once XOR2 stabilizes, replace writeBits with the writeBitsFast implementation and remove writeBitsFast. func (b *bstream) writeBits(u uint64, nbits int) { u <<= 64 - uint(nbits) for nbits >= 8 { @@ -368,9 +369,8 @@ func (b *bstreamReader) readXOR2Control() (uint8, error) { } // readUvarint decodes a varint-encoded uint64 using direct method calls, -// avoiding the io.ByteReader interface dispatch used by binary.ReadUvarint. -// This prevents interior pointer references on goroutine stacks that the GC -// must trace via findObject, reducing GC overhead. +// avoiding the io.ByteReader interface dispatch used by binary.ReadUvarint, +// which causes the receiver to escape to the heap. func (b *bstreamReader) readUvarint() (uint64, error) { var x uint64 var s uint @@ -389,7 +389,8 @@ func (b *bstreamReader) readUvarint() (uint64, error) { } // readVarint decodes a varint-encoded int64 using direct method calls, -// avoiding the io.ByteReader interface dispatch used by binary.ReadVarint. +// avoiding the io.ByteReader interface dispatch used by binary.ReadVarint, +// which causes the receiver to escape to the heap. func (b *bstreamReader) readVarint() (int64, error) { ux, err := b.readUvarint() x := int64(ux >> 1) diff --git a/tsdb/chunkenc/bstream_test.go b/tsdb/chunkenc/bstream_test.go index 3098be5945..0b6a0e9b35 100644 --- a/tsdb/chunkenc/bstream_test.go +++ b/tsdb/chunkenc/bstream_test.go @@ -14,6 +14,7 @@ package chunkenc import ( + "fmt" "testing" "github.com/stretchr/testify/require" @@ -32,6 +33,44 @@ func TestBstream_Reset(t *testing.T) { }, bs) } +// BenchmarkWriteBits benchmarks writeBits for various bit widths. 
+func BenchmarkWriteBits(b *testing.B) { + sizes := []int{1, 8, 17, 32, 52, 64} + for _, nbits := range sizes { + b.Run(fmt.Sprintf("nbits=%d", nbits), func(b *testing.B) { + b.ReportAllocs() + var bs bstream + bs.stream = make([]byte, 0, 1024) + for range b.N { + bs.stream = bs.stream[:0] + bs.count = 0 + for j := range 100 { + bs.writeBits(uint64(j), nbits) + } + } + }) + } +} + +// BenchmarkWriteBitsFast benchmarks writeBitsFast for various bit widths. +func BenchmarkWriteBitsFast(b *testing.B) { + sizes := []int{1, 8, 17, 32, 52, 64} + for _, nbits := range sizes { + b.Run(fmt.Sprintf("nbits=%d", nbits), func(b *testing.B) { + b.ReportAllocs() + var bs bstream + bs.stream = make([]byte, 0, 1024) + for range b.N { + bs.stream = bs.stream[:0] + bs.count = 0 + for j := range 100 { + bs.writeBitsFast(uint64(j), nbits) + } + } + }) + } +} + func TestBstreamReader(t *testing.T) { // Write to the bit stream. w := bstream{} diff --git a/tsdb/chunkenc/xor2.go b/tsdb/chunkenc/xor2.go index 85db376ffb..defe1e8102 100644 --- a/tsdb/chunkenc/xor2.go +++ b/tsdb/chunkenc/xor2.go @@ -304,12 +304,13 @@ func (a *xor2Appender) encodeJoint(dod int64, v float64) { a.b.writeBitsFast(0b11111, 5) return } - if vbits := math.Float64bits(v) ^ math.Float64bits(a.v); vbits == 0 { + vbits := math.Float64bits(v) ^ math.Float64bits(a.v) + if vbits == 0 { a.b.writeBit(zero) - } else { - a.b.writeBitsFast(0b10, 2) - a.writeVDeltaKnownNonZero(vbits) + return } + a.b.writeBitsFast(0b10, 2) + a.writeVDeltaKnownNonZero(vbits) return }