From 7176a6de916be45a173ca85d5c42062cfd912fdb Mon Sep 17 00:00:00 2001 From: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> Date: Thu, 19 Mar 2026 12:14:34 +0100 Subject: [PATCH] tsdb/chunkenc: port XOR2 performance improvements to ST-aware encoding Port the following optimizations from the roidelapluie/xor2 branch to the ST-aware XOR2 implementation on main: bstream.go: - Add writeBitsFast() as a writeBits variant that handles the partial last byte inline to avoid per-byte writeByte calls and writes complete bytes directly to the stream slice; used only by XOR2 to leave the shared writeBits unchanged for other encoders - Add readXOR2ControlFast() for inlinable hot-path control decoding that avoids buffer refills for the common 4-bit cases - Add readUvarint()/readVarint() methods that use direct method calls instead of io.ByteReader interface dispatch, reducing GC pressure from interior pointer references in findObject xor2.go: - Switch all writeBits calls to writeBitsFast - Use readXOR2ControlFast() + readXOR2Control() fallback in Next() - Use it.br.readVarint()/readUvarint() instead of binary.ReadVarint/ ReadUvarint to avoid GC overhead from interface dispatch - Add 3-bit fast path in decodeValue() to read the full value control prefix in one buffer peek rather than up to three separate bit reads - Add combined 1+sz bit fast path in decodeValueKnownNonZero() to fold the control bit and value bits into a single buffer operation - Add 11-bit combined read in decodeNewLeadingTrailing() to read leading (5 bits) and sigbits (6 bits) together - Pre-compute the value XOR delta in encodeJoint() and pass it to writeVDeltaKnownNonZero(delta uint64) to avoid recomputation Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com> --- tsdb/chunkenc/bstream.go | 95 +++++++++++++++++++++++++ tsdb/chunkenc/xor2.go | 149 ++++++++++++++++++++++++++++----------- 2 files changed, 202 insertions(+), 42 deletions(-) diff --git 
a/tsdb/chunkenc/bstream.go b/tsdb/chunkenc/bstream.go index ecface3099..b2e6e3f91f 100644 --- a/tsdb/chunkenc/bstream.go +++ b/tsdb/chunkenc/bstream.go @@ -117,6 +117,40 @@ func (b *bstream) writeBits(u uint64, nbits int) { } } +// writeBitsFast is like writeBits but handles the partial last byte inline to +// avoid per-byte writeByte calls, and writes complete bytes directly to the +// stream slice. +func (b *bstream) writeBitsFast(u uint64, nbits int) { + u <<= 64 - uint(nbits) + + // If the last byte is partial, fill its remaining bits first. + if b.count > 0 { + free := int(b.count) + last := len(b.stream) - 1 + b.stream[last] |= byte(u >> uint(64-free)) + if nbits < free { + b.count = uint8(free - nbits) + return + } + u <<= uint(free) + nbits -= free + b.count = 0 + } + + // Write complete bytes directly, avoiding per-byte function call overhead. + for nbits >= 8 { + b.stream = append(b.stream, byte(u>>56)) + u <<= 8 + nbits -= 8 + } + + // Write any remaining bits as a partial final byte. + if nbits > 0 { + b.stream = append(b.stream, byte(u>>56)) + b.count = uint8(8 - nbits) + } +} + type bstreamReader struct { stream []byte streamOffset int // The offset from which read the next byte from the stream. @@ -215,6 +249,35 @@ func (b *bstreamReader) ReadByte() (byte, error) { return byte(v), nil } +// readXOR2ControlFast is like readXOR2Control but returns io.EOF when the +// internal buffer has fewer than 4 valid bits, or when the control prefix +// indicates cases 4 or 5 (top4 == 0xf). The caller should retry with +// readXOR2Control. This function must be kept small and a leaf in order to +// help the compiler inline it and further improve performance. +func (b *bstreamReader) readXOR2ControlFast() (uint8, error) { + if b.valid < 4 { + return 0, io.EOF + } + top4 := uint8((b.buffer >> (b.valid - 4)) & 0xf) + if top4 < 8 { // '0xxx': dod=0, val=0 (case 0). + b.valid-- + return 0, nil + } + if top4 < 12 { // '10xx': dod=0, val changed (case 1). 
+ b.valid -= 2 + return 1, nil + } + if top4 < 14 { // '110x': small dod (case 2). + b.valid -= 3 + return 2, nil + } + if top4 == 14 { // '1110': medium dod (case 3). + b.valid -= 4 + return 3, nil + } + return 0, io.EOF +} + // readXOR2Control reads the XOR2 variable-length joint control prefix // and returns 0-5 mapping to the six encoding cases: // @@ -304,6 +367,38 @@ func (b *bstreamReader) readXOR2Control() (uint8, error) { return 5, nil } +// readUvarint decodes a varint-encoded uint64 using direct method calls, +// avoiding the io.ByteReader interface dispatch used by binary.ReadUvarint. +// This prevents interior pointer references on goroutine stacks that the GC +// must trace via findObject, reducing GC overhead. +func (b *bstreamReader) readUvarint() (uint64, error) { + var x uint64 + var s uint + for range binary.MaxVarintLen64 { + byt, err := b.ReadByte() + if err != nil { + return x, err + } + if byt < 0x80 { + return x | uint64(byt)<> 1) + if ux&1 != 0 { + x = ^x + } + return x, err +} + // loadNextBuffer loads the next bytes from the stream into the internal buffer. // The input nbits is the minimum number of bits that must be read, but the implementation // can read more (if possible) to improve performances. diff --git a/tsdb/chunkenc/xor2.go b/tsdb/chunkenc/xor2.go index 2d5cc2b542..ddfab67617 100644 --- a/tsdb/chunkenc/xor2.go +++ b/tsdb/chunkenc/xor2.go @@ -224,7 +224,7 @@ func (a *xor2Appender) Append(st, t int64, v float64) { for _, b := range buf[:binary.PutVarint(buf, t)] { a.b.writeByte(b) } - a.b.writeBits(math.Float64bits(v), 64) + a.b.writeBitsFast(math.Float64bits(v), 64) if st != 0 { for _, b := range buf[:binary.PutVarint(buf, t-st)] { @@ -300,14 +300,15 @@ func (a *xor2Appender) Append(st, t int64, v float64) { // samples >= 2. 
func (a *xor2Appender) encodeJoint(dod int64, v float64) { if dod == 0 { + vbits := math.Float64bits(v) ^ math.Float64bits(a.v) switch { case value.IsStaleNaN(v): - a.b.writeBits(0b11111, 5) - case math.Float64bits(v)^math.Float64bits(a.v) == 0: + a.b.writeBitsFast(0b11111, 5) + case vbits == 0: a.b.writeBit(zero) default: - a.b.writeBits(0b10, 2) - a.writeVDeltaKnownNonZero(v) + a.b.writeBitsFast(0b10, 2) + a.writeVDeltaKnownNonZero(vbits) } return } @@ -324,8 +325,8 @@ func (a *xor2Appender) encodeJoint(dod int64, v float64) { a.b.writeByte(byte(uint64(dod))) default: // 64-bit escape (rare): `11110`. - a.b.writeBits(0b11110, 5) - a.b.writeBits(uint64(dod), 64) + a.b.writeBitsFast(0b11110, 5) + a.b.writeBitsFast(uint64(dod), 64) } a.writeVDelta(v) } @@ -333,7 +334,7 @@ func (a *xor2Appender) encodeJoint(dod int64, v float64) { // writeVDelta encodes the value delta for the dod≠0 case. func (a *xor2Appender) writeVDelta(v float64) { if value.IsStaleNaN(v) { - a.b.writeBits(0b111, 3) + a.b.writeBitsFast(0b111, 3) return } @@ -352,26 +353,30 @@ func (a *xor2Appender) writeVDelta(v float64) { } if a.leading != 0xff && newLeading >= a.leading && newTrailing >= a.trailing { - a.b.writeBits(0b10, 2) - a.b.writeBits(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) + a.b.writeBitsFast(0b10, 2) + a.b.writeBitsFast(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) return } a.leading, a.trailing = newLeading, newTrailing - a.b.writeBits(0b110, 3) - a.b.writeBits(uint64(newLeading), 5) + a.b.writeBitsFast(0b110, 3) + a.b.writeBitsFast(uint64(newLeading), 5) sigbits := 64 - newLeading - newTrailing - a.b.writeBits(uint64(sigbits), 6) - a.b.writeBits(delta>>newTrailing, int(sigbits)) + a.b.writeBitsFast(uint64(sigbits), 6) + a.b.writeBitsFast(delta>>newTrailing, int(sigbits)) } -// writeVDeltaKnownNonZero encodes the value delta when it is known to be -// non-zero and non-stale (dod=0, value-changed case). 
-func (a *xor2Appender) writeVDeltaKnownNonZero(v float64) { - delta := math.Float64bits(v) ^ math.Float64bits(a.v) - +// writeVDeltaKnownNonZero encodes a precomputed value XOR delta for the +// dod=0, value-changed case. delta must be non-zero; stale NaN with dod=0 is +// handled at the joint control level (`11111`) and never reaches this function. +// +// Encoding: +// +// `0` → reuse previous leading/trailing window +// `1` → new leading/trailing window +func (a *xor2Appender) writeVDeltaKnownNonZero(delta uint64) { newLeading := uint8(bits.LeadingZeros64(delta)) newTrailing := uint8(bits.TrailingZeros64(delta)) @@ -381,18 +386,18 @@ func (a *xor2Appender) writeVDeltaKnownNonZero(v float64) { if a.leading != 0xff && newLeading >= a.leading && newTrailing >= a.trailing { a.b.writeBit(zero) - a.b.writeBits(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) + a.b.writeBitsFast(delta>>a.trailing, 64-int(a.leading)-int(a.trailing)) return } a.leading, a.trailing = newLeading, newTrailing a.b.writeBit(one) - a.b.writeBits(uint64(newLeading), 5) + a.b.writeBitsFast(uint64(newLeading), 5) sigbits := 64 - newLeading - newTrailing - a.b.writeBits(uint64(sigbits), 6) - a.b.writeBits(delta>>newTrailing, int(sigbits)) + a.b.writeBitsFast(uint64(sigbits), 6) + a.b.writeBitsFast(delta>>newTrailing, int(sigbits)) } func (*xor2Appender) AppendHistogram(*HistogramAppender, int64, int64, *histogram.Histogram, bool) (Chunk, bool, Appender, error) { @@ -486,7 +491,7 @@ func (it *xor2Iterator) Next() ValueType { } if it.numRead == 0 { - t, err := binary.ReadVarint(&it.br) + t, err := it.br.readVarint() if err != nil { it.err = err return ValNone @@ -504,7 +509,7 @@ func (it *xor2Iterator) Next() ValueType { // Optional ST for sample 0. 
if it.firstSTKnown { - stDiff, err := binary.ReadVarint(&it.br) + stDiff, err := it.br.readVarint() if err != nil { it.err = err return ValNone @@ -517,7 +522,7 @@ func (it *xor2Iterator) Next() ValueType { } if it.numRead == 1 { - tDelta, err := binary.ReadUvarint(&it.br) + tDelta, err := it.br.readUvarint() if err != nil { it.err = err return ValNone @@ -550,10 +555,13 @@ func (it *xor2Iterator) Next() ValueType { prevT := it.t savedNumRead := it.numRead - ctrl, err := it.br.readXOR2Control() + ctrl, err := it.br.readXOR2ControlFast() if err != nil { - it.err = err - return ValNone + ctrl, err = it.br.readXOR2Control() + if err != nil { + it.err = err + return ValNone + } } switch ctrl { @@ -654,6 +662,49 @@ func (it *xor2Iterator) readDod(w uint8) error { // `110` → new leading/trailing window // `111` → stale NaN func (it *xor2Iterator) decodeValue() error { + // Fast path: 3 bits available — read the full control prefix in one shot. + // Encoding: `0`=unchanged, `10`=reuse window, `110`=new window, `111`=stale NaN. + if it.br.valid >= 3 { + ctrl := (it.br.buffer >> (it.br.valid - 3)) & 0x7 + if ctrl&0x4 == 0 { + // `0xx`: value unchanged, consume 1 bit. + it.br.valid-- + it.val = it.baselineV + return nil + } + if ctrl&0x6 == 0x4 { + // `10x`: reuse previous leading/trailing window, consume 2 bits. + it.br.valid -= 2 + sz := uint8(64 - int(it.leading) - int(it.trailing)) + var valueBits uint64 + if it.br.valid >= sz { + it.br.valid -= sz + valueBits = (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1) + } else { + var err error + valueBits, err = it.br.readBits(sz) + if err != nil { + return err + } + } + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil + } + // `11x`: consume 3 bits. + it.br.valid -= 3 + if ctrl == 0x6 { + // `110`: new leading/trailing window. + return it.decodeNewLeadingTrailing() + } + // `111`: stale NaN. 
+ it.val = math.Float64frombits(value.StaleNaN) + return nil + } + + // Slow path: fewer than 3 bits buffered (rare, only near buffer refills). var bit bit if it.br.valid > 0 { it.br.valid-- @@ -731,6 +782,26 @@ func (it *xor2Iterator) decodeValue() error { // `0` → reuse previous leading/trailing window // `1` → new leading/trailing window func (it *xor2Iterator) decodeValueKnownNonZero() error { + sz := uint8(64 - int(it.leading) - int(it.trailing)) + // Fast path: combine the 1-bit reuse/new-window control read with the + // sz-bit value read into a single buffer operation. + if it.br.valid >= 1+sz { + ctrlBit := (it.br.buffer >> (it.br.valid - 1)) & 1 + if ctrlBit == 0 { // `0`: reuse previous leading/trailing window. + it.br.valid -= 1 + sz + valueBits := (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1) + vbits := math.Float64bits(it.baselineV) + vbits ^= valueBits << it.trailing + it.val = math.Float64frombits(vbits) + it.baselineV = it.val + return nil + } + // `1`: new leading/trailing window. + it.br.valid-- + return it.decodeNewLeadingTrailing() + } + + // Slow path: read control bit then value bits separately. var bit bit if it.br.valid > 0 { it.br.valid-- @@ -745,7 +816,6 @@ func (it *xor2Iterator) decodeValueKnownNonZero() error { if bit == zero { // `0` → reuse previous leading/trailing window. - sz := uint8(64 - int(it.leading) - int(it.trailing)) var valueBits uint64 if it.br.valid >= sz { it.br.valid -= sz @@ -771,24 +841,19 @@ func (it *xor2Iterator) decodeValueKnownNonZero() error { // decodeNewLeadingTrailing reads a new leading/sigbits/value triple and // updates it.leading, it.trailing, it.val, and it.baselineV. func (it *xor2Iterator) decodeNewLeadingTrailing() error { - var newLeading uint64 - if it.br.valid >= 5 { - it.br.valid -= 5 - newLeading = (it.br.buffer >> it.br.valid) & 0x1f + var newLeading, sigbits uint64 + // Fast path: read leading (5 bits) and sigbits (6 bits) together as 11 bits. 
+ if it.br.valid >= 11 { + val := (it.br.buffer >> (it.br.valid - 11)) & 0x7ff + it.br.valid -= 11 + newLeading = val >> 6 + sigbits = val & 0x3f } else { var err error newLeading, err = it.br.readBits(5) if err != nil { return err } - } - - var sigbits uint64 - if it.br.valid >= 6 { - it.br.valid -= 6 - sigbits = (it.br.buffer >> it.br.valid) & 0x3f - } else { - var err error sigbits, err = it.br.readBits(6) if err != nil { return err