tsdb/chunkenc: port XOR2 performance improvements to ST-aware encoding

Port the following optimizations from the roidelapluie/xor2 branch to
the ST-aware XOR2 implementation on main:

bstream.go:
- Add writeBitsFast() as a writeBits variant that handles the partial
  last byte inline to avoid per-byte writeByte calls and writes
  complete bytes directly to the stream slice; used only by XOR2 to
  leave the shared writeBits unchanged for other encoders
- Add readXOR2ControlFast() for inlinable hot-path control decoding
  that avoids buffer refills for the common 4-bit cases
- Add readUvarint()/readVarint() methods that use direct method calls
  instead of io.ByteReader interface dispatch, reducing GC pressure
  from interior pointer references in findObject

xor2.go:
- Switch all writeBits calls to writeBitsFast
- Use readXOR2ControlFast() + readXOR2Control() fallback in Next()
- Use it.br.readVarint()/readUvarint() instead of binary.ReadVarint/
  ReadUvarint to avoid GC overhead from interface dispatch
- Add 3-bit fast path in decodeValue() to read the full value control
  prefix in one buffer peek rather than up to three separate bit reads
- Add combined 1+sz bit fast path in decodeValueKnownNonZero() to
  fold the control bit and value bits into a single buffer operation
- Add 11-bit combined read in decodeNewLeadingTrailing() to read
  leading (5 bits) and sigbits (6 bits) together
- Pre-compute the value XOR delta in encodeJoint() and pass it to
  writeVDeltaKnownNonZero(delta uint64) to avoid recomputation

Signed-off-by: Julien Pivotto <291750+roidelapluie@users.noreply.github.com>
This commit is contained in:
Julien Pivotto 2026-03-19 12:14:34 +01:00
parent a9d90952ba
commit 7176a6de91
2 changed files with 202 additions and 42 deletions

View File

@ -117,6 +117,40 @@ func (b *bstream) writeBits(u uint64, nbits int) {
}
}
// writeBitsFast appends the nbits least significant bits of u to the stream,
// most significant bit first. It produces the same output as writeBits, but
// tops up a partially filled trailing byte with a single OR and appends whole
// bytes straight to b.stream rather than going through writeByte/writeBit.
//
// b.count is the number of still-unused low bits in the last byte of
// b.stream; zero means the last byte is full (or the stream is empty).
func (b *bstream) writeBitsFast(u uint64, nbits int) {
	// Left-align the payload so the next bit to emit always sits at bit 63.
	u <<= 64 - uint(nbits)

	if free := int(b.count); free > 0 {
		// The top `free` bits of u land in the unused low bits of the open byte.
		b.stream[len(b.stream)-1] |= byte(u >> uint(64-free))
		if nbits < free {
			// Everything fitted and the byte still has room left.
			b.count = uint8(free - nbits)
			return
		}
		// The open byte is now full; drop the bits that were just stored.
		u <<= uint(free)
		nbits -= free
		b.count = 0
	}

	// Emit every remaining whole byte with a plain append.
	for ; nbits >= 8; nbits -= 8 {
		b.stream = append(b.stream, byte(u>>56))
		u <<= 8
	}

	// Fewer than 8 bits left: start a new partial byte, high bits first.
	if nbits > 0 {
		b.stream = append(b.stream, byte(u>>56))
		b.count = uint8(8 - nbits)
	}
}
type bstreamReader struct {
stream []byte
streamOffset int // The offset from which read the next byte from the stream.
@ -215,6 +249,35 @@ func (b *bstreamReader) ReadByte() (byte, error) {
return byte(v), nil
}
// readXOR2ControlFast is the fast path for decoding the XOR2 joint control
// prefix. It only inspects the four most significant unread bits already held
// in b.buffer and never refills the buffer.
//
// On success it returns the case number and consumes exactly the bits of the
// matched prefix:
//
//	0xxx → 0 (1 bit consumed)
//	10xx → 1 (2 bits consumed)
//	110x → 2 (3 bits consumed)
//	1110 → 3 (4 bits consumed)
//
// It returns io.EOF without consuming anything when fewer than 4 valid bits
// are buffered, or when the four bits are 1111 (the prefix of cases 4 and 5).
// Here io.EOF only means "not handled by the fast path": the caller should
// retry with readXOR2Control.
//
// This function must be kept small and a leaf (no calls) so that the compiler
// can inline it, which further improves performance.
func (b *bstreamReader) readXOR2ControlFast() (uint8, error) {
if b.valid < 4 {
return 0, io.EOF
}
// Peek at the next four unread bits without consuming them.
top4 := uint8((b.buffer >> (b.valid - 4)) & 0xf)
if top4 < 8 { // '0xxx': dod=0, val=0 (case 0).
b.valid--
return 0, nil
}
if top4 < 12 { // '10xx': dod=0, val changed (case 1).
b.valid -= 2
return 1, nil
}
if top4 < 14 { // '110x': small dod (case 2).
b.valid -= 3
return 2, nil
}
if top4 == 14 { // '1110': medium dod (case 3).
b.valid -= 4
return 3, nil
}
// '1111': needs a fifth bit; left to readXOR2Control.
return 0, io.EOF
}
// readXOR2Control reads the XOR2 variable-length joint control prefix
// and returns 0-5 mapping to the six encoding cases:
//
@ -304,6 +367,38 @@ func (b *bstreamReader) readXOR2Control() (uint8, error) {
return 5, nil
}
// readUvarint decodes a varint-encoded uint64 from the stream using direct
// b.ReadByte calls, avoiding the io.ByteReader interface dispatch used by
// binary.ReadUvarint. This prevents interior pointer references on goroutine
// stacks that the GC must trace via findObject, reducing GC overhead.
//
// Error behavior mirrors binary.ReadUvarint:
//   - An error from ReadByte on the very first byte is returned unchanged
//     (io.EOF means no varint was present).
//   - io.EOF after at least one byte has been read is reported as
//     io.ErrUnexpectedEOF, because the varint was truncated.
//   - An encoding that does not fit in 64 bits (more than
//     binary.MaxVarintLen64 bytes, or a final 10th byte greater than 1) is
//     rejected with io.ErrUnexpectedEOF rather than being silently truncated.
//
// On error, the returned value holds the bits decoded so far and must not be
// relied upon.
func (b *bstreamReader) readUvarint() (uint64, error) {
	var (
		x uint64
		s uint
	)
	for i := 0; i < binary.MaxVarintLen64; i++ {
		byt, err := b.ReadByte()
		if err != nil {
			// Running out of input in the middle of a varint is a
			// truncation, not a clean end of stream.
			if i > 0 && err == io.EOF {
				err = io.ErrUnexpectedEOF
			}
			return x, err
		}
		if byt < 0x80 {
			// The 10th byte may only contribute bit 63; anything larger
			// would overflow a uint64.
			if i == binary.MaxVarintLen64-1 && byt > 1 {
				return x, io.ErrUnexpectedEOF
			}
			return x | uint64(byt)<<s, nil
		}
		x |= uint64(byt&0x7f) << s
		s += 7
	}
	// Ten continuation bytes in a row: the encoding is too long.
	return x, io.ErrUnexpectedEOF
}
// readVarint decodes a zig-zag varint-encoded int64 from the stream. It is
// built on b.readUvarint and therefore, unlike binary.ReadVarint, involves no
// io.ByteReader interface dispatch.
//
// Any error from readUvarint is returned as-is, together with the zig-zag
// decoding of whatever partial value readUvarint produced; the value is only
// meaningful when the error is nil.
func (b *bstreamReader) readVarint() (int64, error) {
	zz, err := b.readUvarint()
	// Zig-zag decode: the low bit carries the sign and the remaining bits the
	// magnitude. XOR with 0 (even) keeps the value, XOR with -1 (odd)
	// complements it.
	return int64(zz>>1) ^ -int64(zz&1), err
}
// loadNextBuffer loads the next bytes from the stream into the internal buffer.
// The input nbits is the minimum number of bits that must be read, but the implementation
// can read more (if possible) to improve performance.

View File

@ -224,7 +224,7 @@ func (a *xor2Appender) Append(st, t int64, v float64) {
for _, b := range buf[:binary.PutVarint(buf, t)] {
a.b.writeByte(b)
}
a.b.writeBits(math.Float64bits(v), 64)
a.b.writeBitsFast(math.Float64bits(v), 64)
if st != 0 {
for _, b := range buf[:binary.PutVarint(buf, t-st)] {
@ -300,14 +300,15 @@ func (a *xor2Appender) Append(st, t int64, v float64) {
// samples >= 2.
func (a *xor2Appender) encodeJoint(dod int64, v float64) {
if dod == 0 {
vbits := math.Float64bits(v) ^ math.Float64bits(a.v)
switch {
case value.IsStaleNaN(v):
a.b.writeBits(0b11111, 5)
case math.Float64bits(v)^math.Float64bits(a.v) == 0:
a.b.writeBitsFast(0b11111, 5)
case vbits == 0:
a.b.writeBit(zero)
default:
a.b.writeBits(0b10, 2)
a.writeVDeltaKnownNonZero(v)
a.b.writeBitsFast(0b10, 2)
a.writeVDeltaKnownNonZero(vbits)
}
return
}
@ -324,8 +325,8 @@ func (a *xor2Appender) encodeJoint(dod int64, v float64) {
a.b.writeByte(byte(uint64(dod)))
default:
// 64-bit escape (rare): `11110`.
a.b.writeBits(0b11110, 5)
a.b.writeBits(uint64(dod), 64)
a.b.writeBitsFast(0b11110, 5)
a.b.writeBitsFast(uint64(dod), 64)
}
a.writeVDelta(v)
}
@ -333,7 +334,7 @@ func (a *xor2Appender) encodeJoint(dod int64, v float64) {
// writeVDelta encodes the value delta for the dod≠0 case.
func (a *xor2Appender) writeVDelta(v float64) {
if value.IsStaleNaN(v) {
a.b.writeBits(0b111, 3)
a.b.writeBitsFast(0b111, 3)
return
}
@ -352,26 +353,30 @@ func (a *xor2Appender) writeVDelta(v float64) {
}
if a.leading != 0xff && newLeading >= a.leading && newTrailing >= a.trailing {
a.b.writeBits(0b10, 2)
a.b.writeBits(delta>>a.trailing, 64-int(a.leading)-int(a.trailing))
a.b.writeBitsFast(0b10, 2)
a.b.writeBitsFast(delta>>a.trailing, 64-int(a.leading)-int(a.trailing))
return
}
a.leading, a.trailing = newLeading, newTrailing
a.b.writeBits(0b110, 3)
a.b.writeBits(uint64(newLeading), 5)
a.b.writeBitsFast(0b110, 3)
a.b.writeBitsFast(uint64(newLeading), 5)
sigbits := 64 - newLeading - newTrailing
a.b.writeBits(uint64(sigbits), 6)
a.b.writeBits(delta>>newTrailing, int(sigbits))
a.b.writeBitsFast(uint64(sigbits), 6)
a.b.writeBitsFast(delta>>newTrailing, int(sigbits))
}
// writeVDeltaKnownNonZero encodes the value delta when it is known to be
// non-zero and non-stale (dod=0, value-changed case).
func (a *xor2Appender) writeVDeltaKnownNonZero(v float64) {
delta := math.Float64bits(v) ^ math.Float64bits(a.v)
// writeVDeltaKnownNonZero encodes a precomputed value XOR delta for the
// dod=0, value-changed case. delta must be non-zero; stale NaN with dod=0 is
// handled at the joint control level (`11111`) and never reaches this function.
//
// Encoding:
//
// `0` → reuse previous leading/trailing window
// `1` → new leading/trailing window
func (a *xor2Appender) writeVDeltaKnownNonZero(delta uint64) {
newLeading := uint8(bits.LeadingZeros64(delta))
newTrailing := uint8(bits.TrailingZeros64(delta))
@ -381,18 +386,18 @@ func (a *xor2Appender) writeVDeltaKnownNonZero(v float64) {
if a.leading != 0xff && newLeading >= a.leading && newTrailing >= a.trailing {
a.b.writeBit(zero)
a.b.writeBits(delta>>a.trailing, 64-int(a.leading)-int(a.trailing))
a.b.writeBitsFast(delta>>a.trailing, 64-int(a.leading)-int(a.trailing))
return
}
a.leading, a.trailing = newLeading, newTrailing
a.b.writeBit(one)
a.b.writeBits(uint64(newLeading), 5)
a.b.writeBitsFast(uint64(newLeading), 5)
sigbits := 64 - newLeading - newTrailing
a.b.writeBits(uint64(sigbits), 6)
a.b.writeBits(delta>>newTrailing, int(sigbits))
a.b.writeBitsFast(uint64(sigbits), 6)
a.b.writeBitsFast(delta>>newTrailing, int(sigbits))
}
func (*xor2Appender) AppendHistogram(*HistogramAppender, int64, int64, *histogram.Histogram, bool) (Chunk, bool, Appender, error) {
@ -486,7 +491,7 @@ func (it *xor2Iterator) Next() ValueType {
}
if it.numRead == 0 {
t, err := binary.ReadVarint(&it.br)
t, err := it.br.readVarint()
if err != nil {
it.err = err
return ValNone
@ -504,7 +509,7 @@ func (it *xor2Iterator) Next() ValueType {
// Optional ST for sample 0.
if it.firstSTKnown {
stDiff, err := binary.ReadVarint(&it.br)
stDiff, err := it.br.readVarint()
if err != nil {
it.err = err
return ValNone
@ -517,7 +522,7 @@ func (it *xor2Iterator) Next() ValueType {
}
if it.numRead == 1 {
tDelta, err := binary.ReadUvarint(&it.br)
tDelta, err := it.br.readUvarint()
if err != nil {
it.err = err
return ValNone
@ -550,10 +555,13 @@ func (it *xor2Iterator) Next() ValueType {
prevT := it.t
savedNumRead := it.numRead
ctrl, err := it.br.readXOR2Control()
ctrl, err := it.br.readXOR2ControlFast()
if err != nil {
it.err = err
return ValNone
ctrl, err = it.br.readXOR2Control()
if err != nil {
it.err = err
return ValNone
}
}
switch ctrl {
@ -654,6 +662,49 @@ func (it *xor2Iterator) readDod(w uint8) error {
// `110` → new leading/trailing window
// `111` → stale NaN
func (it *xor2Iterator) decodeValue() error {
// Fast path: 3 bits available — read the full control prefix in one shot.
// Encoding: `0`=unchanged, `10`=reuse window, `110`=new window, `111`=stale NaN.
if it.br.valid >= 3 {
ctrl := (it.br.buffer >> (it.br.valid - 3)) & 0x7
if ctrl&0x4 == 0 {
// `0xx`: value unchanged, consume 1 bit.
it.br.valid--
it.val = it.baselineV
return nil
}
if ctrl&0x6 == 0x4 {
// `10x`: reuse previous leading/trailing window, consume 2 bits.
it.br.valid -= 2
sz := uint8(64 - int(it.leading) - int(it.trailing))
var valueBits uint64
if it.br.valid >= sz {
it.br.valid -= sz
valueBits = (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1)
} else {
var err error
valueBits, err = it.br.readBits(sz)
if err != nil {
return err
}
}
vbits := math.Float64bits(it.baselineV)
vbits ^= valueBits << it.trailing
it.val = math.Float64frombits(vbits)
it.baselineV = it.val
return nil
}
// `11x`: consume 3 bits.
it.br.valid -= 3
if ctrl == 0x6 {
// `110`: new leading/trailing window.
return it.decodeNewLeadingTrailing()
}
// `111`: stale NaN.
it.val = math.Float64frombits(value.StaleNaN)
return nil
}
// Slow path: fewer than 3 bits buffered (rare, only near buffer refills).
var bit bit
if it.br.valid > 0 {
it.br.valid--
@ -731,6 +782,26 @@ func (it *xor2Iterator) decodeValue() error {
// `0` → reuse previous leading/trailing window
// `1` → new leading/trailing window
func (it *xor2Iterator) decodeValueKnownNonZero() error {
sz := uint8(64 - int(it.leading) - int(it.trailing))
// Fast path: combine the 1-bit reuse/new-window control read with the
// sz-bit value read into a single buffer operation.
if it.br.valid >= 1+sz {
ctrlBit := (it.br.buffer >> (it.br.valid - 1)) & 1
if ctrlBit == 0 { // `0`: reuse previous leading/trailing window.
it.br.valid -= 1 + sz
valueBits := (it.br.buffer >> it.br.valid) & ((uint64(1) << sz) - 1)
vbits := math.Float64bits(it.baselineV)
vbits ^= valueBits << it.trailing
it.val = math.Float64frombits(vbits)
it.baselineV = it.val
return nil
}
// `1`: new leading/trailing window.
it.br.valid--
return it.decodeNewLeadingTrailing()
}
// Slow path: read control bit then value bits separately.
var bit bit
if it.br.valid > 0 {
it.br.valid--
@ -745,7 +816,6 @@ func (it *xor2Iterator) decodeValueKnownNonZero() error {
if bit == zero {
// `0` → reuse previous leading/trailing window.
sz := uint8(64 - int(it.leading) - int(it.trailing))
var valueBits uint64
if it.br.valid >= sz {
it.br.valid -= sz
@ -771,24 +841,19 @@ func (it *xor2Iterator) decodeValueKnownNonZero() error {
// decodeNewLeadingTrailing reads a new leading/sigbits/value triple and
// updates it.leading, it.trailing, it.val, and it.baselineV.
func (it *xor2Iterator) decodeNewLeadingTrailing() error {
var newLeading uint64
if it.br.valid >= 5 {
it.br.valid -= 5
newLeading = (it.br.buffer >> it.br.valid) & 0x1f
var newLeading, sigbits uint64
// Fast path: read leading (5 bits) and sigbits (6 bits) together as 11 bits.
if it.br.valid >= 11 {
val := (it.br.buffer >> (it.br.valid - 11)) & 0x7ff
it.br.valid -= 11
newLeading = val >> 6
sigbits = val & 0x3f
} else {
var err error
newLeading, err = it.br.readBits(5)
if err != nil {
return err
}
}
var sigbits uint64
if it.br.valid >= 6 {
it.br.valid -= 6
sigbits = (it.br.buffer >> it.br.valid) & 0x3f
} else {
var err error
sigbits, err = it.br.readBits(6)
if err != nil {
return err