From a007eb2e1eeb3e42d504adb9bcd1e5cca804d524 Mon Sep 17 00:00:00 2001 From: Fabian Reinartz Date: Thu, 7 Sep 2017 14:14:33 +0200 Subject: [PATCH] vendor: update prometheus/tsdb to single head mode --- vendor/github.com/go-kit/kit/log/term/LICENSE | 21 + .../go-kit/kit/log/term/colorlogger.go | 144 ++ .../go-kit/kit/log/term/colorwriter_others.go | 12 + .../kit/log/term/colorwriter_windows.go | 190 +++ vendor/github.com/go-kit/kit/log/term/term.go | 22 + .../go-kit/kit/log/term/terminal_appengine.go | 15 + .../go-kit/kit/log/term/terminal_darwin.go | 10 + .../go-kit/kit/log/term/terminal_freebsd.go | 7 + .../go-kit/kit/log/term/terminal_linux.go | 12 + .../kit/log/term/terminal_notwindows.go | 25 + .../go-kit/kit/log/term/terminal_openbsd.go | 5 + .../go-kit/kit/log/term/terminal_windows.go | 102 ++ vendor/github.com/prometheus/tsdb/block.go | 49 +- vendor/github.com/prometheus/tsdb/chunks.go | 11 +- vendor/github.com/prometheus/tsdb/compact.go | 266 ++-- vendor/github.com/prometheus/tsdb/db.go | 557 ++----- .../prometheus/tsdb/encoding_helpers.go | 2 +- vendor/github.com/prometheus/tsdb/head.go | 1300 +++++++++++------ vendor/github.com/prometheus/tsdb/index.go | 20 +- vendor/github.com/prometheus/tsdb/pool.go | 79 + vendor/github.com/prometheus/tsdb/postings.go | 74 +- vendor/github.com/prometheus/tsdb/querier.go | 73 +- .../github.com/prometheus/tsdb/tombstones.go | 79 +- vendor/github.com/prometheus/tsdb/wal.go | 951 +++++++----- vendor/vendor.json | 14 +- 25 files changed, 2562 insertions(+), 1478 deletions(-) create mode 100644 vendor/github.com/go-kit/kit/log/term/LICENSE create mode 100644 vendor/github.com/go-kit/kit/log/term/colorlogger.go create mode 100644 vendor/github.com/go-kit/kit/log/term/colorwriter_others.go create mode 100644 vendor/github.com/go-kit/kit/log/term/colorwriter_windows.go create mode 100644 vendor/github.com/go-kit/kit/log/term/term.go create mode 100644 vendor/github.com/go-kit/kit/log/term/terminal_appengine.go create mode 100644 vendor/github.com/go-kit/kit/log/term/terminal_darwin.go create mode 100644 vendor/github.com/go-kit/kit/log/term/terminal_freebsd.go create mode 100644 vendor/github.com/go-kit/kit/log/term/terminal_linux.go create mode 100644 vendor/github.com/go-kit/kit/log/term/terminal_notwindows.go create mode 100644 vendor/github.com/go-kit/kit/log/term/terminal_openbsd.go create mode 100644 vendor/github.com/go-kit/kit/log/term/terminal_windows.go create mode 100644 vendor/github.com/prometheus/tsdb/pool.go diff --git a/vendor/github.com/go-kit/kit/log/term/LICENSE b/vendor/github.com/go-kit/kit/log/term/LICENSE new file mode 100644 index 0000000000..f090cb42f3 --- /dev/null +++ b/vendor/github.com/go-kit/kit/log/term/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 Simon Eskildsen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/github.com/go-kit/kit/log/term/colorlogger.go b/vendor/github.com/go-kit/kit/log/term/colorlogger.go new file mode 100644 index 0000000000..00376ce0e7 --- /dev/null +++ b/vendor/github.com/go-kit/kit/log/term/colorlogger.go @@ -0,0 +1,144 @@ +package term + +import ( + "bytes" + "fmt" + "io" + "sync" + + "github.com/go-kit/kit/log" +) + +// Color represents an ANSI color. The zero value is Default. +type Color uint8 + +// ANSI colors. +const ( + Default = Color(iota) + + Black + DarkRed + DarkGreen + Brown + DarkBlue + DarkMagenta + DarkCyan + Gray + + DarkGray + Red + Green + Yellow + Blue + Magenta + Cyan + White + + numColors +) + +// For more on ANSI escape codes see +// https://en.wikipedia.org/wiki/ANSI_escape_code. See in particular +// https://en.wikipedia.org/wiki/ANSI_escape_code#Colors. + +var ( + resetColorBytes = []byte("\x1b[39;49;22m") + fgColorBytes [][]byte + bgColorBytes [][]byte +) + +func init() { + // Default + fgColorBytes = append(fgColorBytes, []byte("\x1b[39m")) + bgColorBytes = append(bgColorBytes, []byte("\x1b[49m")) + + // dark colors + for color := Black; color < DarkGray; color++ { + fgColorBytes = append(fgColorBytes, []byte(fmt.Sprintf("\x1b[%dm", 30+color-Black))) + bgColorBytes = append(bgColorBytes, []byte(fmt.Sprintf("\x1b[%dm", 40+color-Black))) + } + + // bright colors + for color := DarkGray; color < numColors; color++ { + fgColorBytes = append(fgColorBytes, []byte(fmt.Sprintf("\x1b[%d;1m", 30+color-DarkGray))) + bgColorBytes = append(bgColorBytes, []byte(fmt.Sprintf("\x1b[%d;1m", 40+color-DarkGray))) + } +} + +// FgBgColor represents a foreground and background color. +type FgBgColor struct { + Fg, Bg Color +} + +func (c FgBgColor) isZero() bool { + return c.Fg == Default && c.Bg == Default +} + +// NewColorLogger returns a Logger which writes colored logs to w. ANSI color +// codes for the colors returned by color are added to the formatted output +// from the Logger returned by newLogger and the combined result written to w. +func NewColorLogger(w io.Writer, newLogger func(io.Writer) log.Logger, color func(keyvals ...interface{}) FgBgColor) log.Logger { + if color == nil { + panic("color func nil") + } + return &colorLogger{ + w: w, + newLogger: newLogger, + color: color, + bufPool: sync.Pool{New: func() interface{} { return &loggerBuf{} }}, + noColorLogger: newLogger(w), + } +} + +type colorLogger struct { + w io.Writer + newLogger func(io.Writer) log.Logger + color func(keyvals ...interface{}) FgBgColor + bufPool sync.Pool + noColorLogger log.Logger +} + +func (l *colorLogger) Log(keyvals ...interface{}) error { + color := l.color(keyvals...) + if color.isZero() { + return l.noColorLogger.Log(keyvals...) + } + + lb := l.getLoggerBuf() + defer l.putLoggerBuf(lb) + if color.Fg != Default { + lb.buf.Write(fgColorBytes[color.Fg]) + } + if color.Bg != Default { + lb.buf.Write(bgColorBytes[color.Bg]) + } + err := lb.logger.Log(keyvals...) 
+ if err != nil { + return err + } + if color.Fg != Default || color.Bg != Default { + lb.buf.Write(resetColorBytes) + } + _, err = io.Copy(l.w, lb.buf) + return err +} + +type loggerBuf struct { + buf *bytes.Buffer + logger log.Logger +} + +func (l *colorLogger) getLoggerBuf() *loggerBuf { + lb := l.bufPool.Get().(*loggerBuf) + if lb.buf == nil { + lb.buf = &bytes.Buffer{} + lb.logger = l.newLogger(lb.buf) + } else { + lb.buf.Reset() + } + return lb +} + +func (l *colorLogger) putLoggerBuf(cb *loggerBuf) { + l.bufPool.Put(cb) +} diff --git a/vendor/github.com/go-kit/kit/log/term/colorwriter_others.go b/vendor/github.com/go-kit/kit/log/term/colorwriter_others.go new file mode 100644 index 0000000000..cc571024b1 --- /dev/null +++ b/vendor/github.com/go-kit/kit/log/term/colorwriter_others.go @@ -0,0 +1,12 @@ +// +build !windows + +package term + +import "io" + +// NewColorWriter returns an io.Writer that writes to w and provides cross +// platform support for ANSI color codes. If w is not a terminal it is +// returned unmodified. +func NewColorWriter(w io.Writer) io.Writer { + return w +} diff --git a/vendor/github.com/go-kit/kit/log/term/colorwriter_windows.go b/vendor/github.com/go-kit/kit/log/term/colorwriter_windows.go new file mode 100644 index 0000000000..fcacda3a63 --- /dev/null +++ b/vendor/github.com/go-kit/kit/log/term/colorwriter_windows.go @@ -0,0 +1,190 @@ +// The code in this file is adapted from github.com/mattn/go-colorable. + +// +build windows + +package term + +import ( + "bytes" + "fmt" + "io" + "strconv" + "strings" + "syscall" + "unsafe" +) + +type colorWriter struct { + out io.Writer + handle syscall.Handle + lastbuf bytes.Buffer + oldattr word +} + +// NewColorWriter returns an io.Writer that writes to w and provides cross +// platform support for ANSI color codes. If w is not a terminal it is +// returned unmodified. 
+func NewColorWriter(w io.Writer) io.Writer { + if !IsConsole(w) { + return w + } + + var csbi consoleScreenBufferInfo + handle := syscall.Handle(w.(fder).Fd()) + procGetConsoleScreenBufferInfo.Call(uintptr(handle), uintptr(unsafe.Pointer(&csbi))) + + return &colorWriter{ + out: w, + handle: handle, + oldattr: csbi.attributes, + } +} + +func (w *colorWriter) Write(data []byte) (n int, err error) { + var csbi consoleScreenBufferInfo + procGetConsoleScreenBufferInfo.Call(uintptr(w.handle), uintptr(unsafe.Pointer(&csbi))) + + er := bytes.NewBuffer(data) +loop: + for { + r1, _, err := procGetConsoleScreenBufferInfo.Call(uintptr(w.handle), uintptr(unsafe.Pointer(&csbi))) + if r1 == 0 { + break loop + } + + c1, _, err := er.ReadRune() + if err != nil { + break loop + } + if c1 != 0x1b { + fmt.Fprint(w.out, string(c1)) + continue + } + c2, _, err := er.ReadRune() + if err != nil { + w.lastbuf.WriteRune(c1) + break loop + } + if c2 != 0x5b { + w.lastbuf.WriteRune(c1) + w.lastbuf.WriteRune(c2) + continue + } + + var buf bytes.Buffer + var m rune + for { + c, _, err := er.ReadRune() + if err != nil { + w.lastbuf.WriteRune(c1) + w.lastbuf.WriteRune(c2) + w.lastbuf.Write(buf.Bytes()) + break loop + } + if ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '@' { + m = c + break + } + buf.Write([]byte(string(c))) + } + + switch m { + case 'm': + attr := csbi.attributes + cs := buf.String() + if cs == "" { + procSetConsoleTextAttribute.Call(uintptr(w.handle), uintptr(w.oldattr)) + continue + } + token := strings.Split(cs, ";") + intensityMode := word(0) + for _, ns := range token { + if n, err = strconv.Atoi(ns); err == nil { + switch { + case n == 0: + attr = w.oldattr + case n == 1: + attr |= intensityMode + case 30 <= n && n <= 37: + attr = (attr & backgroundMask) + if (n-30)&1 != 0 { + attr |= foregroundRed + } + if (n-30)&2 != 0 { + attr |= foregroundGreen + } + if (n-30)&4 != 0 { + attr |= foregroundBlue + } + intensityMode = foregroundIntensity + case n == 39: // reset foreground color + attr &= backgroundMask + attr |= w.oldattr & foregroundMask + case 40 <= n && n <= 47: + attr = (attr & foregroundMask) + if (n-40)&1 != 0 { + attr |= backgroundRed + } + if (n-40)&2 != 0 { + attr |= backgroundGreen + } + if (n-40)&4 != 0 { + attr |= backgroundBlue + } + intensityMode = backgroundIntensity + case n == 49: // reset background color + attr &= foregroundMask + attr |= w.oldattr & backgroundMask + } + procSetConsoleTextAttribute.Call(uintptr(w.handle), uintptr(attr)) + } + } + } + } + return len(data) - w.lastbuf.Len(), nil +} + +var ( + procGetConsoleScreenBufferInfo = kernel32.NewProc("GetConsoleScreenBufferInfo") + procSetConsoleTextAttribute = kernel32.NewProc("SetConsoleTextAttribute") +) + +const ( + foregroundBlue = 0x1 + foregroundGreen = 0x2 + foregroundRed = 0x4 + foregroundIntensity = 0x8 + foregroundMask = (foregroundRed | foregroundBlue | foregroundGreen | foregroundIntensity) + backgroundBlue = 0x10 + backgroundGreen = 0x20 + backgroundRed = 0x40 + backgroundIntensity = 0x80 + backgroundMask = (backgroundRed | backgroundBlue | backgroundGreen | backgroundIntensity) +) + +type ( + wchar uint16 + short int16 + dword uint32 + word uint16 +) + +type coord struct { + x short + y short +} + +type smallRect struct { + left short + top short + right short + bottom short +} + +type consoleScreenBufferInfo struct { + size coord + cursorPosition coord + attributes word + window smallRect + maximumWindowSize coord +} diff --git a/vendor/github.com/go-kit/kit/log/term/term.go 
b/vendor/github.com/go-kit/kit/log/term/term.go new file mode 100644 index 0000000000..3965f1c8be --- /dev/null +++ b/vendor/github.com/go-kit/kit/log/term/term.go @@ -0,0 +1,22 @@ +// Package term provides tools for logging to a terminal. +package term + +import ( + "io" + + "github.com/go-kit/kit/log" +) + +// NewLogger returns a Logger that takes advantage of terminal features if +// possible. Log events are formatted by the Logger returned by newLogger. If +// w is a terminal each log event is colored according to the color function. +func NewLogger(w io.Writer, newLogger func(io.Writer) log.Logger, color func(keyvals ...interface{}) FgBgColor) log.Logger { + if !IsTerminal(w) { + return newLogger(w) + } + return NewColorLogger(NewColorWriter(w), newLogger, color) +} + +type fder interface { + Fd() uintptr +} diff --git a/vendor/github.com/go-kit/kit/log/term/terminal_appengine.go b/vendor/github.com/go-kit/kit/log/term/terminal_appengine.go new file mode 100644 index 0000000000..b023121ae7 --- /dev/null +++ b/vendor/github.com/go-kit/kit/log/term/terminal_appengine.go @@ -0,0 +1,15 @@ +// Based on ssh/terminal: +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build appengine + +package term + +import "io" + +// IsTerminal always returns false on AppEngine. +func IsTerminal(w io.Writer) bool { + return false +} diff --git a/vendor/github.com/go-kit/kit/log/term/terminal_darwin.go b/vendor/github.com/go-kit/kit/log/term/terminal_darwin.go new file mode 100644 index 0000000000..459cf54ab9 --- /dev/null +++ b/vendor/github.com/go-kit/kit/log/term/terminal_darwin.go @@ -0,0 +1,10 @@ +// Based on ssh/terminal: +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package term + +import "syscall" + +const ioctlReadTermios = syscall.TIOCGETA diff --git a/vendor/github.com/go-kit/kit/log/term/terminal_freebsd.go b/vendor/github.com/go-kit/kit/log/term/terminal_freebsd.go new file mode 100644 index 0000000000..791d5c69e3 --- /dev/null +++ b/vendor/github.com/go-kit/kit/log/term/terminal_freebsd.go @@ -0,0 +1,7 @@ +package term + +import ( + "syscall" +) + +const ioctlReadTermios = syscall.TIOCGETA diff --git a/vendor/github.com/go-kit/kit/log/term/terminal_linux.go b/vendor/github.com/go-kit/kit/log/term/terminal_linux.go new file mode 100644 index 0000000000..ffeab4d15c --- /dev/null +++ b/vendor/github.com/go-kit/kit/log/term/terminal_linux.go @@ -0,0 +1,12 @@ +// Based on ssh/terminal: +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine + +package term + +import "syscall" + +const ioctlReadTermios = syscall.TCGETS diff --git a/vendor/github.com/go-kit/kit/log/term/terminal_notwindows.go b/vendor/github.com/go-kit/kit/log/term/terminal_notwindows.go new file mode 100644 index 0000000000..9c72558c1c --- /dev/null +++ b/vendor/github.com/go-kit/kit/log/term/terminal_notwindows.go @@ -0,0 +1,25 @@ +// Based on ssh/terminal: +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build linux,!appengine darwin freebsd openbsd + +package term + +import ( + "io" + "syscall" + "unsafe" +) + +// IsTerminal returns true if w writes to a terminal. +func IsTerminal(w io.Writer) bool { + fw, ok := w.(fder) + if !ok { + return false + } + var termios syscall.Termios + _, _, err := syscall.Syscall6(syscall.SYS_IOCTL, fw.Fd(), ioctlReadTermios, uintptr(unsafe.Pointer(&termios)), 0, 0, 0) + return err == 0 +} diff --git a/vendor/github.com/go-kit/kit/log/term/terminal_openbsd.go b/vendor/github.com/go-kit/kit/log/term/terminal_openbsd.go new file mode 100644 index 0000000000..f9931666bd --- /dev/null +++ b/vendor/github.com/go-kit/kit/log/term/terminal_openbsd.go @@ -0,0 +1,5 @@ +package term + +import "syscall" + +const ioctlReadTermios = syscall.TIOCGETA diff --git a/vendor/github.com/go-kit/kit/log/term/terminal_windows.go b/vendor/github.com/go-kit/kit/log/term/terminal_windows.go new file mode 100644 index 0000000000..753fd12d86 --- /dev/null +++ b/vendor/github.com/go-kit/kit/log/term/terminal_windows.go @@ -0,0 +1,102 @@ +// Based on ssh/terminal: +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build windows + +package term + +import ( + "encoding/binary" + "io" + "regexp" + "syscall" + "unsafe" +) + +var kernel32 = syscall.NewLazyDLL("kernel32.dll") + +var ( + procGetFileInformationByHandleEx = kernel32.NewProc("GetFileInformationByHandleEx") + msysPipeNameRegex = regexp.MustCompile(`\\(cygwin|msys)-\w+-pty\d?-(to|from)-master`) +) + +const ( + fileNameInfo = 0x02 +) + +// IsTerminal returns true if w writes to a terminal. +func IsTerminal(w io.Writer) bool { + return IsConsole(w) || IsMSYSTerminal(w) +} + +// IsConsole returns true if w writes to a Windows console. +func IsConsole(w io.Writer) bool { + var handle syscall.Handle + + if fw, ok := w.(fder); ok { + handle = syscall.Handle(fw.Fd()) + } else { + // The writer has no file-descriptor and so can't be a terminal. + return false + } + + var st uint32 + err := syscall.GetConsoleMode(handle, &st) + + // If the handle is attached to a terminal, GetConsoleMode returns a + // non-zero value containing the console mode flags. We don't care about + // the specifics of flags, just that it is not zero. + return (err == nil && st != 0) +} + +// IsMSYSTerminal returns true if w writes to a MSYS/MSYS2 terminal. +func IsMSYSTerminal(w io.Writer) bool { + var handle syscall.Handle + + if fw, ok := w.(fder); ok { + handle = syscall.Handle(fw.Fd()) + } else { + // The writer has no file-descriptor and so can't be a terminal. + return false + } + + // MSYS(2) terminal reports as a pipe for STDIN/STDOUT/STDERR. If it isn't + // a pipe, it can't be a MSYS(2) terminal. + filetype, err := syscall.GetFileType(handle) + + if filetype != syscall.FILE_TYPE_PIPE || err != nil { + return false + } + + // MSYS2/Cygwin terminal's name looks like: \msys-dd50a72ab4668b33-pty2-to-master + data := make([]byte, 256, 256) + + r, _, e := syscall.Syscall6( + procGetFileInformationByHandleEx.Addr(), + 4, + uintptr(handle), + uintptr(fileNameInfo), + uintptr(unsafe.Pointer(&data[0])), + uintptr(len(data)), + 0, + 0, + ) + + if r != 0 && e == 0 { + // The first 4 bytes of the buffer are the size of the UTF16 name, in bytes. 
+ unameLen := binary.LittleEndian.Uint32(data[:4]) / 2 + uname := make([]uint16, unameLen, unameLen) + + for i := uint32(0); i < unameLen; i++ { + uname[i] = binary.LittleEndian.Uint16(data[i*2+4 : i*2+2+4]) + } + + name := syscall.UTF16ToString(uname) + + return msysPipeNameRegex.MatchString(name) + } + + return false +} diff --git a/vendor/github.com/prometheus/tsdb/block.go b/vendor/github.com/prometheus/tsdb/block.go index bc9f581ab7..67cd574918 100644 --- a/vendor/github.com/prometheus/tsdb/block.go +++ b/vendor/github.com/prometheus/tsdb/block.go @@ -26,14 +26,23 @@ import ( "github.com/prometheus/tsdb/labels" ) -// DiskBlock handles reads against a Block of time series data. type DiskBlock interface { + BlockReader + // Directory where block data is stored. Dir() string // Stats returns statistics about the block. Meta() BlockMeta + Delete(mint, maxt int64, m ...labels.Matcher) error + + Snapshot(dir string) error + + Close() error +} + +type BlockReader interface { // Index returns an IndexReader over the block's data. Index() IndexReader @@ -42,30 +51,6 @@ type DiskBlock interface { // Chunks returns a ChunkReader over the block's data. Chunks() ChunkReader // Tombstones returns a TombstoneReader over the block's deleted data. Tombstones() TombstoneReader - - // Delete deletes data from the block. - Delete(mint, maxt int64, ms ...labels.Matcher) error - - // Close releases all underlying resources of the block. - Close() error -} - -// Block is an interface to a DiskBlock that can also be queried. -type Block interface { - DiskBlock - Queryable - Snapshottable -} - -// headBlock is a regular block that can still be appended to. -type headBlock interface { - Block - Appendable - - // ActiveWriters returns the number of currently active appenders. - ActiveWriters() int - // HighTimestamp returns the highest currently inserted timestamp. - HighTimestamp() int64 } // Snapshottable defines an entity that can be backed up online. @@ -225,16 +210,6 @@ func (pb *persistedBlock) String() string { return pb.meta.ULID.String() } -func (pb *persistedBlock) Querier(mint, maxt int64) Querier { - return &blockQuerier{ - mint: mint, - maxt: maxt, - index: pb.Index(), - chunks: pb.Chunks(), - tombstones: pb.Tombstones(), - } -} - func (pb *persistedBlock) Dir() string { return pb.dir } func (pb *persistedBlock) Index() IndexReader { return pb.indexr } func (pb *persistedBlock) Chunks() ChunkReader { return pb.chunkr } @@ -250,7 +225,7 @@ func (pb *persistedBlock) Delete(mint, maxt int64, ms ...labels.Matcher) error { ir := pb.indexr // Choose only valid postings which have chunks in the time-range. - stones := map[uint32]intervals{} + stones := map[uint64]Intervals{} var lset labels.Labels var chks []ChunkMeta @@ -272,7 +247,7 @@ Outer: if intervalOverlap(mint, maxt, chk.MinTime, chk.MaxTime) { // Delete only until the current values and not beyond.
tmin, tmax := clampInterval(mint, maxt, chks[0].MinTime, chks[len(chks)-1].MaxTime) - stones[p.At()] = intervals{{tmin, tmax}} + stones[p.At()] = Intervals{{tmin, tmax}} continue Outer } } diff --git a/vendor/github.com/prometheus/tsdb/chunks.go b/vendor/github.com/prometheus/tsdb/chunks.go index 6bed69700f..5955c50851 100644 --- a/vendor/github.com/prometheus/tsdb/chunks.go +++ b/vendor/github.com/prometheus/tsdb/chunks.go @@ -18,7 +18,6 @@ import ( "encoding/binary" "fmt" "hash" - "hash/crc32" "io" "os" @@ -59,7 +58,7 @@ func (cm *ChunkMeta) writeHash(h hash.Hash) error { type deletedIterator struct { it chunks.Iterator - intervals intervals + intervals Intervals } func (it *deletedIterator) At() (int64, float64) { @@ -76,7 +75,7 @@ Outer: continue Outer } - if ts > tr.maxt { + if ts > tr.Maxt { it.intervals = it.intervals[1:] continue } @@ -136,7 +135,7 @@ func newChunkWriter(dir string) (*chunkWriter, error) { cw := &chunkWriter{ dirFile: dirFile, n: 0, - crc32: crc32.New(crc32.MakeTable(crc32.Castagnoli)), + crc32: newCRC32(), segmentSize: defaultChunkSegmentSize, } return cw, nil @@ -180,7 +179,7 @@ func (w *chunkWriter) cut() error { return err } - p, _, err := nextSequenceFile(w.dirFile.Name(), "") + p, _, err := nextSequenceFile(w.dirFile.Name()) if err != nil { return err } @@ -303,7 +302,7 @@ type chunkReader struct { // newChunkReader returns a new chunkReader based on mmaped files found in dir. func newChunkReader(dir string, pool chunks.Pool) (*chunkReader, error) { - files, err := sequenceFiles(dir, "") + files, err := sequenceFiles(dir) if err != nil { return nil, err } diff --git a/vendor/github.com/prometheus/tsdb/compact.go b/vendor/github.com/prometheus/tsdb/compact.go index dc803ef8f0..7d8174f0d3 100644 --- a/vendor/github.com/prometheus/tsdb/compact.go +++ b/vendor/github.com/prometheus/tsdb/compact.go @@ -14,10 +14,10 @@ package tsdb import ( - "fmt" "math/rand" "os" "path/filepath" + "runtime" "sort" "time" @@ -51,7 +51,7 @@ type Compactor interface { Plan(dir string) ([]string, error) // Write persists a Block into a directory. - Write(dest string, b Block) error + Write(dest string, b BlockReader, mint, maxt int64) error // Compact runs compaction against the provided directories. Must // only be called concurrently with results of Plan(). @@ -60,16 +60,20 @@ type Compactor interface { // LeveledCompactor implements the Compactor interface. 
type LeveledCompactor struct { - dir string - metrics *compactorMetrics - logger log.Logger - opts *LeveledCompactorOptions + dir string + metrics *compactorMetrics + logger log.Logger + ranges []int64 + chunkPool chunks.Pool } type compactorMetrics struct { - ran prometheus.Counter - failed prometheus.Counter - duration prometheus.Histogram + ran prometheus.Counter + failed prometheus.Counter + duration prometheus.Histogram + chunkSize prometheus.Histogram + chunkSamples prometheus.Histogram + chunkRange prometheus.Histogram } func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics { @@ -83,9 +87,25 @@ func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics { Name: "tsdb_compactions_failed_total", Help: "Total number of compactions that failed for the partition.", }) - m.duration = prometheus.NewSummary(prometheus.SummaryOpts{ - Name: "tsdb_compaction_duration", - Help: "Duration of compaction runs.", + m.duration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "tsdb_compaction_duration", + Help: "Duration of compaction runs.", + Buckets: prometheus.ExponentialBuckets(1, 2, 10), + }) + m.chunkSize = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "tsdb_compaction_chunk_size", + Help: "Final size of chunks on their first compaction", + Buckets: prometheus.ExponentialBuckets(32, 1.5, 12), + }) + m.chunkSamples = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "tsdb_compaction_chunk_samples", + Help: "Final number of samples on their first compaction", + Buckets: prometheus.ExponentialBuckets(4, 1.5, 12), + }) + m.chunkRange = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "tsdb_compaction_chunk_range", + Help: "Final time range of chunks on their first compaction", + Buckets: prometheus.ExponentialBuckets(100, 4, 10), }) if r != nil { @@ -93,39 +113,30 @@ func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics { m.ran, m.failed, m.duration, + m.chunkRange, + m.chunkSamples, + m.chunkSize, ) } return m } -// LeveledCompactorOptions are the options for a LeveledCompactor. -type LeveledCompactorOptions struct { - blockRanges []int64 - chunkPool chunks.Pool -} - // NewLeveledCompactor returns a LeveledCompactor. 
-func NewLeveledCompactor(r prometheus.Registerer, l log.Logger, opts *LeveledCompactorOptions) *LeveledCompactor { - if opts == nil { - opts = &LeveledCompactorOptions{ - chunkPool: chunks.NewPool(), - } +func NewLeveledCompactor(r prometheus.Registerer, l log.Logger, ranges []int64, pool chunks.Pool) (*LeveledCompactor, error) { + if len(ranges) == 0 { + return nil, errors.Errorf("at least one range must be provided") + } + if pool == nil { + pool = chunks.NewPool() } return &LeveledCompactor{ - opts: opts, - logger: l, - metrics: newCompactorMetrics(r), - } + ranges: ranges, + chunkPool: pool, + logger: l, + metrics: newCompactorMetrics(r), + }, nil } -type compactionInfo struct { - seq int - generation int - mint, maxt int64 -} - -const compactionBlocksLen = 3 - type dirMeta struct { dir string meta *BlockMeta @@ -145,21 +156,15 @@ func (c *LeveledCompactor) Plan(dir string) ([]string, error) { if err != nil { return nil, err } - if meta.Compaction.Level > 0 { - dms = append(dms, dirMeta{dir, meta}) - } + dms = append(dms, dirMeta{dir, meta}) } - sort.Slice(dms, func(i, j int) bool { - return dms[i].meta.MinTime < dms[j].meta.MinTime - }) - return c.plan(dms) } func (c *LeveledCompactor) plan(dms []dirMeta) ([]string, error) { - if len(dms) <= 1 { - return nil, nil - } + sort.Slice(dms, func(i, j int) bool { + return dms[i].meta.MinTime < dms[j].meta.MinTime + }) var res []string for _, dm := range c.selectDirs(dms) { @@ -172,11 +177,11 @@ func (c *LeveledCompactor) plan(dms []dirMeta) ([]string, error) { // Compact any blocks that have >5% tombstones. for i := len(dms) - 1; i >= 0; i-- { meta := dms[i].meta - if meta.MaxTime-meta.MinTime < c.opts.blockRanges[len(c.opts.blockRanges)/2] { + if meta.MaxTime-meta.MinTime < c.ranges[len(c.ranges)/2] { break } - if meta.Stats.NumSeries/(meta.Stats.NumTombstones+1) <= 20 { // 5% + if float64(meta.Stats.NumTombstones)/float64(meta.Stats.NumSeries+1) > 0.05 { return []string{dms[i].dir}, nil } } @@ -187,13 +192,13 @@ func (c *LeveledCompactor) plan(dms []dirMeta) ([]string, error) { // selectDirs returns the dir metas that should be compacted into a single new block. // If only a single block range is configured, the result is always nil. func (c *LeveledCompactor) selectDirs(ds []dirMeta) []dirMeta { - if len(c.opts.blockRanges) < 2 || len(ds) < 1 { + if len(c.ranges) < 2 || len(ds) < 1 { return nil } highTime := ds[len(ds)-1].meta.MinTime - for _, iv := range c.opts.blockRanges[1:] { + for _, iv := range c.ranges[1:] { parts := splitByRange(ds, iv) if len(parts) == 0 { continue @@ -258,9 +263,12 @@ func splitByRange(ds []dirMeta, tr int64) [][]dirMeta { return splitDirs } -func compactBlockMetas(blocks ...BlockMeta) (res BlockMeta) { - res.MinTime = blocks[0].MinTime - res.MaxTime = blocks[len(blocks)-1].MaxTime +func compactBlockMetas(uid ulid.ULID, blocks ...*BlockMeta) *BlockMeta { + res := &BlockMeta{ + ULID: uid, + MinTime: blocks[0].MinTime, + MaxTime: blocks[len(blocks)-1].MaxTime, + } sources := map[ulid.ULID]struct{}{} @@ -271,10 +279,6 @@ func compactBlockMetas(blocks ...BlockMeta) (res BlockMeta) { for _, s := range b.Compaction.Sources { sources[s] = struct{}{} } - // If it's an in memory block, its ULID goes into the sources. - if b.Compaction.Level == 0 { - sources[b.ULID] = struct{}{} - } } res.Compaction.Level++ @@ -291,40 +295,69 @@ func compactBlockMetas(blocks ...BlockMeta) (res BlockMeta) { // Compact creates a new block in the compactor's directory from the blocks in the // provided directories. 
func (c *LeveledCompactor) Compact(dest string, dirs ...string) (err error) { - var blocks []Block + var blocks []BlockReader + var metas []*BlockMeta for _, d := range dirs { - b, err := newPersistedBlock(d, c.opts.chunkPool) + b, err := newPersistedBlock(d, c.chunkPool) if err != nil { return err } defer b.Close() + meta, err := readMetaFile(d) + if err != nil { + return err + } + + metas = append(metas, meta) blocks = append(blocks, b) } entropy := rand.New(rand.NewSource(time.Now().UnixNano())) uid := ulid.MustNew(ulid.Now(), entropy) - return c.write(dest, uid, blocks...) + return c.write(dest, compactBlockMetas(uid, metas...), blocks...) } -func (c *LeveledCompactor) Write(dest string, b Block) error { - // Buffering blocks might have been created that often have no data. - if b.Meta().Stats.NumSeries == 0 { - return nil - } - +func (c *LeveledCompactor) Write(dest string, b BlockReader, mint, maxt int64) error { entropy := rand.New(rand.NewSource(time.Now().UnixNano())) uid := ulid.MustNew(ulid.Now(), entropy) - return c.write(dest, uid, b) + meta := &BlockMeta{ + ULID: uid, + MinTime: mint, + MaxTime: maxt, + } + meta.Compaction.Level = 1 + meta.Compaction.Sources = []ulid.ULID{uid} + + return c.write(dest, meta, b) +} + +// instrumentedChunkWriter is used for level 1 compactions to record statistics +// about compacted chunks. +type instrumentedChunkWriter struct { + ChunkWriter + + size prometheus.Histogram + samples prometheus.Histogram + trange prometheus.Histogram +} + +func (w *instrumentedChunkWriter) WriteChunks(chunks ...ChunkMeta) error { + for _, c := range chunks { + w.size.Observe(float64(len(c.Chunk.Bytes()))) + w.samples.Observe(float64(c.Chunk.NumSamples())) + w.trange.Observe(float64(c.MaxTime - c.MinTime)) + } + return w.ChunkWriter.WriteChunks(chunks...) } // write creates a new block that is the union of the provided blocks into dir. // It cleans up all files of the old blocks after completing successfully. -func (c *LeveledCompactor) write(dest string, uid ulid.ULID, blocks ...Block) (err error) { - c.logger.Log("msg", "compact blocks", "blocks", fmt.Sprintf("%v", blocks)) +func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockReader) (err error) { + c.logger.Log("msg", "compact blocks", "count", len(blocks), "mint", meta.MinTime, "maxt", meta.MaxTime) defer func(t time.Time) { if err != nil { @@ -332,9 +365,13 @@ func (c *LeveledCompactor) write(dest string, uid ulid.ULID, blocks ...Block) (e } c.metrics.ran.Inc() c.metrics.duration.Observe(time.Since(t).Seconds()) + + // We might have done quite a few allocs. Enforce a GC so they do not accumulate + // with subsequent compactions or head GCs. + runtime.GC() }(time.Now()) - dir := filepath.Join(dest, uid.String()) + dir := filepath.Join(dest, meta.ULID.String()) tmp := dir + ".tmp" if err = os.RemoveAll(tmp); err != nil { @@ -347,20 +384,30 @@ func (c *LeveledCompactor) write(dest string, uid ulid.ULID, blocks ...Block) (e // Populate chunk and index files into temporary directory with // data of all blocks. - chunkw, err := newChunkWriter(chunkDir(tmp)) + var chunkw ChunkWriter + + chunkw, err = newChunkWriter(chunkDir(tmp)) if err != nil { return errors.Wrap(err, "open chunk writer") } + // Record written chunk sizes on level 1 compactions. 
+ if meta.Compaction.Level == 1 { + chunkw = &instrumentedChunkWriter{ + ChunkWriter: chunkw, + size: c.metrics.chunkSize, + samples: c.metrics.chunkSamples, + trange: c.metrics.chunkRange, + } + } + indexw, err := newIndexWriter(tmp) if err != nil { return errors.Wrap(err, "open index writer") } - meta, err := c.populateBlock(blocks, indexw, chunkw) - if err != nil { + if err := c.populateBlock(blocks, meta, indexw, chunkw); err != nil { return errors.Wrap(err, "write compaction") } - meta.ULID = uid if err = writeMetaFile(tmp, meta); err != nil { return errors.Wrap(err, "write merged meta") @@ -398,18 +445,16 @@ func (c *LeveledCompactor) write(dest string, uid ulid.ULID, blocks ...Block) (e // populateBlock fills the index and chunk writers with new data gathered as the union // of the provided blocks. It returns meta information for the new block. -func (c *LeveledCompactor) populateBlock(blocks []Block, indexw IndexWriter, chunkw ChunkWriter) (*BlockMeta, error) { +func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta, indexw IndexWriter, chunkw ChunkWriter) error { var ( set compactionSet - metas []BlockMeta allSymbols = make(map[string]struct{}, 1<<16) ) for i, b := range blocks { - metas = append(metas, b.Meta()) symbols, err := b.Index().Symbols() if err != nil { - return nil, errors.Wrap(err, "read symbols") + return errors.Wrap(err, "read symbols") } for s := range symbols { allSymbols[s] = struct{}{} @@ -419,7 +464,7 @@ func (c *LeveledCompactor) populateBlock(blocks []Block, indexw IndexWriter, chu all, err := indexr.Postings("", "") if err != nil { - return nil, err + return err } all = indexr.SortedPostings(all) @@ -431,20 +476,19 @@ func (c *LeveledCompactor) populateBlock(blocks []Block, indexw IndexWriter, chu } set, err = newCompactionMerger(set, s) if err != nil { - return nil, err + return err } } // We fully rebuild the postings list index from merged series. var ( - postings = &memPostings{m: make(map[term][]uint32, 512)} + postings = newMemPostings() values = map[string]stringset{} - i = uint32(0) - meta = compactBlockMetas(metas...) + i = uint64(0) ) if err := indexw.AddSymbols(allSymbols); err != nil { - return nil, errors.Wrap(err, "add symbols") + return errors.Wrap(err, "add symbols") } for set.Next() { @@ -458,11 +502,11 @@ func (c *LeveledCompactor) populateBlock(blocks []Block, indexw IndexWriter, chu if len(dranges) > 0 { // Re-encode the chunk to not have deleted values. 
for _, chk := range chks { - if intervalOverlap(dranges[0].mint, dranges[len(dranges)-1].maxt, chk.MinTime, chk.MaxTime) { + if intervalOverlap(dranges[0].Mint, dranges[len(dranges)-1].Maxt, chk.MinTime, chk.MaxTime) { newChunk := chunks.NewXORChunk() app, err := newChunk.Appender() if err != nil { - return nil, err + return err } it := &deletedIterator{it: chk.Chunk.Iterator(), intervals: dranges} @@ -476,11 +520,11 @@ func (c *LeveledCompactor) populateBlock(blocks []Block, indexw IndexWriter, chu } } if err := chunkw.WriteChunks(chks...); err != nil { - return nil, err + return errors.Wrap(err, "write chunks") } if err := indexw.AddSeries(i, lset, chks...); err != nil { - return nil, errors.Wrapf(err, "add series") + return errors.Wrap(err, "add series") } meta.Stats.NumChunks += uint64(len(chks)) @@ -490,7 +534,7 @@ func (c *LeveledCompactor) populateBlock(blocks []Block, indexw IndexWriter, chu } for _, chk := range chks { - c.opts.chunkPool.Put(chk.Chunk) + c.chunkPool.Put(chk.Chunk) } for _, l := range lset { @@ -500,15 +544,13 @@ func (c *LeveledCompactor) populateBlock(blocks []Block, indexw IndexWriter, chu values[l.Name] = valset } valset.set(l.Value) - - t := term{name: l.Name, value: l.Value} - - postings.add(i, t) } + postings.add(i, lset) + i++ } if set.Err() != nil { - return nil, set.Err() + return errors.Wrap(set.Err(), "iterate compaction set") } s := make([]string, 0, 256) @@ -519,30 +561,30 @@ func (c *LeveledCompactor) populateBlock(blocks []Block, indexw IndexWriter, chu s = append(s, x) } if err := indexw.WriteLabelIndex([]string{n}, s); err != nil { - return nil, err + return errors.Wrap(err, "write label index") } } - for t := range postings.m { - if err := indexw.WritePostings(t.name, t.value, postings.get(t)); err != nil { - return nil, err + for l := range postings.m { + if err := indexw.WritePostings(l.Name, l.Value, postings.get(l.Name, l.Value)); err != nil { + return errors.Wrap(err, "write postings") } } // Write a postings list containing all series. 
- all := make([]uint32, i) + all := make([]uint64, i) for i := range all { - all[i] = uint32(i) + all[i] = uint64(i) } if err := indexw.WritePostings("", "", newListPostings(all)); err != nil { - return nil, err + return errors.Wrap(err, "write 'all' postings") } - return &meta, nil + return nil } type compactionSet interface { Next() bool - At() (labels.Labels, []ChunkMeta, intervals) + At() (labels.Labels, []ChunkMeta, Intervals) Err() error } @@ -555,7 +597,7 @@ type compactionSeriesSet struct { l labels.Labels c []ChunkMeta - intervals intervals + intervals Intervals err error } @@ -572,9 +614,12 @@ func (c *compactionSeriesSet) Next() bool { if !c.p.Next() { return false } + var err error + c.intervals = c.tombstones.Get(c.p.At()) - if c.err = c.index.Series(c.p.At(), &c.l, &c.c); c.err != nil { + if err = c.index.Series(c.p.At(), &c.l, &c.c); err != nil { + c.err = errors.Wrapf(err, "get series %d", c.p.At()) return false } @@ -582,7 +627,7 @@ func (c *compactionSeriesSet) Next() bool { if len(c.intervals) > 0 { chks := make([]ChunkMeta, 0, len(c.c)) for _, chk := range c.c { - if !(interval{chk.MinTime, chk.MaxTime}.isSubrange(c.intervals)) { + if !(Interval{chk.MinTime, chk.MaxTime}.isSubrange(c.intervals)) { chks = append(chks, chk) } } @@ -593,8 +638,9 @@ func (c *compactionSeriesSet) Next() bool { for i := range c.c { chk := &c.c[i] - chk.Chunk, c.err = c.chunks.Chunk(chk.Ref) - if c.err != nil { + chk.Chunk, err = c.chunks.Chunk(chk.Ref) + if err != nil { + c.err = errors.Wrapf(err, "chunk %d not found", chk.Ref) return false } } @@ -609,7 +655,7 @@ func (c *compactionSeriesSet) Err() error { return c.p.Err() } -func (c *compactionSeriesSet) At() (labels.Labels, []ChunkMeta, intervals) { +func (c *compactionSeriesSet) At() (labels.Labels, []ChunkMeta, Intervals) { return c.l, c.c, c.intervals } @@ -619,7 +665,7 @@ type compactionMerger struct { aok, bok bool l labels.Labels c []ChunkMeta - intervals intervals + intervals Intervals } type compactionSeries struct { @@ -700,7 +746,7 @@ func (c *compactionMerger) Err() error { return c.b.Err() } -func (c *compactionMerger) At() (labels.Labels, []ChunkMeta, intervals) { +func (c *compactionMerger) At() (labels.Labels, []ChunkMeta, Intervals) { return c.l, c.c, c.intervals } diff --git a/vendor/github.com/prometheus/tsdb/db.go b/vendor/github.com/prometheus/tsdb/db.go index 8d581cdfa5..ad034d8b02 100644 --- a/vendor/github.com/prometheus/tsdb/db.go +++ b/vendor/github.com/prometheus/tsdb/db.go @@ -21,10 +21,8 @@ import ( "io/ioutil" "os" "path/filepath" - "runtime" "sort" "strconv" - "strings" "sync" "time" "unsafe" @@ -77,11 +75,11 @@ type Appender interface { // to AddFast() at any point. Adding the sample via Add() returns a new // reference number. // If the reference is the empty string it must not be used for caching. - Add(l labels.Labels, t int64, v float64) (string, error) + Add(l labels.Labels, t int64, v float64) (uint64, error) // Add adds a sample pair for the referenced series. It is generally faster // than adding a sample by providing its full label set. - AddFast(ref string, t int64, v float64) error + AddFast(ref uint64, t int64, v float64) error // Commit submits the collected samples and purges the batch. Commit() error @@ -100,18 +98,14 @@ type DB struct { metrics *dbMetrics opts *Options chunkPool chunks.Pool + compactor Compactor + wal WAL // Mutex for that must be held when modifying the general block layout. 
mtx sync.RWMutex - blocks []Block + blocks []DiskBlock - // Mutex that must be held when modifying just the head blocks - // or the general layout. - // mtx must be held before acquiring. - headmtx sync.RWMutex - heads []headBlock - - compactor Compactor + head *Head compactc chan struct{} donec chan struct{} @@ -123,22 +117,15 @@ type DB struct { } type dbMetrics struct { - activeAppenders prometheus.Gauge loadedBlocks prometheus.GaugeFunc reloads prometheus.Counter reloadsFailed prometheus.Counter - reloadDuration prometheus.Summary - samplesAppended prometheus.Counter compactionsTriggered prometheus.Counter } func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics { m := &dbMetrics{} - m.activeAppenders = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: "tsdb_active_appenders", - Help: "Number of currently active appender transactions", - }) m.loadedBlocks = prometheus.NewGaugeFunc(prometheus.GaugeOpts{ Name: "tsdb_blocks_loaded", Help: "Number of currently loaded data blocks", @@ -155,14 +142,6 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics { Name: "tsdb_reloads_failures_total", Help: "Number of times the database failed to reload block data from disk.", }) - m.reloadDuration = prometheus.NewSummary(prometheus.SummaryOpts{ - Name: "tsdb_reload_duration_seconds", - Help: "Duration of block reloads.", - }) - m.samplesAppended = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "tsdb_samples_appended_total", - Help: "Total number of appended sampledb.", - }) m.compactionsTriggered = prometheus.NewCounter(prometheus.CounterOpts{ Name: "tsdb_compactions_triggered_total", Help: "Total number of triggered compactions for the partition.", @@ -170,12 +149,9 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics { if r != nil { r.MustRegister( - m.activeAppenders, m.loadedBlocks, m.reloads, m.reloadsFailed, - m.reloadDuration, - m.samplesAppended, m.compactionsTriggered, ) } @@ -187,12 +163,10 @@ func Open(dir string, l log.Logger, r prometheus.Registerer, opts *Options) (db if err := os.MkdirAll(dir, 0777); err != nil { return nil, err } - if l == nil { l = log.NewLogfmtLogger(os.Stdout) l = log.With(l, "ts", log.DefaultTimestampUTC, "caller", log.DefaultCaller) } - if opts == nil { opts = DefaultOptions } @@ -224,29 +198,26 @@ func Open(dir string, l log.Logger, r prometheus.Registerer, opts *Options) (db db.lockf = &lockf } - copts := &LeveledCompactorOptions{ - blockRanges: opts.BlockRanges, - chunkPool: db.chunkPool, + db.compactor, err = NewLeveledCompactor(r, l, opts.BlockRanges, db.chunkPool) + if err != nil { + return nil, errors.Wrap(err, "create leveled compactor") } - if len(copts.blockRanges) == 0 { - return nil, errors.New("at least one block-range must exist") - } - - for float64(copts.blockRanges[len(copts.blockRanges)-1])/float64(opts.RetentionDuration) > 0.2 { - if len(copts.blockRanges) == 1 { - break - } - - // Max overflow is restricted to 20%.
- copts.blockRanges = copts.blockRanges[:len(copts.blockRanges)-1] - } - - db.compactor = NewLeveledCompactor(r, l, copts) - - if err := db.reloadBlocks(); err != nil { + wal, err := OpenSegmentWAL(filepath.Join(dir, "wal"), l, 10*time.Second) + if err != nil { return nil, err } + db.head, err = NewHead(r, l, wal, opts.BlockRanges[0]) + if err != nil { + return nil, err + } + if err := db.reload(); err != nil { + return nil, err + } + if err := db.head.ReadWAL(); err != nil { + return nil, errors.Wrap(err, "read WAL") + } + go db.run() return db, nil @@ -260,12 +231,17 @@ func (db *DB) Dir() string { func (db *DB) run() { defer close(db.donec) - tick := time.NewTicker(30 * time.Second) - defer tick.Stop() + backoff := time.Duration(0) for { select { - case <-tick.C: + case <-db.stopc: + return + case <-time.After(backoff): + } + + select { + case <-time.After(1 * time.Minute): select { case db.compactc <- struct{}{}: default: @@ -273,20 +249,20 @@ func (db *DB) run() { case <-db.compactc: db.metrics.compactionsTriggered.Inc() - changes1, err := db.retentionCutoff() - if err != nil { - db.logger.Log("msg", "retention cutoff failed", "err", err) + _, err1 := db.retentionCutoff() + if err1 != nil { + db.logger.Log("msg", "retention cutoff failed", "err", err1) } - changes2, err := db.compact() - if err != nil { - db.logger.Log("msg", "compaction failed", "err", err) + _, err2 := db.compact() + if err2 != nil { + db.logger.Log("msg", "compaction failed", "err", err2) } - if changes1 || changes2 { - if err := db.reloadBlocks(); err != nil { - db.logger.Log("msg", "reloading blocks failed", "err", err) - } + if err1 != nil || err2 != nil { + backoff = exponential(backoff, 1*time.Second, 1*time.Minute) + } else { + backoff = 0 } case <-db.stopc: @@ -303,74 +279,40 @@ func (db *DB) retentionCutoff() (bool, error) { db.mtx.RLock() defer db.mtx.RUnlock() - // We only consider the already persisted blocks. Head blocks generally - // only account for a fraction of the total data. - db.headmtx.RLock() - lenp := len(db.blocks) - len(db.heads) - db.headmtx.RUnlock() - - if lenp == 0 { + if len(db.blocks) == 0 { return false, nil } - last := db.blocks[lenp-1] + last := db.blocks[len(db.blocks)-1] mint := last.Meta().MaxTime - int64(db.opts.RetentionDuration) return retentionCutoff(db.dir, mint) } -// headFullness returns up to which fraction of a blocks time range samples -// were already inserted. -func headFullness(h headBlock) float64 { - m := h.Meta() - a := float64(h.HighTimestamp() - m.MinTime) - b := float64(m.MaxTime - m.MinTime) - return a / b +// Appender opens a new appender against the database. +func (db *DB) Appender() Appender { + return dbAppender{db: db, Appender: db.head.Appender()} } -// appendableHeads returns a copy of a slice of HeadBlocks that can still be appended to. -func (db *DB) appendableHeads() (r []headBlock) { - switch l := len(db.heads); l { - case 0: - case 1: - r = append(r, db.heads[0]) - default: - if headFullness(db.heads[l-1]) < 0.5 { - r = append(r, db.heads[l-2]) - } - r = append(r, db.heads[l-1]) - } - return r +// dbAppender wraps the DB's head appender and triggers compactions on commit +// if necessary. 
+type dbAppender struct { + Appender + db *DB } -func (db *DB) completedHeads() (r []headBlock) { - db.mtx.RLock() - defer db.mtx.RUnlock() +func (a dbAppender) Commit() error { + err := a.Appender.Commit() - db.headmtx.RLock() - defer db.headmtx.RUnlock() - - if len(db.heads) < 2 { - return nil - } - - // Select all old heads unless they still have pending appenders. - for _, h := range db.heads[:len(db.heads)-2] { - if h.ActiveWriters() > 0 { - return r + // We could just run this check every few minutes practically. But for benchmarks + // and high frequency use cases this is the safer way. + if a.db.head.MaxTime()-a.db.head.MinTime() > a.db.head.chunkRange/2*3 { + select { + case a.db.compactc <- struct{}{}: + default: } - r = append(r, h) } - // Add the 2nd last head if the last head is more than 50% filled. - // Compacting it early allows us to free its memory before allocating - // more for the next block and thus reduces spikes. - h0 := db.heads[len(db.heads)-1] - h1 := db.heads[len(db.heads)-2] - - if headFullness(h0) >= 0.5 && h1.ActiveWriters() == 0 { - r = append(r, h1) - } - return r + return err } func (db *DB) compact() (changes bool, err error) { @@ -383,22 +325,33 @@ func (db *DB) compact() (changes bool, err error) { // Check whether we have pending head blocks that are ready to be persisted. // They have the highest priority. - for _, h := range db.completedHeads() { + for { select { case <-db.stopc: return changes, nil default: } + // The head has a compactable range if 1.5 level 0 ranges are between the oldest + // and newest timestamp. The 0.5 acts as a buffer of the appendable window. + if db.head.MaxTime()-db.head.MinTime() <= db.opts.BlockRanges[0]/2*3 { + break + } + mint, maxt := rangeForTimestamp(db.head.MinTime(), db.opts.BlockRanges[0]) - if err = db.compactor.Write(db.dir, h); err != nil { + // Wrap head into a range that bounds all reads to it. + head := &rangeHead{ + head: db.head, + mint: mint, + maxt: maxt, + } + if err = db.compactor.Write(db.dir, head, mint, maxt); err != nil { return changes, errors.Wrap(err, "persist head block") } changes = true - if err := os.RemoveAll(h.Dir()); err != nil { - return changes, errors.Wrap(err, "delete compacted head block") + if err := db.reload(); err != nil { + return changes, errors.Wrap(err, "reload blocks") } - runtime.GC() } // Check for compactions of multiple blocks. @@ -427,7 +380,10 @@ func (db *DB) compact() (changes bool, err error) { return changes, errors.Wrap(err, "delete compacted block") } } - runtime.GC() + + if err := db.reload(); err != nil { + return changes, errors.Wrap(err, "reload blocks") + } } return changes, nil @@ -469,7 +425,7 @@ func retentionCutoff(dir string, mint int64) (bool, error) { return changes, fileutil.Fsync(df) } -func (db *DB) getBlock(id ulid.ULID) (Block, bool) { +func (db *DB) getBlock(id ulid.ULID) (DiskBlock, bool) { for _, b := range db.blocks { if b.Meta().ULID == id { return b, true @@ -478,30 +434,23 @@ func (db *DB) getBlock(id ulid.ULID) (Block, bool) { return nil, false } -func (db *DB) reloadBlocks() (err error) { - defer func(t time.Time) { +func (db *DB) reload() (err error) { + defer func() { if err != nil { db.metrics.reloadsFailed.Inc() } db.metrics.reloads.Inc() - db.metrics.reloadDuration.Observe(time.Since(t).Seconds()) - }(time.Now()) + }() var cs []io.Closer defer func() { closeAll(cs...) 
}() - db.mtx.Lock() - defer db.mtx.Unlock() - - db.headmtx.Lock() - defer db.headmtx.Unlock() - dirs, err := blockDirs(db.dir) if err != nil { return errors.Wrap(err, "find blocks") } var ( - blocks []Block + blocks []DiskBlock exist = map[ulid.ULID]struct{}{} ) @@ -513,11 +462,7 @@ func (db *DB) reloadBlocks() (err error) { b, ok := db.getBlock(meta.ULID) if !ok { - if meta.Compaction.Level == 0 { - b, err = db.openHeadBlock(dir) - } else { - b, err = newPersistedBlock(dir, db.chunkPool) - } + b, err = newPersistedBlock(dir, db.chunkPool) if err != nil { return errors.Wrapf(err, "open block %s", dir) } @@ -532,25 +477,29 @@ func (db *DB) reloadBlocks() (err error) { } // Close all opened blocks that no longer exist after we returned all locks. + // TODO(fabxc: probably races with querier still reading from them. Can + // we just abandon them and have the open FDs be GC'd automatically eventually? for _, b := range db.blocks { if _, ok := exist[b.Meta().ULID]; !ok { cs = append(cs, b) } } + db.mtx.Lock() db.blocks = blocks - db.heads = nil + db.mtx.Unlock() - for _, b := range blocks { - if b.Meta().Compaction.Level == 0 { - db.heads = append(db.heads, b.(*HeadBlock)) - } + // Garbage collect data in the head if the most recent persisted block + // covers data of its current time range. + if len(blocks) == 0 { + return nil } + maxt := blocks[len(db.blocks)-1].Meta().MaxTime - return nil + return errors.Wrap(db.head.Truncate(maxt), "head truncate failed") } -func validateBlockSequence(bs []Block) error { +func validateBlockSequence(bs []DiskBlock) error { if len(bs) == 0 { return nil } @@ -584,10 +533,10 @@ func (db *DB) Close() error { var merr MultiError merr.Add(g.Wait()) + if db.lockf != nil { merr.Add(db.lockf.Unlock()) } - return merr.Err() } @@ -614,125 +563,48 @@ func (db *DB) Snapshot(dir string) error { if dir == db.dir { return errors.Errorf("cannot snapshot into base directory") } + if _, err := ulid.Parse(dir); err == nil { + return errors.Errorf("dir must not be a valid ULID") + } + db.cmtx.Lock() defer db.cmtx.Unlock() - db.mtx.Lock() // To block any appenders. - defer db.mtx.Unlock() + db.mtx.RLock() + defer db.mtx.RUnlock() - blocks := db.blocks[:] - for _, b := range blocks { + for _, b := range db.blocks { db.logger.Log("msg", "snapshotting block", "block", b) + if err := b.Snapshot(dir); err != nil { return errors.Wrap(err, "error snapshotting headblock") } } - return nil + return db.compactor.Write(dir, db.head, db.head.MinTime(), db.head.MaxTime()) } -// Appender returns a new Appender on the database. -func (db *DB) Appender() Appender { - db.metrics.activeAppenders.Inc() - +// Querier returns a new querier over the data partition for the given time range. +// A goroutine must not handle more than one open Querier. 
+func (db *DB) Querier(mint, maxt int64) Querier { db.mtx.RLock() - return &dbAppender{db: db} -} -type dbAppender struct { - db *DB - heads []*metaAppender + blocks := db.blocksForInterval(mint, maxt) - samples int -} - -type metaAppender struct { - meta BlockMeta - app Appender -} - -func (a *dbAppender) Add(lset labels.Labels, t int64, v float64) (string, error) { - h, err := a.appenderAt(t) - if err != nil { - return "", err + sq := &querier{ + blocks: make([]Querier, 0, len(blocks)), + db: db, } - ref, err := h.app.Add(lset, t, v) - if err != nil { - return "", err + for _, b := range blocks { + sq.blocks = append(sq.blocks, &blockQuerier{ + mint: mint, + maxt: maxt, + index: b.Index(), + chunks: b.Chunks(), + tombstones: b.Tombstones(), + }) } - a.samples++ - - if ref == "" { - return "", nil - } - return string(append(h.meta.ULID[:], ref...)), nil -} - -func (a *dbAppender) AddFast(ref string, t int64, v float64) error { - if len(ref) < 16 { - return errors.Wrap(ErrNotFound, "invalid ref length") - } - // The first 16 bytes a ref hold the ULID of the head block. - h, err := a.appenderAt(t) - if err != nil { - return err - } - // Validate the ref points to the same block we got for t. - if string(h.meta.ULID[:]) != ref[:16] { - return ErrNotFound - } - if err := h.app.AddFast(ref[16:], t, v); err != nil { - // The block the ref points to might fit the given timestamp. - // We mask the error to stick with our contract. - if errors.Cause(err) == ErrOutOfBounds { - err = ErrNotFound - } - return err - } - - a.samples++ - return nil -} - -// appenderFor gets the appender for the head containing timestamp t. -// If the head block doesn't exist yet, it gets created. -func (a *dbAppender) appenderAt(t int64) (*metaAppender, error) { - for _, h := range a.heads { - if intervalContains(h.meta.MinTime, h.meta.MaxTime-1, t) { - return h, nil - } - } - // Currently opened appenders do not cover t. Ensure the head block is - // created and add missing appenders. - a.db.headmtx.Lock() - - if err := a.db.ensureHead(t); err != nil { - a.db.headmtx.Unlock() - return nil, err - } - - var hb headBlock - for _, h := range a.db.appendableHeads() { - m := h.Meta() - - if intervalContains(m.MinTime, m.MaxTime-1, t) { - hb = h - break - } - } - a.db.headmtx.Unlock() - - if hb == nil { - return nil, ErrOutOfBounds - } - // Instantiate appender after returning headmtx! - app := &metaAppender{ - meta: hb.Meta(), - app: hb.Appender(), - } - a.heads = append(a.heads, app) - - return app, nil + return sq } func rangeForTimestamp(t int64, width int64) (mint, maxt int64) { @@ -740,87 +612,7 @@ func rangeForTimestamp(t int64, width int64) (mint, maxt int64) { return mint, mint + width } -// ensureHead makes sure that there is a head block for the timestamp t if -// it is within or after the currently appendable window. -func (db *DB) ensureHead(t int64) error { - var ( - mint, maxt = rangeForTimestamp(t, int64(db.opts.BlockRanges[0])) - addBuffer = len(db.blocks) == 0 - last BlockMeta - ) - - if !addBuffer { - last = db.blocks[len(db.blocks)-1].Meta() - addBuffer = last.MaxTime <= mint-int64(db.opts.BlockRanges[0]) - } - // Create another block of buffer in front if the DB is initialized or retrieving - // new data after a long gap. - // This ensures we always have a full block width of append window. - if addBuffer { - if _, err := db.createHeadBlock(mint-int64(db.opts.BlockRanges[0]), mint); err != nil { - return err - } - // If the previous block reaches into our new window, make it smaller. 
- } else if mt := last.MaxTime; mt > mint { - mint = mt - } - if mint >= maxt { - return nil - } - // Error if the requested time for a head is before the appendable window. - if len(db.heads) > 0 && t < db.heads[0].Meta().MinTime { - return ErrOutOfBounds - } - - _, err := db.createHeadBlock(mint, maxt) - return err -} - -func (a *dbAppender) Commit() error { - defer a.db.metrics.activeAppenders.Dec() - defer a.db.mtx.RUnlock() - - // Commits to partial appenders must be concurrent as concurrent appenders - // may have conflicting locks on head appenders. - // For high-throughput use cases the errgroup causes significant blocking. Typically, - // we just deal with a single appender and special case it. - var err error - - switch len(a.heads) { - case 1: - err = a.heads[0].app.Commit() - default: - var g errgroup.Group - for _, h := range a.heads { - g.Go(h.app.Commit) - } - err = g.Wait() - } - - if err != nil { - return err - } - // XXX(fabxc): Push the metric down into head block to account properly - // for partial appends? - a.db.metrics.samplesAppended.Add(float64(a.samples)) - - return nil -} - -func (a *dbAppender) Rollback() error { - defer a.db.metrics.activeAppenders.Dec() - defer a.db.mtx.RUnlock() - - var g errgroup.Group - - for _, h := range a.heads { - g.Go(h.app.Rollback) - } - - return g.Wait() -} - -// Delete implements deletion of metrics. +// Delete implements deletion of metrics. It only has atomicity guarantees on a per-block basis. func (db *DB) Delete(mint, maxt int64, ms ...labels.Matcher) error { db.cmtx.Lock() defer db.cmtx.Unlock() @@ -828,16 +620,21 @@ func (db *DB) Delete(mint, maxt int64, ms ...labels.Matcher) error { db.mtx.Lock() defer db.mtx.Unlock() - blocks := db.blocksForInterval(mint, maxt) - var g errgroup.Group - for _, b := range blocks { - g.Go(func(b Block) func() error { - return func() error { return b.Delete(mint, maxt, ms...) } - }(b)) + for _, b := range db.blocks { + m := b.Meta() + if intervalOverlap(mint, maxt, m.MinTime, m.MaxTime) { + g.Go(func(b DiskBlock) func() error { + return func() error { return b.Delete(mint, maxt, ms...) } + }(b)) + } } + g.Go(func() error { + return db.head.Delete(mint, maxt, ms...) + }) + if err := g.Wait(); err != nil { return err } @@ -856,8 +653,8 @@ func intervalContains(min, max, t int64) bool { // blocksForInterval returns all blocks within the partition that may contain // data for the given time range. -func (db *DB) blocksForInterval(mint, maxt int64) []Block { - var bs []Block +func (db *DB) blocksForInterval(mint, maxt int64) []BlockReader { + var bs []BlockReader for _, b := range db.blocks { m := b.Meta() @@ -865,52 +662,13 @@ func (db *DB) blocksForInterval(mint, maxt int64) []Block { bs = append(bs, b) } } + if maxt >= db.head.MinTime() { + bs = append(bs, db.head) + } return bs } -// openHeadBlock opens the head block at dir. -func (db *DB) openHeadBlock(dir string) (*HeadBlock, error) { - var ( - wdir = walDir(dir) - l = log.With(db.logger, "wal", wdir) - ) - wal, err := OpenSegmentWAL(wdir, l, 5*time.Second) - if err != nil { - return nil, errors.Wrap(err, "open WAL %s") - } - - h, err := OpenHeadBlock(dir, log.With(db.logger, "block", dir), wal, db.compactor) - if err != nil { - return nil, errors.Wrapf(err, "open head block %s", dir) - } - return h, nil -} - -// createHeadBlock starts a new head block to append to. 
-func (db *DB) createHeadBlock(mint, maxt int64) (headBlock, error) { - dir, err := TouchHeadBlock(db.dir, mint, maxt) - if err != nil { - return nil, errors.Wrapf(err, "touch head block %s", dir) - } - newHead, err := db.openHeadBlock(dir) - if err != nil { - return nil, err - } - - db.logger.Log("msg", "created head block", "ulid", newHead.meta.ULID, "mint", mint, "maxt", maxt) - - db.blocks = append(db.blocks, newHead) // TODO(fabxc): this is a race! - db.heads = append(db.heads, newHead) - - select { - case db.compactc <- struct{}{}: - default: - } - - return newHead, nil -} - func isBlockDir(fi os.FileInfo) bool { if !fi.IsDir() { return false @@ -934,7 +692,7 @@ func blockDirs(dir string) ([]string, error) { return dirs, nil } -func sequenceFiles(dir, prefix string) ([]string, error) { +func sequenceFiles(dir string) ([]string, error) { files, err := ioutil.ReadDir(dir) if err != nil { return nil, err @@ -942,24 +700,15 @@ func sequenceFiles(dir, prefix string) ([]string, error) { var res []string for _, fi := range files { - if isSequenceFile(fi, prefix) { - res = append(res, filepath.Join(dir, fi.Name())) + if _, err := strconv.ParseUint(fi.Name(), 10, 64); err != nil { + continue } + res = append(res, filepath.Join(dir, fi.Name())) } return res, nil } -func isSequenceFile(fi os.FileInfo, prefix string) bool { - if !strings.HasPrefix(fi.Name(), prefix) { - return false - } - if _, err := strconv.ParseUint(fi.Name()[len(prefix):], 10, 32); err != nil { - return false - } - return true -} - -func nextSequenceFile(dir, prefix string) (string, int, error) { +func nextSequenceFile(dir string) (string, int, error) { names, err := fileutil.ReadDir(dir) if err != nil { return "", 0, err @@ -967,16 +716,13 @@ func nextSequenceFile(dir, prefix string) (string, int, error) { i := uint64(0) for _, n := range names { - if !strings.HasPrefix(n, prefix) { - continue - } - j, err := strconv.ParseUint(n[len(prefix):], 10, 32) + j, err := strconv.ParseUint(n, 10, 64) if err != nil { continue } i = j } - return filepath.Join(dir, fmt.Sprintf("%s%0.6d", prefix, i+1)), int(i + 1), nil + return filepath.Join(dir, fmt.Sprintf("%0.6d", i+1)), int(i + 1), nil } // The MultiError type implements the error interface, and contains the @@ -1032,3 +778,14 @@ func closeAll(cs ...io.Closer) error { } return merr.Err() } + +func exponential(d, min, max time.Duration) time.Duration { + d *= 2 + if d < min { + d = min + } + if d > max { + d = max + } + return d +} diff --git a/vendor/github.com/prometheus/tsdb/encoding_helpers.go b/vendor/github.com/prometheus/tsdb/encoding_helpers.go index 25ff32d00b..9aa4ba4097 100644 --- a/vendor/github.com/prometheus/tsdb/encoding_helpers.go +++ b/vendor/github.com/prometheus/tsdb/encoding_helpers.go @@ -86,7 +86,7 @@ func (d *decbuf) uvarintStr() string { d.e = errInvalidSize return "" } - s := yoloString(d.b[:l]) + s := string(d.b[:l]) d.b = d.b[l:] return s } diff --git a/vendor/github.com/prometheus/tsdb/head.go b/vendor/github.com/prometheus/tsdb/head.go index 045378d9cb..a74552bcaf 100644 --- a/vendor/github.com/prometheus/tsdb/head.go +++ b/vendor/github.com/prometheus/tsdb/head.go @@ -14,21 +14,16 @@ package tsdb import ( - "fmt" "math" - "math/rand" - "os" - "path/filepath" + "runtime" "sort" "sync" "sync/atomic" "time" - "encoding/binary" - "github.com/go-kit/kit/log" - "github.com/oklog/ulid" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/tsdb/chunks" "github.com/prometheus/tsdb/labels" ) @@ -50,107 +45,171 @@ var ( 
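The new exponential helper above doubles the given duration and clamps it to [min, max]; starting from zero therefore yields min on the first call. A small illustrative sketch with hypothetical values:

	// Backoff for retries: 100ms, 200ms, 400ms, 800ms, 1.6s, then capped at 3s.
	d := time.Duration(0)
	for i := 0; i < 6; i++ {
		d = exponential(d, 100*time.Millisecond, 3*time.Second)
		fmt.Println(d)
	}
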
ErrOutOfBounds = errors.New("out of bounds") ) -// HeadBlock handles reads and writes of time series data within a time window. -type HeadBlock struct { - mtx sync.RWMutex - dir string - wal WAL - compactor Compactor +// Head handles reads and writes of time series data within a time window. +type Head struct { + chunkRange int64 + metrics *headMetrics + wal WAL + logger log.Logger + appendPool sync.Pool - activeWriters uint64 - highTimestamp int64 - closed bool + minTime, maxTime int64 + lastSeriesID uint64 - // descs holds all chunk descs for the head block. Each chunk implicitly - // is assigned the index as its ID. - series []*memSeries - // hashes contains a collision map of label set hashes of chunks - // to their chunk descs. - hashes map[uint64][]*memSeries + // All series addressable by their ID or hash. + series *stripeSeries - symbols map[string]struct{} - values map[string]stringset // label names to possible values - postings *memPostings // postings lists for terms + symMtx sync.RWMutex + symbols map[string]struct{} + values map[string]stringset // label names to possible values + + postings *memPostings // postings lists for terms tombstones tombstoneReader - - meta BlockMeta } -// TouchHeadBlock atomically touches a new head block in dir for -// samples in the range [mint,maxt). -func TouchHeadBlock(dir string, mint, maxt int64) (string, error) { - entropy := rand.New(rand.NewSource(time.Now().UnixNano())) - - ulid, err := ulid.New(ulid.Now(), entropy) - if err != nil { - return "", err - } - - // Make head block creation appear atomic. - dir = filepath.Join(dir, ulid.String()) - tmp := dir + ".tmp" - - if err := os.MkdirAll(tmp, 0777); err != nil { - return "", err - } - - if err := writeMetaFile(tmp, &BlockMeta{ - ULID: ulid, - MinTime: mint, - MaxTime: maxt, - }); err != nil { - return "", err - } - - return dir, renameFile(tmp, dir) +type headMetrics struct { + activeAppenders prometheus.Gauge + series prometheus.Gauge + seriesCreated prometheus.Counter + seriesRemoved prometheus.Counter + chunks prometheus.Gauge + chunksCreated prometheus.Gauge + chunksRemoved prometheus.Gauge + gcDuration prometheus.Summary + minTime prometheus.GaugeFunc + maxTime prometheus.GaugeFunc + samplesAppended prometheus.Counter + walTruncateDuration prometheus.Summary } -// OpenHeadBlock opens the head block in dir. 
-func OpenHeadBlock(dir string, l log.Logger, wal WAL, c Compactor) (*HeadBlock, error) {
-	meta, err := readMetaFile(dir)
-	if err != nil {
-		return nil, err
-	}
+func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
+	m := &headMetrics{}
 
-	h := &HeadBlock{
-		dir:        dir,
+	m.activeAppenders = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "tsdb_head_active_appenders",
+		Help: "Number of currently active appender transactions",
+	})
+	m.series = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "tsdb_head_series",
+		Help: "Total number of series in the head block.",
+	})
+	m.seriesCreated = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "tsdb_head_series_created_total",
+		Help: "Total number of series created in the head",
+	})
+	m.seriesRemoved = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "tsdb_head_series_removed_total",
+		Help: "Total number of series removed in the head",
+	})
+	m.chunks = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "tsdb_head_chunks",
+		Help: "Total number of chunks in the head block.",
+	})
+	m.chunksCreated = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "tsdb_head_chunks_created_total",
+		Help: "Total number of chunks created in the head",
+	})
+	m.chunksRemoved = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "tsdb_head_chunks_removed_total",
+		Help: "Total number of chunks removed in the head",
+	})
+	m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{
+		Name: "tsdb_head_gc_duration_seconds",
+		Help: "Runtime of garbage collection in the head block.",
+	})
+	m.minTime = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
+		Name: "tsdb_head_min_time",
+		Help: "Minimum time bound of the head block.",
+	}, func() float64 {
+		return float64(h.MinTime())
+	})
+	m.maxTime = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
+		Name: "tsdb_head_max_time",
+		Help: "Maximum timestamp of the head block.",
+	}, func() float64 {
+		return float64(h.MaxTime())
+	})
+	m.walTruncateDuration = prometheus.NewSummary(prometheus.SummaryOpts{
+		Name: "tsdb_wal_truncate_duration_seconds",
+		Help: "Duration of WAL truncation.",
+	})
+	m.samplesAppended = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "tsdb_head_samples_appended_total",
+		Help: "Total number of appended samples.",
+	})
+
+	if r != nil {
+		r.MustRegister(
+			m.activeAppenders,
+			m.chunks,
+			m.chunksCreated,
+			m.chunksRemoved,
+			m.series,
+			m.seriesCreated,
+			m.seriesRemoved,
+			m.minTime,
+			m.maxTime,
+			m.gcDuration,
+			m.walTruncateDuration,
+			m.samplesAppended,
+		)
+	}
+	return m
+}
+
+// NewHead opens the head block.
+func NewHead(r prometheus.Registerer, l log.Logger, wal WAL, chunkRange int64) (*Head, error) {
+	if l == nil {
+		l = log.NewNopLogger()
+	}
+	if wal == nil {
+		wal = NopWAL()
+	}
+	if chunkRange < 1 {
+		return nil, errors.Errorf("invalid chunk range %d", chunkRange)
+	}
+	h := &Head{
 		wal:        wal,
-		compactor:  c,
-		series:     []*memSeries{nil}, // 0 is not a valid posting, filled with nil.
- hashes: map[uint64][]*memSeries{}, + logger: l, + chunkRange: chunkRange, + minTime: math.MinInt64, + maxTime: math.MinInt64, + series: newStripeSeries(), values: map[string]stringset{}, symbols: map[string]struct{}{}, - postings: &memPostings{m: make(map[term][]uint32)}, - meta: *meta, + postings: newMemPostings(), tombstones: newEmptyTombstoneReader(), } - return h, h.init() + h.metrics = newHeadMetrics(h, r) + + return h, nil } -func (h *HeadBlock) init() error { +func (h *Head) ReadWAL() error { r := h.wal.Reader() + mint := h.MinTime() - seriesFunc := func(series []labels.Labels) error { - for _, lset := range series { - h.create(lset.Hash(), lset) - h.meta.Stats.NumSeries++ + seriesFunc := func(series []RefSeries) error { + for _, s := range series { + h.create(s.Labels.Hash(), s.Labels) } - return nil } samplesFunc := func(samples []RefSample) error { for _, s := range samples { - if int(s.Ref) >= len(h.series) { - return errors.Errorf("unknown series reference %d (max %d); abort WAL restore", - s.Ref, len(h.series)) + if s.T < mint { + continue } - h.series[s.Ref].append(s.T, s.V) - - if !h.inBounds(s.T) { - return errors.Wrap(ErrOutOfBounds, "consume WAL") + ms := h.series.getByID(s.Ref) + if ms == nil { + return errors.Errorf("unknown series reference %d; abort WAL restore", s.Ref) + } + _, chunkCreated := ms.append(s.T, s.V) + if chunkCreated { + h.metrics.chunksCreated.Inc() + h.metrics.chunks.Inc() } - h.meta.Stats.NumSamples++ } return nil @@ -158,6 +217,9 @@ func (h *HeadBlock) init() error { deletesFunc := func(stones []Stone) error { for _, s := range stones { for _, itv := range s.intervals { + if itv.Maxt < mint { + continue + } h.tombstones.add(s.ref, itv) } } @@ -168,377 +230,229 @@ func (h *HeadBlock) init() error { if err := r.Read(seriesFunc, samplesFunc, deletesFunc); err != nil { return errors.Wrap(err, "consume WAL") } - return nil } -// inBounds returns true if the given timestamp is within the valid -// time bounds of the block. -func (h *HeadBlock) inBounds(t int64) bool { - return t >= h.meta.MinTime && t <= h.meta.MaxTime -} +// Truncate removes all data before mint from the head block and truncates its WAL. +func (h *Head) Truncate(mint int64) error { + initialize := h.MinTime() == math.MinInt64 -func (h *HeadBlock) String() string { - return h.meta.ULID.String() -} - -// Close syncs all data and closes underlying resources of the head block. -func (h *HeadBlock) Close() error { - h.mtx.Lock() - defer h.mtx.Unlock() - - if err := h.wal.Close(); err != nil { - return errors.Wrapf(err, "close WAL for head %s", h.dir) + if mint%h.chunkRange != 0 { + return errors.Errorf("truncating at %d not aligned", mint) } - // Check whether the head block still exists in the underlying dir - // or has already been replaced with a compacted version or removed. - meta, err := readMetaFile(h.dir) - if os.IsNotExist(err) { + if h.MinTime() >= mint { return nil } + atomic.StoreInt64(&h.minTime, mint) + + // Ensure that max time is at least as high as min time. + for h.MaxTime() < mint { + atomic.CompareAndSwapInt64(&h.maxTime, h.MaxTime(), mint) + } + + // This was an initial call to Truncate after loading blocks on startup. + // We haven't read back the WAL yet, so do not attempt to truncate it. 
+	if initialize {
+		return nil
+	}
+
+	start := time.Now()
+
+	h.gc()
+	h.logger.Log("msg", "head GC completed", "duration", time.Since(start))
+	h.metrics.gcDuration.Observe(time.Since(start).Seconds())
+
+	start = time.Now()
+
+	p, err := h.indexRange(mint, math.MaxInt64).Postings("", "")
 	if err != nil {
 		return err
 	}
-	if meta.ULID == h.meta.ULID {
-		return writeMetaFile(h.dir, &h.meta)
-	}
 
-	h.closed = true
+	if err := h.wal.Truncate(mint, p); err == nil {
+		h.logger.Log("msg", "WAL truncation completed", "duration", time.Since(start))
+	} else {
+		h.logger.Log("msg", "WAL truncation failed", "err", err, "duration", time.Since(start))
+	}
+	h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds())
+
 	return nil
 }
 
-// Meta returns a BlockMeta for the head block.
-func (h *HeadBlock) Meta() BlockMeta {
-	m := BlockMeta{
-		ULID:       h.meta.ULID,
-		MinTime:    h.meta.MinTime,
-		MaxTime:    h.meta.MaxTime,
-		Compaction: h.meta.Compaction,
+// initTime initializes a head with the first timestamp. This only needs to be called
+// for a completely fresh head with an empty WAL.
+// Returns true if the initialization took effect.
+func (h *Head) initTime(t int64) (initialized bool) {
+	// In the init state, the head has a high timestamp of math.MinInt64.
+	mint, _ := rangeForTimestamp(t, h.chunkRange)
+
+	if !atomic.CompareAndSwapInt64(&h.minTime, math.MinInt64, mint) {
+		return false
 	}
+	// Ensure that max time is initialized to at least the min time we just set.
+	// Concurrent appenders may already have set it to a higher value.
+	atomic.CompareAndSwapInt64(&h.maxTime, math.MinInt64, t)
 
-	m.Stats.NumChunks = atomic.LoadUint64(&h.meta.Stats.NumChunks)
-	m.Stats.NumSeries = atomic.LoadUint64(&h.meta.Stats.NumSeries)
-	m.Stats.NumSamples = atomic.LoadUint64(&h.meta.Stats.NumSamples)
-
-	return m
+	return true
 }
 
-// Tombstones returns the TombstoneReader against the block.
-func (h *HeadBlock) Tombstones() TombstoneReader {
-	return h.tombstones
+// initAppender is a helper to initialize the time bounds of the head
+// upon the first sample it receives.
+type initAppender struct {
+	app  Appender
+	head *Head
 }
 
-// Delete implements headBlock.
-func (h *HeadBlock) Delete(mint int64, maxt int64, ms ...labels.Matcher) error {
-	ir := h.Index()
-
-	pr := newPostingsReader(ir)
-	p, absent := pr.Select(ms...)
-
-	var stones []Stone
-
-Outer:
-	for p.Next() {
-		ref := p.At()
-		lset := h.series[ref].lset
-		for _, abs := range absent {
-			if lset.Get(abs) != "" {
-				continue Outer
-			}
-		}
-
-		// Delete only until the current values and not beyond.
-		tmin, tmax := clampInterval(mint, maxt, h.series[ref].chunks[0].minTime, h.series[ref].head().maxTime)
-		stones = append(stones, Stone{ref, intervals{{tmin, tmax}}})
+func (a *initAppender) Add(lset labels.Labels, t int64, v float64) (uint64, error) {
+	if a.app != nil {
+		return a.app.Add(lset, t, v)
 	}
+	a.head.initTime(t)
+	a.app = a.head.appender()
 
-	if p.Err() != nil {
-		return p.Err()
-	}
-	if err := h.wal.LogDeletes(stones); err != nil {
-		return err
-	}
-
-	for _, s := range stones {
-		h.tombstones.add(s.ref, s.intervals[0])
-	}
-
-	h.meta.Stats.NumTombstones = uint64(len(h.tombstones))
-	return nil
+	return a.app.Add(lset, t, v)
 }
 
-// Snapshot persists the current state of the headblock to the given directory.
-// Callers must ensure that there are no active appenders against the block.
-// DB does this by acquiring its own write lock.
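A usage sketch for the appender path above: on a fresh head, Appender returns an initAppender whose first Add initializes the head's time bounds before delegating. labels.FromStrings is assumed from the tsdb labels package; all values are illustrative.

	app := head.Appender() // initAppender on a fresh head

	// The first Add sets the head's min/max time from t.
	ref, err := app.Add(labels.FromStrings("job", "node"), 1000, 0.5)
	if err != nil {
		return err
	}
	// Later samples for the same series can take the fast path by reference.
	if err := app.AddFast(ref, 2000, 0.6); err != nil {
		return err
	}
	return app.Commit()
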
-func (h *HeadBlock) Snapshot(snapshotDir string) error { - if h.meta.Stats.NumSeries == 0 { +func (a *initAppender) AddFast(ref uint64, t int64, v float64) error { + if a.app == nil { + return ErrNotFound + } + return a.app.AddFast(ref, t, v) +} + +func (a *initAppender) Commit() error { + if a.app == nil { return nil } - - return h.compactor.Write(snapshotDir, h) + return a.app.Commit() } -// Dir returns the directory of the block. -func (h *HeadBlock) Dir() string { return h.dir } - -// Index returns an IndexReader against the block. -func (h *HeadBlock) Index() IndexReader { - h.mtx.RLock() - defer h.mtx.RUnlock() - - return &headIndexReader{HeadBlock: h, maxSeries: uint32(len(h.series) - 1)} -} - -// Chunks returns a ChunkReader against the block. -func (h *HeadBlock) Chunks() ChunkReader { return &headChunkReader{h} } - -// Querier returns a new Querier against the block for the range [mint, maxt]. -func (h *HeadBlock) Querier(mint, maxt int64) Querier { - h.mtx.RLock() - if h.closed { - panic(fmt.Sprintf("block %s already closed", h.dir)) +func (a *initAppender) Rollback() error { + if a.app == nil { + return nil } - h.mtx.RUnlock() + return a.app.Rollback() +} - return &blockQuerier{ - mint: mint, - maxt: maxt, - index: h.Index(), - chunks: h.Chunks(), - tombstones: h.Tombstones(), +// Appender returns a new Appender on the database. +func (h *Head) Appender() Appender { + h.metrics.activeAppenders.Inc() + + // The head cache might not have a starting point yet. The init appender + // picks up the first appended timestamp as the base. + if h.MinTime() == math.MinInt64 { + return &initAppender{head: h} + } + return h.appender() +} + +func (h *Head) appender() *headAppender { + return &headAppender{ + head: h, + mint: h.MaxTime() - h.chunkRange/2, + samples: h.getAppendBuffer(), + highTimestamp: math.MinInt64, } } -// Appender returns a new Appender against the head block. -func (h *HeadBlock) Appender() Appender { - atomic.AddUint64(&h.activeWriters, 1) - - h.mtx.RLock() - - if h.closed { - panic(fmt.Sprintf("block %s already closed", h.dir)) - } - return &headAppender{HeadBlock: h, samples: getHeadAppendBuffer()} -} - -// ActiveWriters returns true if the block has open write transactions. -func (h *HeadBlock) ActiveWriters() int { - return int(atomic.LoadUint64(&h.activeWriters)) -} - -// HighTimestamp returns the highest inserted sample timestamp. -func (h *HeadBlock) HighTimestamp() int64 { - return atomic.LoadInt64(&h.highTimestamp) -} - -var headPool = sync.Pool{} - -func getHeadAppendBuffer() []RefSample { - b := headPool.Get() +func (h *Head) getAppendBuffer() []RefSample { + b := h.appendPool.Get() if b == nil { return make([]RefSample, 0, 512) } return b.([]RefSample) } -func putHeadAppendBuffer(b []RefSample) { - headPool.Put(b[:0]) +func (h *Head) putAppendBuffer(b []RefSample) { + h.appendPool.Put(b[:0]) } type headAppender struct { - *HeadBlock - - newSeries []*hashedLabels - newLabels []labels.Labels - newHashes map[uint64]uint64 + head *Head + mint int64 + series []RefSeries samples []RefSample highTimestamp int64 } -type hashedLabels struct { - ref uint64 - hash uint64 - labels labels.Labels -} - -func (a *headAppender) Add(lset labels.Labels, t int64, v float64) (string, error) { - if !a.inBounds(t) { - return "", ErrOutOfBounds +func (a *headAppender) Add(lset labels.Labels, t int64, v float64) (uint64, error) { + if t < a.mint { + return 0, ErrOutOfBounds } - hash := lset.Hash() - refb := make([]byte, 8) - // Series exists already in the block. 
- if ms := a.get(hash, lset); ms != nil { - binary.BigEndian.PutUint64(refb, uint64(ms.ref)) - return string(refb), a.AddFast(string(refb), t, v) + s := a.head.series.getByHash(hash, lset) + + if s == nil { + s = a.head.create(hash, lset) + + a.series = append(a.series, RefSeries{ + Ref: s.ref, + Labels: lset, + hash: hash, + }) } - // Series was added in this transaction previously. - if ref, ok := a.newHashes[hash]; ok { - binary.BigEndian.PutUint64(refb, ref) - // XXX(fabxc): there's no fast path for multiple samples for the same new series - // in the same transaction. We always return the invalid empty ref. It's has not - // been a relevant use case so far and is not worth the trouble. - return "", a.AddFast(string(refb), t, v) - } - - // The series is completely new. - if a.newSeries == nil { - a.newHashes = map[uint64]uint64{} - } - // First sample for new series. - ref := uint64(len(a.newSeries)) - - a.newSeries = append(a.newSeries, &hashedLabels{ - ref: ref, - hash: hash, - labels: lset, - }) - // First bit indicates its a series created in this transaction. - ref |= (1 << 63) - - a.newHashes[hash] = ref - binary.BigEndian.PutUint64(refb, ref) - - return "", a.AddFast(string(refb), t, v) + return s.ref, a.AddFast(s.ref, t, v) } -func (a *headAppender) AddFast(ref string, t int64, v float64) error { - if len(ref) != 8 { - return errors.Wrap(ErrNotFound, "invalid ref length") +func (a *headAppender) AddFast(ref uint64, t int64, v float64) error { + s := a.head.series.getByID(ref) + + if s == nil { + return errors.Wrap(ErrNotFound, "unknown series") } - var ( - refn = binary.BigEndian.Uint64(yoloBytes(ref)) - id = (refn << 1) >> 1 - inTx = refn&(1<<63) != 0 - ) - // Distinguish between existing series and series created in - // this transaction. - if inTx { - if id > uint64(len(a.newSeries)-1) { - return errors.Wrap(ErrNotFound, "transaction series ID too high") - } - // TODO(fabxc): we also have to validate here that the - // sample sequence is valid. - // We also have to revalidate it as we switch locks and create - // the new series. - } else if id > uint64(len(a.series)) { - return errors.Wrap(ErrNotFound, "transaction series ID too high") - } else { - ms := a.series[id] - if ms == nil { - return errors.Wrap(ErrNotFound, "nil series") - } - // TODO(fabxc): memory series should be locked here already. - // Only problem is release of locks in case of a rollback. - c := ms.head() - - if !a.inBounds(t) { - return ErrOutOfBounds - } - if t < c.maxTime { - return ErrOutOfOrderSample - } - - // We are allowing exact duplicates as we can encounter them in valid cases - // like federation and erroring out at that time would be extremely noisy. - if c.maxTime == t && math.Float64bits(ms.lastValue) != math.Float64bits(v) { - return ErrAmendSample - } + if err := s.appendable(t, v); err != nil { + return err } + if t < a.mint { + return ErrOutOfBounds + } if t > a.highTimestamp { a.highTimestamp = t } a.samples = append(a.samples, RefSample{ - Ref: refn, - T: t, - V: v, + Ref: ref, + T: t, + V: v, + series: s, }) return nil } -func (a *headAppender) createSeries() error { - if len(a.newSeries) == 0 { - return nil - } - a.newLabels = make([]labels.Labels, 0, len(a.newSeries)) - base0 := len(a.series) - - a.mtx.RUnlock() - defer a.mtx.RLock() - a.mtx.Lock() - defer a.mtx.Unlock() - - base1 := len(a.series) - - for _, l := range a.newSeries { - // We switched locks and have to re-validate that the series were not - // created by another goroutine in the meantime. 
-		if base1 > base0 {
-			if ms := a.get(l.hash, l.labels); ms != nil {
-				l.ref = uint64(ms.ref)
-				continue
-			}
-		}
-		// Series is still new.
-		a.newLabels = append(a.newLabels, l.labels)
-		l.ref = uint64(len(a.series))
-
-		a.create(l.hash, l.labels)
-	}
-
-	// Write all new series to the WAL.
-	if err := a.wal.LogSeries(a.newLabels); err != nil {
-		return errors.Wrap(err, "WAL log series")
-	}
-
-	return nil
-}
-
 func (a *headAppender) Commit() error {
-	defer atomic.AddUint64(&a.activeWriters, ^uint64(0))
-	defer putHeadAppendBuffer(a.samples)
-	defer a.mtx.RUnlock()
+	defer a.Rollback()
 
-	if err := a.createSeries(); err != nil {
+	if err := a.head.wal.LogSeries(a.series); err != nil {
 		return err
 	}
-
-	// We have to update the refs of samples for series we just created.
-	for i := range a.samples {
-		s := &a.samples[i]
-		if s.Ref&(1<<63) != 0 {
-			s.Ref = a.newSeries[(s.Ref<<1)>>1].ref
-		}
-	}
-
-	// Write all new samples to the WAL and add them to the
-	// in-mem database on success.
-	if err := a.wal.LogSamples(a.samples); err != nil {
+	if err := a.head.wal.LogSamples(a.samples); err != nil {
 		return errors.Wrap(err, "WAL log samples")
 	}
 
-	total := uint64(len(a.samples))
+	total := len(a.samples)
 
 	for _, s := range a.samples {
-		if !a.series[s.Ref].append(s.T, s.V) {
+		ok, chunkCreated := s.series.append(s.T, s.V)
+		if !ok {
 			total--
 		}
+		if chunkCreated {
+			a.head.metrics.chunks.Inc()
+			a.head.metrics.chunksCreated.Inc()
+		}
 	}
 
-	atomic.AddUint64(&a.meta.Stats.NumSamples, total)
-	atomic.AddUint64(&a.meta.Stats.NumSeries, uint64(len(a.newSeries)))
+	a.head.metrics.samplesAppended.Add(float64(total))
 
 	for {
-		ht := a.HeadBlock.HighTimestamp()
+		ht := a.head.MaxTime()
 		if a.highTimestamp <= ht {
 			break
 		}
-		if atomic.CompareAndSwapInt64(&a.HeadBlock.highTimestamp, ht, a.highTimestamp) {
+		if atomic.CompareAndSwapInt64(&a.head.maxTime, ht, a.highTimestamp) {
 			break
 		}
 	}
@@ -547,69 +461,297 @@ func (a *headAppender) Commit() error {
 }
 
 func (a *headAppender) Rollback() error {
-	a.mtx.RUnlock()
-	atomic.AddUint64(&a.activeWriters, ^uint64(0))
-	putHeadAppendBuffer(a.samples)
+	a.head.metrics.activeAppenders.Dec()
+	a.head.putAppendBuffer(a.samples)
+
 	return nil
 }
 
+// Delete all samples in the range of [mint, maxt] for series that satisfy the given
+// label matchers.
+func (h *Head) Delete(mint, maxt int64, ms ...labels.Matcher) error {
+	// Do not delete anything beyond the currently valid range.
+	mint, maxt = clampInterval(mint, maxt, h.MinTime(), h.MaxTime())
+
+	ir := h.indexRange(mint, maxt)
+
+	pr := newPostingsReader(ir)
+	p, absent := pr.Select(ms...)
+
+	var stones []Stone
+
+Outer:
+	for p.Next() {
+		series := h.series.getByID(p.At())
+
+		for _, abs := range absent {
+			if series.lset.Get(abs) != "" {
+				continue Outer
+			}
+		}
+
+		// Delete only until the current values and not beyond.
+		t0, t1 := clampInterval(mint, maxt, series.minTime(), series.maxTime())
+		stones = append(stones, Stone{p.At(), Intervals{{t0, t1}}})
+	}
+
+	if p.Err() != nil {
+		return p.Err()
+	}
+	if err := h.wal.LogDeletes(stones); err != nil {
+		return err
+	}
+	for _, s := range stones {
+		h.tombstones.add(s.ref, s.intervals[0])
+	}
+	return nil
+}
+
+// gc removes data before the minimum timestamp from the head.
+func (h *Head) gc() {
+	defer runtime.GC()
+
+	// Only data strictly lower than this timestamp must be deleted.
+	mint := h.MinTime()
+
+	// Drop old chunks and remember series IDs and hashes if they can be
+	// deleted entirely.
+	deleted, chunksRemoved := h.series.gc(mint)
+	seriesRemoved := len(deleted)
+
+	h.metrics.seriesRemoved.Add(float64(seriesRemoved))
+	h.metrics.series.Sub(float64(seriesRemoved))
+	h.metrics.chunksRemoved.Add(float64(chunksRemoved))
+	h.metrics.chunks.Sub(float64(chunksRemoved))
+
+	// Remove deleted series IDs from the postings lists. First do a collection
+	// run where we rebuild all postings that have something to delete.
+	h.postings.mtx.RLock()
+
+	type replEntry struct {
+		idx int
+		l   []uint64
+	}
+	collected := map[labels.Label]replEntry{}
+
+	for t, p := range h.postings.m {
+		repl := replEntry{idx: len(p)}
+
+		for i, id := range p {
+			if _, ok := deleted[id]; ok {
+				// First ID that got deleted, initialize replacement with
+				// all remaining IDs so far.
+				if repl.l == nil {
+					repl.l = make([]uint64, 0, len(p))
+					repl.l = append(repl.l, p[:i]...)
+				}
+				continue
+			}
+			// Only add to the replacement once we know we have to do it.
+			if repl.l != nil {
+				repl.l = append(repl.l, id)
+			}
+		}
+		if repl.l != nil {
+			collected[t] = repl
+		}
+	}
+
+	h.postings.mtx.RUnlock()
+
+	// Replace all postings that have changed. Append all IDs that may have
+	// been added while we switched locks.
+	h.postings.mtx.Lock()
+
+	for t, repl := range collected {
+		l := append(repl.l, h.postings.m[t][repl.idx:]...)
+
+		if len(l) > 0 {
+			h.postings.m[t] = l
+		} else {
+			delete(h.postings.m, t)
+		}
+	}
+
+	h.postings.mtx.Unlock()
+
+	// Rebuild symbols and label value indices from what is left in the postings terms.
+	h.postings.mtx.RLock()
+
+	symbols := make(map[string]struct{}, len(h.symbols))
+	values := make(map[string]stringset, len(h.values))
+
+	for t := range h.postings.m {
+		symbols[t.Name] = struct{}{}
+		symbols[t.Value] = struct{}{}
+
+		ss, ok := values[t.Name]
+		if !ok {
+			ss = stringset{}
+			values[t.Name] = ss
+		}
+		ss.set(t.Value)
+	}
+
+	h.postings.mtx.RUnlock()
+
+	h.symMtx.Lock()
+
+	h.symbols = symbols
+	h.values = values
+
+	h.symMtx.Unlock()
+}
+
+func (h *Head) Tombstones() TombstoneReader {
+	return h.tombstones
+}
+
+// Index returns an IndexReader against the block.
+func (h *Head) Index() IndexReader {
+	return h.indexRange(math.MinInt64, math.MaxInt64)
+}
+
+func (h *Head) indexRange(mint, maxt int64) *headIndexReader {
+	if hmin := h.MinTime(); hmin > mint {
+		mint = hmin
+	}
+	return &headIndexReader{head: h, mint: mint, maxt: maxt}
+}
+
+// Chunks returns a ChunkReader against the block.
+func (h *Head) Chunks() ChunkReader {
+	return h.chunksRange(math.MinInt64, math.MaxInt64)
+}
+
+func (h *Head) chunksRange(mint, maxt int64) *headChunkReader {
+	if hmin := h.MinTime(); hmin > mint {
+		mint = hmin
+	}
+	return &headChunkReader{head: h, mint: mint, maxt: maxt}
+}
+
+// MinTime returns the lowest time bound on visible data in the head.
+func (h *Head) MinTime() int64 {
+	return atomic.LoadInt64(&h.minTime)
+}
+
+// MaxTime returns the highest timestamp seen in data of the head.
+func (h *Head) MaxTime() int64 {
+	return atomic.LoadInt64(&h.maxTime)
+}
+
 type headChunkReader struct {
-	*HeadBlock
+	head       *Head
+	mint, maxt int64
+}
+
+func (h *headChunkReader) Close() error {
+	return nil
+}
+
+// packChunkID packs a seriesID and a chunkID within it into a global 8 byte ID.
+// It panics if the seriesID exceeds 5 bytes or the chunk ID 3 bytes.
+func packChunkID(seriesID, chunkID uint64) uint64 { + if seriesID > (1<<40)-1 { + panic("series ID exceeds 5 bytes") + } + if chunkID > (1<<24)-1 { + panic("chunk ID exceeds 3 bytes") + } + return (seriesID << 24) | chunkID +} + +func unpackChunkID(id uint64) (seriesID, chunkID uint64) { + return id >> 24, (id << 40) >> 40 } // Chunk returns the chunk for the reference number. func (h *headChunkReader) Chunk(ref uint64) (chunks.Chunk, error) { - h.mtx.RLock() - defer h.mtx.RUnlock() + sid, cid := unpackChunkID(ref) - si := ref >> 32 - ci := (ref << 32) >> 32 + s := h.head.series.getByID(sid) - c := &safeChunk{ - Chunk: h.series[si].chunks[ci].chunk, - s: h.series[si], - i: int(ci), + s.mtx.RLock() + c := s.chunk(int(cid)) + s.mtx.RUnlock() + + // Do not expose chunks that are outside of the specified range. + if c == nil || !intervalOverlap(c.minTime, c.maxTime, h.mint, h.maxt) { + return nil, ErrNotFound } - return c, nil + return &safeChunk{ + Chunk: c.chunk, + s: s, + cid: int(cid), + }, nil } type safeChunk struct { chunks.Chunk - s *memSeries - i int + s *memSeries + cid int } func (c *safeChunk) Iterator() chunks.Iterator { c.s.mtx.RLock() defer c.s.mtx.RUnlock() - return c.s.iterator(c.i) + return c.s.iterator(c.cid) } // func (c *safeChunk) Appender() (chunks.Appender, error) { panic("illegal") } // func (c *safeChunk) Bytes() []byte { panic("illegal") } // func (c *safeChunk) Encoding() chunks.Encoding { panic("illegal") } +type rangeHead struct { + head *Head + mint, maxt int64 +} + +func (h *rangeHead) Index() IndexReader { + return h.head.indexRange(h.mint, h.maxt) +} + +func (h *rangeHead) Chunks() ChunkReader { + return h.head.chunksRange(h.mint, h.maxt) +} + +func (h *rangeHead) Tombstones() TombstoneReader { + return newEmptyTombstoneReader() +} + type headIndexReader struct { - *HeadBlock - // Highest series that existed when the index reader was instantiated. - maxSeries uint32 + head *Head + mint, maxt int64 +} + +func (h *headIndexReader) Close() error { + return nil } func (h *headIndexReader) Symbols() (map[string]struct{}, error) { - return h.symbols, nil + h.head.symMtx.RLock() + defer h.head.symMtx.RUnlock() + + res := make(map[string]struct{}, len(h.head.symbols)) + + for s := range h.head.symbols { + res[s] = struct{}{} + } + return res, nil } // LabelValues returns the possible label values func (h *headIndexReader) LabelValues(names ...string) (StringTuples, error) { - h.mtx.RLock() - defer h.mtx.RUnlock() - if len(names) != 1 { return nil, errInvalidSize } var sl []string - for s := range h.values[names[0]] { + h.head.symMtx.RLock() + defer h.head.symMtx.RUnlock() + + for s := range h.head.values[names[0]] { sl = append(sl, s) } sort.Strings(sl) @@ -619,46 +761,43 @@ func (h *headIndexReader) LabelValues(names ...string) (StringTuples, error) { // Postings returns the postings list iterator for the label pair. func (h *headIndexReader) Postings(name, value string) (Postings, error) { - h.mtx.RLock() - defer h.mtx.RUnlock() - - return h.postings.get(term{name: name, value: value}), nil + return h.head.postings.get(name, value), nil } func (h *headIndexReader) SortedPostings(p Postings) Postings { - h.mtx.RLock() - defer h.mtx.RUnlock() - - ep := make([]uint32, 0, 1024) + ep := make([]uint64, 0, 128) for p.Next() { - // Skip posting entries that include series added after we - // instantiated the index reader. 
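The pack/unpack helpers above are inverses for values within the stated bounds; a quick round-trip sketch with illustrative values:

	// 5-byte series ID, 3-byte chunk ID.
	id := packChunkID(0x123456789A, 0x42)
	sid, cid := unpackChunkID(id)
	// sid == 0x123456789A and cid == 0x42 again; larger inputs
	// would panic in packChunkID rather than silently truncate.
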
- if p.At() > h.maxSeries { - break - } ep = append(ep, p.At()) } if err := p.Err(); err != nil { return errPostings{err: errors.Wrap(err, "expand postings")} } + var err error sort.Slice(ep, func(i, j int) bool { - return labels.Compare(h.series[ep[i]].lset, h.series[ep[j]].lset) < 0 + if err != nil { + return false + } + a := h.head.series.getByID(ep[i]) + b := h.head.series.getByID(ep[j]) + + if a == nil || b == nil { + err = errors.Errorf("series not found") + return false + } + return labels.Compare(a.lset, b.lset) < 0 }) + if err != nil { + return errPostings{err: err} + } return newListPostings(ep) } // Series returns the series for the given reference. -func (h *headIndexReader) Series(ref uint32, lbls *labels.Labels, chks *[]ChunkMeta) error { - h.mtx.RLock() - defer h.mtx.RUnlock() +func (h *headIndexReader) Series(ref uint64, lbls *labels.Labels, chks *[]ChunkMeta) error { + s := h.head.series.getByID(ref) - if ref > h.maxSeries { - return ErrNotFound - } - - s := h.series[ref] if s == nil { return ErrNotFound } @@ -670,10 +809,14 @@ func (h *headIndexReader) Series(ref uint32, lbls *labels.Labels, chks *[]ChunkM *chks = (*chks)[:0] for i, c := range s.chunks { + // Do not expose chunks that are outside of the specified range. + if !intervalOverlap(c.minTime, c.maxTime, h.mint, h.maxt) { + continue + } *chks = append(*chks, ChunkMeta{ MinTime: c.minTime, MaxTime: c.maxTime, - Ref: (uint64(ref) << 32) | uint64(i), + Ref: packChunkID(s.ref, uint64(s.chunkID(i))), }) } @@ -681,37 +824,35 @@ func (h *headIndexReader) Series(ref uint32, lbls *labels.Labels, chks *[]ChunkM } func (h *headIndexReader) LabelIndices() ([][]string, error) { - h.mtx.RLock() - defer h.mtx.RUnlock() + h.head.symMtx.RLock() + defer h.head.symMtx.RUnlock() res := [][]string{} - for s := range h.values { + for s := range h.head.values { res = append(res, []string{s}) } return res, nil } -// get retrieves the chunk with the hash and label set and creates -// a new one if it doesn't exist yet. -func (h *HeadBlock) get(hash uint64, lset labels.Labels) *memSeries { - series := h.hashes[hash] +func (h *Head) create(hash uint64, lset labels.Labels) *memSeries { + h.metrics.series.Inc() + h.metrics.seriesCreated.Inc() - for _, s := range series { - if s.lset.Equals(lset) { - return s - } + // Optimistically assume that we are the first one to create the series. + id := atomic.AddUint64(&h.lastSeriesID, 1) + s := newMemSeries(lset, id, h.chunkRange) + + s, created := h.series.getOrSet(hash, s) + // Skip indexing if we didn't actually create the series. + if !created { + return s } - return nil -} -func (h *HeadBlock) create(hash uint64, lset labels.Labels) *memSeries { - s := newMemSeries(lset, uint32(len(h.series)), h.meta.MaxTime) + h.postings.add(id, lset) - // Allocate empty space until we can insert at the given index. - h.series = append(h.series, s) - - h.hashes[hash] = append(h.hashes[hash], s) + h.symMtx.Lock() + defer h.symMtx.Unlock() for _, l := range lset { valset, ok := h.values[l.Name] @@ -721,17 +862,179 @@ func (h *HeadBlock) create(hash uint64, lset labels.Labels) *memSeries { } valset.set(l.Value) - h.postings.add(s.ref, term{name: l.Name, value: l.Value}) - h.symbols[l.Name] = struct{}{} h.symbols[l.Value] = struct{}{} } - h.postings.add(s.ref, term{}) - return s } +// seriesHashmap is a simple hashmap for memSeries by their label set. It is built +// on top of a regular hashmap and holds a slice of series to resolve hash collisions. 
+// Its methods require the hash to be submitted with it to avoid re-computations throughout
+// the code.
+type seriesHashmap map[uint64][]*memSeries
+
+func (m seriesHashmap) get(hash uint64, lset labels.Labels) *memSeries {
+	for _, s := range m[hash] {
+		if s.lset.Equals(lset) {
+			return s
+		}
+	}
+	return nil
+}
+
+func (m seriesHashmap) set(hash uint64, s *memSeries) {
+	l := m[hash]
+	for i, prev := range l {
+		if prev.lset.Equals(s.lset) {
+			l[i] = s
+			return
+		}
+	}
+	m[hash] = append(l, s)
+}
+
+func (m seriesHashmap) del(hash uint64, lset labels.Labels) {
+	var rem []*memSeries
+	for _, s := range m[hash] {
+		if !s.lset.Equals(lset) {
+			rem = append(rem, s)
+		}
+	}
+	if len(rem) == 0 {
+		delete(m, hash)
+	} else {
+		m[hash] = rem
+	}
+}
+
+// stripeSeries locks modulo ranges of IDs and hashes to reduce lock contention.
+// The locks are padded to not be on the same cache line. Filling the padded space
+// with the maps was profiled to be slower – likely due to the additional pointer
+// dereferences.
+type stripeSeries struct {
+	series [stripeSize]map[uint64]*memSeries
+	hashes [stripeSize]seriesHashmap
+	locks  [stripeSize]stripeLock
+}
+
+const (
+	stripeSize = 1 << 14
+	stripeMask = stripeSize - 1
+)
+
+type stripeLock struct {
+	sync.RWMutex
+	// Padding to avoid multiple locks being on the same cache line.
+	_ [40]byte
+}
+
+func newStripeSeries() *stripeSeries {
+	s := &stripeSeries{}
+
+	for i := range s.series {
+		s.series[i] = map[uint64]*memSeries{}
+	}
+	for i := range s.hashes {
+		s.hashes[i] = seriesHashmap{}
+	}
+	return s
+}
+
+// gc garbage collects old chunks that are strictly before mint and removes
+// series entirely that have no chunks left.
+func (s *stripeSeries) gc(mint int64) (map[uint64]struct{}, int) {
+	var (
+		deleted  = map[uint64]struct{}{}
+		rmChunks = 0
+	)
+	// Run through all series and truncate old chunks. Mark those with no
+	// chunks left as deleted and store their ID.
+	for i := 0; i < stripeSize; i++ {
+		s.locks[i].Lock()
+
+		for hash, all := range s.hashes[i] {
+			for _, series := range all {
+				series.mtx.Lock()
+				rmChunks += series.truncateChunksBefore(mint)
+
+				if len(series.chunks) > 0 {
+					series.mtx.Unlock()
+					continue
+				}
+
+				// The series is gone entirely. We need to keep the series lock
+				// and make sure we have acquired the stripe locks for hash and ID of the
+				// series alike.
+				// If we don't hold them all, there's a very small chance that a series receives
+				// samples again while we are half-way into deleting it.
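// Note on the sharding used throughout stripeSeries (including in gc below):
// because stripeSize is a power of two, masking with stripeMask is a cheap
// modulo. A one-line sketch:
//
//	i := hash & stripeMask // equivalent to hash % stripeSize, as stripeSize == 1<<14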
+				j := int(series.ref & stripeMask)
+
+				if i != j {
+					s.locks[j].Lock()
+				}
+
+				deleted[series.ref] = struct{}{}
+				s.hashes[i].del(hash, series.lset)
+				delete(s.series[j], series.ref)
+
+				if i != j {
+					s.locks[j].Unlock()
+				}
+
+				series.mtx.Unlock()
+			}
+		}
+
+		s.locks[i].Unlock()
+	}
+
+	return deleted, rmChunks
+}
+
+func (s *stripeSeries) getByID(id uint64) *memSeries {
+	i := id & stripeMask
+
+	s.locks[i].RLock()
+	series := s.series[i][id]
+	s.locks[i].RUnlock()
+
+	return series
+}
+
+func (s *stripeSeries) getByHash(hash uint64, lset labels.Labels) *memSeries {
+	i := hash & stripeMask
+
+	s.locks[i].RLock()
+	series := s.hashes[i].get(hash, lset)
+	s.locks[i].RUnlock()
+
+	return series
+}
+
+func (s *stripeSeries) getOrSet(hash uint64, series *memSeries) (*memSeries, bool) {
+	i := hash & stripeMask
+
+	s.locks[i].Lock()
+
+	if prev := s.hashes[i].get(hash, series.lset); prev != nil {
+		s.locks[i].Unlock()
+		return prev, false
+	}
+	s.hashes[i].set(hash, series)
+	s.locks[i].Unlock()
+
+	i = series.ref & stripeMask
+
+	s.locks[i].Lock()
+	s.series[i][series.ref] = series
+	s.locks[i].Unlock()
+
+	return series, true
+}
+
 type sample struct {
 	t int64
 	v float64
@@ -740,18 +1043,27 @@ type sample struct {
 
 type memSeries struct {
 	mtx sync.RWMutex
 
-	ref    uint32
-	lset   labels.Labels
-	chunks []*memChunk
+	ref          uint64
+	lset         labels.Labels
+	chunks       []*memChunk
+	chunkRange   int64
+	firstChunkID int
 
 	nextAt    int64 // timestamp at which to cut the next chunk.
-	maxt      int64 // maximum timestamp for the series.
 	lastValue float64
 	sampleBuf [4]sample
 
 	app chunks.Appender // Current appender for the chunk.
 }
 
+func (s *memSeries) minTime() int64 {
+	return s.chunks[0].minTime
+}
+
+func (s *memSeries) maxTime() int64 {
+	return s.head().maxTime
+}
+
 func (s *memSeries) cut(mint int64) *memChunk {
 	c := &memChunk{
 		chunk:   chunks.NewXORChunk(),
@@ -768,41 +1080,92 @@ func (s *memSeries) cut(mint int64) *memChunk {
 	return c
 }
 
-func newMemSeries(lset labels.Labels, id uint32, maxt int64) *memSeries {
+func newMemSeries(lset labels.Labels, id uint64, chunkRange int64) *memSeries {
 	s := &memSeries{
-		lset:   lset,
-		ref:    id,
-		maxt:   maxt,
-		nextAt: math.MinInt64,
+		lset:       lset,
+		ref:        id,
+		chunkRange: chunkRange,
+		nextAt:     math.MinInt64,
 	}
 	return s
 }
 
-func (s *memSeries) append(t int64, v float64) bool {
+// appendable checks whether the given sample is valid for appending to the series.
+func (s *memSeries) appendable(t int64, v float64) error {
+	c := s.head()
+	if c == nil {
+		return nil
+	}
+
+	if t > c.maxTime {
+		return nil
+	}
+	if t < c.maxTime {
+		return ErrOutOfOrderSample
+	}
+	// We are allowing exact duplicates as we can encounter them in valid cases
+	// like federation and erroring out at that time would be extremely noisy.
+	if math.Float64bits(s.lastValue) != math.Float64bits(v) {
+		return ErrAmendSample
+	}
+	return nil
+}
+
+func (s *memSeries) chunk(id int) *memChunk {
+	ix := id - s.firstChunkID
+	if ix < 0 || ix >= len(s.chunks) {
+		return nil
+	}
+	return s.chunks[ix]
+}
+
+func (s *memSeries) chunkID(pos int) int {
+	return pos + s.firstChunkID
+}
+
+// truncateChunksBefore removes all chunks from the series that have no timestamp
+// at or after mint. Chunk IDs remain unchanged.
+func (s *memSeries) truncateChunksBefore(mint int64) (removed int) {
+	var k int
+	for i, c := range s.chunks {
+		if c.maxTime >= mint {
+			break
+		}
+		k = i + 1
+	}
+	s.chunks = append(s.chunks[:0], s.chunks[k:]...)
+ s.firstChunkID += k + + return k +} + +// append adds the sample (t, v) to the series. +func (s *memSeries) append(t int64, v float64) (success, chunkCreated bool) { const samplesPerChunk = 120 s.mtx.Lock() - defer s.mtx.Unlock() - var c *memChunk + c := s.head() - if len(s.chunks) == 0 { + if c == nil { c = s.cut(t) + chunkCreated = true } - c = s.head() if c.maxTime >= t { - return false + s.mtx.Unlock() + return false, chunkCreated } - if c.samples > samplesPerChunk/4 && t >= s.nextAt { + if c.chunk.NumSamples() > samplesPerChunk/4 && t >= s.nextAt { c = s.cut(t) + chunkCreated = true } s.app.Append(t, v) c.maxTime = t - c.samples++ - if c.samples == samplesPerChunk/4 { - s.nextAt = computeChunkEndTime(c.minTime, c.maxTime, s.maxt) + if c.chunk.NumSamples() == samplesPerChunk/4 { + _, maxt := rangeForTimestamp(c.minTime, s.chunkRange) + s.nextAt = computeChunkEndTime(c.minTime, c.maxTime, maxt) } s.lastValue = v @@ -812,7 +1175,9 @@ func (s *memSeries) append(t int64, v float64) bool { s.sampleBuf[2] = s.sampleBuf[3] s.sampleBuf[3] = sample{t: t, v: v} - return true + s.mtx.Unlock() + + return true, chunkCreated } // computeChunkEndTime estimates the end timestamp based the beginning of a chunk, @@ -826,30 +1191,33 @@ func computeChunkEndTime(start, cur, max int64) int64 { return start + (max-start)/a } -func (s *memSeries) iterator(i int) chunks.Iterator { - c := s.chunks[i] +func (s *memSeries) iterator(id int) chunks.Iterator { + c := s.chunk(id) - if i < len(s.chunks)-1 { + if id-s.firstChunkID < len(s.chunks)-1 { return c.chunk.Iterator() } - + // Serve the last 4 samples for the last chunk from the series buffer + // as their compressed bytes may be mutated by added samples. it := &memSafeIterator{ Iterator: c.chunk.Iterator(), i: -1, - total: c.samples, + total: c.chunk.NumSamples(), buf: s.sampleBuf, } return it } func (s *memSeries) head() *memChunk { + if len(s.chunks) == 0 { + return nil + } return s.chunks[len(s.chunks)-1] } type memChunk struct { chunk chunks.Chunk minTime, maxTime int64 - samples int } type memSafeIterator struct { diff --git a/vendor/github.com/prometheus/tsdb/index.go b/vendor/github.com/prometheus/tsdb/index.go index e3cce3c00f..ddc2c4f52a 100644 --- a/vendor/github.com/prometheus/tsdb/index.go +++ b/vendor/github.com/prometheus/tsdb/index.go @@ -18,7 +18,6 @@ import ( "encoding/binary" "fmt" "hash" - "hash/crc32" "io" "os" "path/filepath" @@ -100,7 +99,7 @@ type IndexWriter interface { // their labels. // The reference numbers are used to resolve entries in postings lists that // are added later. - AddSeries(ref uint32, l labels.Labels, chunks ...ChunkMeta) error + AddSeries(ref uint64, l labels.Labels, chunks ...ChunkMeta) error // WriteLabelIndex serializes an index from label names to values. // The passed in values chained tuples of strings of the length of names. @@ -131,7 +130,7 @@ type indexWriter struct { uint32s []uint32 symbols map[string]uint32 // symbol offsets - seriesOffsets map[uint32]uint64 // offsets of series + seriesOffsets map[uint64]uint64 // offsets of series labelIndexes []hashEntry // label index offsets postings []hashEntry // postings lists offsets @@ -176,8 +175,8 @@ func newIndexWriter(dir string) (*indexWriter, error) { // Caches. 
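For the chunk-cutting logic in append above: computeChunkEndTime extrapolates, once a quarter of the targeted samples have arrived, when the chunk should end so that chunks align with the range boundary. A worked example with illustrative numbers:

	// 30 samples (a quarter of 120) spanned [0, 99] and the range ends at 1000:
	// a = (1000-0) / ((99-0+1)*4) = 2, so the chunk is cut at 0 + 1000/2 = 500.
	end := computeChunkEndTime(0, 99, 1000) // == 500
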
symbols: make(map[string]uint32, 1<<13), - seriesOffsets: make(map[uint32]uint64, 1<<16), - crc32: crc32.New(crc32.MakeTable(crc32.Castagnoli)), + seriesOffsets: make(map[uint64]uint64, 1<<16), + crc32: newCRC32(), } if err := iw.writeMeta(); err != nil { return nil, err @@ -261,7 +260,7 @@ func (w *indexWriter) writeMeta() error { return w.write(w.buf1.get()) } -func (w *indexWriter) AddSeries(ref uint32, lset labels.Labels, chunks ...ChunkMeta) error { +func (w *indexWriter) AddSeries(ref uint64, lset labels.Labels, chunks ...ChunkMeta) error { if err := w.ensureStage(idxStageSeries); err != nil { return err } @@ -458,7 +457,10 @@ func (w *indexWriter) WritePostings(name, value string, it Postings) error { if !ok { return errors.Errorf("%p series for reference %d not found", w, it.At()) } - refs = append(refs, uint32(offset)) // XXX(fabxc): get uint64 vs uint32 sorted out. + if offset > (1<<32)-1 { + return errors.Errorf("series offset %d exceeds 4 bytes", offset) + } + refs = append(refs, uint32(offset)) } if err := it.Err(); err != nil { return err @@ -525,7 +527,7 @@ type IndexReader interface { // Series populates the given labels and chunk metas for the series identified // by the reference. - Series(ref uint32, lset *labels.Labels, chks *[]ChunkMeta) error + Series(ref uint64, lset *labels.Labels, chks *[]ChunkMeta) error // LabelIndices returns the label pairs for which indices exist. LabelIndices() ([][]string, error) @@ -741,7 +743,7 @@ func (r *indexReader) LabelIndices() ([][]string, error) { return res, nil } -func (r *indexReader) Series(ref uint32, lbls *labels.Labels, chks *[]ChunkMeta) error { +func (r *indexReader) Series(ref uint64, lbls *labels.Labels, chks *[]ChunkMeta) error { d1 := r.decbufAt(int(ref)) d2 := d1.decbuf(int(d1.uvarint())) diff --git a/vendor/github.com/prometheus/tsdb/pool.go b/vendor/github.com/prometheus/tsdb/pool.go new file mode 100644 index 0000000000..7d0f3f6f08 --- /dev/null +++ b/vendor/github.com/prometheus/tsdb/pool.go @@ -0,0 +1,79 @@ +package tsdb + +import "sync" + +type bucketPool struct { + buckets []sync.Pool + sizes []int + new func(sz int) interface{} +} + +func newBucketPool(minSize, maxSize int, factor float64, f func(sz int) interface{}) *bucketPool { + if minSize < 1 { + panic("invalid minimum pool size") + } + if maxSize < 1 { + panic("invalid maximum pool size") + } + if factor < 1 { + panic("invalid factor") + } + + var sizes []int + + for s := minSize; s <= maxSize; s = int(float64(s) * factor) { + sizes = append(sizes, s) + } + + p := &bucketPool{ + buckets: make([]sync.Pool, len(sizes)), + sizes: sizes, + new: f, + } + + return p +} + +func (p *bucketPool) get(sz int) interface{} { + for i, bktSize := range p.sizes { + if sz > bktSize { + continue + } + x := p.buckets[i].Get() + if x == nil { + x = p.new(sz) + } + return x + } + return p.new(sz) +} + +func (p *bucketPool) put(x interface{}, sz int) { + for i, bktSize := range p.sizes { + if sz > bktSize { + continue + } + p.buckets[i].Put(x) + return + } +} + +type poolUint64 struct { + p *bucketPool +} + +func newPoolUint64(minSize, maxSize int, factor float64) poolUint64 { + return poolUint64{ + p: newBucketPool(minSize, maxSize, factor, func(sz int) interface{} { + return make([]uint64, 0, sz) + }), + } +} + +func (p poolUint64) get(sz int) []uint64 { + return p.p.get(sz).([]uint64) +} + +func (p poolUint64) put(x []uint64) { + p.p.put(x[:0], cap(x)) +} diff --git a/vendor/github.com/prometheus/tsdb/postings.go b/vendor/github.com/prometheus/tsdb/postings.go index 
f2f1eb5b8f..97a29ab197 100644 --- a/vendor/github.com/prometheus/tsdb/postings.go +++ b/vendor/github.com/prometheus/tsdb/postings.go @@ -17,31 +17,47 @@ import ( "encoding/binary" "sort" "strings" + "sync" + + "github.com/prometheus/tsdb/labels" ) type memPostings struct { - m map[term][]uint32 + mtx sync.RWMutex + m map[labels.Label][]uint64 } -type term struct { - name, value string +func newMemPostings() *memPostings { + return &memPostings{ + m: make(map[labels.Label][]uint64, 512), + } } // Postings returns an iterator over the postings list for s. -func (p *memPostings) get(t term) Postings { - l := p.m[t] +func (p *memPostings) get(name, value string) Postings { + p.mtx.RLock() + l := p.m[labels.Label{Name: name, Value: value}] + p.mtx.RUnlock() + if l == nil { return emptyPostings } return newListPostings(l) } +var allLabel = labels.Label{} + // add adds a document to the index. The caller has to ensure that no // term argument appears twice. -func (p *memPostings) add(id uint32, terms ...term) { - for _, t := range terms { - p.m[t] = append(p.m[t], id) +func (p *memPostings) add(id uint64, lset labels.Labels) { + p.mtx.Lock() + + for _, l := range lset { + p.m[l] = append(p.m[l], id) } + p.m[allLabel] = append(p.m[allLabel], id) + + p.mtx.Unlock() } // Postings provides iterative access over a postings list. @@ -51,10 +67,10 @@ type Postings interface { // Seek advances the iterator to value v or greater and returns // true if a value was found. - Seek(v uint32) bool + Seek(v uint64) bool // At returns the value at the current iterator position. - At() uint32 + At() uint64 // Err returns the last error of the iterator. Err() error @@ -66,8 +82,8 @@ type errPostings struct { } func (e errPostings) Next() bool { return false } -func (e errPostings) Seek(uint32) bool { return false } -func (e errPostings) At() uint32 { return 0 } +func (e errPostings) Seek(uint64) bool { return false } +func (e errPostings) At() uint64 { return 0 } func (e errPostings) Err() error { return e.err } var emptyPostings = errPostings{} @@ -88,18 +104,18 @@ func Intersect(its ...Postings) Postings { type intersectPostings struct { a, b Postings aok, bok bool - cur uint32 + cur uint64 } func newIntersectPostings(a, b Postings) *intersectPostings { return &intersectPostings{a: a, b: b} } -func (it *intersectPostings) At() uint32 { +func (it *intersectPostings) At() uint64 { return it.cur } -func (it *intersectPostings) doNext(id uint32) bool { +func (it *intersectPostings) doNext(id uint64) bool { for { if !it.b.Seek(id) { return false @@ -125,7 +141,7 @@ func (it *intersectPostings) Next() bool { return it.doNext(it.a.At()) } -func (it *intersectPostings) Seek(id uint32) bool { +func (it *intersectPostings) Seek(id uint64) bool { if !it.a.Seek(id) { return false } @@ -155,14 +171,14 @@ type mergedPostings struct { a, b Postings initialized bool aok, bok bool - cur uint32 + cur uint64 } func newMergedPostings(a, b Postings) *mergedPostings { return &mergedPostings{a: a, b: b} } -func (it *mergedPostings) At() uint32 { +func (it *mergedPostings) At() uint64 { return it.cur } @@ -204,7 +220,7 @@ func (it *mergedPostings) Next() bool { return true } -func (it *mergedPostings) Seek(id uint32) bool { +func (it *mergedPostings) Seek(id uint64) bool { if it.cur >= id { return true } @@ -225,15 +241,15 @@ func (it *mergedPostings) Err() error { // listPostings implements the Postings interface over a plain list. 
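With postings now carried as uint64 IDs, the set operations are unchanged in spirit; a toy sketch using the list-backed implementation defined below (inputs must be sorted):

	a := newListPostings([]uint64{1, 2, 5, 9})
	b := newListPostings([]uint64{2, 5, 7})

	p := Intersect(a, b)
	for p.Next() {
		fmt.Println(p.At()) // prints 2, then 5
	}
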
 type listPostings struct {
-	list []uint32
-	cur  uint32
+	list []uint64
+	cur  uint64
 }
 
-func newListPostings(list []uint32) *listPostings {
+func newListPostings(list []uint64) *listPostings {
 	return &listPostings{list: list}
 }
 
-func (it *listPostings) At() uint32 {
+func (it *listPostings) At() uint64 {
 	return it.cur
 }
 
@@ -247,7 +263,7 @@ func (it *listPostings) Next() bool {
 	return false
 }
 
-func (it *listPostings) Seek(x uint32) bool {
+func (it *listPostings) Seek(x uint64) bool {
 	// If the current value satisfies, then return.
 	if it.cur >= x {
 		return true
@@ -281,8 +297,8 @@ func newBigEndianPostings(list []byte) *bigEndianPostings {
 	return &bigEndianPostings{list: list}
 }
 
-func (it *bigEndianPostings) At() uint32 {
-	return it.cur
+func (it *bigEndianPostings) At() uint64 {
+	return uint64(it.cur)
 }
 
 func (it *bigEndianPostings) Next() bool {
@@ -294,15 +310,15 @@ func (it *bigEndianPostings) Next() bool {
 	return false
 }
 
-func (it *bigEndianPostings) Seek(x uint32) bool {
-	if it.cur >= x {
+func (it *bigEndianPostings) Seek(x uint64) bool {
+	if uint64(it.cur) >= x {
 		return true
 	}
 
 	num := len(it.list) / 4
 	// Do binary search between current position and end.
 	i := sort.Search(num, func(i int) bool {
-		return binary.BigEndian.Uint32(it.list[i*4:]) >= x
+		return binary.BigEndian.Uint32(it.list[i*4:]) >= uint32(x)
 	})
 	if i < num {
 		j := i * 4
diff --git a/vendor/github.com/prometheus/tsdb/querier.go b/vendor/github.com/prometheus/tsdb/querier.go
index 8c2f6cbee2..5461fec89a 100644
--- a/vendor/github.com/prometheus/tsdb/querier.go
+++ b/vendor/github.com/prometheus/tsdb/querier.go
@@ -54,26 +54,6 @@ type querier struct {
 	blocks []Querier
 }
 
-// Querier returns a new querier over the data partition for the given time range.
-// A goroutine must not handle more than one open Querier.
-func (s *DB) Querier(mint, maxt int64) Querier {
-	s.mtx.RLock()
-
-	s.headmtx.RLock()
-	blocks := s.blocksForInterval(mint, maxt)
-	s.headmtx.RUnlock()
-
-	sq := &querier{
-		blocks: make([]Querier, 0, len(blocks)),
-		db:     s,
-	}
-	for _, b := range blocks {
-		sq.blocks = append(sq.blocks, b.Querier(mint, maxt))
-	}
-
-	return sq
-}
-
 func (q *querier) LabelValues(n string) ([]string, error) {
 	return q.lvals(q.blocks, n)
 }
@@ -128,6 +108,18 @@ func (q *querier) Close() error {
 	return merr.Err()
 }
 
+// NewBlockQuerier returns a querier against the given readers.
+func NewBlockQuerier(ir IndexReader, cr ChunkReader, tr TombstoneReader, mint, maxt int64) Querier {
+	return &blockQuerier{
+		index:      ir,
+		chunks:     cr,
+		tombstones: tr,
+
+		mint: mint,
+		maxt: maxt,
+	}
+}
+
 // blockQuerier provides querying access to a single block database.
 type blockQuerier struct {
 	index      IndexReader
@@ -348,6 +340,13 @@ type mergedSeriesSet struct {
 	adone, bdone bool
 }
 
+// NewMergedSeriesSet merges two series sets into a single series set. The input series sets
+// must be sorted and sequential in time, i.e. if they have the same label set,
+// the datapoints of a must be before the datapoints of b.
+func NewMergedSeriesSet(a, b SeriesSet) SeriesSet { + return newMergedSeriesSet(a, b) +} + func newMergedSeriesSet(a, b SeriesSet) *mergedSeriesSet { s := &mergedSeriesSet{a: a, b: b} // Initialize first elements of both sets as Next() needs @@ -403,7 +402,7 @@ func (s *mergedSeriesSet) Next() bool { type chunkSeriesSet interface { Next() bool - At() (labels.Labels, []ChunkMeta, intervals) + At() (labels.Labels, []ChunkMeta, Intervals) Err() error } @@ -417,11 +416,11 @@ type baseChunkSeries struct { lset labels.Labels chks []ChunkMeta - intervals intervals + intervals Intervals err error } -func (s *baseChunkSeries) At() (labels.Labels, []ChunkMeta, intervals) { +func (s *baseChunkSeries) At() (labels.Labels, []ChunkMeta, Intervals) { return s.lset, s.chks, s.intervals } @@ -455,7 +454,7 @@ Outer: // Only those chunks that are not entirely deleted. chks := make([]ChunkMeta, 0, len(s.chks)) for _, chk := range s.chks { - if !(interval{chk.MinTime, chk.MaxTime}.isSubrange(s.intervals)) { + if !(Interval{chk.MinTime, chk.MaxTime}.isSubrange(s.intervals)) { chks = append(chks, chk) } } @@ -482,10 +481,10 @@ type populatedChunkSeries struct { err error chks []ChunkMeta lset labels.Labels - intervals intervals + intervals Intervals } -func (s *populatedChunkSeries) At() (labels.Labels, []ChunkMeta, intervals) { +func (s *populatedChunkSeries) At() (labels.Labels, []ChunkMeta, Intervals) { return s.lset, s.chks, s.intervals } func (s *populatedChunkSeries) Err() error { return s.err } @@ -570,7 +569,7 @@ type chunkSeries struct { mint, maxt int64 - intervals intervals + intervals Intervals } func (s *chunkSeries) Labels() labels.Labels { @@ -676,11 +675,12 @@ type chunkSeriesIterator struct { maxt, mint int64 - intervals intervals + intervals Intervals } -func newChunkSeriesIterator(cs []ChunkMeta, dranges intervals, mint, maxt int64) *chunkSeriesIterator { +func newChunkSeriesIterator(cs []ChunkMeta, dranges Intervals, mint, maxt int64) *chunkSeriesIterator { it := cs[0].Chunk.Iterator() + if len(dranges) > 0 { it = &deletedIterator{it: it, intervals: dranges} } @@ -731,19 +731,22 @@ func (it *chunkSeriesIterator) At() (t int64, v float64) { } func (it *chunkSeriesIterator) Next() bool { - for it.cur.Next() { + if it.cur.Next() { t, _ := it.cur.At() - if t < it.mint { - return it.Seek(it.mint) - } + if t < it.mint { + if !it.Seek(it.mint) { + return false + } + t, _ = it.At() + + return t <= it.maxt + } if t > it.maxt { return false } - return true } - if err := it.cur.Err(); err != nil { return false } diff --git a/vendor/github.com/prometheus/tsdb/tombstones.go b/vendor/github.com/prometheus/tsdb/tombstones.go index 612b3029fe..7b24407b53 100644 --- a/vendor/github.com/prometheus/tsdb/tombstones.go +++ b/vendor/github.com/prometheus/tsdb/tombstones.go @@ -16,7 +16,6 @@ package tsdb import ( "encoding/binary" "fmt" - "hash/crc32" "io" "io/ioutil" "os" @@ -34,10 +33,15 @@ const ( tombstoneFormatV1 = 1 ) +// TombstoneReader is the iterator over tombstones. 
+type TombstoneReader interface { + Get(ref uint64) Intervals +} + func writeTombstoneFile(dir string, tr tombstoneReader) error { path := filepath.Join(dir, tombstoneFilename) tmp := path + ".tmp" - hash := crc32.New(crc32.MakeTable(crc32.Castagnoli)) + hash := newCRC32() f, err := os.Create(tmp) if err != nil { @@ -60,9 +64,9 @@ func writeTombstoneFile(dir string, tr tombstoneReader) error { for k, v := range tr { for _, itv := range v { buf.reset() - buf.putUvarint32(k) - buf.putVarint64(itv.mint) - buf.putVarint64(itv.maxt) + buf.putUvarint64(k) + buf.putVarint64(itv.Mint) + buf.putVarint64(itv.Maxt) _, err = mw.Write(buf.get()) if err != nil { @@ -82,13 +86,8 @@ func writeTombstoneFile(dir string, tr tombstoneReader) error { // Stone holds the information on the posting and time-range // that is deleted. type Stone struct { - ref uint32 - intervals intervals -} - -// TombstoneReader is the iterator over tombstones. -type TombstoneReader interface { - Get(ref uint32) intervals + ref uint64 + intervals Intervals } func readTombstones(dir string) (tombstoneReader, error) { @@ -114,7 +113,7 @@ func readTombstones(dir string) (tombstoneReader, error) { } // Verify checksum - hash := crc32.New(crc32.MakeTable(crc32.Castagnoli)) + hash := newCRC32() if _, err := hash.Write(d.get()); err != nil { return nil, errors.Wrap(err, "write to hash") } @@ -124,48 +123,49 @@ func readTombstones(dir string) (tombstoneReader, error) { stonesMap := newEmptyTombstoneReader() for d.len() > 0 { - k := d.uvarint32() + k := d.uvarint64() mint := d.varint64() maxt := d.varint64() if d.err() != nil { return nil, d.err() } - stonesMap.add(k, interval{mint, maxt}) + stonesMap.add(k, Interval{mint, maxt}) } return newTombstoneReader(stonesMap), nil } -type tombstoneReader map[uint32]intervals +type tombstoneReader map[uint64]Intervals -func newTombstoneReader(ts map[uint32]intervals) tombstoneReader { +func newTombstoneReader(ts map[uint64]Intervals) tombstoneReader { return tombstoneReader(ts) } func newEmptyTombstoneReader() tombstoneReader { - return tombstoneReader(make(map[uint32]intervals)) + return tombstoneReader(make(map[uint64]Intervals)) } -func (t tombstoneReader) Get(ref uint32) intervals { +func (t tombstoneReader) Get(ref uint64) Intervals { return t[ref] } -func (t tombstoneReader) add(ref uint32, itv interval) { +func (t tombstoneReader) add(ref uint64, itv Interval) { t[ref] = t[ref].add(itv) } -type interval struct { - mint, maxt int64 +// Interval represents a single time-interval. +type Interval struct { + Mint, Maxt int64 } -func (tr interval) inBounds(t int64) bool { - return t >= tr.mint && t <= tr.maxt +func (tr Interval) inBounds(t int64) bool { + return t >= tr.Mint && t <= tr.Maxt } -func (tr interval) isSubrange(dranges intervals) bool { +func (tr Interval) isSubrange(dranges Intervals) bool { for _, r := range dranges { - if r.inBounds(tr.mint) && r.inBounds(tr.maxt) { + if r.inBounds(tr.Mint) && r.inBounds(tr.Maxt) { return true } } @@ -173,43 +173,44 @@ func (tr interval) isSubrange(dranges intervals) bool { return false } -type intervals []interval +// Intervals represents a set of increasing and non-overlapping time-intervals. +type Intervals []Interval // This adds the new time-range to the existing ones. // The existing ones must be sorted. -func (itvs intervals) add(n interval) intervals { +func (itvs Intervals) add(n Interval) Intervals { for i, r := range itvs { // TODO(gouthamve): Make this codepath easier to digest. 
- if r.inBounds(n.mint-1) || r.inBounds(n.mint) { - if n.maxt > r.maxt { - itvs[i].maxt = n.maxt + if r.inBounds(n.Mint-1) || r.inBounds(n.Mint) { + if n.Maxt > r.Maxt { + itvs[i].Maxt = n.Maxt } j := 0 for _, r2 := range itvs[i+1:] { - if n.maxt < r2.mint { + if n.Maxt < r2.Mint { break } j++ } if j != 0 { - if itvs[i+j].maxt > n.maxt { - itvs[i].maxt = itvs[i+j].maxt + if itvs[i+j].Maxt > n.Maxt { + itvs[i].Maxt = itvs[i+j].Maxt } itvs = append(itvs[:i+1], itvs[i+j+1:]...) } return itvs } - if r.inBounds(n.maxt+1) || r.inBounds(n.maxt) { - if n.mint < r.maxt { - itvs[i].mint = n.mint + if r.inBounds(n.Maxt+1) || r.inBounds(n.Maxt) { + if n.Mint < r.Maxt { + itvs[i].Mint = n.Mint } return itvs } - if n.mint < r.mint { - newRange := make(intervals, i, len(itvs[:i])+1) + if n.Mint < r.Mint { + newRange := make(Intervals, i, len(itvs[:i])+1) copy(newRange, itvs[:i]) newRange = append(newRange, n) newRange = append(newRange, itvs[i:]...) diff --git a/vendor/github.com/prometheus/tsdb/wal.go b/vendor/github.com/prometheus/tsdb/wal.go index 50ddb6e346..1dadc8f2c3 100644 --- a/vendor/github.com/prometheus/tsdb/wal.go +++ b/vendor/github.com/prometheus/tsdb/wal.go @@ -16,11 +16,14 @@ package tsdb import ( "bufio" "encoding/binary" + "fmt" "hash" "hash/crc32" "io" "math" "os" + "path/filepath" + "sort" "sync" "time" @@ -53,50 +56,75 @@ const ( type SamplesCB func([]RefSample) error // SeriesCB is the callback after reading series. -type SeriesCB func([]labels.Labels) error +type SeriesCB func([]RefSeries) error // DeletesCB is the callback after reading deletes. type DeletesCB func([]Stone) error -// SegmentWAL is a write ahead log for series data. -type SegmentWAL struct { - mtx sync.Mutex - - dirFile *os.File - files []*os.File - - logger log.Logger - flushInterval time.Duration - segmentSize int64 - - crc32 hash.Hash32 - cur *bufio.Writer - curN int64 - - stopc chan struct{} - donec chan struct{} -} - // WAL is a write ahead log that can log new series labels and samples. // It must be completely read before new entries are logged. type WAL interface { Reader() WALReader - LogSeries([]labels.Labels) error + LogSeries([]RefSeries) error LogSamples([]RefSample) error LogDeletes([]Stone) error + Truncate(int64, Postings) error Close() error } +// NopWAL is a WAL that does nothing. +func NopWAL() WAL { + return nopWAL{} +} + +type nopWAL struct{} + +func (nopWAL) Read(SeriesCB, SamplesCB, DeletesCB) error { return nil } +func (w nopWAL) Reader() WALReader { return w } +func (nopWAL) LogSeries([]RefSeries) error { return nil } +func (nopWAL) LogSamples([]RefSample) error { return nil } +func (nopWAL) LogDeletes([]Stone) error { return nil } +func (nopWAL) Truncate(int64, Postings) error { return nil } +func (nopWAL) Close() error { return nil } + // WALReader reads entries from a WAL. type WALReader interface { Read(SeriesCB, SamplesCB, DeletesCB) error } +// RefSeries is the series labels with the series ID. +type RefSeries struct { + Ref uint64 + Labels labels.Labels + + // hash for the label set. This field is not generally populated. + hash uint64 +} + // RefSample is a timestamp/value pair associated with a reference to a series. type RefSample struct { Ref uint64 T int64 V float64 + + series *memSeries +} + +// segmentFile wraps a file object of a segment and tracks the highest timestamp +// it contains. During WAL truncating, all segments with no higher timestamp than +// the truncation threshold can be compacted. 
+type segmentFile struct {
+	*os.File
+	maxTime   int64  // highest tombstone or sample timestamp in segment
+	minSeries uint64 // lowest series ID in segment
+}
+
+func newSegmentFile(f *os.File) *segmentFile {
+	return &segmentFile{
+		File:      f,
+		maxTime:   math.MinInt64,
+		minSeries: math.MaxUint64,
+	}
+}
 
 const (
@@ -112,6 +140,32 @@ func init() {
 	castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
 }
 
+// newCRC32 initializes a CRC32 hash with a preconfigured polynomial, so the
+// polynomial may be easily changed in one location at a later time, if necessary.
+func newCRC32() hash.Hash32 {
+	return crc32.New(castagnoliTable)
+}
+
+// SegmentWAL is a write ahead log for series data.
+type SegmentWAL struct {
+	mtx sync.Mutex
+
+	dirFile *os.File
+	files   []*segmentFile
+
+	logger        log.Logger
+	flushInterval time.Duration
+	segmentSize   int64
+
+	crc32 hash.Hash32
+	cur   *bufio.Writer
+	curN  int64
+
+	stopc   chan struct{}
+	donec   chan struct{}
+	buffers sync.Pool
+}
+
 // OpenSegmentWAL opens or creates a write ahead log in the given directory.
 // The WAL must be read completely before new data is written.
 func OpenSegmentWAL(dir string, logger log.Logger, flushInterval time.Duration) (*SegmentWAL, error) {
@@ -133,157 +187,379 @@ func OpenSegmentWAL(dir string, logger log.Logger, flushInterval time.Duration)
 		donec:         make(chan struct{}),
 		stopc:         make(chan struct{}),
 		segmentSize:   walSegmentSizeBytes,
-		crc32:         crc32.New(castagnoliTable),
+		crc32:         newCRC32(),
 	}
-	if err := w.initSegments(); err != nil {
+
+	fns, err := sequenceFiles(w.dirFile.Name())
+	if err != nil {
 		return nil, err
 	}
+	for _, fn := range fns {
+		f, err := w.openSegmentFile(fn)
+		if err != nil {
+			return nil, err
+		}
+		w.files = append(w.files, newSegmentFile(f))
+	}
 
 	go w.run(flushInterval)
 
 	return w, nil
 }
 
+// repairingWALReader wraps a WAL reader and truncates its underlying SegmentWAL after the last
+// valid entry if it encounters corruption.
+type repairingWALReader struct {
+	wal *SegmentWAL
+	r   WALReader
+}
+
+func (r *repairingWALReader) Read(series SeriesCB, samples SamplesCB, deletes DeletesCB) error {
+	err := r.r.Read(series, samples, deletes)
+	if err == nil {
+		return nil
+	}
+	cerr, ok := err.(walCorruptionErr)
+	if !ok {
+		return err
+	}
+	return r.wal.truncate(cerr.err, cerr.file, cerr.lastOffset)
+}
+
+// truncate the WAL after the last valid entry.
+func (w *SegmentWAL) truncate(err error, file int, lastOffset int64) error {
+	w.logger.Log("msg", "WAL corruption detected; truncating",
+		"err", err, "file", w.files[file].Name(), "pos", lastOffset)
+
+	// Close and delete all files after the current one.
+	for _, f := range w.files[file+1:] {
+		if err := f.Close(); err != nil {
+			return err
+		}
+		if err := os.Remove(f.Name()); err != nil {
+			return err
+		}
+	}
+	w.mtx.Lock()
+	defer w.mtx.Unlock()
+
+	w.files = w.files[:file+1]
+
+	// Seek the current file to the last valid offset where we continue writing from.
+	_, err = w.files[file].Seek(lastOffset, os.SEEK_SET)
+	return err
+}
+
 // Reader returns a new reader over the write ahead log data.
 // It must be completely consumed before writing to the WAL.
 func (w *SegmentWAL) Reader() WALReader {
-	return newWALReader(w, w.logger)
+	return &repairingWALReader{
+		wal: w,
+		r:   newWALReader(w.files, w.logger),
+	}
 }
 
-// Log writes a batch of new series labels and samples to the log.
-//func (w *SegmentWAL) Log(series []labels.Labels, samples []RefSample) error {
-//return nil
-//}
+func (w *SegmentWAL) getBuffer() *encbuf {
+	b := w.buffers.Get()
+	if b == nil {
+		return &encbuf{b: make([]byte, 0, 64*1024)}
+	}
+	return b.(*encbuf)
+}
 
-// LogSeries writes a batch of new series labels to the log.
-func (w *SegmentWAL) LogSeries(series []labels.Labels) error {
-	if err := w.encodeSeries(series); err != nil {
+func (w *SegmentWAL) putBuffer(b *encbuf) {
+	b.reset()
+	w.buffers.Put(b)
+}
+
+// Truncate deletes the values prior to mint and the series entries not in p.
+func (w *SegmentWAL) Truncate(mint int64, p Postings) error {
+	// The last segment is always active.
+	if len(w.files) < 2 {
+		return nil
+	}
+	var candidates []*segmentFile
+
+	// All files have to be traversed as there could be two segments for a block
+	// with the first one having times (10000, 20000) and the second one having (0, 10000).
+	for _, sf := range w.files[:len(w.files)-1] {
+		if sf.maxTime >= mint {
+			break
+		}
+		// Past WAL files are closed. We have to reopen them for another read.
+		f, err := w.openSegmentFile(sf.Name())
+		if err != nil {
+			return errors.Wrap(err, "open old WAL segment for read")
+		}
+		candidates = append(candidates, &segmentFile{
+			File:      f,
+			minSeries: sf.minSeries,
+			maxTime:   sf.maxTime,
+		})
+	}
+	if len(candidates) == 0 {
+		return nil
+	}
+
+	r := newWALReader(candidates, w.logger)
+
+	// Create a new tmp file.
+	f, err := w.createSegmentFile(filepath.Join(w.dirFile.Name(), "compact.tmp"))
+	if err != nil {
+		return errors.Wrap(err, "create compaction segment")
+	}
+	var (
+		csf          = newSegmentFile(f)
+		crc32        = newCRC32()
+		activeSeries = []RefSeries{}
+	)
+
+Loop:
+	for r.next() {
+		rt, flag, byt := r.at()
+
+		if rt != WALEntrySeries {
+			continue
+		}
+		series, err := r.decodeSeries(flag, byt)
+		if err != nil {
+			return errors.Wrap(err, "decode series while truncating")
+		}
+		activeSeries = activeSeries[:0]
+
+		for _, s := range series {
+			if !p.Seek(s.Ref) {
+				break Loop
+			}
+			if p.At() == s.Ref {
+				activeSeries = append(activeSeries, s)
+			}
+		}
+
+		buf := w.getBuffer()
+		flag = w.encodeSeries(buf, activeSeries)
+
+		_, err = w.writeTo(csf, crc32, WALEntrySeries, flag, buf.get())
+		w.putBuffer(buf)
+
+		if err != nil {
+			return err
+		}
+	}
+	if r.Err() != nil {
+		return errors.Wrap(r.Err(), "read candidate WAL files")
+	}
+
+	off, err := csf.Seek(0, os.SEEK_CUR)
+	if err != nil {
+		return err
+	}
+	if err := csf.Truncate(off); err != nil {
+		return err
+	}
+	csf.Sync()
+	csf.Close()
+
+	if err := renameFile(csf.Name(), candidates[0].Name()); err != nil {
+		return err
+	}
+	for _, f := range candidates[1:] {
+		if err := os.RemoveAll(f.Name()); err != nil {
+			return errors.Wrap(err, "delete WAL segment file")
+		}
+		f.Close()
+	}
+	if err := w.dirFile.Sync(); err != nil {
 		return err
 	}
+	// The file object of csf still holds the name before rename. Recreate it so
+	// subsequent truncations do not look at a non-existent file name.
+	csf.File, err = w.openSegmentFile(candidates[0].Name())
+	if err != nil {
+		return err
+	}
+	// We don't need it to be open.
+	csf.Close()
+
+	w.mtx.Lock()
+	w.files = append([]*segmentFile{csf}, w.files[len(candidates):]...)
+	w.mtx.Unlock()
+
+	return nil
+}
+
+// LogSeries writes a batch of new series labels to the log.
+// The series have to be ordered.
+func (w *SegmentWAL) LogSeries(series []RefSeries) error {
+	buf := w.getBuffer()
+
+	flag := w.encodeSeries(buf, series)
+	err := w.write(WALEntrySeries, flag, buf.get())
+
+	w.putBuffer(buf)
+
+	if err != nil {
+		return errors.Wrap(err, "log series")
+	}
+
+	tf := w.head()
+
+	for _, s := range series {
+		if tf.minSeries > s.Ref {
+			tf.minSeries = s.Ref
+		}
+	}
+
 	if w.flushInterval <= 0 {
-		return w.Sync()
+		return errors.Wrap(w.Sync(), "sync")
 	}
 	return nil
 }
 
 // LogSamples writes a batch of new samples to the log.
 func (w *SegmentWAL) LogSamples(samples []RefSample) error {
-	if err := w.encodeSamples(samples); err != nil {
-		return err
+	buf := w.getBuffer()
+
+	flag := w.encodeSamples(buf, samples)
+	err := w.write(WALEntrySamples, flag, buf.get())
+
+	w.putBuffer(buf)
+
+	if err != nil {
+		return errors.Wrap(err, "log samples")
+	}
+	tf := w.head()
+
+	for _, s := range samples {
+		if tf.maxTime < s.T {
+			tf.maxTime = s.T
+		}
 	}
 
 	if w.flushInterval <= 0 {
-		return w.Sync()
+		return errors.Wrap(w.Sync(), "sync")
 	}
 	return nil
 }
 
 // LogDeletes writes a batch of new deletes to the log.
 func (w *SegmentWAL) LogDeletes(stones []Stone) error {
-	if err := w.encodeDeletes(stones); err != nil {
-		return err
+	buf := w.getBuffer()
+
+	flag := w.encodeDeletes(buf, stones)
+	err := w.write(WALEntryDeletes, flag, buf.get())
+
+	w.putBuffer(buf)
+
+	if err != nil {
+		return errors.Wrap(err, "log deletes")
+	}
+	tf := w.head()
+
+	for _, s := range stones {
+		for _, iv := range s.intervals {
+			if tf.maxTime < iv.Maxt {
+				tf.maxTime = iv.Maxt
+			}
+		}
+	}
 
 	if w.flushInterval <= 0 {
-		return w.Sync()
+		return errors.Wrap(w.Sync(), "sync")
 	}
 	return nil
 }
 
-// initSegments finds all existing segment files and opens them in the
-// appropriate file modes.
-func (w *SegmentWAL) initSegments() error {
-	fns, err := sequenceFiles(w.dirFile.Name(), "")
-	if err != nil {
-		return err
-	}
-	if len(fns) == 0 {
-		return nil
-	}
+// openSegmentFile opens the given segment file and consumes and validates its header.
+func (w *SegmentWAL) openSegmentFile(name string) (*os.File, error) {
 	// We must open all files in read/write mode as we may have to truncate along
-	// the way and any file may become the tail.
-	for _, fn := range fns {
-		f, err := os.OpenFile(fn, os.O_RDWR, 0666)
-		if err != nil {
-			return err
-		}
-		w.files = append(w.files, f)
+	// the way and any file may become the head.
+	f, err := os.OpenFile(name, os.O_RDWR, 0666)
+	if err != nil {
+		return nil, err
+	}
+	metab := make([]byte, 8)
+
+	if n, err := f.Read(metab); err != nil {
+		return nil, errors.Wrapf(err, "validate meta %q", f.Name())
+	} else if n != 8 {
+		return nil, errors.Errorf("invalid header size %d in %q", n, f.Name())
 	}
 
-	// Consume and validate meta headers.
- for _, f := range w.files { - metab := make([]byte, 8) - - if n, err := f.Read(metab); err != nil { - return errors.Wrapf(err, "validate meta %q", f.Name()) - } else if n != 8 { - return errors.Errorf("invalid header size %d in %q", n, f.Name()) - } - - if m := binary.BigEndian.Uint32(metab[:4]); m != WALMagic { - return errors.Errorf("invalid magic header %x in %q", m, f.Name()) - } - if metab[4] != WALFormatDefault { - return errors.Errorf("unknown WAL segment format %d in %q", metab[4], f.Name()) - } + if m := binary.BigEndian.Uint32(metab[:4]); m != WALMagic { + return nil, errors.Errorf("invalid magic header %x in %q", m, f.Name()) } - - return nil + if metab[4] != WALFormatDefault { + return nil, errors.Errorf("unknown WAL segment format %d in %q", metab[4], f.Name()) + } + return f, nil } -// cut finishes the currently active segments and opens the next one. -// The encoder is reset to point to the new segment. -func (w *SegmentWAL) cut() error { - // Sync current tail to disk and close. - if tf := w.tail(); tf != nil { - if err := w.sync(); err != nil { - return err - } - off, err := tf.Seek(0, os.SEEK_CUR) - if err != nil { - return err - } - if err := tf.Truncate(off); err != nil { - return err - } - if err := tf.Close(); err != nil { - return err - } - } - - p, _, err := nextSequenceFile(w.dirFile.Name(), "") +// createSegmentFile creates a new segment file with the given name. It preallocates +// the standard segment size if possible and writes the header. +func (w *SegmentWAL) createSegmentFile(name string) (*os.File, error) { + f, err := os.Create(name) if err != nil { - return err - } - f, err := os.Create(p) - if err != nil { - return err + return nil, err } if err = fileutil.Preallocate(f, w.segmentSize, true); err != nil { - return err + return nil, err } - if err = w.dirFile.Sync(); err != nil { - return err - } - // Write header metadata for new file. metab := make([]byte, 8) binary.BigEndian.PutUint32(metab[:4], WALMagic) metab[4] = WALFormatDefault if _, err := f.Write(metab); err != nil { + return nil, err + } + return f, err +} + +// cut finishes the currently active segments and opens the next one. +// The encoder is reset to point to the new segment. +func (w *SegmentWAL) cut() error { + // Sync current head to disk and close. + if hf := w.head(); hf != nil { + if err := w.sync(); err != nil { + return err + } + off, err := hf.Seek(0, os.SEEK_CUR) + if err != nil { + return err + } + if err := hf.Truncate(off); err != nil { + return err + } + if err := hf.Close(); err != nil { + return err + } + } + + p, _, err := nextSequenceFile(w.dirFile.Name()) + if err != nil { + return err + } + f, err := w.createSegmentFile(p) + if err != nil { return err } - w.files = append(w.files, f) - w.cur = bufio.NewWriterSize(f, 4*1024*1024) + if err = w.dirFile.Sync(); err != nil { + return err + } + + w.files = append(w.files, newSegmentFile(f)) + + // TODO(gouthamve): make the buffer size a constant. + w.cur = bufio.NewWriterSize(f, 8*1024*1024) w.curN = 8 return nil } -func (w *SegmentWAL) tail() *os.File { +func (w *SegmentWAL) head() *segmentFile { if len(w.files) == 0 { return nil } @@ -292,20 +568,40 @@ func (w *SegmentWAL) tail() *os.File { // Sync flushes the changes to disk. func (w *SegmentWAL) Sync() error { - w.mtx.Lock() - defer w.mtx.Unlock() + var head *segmentFile + var err error - return w.sync() + // Flush the writer and retrieve the reference to the head segment under mutex lock. 
+ func() { + w.mtx.Lock() + defer w.mtx.Unlock() + if err = w.flush(); err != nil { + return + } + head = w.head() + }() + if err != nil { + return errors.Wrap(err, "flush buffer") + } + if head != nil { + // But only fsync the head segment after releasing the mutex as it will block on disk I/O. + return fileutil.Fdatasync(head.File) + } + return nil } func (w *SegmentWAL) sync() error { + if err := w.flush(); err != nil { + return err + } + return fileutil.Fdatasync(w.head().File) +} + +func (w *SegmentWAL) flush() error { if w.cur == nil { return nil } - if err := w.cur.Flush(); err != nil { - return err - } - return fileutil.Fdatasync(w.tail()) + return w.cur.Flush() } func (w *SegmentWAL) run(interval time.Duration) { @@ -335,17 +631,16 @@ func (w *SegmentWAL) Close() error { close(w.stopc) <-w.donec - // Lock mutex and leave it locked so we panic if there's a bug causing - // the block to be used afterwards. w.mtx.Lock() + defer w.mtx.Unlock() if err := w.sync(); err != nil { return err } // On opening, a WAL must be fully consumed once. Afterwards // only the current segment will still be open. - if tf := w.tail(); tf != nil { - return errors.Wrapf(tf.Close(), "closing WAL tail %s", tf.Name()) + if hf := w.head(); hf != nil { + return errors.Wrapf(hf.Close(), "closing WAL head %s", hf.Name()) } return nil } @@ -359,15 +654,14 @@ const ( walPageBytes = 16 * minSectorSize ) -func (w *SegmentWAL) entry(et WALEntryType, flag byte, buf []byte) error { +func (w *SegmentWAL) write(t WALEntryType, flag uint8, buf []byte) error { w.mtx.Lock() defer w.mtx.Unlock() - // Cut to the next segment if the entry exceeds the file size unless it would also // exceed the size of a new segment. + // TODO(gouthamve): Add a test for this case where the commit is greater than segmentSize. var ( - // 6-byte header + 4-byte CRC32 + buf. - sz = int64(6 + 4 + len(buf)) + sz = int64(len(buf)) + 6 newsz = w.curN + sz ) // XXX(fabxc): this currently cuts a new file whenever the WAL was newly opened. 
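
Note on the hunks below: the write path is reworked around writeTo, but the on-disk entry framing of the WAL stays the same throughout this patch. Each entry is a 6-byte header carrying the entry type, a flag byte, and the payload length as a big-endian uint32, followed by the payload and a 4-byte CRC32 (Castagnoli polynomial) computed over header and payload. The following is a minimal standalone sketch of that framing, for illustration only; frameEntry and its names are invented for the example and are not part of the patch:

    package main

    import (
    	"bytes"
    	"encoding/binary"
    	"fmt"
    	"hash/crc32"
    )

    var castagnoli = crc32.MakeTable(crc32.Castagnoli)

    // frameEntry lays out one WAL entry: a 6-byte header (entry type, flag,
    // big-endian payload length), the payload, and a CRC32 over header+payload.
    func frameEntry(entryType, flag byte, payload []byte) []byte {
    	var head [6]byte
    	head[0] = entryType
    	head[1] = flag
    	binary.BigEndian.PutUint32(head[2:], uint32(len(payload)))

    	crc := crc32.New(castagnoli)
    	crc.Write(head[:]) // hash.Hash writes never return an error
    	crc.Write(payload)

    	var out bytes.Buffer
    	out.Write(head[:])
    	out.Write(payload)
    	out.Write(crc.Sum(nil)) // appends the 4 checksum bytes, MSB first
    	return out.Bytes()
    }

    func main() {
    	fmt.Printf("% x\n", frameEntry(2, 1, []byte("example payload")))
    }

Because the checksum covers the header as well as the payload, the reader further below can turn any length, type, or CRC mismatch into a walCorruptionErr and truncate the log after the last valid entry.
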
@@ -377,30 +671,37 @@ func (w *SegmentWAL) entry(et WALEntryType, flag byte, buf []byte) error { return err } } + n, err := w.writeTo(w.cur, w.crc32, t, flag, buf) - w.crc32.Reset() - wr := io.MultiWriter(w.crc32, w.cur) + w.curN += int64(n) - b := make([]byte, 6) - b[0] = byte(et) + return err +} + +func (w *SegmentWAL) writeTo(wr io.Writer, crc32 hash.Hash, t WALEntryType, flag uint8, buf []byte) (int, error) { + if len(buf) == 0 { + return 0, nil + } + crc32.Reset() + wr = io.MultiWriter(crc32, wr) + + var b [6]byte + b[0] = byte(t) b[1] = flag binary.BigEndian.PutUint32(b[2:], uint32(len(buf))) - if _, err := wr.Write(b); err != nil { - return err + n1, err := wr.Write(b[:]) + if err != nil { + return n1, err } - if _, err := wr.Write(buf); err != nil { - return err - } - if _, err := w.cur.Write(w.crc32.Sum(nil)); err != nil { - return err + n2, err := wr.Write(buf) + if err != nil { + return n1 + n2, err } + n3, err := wr.Write(crc32.Sum(b[:0])) - w.curN += sz - - putWALBuffer(buf) - return nil + return n1 + n2 + n3, err } const ( @@ -409,122 +710,77 @@ const ( walDeletesSimple = 1 ) -var walBuffers = sync.Pool{} +func (w *SegmentWAL) encodeSeries(buf *encbuf, series []RefSeries) uint8 { + for _, s := range series { + buf.putBE64(s.Ref) + buf.putUvarint(len(s.Labels)) -func getWALBuffer() []byte { - b := walBuffers.Get() - if b == nil { - return make([]byte, 0, 64*1024) - } - return b.([]byte) -} - -func putWALBuffer(b []byte) { - b = b[:0] - walBuffers.Put(b) -} - -func (w *SegmentWAL) encodeSeries(series []labels.Labels) error { - if len(series) == 0 { - return nil - } - - b := make([]byte, binary.MaxVarintLen32) - buf := getWALBuffer() - - for _, lset := range series { - n := binary.PutUvarint(b, uint64(len(lset))) - buf = append(buf, b[:n]...) - - for _, l := range lset { - n = binary.PutUvarint(b, uint64(len(l.Name))) - buf = append(buf, b[:n]...) - buf = append(buf, l.Name...) - - n = binary.PutUvarint(b, uint64(len(l.Value))) - buf = append(buf, b[:n]...) - buf = append(buf, l.Value...) + for _, l := range s.Labels { + buf.putUvarintStr(l.Name) + buf.putUvarintStr(l.Value) } } - - return w.entry(WALEntrySeries, walSeriesSimple, buf) + return walSeriesSimple } -func (w *SegmentWAL) encodeSamples(samples []RefSample) error { +func (w *SegmentWAL) encodeSamples(buf *encbuf, samples []RefSample) uint8 { if len(samples) == 0 { - return nil + return walSamplesSimple } - - b := make([]byte, binary.MaxVarintLen64) - buf := getWALBuffer() - // Store base timestamp and base reference number of first sample. // All samples encode their timestamp and ref as delta to those. // // TODO(fabxc): optimize for all samples having the same timestamp. first := samples[0] - binary.BigEndian.PutUint64(b, first.Ref) - buf = append(buf, b[:8]...) - binary.BigEndian.PutUint64(b, uint64(first.T)) - buf = append(buf, b[:8]...) + buf.putBE64(first.Ref) + buf.putBE64int64(first.T) for _, s := range samples { - n := binary.PutVarint(b, int64(s.Ref)-int64(first.Ref)) - buf = append(buf, b[:n]...) - - n = binary.PutVarint(b, s.T-first.T) - buf = append(buf, b[:n]...) - - binary.BigEndian.PutUint64(b, math.Float64bits(s.V)) - buf = append(buf, b[:8]...) 
+		buf.putVarint64(int64(s.Ref) - int64(first.Ref))
+		buf.putVarint64(s.T - first.T)
+		buf.putBE64(math.Float64bits(s.V))
 	}
-
-	return w.entry(WALEntrySamples, walSamplesSimple, buf)
+	return walSamplesSimple
 }
 
-func (w *SegmentWAL) encodeDeletes(stones []Stone) error {
-	b := make([]byte, 2*binary.MaxVarintLen64)
-	eb := &encbuf{b: b}
-	buf := getWALBuffer()
+func (w *SegmentWAL) encodeDeletes(buf *encbuf, stones []Stone) uint8 {
 	for _, s := range stones {
-		for _, itv := range s.intervals {
-			eb.reset()
-			eb.putUvarint32(s.ref)
-			eb.putVarint64(itv.mint)
-			eb.putVarint64(itv.maxt)
-			buf = append(buf, eb.get()...)
+		for _, iv := range s.intervals {
+			buf.putBE64(s.ref)
+			buf.putVarint64(iv.Mint)
+			buf.putVarint64(iv.Maxt)
 		}
 	}
-
-	return w.entry(WALEntryDeletes, walDeletesSimple, buf)
+	return walDeletesSimple
 }
 
 // walReader decodes and emits write ahead log entries.
 type walReader struct {
 	logger log.Logger
 
-	wal *SegmentWAL
+	files []*segmentFile
 	cur   int
 	buf   []byte
 	crc32 hash.Hash32
 
-	curType WALEntryType
-	curFlag byte
-	curBuf  []byte
+	curType    WALEntryType
+	curFlag    byte
+	curBuf     []byte
+	lastOffset int64 // offset after last successfully read entry
 
 	err error
 }
 
-func newWALReader(w *SegmentWAL, l log.Logger) *walReader {
+func newWALReader(files []*segmentFile, l log.Logger) *walReader {
 	if l == nil {
 		l = log.NewNopLogger()
 	}
 	return &walReader{
 		logger: l,
-		wal:    w,
+		files:  files,
 		buf:    make([]byte, 0, 128*4096),
-		crc32:  crc32.New(crc32.MakeTable(crc32.Castagnoli)),
+		crc32:  newCRC32(),
 	}
 }
 
@@ -534,29 +790,69 @@ func (r *walReader) Err() error {
 }
 
 func (r *walReader) Read(seriesf SeriesCB, samplesf SamplesCB, deletesf DeletesCB) error {
+	if seriesf == nil {
+		seriesf = func([]RefSeries) error { return nil }
+	}
+	if samplesf == nil {
+		samplesf = func([]RefSample) error { return nil }
+	}
+	if deletesf == nil {
+		deletesf = func([]Stone) error { return nil }
+	}
+
 	for r.next() {
 		et, flag, b := r.at()
 		// In decoding below we never return a walCorruptionErr for now.
 		// Those should generally be caught by entry decoding before.
 		switch et {
 		case WALEntrySeries:
-			s, err := r.decodeSeries(flag, b)
+			series, err := r.decodeSeries(flag, b)
 			if err != nil {
-				return err
+				return errors.Wrap(err, "decode series entry")
 			}
-			seriesf(s)
+			seriesf(series)
+
+			cf := r.current()
+
+			for _, s := range series {
+				if cf.minSeries > s.Ref {
+					cf.minSeries = s.Ref
+				}
+			}
+
 		case WALEntrySamples:
-			s, err := r.decodeSamples(flag, b)
+			samples, err := r.decodeSamples(flag, b)
 			if err != nil {
-				return err
+				return errors.Wrap(err, "decode samples entry")
 			}
-			samplesf(s)
+			samplesf(samples)
+
+			// Update the times for the WAL segment file.
+			cf := r.current()
+
+			for _, s := range samples {
+				if cf.maxTime < s.T {
+					cf.maxTime = s.T
+				}
+			}
+
 		case WALEntryDeletes:
-			s, err := r.decodeDeletes(flag, b)
+			stones, err := r.decodeDeletes(flag, b)
 			if err != nil {
-				return err
+				return errors.Wrap(err, "decode delete entry")
+			}
+			deletesf(stones)
+
+			// Update the times for the WAL segment file.
+			cf := r.current()
+
+			for _, s := range stones {
+				for _, iv := range s.intervals {
+					if cf.maxTime < iv.Maxt {
+						cf.maxTime = iv.Maxt
+					}
+				}
 			}
-			deletesf(s)
 		}
 	}
 
@@ -565,20 +861,17 @@ func (r *walReader) Read(seriesf SeriesCB, samplesf SamplesCB, deletesf DeletesC
 
 // nextEntry retrieves the next entry. It is also used as a testing hook.
 func (r *walReader) nextEntry() (WALEntryType, byte, []byte, error) {
-	if r.cur >= len(r.wal.files) {
+	if r.cur >= len(r.files) {
 		return 0, 0, nil, io.EOF
 	}
-	cf := r.wal.files[r.cur]
+	cf := r.current()
 
 	et, flag, b, err := r.entry(cf)
-	// If we reached the end of the reader, advance to the next one
-	// and close.
+	// If we reached the end of the reader, advance to the next one.
 	// Do not close on the last one as it will still be appended to.
-	if err == io.EOF && r.cur < len(r.wal.files)-1 {
-		// Current reader completed, close and move to the next one.
-		if err := cf.Close(); err != nil {
-			return 0, 0, nil, err
-		}
+	if err == io.EOF && r.cur < len(r.files)-1 {
+		// Current reader completed. Leave the file open for later reads
+		// for truncating.
 		r.cur++
 		return r.nextEntry()
 	}
@@ -592,15 +885,15 @@ func (r *walReader) at() (WALEntryType, byte, []byte) {
 
 // next decodes the next entry pair and returns true
 // if it was successful.
 func (r *walReader) next() bool {
-	if r.cur >= len(r.wal.files) {
+	if r.cur >= len(r.files) {
 		return false
 	}
-	cf := r.wal.files[r.cur]
+	cf := r.files[r.cur]
 
-	// Save position after last valid entry if we have to truncate the WAL.
-	lastOffset, err := cf.Seek(0, os.SEEK_CUR)
-	if err != nil {
-		r.err = err
+	// Remember the offset after the last correctly read entry. If the next one
+	// is corrupted, this is where we can safely truncate.
+	r.lastOffset, r.err = cf.Seek(0, os.SEEK_CUR)
+	if r.err != nil {
 		return false
 	}
@@ -609,7 +902,7 @@
 	// and close.
 	// Do not close on the last one as it will still be appended to.
 	if err == io.EOF {
-		if r.cur == len(r.wal.files)-1 {
+		if r.cur == len(r.files)-1 {
 			return false
 		}
 		// Current reader completed, close and move to the next one.
@@ -622,10 +915,6 @@
 	}
 	if err != nil {
 		r.err = err
-
-		if _, ok := err.(walCorruptionErr); ok {
-			r.err = r.truncate(lastOffset)
-		}
 		return false
 	}
@@ -635,37 +924,28 @@
 	return r.err == nil
 }
 
-func (r *walReader) current() *os.File {
-	return r.wal.files[r.cur]
-}
-
-// truncate the WAL after the last valid entry.
-func (r *walReader) truncate(lastOffset int64) error {
-	r.logger.Log("msg", "WAL corruption detected; truncating",
-		"err", r.err, "file", r.current().Name(), "pos", lastOffset)
-
-	// Close and delete all files after the current one.
-	for _, f := range r.wal.files[r.cur+1:] {
-		if err := f.Close(); err != nil {
-			return err
-		}
-		if err := os.Remove(f.Name()); err != nil {
-			return err
-		}
-	}
-	r.wal.files = r.wal.files[:r.cur+1]
-
-	// Seek the current file to the last valid offset where we continue writing from.
-	_, err := r.current().Seek(lastOffset, os.SEEK_SET)
-	return err
+func (r *walReader) current() *segmentFile {
+	return r.files[r.cur]
 }
 
 // walCorruptionErr is a type wrapper for errors that indicate WAL corruption
 // and trigger a truncation.
-type walCorruptionErr error
+type walCorruptionErr struct {
+	err        error
+	file       int
+	lastOffset int64
+}
 
-func walCorruptionErrf(s string, args ...interface{}) error {
-	return walCorruptionErr(errors.Errorf(s, args...))
+func (e walCorruptionErr) Error() string {
+	return fmt.Sprintf("%s <file: %d, last offset: %d>", e.err, e.file, e.lastOffset)
+}
+
+func (r *walReader) corruptionErr(s string, args ...interface{}) error {
+	return walCorruptionErr{
+		err:        errors.Errorf(s, args...),
+		file:       r.cur,
+		lastOffset: r.lastOffset,
	}
 }
 
 func (r *walReader) entry(cr io.Reader) (WALEntryType, byte, []byte, error) {
@@ -676,7 +956,7 @@ func (r *walReader) entry(cr io.Reader) (WALEntryType, byte, []byte, error) {
 	if n, err := tr.Read(b); err != nil {
 		return 0, 0, nil, err
 	} else if n != 6 {
-		return 0, 0, nil, walCorruptionErrf("invalid entry header size %d", n)
+		return 0, 0, nil, r.corruptionErr("invalid entry header size %d", n)
 	}
 
 	var (
@@ -689,7 +969,7 @@
 		return 0, 0, nil, io.EOF
 	}
 	if etype != WALEntrySeries && etype != WALEntrySamples && etype != WALEntryDeletes {
-		return 0, 0, nil, walCorruptionErrf("invalid entry type %d", etype)
+		return 0, 0, nil, r.corruptionErr("invalid entry type %d", etype)
 	}
 
 	if length > len(r.buf) {
@@ -700,107 +980,100 @@
 	if n, err := tr.Read(buf); err != nil {
 		return 0, 0, nil, err
 	} else if n != length {
-		return 0, 0, nil, walCorruptionErrf("invalid entry body size %d", n)
+		return 0, 0, nil, r.corruptionErr("invalid entry body size %d", n)
 	}
 
 	if n, err := cr.Read(b[:4]); err != nil {
 		return 0, 0, nil, err
 	} else if n != 4 {
-		return 0, 0, nil, walCorruptionErrf("invalid checksum length %d", n)
+		return 0, 0, nil, r.corruptionErr("invalid checksum length %d", n)
 	}
 	if exp, has := binary.BigEndian.Uint32(b[:4]), r.crc32.Sum32(); has != exp {
-		return 0, 0, nil, walCorruptionErrf("unexpected CRC32 checksum %x, want %x", has, exp)
+		return 0, 0, nil, r.corruptionErr("unexpected CRC32 checksum %x, want %x", has, exp)
 	}
 
 	return etype, flag, buf, nil
 }
 
-func (r *walReader) decodeSeries(flag byte, b []byte) ([]labels.Labels, error) {
-	series := []labels.Labels{}
-	for len(b) > 0 {
-		l, n := binary.Uvarint(b)
-		if n < 1 {
-			return nil, errors.Wrap(errInvalidSize, "number of labels")
-		}
-		b = b[n:]
-
-		lset := make(labels.Labels, l)
-
-		for i := 0; i < int(l); i++ {
-			nl, n := binary.Uvarint(b)
-			if n < 1 || len(b) < n+int(nl) {
-				return nil, errors.Wrap(errInvalidSize, "label name")
-			}
-			lset[i].Name = string(b[n : n+int(nl)])
-			b = b[n+int(nl):]
-
-			vl, n := binary.Uvarint(b)
-			if n < 1 || len(b) < n+int(vl) {
-				return nil, errors.Wrap(errInvalidSize, "label value")
-			}
-			lset[i].Value = string(b[n : n+int(vl)])
-			b = b[n+int(vl):]
-		}
-
-		series = append(series, lset)
+func (r *walReader) decodeSeries(flag byte, b []byte) ([]RefSeries, error) {
+	series := []RefSeries{}
+	dec := decbuf{b: b}
+
+	for len(dec.b) > 0 && dec.err() == nil {
+		ref := dec.be64()
+
+		lset := make(labels.Labels, dec.uvarint())
+
+		for i := range lset {
+			lset[i].Name = dec.uvarintStr()
+			lset[i].Value = dec.uvarintStr()
+		}
+		sort.Sort(lset)
+
+		series = append(series, RefSeries{
+			Ref:    ref,
+			Labels: lset,
+		})
+	}
+	if dec.err() != nil {
+		return nil, dec.err()
+	}
+	if len(dec.b) > 0 {
+		return series, errors.Errorf("unexpected %d bytes left in entry", len(dec.b))
 	}
 	return series, nil
 }
 
 func (r *walReader) decodeSamples(flag byte, b []byte) ([]RefSample, error) {
-	samples := 
[]RefSample{} - - if len(b) < 16 { - return nil, errors.Wrap(errInvalidSize, "header length") + if len(b) == 0 { + return nil, nil } + samples := []RefSample{} + dec := decbuf{b: b} + var ( - baseRef = binary.BigEndian.Uint64(b) - baseTime = int64(binary.BigEndian.Uint64(b[8:])) + baseRef = dec.be64() + baseTime = dec.be64int64() ) - b = b[16:] - for len(b) > 0 { - var smpl RefSample + for len(dec.b) > 0 && dec.err() == nil { + dref := dec.varint64() + dtime := dec.varint64() + val := dec.be64() - dref, n := binary.Varint(b) - if n < 1 { - return nil, errors.Wrap(errInvalidSize, "sample ref delta") - } - b = b[n:] + samples = append(samples, RefSample{ + Ref: uint64(int64(baseRef) + dref), + T: baseTime + dtime, + V: math.Float64frombits(val), + }) + } - smpl.Ref = uint64(int64(baseRef) + dref) - - dtime, n := binary.Varint(b) - if n < 1 { - return nil, errors.Wrap(errInvalidSize, "sample timestamp delta") - } - b = b[n:] - smpl.T = baseTime + dtime - - if len(b) < 8 { - return nil, errors.Wrapf(errInvalidSize, "sample value bits %d", len(b)) - } - smpl.V = float64(math.Float64frombits(binary.BigEndian.Uint64(b))) - b = b[8:] - - samples = append(samples, smpl) + if dec.err() != nil { + return nil, errors.Wrapf(dec.err(), "decode error after %d samples", len(samples)) + } + if len(dec.b) > 0 { + return samples, errors.Errorf("unexpected %d bytes left in entry", len(dec.b)) } return samples, nil } func (r *walReader) decodeDeletes(flag byte, b []byte) ([]Stone, error) { - db := &decbuf{b: b} - stones := []Stone{} + dec := &decbuf{b: b} + var stones []Stone - for db.len() > 0 { - var s Stone - s.ref = db.uvarint32() - s.intervals = intervals{{db.varint64(), db.varint64()}} - if db.err() != nil { - return nil, db.err() - } - - stones = append(stones, s) + for dec.len() > 0 && dec.err() == nil { + stones = append(stones, Stone{ + ref: dec.be64(), + intervals: Intervals{ + {Mint: dec.varint64(), Maxt: dec.varint64()}, + }, + }) + } + if dec.err() != nil { + return nil, dec.err() + } + if len(dec.b) > 0 { + return stones, errors.Errorf("unexpected %d bytes left in entry", len(dec.b)) } - return stones, nil } diff --git a/vendor/vendor.json b/vendor/vendor.json index ed23fd4e55..bb499c8b40 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -859,22 +859,22 @@ "revisionTime": "2016-04-11T19:08:41Z" }, { - "checksumSHA1": "WvgmP/a6PVjj33/h8L7XrNUmoQE=", + "checksumSHA1": "AoNkGFKIyLNi4a/QcO8p5D7xIXs=", "path": "github.com/prometheus/tsdb", - "revision": "c4ca881685ae1266a75caf57da46d8b6934213c0", - "revisionTime": "2017-08-18T07:54:27Z" + "revision": "0db4c227b72145418ad4c1fbda8fdb87bfe77a02", + "revisionTime": "2017-09-07T11:04:02Z" }, { "checksumSHA1": "Gua979gmISm4cJP/fR2hL8m5To8=", "path": "github.com/prometheus/tsdb/chunks", - "revision": "c4ca881685ae1266a75caf57da46d8b6934213c0", - "revisionTime": "2017-08-18T07:54:27Z" + "revision": "0db4c227b72145418ad4c1fbda8fdb87bfe77a02", + "revisionTime": "2017-09-07T11:04:02Z" }, { "checksumSHA1": "zhmlvc322RH1L3l9DaA9d/HVVWs=", "path": "github.com/prometheus/tsdb/labels", - "revision": "c4ca881685ae1266a75caf57da46d8b6934213c0", - "revisionTime": "2017-08-18T07:54:27Z" + "revision": "0db4c227b72145418ad4c1fbda8fdb87bfe77a02", + "revisionTime": "2017-09-07T11:04:02Z" }, { "checksumSHA1": "5SYLEhADhdBVZAGPVHWggQl7H8k=",