diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index a7ebf518f2..bf2e78a956 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -164,6 +164,10 @@ func main() { "Maximum duration compacted blocks may span. For use in testing. (Defaults to 10% of the retention period)."). Hidden().PlaceHolder("").SetValue(&cfg.tsdb.MaxBlockDuration) + a.Flag("storage.tsdb.wal-segment-size", + "Size at which to split the tsdb WAL segment files (e.g. 100MB)"). + Hidden().PlaceHolder("").BytesVar(&cfg.tsdb.WALSegmentSize) + a.Flag("storage.tsdb.retention", "How long to retain samples in storage."). Default("15d").SetValue(&cfg.tsdb.Retention) @@ -560,6 +564,11 @@ func main() { g.Add( func() error { level.Info(logger).Log("msg", "Starting TSDB ...") + if cfg.tsdb.WALSegmentSize != 0 { + if cfg.tsdb.WALSegmentSize < 10*1024*1024 || cfg.tsdb.WALSegmentSize > 256*1024*1024 { + return errors.New("flag 'storage.tsdb.wal-segment-size' must be set between 10MB and 256MB") + } + } db, err := tsdb.Open( cfg.localStoragePath, log.With(logger, "component", "tsdb"), diff --git a/cmd/prometheus/main_test.go b/cmd/prometheus/main_test.go index c6697a101f..d0bd128b0d 100644 --- a/cmd/prometheus/main_test.go +++ b/cmd/prometheus/main_test.go @@ -247,3 +247,36 @@ func TestSendAlerts(t *testing.T) { }) } } + +func TestWALSegmentSizeBounds(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + for size, expectedExitStatus := range map[string]int{"9MB": 1, "257MB": 1, "10": 2, "1GB": 1, "12MB": 0} { + prom := exec.Command(promPath, "--storage.tsdb.wal-segment-size="+size, "--config.file="+promConfig) + err := prom.Start() + testutil.Ok(t, err) + + if expectedExitStatus == 0 { + done := make(chan error, 1) + go func() { done <- prom.Wait() }() + select { + case err := <-done: + t.Errorf("prometheus should be still running: %v", err) + case <-time.After(5 * time.Second): + prom.Process.Signal(os.Interrupt) + } + continue + } + + err = prom.Wait() + testutil.NotOk(t, err, "") + if exitError, ok := err.(*exec.ExitError); ok { + status := exitError.Sys().(syscall.WaitStatus) + testutil.Equals(t, expectedExitStatus, status.ExitStatus()) + } else { + t.Errorf("unable to retrieve the exit status for prometheus: %v", err) + } + } +} diff --git a/go.mod b/go.mod index 33eed11df0..2a1dd7a6a0 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ require ( github.com/Azure/go-autorest v10.8.1+incompatible github.com/StackExchange/wmi v0.0.0-20180725035823-b12b22c5341f // indirect github.com/VividCortex/ewma v1.1.1 // indirect + github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da // indirect github.com/aws/aws-sdk-go v0.0.0-20180507225419-00862f899353 github.com/biogo/store v0.0.0-20160505134755-913427a1d5e8 // indirect @@ -88,7 +89,7 @@ require ( github.com/prometheus/client_golang v0.9.1 github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910 github.com/prometheus/common v0.0.0-20181119215939-b36ad289a3ea - github.com/prometheus/tsdb v0.3.1 + github.com/prometheus/tsdb v0.3.2-0.20181219094047-6d489a1004dc github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a // indirect github.com/rlmcpherson/s3gof3r v0.5.0 // indirect github.com/rubyist/circuitbreaker v2.2.1+incompatible // indirect diff --git a/go.sum b/go.sum index 6120d79621..2e390ec09b 100644 --- a/go.sum +++ b/go.sum @@ -209,8 +209,8 @@ github.com/prometheus/common v0.0.0-20181119215939-b36ad289a3ea h1:4RkbEb5XX0Wvu github.com/prometheus/common v0.0.0-20181119215939-b36ad289a3ea/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d h1:GoAlyOgbOEIFdaDqxJVlbOQ1DtGmZWs/Qau0hIlk+WQ= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= -github.com/prometheus/tsdb v0.3.1 h1:uGgfubT2MesNpx3T46c5R32RcUoKAPGyWX+4x1orJLE= -github.com/prometheus/tsdb v0.3.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= +github.com/prometheus/tsdb v0.3.2-0.20181219094047-6d489a1004dc h1:phU3kj067sczIc4fhaq5rRcH4Lp9A45MsrcQqjC+cao= +github.com/prometheus/tsdb v0.3.2-0.20181219094047-6d489a1004dc/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a h1:9ZKAASQSHhDYGoxY8uLVpewe1GDZ2vu2Tr/vTdVAkFQ= github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= github.com/rlmcpherson/s3gof3r v0.5.0 h1:1izOJpTiohSibfOHuNyEA/yQnAirh05enzEdmhez43k= diff --git a/storage/tsdb/tsdb.go b/storage/tsdb/tsdb.go index 8dce7d512b..f1750f6515 100644 --- a/storage/tsdb/tsdb.go +++ b/storage/tsdb/tsdb.go @@ -19,6 +19,7 @@ import ( "time" "unsafe" + "github.com/alecthomas/units" "github.com/go-kit/kit/log" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" @@ -117,6 +118,9 @@ type Options struct { // The maximum timestamp range of compacted blocks. MaxBlockDuration model.Duration + // The maximum size of each WAL segment file. + WALSegmentSize units.Base2Bytes + // Duration for how long to retain data. Retention model.Duration @@ -182,6 +186,7 @@ func Open(path string, l log.Logger, r prometheus.Registerer, opts *Options) (*t db, err := tsdb.Open(path, l, r, &tsdb.Options{ WALFlushInterval: 10 * time.Second, + WALSegmentSize: int(opts.WALSegmentSize), RetentionDuration: uint64(time.Duration(opts.Retention).Seconds() * 1000), BlockRanges: rngs, NoLockfile: opts.NoLockfile, diff --git a/vendor/github.com/prometheus/tsdb/CHANGELOG.md b/vendor/github.com/prometheus/tsdb/CHANGELOG.md index 1f2abce09e..8e3c01f2c3 100644 --- a/vendor/github.com/prometheus/tsdb/CHANGELOG.md +++ b/vendor/github.com/prometheus/tsdb/CHANGELOG.md @@ -1,11 +1,10 @@ ## master / unreleased - + - [CHANGE] New `WALSegmentSize` option to override the `DefaultOptions.WALSegmentSize`. Added to allow using smaller wal files. For example using tmpfs on a RPI to minimise the SD card wear out from the constant WAL writes. As part of this change the `DefaultOptions.WALSegmentSize` constant was also exposed. ## 0.3.1 - [BUGFIX] Fixed most windows test and some actual bugs for unclosed file readers. ## 0.3.0 - - [CHANGE] `LastCheckpoint()` used to return just the segment name and now it returns the full relative path. - [CHANGE] `NewSegmentsRangeReader()` can now read over miltiple wal ranges by using the new `SegmentRange{}` struct. - [CHANGE] `CorruptionErr{}` now also exposes the Segment `Dir` which is added when displaying any errors. diff --git a/vendor/github.com/prometheus/tsdb/db.go b/vendor/github.com/prometheus/tsdb/db.go index 3a47f0bf42..e5a057cbae 100644 --- a/vendor/github.com/prometheus/tsdb/db.go +++ b/vendor/github.com/prometheus/tsdb/db.go @@ -45,6 +45,7 @@ import ( // millisecond precision timestamps. var DefaultOptions = &Options{ WALFlushInterval: 5 * time.Second, + WALSegmentSize: wal.DefaultSegmentSize, RetentionDuration: 15 * 24 * 60 * 60 * 1000, // 15 days in milliseconds BlockRanges: ExponentialBlockRanges(int64(2*time.Hour)/1e6, 3, 5), NoLockfile: false, @@ -55,6 +56,9 @@ type Options struct { // The interval at which the write ahead log is flushed to disk. WALFlushInterval time.Duration + // Segments (wal files) max size + WALSegmentSize int + // Duration of persisted data to keep. RetentionDuration uint64 @@ -263,7 +267,11 @@ func Open(dir string, l log.Logger, r prometheus.Registerer, opts *Options) (db return nil, errors.Wrap(err, "create leveled compactor") } - wlog, err := wal.New(l, r, filepath.Join(dir, "wal")) + segmentSize := wal.DefaultSegmentSize + if opts.WALSegmentSize > 0 { + segmentSize = opts.WALSegmentSize + } + wlog, err := wal.NewSize(l, r, filepath.Join(dir, "wal"), segmentSize) if err != nil { return nil, err } diff --git a/vendor/github.com/prometheus/tsdb/head.go b/vendor/github.com/prometheus/tsdb/head.go index 4d917291a4..eba1f2dc43 100644 --- a/vendor/github.com/prometheus/tsdb/head.go +++ b/vendor/github.com/prometheus/tsdb/head.go @@ -89,6 +89,7 @@ type headMetrics struct { maxTime prometheus.GaugeFunc samplesAppended prometheus.Counter walTruncateDuration prometheus.Summary + walCorruptionsTotal prometheus.Counter headTruncateFail prometheus.Counter headTruncateTotal prometheus.Counter checkpointDeleteFail prometheus.Counter @@ -152,6 +153,10 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { Name: "prometheus_tsdb_wal_truncate_duration_seconds", Help: "Duration of WAL truncation.", }) + m.walCorruptionsTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_corruptions_total", + Help: "Total number of WAL corruptions.", + }) m.samplesAppended = prometheus.NewCounter(prometheus.CounterOpts{ Name: "prometheus_tsdb_head_samples_appended_total", Help: "Total number of appended samples.", @@ -195,6 +200,7 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { m.maxTime, m.gcDuration, m.walTruncateDuration, + m.walCorruptionsTotal, m.samplesAppended, m.headTruncateFail, m.headTruncateTotal, @@ -480,10 +486,10 @@ func (h *Head) Init(minValidTime int64) error { return nil } level.Warn(h.logger).Log("msg", "encountered WAL error, attempting repair", "err", err) + h.metrics.walCorruptionsTotal.Inc() if err := h.wal.Repair(err); err != nil { return errors.Wrap(err, "repair corrupted WAL") } - return nil } @@ -500,7 +506,7 @@ func (h *Head) Truncate(mint int64) (err error) { return nil } atomic.StoreInt64(&h.minTime, mint) - h.minValidTime = mint + atomic.StoreInt64(&h.minValidTime, mint) // Ensure that max time is at least as high as min time. for h.MaxTime() < mint { @@ -656,7 +662,7 @@ func (h *Head) appender() *headAppender { head: h, // Set the minimum valid time to whichever is greater the head min valid time or the compaciton window. // This ensures that no samples will be added within the compaction window to avoid races. - minValidTime: max(h.minValidTime, h.MaxTime()-h.chunkRange/2), + minValidTime: max(atomic.LoadInt64(&h.minValidTime), h.MaxTime()-h.chunkRange/2), mint: math.MaxInt64, maxt: math.MinInt64, samples: h.getAppendBuffer(), diff --git a/vendor/github.com/prometheus/tsdb/wal/wal.go b/vendor/github.com/prometheus/tsdb/wal/wal.go index 5134850fe1..92374e3121 100644 --- a/vendor/github.com/prometheus/tsdb/wal/wal.go +++ b/vendor/github.com/prometheus/tsdb/wal/wal.go @@ -35,7 +35,7 @@ import ( ) const ( - defaultSegmentSize = 128 * 1024 * 1024 // 128 MB + DefaultSegmentSize = 128 * 1024 * 1024 // 128 MB pageSize = 32 * 1024 // 32KB recordHeaderSize = 7 ) @@ -174,7 +174,7 @@ type WAL struct { // New returns a new WAL over the given directory. func New(logger log.Logger, reg prometheus.Registerer, dir string) (*WAL, error) { - return NewSize(logger, reg, dir, defaultSegmentSize) + return NewSize(logger, reg, dir, DefaultSegmentSize) } // NewSize returns a new WAL over the given directory. diff --git a/vendor/modules.txt b/vendor/modules.txt index 7abd212992..98bf919eca 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -187,7 +187,7 @@ github.com/prometheus/procfs github.com/prometheus/procfs/nfs github.com/prometheus/procfs/xfs github.com/prometheus/procfs/internal/util -# github.com/prometheus/tsdb v0.3.1 +# github.com/prometheus/tsdb v0.3.2-0.20181219094047-6d489a1004dc github.com/prometheus/tsdb github.com/prometheus/tsdb/labels github.com/prometheus/tsdb/chunkenc