Merge remote-tracking branch 'origin/release-3.7' into krajo/merge-3.7.3-to-main

# Conflicts:
#	CHANGELOG.md
#	storage/remote/queue_manager_test.go
György Krajcsovits 2025-10-30 09:21:25 +01:00
commit b8192127ee
11 changed files with 157 additions and 35 deletions

View File

@@ -4,6 +4,13 @@
 
 * [FEATURE] Templates: Add urlQueryEscape to template functions. #17403
 
+## 3.7.3 / 2025-10-29
+
+* [BUGFIX] UI: Revert changed (and breaking) redirect behavior for `-web.external-url` if `-web.route-prefix` is configured, which was introduced in #17240. #17389
+* [BUGFIX] Fix federation of some native histograms. #17299 #17409
+* [BUGFIX] promtool: `check config` would fail when `--lint=none` flag was set. #17399 #17414
+* [BUGFIX] Remote-write: fix a deadlock in the queue resharding logic that can lead to suboptimal queue behavior. #17412
+
 ## 3.7.2 / 2025-10-22
 
 * [BUGFIX] AWS SD: Fix AWS SDK v2 credentials handling for EC2 and Lightsail discovery. #17355

View File

@@ -1 +1 @@
-3.7.2
+3.7.3

View File

@@ -965,3 +965,73 @@ remote_write:
 		return true
 	}, 10*time.Second, 100*time.Millisecond)
 }
+
+// TestRemoteWrite_ReshardingWithoutDeadlock ensures that resharding (scaling up) doesn't block when the shards are full.
+// See: https://github.com/prometheus/prometheus/issues/17384.
+func TestRemoteWrite_ReshardingWithoutDeadlock(t *testing.T) {
+	t.Parallel()
+
+	tmpDir := t.TempDir()
+	configFile := filepath.Join(tmpDir, "prometheus.yml")
+	port := testutil.RandomUnprivilegedPort(t)
+
+	server := httptest.NewServer(http.HandlerFunc(func(http.ResponseWriter, *http.Request) {
+		time.Sleep(time.Second)
+	}))
+	t.Cleanup(server.Close)
+
+	config := fmt.Sprintf(`
+global:
+  scrape_interval: 100ms
+
+scrape_configs:
+- job_name: 'self'
+  static_configs:
+  - targets: ['localhost:%d']
+
+remote_write:
+- url: %s
+  queue_config:
+    # Speed up the queue being full.
+    capacity: 1
+`, port, server.URL)
+	require.NoError(t, os.WriteFile(configFile, []byte(config), 0o777))
+
+	prom := prometheusCommandWithLogging(
+		t,
+		configFile,
+		port,
+		fmt.Sprintf("--storage.tsdb.path=%s", tmpDir),
+	)
+	require.NoError(t, prom.Start())
+
+	var checkInitialDesiredShardsOnce sync.Once
+	require.Eventually(t, func() bool {
+		r, err := http.Get(fmt.Sprintf("http://127.0.0.1:%d/metrics", port))
+		if err != nil {
+			return false
+		}
+		defer r.Body.Close()
+		if r.StatusCode != http.StatusOK {
+			return false
+		}
+
+		metrics, err := io.ReadAll(r.Body)
+		if err != nil {
+			return false
+		}
+
+		checkInitialDesiredShardsOnce.Do(func() {
+			s, err := getMetricValue(t, bytes.NewReader(metrics), model.MetricTypeGauge, "prometheus_remote_storage_shards_desired")
+			require.NoError(t, err)
+			require.Equal(t, 1.0, s)
+		})
+
+		desiredShards, err := getMetricValue(t, bytes.NewReader(metrics), model.MetricTypeGauge, "prometheus_remote_storage_shards_desired")
+		if err != nil || desiredShards <= 1 {
+			return false
+		}
+		return true
+		// 3*shardUpdateDuration to allow for the resharding logic to run.
+	}, 30*time.Second, 1*time.Second)
+}
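
Note: getMetricValue above is a helper from the repository's test utilities and is not part of this diff. A minimal sketch of what such a helper could look like, assuming the prometheus/common text parser (the package, name, signature, and error handling here are illustrative, not the repository's actual code):

package metricsutil // hypothetical package for this sketch

import (
	"fmt"
	"io"

	dto "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/expfmt"
)

// gaugeValue parses a text-format /metrics payload and returns the value of
// the named gauge. Hypothetical stand-in for the getMetricValue test helper.
func gaugeValue(r io.Reader, name string) (float64, error) {
	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(r)
	if err != nil {
		return 0, err
	}
	mf, ok := families[name]
	if !ok {
		return 0, fmt.Errorf("metric %q not found", name)
	}
	if mf.GetType() != dto.MetricType_GAUGE || len(mf.GetMetric()) == 0 {
		return 0, fmt.Errorf("metric %q is not a populated gauge", name)
	}
	return mf.GetMetric()[0].GetGauge().GetValue(), nil
}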

View File

@@ -454,10 +454,11 @@ type QueueManager struct {
 	quit chan struct{}
 	wg   sync.WaitGroup
 
-	dataIn, dataOut, dataOutDuration *ewmaRate
+	dataIn, dataDropped, dataOut, dataOutDuration *ewmaRate
 
-	metrics  *queueManagerMetrics
-	interner *pool
+	metrics              *queueManagerMetrics
+	interner             *pool
+	highestRecvTimestamp *maxTimestamp
 }
 
 // NewQueueManager builds a new QueueManager and starts a new
@@ -471,6 +472,7 @@ func NewQueueManager(
 	readerMetrics *wlog.LiveReaderMetrics,
 	logger *slog.Logger,
 	dir string,
+	samplesIn *ewmaRate,
 	cfg config.QueueConfig,
 	mCfg config.MetadataConfig,
 	externalLabels labels.Labels,
@@ -478,6 +480,7 @@ func NewQueueManager(
 	client WriteClient,
 	flushDeadline time.Duration,
 	interner *pool,
+	highestRecvTimestamp *maxTimestamp,
 	sm ReadyScrapeManager,
 	enableExemplarRemoteWrite bool,
 	enableNativeHistogramRemoteWrite bool,
@@ -517,12 +520,14 @@ func NewQueueManager(
 		reshardChan: make(chan int),
 		quit:        make(chan struct{}),
 
-		dataIn:          newEWMARate(ewmaWeight, shardUpdateDuration),
+		dataIn:          samplesIn,
+		dataDropped:     newEWMARate(ewmaWeight, shardUpdateDuration),
 		dataOut:         newEWMARate(ewmaWeight, shardUpdateDuration),
 		dataOutDuration: newEWMARate(ewmaWeight, shardUpdateDuration),
 
-		metrics:  metrics,
-		interner: interner,
+		metrics:              metrics,
+		interner:             interner,
+		highestRecvTimestamp: highestRecvTimestamp,
 
 		protoMsg: protoMsg,
 		compr:    compression.Snappy, // Hardcoded for now, but scaffolding exists for likely future use.
@@ -712,6 +717,7 @@ outer:
 		t.seriesMtx.Lock()
 		lbls, ok := t.seriesLabels[s.Ref]
 		if !ok {
+			t.dataDropped.incr(1)
 			if _, ok := t.droppedSeries[s.Ref]; !ok {
 				t.logger.Info("Dropped sample for series that was not explicitly dropped via relabelling", "ref", s.Ref)
 				t.metrics.droppedSamplesTotal.WithLabelValues(reasonUnintentionalDroppedSeries).Inc()
@@ -773,6 +779,8 @@ outer:
 		t.seriesMtx.Lock()
 		lbls, ok := t.seriesLabels[e.Ref]
 		if !ok {
+			// Track dropped exemplars in the same EWMA for sharding calc.
+			t.dataDropped.incr(1)
 			if _, ok := t.droppedSeries[e.Ref]; !ok {
 				t.logger.Info("Dropped exemplar for series that was not explicitly dropped via relabelling", "ref", e.Ref)
 				t.metrics.droppedExemplarsTotal.WithLabelValues(reasonUnintentionalDroppedSeries).Inc()
@@ -834,6 +842,7 @@ outer:
 		t.seriesMtx.Lock()
 		lbls, ok := t.seriesLabels[h.Ref]
 		if !ok {
+			t.dataDropped.incr(1)
 			if _, ok := t.droppedSeries[h.Ref]; !ok {
 				t.logger.Info("Dropped histogram for series that was not explicitly dropped via relabelling", "ref", h.Ref)
 				t.metrics.droppedHistogramsTotal.WithLabelValues(reasonUnintentionalDroppedSeries).Inc()
@@ -894,6 +903,7 @@ outer:
 		t.seriesMtx.Lock()
 		lbls, ok := t.seriesLabels[h.Ref]
 		if !ok {
+			t.dataDropped.incr(1)
 			if _, ok := t.droppedSeries[h.Ref]; !ok {
 				t.logger.Info("Dropped histogram for series that was not explicitly dropped via relabelling", "ref", h.Ref)
 				t.metrics.droppedHistogramsTotal.WithLabelValues(reasonUnintentionalDroppedSeries).Inc()
@@ -1123,8 +1133,8 @@ func (t *QueueManager) shouldReshard(desiredShards int) bool {
 // outlined in this functions implementation. It is up to the caller to reshard, or not,
 // based on the return value.
 func (t *QueueManager) calculateDesiredShards() int {
-	t.dataIn.tick()
 	t.dataOut.tick()
+	t.dataDropped.tick()
 	t.dataOutDuration.tick()
 
 	// We use the number of incoming samples as a prediction of how much work we
@@ -1134,12 +1144,13 @@ func (t *QueueManager) calculateDesiredShards() int {
 	var (
 		dataInRate      = t.dataIn.rate()
 		dataOutRate     = t.dataOut.rate()
+		dataKeptRatio   = dataOutRate / (t.dataDropped.rate() + dataOutRate)
 		dataOutDuration = t.dataOutDuration.rate() / float64(time.Second)
-		dataPendingRate = dataInRate - dataOutRate
+		dataPendingRate = dataInRate*dataKeptRatio - dataOutRate
 		highestSent     = t.metrics.highestSentTimestamp.Get()
-		highestRecv     = t.metrics.highestTimestamp.Get()
+		highestRecv     = t.highestRecvTimestamp.Get()
 		delay           = highestRecv - highestSent
-		dataPending     = delay * dataInRate
+		dataPending     = delay * dataInRate * dataKeptRatio
 	)
 
 	if dataOutRate <= 0 {
@@ -1151,12 +1162,13 @@ func (t *QueueManager) calculateDesiredShards() int {
 		backlogCatchup = 0.05 * dataPending
 		// Calculate Time to send one sample, averaged across all sends done this tick.
 		timePerSample = dataOutDuration / dataOutRate
-		desiredShards = timePerSample * (dataInRate + backlogCatchup)
+		desiredShards = timePerSample * (dataInRate*dataKeptRatio + backlogCatchup)
 	)
 	t.metrics.desiredNumShards.Set(desiredShards)
 	t.logger.Debug("QueueManager.calculateDesiredShards",
 		"dataInRate", dataInRate,
 		"dataOutRate", dataOutRate,
+		"dataKeptRatio", dataKeptRatio,
 		"dataPendingRate", dataPendingRate,
 		"dataPending", dataPending,
 		"dataOutDuration", dataOutDuration,
@@ -1349,7 +1361,6 @@ func (s *shards) enqueue(ref chunks.HeadSeriesRef, data timeSeries) bool {
 			return true
 		}
 		s.qm.metrics.highestTimestamp.Set(float64(data.timestamp / 1000))
-		s.qm.dataIn.incr(1)
 		return true
 	}
 }
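
The effect of the new dataKeptRatio term: samples dropped by relabelling are now counted into dataDropped at enqueue time, so the shard calculation sizes for the fraction of incoming data that will actually be sent. A worked example with made-up rates (the variable names mirror the diff, the per-second figures are illustrative only, and backlogCatchup is left out for simplicity):

package main

import "fmt"

func main() {
	// Illustrative inputs, not taken from the commit:
	dataInRate := 1000.0     // samples/s entering remote write
	dataOutRate := 400.0     // samples/s actually sent
	dataDroppedRate := 600.0 // samples/s dropped by relabelling
	dataOutDuration := 2.0   // send-time seconds accrued per wall-clock second

	// New in this commit: only the kept fraction of incoming data counts.
	dataKeptRatio := dataOutRate / (dataDroppedRate + dataOutRate) // 0.4

	// Time to send one sample, averaged across all sends this tick.
	timePerSample := dataOutDuration / dataOutRate // 0.005s

	// Old formula sized shards for all 1000 samples/s:
	oldDesired := timePerSample * dataInRate // 5.0 shards

	// New formula sizes for the 400 samples/s that will actually be sent:
	newDesired := timePerSample * (dataInRate * dataKeptRatio) // 2.0 shards

	fmt.Printf("old=%.1f new=%.1f\n", oldDesired, newDesired)
}

Previously the formula used the full dataInRate, so heavy relabel-dropping inflated the desired shard count.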

View File

@@ -55,6 +55,17 @@ import (
 const defaultFlushDeadline = 1 * time.Minute
 
+func newHighestTimestampMetric() *maxTimestamp {
+	return &maxTimestamp{
+		Gauge: prometheus.NewGauge(prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "highest_timestamp_in_seconds",
+			Help:      "Highest timestamp that has come into the remote storage via the Appender interface, in seconds since epoch. Initialized to 0 when no data has been received yet",
+		}),
+	}
+}
+
 func TestBasicContentNegotiation(t *testing.T) {
 	t.Parallel()
 
 	queueConfig := config.DefaultQueueConfig
@@ -314,7 +325,7 @@ func newTestClientAndQueueManager(t testing.TB, flushDeadline time.Duration, pro
 func newTestQueueManager(t testing.TB, cfg config.QueueConfig, mcfg config.MetadataConfig, deadline time.Duration, c WriteClient, protoMsg remoteapi.WriteMessageType) *QueueManager {
 	dir := t.TempDir()
 	metrics := newQueueManagerMetrics(nil, "", "")
-	m := NewQueueManager(metrics, nil, nil, nil, dir, cfg, mcfg, labels.EmptyLabels(), nil, c, deadline, newPool(), nil, false, false, false, protoMsg)
+	m := NewQueueManager(metrics, nil, nil, nil, dir, newEWMARate(ewmaWeight, shardUpdateDuration), cfg, mcfg, labels.EmptyLabels(), nil, c, deadline, newPool(), newHighestTimestampMetric(), nil, false, false, false, protoMsg)
 	return m
 }
@@ -770,7 +781,7 @@ func TestDisableReshardOnRetry(t *testing.T) {
 		}
 	)
 
-	m := NewQueueManager(metrics, nil, nil, nil, "", cfg, mcfg, labels.EmptyLabels(), nil, client, 0, newPool(), nil, false, false, false, remoteapi.WriteV1MessageType)
+	m := NewQueueManager(metrics, nil, nil, nil, "", newEWMARate(ewmaWeight, shardUpdateDuration), cfg, mcfg, labels.EmptyLabels(), nil, client, 0, newPool(), newHighestTimestampMetric(), nil, false, false, false, remoteapi.WriteV1MessageType)
 	m.StoreSeries(fakeSeries, 0)
 
 	// Attempt to samples while the manager is running. We immediately stop the
@@ -1459,7 +1470,7 @@ func BenchmarkStoreSeries(b *testing.B) {
 			mcfg := config.DefaultMetadataConfig
 			metrics := newQueueManagerMetrics(nil, "", "")
-			m := NewQueueManager(metrics, nil, nil, nil, dir, cfg, mcfg, labels.EmptyLabels(), nil, c, defaultFlushDeadline, newPool(), nil, false, false, false, remoteapi.WriteV1MessageType)
+			m := NewQueueManager(metrics, nil, nil, nil, dir, newEWMARate(ewmaWeight, shardUpdateDuration), cfg, mcfg, labels.EmptyLabels(), nil, c, defaultFlushDeadline, newPool(), newHighestTimestampMetric(), nil, false, false, false, remoteapi.WriteV1MessageType)
 
 			m.externalLabels = tc.externalLabels
 			m.relabelConfigs = tc.relabelConfigs
@@ -1559,8 +1570,9 @@ func TestCalculateDesiredShards(t *testing.T) {
 	addSamples := func(s int64, ts time.Duration) {
 		pendingSamples += s
 		samplesIn.incr(s)
+		samplesIn.tick()
-		m.metrics.highestTimestamp.Set(float64(startedAt.Add(ts).Unix()))
+		m.highestRecvTimestamp.Set(float64(startedAt.Add(ts).Unix()))
 	}
 
 	// helper function for sending samples.
@@ -1617,6 +1629,7 @@ func TestCalculateDesiredShardsDetail(t *testing.T) {
 		prevShards      int
 		dataIn          int64 // Quantities normalised to seconds.
 		dataOut         int64
+		dataDropped     int64
 		dataOutDuration float64
 		backlog         float64
 		expectedShards  int
@@ -1763,9 +1776,11 @@ func TestCalculateDesiredShardsDetail(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
 			m.numShards = tc.prevShards
 			forceEMWA(samplesIn, tc.dataIn*int64(shardUpdateDuration/time.Second))
+			samplesIn.tick()
 			forceEMWA(m.dataOut, tc.dataOut*int64(shardUpdateDuration/time.Second))
+			forceEMWA(m.dataDropped, tc.dataDropped*int64(shardUpdateDuration/time.Second))
 			forceEMWA(m.dataOutDuration, int64(tc.dataOutDuration*float64(shardUpdateDuration)))
-			m.metrics.highestTimestamp.value = tc.backlog // Not Set() because it can only increase value.
+			m.highestRecvTimestamp.value = tc.backlog // Not Set() because it can only increase value.
 
 			require.Equal(t, tc.expectedShards, m.calculateDesiredShards())
 		})

View File

@@ -34,8 +34,6 @@ import (
 	"github.com/prometheus/prometheus/tsdb/wlog"
 )
 
-// TODO: Remove along with timestampTracker logic once we can be sure no user
-// will encounter a gap that these metrics cover but other metrics don't.
 var (
 	samplesIn = promauto.NewCounter(prometheus.CounterOpts{
 		Namespace: namespace,
@@ -68,9 +66,11 @@ type WriteStorage struct {
 	externalLabels labels.Labels
 	dir            string
 	queues         map[string]*QueueManager
+	samplesIn      *ewmaRate
 	flushDeadline  time.Duration
 	interner       *pool
 	scraper        ReadyScrapeManager
+	quit           chan struct{}
 
 	// For timestampTracker.
 	highestTimestamp *maxTimestamp
@@ -89,11 +89,11 @@ func NewWriteStorage(logger *slog.Logger, reg prometheus.Registerer, dir string,
 		logger:           logger,
 		reg:              reg,
 		flushDeadline:    flushDeadline,
+		samplesIn:        newEWMARate(ewmaWeight, shardUpdateDuration),
 		dir:              dir,
 		interner:         newPool(),
 		scraper:          sm,
-		// TODO: Remove along with timestampTracker logic once we can be sure no user
-		// will encounter a gap that this metric covers but other metrics don't.
+		quit:             make(chan struct{}),
 		highestTimestamp: &maxTimestamp{
 			Gauge: prometheus.NewGauge(prometheus.GaugeOpts{
 				Namespace: namespace,
@@ -107,9 +107,23 @@ func NewWriteStorage(logger *slog.Logger, reg prometheus.Registerer, dir string,
 	if reg != nil {
 		reg.MustRegister(rws.highestTimestamp)
 	}
+	go rws.run()
 	return rws
 }
 
+func (rws *WriteStorage) run() {
+	ticker := time.NewTicker(shardUpdateDuration)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ticker.C:
+			rws.samplesIn.tick()
+		case <-rws.quit:
+			return
+		}
+	}
+}
+
 func (rws *WriteStorage) Notify() {
 	rws.mtx.Lock()
 	defer rws.mtx.Unlock()
@@ -187,6 +201,7 @@ func (rws *WriteStorage) ApplyConfig(conf *config.Config) error {
 			rws.liveReaderMetrics,
 			rws.logger,
 			rws.dir,
+			rws.samplesIn,
 			rwConf.QueueConfig,
 			rwConf.MetadataConfig,
 			conf.GlobalConfig.ExternalLabels,
@@ -194,6 +209,7 @@ func (rws *WriteStorage) ApplyConfig(conf *config.Config) error {
 			c,
 			rws.flushDeadline,
 			rws.interner,
+			rws.highestTimestamp,
 			rws.scraper,
 			rwConf.SendExemplars,
 			rwConf.SendNativeHistograms,
@@ -254,6 +270,7 @@ func (rws *WriteStorage) Close() error {
 	for _, q := range rws.queues {
 		q.Stop()
 	}
+	close(rws.quit)
 
 	rws.watcherMetrics.Unregister()
 	rws.liveReaderMetrics.Unregister()
@@ -329,6 +346,8 @@ func (*timestampTracker) UpdateMetadata(storage.SeriesRef, labels.Labels, metadata.Metadata) (storage.SeriesRef, error) {
 
 // Commit implements storage.Appender.
 func (t *timestampTracker) Commit() error {
+	t.writeStorage.samplesIn.incr(t.samples + t.exemplars + t.histograms)
+
 	samplesIn.Add(float64(t.samples))
 	exemplarsIn.Add(float64(t.exemplars))
 	histogramsIn.Add(float64(t.histograms))
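
Two structural changes stand out in this file: WriteStorage now owns the shared samplesIn EWMA rate and ticks it from a single run goroutine, and timestampTracker.Commit increments it once per commit instead of each shard's enqueue incrementing a per-queue copy. For reference, a simplified sketch of the incr/tick/rate contract such a rate provides (this is an illustration, not the real ewmaRate in the Prometheus codebase, which differs in detail):

package remote

import (
	"sync"
	"sync/atomic"
	"time"
)

// ewmaRateSketch illustrates the contract assumed above: incr counts events,
// tick folds the count into an exponentially weighted moving average once per
// interval, and rate reads the smoothed events-per-second value.
type ewmaRateSketch struct {
	newEvents atomic.Int64
	alpha     float64       // smoothing factor, e.g. ewmaWeight
	interval  time.Duration // tick interval, e.g. shardUpdateDuration

	mtx      sync.Mutex
	lastRate float64
	init     bool
}

func (r *ewmaRateSketch) incr(n int64) { r.newEvents.Add(n) }

func (r *ewmaRateSketch) tick() {
	instant := float64(r.newEvents.Swap(0)) / r.interval.Seconds()
	r.mtx.Lock()
	defer r.mtx.Unlock()
	if r.init {
		r.lastRate += r.alpha * (instant - r.lastRate)
	} else if instant > 0 {
		r.init = true
		r.lastRate = instant
	}
}

func (r *ewmaRateSketch) rate() float64 {
	r.mtx.Lock()
	defer r.mtx.Unlock()
	return r.lastRate
}

Counting samples at commit time and ticking the rate in one place keeps every queue's view of the ingest rate consistent, whatever state an individual queue's reshard loop is in.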

View File

@@ -1,7 +1,7 @@
 {
   "name": "@prometheus-io/mantine-ui",
   "private": true,
-  "version": "0.307.2",
+  "version": "0.307.3",
   "type": "module",
   "scripts": {
     "start": "vite",
@@ -28,7 +28,7 @@
     "@microsoft/fetch-event-source": "^2.0.1",
     "@nexucis/fuzzy": "^0.5.1",
     "@nexucis/kvsearch": "^0.9.1",
-    "@prometheus-io/codemirror-promql": "0.307.2",
+    "@prometheus-io/codemirror-promql": "0.307.3",
     "@reduxjs/toolkit": "^2.9.0",
     "@tabler/icons-react": "^3.35.0",
     "@tanstack/react-query": "^5.90.2",

View File

@@ -1,6 +1,6 @@
 {
   "name": "@prometheus-io/codemirror-promql",
-  "version": "0.307.2",
+  "version": "0.307.3",
   "description": "a CodeMirror mode for the PromQL language",
   "types": "dist/esm/index.d.ts",
   "module": "dist/esm/index.js",
@@ -29,7 +29,7 @@
   },
   "homepage": "https://github.com/prometheus/prometheus/blob/main/web/ui/module/codemirror-promql/README.md",
   "dependencies": {
-    "@prometheus-io/lezer-promql": "0.307.2",
+    "@prometheus-io/lezer-promql": "0.307.3",
     "lru-cache": "^11.2.2"
   },
   "devDependencies": {

View File

@@ -1,6 +1,6 @@
 {
   "name": "@prometheus-io/lezer-promql",
-  "version": "0.307.2",
+  "version": "0.307.3",
   "description": "lezer-based PromQL grammar",
   "main": "dist/index.cjs",
   "type": "module",

View File

@@ -1,12 +1,12 @@
 {
   "name": "prometheus-io",
-  "version": "0.307.2",
+  "version": "0.307.3",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "prometheus-io",
-      "version": "0.307.2",
+      "version": "0.307.3",
       "workspaces": [
         "mantine-ui",
         "module/*"
@@ -24,7 +24,7 @@
     },
     "mantine-ui": {
       "name": "@prometheus-io/mantine-ui",
-      "version": "0.307.2",
+      "version": "0.307.3",
       "dependencies": {
         "@codemirror/autocomplete": "^6.19.0",
         "@codemirror/language": "^6.11.3",
@@ -42,7 +42,7 @@
         "@microsoft/fetch-event-source": "^2.0.1",
         "@nexucis/fuzzy": "^0.5.1",
         "@nexucis/kvsearch": "^0.9.1",
-        "@prometheus-io/codemirror-promql": "0.307.2",
+        "@prometheus-io/codemirror-promql": "0.307.3",
         "@reduxjs/toolkit": "^2.9.0",
         "@tabler/icons-react": "^3.35.0",
         "@tanstack/react-query": "^5.90.2",
@@ -87,10 +87,10 @@
     },
     "module/codemirror-promql": {
       "name": "@prometheus-io/codemirror-promql",
-      "version": "0.307.2",
+      "version": "0.307.3",
       "license": "Apache-2.0",
       "dependencies": {
-        "@prometheus-io/lezer-promql": "0.307.2",
+        "@prometheus-io/lezer-promql": "0.307.3",
         "lru-cache": "^11.2.2"
       },
       "devDependencies": {
@@ -120,7 +120,7 @@
     },
     "module/lezer-promql": {
       "name": "@prometheus-io/lezer-promql",
-      "version": "0.307.2",
+      "version": "0.307.3",
       "license": "Apache-2.0",
       "devDependencies": {
         "@lezer/generator": "^1.8.0",

View File

@@ -1,7 +1,7 @@
 {
   "name": "prometheus-io",
   "description": "Monorepo for the Prometheus UI",
-  "version": "0.307.2",
+  "version": "0.307.3",
   "private": true,
   "scripts": {
     "build": "bash build_ui.sh --all",