Limit concurrency of scrape pool reloads (#16783)

To avoid possible overload when a configuration change triggers many scrape pool reloads at once.

As per https://github.com/prometheus/prometheus/pull/16595#issuecomment-3005027067, this changes the scrape pool manager to limit the number of scrape pools that can reload at the same time.

Signed-off-by: Lukasz Mierzwa <l.mierzwa@gmail.com>
Łukasz Mierzwa 2025-06-27 12:34:07 +01:00 committed by GitHub
parent df4f1df43f
commit 748fe6d825

@@ -19,6 +19,7 @@ import (
 	"hash/fnv"
 	"log/slog"
 	"reflect"
+	"runtime"
 	"sync"
 	"time"
@@ -293,13 +294,22 @@ func (m *Manager) ApplyConfig(cfg *config.Config) error {
 		wg       sync.WaitGroup
 		toDelete sync.Map // Stores the list of names of pools to delete.
 	)
+	// Use a buffered channel to limit reload concurrency.
+	// Each scrape pool writes to the channel before we start to reload it
+	// and reads from it at the end.
+	// This means only N pools can be reloaded at the same time.
+	canReload := make(chan int, runtime.GOMAXPROCS(0))
 	for poolName, pool := range m.scrapePools {
+		canReload <- 1
 		wg.Add(1)
 		cfg, ok := m.scrapeConfigs[poolName]
 		// Reload each scrape pool in a dedicated goroutine so we don't have to wait a long time
 		// if we have a lot of scrape pools to update.
 		go func(name string, sp *scrapePool, cfg *config.ScrapeConfig, ok bool) {
-			defer wg.Done()
+			defer func() {
+				wg.Done()
+				<-canReload
+			}()
 			switch {
 			case !ok:
 				sp.stop()
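
For readers unfamiliar with the idiom: a buffered channel works as a counting semaphore, because sends block once the buffer is full. Below is a minimal, self-contained sketch of the same pattern; the names (reloadPool, pools) and the sleep-based workload are illustrative stand-ins, not code from the patch.

package main

import (
	"fmt"
	"runtime"
	"sync"
	"time"
)

// reloadPool is a stand-in for a scrape pool reload; it just sleeps briefly.
func reloadPool(name string) {
	time.Sleep(100 * time.Millisecond)
	fmt.Println("reloaded", name)
}

func main() {
	pools := []string{"node", "kubelet", "cadvisor", "blackbox"}

	var wg sync.WaitGroup
	// The capacity of the buffered channel is the maximum number of reloads
	// allowed to run at the same time. struct{} is used here instead of the
	// patch's int values; any element type works for a semaphore.
	canReload := make(chan struct{}, runtime.GOMAXPROCS(0))

	for _, name := range pools {
		canReload <- struct{}{} // acquire a slot; blocks while all slots are busy
		wg.Add(1)
		go func(name string) {
			defer func() {
				wg.Done()
				<-canReload // release the slot once the reload is done
			}()
			reloadPool(name)
		}(name)
	}
	wg.Wait()
}

Two details worth noting: the slot is acquired in the loop, before the goroutine is spawned, so the loop itself blocks and the number of in-flight goroutines stays bounded; and runtime.GOMAXPROCS(0) only queries the current setting (an argument of 0 leaves it unchanged), so reload parallelism tracks the number of usable CPUs rather than the number of scrape pools.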