From 357e6520449004565aeb4a7ed6c05f74c5cd1d87 Mon Sep 17 00:00:00 2001
From: Lukasz Mierzwa
Date: Wed, 2 Jul 2025 16:01:42 +0100
Subject: [PATCH 1/2] Add a test for a rare shutdown hang

Doing a config reload that needs to stop some providers while also sending
SIGTERM to Prometheus at the same time can sometimes hang:

1: sync.WaitGroup.Wait [83 minutes] [Created by run.(*Group).Run in goroutine 1 @ group.go:37]
    sync         sema.go:110       runtime_SemacquireWaitGroup(*uint32(#166))
    sync         waitgroup.go:118  (*WaitGroup).Wait(*WaitGroup(#23))
    discovery    manager.go:276    (*Manager).ApplyConfig(#23, #167)
    main         main.go:964       main.func5(#120)
    main         main.go:1505      reloadConfig({#183, 0x1b}, 1, #40, #43, #50, {#31, 0xa, 0})
    main         main.go:1182      main.func22()
    run          group.go:38       (*Group).Run.func1(*Group(#26), #51)

Add a test for it.

Signed-off-by: Lukasz Mierzwa
---
 discovery/manager_test.go | 50 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/discovery/manager_test.go b/discovery/manager_test.go
index 1dd10baf47..38a93be9f4 100644
--- a/discovery/manager_test.go
+++ b/discovery/manager_test.go
@@ -1562,3 +1562,53 @@ func TestUnregisterMetrics(t *testing.T) {
 		cancel()
 	}
 }
+
+// Calling ApplyConfig() to remove providers at the same time as the manager
+// is shutting down should not hang.
+func TestConfigReloadAndShutdownRace(t *testing.T) {
+	reg := prometheus.NewRegistry()
+	_, sdMetrics := NewTestMetrics(t, reg)
+
+	mgrCtx, mgrCancel := context.WithCancel(context.Background())
+	discoveryManager := NewManager(mgrCtx, promslog.NewNopLogger(), reg, sdMetrics)
+	require.NotNil(t, discoveryManager)
+	discoveryManager.updatert = 100 * time.Millisecond
+
+	var wgDiscovery sync.WaitGroup
+	wgDiscovery.Add(1)
+	go func() {
+		discoveryManager.Run()
+		wgDiscovery.Done()
+	}()
+	time.Sleep(time.Millisecond * 200)
+
+	var wgBg sync.WaitGroup
+	updateChan := discoveryManager.SyncCh()
+	wgBg.Add(1)
+	ctx, cancel := context.WithCancel(context.Background())
+	go func() {
+		defer wgBg.Done()
+		select {
+		case <-ctx.Done():
+			return
+		case <-updateChan:
+		}
+	}()
+
+	c := map[string]Configs{
+		"prometheus": {staticConfig("bar:9090")},
+	}
+	discoveryManager.ApplyConfig(c)
+
+	delete(c, "prometheus")
+	wgBg.Add(1)
+	go func() {
+		discoveryManager.ApplyConfig(c)
+		wgBg.Done()
+	}()
+	mgrCancel()
+	wgDiscovery.Wait()
+
+	cancel()
+	wgBg.Wait()
+}

From b49d143595f98c0817bb804edaaf69232495a24e Mon Sep 17 00:00:00 2001
From: Lukasz Mierzwa
Date: Wed, 2 Jul 2025 16:03:08 +0100
Subject: [PATCH 2/2] Fix a race in discovery manager ApplyConfig & shutdown

If we call ApplyConfig() while the manager is being stopped, we might end up
hanging forever. This is because ApplyConfig() tries to cancel obsolete
providers and then waits until they are cancelled. It does that by setting a
done() function that calls Done() on a sync.WaitGroup:

```
if len(prov.newSubs) == 0 {
	wg.Add(1)
	prov.done = func() {
		wg.Done()
	}
}
```

then calling prov.cancel() and finally blocking on wg.Wait() until every
obsolete provider has run its done() function.
For each provider there is a goroutine created by calling
Manager.startProvider(*Provider):

```
func (m *Manager) startProvider(ctx context.Context, p *Provider) {
	m.logger.Debug("Starting provider", "provider", p.name, "subs", fmt.Sprintf("%v", p.subs))
	ctx, cancel := context.WithCancel(ctx)
	updates := make(chan []*targetgroup.Group)

	p.mu.Lock()
	p.cancel = cancel
	p.mu.Unlock()

	go p.d.Run(ctx, updates)
	go m.updater(ctx, p, updates)
}
```

It creates a context that can be cancelled, and that cancel function becomes
prov.cancel. This is what ApplyConfig() will call.

If we look at the body of the updater() method:

```
func (m *Manager) updater(ctx context.Context, p *Provider, updates chan []*targetgroup.Group) {
	// Ensure targets from this provider are cleaned up.
	defer m.cleaner(p)
	for {
		select {
		case <-ctx.Done():
			return
[...]
```

we can see that it exits when that context is cancelled, and that triggers a
call to Manager.cleaner(). That cleaner() is where done() is called.

So ApplyConfig() -> calls cancel() -> causes cleaner() to be executed -> calls done().

cancel() is also called from the cancelDiscoverers() method, which Manager.Run()
calls when the Manager is stopping:

```
func (m *Manager) Run() error {
	go m.sender()
	<-m.ctx.Done()
	m.cancelDiscoverers()
	return m.ctx.Err()
}
```

The problem is that if we call ApplyConfig() and stop the manager at the same
time we might end up with:

- We call Manager.ApplyConfig().
- We stop the Manager.
- Manager.cancelDiscoverers() is called.
- Provider.cancel() is called for every Provider.
- cancel() causes the provider context to be cancelled, which terminates
  updater() for the given Provider.
- Cancelling the context causes the cleaner() method to be called for the
  given Provider.
- cleaner() calls done() and exits.
- The Provider is considered stopped at this point; there is no goroutine
  running that will call done() anymore.
- ApplyConfig() iterates providers and decides that one is obsolete and must
  be stopped.
- It sets a custom done() function body with a WaitGroup.Done() call in it.
- Then ApplyConfig() waits until all Providers run done().
- But they are all stopped and no done() will be run.
- We wait forever.

This only happens if cancelDiscoverers() runs before ApplyConfig(): if
ApplyConfig() runs first, done() will be called; if cancelDiscoverers() runs
first, it stops the updater() instances and so done() won't be called anymore.

Part of the problem is that there is no distinction between running and
stopped providers. There is a Provider.IsStarted() method that returns a bool
based on the value of the cancel function, but ApplyConfig() doesn't check it.

The second problem is that although there is a mutex on a Provider, it's not
used much in the code, so two goroutines can try to read and/or write
provider.cancel and/or provider.done at the same time, making it all more
likely to race.

The easiest way to fix it is to check inside ApplyConfig() whether the
provider is started, so we don't try to stop a provider that's already
stopped. For that we need to mark the provider as stopped after cancel() is
called, by setting cancel to nil. This also needs better lock usage to avoid
different parts of the code trying to set cancel and done at the same time.
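Below is a minimal, self-contained sketch of that idea. It is only an
illustration, not the Manager code itself; the provider type and the
start()/cleanup()/stopObsolete() helpers are made up for the example. It shows
why the reload path cannot hang once cancel/done are only touched under the
mutex and cancel is set to nil after cleanup:

```
// Hypothetical illustration (not Prometheus code): cancel/done are only
// touched under the provider mutex, and cancel is set to nil once the
// provider has cleaned up, so a concurrent reload never waits on a done()
// callback that nobody will run.
package main

import (
	"context"
	"fmt"
	"sync"
)

type provider struct {
	mu     sync.Mutex
	cancel context.CancelFunc
	done   func()
}

func (p *provider) start(ctx context.Context) {
	ctx, cancel := context.WithCancel(ctx)
	p.mu.Lock()
	p.cancel = cancel
	p.mu.Unlock()
	go func() {
		<-ctx.Done() // the provider's only job here is to wait for cancellation
		p.cleanup()
	}()
}

// cleanup mirrors Manager.cleaner(): run done() if one was registered, then
// mark the provider as stopped by nil-ing cancel.
func (p *provider) cleanup() {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.done != nil {
		p.done()
	}
	p.cancel = nil
}

// stopObsolete mirrors the ApplyConfig() side: only register a done()
// callback and cancel the provider if it is still running.
func (p *provider) stopObsolete(wg *sync.WaitGroup) {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.cancel == nil {
		return // already stopped; waiting for done() here would hang forever
	}
	wg.Add(1)
	p.done = func() { wg.Done() }
	p.cancel()
}

func main() {
	ctx, shutdown := context.WithCancel(context.Background())
	p := &provider{}
	p.start(ctx)

	go shutdown() // "SIGTERM" path: may cancel the provider first

	// "config reload" path: stop the now-obsolete provider and wait for it.
	var wg sync.WaitGroup
	p.stopObsolete(&wg)
	wg.Wait() // must return no matter which path won the race
	fmt.Println("no hang")
}
```

Whichever path wins the race, wg.Wait() returns: either done() is registered
before the provider cleans up (and cleanup() calls it), or the provider is
already marked as stopped and no wait is registered at all.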
Signed-off-by: Lukasz Mierzwa
---
 discovery/manager.go | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/discovery/manager.go b/discovery/manager.go
index 6e9bab1d7c..24950d9d59 100644
--- a/discovery/manager.go
+++ b/discovery/manager.go
@@ -57,6 +57,8 @@ func (p *Provider) Discoverer() Discoverer {
 
 // IsStarted return true if Discoverer is started.
 func (p *Provider) IsStarted() bool {
+	p.mu.RLock()
+	defer p.mu.RUnlock()
 	return p.cancel != nil
 }
 
@@ -216,15 +218,22 @@ func (m *Manager) ApplyConfig(cfg map[string]Configs) error {
 		newProviders []*Provider
 	)
 	for _, prov := range m.providers {
-		// Cancel obsolete providers.
-		if len(prov.newSubs) == 0 {
+		// Cancel an obsolete provider if it has no new subs and still has a cancel function.
+		// prov.cancel != nil is the same check as in the IsStarted() method, but we don't call
+		// IsStarted() here because it would take a lock and we need the same lock ourselves for other reads.
+		prov.mu.RLock()
+		if len(prov.newSubs) == 0 && prov.cancel != nil {
 			wg.Add(1)
 			prov.done = func() {
 				wg.Done()
 			}
+
 			prov.cancel()
+			prov.mu.RUnlock()
 			continue
 		}
+		prov.mu.RUnlock()
+
 		newProviders = append(newProviders, prov)
 		// refTargets keeps reference targets used to populate new subs' targets as they should be the same.
 		var refTargets map[string]*targetgroup.Group
@@ -298,7 +307,9 @@ func (m *Manager) startProvider(ctx context.Context, p *Provider) {
 	ctx, cancel := context.WithCancel(ctx)
 	updates := make(chan []*targetgroup.Group)
 
+	p.mu.Lock()
 	p.cancel = cancel
+	p.mu.Unlock()
 
 	go p.d.Run(ctx, updates)
 	go m.updater(ctx, p, updates)
@@ -306,16 +317,20 @@ func (m *Manager) startProvider(ctx context.Context, p *Provider) {
 
 // cleaner cleans resources associated with provider.
 func (m *Manager) cleaner(p *Provider) {
-	p.mu.RLock()
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
 	m.targetsMtx.Lock()
 	for s := range p.subs {
 		delete(m.targets, poolKey{s, p.name})
 	}
 	m.targetsMtx.Unlock()
-	p.mu.RUnlock()
 	if p.done != nil {
 		p.done()
 	}
+
+	// Provider was cleaned up, so mark it as stopped.
+	p.cancel = nil
 }
 
 func (m *Manager) updater(ctx context.Context, p *Provider, updates chan []*targetgroup.Group) {
@@ -380,9 +395,11 @@ func (m *Manager) cancelDiscoverers() {
 	m.mtx.RLock()
 	defer m.mtx.RUnlock()
 	for _, p := range m.providers {
+		p.mu.RLock()
 		if p.cancel != nil {
 			p.cancel()
 		}
+		p.mu.RUnlock()
 	}
 }