From 7d55ee8cc8f6ee81935c28ecc2de75a7293100ff Mon Sep 17 00:00:00 2001
From: Lukasz Mierzwa
Date: Fri, 9 May 2025 15:45:36 +0100
Subject: [PATCH 1/2] Try fixing potential deadlocks in discovery

Manager.ApplyConfig() uses multiple locks:
- Provider.mu
- Manager.targetsMtx

Manager.cleaner() uses the same locks but in the opposite order:
- First it locks Manager.targetsMtx
- Then it locks Provider.mu

I've seen a few strange cases of Prometheus hanging on shutdown and never
completing that shutdown. From a few traces I was given it appears that while
Prometheus is still running, only discovery.Manager and notifier.Manager are
still active. From those traces it also seems like they are stuck on a lock in
two functions:
- cleaner() waits on an RLock()
- ApplyConfig() waits on a Lock()

I cannot reproduce it, but I suspect this is a race between locks. Imagine
this scenario:
- Manager.ApplyConfig() is called
- Manager.ApplyConfig() calls Provider.mu.Lock()
- at the same time cleaner() is called on the same Provider instance and it
  calls Manager.targetsMtx.Lock()
- Manager.ApplyConfig() now calls Manager.targetsMtx.Lock(), but that lock is
  already held by the cleaner() function, so ApplyConfig() hangs there
- at the same time cleaner() now wants to lock Provider.mu.RLock(), but that
  lock is already held by Manager.ApplyConfig()
- we end up with both functions locking each other out without any way to
  break that deadlock

Re-order lock calls to try to avoid this scenario. I tried writing a test case
for it but couldn't hit this issue.

Signed-off-by: Lukasz Mierzwa
---
 discovery/manager.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/discovery/manager.go b/discovery/manager.go
index 3219117d2a..d76e67adcc 100644
--- a/discovery/manager.go
+++ b/discovery/manager.go
@@ -306,13 +306,13 @@ func (m *Manager) startProvider(ctx context.Context, p *Provider) {
 
 // cleaner cleans resources associated with provider.
 func (m *Manager) cleaner(p *Provider) {
-	m.targetsMtx.Lock()
 	p.mu.RLock()
+	m.targetsMtx.Lock()
 	for s := range p.subs {
 		delete(m.targets, poolKey{s, p.name})
 	}
-	p.mu.RUnlock()
 	m.targetsMtx.Unlock()
+	p.mu.RUnlock()
 	if p.done != nil {
 		p.done()
 	}

From 59761f631b29de8dccbffe2626b74fccddbb8b48 Mon Sep 17 00:00:00 2001
From: Lukasz Mierzwa
Date: Thu, 15 May 2025 12:30:48 +0100
Subject: [PATCH 2/2] Move m.targetsMtx.Lock down into the loop

Make sure the order of locks is always the same in all functions. In
ApplyConfig() we take m.targetsMtx.Lock() after the provider is locked, so
replicate the same order in allGroups().

Signed-off-by: Lukasz Mierzwa
---
 discovery/manager.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/discovery/manager.go b/discovery/manager.go
index d76e67adcc..6e9bab1d7c 100644
--- a/discovery/manager.go
+++ b/discovery/manager.go
@@ -413,9 +413,9 @@ func (m *Manager) allGroups() map[string][]*targetgroup.Group {
 	n := map[string]int{}
 
 	m.mtx.RLock()
-	m.targetsMtx.Lock()
 	for _, p := range m.providers {
 		p.mu.RLock()
+		m.targetsMtx.Lock()
 		for s := range p.subs {
 			// Send empty lists for subs without any targets to make sure old stale targets are dropped by consumers.
 			// See: https://github.com/prometheus/prometheus/issues/12858 for details.
@@ -430,9 +430,9 @@ func (m *Manager) allGroups() map[string][]*targetgroup.Group {
 				}
 			}
 		}
+		m.targetsMtx.Unlock()
 		p.mu.RUnlock()
 	}
-	m.targetsMtx.Unlock()
 	m.mtx.RUnlock()
 
 	for setName, v := range n {
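
For reference, the ABBA deadlock described in the first commit message can be reproduced with a minimal standalone Go program. This is only an illustrative sketch, not the actual Prometheus discovery code: the names below (manager, providerMu, targetsMtx, and the two goroutines standing in for ApplyConfig() and the pre-patch cleaner()) are hypothetical stand-ins, and the channels exist solely to force the unlucky interleaving deterministically.

package main

import "sync"

// Hypothetical stand-ins for the two locks involved in the report:
// providerMu plays the role of Provider.mu, targetsMtx plays the role of
// Manager.targetsMtx.
type manager struct {
	providerMu sync.RWMutex
	targetsMtx sync.Mutex
}

func main() {
	m := &manager{}
	aHasProvider := make(chan struct{})
	bHasTargets := make(chan struct{})
	var wg sync.WaitGroup
	wg.Add(2)

	// Plays the role of ApplyConfig(): takes Provider.mu first, then targetsMtx.
	go func() {
		defer wg.Done()
		m.providerMu.Lock()
		close(aHasProvider)
		<-bHasTargets       // wait until the other goroutine holds targetsMtx
		m.targetsMtx.Lock() // blocks forever: targetsMtx is held below
		m.targetsMtx.Unlock()
		m.providerMu.Unlock()
	}()

	// Plays the role of the pre-patch cleaner(): takes targetsMtx first,
	// then Provider.mu.
	go func() {
		defer wg.Done()
		m.targetsMtx.Lock()
		close(bHasTargets)
		<-aHasProvider       // wait until the other goroutine holds providerMu
		m.providerMu.RLock() // blocks forever: providerMu is write-locked above
		m.providerMu.RUnlock()
		m.targetsMtx.Unlock()
	}()

	// Both goroutines now wait on each other, so wg.Wait() never returns and
	// the Go runtime aborts with "all goroutines are asleep - deadlock!".
	wg.Wait()
}

Making either function acquire the two locks in the same order as the other, which is what these patches do for cleaner() and allGroups(), removes the cycle.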