mirror of
https://git.haproxy.org/git/haproxy.git/
synced 2025-08-07 15:47:01 +02:00
BUG/MEDIUM: debug: close a possible race between thread dump and panic()
The rework of the thread dumping mechanism in 2.8 with commit9a6ecbd590
("MEDIUM: debug: simplify the thread dump mechanism") opened a small race, which is that a thread in the process of dumping other ones may block the other one from panicing while it's looping at the end of ha_thread_dump_fill(), or any other sequence involving the currently dumped one. This was emphasized in 3.1 with commit148eb5875f
("DEBUG: wdt: better detect apparently locked up threads and warn about them") that allowed to emit warnings about long-stuck threads, because in this case, what happens is that sometimes a thread starts to emit a warning (or a set of warnings), and while the warning is being awaited for, a panic finally happens and interrupts either the dumping thread, which never finishes and waits for the target's pointer to become NULL which will never happen since it was supposed to do it itself, or the currently dumped thread which could wait for the dumping thread to become ready while this one has not released the former. In order to address this, first we now make sure never to dump a thread that is already in the process of dumping another one. We're adding a new thread flag to know this situation, that is set in ha_thread_dump_fill() and cleared in ha_thread_dump_done(). And similarly, we don't trigger the watchdog on a thread waiting for another one to finish its dump, as it's likely a case of warning (and maybe even a panic) that makes them wait for each other and we don't want such cases to be reentrant. Finally, we check in the main polling loop that the flag never accidentally leaked (e.g. wrong flag manipulation) as this would be difficult to spot with bad consequences. This should be backported at least to 2.8, and should resolve github issue #2860. Thanks to Chris Staite for the very informative backtrace that exhibited the problem.
This commit is contained in:
parent
37e84676c7
commit
7ddcdff33f
@ -64,6 +64,7 @@ enum {
|
|||||||
#define TH_FL_SLEEPING 0x00000008 /* thread won't check its task list before next wakeup */
|
#define TH_FL_SLEEPING 0x00000008 /* thread won't check its task list before next wakeup */
|
||||||
#define TH_FL_STARTED 0x00000010 /* set once the thread starts */
|
#define TH_FL_STARTED 0x00000010 /* set once the thread starts */
|
||||||
#define TH_FL_IN_LOOP 0x00000020 /* set only inside the polling loop */
|
#define TH_FL_IN_LOOP 0x00000020 /* set only inside the polling loop */
|
||||||
|
#define TH_FL_DUMPING_OTHERS 0x00000040 /* thread currently dumping other threads */
|
||||||
|
|
||||||
/* we have 4 buffer-wait queues, in highest to lowest emergency order */
|
/* we have 4 buffer-wait queues, in highest to lowest emergency order */
|
||||||
#define DYNBUF_NBQ 4
|
#define DYNBUF_NBQ 4
|
||||||
|
22
src/debug.c
22
src/debug.c
@ -358,12 +358,23 @@ void ha_thread_dump_one(int thr, int from_signal)
|
|||||||
* there is enough room otherwise some contents will be truncated. The function
|
* there is enough room otherwise some contents will be truncated. The function
|
||||||
* waits for the called thread to fill the buffer before returning (or cancelling
|
* waits for the called thread to fill the buffer before returning (or cancelling
|
||||||
* by reporting NULL). It does not release the called thread yet. It returns a
|
* by reporting NULL). It does not release the called thread yet. It returns a
|
||||||
* pointer to the buffer used if the dump was done, otherwise NULL.
|
* pointer to the buffer used if the dump was done, otherwise NULL. When the
|
||||||
|
* dump starts, it marks the current thread as dumping, which will only be
|
||||||
|
* released via a failure (returns NULL) or via a call to ha_dump_thread_done().
|
||||||
*/
|
*/
|
||||||
struct buffer *ha_thread_dump_fill(struct buffer *buf, int thr)
|
struct buffer *ha_thread_dump_fill(struct buffer *buf, int thr)
|
||||||
{
|
{
|
||||||
struct buffer *old = NULL;
|
struct buffer *old = NULL;
|
||||||
|
|
||||||
|
/* A thread that's currently dumping other threads cannot be dumped, or
|
||||||
|
* it will very likely cause a deadlock.
|
||||||
|
*/
|
||||||
|
if (HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_DUMPING_OTHERS)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
/* This will be undone in ha_thread_dump_done() */
|
||||||
|
HA_ATOMIC_OR(&th_ctx->flags, TH_FL_DUMPING_OTHERS);
|
||||||
|
|
||||||
/* try to impose our dump buffer and to reserve the target thread's
|
/* try to impose our dump buffer and to reserve the target thread's
|
||||||
* next dump for us.
|
* next dump for us.
|
||||||
*/
|
*/
|
||||||
@ -388,8 +399,11 @@ struct buffer *ha_thread_dump_fill(struct buffer *buf, int thr)
|
|||||||
old = HA_ATOMIC_LOAD(&ha_thread_ctx[thr].thread_dump_buffer);
|
old = HA_ATOMIC_LOAD(&ha_thread_ctx[thr].thread_dump_buffer);
|
||||||
if ((ulong)old & 0x1)
|
if ((ulong)old & 0x1)
|
||||||
break;
|
break;
|
||||||
if (!old)
|
if (!old) {
|
||||||
|
/* cancelled: no longer dumping */
|
||||||
|
HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_DUMPING_OTHERS);
|
||||||
return old;
|
return old;
|
||||||
|
}
|
||||||
ha_thread_relax();
|
ha_thread_relax();
|
||||||
}
|
}
|
||||||
return (struct buffer *)((ulong)old & ~0x1UL);
|
return (struct buffer *)((ulong)old & ~0x1UL);
|
||||||
@ -409,11 +423,13 @@ void ha_thread_dump_done(struct buffer *buf, int thr)
|
|||||||
old = HA_ATOMIC_LOAD(&ha_thread_ctx[thr].thread_dump_buffer);
|
old = HA_ATOMIC_LOAD(&ha_thread_ctx[thr].thread_dump_buffer);
|
||||||
if (!((ulong)old & 0x1)) {
|
if (!((ulong)old & 0x1)) {
|
||||||
if (!old)
|
if (!old)
|
||||||
return;
|
break;
|
||||||
ha_thread_relax();
|
ha_thread_relax();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
} while (!HA_ATOMIC_CAS(&ha_thread_ctx[thr].thread_dump_buffer, &old, buf));
|
} while (!HA_ATOMIC_CAS(&ha_thread_ctx[thr].thread_dump_buffer, &old, buf));
|
||||||
|
|
||||||
|
HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_DUMPING_OTHERS);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* dumps into the buffer some information related to task <task> (which may
|
/* dumps into the buffer some information related to task <task> (which may
|
||||||
|
@ -2749,6 +2749,9 @@ void run_poll_loop()
|
|||||||
/* Process a few tasks */
|
/* Process a few tasks */
|
||||||
process_runnable_tasks();
|
process_runnable_tasks();
|
||||||
|
|
||||||
|
/* If this happens this is an accidental leak */
|
||||||
|
BUG_ON(HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_DUMPING_OTHERS);
|
||||||
|
|
||||||
/* also stop if we failed to cleanly stop all tasks */
|
/* also stop if we failed to cleanly stop all tasks */
|
||||||
if (killed > 1)
|
if (killed > 1)
|
||||||
break;
|
break;
|
||||||
|
@ -95,13 +95,15 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
|
|||||||
if (!p)
|
if (!p)
|
||||||
goto update_and_leave;
|
goto update_and_leave;
|
||||||
|
|
||||||
if ((_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_SLEEPING) ||
|
if ((_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & (TH_FL_SLEEPING|TH_FL_DUMPING_OTHERS)) ||
|
||||||
(_HA_ATOMIC_LOAD(&ha_tgroup_ctx[tgrp-1].threads_harmless) & thr_bit)) {
|
(_HA_ATOMIC_LOAD(&ha_tgroup_ctx[tgrp-1].threads_harmless) & thr_bit)) {
|
||||||
/* This thread is currently doing exactly nothing
|
/* This thread is currently doing exactly nothing
|
||||||
* waiting in the poll loop (unlikely but possible),
|
* waiting in the poll loop (unlikely but possible),
|
||||||
* waiting for all other threads to join the rendez-vous
|
* waiting for all other threads to join the rendez-vous
|
||||||
* point (common), or waiting for another thread to
|
* point (common), or waiting for another thread to
|
||||||
* finish an isolated operation (unlikely but possible).
|
* finish an isolated operation (unlikely but possible),
|
||||||
|
* or waiting for another thread to finish dumping its
|
||||||
|
* stack.
|
||||||
*/
|
*/
|
||||||
goto update_and_leave;
|
goto update_and_leave;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user