mirror of
https://git.haproxy.org/git/haproxy.git/
synced 2025-08-06 23:27:04 +02:00
MEDIUM: wdt: always make the faulty thread report its own warnings
Warnings remain tricky to deal with, especially for other threads as they require some inter-thread synchronization that doesn't cope very well with other parallel activities such as "show threads" for example. However there is nothing that forces us to handle them this way. The panic for example is already handled by bouncing the WDT signal to the faulty thread. This commit rearranges the WDT handler to make a better used of this existing signal bouncing feature of the WDT handler so that it's no longer limited to panics but can also deal with warnings. In order not to bounce on all wakeups, we only bounce when there is a suspicion, that is, when the warning timer has been crossed. We'll let the target thread verify the stuck flag and context switch count by itself to decide whether or not to panic, warn, or just do nothing and update the counters. As a bonus, now all warning traces look the same regardless of the reporting thread: call trace(16): | 0x6bc733 <01 00 00 e8 6d e6 de ff]: ha_dump_backtrace+0x73/0x309 > main-0x2570 | 0x6bd37a <00 00 00 e8 d6 fb ff ff]: ha_thread_dump_fill+0xda/0x104 > ha_thread_dump_one | 0x6bd625 <00 00 00 e8 7b fc ff ff]: ha_stuck_warning+0xc5/0x19e > ha_thread_dump_fill | 0x7b2b60 <64 8b 3b e8 00 aa f0 ff]: wdt_handler+0x1f0/0x212 > ha_stuck_warning | 0x7fd7e2cef3a0 <00 00 00 00 0f 1f 40 00]: libpthread:+0x123a0 | 0x7ffc6af9e634 <85 a6 00 00 00 0f 01 f9]: linux-vdso:__vdso_gettimeofday+0x34/0x2b0 | 0x6bad74 <7c 24 10 e8 9c 01 df ff]: sc_conn_io_cb+0x9fa4 > main-0x2400 | 0x67c457 <89 f2 4c 89 e6 41 ff d0]: main+0x1cf147 | 0x67d401 <48 89 df e8 8f ed ff ff]: cli_io_handler+0x191/0xb38 > main+0x1cee80 | 0x6dd605 <40 48 8b 45 60 ff 50 18]: task_process_applet+0x275/0xce9
This commit is contained in:
parent
b24d7f248e
commit
a06c215f08
63
src/wdt.c
63
src/wdt.c
@ -65,6 +65,7 @@ int wdt_ping(int thr)
|
|||||||
void wdt_handler(int sig, siginfo_t *si, void *arg)
|
void wdt_handler(int sig, siginfo_t *si, void *arg)
|
||||||
{
|
{
|
||||||
unsigned long long n, p;
|
unsigned long long n, p;
|
||||||
|
uint prev_ctxsw, curr_ctxsw;
|
||||||
ulong thr_bit;
|
ulong thr_bit;
|
||||||
int thr, tgrp;
|
int thr, tgrp;
|
||||||
|
|
||||||
@ -123,27 +124,23 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
|
|||||||
* at least emit a warning.
|
* at least emit a warning.
|
||||||
*/
|
*/
|
||||||
if (!(_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_STUCK)) {
|
if (!(_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_STUCK)) {
|
||||||
uint prev_ctxsw;
|
/* after one second it's clear that we're stuck */
|
||||||
|
|
||||||
prev_ctxsw = HA_ATOMIC_LOAD(&per_thread_wd_ctx[thr].prev_ctxsw);
|
|
||||||
|
|
||||||
/* only after one second it's clear we're stuck */
|
|
||||||
if (n - p >= 1000000000ULL)
|
if (n - p >= 1000000000ULL)
|
||||||
_HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
|
_HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
|
||||||
|
else if (n - p < (ullong)wdt_warn_blocked_traffic_ns) {
|
||||||
/* have we crossed the warning boundary ? If so we note were we
|
/* if we haven't crossed the warning boundary,
|
||||||
* where, and second time called from the same place will trigger
|
* let's just refresh the reporting thread's timer.
|
||||||
* a warning (unless already stuck).
|
*/
|
||||||
*/
|
goto update_and_leave;
|
||||||
if (n - p >= (ullong)wdt_warn_blocked_traffic_ns) {
|
|
||||||
uint curr_ctxsw = HA_ATOMIC_LOAD(&activity[thr].ctxsw);
|
|
||||||
|
|
||||||
if (curr_ctxsw == prev_ctxsw)
|
|
||||||
ha_stuck_warning(thr);
|
|
||||||
HA_ATOMIC_STORE(&per_thread_wd_ctx[thr].prev_ctxsw, curr_ctxsw);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
goto update_and_leave;
|
/* OK so we've crossed the warning boundary and possibly the
|
||||||
|
* panic one as well. This may only be reported by the original
|
||||||
|
* thread. Let's fall back to the common code below which will
|
||||||
|
* possibly bounce to the reporting thread, which will then
|
||||||
|
* check the ctxsw count and decide whether to do nothing, to
|
||||||
|
* warn, or either panic.
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
/* No doubt now, there's no hop to recover, die loudly! */
|
/* No doubt now, there's no hop to recover, die loudly! */
|
||||||
@ -170,24 +167,38 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* By default we terminate. If we're not on the victim thread, better
|
/* Right here, we either got a bounce from another thread's WDT to
|
||||||
* bounce the signal there so that we produce a cleaner stack trace
|
* report a crossed period, or we noticed it for the current thread.
|
||||||
* with the other thread interrupted exactly where it was running and
|
* For other threads, we're bouncing.
|
||||||
* the current one not involved in this.
|
|
||||||
*/
|
*/
|
||||||
#ifdef USE_THREAD
|
#ifdef USE_THREAD
|
||||||
if (thr != tid)
|
if (thr != tid) {
|
||||||
ha_tkill(thr, sig);
|
ha_tkill(thr, sig);
|
||||||
else
|
goto leave;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Now the interesting things begin. The timer was at least as large
|
||||||
|
* as the warning threshold. If the stuck bit was set, we must now
|
||||||
|
* panic. Otherwise we're checking if we're still context-switching
|
||||||
|
* or not and we'll either warn if not, or just update the ctxsw
|
||||||
|
* counter to check next time.
|
||||||
|
*/
|
||||||
|
if (_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_STUCK)
|
||||||
ha_panic();
|
ha_panic();
|
||||||
|
|
||||||
_HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_IN_WDT_HANDLER);
|
prev_ctxsw = per_thread_wd_ctx[tid].prev_ctxsw;
|
||||||
return;
|
curr_ctxsw = activity[tid].ctxsw;
|
||||||
|
|
||||||
|
if (curr_ctxsw == prev_ctxsw)
|
||||||
|
ha_stuck_warning(tid);
|
||||||
|
else
|
||||||
|
per_thread_wd_ctx[tid].prev_ctxsw = curr_ctxsw;
|
||||||
|
/* let's go on */
|
||||||
|
|
||||||
update_and_leave:
|
update_and_leave:
|
||||||
wdt_ping(thr);
|
wdt_ping(thr);
|
||||||
|
leave:
|
||||||
_HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_IN_WDT_HANDLER);
|
_HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_IN_WDT_HANDLER);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user