DEBUG: wdt: make the blocked traffic warning delay configurable

The new global "warn-blocked-traffic-after" allows one to configure
after how much time a warning should be emitted when traffic is blocked.
This commit is contained in:
Willy Tarreau 2024-11-06 17:48:41 +01:00
parent 7337c42224
commit 6127e5a4e9
3 changed files with 89 additions and 9 deletions

View File

@ -1562,6 +1562,7 @@ The following keywords are supported in the "global" section :
* Debugging
- anonkey
- quiet
- warn-blocked-traffic-after
- zero-warning
* HTTPClient
@ -4394,6 +4395,24 @@ quiet
Do not display any message during startup. It is equivalent to the command-
line argument "-q".
warn-blocked-traffic-after <time>
This allows to adjust the delay after which a stuck task blocking the traffic
will trigger the emission of a warning on the standard error output. The
delay is expressed in milliseconds and defaults to 1000 ms. Permitted values
must be comprised between 1 ms and 1000 ms included. Lower values will
trigger warnings frequently and higher ones will rarely. The watchdog will
kill a runaway task that fails to respond twice for one second anyway, so a
1000 ms warning delay will normally not trigger any warning. It is
recommended to stay with values between 10 and 100ms to detect configuration
anomalies that may degrade the user's experience, causing long response times
or jerkiness on interactive sessions. For example, a poorly designed Lua
sample-fetch function doing heavy computations, or a very large map_reg or
map_regm map file with a very high evaluation cost may cause such trouble.
For comparison a TLS handshake can eat between one and two milliseconds, and
compressing a 16kB HTTP response buffer is around one millisecond. The output
contains a thread dump of the offending task with a backtrace and some
context that helps figure where the time is being spent.
zero-warning
When this option is set, HAProxy will refuse to start if any warning was
emitted while processing the configuration and applying it. It means that

View File

@ -760,9 +760,10 @@ void ha_stuck_warning(int thr)
" this must remain exceptional because the system's stability is now at risk.\n"
" Timers in logs may be reported incorrectly, spurious timeouts may happen,\n"
" some incoming connections may silently be dropped, health checks may\n"
" randomly fail, and accesses to the CLI may block the whole process. Please\n"
" check the trace below for any clues about configuration elements that need\n"
" to be corrected:\n\n",
" randomly fail, and accesses to the CLI may block the whole process. The\n"
" blocking delay before emitting this warning may be adjusted via the global\n"
" 'warn-blocked-traffic-after' directive. Please check the trace below for\n"
" any clues about configuration elements that need to be corrected:\n\n",
thr + 1, (n - p) / 1000000ULL,
HA_ATOMIC_LOAD(&ha_thread_ctx[thr].stream_cnt));

View File

@ -14,6 +14,7 @@
#include <haproxy/activity.h>
#include <haproxy/api.h>
#include <haproxy/cfgparse.h>
#include <haproxy/clock.h>
#include <haproxy/debug.h>
#include <haproxy/errors.h>
@ -42,6 +43,9 @@ static struct {
uint prev_ctxsw;
} per_thread_wd_ctx[MAX_THREADS];
/* warn about stuck tasks after this delay (ns) */
static unsigned int wdt_warn_blocked_traffic_ns = 1000000000U;
/* Setup (or ping) the watchdog timer for thread <thr>. Returns non-zero on
* success, zero on failure. It interrupts once per second of CPU time. It
* happens that timers based on the CPU time are not automatically re-armed
@ -51,7 +55,8 @@ int wdt_ping(int thr)
{
struct itimerspec its;
its.it_value.tv_sec = 1; its.it_value.tv_nsec = 0;
its.it_value.tv_sec = wdt_warn_blocked_traffic_ns / 1000000000U;
its.it_value.tv_nsec = wdt_warn_blocked_traffic_ns % 1000000000U;
its.it_interval.tv_sec = 0; its.it_interval.tv_nsec = 0;
return timer_settime(per_thread_wd_ctx[thr].timer, 0, &its, NULL) == 0;
}
@ -87,7 +92,7 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
/* not yet reached the deadline of 1 sec,
* or p wasn't initialized yet
*/
if (!p || n - p < 1000000000UL)
if (!p)
goto update_and_leave;
if ((_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_SLEEPING) ||
@ -115,11 +120,22 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
if (!(_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_STUCK)) {
uint prev_ctxsw;
_HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
prev_ctxsw = HA_ATOMIC_LOAD(&per_thread_wd_ctx[tid].prev_ctxsw);
if (HA_ATOMIC_LOAD(&activity[thr].ctxsw) == prev_ctxsw)
ha_stuck_warning(thr);
HA_ATOMIC_STORE(&activity[thr].ctxsw, prev_ctxsw);
/* only after one second it's clear we're stuck */
if (n - p >= 1000000000ULL)
_HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
/* have we crossed the warning boundary ? If so we note were we
* where, and second time called from the same place will trigger
* a warning (unless already stuck).
*/
if (n - p >= (ullong)wdt_warn_blocked_traffic_ns) {
if (HA_ATOMIC_LOAD(&activity[thr].ctxsw) == prev_ctxsw)
ha_stuck_warning(thr);
HA_ATOMIC_STORE(&activity[thr].ctxsw, prev_ctxsw);
}
goto update_and_leave;
}
@ -163,6 +179,44 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
wdt_ping(thr);
}
/* parse the "warn-blocked-traffic-after" parameter */
static int wdt_parse_warn_blocked(char **args, int section_type, struct proxy *curpx,
const struct proxy *defpx, const char *file, int line,
char **err)
{
const char *res;
uint value;
if (!*args[1]) {
memprintf(err, "'%s' expects <time> as argument between 1 and 1000 ms.\n", args[0]);
return -1;
}
res = parse_time_err(args[1], &value, TIME_UNIT_MS);
if (res == PARSE_TIME_OVER) {
memprintf(err, "timer overflow in argument '%s' to '%s' (maximum value is 1000 ms)",
args[1], args[0]);
return -1;
}
else if (res == PARSE_TIME_UNDER) {
memprintf(err, "timer underflow in argument '%s' to '%s' (minimum value is 1 ms)",
args[1], args[0]);
return -1;
}
else if (res) {
memprintf(err, "unexpected character '%c' in argument to <%s>.\n", *res, args[0]);
return -1;
}
else if (value > 1000 || value < 1) {
memprintf(err, "timer out of range in argument '%s' to '%s' (value must be between 1 and 1000 ms)",
args[1], args[0]);
return -1;
}
wdt_warn_blocked_traffic_ns = value * 1000000U;
return 0;
}
int init_wdt_per_thread()
{
if (!clock_setup_signal_timer(&per_thread_wd_ctx[tid].timer, WDTSIG, tid))
@ -202,6 +256,12 @@ int init_wdt()
return ERR_NONE;
}
static struct cfg_kw_list cfg_kws = {ILH, {
{ CFG_GLOBAL, "warn-blocked-traffic-after", wdt_parse_warn_blocked },
{ 0, NULL, NULL },
}};
INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
REGISTER_POST_CHECK(init_wdt);
REGISTER_PER_THREAD_INIT(init_wdt_per_thread);
REGISTER_PER_THREAD_DEINIT(deinit_wdt_per_thread);