mirror of
https://git.haproxy.org/git/haproxy.git/
synced 2026-05-04 20:46:11 +02:00
DEBUG: wdt: make the blocked traffic warning delay configurable
The new global "warn-blocked-traffic-after" allows one to configure after how much time a warning should be emitted when traffic is blocked.
This commit is contained in:
parent
7337c42224
commit
6127e5a4e9
@ -1562,6 +1562,7 @@ The following keywords are supported in the "global" section :
|
||||
* Debugging
|
||||
- anonkey
|
||||
- quiet
|
||||
- warn-blocked-traffic-after
|
||||
- zero-warning
|
||||
|
||||
* HTTPClient
|
||||
@ -4394,6 +4395,24 @@ quiet
|
||||
Do not display any message during startup. It is equivalent to the command-
|
||||
line argument "-q".
|
||||
|
||||
warn-blocked-traffic-after <time>
|
||||
This allows to adjust the delay after which a stuck task blocking the traffic
|
||||
will trigger the emission of a warning on the standard error output. The
|
||||
delay is expressed in milliseconds and defaults to 1000 ms. Permitted values
|
||||
must be comprised between 1 ms and 1000 ms included. Lower values will
|
||||
trigger warnings frequently and higher ones will rarely. The watchdog will
|
||||
kill a runaway task that fails to respond twice for one second anyway, so a
|
||||
1000 ms warning delay will normally not trigger any warning. It is
|
||||
recommended to stay with values between 10 and 100ms to detect configuration
|
||||
anomalies that may degrade the user's experience, causing long response times
|
||||
or jerkiness on interactive sessions. For example, a poorly designed Lua
|
||||
sample-fetch function doing heavy computations, or a very large map_reg or
|
||||
map_regm map file with a very high evaluation cost may cause such trouble.
|
||||
For comparison a TLS handshake can eat between one and two milliseconds, and
|
||||
compressing a 16kB HTTP response buffer is around one millisecond. The output
|
||||
contains a thread dump of the offending task with a backtrace and some
|
||||
context that helps figure where the time is being spent.
|
||||
|
||||
zero-warning
|
||||
When this option is set, HAProxy will refuse to start if any warning was
|
||||
emitted while processing the configuration and applying it. It means that
|
||||
|
||||
@ -760,9 +760,10 @@ void ha_stuck_warning(int thr)
|
||||
" this must remain exceptional because the system's stability is now at risk.\n"
|
||||
" Timers in logs may be reported incorrectly, spurious timeouts may happen,\n"
|
||||
" some incoming connections may silently be dropped, health checks may\n"
|
||||
" randomly fail, and accesses to the CLI may block the whole process. Please\n"
|
||||
" check the trace below for any clues about configuration elements that need\n"
|
||||
" to be corrected:\n\n",
|
||||
" randomly fail, and accesses to the CLI may block the whole process. The\n"
|
||||
" blocking delay before emitting this warning may be adjusted via the global\n"
|
||||
" 'warn-blocked-traffic-after' directive. Please check the trace below for\n"
|
||||
" any clues about configuration elements that need to be corrected:\n\n",
|
||||
thr + 1, (n - p) / 1000000ULL,
|
||||
HA_ATOMIC_LOAD(&ha_thread_ctx[thr].stream_cnt));
|
||||
|
||||
|
||||
72
src/wdt.c
72
src/wdt.c
@ -14,6 +14,7 @@
|
||||
|
||||
#include <haproxy/activity.h>
|
||||
#include <haproxy/api.h>
|
||||
#include <haproxy/cfgparse.h>
|
||||
#include <haproxy/clock.h>
|
||||
#include <haproxy/debug.h>
|
||||
#include <haproxy/errors.h>
|
||||
@ -42,6 +43,9 @@ static struct {
|
||||
uint prev_ctxsw;
|
||||
} per_thread_wd_ctx[MAX_THREADS];
|
||||
|
||||
/* warn about stuck tasks after this delay (ns) */
|
||||
static unsigned int wdt_warn_blocked_traffic_ns = 1000000000U;
|
||||
|
||||
/* Setup (or ping) the watchdog timer for thread <thr>. Returns non-zero on
|
||||
* success, zero on failure. It interrupts once per second of CPU time. It
|
||||
* happens that timers based on the CPU time are not automatically re-armed
|
||||
@ -51,7 +55,8 @@ int wdt_ping(int thr)
|
||||
{
|
||||
struct itimerspec its;
|
||||
|
||||
its.it_value.tv_sec = 1; its.it_value.tv_nsec = 0;
|
||||
its.it_value.tv_sec = wdt_warn_blocked_traffic_ns / 1000000000U;
|
||||
its.it_value.tv_nsec = wdt_warn_blocked_traffic_ns % 1000000000U;
|
||||
its.it_interval.tv_sec = 0; its.it_interval.tv_nsec = 0;
|
||||
return timer_settime(per_thread_wd_ctx[thr].timer, 0, &its, NULL) == 0;
|
||||
}
|
||||
@ -87,7 +92,7 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
|
||||
/* not yet reached the deadline of 1 sec,
|
||||
* or p wasn't initialized yet
|
||||
*/
|
||||
if (!p || n - p < 1000000000UL)
|
||||
if (!p)
|
||||
goto update_and_leave;
|
||||
|
||||
if ((_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_SLEEPING) ||
|
||||
@ -115,11 +120,22 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
|
||||
if (!(_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_STUCK)) {
|
||||
uint prev_ctxsw;
|
||||
|
||||
_HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
|
||||
prev_ctxsw = HA_ATOMIC_LOAD(&per_thread_wd_ctx[tid].prev_ctxsw);
|
||||
if (HA_ATOMIC_LOAD(&activity[thr].ctxsw) == prev_ctxsw)
|
||||
ha_stuck_warning(thr);
|
||||
HA_ATOMIC_STORE(&activity[thr].ctxsw, prev_ctxsw);
|
||||
|
||||
/* only after one second it's clear we're stuck */
|
||||
if (n - p >= 1000000000ULL)
|
||||
_HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
|
||||
|
||||
/* have we crossed the warning boundary ? If so we note were we
|
||||
* where, and second time called from the same place will trigger
|
||||
* a warning (unless already stuck).
|
||||
*/
|
||||
if (n - p >= (ullong)wdt_warn_blocked_traffic_ns) {
|
||||
if (HA_ATOMIC_LOAD(&activity[thr].ctxsw) == prev_ctxsw)
|
||||
ha_stuck_warning(thr);
|
||||
HA_ATOMIC_STORE(&activity[thr].ctxsw, prev_ctxsw);
|
||||
}
|
||||
|
||||
goto update_and_leave;
|
||||
}
|
||||
|
||||
@ -163,6 +179,44 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
|
||||
wdt_ping(thr);
|
||||
}
|
||||
|
||||
/* parse the "warn-blocked-traffic-after" parameter */
|
||||
static int wdt_parse_warn_blocked(char **args, int section_type, struct proxy *curpx,
|
||||
const struct proxy *defpx, const char *file, int line,
|
||||
char **err)
|
||||
{
|
||||
const char *res;
|
||||
uint value;
|
||||
|
||||
if (!*args[1]) {
|
||||
memprintf(err, "'%s' expects <time> as argument between 1 and 1000 ms.\n", args[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
res = parse_time_err(args[1], &value, TIME_UNIT_MS);
|
||||
if (res == PARSE_TIME_OVER) {
|
||||
memprintf(err, "timer overflow in argument '%s' to '%s' (maximum value is 1000 ms)",
|
||||
args[1], args[0]);
|
||||
return -1;
|
||||
}
|
||||
else if (res == PARSE_TIME_UNDER) {
|
||||
memprintf(err, "timer underflow in argument '%s' to '%s' (minimum value is 1 ms)",
|
||||
args[1], args[0]);
|
||||
return -1;
|
||||
}
|
||||
else if (res) {
|
||||
memprintf(err, "unexpected character '%c' in argument to <%s>.\n", *res, args[0]);
|
||||
return -1;
|
||||
}
|
||||
else if (value > 1000 || value < 1) {
|
||||
memprintf(err, "timer out of range in argument '%s' to '%s' (value must be between 1 and 1000 ms)",
|
||||
args[1], args[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
wdt_warn_blocked_traffic_ns = value * 1000000U;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int init_wdt_per_thread()
|
||||
{
|
||||
if (!clock_setup_signal_timer(&per_thread_wd_ctx[tid].timer, WDTSIG, tid))
|
||||
@ -202,6 +256,12 @@ int init_wdt()
|
||||
return ERR_NONE;
|
||||
}
|
||||
|
||||
static struct cfg_kw_list cfg_kws = {ILH, {
|
||||
{ CFG_GLOBAL, "warn-blocked-traffic-after", wdt_parse_warn_blocked },
|
||||
{ 0, NULL, NULL },
|
||||
}};
|
||||
|
||||
INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
|
||||
REGISTER_POST_CHECK(init_wdt);
|
||||
REGISTER_PER_THREAD_INIT(init_wdt_per_thread);
|
||||
REGISTER_PER_THREAD_DEINIT(deinit_wdt_per_thread);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user