mirror of
https://git.haproxy.org/git/haproxy.git/
synced 2025-11-13 15:00:59 +01:00
Return ERR_NONE instead of 0 on success for all config callbacks that should return ERR_* codes. There is no functional change because ERR_NONE is a macro equal to 0, but this makes the return value more explicit.
184 lines
5.1 KiB
C
184 lines
5.1 KiB
C
/*
|
|
* Thread lockup detection
|
|
*
|
|
* Copyright 2000-2019 Willy Tarreau <willy@haproxy.org>.
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*/
|
|
|
|
#include <signal.h>
|
|
#include <time.h>
|
|
|
|
#include <haproxy/api.h>
|
|
#include <haproxy/debug.h>
|
|
#include <haproxy/errors.h>
|
|
#include <haproxy/global.h>
|
|
#include <haproxy/thread.h>
|
|
#include <haproxy/tools.h>
|
|
|
|
|
|
/*
 * The watchdog relies on timer_create() and timer_settime(), which are only
 * available when the condition below is met.
 */
|
|
#if defined(USE_RT) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME)
|
|
|
|
/* Setup (or ping) the watchdog timer for thread <thr>. Returns non-zero on
|
|
* success, zero on failure. It interrupts once per second of CPU time. It
|
|
* happens that timers based on the CPU time are not automatically re-armed
|
|
* so we only use the value and leave the interval unset.
|
|
*/
|
|
int wdt_ping(int thr)
|
|
{
|
|
struct itimerspec its;
|
|
|
|
its.it_value.tv_sec = 1; its.it_value.tv_nsec = 0;
|
|
its.it_interval.tv_sec = 0; its.it_interval.tv_nsec = 0;
|
|
return timer_settime(ha_thread_info[thr].wd_timer, 0, &its, NULL) == 0;
|
|
}
|
|
|
|
/* This is the WDTSIG signal handler */
|
|
void wdt_handler(int sig, siginfo_t *si, void *arg)
|
|
{
|
|
unsigned long long n, p;
|
|
int thr;
|
|
|
|
switch (si->si_code) {
|
|
case SI_TIMER:
|
|
/* A thread's timer fired, the thread ID is in si_int. We have
|
|
* no guarantee that the thread handling this signal is in any
|
|
* way related to the one triggering it, so we need to retrieve
|
|
* the thread number from there. Note: this thread might
|
|
* continue to execute in parallel.
|
|
*/
|
|
thr = si->si_value.sival_int;
|
|
|
|
/* cannot happen unless an unknown timer tries to play with our
|
|
* nerves. Let's die for now if this happens.
|
|
*/
|
|
if (thr < 0 || thr >= global.nbthread)
|
|
break;
|
|
|
|
p = ha_thread_info[thr].prev_cpu_time;
|
|
n = now_cpu_time_thread(&ha_thread_info[thr]);
|
|
|
|
/* not yet reached the deadline of 1 sec */
|
|
if (n - p < 1000000000UL)
|
|
goto update_and_leave;
|
|
|
|
if ((threads_harmless_mask|sleeping_thread_mask|threads_to_dump) & (1UL << thr)) {
|
|
/* This thread is currently doing exactly nothing
|
|
* waiting in the poll loop (unlikely but possible),
|
|
* waiting for all other threads to join the rendez-vous
|
|
* point (common), or waiting for another thread to
|
|
* finish an isolated operation (unlikely but possible).
|
|
*/
|
|
goto update_and_leave;
|
|
}
|
|
|
|
/* So the thread indeed appears locked up. In order to be
|
|
* certain that we're not witnessing an exceptional spike of
|
|
* CPU usage due to a configuration issue (like running tens
|
|
* of thousands of tasks in a single loop), we'll check if the
|
|
* scheduler is still alive by setting the TI_FL_STUCK flag
|
|
* that the scheduler clears when switching to the next task.
|
|
* If it's already set, then it's our second call with no
|
|
* progress and the thread is dead.
|
|
*/
|
|
if (!(ha_thread_info[thr].flags & TI_FL_STUCK)) {
|
|
_HA_ATOMIC_OR(&ha_thread_info[thr].flags, TI_FL_STUCK);
|
|
goto update_and_leave;
|
|
}
|
|
|
|
/* No doubt now, there's no hop to recover, die loudly! */
|
|
break;
|
|
#ifdef USE_THREAD
|
|
case SI_TKILL:
|
|
/* we got a pthread_kill, stop on it */
|
|
thr = tid;
|
|
break;
|
|
#endif
|
|
default:
|
|
/* unhandled other conditions */
|
|
return;
|
|
}
|
|
|
|
/* By default we terminate. If we're not on the victim thread, better
|
|
* bounce the signal there so that we produce a cleaner stack trace
|
|
* with the other thread interrupted exactly where it was running and
|
|
* the current one not involved in this.
|
|
*/
|
|
#ifdef USE_THREAD
|
|
if (thr != tid)
|
|
pthread_kill(ha_thread_info[thr].pthread, sig);
|
|
else
|
|
#endif
|
|
ha_panic();
|
|
return;
|
|
|
|
update_and_leave:
|
|
wdt_ping(thr);
|
|
}
|
|
|
|
int init_wdt_per_thread()
|
|
{
|
|
struct sigevent sev = { };
|
|
sigset_t set;
|
|
|
|
/* unblock the WDTSIG signal we intend to use */
|
|
sigemptyset(&set);
|
|
sigaddset(&set, WDTSIG);
|
|
ha_sigmask(SIG_UNBLOCK, &set, NULL);
|
|
|
|
/* this timer will signal WDTSIG when it fires, with tid in the si_int
|
|
* field (important since any thread will receive the signal).
|
|
*/
|
|
sev.sigev_notify = SIGEV_SIGNAL;
|
|
sev.sigev_signo = WDTSIG;
|
|
sev.sigev_value.sival_int = tid;
|
|
if (timer_create(ti->clock_id, &sev, &ti->wd_timer) == -1 &&
|
|
timer_create(CLOCK_REALTIME, &sev, &ti->wd_timer) == -1)
|
|
goto fail1;
|
|
|
|
if (!wdt_ping(tid))
|
|
goto fail2;
|
|
|
|
return 1;
|
|
|
|
fail2:
|
|
timer_delete(ti->wd_timer);
|
|
fail1:
|
|
ti->wd_timer = TIMER_INVALID;
|
|
ha_warning("Failed to setup watchdog timer for thread %u, disabling lockup detection.\n", tid);
|
|
return 1;
|
|
}
|
|
|
|
void deinit_wdt_per_thread()
|
|
{
|
|
if (ti->wd_timer != TIMER_INVALID)
|
|
timer_delete(ti->wd_timer);
|
|
}
|
|
|
|
/* registers the watchdog signal handler and returns 0. This sets up the signal
|
|
* handler for WDTSIG, so it must be called once per process.
|
|
*/
|
|
int init_wdt()
|
|
{
|
|
struct sigaction sa;
|
|
|
|
sa.sa_handler = NULL;
|
|
sa.sa_sigaction = wdt_handler;
|
|
sigemptyset(&sa.sa_mask);
|
|
sa.sa_flags = SA_SIGINFO;
|
|
sigaction(WDTSIG, &sa, NULL);
|
|
return ERR_NONE;
|
|
}
|
|
|
|
/* Hook the watchdog into the program's life cycle through the registration
 * macros: init_wdt() as a post-check callback, init_wdt_per_thread() as a
 * per-thread init callback and deinit_wdt_per_thread() as a per-thread deinit
 * callback.
 * NOTE(review): the exact moment each callback is invoked is defined by the
 * REGISTER_* macros, which are not visible in this file - confirm there.
 */
REGISTER_POST_CHECK(init_wdt);
|
|
REGISTER_PER_THREAD_INIT(init_wdt_per_thread);
|
|
REGISTER_PER_THREAD_DEINIT(deinit_wdt_per_thread);
|
|
#endif
|