haproxy/src/wdt.c
Willy Tarreau aeed4a85d6 REORG: include: move log.h to haproxy/log{,-t}.h
The current state of the logging is a real mess. The main problem is
that almost all files include log.h just in order to have access to
the alert/warning functions like ha_alert() etc, and don't care about
logs. But log.h also deals with real logging as well as log-format and
depends on stream.h and various other things. As such it forces a few
heavy files like stream.h to be loaded early and to hide missing
dependencies depending where it's loaded. Among the missing ones is
syslog.h which was often automatically included resulting in no less
than 3 users missing it.

Among 76 users, only 5 could be removed, and probably 70 don't need the
full set of dependencies.

A good approach would consist in splitting that file in 3 parts:
  - one for error output ("errors" ?).
  - one for log_format processing
  - and one for actual logging.
2020-06-11 10:18:58 +02:00

184 lines
5.1 KiB
C

/*
* Thread lockup detection
*
* Copyright 2000-2019 Willy Tarreau <willy@haproxy.org>.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <signal.h>
#include <time.h>
#include <haproxy/api.h>
#include <haproxy/debug.h>
#include <haproxy/global.h>
#include <haproxy/log.h>
#include <haproxy/thread.h>
#include <haproxy/tools.h>
/*
* It relies on timer_create() and timer_settime() which are only available in
* this case.
*/
#if defined(USE_RT) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME)
/* Setup (or ping) the watchdog timer for thread <thr>. Returns non-zero on
* success, zero on failure. It interrupts once per second of CPU time. It
* happens that timers based on the CPU time are not automatically re-armed
* so we only use the value and leave the interval unset.
*/
int wdt_ping(int thr)
{
struct itimerspec its;
its.it_value.tv_sec = 1; its.it_value.tv_nsec = 0;
its.it_interval.tv_sec = 0; its.it_interval.tv_nsec = 0;
return timer_settime(ha_thread_info[thr].wd_timer, 0, &its, NULL) == 0;
}
/* This is the WDTSIG signal handler */
void wdt_handler(int sig, siginfo_t *si, void *arg)
{
unsigned long long n, p;
int thr;
switch (si->si_code) {
case SI_TIMER:
/* A thread's timer fired, the thread ID is in si_int. We have
* no guarantee that the thread handling this signal is in any
* way related to the one triggering it, so we need to retrieve
* the thread number from there. Note: this thread might
* continue to execute in parallel.
*/
thr = si->si_value.sival_int;
/* cannot happen unless an unknown timer tries to play with our
* nerves. Let's die for now if this happens.
*/
if (thr < 0 || thr >= global.nbthread)
break;
p = ha_thread_info[thr].prev_cpu_time;
n = now_cpu_time_thread(&ha_thread_info[thr]);
/* not yet reached the deadline of 1 sec */
if (n - p < 1000000000UL)
goto update_and_leave;
if ((threads_harmless_mask|sleeping_thread_mask|threads_to_dump) & (1UL << thr)) {
/* This thread is currently doing exactly nothing
* waiting in the poll loop (unlikely but possible),
* waiting for all other threads to join the rendez-vous
* point (common), or waiting for another thread to
* finish an isolated operation (unlikely but possible).
*/
goto update_and_leave;
}
/* So the thread indeed appears locked up. In order to be
* certain that we're not witnessing an exceptional spike of
* CPU usage due to a configuration issue (like running tens
* of thousands of tasks in a single loop), we'll check if the
* scheduler is still alive by setting the TI_FL_STUCK flag
* that the scheduler clears when switching to the next task.
* If it's already set, then it's our second call with no
* progress and the thread is dead.
*/
if (!(ha_thread_info[thr].flags & TI_FL_STUCK)) {
_HA_ATOMIC_OR(&ha_thread_info[thr].flags, TI_FL_STUCK);
goto update_and_leave;
}
/* No doubt now, there's no hop to recover, die loudly! */
break;
#ifdef USE_THREAD
case SI_TKILL:
/* we got a pthread_kill, stop on it */
thr = tid;
break;
#endif
default:
/* unhandled other conditions */
return;
}
/* By default we terminate. If we're not on the victim thread, better
* bounce the signal there so that we produce a cleaner stack trace
* with the other thread interrupted exactly where it was running and
* the current one not involved in this.
*/
#ifdef USE_THREAD
if (thr != tid)
pthread_kill(ha_thread_info[thr].pthread, sig);
else
#endif
ha_panic();
return;
update_and_leave:
wdt_ping(thr);
}
int init_wdt_per_thread()
{
struct sigevent sev = { };
sigset_t set;
/* unblock the WDTSIG signal we intend to use */
sigemptyset(&set);
sigaddset(&set, WDTSIG);
ha_sigmask(SIG_UNBLOCK, &set, NULL);
/* this timer will signal WDTSIG when it fires, with tid in the si_int
* field (important since any thread will receive the signal).
*/
sev.sigev_notify = SIGEV_SIGNAL;
sev.sigev_signo = WDTSIG;
sev.sigev_value.sival_int = tid;
if (timer_create(ti->clock_id, &sev, &ti->wd_timer) == -1 &&
timer_create(CLOCK_REALTIME, &sev, &ti->wd_timer) == -1)
goto fail1;
if (!wdt_ping(tid))
goto fail2;
return 1;
fail2:
timer_delete(ti->wd_timer);
fail1:
ti->wd_timer = TIMER_INVALID;
ha_warning("Failed to setup watchdog timer for thread %u, disabling lockup detection.\n", tid);
return 1;
}
void deinit_wdt_per_thread()
{
if (ti->wd_timer != TIMER_INVALID)
timer_delete(ti->wd_timer);
}
/* registers the watchdog signal handler and returns 0. This sets up the signal
* handler for WDTSIG, so it must be called once per process.
*/
int init_wdt()
{
struct sigaction sa;
sa.sa_handler = NULL;
sa.sa_sigaction = wdt_handler;
sigemptyset(&sa.sa_mask);
sa.sa_flags = SA_SIGINFO;
sigaction(WDTSIG, &sa, NULL);
return 0;
}
REGISTER_POST_CHECK(init_wdt);
REGISTER_PER_THREAD_INIT(init_wdt_per_thread);
REGISTER_PER_THREAD_DEINIT(deinit_wdt_per_thread);
#endif