From 7fa70da06db3e69a61e4451a82a4f980b40b4d0c Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 27 Jan 2025 15:41:26 +0100 Subject: [PATCH] MINOR: epoll: permit to mask certain specific events A few times in the past we've seen cases where epoll was caught reporting a wrong event that caused trouble (e.g. spuriously reporting HUP or RDHUP after a successful connect()). The new tune.epoll.mask-events directive permits to mask events such as ERR, HUP and RDHUP and convert them to IN events that are processed by the regular receive path. This should help better diagnose and troubleshoot issues such as this one, as well as rule out such a cause when similar issues are reported: https://github.com/haproxy/haproxy/issues/2368 https://www.spinics.net/lists/netdev/msg876470.html It should be harmless to backport this if necessary. --- doc/configuration.txt | 26 +++++++++++++++++++++++++ src/ev_epoll.c | 45 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/doc/configuration.txt b/doc/configuration.txt index 2db2f36d6..da9d8b540 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -1639,6 +1639,7 @@ The following keywords are supported in the "global" section : - tune.comp.maxlevel - tune.disable-fast-forward - tune.disable-zero-copy-forwarding + - tune.epoll.mask-events - tune.events.max-events-at-once - tune.fail-alloc - tune.fd.edge-triggered @@ -3549,6 +3550,31 @@ tune.disable-zero-copy-forwarding tune.h1.zero-copy-fwd-recv, tune.h1.zero-copy-fwd-send, tune.h2.zero-copy-fwd-send, tune.quic.zero-copy-fwd-send +tune.epoll.mask-events + Along HAProxy's history, a few complex issues were met that were caused by + bugs in the epoll mechanism in the Linux kernel. These ones usually are very + rare and unreproducible outside the reporter's environment, and may only be + worked around by disabling epoll and switching to poll instead, which is not + very satisfying for high performance environments. 
Each time, issues affect + only very specific (and rare) event types, and offering the ability to mask + them can constitute a more acceptable work-around. This option offers this + possibility by permitting to silently ignore a few uncommon events + and replace them with an input (which reports an unspecified incoming event). + The effect is to avoid the fast error processing paths in certain places and + only use the common paths. This should never be used unless invited to + do so by an expert in order to diagnose or work around a kernel bug. + + The option takes a single argument which is a comma-delimited list of words + each designating an event to be masked. The currently supported list of + events is: + - "err": mask the EPOLLERR events + - "hup": mask the EPOLLHUP events + - "rdhup": mask the EPOLLRDHUP events + + Example: + # mask all non-traffic epoll events: + tune.epoll.mask-events err,hup,rdhup + tune.events.max-events-at-once Sets the number of events that may be processed at once by an asynchronous task handler (from event_hdl API). 
should be included between 1 diff --git a/src/ev_epoll.c b/src/ev_epoll.c index 9e7050c73..11e99310a 100644 --- a/src/ev_epoll.c +++ b/src/ev_epoll.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -28,6 +29,7 @@ /* private data */ static THREAD_LOCAL struct epoll_event *epoll_events = NULL; static int epoll_fd[MAX_THREADS] __read_mostly; // per-thread epoll_fd +static uint epoll_mask = 0; // events to be masked and turned to EPOLLIN #ifndef EPOLLRDHUP /* EPOLLRDHUP was defined late in libc, and it appeared in kernel 2.6.17 */ @@ -150,6 +152,7 @@ static void _update_fd(int fd) ev.events |= EPOLLOUT; done: + ev.events &= ~epoll_mask; ev.data.fd = fd; epoll_ctl(epoll_fd[tid], opcode, fd, &ev); } @@ -259,6 +262,11 @@ static void _do_poll(struct poller *p, int exp, int wake) #ifdef DEBUG_FD _HA_ATOMIC_INC(&fdtab[fd].event_count); #endif + if (e & epoll_mask) { + e |= EPOLLIN; + e &= ~epoll_mask; + } + n = ((e & EPOLLIN) ? FD_EV_READY_R : 0) | ((e & EPOLLOUT) ? FD_EV_READY_W : 0) | ((e & EPOLLRDHUP) ? 
FD_EV_SHUT_R : 0) | @@ -404,6 +412,43 @@ static void _do_register(void) p->fork = _do_fork; } +/* config parser for global "tune.epoll.mask-events", accepts "err", "hup", "rdhup". args[1] is a comma-delimited list of these words and is split in place: each comma found is overwritten with a NUL byte. epoll_mask is first reset to zero and then rebuilt by OR-ing EPOLLERR, EPOLLHUP or EPOLLRDHUP for each listed word, so an empty or missing list leaves no event masked. Returns 0 on success. Returns -1 when too_many_args() reports an error, or when a word is not recognized, in which case an error message is stored through err using memprintf() and epoll_mask keeps the bits of the words parsed before the unknown one. */ +static int cfg_parse_tune_epoll_mask_events(char **args, int section_type, struct proxy *curpx, + const struct proxy *defpx, const char *file, int line, + char **err) +{ + char *comma, *kw; + + if (too_many_args(1, args, err, NULL)) + return -1; + + epoll_mask = 0; + for (kw = args[1]; kw && *kw; kw = comma) { + comma = strchr(kw, ','); + if (comma) + *(comma++) = 0; + + if (strcmp(kw, "err") == 0) + epoll_mask |= EPOLLERR; + else if (strcmp(kw, "hup") == 0) + epoll_mask |= EPOLLHUP; + else if (strcmp(kw, "rdhup") == 0) + epoll_mask |= EPOLLRDHUP; + else { + memprintf(err, "'%s' expects a comma-delimited list of 'err', 'hup' and 'rdhup' but got '%s'.", args[0], kw); + return -1; + } + } + return 0; +} + +/* config keyword parsers: associates the CFG_GLOBAL keyword "tune.epoll.mask-events" with its parser above; the list is handed to cfg_register_keywords() by the INITCALL1 below */ +static struct cfg_kw_list cfg_kws = {ILH, { + { CFG_GLOBAL, "tune.epoll.mask-events", cfg_parse_tune_epoll_mask_events }, + { 0, NULL, NULL } +}}; + +INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws); INITCALL0(STG_REGISTER, _do_register);