From 1662cdb0c6523213cf179889e5935aaceb917c3a Mon Sep 17 00:00:00 2001 From: Olivier Houchard Date: Fri, 3 Jul 2020 14:04:37 +0200 Subject: [PATCH] BUG/MEDIUM: connections: Set the tid for the old tasklet on takeover. In the various takeover() methods, make sure we schedule the old tasklet on the old thread, as we don't want it to run on our own thread! This was causing a very rare crash when building with DEBUG_STRICT, seeing that either an FD's thread mask didn't match the thread ID in h1_io_cb(), or that stream_int_notify() would try to queue a task with the wrong tid_bit. In order to reproduce this, it is necessary to maintain many connections (typically 30k) at a high request rate flowing over H1+SSL between two proxies, the second of which would randomly reject ~1% of the incoming connections and randomly kill some idle ones using a very short client timeout. The request rate must be adjusted so that the CPUs are nearly saturated, but never reach 100%. It's easier to reproduce this by skipping local connections and always picking from other threads. The issue should happen in less than 20s; otherwise it's necessary to restart to reset the idle connection lists. No backport is needed, takeover() is 2.2 only. 
--- include/haproxy/connection-t.h | 2 +- src/backend.c | 4 ++-- src/mux_fcgi.c | 4 ++-- src/mux_h1.c | 4 ++-- src/mux_h2.c | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/haproxy/connection-t.h b/include/haproxy/connection-t.h index 8b48c9815..e7e9c0d63 100644 --- a/include/haproxy/connection-t.h +++ b/include/haproxy/connection-t.h @@ -390,7 +390,7 @@ struct mux_ops { void (*reset)(struct connection *conn); /* Reset the mux, because we're re-trying to connect */ const struct cs_info *(*get_cs_info)(struct conn_stream *cs); /* Return info on the specified conn_stream or NULL if not defined */ int (*ctl)(struct connection *conn, enum mux_ctl_type mux_ctl, void *arg); /* Provides information about the mux */ - int (*takeover)(struct connection *conn); /* Attempts to migrate the connection to the current thread */ + int (*takeover)(struct connection *conn, int orig_tid); /* Attempts to migrate the connection to the current thread */ unsigned int flags; /* some flags characterizing the mux's capabilities (MX_FL_*) */ char name[8]; /* mux layer name, zero-terminated */ }; diff --git a/src/backend.c b/src/backend.c index f87940be8..7027c54cd 100644 --- a/src/backend.c +++ b/src/backend.c @@ -1135,7 +1135,7 @@ static struct connection *conn_backend_get(struct server *srv, int is_safe) HA_SPIN_LOCK(OTHER_LOCK, &idle_conns[i].takeover_lock); mt_list_for_each_entry_safe(conn, &mt_list[i], list, elt1, elt2) { - if (conn->mux->takeover && conn->mux->takeover(conn) == 0) { + if (conn->mux->takeover && conn->mux->takeover(conn, i) == 0) { MT_LIST_DEL_SAFE(elt1); _HA_ATOMIC_ADD(&activity[tid].fd_takeover, 1); found = 1; @@ -1145,7 +1145,7 @@ static struct connection *conn_backend_get(struct server *srv, int is_safe) if (!found && !is_safe && srv->curr_safe_nb > 0) { mt_list_for_each_entry_safe(conn, &srv->safe_conns[i], list, elt1, elt2) { - if (conn->mux->takeover && conn->mux->takeover(conn) == 0) { + if (conn->mux->takeover && 
conn->mux->takeover(conn, i) == 0) { MT_LIST_DEL_SAFE(elt1); _HA_ATOMIC_ADD(&activity[tid].fd_takeover, 1); found = 1; diff --git a/src/mux_fcgi.c b/src/mux_fcgi.c index c4c7ce642..9e8eb9a4b 100644 --- a/src/mux_fcgi.c +++ b/src/mux_fcgi.c @@ -4084,7 +4084,7 @@ static void fcgi_show_fd(struct buffer *msg, struct connection *conn) * Return 0 if successful, non-zero otherwise. * Expected to be called with the old thread lock held. */ -static int fcgi_takeover(struct connection *conn) +static int fcgi_takeover(struct connection *conn, int orig_tid) { struct fcgi_conn *fcgi = conn->ctx; struct task *task; @@ -4098,7 +4098,7 @@ static int fcgi_takeover(struct connection *conn) * set its context to NULL; */ fcgi->wait_event.tasklet->context = NULL; - tasklet_wakeup(fcgi->wait_event.tasklet); + tasklet_wakeup_on(fcgi->wait_event.tasklet, orig_tid); task = fcgi->task; if (task) { diff --git a/src/mux_h1.c b/src/mux_h1.c index 89c55b4d4..0111f19e7 100644 --- a/src/mux_h1.c +++ b/src/mux_h1.c @@ -2922,7 +2922,7 @@ static int add_hdr_case_adjust(const char *from, const char *to, char **err) * Return 0 if successful, non-zero otherwise. * Expected to be called with the old thread lock held. */ -static int h1_takeover(struct connection *conn) +static int h1_takeover(struct connection *conn, int orig_tid) { struct h1c *h1c = conn->ctx; struct task *task; @@ -2936,7 +2936,7 @@ static int h1_takeover(struct connection *conn) * set its context to NULL. */ h1c->wait_event.tasklet->context = NULL; - tasklet_wakeup(h1c->wait_event.tasklet); + tasklet_wakeup_on(h1c->wait_event.tasklet, orig_tid); task = h1c->task; if (task) { diff --git a/src/mux_h2.c b/src/mux_h2.c index 827ff8e4f..22212afe8 100644 --- a/src/mux_h2.c +++ b/src/mux_h2.c @@ -6055,7 +6055,7 @@ static void h2_show_fd(struct buffer *msg, struct connection *conn) * Return 0 if successful, non-zero otherwise. * Expected to be called with the old thread lock held. 
*/ -static int h2_takeover(struct connection *conn) +static int h2_takeover(struct connection *conn, int orig_tid) { struct h2c *h2c = conn->ctx; struct task *task; @@ -6069,7 +6069,7 @@ static int h2_takeover(struct connection *conn) * set its context to NULL. */ h2c->wait_event.tasklet->context = NULL; - tasklet_wakeup(h2c->wait_event.tasklet); + tasklet_wakeup_on(h2c->wait_event.tasklet, orig_tid); task = h2c->task; if (task) {