From 50ae717624875cd36aac290ac5953062ee7de692 Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Tue, 2 Jul 2024 18:14:57 +0200 Subject: [PATCH] BUG/MEDIUM: server: fix race on server_atomic_sync() The following patch fixes a race condition during server addr/port update : cd994407a9545a8d84e410dc0cc18c30966b70d8 BUG/MAJOR: server/addr: fix a race during server addr:svc_port updates The new update mechanism is implemented via an event update. It uses thread isolation to guarantee that no other thread is accessing server addr/port. Furthermore, to ensure server instance is not deleted just before the event handler, server instance is lookup via its ID in proxy tree. However, thread isolation is only entered after server lookup. This leaves a tiny race condition as the thread will be marked as harmless and a concurrent thread can delete the server in the meantime. This causes server_atomic_sync() to manipulated a deleted server instance to reinsert it in used_server_addr backend tree. This can cause a segfault during this operation or possibly on a future used_server_addr tree access. This issue was detected by criteo. Several backtraces were retrieved, each related to server addr_node insert or delete operation, either in srv_set_addr_desc(), or add/delete dynamic server handlers. To fix this, simply extend thread isolation section to start it before server lookup. This ensures that once retrieved the server cannot be deleted until its addr/port are updated. To ensure this issue won't happen anymore, a new BUG_ON() is added in srv_set_addr_desc(). Also note that ebpt_delete() is now called every time on delete handler as this is a safe idempotent operation. To reproduce these crashes, a script was executed to add then remove different servers every second. In parallel, the following CLI command was issued repeatdly without any delay to force multiple update on servers port : set server addr 0.0.0.0 port $((1024 + RANDOM % 1024)) This must be backported at least up to 3.0. If above mentionned patch has been selected for previous version, this commit must also be backported on them. --- src/server.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/server.c b/src/server.c index 8bf3c595a..1cfcb6bf2 100644 --- a/src/server.c +++ b/src/server.c @@ -262,6 +262,16 @@ static struct task *server_atomic_sync(struct task *task, void *context, unsigne * to prevent invalid reads by other threads. */ + /* + * this requires thread isolation, which is safe since we're the only + * task working for the current subscription and we don't hold locks + * or resources that other threads may depend on to complete a running + * cycle. Note that we do this way because we assume that this event is + * rather rare. + */ + if (!thread_isolated()) + thread_isolate_full(); + /* check if related server still exists */ px = proxy_find_by_id(data->server.safe.proxy_uuid, PR_CAP_BE, 0); if (!px) @@ -290,15 +300,6 @@ static struct task *server_atomic_sync(struct task *task, void *context, unsigne /* should not happen */ break; } - /* - * this requires thread isolation, which is safe since we're the only - * task working for the current subscription and we don't hold locks - * or resources that other threads may depend on to complete a running - * cycle. Note that we do this way because we assume that this event is - * rather rare. - */ - if (!thread_isolated()) - thread_isolate_full(); /* apply new addr:port combination */ _srv_set_inetaddr_port(srv, &new_addr, @@ -710,6 +711,9 @@ static void srv_set_addr_desc(struct server *s, int reattach) struct proxy *p = s->proxy; char *key; + /* Risk of used_server_addr tree corruption if server is already deleted. */ + BUG_ON(s->flags & SRV_F_DELETED); + key = sa2str(&s->addr, s->svc_port, s->flags & SRV_F_MAPPORTS); if (s->addr_node.key) { @@ -2989,6 +2993,14 @@ struct server *srv_drop(struct server *srv) if (HA_ATOMIC_SUB_FETCH(&srv->refcount, 1)) goto end; + /* This BUG_ON() is invalid for now as server released on deinit will + * trigger it as they are not properly removed from their tree. + */ + //BUG_ON(srv->addr_node.node.leaf_p || + // srv->idle_node.node.leaf_p || + // srv->conf.id.node.leaf_p || + // srv->conf.name.node.leaf_p); + guid_remove(&srv->guid); /* make sure we are removed from our 'next->prev_deleted' list @@ -6117,8 +6129,7 @@ static int cli_parse_delete_server(char **args, char *payload, struct appctx *ap /* remove srv from addr_node tree */ eb32_delete(&srv->conf.id); ebpt_delete(&srv->conf.name); - if (srv->addr_node.key) - ebpt_delete(&srv->addr_node); + ebpt_delete(&srv->addr_node); /* remove srv from idle_node tree for idle conn cleanup */ eb32_delete(&srv->idle_node);