From 7232677385b24c6b51e43e4d97b8c12e154f8758 Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Fri, 1 Aug 2025 17:51:16 +0200 Subject: [PATCH] MAJOR: server: do not remove idle conns in del server Do not remove anymore idle and purgeable connections directly under the "del server" handler. The main objective of this patch is to reduce the amount of work performed under thread isolation. This should improve "del server" scheduling with other haproxy tasks. Another objective is to be able to properly support dynamic servers with QUIC. Indeed, takeover is not yet implemented for this protocol, hence it is not possible to rely on cleanup of idle connections performed by a single thread under "del server" handler. With this change it is not possible anymore to remove a server if there is still idle connections referencing it. To ensure this cannot be performed, srv_check_for_deletion() has been extended to check server counters for idle and idle private connections. Server deletion should still remain a viable procedure, as first it is mandatory to put the targetted server into maintenance. This step forces the cleanup of its existing idle connections. Thanks to a recent change, all finishing connections are also removed immediately instead of becoming idle. In short, this patch transforms idle connections removal from a synchronous to an asynchronous procedure. However, this should remain a steadfast and quick method achievable in less than a second. This patch is considered major as some users may notice this change when removing a server. In particular with the following CLI commands pipeline: "disable server ; shutdown sessions server ; del server " Server deletion will now probably fail, as idle connections purge cannot be completed immediately. Thus, it is now highly advise to always use a small delay "wait srv-removable" before "del server" to ensure that idle connections purge is executed prior. Along with this change, documentation for "del server" and related "shutdown sessions server" has been refined, in particular to better highlight under what conditions a server can be removed. --- doc/management.txt | 35 +++++++++++++++------- src/server.c | 73 ++++------------------------------------------ 2 files changed, 30 insertions(+), 78 deletions(-) diff --git a/doc/management.txt b/doc/management.txt index 1953f7fa7..9cd9216e7 100644 --- a/doc/management.txt +++ b/doc/management.txt @@ -2085,11 +2085,19 @@ del ssl crt-list numbers, use "show ssl crt-list -n ". del server / - Remove a server attached to the backend . All servers are eligible, - except servers which are referenced by other configuration elements. The - server must be put in maintenance mode prior to its deletion. The operation - is cancelled if the server still has active or idle connection or its - connection queue is not empty. + Delete a removable server attached to the backend . A removable + server is the server which satisfies all of these conditions : + - not referenced by other configuration elements + - must already be in maintenance (see "disable server") + - must not have any active or idle connections + + If any of these conditions is not met, the command will fail. + + Active connections are those with at least one ongoing request. It is + possible to speed up their termination using "shutdown sessions server". It + is highly recommended to use "wait srv-removable" before "del server" to + ensure that all active or idle connections are closed and that the command + succeeds. disable agent / Mark the auxiliary agent check as temporarily stopped. @@ -4097,6 +4105,10 @@ shutdown sessions server / maintenance mode, for instance. Such terminated streams are reported with a 'K' flag in the logs. + Backend connections are left in idle state, unless the server is already in + maintenance mode, in which case they will be immediately scheduled for + deletion. + trace The "trace" command alone lists the trace sources, their current status, and their brief descriptions. It is only meant as a menu to enter next levels, @@ -4311,12 +4323,13 @@ wait { -h | } [ [...]] unsatisfied for the whole duration. The supported conditions are: - srv-removable / : this will wait for the specified server to - be removable, i.e. be in maintenance and no longer have any connection on - it. Some conditions will never be accepted (e.g. not in maintenance) and - will cause the report of a specific error message indicating what condition - is not met. The server might even have been removed in parallel and no - longer exit. If everything is OK before the delay, a success is returned - and the operation is terminated. + be removable by the "del server" command, i.e. be in maintenance and no + longer have any connection on it (neither active or idle). Some conditions + will never be accepted (e.g. not in maintenance) and will cause the report + of a specific error message indicating what condition is not met. The + server might even have been removed in parallel and no longer exit. If + everything is OK before the delay, a success is returned and the operation + is terminated. The default unit for the delay is milliseconds, though other units are accepted if suffixed with the usual timer units (us, ms, s, m, h, d). When diff --git a/src/server.c b/src/server.c index 6599e53d7..d4ae2e895 100644 --- a/src/server.c +++ b/src/server.c @@ -6335,9 +6335,12 @@ int srv_check_for_deletion(const char *bename, const char *svname, struct proxy /* Second, conditions that may change over time */ ret = 0; - /* Ensure that there is no active/pending connection on the server. */ + /* Ensure that there is no active/pending/idle connection on the server. + * Note that idle conns scheduled for purging are still accounted in idle counter. + */ if (_HA_ATOMIC_LOAD(&srv->curr_used_conns) || - _HA_ATOMIC_LOAD(&srv->queueslength) || srv_has_streams(srv)) { + _HA_ATOMIC_LOAD(&srv->queueslength) || srv_has_streams(srv) || + _HA_ATOMIC_LOAD(&srv->curr_idle_conns) || _HA_ATOMIC_LOAD(&srv->curr_sess_idle_conns)) { msg = "Server still has connections attached to it, cannot remove it."; goto leave; } @@ -6362,11 +6365,9 @@ static int cli_parse_delete_server(char **args, char *payload, struct appctx *ap struct proxy *be; struct server *srv; struct ist be_name, sv_name; - struct mt_list back; - struct sess_priv_conns *sess_conns = NULL; struct watcher *srv_watch; const char *msg; - int ret, i; + int ret; if (!cli_has_level(appctx, ACCESS_LVL_ADMIN)) return 1; @@ -6395,68 +6396,6 @@ static int cli_parse_delete_server(char **args, char *payload, struct appctx *ap goto out; } - /* Close idle connections attached to this server. */ - for (i = tid;;) { - struct list *list = &srv->per_thr[i].idle_conn_list; - struct connection *conn; - - while (!LIST_ISEMPTY(list)) { - conn = LIST_ELEM(list->n, struct connection *, idle_list); - if (i != tid) { - if (conn->mux && conn->mux->takeover) - conn->mux->takeover(conn, i, 1); - else if (conn->xprt && conn->xprt->takeover) - conn->xprt->takeover(conn, conn->ctx, i, 1); - } - conn_release(conn); - } - - /* Also remove all purgeable conns as some of them may still - * reference the currently deleted server. - */ - while ((conn = MT_LIST_POP(&idle_conns[i].toremove_conns, - struct connection *, toremove_list))) { - conn_release(conn); - } - - if ((i = ((i + 1 == global.nbthread) ? 0 : i + 1)) == tid) - break; - } - - /* All idle connections should be removed now. */ - BUG_ON(srv->curr_idle_conns); - - /* Close idle private connections attached to this server. */ - for (i = tid;;) { - MT_LIST_FOR_EACH_ENTRY_LOCKED(sess_conns, &srv->per_thr[i].sess_conns, srv_el, back) { - struct connection *conn, *conn_back; - list_for_each_entry_safe(conn, conn_back, &sess_conns->conn_list, sess_el) { - - /* Only idle connections should be present if srv_check_for_deletion() is true. */ - BUG_ON(!(conn->flags & CO_FL_SESS_IDLE)); - --((struct session *)conn->owner)->idle_conns; - - LIST_DEL_INIT(&conn->sess_el); - conn->owner = NULL; - - if (sess_conns->tid != tid) { - if (conn->mux && conn->mux->takeover) - conn->mux->takeover(conn, sess_conns->tid, 1); - else if (conn->xprt && conn->xprt->takeover) - conn->xprt->takeover(conn, conn->ctx, sess_conns->tid, 1); - } - conn_release(conn); - } - - LIST_DELETE(&sess_conns->sess_el); - pool_free(pool_head_sess_priv_conns, sess_conns); - sess_conns = NULL; - } - - if ((i = ((i + 1 == global.nbthread) ? 0 : i + 1)) == tid) - break; - } - /* removing cannot fail anymore when we reach this: * publishing EVENT_HDL_SUB_SERVER_DEL */