From 64383e655b23e1240dd0043a18ca020994c60022 Mon Sep 17 00:00:00 2001 From: Alexander Stephan Date: Thu, 30 Apr 2026 14:23:52 +0000 Subject: [PATCH] BUG/MEDIUM: cli: fix master CLI connection slot leak on client disconnect In master-worker mode the master CLI proxy (mworker_proxy) has a hardcoded maxconn of 10. When a client connects to the master CLI socket and issues a command that gets forwarded to an unresponsive worker (e.g. one that is stuck or very slow), the connection hangs waiting for the worker's response. If the client then disconnects (timeout, Ctrl-C, etc.), the connection slot is never released because the client-side FIN is never acknowledged by the unresponsive worker. After 10 such leaked slots the master CLI socket becomes completely unreachable, returning "Resource temporarily unavailable" to any new connection attempt. In containerized deployments this means readiness probes start failing and the pod gets restarted. The fix adds a timeout server-fin of 1s to the mworker_proxy. When the client disconnects while waiting for a worker response, this timeout ensures the dangling backend connection is cleaned up after 1s, freeing the connection slot. This does not affect normal CLI operations since the timeout only starts after the client has already closed its side of the connection. A regression test is included that blocks the worker CLI thread using "debug dev delay" with nbthread 1, fills all 10 master CLI slots, waits for client-side timeouts, then verifies a new connection still succeeds. This fixes GH issue #3351. This should be backported to all stable branches. Co-authored-by: Martin Strenge Co-authored-by: William Lallemand --- reg-tests/mcli/mcli_master_socket_leak.vtc | 66 ++++++++++++++++++++++ src/cli.c | 5 +- 2 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 reg-tests/mcli/mcli_master_socket_leak.vtc diff --git a/reg-tests/mcli/mcli_master_socket_leak.vtc b/reg-tests/mcli/mcli_master_socket_leak.vtc new file mode 100644 index 000000000..cf30ea6fe --- /dev/null +++ b/reg-tests/mcli/mcli_master_socket_leak.vtc @@ -0,0 +1,66 @@ +varnishtest "Bug fix: master CLI connection slots freed on client disconnect" + +# Regression test for a master CLI socket connection leak in master-worker mode. +# +# mworker_proxy has a fixed maxconn of 10. When clients connect to the master +# socket, send a command that is forwarded to a busy worker, and then close +# the connection due to a client-side receive timeout, haproxy must free each +# slot as the client disconnects. +# +# Without the fix: slots remain occupied after the client disconnects, so the +# master CLI becomes unreachable once all 10 slots are filled. +# With the fix: slots are freed on client disconnect and the master CLI keeps +# accepting new connections. +# +# The worker is made unresponsive using "debug dev delay" (expert-mode) with +# nbthread 1 so a single delay blocks the entire worker CLI. This avoids +# using SIGSTOP/SIGCONT which can be unreliable in CI environments. + +#REGTEST_TYPE=bug + +feature cmd "command -v socat" +feature cmd "command -v timeout" +feature ignore_unknown_macro + +server s1 { +} -start + +haproxy h1 -W -S -conf { + global + nbthread 1 + + defaults + mode http + timeout connect "${HAPROXY_TEST_TIMEOUT-5s}" + timeout client "${HAPROXY_TEST_TIMEOUT-5s}" + timeout server "${HAPROXY_TEST_TIMEOUT-5s}" + + frontend fe + bind "fd@${fe}" + default_backend be + + backend be + server s1 ${s1_addr}:${s1_port} +} -start + +# Fill all 10 master CLI slots (mworker_proxy->maxconn is hardcoded to 10). +# Each socat sends "expert-mode on" followed by "@1 debug dev delay 10000" +# which blocks the single worker thread for 10 s. After 2 s the timeout(1) +# wrapper kills socat, simulating a client-side receive timeout. "wait" +# ensures all background processes have exited before proceeding. +shell { + for i in $(seq 1 10); do + (printf "expert-mode on\n@1 debug dev delay 10000\n" \ + | timeout 2 socat TCP:${h1_mcli_addr}:${h1_mcli_port} - 2>/dev/null) & + done + wait +} + +# This is the key assertion: after all 10 clients have disconnected, a new +# connection to the master CLI must succeed. With the bug all 10 slots are +# still marked occupied and this connect is refused or times out. +haproxy h1 -mcli { + send "show version" + expect ~ "3." +} + diff --git a/src/cli.c b/src/cli.c index 3bd709898..0c327dd03 100644 --- a/src/cli.c +++ b/src/cli.c @@ -3834,8 +3834,9 @@ int mworker_cli_create_master_proxy(char **errmsg) mworker_proxy->mode = PR_MODE_CLI; /* default to 10 concurrent connections */ mworker_proxy->maxconn = 10; - /* no timeout */ - mworker_proxy->timeout.client = 0; + mworker_proxy->timeout.client = 0; /* no timeout */ + mworker_proxy->timeout.serverfin = MS_TO_TICKS(1000); /* 1s timeout in case worker is not responding on shutdown */ + mworker_proxy->conf.file = strdup("MASTER"); mworker_proxy->conf.line = 0; mworker_proxy->accept = frontend_accept;