BUG/MEDIUM: stats/server: use watcher to track server during stats dump

If a server A is deleted while a stats dump is currently on it, deletion
is delayed thanks to reference counting. Server A is nonetheless removed
from the proxy list. However, this list is a single linked list. If the
next server B is deleted and freed immediately, server A would still
point to it. This problem has been solved by the prev_deleted list in
servers.

This model seems correct, but it is difficult to ensure completely its
validity. In particular, it implies when stats dump is resumed, server A
elements will be accessed despite the server being in a half-deleted
state.

Thus, it has been decided to completely ditch the refcount mechanism for
stats dump. Instead, use the watcher element to register every stats
dump currently tracking a server instance. Each time a server is deleted
on the CLI, each stats dump element which may points to it are updated
to access the next server instance, or NULL if this is the last server.
This ensures that a server which was deleted via CLI but not completely
freed is never accessed on stats dump resumption.

Currently, no race condition related to dynamic servers and stats dump
is known. However, as described above, the previous model is deemed too
fragile, as such this patch is labelled as bug-fix. It should be
backported up to 2.6, after a reasonable period of observation. It
relies on the following patch :
  MINOR: list: define a watcher type
This commit is contained in:
Amaury Denoyelle 2024-10-23 11:33:34 +02:00
parent eafa8a32bb
commit 071ae8ce3d
8 changed files with 27 additions and 41 deletions

View File

@ -34,7 +34,7 @@
/* flags for appctx->state */
/* Room for per-command context (mostly CLI commands but not only) */
#define APPLET_MAX_SVCCTX 128
#define APPLET_MAX_SVCCTX 256
/* Appctx Flags */
#define APPCTX_FL_INBLK_ALLOC 0x00000001

View File

@ -350,6 +350,7 @@ struct server {
enum srv_ws_mode ws; /* configure the protocol selection for websocket */
/* 3 bytes hole here */
struct mt_list watcher_list; /* list of elems which currently references this server instance */
uint refcount; /* refcount used to remove a server at runtime */
/* The elements below may be changed on every single request by any

View File

@ -578,6 +578,7 @@ struct show_stat_ctx {
int iid, type, sid; /* proxy id, type and service id if bounding of stats is enabled */
int st_code; /* the status code returned by an action */
struct buffer chunk; /* temporary buffer which holds a single-line output */
struct watcher srv_watch; /* watcher to automatically update obj2 on server deletion */
enum stat_state state; /* phase of output production */
};

View File

@ -4008,6 +4008,7 @@ static int http_handle_stats(struct stream *s, struct channel *req, struct proxy
ctx->flags |= STAT_F_FMT_HTML; /* assume HTML mode by default */
if ((msg->flags & HTTP_MSGF_VER_11) && (txn->meth != HTTP_METH_HEAD))
ctx->flags |= STAT_F_CHUNKED;
watcher_init(&ctx->srv_watch, &ctx->obj2, offsetof(struct server, watcher_list));
htx = htxbuf(&req->buf);
sl = http_get_stline(htx);

View File

@ -2988,6 +2988,7 @@ struct server *new_server(struct proxy *proxy)
MT_LIST_INIT(&srv->sess_conns);
guid_init(&srv->guid);
MT_LIST_INIT(&srv->watcher_list);
srv->extra_counters = NULL;
#ifdef USE_OPENSSL
@ -6076,6 +6077,7 @@ static int cli_parse_delete_server(char **args, char *payload, struct appctx *ap
struct ist be_name, sv_name;
struct mt_list back;
struct sess_priv_conns *sess_conns = NULL;
struct watcher *srv_watch;
const char *msg;
int ret, i;
@ -6177,6 +6179,12 @@ static int cli_parse_delete_server(char **args, char *payload, struct appctx *ap
if (srv->agent.state & CHK_ST_CONFIGURED)
check_purge(&srv->agent);
while (!MT_LIST_ISEMPTY(&srv->watcher_list)) {
srv_watch = MT_LIST_NEXT(&srv->watcher_list, struct watcher *, el);
BUG_ON(srv->next && srv->next->flags & SRV_F_DELETED);
watcher_next(srv_watch, srv->next);
}
/* detach the server from the proxy linked list
* The proxy servers list is currently not protected by a lock, so this
* requires thread_isolate/release.

View File

@ -2097,9 +2097,8 @@ static size_t http_stats_fastfwd(struct appctx *appctx, struct buffer *buf,
static void http_stats_release(struct appctx *appctx)
{
struct show_stat_ctx *ctx = appctx->svcctx;
if (ctx->px_st == STAT_PX_ST_SV)
srv_drop(ctx->obj2);
if (ctx->px_st == STAT_PX_ST_SV && ctx->obj2)
watcher_detach(&ctx->srv_watch);
}
struct applet http_stats_applet = {

View File

@ -1335,7 +1335,6 @@ static int stats_dump_proxy_to_buffer(struct stconn *sc, struct buffer *buf,
struct listener *l;
struct uri_auth *uri = NULL;
int current_field;
int px_st = ctx->px_st;
if (ctx->http_px)
uri = ctx->http_px->uri_auth;
@ -1442,46 +1441,23 @@ static int stats_dump_proxy_to_buffer(struct stconn *sc, struct buffer *buf,
current_field = 0;
}
ctx->obj2 = px->srv; /* may be NULL */
/* for servers ctx.obj2 is set via watcher_attach() */
watcher_attach(&ctx->srv_watch, px->srv);
ctx->px_st = STAT_PX_ST_SV;
__fallthrough;
case STAT_PX_ST_SV:
/* check for dump resumption */
if (px_st == STAT_PX_ST_SV) {
struct server *cur = ctx->obj2;
/* re-entrant dump */
BUG_ON(!cur);
if (cur->flags & SRV_F_DELETED) {
/* the server could have been marked as deleted
* between two dumping attempts, skip it.
*/
cur = cur->next;
}
srv_drop(ctx->obj2); /* drop old srv taken on last dumping attempt */
ctx->obj2 = cur; /* could be NULL */
/* back to normal */
}
/* obj2 points to servers list as initialized above.
*
* A server may be removed during the stats dumping.
* Temporarily increment its refcount to prevent its
* anticipated cleaning. Call srv_drop() to release it.
*/
for (; ctx->obj2 != NULL;
ctx->obj2 = srv_drop(sv)) {
sv = ctx->obj2;
srv_take(sv);
/* obj2 is updated and returned through watcher_next() */
for (sv = ctx->obj2; sv;
sv = watcher_next(&ctx->srv_watch, sv->next)) {
if (stats_is_full(appctx, buf, htx))
goto full;
if (ctx->flags & STAT_F_BOUND) {
if (!(ctx->type & (1 << STATS_TYPE_SV))) {
srv_drop(sv);
watcher_detach(&ctx->srv_watch);
break;
}

View File

@ -929,6 +929,7 @@ static int cli_parse_show_stat(char **args, char *payload, struct appctx *appctx
ctx->scope_len = 0;
ctx->http_px = NULL; // not under http context
ctx->flags = STAT_F_SHNODE | STAT_F_SHDESC;
watcher_init(&ctx->srv_watch, &ctx->obj2, offsetof(struct server, watcher_list));
if ((strm_li(appctx_strm(appctx))->bind_conf->level & ACCESS_LVL_MASK) >= ACCESS_LVL_OPER)
ctx->flags |= STAT_F_SHLGNDS;
@ -1004,9 +1005,8 @@ static int cli_io_handler_dump_stat(struct appctx *appctx)
static void cli_io_handler_release_stat(struct appctx *appctx)
{
struct show_stat_ctx *ctx = appctx->svcctx;
if (ctx->px_st == STAT_PX_ST_SV)
srv_drop(ctx->obj2);
if (ctx->px_st == STAT_PX_ST_SV && ctx->obj2)
watcher_detach(&ctx->srv_watch);
}
static int cli_io_handler_dump_json_schema(struct appctx *appctx)
@ -1024,6 +1024,7 @@ static int cli_parse_dump_stat_file(char **args, char *payload,
ctx->chunk = b_make(trash.area, trash.size, 0, 0);
ctx->domain = STATS_DOMAIN_PROXY;
ctx->flags |= STAT_F_FMT_FILE;
watcher_init(&ctx->srv_watch, &ctx->obj2, offsetof(struct server, watcher_list));
return 0;
}
@ -1065,9 +1066,8 @@ static int cli_io_handler_dump_stat_file(struct appctx *appctx)
static void cli_io_handler_release_dump_stat_file(struct appctx *appctx)
{
struct show_stat_ctx *ctx = appctx->svcctx;
if (ctx->px_st == STAT_PX_ST_SV)
srv_drop(ctx->obj2);
if (ctx->px_st == STAT_PX_ST_SV && ctx->obj2)
watcher_detach(&ctx->srv_watch);
}
int stats_allocate_proxy_counters_internal(struct extra_counters **counters,