From ee17d2024528efc7aebce63d97fa7c64e701f83e Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Wed, 27 Aug 2025 16:36:12 +0200 Subject: [PATCH] MINOR: stats-file: add process slot management for shm stats file Now that all processes tied to the same shm stats file share a common clock source, we introduce the notion of process slots in this patch. Each living process registers itself in a map at a free index: each slot stores the process's PID and heartbeat. Each process is responsible for updating its own heartbeat; a slot is considered "free" if the heartbeat was never set or if the heartbeat has expired (60 seconds of inactivity). The total number of slots is set to 64 on purpose, because it makes it easy to store the "users" of a given shm object in a 64-bit bitmask. Given that older processes are supposed to die eventually after haproxy is reloaded, 64 simultaneous processes should be large enough to be safe. If we ever reach this limit, more slots could be added by splitting the "users" bitmask across multiple 64-bit variables. 
--- include/haproxy/stats-file-t.h | 16 +++++++ src/stats-file.c | 78 ++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+) diff --git a/include/haproxy/stats-file-t.h b/include/haproxy/stats-file-t.h index 4e2e229a6..8df17c336 100644 --- a/include/haproxy/stats-file-t.h +++ b/include/haproxy/stats-file-t.h @@ -17,6 +17,10 @@ enum stfile_domain { #define SHM_STATS_FILE_VER_MAJOR 1 #define SHM_STATS_FILE_VER_MINOR 0 +#define SHM_STATS_FILE_HEARTBEAT_TIMEOUT 60 /* past this delay (seconds), a process which has not + * sent a heartbeat will be considered down + */ + /* header for shm stats file ("shm-stats-file") */ struct shm_stats_file_hdr { /* to check if the header is compatible with current haproxy version */ @@ -27,6 +31,18 @@ struct shm_stats_file_hdr { uint global_now_ms; /* global monotonic date (ms) common to all processes using the shm */ ullong global_now_ns; /* global monotonic date (ns) common to all processes using the shm */ llong now_offset; /* offset applied to global monotonic date on startup */ + /* each process uses one slot and is identified using its pid, max 64 in order + * to be able to use a bitmask to refer to a process and then look up its pid in the + * "slots.pid" map + * "heartbeat" is used to store the last activity + timeout of the process to check + * whether it should be considered as alive or dead + * no thread safety mechanism is employed, we assume co-processes are not started + * simultaneously + */ + struct { + pid_t pid; + int heartbeat; // last activity of this process + heartbeat timeout, in ticks + } slots[64]; }; struct shm_stats_file_object { diff --git a/src/stats-file.c b/src/stats-file.c index 9ecdcc694..0e50becd7 100644 --- a/src/stats-file.c +++ b/src/stats-file.c @@ -32,6 +32,7 @@ struct shm_stats_file_hdr *shm_stats_file_hdr = NULL; static int shm_stats_file_fd = -1; +int shm_stats_file_slot = -1; int shm_stats_file_max_objects = -1; /* Dump all fields from into for stats-file. 
*/ @@ -467,6 +468,37 @@ static int shm_stats_file_check_ver(struct shm_stats_file_hdr *hdr) return 1; } +static inline int shm_hb_is_stale(int hb) +{ + return (hb == TICK_ETERNITY || tick_is_expired(hb, now_ms)); +} + +/* returns free slot id on success or -1 if no more slots are available + * on success, the free slot is already reserved for the process pid + */ +int shm_stats_file_get_free_slot(struct shm_stats_file_hdr *hdr) +{ + int it = 0; + int hb; + + while (it < sizeof(hdr->slots) / sizeof(hdr->slots[0])) { + hb = HA_ATOMIC_LOAD(&hdr->slots[it].heartbeat); + /* try to own a stale entry */ + while (shm_hb_is_stale(hb)) { + int new_hb = tick_add(now_ms, MS_TO_TICKS(SHM_STATS_FILE_HEARTBEAT_TIMEOUT * 1000)); + + if (HA_ATOMIC_CAS(&hdr->slots[it].heartbeat, &hb, new_hb)) { + shm_stats_file_hdr->slots[it].pid = getpid(); + return it; + } + /* another process was faster than us */ + __ha_cpu_relax(); + } + it += 1; + } + return -1; +} + /* since shm file was opened using O_APPEND flag, let's grow * the file by in an atomic manner (O_APPEND offers such guarantee), * so that even if multiple processes try to grow the file simultaneously, @@ -492,10 +524,33 @@ static int shm_file_grow(unsigned int bytes) return 1; } +static struct task *shm_stats_file_hb(struct task *task, void *context, unsigned int state) +{ + if (stopping) + return NULL; + + /* only update the heartbeat if it hasn't expired. Else it means the slot could have + * been reused and it isn't safe to use anymore. 
+ * If this happens, raise a warning and stop using it + */ + if (tick_is_expired(HA_ATOMIC_LOAD(&shm_stats_file_hdr->slots[shm_stats_file_slot].heartbeat), now_ms)) { + ha_warning("shm_stats_file: heartbeat for the current process slot already expired, it is not safe to use it anymore\n"); + task->expire = TICK_ETERNITY; + return task; + } + HA_ATOMIC_STORE(&shm_stats_file_hdr->slots[shm_stats_file_slot].heartbeat, + tick_add(now_ms, MS_TO_TICKS(SHM_STATS_FILE_HEARTBEAT_TIMEOUT * 1000))); + task->expire = tick_add(now_ms, 1000); // next update in 1 sec + + return task; +} + /* prepare and and initialize shm stats memory file as needed */ int shm_stats_file_prepare(void) { + struct task *heartbeat_task; int first = 0; // process responsible for initializing the shm memory + int slot; /* do nothing if master process or shm_stats_file not configured */ if (master || !global.shm_stats_file) @@ -582,6 +637,25 @@ int shm_stats_file_prepare(void) /* sync local and global clocks, so all clocks are consistent */ clock_update_date(0, 1); + /* reserve our slot */ + slot = shm_stats_file_get_free_slot(shm_stats_file_hdr); + if (slot == -1) { + ha_warning("config: failed to get shm stats file slot for '%s', all slots are occupied\n", global.shm_stats_file); + munmap(shm_stats_file_hdr, sizeof(*shm_stats_file_hdr)); + return ERR_WARN; + } + + shm_stats_file_slot = slot; + + /* start the task responsible for updating the heartbeat */ + heartbeat_task = task_new_anywhere(); + if (!heartbeat_task) { + ha_alert("config: failed to create the heartbeat task for shm stats file '%s'\n", global.shm_stats_file); + return ERR_ALERT | ERR_FATAL; + } + heartbeat_task->process = shm_stats_file_hb; + task_schedule(heartbeat_task, tick_add(now_ms, 1000)); + end: return ERR_NONE; @@ -593,6 +667,10 @@ int shm_stats_file_prepare(void) static void cleanup_shm_stats_file(void) { if (shm_stats_file_hdr) { + /* mark the process slot we occupied as unused */ + 
HA_ATOMIC_STORE(&shm_stats_file_hdr->slots[shm_stats_file_slot].heartbeat, TICK_ETERNITY); + shm_stats_file_hdr->slots[shm_stats_file_slot].pid = -1; + munmap(shm_stats_file_hdr, SHM_STATS_FILE_MAPPING_SIZE(shm_stats_file_max_objects)); close(shm_stats_file_fd); }