From 56fd0c1a5c15f9b1986d0a6fa6d703c8ea9ae5fe Mon Sep 17 00:00:00 2001 From: Olivier Houchard Date: Mon, 5 Jan 2026 06:38:58 +0100 Subject: [PATCH] MEDIUM: cpu-topo: Add an optional argument for per-group affinity When using per-group affinity, an optional argument can now be provided. It accepts the value "auto", where, when multiple thread groups are created, the available CPUs are split equally across the groups (this is the new default), and "loose", where all groups are bound to all available CPUs (this was the old default). --- doc/configuration.txt | 10 +++ src/cpu_topo.c | 173 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 151 insertions(+), 32 deletions(-) diff --git a/doc/configuration.txt b/doc/configuration.txt index edb3c9695..01cab8625 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -2233,6 +2233,16 @@ cpu-affinity - per-core, each thread will be bound to all the hardware threads of one core. - per-group, each thread will be bound to all the hardware threads of the group. This is the default unless threads-per-core 1 is used in cpu-policy. + per-group accepts an optional argument to specify how CPUs should be + allocated. When a list of CPUs is larger than the maximum allowed number + of CPUs per group and has to be split between multiple groups, this extra + argument allows choosing how the groups will be bound to those CPUs: + - auto: each thread group will only be assigned a fair share of contiguous + CPU cores that are dedicated to it and not shared with other groups. + This is the default as it generally is more optimal. + - loose: each group will still be allowed to use any CPU in the list. This + generally causes more contention, but may sometimes help to better cope + with parasitic loads running on the same CPUs. - auto, per-group will be used, unless threads-per-core 1 is used in cpu-policy, in which case per-core will be used. This is the default. - per-thread, that will bind one thread to one hardware thread only. 
diff --git a/src/cpu_topo.c b/src/cpu_topo.c index 57cdbc448..d7d39c90a 100644 --- a/src/cpu_topo.c +++ b/src/cpu_topo.c @@ -26,6 +26,11 @@ #define CPU_AFFINITY_PER_THREAD (1 << 2) #define CPU_AFFINITY_PER_CCX (1 << 3) +/* + * Specific to the per-group affinity + */ +#define CPU_AFFINITY_PER_GROUP_LOOSE (1 << 8) + /* CPU topology information, ha_cpuset_size() entries, allocated at boot */ int cpu_topo_maxcpus = -1; // max number of CPUs supported by OS/haproxy int cpu_topo_lastcpu = -1; // last supposed online CPU (no need to look beyond) @@ -64,19 +69,31 @@ struct { } cpu_policy_conf = { 1, /* "performance" policy */ 0, /* Default flags */ - 0 /* Default affinity */ + 0, /* Default affinity */ +}; + +struct cpu_affinity_optional { + char *name; + int affinity_flag; +}; + +static struct cpu_affinity_optional per_group_optional[] = { + {"loose", CPU_AFFINITY_PER_GROUP_LOOSE}, + {"auto", 0}, + {NULL, 0} }; static struct cpu_affinity { char *name; int affinity_flags; + struct cpu_affinity_optional *optional; } ha_cpu_affinity[] = { - {"per-core", CPU_AFFINITY_PER_CORE}, - {"per-group", CPU_AFFINITY_PER_GROUP}, - {"per-thread", CPU_AFFINITY_PER_THREAD}, - {"per-ccx", CPU_AFFINITY_PER_CCX}, - {"auto", 0}, - {NULL, 0} + {"per-core", CPU_AFFINITY_PER_CORE, NULL}, + {"per-group", CPU_AFFINITY_PER_GROUP, per_group_optional}, + {"per-thread", CPU_AFFINITY_PER_THREAD, NULL}, + {"per-ccx", CPU_AFFINITY_PER_CCX, NULL}, + {"auto", 0, NULL}, + {NULL, 0, NULL} }; /* list of CPU policies for "cpu-policy". The default one is the first one. */ @@ -1258,34 +1275,97 @@ static int cpu_policy_first_usable_node(int policy, int tmin, int tmax, int gmin } static void -cpu_policy_assign_threads(int cpu_count, struct hap_cpuset node_cpu_set, struct hap_cpuset touse_tsid, struct hap_cpuset touse_ccx) +cpu_policy_assign_threads(int cpu_count, int thr_count, struct hap_cpuset node_cpu_set, struct hap_cpuset touse_tsid, struct hap_cpuset touse_ccx) { struct hap_cpuset thrset; + struct hap_cpuset saved_touse_ccx; int nb_grp; int thr_per_grp; int thr; int same_core = 0; + int cpu_per_group; ha_cpuset_zero(&thrset); + ha_cpuset_assign(&saved_touse_ccx, &touse_ccx); /* check that we're still within limits. If there are too many * CPUs but enough groups left, we'll try to make more smaller * groups, of the closest size each. */ - nb_grp = (cpu_count + global.maxthrpertgroup - 1) / global.maxthrpertgroup; + nb_grp = (thr_count + global.maxthrpertgroup - 1) / global.maxthrpertgroup; if (nb_grp > MAX_TGROUPS - global.nbtgroups) nb_grp = MAX_TGROUPS - global.nbtgroups; - thr_per_grp = (cpu_count + nb_grp - 1) / nb_grp; + cpu_per_group = (cpu_count + nb_grp - 1) / nb_grp; + thr_per_grp = (thr_count + nb_grp - 1) / nb_grp; if (thr_per_grp > global.maxthrpertgroup) thr_per_grp = global.maxthrpertgroup; - while (nb_grp && cpu_count > 0) { + while (nb_grp && thr_count > 0) { + struct hap_cpuset group_cpuset; + struct hap_cpuset current_tsid; + struct hap_cpuset current_ccx; + + ha_cpuset_zero(&group_cpuset); + ha_cpuset_zero(¤t_tsid); + ha_cpuset_zero(¤t_ccx); + /* create at most thr_per_grp threads */ - if (thr_per_grp > cpu_count) - thr_per_grp = cpu_count; + if (thr_per_grp > thr_count) + thr_per_grp = thr_count; if (thr_per_grp + global.nbthread > MAX_THREADS) thr_per_grp = MAX_THREADS - global.nbthread; + if ((cpu_policy_conf.affinity & (CPU_AFFINITY_PER_GROUP | CPU_AFFINITY_PER_GROUP_LOOSE)) == CPU_AFFINITY_PER_GROUP) { + int i = 0; + int next_ccx; + + /* + * Decide which CPUs to use for the group. 
+ * Try to allocate them from the same CCX, and then + * the same TSID + */ + while (i < cpu_per_group) { + int next_cpu = 0; + int got_cpu; + + next_ccx = ha_cpuset_ffs(&saved_touse_ccx) - 1; + + if (next_ccx == -1) + break; + + while (i < cpu_per_group && (got_cpu = find_next_cpu_ccx(next_cpu, next_ccx)) != -1) { + int tsid; + int got_cpu_tsid; + int next_cpu_tsid = 0; + next_cpu = got_cpu + 1; + if (!ha_cpuset_isset(&node_cpu_set, ha_cpu_topo[got_cpu].idx)) + continue; + tsid = ha_cpu_topo[got_cpu].ts_id; + + while (i < cpu_per_group && (got_cpu_tsid = find_next_cpu_tsid(next_cpu_tsid, tsid)) != -1) { + next_cpu_tsid = got_cpu_tsid + 1; + if (!ha_cpuset_isset(&node_cpu_set, ha_cpu_topo[got_cpu_tsid].idx)) + continue; + ha_cpuset_set(&group_cpuset, ha_cpu_topo[got_cpu_tsid].idx); + ha_cpuset_clr(&node_cpu_set, ha_cpu_topo[got_cpu_tsid].idx); + ha_cpuset_set(¤t_tsid, tsid); + ha_cpuset_set(¤t_ccx, next_ccx); + i++; + } + } + /* + * At this point there is nothing left + * for us in that CCX, forget about it. + */ + if (i < cpu_per_group) + ha_cpuset_clr(&saved_touse_ccx, next_ccx); + + } + ha_cpuset_assign(&touse_tsid, ¤t_tsid); + ha_cpuset_assign(&touse_ccx, ¤t_ccx); + } else { + ha_cpuset_assign(&group_cpuset, &node_cpu_set); + } /* let's create the new thread group */ ha_tgroup_info[global.nbtgroups].base = global.nbthread; ha_tgroup_info[global.nbtgroups].count = thr_per_grp; @@ -1310,7 +1390,8 @@ cpu_policy_assign_threads(int cpu_count, struct hap_cpuset node_cpu_set, struct int got_cpu; while ((got_cpu = find_next_cpu_tsid(next_try, tsid)) != -1) { next_try = got_cpu + 1; - if (!(ha_cpu_topo[got_cpu].st & HA_CPU_F_EXCL_MASK)) { + if (!(ha_cpu_topo[got_cpu].st & HA_CPU_F_EXCL_MASK) && + ha_cpuset_isset(&group_cpuset, ha_cpu_topo[got_cpu].idx)) { ha_cpuset_set(&thrset, ha_cpu_topo[got_cpu].idx); corenb++; } @@ -1337,7 +1418,8 @@ cpu_policy_assign_threads(int cpu_count, struct hap_cpuset node_cpu_set, struct tsid = ha_cpuset_ffs(&touse_tsid) - 1; while ((got_cpu = find_next_cpu_tsid(next_cpu, tsid)) != -1) { - if (!(ha_cpu_topo[got_cpu].st & HA_CPU_F_EXCL_MASK)) + if (!(ha_cpu_topo[got_cpu].st & HA_CPU_F_EXCL_MASK) && + ha_cpuset_isset(&group_cpuset, ha_cpu_topo[got_cpu].idx)) break; next_cpu = got_cpu + 1; } @@ -1347,7 +1429,7 @@ cpu_policy_assign_threads(int cpu_count, struct hap_cpuset node_cpu_set, struct ha_cpuset_clr(&touse_tsid, tsid); } } else { - int tid = ha_cpuset_ffs(&node_cpu_set) - 1; + int tid = ha_cpuset_ffs(&group_cpuset) - 1; if (tid != -1) { ha_cpuset_set(&thrset, tid); @@ -1358,30 +1440,34 @@ cpu_policy_assign_threads(int cpu_count, struct hap_cpuset node_cpu_set, struct ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &thrset); } else if (cpu_policy_conf.affinity & CPU_AFFINITY_PER_CCX) { - if (same_core == 0) { + while (same_core == 0) { int l3id = ha_cpuset_ffs(&touse_ccx) - 1; int got_cpu; int next_try = 0; - ha_cpuset_zero(&thrset); + if (l3id == -1) + break; + ha_cpuset_zero(&thrset); while ((got_cpu = find_next_cpu_ccx(next_try, l3id)) != -1) { next_try = got_cpu + 1; - same_core++; - ha_cpuset_set(&thrset, ha_cpu_topo[got_cpu].idx); + if (!(ha_cpu_topo[got_cpu].st & HA_CPU_F_EXCL_MASK) && + ha_cpuset_isset(&group_cpuset, ha_cpu_topo[got_cpu].idx)) { + same_core++; + ha_cpuset_set(&thrset, ha_cpu_topo[got_cpu].idx); + } } ha_cpuset_clr(&touse_ccx, l3id); } - BUG_ON(same_core == 0); if (ha_cpuset_ffs(&thrset) != 0) ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &thrset); same_core--; } else { /* map these threads to all the CPUs */ - 
ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &node_cpu_set); + ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &group_cpuset); } } - cpu_count -= thr_per_grp; + thr_count -= thr_per_grp; global.nbthread += thr_per_grp; global.nbtgroups++; if (global.nbtgroups >= MAX_TGROUPS || global.nbthread >= MAX_THREADS) @@ -1404,6 +1490,7 @@ static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, struct hap_cpuset touse_ccx; int cpu, cpu_start; int cpu_count; + int thr_count; int cid; int div; @@ -1426,7 +1513,7 @@ static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, ha_cpuset_zero(&node_cpu_set); ha_cpuset_zero(&touse_tsid); ha_cpuset_zero(&touse_ccx); - cid = -1; cpu_count = 0; + cid = -1; cpu_count = 0; thr_count = 0; for (cpu = cpu_start; cpu <= cpu_topo_lastcpu; cpu++) { /* skip disabled and already visited CPUs */ @@ -1446,11 +1533,12 @@ static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, /* make a mask of all of this cluster's CPUs */ ha_cpuset_set(&node_cpu_set, ha_cpu_topo[cpu].idx); ha_cpuset_set(&touse_ccx, ha_cpu_topo[cpu].ca_id[3]); + cpu_count++; if (!ha_cpuset_isset(&touse_tsid, ha_cpu_topo[cpu].ts_id)) { - cpu_count++; + thr_count++; ha_cpuset_set(&touse_tsid, ha_cpu_topo[cpu].ts_id); } else if (!(cpu_policy_conf.flags & CPU_POLICY_ONE_THREAD_PER_CORE)) - cpu_count++; + thr_count++; } /* now cid = next cluster_id or -1 if none; cpu_count is the @@ -1462,7 +1550,7 @@ static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, ha_cpuset_set(&visited_cl_set, cid); - cpu_policy_assign_threads(cpu_count, node_cpu_set, touse_tsid, touse_ccx); + cpu_policy_assign_threads(cpu_count, thr_count, node_cpu_set, touse_tsid, touse_ccx); } if (global.nbthread) @@ -1490,6 +1578,7 @@ static int cpu_policy_group_by_ccx(int policy, int tmin, int tmax, int gmin, int struct hap_cpuset touse_ccx; /* List of CCXs we'll currently use */ int cpu, cpu_start; int cpu_count; + int thr_count; int l3id; int div; @@ -1512,7 +1601,7 @@ static int cpu_policy_group_by_ccx(int policy, int tmin, int tmax, int gmin, int ha_cpuset_zero(&node_cpu_set); ha_cpuset_zero(&touse_tsid); ha_cpuset_zero(&touse_ccx); - l3id = -1; cpu_count = 0; + l3id = -1; cpu_count = 0; thr_count = 0; for (cpu = cpu_start; cpu <= cpu_topo_lastcpu; cpu++) { /* skip disabled and already visited CPUs */ @@ -1532,11 +1621,12 @@ static int cpu_policy_group_by_ccx(int policy, int tmin, int tmax, int gmin, int /* make a mask of all of this cluster's CPUs */ ha_cpuset_set(&node_cpu_set, ha_cpu_topo[cpu].idx); ha_cpuset_set(&touse_ccx, ha_cpu_topo[cpu].ca_id[3]); + cpu_count++; if (!ha_cpuset_isset(&touse_tsid, ha_cpu_topo[cpu].ts_id)) { - cpu_count++; + thr_count++; ha_cpuset_set(&touse_tsid, ha_cpu_topo[cpu].ts_id); } else if (!(cpu_policy_conf.flags & CPU_POLICY_ONE_THREAD_PER_CORE)) - cpu_count++; + thr_count++; } /* now l3id = next L3 ID or -1 if none; cpu_count is the @@ -1548,7 +1638,7 @@ static int cpu_policy_group_by_ccx(int policy, int tmin, int tmax, int gmin, int ha_cpuset_set(&visited_ccx_set, l3id); - cpu_policy_assign_threads(cpu_count, node_cpu_set, touse_tsid, touse_ccx); + cpu_policy_assign_threads(cpu_count, thr_count, node_cpu_set, touse_tsid, touse_ccx); } if (global.nbthread) @@ -2059,12 +2149,31 @@ static int cfg_parse_cpu_affinity(char **args, int section_type, struct proxy *c { int i; - if (too_many_args(1, args, err, NULL)) + if (too_many_args(2, args, err, NULL)) return -1; for (i = 0; 
ha_cpu_affinity[i].name != NULL; i++) { if (!strcmp(args[1], ha_cpu_affinity[i].name)) { cpu_policy_conf.affinity |= ha_cpu_affinity[i].affinity_flags; + if (*args[2] != 0) { + struct cpu_affinity_optional *optional = ha_cpu_affinity[i].optional; + + if (optional) { + for (i = 0; optional[i].name; i++) { + if (!strcmp(args[2], optional[i].name)) { + cpu_policy_conf.affinity |= optional[i].affinity_flag; + return 0; + } + } + } + memprintf(err, "'%s' provided with unknown optional argument '%s'. ", args[1], args[2]); + if (optional) { + memprintf(err, "%s Known values are :", *err); + for (i = 0; optional[i].name != NULL; i++) + memprintf(err, "%s %s", *err, optional[i].name); + } + return -1; + } return 0; } }
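
Usage sketch (illustration only, not part of the patch): assuming the "cpu-affinity" global directive documented above, the new optional argument is written right after the per-group value, for example:

    global
        # Give each thread group a dedicated, contiguous share of the CPUs
        # when a cluster has to be split across several groups (new default):
        cpu-affinity per-group auto

        # Historical behaviour: every group may still use any CPU of the
        # list, generally at the cost of more contention:
        # cpu-affinity per-group loose

With "auto", and assuming one thread per CPU and the typical 64-thread group limit, a cluster of 96 usable CPUs would be split into ceil(96/64) = 2 groups of ceil(96/2) = 48 dedicated CPUs each, following the nb_grp / cpu_per_group computation in cpu_policy_assign_threads().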