#define _GNU_SOURCE

#include <dirent.h>
#include <sched.h>
#include <string.h>
#include <unistd.h>

#include <haproxy/api.h>
#include <haproxy/cfgparse.h>
#include <haproxy/cpuset.h>
#include <haproxy/cpu_topo.h>
#include <haproxy/global.h>
#include <haproxy/log.h>
#include <haproxy/tools.h>

/* for cpu_set.flags below */
#define CPU_SET_FL_NONE 0x0000
#define CPU_SET_FL_DO_RESET 0x0001

/* CPU topology information, ha_cpuset_size() entries, allocated at boot */
int cpu_topo_maxcpus = -1; // max number of CPUs supported by OS/haproxy
int cpu_topo_lastcpu = -1; // last supposed online CPU (no need to look beyond)
struct ha_cpu_topo *ha_cpu_topo = NULL;
struct ha_cpu_cluster *ha_cpu_clusters = NULL;
struct cpu_map *cpu_map;

/* non-zero if we're certain that taskset or similar was used to force CPUs */
int cpu_mask_forced = 0;

/* "cpu-set" global configuration */
struct cpu_set_cfg {
uint flags; // CPU_SET_FL_XXX above
/* CPU numbers to accept / reject */
struct hap_cpuset only_cpus;
struct hap_cpuset drop_cpus;
/* node numbers to accept / reject */
struct hap_cpuset only_nodes;
struct hap_cpuset drop_nodes;
/* cluster numbers to accept / reject */
struct hap_cpuset only_clusters;
struct hap_cpuset drop_clusters;
/* core numbers to accept / reject */
struct hap_cpuset only_cores;
struct hap_cpuset drop_cores;
/* thread numbers to accept / reject */
struct hap_cpuset only_threads;
struct hap_cpuset drop_threads;
} cpu_set_cfg;
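
/* For illustration only (hypothetical values): a global section such as
*
*     cpu-set only-node 0
*     cpu-set drop-cpu 2-3
*
* ANDs node 0 into only_nodes and ORs CPUs 2-3 into drop_cpus via
* cfg_parse_cpu_set() below; the resulting sets are then applied by
* cpu_detect_usable() and cpu_refine_cpusets().
*/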

/* CPU policy choice */
static int cpu_policy = 1; // "first-usable-node"

/* list of CPU policies for "cpu-policy". The default one is "first-usable-node". */
static int cpu_policy_first_usable_node(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
static int cpu_policy_performance(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
static int cpu_policy_efficiency(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
static int cpu_policy_resource(int policy, int tmin, int tmax, int gmin, int gmax, char **err);

static struct ha_cpu_policy ha_cpu_policy[] = {
{ .name = "none", .desc = "use all available CPUs", .fct = NULL },
{ .name = "first-usable-node", .desc = "use only first usable node if nbthreads not set", .fct = cpu_policy_first_usable_node },
{ .name = "group-by-cluster", .desc = "make one thread group per core cluster", .fct = cpu_policy_group_by_cluster },
{ .name = "performance", .desc = "make one thread group per perf. core cluster", .fct = cpu_policy_performance },
{ .name = "efficiency", .desc = "make one thread group per eff. core cluster", .fct = cpu_policy_efficiency },
{ .name = "resource", .desc = "make one thread group from the smallest cluster", .fct = cpu_policy_resource },
{ 0 } /* end */
};
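
/* The policy is selected from the global section, e.g. "cpu-policy
* performance" (parsed by cfg_parse_cpu_policy() below); the default
* index 1 corresponds to "first-usable-node".
*/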
|
|
|
|
/* Detects CPUs that are online on the system. It may rely on FS access (e.g.
|
|
* /sys on Linux). Returns the number of CPUs detected or 0 if the detection
|
|
* failed.
|
|
*/
|
|
int ha_cpuset_detect_online(struct hap_cpuset *set)
|
|
{
|
|
#if defined(__linux__)
|
|
|
|
ha_cpuset_zero(set);
|
|
|
|
/* contains a list of CPUs in the format <low>[-<high>][,...] */
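/* e.g. "0-63" on a fully-online system, or "0-7,16-23" with offline holes
* (example values only)
*/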
|
|
if (read_line_to_trash("%s/cpu/online", NUMA_DETECT_SYSTEM_SYSFS_PATH) >= 0) {
|
|
const char *parse_cpu_set_args[2] = { trash.area, "\0" };
|
|
|
|
if (parse_cpu_set(parse_cpu_set_args, set, NULL) != 0)
|
|
ha_cpuset_zero(set);
|
|
}
|
|
|
|
#elif defined(__FreeBSD__)
|
|
|
|
struct hap_cpuset node_cpu_set;
|
|
int ndomains, domain;
|
|
size_t len = sizeof(ndomains);
|
|
|
|
ha_cpuset_zero(set);
|
|
|
|
/* retrieve the union of NUMA nodes as online CPUs */
|
|
if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == 0) {
|
|
BUG_ON(ndomains > MAXMEMDOM);
|
|
|
|
for (domain = 0; domain < ndomains; domain++) {
|
|
ha_cpuset_zero(&node_cpu_set);
|
|
|
|
if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_DOMAIN, domain,
|
|
sizeof(node_cpu_set.cpuset), &node_cpu_set.cpuset) == -1)
|
|
continue;
|
|
|
|
ha_cpuset_or(set, &node_cpu_set);
|
|
}
|
|
}
|
|
|
|
#else // !__linux__, !__FreeBSD__
|
|
|
|
ha_cpuset_zero(set);
|
|
|
|
#endif
|
|
return ha_cpuset_count(set);
|
|
}
|
|
|
|
/* Detects the CPUs that will be used based on the ones the process is bound to
|
|
* at boot. The principle is the following: all CPUs from the boot cpuset will
|
|
* be used since we don't know upfront how individual threads will be mapped to
|
|
* groups and CPUs.
|
|
*
|
|
* Returns non-zero on success, zero on failure. Note that it may not be
|
|
* performed in the function above because some calls may rely on other items
|
|
* being allocated (e.g. trash).
|
|
*/
|
|
int cpu_detect_usable(void)
|
|
{
|
|
struct hap_cpuset boot_set = { };
|
|
int cpu;
|
|
|
|
if (!(cpu_set_cfg.flags & CPU_SET_FL_DO_RESET)) {
|
|
/* update the list with the CPUs currently bound to the current process */
|
|
ha_cpuset_detect_bound(&boot_set);
|
|
|
|
/* remove the known-excluded CPUs */
|
|
for (cpu = 0; cpu < cpu_topo_maxcpus; cpu++)
|
|
if (!ha_cpuset_isset(&boot_set, cpu))
|
|
ha_cpu_topo[cpu].st |= HA_CPU_F_EXCLUDED;
|
|
}
|
|
|
|
/* remove CPUs in the drop-cpu set or not in the only-cpu set */
|
|
for (cpu = 0; cpu < cpu_topo_maxcpus; cpu++) {
|
|
if ( ha_cpuset_isset(&cpu_set_cfg.drop_cpus, cpu) ||
|
|
!ha_cpuset_isset(&cpu_set_cfg.only_cpus, cpu))
|
|
ha_cpu_topo[cpu].st |= HA_CPU_F_DONT_USE;
|
|
}
|
|
|
|
/* Update the list of currently offline CPUs. Normally it's a subset
|
|
* of the unbound ones, but we cannot infer anything if we don't have
|
|
* the info so we only update what we know. We take this opportunity
|
|
* for detecting that some online CPUs are not bound, indicating that
|
|
* taskset or equivalent was used.
|
|
*/
|
|
if (ha_cpuset_detect_online(&boot_set)) {
|
|
for (cpu = 0; cpu < cpu_topo_maxcpus; cpu++) {
|
|
if (!ha_cpuset_isset(&boot_set, cpu)) {
|
|
ha_cpu_topo[cpu].st |= HA_CPU_F_OFFLINE;
|
|
} else {
|
|
cpu_topo_lastcpu = cpu;
|
|
if (ha_cpu_topo[cpu].st & HA_CPU_F_EXCLUDED)
|
|
cpu_mask_forced = 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* Detects CPUs that are bound to the current process. Returns the number of
|
|
* CPUs detected or 0 if the detection failed.
|
|
*/
|
|
int ha_cpuset_detect_bound(struct hap_cpuset *set)
|
|
{
|
|
ha_cpuset_zero(set);
|
|
|
|
/* detect bound CPUs depending on the OS's API */
|
|
if (0
|
|
#if defined(__linux__)
|
|
|| sched_getaffinity(0, sizeof(set->cpuset), &set->cpuset) != 0
|
|
#elif defined(__FreeBSD__)
|
|
|| cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(set->cpuset), &set->cpuset) != 0
|
|
#else
|
|
|| 1 // unhandled platform
|
|
#endif
|
|
) {
|
|
/* detection failed */
|
|
return 0;
|
|
}
|
|
|
|
return ha_cpuset_count(set);
|
|
}
|
|
|
|
/* Returns true if at least one cpu-map directive was configured, otherwise
|
|
* false.
|
|
*/
|
|
int cpu_map_configured(void)
|
|
{
|
|
int grp, thr;
|
|
|
|
for (grp = 0; grp < MAX_TGROUPS; grp++) {
|
|
for (thr = 0; thr < MAX_THREADS_PER_GROUP; thr++)
|
|
if (ha_cpuset_count(&cpu_map[grp].thread[thr]))
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* Dump the CPU topology <topo> for up to cpu_topo_maxcpus CPUs for
|
|
* debugging purposes. Offline CPUs are skipped.
|
|
*/
|
|
void cpu_dump_topology(const struct ha_cpu_topo *topo)
|
|
{
|
|
int has_smt = 0;
|
|
int cpu, lvl;
|
|
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++)
|
|
if (ha_cpu_topo[cpu].th_cnt > 1)
|
|
has_smt = 1;
|
|
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].st & HA_CPU_F_OFFLINE)
|
|
continue;
|
|
|
|
printf("[%s] cpu=%3d pk=%02d no=%02d cl=%03d(%03d)",
|
|
(ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK) ? "----" : "keep",
|
|
ha_cpu_topo[cpu].idx,
|
|
ha_cpu_topo[cpu].pk_id,
|
|
ha_cpu_topo[cpu].no_id,
|
|
ha_cpu_topo[cpu].cl_gid,
|
|
ha_cpu_topo[cpu].cl_lid);
|
|
|
|
/* list only relevant cache levels */
|
|
for (lvl = 4; lvl >= 0; lvl--) {
|
|
if (ha_cpu_topo[cpu].ca_id[lvl] < 0)
|
|
continue;
|
|
printf(lvl < 3 ? " l%d=%02d" : " l%d=%03d", lvl, ha_cpu_topo[cpu].ca_id[lvl]);
|
|
}
|
|
|
|
printf(" ts=%03d capa=%d",
|
|
ha_cpu_topo[cpu].ts_id,
|
|
ha_cpu_topo[cpu].capa);
|
|
|
|
if (has_smt) {
|
|
if (ha_cpu_topo[cpu].th_cnt > 1)
|
|
printf(" smt=%d/%d",
|
|
ha_cpu_topo[cpu].th_id,
|
|
ha_cpu_topo[cpu].th_cnt);
|
|
else
|
|
printf(" smt=%d",
|
|
ha_cpu_topo[cpu].th_cnt);
|
|
}
|
|
putchar('\n');
|
|
}
|
|
|
|
printf("CPU clusters:\n");
|
|
for (cpu = 0; cpu < cpu_topo_maxcpus; cpu++) {
|
|
if (!ha_cpu_clusters[cpu].nb_cpu)
|
|
continue;
|
|
printf(" %3u cpus=%3u cores=%3u capa=%u\n",
|
|
cpu, ha_cpu_clusters[cpu].nb_cpu,
|
|
ha_cpu_clusters[cpu].nb_cores,
|
|
ha_cpu_clusters[cpu].capa);
|
|
}
|
|
}
|
|
|
|
/* function used by qsort to re-arrange CPUs by index only, to restore original
|
|
* ordering.
|
|
*/
|
|
int _cmp_cpu_index(const void *a, const void *b)
|
|
{
|
|
const struct ha_cpu_topo *l = (const struct ha_cpu_topo *)a;
|
|
const struct ha_cpu_topo *r = (const struct ha_cpu_topo *)b;
|
|
|
|
/* next, IDX, so that SMT ordering is preserved */
|
|
if (l->idx >= 0 && l->idx < r->idx)
|
|
return -1;
|
|
if (l->idx > r->idx && r->idx >= 0)
|
|
return 1;
|
|
|
|
/* exactly the same (e.g. absent, should not happen) */
|
|
return 0;
|
|
}
|
|
|
|
/* function used by qsort to compare two hwcpus and arrange them by vicinity
|
|
* only. -1 says a<b, 1 says a>b. The goal is to arrange the closest CPUs
|
|
* together, preferring locality over performance in order to keep latency
|
|
* as low as possible, so that when picking a fixed number of threads, the
|
|
* closest ones are used in priority. It's also used to help arranging groups
|
|
* at the end.
|
|
*/
|
|
int _cmp_cpu_locality(const void *a, const void *b)
|
|
{
|
|
const struct ha_cpu_topo *l = (const struct ha_cpu_topo *)a;
|
|
const struct ha_cpu_topo *r = (const struct ha_cpu_topo *)b;
|
|
|
|
/* first, online vs offline */
|
|
if (!(l->st & HA_CPU_F_EXCL_MASK) && (r->st & HA_CPU_F_EXCL_MASK))
|
|
return -1;
|
|
|
|
if (!(r->st & HA_CPU_F_EXCL_MASK) && (l->st & HA_CPU_F_EXCL_MASK))
|
|
return 1;
|
|
|
|
/* next, package ID */
|
|
if (l->pk_id >= 0 && l->pk_id < r->pk_id)
|
|
return -1;
|
|
if (l->pk_id > r->pk_id && r->pk_id >= 0)
|
|
return 1;
|
|
|
|
/* next, node ID */
|
|
if (l->no_id >= 0 && l->no_id < r->no_id)
|
|
return -1;
|
|
if (l->no_id > r->no_id && r->no_id >= 0)
|
|
return 1;
|
|
|
|
/* next, L4 */
|
|
if (l->ca_id[4] >= 0 && l->ca_id[4] < r->ca_id[4])
|
|
return -1;
|
|
if (l->ca_id[4] > r->ca_id[4] && r->ca_id[4] >= 0)
|
|
return 1;
|
|
|
|
/* next, L3 */
|
|
if (l->ca_id[3] >= 0 && l->ca_id[3] < r->ca_id[3])
|
|
return -1;
|
|
if (l->ca_id[3] > r->ca_id[3] && r->ca_id[3] >= 0)
|
|
return 1;
|
|
|
|
/* next, cluster */
|
|
if (l->cl_gid >= 0 && l->cl_gid < r->cl_gid)
|
|
return -1;
|
|
if (l->cl_gid > r->cl_gid && r->cl_gid >= 0)
|
|
return 1;
|
|
|
|
/* next, L2 */
|
|
if (l->ca_id[2] >= 0 && l->ca_id[2] < r->ca_id[2])
|
|
return -1;
|
|
if (l->ca_id[2] > r->ca_id[2] && r->ca_id[2] >= 0)
|
|
return 1;
|
|
|
|
/* next, thread set */
|
|
if (l->ts_id >= 0 && l->ts_id < r->ts_id)
|
|
return -1;
|
|
if (l->ts_id > r->ts_id && r->ts_id >= 0)
|
|
return 1;
|
|
|
|
/* next, L1 */
|
|
if (l->ca_id[1] >= 0 && l->ca_id[1] < r->ca_id[1])
|
|
return -1;
|
|
if (l->ca_id[1] > r->ca_id[1] && r->ca_id[1] >= 0)
|
|
return 1;
|
|
|
|
/* next, L0 */
|
|
if (l->ca_id[0] >= 0 && l->ca_id[0] < r->ca_id[0])
|
|
return -1;
|
|
if (l->ca_id[0] > r->ca_id[0] && r->ca_id[0] >= 0)
|
|
return 1;
|
|
|
|
/* next, IDX, so that SMT ordering is preserved */
|
|
if (l->idx >= 0 && l->idx < r->idx)
|
|
return -1;
|
|
if (l->idx > r->idx && r->idx >= 0)
|
|
return 1;
|
|
|
|
/* exactly the same (e.g. absent) */
|
|
return 0;
|
|
}
|
|
|
|
/* function used by qsort to compare two hwcpus and arrange them by vicinity
|
|
* then capacity. -1 says a<b, 1 says a>b. The goal is to detect different
|
|
* CPU capacities among clusters.
|
|
*/
|
|
int _cmp_cpu_cluster_capa(const void *a, const void *b)
|
|
{
|
|
const struct ha_cpu_topo *l = (const struct ha_cpu_topo *)a;
|
|
const struct ha_cpu_topo *r = (const struct ha_cpu_topo *)b;
|
|
|
|
/* first, online vs offline */
|
|
if (!(l->st & HA_CPU_F_EXCL_MASK) && (r->st & HA_CPU_F_EXCL_MASK))
|
|
return -1;
|
|
|
|
if (!(r->st & HA_CPU_F_EXCL_MASK) && (l->st & HA_CPU_F_EXCL_MASK))
|
|
return 1;
|
|
|
|
/* next, package ID */
|
|
if (l->pk_id >= 0 && l->pk_id < r->pk_id)
|
|
return -1;
|
|
if (l->pk_id > r->pk_id && r->pk_id >= 0)
|
|
return 1;
|
|
|
|
/* next, node ID */
|
|
if (l->no_id >= 0 && l->no_id < r->no_id)
|
|
return -1;
|
|
if (l->no_id > r->no_id && r->no_id >= 0)
|
|
return 1;
|
|
|
|
/* next, L4 */
|
|
if (l->ca_id[4] >= 0 && l->ca_id[4] < r->ca_id[4])
|
|
return -1;
|
|
if (l->ca_id[4] > r->ca_id[4] && r->ca_id[4] >= 0)
|
|
return 1;
|
|
|
|
/* next, L3 */
|
|
if (l->ca_id[3] >= 0 && l->ca_id[3] < r->ca_id[3])
|
|
return -1;
|
|
if (l->ca_id[3] > r->ca_id[3] && r->ca_id[3] >= 0)
|
|
return 1;
|
|
|
|
/* next, cluster */
|
|
if (l->cl_gid >= 0 && l->cl_gid < r->cl_gid)
|
|
return -1;
|
|
if (l->cl_gid > r->cl_gid && r->cl_gid >= 0)
|
|
return 1;
|
|
|
|
/* Same cluster. For CPU capacity however, we tolerate a +/- 5% margin,
* so that if some values come from measurements we don't end up
* reorganizing everything.
*/
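/* Worked example (hypothetical capacities): l->capa=110 vs r->capa=100
* gives 110*19=2090 > 100*20=2000, so l is deemed faster and sorts first;
* l->capa=100 vs r->capa=96 gives 100*19=1900 <= 96*20=1920, so both fall
* within the margin and the comparison continues on the next criteria.
*/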
|
|
if (l->capa > 0 && (int)l->capa * 19 > (int)r->capa * 20)
|
|
return -1;
|
|
if (r->capa > 0 && (int)l->capa * 20 < (int)r->capa * 19)
|
|
return 1;
|
|
|
|
/* next, L2 */
|
|
if (l->ca_id[2] >= 0 && l->ca_id[2] < r->ca_id[2])
|
|
return -1;
|
|
if (l->ca_id[2] > r->ca_id[2] && r->ca_id[2] >= 0)
|
|
return 1;
|
|
|
|
/* next, thread set */
|
|
if (l->ts_id >= 0 && l->ts_id < r->ts_id)
|
|
return -1;
|
|
if (l->ts_id > r->ts_id && r->ts_id >= 0)
|
|
return 1;
|
|
|
|
/* next, L1 */
|
|
if (l->ca_id[1] >= 0 && l->ca_id[1] < r->ca_id[1])
|
|
return -1;
|
|
if (l->ca_id[1] > r->ca_id[1] && r->ca_id[1] >= 0)
|
|
return 1;
|
|
|
|
/* next, L0 */
|
|
if (l->ca_id[0] >= 0 && l->ca_id[0] < r->ca_id[0])
|
|
return -1;
|
|
if (l->ca_id[0] > r->ca_id[0] && r->ca_id[0] >= 0)
|
|
return 1;
|
|
|
|
/* next, IDX, so that SMT ordering is preserved */
|
|
if (l->idx >= 0 && l->idx < r->idx)
|
|
return -1;
|
|
if (l->idx > r->idx && r->idx >= 0)
|
|
return 1;
|
|
|
|
/* exactly the same */
|
|
return 0;
|
|
}
|
|
|
|
/* function used by qsort to compare two hwcpus and arrange them by cluster to
|
|
* make sure no cluster crosses L3 boundaries. -1 says a<b, 1 says a>b. It's
|
|
* only used during topology detection.
|
|
*/
|
|
int _cmp_cpu_cluster(const void *a, const void *b)
|
|
{
|
|
const struct ha_cpu_topo *l = (const struct ha_cpu_topo *)a;
|
|
const struct ha_cpu_topo *r = (const struct ha_cpu_topo *)b;
|
|
|
|
/* first, online vs offline */
|
|
if (!(l->st & HA_CPU_F_EXCL_MASK) && (r->st & HA_CPU_F_EXCL_MASK))
|
|
return -1;
|
|
|
|
if (!(r->st & HA_CPU_F_EXCL_MASK) && (l->st & HA_CPU_F_EXCL_MASK))
|
|
return 1;
|
|
|
|
/* next, cluster */
|
|
if (l->cl_gid >= 0 && l->cl_gid < r->cl_gid)
|
|
return -1;
|
|
if (l->cl_gid > r->cl_gid && r->cl_gid >= 0)
|
|
return 1;
|
|
|
|
/* next, package ID */
|
|
if (l->pk_id >= 0 && l->pk_id < r->pk_id)
|
|
return -1;
|
|
if (l->pk_id > r->pk_id && r->pk_id >= 0)
|
|
return 1;
|
|
|
|
/* next, node ID */
|
|
if (l->no_id >= 0 && l->no_id < r->no_id)
|
|
return -1;
|
|
if (l->no_id > r->no_id && r->no_id >= 0)
|
|
return 1;
|
|
|
|
/* next, L3 */
|
|
if (l->ca_id[3] >= 0 && l->ca_id[3] < r->ca_id[3])
|
|
return -1;
|
|
if (l->ca_id[3] > r->ca_id[3] && r->ca_id[3] >= 0)
|
|
return 1;
|
|
|
|
/* if no L3, then L2 */
|
|
if (l->ca_id[2] >= 0 && l->ca_id[2] < r->ca_id[2])
|
|
return -1;
|
|
if (l->ca_id[2] > r->ca_id[2] && r->ca_id[2] >= 0)
|
|
return 1;
|
|
|
|
/* next, IDX, so that SMT ordering is preserved */
|
|
if (l->idx >= 0 && l->idx < r->idx)
|
|
return -1;
|
|
if (l->idx > r->idx && r->idx >= 0)
|
|
return 1;
|
|
|
|
/* exactly the same (e.g. absent) */
|
|
return 0;
|
|
}
|
|
|
|
/* re-order a CPU topology array by CPU index only. This is mostly used before
|
|
* listing CPUs regardless of their characteristics.
|
|
*/
|
|
void cpu_reorder_by_index(struct ha_cpu_topo *topo, int entries)
|
|
{
|
|
qsort(topo, entries, sizeof(*topo), _cmp_cpu_index);
|
|
}
|
|
|
|
/* re-order a CPU topology array by locality to help form groups. */
|
|
void cpu_reorder_by_locality(struct ha_cpu_topo *topo, int entries)
|
|
{
|
|
qsort(topo, entries, sizeof(*topo), _cmp_cpu_locality);
|
|
}
|
|
|
|
/* re-order a CPU topology array by cluster id. */
|
|
void cpu_reorder_by_cluster(struct ha_cpu_topo *topo, int entries)
|
|
{
|
|
qsort(topo, entries, sizeof(*topo), _cmp_cpu_cluster);
|
|
}
|
|
|
|
/* re-order a CPU topology array by locality and capacity to detect clusters. */
|
|
void cpu_reorder_by_cluster_capa(struct ha_cpu_topo *topo, int entries)
|
|
{
|
|
qsort(topo, entries, sizeof(*topo), _cmp_cpu_cluster_capa);
|
|
}
|
|
|
|
/* functions below act on ha_cpu_cluster structs */
|
|
|
|
/* function used by qsort to reorder clusters by index */
|
|
int _cmp_cluster_index(const void *a, const void *b)
|
|
{
|
|
const struct ha_cpu_cluster *l = (const struct ha_cpu_cluster *)a;
|
|
const struct ha_cpu_cluster *r = (const struct ha_cpu_cluster *)b;
|
|
return l->idx - r->idx;
|
|
}
|
|
|
|
/* function used by qsort to order clusters by reverse capacity */
|
|
int _cmp_cluster_capa(const void *a, const void *b)
|
|
{
|
|
const struct ha_cpu_cluster *l = (const struct ha_cpu_cluster *)a;
|
|
const struct ha_cpu_cluster *r = (const struct ha_cpu_cluster *)b;
|
|
return r->capa - l->capa;
|
|
}
|
|
|
|
/* re-order a cluster array by cluster index only */
|
|
void cpu_cluster_reorder_by_index(struct ha_cpu_cluster *clusters, int entries)
|
|
{
|
|
qsort(clusters, entries, sizeof(*clusters), _cmp_cluster_index);
|
|
}
|
|
|
|
/* re-order a cluster array by decreasing total capacity */
|
|
void cpu_cluster_reorder_by_capa(struct ha_cpu_cluster *clusters, int entries)
|
|
{
|
|
qsort(clusters, entries, sizeof(*clusters), _cmp_cluster_capa);
|
|
}
|
|
|
|
/* returns an optimal maxcpus for the current system. It will take into
|
|
* account what is reported by the OS, if any, otherwise will fall back
|
|
* to the cpuset size, which serves as an upper limit in any case.
|
|
*/
|
|
static int cpu_topo_get_maxcpus(void)
|
|
{
|
|
int abs_max = ha_cpuset_size();
|
|
|
|
#if defined(_SC_NPROCESSORS_CONF)
|
|
int n = (int)sysconf(_SC_NPROCESSORS_CONF);
|
|
|
|
if (n > 0 && n <= abs_max)
|
|
return n;
|
|
#endif
|
|
return abs_max;
|
|
}
|
|
|
|
/* This function is responsible for trying to fill in the missing info after
|
|
* topology detection and making sure we don't leave any ID at -1, but rather
|
|
* we assign unused ones.
|
|
*/
|
|
void cpu_fixup_topology(void)
|
|
{
|
|
struct hap_cpuset cpuset;
|
|
int cpu, cpu2;
|
|
int curr_id, prev_id;
|
|
int min_id, neg;
|
|
int cl_cpu, small_cl;
|
|
|
|
/* fill the package id, node id and thread_id. First we'll build a bitmap
|
|
* of all unassigned ones so that we can spot the lowest unassigned one
|
|
* and assign it to those currently set to -1.
|
|
*/
|
|
|
|
/* package id */
|
|
ha_cpuset_zero(&cpuset);
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++)
|
|
ha_cpuset_set(&cpuset, cpu);
|
|
|
|
for (cpu = neg = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].pk_id < 0)
|
|
neg++;
|
|
else
|
|
ha_cpuset_clr(&cpuset, ha_cpu_topo[cpu].pk_id);
|
|
}
|
|
|
|
/* get the first unused pkg id */
|
|
min_id = ha_cpuset_ffs(&cpuset) - 1;
|
|
for (cpu = 0; neg && cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].pk_id < 0) {
|
|
ha_cpu_topo[cpu].pk_id = min_id;
|
|
neg--;
|
|
}
|
|
}
|
|
|
|
/* node id */
|
|
ha_cpuset_zero(&cpuset);
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++)
|
|
ha_cpuset_set(&cpuset, cpu);
|
|
|
|
for (cpu = neg = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].no_id < 0)
|
|
neg++;
|
|
else
|
|
ha_cpuset_clr(&cpuset, ha_cpu_topo[cpu].no_id);
|
|
}
|
|
|
|
/* get the first unused node id */
|
|
min_id = ha_cpuset_ffs(&cpuset) - 1;
|
|
for (cpu = 0; neg && cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].no_id < 0) {
|
|
ha_cpu_topo[cpu].no_id = min_id;
|
|
neg--;
|
|
}
|
|
}
|
|
|
|
/* thread id */
|
|
ha_cpuset_zero(&cpuset);
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++)
|
|
ha_cpuset_set(&cpuset, cpu);
|
|
|
|
for (cpu = neg = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].th_id < 0)
|
|
neg++;
|
|
else
|
|
ha_cpuset_clr(&cpuset, ha_cpu_topo[cpu].th_id);
|
|
}
|
|
|
|
/* get the first unused thr id */
|
|
min_id = ha_cpuset_ffs(&cpuset) - 1;
|
|
for (cpu = 0; neg && cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].th_id < 0) {
|
|
ha_cpu_topo[cpu].th_id = min_id;
|
|
ha_cpu_topo[cpu].th_cnt = min_id + 1;
|
|
neg--;
|
|
}
|
|
}
|
|
|
|
/* Some machines (typically ARM cortex A76 and Neoverse-N1) report 1
|
|
* cluster per pair of cores due to the internal architecture. While
|
|
* this can occasionally make sense (e.g. big.LITTLE etc), when there
|
|
* are many clusters of few cores, this is totally pointless. Here
|
|
* we'll check if there are at least 4 2-cpu clusters, and if so, all
|
|
* the 2-cpu clusters will be cancelled.
|
|
*/
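/* For example (hypothetical topology): an 8-core part reported as four
* 2-cpu clusters yields small_cl=4, so all these cluster IDs are reset
* to -1 below and clusters are later rebuilt in cpu_compose_clusters().
*/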
|
|
cpu_reorder_by_cluster(ha_cpu_topo, cpu_topo_maxcpus);
|
|
|
|
curr_id = -1;
|
|
cl_cpu = small_cl = 0;
|
|
for (cpu = cpu2 = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].cl_gid < 0)
|
|
continue;
|
|
|
|
if (ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK)
|
|
continue;
|
|
|
|
if (ha_cpu_topo[cpu].cl_gid != curr_id) {
|
|
if (curr_id >= 0 && cl_cpu <= 2)
|
|
small_cl++;
|
|
cl_cpu = 0;
|
|
cpu2 = cpu;
|
|
curr_id = ha_cpu_topo[cpu].cl_gid;
|
|
}
|
|
cl_cpu++;
|
|
}
|
|
|
|
/* last one */
|
|
if (cl_cpu && cl_cpu <= 2)
|
|
small_cl++;
|
|
|
|
/* here we have the number of small clusters (<=2 cpu) in small_cl */
|
|
if (small_cl >= 4) {
|
|
for (cpu = cpu2 = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].cl_gid < 0)
|
|
continue;
|
|
|
|
if (ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK)
|
|
continue;
|
|
|
|
if (ha_cpu_topo[cpu].cl_gid != curr_id) {
|
|
if (curr_id >= 0 && cl_cpu <= 2) {
|
|
/* small cluster found for curr_id */
|
|
while (cpu2 < cpu) {
|
|
if (ha_cpu_topo[cpu2].cl_gid == curr_id)
|
|
ha_cpu_topo[cpu2].cl_gid = -1;
|
|
cpu2++;
|
|
}
|
|
}
|
|
cl_cpu = 0;
|
|
cpu2 = cpu;
|
|
curr_id = ha_cpu_topo[cpu].cl_gid;
|
|
}
|
|
cl_cpu++;
|
|
}
|
|
|
|
/* handle the last cluster */
|
|
while (curr_id >= 0 && cl_cpu <= 2 && cpu2 < cpu) {
|
|
if (ha_cpu_topo[cpu2].cl_gid == curr_id)
|
|
ha_cpu_topo[cpu2].cl_gid = -1;
|
|
cpu2++;
|
|
}
|
|
}
|
|
|
|
cpu_reorder_by_index(ha_cpu_topo, cpu_topo_maxcpus);
|
|
|
|
/* assign capacity if not filled, based on the number of threads on the
* core: in the same package, SMT-capable cores are generally those
* optimized for performance while non-SMT ones are generally those
* optimized for efficiency. We'll reflect that by assigning 100 and 50
* respectively to those.
*/
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].capa < 0)
|
|
ha_cpu_topo[cpu].capa = (ha_cpu_topo[cpu].th_cnt > 1) ? 100 : 50;
|
|
}
|
|
|
|
/* First, on some machines, L3 is not reported. But some also don't
|
|
* have L3. However, no L3 when there are more than 2 L2 is quite
|
|
* unheard of, and while we don't really care about creating 2 groups for
|
|
* 2 L2, we'd rather avoid this if there are 8! In this case we'll add
|
|
* an L3 instance to fix the situation.
|
|
*/
|
|
cpu_reorder_by_locality(ha_cpu_topo, cpu_topo_maxcpus);
|
|
|
|
prev_id = -2; // make sure it cannot match even unassigned ones
|
|
curr_id = -1;
|
|
for (cpu = cpu2 = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].ca_id[3] >= 0)
|
|
continue;
|
|
|
|
/* L3 not assigned, count L2 instances */
|
|
if (!cpu ||
|
|
(ha_cpu_topo[cpu].pk_id != ha_cpu_topo[cpu-1].pk_id) ||
|
|
(ha_cpu_topo[cpu].no_id != ha_cpu_topo[cpu-1].no_id) ||
|
|
(ha_cpu_topo[cpu].ca_id[4] != ha_cpu_topo[cpu-1].ca_id[4])) {
|
|
curr_id = 0;
|
|
prev_id = -2;
|
|
cpu2 = cpu;
|
|
}
|
|
else if (ha_cpu_topo[cpu].ca_id[2] != prev_id) {
|
|
curr_id++;
|
|
if (curr_id >= 2) {
|
|
/* let's assign L3 id to zero for all those.
|
|
* We can go till the end since we'll just skip
|
|
* them on next passes above.
|
|
*/
|
|
for (; cpu2 <= cpu_topo_lastcpu; cpu2++) {
|
|
if (ha_cpu_topo[cpu2].ca_id[3] < 0 &&
|
|
ha_cpu_topo[cpu2].pk_id == ha_cpu_topo[cpu].pk_id &&
|
|
ha_cpu_topo[cpu2].no_id == ha_cpu_topo[cpu].no_id &&
|
|
ha_cpu_topo[cpu2].ca_id[4] == ha_cpu_topo[cpu].ca_id[4])
|
|
ha_cpu_topo[cpu2].ca_id[3] = 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* let's make core numbers contiguous and per (pkg,node) as well, as
|
|
* holes may exist due to SMT.
|
|
*/
|
|
prev_id = -2; // make sure it cannot match even unassigned ones
|
|
curr_id = -1;
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
/* renumber cores (thread sets) and assign unassigned ones at the
* same time. For this, we'll compare pkg/die/llc with the last
* CPU's and verify if we need to create a new core ID.
* Note that some platforms don't report cache. The value is
* local to the pkg+node combination so that we reset it when
* changing.
*/
|
|
if (!cpu ||
|
|
(ha_cpu_topo[cpu].pk_id != ha_cpu_topo[cpu-1].pk_id) ||
|
|
(ha_cpu_topo[cpu].no_id != ha_cpu_topo[cpu-1].no_id)) {
|
|
curr_id = 0;
|
|
}
|
|
else if (ha_cpu_topo[cpu].ts_id != prev_id ||
|
|
ha_cpu_topo[cpu].ca_id[4] != ha_cpu_topo[cpu-1].ca_id[4] ||
|
|
(ha_cpu_topo[cpu].ca_id[4] < 0 && // no l4 ? check L3
|
|
((ha_cpu_topo[cpu].ca_id[3] != ha_cpu_topo[cpu-1].ca_id[3]) ||
|
|
(ha_cpu_topo[cpu].ca_id[3] < 0 && // no l3 ? check L2
|
|
(ha_cpu_topo[cpu].ca_id[2] != ha_cpu_topo[cpu-1].ca_id[2]))))) {
|
|
curr_id++;
|
|
}
|
|
prev_id = ha_cpu_topo[cpu].ts_id;
|
|
ha_cpu_topo[cpu].ts_id = curr_id;
|
|
}
|
|
|
|
cpu_reorder_by_index(ha_cpu_topo, cpu_topo_maxcpus);
|
|
}
|
|
|
|
/* This function is responsible for composing clusters based on existing info
|
|
* on the CPU topology.
|
|
*/
|
|
void cpu_compose_clusters(void)
|
|
{
|
|
int cpu, core;
|
|
int curr_gid, prev_gid;
|
|
int curr_lid, prev_lid;
|
|
|
|
/* Now we'll sort CPUs by topology/cluster/capacity and assign cluster
* IDs to those that don't have one, based on the die/pkg/llc, and
* double-check that capacity within a cluster doesn't vary by more
* than +/- 5%, otherwise it indicates different clusters (typically
* big.little).
*/
|
|
cpu_reorder_by_cluster_capa(ha_cpu_topo, cpu_topo_maxcpus);
|
|
|
|
prev_gid = prev_lid = -2; // make sure it cannot match even unassigned ones
|
|
curr_gid = curr_lid = -1;
|
|
core = -1;
|
|
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
/* renumber clusters and assign unassigned ones at the same
|
|
* time. For this, we'll compare pkg/die/llc with the last
|
|
* CPU's and verify if we need to create a new cluster ID.
|
|
* Note that some platforms don't report cache. The local value
|
|
* is local to the pkg+node combination so that we reset it
|
|
* when changing, contrary to the global one which grows.
|
|
*/
|
|
if (!cpu ||
|
|
(ha_cpu_topo[cpu].pk_id != ha_cpu_topo[cpu-1].pk_id) ||
|
|
(ha_cpu_topo[cpu].no_id != ha_cpu_topo[cpu-1].no_id)) {
|
|
curr_gid++;
|
|
curr_lid = 0;
|
|
core = -1;
|
|
}
|
|
else if (ha_cpu_topo[cpu].cl_gid != prev_gid ||
|
|
ha_cpu_topo[cpu].ca_id[4] != ha_cpu_topo[cpu-1].ca_id[4] ||
|
|
(ha_cpu_topo[cpu].ca_id[4] < 0 && // no l4 ? check L3
|
|
((ha_cpu_topo[cpu].ca_id[3] != ha_cpu_topo[cpu-1].ca_id[3]) ||
|
|
(ha_cpu_topo[cpu].ca_id[3] < 0 && // no l3 ? check L2
|
|
(ha_cpu_topo[cpu].ca_id[2] != ha_cpu_topo[cpu-1].ca_id[2])))) ||
|
|
(ha_cpu_topo[cpu].capa > 0 && ha_cpu_topo[cpu-1].capa > 0 &&
|
|
(ha_cpu_topo[cpu].capa * 100 < ha_cpu_topo[cpu-1].capa * 95 ||
|
|
ha_cpu_topo[cpu].capa * 95 > ha_cpu_topo[cpu-1].capa * 100))) {
|
|
curr_gid++;
|
|
curr_lid++;
|
|
}
|
|
prev_gid = ha_cpu_topo[cpu].cl_gid;
|
|
prev_lid = ha_cpu_topo[cpu].cl_lid;
|
|
ha_cpu_topo[cpu].cl_gid = curr_gid;
|
|
ha_cpu_topo[cpu].cl_lid = curr_lid;
|
|
|
|
/* update per-cluster info */
|
|
if (!(ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK)) {
|
|
ha_cpu_clusters[curr_gid].nb_cpu++;
|
|
if (ha_cpu_topo[cpu].ts_id != core) {
|
|
/* new core for this cluster */
|
|
ha_cpu_clusters[curr_gid].nb_cores++;
|
|
ha_cpu_clusters[curr_gid].capa += ha_cpu_topo[cpu].capa;
|
|
core = ha_cpu_topo[cpu].ts_id;
|
|
} else {
|
|
/* tests show that it's reasonable to expect
|
|
* ~+33% for an extra thread on the same core.
|
|
*/
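/* e.g. (hypothetical values) a 2-thread core with capa=1024 adds
* 1024 + 1024/3 = 1365 to the cluster, versus 2048 for two distinct
* single-thread cores of the same capacity.
*/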
|
|
ha_cpu_clusters[curr_gid].capa += ha_cpu_topo[cpu].capa / 3;
|
|
}
|
|
}
|
|
}
|
|
|
|
cpu_reorder_by_index(ha_cpu_topo, cpu_topo_maxcpus);
|
|
}
|
|
|
|
/* apply remaining topology-based cpu set restrictions */
|
|
void cpu_refine_cpusets(void)
|
|
{
|
|
int cpu;
|
|
|
|
/* remove CPUs in the drop-node set or not in the only-node set */
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if ( ha_cpuset_isset(&cpu_set_cfg.drop_nodes, ha_cpu_topo[cpu].no_id) ||
|
|
!ha_cpuset_isset(&cpu_set_cfg.only_nodes, ha_cpu_topo[cpu].no_id))
|
|
ha_cpu_topo[cpu].st |= HA_CPU_F_DONT_USE;
|
|
}
|
|
|
|
/* remove CPUs in the drop-cluster set or not in the only-cluster set */
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if ( ha_cpuset_isset(&cpu_set_cfg.drop_clusters, ha_cpu_topo[cpu].cl_lid) ||
|
|
!ha_cpuset_isset(&cpu_set_cfg.only_clusters, ha_cpu_topo[cpu].cl_lid))
|
|
ha_cpu_topo[cpu].st |= HA_CPU_F_DONT_USE;
|
|
}
|
|
|
|
/* remove CPUs in the drop-core set or not in the only-core set */
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if ( ha_cpuset_isset(&cpu_set_cfg.drop_cores, ha_cpu_topo[cpu].ts_id) ||
|
|
!ha_cpuset_isset(&cpu_set_cfg.only_cores, ha_cpu_topo[cpu].ts_id))
|
|
ha_cpu_topo[cpu].st |= HA_CPU_F_DONT_USE;
|
|
}
|
|
|
|
/* remove CPUs in the drop-thread set or not in the only-thread set */
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if ( ha_cpuset_isset(&cpu_set_cfg.drop_threads, ha_cpu_topo[cpu].th_id) ||
|
|
!ha_cpuset_isset(&cpu_set_cfg.only_threads, ha_cpu_topo[cpu].th_id))
|
|
ha_cpu_topo[cpu].st |= HA_CPU_F_DONT_USE;
|
|
}
|
|
}
|
|
|
|
/* the "first-usable-node" cpu-policy: historical one
|
|
* - does nothing if numa_cpu_mapping is not set
|
|
* - does nothing if nbthread is set
|
|
* - does nothing if the set of CPUs had been set manually using taskset
|
|
* - does nothing if the first node couldn't be determined
|
|
* Otherwise ignores all CPUs not on the first node.
|
|
*/
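/* Example (hypothetical machine): on a 2-socket system with 32 usable
* CPUs per node and neither "nbthread" nor taskset set, the second node's
* CPUs are flagged HA_CPU_F_IGNORED, all thread groups are mapped to the
* 32 CPUs of the first node, and nbthread ends up at 32 (subject to
* tmin/tmax).
*/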
|
|
static int cpu_policy_first_usable_node(int policy, int tmin, int tmax, int gmin, int gmax, char **err)
|
|
{
|
|
struct hap_cpuset node_cpu_set;
|
|
int first_node_id = -1;
|
|
int second_node_id = -1;
|
|
int cpu;
|
|
int cpu_count;
|
|
int grp, thr;
|
|
|
|
if (!global.numa_cpu_mapping)
|
|
return 0;
|
|
|
|
if (global.nbthread)
|
|
return 0;
|
|
|
|
if (cpu_mask_forced)
|
|
return 0;
|
|
|
|
/* determine first and second nodes with usable CPUs */
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK)
|
|
continue;
|
|
|
|
if (ha_cpu_topo[cpu].no_id >= 0 &&
|
|
ha_cpu_topo[cpu].no_id != first_node_id) {
|
|
if (first_node_id < 0)
|
|
first_node_id = ha_cpu_topo[cpu].no_id;
|
|
else {
|
|
second_node_id = ha_cpu_topo[cpu].no_id;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* no information found on a second node */
|
|
if (second_node_id < 0)
|
|
return 0;
|
|
|
|
/* ignore all CPUs of other nodes, count the remaining valid ones,
|
|
* and make a CPU set of them.
|
|
*/
|
|
ha_cpuset_zero(&node_cpu_set);
|
|
for (cpu = cpu_count = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].no_id != first_node_id)
|
|
ha_cpu_topo[cpu].st |= HA_CPU_F_IGNORED;
|
|
else if (!(ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK)) {
|
|
ha_cpuset_set(&node_cpu_set, ha_cpu_topo[cpu].idx);
|
|
cpu_count++;
|
|
}
|
|
}
|
|
|
|
/* assign all threads of all thread groups to this node */
|
|
for (grp = 0; grp < MAX_TGROUPS; grp++)
|
|
for (thr = 0; thr < MAX_THREADS_PER_GROUP; thr++)
|
|
ha_cpuset_assign(&cpu_map[grp].thread[thr], &node_cpu_set);
|
|
|
|
if (tmin <= cpu_count && cpu_count < tmax)
|
|
tmax = cpu_count;
|
|
|
|
ha_diag_warning("Multi-socket cpu detected, automatically binding on active CPUs of '%d' (%u active cpu(s))\n", first_node_id, cpu_count);
|
|
|
|
if (!global.nbthread)
|
|
global.nbthread = tmax;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* the "group-by-cluster" cpu-policy:
|
|
* - does nothing if nbthread or thread-groups are set
|
|
* - otherwise tries to create one thread-group per cluster, with as many
|
|
* threads as CPUs in the cluster, and bind all the threads of this group
|
|
* to all the CPUs of the cluster.
|
|
*/
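/* Example (hypothetical big.LITTLE SoC): with one 8-CPU cluster and one
* 4-CPU cluster, this creates two thread groups of 8 and 4 threads, each
* group being bound to all the CPUs of its own cluster.
*/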
|
|
static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, int gmax, char **err)
|
|
{
|
|
struct hap_cpuset node_cpu_set;
|
|
int cpu, cpu_start;
|
|
int cpu_count;
|
|
int cid, lcid;
|
|
int thr_per_grp, nb_grp;
|
|
int thr;
|
|
|
|
if (global.nbthread)
|
|
return 0;
|
|
|
|
if (global.nbtgroups)
|
|
return 0;
|
|
|
|
/* iterate over each new cluster */
|
|
lcid = -1;
|
|
cpu_start = 0;
|
|
while (global.nbtgroups < MAX_TGROUPS && global.nbthread < MAX_THREADS) {
|
|
ha_cpuset_zero(&node_cpu_set);
|
|
cid = -1; cpu_count = 0;
|
|
|
|
for (cpu = cpu_start; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
/* skip disabled and already visited CPUs */
|
|
if (ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK)
|
|
continue;
|
|
if (ha_cpu_topo[cpu].cl_gid <= lcid)
|
|
continue;
|
|
|
|
if (cid < 0) {
|
|
cid = ha_cpu_topo[cpu].cl_gid;
|
|
cpu_start = cpu + 1;
|
|
}
|
|
else if (cid != ha_cpu_topo[cpu].cl_gid)
|
|
continue;
|
|
|
|
/* make a mask of all of this cluster's CPUs */
|
|
ha_cpuset_set(&node_cpu_set, ha_cpu_topo[cpu].idx);
|
|
cpu_count++;
|
|
}
|
|
/* now cid = next cluster_id or -1 if none; cpu_count is the
|
|
* number of CPUs in this cluster, and cpu_start is the next
|
|
* cpu to restart from to scan for new clusters.
|
|
*/
|
|
if (cid < 0 || !cpu_count)
|
|
break;
|
|
|
|
/* check that we're still within limits. If there are too many
|
|
* CPUs but enough groups left, we'll try to make more smaller
|
|
* groups, of the closest size each.
|
|
*/
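/* Worked example (hypothetical limits): a 96-CPU cluster with
* MAX_THREADS_PER_GROUP=64 gives nb_grp = (96+64-1)/64 = 2 and
* thr_per_grp = (96+2-1)/2 = 48, i.e. two groups of 48 threads rather
* than one of 64 and one of 32.
*/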
|
|
nb_grp = (cpu_count + MAX_THREADS_PER_GROUP - 1) / MAX_THREADS_PER_GROUP;
|
|
if (nb_grp > MAX_TGROUPS - global.nbtgroups)
|
|
nb_grp = MAX_TGROUPS - global.nbtgroups;
|
|
thr_per_grp = (cpu_count + nb_grp - 1) / nb_grp;
|
|
if (thr_per_grp > MAX_THREADS_PER_GROUP)
|
|
thr_per_grp = MAX_THREADS_PER_GROUP;
|
|
|
|
while (nb_grp && cpu_count > 0) {
|
|
/* create at most thr_per_grp threads */
|
|
if (thr_per_grp > cpu_count)
|
|
thr_per_grp = cpu_count;
|
|
|
|
if (thr_per_grp + global.nbthread > MAX_THREADS)
|
|
thr_per_grp = MAX_THREADS - global.nbthread;
|
|
|
|
/* let's create the new thread group */
|
|
ha_tgroup_info[global.nbtgroups].base = global.nbthread;
|
|
ha_tgroup_info[global.nbtgroups].count = thr_per_grp;
|
|
|
|
/* assign to this group the required number of threads */
|
|
for (thr = 0; thr < thr_per_grp; thr++) {
|
|
ha_thread_info[thr + global.nbthread].tgid = global.nbtgroups + 1;
|
|
ha_thread_info[thr + global.nbthread].tg = &ha_tgroup_info[global.nbtgroups];
|
|
ha_thread_info[thr + global.nbthread].tg_ctx = &ha_tgroup_ctx[global.nbtgroups];
|
|
/* map these threads to all the CPUs */
|
|
ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &node_cpu_set);
|
|
}
|
|
|
|
cpu_count -= thr_per_grp;
|
|
global.nbthread += thr_per_grp;
|
|
global.nbtgroups++;
|
|
if (global.nbtgroups >= MAX_TGROUPS || global.nbthread >= MAX_THREADS)
|
|
break;
|
|
}
|
|
|
|
lcid = cid; // last cluster_id
|
|
}
|
|
|
|
if (global.nbthread)
|
|
ha_diag_warning("Created %d threads split into %d groups\n", global.nbthread, global.nbtgroups);
|
|
else
|
|
ha_diag_warning("Could not determine any CPU cluster\n");
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* the "performance" cpu-policy:
|
|
* - does nothing if nbthread or thread-groups are set
|
|
* - eliminates clusters whose total capacity is below half of others
|
|
* - tries to create one thread-group per cluster, with as many
|
|
* threads as CPUs in the cluster, and bind all the threads of
|
|
* this group to all the CPUs of the cluster.
|
|
*/
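/* Example (hypothetical clusters): with total capacities 4000 and 1500,
* 1500 < 4000/2 so the CPUs of the second cluster are flagged
* HA_CPU_F_IGNORED and only the faster cluster gets thread groups.
*/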
|
|
static int cpu_policy_performance(int policy, int tmin, int tmax, int gmin, int gmax, char **err)
|
|
{
|
|
int cpu, cluster;
|
|
int capa;
|
|
|
|
if (global.nbthread || global.nbtgroups)
|
|
return 0;
|
|
|
|
/* sort clusters by reverse capacity */
|
|
cpu_cluster_reorder_by_capa(ha_cpu_clusters, cpu_topo_maxcpus);
|
|
|
|
capa = 0;
|
|
for (cluster = 0; cluster < cpu_topo_maxcpus; cluster++) {
|
|
if (capa && ha_cpu_clusters[cluster].capa < capa / 2) {
|
|
/* This cluster is more than twice as slow as the
|
|
* previous one, we're not interested in using it.
|
|
*/
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].cl_gid == ha_cpu_clusters[cluster].idx)
|
|
ha_cpu_topo[cpu].st |= HA_CPU_F_IGNORED;
|
|
}
|
|
}
|
|
else
|
|
capa = ha_cpu_clusters[cluster].capa;
|
|
}
|
|
|
|
cpu_cluster_reorder_by_index(ha_cpu_clusters, cpu_topo_maxcpus);
|
|
|
|
/* and finish using the group-by-cluster strategy */
|
|
return cpu_policy_group_by_cluster(policy, tmin, tmax, gmin, gmax, err);
|
|
}
|
|
|
|
/* the "efficiency" cpu-policy:
|
|
* - does nothing if nbthread or thread-groups are set
|
|
* - eliminates clusters whose total capacity is above half of others
|
|
* - tries to create one thread-group per cluster, with as many
|
|
* threads as CPUs in the cluster, and bind all the threads of
|
|
* this group to all the CPUs of the cluster.
|
|
*/
|
|
static int cpu_policy_efficiency(int policy, int tmin, int tmax, int gmin, int gmax, char **err)
|
|
{
|
|
int cpu, cluster;
|
|
int capa;
|
|
|
|
if (global.nbthread || global.nbtgroups)
|
|
return 0;
|
|
|
|
/* sort clusters by reverse capacity */
|
|
cpu_cluster_reorder_by_capa(ha_cpu_clusters, cpu_topo_maxcpus);
|
|
|
|
capa = 0;
|
|
for (cluster = cpu_topo_maxcpus - 1; cluster >= 0; cluster--) {
|
|
if (capa && ha_cpu_clusters[cluster].capa > capa * 2) {
|
|
/* This cluster is more than twice as fast as the
|
|
* previous one, we're not interested in using it.
|
|
*/
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].cl_gid == ha_cpu_clusters[cluster].idx)
|
|
ha_cpu_topo[cpu].st |= HA_CPU_F_IGNORED;
|
|
}
|
|
}
|
|
else
|
|
capa = ha_cpu_clusters[cluster].capa;
|
|
}
|
|
|
|
cpu_cluster_reorder_by_index(ha_cpu_clusters, cpu_topo_maxcpus);
|
|
|
|
/* and finish using the group-by-cluster strategy */
|
|
return cpu_policy_group_by_cluster(policy, tmin, tmax, gmin, gmax, err);
|
|
}
|
|
|
|
|
|
/* the "resource" cpu-policy:
|
|
* - does nothing if nbthread or thread-groups are set
|
|
* - only keeps the smallest cluster.
|
|
*/
|
|
static int cpu_policy_resource(int policy, int tmin, int tmax, int gmin, int gmax, char **err)
|
|
{
|
|
int cpu, cluster;
|
|
int capa;
|
|
|
|
if (global.nbthread || global.nbtgroups)
|
|
return 0;
|
|
|
|
/* sort clusters by reverse capacity */
|
|
cpu_cluster_reorder_by_capa(ha_cpu_clusters, cpu_topo_maxcpus);
|
|
|
|
capa = 0;
|
|
for (cluster = cpu_topo_maxcpus - 1; cluster >= 0; cluster--) {
|
|
if (capa) {
|
|
/* we already have a cluster, let's disable this
|
|
* one.
|
|
*/
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
if (ha_cpu_topo[cpu].cl_gid == ha_cpu_clusters[cluster].idx)
|
|
ha_cpu_topo[cpu].st |= HA_CPU_F_IGNORED;
|
|
}
|
|
}
|
|
else
|
|
capa = ha_cpu_clusters[cluster].capa;
|
|
}
|
|
|
|
cpu_cluster_reorder_by_index(ha_cpu_clusters, cpu_topo_maxcpus);
|
|
|
|
/* and finish using the group-by-cluster strategy */
|
|
return cpu_policy_group_by_cluster(policy, tmin, tmax, gmin, gmax, err);
|
|
}
|
|
|
|
/* apply the chosen CPU policy if no cpu-map was forced. Returns < 0 on failure
|
|
* with a message in *err that must be freed by the caller if non-null.
|
|
*/
|
|
int cpu_apply_policy(int tmin, int tmax, int gmin, int gmax, char **err)
|
|
{
|
|
*err = NULL;
|
|
|
|
if (cpu_map_configured()) {
|
|
/* nothing to do */
|
|
return 0;
|
|
}
|
|
|
|
if (!ha_cpu_policy[cpu_policy].fct) {
|
|
/* nothing to do */
|
|
return 0;
|
|
}
|
|
|
|
if (ha_cpu_policy[cpu_policy].fct(cpu_policy, tmin, tmax, gmin, gmax, err) < 0)
|
|
return -1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* CPU topology detection below, OS-specific */
|
|
|
|
#if defined(__linux__)
|
|
|
|
/* detect the CPU topology based on info in /sys */
|
|
int cpu_detect_topology(void)
|
|
{
|
|
const char *parse_cpu_set_args[2];
|
|
struct ha_cpu_topo cpu_id = { }; /* all zeroes */
|
|
struct hap_cpuset node_cpu_set;
|
|
struct dirent *de;
|
|
int no_cache, no_topo, no_capa, no_clust, no_pkg;
|
|
int no_cppc, no_freq;
|
|
DIR *dir;
|
|
int cpu;
|
|
|
|
/* now let's only focus on bound CPUs to learn more about their
|
|
* topology, their siblings, their cache affinity etc. We can stop
|
|
* at lastcpu which matches the ID of the last known bound CPU
|
|
* when it's set. We'll pre-assign and auto-increment indexes for
|
|
* thread_set_id, cluster_id, l1/l2/l3 id, etc. We don't revisit entries
|
|
* already filled from the list provided by another CPU.
|
|
*/
|
|
|
|
if (!is_dir_present(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu"))
|
|
goto skip_cpu;
|
|
|
|
/* detect the presence of some kernel-specific fields */
|
|
no_cache = no_topo = no_capa = no_clust = no_pkg = no_freq = no_cppc = -1;
|
|
for (cpu = 0; cpu <= cpu_topo_lastcpu; cpu++) {
|
|
struct hap_cpuset siblings_list = { };
|
|
struct hap_cpuset cpus_list;
|
|
int next_level = 1; // assume L1 if unknown
|
|
int idx, level;
|
|
int cpu2;
|
|
|
|
if (ha_cpu_topo[cpu].st & HA_CPU_F_OFFLINE)
|
|
continue;
|
|
|
|
if (!is_dir_present(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d", cpu))
|
|
continue;
|
|
|
|
/* First, let's check the cache hierarchy. On systems exposing
|
|
* it, index0 generally is the L1D cache, index1 the L1I, index2
|
|
* the L2 and index3 the L3. But sometimes L1I/D are reversed,
|
|
* and some CPUs also have L0 or L4. Maybe some heterogeneous
|
|
* SoCs even have inconsistent levels between clusters... Thus
|
|
* we'll scan all entries that we can find for each CPU and
|
|
* assign levels based on what is reported. The types generally
|
|
* are "Data", "Instruction", "Unified". We just ignore inst if
|
|
* found.
|
|
*/
|
|
if (no_cache < 0)
|
|
no_cache = !is_dir_present(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/cache", cpu);
|
|
|
|
if (no_cache)
|
|
goto skip_cache;
|
|
|
|
for (idx = 0; idx < 10; idx++) {
|
|
if (!is_dir_present(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/cache/index%d", cpu, idx))
|
|
break;
|
|
|
|
if (read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH
|
|
"/cpu/cpu%d/cache/index%d/type", cpu, idx) >= 0 &&
|
|
strcmp(trash.area, "Instruction") == 0)
|
|
continue;
|
|
|
|
level = next_level;
|
|
if (read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH
|
|
"/cpu/cpu%d/cache/index%d/level", cpu, idx) >= 0) {
|
|
level = atoi(trash.area);
|
|
next_level = level + 1;
|
|
}
|
|
|
|
if (level < 0 || level > 4)
|
|
continue; // level out of bounds
|
|
|
|
if (ha_cpu_topo[cpu].ca_id[level] >= 0)
|
|
continue; // already filled
|
|
|
|
if (read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH
|
|
"/cpu/cpu%d/cache/index%d/shared_cpu_list", cpu, idx) >= 0) {
|
|
parse_cpu_set_args[0] = trash.area;
|
|
parse_cpu_set_args[1] = "\0";
|
|
if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0) {
|
|
for (cpu2 = 0; cpu2 <= cpu_topo_lastcpu; cpu2++) {
|
|
if (ha_cpuset_isset(&cpus_list, cpu2))
|
|
ha_cpu_topo[cpu2].ca_id[level] = cpu_id.ca_id[level];
|
|
}
|
|
cpu_id.ca_id[level]++;
|
|
}
|
|
}
|
|
}
|
|
|
|
skip_cache:
|
|
if (no_topo < 0)
|
|
no_topo = !is_dir_present(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology", cpu);
|
|
|
|
if (no_topo)
|
|
goto skip_topo;
|
|
|
|
/* Now let's try to get more info about how the cores are
|
|
* arranged in packages, clusters, cores, threads etc. It
|
|
* overlaps a bit with the cache above, but as not all systems
|
|
* provide all of these, they're quite complementary in fact.
|
|
*/
|
|
|
|
/* thread siblings list will allow to figure which CPU threads
|
|
* share the same cores, and also to tell apart cores that
|
|
* support SMT from those which do not. When mixed, generally
|
|
* the ones with SMT are big cores and the ones without are the
|
|
* small ones. We also read the entry if the cluster_id is not
|
|
* known because we'll have to compare both values.
|
|
*/
|
|
if ((ha_cpu_topo[cpu].ts_id < 0 || ha_cpu_topo[cpu].cl_gid < 0) &&
|
|
read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/thread_siblings_list", cpu) >= 0) {
|
|
parse_cpu_set_args[0] = trash.area;
|
|
parse_cpu_set_args[1] = "\0";
|
|
if (parse_cpu_set(parse_cpu_set_args, &siblings_list, NULL) == 0) {
|
|
int sib_id = 0;
|
|
|
|
cpu_id.th_cnt = ha_cpuset_count(&siblings_list);
|
|
for (cpu2 = 0; cpu2 <= cpu_topo_lastcpu; cpu2++) {
|
|
if (ha_cpuset_isset(&siblings_list, cpu2)) {
|
|
ha_cpu_topo[cpu2].ts_id = cpu_id.ts_id;
|
|
ha_cpu_topo[cpu2].th_cnt = cpu_id.th_cnt;
|
|
ha_cpu_topo[cpu2].th_id = sib_id++;
|
|
}
|
|
}
|
|
cpu_id.ts_id++;
|
|
}
|
|
}
|
|
|
|
/* clusters of cores when they exist, can be smaller and more
|
|
* precise than core lists (e.g. big.little), otherwise use
|
|
* core lists as a fall back, which may also have been used
|
|
* below as a fallback for package but we don't care here. We
|
|
* only consider these values if there's more than one CPU per
|
|
* cluster (some kernels such as 6.1 report one cluster per CPU).
|
|
* Note that we purposely ignore clusters that are reportedly
|
|
* equal to the siblings list, because some machines report one
|
|
* distinct cluster per *core* (e.g. some armv7 and intel 14900).
|
|
*/
|
|
if (no_clust < 0) {
|
|
no_clust = !is_file_present(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/cluster_cpus_list", cpu) &&
|
|
!is_file_present(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/core_siblings_list", cpu);
|
|
}
|
|
|
|
if (!no_clust && ha_cpu_topo[cpu].cl_gid < 0 &&
|
|
(read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/cluster_cpus_list", cpu) >= 0 ||
|
|
read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/core_siblings_list", cpu) >= 0)) {
|
|
parse_cpu_set_args[0] = trash.area;
|
|
parse_cpu_set_args[1] = "\0";
|
|
if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0 && ha_cpuset_count(&cpus_list) > 1 &&
|
|
(memcmp(&cpus_list, &siblings_list, sizeof(cpus_list)) != 0)) {
|
|
for (cpu2 = 0; cpu2 <= cpu_topo_lastcpu; cpu2++) {
|
|
if (ha_cpuset_isset(&cpus_list, cpu2)) {
|
|
ha_cpu_topo[cpu2].cl_lid = cpu_id.cl_lid;
|
|
ha_cpu_topo[cpu2].cl_gid = cpu_id.cl_gid;
|
|
}
|
|
}
|
|
cpu_id.cl_lid++;
|
|
cpu_id.cl_gid++;
|
|
}
|
|
}
|
|
|
|
/* package CPUs list, like nodes, are generally a hard limit
|
|
* for groups, which must not span over multiple of them. On
|
|
* some systems, the package_cpus_list is not always provided,
|
|
* so we may first fall back to core_siblings_list which also
|
|
* exists, then to the physical package id from each CPU, whose
|
|
* number starts at 0. The first one is preferred because it
|
|
* provides a list in a single read().
|
|
*/
|
|
if (no_pkg < 0) {
|
|
no_pkg = !is_file_present(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/package_cpus_list", cpu) &&
|
|
!is_file_present(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/core_siblings_list", cpu);
|
|
}
|
|
|
|
if (!no_pkg && ha_cpu_topo[cpu].pk_id < 0 &&
|
|
(read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/package_cpus_list", cpu) >= 0 ||
|
|
read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/core_siblings_list", cpu) >= 0)) {
|
|
parse_cpu_set_args[0] = trash.area;
|
|
parse_cpu_set_args[1] = "\0";
|
|
if (parse_cpu_set(parse_cpu_set_args, &cpus_list, NULL) == 0) {
|
|
for (cpu2 = 0; cpu2 <= cpu_topo_lastcpu; cpu2++) {
|
|
if (ha_cpuset_isset(&cpus_list, cpu2))
|
|
ha_cpu_topo[cpu2].pk_id = cpu_id.pk_id;
|
|
}
|
|
cpu_id.pk_id++;
|
|
}
|
|
}
|
|
|
|
if (ha_cpu_topo[cpu].pk_id < 0 &&
|
|
read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/topology/physical_package_id", cpu) >= 0) {
|
|
if (trash.data)
|
|
ha_cpu_topo[cpu].pk_id = str2uic(trash.area);
|
|
}
|
|
|
|
skip_topo:
|
|
if (no_capa < 0)
|
|
no_capa = !is_file_present(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/cpu_capacity", cpu);
|
|
|
|
/* CPU capacity is a relative notion to compare little and big
|
|
* cores. Usually the values encountered in the field set the big
|
|
* CPU's nominal capacity to 1024 and the other ones below.
|
|
*/
|
|
if (!no_capa && ha_cpu_topo[cpu].capa < 0 &&
|
|
read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/cpu_capacity", cpu) >= 0) {
|
|
if (trash.data)
|
|
ha_cpu_topo[cpu].capa = str2uic(trash.area);
|
|
}
|
|
|
|
/* When cpu_capacity is not available, sometimes acpi_cppc is
|
|
* available on servers to provide an equivalent metric allowing
|
|
* to distinguish big from small cores. Values as low as 15 and
|
|
* as high as 260 were seen there. Note that only nominal_perf
|
|
* is trustable, as nominal_freq may return zero. It's also
|
|
* more reliable than the max cpufreq values because it doesn't
|
|
* seem to take into account the die quality. However, acpi_cppc
|
|
* can be super slow on some systems (5ms per access noticed on
|
|
* a 64-core EPYC), making haproxy literally take seconds to
|
|
* start just due to this. Thus we start with cpufreq and fall
|
|
* back to acpi_cppc. If it becomes an issue, we could imagine
|
|
* forcing the value to all members of the same core and even
|
|
* cluster. Since the frequency alone is not a good criterion
|
|
* to qualify the CPU quality (perf vs efficiency core), instead
|
|
* we rely on the thread count to gauge if it's a performant or
|
|
* an efficient core, and we increase performant cores' capacity
|
|
* by 50% (shown to be roughly correct on modern CPUs).
|
|
*/
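/* e.g. (hypothetical values): scaling_max_freq=3500000 (kHz) becomes
* (3500000+999)/1000 = 3500 (MHz); on an SMT core this is then raised
* by 50% to 5250, while a non-SMT core at the same frequency keeps 3500.
*/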
|
|
if (no_freq < 0)
|
|
no_freq = !is_dir_present(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/cpufreq", cpu);
|
|
|
|
if (!no_freq && ha_cpu_topo[cpu].capa < 0 &&
|
|
read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/cpufreq/scaling_max_freq", cpu) >= 0) {
|
|
/* This is in kHz, turn it to MHz to stay below 32k */
|
|
if (trash.data) {
|
|
ha_cpu_topo[cpu].capa = (str2uic(trash.area) + 999U) / 1000U;
|
|
if (ha_cpu_topo[cpu].th_cnt > 1)
|
|
ha_cpu_topo[cpu].capa = ha_cpu_topo[cpu].capa * 3 / 2;
|
|
}
|
|
}
|
|
|
|
if (no_cppc < 0)
|
|
no_cppc = !is_dir_present(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/acpi_cppc", cpu);
|
|
|
|
if (!no_cppc && ha_cpu_topo[cpu].capa < 0 &&
|
|
read_line_to_trash(NUMA_DETECT_SYSTEM_SYSFS_PATH "/cpu/cpu%d/acpi_cppc/nominal_perf", cpu) >= 0) {
|
|
if (trash.data)
|
|
ha_cpu_topo[cpu].capa = str2uic(trash.area);
|
|
}
|
|
}
|
|
|
|
skip_cpu:
|
|
/* Now locate NUMA node IDs if any */
|
|
|
|
dir = opendir(NUMA_DETECT_SYSTEM_SYSFS_PATH "/node");
|
|
if (dir) {
|
|
while ((de = readdir(dir))) {
|
|
long node_id;
|
|
char *endptr;
|
|
|
|
/* dir name must start with "node" prefix */
|
|
if (strncmp(de->d_name, "node", 4) != 0)
|
|
continue;
|
|
|
|
/* dir name must be at least 5 characters long */
|
|
if (!de->d_name[4])
|
|
continue;
|
|
|
|
/* dir name must end with a non-negative numeric id */
|
|
node_id = strtol(&de->d_name[4], &endptr, 10);
|
|
if (*endptr || node_id < 0)
|
|
continue;
|
|
|
|
/* all tests succeeded, it's in the form "node%d" */
|
|
if (read_line_to_trash("%s/node/%s/cpulist", NUMA_DETECT_SYSTEM_SYSFS_PATH, de->d_name) >= 0) {
|
|
parse_cpu_set_args[0] = trash.area;
|
|
parse_cpu_set_args[1] = "\0";
|
|
if (parse_cpu_set(parse_cpu_set_args, &node_cpu_set, NULL) == 0) {
|
|
for (cpu = 0; cpu < cpu_topo_maxcpus; cpu++)
|
|
if (ha_cpuset_isset(&node_cpu_set, cpu))
|
|
ha_cpu_topo[cpu].no_id = node_id;
|
|
}
|
|
}
|
|
}
|
|
/* done */
|
|
closedir(dir);
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
#elif defined(__FreeBSD__)
|
|
|
|
int cpu_detect_topology(void)
|
|
{
|
|
struct hap_cpuset node_cpu_set;
|
|
int ndomains, domain, cpu;
|
|
size_t len = sizeof(ndomains);
|
|
|
|
/* Try to detect NUMA nodes */
|
|
if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == 0) {
|
|
BUG_ON(ndomains > MAXMEMDOM);
|
|
|
|
/* For each domain we'll reference the domain ID in the belonging
|
|
* CPUs.
|
|
*/
|
|
for (domain = 0; domain < ndomains; domain++) {
|
|
ha_cpuset_zero(&node_cpu_set);
|
|
|
|
if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_DOMAIN, domain,
|
|
sizeof(node_cpu_set.cpuset), &node_cpu_set.cpuset) == -1)
|
|
continue;
|
|
|
|
for (cpu = 0; cpu < cpu_topo_maxcpus; cpu++)
|
|
if (ha_cpuset_isset(&node_cpu_set, cpu))
|
|
ha_cpu_topo[cpu].no_id = domain;
|
|
}
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
#else // !__linux__, !__FreeBSD__
|
|
|
|
int cpu_detect_topology(void)
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
#endif // OS-specific cpu_detect_topology()
|
|
|
|
/* Parse the "cpu-set" global directive, which takes action names and
|
|
* optional values, and fills the cpu_set structure above.
|
|
*/
|
|
static int cfg_parse_cpu_set(char **args, int section_type, struct proxy *curpx,
|
|
const struct proxy *defpx, const char *file, int line,
|
|
char **err)
|
|
{
|
|
const char *cpu_set_str[2] = { "", "" };
|
|
struct hap_cpuset tmp_cpuset = { };
|
|
int arg;
|
|
|
|
for (arg = 1; *args[arg]; arg++) {
|
|
if (strcmp(args[arg], "reset") == 0) {
|
|
/* reset the excluded CPUs first (undo "taskset") */
|
|
cpu_set_cfg.flags |= CPU_SET_FL_DO_RESET;
|
|
cpu_mask_forced = 0;
|
|
}
|
|
else if (strcmp(args[arg], "drop-cpu") == 0 || strcmp(args[arg], "only-cpu") == 0) {
|
|
if (!*args[arg + 1]) {
|
|
memprintf(err, "missing CPU set");
|
|
goto parse_err;
|
|
}
|
|
|
|
cpu_set_str[0] = args[arg + 1];
|
|
if (parse_cpu_set(cpu_set_str, &tmp_cpuset, err) != 0)
|
|
goto parse_err;
|
|
|
|
if (*args[arg] == 'd') // cpus to drop
|
|
ha_cpuset_or(&cpu_set_cfg.drop_cpus, &tmp_cpuset);
|
|
else // cpus to keep
|
|
ha_cpuset_and(&cpu_set_cfg.only_cpus, &tmp_cpuset);
|
|
arg++;
|
|
}
|
|
else if (strcmp(args[arg], "drop-node") == 0 || strcmp(args[arg], "only-node") == 0) {
|
|
if (!*args[arg + 1]) {
|
|
memprintf(err, "missing node set");
|
|
goto parse_err;
|
|
}
|
|
|
|
cpu_set_str[0] = args[arg + 1];
|
|
if (parse_cpu_set(cpu_set_str, &tmp_cpuset, err) != 0)
|
|
goto parse_err;
|
|
|
|
if (*args[arg] == 'd') // nodes to drop
|
|
ha_cpuset_or(&cpu_set_cfg.drop_nodes, &tmp_cpuset);
|
|
else // nodes to keep
|
|
ha_cpuset_and(&cpu_set_cfg.only_nodes, &tmp_cpuset);
|
|
arg++;
|
|
}
|
|
else if (strcmp(args[arg], "drop-cluster") == 0 || strcmp(args[arg], "only-cluster") == 0) {
|
|
if (!*args[arg + 1]) {
|
|
memprintf(err, "missing cluster set");
|
|
goto parse_err;
|
|
}
|
|
|
|
cpu_set_str[0] = args[arg + 1];
|
|
if (parse_cpu_set(cpu_set_str, &tmp_cpuset, err) != 0)
|
|
goto parse_err;
|
|
|
|
if (*args[arg] == 'd') // clusters to drop
|
|
ha_cpuset_or(&cpu_set_cfg.drop_clusters, &tmp_cpuset);
|
|
else // clusters to keep
|
|
ha_cpuset_and(&cpu_set_cfg.only_clusters, &tmp_cpuset);
|
|
arg++;
|
|
}
|
|
else if (strcmp(args[arg], "drop-core") == 0 || strcmp(args[arg], "only-core") == 0) {
|
|
if (!*args[arg + 1]) {
|
|
memprintf(err, "missing core set");
|
|
goto parse_err;
|
|
}
|
|
|
|
cpu_set_str[0] = args[arg + 1];
|
|
if (parse_cpu_set(cpu_set_str, &tmp_cpuset, err) != 0)
|
|
goto parse_err;
|
|
|
|
if (*args[arg] == 'd') // cores to drop
|
|
ha_cpuset_or(&cpu_set_cfg.drop_cores, &tmp_cpuset);
|
|
else // cores to keep
|
|
ha_cpuset_and(&cpu_set_cfg.only_cores, &tmp_cpuset);
|
|
arg++;
|
|
}
|
|
else if (strcmp(args[arg], "drop-thread") == 0 || strcmp(args[arg], "only-thread") == 0) {
|
|
if (!*args[arg + 1]) {
|
|
memprintf(err, "missing thread set");
|
|
goto parse_err;
|
|
}
|
|
|
|
cpu_set_str[0] = args[arg + 1];
|
|
if (parse_cpu_set(cpu_set_str, &tmp_cpuset, err) != 0)
|
|
goto parse_err;
|
|
|
|
if (*args[arg] == 'd') // threads to drop
|
|
ha_cpuset_or(&cpu_set_cfg.drop_threads, &tmp_cpuset);
|
|
else // threads to keep
|
|
ha_cpuset_and(&cpu_set_cfg.only_threads, &tmp_cpuset);
|
|
arg++;
|
|
}
|
|
else {
|
|
/* fall back with default error message */
|
|
memprintf(err, "'%s' passed an unknown directive '%s'", args[0], args[arg]);
|
|
goto leave_with_err;
|
|
}
|
|
}
|
|
|
|
if (arg == 1) {
|
|
memprintf(err, "'%s' requires a directive and an optional value", args[0]);
|
|
goto leave_with_err;
|
|
}
|
|
|
|
/* all done */
|
|
return 0;
|
|
|
|
parse_err:
|
|
/* displays args[0] and args[arg] followed by *err so as to remind the
|
|
* option name, the sub-directive and the reported error.
|
|
*/
|
|
memprintf(err, "'%s %s': %s\n.", args[0], args[arg], *err);
|
|
goto leave;
|
|
|
|
leave_with_err:
|
|
/* complete with supported directives */
|
|
memprintf(err, "%s (only 'reset', 'only-cpu', 'drop-cpu', 'only-node', 'drop-node', 'only-cluster', 'drop-cluster', 'only-core', 'drop-core', 'only-thread', 'drop-thread' supported).", *err);
|
|
leave:
|
|
return -1;
|
|
}
|
|
|
|
/* Parse the "cpu-policy" global directive, which takes the name of one of the
|
|
* ha_cpu_policy[] names, and sets the associated index in cpu_policy.
|
|
*/
|
|
static int cfg_parse_cpu_policy(char **args, int section_type, struct proxy *curpx,
|
|
const struct proxy *defpx, const char *file, int line,
|
|
char **err)
|
|
{
|
|
int i;
|
|
|
|
if (too_many_args(1, args, err, NULL))
|
|
return -1;
|
|
|
|
for (i = 0; ha_cpu_policy[i].name; i++) {
|
|
if (strcmp(args[1], ha_cpu_policy[i].name) == 0) {
|
|
cpu_policy = i;
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
memprintf(err, "'%s' passed an unknown CPU policy '%s'. Supported values are:", args[0], args[1]);
|
|
for (i = 0; ha_cpu_policy[i].name; i++) {
|
|
memprintf(err, "%s%s '%s' (%s)%s", *err,
|
|
(i > 0 && ha_cpu_policy[i+1].name) ? "" : " and",
|
|
ha_cpu_policy[i].name,
|
|
ha_cpu_policy[i].desc,
|
|
(ha_cpu_policy[i+1].name) ? "," : ".\n");
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
/* Allocates everything needed to store CPU topology at boot.
|
|
* Returns non-zero on success, zero on failure.
|
|
*/
|
|
static int cpu_topo_alloc(void)
|
|
{
|
|
int cpu;
|
|
|
|
cpu_topo_maxcpus = cpu_topo_get_maxcpus();
|
|
cpu_topo_lastcpu = cpu_topo_maxcpus - 1;
|
|
|
|
cpu_map = calloc(MAX_TGROUPS, sizeof(*cpu_map));
|
|
if (!cpu_map)
|
|
return 0;
|
|
|
|
/* allocate the structures used to store CPU topology info */
|
|
ha_cpu_topo = (struct ha_cpu_topo*)malloc(cpu_topo_maxcpus * sizeof(*ha_cpu_topo));
|
|
if (!ha_cpu_topo)
|
|
return 0;
|
|
|
|
/* allocate the structures used to store CPU cluster info */
ha_cpu_clusters = (struct ha_cpu_cluster*)malloc(cpu_topo_maxcpus * sizeof(*ha_cpu_clusters));
if (!ha_cpu_clusters)
return 0;
|
|
|
|
/* preset all fields to -1 except the index and the state flags which
|
|
* are assumed to all be bound and online unless detected otherwise.
|
|
* Also set all cluster idx to their respective index.
|
|
*/
|
|
for (cpu = 0; cpu < cpu_topo_maxcpus; cpu++) {
|
|
memset(&ha_cpu_topo[cpu], 0xff, sizeof(*ha_cpu_topo));
|
|
ha_cpu_topo[cpu].st = 0;
|
|
ha_cpu_topo[cpu].idx = cpu;
|
|
|
|
memset(&ha_cpu_clusters[cpu], 0x0, sizeof(*ha_cpu_clusters));
|
|
ha_cpu_clusters[cpu].idx = cpu;
|
|
}
|
|
|
|
/* pre-initialize the configured CPU sets */
|
|
ha_cpuset_zero(&cpu_set_cfg.drop_cpus);
|
|
ha_cpuset_zero(&cpu_set_cfg.only_cpus);
|
|
ha_cpuset_zero(&cpu_set_cfg.drop_nodes);
|
|
ha_cpuset_zero(&cpu_set_cfg.only_nodes);
|
|
ha_cpuset_zero(&cpu_set_cfg.drop_clusters);
|
|
ha_cpuset_zero(&cpu_set_cfg.only_clusters);
|
|
ha_cpuset_zero(&cpu_set_cfg.drop_cores);
|
|
ha_cpuset_zero(&cpu_set_cfg.only_cores);
|
|
ha_cpuset_zero(&cpu_set_cfg.drop_threads);
|
|
ha_cpuset_zero(&cpu_set_cfg.only_threads);
|
|
|
|
/* preset all CPUs in the "only-XXX" sets */
|
|
for (cpu = 0; cpu < cpu_topo_maxcpus; cpu++) {
|
|
ha_cpuset_set(&cpu_set_cfg.only_cpus, cpu);
|
|
ha_cpuset_set(&cpu_set_cfg.only_nodes, cpu);
|
|
ha_cpuset_set(&cpu_set_cfg.only_clusters, cpu);
|
|
ha_cpuset_set(&cpu_set_cfg.only_cores, cpu);
|
|
ha_cpuset_set(&cpu_set_cfg.only_threads, cpu);
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
static void cpu_topo_deinit(void)
|
|
{
|
|
ha_free(&ha_cpu_clusters);
|
|
ha_free(&ha_cpu_topo);
|
|
ha_free(&cpu_map);
|
|
}
|
|
|
|
INITCALL0(STG_ALLOC, cpu_topo_alloc);
|
|
REGISTER_POST_DEINIT(cpu_topo_deinit);
|
|
|
|
/* config keyword parsers */
|
|
static struct cfg_kw_list cfg_kws = {ILH, {
|
|
{ CFG_GLOBAL, "cpu-policy", cfg_parse_cpu_policy, 0 },
|
|
{ CFG_GLOBAL, "cpu-set", cfg_parse_cpu_set, 0 },
|
|
{ 0, NULL, NULL }
|
|
}};
|
|
|
|
INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
|