2023-07-04 - automatic grouping for NUMA


Xeon: (W2145)

willy@debian:~$ grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0,8
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0,8
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0,8
/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list:0-15
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified
/sys/devices/system/cpu/cpu0/cache/index3/type:Unified

Wtap: i7-8650U

willy@wtap:~ grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0,4
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0,4
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0,4
/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list:0-7
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified
/sys/devices/system/cpu/cpu0/cache/index3/type:Unified

pcw: i7-6700k

willy@pcw:~$ grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0,4
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0,4
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0,4
/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list:0-7
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified
/sys/devices/system/cpu/cpu0/cache/index3/type:Unified

nfs: N5105, v5.15

willy@nfs:~$ grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0-3
/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list:0-3
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified
/sys/devices/system/cpu/cpu0/cache/index3/type:Unified

eeepc: Atom N2800, 5.4 : no L3, L2 not shared.

willy@eeepc:~$ grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0-1
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0-1
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0-1
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified

willy@eeepc:~$ grep '' /sys/devices/system/cpu/cpu2/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu2/cache/index0/shared_cpu_list:2-3
/sys/devices/system/cpu/cpu2/cache/index1/shared_cpu_list:2-3
/sys/devices/system/cpu/cpu2/cache/index2/shared_cpu_list:2-3
/sys/devices/system/cpu/cpu2/cache/index0/type:Data
/sys/devices/system/cpu/cpu2/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu2/cache/index2/type:Unified

dev13: Ryzen 2700X

haproxy@dev13:~$ grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0-1
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0-1
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0-1
/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list:0-7
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified
/sys/devices/system/cpu/cpu0/cache/index3/type:Unified

haproxy@dev13:~$ grep '' /sys/devices/system/cpu/cpu8/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8-9
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8-9
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8-9
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-15
/sys/devices/system/cpu/cpu8/cache/index0/type:Data
/sys/devices/system/cpu/cpu8/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu8/cache/index2/type:Unified
/sys/devices/system/cpu/cpu8/cache/index3/type:Unified

dev12: Ryzen 5800X

haproxy@dev12:~$ grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0,8
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0,8
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0,8
/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list:0-15
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified
/sys/devices/system/cpu/cpu0/cache/index3/type:Unified

amd24: EPYC 74F3

willy@mt:~$ grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0,24
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0,24
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0,24
/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list:0-2,24-26
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified
/sys/devices/system/cpu/cpu0/cache/index3/type:Unified

willy@mt:~$ grep '' /sys/devices/system/cpu/cpu8/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,32
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,32
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,32
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:6-8,30-32
/sys/devices/system/cpu/cpu8/cache/index0/type:Data
/sys/devices/system/cpu/cpu8/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu8/cache/index2/type:Unified
/sys/devices/system/cpu/cpu8/cache/index3/type:Unified

willy@mt:~$ grep '' /sys/devices/system/cpu/cpu0/topology/*list
/sys/devices/system/cpu/cpu0/topology/core_cpus_list:0,24
/sys/devices/system/cpu/cpu0/topology/core_siblings_list:0-47
/sys/devices/system/cpu/cpu0/topology/die_cpus_list:0-47
/sys/devices/system/cpu/cpu0/topology/package_cpus_list:0-47
/sys/devices/system/cpu/cpu0/topology/thread_siblings_list:0,24

xeon24: Gold 6212U

willy@mt01:~$ grep '' /sys/devices/system/cpu/cpu8/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,32
/sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,32
/sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,32
/sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:0-47
/sys/devices/system/cpu/cpu8/cache/index0/type:Data
/sys/devices/system/cpu/cpu8/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu8/cache/index2/type:Unified
/sys/devices/system/cpu/cpu8/cache/index3/type:Unified

SPR 8480+

$ grep -a '' /sys/devices/system/node/node*/cpulist
/sys/devices/system/node/node0/cpulist:0-55,112-167
/sys/devices/system/node/node1/cpulist:56-111,168-223

$ grep -a '' /sys/devices/system/cpu/cpu0/topology/*list
/sys/devices/system/cpu/cpu0/topology/core_cpus_list:0,112
/sys/devices/system/cpu/cpu0/topology/core_siblings_list:0-55,112-167
/sys/devices/system/cpu/cpu0/topology/die_cpus_list:0-55,112-167
/sys/devices/system/cpu/cpu0/topology/package_cpus_list:0-55,112-167
/sys/devices/system/cpu/cpu0/topology/thread_siblings_list:0,112

$ grep -a '' /sys/devices/system/cpu/cpu0/cache/*/shared_cpu_list
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0,112
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0,112
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0,112
/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list:0-55,112-167

UP Board - Atom X5-8350 : no L3, exactly like Armada8040

willy@up1:~$ grep '' /sys/devices/system/cpu/cpu{0,1,2,3}/cache/index2/*list
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0-1
/sys/devices/system/cpu/cpu1/cache/index2/shared_cpu_list:0-1
/sys/devices/system/cpu/cpu2/cache/index2/shared_cpu_list:2-3
/sys/devices/system/cpu/cpu3/cache/index2/shared_cpu_list:2-3

willy@up1:~$ grep '' /sys/devices/system/cpu/cpu0/topology/*list
/sys/devices/system/cpu/cpu0/topology/core_siblings_list:0-3
/sys/devices/system/cpu/cpu0/topology/thread_siblings_list:0

Atom D510 - kernel 2.6.33

$ strings -fn1 sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list: 0,2
sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list: 0,2
sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list: 0,2
sys/devices/system/cpu/cpu0/cache/index0/type: Data
sys/devices/system/cpu/cpu0/cache/index1/type: Instruction
sys/devices/system/cpu/cpu0/cache/index2/type: Unified

$ strings -fn1 sys/devices/system/cpu/cpu?/topology/*list
sys/devices/system/cpu/cpu0/topology/core_siblings_list: 0-3
sys/devices/system/cpu/cpu0/topology/thread_siblings_list: 0,2
sys/devices/system/cpu/cpu1/topology/core_siblings_list: 0-3
sys/devices/system/cpu/cpu1/topology/thread_siblings_list: 1,3
sys/devices/system/cpu/cpu2/topology/core_siblings_list: 0-3
sys/devices/system/cpu/cpu2/topology/thread_siblings_list: 0,2
sys/devices/system/cpu/cpu3/topology/core_siblings_list: 0-3
sys/devices/system/cpu/cpu3/topology/thread_siblings_list: 1,3

mcbin: Armada 8040 : no L3, no difference with L3 not reported

root@lg7:~# grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0-1
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified

root@lg7:~# grep '' /sys/devices/system/cpu/cpu0/topology/*list
/sys/devices/system/cpu/cpu0/topology/core_cpus_list:0
/sys/devices/system/cpu/cpu0/topology/core_siblings_list:0-3
/sys/devices/system/cpu/cpu0/topology/die_cpus_list:0
/sys/devices/system/cpu/cpu0/topology/package_cpus_list:0-3
/sys/devices/system/cpu/cpu0/topology/thread_siblings_list:0

Ampere/monolithic: Ampere Altra 80-26 : L3 not reported

willy@ampere:~$ grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified

willy@ampere:~$ grep '' /sys/devices/system/cpu/cpu0/topology/*list
/sys/devices/system/cpu/cpu0/topology/core_cpus_list:0
/sys/devices/system/cpu/cpu0/topology/core_siblings_list:0-79
/sys/devices/system/cpu/cpu0/topology/die_cpus_list:0
/sys/devices/system/cpu/cpu0/topology/package_cpus_list:0-79
/sys/devices/system/cpu/cpu0/topology/thread_siblings_list:0

Ampere/Hemisphere: Ampere Altra 80-26 : L3 not reported

willy@ampere:~$ grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified

willy@ampere:~$ grep '' /sys/devices/system/cpu/cpu0/topology/*list
/sys/devices/system/cpu/cpu0/topology/core_cpus_list:0
/sys/devices/system/cpu/cpu0/topology/core_siblings_list:0-79
/sys/devices/system/cpu/cpu0/topology/die_cpus_list:0
/sys/devices/system/cpu/cpu0/topology/package_cpus_list:0-79
/sys/devices/system/cpu/cpu0/topology/thread_siblings_list:0

willy@ampere:~$ grep '' /sys/devices/system/node/node*/cpulist
/sys/devices/system/node/node0/cpulist:0-39
/sys/devices/system/node/node1/cpulist:40-79

LX2A: LX2160A => L3 not reported

willy@lx2a:~$ grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0-1
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified

willy@lx2a:~$ grep '' /sys/devices/system/cpu/cpu2/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu2/cache/index0/shared_cpu_list:2
/sys/devices/system/cpu/cpu2/cache/index1/shared_cpu_list:2
/sys/devices/system/cpu/cpu2/cache/index2/shared_cpu_list:2-3
/sys/devices/system/cpu/cpu2/cache/index0/type:Data
/sys/devices/system/cpu/cpu2/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu2/cache/index2/type:Unified

willy@lx2a:~$ grep '' /sys/devices/system/cpu/cpu0/topology/*list
/sys/devices/system/cpu/cpu0/topology/core_cpus_list:0
/sys/devices/system/cpu/cpu0/topology/core_siblings_list:0-15
/sys/devices/system/cpu/cpu0/topology/die_cpus_list:0
/sys/devices/system/cpu/cpu0/topology/package_cpus_list:0-15
/sys/devices/system/cpu/cpu0/topology/thread_siblings_list:0

Rock5B: RK3588 (big-little A76+A55)

rock@rock-5b:~$ grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index3/shared_cpu_list:0-7
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified
/sys/devices/system/cpu/cpu0/cache/index3/type:Unified

rock@rock-5b:~$ grep '' /sys/devices/system/cpu/cpu{0,4,6}/topology/*list
/sys/devices/system/cpu/cpu0/topology/core_cpus_list:0
/sys/devices/system/cpu/cpu0/topology/core_siblings_list:0-3
/sys/devices/system/cpu/cpu0/topology/die_cpus_list:0
/sys/devices/system/cpu/cpu0/topology/package_cpus_list:0-3
/sys/devices/system/cpu/cpu0/topology/thread_siblings_list:0
/sys/devices/system/cpu/cpu4/topology/core_cpus_list:4
/sys/devices/system/cpu/cpu4/topology/core_siblings_list:4-5
/sys/devices/system/cpu/cpu4/topology/die_cpus_list:4
/sys/devices/system/cpu/cpu4/topology/package_cpus_list:4-5
/sys/devices/system/cpu/cpu4/topology/thread_siblings_list:4
/sys/devices/system/cpu/cpu6/topology/core_cpus_list:6
/sys/devices/system/cpu/cpu6/topology/core_siblings_list:6-7
/sys/devices/system/cpu/cpu6/topology/die_cpus_list:6
/sys/devices/system/cpu/cpu6/topology/package_cpus_list:6-7
/sys/devices/system/cpu/cpu6/topology/thread_siblings_list:6

$ grep '' /sys/devices/system/cpu/cpu*/cpu_capacity
/sys/devices/system/cpu/cpu0/cpu_capacity:414
/sys/devices/system/cpu/cpu1/cpu_capacity:414
/sys/devices/system/cpu/cpu2/cpu_capacity:414
/sys/devices/system/cpu/cpu3/cpu_capacity:414
/sys/devices/system/cpu/cpu4/cpu_capacity:1024
/sys/devices/system/cpu/cpu5/cpu_capacity:1024
/sys/devices/system/cpu/cpu6/cpu_capacity:1024
/sys/devices/system/cpu/cpu7/cpu_capacity:1024

Firefly: RK3399 (2xA72 + 4xA53) kernel 6.1.28

root@firefly:~# grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
grep: /sys/devices/system/cpu/cpu0/cache/index?/shared_cpu_list: No such file or directory
grep: /sys/devices/system/cpu/cpu0/cache/index?/type: No such file or directory

root@firefly:~# grep '' /sys/devices/system/cpu/cpu*/cache/index?/{shared_cpu_list,type}
grep: /sys/devices/system/cpu/cpu*/cache/index?/shared_cpu_list: No such file or directory
grep: /sys/devices/system/cpu/cpu*/cache/index?/type: No such file or directory

root@firefly:~# dmesg|grep cacheinfo
[    0.006290] cacheinfo: Unable to detect cache hierarchy for CPU 0
[    0.016339] cacheinfo: Unable to detect cache hierarchy for CPU 1
[    0.017692] cacheinfo: Unable to detect cache hierarchy for CPU 2
[    0.019050] cacheinfo: Unable to detect cache hierarchy for CPU 3
[    0.020478] cacheinfo: Unable to detect cache hierarchy for CPU 4
[    0.021660] cacheinfo: Unable to detect cache hierarchy for CPU 5
[    1.990108] cacheinfo: Unable to detect cache hierarchy for CPU 0

root@firefly:~# grep '' /sys/devices/system/cpu/cpu0/topology/*
/sys/devices/system/cpu/cpu0/topology/cluster_cpus:0f
/sys/devices/system/cpu/cpu0/topology/cluster_cpus_list:0-3
/sys/devices/system/cpu/cpu0/topology/cluster_id:0
/sys/devices/system/cpu/cpu0/topology/core_cpus:01
/sys/devices/system/cpu/cpu0/topology/core_cpus_list:0
/sys/devices/system/cpu/cpu0/topology/core_id:0
/sys/devices/system/cpu/cpu0/topology/core_siblings:3f
/sys/devices/system/cpu/cpu0/topology/core_siblings_list:0-5
/sys/devices/system/cpu/cpu0/topology/package_cpus:3f
/sys/devices/system/cpu/cpu0/topology/package_cpus_list:0-5
/sys/devices/system/cpu/cpu0/topology/physical_package_id:0
/sys/devices/system/cpu/cpu0/topology/thread_siblings:01
/sys/devices/system/cpu/cpu0/topology/thread_siblings_list:0

$ grep '' /sys/devices/system/cpu/cpu*/cpu_capacity
/sys/devices/system/cpu/cpu0/cpu_capacity:381
/sys/devices/system/cpu/cpu1/cpu_capacity:381
/sys/devices/system/cpu/cpu2/cpu_capacity:381
/sys/devices/system/cpu/cpu3/cpu_capacity:381
/sys/devices/system/cpu/cpu4/cpu_capacity:1024
/sys/devices/system/cpu/cpu5/cpu_capacity:1024

VIM3L: S905D3 (4*A55), kernel 5.14.10

$ grep '' /sys/devices/system/cpu/cpu0/topology/*
/sys/devices/system/cpu/cpu0/topology/core_cpus:1
/sys/devices/system/cpu/cpu0/topology/core_cpus_list:0
/sys/devices/system/cpu/cpu0/topology/core_id:0
/sys/devices/system/cpu/cpu0/topology/core_siblings:f
/sys/devices/system/cpu/cpu0/topology/core_siblings_list:0-3
/sys/devices/system/cpu/cpu0/topology/die_cpus:1
/sys/devices/system/cpu/cpu0/topology/die_cpus_list:0
/sys/devices/system/cpu/cpu0/topology/die_id:-1
/sys/devices/system/cpu/cpu0/topology/package_cpus:f
/sys/devices/system/cpu/cpu0/topology/package_cpus_list:0-3
/sys/devices/system/cpu/cpu0/topology/physical_package_id:0
/sys/devices/system/cpu/cpu0/topology/thread_siblings:1
/sys/devices/system/cpu/cpu0/topology/thread_siblings_list:0

$ grep '' /sys/devices/system/cpu/cpu0/cache/index?/{shared_cpu_list,type}
/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index1/shared_cpu_list:0
/sys/devices/system/cpu/cpu0/cache/index2/shared_cpu_list:0-3
/sys/devices/system/cpu/cpu0/cache/index0/type:Data
/sys/devices/system/cpu/cpu0/cache/index1/type:Instruction
/sys/devices/system/cpu/cpu0/cache/index2/type:Unified

$ grep '' /sys/devices/system/cpu/cpu*/cpu_capacity
/sys/devices/system/cpu/cpu0/cpu_capacity:1024
/sys/devices/system/cpu/cpu1/cpu_capacity:1024
/sys/devices/system/cpu/cpu2/cpu_capacity:1024
/sys/devices/system/cpu/cpu3/cpu_capacity:1024

Odroid-N2: S922X (4*A73 + 2*A53), kernel 4.9.254

willy@n2:~$ grep '' /sys/devices/system/cpu/cpu*/cache/index?/{shared_cpu_list,type}
grep: /sys/devices/system/cpu/cpu*/cache/index?/shared_cpu_list: No such file or directory
grep: /sys/devices/system/cpu/cpu*/cache/index?/type: No such file or directory

willy@n2:~$ sudo dmesg|grep -i 'cache hi'
[    0.649924] Unable to detect cache hierarchy for CPU 0

No capacity.

Note that it reports 2 physical packages!

willy@n2:~$ grep '' /sys/devices/system/cpu/cpu0/topology/*
/sys/devices/system/cpu/cpu0/topology/core_id:0
/sys/devices/system/cpu/cpu0/topology/core_siblings:03
/sys/devices/system/cpu/cpu0/topology/core_siblings_list:0-1
/sys/devices/system/cpu/cpu0/topology/physical_package_id:0
/sys/devices/system/cpu/cpu0/topology/thread_siblings:01
/sys/devices/system/cpu/cpu0/topology/thread_siblings_list:0

willy@n2:~$ grep '' /sys/devices/system/cpu/cpu4/topology/*
/sys/devices/system/cpu/cpu4/topology/core_id:2
/sys/devices/system/cpu/cpu4/topology/core_siblings:3c
/sys/devices/system/cpu/cpu4/topology/core_siblings_list:2-5
/sys/devices/system/cpu/cpu4/topology/physical_package_id:1
/sys/devices/system/cpu/cpu4/topology/thread_siblings:10
/sys/devices/system/cpu/cpu4/topology/thread_siblings_list:4

StarFive VisionFive2 - JH7110, kernel 5.15

willy@starfive:~/haproxy$ ./haproxy -c -f cps3.cfg
thr 0 -> cpu 0 onl=1 bnd=1 pk=00 no=-1 l3=-1 cl=000 l2=000 ts=000 l1=000
thr 1 -> cpu 1 onl=1 bnd=1 pk=00 no=-1 l3=-1 cl=000 l2=000 ts=001 l1=001
thr 2 -> cpu 2 onl=1 bnd=1 pk=00 no=-1 l3=-1 cl=000 l2=000 ts=002 l1=002
thr 3 -> cpu 3 onl=1 bnd=1 pk=00 no=-1 l3=-1 cl=000 l2=000 ts=003 l1=003
Configuration file is valid

Graviton2 / Graviton3 ?


On PPC64 not everything is available:

https://www.ibm.com/docs/en/linux-on-systems?topic=cpus-cpu-topology

  /sys/devices/system/cpu/cpu<N>/topology/thread_siblings
  /sys/devices/system/cpu/cpu<N>/topology/core_siblings
  /sys/devices/system/cpu/cpu<N>/topology/book_siblings
  /sys/devices/system/cpu/cpu<N>/topology/drawer_siblings

# lscpu -e
CPU NODE DRAWER BOOK SOCKET CORE L1d:L1i:L2d:L2i ONLINE CONFIGURED POLARIZATION ADDRESS
  0    1      0    0      0    0 0:0:0:0         yes    yes        horizontal   0
  1    1      0    0      0    0 1:1:1:1         yes    yes        horizontal   1
  2    1      0    0      0    1 2:2:2:2         yes    yes        horizontal   2
  3    1      0    0      0    1 3:3:3:3         yes    yes        horizontal   3
  4    1      0    0      0    2 4:4:4:4         yes    yes        horizontal   4
  5    1      0    0      0    2 5:5:5:5         yes    yes        horizontal   5
  6    1      0    0      0    3 6:6:6:6         yes    yes        horizontal   6
  7    1      0    0      0    3 7:7:7:7         yes    yes        horizontal   7
  8    0      1    1      1    4 8:8:8:8         yes    yes        horizontal   8
...

Intel E5-2600v2/v3 has two L3:
https://www.enterpriseai.news/2014/09/08/intel-ups-performance-ante-haswell-xeon-chips/

More info on these, and s390's "books" (mostly L4 in fact):
https://groups.google.com/g/fa.linux.kernel/c/qgAxjYq8ohI

########################################
Analysis:
- some server ARM CPUs (Altra, LX2) do not return any L3 info though they
  DO have some. They stop at L2.

- other CPUs like the Atom N2800 and the Armada 8040 do not have an L3.

  => there's no apparent way to detect that the server CPUs do have an L3.
  => or maybe we should consider that it's more likely that there is one
     than none? The Armada works much better with groups than without. It's
     basically the same topology as the N2800.

  => Do we really care then? No L3 = same L3 for everyone. The problem is
     that those really without an L3 will make a difference on L2 while the
     other ones will not. Maybe we should consider that it does not make
     sense to cut groups on L2 (i.e. under no circumstance will we have one
     group per core).

  => This would mean:
     - regardless of L3, consider the LLC. If the LLC has more than one
       core per instance, it's likely the last level (not true on LX2
       but better use 8 groups of 2 than nothing).

     - otherwise, if there's a single core per instance, it's unlikely
       to be the LLC so we can imagine the LLC is unified. Note that
       some systems such as LX2/Armada8K (and Neoverse-N1 devices as
       well) may have 2 cores per L2, yet this doesn't allow us to infer
       anything regarding the absence of an L3. The Core2-quad has 2 cores
       per L2 with no L3, like the Armada8K. The LX2 has 2 cores per L2
       yet does have an L3 which is not necessarily reported.

     - this needs to be done per {node,package}!
  => core_siblings and thread_siblings seem to be the only portable
     ones to figure packages and threads

At the very least, when multiple nodes are possibly present, there is a
symlink "node0", "node1" etc in the cpu entry. It requires a lookup for each
cpu directory though, while reading /sys/devices/system/node/node*/cpulist is
much cheaper.
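
A quick, hedged sketch of how such a "cpulist" string (e.g. "0-2,24-26" as read
from node*/cpulist or shared_cpu_list) could be turned into a bitmap; this is
purely illustrative C, not haproxy's actual parser, and MAX_CPUS is an assumed
constant:

    #include <stdlib.h>

    #define MAX_CPUS 1024

    /* Parse a kernel "cpulist" string such as "0-2,24-26" and mark each
     * listed CPU in a caller-provided bitmap of MAX_CPUS/8 bytes.
     * Returns the number of CPUs found, or -1 on syntax error.
     */
    static int parse_cpulist(const char *s, unsigned char *mask)
    {
        int count = 0;

        while (*s && *s != '\n') {
            char *end;
            long lo = strtol(s, &end, 10);
            long hi = lo;

            if (end == s || lo < 0 || lo >= MAX_CPUS)
                return -1;
            s = end;
            if (*s == '-') {                /* range such as "24-26" */
                hi = strtol(s + 1, &end, 10);
                if (end == s + 1 || hi < lo || hi >= MAX_CPUS)
                    return -1;
                s = end;
            }
            for (long cpu = lo; cpu <= hi; cpu++) {
                mask[cpu / 8] |= 1 << (cpu % 8);
                count++;
            }
            if (*s == ',')
                s++;
        }
        return count;
    }

Feeding it one line of /sys/devices/system/node/node0/cpulist on the SPR above
("0-55,112-167") would mark 112 CPUs in one read, which is exactly why it is
cheaper than one lookup per cpu directory.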

There's some redundancy in this. Probably a better approach:

1) if there is more than 1 CPU:
   - if cache/index3 exists, use its cpulist to pre-group entries.
   - else if topology or node exists, use (node,package,die,core_siblings) to
     group entries
   - else pre-create a single large group

2) if there is more than 1 CPU and less than max#groups:
   - for each group, if no cache/index3 exists and cache/index2 exists and some
     index2 entries contain at least two CPUs of different cores or a single one
     for a 2-core system, then use that to re-split the group.

   - if in the end there are too many groups, remerge some of them (?) or stick
     to the previous layout (?)

   - if in the end there are too many CPUs in a group, cut as needed, if
     possible with an integral result (/2, /3, ...)

3) L1 cache / thread_siblings should be used to associate CPUs by cores in
   the same groups.

Maybe instead it should be done bottom->top by collecting info and merging
groups while keeping CPU lists ordered to ease later splitting.

1) create a group per bound CPU
2) based on thread_siblings, detect CPUs that are on the same core, merge
   their groups. They may not always create similarly sized groups.
   => eg: epyc keeps 24 groups such as {0,24}, ...
          ryzen 2700x keeps 4 groups such as {0,1}, ...
          rk3588 keeps 3 groups {0-3},{4-5},{6-7}
3) based on cache index0/1, detect CPUs that are on the same L1 cache,
   merge their groups. They may not always create similarly sized groups.
4) based on cache index2, detect CPUs that are on the same L2 cache, merge
   their groups. They may not always create similarly sized groups.
   => eg: mcbin now keeps 2 groups {0-1},{2,3}
5) At this point there may possibly be too many groups (still one per CPU,
   e.g. when no cache info was found or there are many cores with their own
   L2 like on SPR) or too large a group (when all cores are indeed on the
   same L2).

   5.1) if there are as many groups as bound CPUs, merge them all together in
        a single one => lx2, altra, mcbin
   5.2) if there are still more than max#groups, merge them all together in a
        single one since the splitting criterion is not relevant
   5.3) if there is a group with too many CPUs, split it in two if integral,
        otherwise 3, etc, trying to add the least possible number of groups.
        If too difficult (e.g. result less than half the authorized max),
        let's just round around N/((N+63)/64).
   5.4) if at the end there are too many groups, warn that we can't optimize
        the setup and are limiting ourselves to the first node or 64 CPUs.
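
Minimal sketch of the bottom-up merging above, assuming the caller already
derived a core id, an L1 id and an L2 id per bound CPU from sysfs (all names
and the O(n^3) relabelling are illustrative only, not haproxy code):

    #define MAX_CPUS 1024

    /* merge groups whose CPUs carry the same (non -1) id at one level */
    static void merge_by_id(int ncpu, int grp[], const int id[])
    {
        for (int a = 0; a < ncpu; a++)
            for (int b = a + 1; b < ncpu; b++)
                if (grp[a] != grp[b] && id[a] == id[b] && id[a] != -1) {
                    int old = grp[b];
                    for (int c = 0; c < ncpu; c++)  /* relabel b's group */
                        if (grp[c] == old)
                            grp[c] = grp[a];
                }
    }

    /* count distinct group labels */
    static int count_groups(int ncpu, const int grp[])
    {
        int n = 0;
        for (int a = 0; a < ncpu; a++) {
            int seen = 0;
            for (int b = 0; b < a; b++)
                seen |= (grp[b] == grp[a]);
            n += !seen;
        }
        return n;
    }

    /* steps 1 to 5.2: one group per CPU, merge by core/L1/L2, then collapse
     * the degenerate cases (one group per CPU, or more than max_grps groups).
     */
    static int make_groups(int ncpu, int grp[], const int core_id[],
                           const int l1_id[], const int l2_id[], int max_grps)
    {
        for (int c = 0; c < ncpu; c++)
            grp[c] = c;                     /* step 1 */
        merge_by_id(ncpu, grp, core_id);    /* step 2: thread_siblings */
        merge_by_id(ncpu, grp, l1_id);      /* step 3: same L1 */
        merge_by_id(ncpu, grp, l2_id);      /* step 4: same L2 */

        if (count_groups(ncpu, grp) == ncpu || count_groups(ncpu, grp) > max_grps)
            for (int c = 0; c < ncpu; c++)  /* steps 5.1 / 5.2 */
                grp[c] = 0;
        return count_groups(ncpu, grp);
    }

Step 5.3 can then be applied on any remaining group of N CPUs by cutting it
into (N+63)/64 slices of roughly N/((N+63)/64) CPUs each, e.g. 80 CPUs give
2 slices of 40.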

Observations:
- lx2 definitely works better with everything bound together than by creating
  8 groups (~130k rps vs ~120k rps)
  => does this mean we should assume a unified L3 if there's no L3 info, and
     remerge everything? Likely the Altra would benefit from this as well.
     mcbin doesn't notice any change (within noise in both directions)

- on x86 13th gen, 2 P-cores and 8 E-cores. The P-cores support HT, not the
  E-cores. There's no cpu_capacity there, but the cluster_id is properly set.
  => proposal: when a machine reports both single-threaded cores and SMT,
     consider the SMT ones bigger and use them.

Problems: how should auto-detection interfere with user settings?

- Case 1: program started with a reduced taskset
  => current: this serves to set the thread count first, and to map default
     threads to CPUs if they are not assigned by a cpu-map.

  => we want to keep that behavior (i.e. use all these threads) but only
     change how the thread-groups are arranged.

  - example: start on the first 6c12t of an EPYC 74F3; this should
    automatically create 2 groups for the two sockets.

  => should we brute-force all thread-group combinations to figure how the
     threads will spread over cpu-map and which one is better? Or should we
     decide to ignore input mapping as soon as there's at least one cpu-map?
     But then which one to use? Or should we consider that cpu-map only works
     with explicit thread-groups?

- Case 2: taskset not involved, but nbthread and cpu-map in the config. In
  fact a pretty standard 2.4-2.8 config.
  => maybe the presence of cpu-map and no thread-groups should be sufficient
     to imply a single thread-group to stay compatible? Or maybe start as
     many thread-groups as are referenced in cpu-map? Seems like cpu-map and
     thread-groups work hand-in-hand regarding topology since cpu-map
     designates hardware CPUs so the user knows better than haproxy. Thus
     why should we try to do better?

- Case 3: taskset not involved, nbthread not involved, cpu-map not involved,
  only thread-groups
  => seems like an ideal approach. Take all online CPUs and try to cut them
     into equitable thread groups? Or rather, since nbthread is not forced,
     better sort the clusters and bind to the N first clusters only? If too
     many groups for the clusters, then try to refine them?

- Case 4: nothing specified at all (default config, target)
  => current: uses only one thread-group with all threads (max 64).
  => desired: bind only to performance cores and cut them in a few groups
     based on l3, package, cluster etc.

- Case 5: nbthread only in the config
  => might match a docker use case. No group nor cpu-map configured. Figure
     the best group usage respecting the thread count.

- Case 6: some constraints are enforced in the config (e.g. threads-hard-limit,
  one-thread-per-core, etc).
  => like 3, 4 or 5 but with selection adjustment.

- Case 7: thread-groups and generic cpu-map 1/all, 2/all... in the config
  => user just wants to use cpu-map as a taskset alternative
  => need to figure the number of threads first, then cut them in groups like
     today, and only then the cpu-maps are found. Can we do better? Not sure.
     Maybe just when cpu-map is too lax (e.g. all entries reference the same
     CPUs). Better use a special "cpumap all/all 0-19" for this, but not
     implemented for now.

Proposal:
- if there is any cpu-map, disable automatic CPU assignment
- if there is any cpu-map, disable automatic thread group detection
- if taskset was forced, disable automatic CPU assignment

### 2023-07-17 ###

=> step 1: mark CPUs enabled at boot (cpu_detect_usable)
// => step 2: mark CPUs referenced in cpu-map => no, no real meaning
=> step 3: identify all CPU topologies + NUMA (cpu_detect_topology)

=> step 4: if taskset && !cpu-map, mark all non-bound CPUs as unusable (UNAVAIL ?)
   => which is the same as saying if !cpu-map.
=> step 5: if !cpu-map, sort usable CPUs and find the best set to use
//=> step 6: if cpu-map, mark all non-covered CPUs as unusable => not necessarily possible if partial cpu-map

=> step 7: if thread-groups && cpu-map, nothing else to do
=> step 8: if cpu-map && !thread-groups, thread-groups=1
=> step 9: if thread-groups && !cpu-map, use that value to cut the thread set
=> step 10: if !cpu-map && !thread-groups, detect the optimal thread-group count

=> step 11: if !cpu-map, cut the thread set into mostly fair groups and assign
   the group numbers to CPUs; create implicit cpu-maps.
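
Hedged sketch of the step 7-10 decision only (names are placeholders, not
haproxy's real globals; 0 is used here to mean "let auto-detection pick"):

    /* returns the number of thread groups to use, or 0 for auto-detection */
    static int resolve_thread_groups(int has_cpu_map, int nbtgroups_cfg)
    {
        if (has_cpu_map && nbtgroups_cfg)
            return nbtgroups_cfg;   /* step 7: nothing else to do           */
        if (has_cpu_map)
            return 1;               /* step 8: stay compatible              */
        if (nbtgroups_cfg)
            return nbtgroups_cfg;   /* step 9: use it to cut the thread set */
        return 0;                   /* step 10: auto-detect the optimal one */
        /* step 11 is then done by the caller when !cpu-map: cut the thread
         * set into mostly fair groups and create implicit cpu-maps.
         */
    }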

Ideas:
- use minthr and maxthr.
  If nbthread is set, minthr=maxthr=nbthread; else if taskset_forced,
  minthr=1, maxthr=taskset_thr; else minthr=1, maxthr=cpus_enabled.
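
  The same rule, as a small illustrative helper (variable names are made up):

      static void compute_thread_bounds(int nbthread, int taskset_forced,
                                        int taskset_thr, int cpus_enabled,
                                        int *minthr, int *maxthr)
      {
          if (nbthread) {
              *minthr = *maxthr = nbthread;   /* explicit nbthread wins    */
          } else if (taskset_forced) {
              *minthr = 1;
              *maxthr = taskset_thr;          /* bounded by the taskset    */
          } else {
              *minthr = 1;
              *maxthr = cpus_enabled;         /* bounded by enabled CPUs   */
          }
      }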

- use CPU_F_ALLOWED (or DISALLOWED?) and CPU_F_REFERENCED and CPU_F_EXCLUDED?
  Note: cpu-map doesn't exclude, it only includes. Taskset does exclude. Also,
  cpu-map only includes the CPUs that will belong to the correct groups & threads.

- Usual startup: taskset presets the CPU sets and sets the thread count. Tgrp
  defaults to 1, then threads indicated in cpu-map get their CPU assigned.
  Other ones are not changed. If we say that cpu-map => tgrp==1 then it means
  we can infer automatic grouping for group 1 only?
  => it could be said that the CPUs of all enabled groups mentioned in
     cpu-map are considered usable, but we don't know how many of these
     will really have threads started on them.

  => maybe completely ignore cpu-map instead (i.e. fall back to thread-groups 1)?
  => automatic detection would mean:
     - if !cpu-map && !nbthrgrp => must automatically detect thgrp
     - if !cpu-map => must automatically detect binding
     - otherwise nothing

Examples of problems:

   thread-groups 4
   nbthread 128
   cpu-map 1/all 0-63
   cpu-map 2/all 128-191

   => 32 threads per group, hence grp 1 uses 0-63 and grp 2 128-191,
      grp 3 and grp 4 unknown, in practice on boot CPUs.

   => could we demand that if one cpu-map is specified, then all groups
      are covered? Do we really need this after all? i.e. let's just not
      bind other threads and that's all (and what is written).


Calls from haproxy.c:

   cpu_detect_usable()
   cpu_detect_topology()

   + thread_detect_count()
     => compute nbtgroups
     => compute nbthreads

   thread_assign_cpus() ?

   check_config_validity()


BUGS:
- cpu_map[0].proc still used for the whole process in daemon mode (though not
  in foreground mode)
  -> whole process bound to thread group 1
  -> binding not working in foreground

- cpu_map[x].proc ANDed with the thread's map despite the thread's map
  apparently never being set
  -> group binding ignored?

2023-09-05
----------
Remember to make the distinction between sorting (used for grouping) and
preference. We should avoid selecting the first CPUs as it encourages the use
of wrong grouping criteria. E.g. CPU capacity has no business being used for
grouping, it's used for selecting. Support for HT, however, does, because it
allows us to pack together threads of the same core.

We should also have an option to enable/disable SMT (e.g. max threads per core)
so that we can skip siblings of cores already assigned. This can be convenient
with the network running on the other sibling.
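
A tiny sketch of such a "max threads per core" filter, assuming core_id[] was
already filled from core_cpus_list/thread_siblings (names are illustrative):

    /* keep at most <max_per_core> CPUs from each core; keep[] is filled with
     * 1/0 per CPU and the number of kept CPUs is returned.
     */
    static int limit_threads_per_core(int ncpu, const int core_id[],
                                      int max_per_core, int keep[])
    {
        int kept = 0;

        for (int a = 0; a < ncpu; a++) {
            int same_core = 0;

            for (int b = 0; b < a; b++)
                same_core += keep[b] && core_id[b] == core_id[a];
            keep[a] = (same_core < max_per_core);
            kept += keep[a];
        }
        return kept;
    }

With max_per_core=1 on the EPYC above this would keep CPU 0 and drop its
sibling CPU 24, leaving the second thread free for the network.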


2024-12-26
----------

Some interesting cases about the intel 14900. The CPU has 8 P-cores and 16
E-cores. Experiments in the lab show excellent performance by binding the
network to E cores and haproxy to P cores. Here's how the clusters are made:

$ grep -h . /sys/devices/system/cpu/cpu*/topology/package_cpus | sort |uniq -c
     32 ffffffff

  => expected

$ grep -h . /sys/devices/system/cpu/cpu*/topology/die_cpus | sort |uniq -c
     32 ffffffff

  => all CPUs on the same die

$ grep -h . /sys/devices/system/cpu/cpu*/topology/cluster_cpus | sort |uniq -c
      2 00000003
      2 0000000c
      2 00000030
      2 000000c0
      2 00000300
      2 00000c00
      2 00003000
      2 0000c000
      4 000f0000
      4 00f00000
      4 0f000000
      4 f0000000

  => 1 "cluster" per core on each P-core (2 threads, 8 clusters total)
  => 1 "cluster" per 4 E-cores (4 clusters total)
  => It can be difficult to split that into groups by just using this topology.

$ grep -h . /sys/devices/system/cpu/cpu*/cache/index3/shared_cpu_list | sort |uniq -c
     32 0-31

  => everyone shares a uniform L3 cache

$ grep -h . /sys/devices/system/cpu/cpu*/cache/index2/shared_cpu_map | sort |uniq -c
      2 00000003
      2 0000000c
      2 00000030
      2 000000c0
      2 00000300
      2 00000c00
      2 00003000
      2 0000c000
      4 000f0000
      4 00f00000
      4 0f000000
      4 f0000000

  => L2 is split like the respective "clusters" above.

Seems like one would like to split them into 12 groups :-/  Maybe it still
remains relevant to consider L3 for grouping, and core performance for
selection (e.g. evict/prefer E-cores depending on policy).

Differences between P and E cores on the 14900:

  - acpi_cppc/*perf : pretty useful but not always there (e.g. aloha)
  - cache index0: 48 vs 32k (the bigger core has the bigger L1d here)
  - cache index1: 32 vs 64k (the smaller core has the bigger L1i)
  - cache index2: 2 vs 4M, but dedicated per core vs shared per cluster (4 cores)

  => probably that the presence of a larger "cluster" with less cache per
     avg core is an indication of a smaller CPU set. Warning however, some
     CPUs (e.g. S922X) have a large (4) cluster of big cores and a small (2)
     cluster of little cores.


diff -urN cpu0/acpi_cppc/lowest_nonlinear_perf cpu16/acpi_cppc/lowest_nonlinear_perf
--- cpu0/acpi_cppc/lowest_nonlinear_perf        2024-12-26 18:39:27.563410317 +0100
+++ cpu16/acpi_cppc/lowest_nonlinear_perf       2024-12-26 18:40:39.531408186 +0100
@@ -1 +1 @@
-20
+15
diff -urN cpu0/acpi_cppc/nominal_perf cpu16/acpi_cppc/nominal_perf
--- cpu0/acpi_cppc/nominal_perf 2024-12-26 18:39:27.563410317 +0100
+++ cpu16/acpi_cppc/nominal_perf        2024-12-26 18:40:39.531408186 +0100
@@ -1 +1 @@
-40
+24
diff -urN cpu0/acpi_cppc/reference_perf cpu16/acpi_cppc/reference_perf
--- cpu0/acpi_cppc/reference_perf       2024-12-26 18:39:27.563410317 +0100
+++ cpu16/acpi_cppc/reference_perf      2024-12-26 18:40:39.531408186 +0100
@@ -1 +1 @@
-40
+24
diff -urN cpu0/cache/index0/size cpu16/cache/index0/size
--- cpu0/cache/index0/size      2024-12-26 18:39:27.563410317 +0100
+++ cpu16/cache/index0/size     2024-12-26 18:40:39.531408186 +0100
@@ -1 +1 @@
-48K
+32K
diff -urN cpu0/cache/index1/shared_cpu_list cpu16/cache/index1/shared_cpu_list
--- cpu0/cache/index1/shared_cpu_list   2024-12-26 18:39:27.563410317 +0100
+++ cpu16/cache/index1/shared_cpu_list  2024-12-26 18:40:39.531408186 +0100
@@ -1 +1 @@
-0-1
+16
diff -urN cpu0/cache/index1/shared_cpu_map cpu16/cache/index1/shared_cpu_map
--- cpu0/cache/index1/shared_cpu_map    2024-12-26 18:39:27.563410317 +0100
+++ cpu16/cache/index1/shared_cpu_map   2024-12-26 18:40:39.531408186 +0100
@@ -1 +1 @@
-00000003
+00010000
diff -urN cpu0/cache/index1/size cpu16/cache/index1/size
--- cpu0/cache/index1/size      2024-12-26 18:39:27.563410317 +0100
+++ cpu16/cache/index1/size     2024-12-26 18:40:39.531408186 +0100
@@ -1 +1 @@
-32K
+64K
diff -urN cpu0/cache/index2/shared_cpu_list cpu16/cache/index2/shared_cpu_list
--- cpu0/cache/index2/shared_cpu_list   2024-12-26 18:39:27.563410317 +0100
+++ cpu16/cache/index2/shared_cpu_list  2024-12-26 18:40:39.531408186 +0100
@@ -1 +1 @@
-0-1
+16-19
--- cpu0/cache/index2/size      2024-12-26 18:39:27.563410317 +0100
+++ cpu16/cache/index2/size     2024-12-26 18:40:39.531408186 +0100
@@ -1 +1 @@
-2048K
+4096K
diff -urN cpu0/topology/cluster_cpus cpu16/topology/cluster_cpus
--- cpu0/topology/cluster_cpus  2024-12-26 18:39:27.563410317 +0100
+++ cpu16/topology/cluster_cpus 2024-12-26 18:40:39.531408186 +0100
@@ -1 +1 @@
-00000003
+000f0000
diff -urN cpu0/topology/cluster_cpus_list cpu16/topology/cluster_cpus_list
--- cpu0/topology/cluster_cpus_list     2024-12-26 18:39:27.563410317 +0100
+++ cpu16/topology/cluster_cpus_list    2024-12-26 18:40:39.531408186 +0100
@@ -1 +1 @@
-0-1
+16-19


For acpi_cppc, the values differ between machines; it looks like nominal_perf
is always usable:

14900k:
$ grep '' cpu8/acpi_cppc/*
cpu8/acpi_cppc/feedback_ctrs:ref:85172004640 del:143944480100
cpu8/acpi_cppc/highest_perf:255
cpu8/acpi_cppc/lowest_freq:0
cpu8/acpi_cppc/lowest_nonlinear_perf:20
cpu8/acpi_cppc/lowest_perf:1
cpu8/acpi_cppc/nominal_freq:3200
cpu8/acpi_cppc/nominal_perf:40
cpu8/acpi_cppc/reference_perf:40
cpu8/acpi_cppc/wraparound_time:18446744073709551615

$ grep '' cpu16/acpi_cppc/*
cpu16/acpi_cppc/feedback_ctrs:ref:84153776128 del:112977352354
cpu16/acpi_cppc/highest_perf:255
cpu16/acpi_cppc/lowest_freq:0
cpu16/acpi_cppc/lowest_nonlinear_perf:15
cpu16/acpi_cppc/lowest_perf:1
cpu16/acpi_cppc/nominal_freq:3200
cpu16/acpi_cppc/nominal_perf:24
cpu16/acpi_cppc/reference_perf:24
cpu16/acpi_cppc/wraparound_time:18446744073709551615

altra:
$ grep '' /sys/devices/system/cpu/cpu0/acpi_cppc/*
feedback_ctrs:ref:227098452801 del:590247062111
highest_perf:260
lowest_freq:1000
lowest_nonlinear_perf:200
lowest_perf:100
nominal_freq:2600
nominal_perf:260
reference_perf:100

w3-2345:
$ grep '' /sys/devices/system/cpu/cpu0/acpi_cppc/*
feedback_ctrs:ref:4775674480779 del:5675950973600
highest_perf:45
lowest_freq:0
lowest_nonlinear_perf:8
lowest_perf:5
nominal_freq:0
nominal_perf:31
reference_perf:31
wraparound_time:18446744073709551615
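
A minimal sketch of reading such a per-CPU attribute (e.g. acpi_cppc/nominal_perf
or cpufreq/scaling_max_freq), returning -1 when the file is absent as on machines
without CPPC; illustrative only, not haproxy's actual code:

    #include <stdio.h>

    static long read_cpu_attr(int cpu, const char *attr)
    {
        char path[256];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%d/%s", cpu, attr);
        f = fopen(path, "r");
        if (!f)
            return -1;                       /* attribute not exposed */
        if (fscanf(f, "%ld", &val) != 1)
            val = -1;
        fclose(f);
        return val;
    }

    /* e.g. read_cpu_attr(8, "acpi_cppc/nominal_perf") gives 40 on a 14900K
     * P-core and 24 on an E-core; a caller could fall back to
     * "cpufreq/scaling_max_freq" when it returns -1.
     */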

Other approaches may consist in checking the CPU's max frequency via
cpufreq, e.g. on the N2:

$ grep . /sys/devices/system/cpu/cpu?/cpufreq/scaling_max_freq
/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq:2016000
/sys/devices/system/cpu/cpu1/cpufreq/scaling_max_freq:2016000
/sys/devices/system/cpu/cpu2/cpufreq/scaling_max_freq:2400000
/sys/devices/system/cpu/cpu3/cpufreq/scaling_max_freq:2400000
/sys/devices/system/cpu/cpu4/cpufreq/scaling_max_freq:2400000
/sys/devices/system/cpu/cpu5/cpufreq/scaling_max_freq:2400000

However on x86, the cores no longer all have the same frequency, like below on
the W3-2345, so it cannot always be used to split them into groups; it may at
best be used to sort them.

$ grep . /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq
/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq:4500000
/sys/devices/system/cpu/cpu1/cpufreq/scaling_max_freq:4500000
/sys/devices/system/cpu/cpu2/cpufreq/scaling_max_freq:4300000
/sys/devices/system/cpu/cpu3/cpufreq/scaling_max_freq:4400000
/sys/devices/system/cpu/cpu4/cpufreq/scaling_max_freq:4300000
/sys/devices/system/cpu/cpu5/cpufreq/scaling_max_freq:4300000
/sys/devices/system/cpu/cpu6/cpufreq/scaling_max_freq:4400000
/sys/devices/system/cpu/cpu7/cpufreq/scaling_max_freq:4300000
/sys/devices/system/cpu/cpu8/cpufreq/scaling_max_freq:4500000
/sys/devices/system/cpu/cpu9/cpufreq/scaling_max_freq:4500000
/sys/devices/system/cpu/cpu10/cpufreq/scaling_max_freq:4300000
/sys/devices/system/cpu/cpu11/cpufreq/scaling_max_freq:4400000
/sys/devices/system/cpu/cpu12/cpufreq/scaling_max_freq:4300000
/sys/devices/system/cpu/cpu13/cpufreq/scaling_max_freq:4300000
/sys/devices/system/cpu/cpu14/cpufreq/scaling_max_freq:4400000
/sys/devices/system/cpu/cpu15/cpufreq/scaling_max_freq:4300000

On the 14900, not cool either:

$ grep -h . /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq|sort|uniq -c
     16 4400000
     12 5700000
      4 6000000

Considering that values within +/-10% of a cluster's min/max are still part of
it would seem to work and would make a good rule of thumb.
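
The rule of thumb could look like the small sketch below (illustrative only,
and simplified to compare against the first frequency seen in each bucket
rather than the bucket's evolving min/max):

    /* return the bucket index for <freq>, creating a new bucket in ref[]
     * when it is not within +/-10% of an existing one. *nref must start at 0.
     */
    static int freq_bucket(long freq, long ref[], int *nref)
    {
        for (int i = 0; i < *nref; i++)
            if (freq >= ref[i] - ref[i] / 10 && freq <= ref[i] + ref[i] / 10)
                return i;
        ref[*nref] = freq;
        return (*nref)++;
    }

On the 14900 above this yields two buckets (4400000 alone, then 5700000 and
6000000 merged), i.e. E-cores vs P-cores, while on the W3-2345 all of
4300000/4400000/4500000 fall into a single bucket, which matches the
observation that frequency can sort but not split that machine.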

On x86, the model number might help, here on the w3-2345:

$ grep '^model\s\s' /proc/cpuinfo |sort|uniq -c
     16 model           : 143

But not always (here: 14900K with 8xP and 16xE):

$ grep '^model\s\s' /proc/cpuinfo |sort|uniq -c
     32 model           : 183

On ARM it's rather the part number:

# a9
$ grep part /proc/cpuinfo
CPU part        : 0xc09
CPU part        : 0xc09

# a17
$ grep part /proc/cpuinfo
CPU part        : 0xc0d
CPU part        : 0xc0d
CPU part        : 0xc0d
CPU part        : 0xc0d

# a72
$ grep part /proc/cpuinfo
CPU part        : 0xd08
CPU part        : 0xd08
CPU part        : 0xd08
CPU part        : 0xd08

# a53+a72
$ grep part /proc/cpuinfo
CPU part        : 0xd03
CPU part        : 0xd03
CPU part        : 0xd03
CPU part        : 0xd03
CPU part        : 0xd08
CPU part        : 0xd08

# a53+a73
$ grep 'part' /proc/cpuinfo
CPU part        : 0xd03
CPU part        : 0xd03
CPU part        : 0xd09
CPU part        : 0xd09
CPU part        : 0xd09
CPU part        : 0xd09

# a55+a76
$ grep 'part' /proc/cpuinfo
CPU part        : 0xd05
CPU part        : 0xd05
CPU part        : 0xd05
CPU part        : 0xd05
CPU part        : 0xd0b
CPU part        : 0xd0b
CPU part        : 0xd0b
CPU part        : 0xd0b
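
A hedged sketch of counting distinct core models this way; it only counts
distinct matching lines and a real implementation would also record which CPU
each line belongs to (for x86 a key stricter than "model", such as "model\t",
is needed to avoid matching "model name"):

    #include <stdio.h>
    #include <string.h>

    /* count distinct "<key>..." lines in /proc/cpuinfo, e.g. key="CPU part" */
    static int count_core_models(const char *key)
    {
        char line[256], seen[16][64];
        int nseen = 0;
        FILE *f = fopen("/proc/cpuinfo", "r");

        if (!f)
            return -1;
        while (fgets(line, sizeof(line), f)) {
            int found = 0;

            if (strncmp(line, key, strlen(key)) != 0)
                continue;
            for (int i = 0; i < nseen; i++)
                found |= (strcmp(seen[i], line) == 0);
            if (!found && nseen < 16)
                snprintf(seen[nseen++], sizeof(seen[0]), "%s", line);
        }
        fclose(f);
        return nseen;
    }

E.g. count_core_models("CPU part") would return 1 on the a72 board above and
2 on the a53+a72 and a55+a76 ones.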


2024-12-27
----------

Such machines with P+E cores are becoming increasingly common. Some like the
CIX-P1 can even provide 3 levels of performance: 4 big cores (A720-2.8G), 4
medium cores (A720-2.4G), 4 little cores (A520-1.8G). Architectures like below
will become the norm, and can be used under different policies:

                  +-----------------------------+
                  |              L3             |
                  +---+----------+----------+---+
                      |          |          |
                  +---+---+  +---+---+  +---+---+
                  | P | P |  | E | E |  | E | E |
  Policy:         +---+---+  +---+---+  +---+---+
  -------         | P | P |  | E | E |  | E | E |
                  +---+---+  +---+---+  +---+---+
  1 group, min:      N/A         0          0
  1 group, max:       0         N/A        N/A
  1 group, all:       0          0          0
  2 groups, min:     N/A         0          1
  2 groups, full:     0          1          1
  3 groups:           0          1          2

In dual-socket or multi-die setups it can even become more complicated:

  +---+---+  +---+---+  +---+---+
  | P | P |  | E | E |  | E | E |
  +---+---+  +---+---+  +---+---+
  | P | P |  | E | E |  | E | E |
  +---+---+  +---+---+  +---+---+
      |          |          |
  +---+----------+----------+---+
  |             L3.0            |
  +-----------------------------+

  +-----------------------------+
  |             L3.1            |
  +---+----------+----------+---+
      |          |          |
  +---+---+  +---+---+  +---+---+
  | P | P |  | E | E |  | E | E |
  +---+---+  +---+---+  +---+---+
  | P | P |  | E | E |  | E | E |
  +---+---+  +---+---+  +---+---+

Setting only a thread count would yield interesting things above:
    1-4T:   P.0
    5-8T:   P.0, P.1 (2 grp)
    9-16T:  P.0, E.0, P.1, E.1 (3-4 grp)
    17-24T: PEE.0, PEE.1 (5-6 grp)

With forced tgrp = 1:
  - only fill node 0 first (P then PE, then PEE)

With forced tgrp = 2:

    def:    P.0, P.1
    2-4T:   P.0 only ?
    6-8T:   P.0, P.1
    9-24T:  PEE.0, PEE.1

With dual-socket, dual-die, it becomes:

  +---+---+  +---+---+  +---+---+  '  +---+---+  +---+---+  +---+---+
  | P | P |  | E | E |  | E | E |  '  | P | P |  | E | E |  | E | E |
  +---+---+  +---+---+  +---+---+  '  +---+---+  +---+---+  +---+---+
  | P | P |  | E | E |  | E | E |  '  | P | P |  | E | E |  | E | E |
  +---+---+  +---+---+  +---+---+  '  +---+---+  +---+---+  +---+---+
      |          |          |      '      |          |          |
  +---+----------+----------+---+  '  +---+----------+----------+---+
  |            L3.0.0           |  '  |            L3.1.0           |
  +-----------------------------+  '  +-----------------------------+
                                   '
  +-----------------------------+  '  +-----------------------------+
  |            L3.0.1           |  '  |            L3.1.1           |
  +---+----------+----------+---+  '  +---+----------+----------+---+
      |          |          |      '      |          |          |
  +---+---+  +---+---+  +---+---+  '  +---+---+  +---+---+  +---+---+
  | P | P |  | E | E |  | E | E |  '  | P | P |  | E | E |  | E | E |
  +---+---+  +---+---+  +---+---+  '  +---+---+  +---+---+  +---+---+
  | P | P |  | E | E |  | E | E |  '  | P | P |  | E | E |  | E | E |
  +---+---+  +---+---+  +---+---+  '  +---+---+  +---+---+  +---+---+

In such conditions, it could make sense to first enumerate all the available
cores with all their characteristics, and distribute them between "buckets"
representing the thread groups (see the sketch after this list):

  1. create the min number of tgrp (tgrp.min)
  2. it's possible to automatically create more until tgrp.max
     -> cores are sorted by performance then by proximity. They're
        distributed in order into existing buckets, and if too distant,
        then new groups are created. It could allow for example to use
        all P-cores in the DSDD model above, split into 4 tgrp.
     -> the total number of threads is then discovered at the end.
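
A small illustrative sketch of that bucket distribution: walk the cores already
sorted by performance then proximity, and open a new bucket whenever the next
core sits on another LLC or the current bucket is full, up to tgrp.max (the
struct fields and limits are assumptions, not existing haproxy names):

    struct core { int llc_id; int perf; };

    /* fills bucket[i] with the thread-group index of core i and returns the
     * number of buckets actually created.
     */
    static int fill_buckets(const struct core *c, int ncores,
                            int bucket[], int max_buckets, int max_per_bucket)
    {
        int nbuckets = 0, in_bucket = 0;

        for (int i = 0; i < ncores; i++) {
            if (!nbuckets ||
                (nbuckets < max_buckets &&
                 (c[i].llc_id != c[i - 1].llc_id || in_bucket >= max_per_bucket))) {
                nbuckets++;
                in_bucket = 0;
            }
            bucket[i] = nbuckets - 1;
            in_bucket++;
        }
        return nbuckets;
    }

With the P-cores of the DSDD model above sorted by proximity, each of the four
L3 instances would open its own bucket, i.e. the 4 tgrp mentioned in the note.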


It seems in the end that such binding policies (P, E, single/multi dies,
single/multi sockets etc) should be made more accessible to the user. What
we're missing in "cpu-map" is the ability to apply to the whole process, so
that it can supersede taskset. Indeed, right now, cpu-map requires too many
details and that's why it often remains easier to deal with taskset,
particularly when dealing with thread groups.

We can revisit the situation differently. First, let's keep in mind that
cpu-map is a restriction. It means "use no more than these", it does not
mean "use all of these". So it totally makes sense to use it to replace
taskset at the process level without interfering with group detection.
We could then have:

  - "cpu-map all|process|global|? ..." to apply to the whole process
  - then special keywords for the CPU designation, among:
    - package (socket) number
    - die number (CCD)
    - L3 number (CCX)
    - cluster type (big/performant, medium, little/efficient)
    - use of SMT or not, and which ones
    - maybe optional numbers before these to indicate (any two of them),
      e.g. "4P" to indicate "4 performance cores".

Question: how would we designate "only P cores of socket 0"? Or
"only thread 0 of all P cores"?

One benefit of such a declaration method is that it can make nbthread often
useless and automatic while still portable across a whole fleet of servers.
E.g. if "cpu-map all S0P*T0" would designate thread 0 of all P-cores of
socket 0, it would mean the same on all machines.

Another benefit is that we can make cpu-map and automatic detection more
exclusive:
  - cpu-map all => equivalent of taskset, leaves auto-detection on
  - cpu-map thr => disables auto-detection

So in the end:
  - cpu-map all restricts the CPUs the process may use
    -> auto-detection starts from here and sorts them
  - thread-groups offers more "buckets" to arrange distant CPUs in the
    same process
  - nbthread limits the number of threads we'll use
    -> pick the most suited ones (at least thr.min, at most thr.max)
       and distribute them optimally among the number of thread groups.

One question remains: is it always possible to automatically configure
thread-groups? Maybe it's possible after the detection to set an optimal
one between grp.min and grp.max? (e.g. socket count, core types, etc).

It still seems that a policy such as "optimize-for resources|performance"
would still help quite a bit.

-> what defines a match between a CPU core and a group (see the sketch below):
   - cluster identification:
     - either cluster_cpus if present (and sometimes), or:
     - pkg+die+ccd number
   - same LLC instance (L3 if present, L2 if no L3 etc)
   - CPU core model ("model" on x86, "CPU part" on arm)
   - number of SMT per core
   - speed if known:
     - /sys/devices/system/cpu/cpu0/acpi_cppc/nominal_perf if available
     - or /sys/devices/system/cpu/cpu15/cpufreq/scaling_max_freq +/- 10%

PB: on intel P+E, clusters of E cores share the same L2+L3, but P cores are
alone on their L2 => poor grouping.

Maybe one approach could be to characterize how L3/L2 are used. E.g. on the
14900, we have:
  - L3 0      => all cpus there
  - L2 0..7   => 1C2T per L2
  - L2 8..11  => 4C4T per L2
  => it's obvious that CPUs connected to L2 #8..11 are not the same as those
     on L2 #0..7. We could make something with them.
  => it does not make sense to ditch the L2 distinction due to L3 being
     present and the same, though it doesn't make sense to use L3 either.
     Maybe elements with a cardinality of 1 should just be ignored. E.g.
     cores per cache == 1 => ignore L2. Probably not true per die/pkg
     though.
  => replace absent or irrelevant info with "?"

Note that for caches we have the list of CPUs, not the list of cores, so
we need to remap that individually to cores.

Warning: die_id, core_id etc are per socket, not per system. Worse, on the
Altra, core_id has gigantic values (multiples of +1 and +256). However
core_cpus_list indicates the other threads and could be a solution to create
our own global core ID. Also, cluster_id=-1 is found on all cores of the A8040
on kernel 6.1.

Note that the LLC is always the first discriminator. But within the same LLC
we can have the issues above (e.g. 14900).

Would an intermediate approach like this work?
----------------------------------------------
  1) first split by LLC (also test with the L3-less A8040, N2800, x5-8350)
  2) within an LLC, check if we have different cores (model, perf, freq?)
     and resplit
  3) divide again so that no group has more than 64 CPUs

  => it looks like from the beginning that's what we're trying to do:
     preserve locality first, then possibly trim down the number of cores
     if some don't bring sufficient benefit. It possibly avoids the need
     to identify dies etc. It still doesn't completely solve the 14900
     though.

Multi-die CPUs worth checking:
  Pentium-D (Presler, Dempsey: two 65nm dies)
  Core2Quad Q6600/Q6700 (Kentsfield, Clovertown: two 65nm dual-core dies)
  Core2Quad Q8xxx/Q9xxx (Yorkfield, Harpertown, Tigerton: two 45nm dual-core dies)
  - atom 330 ("diamondville") is really a dual-die
  - note that atom x5-z8350 ("cherry trail"), N2800 ("cedar trail") and D510
    ("pine trail") are single-die (verified) but have two L2 caches and no L3.
  Note that these are apparently not identified as multi-die (Q6600 has die=0).

It *seems* that in order to form groups we'll first have to sort by topology,
and only after that sort by performance so as to choose preferred CPUs.
Otherwise we could end up trying to form inter-socket CPU groups first in
case we're forced to mix adjacent CPUs due to too many groups.
2025-01-07
----------

What is needed in fact is to act on two directions:

  - binding restrictions: the user doesn't want the process to run on the
    second node, on efficient cores, or on the second thread of each core, so
    they're indicating where (not) to bind. This is a strict choice, and it
    overrides taskset. That's the process-wide cpu-map.

  - user preferences / execution profile: the user expresses their wishes
    about how to allocate resources. This is only a binding order strategy
    among a few existing ones that help easily decide which cores to select.
    In this case CPUs are not enumerated. We can imagine choices such as:

      - full:        use all permitted cores
      - performance: use all permitted performance cores (all sockets)
      - single-node: (like today) use all cores of a single node
      - balanced:    use a reasonable amount of perf cores (e.g. all perf
                     cores of a single socket)
      - resources:   use a single cluster of efficient cores
      - minimal:     use a single efficient core

By sorting CPUs first on performance, then applying the filtering based on
the profile to eliminate more CPUs, then applying the limit on the desired max
number of threads, then sorting again on the topology, it should be possible to
draw a list of usable CPUs that can then be split in groups along the L3s.
|
|
|
|
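A minimal sketch of this pipeline, using a simplified, made-up cpu_topo struct
(this is not the real HAProxy structure): sort by capacity, keep at most the
desired number of enabled CPUs, then re-sort by topology so that groups can be
cut along LLC boundaries.

    #include <stdlib.h>

    struct cpu_topo {
        int idx, pkg, llc_id, capa;
        int enabled;   /* cleared by the profile filtering for rejected CPUs */
    };

    static int cmp_capa_desc(const void *a, const void *b)
    {
        const struct cpu_topo *l = a, *r = b;
        return r->capa - l->capa;            /* strongest cores first */
    }

    static int cmp_topology(const void *a, const void *b)
    {
        const struct cpu_topo *l = a, *r = b;
        if (l->pkg != r->pkg)       return l->pkg - r->pkg;
        if (l->llc_id != r->llc_id) return l->llc_id - r->llc_id;
        return l->idx - r->idx;
    }

    static void select_cpus(struct cpu_topo *cpu, int nbcpu, int max_thr)
    {
        int kept = 0;

        /* 1) sort by performance */
        qsort(cpu, nbcpu, sizeof(*cpu), cmp_capa_desc);

        /* 2) keep at most max_thr of the remaining enabled CPUs */
        for (int i = 0; i < nbcpu; i++)
            if (cpu[i].enabled && kept++ >= max_thr)
                cpu[i].enabled = 0;

        /* 3) re-sort by topology so groups can be cut along the L3s */
        qsort(cpu, nbcpu, sizeof(*cpu), cmp_topology);
    }
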
It even sounds likely that the CPU profile or allocation strategy will affect
the first sort method. E.g.:
  - full: no sort needed, though we'll use the same as perf so as to enable
    the maximum possible high-perf threads when #threads is limited
  - performance: we should probably invert the topology so as to maximize
    memory bandwidth across multiple sockets, i.e. visit node1.core0 just
    after node0.core0 etc, and visit their threads later.
  - bandwidth: that could be the same as the "performance" one above in fact
  - (low-)latency: better stay local first
  - balanced: sort by perf then sockets (i.e. P0, P1, E0, E1)
  - resources: sort on perf first.
  - etc

The strategy will also help determine the number of threads when it's not fixed
in the configuration.

Plan:
  1) make the profile configurable and implement the sort:
     - option name? cpu-tuning, cpu-strategy, cpu-policy, cpu-allocation,
       cpu-selection, cpu-priority, cpu-optimize-for, cpu-prefer, cpu-favor,
       cpu-profile
       => cpu-selection

  2) make the process-wide cpu-map configurable
  3) extend cpu-map to make it possible to designate symbolic groups
     (e.g. "ht0/ht1", "node 0", "3*CCD", etc.)

Also, offering an option to the user to see how haproxy sees the CPUs and the
bindings for various profiles would be a nice improvement helping them make
educated decisions instead of trying blindly.

2025-01-11
----------

Configuration profile: there are multiple dimensions:
  - preferences between core types
  - never use a given cpu type
  - never use a given cpu location

Better use something like:
  - ignore-XXX   -> never use XXX
  - avoid-XXX    -> prefer not to use XXX
  - prefer-XXX   -> prefer to use XXX
  - restrict-XXX -> only use XXX

"XXX" could be "single-threaded", "dual-threaded", "first-thread",
"second-thread", "first-socket", "second-socket", "slowest", "fastest",
"node-XXX" etc.

We could then have:
  - cpu-selection restrict-first-socket,ignore-slowest,...

Then some of the keywords could simply be shortcuts for these.

2025-01-30
----------
Problem: we need to set the restrictions first to eliminate undesired CPUs,
         then sort according to the desired preferences so as to pick what
         is considered the best CPUs. So the preference really looks like
         a different setting.

More precisely, the final strategy involves multiple criteria. For example,
let's say that the number of threads is set to 4 and we've restricted ourselves
to using the first thread of each CPU core. We're on an EPYC 74F3, with 3
cores per CCX. One algorithm (resource) would create one group with 3 threads
on the first CCX and 1 group of 1 thread on the next one, then let each of
these threads bind to all the enabled CPU cores of their respective groups.
Another algo (performance) would avoid sharing and would want to place one
thread per CCX, causing the creation of 4 groups of 1 thread each. A third
algo (balanced) would probably say that 4 threads require 2 CCX hence 2
groups, thus there should be 2 threads per group, and it would bind 2 threads
on all cores of the first CCX and the 2 remaining ones on the second.

And if the thread count is not set, these strategies will also do their best
to figure the optimal count. Resource would probably use 1 core max, moderate
one CCX max, balanced one node max, performance all of them.

This means that these CPU selection strategies should provide multiple
functions:
  - how to sort CPUs
  - how to count how many CPUs are best within the imposed rules

The other actions seem to only be static. This also means that "avoid" or
"prefer" should maybe not be used in the end, even in the sorting algo?

Or maybe these are just enums or bits in a strategy and all are considered
at the same time everywhere. For example the thread counting could consider
the presence of "avoid-XXX" during the operations. But how to codify XXX is
complicated then.

Maybe a scoring system could work:
  - default:      all CPUs score = 1000
  - ignore-XXX:   foreach(XXX) set score to 0
  - restrict-XXX: foreach(YYY not XXX), set score to 0
  - avoid-XXX:    foreach(XXX) score *= 0.8
  - prefer-XXX:   foreach(XXX) score *= 1.25

This supports a CPU being penalized ("avoid") for up to roughly 30 different
reasons before its score drops to 0 (1000 * 0.8^31 < 1) and it is permanently
disabled, which is sufficient.

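A minimal sketch of this scoring, with made-up types (this is not HAProxy
code); whether a given CPU matches a rule's "XXX" is assumed to be evaluated
elsewhere and passed through the "matches" flag:

    enum cpu_rule { RULE_IGNORE, RULE_RESTRICT, RULE_AVOID, RULE_PREFER };

    struct cpu_sel {
        int score;    /* starts at 1000; 0 means never use this CPU */
        int matches;  /* set by the caller: does this CPU match "XXX"? */
    };

    static void apply_rule(struct cpu_sel *cpu, int nbcpu, enum cpu_rule rule)
    {
        for (int i = 0; i < nbcpu; i++) {
            switch (rule) {
            case RULE_IGNORE:
                if (cpu[i].matches)
                    cpu[i].score = 0;
                break;
            case RULE_RESTRICT:
                if (!cpu[i].matches)
                    cpu[i].score = 0;
                break;
            case RULE_AVOID:
                if (cpu[i].matches)
                    cpu[i].score = cpu[i].score * 4 / 5;   /* *= 0.8  */
                break;
            case RULE_PREFER:
                if (cpu[i].matches)
                    cpu[i].score = cpu[i].score * 5 / 4;   /* *= 1.25 */
                break;
            }
        }
    }
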
Then sort according to score, pick at least min_thr CPUs, and continue as long
as max_thr is not reached and the score stays >= 1000 (i.e. not "avoid"). This
gives the thread count. It does not permit anything inter-CPU though, e.g.
large vs medium vs small cores, or sorting by locality or frequency. But maybe
these ones would use a different strategy then and would use the score as a
second sorting key (after which one?). Or maybe there would be 2 passes, one
which avoids <1000 and another one which completes up to #min_thr including
those <1000, in which case we never sort per score.

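And a sketch of the thread-count estimation just described, assuming the CPUs
were already sorted by descending score (names invented for the example):

    struct scored_cpu { int score; };

    static int count_threads(const struct scored_cpu *cpu, int nbcpu,
                             int min_thr, int max_thr)
    {
        int thr = 0;

        for (int i = 0; i < nbcpu && thr < max_thr; i++) {
            if (!cpu[i].score)
                break;            /* ignored/restricted CPUs are never used  */
            if (thr >= min_thr && cpu[i].score < 1000)
                break;            /* "avoid" CPUs only help reaching min_thr */
            thr++;
        }
        return thr;
    }
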
We can do a bit better to respect the tgrp min/max as well: we can count what
it implies in terms of number of tgrps (#LLC or clusters) and decide to refrain
from adding threads which would exceed max_tgrp, but we'd possibly continue to
add score<1000 CPUs until there are at least enough threads to reach min_tgrp.

######## new captures ###########
CIX-P1 / radxa Orion O6 (no topology exported):
$ ~/haproxy/haproxy -dc -f /dev/null
grp=[1..12] thr=[1..12]
first node = 0
Note: threads already set to 12
going to start with nbthread=12 nbtgroups=1
[keep] thr= 0 -> cpu= 0 pk=00 no=-1 di=00 cl=000 ts=000 capa=1024
[keep] thr= 1 -> cpu= 1 pk=00 no=-1 di=00 cl=000 ts=001 capa=278
[keep] thr= 2 -> cpu= 2 pk=00 no=-1 di=00 cl=000 ts=002 capa=278
[keep] thr= 3 -> cpu= 3 pk=00 no=-1 di=00 cl=000 ts=003 capa=278
[keep] thr= 4 -> cpu= 4 pk=00 no=-1 di=00 cl=000 ts=004 capa=278
[keep] thr= 5 -> cpu= 5 pk=00 no=-1 di=00 cl=000 ts=005 capa=905
[keep] thr= 6 -> cpu= 6 pk=00 no=-1 di=00 cl=000 ts=006 capa=905
[keep] thr= 7 -> cpu= 7 pk=00 no=-1 di=00 cl=000 ts=007 capa=866
[keep] thr= 8 -> cpu= 8 pk=00 no=-1 di=00 cl=000 ts=008 capa=866
[keep] thr= 9 -> cpu= 9 pk=00 no=-1 di=00 cl=000 ts=009 capa=984
[keep] thr= 10 -> cpu= 10 pk=00 no=-1 di=00 cl=000 ts=010 capa=984
[keep] thr= 11 -> cpu= 11 pk=00 no=-1 di=00 cl=000 ts=011 capa=1024
########

2025-02-25 - clarification on the configuration
-----------------------------------------------

The "two dimensions" above can in fact be summarized like this:

  - exposing the ability for the user to perform the same as "taskset",
    i.e. restrict the usage to a static subset of the CPUs. We could then
    have "cpu-set only-node0", "0-39", "ignore-smt1", "ignore-little", etc.
    => the user defines precise sets to be kept/evicted.

  - then letting the user express what they want to do with the remaining
    cores. This is a strategy/policy that is used to:
      - count the optimal number of threads (when not forced), also keeping
        in mind that it cannot be more than 32/64 * maxtgroups if set.
      - sort CPUs by order of preference (for when threads are forced or
        a thread-hard-limit is set).

It can partially overlap with the first one. For example, the default
strategy could be to focus on a single node. If the user has limited their
usage to cores of both nodes, the policy could still further limit this.
But this time it should only be a matter of sorting and preference, i.e.
nbthread and cpuset are respected. If a policy prefers the node with more
cores first, it will sort them accordingly, and its algorithm for counting
cores will only be used if nbthread is not set; otherwise it may very well
end up on two nodes to respect the user's choice.

And once all of this is done, thread groups should be formed based on the
remaining topology. Similarly, if the number of tgroups is not set, the
algorithm must try to propose one based on the topology and the maxtgroups
setting (i.e. find a divisor of the #LLC that is lower than or equal to
maxtgroups), otherwise the configured number of tgroups is respected. Then
the number of LLCs will be divided by this number of tgroups, and as many
threads as enabled CPUs of each LLC will be assigned to these respective
groups.

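A sketch of that tgroup-counting rule (hypothetical helper, not HAProxy code):
take the largest divisor of the LLC count that does not exceed maxtgroups, so
that every group covers a whole number of LLCs. E.g. 8 LLCs with maxtgroups=3
would yield 2 groups of 4 LLCs each.

    static int count_tgroups(int nb_llc, int maxtgroups)
    {
        if (nb_llc <= 0 || maxtgroups <= 0)
            return 1;

        for (int grps = maxtgroups; grps > 1; grps--)
            if (nb_llc % grps == 0)
                return grps;    /* largest divisor of nb_llc <= maxtgroups */
        return 1;
    }
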
In the end we should have groups bound to cpu sets, and threads belonging
to groups mapped to all accessible cpus of these groups.

Note: clusters may be finer than LLCs because they could report finer
information. We could have a big and a medium cluster share the same L3
for example. However not all boards report their cluster number (see CIX-P1
above), but the info about the capacity still allows to figure that out and
should probably be used for that. At this point it would seem logical to say
that the cluster number is re-adjusted based on the claimed capacity, at
least to avoid accidentally mixing workloads on heterogeneous cores. But
sorting by cluster number might not necessarily work if numbers are allocated
randomly. So we might need a distinct metric that doesn't require overriding
the system's numbering, like a "set", "group", "team", "bond", "bunch",
"club", "band", ... that would be sorted first on LLC (and no finer), second
on capacity, then on L2 etc. This way we should be able to respect topology
when forming groups.

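A sketch of such an ordering with a made-up struct: sort first on the LLC,
then on the reported capacity, then on the L2, so that heterogeneous cores
sharing an L3 still end up grouped by kind.

    #include <stdlib.h>

    struct cpu_loc {
        int llc_id;   /* last-level cache id */
        int capa;     /* reported capacity (e.g. 1024 for the biggest cores) */
        int l2_id;    /* L2 cache id */
    };

    static int cmp_team(const void *a, const void *b)
    {
        const struct cpu_loc *l = a, *r = b;

        if (l->llc_id != r->llc_id)
            return l->llc_id - r->llc_id;
        if (l->capa != r->capa)
            return r->capa - l->capa;    /* biggest cores first within an LLC */
        return l->l2_id - r->l2_id;
    }

    /* usage: qsort(cpus, nbcpu, sizeof(*cpus), cmp_team); */
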
Note: We need to consider as LLC a level which has more than one core!
Otherwise it's supposed to exist and be unique/shared but not reported.
=> maybe this should be done very early when counting CPUs?
   We need to store the LLC level somewhere in the topo.