diff --git a/doc/configuration.txt b/doc/configuration.txt index f0801aaba..60fe2a46a 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -2494,12 +2494,13 @@ tune.recv_enough tune.runqueue-depth Sets the maximum amount of task that can be processed at once when running - tasks. The default value is 40 which tends to show the highest request rates - and lowest latencies. Increasing it may incur latency when dealing with I/Os, - making it too small can incur extra overhead. When experimenting with much - larger values, it may be useful to also enable tune.sched.low-latency and - possibly tune.fd.edge-triggered to limit the maximum latency to the lowest - possible. + tasks. The default value depends on the number of threads but sits between 35 + and 280, which tend to show the highest request rates and lowest latencies. + Increasing it may incur latency when dealing with I/Os, making it too small + can incur extra overhead. Higher thread counts benefit from lower values. + When experimenting with much larger values, it may be useful to also enable + tune.sched.low-latency and possibly tune.fd.edge-triggered to limit the + maximum latency to the lowest possible. tune.sched.low-latency { on | off } Enables ('on') or disables ('off') the low-latency task scheduler. By default diff --git a/include/haproxy/defaults.h b/include/haproxy/defaults.h index 13b5ad3fc..3dc98e5ce 100644 --- a/include/haproxy/defaults.h +++ b/include/haproxy/defaults.h @@ -186,19 +186,12 @@ #define MAX_ACCEPT 4 #endif -// the max number of tasks to run at once. 
Tests have shown the following -// number of requests/s for 1 to 16 threads (1c1t, 1c2t, 2c4t, 4c8t, 4c16t): -// -// rq\thr| 1 2 4 8 16 -// ------+------------------------------ -// 32| 120k 159k 276k 477k 698k -// 40| 122k 160k 276k 478k 722k -// 48| 121k 159k 274k 482k 720k -// 64| 121k 160k 274k 469k 710k -// 200| 114k 150k 247k 415k 613k -// +// The base max number of tasks to run at once to be used when not set by +// tune.runqueue-depth. It will automatically be divided by the square root +// of the number of threads for better fairness. As such, 64 threads will +// use 35 and a single thread will use 280. #ifndef RUNQUEUE_DEPTH -#define RUNQUEUE_DEPTH 40 +#define RUNQUEUE_DEPTH 280 #endif // cookie delimiter in "prefix" mode. This character is inserted between the diff --git a/src/haproxy.c b/src/haproxy.c index 49f6957c3..7b30a78a2 100644 --- a/src/haproxy.c +++ b/src/haproxy.c @@ -2274,8 +2274,14 @@ static void init(int argc, char **argv) if (global.tune.maxpollevents <= 0) global.tune.maxpollevents = MAX_POLL_EVENTS; - if (global.tune.runqueue_depth <= 0) - global.tune.runqueue_depth = RUNQUEUE_DEPTH; + if (global.tune.runqueue_depth <= 0) { + /* tests on various thread counts from 1 to 64 have shown an + * optimal queue depth following roughly 1/sqrt(threads). + */ + int s = my_flsl(global.nbthread); + s += (global.nbthread / s); // s ~= 2*sqrt(nbthread) (one Newton step from my_flsl) + global.tune.runqueue_depth = RUNQUEUE_DEPTH * 2 / s; + } if (global.tune.recv_enough == 0) global.tune.recv_enough = MIN_RECV_AT_ONCE_ENOUGH;