diff --git a/doc/configuration.txt b/doc/configuration.txt
index f0801aaba..60fe2a46a 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -2494,12 +2494,13 @@ tune.recv_enough <number>
 
 tune.runqueue-depth <number>
   Sets the maximum amount of task that can be processed at once when running
-  tasks. The default value is 40 which tends to show the highest request rates
-  and lowest latencies. Increasing it may incur latency when dealing with I/Os,
-  making it too small can incur extra overhead. When experimenting with much
-  larger values, it may be useful to also enable tune.sched.low-latency and
-  possibly tune.fd.edge-triggered to limit the maximum latency to the lowest
-  possible.
+  tasks. The default value depends on the number of threads and ranges from
+  280 for a single thread down to 35 for 64 threads, which tends to show the
+  highest request rates and lowest latencies. Increasing it may incur latency
+  when dealing with I/Os; making it too small can incur extra overhead. Higher
+  thread counts benefit from lower values. When experimenting with much larger
+  values, it may be useful to also enable tune.sched.low-latency and possibly
+  tune.fd.edge-triggered to limit the maximum latency to the lowest possible.
 
 tune.sched.low-latency { on | off }
   Enables ('on') or disables ('off') the low-latency task scheduler. By default
diff --git a/include/haproxy/defaults.h b/include/haproxy/defaults.h
index 13b5ad3fc..3dc98e5ce 100644
--- a/include/haproxy/defaults.h
+++ b/include/haproxy/defaults.h
@@ -186,19 +186,12 @@
 #define MAX_ACCEPT 4
 #endif
 
-// the max number of tasks to run at once. Tests have shown the following
-// number of requests/s for 1 to 16 threads (1c1t, 1c2t, 2c4t, 4c8t, 4c16t):
-//
-// rq\thr|    1     2     4     8    16
-// ------+------------------------------
-//     32|  120k  159k  276k  477k  698k
-//     40|  122k  160k  276k  478k  722k
-//     48|  121k  159k  274k  482k  720k
-//     64|  121k  160k  274k  469k  710k
-//    200|  114k  150k  247k  415k  613k
-//
+// The base max number of tasks to run at once, used when not set by
+// tune.runqueue-depth. It is automatically divided by roughly the square
+// root of the number of threads for better fairness. As such, 64 threads
+// will use 35 and a single thread will use 280.
 #ifndef RUNQUEUE_DEPTH
-#define RUNQUEUE_DEPTH 40
+#define RUNQUEUE_DEPTH 280
 #endif
 
 // cookie delimiter in "prefix" mode. This character is inserted between the
diff --git a/src/haproxy.c b/src/haproxy.c
index 49f6957c3..7b30a78a2 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -2274,8 +2274,14 @@ static void init(int argc, char **argv)
 	if (global.tune.maxpollevents <= 0)
 		global.tune.maxpollevents = MAX_POLL_EVENTS;
 
-	if (global.tune.runqueue_depth <= 0)
-		global.tune.runqueue_depth = RUNQUEUE_DEPTH;
+	if (global.tune.runqueue_depth <= 0) {
+		/* tests on various thread counts from 1 to 64 have shown an
+		 * optimal queue depth following roughly 1/sqrt(threads).
+		 */
+		int s = my_flsl(global.nbthread);
+		s += (global.nbthread / s); // roughly twice the sqrt.
+		global.tune.runqueue_depth = RUNQUEUE_DEPTH * 2 / s;
+	}
 
 	if (global.tune.recv_enough == 0)
 		global.tune.recv_enough = MIN_RECV_AT_ONCE_ENOUGH;