diff --git a/doc/configuration.txt b/doc/configuration.txt
index a2ba7cf93..780d27fd9 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -697,6 +697,7 @@ The following keywords are supported in the "global" section :
   - tune.rcvbuf.server
   - tune.recv_enough
   - tune.runqueue-depth
+  - tune.sched.low-latency
   - tune.sndbuf.client
   - tune.sndbuf.server
   - tune.ssl.cachesize
@@ -2095,7 +2096,20 @@ tune.recv_enough
 tune.runqueue-depth
   Sets the maximum amount of task that can be processed at once when running
   tasks. The default value is 200. Increasing it may incur latency when
-  dealing with I/Os, making it too small can incur extra overhead.
+  dealing with I/Os, while making it too small can incur extra overhead. When
+  experimenting with much larger values, it may be useful to also enable
+  tune.sched.low-latency to keep the maximum latency as low as possible.
+
+tune.sched.low-latency { on | off }
+  Enables ('on') or disables ('off') the low-latency task scheduler. By default
+  haproxy processes tasks from several classes one class at a time, as this is
+  the most efficient, but when running with large values of tune.runqueue-depth
+  this can have a measurable effect on request or connection latency. When this
+  low-latency setting is enabled, tasks of lower-priority classes will always
+  be executed before other ones if any exist. This makes it possible to lower
+  the maximum latency experienced by new requests or connections in the middle
+  of massive traffic, at the expense of a higher impact on that bulk traffic.
+  For regular usage it is better to leave this off. The default value is off.
 
 tune.sndbuf.client
 tune.sndbuf.server
@@ -15838,11 +15852,12 @@ lat_ns_avg : integer
   the value low, it is possible to reduce the scheduler's run queue depth using
   "tune.runqueue-depth", to reduce the number of concurrent events processed at
   once using "tune.maxpollevents", to decrease the stream's nice value using
-  the "nice" option on the "bind" lines or in the frontend, or to look for
-  other heavy requests in logs (those exhibiting large values of "cpu_ns_avg"),
-  whose processing needs to be adjusted or fixed. Compression of large buffers
-  could be a culprit, like heavy regex or long lists of regex.
-  Note: this value is exactly lat_ns_tot divided by cpu_calls.
+  the "nice" option on the "bind" lines or in the frontend, to enable low
+  latency scheduling using "tune.sched.low-latency", or to look for other heavy
+  requests in logs (those exhibiting large values of "cpu_ns_avg"), whose
+  processing needs to be adjusted or fixed. Compression of large buffers could
+  be a culprit, like heavy regex or long lists of regex. Note: this value is
+  exactly lat_ns_tot divided by cpu_calls.
 
 lat_ns_tot : integer
   Returns the total number of nanoseconds spent between the moment the task
@@ -15854,10 +15869,11 @@ lat_ns_tot : integer
   the value low, it is possible to reduce the scheduler's run queue depth using
   "tune.runqueue-depth", to reduce the number of concurrent events processed at
   once using "tune.maxpollevents", to decrease the stream's nice value using
-  the "nice" option on the "bind" lines or in the frontend, or to look for
-  other heavy requests in logs (those exhibiting large values of "cpu_ns_avg"),
-  whose processing needs to be adjusted or fixed. Compression of large buffers
-  could be a culprit, like heavy regex or long lists of regex. Note: while it
+  the "nice" option on the "bind" lines or in the frontend, to enable low
+  latency scheduling using "tune.sched.low-latency", or to look for other heavy
+  requests in logs (those exhibiting large values of "cpu_ns_avg"), whose
+  processing needs to be adjusted or fixed. Compression of large buffers could
+  be a culprit, like heavy regex or long lists of regex. Note: while it
   may intuitively seem that the total latency adds to a transfer time, it is
   almost never true because while a task waits for the CPU, network buffers
   continue to fill up and the next call will process more at once. The value
diff --git a/include/haproxy/global-t.h b/include/haproxy/global-t.h
index c7591b467..0da246dde 100644
--- a/include/haproxy/global-t.h
+++ b/include/haproxy/global-t.h
@@ -67,6 +67,7 @@
 #define GTUNE_INSECURE_FORK      (1<<16)
 #define GTUNE_INSECURE_SETUID    (1<<17)
 #define GTUNE_FD_ET              (1<<18)
+#define GTUNE_SCHED_LOW_LATENCY  (1<<19)
 
 /* SSL server verify mode */
 enum {
diff --git a/src/task.c b/src/task.c
index 22954adc2..6079956ba 100644
--- a/src/task.c
+++ b/src/task.c
@@ -16,6 +16,7 @@
 #include
 #include
 
+#include <haproxy/cfgparse.h>
 #include
 #include
 #include
@@ -328,6 +329,7 @@ unsigned int run_tasks_from_lists(unsigned int budgets[])
 	struct task *(*process)(struct task *t, void *ctx, unsigned short state);
 	struct list *tl_queues = sched->tasklets;
 	struct task *t;
+	uint8_t budget_mask = (1 << TL_CLASSES) - 1;
 	unsigned int done = 0;
 	unsigned int queue;
 	unsigned short state;
@@ -336,6 +338,33 @@ unsigned int run_tasks_from_lists(unsigned int budgets[])
 	for (queue = 0; queue < TL_CLASSES;) {
 		sched->current_queue = queue;
 
+		/* global.tune.sched.low-latency is set */
+		if (global.tune.options & GTUNE_SCHED_LOW_LATENCY) {
+			if (unlikely(sched->tl_class_mask & budget_mask & ((1 << queue) - 1))) {
+				/* a lower queue index has tasks again and still has a
+				 * budget to run them. Let's switch to it now.
+				 */
+				queue = (sched->tl_class_mask & 1) ? 0 :
+					(sched->tl_class_mask & 2) ? 1 : 2;
+				continue;
+			}
+
+			if (unlikely(queue > TL_URGENT &&
+				     budget_mask & (1 << TL_URGENT) &&
+				     !MT_LIST_ISEMPTY(&sched->shared_tasklet_list))) {
+				/* an urgent tasklet arrived from another thread */
+				break;
+			}
+
+			if (unlikely(queue > TL_NORMAL &&
+				     budget_mask & (1 << TL_NORMAL) &&
+				     ((sched->rqueue_size > 0) ||
+				      (global_tasks_mask & tid_bit)))) {
+				/* a task was woken up by a bulk tasklet or another thread */
+				break;
+			}
+		}
+
 		if (LIST_ISEMPTY(&tl_queues[queue])) {
 			sched->tl_class_mask &= ~(1 << queue);
 			queue++;
@@ -343,6 +372,7 @@
 		}
 
 		if (!budgets[queue]) {
+			budget_mask &= ~(1 << queue);
 			queue++;
 			continue;
 		}
@@ -687,6 +717,32 @@ static void init_task()
 	}
 }
 
+/* config parser for global "tune.sched.low-latency", accepts "on" or "off" */
+static int cfg_parse_tune_sched_low_latency(char **args, int section_type, struct proxy *curpx,
+                                            struct proxy *defpx, const char *file, int line,
+                                            char **err)
+{
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	if (strcmp(args[1], "on") == 0)
+		global.tune.options |= GTUNE_SCHED_LOW_LATENCY;
+	else if (strcmp(args[1], "off") == 0)
+		global.tune.options &= ~GTUNE_SCHED_LOW_LATENCY;
+	else {
+		memprintf(err, "'%s' expects either 'on' or 'off' but got '%s'.", args[0], args[1]);
+		return -1;
+	}
+	return 0;
+}
+
+/* config keyword parsers */
+static struct cfg_kw_list cfg_kws = {ILH, {
+	{ CFG_GLOBAL, "tune.sched.low-latency", cfg_parse_tune_sched_low_latency },
+	{ 0, NULL, NULL }
+}};
+
+INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
 INITCALL0(STG_PREPARE, init_task);
 
 /*
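
A quick usage sketch of the new keyword, following the updated tune.runqueue-depth
text above which suggests pairing a much larger run queue with the low-latency
scheduler. The figures are illustrative only, not a tuning recommendation:

  Example :
        global
            # experiment with a run queue well above the default of 200, and
            # enable the low-latency scheduler so that urgent work queued in
            # the middle of a large batch is not made to wait for the batch
            tune.runqueue-depth 1000
            tune.sched.low-latency on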
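
The core of the run_tasks_from_lists() change is the test deciding when to abandon
the class currently being drained and jump back to a lower-index (higher-priority)
one. Below is a standalone sketch of just that decision, outside HAProxy: the enum
values mirror the patch, but the helper name pick_lower_class() is invented for
illustration and the budget accounting done by the real loop is not modelled.

#include <stdio.h>

/* same 3-class layout as the patched scheduler */
enum { TL_URGENT = 0, TL_NORMAL = 1, TL_BULK = 2, TL_CLASSES = 3 };

/* returns the class to switch to, or -1 to keep draining 'queue' */
static int pick_lower_class(unsigned int tl_class_mask, unsigned int budget_mask, int queue)
{
	/* guard from the patch: is any class below 'queue' both non-empty
	 * (tl_class_mask) and still holding part of its budget (budget_mask)?
	 */
	if (!(tl_class_mask & budget_mask & ((1U << queue) - 1)))
		return -1;

	/* same selection as the patch: lowest-index non-empty class wins */
	return (tl_class_mask & 1) ? TL_URGENT :
	       (tl_class_mask & 2) ? TL_NORMAL : TL_BULK;
}

int main(void)
{
	/* while BULK is being drained, an URGENT tasklet shows up and the
	 * URGENT class still has budget: switch back to it (prints 0)
	 */
	printf("%d\n", pick_lower_class(1U << TL_URGENT, (1U << TL_CLASSES) - 1, TL_BULK));

	/* URGENT has work again but its budget is exhausted: stay on BULK
	 * and finish the batch (prints -1)
	 */
	printf("%d\n", pick_lower_class(1U << TL_URGENT, 1U << TL_BULK, TL_BULK));
	return 0;
}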
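
To see whether enabling the option actually helps on a given workload, the
lat_ns_avg and cpu_ns_avg fetches from the documentation hunks above can be
logged per request and compared before and after. A sketch only: the frontend
name, port and backend are placeholders, a log target is assumed to be
configured elsewhere, and these fetches report meaningful values only while
task profiling is active:

        global
            # lat_ns_avg / cpu_ns_avg are only meaningful with profiling on
            profiling.tasks on

        frontend fe_main
            bind :8080
            # append average scheduling latency and CPU time, in nanoseconds,
            # so delayed or heavy streams stand out in the logs
            log-format "%ci:%cp [%tr] %ft %b/%s %ST %B lat=%[lat_ns_avg] cpu=%[cpu_ns_avg]"
            default_backend be_app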