mirror of https://git.haproxy.org/git/haproxy.git/ (synced 2025-08-07 07:37:02 +02:00)
[MEDIUM] rework the wait queue mechanism
The wait queue now relies on 4 trees covering past, present and future timers. This makes the computations cleaner and more reliable, and simplifies wake_expired_tasks(). It also fixes a bug introduced in task_queue() when eb_trees were first brought in: the key was computed into a local variable but eb->key was never updated, so tasks were inserted under a stale key.
This commit is contained in:
parent 284c7b3195
commit 28c41a4041

src/task.c | 186 lines changed
@@ -25,59 +25,99 @@ struct pool_head *pool2_task;
 
 void *run_queue = NULL;
 
-/* Principle of the wait queue : we have two trees ordered by time. One of them
- * contains all timers for current time-frame, and the other one for next
- * time-frame. Each time-frame is TIMER_KEY_BITS bits wide in number of
- * milliseconds, which is 49 days for 32 bits. Values are stored into and
- * retrieved from the tree using a key of TIMER_KEY_BITS bits. A pointer
- * always designates the current tree, which is the one we read from, until
- * it is exhausted and <now> has its high bit designate the new tree.
- * An improvement would consist in holding too large timers in a side tree
- * consulted only once a switch. It could also be a simple list BTW.
+/* Principle of the wait queue.
+ *
+ * We want to be able to tell whether an expiration date is before or after the
+ * current time <now>. We KNOW that expiration dates are never too far apart,
+ * because they are already computed by adding integer numbers of milliseconds
+ * to the current date.
+ * We also know that almost all dates will be in the future, and that a very
+ * small part of them will be in the past: they are the ones which have expired
+ * since last time we checked them.
+ *
+ * The current implementation uses a wrapping time cut into 3 ranges :
+ *   - previous : those ones are expired by definition
+ *   - current  : some are expired, some are not
+ *   - next     : none are expired
+ *
+ * We use the higher two bits of the timers expressed in ticks (milliseconds)
+ * to determine which range a timer is in, compared to <now> :
+ *
+ *     now    previous  current   next0    next1
+ *   [31:30]  [31:30]   [31:30]  [31:30]  [31:30]
+ *     00        11        00       01       10
+ *     01        00        01       10       11
+ *     10        01        10       11       00
+ *     11        10        11       00       01
+ *
+ * By definition, <current> is the range containing <now> as well as all timers
+ * which have the same 2 high bits as <now>, <previous> is the range just
+ * before, which contains all timers whose high bits equal those of <now> minus
+ * 1. Last, <next> is composed of the two remaining ranges.
+ *
+ * For ease of implementation, the timers will then be stored into 4 queues 0-3
+ * determined by the 2 higher bits of the timer. The expiration algorithm is
+ * very simple :
+ *   - expire everything in <previous>=queue[((now>>30)-1)&3]
+ *   - expire from <current>=queue[(now>>30)&3] everything where timer >= now
+ *
+ * With this algorithm, it's possible to queue tasks meant to expire 24.8 days
+ * in the future, and still be able to detect events remaining unprocessed for
+ * the last 12.4 days! Note that the principle might be extended to any number
+ * of higher bits as long as there is only one range for expired tasks. For
+ * instance, using the 8 higher bits to index the range, we would have one past
+ * range of 4.6 hours (24 bits in ms), and 254 ranges in the future totalizing
+ * 49.3 days. This would eat more memory for very little added benefit.
+ *
+ * Also, in order to maintain the ability to perform time comparisons, it is
+ * recommended to avoid using the <next1> range above, as values in this range
+ * may not easily be compared to <now> outside of these functions as it is the
+ * opposite of the <current> range, and <timer>-<now> may randomly be positive
+ * or negative. That means we're left with +/- 12 days timers.
+ *
+ * To keep timers ordered, we use 4 ebtrees [0..3]. To keep computation low, we
+ * may use (seconds*1024)+milliseconds, which preserves ordering even though we
+ * can't do real computations on it. Future evolutions could make use of 1024ths
+ * of seconds instead of milliseconds, with the special value 0 avoided (and
+ * replaced with 1), so that zero indicates the timer is not set.
  */
-#define TIMER_KEY_BITS 32
-#define TIMER_SUBSEC_BITS 10
-#define TIMER_SECOND_BITS (TIMER_KEY_BITS - TIMER_SUBSEC_BITS)
 
-static struct {
-	struct eb_root *curr;   /* current time frame (t[0],t[1]) */
-	struct eb_root t[2];    /* trees with MSB 0 and 1 */
-	struct timeval first;   /* first value in the tree when known */
-} timers;
+#define TIMER_TICK_BITS   32
+#define TIMER_TREE_BITS    2
+#define TIMER_TREES       (1 << TIMER_TREE_BITS)
+#define TIMER_TREE_SHIFT  (TIMER_TICK_BITS - TIMER_TREE_BITS)
+#define TIMER_TREE_MASK   (TIMER_TREES - 1)
+#define TIMER_TICK_MASK   ((1U << (TIMER_TICK_BITS-1)) * 2 - 1)
+#define TIMER_SIGN_BIT    (1 << (TIMER_TICK_BITS - 1))
+
+static struct eb_root timers[TIMER_TREES]; /* trees with MSB 00, 01, 10 and 11 */
 
 /* returns an ordered key based on an expiration date. */
-static inline unsigned int timeval_to_key(const struct timeval *t)
+static inline unsigned int timeval_to_ticks(const struct timeval *t)
 {
 	unsigned int key;
 
-	/* We choose sec << 10 + usec / 1000 below to keep the precision at the
-	 * millisecond, but we might as well divide by 1024 and have a slightly
-	 * lower precision of 1.024 ms.
-	 */
-	key = ((unsigned int)t->tv_sec << TIMER_SUBSEC_BITS) +
-	      ((unsigned int)t->tv_usec / 1000);
-#if TIMER_KEY_BITS != 32
-	key &= (1 << TIMER_KEY_BITS) - 1;
-#endif
+	key = ((unsigned int)t->tv_sec * 1000) + ((unsigned int)t->tv_usec / 1000);
+	key &= TIMER_TICK_MASK;
 	return key;
 }
 
+/* returns a tree number based on a ticks value */
+static inline unsigned int ticks_to_tree(unsigned int ticks)
+{
+	return (ticks >> TIMER_TREE_SHIFT) & TIMER_TREE_MASK;
+}
+
 /* returns a tree number based on an expiration date. */
 static inline unsigned int timeval_to_tree(const struct timeval *t)
 {
-	return (t->tv_sec >> TIMER_SECOND_BITS) & 1;
+	return ticks_to_tree(timeval_to_ticks(t));
 }
 
 /* perform minimal initializations, report 0 in case of error, 1 if OK. */
 int init_task()
 {
 	memset(&timers, 0, sizeof(timers));
-
-	/* note: we never queue in the past, so we start with <now> */
-	timers.curr = &timers.t[timeval_to_tree(&now)];
-
 	pool2_task = create_pool("task", sizeof(struct task), MEM_F_SHARED);
 	return pool2_task != NULL;
 }
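
To illustrate the range scheme described in the new comment above, here is a small standalone sketch (not part of the commit; now_ms value and sample timers are invented for the example) that classifies a few tick values using the same two-high-bits tree index and sign-bit expiry test:

/* Standalone sketch (illustrative only): classify timers against now_ms
 * using the two high bits for the tree index and the sign bit for expiry.
 */
#include <stdio.h>

#define TIMER_TREE_BITS   2
#define TIMER_TREES       (1 << TIMER_TREE_BITS)
#define TIMER_TREE_SHIFT  (32 - TIMER_TREE_BITS)
#define TIMER_TREE_MASK   (TIMER_TREES - 1)
#define TIMER_SIGN_BIT    (1U << 31)

static unsigned int ticks_to_tree(unsigned int ticks)
{
	return (ticks >> TIMER_TREE_SHIFT) & TIMER_TREE_MASK;
}

int main(void)
{
	unsigned int now_ms = 0x40000000; /* high bits 01 -> <current> is tree 1 */
	unsigned int samples[4] = {
		0x3fffffff, /* tree 0 = <previous> : expired by definition      */
		0x40000001, /* tree 1 = <current>  : in the future here, pending */
		0x80000000, /* tree 2 = <next0>    : never expired               */
		0xc0000000, /* tree 3 = <next1>    : should be avoided           */
	};
	unsigned int i;

	for (i = 0; i < 4; i++) {
		unsigned int t = samples[i];
		/* same test as the new wake_expired_tasks(): sign bit of
		 * (now_ms - t) set means <t> is still in the future.
		 */
		printf("timer=0x%08x tree=%u expired=%s\n", t, ticks_to_tree(t),
		       ((now_ms - t) & TIMER_SIGN_BIT) ? "no" : "yes");
	}
	return 0;
}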
@@ -94,23 +134,21 @@ struct task *_task_wakeup(struct task *t)
  * date. Note that the task must *not* already be in the wait queue nor in the
  * run queue, otherwise unpredictable results may happen. Tasks queued with an
  * eternity expiration date are simply returned. Last, tasks must not be queued
- * further than the end of the next tree, which is between now and
- * now+1<<TIMER_KEY_BITS ms (now+49days in 32bit).
+ * further than the end of the next tree, which is between <now_ms> and
+ * <now_ms> + TIMER_SIGN_BIT ms (now+12days..24days in 32bit).
  */
 struct task *task_queue(struct task *task)
 {
-	struct eb_root *tmp;
-	unsigned int key;
-
 	if (unlikely(tv_iseternity(&task->expire)))
 		return task;
 
-	if (tv_islt(&task->expire, &timers.first))
-		timers.first = task->expire;
-
-	key = timeval_to_key(&task->expire);
-	tmp = &timers.t[timeval_to_tree(&task->expire)];
-	eb32_insert(tmp, &task->eb);
+	task->eb.key = timeval_to_ticks(&task->expire);
+
+#ifdef DEBUG_CHECK_INVALID_EXPIRATION_DATES
+	if ((task->eb.key - now_ms) & TIMER_SIGN_BIT)
+		/* we're queuing too far away or in the past (most likely) */
+		return task;
+#endif
+
+	eb32_insert(&timers[ticks_to_tree(task->eb.key)], &task->eb);
 	return task;
 }
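
The bug mentioned in the commit message is visible in this hunk: the old code computed key into a local variable but never stored it into task->eb.key before eb32_insert(), so tasks were inserted under a stale key. The new debug check also bounds valid dates to [now_ms, now_ms + 2^31) modulo 2^32. A minimal sketch of that window test, assuming 32-bit unsigned ticks (key_is_queueable is a hypothetical helper written for this example, not part of the commit):

/* Sketch: a key is acceptable only if it lies in [now_ms, now_ms + 2^31)
 * modulo 2^32, which is exactly what the sign-bit test expresses.
 */
#include <assert.h>

#define TIMER_SIGN_BIT (1U << 31)

static int key_is_queueable(unsigned int key, unsigned int now_ms)
{
	/* sign bit of (key - now_ms) set => date in the past, or more
	 * than ~24.8 days ahead; both are rejected.
	 */
	return !((key - now_ms) & TIMER_SIGN_BIT);
}

int main(void)
{
	unsigned int now_ms = 0xfff00000; /* close to the 32-bit wrap */

	assert(key_is_queueable(now_ms + 0x00200000, now_ms));      /* ~35 min ahead, wraps past 0: OK */
	assert(!key_is_queueable(now_ms - 1, now_ms));              /* 1 ms in the past: rejected */
	assert(!key_is_queueable(now_ms + TIMER_SIGN_BIT, now_ms)); /* 2^31 ms ahead: rejected */
	return 0;
}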
@@ -124,33 +162,31 @@ void wake_expired_tasks(struct timeval *next)
 {
 	struct task *task;
 	struct eb32_node *eb;
-	unsigned int now_key;
 	unsigned int now_tree;
+	unsigned int tree;
 
-	now_tree = timeval_to_tree(&now);
-
-	/* This is a speedup: we immediately check for an expirable task in the
-	 * timer's index. Warning: if nothing is found, we still may have to
-	 * switch the trees.
+	/* In theory, we should :
+	 *   - wake all tasks from the <previous> tree
+	 *   - wake all expired tasks from the <current> tree
+	 *   - scan <next> trees for next expiration date if not found earlier.
+	 * But we can do all this more easily : we scan all 3 trees before we
+	 * wrap, and wake everything expired from there, then stop on the first
+	 * non-expired entry.
 	 */
-	if (likely(tv_isgt(&timers.first, &now))) {
-		*next = timers.first;
-		if (timers.curr != &timers.t[now_tree])
-			timers.curr = &timers.t[now_tree];
-		return;
-	}
 
-	now_key = timeval_to_key(&now);
+	now_tree = ticks_to_tree(now_ms);
+	tree = (now_tree - 1) & TIMER_TREE_MASK;
 	do {
-		eb = eb32_first(timers.curr);
+		eb = eb32_first(&timers[tree]);
 		while (eb) {
 			struct eb32_node *next_eb;
 
 			task = eb32_entry(eb, struct task, eb);
-			if ((signed)(eb->key - now_key) > 0) {
+			if ((now_ms - eb->key) & TIMER_SIGN_BIT) {
+				/* note that we don't need this check for the <previous>
+				 * tree, but it's cheaper than duplicating the code.
+				 */
 				*next = task->expire;
-				timers.first = task->expire;
 				return;
 			}
 
@@ -163,28 +199,12 @@ void wake_expired_tasks(struct timeval *next)
 			DLIST_ADD(run_queue, &task->qlist);
 			task->state = TASK_RUNNING;
 		}
-
-		/* OK we have reached the end of the <curr> tree. It might mean
-		 * that we must now switch, which is indicated by the fact that
-		 * the current tree pointer does not match <now> anymore.
-		 */
-		if (timers.curr == &timers.t[now_tree]) {
-			/* We cannot switch now, so we have to find the first
-			 * timer of the next tree.
-			 */
-			eb = eb32_first(&timers.t[now_tree ^ 1]);
-			if (eb) {
-				task = eb32_entry(eb, struct task, eb);
-				*next = task->expire;
-				timers.first = task->expire;
-			} else {
-				tv_eternity(next);
-				tv_eternity(&timers.first);
-			}
-			return;
-		}
-		timers.curr = &timers.t[now_tree];
-	} while (1);
+		tree = (tree + 1) & TIMER_TREE_MASK;
+	} while (((tree - now_tree) & TIMER_TREE_MASK) < TIMER_TREES/2);
+
+	/* We have found no task to expire in any tree */
+	tv_eternity(next);
+	return;
 }
 
 /*
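
The rewritten loop starts one tree before <now>'s and visits at most three trees (previous, current, next0), never touching <next1>. A standalone sketch (hypothetical, for illustration only) of that scan-order arithmetic, printing the visit order for every possible now_tree:

/* Sketch of the tree scan order in the new wake_expired_tasks(). */
#include <stdio.h>

#define TIMER_TREE_BITS  2
#define TIMER_TREES      (1 << TIMER_TREE_BITS)
#define TIMER_TREE_MASK  (TIMER_TREES - 1)

int main(void)
{
	unsigned int now_tree;

	for (now_tree = 0; now_tree < TIMER_TREES; now_tree++) {
		unsigned int tree = (now_tree - 1) & TIMER_TREE_MASK;

		printf("now_tree=%u scans:", now_tree);
		do {
			printf(" %u", tree);
			tree = (tree + 1) & TIMER_TREE_MASK;
		} while (((tree - now_tree) & TIMER_TREE_MASK) < TIMER_TREES / 2);
		printf("\n"); /* always <previous>, <current>, <next0> */
	}
	return 0;
}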
tests/test-sched.cfg | 5038 (new file; diff suppressed because it is too large)