IMPORT: eb32/eb64: reorder the lookup loop for modern CPUs

The current code derives the next troot from an arithmetic calculation.
This was efficient when the algorithm was developed many years ago
on K6 and K7 CPUs running at low frequencies with few registers and
limited branch prediction units but nowadays with ultra-deep pipelines
and high latency memory that's no longer efficient, because the CPU
needs to have completed multiple operations before knowing which
address to start fetching from. It's sad because we only have two
branches each time but the CPU cannot know it. In addition, the
calculation is performed late in the loop, which does not help the
address generation unit to start prefetching next data.

Instead we should help the CPU by preloading data early from the node
and calculating troot as soon as possible. The CPU will be able to
postpone that processing until the dependencies are available and it
really needs to dereference it. In addition we must absolutely avoid
serializing instructions such as "(a >> b) & 1" because there's no
way for the compiler to parallelize that code nor for the CPU to pre-
process some early data.

What this patch does is relatively simple:

  - we try to prefetch the next two branches as soon as the
    node is known, which will help dereference the selected node in
    the next iteration; it was shown that it only works with the next
    changes though, otherwise it can reduce the performance instead.
    In practice the prefetching will start a bit later once the node
    is really in the cache, but since there's no dependency between
    these instructions and any other one, we let the CPU optimize as
    it wants.

  - we preload all important data from the node (next two branches,
    key and node.bit) very early even if not immediately needed.
    This is cheap, it doesn't cause any pipeline stall and speeds
    up later operations.

  - we pre-calculate 1<<bit that we assign into a register, so as
    to avoid serializing instructions when deciding which branch to
    take.

  - we assign the troot based on a ternary operation (or if/else) so
    that the CPU knows upfront the two possible next addresses without
    waiting for the end of a calculation and can prefetch their contents
    every time the branch prediction unit guesses right.

Just doing this provides significant gains at various tree sizes on
random keys (in million lookups per second):

  eb32   1k:  29.07 -> 33.17  +14.1%
        10k:  14.27 -> 15.74  +10.3%
       100k:   6.64 ->  8.00  +20.5%
  eb64   1k:  27.51 -> 34.40  +25.0%
        10k:  13.54 -> 16.17  +19.4%
       100k:   7.53 ->  8.38  +11.3%

The performance is now much closer to the sequential keys. This was
done for all variants ({32,64}{,i,le,ge}).

Another point: the equality test in the loop improves the performance
when looking up random keys (since we don't need to reach the leaf),
but is counter-productive for sequential keys, which can gain ~17%
without that test. However sequential keys are normally not used with
exact lookups, but rather with lookup_ge() that spans a time frame,
and which does not have that test for this precise reason, so in the
end both use cases are served optimally.

It's interesting to note that everything here is solely based on data
dependencies, and that trying to perform *less* operations upfront
always ends up with lower performance (typically the original one).

This is ebtree commit 05a0613e97f51b6665ad5ae2801199ad55991534.
This commit is contained in:
Willy Tarreau 2025-06-07 14:36:16 +02:00
parent dcd4d36723
commit 6af17d491f
4 changed files with 66 additions and 22 deletions

View File

@ -119,7 +119,7 @@ static forceinline struct eb32_node *__eb32_lookup(struct eb_root *root, u32 x)
{
struct eb32_node *node;
eb_troot_t *troot;
u32 y;
u32 y, z;
int node_bit;
troot = root->b[EB_LEFT];
@ -137,9 +137,15 @@ static forceinline struct eb32_node *__eb32_lookup(struct eb_root *root, u32 x)
}
node = container_of(eb_untag(troot, EB_NODE),
struct eb32_node, node.branches);
node_bit = node->node.bit;
__builtin_prefetch(node->node.branches.b[0], 0);
__builtin_prefetch(node->node.branches.b[1], 0);
node_bit = node->node.bit;
y = node->key ^ x;
z = 1U << (node_bit & 31);
troot = (x & z) ? node->node.branches.b[1] : node->node.branches.b[0];
if (!y) {
/* Either we found the node which holds the key, or
* we have a dup tree. In the later case, we have to
@ -157,8 +163,6 @@ static forceinline struct eb32_node *__eb32_lookup(struct eb_root *root, u32 x)
if ((y >> node_bit) >= EB_NODE_BRANCHES)
return NULL; /* no more common bits */
troot = node->node.branches.b[(x >> node_bit) & EB_NODE_BRANCH_MASK];
}
}
@ -171,7 +175,7 @@ static forceinline struct eb32_node *__eb32i_lookup(struct eb_root *root, s32 x)
struct eb32_node *node;
eb_troot_t *troot;
u32 key = x ^ 0x80000000;
u32 y;
u32 y, z;
int node_bit;
troot = root->b[EB_LEFT];
@ -189,9 +193,15 @@ static forceinline struct eb32_node *__eb32i_lookup(struct eb_root *root, s32 x)
}
node = container_of(eb_untag(troot, EB_NODE),
struct eb32_node, node.branches);
node_bit = node->node.bit;
__builtin_prefetch(node->node.branches.b[0], 0);
__builtin_prefetch(node->node.branches.b[1], 0);
node_bit = node->node.bit;
y = node->key ^ x;
z = 1U << (node_bit & 31);
troot = (key & z) ? node->node.branches.b[1] : node->node.branches.b[0];
if (!y) {
/* Either we found the node which holds the key, or
* we have a dup tree. In the later case, we have to
@ -209,8 +219,6 @@ static forceinline struct eb32_node *__eb32i_lookup(struct eb_root *root, s32 x)
if ((y >> node_bit) >= EB_NODE_BRANCHES)
return NULL; /* no more common bits */
troot = node->node.branches.b[(key >> node_bit) & EB_NODE_BRANCH_MASK];
}
}

View File

@ -119,7 +119,7 @@ static forceinline struct eb64_node *__eb64_lookup(struct eb_root *root, u64 x)
{
struct eb64_node *node;
eb_troot_t *troot;
u64 y;
u64 y, z;
troot = root->b[EB_LEFT];
if (unlikely(troot == NULL))
@ -137,7 +137,13 @@ static forceinline struct eb64_node *__eb64_lookup(struct eb_root *root, u64 x)
node = container_of(eb_untag(troot, EB_NODE),
struct eb64_node, node.branches);
__builtin_prefetch(node->node.branches.b[0], 0);
__builtin_prefetch(node->node.branches.b[1], 0);
y = node->key ^ x;
z = 1ULL << (node->node.bit & 63);
troot = (x & z) ? node->node.branches.b[1] : node->node.branches.b[0];
if (!y) {
/* Either we found the node which holds the key, or
* we have a dup tree. In the later case, we have to
@ -155,8 +161,6 @@ static forceinline struct eb64_node *__eb64_lookup(struct eb_root *root, u64 x)
if ((y >> node->node.bit) >= EB_NODE_BRANCHES)
return NULL; /* no more common bits */
troot = node->node.branches.b[(x >> node->node.bit) & EB_NODE_BRANCH_MASK];
}
}
@ -169,7 +173,7 @@ static forceinline struct eb64_node *__eb64i_lookup(struct eb_root *root, s64 x)
struct eb64_node *node;
eb_troot_t *troot;
u64 key = x ^ (1ULL << 63);
u64 y;
u64 y, z;
troot = root->b[EB_LEFT];
if (unlikely(troot == NULL))
@ -187,7 +191,13 @@ static forceinline struct eb64_node *__eb64i_lookup(struct eb_root *root, s64 x)
node = container_of(eb_untag(troot, EB_NODE),
struct eb64_node, node.branches);
__builtin_prefetch(node->node.branches.b[0], 0);
__builtin_prefetch(node->node.branches.b[1], 0);
y = node->key ^ x;
z = 1ULL << (node->node.bit & 63);
troot = (key & z) ? node->node.branches.b[1] : node->node.branches.b[0];
if (!y) {
/* Either we found the node which holds the key, or
* we have a dup tree. In the later case, we have to
@ -205,8 +215,6 @@ static forceinline struct eb64_node *__eb64i_lookup(struct eb_root *root, s64 x)
if ((y >> node->node.bit) >= EB_NODE_BRANCHES)
return NULL; /* no more common bits */
troot = node->node.branches.b[(key >> node->node.bit) & EB_NODE_BRANCH_MASK];
}
}

View File

@ -50,6 +50,7 @@ struct eb32_node *eb32_lookup_le(struct eb_root *root, u32 x)
{
struct eb32_node *node;
eb_troot_t *troot;
u32 y, z;
troot = root->b[EB_LEFT];
if (unlikely(troot == NULL))
@ -72,6 +73,13 @@ struct eb32_node *eb32_lookup_le(struct eb_root *root, u32 x)
node = container_of(eb_untag(troot, EB_NODE),
struct eb32_node, node.branches);
__builtin_prefetch(node->node.branches.b[0], 0);
__builtin_prefetch(node->node.branches.b[1], 0);
y = node->key;
z = 1U << (node->node.bit & 31);
troot = (x & z) ? node->node.branches.b[1] : node->node.branches.b[0];
if (node->node.bit < 0) {
/* We're at the top of a dup tree. Either we got a
* matching value and we return the rightmost node, or
@ -93,7 +101,7 @@ struct eb32_node *eb32_lookup_le(struct eb_root *root, u32 x)
break;
}
if (((x ^ node->key) >> node->node.bit) >= EB_NODE_BRANCHES) {
if ((x ^ y) & -(z << 1)) {
/* No more common bits at all. Either this node is too
* small and we need to get its highest value, or it is
* too large, and we need to get the prev value.
@ -109,7 +117,6 @@ struct eb32_node *eb32_lookup_le(struct eb_root *root, u32 x)
troot = node->node.node_p;
break;
}
troot = node->node.branches.b[(x >> node->node.bit) & EB_NODE_BRANCH_MASK];
}
/* If we get here, it means we want to report previous node before the
@ -138,6 +145,7 @@ struct eb32_node *eb32_lookup_ge(struct eb_root *root, u32 x)
{
struct eb32_node *node;
eb_troot_t *troot;
u32 y, z;
troot = root->b[EB_LEFT];
if (unlikely(troot == NULL))
@ -160,6 +168,13 @@ struct eb32_node *eb32_lookup_ge(struct eb_root *root, u32 x)
node = container_of(eb_untag(troot, EB_NODE),
struct eb32_node, node.branches);
__builtin_prefetch(node->node.branches.b[0], 0);
__builtin_prefetch(node->node.branches.b[1], 0);
y = node->key;
z = 1U << (node->node.bit & 31);
troot = (x & z) ? node->node.branches.b[1] : node->node.branches.b[0];
if (node->node.bit < 0) {
/* We're at the top of a dup tree. Either we got a
* matching value and we return the leftmost node, or
@ -181,7 +196,7 @@ struct eb32_node *eb32_lookup_ge(struct eb_root *root, u32 x)
break;
}
if (((x ^ node->key) >> node->node.bit) >= EB_NODE_BRANCHES) {
if ((x ^ y) & -(z << 1)) {
/* No more common bits at all. Either this node is too
* large and we need to get its lowest value, or it is too
* small, and we need to get the next value.
@ -197,7 +212,6 @@ struct eb32_node *eb32_lookup_ge(struct eb_root *root, u32 x)
troot = node->node.node_p;
break;
}
troot = node->node.branches.b[(x >> node->node.bit) & EB_NODE_BRANCH_MASK];
}
/* If we get here, it means we want to report next node after the

View File

@ -50,6 +50,7 @@ struct eb64_node *eb64_lookup_le(struct eb_root *root, u64 x)
{
struct eb64_node *node;
eb_troot_t *troot;
u64 y, z;
troot = root->b[EB_LEFT];
if (unlikely(troot == NULL))
@ -72,6 +73,13 @@ struct eb64_node *eb64_lookup_le(struct eb_root *root, u64 x)
node = container_of(eb_untag(troot, EB_NODE),
struct eb64_node, node.branches);
__builtin_prefetch(node->node.branches.b[0], 0);
__builtin_prefetch(node->node.branches.b[1], 0);
y = node->key;
z = 1ULL << (node->node.bit & 63);
troot = (x & z) ? node->node.branches.b[1] : node->node.branches.b[0];
if (node->node.bit < 0) {
/* We're at the top of a dup tree. Either we got a
* matching value and we return the rightmost node, or
@ -93,7 +101,7 @@ struct eb64_node *eb64_lookup_le(struct eb_root *root, u64 x)
break;
}
if (((x ^ node->key) >> node->node.bit) >= EB_NODE_BRANCHES) {
if ((x ^ y) & -(z << 1)) {
/* No more common bits at all. Either this node is too
* small and we need to get its highest value, or it is
* too large, and we need to get the prev value.
@ -109,7 +117,6 @@ struct eb64_node *eb64_lookup_le(struct eb_root *root, u64 x)
troot = node->node.node_p;
break;
}
troot = node->node.branches.b[(x >> node->node.bit) & EB_NODE_BRANCH_MASK];
}
/* If we get here, it means we want to report previous node before the
@ -138,6 +145,7 @@ struct eb64_node *eb64_lookup_ge(struct eb_root *root, u64 x)
{
struct eb64_node *node;
eb_troot_t *troot;
u64 y, z;
troot = root->b[EB_LEFT];
if (unlikely(troot == NULL))
@ -160,6 +168,13 @@ struct eb64_node *eb64_lookup_ge(struct eb_root *root, u64 x)
node = container_of(eb_untag(troot, EB_NODE),
struct eb64_node, node.branches);
__builtin_prefetch(node->node.branches.b[0], 0);
__builtin_prefetch(node->node.branches.b[1], 0);
y = node->key;
z = 1ULL << (node->node.bit & 63);
troot = (x & z) ? node->node.branches.b[1] : node->node.branches.b[0];
if (node->node.bit < 0) {
/* We're at the top of a dup tree. Either we got a
* matching value and we return the leftmost node, or
@ -181,7 +196,7 @@ struct eb64_node *eb64_lookup_ge(struct eb_root *root, u64 x)
break;
}
if (((x ^ node->key) >> node->node.bit) >= EB_NODE_BRANCHES) {
if ((x ^ y) & -(z << 1)) {
/* No more common bits at all. Either this node is too
* large and we need to get its lowest value, or it is too
* small, and we need to get the next value.
@ -197,7 +212,6 @@ struct eb64_node *eb64_lookup_ge(struct eb_root *root, u64 x)
troot = node->node.node_p;
break;
}
troot = node->node.branches.b[(x >> node->node.bit) & EB_NODE_BRANCH_MASK];
}
/* If we get here, it means we want to report next node after the