From 573bbbe127608d36e4050fcb19069dc64ba4b2fb Mon Sep 17 00:00:00 2001
From: Willy Tarreau
Date: Sun, 17 Mar 2024 10:20:56 +0100
Subject: [PATCH] MEDIUM: ring: improve speed in the queue waiting loop on
 x86_64

x86_64 doesn't have a native atomic FETCH_OR(); it's implemented using
a CAS, which will always cause a write cycle. Here we know we can just
wait as long as the lock bit is held, so it's better to loop on a load
and only attempt the CAS once the lock bit appears free. This requires
a tiny ifdef and brings nice benefits: it brings the performance back
from 3.33M to 3.75M at 24C48T, with no change at 3C6T.
---
 src/ring.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/ring.c b/src/ring.c
index 74772314a..0393a269b 100644
--- a/src/ring.c
+++ b/src/ring.c
@@ -281,11 +281,15 @@ ssize_t ring_write(struct ring *ring, size_t maxlen, const struct ist pfx[], siz
 		if (next_cell != &cell)
 			goto wait_for_flush; // FIXME: another thread arrived, we should go to wait now
 		__ha_cpu_relax_for_read();
-
-		tail_ofs = HA_ATOMIC_FETCH_OR(tail_ptr, RING_TAIL_LOCK);
-		if (!(tail_ofs & RING_TAIL_LOCK))
-			break;
-
+#if defined(__x86_64__)
+		/* x86 prefers a read first */
+		if (!(HA_ATOMIC_LOAD(tail_ptr) & RING_TAIL_LOCK))
+#endif
+		{
+			tail_ofs = HA_ATOMIC_FETCH_OR(tail_ptr, RING_TAIL_LOCK);
+			if (!(tail_ofs & RING_TAIL_LOCK))
+				break;
+		}
 		__ha_cpu_relax_for_read();
 	}
 	/* OK the queue is locked, let's attempt to get the tail lock */
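
For reference, the same "load first, then lock" idea can be sketched standalone
with C11 atomics instead of HAProxy's HA_ATOMIC_* macros. This is only a
minimal sketch of the test-and-test-and-set pattern the patch applies to the
tail lock; the names TAIL_LOCK_BIT, cpu_relax() and take_tail_lock() are
invented for illustration and are not the patched ring_write() code.

#include <stdatomic.h>
#include <stdint.h>

#define TAIL_LOCK_BIT ((uint64_t)1 << 63)   /* hypothetical lock bit in the tail word */

static inline void cpu_relax(void)
{
#if defined(__x86_64__)
	__builtin_ia32_pause();              /* PAUSE hint to ease the spin loop */
#endif
}

/* Spin until the lock bit is acquired; returns the tail value observed
 * at the moment the bit was grabbed (i.e. with the lock bit clear).
 */
static uint64_t take_tail_lock(_Atomic uint64_t *tail)
{
	uint64_t ofs;

	while (1) {
#if defined(__x86_64__)
		/* As the patch notes, FETCH_OR is built from a CAS on x86_64
		 * and thus always writes the cache line. Poll with a plain
		 * load and only attempt the read-modify-write once the bit
		 * looks free.
		 */
		if (atomic_load_explicit(tail, memory_order_relaxed) & TAIL_LOCK_BIT) {
			cpu_relax();
			continue;
		}
#endif
		ofs = atomic_fetch_or_explicit(tail, TAIL_LOCK_BIT, memory_order_acquire);
		if (!(ofs & TAIL_LOCK_BIT))
			return ofs;          /* the bit was clear: we now own the lock */
		cpu_relax();
	}
}

On non-x86_64 targets the #if block compiles out and the loop falls back to a
plain FETCH_OR spin, mirroring how the ifdef in the patch leaves other
architectures unchanged.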