From 69bc0e7351e0c895d7fe0b8962672aea5fd31427 Mon Sep 17 00:00:00 2001
From: Pavel Punsky <eakraly@users.noreply.github.com>
Date: Sun, 3 May 2026 22:03:08 -0700
Subject: [PATCH] Load generator mode in turnutils_uclient (#1894)

## Summary

Adds load-generator modes to `turnutils_uclient` for repeatable TURN
server performance testing:

- Adds `-Y packet|alloc|invalid` load modes.
- Supports packet flood, allocation flood, and invalid-packet flood
workflows.
- Adds unique local client ports for allocation flood mode.
- Removes default packet pacing in load-generator modes unless
explicitly set.
- Adds helper scripts under `examples/loadtest/`.
- Documents load-test usage in `README.turnutils`,
`man/man1/turnutils.1`, `CLAUDE.md`, and
`docs/PerformanceIterationLog.md`.

The performance log captures DigitalOcean benchmark methodology, A/B
lessons, hot-path findings, and future optimization candidates.
---
 CLAUDE.md                             | 203 +++++++++++++++++
 README.turnutils                      |  18 ++
 docs/PerformanceIterationLog.md       | 268 ++++++++++++++++++++++
 examples/loadtest/allocation_flood.sh |  45 ++++
 examples/loadtest/invalid_flood.sh    |  41 ++++
 examples/loadtest/packet_flood.sh     |  49 +++++
 man/man1/turnutils.1                  |  19 ++
 src/apps/uclient/mainuclient.c        |  74 ++++++-
 src/apps/uclient/startuclient.c       | 135 +++++++++++-
 src/apps/uclient/startuclient.h       |   9 +
 src/apps/uclient/uclient.c            | 306 +++++++++++++++++++++++---
 src/apps/uclient/uclient.h            |  13 ++
 12 files changed, 1136 insertions(+), 44 deletions(-)
 create mode 100644 docs/PerformanceIterationLog.md
 create mode 100755 examples/loadtest/allocation_flood.sh
 create mode 100755 examples/loadtest/invalid_flood.sh
 create mode 100755 examples/loadtest/packet_flood.sh

diff --git a/CLAUDE.md b/CLAUDE.md
index 752566df..dd1ceb86 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -134,6 +134,209 @@ cd examples && ./scripts/basic/udp_c2c_client.sh
 cd examples && ./run_tests.sh
 ```
 
+## Load Test on DigitalOcean
+
+Use two same-region CPU-optimized droplets for repeatable load tests. The last
+known setup used Ubuntu 24.04 `c-4` droplets in `nyc1`:
+
+- turnserver droplet private IP: `10.116.0.2`
+- loadgen droplet private IP: `10.116.0.3`
+- build: current branch archived with `git archive`
+- important baseline: turnserver was **not** run with `--udp-recvmmsg`
+
+Never paste DigitalOcean tokens into logs or files. Use a local environment
+variable such as `DIGITALOCEAN_TOKEN`, and revoke temporary tokens after the
+run.
+
+Local source package and upload:
+
+```bash
+git archive --format=tar HEAD -o /tmp/coturn.tar
+
+scp /tmp/coturn.tar root@TURN_PUBLIC_IP:/root/coturn.tar
+scp /tmp/coturn.tar root@LOADGEN_PUBLIC_IP:/root/coturn.tar
+```
+
+Install dependencies and build on both droplets:
+
+```bash
+export DEBIAN_FRONTEND=noninteractive
+apt-get update
+apt-get install -y build-essential cmake pkg-config libssl-dev libevent-dev \
+  libsqlite3-dev libhiredis-dev git iproute2 sysstat
+
+rm -rf /root/coturn
+mkdir /root/coturn
+tar -xf /root/coturn.tar -C /root/coturn
+cmake -S /root/coturn -B /root/coturn/build -DCMAKE_BUILD_TYPE=Release
+cmake --build /root/coturn/build --target turnserver turnutils_uclient turnutils_peer -j$(nproc)
+```
+
+Start `turnserver` on the server droplet. This is the baseline command used for
+the final run; add `--udp-recvmmsg` only when intentionally comparing that mode:
+
+```bash
+pkill -x turnserver || true
+sysctl -w net.core.rmem_max=134217728 net.core.wmem_max=134217728 \
+  net.core.netdev_max_backlog=250000 || true
+ulimit -n 1048576
+
+nohup /root/coturn/build/bin/turnserver \
+  --use-auth-secret \
+  --static-auth-secret=secret \
+  --realm=north.gov \
+  --allow-loopback-peers \
+  --listening-ip=10.116.0.2 \
+  --relay-ip=10.116.0.2 \
+  --min-port=49152 \
+  --max-port=65535 \
+  --no-cli \
+  --no-tls \
+  --no-dtls \
+  --log-file=stdout \
+  --simple-log \
+  > /root/turnserver.log 2>&1 &
+echo $! > /root/turnserver.pid
+```
+
+Start the UDP peer on the loadgen droplet:
+
+```bash
+pkill -x turnutils_peer || true
+sysctl -w net.core.rmem_max=134217728 net.core.wmem_max=134217728 \
+  net.core.netdev_max_backlog=250000 || true
+ulimit -n 1048576
+
+nohup /root/coturn/build/bin/turnutils_peer -L 10.116.0.3 -p 3480 \
+  > /root/peer.log 2>&1 &
+echo $! > /root/peer.pid
+```
+
+Optional server-side monitor, run on the turnserver droplet before each test:
+
+```bash
+cat > /root/start_monitor.sh <<'EOF'
+#!/bin/bash
+label=$1
+pid=$(cat /root/turnserver.pid)
+rm -f /root/${label}_*.txt
+nohup bash -c "pidstat -h -u -r -p $pid 1 14 > /root/${label}_pidstat.txt & \
+  mpstat 1 14 > /root/${label}_mpstat.txt & \
+  sar -n DEV 1 14 > /root/${label}_sar.txt & wait" \
+  > /root/${label}_monitor.out 2>&1 &
+echo $! > /root/${label}_monitor.pid
+EOF
+chmod +x /root/start_monitor.sh
+```
+
+Connectivity smoke from loadgen:
+
+```bash
+/root/coturn/build/bin/turnutils_uclient \
+  -Y packet -m 1 -n 1000 -l 120 \
+  -e 10.116.0.3 -r 3480 -X -g \
+  -u user -W secret \
+  10.116.0.2
+```
+
+Packet relay sweep from loadgen:
+
+```bash
+for m in 1 2 4 8 16 32; do
+  log=/root/packet_m${m}.log
+  timeout -s INT 12s /root/coturn/build/bin/turnutils_uclient \
+    -Y packet -m "$m" -l 120 \
+    -e 10.116.0.3 -r 3480 -X -g \
+    -u user -W secret \
+    10.116.0.2 > "$log" 2>&1 || true
+  tail -20 "$log"
+done
+```
+
+Monitored packet run:
+
+```bash
+# on turnserver
+/root/start_monitor.sh packet_m1_mon
+
+# on loadgen
+timeout -s INT 12s /root/coturn/build/bin/turnutils_uclient \
+  -Y packet -m 1 -l 120 \
+  -e 10.116.0.3 -r 3480 -X -g \
+  -u user -W secret \
+  10.116.0.2 > /root/packet_m1_mon.log 2>&1 || true
+```
+
+Packet-only CPU profile, useful when checking the relay bottleneck. Build with
+`-DCMAKE_BUILD_TYPE=RelWithDebInfo` if you want readable user-space symbols.
+Run once without `--udp-recvmmsg`, then restart `turnserver` with
+`--udp-recvmmsg` and rerun the same commands with the `recvmmsg` label:
+
+```bash
+# on turnserver
+sysctl -w kernel.perf_event_paranoid=-1 kernel.kptr_restrict=0 || true
+pid=$(cat /root/turnserver.pid)
+label=no_recvmmsg
+
+(pidstat -h -u -r -p "$pid" 1 14 > /root/${label}_pidstat.txt & \
+  mpstat 1 14 > /root/${label}_mpstat.txt & \
+  sar -n DEV 1 14 > /root/${label}_sar.txt & wait) \
+  > /root/${label}_monitor.out 2>&1 &
+
+perf record -F 99 -g -p "$pid" -o /root/${label}.perf.data -- sleep 14
+perf report --stdio -i /root/${label}.perf.data --no-children \
+  --sort comm,dso,symbol > /root/${label}_perf.report
+perf report --stdio -i /root/${label}.perf.data --children \
+  --sort symbol,dso > /root/${label}_perf.children
+
+# on loadgen, started about one second after perf starts
+timeout -s INT 12s /root/coturn/build/bin/turnutils_uclient \
+  -Y packet -m 1 -l 120 \
+  -e 10.116.0.3 -r 3480 -X -g \
+  -u user -W secret \
+  10.116.0.2 > /root/${label}_packet_m1.log 2>&1 || true
+```
+
+Invalid-packet flood:
+
+```bash
+# on turnserver
+/root/start_monitor.sh invalid_m1_mon
+
+# on loadgen
+timeout -s INT 12s /root/coturn/build/bin/turnutils_uclient \
+  -Y invalid -m 1 -l 16 \
+  10.116.0.2 > /root/invalid_m1_mon.log 2>&1 || true
+```
+
+Restart `turnserver` after invalid-packet tests before allocation tests. The
+last run saw rapid RSS growth during invalid flood, so avoid chaining tests on
+the same server process.
+
+Allocation flood:
+
+```bash
+# on turnserver
+/root/start_monitor.sh alloc_10000_mon
+
+# on loadgen
+/root/coturn/build/bin/turnutils_uclient \
+  -Y alloc -m 50 -n 200 \
+  -L 10.116.0.3 \
+  -u user -W secret \
+  10.116.0.2 > /root/alloc_10000.log 2>&1
+```
+
+Useful summaries:
+
+```bash
+grep -h 'send_pps=' /root/packet_m*.log /root/*_mon.log | tail -50
+grep -h 'total_allocations=' /root/alloc_*.log | tail -20
+ps -o pid,rss,vsz,pcpu,pmem,comm -p $(cat /root/turnserver.pid)
+tail -20 /root/*_pidstat.txt
+tail -20 /root/*_sar.txt
+```
+
 ### Unit tests (Unity, opt-in via `BUILD_TESTING=ON`)
 
 Unity is fetched on demand via CMake `FetchContent`; nothing is vendored.
diff --git a/README.turnutils b/README.turnutils
index d318222a..46b46dc4 100644
--- a/README.turnutils
+++ b/README.turnutils
@@ -123,6 +123,12 @@ Flags:
 
 -J	Use oAuth with default test key kid='north'.
 
+-Y	Load-generator mode; requires a value (packet, alloc, or invalid):
+	packet floods data through a single TURN allocation as fast as possible,
+	alloc creates allocations as fast as possible,
+	and invalid sends small invalid packets to the TURN listener as fast as possible.
+	Load-generator modes imply -c and do not support -y.
+
 Options with required values:
 
 -l      Message length (Default: 100 Bytes).
@@ -137,6 +143,8 @@ Options with required values:
 -p      TURN Server port (Defaults: 3478 unsecure, 5349 secure).
 
 -n      Number of messages to send (Default: 5).
+	In load-generator mode, -n is the number of operations per client.
+	If omitted there, the client runs until interrupted.
 
 -d      Local interface device (optional, Linux only).
 
@@ -149,6 +157,7 @@ Options with required values:
 -r      Peer port (Default: 3480).
 
 -z      Per-session packet interval in milliseconds (Default: 20).
+	In packet and invalid load-generator modes the default is 0 ms.
 
 -u      STUN/TURN user name.
 
@@ -168,6 +177,15 @@ Options with required values:
 
 -a	Bandwidth for the bandwidth request in ALLOCATE. The default value is zero.
 
+Notes for load-generator mode:
+
+- packet mode still performs the normal TURN allocation/setup and then starts sending immediately with no pacing.
+
+- alloc mode does not require -e; it repeatedly establishes new allocations, uses a unique client local port for each one,
+  attaches each one to a unique synthetic peer ip:port, and closes them again.
+
+- invalid mode does not require -e; by default it uses 16-byte payloads unless -l is specified.
+
 See the examples in the "examples/scripts" directory.
 
 ======================================
diff --git a/docs/PerformanceIterationLog.md b/docs/PerformanceIterationLog.md
new file mode 100644
index 00000000..8316394c
--- /dev/null
+++ b/docs/PerformanceIterationLog.md
@@ -0,0 +1,268 @@
+# Performance iteration log
+
+Running notes for the multi-iteration performance work on the UDP relay
+data path. Pick this up to continue without re-deriving everything.
+
+The harness, baseline command, and droplet topology are documented in
+[CLAUDE.md](../CLAUDE.md) under "Load Test on DigitalOcean" — this file
+captures the *deltas*: what was measured, what landed, what didn't, and
+where the next round should go.
+
+## Cumulative result
+
+Five commits on `claude/beautiful-black-c3b741` between `727ec2ab`
+("loadgen") and `321a2d18`:
+
+| # | Commit | Optimization |
+|---|---|---|
+| 1 | `ce7e7e53` | Hoist `turn_server_get_engine()` out of per-packet hot path |
+| 2 | `8e28491a` | `ioa_socket_check_bandwidth` early fast-exit; drop dead `if (!(s->done \|\| s->fd==-1))` in `send_data_from_ioa_socket_nbh` |
+| 3 | `344360f6` | Cache `get_relay_socket_ss()` and `ioa_network_buffer_get_size()` in `write_to_peerchannel`, `handle_turn_send`, `read_client_connection` |
+| 4 | `a6f6767f` | Inline `get_ioa_addr_len()` via `ns_turn_ioaddr.h` |
+| 5 | `321a2d18` | Inline `addr_cpy()` via `ns_turn_ioaddr.h` |
+
+Alternating A/B run on the same droplet pair, m=1 packet flood, 30 s
+per run, with a 4 s warm-up between binary swaps:
+
+- Baseline (clean `master` binary): mean 146,984 round-trips / 30 s
+- Cumulative (all 5 iters): mean 155,468 round-trips / 30 s
+- **+5.8 % throughput**
+
+Per-iteration deltas were within run-to-run noise (~5–10 % variance).
+The cumulative effect is what's visible.
+
+## Test setup that was used
+
+Two `c-4` Ubuntu 24.04 droplets in `nyc1`, same VPC `default-nyc1`:
+
+- `coturn-turnserver` — public `68.183.121.197`, private `10.116.0.2`
+- `coturn-loadgen`    — public `68.183.132.220`, private `10.116.0.3`
+
+Created via the DigitalOcean v2 API (`doctl` is *not* installed; use
+`curl` + `$DIGITALOCEAN_TOKEN` from the user's `~/.zshrc`). SSH via
+`~/.ssh/id_rsa` (matches DO ssh key id `23704483`, fingerprint
+`37:3a:9b:e3:1e:1a:9b:42:a0:6f:58:f5:5a:3a:6a:2c`).
+
+State on the turnserver droplet (kept across iterations):
+
+- `/root/coturn_clean.tar` — `git archive HEAD` of master at start of run.
+  Re-extract this before applying any new patch.
+- `/root/coturn_baseline/build/bin/turnserver` — clean baseline binary,
+  used as the "B" in every A/B round. **Don't overwrite.**
+- `/root/coturn/build/bin/turnserver` — current iteration binary.
+- `/root/start_turnserver.sh`, `/root/baseline_run.sh` — helper scripts.
+
+State on the loadgen droplet:
+
+- `/root/coturn/build/bin/turnutils_uclient`, `turnutils_peer`.
+- `turnutils_peer` runs as a daemon on `10.116.0.3:3480`
+  (`pid` in `/root/peer.pid`).
+
+A small env file was written to `/tmp/coturn_perf_env.sh` on the local
+machine with the IPs / droplet IDs — recreate it from the current
+state of the DO account if it gets lost.
+
+The standard packet-flood command (matches CLAUDE.md baseline, runs
+*without* `--udp-recvmmsg`):
+
+```bash
+timeout -s INT 30s /root/coturn/build/bin/turnutils_uclient \
+  -Y packet -m 1 -l 120 \
+  -e 10.116.0.3 -r 3480 -X -g \
+  -u user -W secret \
+  10.116.0.2
+```
+
+Metric: the `tot_recv_msgs` field on the last `start_mclient:` log
+line. (This is round-trips through the relay over the test window;
+`send_pps` is loadgen-side only and can hit 262 K even when the relay
+is dropping most of them, so it's not a useful proxy for relay
+throughput.)
+
+## Hot-path map at the end of iter 5
+
+`perf record -F 99 -g` on the turnserver during a 12 s `-Y packet -m 1`
+run, sorted by user-space self-time:
+
+```
+0.80 % send_data_from_ioa_socket_nbh
+0.76 % socket_input_worker
+0.69 % read_client_connection.isra.0
+0.60 % turn_report_session_usage
+0.53 % peer_input_handler
+0.51 % udp_server_input_handler
+0.35 % udp_recvfrom               # was 0.76 % at iter 1
+0.34 % lm_map_get
+0.27 % stun_is_channel_message_str
+0.27 % get_relay_socket
+0.26 % ioa_socket_check_bandwidth # was 0.33 % at iter 1
+0.26 % udp_send                   # was 0.60 % at iter 1
+0.18 % ioa_network_buffer_get_size
+```
+
+Total user-space coturn cycles: ~5–7 % of the relay thread.
+The relay thread sits at ~100 % CPU pinned to one core; the 4 relay
+threads aren't parallelised by the m=1 single-flow test (one 5-tuple
+hashes to one SO_REUSEPORT worker).
+
+Kernel side (children-aggregated) is the real cost:
+
+```
+36 % udp_sendmsg (sendto path)
+14 % udp_recvmsg
+17 % ip_finish_output / ip_output / __dev_queue_xmit
+~23 % syscall enter / exit machinery (sysret, SYSRETQ, SYSCALL_64*)
+```
+
+That ~23 % syscall overhead is the next big lever. Halving it
+(via batching) is worth ~10 % wall-clock CPU.
+
+## What didn't work
+
+### Default `--udp-recvmmsg=true` on Linux (tried in iter 1, reverted)
+
+The flag exists and is wired to `receive_udp_batch_recvmmsg` in
+[dtls_listener.c](../src/apps/relay/dtls_listener.c), but **only on
+the listener socket** — the unconnected `udp_listen_s` that handles
+the *first* packet from a new client. Once `dtls_listener` calls
+`create_new_connected_udp_socket` (line ~583), subsequent
+client→relay traffic on that 5-tuple goes through a per-session
+*connected* UDP socket whose libevent callback is
+`socket_input_handler` → `socket_input_worker` →
+`udp_recvfrom` (single `recvmsg`). Same on the peer→relay direction.
+
+In a steady-state packet flood with one client, almost zero packets
+hit the listener path, so flipping the default does nothing for this
+test. It would help a many-client / many-allocate workload, but
+that's not what the m=1 harness measures.
+
+Throughput parity confirmed across multiple A/B rounds; reverted to
+keep the baseline mental model in CLAUDE.md intact.
+
+### Caching `get_relay_socket_ss` (iter 3) — no measurable wall-clock win
+
+The function is `static inline` already and the underlying
+`get_relay_socket()` is a four-line accessor. Caching the result
+*does* save a cross-TU function call per packet (the compiler can't
+prove `get_relay_socket` pure across the
+`set_df_on_ioa_socket` / `ioa_network_buffer_*` calls in between),
+which the perf profile picked up as a small redistribution, but
+throughput stayed in the noise band. Kept anyway: the cleanup is
+defensible and matches the iter 4/5 inlining direction.
+
+## Methodology lessons
+
+- **Always alternate A/B per round** rather than running 5×B then 5×I.
+  The droplet pair has noticeable environmental drift over a few
+  minutes (other tenants on the hypervisor, NIC ring backpressure,
+  whatever); sequential blocks bias whichever binary ran on the worse
+  half of the run.
+- **Discard the first run after a turnserver restart.** The loadgen's
+  first run after a server restart is consistently 30–80 % slower
+  than steady-state — looks like channel/permission state in the
+  client side warming up, not the server. A 4 s "throwaway" run
+  before the measured 30 s run is enough.
+- **Run-to-run variance is ~5–10 %** even with alternation. Plan on
+  6–8 rounds (≈ 8 minutes wall-clock) before claiming a sub-10 % win.
+  A single 3-round A/B will lie to you.
+- **Use the `tot_recv_msgs` field, not `send_pps`**. Loadgen send rate
+  saturates at ~262 K pps regardless of relay capacity — it's
+  whatever the loadgen kernel will accept into its UDP send buffer.
+  The receive count is what made it round-trip through the relay.
+- **The relay is kernel-bound.** User-space coturn is ~5 % of cycles.
+  Halving it gives at most ~2.5 % wall-clock — usually undetectable
+  per-iteration, only visible cumulatively. Don't expect a 10 % jump
+  from a CSE.
+- **Single-flow tests pin one core.** With `SO_REUSEPORT` the kernel
+  hashes 5-tuples to worker sockets; one client → one tuple → one
+  worker thread. The other 3 cores sit idle. To exercise all 4 relay
+  threads you'd need m≥4 *with distinct source ports* — ours don't
+  spread cleanly because the loadgen reuses ports.
+- **Apply patches to a freshly extracted tree, not incrementally.**
+  The droplet copy is *not* a git checkout (it's the `git archive`
+  tar), so don't count on `git apply`-style workflows there; use
+  `patch -p1`. Each iteration uploaded a *cumulative* diff (current
+  branch vs `master`) and re-extracted `/root/coturn` from
+  `/root/coturn_clean.tar` first to get a clean apply.
+
+## Optimization backlog (bigger fish for next session)
+
+Ordered by expected impact for the m=1 packet-flood metric:
+
+1. **Extend `recvmmsg` into `socket_input_worker`** for plain UDP
+   non-DTLS sockets. The existing `try_again` loop in
+   [ns_ioalib_engine_impl.c:2683](../src/apps/relay/ns_ioalib_engine_impl.c#L2683)
+   already drains up to `MAX_TRIES = 16` packets per epoll wakeup via
+   16 single `recvmsg` calls. Replacing the inner read with a
+   `recvmmsg` of up to 16 messages saves ~15 syscalls per drain
+   iteration. At ~14 % `udp_recvmsg` kernel + ~6 % syscall machinery
+   on the recv side, plausible 8–12 % throughput. Risk: the function
+   is heavily branched (TCP / TLS / DTLS / UDP all share the body)
+   and state can change mid-loop (`s->tobeclosed` etc.); the cleanest
+   shape is a separate UDP-only helper called from
+   `socket_input_handler` *before* falling through to the existing
+   `socket_input_worker`, gated on `s->ssl == NULL && s->bev == NULL
+   && !s->parent_s`. **This is the highest-value remaining item.**
+
+2. **`sendmmsg` batched send.** Each successful packet fires one
+   `sendto`. After (1) lands, when the receive loop hands a batch of
+   N packets to the dispatch layer in one go, the corresponding sends
+   could be coalesced into one `sendmmsg`. Requires a lightweight
+   per-thread send queue and a flush at the end of each event-loop
+   tick. Bigger refactor; expect another ~10 % if (1) lands.
+
+3. **GSO (`UDP_SEGMENT`)** on the send path. Linux can take one
+   "large" datagram and segment it in the kernel for back-to-back
+   packets to the same destination. Our channel-data flood IS
+   same-destination. Setting `UDP_SEGMENT` and submitting a single
+   `sendmsg` of N×packet_size cuts skb-alloc / `__dev_queue_xmit`
+   work substantially. Needs careful handling for short tails and
+   non-uniform sizes; complementary to (2).
+
+4. **Inline more cross-TU per-packet accessors.** Pattern from iter
+   4/5 still applies: `addr_eq` (called per channel-data packet for
+   permission lookup), `ioa_network_buffer_get_size`,
+   `get_ioa_socket_type` / `_app_type`. Each is small enough; the
+   only reason to be cautious is they're declared in `ns_turn_ioalib.h`
+   which is part of the public-ish server library API — moving the
+   body inline doesn't break ABI but does require a recompile of all
+   consumers. Likely <1 % each but cheap to do.
+
+5. **Re-evaluate `--udp-recvmmsg` default after (1) lands.** Once
+   per-session sockets also batch, the listener path is no longer a
+   special case and turning it on by default becomes a free win for
+   multi-tenant servers without hurting m=1.
+
+## Things investigated and ruled out (don't redo)
+
+- `set_socket_ttl` / `set_socket_tos` already short-circuit on
+  no-change via `s->current_ttl != ttl` / `s->current_tos != tos`.
+  In a steady-state flood the per-packet call returns immediately
+  without `setsockopt`. Already optimized.
+- `set_df_on_ioa_socket` similarly guarded
+  ([ns_ioalib_engine_impl.c:242](../src/apps/relay/ns_ioalib_engine_impl.c#L242)).
+- `turn_report_session_usage` slow path runs once per 4096 packets
+  (see iter 1 commit); the per-call overhead is now ~3 reads + 1
+  bitmask test + 1 conditional return.
+- `MSG_CONFIRM` in `sendto` would skip ARP refresh, but
+  `neigh_resolve_output` + `neigh_hh_output` show ~17 % combined in
+  perf only because we're sending *that many* packets — per-packet
+  it's the normal cached neighbor path, not a refresh.
+- Increasing `MAX_TRIES` from 16 to 64 in `socket_input_worker`
+  doesn't change syscall count; it only delays returning to libevent.
+  Useless without (1) above.
+
+## How to resume
+
+1. Verify the droplets are still up (the IPs above). If they were
+   destroyed, re-create with `c-4` / `nyc1` / `default-nyc1` VPC and
+   the `pavel` SSH key (id 23704483).
+2. Re-upload `/tmp/coturn_clean.tar` from `git archive master` and
+   rebuild `/root/coturn_baseline/build/bin/turnserver` if the
+   baseline binary is gone. The A/B harness depends on having both
+   binaries side-by-side on the turnserver droplet.
+3. Run a 6-round alternating A/B as a sanity check that the current
+   tip-of-branch still beats `master` by ~5 %. If it doesn't, the
+   environment drifted and the baseline needs re-anchoring.
+4. Pick the next item from the backlog. Item (1) — `recvmmsg` into
+   `socket_input_worker` — is where the next material gain lives.
diff --git a/examples/loadtest/allocation_flood.sh b/examples/loadtest/allocation_flood.sh
new file mode 100755
index 00000000..e3702d42
--- /dev/null
+++ b/examples/loadtest/allocation_flood.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+set -o errexit -o nounset
+
+# Allocation-flood load test against a local turnserver.
+# turnutils_peer is not started: turnutils_uclient generates a unique
+# synthetic peer ip:port per allocation cycle, so two processes suffice.
+
+script_dir="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)"
+repo_root="$(CDPATH= cd -- "${script_dir}/../.." && pwd)"
+
+# Prefer the CMake build output; otherwise fall back to <repo>/bin.
+bin_dir="${repo_root}/build/bin"
+[ -x "${bin_dir}/turnserver" ] || bin_dir="${repo_root}/bin"
+
+# Stop and reap whichever children were started (runs on exit/INT/TERM).
+stop_children() {
+  kill "${uclient_pid:-}" "${turnserver_pid:-}" 2>/dev/null || true
+  wait "${uclient_pid:-}" "${turnserver_pid:-}" 2>/dev/null || true
+}
+trap stop_children EXIT INT TERM
+
+# Relay on loopback, authenticated with the static shared secret.
+"${bin_dir}/turnserver" \
+  --use-auth-secret --static-auth-secret=secret \
+  --realm=north.gov \
+  --allow-loopback-peers \
+  --listening-ip=127.0.0.1 \
+  --relay-ip=127.0.0.1 \
+  > /dev/null 2>&1 &
+turnserver_pid=$!
+
+sleep 2
+
+"${bin_dir}/turnutils_uclient" \
+  -Y alloc \
+  -m 50 \
+  -L 127.0.0.1 \
+  -u user \
+  -W secret \
+  "$@" \
+  127.0.0.1 &
+uclient_pid=$!
+
+wait "${uclient_pid}"
diff --git a/examples/loadtest/invalid_flood.sh b/examples/loadtest/invalid_flood.sh
new file mode 100755
index 00000000..e3bbce9f
--- /dev/null
+++ b/examples/loadtest/invalid_flood.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -eu
+
+# Invalid-packet flood against a local turnserver; no turnutils_peer is needed.
+
+SCRIPT_DIR="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)"
+REPO_ROOT="$(CDPATH= cd -- "${SCRIPT_DIR}/../.." && pwd)"
+
+BINDIR="${REPO_ROOT}/build/bin"
+[ -x "${BINDIR}/turnserver" ] || BINDIR="${REPO_ROOT}/bin"
+
+cleanup() {
+  kill "${uclient_pid:-}" "${turnserver_pid:-}" 2>/dev/null || true
+  wait "${uclient_pid:-}" "${turnserver_pid:-}" 2>/dev/null || true
+}
+trap cleanup EXIT INT TERM
+
+"${BINDIR}/turnserver" \
+  --use-auth-secret \
+  --static-auth-secret=secret \
+  --realm=north.gov \
+  --allow-loopback-peers \
+  --listening-ip=127.0.0.1 \
+  --relay-ip=127.0.0.1 \
+  > /dev/null 2>&1 &
+turnserver_pid=$!
+
+sleep 2  # give turnserver time to start listening before the flood begins
+
+"${BINDIR}/turnutils_uclient" \
+  -Y invalid \
+  -m 50 \
+  -l 16 \
+  -u user \
+  -W secret \
+  "$@" \
+  127.0.0.1 &
+uclient_pid=$!
+
+wait "${uclient_pid}"
diff --git a/examples/loadtest/packet_flood.sh b/examples/loadtest/packet_flood.sh
new file mode 100755
index 00000000..fc8c263a
--- /dev/null
+++ b/examples/loadtest/packet_flood.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+set -o errexit -o nounset
+
+# Packet-flood load test: local turnserver + turnutils_peer + turnutils_uclient.
+
+script_dir="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)"
+repo_root="$(CDPATH= cd -- "${script_dir}/../.." && pwd)"
+
+# Prefer the CMake build output; otherwise fall back to <repo>/bin.
+bin_dir="${repo_root}/build/bin"
+[ -x "${bin_dir}/turnserver" ] || bin_dir="${repo_root}/bin"
+
+# Stop and reap whichever children were started (runs on exit/INT/TERM).
+stop_children() {
+  kill "${uclient_pid:-}" "${peer_pid:-}" "${turnserver_pid:-}" 2>/dev/null || true
+  wait "${uclient_pid:-}" "${peer_pid:-}" "${turnserver_pid:-}" 2>/dev/null || true
+}
+trap stop_children EXIT INT TERM
+
+"${bin_dir}/turnserver" \
+  --use-auth-secret --static-auth-secret=secret \
+  --realm=north.gov \
+  --allow-loopback-peers \
+  --listening-ip=127.0.0.1 --relay-ip=127.0.0.1 \
+  > /dev/null 2>&1 &
+turnserver_pid=$!
+
+"${bin_dir}/turnutils_peer" -L 127.0.0.1 -p 3480 > /dev/null 2>&1 &
+peer_pid=$!
+
+sleep 2
+
+# One client, 0 ms packet interval, message length 120, peer on 127.0.0.1:3480;
+# extra command-line arguments are passed through to turnutils_uclient.
+"${bin_dir}/turnutils_uclient" \
+  -Y packet \
+  -m 1 \
+  -z 0 \
+  -l 120 \
+  -e 127.0.0.1 -r 3480 \
+  -X \
+  -g \
+  -u user -W secret \
+  "$@" \
+  127.0.0.1 &
+uclient_pid=$!
+
+wait "${uclient_pid}"
diff --git a/man/man1/turnutils.1 b/man/man1/turnutils.1
index ec488992..d7fbabd5 100644
--- a/man/man1/turnutils.1
+++ b/man/man1/turnutils.1
@@ -191,6 +191,14 @@ Dual allocation (SSODA). Implies \fB\-c\fP option.
 .B
 \fB\-J\fP
 Use oAuth with default test key kid='north'.
+.TP
+.B
+\fB\-Y\fP
+Load\-generator mode; requires a value (\fIpacket\fP, \fIalloc\fP, or \fIinvalid\fP):
+\fIpacket\fP floods data through a single TURN allocation as fast as possible,
+\fIalloc\fP creates allocations as fast as possible,
+and \fIinvalid\fP sends small invalid packets to the TURN listener as fast as possible.
+Load\-generator modes imply \fB\-c\fP and do not support \fB\-y\fP.
 .PP
 Options with required values:
 .TP
@@ -218,6 +226,8 @@ if the server certificate to be verified.
 .B
 \fB\-n\fP
 Number of messages to send (Default: 5).
+In load\-generator mode, \fB\-n\fP is the number of operations per client.
+If omitted there, the client runs until interrupted.
 .TP
 .B
 \fB\-d\fP
@@ -242,6 +252,7 @@ Peer port (Default: 3480).
 .B
 \fB\-z\fP
 Per\-session packet interval in milliseconds (Default: 20).
+In \fIpacket\fP and \fIinvalid\fP load\-generator modes the default is 0 ms.
 .TP
 .B
 \fB\-u\fP
@@ -275,6 +286,14 @@ the ORIGIN STUN attribute value.
 \fB\-a\fP
 Bandwidth for the bandwidth request in ALLOCATE. The default value is zero.
 .PP
+Notes for load\-generator mode:
+.IP
+\fIpacket\fP mode still performs the normal TURN allocation/setup and then starts sending immediately with no pacing.
+.IP
+\fIalloc\fP mode does not require \fB\-e\fP; it repeatedly establishes new allocations, uses a unique client local port for each one, attaches each one to a unique synthetic peer ip:port, and closes them again.
+.IP
+\fIinvalid\fP mode does not require \fB\-e\fP; by default it uses 16\-byte payloads unless \fB\-l\fP is specified.
+.PP
 See the examples in the "examples/scripts" directory.
 .SH ======================================
 
diff --git a/src/apps/uclient/mainuclient.c b/src/apps/uclient/mainuclient.c
index 89e38573..4fb5bda8 100644
--- a/src/apps/uclient/mainuclient.c
+++ b/src/apps/uclient/mainuclient.c
@@ -97,6 +97,8 @@ char origin[STUN_MAX_ORIGIN_SIZE + 1] = "\0";
 band_limit_t bps = 0;
 
 bool dual_allocation = false;
+bool unique_client_ports = false;
+uclient_load_mode load_mode = UCLIENT_LOAD_MODE_NONE;
 
 int oauth = 0;
 oauth_key okey_array[3];
@@ -108,6 +110,22 @@ static oauth_key_data_raw okdr_array[3] = {
 
 //////////////// local definitions /////////////////
 
+static uclient_load_mode parse_load_mode(const char *mode) {
+  /* Translate the -Y argument; NULL or unrecognized text yields NONE. */
+  static const struct {
+    const char *name;
+    uclient_load_mode value;
+  } known_modes[] = {{"packet", UCLIENT_LOAD_MODE_PACKET_FLOOD},
+                     {"alloc", UCLIENT_LOAD_MODE_ALLOC_FLOOD},
+                     {"invalid", UCLIENT_LOAD_MODE_INVALID_FLOOD}};
+  for (size_t i = 0; mode && i < sizeof(known_modes) / sizeof(known_modes[0]); ++i) {
+    if (strcmp(mode, known_modes[i].name) == 0) {
+      return known_modes[i].value;
+    }
+  }
+  return UCLIENT_LOAD_MODE_NONE;
+}
+
 static char Usage[] =
     "Usage: uclient [flags] [options] turn-server-ip-address\n"
     "Flags:\n"
@@ -138,6 +156,7 @@ static char Usage[] =
     "	-Z	Dual allocation (implies -c).\n"
     "	-J	Use oAuth with default test keys kid='north', 'union' or 'oldempire'.\n"
     "Options:\n"
+    "	-Y	<packet|alloc|invalid> Enable load-generator mode.\n"
     "	-l	Message length (Default: 100 Bytes).\n"
     "	-i	Certificate file (for secure connections only, optional).\n"
     "	-k	Private key file (for secure connections only).\n"
@@ -172,6 +191,9 @@ int main(int argc, char **argv) {
 
   char rest_api_separator = ':';
   bool use_null_cipher = false;
+  bool message_length_set = false;
+  bool message_count_set = false;
+  bool packet_interval_set = false;
 
 #if defined(WINDOWS)
 
@@ -200,7 +222,7 @@ int main(int argc, char **argv) {
 
   memset(local_addr, 0, sizeof(local_addr));
 
-  while ((c = getopt(argc, argv, "a:d:p:l:n:L:m:e:r:u:w:i:k:z:W:C:E:F:o:bZvsyhcxXgtTSAPDNOUMRIGBJ")) != -1) {
+  while ((c = getopt(argc, argv, "a:d:p:l:n:L:m:e:r:u:w:i:k:z:W:C:E:F:o:Y:bZvsyhcxXgtTSAPDNOUMRIGBJ")) != -1) {
     switch (c) {
     case 'J': {
 
@@ -232,6 +254,13 @@ int main(int argc, char **argv) {
     case 'a':
       bps = (band_limit_t)strtoul(optarg, NULL, 10);
       break;
+    case 'Y':
+      load_mode = parse_load_mode(optarg);
+      if (load_mode == UCLIENT_LOAD_MODE_NONE) {
+        fprintf(stderr, "Unknown load mode: %s\n", optarg);
+        exit(1);
+      }
+      break;
     case 'o':
       STRCPY(origin, optarg);
       break;
@@ -274,6 +303,7 @@ int main(int argc, char **argv) {
       negative_protocol_test = true;
       break;
     case 'z':
+      packet_interval_set = true;
       RTP_PACKET_INTERVAL = atoi(optarg);
       break;
     case 'Z':
@@ -298,12 +328,14 @@ int main(int argc, char **argv) {
       default_address_family = STUN_ATTRIBUTE_REQUESTED_ADDRESS_FAMILY_VALUE_IPV4;
       break;
     case 'l':
+      message_length_set = true;
       clmessage_length = atoi(optarg);
       break;
     case 's':
       do_not_use_channel = true;
       break;
     case 'n':
+      message_count_set = true;
       messagenumber = atoi(optarg);
       break;
     case 'p':
@@ -388,6 +420,31 @@ int main(int argc, char **argv) {
     no_rtcp = true;
   }
 
+  if (is_load_generator_mode()) {
+    no_rtcp = true;
+
+    if (!message_count_set) {
+      messagenumber = 0;
+    }
+
+    if ((is_packet_flood_mode() || is_invalid_flood_mode()) && !packet_interval_set) {
+      RTP_PACKET_INTERVAL = 0;
+    }
+
+    if (is_invalid_flood_mode() && !message_length_set) {
+      clmessage_length = 16;
+    }
+
+    if (is_alloc_flood_mode()) {
+      unique_client_ports = true;
+    }
+
+    if (c2c) {
+      fprintf(stderr, "Load-generator mode does not support -y client-to-client mode\n");
+      exit(1);
+    }
+  }
+
   if (g_use_auth_secret_with_timestamp) {
 
     {
@@ -453,14 +510,19 @@ int main(int argc, char **argv) {
     }
   }
 
-  if (clmessage_length < (int)sizeof(message_info)) {
+  if (!is_invalid_flood_mode() && clmessage_length < (int)sizeof(message_info)) {
     clmessage_length = (int)sizeof(message_info);
   }
 
+  if (is_invalid_flood_mode() && clmessage_length < 1) {
+    clmessage_length = 1;
+  }
+
   const int max_header = 100;
-  if (clmessage_length > (int)(STUN_BUFFER_SIZE - max_header)) {
-    fprintf(stderr, "Message length was corrected to %d\n", (STUN_BUFFER_SIZE - max_header));
-    clmessage_length = (int)(STUN_BUFFER_SIZE - max_header);
+  const int max_message_length = is_invalid_flood_mode() ? (int)STUN_BUFFER_SIZE : (int)(STUN_BUFFER_SIZE - max_header);
+  if (clmessage_length > max_message_length) {
+    fprintf(stderr, "Message length was corrected to %d\n", max_message_length);
+    clmessage_length = max_message_length;
   }
 
   if (optind >= argc) {
@@ -468,7 +530,7 @@ int main(int argc, char **argv) {
     exit(-1);
   }
 
-  if (!c2c) {
+  if (!c2c && !is_alloc_flood_mode() && !is_invalid_flood_mode()) {
     if (!peer_address[0]) {
       fprintf(stderr, "Either -e peer_address or -y must be specified\n");
       return -1;
diff --git a/src/apps/uclient/startuclient.c b/src/apps/uclient/startuclient.c
index ec4a6fd2..945417e7 100644
--- a/src/apps/uclient/startuclient.c
+++ b/src/apps/uclient/startuclient.c
@@ -62,9 +62,19 @@ static const int never_allocate_rtcp = 0;
 
 static const unsigned char kALPNProtos[] = "\x08http/1.1\x09stun.turn\x12stun.nat-discovery";
 static const size_t kALPNProtosLen = sizeof(kALPNProtos) - 1;
+static uint16_t next_unique_local_port = 49152;
 
 /////////////////////////////////////////
 
+/* Return the next port of the rotating 49152..65535 pool used when unique client ports are requested. */
+static uint16_t allocate_unique_local_port(void) {
+  const uint16_t assigned = next_unique_local_port++;
+
+  /* The 16-bit counter wraps to 0 after 65535; any value below 49152 restarts the pool. */
+  next_unique_local_port = (next_unique_local_port < 49152) ? 49152 : next_unique_local_port;
+  return assigned;
+}
+
 int rare_event(void) {
   if (dos) {
     return (((unsigned long)turn_random_number()) % 1000 == 777);
@@ -160,7 +170,7 @@ static SSL *tls_connect(ioa_socket_raw fd, ioa_addr *remote_addr, bool *try_agai
       switch (SSL_get_error(ssl, rc)) {
       case SSL_ERROR_WANT_READ:
       case SSL_ERROR_WANT_WRITE:
-        if (!dos) {
+        if (!dos && !is_load_generator_mode()) {
           usleep(1000);
         }
         continue;
@@ -216,6 +226,7 @@ static int clnet_connect(uint16_t clnet_remote_port, const char *remote_address,
 
   ioa_addr local_addr;
   int connect_cycle = 0;
+  int bind_cycle = 0;
 
   ioa_addr remote_addr;
 
@@ -261,16 +272,36 @@ start_socket:
       }
     }
 
-    addr_bind(clnet_fd, &local_addr, 0, 1, get_socket_type());
-
-  } else if (strlen(local_address) > 0) {
-
-    if (make_ioa_addr((const uint8_t *)local_address, 0, &local_addr) < 0) {
+    if (addr_bind(clnet_fd, &local_addr, 0, 1, get_socket_type()) < 0) {
       socket_closesocket(clnet_fd);
       return -1;
     }
 
-    addr_bind(clnet_fd, &local_addr, 0, 1, get_socket_type());
+  } else if (strlen(local_address) > 0 || unique_client_ports) {
+
+    const char *bind_address = local_address;
+    if (!bind_address[0]) {
+      bind_address = (remote_addr.ss.sa_family == AF_INET6) ? "::" : "0.0.0.0";
+    }
+
+    if (make_ioa_addr((const uint8_t *)bind_address, 0, &local_addr) < 0) {
+      socket_closesocket(clnet_fd);
+      return -1;
+    }
+
+    if (unique_client_ports) {
+      addr_set_port(&local_addr, allocate_unique_local_port());
+    }
+
+    const int bind_debug = unique_client_ports ? 0 : 1;
+    if (addr_bind(clnet_fd, &local_addr, 0, bind_debug, get_socket_type()) < 0) {
+      const int bind_err = socket_errno();
+      socket_closesocket(clnet_fd);
+      if (unique_client_ports && bind_err == EADDRINUSE && bind_cycle++ < MAX_CONNECT_EFFORTS) {
+        goto start_socket;
+      }
+      return -1;
+    }
   }
 
   int connect_err = 0;
@@ -307,7 +338,7 @@ start_socket:
     addr_debug_print(verbose, &remote_addr, "Connected to");
   }
 
-  if (!dos) {
+  if (!dos && !is_load_generator_mode()) {
     usleep(500);
   }
 
@@ -943,6 +974,62 @@ beg_cp:
   return 0;
 }
 
+int turn_refresh_allocation(bool verbose, app_ur_conn_info *clnet_info, uint32_t lifetime) {
+  /* Send a REFRESH request carrying `lifetime` and wait for the answer; returns 0 on success, -1 on failure. */
+  stun_buffer request_message, response_message;
+
+beg_refresh:
+  /* Each pass (first try or challenge retry) rebuilds the request; NOTE(review): retries are not capped. */
+  stun_init_request(STUN_METHOD_REFRESH, &request_message);
+  uint32_t lt = htonl(lifetime);
+  stun_attr_add(&request_message, STUN_ATTRIBUTE_LIFETIME, (const char *)&lt, 4);
+
+  add_origin(&request_message);
+
+  if (add_integrity(clnet_info, &request_message) < 0) {
+    return -1;
+  }
+
+  stun_attr_add_fingerprint_str(request_message.buf, &(request_message.len));
+
+  if (send_buffer(clnet_info, &request_message, 0, 0) <= 0) {
+    return -1;
+  }
+  /* Keep reading until a success, challenge or error response is recognized, or receiving fails. */
+  while (true) {
+    const int len = recv_buffer(clnet_info, &response_message, 1, 0, NULL, &request_message);
+    if (len <= 0) {
+      return -1;
+    }
+
+    response_message.len = len;
+
+    int err_code = 0;
+    uint8_t err_msg[129];
+    /* When clnet_info holds a nonce, a success response must also pass check_integrity(). */
+    if (stun_is_success_response(&response_message)) {
+      if (clnet_info->nonce[0]) {
+        if (check_integrity(clnet_info, &response_message) < 0) {
+          return -1;
+        }
+      }
+      if (verbose) {
+        TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "refresh success: lifetime=%u\n", lifetime);
+      }
+      return 0;
+    } else if (stun_is_challenge_response_str(response_message.buf, response_message.len, &err_code, err_msg,
+                                              sizeof(err_msg), clnet_info->realm, clnet_info->nonce,
+                                              clnet_info->server_name, &(clnet_info->oauth))) {
+      goto beg_refresh; /* challenge response: build and send the request again */
+    } else if (stun_is_error_response(&response_message, &err_code, err_msg, sizeof(err_msg))) {
+      if (verbose) {
+        TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "refresh error %d (%s)\n", err_code, (char *)err_msg);
+      }
+      return -1;
+    }
+  }
+}
+
 int start_connection(uint16_t clnet_remote_port0, const char *remote_address0, const unsigned char *ifname,
                      const char *local_address, bool verbose, app_ur_conn_info *clnet_info_probe,
                      app_ur_conn_info *clnet_info, uint16_t *chn, app_ur_conn_info *clnet_info_rtcp,
@@ -1170,6 +1257,38 @@ int start_connection(uint16_t clnet_remote_port0, const char *remote_address0, c
   return 0;
 }
 
+/* Connect and allocate a relay; when peer_addr is given also install a permission. Returns 0 or -1. */
+int start_allocate_only_connection(uint16_t clnet_remote_port0, const char *remote_address0,
+                                   const unsigned char *ifname, const char *local_address, bool verbose,
+                                   app_ur_conn_info *clnet_info_probe, app_ur_conn_info *clnet_info,
+                                   ioa_addr *peer_addr) {
+  ioa_addr allocated_relay; /* handed to clnet_allocate(); never read in this function */
+
+  UNUSED_ARG(clnet_info_probe);
+
+  /* A failed connect terminates the process instead of returning. */
+  if (clnet_connect(clnet_remote_port0, remote_address0, ifname, local_address, verbose, clnet_info) < 0) {
+    exit(-1);
+  }
+
+  if (clnet_allocate(verbose, clnet_info, &allocated_relay, default_address_family, NULL, NULL) < 0) {
+    return -1;
+  }
+
+  if (!peer_addr) {
+    return 0;
+  }
+
+  /* Remember the peer on the connection before requesting the permission for it. */
+  addr_cpy(&(clnet_info->peer_addr), peer_addr);
+  return (turn_create_permission(verbose, clnet_info, peer_addr, 1) < 0) ? -1 : 0;
+}
+/* Connect only: forwards to clnet_connect() and returns its result; no other request is made here. */
+int start_raw_connection(uint16_t clnet_remote_port, const char *remote_address, const unsigned char *ifname,
+                         const char *local_address, bool verbose, app_ur_conn_info *clnet_info) {
+  return clnet_connect(clnet_remote_port, remote_address, ifname, local_address, verbose, clnet_info);
+}
+
 int start_c2c_connection(uint16_t clnet_remote_port0, const char *remote_address0, const unsigned char *ifname,
                          const char *local_address, bool verbose, app_ur_conn_info *clnet_info_probe,
                          app_ur_conn_info *clnet_info1, uint16_t *chn1, app_ur_conn_info *clnet_info1_rtcp,
diff --git a/src/apps/uclient/startuclient.h b/src/apps/uclient/startuclient.h
index d351ab84..f17289b7 100644
--- a/src/apps/uclient/startuclient.h
+++ b/src/apps/uclient/startuclient.h
@@ -61,6 +61,15 @@ int start_connection(uint16_t clnet_remote_port, const char *remote_address, con
                      app_ur_conn_info *clnet_info, uint16_t *chn, app_ur_conn_info *clnet_info_rtcp,
                      uint16_t *chn_rtcp);
 
+int start_allocate_only_connection(uint16_t clnet_remote_port, const char *remote_address, const unsigned char *ifname,
+                                   const char *local_address, bool verbose, app_ur_conn_info *clnet_info_probe,
+                                   app_ur_conn_info *clnet_info, ioa_addr *peer_addr);
+
+int start_raw_connection(uint16_t clnet_remote_port, const char *remote_address, const unsigned char *ifname,
+                         const char *local_address, bool verbose, app_ur_conn_info *clnet_info);
+
+int turn_refresh_allocation(bool verbose, app_ur_conn_info *clnet_info, uint32_t lifetime);
+
 int turn_tcp_connect(bool verbose, app_ur_conn_info *clnet_info, ioa_addr *peer_addr);
 
 void tcp_data_connect(app_ur_session *elem, uint32_t cid);
diff --git a/src/apps/uclient/uclient.c b/src/apps/uclient/uclient.c
index 03328360..3dd247d8 100644
--- a/src/apps/uclient/uclient.c
+++ b/src/apps/uclient/uclient.c
@@ -62,6 +62,11 @@ static uint64_t tot_send_bytes = 0;
 static uint32_t tot_recv_messages = 0;
 static uint64_t tot_recv_bytes = 0;
 static uint64_t tot_send_dropped = 0;
+static uint64_t tot_allocations = 0;
+static uint64_t load_sent_packets = 0;
+static uint64_t load_last_sent_packets = 0;
+static uint64_t load_last_report_time = 0;
+static uint64_t synthetic_peer_counter = 0;
 
 struct event_base *client_event_base = NULL;
 
@@ -97,6 +102,74 @@ static uint64_t max_jitter = 0;
 
 static bool show_statistics = false;
 
+/* False only in invalid-packet flood mode; every other mode reports true. */
+static bool uses_turn_allocation(void) { return load_mode != UCLIENT_LOAD_MODE_INVALID_FLOOD; }
+/* True when a load-generator session has no message cap, i.e. its tot_msgnum is zero or negative. */
+static bool uses_unlimited_message_count(const app_ur_session *elem) {
+  return elem && (elem->tot_msgnum <= 0) && is_load_generator_mode();
+}
+/* Upper bound on sends per timer pass: 4096 in packet or invalid flood mode, 50 otherwise. */
+static int get_send_burst_limit(void) { return (is_packet_flood_mode() || is_invalid_flood_mode()) ? 4096 : 50; }
+/* Payload size for invalid-flood packets: clmessage_length clamped to 1..STUN_BUFFER_SIZE. */
+static size_t get_invalid_packet_length(void) {
+  const bool too_small = clmessage_length < 1;
+  const bool too_large = clmessage_length > (int)STUN_BUFFER_SIZE;
+  if (too_small) {
+    return 1;
+  }
+  return too_large ? (size_t)STUN_BUFFER_SIZE : (size_t)clmessage_length;
+}
+
+/* Zero both packet counters and restart the reporting window at current_time. */
+static void reset_load_generator_rate_stats(void) {
+  load_last_report_time = current_time;
+  load_sent_packets = load_last_sent_packets = 0;
+}
+
+/*
+ * Log the send rate since the previous report, prefixed by `context`, then start a new reporting window.
+ */
+static void print_load_generator_rate(const char *context) {
+  /* Nothing to report outside load-generator mode or before current_time has advanced. */
+  if (!is_load_generator_mode() || (current_time <= load_last_report_time)) {
+    return;
+  }
+
+  const uint64_t window = current_time - load_last_report_time;
+  const uint64_t sent_in_window = load_sent_packets - load_last_sent_packets;
+  const double pps = (double)sent_in_window / (double)window;
+
+  TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "%s: send_pps=%.2f, total_packets=%llu\n", context, pps,
+                (unsigned long long)load_sent_packets);
+  /* Remember where this report ended so the next one covers only newer traffic. */
+  load_last_sent_packets = load_sent_packets;
+  load_last_report_time = current_time;
+}
+
+/* Fill peer_addr with the next synthetic peer: 198.18.0.0/15, or 2001:db8::/32 when IPv6 is the default family. */
+static void generate_unique_allocation_peer(ioa_addr *peer_addr) {
+  if (!peer_addr) {
+    return;
+  }
+  /* Ports cycle through 1024..65535; every completed cycle advances the host part of the address. */
+  const uint64_t ports_per_host = (uint64_t)(0x10000 - 1024);
+  const uint64_t seq = synthetic_peer_counter++;
+  const uint64_t cycle = seq / ports_per_host;
+  const uint16_t port = (uint16_t)(1024 + (seq % ports_per_host));
+  char text[129];
+  if (default_address_family == STUN_ATTRIBUTE_REQUESTED_ADDRESS_FAMILY_VALUE_IPV6) {
+    snprintf(text, sizeof(text), "2001:db8:%x:%x::1", (unsigned int)((cycle >> 16) & 0xffffU),
+             (unsigned int)(cycle & 0xffffU));
+  } else {
+    const uint64_t host = 1 + cycle; /* the first IPv4 peer is 198.18.0.1 */
+    snprintf(text, sizeof(text), "198.%u.%u.%u", 18 + (unsigned int)((host >> 16) & 0x1U),
+             (unsigned int)((host >> 8) & 0xffU), (unsigned int)(host & 0xffU));
+  }
+  if (make_ioa_addr((const uint8_t *)text, port, peer_addr) < 0) {
+    addr_set_any(peer_addr);
+  }
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 
 static void __turn_getMSTime(void) {
@@ -307,6 +380,10 @@ int send_buffer(app_ur_conn_info *clnet_info, stun_buffer *message, bool data_co
     ret = (int)message->len;
   }
 
+  if ((ret > 0) && is_load_generator_mode()) {
+    ++load_sent_packets;
+  }
+
   return ret;
 }
 
@@ -878,12 +955,27 @@ static int client_write(app_ur_session *elem) {
 
   elem->ctime = current_time;
 
-  message_info *mi = (message_info *)buffer_to_send;
-  mi->msgnum = elem->wmsgnum;
-  mi->mstime = current_mstime;
   app_tcp_conn_info *atc = NULL;
+  size_t payload_len = (size_t)clmessage_length;
 
-  if (is_TCP_relay()) {
+  if (is_invalid_flood_mode()) {
+    payload_len = get_invalid_packet_length();
+    memset(elem->out_buffer.buf, 0xA5, payload_len);
+    if (payload_len >= 8) {
+      elem->out_buffer.buf[0] = 0x00;
+      elem->out_buffer.buf[1] = 0x01;
+      elem->out_buffer.buf[2] = 0x7f;
+      elem->out_buffer.buf[3] = 0x7f;
+      memcpy(elem->out_buffer.buf + 4, &(elem->wmsgnum), sizeof(elem->wmsgnum));
+    }
+    elem->out_buffer.len = payload_len;
+  } else {
+    message_info *mi = (message_info *)buffer_to_send;
+    mi->msgnum = elem->wmsgnum;
+    mi->mstime = current_mstime;
+  }
+
+  if (!is_invalid_flood_mode() && is_TCP_relay()) {
 
     memcpy(elem->out_buffer.buf, buffer_to_send, clmessage_length);
     elem->out_buffer.len = clmessage_length;
@@ -893,7 +985,7 @@ static int client_write(app_ur_session *elem) {
         ++elem->wmsgnum;
         elem->to_send_timems += RTP_PACKET_INTERVAL;
         tot_send_messages++;
-        tot_send_bytes += clmessage_length;
+        tot_send_bytes += payload_len;
       }
       return 0;
     }
@@ -907,11 +999,11 @@ static int client_write(app_ur_session *elem) {
       printf("%s: Uninitialized atc: i=%d, atc=%p\n", __FUNCTION__, i, atc);
       return -1;
     }
-  } else if (!do_not_use_channel) {
+  } else if (!is_invalid_flood_mode() && !do_not_use_channel) {
     /* Let's always do padding: */
     stun_init_channel_message(elem->chnum, &(elem->out_buffer), clmessage_length, mandatory_channel_padding || use_tcp);
     memcpy(elem->out_buffer.buf + 4, buffer_to_send, clmessage_length);
-  } else {
+  } else if (!is_invalid_flood_mode()) {
     stun_init_indication(STUN_METHOD_SEND, &(elem->out_buffer));
     stun_attr_add(&(elem->out_buffer), STUN_ATTRIBUTE_DATA, buffer_to_send, clmessage_length);
     stun_attr_add_addr(&(elem->out_buffer), STUN_ATTRIBUTE_XOR_PEER_ADDRESS, &(elem->pinfo.peer_addr));
@@ -940,7 +1032,7 @@ static int client_write(app_ur_session *elem) {
         TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "wrote %d bytes\n", (int)rc);
       }
       tot_send_messages++;
-      tot_send_bytes += clmessage_length;
+      tot_send_bytes += payload_len;
     } else {
       return -1;
     }
@@ -990,6 +1082,33 @@ void client_input_handler(evutil_socket_t fd, short what, void *arg) {
   }
 }
 
+/*
+ * Read-event callback that reads and throws away whatever arrives for a session in UR_STATE_READY.
+ */
+static void client_discard_input_handler(evutil_socket_t fd, short what, void *arg) {
+  app_ur_session *session = (app_ur_session *)arg;
+  uint8_t scratch[STUN_BUFFER_SIZE];
+
+  UNUSED_ARG(fd);
+
+  if (!(what & EV_READ) || !session || (session->state != UR_STATE_READY)) {
+    return;
+  }
+
+  /* Read until no more data comes back; a negative result is retried only while socket_eintr() is true. */
+  if (session->pinfo.ssl) {
+    int got = 0;
+    do {
+      got = SSL_read(session->pinfo.ssl, scratch, (int)sizeof(scratch));
+    } while ((got > 0) || ((got < 0) && socket_eintr()));
+  } else if (session->pinfo.fd >= 0) {
+    ssize_t got = 0;
+    do {
+      got = recv(session->pinfo.fd, scratch, sizeof(scratch), 0);
+    } while ((got > 0) || ((got < 0) && socket_eintr()));
+  }
+}
+
 static void run_events(int short_burst) {
   struct timeval timeout;
 
@@ -1008,6 +1127,33 @@ static void run_events(int short_burst) {
 
 ////////////////////// main method /////////////////
 
+/* Open a raw connection for invalid-packet flooding and register it as session i; 0 on success, -1 on failure. */
+static int start_invalid_client(const char *remote_address, uint16_t port, const unsigned char *ifname,
+                                const char *local_address, int messagenumber, int i) {
+  app_ur_session *ss = create_new_ss();
+  app_ur_conn_info *clnet_info = &(ss->pinfo);
+
+  if (start_raw_connection(port, remote_address, ifname, local_address, clnet_verbose, clnet_info) < 0) {
+    exit(-1);
+  }
+  socket_set_nonblocking(clnet_info->fd);
+
+  /* Incoming data is only drained. event_new() yields NULL on failure, so report that instead of using it. */
+  struct event *ev =
+      event_new(client_event_base, clnet_info->fd, EV_READ | EV_PERSIST, client_discard_input_handler, ss);
+  if (!ev || (event_add(ev, NULL) < 0)) {
+    return -1;
+  }
+
+  ss->state = UR_STATE_READY;
+  ss->input_ev = ev;
+  ss->tot_msgnum = messagenumber;
+  ss->recvmsgnum = -1;
+  ss->chnum = 0;
+  elems[i] = ss;
+  return 0;
+}
+
 static int start_client(const char *remote_address, uint16_t port, const unsigned char *ifname,
                         const char *local_address, int messagenumber, int i) {
 
@@ -1092,6 +1238,72 @@ static int start_client(const char *remote_address, uint16_t port, const unsigne
   return 0;
 }
 
+/* Allocation flood: connect, allocate, add a permission for a synthetic peer, refresh with lifetime 0, drop the
+ * session data; repeats allocation_count * mclient times, or without limit when allocation_count <= 0. */
+static void start_allocation_flood(const char *remote_address, uint16_t port, const unsigned char *ifname,
+                                   const char *local_address, int allocation_count, int mclient) {
+  const bool unlimited = allocation_count <= 0;
+  const uint64_t per_client_target = unlimited ? 0 : (uint64_t)allocation_count;
+  const uint64_t total_target = unlimited ? 0 : (per_client_target * (uint64_t)mclient);
+  uint64_t failed_releases = 0;
+
+  __turn_getMSTime();
+  const uint64_t start_time = current_time;
+  tot_allocations = 0;
+  synthetic_peer_counter = 0;
+  reset_load_generator_rate_stats();
+
+  while (unlimited || (tot_allocations < total_target)) {
+    for (int i = 0; i < mclient; ++i) {
+      app_ur_conn_info clnet_info_probe;
+      app_ur_conn_info clnet_info;
+      ioa_addr synthetic_peer_addr;
+      memset(&clnet_info_probe, 0, sizeof(clnet_info_probe));
+      memset(&clnet_info, 0, sizeof(clnet_info));
+      memset(&synthetic_peer_addr, 0, sizeof(synthetic_peer_addr));
+      clnet_info_probe.fd = -1;
+      clnet_info.fd = -1;
+      generate_unique_allocation_peer(&synthetic_peer_addr);
+      if (start_allocate_only_connection(port, remote_address, ifname, local_address, clnet_verbose, &clnet_info_probe,
+                                         &clnet_info, &synthetic_peer_addr) < 0) {
+        exit(-1);
+      }
+
+      /* Best effort: a failed lifetime-0 refresh does not stop the flood, but it is counted for the summary. */
+      if (turn_refresh_allocation(clnet_verbose, &clnet_info, 0) < 0) {
+        ++failed_releases;
+      }
+      app_ur_session ss_probe;
+      app_ur_session ss_alloc;
+      memset(&ss_probe, 0, sizeof(ss_probe));
+      memset(&ss_alloc, 0, sizeof(ss_alloc));
+      ss_probe.pinfo = clnet_info_probe;
+      ss_alloc.pinfo = clnet_info;
+      if (ss_probe.pinfo.fd >= 0 || ss_probe.pinfo.ssl) {
+        uc_delete_session_elem_data(&ss_probe);
+      }
+      uc_delete_session_elem_data(&ss_alloc);
+      ++tot_allocations;
+      __turn_getMSTime();
+      if (show_statistics) {
+        print_load_generator_rate(__FUNCTION__);
+        TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "%s: total_allocations=%llu\n", __FUNCTION__,
+                      (unsigned long long)tot_allocations);
+        show_statistics = false;
+      }
+      if (!unlimited && (tot_allocations >= total_target)) {
+        break;
+      }
+    }
+  }
+
+  __turn_getMSTime();
+  print_load_generator_rate(__FUNCTION__);
+  TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "%s: total_allocations=%llu\n", __FUNCTION__, (unsigned long long)tot_allocations);
+  TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "%s: failed_releases=%llu\n", __FUNCTION__, (unsigned long long)failed_releases);
+  TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "Total allocation flood time is %u\n", (unsigned int)(current_time - start_time));
+}
+
 static int start_c2c(const char *remote_address, uint16_t port, const unsigned char *ifname, const char *local_address,
                      int messagenumber, int i) {
 
@@ -1303,7 +1515,7 @@ static int refresh_channel(app_ur_session *elem, uint16_t method, uint32_t lt) {
 
 static inline int client_timer_handler(app_ur_session *elem, int *done) {
   if (elem) {
-    if (!turn_time_before(current_mstime, elem->refresh_time)) {
+    if (uses_turn_allocation() && !turn_time_before(current_mstime, elem->refresh_time)) {
       refresh_channel(elem, 0, 600);
     }
 
@@ -1311,15 +1523,17 @@ static inline int client_timer_handler(app_ur_session *elem, int *done) {
       return 0;
     }
 
-    int max_num = 50;
+    const bool unlimited = uses_unlimited_message_count(elem);
+    int max_num = get_send_burst_limit();
     int cur_num = 0;
 
     while (!turn_time_before(current_mstime, elem->to_send_timems)) {
       if (cur_num++ >= max_num) {
         break;
       }
-      if (elem->wmsgnum >= elem->tot_msgnum) {
-        if (!turn_time_before(current_mstime, elem->finished_time) || (tot_recv_messages >= tot_messages)) {
+      if (!unlimited && (elem->wmsgnum >= elem->tot_msgnum)) {
+        if (!turn_time_before(current_mstime, elem->finished_time) ||
+            (!is_invalid_flood_mode() && (tot_recv_messages >= tot_messages))) {
           /*
           TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO,"%s: elem=0x%x: 111.111: c=%d, t=%d, r=%d,
           w=%d\n",__FUNCTION__,(int)elem,elem->wait_cycles,elem->tot_msgnum,elem->rmsgnum,elem->wmsgnum);
@@ -1347,7 +1561,10 @@ static inline int client_timer_handler(app_ur_session *elem, int *done) {
         }
       } else {
         *done += 1;
-        client_write(elem);
+        if (client_write(elem) < 0) {
+          client_shutdown(elem);
+          return 1;
+        }
         elem->finished_time = current_mstime + STOPPING_TIME * 1000;
       }
     }
@@ -1393,6 +1610,11 @@ void start_mclient(const char *remote_address, uint16_t port, const unsigned cha
 
   total_clients = mclient;
 
+  if (is_alloc_flood_mode()) {
+    start_allocation_flood(remote_address, port, ifname, local_address, messagenumber, mclient);
+    return;
+  }
+
   if (c2c) {
     // mclient must be a multiple of 4:
     if (!no_rtcp) {
@@ -1416,6 +1638,7 @@ void start_mclient(const char *remote_address, uint16_t port, const unsigned cha
 
   __turn_getMSTime();
   uint32_t stime = current_time;
+  reset_load_generator_rate_stats();
 
   memset(buffer_to_send, 7, clmessage_length);
 
@@ -1426,7 +1649,7 @@ void start_mclient(const char *remote_address, uint16_t port, const unsigned cha
   if (c2c) {
     if (!no_rtcp) {
       for (int i = 0; i < (mclient >> 2); i++) {
-        if (!dos) {
+        if (!dos && !is_load_generator_mode()) {
           usleep(SLEEP_INTERVAL);
         }
         if (start_c2c(remote_address, port, ifname, local_address, messagenumber, i << 2) < 0) {
@@ -1436,7 +1659,7 @@ void start_mclient(const char *remote_address, uint16_t port, const unsigned cha
       }
     } else {
       for (int i = 0; i < (mclient >> 1); i++) {
-        if (!dos) {
+        if (!dos && !is_load_generator_mode()) {
           usleep(SLEEP_INTERVAL);
         }
         if (start_c2c(remote_address, port, ifname, local_address, messagenumber, i << 1) < 0) {
@@ -1448,7 +1671,7 @@ void start_mclient(const char *remote_address, uint16_t port, const unsigned cha
   } else {
     if (!no_rtcp) {
       for (int i = 0; i < (mclient >> 1); i++) {
-        if (!dos) {
+        if (!dos && !is_load_generator_mode()) {
           usleep(SLEEP_INTERVAL);
         }
         if (start_client(remote_address, port, ifname, local_address, messagenumber, i << 1) < 0) {
@@ -1458,10 +1681,13 @@ void start_mclient(const char *remote_address, uint16_t port, const unsigned cha
       }
     } else {
       for (int i = 0; i < mclient; i++) {
-        if (!dos) {
+        if (!dos && !is_load_generator_mode()) {
           usleep(SLEEP_INTERVAL);
         }
-        if (start_client(remote_address, port, ifname, local_address, messagenumber, i) < 0) {
+        const int rc = is_invalid_flood_mode()
+                           ? start_invalid_client(remote_address, port, ifname, local_address, messagenumber, i)
+                           : start_client(remote_address, port, ifname, local_address, messagenumber, i);
+        if (rc < 0) {
           exit(-1);
         }
         tot_clients++;
@@ -1481,7 +1707,7 @@ void start_mclient(const char *remote_address, uint16_t port, const unsigned cha
   struct timeval tv;
 
   tv.tv_sec = 0;
-  tv.tv_usec = 1000;
+  tv.tv_usec = (is_packet_flood_mode() || is_invalid_flood_mode()) ? 100 : 1000;
 
   evtimer_add(ev, &tv);
 
@@ -1550,7 +1776,11 @@ void start_mclient(const char *remote_address, uint16_t port, const unsigned cha
   stime = current_time;
 
   for (int i = 0; i < total_clients; i++) {
-    elems[i]->to_send_timems = current_mstime + 1000 + ((uint32_t)turn_random_number()) % 5000;
+    if (is_packet_flood_mode() || is_invalid_flood_mode()) {
+      elems[i]->to_send_timems = current_mstime;
+    } else {
+      elems[i]->to_send_timems = current_mstime + 1000 + ((uint32_t)turn_random_number()) % 5000;
+    }
   }
 
   tot_messages = elems[0]->tot_msgnum * total_clients;
@@ -1567,6 +1797,7 @@ void start_mclient(const char *remote_address, uint16_t port, const unsigned cha
     }
 
     if (show_statistics) {
+      print_load_generator_rate(__FUNCTION__);
       TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO,
                     "%s: msz=%d, tot_send_msgs=%lu, tot_recv_msgs=%lu, tot_send_bytes ~ %llu, tot_recv_bytes ~ %llu\n",
                     __FUNCTION__, msz, (unsigned long)tot_send_messages, (unsigned long)tot_recv_messages,
@@ -1575,6 +1806,9 @@ void start_mclient(const char *remote_address, uint16_t port, const unsigned cha
     }
   }
 
+  __turn_getMSTime();
+  print_load_generator_rate(__FUNCTION__);
+
   TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "%s: tot_send_msgs=%lu, tot_recv_msgs=%lu\n", __FUNCTION__,
                 (unsigned long)tot_send_messages, (unsigned long)tot_recv_messages);
 
@@ -1592,16 +1826,28 @@ void start_mclient(const char *remote_address, uint16_t port, const unsigned cha
   total_loss = tot_send_messages - tot_recv_messages;
 
   TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "Total transmit time is %u\n", ((unsigned int)(current_time - stime)));
-  TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "Total lost packets %llu (%f%c), total send dropped %llu (%f%c)\n",
-                (unsigned long long)total_loss, (((double)total_loss / (double)tot_send_messages) * 100.00), '%',
-                (unsigned long long)tot_send_dropped,
-                (((double)tot_send_dropped / (double)(tot_send_messages + tot_send_dropped)) * 100.00), '%');
-  TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "Average round trip delay %f ms; min = %lu ms, max = %lu ms\n",
-                ((double)total_latency / (double)((tot_recv_messages < 1) ? 1 : tot_recv_messages)),
-                (unsigned long)min_latency, (unsigned long)max_latency);
-  TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "Average jitter %f ms; min = %lu ms, max = %lu ms\n",
-                ((double)total_jitter / (double)tot_recv_messages), (unsigned long)min_jitter,
-                (unsigned long)max_jitter);
+  if (is_invalid_flood_mode()) {
+    TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "Total send dropped %llu (%f%c)\n", (unsigned long long)tot_send_dropped,
+                  (((double)tot_send_dropped /
+                    (double)((tot_send_messages + tot_send_dropped) ? (tot_send_messages + tot_send_dropped) : 1)) *
+                   100.00),
+                  '%');
+  } else {
+    TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "Total lost packets %llu (%f%c), total send dropped %llu (%f%c)\n",
+                  (unsigned long long)total_loss,
+                  (((double)total_loss / (double)(tot_send_messages ? tot_send_messages : 1)) * 100.00), '%',
+                  (unsigned long long)tot_send_dropped,
+                  (((double)tot_send_dropped /
+                    (double)((tot_send_messages + tot_send_dropped) ? (tot_send_messages + tot_send_dropped) : 1)) *
+                   100.00),
+                  '%');
+    TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "Average round trip delay %f ms; min = %lu ms, max = %lu ms\n",
+                  ((double)total_latency / (double)((tot_recv_messages < 1) ? 1 : tot_recv_messages)),
+                  (unsigned long)min_latency, (unsigned long)max_latency);
+    TURN_LOG_FUNC(TURN_LOG_LEVEL_INFO, "Average jitter %f ms; min = %lu ms, max = %lu ms\n",
+                  ((double)total_jitter / (double)((tot_recv_messages < 1) ? 1 : tot_recv_messages)),
+                  (unsigned long)min_jitter, (unsigned long)max_jitter);
+  }
 
   free(elems);
 }
diff --git a/src/apps/uclient/uclient.h b/src/apps/uclient/uclient.h
index 511f26ac..eb791922 100644
--- a/src/apps/uclient/uclient.h
+++ b/src/apps/uclient/uclient.h
@@ -46,6 +46,13 @@ extern "C" {
 
 //////////////////////////////////////////////
 
+typedef enum { /* load-generator mode selector; the current value is kept in the global load_mode */
+  UCLIENT_LOAD_MODE_NONE = 0,     /* no load generation; is_load_generator_mode() tests for any other value */
+  UCLIENT_LOAD_MODE_PACKET_FLOOD, /* matched by is_packet_flood_mode() */
+  UCLIENT_LOAD_MODE_ALLOC_FLOOD,  /* matched by is_alloc_flood_mode() */
+  UCLIENT_LOAD_MODE_INVALID_FLOOD /* matched by is_invalid_flood_mode() */
+} uclient_load_mode;
+
 #define STOPPING_TIME (10)
 #define STARTING_TCP_RELAY_TIME (30)
 
@@ -86,6 +93,8 @@ extern bool no_permissions;
 extern bool extra_requests;
 extern band_limit_t bps;
 extern bool dual_allocation;
+extern bool unique_client_ports;
+extern uclient_load_mode load_mode;
 
 extern char origin[STUN_MAX_ORIGIN_SIZE + 1];
 
@@ -96,6 +105,10 @@ extern oauth_key okey_array[3];
 #define OAUTH_SESSION_LIFETIME (555)
 
 #define is_TCP_relay() (relay_transport == STUN_ATTRIBUTE_TRANSPORT_TCP_VALUE)
+#define is_packet_flood_mode() (load_mode == UCLIENT_LOAD_MODE_PACKET_FLOOD)   /* packet flood selected */
+#define is_alloc_flood_mode() (load_mode == UCLIENT_LOAD_MODE_ALLOC_FLOOD)     /* allocation flood selected */
+#define is_invalid_flood_mode() (load_mode == UCLIENT_LOAD_MODE_INVALID_FLOOD) /* invalid-packet flood selected */
+#define is_load_generator_mode() (load_mode != UCLIENT_LOAD_MODE_NONE)         /* any load mode selected */
 
 void start_mclient(const char *remote_address, uint16_t port, const unsigned char *ifname, const char *local_address,
                    int messagenumber, int mclient);