diff --git a/src/quic_cc_cubic.c b/src/quic_cc_cubic.c index 0ed4a325e..81eda3396 100644 --- a/src/quic_cc_cubic.c +++ b/src/quic_cc_cubic.c @@ -3,34 +3,84 @@ #include #include +/* IMPORTANT NOTE about the units defined by the RFC 9438 + * (CUBIC for Fast and Long-Distance Networks): + * + * RFC 9438 4.1. Definitions: + * The unit of all window sizes in this document is segments of the SMSS, and + * the unit of all times is seconds. Implementations can use bytes to express + * window sizes, which would require factoring in the SMSS wherever necessary + * and replacing segments_acked (Figure 4) with the number of acknowledged + * bytes. + */ + +/* So, this is the reason why here in this implementation each time a number + * of segments is used (typically a congestion window value), its value is + * multiplied by the MTU value. + */ + /* This source file is highly inspired from Linux kernel source file * implementation for TCP Cubic. In fact, we have no choice if we do * not want to use any floating point operations to be fast! * (See net/ipv4/tcp_cubic.c) */ -#define CUBIC_BETA_SCALE 1024 -#define CUBIC_BETA_SCALE_SHIFT 10 -/* beta = 0.7 ; C = 0.4 */ -#define CUBIC_BETA 717 /* CUBIC_BETA / CUBIC_BETA_SCALE = 0.7 */ -#define CUBIC_C 410 /* CUBIC_C / CUBIC_BETA_SCALE = 0.4 */ +/* Constants definitions: + * CUBIC_BETA_SCALED refers to the scaled value of RFC 9438 beta_cubic variable. + * CUBIC_C_SCALED refers to the scaled value of RFC 9438 C variable. + */ -#define TIME_SCALE_FACTOR_SHIFT 10 +/* The right shifting value to apply to scaled values to get its real value. 
*/ +#define CUBIC_SCALE_FACTOR_SHIFT 10 -/* The maximum value which may be cubed an multiplied by CUBIC_BETA */ -#define CUBIC_DIFF_TIME_LIMIT 355535ULL /* ms */ +/* CUBIC multiplicative decrease factor as described in RFC 9438 section 4.6 */ +#define CUBIC_BETA_SCALED 717 /* beta_cubic = 0.7 (constant) */ -/* K cube factor: (1 - beta) / c */ +/* CUBIC C constant that determines the aggressiveness of CUBIC in competing + * with other congestion control algorithms in high-BDP networks. + */ +#define CUBIC_C_SCALED 410 /* RFC 9438 C = 0.4 segment/seconds^3 + * or 410 mB/s^3 in this implementation. + */ + +/* The scaled value of 1 */ +#define CUBIC_ONE_SCALED (1 << CUBIC_SCALE_FACTOR_SHIFT) + +/* The left bit shift to apply to convert seconds to 1024ths of a second (~ms). */ +#define TIME_SCALE_FACTOR_SHIFT 10 + +/* The maximum time value which may be cubed and multiplied by CUBIC_C_SCALED */ +#define CUBIC_TIME_LIMIT 355535ULL /* ms */ + +/* Per-connection CUBIC algorithm state. Note that the current congestion window + * value is not stored in this structure. + */ struct cubic { + /* QUIC_CC_ST_* state values. */ uint32_t state; + /* Slow start threshold (in bytes) */ uint32_t ssthresh; + /* Remaining number of acknowledged bytes between two ACKs for CUBIC congestion + * control window (in bytes). + */ uint32_t remaining_inc; - uint32_t remaining_tcp_inc; - uint32_t epoch_start; - uint32_t origin_point; + /* Time at which the current congestion avoidance stage started (in ms). */ + uint32_t t_epoch; + /* The window to reach for each recovery period during a concave region (in bytes). */ + uint32_t W_target; + /* The time period to reach W_target during a concave region (in ms). */ uint32_t K; + /* The last window maximum reached (in bytes). */ uint32_t last_w_max; - uint32_t tcp_wnd; + /* Estimated value of the Reno congestion window in the TCP-friendly region (in bytes). 
*/ + uint32_t W_est; + /* Remaining number of acknowledged bytes between two ACKs for estimated + * TCP-Reno congestion control window (in bytes). + */ + uint32_t remaining_W_est_inc; + /* Start time of recovery period (used to avoid re-entering this state, if already + * in recovery period) (in ms). + */ uint32_t recovery_start_time; }; @@ -42,12 +92,12 @@ static void quic_cc_cubic_reset(struct quic_cc *cc) c->state = QUIC_CC_ST_SS; c->ssthresh = QUIC_CC_INFINITE_SSTHESH; c->remaining_inc = 0; - c->remaining_tcp_inc = 0; - c->epoch_start = 0; - c->origin_point = 0; + c->remaining_W_est_inc = 0; + c->t_epoch = 0; + c->W_target = 0; c->K = 0; c->last_w_max = 0; - c->tcp_wnd = 0; + c->W_est = 0; c->recovery_start_time = 0; TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc); } @@ -93,42 +143,126 @@ static uint32_t cubic_root(uint64_t val) return x; } +/* + * RFC 9438 3.1. Principle 1 for the CUBIC Increase Function + * + * For better network utilization and stability, CUBIC [HRX08] uses a cubic + * window increase function in terms of the elapsed time from the last + * congestion event. While most congestion control algorithms that provide + * alternatives to Reno increase the congestion window using convex functions, + * CUBIC uses both the concave and convex profiles of a cubic function for + * window growth. + * + * After a window reduction in response to a congestion event detected by + * duplicate acknowledgments (ACKs), Explicit Congestion Notification-Echo + * (ECN-Echo (ECE)) ACKs [RFC3168], RACK-TLP for TCP [RFC8985], or QUIC loss + * detection [RFC9002], CUBIC remembers the congestion window size at which it + * received the congestion event and performs a multiplicative decrease of the + * congestion window. When CUBIC enters into congestion avoidance, it starts to + * increase the congestion window using the concave profile of the cubic + * function. 
The cubic function is set to have its plateau at the remembered + * congestion window size, so that the concave window increase continues until + * then. After that, the cubic function turns into a convex profile and the + * convex window increase begins. + * + * W_cubic(time) (bytes) + * ^ convex region + * | <-------------------------> + * | . + + * | . + + * | . + + * | . + + * | . + ^ + * | . + | W_cubic_t + * | . + | + * | . + | + * W_target |-----------+--------------------------+------------------------+ + * (W_max) | +. + . t + * | + . + . + * | + . + . + * | + . + . + * | + . + . + * | .+ . + * | + . + * | + . + * | + . + * | . . + * | . . + * | . . + * +-----------+--------------------------+-+------------------------> time (s) + * 0 t_epoch (t_epoch + K) + * <--------------------------> + * . concave region + * . + * congestion + * event + * + * RFC 9438 4.2. Window Increase Function: + * + * W_cubic(t) = C*(t-K)^3 + W_max (Figure 1) + * K = cubic_root((W_max - cwnd_epoch)/C) (Figure 2) + * + * +--------------------------------------------------------------------+ + * | RFC 9438 definitions | Code variables | + * +--------------------------------------------------------------------+ + * | C (segments/s^3) | CUBIC_C_SCALED (mB/s^3) | + * +--------------------------------------------------------------------+ + * | W_max (segments) | c->last_w_max - path->cwnd (bytes) | + * +--------------------------------------------------------------------+ + * | K (s) | c->K (ms) | + * +--------------------------------------------------------------------+ + * | beta_cubic (constant) | CUBIC_BETA_SCALED (constant) | + * +--------------------------------------------------------------------+ + */ static inline void quic_cubic_update(struct quic_cc *cc, uint32_t acked) { struct cubic *c = quic_cc_priv(cc); struct quic_cc_path *path = container_of(cc, struct quic_cc_path, cc); - /* Current cwnd as number of packets */ - uint32_t t, target, inc, inc_diff; - uint64_t delta, 
diff; + /* The elapsed time since the start of the congestion event. */ + uint32_t elapsed_time; + /* Target value of the congestion window. */ + uint32_t target; + /* The time at which the congestion window will be computed based + * on the cubic increase function. + */ + uint64_t t; + /* The computed value of the congestion window at time t based on the cubic + * increase function. + */ + uint64_t W_cubic_t; + uint32_t inc, inc_diff; TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc); - if (!c->epoch_start) { - c->epoch_start = now_ms; + if (!c->t_epoch) { + c->t_epoch = now_ms; if (c->last_w_max <= path->cwnd) { c->K = 0; - c->origin_point = path->cwnd; + c->W_target = path->cwnd; } else { - /* K = cubic_root((1 - beta) * W_max / C) */ + /* K value computing (in seconds): + * K = cubic_root((W_max - cwnd_epoch)/C) (Figure 2) + * Note that K is stored in 1024th of a second. + */ c->K = cubic_root((c->last_w_max - path->cwnd) * - (CUBIC_BETA_SCALE - CUBIC_BETA) / CUBIC_C / path->mtu) << TIME_SCALE_FACTOR_SHIFT; - c->origin_point = c->last_w_max; + (CUBIC_ONE_SCALED - CUBIC_BETA_SCALED) / CUBIC_C_SCALED / path->mtu) << TIME_SCALE_FACTOR_SHIFT; + c->W_target = c->last_w_max; } - c->tcp_wnd = path->cwnd; + c->W_est = path->cwnd; c->remaining_inc = 0; - c->remaining_tcp_inc = 0; + c->remaining_W_est_inc = 0; } - t = now_ms + path->loss.rtt_min - c->epoch_start; - if (t < c->K) { - diff = c->K - t; + elapsed_time = now_ms + path->loss.rtt_min - c->t_epoch; + if (elapsed_time < c->K) { + t = c->K - elapsed_time; } else { - diff = t - c->K; + t = elapsed_time - c->K; } - if (diff > CUBIC_DIFF_TIME_LIMIT) { + if (t > CUBIC_TIME_LIMIT) { /* TODO : should not happen if we handle the case * of very late acks receipt. 
This must be handled as a congestion * control event: a very late ack should trigger a congestion @@ -138,32 +272,90 @@ static inline void quic_cubic_update(struct quic_cc *cc, uint32_t acked) goto leave; } - delta = path->mtu * ((CUBIC_C * diff * diff * diff) >> (CUBIC_BETA_SCALE_SHIFT + 3 * TIME_SCALE_FACTOR_SHIFT)); - if (t < c->K) - target = c->origin_point - delta; + /* Compute W_cubic_t at t time. */ + W_cubic_t = path->mtu * ((CUBIC_C_SCALED * t * t * t) >> (CUBIC_SCALE_FACTOR_SHIFT + 3 * TIME_SCALE_FACTOR_SHIFT)); + if (elapsed_time < c->K) + target = c->W_target - W_cubic_t; else - target = c->origin_point + delta; + target = c->W_target + W_cubic_t; if (target > path->cwnd) { + /* Concave region */ + + /* RFC 9438 4.4. Concave Region + * + * When receiving a new ACK in congestion avoidance, if CUBIC is not in + * the Reno-friendly region and cwnd is less than Wmax, then CUBIC is + * in the concave region. In this region, cwnd MUST be incremented by + * (target - cwnd) / cwnd. + */ inc_diff = c->remaining_inc + path->mtu * (target - path->cwnd); c->remaining_inc = inc_diff % path->cwnd; inc = inc_diff / path->cwnd; } else { - /* small increment */ + /* Convex region: very small increment */ + + /* RFC 9438 4.5. Convex Region + * + * When receiving a new ACK in congestion avoidance, if CUBIC is not in + * the Reno-friendly region and cwnd is larger than or equal to Wmax, + * then CUBIC is in the convex region. The convex region indicates that + * the network conditions might have changed since the last congestion + * event, possibly implying more available bandwidth after some flow + * departures. Since the Internet is highly asynchronous, some amount + * of perturbation is always possible without causing a major change in + * available bandwidth. Unless the cwnd is overridden by the AIMD window + * increase, CUBIC will behave cautiously when operating in this region. 
+ * The convex profile aims to increase the window very slowly at the + * beginning when cwnd is around Wmax and then gradually increases its + * rate of increase. This region is also called the "maximum probing + * phase", since CUBIC is searching for a new Wmax. In this region, + * cwnd MUST be incremented by (target - cwnd) / cwnd for each received + * new ACK, where target is calculated as described in Section 4.2. + */ inc_diff = c->remaining_inc + path->mtu; c->remaining_inc = inc_diff % (100 * path->cwnd); inc = inc_diff / (100 * path->cwnd); } - inc_diff = c->remaining_tcp_inc + path->mtu * acked; - c->tcp_wnd += inc_diff / path->cwnd; - c->remaining_tcp_inc = inc_diff % path->cwnd; - /* TCP friendliness */ - if (c->tcp_wnd > path->cwnd) { - uint32_t tcp_inc = path->mtu * (c->tcp_wnd - path->cwnd) / path->cwnd; - if (tcp_inc > inc) - inc = tcp_inc; + inc_diff = c->remaining_W_est_inc + path->mtu * acked; + c->W_est += inc_diff / path->cwnd; + c->remaining_W_est_inc = inc_diff % path->cwnd; + + /* TCP friendliness : + * RFC 9438 4.3. Reno-Friendly Region + * + * Reno performs well in certain types of networks -- for example, under + * short RTTs and small bandwidths (or small BDPs). In these networks, + * CUBIC remains in the Reno-friendly region to achieve at least the same + * throughput as Reno. + * + * When receiving a new ACK in congestion avoidance (where cwnd could be + * greater than or less than Wmax), CUBIC checks whether Wcubic(t) is less + * than West. If so, CUBIC is in the Reno-friendly region and cwnd SHOULD + * be set to West at each reception of a new ACK. + * + * West is set equal to cwnd_epoch at the start of the congestion avoidance + * stage. After that, on every new ACK, West is updated using Figure 4. + * Note that this equation uses segments_acked and cwnd is measured in + * segments. An implementation that measures cwnd in bytes should adjust the + * equation accordingly using the number of acknowledged bytes and the SMSS. 
+ * Also note that this equation works for connections with enabled or + * disabled delayed ACKs [RFC5681], as segments_acked will be different based + * on the segments actually acknowledged by a new ACK. + * + * Figure 4 : West = West + alpha_cubic * (segments_acked / cwnd) + * + * Once West has grown to reach the cwnd at the time of most recently + * setting ssthresh -- that is, West >= cwndprior -- the sender SHOULD set + * alpha_cubic to 1 to ensure that it can achieve the same congestion window + * increment rate as Reno, which uses AIMD(1, 0.5). + */ + if (c->W_est > path->cwnd) { + uint32_t W_est_inc = path->mtu * (c->W_est - path->cwnd) / path->cwnd; + if (W_est_inc > inc) + inc = W_est_inc; } path->cwnd += inc; @@ -187,17 +379,34 @@ static void quic_enter_recovery(struct quic_cc *cc) /* Current cwnd as number of packets */ TRACE_ENTER(QUIC_EV_CONN_CC, cc->qc); - c->epoch_start = 0; + c->t_epoch = 0; c->recovery_start_time = now_ms; - /* Fast convergence */ + + /* RFC 9438 4.7. Fast Convergence + * + * To improve convergence speed, CUBIC uses a heuristic. When a new flow + * joins the network, existing flows need to give up some of their bandwidth + * to allow the new flow some room for growth if the existing flows have + * been using all the network bandwidth. To speed up this bandwidth release + * by existing flows, the following fast convergence mechanism SHOULD be + * implemented. With fast convergence, when a congestion event occurs, Wmax + * is updated as follows, before the window reduction described in Section + * 4.6. 
+ * + * if cwnd < Wmax and fast convergence enabled, further reduce Wmax: + * Wmax = cwnd * (1 + beta_cubic) / 2 + * otherwise, remember cwnd before reduction: + * Wmax = cwnd + */ if (path->cwnd < c->last_w_max) { - /* (1 + beta) * path->cwnd / 2 */ - c->last_w_max = (path->cwnd * (CUBIC_BETA_SCALE + CUBIC_BETA) / 2) >> CUBIC_BETA_SCALE_SHIFT; + /* (1 + beta_cubic) * path->cwnd / 2 */ + c->last_w_max = (path->cwnd * (CUBIC_ONE_SCALED + CUBIC_BETA_SCALED) / 2) >> CUBIC_SCALE_FACTOR_SHIFT; } else { c->last_w_max = path->cwnd; } - c->ssthresh = (CUBIC_BETA * path->cwnd) >> CUBIC_BETA_SCALE_SHIFT; + + c->ssthresh = (CUBIC_BETA_SCALED * path->cwnd) >> CUBIC_SCALE_FACTOR_SHIFT; path->cwnd = QUIC_MAX(c->ssthresh, (uint32_t)path->min_cwnd); c->state = QUIC_CC_ST_RP; TRACE_LEAVE(QUIC_EV_CONN_CC, cc->qc, NULL, cc); @@ -269,7 +478,7 @@ static void quic_cc_cubic_rp_cb(struct quic_cc *cc, struct quic_cc_event *ev) switch (ev->type) { case QUIC_CC_EVT_ACK: - /* RFC 9022 7.3.2. Recovery + /* RFC 9002 7.3.2. Recovery * A recovery period ends and the sender enters congestion avoidance when a * packet sent during the recovery period is acknowledged. */