From d8f1ff8648b1705957f94680af354d4173c6d5fe Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Mon, 4 Mar 2024 18:41:39 +0100 Subject: [PATCH] BUG/MEDIUM: quic: fix connection freeze on post handshake After handshake completion, QUIC server is responsible to emit HANDSHAKE_DONE frame. Some clients wait for it to begin STREAM transfers. Previously, there was no explicit tasklet_wakeup() after handshake completion, which is necessary to emit post-handshake frames. In most cases, this was undetected as most client continue emission which will reschedule the tasklet. However, as there is no tasklet_wakeup(), this is not a consistent behavior. If this bug occurs, it causes a connection freeze, preventing the client to emit any request. The connection is finally closed on idle timeout. To fix this, add an explicit tasklet_wakeup() after handshake completion. It sounds simple enough but in fact it's difficult to find the correct location efor tasklet_wakeup() invocation, as post-handshake is directly linked to connection accept, with different orderings. Notably, if 0-RTT is used, connection can be accepted prior handshake completion. Another major point is that along HANDSHAKE_DONE frame, a series of NEW_CONNECTION_ID frames are emitted. However, these new CIDs allocation must occur after connection is migrated to its new thread as these CIDs are tied to it. A BUG_ON() is present to check this in qc_set_tid_affinity(). With all this in mind, 2 locations were selected for the necessary tasklet_wakeup() : * on qc_xprt_start() : this is useful for standard case without 0-RTT. This ensures that this is done only after connection thread migration. * on qc_ssl_provide_all_quic_data() : this is done on handshake completion with 0-RTT used. In this case only, connection is already accepted and migrated, so tasklet_wakeup() is safe. Note that as a side-change, quic_accept_push_qc() API has evolved to better reflect differences between standard and 0-RTT usages. It is now forbidden to call it multiple times on a single quic_conn instance. A BUG_ON() has been added. This issue is labelled as medium even though it seems pretty rare. It was only reproducible using QUIC interop runner, with haproxy compiled with LibreSSL with quic-go as client. However, affected code parts are pretty sensible, which justify the chosen severity. This should fix github issue #2418. It should be backported up to 2.6, after a brief period of observation. Note that the extra comment added in qc_set_tid_affinity() can be removed in 2.6 as thread migration is not implemented for this version. Other parts should apply without conflict. --- src/quic_conn.c | 9 ++++++++- src/quic_sock.c | 9 +++------ src/quic_ssl.c | 13 +++++++++++-- src/xprt_quic.c | 7 +++++++ 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 5233496e3..8c7d5aaf3 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -1818,7 +1818,14 @@ int qc_set_tid_affinity(struct quic_conn *qc, uint new_tid, struct listener *new qc_detach_th_ctx_list(qc, 0); node = eb64_first(qc->cids); - BUG_ON(!node || eb64_next(node)); /* One and only one CID must be present before affinity rebind. */ + /* One and only one CID must be present before affinity rebind. + * + * This could be triggered fairly easily if tasklet is scheduled just + * before thread migration for post-handshake state to generate new + * CIDs. In this case, QUIC_FL_CONN_IO_TO_REQUEUE should be used + * instead of tasklet_wakeup(). + */ + BUG_ON(!node || eb64_next(node)); conn_id = eb64_entry(node, struct quic_connection_id, seq_num); /* At this point no connection was accounted for yet on this diff --git a/src/quic_sock.c b/src/quic_sock.c index 4d51ef6fd..5bd65c706 100644 --- a/src/quic_sock.c +++ b/src/quic_sock.c @@ -999,18 +999,15 @@ void qc_want_recv(struct quic_conn *qc) struct quic_accept_queue *quic_accept_queues; /* Install on the queue ready to be accepted. The queue task is then woken - * up. If accept is already scheduled or done, nothing is done. + * up. */ void quic_accept_push_qc(struct quic_conn *qc) { struct quic_accept_queue *queue = &quic_accept_queues[tid]; struct li_per_thread *lthr = &qc->li->per_thr[ti->ltid]; - /* early return if accept is already in progress/done for this - * connection - */ - if (qc->flags & QUIC_FL_CONN_ACCEPT_REGISTERED) - return; + /* A connection must only be accepted once per instance. */ + BUG_ON(qc->flags & QUIC_FL_CONN_ACCEPT_REGISTERED); BUG_ON(MT_LIST_INLIST(&qc->accept_list)); HA_ATOMIC_INC(&qc->li->rx.quic_curr_accept); diff --git a/src/quic_ssl.c b/src/quic_ssl.c index 08c119f3e..668497339 100644 --- a/src/quic_ssl.c +++ b/src/quic_ssl.c @@ -593,8 +593,17 @@ int qc_ssl_provide_quic_data(struct ncbuf *ncbuf, if (qc_is_listener(ctx->qc)) { qc->flags |= QUIC_FL_CONN_NEED_POST_HANDSHAKE_FRMS; qc->state = QUIC_HS_ST_CONFIRMED; - /* The connection is ready to be accepted. */ - quic_accept_push_qc(qc); + + if (!(qc->flags & QUIC_FL_CONN_ACCEPT_REGISTERED)) { + quic_accept_push_qc(qc); + } + else { + /* Connection already accepted if 0-RTT used. + * In this case, schedule quic-conn to ensure + * post-handshake frames are emitted. + */ + tasklet_wakeup(qc->wait_event.tasklet); + } BUG_ON(qc->li->rx.quic_curr_handshake == 0); HA_ATOMIC_DEC(&qc->li->rx.quic_curr_handshake); diff --git a/src/xprt_quic.c b/src/xprt_quic.c index eda113cfc..b83b6340c 100644 --- a/src/xprt_quic.c +++ b/src/xprt_quic.c @@ -140,6 +140,13 @@ static int qc_xprt_start(struct connection *conn, void *ctx) /* mux-quic can now be considered ready. */ qc->mux_state = QC_MUX_READY; + /* Schedule quic-conn to ensure post handshake frames are emitted. This + * is not done for 0-RTT as xprt->start happens before handshake + * completion. + */ + if (qc->flags & QUIC_FL_CONN_NEED_POST_HANDSHAKE_FRMS) + tasklet_wakeup(qc->wait_event.tasklet); + ret = 1; out: TRACE_LEAVE(QUIC_EV_CONN_NEW, qc);