haproxy/src/proto_quic.c
Frédéric Lécaille c1029f6182 MINOR: quic: Allocate listener RX buffers
At this time we allocate an RX buffer by thread.
Also take the opportunity offered by this patch to rename TX related variable
names to distinguish them from the RX part.
2021-11-05 15:20:04 +01:00

684 lines
20 KiB
C

/*
* AF_INET/AF_INET6 QUIC protocol layer.
*
* Copyright 2020 Frédéric Lécaille <flecaille@haproxy.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/udp.h>
#include <netinet/in.h>
#include <haproxy/api.h>
#include <haproxy/arg.h>
#include <haproxy/cbuf.h>
#include <haproxy/connection.h>
#include <haproxy/errors.h>
#include <haproxy/fd.h>
#include <haproxy/global.h>
#include <haproxy/list.h>
#include <haproxy/listener.h>
#include <haproxy/log.h>
#include <haproxy/namespace.h>
#include <haproxy/port_range.h>
#include <haproxy/protocol.h>
#include <haproxy/proto_quic.h>
#include <haproxy/proto_udp.h>
#include <haproxy/proxy-t.h>
#include <haproxy/sock.h>
#include <haproxy/quic_sock.h>
#include <haproxy/sock_inet.h>
#include <haproxy/tools.h>
#include <haproxy/xprt_quic-t.h>
static void quic_add_listener(struct protocol *proto, struct listener *listener);
static int quic_bind_listener(struct listener *listener, char *errmsg, int errlen);
static int quic_connect_server(struct connection *conn, int flags);
static void quic_enable_listener(struct listener *listener);
static void quic_disable_listener(struct listener *listener);
/* Note: must not be declared <const> as its list will be overwritten */
struct protocol proto_quic4 = {
.name = "quic4",
/* connection layer */
.ctrl_type = SOCK_STREAM,
.listen = quic_bind_listener,
.enable = quic_enable_listener,
.disable = quic_disable_listener,
.add = quic_add_listener,
.unbind = default_unbind_listener,
.suspend = default_suspend_listener,
.resume = default_resume_listener,
.accept_conn = quic_sock_accept_conn,
.connect = quic_connect_server,
/* binding layer */
.rx_suspend = udp_suspend_receiver,
.rx_resume = udp_resume_receiver,
/* address family */
.fam = &proto_fam_inet4,
/* socket layer */
.proto_type = PROTO_TYPE_DGRAM,
.sock_type = SOCK_DGRAM,
.sock_prot = IPPROTO_UDP,
.rx_enable = sock_enable,
.rx_disable = sock_disable,
.rx_unbind = sock_unbind,
.rx_listening = quic_sock_accepting_conn,
.default_iocb = quic_sock_fd_iocb,
.receivers = LIST_HEAD_INIT(proto_quic4.receivers),
.nb_receivers = 0,
};
INITCALL1(STG_REGISTER, protocol_register, &proto_quic4);
/* Note: must not be declared <const> as its list will be overwritten */
struct protocol proto_quic6 = {
.name = "quic6",
/* connection layer */
.ctrl_type = SOCK_STREAM,
.listen = quic_bind_listener,
.enable = quic_enable_listener,
.disable = quic_disable_listener,
.add = quic_add_listener,
.unbind = default_unbind_listener,
.suspend = default_suspend_listener,
.resume = default_resume_listener,
.accept_conn = quic_sock_accept_conn,
.connect = quic_connect_server,
/* binding layer */
.rx_suspend = udp_suspend_receiver,
.rx_resume = udp_resume_receiver,
/* address family */
.fam = &proto_fam_inet6,
/* socket layer */
.proto_type = PROTO_TYPE_DGRAM,
.sock_type = SOCK_DGRAM,
.sock_prot = IPPROTO_UDP,
.rx_enable = sock_enable,
.rx_disable = sock_disable,
.rx_unbind = sock_unbind,
.rx_listening = quic_sock_accepting_conn,
.default_iocb = quic_sock_fd_iocb,
.receivers = LIST_HEAD_INIT(proto_quic6.receivers),
.nb_receivers = 0,
};
INITCALL1(STG_REGISTER, protocol_register, &proto_quic6);
/* Binds ipv4/ipv6 address <local> to socket <fd>, unless <flags> is set, in which
* case we try to bind <remote>. <flags> is a 2-bit field consisting of :
* - 0 : ignore remote address (may even be a NULL pointer)
* - 1 : use provided address
* - 2 : use provided port
* - 3 : use both
*
* The function supports multiple foreign binding methods :
* - linux_tproxy: we directly bind to the foreign address
* The second one can be used as a fallback for the first one.
* This function returns 0 when everything's OK, 1 if it could not bind, to the
* local address, 2 if it could not bind to the foreign address.
*/
int quic_bind_socket(int fd, int flags, struct sockaddr_storage *local, struct sockaddr_storage *remote)
{
struct sockaddr_storage bind_addr;
int foreign_ok = 0;
int ret;
static THREAD_LOCAL int ip_transp_working = 1;
static THREAD_LOCAL int ip6_transp_working = 1;
switch (local->ss_family) {
case AF_INET:
if (flags && ip_transp_working) {
/* This deserves some explanation. Some platforms will support
* multiple combinations of certain methods, so we try the
* supported ones until one succeeds.
*/
if (sock_inet4_make_foreign(fd))
foreign_ok = 1;
else
ip_transp_working = 0;
}
break;
case AF_INET6:
if (flags && ip6_transp_working) {
if (sock_inet6_make_foreign(fd))
foreign_ok = 1;
else
ip6_transp_working = 0;
}
break;
}
if (flags) {
memset(&bind_addr, 0, sizeof(bind_addr));
bind_addr.ss_family = remote->ss_family;
switch (remote->ss_family) {
case AF_INET:
if (flags & 1)
((struct sockaddr_in *)&bind_addr)->sin_addr = ((struct sockaddr_in *)remote)->sin_addr;
if (flags & 2)
((struct sockaddr_in *)&bind_addr)->sin_port = ((struct sockaddr_in *)remote)->sin_port;
break;
case AF_INET6:
if (flags & 1)
((struct sockaddr_in6 *)&bind_addr)->sin6_addr = ((struct sockaddr_in6 *)remote)->sin6_addr;
if (flags & 2)
((struct sockaddr_in6 *)&bind_addr)->sin6_port = ((struct sockaddr_in6 *)remote)->sin6_port;
break;
default:
/* we don't want to try to bind to an unknown address family */
foreign_ok = 0;
}
}
setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
if (foreign_ok) {
if (is_inet_addr(&bind_addr)) {
ret = bind(fd, (struct sockaddr *)&bind_addr, get_addr_len(&bind_addr));
if (ret < 0)
return 2;
}
}
else {
if (is_inet_addr(local)) {
ret = bind(fd, (struct sockaddr *)local, get_addr_len(local));
if (ret < 0)
return 1;
}
}
if (!flags)
return 0;
if (!foreign_ok)
/* we could not bind to a foreign address */
return 2;
return 0;
}
/*
* This function initiates a QUIC connection establishment to the target assigned
* to connection <conn> using (si->{target,dst}). A source address may be
* pointed to by conn->src in case of transparent proxying. Normal source
* bind addresses are still determined locally (due to the possible need of a
* source port). conn->target may point either to a valid server or to a backend,
* depending on conn->target. Only OBJ_TYPE_PROXY and OBJ_TYPE_SERVER are
* supported. The <data> parameter is a boolean indicating whether there are data
* waiting for being sent or not, in order to adjust data write polling and on
* some platforms, the ability to avoid an empty initial ACK. The <flags> argument
* is not used.
*
* Note that a pending send_proxy message accounts for data.
*
* It can return one of :
* - SF_ERR_NONE if everything's OK
* - SF_ERR_SRVTO if there are no more servers
* - SF_ERR_SRVCL if the connection was refused by the server
* - SF_ERR_PRXCOND if the connection has been limited by the proxy (maxconn)
* - SF_ERR_RESOURCE if a system resource is lacking (eg: fd limits, ports, ...)
* - SF_ERR_INTERNAL for any other purely internal errors
* Additionally, in the case of SF_ERR_RESOURCE, an emergency log will be emitted.
*
* The connection's fd is inserted only when SF_ERR_NONE is returned, otherwise
* it's invalid and the caller has nothing to do.
*/
int quic_connect_server(struct connection *conn, int flags)
{
int fd;
struct server *srv;
struct proxy *be;
struct conn_src *src;
struct sockaddr_storage *addr;
conn->flags |= CO_FL_WAIT_L4_CONN; /* connection in progress */
switch (obj_type(conn->target)) {
case OBJ_TYPE_PROXY:
be = objt_proxy(conn->target);
srv = NULL;
break;
case OBJ_TYPE_SERVER:
srv = objt_server(conn->target);
be = srv->proxy;
break;
default:
conn->flags |= CO_FL_ERROR;
return SF_ERR_INTERNAL;
}
if (!conn->dst) {
conn->flags |= CO_FL_ERROR;
return SF_ERR_INTERNAL;
}
fd = conn->handle.fd = sock_create_server_socket(conn);
if (fd == -1) {
qfprintf(stderr, "Cannot get a server socket.\n");
if (errno == ENFILE) {
conn->err_code = CO_ER_SYS_FDLIM;
send_log(be, LOG_EMERG,
"Proxy %s reached system FD limit (maxsock=%d). Please check system tunables.\n",
be->id, global.maxsock);
}
else if (errno == EMFILE) {
conn->err_code = CO_ER_PROC_FDLIM;
send_log(be, LOG_EMERG,
"Proxy %s reached process FD limit (maxsock=%d). Please check 'ulimit-n' and restart.\n",
be->id, global.maxsock);
}
else if (errno == ENOBUFS || errno == ENOMEM) {
conn->err_code = CO_ER_SYS_MEMLIM;
send_log(be, LOG_EMERG,
"Proxy %s reached system memory limit (maxsock=%d). Please check system tunables.\n",
be->id, global.maxsock);
}
else if (errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
conn->err_code = CO_ER_NOPROTO;
}
else
conn->err_code = CO_ER_SOCK_ERR;
/* this is a resource error */
conn->flags |= CO_FL_ERROR;
return SF_ERR_RESOURCE;
}
if (fd >= global.maxsock) {
/* do not log anything there, it's a normal condition when this option
* is used to serialize connections to a server !
*/
ha_alert("socket(): not enough free sockets. Raise -n argument. Giving up.\n");
close(fd);
conn->err_code = CO_ER_CONF_FDLIM;
conn->flags |= CO_FL_ERROR;
return SF_ERR_PRXCOND; /* it is a configuration limit */
}
if ((fcntl(fd, F_SETFL, O_NONBLOCK)==-1)) {
qfprintf(stderr,"Cannot set client socket to non blocking mode.\n");
close(fd);
conn->err_code = CO_ER_SOCK_ERR;
conn->flags |= CO_FL_ERROR;
return SF_ERR_INTERNAL;
}
if (master == 1 && (fcntl(fd, F_SETFD, FD_CLOEXEC) == -1)) {
ha_alert("Cannot set CLOEXEC on client socket.\n");
close(fd);
conn->err_code = CO_ER_SOCK_ERR;
conn->flags |= CO_FL_ERROR;
return SF_ERR_INTERNAL;
}
/* allow specific binding :
* - server-specific at first
* - proxy-specific next
*/
if (srv && srv->conn_src.opts & CO_SRC_BIND)
src = &srv->conn_src;
else if (be->conn_src.opts & CO_SRC_BIND)
src = &be->conn_src;
else
src = NULL;
if (src) {
int ret, flags = 0;
if (conn->src && is_inet_addr(conn->src)) {
switch (src->opts & CO_SRC_TPROXY_MASK) {
case CO_SRC_TPROXY_CLI:
conn_set_private(conn);
/* fall through */
case CO_SRC_TPROXY_ADDR:
flags = 3;
break;
case CO_SRC_TPROXY_CIP:
case CO_SRC_TPROXY_DYN:
conn_set_private(conn);
flags = 1;
break;
}
}
#ifdef SO_BINDTODEVICE
/* Note: this might fail if not CAP_NET_RAW */
if (src->iface_name)
setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, src->iface_name, src->iface_len + 1);
#endif
if (src->sport_range) {
int attempts = 10; /* should be more than enough to find a spare port */
struct sockaddr_storage sa;
ret = 1;
memcpy(&sa, &src->source_addr, sizeof(sa));
do {
/* note: in case of retry, we may have to release a previously
* allocated port, hence this loop's construct.
*/
port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port);
fdinfo[fd].port_range = NULL;
if (!attempts)
break;
attempts--;
fdinfo[fd].local_port = port_range_alloc_port(src->sport_range);
if (!fdinfo[fd].local_port) {
conn->err_code = CO_ER_PORT_RANGE;
break;
}
fdinfo[fd].port_range = src->sport_range;
set_host_port(&sa, fdinfo[fd].local_port);
ret = quic_bind_socket(fd, flags, &sa, conn->src);
if (ret != 0)
conn->err_code = CO_ER_CANT_BIND;
} while (ret != 0); /* binding NOK */
}
else {
#ifdef IP_BIND_ADDRESS_NO_PORT
static THREAD_LOCAL int bind_address_no_port = 1;
setsockopt(fd, IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT, (const void *) &bind_address_no_port, sizeof(int));
#endif
ret = quic_bind_socket(fd, flags, &src->source_addr, conn->src);
if (ret != 0)
conn->err_code = CO_ER_CANT_BIND;
}
if (unlikely(ret != 0)) {
port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port);
fdinfo[fd].port_range = NULL;
close(fd);
if (ret == 1) {
ha_alert("Cannot bind to source address before connect() for backend %s. Aborting.\n",
be->id);
send_log(be, LOG_EMERG,
"Cannot bind to source address before connect() for backend %s.\n",
be->id);
} else {
ha_alert("Cannot bind to tproxy source address before connect() for backend %s. Aborting.\n",
be->id);
send_log(be, LOG_EMERG,
"Cannot bind to tproxy source address before connect() for backend %s.\n",
be->id);
}
conn->flags |= CO_FL_ERROR;
return SF_ERR_RESOURCE;
}
}
if (global.tune.server_sndbuf)
setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &global.tune.server_sndbuf, sizeof(global.tune.server_sndbuf));
if (global.tune.server_rcvbuf)
setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &global.tune.server_rcvbuf, sizeof(global.tune.server_rcvbuf));
addr = (conn->flags & CO_FL_SOCKS4) ? &srv->socks4_addr : conn->dst;
if (connect(fd, (const struct sockaddr *)addr, get_addr_len(addr)) == -1) {
if (errno == EINPROGRESS || errno == EALREADY) {
/* common case, let's wait for connect status */
conn->flags |= CO_FL_WAIT_L4_CONN;
}
else if (errno == EISCONN) {
/* should normally not happen but if so, indicates that it's OK */
conn->flags &= ~CO_FL_WAIT_L4_CONN;
}
else if (errno == EAGAIN || errno == EADDRINUSE || errno == EADDRNOTAVAIL) {
char *msg;
if (errno == EAGAIN || errno == EADDRNOTAVAIL) {
msg = "no free ports";
conn->err_code = CO_ER_FREE_PORTS;
}
else {
msg = "local address already in use";
conn->err_code = CO_ER_ADDR_INUSE;
}
qfprintf(stderr,"Connect() failed for backend %s: %s.\n", be->id, msg);
port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port);
fdinfo[fd].port_range = NULL;
close(fd);
send_log(be, LOG_ERR, "Connect() failed for backend %s: %s.\n", be->id, msg);
conn->flags |= CO_FL_ERROR;
return SF_ERR_RESOURCE;
} else if (errno == ETIMEDOUT) {
//qfprintf(stderr,"Connect(): ETIMEDOUT");
port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port);
fdinfo[fd].port_range = NULL;
close(fd);
conn->err_code = CO_ER_SOCK_ERR;
conn->flags |= CO_FL_ERROR;
return SF_ERR_SRVTO;
} else {
// (errno == ECONNREFUSED || errno == ENETUNREACH || errno == EACCES || errno == EPERM)
//qfprintf(stderr,"Connect(): %d", errno);
port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port);
fdinfo[fd].port_range = NULL;
close(fd);
conn->err_code = CO_ER_SOCK_ERR;
conn->flags |= CO_FL_ERROR;
return SF_ERR_SRVCL;
}
}
else {
/* connect() == 0, this is great! */
conn->flags &= ~CO_FL_WAIT_L4_CONN;
}
conn->flags |= CO_FL_ADDR_TO_SET;
conn_ctrl_init(conn); /* registers the FD */
HA_ATOMIC_OR(&fdtab[fd].state, FD_LINGER_RISK); /* close hard if needed */
if (conn->flags & CO_FL_WAIT_L4_CONN) {
fd_want_send(fd);
fd_cant_send(fd);
fd_cant_recv(fd);
}
return SF_ERR_NONE; /* connection is OK */
}
/* Add listener <listener> to protocol <proto>. Technically speaking we just
* initialize a few entries which should be doable during quic_bind_listener().
* The end of the initialization goes on with the default function.
*/
static void quic_add_listener(struct protocol *proto, struct listener *listener)
{
MT_LIST_INIT(&listener->rx.pkts);
listener->rx.odcids = EB_ROOT_UNIQUE;
listener->rx.cids = EB_ROOT_UNIQUE;
HA_RWLOCK_INIT(&listener->rx.cids_lock);
default_add_listener(proto, listener);
}
/* Allocate the TX ring buffers for <l> listener.
* Return 1 if succeeded, 0 if not.
*/
static int quic_alloc_tx_rings_listener(struct listener *l)
{
struct qring *qr;
int i;
l->rx.tx_qrings = calloc(global.nbthread, sizeof *l->rx.tx_qrings);
if (!l->rx.tx_qrings)
return 0;
MT_LIST_INIT(&l->rx.tx_qring_list);
for (i = 0; i < global.nbthread; i++) {
unsigned char *buf;
struct qring *qr = &l->rx.tx_qrings[i];
buf = pool_alloc(pool_head_quic_tx_ring);
if (!buf)
goto err;
qr->cbuf = cbuf_new(buf, QUIC_TX_RING_BUFSZ);
if (!qr->cbuf) {
pool_free(pool_head_quic_tx_ring, buf);
goto err;
}
MT_LIST_APPEND(&l->rx.tx_qring_list, &qr->mt_list);
}
return 1;
err:
while ((qr = MT_LIST_POP(&l->rx.tx_qring_list, typeof(qr), mt_list))) {
pool_free(pool_head_quic_tx_ring, qr->cbuf->buf);
cbuf_free(qr->cbuf);
}
free(l->rx.tx_qrings);
return 0;
}
/* Allocate the RX buffers for <l> listener.
* Return 1 if succeeded, 0 if not.
*/
static int quic_alloc_rxbufs_listener(struct listener *l)
{
int i;
struct rxbuf *rxbuf;
l->rx.rxbufs = calloc(global.nbthread, sizeof *l->rx.rxbufs);
if (!l->rx.rxbufs)
return 0;
MT_LIST_INIT(&l->rx.rxbuf_list);
for (i = 0; i < global.nbthread; i++) {
char *buf;
rxbuf = &l->rx.rxbufs[i];
buf = pool_alloc(pool_head_quic_rxbuf);
if (!buf)
goto err;
rxbuf->buf = b_make(buf, QUIC_RX_BUFSZ, 0, 0);
MT_LIST_APPEND(&l->rx.rxbuf_list, &rxbuf->mt_list);
}
return 1;
err:
while ((rxbuf = MT_LIST_POP(&l->rx.rxbuf_list, typeof(rxbuf), mt_list)))
pool_free(pool_head_quic_rxbuf, rxbuf->buf.area);
free(l->rx.rxbufs);
return 0;
}
/* This function tries to bind a QUIC4/6 listener. It may return a warning or
* an error message in <errmsg> if the message is at most <errlen> bytes long
* (including '\0'). Note that <errmsg> may be NULL if <errlen> is also zero.
* The return value is composed from ERR_ABORT, ERR_WARN,
* ERR_ALERT, ERR_RETRYABLE and ERR_FATAL. ERR_NONE indicates that everything
* was alright and that no message was returned. ERR_RETRYABLE means that an
* error occurred but that it may vanish after a retry (eg: port in use), and
* ERR_FATAL indicates a non-fixable error. ERR_WARN and ERR_ALERT do not alter
* the meaning of the error, but just indicate that a message is present which
* should be displayed with the respective level. Last, ERR_ABORT indicates
* that it's pointless to try to start other listeners. No error message is
* returned if errlen is NULL.
*/
static int quic_bind_listener(struct listener *listener, char *errmsg, int errlen)
{
int err = ERR_NONE;
char *msg = NULL;
/* ensure we never return garbage */
if (errlen)
*errmsg = 0;
if (listener->state != LI_ASSIGNED)
return ERR_NONE; /* already bound */
if (!(listener->rx.flags & RX_F_BOUND)) {
msg = "receiving socket not bound";
goto udp_return;
}
if (!quic_alloc_tx_rings_listener(listener) ||
!quic_alloc_rxbufs_listener(listener)) {
msg = "could not initialize tx/rx rings";
err |= ERR_WARN;
goto udp_return;
}
listener_set_state(listener, LI_LISTEN);
udp_return:
if (msg && errlen) {
char pn[INET6_ADDRSTRLEN];
addr_to_str(&listener->rx.addr, pn, sizeof(pn));
snprintf(errmsg, errlen, "%s for [%s:%d]", msg, pn, get_host_port(&listener->rx.addr));
}
return err;
}
/* Enable receipt of incoming connections for listener <l>. The receiver must
* still be valid. Does nothing in early boot (needs fd_updt).
*/
static void quic_enable_listener(struct listener *l)
{
/* FIXME: The following statements are incorrect. This
* is the responsibility of the QUIC xprt to stop accepting new
* connections.
*/
if (fd_updt)
fd_want_recv(l->rx.fd);
}
/* Disable receipt of incoming connections for listener <l>. The receiver must
* still be valid. Does nothing in early boot (needs fd_updt).
*/
static void quic_disable_listener(struct listener *l)
{
/* FIXME: The following statements are incorrect. This
* is the responsibility of the QUIC xprt to start accepting new
* connections again.
*/
if (fd_updt)
fd_stop_recv(l->rx.fd);
}
/*
* Local variables:
* c-indent-level: 8
* c-basic-offset: 8
* End:
*/