diff --git a/include/proto/connection.h b/include/proto/connection.h index af995cd15..d44aeec0a 100644 --- a/include/proto/connection.h +++ b/include/proto/connection.h @@ -96,6 +96,9 @@ static inline void conn_ctrl_init(struct connection *conn) int fd = conn->t.sock.fd; fd_insert(fd); + /* mark the fd as ready so as not to needlessly poll at the beginning */ + fd_may_recv(fd); + fd_may_send(fd); fdtab[fd].owner = conn; fdtab[fd].iocb = conn_fd_handler; conn->flags |= CO_FL_CTRL_READY; @@ -166,9 +169,9 @@ static inline void conn_refresh_polling_flags(struct connection *conn) if ((conn->flags & CO_FL_CTRL_READY) && conn->ctrl) { unsigned int flags = conn->flags & ~(CO_FL_CURR_RD_ENA | CO_FL_CURR_WR_ENA); - if (fd_ev_is_set(conn->t.sock.fd, DIR_RD)) + if (fd_recv_active(conn->t.sock.fd)) flags |= CO_FL_CURR_RD_ENA; - if (fd_ev_is_set(conn->t.sock.fd, DIR_WR)) + if (fd_send_active(conn->t.sock.fd)) flags |= CO_FL_CURR_WR_ENA; conn->flags = flags; } diff --git a/include/proto/fd.h b/include/proto/fd.h index 1ca9b3538..c87dc3dc3 100644 --- a/include/proto/fd.h +++ b/include/proto/fd.h @@ -2,7 +2,7 @@ * include/proto/fd.h * File descriptors states. * - * Copyright (C) 2000-2012 Willy Tarreau - w@1wt.eu + * Copyright (C) 2000-2014 Willy Tarreau - w@1wt.eu * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -127,99 +127,165 @@ static inline void fd_release_cache_entry(int fd) } } +/* Automatically allocates or releases a cache entry for fd depending on + * its new state. This is meant to be used by pollers while processing updates. 
+ */ +static inline void fd_alloc_or_release_cache_entry(int fd, int new_state) +{ + /* READY and ACTIVE states (the two with both flags set) require a cache entry */ + + if (((new_state & (FD_EV_READY_R | FD_EV_ACTIVE_R)) == (FD_EV_READY_R | FD_EV_ACTIVE_R)) || + ((new_state & (FD_EV_READY_W | FD_EV_ACTIVE_W)) == (FD_EV_READY_W | FD_EV_ACTIVE_W))) { + fd_alloc_cache_entry(fd); + } + else { + fd_release_cache_entry(fd); + } +} + /* - * Returns non-zero if is already monitored for events in direction . + * returns the FD's recv state (FD_EV_*) */ -static inline int fd_ev_is_set(const int fd, int dir) +static inline int fd_recv_state(const int fd) { - return ((unsigned)fdtab[fd].state >> dir) & FD_EV_STATUS; + return ((unsigned)fdtab[fd].state >> (4 * DIR_RD)) & FD_EV_STATUS; } -/* Disable processing of events on fd for direction . Note: this - * function was optimized to be used with a constant for . +/* + * returns true if the FD is active for recv */ -static inline void fd_ev_clr(const int fd, int dir) +static inline int fd_recv_active(const int fd) { - unsigned int i = ((unsigned int)fdtab[fd].state) & (FD_EV_STATUS << dir); - if (i == 0) + return (unsigned)fdtab[fd].state & FD_EV_ACTIVE_R; +} + +/* + * returns true if the FD is ready for recv + */ +static inline int fd_recv_ready(const int fd) +{ + return (unsigned)fdtab[fd].state & FD_EV_READY_R; +} + +/* + * returns true if the FD is polled for recv + */ +static inline int fd_recv_polled(const int fd) +{ + return (unsigned)fdtab[fd].state & FD_EV_POLLED_R; +} + +/* + * returns the FD's send state (FD_EV_*) + */ +static inline int fd_send_state(const int fd) +{ + return ((unsigned)fdtab[fd].state >> (4 * DIR_WR)) & FD_EV_STATUS; +} + +/* + * returns true if the FD is active for send + */ +static inline int fd_send_active(const int fd) +{ + return (unsigned)fdtab[fd].state & FD_EV_ACTIVE_W; +} + +/* + * returns true if the FD is ready for send + */ +static inline int fd_send_ready(const int fd) +{ + return 
(unsigned)fdtab[fd].state & FD_EV_READY_W; +} + +/* + * returns true if the FD is polled for send + */ +static inline int fd_send_polled(const int fd) +{ + return (unsigned)fdtab[fd].state & FD_EV_POLLED_W; +} + +/* Disable processing recv events on fd */ +static inline void fd_stop_recv(int fd) +{ + if (!((unsigned int)fdtab[fd].state & FD_EV_ACTIVE_R)) return; /* already disabled */ - fdtab[fd].state ^= i; + fdtab[fd].state &= ~FD_EV_ACTIVE_R; updt_fd(fd); /* need an update entry to change the state */ } -/* Enable polling for events on fd for direction . Note: this - * function was optimized to be used with a constant for . - */ -static inline void fd_ev_wai(const int fd, int dir) +/* Disable processing send events on fd */ +static inline void fd_stop_send(int fd) { - unsigned int i = ((unsigned int)fdtab[fd].state) & (FD_EV_STATUS << dir); - if (i == (FD_EV_POLLED << dir)) - return; /* already in desired state */ - fdtab[fd].state ^= i ^ (FD_EV_POLLED << dir); - updt_fd(fd); /* need an update entry to change the state */ -} - -/* Enable processing of events on fd for direction . Note: this - * function was optimized to be used with a constant for . - */ -static inline void fd_ev_set(int fd, int dir) -{ - unsigned int i = ((unsigned int)fdtab[fd].state) & (FD_EV_STATUS << dir); - - /* note that we don't care about disabling the polled state when - * enabling the active state, since it brings no benefit but costs - * some syscalls. - */ - if (i & (FD_EV_ACTIVE << dir)) - return; /* already in desired state */ - fdtab[fd].state |= (FD_EV_ACTIVE << dir); + if (!((unsigned int)fdtab[fd].state & FD_EV_ACTIVE_W)) + return; /* already disabled */ + fdtab[fd].state &= ~FD_EV_ACTIVE_W; updt_fd(fd); /* need an update entry to change the state */ } /* Disable processing of events on fd for both directions. 
*/ -static inline void fd_ev_rem(const int fd) +static inline void fd_stop_both(int fd) { - unsigned int i = ((unsigned int)fdtab[fd].state) & FD_EV_CURR_MASK; - if (i == 0) + if (!((unsigned int)fdtab[fd].state & FD_EV_ACTIVE_RW)) return; /* already disabled */ - fdtab[fd].state ^= i; + fdtab[fd].state &= ~FD_EV_ACTIVE_RW; updt_fd(fd); /* need an update entry to change the state */ } -/* event manipulation primitives for use by I/O callbacks */ +/* Report that FD cannot receive anymore without polling (EAGAIN detected). */ +static inline void fd_cant_recv(const int fd) +{ + if (!(((unsigned int)fdtab[fd].state) & FD_EV_READY_R)) + return; /* already marked as blocked */ + fdtab[fd].state &= ~FD_EV_READY_R; + updt_fd(fd); +} + +/* Report that FD can receive again without polling. */ +static inline void fd_may_recv(const int fd) +{ + if (((unsigned int)fdtab[fd].state) & FD_EV_READY_R) + return; /* already marked as ready */ + fdtab[fd].state |= FD_EV_READY_R; + updt_fd(fd); +} + +/* Report that FD cannot send anymore without polling (EAGAIN detected). */ +static inline void fd_cant_send(const int fd) +{ + if (!(((unsigned int)fdtab[fd].state) & FD_EV_READY_W)) + return; /* already marked as blocked */ + fdtab[fd].state &= ~FD_EV_READY_W; + updt_fd(fd); +} + +/* Report that FD can send again without polling. 
*/ +static inline void fd_may_send(const int fd) +{ + if (((unsigned int)fdtab[fd].state) & FD_EV_READY_W) + return; /* already marked as blocked */ + fdtab[fd].state |= FD_EV_READY_W; + updt_fd(fd); +} + +/* Prepare FD to try to receive */ static inline void fd_want_recv(int fd) { - return fd_ev_set(fd, DIR_RD); -} - -static inline void fd_stop_recv(int fd) -{ - return fd_ev_clr(fd, DIR_RD); -} - -static inline void fd_poll_recv(int fd) -{ - return fd_ev_wai(fd, DIR_RD); + if (((unsigned int)fdtab[fd].state & FD_EV_ACTIVE_R)) + return; /* already enabled */ + fdtab[fd].state |= FD_EV_ACTIVE_R; + updt_fd(fd); /* need an update entry to change the state */ } +/* Prepare FD to try to send */ static inline void fd_want_send(int fd) { - return fd_ev_set(fd, DIR_WR); -} - -static inline void fd_stop_send(int fd) -{ - return fd_ev_clr(fd, DIR_WR); -} - -static inline void fd_poll_send(int fd) -{ - return fd_ev_wai(fd, DIR_WR); -} - -static inline void fd_stop_both(int fd) -{ - return fd_ev_rem(fd); + if (((unsigned int)fdtab[fd].state & FD_EV_ACTIVE_W)) + return; /* already enabled */ + fdtab[fd].state |= FD_EV_ACTIVE_W; + updt_fd(fd); /* need an update entry to change the state */ } /* Prepares for being polled */ diff --git a/include/types/fd.h b/include/types/fd.h index 3bfe48fa9..1c2c7c808 100644 --- a/include/types/fd.h +++ b/include/types/fd.h @@ -1,8 +1,8 @@ /* * include/types/fd.h - * File descriptors states. + * File descriptors states - check src/fd.c for explanations. * - * Copyright (C) 2000-2009 Willy Tarreau - w@1wt.eu + * Copyright (C) 2000-2014 Willy Tarreau - w@1wt.eu * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -45,32 +45,43 @@ enum { #define FD_POLL_DATA (FD_POLL_IN | FD_POLL_OUT) #define FD_POLL_STICKY (FD_POLL_ERR | FD_POLL_HUP) -/* Event state for an FD in each direction, as found in the 4 lower bits of - * fdtab[].state, and in the 4 next bits. 
- */ #define FD_EV_ACTIVE 1U +#define FD_EV_READY 2U #define FD_EV_POLLED 4U -#define FD_EV_STATUS (FD_EV_ACTIVE | FD_EV_POLLED) + +#define FD_EV_STATUS (FD_EV_ACTIVE | FD_EV_POLLED | FD_EV_READY) #define FD_EV_STATUS_R (FD_EV_STATUS) -#define FD_EV_STATUS_W (FD_EV_STATUS << 1) +#define FD_EV_STATUS_W (FD_EV_STATUS << 4) #define FD_EV_POLLED_R (FD_EV_POLLED) -#define FD_EV_POLLED_W (FD_EV_POLLED << 1) +#define FD_EV_POLLED_W (FD_EV_POLLED << 4) #define FD_EV_POLLED_RW (FD_EV_POLLED_R | FD_EV_POLLED_W) #define FD_EV_ACTIVE_R (FD_EV_ACTIVE) -#define FD_EV_ACTIVE_W (FD_EV_ACTIVE << 1) +#define FD_EV_ACTIVE_W (FD_EV_ACTIVE << 4) #define FD_EV_ACTIVE_RW (FD_EV_ACTIVE_R | FD_EV_ACTIVE_W) -#define FD_EV_CURR_MASK 0x0FU -#define FD_EV_PREV_MASK 0xF0U +#define FD_EV_READY_R (FD_EV_READY) +#define FD_EV_READY_W (FD_EV_READY << 4) +#define FD_EV_READY_RW (FD_EV_READY_R | FD_EV_READY_W) + +enum fd_states { + FD_ST_DISABLED = 0, + FD_ST_MUSTPOLL, + FD_ST_STOPPED, + FD_ST_ACTIVE, + FD_ST_ABORT, + FD_ST_POLLED, + FD_ST_PAUSED, + FD_ST_READY +}; /* info about one given fd */ struct fdtab { int (*iocb)(int fd); /* I/O handler, returns FD_WAIT_* */ void *owner; /* the connection or listener associated with this fd, NULL if closed */ unsigned int cache; /* position+1 in the FD cache. 0=not in cache. 
*/ - unsigned char state; /* FD state for read and write directions (4+4 bits) */ + unsigned char state; /* FD state for read and write directions (2*3 bits) */ unsigned char ev; /* event seen in return of poll() : FD_POLL_* */ unsigned char new:1; /* 1 if this fd has just been created */ unsigned char updated:1; /* 1 if this fd is already in the update list */ diff --git a/src/checks.c b/src/checks.c index ac1bbed85..bdc7a6bd0 100644 --- a/src/checks.c +++ b/src/checks.c @@ -1416,7 +1416,7 @@ static void event_srv_chk_r(struct connection *conn) return; wait_more_data: - __conn_data_poll_recv(conn); + __conn_data_want_recv(conn); } /* diff --git a/src/connection.c b/src/connection.c index 75f876f64..367d1e3bb 100644 --- a/src/connection.c +++ b/src/connection.c @@ -164,7 +164,8 @@ void conn_update_data_polling(struct connection *c) /* update read status if needed */ if (unlikely((f & (CO_FL_DATA_RD_ENA|CO_FL_WAIT_RD)) == (CO_FL_DATA_RD_ENA|CO_FL_WAIT_RD))) { - fd_poll_recv(c->t.sock.fd); + fd_want_recv(c->t.sock.fd); + fd_cant_recv(c->t.sock.fd); f |= CO_FL_CURR_RD_ENA; } else if (unlikely((f & (CO_FL_CURR_RD_ENA|CO_FL_DATA_RD_ENA)) == CO_FL_DATA_RD_ENA)) { @@ -178,7 +179,8 @@ void conn_update_data_polling(struct connection *c) /* update write status if needed */ if (unlikely((f & (CO_FL_DATA_WR_ENA|CO_FL_WAIT_WR)) == (CO_FL_DATA_WR_ENA|CO_FL_WAIT_WR))) { - fd_poll_send(c->t.sock.fd); + fd_want_send(c->t.sock.fd); + fd_cant_send(c->t.sock.fd); f |= CO_FL_CURR_WR_ENA; } else if (unlikely((f & (CO_FL_CURR_WR_ENA|CO_FL_DATA_WR_ENA)) == CO_FL_DATA_WR_ENA)) { @@ -207,7 +209,8 @@ void conn_update_sock_polling(struct connection *c) /* update read status if needed */ if (unlikely((f & (CO_FL_SOCK_RD_ENA|CO_FL_WAIT_RD)) == (CO_FL_SOCK_RD_ENA|CO_FL_WAIT_RD))) { - fd_poll_recv(c->t.sock.fd); + fd_want_recv(c->t.sock.fd); + fd_cant_recv(c->t.sock.fd); f |= CO_FL_CURR_RD_ENA; } else if (unlikely((f & (CO_FL_CURR_RD_ENA|CO_FL_SOCK_RD_ENA)) == CO_FL_SOCK_RD_ENA)) { @@ -221,7 
+224,8 @@ void conn_update_sock_polling(struct connection *c) /* update write status if needed */ if (unlikely((f & (CO_FL_SOCK_WR_ENA|CO_FL_WAIT_WR)) == (CO_FL_SOCK_WR_ENA|CO_FL_WAIT_WR))) { - fd_poll_send(c->t.sock.fd); + fd_want_send(c->t.sock.fd); + fd_cant_send(c->t.sock.fd); f |= CO_FL_CURR_WR_ENA; } else if (unlikely((f & (CO_FL_CURR_WR_ENA|CO_FL_SOCK_WR_ENA)) == CO_FL_SOCK_WR_ENA)) { diff --git a/src/ev_epoll.c b/src/ev_epoll.c index a15cfc3dd..ae415fc62 100644 --- a/src/ev_epoll.c +++ b/src/ev_epoll.c @@ -1,7 +1,7 @@ /* * FD polling functions for Linux epoll * - * Copyright 2000-2012 Willy Tarreau + * Copyright 2000-2014 Willy Tarreau * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -46,7 +46,7 @@ static struct epoll_event ev; #endif /* - * speculative epoll() poller + * Linux epoll() poller */ REGPRM2 static void _do_poll(struct poller *p, int exp) { @@ -59,52 +59,58 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) /* first, scan the update list to find changes */ for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) { fd = fd_updt[updt_idx]; - en = fdtab[fd].state & 15; /* new events */ - eo = fdtab[fd].state >> 4; /* previous events */ + en = eo = fdtab[fd].state; - if (fdtab[fd].owner && (eo ^ en)) { - if ((eo ^ en) & FD_EV_POLLED_RW) { - /* poll status changed */ - if ((en & FD_EV_POLLED_RW) == 0) { - /* fd removed from poll list */ - opcode = EPOLL_CTL_DEL; - } - else if ((eo & FD_EV_POLLED_RW) == 0) { - /* new fd in the poll list */ - opcode = EPOLL_CTL_ADD; - } - else { - /* fd status changed */ - opcode = EPOLL_CTL_MOD; - } - - /* construct the epoll events based on new state */ - ev.events = 0; - if (en & FD_EV_POLLED_R) - ev.events |= EPOLLIN | EPOLLRDHUP; - - if (en & FD_EV_POLLED_W) - ev.events |= EPOLLOUT; - - ev.data.fd = fd; - epoll_ctl(epoll_fd, opcode, fd, &ev); - } - - fdtab[fd].state = (en << 4) + en; /* save new events */ - - if (!(en & 
FD_EV_ACTIVE_RW)) { - /* This fd doesn't use any active entry anymore, we can - * kill its entry. - */ - fd_release_cache_entry(fd); - } - else if ((en & ~eo) & FD_EV_ACTIVE_RW) { - /* we need a new cache entry now */ - fd_alloc_cache_entry(fd); - } - } fdtab[fd].updated = 0; fdtab[fd].new = 0; + + if (!fdtab[fd].owner) + continue; + + if (en & FD_EV_ACTIVE_R) { + if (!(en & FD_EV_READY_R)) + en |= FD_EV_POLLED_R; + } + else + en &= ~FD_EV_POLLED_R; + + if (en & FD_EV_ACTIVE_W) { + if (!(en & FD_EV_READY_W)) + en |= FD_EV_POLLED_W; + } + else + en &= ~FD_EV_POLLED_W; + + if ((eo ^ en) & FD_EV_POLLED_RW) { + /* poll status changed */ + fdtab[fd].state = en; + + if ((en & FD_EV_POLLED_RW) == 0) { + /* fd removed from poll list */ + opcode = EPOLL_CTL_DEL; + } + else if ((eo & FD_EV_POLLED_RW) == 0) { + /* new fd in the poll list */ + opcode = EPOLL_CTL_ADD; + } + else { + /* fd status changed */ + opcode = EPOLL_CTL_MOD; + } + + /* construct the epoll events based on new state */ + ev.events = 0; + if (en & FD_EV_POLLED_R) + ev.events |= EPOLLIN | EPOLLRDHUP; + + if (en & FD_EV_POLLED_W) + ev.events |= EPOLLOUT; + + ev.data.fd = fd; + epoll_ctl(epoll_fd, opcode, fd, &ev); + } + + fd_alloc_or_release_cache_entry(fd, en); } fd_nbupdt = 0; @@ -176,20 +182,15 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) if (fdtab[fd].iocb) { int new_updt, old_updt; - /* Mark the events as speculative before processing - * them so that if nothing can be done we don't need - * to poll again. 
- */ - if (fdtab[fd].ev & FD_POLL_IN) - fd_ev_set(fd, DIR_RD); + /* Mark the events as ready before processing */ + if (fdtab[fd].ev & (FD_POLL_IN | FD_POLL_HUP | FD_POLL_ERR)) + fd_may_recv(fd); - if (fdtab[fd].ev & FD_POLL_OUT) - fd_ev_set(fd, DIR_WR); + if (fdtab[fd].ev & (FD_POLL_OUT | FD_POLL_ERR)) + fd_may_send(fd); - if (fdtab[fd].cache) { - /* This fd was already scheduled for being called as a speculative I/O */ + if (fdtab[fd].cache) continue; - } /* Save number of updates to detect creation of new FDs. */ old_updt = fd_nbupdt; @@ -212,10 +213,10 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) fdtab[fd].new = 0; fdtab[fd].ev &= FD_POLL_STICKY; - if ((fdtab[fd].state & FD_EV_STATUS_R) == FD_EV_ACTIVE_R) + if ((fdtab[fd].state & FD_EV_STATUS_R) == (FD_EV_READY_R | FD_EV_ACTIVE_R)) fdtab[fd].ev |= FD_POLL_IN; - if ((fdtab[fd].state & FD_EV_STATUS_W) == FD_EV_ACTIVE_W) + if ((fdtab[fd].state & FD_EV_STATUS_W) == (FD_EV_READY_W | FD_EV_ACTIVE_W)) fdtab[fd].ev |= FD_POLL_OUT; if (fdtab[fd].ev && fdtab[fd].iocb && fdtab[fd].owner) @@ -224,19 +225,18 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) /* we can remove this update entry if it's the last one and is * unused, otherwise we don't touch anything. */ - if (new_updt == fd_nbupdt && fdtab[fd].state == 0) { + if (new_updt == fd_nbupdt && !fd_recv_active(fd) && !fd_send_active(fd)) { fdtab[fd].updated = 0; fd_nbupdt--; } } } } - - /* the caller will take care of speculative events */ + /* the caller will take care of cached events */ } /* - * Initialization of the speculative epoll() poller. + * Initialization of the epoll() poller. * Returns 0 in case of failure, non-zero in case of success. If it fails, it * disables the poller by setting its pref to 0. */ @@ -267,7 +267,7 @@ REGPRM1 static int _do_init(struct poller *p) } /* - * Termination of the speculative epoll() poller. + * Termination of the epoll() poller. * Memory is released and the poller is marked as unselectable. 
*/ REGPRM1 static void _do_term(struct poller *p) diff --git a/src/ev_kqueue.c b/src/ev_kqueue.c index af5698bee..dab6f5bdc 100644 --- a/src/ev_kqueue.c +++ b/src/ev_kqueue.c @@ -1,17 +1,13 @@ /* * FD polling functions for FreeBSD kqueue() * - * Copyright 2000-2008 Willy Tarreau + * Copyright 2000-2014 Willy Tarreau * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Note: not knowing much about kqueue, I had to rely on OpenBSD's detailed man - * page and to check how it was implemented in lighttpd to understand it better. - * But it is possible that I got things wrong. - * */ #include @@ -51,10 +47,33 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) /* first, scan the update list to find changes */ for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) { fd = fd_updt[updt_idx]; - en = fdtab[fd].state & 15; /* new events */ - eo = fdtab[fd].state >> 4; /* previous events */ + en = eo = fdtab[fd].state; + + fdtab[fd].updated = 0; + fdtab[fd].new = 0; + + if (!fdtab[fd].owner) + continue; + + if (en & FD_EV_ACTIVE_R) { + if (!(en & FD_EV_READY_R)) + en |= FD_EV_POLLED_R; + } + else + en &= ~FD_EV_POLLED_R; + + if (en & FD_EV_ACTIVE_W) { + if (!(en & FD_EV_READY_W)) + en |= FD_EV_POLLED_W; + } + else + en &= ~FD_EV_POLLED_W; + + + if ((eo ^ en) & FD_EV_POLLED_RW) { + /* poll status changed */ + fdtab[fd].state = en; - if (fdtab[fd].owner && (eo ^ en)) { if ((eo ^ en) & FD_EV_POLLED_R) { /* read poll status changed */ if (en & FD_EV_POLLED_R) { @@ -78,22 +97,9 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) changes++; } } - - fdtab[fd].state = (en << 4) + en; /* save new events */ - - if (!(en & FD_EV_ACTIVE_RW)) { - /* This fd doesn't use any active entry anymore, we can - * kill its entry. 
- */ - fd_release_cache_entry(fd); - } - else if ((en & ~eo) & FD_EV_ACTIVE_RW) { - /* we need a new cache entry now */ - fd_alloc_cache_entry(fd); - } } - fdtab[fd].updated = 0; - fdtab[fd].new = 0; + + fd_alloc_or_release_cache_entry(fd, en); } if (changes) kevent(kqueue_fd, kev, changes, NULL, 0, NULL); @@ -147,22 +153,14 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) } if (fdtab[fd].iocb && fdtab[fd].ev) { - /* Mark the events as speculative before processing - * them so that if nothing can be done we don't need - * to poll again. - */ if (fdtab[fd].ev & FD_POLL_IN) - fd_ev_set(fd, DIR_RD); + fd_may_recv(fd); if (fdtab[fd].ev & FD_POLL_OUT) - fd_ev_set(fd, DIR_WR); + fd_may_send(fd); - if (fdtab[fd].cache) { - /* This fd was already scheduled for being - * called as a speculative I/O. - */ + if (fdtab[fd].cache) continue; - } fdtab[fd].iocb(fd); } diff --git a/src/ev_poll.c b/src/ev_poll.c index eff900714..da927dc5f 100644 --- a/src/ev_poll.c +++ b/src/ev_poll.c @@ -1,7 +1,7 @@ /* * FD polling functions for generic poll() * - * Copyright 2000-2012 Willy Tarreau + * Copyright 2000-2014 Willy Tarreau * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -70,38 +70,44 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) /* first, scan the update list to find changes */ for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) { fd = fd_updt[updt_idx]; - en = fdtab[fd].state & 15; /* new events */ - eo = fdtab[fd].state >> 4; /* previous events */ + en = eo = fdtab[fd].state; - if (fdtab[fd].owner && (eo ^ en)) { - if ((eo ^ en) & FD_EV_POLLED_RW) { - /* poll status changed, update the lists */ - if ((eo & ~en) & FD_EV_POLLED_R) - hap_fd_clr(fd, fd_evts[DIR_RD]); - else if ((en & ~eo) & FD_EV_POLLED_R) - hap_fd_set(fd, fd_evts[DIR_RD]); - - if ((eo & ~en) & FD_EV_POLLED_W) - hap_fd_clr(fd, fd_evts[DIR_WR]); - else if ((en & ~eo) & FD_EV_POLLED_W) - hap_fd_set(fd, 
fd_evts[DIR_WR]); - } - - fdtab[fd].state = (en << 4) + en; /* save new events */ - - if (!(en & FD_EV_ACTIVE_RW)) { - /* This fd doesn't use any active entry anymore, we can - * kill its entry. - */ - fd_release_cache_entry(fd); - } - else if ((en & ~eo) & FD_EV_ACTIVE_RW) { - /* we need a new cache entry now */ - fd_alloc_cache_entry(fd); - } - } fdtab[fd].updated = 0; fdtab[fd].new = 0; + + if (!fdtab[fd].owner) + continue; + + if (en & FD_EV_ACTIVE_R) { + if (!(en & FD_EV_READY_R)) + en |= FD_EV_POLLED_R; + } + else + en &= ~FD_EV_POLLED_R; + + if (en & FD_EV_ACTIVE_W) { + if (!(en & FD_EV_READY_W)) + en |= FD_EV_POLLED_W; + } + else + en &= ~FD_EV_POLLED_W; + + if ((eo ^ en) & FD_EV_POLLED_RW) { + /* poll status changed, update the lists */ + fdtab[fd].state = en; + + if ((eo & ~en) & FD_EV_POLLED_R) + hap_fd_clr(fd, fd_evts[DIR_RD]); + else if ((en & ~eo) & FD_EV_POLLED_R) + hap_fd_set(fd, fd_evts[DIR_RD]); + + if ((eo & ~en) & FD_EV_POLLED_W) + hap_fd_clr(fd, fd_evts[DIR_WR]); + else if ((en & ~eo) & FD_EV_POLLED_W) + hap_fd_set(fd, fd_evts[DIR_WR]); + } + + fd_alloc_or_release_cache_entry(fd, en); } fd_nbupdt = 0; @@ -172,22 +178,14 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) } if (fdtab[fd].iocb && fdtab[fd].ev) { - /* Mark the events as speculative before processing - * them so that if nothing can be done we don't need - * to poll again. 
- */ if (fdtab[fd].ev & FD_POLL_IN) - fd_ev_set(fd, DIR_RD); + fd_may_recv(fd); if (fdtab[fd].ev & FD_POLL_OUT) - fd_ev_set(fd, DIR_WR); + fd_may_send(fd); - if (fdtab[fd].cache) { - /* This fd was already scheduled for being - * called as a speculative I/O - */ + if (fdtab[fd].cache) continue; - } fdtab[fd].iocb(fd); } diff --git a/src/ev_select.c b/src/ev_select.c index df1d1615c..a87834028 100644 --- a/src/ev_select.c +++ b/src/ev_select.c @@ -1,7 +1,7 @@ /* * FD polling functions for generic select() * - * Copyright 2000-2012 Willy Tarreau + * Copyright 2000-2014 Willy Tarreau * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -53,38 +53,44 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) /* first, scan the update list to find changes */ for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) { fd = fd_updt[updt_idx]; - en = fdtab[fd].state & 15; /* new events */ - eo = fdtab[fd].state >> 4; /* previous events */ + en = eo = fdtab[fd].state; - if (fdtab[fd].owner && (eo ^ en)) { - if ((eo ^ en) & FD_EV_POLLED_RW) { - /* poll status changed, update the lists */ - if ((eo & ~en) & FD_EV_POLLED_R) - FD_CLR(fd, fd_evts[DIR_RD]); - else if ((en & ~eo) & FD_EV_POLLED_R) - FD_SET(fd, fd_evts[DIR_RD]); - - if ((eo & ~en) & FD_EV_POLLED_W) - FD_CLR(fd, fd_evts[DIR_WR]); - else if ((en & ~eo) & FD_EV_POLLED_W) - FD_SET(fd, fd_evts[DIR_WR]); - } - - fdtab[fd].state = (en << 4) + en; /* save new events */ - - if (!(en & FD_EV_ACTIVE_RW)) { - /* This fd doesn't use any active entry anymore, we can - * kill its entry. 
- */ - fd_release_cache_entry(fd); - } - else if ((en & ~eo) & FD_EV_ACTIVE_RW) { - /* we need a new cache entry now */ - fd_alloc_cache_entry(fd); - } - } fdtab[fd].updated = 0; fdtab[fd].new = 0; + + if (!fdtab[fd].owner) + continue; + + if (en & FD_EV_ACTIVE_R) { + if (!(en & FD_EV_READY_R)) + en |= FD_EV_POLLED_R; + } + else + en &= ~FD_EV_POLLED_R; + + if (en & FD_EV_ACTIVE_W) { + if (!(en & FD_EV_READY_W)) + en |= FD_EV_POLLED_W; + } + else + en &= ~FD_EV_POLLED_W; + + if ((eo ^ en) & FD_EV_POLLED_RW) { + /* poll status changed, update the lists */ + fdtab[fd].state = en; + + if ((eo & ~en) & FD_EV_POLLED_R) + FD_CLR(fd, fd_evts[DIR_RD]); + else if ((en & ~eo) & FD_EV_POLLED_R) + FD_SET(fd, fd_evts[DIR_RD]); + + if ((eo & ~en) & FD_EV_POLLED_W) + FD_CLR(fd, fd_evts[DIR_WR]); + else if ((en & ~eo) & FD_EV_POLLED_W) + FD_SET(fd, fd_evts[DIR_WR]); + } + + fd_alloc_or_release_cache_entry(fd, en); } fd_nbupdt = 0; @@ -156,22 +162,14 @@ REGPRM2 static void _do_poll(struct poller *p, int exp) fdtab[fd].ev |= FD_POLL_OUT; if (fdtab[fd].iocb && fdtab[fd].ev) { - /* Mark the events as speculative before processing - * them so that if nothing can be done we don't need - * to poll again. - */ if (fdtab[fd].ev & FD_POLL_IN) - fd_ev_set(fd, DIR_RD); + fd_may_recv(fd); if (fdtab[fd].ev & FD_POLL_OUT) - fd_ev_set(fd, DIR_WR); + fd_may_send(fd); - if (fdtab[fd].cache) { - /* This fd was already scheduled for being - * called as a speculative I/O. - */ + if (fdtab[fd].cache) continue; - } fdtab[fd].iocb(fd); } diff --git a/src/fd.c b/src/fd.c index 45e30336c..9e3706824 100644 --- a/src/fd.c +++ b/src/fd.c @@ -1,37 +1,46 @@ /* * File descriptors management functions. 
* - * Copyright 2000-2012 Willy Tarreau + * Copyright 2000-2014 Willy Tarreau * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * This code implements "speculative I/O". The principle is to try to perform - * expected I/O before registering the events in the poller. Each time this - * succeeds, it saves a possibly expensive system call to set the event. It - * generally succeeds for all reads after an accept(), and for writes after a - * connect(). It also improves performance for streaming connections because - * even if only one side is polled, the other one may react accordingly - * depending on the fill level of the buffer. This behaviour is also the only - * one compatible with event-based pollers (eg: EPOLL_ET). + * This code implements an events cache for file descriptors. It remembers the + * readiness of a file descriptor after a return from poll() and the fact that + * an I/O attempt failed on EAGAIN. Events in the cache which are still marked + * ready and active are processed just as if they were reported by poll(). * - * More importantly, it enables I/O operations that are backed by invisible - * buffers. For example, SSL is able to read a whole socket buffer and not - * deliver it to the application buffer because it's full. Unfortunately, it - * won't be reported by a poller anymore until some new activity happens. The - * only way to call it again thus is to perform speculative I/O as soon as - * reading on the FD is enabled again. + * This serves multiple purposes. First, it significantly improves performance + * by avoiding to subscribe to polling unless absolutely necessary, so most + * events are processed without polling at all, especially send() which + * benefits from the socket buffers. 
Second, it is the only way to support + * edge-triggered pollers (eg: EPOLL_ET). And third, it enables I/O operations + * that are backed by invisible buffers. For example, SSL is able to read a + * whole socket buffer and not deliver it to the application buffer because + * it's full. Unfortunately, it won't be reported by a poller anymore until + * some new activity happens. The only way to call it again thus is to keep + * this readiness information in the cache and to access it without polling + * once the FD is enabled again. * - * The speculative I/O uses a list of expected events and a list of updates. - * Expected events are events that are expected to come and that we must report - * to the application until it asks to stop or to poll. Updates are new requests - * for changing an FD state. Updates are the only way to create new events. This - * is important because it means that the number of speculative events cannot - * increase between updates and will only grow one at a time while processing - * updates. All updates must always be processed, though events might be - * processed by small batches if required. + * One interesting feature of the cache is that it maintains the principle + * of speculative I/O introduced in haproxy 1.3 : the first time an event is + * enabled, the FD is considered as ready so that the I/O attempt is performed + * via the cache without polling. And the polling happens only when EAGAIN is + * first met. This avoids polling for HTTP requests, especially when the + * defer-accept mode is used. It also avoids polling for sending short data + * such as requests to servers or short responses to clients. + * + * The cache consists in a list of active events and a list of updates. + * Active events are events that are expected to come and that we must report + * to the application until it asks to stop or asks to poll. Updates are new + * requests for changing an FD state. Updates are the only way to create new + * events. 
This is important because it means that the number of cached events + * cannot increase between updates and will only grow one at a time while + * processing updates. All updates must always be processed, though events + * might be processed by small batches if required. * * There is no direct link between the FD and the updates list. There is only a * bit in the fdtab[] to indicate than a file descriptor is already present in @@ -41,45 +50,98 @@ * * It is important to understand that as long as all expected events are * processed, they might starve the polled events, especially because polled - * I/O starvation quickly induces more speculative I/O. One solution to this + * I/O starvation quickly induces more cached I/O. One solution to this * consists in only processing a part of the events at once, but one drawback - * is that unhandled events will still wake the poller up. Using an event-driven - * poller such as EPOLL_ET will solve this issue though. + * is that unhandled events will still wake the poller up. Using an edge- + * triggered poller such as EPOLL_ET will solve this issue though. * - * A file descriptor has a distinct state for each direction. This state is a - * combination of two bits : - * bit 0 = active Y/N : is set if the FD is active, which means that its - * handler will be called without prior polling ; - * bit 1 = polled Y/N : is set if the FD was subscribed to polling - * - * It is perfectly valid to have both bits set at a time, which generally means - * that the FD was reported by polling, was marked active and not yet unpolled. - * Such a state must not last long to avoid unneeded wakeups. - * - * The state of the FD as of last change is preserved in two other bits. These - * ones are useful to save a significant amount of system calls during state - * changes, because there is no need to update the FD status in the system until - * we're about to call the poller. 
- * - * Since we do not want to scan all the FD list to find speculative I/O events, + * Since we do not want to scan all the FD list to find cached I/O events, * we store them in a list consisting in a linear array holding only the FD - * indexes right now. Note that a closed FD cannot exist in the spec list, - * because it is closed by fd_delete() which in turn calls __fd_clo() which - * always removes it from the list. + * indexes right now. Note that a closed FD cannot exist in the cache, because + * it is closed by fd_delete() which in turn calls fd_release_cache_entry() + * which always removes it from the list. * - * For efficiency reasons, we will store the Read and Write bits interlaced to - * form a 4-bit field, so that we can simply shift the value right by 0/1 and - * get what we want : - * 3 2 1 0 - * Wp Rp Wa Ra + * The FD array has to hold a back reference to the cache. This reference is + * always valid unless the FD is not in the cache and is not updated, in which + * case the reference points to index 0. * - * The FD array has to hold a back reference to the speculative list. This - * reference is always valid unless the FD if currently being polled and not - * updated (in which case the reference points to index 0). + * The event state for an FD, as found in fdtab[].state, is maintained for each + * direction. The state field is built this way, with R bits in the low nibble + * and W bits in the high nibble for ease of access and debugging : * - * We store the FD state in the 4 lower bits of fdtab[fd].state, and save the - * previous state upon changes in the 4 higher bits, so that changes are easy - * to spot. + * 7 6 5 4 3 2 1 0 + * [ 0 | PW | RW | AW | 0 | PR | RR | AR ] + * + * A* = active *R = read + * P* = polled *W = write + * R* = ready + * + * An FD is marked "active" when there is a desire to use it. + * An FD is marked "polled" when it is registered in the polling. 
+ * An FD is marked "ready" when it has not faced a new EAGAIN since last wake-up + * (it is a cache of the last EAGAIN regardless of polling changes). + * + * We have 8 possible states for each direction based on these 3 flags : + * + * +---+---+---+----------+---------------------------------------------+ + * | P | R | A | State | Description | + * +---+---+---+----------+---------------------------------------------+ + * | 0 | 0 | 0 | DISABLED | No activity desired, not ready. | + * | 0 | 0 | 1 | MUSTPOLL | Activity desired via polling. | + * | 0 | 1 | 0 | STOPPED | End of activity without polling. | + * | 0 | 1 | 1 | ACTIVE | Activity desired without polling. | + * | 1 | 0 | 0 | ABORT | Aborted poll(). Not frequently seen. | + * | 1 | 0 | 1 | POLLED | FD is being polled. | + * | 1 | 1 | 0 | PAUSED | FD was paused while ready (eg: buffer full) | + * | 1 | 1 | 1 | READY | FD was marked ready by poll() | + * +---+---+---+----------+---------------------------------------------+ + * + * The transitions are pretty simple : + * - fd_want_*() : set flag A + * - fd_stop_*() : clear flag A + * - fd_cant_*() : clear flag R (when facing EAGAIN) + * - fd_may_*() : set flag R (upon return from poll()) + * - sync() : if (A) { if (!R) P := 1 } else { P := 0 } + * + * The PAUSED, ABORT and MUSTPOLL states are transient for level-triggered + * pollers and are fixed by the sync() which happens at the beginning of the + * poller. For event-triggered pollers, only the MUSTPOLL state will be + * transient and ABORT will lead to PAUSED. The ACTIVE state is the only stable + * one which has P != A. + * + * The READY state is a bit special as activity on the FD might be notified + * both by the poller or by the cache. But it is needed for some multi-layer + * protocols (eg: SSL) where connection activity is not 100% linked to FD + * activity. Also some pollers might prefer to implement it as ACTIVE if + * enabling/disabling the FD is cheap. 
The READY and ACTIVE states are the + * two states for which a cache entry is allocated. + * + * The state transitions look like the diagram below. Only the 4 right states + * have polling enabled : + * + * (POLLED=0) (POLLED=1) + * + * +----------+ sync +-------+ + * | DISABLED | <----- | ABORT | (READY=0, ACTIVE=0) + * +----------+ +-------+ + * clr | ^ set | ^ + * | | | | + * v | set v | clr + * +----------+ sync +--------+ + * | MUSTPOLL | -----> | POLLED | (READY=0, ACTIVE=1) + * +----------+ +--------+ + * ^ poll | ^ + * | | | + * | EAGAIN v | EAGAIN + * +--------+ +-------+ + * | ACTIVE | | READY | (READY=1, ACTIVE=1) + * +--------+ +-------+ + * clr | ^ set | ^ + * | | | | + * v | set v | clr + * +---------+ sync +--------+ + * | STOPPED | <------ | PAUSED | (READY=1, ACTIVE=0) + * +---------+ +--------+ */ #include @@ -124,7 +186,7 @@ void fd_delete(int fd) cur_poller.clo(fd); fd_release_cache_entry(fd); - fdtab[fd].state &= ~(FD_EV_CURR_MASK | FD_EV_PREV_MASK); + fdtab[fd].state = 0; port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); fdinfo[fd].port_range = NULL; @@ -155,10 +217,10 @@ void fd_process_cached_events() */ fdtab[fd].ev &= FD_POLL_STICKY; - if (e & FD_EV_ACTIVE_R) + if ((e & (FD_EV_READY_R | FD_EV_ACTIVE_R)) == (FD_EV_READY_R | FD_EV_ACTIVE_R)) fdtab[fd].ev |= FD_POLL_IN; - if (e & FD_EV_ACTIVE_W) + if ((e & (FD_EV_READY_W | FD_EV_ACTIVE_W)) == (FD_EV_READY_W | FD_EV_ACTIVE_W)) fdtab[fd].ev |= FD_POLL_OUT; if (fdtab[fd].iocb && fdtab[fd].owner && fdtab[fd].ev) diff --git a/src/listener.c b/src/listener.c index c9418178f..ba7d727fb 100644 --- a/src/listener.c +++ b/src/listener.c @@ -324,7 +324,7 @@ void listener_accept(int fd) if (unlikely(cfd == -1)) { switch (errno) { case EAGAIN: - fd_poll_recv(fd); + fd_cant_recv(fd); return; /* nothing more to accept */ case EINTR: case ECONNABORTED: diff --git a/src/proto_tcp.c b/src/proto_tcp.c index ab8f343bb..4460bb437 100644 --- a/src/proto_tcp.c +++ b/src/proto_tcp.c @@ -584,8 
+584,11 @@ int tcp_drain(int fd) } if (len < 0) { - if (errno == EAGAIN) /* connection not closed yet */ + if (errno == EAGAIN) { + /* connection not closed yet */ + fd_cant_recv(fd); return -1; + } if (errno == EINTR) /* oops, try again */ continue; /* other errors indicate a dead connection, fine. */