haproxy/src/ev_kqueue.c
Willy Tarreau 5be2f35231 MAJOR: polling: centralize calls to I/O callbacks
In order for HTTP/2 not to eat too much memory, we'll have to support
on-the-fly buffer allocation, since most streams will have an empty
request buffer at some point. Supporting allocation on the fly means
being able to sleep inside I/O callbacks if a buffer is not available.

Till now, the I/O callbacks were called from two locations:
  - when processing the cached events
  - when processing the polled events from the poller

This change cleans up the design a bit further than what was started in
1.5. It now ensures that we never call any iocb from the poller itself
and that instead, events learned by the poller are put into the cache.
The benefit is important in terms of stability: we don't have to care
anymore about the risk that new events are added into the poller while
processing its events, and we're certain that updates are processed at
a single location.

To achieve this, we now modify all the fd_* functions so that instead of
creating updates, they add/remove the fd to/from the cache depending on
its state, and only create an update when the polling status reaches a
state where it will have to change. Since the pollers make use of these
functions to notify readiness (using fd_may_recv/fd_may_send), the cache
is always up to date with the poller.
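
To make this concrete, here is a minimal sketch of the idea (not the
actual haproxy code: the reduced flag set, the in_cache field and the
function bodies are simplified assumptions; only names such as fdtab,
fd_updt and fd_cache mirror the real ones). The cache is kept in sync
on every state change, and an update entry is only emitted when the
poller will actually have to be reprogrammed:

#include <stdio.h>

#define MAXFD 1024

/* simplified per-fd state flags (stand-ins for the real FD_EV_* set) */
#define EV_ACTIVE_R  0x01  /* we want to receive on this fd */
#define EV_READY_R   0x02  /* the fd is known ready for receive */
#define EV_POLLED_R  0x04  /* the fd is registered in the poller for read */

static struct {
        int state;
        int in_cache;
        int updated;
} fdtab[MAXFD];

static int fd_cache[MAXFD], fd_cache_num;  /* fds whose iocb may run now */
static int fd_updt[MAXFD], fd_nbupdt;      /* pending poller updates */

/* keep the cache in sync: an fd belongs to the cache while it is both
 * active and ready, i.e. its iocb may be called without polling
 */
static void fd_update_cache(int fd)
{
        int wanted = (fdtab[fd].state & (EV_ACTIVE_R | EV_READY_R))
                                     == (EV_ACTIVE_R | EV_READY_R);

        if (wanted && !fdtab[fd].in_cache) {
                fd_cache[fd_cache_num++] = fd;
                fdtab[fd].in_cache = 1;
        }
        /* removal is handled while scanning the cache */
}

/* called by an iocb: only queue a poller update when the polled status
 * will have to change (active, not ready, not polled yet)
 */
static void fd_want_recv(int fd)
{
        fdtab[fd].state |= EV_ACTIVE_R;
        fd_update_cache(fd);

        if (!(fdtab[fd].state & (EV_READY_R | EV_POLLED_R)) && !fdtab[fd].updated) {
                fd_updt[fd_nbupdt++] = fd;
                fdtab[fd].updated = 1;
        }
}

/* called by the poller to notify readiness: it feeds the cache and
 * never calls the iocb itself
 */
static void fd_may_recv(int fd)
{
        fdtab[fd].state |= EV_READY_R;
        fd_update_cache(fd);
}

int main(void)
{
        fd_want_recv(7);  /* not ready yet: one poller update is queued */
        printf("updates=%d cached=%d\n", fd_nbupdt, fd_cache_num);  /* 1 0 */
        fd_may_recv(7);   /* readiness reported: fd enters the cache */
        printf("updates=%d cached=%d\n", fd_nbupdt, fd_cache_num);  /* 1 1 */
        return 0;
}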

Creating updates only when the polling status needs to change saves a
significant amount of work for the pollers : a benchmark showed that on
a typical TCP proxy test, the amount of updates per connection dropped
from 11 to 1 on average. This also means that the update list is smaller
and has more chances of not thrashing too many CPU cache lines. The first
observed benefit is a net 2% performance gain on the connection rate.

A second benefit is that a connection is only accepted while we're
processing the cache, so the recv event is automatically added into
the cache *after* the current one, and thus gets processed immediately
during the same loop. Previously we needed a second run over the
updates to detect whether new events had been added, in order to catch
them before waking up tasks.

The next gain will come from the next steps on this subject: implementing
an I/O queue containing all cached events, ordered by priority just like
the run queue, and being able to leave some events pending there as long
as needed. That will allow us *not* to perform some FD processing when
it's not the proper time for it (typically, to keep waiting for a buffer
to be allocated if none is available for a recv()). And by only processing
a small batch of them at a time, we'll allow priorities to take effect
even at the I/O level.
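
As a purely illustrative sketch of that direction (nothing below exists
in the tree; buffer_available() and process_io() are hypothetical
placeholders), such a queue could be drained by a budget-limited runner
that simply leaves unservable events in place:

#include <stdlib.h>

struct io_event {
        int fd;
        int prio;                 /* lower value = served first */
        struct io_event *next;
};

static struct io_event *io_queue; /* kept sorted by prio on insertion */

/* hypothetical placeholders for the real conditions and callbacks */
static int buffer_available(void) { return 1; }
static void process_io(struct io_event *ev) { free(ev); }

/* run at most <budget> events; an event that cannot proceed (e.g. no
 * buffer may be allocated) is simply left pending for a later loop
 */
static void io_queue_run(int budget)
{
        struct io_event **prev = &io_queue;
        struct io_event *ev;

        while (budget && (ev = *prev)) {
                if (!buffer_available()) {
                        prev = &ev->next;  /* leave it pending, look further */
                        continue;
                }
                *prev = ev->next;          /* unlink, then process */
                process_io(ev);
                budget--;
        }
}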

As a result of this change, functions fd_alloc_or_release_cache_entry()
and fd_process_polled_events() have disappeared, and the code dedicated
to checking for new fd events after the callback during the poll() loop
was removed as well. Despite the patch looking large, it's mostly a
change of which function is called upon fd_*(), and almost nothing was
added.
2014-11-21 20:37:32 +01:00


/*
 * FD polling functions for FreeBSD kqueue()
 *
 * Copyright 2000-2014 Willy Tarreau <w@1wt.eu>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 */
#include <unistd.h>
#include <sys/time.h>
#include <sys/types.h>

#include <sys/event.h>
#include <sys/time.h>

#include <common/compat.h>
#include <common/config.h>
#include <common/ticks.h>
#include <common/time.h>
#include <common/tools.h>

#include <types/global.h>

#include <proto/fd.h>
#include <proto/signal.h>
#include <proto/task.h>

/* private data */
static int kqueue_fd;
static struct kevent *kev = NULL;
/*
 * kqueue() poller
 */
REGPRM2 static void _do_poll(struct poller *p, int exp)
{
        int status;
        int count, fd, delta_ms;
        struct timespec timeout;
        int updt_idx, en, eo;
        int changes = 0;

        /* first, scan the update list to find changes */
        for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) {
                fd = fd_updt[updt_idx];
                fdtab[fd].updated = 0;
                fdtab[fd].new = 0;

                if (!fdtab[fd].owner)
                        continue;

                eo = fdtab[fd].state;
                en = fd_compute_new_polled_status(eo);

                if ((eo ^ en) & FD_EV_POLLED_RW) {
                        /* poll status changed */
                        fdtab[fd].state = en;

                        if ((eo ^ en) & FD_EV_POLLED_R) {
                                /* read poll status changed */
                                if (en & FD_EV_POLLED_R) {
                                        EV_SET(&kev[changes], fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
                                        changes++;
                                }
                                else {
                                        EV_SET(&kev[changes], fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
                                        changes++;
                                }
                        }
                        if ((eo ^ en) & FD_EV_POLLED_W) {
                                /* write poll status changed */
                                if (en & FD_EV_POLLED_W) {
                                        EV_SET(&kev[changes], fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
                                        changes++;
                                }
                                else {
                                        EV_SET(&kev[changes], fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
                                        changes++;
                                }
                        }
                }
        }

        /* push all collected filter changes to the kernel at once */
        if (changes)
                kevent(kqueue_fd, kev, changes, NULL, 0, NULL);
        fd_nbupdt = 0;

        /* compute the poller's timeout; we only sleep when no cached event,
         * runnable task nor pending signal requires immediate attention
         */
        delta_ms        = 0;
        timeout.tv_sec  = 0;
        timeout.tv_nsec = 0;

        if (!fd_cache_num && !run_queue && !signal_queue_len) {
                if (!exp) {
                        delta_ms        = MAX_DELAY_MS;
                        timeout.tv_sec  = (MAX_DELAY_MS / 1000);
                        timeout.tv_nsec = (MAX_DELAY_MS % 1000) * 1000000;
                }
                else if (!tick_is_expired(exp, now_ms)) {
                        delta_ms = TICKS_TO_MS(tick_remain(now_ms, exp)) + 1;
                        if (delta_ms > MAX_DELAY_MS)
                                delta_ms = MAX_DELAY_MS;
                        timeout.tv_sec  = (delta_ms / 1000);
                        timeout.tv_nsec = (delta_ms % 1000) * 1000000;
                }
        }

        fd = MIN(maxfd, global.tune.maxpollevents);
        gettimeofday(&before_poll, NULL);
        status = kevent(kqueue_fd, // int kq
                        NULL,      // const struct kevent *changelist
                        0,         // int nchanges
                        kev,       // struct kevent *eventlist
                        fd,        // int nevents
                        &timeout); // const struct timespec *timeout
        tv_update_date(delta_ms, status);
        measure_idle();

        /* report the polled events to the cache: readiness is notified via
         * fd_may_recv()/fd_may_send(), the iocbs are never called from here
         */
        for (count = 0; count < status; count++) {
                fd = kev[count].ident;

                if (!fdtab[fd].owner)
                        continue;

                fdtab[fd].ev &= FD_POLL_STICKY;

                if (kev[count].filter == EVFILT_READ) {
                        if ((fdtab[fd].state & FD_EV_STATUS_R))
                                fdtab[fd].ev |= FD_POLL_IN;
                }
                else if (kev[count].filter == EVFILT_WRITE) {
                        if ((fdtab[fd].state & FD_EV_STATUS_W))
                                fdtab[fd].ev |= FD_POLL_OUT;
                }

                if (fdtab[fd].ev & (FD_POLL_IN | FD_POLL_HUP | FD_POLL_ERR))
                        fd_may_recv(fd);

                if (fdtab[fd].ev & (FD_POLL_OUT | FD_POLL_ERR))
                        fd_may_send(fd);
        }
}

/*
 * Initialization of the kqueue() poller.
 * Returns 0 in case of failure, non-zero in case of success. If it fails, it
 * disables the poller by setting its pref to 0.
 */
REGPRM1 static int _do_init(struct poller *p)
{
        p->private = NULL;

        kqueue_fd = kqueue();
        if (kqueue_fd < 0)
                goto fail_fd;

        /* we can have up to two events per fd (read and write) */
        kev = (struct kevent*)calloc(1, sizeof(struct kevent) * 2 * global.maxsock);
        if (kev == NULL)
                goto fail_kev;

        return 1;

 fail_kev:
        close(kqueue_fd);
        kqueue_fd = -1;
 fail_fd:
        p->pref = 0;
        return 0;
}

/*
 * Termination of the kqueue() poller.
 * Memory is released and the poller is marked as unselectable.
 */
REGPRM1 static void _do_term(struct poller *p)
{
        free(kev);

        if (kqueue_fd >= 0) {
                close(kqueue_fd);
                kqueue_fd = -1;
        }

        p->private = NULL;
        p->pref = 0;
}

/*
 * Check that the poller works.
 * Returns 1 if OK, otherwise 0.
 */
REGPRM1 static int _do_test(struct poller *p)
{
        int fd;

        fd = kqueue();
        if (fd < 0)
                return 0;
        close(fd);
        return 1;
}

/*
 * Recreate the kqueue file descriptor after a fork(). Returns 1 if OK,
 * otherwise 0. Note that some pollers need to be reopened after a fork()
 * (such as kqueue), and some others may fail to do so in a chroot.
 */
REGPRM1 static int _do_fork(struct poller *p)
{
        if (kqueue_fd >= 0)
                close(kqueue_fd);
        kqueue_fd = kqueue();
        if (kqueue_fd < 0)
                return 0;
        return 1;
}

/*
 * It is a constructor, which means that it will automatically be called before
 * main(). This is GCC-specific but it works at least since 2.95.
 * Special care must be taken so that it does not need any uninitialized data.
 */
__attribute__((constructor))
static void _do_register(void)
{
        struct poller *p;

        if (nbpollers >= MAX_POLLERS)
                return;

        kqueue_fd = -1;
        p = &pollers[nbpollers++];

        p->name = "kqueue";
        p->pref = 300;
        p->private = NULL;
        p->clo  = NULL;
        p->test = _do_test;
        p->init = _do_init;
        p->term = _do_term;
        p->poll = _do_poll;
        p->fork = _do_fork;
}

/*
 * Local variables:
 *  c-indent-level: 8
 *  c-basic-offset: 8
 * End:
 */
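
For readers less familiar with the kqueue API used above, the following
minimal standalone program (independent of haproxy) follows the same
two-phase pattern as _do_poll(): batch filter changes with EV_SET(),
push them in a single kevent() call, then harvest ready events with a
second call.

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

int main(void)
{
        struct kevent chg, ev;
        struct timespec timeout = { .tv_sec = 5, .tv_nsec = 0 };
        int kq, n;

        kq = kqueue();
        if (kq < 0)
                return 1;

        /* phase 1: register our interest in reading from stdin (fd 0) */
        EV_SET(&chg, 0, EVFILT_READ, EV_ADD, 0, 0, NULL);
        if (kevent(kq, &chg, 1, NULL, 0, NULL) < 0)
                return 1;

        /* phase 2: wait up to 5s for at most one event */
        n = kevent(kq, NULL, 0, &ev, 1, &timeout);
        if (n > 0 && ev.filter == EVFILT_READ)
                printf("fd %d readable, %d bytes pending\n",
                       (int)ev.ident, (int)ev.data);

        close(kq);
        return 0;
}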