From 1c2ad21e0fc3d911520b867e450fd0219529a9ab Mon Sep 17 00:00:00 2001 From: willy tarreau Date: Sun, 18 Dec 2005 01:11:29 +0100 Subject: [PATCH] * released 1.2.5-pre4 * made epoll() support a compile-time option : ENABLE_EPOLL * provided a very little libc replacement for a possibly missing epoll() implementation which can be enabled by -DUSE_MY_EPOLL * implemented the poll() poller, which can be enabled with -DENABLE_POLL. The equivalent runtime argument becomes '-P'. A few tests show that it performs like select() with many fds, but slightly slower (certainly because of the higher amount of memory involved). * separated the 3 polling methods and the tasks scheduler into 4 distinct functions which makes the code a lot more modular. * moved some event tables to private static declarations inside the poller functions. * the poller functions can now initialize themselves, run, and cleanup. * changed the runtime argument to enable epoll() to '-E'. * removed buggy epoll_ctl() code in the client_retnclose() function. This function was never meant to remove anything. * fixed a typo which caused glibc to yell about a double free on exit. * removed error checking after epoll_ctl(DEL) because we can never know if the fd is still active or already closed. * added a few entries in the makefile --- CHANGELOG | 21 ++ Makefile | 22 +- haproxy.c | 827 +++++++++++++++++++++++++++++------------------- include/epoll.h | 64 ++++ 4 files changed, 602 insertions(+), 332 deletions(-) create mode 100644 include/epoll.h diff --git a/CHANGELOG b/CHANGELOG index fc0b5aba1..6b129d024 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,27 @@ ChangeLog : =========== +2005/04/26 : 1.2.5-pre4 + - made epoll() support a compile-time option : ENABLE_EPOLL + - provided a very little libc replacement for a possibly missing epoll() + implementation which can be enabled by -DUSE_MY_EPOLL + - implemented the poll() poller, which can be enabled with -DENABLE_POLL. + The equivalent runtime argument becomes '-P'. A few tests show that it + performs like select() with many fds, but slightly slower (certainly + because of the higher amount of memory involved). + - separated the 3 polling methods and the tasks scheduler into 4 distinct + functions which makes the code a lot more modular. + - moved some event tables to private static declarations inside the poller + functions. + - the poller functions can now initialize themselves, run, and cleanup. + - changed the runtime argument to enable epoll() to '-E'. + - removed buggy epoll_ctl() code in the client_retnclose() function. This + function was never meant to remove anything. + - fixed a typo which caused glibc to yell about a double free on exit. + - removed error checking after epoll_ctl(DEL) because we can never know if + the fd is still active or already closed. + - added a few entries in the makefile + 2005/04/25 : 1.2.5-pre3 - experimental epoll() support (use temporary '-e' argument) diff --git a/Makefile b/Makefile index 70a21fe47..3991baa4c 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,7 @@ # Select target OS. TARGET must match a system for which COPTS and LIBS are # correctly defined below. +#TARGET = linux26 TARGET = linux24 #TARGET = linux22 #TARGET = solaris @@ -27,20 +28,24 @@ LD = gcc PCREDIR := $(shell pcre-config --prefix 2>/dev/null || :) #PCREDIR=/usr/local +# This is for Linux 2.6 with netfilter and EPOLL +COPTS.linux26 = -DNETFILTER -DENABLE_POLL -DENABLE_EPOLL +LIBS.linux26 = + # This is for Linux 2.4 with netfilter -COPTS.linux24 = -DNETFILTER +COPTS.linux24 = -DNETFILTER -DENABLE_POLL LIBS.linux24 = # This is for Linux 2.2 -COPTS.linux22 = -DUSE_GETSOCKNAME +COPTS.linux22 = -DUSE_GETSOCKNAME -DENABLE_POLL LIBS.linux22 = # This is for Solaris 8 -COPTS.solaris = -fomit-frame-pointer -DSOLARIS +COPTS.solaris = -fomit-frame-pointer -DSOLARIS -DENABLE_POLL LIBS.solaris = -lnsl -lsocket # This is for OpenBSD 3.0 -COPTS.openbsd = +COPTS.openbsd = -DENABLE_POLL LIBS.openbsd = # CPU dependant optimizations @@ -67,13 +72,20 @@ DEBUG = -g #SMALL_OPTS = -DBUFSIZE=8192 -DMAXREWRITE=1024 SMALL_OPTS = +# redefine this if you want to add some special PATH to include/libs +ADDINC = +ADDLIB = + +# set some defines when needed. +# Known ones are -DENABLE_POLL, -DENABLE_EPOLL, and -DUSE_MY_EPOLL +DEFINE = # global options TARGET_OPTS=$(COPTS.$(TARGET)) REGEX_OPTS=$(COPTS.$(REGEX)) CPU_OPTS=$(COPTS.$(CPU)) -COPTS=-I. $(ADDINC) $(CPU_OPTS) $(TARGET_OPTS) $(REGEX_OPTS) $(SMALL_OPTS) +COPTS=-I. $(ADDINC) $(CPU_OPTS) $(TARGET_OPTS) $(REGEX_OPTS) $(SMALL_OPTS) $(DEFINE) LIBS=$(LIBS.$(TARGET)) $(LIBS.$(REGEX)) $(ADDLIB) # - use -DSTATTIME=0 to disable statistics, else specify an interval in diff --git a/haproxy.c b/haproxy.c index 79819e622..63822e503 100644 --- a/haproxy.c +++ b/haproxy.c @@ -62,7 +62,17 @@ #include #endif +#if defined(ENABLE_POLL) +#include +#endif + +#if defined(ENABLE_EPOLL) +#if !defined(USE_MY_EPOLL) #include +#else +#include "include/epoll.h" +#endif +#endif #include "include/appsession.h" @@ -272,6 +282,11 @@ int strlcpy2(char *dst, const char *src, int size) { #define PR_MODE_HTTP 1 #define PR_MODE_HEALTH 2 +/* possible actions for the *poll() loops */ +#define POLL_LOOP_ACTION_INIT 0 +#define POLL_LOOP_ACTION_RUN 1 +#define POLL_LOOP_ACTION_CLEAN 2 + /* bits for proxy->options */ #define PR_O_REDISP 0x00000001 /* allow reconnection to dispatch in case of errors */ #define PR_O_TRANSP 0x00000002 /* transparent mode : use original DEST as dispatch */ @@ -577,7 +592,6 @@ struct fdtab { /*********************************************************************/ int cfg_maxpconn = 2000; /* # of simultaneous connections per proxy (-N) */ -int cfg_use_epoll = 0; /* use epoll() instead of select() ? */ char *cfg_cfgfile = NULL; /* configuration file */ char *progname = NULL; /* program name */ int pid; /* current process id */ @@ -605,15 +619,11 @@ static struct { /*********************************************************************/ -fd_set *ReadEvent, - *WriteEvent, - *StaticReadEvent, +fd_set *StaticReadEvent, *StaticWriteEvent; -/* used by the epoll() emulation of select() */ -fd_set *PrevReadEvent, *PrevWriteEvent; -struct epoll_event *epoll_events; -int epoll_fd; +int cfg_use_epoll = 0; /* use epoll() instead of select() ? */ +int cfg_use_poll = 0; /* use poll() instead of select() ? */ void **pool_session = NULL, **pool_buffer = NULL, @@ -812,7 +822,12 @@ void usage(char *name) { " -n sets the maximum total # of connections (%d)\n" " -N sets the default, per-proxy maximum # of connections (%d)\n" " -p writes pids of all children to this file\n" - " -e tries to use epoll() instead of select()\n" +#if defined(ENABLE_EPOLL) + " -E tries to use epoll() instead of select()\n" +#endif +#if defined(ENABLE_POLL) + " -P tries to use poll() instead of select()\n" +#endif "\n", name, DEFAULT_MAXCONN, cfg_maxpconn); exit(1); @@ -1394,20 +1409,6 @@ static inline struct timeval *tv_min(struct timeval *tvmin, static inline void fd_delete(int fd) { FD_CLR(fd, StaticReadEvent); FD_CLR(fd, StaticWriteEvent); - if (cfg_use_epoll) { - struct epoll_event ev; - - ev.data.fd = fd; - if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, &ev) < 0) { - // it's impossible to tell whether it has already - // been done. - //perror("epoll_ctl(DEL)"); - //exit(1); - } - - FD_CLR(fd, PrevReadEvent); - FD_CLR(fd, PrevWriteEvent); - } close(fd); fdtab[fd].state = FD_STCLOSE; @@ -2154,20 +2155,6 @@ int event_srv_write(int fd) { void client_retnclose(struct session *s, int len, const char *msg) { FD_CLR(s->cli_fd, StaticReadEvent); FD_SET(s->cli_fd, StaticWriteEvent); - if (cfg_use_epoll) { - struct epoll_event ev; - - ev.data.fd = s->cli_fd; - if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, s->cli_fd, &ev) < 0) { - // it's impossible to tell whether it has already - // been done. - //perror("epoll_ctl(DEL)"); - //exit(1); - } - - FD_CLR(s->cli_fd, PrevReadEvent); - FD_CLR(s->cli_fd, PrevWriteEvent); - } tv_eternity(&s->crexpire); shutdown(s->cli_fd, SHUT_RD); s->cli_state = CL_STSHUTR; @@ -4796,308 +4783,483 @@ int stats(void); #endif /* - * Main select() loop. + * This does 4 things : + * - wake up all expired tasks + * - call all runnable tasks + * - call maintain_proxies() to enable/disable the listeners + * - return the delay till next event in ms, -1 = wait indefinitely + * Note: this part should be rewritten with the O(ln(n)) scheduler. + * */ -void select_loop() { +int process_runnable_tasks() { int next_time; int time2; - int status; - int fd,i; - struct timeval delta; - int readnotnull, writenotnull; struct task *t, *tnext; + next_time = -1; /* set the timer to wait eternally first */ + + /* look for expired tasks and add them to the run queue. + */ + tnext = ((struct task *)LIST_HEAD(wait_queue))->next; + while ((t = tnext) != LIST_HEAD(wait_queue)) { /* we haven't looped ? */ + tnext = t->next; + if (t->state & TASK_RUNNING) + continue; + + /* wakeup expired entries. It doesn't matter if they are + * already running because of a previous event + */ + if (tv_cmp2_ms(&t->expire, &now) <= 0) { + task_wakeup(&rq, t); + } + else { + /* first non-runnable task. Use its expiration date as an upper bound */ + int temp_time = tv_remain(&now, &t->expire); + if (temp_time) + next_time = temp_time; + break; + } + } + + /* process each task in the run queue now. Each task may be deleted + * since we only use tnext. + */ + tnext = rq; + while ((t = tnext) != NULL) { + int temp_time; + + tnext = t->rqnext; + task_sleep(&rq, t); + temp_time = t->process(t); + next_time = MINTIME(temp_time, next_time); + } + + /* maintain all proxies in a consistent state. This should quickly become a task */ + time2 = maintain_proxies(); + return MINTIME(time2, next_time); +} + + +#if defined(ENABLE_EPOLL) + +/* + * Main epoll() loop. + */ + +/* does 3 actions : + * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures + * 1 (POLL_LOOP_ACTION_RUN) : runs the loop + * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up + * + * returns 0 if initialization failed, !0 otherwise. + */ + +int epoll_loop(int action) { + int next_time; + int status; + int fd; + + int fds, count; + int pr, pw, sr, sw; + unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */ + struct epoll_event ev; + + /* private data */ + static int last_maxfd = 0; + static fd_set *PrevReadEvent = NULL, *PrevWriteEvent = NULL; + static struct epoll_event *epoll_events = NULL; + static int epoll_fd; + + if (action == POLL_LOOP_ACTION_INIT) { + epoll_fd = epoll_create(global.maxsock + 1); + if (epoll_fd < 0) + return 0; + else { + epoll_events = (struct epoll_event*) + calloc(1, sizeof(struct epoll_event) * global.maxsock); + PrevReadEvent = (fd_set *) + calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE); + PrevWriteEvent = (fd_set *) + calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE); + } + return 1; + } + else if (action == POLL_LOOP_ACTION_CLEAN) { + if (PrevWriteEvent) free(PrevWriteEvent); + if (PrevReadEvent) free(PrevReadEvent); + if (epoll_events) free(epoll_events); + close(epoll_fd); + last_maxfd = 0; + epoll_fd = 0; + return 1; + } + + /* OK, it's POLL_LOOP_ACTION_RUN */ + tv_now(&now); while (1) { - next_time = -1; /* set the timer to wait eternally first */ - - /* look for expired tasks and add them to the run queue. - */ - tnext = ((struct task *)LIST_HEAD(wait_queue))->next; - while ((t = tnext) != LIST_HEAD(wait_queue)) { /* we haven't looped ? */ - tnext = t->next; - if (t->state & TASK_RUNNING) - continue; - - /* wakeup expired entries. It doesn't matter if they are - * already running because of a previous event - */ - if (tv_cmp2_ms(&t->expire, &now) <= 0) { - //fprintf(stderr,"task_wakeup(%p, %p)\n", &rq, t); - task_wakeup(&rq, t); - } - else { - /* first non-runnable task. Use its expiration date as an upper bound */ - int temp_time = tv_remain(&now, &t->expire); - if (temp_time) - next_time = temp_time; - //fprintf(stderr,"no_task_wakeup(%p, %p) : expire in %d ms\n", &rq, t, temp_time); - break; - } - } - - /* process each task in the run queue now. Each task may be deleted - * since we only use tnext. - */ - tnext = rq; - while ((t = tnext) != NULL) { - int temp_time; - - tnext = t->rqnext; - task_sleep(&rq, t); - //fprintf(stderr,"task %p\n",t); - temp_time = t->process(t); - next_time = MINTIME(temp_time, next_time); - //fprintf(stderr,"process(%p)=%d -> next_time=%d)\n", t, temp_time, next_time); - } - - //fprintf(stderr,"---end of run---\n"); - - /* maintain all proxies in a consistent state. This should quickly become a task */ - time2 = maintain_proxies(); - next_time = MINTIME(time2, next_time); + next_time = process_runnable_tasks(); /* stop when there's no connection left and we don't allow them anymore */ if (!actconn && listeners == 0) break; - #if STATTIME > 0 - time2 = stats(); - // fprintf(stderr," stats = %d\n", time2); - next_time = MINTIME(time2, next_time); + { + int time2; + time2 = stats(); + next_time = MINTIME(time2, next_time); + } #endif - if (cfg_use_epoll) { - /* use epoll() */ - int fds, count; - int pr, pw, sr, sw; - unsigned rn, ro, wn, wo; /* read new, read old, write new, write old */ - struct epoll_event ev; + /* + * We'll first check if some fds have been closed recently, in which case + * we'll have to remove them from the previous epoll set. It's + * unnecessary to call epoll_ctl(DEL) because close() automatically + * removes the fds from the epoll set. + */ + for (fd = maxfd; fd < last_maxfd; fd++) { + ev.data.fd = fd; + FD_CLR(fd, PrevReadEvent); + FD_CLR(fd, PrevWriteEvent); + } + last_maxfd = maxfd; - for (fds = 0; (fds << INTBITS) < maxfd; fds++) { - - rn = ((int*)StaticReadEvent)[fds]; ro = ((int*)PrevReadEvent)[fds]; - wn = ((int*)StaticWriteEvent)[fds]; wo = ((int*)PrevWriteEvent)[fds]; - - if ((ro^rn) | (wo^wn)) { - for (count = 1<> ((1<> ((1<> ((1<> ((1<> count) & 1; + pw = (wo >> count) & 1; + sr = (rn >> count) & 1; + sw = (wn >> count) & 1; #else - pr = FD_ISSET(fd&((1< 0) { /* FIXME */ - /* Convert to timeval */ - /* to avoid eventual select loops due to timer precision */ - next_time += SCHEDULER_RESOLUTION; - delta.tv_sec = next_time / 1000; - delta.tv_usec = (next_time % 1000) * 1000; - } - else if (next_time == 0) { /* allow select to return immediately when needed */ - delta.tv_sec = delta.tv_usec = 0; - } - - - /* let's restore fdset state */ - - readnotnull = 0; writenotnull = 0; - for (i = 0; i < (maxfd + FD_SETSIZE - 1)/(8*sizeof(int)); i++) { - readnotnull |= (*(((int*)ReadEvent)+i) = *(((int*)StaticReadEvent)+i)) != 0; - writenotnull |= (*(((int*)WriteEvent)+i) = *(((int*)StaticWriteEvent)+i)) != 0; - } - - // /* just a verification code, needs to be removed for performance */ - // for (i=0; i= 0) ? &delta : NULL); + ((int*)PrevReadEvent)[fds] = rn; + ((int*)PrevWriteEvent)[fds] = wn; + } + } - /* this is an experiment on the separation of the select work */ - // status = (readnotnull ? select(maxfd, ReadEvent, NULL, NULL, (next_time >= 0) ? &delta : NULL) : 0); - // status |= (writenotnull ? select(maxfd, NULL, WriteEvent, NULL, (next_time >= 0) ? &delta : NULL) : 0); + /* now let's wait for events */ + status = epoll_wait(epoll_fd, epoll_events, maxfd, next_time); + tv_now(&now); - tv_now(&now); - - if (status > 0) { /* must proceed with events */ - - int fds; - char count; + for (count = 0; count < status; count++) { + fd = epoll_events[count].data.fd; - for (fds = 0; (fds << INTBITS) < maxfd; fds++) - if ((((int *)(ReadEvent))[fds] | ((int *)(WriteEvent))[fds]) != 0) - for (count = 1< 0 + { + int time2; + time2 = stats(); + next_time = MINTIME(time2, next_time); + } +#endif + + + nbfd = 0; + for (fds = 0; (fds << INTBITS) < maxfd; fds++) { + + rn = ((int*)StaticReadEvent)[fds]; + wn = ((int*)StaticWriteEvent)[fds]; + + if ((rn|wn)) { + for (count = 0, fd = fds << INTBITS; count < (1<> count) & 1; + sw = (wn >> count) & 1; +#else + sr = FD_ISSET(fd&((1< 0 && count < nbfd; count++) { + fd = poll_events[count].fd; + + if (!poll_events[count].revents & ( POLLOUT | POLLIN | POLLERR | POLLHUP )) + continue; + + /* ok, we found one active fd */ + status--; + + if (fdtab[fd].state == FD_STCLOSE) + continue; + + if (poll_events[count].revents & ( POLLIN | POLLERR | POLLHUP )) + fdtab[fd].read(fd); + + if (fdtab[fd].state == FD_STCLOSE) + continue; + + if (poll_events[count].revents & ( POLLOUT | POLLERR | POLLHUP )) + fdtab[fd].write(fd); + } + } + return 1; +} +#endif + + + +/* + * Main select() loop. + */ + +/* does 3 actions : + * 0 (POLL_LOOP_ACTION_INIT) : initializes necessary private structures + * 1 (POLL_LOOP_ACTION_RUN) : runs the loop + * 2 (POLL_LOOP_ACTION_CLEAN) : cleans up + * + * returns 0 if initialization failed, !0 otherwise. + */ + + +int select_loop(int action) { + int next_time; + int status; + int fd,i; + struct timeval delta; + int readnotnull, writenotnull; + static fd_set *ReadEvent = NULL, *WriteEvent = NULL; + + if (action == POLL_LOOP_ACTION_INIT) { + ReadEvent = (fd_set *) + calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE); + WriteEvent = (fd_set *) + calloc(1, sizeof(fd_set) * (global.maxsock + FD_SETSIZE - 1) / FD_SETSIZE); + return 1; + } + else if (action == POLL_LOOP_ACTION_CLEAN) { + if (WriteEvent) free(WriteEvent); + if (ReadEvent) free(ReadEvent); + return 1; + } + + /* OK, it's POLL_LOOP_ACTION_RUN */ + + tv_now(&now); + + while (1) { + next_time = process_runnable_tasks(); + + /* stop when there's no connection left and we don't allow them anymore */ + if (!actconn && listeners == 0) + break; + +#if STATTIME > 0 + { + int time2; + time2 = stats(); + next_time = MINTIME(time2, next_time); + } +#endif + + + if (next_time > 0) { /* FIXME */ + /* Convert to timeval */ + /* to avoid eventual select loops due to timer precision */ + next_time += SCHEDULER_RESOLUTION; + delta.tv_sec = next_time / 1000; + delta.tv_usec = (next_time % 1000) * 1000; + } + else if (next_time == 0) { /* allow select to return immediately when needed */ + delta.tv_sec = delta.tv_usec = 0; + } + + + /* let's restore fdset state */ + + readnotnull = 0; writenotnull = 0; + for (i = 0; i < (maxfd + FD_SETSIZE - 1)/(8*sizeof(int)); i++) { + readnotnull |= (*(((int*)ReadEvent)+i) = *(((int*)StaticReadEvent)+i)) != 0; + writenotnull |= (*(((int*)WriteEvent)+i) = *(((int*)StaticWriteEvent)+i)) != 0; + } + + // /* just a verification code, needs to be removed for performance */ + // for (i=0; i= 0) ? &delta : NULL); + + /* this is an experiment on the separation of the select work */ + // status = (readnotnull ? select(maxfd, ReadEvent, NULL, NULL, (next_time >= 0) ? &delta : NULL) : 0); + // status |= (writenotnull ? select(maxfd, NULL, WriteEvent, NULL, (next_time >= 0) ? &delta : NULL) : 0); + + tv_now(&now); + + if (status > 0) { /* must proceed with events */ + + int fds; + char count; + + for (fds = 0; (fds << INTBITS) < maxfd; fds++) + if ((((int *)(ReadEvent))[fds] | ((int *)(WriteEvent))[fds]) != 0) + for (count = 1< + +/* epoll_ctl() commands */ +#define EPOLL_CTL_ADD 1 +#define EPOLL_CTL_DEL 2 +#define EPOLL_CTL_MOD 3 + +/* events types (bit fields) */ +#define EPOLLIN 1 +#define EPOLLPRI 2 +#define EPOLLOUT 4 +#define EPOLLERR 8 +#define EPOLLHUP 16 +#define EPOLLONESHOT (1 << 30) +#define EPOLLET (1 << 31) + +struct epoll_event { + uint32_t events; + struct { + void *ptr; + int fd; + uint32_t u32; + uint64_t u64; + } data; +}; + + +#if defined(__powerpc__) || defined(__powerpc64__) +#define __NR_epoll_create 236 +#define __NR_epoll_ctl 237 +#define __NR_epoll_wait 238 +#elif defined(__sparc__) || defined(__sparc64__) +#define __NR_epoll_create 193 +#define __NR_epoll_ctl 194 +#define __NR_epoll_wait 195 +#elif defined(__x86_64__) +#define __NR_epoll_create 213 +#define __NR_epoll_ctl 214 +#define __NR_epoll_wait 215 +#elif defined(__alpha__) +#define __NR_sys_epoll_create 407 +#define __NR_sys_epoll_ctl 408 +#define __NR_sys_epoll_wait 409 +#elif defined (__i386__) +#define __NR_epoll_create 254 +#define __NR_epoll_ctl 255 +#define __NR_epoll_wait 256 +#else +#warning unsupported architecture, guessing __NR_epoll_create=254 like x86... +#define __NR_epoll_create 254 +#define __NR_epoll_ctl 255 +#define __NR_epoll_wait 256 +#endif + +_syscall1 (int, epoll_create, int, size); +_syscall4 (int, epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event *, event); +_syscall4 (int, epoll_wait, int, epfd, struct epoll_event *, events, int, maxevents, int, timeout);