diff --git a/Makefile b/Makefile index 1480fafff..51bbefc6f 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,7 @@ # USE_LINUX_SPLICE : enable kernel 2.6 splicing (broken on old kernels) # USE_LIBCRYPT : enable crypted passwords using -lcrypt # USE_CRYPT_H : set it if your system requires including crypt.h +# USE_VSYSCALL : enable vsyscall on Linux x86, bypassing libc # # Options can be forced by specifying "USE_xxx=1" or can be disabled by using # "USE_xxx=" (empty string). @@ -374,6 +375,12 @@ OPTIONS_OBJS += src/ev_kqueue.o BUILD_OPTIONS += $(call ignore_implicit,USE_KQUEUE) endif +ifneq ($(USE_VSYSCALL),) +OPTIONS_OBJS += src/i386-linux-vsys.o +OPTIONS_CFLAGS += -DCONFIG_HAP_LINUX_VSYSCALL +BUILD_OPTIONS += $(call ignore_implicit,USE_VSYSCALL) +endif + ifneq ($(USE_NETFILTER),) OPTIONS_CFLAGS += -DNETFILTER BUILD_OPTIONS += $(call ignore_implicit,USE_NETFILTER) diff --git a/src/i386-linux-vsys.c b/src/i386-linux-vsys.c new file mode 100644 index 000000000..a4927e2d2 --- /dev/null +++ b/src/i386-linux-vsys.c @@ -0,0 +1,204 @@ +/* + * Fast system call support for x86 on Linux + * + * Copyright 2010 Willy Tarreau + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Recent kernels support a faster syscall ABI on x86 using the VDSO page, but + * some libc that are built for CPUs earlier than i686 do not implement it. + * This code bypasses the libc when the VDSO is detected. It should only be + * used when it's sure that the libc really does not support the VDSO, but + * fixing the libc is preferred. Using the VDSO can improve the overall + * performance by about 10%. + */ + +#if defined(__linux__) && defined(__i386__) +/* Silently ignore other platforms to be friendly with distro packagers */ + +#include +#include + +void int80(void); /* declared in the assembler code */ +static void *vsyscall = &int80; /* initialize vsyscall to use int80 by default */ +static __attribute__((used)) unsigned int back_ebx; + +/* now we redefine some frequently used syscalls. Epoll_create is defined too + * in order to replace old disabled implementations. + */ +asm +( + "epoll_create: .GLOBL epoll_create\n" + " mov $0xfe, %eax\n" + " mov %ebx, back_ebx\n" + " mov 4(%esp), %ebx\n" + " jmp do_syscall\n" + + "epoll_ctl: .GLOBL epoll_ctl\n" + " push %esi\n" + " mov $0xff, %eax\n" + " mov %ebx, back_ebx\n" + " mov 20(%esp), %esi\n" + " mov 16(%esp), %edx\n" + " mov 12(%esp), %ecx\n" + " mov 8(%esp), %ebx\n" + " call do_syscall\n" + " pop %esi\n" + " ret\n" + + "epoll_wait: .GLOBL epoll_wait\n" + " push %esi\n" + " mov $0x100, %eax\n" + " mov %ebx, back_ebx\n" + " mov 20(%esp), %esi\n" + " mov 16(%esp), %edx\n" + " mov 12(%esp), %ecx\n" + " mov 8(%esp), %ebx\n" + " call do_syscall\n" + " pop %esi\n" + " ret\n" + + "splice: .GLOBL splice\n" + " push %ebp\n" + " push %edi\n" + " push %esi\n" + " mov $0x139, %eax\n" + " mov %ebx, back_ebx\n" + " mov 36(%esp), %ebp\n" + " mov 32(%esp), %edi\n" + " mov 28(%esp), %esi\n" + " mov 24(%esp), %edx\n" + " mov 20(%esp), %ecx\n" + " mov 16(%esp), %ebx\n" + " call do_syscall\n" + " pop %esi\n" + " pop %edi\n" + " pop %ebp\n" + " ret\n" + + "close: .GLOBL close\n" + " mov $0x06, %eax\n" + " mov %ebx, back_ebx\n" + " mov 4(%esp), %ebx\n" + " jmp do_syscall\n" + + "gettimeofday: .GLOBL gettimeofday\n" + " mov $0x4e, %eax\n" + " mov %ebx, back_ebx\n" + " mov 8(%esp), %ecx\n" + " mov 4(%esp), %ebx\n" + " jmp do_syscall\n" + + "fcntl: .GLOBL fcntl\n" + " mov $0xdd, %eax\n" + " mov %ebx, back_ebx\n" + " mov 12(%esp), %edx\n" + " mov 8(%esp), %ecx\n" + " mov 4(%esp), %ebx\n" + " jmp do_syscall\n" + + "socket: .GLOBL socket\n" + " mov $0x01, %eax\n" + " jmp socketcall\n" + + "bind: .GLOBL bind\n" + " mov $0x02, %eax\n" + " jmp socketcall\n" + + "connect: .GLOBL connect\n" + " mov $0x03, %eax\n" + " jmp socketcall\n" + + "listen: .GLOBL listen\n" + " mov $0x04, %eax\n" + " jmp socketcall\n" + + "accept: .GLOBL accept\n" + " mov $0x05, %eax\n" + " jmp socketcall\n" + + "getsockname: .GLOBL getsockname\n" + " mov $0x06, %eax\n" + " jmp socketcall\n" + + "send: .GLOBL send\n" + " mov $0x09, %eax\n" + " jmp socketcall\n" + + "recv: .GLOBL recv\n" + " mov $0x0a, %eax\n" + " jmp socketcall\n" + + "shutdown: .GLOBL shutdown\n" + " mov $0x0d, %eax\n" + " jmp socketcall\n" + + "setsockopt: .GLOBL setsockopt\n" + " mov $0x0e, %eax\n" + " jmp socketcall\n" + + "getsockopt: .GLOBL getsockopt\n" + " mov $0x0f, %eax\n" + " jmp socketcall\n" + + "socketcall:\n" + " mov %ebx, back_ebx\n" + " mov %eax, %ebx\n" + " mov $0x66, %eax\n" + " lea 4(%esp), %ecx\n" + /* fall through */ + + "do_syscall:\n" + " call *vsyscall\n" // always valid, may be int80 or vsyscall + " mov back_ebx, %ebx\n" + " cmpl $0xfffff000, %eax\n" // consider -4096..-1 for errno + " jae 0f\n" + " ret\n" + "0:\n" // error handling + " neg %eax\n" // get errno value + " push %eax\n" // save it + " call __errno_location\n" + " popl (%eax)\n" // store the pushed errno into the proper location + " mov $-1, %eax\n" // and return -1 + " ret\n" + + "int80:\n" // default compatible calling convention + " int $0x80\n" + " ret\n" +); + +__attribute__((constructor)) +static void __i386_linux_vsyscall_init(void) +{ + /* We can get the pointer by resolving the __kernel_vsyscall symbol + * from the "linux-gate.so.1" virtual shared object, but this requires + * libdl. Or we can also know that the vsyscall pointer is always + * located at 0xFFFFE018 when /proc/sys/abi/vsyscall32 contains the + * default value 2. So we can use that once we've checked that we can + * access it without faulting. The dlsym method will also work when + * vsyscall32 = 1, which randomizes the VDSO address. + */ +#ifdef USE_VSYSCALL_DLSYM + void *handle = dlopen("linux-gate.so.1", RTLD_NOW); + if (handle) { + void *ptr = dlsym(handle, "__kernel_vsyscall"); + dlclose(handle); + if (ptr) + vsyscall = ptr; + } +#else + /* Heuristic: trying to mprotect() the VDSO area will only succeed if + * it is mapped. + */ + if (mprotect((void *)0xffffe000, 4096, PROT_READ|PROT_EXEC) == 0) { + unsigned long ptr = *(unsigned long *)0xFFFFE018; /* VDSO is mapped */ + if ((ptr & 0xFFFFE000) == 0xFFFFE000) + vsyscall = (void *)ptr; + } +#endif +} + +#endif /* defined(__linux__) && defined(__i386__) */ diff --git a/src/stream_sock.c b/src/stream_sock.c index 598a1a02f..b9b55b740 100644 --- a/src/stream_sock.c +++ b/src/stream_sock.c @@ -77,8 +77,12 @@ #define __NR_splice 313 #endif /* $arch */ +#if defined(CONFIG_HAP_LINUX_VSYSCALL) && defined(__linux__) && defined(__i386__) +/* the syscall is redefined somewhere else */ +extern int splice(int fdin, loff_t *off_in, int fdout, loff_t *off_out, size_t len, unsigned long flags); +#else _syscall6(int, splice, int, fdin, loff_t *, off_in, int, fdout, loff_t *, off_out, size_t, len, unsigned long, flags) - +#endif #endif /* __NR_splice */ /* A pipe contains 16 segments max, and it's common to see segments of 1448 bytes