diff --git a/doc/configuration.txt b/doc/configuration.txt index 1ad0d4ea5..4a7c08034 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -6465,6 +6465,25 @@ page. Both means provide a CSV format whose fields follow. 31. tracked: id of proxy/server if tracking is enabled 32. type (0=frontend, 1=backend, 2=server) 33. rate (number of sessions per second over last elapsed second) + 36. check_status: status of last health check, one of: + UNK -> unknown + INI -> initializing + SOCKERR -> socket error + L4OK -> check passed on layer 4, no upper layers testing enabled + L4TOUT -> layer 1-4 timeout + L4CON -> layer 1-4 connection problem, for example "Connection refused" + (tcp rst) or "No route to host" (icmp) + L6OK -> check passed on layer 6 + L6TOUT -> layer 6 (SSL) timeout + L6RSP -> layer 6 invalid response - protocol error + L7OK -> check passed on layer 7 + L7OKC -> check conditionally passed on layer 7, for example 404 with + disable-on-404 + L7TOUT -> layer 7 (HTTP/SMTP) timeout + L7RSP -> layer 7 invalid response - protocol error + L7STS -> layer 7 response error, for example HTTP 5xx + 37. check_code: layer 5-7 code, if available + 38. check_duration: time in ms taken to finish the last health check 9.2. Unix Socket commands diff --git a/include/proto/checks.h b/include/proto/checks.h index 6f0aa8b3b..bd701645d 100644 --- a/include/proto/checks.h +++ b/include/proto/checks.h @@ -25,6 +25,8 @@ #include #include +const char *get_check_status_description(short check_status); +const char *get_check_status_info(short check_status); struct task *process_chk(struct task *t); int start_checks(); diff --git a/include/types/checks.h b/include/types/checks.h new file mode 100644 index 000000000..87ff2c8e6 --- /dev/null +++ b/include/types/checks.h @@ -0,0 +1,41 @@ +/* + * Health-checks. 
+ * + * Copyright 2008-2009 Krzysztof Piotr Oledzki + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +/* health check status codes; the order matters because callers compare a status with >= against the CHECKED and L57DATA markers */ +enum { + HCHK_STATUS_UNKNOWN = 0, /* Unknown */ + HCHK_STATUS_INI, /* Initializing */ + + /* Statuses below this point mean the check has finished */ + HCHK_STATUS_CHECKED, /* DUMMY STATUS: threshold marker only */ + HCHK_STATUS_SOCKERR, /* Socket error */ + + HCHK_STATUS_L4OK, /* L4 check passed, for example tcp connect */ + HCHK_STATUS_L4TOUT, /* L4 timeout */ + HCHK_STATUS_L4CON, /* L4 connection problem, for example: */ + /* "Connection refused" (tcp rst) or "No route to host" (icmp) */ + + HCHK_STATUS_L6OK, /* L6 check passed */ + HCHK_STATUS_L6TOUT, /* L6 (SSL) timeout */ + HCHK_STATUS_L6RSP, /* L6 invalid response - protocol error */ + + HCHK_STATUS_L7TOUT, /* L7 (HTTP/SMTP) timeout */ + HCHK_STATUS_L7RSP, /* L7 invalid response - protocol error */ + + /* Statuses below this point have layer 5-7 data (check_code) available */ + HCHK_STATUS_L57DATA, /* DUMMY STATUS: threshold marker only */ + HCHK_STATUS_L7OKD, /* L7 check passed */ + HCHK_STATUS_L7OKCD, /* L7 check conditionally passed */ + HCHK_STATUS_L7STS, /* L7 response error, for example HTTP 5xx */ + + HCHK_STATUS_SIZE /* number of entries, used to size the lookup tables */ +}; diff --git a/include/types/server.h b/include/types/server.h index f634b8a18..3304004ac 100644 --- a/include/types/server.h +++ b/include/types/server.h @@ -35,6 +35,7 @@ #include #include #include +#include /* server flags */ @@ -74,7 +75,7 @@ struct server { struct server *next; int state; /* server state (SRV_*) */ int prev_state; /* server state before last change (SRV_*) */ - int cklen; /* the len of the cookie, to speed up checks */ + int cklen; /* the len of the cookie, to speed up checks */ int rdr_len; /* the length of the redirection prefix */ char *cookie; /* the id set in the cookie */ char *rdr_pfx; /* the redirection prefix */ @@ 
-121,9 +122,12 @@ struct server { long long failed_checks, down_trans; /* failed checks and up-down transitions */ unsigned down_time; /* total time the server was down */ time_t last_change; /* last time, when the state was changed */ + struct timeval check_start; /* last health check start time */ + unsigned long check_duration; /* time in ms took to finish last health check */ + short check_status, check_code; /* check result, check code */ long long failed_conns, failed_resp; /* failed connect() and responses */ - long long retries, redispatches; /* retried and redispatched connections */ + long long retries, redispatches; /* retried and redispatched connections */ long long failed_secu; /* blocked responses because of security concerns */ struct freq_ctr sess_per_sec; /* sessions per second on this server */ unsigned int sps_max; /* maximum of new sessions per second seen on this server */ diff --git a/src/cfgparse.c b/src/cfgparse.c index 382881225..b36470bd0 100644 --- a/src/cfgparse.c +++ b/src/cfgparse.c @@ -2643,6 +2643,8 @@ int cfg_parse_listen(const char *file, int linenum, char **args, int kwm) err_code |= ERR_ALERT | ERR_FATAL; goto out; } + + newsrv->check_status = HCHK_STATUS_INI; newsrv->state |= SRV_CHECKED; } diff --git a/src/checks.c b/src/checks.c index 99c21224f..278f0c1c2 100644 --- a/src/checks.c +++ b/src/checks.c @@ -2,7 +2,7 @@ * Health-checks functions. 
* * Copyright 2000-2009 Willy Tarreau - * Copyright 2007-2008 Krzysztof Piotr Oledzki + * Copyright 2007-2009 Krzysztof Piotr Oledzki * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -34,6 +35,7 @@ #include #include +#include #include #include #include @@ -45,6 +47,102 @@ #include #include +const char *check_status_description[HCHK_STATUS_SIZE] = { + [HCHK_STATUS_UNKNOWN] = "Unknown", + + [HCHK_STATUS_INI] = "Initializing", + + [HCHK_STATUS_SOCKERR] = "Socket error", + + [HCHK_STATUS_L4OK] = "Layer4 check passed", + [HCHK_STATUS_L4TOUT] = "Layer4 timeout", + [HCHK_STATUS_L4CON] = "Layer4 connection problem", + + [HCHK_STATUS_L6OK] = "Layer6 check passed", + [HCHK_STATUS_L6TOUT] = "Layer6 timeout", + [HCHK_STATUS_L6RSP] = "Layer6 invalid response", + + [HCHK_STATUS_L7TOUT] = "Layer7 timeout", + [HCHK_STATUS_L7RSP] = "Layer7 invalid response", + + [HCHK_STATUS_L7OKD] = "Layer7 check passed", + [HCHK_STATUS_L7OKCD] = "Layer7 check conditionally passed", + [HCHK_STATUS_L7STS] = "Layer7 wrong status", +}; + + +const char *check_status_info[HCHK_STATUS_SIZE] = { + [HCHK_STATUS_UNKNOWN] = "UNK", + + [HCHK_STATUS_INI] = "INI", + + [HCHK_STATUS_SOCKERR] = "SOCKERR", + + [HCHK_STATUS_L4OK] = "L4OK", + [HCHK_STATUS_L4TOUT] = "L4TOUT", + [HCHK_STATUS_L4CON] = "L4CON", + + [HCHK_STATUS_L6OK] = "L6OK", + [HCHK_STATUS_L6TOUT] = "L6TOUT", + [HCHK_STATUS_L6RSP] = "L6RSP", + + [HCHK_STATUS_L7TOUT] = "L7TOUT", + [HCHK_STATUS_L7RSP] = "L7RSP", + + [HCHK_STATUS_L7OKD] = "L7OK", + [HCHK_STATUS_L7OKCD] = "L7OKC", + [HCHK_STATUS_L7STS] = "L7STS", +}; + +/* + * Convert check_status code to description + */ +const char *get_check_status_description(short check_status) { + + const char *desc; + + if (check_status < HCHK_STATUS_SIZE) + desc = check_status_description[check_status]; + else + desc = NULL; + + if (desc && *desc) + return desc; + 
else + return check_status_description[HCHK_STATUS_UNKNOWN]; +} + +/* + * Convert check_status code to short info + */ +const char *get_check_status_info(short check_status) { + + const char *info; + + if (check_status < HCHK_STATUS_SIZE) + info = check_status_info[check_status]; + else + info = NULL; + + if (info && *info) + return info; + else + return check_status_info[HCHK_STATUS_UNKNOWN]; +} + +/* + * Set check_status and update check_duration + */ +static void set_server_check_status(struct server *s, short status) { + + if (tv_iszero(&s->check_start)) + return; + + s->check_status = status; + s->check_duration = tv_ms_elapsed(&s->check_start, &now); + tv_zero(&s->check_start); +} + /* sends a log message when a backend goes down, and also sets last * change date. */ @@ -145,6 +243,12 @@ static void set_server_down(struct server *s) chunk_printf(&msg, sizeof(trash), " via %s/%s", s->tracked->proxy->id, s->tracked->id); + chunk_printf(&msg, sizeof(trash), ", reason: %s", get_check_status_description(s->check_status)); + if (s->check_status >= HCHK_STATUS_L57DATA) + chunk_printf(&msg, sizeof(trash), ", code: %d", s->check_code); + + chunk_printf(&msg, sizeof(trash), ", check duration: %lums", s->check_duration); + chunk_printf(&msg, sizeof(trash), ". %d active and %d backup servers left.%s" " %d sessions active, %d requeued, %d remaining in queue.\n", s->proxy->srv_act, s->proxy->srv_bck, @@ -220,6 +324,10 @@ static void set_server_up(struct server *s) { chunk_printf(&msg, sizeof(trash), " via %s/%s", s->tracked->proxy->id, s->tracked->id); + chunk_printf(&msg, sizeof(trash), ", reason: %s", get_check_status_description(s->check_status)); + if (s->check_status >= HCHK_STATUS_L57DATA) + chunk_printf(&msg, sizeof(trash), ", code: %d", s->check_code); + chunk_printf(&msg, sizeof(trash), ". 
%d active and %d backup servers online.%s" " %d sessions requeued, %d total in queue.\n", s->proxy->srv_act, s->proxy->srv_bck, @@ -339,8 +447,10 @@ static int event_srv_chk_w(int fd) struct server *s = t->context; //fprintf(stderr, "event_srv_chk_w, state=%ld\n", unlikely(fdtab[fd].state)); - if (unlikely(fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))) + if (unlikely(fdtab[fd].state == FD_STERROR || (fdtab[fd].ev & FD_POLL_ERR))) { + set_server_check_status(s, HCHK_STATUS_L4CON); goto out_error; + } /* here, we know that the connection is established */ @@ -370,8 +480,19 @@ static int event_srv_chk_w(int fd) } else if (ret == 0 || errno == EAGAIN) goto out_poll; - else + else { + switch (errno) { + case ECONNREFUSED: + case ENETUNREACH: + set_server_check_status(s, HCHK_STATUS_L4CON); + break; + + default: + set_server_check_status(s, HCHK_STATUS_SOCKERR); + } + goto out_error; + } } else { /* We have no data to send to check the connection, and @@ -395,11 +516,14 @@ static int event_srv_chk_w(int fd) if (errno == EALREADY || errno == EINPROGRESS) goto out_poll; - if (errno && errno != EISCONN) + if (errno && errno != EISCONN) { + set_server_check_status(s, HCHK_STATUS_L4CON); goto out_error; + } /* good TCP connection is enough */ s->result |= SRV_CHK_RUNNING; + set_server_check_status(s, HCHK_STATUS_L4OK); goto out_wakeup; } } @@ -416,6 +540,7 @@ static int event_srv_chk_w(int fd) return 0; out_error: s->result |= SRV_CHK_ERROR; + /* set_server_check_status() called bofore goto into this label */ fdtab[fd].state = FD_STERROR; goto out_wakeup; } @@ -448,7 +573,12 @@ static int event_srv_chk_r(int fd) (getsockopt(fd, SOL_SOCKET, SO_ERROR, &skerr, &lskerr) == -1) || (skerr != 0))) { /* in case of TCP only, this tells us if the connection failed */ + + if (!(s->result & SRV_CHK_ERROR)) + set_server_check_status(s, HCHK_STATUS_SOCKERR); + s->result |= SRV_CHK_ERROR; + goto out_wakeup; } @@ -467,40 +597,67 @@ static int event_srv_chk_r(int fd) if 
(s->proxy->options & PR_O_HTTP_CHK) { /* Check if the server speaks HTTP 1.X */ if ((len < strlen("HTTP/1.0 000\r")) || - (memcmp(trash, "HTTP/1.", 7) != 0)) { + (memcmp(trash, "HTTP/1.", 7) != 0 || + (trash[12] != ' ' && trash[12] != '\r')) || + !isdigit(trash[9]) || !isdigit(trash[10]) || !isdigit(trash[11])) { s->result |= SRV_CHK_ERROR; + set_server_check_status(s, HCHK_STATUS_L7RSP); goto out_wakeup; } + s->check_code = str2uic(&trash[9]); + /* check the reply : HTTP/1.X 2xx and 3xx are OK */ - if (trash[9] == '2' || trash[9] == '3') + if (trash[9] == '2' || trash[9] == '3') { s->result |= SRV_CHK_RUNNING; - else if ((s->proxy->options & PR_O_DISABLE404) && + set_server_check_status(s, HCHK_STATUS_L7OKD); + } else if ((s->proxy->options & PR_O_DISABLE404) && (s->state & SRV_RUNNING) && - (memcmp(&trash[9], "404", 3) == 0)) { + (s->check_code == 404)) { /* 404 may be accepted as "stopping" only if the server was up */ s->result |= SRV_CHK_RUNNING | SRV_CHK_DISABLE; + set_server_check_status(s, HCHK_STATUS_L7OKCD); } - else + else { s->result |= SRV_CHK_ERROR; + set_server_check_status(s, HCHK_STATUS_L7STS); + } } else if (s->proxy->options & PR_O_SSL3_CHK) { /* Check for SSLv3 alert or handshake */ - if ((len >= 5) && (trash[0] == 0x15 || trash[0] == 0x16)) + if ((len >= 5) && (trash[0] == 0x15 || trash[0] == 0x16)) { s->result |= SRV_CHK_RUNNING; - else + set_server_check_status(s, HCHK_STATUS_L6OK); + } else { s->result |= SRV_CHK_ERROR; + set_server_check_status(s, HCHK_STATUS_L6RSP); + } } else if (s->proxy->options & PR_O_SMTP_CHK) { - /* Check for SMTP code 2xx (should be 250) */ - if ((len >= 3) && (trash[0] == '2')) - s->result |= SRV_CHK_RUNNING; - else + /* Check if the server speaks SMTP */ + if ((len < strlen("000\r")) || + (trash[3] != ' ' && trash[3] != '\r') || + !isdigit(trash[0]) || !isdigit(trash[1]) || !isdigit(trash[2])) { s->result |= SRV_CHK_ERROR; + set_server_check_status(s, HCHK_STATUS_L7RSP); + goto out_wakeup; + } + + s->check_code = 
str2uic(&trash[0]); + + /* Check for SMTP code 2xx (should be 250) */ + if (trash[0] == '2') { + s->result |= SRV_CHK_RUNNING; + set_server_check_status(s, HCHK_STATUS_L7OKD); + } else { + s->result |= SRV_CHK_ERROR; + set_server_check_status(s, HCHK_STATUS_L7STS); + } } else { /* other checks are valid if the connection succeeded anyway */ s->result |= SRV_CHK_RUNNING; + set_server_check_status(s, HCHK_STATUS_L4OK); } out_wakeup: @@ -551,6 +708,7 @@ struct task *process_chk(struct task *t) /* we'll initiate a new check */ s->result = SRV_CHK_UNKNOWN; /* no result yet */ + s->check_start = now; if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) != -1) { if ((fd < global.maxsock) && (fcntl(fd, F_SETFL, O_NONBLOCK) != -1) && @@ -626,6 +784,7 @@ struct task *process_chk(struct task *t) if (ret) { s->result |= SRV_CHK_ERROR; + set_server_check_status(s, HCHK_STATUS_SOCKERR); switch (ret) { case 1: Alert("Cannot bind to source address before connect() for server %s/%s. Aborting.\n", @@ -657,6 +816,7 @@ struct task *process_chk(struct task *t) ret = tcpv4_bind_socket(fd, flags, &s->proxy->source_addr, remote); if (ret) { s->result |= SRV_CHK_ERROR; + set_server_check_status(s, HCHK_STATUS_SOCKERR); switch (ret) { case 1: Alert("Cannot bind to source address before connect() for %s '%s'. Aborting.\n", @@ -714,6 +874,17 @@ struct task *process_chk(struct task *t) } else if (errno != EALREADY && errno != EISCONN && errno != EAGAIN) { s->result |= SRV_CHK_ERROR; /* a real error */ + + switch (errno) { + /* FIXME: is it possible to get ECONNREFUSED/ENETUNREACH with O_NONBLOCK? 
*/ + case ECONNREFUSED: + case ENETUNREACH: + set_server_check_status(s, HCHK_STATUS_L4CON); + break; + + default: + set_server_check_status(s, HCHK_STATUS_SOCKERR); + } } } } @@ -808,6 +979,17 @@ struct task *process_chk(struct task *t) goto new_chk; } else if ((s->result & SRV_CHK_ERROR) || tick_is_expired(t->expire, now_ms)) { + if (!(s->result & SRV_CHK_ERROR)) { + if (!EV_FD_ISSET(fd, DIR_RD)) { + set_server_check_status(s, HCHK_STATUS_L4TOUT); + } else { + if (s->proxy->options & PR_O_SSL3_CHK) + set_server_check_status(s, HCHK_STATUS_L6TOUT); + else /* HTTP, SMTP */ + set_server_check_status(s, HCHK_STATUS_L7TOUT); + } + } + //fprintf(stderr, "process_chk: 10\n"); /* failure or timeout detected */ if (s->health > s->rise) { @@ -894,6 +1076,7 @@ int start_checks() { t->expire = tick_add(now_ms, MS_TO_TICKS(((mininter && mininter >= srv_getinter(s)) ? mininter : srv_getinter(s)) * srvpos / nbchk)); + s->check_start = now; task_queue(t); srvpos++; diff --git a/src/dumpstats.c b/src/dumpstats.c index 86e73a95f..681ae8931 100644 --- a/src/dumpstats.c +++ b/src/dumpstats.c @@ -2,6 +2,7 @@ * Functions dedicated to statistics output * * Copyright 2000-2009 Willy Tarreau + * Copyright 2007-2009 Krzysztof Piotr Oledzki * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -39,6 +40,7 @@ #include #include +#include #include #include #include @@ -222,6 +224,7 @@ int print_csv_header(struct chunk *msg, int size) "chkfail,chkdown,lastchg,downtime,qlimit," "pid,iid,sid,throttle,lbtot,tracked,type," "rate,rate_lim,rate_max," + "check_status,check_code,check_duration," "\n"); } @@ -860,7 +863,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri) "Session rateSessions" "BytesDenied" "ErrorsWarnings" - "Server" + "Server" "\n" "" "CurMaxLimit" @@ -868,7 +871,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri) "LimitTotalLbTotInOut" 
"ReqRespReqConn" "RespRetrRedis" - "StatusWghtAct" + "StatusLastChkWghtAct" "BckChkDwnDwntme" "Thrtle\n" "", @@ -912,7 +915,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri) /* server status : reflect frontend status */ "%s" /* rest of server: nothing */ - "" + "" "", U2H0(px->denied_req), U2H1(px->denied_resp), U2H2(px->failed_req), @@ -938,8 +941,10 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri) ",,,,,,,," /* pid, iid, sid, throttle, lbtot, tracked, type */ "%d,%d,0,,,,%d," - /* rate, rate_lim, rate_max, */ + /* rate, rate_lim, rate_max */ "%u,%u,%u," + /* check_status, check_code, check_duration */ + ",,," "\n", px->id, px->feconn, px->feconn_max, px->maxconn, px->cum_feconn, @@ -1044,17 +1049,31 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri) U2H3(sv->failed_conns), U2H4(sv->failed_resp), sv->retries, sv->redispatches); - /* status */ + /* status, lest check */ chunk_printf(&msg, sizeof(trash), ""); - if (sv->state & SRV_CHECKED) + if (sv->state & SRV_CHECKED) { chunk_printf(&msg, sizeof(trash), "%s ", human_time(now.tv_sec - sv->last_change, 1)); - chunk_printf(&msg, sizeof(trash), - srv_hlt_st[sv_state], - (svs->state & SRV_RUNNING) ? (svs->health - svs->rise + 1) : (svs->health), - (svs->state & SRV_RUNNING) ? (svs->fall) : (svs->rise)); + chunk_printf(&msg, sizeof(trash), + srv_hlt_st[sv_state], + (svs->state & SRV_RUNNING) ? (svs->health - svs->rise + 1) : (svs->health), + (svs->state & SRV_RUNNING) ? 
(svs->fall) : (svs->rise)); + + chunk_printf(&msg, sizeof(trash), " %s%s", + get_check_status_description(sv->check_status), + tv_iszero(&sv->check_start)?"":"* ", + get_check_status_info(sv->check_status)); + + if (sv->check_status >= HCHK_STATUS_L57DATA) + chunk_printf(&msg, sizeof(trash), "/%d", sv->check_code); + + if (sv->check_status >= HCHK_STATUS_CHECKED) + chunk_printf(&msg, sizeof(trash), " in %lums", sv->check_duration); + } else { + chunk_printf(&msg, sizeof(trash), ""); + } chunk_printf(&msg, sizeof(trash), /* weight */ @@ -1180,6 +1199,26 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri) read_freq_ctr(&sv->sess_per_sec), sv->sps_max); + if (sv->state & SRV_CHECKED) { + /* check_status */ + chunk_printf(&msg, sizeof(trash), "%s,", get_check_status_info(sv->check_status)); + + /* check_code */ + if (sv->check_status >= HCHK_STATUS_L57DATA) + chunk_printf(&msg, sizeof(trash), "%u,", sv->check_code); + else + chunk_printf(&msg, sizeof(trash), ","); + + /* check_duration */ + if (sv->check_status >= HCHK_STATUS_CHECKED) + chunk_printf(&msg, sizeof(trash), "%lu,", sv->check_duration); + else + chunk_printf(&msg, sizeof(trash), ","); + + } else { + chunk_printf(&msg, sizeof(trash), ",,,"); + } + /* finish with EOL */ chunk_printf(&msg, sizeof(trash), "\n"); } @@ -1228,7 +1267,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri) * if the backend has known working servers or if it has no server at * all (eg: for stats). Then we display the total weight, number of * active and backups. */ - "%s %s%d" + "%s %s %d" "%d%d" "", U2H0(px->denied_req), U2H1(px->denied_resp), @@ -1276,6 +1315,8 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri) "%d,%d,0,,%lld,,%d," /* rate, rate_lim, rate_max, */ "%u,,%u," + /* check_status, check_code, check_duration */ + ",,," "\n", px->id, px->nbpend /* or px->totpend ? */, px->nbpend_max,