From 85130941e7ba66656e302a68a211c3834d7d5a93 Mon Sep 17 00:00:00 2001 From: Krzysztof Oledzki Date: Mon, 22 Oct 2007 16:21:10 +0200 Subject: [PATCH] [MEDIUM] stats: report server and backend cumulated downtime Hello, This patch implements new statistics for SLA calculation by adding new field 'Dwntime' with total down time since restart (both HTTP/CSV) and extending status field (HTTP) or inserting a new one (CSV) with time showing how long each server/backend is in a current state. Additionaly, down transations are also calculated and displayed for backends, so it is possible to know how many times selected backend was down, generating "No server is available to handle this request." error. New information are presentetd in two different ways: - for HTTP: a "human redable form", one of "100000d 23h", "23h 59m" or "59m 59s" - for CSV: seconds I believe that seconds resolution is enough. As there are more columns in the status page I decided to shrink some names to make more space: - Weight -> Wght - Check -> Chk - Down -> Dwn Making described changes I also made some improvements and fixed some small bugs: - don't increment s->health above 's->rise + s->fall - 1'. Previously it was incremented an then (re)set to 's->rise + s->fall - 1'. - do not set server down if it is down already - do not set server up if it is up already - fix colspan in multiple places (mostly introduced by my previous patch) - add missing "status" header to CSV - fix order of retries/redispatches in server (CSV) - s/Tthen/Then/ - s/server/backend/ in DATA_ST_PX_BE (dumpstats.c) Changes from previous version: - deal with negative time intervales - don't relay on s->state (SRV_RUNNING) - little reworked human_time + compacted format (no spaces). If needed it can be used in the future for other purposes by optionally making "cnt" as an argument - leave set_server_down mostly unchanged - only little reworked "process_chk: 9" - additional fields in CSV are appended to the rigth - fix "SEC" macro - named arguments (human_time, be_downtime, srv_downtime) Hope it is OK. If there are only cosmetic changes needed please fill free to correct it, however if there are some bigger changes required I would like to discuss it first or at last to know what exactly was changed especially since I already put this patch into my production server. :) Thank you, Best regards, Krzysztof Oledzki --- include/common/time.h | 6 +++ include/proto/backend.h | 2 +- include/proto/server.h | 2 +- include/types/proxy.h | 5 +++ include/types/server.h | 4 ++ src/backend.c | 7 ++++ src/cfgparse.c | 4 +- src/checks.c | 35 ++++++++++++----- src/dumpstats.c | 86 +++++++++++++++++++++++++---------------- src/server.c | 9 +++++ src/time.c | 34 ++++++++++++++++ 11 files changed, 148 insertions(+), 46 deletions(-) diff --git a/include/common/time.h b/include/common/time.h index 001d63e95..ced4b6bd9 100644 --- a/include/common/time.h +++ b/include/common/time.h @@ -27,6 +27,11 @@ #include #include +#define SEC 1 +#define MINUTE (60 * SEC) +#define HOUR (60 * MINUTE) +#define DAY (24 * HOUR) + /* eternity when exprimed in timeval */ #ifndef TV_ETERNITY #define TV_ETERNITY (~0UL) @@ -480,6 +485,7 @@ REGPRM3 static inline struct timeval *__tv_ms_add(struct timeval *tv, const stru tv1; \ }) +char *human_time(int t, short hz); #endif /* _COMMON_TIME_H */ diff --git a/include/proto/backend.h b/include/proto/backend.h index 467e75680..d70c68faf 100644 --- a/include/proto/backend.h +++ b/include/proto/backend.h @@ -39,7 +39,7 @@ int srv_redispatch_connect(struct session *t); void recount_servers(struct proxy *px); void recalc_server_map(struct proxy *px); - +int be_downtime(struct proxy *px); /* * This function tries to find a running server with free connection slots for diff --git a/include/proto/server.h b/include/proto/server.h index 2df0763a2..27e4f367d 100644 --- a/include/proto/server.h +++ b/include/proto/server.h @@ -31,7 +31,7 @@ #include - +int srv_downtime(struct server *s); #endif /* _PROTO_SERVER_H */ diff --git a/include/types/proxy.h b/include/types/proxy.h index ced7630e3..04d116272 100644 --- a/include/types/proxy.h +++ b/include/types/proxy.h @@ -111,6 +111,11 @@ struct proxy { unsigned int maxconn; /* max # of active sessions on the frontend */ unsigned int fullconn; /* #conns on backend above which servers are used at full load */ struct in_addr except_net, except_mask; /* don't x-forward-for for this address. FIXME: should support IPv6 */ + + unsigned down_trans; /* up-down transitions */ + unsigned down_time; /* total time the proxy was down */ + time_t last_change; /* last time, when the state was changed */ + unsigned failed_conns, failed_resp; /* failed connect() and responses */ unsigned retries, redispatches; /* retried and redispatched connections */ unsigned denied_req, denied_resp; /* blocked requests/responses because of security concerns */ diff --git a/include/types/server.h b/include/types/server.h index 6e1e810ed..62179f33d 100644 --- a/include/types/server.h +++ b/include/types/server.h @@ -86,10 +86,14 @@ struct server { unsigned wscore; /* weight score, used during srv map computation */ unsigned failed_checks, down_trans; /* failed checks and up-down transitions */ + unsigned down_time; /* total time the server was down */ + time_t last_change; /* last time, when the state was changed */ + unsigned failed_conns, failed_resp; /* failed connect() and responses */ unsigned retries; /* retried connections */ unsigned failed_secu; /* blocked responses because of security concerns */ unsigned cum_sess; /* cumulated number of sessions really sent to this server */ + long long bytes_in; /* number of bytes transferred from the client to the server */ long long bytes_out; /* number of bytes transferred from the server to the client */ }; diff --git a/src/backend.c b/src/backend.c index 6cecb1770..596d5676c 100644 --- a/src/backend.c +++ b/src/backend.c @@ -717,6 +717,13 @@ int srv_redispatch_connect(struct session *t) return 0; } +int be_downtime(struct proxy *px) { + + if ((px->srv_act || px->srv_bck) && px->last_change < now.tv_sec) // ignore negative time + return px->down_time; + + return now.tv_sec - px->last_change + px->down_time; +} /* * Local variables: diff --git a/src/cfgparse.c b/src/cfgparse.c index 7483a9c6c..7b73e0bee 100644 --- a/src/cfgparse.c +++ b/src/cfgparse.c @@ -564,7 +564,8 @@ int cfg_parse_listen(const char *file, int linenum, char **args) tv_eternity(&curproxy->srvtimeout); tv_eternity(&curproxy->contimeout); tv_eternity(&curproxy->appsession_timeout); - + + curproxy->last_change = now.tv_sec; curproxy->id = strdup(args[1]); curproxy->cap = rc; @@ -1380,6 +1381,7 @@ int cfg_parse_listen(const char *file, int linenum, char **args) LIST_INIT(&newsrv->pendconns); do_check = 0; newsrv->state = SRV_RUNNING; /* early server setup */ + newsrv->last_change = now.tv_sec; newsrv->id = strdup(args[1]); /* several ways to check the port component : diff --git a/src/checks.c b/src/checks.c index 0cd1a8741..77e9b513f 100644 --- a/src/checks.c +++ b/src/checks.c @@ -57,9 +57,11 @@ static void set_server_down(struct server *s) struct session *sess; int xferred; - s->state &= ~SRV_RUNNING; - if (s->health == s->rise) { + + s->last_change = now.tv_sec; + s->state &= ~SRV_RUNNING; + recount_servers(s->proxy); s->proxy->map_state |= PR_MAP_RECALC; @@ -92,10 +94,13 @@ static void set_server_down(struct server *s) Warning("%s", trash); send_log(s->proxy, LOG_ALERT, "%s", trash); - + if (s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) { - Alert("%s '%s' has no server available !\n", proxy_type_str(s->proxy), s->proxy->id); - send_log(s->proxy, LOG_EMERG, "%s %s has no server available !\n", proxy_type_str(s->proxy), s->proxy->id); + s->proxy->last_change = now.tv_sec; + s->proxy->down_trans++; + + Alert("%s '%s' has no server available!\n", proxy_type_str(s->proxy), s->proxy->id); + send_log(s->proxy, LOG_EMERG, "%s %s has no server available!\n", proxy_type_str(s->proxy), s->proxy->id); } s->down_trans++; } @@ -455,13 +460,24 @@ void process_chk(struct task *t, struct timeval *next) /* there was a test running */ if (s->result > 0) { /* good server detected */ //fprintf(stderr, "process_chk: 9\n"); - s->health++; /* was bad, stays for a while */ - if (s->health >= s->rise) { - s->state |= SRV_RUNNING; + + if (s->health < s->rise + s->fall - 1) { + s->health++; /* was bad, stays for a while */ if (s->health == s->rise) { int xferred; + if (s->last_change < now.tv_sec) // ignore negative times + s->down_time += now.tv_sec - s->last_change; + s->last_change = now.tv_sec; + s->state |= SRV_RUNNING; + + if (s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) { + if (s->proxy->last_change < now.tv_sec) // ignore negative times + s->proxy->down_time += now.tv_sec - s->proxy->last_change; + s->proxy->last_change = now.tv_sec; + } + recount_servers(s->proxy); s->proxy->map_state |= PR_MAP_RECALC; @@ -493,7 +509,8 @@ void process_chk(struct task *t, struct timeval *next) send_log(s->proxy, LOG_NOTICE, "%s", trash); } - s->health = s->rise + s->fall - 1; /* OK now */ + if (s->health >= s->rise) + s->health = s->rise + s->fall - 1; /* OK now */ } s->curfd = -1; /* no check running anymore */ fd_delete(fd); diff --git a/src/dumpstats.c b/src/dumpstats.c index 02e0518d8..7ec4b565b 100644 --- a/src/dumpstats.c +++ b/src/dumpstats.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -47,6 +46,7 @@ #include #include #include +#include /* This function parses a "stats" statement in the "global" section. It returns * -1 if there is any error, otherwise zero. If it returns -1, it may write an @@ -186,8 +186,8 @@ int stats_dump_raw(struct session *s, struct uri_auth *uri, int flags) "dreq,dresp," "ereq,econ,eresp," "wretr,wredis," - "weight,act,bck," - "chkfail,chkdown" + "status,weight,act,bck," + "chkfail,chkdown,lastchg,downtime," "\n"); if (buffer_write_chunk(rep, &msg) != 0) @@ -371,8 +371,8 @@ int stats_dump_http(struct session *s, struct uri_auth *uri, int flags) "dreq,dresp," "ereq,econ,eresp," "wretr,wredis," - "weight,act,bck," - "chkfail,chkdown" + "status,weight,act,bck," + "chkfail,chkdown,lastchg,downtime," "\n"); } if (buffer_write_chunk(rep, &msg) != 0) @@ -582,26 +582,26 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri, if (flags & STAT_FMT_HTML) { /* print a new table */ chunk_printf(&msg, sizeof(trash), - "\n" + "
\n" "" "" - "" + "" "\n" "" "" "" "" "" - "" + "" "\n" "" "" "" "" "" - "" - "\n" - "", + "" + "\n" + "", px->id); if (buffer_write_chunk(rep, &msg) != 0) @@ -632,7 +632,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri, /* server status : reflect frontend status */ "" /* rest of server: nothing */ - "" + "" "", px->feconn, px->feconn_max, px->maxconn, px->cum_feconn, px->bytes_in, px->bytes_out, @@ -657,7 +657,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri, /* server status : reflect frontend status */ "%s," /* rest of server: nothing */ - ",,,,," + ",,,,,,," "\n", px->id, px->feconn, px->feconn_max, px->maxconn, px->cum_feconn, @@ -734,6 +734,11 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri, /* status */ chunk_printf(&msg, sizeof(trash), "\n", - sv->failed_checks, sv->down_trans); + "" + "" + "\n", + sv->failed_checks, sv->down_trans, + human_time(srv_downtime(sv), 1)); else chunk_printf(&msg, sizeof(trash), - "\n"); + "\n"); } else { static char *srv_hlt_st[5] = { "DOWN,", "DOWN %d/%d,", "UP %d/%d,", "UP,", "no check," }; @@ -774,7 +782,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri, /* errors : request, connect, response */ ",%d,%d," /* warnings: retries, redispatches */ - ",%d," + "%d,," "", px->id, sv->id, sv->nbpend, sv->nbpend_max, @@ -798,14 +806,15 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri, (sv->state & SRV_BACKUP) ? 0 : 1, (sv->state & SRV_BACKUP) ? 1 : 0); - /* check failures : unique, fatal */ + /* check failures: unique, fatal; last change, total downtime */ if (sv->state & SRV_CHECKED) chunk_printf(&msg, sizeof(trash), - "%d,%d,\n", - sv->failed_checks, sv->down_trans); + "%d,%d,%d,%d,\n", + sv->failed_checks, sv->down_trans, + now.tv_sec - sv->last_change, srv_downtime(sv)); else chunk_printf(&msg, sizeof(trash), - ",,\n"); + ",,,,\n"); } if (buffer_write_chunk(rep, &msg) != 0) return 0; @@ -847,23 +856,29 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri, "\n" /* warnings: retries, redispatches */ "" - /* server status : reflect backend status (up/down) : we display UP + /* backend status: reflect backend status (up/down): we display UP * if the backend has known working servers or if it has no server at - * all (eg: for stats). Tthen we display the total weight, number of + * all (eg: for stats). Then we display the total weight, number of * active and backups. */ - "" - "" - /* rest of server: nothing */ - "" - "", + "" + "", px->nbpend /* or px->totpend ? */, px->nbpend_max, px->beconn, px->beconn_max, px->fullconn, px->cum_beconn, px->bytes_in, px->bytes_out, px->denied_req, px->denied_resp, px->failed_conns, px->failed_resp, px->retries, px->redispatches, + human_time(now.tv_sec - px->last_change, 1), (px->srv_map_sz > 0 || !px->srv) ? "UP" : "DOWN", px->srv_map_sz * gcd, px->srv_act, px->srv_bck); + + chunk_printf(&msg, sizeof(trash), + /* rest of backend: nothing, down transformations, total downtime */ + "" + "" + "", + px->down_trans, + px->srv?human_time(be_downtime(px), 1):" "); } else { chunk_printf(&msg, sizeof(trash), /* pxid, name */ @@ -880,14 +895,15 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri, ",%d,%d," /* warnings: retries, redispatches */ "%d,%d," - /* server status : reflect backend status (up/down) : we display UP + /* backend status: reflect backend status (up/down): we display UP * if the backend has known working servers or if it has no server at - * all (eg: for stats). Tthen we display the total weight, number of + * all (eg: for stats). Then we display the total weight, number of * active and backups. */ "%s," "%d,%d,%d," - /* rest of server: nothing */ - ",," + /* rest of backend: nothing, down transformations, + * last change, total downtime. */ + ",%d,%d,%d," "\n", px->id, px->nbpend /* or px->totpend ? */, px->nbpend_max, @@ -897,7 +913,9 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri, px->failed_conns, px->failed_resp, px->retries, px->redispatches, (px->srv_map_sz > 0 || !px->srv) ? "UP" : "DOWN", - px->srv_map_sz * gcd, px->srv_act, px->srv_bck); + px->srv_map_sz * gcd, px->srv_act, px->srv_bck, + px->down_trans, now.tv_sec - px->last_change, + px->srv?be_downtime(px):0); } if (buffer_write_chunk(rep, &msg) != 0) return 0; diff --git a/src/server.c b/src/server.c index 5b33d114b..8b0fa1490 100644 --- a/src/server.c +++ b/src/server.c @@ -11,10 +11,19 @@ */ #include +#include + #include #include #include +int srv_downtime(struct server *s) { + + if ((s->state & SRV_RUNNING) && s->last_change < now.tv_sec) // ignore negative time + return s->down_time; + + return now.tv_sec - s->last_change + s->down_time; +} /* * Local variables: diff --git a/src/time.c b/src/time.c index b80dca9f4..bb98d98cf 100644 --- a/src/time.c +++ b/src/time.c @@ -142,6 +142,40 @@ REGPRM2 int _tv_isgt(const struct timeval *tv1, const struct timeval *tv2) return __tv_isgt(tv1, tv2); } +char *human_time(int t, short hz) { + static char rv[sizeof("24855d23h")+1]; // longest of "23h59m" and "59m59s" + char *p = rv; + int cnt=2; // print two numbers + + if (unlikely(t < 0 || hz <= 0)) { + sprintf(p, "?"); + return rv; + } + + if (unlikely(hz > 1)) + t /= hz; + + if (t >= DAY) { + p += sprintf(p, "%dd", t / DAY); + cnt--; + } + + if (cnt && t % DAY / HOUR) { + p += sprintf(p, "%dh", t % DAY / HOUR); + cnt--; + } + + if (cnt && t % HOUR / MINUTE) { + p += sprintf(p, "%dm", t % HOUR / MINUTE); + cnt--; + } + + if ((cnt && t % MINUTE) || !t) // also display '0s' + p += sprintf(p, "%ds", t % MINUTE / SEC); + + return rv; +} + /* * Local variables: * c-indent-level: 8
%s
QueueSessionsBytesDeniedErrorsWarningsServerServer
CurMaxCurMaxLimitCumulInOutReqRespReqConnRespRetrRedisStatusWeightActBckCheckDown
StatusWghtActBckChkDwnDwntme
%s
"); + + if (sv->state & SRV_CHECKED) + chunk_printf(&msg, sizeof(trash), "%s ", + human_time(now.tv_sec - sv->last_change, 1)); + chunk_printf(&msg, sizeof(trash), srv_hlt_st[sv_state], (sv->state & SRV_RUNNING) ? (sv->health - sv->rise + 1) : (sv->health), @@ -749,14 +754,17 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri, (sv->state & SRV_BACKUP) ? "-" : "Y", (sv->state & SRV_BACKUP) ? "Y" : "-"); - /* check failures : unique, fatal */ + /* check failures: unique, fatal, down time */ if (sv->state & SRV_CHECKED) chunk_printf(&msg, sizeof(trash), - "%d%d
%d%d%s
%d%d%d%d%s%d%d%d
%s %s%d%d%d %d%s