From a0066ddbda7b4521239ed87131a4d61cbfcd20ae Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Fri, 16 May 2014 11:25:16 +0200 Subject: [PATCH] MEDIUM: server: properly support and propagate the maintenance status This change now involves a new flag SRV_ADMF_IMAINT to note that the maintenance status of a server is inherited from another server. Thus, we know at each server level in the chain if it's running, in forced maintenance or in a maintenance status because it tracks another server, or even in both states. Disabling a server propagates this flag down to other servers. Enabling a server flushes the flag down. A server becomes up again once both of its flags are cleared. Two new functions "srv_adm_set_maint()" and "srv_adm_set_ready()" are used to manipulate this maintenance status. They're used by the CLI and the stats page. Now the stats page always says "MAINT" instead of "MAINT(via)" and it's only the chk/down field which reports "via x/y" when the status is inherited from another server, but it doesn't say it when a server was forced into maintenance. The CSV output indicates "MAINT (via x/y)" instead of only "MAINT(via)". This is the most accurate representation. One important thing is that now entering/leaving maintenance for a tracking server correctly follows the state of the tracked server. --- include/proto/server.h | 26 +++++ src/cfgparse.c | 1 + src/dumpstats.c | 76 +++---------- src/server.c | 236 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 278 insertions(+), 61 deletions(-) diff --git a/include/proto/server.h b/include/proto/server.h index 277974a96..e069f7fc8 100644 --- a/include/proto/server.h +++ b/include/proto/server.h @@ -116,6 +116,32 @@ void srv_shutdown_sessions(struct server *srv, int why); */ void srv_shutdown_backup_sessions(struct proxy *px, int why); +/* Appends some information to a message string related to a server going UP or DOWN. 
+ * If <forced> is null and the server tracks another one, a "via" information will + * be provided to know where the status came from. If xferred is non-negative, some + * information about requeued sessions are provided. + */ +void srv_adm_append_status(struct chunk *msg, struct server *s, int xferred, int forced); + +/* Puts server <s> into maintenance mode, and propagate that status down to all + * tracking servers. This does the same action as the CLI's "disable server x". + * A log is emitted for all servers that were not yet in maintenance mode. + * Health checks are disabled but not agent checks. The server is marked as + * being either forced into maintenance by having <mode> set to SRV_ADMF_FMAINT, + * or as inheriting the maintenance status by having <mode> set to + * SRV_ADMF_IMAINT. Nothing is done if neither flag is set. + */ +void srv_adm_set_maint(struct server *s, enum srv_admin mode); + +/* Gets server <s> out of maintenance mode, and propagate that status down to + * all tracking servers. This does the same action as the CLI's "enable server x". + * A log is emitted for all servers that leave maintenance mode. Health checks + * are possibly enabled again. The server is marked as leaving forced maintenance + * when <mode> is set to SRV_ADMF_FMAINT, or as leaving inherited maintenance + * when <mode> is set to SRV_ADMF_IMAINT. Nothing is done if neither flag is set. 
+ */ +void srv_adm_set_ready(struct server *s, enum srv_admin mode); + /* * Local variables: * c-indent-level: 8 diff --git a/src/cfgparse.c b/src/cfgparse.c index 5384f4f95..08168a171 100644 --- a/src/cfgparse.c +++ b/src/cfgparse.c @@ -6618,6 +6618,7 @@ out_uri_auth_compat: /* if the other server is forced disabled, we have to do the same here */ if (srv->admin & SRV_ADMF_MAINT) { + newsrv->admin |= SRV_ADMF_IMAINT; newsrv->state = SRV_ST_STOPPED; newsrv->check.health = 0; newsrv->agent.health = 0; diff --git a/src/dumpstats.c b/src/dumpstats.c index fbe1d2ab0..d0cd63261 100644 --- a/src/dumpstats.c +++ b/src/dumpstats.c @@ -1710,26 +1710,7 @@ static int stats_sock_parse_request(struct stream_interface *si, char *line) if (!sv) return 1; - if (sv->admin & SRV_ADMF_MAINT) { - /* The server is really in maintenance, we can change the server state */ - if (sv->track) { - /* If this server tracks the status of another one, - * we must restore the good status. - */ - if (sv->track->state != SRV_ST_STOPPED) { - set_server_up(&sv->check); - sv->check.health = sv->check.rise; /* up, but will fall down at first failure */ - } else { - sv->admin &= ~SRV_ADMF_FMAINT; - sv->check.state &= ~CHK_ST_PAUSED; - set_server_down(&sv->check); - } - } else { - set_server_up(&sv->check); - sv->check.health = sv->check.rise; /* up, but will fall down at first failure */ - } - } - + srv_adm_set_ready(sv, SRV_ADMF_FMAINT); return 1; } else if (strcmp(args[1], "frontend") == 0) { @@ -1782,13 +1763,7 @@ static int stats_sock_parse_request(struct stream_interface *si, char *line) if (!sv) return 1; - if (!(sv->admin & SRV_ADMF_MAINT)) { - /* Not already in maintenance, we can change the server state */ - sv->admin |= SRV_ADMF_FMAINT; - sv->check.state |= CHK_ST_PAUSED; - set_server_down(&sv->check); - } - + srv_adm_set_maint(sv, SRV_ADMF_FMAINT); return 1; } else if (strcmp(args[1], "frontend") == 0) { @@ -2780,7 +2755,7 @@ static int stats_dump_sv_stats(struct stream_interface *si, struct 
proxy *px, in "no check" }; - if ((sv->admin | ref->admin) & SRV_ADMF_MAINT) + if (sv->admin & SRV_ADMF_MAINT) chunk_appendf(&trash, ""); else chunk_appendf(&trash, @@ -2915,10 +2890,6 @@ static int stats_dump_sv_stats(struct stream_interface *si, struct proxy *px, in chunk_appendf(&trash, "%s ", human_time(now.tv_sec - sv->last_change, 1)); chunk_appendf(&trash, "MAINT"); } - else if (ref != sv && (ref->admin & SRV_ADMF_MAINT)) { - chunk_appendf(&trash, "%s ", human_time(now.tv_sec - ref->last_change, 1)); - chunk_appendf(&trash, "MAINT(via)"); - } else if (ref->check.state & CHK_ST_ENABLED) { chunk_appendf(&trash, "%s ", human_time(now.tv_sec - ref->last_change, 1)); chunk_appendf(&trash, @@ -2975,13 +2946,11 @@ static int stats_dump_sv_stats(struct stream_interface *si, struct proxy *px, in ref->observe ? "/Health Analyses" : "", ref->counters.down_trans, human_time(srv_downtime(sv), 1)); } - else if (sv != ref) { - if (sv->admin & SRV_ADMF_MAINT) - chunk_appendf(&trash, ""); - else - chunk_appendf(&trash, - "via %s/%s", - ref->proxy->id, ref->id, ref->proxy->id, ref->id); + else if (!(sv->admin & SRV_ADMF_FMAINT) && sv != ref) { + /* tracking a server */ + chunk_appendf(&trash, + "via %s/%s", + ref->proxy->id, ref->id, ref->proxy->id, ref->id); } else chunk_appendf(&trash, ""); @@ -3030,10 +2999,10 @@ static int stats_dump_sv_stats(struct stream_interface *si, struct proxy *px, in sv->counters.retries, sv->counters.redispatches); /* status */ - if (sv->admin & SRV_ADMF_MAINT) + if (sv->admin & SRV_ADMF_IMAINT) + chunk_appendf(&trash, "MAINT (via %s/%s),", ref->proxy->id, ref->id); + else if (sv->admin & SRV_ADMF_MAINT) chunk_appendf(&trash, "MAINT,"); - else if (ref != sv && (ref->admin & SRV_ADMF_MAINT)) - chunk_appendf(&trash, "MAINT(via),"); else chunk_appendf(&trash, srv_hlt_st[state], @@ -4249,32 +4218,17 @@ static int stats_process_http_post(struct stream_interface *si) else if ((sv = findserver(px, value)) != NULL) { switch (action) { case 
ST_ADM_ACTION_DISABLE: - if ((px->state != PR_STSTOPPED) && !(sv->admin & SRV_ADMF_MAINT)) { - /* Not already in maintenance, we can change the server state */ - sv->admin |= SRV_ADMF_FMAINT; - sv->check.state |= CHK_ST_PAUSED; - set_server_down(&sv->check); + if ((px->state != PR_STSTOPPED) && !(sv->admin & SRV_ADMF_FMAINT)) { altered_servers++; total_servers++; + srv_adm_set_maint(sv, SRV_ADMF_FMAINT); } break; case ST_ADM_ACTION_ENABLE: - if ((px->state != PR_STSTOPPED) && (sv->admin & SRV_ADMF_MAINT)) { - /* Already in maintenance, we can change the server state. - * If this server tracks the status of another one, - * we must restore the good status. - */ - if (!sv->track || (sv->track->state != SRV_ST_STOPPED)) { - set_server_up(&sv->check); - sv->check.health = sv->check.rise; /* up, but will fall down at first failure */ - } - else { - sv->admin &= ~SRV_ADMF_FMAINT; - sv->check.state &= ~CHK_ST_PAUSED; - set_server_down(&sv->check); - } + if ((px->state != PR_STSTOPPED) && (sv->admin & SRV_ADMF_FMAINT)) { altered_servers++; total_servers++; + srv_adm_set_ready(sv, SRV_ADMF_FMAINT); } break; case ST_ADM_ACTION_STOP: diff --git a/src/server.c b/src/server.c index 07d243694..f0fb0a76d 100644 --- a/src/server.c +++ b/src/server.c @@ -186,6 +186,242 @@ void srv_shutdown_backup_sessions(struct proxy *px, int why) srv_shutdown_sessions(srv, why); } +/* Appends some information to a message string related to a server going UP or DOWN. + * If <forced> is null and the server tracks another one, a "via" information will + * be provided to know where the status came from. If xferred is non-negative, some + * information about requeued sessions are provided. + */ +void srv_adm_append_status(struct chunk *msg, struct server *s, int xferred, int forced) +{ + if (!forced && s->track) + chunk_appendf(msg, " via %s/%s", + s->track->proxy->id, s->track->id); + + if (xferred >= 0) { + if (s->state == SRV_ST_STOPPED) + chunk_appendf(msg, ". 
%d active and %d backup servers left.%s" + " %d sessions active, %d requeued, %d remaining in queue", + s->proxy->srv_act, s->proxy->srv_bck, + (s->proxy->srv_bck && !s->proxy->srv_act) ? " Running on backup." : "", + s->cur_sess, xferred, s->nbpend); + else + chunk_appendf(msg, ". %d active and %d backup servers online.%s" + " %d sessions requeued, %d total in queue", + s->proxy->srv_act, s->proxy->srv_bck, + (s->proxy->srv_bck && !s->proxy->srv_act) ? " Running on backup." : "", + xferred, s->nbpend); + } +} + +/* Puts server <s> into maintenance mode, and propagate that status down to all + * tracking servers. This does the same action as the CLI's "disable server x". + * A log is emitted for all servers that were not yet in maintenance mode. + * Health checks are disabled but not agent checks. The server is marked as + * being either forced into maintenance by having <mode> set to SRV_ADMF_FMAINT, + * or as inheriting the maintenance status by having <mode> set to + * SRV_ADMF_IMAINT. Nothing is done if neither flag is set. + */ +void srv_adm_set_maint(struct server *s, enum srv_admin mode) +{ + struct check *check = &s->check; + struct server *srv; + int xferred; + + if (!mode) + return; + + /* stop going down as soon as we meet a server already in the same state */ + if (s->admin & mode) + return; + + s->admin |= mode; + + if (s->check.state & CHK_ST_ENABLED) { + s->check.state |= CHK_ST_PAUSED; + check->health = 0; + } + + if (s->state == SRV_ST_STOPPED) { /* server was already down */ + if (!(s->admin & ~mode & SRV_ADMF_MAINT)) { + chunk_printf(&trash, + "%sServer %s/%s was DOWN and now enters maintenance", + s->flags & SRV_F_BACKUP ? 
"Backup " : "", s->proxy->id, s->id); + + srv_adm_append_status(&trash, s, -1, (mode & SRV_ADMF_FMAINT)); + + Warning("%s.\n", trash.str); + send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.str); + } + } + else { /* server was still running */ + int srv_was_stopping = (s->state == SRV_ST_STOPPING); + int prev_srv_count = s->proxy->srv_bck + s->proxy->srv_act; + + check->health = 0; /* failure */ + s->last_change = now.tv_sec; + s->state = SRV_ST_STOPPED; + if (s->proxy->lbprm.set_server_status_down) + s->proxy->lbprm.set_server_status_down(s); + + if (s->onmarkeddown & HANA_ONMARKEDDOWN_SHUTDOWNSESSIONS) + srv_shutdown_sessions(s, SN_ERR_DOWN); + + /* we might have sessions queued on this server and waiting for + * a connection. Those which are redispatchable will be queued + * to another server or to the proxy itself. + */ + xferred = pendconn_redistribute(s); + + chunk_printf(&trash, + "%sServer %s/%s is going DOWN for maintenance", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id); + + srv_adm_append_status(&trash, s, xferred, (mode & SRV_ADMF_FMAINT)); + + Warning("%s.\n", trash.str); + send_log(s->proxy, srv_was_stopping ? LOG_NOTICE : LOG_ALERT, "%s.\n", trash.str); + + if (prev_srv_count && s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) + set_backend_down(s->proxy); + + s->counters.down_trans++; + } + + for (srv = s->trackers; srv; srv = srv->tracknext) + srv_adm_set_maint(srv, SRV_ADMF_IMAINT); +} + +/* Gets server <s> out of maintenance mode, and propagate that status down to + * all tracking servers. This does the same action as the CLI's "enable server x". + * A log is emitted for all servers that leave maintenance mode. Health checks + * are possibly enabled again. The server is marked as leaving forced maintenance + * when <mode> is set to SRV_ADMF_FMAINT, or as leaving inherited maintenance + * when <mode> is set to SRV_ADMF_IMAINT. Nothing is done if neither flag is set. 
+ */ +void srv_adm_set_ready(struct server *s, enum srv_admin mode) +{ + struct check *check = &s->check; + struct server *srv; + int xferred = -1; + + if (!mode) + return; + + /* stop going down as soon as we see the flag is not there anymore */ + if (!(s->admin & mode)) + return; + + s->admin &= ~mode; + + if (s->admin & SRV_ADMF_MAINT) { + /* remaining in maintenance mode, let's inform precisely about the + * situation. + */ + + if (s->admin & SRV_ADMF_FMAINT) { + chunk_printf(&trash, + "%sServer %s/%s remains in forced maintenance", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id); + } + else { + chunk_printf(&trash, + "%sServer %s/%s is leaving forced maintenance but remains in maintenance", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id); + + if (s->track) /* normally it's mandatory here */ + chunk_appendf(&trash, " via %s/%s", + s->track->proxy->id, s->track->id); + } + + Warning("%s.\n", trash.str); + send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.str); + return; + } + + /* OK here we're leaving maintenance, we have many things to check, + * because the server might possibly be coming back up depending on + * its state. In practice, leaving maintenance means that we should + * immediately turn to UP (more or less the slowstart) under the + * following conditions : + * - server is neither checked nor tracked + * - server tracks another server which is not checked + * - server tracks another server which is already up + * Which sums up as something simpler : + * "either the server's or the tracked server's checks are disabled or up". + * Otherwise we only re-enable health checks. 
+ */ + + if (s->check.state & CHK_ST_ENABLED) { + s->check.state &= ~CHK_ST_PAUSED; + check->health = check->rise; /* start OK but check immediately */ + } + + if ((!s->track && + (!(s->agent.state & CHK_ST_ENABLED) || (s->agent.health >= s->agent.rise)) && + (!(s->check.state & CHK_ST_ENABLED) || (s->check.health >= s->check.rise))) || + (s->track && + (!(s->track->agent.state & CHK_ST_ENABLED) || (s->track->agent.health >= s->track->agent.rise)) && + (!(s->track->check.state & CHK_ST_ENABLED) || (s->track->check.health >= s->track->check.rise)))) { + + if (s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) { + if (s->proxy->last_change < now.tv_sec) // ignore negative times + s->proxy->down_time += now.tv_sec - s->proxy->last_change; + s->proxy->last_change = now.tv_sec; + } + + if (s->last_change < now.tv_sec) // ignore negative times + s->down_time += now.tv_sec - s->last_change; + s->last_change = now.tv_sec; + + s->state = SRV_ST_STARTING; + if (s->slowstart > 0) + task_schedule(s->warmup, tick_add(now_ms, MS_TO_TICKS(MAX(1000, s->slowstart / 20)))); + else + s->state = SRV_ST_RUNNING; + + server_recalc_eweight(s); + + /* If the server is set with "on-marked-up shutdown-backup-sessions", + * and it's not a backup server and its effective weight is > 0, + * then it can accept new connections, so we shut down all sessions + * on all backup servers. + */ + if ((s->onmarkedup & HANA_ONMARKEDUP_SHUTDOWNBACKUPSESSIONS) && + !(s->flags & SRV_F_BACKUP) && s->eweight) + srv_shutdown_backup_sessions(s->proxy, SN_ERR_UP); + + /* check if we can handle some connections queued at the proxy. We + * will take as many as we can handle. + */ + xferred = pendconn_grab_from_px(s); + } + + if (mode & SRV_ADMF_FMAINT) { + chunk_printf(&trash, + "%sServer %s/%s is %s (leaving forced maintenance)", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id, + (s->state == SRV_ST_STOPPED) ? 
"DOWN" : "UP"); + } + else { + chunk_printf(&trash, + "%sServer %s/%s is %s (leaving maintenance)", + s->flags & SRV_F_BACKUP ? "Backup " : "", + s->proxy->id, s->id, + (s->state == SRV_ST_STOPPED) ? "DOWN" : "UP"); + srv_adm_append_status(&trash, s, xferred, 0); + } + + Warning("%s.\n", trash.str); + send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.str); + + for (srv = s->trackers; srv; srv = srv->tracknext) + srv_adm_set_ready(srv, SRV_ADMF_IMAINT); +} + /* Note: must not be declared <const> as its list will be overwritten. * Please take care of keeping this list alphabetically sorted, doing so helps * all code contributors.