Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
home:sschapiro:openstack:upstream
cluster
cman_make_qdisk_heuristics_time_out.patch
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File cman_make_qdisk_heuristics_time_out.patch of Package cluster
commit da0d0e0e4fee1bac432304f9a792de8bd89c36d2 Author: Lon Hohberger <lhh@redhat.com> Date: Tue Sep 21 13:45:20 2010 -0400 cman: Make qdiskd heuristics time out Qdiskd heuristics were previously expected to enforce their own timeouts. This patch makes qdiskd count any heuristic which has taken longer than (interval*(tko-1)) as failed, since that heuristic is not being reliable. A side effect is that now qdiskd will also automatically calculate interval and tko counts for all heuristics, obviating the need for administrators to do this manually. Resolves: rhbz#636243 Signed-off-by: Lon Hohberger <lhh@redhat.com> Reviewed-by: Fabio M. Di Nitto <fdinitto@redhat.com> diff --git a/cman/man/qdisk.5 b/cman/man/qdisk.5 index efa3638..4070f48 100644 --- a/cman/man/qdisk.5 +++ b/cman/man/qdisk.5 @@ -189,7 +189,7 @@ master will only grant a node membership if: .in 12 (a) CMAN believes the node to be online, and -.br +.bi (b) that node has made enough consecutive, timely writes .in 16 to the quorum disk, and @@ -448,15 +448,15 @@ for heuristics. The default score for each heuristic is 1. \fIinterval\fP\fB="\fP2\fB"\fP .in 12 This is the frequency (in seconds) at which we poll the heuristic. The -default interval for every heuristic is 2 seconds. +default interval is determined by the qdiskd timeout. .in 0 .in 9 \fItko\fP\fB="\fP1\fB"\fP .in 12 After this many failed attempts to run the heuristic, it is considered DOWN, -and its score is removed. The default tko for each heuristic is 1, which -may be inadequate for things such as 'ping'. +and its score is removed. The default tko for each heuristic is determined +by the qdiskd timeout. .in 8 \fB/>\fP .in 0 diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c index 8ca99f7..617a705 100644 --- a/cman/qdisk/main.c +++ b/cman/qdisk/main.c @@ -1844,7 +1844,11 @@ get_config_data(qd_ctx *ctx, struct h_data *h, int maxh, int *cfh) goto out; } - *cfh = configure_heuristics(ccsfd, h, maxh); + /* Heuristics need to report in 1 cycle before we need to + * report in so we can get their score. + */ + *cfh = configure_heuristics(ccsfd, h, maxh, + ctx->qc_interval * (ctx->qc_tko - 1)); if (*cfh) { if (ctx->qc_flags & RF_MASTER_WINS) { diff --git a/cman/qdisk/score.c b/cman/qdisk/score.c index 81ff700..572464d 100644 --- a/cman/qdisk/score.c +++ b/cman/qdisk/score.c @@ -75,22 +75,25 @@ restore_signals(void) Spin off a user-defined heuristic */ static int -fork_heuristic(struct h_data *h) +fork_heuristic(struct h_data *h, struct timespec *now) { int pid; char *argv[4]; - time_t now; if (h->childpid) { errno = EINPROGRESS; return -1; } - now = time(NULL); - if (now < h->nextrun) + if (now->tv_sec < h->nextrun.tv_sec || + now->tv_nsec < h->nextrun.tv_nsec) return 0; - h->nextrun = now + h->interval; + h->nextrun.tv_sec = now->tv_sec + h->interval; + h->nextrun.tv_nsec = now->tv_nsec; + + h->failtime.tv_sec = now->tv_sec + h->maxtime; + h->failtime.tv_nsec = now->tv_nsec; pid = fork(); if (pid < 0) @@ -162,7 +165,7 @@ total_score(struct h_data *h, int max, int *score, int *maxscore) Check for response from a user-defined heuristic / script */ static int -check_heuristic(struct h_data *h, int block) +check_heuristic(struct h_data *h, int block, struct timespec *now) { int ret; int status; @@ -172,14 +175,40 @@ check_heuristic(struct h_data *h, int block) return 0; ret = waitpid(h->childpid, &status, block?0:WNOHANG); - if (!block && ret == 0) + if (!block && ret == 0) { /* No children exited */ + + /* no timeout */ + if (!h->maxtime) + return 0; + + /* If we overran our timeout, the heuristic is dead */ + if (now->tv_sec > h->failtime.tv_sec || + (now->tv_sec == h->failtime.tv_sec && + now->tv_nsec > h->failtime.tv_nsec)) { + h->misses = h->tko; + h->failed = ETIMEDOUT; + if (h->available) { + logt_print(LOG_INFO, "Heuristic: '%s' DOWN - " + "Exceeded timeout of %d seconds\n", + h->program, h->maxtime); + h->available = 0; + } + } + return 0; + } h->childpid = 0; if (ret < 0 && errno == ECHILD) /* wrong child? */ goto miss; + + /* Timed out previously; this run must be ignored. */ + if (h->failed) { + h->failed = 0; + goto miss; + } if (!WIFEXITED(status)) { ret = 0; goto miss; @@ -188,7 +217,7 @@ check_heuristic(struct h_data *h, int block) ret = 0; goto miss; } - + /* Returned 0 and was not killed */ if (!h->available) { h->available = 1; @@ -222,10 +251,12 @@ miss: static int fork_heuristics(struct h_data *h, int max) { + struct timespec now; int x; + clock_gettime(CLOCK_MONOTONIC, &now); for (x = 0; x < max; x++) - fork_heuristic(&h[x]); + fork_heuristic(&h[x], &now); return 0; } @@ -236,19 +267,49 @@ fork_heuristics(struct h_data *h, int max) static int check_heuristics(struct h_data *h, int max, int block) { + struct timespec now; int x; + clock_gettime(CLOCK_MONOTONIC, &now); for (x = 0; x < max; x++) - check_heuristic(&h[x], block); + check_heuristic(&h[x], block, &now); return 0; } +/* + * absmax should be qdiskd (interval * (tko-1)) + */ +static void +auto_heuristic_timing(int *interval, int *tko, int absmax) +{ + if (!interval || ! tko) + return; + + if (absmax < 3) + return; + + if (absmax <= 4) { + *interval = 1; + } else if (absmax <= 22) { + *interval = 2; + } else if (absmax <= 39) { + *interval = 3; + } else if (absmax <= 50) { + *interval = 4; + } else { + *interval = 5; + } + + *tko = absmax / (*interval); +} + + /** Read configuration data from CCS into the array provided */ int -configure_heuristics(int ccsfd, struct h_data *h, int max) +configure_heuristics(int ccsfd, struct h_data *h, int max, int maxtime) { int x = 0; char *val; @@ -261,11 +322,14 @@ configure_heuristics(int ccsfd, struct h_data *h, int max) h[x].program = NULL; h[x].available = 0; h[x].misses = 0; - h[x].interval = 2; - h[x].tko = 1; + auto_heuristic_timing(&h[x].interval, &h[x].tko, maxtime); + h[x].maxtime = maxtime; h[x].score = 1; h[x].childpid = 0; - h[x].nextrun = 0; + h[x].nextrun.tv_sec = 0; + h[x].nextrun.tv_nsec = 0; + h[x].failtime.tv_sec = 0; + h[x].failtime.tv_nsec = 0; /* Get program */ snprintf(query, sizeof(query), diff --git a/cman/qdisk/score.h b/cman/qdisk/score.h index 77e155b..beff31b 100644 --- a/cman/qdisk/score.h +++ b/cman/qdisk/score.h @@ -10,19 +10,22 @@ struct h_data { char * program; + struct timespec nextrun; + struct timespec failtime; int score; int available; int tko; int interval; + int maxtime; int misses; + int failed; pid_t childpid; - time_t nextrun; }; /* Grab score data from CCSD */ -int configure_heuristics(int ccsfd, struct h_data *hp, int max); +int configure_heuristics(int ccsfd, struct h_data *hp, int max, int maxtime); /* Start the thread which runs the scoring applets
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor