File jsc#ECO-1611-0002-Feature-scheduler-implement-priority-fencing-delay.patch of Package pacemaker.15719
From 79ded22a9cc7dcb074fdac3174e504502bea147f Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Tue, 17 Mar 2020 14:33:35 +0100
Subject: [PATCH 2/9] Feature: scheduler: implement priority-fencing-delay

---
 include/crm/pengine/internal.h |  4 +-
 include/crm/pengine/status.h   |  2 +
 pengine/allocate.c             | 14 ++---
 pengine/native.c               |  8 +--
 lib/pengine/native.c           | 47 ++++++++++++++
 lib/pengine/unpack.c           | 43 ++++++++-----
 lib/pengine/utils.c            | 96 ++++++++++++++++++++++++++++-
 7 files changed, 183 insertions(+), 31 deletions(-)

Index: pacemaker-1.1.18+20180430.b12c320f5/include/crm/pengine/internal.h
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/include/crm/pengine/internal.h
+++ pacemaker-1.1.18+20180430.b12c320f5/include/crm/pengine/internal.h
@@ -282,7 +282,7 @@ typedef struct op_digest_cache_s {
 op_digest_cache_t *rsc_action_digest_cmp(resource_t * rsc, xmlNode * xml_op, node_t * node,
                                          pe_working_set_t * data_set);
 
-action_t *pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe_working_set_t * data_set);
+pe_action_t *pe_fence_op(pe_node_t * node, const char *op, bool optional, const char *reason, bool priority_delay, pe_working_set_t * data_set);
 void trigger_unfencing(
     resource_t * rsc, node_t *node, const char *reason, action_t *dependency, pe_working_set_t * data_set);
 
@@ -299,7 +299,7 @@ gboolean add_tag_ref(GHashTable * tags,
 
 void print_rscs_brief(GListPtr rsc_list, const char * pre_text, long options,
                       void * print_data, gboolean print_all);
-void pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason);
+void pe_fence_node(pe_working_set_t * data_set, pe_node_t * node, const char *reason, bool priority_delay);
 
 node_t *pe_create_node(const char *id, const char *uname, const char *type,
                        const char *score, pe_working_set_t * data_set);
Index: pacemaker-1.1.18+20180430.b12c320f5/include/crm/pengine/status.h
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/include/crm/pengine/status.h
+++ pacemaker-1.1.18+20180430.b12c320f5/include/crm/pengine/status.h
@@ -123,6 +123,7 @@ struct pe_working_set_s {
 
     int blocked_resources;
     int disabled_resources;
+    int priority_fencing_delay; // Enforced priority fencing delay
 };
 
 struct pe_node_shared_s {
@@ -155,6 +156,7 @@ struct pe_node_shared_s {
     GHashTable *attrs;          /* char* => char* */
     GHashTable *utilization;
     GHashTable *digest_cache;   /*! cache of calculated resource digests */
+    int priority; // calculated based on the priority of resources running on the node
 };
 
 struct pe_node_s {
Index: pacemaker-1.1.18+20180430.b12c320f5/pengine/allocate.c
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/pengine/allocate.c
+++ pacemaker-1.1.18+20180430.b12c320f5/pengine/allocate.c
@@ -887,7 +887,7 @@ probe_resources(pe_working_set_t * data_
             if (is_baremetal_remote_node(node) && node->details->remote_rsc
                 && (get_remote_node_state(node) == remote_state_failed)) {
 
-                pe_fence_node(data_set, node, "the connection is unrecoverable");
+                pe_fence_node(data_set, node, "the connection is unrecoverable", FALSE);
             }
             continue;
 
@@ -1420,7 +1420,7 @@ fence_guest(pe_node_t *node, pe_action_t
     /* Create a fence pseudo-event, so we have an event to order actions
      * against, and crmd can always detect it.
      */
-    stonith_op = pe_fence_op(node, fence_action, FALSE, "guest is unclean", data_set);
+    stonith_op = pe_fence_op(node, fence_action, FALSE, "guest is unclean", FALSE, data_set);
     update_action_flags(stonith_op, pe_action_pseudo | pe_action_runnable,
                         __FUNCTION__, __LINE__);
 
@@ -1429,7 +1429,7 @@ fence_guest(pe_node_t *node, pe_action_t
      * (even though start might be closer to what is done for a real reboot).
      */
    if(stop && is_set(stop->flags, pe_action_pseudo)) {
-        pe_action_t *parent_stonith_op = pe_fence_op(stop->node, NULL, FALSE, NULL, data_set);
+        pe_action_t *parent_stonith_op = pe_fence_op(stop->node, NULL, FALSE, NULL, FALSE, data_set);
         crm_info("Implying guest node %s is down (action %d) after %s fencing",
                  node->details->uname, stonith_op->id, stop->node->details->uname);
         order_actions(parent_stonith_op, stonith_op,
@@ -1509,7 +1509,7 @@ stage6(pe_working_set_t * data_set)
         if (node->details->unclean
             && need_stonith && pe_can_fence(data_set, node)) {
 
-            stonith_op = pe_fence_op(node, NULL, FALSE, "node is unclean", data_set);
+            stonith_op = pe_fence_op(node, NULL, FALSE, "node is unclean", FALSE, data_set);
             pe_warn("Scheduling Node %s for STONITH", node->details->uname);
 
             stonith_constraints(node, stonith_op, data_set);
@@ -1814,7 +1814,7 @@ apply_container_ordering(action_t *actio
     CRM_ASSERT(container);
 
     if(is_set(container->flags, pe_rsc_failed)) {
-        pe_fence_node(data_set, action->node, "container failed");
+        pe_fence_node(data_set, action->node, "container failed", FALSE);
     }
 
     crm_trace("Order %s action %s relative to %s%s for %s%s",
@@ -2023,7 +2023,7 @@ apply_remote_ordering(action_t *action,
                  * way to stop it, it is necessary to fence the
                  * node.
                  */
-                pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable");
+                pe_fence_node(data_set, action->node, "resources are active and the connection is unrecoverable", FALSE);
                 order_action_then_stop(action, remote_rsc,
                                        pe_order_implies_first, data_set);
 
@@ -2071,7 +2071,7 @@ apply_remote_ordering(action_t *action,
              * Since we have no way to find out, it is
              * necessary to fence the node.
              */
-            pe_fence_node(data_set, action->node, "resources are in an unknown state and the connection is unrecoverable");
+            pe_fence_node(data_set, action->node, "resources are in an unknown state and the connection is unrecoverable", FALSE);
         }
 
         if(cluster_node && state == remote_state_stopped) {
Index: pacemaker-1.1.18+20180430.b12c320f5/pengine/native.c
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/pengine/native.c
+++ pacemaker-1.1.18+20180430.b12c320f5/pengine/native.c
@@ -1362,7 +1362,7 @@ native_internal_constraints(resource_t *
 
         g_hash_table_iter_init(&iter, rsc->allowed_nodes);
         while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) {
-            action_t *unfence = pe_fence_op(node, "on", TRUE, NULL, data_set);
+            pe_action_t *unfence = pe_fence_op(node, "on", TRUE, NULL, FALSE, data_set);
 
             crm_debug("Ordering any stops of %s before %s, and any starts after",
                       rsc->id, unfence->uuid);
@@ -1825,7 +1825,7 @@ rsc_ticket_constraint(resource_t * rsc_l
             for (gIter = rsc_lh->running_on; gIter != NULL; gIter = gIter->next) {
                 node_t *node = (node_t *) gIter->data;
 
-                pe_fence_node(data_set, node, "deadman ticket was lost");
+                pe_fence_node(data_set, node, "deadman ticket was lost", FALSE);
             }
             break;
 
@@ -2505,7 +2505,7 @@ StopRsc(resource_t * rsc, node_t * next,
         }
 
         if(is_set(rsc->flags, pe_rsc_needs_unfencing)) {
-            action_t *unfence = pe_fence_op(current, "on", TRUE, NULL, data_set);
+            pe_action_t *unfence = pe_fence_op(current, "on", TRUE, NULL, FALSE, data_set);
 
             order_actions(stop, unfence, pe_order_implies_first);
             if (!node_has_been_unfenced(current)) {
@@ -2535,7 +2535,7 @@ order_after_unfencing(resource_t *rsc, p
      * the node being unfenced, and all its resources being stopped,
      * whenever a new resource is added -- which would be highly suboptimal.
      */
-    action_t *unfence = pe_fence_op(node, "on", TRUE, NULL, data_set);
+    pe_action_t *unfence = pe_fence_op(node, "on", TRUE, NULL, FALSE, data_set);
 
     order_actions(unfence, action, order);
 
Index: pacemaker-1.1.18+20180430.b12c320f5/lib/pengine/native.c
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/lib/pengine/native.c
+++ pacemaker-1.1.18+20180430.b12c320f5/lib/pengine/native.c
@@ -17,6 +17,51 @@
 #define VARIANT_NATIVE 1
 #include "./variant.h"
 
+static void
+native_priority_to_node(pe_resource_t * rsc, pe_node_t * node)
+{
+    int priority = 0;
+
+    if (rsc->priority == 0) {
+        return;
+    }
+
+    if (rsc->role == RSC_ROLE_MASTER) {
+        // Promoted instance takes base priority + 1
+        priority = rsc->priority + 1;
+
+    } else {
+        priority = rsc->priority;
+    }
+
+    node->details->priority += priority;
+    pe_rsc_trace(rsc, "Node '%s' now has priority %d with %s'%s' (priority: %d%s)",
+                 node->details->uname, node->details->priority,
+                 rsc->role == RSC_ROLE_MASTER ? "promoted " : "",
+                 rsc->id, rsc->priority,
+                 rsc->role == RSC_ROLE_MASTER ? " + 1" : "");
+
+    /* Priority of a resource running on a guest node is added to the cluster
+     * node as well. */
+    if (node->details->remote_rsc
+        && node->details->remote_rsc->container) {
+        GListPtr gIter = node->details->remote_rsc->container->running_on;
+
+        for (; gIter != NULL; gIter = gIter->next) {
+            pe_node_t *a_node = gIter->data;
+
+            a_node->details->priority += priority;
+            pe_rsc_trace(rsc, "Node '%s' now has priority %d with %s'%s' (priority: %d%s) "
+                         "from guest node '%s'",
+                         a_node->details->uname, a_node->details->priority,
+                         rsc->role == RSC_ROLE_MASTER ? "promoted " : "",
+                         rsc->id, rsc->priority,
+                         rsc->role == RSC_ROLE_MASTER ? " + 1" : "",
+                         node->details->uname);
+        }
+    }
+}
+
 void
 native_add_running(resource_t * rsc, node_t * node, pe_working_set_t * data_set)
 {
@@ -38,6 +83,8 @@ native_add_running(resource_t * rsc, nod
     rsc->running_on = g_list_append(rsc->running_on, node);
     if (rsc->variant == pe_native) {
         node->details->running_rsc = g_list_append(node->details->running_rsc, rsc);
+
+        native_priority_to_node(rsc, node);
     }
 
     if (rsc->variant == pe_native && node->details->maintenance) {
Index: pacemaker-1.1.18+20180430.b12c320f5/lib/pengine/unpack.c
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/lib/pengine/unpack.c
+++ pacemaker-1.1.18+20180430.b12c320f5/lib/pengine/unpack.c
@@ -62,9 +62,11 @@ is_dangling_container_remote_node(node_t
  * \param[in,out] data_set  Current working set of cluster
  * \param[in,out] node      Node to fence
  * \param[in]     reason    Text description of why fencing is needed
+ * \param[in]     priority_delay  Whether to consider `priority-fencing-delay`
  */
 void
-pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason)
+pe_fence_node(pe_working_set_t * data_set, pe_node_t * node,
+              const char *reason, bool priority_delay)
 {
     CRM_CHECK(node, return);
 
@@ -114,7 +116,8 @@ pe_fence_node(pe_working_set_t * data_se
                      reason);
         }
         node->details->unclean = TRUE;
-        pe_fence_op(node, NULL, TRUE, reason, data_set);
+        // No need to apply `priority-fencing-delay` for remote nodes
+        pe_fence_op(node, NULL, TRUE, reason, FALSE, data_set);
 
     } else if (node->details->unclean) {
         crm_trace("Cluster node %s %s because %s",
@@ -128,7 +131,7 @@ pe_fence_node(pe_working_set_t * data_se
                  pe_can_fence(data_set, node)? "will be fenced" : "is unclean",
                  reason);
         node->details->unclean = TRUE;
-        pe_fence_op(node, NULL, TRUE, reason, data_set);
+        pe_fence_op(node, NULL, TRUE, reason, priority_delay, data_set);
     }
 }
 
@@ -212,6 +215,15 @@ unpack_config(xmlNode * config, pe_worki
     crm_debug("Concurrent fencing is %s",
               is_set(data_set->flags, pe_flag_concurrent_fencing) ? "enabled" : "disabled");
 
+    // Default value -1 means `priority-fencing-delay` is disabled
+    data_set->priority_fencing_delay = -1;
+    value = pe_pref(data_set->config_hash,
+                    XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY);
+    if (value) {
+        data_set->priority_fencing_delay = crm_parse_interval_spec(value) / 1000;
+        crm_trace("Priority fencing delay is %ds", data_set->priority_fencing_delay);
+    }
+
     set_config_flag(data_set, "stop-all-resources", pe_flag_stop_everything);
     crm_debug("Stop all active resources: %s",
               is_set(data_set->flags, pe_flag_stop_everything) ? "true" : "false");
@@ -1126,7 +1138,7 @@ unpack_status(xmlNode * status, pe_worki
                 /* Everything else should flow from this automatically
                  * At least until the PE becomes able to migrate off healthy resources
                  */
-                pe_fence_node(data_set, this_node, "cluster does not have quorum");
+                pe_fence_node(data_set, this_node, "cluster does not have quorum", FALSE);
             }
         }
     }
@@ -1182,7 +1194,7 @@ determine_online_status_no_fencing(pe_wo
 
     } else {
         /* mark it unclean */
-        pe_fence_node(data_set, this_node, "peer is unexpectedly down");
+        pe_fence_node(data_set, this_node, "peer is unexpectedly down", FALSE);
         crm_info("\tin_cluster=%s, is_peer=%s, join=%s, expected=%s",
                  crm_str(in_cluster), crm_str(is_peer), crm_str(join), crm_str(exp_state));
     }
@@ -1238,10 +1250,10 @@ determine_online_status_fencing(pe_worki
         online = crmd_online;
 
     } else if (in_cluster == NULL) {
-        pe_fence_node(data_set, this_node, "peer has not been seen by the cluster");
+        pe_fence_node(data_set, this_node, "peer has not been seen by the cluster", FALSE);
 
     } else if (safe_str_eq(join, CRMD_JOINSTATE_NACK)) {
-        pe_fence_node(data_set, this_node, "peer failed the pacemaker membership criteria");
+        pe_fence_node(data_set, this_node, "peer failed the pacemaker membership criteria", FALSE);
 
     } else if (do_terminate == FALSE && safe_str_eq(exp_state, CRMD_JOINSTATE_DOWN)) {
@@ -1260,14 +1272,15 @@ determine_online_status_fencing(pe_worki
         online = FALSE;
 
     } else if (crm_is_true(in_cluster) == FALSE) {
-        pe_fence_node(data_set, this_node, "peer is no longer part of the cluster");
+        // Consider `priority-fencing-delay` for lost nodes
+        pe_fence_node(data_set, this_node, "peer is no longer part of the cluster", TRUE);
 
     } else if (!crmd_online) {
-        pe_fence_node(data_set, this_node, "peer process is no longer available");
+        pe_fence_node(data_set, this_node, "peer process is no longer available", FALSE);
 
         /* Everything is running at this point, now check join state */
     } else if (do_terminate) {
-        pe_fence_node(data_set, this_node, "termination was requested");
+        pe_fence_node(data_set, this_node, "termination was requested", FALSE);
 
     } else if (safe_str_eq(join, CRMD_JOINSTATE_MEMBER)) {
         crm_info("Node %s is active", this_node->details->uname);
@@ -1279,7 +1292,7 @@ determine_online_status_fencing(pe_worki
         this_node->details->pending = TRUE;
 
     } else {
-        pe_fence_node(data_set, this_node, "peer was in an unknown state");
+        pe_fence_node(data_set, this_node, "peer was in an unknown state", FALSE);
         crm_warn("%s: in-cluster=%s, is-peer=%s, join=%s, expected=%s, term=%d, shutdown=%d",
                  this_node->details->uname, crm_str(in_cluster), crm_str(is_peer), crm_str(join),
                  crm_str(exp_state), do_terminate, this_node->details->shutdown);
@@ -1803,7 +1816,7 @@ process_rsc_state(resource_t * rsc, node
             if (reason == NULL) {
                 reason = crm_strdup_printf("%s is thought to be active there", rsc->id);
             }
-            pe_fence_node(data_set, node, reason);
+            pe_fence_node(data_set, node, reason, FALSE);
         }
         free(reason);
     }
@@ -1825,7 +1838,7 @@ process_rsc_state(resource_t * rsc, node
              * but also mark the node as unclean
              */
            reason = crm_strdup_printf("%s failed there", rsc->id);
-            pe_fence_node(data_set, node, reason);
+            pe_fence_node(data_set, node, reason, FALSE);
             free(reason);
             break;
 
@@ -1884,7 +1897,7 @@ process_rsc_state(resource_t * rsc, node
                 /* connection resource to baremetal resource failed in a way that
                  * should result in fencing the remote-node.
                  */
                 pe_fence_node(data_set, tmpnode,
-                              "remote connection is unrecoverable");
+                              "remote connection is unrecoverable", FALSE);
             }
         }
 
@@ -2826,7 +2839,7 @@ static bool check_operation_expiry(resou
         && remote_node
         && remote_node->details->unclean) {
 
-        action_t *fence = pe_fence_op(remote_node, NULL, TRUE, NULL, data_set);
+        pe_action_t *fence = pe_fence_op(remote_node, NULL, TRUE, NULL, FALSE, data_set);
         crm_notice("Waiting for %s to complete before clearing %s failure for remote node %s",
                    fence?fence->uuid:"nil", task, rsc->id);
         order_actions(fence, clear_op, pe_order_implies_then);
Index: pacemaker-1.1.18+20180430.b12c320f5/lib/pengine/utils.c
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/lib/pengine/utils.c
+++ pacemaker-1.1.18+20180430.b12c320f5/lib/pengine/utils.c
@@ -532,7 +532,7 @@ custom_action(resource_t * rsc, char *ke
             if (is_set(action->rsc->flags, pe_rsc_managed)
                 && save_action && a_task == stop_rsc
                 && action->node->details->unclean == FALSE) {
-                pe_fence_node(data_set, action->node, "resource actions are unrunnable");
+                pe_fence_node(data_set, action->node, "resource actions are unrunnable", FALSE);
             }
 
         } else if (action->node->details->pending) {
@@ -2090,9 +2090,76 @@ find_unfencing_devices(GListPtr candidat
     return matches;
 }
 
+static int
+node_priority_fencing_delay(pe_node_t * node, pe_working_set_t * data_set)
+{
+    int member_count = 0;
+    int online_count = 0;
+    int top_priority = 0;
+    int lowest_priority = 0;
+    GListPtr gIter = NULL;
+
+    // `priority-fencing-delay` is disabled
+    if (data_set->priority_fencing_delay < 0) {
+        return -1;
+    }
+
+    /* No need to delay fencing if the fencing target is not a normal cluster
+     * member, for example if it's a remote node or a guest node. */
+    if (node->details->type != node_member) {
+        return 0;
+    }
+
+    // No need to delay fencing if the fencing target is in our partition
+    if (node->details->online) {
+        return 0;
+    }
+
+    for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
+        pe_node_t *n = gIter->data;
+
+        if (n->details->type != node_member) {
+            continue;
+        }
+
+        member_count ++;
+
+        if (n->details->online) {
+            online_count++;
+        }
+
+        if (member_count == 1
+            || n->details->priority > top_priority) {
+            top_priority = n->details->priority;
+        }
+
+        if (member_count == 1
+            || n->details->priority < lowest_priority) {
+            lowest_priority = n->details->priority;
+        }
+    }
+
+    // No need to delay if we have more than half of the cluster members
+    if (online_count > member_count / 2) {
+        return 0;
+    }
+
+    /* All the nodes have equal priority.
+     * Any configured corresponding `pcmk_delay_base/max` will be applied. */
+    if (lowest_priority == top_priority) {
+        return -1;
+    }
+
+    if (node->details->priority < top_priority) {
+        return 0;
+    }
+
+    return data_set->priority_fencing_delay;
+}
 
 action_t *
-pe_fence_op(node_t * node, const char *op, bool optional, const char *reason, pe_working_set_t * data_set)
+pe_fence_op(pe_node_t * node, const char *op, bool optional, const char *reason,
+            bool priority_delay, pe_working_set_t * data_set)
 {
     char *op_key = NULL;
     action_t *stonith_op = NULL;
@@ -2164,6 +2231,29 @@ pe_fence_op(node_t * node, const char *o
         free(op_key);
     }
 
+    if (data_set->priority_fencing_delay >= 0
+
+        /* It's a suitable case where `priority-fencing-delay` applies.
+         * At least add `priority-fencing-delay` field as an indicator. */
+        && (priority_delay
+
+            /* Re-calculate priority delay for the suitable case when
+             * pe_fence_op() is called again by stage6() after node priority has
+             * been actually calculated with native_add_running() */
+            || g_hash_table_lookup(stonith_op->meta,
+                                   XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY) != NULL)) {
+
+        /* Add `priority-fencing-delay` to the fencing op even if it's 0 for
+         * the targeting node. So that it takes precedence over any possible
+         * `pcmk_delay_base/max`.
+         */
+        char *delay_s = crm_itoa(node_priority_fencing_delay(node, data_set));
+
+        g_hash_table_insert(stonith_op->meta,
+                            strdup(XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY),
+                            delay_s);
+    }
+
     if(optional == FALSE && pe_can_fence(data_set, node)) {
         pe_action_required(stonith_op, NULL, reason);
     } else if(reason && stonith_op->reason == NULL) {
@@ -2189,7 +2279,7 @@ trigger_unfencing(
         && node->details->online
         && node->details->unclean == FALSE
         && node->details->shutdown == FALSE) {
-        action_t *unfence = pe_fence_op(node, "on", FALSE, reason, data_set);
+        pe_action_t *unfence = pe_fence_op(node, "on", FALSE, reason, FALSE, data_set);
 
         if(dependency) {
             order_actions(unfence, dependency, pe_order_optional);
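Usage note (not part of the patch): the delay value is read from the cluster options in unpack_config() above and parsed as an interval spec, while a node's priority is the sum of the priority meta-attribute of the resources it runs (plus one for a promoted instance, per native_priority_to_node()). The following is only an illustrative CIB sketch of how the feature would typically be configured; the ids, resource, and the 15s value are made-up examples, not taken from this patch:

  <crm_config>
    <cluster_property_set id="cib-bootstrap-options">
      <!-- fencing of the node hosting higher-priority resources is delayed by 15s -->
      <nvpair id="opt-priority-fencing-delay" name="priority-fencing-delay" value="15s"/>
    </cluster_property_set>
  </crm_config>

  <primitive id="important-rsc" class="ocf" provider="pacemaker" type="Dummy">
    <meta_attributes id="important-rsc-meta">
      <!-- contributes 1 to the priority of whichever node runs this resource -->
      <nvpair id="important-rsc-priority" name="priority" value="1"/>
    </meta_attributes>
  </primitive>

With such a configuration, in an even split the partition hosting the higher-priority resources gets a head start in the fencing race, since node_priority_fencing_delay() returns the configured delay only for the node with the lower calculated priority.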