File bsc#1133866-0003-Fix-controller-confirm-cancel-of-failed-monitors.patch of Package pacemaker.16898
From 5470f1d9c776dbf753e015fa96153b6a63c17b83 Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Thu, 9 May 2019 13:24:35 +0200
Subject: [PATCH] Fix: controller: confirm cancel of failed monitors

Usually, after a monitor has been cancelled from the executor, the
controller erases the corresponding lrm_rsc_op from the CIB, and the DC
confirms the cancel action in process_op_deletion() according to the
CIB diff.

But if a monitor has failed, the lrm_rsc_op is recorded as
"last_failure". When cancelling it, the lrm_rsc_op won't get erased
from the CIB, given the intentional logic in erase_lrm_history_by_op(),
so the cancel action never gets confirmed by the DC through
process_op_deletion(). Previously, the cluster transition would get
stuck waiting for the remaining action timer to time out.

This commit fixes the issue by directly acknowledging the cancel action
in this case, enabling the DC to confirm it.

This also moves the get_node_id() function into controld_utils.c for
common use.

Producer:
```
# Insert a 10s sleep in the monitor action of RA
# /usr/lib/ocf/resource.d/pacemaker/Stateful:

  stateful_monitor() {
+     sleep 10
      stateful_check_state "master"

# Add a promotable clone resource:
crm configure primitive stateful ocf:pacemaker:Stateful \
    op monitor interval=5 role=Master \
    op monitor interval=10 role=Slave
crm configure clone p-clone stateful \
    meta promotable=true

# Wait for the resource instance to be started, promoted to be master,
# and monitor for master role to complete.

# Set is-managed=false for the promotable clone:
crm_resource --meta -p is-managed -v false -r p-clone

# Change the status of the master instance to be slave and immediately
# enforce refresh of it:
echo slave > /var/run/Stateful-stateful.state; crm_resource --refresh -r stateful --force

# Wait for probe to complete, and then monitor for slave role to be
# issued:
sleep 15

# While the monitor for slave role is still in progress, change the
# status to be master again:
echo master > /var/run/Stateful-stateful.state

# The monitor for slave role returns error. Cluster issues monitor for
# master role instead and tries to cancel the failed one for slave
# role. But cluster transition gets stuck. Depending on the monitor
# timeout configured for the slave role plus cluster-delay, only after
# that does the controller eventually say:

pacemaker-controld[21205] error: Node opensuse150 did not send cancel result (via controller) within 20000ms (action timeout plus cluster-delay)
pacemaker-controld[21205] error: [Action    1]: In-flight rsc op stateful_monitor_10000 on opensuse150 (priority: 0, waiting: none)
pacemaker-controld[21205] notice: Transition 6 aborted: Action lost
```
---
 crmd/lrm.c          | 38 ++++++++++++++++++++++++++++++++
 crmd/te_callbacks.c | 21 ++----------------
 crmd/te_events.c    | 32 +++++++++++++++++++++++++++
 crmd/tengine.h      |  1 +
 crmd/utils.c        | 13 +++++++++++
 crmd/crmd_utils.h   |  2 ++
 6 files changed, 88 insertions(+), 19 deletions(-)

Index: pacemaker-1.1.18+20180430.b12c320f5/crmd/lrm.c
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/crmd/lrm.c
+++ pacemaker-1.1.18+20180430.b12c320f5/crmd/lrm.c
@@ -2496,6 +2496,30 @@ unescape_newlines(const char *string)
     return ret;
 }
 
+static bool
+did_lrm_rsc_op_fail(lrm_state_t *lrm_state, const char * rsc_id,
+                    const char * op_type, guint interval_ms)
+{
+    rsc_history_t *entry = NULL;
+
+    CRM_CHECK(lrm_state != NULL, return FALSE);
+    CRM_CHECK(rsc_id != NULL, return FALSE);
+    CRM_CHECK(op_type != NULL, return FALSE);
+
+    entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
+    if (entry == NULL || entry->failed == NULL) {
+        return FALSE;
+    }
+
+    if (crm_str_eq(entry->failed->rsc_id, rsc_id, TRUE)
+        && safe_str_eq(entry->failed->op_type, op_type)
+        && entry->failed->interval_ms == interval_ms) {
+        return TRUE;
+    }
+
+    return FALSE;
+}
+
 gboolean
 process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurring_op_s *pending)
 {
@@ -2559,6 +2583,20 @@ process_lrm_event(lrm_state_t * lrm_stat
         /* The tengine canceled this op, we have been waiting for the cancel to finish. */
         erase_lrm_history_by_op(lrm_state, op);
 
+        /* If the recurring operation had failed, the lrm_rsc_op is recorded as
+         * "last_failure" which won't get erased from the cib given the logic on
+         * purpose in erase_lrm_history_by_op(). So that the cancel action won't
+         * have a chance to get confirmed by DC with process_op_deletion().
+         * Cluster transition would get stuck waiting for the remaining action
+         * timer to time out.
+         *
+         * Directly acknowledge the cancel operation in this case.
+         */
+        if (did_lrm_rsc_op_fail(lrm_state, pending->rsc_id,
+                                pending->op_type, pending->interval_ms)) {
+            send_direct_ack(NULL, NULL, NULL, op, op->rsc_id);
+        }
+
     } else if (op->rsc_deleted) {
         /* The tengine initiated this op, but it was cancelled outside of the
          * tengine's control during a resource cleanup/re-probe request. The tengine
Index: pacemaker-1.1.18+20180430.b12c320f5/crmd/te_callbacks.c
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/crmd/te_callbacks.c
+++ pacemaker-1.1.18+20180430.b12c320f5/crmd/te_callbacks.c
@@ -31,19 +31,6 @@ static unsigned long int stonith_max_att
 /* #define rsc_op_template "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */
 #define rsc_op_template "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']"
 
-static const char *
-get_node_id(xmlNode * rsc_op)
-{
-    xmlNode *node = rsc_op;
-
-    while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) {
-        node = node->parent;
-    }
-
-    CRM_CHECK(node != NULL, return NULL);
-    return ID(node);
-}
-
 void
 update_stonith_max_attempts(const char* value)
 {
@@ -373,12 +360,8 @@ process_op_deletion(const char *xpath, x
     node_uuid = extract_node_uuid(xpath);
     cancel = get_cancel_action(key, node_uuid);
     if (cancel) {
-        crm_info("Cancellation of %s on %s confirmed (%d)",
-                 key, node_uuid, cancel->id);
-        stop_te_timer(cancel->timer);
-        te_action_confirmed(cancel);
-        update_graph(transition_graph, cancel);
-        trigger_graph();
+        confirm_cancel_action(cancel);
+
     } else {
         abort_transition(INFINITY, tg_restart, "Resource operation removal",
                          change);
Index: pacemaker-1.1.18+20180430.b12c320f5/crmd/te_events.c
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/crmd/te_events.c
+++ pacemaker-1.1.18+20180430.b12c320f5/crmd/te_events.c
@@ -375,6 +375,27 @@ get_cancel_action(const char *id, const
     return NULL;
 }
 
+void
+confirm_cancel_action(crm_action_t *cancel)
+{
+    const char *op_key = NULL;
+    const char *node_name = NULL;
+
+    CRM_ASSERT(cancel != NULL);
+
+    op_key = crm_element_value(cancel->xml, XML_LRM_ATTR_TASK_KEY);
+    node_name = crm_element_value(cancel->xml, XML_LRM_ATTR_TARGET);
+
+    stop_te_timer(cancel->timer);
+    te_action_confirmed(cancel);
+    update_graph(transition_graph, cancel);
+
+    crm_info("Cancellation of %s on %s confirmed (action %d)",
+             op_key, node_name, cancel->id);
+
+    trigger_graph();
+}
+
 /* downed nodes are listed like: <downed> <node id="UUID1" /> ... </downed> */
 #define XPATH_DOWNED "//" XML_GRAPH_TAG_DOWNED \
                      "/" XML_CIB_TAG_NODE "[@" XML_ATTR_UUID "='%s']"
@@ -491,6 +512,17 @@ process_graph_event(xmlNode *event, cons
         /* Recurring actions have the transition number they were first
          * scheduled in.
          */
+
+        if (status == PCMK_LRM_OP_CANCELLED) {
+            const char *node_id = get_node_id(event);
+
+            action = get_cancel_action(id, node_id);
+            if (action) {
+                confirm_cancel_action(action);
+            }
+            goto bail;
+        }
+
         desc = "arrived after initial scheduling";
         abort_transition(INFINITY, tg_restart, "Change in recurring result",
                          event);
Index: pacemaker-1.1.18+20180430.b12c320f5/crmd/tengine.h
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/crmd/tengine.h
+++ pacemaker-1.1.18+20180430.b12c320f5/crmd/tengine.h
@@ -25,6 +25,7 @@ void execute_stonith_cleanup(void);
 /* tengine */
 extern crm_action_t *match_down_event(const char *target);
 extern crm_action_t *get_cancel_action(const char *id, const char *node);
+void confirm_cancel_action(crm_action_t *cancel);
 extern gboolean cib_action_update(crm_action_t * action, int status, int op_rc);
 extern gboolean fail_incompletable_actions(crm_graph_t * graph, const char *down_node);
Index: pacemaker-1.1.18+20180430.b12c320f5/crmd/utils.c
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/crmd/utils.c
+++ pacemaker-1.1.18+20180430.b12c320f5/crmd/utils.c
@@ -1073,3 +1073,16 @@ feature_set_compatible(const char *dc_ve
     // DC's minor version must be the same or older
     return dc_v <= join_v;
 }
+
+const char *
+get_node_id(xmlNode *lrm_rsc_op)
+{
+    xmlNode *node = lrm_rsc_op;
+
+    while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) {
+        node = node->parent;
+    }
+
+    CRM_CHECK(node != NULL, return NULL);
+    return ID(node);
+}
Index: pacemaker-1.1.18+20180430.b12c320f5/crmd/crmd_utils.h
===================================================================
--- pacemaker-1.1.18+20180430.b12c320f5.orig/crmd/crmd_utils.h
+++ pacemaker-1.1.18+20180430.b12c320f5/crmd/crmd_utils.h
@@ -98,6 +98,8 @@ unsigned int cib_op_timeout(void);
 
 bool feature_set_compatible(const char *dc_version, const char *join_version);
 
+const char *get_node_id(xmlNode *lrm_rsc_op);
+
 /* Convenience macro for registering a CIB callback
  * (assumes that data can be freed with free())
  */
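
A quick way to sanity-check the fix is to repeat the producer steps above after applying the patch and watch for the confirmation message that confirm_cancel_action() now logs. This is a minimal, hypothetical check: it assumes a systemd host where the pacemaker service logs to the journal (on other setups, grep /var/log/pacemaker.log or syslog instead); the operation and node names are taken from the producer scenario.
```
# Re-run the producer scenario, then look for the new confirmation
# message (wording from confirm_cancel_action() in the patch above):
journalctl -u pacemaker | grep "Cancellation of"

# Expected with the fix: the cancel of the failed slave-role monitor is
# confirmed promptly, along the lines of
#   pacemaker-controld[...] info: Cancellation of stateful_monitor_10000 on opensuse150 confirmed (action <n>)
# rather than the transition stalling until the action timeout plus
# cluster-delay expires.
```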