Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
SUSE:SLE-12-SP5:GA
resource-agents.8843
0019-High-galera-Backport-patches-from-upstream...
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File 0019-High-galera-Backport-patches-from-upstream-bsc-10550.patch of Package resource-agents.8843
From 223d99f2016b187298b0cb4df8c726cf34799423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristoffer=20Gr=C3=B6nlund?= <krig@koru.se> Date: Tue, 5 Sep 2017 09:49:53 +0200 Subject: [PATCH 19/21] High: galera: Backport patches from upstream (bsc#1055017) (bsc#1056635) * galera: Honor "safe_to_bootstrap" flag in grastate.dat (bsc#1055017) * galera: Fix instance name in master_exists() (bsc#1056635) --- heartbeat/galera | 569 +++++++++++++++++++++++++++---------------------------- 1 file changed, 278 insertions(+), 291 deletions(-) diff --git a/heartbeat/galera b/heartbeat/galera index e4495bec..dc681a47 100755 --- a/heartbeat/galera +++ b/heartbeat/galera @@ -32,7 +32,7 @@ # Slave vs Master role: # # During the 'Slave' role, galera instances are in read-only mode and -# will not attempt to connect to the cluster. This role exists as +# will not attempt to connect to the cluster. This role exists only as # a means to determine which galera instance is the most up-to-date. The # most up-to-date node will be used to bootstrap a galera cluster that # has no current members. @@ -40,12 +40,9 @@ # The galera instances will only begin to be promoted to the Master role # once all the nodes in the 'wsrep_cluster_address' connection address # have entered read-only mode. At that point the node containing the -# database that is most current will be promoted to Master. -# -# Once the first Master instance bootstraps the galera cluster, the -# other nodes will join the cluster and start synchronizing via SST. -# They will stay in Slave role as long as the SST is running. Their -# promotion to Master will happen once synchronization is finished. +# database that is most current will be promoted to Master. Once the first +# Master instance bootstraps the galera cluster, the other nodes will be +# promoted to Master as well. # # Example: Create a galera cluster using nodes rhel7-node1 rhel7-node2 rhel7-node3 # @@ -76,6 +73,8 @@ # in this file if [ -f "/etc/sysconfig/clustercheck" ]; then . /etc/sysconfig/clustercheck +elif [ -f "/etc/default/clustercheck" ]; then + . /etc/default/clustercheck fi ####################################################################### @@ -206,13 +205,30 @@ The galera cluster address. This takes the form of: gcomm://node,node,node Only nodes present in this node list will be allowed to start a galera instance. -It is expected that the galera node names listed in this address match valid -pacemaker node names. +The galera node names listed in this address are expected to match valid +pacemaker node names. If both names need to differ, you must provide a +mapping in option cluster_host_map. </longdesc> <shortdesc lang="en">Galera cluster address</shortdesc> <content type="string" default=""/> </parameter> +<parameter name="cluster_host_map" unique="0" required="0"> +<longdesc lang="en"> +A mapping of pacemaker node names to galera node names. + +To be used when both pacemaker and galera names need to differ, +(e.g. when galera names map to IP from a specific network interface) +This takes the form of: +pcmk1:node.1.galera;pcmk2:node.2.galera;pcmk3:node.3.galera + +where the galera resource started on node pcmk1 would be named +node.1.galera in the wsrep_cluster_address +</longdesc> +<shortdesc lang="en">Pacemaker to Galera name mapping</shortdesc> +<content type="string" default=""/> +</parameter> + <parameter name="check_user" unique="0" required="0"> <longdesc lang="en"> Cluster check user. @@ -316,6 +332,27 @@ get_last_commit() fi } +clear_safe_to_bootstrap() +{ + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" -D +} + +set_safe_to_bootstrap() +{ + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" -v $1 +} + +get_safe_to_bootstrap() +{ + local node=$1 + + if [ -z "$node" ]; then + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" -Q 2>/dev/null + else + ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" -Q 2>/dev/null + fi +} + wait_for_sync() { local state=$(get_status_variable "wsrep_local_state") @@ -328,56 +365,6 @@ wait_for_sync() ocf_log info "Database synced." } -set_sync_needed() -{ - ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-sync-needed" -v "true" -} - -clear_sync_needed() -{ - ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-sync-needed" -D -} - -check_sync_needed() -{ - ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-sync-needed" -Q 2>/dev/null -} - - -# this function is called when attribute sync-needed is set in the CIB -check_sync_status() -{ - # if the pidfile is created, mysqld is up and running - # an IST might still be in progress, check wsrep status - if [ -e $OCF_RESKEY_pid ]; then - local cluster_status=$(get_status_variable "wsrep_cluster_status") - local state=$(get_status_variable "wsrep_local_state") - local ready=$(get_status_variable "wsrep_ready") - - if [ -z "$cluster_status" -o -z "$state" -o -z "$ready" ]; then - ocf_exit_reason "Unable to retrieve state transfer status, verify check_user '$OCF_RESKEY_check_user' has permissions to view status" - return $OCF_ERR_GENERIC - fi - - if [ "$cluster_status" != "Primary" ]; then - ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state." - return $OCF_ERR_GENERIC - fi - - if [ "$state" = "4" -a "$ready" = "ON" ]; then - ocf_log info "local node synced with the cluster" - # when sync is finished, we are ready to switch to Master - clear_sync_needed - set_master_score - return $OCF_SUCCESS - fi - fi - - # if we pass here, an IST or SST is still in progress - ocf_log info "local node syncing" - return $OCF_SUCCESS -} - is_primary() { cluster_status=$(get_status_variable "wsrep_cluster_status") @@ -420,7 +407,7 @@ master_exists() return 1 fi # determine if a master instance is already up and is healthy - crm_mon --as-xml | grep "resource.*id=\"${OCF_RESOURCE_INSTANCE}\".*role=\"Master\".*active=\"true\".*orphaned=\"false\".*failed=\"false\"" > /dev/null 2>&1 + crm_mon --as-xml | grep "resource.*id=\"${INSTANCE_ATTR_NAME}\".*role=\"Master\".*active=\"true\".*orphaned=\"false\".*failed=\"false\"" > /dev/null 2>&1 return $? } @@ -445,6 +432,22 @@ set_master_score() fi } +promote_everyone() +{ + + for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' '); do + local pcmk_node=$(galera_to_pcmk_name $node) + if [ -z "$pcmk_node" ]; then + ocf_log err "Could not determine pacemaker node from galera name <${node}>." + return + else + node=$pcmk_node + fi + + set_master_score $node + done +} + greater_than_equal_long() { # there are values we need to compare in this script @@ -452,17 +455,57 @@ greater_than_equal_long() echo | awk -v n1="$1" -v n2="$2" '{if (n1>=n2) printf ("true"); else printf ("false");}' | grep -q "true" } +galera_to_pcmk_name() +{ + local galera=$1 + if [ -z "$OCF_RESKEY_cluster_host_map" ]; then + echo $galera + else + echo "$OCF_RESKEY_cluster_host_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' | awk -F' ' '$2=="'"$galera"'" {print $1;exit}' + fi +} + +pcmk_to_galera_name() +{ + local pcmk=$1 + if [ -z "$OCF_RESKEY_cluster_host_map" ]; then + echo $pcmk + else + echo "$OCF_RESKEY_cluster_host_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' | awk -F' ' '$1=="'"$pcmk"'" {print $2;exit}' + fi +} + + detect_first_master() { local best_commit=0 - local best_node="$NODENAME" local last_commit=0 local missing_nodes=0 local nodes="" local nodes_recovered="" + local all_nodes + local best_node_gcomm + local best_node + local safe_to_bootstrap + + all_nodes=$(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' ') + best_node_gcomm=$(echo "$all_nodes" | sed 's/^.* \(.*\)$/\1/') + best_node=$(galera_to_pcmk_name $best_node_gcomm) + if [ -z "$best_node" ]; then + ocf_log err "Could not determine initial best node from galera name <${best_node_gcomm}>." + return + fi # avoid selecting a recovered node as bootstrap if possible - for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' '); do + for node in $all_nodes; do + local pcmk_node=$(galera_to_pcmk_name $node) + if [ -z "$pcmk_node" ]; then + ocf_log err "Could not determine pacemaker node from galera name <${node}>." + return + else + node=$pcmk_node + fi + if is_no_grastate $node; then nodes_recovered="$nodes_recovered $node" else @@ -471,6 +514,19 @@ detect_first_master() done for node in $nodes_recovered $nodes; do + safe_to_bootstrap=$(get_safe_to_bootstrap $node) + + if [ "$safe_to_bootstrap" = "1" ]; then + # Galera marked the node as safe to boostrap during shutdown. Let's just + # pick it as our bootstrap node. + ocf_log info "Node <${node}> is marked as safe to bootstrap." + best_node=$node + + # We don't need to wait for the other nodes to report state in this case + missing_nodes=0 + break + fi + last_commit=$(get_last_commit $node) if [ -z "$last_commit" ]; then @@ -501,155 +557,20 @@ detect_first_master() set_bootstrap_node $best_node } -detect_galera_pid() +detect_safe_to_bootstrap() { - ps auxww | grep -v -e "${OCF_RESKEY_binary}" -e grep | grep -qe "--pid-file=$OCF_RESKEY_pid" -} + local safe_to_bootstrap="" -galera_status() -{ - local loglevel=$1 - local rc - local running - - if [ -e $OCF_RESKEY_pid ]; then - mysql_common_status $loglevel - rc=$? - else - # if pidfile is not created, the server may - # still be starting up, e.g. running SST - detect_galera_pid - running=$? - if [ $running -eq 0 ]; then - rc=$OCF_SUCCESS - else - ocf_log $loglevel "MySQL is not running" - rc=$OCF_NOT_RUNNING - fi + if [ -f ${OCF_RESKEY_datadir}/grastate.dat ]; then + ocf_log info "attempting to read safe_to_bootstrap flag from ${OCF_RESKEY_datadir}/grastate.dat" + safe_to_bootstrap=$(sed -n 's/^safe_to_bootstrap:\s*\(.*\)$/\1/p' < ${OCF_RESKEY_datadir}/grastate.dat) fi - return $rc -} - -galera_start_nowait() -{ - local mysql_extra_params="$1" - local pid - local running - - ${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \ - --pid-file=$OCF_RESKEY_pid \ - --socket=$OCF_RESKEY_socket \ - --datadir=$OCF_RESKEY_datadir \ - --log-error=$OCF_RESKEY_log \ - --user=$OCF_RESKEY_user $OCF_RESKEY_additional_parameters \ - $mysql_extra_params >/dev/null 2>&1 & - pid=$! - - # Spin waiting for the server to be spawned. - # Let the CRM/LRM time us out if required. - start_wait=1 - while [ $start_wait = 1 ]; do - if ! ps $pid > /dev/null 2>&1; then - wait $pid - ocf_exit_reason "MySQL server failed to start (pid=$pid) (rc=$?), please check your installation" - return $OCF_ERR_GENERIC - fi - detect_galera_pid - running=$? - if [ $running -eq 0 ]; then - start_wait=0 - else - ocf_log info "MySQL is not running" - fi - sleep 2 - done - - return $OCF_SUCCESS -} - -galera_start_local_node() -{ - local rc - local extra_opts - local bootstrap - - bootstrap=$(is_bootstrap) - - master_exists - if [ $? -eq 0 ]; then - # join without bootstrapping - ocf_log info "Node <${NODENAME}> is joining the cluster" - extra_opts="--wsrep-cluster-address=${OCF_RESKEY_wsrep_cluster_address}" - elif ocf_is_true $bootstrap; then - ocf_log info "Node <${NODENAME}> is bootstrapping the cluster" - extra_opts="--wsrep-cluster-address=gcomm://" + if [ "$safe_to_bootstrap" = "1" ] || [ "$safe_to_bootstrap" = "0" ]; then + set_safe_to_bootstrap $safe_to_bootstrap else - ocf_exit_reason "Failure, Attempted to join cluster of $OCF_RESOURCE_INSTANCE before master node has been detected." - clear_last_commit - return $OCF_ERR_GENERIC + clear_safe_to_bootstrap fi - - # clear last_commit before we start galera to make sure there - # won't be discrepency between the cib and galera if this node - # processes a few transactions and fails before we detect it - clear_last_commit - - mysql_common_prepare_dirs - - # At start time, if galera requires a SST rather than an IST, the - # mysql server's pidfile won't be available until SST finishes, - # which can be longer than the start timeout. So we only check - # bootstrap node extensively. Joiner nodes are monitored in the - # "monitor" op - if ocf_is_true $bootstrap; then - # start server and wait until it's up and running - mysql_common_start "$extra_opts" - rc=$? - if [ $rc != $OCF_SUCCESS ]; then - return $rc - fi - - mysql_common_status info - rc=$? - - if [ $rc != $OCF_SUCCESS ]; then - ocf_exit_reason "Failed initial monitor action" - return $rc - fi - - is_readonly - if [ $? -eq 0 ]; then - ocf_exit_reason "Failure. Master instance started in read-only mode, check configuration." - return $OCF_ERR_GENERIC - fi - - is_primary - if [ $? -ne 0 ]; then - ocf_exit_reason "Failure. Master instance started, but is not in Primary mode." - return $OCF_ERR_GENERIC - fi - - clear_bootstrap_node - # clear attribute no-grastate. if last shutdown was - # not clean, we cannot be extra-cautious by requesting a SST - # since this is the bootstrap node - clear_no_grastate - else - # only start server, defer full checks to "monitor" op - galera_start_nowait "$extra_opts" - rc=$? - if [ $rc != $OCF_SUCCESS ]; then - return $rc - fi - - set_sync_needed - # attribute no-grastate will be cleared once the joiner - # has finished syncing and is promoted to Master - fi - - ocf_log info "Galera started" - return $OCF_SUCCESS } detect_last_commit() @@ -660,13 +581,14 @@ detect_last_commit() --socket=$OCF_RESKEY_socket \ --datadir=$OCF_RESKEY_datadir \ --user=$OCF_RESKEY_user" + local recovery_file_regex='s/.*WSREP\:.*position\s*recovery.*--log_error='\''\([^'\'']*\)'\''.*/\1/p' local recovered_position_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p' ocf_log info "attempting to detect last commit version by reading ${OCF_RESKEY_datadir}/grastate.dat" last_commit="$(cat ${OCF_RESKEY_datadir}/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')" if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then local tmp=$(mktemp) - local tmperr=$(mktemp) + chown $OCF_RESKEY_user:$OCF_RESKEY_group $tmp # if we pass here because grastate.dat doesn't exist, # try not to bootstrap from this node if possible @@ -676,33 +598,36 @@ detect_last_commit() ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'" - ${OCF_RESKEY_binary} $recover_args --wsrep-recover > $tmp 2> $tmperr + ${OCF_RESKEY_binary} $recover_args --wsrep-recover --log-error=$tmp 2>/dev/null - last_commit="$(cat $tmp | sed -n $recovered_position_regex)" + last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)" if [ -z "$last_commit" ]; then # Galera uses InnoDB's 2pc transactions internally. If # server was stopped in the middle of a replication, the # recovery may find a "prepared" XA transaction in the # redo log, and mysql won't recover automatically - cat $tmperr | grep -q -E '\[ERROR\]\s+Found\s+[0-9]+\s+prepared\s+transactions!' 2>/dev/null - if [ $? -eq 0 ]; then - # we can only rollback the transaction, but that's OK - # since the DB will get resynchronized anyway - ocf_log warn "local node <${NODENAME}> was not shutdown properly. Rollback stuck transaction with --tc-heuristic-recover" - ${OCF_RESKEY_binary} $recover_args --wsrep-recover \ - --tc-heuristic-recover=rollback > $tmp 2>/dev/null + local recovery_file="$(cat $tmp | sed -n $recovery_file_regex)" + if [ -e $recovery_file ]; then + cat $recovery_file | grep -q -E '\[ERROR\]\s+Found\s+[0-9]+\s+prepared\s+transactions!' 2>/dev/null + if [ $? -eq 0 ]; then + # we can only rollback the transaction, but that's OK + # since the DB will get resynchronized anyway + ocf_log warn "local node <${NODENAME}> was not shutdown properly. Rollback stuck transaction with --tc-heuristic-recover" + ${OCF_RESKEY_binary} $recover_args --wsrep-recover \ + --tc-heuristic-recover=rollback --log-error=$tmp 2>/dev/null - last_commit="$(cat $tmp | sed -n $recovered_position_regex)" - if [ ! -z "$last_commit" ]; then - ocf_log warn "State recovered. force SST at next restart for full resynchronization" - rm -f ${OCF_RESKEY_datadir}/grastate.dat - # try not to bootstrap from this node if possible - set_no_grastate + last_commit="$(cat $tmp | sed -n $recovered_position_regex | tail -1)" + if [ ! -z "$last_commit" ]; then + ocf_log warn "State recovered. force SST at next restart for full resynchronization" + rm -f ${OCF_RESKEY_datadir}/grastate.dat + # try not to bootstrap from this node if possible + set_no_grastate + fi fi fi fi - rm -f $tmp $tmperr + rm -f $tmp fi if [ ! -z "$last_commit" ]; then @@ -716,35 +641,95 @@ detect_last_commit() fi } +# For galera, promote is really start galera_promote() { local rc local extra_opts local bootstrap - + local safe_to_bootstrap master_exists + if [ $? -eq 0 ]; then + # join without bootstrapping + extra_opts="--wsrep-cluster-address=${OCF_RESKEY_wsrep_cluster_address}" + else + bootstrap=$(is_bootstrap) + + if ocf_is_true $bootstrap; then + # The best node for bootstrapping wasn't cleanly shutdown. Allow + # bootstrapping anyways + if [ "$(get_safe_to_bootstrap)" = "0" ]; then + sed -ie 's/^\(safe_to_bootstrap:\) 0/\1 1/' ${OCF_RESKEY_datadir}/grastate.dat + fi + ocf_log info "Node <${NODENAME}> is bootstrapping the cluster" + extra_opts="--wsrep-cluster-address=gcomm://" + else + ocf_exit_reason "Failure, Attempted to promote Master instance of $OCF_RESOURCE_INSTANCE before bootstrap node has been detected." + clear_last_commit + return $OCF_ERR_GENERIC + fi + fi + + galera_monitor + if [ $? -eq $OCF_RUNNING_MASTER ]; then + if ocf_is_true $bootstrap; then + promote_everyone + clear_bootstrap_node + ocf_log info "boostrap node already up, promoting the rest of the galera instances." + fi + clear_safe_to_bootstrap + clear_last_commit + return $OCF_SUCCESS + fi + + # last commit/safe_to_bootstrap flag are no longer relevant once promoted + clear_last_commit + clear_safe_to_bootstrap + + mysql_common_prepare_dirs + mysql_common_start "$extra_opts" + rc=$? + if [ $rc != $OCF_SUCCESS ]; then + return $rc + fi + + galera_monitor + rc=$? + if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then + ocf_exit_reason "Failed initial monitor action" + return $rc + fi + + is_readonly + if [ $? -eq 0 ]; then + ocf_exit_reason "Failure. Master instance started in read-only mode, check configuration." + return $OCF_ERR_GENERIC + fi + + is_primary if [ $? -ne 0 ]; then - # promoting the first master will bootstrap the cluster - if is_bootstrap; then - galera_start_local_node - rc=$? - return $rc - else - ocf_exit_reason "Attempted to start the cluster without being a bootstrap node." - return $OCF_ERR_GENERIC - fi + ocf_exit_reason "Failure. Master instance started, but is not in Primary mode." + return $OCF_ERR_GENERIC + fi + + if ocf_is_true $bootstrap; then + promote_everyone + clear_bootstrap_node + # clear attribute no-grastate. if last shutdown was + # not clean, we cannot be extra-cautious by requesting a SST + # since this is the bootstrap node + clear_no_grastate + ocf_log info "Bootstrap complete, promoting the rest of the galera instances." else - # promoting other masters only performs sanity checks - # as the joining nodes were started during the "monitor" op - if ! check_sync_needed; then - # sync is done, clear info about last startup - clear_no_grastate - return $OCF_SUCCESS - else - ocf_exit_reason "Attempted to promote local node while sync was still needed." - return $OCF_ERR_GENERIC - fi + # if this is not the bootstrap node, make sure this instance + # syncs with the rest of the cluster before promotion returns. + wait_for_sync + # sync is done, clear info about last startup + clear_no_grastate fi + + ocf_log info "Galera started" + return $OCF_SUCCESS } galera_demote() @@ -759,10 +744,18 @@ galera_demote() # if this node was previously a bootstrap node, that is no longer the case. clear_bootstrap_node clear_last_commit - clear_sync_needed clear_no_grastate + clear_safe_to_bootstrap + + # Clear master score here rather than letting pacemaker do so once + # demote finishes. This way a promote cannot take place right + # after this demote even if pacemaker is requested to do so. It + # will first have to run a start/monitor op, to reprobe the state + # of the other galera nodes and act accordingly. + clear_master_score # record last commit for next promotion + detect_safe_to_bootstrap detect_last_commit rc=$? return $rc @@ -771,21 +764,29 @@ galera_demote() galera_start() { local rc + local galera_node - echo $OCF_RESKEY_wsrep_cluster_address | grep -q $NODENAME + galera_node=$(pcmk_to_galera_name $NODENAME) + if [ -z "$galera_node" ]; then + ocf_exit_reason "Could not determine galera name from pacemaker node <${NODENAME}>." + return $OCF_ERR_CONFIGURED + fi + + echo $OCF_RESKEY_wsrep_cluster_address | grep -q -F $galera_node if [ $? -ne 0 ]; then - ocf_exit_reason "local node <${NODENAME}> must be a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}>to start this galera instance" + ocf_exit_reason "local node <${NODENAME}> (galera node <${galera_node}>) must be a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}> to start this galera instance" return $OCF_ERR_CONFIGURED fi - galera_status info - if [ $? -ne $OCF_NOT_RUNNING ]; then + galera_monitor + if [ $? -eq $OCF_RUNNING_MASTER ]; then ocf_exit_reason "master galera instance started outside of the cluster's control" return $OCF_ERR_GENERIC fi mysql_common_prepare_dirs + detect_safe_to_bootstrap detect_last_commit rc=$? if [ $rc -ne $OCF_SUCCESS ]; then @@ -794,7 +795,8 @@ galera_start() master_exists if [ $? -eq 0 ]; then - ocf_log info "Master instances are already up, local node will join in when started" + ocf_log info "Master instances are already up, setting master score so this instance will join galera cluster." + set_master_score $NODENAME else clear_master_score detect_first_master @@ -806,6 +808,7 @@ galera_start() galera_monitor() { local rc + local galera_node local status_loglevel="err" # Set loglevel to info during probe @@ -813,29 +816,22 @@ galera_monitor() status_loglevel="info" fi - # Check whether mysql is running or about to start after sync - galera_status $status_loglevel + mysql_common_status $status_loglevel rc=$? if [ $rc -eq $OCF_NOT_RUNNING ]; then - last_commit=$(get_last_commit $NODENAME) - if [ -n "$last_commit" ];then + last_commit=$(get_last_commit $node) + if [ -n "$last_commit" ]; then + # if last commit is set, this instance is considered started in slave mode rc=$OCF_SUCCESS - - if ocf_is_probe; then - # prevent state change during probe - return $rc - fi - master_exists if [ $? -ne 0 ]; then detect_first_master else - # a master instance exists and is healthy. - # start this node and mark it as "pending sync" - ocf_log info "cluster is running. start local node to join in" - galera_start_local_node - rc=$? + # a master instance exists and is healthy, promote this + # local read only instance + # so it can join the master galera cluster. + set_master_score fi fi return $rc @@ -843,40 +839,31 @@ galera_monitor() return $rc fi - # if we make it here, mysql is running or about to start after sync. - # Check cluster status now. + # if we make it here, mysql is running. Check cluster status now. + galera_node=$(pcmk_to_galera_name $NODENAME) + if [ -z "$galera_node" ]; then + ocf_exit_reason "Could not determine galera name from pacemaker node <${NODENAME}>." + return $OCF_ERR_CONFIGURED + fi - echo $OCF_RESKEY_wsrep_cluster_address | grep -q $NODENAME + echo $OCF_RESKEY_wsrep_cluster_address | grep -q -F $galera_node if [ $? -ne 0 ]; then - ocf_exit_reason "local node <${NODENAME}> is started, but is not a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}>" + ocf_exit_reason "local node <${NODENAME}> (galera node <${galera_node}>) is started, but is not a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}>" return $OCF_ERR_GENERIC fi - check_sync_needed + is_primary if [ $? -eq 0 ]; then - # galera running and sync is needed: slave state + if ocf_is_probe; then - # prevent state change during probe - rc=$OCF_SUCCESS - else - check_sync_status - rc=$? + # restore master score during probe + # if we detect this is a master instance + set_master_score fi + rc=$OCF_RUNNING_MASTER else - is_primary - if [ $? -ne 0 ]; then - ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state." - rc=$OCF_ERR_GENERIC - else - # galera running, no need to sync: master state and everything's clear - rc=$OCF_RUNNING_MASTER - - if ocf_is_probe; then - # restore master score during probe - # if we detect this is a master instance - set_master_score - fi - fi + ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state." + rc=$OCF_ERR_GENERIC fi return $rc @@ -887,12 +874,12 @@ galera_stop() local rc # make sure the process is stopped mysql_common_stop - rc=$? + rc=$1 + clear_safe_to_bootstrap clear_last_commit clear_master_score clear_bootstrap_node - clear_sync_needed clear_no_grastate return $rc } @@ -962,7 +949,7 @@ fi case "$1" in start) galera_start;; stop) galera_stop;; - status) galera_status err;; + status) mysql_common_status err;; monitor) galera_monitor;; promote) galera_promote;; demote) galera_demote;; -- 2.14.1
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor