Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
SUSE:SLE-15-SP3:Update
fence-agents.23353
0001-fence_gce-Add-timeouts-and-failure-options...
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File 0001-fence_gce-Add-timeouts-and-failure-options-458.patch of Package fence-agents.23353
From 67f8cee8f5351731a55534a5ae777436a2f67aea Mon Sep 17 00:00:00 2001 From: kj1724 <78624900+kj1724@users.noreply.github.com> Date: Tue, 1 Feb 2022 03:43:58 -0500 Subject: [PATCH 1/1] fence_gce: Add timeouts and failure options (#458) Gives users the opportunity to timeout while waiting for pending resets and allows them to run a follow command if the reset fails. This gives users more flexibility on how failures are handled. --- agents/gce/fence_gce.py | 140 +++++++++++++++++++++++------- tests/data/metadata/fence_gce.xml | 25 ++++++ 2 files changed, 134 insertions(+), 31 deletions(-) diff --git a/agents/gce/fence_gce.py b/agents/gce/fence_gce.py index c0bfefdb..3594fd8e 100644 --- a/agents/gce/fence_gce.py +++ b/agents/gce/fence_gce.py @@ -2,10 +2,10 @@ # # Requires the googleapiclient and oauth2client -# RHEL 7.x: google-api-python-client==1.6.7 python-gflags==2.0 pyasn1==0.4.8 rsa==3.4.2 -# RHEL 8.x: nothing additional needed -# SLES 12.x: python-google-api-python-client python-oauth2client python-oauth2client-gce -# SLES 15.x: python3-google-api-python-client python3-oauth2client python3-oauth2client-gce +# RHEL 7.x: google-api-python-client==1.6.7 python-gflags==2.0 pyasn1==0.4.8 rsa==3.4.2 pysocks==1.7.1 httplib2==0.19.0 +# RHEL 8.x: pysocks==1.7.1 httplib2==0.19.0 +# SLES 12.x: python-google-api-python-client python-oauth2client python-oauth2client-gce pysocks==1.7.1 httplib2==0.19.0 +# SLES 15.x: python3-google-api-python-client python3-oauth2client pysocks==1.7.1 httplib2==0.19.0 # import atexit @@ -28,7 +28,7 @@ else: import urllib2 as urlrequest sys.path.append("@FENCEAGENTSLIBDIR@") -from fencing import fail_usage, run_delay, all_opt, atexit_handler, check_input, process_input, show_docs, fence_action +from fencing import fail_usage, run_delay, all_opt, atexit_handler, check_input, process_input, show_docs, fence_action, run_command try: import googleapiclient.discovery import socks @@ -41,6 +41,19 @@ except: METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/' METADATA_HEADERS = {'Metadata-Flavor': 'Google'} +INSTANCE_LINK = 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}/instances/{}' + +def run_on_fail(options): + if "--runonfail" in options: + run_command(options, options["--runonfail"]) + +def fail_fence_agent(options, message): + run_on_fail(options) + fail_usage(message) + +def raise_fence_agent(options, message): + run_on_fail(options) + raise Exception(message) # # Will use baremetalsolution setting or the environment variable @@ -65,7 +78,7 @@ def replace_api_uri(options, http_request): { "matchlength": 4, "match": "https://compute.googleapis.com/compute/v1/projects/(.*)/zones/(.*)/instances/(.*)/reset(.*)", - "replace": "https://baremetalsolution.googleapis.com/v1alpha1/projects/\\1/locations/\\2/instances/\\3:resetInstance\\4" + "replace": "https://baremetalsolution.googleapis.com/v1/projects/\\1/locations/\\2/instances/\\3:resetInstance\\4" }) for uri_replacement in uri_replacements: # each uri_replacement should have matchlength, match, and replace @@ -120,14 +133,17 @@ def translate_status(instance_status): def get_nodes_list(conn, options): result = {} + if "--zone" not in options: + fail_fence_agent(options, "Failed: get_nodes_list: Please specify the --zone in the command") try: - instanceList = retry_api_execute(options, conn.instances().list( - project=options["--project"], - zone=options["--zone"])) - for instance in instanceList["items"]: - result[instance["id"]] = (instance["name"], translate_status(instance["status"])) + for zone in options["--zone"].split(","): + instanceList = retry_api_execute(options, conn.instances().list( + project=options["--project"], + zone=zone)) + for instance in instanceList["items"]: + result[instance["id"]] = (instance["name"], translate_status(instance["status"])) except Exception as err: - fail_usage("Failed: get_nodes_list: {}".format(str(err))) + fail_fence_agent(options, "Failed: get_nodes_list: {}".format(str(err))) return result @@ -141,23 +157,54 @@ def get_power_status(conn, options): return "off" else: return "on" + # If zone is not listed for an entry we attempt to get it automatically + instance = options["--plug"] + zone = get_zone(conn, options, instance) if "--plugzonemap" not in options else options["--plugzonemap"][instance] + instance_status = get_instance_power_status(conn, options, instance, zone) + # If any of the instances do not match the intended status we return the + # the opposite status so that the fence agent can change it. + if instance_status != options.get("--action"): + return instance_status + + return options.get("--action") + + +def get_instance_power_status(conn, options, instance, zone): try: - instance = retry_api_execute(options, conn.instances().get( - project=options["--project"], - zone=options["--zone"], - instance=options["--plug"])) + instance = retry_api_execute( + options, + conn.instances().get(project=options["--project"], zone=zone, instance=instance)) return translate_status(instance["status"]) except Exception as err: - fail_usage("Failed: get_power_status: {}".format(str(err))) + fail_fence_agent(options, "Failed: get_instance_power_status: {}".format(str(err))) + + +def check_for_existing_operation(conn, options, instance, zone, operation_type): + logging.debug("check_for_existing_operation") + if "--baremetalsolution" in options: + # There is no API for checking in progress operations + return False + + project = options["--project"] + target_link = INSTANCE_LINK.format(project, zone, instance) + query_filter = '(targetLink = "{}") AND (operationType = "{}") AND (status = "RUNNING")'.format(target_link, operation_type) + result = retry_api_execute( + options, + conn.zoneOperations().list(project=project, zone=zone, filter=query_filter, maxResults=1)) + + if "items" in result and result["items"]: + logging.info("Existing %s operation found", operation_type) + return result["items"][0] -def wait_for_operation(conn, options, operation): +def wait_for_operation(conn, options, zone, operation): if 'name' not in operation: logging.warning('Cannot wait for operation to complete, the' ' requested operation will continue asynchronously') - return + return False + + wait_time = 0 project = options["--project"] - zone = options["--zone"] while True: result = retry_api_execute(options, conn.zoneOperations().get( project=project, @@ -165,56 +212,93 @@ def wait_for_operation(conn, options, operation): operation=operation['name'])) if result['status'] == 'DONE': if 'error' in result: - raise Exception(result['error']) - return + raise_fence_agent(options, result['error']) + return True + + if "--errortimeout" in options and wait_time > int(options["--errortimeout"]): + raise_fence_agent(options, "Operation did not complete before the timeout.") + + if "--warntimeout" in options and wait_time > int(options["--warntimeout"]): + logging.warning("Operation did not complete before the timeout.") + if "--runonwarn" in options: + run_command(options, options["--runonwarn"]) + return False + + wait_time = wait_time + 1 time.sleep(1) def set_power_status(conn, options): - logging.debug("set_power_status"); + logging.debug("set_power_status") + instance = options["--plug"] + # If zone is not listed for an entry we attempt to get it automatically + zone = get_zone(conn, options, instance) if "--plugzonemap" not in options else options["--plugzonemap"][instance] + set_instance_power_status(conn, options, instance, zone, options["--action"]) + + +def set_instance_power_status(conn, options, instance, zone, action): + logging.info("Setting power status of %s in zone %s", instance, zone) + project = options["--project"] + try: - if options["--action"] == "off": - logging.info("Issuing poweroff of %s in zone %s" % (options["--plug"], options["--zone"])) - operation = retry_api_execute(options, conn.instances().stop( - project=options["--project"], - zone=options["--zone"], - instance=options["--plug"])) + if action == "off": + logging.info("Issuing poweroff of %s in zone %s", instance, zone) + operation = check_for_existing_operation(conn, options, instance, zone, "stop") + if operation and "--earlyexit" in options: + return + if not operation: + operation = retry_api_execute( + options, + conn.instances().stop(project=project, zone=zone, instance=instance)) logging.info("Poweroff command completed, waiting for the operation to complete") - wait_for_operation(conn, options, operation) - logging.info("Poweroff of %s in zone %s complete" % (options["--plug"], options["--zone"])) - elif options["--action"] == "on": - logging.info("Issuing poweron of %s in zone %s" % (options["--plug"], options["--zone"])) - operation = retry_api_execute(options, conn.instances().start( - project=options["--project"], - zone=options["--zone"], - instance=options["--plug"])) - wait_for_operation(conn, options, operation) - logging.info("Poweron of %s in zone %s complete" % (options["--plug"], options["--zone"])) + if wait_for_operation(conn, options, zone, operation): + logging.info("Poweroff of %s in zone %s complete", instance, zone) + elif action == "on": + logging.info("Issuing poweron of %s in zone %s", instance, zone) + operation = check_for_existing_operation(conn, options, instance, zone, "start") + if operation and "--earlyexit" in options: + return + if not operation: + operation = retry_api_execute( + options, + conn.instances().start(project=project, zone=zone, instance=instance)) + if wait_for_operation(conn, options, zone, operation): + logging.info("Poweron of %s in zone %s complete", instance, zone) except Exception as err: - fail_usage("Failed: set_power_status: {}".format(str(err))) - + fail_fence_agent(options, "Failed: set_instance_power_status: {}".format(str(err))) def power_cycle(conn, options): - logging.debug("power_cycle"); + logging.debug("power_cycle") + instance = options["--plug"] + # If zone is not listed for an entry we attempt to get it automatically + zone = get_zone(conn, options, instance) if "--plugzonemap" not in options else options["--plugzonemap"][instance] + return power_cycle_instance(conn, options, instance, zone) + + +def power_cycle_instance(conn, options, instance, zone): + logging.info("Issuing reset of %s in zone %s", instance, zone) + project = options["--project"] + try: - logging.info('Issuing reset of %s in zone %s' % (options["--plug"], options["--zone"])) - operation = retry_api_execute(options, conn.instances().reset( - project=options["--project"], - zone=options["--zone"], - instance=options["--plug"])) - logging.info("Reset command completed, waiting for the operation to complete") - wait_for_operation(conn, options, operation) - logging.info('Reset of %s in zone %s complete' % (options["--plug"], options["--zone"])) + operation = check_for_existing_operation(conn, options, instance, zone, "reset") + if operation and "--earlyexit" in options: + return True + if not operation: + operation = retry_api_execute( + options, + conn.instances().reset(project=project, zone=zone, instance=instance)) + logging.info("Reset command sent, waiting for the operation to complete") + if wait_for_operation(conn, options, zone, operation): + logging.info("Reset of %s in zone %s complete", instance, zone) return True except Exception as err: - logging.error("Failed: power_cycle: {}".format(str(err))) - return False + logging.exception("Failed: power_cycle") + raise err -def get_zone(conn, options): +def get_zone(conn, options, instance): logging.debug("get_zone"); project = options['--project'] - instance = options['--plug'] fl = 'name="%s"' % instance request = replace_api_uri(options, conn.instances().aggregatedList(project=project, filter=fl)) while request is not None: @@ -226,7 +310,7 @@ def get_zone(conn, options): return inst['zone'].split("/")[-1] request = replace_api_uri(options, conn.instances().aggregatedList_next( previous_request=request, previous_response=response)) - raise Exception("Unable to find instance %s" % (instance)) + raise_fence_agent(options, "Unable to find instance %s" % (instance)) def get_metadata(metadata_key, params=None, timeout=None): @@ -325,13 +409,21 @@ def define_new_opts(): "required" : "0", "order" : 9 } + all_opt["plugzonemap"] = { + "getopt" : ":", + "longopt" : "plugzonemap", + "help" : "--plugzonemap=[plugzonemap] Comma separated zone map when fencing multiple plugs", + "shortdesc" : "Comma separated zone map when fencing multiple plugs.", + "required" : "0", + "order" : 10 + } all_opt["proxyhost"] = { "getopt" : ":", "longopt" : "proxyhost", "help" : "--proxyhost=[proxy_host] The proxy host to use, if one is needed to access the internet (Example: 10.122.0.33)", "shortdesc" : "If a proxy is used for internet access, the proxy host should be specified.", "required" : "0", - "order" : 10 + "order" : 11 } all_opt["proxyport"] = { "getopt" : ":", @@ -340,7 +432,49 @@ def define_new_opts(): "help" : "--proxyport=[proxy_port] The proxy port to use, if one is needed to access the internet (Example: 3127)", "shortdesc" : "If a proxy is used for internet access, the proxy port should be specified.", "required" : "0", - "order" : 11 + "order" : 12 + } + all_opt["earlyexit"] = { + "getopt" : "", + "longopt" : "earlyexit", + "help" : "--earlyexit Return early if reset is already in progress", + "shortdesc" : "If an existing reset operation is detected, the fence agent will return before the operation completes with a 0 return code.", + "required" : "0", + "order" : 13 + } + all_opt["warntimeout"] = { + "getopt" : ":", + "type" : "second", + "longopt" : "warntimeout", + "help" : "--warntimeout=[warn_timeout] Timeout seconds before logging a warning and returning a 0 status code", + "shortdesc" : "If the operation is not completed within the timeout, the cluster operations are allowed to continue.", + "required" : "0", + "order" : 14 + } + all_opt["errortimeout"] = { + "getopt" : ":", + "type" : "second", + "longopt" : "errortimeout", + "help" : "--errortimeout=[error_timeout] Timeout seconds before failing and returning a non-zero status code", + "shortdesc" : "If the operation is not completed within the timeout, cluster is notified of the operation failure.", + "required" : "0", + "order" : 15 + } + all_opt["runonwarn"] = { + "getopt" : ":", + "longopt" : "runonwarn", + "help" : "--runonwarn=[run_on_warn] If a timeout occurs and warning is generated, run the supplied command", + "shortdesc" : "If a timeout would occur while running the agent, then the supplied command is run.", + "required" : "0", + "order" : 16 + } + all_opt["runonfail"] = { + "getopt" : ":", + "longopt" : "runonfail", + "help" : "--runonfail=[run_on_fail] If a failure occurs, run the supplied command", + "shortdesc" : "If a failure would occur while running the agent, then the supplied command is run.", + "required" : "0", + "order" : 17 } @@ -349,7 +483,8 @@ def main(): device_opt = ["port", "no_password", "zone", "project", "stackdriver-logging", "method", "baremetalsolution", "apitimeout", "retries", "retrysleep", - "serviceaccount", "proxyhost", "proxyport"] + "serviceaccount", "plugzonemap", "proxyhost", "proxyport", "earlyexit", + "warntimeout", "errortimeout", "runonwarn", "runonfail"] atexit.register(atexit_handler) @@ -430,22 +565,39 @@ def main(): conn = googleapiclient.discovery.build( 'compute', 'v1', credentials=credentials, cache_discovery=False) except Exception as err: - fail_usage("Failed: Create GCE compute v1 connection: {}".format(str(err))) + fail_fence_agent(options, "Failed: Create GCE compute v1 connection: {}".format(str(err))) # Get project and zone if not options.get("--project"): try: options["--project"] = get_metadata('project/project-id') except Exception as err: - fail_usage("Failed retrieving GCE project. Please provide --project option: {}".format(str(err))) + fail_fence_agent(options, "Failed retrieving GCE project. Please provide --project option: {}".format(str(err))) if "--baremetalsolution" in options: options["--zone"] = "none" - if not options.get("--zone"): - try: - options["--zone"] = get_zone(conn, options) - except Exception as err: - fail_usage("Failed retrieving GCE zone. Please provide --zone option: {}".format(str(err))) + + # Populates zone automatically if missing from the command + zones = [] if not "--zone" in options else options["--zone"].split(",") + options["--plugzonemap"] = {} + if "--plug" in options: + for i, instance in enumerate(options["--plug"].split(",")): + if len(zones) == 1: + # If only one zone is specified, use it across all plugs + options["--plugzonemap"][instance] = zones[0] + continue + + if len(zones) - 1 >= i: + # If we have enough zones specified with the --zone flag use the zone at + # the same index as the plug + options["--plugzonemap"][instance] = zones[i] + continue + + try: + # In this case we do not have a zone specified so we attempt to detect it + options["--plugzonemap"][instance] = get_zone(conn, options, instance) + except Exception as err: + fail_fence_agent(options, "Failed retrieving GCE zone. Please provide --zone option: {}".format(str(err))) # Operate the fencing device result = fence_action(conn, options, set_power_status, get_power_status, get_nodes_list, power_cycle) diff --git a/tests/data/metadata/fence_gce.xml b/tests/data/metadata/fence_gce.xml index d2c597b3..8d2a2dfc 100644 --- a/tests/data/metadata/fence_gce.xml +++ b/tests/data/metadata/fence_gce.xml @@ -73,6 +73,11 @@ For instructions see: https://cloud.google.com/compute/docs/tutorials/python-gui <content type="string" /> <shortdesc lang="en">Service Account to use for authentication to the google cloud APIs.</shortdesc> </parameter> + <parameter name="plugzonemap" unique="0" required="0"> + <getopt mixed="--plugzonemap=[plugzonemap]" /> + <content type="string" /> + <shortdesc lang="en">Comma separated zone map when fencing multiple plugs.</shortdesc> + </parameter> <parameter name="proxyhost" unique="0" required="0"> <getopt mixed="--proxyhost=[proxy_host]" /> <content type="string" /> @@ -83,6 +88,31 @@ For instructions see: https://cloud.google.com/compute/docs/tutorials/python-gui <content type="integer" /> <shortdesc lang="en">If a proxy is used for internet access, the proxy port should be specified.</shortdesc> </parameter> + <parameter name="earlyexit" unique="0" required="0"> + <getopt mixed="--earlyexit" /> + <content type="boolean" /> + <shortdesc lang="en">If an existing reset operation is detected, the fence agent will return before the operation completes with a 0 return code.</shortdesc> + </parameter> + <parameter name="warntimeout" unique="0" required="0"> + <getopt mixed="--warntimeout=[warn_timeout]" /> + <content type="second" /> + <shortdesc lang="en">If the operation is not completed within the timeout, the cluster operations are allowed to continue.</shortdesc> + </parameter> + <parameter name="errortimeout" unique="0" required="0"> + <getopt mixed="--errortimeout=[error_timeout]" /> + <content type="second" /> + <shortdesc lang="en">If the operation is not completed within the timeout, cluster is notified of the operation failure.</shortdesc> + </parameter> + <parameter name="runonwarn" unique="0" required="0"> + <getopt mixed="--runonwarn=[run_on_warn]" /> + <content type="string" /> + <shortdesc lang="en">If a timeout would occur while running the agent, then the supplied command is run.</shortdesc> + </parameter> + <parameter name="runonfail" unique="0" required="0"> + <getopt mixed="--runonfail=[run_on_fail]" /> + <content type="string" /> + <shortdesc lang="en">If a failure would occur while running the agent, then the supplied command is run.</shortdesc> + </parameter> <parameter name="quiet" unique="0" required="0"> <getopt mixed="-q, --quiet" /> <content type="boolean" />
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor