From 710c33d3a8a5050ce5ef970317245b9476b5c858 Mon Sep 17 00:00:00 2001 From: Daniel Vrcic Date: Mon, 4 Dec 2017 13:28:59 +0100 Subject: [PATCH 01/17] CRITICAL if could not access state files --- modules/connectors.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/modules/connectors.py b/modules/connectors.py index 665cfc5..72287bd 100755 --- a/modules/connectors.py +++ b/modules/connectors.py @@ -15,7 +15,7 @@ weights_state = 'weights-ok' def check_file_ok(fname): - if os.path.isfile(fname): + if os.stat(fname) and os.path.isfile(fname): fh = open(fname, 'r') if fh.read().strip() == 'True': return True @@ -87,7 +87,20 @@ def main(): date_sufix.append(day.strftime("%Y_%m_%d")) nagios = NagiosResponse("All connectors are working fine.") - process_customer(cmd_options, root_directory, date_sufix, nagios) + try: + process_customer(cmd_options, root_directory, date_sufix, nagios) + + except OSError as e: + nagios.setCode(nagios.CRITICAL) + nagios.writeCriticalMessage('{0} {1}'.format(repr(e), e.filename)) + print nagios.getMsg() + raise SystemExit(nagios.getCode()) + + except Exception as e: + nagios.setCode(nagios.CRITICAL) + nagios.writeCriticalMessage(repr(e)) + print nagios.getMsg() + raise SystemExit(nagios.getCode()) print(nagios.getMsg()) raise SystemExit(nagios.getCode()) From 5e0fb2039bfe5001487400a032186400420c2fc3 Mon Sep 17 00:00:00 2001 From: Daniel Vrcic Date: Mon, 4 Dec 2017 13:38:01 +0100 Subject: [PATCH 02/17] first inspect if filename attr exists --- modules/connectors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/connectors.py b/modules/connectors.py index 72287bd..a3bcdd8 100755 --- a/modules/connectors.py +++ b/modules/connectors.py @@ -92,7 +92,10 @@ def main(): except OSError as e: nagios.setCode(nagios.CRITICAL) - nagios.writeCriticalMessage('{0} {1}'.format(repr(e), e.filename)) + if getattr(e, 'filename', False): + nagios.writeCriticalMessage('{0} {1}'.format(repr(e), e.filename)) + else: + nagios.writeCriticalMessage(repr(e)) print nagios.getMsg() raise SystemExit(nagios.getCode()) From 50971f5bb3ededdd1d147cd1c97cd079ee4a4372 Mon Sep 17 00:00:00 2001 From: Daniel Vrcic Date: Fri, 16 Mar 2018 15:56:14 +0100 Subject: [PATCH 03/17] inital AMS publisher probe --- modules/amspub_check.py | 24 ++++++++++++++++++++++++ src/ams-publisher-probe | 5 +++++ 2 files changed, 29 insertions(+) create mode 100755 modules/amspub_check.py create mode 100755 src/ams-publisher-probe diff --git a/modules/amspub_check.py b/modules/amspub_check.py new file mode 100755 index 0000000..2f96e0c --- /dev/null +++ b/modules/amspub_check.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +import argparse +import sys, socket, select + +maxcmdlength = 128 + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-s', dest='socket', required=True, type=str, help='AMS inspection socket') + parser.add_argument('-q', dest='query', action='append', required=True, type=str, help='Query') + arguments = parser.parse_args() + + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock.connect(arguments.socket) + sock.send(' '.join(arguments.query), maxcmdlength) + data = sock.recv(maxcmdlength) + print data + sock.close() + + +if __name__ == "__main__": + main() diff --git a/src/ams-publisher-probe b/src/ams-publisher-probe new file mode 100755 index 0000000..cbfa046 --- /dev/null +++ b/src/ams-publisher-probe @@ -0,0 +1,5 @@ +#!/usr/bin/env python + +from nagios_plugins_argo import amspub_check + +amspub_check.main() From 870cb237deb6c0947f534b616daa09d46469974c Mon Sep 17 00:00:00 2001 From: Daniel Vrcic Date: Mon, 19 Mar 2018 13:24:47 +0100 Subject: [PATCH 04/17] handle non existent socket --- modules/NagiosResponse.py | 2 +- modules/amspub_check.py | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/modules/NagiosResponse.py b/modules/NagiosResponse.py index 104ebe2..5e71f3e 100644 --- a/modules/NagiosResponse.py +++ b/modules/NagiosResponse.py @@ -1,4 +1,4 @@ -class NagiosResponse: +class NagiosResponse(object): _msgBagWarning = [] _msgBagCritical = [] _okMsg = "" diff --git a/modules/amspub_check.py b/modules/amspub_check.py index 2f96e0c..603647c 100755 --- a/modules/amspub_check.py +++ b/modules/amspub_check.py @@ -2,6 +2,7 @@ import argparse import sys, socket, select +from nagios_plugins_argo.NagiosResponse import NagiosResponse maxcmdlength = 128 @@ -12,13 +13,20 @@ def main(): parser.add_argument('-q', dest='query', action='append', required=True, type=str, help='Query') arguments = parser.parse_args() - sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - sock.connect(arguments.socket) - sock.send(' '.join(arguments.query), maxcmdlength) - data = sock.recv(maxcmdlength) - print data - sock.close() + nr = NagiosResponse() + try: + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock.connect(arguments.socket) + sock.send(' '.join(arguments.query), maxcmdlength) + data = sock.recv(maxcmdlength) + print data + sock.close() + except socket.error as e: + nr.setCode(2) + nr.writeCriticalMessage(str(e)) + print nr.getMsg() + raise SystemExit(nr.getCode()) if __name__ == "__main__": main() From 56044c96e2bbdf03b54d799a8252967c8836de04 Mon Sep 17 00:00:00 2001 From: Daniel Vrcic Date: Mon, 19 Mar 2018 13:32:48 +0100 Subject: [PATCH 05/17] handle response timeout --- modules/amspub_check.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/modules/amspub_check.py b/modules/amspub_check.py index 603647c..d67193b 100755 --- a/modules/amspub_check.py +++ b/modules/amspub_check.py @@ -5,7 +5,7 @@ from nagios_plugins_argo.NagiosResponse import NagiosResponse maxcmdlength = 128 - +timeout = 10 def main(): parser = argparse.ArgumentParser() @@ -17,14 +17,23 @@ def main(): try: sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock.setblocking(0) + sock.settimeout(timeout) sock.connect(arguments.socket) sock.send(' '.join(arguments.query), maxcmdlength) data = sock.recv(maxcmdlength) print data sock.close() + + except socket.timeout as e: + nr.setCode(2) + nr.writeCriticalMessage('Socket response timeout after {0}s'.format(timeout)) + print nr.getMsg() + raise SystemExit(nr.getCode()) + except socket.error as e: nr.setCode(2) - nr.writeCriticalMessage(str(e)) + nr.writeCriticalMessage('Socket error: {0}'.format(str(e))) print nr.getMsg() raise SystemExit(nr.getCode()) From abd3f81ad12ea575601b4f47dbf2cca6401f3720 Mon Sep 17 00:00:00 2001 From: Daniel Vrcic Date: Mon, 19 Mar 2018 13:39:56 +0100 Subject: [PATCH 06/17] always close sock connection --- modules/amspub_check.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/amspub_check.py b/modules/amspub_check.py index d67193b..4fc9f2f 100755 --- a/modules/amspub_check.py +++ b/modules/amspub_check.py @@ -7,6 +7,7 @@ maxcmdlength = 128 timeout = 10 + def main(): parser = argparse.ArgumentParser() parser.add_argument('-s', dest='socket', required=True, type=str, help='AMS inspection socket') @@ -19,11 +20,12 @@ def main(): sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) sock.setblocking(0) sock.settimeout(timeout) + sock.connect(arguments.socket) sock.send(' '.join(arguments.query), maxcmdlength) data = sock.recv(maxcmdlength) + print data - sock.close() except socket.timeout as e: nr.setCode(2) @@ -37,5 +39,8 @@ def main(): print nr.getMsg() raise SystemExit(nr.getCode()) + finally: + sock.close() + if __name__ == "__main__": main() From 5b54dee83ab96c3221e50561eb871fb68091db37 Mon Sep 17 00:00:00 2001 From: Daniel Vrcic Date: Mon, 19 Mar 2018 15:19:50 +0100 Subject: [PATCH 07/17] incorrectly specified arguments --- modules/amspub_check.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/amspub_check.py b/modules/amspub_check.py index 4fc9f2f..dcbdfda 100755 --- a/modules/amspub_check.py +++ b/modules/amspub_check.py @@ -1,7 +1,7 @@ #!/usr/bin/env python import argparse -import sys, socket, select +import socket from nagios_plugins_argo.NagiosResponse import NagiosResponse maxcmdlength = 128 @@ -12,10 +12,17 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument('-s', dest='socket', required=True, type=str, help='AMS inspection socket') parser.add_argument('-q', dest='query', action='append', required=True, type=str, help='Query') + parser.add_argument('-t', dest='threshold', action='append', required=True, type=str, help='Threshold') arguments = parser.parse_args() nr = NagiosResponse() + if len(arguments.threshold) != len(arguments.query): + nr.setCode(2) + nr.writeCriticalMessage('Wrong arguments') + print nr.getMsg() + raise SystemExit(nr.getCode()) + try: sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) sock.setblocking(0) From 848d4c4b0cea16f0edbd431558e40e69d4604eeb Mon Sep 17 00:00:00 2001 From: Daniel Vrcic Date: Mon, 19 Mar 2018 16:44:00 +0100 Subject: [PATCH 08/17] parse result and CRITICAL on wrong worker name --- modules/amspub_check.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/modules/amspub_check.py b/modules/amspub_check.py index dcbdfda..563548c 100755 --- a/modules/amspub_check.py +++ b/modules/amspub_check.py @@ -7,6 +7,18 @@ maxcmdlength = 128 timeout = 10 +def parse_result(query): + try: + w, r = query.split('+') + + w = w.split(':')[1] + r = int(r.split(':')[1]) + + except (ValueError, KeyError): + return (w, 'error') + + return (w, r) + def main(): parser = argparse.ArgumentParser() @@ -32,7 +44,19 @@ def main(): sock.send(' '.join(arguments.query), maxcmdlength) data = sock.recv(maxcmdlength) - print data + lr = list() + for r in data.split(): + lr.append(parse_result(r)) + + error = False + for e in lr: + if e[1] == 'error': + nr.setCode(2) + nr.writeCriticalMessage('Worker {0} {1}'.format(e[0], e[1])) + error = True + if error: + print nr.getMsg() + raise SystemExit(nr.getCode()) except socket.timeout as e: nr.setCode(2) From e18aecee84f30f0fad1103f1c5541217cdcae5fe Mon Sep 17 00:00:00 2001 From: Daniel Vrcic Date: Mon, 19 Mar 2018 17:30:33 +0100 Subject: [PATCH 09/17] ams-publisher CRITICAL and OK threshold inspect --- modules/amspub_check.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/modules/amspub_check.py b/modules/amspub_check.py index 563548c..4f7f4c8 100755 --- a/modules/amspub_check.py +++ b/modules/amspub_check.py @@ -24,7 +24,7 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument('-s', dest='socket', required=True, type=str, help='AMS inspection socket') parser.add_argument('-q', dest='query', action='append', required=True, type=str, help='Query') - parser.add_argument('-t', dest='threshold', action='append', required=True, type=str, help='Threshold') + parser.add_argument('-c', dest='threshold', action='append', required=True, type=int, help='Threshold') arguments = parser.parse_args() nr = NagiosResponse() @@ -58,6 +58,32 @@ def main(): print nr.getMsg() raise SystemExit(nr.getCode()) + error = False + nr.setCode(0) + i = 0 + while i < len(lr): + e = lr[i] + if e[1] < arguments.threshold[i]: + nr.setCode(2) + nr.writeCriticalMessage('Worker {0} published {1} (threshold {2})'.format(e[0], e[1], arguments.threshold[i])) + error = True + i+=1 + + if error: + print nr.getMsg() + raise SystemExit(nr.getCode()) + else: + i = 0 + nr.setCode(0) + while i < len(lr): + e = lr[i] + nr.writeOkMessage('Worker {0} published {1} (threshold {2})'.format(e[0], e[1], arguments.threshold[i])) + i+=1 + + print nr.getMsg() + raise SystemExit(nr.getCode()) + + except socket.timeout as e: nr.setCode(2) nr.writeCriticalMessage('Socket response timeout after {0}s'.format(timeout)) From c5ca038c641a56ac1df475a4fb3174faf3483258 Mon Sep 17 00:00:00 2001 From: Daniel Vrcic Date: Mon, 19 Mar 2018 17:40:09 +0100 Subject: [PATCH 10/17] timeout argument --- modules/amspub_check.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/modules/amspub_check.py b/modules/amspub_check.py index 4f7f4c8..0a4e628 100755 --- a/modules/amspub_check.py +++ b/modules/amspub_check.py @@ -25,6 +25,7 @@ def main(): parser.add_argument('-s', dest='socket', required=True, type=str, help='AMS inspection socket') parser.add_argument('-q', dest='query', action='append', required=True, type=str, help='Query') parser.add_argument('-c', dest='threshold', action='append', required=True, type=int, help='Threshold') + parser.add_argument('-t', dest='timeout', required=False, type=int, help='Timeout') arguments = parser.parse_args() nr = NagiosResponse() @@ -35,10 +36,15 @@ def main(): print nr.getMsg() raise SystemExit(nr.getCode()) + if arguments.timeout: + timeo = arguments.timeout + else: + timeo = timeout + try: sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) sock.setblocking(0) - sock.settimeout(timeout) + sock.settimeout(timeo) sock.connect(arguments.socket) sock.send(' '.join(arguments.query), maxcmdlength) @@ -86,7 +92,7 @@ def main(): except socket.timeout as e: nr.setCode(2) - nr.writeCriticalMessage('Socket response timeout after {0}s'.format(timeout)) + nr.writeCriticalMessage('Socket response timeout after {0}s'.format(timeo)) print nr.getMsg() raise SystemExit(nr.getCode()) From a9882cfe8e667ccb1d295f5b4f52e0788569d232 Mon Sep 17 00:00:00 2001 From: Daniel Vrcic Date: Mon, 19 Mar 2018 17:49:27 +0100 Subject: [PATCH 11/17] list of OK messages joined to create a status msg --- modules/NagiosResponse.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/NagiosResponse.py b/modules/NagiosResponse.py index 5e71f3e..a4f44db 100644 --- a/modules/NagiosResponse.py +++ b/modules/NagiosResponse.py @@ -1,6 +1,7 @@ class NagiosResponse(object): _msgBagWarning = [] _msgBagCritical = [] + _msgBagOk = [] _okMsg = "" _code = None @@ -13,10 +14,12 @@ def __init__(self, ok_msg=""): self._code = self.OK self._okMsg = ok_msg - def writeWarningMessage(self, msg): self._msgBagWarning.append(msg) + def writeOkMessage(self, msg): + self._msgBagOk.append(msg) + def writeCriticalMessage(self, msg): self._msgBagCritical.append(msg) @@ -32,7 +35,8 @@ def getMsg(self): elif self._code == self.CRITICAL: return "CRITICAL - " + self._toString(self._msgBagCritical) elif self._code == self.OK: - return "OK - " + self._okMsg if self._okMsg else "OK" + msg = self._okMsg if self._okMsg else self._toString(self._msgBagOk) + return "OK - " + msg else: return "UNKNOWN!" From eccbfbf22f356c553edbd9acfb5429c166c662b4 Mon Sep 17 00:00:00 2001 From: Daniel Vrcic Date: Tue, 20 Mar 2018 13:54:51 +0100 Subject: [PATCH 12/17] info about ams-publisher-probe --- README.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/README.md b/README.md index 81a00d3..7544a64 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ Currently, there are probes for: - ARGO EGI Connectors - ARGO Messaging service +- ARGO Messaging Nagios publisher - ARGO Web API - POEM service @@ -43,6 +44,36 @@ where: $ ./ams-probe --token T0K3N --host messaging-devel.argo.grnet.gr --project EGI --topic probetest --subscription probetestsub --timeout 30 ``` +## ARGO Messaging Nagios publisher + +Probe is inspecting AMS publisher running on Nagios monitoring instances. It's +inspecting trends of published messages for each spawned worker and raises alarm if +number of published messages of any worker is below expected threshold. It queries local +inspection socket that publisher exposes and reports back status with the help of NRPE +Nagios system. + +The usage is: + +```sh +usage: amspub_check.py [-h] -s SOCKET -q QUERY -c THRESHOLD [-t TIMEOUT] +``` + +where: +- (-s): local path of publisher inspection socket +- (-q): simple query that can be specified multiple times consisted of worker name and identifier of published or consumed + messages in specified minute interval, e.g. `w:metrics+g:published15` + - `metrics` is name of worker that will be inspected + - `published15` is identifier designating that caller is interested in number of + published messages in last 15 minutes +- (-c): threshold corresponding to each query +- (-t): optional timeout after which probe will no longer wait for answer from socket + +### Usage example + +```sh +./ams-publisher-probe -s /var/run/argo-nagios-ams-publisher/sock -q 'w:metrics+g:published180' -c 50000 -q 'w:alarms+g:published180' -c 1 +``` + ## ARGO Web API This is a probe for checking AR and status reports are properly working. From ec604ac558f976e398ca04613cfe91428ccb3bf0 Mon Sep 17 00:00:00 2001 From: Daniel Vrcic Date: Tue, 27 Mar 2018 15:52:31 +0200 Subject: [PATCH 13/17] spec update for release --- nagios-plugins-argo.spec | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nagios-plugins-argo.spec b/nagios-plugins-argo.spec index 7d04727..125a7bb 100644 --- a/nagios-plugins-argo.spec +++ b/nagios-plugins-argo.spec @@ -4,7 +4,7 @@ Name: nagios-plugins-argo Summary: ARGO components related probes. -Version: 0.1.8 +Version: 0.1.9 Release: 1%{?dist} License: ASL 2.0 Source0: %{name}-%{version}.tar.gz @@ -44,6 +44,8 @@ rm -rf %{buildroot} %changelog +* Tue Mar 27 2018 Daniel Vrcic - 0.1.9-1%{?dist} +- added argo-nagios-ams-publisher * Mon Dec 4 2017 Daniel Vrcic - 0.1.8-1%{?dist} - connectors-probe warning logic revised - connectors-probe updated global.conf parsing From 06a34b0aa769b3d2a1257acfaf4321e8e2b79f31 Mon Sep 17 00:00:00 2001 From: agelostsal Date: Fri, 30 Mar 2018 16:37:01 +0300 Subject: [PATCH 14/17] ARGO-1097 Probe for monitoring the compute engine --- README.md | 28 +++++++++++++++ modules/ce_check.py | 87 +++++++++++++++++++++++++++++++++++++++++++++ src/ce_check | 6 ++++ 3 files changed, 121 insertions(+) create mode 100755 modules/ce_check.py create mode 100755 src/ce_check diff --git a/README.md b/README.md index 7544a64..cc174b7 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Currently, there are probes for: - ARGO Messaging Nagios publisher - ARGO Web API - POEM service +- Compute Engine dataflow ## ARGO Messaging service @@ -101,3 +102,30 @@ where: ```sh $ ./web-api -H web-api.test.com --tenant tenantname --rtype ar --token 12321312313123 --unused-reports Report1 Report2 --day 1 -t 180 -v ``` + +## Compute Engine dataflow + +This is a probe for checking the compute engine's dataflow, making sure that all components work as intented. +The checking involves the probe publishing a message to AMS, and expecting after some time, to find the same message produced by the system.If the message is identical, and has been delivered in reasonable time, then everything is ok, otherwise, we examine the result, to figure out, what went wrong with the system. + +Usage of the script: +```sh +$ ce_check.py [-h] [-H HOSTNAME] [--project Project] [--token TOKEN] + [--push_topic Push Topic] [--pull_subscription Pull Subscription] [-t TIMEOUT] + +``` + - (-H): the hostname of the AMS endpoint. + - (--project): the project that holds the topics and subscriptions. + - (--token): the authorization token. + - (--push_topic): the name of the topic, where the probe should publish its data. + - (--pull_subscription): the name of the subscription, where the probe will check for system's response. + - (--push_subscription): the name of the subscription, where the System will read from. + - (-t): A timeout option(seconds) for AMS library requests. + - (-i): a timewindow(seconds) between publishing and retrieving the message that is expected and considered 'healthy' for the system. + +### Usage example + + ```sh + $ ce_check -H ams-endpoint.gr --project TEST_PR --token test_token --push_topic test_topic --pull_subscription test_sub --push_subscription test_sub_2 -t 180 -i 500 + + ``` diff --git a/modules/ce_check.py b/modules/ce_check.py new file mode 100755 index 0000000..e5a5f73 --- /dev/null +++ b/modules/ce_check.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python + +from argparse import ArgumentParser +import datetime +import time +import json +from argo_ams_library import ArgoMessagingService, AmsException +from NagiosResponse import NagiosResponse + + +def main(): + TIMEOUT = 180 + INTERVAL = 300 + + parser = ArgumentParser(description="Nagios probe for monitoring the compute engine's flow.") + parser.add_argument('-H', dest='host', type=str, default='msg-devel.argo.grnet.gr', help='FQDN of AMS Service') + parser.add_argument('--token', type=str, required=True, help='Given token') + parser.add_argument('--project', type=str, required=True, help='Project registered in AMS Service') + parser.add_argument('--push_topic', type=str, default='create_data', help='Given topic') + parser.add_argument('--push_subscription', type=str, default='create_data_sub', help='Push_Subscription name') + parser.add_argument('--pull_subscription', type=str, default='retrieve_data_sub', help='Push_Subscription name') + parser.add_argument('-t', dest='timeout', type=int, default=TIMEOUT, help='Timeout for ams calls') + parser.add_argument('-i', dest='interval', type=int, default=INTERVAL, help='The amount of time the probe should try to read from ams, beforing exiting') + + cmd_options = parser.parse_args() + + run_timestamp = str(datetime.datetime.now()) + + nagios = NagiosResponse("System Dataflow at " + run_timestamp + " completed successfully.") + ams = ArgoMessagingService(endpoint=cmd_options.host, token=cmd_options.token, project=cmd_options.project) + try: + # For both subscriptions move their offset to max + move_sub_offset_to_max(ams, cmd_options.push_subscription, timeout=cmd_options.timeout) + move_sub_offset_to_max(ams, cmd_options.pull_subscription, timeout=cmd_options.timeout) + + # publish a message with the current timestamp as its content + req_data = {'message': run_timestamp, 'errors': []} + d1 = {'data': json.dumps(req_data), 'attributes': {}} + ams.publish(cmd_options.push_topic, d1, timeout=cmd_options.timeout) + start = time.time() + no_resp = True + while no_resp: + end = time.time() + # check if the systsem has written to the retrieve topic + resp = ams.pull_sub(cmd_options.pull_subscription, timeout=cmd_options.timeout) + if len(resp) > 0: + no_resp = False + resp_data = json.loads(resp[0][1]._data) + # check if the submitted and retrieved data differ + if req_data != resp_data: + nagios_report(nagios, 'critical', "System Dataflow at " + run_timestamp + " completed with errors. Expected: " + str(req_data) + ". Found: " + str(resp_data)+".") + # check if data was retrieved within the expected timeout period, BUT had some kind of delay + elif req_data == resp_data and end-start > cmd_options.interval: + nagios_report(nagios, 'warning', "System Dataflow at " + run_timestamp + " completed successfully using an extra time of: " + str((end-start)-cmd_options.interval) + "s.") + + if (end-start) > 2 * cmd_options.interval: + nagios_report(nagios, 'critical', "System Dataflow at " + run_timestamp + " returned with no message from the systsem after " + str(2 * cmd_options.interval) + "s.") + + # check for a response every 10 seconds + time.sleep(10) + + print(nagios.getMsg()) + raise SystemExit(nagios.getCode()) + + except AmsException as e: + nagios_report(nagios, 'critical', e.msg()) + + +def nagios_report(nagios, status, msg): + nagios_method = getattr(nagios, "write{0}Message".format(status.capitalize())) + nagios_method(msg) + nagios_status = getattr(nagios, status.upper()) + nagios.setCode(nagios_status) + if status == 'critical': + print(nagios.getMsg()) + raise SystemExit(nagios.getCode()) + + +def move_sub_offset_to_max(ams, sub, **reqkwargs): + # Retrieve the max offset for the given subscription + max_sub_offset = ams.getoffsets_sub(sub, "max", **reqkwargs) + # Move the current offset to the max position + ams.modifyoffset_sub(sub, max_sub_offset, **reqkwargs) + + +if __name__ == "__main__": + main() diff --git a/src/ce_check b/src/ce_check new file mode 100755 index 0000000..7b01cc3 --- /dev/null +++ b/src/ce_check @@ -0,0 +1,6 @@ +#!/usr/bin/env python + +from nagios_plugins_argo import ce_check + +ce_check.main() + From 8741bbe0c84db922904e384a90d297f6b73bd1f7 Mon Sep 17 00:00:00 2001 From: agelostsal Date: Mon, 23 Apr 2018 13:19:08 +0300 Subject: [PATCH 15/17] ARGO 1126 Probe for monitoring the compute engine-minor bug fix --- modules/ce_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ce_check.py b/modules/ce_check.py index e5a5f73..dc5a66e 100755 --- a/modules/ce_check.py +++ b/modules/ce_check.py @@ -63,7 +63,7 @@ def main(): raise SystemExit(nagios.getCode()) except AmsException as e: - nagios_report(nagios, 'critical', e.msg()) + nagios_report(nagios, 'critical', e.msg) def nagios_report(nagios, status, msg): From 674777aca6f4cd6b71327fa3a4369360a49fbbf6 Mon Sep 17 00:00:00 2001 From: eimamagi Date: Tue, 12 Jun 2018 07:36:09 +0200 Subject: [PATCH 16/17] ARGO-985 Refine last_update logic of check_nagios sensor --- src/check_nagios | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/check_nagios b/src/check_nagios index bd8f2a6..d964ef0 100755 --- a/src/check_nagios +++ b/src/check_nagios @@ -211,7 +211,7 @@ foreach my $service (@services) { $state = CRITICAL; last; } else { - my $diff = ($service->{last_update} - $service->{last_check})/1000; + my $diff = time() - $service->{last_check}/1000; if ($diff > $plugin->opts->age) { $answer .= "Service $host/".$service->{description}." on Nagios $nagios is older than ".$plugin->opts->age." seconds, last check was $diff seconds ago."; $state = CRITICAL; From 1ce25efceabf27af6da7f8d4e78deffc96685f6e Mon Sep 17 00:00:00 2001 From: eimamagi Date: Tue, 19 Jun 2018 08:39:08 +0200 Subject: [PATCH 17/17] Version bump. --- nagios-plugins-argo.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nagios-plugins-argo.spec b/nagios-plugins-argo.spec index 125a7bb..1a20f35 100644 --- a/nagios-plugins-argo.spec +++ b/nagios-plugins-argo.spec @@ -4,7 +4,7 @@ Name: nagios-plugins-argo Summary: ARGO components related probes. -Version: 0.1.9 +Version: 0.1.10 Release: 1%{?dist} License: ASL 2.0 Source0: %{name}-%{version}.tar.gz