Skip to content
This repository has been archived by the owner on Aug 30, 2022. It is now read-only.

Commit

Permalink
Merge pull request #60 from ARGOeu/devel
Browse files Browse the repository at this point in the history
Release  0.1.11
  • Loading branch information
kkoumantaros authored Dec 5, 2018
2 parents f7d7464 + 867ee68 commit f2d0b4f
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 10 deletions.
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,12 @@ $ ./ams-probe --token T0K3N --host messaging-devel.argo.grnet.gr --project EGI -
## ARGO Messaging Nagios publisher

Probe is inspecting AMS publisher running on Nagios monitoring instances. It's
inspecting trends of published messages for each spawned worker and raises alarm if
number of published messages of any worker is below expected threshold. It queries local
inspection socket that publisher exposes and reports back status with the help of NRPE
Nagios system.
inspecting trends of published results for each spawned worker and raises a
critical alarm if the number of published results of any worker is below the
expected threshold. Additionally, it raises a warning if the numbers are not
yet available, i.e. ams-publisher has just started and has not yet published
the expected number of results. It queries the local inspection socket that the
publisher exposes and reports back the status with the help of the NRPE Nagios system.

The usage is:

Expand Down
32 changes: 30 additions & 2 deletions modules/amspub_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@

import argparse
import socket
import re
import time
from nagios_plugins_argo.NagiosResponse import NagiosResponse

maxcmdlength = 128
timeout = 10


def parse_result(query):
try:
w, r = query.split('+')
Expand All @@ -20,6 +23,17 @@ def parse_result(query):
return (w, r)


def extract_intervals(queries):
    """Return the trailing integer of every query as a list of ints.

    Each query is split on '+'; the run of digits that ends the second
    part is converted to an int. The caller in this module treats these
    values as intervals in minutes.

    Raises IndexError when a query contains no '+', and AttributeError
    when its second part does not end with a digit.
    """
    return [
        int(re.search('[0-9]+$', query.split('+')[1]).group(0))
        for query in queries
    ]


def main():
parser = argparse.ArgumentParser()
parser.add_argument('-s', dest='socket', required=True, type=str, help='AMS inspection socket')
Expand Down Expand Up @@ -50,10 +64,22 @@ def main():
sock.send(' '.join(arguments.query), maxcmdlength)
data = sock.recv(maxcmdlength)

starttime = None
lr = list()
for r in data.split():
if r.startswith('t:'):
starttime = int(r.split(':')[1])
continue
lr.append(parse_result(r))

intervals = extract_intervals(arguments.query)
now = int(time.time())
if now - starttime < 60 * max(intervals):
nr.setCode(1)
nr.writeWarningMessage('No results yet, ams-publisher is not running for %d minutes' % max(intervals))
print nr.getMsg()
raise SystemExit(nr.getCode())

error = False
for e in lr:
if e[1] == 'error':
Expand All @@ -71,7 +97,8 @@ def main():
e = lr[i]
if e[1] < arguments.threshold[i]:
nr.setCode(2)
nr.writeCriticalMessage('Worker {0} published {1} (threshold {2})'.format(e[0], e[1], arguments.threshold[i]))
nr.writeCriticalMessage('Worker {0} published {1} (threshold {2} in {3} minutes)'.\
format(e[0], e[1], arguments.threshold[i], intervals[i]))
error = True
i+=1

Expand All @@ -83,7 +110,8 @@ def main():
nr.setCode(0)
while i < len(lr):
e = lr[i]
nr.writeOkMessage('Worker {0} published {1} (threshold {2})'.format(e[0], e[1], arguments.threshold[i]))
nr.writeOkMessage('Worker {0} published {1} (threshold {2} in {3} minutes)'.\
format(e[0], e[1], arguments.threshold[i], intervals[i]))
i+=1

print nr.getMsg()
Expand Down
48 changes: 45 additions & 3 deletions modules/poem.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,39 @@ def iosock_try():

return True

def check_metric_conf(profiles, arguments):
    """Count the metrics of the selected profile that have a configuration.

    Fetches the production-tagged metrics from the POEM metrics API on
    ``arguments.hostname`` (authenticating with ``arguments.cert`` and
    ``arguments.key``) and intersects their names with the metrics listed
    in the ``metric_instances`` of the profile named ``arguments.profile``.

    :param profiles: iterable of profile dicts, each with 'name' and
        'metric_instances' keys; every metric instance has a 'metric' key.
    :param arguments: parsed command line arguments providing ``hostname``,
        ``cert``, ``key`` and ``profile``.
    :returns: number of profile metrics that are also present in the
        metrics API response.
    :raises SystemExit: with code 2 (after printing a CRITICAL line) when
        the API cannot be reached or its response is not valid JSON.
    """
    # Build the URL once so that both error messages report the endpoint
    # that was actually queried.
    # NOTE(review): the double slash before 'poem' is kept from the original
    # code - confirm whether it is intentional.
    url = 'https://' + arguments.hostname + '//poem/api/0.2/json/metrics/?tag=production'

    # Load metrics from api...
    try:
        response = requests.get(url, cert=(arguments.cert, arguments.key), verify=True)
        metrics = response.json()
    except requests.exceptions.RequestException as e:
        print('CRITICAL - cannot connect to %s: %s' % (url, errmsg_from_excp(e)))
        raise SystemExit(2)
    except ValueError as e:
        # The undecodable payload came from the metrics endpoint, so that is
        # the URL to report.
        print('CRITICAL - %s - %s' % (url, errmsg_from_excp(e)))
        raise SystemExit(2)

    # Extract metrics in a certain profile...
    profile_metrics = set()
    for profile in profiles:
        if profile['name'] == arguments.profile:
            profile_metrics.update(
                instance['metric'] for instance in profile['metric_instances'])

    # Extract metric names; every element of the response is a dict keyed
    # by metric name.
    metrics_name = set()
    for entry in metrics:
        metrics_name.update(entry.keys())

    # Check configurations exist...
    return len(profile_metrics & metrics_name)


def main():
parser = argparse.ArgumentParser()
Expand Down Expand Up @@ -172,7 +205,7 @@ def main():

try:
for m in matched_profile['metrics']:
servicetypes.add(m['name'])
metrics.add(m['name'])
except KeyError:
print 'CRITICAL - cannot retrieve a value from %s' % MIP_API
raise SystemExit(2)
Expand All @@ -181,7 +214,7 @@ def main():
for profile in profilesjson:
if (profile['name'] == arguments.profile):
for metric in profile['metric_instances']:
metrics.add(metric['atp_service_type_flavour'])
servicetypes.add(metric['atp_service_type_flavour'])
break
except KeyError:
print 'CRITICAL - cannot retrieve a value from %s' % PR_API
Expand All @@ -194,7 +227,16 @@ def main():
print 'WARNING - Server certificate will expire in %i days' % (dte - dtn).days
raise SystemExit(1)

print 'OK - %s has %d distinct service types and %d distinct metrics' % (arguments.profile, len(servicetypes), len(metrics))
# Check configuration...
numconf = check_metric_conf(profilesjson, arguments)

# Check if number of configurations is equal to number of distinct metrics...
if numconf == len(metrics):
print 'OK - %s has %d distinct service types, %s configurations and %d distinct metrics' \
% (arguments.profile, len(servicetypes), numconf, len(metrics))
else:
print 'WARNING - %s has %d distinct service types and %d distinct metrics. %d metric configurations is missing' \
% (arguments.profile, len(servicetypes), len(metrics), len(metrics) - numconf)
raise SystemExit(0)

if __name__ == "__main__":
Expand Down
8 changes: 7 additions & 1 deletion nagios-plugins-argo.spec
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

Name: nagios-plugins-argo
Summary: ARGO components related probes.
Version: 0.1.10
Version: 0.1.11
Release: 1%{?dist}
License: ASL 2.0
Source0: %{name}-%{version}.tar.gz
Expand Down Expand Up @@ -44,6 +44,12 @@ rm -rf %{buildroot}


%changelog
* Thu Nov 8 2018 Daniel Vrcic <[email protected]>, Katarina Zailac <[email protected]>, Emir Imamagic <[email protected]>, Angelos Tsalapatis <[email protected]> - 0.1.11-1%{?dist}
- ARGO-1369 Check POEM metric configuration API
- ARGO-1271 Publisher probe should display queried time interval on result
- ARGO-1126 Probe for monitoring the compute engine - minor bug fix
- ARGO-1097 Probe for monitoring the compute engine
- ARGO-985 Refine last_update logic of check_nagios sensor
* Tue Mar 27 2018 Daniel Vrcic <[email protected]> - 0.1.9-1%{?dist}
- added argo-nagios-ams-publisher
* Mon Dec 4 2017 Daniel Vrcic <[email protected]> - 0.1.8-1%{?dist}
Expand Down

0 comments on commit f2d0b4f

Please sign in to comment.