Skip to content

Commit

Permalink
Merge pull request #4 from hooloobooroodkoo/main
Browse files Browse the repository at this point in the history
Adding ps-hosts-not-found alarm
  • Loading branch information
petya-vasileva authored Nov 25, 2024
2 parents 2bc3d1f + fe909df commit e7a4bb2
Show file tree
Hide file tree
Showing 2 changed files with 181 additions and 0 deletions.
174 changes: 174 additions & 0 deletions ps-hosts-not-found.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
# ps-hosts-not-found - This module checks whether the hosts expected by the mesh
# configuration appear in the Elasticsearch throughput/latency/trace data
# recorded the day before yesterday.
# It queries Elasticsearch for each test type (throughput/latency/trace)
# within that 24-hour time range and verifies that the hosts are present in the index.
# The script identifies hosts that are expected (listed in the mesh configuration)
# but not found in Elasticsearch.
#
# The process retrieves the hosts from the configuration, queries Elasticsearch
# for the relevant test data, and counts the number of hosts not found.
# In addition, the function can generate a plot comparing the number of hosts found
# in the configuration versus those found in Elasticsearch. This information helps
# maintain an accurate and up-to-date monitoring system by identifying discrepancies
# between the expected and actual data.
# TODO: Marian's API returns sites whose names do not exactly match those recorded in Elasticsearch. This directly affects the tags that users use.
# Author: Yana Holoborodko
# Copyright 2024
import helpers as hp
import warnings
import time
import datetime as dt
import hashlib
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import psconfig.api
import urllib3
from alarms import alarms

def query4Hosts(dateFrom, dateTo, testType):
    """
    Return the hosts that appear as source or destination in ``ps_<testType>``.

    Runs a composite aggregation over the (src_host, dest_host) pairs of all
    documents whose ``timestamp`` is in the interval (dateFrom, dateTo].

    Args:
        dateFrom: exclusive lower bound for ``timestamp``.
        dateTo: inclusive upper bound for ``timestamp``.
        testType: index suffix; the index queried is ``ps_<testType>``.

    Returns:
        A list of host names with two entries per unique pair (source first,
        then destination). Duplicates are NOT removed here.
    """
    page_size = 20000
    query = {
        "bool": {
            "must": [
                {
                    "range": {
                        "timestamp": {
                            "gt": dateFrom,
                            "lte": dateTo
                        }
                    }
                }
            ]
        }
    }
    composite = {
        "size": page_size,
        "sources": [
            {"src_host": {"terms": {"field": "src_host"}}},
            {"dest_host": {"terms": {"field": "dest_host"}}}
        ]
    }

    hosts = []
    while True:
        # size=0: only the aggregation buckets are needed, not the matching documents.
        response = hp.es.search(
            index=f'ps_{testType}',
            query=query,
            aggregations={"unique_host_pairs": {"composite": composite}},
            size=0
        )
        result = response['aggregations']['unique_host_pairs']
        buckets = result['buckets']
        for bucket in buckets:
            hosts.append(bucket['key']['src_host'])
            hosts.append(bucket['key']['dest_host'])

        # A composite aggregation returns at most `size` buckets per request;
        # a full page means more pairs may follow, so continue from `after_key`
        # instead of silently dropping them.
        after_key = result.get('after_key')
        if len(buckets) < page_size or after_key is None:
            break
        composite = {**composite, "after": after_key}
    return hosts

def queryData(dateFrom, dateTo, test):
    """
    Collect the distinct hosts reported for ``test`` between dateFrom and dateTo.

    The period is cut into consecutive sub-ranges and each one is queried
    separately through query4Hosts; the results are merged into one set.

    Returns:
        A set with every host name returned by the individual queries.
    """
    # Query in portions since ES does not allow aggregations with more than 10000 bins.
    # NOTE(review): presumably this yields roughly one portion per hour - confirm against helpers.
    portions = int(hp.CalcMinutes4Period(dateFrom, dateTo) / 60)
    boundaries = hp.GetTimeRanges(dateFrom, dateTo, portions)

    found_hosts = set()
    # Walk over neighbouring boundaries: (t0, t1), (t1, t2), ...
    for range_start, range_end in zip(boundaries, boundaries[1:]):
        found_hosts.update(query4Hosts(range_start, range_end, test))
    return found_hosts

def check_tests_for_host(host, mesh_config):
    """
    Classify the host into the three test groups (latency, trace and throughput).

    Args:
        host: host name passed to ``mesh_config.get_test_types``.
        mesh_config: object exposing ``get_test_types(host)``, which returns
            the collection of test type names configured for the host.

    Returns:
        A tuple ``(host, latency, trace, throughput)``. The three flags are
        booleans; all of them are False when the test types of the host
        cannot be retrieved.
    """
    try:
        types = mesh_config.get_test_types(host)
    except Exception:
        # Keep the same 4-field shape as the success path so callers can build
        # a rectangular table; a host that cannot be looked up is simply not
        # expected in any test group.
        return host, False, False, False
    latency = any(test in ('latency', 'latencybg') for test in types)
    trace = 'trace' in types
    throughput = any(test in ('throughput', 'rtt') for test in types)  # as rtt is now in ps_throughput
    return host, latency, trace, throughput

def create_hosts_tests_types_grid(hosts, mesh_config):
    """
    Create a dataframe listing all hosts and whether or not they are tested
    in each group (owd/latency, trace and throughput).

    Args:
        hosts: iterable of host names.
        mesh_config: configuration object forwarded to check_tests_for_host.

    Returns:
        A pandas DataFrame with the columns ``host``, ``owd``, ``trace`` and
        ``throughput``, one row per host. The frame is empty (but still has
        these four columns) when ``hosts`` is empty.
    """
    columns = ['host', 'owd', 'trace', 'throughput']
    rows = []
    for host in hosts:
        classification = check_tests_for_host(host, mesh_config)
        # Guard against a result that does not have one value per column
        # (e.g. a short tuple returned for a failed lookup): record the host
        # as not expected in any test group instead of producing NaN cells.
        if len(classification) != len(columns):
            classification = (host, False, False, False)
        rows.append(classification)
    return pd.DataFrame(rows, columns=columns)

def check_data_difference_in_es(data_from, data_to, test_type, expected_hosts):
    """
    Check whether all expected (mentioned in the configuration) hosts were
    found in Elasticsearch.

    Args:
        data_from: start of the period, forwarded to queryData.
        data_to: end of the period, forwarded to queryData.
        test_type: test type, forwarded to queryData.
        expected_hosts: set of host names expected to report data.

    Returns:
        A tuple ``(difference, percent)``: the set of expected hosts missing
        from Elasticsearch and their share of all expected hosts in percent,
        rounded to two decimals. ``(set(), 0.0)`` when nothing is expected.
    """
    if not expected_hosts:
        # Nothing is expected, so nothing can be missing; this also avoids
        # dividing by zero below and a pointless Elasticsearch query.
        return set(), 0.0
    data = queryData(data_from, data_to, test_type)
    difference = expected_hosts.difference(data)
    p = len(difference) / len(expected_hosts) * 100
    return difference, round(p, 2)

if __name__ == '__main__':
    # Script entry point: compare the hosts expected by the mesh configuration
    # with the hosts seen in Elasticsearch the day before yesterday and raise
    # one "hosts not found" alarm per site that has missing hosts.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # Period under inspection: the whole calendar day two days ago.
    target_day = dt.date.today() - dt.timedelta(days=2)
    timestamp_format = '%Y-%m-%dT%H:%M:%S.000Z'
    m_from = dt.datetime.combine(target_day, dt.time(0, 0)).strftime(timestamp_format)
    m_to = dt.datetime.combine(target_day, dt.time(23, 59, 59)).strftime(timestamp_format)

    mesh_url = "https://psconfig.aglt2.org/pub/config"
    mesh_config = psconfig.api.PSConfig(mesh_url)
    all_hosts = mesh_config.get_all_hosts()
    ips = list(all_hosts)
    test_types = ['owd', 'trace', 'throughput']
    expected_tests_types = create_hosts_tests_types_grid(ips, mesh_config)

    # site -> {test type -> set of hosts expected but not found}
    sites_mapping = {}
    # Per-test summary numbers; not sent anywhere, kept for debugging output.
    stats = {}
    for test in test_types:
        # `== True` (not plain truthiness) so that non-boolean cells never count as expected.
        expected_mask = expected_tests_types[test] == True  # noqa: E712
        expected_hosts_test = set(expected_tests_types[expected_mask]['host'].to_list())
        if expected_hosts_test:
            diff, percent = check_data_difference_in_es(m_from, m_to, test, expected_hosts_test)
        else:
            # No host is expected for this test type: nothing to query and
            # no percentage to compute (avoids a division by zero).
            diff, percent = set(), 0.0
        for host in diff:
            site = mesh_config.get_site(host)
            sites_mapping.setdefault(site, {}).setdefault(test, set()).add(host)
        stats[f'percent_{test}'] = percent
        stats[f'num_not_found_{test}'] = len(diff)
        stats[f'num_expected_{test}'] = len(expected_hosts_test)

    for s, missing_by_test in sites_mapping.items():
        alarmOnSite = alarms('Networking', 'Sites', "hosts not found")
        # Sorted lists instead of sets: deterministic content for the alarm id
        # and a JSON-friendly payload.
        # NOTE(review): assumes the alarm document is serialised as JSON - confirm in alarms.addAlarm.
        hosts_by_test = {test_name: sorted(hosts) for test_name, hosts in missing_by_test.items()}
        doc = {'from': m_from,
               'to': m_to,
               'site': s,
               'hosts': hosts_by_test}
        # The id depends only on the site, its missing hosts and the period.
        # The test-loop variable is deliberately not part of it: after the loop
        # above it always holds the last test type and carries no information.
        toHash = ','.join([s, str(hosts_by_test), m_from, m_to])
        doc['alarm_id'] = hashlib.sha224(toHash.encode('utf-8')).hexdigest()
        alarmOnSite.addAlarm(body='not found in the Elasticsearch', tags=[s], source=doc)
7 changes: 7 additions & 0 deletions ps-hosts-not-found.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Runs the ps-hosts-not-found alarm script and propagates its exit status.

# Print the start time of the run.
date

python ps-hosts-not-found.py
status=$?

# Report the failure and exit with the same status as the Python script.
if [[ $status -ne 0 ]]; then
    echo "problem running hosts not found. Exiting."
    exit "$status"
fi

0 comments on commit e7a4bb2

Please sign in to comment.