Skip to content

Commit

Permalink
Merge pull request #105 from aviate-labs/102-bugfix/nodes-down-event-…
Browse files Browse the repository at this point in the history
…gets-triggered-when-nodes-arent-down

fix: send node down message only when nodes are "DOWN" or "DEGRADED"
  • Loading branch information
mourginakis authored Nov 3, 2023
2 parents 6fccd74 + 44dc3a7 commit d1f79a7
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 14 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
<!-- ## [1.0.0-alpha.2] - Unreleased -->

- Added logging.
- Fixed bug where `UNASSIGNED` nodes were incorrectly reported as compromised.


## [1.0.0-alpha.1] - 2023-10-20
Expand Down
2 changes: 1 addition & 1 deletion node_monitor/node_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def broadcast_alerts(self) -> None:
channels = self.node_provider_db.get_channels_as_dict()
for node_provider_id, nodes in self.actionables.items():
preferences = subscribers[node_provider_id]
subject, message = messages.nodes_down_message(nodes, node_labels)
subject, message = messages.nodes_compromised_message(nodes, node_labels)
# - - - - - - - - - - - - - - - - -
if preferences['notify_email'] == True:
recipients = email_recipients[node_provider_id]
Expand Down
6 changes: 3 additions & 3 deletions node_monitor/node_monitor_helpers/get_compromised_nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ def get_compromised_nodes(snapshots: Deque[ic_api.Nodes]) -> List[ic_api.Node]:
# debounce: eliminate false positives
# sometimes the nodes go down for a few minutes and come back up
node_c_is_compromised: bool = all([
node_a.status == 'UP',
node_b.status != 'UP',
node_c.status != 'UP',
node_a.status == 'UP' or node_a.status == 'UNASSIGNED',
node_b.status == 'DOWN' or node_b.status == 'DEGRADED',
node_c.status == 'DOWN' or node_c.status == 'DEGRADED',
])
if node_c_is_compromised:
compromised_nodes.append(node_c)
Expand Down
15 changes: 8 additions & 7 deletions node_monitor/node_monitor_helpers/messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,25 +59,26 @@ def detailnodes(nodes: List[ic_api.Node],



def nodes_down_message(nodes: List[ic_api.Node],
def nodes_compromised_message(nodes: List[ic_api.Node],
labels: Dict[Principal, str]) -> Tuple[str, str]:
"""Returns a message that describes the nodes that are down, in the
"""Returns a message that describes the nodes that are compromised, in the
format of an email or message for a comparable communication channel.
"""
nodes_down = [node for node in nodes if node.status == 'DOWN']
nodes_compromised = [node for node in nodes
if node.status == 'DOWN' or node.status == 'DEGRADED']
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def _make_subject() -> str:
datacenters = {node.dc_id.upper() for node in nodes_down}
match len(nodes_down):
datacenters = {node.dc_id.upper() for node in nodes_compromised}
match len(nodes_compromised):
case 0: return "All Systems Healthy"
case _: return "Action Required @ " + ', '.join(sorted(datacenters))
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
formatted_nodes_down = detailnodes(nodes, labels)
formatted_nodes_compromised = detailnodes(nodes, labels)
subject = _make_subject()
message = (
f"🛑 Node(s) Compromised:\n"
f"\n"
f"{formatted_nodes_down}\n"
f"{formatted_nodes_compromised}\n"
f"\n"
f"Node Monitor by Aviate Labs\n"
f"Report Generated: {datetime_iso8601()} UTC\n"
Expand Down
6 changes: 3 additions & 3 deletions tests/test_bot_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,15 @@ def test_send_emails_network():

## Create the messages. We use unittest.mock.patch to remove the private URL.
with patch.object(c, 'FEEDBACK_FORM_URL', 'https://url-has-been-redacted.ninja'):
subject1, message1 = messages.nodes_down_message([fakenode], fakelabel)
subject1, message1 = messages.nodes_compromised_message([fakenode], fakelabel)
subject2, message2 = messages.nodes_status_message([fakenode], fakelabel)

## Append the time to the subject to act as an identifier for the test,
## making it easy to do a regex search to validate the email.
subject1 = str(f'{time.time()} - {subject1}')
subject2 = str(f'{time.time()} - {subject2}')

## Send both nodes_down_message and nodes_status_message as emails.
## Send both nodes_compromised_message and nodes_status_message as emails.
email_bot.send_emails(recipients, subject1, message1)
email_bot.send_emails(recipients, subject2, message2)

Expand All @@ -87,6 +87,6 @@ def test_send_emails_network():
url = 'https://mailnesia.com/mailbox/nodemonitortest'
response = requests.get(url)
assert response.status_code == 200, "Mailnesia website did not respond."
assert re.search(subject1, response.text), "The nodes_down_message email was not received."
assert re.search(subject1, response.text), "The nodes_compromised_message email was not received."
assert re.search(subject2, response.text), "The nodes_status_message email was not received."
print('Email received!')

0 comments on commit d1f79a7

Please sign in to comment.