Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding PagerDuty Native support #76

Merged
merged 5 commits into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ venv
*.egg-info
*.snap
**/__pycache__
*.rock
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.9.0] - 2024-05-30

- Added PagerDuty native support (#76).


## [0.8.0] - 2024-03-07

- Fixes container silently running by exiting with non-zero status when configuration file is missing (#70).
Expand Down
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ cp config-defaults.yaml cos-alerter.yaml
docker run -p 8080:8080 --rm --mount type=bind,source="$(pwd)"/cos-alerter.yaml,target=/etc/cos-alerter.yaml,readonly -it cos-alerter:0.2.0
```


## Run Tests

* `pip install tox`
Expand Down
84 changes: 80 additions & 4 deletions cos_alerter/alerter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
import time
import typing
from pathlib import Path
from typing import Dict, List, Optional

import apprise
import durationpy
import xdg_base_dirs
from pdpyras import EventsAPISession
from ruamel.yaml import YAML
from ruamel.yaml.constructor import DuplicateKeyError

Expand Down Expand Up @@ -211,6 +213,9 @@ def clients():

def reset_alert_timeout(self):
"""Set the "last alert time" to right now."""
# In case an instance was down, resolve the PagerDuty incident before resetting the last alert time
if self.is_down():
self.resolve_existing_alerts()
logger.debug("Resetting alert timeout for %s.", self.clientid)
self.data["alert_time"] = time.monotonic()

Expand Down Expand Up @@ -274,10 +279,26 @@ def notify(self):
# Sending notifications can be a long operation so handle that in a separate thread.
# This avoids interfering with the execution of the main loop.
notify_thread = threading.Thread(
target=send_notifications, kwargs={"title": title, "body": body}
target=send_all_notifications,
kwargs={
"title": title,
"body": body,
"destinations": split_destinations(config["notify"]["destinations"]),
"incident_type": "trigger",
"dedup_key": f"{self.clientid}-{self.last_alert_datetime()}",
},
)
notify_thread.start()

def resolve_existing_alerts(self):
    """Resolve the PagerDuty incident currently open for this client.

    Only the destinations categorized as "pagerduty" by ``split_destinations``
    are contacted; the standard destinations are not notified.
    """
    pagerduty_destinations = split_destinations(config["notify"]["destinations"])["pagerduty"]
    # NOTE(review): this key is built with the same expression notify() uses when
    # triggering. The resolve only matches the open incident if
    # last_alert_datetime() returns the same value on both calls -- confirm.
    incident_key = f"{self.clientid}-{self.last_alert_datetime()}"
    handle_pagerduty_incidents(
        incident_type="resolve",
        dedup_key=incident_key,
        destinations=pagerduty_destinations,
    )


def now_datetime():
"""Return the current datetime using the monotonic clock."""
Expand All @@ -290,20 +311,75 @@ def up_time():
return time.monotonic() - state["start_time"]


def send_notifications(title: str, body: str):
def split_destinations(destinations: List[str]) -> Dict[str, List[str]]:
    """Split destinations into categorized lists.

    Args:
        destinations: Notification URLs as found in the configuration.

    Returns:
        A dict with two keys: "pagerduty" holds every URL using the
        ``pagerduty://`` scheme, "standard" holds everything else. The input
        order is preserved inside each list.
    """
    categorized_destinations: Dict[str, List[str]] = {"standard": [], "pagerduty": []}

    for source in destinations:
        # Match the complete scheme, delimiter included. A bare prefix test would
        # also capture unrelated schemes that merely begin with "pagerduty", or
        # strings without "://", which the PagerDuty handler cannot parse.
        category = "pagerduty" if source.startswith("pagerduty://") else "standard"
        categorized_destinations[category].append(source)

    return categorized_destinations


def send_all_notifications(
    title: str, body: str, destinations: Dict[str, List[str]], incident_type: str, dedup_key: str
):
    """Deliver one alert to the standard receivers, then to PagerDuty.

    Args:
        title: Title passed to the standard notifications.
        body: Body of the standard notifications; also used as the PagerDuty
            incident summary.
        destinations: Categorized destinations with the keys "standard" and
            "pagerduty".
        incident_type: Forwarded to the PagerDuty handler.
        dedup_key: Forwarded to the PagerDuty handler.
    """
    send_standard_notifications(destinations=destinations["standard"], title=title, body=body)
    # PagerDuty has no separate title field here: the alert body is the summary.
    handle_pagerduty_incidents(
        destinations=destinations["pagerduty"],
        incident_summary=body,
        incident_type=incident_type,
        dedup_key=dedup_key,
    )


def send_standard_notifications(title: str, body: str, destinations: list):
"""Send a notification to all standard receivers."""
# TODO: Since this is run in its own thread, we have to make sure we properly
# log failures here.

# Send notifications to non-PagerDuty destinations
sender = apprise.Apprise()
for source in config["notify"]["destinations"]:
for source in destinations:
sender.add(source)
sender.notify(title=title, body=body)


def handle_pagerduty_incidents(
    incident_type: str,
    dedup_key: str,
    destinations: list,
    incident_summary: Optional[str] = None,
):
    """Trigger or resolve a PagerDuty incident on every given destination.

    Destinations are processed independently. A malformed destination, an
    unknown incident type or a failing PagerDuty call is logged and skipped, so
    one bad destination neither blocks the others nor raises into the caller.

    Args:
        incident_type (str): The type of incident action to perform. Should be either 'trigger' or 'resolve'.
        dedup_key (str): The deduplication key to uniquely identify the incident.
        destinations (list): List of destinations to handle PagerDuty incidents for.
        incident_summary (str, optional): A summary of the incident, used only when triggering an incident. Defaults to None.
    """
    for source in destinations:
        # Expected form: pagerduty://<integration-key>@<...>. partition() yields
        # an empty remainder instead of raising when "//" is absent.
        integration_key = source.partition("//")[2].split("@")[0]
        if not integration_key:
            # The destination itself is not logged: it may embed credentials.
            logger.error("Ignoring PagerDuty destination without an integration key.")
            continue

        try:
            session = EventsAPISession(integration_key)
            if incident_type == "trigger":
                session.trigger(source="cos-alerter", summary=incident_summary, dedup_key=dedup_key)
            elif incident_type == "resolve":
                session.resolve(dedup_key)
            else:
                logger.error("Unknown PagerDuty incident type %r; nothing sent.", incident_type)
        except Exception:
            # Notification boundary: this runs either in a notifier thread or
            # while an alert timeout is being reset, so the failure is recorded
            # and the remaining destinations are still attempted.
            logger.exception(
                "PagerDuty %s failed for dedup key %s.", incident_type, dedup_key
            )


def send_test_notification():
"""Signal handler which sends a test email to all configured receivers."""
logger.info("Sending test notifications.")
send_notifications(
send_all_notifications(
title="COS-Alerter test email.",
body="This is a test email automatically generated by COS-alerter.",
destinations=split_destinations(config["notify"]["destinations"]),
incident_type="trigger",
dedup_key="test-dedup-key",
)
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "cos-alerter"
version = "0.8.0"
version = "0.9.0"
authors = [
{ name="Dylan Stephano-Shachter", email="[email protected]" }
]
Expand All @@ -28,6 +28,7 @@ dependencies = [
"timeago~=1.0",
"waitress~=2.1",
"xdg-base-dirs~=6.0.1",
"pdpyras~=5.2.0"
]

[project.urls]
Expand Down
2 changes: 1 addition & 1 deletion rockcraft.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: cos-alerter
summary: A liveness checker for self-monitoring.
description: Receive regular pings from the cos stack and alert when they stop.
version: "0.8.0"
version: "0.9.0"
base: [email protected]
license: Apache-2.0
platforms:
Expand Down
2 changes: 1 addition & 1 deletion snap/snapcraft.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: cos-alerter
version: '0.8.0'
version: '0.9.0'
summary: A watchdog alerting on alertmanager notification failures.
license: Apache-2.0
contact: [email protected]
Expand Down
1 change: 1 addition & 0 deletions tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
DESTINATIONS = [
"mailtos://user:pass@domain/[email protected],[email protected]",
"slack://xoxb-1234-1234-4ddbc191d40ee098cbaae6f3523ada2d/#general",
"pagerduty://integration-key@api-key",
]

CONFIG = {
Expand Down
59 changes: 33 additions & 26 deletions tests/test_alerter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,24 @@
import freezegun
import yaml
from helpers import DESTINATIONS
from pdpyras import EventsAPISession

from cos_alerter.alerter import AlerterState, config, send_test_notification, up_time
from cos_alerter.alerter import (
AlerterState,
config,
send_test_notification,
split_destinations,
up_time,
)


def assert_notifications(notify_mock, add_mock, title, body):
add_mock.assert_has_calls([unittest.mock.call(x) for x in DESTINATIONS])
def assert_notifications(notify_mock, add_mock, pd_mock, title, body, dedup_key):
categorized_destinations = split_destinations(DESTINATIONS)
add_mock.assert_has_calls(
[unittest.mock.call(x) for x in categorized_destinations["standard"]]
)
notify_mock.assert_called_with(title=title, body=body)
pd_mock.assert_called_with(source="cos-alerter", summary=body, dedup_key=dedup_key)


def test_config_gets_item(fake_fs):
Expand Down Expand Up @@ -142,7 +153,8 @@ def test_is_down_from_initialize(monotonic_mock, fake_fs):

@freezegun.freeze_time("2023-01-01")
@unittest.mock.patch("time.monotonic")
def test_is_down_with_reset_alert_timeout(monotonic_mock, fake_fs):
@unittest.mock.patch.object(EventsAPISession, "resolve")
def test_is_down_with_reset_alert_timeout(pd_mock, monotonic_mock, fake_fs):
monotonic_mock.return_value = 1000
AlerterState.initialize()
state = AlerterState(clientid="clientid1")
Expand All @@ -153,6 +165,7 @@ def test_is_down_with_reset_alert_timeout(monotonic_mock, fake_fs):
assert state.is_down() is False
monotonic_mock.return_value = 2330 # Five and a half minutes have passed
assert state.is_down() is True
pd_mock.assert_called_with(f"{state.clientid}-None")


@freezegun.freeze_time("2023-01-01")
Expand Down Expand Up @@ -201,21 +214,6 @@ def test_is_down_from_graceful_shutdown(monotonic_mock, fake_fs):
assert state.is_down() is True


@freezegun.freeze_time("2023-01-01")
@unittest.mock.patch("time.monotonic")
def test_is_down(monotonic_mock, fake_fs):
monotonic_mock.return_value = 1000
AlerterState.initialize()
state = AlerterState(clientid="clientid1")
with state:
monotonic_mock.return_value = 2000
state.reset_alert_timeout()
monotonic_mock.return_value = 2180 # Three minutes have passed
assert state.is_down() is False
monotonic_mock.return_value = 2330 # Five and a half minutes have passed
assert state.is_down() is True


@freezegun.freeze_time("2023-01-01")
@unittest.mock.patch("time.monotonic")
def test_recently_notified(monotonic_mock, fake_fs):
Expand All @@ -234,46 +232,55 @@ def test_recently_notified(monotonic_mock, fake_fs):
@unittest.mock.patch("time.monotonic")
@unittest.mock.patch.object(apprise.Apprise, "add")
@unittest.mock.patch.object(apprise.Apprise, "notify")
def test_notify(notify_mock, add_mock, monotonic_mock, fake_fs):
@unittest.mock.patch.object(EventsAPISession, "trigger")
def test_notify(pd_mock, notify_mock, add_mock, monotonic_mock, fake_fs):
monotonic_mock.return_value = 1000
AlerterState.initialize()
state = AlerterState(clientid="clientid1")

dedup_key = f"{state.clientid}-{state.last_alert_datetime()}"
with state:
state.notify()
for thread in threading.enumerate():
if thread != threading.current_thread():
thread.join()

assert_notifications(
notify_mock,
add_mock,
notify_mock=notify_mock,
add_mock=add_mock,
pd_mock=pd_mock,
title="**Alertmanager is Down!**",
body=textwrap.dedent(
"""
Your Alertmanager instance: clientid1 seems to be down!
It has not alerted COS-Alerter ever.
"""
),
dedup_key=dedup_key,
)

# Make sure if we try again, nothing is sent
notify_mock.reset_mock()
pd_mock.reset_mock()

with state:
state.notify()
for thread in threading.enumerate():
if thread != threading.current_thread():
thread.join()
notify_mock.assert_not_called()
pd_mock.assert_not_called()


@unittest.mock.patch.object(apprise.Apprise, "add")
@unittest.mock.patch.object(apprise.Apprise, "notify")
def test_send_test_notification(notify_mock, add_mock, fake_fs):
@unittest.mock.patch.object(EventsAPISession, "trigger")
def test_send_test_notification(pd_mock, notify_mock, add_mock, fake_fs):
send_test_notification()
assert_notifications(
notify_mock,
add_mock,
notify_mock=notify_mock,
add_mock=add_mock,
pd_mock=pd_mock,
title="COS-Alerter test email.",
body="This is a test email automatically generated by COS-alerter.",
dedup_key="test-dedup-key",
)
Loading