add fingerprint for all findings (#528)
aantn authored Sep 10, 2022
1 parent e1ace49 commit 30dd1bb
Showing 2 changed files with 89 additions and 35 deletions.
66 changes: 44 additions & 22 deletions playbooks/robusta_playbooks/oom_killer.py
@@ -30,21 +30,28 @@ class OomKillerEnricherParams(ActionParams):
# report the corresponding node as the reason for the OOMKill.
NODE_MEMORY_THRESHOLD = 0.95


@action
-def oomkilled_container_graph_enricher(event: PodEvent, params: ResourceGraphEnricherParams):
+def oomkilled_container_graph_enricher(
+    event: PodEvent, params: ResourceGraphEnricherParams
+):
"""
Get a graph of a specific resource for this pod. Note: "Disk" Resource is not supported.
"""
pod = event.get_pod()
if not pod:
logging.error(f"cannot run pod_oom_killer_enricher on event with no pod: {event}")
logging.error(
f"cannot run pod_oom_killer_enricher on event with no pod: {event}"
)
return
oomkilled_container = pod_most_recent_oom_killed_container(pod)
if not oomkilled_container:
logging.error(f"Unable to find oomkilled container")
return

-    container_graph = create_container_graph(params, pod, oomkilled_container.container, show_limit=True )
+    container_graph = create_container_graph(
+        params, pod, oomkilled_container.container, show_limit=True
+    )
event.add_enrichment([container_graph])


@@ -57,25 +64,38 @@ def pod_oom_killer_enricher(
"""
pod = event.get_pod()
if not pod:
logging.error(f"cannot run pod_oom_killer_enricher on event with no pod: {event}")
logging.error(
f"cannot run pod_oom_killer_enricher on event with no pod: {event}"
)
return

finding = Finding(
title=f"Pod {pod.metadata.name} in namespace {pod.metadata.namespace} OOMKilled results",
aggregation_key="pod_oom_killer_enricher",
-        severity=FindingSeverity.HIGH
+        severity=FindingSeverity.HIGH,
+        subject=PodFindingSubject(pod),
)

labels = [("Pod", pod.metadata.name),
("Namespace", pod.metadata.namespace),
("Node Name", pod.spec.nodeName),
]
labels = [
("Pod", pod.metadata.name),
("Namespace", pod.metadata.namespace),
("Node Name", pod.spec.nodeName),
]
node: Node = Node.readNode(pod.spec.nodeName).obj
if node:
-        allocatable_memory = PodResources.parse_mem(node.status.allocatable.get("memory", "0Mi"))
-        capacity_memory = PodResources.parse_mem(node.status.capacity.get("memory", "0Mi"))
-        allocated_percent = (capacity_memory - allocatable_memory) * 100 / capacity_memory
-        node_label = ("Node allocated memory", f"{allocated_percent:.2f}% out of {allocatable_memory}MB allocatable")
+        allocatable_memory = PodResources.parse_mem(
+            node.status.allocatable.get("memory", "0Mi")
+        )
+        capacity_memory = PodResources.parse_mem(
+            node.status.capacity.get("memory", "0Mi")
+        )
+        allocated_percent = (
+            (capacity_memory - allocatable_memory) * 100 / capacity_memory
+        )
+        node_label = (
+            "Node allocated memory",
+            f"{allocated_percent:.2f}% out of {allocatable_memory}MB allocatable",
+        )
labels.append(node_label)
else:
logging.warning(
@@ -84,20 +104,24 @@ def pod_oom_killer_enricher(

oomkilled_container = pod_most_recent_oom_killed_container(pod)
if not oomkilled_container or not oomkilled_container.state:
-        logging.error(
-            f"could not find OOMKilled status in pod {pod.metadata.name}"
-        )
+        logging.error(f"could not find OOMKilled status in pod {pod.metadata.name}")
else:
-        requests, limits = PodContainer.get_memory_resources(oomkilled_container.container)
+        requests, limits = PodContainer.get_memory_resources(
+            oomkilled_container.container
+        )
labels.append(("Container name", oomkilled_container.container.name))
memory_limit = "No limit" if not limits else f"{limits}MB limit"
memory_requests = "No request" if not requests else f"{requests}MB request"
labels.append(("Container memory", f"{memory_requests}, {memory_limit}"))
oom_killed_status = oomkilled_container.state
if oom_killed_status.terminated.startedAt:
labels.append(("Container started at", oom_killed_status.terminated.startedAt))
labels.append(
("Container started at", oom_killed_status.terminated.startedAt)
)
if oom_killed_status.terminated.finishedAt:
labels.append(("Container finished at", oom_killed_status.terminated.finishedAt))
labels.append(
("Container finished at", oom_killed_status.terminated.finishedAt)
)
table_block = TableBlock(
[[k, v] for (k, v) in labels],
["field", "value"],
@@ -106,6 +130,7 @@ def pod_oom_killer_enricher(
finding.add_enrichment([table_block])
event.add_finding(finding)


@action
def oom_killer_enricher(
event: PrometheusKubernetesAlert, config: OomKillerEnricherParams
@@ -253,8 +278,6 @@ def get_oom_kills_from_pod(self, pod: Pod) -> List[OomKill]:

return oom_kills



@staticmethod
def get_memory_specs(resources: Optional[ResourceRequirements]) -> MemorySpecs:
if resources is None:
@@ -348,4 +371,3 @@ def get_busy_node_reason(self) -> Optional[str]:

reason = f"node {node_name} used too much memory: reached {node_max_used_memory_in_percentage} percentage of its available memory"
return reason
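
For reference, the "Node allocated memory" label in this file is plain arithmetic over the node status. Below is a minimal standalone sketch of the same computation; the MB values are illustrative, and the playbook itself derives them from `node.status` via `PodResources.parse_mem`.

```python
# Standalone sketch of the "Node allocated memory" computation used above.
# Assumes memory sizes are already parsed to MB; values are illustrative.
def node_allocated_percent(capacity_mb: float, allocatable_mb: float) -> float:
    # Fraction of total capacity that is not allocatable to pods,
    # i.e. memory reserved for the system and kubelet.
    return (capacity_mb - allocatable_mb) * 100 / capacity_mb

# A node with 16384MB capacity and 15565MB allocatable:
print(f"{node_allocated_percent(16384, 15565):.2f}%")  # 5.00%
```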

58 changes: 45 additions & 13 deletions src/robusta/core/reporting/base.py
@@ -1,3 +1,4 @@
+import hashlib
import logging
import urllib.parse
import uuid
@@ -37,14 +38,18 @@ def from_severity(severity: str) -> "FindingSeverity":
return FindingSeverity.HIGH

raise Exception(f"Unknown severity {severity}")

-    def to_emoji(self) -> str:
-        if self == FindingSeverity.DEBUG: return "🔵"
-        elif self == FindingSeverity.INFO: return "🟢"
-        elif self == FindingSeverity.LOW: return "🟡"
-        elif self == FindingSeverity.MEDIUM: return "🟠"
-        elif self == FindingSeverity.HIGH: return "🔴"
+    def to_emoji(self) -> str:
+        if self == FindingSeverity.DEBUG:
+            return "🔵"
+        elif self == FindingSeverity.INFO:
+            return "🟢"
+        elif self == FindingSeverity.LOW:
+            return "🟡"
+        elif self == FindingSeverity.MEDIUM:
+            return "🟠"
+        elif self == FindingSeverity.HIGH:
+            return "🔴"


class Enrichment:
@@ -71,7 +76,9 @@ def attribute_map(self) -> Dict[str, str]:
def get_invalid_attributes(self, attributes: List[str]) -> List:
return list(set(attributes) - set(self.attribute_map))

-    def attribute_matches(self, attribute: str, expression: Union[str, List[str]]) -> bool:
+    def attribute_matches(
+        self, attribute: str, expression: Union[str, List[str]]
+    ) -> bool:
value = self.attribute_map[attribute]
if isinstance(expression, str):
return bool(re.match(expression, value))
@@ -116,14 +123,15 @@ def __init__(
severity: FindingSeverity = FindingSeverity.INFO,
source: FindingSource = FindingSource.NONE,
description: str = None,
+        # TODO: this is bug-prone - see https://towardsdatascience.com/python-pitfall-mutable-default-arguments-9385e8265422
subject: FindingSubject = FindingSubject(),
finding_type: FindingType = FindingType.ISSUE,
failure: bool = True,
creation_date: str = None,
fingerprint: str = None,
starts_at: datetime = None,
ends_at: datetime = None,
-        add_silence_url: bool = False
+        add_silence_url: bool = False,
) -> None:
self.id: uuid = uuid.uuid4()
self.title = title
@@ -145,7 +153,11 @@ def __init__(
self.investigate_uri = f"{ROBUSTA_UI_DOMAIN}/{uri_path}"
self.add_silence_url = add_silence_url
self.creation_date = creation_date
-        self.fingerprint = fingerprint
+        self.fingerprint = (
+            fingerprint
+            if fingerprint
+            else self.__calculate_fingerprint(subject, source, aggregation_key)
+        )
self.starts_at = starts_at if starts_at else datetime.now()
self.ends_at = ends_at
self.dirty = False
@@ -164,9 +176,16 @@ def attribute_map(self) -> Dict[str, str]:
"name": str(self.subject.name),
}

-    def add_enrichment(self, enrichment_blocks: List[BaseBlock], annotations=None, suppress_warning: bool = False):
+    def add_enrichment(
+        self,
+        enrichment_blocks: List[BaseBlock],
+        annotations=None,
+        suppress_warning: bool = False,
+    ):
if self.dirty and not suppress_warning:
logging.warning("Updating a finding after it was added to the event is not allowed!")
logging.warning(
"Updating a finding after it was added to the event is not allowed!"
)

if not enrichment_blocks:
return
@@ -180,7 +199,7 @@ def __str__(self):
def get_prometheus_silence_url(self, cluster_id: str) -> str:
labels: Dict[str, str] = {
"alertname": self.aggregation_key,
"cluster": cluster_id
"cluster": cluster_id,
}
if self.subject.namespace:
labels["namespace"] = self.subject.namespace
@@ -192,3 +211,16 @@ def get_prometheus_silence_url(self, cluster_id: str) -> str:
labels["referer"] = "sink"

return f"{ROBUSTA_UI_DOMAIN}/silences/create?{urllib.parse.urlencode(labels)}"

+    @staticmethod
+    def __calculate_fingerprint(
+        subject: FindingSubject, source: FindingSource, aggregation_key: str
+    ) -> str:
+        # some sinks require a unique fingerprint, typically used for two reasons:
+        # 1. de-dupe the same alert if it fires twice
+        # 2. update an existing alert and change its status from firing to resolved
+        #
+        # if we have a fingerprint available from the trigger (e.g. alertmanager) then use that
+        # if not, generate with logic similar to alertmanager
+        s = f"{subject.subject_type},{subject.name},{subject.namespace},{subject.node},{source.value}{aggregation_key}"
+        return hashlib.sha256(s.encode()).hexdigest()
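
The point of the new fallback is that identical findings now hash to an identical fingerprint, so a sink can de-dupe a repeat firing or flip an earlier alert from firing to resolved. A minimal sketch of that behaviour, using the same string layout as the method above; the field values are illustrative only.

```python
import hashlib

# Sketch of the fingerprint property: the same subject/source/aggregation_key
# always yields the same digest. All field values below are illustrative.
def fingerprint(subject_type: str, name: str, namespace: str, node: str,
                source_value: str, aggregation_key: str) -> str:
    s = f"{subject_type},{name},{namespace},{node},{source_value}{aggregation_key}"
    return hashlib.sha256(s.encode()).hexdigest()

first = fingerprint("pod", "my-app-0", "default", "node-1",
                    "prometheus", "pod_oom_killer_enricher")
again = fingerprint("pod", "my-app-0", "default", "node-1",
                    "prometheus", "pod_oom_killer_enricher")
assert first == again  # the same alert firing twice maps to one fingerprint
```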
