From 30dd1bb028dcac9a083655437258e24429e8a4c7 Mon Sep 17 00:00:00 2001
From: Natan Yellin
Date: Sat, 10 Sep 2022 14:01:12 +0300
Subject: [PATCH] add fingerprint for all findings (#528)

---
 playbooks/robusta_playbooks/oom_killer.py | 66 +++++++++++++++--------
 src/robusta/core/reporting/base.py        | 58 +++++++++++++++-----
 2 files changed, 89 insertions(+), 35 deletions(-)

diff --git a/playbooks/robusta_playbooks/oom_killer.py b/playbooks/robusta_playbooks/oom_killer.py
index d4ff8003d..b6e01a52d 100644
--- a/playbooks/robusta_playbooks/oom_killer.py
+++ b/playbooks/robusta_playbooks/oom_killer.py
@@ -30,21 +30,28 @@ class OomKillerEnricherParams(ActionParams):
 # report the corresponding node as the reason for the OOMKill.
 NODE_MEMORY_THRESHOLD = 0.95
 
+
 @action
-def oomkilled_container_graph_enricher(event: PodEvent, params: ResourceGraphEnricherParams):
+def oomkilled_container_graph_enricher(
+    event: PodEvent, params: ResourceGraphEnricherParams
+):
     """
     Get a graph of a specific resource for this pod. Note: "Disk" Resource is not supported.
     """
     pod = event.get_pod()
     if not pod:
-        logging.error(f"cannot run pod_oom_killer_enricher on event with no pod: {event}")
+        logging.error(
+            f"cannot run pod_oom_killer_enricher on event with no pod: {event}"
+        )
         return
     oomkilled_container = pod_most_recent_oom_killed_container(pod)
     if not oomkilled_container:
         logging.error(f"Unable to find oomkilled container")
         return
-    container_graph = create_container_graph(params, pod, oomkilled_container.container, show_limit=True )
+    container_graph = create_container_graph(
+        params, pod, oomkilled_container.container, show_limit=True
+    )
     event.add_enrichment([container_graph])
 
 
@@ -57,25 +64,38 @@ def pod_oom_killer_enricher(
     """
     pod = event.get_pod()
     if not pod:
-        logging.error(f"cannot run pod_oom_killer_enricher on event with no pod: {event}")
+        logging.error(
+            f"cannot run pod_oom_killer_enricher on event with no pod: {event}"
+        )
         return
 
     finding = Finding(
         title=f"Pod {pod.metadata.name} in namespace {pod.metadata.namespace} OOMKilled results",
         aggregation_key="pod_oom_killer_enricher",
-        severity=FindingSeverity.HIGH
+        severity=FindingSeverity.HIGH,
+        subject=PodFindingSubject(pod),
     )
-    labels = [("Pod", pod.metadata.name),
-              ("Namespace", pod.metadata.namespace),
-              ("Node Name", pod.spec.nodeName),
-              ]
+    labels = [
+        ("Pod", pod.metadata.name),
+        ("Namespace", pod.metadata.namespace),
+        ("Node Name", pod.spec.nodeName),
+    ]
     node: Node = Node.readNode(pod.spec.nodeName).obj
     if node:
-        allocatable_memory = PodResources.parse_mem(node.status.allocatable.get("memory", "0Mi"))
-        capacity_memory = PodResources.parse_mem(node.status.capacity.get("memory", "0Mi"))
-        allocated_precent = (capacity_memory - allocatable_memory) * 100 / capacity_memory
-        node_label = ("Node allocated memory", f"{allocated_precent:.2f}% out of {allocatable_memory}MB allocatable")
+        allocatable_memory = PodResources.parse_mem(
+            node.status.allocatable.get("memory", "0Mi")
+        )
+        capacity_memory = PodResources.parse_mem(
+            node.status.capacity.get("memory", "0Mi")
+        )
+        allocated_percent = (
+            (capacity_memory - allocatable_memory) * 100 / capacity_memory
+        )
+        node_label = (
+            "Node allocated memory",
+            f"{allocated_percent:.2f}% out of {allocatable_memory}MB allocatable",
+        )
         labels.append(node_label)
     else:
         logging.warning(
@@ -84,20 +104,24 @@
     oomkilled_container = pod_most_recent_oom_killed_container(pod)
     if not oomkilled_container or not oomkilled_container.state:
-        logging.error(
-            f"could not find OOMKilled status in pod {pod.metadata.name}"
-        )
+        logging.error(f"could not find OOMKilled status in pod {pod.metadata.name}")
     else:
-        requests, limits = PodContainer.get_memory_resources(oomkilled_container.container)
+        requests, limits = PodContainer.get_memory_resources(
+            oomkilled_container.container
+        )
         labels.append(("Container name", oomkilled_container.container.name))
         memory_limit = "No limit" if not limits else f"{limits}MB limit"
         memory_requests = "No request" if not requests else f"{requests}MB request"
         labels.append(("Container memory", f"{memory_requests}, {memory_limit}"))
         oom_killed_status = oomkilled_container.state
         if oom_killed_status.terminated.startedAt:
-            labels.append(("Container started at", oom_killed_status.terminated.startedAt))
+            labels.append(
+                ("Container started at", oom_killed_status.terminated.startedAt)
+            )
         if oom_killed_status.terminated.finishedAt:
-            labels.append(("Container finished at", oom_killed_status.terminated.finishedAt))
+            labels.append(
+                ("Container finished at", oom_killed_status.terminated.finishedAt)
+            )
     table_block = TableBlock(
         [[k, v] for (k, v) in labels],
         ["field", "value"],
@@ -106,6 +130,7 @@
     finding.add_enrichment([table_block])
     event.add_finding(finding)
 
+
 @action
 def oom_killer_enricher(
     event: PrometheusKubernetesAlert, config: OomKillerEnricherParams
@@ -253,8 +278,6 @@ def get_oom_kills_from_pod(self, pod: Pod) -> List[OomKill]:
 
         return oom_kills
 
-
-
     @staticmethod
     def get_memory_specs(resources: Optional[ResourceRequirements]) -> MemorySpecs:
         if resources is None:
@@ -348,4 +371,3 @@ def get_busy_node_reason(self) -> Optional[str]:
             reason = f"node {node_name} used too much memory: reached {node_max_used_memory_in_percentage} percentage of its available memory"
 
         return reason
-
diff --git a/src/robusta/core/reporting/base.py b/src/robusta/core/reporting/base.py
index d7e58117e..da8021901 100644
--- a/src/robusta/core/reporting/base.py
+++ b/src/robusta/core/reporting/base.py
@@ -1,3 +1,4 @@
+import hashlib
 import logging
 import urllib.parse
 import uuid
@@ -37,14 +38,18 @@ def from_severity(severity: str) -> "FindingSeverity":
             return FindingSeverity.HIGH
         raise Exception(f"Unknown severity {severity}")
 
-
-    def to_emoji(self) -> str:
-        if self == FindingSeverity.DEBUG: return "🔵"
-        elif self == FindingSeverity.INFO: return "🟢"
-        elif self == FindingSeverity.LOW: return "🟡"
-        elif self == FindingSeverity.MEDIUM: return "🟠"
-        elif self == FindingSeverity.HIGH: return "🔴"
+    def to_emoji(self) -> str:
+        if self == FindingSeverity.DEBUG:
+            return "🔵"
+        elif self == FindingSeverity.INFO:
+            return "🟢"
+        elif self == FindingSeverity.LOW:
+            return "🟡"
+        elif self == FindingSeverity.MEDIUM:
+            return "🟠"
+        elif self == FindingSeverity.HIGH:
+            return "🔴"
 
 
 class Enrichment:
@@ -71,7 +76,9 @@ def attribute_map(self) -> Dict[str, str]:
     def get_invalid_attributes(self, attributes: List[str]) -> List:
         return list(set(attributes) - set(self.attribute_map))
 
-    def attribute_matches(self, attribute: str, expression: Union[str, List[str]]) -> bool:
+    def attribute_matches(
+        self, attribute: str, expression: Union[str, List[str]]
+    ) -> bool:
         value = self.attribute_map[attribute]
         if isinstance(expression, str):
             return bool(re.match(expression, value))
@@ -116,6 +123,7 @@ def __init__(
         severity: FindingSeverity = FindingSeverity.INFO,
         source: FindingSource = FindingSource.NONE,
         description: str = None,
+        # TODO: this is bug-prone - see https://towardsdatascience.com/python-pitfall-mutable-default-arguments-9385e8265422
         subject: FindingSubject = FindingSubject(),
         finding_type: FindingType = FindingType.ISSUE,
         failure: bool = True,
@@ -123,7 +131,7 @@
         fingerprint: str = None,
         starts_at: datetime = None,
         ends_at: datetime = None,
-        add_silence_url: bool = False
+        add_silence_url: bool = False,
     ) -> None:
         self.id: uuid = uuid.uuid4()
         self.title = title
@@ -145,7 +153,11 @@
         self.investigate_uri = f"{ROBUSTA_UI_DOMAIN}/{uri_path}"
         self.add_silence_url = add_silence_url
         self.creation_date = creation_date
-        self.fingerprint = fingerprint
+        self.fingerprint = (
+            fingerprint
+            if fingerprint
+            else self.__calculate_fingerprint(subject, source, aggregation_key)
+        )
         self.starts_at = starts_at if starts_at else datetime.now()
         self.ends_at = ends_at
         self.dirty = False
@@ -164,9 +176,16 @@ def attribute_map(self) -> Dict[str, str]:
             "name": str(self.subject.name),
         }
 
-    def add_enrichment(self, enrichment_blocks: List[BaseBlock], annotations=None, suppress_warning: bool = False):
+    def add_enrichment(
+        self,
+        enrichment_blocks: List[BaseBlock],
+        annotations=None,
+        suppress_warning: bool = False,
+    ):
         if self.dirty and not suppress_warning:
-            logging.warning("Updating a finding after it was added to the event is not allowed!")
+            logging.warning(
+                "Updating a finding after it was added to the event is not allowed!"
+            )
 
         if not enrichment_blocks:
             return
@@ -180,7 +199,7 @@ def __str__(self):
     def get_prometheus_silence_url(self, cluster_id: str) -> str:
         labels: Dict[str, str] = {
             "alertname": self.aggregation_key,
-            "cluster": cluster_id
+            "cluster": cluster_id,
         }
         if self.subject.namespace:
             labels["namespace"] = self.subject.namespace
@@ -192,3 +211,16 @@ def get_prometheus_silence_url(self, cluster_id: str) -> str:
             labels["referer"] = "sink"
 
         return f"{ROBUSTA_UI_DOMAIN}/silences/create?{urllib.parse.urlencode(labels)}"
+
+    @staticmethod
+    def __calculate_fingerprint(
+        subject: FindingSubject, source: FindingSource, aggregation_key: str
+    ) -> str:
+        # some sinks require a unique fingerprint, typically used for two reasons:
+        # 1. de-dupe the same alert if it fires twice
+        # 2. update an existing alert and change its status from firing to resolved
+        #
+        # if we have a fingerprint available from the trigger (e.g. alertmanager) then use that
+        # if not, generate with logic similar to alertmanager
+        s = f"{subject.subject_type},{subject.name},{subject.namespace},{subject.node},{source.value},{aggregation_key}"
+        return hashlib.sha256(s.encode()).hexdigest()
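
Illustration (not part of the patch): the fingerprint added above is a stable SHA-256 over the fields that identify a finding, so sinks can de-dupe a finding that fires twice and later flip the same alert from firing to resolved. The sketch below mirrors that logic with plain strings in place of the real FindingSubject/FindingSource types; the helper name and sample values are hypothetical.

import hashlib

def calculate_fingerprint(subject_type: str, name: str, namespace: str,
                          node: str, source: str, aggregation_key: str) -> str:
    # Hash the identifying fields, as Finding.__calculate_fingerprint does above,
    # so repeat firings of the same finding map to one fingerprint.
    s = f"{subject_type},{name},{namespace},{node},{source},{aggregation_key}"
    return hashlib.sha256(s.encode()).hexdigest()

# The same pod and trigger always yield the same fingerprint...
a = calculate_fingerprint("pod", "my-pod", "default", "node-1", "prometheus", "pod_oom_killer_enricher")
b = calculate_fingerprint("pod", "my-pod", "default", "node-1", "prometheus", "pod_oom_killer_enricher")
assert a == b
# ...while a different pod yields a different one, so unrelated findings don't collide.
c = calculate_fingerprint("pod", "other-pod", "default", "node-1", "prometheus", "pod_oom_killer_enricher")
assert a != c

The TODO added in base.py flags a related pitfall: a default such as subject: FindingSubject = FindingSubject() is evaluated once, at function-definition time, so every Finding constructed without an explicit subject shares the same FindingSubject instance. A minimal demonstration of that sharing:

def append_to(item, bucket=[]):  # the default list is created once and shared
    bucket.append(item)
    return bucket

print(append_to(1))  # [1]
print(append_to(2))  # [1, 2] - state from the first call leaks into the second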