add fingerprint for all findings (#528)
aantn authored Sep 10, 2022
1 parent e1ace49 commit 30dd1bb
Showing 2 changed files with 89 additions and 35 deletions.
66 changes: 44 additions & 22 deletions playbooks/robusta_playbooks/oom_killer.py
@@ -30,21 +30,28 @@ class OomKillerEnricherParams(ActionParams):
# report the corresponding node as the reason for the OOMKill.
NODE_MEMORY_THRESHOLD = 0.95


@action
-def oomkilled_container_graph_enricher(event: PodEvent, params: ResourceGraphEnricherParams):
+def oomkilled_container_graph_enricher(
+    event: PodEvent, params: ResourceGraphEnricherParams
+):
"""
Get a graph of a specific resource for this pod. Note: "Disk" Resource is not supported.
"""
pod = event.get_pod()
if not pod:
logging.error(f"cannot run pod_oom_killer_enricher on event with no pod: {event}")
logging.error(
f"cannot run pod_oom_killer_enricher on event with no pod: {event}"
)
return
oomkilled_container = pod_most_recent_oom_killed_container(pod)
if not oomkilled_container:
logging.error(f"Unable to find oomkilled container")
return

-    container_graph = create_container_graph(params, pod, oomkilled_container.container, show_limit=True )
+    container_graph = create_container_graph(
+        params, pod, oomkilled_container.container, show_limit=True
+    )
event.add_enrichment([container_graph])


@@ -57,25 +64,38 @@ def pod_oom_killer_enricher(
"""
pod = event.get_pod()
if not pod:
logging.error(f"cannot run pod_oom_killer_enricher on event with no pod: {event}")
logging.error(
f"cannot run pod_oom_killer_enricher on event with no pod: {event}"
)
return

finding = Finding(
title=f"Pod {pod.metadata.name} in namespace {pod.metadata.namespace} OOMKilled results",
aggregation_key="pod_oom_killer_enricher",
-        severity=FindingSeverity.HIGH
+        severity=FindingSeverity.HIGH,
+        subject=PodFindingSubject(pod),
)

labels = [("Pod", pod.metadata.name),
("Namespace", pod.metadata.namespace),
("Node Name", pod.spec.nodeName),
]
labels = [
("Pod", pod.metadata.name),
("Namespace", pod.metadata.namespace),
("Node Name", pod.spec.nodeName),
]
node: Node = Node.readNode(pod.spec.nodeName).obj
if node:
-        allocatable_memory = PodResources.parse_mem(node.status.allocatable.get("memory", "0Mi"))
-        capacity_memory = PodResources.parse_mem(node.status.capacity.get("memory", "0Mi"))
-        allocated_percent = (capacity_memory - allocatable_memory) * 100 / capacity_memory
-        node_label = ("Node allocated memory", f"{allocated_percent:.2f}% out of {allocatable_memory}MB allocatable")
+        allocatable_memory = PodResources.parse_mem(
+            node.status.allocatable.get("memory", "0Mi")
+        )
+        capacity_memory = PodResources.parse_mem(
+            node.status.capacity.get("memory", "0Mi")
+        )
+        allocated_percent = (
+            (capacity_memory - allocatable_memory) * 100 / capacity_memory
+        )
+        node_label = (
+            "Node allocated memory",
+            f"{allocated_percent:.2f}% out of {allocatable_memory}MB allocatable",
+        )
labels.append(node_label)
else:
logging.warning(
@@ -84,20 +104,24 @@ def pod_oom_killer_enricher(

oomkilled_container = pod_most_recent_oom_killed_container(pod)
if not oomkilled_container or not oomkilled_container.state:
-        logging.error(
-            f"could not find OOMKilled status in pod {pod.metadata.name}"
-        )
+        logging.error(f"could not find OOMKilled status in pod {pod.metadata.name}")
else:
-        requests, limits = PodContainer.get_memory_resources(oomkilled_container.container)
+        requests, limits = PodContainer.get_memory_resources(
+            oomkilled_container.container
+        )
labels.append(("Container name", oomkilled_container.container.name))
memory_limit = "No limit" if not limits else f"{limits}MB limit"
memory_requests = "No request" if not requests else f"{requests}MB request"
labels.append(("Container memory", f"{memory_requests}, {memory_limit}"))
oom_killed_status = oomkilled_container.state
if oom_killed_status.terminated.startedAt:
labels.append(("Container started at", oom_killed_status.terminated.startedAt))
labels.append(
("Container started at", oom_killed_status.terminated.startedAt)
)
if oom_killed_status.terminated.finishedAt:
labels.append(("Container finished at", oom_killed_status.terminated.finishedAt))
labels.append(
("Container finished at", oom_killed_status.terminated.finishedAt)
)
table_block = TableBlock(
[[k, v] for (k, v) in labels],
["field", "value"],
@@ -106,6 +130,7 @@ def pod_oom_killer_enricher(
finding.add_enrichment([table_block])
event.add_finding(finding)


@action
def oom_killer_enricher(
event: PrometheusKubernetesAlert, config: OomKillerEnricherParams
@@ -253,8 +278,6 @@ def get_oom_kills_from_pod(self, pod: Pod) -> List[OomKill]:

return oom_kills



@staticmethod
def get_memory_specs(resources: Optional[ResourceRequirements]) -> MemorySpecs:
if resources is None:
@@ -348,4 +371,3 @@ def get_busy_node_reason(self) -> Optional[str]:

reason = f"node {node_name} used too much memory: reached {node_max_used_memory_in_percentage} percentage of its available memory"
return reason
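
For reference, the "Node allocated memory" label in this file is plain arithmetic over the node status. Below is a minimal standalone sketch of the same computation; the MB values are illustrative, and the playbook itself derives them from `node.status` via `PodResources.parse_mem`.

```python
# Standalone sketch of the "Node allocated memory" computation used above.
# Assumes memory sizes are already parsed to MB; values are illustrative.
def node_allocated_percent(capacity_mb: float, allocatable_mb: float) -> float:
    # Fraction of total capacity that is not allocatable to pods,
    # i.e. memory reserved for the system and kubelet.
    return (capacity_mb - allocatable_mb) * 100 / capacity_mb

# A node with 16384MB capacity and 15565MB allocatable:
print(f"{node_allocated_percent(16384, 15565):.2f}%")  # 5.00%
```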

58 changes: 45 additions & 13 deletions src/robusta/core/reporting/base.py
@@ -1,3 +1,4 @@
+import hashlib
import logging
import urllib.parse
import uuid
@@ -37,14 +38,18 @@ def from_severity(severity: str) -> "FindingSeverity":
return FindingSeverity.HIGH

raise Exception(f"Unknown severity {severity}")

-    def to_emoji(self) -> str:
-        if self == FindingSeverity.DEBUG: return "🔵"
-        elif self == FindingSeverity.INFO: return "🟢"
-        elif self == FindingSeverity.LOW: return "🟡"
-        elif self == FindingSeverity.MEDIUM: return "🟠"
-        elif self == FindingSeverity.HIGH: return "🔴"
+    def to_emoji(self) -> str:
+        if self == FindingSeverity.DEBUG:
+            return "🔵"
+        elif self == FindingSeverity.INFO:
+            return "🟢"
+        elif self == FindingSeverity.LOW:
+            return "🟡"
+        elif self == FindingSeverity.MEDIUM:
+            return "🟠"
+        elif self == FindingSeverity.HIGH:
+            return "🔴"


class Enrichment:
@@ -71,7 +76,9 @@ def attribute_map(self) -> Dict[str, str]:
def get_invalid_attributes(self, attributes: List[str]) -> List:
return list(set(attributes) - set(self.attribute_map))

-    def attribute_matches(self, attribute: str, expression: Union[str, List[str]]) -> bool:
+    def attribute_matches(
+        self, attribute: str, expression: Union[str, List[str]]
+    ) -> bool:
value = self.attribute_map[attribute]
if isinstance(expression, str):
return bool(re.match(expression, value))
@@ -116,14 +123,15 @@ def __init__(
severity: FindingSeverity = FindingSeverity.INFO,
source: FindingSource = FindingSource.NONE,
description: str = None,
+        # TODO: this is bug-prone - see https://towardsdatascience.com/python-pitfall-mutable-default-arguments-9385e8265422
subject: FindingSubject = FindingSubject(),
finding_type: FindingType = FindingType.ISSUE,
failure: bool = True,
creation_date: str = None,
fingerprint: str = None,
starts_at: datetime = None,
ends_at: datetime = None,
-        add_silence_url: bool = False
+        add_silence_url: bool = False,
) -> None:
self.id: uuid = uuid.uuid4()
self.title = title
@@ -145,7 +153,11 @@ def __init__(
self.investigate_uri = f"{ROBUSTA_UI_DOMAIN}/{uri_path}"
self.add_silence_url = add_silence_url
self.creation_date = creation_date
-        self.fingerprint = fingerprint
+        self.fingerprint = (
+            fingerprint
+            if fingerprint
+            else self.__calculate_fingerprint(subject, source, aggregation_key)
+        )
self.starts_at = starts_at if starts_at else datetime.now()
self.ends_at = ends_at
self.dirty = False
@@ -164,9 +176,16 @@ def attribute_map(self) -> Dict[str, str]:
"name": str(self.subject.name),
}

-    def add_enrichment(self, enrichment_blocks: List[BaseBlock], annotations=None, suppress_warning: bool = False):
+    def add_enrichment(
+        self,
+        enrichment_blocks: List[BaseBlock],
+        annotations=None,
+        suppress_warning: bool = False,
+    ):
if self.dirty and not suppress_warning:
logging.warning("Updating a finding after it was added to the event is not allowed!")
logging.warning(
"Updating a finding after it was added to the event is not allowed!"
)

if not enrichment_blocks:
return
@@ -180,7 +199,7 @@ def __str__(self):
def get_prometheus_silence_url(self, cluster_id: str) -> str:
labels: Dict[str, str] = {
"alertname": self.aggregation_key,
"cluster": cluster_id
"cluster": cluster_id,
}
if self.subject.namespace:
labels["namespace"] = self.subject.namespace
@@ -192,3 +211,16 @@ def get_prometheus_silence_url(self, cluster_id: str) -> str:
labels["referer"] = "sink"

return f"{ROBUSTA_UI_DOMAIN}/silences/create?{urllib.parse.urlencode(labels)}"

+    @staticmethod
+    def __calculate_fingerprint(
+        subject: FindingSubject, source: FindingSource, aggregation_key: str
+    ) -> str:
+        # some sinks require a unique fingerprint, typically used for two reasons:
+        # 1. de-dupe the same alert if it fires twice
+        # 2. update an existing alert and change its status from firing to resolved
+        #
+        # if we have a fingerprint available from the trigger (e.g. alertmanager) then use that
+        # if not, generate with logic similar to alertmanager
+        s = f"{subject.subject_type},{subject.name},{subject.namespace},{subject.node},{source.value}{aggregation_key}"
+        return hashlib.sha256(s.encode()).hexdigest()
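
The point of the new fallback is that identical findings now hash to an identical fingerprint, so a sink can de-dupe a repeat firing or flip an earlier alert from firing to resolved. A minimal sketch of that behaviour, using the same string layout as the method above; the field values are illustrative only.

```python
import hashlib

# Sketch of the fingerprint property: the same subject/source/aggregation_key
# always yields the same digest. All field values below are illustrative.
def fingerprint(subject_type: str, name: str, namespace: str, node: str,
                source_value: str, aggregation_key: str) -> str:
    s = f"{subject_type},{name},{namespace},{node},{source_value}{aggregation_key}"
    return hashlib.sha256(s.encode()).hexdigest()

first = fingerprint("pod", "my-app-0", "default", "node-1",
                    "prometheus", "pod_oom_killer_enricher")
again = fingerprint("pod", "my-app-0", "default", "node-1",
                    "prometheus", "pod_oom_killer_enricher")
assert first == again  # the same alert firing twice maps to one fingerprint
```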
