From 76b06ff9407804c3c5cda5cb043fe6d077f596af Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Thu, 11 Jul 2024 12:34:20 +0200 Subject: [PATCH 1/9] * Added on_pod_evicted_enricher action * Added default playbook for PodEvicted event * Added enrich_pod_with_node_events action --- helm/robusta/values.yaml | 9 +++ .../robusta_playbooks/event_enrichments.py | 28 +++++++ playbooks/robusta_playbooks/playbook_utils.py | 12 +++ .../pod_evicted_enrichments.py | 80 +++++++++++++++++++ 4 files changed, 129 insertions(+) create mode 100644 playbooks/robusta_playbooks/playbook_utils.py create mode 100644 playbooks/robusta_playbooks/pod_evicted_enrichments.py diff --git a/helm/robusta/values.yaml b/helm/robusta/values.yaml index bf0135e39..775330f70 100644 --- a/helm/robusta/values.yaml +++ b/helm/robusta/values.yaml @@ -176,6 +176,15 @@ builtinPlaybooks: - image_pull_backoff_reporter: {} # playbooks for non-prometheus based monitoring that use prometheus for enrichment +- name: "PodEvicted" + triggers: + - on_pod_evicted: {} + actions: + - on_pod_evicted_enricher: {} + - pod_events_enricher: {} + - enrich_pod_with_node_events: {} + - logs_enricher: {} + - name: "PodOOMKill" triggers: - on_pod_oom_killed: diff --git a/playbooks/robusta_playbooks/event_enrichments.py b/playbooks/robusta_playbooks/event_enrichments.py index f3a7307ff..7b244badf 100644 --- a/playbooks/robusta_playbooks/event_enrichments.py +++ b/playbooks/robusta_playbooks/event_enrichments.py @@ -272,6 +272,34 @@ def pod_events_enricher(event: PodEvent, params: EventEnricherParams): ) +@action +def enrich_pod_with_node_events(event: PodEvent, params: EventEnricherParams): + """ + Given a Kubernetes pod, fetch related events in the near past for its node + """ + pod = event.get_pod() + node: Node = Node.readNode(pod.spec.nodeName).obj + if not node: + logging.error(f"cannot run pods_node_events_enricher on alert with no node object: {event}") + return + + events_table_block = get_resource_events_table( + "*Node 
events:*", + node.kind, + node.metadata.name, + node.metadata.namespace, + included_types=params.included_types, + max_events=params.max_events, + ) + if events_table_block: + event.add_enrichment( + [events_table_block], + {SlackAnnotations.ATTACHMENT: True}, + enrichment_type=EnrichmentType.k8s_events, + title="Node Events", + ) + + @action def deployment_events_enricher(event: DeploymentEvent, params: ExtendedEventEnricherParams): """ diff --git a/playbooks/robusta_playbooks/playbook_utils.py b/playbooks/robusta_playbooks/playbook_utils.py new file mode 100644 index 000000000..bcc667621 --- /dev/null +++ b/playbooks/robusta_playbooks/playbook_utils.py @@ -0,0 +1,12 @@ +from typing import List + +from hikaru.model.rel_1_26 import Pod + + +def pod_row(pod: Pod) -> List[str]: + ready_condition = [condition.status for condition in pod.status.conditions if condition.type == "Ready"] + return [ + pod.metadata.namespace, + pod.metadata.name, + ready_condition[0] if ready_condition else "Unknown", + ] diff --git a/playbooks/robusta_playbooks/pod_evicted_enrichments.py b/playbooks/robusta_playbooks/pod_evicted_enrichments.py new file mode 100644 index 000000000..603c8d4f2 --- /dev/null +++ b/playbooks/robusta_playbooks/pod_evicted_enrichments.py @@ -0,0 +1,80 @@ +import logging +from typing import List + +from hikaru.model.rel_1_26 import Node, PodList + +from playbooks.robusta_playbooks.playbook_utils import pod_row +from robusta.api import ( + BaseBlock, + EnrichmentType, + Finding, + FindingSeverity, + PodEvent, + PodFindingSubject, + TableBlock, + action, +) + + +@action +def on_pod_evicted_enricher(event: PodEvent): + """ + Retrieves pod and node information for an OOMKilled pod + """ + pod = event.get_pod() + if not pod: + logging.error(f"cannot run on_pod_evicted_enricher on event with no pod: {event}") + return + + try: + node = Node.readNode(pod.spec.nodeName).obj + except Exception as e: + logging.error(f"Failed to read pod's node information: {e}") + return + 
+ finding = Finding( + title=f"Pod {pod.metadata.name} in namespace {pod.metadata.namespace} was Evicted", + aggregation_key="PodEvictedTriggered", + severity=FindingSeverity.HIGH, + subject=PodFindingSubject(pod), + ) + + node: Node = Node.readNode(pod.spec.nodeName).obj + node_labels = [("Node Name", pod.spec.nodeName)] + node_info_block = TableBlock( + [[k, v] for k, v in node_labels], + headers=["Field", "Value"], + table_name="*Node general info:*", + ) + node_status_block = TableBlock( + [[condition.type, condition.status] for condition in node.status.conditions], + headers=["Type", "Status"], + table_name="*Node status details:*", + ) + + allocatable_resources_block = TableBlock( + [[resource, value] for resource, value in node.status.allocatable.items()], + headers=["Resource", "Value"], + table_name="*Node Allocatable Resources:*", + ) + + finding.add_enrichment( + [node_info_block, node_status_block, allocatable_resources_block], + enrichment_type=EnrichmentType.node_info, + title="Node Info", + ) + + event.add_finding(finding) + + try: + pod_list = PodList.listPodForAllNamespaces(field_selector=f"spec.nodeName={node.metadata.name}").obj + except Exception as e: + logging.error(f"Failed to list pods for node {node.metadata.name}: {e}") + return + + effected_pods_rows = [pod_row(pod) for pod in pod_list.items] + block_list: List[BaseBlock] = [] + block_list.append( + TableBlock(effected_pods_rows, ["namespace", "name", "ready"], table_name="Pods running on the node") + ) + event.add_enrichment(block_list) From a648d82918d3e03f336fb92c4d3d78b66ebfd6e7 Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Thu, 11 Jul 2024 12:44:13 +0200 Subject: [PATCH 2/9] * Moved pod_row function dublicate --- .../robusta_playbooks/node_enrichments.py | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/playbooks/robusta_playbooks/node_enrichments.py b/playbooks/robusta_playbooks/node_enrichments.py index a444e316b..62711eb80 100644 --- 
a/playbooks/robusta_playbooks/node_enrichments.py +++ b/playbooks/robusta_playbooks/node_enrichments.py @@ -2,8 +2,11 @@ from typing import List from hikaru.model.rel_1_26 import Pod, PodList + +from playbooks.robusta_playbooks.playbook_utils import pod_row from robusta.api import ( BaseBlock, + EnrichmentType, FileBlock, Finding, FindingSeverity, @@ -19,19 +22,9 @@ TableBlock, action, create_node_graph_enrichment, - EnrichmentType ) -def pod_row(pod: Pod) -> List[str]: - ready_condition = [condition.status for condition in pod.status.conditions if condition.type == "Ready"] - return [ - pod.metadata.namespace, - pod.metadata.name, - ready_condition[0] if ready_condition else "Unknown", - ] - - def has_resource_request(pod: Pod, resource_type: str) -> bool: for container in pod.spec.containers: try: @@ -85,7 +78,7 @@ def node_running_pods_enricher(event: NodeEvent): effected_pods_rows = [pod_row(pod) for pod in pod_list.items] block_list.append( - TableBlock(effected_pods_rows, ["namespace", "name", "ready"], table_name=f"Pods running on the node") + TableBlock(effected_pods_rows, ["namespace", "name", "ready"], table_name="Pods running on the node") ) event.add_enrichment(block_list) @@ -127,7 +120,7 @@ def node_status_enricher(event: NodeEvent): logging.error(f"node_status_enricher was called on event without node : {event}") return - logging.info(f"node_status_enricher is depricated, use status_enricher instead") + logging.info("node_status_enricher is depricated, use status_enricher instead") event.add_enrichment( [ @@ -154,8 +147,9 @@ def node_dmesg_enricher(event: NodeEvent, params: PodRunningParams): ) if exec_result: event.add_enrichment( - [FileBlock(f"dmesg.log", exec_result.encode())], enrichment_type=EnrichmentType.text_file, - title="DMESG Info" + [FileBlock("dmesg.log", exec_result.encode())], + enrichment_type=EnrichmentType.text_file, + title="DMESG Info", ) @@ -189,8 +183,9 @@ def node_health_watcher(event: NodeChangeEvent): 
subject=KubeObjFindingSubject(event.obj), ) event.add_finding(finding) - event.add_enrichment([KubernetesDiffBlock([], event.old_obj, - event.obj, event.obj.metadata.name, kind=event.obj.kind)]) + event.add_enrichment( + [KubernetesDiffBlock([], event.old_obj, event.obj, event.obj.metadata.name, kind=event.obj.kind)] + ) node_status_enricher(event) From 01b7cfa2713cf966fbef021a5653e6f102ba9b9c Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Thu, 11 Jul 2024 14:40:24 +0200 Subject: [PATCH 3/9] *Fixed importing of pod_row --- playbooks/robusta_playbooks/node_enrichments.py | 2 +- playbooks/robusta_playbooks/pod_evicted_enrichments.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/robusta_playbooks/node_enrichments.py b/playbooks/robusta_playbooks/node_enrichments.py index 62711eb80..511568ac3 100644 --- a/playbooks/robusta_playbooks/node_enrichments.py +++ b/playbooks/robusta_playbooks/node_enrichments.py @@ -2,8 +2,8 @@ from typing import List from hikaru.model.rel_1_26 import Pod, PodList +from playbook_utils import pod_row -from playbooks.robusta_playbooks.playbook_utils import pod_row from robusta.api import ( BaseBlock, EnrichmentType, diff --git a/playbooks/robusta_playbooks/pod_evicted_enrichments.py b/playbooks/robusta_playbooks/pod_evicted_enrichments.py index 603c8d4f2..f193576b2 100644 --- a/playbooks/robusta_playbooks/pod_evicted_enrichments.py +++ b/playbooks/robusta_playbooks/pod_evicted_enrichments.py @@ -2,8 +2,8 @@ from typing import List from hikaru.model.rel_1_26 import Node, PodList +from robusta_playbooks.playbook_utils import pod_row -from playbooks.robusta_playbooks.playbook_utils import pod_row from robusta.api import ( BaseBlock, EnrichmentType, From c63d645e700b69066c17d06d46b7f21a2b2261eb Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Thu, 11 Jul 2024 14:44:09 +0200 Subject: [PATCH 4/9] *Fixed pod_row import in node_enrichment --- playbooks/robusta_playbooks/node_enrichments.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/robusta_playbooks/node_enrichments.py b/playbooks/robusta_playbooks/node_enrichments.py index 511568ac3..8c97372b8 100644 --- a/playbooks/robusta_playbooks/node_enrichments.py +++ b/playbooks/robusta_playbooks/node_enrichments.py @@ -2,7 +2,7 @@ from typing import List from hikaru.model.rel_1_26 import Pod, PodList -from playbook_utils import pod_row +from robusta_playbooks.playbook_utils import pod_row from robusta.api import ( BaseBlock, From abeb23ee072adfc97c1a6c3aa491988ed6878e16 Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Thu, 11 Jul 2024 15:17:17 +0200 Subject: [PATCH 5/9] *Updated import of the pod_row method --- playbooks/robusta_playbooks/pod_evicted_enrichments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/robusta_playbooks/pod_evicted_enrichments.py b/playbooks/robusta_playbooks/pod_evicted_enrichments.py index f193576b2..603c8d4f2 100644 --- a/playbooks/robusta_playbooks/pod_evicted_enrichments.py +++ b/playbooks/robusta_playbooks/pod_evicted_enrichments.py @@ -2,8 +2,8 @@ from typing import List from hikaru.model.rel_1_26 import Node, PodList -from robusta_playbooks.playbook_utils import pod_row +from playbooks.robusta_playbooks.playbook_utils import pod_row from robusta.api import ( BaseBlock, EnrichmentType, From 800d623d4fac09d65d60ce4fac6025672b239367 Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Fri, 12 Jul 2024 16:08:28 +0200 Subject: [PATCH 6/9] *Added node_enrichment_utils and refactored node related playbooks --- .../robusta_playbooks/node_enrichments.py | 38 ++++-------- playbooks/robusta_playbooks/playbook_utils.py | 12 ---- .../pod_evicted_enrichments.py | 42 ++++--------- src/robusta/api/__init__.py | 5 ++ .../core/playbooks/node_enrichment_utils.py | 60 +++++++++++++++++++ .../integrations/kubernetes/custom_models.py | 12 ++++ 6 files changed, 100 insertions(+), 69 deletions(-) delete mode 100644 
playbooks/robusta_playbooks/playbook_utils.py create mode 100644 src/robusta/core/playbooks/node_enrichment_utils.py diff --git a/playbooks/robusta_playbooks/node_enrichments.py b/playbooks/robusta_playbooks/node_enrichments.py index 8c97372b8..d92678f37 100644 --- a/playbooks/robusta_playbooks/node_enrichments.py +++ b/playbooks/robusta_playbooks/node_enrichments.py @@ -2,7 +2,6 @@ from typing import List from hikaru.model.rel_1_26 import Pod, PodList -from robusta_playbooks.playbook_utils import pod_row from robusta.api import ( BaseBlock, @@ -19,9 +18,11 @@ PodRunningParams, ResourceGraphEnricherParams, RobustaPod, - TableBlock, action, create_node_graph_enrichment, + get_node_allocatable_resources_table_block, + get_node_running_pods_table_block_or_none, + get_node_status_table_block, ) @@ -74,12 +75,10 @@ def node_running_pods_enricher(event: NodeEvent): return block_list: List[BaseBlock] = [] - pod_list: PodList = PodList.listPodForAllNamespaces(field_selector=f"spec.nodeName={node.metadata.name}").obj - - effected_pods_rows = [pod_row(pod) for pod in pod_list.items] - block_list.append( - TableBlock(effected_pods_rows, ["namespace", "name", "ready"], table_name="Pods running on the node") - ) + table_resources = get_node_running_pods_table_block_or_none(node) + if not table_resources: + return + block_list.append(table_resources) event.add_enrichment(block_list) @@ -97,13 +96,7 @@ def node_allocatable_resources_enricher(event: NodeEvent): block_list: List[BaseBlock] = [] if node: - block_list.append( - TableBlock( - [[k, v] for (k, v) in node.status.allocatable.items()], - ["resource", "value"], - table_name="Node Allocatable Resources - The amount of compute resources that are available for pods", - ) - ) + block_list.append(get_node_allocatable_resources_table_block(node)) event.add_enrichment(block_list) @@ -116,21 +109,14 @@ def node_status_enricher(event: NodeEvent): Can help troubleshooting Node issues. 
""" - if not event.get_node(): - logging.error(f"node_status_enricher was called on event without node : {event}") + node = event.get_node() + if not node: + logging.error("node_status_enricher was called on event without node : {event}") return logging.info("node_status_enricher is depricated, use status_enricher instead") - event.add_enrichment( - [ - TableBlock( - [[c.type, c.status] for c in event.get_node().status.conditions], - headers=["Type", "Status"], - table_name="*Node status details:*", - ), - ] - ) + event.add_enrichment(get_node_status_table_block(node)) @action diff --git a/playbooks/robusta_playbooks/playbook_utils.py b/playbooks/robusta_playbooks/playbook_utils.py deleted file mode 100644 index bcc667621..000000000 --- a/playbooks/robusta_playbooks/playbook_utils.py +++ /dev/null @@ -1,12 +0,0 @@ -from typing import List - -from hikaru.model.rel_1_26 import Pod - - -def pod_row(pod: Pod) -> List[str]: - ready_condition = [condition.status for condition in pod.status.conditions if condition.type == "Ready"] - return [ - pod.metadata.namespace, - pod.metadata.name, - ready_condition[0] if ready_condition else "Unknown", - ] diff --git a/playbooks/robusta_playbooks/pod_evicted_enrichments.py b/playbooks/robusta_playbooks/pod_evicted_enrichments.py index 603c8d4f2..38cd49199 100644 --- a/playbooks/robusta_playbooks/pod_evicted_enrichments.py +++ b/playbooks/robusta_playbooks/pod_evicted_enrichments.py @@ -1,11 +1,6 @@ import logging -from typing import List -from hikaru.model.rel_1_26 import Node, PodList - -from playbooks.robusta_playbooks.playbook_utils import pod_row from robusta.api import ( - BaseBlock, EnrichmentType, Finding, FindingSeverity, @@ -13,6 +8,9 @@ PodFindingSubject, TableBlock, action, + get_node_allocatable_resources_table_block, + get_node_running_pods_table_block_or_none, + get_node_status_table_block, ) @@ -26,10 +24,9 @@ def on_pod_evicted_enricher(event: PodEvent): logging.error(f"cannot run on_pod_evicted_enricher on event 
with no pod: {event}") return - try: - node = Node.readNode(pod.spec.nodeName).obj - except Exception as e: - logging.error(f"Failed to read pod's node information: {e}") + node = pod.get_node() + if not node: + logging.error(f"cannot run on_pod_evicted_enricher on event with no node: {event}") return finding = Finding( @@ -39,23 +36,16 @@ def on_pod_evicted_enricher(event: PodEvent): subject=PodFindingSubject(pod), ) - node: Node = Node.readNode(pod.spec.nodeName).obj node_labels = [("Node Name", pod.spec.nodeName)] node_info_block = TableBlock( [[k, v] for k, v in node_labels], headers=["Field", "Value"], table_name="*Node general info:*", ) - node_status_block = TableBlock( - [[condition.type, condition.status] for condition in node.status.conditions], - headers=["Type", "Status"], - table_name="*Node status details:*", - ) + node_status_block = get_node_status_table_block(node) - allocatable_resources_block = TableBlock( - [[resource, value] for resource, value in node.status.allocatable.items()], - headers=["Resource", "Value"], - table_name="*Node Allocatable Resources:*", + allocatable_resources_block = get_node_allocatable_resources_table_block( + node, table_name="*Node Allocatable Resources:*" ) finding.add_enrichment( @@ -66,15 +56,5 @@ def on_pod_evicted_enricher(event: PodEvent): event.add_finding(finding) - try: - pod_list = PodList.listPodForAllNamespaces(field_selector=f"spec.nodeName={node.metadata.name}").obj - except Exception as e: - logging.error(f"Failed to list pods for node {node.metadata.name}: {e}") - return - - effected_pods_rows = [pod_row(pod) for pod in pod_list.items] - block_list: List[BaseBlock] = [] - block_list.append( - TableBlock(effected_pods_rows, ["namespace", "name", "ready"], table_name="Pods running on the node") - ) - event.add_enrichment(block_list) + running_nodes_table = get_node_running_pods_table_block_or_none(node) + event.add_enrichment(running_nodes_table) diff --git a/src/robusta/api/__init__.py 
b/src/robusta/api/__init__.py index 418d8765a..1af7ff4f9 100644 --- a/src/robusta/api/__init__.py +++ b/src/robusta/api/__init__.py @@ -112,6 +112,11 @@ from robusta.core.playbooks.container_playbook_utils import create_container_graph from robusta.core.playbooks.crash_reporter import send_crash_report from robusta.core.playbooks.job_utils import CONTROLLER_UID, get_job_all_pods, get_job_latest_pod, get_job_selector +from robusta.core.playbooks.node_enrichment_utils import ( + get_node_allocatable_resources_table_block, + get_node_running_pods_table_block_or_none, + get_node_status_table_block, +) from robusta.core.playbooks.node_playbook_utils import create_node_graph_enrichment from robusta.core.playbooks.pod_utils.crashloop_utils import get_crash_report_enrichments from robusta.core.playbooks.pod_utils.imagepull_utils import ( diff --git a/src/robusta/core/playbooks/node_enrichment_utils.py b/src/robusta/core/playbooks/node_enrichment_utils.py new file mode 100644 index 000000000..83ca1f8a4 --- /dev/null +++ b/src/robusta/core/playbooks/node_enrichment_utils.py @@ -0,0 +1,60 @@ +import logging +from typing import Optional + +from hikaru.model.rel_1_26 import PodList + +from robusta.core.reporting import TableBlock +from robusta.integrations.kubernetes.autogenerated.events import NodeEvent + + +def get_node_allocatable_resources_table_block( + node: NodeEvent, + table_name: Optional[ + str + ] = "Node Allocatable Resources - The amount of compute resources that are available for pods", +) -> TableBlock: + """ + Enrich the finding with the node resources available for allocation. + + Can help troubleshooting node issues. + """ + return TableBlock( + [[k, v] for (k, v) in node.status.allocatable.items()], + ["resource", "value"], + table_name=table_name, + ) + + +def get_node_status_table_block(node: NodeEvent, table_name: Optional[str] = "*Node status details:*") -> TableBlock: + """ + Enrich the finding with the node resources available for allocation. 
+ + Can help troubleshooting node issues. + """ + + return TableBlock( + [[c.type, c.status] for c in node.status.conditions], + headers=["Type", "Status"], + table_name=table_name, + ) + + +def get_node_running_pods_table_block_or_none( + node: NodeEvent, table_name: Optional[str] = "Pods running on the node" +) -> Optional[TableBlock]: + """ + Enrich the finding with the node resources available for allocation. + + Can help troubleshooting node issues. + """ + try: + pod_list = PodList.listPodForAllNamespaces(field_selector=f"spec.nodeName={node.metadata.name}").obj + except Exception as e: + logging.error(f"Failed to list pods for node {node.metadata.name}: {e}") + return None + + effected_pods_rows = [ + [pod.metadata.namespace, pod.metadata.name, pod.is_pod_in_ready_condition()] for pod in pod_list.items + ] + + return TableBlock(effected_pods_rows, ["namespace", "name", "ready"], table_name="Pods running on the node") diff --git a/src/robusta/integrations/kubernetes/custom_models.py b/src/robusta/integrations/kubernetes/custom_models.py index 8450e7c85..b24aa4d72 100644 --- a/src/robusta/integrations/kubernetes/custom_models.py +++ b/src/robusta/integrations/kubernetes/custom_models.py @@ -334,6 +334,14 @@ def extract_container_id(status: ContainerStatus) -> str: runtime, container_id = status.containerID.split("://") return container_id + def get_node(self) -> Optional[Node]: + try: + node = Node.readNode(self.spec.nodeName).obj + except Exception as e: + logging.error(f"Failed to read pod's node information: {e}") + return None + return node + def get_processes(self, custom_annotations: Optional[Dict[str, str]] = None) -> List[Process]: container_ids = " ".join([self.extract_container_id(s) for s in self.status.containerStatuses]) output = RobustaPod.exec_in_debugger_pod( @@ -375,6 +383,10 @@ def upload_file(self, path: str, contents: bytes, container: Optional[str] = Non container=container, ) + def is_pod_in_ready_condition(self) -> str: + ready_condition 
= [condition.status for condition in self.status.conditions if condition.type == "Ready"] + return ready_condition[0] if ready_condition else "Unknown" + @staticmethod def find_pods_with_direct_owner(namespace: str, owner_uid: str) -> List["RobustaPod"]: all_pods: List["RobustaPod"] = PodList.listNamespacedPod(namespace).obj.items From e2e70e35b8563558b069a050861a172ab76db6c5 Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Fri, 12 Jul 2024 17:58:39 +0200 Subject: [PATCH 7/9] *Removed log_enrichment from PodEvicted because it can't retrieve logs --- helm/robusta/values.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/helm/robusta/values.yaml b/helm/robusta/values.yaml index 086184678..3d75c355f 100644 --- a/helm/robusta/values.yaml +++ b/helm/robusta/values.yaml @@ -183,7 +183,6 @@ builtinPlaybooks: - on_pod_evicted_enricher: {} - pod_events_enricher: {} - enrich_pod_with_node_events: {} - - logs_enricher: {} - name: "PodOOMKill" triggers: From a4812a313c64d049f3088dbb53e2e708ca05ce04 Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Mon, 15 Jul 2024 17:17:17 +0200 Subject: [PATCH 8/9] Added tests for node_enrichment_utils, fixed typing for node_enrichment_utils --- .../robusta_playbooks/event_enrichments.py | 2 +- playbooks/robusta_playbooks/oom_killer.py | 61 ++++++----- .../robusta_playbooks/pod_enrichments.py | 7 +- .../core/playbooks/node_enrichment_utils.py | 11 +- tests/test_node_enrichment_utils.py | 103 ++++++++++++++++++ 5 files changed, 146 insertions(+), 38 deletions(-) create mode 100644 tests/test_node_enrichment_utils.py diff --git a/playbooks/robusta_playbooks/event_enrichments.py b/playbooks/robusta_playbooks/event_enrichments.py index 7b244badf..08db440e6 100644 --- a/playbooks/robusta_playbooks/event_enrichments.py +++ b/playbooks/robusta_playbooks/event_enrichments.py @@ -278,7 +278,7 @@ def enrich_pod_with_node_events(event: PodEvent, params: EventEnricherParams): Given a Kubernetes pod, fetch related events in the near past for its 
node """ pod = event.get_pod() - node: Node = Node.readNode(pod.spec.nodeName).obj + node = pod.get_node() if not node: logging.error(f"cannot run pods_node_events_enricher on alert with no node object: {event}") return diff --git a/playbooks/robusta_playbooks/oom_killer.py b/playbooks/robusta_playbooks/oom_killer.py index f13439124..2b9e6835d 100644 --- a/playbooks/robusta_playbooks/oom_killer.py +++ b/playbooks/robusta_playbooks/oom_killer.py @@ -8,8 +8,11 @@ from hikaru.model.rel_1_26 import Node, Pod, PodList, ResourceRequirements from robusta.api import ( + EnrichmentType, Finding, FindingSeverity, + OOMGraphEnricherParams, + OomKillParams, PodContainer, PodEvent, PodFindingSubject, @@ -17,18 +20,15 @@ PrometheusAlert, PrometheusKubernetesAlert, RendererType, - OomKillParams, - OOMGraphEnricherParams, TableBlock, action, create_container_graph, + create_node_graph_enrichment, get_oom_killed_container, parse_kubernetes_datetime_to_ms, pod_most_recent_oom_killed_container, - EnrichmentType, - create_node_graph_enrichment, ) -from robusta.core.model.base_params import PrometheusParams, LogEnricherParams +from robusta.core.model.base_params import LogEnricherParams, PrometheusParams from robusta.core.playbooks.oom_killer_utils import logs_enricher from robusta.core.reporting.blocks import GraphBlock from robusta.integrations.resource_analysis.memory_analyzer import MemoryAnalyzer @@ -53,12 +53,17 @@ class OomKillerEnricherParams(PrometheusParams): NODE_MEMORY_THRESHOLD = 0.95 -def get_oomkilled_graph(oomkilled_container: PodContainer, pod: Pod, params: OOMGraphEnricherParams, - metrics_legends_labels: Optional[List[str]] = None,) -> GraphBlock: +def get_oomkilled_graph( + oomkilled_container: PodContainer, + pod: Pod, + params: OOMGraphEnricherParams, + metrics_legends_labels: Optional[List[str]] = None, +) -> GraphBlock: if params.delay_graph_s > 0: time.sleep(params.delay_graph_s) - return create_container_graph(params, pod, oomkilled_container, show_limit=True, 
- metrics_legends_labels=metrics_legends_labels) + return create_container_graph( + params, pod, oomkilled_container, show_limit=True, metrics_legends_labels=metrics_legends_labels + ) @action @@ -74,8 +79,7 @@ def oomkilled_container_graph_enricher(event: PodEvent, params: OOMGraphEnricher if not oomkilled_container: logging.error("Unable to find oomkilled container") return - container_graph = get_oomkilled_graph(oomkilled_container, pod, params, - metrics_legends_labels=["container"]) + container_graph = get_oomkilled_graph(oomkilled_container, pod, params, metrics_legends_labels=["container"]) event.add_enrichment([container_graph], enrichment_type=EnrichmentType.graph, title="Container Info") @@ -96,7 +100,7 @@ def pod_oom_killer_enricher(event: PodEvent, params: OomKillParams): subject=PodFindingSubject(pod), ) - node: Node = Node.readNode(pod.spec.nodeName).obj + node = pod.get_node() labels = [ ("Pod", pod.metadata.name), ("Namespace", pod.metadata.namespace), @@ -112,13 +116,16 @@ def pod_oom_killer_enricher(event: PodEvent, params: OomKillParams): ( "Node allocated memory", f"{allocated_precent:.2f}% out of {allocatable_memory}MB allocatable", - )] + ), + ] - blocks = [TableBlock( - [[k, v] for (k, v) in node_labels], - ["field", "value"], - table_name="*Node Info*", - )] + blocks = [ + TableBlock( + [[k, v] for (k, v) in node_labels], + ["field", "value"], + table_name="*Node Info*", + ) + ] if params.node_memory_graph: node_graph = create_node_graph_enrichment(params, node, metrics_legends_labels=["pod"]) blocks.append(node_graph) @@ -151,18 +158,18 @@ def pod_oom_killer_enricher(event: PodEvent, params: OomKillParams): if oom_killed_status.terminated.finishedAt: container_labels.append(("Container finished at", oom_killed_status.terminated.finishedAt)) - blocks = [TableBlock( - [[k, v] for (k, v) in container_labels], - ["field", "value"], - table_name="*Container Info*", - )] + blocks = [ + TableBlock( + [[k, v] for (k, v) in container_labels], + 
["field", "value"], + table_name="*Container Info*", + ) + ] if params.container_memory_graph and oomkilled_container.container: - container_graph = get_oomkilled_graph(oomkilled_container, pod, params, - metrics_legends_labels=["pod"]) + container_graph = get_oomkilled_graph(oomkilled_container, pod, params, metrics_legends_labels=["pod"]) blocks.append(container_graph) - finding.add_enrichment(blocks, enrichment_type=EnrichmentType.container_info, - title="Container Info") + finding.add_enrichment(blocks, enrichment_type=EnrichmentType.container_info, title="Container Info") event.add_finding(finding) if params.attach_logs and container_name is not None: diff --git a/playbooks/robusta_playbooks/pod_enrichments.py b/playbooks/robusta_playbooks/pod_enrichments.py index 1ed82b296..2ca3fdc27 100644 --- a/playbooks/robusta_playbooks/pod_enrichments.py +++ b/playbooks/robusta_playbooks/pod_enrichments.py @@ -1,8 +1,8 @@ import logging from datetime import datetime -from hikaru.model.rel_1_26 import Node from robusta.api import ( + EnrichmentType, PodEvent, PodResourceGraphEnricherParams, ResourceChartItemType, @@ -13,7 +13,6 @@ create_node_graph_enrichment, create_resource_enrichment, pod_limits, - EnrichmentType ) from robusta.core.model.pods import pod_requests @@ -64,7 +63,7 @@ def pod_graph_enricher(pod_event: PodEvent, params: PodResourceGraphEnricherPara prometheus_params=params, graph_duration_minutes=params.graph_duration_minutes, lines=limit_lines, - metrics_legends_labels=["pod"] + metrics_legends_labels=["pod"], ) pod_event.add_enrichment([graph_enrichment], enrichment_type=EnrichmentType.graph, title="Pod Resources") @@ -78,7 +77,7 @@ def pod_node_graph_enricher(pod_event: PodEvent, params: ResourceGraphEnricherPa if not pod: logging.error(f"cannot run pod_node_graph_enricher on event with no pod: {pod_event}") return - node: Node = Node.readNode(pod.spec.nodeName).obj + node = pod.get_node() if not node: logging.warning(f"Node {pod.spec.nodeName} not found 
for pod {pod.metadata.name}") return diff --git a/src/robusta/core/playbooks/node_enrichment_utils.py b/src/robusta/core/playbooks/node_enrichment_utils.py index 83ca1f8a4..75d9e9fe4 100644 --- a/src/robusta/core/playbooks/node_enrichment_utils.py +++ b/src/robusta/core/playbooks/node_enrichment_utils.py @@ -1,14 +1,13 @@ import logging from typing import Optional -from hikaru.model.rel_1_26 import PodList +from hikaru.model.rel_1_26 import Node, PodList from robusta.core.reporting import TableBlock -from robusta.integrations.kubernetes.autogenerated.events import NodeEvent def get_node_allocatable_resources_table_block( - node: NodeEvent, + node: Node, table_name: Optional[ str ] = "Node Allocatable Resources - The amount of compute resources that are available for pods", @@ -25,7 +24,7 @@ def get_node_allocatable_resources_table_block( ) -def get_node_status_table_block(node: NodeEvent, table_name: Optional[str] = "*Node status details:*") -> TableBlock: +def get_node_status_table_block(node: Node, table_name: Optional[str] = "*Node status details:*") -> TableBlock: """ Enrich the finding with the node resources available for allocation. @@ -40,7 +39,7 @@ def get_node_status_table_block(node: NodeEvent, table_name: Optional[str] = "*N def get_node_running_pods_table_block_or_none( - node: NodeEvent, table_name: Optional[str] = "Pods running on the node" + node: Node, table_name: Optional[str] = "Pods running on the node" ) -> Optional[TableBlock]: """ Enrich the finding with the node resources available for allocation. 
@@ -57,4 +56,4 @@ def get_node_running_pods_table_block_or_none( [pod.metadata.namespace, pod.metadata.name, pod.is_pod_in_ready_condition()] for pod in pod_list.items ] - return TableBlock(effected_pods_rows, ["namespace", "name", "ready"], table_name="Pods running on the node") + return TableBlock(effected_pods_rows, ["namespace", "name", "ready"], table_name=table_name) diff --git a/tests/test_node_enrichment_utils.py b/tests/test_node_enrichment_utils.py new file mode 100644 index 000000000..a020d0e80 --- /dev/null +++ b/tests/test_node_enrichment_utils.py @@ -0,0 +1,103 @@ +from unittest.mock import patch + +import pytest +from hikaru.model.rel_1_26 import Node, NodeCondition, NodeStatus, ObjectMeta, PodCondition, PodList, PodStatus + +from robusta.core.playbooks.node_enrichment_utils import ( + get_node_allocatable_resources_table_block, + get_node_running_pods_table_block_or_none, + get_node_status_table_block, +) +from robusta.core.reporting import TableBlock +from robusta.integrations.kubernetes.custom_models import RobustaPod + + +@pytest.fixture +def create_test_node(): + def _create_test_node(allocatable=None, conditions=None): + # this way of Node object initialization is taken from hikaru repo + # https://github.com/haxsaw/hikaru/blob/bb89e0ddc2de241c2d04da9f720b01ce46473fb1/tests/basic_tests_rel_1_26.py#L1634 + status = NodeStatus(allocatable=allocatable, conditions=conditions) + return Node(status=status) + + return _create_test_node + + +@pytest.fixture +def create_test_pod(): + def _create_test_pod(name, namespace, conditions): + return RobustaPod( + metadata=ObjectMeta(name=name, namespace=namespace), + status=PodStatus( + conditions=[ + PodCondition(status=condition["status"], type=condition["type"]) for condition in conditions + ] + ), + ) + + return _create_test_pod + + +def test_get_node_allocatable_resources_table_block(create_test_node): + test_node = create_test_node(allocatable={"cpu": "4", "memory": "8Gi"}) + + table_block = 
get_node_allocatable_resources_table_block(test_node) + + assert isinstance(table_block, TableBlock) + assert table_block.headers == ["resource", "value"] + assert ( + table_block.table_name + == "Node Allocatable Resources - The amount of compute resources that are available for pods" + ) + assert table_block.rows == [["cpu", "4"], ["memory", "8Gi"]] + + +def test_get_node_status_table_block(create_test_node): + first_node_condition = NodeCondition(type="Ready", status="True") + second_node_condition = NodeCondition(type="DiskPressure", status="False") + test_node = create_test_node(conditions=[first_node_condition, second_node_condition]) + + table_block = get_node_status_table_block(test_node) + + assert isinstance(table_block, TableBlock) + assert table_block.headers == ["Type", "Status"] + assert table_block.table_name == "*Node status details:*" + assert table_block.rows == [ + [first_node_condition.type, first_node_condition.status], + [second_node_condition.type, second_node_condition.status], + ] + + +def test_get_node_running_pods_table_block_or_none(create_test_node, create_test_pod): + test_node = Node(metadata=ObjectMeta(name="test-node")) + pods = [ + create_test_pod("pod1", "default", [{"status": "True", "type": "Ready"}]), + create_test_pod("pod2", "default", [{"status": "False", "type": "PodScheduled"}]), + create_test_pod("pod3", "default", [{"status": "Unknown", "type": "ContainersReady"}]), + ] + pod_list = PodList(pods) + + with patch("robusta.core.playbooks.node_enrichment_utils.PodList.listPodForAllNamespaces") as patched_list_pods: + patched_list_pods.return_value.obj = pod_list + + table_block = get_node_running_pods_table_block_or_none(test_node) + + assert isinstance(table_block, TableBlock) + assert table_block.headers == ["namespace", "name", "ready"] + assert table_block.table_name == "Pods running on the node" + assert table_block.rows == [ + ["default", "pod1", "True"], + ["default", "pod2", "Unknown"], + ["default", "pod3", 
"Unknown"], + ] + + +def test_get_node_running_pods_table_block_or_none_failure(): + test_node = Node(metadata=ObjectMeta(name="test-node")) + with patch( + "robusta.core.playbooks.node_enrichment_utils.PodList.listPodForAllNamespaces", + side_effect=Exception("API call failed"), + ): + + table_block = get_node_running_pods_table_block_or_none(test_node) + assert table_block is None From 6c7e48f841a0205d719e59169c1f5fee40f69007 Mon Sep 17 00:00:00 2001 From: Dima Chievtaiev Date: Tue, 16 Jul 2024 12:13:46 +0200 Subject: [PATCH 9/9] *Refactored enrich_pod_with_node_events to use pods to get events instead of node --- helm/robusta/values.yaml | 2 +- playbooks/robusta_playbooks/event_enrichments.py | 10 ++-------- playbooks/robusta_playbooks/pod_evicted_enrichments.py | 6 +++--- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/helm/robusta/values.yaml b/helm/robusta/values.yaml index 84ca8d628..129485784 100644 --- a/helm/robusta/values.yaml +++ b/helm/robusta/values.yaml @@ -180,7 +180,7 @@ builtinPlaybooks: triggers: - on_pod_evicted: {} actions: - - on_pod_evicted_enricher: {} + - pod_evicted_enricher: {} - pod_events_enricher: {} - enrich_pod_with_node_events: {} diff --git a/playbooks/robusta_playbooks/event_enrichments.py b/playbooks/robusta_playbooks/event_enrichments.py index 08db440e6..7868ca243 100644 --- a/playbooks/robusta_playbooks/event_enrichments.py +++ b/playbooks/robusta_playbooks/event_enrichments.py @@ -278,16 +278,10 @@ def enrich_pod_with_node_events(event: PodEvent, params: EventEnricherParams): Given a Kubernetes pod, fetch related events in the near past for its node """ pod = event.get_pod() - node = pod.get_node() - if not node: - logging.error(f"cannot run pods_node_events_enricher on alert with no node object: {event}") - return - events_table_block = get_resource_events_table( "*Node events:*", - node.kind, - node.metadata.name, - node.metadata.namespace, + kind="Node", + name=pod.spec.nodeName, 
included_types=params.included_types, max_events=params.max_events, ) diff --git a/playbooks/robusta_playbooks/pod_evicted_enrichments.py b/playbooks/robusta_playbooks/pod_evicted_enrichments.py index 38cd49199..fd44fac9a 100644 --- a/playbooks/robusta_playbooks/pod_evicted_enrichments.py +++ b/playbooks/robusta_playbooks/pod_evicted_enrichments.py @@ -15,18 +15,18 @@ @action -def on_pod_evicted_enricher(event: PodEvent): +def pod_evicted_enricher(event: PodEvent): """ - Retrieves pod and node information for an OOMKilled pod + Retrieves pod and node information for an evicted pod """ pod = event.get_pod() if not pod: - logging.error(f"cannot run on_pod_evicted_enricher on event with no pod: {event}") + logging.error(f"cannot run pod_evicted_enricher on event with no pod: {event}") return node = pod.get_node() if not node: - logging.error(f"cannot run on_pod_evicted_enricher on event with no node: {event}") + logging.error(f"cannot run pod_evicted_enricher on event with no node: {event}") return finding = Finding(