Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added PodEvicted default event #1497

Merged
merged 17 commits into from
Jul 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
76b06ff
* Added on_pod_evicted_enricher action
itisallgood Jul 11, 2024
a648d82
* Moved pod_row function duplicate
itisallgood Jul 11, 2024
01b7cfa
*Fixed importing of pod_row
itisallgood Jul 11, 2024
c63d645
*Fixed pod_row import in node_enrichment
itisallgood Jul 11, 2024
f48648b
Merge branch 'master' into add-visibility-out-of-the-box-into-evictions
itisallgood Jul 11, 2024
abeb23e
*Updated import of the pod_row method
itisallgood Jul 11, 2024
800d623
*Added node_enrichment_utils and refactored node related playbooks
itisallgood Jul 12, 2024
7aae9a6
Merge branch 'add-visibility-out-of-the-box-into-evictions' of github…
itisallgood Jul 12, 2024
35c2401
Merge branch 'master' into add-visibility-out-of-the-box-into-evictions
itisallgood Jul 12, 2024
e2e70e3
*Removed log_enrichment from PodEvicted because it can't retrieve logs
itisallgood Jul 12, 2024
2d14494
Merge branch 'master' into add-visibility-out-of-the-box-into-evictions
itisallgood Jul 15, 2024
a4812a3
Added tests for node_enrichment_utils, fixed typing for node_enrichme…
itisallgood Jul 15, 2024
b67fef3
Merge branch 'add-visibility-out-of-the-box-into-evictions' of github…
itisallgood Jul 15, 2024
86bc134
Merge branch 'master' into add-visibility-out-of-the-box-into-evictions
itisallgood Jul 15, 2024
6c7e48f
*Refactored enrich_pod_with_node_events to use pods to get events ins…
itisallgood Jul 16, 2024
12fe317
Merge branch 'add-visibility-out-of-the-box-into-evictions' of github…
itisallgood Jul 16, 2024
e97a166
Merge branch 'master' into add-visibility-out-of-the-box-into-evictions
itisallgood Jul 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions helm/robusta/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,14 @@ builtinPlaybooks:
- image_pull_backoff_reporter: {}

# playbooks for non-prometheus based monitoring that use prometheus for enrichment
- name: "PodEvicted"
triggers:
- on_pod_evicted: {}
actions:
- pod_evicted_enricher: {}
- pod_events_enricher: {}
- enrich_pod_with_node_events: {}

- name: "PodOOMKill"
triggers:
- on_pod_oom_killed:
Expand Down
22 changes: 22 additions & 0 deletions playbooks/robusta_playbooks/event_enrichments.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,28 @@ def pod_events_enricher(event: PodEvent, params: EventEnricherParams):
)


@action
def enrich_pod_with_node_events(event: PodEvent, params: EventEnricherParams):
    """
    Given a Kubernetes pod, fetch related events in the near past for its node

    The resulting table is attached to the event as a "Node Events" enrichment.
    Nothing is attached when the event carries no pod, when the pod has no node
    name, or when no events table is produced for the node.
    """
    # Local import: the file's top-level import block is not visible from this block.
    import logging

    pod = event.get_pod()
    if not pod:
        logging.error(f"cannot run enrich_pod_with_node_events on event with no pod: {event}")
        return

    node_name = pod.spec.nodeName
    if not node_name:
        # Without a node name there is no Node object whose events could be looked up.
        logging.warning(f"Node name is not set for pod {pod.metadata.name}, skipping node events enrichment")
        return

    events_table_block = get_resource_events_table(
        "*Node events:*",
        kind="Node",
        name=node_name,
        included_types=params.included_types,
        max_events=params.max_events,
    )
    if not events_table_block:
        return

    event.add_enrichment(
        [events_table_block],
        {SlackAnnotations.ATTACHMENT: True},
        enrichment_type=EnrichmentType.k8s_events,
        title="Node Events",
    )


@action
def deployment_events_enricher(event: DeploymentEvent, params: ExtendedEventEnricherParams):
"""
Expand Down
61 changes: 21 additions & 40 deletions playbooks/robusta_playbooks/node_enrichments.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
from typing import List

from hikaru.model.rel_1_26 import Pod, PodList

from robusta.api import (
BaseBlock,
EnrichmentType,
FileBlock,
Finding,
FindingSeverity,
Expand All @@ -16,22 +18,14 @@
PodRunningParams,
ResourceGraphEnricherParams,
RobustaPod,
TableBlock,
action,
create_node_graph_enrichment,
EnrichmentType
get_node_allocatable_resources_table_block,
get_node_running_pods_table_block_or_none,
get_node_status_table_block,
)


def pod_row(pod: Pod) -> List[str]:
ready_condition = [condition.status for condition in pod.status.conditions if condition.type == "Ready"]
return [
pod.metadata.namespace,
pod.metadata.name,
ready_condition[0] if ready_condition else "Unknown",
]


def has_resource_request(pod: Pod, resource_type: str) -> bool:
for container in pod.spec.containers:
try:
Expand Down Expand Up @@ -81,12 +75,10 @@ def node_running_pods_enricher(event: NodeEvent):
return

block_list: List[BaseBlock] = []
pod_list: PodList = PodList.listPodForAllNamespaces(field_selector=f"spec.nodeName={node.metadata.name}").obj

effected_pods_rows = [pod_row(pod) for pod in pod_list.items]
block_list.append(
TableBlock(effected_pods_rows, ["namespace", "name", "ready"], table_name=f"Pods running on the node")
)
table_resources = get_node_running_pods_table_block_or_none(node)
if not table_resources:
return
block_list.append(table_resources)
event.add_enrichment(block_list)


Expand All @@ -104,13 +96,7 @@ def node_allocatable_resources_enricher(event: NodeEvent):

block_list: List[BaseBlock] = []
if node:
block_list.append(
TableBlock(
[[k, v] for (k, v) in node.status.allocatable.items()],
["resource", "value"],
table_name="Node Allocatable Resources - The amount of compute resources that are available for pods",
)
)
block_list.append(get_node_allocatable_resources_table_block(node))
event.add_enrichment(block_list)


Expand All @@ -123,21 +109,14 @@ def node_status_enricher(event: NodeEvent):

Can help troubleshooting Node issues.
"""
if not event.get_node():
logging.error(f"node_status_enricher was called on event without node : {event}")
node = event.get_node()
if not node:
logging.error("node_status_enricher was called on event without node : {event}")
return

logging.info(f"node_status_enricher is depricated, use status_enricher instead")
logging.info("node_status_enricher is depricated, use status_enricher instead")

event.add_enrichment(
[
TableBlock(
[[c.type, c.status] for c in event.get_node().status.conditions],
headers=["Type", "Status"],
table_name="*Node status details:*",
),
]
)
event.add_enrichment(get_node_status_table_block(node))


@action
Expand All @@ -154,8 +133,9 @@ def node_dmesg_enricher(event: NodeEvent, params: PodRunningParams):
)
if exec_result:
event.add_enrichment(
[FileBlock(f"dmesg.log", exec_result.encode())], enrichment_type=EnrichmentType.text_file,
title="DMESG Info"
[FileBlock("dmesg.log", exec_result.encode())],
enrichment_type=EnrichmentType.text_file,
title="DMESG Info",
)


Expand Down Expand Up @@ -189,8 +169,9 @@ def node_health_watcher(event: NodeChangeEvent):
subject=KubeObjFindingSubject(event.obj),
)
event.add_finding(finding)
event.add_enrichment([KubernetesDiffBlock([], event.old_obj,
event.obj, event.obj.metadata.name, kind=event.obj.kind)])
event.add_enrichment(
[KubernetesDiffBlock([], event.old_obj, event.obj, event.obj.metadata.name, kind=event.obj.kind)]
)
node_status_enricher(event)


Expand Down
61 changes: 34 additions & 27 deletions playbooks/robusta_playbooks/oom_killer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,27 @@
from hikaru.model.rel_1_26 import Node, Pod, PodList, ResourceRequirements

from robusta.api import (
EnrichmentType,
Finding,
FindingSeverity,
OOMGraphEnricherParams,
OomKillParams,
PodContainer,
PodEvent,
PodFindingSubject,
PodResources,
PrometheusAlert,
PrometheusKubernetesAlert,
RendererType,
OomKillParams,
OOMGraphEnricherParams,
TableBlock,
action,
create_container_graph,
create_node_graph_enrichment,
get_oom_killed_container,
parse_kubernetes_datetime_to_ms,
pod_most_recent_oom_killed_container,
EnrichmentType,
create_node_graph_enrichment,
)
from robusta.core.model.base_params import PrometheusParams, LogEnricherParams
from robusta.core.model.base_params import LogEnricherParams, PrometheusParams
from robusta.core.playbooks.oom_killer_utils import logs_enricher
from robusta.core.reporting.blocks import GraphBlock
from robusta.integrations.resource_analysis.memory_analyzer import MemoryAnalyzer
Expand All @@ -53,12 +53,17 @@ class OomKillerEnricherParams(PrometheusParams):
NODE_MEMORY_THRESHOLD = 0.95


def get_oomkilled_graph(oomkilled_container: PodContainer, pod: Pod, params: OOMGraphEnricherParams,
metrics_legends_labels: Optional[List[str]] = None,) -> GraphBlock:
def get_oomkilled_graph(
oomkilled_container: PodContainer,
pod: Pod,
params: OOMGraphEnricherParams,
metrics_legends_labels: Optional[List[str]] = None,
) -> GraphBlock:
if params.delay_graph_s > 0:
time.sleep(params.delay_graph_s)
return create_container_graph(params, pod, oomkilled_container, show_limit=True,
metrics_legends_labels=metrics_legends_labels)
return create_container_graph(
params, pod, oomkilled_container, show_limit=True, metrics_legends_labels=metrics_legends_labels
)


@action
Expand All @@ -74,8 +79,7 @@ def oomkilled_container_graph_enricher(event: PodEvent, params: OOMGraphEnricher
if not oomkilled_container:
logging.error("Unable to find oomkilled container")
return
container_graph = get_oomkilled_graph(oomkilled_container, pod, params,
metrics_legends_labels=["container"])
container_graph = get_oomkilled_graph(oomkilled_container, pod, params, metrics_legends_labels=["container"])
event.add_enrichment([container_graph], enrichment_type=EnrichmentType.graph, title="Container Info")


Expand All @@ -96,7 +100,7 @@ def pod_oom_killer_enricher(event: PodEvent, params: OomKillParams):
subject=PodFindingSubject(pod),
)

node: Node = Node.readNode(pod.spec.nodeName).obj
node = pod.get_node()
labels = [
("Pod", pod.metadata.name),
("Namespace", pod.metadata.namespace),
Expand All @@ -112,13 +116,16 @@ def pod_oom_killer_enricher(event: PodEvent, params: OomKillParams):
(
"Node allocated memory",
f"{allocated_precent:.2f}% out of {allocatable_memory}MB allocatable",
)]
),
]

blocks = [TableBlock(
[[k, v] for (k, v) in node_labels],
["field", "value"],
table_name="*Node Info*",
)]
blocks = [
TableBlock(
[[k, v] for (k, v) in node_labels],
["field", "value"],
table_name="*Node Info*",
)
]
if params.node_memory_graph:
node_graph = create_node_graph_enrichment(params, node, metrics_legends_labels=["pod"])
blocks.append(node_graph)
Expand Down Expand Up @@ -151,18 +158,18 @@ def pod_oom_killer_enricher(event: PodEvent, params: OomKillParams):
if oom_killed_status.terminated.finishedAt:
container_labels.append(("Container finished at", oom_killed_status.terminated.finishedAt))

blocks = [TableBlock(
[[k, v] for (k, v) in container_labels],
["field", "value"],
table_name="*Container Info*",
)]
blocks = [
TableBlock(
[[k, v] for (k, v) in container_labels],
["field", "value"],
table_name="*Container Info*",
)
]
if params.container_memory_graph and oomkilled_container.container:
container_graph = get_oomkilled_graph(oomkilled_container, pod, params,
metrics_legends_labels=["pod"])
container_graph = get_oomkilled_graph(oomkilled_container, pod, params, metrics_legends_labels=["pod"])
blocks.append(container_graph)

finding.add_enrichment(blocks, enrichment_type=EnrichmentType.container_info,
title="Container Info")
finding.add_enrichment(blocks, enrichment_type=EnrichmentType.container_info, title="Container Info")

event.add_finding(finding)
if params.attach_logs and container_name is not None:
Expand Down
7 changes: 3 additions & 4 deletions playbooks/robusta_playbooks/pod_enrichments.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import logging
from datetime import datetime

from hikaru.model.rel_1_26 import Node
from robusta.api import (
EnrichmentType,
PodEvent,
PodResourceGraphEnricherParams,
ResourceChartItemType,
Expand All @@ -13,7 +13,6 @@
create_node_graph_enrichment,
create_resource_enrichment,
pod_limits,
EnrichmentType
)
from robusta.core.model.pods import pod_requests

Expand Down Expand Up @@ -64,7 +63,7 @@ def pod_graph_enricher(pod_event: PodEvent, params: PodResourceGraphEnricherPara
prometheus_params=params,
graph_duration_minutes=params.graph_duration_minutes,
lines=limit_lines,
metrics_legends_labels=["pod"]
metrics_legends_labels=["pod"],
)
pod_event.add_enrichment([graph_enrichment], enrichment_type=EnrichmentType.graph, title="Pod Resources")

Expand All @@ -78,7 +77,7 @@ def pod_node_graph_enricher(pod_event: PodEvent, params: ResourceGraphEnricherPa
if not pod:
logging.error(f"cannot run pod_node_graph_enricher on event with no pod: {pod_event}")
return
node: Node = Node.readNode(pod.spec.nodeName).obj
node = pod.get_node()
if not node:
logging.warning(f"Node {pod.spec.nodeName} not found for pod {pod.metadata.name}")
return
Expand Down
60 changes: 60 additions & 0 deletions playbooks/robusta_playbooks/pod_evicted_enrichments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import logging

from robusta.api import (
EnrichmentType,
Finding,
FindingSeverity,
PodEvent,
PodFindingSubject,
TableBlock,
action,
get_node_allocatable_resources_table_block,
get_node_running_pods_table_block_or_none,
get_node_status_table_block,
)


@action
def pod_evicted_enricher(event: PodEvent):
    """
    Retrieves pod and node information for an evicted pod

    Creates a HIGH severity finding for the eviction, enriched with the node name,
    the node status table and the node allocatable resources table, and then
    attaches the table of pods running on the node when one is available.
    """
    pod = event.get_pod()
    if not pod:
        logging.error(f"cannot run pod_evicted_enricher on event with no pod: {event}")
        return

    node = pod.get_node()
    if not node:
        logging.error(f"cannot run pod_evicted_enricher on event with no node: {event}")
        return

    finding = Finding(
        title=f"Pod {pod.metadata.name} in namespace {pod.metadata.namespace} was Evicted",
        aggregation_key="PodEvictedTriggered",
        severity=FindingSeverity.HIGH,
        subject=PodFindingSubject(pod),
    )

    node_labels = [("Node Name", pod.spec.nodeName)]
    node_info_block = TableBlock(
        [[k, v] for k, v in node_labels],
        headers=["Field", "Value"],
        table_name="*Node general info:*",
    )
    node_status_block = get_node_status_table_block(node)

    allocatable_resources_block = get_node_allocatable_resources_table_block(
        node, table_name="*Node Allocatable Resources:*"
    )

    finding.add_enrichment(
        [node_info_block, node_status_block, allocatable_resources_block],
        enrichment_type=EnrichmentType.node_info,
        title="Node Info",
    )

    event.add_finding(finding)

    # The helper may return None; add_enrichment is given a list of blocks at every
    # other call site, so wrap the table and skip the call when there is nothing to add.
    running_pods_table = get_node_running_pods_table_block_or_none(node)
    if running_pods_table is not None:
        event.add_enrichment([running_pods_table])
5 changes: 5 additions & 0 deletions src/robusta/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,11 @@
from robusta.core.playbooks.container_playbook_utils import create_container_graph
from robusta.core.playbooks.crash_reporter import send_crash_report
from robusta.core.playbooks.job_utils import CONTROLLER_UID, get_job_all_pods, get_job_latest_pod, get_job_selector
from robusta.core.playbooks.node_enrichment_utils import (
get_node_allocatable_resources_table_block,
get_node_running_pods_table_block_or_none,
get_node_status_table_block,
)
from robusta.core.playbooks.node_playbook_utils import create_node_graph_enrichment
from robusta.core.playbooks.pod_utils.crashloop_utils import get_crash_report_enrichments
from robusta.core.playbooks.pod_utils.imagepull_utils import (
Expand Down
Loading
Loading