Get events from the last 24 hours to check back-off for pods

unskript · May 31, 2024 · acebea0 · acebea0
1 parent 93d65d2
commit acebea0
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 36 deletions.
diff --git a/Kubernetes/legos/k8s_get_oomkilled_pods/k8s_get_oomkilled_pods.py b/Kubernetes/legos/k8s_get_oomkilled_pods/k8s_get_oomkilled_pods.py
@@ -3,8 +3,7 @@
 # All rights reserved.
 #
 import pprint
-import datetime
-from datetime import timezone 
+from datetime import datetime, timedelta, timezone
 from typing import Tuple, Optional
 from pydantic import BaseModel, Field
 from kubernetes import client
@@ -38,6 +37,20 @@ def k8s_get_oomkilled_pods_printer(output):
 def format_datetime(dt):
     # Format datetime to a string 'YYYY-MM-DD HH:MM:SS UTC'
     return dt.strftime('%Y-%m-%d %H:%M:%S UTC')
+
+def fetch_restart_events(v1, pod_name, namespace, time_interval):
+    """Count BackOff/CrashLoopBackOff events for a pod whose last_timestamp falls within the past time_interval hours."""
+    current_time = datetime.now(timezone.utc)
+    start_time = current_time - timedelta(hours=time_interval)
+    field_selector = f"involvedObject.name={pod_name},involvedObject.namespace={namespace}"
+    event_list = v1.list_namespaced_event(namespace, field_selector=field_selector)
+    restart_events = [
+        event for event in event_list.items
+        if event.reason in ["BackOff", "CrashLoopBackOff"] and
+        event.last_timestamp and
+        start_time <= event.last_timestamp.replace(tzinfo=timezone.utc) <= current_time
+    ]
+    return len(restart_events)
 
 
 def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=24, restart_threshold: int = 10) -> Tuple:
@@ -81,17 +94,12 @@ def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=2
     if pods is None:
         raise ApiException("No pods returned from the Kubernetes API.")
 
-    # Get Current Time in UTC
-    current_time = datetime.datetime.now(timezone.utc)
-    # Get time interval to check (or 24 hour) reference and convert to UTC
-    interval_time_to_check = current_time - datetime.timedelta(hours=time_interval_to_check)
-    interval_time_to_check = interval_time_to_check.replace(tzinfo=timezone.utc)
+    interval_time_to_check = datetime.now(timezone.utc) - timedelta(hours=time_interval_to_check)
 
-
     for pod in pods:
         pod_name = pod.metadata.name
         namespace = pod.metadata.namespace
-        restarts = sum(container_status.restart_count for container_status in pod.status.container_statuses) if pod.status.container_statuses else 0
+        recent_restarts = fetch_restart_events(v1, pod_name, namespace, time_interval_to_check)
 
         # Ensure container_statuses is not None before iterating
         container_statuses = pod.status.container_statuses
@@ -103,19 +111,18 @@ def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=2
             container_name = container_status.name
             last_state = container_status.last_state
             if last_state and last_state.terminated and last_state.terminated.reason == "OOMKilled":
-                termination_time = last_state.terminated.finished_at
-                if termination_time:
-                    termination_time = termination_time.replace(tzinfo=timezone.utc)
-                    # If termination time is greater than interval_time_to_check meaning
-                    # the POD has gotten OOMKilled in the last 24 hours and the number of restarts for 
-                    # that pod is greater than 10, so lets flag it!
-                    if termination_time >= interval_time_to_check and restarts > restart_threshold:
-                        formatted_termination_time = format_datetime(termination_time)
+                oom_time = last_state.terminated.finished_at
+                # Flag the pod when its last OOMKilled termination falls within the
+                # checked interval (time_interval_to_check hours, default 24) and its
+                # recent back-off event count exceeds restart_threshold (default 10).
+                if oom_time and oom_time.replace(tzinfo=timezone.utc) >= interval_time_to_check:
+                    if recent_restarts > restart_threshold:
+                        formatted_oom_time = format_datetime(oom_time)
                         result.append({
                             "pod": pod_name,
                             "namespace": namespace,
                             "container": container_name,
-                            "termination_time": formatted_termination_time,
-                            "restarts": restarts,
+                            "termination_time": formatted_oom_time,
+                            "restarts": recent_restarts
                         })
     return (False, result) if result else (True, None)
diff --git a/...es/legos/k8s_get_pods_in_crashloopbackoff_state/k8s_get_pods_in_crashloopbackoff_state.py b/...es/legos/k8s_get_pods_in_crashloopbackoff_state/k8s_get_pods_in_crashloopbackoff_state.py
@@ -43,6 +43,20 @@ def format_datetime(dt):
     # Format datetime to a string 'YYYY-MM-DD HH:MM:SS UTC'
     return dt.strftime('%Y-%m-%d %H:%M:%S UTC')
 
+def fetch_restart_events(v1, pod_name, namespace, time_interval):
+    """Count BackOff/CrashLoopBackOff events for a pod whose last_timestamp falls within the past time_interval hours."""
+    current_time = datetime.now(timezone.utc)
+    start_time = current_time - timedelta(hours=time_interval)
+    field_selector = f"involvedObject.name={pod_name},involvedObject.namespace={namespace}"
+    event_list = v1.list_namespaced_event(namespace, field_selector=field_selector)
+    restart_events = [
+        event for event in event_list.items
+        if event.reason in ["BackOff", "CrashLoopBackOff"] and
+        event.last_timestamp and
+        start_time <= event.last_timestamp.replace(tzinfo=timezone.utc) <= current_time
+    ]
+    return len(restart_events)
+
 def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_interval_to_check=24, restart_threshold=10) -> Tuple:
     """
     k8s_get_pods_in_crashloopbackoff_state returns the pods that have CrashLoopBackOff state in their container statuses within the specified time interval.
@@ -85,33 +99,33 @@ def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_int
     if pods is None:
         raise ApiException("No pods returned from the Kubernetes API.")
 
-    current_time = datetime.datetime.now(timezone.utc)
-    interval_time_to_check = current_time - datetime.timedelta(hours=time_interval_to_check)
-    interval_time_to_check = interval_time_to_check.replace(tzinfo=timezone.utc)
+    interval_time_to_check = datetime.now(timezone.utc) - timedelta(hours=time_interval_to_check)
 
     for pod in pods:
         pod_name = pod.metadata.name
         namespace = pod.metadata.namespace
         container_statuses = pod.status.container_statuses
         if container_statuses is None:
             continue
-        total_restarts = sum(container_status.restart_count for container_status in container_statuses)
+        recent_restarts = fetch_restart_events(v1, pod_name, namespace, time_interval_to_check)
+
         for container_status in container_statuses:
             container_name = container_status.name
             if container_status.state and container_status.state.waiting and container_status.state.waiting.reason == "CrashLoopBackOff":
-                last_transition_time = container_status.last_state.terminated.finished_at
-                if last_transition_time:
-                    last_transition_time = last_transition_time.replace(tzinfo=timezone.utc)
+                last_state = container_status.last_state
+                if last_state and last_state.terminated:
+                    last_transition_time = last_state.terminated.finished_at
                     # Check if the last transition time to CrashLoopBackOff is within the specified interval
-                    # and the number of restarts are within threshold
-                    if last_transition_time >= interval_time_to_check and total_restarts > restart_threshold:
-                        formatted_transition_time = format_datetime(last_transition_time)
-                        result.append({
-                            "pod": pod_name,
-                            "namespace": namespace,
-                            "container": container_name,
-                            "last_transition_time": formatted_transition_time,
-                            "restarts": total_restarts
-                        })
+                    # and the number of restart events in that interval is greater than restart_threshold
+                    if last_transition_time and last_transition_time.replace(tzinfo=timezone.utc) >= interval_time_to_check:
+                        if recent_restarts > restart_threshold:
+                            formatted_last_transition_time = format_datetime(last_transition_time)
+                            result.append({
+                                "pod": pod_name,
+                                "namespace": namespace,
+                                "container": container_name,
+                                "termination_time": formatted_last_transition_time,
+                                "restarts": recent_restarts
+                            })
 
     return (False, result) if result else (True, None)