Update check to consider number of restarts (#1069)

unskript · May 31, 2024 · b12795a · b12795a
1 parent 324023a
commit b12795a
Show file tree

Hide file tree

Showing 2 changed files with 76 additions and 36 deletions.
diff --git a/Kubernetes/legos/k8s_get_oomkilled_pods/k8s_get_oomkilled_pods.py b/Kubernetes/legos/k8s_get_oomkilled_pods/k8s_get_oomkilled_pods.py
@@ -3,8 +3,7 @@
 # All rights reserved.
 #
 import pprint
-import datetime
-from datetime import timezone 
+from datetime import datetime, timedelta, timezone
 from typing import Tuple, Optional
 from pydantic import BaseModel, Field
 from kubernetes import client
@@ -22,6 +21,11 @@ class InputSchema(BaseModel):
         description='Time interval in hours. This time window is used to check if POD good OOMKilled. Default is 24 hours.',
         title="Time Interval"
     )
+    restart_threshold: int = Field(
+        10,
+        description='The threshold for the number of restarts within the specified time interval. Default is 10 restarts.',
+        title='Restart Threshold'
+    )
 
 
 
@@ -33,9 +37,23 @@ def k8s_get_oomkilled_pods_printer(output):
 def format_datetime(dt):
     # Format datetime to a string 'YYYY-MM-DD HH:MM:SS UTC'
     return dt.strftime('%Y-%m-%d %H:%M:%S UTC')
+
+def fetch_restart_events(v1, pod_name, namespace, time_interval):
+    """Count the BackOff/CrashLoopBackOff events of a specific pod whose last timestamp falls within the given time interval (in hours)."""
+    current_time = datetime.now(timezone.utc)
+    start_time = current_time - timedelta(hours=time_interval)
+    field_selector = f"involvedObject.name={pod_name},involvedObject.namespace={namespace}"
+    event_list = v1.list_namespaced_event(namespace, field_selector=field_selector)
+    restart_events = [
+        event for event in event_list.items
+        if event.reason in ["BackOff", "CrashLoopBackOff"] and
+        event.last_timestamp and
+        start_time <= event.last_timestamp.replace(tzinfo=timezone.utc) <= current_time
+    ]
+    return len(restart_events)
 
 
-def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=24) -> Tuple:
+def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=24, restart_threshold: int = 10) -> Tuple:
     """k8s_get_oomkilled_pods This function returns the pods that have OOMKilled event in the container last states
 
     :type handle: Object
@@ -76,16 +94,12 @@ def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=2
     if pods is None:
         raise ApiException("No pods returned from the Kubernetes API.")
 
-    # Get Current Time in UTC
-    current_time = datetime.datetime.now(timezone.utc)
-    # Get time interval to check (or 24 hour) reference and convert to UTC
-    interval_time_to_check = current_time - datetime.timedelta(hours=time_interval_to_check)
-    interval_time_to_check = interval_time_to_check.replace(tzinfo=timezone.utc)
+    interval_time_to_check = datetime.now(timezone.utc) - timedelta(hours=time_interval_to_check)
 
-
     for pod in pods:
         pod_name = pod.metadata.name
         namespace = pod.metadata.namespace
+        recent_restarts = fetch_restart_events(v1, pod_name, namespace, time_interval_to_check)
 
         # Ensure container_statuses is not None before iterating
         container_statuses = pod.status.container_statuses
@@ -97,14 +111,18 @@ def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=2
             container_name = container_status.name
             last_state = container_status.last_state
             if last_state and last_state.terminated and last_state.terminated.reason == "OOMKilled":
-                termination_time = last_state.terminated.finished_at
-                termination_time = termination_time.replace(tzinfo=timezone.utc)
+                oom_time = last_state.terminated.finished_at
                 # If termination time is greater than interval_time_to_check meaning
-                # the POD has gotten OOMKilled in the last 24 hours, so lets flag it!
-                if termination_time and termination_time >= interval_time_to_check:
-                    formatted_termination_time = format_datetime(termination_time)
-                    formatted_interval_time_to_check = format_datetime(interval_time_to_check)
-                    result.append({"pod": pod_name, "namespace": namespace, "container": container_name, "termination_time":formatted_termination_time,"interval_time_to_check": formatted_interval_time_to_check})
-
-    return (False, result) if result else (True, None)
-
+                # the POD has gotten OOMKilled within the time interval to check and the number of
+                # restart events for that pod is greater than restart_threshold, so let's flag it!
+                if oom_time and oom_time.replace(tzinfo=timezone.utc) >= interval_time_to_check:
+                    if recent_restarts > restart_threshold:
+                        formatted_oom_time = format_datetime(oom_time)
+                        result.append({
+                            "pod": pod_name,
+                            "namespace": namespace,
+                            "container": container_name,
+                            "termination_time": formatted_oom_time,
+                            "restarts": recent_restarts
+                        })
+    return (False, result) if result else (True, None)
diff --git a/...es/legos/k8s_get_pods_in_crashloopbackoff_state/k8s_get_pods_in_crashloopbackoff_state.py b/...es/legos/k8s_get_pods_in_crashloopbackoff_state/k8s_get_pods_in_crashloopbackoff_state.py
@@ -8,8 +8,7 @@
 from kubernetes import client
 from kubernetes.client.rest import ApiException
 from tabulate import tabulate
-import datetime
-from datetime import timezone 
+from datetime import datetime, timedelta, timezone
 
 
 class InputSchema(BaseModel):
@@ -20,7 +19,13 @@ class InputSchema(BaseModel):
     time_interval_to_check: int = Field(
         24,
         description='Time interval in hours. This time window is used to check if POD was in Crashloopback. Default is 24 hours.',
-        title="Time Interval"
+        title=
+        "Time Interval"
+    )
+    restart_threshold: int = Field(
+        10,
+        description='The threshold for the number of restarts within the specified time interval. Default is 10 restarts.',
+        title='Restart Threshold'
     )
 
 
@@ -38,7 +43,21 @@ def format_datetime(dt):
     # Format datetime to a string 'YYYY-MM-DD HH:MM:SS UTC'
     return dt.strftime('%Y-%m-%d %H:%M:%S UTC')
 
-def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_interval_to_check=24) -> Tuple:
+def fetch_restart_events(v1, pod_name, namespace, time_interval):
+    """Count the BackOff/CrashLoopBackOff events of a specific pod whose last timestamp falls within the given time interval (in hours)."""
+    current_time = datetime.now(timezone.utc)
+    start_time = current_time - timedelta(hours=time_interval)
+    field_selector = f"involvedObject.name={pod_name},involvedObject.namespace={namespace}"
+    event_list = v1.list_namespaced_event(namespace, field_selector=field_selector)
+    restart_events = [
+        event for event in event_list.items
+        if event.reason in ["BackOff", "CrashLoopBackOff"] and
+        event.last_timestamp and
+        start_time <= event.last_timestamp.replace(tzinfo=timezone.utc) <= current_time
+    ]
+    return len(restart_events)
+
+def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_interval_to_check=24, restart_threshold=10) -> Tuple:
     """
     k8s_get_pods_in_crashloopbackoff_state returns the pods that have CrashLoopBackOff state in their container statuses within the specified time interval.
 
@@ -52,6 +71,9 @@ def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_int
     :param time_interval_to_check: (Optional) Integer, in hours, the interval within which the
             state of the POD should be checked.
 
+    :type restart_threshold: int
+        :param restart_threshold: (Optional) Integer, the threshold of restarts to check against.
+
     :rtype: Status, List of objects of pods, namespaces, and containers that are in CrashLoopBackOff state
     """
     result = []
@@ -77,33 +99,33 @@ def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_int
     if pods is None:
         raise ApiException("No pods returned from the Kubernetes API.")
 
-    current_time = datetime.datetime.now(timezone.utc)
-    interval_time_to_check = current_time - datetime.timedelta(hours=time_interval_to_check)
-    interval_time_to_check = interval_time_to_check.replace(tzinfo=timezone.utc)
+    interval_time_to_check = datetime.now(timezone.utc) - timedelta(hours=time_interval_to_check)
 
     for pod in pods:
         pod_name = pod.metadata.name
         namespace = pod.metadata.namespace
         container_statuses = pod.status.container_statuses
         if container_statuses is None:
             continue
+        recent_restarts = fetch_restart_events(v1, pod_name, namespace, time_interval_to_check)
+
         for container_status in container_statuses:
             container_name = container_status.name
             if container_status.state and container_status.state.waiting and container_status.state.waiting.reason == "CrashLoopBackOff":
-                # Check if the last transition time to CrashLoopBackOff is within the specified interval
-                if container_status.last_state and container_status.last_state.terminated:
-                    last_transition_time = container_status.last_state.terminated.finished_at
-                    if last_transition_time:
-                        last_transition_time = last_transition_time.replace(tzinfo=timezone.utc)
-                        if last_transition_time >= interval_time_to_check:
-                            formatted_transition_time = format_datetime(last_transition_time)
-                            formatted_interval_time_to_check = format_datetime(interval_time_to_check)
+                last_state = container_status.last_state
+                if last_state and last_state.terminated:
+                    last_transition_time = last_state.terminated.finished_at
+                    # Check if the last transition time to CrashLoopBackOff is within the specified interval
+                    # and the number of restart events in that interval is greater than restart_threshold
+                    if last_transition_time and last_transition_time.replace(tzinfo=timezone.utc) >= interval_time_to_check:
+                        if recent_restarts > restart_threshold:
+                            formatted_last_transition_time = format_datetime(last_transition_time)
                             result.append({
                                 "pod": pod_name,
                                 "namespace": namespace,
                                 "container": container_name,
-                                "last_transition_time": formatted_transition_time,
-                                "interval_time_to_check": formatted_interval_time_to_check
+                                "termination_time": formatted_last_transition_time,
+                                "restarts": recent_restarts
                             })
 
     return (False, result) if result else (True, None)