diff --git a/Kubernetes/legos/k8s_get_oomkilled_pods/k8s_get_oomkilled_pods.py b/Kubernetes/legos/k8s_get_oomkilled_pods/k8s_get_oomkilled_pods.py index 7d7766a3c..e852e3064 100644 --- a/Kubernetes/legos/k8s_get_oomkilled_pods/k8s_get_oomkilled_pods.py +++ b/Kubernetes/legos/k8s_get_oomkilled_pods/k8s_get_oomkilled_pods.py @@ -3,7 +3,8 @@ # All rights reserved. # import pprint -from datetime import datetime, timedelta, timezone +import datetime +from datetime import timezone from typing import Tuple, Optional from pydantic import BaseModel, Field from kubernetes import client @@ -21,11 +22,6 @@ class InputSchema(BaseModel): description='Time interval in hours. This time window is used to check if POD good OOMKilled. Default is 24 hours.', title="Time Interval" ) - restart_threshold: int = Field( - 10, - description='The threshold for the number of restarts within the specified time interval. Default is 10 restarts.', - title='Restart Threshold' - ) @@ -37,23 +33,9 @@ def k8s_get_oomkilled_pods_printer(output): def format_datetime(dt): # Format datetime to a string 'YYYY-MM-DD HH:MM:SS UTC' return dt.strftime('%Y-%m-%d %H:%M:%S UTC') - -def fetch_restart_events(v1, pod_name, namespace, time_interval): - """Fetch restart-related events for a specific pod within the given time interval.""" - current_time = datetime.now(timezone.utc) - start_time = current_time - timedelta(hours=time_interval) - field_selector = f"involvedObject.name={pod_name},involvedObject.namespace={namespace}" - event_list = v1.list_namespaced_event(namespace, field_selector=field_selector) - restart_events = [ - event for event in event_list.items - if event.reason in ["BackOff", "CrashLoopBackOff"] and - event.last_timestamp and - start_time <= event.last_timestamp.replace(tzinfo=timezone.utc) <= current_time - ] - return len(restart_events) -def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=24, restart_threshold: int = 10) -> Tuple: +def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=24) -> Tuple: """k8s_get_oomkilled_pods This function returns the pods that have OOMKilled event in the container last states :type handle: Object @@ -94,12 +76,16 @@ def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=2 if pods is None: raise ApiException("No pods returned from the Kubernetes API.") - interval_time_to_check = datetime.now(timezone.utc) - timedelta(hours=time_interval_to_check) + # Get Current Time in UTC + current_time = datetime.datetime.now(timezone.utc) + # Get time interval to check (or 24 hour) reference and convert to UTC + interval_time_to_check = current_time - datetime.timedelta(hours=time_interval_to_check) + interval_time_to_check = interval_time_to_check.replace(tzinfo=timezone.utc) + for pod in pods: pod_name = pod.metadata.name namespace = pod.metadata.namespace - recent_restarts = fetch_restart_events(v1, pod_name, namespace, time_interval_to_check) # Ensure container_statuses is not None before iterating container_statuses = pod.status.container_statuses @@ -111,18 +97,14 @@ def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=2 container_name = container_status.name last_state = container_status.last_state if last_state and last_state.terminated and last_state.terminated.reason == "OOMKilled": - oom_time = last_state.terminated.finished_at + termination_time = last_state.terminated.finished_at + termination_time = termination_time.replace(tzinfo=timezone.utc) # If termination time is greater than interval_time_to_check meaning - # the POD has gotten OOMKilled in the last 24 hours and the number of restarts for - # that pod is greater than 10, so lets flag it! - if oom_time and oom_time.replace(tzinfo=timezone.utc) >= interval_time_to_check: - if recent_restarts > restart_threshold: - formatted_oom_time = format_datetime(oom_time) - result.append({ - "pod": pod_name, - "namespace": namespace, - "container": container_name, - "termination_time": formatted_oom_time, - "restarts": recent_restarts - }) - return (False, result) if result else (True, None) \ No newline at end of file + # the POD has gotten OOMKilled in the last 24 hours, so lets flag it! + if termination_time and termination_time >= interval_time_to_check: + formatted_termination_time = format_datetime(termination_time) + formatted_interval_time_to_check = format_datetime(interval_time_to_check) + result.append({"pod": pod_name, "namespace": namespace, "container": container_name, "termination_time":formatted_termination_time,"interval_time_to_check": formatted_interval_time_to_check}) + + return (False, result) if result else (True, None) + diff --git a/Kubernetes/legos/k8s_get_pods_in_crashloopbackoff_state/k8s_get_pods_in_crashloopbackoff_state.py b/Kubernetes/legos/k8s_get_pods_in_crashloopbackoff_state/k8s_get_pods_in_crashloopbackoff_state.py index 813047359..798b6ce4b 100644 --- a/Kubernetes/legos/k8s_get_pods_in_crashloopbackoff_state/k8s_get_pods_in_crashloopbackoff_state.py +++ b/Kubernetes/legos/k8s_get_pods_in_crashloopbackoff_state/k8s_get_pods_in_crashloopbackoff_state.py @@ -8,7 +8,8 @@ from kubernetes import client from kubernetes.client.rest import ApiException from tabulate import tabulate -from datetime import datetime, timedelta, timezone +import datetime +from datetime import timezone class InputSchema(BaseModel): @@ -19,13 +20,7 @@ class InputSchema(BaseModel): time_interval_to_check: int = Field( 24, description='Time interval in hours. This time window is used to check if POD was in Crashloopback. Default is 24 hours.', - title= - "Time Interval" - ) - restart_threshold: int = Field( - 10, - description='The threshold for the number of restarts within the specified time interval. Default is 10 restarts.', - title='Restart Threshold' + title="Time Interval" ) @@ -43,21 +38,7 @@ def format_datetime(dt): # Format datetime to a string 'YYYY-MM-DD HH:MM:SS UTC' return dt.strftime('%Y-%m-%d %H:%M:%S UTC') -def fetch_restart_events(v1, pod_name, namespace, time_interval): - """Fetch restart-related events for a specific pod within the given time interval.""" - current_time = datetime.now(timezone.utc) - start_time = current_time - timedelta(hours=time_interval) - field_selector = f"involvedObject.name={pod_name},involvedObject.namespace={namespace}" - event_list = v1.list_namespaced_event(namespace, field_selector=field_selector) - restart_events = [ - event for event in event_list.items - if event.reason in ["BackOff", "CrashLoopBackOff"] and - event.last_timestamp and - start_time <= event.last_timestamp.replace(tzinfo=timezone.utc) <= current_time - ] - return len(restart_events) - -def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_interval_to_check=24, restart_threshold=10) -> Tuple: +def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_interval_to_check=24) -> Tuple: """ k8s_get_pods_in_crashloopbackoff_state returns the pods that have CrashLoopBackOff state in their container statuses within the specified time interval. @@ -71,9 +52,6 @@ def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_int :param time_interval_to_check: (Optional) Integer, in hours, the interval within which the state of the POD should be checked. - :type restart_threshold: int - :param restart_threshold: (Optional) Integer, the threshold of restarts to check against. - :rtype: Status, List of objects of pods, namespaces, and containers that are in CrashLoopBackOff state """ result = [] @@ -99,7 +77,9 @@ def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_int if pods is None: raise ApiException("No pods returned from the Kubernetes API.") - interval_time_to_check = datetime.now(timezone.utc) - timedelta(hours=time_interval_to_check) + current_time = datetime.datetime.now(timezone.utc) + interval_time_to_check = current_time - datetime.timedelta(hours=time_interval_to_check) + interval_time_to_check = interval_time_to_check.replace(tzinfo=timezone.utc) for pod in pods: pod_name = pod.metadata.name @@ -107,25 +87,23 @@ def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_int container_statuses = pod.status.container_statuses if container_statuses is None: continue - recent_restarts = fetch_restart_events(v1, pod_name, namespace, time_interval_to_check) - for container_status in container_statuses: container_name = container_status.name if container_status.state and container_status.state.waiting and container_status.state.waiting.reason == "CrashLoopBackOff": - last_state = container_status.last_state - if last_state and last_state.terminated: - last_transition_time = last_state.terminated.finished_at - # Check if the last transition time to CrashLoopBackOff is within the specified interval - # and the number of restarts are greater threshold in the last 24 hours - if last_transition_time and last_transition_time.replace(tzinfo=timezone.utc) >= interval_time_to_check: - if recent_restarts > restart_threshold: - formatted_last_transition_time = format_datetime(last_transition_time) + # Check if the last transition time to CrashLoopBackOff is within the specified interval + if container_status.last_state and container_status.last_state.terminated: + last_transition_time = container_status.last_state.terminated.finished_at + if last_transition_time: + last_transition_time = last_transition_time.replace(tzinfo=timezone.utc) + if last_transition_time >= interval_time_to_check: + formatted_transition_time = format_datetime(last_transition_time) + formatted_interval_time_to_check = format_datetime(interval_time_to_check) result.append({ "pod": pod_name, "namespace": namespace, "container": container_name, - "termination_time": formatted_last_transition_time, - "restarts": recent_restarts + "last_transition_time": formatted_transition_time, + "interval_time_to_check": formatted_interval_time_to_check }) return (False, result) if result else (True, None)