Skip to content

Commit

Permalink
Update check to consider number of restarts (#1069)
Browse files Browse the repository at this point in the history
  • Loading branch information
shloka-bhalgat-unskript authored May 31, 2024
1 parent 324023a commit b12795a
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 36 deletions.
56 changes: 37 additions & 19 deletions Kubernetes/legos/k8s_get_oomkilled_pods/k8s_get_oomkilled_pods.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
# All rights reserved.
#
import pprint
import datetime
from datetime import timezone
from datetime import datetime, timedelta, timezone
from typing import Tuple, Optional
from pydantic import BaseModel, Field
from kubernetes import client
Expand All @@ -22,6 +21,11 @@ class InputSchema(BaseModel):
description='Time interval in hours. This time window is used to check if POD good OOMKilled. Default is 24 hours.',
title="Time Interval"
)
restart_threshold: int = Field(
10,
description='The threshold for the number of restarts within the specified time interval. Default is 10 restarts.',
title='Restart Threshold'
)



Expand All @@ -33,9 +37,23 @@ def k8s_get_oomkilled_pods_printer(output):
def format_datetime(dt):
    """Render ``dt`` as a ``'YYYY-MM-DD HH:MM:SS UTC'`` string.

    The trailing ``UTC`` is literal text in the format; no timezone
    conversion is performed on ``dt`` here.

    :param dt: Object exposing ``strftime`` (e.g. a ``datetime``).
    :rtype: str
    """
    timestamp_layout = '%Y-%m-%d %H:%M:%S UTC'
    return dt.strftime(timestamp_layout)

def fetch_restart_events(v1, pod_name, namespace, time_interval):
    """Count restart-related events recorded for a pod within a recent time window.

    Lists the events in ``namespace`` whose involved object is ``pod_name`` and
    counts those whose reason is ``BackOff`` or ``CrashLoopBackOff`` and whose
    ``last_timestamp`` falls within the last ``time_interval`` hours.

    :type v1: object
    :param v1: Client exposing ``list_namespaced_event(namespace, field_selector=...)``.

    :type pod_name: str
    :param pod_name: Name of the pod whose events are inspected.

    :type namespace: str
    :param namespace: Namespace the pod lives in.

    :type time_interval: int
    :param time_interval: Size of the look-back window, in hours.

    :rtype: int, number of matching event objects.
    """
    restart_reasons = ("BackOff", "CrashLoopBackOff")
    current_time = datetime.now(timezone.utc)
    start_time = current_time - timedelta(hours=time_interval)
    field_selector = f"involvedObject.name={pod_name},involvedObject.namespace={namespace}"
    event_list = v1.list_namespaced_event(namespace, field_selector=field_selector)

    # NOTE(review): this counts Event objects, not the per-event `count`
    # field -- confirm that is the intended measure of "restarts".
    matching_events = 0
    for event in event_list.items or ():
        last_seen = event.last_timestamp
        if event.reason not in restart_reasons or not last_seen:
            continue
        # Only naive timestamps are assumed to be UTC. An aware timestamp keeps
        # its own offset: overwriting tzinfo with replace() would shift the
        # instant by that offset and misplace it relative to the window.
        if last_seen.tzinfo is None or last_seen.utcoffset() is None:
            last_seen = last_seen.replace(tzinfo=timezone.utc)
        if start_time <= last_seen <= current_time:
            matching_events += 1
    return matching_events


def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=24) -> Tuple:
def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=24, restart_threshold: int = 10) -> Tuple:
"""k8s_get_oomkilled_pods This function returns the pods that have OOMKilled event in the container last states
:type handle: Object
Expand Down Expand Up @@ -76,16 +94,12 @@ def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=2
if pods is None:
raise ApiException("No pods returned from the Kubernetes API.")

# Get Current Time in UTC
current_time = datetime.datetime.now(timezone.utc)
# Get time interval to check (or 24 hour) reference and convert to UTC
interval_time_to_check = current_time - datetime.timedelta(hours=time_interval_to_check)
interval_time_to_check = interval_time_to_check.replace(tzinfo=timezone.utc)
interval_time_to_check = datetime.now(timezone.utc) - timedelta(hours=time_interval_to_check)


for pod in pods:
pod_name = pod.metadata.name
namespace = pod.metadata.namespace
recent_restarts = fetch_restart_events(v1, pod_name, namespace, time_interval_to_check)

# Ensure container_statuses is not None before iterating
container_statuses = pod.status.container_statuses
Expand All @@ -97,14 +111,18 @@ def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=2
container_name = container_status.name
last_state = container_status.last_state
if last_state and last_state.terminated and last_state.terminated.reason == "OOMKilled":
termination_time = last_state.terminated.finished_at
termination_time = termination_time.replace(tzinfo=timezone.utc)
oom_time = last_state.terminated.finished_at
# If termination time is greater than interval_time_to_check meaning
# the POD has gotten OOMKilled in the last 24 hours, so lets flag it!
if termination_time and termination_time >= interval_time_to_check:
formatted_termination_time = format_datetime(termination_time)
formatted_interval_time_to_check = format_datetime(interval_time_to_check)
result.append({"pod": pod_name, "namespace": namespace, "container": container_name, "termination_time":formatted_termination_time,"interval_time_to_check": formatted_interval_time_to_check})

return (False, result) if result else (True, None)

# the POD has gotten OOMKilled within the checked time interval and the number of
# recent restarts for that pod is greater than restart_threshold, so let's flag it!
if oom_time and oom_time.replace(tzinfo=timezone.utc) >= interval_time_to_check:
if recent_restarts > restart_threshold:
formatted_oom_time = format_datetime(oom_time)
result.append({
"pod": pod_name,
"namespace": namespace,
"container": container_name,
"termination_time": formatted_oom_time,
"restarts": recent_restarts
})
return (False, result) if result else (True, None)
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
from kubernetes import client
from kubernetes.client.rest import ApiException
from tabulate import tabulate
import datetime
from datetime import timezone
from datetime import datetime, timedelta, timezone


class InputSchema(BaseModel):
Expand All @@ -20,7 +19,13 @@ class InputSchema(BaseModel):
time_interval_to_check: int = Field(
24,
description='Time interval in hours. This time window is used to check if POD was in Crashloopback. Default is 24 hours.',
title="Time Interval"
title=
"Time Interval"
)
restart_threshold: int = Field(
10,
description='The threshold for the number of restarts within the specified time interval. Default is 10 restarts.',
title='Restart Threshold'
)


Expand All @@ -38,7 +43,21 @@ def format_datetime(dt):
# Format datetime to a string 'YYYY-MM-DD HH:MM:SS UTC'
return dt.strftime('%Y-%m-%d %H:%M:%S UTC')

def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_interval_to_check=24) -> Tuple:
def fetch_restart_events(v1, pod_name, namespace, time_interval):
    """Count restart-related events recorded for a pod within a recent time window.

    Lists the events in ``namespace`` whose involved object is ``pod_name`` and
    counts those whose reason is ``BackOff`` or ``CrashLoopBackOff`` and whose
    ``last_timestamp`` falls within the last ``time_interval`` hours.

    :type v1: object
    :param v1: Client exposing ``list_namespaced_event(namespace, field_selector=...)``.

    :type pod_name: str
    :param pod_name: Name of the pod whose events are inspected.

    :type namespace: str
    :param namespace: Namespace the pod lives in.

    :type time_interval: int
    :param time_interval: Size of the look-back window, in hours.

    :rtype: int, number of matching event objects.
    """
    restart_reasons = ("BackOff", "CrashLoopBackOff")
    current_time = datetime.now(timezone.utc)
    start_time = current_time - timedelta(hours=time_interval)
    field_selector = f"involvedObject.name={pod_name},involvedObject.namespace={namespace}"
    event_list = v1.list_namespaced_event(namespace, field_selector=field_selector)

    # NOTE(review): this counts Event objects, not the per-event `count`
    # field -- confirm that is the intended measure of "restarts".
    matching_events = 0
    for event in event_list.items or ():
        last_seen = event.last_timestamp
        if event.reason not in restart_reasons or not last_seen:
            continue
        # Only naive timestamps are assumed to be UTC. An aware timestamp keeps
        # its own offset: overwriting tzinfo with replace() would shift the
        # instant by that offset and misplace it relative to the window.
        if last_seen.tzinfo is None or last_seen.utcoffset() is None:
            last_seen = last_seen.replace(tzinfo=timezone.utc)
        if start_time <= last_seen <= current_time:
            matching_events += 1
    return matching_events

def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_interval_to_check=24, restart_threshold=10) -> Tuple:
"""
k8s_get_pods_in_crashloopbackoff_state returns the pods that have CrashLoopBackOff state in their container statuses within the specified time interval.
Expand All @@ -52,6 +71,9 @@ def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_int
:param time_interval_to_check: (Optional) Integer, in hours, the interval within which the
state of the POD should be checked.
:type restart_threshold: int
:param restart_threshold: (Optional) Integer, the threshold of restarts to check against.
:rtype: Status, List of objects of pods, namespaces, and containers that are in CrashLoopBackOff state
"""
result = []
Expand All @@ -77,33 +99,33 @@ def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_int
if pods is None:
raise ApiException("No pods returned from the Kubernetes API.")

current_time = datetime.datetime.now(timezone.utc)
interval_time_to_check = current_time - datetime.timedelta(hours=time_interval_to_check)
interval_time_to_check = interval_time_to_check.replace(tzinfo=timezone.utc)
interval_time_to_check = datetime.now(timezone.utc) - timedelta(hours=time_interval_to_check)

for pod in pods:
pod_name = pod.metadata.name
namespace = pod.metadata.namespace
container_statuses = pod.status.container_statuses
if container_statuses is None:
continue
recent_restarts = fetch_restart_events(v1, pod_name, namespace, time_interval_to_check)

for container_status in container_statuses:
container_name = container_status.name
if container_status.state and container_status.state.waiting and container_status.state.waiting.reason == "CrashLoopBackOff":
# Check if the last transition time to CrashLoopBackOff is within the specified interval
if container_status.last_state and container_status.last_state.terminated:
last_transition_time = container_status.last_state.terminated.finished_at
if last_transition_time:
last_transition_time = last_transition_time.replace(tzinfo=timezone.utc)
if last_transition_time >= interval_time_to_check:
formatted_transition_time = format_datetime(last_transition_time)
formatted_interval_time_to_check = format_datetime(interval_time_to_check)
last_state = container_status.last_state
if last_state and last_state.terminated:
last_transition_time = last_state.terminated.finished_at
# Check if the last transition time to CrashLoopBackOff is within the specified interval
# and the number of recent restarts is greater than restart_threshold
if last_transition_time and last_transition_time.replace(tzinfo=timezone.utc) >= interval_time_to_check:
if recent_restarts > restart_threshold:
formatted_last_transition_time = format_datetime(last_transition_time)
result.append({
"pod": pod_name,
"namespace": namespace,
"container": container_name,
"last_transition_time": formatted_transition_time,
"interval_time_to_check": formatted_interval_time_to_check
"termination_time": formatted_last_transition_time,
"restarts": recent_restarts
})

return (False, result) if result else (True, None)

0 comments on commit b12795a

Please sign in to comment.