Skip to content

Commit

Permalink
Get event for last 24 hours to check back-off for pods
Browse files Browse the repository at this point in the history
  • Loading branch information
shloka-bhalgat-unskript committed May 31, 2024
1 parent 93d65d2 commit acebea0
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 36 deletions.
45 changes: 26 additions & 19 deletions Kubernetes/legos/k8s_get_oomkilled_pods/k8s_get_oomkilled_pods.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
# All rights reserved.
#
import pprint
import datetime
from datetime import timezone
from datetime import datetime, timedelta, timezone
from typing import Tuple, Optional
from pydantic import BaseModel, Field
from kubernetes import client
Expand Down Expand Up @@ -38,6 +37,20 @@ def k8s_get_oomkilled_pods_printer(output):
def format_datetime(dt):
    """Render *dt* as a string shaped like 'YYYY-MM-DD HH:MM:SS UTC'.

    The trailing 'UTC' is a literal suffix in the format string; this
    function performs no timezone conversion on *dt*.
    """
    layout = '%Y-%m-%d %H:%M:%S UTC'
    return dt.strftime(layout)

def fetch_restart_events(v1, pod_name, namespace, time_interval):
    """Count restart-related events for one pod inside a recent time window.

    Args:
        v1: Client object exposing
            ``list_namespaced_event(namespace, field_selector=...)``.
        pod_name: Name of the pod the events must reference.
        namespace: Namespace the pod lives in.
        time_interval: Size of the look-back window, in hours.

    Returns:
        The number of returned Event objects whose ``reason`` is "BackOff" or
        "CrashLoopBackOff" and whose ``last_timestamp`` falls within
        ``[now - time_interval hours, now]`` (UTC).
    """
    window_end = datetime.now(timezone.utc)
    window_start = window_end - timedelta(hours=time_interval)
    field_selector = f"involvedObject.name={pod_name},involvedObject.namespace={namespace}"
    event_list = v1.list_namespaced_event(namespace, field_selector=field_selector)

    def _as_utc(stamp):
        # Naive timestamps are assumed to already be UTC. Aware timestamps are
        # converted, not relabelled: replace(tzinfo=...) on an aware value
        # would shift the instant by its UTC offset.
        if stamp.utcoffset() is None:
            return stamp.replace(tzinfo=timezone.utc)
        return stamp.astimezone(timezone.utc)

    restart_reasons = ("BackOff", "CrashLoopBackOff")
    # NOTE(review): this counts Event objects, not occurrences. Kubernetes may
    # fold repeated events into a single Event carrying a `count` field --
    # confirm whether summing `count` is what callers actually want.
    return sum(
        1
        for event in (event_list.items or [])
        if event.reason in restart_reasons
        and event.last_timestamp
        and window_start <= _as_utc(event.last_timestamp) <= window_end
    )


def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=24, restart_threshold: int = 10) -> Tuple:
Expand Down Expand Up @@ -81,17 +94,12 @@ def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=2
if pods is None:
raise ApiException("No pods returned from the Kubernetes API.")

# Get Current Time in UTC
current_time = datetime.datetime.now(timezone.utc)
# Get time interval to check (or 24 hour) reference and convert to UTC
interval_time_to_check = current_time - datetime.timedelta(hours=time_interval_to_check)
interval_time_to_check = interval_time_to_check.replace(tzinfo=timezone.utc)
interval_time_to_check = datetime.now(timezone.utc) - timedelta(hours=time_interval_to_check)


for pod in pods:
pod_name = pod.metadata.name
namespace = pod.metadata.namespace
restarts = sum(container_status.restart_count for container_status in pod.status.container_statuses) if pod.status.container_statuses else 0
recent_restarts = fetch_restart_events(v1, pod_name, namespace, time_interval_to_check)

# Ensure container_statuses is not None before iterating
container_statuses = pod.status.container_statuses
Expand All @@ -103,19 +111,18 @@ def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=2
container_name = container_status.name
last_state = container_status.last_state
if last_state and last_state.terminated and last_state.terminated.reason == "OOMKilled":
termination_time = last_state.terminated.finished_at
if termination_time:
termination_time = termination_time.replace(tzinfo=timezone.utc)
# If termination time is greater than interval_time_to_check meaning
# the POD has gotten OOMKilled in the last 24 hours and the number of restarts for
# that pod is greater than 10, so lets flag it!
if termination_time >= interval_time_to_check and restarts > restart_threshold:
formatted_termination_time = format_datetime(termination_time)
oom_time = last_state.terminated.finished_at
# If the termination time is at or after interval_time_to_check, the pod was
# OOMKilled within the look-back window; if its recent restart count also
# exceeds restart_threshold, flag it.
if oom_time and oom_time.replace(tzinfo=timezone.utc) >= interval_time_to_check:
if recent_restarts > restart_threshold:
formatted_oom_time = format_datetime(oom_time)
result.append({
"pod": pod_name,
"namespace": namespace,
"container": container_name,
"termination_time": formatted_termination_time,
"restarts": restarts,
"termination_time": formatted_oom_time,
"restarts": recent_restarts
})
return (False, result) if result else (True, None)
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,20 @@ def format_datetime(dt):
# Format datetime to a string 'YYYY-MM-DD HH:MM:SS UTC'
return dt.strftime('%Y-%m-%d %H:%M:%S UTC')

def fetch_restart_events(v1, pod_name, namespace, time_interval):
    """Count restart-related events for one pod inside a recent time window.

    Args:
        v1: Client object exposing
            ``list_namespaced_event(namespace, field_selector=...)``.
        pod_name: Name of the pod the events must reference.
        namespace: Namespace the pod lives in.
        time_interval: Size of the look-back window, in hours.

    Returns:
        The number of returned Event objects whose ``reason`` is "BackOff" or
        "CrashLoopBackOff" and whose ``last_timestamp`` falls within
        ``[now - time_interval hours, now]`` (UTC).
    """
    window_end = datetime.now(timezone.utc)
    window_start = window_end - timedelta(hours=time_interval)
    field_selector = f"involvedObject.name={pod_name},involvedObject.namespace={namespace}"
    event_list = v1.list_namespaced_event(namespace, field_selector=field_selector)

    def _as_utc(stamp):
        # Naive timestamps are assumed to already be UTC. Aware timestamps are
        # converted, not relabelled: replace(tzinfo=...) on an aware value
        # would shift the instant by its UTC offset.
        if stamp.utcoffset() is None:
            return stamp.replace(tzinfo=timezone.utc)
        return stamp.astimezone(timezone.utc)

    restart_reasons = ("BackOff", "CrashLoopBackOff")
    # NOTE(review): this counts Event objects, not occurrences. Kubernetes may
    # fold repeated events into a single Event carrying a `count` field --
    # confirm whether summing `count` is what callers actually want.
    return sum(
        1
        for event in (event_list.items or [])
        if event.reason in restart_reasons
        and event.last_timestamp
        and window_start <= _as_utc(event.last_timestamp) <= window_end
    )

def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_interval_to_check=24, restart_threshold=10) -> Tuple:
"""
k8s_get_pods_in_crashloopbackoff_state returns the pods that have CrashLoopBackOff state in their container statuses within the specified time interval.
Expand Down Expand Up @@ -85,33 +99,33 @@ def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_int
if pods is None:
raise ApiException("No pods returned from the Kubernetes API.")

current_time = datetime.datetime.now(timezone.utc)
interval_time_to_check = current_time - datetime.timedelta(hours=time_interval_to_check)
interval_time_to_check = interval_time_to_check.replace(tzinfo=timezone.utc)
interval_time_to_check = datetime.now(timezone.utc) - timedelta(hours=time_interval_to_check)

for pod in pods:
pod_name = pod.metadata.name
namespace = pod.metadata.namespace
container_statuses = pod.status.container_statuses
if container_statuses is None:
continue
total_restarts = sum(container_status.restart_count for container_status in container_statuses)
recent_restarts = fetch_restart_events(v1, pod_name, namespace, time_interval_to_check)

for container_status in container_statuses:
container_name = container_status.name
if container_status.state and container_status.state.waiting and container_status.state.waiting.reason == "CrashLoopBackOff":
last_transition_time = container_status.last_state.terminated.finished_at
if last_transition_time:
last_transition_time = last_transition_time.replace(tzinfo=timezone.utc)
last_state = container_status.last_state
if last_state and last_state.terminated:
last_transition_time = last_state.terminated.finished_at
# Check if the last transition time to CrashLoopBackOff is within the specified interval
# and the number of restarts are within threshold
if last_transition_time >= interval_time_to_check and total_restarts > restart_threshold:
formatted_transition_time = format_datetime(last_transition_time)
result.append({
"pod": pod_name,
"namespace": namespace,
"container": container_name,
"last_transition_time": formatted_transition_time,
"restarts": total_restarts
})
# and the number of restarts in that interval is greater than the threshold
if last_transition_time and last_transition_time.replace(tzinfo=timezone.utc) >= interval_time_to_check:
if recent_restarts > restart_threshold:
formatted_last_transition_time = format_datetime(last_transition_time)
result.append({
"pod": pod_name,
"namespace": namespace,
"container": container_name,
"termination_time": formatted_last_transition_time,
"restarts": recent_restarts
})

return (False, result) if result else (True, None)

0 comments on commit acebea0

Please sign in to comment.