Skip to content

Commit

Permalink
Merge branch 'master' into EN-5535-add-time-based-check-for-crashloop…
Browse files Browse the repository at this point in the history
…back-and-pending-pods-check-in-unskript-ctl
  • Loading branch information
jayasimha-raghavan-unskript authored May 29, 2024
2 parents 2a8c2b0 + 84e7a51 commit 6b03acc
Show file tree
Hide file tree
Showing 21 changed files with 124 additions and 138 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def aws_check_ssl_certificate_expiry(handle, threshold_days: int, region: str,)
right_now = datetime.datetime.now(dateutil.tz.tzlocal())
diff = expiry_date-right_now
days_remaining = diff.days
days = 0
if 0 < days_remaining < threshold_days:
days = days_remaining
elif days_remaining < 0:
Expand Down
7 changes: 6 additions & 1 deletion AWS/legos/aws_ebs_modify_volume/aws_ebs_modify_volume.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,15 @@ def aws_ebs_modify_volume(
# Get the current volume size.
Volume = ec2Resource.Volume(volume_id)
currentSize = Volume.size
newSize = None

if resize_option == SizingOption.Add:
newSize = currentSize + resize_value
elif resize_option == SizingOption.Mutiple:
elif resize_option == SizingOption.Multiple:
newSize = currentSize * resize_value
else:
raise ValueError(f"Invalid resize option: {resize_option}")


print(f'CurrentSize {currentSize}, NewSize {newSize}')

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def aws_ecs_detect_failed_deployment(handle, cluster_name: str, service_name: st
return ["Empty deployment"]

deploymentInProgress = False
primaryDeploymentID = ""
for deployment in deployments:
if deployment['status'] == "PRIMARY":
primaryDeploymentID = deployment['id']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,38 +27,33 @@ def elasticsearch_check_health_status(handle, unassigned_shards:int = 20) -> Tup
:rtype: Result Tuple of result
"""
cluster_health = {}
output = handle.web_request("/_cluster/health?pretty", "GET", None)


# Early return if cluster status is green
if output['status'] == 'green':
return (True, None)

cluster_health['cluster_name'] = output['cluster_name']
cluster_health['status'] = output['status']

# Additional checks for both red and yellow statuses
additional_details = False

cluster_health = {
"cluster_name": output['cluster_name'],
"status": output['status'],
"unassigned_shards": output['unassigned_shards']
}

# Check for significant health issues
if output['unassigned_shards'] > unassigned_shards:
cluster_health['unassigned_shards'] = output['unassigned_shards']
additional_details = True

if output['delayed_unassigned_shards'] > 0:
cluster_health['delayed_unassigned_shards'] = output['delayed_unassigned_shards']
additional_details = True

if output['initializing_shards'] > 0 or output['relocating_shards'] > 0:
cluster_health['initializing_shards'] = output['initializing_shards']
cluster_health['relocating_shards'] = output['relocating_shards']
additional_details = True

if output['number_of_nodes'] != output['number_of_data_nodes']:
cluster_health['number_of_nodes'] = output['number_of_nodes']
cluster_health['number_of_data_nodes'] = output['number_of_data_nodes']
additional_details = True

# If status is red, return the result immediately
if output['status'] == 'red' or (output['status'] == 'yellow' and additional_details):
return (False, [cluster_health]) # Return immediately if unassigned shards exceed the threshold

# Additional checks for severe conditions
if output['status'] == 'red' or output['delayed_unassigned_shards'] > 0 or output['initializing_shards'] > 0 or output['relocating_shards'] > 0 or output['number_of_nodes'] != output['number_of_data_nodes']:
additional_details = {
"delayed_unassigned_shards": output['delayed_unassigned_shards'],
"initializing_shards": output['initializing_shards'],
"relocating_shards": output['relocating_shards'],
"number_of_nodes": output['number_of_nodes'],
"number_of_data_nodes": output['number_of_data_nodes']
}
cluster_health.update(additional_details)
return (False, [cluster_health])

# If status is yellow but no additional conditions are met, return as healthy
# If status is yellow but no additional critical issues, consider it healthy
return (True, None)
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@ def elasticsearch_compare_cluster_disk_size_to_threshold(handle, threshold: floa
# Split the lines and skip the header
lines = allocation_output.splitlines()[1:]

result = []
# Find the max disk percentage from the lines, considering only assigned nodes
max_disk_percent = max(float(line.split()[5]) for line in lines if "UNASSIGNED" not in line)

if max_disk_percent > threshold:
result.append({"usage_disk_percentage": max_disk_percent, "threshold": threshold})

if len(result) != 0:
return (False, result)
# Calculate the max disk percentage from the lines, considering only assigned nodes
max_disk_percent = 0 # Initialize to 0 or an appropriately low number
for line in lines:
if "UNASSIGNED" not in line:
disk_usage = float(line.split()[5])
max_disk_percent = max(max_disk_percent, disk_usage)
if max_disk_percent > threshold:
result = [{"usage_disk_percentage": max_disk_percent, "threshold": threshold}]
return (False, result)
return (True, None)

Original file line number Diff line number Diff line change
Expand Up @@ -51,59 +51,29 @@ def elasticsearch_get_index_health(handle, index_name="") -> Tuple:
:return: A list of dictionaries where each dictionary contains stats about each index
"""
try:
indices_output = []
# If no specific index is provided, get all indices
if len(index_name)==0 :
indices_output = handle.web_request("/_cat/indices?h=index", "GET", None)
indices_output = ''.join(indices_output).split('\n')
# If a specific index is provided, only consider that index
if index_name:
# Fetch specific index health
health_url = f"/_cat/indices/{index_name}?v&h=index,status&format=json"
health_response = handle.web_request(health_url, "GET", None)
index_stats = [health_response[0]] if health_response and 'error' not in health_response else []
else:
indices_output.append(index_name)

all_indices_stats = []

for current_index in indices_output:
# Skip system indices
if current_index.startswith('.'):
continue
index_stats = {}

# Get settings for the current index
settings_output = handle.web_request(f"/{current_index}/_settings", "GET", None)
if "error" in settings_output:
print(f"Error for settings of {current_index}")
continue

# Get stats for the current index
stats_output = handle.web_request(f"/{current_index}/_stats", "GET", None)
if "error" in stats_output:
print(f"Error for stats of {current_index}")
continue

# Get any tasks associated with the current index
tasks_output = handle.web_request(f"/_tasks?actions=*{current_index}*&detailed", "GET", None)

# Get health of the current index
health_output = handle.web_request(f"/_cat/indices/{current_index}?format=json", "GET", None)
if "error" in health_output:
print(f"Error for health of {current_index}")
continue

if settings_output:
settings = settings_output.get(current_index, {}).get('settings', {}).get('index', {})
else:
settings = {}
# Consolidate stats for the current index
if health_output[0]['health'] in ['yellow', 'red']:
index_stats = {
**health_output[0],
'settings': settings,
'stats': stats_output['_all']['total'],
'tasks': tasks_output['nodes']
}
all_indices_stats.append(index_stats)
# Fetches all indices health; skips empty lines and system indices
health_url = "/_cat/indices?v&h=index,status&format=json"
health_response = handle.web_request(health_url, "GET", None)
index_stats = [idx for idx in health_response if not idx['index'].startswith('.')] if health_response and 'error' not in health_response else []

if not index_stats:
print(f"No indices found or error retrieving indices: {health_response.get('error', 'No response') if health_response else 'No data'}")
return (True, None)

all_indices_stats = [
{"index": idx['index'], "status": idx['status']}
for idx in index_stats
]

except Exception as e:
raise e
if len(all_indices_stats)!=0:
return (False, all_indices_stats)
return (True, None)
print(f"Error processing index health: {str(e)}")
return (False, [])

return (False, all_indices_stats) if all_indices_stats else (True, None)

1 change: 1 addition & 0 deletions Github/legos/github_create_team/github_create_team.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def github_create_team(
team_details = {}
repo_names =[]
list_of_repos = ''
privacy_settings = ''
if privacy is None or len(privacy)==0:
privacy_settings = "secret"
organization = handle.get_organization(organization_name)
Expand Down
1 change: 1 addition & 0 deletions Github/legos/github_delete_branch/github_delete_branch.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def github_delete_branch(handle, owner:str, repository: str, branch_name: str)->
:rtype: Deleted branch info
"""
flag_to_check_branch = 0
try:
user = handle.get_user(login=owner)
repo_name = user.login+"/"+repository
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def github_remove_member_from_org(handle, organization_name:str, username:str)->
:rtype: List of return status of removing a member from Org
"""
result = ""
organization = handle.get_organization(organization_name)
try:
user = handle.get_user(username)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,17 @@
from typing import Tuple, Optional
from pydantic import BaseModel, Field
from croniter import croniter
from datetime import datetime, timezone, timedelta
import json


class InputSchema(BaseModel):
namespace: Optional[str] = Field(..., description='k8s Namespace', title='Namespace')
time_interval_to_check: int = Field(
24,
description='Time interval in hours. This time window is used to check if pod in a cronjob was in Pending state. Default is 24 hours.',
title="Time Interval"
)


def k8s_check_cronjob_pod_status_printer(output):
Expand All @@ -20,10 +26,12 @@ def k8s_check_cronjob_pod_status_printer(output):
print("CronJobs are running as expected.")
else:
for issue in issues:
print(f"CronJob '{issue['cronjob_name']}' Alert: {issue['message']}")
print(f"CronJob '{issue['cronjob_name']}' in namespace '{issue['namespace']}' has issues.")

def format_datetime(dt):
return dt.strftime("%Y-%m-%d %H:%M:%S %Z")

def k8s_check_cronjob_pod_status(handle, namespace: str='') -> Tuple:
def k8s_check_cronjob_pod_status(handle, namespace: str='', time_interval_to_check=24) -> Tuple:
"""
Checks the status of the CronJob pods.
Expand All @@ -39,7 +47,10 @@ def k8s_check_cronjob_pod_status(handle, namespace: str='') -> Tuple:
batch_v1 = client.BatchV1Api(api_client=handle)
core_v1 = client.CoreV1Api(api_client=handle)

issues = {"NotAssociated": [], "Pending": [], "UnexpectedState": []}
issues = []
current_time = datetime.now(timezone.utc)
interval_time_to_check = current_time - timedelta(hours=time_interval_to_check)
interval_time_to_check = interval_time_to_check.replace(tzinfo=timezone.utc)

# Get namespaces to check
if namespace:
Expand Down Expand Up @@ -68,34 +79,35 @@ def k8s_check_cronjob_pod_status(handle, namespace: str='') -> Tuple:
continue
cronjob = json.loads(response.stdout)

schedule = cronjob['spec']['schedule']

# Calculate the next expected run
now = datetime.now(timezone.utc)
iter = croniter(schedule, now)
next_run = iter.get_next(datetime)
time_to_next_run = next_run - now

# Fetch the most recent Job associated with the CronJob
jobs = batch_v1.list_namespaced_job(ns) # Fetch all jobs, and then filter by prefix.

associated_jobs = [job for job in jobs.items if job.metadata.name.startswith(cronjob['metadata']['name'])]
if not associated_jobs:
# If no associated jobs, that means the job is not scheduled.
return (True, None)
continue

latest_job = sorted(associated_jobs, key=lambda x: x.status.start_time, reverse=True)[0]

# Check job's pods for any issues
pods = core_v1.list_namespaced_pod(ns, label_selector=f"job-name={latest_job.metadata.name}")

for pod in pods.items:
if pod.status.phase == 'Pending' and now - pod.status.start_time > time_to_next_run:
issues["Pending"].append({"pod_name": pod.metadata.name, "namespace": ns})
elif pod.status.phase not in ['Running', 'Succeeded']:
issues["UnexpectedState"].append({"pod_name": pod.metadata.name, "namespace": ns, "state": pod.status.phase})

if all(not val for val in issues.values()):
return (True, None)
else:
return (False, issues)
if pod.status.phase == 'Pending':
start_time = pod.status.start_time
if start_time and start_time >= interval_time_to_check:
issues.append({
"cronjob_name": cronjob_name,
"namespace": ns,
"pod_name": pod.metadata.name,
"start_time": format_datetime(start_time)
})
break
elif pod.status.phase not in ['Running', 'Succeeded','Completed']:
issues.append({
"cronjob_name": cronjob_name,
"namespace": ns,
"pod_name": pod.metadata.name,
"state": pod.status.phase
})
break

return (not issues, issues if issues else None)
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,25 @@ def k8s_check_worker_cpu_utilization(handle, threshold: float=70.0) -> Tuple:
if response is None or response.stderr:
raise Exception(f"Error while executing command ({kubectl_command}): {response.stderr if response else 'empty response'}")

lines = response.stdout.split('\n')
# Ensure response.stdout is processed only once and correctly
lines = response.stdout.strip().split('\n')
seen_nodes = set() # Keep track of nodes that have already been processed

for line in lines:
parts = line.split()
if len(parts) < 5: # Check for correct line format
continue
node_name, cpu_percentage_str = parts[0], parts[2]
cpu_percentage = float(cpu_percentage_str.rstrip('%'))
node_name, cpu_percentage_str = parts[0], parts[2].rstrip('%')
if node_name in seen_nodes:
print(f"Duplicate entry detected for node {node_name}, skipping.")
continue
seen_nodes.add(node_name)

cpu_percentage = float(cpu_percentage_str)
if cpu_percentage > threshold:
exceeding_nodes.append({"node": node_name, "cpu": cpu_percentage})

if len(exceeding_nodes) != 0:
if exceeding_nodes:
return (False, exceeding_nodes)
return (True, None)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def k8s_get_error_pods_from_all_jobs(handle, namespace:str="") -> Tuple:

if response.stderr:
raise Exception(f"Error occurred while executing command {kubectl_cmd}: {response.stderr}")

jobs = {}
try:
if response.stdout:
jobs = json.loads(response.stdout)
Expand All @@ -57,6 +57,7 @@ def k8s_get_error_pods_from_all_jobs(handle, namespace:str="") -> Tuple:
if pod_response.stderr:
print(f"Error occurred while fetching pods for job {job_name}: {pod_response.stderr}")
continue
pods = {}
try:
if response.stdout:
pods = json.loads(pod_response.stdout)
Expand Down
Loading

0 comments on commit 6b03acc

Please sign in to comment.