feature: Add rolling updates (node auto patching) feature
Fixes: #39
gyptazy committed Aug 7, 2024
1 parent 101855b commit 8bbead8
Showing 1 changed file with 133 additions and 30 deletions.
163 changes: 133 additions & 30 deletions proxlb
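A minimal sketch of how the new rolling-updates path hooks into the balancing run, condensed from the diff below. The rolling_updates option is assumed to be read from the proxlb configuration (its parsing is not part of this hunk), and the node_requires_reboot initialisation is only shown so the final post_validations() call also works when the feature is disabled:

node_requires_reboot = False
if bool(int(rolling_updates)):
    node_requires_updates = get_node_update_status(api_object)  # pending apt updates on this node?
    run_node_update(api_object, node_requires_updates)          # install them (requires proxlb-additions)
    node_requires_reboot = get_node_reboot_status()             # /var/run/reboot-required present?
    if node_requires_reboot:
        ignore_nodes = extend_ignore_node_list(ignore_nodes)    # keep new guests off this node
# ... regular balancing run ...
post_validations(api_object, node_requires_reboot)              # reboots the node once all checks passed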
@@ -96,7 +96,7 @@ def pre_validations(config_path):
logging.info(f'{info_prefix} All pre-validations done.')


def post_validations():
def post_validations(api_object, node_requires_reboot):
""" Run post-validations as sanity checks. """
error_prefix = 'Error: [post-validations]:'
info_prefix = 'Info: [post-validations]:'
@@ -105,6 +105,8 @@ def post_validations():
logging.critical(f'{error_prefix} Not all post-validations succeeded. Please validate!')
else:
logging.info(f'{info_prefix} All post-validations succeeded.')
# Reboot the node if required, once all validations have been performed.
run_node_reboot(api_object, node_requires_reboot)


def validate_daemon(daemon, schedule):
@@ -270,6 +272,94 @@ def validate_cluster_master(cluster_master):
return True


def get_node_update_status(api_object):
""" Get the current update status of the current executing host node in the cluster. """
info_prefix = 'Info: [node-update-status-getter]:'
error_prefix = 'Error: [node-update-status-getter]:'

node_executor_hostname = socket.gethostname()
logging.info(f'{info_prefix} Get update status for node: {node_executor_hostname}.')

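# The apt/update endpoint lists the packages with pending updates for the node; an empty list means the node is already up to date.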
try:
update_status_object = api_object.nodes(node_executor_hostname).apt().update.get()
except proxmoxer.core.ResourceException:
logging.critical(f'{error_prefix} Unknown node in cluster: {node_executor_hostname}.')
sys.exit(2)

if len(update_status_object) > 0:
logging.info(f'{info_prefix} Updates available for node: {node_executor_hostname}.')
return True
else:
logging.info(f'{info_prefix} No updates available for node: {node_executor_hostname}.')
return False


def run_node_update(api_object, node_requires_updates):
""" Run the update execution on node. """
info_prefix = 'Info: [node-update-executor]:'
error_prefix = 'Error: [node-update-executor]:'

node_executor_hostname = socket.gethostname()

if node_requires_updates:
logging.info(f'{info_prefix} Execute updates on node: {node_executor_hostname}.')
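# The 'upgrade' command on the node status endpoint is not part of the stock Proxmox API; it is provided by the patched API shipped with proxlb-additions (see the error handling below).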
try:
update_status_object = api_object.nodes(node_executor_hostname).status().post(command='upgrade')
except proxmoxer.core.ResourceException:
logging.critical(f'{error_prefix} Missing API endpoint on node: {node_executor_hostname}. Please make sure to have the package proxlb-additions installed.')
sys.exit(2)
logging.info(f'{info_prefix} Successfully applied updates to node: {node_executor_hostname}.')


def extend_ignore_node_list(ignore_nodes):
""" Extend the node ignore list by this node. """
info_prefix = 'Info: [node-ignore-list-adder]:'
error_prefix = 'Error: [node-ignore-list-adder]:'

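# Nodes on the ignore list are excluded from balancing, so a node that is about to reboot does not receive any new guests.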
node_executor_hostname = socket.gethostname()
logging.info(f'{info_prefix} Adding node {node_executor_hostname} to ignore list.')
ignore_nodes = ignore_nodes + f',{node_executor_hostname}'
logging.info(f'{info_prefix} Ignored nodes are now: {ignore_nodes}.')

return ignore_nodes


def get_node_reboot_status():
""" Get the current reboot status of the current executing host node in the cluster. """
info_prefix = 'Info: [node-reboot-status-getter]:'
error_prefix = 'Error: [node-reboot-status-getter]:'
reboot_status_file = '/var/run/reboot-required'
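# Debian-based systems such as Proxmox VE create this flag file when an installed update requires a reboot.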

node_executor_hostname = socket.gethostname()
logging.info(f'{info_prefix} Get reboot status for node: {node_executor_hostname}.')

reboot_status_object = os.path.exists(reboot_status_file)

if reboot_status_object:
logging.info(f'{info_prefix} Reboot required for node: {node_executor_hostname}.')
return True
else:
logging.info(f'{info_prefix} No reboot required for node: {node_executor_hostname}.')
return False


def run_node_reboot(api_object, node_requires_reboot):
""" Run the update execution on node. """
info_prefix = 'Info: [node-reboot-executor]:'
error_prefix = 'Error: [node-reboot-executor]:'

node_executor_hostname = socket.gethostname()

if node_requires_reboot:
logging.info(f'{info_prefix} Execute reboot on node: {node_executor_hostname}.')
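# Trigger the reboot through the node status endpoint; the node was already added to the ignore list, so the balancer does not schedule guests onto it during the reboot.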
try:
reboot_status_object = api_object.nodes(node_executor_hostname).status().post(command='reboot')
except proxmoxer.core.ResourceException:
logging.critical(f'{error_prefix} Missing API endpoint on node: {node_executor_hostname}. Please make sure to have the package proxlb-additions installed.')
sys.exit(2)
logging.info(f'{info_prefix} Rebooting node now: {node_executor_hostname}.')


def get_node_statistics(api_object, ignore_nodes):
""" Get statistics of cpu, memory and disk for each node in the cluster. """
info_prefix = 'Info: [node-statistics]:'
@@ -404,27 +494,29 @@ def get_vm_statistics(api_object, ignore_vms, balancing_type):
return vm_statistics


def update_node_statistics(node_statistics, vm_statistics):
def update_node_statistics(node_statistics, vm_statistics, ignore_nodes):
""" Update node statistics by VMs statistics. """
info_prefix = 'Info: [node-update-statistics]:'
warn_prefix = 'Warning: [node-update-statistics]:'
info_prefix = 'Info: [node-update-statistics]:'
warn_prefix = 'Warning: [node-update-statistics]:'
ignore_nodes_list = ignore_nodes.split(',')

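# Skip guests whose parent node is on the ignore list; ignored nodes are not considered for balancing.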
for vm, vm_value in vm_statistics.items():
node_statistics[vm_value['node_parent']]['cpu_assigned'] = node_statistics[vm_value['node_parent']]['cpu_assigned'] + int(vm_value['cpu_total'])
node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] = (node_statistics[vm_value['node_parent']]['cpu_assigned'] / node_statistics[vm_value['node_parent']]['cpu_total']) * 100
node_statistics[vm_value['node_parent']]['memory_assigned'] = node_statistics[vm_value['node_parent']]['memory_assigned'] + int(vm_value['memory_total'])
node_statistics[vm_value['node_parent']]['memory_assigned_percent'] = (node_statistics[vm_value['node_parent']]['memory_assigned'] / node_statistics[vm_value['node_parent']]['memory_total']) * 100
node_statistics[vm_value['node_parent']]['disk_assigned'] = node_statistics[vm_value['node_parent']]['disk_assigned'] + int(vm_value['disk_total'])
node_statistics[vm_value['node_parent']]['disk_assigned_percent'] = (node_statistics[vm_value['node_parent']]['disk_assigned'] / node_statistics[vm_value['node_parent']]['disk_total']) * 100
if vm_value['node_parent'] not in ignore_nodes_list:
node_statistics[vm_value['node_parent']]['cpu_assigned'] = node_statistics[vm_value['node_parent']]['cpu_assigned'] + int(vm_value['cpu_total'])
node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] = (node_statistics[vm_value['node_parent']]['cpu_assigned'] / node_statistics[vm_value['node_parent']]['cpu_total']) * 100
node_statistics[vm_value['node_parent']]['memory_assigned'] = node_statistics[vm_value['node_parent']]['memory_assigned'] + int(vm_value['memory_total'])
node_statistics[vm_value['node_parent']]['memory_assigned_percent'] = (node_statistics[vm_value['node_parent']]['memory_assigned'] / node_statistics[vm_value['node_parent']]['memory_total']) * 100
node_statistics[vm_value['node_parent']]['disk_assigned'] = node_statistics[vm_value['node_parent']]['disk_assigned'] + int(vm_value['disk_total'])
node_statistics[vm_value['node_parent']]['disk_assigned_percent'] = (node_statistics[vm_value['node_parent']]['disk_assigned'] / node_statistics[vm_value['node_parent']]['disk_total']) * 100

if node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] > 99:
logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for CPU by {int(node_statistics[vm_value["node_parent"]]["cpu_assigned_percent"])}%.')
if node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] > 99:
logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for CPU by {int(node_statistics[vm_value["node_parent"]]["cpu_assigned_percent"])}%.')

if node_statistics[vm_value['node_parent']]['memory_assigned_percent'] > 99:
logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for memory by {int(node_statistics[vm_value["node_parent"]]["memory_assigned_percent"])}%.')
if node_statistics[vm_value['node_parent']]['memory_assigned_percent'] > 99:
logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for memory by {int(node_statistics[vm_value["node_parent"]]["memory_assigned_percent"])}%.')

if node_statistics[vm_value['node_parent']]['disk_assigned_percent'] > 99:
logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for disk by {int(node_statistics[vm_value["node_parent"]]["disk_assigned_percent"])}%.')
if node_statistics[vm_value['node_parent']]['disk_assigned_percent'] > 99:
logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for disk by {int(node_statistics[vm_value["node_parent"]]["disk_assigned_percent"])}%.')

logging.info(f'{info_prefix} Updated node resource assignments by all VMs.')
logging.debug('node_statistics')
@@ -484,7 +576,7 @@ def __get_proxlb_groups(vm_tags):
return group_include, group_exclude, vm_ignore


def balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, rebalance, processed_vms):
def balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, ignore_nodes, rebalance, processed_vms):
""" Calculate re-balancing of VMs on present nodes across the cluster. """
info_prefix = 'Info: [rebalancing-calculator]:'

@@ -501,14 +593,14 @@ def balancing_calculations(balancing_method, balancing_mode, balancing_mode_opti

# Update resource statistics for VMs and nodes.
node_statistics, vm_statistics = __update_resource_statistics(resources_vm_most_used, resources_node_most_free,
vm_statistics, node_statistics, balancing_method, balancing_mode)
vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)

# Start recursion until we do not have any needs to rebalance anymore.
balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, rebalance, processed_vms)
balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, ignore_nodes, rebalance, processed_vms)

# Honour groupings for include and exclude groups for rebalancing VMs.
node_statistics, vm_statistics = __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode)
node_statistics, vm_statistics = __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode)
node_statistics, vm_statistics = __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)
node_statistics, vm_statistics = __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)

# Remove VMs that are not being relocated.
vms_to_remove = [vm_name for vm_name, vm_info in vm_statistics.items() if 'node_rebalance' in vm_info and vm_info['node_rebalance'] == vm_info.get('node_parent')]
@@ -632,11 +724,12 @@ def __get_most_free_resources_node(balancing_method, balancing_mode, balancing_m
return node


def __update_resource_statistics(resource_highest_used_resources_vm, resource_highest_free_resources_node, vm_statistics, node_statistics, balancing_method, balancing_mode):
def __update_resource_statistics(resource_highest_used_resources_vm, resource_highest_free_resources_node, vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes):
""" Update VM and node resource statistics. """
info_prefix = 'Info: [rebalancing-resource-statistics-update]:'
ignore_nodes_list = ignore_nodes.split(',')

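# Only consider a guest for relocation when its parent node is not on the ignore list and a different target node is available.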
if resource_highest_used_resources_vm[1]['node_parent'] != resource_highest_free_resources_node[0]:
if resource_highest_used_resources_vm[1]['node_parent'] != resource_highest_free_resources_node[0] and resource_highest_used_resources_vm[1]['node_parent'] not in ignore_nodes_list:
vm_name = resource_highest_used_resources_vm[0]
vm_node_parent = resource_highest_used_resources_vm[1]['node_parent']
vm_node_rebalance = resource_highest_free_resources_node[0]
@@ -668,7 +761,7 @@ def __update_resource_statistics(resource_highest_used_resources_vm, resource_hi
return node_statistics, vm_statistics


def __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode):
def __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes):
""" Get VMs tags for include groups. """
info_prefix = 'Info: [rebalancing-tags-group-include]:'
tags_include_vms = {}
@@ -697,13 +790,13 @@ def __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_metho
vm_node_rebalance = vm_statistics[vm_name]['node_rebalance']
else:
_mocked_vm_object = (vm_name, vm_statistics[vm_name])
node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [vm_node_rebalance], vm_statistics, node_statistics, balancing_method, balancing_mode)
node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [vm_node_rebalance], vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)
processed_vm.append(vm_name)

return node_statistics, vm_statistics


def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode):
def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes):
""" Get VMs tags for exclude groups. """
info_prefix = 'Info: [rebalancing-tags-group-exclude]:'
tags_exclude_vms = {}
@@ -736,7 +829,7 @@ def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_metho
random_node = random.choice(list(node_statistics.keys()))
else:
_mocked_vm_object = (vm_name, vm_statistics[vm_name])
node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [random_node], vm_statistics, node_statistics, balancing_method, balancing_mode)
node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [random_node], vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)
processed_vm.append(vm_name)

return node_statistics, vm_statistics
@@ -891,20 +984,30 @@ def main():
validate_daemon(daemon, schedule)
continue

# Check the executing node for pending updates when rolling updates (node auto patching) are enabled.
# Note: This requires proxlb-additions with a patched Proxmox API!
node_requires_reboot = False
if bool(int(rolling_updates)):
node_requires_updates = get_node_update_status(api_object)
run_node_update(api_object, node_requires_updates)
node_requires_reboot = get_node_reboot_status()
if node_requires_reboot:
ignore_nodes = extend_ignore_node_list(ignore_nodes)
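# With rolling updates enabled the flow is: check for updates, install them, check the reboot flag
# and, if a reboot is required, exclude this node from balancing. The reboot itself is deferred to
# post_validations() at the end of the run.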

# Get metric & statistics for vms and nodes.
node_statistics = get_node_statistics(api_object, ignore_nodes)
vm_statistics = get_vm_statistics(api_object, ignore_vms, balancing_type)
node_statistics = update_node_statistics(node_statistics, vm_statistics)
node_statistics = update_node_statistics(node_statistics, vm_statistics, ignore_nodes)

# Calculate rebalancing of vms.
node_statistics_rebalanced, vm_statistics_rebalanced = balancing_calculations(balancing_method, balancing_mode, balancing_mode_option,
node_statistics, vm_statistics, balanciness, rebalance=False, processed_vms=[])
node_statistics, vm_statistics, balanciness, ignore_nodes, rebalance=False, processed_vms=[])

# Rebalance vms to new nodes within the cluster.
run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args, parallel_migrations)

# Validate for any errors.
post_validations()
post_validations(api_object, node_requires_reboot)

# Validate daemon service.
validate_daemon(daemon, schedule)
