From 8bbead80176478d8748c9cb0d2991e93d63fdce2 Mon Sep 17 00:00:00 2001 From: Florian Paul Azim Hoberg Date: Wed, 7 Aug 2024 16:12:36 +0200 Subject: [PATCH] feature: Add rolling updates (node auto patching) feature Fixes: #39 --- proxlb | 163 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 133 insertions(+), 30 deletions(-) diff --git a/proxlb b/proxlb index fb92547..9832346 100755 --- a/proxlb +++ b/proxlb @@ -96,7 +96,7 @@ def pre_validations(config_path): logging.info(f'{info_prefix} All pre-validations done.') -def post_validations(): +def post_validations(api_object, node_requires_reboot): """ Run post-validations as sanity checks. """ error_prefix = 'Error: [post-validations]:' info_prefix = 'Info: [post-validations]:' @@ -105,6 +105,8 @@ def post_validations(): logging.critical(f'{error_prefix} Not all post-validations succeeded. Please validate!') else: logging.info(f'{info_prefix} All post-validations succeeded.') + # Reboot node if necessary and all validations were performed. + run_node_reboot(api_object, node_requires_reboot) def validate_daemon(daemon, schedule): @@ -270,6 +272,94 @@ def validate_cluster_master(cluster_master): return True +def get_node_update_status(api_object): + """ Get the current update status of the current executing host node in the cluster. """ + info_prefix = 'Info: [node-update-status-getter]:' + error_prefix = 'Error: [node-update-status-getter]:' + + node_executor_hostname = socket.gethostname() + logging.info(f'{info_prefix} Get update status for node: {node_executor_hostname}.') + + try: + update_status_object = api_object.nodes(node_executor_hostname).apt().update.get() + except proxmoxer.core.ResourceException: + logging.critical(f'{info_prefix} Unknown node in cluster: {node_executor_hostname}.') + sys.exit(2) + + if len(update_status_object) > 0: + logging.info(f'{info_prefix} Updates available for node: {node_executor_hostname}.') + return True + else: + logging.info(f'{info_prefix} No updates available for node: {node_executor_hostname}.') + return False + + +def run_node_update(api_object, node_requires_updates): + """ Run the update execution on node. """ + info_prefix = 'Info: [node-update-executor]:' + error_prefix = 'Error: [node-update-executor]:' + + node_executor_hostname = socket.gethostname() + + if node_requires_updates: + logging.info(f'{info_prefix} Execute updates on node: {node_executor_hostname}.') + try: + update_status_object = api_object.nodes(node_executor_hostname).status().post(command='upgrade') + except proxmoxer.core.ResourceException: + logging.critical(f'{error_prefix} Missing API endpoint on node: {node_executor_hostname}. Please make sure to have the package proxlb-additions installed.') + sys.exit(2) + logging.info(f'{info_prefix} Sucessfully integrated updates to node: {node_executor_hostname}.') + + +def extend_ignore_node_list(ignore_nodes): + """ Extend the node ignore list by this node. """ + info_prefix = 'Info: [node-ignore-list-adder]:' + error_prefix = 'Error: [node-ignore-list-adder]:' + + node_executor_hostname = socket.gethostname() + logging.info(f'{info_prefix} Adding node {node_executor_hostname} to ignore list.') + ignore_nodes = ignore_nodes + f',{node_executor_hostname}' + logging.info(f'{info_prefix} Ignored nodes are now: {ignore_nodes}.') + + return ignore_nodes + + +def get_node_reboot_status(): + """ Get the current reboot status of the current executing host node in the cluster. """ + info_prefix = 'Info: [node-reboot-status-getter]:' + error_prefix = 'Error: [node-reboot-status-getter]:' + reboot_status_file = '/var/run/reboot-required' + + node_executor_hostname = socket.gethostname() + logging.info(f'{info_prefix} Get reboot status for node: {node_executor_hostname}.') + + reboot_status_object = os.path.exists(reboot_status_file) + + if reboot_status_object: + logging.info(f'{info_prefix} Reboot required for node: {node_executor_hostname}.') + return True + else: + logging.info(f'{info_prefix} No reboot required for node: {node_executor_hostname}.') + return False + + +def run_node_reboot(api_object, node_requires_reboot): + """ Run the update execution on node. """ + info_prefix = 'Info: [node-reboot-executor]:' + error_prefix = 'Error: [node-reboot-executor]:' + + node_executor_hostname = socket.gethostname() + + if node_requires_reboot: + logging.info(f'{info_prefix} Execute reboot on node: {node_executor_hostname}.') + try: + update_status_object = api_object.nodes(node_executor_hostname).status().post(command='reboot') + except proxmoxer.core.ResourceException: + logging.critical(f'{error_prefix} Missing API endpoint on node: {node_executor_hostname}. Please make sure to have the package proxlb-additions installed.') + sys.exit(2) + logging.info(f'{info_prefix} Rebooting node now: {node_executor_hostname}.') + + def get_node_statistics(api_object, ignore_nodes): """ Get statistics of cpu, memory and disk for each node in the cluster. """ info_prefix = 'Info: [node-statistics]:' @@ -404,27 +494,29 @@ def get_vm_statistics(api_object, ignore_vms, balancing_type): return vm_statistics -def update_node_statistics(node_statistics, vm_statistics): +def update_node_statistics(node_statistics, vm_statistics, ignore_nodes): """ Update node statistics by VMs statistics. """ - info_prefix = 'Info: [node-update-statistics]:' - warn_prefix = 'Warning: [node-update-statistics]:' + info_prefix = 'Info: [node-update-statistics]:' + warn_prefix = 'Warning: [node-update-statistics]:' + ignore_nodes_list = ignore_nodes.split(',') for vm, vm_value in vm_statistics.items(): - node_statistics[vm_value['node_parent']]['cpu_assigned'] = node_statistics[vm_value['node_parent']]['cpu_assigned'] + int(vm_value['cpu_total']) - node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] = (node_statistics[vm_value['node_parent']]['cpu_assigned'] / node_statistics[vm_value['node_parent']]['cpu_total']) * 100 - node_statistics[vm_value['node_parent']]['memory_assigned'] = node_statistics[vm_value['node_parent']]['memory_assigned'] + int(vm_value['memory_total']) - node_statistics[vm_value['node_parent']]['memory_assigned_percent'] = (node_statistics[vm_value['node_parent']]['memory_assigned'] / node_statistics[vm_value['node_parent']]['memory_total']) * 100 - node_statistics[vm_value['node_parent']]['disk_assigned'] = node_statistics[vm_value['node_parent']]['disk_assigned'] + int(vm_value['disk_total']) - node_statistics[vm_value['node_parent']]['disk_assigned_percent'] = (node_statistics[vm_value['node_parent']]['disk_assigned'] / node_statistics[vm_value['node_parent']]['disk_total']) * 100 + if not vm_value['node_parent'] in ignore_nodes_list: + node_statistics[vm_value['node_parent']]['cpu_assigned'] = node_statistics[vm_value['node_parent']]['cpu_assigned'] + int(vm_value['cpu_total']) + node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] = (node_statistics[vm_value['node_parent']]['cpu_assigned'] / node_statistics[vm_value['node_parent']]['cpu_total']) * 100 + node_statistics[vm_value['node_parent']]['memory_assigned'] = node_statistics[vm_value['node_parent']]['memory_assigned'] + int(vm_value['memory_total']) + node_statistics[vm_value['node_parent']]['memory_assigned_percent'] = (node_statistics[vm_value['node_parent']]['memory_assigned'] / node_statistics[vm_value['node_parent']]['memory_total']) * 100 + node_statistics[vm_value['node_parent']]['disk_assigned'] = node_statistics[vm_value['node_parent']]['disk_assigned'] + int(vm_value['disk_total']) + node_statistics[vm_value['node_parent']]['disk_assigned_percent'] = (node_statistics[vm_value['node_parent']]['disk_assigned'] / node_statistics[vm_value['node_parent']]['disk_total']) * 100 - if node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] > 99: - logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for CPU by {int(node_statistics[vm_value["node_parent"]]["cpu_assigned_percent"])}%.') + if node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] > 99: + logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for CPU by {int(node_statistics[vm_value["node_parent"]]["cpu_assigned_percent"])}%.') - if node_statistics[vm_value['node_parent']]['memory_assigned_percent'] > 99: - logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for memory by {int(node_statistics[vm_value["node_parent"]]["memory_assigned_percent"])}%.') + if node_statistics[vm_value['node_parent']]['memory_assigned_percent'] > 99: + logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for memory by {int(node_statistics[vm_value["node_parent"]]["memory_assigned_percent"])}%.') - if node_statistics[vm_value['node_parent']]['disk_assigned_percent'] > 99: - logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for disk by {int(node_statistics[vm_value["node_parent"]]["disk_assigned_percent"])}%.') + if node_statistics[vm_value['node_parent']]['disk_assigned_percent'] > 99: + logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for disk by {int(node_statistics[vm_value["node_parent"]]["disk_assigned_percent"])}%.') logging.info(f'{info_prefix} Updated node resource assignments by all VMs.') logging.debug('node_statistics') @@ -484,7 +576,7 @@ def __get_proxlb_groups(vm_tags): return group_include, group_exclude, vm_ignore -def balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, rebalance, processed_vms): +def balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, ignore_nodes, rebalance, processed_vms): """ Calculate re-balancing of VMs on present nodes across the cluster. """ info_prefix = 'Info: [rebalancing-calculator]:' @@ -501,14 +593,14 @@ def balancing_calculations(balancing_method, balancing_mode, balancing_mode_opti # Update resource statistics for VMs and nodes. node_statistics, vm_statistics = __update_resource_statistics(resources_vm_most_used, resources_node_most_free, - vm_statistics, node_statistics, balancing_method, balancing_mode) + vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes) # Start recursion until we do not have any needs to rebalance anymore. - balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, rebalance, processed_vms) + balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, ignore_nodes, rebalance, processed_vms) # Honour groupings for include and exclude groups for rebalancing VMs. - node_statistics, vm_statistics = __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode) - node_statistics, vm_statistics = __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode) + node_statistics, vm_statistics = __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes) + node_statistics, vm_statistics = __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes) # Remove VMs that are not being relocated. vms_to_remove = [vm_name for vm_name, vm_info in vm_statistics.items() if 'node_rebalance' in vm_info and vm_info['node_rebalance'] == vm_info.get('node_parent')] @@ -632,11 +724,12 @@ def __get_most_free_resources_node(balancing_method, balancing_mode, balancing_m return node -def __update_resource_statistics(resource_highest_used_resources_vm, resource_highest_free_resources_node, vm_statistics, node_statistics, balancing_method, balancing_mode): +def __update_resource_statistics(resource_highest_used_resources_vm, resource_highest_free_resources_node, vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes): """ Update VM and node resource statistics. """ info_prefix = 'Info: [rebalancing-resource-statistics-update]:' + ignore_nodes_list = ignore_nodes.split(',') - if resource_highest_used_resources_vm[1]['node_parent'] != resource_highest_free_resources_node[0]: + if resource_highest_used_resources_vm[1]['node_parent'] != resource_highest_free_resources_node[0] and resource_highest_used_resources_vm[1]['node_parent'] not in ignore_nodes_list: vm_name = resource_highest_used_resources_vm[0] vm_node_parent = resource_highest_used_resources_vm[1]['node_parent'] vm_node_rebalance = resource_highest_free_resources_node[0] @@ -668,7 +761,7 @@ def __update_resource_statistics(resource_highest_used_resources_vm, resource_hi return node_statistics, vm_statistics -def __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode): +def __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes): """ Get VMs tags for include groups. """ info_prefix = 'Info: [rebalancing-tags-group-include]:' tags_include_vms = {} @@ -697,13 +790,13 @@ def __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_metho vm_node_rebalance = vm_statistics[vm_name]['node_rebalance'] else: _mocked_vm_object = (vm_name, vm_statistics[vm_name]) - node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [vm_node_rebalance], vm_statistics, node_statistics, balancing_method, balancing_mode) + node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [vm_node_rebalance], vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes) processed_vm.append(vm_name) return node_statistics, vm_statistics -def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode): +def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes): """ Get VMs tags for exclude groups. """ info_prefix = 'Info: [rebalancing-tags-group-exclude]:' tags_exclude_vms = {} @@ -736,7 +829,7 @@ def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_metho random_node = random.choice(list(node_statistics.keys())) else: _mocked_vm_object = (vm_name, vm_statistics[vm_name]) - node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [random_node], vm_statistics, node_statistics, balancing_method, balancing_mode) + node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [random_node], vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes) processed_vm.append(vm_name) return node_statistics, vm_statistics @@ -891,20 +984,30 @@ def main(): validate_daemon(daemon, schedule) continue + # Validate for node auto update in cluster for rolling updates. + # Note: This requires proxlb-additions with a patched Proxmox API! + #rolling_updates = 1 + if bool(int(rolling_updates)): + node_requires_updates = get_node_update_status(api_object) + run_node_update(api_object, node_requires_updates) + node_requires_reboot = get_node_reboot_status() + if node_requires_reboot: + ignore_nodes = extend_ignore_node_list(ignore_nodes) + # Get metric & statistics for vms and nodes. node_statistics = get_node_statistics(api_object, ignore_nodes) vm_statistics = get_vm_statistics(api_object, ignore_vms, balancing_type) - node_statistics = update_node_statistics(node_statistics, vm_statistics) + node_statistics = update_node_statistics(node_statistics, vm_statistics, ignore_nodes) # Calculate rebalancing of vms. node_statistics_rebalanced, vm_statistics_rebalanced = balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, - node_statistics, vm_statistics, balanciness, rebalance=False, processed_vms=[]) + node_statistics, vm_statistics, balanciness, ignore_nodes, rebalance=False, processed_vms=[]) # Rebalance vms to new nodes within the cluster. run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args, parallel_migrations) # Validate for any errors. - post_validations() + post_validations(api_object, node_requires_reboot) # Validate daemon service. validate_daemon(daemon, schedule)