From 8bbead80176478d8748c9cb0d2991e93d63fdce2 Mon Sep 17 00:00:00 2001
From: Florian Paul Azim Hoberg <gyptazy@gyptazy.ch>
Date: Wed, 7 Aug 2024 16:12:36 +0200
Subject: [PATCH] feature: Add rolling updates (node auto patching) feature

Fixes: #39
---
 proxlb | 163 ++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 133 insertions(+), 30 deletions(-)

diff --git a/proxlb b/proxlb
index fb92547..9832346 100755
--- a/proxlb
+++ b/proxlb
@@ -96,7 +96,7 @@ def pre_validations(config_path):
     logging.info(f'{info_prefix} All pre-validations done.')
 
 
-def post_validations():
+def post_validations(api_object, node_requires_reboot):
     """ Run post-validations as sanity checks. """
     error_prefix = 'Error: [post-validations]:'
     info_prefix  = 'Info: [post-validations]:'
@@ -105,6 +105,8 @@ def post_validations():
         logging.critical(f'{error_prefix} Not all post-validations succeeded. Please validate!')
     else:
         logging.info(f'{info_prefix} All post-validations succeeded.')
+        # Reboot node if necessary and all validations were performed.
+        run_node_reboot(api_object, node_requires_reboot)
 
 
 def validate_daemon(daemon, schedule):
@@ -270,6 +272,94 @@ def validate_cluster_master(cluster_master):
         return True
 
 
+def get_node_update_status(api_object):
+    """ Get the current update status of the current executing host node in the cluster.
+
+    Returns True if the apt update endpoint of the node named like the local hostname
+    reports any pending update, else False (empty or None counts as no updates). Logs a
+    critical error and exits with code 2 if the API raises a ResourceException.
+    """
+    info_prefix   = 'Info: [node-update-status-getter]:'
+    error_prefix  = 'Error: [node-update-status-getter]:'
+    node_executor_hostname = socket.gethostname()
+    logging.info(f'{info_prefix} Get update status for node: {node_executor_hostname}.')
+
+    try:
+        update_status_object = api_object.nodes(node_executor_hostname).apt().update.get()
+    except proxmoxer.core.ResourceException:
+        logging.critical(f'{error_prefix} Unknown node in cluster: {node_executor_hostname}.')
+        sys.exit(2)
+    update_state = 'Updates' if update_status_object else 'No updates'
+    logging.info(f'{info_prefix} {update_state} available for node: {node_executor_hostname}.')
+    return bool(update_status_object)
+
+
+def run_node_update(api_object, node_requires_updates):
+    """ Run the update execution on node.
+    Posts the 'upgrade' command to the status endpoint of the node named like the local hostname
+    when node_requires_updates is truthy; exits with code 2 if the API raises a ResourceException. """
+    info_prefix   = 'Info: [node-update-executor]:'
+    error_prefix  = 'Error: [node-update-executor]:'
+    node_executor_hostname = socket.gethostname()
+    if node_requires_updates:
+        logging.info(f'{info_prefix} Execute updates on node: {node_executor_hostname}.')
+        try:
+            api_object.nodes(node_executor_hostname).status().post(command='upgrade')
+        except proxmoxer.core.ResourceException:
+            logging.critical(f'{error_prefix} Missing API endpoint on node: {node_executor_hostname}. Please make sure to have the package proxlb-additions installed.')
+            sys.exit(2)
+        logging.info(f'{info_prefix} Successfully integrated updates to node: {node_executor_hostname}.')
+
+
+def extend_ignore_node_list(ignore_nodes):
+    """ Extend the node ignore list by this node.
+    Takes the comma separated ignore list (may be empty or None) and returns it with the
+    local hostname appended; an empty list yields just the hostname without a leading comma. """
+    info_prefix   = 'Info: [node-ignore-list-adder]:'
+    node_executor_hostname = socket.gethostname()
+    logging.info(f'{info_prefix} Adding node {node_executor_hostname} to ignore list.')
+    ignore_nodes = f'{ignore_nodes},{node_executor_hostname}' if ignore_nodes else node_executor_hostname
+    logging.info(f'{info_prefix} Ignored nodes are now: {ignore_nodes}.')
+
+    return ignore_nodes
+
+
+def get_node_reboot_status():
+    """ Report whether the node executing this script is flagged for a reboot.
+
+    Returns True when the marker file /var/run/reboot-required exists, else False.
+    """
+    info_prefix  = 'Info: [node-reboot-status-getter]:'
+    error_prefix = 'Error: [node-reboot-status-getter]:'
+    node_executor_hostname = socket.gethostname()
+    logging.info(f'{info_prefix} Get reboot status for node: {node_executor_hostname}.')
+
+    # Presence of the marker file is the only signal checked here.
+    if not os.path.exists('/var/run/reboot-required'):
+        logging.info(f'{info_prefix} No reboot required for node: {node_executor_hostname}.')
+        return False
+
+    logging.info(f'{info_prefix} Reboot required for node: {node_executor_hostname}.')
+    return True
+
+
+def run_node_reboot(api_object, node_requires_reboot):
+    """ Run the reboot execution on node.
+    Posts the 'reboot' command to the status endpoint of the node named like the local hostname
+    when node_requires_reboot is truthy; exits with code 2 if the API raises a ResourceException. """
+    info_prefix   = 'Info: [node-reboot-executor]:'
+    error_prefix  = 'Error: [node-reboot-executor]:'
+    node_executor_hostname = socket.gethostname()
+    if node_requires_reboot:
+        logging.info(f'{info_prefix} Execute reboot on node: {node_executor_hostname}.')
+        try:
+            api_object.nodes(node_executor_hostname).status().post(command='reboot')
+        except proxmoxer.core.ResourceException:
+            logging.critical(f'{error_prefix} Missing API endpoint on node: {node_executor_hostname}. Please make sure to have the package proxlb-additions installed.')
+            sys.exit(2)
+        logging.info(f'{info_prefix} Rebooting node now: {node_executor_hostname}.')
+
+
 def get_node_statistics(api_object, ignore_nodes):
     """ Get statistics of cpu, memory and disk for each node in the cluster. """
     info_prefix       = 'Info: [node-statistics]:'
@@ -404,27 +494,29 @@ def get_vm_statistics(api_object, ignore_vms, balancing_type):
     return vm_statistics
 
 
-def update_node_statistics(node_statistics, vm_statistics):
+def update_node_statistics(node_statistics, vm_statistics, ignore_nodes):
     """ Update node statistics by VMs statistics. """
-    info_prefix = 'Info: [node-update-statistics]:'
-    warn_prefix = 'Warning: [node-update-statistics]:'
+    info_prefix       = 'Info: [node-update-statistics]:'
+    warn_prefix       = 'Warning: [node-update-statistics]:'
+    ignore_nodes_list =  ignore_nodes.split(',')
 
     for vm, vm_value in vm_statistics.items():
-        node_statistics[vm_value['node_parent']]['cpu_assigned']            = node_statistics[vm_value['node_parent']]['cpu_assigned'] + int(vm_value['cpu_total'])
-        node_statistics[vm_value['node_parent']]['cpu_assigned_percent']    = (node_statistics[vm_value['node_parent']]['cpu_assigned'] / node_statistics[vm_value['node_parent']]['cpu_total']) * 100
-        node_statistics[vm_value['node_parent']]['memory_assigned']         = node_statistics[vm_value['node_parent']]['memory_assigned'] + int(vm_value['memory_total'])
-        node_statistics[vm_value['node_parent']]['memory_assigned_percent'] = (node_statistics[vm_value['node_parent']]['memory_assigned'] / node_statistics[vm_value['node_parent']]['memory_total']) * 100
-        node_statistics[vm_value['node_parent']]['disk_assigned']           = node_statistics[vm_value['node_parent']]['disk_assigned'] + int(vm_value['disk_total'])
-        node_statistics[vm_value['node_parent']]['disk_assigned_percent']   = (node_statistics[vm_value['node_parent']]['disk_assigned'] / node_statistics[vm_value['node_parent']]['disk_total']) * 100
+        if not vm_value['node_parent'] in ignore_nodes_list:
+            node_statistics[vm_value['node_parent']]['cpu_assigned']            = node_statistics[vm_value['node_parent']]['cpu_assigned'] + int(vm_value['cpu_total'])
+            node_statistics[vm_value['node_parent']]['cpu_assigned_percent']    = (node_statistics[vm_value['node_parent']]['cpu_assigned'] / node_statistics[vm_value['node_parent']]['cpu_total']) * 100
+            node_statistics[vm_value['node_parent']]['memory_assigned']         = node_statistics[vm_value['node_parent']]['memory_assigned'] + int(vm_value['memory_total'])
+            node_statistics[vm_value['node_parent']]['memory_assigned_percent'] = (node_statistics[vm_value['node_parent']]['memory_assigned'] / node_statistics[vm_value['node_parent']]['memory_total']) * 100
+            node_statistics[vm_value['node_parent']]['disk_assigned']           = node_statistics[vm_value['node_parent']]['disk_assigned'] + int(vm_value['disk_total'])
+            node_statistics[vm_value['node_parent']]['disk_assigned_percent']   = (node_statistics[vm_value['node_parent']]['disk_assigned'] / node_statistics[vm_value['node_parent']]['disk_total']) * 100
 
-        if node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] > 99:
-            logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for CPU by {int(node_statistics[vm_value["node_parent"]]["cpu_assigned_percent"])}%.')
+            if node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] > 99:
+                logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for CPU by {int(node_statistics[vm_value["node_parent"]]["cpu_assigned_percent"])}%.')
 
-        if node_statistics[vm_value['node_parent']]['memory_assigned_percent'] > 99:
-            logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for memory by {int(node_statistics[vm_value["node_parent"]]["memory_assigned_percent"])}%.')
+            if node_statistics[vm_value['node_parent']]['memory_assigned_percent'] > 99:
+                logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for memory by {int(node_statistics[vm_value["node_parent"]]["memory_assigned_percent"])}%.')
 
-        if node_statistics[vm_value['node_parent']]['disk_assigned_percent'] > 99:
-            logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for disk by {int(node_statistics[vm_value["node_parent"]]["disk_assigned_percent"])}%.')
+            if node_statistics[vm_value['node_parent']]['disk_assigned_percent'] > 99:
+                logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for disk by {int(node_statistics[vm_value["node_parent"]]["disk_assigned_percent"])}%.')
 
     logging.info(f'{info_prefix} Updated node resource assignments by all VMs.')
     logging.debug('node_statistics')
@@ -484,7 +576,7 @@ def __get_proxlb_groups(vm_tags):
     return group_include, group_exclude, vm_ignore
 
 
-def balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, rebalance, processed_vms):
+def balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, ignore_nodes, rebalance, processed_vms):
     """ Calculate re-balancing of VMs on present nodes across the cluster. """
     info_prefix  = 'Info: [rebalancing-calculator]:'
 
@@ -501,14 +593,14 @@ def balancing_calculations(balancing_method, balancing_mode, balancing_mode_opti
 
         # Update resource statistics for VMs and nodes.
         node_statistics, vm_statistics         = __update_resource_statistics(resources_vm_most_used, resources_node_most_free,
-                                                                             vm_statistics, node_statistics, balancing_method, balancing_mode)
+                                                                             vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)
 
         # Start recursion until we do not have any needs to rebalance anymore.
-        balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, rebalance, processed_vms)
+        balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, ignore_nodes, rebalance, processed_vms)
 
     # Honour groupings for include and exclude groups for rebalancing VMs.
-    node_statistics, vm_statistics = __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode)
-    node_statistics, vm_statistics = __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode)
+    node_statistics, vm_statistics = __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)
+    node_statistics, vm_statistics = __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)
 
     # Remove VMs that are not being relocated.
     vms_to_remove = [vm_name for vm_name, vm_info in vm_statistics.items() if 'node_rebalance' in vm_info and vm_info['node_rebalance'] == vm_info.get('node_parent')]
@@ -632,11 +724,12 @@ def __get_most_free_resources_node(balancing_method, balancing_mode, balancing_m
     return node
 
 
-def __update_resource_statistics(resource_highest_used_resources_vm, resource_highest_free_resources_node, vm_statistics, node_statistics, balancing_method, balancing_mode):
+def __update_resource_statistics(resource_highest_used_resources_vm, resource_highest_free_resources_node, vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes):
     """ Update VM and node resource statistics. """
     info_prefix = 'Info: [rebalancing-resource-statistics-update]:'
+    ignore_nodes_list =  ignore_nodes.split(',')
 
-    if resource_highest_used_resources_vm[1]['node_parent'] != resource_highest_free_resources_node[0]:
+    if resource_highest_used_resources_vm[1]['node_parent'] != resource_highest_free_resources_node[0] and resource_highest_used_resources_vm[1]['node_parent'] not in ignore_nodes_list:
         vm_name            = resource_highest_used_resources_vm[0]
         vm_node_parent     = resource_highest_used_resources_vm[1]['node_parent']
         vm_node_rebalance  = resource_highest_free_resources_node[0]
@@ -668,7 +761,7 @@ def __update_resource_statistics(resource_highest_used_resources_vm, resource_hi
     return node_statistics, vm_statistics
 
 
-def __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode):
+def __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes):
     """ Get VMs tags for include groups. """
     info_prefix = 'Info: [rebalancing-tags-group-include]:'
     tags_include_vms = {}
@@ -697,13 +790,13 @@ def __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_metho
                     vm_node_rebalance = vm_statistics[vm_name]['node_rebalance']
                 else:
                     _mocked_vm_object = (vm_name, vm_statistics[vm_name])
-                    node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [vm_node_rebalance], vm_statistics, node_statistics, balancing_method, balancing_mode)
+                    node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [vm_node_rebalance], vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)
             processed_vm.append(vm_name)
 
     return node_statistics, vm_statistics
 
 
-def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode):
+def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes):
     """ Get VMs tags for exclude groups. """
     info_prefix = 'Info: [rebalancing-tags-group-exclude]:'
     tags_exclude_vms = {}
@@ -736,7 +829,7 @@ def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_metho
                         random_node = random.choice(list(node_statistics.keys()))
                 else:
                     _mocked_vm_object = (vm_name, vm_statistics[vm_name])
-                    node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [random_node], vm_statistics, node_statistics, balancing_method, balancing_mode)
+                    node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [random_node], vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)
             processed_vm.append(vm_name)
 
     return node_statistics, vm_statistics
@@ -891,20 +984,30 @@ def main():
                 validate_daemon(daemon, schedule)
                 continue
 
+        # Rolling updates: patch this node first (requires proxlb-additions with a patched Proxmox API!).
+        # Default to no reboot so post_validations() also works when rolling updates are disabled. NOTE(review): rolling_updates is not set in this patch - confirm it comes from the config.
+        node_requires_reboot = False
+        if bool(int(rolling_updates)):
+            node_requires_updates = get_node_update_status(api_object)
+            run_node_update(api_object, node_requires_updates)
+            node_requires_reboot = get_node_reboot_status()
+            if node_requires_reboot:
+                ignore_nodes = extend_ignore_node_list(ignore_nodes)
+
         # Get metric & statistics for vms and nodes.
         node_statistics = get_node_statistics(api_object, ignore_nodes)
         vm_statistics   = get_vm_statistics(api_object, ignore_vms, balancing_type)
-        node_statistics = update_node_statistics(node_statistics, vm_statistics)
+        node_statistics = update_node_statistics(node_statistics, vm_statistics, ignore_nodes)
 
         # Calculate rebalancing of vms.
         node_statistics_rebalanced, vm_statistics_rebalanced = balancing_calculations(balancing_method, balancing_mode, balancing_mode_option,
-                                                                                      node_statistics, vm_statistics, balanciness, rebalance=False, processed_vms=[])
+                                                                                      node_statistics, vm_statistics, balanciness, ignore_nodes, rebalance=False, processed_vms=[])
 
         # Rebalance vms to new nodes within the cluster.
         run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args, parallel_migrations)
 
         # Validate for any errors.
-        post_validations()
+        post_validations(api_object, node_requires_reboot)
 
         # Validate daemon service.
         validate_daemon(daemon, schedule)