From 5d071e887fbefe35f8eaf114ca7aa8f75c73fc22 Mon Sep 17 00:00:00 2001
From: Jasper Berton <jasper.berton@team.blue>
Date: Tue, 29 Oct 2024 10:38:23 +0000
Subject: [PATCH] core: Changed waiting for reboot from 10 minutes to check
 connectivity every 30 seconds

Changed sleepOnReboot to poll the host at a fixed interval, ServerRebootSleepTime, instead of sleeping for the whole reboot timeout.
When the host is back online after the reboot, the wait is cancelled and operations continue.
ServerRebootTimeout is no longer the time to wait before first trying to reach the host; it is now the maximum time the host has to come back online before operations are broken off.
The host state is checked using the stats returned by the vdsproxy; once the host answers, operations with this host can continue.

Signed-off-by: Jasper Berton <jasper.berton@team.blue>
---
 .../org/ovirt/engine/core/bll/VdsCommand.java | 58 ++++++++++++++++---
 .../core/common/config/ConfigValues.java      |  2 +
 .../upgrade/pre_upgrade/0000_config.sql       |  1 +
 .../engine-config/engine-config.properties    |  4 +-
 .../ovirt-engine/core/misc.py                 |  2 +-
 5 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsCommand.java b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsCommand.java
index ba9ca8e70cc..a93d3bcb541 100644
--- a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsCommand.java
+++ b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsCommand.java
@@ -2,8 +2,13 @@
 
 import java.util.Collections;
 import java.util.List;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ScheduledFuture;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 
+import javax.enterprise.concurrent.ManagedScheduledExecutorService;
 import javax.inject.Inject;
 
 import org.apache.commons.lang.StringUtils;
@@ -37,14 +42,18 @@
 import org.ovirt.engine.core.dao.VdsStaticDao;
 import org.ovirt.engine.core.dao.gluster.GlusterDBUtils;
 import org.ovirt.engine.core.utils.EngineLocalConfig;
-import org.ovirt.engine.core.utils.ThreadUtils;
 import org.ovirt.engine.core.utils.lock.EngineLock;
 import org.ovirt.engine.core.utils.threadpool.ThreadPoolUtil;
+import org.ovirt.engine.core.utils.threadpool.ThreadPools;
 import org.ovirt.engine.core.vdsbroker.ResourceManager;
+import org.ovirt.engine.core.vdsbroker.vdsbroker.IVdsServer;
+import org.ovirt.engine.core.vdsbroker.vdsbroker.VDSInfoReturn;
+
 
 public abstract class VdsCommand<T extends VdsActionParameters> extends CommandBase<T> {
 
     protected String _failureMessage = null;
+    private volatile ScheduledFuture<?> reachableFuture; // written by the command thread, read by the poller task
 
     @Inject
     protected AuditLogDirector auditLogDirector;
@@ -68,6 +77,9 @@ public abstract class VdsCommand<T extends VdsActionParameters> extends CommandB
     private AlertDirector alertDirector;
     @Inject
     private VdsStaticDao vdsStaticDao;
+    @Inject
+    @ThreadPools(ThreadPools.ThreadPoolType.EngineScheduledThreadPool)
+    private ManagedScheduledExecutorService executor;
 
     /**
      * Constructor for command creation when compensation is applied on startup
@@ -112,14 +124,63 @@ protected void runSleepOnReboot(boolean synchronous, final VDSStatus status) {
         }
     }
 
+    /**
+     * Waits for a rebooting host to come back. The host is polled every ServerRebootSleepTime seconds and the
+     * wait ends as soon as it answers, or after ServerRebootTimeout seconds at the latest. In all cases the
+     * polling task is stopped, the reboot-timeout flag is cleared and the host is moved to the given status.
+     */
     private void sleepOnReboot(final VDSStatus status) {
-        int sleepTimeInSec = Config.<Integer> getValue(ConfigValues.ServerRebootTimeout);
-        log.info("Waiting {} seconds, for server to finish reboot process.",
-                sleepTimeInSec);
         resourceManager.getVdsManager(getVdsId()).setInServerRebootTimeout(true);
-        ThreadUtils.sleep(TimeUnit.SECONDS.toMillis(sleepTimeInSec));
-        resourceManager.getVdsManager(getVdsId()).setInServerRebootTimeout(false);
-        setVdsStatus(status);
+        int serverRebootMax = Config.<Integer> getValue(ConfigValues.ServerRebootTimeout);
+        // scheduleAtFixedRate rejects a period <= 0, so fall back to one second for a misconfigured value.
+        int retryTime = Math.max(1, Config.<Integer> getValue(ConfigValues.ServerRebootSleepTime));
+        boolean interrupted = false;
+        try {
+            reachableFuture
+                = executor.scheduleAtFixedRate(() -> isReachable(), retryTime, retryTime, TimeUnit.SECONDS);
+            reachableFuture.get(serverRebootMax, TimeUnit.SECONDS);
+        } catch (InterruptedException e) {
+            interrupted = true;
+            log.info("Trying to reconnect with host {} after reboot failed due to {}", getVdsId(), e.toString());
+        } catch (ExecutionException e) {
+            log.info("Problem during execution of reconnection with host {} after reboot due to {}",
+                    getVdsId(), e.toString());
+        } catch (TimeoutException e) {
+            log.info("Unable to connect to host {} after {} seconds", getVdsId(), serverRebootMax);
+        } catch (CancellationException e) {
+            log.info("Future cancelled due to ability to connect to host {} after reboot.", getVdsId());
+        } finally {
+            // get() timing out or being interrupted does not stop the periodic task, so cancel it here;
+            // otherwise it would keep polling the host indefinitely. Cancelling twice is harmless.
+            if (reachableFuture != null) {
+                reachableFuture.cancel(false);
+            }
+            resourceManager.getVdsManager(getVdsId()).setInServerRebootTimeout(false);
+            setVdsStatus(status);
+            if (interrupted) {
+                // Restore the interrupt flag only after the cleanup so the calls above are not disturbed.
+                Thread.currentThread().interrupt();
+            }
+        }
+    }
+
+    /**
+     * Polls the host once. A status code of 0 means the host answered and is ready to reconnect, in which case
+     * the periodic task cancels itself, which releases the thread waiting in sleepOnReboot.
+     */
+    private void isReachable() {
+        try {
+            IVdsServer serv = resourceManager.getVdsManager(getVdsId()).getVdsProxy();
+            VDSInfoReturn info = serv.getVdsStats();
+            log.info("Status of host {} is {}", getVdsId(), info.status.toString());
+            ScheduledFuture<?> future = reachableFuture;
+            if (info.status.code == 0 && future != null) {
+                future.cancel(false);
+            }
+        } catch (Throwable t) {
+            // An exception escaping a periodic task suppresses all later runs, so failures are only logged.
+            log.error("Error encountered {}", t.toString());
+        }
     }
 
     /**
diff --git a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java
index 0abf5877be7..572ae5da872 100644
--- a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java
+++ b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java
@@ -234,6 +234,8 @@ public enum ConfigValues {
     @Reloadable
     @TypeConverterAttribute(Integer.class)
     ServerRebootTimeout,
+    @TypeConverterAttribute(Integer.class)
+    ServerRebootSleepTime,
     @Reloadable
     @TypeConverterAttribute(Integer.class)
     VmGracefulShutdownTimeout,
diff --git a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql
index 390c00a4c75..ad23418d9de 100644
--- a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql
+++ b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql
@@ -614,6 +614,7 @@ select fn_db_add_config_value('ServerCPUList',
     '4.8');
 
 select fn_db_add_config_value('ServerRebootTimeout','600','general');
+select fn_db_add_config_value('ServerRebootSleepTime','30','general');
 select fn_db_add_config_value('SetupNetworksPollingTimeout','3','general');
 select fn_db_add_config_value('SignCertTimeoutInSeconds','30','general');
 --Handling Script name for signing
diff --git a/packaging/etc/engine-config/engine-config.properties b/packaging/etc/engine-config/engine-config.properties
index 05b1003046f..8a5228da04e 100644
--- a/packaging/etc/engine-config/engine-config.properties
+++ b/packaging/etc/engine-config/engine-config.properties
@@ -116,8 +116,10 @@ SANWipeAfterDelete.description="Initializing disk image is more secure but it is
 SANWipeAfterDelete.validValues=true,false
 SearchResultsLimit.description="Max Quantity of Search Results"
 SearchResultsLimit.type=Integer
-ServerRebootTimeout.description="Host Reboot Timeout (in seconds)"
+ServerRebootTimeout.description="Max Host Reboot Timeout (in seconds)"
 ServerRebootTimeout.type=Integer
+ServerRebootSleepTime.description="Interval between each try to connect to host while in reboot (in seconds)"
+ServerRebootSleepTime.type=Integer
 ConsoleReleaseCursorKeys.description="Keyboard keys combination that causes the mouse cursor to be released from its grab on console client window"
 SpiceSecureChannels.description="SPICE Secure Channels"
 SpiceSecureChannels.type=StringMultiple
diff --git a/packaging/setup/plugins/ovirt-engine-setup/ovirt-engine/core/misc.py b/packaging/setup/plugins/ovirt-engine-setup/ovirt-engine/core/misc.py
index 7b5732f8f62..2979ad95cb4 100644
--- a/packaging/setup/plugins/ovirt-engine-setup/ovirt-engine/core/misc.py
+++ b/packaging/setup/plugins/ovirt-engine-setup/ovirt-engine/core/misc.py
@@ -101,4 +101,4 @@ def _customization(self):
             ] = True
 
 
-# vim: expandtab tabstop=4 shiftwidth=4
+# vim: expandtab tabstop=4 shiftwidth=4
\ No newline at end of file