From 72e9ffd3701235589bf1377cf531524b2e51e836 Mon Sep 17 00:00:00 2001 From: Basil Crow Date: Mon, 23 Oct 2023 13:55:12 -0700 Subject: [PATCH] Configurable retry delay and jitter --- src/main/java/hudson/remoting/Engine.java | 62 ++++++++++++----- src/main/java/hudson/remoting/Launcher.java | 41 ++++++++++-- .../engine/JnlpAgentEndpointResolver.java | 27 +++++++- .../jenkinsci/remoting/util/RetryUtils.java | 67 +++++++++++++++++++ 4 files changed, 176 insertions(+), 21 deletions(-) create mode 100644 src/main/java/org/jenkinsci/remoting/util/RetryUtils.java diff --git a/src/main/java/hudson/remoting/Engine.java b/src/main/java/hudson/remoting/Engine.java index 2198e2f38..22c4b08f0 100644 --- a/src/main/java/hudson/remoting/Engine.java +++ b/src/main/java/hudson/remoting/Engine.java @@ -50,6 +50,7 @@ import java.security.cert.CertificateException; import java.security.cert.X509Certificate; import java.security.interfaces.RSAPublicKey; +import java.time.Duration; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -93,6 +94,7 @@ import org.jenkinsci.remoting.protocol.cert.PublicKeyMatchingX509ExtendedTrustManager; import org.jenkinsci.remoting.protocol.impl.ConnectionRefusalException; import org.jenkinsci.remoting.util.KeyUtils; +import org.jenkinsci.remoting.util.RetryUtils; import org.jenkinsci.remoting.util.VersionNumber; import org.kohsuke.accmod.Restricted; import org.kohsuke.accmod.restrictions.NoExternalUse; @@ -180,6 +182,12 @@ public Thread newThread(@NonNull final Runnable r) { private boolean noReconnect = false; + private int delay = 10; + + private double jitterFactor = 0; + + private int jitter = 0; + /** * Determines whether the socket will have {@link Socket#setKeepAlive(boolean)} set or not. * @@ -399,6 +407,21 @@ public void setNoReconnect(boolean noReconnect) { this.noReconnect = noReconnect; } + @Restricted(NoExternalUse.class) + public void setDelay(int delay) { + this.delay = delay; + } + + @Restricted(NoExternalUse.class) + public void setJitterFactor(double jitterFactor) { + this.jitterFactor = jitterFactor; + } + + @Restricted(NoExternalUse.class) + public void setJitter(int jitter) { + this.jitter = jitter; + } + /** * Determines if JNLPAgentEndpointResolver will not perform certificate validation in the HTTPs mode. * @@ -696,8 +719,8 @@ public void closeRead() throws IOException { } events.onDisconnect(); while (true) { - // TODO refactor various sleep statements into a common method - TimeUnit.SECONDS.sleep(10); + Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter); + Thread.sleep(duration.toMillis()); // Unlike JnlpAgentEndpointResolver, we do not use $jenkins/tcpSlaveAgentListener/, as that will be a 404 if the TCP port is disabled. URL ping = new URL(hudsonUrl, "login"); try { @@ -763,9 +786,9 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) { endpoint = resolver.resolve(); } catch (IOException e) { if (!noReconnect) { - events.status("Could not locate server among " + candidateUrls + "; waiting 10 seconds before retry", e); - // TODO refactor various sleep statements into a common method - TimeUnit.SECONDS.sleep(10); + Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter); + events.status("Could not locate server among " + candidateUrls + "; waiting " + RetryUtils.formatDuration(duration) + " seconds before retry", e); + Thread.sleep(duration.toMillis()); continue; } else { if (Boolean.getBoolean(Engine.class.getName() + ".nonFatalJnlpAgentEndpointResolutionExceptions")) { @@ -883,7 +906,6 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) { } private JnlpEndpointResolver createEndpointResolver(List jenkinsUrls, String agentName) { - JnlpEndpointResolver resolver; if (directConnection == null) { SSLSocketFactory sslSocketFactory = null; try { @@ -891,18 +913,28 @@ private JnlpEndpointResolver createEndpointResolver(List jenkinsUrls, St } catch (Exception e) { events.error(e); } - resolver = new JnlpAgentEndpointResolver(jenkinsUrls, credentials, proxyCredentials, tunnel, - sslSocketFactory, disableHttpsCertValidation, agentName); + JnlpAgentEndpointResolver jnlpAgentEndpointResolver = + new JnlpAgentEndpointResolver( + jenkinsUrls, + credentials, + proxyCredentials, + tunnel, + sslSocketFactory, + disableHttpsCertValidation, + agentName); + jnlpAgentEndpointResolver.setDelay(delay); + jnlpAgentEndpointResolver.setJitterFactor(jitterFactor); + jnlpAgentEndpointResolver.setJitter(jitter); + return jnlpAgentEndpointResolver; } else { - resolver = new JnlpAgentEndpointConfigurator(directConnection, instanceIdentity, protocols, proxyCredentials); + return new JnlpAgentEndpointConfigurator(directConnection, instanceIdentity, protocols, proxyCredentials); } - return resolver; } private void onConnectionRejected(String greeting) throws InterruptedException { - events.status("reconnect rejected, sleeping 10s: ", new Exception("The server rejected the connection: " + greeting)); - // TODO refactor various sleep statements into a common method - TimeUnit.SECONDS.sleep(10); + Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter); + events.status("reconnect rejected, sleeping " + RetryUtils.formatDuration(duration) + "s: ", new Exception("The server rejected the connection: " + greeting)); + Thread.sleep(duration.toMillis()); } /** @@ -924,8 +956,8 @@ private Socket connectTcp(@NonNull JnlpAgentEndpoint endpoint) throws IOExceptio if(retry++>10) { throw e; } - // TODO refactor various sleep statements into a common method - TimeUnit.SECONDS.sleep(10); + Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter); + Thread.sleep(duration.toMillis()); events.status(msg+" (retrying:"+retry+")",e); } } diff --git a/src/main/java/hudson/remoting/Launcher.java b/src/main/java/hudson/remoting/Launcher.java index 4c6393d46..94192a6fc 100644 --- a/src/main/java/hudson/remoting/Launcher.java +++ b/src/main/java/hudson/remoting/Launcher.java @@ -30,6 +30,7 @@ import org.jenkinsci.remoting.engine.JnlpAgentEndpointResolver; import org.jenkinsci.remoting.engine.WorkDirManager; import org.jenkinsci.remoting.util.PathUtils; +import org.jenkinsci.remoting.util.RetryUtils; import org.kohsuke.args4j.Argument; import org.kohsuke.args4j.CmdLineException; import org.kohsuke.args4j.CmdLineParser; @@ -75,6 +76,7 @@ import java.security.cert.CertificateException; import java.security.cert.CertificateFactory; import java.security.cert.X509Certificate; +import java.time.Duration; import java.util.ArrayList; import java.util.HashSet; import java.util.List; @@ -83,7 +85,6 @@ import java.util.Properties; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; import java.util.logging.Level; import java.util.logging.Logger; @@ -233,6 +234,25 @@ public void setNoCertificateCheck(boolean ignored) throws NoSuchAlgorithmExcepti @Option(name="-noReconnect",aliases="-noreconnect",usage="Doesn't try to reconnect when a communication fail, and exit instead") public boolean noReconnect = false; + @Option(name = "-retryDelay", + usage = "The fixed delay to occur between retries. Default is 10 seconds.") + public int delay = 10; + + @Option(name = "-retryJitterFactor", + usage = "The jitter factor to randomly vary retry delays by. For each retry delay, a" + + " random portion of the delay multiplied by the jitter factor will be" + + " added or subtracted to the delay. For example: a retry delay of 10" + + " seconds and a jitter factor of .25 will result in a random retry delay" + + " between 7.5 and 12.5 seconds.") + public double jitterFactor = 0; + + @Option(name = "-retryJitter", + usage = "The jitter to randomly vary retry delays by. For each retry delay, a random" + + " portion of the jitter will be added or subtracted to the delay. For" + + " example: a jitter of 5 seconds will randomly add between -5 and 5" + + " seconds to each retry delay.") + public int jitter = 0; + @Option(name = "-noKeepAlive", usage = "Disable TCP socket keep alive on connection to the controller.") public boolean noKeepAlive = false; @@ -393,6 +413,19 @@ public void run() throws CmdLineException, IOException, InterruptedException { return; } + if (jitterFactor != 0 && jitter != 0) { + throw new CmdLineException("Jitter factor and jitter are mutually exclusive"); + } + if (jitterFactor < 0 || jitterFactor > 1) { + throw new CmdLineException("Jitter factor must be >= 0 and <= 1"); + } + if (jitter < 0) { + throw new CmdLineException("Jitter must be >= 0"); + } + if (jitter > 0 && jitter >= delay) { + throw new CmdLineException("Jitter must be < delay"); + } + // Create and verify working directory and logging // TODO: The pass-through for the JNLP mode has been added in JENKINS-39817. But we still need to keep this parameter in // consideration for other modes (TcpServer, TcpClient, etc.) to retain the legacy behavior. @@ -696,9 +729,9 @@ public List parseJnlpArguments() throws ParserConfigurationException, SA System.err.println("Failed to obtain "+ agentJnlpURL); e.printStackTrace(System.err); - System.err.println("Waiting 10 seconds before retry"); - // TODO refactor various sleep statements into a common method - TimeUnit.SECONDS.sleep(10); + Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter); + System.err.println("Waiting " + RetryUtils.formatDuration(duration) + " seconds before retry"); + Thread.sleep(duration.toMillis()); // retry } finally { if (con instanceof HttpURLConnection) { diff --git a/src/main/java/org/jenkinsci/remoting/engine/JnlpAgentEndpointResolver.java b/src/main/java/org/jenkinsci/remoting/engine/JnlpAgentEndpointResolver.java index 6c5168907..816eb3003 100644 --- a/src/main/java/org/jenkinsci/remoting/engine/JnlpAgentEndpointResolver.java +++ b/src/main/java/org/jenkinsci/remoting/engine/JnlpAgentEndpointResolver.java @@ -29,6 +29,7 @@ import hudson.remoting.Engine; import hudson.remoting.Launcher; import hudson.remoting.NoProxyEvaluator; +import org.jenkinsci.remoting.util.RetryUtils; import org.jenkinsci.remoting.util.VersionNumber; import org.jenkinsci.remoting.util.https.NoCheckHostnameVerifier; import org.jenkinsci.remoting.util.https.NoCheckTrustManager; @@ -62,6 +63,7 @@ import java.security.SecureRandom; import java.security.interfaces.RSAPublicKey; import java.security.spec.InvalidKeySpecException; +import java.time.Duration; import java.util.ArrayList; import java.util.Base64; import java.util.Iterator; @@ -103,6 +105,12 @@ public class JnlpAgentEndpointResolver extends JnlpEndpointResolver { private final String agentName; + private int delay = 10; + + private double jitterFactor = 0; + + private int jitter = 0; + /** * If specified, only the protocols from the list will be tried during the connection. * The option provides protocol names, but the order of the check is defined internally and cannot be changed. @@ -183,6 +191,21 @@ public void setDisableHttpsCertValidation(boolean disableHttpsCertValidation) { this.disableHttpsCertValidation = disableHttpsCertValidation; } + @Restricted(NoExternalUse.class) + public void setDelay(int delay) { + this.delay = delay; + } + + @Restricted(NoExternalUse.class) + public void setJitterFactor(double jitterFactor) { + this.jitterFactor = jitterFactor; + } + + @Restricted(NoExternalUse.class) + public void setJitter(int jitter) { + this.jitter = jitter; + } + @CheckForNull @Override public JnlpAgentEndpoint resolve() throws IOException { @@ -400,8 +423,8 @@ public void waitForReady() throws InterruptedException { try { int retries = 0; while (true) { - // TODO refactor various sleep statements into a common method - Thread.sleep(1000 * 10); + Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter); + Thread.sleep(duration.toMillis()); // Jenkins top page might be read-protected. see http://www.nabble // .com/more-lenient-retry-logic-in-Engine.waitForServerToBack-td24703172.html if (jenkinsUrls.isEmpty()) { diff --git a/src/main/java/org/jenkinsci/remoting/util/RetryUtils.java b/src/main/java/org/jenkinsci/remoting/util/RetryUtils.java new file mode 100644 index 000000000..d69aa350b --- /dev/null +++ b/src/main/java/org/jenkinsci/remoting/util/RetryUtils.java @@ -0,0 +1,67 @@ +/* + * The MIT License + * + * Copyright (c) 2023, CloudBees, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package org.jenkinsci.remoting.util; + +import java.security.SecureRandom; +import java.text.NumberFormat; +import java.time.Duration; +import java.util.Random; +import org.kohsuke.accmod.Restricted; +import org.kohsuke.accmod.restrictions.NoExternalUse; + +/** + * Retry-related utility methods. Used in place of a library like Failsafe to minimize external third-party dependencies. + */ +@Restricted(NoExternalUse.class) +public class RetryUtils { + + private static final Random RANDOM = new SecureRandom(); + + // Suppress default constructor for noninstantiability + private RetryUtils() { + throw new AssertionError(); + } + + /** + * Get the retry duration based on the CLI arguments. + */ + public static Duration getDuration(int delay, double jitterFactor, int jitter) { + if (jitterFactor != 0) { + double randomFactor = 1 + (1 - RANDOM.nextDouble() * 2) * jitterFactor; + return Duration.ofMillis((long) (Duration.ofSeconds(delay).toMillis() * randomFactor)); + } else if (jitter != 0) { + double randomAddend = + (1 - RANDOM.nextDouble() * 2) * Duration.ofSeconds(jitter).toMillis(); + return Duration.ofMillis((long) (Duration.ofSeconds(delay).toMillis() + randomAddend)); + } else { + return Duration.ofSeconds(delay); + } + } + + public static String formatDuration(Duration duration) { + return NumberFormat.getNumberInstance().format(duration.toMillis() / 1000.0); + } +}