Skip to content

Commit

Permalink
Configurable retry delay and jitter
Browse files Browse the repository at this point in the history
  • Loading branch information
basil committed Oct 23, 2023
1 parent 78543a9 commit 72e9ffd
Show file tree
Hide file tree
Showing 4 changed files with 176 additions and 21 deletions.
62 changes: 47 additions & 15 deletions src/main/java/hudson/remoting/Engine.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.security.interfaces.RSAPublicKey;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
Expand Down Expand Up @@ -93,6 +94,7 @@
import org.jenkinsci.remoting.protocol.cert.PublicKeyMatchingX509ExtendedTrustManager;
import org.jenkinsci.remoting.protocol.impl.ConnectionRefusalException;
import org.jenkinsci.remoting.util.KeyUtils;
import org.jenkinsci.remoting.util.RetryUtils;
import org.jenkinsci.remoting.util.VersionNumber;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.NoExternalUse;
Expand Down Expand Up @@ -180,6 +182,12 @@ public Thread newThread(@NonNull final Runnable r) {

private boolean noReconnect = false;

private int delay = 10;

private double jitterFactor = 0;

private int jitter = 0;

/**
* Determines whether the socket will have {@link Socket#setKeepAlive(boolean)} set or not.
*
Expand Down Expand Up @@ -399,6 +407,21 @@ public void setNoReconnect(boolean noReconnect) {
this.noReconnect = noReconnect;
}

@Restricted(NoExternalUse.class)
public void setDelay(int delay) {
this.delay = delay;
}

@Restricted(NoExternalUse.class)
public void setJitterFactor(double jitterFactor) {
this.jitterFactor = jitterFactor;
}

@Restricted(NoExternalUse.class)
public void setJitter(int jitter) {
this.jitter = jitter;
}

/**
* Determines if JNLPAgentEndpointResolver will not perform certificate validation in the HTTPs mode.
*
Expand Down Expand Up @@ -696,8 +719,8 @@ public void closeRead() throws IOException {
}
events.onDisconnect();
while (true) {
// TODO refactor various sleep statements into a common method
TimeUnit.SECONDS.sleep(10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
Thread.sleep(duration.toMillis());
// Unlike JnlpAgentEndpointResolver, we do not use $jenkins/tcpSlaveAgentListener/, as that will be a 404 if the TCP port is disabled.
URL ping = new URL(hudsonUrl, "login");
try {
Expand Down Expand Up @@ -763,9 +786,9 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) {
endpoint = resolver.resolve();
} catch (IOException e) {
if (!noReconnect) {
events.status("Could not locate server among " + candidateUrls + "; waiting 10 seconds before retry", e);
// TODO refactor various sleep statements into a common method
TimeUnit.SECONDS.sleep(10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
events.status("Could not locate server among " + candidateUrls + "; waiting " + RetryUtils.formatDuration(duration) + " seconds before retry", e);
Thread.sleep(duration.toMillis());
continue;
} else {
if (Boolean.getBoolean(Engine.class.getName() + ".nonFatalJnlpAgentEndpointResolutionExceptions")) {
Expand Down Expand Up @@ -883,26 +906,35 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) {
}

private JnlpEndpointResolver createEndpointResolver(List<String> jenkinsUrls, String agentName) {
JnlpEndpointResolver resolver;
if (directConnection == null) {
SSLSocketFactory sslSocketFactory = null;
try {
sslSocketFactory = getSSLSocketFactory(candidateCertificates);
} catch (Exception e) {
events.error(e);
}
resolver = new JnlpAgentEndpointResolver(jenkinsUrls, credentials, proxyCredentials, tunnel,
sslSocketFactory, disableHttpsCertValidation, agentName);
JnlpAgentEndpointResolver jnlpAgentEndpointResolver =
new JnlpAgentEndpointResolver(
jenkinsUrls,
credentials,
proxyCredentials,
tunnel,
sslSocketFactory,
disableHttpsCertValidation,
agentName);
jnlpAgentEndpointResolver.setDelay(delay);
jnlpAgentEndpointResolver.setJitterFactor(jitterFactor);
jnlpAgentEndpointResolver.setJitter(jitter);
return jnlpAgentEndpointResolver;
} else {
resolver = new JnlpAgentEndpointConfigurator(directConnection, instanceIdentity, protocols, proxyCredentials);
return new JnlpAgentEndpointConfigurator(directConnection, instanceIdentity, protocols, proxyCredentials);
}
return resolver;
}

private void onConnectionRejected(String greeting) throws InterruptedException {
events.status("reconnect rejected, sleeping 10s: ", new Exception("The server rejected the connection: " + greeting));
// TODO refactor various sleep statements into a common method
TimeUnit.SECONDS.sleep(10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
events.status("reconnect rejected, sleeping " + RetryUtils.formatDuration(duration) + "s: ", new Exception("The server rejected the connection: " + greeting));
Thread.sleep(duration.toMillis());
}

/**
Expand All @@ -924,8 +956,8 @@ private Socket connectTcp(@NonNull JnlpAgentEndpoint endpoint) throws IOExceptio
if(retry++>10) {
throw e;
}
// TODO refactor various sleep statements into a common method
TimeUnit.SECONDS.sleep(10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
Thread.sleep(duration.toMillis());
events.status(msg+" (retrying:"+retry+")",e);
}
}
Expand Down
41 changes: 37 additions & 4 deletions src/main/java/hudson/remoting/Launcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.jenkinsci.remoting.engine.JnlpAgentEndpointResolver;
import org.jenkinsci.remoting.engine.WorkDirManager;
import org.jenkinsci.remoting.util.PathUtils;
import org.jenkinsci.remoting.util.RetryUtils;
import org.kohsuke.args4j.Argument;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
Expand Down Expand Up @@ -75,6 +76,7 @@
import java.security.cert.CertificateException;
import java.security.cert.CertificateFactory;
import java.security.cert.X509Certificate;
import java.time.Duration;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
Expand All @@ -83,7 +85,6 @@
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

Expand Down Expand Up @@ -233,6 +234,25 @@ public void setNoCertificateCheck(boolean ignored) throws NoSuchAlgorithmExcepti
@Option(name="-noReconnect",aliases="-noreconnect",usage="Doesn't try to reconnect when a communication fail, and exit instead")
public boolean noReconnect = false;

@Option(name = "-retryDelay",
usage = "The fixed delay to occur between retries. Default is 10 seconds.")
public int delay = 10;

@Option(name = "-retryJitterFactor",
usage = "The jitter factor to randomly vary retry delays by. For each retry delay, a"
+ " random portion of the delay multiplied by the jitter factor will be"
+ " added or subtracted to the delay. For example: a retry delay of 10"
+ " seconds and a jitter factor of .25 will result in a random retry delay"
+ " between 7.5 and 12.5 seconds.")
public double jitterFactor = 0;

@Option(name = "-retryJitter",
usage = "The jitter to randomly vary retry delays by. For each retry delay, a random"
+ " portion of the jitter will be added or subtracted to the delay. For"
+ " example: a jitter of 5 seconds will randomly add between -5 and 5"
+ " seconds to each retry delay.")
public int jitter = 0;

@Option(name = "-noKeepAlive",
usage = "Disable TCP socket keep alive on connection to the controller.")
public boolean noKeepAlive = false;
Expand Down Expand Up @@ -393,6 +413,19 @@ public void run() throws CmdLineException, IOException, InterruptedException {
return;
}

if (jitterFactor != 0 && jitter != 0) {
throw new CmdLineException("Jitter factor and jitter are mutually exclusive");
}
if (jitterFactor < 0 || jitterFactor > 1) {
throw new CmdLineException("Jitter factor must be >= 0 and <= 1");
}
if (jitter < 0) {
throw new CmdLineException("Jitter must be >= 0");
}
if (jitter > 0 && jitter >= delay) {
throw new CmdLineException("Jitter must be < delay");
}

// Create and verify working directory and logging
// TODO: The pass-through for the JNLP mode has been added in JENKINS-39817. But we still need to keep this parameter in
// consideration for other modes (TcpServer, TcpClient, etc.) to retain the legacy behavior.
Expand Down Expand Up @@ -696,9 +729,9 @@ public List<String> parseJnlpArguments() throws ParserConfigurationException, SA

System.err.println("Failed to obtain "+ agentJnlpURL);
e.printStackTrace(System.err);
System.err.println("Waiting 10 seconds before retry");
// TODO refactor various sleep statements into a common method
TimeUnit.SECONDS.sleep(10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
System.err.println("Waiting " + RetryUtils.formatDuration(duration) + " seconds before retry");
Thread.sleep(duration.toMillis());
// retry
} finally {
if (con instanceof HttpURLConnection) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import hudson.remoting.Engine;
import hudson.remoting.Launcher;
import hudson.remoting.NoProxyEvaluator;
import org.jenkinsci.remoting.util.RetryUtils;
import org.jenkinsci.remoting.util.VersionNumber;
import org.jenkinsci.remoting.util.https.NoCheckHostnameVerifier;
import org.jenkinsci.remoting.util.https.NoCheckTrustManager;
Expand Down Expand Up @@ -62,6 +63,7 @@
import java.security.SecureRandom;
import java.security.interfaces.RSAPublicKey;
import java.security.spec.InvalidKeySpecException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Iterator;
Expand Down Expand Up @@ -103,6 +105,12 @@ public class JnlpAgentEndpointResolver extends JnlpEndpointResolver {

private final String agentName;

private int delay = 10;

private double jitterFactor = 0;

private int jitter = 0;

/**
* If specified, only the protocols from the list will be tried during the connection.
* The option provides protocol names, but the order of the check is defined internally and cannot be changed.
Expand Down Expand Up @@ -183,6 +191,21 @@ public void setDisableHttpsCertValidation(boolean disableHttpsCertValidation) {
this.disableHttpsCertValidation = disableHttpsCertValidation;
}

@Restricted(NoExternalUse.class)
public void setDelay(int delay) {
this.delay = delay;
}

@Restricted(NoExternalUse.class)
public void setJitterFactor(double jitterFactor) {
this.jitterFactor = jitterFactor;
}

@Restricted(NoExternalUse.class)
public void setJitter(int jitter) {
this.jitter = jitter;
}

@CheckForNull
@Override
public JnlpAgentEndpoint resolve() throws IOException {
Expand Down Expand Up @@ -400,8 +423,8 @@ public void waitForReady() throws InterruptedException {
try {
int retries = 0;
while (true) {
// TODO refactor various sleep statements into a common method
Thread.sleep(1000 * 10);
Duration duration = RetryUtils.getDuration(delay, jitterFactor, jitter);
Thread.sleep(duration.toMillis());
// Jenkins top page might be read-protected. see http://www.nabble
// .com/more-lenient-retry-logic-in-Engine.waitForServerToBack-td24703172.html
if (jenkinsUrls.isEmpty()) {
Expand Down
67 changes: 67 additions & 0 deletions src/main/java/org/jenkinsci/remoting/util/RetryUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* The MIT License
*
* Copyright (c) 2023, CloudBees, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package org.jenkinsci.remoting.util;

import java.security.SecureRandom;
import java.text.NumberFormat;
import java.time.Duration;
import java.util.Random;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.NoExternalUse;

/**
* Retry-related utility methods. Used in place of a library like <a
* href="https://failsafe.dev/">Failsafe</a> to minimize external third-party dependencies.
*/
@Restricted(NoExternalUse.class)
public class RetryUtils {

private static final Random RANDOM = new SecureRandom();

// Suppress default constructor for noninstantiability
private RetryUtils() {
throw new AssertionError();
}

/**
* Get the retry duration based on the CLI arguments.
*/
public static Duration getDuration(int delay, double jitterFactor, int jitter) {
if (jitterFactor != 0) {
double randomFactor = 1 + (1 - RANDOM.nextDouble() * 2) * jitterFactor;
return Duration.ofMillis((long) (Duration.ofSeconds(delay).toMillis() * randomFactor));
} else if (jitter != 0) {
double randomAddend =
(1 - RANDOM.nextDouble() * 2) * Duration.ofSeconds(jitter).toMillis();
return Duration.ofMillis((long) (Duration.ofSeconds(delay).toMillis() + randomAddend));
} else {
return Duration.ofSeconds(delay);
}
}

public static String formatDuration(Duration duration) {
return NumberFormat.getNumberInstance().format(duration.toMillis() / 1000.0);
}
}

0 comments on commit 72e9ffd

Please sign in to comment.