Skip to content

Commit

Permalink
Merge pull request #455 from jenkinsci/JENKINS-56347
Browse files Browse the repository at this point in the history
[JENKINS-56347] Wait for agent to connect on provisioning
  • Loading branch information
carlossg authored May 7, 2019
2 parents c0a7ccb + e05906e commit fc666e1
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,15 @@ public void onClose(KubernetesClientException cause) {

}

/**
* Wait until all pod containers are running
*
* @return the pod
* @throws IllegalStateException
* if pod or containers are no longer running
* @throws KubernetesClientTimeoutException
* if time ran out
*/
public Pod await(long amount, TimeUnit timeUnit) {
long started = System.currentTimeMillis();
long alreadySpent = System.currentTimeMillis() - started;
Expand All @@ -137,6 +146,15 @@ private Pod awaitWatcher(long amount, TimeUnit timeUnit) {
}
}

/**
* Wait until all pod containers are running
*
* @return the pod
* @throws IllegalStateException
* if pod or containers are no longer running
* @throws KubernetesClientTimeoutException
* if time ran out
*/
private Pod periodicAwait(int i, long started, long interval, long amount) {
Pod pod = client.pods().inNamespace(this.pod.getMetadata().getNamespace())
.withName(this.pod.getMetadata().getName()).get();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,25 +27,32 @@
import static java.util.logging.Level.*;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;

import javax.annotation.CheckForNull;

import io.fabric8.kubernetes.client.Watch;
import org.apache.commons.lang.StringUtils;
import org.kohsuke.stapler.DataBoundConstructor;

import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;

import hudson.model.TaskListener;
import hudson.slaves.JNLPLauncher;
import hudson.slaves.SlaveComputer;

import io.fabric8.kubernetes.api.model.ContainerStatus;
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.Watch;
import io.fabric8.kubernetes.client.dsl.LogWatch;
import io.fabric8.kubernetes.client.dsl.PrettyLoggable;

/**
* Launches on Kubernetes the specified {@link KubernetesComputer} instance.
Expand Down Expand Up @@ -110,16 +117,76 @@ public void launch(SlaveComputer computer, TaskListener listener) {
.stream().filter(s -> StringUtils.isNotBlank(s)).findFirst().orElse(null);
slave.setNamespace(namespace);

LOGGER.log(Level.FINE, "Creating Pod: {0} in namespace {1}", new Object[]{podId, namespace});
LOGGER.log(Level.FINE, "Creating Pod: {0}/{1}", new Object[] { namespace, podId });
pod = client.pods().inNamespace(namespace).create(pod);
LOGGER.log(INFO, "Created Pod: {0} in namespace {1}", new Object[]{podId, namespace});
listener.getLogger().printf("Created Pod: %s in namespace %s%n", podId, namespace);
LOGGER.log(INFO, "Created Pod: {0}/{1}", new Object[] { namespace, podId });
listener.getLogger().printf("Created Pod: %s/%s%n", namespace, podId);
String podName = pod.getMetadata().getName();
String namespace1 = pod.getMetadata().getNamespace();
watcher = new AllContainersRunningPodWatcher(client, pod);
try (Watch _w = client.pods().inNamespace(namespace1).withName(podName).watch(watcher)){
try (Watch _w = client.pods().inNamespace(namespace1).withName(podName).watch(watcher)) {
watcher.await(template.getSlaveConnectTimeout(), TimeUnit.SECONDS);
}
LOGGER.log(INFO, "Pod is running: {0}/{1}", new Object[] { namespace, podId });

// We need the pod to be running and connected before returning
// otherwise this method keeps being called multiple times
List<String> validStates = ImmutableList.of("Running");

int waitForSlaveToConnect = template.getSlaveConnectTimeout();
int waitedForSlave;

// now wait for agent to be online
SlaveComputer slaveComputer = null;
String status = null;
List<ContainerStatus> containerStatuses = null;
for (waitedForSlave = 0; waitedForSlave < waitForSlaveToConnect; waitedForSlave++) {
slaveComputer = slave.getComputer();
if (slaveComputer == null) {
throw new IllegalStateException("Node was deleted, computer is null");
}
if (slaveComputer.isOnline()) {
break;
}

// Check that the pod hasn't failed already
pod = client.pods().inNamespace(namespace).withName(podId).get();
if (pod == null) {
throw new IllegalStateException("Pod no longer exists: " + podId);
}
status = pod.getStatus().getPhase();
if (!validStates.contains(status)) {
break;
}

containerStatuses = pod.getStatus().getContainerStatuses();
List<ContainerStatus> terminatedContainers = new ArrayList<>();
for (ContainerStatus info : containerStatuses) {
if (info != null) {
if (info.getState().getTerminated() != null) {
// Container has errored
LOGGER.log(INFO, "Container is terminated {0} [{2}]: {1}",
new Object[] { podId, info.getState().getTerminated(), info.getName() });
listener.getLogger().printf("Container is terminated %1$s [%3$s]: %2$s%n", podId,
info.getState().getTerminated(), info.getName());
terminatedContainers.add(info);
}
}
}

checkTerminatedContainers(terminatedContainers, podId, namespace, slave, client);

LOGGER.log(INFO, "Waiting for agent to connect ({1}/{2}): {0}",
new Object[] { podId, waitedForSlave, waitForSlaveToConnect });
listener.getLogger().printf("Waiting for agent to connect (%2$s/%3$s): %1$s%n", podId, waitedForSlave,
waitForSlaveToConnect);
Thread.sleep(1000);
}
if (slaveComputer == null || slaveComputer.isOffline()) {
logLastLines(containerStatuses, podId, namespace, slave, null, client);
throw new IllegalStateException(
"Agent is not connected after " + waitedForSlave + " seconds, status: " + status);
}

computer.setAcceptingTasks(true);
launched = true;
Expand All @@ -142,6 +209,36 @@ public void launch(SlaveComputer computer, TaskListener listener) {
}
}

/**
 * Aborts provisioning when at least one container of the pod has terminated.
 * <p>
 * Does nothing when {@code terminatedContainers} is empty. Otherwise the exit code of every
 * terminated container is collected by container name, the tail of each container's log is
 * written through {@code logLastLines}, and an exception listing the exit codes is thrown.
 *
 * @param terminatedContainers statuses of the containers found in the terminated state
 * @param podId                name of the pod the containers belong to
 * @param namespace            namespace the pod lives in
 * @param slave                agent being provisioned, passed on for log context
 * @param client               client used to read the container logs
 * @throws IllegalStateException
 *             if {@code terminatedContainers} is not empty
 */
private void checkTerminatedContainers(List<ContainerStatus> terminatedContainers, String podId, String namespace,
        KubernetesSlave slave, KubernetesClient client) {
    if (terminatedContainers.isEmpty()) {
        return;
    }

    // container name -> exit code reported by its terminated state
    Map<String, Integer> exitCodes = terminatedContainers.stream()
            .collect(Collectors.toMap(
                    ContainerStatus::getName,
                    (status) -> status.getState().getTerminated().getExitCode()));

    // Print the last lines of failed containers before failing the launch
    logLastLines(terminatedContainers, podId, namespace, slave, exitCodes, client);

    throw new IllegalStateException("Containers are terminated with exit codes: " + exitCodes);
}

/**
 * Log the last lines of containers logs.
 * <p>
 * For every container status given, the last 30 lines of that container's log are fetched
 * once and, when not blank, written to the logger at {@link Level#SEVERE} together with the
 * agent, its template and the container name.
 *
 * @param containers statuses of the containers whose logs should be printed; may be
 *                   {@code null} (no container status was read yet), in which case nothing
 *                   is logged. {@code null} elements are skipped.
 * @param podId      name of the pod the containers belong to
 * @param namespace  namespace the pod lives in
 * @param slave      agent being provisioned, used for log context only
 * @param errors     exit code by container name, or {@code null} when the containers did
 *                   not terminate; when present the exit code is added to the message
 * @param client     client used to read the container logs
 */
private void logLastLines(List<ContainerStatus> containers, String podId, String namespace, KubernetesSlave slave,
        Map<String, Integer> errors, KubernetesClient client) {
    if (containers == null) {
        // The caller may give up waiting before it ever read the container statuses;
        // failing here with a NullPointerException would hide the caller's real error.
        return;
    }
    for (ContainerStatus containerStatus : containers) {
        if (containerStatus == null) {
            continue;
        }
        String containerName = containerStatus.getName();
        // Fetch the log a single time: every getLog() call is a request to the API server,
        // and a second call could return different text than the one that was checked.
        String log = client.pods().inNamespace(namespace).withName(podId)
                .inContainer(containerName).tailingLines(30).getLog();
        if (!StringUtils.isBlank(log)) {
            String msg = errors != null ? String.format(" exited with error %s", errors.get(containerName)) : "";
            LOGGER.log(Level.SEVERE, "Error in provisioning; agent={0}, template={1}. Container {2}{3}. Logs: {4}",
                    new Object[] { slave, slave.getTemplate(), containerName, msg, log });
        }
    }
}

/**
* The last problem that occurred, if any.
* @return
Expand Down

0 comments on commit fc666e1

Please sign in to comment.