Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[JENKINS-44796] Retry deletion of unwanted VMs #77

Merged
merged 3 commits into from
Jul 28, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 85 additions & 10 deletions src/main/java/org/jenkinsci/plugins/vSphereCloud.java
Original file line number Diff line number Diff line change
Expand Up @@ -187,11 +187,10 @@ private void ensureLists() {
for (final vSphereCloudProvisionedSlave n : NodeIterator.nodes(vSphereCloudProvisionedSlave.class)) {
final String nodeName = n.getNodeName();
final vSphereCloudSlaveTemplate template = getTemplateForVM(nodeName);
if (template != null) {
final CloudProvisioningRecord provisionable = templateState.getOrCreateRecord(template);
templateState.provisioningStarted(provisionable, nodeName);
templateState.provisionedSlaveNowActive(provisionable, nodeName);
}
if (template == null) continue;
final CloudProvisioningRecord provisionable = templateState.getOrCreateRecord(template);
templateState.provisioningStarted(provisionable, nodeName);
templateState.provisionedSlaveNowActive(provisionable, nodeName);
}
}
}
Expand Down Expand Up @@ -333,6 +332,7 @@ public Collection<PlannedNode> provision(final Label label, int excessWorkload)
synchronized (this) {
ensureLists();
}
retryVMdeletionIfNecessary(Math.max(excessWorkload, 2));
final List<PlannedNode> plannedNodes = new ArrayList<PlannedNode>();
synchronized (templateState) {
templateState.pruneUnwantedRecords();
Expand Down Expand Up @@ -371,6 +371,48 @@ public Collection<PlannedNode> provision(final Label label, int excessWorkload)
}
}

/**
 * Has another go at deleting VMs we failed to delete earlier. It's possible
 * that we were unable to talk to vSphere (or some other failure happened)
 * when we decided to delete some VMs. We remember this sort of thing so we
 * can retry later - this is where we use this information.
 * <p>
 * The deletions themselves are not performed on the calling thread: each
 * one is wrapped in a {@link Runnable} and submitted to
 * {@code Computer.threadPoolForRemoting}, so this method only selects the
 * candidates and queues the work.
 *
 * @param maxToRetryDeletionOn
 *            The maximum number of VMs to try to remove this time around.
 *            Can be {@link Integer#MAX_VALUE} for unlimited. A value of
 *            zero or less results in nothing being scheduled.
 */
private void retryVMdeletionIfNecessary(final int maxToRetryDeletionOn) {
    if (templateState == null) {
        VSLOG.log(Level.INFO, "retryVMdeletionIfNecessary({0}): templateState==null", maxToRetryDeletionOn);
        return;
    }
    synchronized (templateState) {
        // Find all candidates and trim down the list. This is done while
        // holding the lock (like every other templateState access) and the
        // selection is copied, because List.subList() only returns a view:
        // if the returned list is backed by live state (not visible from
        // here), a concurrent change would otherwise break the iteration.
        final List<String> unwantedVMsThatNeedDeleting = templateState.getUnwantedVMsThatNeedDeleting();
        // Clamp at zero so a non-positive limit means "retry nothing"
        // instead of making subList() throw.
        final int numberToAttemptToRetryThisTime = Math.max(0,
                Math.min(maxToRetryDeletionOn, unwantedVMsThatNeedDeleting.size()));
        final List<String> nodeNamesToRetryDeletion = new ArrayList<String>(
                unwantedVMsThatNeedDeleting.subList(0, numberToAttemptToRetryThisTime));
        // now queue their deletion
        for (final String nodeName : nodeNamesToRetryDeletion) {
            final Boolean isOkToDelete = templateState.isOkToDeleteUnwantedVM(nodeName);
            // isOkToDelete is a boxed Boolean and may be null; compare by
            // value rather than by reference.
            if (Boolean.TRUE.equals(isOkToDelete)) {
                final Runnable task = new Runnable() {
                    @Override
                    public void run() {
                        attemptDeletionOfSlave("retryVMdeletionIfNecessary(" + nodeName + ")", nodeName);
                    }
                };
                VSLOG.log(Level.INFO, "retryVMdeletionIfNecessary({0}): scheduling deletion of {1}", new Object[] { maxToRetryDeletionOn, nodeName });
                Computer.threadPoolForRemoting.submit(task);
            } else {
                VSLOG.log(Level.FINER,
                        "retryVMdeletionIfNecessary({0}): not going to try deleting {1} as isOkToDeleteUnwantedVM({1})=={2}",
                        new Object[]{ maxToRetryDeletionOn, nodeName, isOkToDelete });
            }
        }
    }
}

/**
* This is called by {@link vSphereCloudProvisionedSlave} instances once
* they terminate, so we can take note of their passing and then destroy the
Expand All @@ -383,19 +425,52 @@ void provisionedSlaveHasTerminated(final String cloneName) {
ensureLists();
}
VSLOG.log(Level.FINER, "provisionedSlaveHasTerminated({0}): recording in our runtime state...", cloneName);
// once we're done, remove our cached record.
synchronized (templateState) {
templateState.provisionedSlaveNowTerminated(cloneName);
templateState.provisionedSlaveNowUnwanted(cloneName, true);
}
VSLOG.log(Level.FINER, "provisionedSlaveHasTerminated({0}): destroying VM...", cloneName);
// Deletion can take a long time, so we run it asynchronously because,
// at the point where we're called here, we've locked the remoting queue
// so Jenkins is largely crippled until we return.
// JENKINS-42187 describes the problem (for docker).
final Runnable task = new Runnable() {
@Override
public void run() {
attemptDeletionOfSlave("provisionedSlaveHasTerminated(" + cloneName + ")", cloneName);
}
};
VSLOG.log(Level.INFO, "provisionedSlaveHasTerminated({0}): scheduling deletion of {0}", cloneName);
Computer.threadPoolForRemoting.submit(task);
// We also take this opportunity to see if we've got any other slaves
// that need deleting, and deal with at most one of those
// (asynchronously) as well.
retryVMdeletionIfNecessary(1);
}

private void attemptDeletionOfSlave(final String why, final String cloneName) {
VSLOG.log(Level.FINER, "{0}: destroying VM {1}...", new Object[]{ why, cloneName });
VSphere vSphere = null;
boolean successfullyDeleted = false;
try {
vSphere = vSphereInstance();
// Note: This can block indefinitely - it only completes when
// vSphere tells us the deletion has completed, and if vSphere has
// issues (e.g. a node failure) during that process then the
// deletion task can hang for ages.
vSphere.destroyVm(cloneName, false);
VSLOG.log(Level.FINER, "provisionedSlaveHasTerminated({0}): VM destroyed.", cloneName);
successfullyDeleted = true;
VSLOG.log(Level.FINER, "{0}: VM {1} destroyed.", new Object[]{ why, cloneName });
vSphere.disconnect();
vSphere = null;
} catch (VSphereException ex) {
VSLOG.log(Level.SEVERE, "provisionedSlaveHasTerminated(" + cloneName + "): Exception while trying to destroy VM", ex);
VSLOG.log(Level.SEVERE, why + ": Exception while trying to destroy VM " + cloneName, ex);
} finally {
synchronized (templateState) {
if (successfullyDeleted) {
templateState.unwantedSlaveNowDeleted(cloneName);
} else {
templateState.unwantedSlaveNotDeleted(cloneName);
}
}
if (vSphere != null) {
vSphere.disconnect();
}
Expand Down
Loading