Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update clean lost node without requiring a instanceId #503

Open
wants to merge 31 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
a91653c
clean lost node working
gbhat618 Jan 2, 2025
c3201ac
developing test
gbhat618 Jan 3, 2025
65bd743
revert the ITUtil changes
gbhat618 Jan 3, 2025
40a1222
Squashed commit of the following:
gbhat618 Jan 3, 2025
b7de0f2
clean nodes in working - needs some more cleanup
gbhat618 Jan 3, 2025
e96b05a
tests and code in working codition
gbhat618 Jan 4, 2025
ca8e589
update the compute client v2 integration tests
gbhat618 Jan 4, 2025
7f2d305
improve comments and small code updates
gbhat618 Jan 4, 2025
caae499
Revert "Squashed commit of the following:"
gbhat618 Jan 4, 2025
97c74c7
minor improvements comments and code
gbhat618 Jan 4, 2025
fe03a64
Update src/test/java/com/google/jenkins/plugins/computeengine/integra…
gbhat618 Jan 7, 2025
dcda478
Update src/test/java/com/google/jenkins/plugins/computeengine/integra…
gbhat618 Jan 7, 2025
92bd182
Update src/test/java/com/google/jenkins/plugins/computeengine/integra…
gbhat618 Jan 7, 2025
e47588d
Update src/test/java/com/google/jenkins/plugins/computeengine/integra…
gbhat618 Jan 7, 2025
4e322d7
Update src/main/java/com/google/jenkins/plugins/computeengine/CleanLo…
gbhat618 Jan 7, 2025
9c840b5
Update src/main/java/com/google/jenkins/plugins/computeengine/CleanLo…
gbhat618 Jan 7, 2025
f17d80e
Update src/test/java/com/google/jenkins/plugins/computeengine/integra…
gbhat618 Jan 7, 2025
6315666
Update src/test/java/com/google/jenkins/plugins/computeengine/integra…
gbhat618 Jan 7, 2025
4cd89ca
Squashed commit of the following:
gbhat618 Jan 7, 2025
5484655
improve code and fix ComputeClientV2 intialization
gbhat618 Jan 7, 2025
38bb7f6
Revert "Squashed commit of the following:"
gbhat618 Jan 7, 2025
fabd527
Merge branch 'develop' of github.com:jenkinsci/google-compute-engine-…
gbhat618 Jan 7, 2025
c243b28
spotless
gbhat618 Jan 7, 2025
1071692
more accurate log message
gbhat618 Jan 7, 2025
0a15aa7
update log recorder with static list to capture more log records than…
gbhat618 Jan 7, 2025
3a8030b
update DateFormatter formatting requirement docs
gbhat618 Jan 7, 2025
012cdc7
add Locale.ROOT in lowercase and uppercase coversions
gbhat618 Jan 7, 2025
a87a569
improve the comment with better clarity
gbhat618 Jan 7, 2025
458b0a8
use the format with hard coded zone info for readability, so no requi…
gbhat618 Jan 7, 2025
deb6d70
remove the non-required javadoc in timestamp string
gbhat618 Jan 7, 2025
28cc50d
Update src/main/java/com/google/jenkins/plugins/computeengine/CleanLo…
gbhat618 Jan 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,13 @@
import hudson.model.PeriodicWork;
import hudson.model.Slave;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.security.GeneralSecurityException;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoUnit;
import java.util.List;
import java.util.Set;
import java.util.TimeZone;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
Expand All @@ -40,17 +43,28 @@
@Symbol("cleanLostNodesWork")
public class CleanLostNodesWork extends PeriodicWork {
protected final Logger logger = Logger.getLogger(getClass().getName());
public static final String NODE_IN_USE_LABEL_KEY = "jenkins_node_in_use";
public static final String NODE_IN_USE_LABEL_KEY = "jenkins_node_last_refresh";
public static final long RECURRENCE_PERIOD = Long.parseLong(
System.getProperty("jenkins.cloud.gcp.cleanLostNodesWork.recurrencePeriod", String.valueOf(HOUR)));
System.getProperty(CleanLostNodesWork.class.getName() + ".recurrencePeriod", String.valueOf(HOUR)));
public static final int LOST_MULTIPLIER = 3;
gbhat618 marked this conversation as resolved.
Show resolved Hide resolved
private static final DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy_MM_dd'T'HH_mm_ss_SSSX");
gbhat618 marked this conversation as resolved.
Show resolved Hide resolved

/** {@inheritDoc} */
@Override
public long getRecurrencePeriod() {
return RECURRENCE_PERIOD;
}

/**
* Returns current time string as human-readable with the format allowed in GCP label.
* GCP label format: "The value can only contain lowercase letters, numeric characters, underscores and dashes.
* The value can be at most 63 characters long. International characters are allowed".
* Example: 2025_01_07t11_28_06_379z
*/
public static String getLastRefreshLabelVal() {
return formatter.format(OffsetDateTime.now(ZoneOffset.UTC)).toLowerCase();
gbhat618 marked this conversation as resolved.
Show resolved Hide resolved
}

/** {@inheritDoc} */
@Override
protected void doRun() {
Expand All @@ -60,7 +74,13 @@ protected void doRun() {

private void cleanCloud(ComputeEngineCloud cloud) {
logger.log(Level.FINEST, "Cleaning cloud " + cloud.getCloudName());
ComputeClientV2 clientV2 = ComputeClientV2.createFromComputeEngineCloud(cloud);
ComputeClientV2 clientV2;
try {
clientV2 = cloud.getClientV2();
} catch (GeneralSecurityException | IOException ex) {
logger.log(Level.WARNING, "Error getting clientV2 for cloud " + cloud.getCloudName(), ex);
return;
}
List<Instance> remoteInstances = findRunningRemoteInstances(clientV2);
Set<String> localInstances = findLocalInstances(cloud);
if (!(localInstances.isEmpty() || remoteInstances.isEmpty())) {
Expand All @@ -75,18 +95,17 @@ private boolean isOrphaned(Instance remote, Set<String> localInstances) {
if (localInstances.contains(remote.getName())) {
return false;
}
String nodeInUseTs = remote.getLabels().get(NODE_IN_USE_LABEL_KEY);
if (nodeInUseTs == null) {
String nodeLastRefresh = remote.getLabels().get(NODE_IN_USE_LABEL_KEY);
if (nodeLastRefresh == null) {
return false;
}
long lastUsed = Long.parseLong(nodeInUseTs);
var dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSSXXX");
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
boolean isOrphan = lastUsed < System.currentTimeMillis() - RECURRENCE_PERIOD * LOST_MULTIPLIER;
OffsetDateTime lastRefresh = OffsetDateTime.parse(nodeLastRefresh.toUpperCase(), formatter);
gbhat618 marked this conversation as resolved.
Show resolved Hide resolved
boolean isOrphan = lastRefresh
.plus(RECURRENCE_PERIOD * LOST_MULTIPLIER, ChronoUnit.MILLIS)
.isBefore(OffsetDateTime.now(ZoneOffset.UTC));
logger.log(
Level.FINEST,
"Instance " + remote.getName() + " last used at: " + dateFormat.format(lastUsed) + ", isOrphan: "
+ isOrphan);
"Instance " + remote.getName() + " last_refresh label value: " + nodeLastRefresh + ", isOrphan: " + isOrphan);
return isOrphan;
}

Expand Down Expand Up @@ -137,13 +156,14 @@ private void updateLocalInstancesLabel(
ComputeClientV2 clientV2, Set<String> localInstances, List<Instance> remoteInstances) {
var remoteInstancesByName =
remoteInstances.stream().collect(Collectors.toMap(Instance::getName, instance -> instance));
var labelToUpdate = ImmutableMap.of(NODE_IN_USE_LABEL_KEY, String.valueOf(System.currentTimeMillis()));
var labelToUpdate = ImmutableMap.of(NODE_IN_USE_LABEL_KEY, getLastRefreshLabelVal());
for (String instanceName : localInstances) {
if (!remoteInstancesByName.containsKey(instanceName)) {
var remoteInstance = remoteInstancesByName.get(instanceName);
if (remoteInstance == null) {
continue;
}
try {
clientV2.updateInstanceLabels(remoteInstancesByName.get(instanceName), labelToUpdate);
clientV2.updateInstanceLabels(remoteInstance, labelToUpdate);
logger.log(Level.FINEST, "Updated label for instance " + instanceName);
} catch (IOException e) {
logger.log(Level.WARNING, "Error updating label for instance " + instanceName, e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import com.google.jenkins.plugins.computeengine.client.ClientUtil;
import com.google.jenkins.plugins.computeengine.client.ComputeClientV2;
import com.google.jenkins.plugins.credentials.oauth.GoogleOAuth2Credentials;
import edu.umd.cs.findbugs.annotations.NonNull;
import hudson.Extension;
Expand All @@ -48,6 +49,7 @@
import hudson.util.ListBoxModel;
import java.io.IOException;
import java.io.PrintStream;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
Expand Down Expand Up @@ -90,6 +92,7 @@ public class ComputeEngineCloud extends AbstractCloudImpl {
private List<InstanceConfiguration> configurations;

private transient volatile ComputeClient client;
private transient volatile ComputeClientV2 clientV2;
private boolean noDelayProvisioning;

@DataBoundConstructor
Expand Down Expand Up @@ -162,7 +165,7 @@ protected Object readResolve() {
// Apply a label that identifies the name of this instance configuration
configuration.appendLabel(CONFIG_LABEL_KEY, configuration.getNamePrefix());
configuration.appendLabel(
CleanLostNodesWork.NODE_IN_USE_LABEL_KEY, String.valueOf(System.currentTimeMillis()));
CleanLostNodesWork.NODE_IN_USE_LABEL_KEY, CleanLostNodesWork.getLastRefreshLabelVal());
}
}
setInstanceId(instanceId);
Expand Down Expand Up @@ -211,6 +214,17 @@ public ComputeClient getClient() {
return client;
}

public ComputeClientV2 getClientV2() throws IOException, GeneralSecurityException {
if (clientV2 == null) {
synchronized (this) {
if (clientV2 == null) {
clientV2 = ClientUtil.createComputeClientV2(projectId, credentialsId);
}
}
}
return clientV2;
}

/**
* Set configurations for this cloud.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
import com.cloudbees.plugins.credentials.CredentialsProvider;
import com.cloudbees.plugins.credentials.domains.DomainRequirement;
import com.google.api.client.auth.oauth2.Credential;
import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport;
import com.google.api.client.http.HttpTransport;
import com.google.api.client.json.jackson2.JacksonFactory;
import com.google.api.services.compute.Compute;
import com.google.cloud.graphite.platforms.plugin.client.ClientFactory;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
Expand All @@ -19,6 +22,7 @@
import java.security.GeneralSecurityException;
import java.util.List;
import java.util.Optional;
import jenkins.model.Jenkins;

/** Utilities for using the gcp-plugin-core clients. */
public class ClientUtil {
Expand Down Expand Up @@ -88,4 +92,17 @@ private static GoogleRobotCredentials getRobotCredentials(
private static Credential getGoogleCredential(GoogleRobotCredentials credentials) throws GeneralSecurityException {
return credentials.getGoogleCredential(new ComputeEngineScopeRequirement());
}

public static ComputeClientV2 createComputeClientV2(String projectId, String credentialsId)
throws GeneralSecurityException, IOException {
Credential httpRequestInitializer = ClientUtil.getGoogleCredential(
ClientUtil.getRobotCredentials(Jenkins.get(), ImmutableList.of(), credentialsId));
Compute compute = new Compute.Builder(
GoogleNetHttpTransport.newTrustedTransport(), new JacksonFactory(), httpRequestInitializer)
.setGoogleClientRequestInitializer(request ->
request.setRequestHeaders(request.getRequestHeaders().setUserAgent(APPLICATION_NAME)))
.setApplicationName(ClientUtil.APPLICATION_NAME)
.build();
return new ComputeClientV2(projectId, compute);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@
import com.google.api.services.compute.model.Instance;
import com.google.api.services.compute.model.InstancesScopedList;
import com.google.api.services.compute.model.InstancesSetLabelsRequest;
import com.google.cloud.graphite.platforms.plugin.client.ComputeClient;
import com.google.jenkins.plugins.computeengine.ComputeEngineCloud;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.List;
import java.util.Map;
import java.util.Objects;
Expand Down Expand Up @@ -35,33 +32,6 @@ public ComputeClientV2(String projectId, Compute compute) {
this.compute = compute;
}

/**
* Instantiates a {@link ComputeClientV2} using the configuration from a given {@link ComputeEngineCloud}.
* <p>Jenkins may host multiple cloud configurations, each with distinct service accounts. This diversity necessitates leveraging an
* existing {@link ComputeEngineCloud} instance to accurately configure {@link ComputeClientV2}. Interfacing directly with
* {@link ComputeClient} is impractical due to its archived status and inability to extend, as noted in class-level documentation.
* Consequently, this method employs reflection to retrieve the necessary {@link Compute} instance from within {@link ComputeClient}.
*
* @return A newly created {@link ComputeClientV2} instance, configured with the credentials and settings from the specified
* {@link ComputeEngineCloud}.
*/
public static ComputeClientV2 createFromComputeEngineCloud(ComputeEngineCloud cloud) {
try {
ComputeClient client = cloud.getClient();
Field f = client.getClass().getDeclaredField("compute");
f.setAccessible(true);
Object wrapper = f.get(client);
Field f2 = wrapper.getClass().getDeclaredField("compute");
f2.setAccessible(true);
ComputeClientV2 clientV2 = new ComputeClientV2(cloud.getProjectId(), (Compute) f2.get(wrapper));
f.setAccessible(false);
f2.setAccessible(false);
return clientV2;
} catch (NoSuchFieldException | IllegalAccessException e) {
throw new RuntimeException(e);
}
}

/**
* Updates the labels of a specified {@code instance} by merging or replacing them with {@code newLabels}.
* <p>This method adds any new labels found in {@code newLabels} to the instance's existing labels and updates
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
Expand All @@ -34,7 +33,7 @@
import org.jvnet.hudson.test.TailLog;

public class CleanLostNodesWorkIT {
private static final Logger log = Logger.getLogger(CleanLostNodesWorkIT.class.getName());
private static final Logger LOGGER = Logger.getLogger(CleanLostNodesWorkIT.class.getName());
private static final String LOG_RECORDER_NAME = "CleanLostNodesWork log recorder";
/*
Provisioning a Cloud VM encounters two primary delays:
Expand All @@ -51,38 +50,31 @@ public class CleanLostNodesWorkIT {
Based on tests conducted with intervals of 30s, 45s, and 60s, a 60s delay currently proves most effective.
*/
private static final long CLEAN_LOST_NODES_WORK_RECURRENCE_PERIOD = 60 * 1000;
private static final Map<String, String> googleLabels = getLabel(CleanLostNodesWorkIT.class);
private static final Map<String, String> GOOGLE_LABELS = getLabel(CleanLostNodesWorkIT.class);

@Rule
public RealJenkinsRule rj1 = new RealJenkinsRule();
public RealJenkinsRule rj1 = new RealJenkinsRule().withColor(PrefixedOutputStream.Color.BLUE);

@Rule
public RealJenkinsRule rj2 = new RealJenkinsRule();
public RealJenkinsRule rj2 = new RealJenkinsRule().withColor(PrefixedOutputStream.Color.RED);

@Before
public void init() throws Throwable {
Stack<PrefixedOutputStream.Color> colors = new Stack<>() {
{
push(PrefixedOutputStream.Color.BLUE);
push(PrefixedOutputStream.Color.RED);
}
};
for (var rj : List.of(rj1, rj2)) {
rj.javaOptions("-Djenkins.cloud.gcp.cleanLostNodesWork.recurrencePeriod="
rj.javaOptions("-D" + CleanLostNodesWork.class.getName() + ".recurrencePeriod="
+ CLEAN_LOST_NODES_WORK_RECURRENCE_PERIOD)
.withLogger(CleanLostNodesWork.class, Level.FINEST)
.withColor(colors.pop());
.withLogger(CleanLostNodesWork.class, Level.FINEST);
rj.startJenkins();
rj.runRemotely(r -> {
initCredentials(r);
var cloud = initCloud(r);
var instanceConfig = instanceConfigurationBuilder()
.numExecutorsStr(NUM_EXECUTORS)
.labels(LABEL)
.oneShot(false)
.oneShot(true)
.createSnapshot(false)
.template(NULL_TEMPLATE)
.googleLabels(googleLabels)
.googleLabels(GOOGLE_LABELS)
.cloud(cloud)
.build();
cloud.setConfigurations(ImmutableList.of(instanceConfig));
Expand All @@ -94,8 +86,9 @@ public void init() throws Throwable {
@After
public void tearDown() throws Throwable {
rj2.runRemotely(j -> {
j.waitUntilNoActivity();
var cloud = (ComputeEngineCloud) j.jenkins.clouds.getByName("gce-integration");
teardownResources(cloud.getClient(), googleLabels, log);
teardownResources(cloud.getClient(), GOOGLE_LABELS, LOGGER);
});
}

Expand Down Expand Up @@ -136,15 +129,15 @@ public void testLostNodeCleanedUpBySecondController() throws Throwable {
try (var tail = new TailLog(j, "p1", 1).withColor(PrefixedOutputStream.Color.MAGENTA)) {
var run = p1.scheduleBuild2(0).waitForStart();
await().timeout(4, TimeUnit.MINUTES).until(() -> run.getLog().contains("first sleep done"));
log.info("Build is already running, can proceed to stopping jenkins to make the agent a lost VM");
LOGGER.info("Build is already running, can proceed to stopping jenkins to make the agent a lost VM");
RealJenkinsLogUtil.assertLogContains(
j,
LOG_RECORDER_NAME,
"Found 1 running remote instances",
"Found 1 local instances",
"Updated label for instance");
RealJenkinsLogUtil.assertLogDoesNotContain(
j, LOG_RECORDER_NAME, "isOrphan: true", "not found locally, removing it");
j, LOG_RECORDER_NAME, "isOrphan: true", "Removing orphaned instance");
}
});
rj1.stopJenkins();
Expand All @@ -155,28 +148,28 @@ public void testLostNodeCleanedUpBySecondController() throws Throwable {
"VM is still there",
1,
cloud.getClient()
.listInstancesWithLabel(cloud.getProjectId(), googleLabels)
.listInstancesWithLabel(cloud.getProjectId(), GOOGLE_LABELS)
.size());

log.info("test sleeps for " + getSleepSeconds() + " seconds; so that the lost VM is detected by the "
LOGGER.info("test sleeps for " + getSleepSeconds() + " seconds; so that the lost VM is detected by the "
+ "second controller and it is deleted");
TimeUnit.SECONDS.sleep(getSleepSeconds());
log.info("proceeding after sleep");
LOGGER.info("proceeding after sleep");

var instances = cloud.getClient().listInstancesWithLabel(cloud.getProjectId(), googleLabels);
var instances = cloud.getClient().listInstancesWithLabel(cloud.getProjectId(), GOOGLE_LABELS);
if (!instances.isEmpty()) {
assertNotEquals(
"VM is not running",
"RUNNING",
cloud.getClient()
.listInstancesWithLabel(cloud.getProjectId(), googleLabels)
.listInstancesWithLabel(cloud.getProjectId(), GOOGLE_LABELS)
.get(0)
.getStatus());
}
await("VM didn't get removed even after waiting 2 minutes post it was stopped")
await("VM didn't get removed even after waiting 2 minutes after it was stopped")
.timeout(2, TimeUnit.MINUTES)
.until(() -> cloud.getClient()
.listInstancesWithLabel(cloud.getProjectId(), googleLabels)
.listInstancesWithLabel(cloud.getProjectId(), GOOGLE_LABELS)
.isEmpty());
RealJenkinsLogUtil.assertLogContains(
j,
Expand Down
Loading
Loading