Skip to content

Commit

Permalink
Add micrometer metrics support
Browse files Browse the repository at this point in the history
This commit adds preliminary support to the Swarm Client for
Prometheus metrics (see https://prometheus.io), by way of Micrometer
(see https://micrometer.io). Currently, only process resource usage
and JVM statistics are reported, but in the future we could expand
this to report stats specific to Swarm Client.

One reason for adding this feature is to facilitate monitoring of
Swarm Client nodes. If the Swarm Client service itself crashes, then
alertmanager (see https://github.com/prometheus/alertmanager) can be
used to send alerts about the service being down.

At the moment, only Prometheus is configured as an endpoint, but in
the future other metrics platforms could also be supported.
  • Loading branch information
nre-ableton committed Oct 29, 2020
1 parent 799e51f commit 3131c1b
Show file tree
Hide file tree
Showing 6 changed files with 123 additions and 0 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ settings unexpectedly.
* [Changelog](CHANGELOG.md)
* [Global Security Configuration](docs/security.md)
* [Logging](docs/logging.md)
* [Prometheus](docs/prometheus.md)
* [Proxy Configuration](docs/proxy.md)

## Available options
Expand Down Expand Up @@ -70,6 +71,7 @@ Name | Description
`-passwordEnvVariable VAL` | Environment variable containing the Jenkins user API token or password.
`-passwordFile VAL` | File containing the Jenkins user API token or password.
`-pidFile VAL` | File to write PID to. The client will refuse to start if this file exists and the previous process is still running.
`-prometheusPort N` | If defined, then start an HTTP service on this port for Prometheus metrics. (default: -1)
`-retry N` | Number of retries before giving up. Unlimited if not specified. (default: -1)
`-retryBackOffStrategy RETRY_BACK_OFF_STRATEGY` | The mode controlling retry wait time. Can be either 'none' (use same interval between retries) or 'linear' (increase wait time before each retry up to maxRetryInterval) or 'exponential' (double wait interval on each retry up to maxRetryInterval). Default is 'none'. (default: NONE)
`-retryInterval N` | Time to wait before retry in seconds. Default is 10 seconds. (default: 10)
Expand Down
10 changes: 10 additions & 0 deletions client/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -166,5 +166,15 @@
<artifactId>oshi-core</artifactId>
<version>5.3.2</version>
</dependency>
<!-- Provides the io.micrometer.core.instrument.binder.* classes used by SwarmClient. -->
<dependency>
  <groupId>io.micrometer</groupId>
  <artifactId>micrometer-core</artifactId>
  <version>1.5.4</version>
</dependency>
<!-- Provides PrometheusMeterRegistry and PrometheusConfig. -->
<dependency>
  <groupId>io.micrometer</groupId>
  <artifactId>micrometer-registry-prometheus</artifactId>
  <version>1.5.4</version>
</dependency>
</dependencies>
</project>
5 changes: 5 additions & 0 deletions client/src/main/java/hudson/plugins/swarm/Options.java
Original file line number Diff line number Diff line change
Expand Up @@ -203,4 +203,9 @@ public class Options {
+ " missing.",
forbids = "-disableWorkDir")
public boolean failIfWorkDirIsMissing = false;

/**
 * Port on which the Prometheus metrics HTTP service should listen. SwarmClient only starts
 * the service when this value is greater than zero, so the default of -1 leaves it disabled.
 */
@Option(
name = "-prometheusPort",
usage = "If defined, then start an HTTP service on this port for Prometheus metrics.")
public int prometheusPort = -1;
}
53 changes: 53 additions & 0 deletions client/src/main/java/hudson/plugins/swarm/SwarmClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,17 @@
import hudson.remoting.Launcher;
import hudson.remoting.jnlp.Main;

import com.sun.net.httpserver.HttpServer;
import io.micrometer.core.instrument.binder.jvm.ClassLoaderMetrics;
import io.micrometer.core.instrument.binder.jvm.JvmGcMetrics;
import io.micrometer.core.instrument.binder.jvm.JvmHeapPressureMetrics;
import io.micrometer.core.instrument.binder.jvm.JvmMemoryMetrics;
import io.micrometer.core.instrument.binder.jvm.JvmThreadMetrics;
import io.micrometer.core.instrument.binder.system.FileDescriptorMetrics;
import io.micrometer.core.instrument.binder.system.ProcessorMetrics;
import io.micrometer.core.instrument.binder.system.UptimeMetrics;
import io.micrometer.prometheus.PrometheusConfig;
import io.micrometer.prometheus.PrometheusMeterRegistry;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hc.client5.http.auth.AuthCache;
Expand Down Expand Up @@ -37,11 +48,13 @@
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UncheckedIOException;
import java.io.UnsupportedEncodingException;
import java.net.Inet4Address;
import java.net.Inet6Address;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.NetworkInterface;
import java.net.SocketException;
Expand Down Expand Up @@ -79,6 +92,7 @@ public class SwarmClient {
private final Options options;
private final String hash;
private String name;
// HTTP server exposing Prometheus metrics; remains null unless startPrometheusService creates it.
private HttpServer prometheusServer = null;

public SwarmClient(Options options) {
this.options = options;
Expand Down Expand Up @@ -108,6 +122,10 @@ public SwarmClient(Options options) {
"Problem reading labels from file " + options.labelsFile, e);
}
}

if (options.prometheusPort > 0) {
startPrometheusService(options.prometheusPort);
}
}

public String getName() {
Expand Down Expand Up @@ -653,13 +671,48 @@ private static String hash(File remoteFsRoot) {
}

/**
 * Terminates the JVM with the given status code, stopping the Prometheus HTTP service first
 * if one is running.
 *
 * @param status exit status handed to {@link System#exit(int)}
 */
public void exitWithStatus(int status) {
    final HttpServer metricsServer = prometheusServer;
    if (metricsServer != null) {
        // The argument is the delay, in seconds, passed to HttpServer.stop.
        metricsServer.stop(1);
    }
    System.exit(status);
}

/**
 * Blocks the calling thread for the given number of seconds.
 *
 * @param waitTime time to sleep, in seconds
 * @throws InterruptedException if the thread is interrupted while sleeping
 */
public void sleepSeconds(int waitTime) throws InterruptedException {
    // Multiply as long: in int arithmetic, waits above Integer.MAX_VALUE / 1000 seconds
    // (about 24.8 days) overflow to a negative value, which Thread.sleep rejects.
    Thread.sleep(waitTime * 1000L);
}

/**
 * Starts an embedded HTTP server that serves the contents of a Prometheus meter registry on
 * the {@code /prometheus} path. The registry is populated with class loader, file descriptor,
 * garbage collection, heap pressure, memory, thread, processor and uptime binders.
 *
 * @param port TCP port on which the HTTP server listens
 * @throws UncheckedIOException if the HTTP server cannot be created
 */
private void startPrometheusService(int port) {
    logger.fine("Starting Prometheus service on port " + port);
    PrometheusMeterRegistry prometheusRegistry =
            new PrometheusMeterRegistry(PrometheusConfig.DEFAULT);
    // Add some standard metrics to the registry
    new ClassLoaderMetrics().bindTo(prometheusRegistry);
    new FileDescriptorMetrics().bindTo(prometheusRegistry);
    new JvmGcMetrics().bindTo(prometheusRegistry);
    new JvmHeapPressureMetrics().bindTo(prometheusRegistry);
    new JvmMemoryMetrics().bindTo(prometheusRegistry);
    new JvmThreadMetrics().bindTo(prometheusRegistry);
    new ProcessorMetrics().bindTo(prometheusRegistry);
    new UptimeMetrics().bindTo(prometheusRegistry);

    try {
        prometheusServer = HttpServer.create(new InetSocketAddress(port), 0);
    } catch (IOException e) {
        logger.severe("Failed to start Prometheus service: " + e.getMessage());
        throw new UncheckedIOException(
                "Failed to start Prometheus service on port " + port, e);
    }

    prometheusServer.createContext("/prometheus", httpExchange -> {
        String response = prometheusRegistry.scrape();
        byte[] responseContent = response.getBytes(StandardCharsets.UTF_8);
        // Declare the Prometheus text exposition format so scrapers do not have to guess.
        httpExchange.getResponseHeaders()
                .set("Content-Type", "text/plain; version=0.0.4; charset=utf-8");
        httpExchange.sendResponseHeaders(200, responseContent.length);
        try (OutputStream os = httpExchange.getResponseBody()) {
            os.write(responseContent);
        }
    });

    // HttpServer.start() already runs the server in its own background thread, so no
    // wrapper thread is needed; calling it directly also means the log line below is only
    // written after start() has returned.
    prometheusServer.start();
    logger.info("Started Prometheus service on port " + port);
}

private static class DefaultTrustManager implements X509TrustManager {

final List<String> allowedFingerprints = new ArrayList<>();
Expand Down
26 changes: 26 additions & 0 deletions docs/prometheus.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Prometheus monitoring

The Jenkins Swarm Client has support for [Prometheus](https://prometheus.io) monitoring: it can expose metrics over HTTP
for a Prometheus server to scrape. To start a Prometheus endpoint, pass a positive port number to the `-prometheusPort`
option when starting the client JAR. The service will be stopped when the Swarm Client exits.

The actual metrics can be accessed on the `/prometheus` endpoint. So for example, if the node's IP address is
`169.254.10.12`, and `9100` is passed to `-prometheusPort`, then the metrics can be accessed at:
`http://169.254.10.12:9100/prometheus`.

## Data Reported

The client reports metrics for:

- Basic process info, including:
  - Process uptime
  - CPU time consumed
  - Virtual memory consumed
  - Resident memory consumed
  - File descriptors consumed
- JVM metrics such as:
  - CPU usage
  - Memory usage
  - Thread states
  - Garbage collection statistics
  - Class loader statistics
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,12 @@
import oshi.software.os.OSProcess;
import oshi.software.os.OperatingSystem;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
Expand Down Expand Up @@ -471,6 +474,30 @@ public void jarCacheWithCustomPath() throws Exception {
jarCachePath);
}

/**
 * Starts a Swarm Client with the Prometheus service enabled on port 9999 and verifies that
 * the {@code /prometheus} endpoint returns a non-empty body containing a known metric name.
 */
@Test
public void metricsPrometheus() throws Exception {
    swarmClientRule.createSwarmClient("-prometheusPort", "9999");

    // Fetch the metrics page from the client
    URL url = new URL("http://localhost:9999/prometheus");
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    StringBuilder content = new StringBuilder();
    try {
        connection.setRequestMethod("GET");
        // The service encodes its response as UTF-8, so decode explicitly instead of
        // relying on the platform default charset. try-with-resources closes the reader
        // even if reading fails.
        try (BufferedReader reader =
                new BufferedReader(
                        new InputStreamReader(
                                connection.getInputStream(), StandardCharsets.UTF_8))) {
            String inputLine;
            while ((inputLine = reader.readLine()) != null) {
                content.append(inputLine);
            }
        }
    } finally {
        connection.disconnect();
    }

    // Assert that a non-zero length string was read
    assertTrue(content.length() > 0);
    // Assert that we got at least one known Prometheus metric
    assertTrue(content.toString().contains("process_cpu_usage"));
}

@After
public void tearDown() throws IOException {
Files.deleteIfExists(getPidFile());
Expand Down

0 comments on commit 3131c1b

Please sign in to comment.