Skip to content

Commit

Permalink
feat: Introduce OpenTelemetry Metrics Recording (#2500)
Browse files Browse the repository at this point in the history
Builds off #2433,
based on the design in go/java-gapic-otel-metrics-design.

Discovered two issues via showcase tests:
1. #2502
2. #2503

These issues are not blocking for this PR.

---------

Co-authored-by: Blake Li <[email protected]>
  • Loading branch information
lqiu96 and blakeli0 authored Mar 14, 2024
1 parent ff56a20 commit b936580
Show file tree
Hide file tree
Showing 15 changed files with 1,315 additions and 69 deletions.
7 changes: 7 additions & 0 deletions gapic-generator-java-bom/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,13 @@
<type>pom</type>
<scope>import</scope>
</dependency>
<dependency>
<groupId>io.opentelemetry</groupId>
<artifactId>opentelemetry-bom</artifactId>
<version>${opentelemetry.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>

<!-- Libraries published from this repositories -->
<dependency>
Expand Down
1 change: 1 addition & 0 deletions gapic-generator-java-pom-parent/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
<gson.version>2.10.1</gson.version>
<guava.version>32.1.3-jre</guava.version>
<protobuf.version>3.25.2</protobuf.version>
<opentelemetry.version>1.35.0</opentelemetry.version>
<maven.compiler.release>8</maven.compiler.release>
</properties>

Expand Down
1 change: 1 addition & 0 deletions gax-java/dependencies.properties
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ maven.com_google_api_grpc_proto_google_common_protos=com.google.api.grpc:proto-g
maven.com_google_api_grpc_grpc_google_common_protos=com.google.api.grpc:grpc-google-common-protos:2.36.0
maven.com_google_auth_google_auth_library_oauth2_http=com.google.auth:google-auth-library-oauth2-http:1.23.0
maven.com_google_auth_google_auth_library_credentials=com.google.auth:google-auth-library-credentials:1.23.0
maven.io_opentelemetry_opentelemetry_api=io.opentelemetry:opentelemetry-api:1.35.0
maven.io_opencensus_opencensus_api=io.opencensus:opencensus-api:0.31.1
maven.io_opencensus_opencensus_contrib_grpc_metrics=io.opencensus:opencensus-contrib-grpc-metrics:0.31.1
maven.io_opencensus_opencensus_contrib_http_util=io.opencensus:opencensus-contrib-http-util:0.31.1
Expand Down
1 change: 1 addition & 0 deletions gax-java/gax/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ _COMPILE_DEPS = [
"@com_google_code_findbugs_jsr305//jar",
"@com_google_errorprone_error_prone_annotations//jar",
"@com_google_guava_guava//jar",
"@io_opentelemetry_opentelemetry_api//jar",
"@io_opencensus_opencensus_api//jar",
"@io_opencensus_opencensus_contrib_http_util//jar",
"@io_grpc_grpc_java//context:context",
Expand Down
4 changes: 4 additions & 0 deletions gax-java/gax/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@
<artifactId>graal-sdk</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>io.opentelemetry</groupId>
<artifactId>opentelemetry-api</artifactId>
</dependency>
</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,40 +40,50 @@
import java.util.Map;
import java.util.concurrent.CancellationException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import javax.annotation.Nullable;
import org.threeten.bp.Duration;

/**
* This class computes generic metrics that can be observed in the lifecycle of an RPC operation.
* The responsibility of recording metrics should delegate to {@link MetricsRecorder}, hence this
* class should not have any knowledge about the observability framework used for metrics recording.
* method_name and language will be autopopulated attributes. Default value of language is 'Java'.
*/
@BetaApi
@InternalApi
public class MetricsTracer implements ApiTracer {

private static final String STATUS_ATTRIBUTE = "status";

public static final String METHOD_NAME_ATTRIBUTE = "method_name";
public static final String LANGUAGE_ATTRIBUTE = "language";
public static final String STATUS_ATTRIBUTE = "status";
public static final String DEFAULT_LANGUAGE = "Java";
private static final String OPERATION_FINISHED_STATUS_MESSAGE =
"Operation has already been completed";
private Stopwatch attemptTimer;

private final Stopwatch operationTimer = Stopwatch.createStarted();

private final Map<String, String> attributes = new HashMap<>();

private MetricsRecorder metricsRecorder;
private final MetricsRecorder metricsRecorder;
private final AtomicBoolean operationFinished;

public MetricsTracer(MethodName methodName, MetricsRecorder metricsRecorder) {
this.attributes.put("method_name", methodName.toString());
this.attributes.put(METHOD_NAME_ATTRIBUTE, methodName.toString());
this.attributes.put(LANGUAGE_ATTRIBUTE, DEFAULT_LANGUAGE);
this.metricsRecorder = metricsRecorder;
this.operationFinished = new AtomicBoolean();
}

/**
* Signals that the overall operation has finished successfully. The tracer is now considered
* closed and should no longer be used. Successful operation adds "OK" value to the status
* attribute key.
*
* @throws IllegalStateException if an operation completion call has already been invoked
*/
@Override
public void operationSucceeded() {
if (operationFinished.getAndSet(true)) {
throw new IllegalStateException(OPERATION_FINISHED_STATUS_MESSAGE);
}
attributes.put(STATUS_ATTRIBUTE, StatusCode.Code.OK.toString());
metricsRecorder.recordOperationLatency(
operationTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
Expand All @@ -84,9 +94,14 @@ public void operationSucceeded() {
* Signals that the operation was cancelled by the user. The tracer is now considered closed and
* should no longer be used. Cancelled operation adds "CANCELLED" value to the status attribute
* key.
*
* @throws IllegalStateException if an operation completion call has already been invoked
*/
@Override
public void operationCancelled() {
if (operationFinished.getAndSet(true)) {
throw new IllegalStateException(OPERATION_FINISHED_STATUS_MESSAGE);
}
attributes.put(STATUS_ATTRIBUTE, StatusCode.Code.CANCELLED.toString());
metricsRecorder.recordOperationLatency(
operationTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
Expand All @@ -97,9 +112,14 @@ public void operationCancelled() {
* Signals that the operation was cancelled by the user. The tracer is now considered closed and
* should no longer be used. Failed operation extracts the error from the throwable and adds it to
* the status attribute key.
*
* @throws IllegalStateException if an operation completion call has already been invoked
*/
@Override
public void operationFailed(Throwable error) {
if (operationFinished.getAndSet(true)) {
throw new IllegalStateException(OPERATION_FINISHED_STATUS_MESSAGE);
}
attributes.put(STATUS_ATTRIBUTE, extractStatus(error));
metricsRecorder.recordOperationLatency(
operationTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
Expand All @@ -126,7 +146,6 @@ public void attemptStarted(Object request, int attemptNumber) {
*/
@Override
public void attemptSucceeded() {

attributes.put(STATUS_ATTRIBUTE, StatusCode.Code.OK.toString());
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
metricsRecorder.recordAttemptCount(1, attributes);
Expand All @@ -138,7 +157,6 @@ public void attemptSucceeded() {
*/
@Override
public void attemptCancelled() {

attributes.put(STATUS_ATTRIBUTE, StatusCode.Code.CANCELLED.toString());
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
metricsRecorder.recordAttemptCount(1, attributes);
Expand All @@ -154,7 +172,6 @@ public void attemptCancelled() {
*/
@Override
public void attemptFailed(Throwable error, Duration delay) {

attributes.put(STATUS_ATTRIBUTE, extractStatus(error));
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
metricsRecorder.recordAttemptCount(1, attributes);
Expand All @@ -169,7 +186,6 @@ public void attemptFailed(Throwable error, Duration delay) {
*/
@Override
public void attemptFailedRetriesExhausted(Throwable error) {

attributes.put(STATUS_ATTRIBUTE, extractStatus(error));
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
metricsRecorder.recordAttemptCount(1, attributes);
Expand All @@ -184,7 +200,6 @@ public void attemptFailedRetriesExhausted(Throwable error) {
*/
@Override
public void attemptPermanentFailure(Throwable error) {

attributes.put(STATUS_ATTRIBUTE, extractStatus(error));
metricsRecorder.recordAttemptLatency(attemptTimer.elapsed(TimeUnit.MILLISECONDS), attributes);
metricsRecorder.recordAttemptCount(1, attributes);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/*
* Copyright 2024 Google LLC
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Google LLC nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package com.google.api.gax.tracing;

import com.google.api.core.BetaApi;
import com.google.api.core.InternalApi;
import com.google.api.gax.core.GaxProperties;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import io.opentelemetry.api.OpenTelemetry;
import io.opentelemetry.api.common.Attributes;
import io.opentelemetry.api.common.AttributesBuilder;
import io.opentelemetry.api.metrics.DoubleHistogram;
import io.opentelemetry.api.metrics.LongCounter;
import io.opentelemetry.api.metrics.Meter;
import java.util.Map;

/**
* OpenTelemetry implementation of recording metrics. This implementation collections the
* measurements related to the lifecyle of an RPC.
*
* <p>For the Otel implementation, an attempt is a single RPC invocation and an operation is the
* collection of all the attempts made before a response is returned (either as a success or an
* error). A single call (i.e. `EchoClient.echo()`) should have an operation_count of 1 and may have
* an attempt_count of 1+ (depending on the retry configurations).
*/
@BetaApi
@InternalApi
public class OpenTelemetryMetricsRecorder implements MetricsRecorder {
private final DoubleHistogram attemptLatencyRecorder;
private final DoubleHistogram operationLatencyRecorder;
private final LongCounter operationCountRecorder;
private final LongCounter attemptCountRecorder;

/**
* Creates the following instruments for the following metrics:
*
* <ul>
* <li>Attempt Latency: Histogram
* <li>Operation Latency: Histogram
* <li>Attempt Count: Counter
* <li>Operation Count: Counter
* </ul>
*
* @param openTelemetry OpenTelemetry instance
* @param serviceName Service Name
*/
public OpenTelemetryMetricsRecorder(OpenTelemetry openTelemetry, String serviceName) {
Meter meter =
openTelemetry
.meterBuilder("gax-java")
.setInstrumentationVersion(GaxProperties.getGaxVersion())
.build();
this.attemptLatencyRecorder =
meter
.histogramBuilder(serviceName + "/attempt_latency")
.setDescription("Time an individual attempt took")
.setUnit("ms")
.build();
this.operationLatencyRecorder =
meter
.histogramBuilder(serviceName + "/operation_latency")
.setDescription(
"Total time until final operation success or failure, including retries and backoff.")
.setUnit("ms")
.build();
this.attemptCountRecorder =
meter
.counterBuilder(serviceName + "/attempt_count")
.setDescription("Number of Attempts")
.setUnit("1")
.build();
this.operationCountRecorder =
meter
.counterBuilder(serviceName + "/operation_count")
.setDescription("Number of Operations")
.setUnit("1")
.build();
}

/**
* Record the latency for an individual attempt. Data is stored in a Histogram.
*
* @param attemptLatency Attempt Latency in ms
* @param attributes Map of the attributes to store
*/
@Override
public void recordAttemptLatency(double attemptLatency, Map<String, String> attributes) {
attemptLatencyRecorder.record(attemptLatency, toOtelAttributes(attributes));
}

/**
* Record an attempt made. The attempt count number is stored in a LongCounter.
*
* <p>The count should be set as 1 every time this is invoked (each retry attempt)
*
* @param count The number of attempts made
* @param attributes Map of the attributes to store
*/
@Override
public void recordAttemptCount(long count, Map<String, String> attributes) {
attemptCountRecorder.add(count, toOtelAttributes(attributes));
}

/**
* Record the latency for the entire operation. This is the latency for the entire RPC, including
* all the retry attempts
*
* @param operationLatency Operation Latency in ms
* @param attributes Map of the attributes to store
*/
@Override
public void recordOperationLatency(double operationLatency, Map<String, String> attributes) {
operationLatencyRecorder.record(operationLatency, toOtelAttributes(attributes));
}

/**
* Record an operation made. The operation count number is stored in a LongCounter.
*
* <p>The operation count should always be 1 and this should be invoked once.
*
* @param count The number of operations made
* @param attributes Map of the attributes to store
*/
@Override
public void recordOperationCount(long count, Map<String, String> attributes) {
operationCountRecorder.add(count, toOtelAttributes(attributes));
}

@VisibleForTesting
Attributes toOtelAttributes(Map<String, String> attributes) {
Preconditions.checkNotNull(attributes, "Attributes map cannot be null");
AttributesBuilder attributesBuilder = Attributes.builder();
attributes.forEach(attributesBuilder::put);
return attributesBuilder.build();
}
}
Loading

0 comments on commit b936580

Please sign in to comment.