From d45aba670cb9d8656013d5a25bf27961e0f57d44 Mon Sep 17 00:00:00 2001
From: Albert <26584478+albertteoh@users.noreply.github.com>
Date: Mon, 4 Dec 2023 21:56:36 +1100
Subject: [PATCH] [SPM] Differentiate null from no error data (#4985)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Which problem is this PR solving?
- Resolves #4229
## Description of the changes
- Ensures SPM displays a 0% error rate if there are no error metrics
_and_ call rates exist.
- If call rates don't exist either, the error rate remains null.
- This lets SPM differentiate "no data" from "no errors"; the decision
logic is sketched below.
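In essence, the new `GetErrorRates` flow is as follows (a minimal runnable sketch; `fetchErrorRates`/`fetchCallRates` are hypothetical stand-ins for the real Prometheus queries, and plain `[]float64` stands in for the metrics types):
```go
package main

import "fmt"

// Hypothetical stand-ins for the error-rate and call-rate PromQL queries;
// here they simulate "no error series" but one call-rate series.
func fetchErrorRates() ([]float64, error) { return nil, nil }
func fetchCallRates() ([]float64, error)  { return []float64{5.0}, nil }

// errorRates sketches the decision this PR adds: distinguish a null result
// (no data at all) from an explicit zero error rate.
func errorRates() ([]float64, error) {
	errRates, err := fetchErrorRates()
	if err != nil || len(errRates) > 0 {
		return errRates, err
	}
	callRates, err := fetchCallRates()
	if err != nil {
		return nil, err
	}
	if len(callRates) == 0 {
		// No call rates either: genuinely no data, so keep the result empty.
		return nil, nil
	}
	// Calls exist but no errors were recorded: report an explicit 0% rate.
	return make([]float64, len(callRates)), nil
}

func main() {
	rates, _ := errorRates()
	fmt.Println(rates) // prints [0]: a zero error rate, not "no data"
}
```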
## How was this change tested?
- Added unit tests to cover the happy and error cases.
- Tested locally to confirm "No data" is shown in the Error graph when
there is no data, and that a 0% rate is displayed once call rates become
available; a sample manual check is sketched below.
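For reference, the zero-rate case can be spot-checked against the jaeger-query metrics API in the local SPM dev environment (a sketch; the port is the default jaeger-query port, and the service name is just an example taken from the unit tests):
```bash
# With call-rate data but no recorded errors, the error-rate endpoint
# should now return gauge values of 0 instead of an empty "metrics" array.
curl -s "http://localhost:16686/api/metrics/errors?service=emailservice"
```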
## Checklist
- [x] I have read
https://github.com/jaegertracing/jaeger/blob/master/CONTRIBUTING_GUIDELINES.md
- [x] I have signed all commits
- [x] I have added unit tests for the new functionality
- [x] I have run lint and test steps successfully
- for `jaeger`: `make lint test`
- for `jaeger-ui`: `yarn lint` and `yarn test`
---------
Signed-off-by: Albert Teoh
Co-authored-by: Albert Teoh
---
docker-compose/monitor/Makefile | 2 +-
docker-compose/monitor/README.md | 4 +-
.../metrics/prometheus/metricsstore/reader.go | 33 ++-
.../prometheus/metricsstore/reader_test.go | 204 +++++++++++++++++-
.../metricsstore/testdata/empty_response.json | 7 +
5 files changed, 245 insertions(+), 5 deletions(-)
create mode 100644 plugin/metrics/prometheus/metricsstore/testdata/empty_response.json
diff --git a/docker-compose/monitor/Makefile b/docker-compose/monitor/Makefile
index cab9eca4e26..794854fdb96 100644
--- a/docker-compose/monitor/Makefile
+++ b/docker-compose/monitor/Makefile
@@ -2,7 +2,7 @@
build: export DOCKER_TAG = dev
build: clean-jaeger
cd ../../ && \
- make build-all-in-one && \
+ make build-all-in-one-linux && \
make docker-images-jaeger-backend
# starts up the system required for SPM using the latest otel image and a development jaeger image.
diff --git a/docker-compose/monitor/README.md b/docker-compose/monitor/README.md
index 1e2a95a1473..e9181ae1ccf 100644
--- a/docker-compose/monitor/README.md
+++ b/docker-compose/monitor/README.md
@@ -62,13 +62,13 @@ make build
## Bring up the dev environment
```bash
-make run-dev
+make dev
```
## Backwards compatibility testing with spanmetrics processor
```bash
-make run-dev-processor
+make dev-processor
```
For each "run" make target, you should expect to see the following in the Monitor tab after a few minutes:
diff --git a/plugin/metrics/prometheus/metricsstore/reader.go b/plugin/metrics/prometheus/metricsstore/reader.go
index d0573a75c95..708b1d5635c 100644
--- a/plugin/metrics/prometheus/metricsstore/reader.go
+++ b/plugin/metrics/prometheus/metricsstore/reader.go
@@ -211,7 +211,38 @@ func (m MetricsReader) GetErrorRates(ctx context.Context, requestParams *metrics
)
},
}
- return m.executeQuery(ctx, metricsParams)
+ errorMetrics, err := m.executeQuery(ctx, metricsParams)
+ if err != nil {
+ return nil, fmt.Errorf("failed getting error metrics: %w", err)
+ }
+ // Non-zero error rates are available.
+ if len(errorMetrics.Metrics) > 0 {
+ return errorMetrics, nil
+ }
+
+ // Check for the presence of call rate metrics to differentiate the absence of error rate metrics
+ // from the absence of call rate metrics altogether.
+ callMetrics, err := m.GetCallRates(ctx, &metricsstore.CallRateQueryParameters{BaseQueryParameters: requestParams.BaseQueryParameters})
+ if err != nil {
+ return nil, fmt.Errorf("failed getting call metrics: %w", err)
+ }
+ // No call rate metrics are available which, by implication, means no error rate metrics are available either.
+ if len(callMetrics.Metrics) == 0 {
+ return errorMetrics, nil
+ }
+
+ // Call rate metrics are available while error rate metrics are not, which implies there are simply no errors; report a zero error rate.
+ zeroErrorMetrics := make([]*metrics.Metric, 0, len(callMetrics.Metrics))
+ for _, cm := range callMetrics.Metrics {
+ zm := *cm // shallow copy of the call rate metric; its data points are overwritten with zeros below
+ for i := 0; i < len(zm.MetricPoints); i++ {
+ zm.MetricPoints[i].Value = &metrics.MetricPoint_GaugeValue{GaugeValue: &metrics.GaugeValue{Value: &metrics.GaugeValue_DoubleValue{DoubleValue: 0.0}}}
+ }
+ zeroErrorMetrics = append(zeroErrorMetrics, &zm)
+ }
+
+ errorMetrics.Metrics = zeroErrorMetrics
+ return errorMetrics, nil
}
// GetMinStepDuration gets the minimum step duration (the smallest possible duration between two data points in a time series) supported.
diff --git a/plugin/metrics/prometheus/metricsstore/reader_test.go b/plugin/metrics/prometheus/metricsstore/reader_test.go
index bd359824a25..576df191566 100644
--- a/plugin/metrics/prometheus/metricsstore/reader_test.go
+++ b/plugin/metrics/prometheus/metricsstore/reader_test.go
@@ -487,6 +487,208 @@ func TestGetErrorRates(t *testing.T) {
}
}
+func TestGetErrorRatesZero(t *testing.T) {
+ params := metricsstore.ErrorRateQueryParameters{
+ BaseQueryParameters: buildTestBaseQueryParametersFrom(metricsTestCase{
+ serviceNames: []string{"emailservice"},
+ spanKinds: []string{"SPAN_KIND_SERVER"},
+ }),
+ }
+ tracer, exp, closer := tracerProvider(t)
+ defer closer()
+
+ const (
+ queryErrorRate = `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
+ `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` +
+ `sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
+ queryCallRate = `sum(rate(calls{service_name =~ "emailservice", ` +
+ `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
+ )
+ wantPromQLQueries := []string{queryErrorRate, queryCallRate}
+ responses := []string{"testdata/empty_response.json", "testdata/service_datapoint_response.json"}
+ var callCount int
+
+ mockPrometheus := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ body, _ := io.ReadAll(r.Body)
+ defer r.Body.Close()
+
+ u, err := url.Parse("http://" + r.Host + r.RequestURI + "?" + string(body))
+ require.NoError(t, err)
+
+ q := u.Query()
+ promQuery := q.Get("query")
+ assert.Equal(t, wantPromQLQueries[callCount], promQuery)
+ sendResponse(t, w, responses[callCount])
+ callCount++
+ }))
+
+ logger := zap.NewNop()
+ address := mockPrometheus.Listener.Addr().String()
+
+ cfg := defaultConfig
+ cfg.ServerURL = "http://" + address
+ cfg.ConnectTimeout = defaultTimeout
+
+ reader, err := NewMetricsReader(cfg, logger, tracer)
+ require.NoError(t, err)
+
+ defer mockPrometheus.Close()
+
+ m, err := reader.GetErrorRates(context.Background(), &params)
+ require.NoError(t, err)
+
+ require.Len(t, m.Metrics, 1)
+ mps := m.Metrics[0].MetricPoints
+
+ require.Len(t, mps, 1)
+
+ // Assert that we essentially zeroed the call rate data point.
+ // That is, the timestamp is the same as the call rate's data point, but the value is 0.
+ actualVal := mps[0].Value.(*metrics.MetricPoint_GaugeValue).GaugeValue.Value.(*metrics.GaugeValue_DoubleValue).DoubleValue
+ assert.Zero(t, actualVal)
+ assert.Equal(t, int64(1620351786), mps[0].Timestamp.GetSeconds())
+ assert.Len(t, exp.GetSpans(), 2, "expected an error rate query and a call rate query to be made")
+}
+
+func TestGetErrorRatesNull(t *testing.T) {
+ params := metricsstore.ErrorRateQueryParameters{
+ BaseQueryParameters: buildTestBaseQueryParametersFrom(metricsTestCase{
+ serviceNames: []string{"emailservice"},
+ spanKinds: []string{"SPAN_KIND_SERVER"},
+ }),
+ }
+ tracer, exp, closer := tracerProvider(t)
+ defer closer()
+
+ const (
+ queryErrorRate = `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
+ `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` +
+ `sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
+ queryCallRate = `sum(rate(calls{service_name =~ "emailservice", ` +
+ `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
+ )
+ wantPromQLQueries := []string{queryErrorRate, queryCallRate}
+ responses := []string{"testdata/empty_response.json", "testdata/empty_response.json"}
+ var callCount int
+
+ mockPrometheus := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ body, _ := io.ReadAll(r.Body)
+ defer r.Body.Close()
+
+ u, err := url.Parse("http://" + r.Host + r.RequestURI + "?" + string(body))
+ require.NoError(t, err)
+
+ q := u.Query()
+ promQuery := q.Get("query")
+ assert.Equal(t, wantPromQLQueries[callCount], promQuery)
+ sendResponse(t, w, responses[callCount])
+ callCount++
+ }))
+
+ logger := zap.NewNop()
+ address := mockPrometheus.Listener.Addr().String()
+
+ cfg := defaultConfig
+ cfg.ServerURL = "http://" + address
+ cfg.ConnectTimeout = defaultTimeout
+
+ reader, err := NewMetricsReader(cfg, logger, tracer)
+ require.NoError(t, err)
+
+ defer mockPrometheus.Close()
+
+ m, err := reader.GetErrorRates(context.Background(), &params)
+ require.NoError(t, err)
+ assert.Empty(t, m.Metrics, "expected no error data to be available")
+ assert.Len(t, exp.GetSpans(), 2, "expected an error rate query and a call rate query to be made")
+}
+
+func TestGetErrorRatesErrors(t *testing.T) {
+ for _, tc := range []struct {
+ name string
+ failErrorRateQuery bool
+ failCallRateQuery bool
+ wantErr string
+ }{
+ {
+ name: "error rate query failure",
+ failErrorRateQuery: true,
+ wantErr: "failed getting error metrics: failed executing metrics query: server_error: server error: 500",
+ },
+ {
+ name: "call rate query failure",
+ failCallRateQuery: true,
+ wantErr: "failed getting call metrics: failed executing metrics query: server_error: server error: 500",
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ params := metricsstore.ErrorRateQueryParameters{
+ BaseQueryParameters: buildTestBaseQueryParametersFrom(metricsTestCase{
+ serviceNames: []string{"emailservice"},
+ spanKinds: []string{"SPAN_KIND_SERVER"},
+ }),
+ }
+ tracer, _, closer := tracerProvider(t)
+ defer closer()
+
+ const (
+ queryErrorRate = `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` +
+ `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` +
+ `sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
+ queryCallRate = `sum(rate(calls{service_name =~ "emailservice", ` +
+ `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)`
+ )
+ wantPromQLQueries := []string{queryErrorRate, queryCallRate}
+ responses := []string{"testdata/empty_response.json", "testdata/service_datapoint_response.json"}
+ var callCount int
+
+ mockPrometheus := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+ body, _ := io.ReadAll(r.Body)
+ defer r.Body.Close()
+
+ u, err := url.Parse("http://" + r.Host + r.RequestURI + "?" + string(body))
+ require.NoError(t, err)
+
+ q := u.Query()
+ promQuery := q.Get("query")
+ assert.Equal(t, wantPromQLQueries[callCount], promQuery)
+
+ switch promQuery {
+ case queryErrorRate:
+ if tc.failErrorRateQuery {
+ w.WriteHeader(http.StatusInternalServerError)
+ } else {
+ sendResponse(t, w, responses[callCount])
+ }
+ case queryCallRate:
+ if tc.failCallRateQuery {
+ w.WriteHeader(http.StatusInternalServerError)
+ } else {
+ sendResponse(t, w, responses[callCount])
+ }
+ }
+ callCount++
+ }))
+
+ logger := zap.NewNop()
+ address := mockPrometheus.Listener.Addr().String()
+
+ cfg := defaultConfig
+ cfg.ServerURL = "http://" + address
+ cfg.ConnectTimeout = defaultTimeout
+
+ reader, err := NewMetricsReader(cfg, logger, tracer)
+ require.NoError(t, err)
+
+ defer mockPrometheus.Close()
+
+ _, err = reader.GetErrorRates(context.Background(), &params)
+ require.Error(t, err)
+ assert.EqualError(t, err, tc.wantErr)
+ })
+ }
+}
+
func TestInvalidLatencyUnit(t *testing.T) {
defer func() {
if r := recover(); r == nil {
@@ -515,7 +717,7 @@ func TestWarningResponse(t *testing.T) {
m, err := reader.GetErrorRates(context.Background(), &params)
require.NoError(t, err)
assert.NotNil(t, m)
- assert.Len(t, exp.GetSpans(), 1, "HTTP request was traced and span reported")
+ assert.Len(t, exp.GetSpans(), 2, "expected an error rate query and a call rate query to be made")
}
type fakePromServer struct {
diff --git a/plugin/metrics/prometheus/metricsstore/testdata/empty_response.json b/plugin/metrics/prometheus/metricsstore/testdata/empty_response.json
new file mode 100644
index 00000000000..cee5eaaecb9
--- /dev/null
+++ b/plugin/metrics/prometheus/metricsstore/testdata/empty_response.json
@@ -0,0 +1,7 @@
+{
+ "status": "success",
+ "data": {
+ "resultType": "matrix",
+ "result": []
+ }
+}