From d45aba670cb9d8656013d5a25bf27961e0f57d44 Mon Sep 17 00:00:00 2001 From: Albert <26584478+albertteoh@users.noreply.github.com> Date: Mon, 4 Dec 2023 21:56:36 +1100 Subject: [PATCH] [SPM] Differentiate null from no error data (#4985) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which problem is this PR solving? - Resolves #4229 ## Description of the changes - Ensures SPM displays a 0% error rate if there are no error metrics _and_ call rates exist. - If call rates don't exist, the error rate will also be null. - This ensures SPM is able to differentiate "no data" from "no errors". ## How was this change tested? - Add unit tests to cover happy and error cases. - Tested locally to confirm "No data" is shown in the Error graph when there is no data, then when call rates are available, a 0% rate is displayed. Screenshot 2023-12-03 at 8 01 36 pm Screenshot 2023-12-03 at 8 00 45 pm ## Checklist - [x] I have read https://github.com/jaegertracing/jaeger/blob/master/CONTRIBUTING_GUIDELINES.md - [x] I have signed all commits - [x] I have added unit tests for the new functionality - [x] I have run lint and test steps successfully - for `jaeger`: `make lint test` - for `jaeger-ui`: `yarn lint` and `yarn test` --------- Signed-off-by: Albert Teoh Co-authored-by: Albert Teoh --- docker-compose/monitor/Makefile | 2 +- docker-compose/monitor/README.md | 4 +- .../metrics/prometheus/metricsstore/reader.go | 33 ++- .../prometheus/metricsstore/reader_test.go | 204 +++++++++++++++++- .../metricsstore/testdata/empty_response.json | 7 + 5 files changed, 245 insertions(+), 5 deletions(-) create mode 100644 plugin/metrics/prometheus/metricsstore/testdata/empty_response.json diff --git a/docker-compose/monitor/Makefile b/docker-compose/monitor/Makefile index cab9eca4e26..794854fdb96 100644 --- a/docker-compose/monitor/Makefile +++ b/docker-compose/monitor/Makefile @@ -2,7 +2,7 @@ build: export DOCKER_TAG = dev build: clean-jaeger cd ../../ && \ - make build-all-in-one && \ + make build-all-in-one-linux && \ make docker-images-jaeger-backend # starts up the system required for SPM using the latest otel image and a development jaeger image. diff --git a/docker-compose/monitor/README.md b/docker-compose/monitor/README.md index 1e2a95a1473..e9181ae1ccf 100644 --- a/docker-compose/monitor/README.md +++ b/docker-compose/monitor/README.md @@ -62,13 +62,13 @@ make build ## Bring up the dev environment ```bash -make run-dev +make dev ``` ## Backwards compatibility testing with spanmetrics processor ```bash -make run-dev-processor +make dev-processor ``` For each "run" make target, you should expect to see the following in the Monitor tab after a few minutes: diff --git a/plugin/metrics/prometheus/metricsstore/reader.go b/plugin/metrics/prometheus/metricsstore/reader.go index d0573a75c95..708b1d5635c 100644 --- a/plugin/metrics/prometheus/metricsstore/reader.go +++ b/plugin/metrics/prometheus/metricsstore/reader.go @@ -211,7 +211,38 @@ func (m MetricsReader) GetErrorRates(ctx context.Context, requestParams *metrics ) }, } - return m.executeQuery(ctx, metricsParams) + errorMetrics, err := m.executeQuery(ctx, metricsParams) + if err != nil { + return nil, fmt.Errorf("failed getting error metrics: %w", err) + } + // Non-zero error rates are available. + if len(errorMetrics.Metrics) > 0 { + return errorMetrics, nil + } + + // Check for the presence of call rate metrics to differentiate the absence of error rate from + // the absence of call rate metrics altogether. + callMetrics, err := m.GetCallRates(ctx, &metricsstore.CallRateQueryParameters{BaseQueryParameters: requestParams.BaseQueryParameters}) + if err != nil { + return nil, fmt.Errorf("failed getting call metrics: %w", err) + } + // No call rate metrics are available, and by association, means no error rate metrics are available. + if len(callMetrics.Metrics) == 0 { + return errorMetrics, nil + } + + // Non-zero call rate metrics are available, which implies that there are just no errors, so we report a zero error rate. + zeroErrorMetrics := make([]*metrics.Metric, 0, len(callMetrics.Metrics)) + for _, cm := range callMetrics.Metrics { + zm := *cm + for i := 0; i < len(zm.MetricPoints); i++ { + zm.MetricPoints[i].Value = &metrics.MetricPoint_GaugeValue{GaugeValue: &metrics.GaugeValue{Value: &metrics.GaugeValue_DoubleValue{DoubleValue: 0.0}}} + } + zeroErrorMetrics = append(zeroErrorMetrics, &zm) + } + + errorMetrics.Metrics = zeroErrorMetrics + return errorMetrics, nil } // GetMinStepDuration gets the minimum step duration (the smallest possible duration between two data points in a time series) supported. diff --git a/plugin/metrics/prometheus/metricsstore/reader_test.go b/plugin/metrics/prometheus/metricsstore/reader_test.go index bd359824a25..576df191566 100644 --- a/plugin/metrics/prometheus/metricsstore/reader_test.go +++ b/plugin/metrics/prometheus/metricsstore/reader_test.go @@ -487,6 +487,208 @@ func TestGetErrorRates(t *testing.T) { } } +func TestGetErrorRatesZero(t *testing.T) { + params := metricsstore.ErrorRateQueryParameters{ + BaseQueryParameters: buildTestBaseQueryParametersFrom(metricsTestCase{ + serviceNames: []string{"emailservice"}, + spanKinds: []string{"SPAN_KIND_SERVER"}, + }), + } + tracer, exp, closer := tracerProvider(t) + defer closer() + + const ( + queryErrorRate = `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` + + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` + + `sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)` + queryCallRate = `sum(rate(calls{service_name =~ "emailservice", ` + + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)` + ) + wantPromQLQueries := []string{queryErrorRate, queryCallRate} + responses := []string{"testdata/empty_response.json", "testdata/service_datapoint_response.json"} + var callCount int + + mockPrometheus := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + defer r.Body.Close() + + u, err := url.Parse("http://" + r.Host + r.RequestURI + "?" + string(body)) + require.NoError(t, err) + + q := u.Query() + promQuery := q.Get("query") + assert.Equal(t, wantPromQLQueries[callCount], promQuery) + sendResponse(t, w, responses[callCount]) + callCount++ + })) + + logger := zap.NewNop() + address := mockPrometheus.Listener.Addr().String() + + cfg := defaultConfig + cfg.ServerURL = "http://" + address + cfg.ConnectTimeout = defaultTimeout + + reader, err := NewMetricsReader(cfg, logger, tracer) + require.NoError(t, err) + + defer mockPrometheus.Close() + + m, err := reader.GetErrorRates(context.Background(), ¶ms) + require.NoError(t, err) + + require.Len(t, m.Metrics, 1) + mps := m.Metrics[0].MetricPoints + + require.Len(t, mps, 1) + + // Assert that we essentially zeroed the call rate data point. + // That is, the timestamp is the same as the call rate's data point, but the value is 0. + actualVal := mps[0].Value.(*metrics.MetricPoint_GaugeValue).GaugeValue.Value.(*metrics.GaugeValue_DoubleValue).DoubleValue + assert.Zero(t, actualVal) + assert.Equal(t, int64(1620351786), mps[0].Timestamp.GetSeconds()) + assert.Len(t, exp.GetSpans(), 2, "expected an error rate query and a call rate query to be made") +} + +func TestGetErrorRatesNull(t *testing.T) { + params := metricsstore.ErrorRateQueryParameters{ + BaseQueryParameters: buildTestBaseQueryParametersFrom(metricsTestCase{ + serviceNames: []string{"emailservice"}, + spanKinds: []string{"SPAN_KIND_SERVER"}, + }), + } + tracer, exp, closer := tracerProvider(t) + defer closer() + + const ( + queryErrorRate = `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` + + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` + + `sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)` + queryCallRate = `sum(rate(calls{service_name =~ "emailservice", ` + + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)` + ) + wantPromQLQueries := []string{queryErrorRate, queryCallRate} + responses := []string{"testdata/empty_response.json", "testdata/empty_response.json"} + var callCount int + + mockPrometheus := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + defer r.Body.Close() + + u, err := url.Parse("http://" + r.Host + r.RequestURI + "?" + string(body)) + require.NoError(t, err) + + q := u.Query() + promQuery := q.Get("query") + assert.Equal(t, wantPromQLQueries[callCount], promQuery) + sendResponse(t, w, responses[callCount]) + callCount++ + })) + + logger := zap.NewNop() + address := mockPrometheus.Listener.Addr().String() + + cfg := defaultConfig + cfg.ServerURL = "http://" + address + cfg.ConnectTimeout = defaultTimeout + + reader, err := NewMetricsReader(cfg, logger, tracer) + require.NoError(t, err) + + defer mockPrometheus.Close() + + m, err := reader.GetErrorRates(context.Background(), ¶ms) + require.NoError(t, err) + assert.Empty(t, m.Metrics, "expect no error data available") + assert.Len(t, exp.GetSpans(), 2, "expected an error rate query and a call rate query to be made") +} + +func TestGetErrorRatesErrors(t *testing.T) { + for _, tc := range []struct { + name string + failErrorRateQuery bool + failCallRateQuery bool + wantErr string + }{ + { + name: "error rate query failure", + failErrorRateQuery: true, + wantErr: "failed getting error metrics: failed executing metrics query: server_error: server error: 500", + }, + { + name: "call rate query failure", + failCallRateQuery: true, + wantErr: "failed getting call metrics: failed executing metrics query: server_error: server error: 500", + }, + } { + t.Run(tc.name, func(t *testing.T) { + params := metricsstore.ErrorRateQueryParameters{ + BaseQueryParameters: buildTestBaseQueryParametersFrom(metricsTestCase{ + serviceNames: []string{"emailservice"}, + spanKinds: []string{"SPAN_KIND_SERVER"}, + }), + } + tracer, _, closer := tracerProvider(t) + defer closer() + + const ( + queryErrorRate = `sum(rate(calls{service_name =~ "emailservice", status_code = "STATUS_CODE_ERROR", ` + + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name) / ` + + `sum(rate(calls{service_name =~ "emailservice", span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)` + queryCallRate = `sum(rate(calls{service_name =~ "emailservice", ` + + `span_kind =~ "SPAN_KIND_SERVER"}[10m])) by (service_name)` + ) + wantPromQLQueries := []string{queryErrorRate, queryCallRate} + responses := []string{"testdata/empty_response.json", "testdata/service_datapoint_response.json"} + var callCount int + + mockPrometheus := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + defer r.Body.Close() + + u, err := url.Parse("http://" + r.Host + r.RequestURI + "?" + string(body)) + require.NoError(t, err) + + q := u.Query() + promQuery := q.Get("query") + assert.Equal(t, wantPromQLQueries[callCount], promQuery) + + switch promQuery { + case queryErrorRate: + if tc.failErrorRateQuery { + w.WriteHeader(http.StatusInternalServerError) + } else { + sendResponse(t, w, responses[callCount]) + } + case queryCallRate: + if tc.failCallRateQuery { + w.WriteHeader(http.StatusInternalServerError) + } else { + sendResponse(t, w, responses[callCount]) + } + } + callCount++ + })) + + logger := zap.NewNop() + address := mockPrometheus.Listener.Addr().String() + + cfg := defaultConfig + cfg.ServerURL = "http://" + address + cfg.ConnectTimeout = defaultTimeout + + reader, err := NewMetricsReader(cfg, logger, tracer) + require.NoError(t, err) + + defer mockPrometheus.Close() + + _, err = reader.GetErrorRates(context.Background(), ¶ms) + require.Error(t, err) + assert.EqualError(t, err, tc.wantErr) + }) + } +} + func TestInvalidLatencyUnit(t *testing.T) { defer func() { if r := recover(); r == nil { @@ -515,7 +717,7 @@ func TestWarningResponse(t *testing.T) { m, err := reader.GetErrorRates(context.Background(), ¶ms) require.NoError(t, err) assert.NotNil(t, m) - assert.Len(t, exp.GetSpans(), 1, "HTTP request was traced and span reported") + assert.Len(t, exp.GetSpans(), 2, "expected an error rate query and a call rate query to be made") } type fakePromServer struct { diff --git a/plugin/metrics/prometheus/metricsstore/testdata/empty_response.json b/plugin/metrics/prometheus/metricsstore/testdata/empty_response.json new file mode 100644 index 00000000000..cee5eaaecb9 --- /dev/null +++ b/plugin/metrics/prometheus/metricsstore/testdata/empty_response.json @@ -0,0 +1,7 @@ +{ + "status": "success", + "data": { + "resultType": "matrix", + "result": [] + } +}