grafana · pracucci · Apr 28, 2022 · Apr 27, 2022 · Apr 27, 2022 · Apr 28, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -40,7 +40,7 @@
 * [ENHANCEMENT] Admin: Admin API now has some styling. #1482 #1549
 * [ENHANCEMENT] Alertmanager: added `insight=true` field to alertmanager dispatch logs. #1379
 * [ENHANCEMENT] Store-gateway: Add the experimental ability to run index header operations in a dedicated thread pool. This feature can be configured using `-blocks-storage.bucket-store.index-header-thread-pool-size` and is disabled by default. #1660
-* [ENHANCEMENT] Querier: wait until inflight queries are completed when shutting down queriers and running Mimir with query-scheduler. #1756
+* [ENHANCEMENT] Querier: wait until inflight queries are completed when shutting down queriers. #1756 #1767
 * [BUGFIX] Query-frontend: do not shard queries with a subquery unless the subquery is inside a shardable aggregation function call. #1542
 * [BUGFIX] Query-frontend: added `component=query-frontend` label to results cache memcached metrics to fix a panic when Mimir is running in single binary mode and results cache is enabled. #1704
 * [BUGFIX] Mimir: services' status content-type is now correctly set to `text/html`. #1575

@@ -15,6 +15,7 @@ import (
 	"github.com/go-kit/log/level"
 	"github.com/grafana/dskit/backoff"
 	"github.com/weaveworks/common/httpgrpc"
+	"go.uber.org/atomic"
 	"google.golang.org/grpc"
 
 	"github.com/grafana/mimir/pkg/frontend/v1/frontendv1pb"
@@ -29,12 +30,16 @@ var (
 	}
 )
 
-func newFrontendProcessor(cfg Config, handler RequestHandler, log log.Logger) processor {
+func newFrontendProcessor(cfg Config, handler RequestHandler, log log.Logger) *frontendProcessor {
 	return &frontendProcessor{
 		log:            log,
 		handler:        handler,
 		maxMessageSize: cfg.GRPCClientConfig.MaxSendMsgSize,
 		querierID:      cfg.QuerierID,
+
+		frontendClientFactory: func(conn *grpc.ClientConn) frontendv1pb.FrontendClient {
+			return frontendv1pb.NewFrontendClient(conn)
+		},
 	}
 }
 
@@ -45,11 +50,13 @@ type frontendProcessor struct {
 	querierID      string
 
 	log log.Logger
+
+	frontendClientFactory func(conn *grpc.ClientConn) frontendv1pb.FrontendClient
 }
 
 // notifyShutdown implements processor.
 func (fp *frontendProcessor) notifyShutdown(ctx context.Context, conn *grpc.ClientConn, address string) {
-	client := frontendv1pb.NewFrontendClient(conn)
+	client := fp.frontendClientFactory(conn)
 
 	req := &frontendv1pb.NotifyClientShutdownRequest{ClientID: fp.querierID}
 	if _, err := client.NotifyClientShutdown(ctx, req); err != nil {
@@ -58,20 +65,26 @@ func (fp *frontendProcessor) notifyShutdown(ctx context.Context, conn *grpc.Clie
 	}
 }
 
-// runOne loops, trying to establish a stream to the frontend to begin request processing.
-func (fp *frontendProcessor) processQueriesOnSingleStream(ctx context.Context, conn *grpc.ClientConn, address string) {
-	client := frontendv1pb.NewFrontendClient(conn)
+// processQueriesOnSingleStream tries to establish a stream to the query-frontend and then processes queries received
+// on the stream. This function loops until workerCtx is canceled and the inflight query (if any) has completed.
+func (fp *frontendProcessor) processQueriesOnSingleStream(workerCtx context.Context, conn *grpc.ClientConn, address string) {
+	client := fp.frontendClientFactory(conn)
+
+	// Run the gRPC client and process all the queries in a dedicated context that we call the "execution context".
+	// The execution context is cancelled once the workerCtx is cancelled AND there's no inflight query executing.
+	execCtx, execCancel, inflightQuery := newExecutionContext(workerCtx, fp.log)
+	defer execCancel()
 
-	backoff := backoff.New(ctx, processorBackoffConfig)
+	backoff := backoff.New(execCtx, processorBackoffConfig)
 	for backoff.Ongoing() {
-		c, err := client.Process(ctx)
+		c, err := client.Process(execCtx)
 		if err != nil {
 			level.Error(fp.log).Log("msg", "error contacting frontend", "address", address, "err", err)
 			backoff.Wait()
 			continue
 		}
 
-		if err := fp.process(c); err != nil {
+		if err := fp.process(c, inflightQuery); err != nil {
 			level.Error(fp.log).Log("msg", "error processing requests", "address", address, "err", err)
 			backoff.Wait()
 			continue
@@ -82,7 +95,7 @@ func (fp *frontendProcessor) processQueriesOnSingleStream(ctx context.Context, c
 }
 
 // process loops processing requests on an established stream.
-func (fp *frontendProcessor) process(c frontendv1pb.Frontend_ProcessClient) error {
+func (fp *frontendProcessor) process(c frontendv1pb.Frontend_ProcessClient, inflightQuery *atomic.Bool) error {
 	// Build a child context so we can cancel a query when the stream is closed.
 	ctx, cancel := context.WithCancel(c.Context())
 	defer cancel()
@@ -95,12 +108,16 @@ func (fp *frontendProcessor) process(c frontendv1pb.Frontend_ProcessClient) erro
 
 		switch request.Type {
 		case frontendv1pb.HTTP_REQUEST:
+			inflightQuery.Store(true)
+
 			// Handle the request on a "background" goroutine, so we go back to
 			// blocking on c.Recv().  This allows us to detect the stream closing
 			// and cancel the query.  We don't actually handle queries in parallel
 			// here, as we're running in lock step with the server - each Recv is
 			// paired with a Send.
 			go fp.runRequest(ctx, request.HttpRequest, request.StatsEnabled, func(response *httpgrpc.HTTPResponse, stats *stats.Stats) error {
+				defer inflightQuery.Store(false)
+
 				return c.Send(&frontendv1pb.ClientToFrontend{
 					HttpResponse: response,
 					Stats:        stats,

@@ -13,12 +13,85 @@ import (
 	"github.com/go-kit/log"
 	"github.com/grafana/dskit/test"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
+	"github.com/weaveworks/common/httpgrpc"
 	"go.uber.org/atomic"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/credentials/insecure"
+	"google.golang.org/grpc/metadata"
+
+	"github.com/grafana/mimir/pkg/frontend/v1/frontendv1pb"
 )
 
+func TestFrontendProcessor_processQueriesOnSingleStream(t *testing.T) {
+	t.Run("should immediately return if worker context is canceled and there's no inflight query", func(t *testing.T) {
+		fp, processClient, requestHandler := prepareFrontendProcessor()
+
+		processClient.On("Recv").Return(func() (*frontendv1pb.FrontendToClient, error) {
+			// No query to execute, so wait until terminated.
+			<-processClient.Context().Done()
+			return nil, processClient.Context().Err()
+		})
+
+		requestHandler.On("Handle", mock.Anything, mock.Anything).Return(&httpgrpc.HTTPResponse{}, nil)
+
+		workerCtx, workerCancel := context.WithCancel(context.Background())
+		workerCancel()
+
+		fp.processQueriesOnSingleStream(workerCtx, nil, "127.0.0.1")
+
+		// At this point we expect the execution context to have been canceled too.
+		require.Error(t, processClient.Context().Err())
+
+		// We expect Send() has not been called, because no query has been executed.
+		processClient.AssertNumberOfCalls(t, "Send", 0)
+	})
+
+	t.Run("should wait until inflight query execution is completed before returning when worker context is canceled", func(t *testing.T) {
+		fp, processClient, requestHandler := prepareFrontendProcessor()
+
+		recvCount := atomic.NewInt64(0)
+
+		processClient.On("Recv").Return(func() (*frontendv1pb.FrontendToClient, error) {
+			switch recvCount.Inc() {
+			case 1:
+				return &frontendv1pb.FrontendToClient{
+					Type:        frontendv1pb.HTTP_REQUEST,
+					HttpRequest: nil,
+				}, nil
+			default:
+				// No more messages to process, so wait until terminated.
+				<-processClient.Context().Done()
+				return nil, processClient.Context().Err()
+			}
+		})
+
+		workerCtx, workerCancel := context.WithCancel(context.Background())
+
+		requestHandler.On("Handle", mock.Anything, mock.Anything).Run(func(args mock.Arguments) {
+			// Cancel the worker context while the query execution is in progress.
+			workerCancel()
+
+			// Ensure the execution context hasn't been canceled yet.
+			require.Nil(t, processClient.Context().Err())
+
+			// Intentionally slow down the query execution, to double check the worker waits until done.
+			time.Sleep(time.Second)
+		}).Return(&httpgrpc.HTTPResponse{}, nil)
+
+		startTime := time.Now()
+		fp.processQueriesOnSingleStream(workerCtx, nil, "127.0.0.1")
+		assert.GreaterOrEqual(t, time.Since(startTime), time.Second)
+
+		// At this point we expect the execution context to have been canceled too.
+		require.Error(t, processClient.Context().Err())
+
+		// We expect Send() to be called once, to send the query result.
+		processClient.AssertNumberOfCalls(t, "Send", 1)
+	})
+}
+
 func TestRecvFailDoesntCancelProcess(t *testing.T) {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
@@ -78,3 +151,97 @@ func TestContextCancelStopsProcess(t *testing.T) {
 		return int(pm.currentProcessors.Load())
 	})
 }
+
+func prepareFrontendProcessor() (*frontendProcessor, *frontendProcessClientMock, *requestHandlerMock) {
+	var processCtx context.Context
+
+	processClient := &frontendProcessClientMock{}
+	processClient.On("Send", mock.Anything).Return(nil)
+	processClient.On("Context").Return(func() context.Context {
+		return processCtx
+	})
+
+	frontendClient := &frontendClientMock{}
+	frontendClient.On("Process", mock.Anything, mock.Anything).Run(func(args mock.Arguments) {
+		processCtx = args.Get(0).(context.Context)
+	}).Return(processClient, nil)
+
+	requestHandler := &requestHandlerMock{}
+
+	fp := newFrontendProcessor(Config{QuerierID: "test-querier-id"}, requestHandler, log.NewNopLogger())
+	fp.frontendClientFactory = func(_ *grpc.ClientConn) frontendv1pb.FrontendClient {
+		return frontendClient
+	}
+
+	return fp, processClient, requestHandler
+}
+
+type frontendClientMock struct {
+	mock.Mock
+}
+
+func (m *frontendClientMock) Process(ctx context.Context, opts ...grpc.CallOption) (frontendv1pb.Frontend_ProcessClient, error) {
+	args := m.Called(ctx, opts)
+	return args.Get(0).(frontendv1pb.Frontend_ProcessClient), args.Error(1)
+}
+
+func (m *frontendClientMock) NotifyClientShutdown(ctx context.Context, in *frontendv1pb.NotifyClientShutdownRequest, opts ...grpc.CallOption) (*frontendv1pb.NotifyClientShutdownResponse, error) {
+	args := m.Called(ctx, in, opts)
+	return args.Get(0).(*frontendv1pb.NotifyClientShutdownResponse), args.Error(1)
+}
+
+type frontendProcessClientMock struct {
+	mock.Mock
+}
+
+func (m *frontendProcessClientMock) Send(msg *frontendv1pb.ClientToFrontend) error {
+	args := m.Called(msg)
+	return args.Error(0)
+}
+
+func (m *frontendProcessClientMock) Recv() (*frontendv1pb.FrontendToClient, error) {
+	args := m.Called()
+
+	// Allow mocking Recv() with a function that is invoked on every call.
+	if fn, ok := args.Get(0).(func() (*frontendv1pb.FrontendToClient, error)); ok {
+		return fn()
+	}
+
+	return args.Get(0).(*frontendv1pb.FrontendToClient), args.Error(1)
+}
+
+func (m *frontendProcessClientMock) Header() (metadata.MD, error) {
+	args := m.Called()
+	return args.Get(0).(metadata.MD), args.Error(1)
+}
+
+func (m *frontendProcessClientMock) Trailer() metadata.MD {
+	args := m.Called()
+	return args.Get(0).(metadata.MD)
+}
+
+func (m *frontendProcessClientMock) CloseSend() error {
+	args := m.Called()
+	return args.Error(0)
+}
+
+func (m *frontendProcessClientMock) Context() context.Context {
+	args := m.Called()
+
+	// Allow mocking Context() with a function that is invoked on every call.
+	if fn, ok := args.Get(0).(func() context.Context); ok {
+		return fn()
+	}
+
+	return args.Get(0).(context.Context)
+}
+
+func (m *frontendProcessClientMock) SendMsg(msg interface{}) error {
+	args := m.Called(msg)
+	return args.Error(0)
+}
+
+func (m *frontendProcessClientMock) RecvMsg(msg interface{}) error {
+	args := m.Called(msg)
+	return args.Error(0)
+}
@@ -25,6 +25,7 @@ import (
 	"github.com/weaveworks/common/httpgrpc"
 	"github.com/weaveworks/common/middleware"
 	"github.com/weaveworks/common/user"
+	"go.uber.org/atomic"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/health/grpc_health_v1"
 
@@ -99,12 +100,12 @@ func (sp *schedulerProcessor) processQueriesOnSingleStream(workerCtx context.Con
 
 	// Run the querier loop (and so all the queries) in a dedicated context that we call the "execution context".
 	// The execution context is cancelled once the workerCtx is cancelled AND there's no inflight query executing.
-	exec := newExecutionContext(workerCtx, sp.log)
-	defer exec.cancel()
+	execCtx, execCancel, inflightQuery := newExecutionContext(workerCtx, sp.log)
+	defer execCancel()
 
-	backoff := backoff.New(exec.context(), processorBackoffConfig)
+	backoff := backoff.New(execCtx, processorBackoffConfig)
 	for backoff.Ongoing() {
-		c, err := schedulerClient.QuerierLoop(exec.context())
+		c, err := schedulerClient.QuerierLoop(execCtx)
 		if err == nil {
 			err = c.Send(&schedulerpb.QuerierToScheduler{QuerierID: sp.querierID})
 		}
@@ -115,7 +116,7 @@ func (sp *schedulerProcessor) processQueriesOnSingleStream(workerCtx context.Con
 			continue
 		}
 
-		if err := sp.querierLoop(c, address, exec); err != nil {
+		if err := sp.querierLoop(c, address, inflightQuery); err != nil {
 			level.Error(sp.log).Log("msg", "error processing requests from scheduler", "err", err, "addr", address)
 			backoff.Wait()
 			continue
@@ -126,7 +127,7 @@ func (sp *schedulerProcessor) processQueriesOnSingleStream(workerCtx context.Con
 }
 
 // querierLoop loops processing requests on an established stream.
-func (sp *schedulerProcessor) querierLoop(c schedulerpb.SchedulerForQuerier_QuerierLoopClient, address string, exec *executionContext) error {
+func (sp *schedulerProcessor) querierLoop(c schedulerpb.SchedulerForQuerier_QuerierLoopClient, address string, inflightQuery *atomic.Bool) error {
 	// Build a child context so we can cancel a query when the stream is closed.
 	ctx, cancel := context.WithCancel(c.Context())
 	defer cancel()
@@ -137,15 +138,15 @@ func (sp *schedulerProcessor) querierLoop(c schedulerpb.SchedulerForQuerier_Quer
 			return err
 		}
 
-		exec.queryStarted()
+		inflightQuery.Store(true)
 
 		// Handle the request on a "background" goroutine, so we go back to
 		// blocking on c.Recv().  This allows us to detect the stream closing
 		// and cancel the query.  We don't actually handle queries in parallel
 		// here, as we're running in lock step with the server - each Recv is
 		// paired with a Send.
 		go func() {
-			defer exec.queryEnded()
+			defer inflightQuery.Store(false)
 
 			// We need to inject user into context for sending response back.
 			ctx := user.InjectOrgID(ctx, request.UserID)