From d17355cca01cdc8ab8f39090486d3fa73de38793 Mon Sep 17 00:00:00 2001
From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com>
Date: Thu, 10 Oct 2024 10:45:51 -0400
Subject: [PATCH] feat: make collector health check timeout configurable
 (#1371)

## Which problem is this PR solving?

Under a high volume of traffic, the collector can take longer to respond
to the health check than the previously hard-coded 3-second timeout allows.

## Short description of the changes

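- Add a new config option, `HealthCheckTimeout`, in `Collection` (default: `3s`)
- Record the duration of each collect loop iteration in the `collector_collect_loop_duration_ms` gauge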

---------

Co-authored-by: Tyler Helmuth <12352919+TylerHelmuth@users.noreply.github.com>
# Conflicts:
#	config/metadata/configMeta.yaml
---
 collect/collect.go              |  8 +++++---
 config/file_config.go           |  1 +
 config/metadata/configMeta.yaml | 10 ++++++++++
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/collect/collect.go b/collect/collect.go
index fcd6430015..3d4cbbd36a 100644
--- a/collect/collect.go
+++ b/collect/collect.go
@@ -104,7 +104,7 @@ func (i *InMemCollector) Start() error {
 	// listen for config reloads
 	i.Config.RegisterReloadCallback(i.sendReloadSignal)
 
-	i.Health.Register(CollectorHealthKey, 3*time.Second)
+	i.Health.Register(CollectorHealthKey, time.Duration(imcConfig.HealthCheckTimeout))
 
 	i.Metrics.Register("trace_duration_ms", "histogram")
 	i.Metrics.Register("trace_span_count", "histogram")
@@ -330,6 +330,8 @@ func (i *InMemCollector) collect() {
 	defer i.mutex.Unlock()
 
 	for {
+		startTime := time.Now()
+
 		i.Health.Ready(CollectorHealthKey, true)
 		// record channel lengths as histogram but also as gauges
 		i.Metrics.Histogram("collector_incoming_queue", float64(len(i.incoming)))
@@ -376,18 +378,18 @@ func (i *InMemCollector) collect() {
 					return
 				}
 				i.processSpan(sp)
-				continue
 			case sp, ok := <-i.fromPeer:
 				if !ok {
 					// channel's been closed; we should shut down.
 					return
 				}
 				i.processSpan(sp)
-				continue
 			case <-i.reload:
 				i.reloadConfigs()
 			}
 		}
+
+		i.Metrics.Gauge("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds()))
 	}
 }
 
diff --git a/config/file_config.go b/config/file_config.go
index 9f5dd7ea8b..2fa41f9bd5 100644
--- a/config/file_config.go
+++ b/config/file_config.go
@@ -302,6 +302,7 @@ type CollectionConfig struct {
 	PeerQueueSize         int        `yaml:"PeerQueueSize"`
 	IncomingQueueSize     int        `yaml:"IncomingQueueSize"`
 	AvailableMemory       MemorySize `yaml:"AvailableMemory" cmdenv:"AvailableMemory"`
+	HealthCheckTimeout    Duration   `yaml:"HealthCheckTimeout" default:"3s"`
 	MaxMemoryPercentage   int        `yaml:"MaxMemoryPercentage" default:"75"`
 	MaxAlloc              MemorySize `yaml:"MaxAlloc"`
 	DisableRedistribution bool       `yaml:"DisableRedistribution"`
diff --git a/config/metadata/configMeta.yaml b/config/metadata/configMeta.yaml
index c1f0959e2c..a0d1823e9b 100644
--- a/config/metadata/configMeta.yaml
+++ b/config/metadata/configMeta.yaml
@@ -1285,6 +1285,16 @@ groups:
           This value should be set to a bit less than the normal timeout period
           for shutting down without forcibly terminating the process.
 
+      - name: HealthCheckTimeout
+        type: duration
+        valuetype: nondefault
+        firstversion: v2.8
+        default: 3s
+        reload: false
+        summary: Controls the maximum duration allowed for collection health checks to complete.
+        description: >
+          The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of the collection subsystems to complete. If a subsystem does not respond within this timeout period, it will be marked as unhealthy. This timeout value should be set carefully to ensure that transient delays do not lead to unnecessary failure detection while still allowing for timely identification of actual health issues.
+
   - name: BufferSizes
     title: "Buffer Sizes"
     description: >