feat: make collector health check timeout configurable (#1371)

## Which problem is this PR solving? When there's a big volume of traffic, collector can take longer to respond to the health check ## Short description of the changes - add a new config option `HealthCheckTimeout` in `Collection` --------- Co-authored-by: Tyler Helmuth <[email protected]> # Conflicts: # config/metadata/configMeta.yaml
honeycombio · Oct 10, 2024 · d17355c · d17355c
1 parent 869da1a
commit d17355c
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 3 deletions.
diff --git a/collect/collect.go b/collect/collect.go
@@ -104,7 +104,7 @@ func (i *InMemCollector) Start() error {
 	// listen for config reloads
 	i.Config.RegisterReloadCallback(i.sendReloadSignal)
 
-	i.Health.Register(CollectorHealthKey, 3*time.Second)
+	i.Health.Register(CollectorHealthKey, time.Duration(imcConfig.HealthCheckTimeout))
 
 	i.Metrics.Register("trace_duration_ms", "histogram")
 	i.Metrics.Register("trace_span_count", "histogram")
@@ -330,6 +330,8 @@ func (i *InMemCollector) collect() {
 	defer i.mutex.Unlock()
 
 	for {
+		startTime := time.Now()
+
 		i.Health.Ready(CollectorHealthKey, true)
 		// record channel lengths as histogram but also as gauges
 		i.Metrics.Histogram("collector_incoming_queue", float64(len(i.incoming)))
@@ -376,18 +378,18 @@ func (i *InMemCollector) collect() {
 					return
 				}
 				i.processSpan(sp)
-				continue
 			case sp, ok := <-i.fromPeer:
 				if !ok {
 					// channel's been closed; we should shut down.
 					return
 				}
 				i.processSpan(sp)
-				continue
 			case <-i.reload:
 				i.reloadConfigs()
 			}
 		}
+
+		i.Metrics.Gauge("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds()))
 	}
 }
 

diff --git a/config/file_config.go b/config/file_config.go
@@ -302,6 +302,7 @@ type CollectionConfig struct {
 	PeerQueueSize         int        `yaml:"PeerQueueSize"`
 	IncomingQueueSize     int        `yaml:"IncomingQueueSize"`
 	AvailableMemory       MemorySize `yaml:"AvailableMemory" cmdenv:"AvailableMemory"`
+	HealthCheckTimeout    Duration   `yaml:"HealthCheckTimeout" default:"3s"`
 	MaxMemoryPercentage   int        `yaml:"MaxMemoryPercentage" default:"75"`
 	MaxAlloc              MemorySize `yaml:"MaxAlloc"`
 	DisableRedistribution bool       `yaml:"DisableRedistribution"`

diff --git a/config/metadata/configMeta.yaml b/config/metadata/configMeta.yaml
@@ -1285,6 +1285,16 @@ groups:
           This value should be set to a bit less than the normal timeout period
           for shutting down without forcibly terminating the process.
 
+      - name: HealthCheckTimeout
+        type: duration
+        valuetype: nondefault
+        firstversion: v2.8
+        default: 3s
+        reload: false
+        summary: Controls the maximum duration allowed for collection health checks to complete.
+        description: >
+          The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of the collection subsystems to complete. If a subsystem does not respond within this timeout period, it will be marked as unhealthy. This timeout value should be set carefully to ensure that transient delays do not lead to unnecessary failure detection while still allowing for timely identification of actual health issues.
+
   - name: BufferSizes
     title: "Buffer Sizes"
     description: >