From e750f7c1a502c949b186138b17a5c759a7143a86 Mon Sep 17 00:00:00 2001
From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com>
Date: Wed, 9 Oct 2024 18:03:51 -0400
Subject: [PATCH 1/4] feat: make collector health check timeout configurable

---
 collect/collect.go              |  2 +-
 config/file_config.go           |  1 +
 config/metadata/configMeta.yaml | 10 ++++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/collect/collect.go b/collect/collect.go
index 95a86c5976..4c45f4b18a 100644
--- a/collect/collect.go
+++ b/collect/collect.go
@@ -141,7 +141,7 @@ func (i *InMemCollector) Start() error {
 	// listen for config reloads
 	i.Config.RegisterReloadCallback(i.sendReloadSignal)
 
-	i.Health.Register(CollectorHealthKey, 3*time.Second)
+	i.Health.Register(CollectorHealthKey, time.Duration(imcConfig.HealthCheckTimeout))
 
 	for _, metric := range inMemCollectorMetrics {
 		i.Metrics.Register(metric)
diff --git a/config/file_config.go b/config/file_config.go
index 0b5e09bb3b..35c0b4d881 100644
--- a/config/file_config.go
+++ b/config/file_config.go
@@ -302,6 +302,7 @@ type CollectionConfig struct {
 	PeerQueueSize         int        `yaml:"PeerQueueSize"`
 	IncomingQueueSize     int        `yaml:"IncomingQueueSize"`
 	AvailableMemory       MemorySize `yaml:"AvailableMemory" cmdenv:"AvailableMemory"`
+	HealthCheckTimeout    Duration   `yaml:"HealthCheckTimeout" default:"3s"`
 	MaxMemoryPercentage   int        `yaml:"MaxMemoryPercentage" default:"75"`
 	MaxAlloc              MemorySize `yaml:"MaxAlloc"`
 	DisableRedistribution bool       `yaml:"DisableRedistribution"`
diff --git a/config/metadata/configMeta.yaml b/config/metadata/configMeta.yaml
index 69f3767182..3a3485ee5f 100644
--- a/config/metadata/configMeta.yaml
+++ b/config/metadata/configMeta.yaml
@@ -1296,6 +1296,16 @@ groups:
         description: >
           If `true`, Refinery's will route all spans that belongs to the same trace to a single peer.
 
+      - name: HealthCheckTimeout
+        type: duration
+        valuetype: nondefault
+        firstversion: v2.8
+        default: 3s
+        reload: false
+        summary: Controls the maximum duration allowed for health checks to complete.
+        description: >
+          The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of registered subsystems to complete. If a subsystem does not respond within this timeout period, it will be marked as unhealthy. This timeout value should be set carefully to ensure that transient delays do not lead to unnecessary failure detection while still allowing for timely identification of actual health issues.
+
   - name: BufferSizes
     title: "Buffer Sizes"
     description: >

From ca061a7442106fa3d8a4d86b125ce279ba97309e Mon Sep 17 00:00:00 2001
From: Tyler Helmuth <12352919+TylerHelmuth@users.noreply.github.com>
Date: Wed, 9 Oct 2024 18:22:01 -0600
Subject: [PATCH 2/4] Apply suggestions from code review

---
 config/metadata/configMeta.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config/metadata/configMeta.yaml b/config/metadata/configMeta.yaml
index 3a3485ee5f..d87c67f008 100644
--- a/config/metadata/configMeta.yaml
+++ b/config/metadata/configMeta.yaml
@@ -1302,9 +1302,9 @@ groups:
         firstversion: v2.8
         default: 3s
         reload: false
-        summary: Controls the maximum duration allowed for health checks to complete.
+        summary: Controls the maximum duration allowed for collection health checks to complete.
         description: >
-          The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of registered subsystems to complete. If a subsystem does not respond within this timeout period, it will be marked as unhealthy. This timeout value should be set carefully to ensure that transient delays do not lead to unnecessary failure detection while still allowing for timely identification of actual health issues.
+          The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of the collection subsystems to complete. If a subsystem does not respond within this timeout period, it will be marked as unhealthy. This timeout value should be set carefully to ensure that transient delays do not lead to unnecessary failure detection while still allowing for timely identification of actual health issues.
 
   - name: BufferSizes
     title: "Buffer Sizes"
     description: >

From a247769cb9b9c9020de1ee79283e12092f768891 Mon Sep 17 00:00:00 2001
From: Tyler Helmuth <12352919+TylerHelmuth@users.noreply.github.com>
Date: Wed, 9 Oct 2024 18:27:17 -0600
Subject: [PATCH 3/4] Add metric to time the collect loop

---
 collect/collect.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/collect/collect.go b/collect/collect.go
index 4c45f4b18a..d203e5a5cb 100644
--- a/collect/collect.go
+++ b/collect/collect.go
@@ -339,6 +339,8 @@ func (i *InMemCollector) collect() {
 	defer i.mutex.Unlock()
 
 	for {
+		startTime := time.Now()
+
 		i.Health.Ready(CollectorHealthKey, true)
 		// record channel lengths as histogram but also as gauges
 		i.Metrics.Histogram("collector_incoming_queue", float64(len(i.incoming)))
@@ -385,6 +387,7 @@ func (i *InMemCollector) collect() {
 				return
 			}
 			i.processSpan(sp)
+			i.Metrics.Gauge("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds()))
 			continue
 		case sp, ok := <-i.fromPeer:
 			if !ok {
@@ -392,11 +395,14 @@ func (i *InMemCollector) collect() {
 				return
 			}
 			i.processSpan(sp)
+			i.Metrics.Gauge("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds()))
 			continue
 		case <-i.reload:
 			i.reloadConfigs()
 		}
 	}
+
+	i.Metrics.Gauge("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds()))
 }

From 752475c94902b3920afe07a5738e5734f3381cc6 Mon Sep 17 00:00:00 2001
From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com>
Date: Thu, 10 Oct 2024 10:22:42 -0400
Subject: [PATCH 4/4] remove unnecessary continue in collect loop

---
 collect/collect.go | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/collect/collect.go b/collect/collect.go
index d203e5a5cb..aaaae6798c 100644
--- a/collect/collect.go
+++ b/collect/collect.go
@@ -387,16 +387,12 @@ func (i *InMemCollector) collect() {
 				return
 			}
 			i.processSpan(sp)
-			i.Metrics.Gauge("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds()))
-			continue
 		case sp, ok := <-i.fromPeer:
 			if !ok {
 				// channel's been closed; we should shut down.
 				return
 			}
 			i.processSpan(sp)
-			i.Metrics.Gauge("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds()))
-			continue
 		case <-i.reload:
 			i.reloadConfigs()
 		}
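
For reference, a minimal sketch of how an operator might set the new knob, assuming the option is read from the `Collection` block of the Refinery config file alongside the other `CollectionConfig` fields (block name shown here for illustration):

    Collection:
      # Allow the collector's health check up to 5 seconds to report before it
      # is marked unhealthy; omit the key to keep the 3s default.
      HealthCheckTimeout: 5s

The value is a duration string, matching the `default: 3s` declared for the field in configMeta.yaml.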