diff --git a/collect/collect.go b/collect/collect.go index 95a86c5976..aaaae6798c 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -141,7 +141,7 @@ func (i *InMemCollector) Start() error { // listen for config reloads i.Config.RegisterReloadCallback(i.sendReloadSignal) - i.Health.Register(CollectorHealthKey, 3*time.Second) + i.Health.Register(CollectorHealthKey, time.Duration(imcConfig.HealthCheckTimeout)) for _, metric := range inMemCollectorMetrics { i.Metrics.Register(metric) @@ -339,6 +339,8 @@ func (i *InMemCollector) collect() { defer i.mutex.Unlock() for { + startTime := time.Now() + i.Health.Ready(CollectorHealthKey, true) // record channel lengths as histogram but also as gauges i.Metrics.Histogram("collector_incoming_queue", float64(len(i.incoming))) @@ -385,18 +387,18 @@ func (i *InMemCollector) collect() { return } i.processSpan(sp) - continue case sp, ok := <-i.fromPeer: if !ok { // channel's been closed; we should shut down. return } i.processSpan(sp) - continue case <-i.reload: i.reloadConfigs() } } + + i.Metrics.Gauge("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds())) } } diff --git a/config/file_config.go b/config/file_config.go index 0b5e09bb3b..35c0b4d881 100644 --- a/config/file_config.go +++ b/config/file_config.go @@ -302,6 +302,7 @@ type CollectionConfig struct { PeerQueueSize int `yaml:"PeerQueueSize"` IncomingQueueSize int `yaml:"IncomingQueueSize"` AvailableMemory MemorySize `yaml:"AvailableMemory" cmdenv:"AvailableMemory"` + HealthCheckTimeout Duration `yaml:"HealthCheckTimeout" default:"3s"` MaxMemoryPercentage int `yaml:"MaxMemoryPercentage" default:"75"` MaxAlloc MemorySize `yaml:"MaxAlloc"` DisableRedistribution bool `yaml:"DisableRedistribution"` diff --git a/config/metadata/configMeta.yaml b/config/metadata/configMeta.yaml index 69f3767182..d87c67f008 100644 --- a/config/metadata/configMeta.yaml +++ b/config/metadata/configMeta.yaml @@ -1296,6 +1296,16 @@ groups: description: > If 
`true`, Refinery will route all spans that belong to the same trace to a single peer. + - name: HealthCheckTimeout + type: duration + valuetype: nondefault + firstversion: v2.8 + default: 3s + reload: false + summary: Controls the maximum duration allowed for collection health checks to complete. + description: > + The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of the collection subsystems to complete. If a subsystem does not respond within this timeout period, it will be marked as unhealthy. This timeout value should be set carefully to ensure that transient delays do not lead to unnecessary failure detection while still allowing for timely identification of actual health issues. + - name: BufferSizes title: "Buffer Sizes" description: >