From d17355cca01cdc8ab8f39090486d3fa73de38793 Mon Sep 17 00:00:00 2001 From: Yingrong Zhao <22300958+VinozzZ@users.noreply.github.com> Date: Thu, 10 Oct 2024 10:45:51 -0400 Subject: [PATCH] feat: make collector health check timeout configurable (#1371) ## Which problem is this PR solving? When there's a large volume of traffic, the collector can take longer to respond to the health check. ## Short description of the changes - add a new config option `HealthCheckTimeout` in `Collection` --------- Co-authored-by: Tyler Helmuth <12352919+TylerHelmuth@users.noreply.github.com> # Conflicts: # config/metadata/configMeta.yaml --- collect/collect.go | 8 +++++--- config/file_config.go | 1 + config/metadata/configMeta.yaml | 10 ++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/collect/collect.go b/collect/collect.go index fcd6430015..3d4cbbd36a 100644 --- a/collect/collect.go +++ b/collect/collect.go @@ -104,7 +104,7 @@ func (i *InMemCollector) Start() error { // listen for config reloads i.Config.RegisterReloadCallback(i.sendReloadSignal) - i.Health.Register(CollectorHealthKey, 3*time.Second) + i.Health.Register(CollectorHealthKey, time.Duration(imcConfig.HealthCheckTimeout)) i.Metrics.Register("trace_duration_ms", "histogram") i.Metrics.Register("trace_span_count", "histogram") @@ -330,6 +330,8 @@ func (i *InMemCollector) collect() { defer i.mutex.Unlock() for { + startTime := time.Now() + i.Health.Ready(CollectorHealthKey, true) // record channel lengths as histogram but also as gauges i.Metrics.Histogram("collector_incoming_queue", float64(len(i.incoming))) @@ -376,18 +378,18 @@ func (i *InMemCollector) collect() { return } i.processSpan(sp) - continue case sp, ok := <-i.fromPeer: if !ok { // channel's been closed; we should shut down. 
return } i.processSpan(sp) - continue case <-i.reload: i.reloadConfigs() } } + + i.Metrics.Gauge("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds())) } } diff --git a/config/file_config.go b/config/file_config.go index 9f5dd7ea8b..2fa41f9bd5 100644 --- a/config/file_config.go +++ b/config/file_config.go @@ -302,6 +302,7 @@ type CollectionConfig struct { PeerQueueSize int `yaml:"PeerQueueSize"` IncomingQueueSize int `yaml:"IncomingQueueSize"` AvailableMemory MemorySize `yaml:"AvailableMemory" cmdenv:"AvailableMemory"` + HealthCheckTimeout Duration `yaml:"HealthCheckTimeout" default:"3s"` MaxMemoryPercentage int `yaml:"MaxMemoryPercentage" default:"75"` MaxAlloc MemorySize `yaml:"MaxAlloc"` DisableRedistribution bool `yaml:"DisableRedistribution"` diff --git a/config/metadata/configMeta.yaml b/config/metadata/configMeta.yaml index c1f0959e2c..a0d1823e9b 100644 --- a/config/metadata/configMeta.yaml +++ b/config/metadata/configMeta.yaml @@ -1285,6 +1285,16 @@ groups: This value should be set to a bit less than the normal timeout period for shutting down without forcibly terminating the process. + - name: HealthCheckTimeout + type: duration + valuetype: nondefault + firstversion: v2.8 + default: 3s + reload: false + summary: Controls the maximum duration allowed for collection health checks to complete. + description: > + The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of the collection subsystems to complete. If a subsystem does not respond within this timeout period, it will be marked as unhealthy. This timeout value should be set carefully to ensure that transient delays do not lead to unnecessary failure detection while still allowing for timely identification of actual health issues. + - name: BufferSizes title: "Buffer Sizes" description: >