Skip to content

Commit

Permalink
feat: make collector health check timeout configurable (#1371)
Browse files Browse the repository at this point in the history
## Which problem is this PR solving?

When there's a big volume of traffic, collector can take longer to
respond to the health check

## Short description of the changes

- add a new config option `HealthCheckTimeout` in `Collection`

---------

Co-authored-by: Tyler Helmuth <[email protected]>
# Conflicts:
#	config/metadata/configMeta.yaml
  • Loading branch information
VinozzZ authored and MikeGoldsmith committed Oct 10, 2024
1 parent 869da1a commit d17355c
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 3 deletions.
8 changes: 5 additions & 3 deletions collect/collect.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ func (i *InMemCollector) Start() error {
// listen for config reloads
i.Config.RegisterReloadCallback(i.sendReloadSignal)

i.Health.Register(CollectorHealthKey, 3*time.Second)
i.Health.Register(CollectorHealthKey, time.Duration(imcConfig.HealthCheckTimeout))

i.Metrics.Register("trace_duration_ms", "histogram")
i.Metrics.Register("trace_span_count", "histogram")
Expand Down Expand Up @@ -330,6 +330,8 @@ func (i *InMemCollector) collect() {
defer i.mutex.Unlock()

for {
startTime := time.Now()

i.Health.Ready(CollectorHealthKey, true)
// record channel lengths as histogram but also as gauges
i.Metrics.Histogram("collector_incoming_queue", float64(len(i.incoming)))
Expand Down Expand Up @@ -376,18 +378,18 @@ func (i *InMemCollector) collect() {
return
}
i.processSpan(sp)
continue
case sp, ok := <-i.fromPeer:
if !ok {
// channel's been closed; we should shut down.
return
}
i.processSpan(sp)
continue
case <-i.reload:
i.reloadConfigs()
}
}

i.Metrics.Gauge("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds()))
}
}

Expand Down
1 change: 1 addition & 0 deletions config/file_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,7 @@ type CollectionConfig struct {
PeerQueueSize int `yaml:"PeerQueueSize"`
IncomingQueueSize int `yaml:"IncomingQueueSize"`
AvailableMemory MemorySize `yaml:"AvailableMemory" cmdenv:"AvailableMemory"`
HealthCheckTimeout Duration `yaml:"HealthCheckTimeout" default:"3s"`
MaxMemoryPercentage int `yaml:"MaxMemoryPercentage" default:"75"`
MaxAlloc MemorySize `yaml:"MaxAlloc"`
DisableRedistribution bool `yaml:"DisableRedistribution"`
Expand Down
10 changes: 10 additions & 0 deletions config/metadata/configMeta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1285,6 +1285,16 @@ groups:
This value should be set to a bit less than the normal timeout period
for shutting down without forcibly terminating the process.
- name: HealthCheckTimeout
type: duration
valuetype: nondefault
firstversion: v2.8
default: 3s
reload: false
summary: Controls the maximum duration allowed for collection health checks to complete.
description: >
The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of the collection subsystems to complete. If a subsystem does not respond within this timeout period, it will be marked as unhealthy. This timeout value should be set carefully to ensure that transient delays do not lead to unnecessary failure detection while still allowing for timely identification of actual health issues.
- name: BufferSizes
title: "Buffer Sizes"
description: >
Expand Down

0 comments on commit d17355c

Please sign in to comment.