feat: make collector health check timeout configurable #1371

Merged
merged 4 commits into from Oct 10, 2024
8 changes: 5 additions & 3 deletions collect/collect.go
@@ -141,7 +141,7 @@ func (i *InMemCollector) Start() error {
     // listen for config reloads
     i.Config.RegisterReloadCallback(i.sendReloadSignal)
 
-    i.Health.Register(CollectorHealthKey, 3*time.Second)
+    i.Health.Register(CollectorHealthKey, time.Duration(imcConfig.HealthCheckTimeout))
 
     for _, metric := range inMemCollectorMetrics {
         i.Metrics.Register(metric)
@@ -339,6 +339,8 @@ func (i *InMemCollector) collect() {
     defer i.mutex.Unlock()
 
     for {
+        startTime := time.Now()
+
         i.Health.Ready(CollectorHealthKey, true)
         // record channel lengths as histogram but also as gauges
         i.Metrics.Histogram("collector_incoming_queue", float64(len(i.incoming)))
@@ -385,18 +387,18 @@ func (i *InMemCollector) collect() {
                     return
                 }
                 i.processSpan(sp)
-                continue
             case sp, ok := <-i.fromPeer:
                 if !ok {
                     // channel's been closed; we should shut down.
                     return
                 }
                 i.processSpan(sp)
-                continue
             case <-i.reload:
                 i.reloadConfigs()
             }
         }
+
+        i.Metrics.Gauge("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds()))
     }
 }

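The gauge added at the bottom of the loop measures how long each pass through collect() takes, and the removed `continue` statements are what let the span-processing cases fall through to that measurement. Below is a minimal, self-contained sketch of the same pattern, with illustrative names rather than Refinery's actual types (note that `time.Since(start)` is the idiomatic spelling of `time.Now().Sub(start)`):

package main

import (
    "fmt"
    "time"
)

// collectLoop mimics the timing pattern above: a timestamp is captured at
// the top of each iteration, and because the select cases do not `continue`,
// every path falls through to the duration measurement at the bottom.
func collectLoop(work <-chan int, done <-chan struct{}) {
    for {
        startTime := time.Now()

        select {
        case <-done:
            return
        case w, ok := <-work:
            if !ok {
                return
            }
            _ = w // process the item; no early continue here
        }

        fmt.Printf("collect loop iteration: %d ms\n", time.Since(startTime).Milliseconds())
    }
}

func main() {
    work := make(chan int, 1)
    done := make(chan struct{})
    work <- 42
    go func() {
        time.Sleep(10 * time.Millisecond)
        close(done)
    }()
    collectLoop(work, done)
}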
1 change: 1 addition & 0 deletions config/file_config.go
@@ -302,6 +302,7 @@ type CollectionConfig struct {
     PeerQueueSize         int        `yaml:"PeerQueueSize"`
     IncomingQueueSize     int        `yaml:"IncomingQueueSize"`
     AvailableMemory       MemorySize `yaml:"AvailableMemory" cmdenv:"AvailableMemory"`
+    HealthCheckTimeout    Duration   `yaml:"HealthCheckTimeout" default:"3s"`
     MaxMemoryPercentage   int        `yaml:"MaxMemoryPercentage" default:"75"`
     MaxAlloc              MemorySize `yaml:"MaxAlloc"`
     DisableRedistribution bool       `yaml:"DisableRedistribution"`
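For operators, the new field slots into the Collection section of Refinery's YAML config. A sketch with illustrative values (the neighboring keys are existing CollectionConfig fields from the struct above); omitting the field keeps the struct-tag default of 3s, which matches the previously hard-coded 3*time.Second:

Collection:
  AvailableMemory: 4Gb
  MaxMemoryPercentage: 75
  HealthCheckTimeout: 5s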
10 changes: 10 additions & 0 deletions config/metadata/configMeta.yaml
@@ -1296,6 +1296,16 @@ groups:
     description: >
       If `true`, Refinery will route all spans that belong to the same trace to a single peer.
 
+  - name: HealthCheckTimeout
+    type: duration
+    valuetype: nondefault
+    firstversion: v2.8
+    default: 3s
+    reload: false
+    summary: Controls the maximum duration allowed for collection health checks to complete.
+    description: >
+      The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of the collection subsystems to complete. If a subsystem does not respond within this period, it is marked as unhealthy. Set the timeout carefully: it should be long enough that transient delays do not trigger spurious failure detection, yet short enough to identify real health issues promptly.
+
   - name: BufferSizes
     title: "Buffer Sizes"
     description: >
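To make the register/report/check cycle that the description refers to concrete, here is a small hypothetical sketch of a timeout-based health registry. This is not Refinery's actual Health API; the healthRegistry type and its methods are invented for illustration:

package main

import (
    "fmt"
    "sync"
    "time"
)

type healthRegistry struct {
    mu       sync.Mutex
    timeouts map[string]time.Duration
    lastSeen map[string]time.Time
}

func newHealthRegistry() *healthRegistry {
    return &healthRegistry{
        timeouts: make(map[string]time.Duration),
        lastSeen: make(map[string]time.Time),
    }
}

// Register declares a subsystem and the longest gap allowed between reports.
func (h *healthRegistry) Register(key string, timeout time.Duration) {
    h.mu.Lock()
    defer h.mu.Unlock()
    h.timeouts[key] = timeout
    h.lastSeen[key] = time.Now()
}

// Ready records a heartbeat from the subsystem.
func (h *healthRegistry) Ready(key string) {
    h.mu.Lock()
    defer h.mu.Unlock()
    h.lastSeen[key] = time.Now()
}

// Healthy reports whether the subsystem has checked in within its timeout.
func (h *healthRegistry) Healthy(key string) bool {
    h.mu.Lock()
    defer h.mu.Unlock()
    return time.Since(h.lastSeen[key]) <= h.timeouts[key]
}

func main() {
    h := newHealthRegistry()
    h.Register("collector", 3*time.Second)
    h.Ready("collector")
    fmt.Println(h.Healthy("collector")) // true: reported just now
}

Under this model, a collect loop that stalls for longer than its registered timeout between Ready calls is reported as unhealthy, so raising HealthCheckTimeout gives heavily loaded collectors more headroom before a health probe flags them.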