Skip to content

Commit

Permalink
metrics: quota utilization configuration and documentation (#22912)
Browse files Browse the repository at this point in the history
Introduces support for (optional) quota utilization metrics

CE part of the hashicorp/nomad-enterprise#1488 change
  • Loading branch information
pkazmierczak authored Jun 3, 2024
1 parent 180bab8 commit 2a09abc
Show file tree
Hide file tree
Showing 7 changed files with 33 additions and 7 deletions.
3 changes: 3 additions & 0 deletions .changelog/22912.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
metrics (Enterprise): Publish quota utilization as metrics
```
1 change: 1 addition & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,7 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
// Setup telemetry related config
conf.StatsCollectionInterval = agentConfig.Telemetry.collectionInterval
conf.DisableDispatchedJobSummaryMetrics = agentConfig.Telemetry.DisableDispatchedJobSummaryMetrics
conf.DisableQuotaUtilizationMetrics = agentConfig.Telemetry.DisableQuotaUtilizationMetrics
conf.DisableRPCRateMetricsLabels = agentConfig.Telemetry.DisableRPCRateMetricsLabels

if d, err := time.ParseDuration(agentConfig.Limits.RPCHandshakeTimeout); err != nil {
Expand Down
7 changes: 7 additions & 0 deletions command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -981,6 +981,10 @@ type Telemetry struct {
// a small memory overhead.
DisableDispatchedJobSummaryMetrics bool `hcl:"disable_dispatched_job_summary_metrics"`

// DisableQuotaUtilizationMetrics disables publishing of quota utilization
// metrics.
DisableQuotaUtilizationMetrics bool `hcl:"disable_quota_utilization_metrics"`

// DisableRPCRateMetricsLabels drops the label for the identity of the
// requester when publishing metrics on RPC rate on the server. This may be
// useful to control metrics collection costs in environments where request
Expand Down Expand Up @@ -2513,6 +2517,9 @@ func (t *Telemetry) Merge(b *Telemetry) *Telemetry {
if b.DisableDispatchedJobSummaryMetrics {
result.DisableDispatchedJobSummaryMetrics = b.DisableDispatchedJobSummaryMetrics
}
if b.DisableQuotaUtilizationMetrics {
result.DisableQuotaUtilizationMetrics = b.DisableQuotaUtilizationMetrics
}
if b.DisableRPCRateMetricsLabels {
result.DisableRPCRateMetricsLabels = b.DisableRPCRateMetricsLabels
}
Expand Down
16 changes: 9 additions & 7 deletions command/agent/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ func TestConfig_Merge(t *testing.T) {
CirconusBrokerSelectTag: "dc:dc2",
PrefixFilter: []string{"prefix1", "prefix2"},
DisableDispatchedJobSummaryMetrics: true,
DisableQuotaUtilizationMetrics: false,
DisableRPCRateMetricsLabels: true,
FilterDefault: pointer.Of(false),
},
Expand Down Expand Up @@ -1446,26 +1447,27 @@ func TestTelemetry_Validate(t *testing.T) {
// TestTelemetry_Parse writes a telemetry block to an HCL file inside a
// temporary directory, loads that directory with LoadConfig, and asserts that
// the prefix filter, filter default, and each disable_* flag set in the file
// are reflected on config.Telemetry.
//
// NOTE(review): this listing appears to come from a diff view that interleaves
// the older require-based assertions with the newer must-based ones — confirm
// against the actual file which set is live.
func TestTelemetry_Parse(t *testing.T) {
ci.Parallel(t)

require := require.New(t)
dir := t.TempDir()

file1 := filepath.Join(dir, "config1.hcl")
err := os.WriteFile(file1, []byte(`telemetry{
prefix_filter = ["+nomad.raft"]
filter_default = false
disable_dispatched_job_summary_metrics = true
disable_quota_utilization_metrics = true
disable_rpc_rate_metrics_labels = true
}`), 0600)
require.NoError(err)
must.NoError(t, err)

// Works on config dir
config, err := LoadConfig(dir)
require.NoError(err)
must.NoError(t, err)

require.False(*config.Telemetry.FilterDefault)
require.Exactly([]string{"+nomad.raft"}, config.Telemetry.PrefixFilter)
require.True(config.Telemetry.DisableDispatchedJobSummaryMetrics)
require.True(config.Telemetry.DisableRPCRateMetricsLabels)
must.False(t, *config.Telemetry.FilterDefault)
must.Eq(t, []string{"+nomad.raft"}, config.Telemetry.PrefixFilter)
must.True(t, config.Telemetry.DisableDispatchedJobSummaryMetrics)
must.True(t, config.Telemetry.DisableQuotaUtilizationMetrics)
must.True(t, config.Telemetry.DisableRPCRateMetricsLabels)
}

func TestEventBroker_Parse(t *testing.T) {
Expand Down
4 changes: 4 additions & 0 deletions nomad/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,10 @@ type Config struct {
// publishing Job summary metrics
DisableDispatchedJobSummaryMetrics bool

// DisableQuotaUtilizationMetrics disables publishing of quota utilization
// metrics.
DisableQuotaUtilizationMetrics bool

// DisableRPCRateMetricsLabels drops the label for the identity of the
// requester when publishing metrics on RPC rate on the server. This may be
// useful to control metrics collection costs in environments where request
Expand Down
6 changes: 6 additions & 0 deletions website/content/docs/configuration/telemetry.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,12 @@ The following options are available on all telemetry configurations.
summary statistics, it is sometimes desired to trade these statistics for
more memory when dispatching high volumes of jobs.

- `disable_quota_utilization_metrics` `(bool: false)` - Specifies if Nomad
should skip publishing metrics about quota utilization (a Nomad Enterprise
feature). Since each quota utilization check requires a relatively expensive
read against Nomad's state store, users with many namespaces and many quotas
may want to disable these metrics.

### `statsite`

These `telemetry` parameters apply to
Expand Down
3 changes: 3 additions & 0 deletions website/content/docs/operations/metrics-reference.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,9 @@ those listed in [Key Metrics](#key-metrics) above.
| `nomad.nomad.worker.submit_plan` | Time elapsed for worker to submit plan | Milliseconds | Timer | host |
| `nomad.nomad.worker.update_eval` | Time elapsed for worker to submit updated eval | Milliseconds | Timer | host |
| `nomad.nomad.worker.wait_for_index` | Time elapsed that worker waits for the raft index of the eval to be processed | Milliseconds | Timer | host |
| `nomad.quota.utilization.cpu` | Utilization of the CPU quota | Integer | Gauge | quota_name, namespace, region |
| `nomad.quota.utilization.cores` | Utilization of the CPU Cores quota | Integer | Gauge | quota_name, namespace, region |
| `nomad.quota.utilization.memory_mb` | Utilization of the Memory MB quota | Integer | Gauge | quota_name, namespace, region |
| `nomad.raft.appliedIndex` | Current index applied to FSM | Integer | Gauge | host |
| `nomad.raft.barrier` | Count of blocking raft API calls | Integer | Counter | host |
| `nomad.raft.commitNumLogs` | Count of logs enqueued | Integer | Gauge | host |
Expand Down

0 comments on commit 2a09abc

Please sign in to comment.