Skip to content

Commit

Permalink
Refactor Scheduler Metrics (#3804)
Browse files Browse the repository at this point in the history
* wip

Signed-off-by: Chris Martin <[email protected]>

* lint

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* remove unnecessary classes

Signed-off-by: Chris Martin <[email protected]>

* fix tests

Signed-off-by: Chris Martin <[email protected]>

* fix tests

Signed-off-by: Chris Martin <[email protected]>

* lint

Signed-off-by: Chris Martin <[email protected]>

* fix integration test

Signed-off-by: Chris Martin <[email protected]>

* unit test

Signed-off-by: Chris Martin <[email protected]>

* unit test

Signed-off-by: Chris Martin <[email protected]>

* go mod tidy

Signed-off-by: Chris Martin <[email protected]>

* lint

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* merge master

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* fix compilation

Signed-off-by: Chris Martin <[email protected]>

* fix compilation

Signed-off-by: Chris Martin <[email protected]>

* more tests

Signed-off-by: Chris Martin <[email protected]>

* more tests

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* fix merge conflict

Signed-off-by: Chris Martin <[email protected]>

* minor fixes

Signed-off-by: Chris Martin <[email protected]>

* better names

Signed-off-by: Chris Martin <[email protected]>

* fix test

Signed-off-by: Chris Martin <[email protected]>

* add counters

Signed-off-by: Chris Martin <[email protected]>

* wip

Signed-off-by: Chris Martin <[email protected]>

* add back reset

Signed-off-by: Chris Martin <[email protected]>

* add back reset

Signed-off-by: Chris Martin <[email protected]>

* add back reset

Signed-off-by: Chris Martin <[email protected]>

* register metrics

Signed-off-by: Chris Martin <[email protected]>

* lint

Signed-off-by: Chris Martin <[email protected]>

* fix metric names

Signed-off-by: Chris Martin <[email protected]>

* review comments

Signed-off-by: Chris Martin <[email protected]>

---------

Signed-off-by: Chris Martin <[email protected]>
Co-authored-by: Chris Martin <[email protected]>
  • Loading branch information
d80tb7 and d80tb7 authored Aug 24, 2024
1 parent b006b7b commit 523771f
Show file tree
Hide file tree
Showing 26 changed files with 1,248 additions and 1,137 deletions.
17 changes: 1 addition & 16 deletions config/scheduler/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,28 +9,13 @@ queueRefreshPeriod: 10s
disableSubmitCheck: false
metrics:
port: 9000
jobStateMetricsResetInterval: 12h
refreshInterval: 30s
metrics:
scheduleCycleTimeHistogramSettings:
start: 10.0
factor: 1.1
count: 110
reconcileCycleTimeHistogramSettings:
start: 10.0
factor: 1.1
count: 110
schedulerMetrics:
trackedResourceNames:
- "cpu"
- "memory"
- "ephemeral-storage"
- "nvidia.com/gpu"
resourceRenaming:
nvidia.com/gpu: "gpu"
amd.com/gpu: "gpu"
ephemeral-storage: "ephemeralStorage"
matchedRegexIndexByErrorMessageCacheSize: 100
resetInterval: "1h"
pulsar:
URL: "pulsar://pulsar:6650"
jobsetEventsTopic: "events"
Expand Down
7 changes: 3 additions & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ require (
github.com/go-openapi/spec v0.20.14
github.com/gogo/protobuf v1.3.2
github.com/golang/protobuf v1.5.4
github.com/google/go-cmp v0.5.9 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/uuid v1.6.0
github.com/grpc-ecosystem/go-grpc-middleware v1.4.0
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0
Expand All @@ -35,7 +35,7 @@ require (
github.com/oklog/ulid v1.3.1
github.com/patrickmn/go-cache v2.1.0+incompatible
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.17.0
github.com/prometheus/client_golang v1.19.1
github.com/renstrom/shortuuid v3.0.0+incompatible
github.com/sirupsen/logrus v1.9.3
github.com/spf13/cobra v1.8.0
Expand Down Expand Up @@ -78,7 +78,7 @@ require (
github.com/magefile/mage v1.14.0
github.com/minio/highwayhash v1.0.2
github.com/openconfig/goyang v1.2.0
github.com/prometheus/common v0.45.0
github.com/prometheus/common v0.48.0
github.com/redis/go-redis/extra/redisprometheus/v9 v9.0.5
github.com/redis/go-redis/v9 v9.5.1
github.com/segmentio/fasthash v1.0.3
Expand Down Expand Up @@ -161,7 +161,6 @@ require (
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mattn/go-isatty v0.0.18 // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect
github.com/microcosm-cc/bluemonday v1.0.25 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
Expand Down
13 changes: 6 additions & 7 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -239,8 +239,9 @@ github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g=
github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
Expand Down Expand Up @@ -372,8 +373,6 @@ github.com/mattn/go-runewidth v0.0.15 h1:UNAjwbU9l54TA3KzvqLGxwWjHmMgBUVhBiTjelZ
github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mattn/go-zglob v0.0.4 h1:LQi2iOm0/fGgu80AioIJ/1j9w9Oh+9DZ39J4VAGzHQM=
github.com/mattn/go-zglob v0.0.4/go.mod h1:MxxjyoXXnMxfIpxTK2GAkw1w8glPsQILx3N5wrKakiY=
github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg=
github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k=
github.com/microcosm-cc/bluemonday v1.0.25 h1:4NEwSfiJ+Wva0VxN5B8OwMicaJvD8r9tlJWm9rtloEg=
github.com/microcosm-cc/bluemonday v1.0.25/go.mod h1:ZIOjCQp1OrzBBPIJmfX4qDYFuhU02nx4bn030ixfHLE=
github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA/g=
Expand Down Expand Up @@ -437,13 +436,13 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pquerna/cachecontrol v0.1.0 h1:yJMy84ti9h/+OEWa752kBTKv4XC30OtVVHYv/8cTqKc=
github.com/pquerna/cachecontrol v0.1.0/go.mod h1:NrUG3Z7Rdu85UNR3vm7SOsl1nFIeSiQnrHV5K9mBcUI=
github.com/prometheus/client_golang v1.17.0 h1:rl2sfwZMtSthVU752MqfjQozy7blglC+1SOtjMAMh+Q=
github.com/prometheus/client_golang v1.17.0/go.mod h1:VeL+gMmOAxkS2IqfCq0ZmHSL+LjWfWDUmp1mBz9JgUY=
github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE=
github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw=
github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI=
github.com/prometheus/common v0.45.0 h1:2BGz0eBc2hdMDLnO/8n0jeB3oPrt2D08CekT0lneoxM=
github.com/prometheus/common v0.45.0/go.mod h1:YJmSTw9BoKxJplESWWxlbyttQR4uaEcGyv9MZjVOJsY=
github.com/prometheus/common v0.48.0 h1:QO8U2CdOzSn1BBsmXJXduaaW+dY/5QLjfB8svtSzKKE=
github.com/prometheus/common v0.48.0/go.mod h1:0/KsvlIEfPQCQ5I2iNSAWKPZziNCvRs5EC6ILDTlAPc=
github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo=
github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo=
github.com/redis/go-redis/extra/redisprometheus/v9 v9.0.5 h1:kvl0LOTQD23VR1R7A9vDti9msfV6mOE2+j6ngYkFsfg=
Expand Down
47 changes: 11 additions & 36 deletions internal/scheduler/configuration/configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,7 @@ type Configuration struct {
// Configuration controlling leader election
Leader LeaderConfig
// Configuration controlling metrics
Metrics LegacyMetricsConfig
// Configuration for new scheduler metrics.
// Due to replace metrics configured via the above entry.
SchedulerMetrics MetricsConfig
Metrics MetricsConfig
// Scheduler configuration (this is shared with the old scheduler)
Scheduling SchedulingConfig
Auth authconfig.AuthConfig
Expand Down Expand Up @@ -69,28 +66,6 @@ func (c Configuration) Validate() error {
return validate.Struct(c)
}

type MetricsConfig struct {
// If true, disable metric collection and publishing.
Disabled bool
// Regexes used for job error categorisation.
// Specifically, the subCategory label for job failure counters is the first regex that matches the job error.
// If no regex matches, the subCategory label is the empty string.
TrackedErrorRegexes []string
// Metrics are exported for these resources.
TrackedResourceNames []v1.ResourceName
// Optionally rename resources in exported metrics.
// E.g., if ResourceRenaming["nvidia.com/gpu"] = "gpu", then metrics for resource "nvidia.com/gpu" use resource name "gpu" instead.
// This can be used to avoid illegal Prometheus metric names (e.g., for "nvidia.com/gpu" as "/" is not allowed).
// Allowed characters in resource names are [a-zA-Z_:][a-zA-Z0-9_:]*
// It can also be used to track multiple resources within the same metric, e.g., "nvidia.com/gpu" and "amd.com/gpu".
ResourceRenaming map[v1.ResourceName]string
// The first matching regex of each error message is cached in an LRU cache.
// This setting controls the cache size.
MatchedRegexIndexByErrorMessageCacheSize uint64
// Reset metrics this often. Resetting periodically ensures inactive time series are garbage-collected.
ResetInterval time.Duration
}

type LeaderConfig struct {
// Valid modes are "standalone" or "kubernetes"
Mode string `validate:"required"`
Expand Down Expand Up @@ -128,16 +103,16 @@ type HttpConfig struct {
Port int `validate:"required"`
}

// TODO: ALl this needs to be unified with MetricsConfig
type LegacyMetricsConfig struct {
Port uint16
RefreshInterval time.Duration
Metrics SchedulerMetricsConfig
}

type SchedulerMetricsConfig struct {
ScheduleCycleTimeHistogramSettings HistogramConfig
ReconcileCycleTimeHistogramSettings HistogramConfig
type MetricsConfig struct {
Port uint16
RefreshInterval time.Duration
JobStateMetricsResetInterval time.Duration
// Regexes used for job error categorisation.
// Specifically, the subCategory label for job failure counters is the first regex that matches the job error.
// If no regex matches, the subCategory label is the empty string.
TrackedErrorRegexes []string
// Metrics are exported for these resources.
TrackedResourceNames []v1.ResourceName
}

type HistogramConfig struct {
Expand Down
14 changes: 14 additions & 0 deletions internal/scheduler/context/scheduling.go
Original file line number Diff line number Diff line change
Expand Up @@ -378,3 +378,17 @@ func (sctx *SchedulingContext) AllocatedByQueueAndPriority() map[string]schedule
}
return rv
}

// FairnessError returns the cumulative delta between adjusted fair share and actual share for all users who
// are below their fair share
func (sctx *SchedulingContext) FairnessError() float64 {
fairnessError := 0.0
for _, qctx := range sctx.QueueSchedulingContexts {
actualShare := sctx.FairnessCostProvider.UnweightedCostFromQueue(qctx)
delta := qctx.AdjustedFairShare - actualShare
if delta > 0 {
fairnessError += delta
}
}
return fairnessError
}
87 changes: 72 additions & 15 deletions internal/scheduler/context/scheduling_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package context

import (
"fmt"
"testing"

"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -58,21 +59,11 @@ func TestSchedulingContextAccounting(t *testing.T) {
}

func TestCalculateFairShares(t *testing.T) {
zeroCpu := schedulerobjects.ResourceList{
Resources: map[string]resource.Quantity{"cpu": resource.MustParse("0")},
}
oneCpu := schedulerobjects.ResourceList{
Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1")},
}
fortyCpu := schedulerobjects.ResourceList{
Resources: map[string]resource.Quantity{"cpu": resource.MustParse("40")},
}
oneHundredCpu := schedulerobjects.ResourceList{
Resources: map[string]resource.Quantity{"cpu": resource.MustParse("100")},
}
oneThousandCpu := schedulerobjects.ResourceList{
Resources: map[string]resource.Quantity{"cpu": resource.MustParse("1000")},
}
zeroCpu := cpu(0)
oneCpu := cpu(1)
fortyCpu := cpu(40)
oneHundredCpu := cpu(100)
oneThousandCpu := cpu(1000)
tests := map[string]struct {
availableResources schedulerobjects.ResourceList
queueCtxs map[string]*QueueSchedulingContext
Expand Down Expand Up @@ -208,6 +199,66 @@ func TestCalculateFairShares(t *testing.T) {
}
}

func TestCalculateFairnessError(t *testing.T) {
tests := map[string]struct {
availableResources schedulerobjects.ResourceList
queueCtxs map[string]*QueueSchedulingContext
expected float64
}{
"one queue, no error": {
availableResources: cpu(100),
queueCtxs: map[string]*QueueSchedulingContext{
"queueA": {Allocated: cpu(50), AdjustedFairShare: 0.5},
},
expected: 0,
},
"two queues, no error": {
availableResources: cpu(100),
queueCtxs: map[string]*QueueSchedulingContext{
"queueA": {Allocated: cpu(50), AdjustedFairShare: 0.5},
"queueB": {Allocated: cpu(50), AdjustedFairShare: 0.5},
},
expected: 0,
},
"one queue with error": {
availableResources: cpu(100),
queueCtxs: map[string]*QueueSchedulingContext{
"queueA": {Allocated: cpu(40), AdjustedFairShare: 0.5},
},
expected: 0.1,
},
"two queues with error": {
availableResources: cpu(100),
queueCtxs: map[string]*QueueSchedulingContext{
"queueA": {Allocated: cpu(40), AdjustedFairShare: 0.5},
"queueB": {Allocated: cpu(10), AdjustedFairShare: 0.5},
},
expected: 0.5,
},
"above fair share is not counted": {
availableResources: cpu(100),
queueCtxs: map[string]*QueueSchedulingContext{
"queueA": {Allocated: cpu(100), AdjustedFairShare: 0.5},
},
expected: 0.0,
},
"empty": {
availableResources: cpu(100),
queueCtxs: map[string]*QueueSchedulingContext{},
expected: 0.0,
},
}
for name, tc := range tests {
t.Run(name, func(t *testing.T) {
fairnessCostProvider, err := fairness.NewDominantResourceFairness(tc.availableResources, configuration.SchedulingConfig{DominantResourceFairnessResourcesToConsider: []string{"cpu"}})
require.NoError(t, err)
sctx := NewSchedulingContext("pool", fairnessCostProvider, nil, tc.availableResources)
sctx.QueueSchedulingContexts = tc.queueCtxs
assert.InDelta(t, tc.expected, sctx.FairnessError(), 0.00001)
})
}
}

func testNSmallCpuJobSchedulingContext(queue, priorityClassName string, n int) []*JobSchedulingContext {
rv := make([]*JobSchedulingContext, n)
for i := 0; i < n; i++ {
Expand All @@ -226,3 +277,9 @@ func testSmallCpuJobSchedulingContext(queue, priorityClassName string) *JobSched
GangInfo: EmptyGangInfo(job),
}
}

func cpu(n int) schedulerobjects.ResourceList {
return schedulerobjects.ResourceList{
Resources: map[string]resource.Quantity{"cpu": resource.MustParse(fmt.Sprintf("%d", n))},
}
}
21 changes: 21 additions & 0 deletions internal/scheduler/jobdb/job_run.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,13 @@ func (run *JobRun) Executor() string {
return run.executor
}

// WithExecutor returns a copy of the job run with the executor updated.
func (run *JobRun) WithExecutor(executor string) *JobRun {
run = run.DeepCopy()
run.executor = executor
return run
}

// NodeId returns the id of the node to which the JobRun is assigned.
func (run *JobRun) NodeId() string {
return run.nodeId
Expand All @@ -290,11 +297,25 @@ func (run *JobRun) Pool() string {
return run.pool
}

// WithPool returns a copy of the job run with the pool updated
func (run *JobRun) WithPool(pool string) *JobRun {
run = run.DeepCopy()
run.pool = pool
return run
}

// NodeName returns the name of the node to which the JobRun is assigned.
func (run *JobRun) NodeName() string {
return run.nodeName
}

// WithNodeName returns a copy of the job run with the node name updated.
func (run *JobRun) WithNodeName(nodeName string) *JobRun {
run = run.DeepCopy()
run.nodeName = nodeName
return run
}

func (run *JobRun) ScheduledAtPriority() *int32 {
return run.scheduledAtPriority
}
Expand Down
29 changes: 29 additions & 0 deletions internal/scheduler/metrics/constants.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package metrics

const (

// common prefix for all metric names
prefix = "armada_scheduler_"

// Prometheus Labels
poolLabel = "pool"
queueLabel = "queue"
priorityClassLabel = "priority_class"
nodeLabel = "node"
clusterLabel = "cluster"
errorCategoryLabel = "category"
errorSubcategoryLabel = "subcategory"
stateLabel = "state"
priorStateLabel = "priorState"
resourceLabel = "resource"

// Job state strings
queued = "queued"
running = "running"
pending = "pending"
cancelled = "cancelled"
leased = "leased"
preempted = "preempted"
failed = "failed"
succeeded = "succeeded"
)
Loading

0 comments on commit 523771f

Please sign in to comment.