diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4196a2e03bf..e19855fb26b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ BUG FIXES:
 
  * api: autoscaling policies should not be returned for stopped jobs [[GH-7768](https://github.com/hashicorp/nomad/issues/7768)]
  * core: job scale status endpoint was returning incorrect counts [[GH-7789](https://github.com/hashicorp/nomad/issues/7789)]
+ * core: Fixed a bug where scores for allocations were biased toward nodes with resource reservations [[GH-7730](https://github.com/hashicorp/nomad/issues/7730)]
  * jobspec: autoscaling policy block should return a parsing error multiple `policy` blocks are provided [[GH-7716](https://github.com/hashicorp/nomad/issues/7716)]
  * ui: Fixed a bug where exec popup had incorrect URL for jobs where name ≠ id [[GH-7814](https://github.com/hashicorp/nomad/issues/7814)]
 
diff --git a/nomad/structs/funcs.go b/nomad/structs/funcs.go
index da449a1eca6..cee0dd4067b 100644
--- a/nomad/structs/funcs.go
+++ b/nomad/structs/funcs.go
@@ -101,12 +101,9 @@ func FilterTerminalAllocs(allocs []*Allocation) ([]*Allocation, map[string]*Allo
 // ensured there are no collisions. If checkDevices is set to true, we check if
 // there is a device oversubscription.
 func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevices bool) (bool, string, *ComparableResources, error) {
-	// Compute the utilization from zero
+	// Compute the allocs' utilization from zero
 	used := new(ComparableResources)
 
-	// Add the reserved resources of the node
-	used.Add(node.ComparableReservedResources())
-
 	// For each alloc, add the resources
 	for _, alloc := range allocs {
 		// Do not consider the resource impact of terminal allocations
@@ -117,9 +114,11 @@ func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevi
 		used.Add(alloc.ComparableResources())
 	}
 
-	// Check that the node resources are a super set of those
-	// that are being allocated
-	if superset, dimension := node.ComparableResources().Superset(used); !superset {
+	// Check that the node resources (after subtracting reserved) are a
+	// super set of those that are being allocated
+	available := node.ComparableResources()
+	available.Subtract(node.ComparableReservedResources())
+	if superset, dimension := available.Superset(used); !superset {
 		return false, dimension, used, nil
 	}
 
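The funcs.go hunk above is the heart of the fix. As a rough, self-contained sketch of the before/after accounting (the toy types and helper names below are illustrative, not Nomad's actual `ComparableResources` API): the old code folded the node's reservation into `used` and checked it against total capacity, while the new code sums only the allocations and checks them against capacity minus the reservation.

```go
package main

import "fmt"

// res is a toy stand-in for Nomad's ComparableResources; field and method
// names here are illustrative only.
type res struct {
	cpuMHz int
	memMB  int
}

func (r *res) add(o res)      { r.cpuMHz += o.cpuMHz; r.memMB += o.memMB }
func (r *res) subtract(o res) { r.cpuMHz -= o.cpuMHz; r.memMB -= o.memMB }

// superset reports whether r covers o on every dimension.
func (r res) superset(o res) bool {
	return r.cpuMHz >= o.cpuMHz && r.memMB >= o.memMB
}

// allocsFit mirrors the fixed logic: sum only the allocations into "used",
// then compare against the node's capacity with the reservation subtracted.
func allocsFit(total, reserved res, allocs []res) (bool, res) {
	used := res{}
	for _, a := range allocs {
		used.add(a)
	}
	available := total
	available.subtract(reserved)
	return available.superset(used), used
}

func main() {
	total := res{cpuMHz: 2000, memMB: 2000}
	reserved := res{cpuMHz: 800, memMB: 800}
	alloc := res{cpuMHz: 1000, memMB: 1000}

	// One allocation fits: 1000 <= 2000-800 on both dimensions.
	fit, used := allocsFit(total, reserved, []res{alloc})
	fmt.Println(fit, used) // true {1000 1000}

	// Two identical allocations no longer fit: 2000 > 1200.
	fit, used = allocsFit(total, reserved, []res{alloc, alloc})
	fmt.Println(fit, used) // false {2000 2000}
}
```

For feasibility the two forms are equivalent (`reserved + allocs <= total` iff `allocs <= total - reserved`), which is why the upgrade note further down can say feasibility is unchanged; what changes is the `used` figure that feeds into scoring.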
diff --git a/nomad/structs/funcs_test.go b/nomad/structs/funcs_test.go
index b01ce5e6371..4302164a61f 100644
--- a/nomad/structs/funcs_test.go
+++ b/nomad/structs/funcs_test.go
@@ -188,8 +188,8 @@ func TestAllocsFit_Old(t *testing.T) {
 	require.True(fit)
 
 	// Sanity check the used resources
-	require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
-	require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
+	require.EqualValues(1000, used.Flattened.Cpu.CpuShares)
+	require.EqualValues(1024, used.Flattened.Memory.MemoryMB)
 
 	// Should not fit second allocation
 	fit, _, used, err = AllocsFit(n, []*Allocation{a1, a1}, nil, false)
@@ -197,8 +197,8 @@ func TestAllocsFit_Old(t *testing.T) {
 	require.False(fit)
 
 	// Sanity check the used resources
-	require.EqualValues(3000, used.Flattened.Cpu.CpuShares)
-	require.EqualValues(3072, used.Flattened.Memory.MemoryMB)
+	require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
+	require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
 }
 
 // COMPAT(0.11): Remove in 0.11
@@ -255,8 +255,8 @@ func TestAllocsFit_TerminalAlloc_Old(t *testing.T) {
 	require.True(fit)
 
 	// Sanity check the used resources
-	require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
-	require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
+	require.EqualValues(1000, used.Flattened.Cpu.CpuShares)
+	require.EqualValues(1024, used.Flattened.Memory.MemoryMB)
 
 	// Should fit second allocation since it is terminal
 	a2 := a1.Copy()
@@ -266,8 +266,8 @@ func TestAllocsFit_TerminalAlloc_Old(t *testing.T) {
 	require.True(fit)
 
 	// Sanity check the used resources
-	require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
-	require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
+	require.EqualValues(1000, used.Flattened.Cpu.CpuShares)
+	require.EqualValues(1024, used.Flattened.Memory.MemoryMB)
 }
 
 func TestAllocsFit(t *testing.T) {
@@ -340,8 +340,8 @@ func TestAllocsFit(t *testing.T) {
 	require.True(fit)
 
 	// Sanity check the used resources
-	require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
-	require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
+	require.EqualValues(1000, used.Flattened.Cpu.CpuShares)
+	require.EqualValues(1024, used.Flattened.Memory.MemoryMB)
 
 	// Should not fit second allocation
 	fit, _, used, err = AllocsFit(n, []*Allocation{a1, a1}, nil, false)
@@ -349,8 +349,8 @@ func TestAllocsFit(t *testing.T) {
 	require.False(fit)
 
 	// Sanity check the used resources
-	require.EqualValues(3000, used.Flattened.Cpu.CpuShares)
-	require.EqualValues(3072, used.Flattened.Memory.MemoryMB)
+	require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
+	require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
 }
 
 func TestAllocsFit_TerminalAlloc(t *testing.T) {
@@ -424,8 +424,8 @@ func TestAllocsFit_TerminalAlloc(t *testing.T) {
 	require.True(fit)
 
 	// Sanity check the used resources
-	require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
-	require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
+	require.EqualValues(1000, used.Flattened.Cpu.CpuShares)
+	require.EqualValues(1024, used.Flattened.Memory.MemoryMB)
 
 	// Should fit second allocation since it is terminal
 	a2 := a1.Copy()
@@ -435,8 +435,8 @@ func TestAllocsFit_TerminalAlloc(t *testing.T) {
 	require.True(fit, dim)
 
 	// Sanity check the used resources
-	require.EqualValues(2000, used.Flattened.Cpu.CpuShares)
-	require.EqualValues(2048, used.Flattened.Memory.MemoryMB)
+	require.EqualValues(1000, used.Flattened.Cpu.CpuShares)
+	require.EqualValues(1024, used.Flattened.Memory.MemoryMB)
 }
 
 // Tests that AllocsFit detects device collisions
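The assertion changes above follow mechanically from the new accounting. Judging from the before/after values (inferred from the diff, not read from the test fixtures), the mock node reserves 1000 MHz of CPU and 1024 MB of memory and each allocation asks for the same amounts, so the reported `used` simply drops by the reservation:

```go
package main

import "fmt"

func main() {
	// Values inferred from the before/after assertions in the diff, not read
	// from the test fixtures themselves.
	const (
		allocCPU    = 1000 // one allocation's CPU shares
		allocMem    = 1024 // one allocation's memory in MB
		reservedCPU = 1000 // node reservation implied by the old expectations
		reservedMem = 1024
	)

	// Old accounting folded the reservation into "used".
	fmt.Println(reservedCPU+allocCPU, reservedMem+allocMem) // 2000 2048

	// New accounting reports only the allocations themselves.
	fmt.Println(allocCPU, allocMem) // 1000 1024
}
```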
diff --git a/scheduler/rank_test.go b/scheduler/rank_test.go
index 2cae33087f6..aa1ee01a1f0 100644
--- a/scheduler/rank_test.go
+++ b/scheduler/rank_test.go
@@ -1,6 +1,7 @@
 package scheduler
 
 import (
+	"sort"
 	"testing"
 
 	"github.com/hashicorp/nomad/helper/uuid"
@@ -122,11 +123,128 @@ func TestBinPackIterator_NoExistingAlloc(t *testing.T) {
 	if out[0].FinalScore != 1.0 {
 		t.Fatalf("Bad Score: %v", out[0].FinalScore)
 	}
-	if out[1].FinalScore < 0.75 || out[1].FinalScore > 0.95 {
+	if out[1].FinalScore < 0.50 || out[1].FinalScore > 0.60 {
 		t.Fatalf("Bad Score: %v", out[1].FinalScore)
 	}
 }
 
+// TestBinPackIterator_NoExistingAlloc_MixedReserve asserts that nodes with
+// reserved resources are scored as if they had a lower amount of
+// resources.
+func TestBinPackIterator_NoExistingAlloc_MixedReserve(t *testing.T) {
+	_, ctx := testContext(t)
+	nodes := []*RankedNode{
+		{
+			// Best fit
+			Node: &structs.Node{
+				Name: "no-reserved",
+				NodeResources: &structs.NodeResources{
+					Cpu: structs.NodeCpuResources{
+						CpuShares: 1100,
+					},
+					Memory: structs.NodeMemoryResources{
+						MemoryMB: 1100,
+					},
+				},
+			},
+		},
+		{
+			// Not best fit if reserve is calculated properly
+			Node: &structs.Node{
+				Name: "reserved",
+				NodeResources: &structs.NodeResources{
+					Cpu: structs.NodeCpuResources{
+						CpuShares: 2000,
+					},
+					Memory: structs.NodeMemoryResources{
+						MemoryMB: 2000,
+					},
+				},
+				ReservedResources: &structs.NodeReservedResources{
+					Cpu: structs.NodeReservedCpuResources{
+						CpuShares: 800,
+					},
+					Memory: structs.NodeReservedMemoryResources{
+						MemoryMB: 800,
+					},
+				},
+			},
+		},
+		{
+			// Even worse fit due to reservations
+			Node: &structs.Node{
+				Name: "reserved2",
+				NodeResources: &structs.NodeResources{
+					Cpu: structs.NodeCpuResources{
+						CpuShares: 2000,
+					},
+					Memory: structs.NodeMemoryResources{
+						MemoryMB: 2000,
+					},
+				},
+				ReservedResources: &structs.NodeReservedResources{
+					Cpu: structs.NodeReservedCpuResources{
+						CpuShares: 500,
+					},
+					Memory: structs.NodeReservedMemoryResources{
+						MemoryMB: 500,
+					},
+				},
+			},
+		},
+		{
+			Node: &structs.Node{
+				Name: "overloaded",
+				NodeResources: &structs.NodeResources{
+					Cpu: structs.NodeCpuResources{
+						CpuShares: 900,
+					},
+					Memory: structs.NodeMemoryResources{
+						MemoryMB: 900,
+					},
+				},
+			},
+		},
+	}
+	static := NewStaticRankIterator(ctx, nodes)
+
+	taskGroup := &structs.TaskGroup{
+		EphemeralDisk: &structs.EphemeralDisk{},
+		Tasks: []*structs.Task{
+			{
+				Name: "web",
+				Resources: &structs.Resources{
+					CPU:      1000,
+					MemoryMB: 1000,
+				},
+			},
+		},
+	}
+	binp := NewBinPackIterator(ctx, static, false, 0)
+	binp.SetTaskGroup(taskGroup)
+
+	scoreNorm := NewScoreNormalizationIterator(ctx, binp)
+
+	out := collectRanked(scoreNorm)
+
+	// Sort descending (highest score to lowest) and log for debugging
+	sort.Slice(out, func(i, j int) bool { return out[i].FinalScore > out[j].FinalScore })
+	for i := range out {
+		t.Logf("Node: %-12s Score: %-1.4f", out[i].Node.Name, out[i].FinalScore)
+	}
+
+	// 3 nodes should be feasible
+	require.Len(t, out, 3)
+
+	// Node without reservations is the best fit
+	require.Equal(t, nodes[0].Node.Name, out[0].Node.Name)
+
+	// Node with smallest remaining resources ("best fit") should get a
+	// higher score than node with more remaining resources ("worse fit")
+	require.Equal(t, nodes[1].Node.Name, out[1].Node.Name)
+	require.Equal(t, nodes[2].Node.Name, out[2].Node.Name)
+}
+
 // Tests bin packing iterator with network resources at task and task group level
 func TestBinPackIterator_Network_Success(t *testing.T) {
 	_, ctx := testContext(t)
@@ -239,7 +357,7 @@ func TestBinPackIterator_Network_Success(t *testing.T) {
 	// First node should have a perfect score
 	require.Equal(1.0, out[0].FinalScore)
 
-	if out[1].FinalScore < 0.75 || out[1].FinalScore > 0.95 {
+	if out[1].FinalScore < 0.50 || out[1].FinalScore > 0.60 {
 		t.Fatalf("Bad Score: %v", out[1].FinalScore)
 	}
 
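A quick way to read the new test's expectations: treat each node's effective capacity as NodeResources minus ReservedResources and compare it against the 1000 MHz / 1000 MB task. The sketch below only restates the fixture arithmetic; it is not the scheduler's actual scoring formula.

```go
package main

import "fmt"

func main() {
	// The four fixture nodes from TestBinPackIterator_NoExistingAlloc_MixedReserve;
	// CPU (MHz) and memory (MB) use the same numbers, so one column suffices.
	nodes := []struct {
		name            string
		total, reserved int
	}{
		{"no-reserved", 1100, 0},
		{"reserved", 2000, 800},
		{"reserved2", 2000, 500},
		{"overloaded", 900, 0},
	}

	const task = 1000 // the "web" task asks for 1000 MHz / 1000 MB

	for _, n := range nodes {
		effective := n.total - n.reserved
		fmt.Printf("%-12s effective=%d feasible=%v leftover=%d\n",
			n.name, effective, effective >= task, effective-task)
	}
	// no-reserved: effective 1100, leftover 100  -> tightest fit, highest score
	// reserved:    effective 1200, leftover 200
	// reserved2:   effective 1500, leftover 500
	// overloaded:  effective 900   -> infeasible, filtered out (only 3 ranked)
}
```

Bin packing prefers the node with the least leftover capacity, which is exactly the ordering the test asserts.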
diff --git a/website/pages/docs/upgrade/upgrade-specific.mdx b/website/pages/docs/upgrade/upgrade-specific.mdx
index 105a6d5f6ba..ec3db96041c 100644
--- a/website/pages/docs/upgrade/upgrade-specific.mdx
+++ b/website/pages/docs/upgrade/upgrade-specific.mdx
@@ -15,6 +15,22 @@ details provided for their upgrades as a result of new features or changed
 behavior. This page is used to document those details separately from the
 standard upgrade flow.
 
+## Nomad 0.11.2
+
+### Scheduler Scoring Changes
+
+Prior to Nomad 0.11.2 the scheduler algorithm used a [node's reserved
+resources][reserved] incorrectly during scoring. As a result, scoring was
+biased in favor of nodes with reserved resources over nodes without reserved
+resources.
+
+Placements will be more correct but may differ slightly in v0.11.2 compared to
+earlier versions of Nomad. Operators do *not* need to take any action, as the
+bug fix only minimally affects scoring.
+
+Feasibility (whether a node is capable of running a job at all) is *not*
+affected.
+
 ## Nomad 0.11.0
 
 ### client.template: `vault_grace` deprecation
@@ -466,21 +482,22 @@ draining the node so no tasks are running on it. This can be verified by
 running state. Once that is done the client can be killed, the `data_dir`
 should be deleted and then Nomad 0.3.0 can be launched.
 
+[dangling-containers]: /docs/drivers/docker#dangling-containers
 [drain-api]: /api-docs/nodes#drain-node
 [drain-cli]: /docs/commands/node/drain
-[dangling-containers]: /docs/drivers/docker#dangling-containers
 [gh-6787]: https://github.com/hashicorp/nomad/issues/6787
 [hcl2]: https://github.com/hashicorp/hcl2
 [limits]: /docs/configuration#limits
 [lxc]: /docs/drivers/external/lxc
 [migrate]: /docs/job-specification/migrate
-[plugins]: /docs/drivers/external
 [plugin-stanza]: /docs/configuration/plugin
-[preemption]: /docs/internals/scheduling/preemption
+[plugins]: /docs/drivers/external
 [preemption-api]: /api-docs/operator#update-scheduler-configuration
+[preemption]: /docs/internals/scheduling/preemption
+[reserved]: /docs/configuration/client#reserved-parameters
 [task-config]: /docs/job-specification/task#config
-[validate]: /docs/commands/job/validate
-[vault_grace]: /docs/job-specification/template
-[update]: /docs/job-specification/update
 [tls-guide]: https://learn.hashicorp.com/nomad/transport-security/enable-tls
 [tls-vault-guide]: https://learn.hashicorp.com/nomad/vault-integration/vault-pki-nomad
+[update]: /docs/job-specification/update
+[validate]: /docs/commands/job/validate
+[vault_grace]: /docs/job-specification/template
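To make the upgrade note concrete: under the old accounting a node's reservation was counted as if it were already-running load on the full node, which inflated its apparent utilization; since the default bin-packing algorithm prefers busier nodes, that nudged placements toward nodes with reservations. The sketch below uses a deliberately simplified utilization ratio (Nomad's actual fitness calculation is more involved) with the numbers from the new rank test:

```go
package main

import "fmt"

// utilization is a deliberately simplified stand-in for the bin-packing
// input; Nomad's real fitness calculation is more involved.
func utilization(used, capacity float64) float64 { return used / capacity }

func main() {
	const (
		total    = 2000.0 // node capacity (MHz or MB)
		reserved = 800.0  // operator-configured reservation
		proposed = 1000.0 // what the new allocation asks for
	)

	// Old accounting: the reservation was treated as load on the full node,
	// so the node looked busier than it really was.
	old := utilization(reserved+proposed, total) // 0.90

	// Fixed accounting: the reservation shrinks the node instead.
	fixed := utilization(proposed, total-reserved) // ~0.83

	fmt.Printf("old=%.2f fixed=%.2f\n", old, fixed)
}
```

In both cases the allocation still fits, which is why feasibility is unaffected; only the node's relative score moves.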