Failed Allocation Metrics stored in Evaluation #1188

Merged · 6 commits · May 25, 2016
Changes from all commits
2 changes: 2 additions & 0 deletions api/evaluations.go
@@ -67,6 +67,8 @@ type Evaluation struct {
Wait time.Duration
NextEval string
PreviousEval string
BlockedEval string
FailedTGAllocs map[string]*AllocationMetric
CreateIndex uint64
ModifyIndex uint64
}
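
For orientation, a minimal, hypothetical sketch of how an API consumer could read the two new fields through the Go api package; the client setup is standard and the evaluation ID is a placeholder, not a value from this PR:

package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Assumes a reachable Nomad agent; DefaultConfig falls back to the
	// local agent address.
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	evalID := "REPLACE-WITH-EVAL-ID" // placeholder
	eval, _, err := client.Evaluations().Info(evalID, nil)
	if err != nil {
		log.Fatal(err)
	}

	// FailedTGAllocs carries per-task-group placement metrics when the
	// scheduler could not place every allocation.
	for tg, metric := range eval.FailedTGAllocs {
		fmt.Printf("task group %q: %d coalesced failures, %d nodes evaluated\n",
			tg, metric.CoalescedFailures, metric.NodesEvaluated)
	}

	// BlockedEval names the follow-up evaluation created to retry the
	// remaining placements once capacity frees up.
	if eval.BlockedEval != "" {
		fmt.Printf("blocked eval: %s\n", eval.BlockedEval)
	}
}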
59 changes: 45 additions & 14 deletions command/monitor.go
@@ -147,10 +147,14 @@ func (m *monitor) update(update *evalState) {
} else {
switch {
case existing.client != alloc.client:
description := ""
if alloc.clientDesc != "" {
description = fmt.Sprintf(" (%s)", alloc.clientDesc)
}
// Allocation status has changed
m.ui.Output(fmt.Sprintf(
"Allocation %q status changed: %q -> %q (%s)",
limit(alloc.id, m.length), existing.client, alloc.client, alloc.clientDesc))
"Allocation %q status changed: %q -> %q%s",
limit(alloc.id, m.length), existing.client, alloc.client, description))
}
}
}
@@ -288,9 +292,31 @@ func (m *monitor) monitor(evalID string, allowPrefix bool) int {
m.update(state)

switch eval.Status {
case structs.EvalStatusComplete, structs.EvalStatusFailed:
m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
limit(eval.ID, m.length), eval.Status))
case structs.EvalStatusComplete, structs.EvalStatusFailed, structs.EvalStatusCancelled:
if len(eval.FailedTGAllocs) == 0 {
m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
limit(eval.ID, m.length), eval.Status))
} else {
// There were failures making the allocations
schedFailure = true
m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations:",
limit(eval.ID, m.length), eval.Status))

// Print the failures per task group
for tg, metrics := range eval.FailedTGAllocs {
noun := "allocation"
if metrics.CoalescedFailures > 0 {
noun += "s"
}
m.ui.Output(fmt.Sprintf("Task Group %q (failed to place %d %s):", tg, metrics.CoalescedFailures+1, noun))
dumpAllocMetrics(m.ui, metrics, false)
}

if eval.BlockedEval != "" {
m.ui.Output(fmt.Sprintf("Evaluation %q waiting for additional capacity to place remainder",
limit(eval.BlockedEval, m.length)))
}
}
default:
// Wait for the next update
time.Sleep(updateWait)
@@ -332,41 +358,46 @@ func dumpAllocStatus(ui cli.Ui, alloc *api.Allocation, length int) {
ui.Output(fmt.Sprintf("Allocation %q status %q (%d/%d nodes filtered)",
limit(alloc.ID, length), alloc.ClientStatus,
alloc.Metrics.NodesFiltered, alloc.Metrics.NodesEvaluated))
dumpAllocMetrics(ui, alloc.Metrics, true)
}

func dumpAllocMetrics(ui cli.Ui, metrics *api.AllocationMetric, scores bool) {
// Print a helpful message if we have an eligibility problem
if alloc.Metrics.NodesEvaluated == 0 {
if metrics.NodesEvaluated == 0 {
ui.Output(" * No nodes were eligible for evaluation")
}

// Print a helpful message if the user has asked for a DC that has no
// available nodes.
for dc, available := range alloc.Metrics.NodesAvailable {
for dc, available := range metrics.NodesAvailable {
if available == 0 {
ui.Output(fmt.Sprintf(" * No nodes are available in datacenter %q", dc))
}
}

// Print filter info
for class, num := range alloc.Metrics.ClassFiltered {
for class, num := range metrics.ClassFiltered {
ui.Output(fmt.Sprintf(" * Class %q filtered %d nodes", class, num))
}
for cs, num := range alloc.Metrics.ConstraintFiltered {
for cs, num := range metrics.ConstraintFiltered {
ui.Output(fmt.Sprintf(" * Constraint %q filtered %d nodes", cs, num))
}

// Print exhaustion info
if ne := alloc.Metrics.NodesExhausted; ne > 0 {
if ne := metrics.NodesExhausted; ne > 0 {
ui.Output(fmt.Sprintf(" * Resources exhausted on %d nodes", ne))
}
for class, num := range alloc.Metrics.ClassExhausted {
for class, num := range metrics.ClassExhausted {
ui.Output(fmt.Sprintf(" * Class %q exhausted on %d nodes", class, num))
}
for dim, num := range alloc.Metrics.DimensionExhausted {
for dim, num := range metrics.DimensionExhausted {
ui.Output(fmt.Sprintf(" * Dimension %q exhausted on %d nodes", dim, num))
}

// Print scores
for name, score := range alloc.Metrics.Scores {
ui.Output(fmt.Sprintf(" * Score %q = %f", name, score))
if scores {
for name, score := range metrics.Scores {
ui.Output(fmt.Sprintf(" * Score %q = %f", name, score))
}
}
}
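
Rendered from the format strings above, the new failure path prints output roughly like the following; the evaluation IDs, task group name, datacenter, and counts are invented for illustration, and any prefixes added by the CLI's UI wrapper are omitted:

Evaluation "e5a8c173" finished with status "complete" but failed to place all allocations:
Task Group "cache" (failed to place 3 allocations):
 * No nodes are available in datacenter "dc2"
 * Resources exhausted on 2 nodes
Evaluation "1b3f2d9c" waiting for additional capacity to place remainder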
3 changes: 0 additions & 3 deletions nomad/plan_apply.go
@@ -124,7 +124,6 @@ func (s *Server) applyPlan(job *structs.Job, result *structs.PlanResult, snap *s
// are multiple updates per node
minUpdates := len(result.NodeUpdate)
minUpdates += len(result.NodeAllocation)
minUpdates += len(result.FailedAllocs)

// Setup the update request
req := structs.AllocUpdateRequest{
@@ -137,7 +136,6 @@ func (s *Server) applyPlan(job *structs.Job, result *structs.PlanResult, snap *s
for _, allocList := range result.NodeAllocation {
req.Alloc = append(req.Alloc, allocList...)
}
req.Alloc = append(req.Alloc, result.FailedAllocs...)

// Set the time the alloc was applied for the first time. This can be used
// to approximate the scheduling time.
@@ -200,7 +198,6 @@ func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.P
result := &structs.PlanResult{
NodeUpdate: make(map[string][]*structs.Allocation),
NodeAllocation: make(map[string][]*structs.Allocation),
FailedAllocs: plan.FailedAllocs,
}

// Collect all the nodeIDs
17 changes: 2 additions & 15 deletions nomad/plan_apply_test.go
@@ -51,12 +51,10 @@ func TestPlanApply_applyPlan(t *testing.T) {

// Register alloc
alloc := mock.Alloc()
allocFail := mock.Alloc()
plan := &structs.PlanResult{
NodeAllocation: map[string][]*structs.Allocation{
node.ID: []*structs.Allocation{alloc},
},
FailedAllocs: []*structs.Allocation{allocFail},
}

// Snapshot the state
@@ -94,15 +92,6 @@ func TestPlanApply_applyPlan(t *testing.T) {
t.Fatalf("missing alloc")
}

// Lookup the allocation
out, err = s1.fsm.State().AllocByID(allocFail.ID)
if err != nil {
t.Fatalf("err: %v", err)
}
if out == nil {
t.Fatalf("missing alloc")
}

// Evict alloc, Register alloc2
allocEvict := new(structs.Allocation)
*allocEvict = *alloc
@@ -178,12 +167,10 @@ func TestPlanApply_EvalPlan_Simple(t *testing.T) {
snap, _ := state.Snapshot()

alloc := mock.Alloc()
allocFail := mock.Alloc()
plan := &structs.Plan{
NodeAllocation: map[string][]*structs.Allocation{
node.ID: []*structs.Allocation{alloc},
},
FailedAllocs: []*structs.Allocation{allocFail},
}

pool := NewEvaluatePool(workerPoolSize, workerPoolBufferSize)
@@ -196,8 +183,8 @@ func TestPlanApply_EvalPlan_Simple(t *testing.T) {
if result == nil {
t.Fatalf("missing result")
}
if !reflect.DeepEqual(result.FailedAllocs, plan.FailedAllocs) {
t.Fatalf("missing failed allocs")
if !reflect.DeepEqual(result.NodeAllocation, plan.NodeAllocation) {
t.Fatalf("incorrect node allocations")
}
}

51 changes: 33 additions & 18 deletions nomad/structs/structs.go
@@ -2617,6 +2617,16 @@ type Evaluation struct {
// This is used to support rolling upgrades, where we need a chain of evaluations.
PreviousEval string

// BlockedEval is the evaluation ID for a created blocked eval. A
// blocked eval will be created if not all allocations could be placed due
// to constraints or a lack of resources.
BlockedEval string

// FailedTGAllocs are task groups which have allocations that could not be
// made, but the metrics are persisted so that the user can use the feedback
// to determine the cause.
FailedTGAllocs map[string]*AllocMetric

// ClassEligibility tracks computed node classes that have been explicitly
// marked as eligible or ineligible.
ClassEligibility map[string]bool
@@ -2655,6 +2665,25 @@ func (e *Evaluation) Copy() *Evaluation {
}
ne := new(Evaluation)
*ne = *e

// Copy ClassEligibility
if e.ClassEligibility != nil {
classes := make(map[string]bool, len(e.ClassEligibility))
for class, elig := range e.ClassEligibility {
classes[class] = elig
}
ne.ClassEligibility = classes
}

// Copy FailedTGAllocs
if e.FailedTGAllocs != nil {
failedTGs := make(map[string]*AllocMetric, len(e.FailedTGAllocs))
for tg, metric := range e.FailedTGAllocs {
failedTGs[tg] = metric.Copy()
}
ne.FailedTGAllocs = failedTGs
}

return ne
}

@@ -2715,10 +2744,10 @@ func (e *Evaluation) NextRollingEval(wait time.Duration) *Evaluation {
}
}

// BlockedEval creates a blocked evaluation to follow up this eval to place any
// CreateBlockedEval creates a blocked evaluation to follow up this eval to place any
// failed allocations. It takes the classes marked explicitly eligible or
// ineligible and whether the job has escaped computed node classes.
func (e *Evaluation) BlockedEval(classEligibility map[string]bool, escaped bool) *Evaluation {
func (e *Evaluation) CreateBlockedEval(classEligibility map[string]bool, escaped bool) *Evaluation {
return &Evaluation{
ID: GenerateUUID(),
Priority: e.Priority,
@@ -2769,11 +2798,6 @@ type Plan struct {
// The evicts must be considered prior to the allocations.
NodeAllocation map[string][]*Allocation

// FailedAllocs are allocations that could not be made,
// but are persisted so that the user can use the feedback
// to determine the cause.
FailedAllocs []*Allocation

// Annotations contains annotations by the scheduler to be used by operators
// to understand the decisions made by the scheduler.
Annotations *PlanAnnotations
@@ -2821,13 +2845,9 @@ func (p *Plan) AppendAlloc(alloc *Allocation) {
p.NodeAllocation[node] = append(existing, alloc)
}

func (p *Plan) AppendFailed(alloc *Allocation) {
p.FailedAllocs = append(p.FailedAllocs, alloc)
}

// IsNoOp checks if this plan would do nothing
func (p *Plan) IsNoOp() bool {
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0 && len(p.FailedAllocs) == 0
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0
}

// PlanResult is the result of a plan submitted to the leader.
@@ -2838,11 +2858,6 @@ type PlanResult struct {
// NodeAllocation contains all the allocations that were committed.
NodeAllocation map[string][]*Allocation

// FailedAllocs are allocations that could not be made,
// but are persisted so that the user can use the feedback
// to determine the cause.
FailedAllocs []*Allocation

// RefreshIndex is the index the worker should refresh state up to.
// This allows all evictions and allocations to be materialized.
// If any allocations were rejected due to stale data (node state,
@@ -2856,7 +2871,7 @@

// IsNoOp checks if this plan result would do nothing
func (p *PlanResult) IsNoOp() bool {
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0 && len(p.FailedAllocs) == 0
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0
}

// FullCommit is used to check if all the allocations in a plan
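
To close, a small hypothetical snippet (not part of the diff) illustrating what the Copy changes above are meant to guarantee: FailedTGAllocs is deep-copied, so mutating the copy never reaches back into the original evaluation. The task group name and counts are invented:

package main

import (
	"fmt"

	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	eval := &structs.Evaluation{
		ID: structs.GenerateUUID(),
		FailedTGAllocs: map[string]*structs.AllocMetric{
			"cache": {CoalescedFailures: 2},
		},
	}

	// Copy duplicates the FailedTGAllocs map and each AllocMetric via
	// metric.Copy(), so the copies are independent.
	dup := eval.Copy()
	dup.FailedTGAllocs["cache"].CoalescedFailures = 99

	fmt.Println(eval.FailedTGAllocs["cache"].CoalescedFailures) // still prints 2
}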