Skip to content

Commit

Permalink
Prevent issue handler from trying to handle issues on finished pods (#…
Browse files Browse the repository at this point in the history
…4057)

This was causing issues where we'd send events for long finished pods

For example a pod that fails - but the pod is kept around for a few days
for debugging purposes

Once the finished pod reached its activeDeadlineSeconds, the executor
would send a failed event and delete the pod

We don't need to look for issues on finished pods - and sending events
for finished pods is actively confusing to downstream

We now filter pods we look for issues on based on their state
  • Loading branch information
JamesMurkin authored Dec 9, 2024
1 parent 9ff8d32 commit 78d76c4
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 1 deletion.
5 changes: 5 additions & 0 deletions internal/executor/service/pod_issue_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,11 @@ func (p *IssueHandler) detectPodIssues(allManagedPods []*v1.Pod) {
if p.hasIssue(util.ExtractJobRunId(pod)) {
continue
}
if util.IsInTerminalState(pod) && util.HasCurrentStateBeenReported(pod) {
// No need to detect issues on completed pods
// This prevents us sending updates on pods that are already finished and reported
continue
}
if pod.DeletionTimestamp != nil && pod.DeletionTimestamp.Add(p.stuckTerminatingPodExpiry).Before(p.clock.Now()) {
// pod is stuck in terminating phase, this sometimes happen on node failure
// it is safer to produce failed event than retrying as the job might have run already
Expand Down
10 changes: 9 additions & 1 deletion internal/executor/service/pod_issue_handler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfStuckTerminating(t *testin
func TestPodIssueService_DeletesPodAndReportsFailed_IfExceedsActiveDeadline(t *testing.T) {
startTime := time.Now().Add(-time.Minute * 10)

completedPodPastDeadline := makePodWithDeadline(startTime, 300, 0)
completedPodPastDeadline.Status.Phase = v1.PodFailed
completedPodPastDeadline.Annotations[string(v1.PodFailed)] = "true"

tests := map[string]struct {
expectIssueDetected bool
pod *v1.Pod
Expand All @@ -100,14 +104,18 @@ func TestPodIssueService_DeletesPodAndReportsFailed_IfExceedsActiveDeadline(t *t
// Created 10 mins ago, 5 min deadline
pod: makePodWithDeadline(startTime, 300, 0),
},
"PodPastDeadline - Completed pod": {
expectIssueDetected: false,
pod: completedPodPastDeadline,
},
"PodPastDeadlineWithinTerminationGracePeriod": {
expectIssueDetected: false,
// Created 10 mins ago, 5 min deadline, 10 minute grace period
pod: makePodWithDeadline(startTime, 300, 600),
},
"PodWithNoStartTime": {
expectIssueDetected: false,
// Created 10 mins ago, 5 min deadline, no start time
// No start time so cannot determine if past its deadline
pod: makePodWithDeadline(time.Time{}, 300, 0),
},
}
Expand Down

0 comments on commit 78d76c4

Please sign in to comment.