Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
When a Job can't schedule a Pod, log the Job's recent events (#88)
Browse files Browse the repository at this point in the history
When a Job can't schedule a Pod, log the Job's recent events

In cases where the Job definition itself is fine but the Job is unable to
schedule any Pods due to scheduling constraints (resources, node availability,
etc.), we weren't able to tell users much more than that the Pod for their flow
never started. With this change, we inspect the events related to that Job and
include them in the logs. These events include things like scheduling
constraint violations, in enough detail to help someone diagnose the issue
without going back to the cluster.

Closes #87
  • Loading branch information
chrisguidry authored Aug 29, 2023
1 parent fa0b01d commit 9ccee0a
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 2 deletions.
6 changes: 4 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

- Log more information when a Pod cannot be scheduled - [#87](https://github.com/PrefectHQ/prefect-kubernetes/issues/87)

### Added

- Handling for spot instance eviction - [#85](https://github.com/PrefectHQ/prefect-kubernetes/pull/85)
Expand Down Expand Up @@ -53,7 +55,7 @@ Released May 25th, 2023.
- Improve failure message when creating a Kubernetes job fails - [#71](https://github.com/PrefectHQ/prefect-kubernetes/pull/71)
- Stream Kubernetes Worker flow run logs to the API - [#72](https://github.com/PrefectHQ/prefect-kubernetes/pull/72)

## 0.2.7
## 0.2.7

Released May 4th, 2023.

Expand All @@ -80,7 +82,7 @@ Released April 20th, 2023.

Released April 6th, 2023.

###
###

- Temporary `prefect` version guard - [#48](https://github.com/PrefectHQ/prefect-kubernetes/pull/48)
- Advanced configuration documentation - [#50](https://github.com/PrefectHQ/prefect-kubernetes/pull/50)
Expand Down
39 changes: 39 additions & 0 deletions prefect_kubernetes/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -917,4 +917,43 @@ def _get_job_pod(

last_phase = phase

# If we've gotten here, we never found the Pod that was created for the flow run
# Job, so let's inspect the situation and log what we can find. It's possible
# that the Job ran into scheduling constraints it couldn't satisfy, like
# memory/CPU requests, or a volume that wasn't available, or a node with an
# available GPU.
logger.error(f"Job {job_name!r}: Pod never started.")
self._log_recent_job_events(logger, job_name, configuration, client)

def _log_recent_job_events(
    self,
    logger: logging.Logger,
    job_name: str,
    configuration: KubernetesWorkerJobConfiguration,
    client: "ApiClient",
) -> None:
    """Look for reasons why a Job may not have been able to schedule a Pod and log
    them to the provided logger.

    Lists every event in the configured namespace and logs, at INFO level, the
    ones whose involved object is the ``batch/v1`` Job named ``job_name``.

    Args:
        logger: Logger that receives one record per matching event.
        job_name: Name of the Job whose events should be reported.
        configuration: Job configuration; only its ``namespace`` is read here.
        client: Kubernetes API client passed to ``self._get_core_client``.
    """
    from kubernetes.client.models import CoreV1Event, CoreV1EventList

    with self._get_core_client(client) as core_client:
        events: CoreV1EventList = core_client.list_namespaced_event(
            configuration.namespace
        )
        event: CoreV1Event
        for event in events.items:
            if not (
                event.involved_object.api_version == "batch/v1"
                and event.involved_object.kind == "Job"
                and event.involved_object.namespace == configuration.namespace
                and event.involved_object.name == job_name
            ):
                continue

            # `last_timestamp` is optional on core/v1 Events: events recorded
            # through the newer events API carry `event_time` instead. This
            # method is purely diagnostic, so a missing timestamp must not
            # raise and hide the events we are trying to surface.
            timestamp = event.last_timestamp or event.event_time

            logger.info(
                "Job event %r (%s times) as of %s: %s",
                event.reason,
                event.count,
                timestamp.isoformat() if timestamp else "unknown time",
                event.message,
            )
88 changes: 88 additions & 0 deletions tests/test_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@
import prefect
import pytest
from kubernetes.client.exceptions import ApiException
from kubernetes.client.models import (
CoreV1Event,
CoreV1EventList,
V1ListMeta,
V1ObjectMeta,
V1ObjectReference,
)
from kubernetes.config import ConfigException
from prefect.client.schemas import FlowRun
from prefect.docker import get_prefect_image_name
Expand Down Expand Up @@ -2262,3 +2269,84 @@ async def test_kill_infrastructure_passes_other_k8s_api_errors_through(
grace_seconds=0,
configuration=default_configuration,
)

@pytest.fixture
def mock_events(self, mock_core_client):
    """Make ``list_namespaced_event`` return three Job events in ``default``.

    The first and last events involve the Job named ``mock-job``; the middle
    one involves a different Job so tests can check that it is filtered out.
    """

    def job_event(involved_job_name, reason, message):
        # Every event shares the same shape; only the involved Job's name,
        # the reason and the message differ between them.
        return CoreV1Event(
            metadata=V1ObjectMeta(),
            involved_object=V1ObjectReference(
                api_version="batch/v1",
                kind="Job",
                namespace="default",
                name=involved_job_name,
            ),
            reason=reason,
            count=2,
            last_timestamp=pendulum.parse("2022-01-02T03:04:05Z"),
            message=message,
        )

    event_list = CoreV1EventList(
        metadata=V1ListMeta(resource_version="1"),
        items=[
            job_event("mock-job", "StuffBlewUp", "Whew, that was baaaaad"),
            # "this-aint-me" is not the name of the Job for our flow run
            job_event(
                "this-aint-me",
                "NahChief",
                "You do not want to know about this one",
            ),
            job_event("mock-job", "StuffBlewUp", "I mean really really bad"),
        ],
    )
    mock_core_client.list_namespaced_event.return_value = event_list

async def test_explains_what_might_have_gone_wrong_in_scheduling_the_pod(
    self,
    default_configuration: KubernetesWorkerJobConfiguration,
    flow_run,
    mock_batch_client,
    mock_core_client: mock.Mock,
    mock_watch,
    mock_events,
    caplog: pytest.LogCaptureFixture,
):
    """Regression test for #87: when a pod never starts, the worker should log
    the events recorded for the job instead of only a vague failure message."""
    namespace = default_configuration.namespace
    task_status = MagicMock(spec=anyio.abc.TaskStatus)

    async with KubernetesWorker(work_pool_name="test") as k8s_worker:
        await k8s_worker.run(
            flow_run=flow_run,
            configuration=default_configuration,
            task_status=task_status,
        )

    mock_core_client.list_namespaced_event.assert_called_once_with(namespace)

    # The original error log plus the reason and messages of this job's events
    should_be_logged = (
        "Pod never started",
        "StuffBlewUp",
        "Whew, that was baaaaad",
        "I mean really really bad",
    )
    for fragment in should_be_logged:
        assert fragment in caplog.text

    # Events belonging to some other job must stay out of the logs
    should_not_be_logged = ("NahChief",)
    for fragment in should_not_be_logged:
        assert fragment not in caplog.text

0 comments on commit 9ccee0a

Please sign in to comment.