-
-
Notifications
You must be signed in to change notification settings - Fork 725
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Zombie tasks after missing->released transition #5316
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -2984,6 +2984,57 @@ async def test_who_has_consistent_remove_replica(c, s, *workers): | |||||
assert s.tasks[f1.key].suspicious == 0 | ||||||
|
||||||
|
||||||
@gen_cluster(client=True) | ||||||
async def test_missing_released_zombie_tasks(c, s, a, b): | ||||||
""" | ||||||
Ensure that no fetch/flight tasks are left in the task dict of a | ||||||
worker after everything was released | ||||||
""" | ||||||
a.total_in_connections = 0 | ||||||
f1 = c.submit(inc, 1, key="f1", workers=[a.address]) | ||||||
f2 = c.submit(inc, f1, key="f2", workers=[b.address]) | ||||||
key = f1.key | ||||||
|
||||||
while key not in b.tasks or b.tasks[key].state != "fetch": | ||||||
await asyncio.sleep(0.01) | ||||||
|
||||||
await a.close(report=False) | ||||||
|
||||||
del f1, f2 | ||||||
|
||||||
while b.tasks: | ||||||
await asyncio.sleep(0.01) | ||||||
|
||||||
|
||||||
@gen_cluster(client=True) | ||||||
async def test_missing_released_zombie_tasks_2(c, s, a, b): | ||||||
a.total_in_connections = 0 | ||||||
f1 = c.submit(inc, 1, key="f1", workers=[a.address]) | ||||||
f2 = c.submit(inc, f1, key="f2", workers=[b.address]) | ||||||
|
||||||
while f1.key not in b.tasks: | ||||||
await asyncio.sleep(0) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The zero sleep is actually deliberate to avoid any timing related problems. fetch->flight->memory can be very fast and I don't want to miss the transition. My sleeping There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, the tasks will transition fo |
||||||
|
||||||
ts = b.tasks[f1.key] | ||||||
assert ts.state == "fetch" | ||||||
|
||||||
# A few things can happen to clear who_has. The dominant process is upon | ||||||
# connection failure to a worker. Regardless of how the set was cleared, the | ||||||
# task will be transitioned to missing where the worker is trying to | ||||||
# reaquire this information from the scheduler. While this is happening on | ||||||
# worker side, the tasks are released and we want to ensure that no dangling | ||||||
# zombie tasks are left on the worker | ||||||
ts.who_has.clear() | ||||||
|
||||||
del f1, f2 | ||||||
|
||||||
while b.tasks: | ||||||
await asyncio.sleep(0.01) | ||||||
|
||||||
story = b.story(ts) | ||||||
assert any("missing" in msg for msg in story) | ||||||
|
||||||
|
||||||
@pytest.mark.slow | ||||||
@gen_cluster( | ||||||
client=True, | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.