Skip to content

Commit

Permalink
Add back Worker.transition_fetch_missing (#6112)
Browse files Browse the repository at this point in the history
Fixes #5951

In #5653 we removed the fetch -> missing transition. This caused deadlocks,
so this commit adds the transition back in.
  • Loading branch information
mrocklin authored Apr 13, 2022
1 parent 7d55039 commit 4f6926e
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 2 deletions.
1 change: 0 additions & 1 deletion distributed/tests/test_cancelled_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,6 @@ async def wait_and_raise(*args, **kwargs):

b_story = b.story(fut1.key)
assert any("receive-dep-failed" in msg for msg in b_story)
assert any("missing-dep" in msg for msg in b_story)
assert any("cancelled" in msg for msg in b_story)
assert any("resumed" in msg for msg in b_story)

Expand Down
21 changes: 21 additions & 0 deletions distributed/tests/test_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3477,3 +3477,24 @@ async def test_tick_interval(c, s, a, b):
while s.workers[a.address].metrics["event_loop_interval"] < 0.100:
await asyncio.sleep(0.01)
time.sleep(0.200)


class BreakingWorker(Worker):
    """Worker whose first ``get_data`` call fails with a fake ``OSError``.

    Every later call is delegated unchanged to ``Worker.get_data``.
    """

    # Class-level default; the first failing call shadows it with an
    # instance attribute set to True, so each instance breaks exactly once.
    broke_once = False

    def get_data(self, comm, **kwargs):
        """Raise ``OSError("fake error")`` once, then defer to the parent.

        ``comm`` and ``kwargs`` are forwarded untouched to
        ``super().get_data`` on every call after the first.
        """
        if self.broke_once:
            return super().get_data(comm, **kwargs)

        # Record the failure before raising so the next call goes through.
        self.broke_once = True
        raise OSError("fake error")


@pytest.mark.slow
@gen_cluster(client=True, Worker=BreakingWorker)
async def test_broken_comm(c, s, a, b):
    """Compute the size of a task-based shuffle on ``BreakingWorker`` workers.

    The cluster is started with ``Worker=BreakingWorker``, whose first
    ``get_data`` call raises ``OSError``. The test passes when the
    computation still finishes.
    """
    timeseries = dask.datasets.timeseries(start="2000-01-01", end="2000-01-10")
    # Named ``shuffled`` so the scheduler fixture ``s`` is not rebound.
    shuffled = timeseries.shuffle("id", shuffle="tasks")
    await c.compute(shuffled.size)
18 changes: 17 additions & 1 deletion distributed/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,7 @@ def __init__(
("executing", "released"): self.transition_executing_released,
("executing", "rescheduled"): self.transition_executing_rescheduled,
("fetch", "flight"): self.transition_fetch_flight,
("fetch", "missing"): self.transition_fetch_missing,
("fetch", "released"): self.transition_generic_released,
("flight", "error"): self.transition_flight_error,
("flight", "fetch"): self.transition_flight_fetch,
Expand Down Expand Up @@ -1948,6 +1949,14 @@ def transition_flight_missing(
ts.done = False
return {}, []

def transition_fetch_missing(
    self, ts: TaskState, *, stimulus_id: str
) -> RecsInstrs:
    """Move ``ts`` into the ``"missing"`` state.

    Sets ``ts.state`` to ``"missing"``, adds the task to
    ``self._missing_dep_flight`` and clears ``ts.done``.

    ``stimulus_id`` is accepted as a keyword but is not used in the body.

    Returns an empty dict and an empty list.
    """
    ts.state = "missing"
    self._missing_dep_flight.add(ts)
    ts.done = False

    # Nothing further to schedule from here: both halves of the result
    # are empty.
    no_recommendations: dict = {}
    no_instructions: list = []
    return no_recommendations, no_instructions

def transition_released_fetch(
self, ts: TaskState, *, stimulus_id: str
) -> RecsInstrs:
Expand Down Expand Up @@ -2704,6 +2713,9 @@ def ensure_communicating(self) -> None:
if ts.state != "fetch":
continue

if self.validate:
assert ts.who_has

workers = [w for w in ts.who_has if w not in self.in_flight_workers]
if not workers:
assert ts.priority is not None
Expand Down Expand Up @@ -3032,7 +3044,11 @@ async def gather_dep(
for d in has_what:
ts = self.tasks[d]
ts.who_has.remove(worker)

if not ts.who_has:
recommendations[ts] = "missing"
self.log.append(
("missing-who-has", worker, ts.key, stimulus_id, time())
)
except Exception as e:
logger.exception(e)
if self.batched_stream and LOG_PDB:
Expand Down

0 comments on commit 4f6926e

Please sign in to comment.