From 8c97f7bbb3e1cebbf11cad94e2bfbcc8b14c5d1b Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Thu, 23 Jul 2020 13:14:15 -0400
Subject: [PATCH 01/61] Create runtime variables

---
 tractor/_state.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tractor/_state.py b/tractor/_state.py
index d624fc90e..2a18a4ca4 100644
--- a/tractor/_state.py
+++ b/tractor/_state.py
@@ -3,10 +3,15 @@
 """
 from typing import Optional
 from collections import Mapping
+import multiprocessing as mp
 
 import trio
 
 _current_actor: Optional['Actor'] = None  # type: ignore
+_runtime_vars = {
+    '_debug_mode': False,
+    '_is_root': False,
+}
 
 
 def current_actor() -> 'Actor':  # type: ignore
@@ -36,3 +41,20 @@ def __getitem__(self, key: str):
         except RuntimeError:
             # no local actor/task context initialized yet
             return f'no {key} context'
+
+
+def is_main_process() -> bool:
+    """Bool determining if this actor is running in the top-most process.
+    """
+    return mp.current_process().name == 'MainProcess'
+
+
+def debug_mode() -> bool:
+    """Bool determining if "debug mode" is on which enables
+    remote subactor pdb entry on crashes.
+    """
+    return _runtime_vars['_debug_mode']
+
+
+def is_root_process() -> bool:
+    return _runtime_vars['_is_root']

From b11e91375cebeabb1f320a592b29c128cf940a01 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Thu, 23 Jul 2020 13:23:55 -0400
Subject: [PATCH 02/61] Initial attempt at multi-actor debugging

Allow entering and attaching to a `pdb` instance in a child process.
The current hackery is to have the child make an rpc to the parent and
ask it to hijack stdin, once complete the child enters a `pdb` blocking
method. The parent then relays all stdin input to the child thus
controlling the "remote" debugger.

A few things were added to accomplish this:
- tracking the mapping of subactors to their parent nurseries
- in the root actor, cancelling all nurseries under the root `trio` task
  on cancellation (i.e. `Actor.cancel()`)
- pass a "runtime vars" map down the actor tree for propagating global state
---
 tractor/_actor.py    |  12 +++-
 tractor/_debug.py    | 134 +++++++++++++++++++++++++++++++++++++++++++
 tractor/_entry.py    |   3 +
 tractor/_spawn.py    |  17 +++---
 tractor/_trionics.py |   8 ++-
 5 files changed, 164 insertions(+), 10 deletions(-)
 create mode 100644 tractor/_debug.py

diff --git a/tractor/_actor.py b/tractor/_actor.py
index d59bd82e3..9053d6fbf 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -7,6 +7,7 @@
 import importlib
 import importlib.util
 import inspect
+import bdb
 import uuid
 import typing
 from typing import Dict, List, Tuple, Any, Optional
@@ -125,8 +126,15 @@ async def _invoke(
                         task_status.started(cs)
                         await chan.send({'return': await coro, 'cid': cid})
     except (Exception, trio.MultiError) as err:
-        # always ship errors back to caller
         log.exception("Actor errored:")
+
+        # NOTE: don't enter debug mode recursively after quitting pdb
+        if _state.debug_mode() and not isinstance(err, bdb.BdbQuit):
+            # Allow for pdb control in parent
+            from ._debug import post_mortem
+            await post_mortem()
+
+        # always ship errors back to caller
         err_msg = pack_error(err)
         err_msg['cid'] = cid
         try:
@@ -178,6 +186,7 @@ class Actor:
     def __init__(
         self,
         name: str,
+        *,
         rpc_module_paths: List[str] = [],
         statespace: Optional[Dict[str, Any]] = None,
         uid: str = None,
@@ -237,6 +246,7 @@ def __init__(
         self._parent_chan: Optional[Channel] = None
         self._forkserver_info: Optional[
             Tuple[Any, Any, Any, Any, Any]] = None
+        self._actoruid2nursery: Dict[str, 'ActorNursery'] = {}
 
     async def wait_for_peer(
         self, uid: Tuple[str, str]
diff --git a/tractor/_debug.py b/tractor/_debug.py
new file mode 100644
index 000000000..e462b7647
--- /dev/null
+++ b/tractor/_debug.py
@@ -0,0 +1,134 @@
+"""
+Multi-core debugging for da peeps!
+"""
+import pdb
+import sys
+import tty
+from functools import partial
+from typing import Awaitable, Tuple
+
+from async_generator import aclosing
+import tractor
+import trio
+
+from .log import get_logger
+
+log = get_logger(__name__)
+
+
+__all__ = ['breakpoint', 'post_mortem']
+
+
+# TODO: is there some way to determine this programatically?
+_pdb_exit_patterns = tuple(
+    str.encode(patt + "\n") for patt in ('c', 'cont', 'continue', 'q', 'quit')
+)
+
+
+def subactoruid2proc(
+    actor: 'Actor',  # noqa
+    uid: Tuple[str, str]
+) -> trio.Process:
+    n = actor._actoruid2nursery[uid]
+    _, proc, _ = n._children[uid]
+    return proc
+
+
+async def _hijack_stdin_relay_to_child(
+    subactor_uid: Tuple[str, str]
+) -> None:
+    actor = tractor.current_actor()
+    proc = subactoruid2proc(actor, subactor_uid)
+
+    # nlb = []
+
+    async def hijack_stdin():
+        log.info(f"Hijacking stdin from {actor.uid}")
+        # try:
+            # # disable cooked mode
+            # fd = sys.stdin.fileno()
+            # old = tty.tcgetattr(fd)
+            # tty.setcbreak(fd)
+
+        # trap std in and relay to subproc
+        async_stdin = trio.wrap_file(sys.stdin)
+
+        async with aclosing(async_stdin):
+            # while True:
+            async for msg in async_stdin:
+                log.trace(f"Stdin input:\n{msg}")
+                # nlb.append(msg)
+                # encode to bytes
+                bmsg = str.encode(msg)
+
+                # relay bytes to subproc over pipe
+                await proc.stdin.send_all(bmsg)
+
+                # line = str.encode(''.join(nlb))
+                # print(line)
+
+                if bmsg in _pdb_exit_patterns:
+                    log.info("Closing stdin hijack")
+                    break
+        # finally:
+        #     tty.tcsetattr(fd, tty.TCSAFLUSH, old)
+
+    # schedule hijacking in root scope
+    actor._root_nursery.start_soon(hijack_stdin)
+
+
+# XXX: We only make this sync in case someone wants to
+# overload the ``breakpoint()`` built-in.
+def _breakpoint(debug_func) -> Awaitable[None]:
+    """``tractor`` breakpoint entry for engaging pdb machinery
+    in subactors.
+    """
+    actor = tractor.current_actor()
+
+    async def wait_for_parent_stdin_hijack():
+        log.debug('Breakpoint engaged!')
+
+        # TODO: need a more robust check for the "root" actor
+        if actor._parent_chan:
+            async with tractor._portal.open_portal(
+                actor._parent_chan,
+                start_msg_loop=False,
+            ) as portal:
+                # with trio.fail_after(1):
+                await portal.run(
+                    'tractor._debug',
+                    '_hijack_stdin_relay_to_child',
+                    subactor_uid=actor.uid,
+                )
+
+        # block here one frame up where ``breakpoint()``
+        # was awaited and begin handling stdin
+        debug_func(actor)
+
+    # this must be awaited by caller
+    return wait_for_parent_stdin_hijack()
+
+
+def _set_trace(actor):
+    pdb.set_trace(
+        header=f"\nAttaching pdb to actor: {actor.uid}\n",
+        # start 2 levels up
+        frame=sys._getframe().f_back.f_back,
+    )
+
+
+breakpoint = partial(
+    _breakpoint,
+    _set_trace,
+)
+
+
+def _post_mortem(actor):
+    log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n")
+    pdb.post_mortem()
+
+
+post_mortem = partial(
+    _breakpoint,
+    _post_mortem,
+)
diff --git a/tractor/_entry.py b/tractor/_entry.py
index 883cc6b3b..71dfdec40 100644
--- a/tractor/_entry.py
+++ b/tractor/_entry.py
@@ -57,6 +57,9 @@ def _trio_main(
 ) -> None:
     """Entry point for a `trio_run_in_process` subactor.
     """
+    # TODO: make a global func to set this or is it too hacky?
+    # os.environ['PYTHONBREAKPOINT'] = 'tractor._debug.breakpoint'
+
     if actor.loglevel is not None:
         log.info(
             f"Setting loglevel for {actor.uid} to {actor.loglevel}")
diff --git a/tractor/_spawn.py b/tractor/_spawn.py
index f0eb01221..6d1961467 100644
--- a/tractor/_spawn.py
+++ b/tractor/_spawn.py
@@ -23,7 +23,7 @@
 from typing import Tuple
 
 from . import _forkserver_override
-from ._state import current_actor
+from ._state import current_actor, is_main_process
 from .log import get_logger
 from ._portal import Portal
 from ._actor import Actor, ActorFailure
@@ -87,12 +87,6 @@ def try_set_start_method(name: str) -> Optional[mp.context.BaseContext]:
     return _ctx
 
 
-def is_main_process() -> bool:
-    """Bool determining if this actor is running in the top-most process.
-    """
-    return mp.current_process().name == 'MainProcess'
-
-
 async def exhaust_portal(
     portal: Portal,
     actor: Actor
@@ -206,6 +200,8 @@ async def new_proc(
     # passed through to actor main
     bind_addr: Tuple[str, int],
     parent_addr: Tuple[str, int],
+    _runtime_vars: Dict[str, Any],  # serialized and sent to _child
+    *,
     use_trio_run_in_process: bool = False,
     task_status: TaskStatus[Portal] = trio.TASK_STATUS_IGNORED
 ) -> None:
@@ -241,9 +237,14 @@ async def new_proc(
                     "statespace": subactor.statespace,
                     "_arb_addr": subactor._arb_addr,
                     "bind_host": bind_addr[0],
-                    "bind_port": bind_addr[1]
+                    "bind_port": bind_addr[1],
+                    "_runtime_vars": _runtime_vars,
                 })
 
+                # track subactor in current nursery
+                curr_actor = current_actor()
+                curr_actor._actoruid2nursery[subactor.uid] = actor_nursery
+
                 # resume caller at next checkpoint now that child is up
                 task_status.started(portal)
 
diff --git a/tractor/_trionics.py b/tractor/_trionics.py
index e84614493..d51a59b8d 100644
--- a/tractor/_trionics.py
+++ b/tractor/_trionics.py
@@ -13,10 +13,11 @@
 from .log import get_logger, get_loglevel
 from ._actor import Actor
 from ._portal import Portal
+from . import _state
 from . import _spawn
 
 
-log = get_logger('tractor')
+log = get_logger(__name__)
 
 _default_bind_addr: Tuple[str, int] = ('127.0.0.1', 0)
 
@@ -58,6 +59,10 @@ async def start_actor(
     ) -> Portal:
         loglevel = loglevel or self._actor.loglevel or get_loglevel()
 
+        # configure and pass runtime state
+        _rtv = _state._runtime_vars.copy()
+        _rtv['_is_root'] = False
+
         subactor = Actor(
             name,
             # modules allowed to invoked funcs from
@@ -83,6 +88,7 @@ async def start_actor(
                 self.errors,
                 bind_addr,
                 parent_addr,
+                _rtv,  # run time vars
             )
         )
 

From b06d4b023e6bc9b27b1553b3d4505ce9d6707853 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Thu, 23 Jul 2020 13:32:29 -0400
Subject: [PATCH 03/61] Add support for "debug mode"

When enabled a crashed actor will connect to the parent with `pdb`
in post mortem mode.
---
 tractor/__init__.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tractor/__init__.py b/tractor/__init__.py
index ea61d882a..5450cbcfa 100644
--- a/tractor/__init__.py
+++ b/tractor/__init__.py
@@ -18,6 +18,7 @@
 from ._trionics import open_nursery
 from ._state import current_actor
 from ._exceptions import RemoteActorError, ModuleNotExposed
+from ._debug import breakpoint, post_mortem
 from . import msg
 from . import _spawn
 
@@ -103,14 +104,26 @@ def run(
     # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
     # OR `trio_run_in_process` (the new default).
     start_method: Optional[str] = None,
+    debug_mode: bool = False,
     **kwargs,
 ) -> Any:
     """Run a trio-actor async function in process.
 
     This is tractor's main entry and the start point for any async actor.
     """
+    # mark top most level process as root actor
+    _state._runtime_vars['_is_root'] = True
+
     if start_method is not None:
         _spawn.try_set_start_method(start_method)
+
+    if debug_mode:
+        _state._runtime_vars['_debug_mode'] = True
+
+    # expose internal debug module to every actor allowing
+    # for use of ``await tractor.breakpoint()``
+    kwargs.setdefault('rpc_module_paths', []).append('tractor._debug')
+
     return trio.run(_main, async_fn, args, kwargs, arbiter_addr, name)
 
 

From 1d1c881fd737ec9f3057a13599d1f9eed45e8176 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Thu, 23 Jul 2020 13:34:03 -0400
Subject: [PATCH 04/61] WIP debugging test script

---
 debugging/mp_debug.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 debugging/mp_debug.py

diff --git a/debugging/mp_debug.py b/debugging/mp_debug.py
new file mode 100644
index 000000000..c43a81064
--- /dev/null
+++ b/debugging/mp_debug.py
@@ -0,0 +1,28 @@
+import tractor
+import trio
+
+
+async def bubble():
+    print('IN BUBBLE')
+    await trio.sleep(.1)
+    await tractor.breakpoint()
+
+
+async def bail():
+    getattr(doggy)
+
+
+async def main():
+    """The main ``tractor`` routine.
+    """
+    async with tractor.open_nursery() as n:
+
+        # portal = await n.run_in_actor('future_self', bubble)
+        portal = await n.run_in_actor('future_self', bail)
+
+    # The ``async with`` will unblock here since the 'some_linguist'
+    # actor has completed its main task ``cellar_door``.
+
+
+if __name__ == '__main__':
+    tractor.run(main, loglevel='info', debug_mode=True)

From e7ee0fec345bb5686c53bf7d40e60942af5e9745 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sun, 26 Jul 2020 15:33:24 -0400
Subject: [PATCH 05/61] Pass a copy of the expected exposed modules

---
 tests/test_rpc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_rpc.py b/tests/test_rpc.py
index b3fa1df91..8ac9b09eb 100644
--- a/tests/test_rpc.py
+++ b/tests/test_rpc.py
@@ -63,7 +63,7 @@ def test_rpc_errors(arb_addr, to_call, testdir):
         # module should raise a ModuleNotFoundError at import
         testdir.makefile('.py', tmp_mod=funcname)
 
-        # no need to exposed module to the subactor
+        # no need to expose module to the subactor
         subactor_exposed_mods = exposed_mods
         exposed_mods = []
         func_defined = False
@@ -95,7 +95,7 @@ def run():
         tractor.run(
             main,
             arbiter_addr=arb_addr,
-            rpc_module_paths=exposed_mods,
+            rpc_module_paths=exposed_mods.copy(),
         )
 
     # handle both parameterized cases

From 8eb9a742dd9448394a2ea27152c627ac599e0d71 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sun, 26 Jul 2020 17:46:55 -0400
Subject: [PATCH 06/61] Add multi-process debugging support using `pdbpp`

This is the first step in addressing #113 and the initial support
of #130. Basically this allows (sub)processes to engage the `pdbpp`
debug machinery which reads/writes the root actor's tty but only in
a FIFO semaphored way such that no two processes are using it
simultaneously. That means you can have multiple actors enter a trace or
crash and run the debugger in a sensible way without clobbering each
other's access to stdio. It required adding some "tear down hooks" to
a custom `pdbpp.Pdb` type such that we release a child's lock on the
parent on debugger exit (in this case when either of the "continue" or
"quit" commands are issued to the debugger console).

There's some code left commented in anticipation of full support for
issue #130 where we'll need to actually capture and feed stdin to the
target (remote) actor which won't necessarily be running on the same
host.
---
 debugging/mp_debug.py |   8 +-
 tractor/_actor.py     |   8 +-
 tractor/_debug.py     | 174 ++++++++++++++++++++++++++++++------------
 3 files changed, 135 insertions(+), 55 deletions(-)

diff --git a/debugging/mp_debug.py b/debugging/mp_debug.py
index c43a81064..dfbfa9177 100644
--- a/debugging/mp_debug.py
+++ b/debugging/mp_debug.py
@@ -17,12 +17,14 @@ async def main():
     """
     async with tractor.open_nursery() as n:
 
-        # portal = await n.run_in_actor('future_self', bubble)
-        portal = await n.run_in_actor('future_self', bail)
+        portal1 = await n.run_in_actor('bubble', bubble)
+        portal = await n.run_in_actor('bail', bail)
+        # await portal.result()
+        # await portal1.result()
 
     # The ``async with`` will unblock here since the 'some_linguist'
     # actor has completed its main task ``cellar_door``.
 
 
 if __name__ == '__main__':
-    tractor.run(main, loglevel='info', debug_mode=True)
+    tractor.run(main, loglevel='critical', debug_mode=True)
diff --git a/tractor/_actor.py b/tractor/_actor.py
index 9053d6fbf..5cfbb7b30 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -126,13 +126,14 @@ async def _invoke(
                         task_status.started(cs)
                         await chan.send({'return': await coro, 'cid': cid})
     except (Exception, trio.MultiError) as err:
-        log.exception("Actor errored:")
-
         # NOTE: don't enter debug mode recursively after quitting pdb
         if _state.debug_mode() and not isinstance(err, bdb.BdbQuit):
             # Allow for pdb control in parent
             from ._debug import post_mortem
+            log.exception("Actor crashed, entering debug mode:")
             await post_mortem()
+        else:
+            log.exception("Actor crashed:")
 
         # always ship errors back to caller
         err_msg = pack_error(err)
@@ -182,6 +183,7 @@ class Actor:
 
     # Information about `__main__` from parent
     _parent_main_data: Dict[str, str]
+    _parent_chan_cs: Optional[trio.CancelScope] = None
 
     def __init__(
         self,
@@ -840,6 +842,8 @@ async def cancel(self) -> bool:
     #         for n in root.child_nurseries:
     #             n.cancel_scope.cancel()
 
+        self._parent_chan_cs.cancel()
+
     async def _cancel_task(self, cid, chan):
         """Cancel a local task by call-id / channel.
 
diff --git a/tractor/_debug.py b/tractor/_debug.py
index e462b7647..5d3626be4 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -1,9 +1,7 @@
 """
 Multi-core debugging for da peeps!
 """
-import pdb
 import sys
-import tty
 from functools import partial
 from typing import Awaitable, Tuple
 
@@ -13,6 +11,16 @@
 
 from .log import get_logger
 
+try:
+    # wtf: only exported when installed in dev mode?
+    import pdbpp
+except ImportError:
+    # pdbpp is installed in regular mode...
+    import pdb
+    assert pdb.xpm, "pdbpp is not installed?"
+    pdbpp = pdb
+
+
 log = get_logger(__name__)
 
 
@@ -25,56 +33,96 @@
 )
 
 
-def subactoruid2proc(
-    actor: 'Actor',  # noqa
-    uid: Tuple[str, str]
-) -> trio.Process:
-    n = actor._actoruid2nursery[uid]
-    _, proc, _ = n._children[uid]
-    return proc
+_pdb_release_hook = None
+
+
+class PdbwTeardown(pdbpp.Pdb):
+    """Add teardown hooks to the regular ``pdbpp.Pdb``.
+    """
+    # TODO: figure out how to dissallow recursive .set_trace() entry
+    # since that'll cause deadlock for us.
+    def set_continue(self):
+        super().set_continue()
+        self.config.teardown(self)
+
+    def set_quit(self):
+        super().set_quit()
+        self.config.teardown(self)
+
+
+class TractorConfig(pdbpp.DefaultConfig):
+    """Custom ``pdbpp`` goodness.
+    """
+    sticky_by_default = True
+
+    def teardown(self, _pdb):
+        _pdb_release_hook(_pdb)
+
+
+# override the pdbpp config with our coolio one
+pdbpp.Pdb.DefaultConfig = TractorConfig
+
+
+# TODO: will be needed whenever we get to true remote debugging.
+# XXX see https://github.com/goodboy/tractor/issues/130
+
+# def subactoruid2proc(
+#     actor: 'Actor',  # noqa
+#     uid: Tuple[str, str]
+# ) -> trio.Process:
+#     n = actor._actoruid2nursery[uid]
+#     _, proc, _ = n._children[uid]
+#     return proc
+
+# async def hijack_stdin():
+#     log.info(f"Hijacking stdin from {actor.uid}")
+
+#     trap std in and relay to subproc
+#     async_stdin = trio.wrap_file(sys.stdin)
+
+#     async with aclosing(async_stdin):
+#         async for msg in async_stdin:
+#             log.trace(f"Stdin input:\n{msg}")
+#             # encode to bytes
+#             bmsg = str.encode(msg)
+
+#             # relay bytes to subproc over pipe
+#             # await proc.stdin.send_all(bmsg)
+
+#             if bmsg in _pdb_exit_patterns:
+#                 log.info("Closing stdin hijack")
+#                 break
 
 
 async def _hijack_stdin_relay_to_child(
     subactor_uid: Tuple[str, str]
 ) -> None:
     actor = tractor.current_actor()
-    proc = subactoruid2proc(actor, subactor_uid)
-
-    # nlb = []
-
-    async def hijack_stdin():
-        log.info(f"Hijacking stdin from {actor.uid}")
-        # try:
-            # # disable cooked mode
-            # fd = sys.stdin.fileno()
-            # old = tty.tcgetattr(fd)
-            # tty.setcbreak(fd)
-
-        # trap std in and relay to subproc
-        async_stdin = trio.wrap_file(sys.stdin)
+    debug_lock = actor.statespace.setdefault(
+        '_debug_lock', trio.StrictFIFOLock()
+    )
 
-        async with aclosing(async_stdin):
-            # while True:
-            async for msg in async_stdin:
-                log.trace(f"Stdin input:\n{msg}")
-                # nlb.append(msg)
-                # encode to bytes
-                bmsg = str.encode(msg)
+    log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock")
+    await debug_lock.acquire()
+    log.warning(f"Actor {subactor_uid} acquired stdin hijack lock")
 
-                # relay bytes to subproc over pipe
-                await proc.stdin.send_all(bmsg)
+    # TODO: when we get to true remote debugging
+    # this will deliver stdin data
+    try:
+        # indicate to child that we've locked stdio
+        yield 'Locked'
 
-                # line = str.encode(''.join(nlb))
-                # print(line)
+        # wait for cancellation of stream by child
+        await trio.sleep_forever()
 
-                if bmsg in _pdb_exit_patterns:
-                    log.info("Closing stdin hijack")
-                    break
-        # finally:
-        #     tty.tcsetattr(fd, tty.TCSAFLUSH, old)
+        # TODO: for remote debugging schedule hijacking in root scope
+        # (see above)
+        # actor._root_nursery.start_soon(hijack_stdin)
 
-    # schedule hijacking in root scope
-    actor._root_nursery.start_soon(hijack_stdin)
+    finally:
+        if debug_lock.locked():
+            debug_lock.release()
+            log.debug(f"Actor {subactor_uid} released stdin hijack lock")
 
 
 # XXX: We only make this sync in case someone wants to
@@ -84,35 +132,61 @@ def _breakpoint(debug_func) -> Awaitable[None]:
     in subactors.
     """
     actor = tractor.current_actor()
+    do_unlock = trio.Event()
 
-    async def wait_for_parent_stdin_hijack():
-        log.debug('Breakpoint engaged!')
+    async def wait_for_parent_stdin_hijack(
+        task_status=trio.TASK_STATUS_IGNORED
+    ):
 
         # TODO: need a more robust check for the "root" actor
         if actor._parent_chan:
             async with tractor._portal.open_portal(
                 actor._parent_chan,
                 start_msg_loop=False,
+                shield=True,
             ) as portal:
                 # with trio.fail_after(1):
-                await portal.run(
+                agen = await portal.run(
                     'tractor._debug',
                     '_hijack_stdin_relay_to_child',
                     subactor_uid=actor.uid,
                 )
+                async with aclosing(agen):
+                    async for val in agen:
+                        assert val == 'Locked'
+                        task_status.started()
+                        with trio.CancelScope(shield=True):
+                            await do_unlock.wait()
+
+                            # trigger cancellation of remote stream
+                            break
+
+                log.debug(f"Child {actor} released parent stdio lock")
+
+    def unlock(_pdb):
+        do_unlock.set()
+
+    global _pdb_release_hook
+    _pdb_release_hook = unlock
+
+    async def _bp():
+        # this must be awaited by caller
+        await actor._root_nursery.start(
+            wait_for_parent_stdin_hijack
+        )
 
         # block here one frame up where ``breakpoint()``
         # was awaited and begin handling stdin
         debug_func(actor)
 
-    # this must be awaited by caller
-    return wait_for_parent_stdin_hijack()
+    # return wait_for_parent_stdin_hijack()
+    return _bp()
 
 
 def _set_trace(actor):
-    pdb.set_trace(
-        header=f"\nAttaching pdb to actor: {actor.uid}\n",
-        # start 2 levels up
+    log.critical(f"\nAttaching pdb to actor: {actor.uid}\n")
+    PdbwTeardown().set_trace(
+        # start 2 levels up in user code
         frame=sys._getframe().f_back.f_back,
     )
 
@@ -125,7 +199,7 @@ def _set_trace(actor):
 
 def _post_mortem(actor):
     log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n")
-    pdb.post_mortem()
+    pdbpp.xpm(Pdb=PdbwTeardown)
 
 
 post_mortem = partial(

From f7cd2be039c490897d40c1d8af214320116b0675 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Wed, 29 Jul 2020 20:48:15 -0400
Subject: [PATCH 07/61] Play with re-entrant trace

---
 debugging/mp_debug.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/debugging/mp_debug.py b/debugging/mp_debug.py
index dfbfa9177..a27971590 100644
--- a/debugging/mp_debug.py
+++ b/debugging/mp_debug.py
@@ -4,8 +4,9 @@
 
 async def bubble():
     print('IN BUBBLE')
-    await trio.sleep(.1)
-    await tractor.breakpoint()
+    while True:
+        await trio.sleep(.1)
+        await tractor.breakpoint()
 
 
 async def bail():
@@ -27,4 +28,4 @@ async def main():
 
 
 if __name__ == '__main__':
-    tractor.run(main, loglevel='critical', debug_mode=True)
+    tractor.run(main, loglevel='error', debug_mode=True)

From efd7095cf850e5b14b28767521f48eeb12a30eb0 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Wed, 29 Jul 2020 20:57:10 -0400
Subject: [PATCH 08/61] Add pdbpp as dep

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 24d44799e..e31e35957 100755
--- a/setup.py
+++ b/setup.py
@@ -39,7 +39,7 @@
     ],
     install_requires=[
         'msgpack', 'trio>0.8', 'async_generator', 'colorlog', 'wrapt',
-        'trio_typing'
+        'trio_typing', 'pdbpp',
     ],
     tests_require=['pytest'],
     python_requires=">=3.7",

From abaa2f5da0a47637d429abfcec9f8d8357fda0cc Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Thu, 30 Jul 2020 10:24:08 -0400
Subject: [PATCH 09/61] Drop unneeded `parent_chan_cs()` cancel call

---
 tractor/_actor.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tractor/_actor.py b/tractor/_actor.py
index 5cfbb7b30..a02285f90 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -842,8 +842,6 @@ async def cancel(self) -> bool:
     #         for n in root.child_nurseries:
     #             n.cancel_scope.cancel()
 
-        self._parent_chan_cs.cancel()
-
     async def _cancel_task(self, cid, chan):
         """Cancel a local task by call-id / channel.
 

From 68773d51fd10c7e56499308f51d1ea4f3343d26a Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Thu, 30 Jul 2020 10:41:58 -0400
Subject: [PATCH 10/61] Always expose the debug module

---
 tractor/_actor.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tractor/_actor.py b/tractor/_actor.py
index a02285f90..3e244fa5d 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -206,8 +206,11 @@ def __init__(
         # will be passed to children
         self._parent_main_data = _mp_fixup_main._mp_figure_out_main()
 
+        # always include debugging tools module
+        rpc_module_paths.append('tractor._debug')
+
         mods = {}
-        for name in rpc_module_paths or ():
+        for name in rpc_module_paths:
             mod = importlib.import_module(name)
             mods[name] = _get_mod_abspath(mod)
 

From f9ef3fc5de0a5fb7d9d5e6f88aa93205069adcd7 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Thu, 30 Jul 2020 10:42:22 -0400
Subject: [PATCH 11/61] Cleanups and more comments

---
 tractor/_debug.py | 53 +++++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/tractor/_debug.py b/tractor/_debug.py
index 5d3626be4..e5da9a53c 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -3,7 +3,7 @@
 """
 import sys
 from functools import partial
-from typing import Awaitable, Tuple
+from typing import Awaitable, Tuple, Optional, Callable
 
 from async_generator import aclosing
 import tractor
@@ -27,18 +27,25 @@
 __all__ = ['breakpoint', 'post_mortem']
 
 
-# TODO: is there some way to determine this programatically?
-_pdb_exit_patterns = tuple(
-    str.encode(patt + "\n") for patt in ('c', 'cont', 'continue', 'q', 'quit')
-)
+# placeholder for function to set a ``trio.Event``
+_pdb_release_hook: Optional[Callable] = None
 
 
-_pdb_release_hook = None
+class TractorConfig(pdbpp.DefaultConfig):
+    """Custom ``pdbpp`` goodness.
+    """
+    sticky_by_default = True
+
+    def teardown(self, _pdb):
+        _pdb_release_hook(_pdb)
 
 
 class PdbwTeardown(pdbpp.Pdb):
     """Add teardown hooks to the regular ``pdbpp.Pdb``.
     """
+    # override the pdbpp config with our coolio one
+    DefaultConfig = TractorConfig
+
     # TODO: figure out how to dissallow recursive .set_trace() entry
     # since that'll cause deadlock for us.
     def set_continue(self):
@@ -50,22 +57,15 @@ def set_quit(self):
         self.config.teardown(self)
 
 
-class TractorConfig(pdbpp.DefaultConfig):
-    """Custom ``pdbpp`` goodness.
-    """
-    sticky_by_default = True
-
-    def teardown(self, _pdb):
-        _pdb_release_hook(_pdb)
-
-
-# override the pdbpp config with our coolio one
-pdbpp.Pdb.DefaultConfig = TractorConfig
-
-
 # TODO: will be needed whenever we get to true remote debugging.
 # XXX see https://github.com/goodboy/tractor/issues/130
 
+# # TODO: is there some way to determine this programatically?
+# _pdb_exit_patterns = tuple(
+#     str.encode(patt + "\n") for patt in (
+#         'c', 'cont', 'continue', 'q', 'quit')
+# )
+
 # def subactoruid2proc(
 #     actor: 'Actor',  # noqa
 #     uid: Tuple[str, str]
@@ -170,17 +170,16 @@ def unlock(_pdb):
     _pdb_release_hook = unlock
 
     async def _bp():
-        # this must be awaited by caller
-        await actor._root_nursery.start(
-            wait_for_parent_stdin_hijack
-        )
+        # this **must** be awaited by the caller and is done using the
+        # root nursery so that the debugger can continue to run without
+        # being restricted by the scope of a new task nursery.
+        await actor._root_nursery.start(wait_for_parent_stdin_hijack)
 
-        # block here one frame up where ``breakpoint()``
-        # was awaited and begin handling stdin
+        # block here one (at the appropriate frame *up* where
+        # ``breakpoint()`` was awaited and begin handling stdio
         debug_func(actor)
 
-    # return wait_for_parent_stdin_hijack()
-    return _bp()
+    return _bp()  # user code **must** await this!
 
 
 def _set_trace(actor):

From ebb21b9ba3a20fac2c3a7f9ce9825da413098a16 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sat, 1 Aug 2020 13:39:05 -0400
Subject: [PATCH 12/61] Support re-entrant breakpoints

Keep an actor local (bool) flag which determines if there is already
a running debugger instance for the current process. If another task
tries to enter in this case, simply ignore it since allowing entry may
result in a deadlock where the new task will be sync waiting on the
parent stdio lock (a case that will never arrive due to the current
debugger's active use of it).

In the future we may want to allow FIFO queueing of local tasks where
instead of ignoring re-entrant breakpoints we allow tasks to async wait
for debugger release, though not sure the implications of that since
you'd likely want to support switching the debugger to the new task and
that could cause deadlocks where tasks are inter-dependent. It may be
more sane to just error on multiple breakpoint requests within an actor.
---
 tractor/_debug.py | 75 +++++++++++++++++++++++++++--------------------
 1 file changed, 44 insertions(+), 31 deletions(-)

diff --git a/tractor/_debug.py b/tractor/_debug.py
index e5da9a53c..b3a630b95 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -36,8 +36,8 @@ class TractorConfig(pdbpp.DefaultConfig):
     """
     sticky_by_default = True
 
-    def teardown(self, _pdb):
-        _pdb_release_hook(_pdb)
+    def teardown(self):
+        _pdb_release_hook()
 
 
 class PdbwTeardown(pdbpp.Pdb):
@@ -50,11 +50,11 @@ class PdbwTeardown(pdbpp.Pdb):
     # since that'll cause deadlock for us.
     def set_continue(self):
         super().set_continue()
-        self.config.teardown(self)
+        self.config.teardown()
 
     def set_quit(self):
         super().set_quit()
-        self.config.teardown(self)
+        self.config.teardown()
 
 
 # TODO: will be needed whenever we get to true remote debugging.
@@ -140,36 +140,48 @@ async def wait_for_parent_stdin_hijack(
 
         # TODO: need a more robust check for the "root" actor
         if actor._parent_chan:
-            async with tractor._portal.open_portal(
-                actor._parent_chan,
-                start_msg_loop=False,
-                shield=True,
-            ) as portal:
-                # with trio.fail_after(1):
-                agen = await portal.run(
-                    'tractor._debug',
-                    '_hijack_stdin_relay_to_child',
-                    subactor_uid=actor.uid,
-                )
-                async with aclosing(agen):
-                    async for val in agen:
-                        assert val == 'Locked'
-                        task_status.started()
-                        with trio.CancelScope(shield=True):
-                            await do_unlock.wait()
-
-                            # trigger cancellation of remote stream
-                            break
-
+            try:
+                async with tractor._portal.open_portal(
+                    actor._parent_chan,
+                    start_msg_loop=False,
+                    shield=True,
+                ) as portal:
+                    # with trio.fail_after(1):
+                    agen = await portal.run(
+                        'tractor._debug',
+                        '_hijack_stdin_relay_to_child',
+                        subactor_uid=actor.uid,
+                    )
+                    async with aclosing(agen):
+                        async for val in agen:
+                            assert val == 'Locked'
+                            task_status.started()
+                            with trio.CancelScope(shield=True):
+                                await do_unlock.wait()
+
+                                # trigger cancellation of remote stream
+                                break
+            finally:
+                log.debug(f"Exiting debugger for actor {actor}")
+                actor.statespace['_in_debug'] = False
                 log.debug(f"Child {actor} released parent stdio lock")
 
-    def unlock(_pdb):
-        do_unlock.set()
+    async def _bp():
+        """Async breakpoint which schedules a parent stdio lock, and once complete
+        enters the ``pdbpp`` debugging console.
+        """
+        in_debug = actor.statespace.setdefault('_in_debug', False)
 
-    global _pdb_release_hook
-    _pdb_release_hook = unlock
+        if in_debug:
+            log.warning(f"Actor {actor} already has a debug lock, skipping...")
+            return
+
+        # assign unlock callback for debugger teardown hooks
+        global _pdb_release_hook
+        _pdb_release_hook = do_unlock.set
+
+        actor.statespace['_in_debug'] = True
 
-    async def _bp():
         # this **must** be awaited by the caller and is done using the
         # root nursery so that the debugger can continue to run without
         # being restricted by the scope of a new task nursery.
@@ -179,7 +191,8 @@ async def _bp():
         # ``breakpoint()`` was awaited and begin handling stdio
         debug_func(actor)
 
-    return _bp()  # user code **must** await this!
+    # user code **must** await this!
+    return _bp()
 
 
 def _set_trace(actor):

From fd5fb9241a15ee66333c5898922e12c6d0855f23 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sun, 2 Aug 2020 22:30:03 -0400
Subject: [PATCH 13/61] Sparsen some lines

---
 tractor/__init__.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tractor/__init__.py b/tractor/__init__.py
index 5450cbcfa..e53dfa4fe 100644
--- a/tractor/__init__.py
+++ b/tractor/__init__.py
@@ -54,9 +54,14 @@ async def _main(
     """Async entry point for ``tractor``.
     """
     logger = log.get_logger('tractor')
+
     main = partial(async_fn, *args)
+
     arbiter_addr = (host, port) = arbiter_addr or (
-            _default_arbiter_host, _default_arbiter_port)
+        _default_arbiter_host,
+        _default_arbiter_port
+    )
+
     loglevel = kwargs.get('loglevel', log.get_loglevel())
     if loglevel is not None:
         log._default_loglevel = loglevel
@@ -99,7 +104,9 @@ def run(
     *args,
     name: Optional[str] = None,
     arbiter_addr: Tuple[str, int] = (
-        _default_arbiter_host, _default_arbiter_port),
+        _default_arbiter_host,
+        _default_arbiter_port,
+    ),
     # either the `multiprocessing` start method:
     # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
     # OR `trio_run_in_process` (the new default).

From bd157e05efecde891e4e61fdf488991dfe766997 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sat, 8 Aug 2020 23:00:56 -0400
Subject: [PATCH 14/61] Port to service nursery

---
 tractor/_debug.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tractor/_debug.py b/tractor/_debug.py
index b3a630b95..baa293e9f 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -185,7 +185,7 @@ async def _bp():
         # this **must** be awaited by the caller and is done using the
         # root nursery so that the debugger can continue to run without
         # being restricted by the scope of a new task nursery.
-        await actor._root_nursery.start(wait_for_parent_stdin_hijack)
+        await actor._service_n.start(wait_for_parent_stdin_hijack)
 
         # block here one (at the appropriate frame *up* where
         # ``breakpoint()`` was awaited and begin handling stdio

From 291ecec070e87248efe32f86ce4a025439485ecd Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Thu, 6 Aug 2020 23:47:43 -0400
Subject: [PATCH 15/61] Maybe not sticky by default

---
 tractor/_debug.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tractor/_debug.py b/tractor/_debug.py
index baa293e9f..7e92a0334 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -34,7 +34,7 @@
 class TractorConfig(pdbpp.DefaultConfig):
     """Custom ``pdbpp`` goodness.
     """
-    sticky_by_default = True
+    # sticky_by_default = True
 
     def teardown(self):
         _pdb_release_hook()

From 150179bfe4de749dda0a6bb18f80cead2c8217cc Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sat, 12 Sep 2020 11:47:14 -0400
Subject: [PATCH 16/61] Support entering post mortem on crashes in root actor

---
 tractor/_debug.py | 131 ++++++++++++++++++++++++++++++----------------
 1 file changed, 86 insertions(+), 45 deletions(-)

diff --git a/tractor/_debug.py b/tractor/_debug.py
index 7e92a0334..c31f16ffb 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -1,8 +1,10 @@
 """
 Multi-core debugging for da peeps!
 """
+import bdb
 import sys
 from functools import partial
+from contextlib import asynccontextmanager, AsyncExitStack
 from typing import Awaitable, Tuple, Optional, Callable
 
 from async_generator import aclosing
@@ -10,6 +12,7 @@
 import trio
 
 from .log import get_logger
+from . import _state
 
 try:
     # wtf: only exported when installed in dev mode?
@@ -92,37 +95,40 @@ def set_quit(self):
 #             if bmsg in _pdb_exit_patterns:
 #                 log.info("Closing stdin hijack")
 #                 break
+@asynccontextmanager
+async def _acquire_debug_lock():
+    """Acquire a actor local FIFO lock meant to mutex entry to a local
+    debugger entry point to avoid tty clobbering by multiple processes.
+    """
+    try:
+        actor = tractor.current_actor()
+        debug_lock = actor.statespace.setdefault(
+            '_debug_lock', trio.StrictFIFOLock()
+        )
+        await debug_lock.acquire()
+        log.error("TTY lock acquired")
+        yield
+    finally:
+        if debug_lock.locked():
+            debug_lock.release()
+        log.error("TTY lock released")
 
 
 async def _hijack_stdin_relay_to_child(
     subactor_uid: Tuple[str, str]
 ) -> None:
-    actor = tractor.current_actor()
-    debug_lock = actor.statespace.setdefault(
-        '_debug_lock', trio.StrictFIFOLock()
-    )
-
-    log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock")
-    await debug_lock.acquire()
-    log.warning(f"Actor {subactor_uid} acquired stdin hijack lock")
-
     # TODO: when we get to true remote debugging
     # this will deliver stdin data
-    try:
+    log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock")
+    async with _acquire_debug_lock():
+        log.warning(f"Actor {subactor_uid} acquired stdin hijack lock")
         # indicate to child that we've locked stdio
         yield 'Locked'
 
         # wait for cancellation of stream by child
         await trio.sleep_forever()
 
-        # TODO: for remote debugging schedule hijacking in root scope
-        # (see above)
-        # actor._root_nursery.start_soon(hijack_stdin)
-
-    finally:
-        if debug_lock.locked():
-            debug_lock.release()
-            log.debug(f"Actor {subactor_uid} released stdin hijack lock")
+    log.debug(f"Actor {subactor_uid} released stdin hijack lock")
 
 
 # XXX: We only make this sync in case someone wants to
@@ -137,34 +143,31 @@ def _breakpoint(debug_func) -> Awaitable[None]:
     async def wait_for_parent_stdin_hijack(
         task_status=trio.TASK_STATUS_IGNORED
     ):
-
-        # TODO: need a more robust check for the "root" actor
-        if actor._parent_chan:
-            try:
-                async with tractor._portal.open_portal(
-                    actor._parent_chan,
-                    start_msg_loop=False,
-                    shield=True,
-                ) as portal:
-                    # with trio.fail_after(1):
+        try:
+            async with tractor._portal.open_portal(
+                actor._parent_chan,
+                start_msg_loop=False,
+                shield=True,
+            ) as portal:
+                with trio.fail_after(1):
                     agen = await portal.run(
                         'tractor._debug',
                         '_hijack_stdin_relay_to_child',
                         subactor_uid=actor.uid,
                     )
-                    async with aclosing(agen):
-                        async for val in agen:
-                            assert val == 'Locked'
-                            task_status.started()
-                            with trio.CancelScope(shield=True):
-                                await do_unlock.wait()
-
-                                # trigger cancellation of remote stream
-                                break
-            finally:
-                log.debug(f"Exiting debugger for actor {actor}")
-                actor.statespace['_in_debug'] = False
-                log.debug(f"Child {actor} released parent stdio lock")
+                async with aclosing(agen):
+                    async for val in agen:
+                        assert val == 'Locked'
+                        task_status.started()
+                        with trio.CancelScope(shield=True):
+                            await do_unlock.wait()
+
+                            # trigger cancellation of remote stream
+                            break
+        finally:
+            log.debug(f"Exiting debugger for actor {actor}")
+            actor.statespace['_in_debug'] = False
+            log.debug(f"Child {actor} released parent stdio lock")
 
     async def _bp():
         """Async breakpoint which schedules a parent stdio lock, and once complete
@@ -182,10 +185,27 @@ async def _bp():
 
         actor.statespace['_in_debug'] = True
 
-        # this **must** be awaited by the caller and is done using the
-        # root nursery so that the debugger can continue to run without
-        # being restricted by the scope of a new task nursery.
-        await actor._service_n.start(wait_for_parent_stdin_hijack)
+        # TODO: need a more robust check for the "root" actor
+        if actor._parent_chan:
+            # this **must** be awaited by the caller and is done using the
+            # root nursery so that the debugger can continue to run without
+            # being restricted by the scope of a new task nursery.
+            await actor._service_n.start(wait_for_parent_stdin_hijack)
+
+            # block here one (at the appropriate frame *up* where
+            # ``breakpoint()`` was awaited and begin handling stdio
+            # debug_func(actor)
+        else:
+            # we also wait in the root-parent for any child that
+            # may have the tty locked prior
+            async def _lock(
+                task_status=trio.TASK_STATUS_IGNORED
+            ):
+                async with _acquire_debug_lock():
+                    task_status.started()
+                    await do_unlock.wait()
+
+            await actor._service_n.start(_lock)
 
         # block here one (at the appropriate frame *up* where
         # ``breakpoint()`` was awaited and begin handling stdio
@@ -218,3 +238,24 @@ def _post_mortem(actor):
     _breakpoint,
     _post_mortem,
 )
+
+
+async def _maybe_enter_pm(err):
+    if (
+        _state.debug_mode()
+        and not isinstance(err, bdb.BdbQuit)
+
+        # XXX: if the error is the likely result of runtime-wide
+        # cancellation, we don't want to enter the debugger since
+        # there's races between when the parent actor has killed all
+        # comms and when the child tries to contact said parent to
+        # acquire the tty lock.
+        # Really we just want to mostly avoid catching KBIs here so there
+        # might be a simpler check we can do?
+        and trio.MultiError.filter(
+            lambda exc: exc if not isinstance(exc, trio.Cancelled) else None,
+            err,
+        )
+    ):
+        log.warning("Actor crashed, entering debug mode")
+        await post_mortem()

From 8b6e9f5530be07645f4a2c6a5a21b0dd94877321 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sat, 12 Sep 2020 11:48:20 -0400
Subject: [PATCH 17/61] Port to new debug api, set `_is_root` state flag on
 startup

---
 tractor/_actor.py | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/tractor/_actor.py b/tractor/_actor.py
index 3e244fa5d..5d2a35dea 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -7,7 +7,6 @@
 import importlib
 import importlib.util
 import inspect
-import bdb
 import uuid
 import typing
 from typing import Dict, List, Tuple, Any, Optional
@@ -27,6 +26,7 @@
     unpack_error,
     ModuleNotExposed
 )
+from ._debug import _maybe_enter_pm
 from ._discovery import get_arbiter
 from ._portal import Portal
 from . import _state
@@ -127,13 +127,8 @@ async def _invoke(
                         await chan.send({'return': await coro, 'cid': cid})
     except (Exception, trio.MultiError) as err:
         # NOTE: don't enter debug mode recursively after quitting pdb
-        if _state.debug_mode() and not isinstance(err, bdb.BdbQuit):
-            # Allow for pdb control in parent
-            from ._debug import post_mortem
-            log.exception("Actor crashed, entering debug mode:")
-            await post_mortem()
-        else:
-            log.exception("Actor crashed:")
+        log.exception("Actor crashed:")
+        await _maybe_enter_pm(err)
 
         # always ship errors back to caller
         err_msg = pack_error(err)
@@ -201,6 +196,7 @@ def __init__(
         """
         self.name = name
         self.uid = (name, uid or str(uuid.uuid4()))
+        self._is_cancelled: bool = False
 
         # retreive and store parent `__main__` data which
         # will be passed to children
@@ -251,7 +247,7 @@ def __init__(
         self._parent_chan: Optional[Channel] = None
         self._forkserver_info: Optional[
             Tuple[Any, Any, Any, Any, Any]] = None
-        self._actoruid2nursery: Dict[str, 'ActorNursery'] = {}
+        self._actoruid2nursery: Dict[str, 'ActorNursery'] = {}  # noqa
 
     async def wait_for_peer(
         self, uid: Tuple[str, str]
@@ -564,7 +560,7 @@ async def _process_messages(
                 f"Exiting msg loop for {chan} from {chan.uid} "
                 f"with last msg:\n{msg}")
 
-    async def _chan_to_parent(
+    async def _from_parent(
         self,
         parent_addr: Optional[Tuple[str, int]],
     ) -> Tuple[Channel, Optional[Tuple[str, int]]]:
@@ -593,6 +589,10 @@ async def _chan_to_parent(
                     parent_data.pop('bind_host'),
                     parent_data.pop('bind_port'),
                 )
+                rvs = parent_data.pop('_runtime_vars')
+                rvs['_is_root'] = False
+                _state._runtime_vars.update(rvs)
+
                 for attr, value in parent_data.items():
                     setattr(self, attr, value)
 
@@ -630,13 +630,13 @@ async def _async_main(
             # establish primary connection with immediate parent
             self._parent_chan = None
             if parent_addr is not None:
-                self._parent_chan, accept_addr_from_rent = await self._chan_to_parent(
+                self._parent_chan, accept_addr_rent = await self._from_parent(
                     parent_addr)
 
                 # either it's passed in because we're not a child
                 # or because we're running in mp mode
-                if accept_addr_from_rent is not None:
-                    accept_addr = accept_addr_from_rent
+                if accept_addr_rent is not None:
+                    accept_addr = accept_addr_rent
 
             # load exposed/allowed RPC modules
             # XXX: do this **after** establishing a channel to the parent
@@ -711,7 +711,7 @@ async def _async_main(
 
             # Blocks here as expected until the root nursery is
             # killed (i.e. this actor is cancelled or signalled by the parent)
-        except (trio.MultiError, Exception) as err:
+        except Exception as err:
             if not registered_with_arbiter:
                 # TODO: I guess we could try to connect back
                 # to the parent through a channel and engage a debugger
@@ -822,6 +822,8 @@ async def cancel(self) -> bool:
               spawning new rpc tasks
             - return control the parent channel message loop
         """
+        self._is_cancelled = True
+
         # cancel all ongoing rpc tasks
         with trio.CancelScope(shield=True):
             # kill all ongoing tasks
@@ -1039,7 +1041,12 @@ async def _start_actor(
                 parent_addr=None
             )
         )
-        result = await main()
+        try:
+            result = await main()
+        except (Exception, trio.MultiError) as err:
+            log.exception("Actor crashed:")
+            await _maybe_enter_pm(err)
+            raise
 
         # XXX: the actor is cancelled when this context is complete
         # given that there are no more active peer channels connected

From 09daba4c9c25bf666e4c526321cf05fc43ddd172 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sat, 12 Sep 2020 11:48:57 -0400
Subject: [PATCH 18/61] Explicitly handle `debug_mode` flag correctly

---
 tractor/__init__.py | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/tractor/__init__.py b/tractor/__init__.py
index e53dfa4fe..b65811ac8 100644
--- a/tractor/__init__.py
+++ b/tractor/__init__.py
@@ -47,14 +47,25 @@
 async def _main(
     async_fn: typing.Callable[..., typing.Awaitable],
     args: Tuple,
-    kwargs: typing.Dict[str, typing.Any],
     arbiter_addr: Tuple[str, int],
     name: Optional[str] = None,
+    start_method: Optional[str] = None,
+    debug_mode: bool = False,
+    **kwargs: typing.Dict[str, typing.Any],
 ) -> typing.Any:
     """Async entry point for ``tractor``.
     """
     logger = log.get_logger('tractor')
 
+    if start_method is not None:
+        _spawn.try_set_start_method(start_method)
+
+    if debug_mode:
+        _state._runtime_vars['_debug_mode'] = True
+        # expose internal debug module to every actor allowing
+        # for use of ``await tractor.breakpoint()``
+        kwargs.setdefault('rpc_module_paths', []).append('tractor._debug')
+
     main = partial(async_fn, *args)
 
     arbiter_addr = (host, port) = arbiter_addr or (
@@ -109,7 +120,7 @@ def run(
     ),
     # either the `multiprocessing` start method:
     # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
-    # OR `trio_run_in_process` (the new default).
+    # OR `trio` (the new default).
     start_method: Optional[str] = None,
     debug_mode: bool = False,
     **kwargs,
@@ -121,17 +132,23 @@ def run(
     # mark top most level process as root actor
     _state._runtime_vars['_is_root'] = True
 
-    if start_method is not None:
-        _spawn.try_set_start_method(start_method)
-
-    if debug_mode:
-        _state._runtime_vars['_debug_mode'] = True
+    return trio.run(
+        partial(
+            # our entry
+            _main,
 
-    # expose internal debug module to every actor allowing
-    # for use of ``await tractor.breakpoint()``
-    kwargs.setdefault('rpc_module_paths', []).append('tractor._debug')
+            # user entry point
+            async_fn,
+            args,
 
-    return trio.run(_main, async_fn, args, kwargs, arbiter_addr, name)
+            # global kwargs
+            arbiter_addr=arbiter_addr,
+            name=name,
+            start_method=start_method,
+            debug_mode=debug_mode,
+            **kwargs,
+        )
+    )
 
 
 def run_daemon(

From 9e1d9a8ce11d865dd0ed9ee5ad8263882f8e527b Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Thu, 24 Sep 2020 09:59:18 -0400
Subject: [PATCH 19/61] Add an internal context stack

This aids with tearing down resources **after** the crash handling and
debugger have completed. Leaving this internal for now but should
eventually get a public convenience function like
`tractor.context_stack()`.
---
 tractor/_actor.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tractor/_actor.py b/tractor/_actor.py
index 5d2a35dea..e5b8ef0d7 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -13,6 +13,7 @@
 from types import ModuleType
 import sys
 import os
+from contextlib import ExitStack
 
 import trio  # type: ignore
 from trio_typing import TaskStatus
@@ -150,7 +151,7 @@ async def _invoke(
             # If we're cancelled before the task returns then the
             # cancel scope will not have been inserted yet
             log.warn(
-                f"Task {func} was likely cancelled before it was started")
+                f"Task {func} likely errored or cancelled before it started")
 
         if not actor._rpc_tasks:
             log.info("All RPC tasks have completed")
@@ -175,6 +176,7 @@ class Actor:
     _root_n: Optional[trio.Nursery] = None
     _service_n: Optional[trio.Nursery] = None
     _server_n: Optional[trio.Nursery] = None
+    _lifetime_stack: ExitStack = ExitStack()
 
     # Information about `__main__` from parent
     _parent_main_data: Dict[str, str]
@@ -426,7 +428,6 @@ async def send_cmd(
     async def _process_messages(
         self,
         chan: Channel,
-        treat_as_gen: bool = False,
         shield: bool = False,
         task_status: TaskStatus[trio.CancelScope] = trio.TASK_STATUS_IGNORED,
     ) -> None:
@@ -743,6 +744,11 @@ async def _async_main(
         finally:
             log.info("Root nursery complete")
 
+            # tear down all lifetime contexts
+            # api idea: ``tractor.open_context()``
+            log.warn("Closing all actor lifetime contexts")
+            self._lifetime_stack.close()
+
             # Unregister actor from the arbiter
             if registered_with_arbiter and (
                     self._arb_addr is not None

From f1b242f913b9bf3d84fbf6b8e9d7d154bb3ef9d5 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Mon, 28 Sep 2020 08:54:21 -0400
Subject: [PATCH 20/61] Block SIGINT handling while in the debugger

This seems to prevent a certain class of bugs related to the root actor
cancelling local tasks and getting into deadlock while children are
trying to acquire the tty lock. I'm not sure it's the best idea yet
since you're pretty much guaranteed to get "stuck" if a child activates
the debugger after the root has been cancelled (at least "stuck" in
terms of SIGINT being ignored). That kind of race condition seems to
still exist somehow: a child can "beat" the root to activating the tty
lock and the parent is stuck waiting on the child to terminate via its
nursery.
---
 tractor/_debug.py | 135 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 99 insertions(+), 36 deletions(-)

diff --git a/tractor/_debug.py b/tractor/_debug.py
index c31f16ffb..4d43025a4 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -4,8 +4,9 @@
 import bdb
 import sys
 from functools import partial
-from contextlib import asynccontextmanager, AsyncExitStack
+from contextlib import asynccontextmanager, contextmanager
 from typing import Awaitable, Tuple, Optional, Callable
+import signal
 
 from async_generator import aclosing
 import tractor
@@ -23,25 +24,27 @@
     assert pdb.xpm, "pdbpp is not installed?"
     pdbpp = pdb
 
-
 log = get_logger(__name__)
 
 
 __all__ = ['breakpoint', 'post_mortem']
 
 
-# placeholder for function to set a ``trio.Event``
+# placeholder for function to set a ``trio.Event`` on debugger exit
 _pdb_release_hook: Optional[Callable] = None
 
+# actor-wide flag
+_in_debug = False
+
+# lock in root actor preventing multi-access to local tty
+_debug_lock = trio.StrictFIFOLock()
+
 
 class TractorConfig(pdbpp.DefaultConfig):
     """Custom ``pdbpp`` goodness.
     """
     # sticky_by_default = True
 
-    def teardown(self):
-        _pdb_release_hook()
-
 
 class PdbwTeardown(pdbpp.Pdb):
     """Add teardown hooks to the regular ``pdbpp.Pdb``.
@@ -52,12 +55,20 @@ class PdbwTeardown(pdbpp.Pdb):
     # TODO: figure out how to dissallow recursive .set_trace() entry
     # since that'll cause deadlock for us.
     def set_continue(self):
-        super().set_continue()
-        self.config.teardown()
+        global _in_debug
+        try:
+            super().set_continue()
+        finally:
+            _in_debug = False
+            _pdb_release_hook()
 
     def set_quit(self):
-        super().set_quit()
-        self.config.teardown()
+        global _in_debug
+        try:
+            super().set_quit()
+        finally:
+            _in_debug = False
+            _pdb_release_hook()
 
 
 # TODO: will be needed whenever we get to true remote debugging.
@@ -95,40 +106,75 @@ def set_quit(self):
 #             if bmsg in _pdb_exit_patterns:
 #                 log.info("Closing stdin hijack")
 #                 break
+
+
 @asynccontextmanager
 async def _acquire_debug_lock():
     """Acquire a actor local FIFO lock meant to mutex entry to a local
     debugger entry point to avoid tty clobbering by multiple processes.
     """
     try:
-        actor = tractor.current_actor()
-        debug_lock = actor.statespace.setdefault(
-            '_debug_lock', trio.StrictFIFOLock()
-        )
-        await debug_lock.acquire()
+        log.error("TTY BEING ACQUIRED")
+        await _debug_lock.acquire()
         log.error("TTY lock acquired")
         yield
     finally:
-        if debug_lock.locked():
-            debug_lock.release()
+        if _debug_lock.locked():
+            _debug_lock.release()
         log.error("TTY lock released")
 
 
+def handler(signum, frame):
+    """Block SIGINT while in debug to avoid deadlocks with cancellation.
+    """
+    print(
+        "tractor ignores SIGINT while in debug mode\n"
+        "If you have a special need for it please open an issue.\n"
+    )
+
+
+# don't allow those stdlib mofos to mess with sigint handler
+pdbpp.pdb.Pdb.sigint_handler = handler
+
+
+@contextmanager
+def _disable_sigint():
+    try:
+        # disable sigint handling while in debug
+        prior_handler = signal.signal(signal.SIGINT, handler)
+        yield
+    finally:
+        # restore SIGINT handling
+        signal.signal(signal.SIGINT, prior_handler)
+
+
 async def _hijack_stdin_relay_to_child(
     subactor_uid: Tuple[str, str]
 ) -> None:
+    actor = tractor.current_actor()
+    nursery = actor._actoruid2nursery[subactor_uid]
+    print(f'NURSERY: {nursery}')
+    print(f'nursery is cancelled {nursery.cancelled}')
+    if actor._is_cancelled or nursery.cancelled:
+        raise RuntimeError(
+            "Can not engage debugger actor is already cancelled")
+
+    await trio.sleep(0)
+
     # TODO: when we get to true remote debugging
     # this will deliver stdin data
-    log.debug(f"Actor {subactor_uid} is waiting on stdin hijack lock")
+    log.warning(f"Actor {subactor_uid} is WAITING on stdin hijack lock")
     async with _acquire_debug_lock():
-        log.warning(f"Actor {subactor_uid} acquired stdin hijack lock")
-        # indicate to child that we've locked stdio
-        yield 'Locked'
+        log.warning(f"Actor {subactor_uid} ACQUIRED stdin hijack lock")
+
+        with _disable_sigint():
+            # indicate to child that we've locked stdio
+            yield 'Locked'
 
-        # wait for cancellation of stream by child
-        await trio.sleep_forever()
+            # wait for cancellation of stream by child
+            await trio.sleep_forever()
 
-    log.debug(f"Actor {subactor_uid} released stdin hijack lock")
+    log.debug(f"Actor {subactor_uid} RELEASED stdin hijack lock")
 
 
 # XXX: We only make this sync in case someone wants to
@@ -137,6 +183,7 @@ def _breakpoint(debug_func) -> Awaitable[None]:
     """``tractor`` breakpoint entry for engaging pdb machinery
     in subactors.
     """
+    global _in_debug
     actor = tractor.current_actor()
     do_unlock = trio.Event()
 
@@ -147,7 +194,7 @@ async def wait_for_parent_stdin_hijack(
             async with tractor._portal.open_portal(
                 actor._parent_chan,
                 start_msg_loop=False,
-                shield=True,
+                # shield=True,
             ) as portal:
                 with trio.fail_after(1):
                     agen = await portal.run(
@@ -156,34 +203,48 @@ async def wait_for_parent_stdin_hijack(
                         subactor_uid=actor.uid,
                     )
                 async with aclosing(agen):
+
+                    # block until first yield above
                     async for val in agen:
+
                         assert val == 'Locked'
                         task_status.started()
-                        with trio.CancelScope(shield=True):
-                            await do_unlock.wait()
 
-                            # trigger cancellation of remote stream
-                            break
+                        # with trio.CancelScope(shield=True):
+                        await do_unlock.wait()
+
+                        # trigger cancellation of remote stream
+                        break
         finally:
             log.debug(f"Exiting debugger for actor {actor}")
-            actor.statespace['_in_debug'] = False
+            global _in_debug
+            _in_debug = False
+            # actor.statespace['_in_debug'] = False
             log.debug(f"Child {actor} released parent stdio lock")
 
     async def _bp():
         """Async breakpoint which schedules a parent stdio lock, and once complete
         enters the ``pdbpp`` debugging console.
         """
-        in_debug = actor.statespace.setdefault('_in_debug', False)
-
-        if in_debug:
-            log.warning(f"Actor {actor} already has a debug lock, skipping...")
-            return
+        global _in_debug
+        # in_debug = actor.statespace.setdefault('_in_debug', False)
+
+        if _in_debug:
+            # if **this** actor is already in debug mode block here
+            # waiting for the control to be released - this allows
+            # support for recursive entries to `tractor.breakpoint()`
+            log.warning(
+                f"Actor {actor.uid} already has a debug lock, waiting...")
+            await do_unlock.wait()
+            await trio.sleep(0.1)
+            # return
 
         # assign unlock callback for debugger teardown hooks
         global _pdb_release_hook
         _pdb_release_hook = do_unlock.set
 
-        actor.statespace['_in_debug'] = True
+        # actor.statespace['_in_debug'] = True
+        _in_debug = True
 
         # TODO: need a more robust check for the "root" actor
         if actor._parent_chan:
@@ -231,6 +292,7 @@ def _set_trace(actor):
 
 def _post_mortem(actor):
     log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n")
+    # custom Pdb post-mortem entry
     pdbpp.xpm(Pdb=PdbwTeardown)
 
 
@@ -250,6 +312,7 @@ async def _maybe_enter_pm(err):
         # there's races between when the parent actor has killed all
         # comms and when the child tries to contact said parent to
         # acquire the tty lock.
+
         # Really we just want to mostly avoid catching KBIs here so there
         # might be a simpler check we can do?
         and trio.MultiError.filter(

From 363498b8826c20da956eccab4a3dc62d062dbbab Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Mon, 28 Sep 2020 09:24:36 -0400
Subject: [PATCH 21/61] Disable SIGINT handling in child processes

There seems to be no good reason not to since our cancellation
machinery/protocol should do this work when the root receives the
signal. This also (hopefully) helps with some debugging race condition
stuff.
---
 tractor/_entry.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tractor/_entry.py b/tractor/_entry.py
index 71dfdec40..9099fc022 100644
--- a/tractor/_entry.py
+++ b/tractor/_entry.py
@@ -3,6 +3,7 @@
 """
 from functools import partial
 from typing import Tuple, Any
+import signal
 
 import trio  # type: ignore
 
@@ -57,6 +58,10 @@ def _trio_main(
 ) -> None:
     """Entry point for a `trio_run_in_process` subactor.
     """
+    # Disable sigint handling in children;
+    # we don't need it thanks to our cancellation machinery.
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+
     # TODO: make a global func to set this or is it too hacky?
     # os.environ['PYTHONBREAKPOINT'] = 'tractor._debug.breakpoint'
 

From 25e93925b06ab3c0332c605c1a6ad3b220668bd1 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Mon, 28 Sep 2020 13:02:33 -0400
Subject: [PATCH 22/61] Add a cancel scope around child debugger requests

This is needed in order to avoid the deadlock condition where
a child actor is waiting on the root actor's tty lock but its parent
(possibly the root) is waiting on it to terminate after sending a cancel
request. The solution is simple: create a cancel scope around the
request in the child and always cancel it when a cancel request from the
parent arrives.
---
 tractor/_actor.py | 14 ++++++--
 tractor/_debug.py | 90 ++++++++++++++++++++++-------------------------
 2 files changed, 54 insertions(+), 50 deletions(-)

diff --git a/tractor/_actor.py b/tractor/_actor.py
index e5b8ef0d7..045ebb202 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -27,7 +27,7 @@
     unpack_error,
     ModuleNotExposed
 )
-from ._debug import _maybe_enter_pm
+from . import _debug
 from ._discovery import get_arbiter
 from ._portal import Portal
 from . import _state
@@ -129,7 +129,7 @@ async def _invoke(
     except (Exception, trio.MultiError) as err:
         # NOTE: don't enter debug mode recursively after quitting pdb
         log.exception("Actor crashed:")
-        await _maybe_enter_pm(err)
+        await _debug._maybe_enter_pm(err)
 
         # always ship errors back to caller
         err_msg = pack_error(err)
@@ -832,6 +832,14 @@ async def cancel(self) -> bool:
 
         # cancel all ongoing rpc tasks
         with trio.CancelScope(shield=True):
+
+            # kill any debugger request task to avoid deadlock
+            # with the root actor in this tree
+            dbcs = _debug._debugger_request_cs
+            if dbcs is not None:
+                log.debug("Cancelling active debugger request")
+                dbcs.cancel()
+
             # kill all ongoing tasks
             await self.cancel_rpc_tasks()
 
@@ -1051,7 +1059,7 @@ async def _start_actor(
             result = await main()
         except (Exception, trio.MultiError) as err:
             log.exception("Actor crashed:")
-            await _maybe_enter_pm(err)
+            await _debug._maybe_enter_pm(err)
             raise
 
         # XXX: the actor is cancelled when this context is complete
diff --git a/tractor/_debug.py b/tractor/_debug.py
index 4d43025a4..99814ea2a 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -11,6 +11,7 @@
 from async_generator import aclosing
 import tractor
 import trio
+# from trio.testing import wait_all_tasks_blocked
 
 from .log import get_logger
 from . import _state
@@ -39,6 +40,11 @@
 # lock in root actor preventing multi-access to local tty
 _debug_lock = trio.StrictFIFOLock()
 
+# XXX: set by the current task waiting on the root tty lock
+# and must be cancelled if this actor is cancelled via message
+# otherwise deadlocks with the parent actor may ensue
+_debugger_request_cs: trio.CancelScope = None
+
 
 class TractorConfig(pdbpp.DefaultConfig):
     """Custom ``pdbpp`` goodness.
@@ -119,8 +125,7 @@ async def _acquire_debug_lock():
         log.error("TTY lock acquired")
         yield
     finally:
-        if _debug_lock.locked():
-            _debug_lock.release()
+        _debug_lock.release()
         log.error("TTY lock released")
 
 
@@ -151,16 +156,6 @@ def _disable_sigint():
 async def _hijack_stdin_relay_to_child(
     subactor_uid: Tuple[str, str]
 ) -> None:
-    actor = tractor.current_actor()
-    nursery = actor._actoruid2nursery[subactor_uid]
-    print(f'NURSERY: {nursery}')
-    print(f'nursery is cancelled {nursery.cancelled}')
-    if actor._is_cancelled or nursery.cancelled:
-        raise RuntimeError(
-            "Can not engage debugger actor is already cancelled")
-
-    await trio.sleep(0)
-
     # TODO: when we get to true remote debugging
     # this will deliver stdin data
     log.warning(f"Actor {subactor_uid} is WAITING on stdin hijack lock")
@@ -172,6 +167,7 @@ async def _hijack_stdin_relay_to_child(
             yield 'Locked'
 
             # wait for cancellation of stream by child
+            # indicating debugger is dis-engaged
             await trio.sleep_forever()
 
     log.debug(f"Actor {subactor_uid} RELEASED stdin hijack lock")
@@ -190,37 +186,40 @@ def _breakpoint(debug_func) -> Awaitable[None]:
     async def wait_for_parent_stdin_hijack(
         task_status=trio.TASK_STATUS_IGNORED
     ):
-        try:
-            async with tractor._portal.open_portal(
-                actor._parent_chan,
-                start_msg_loop=False,
-                # shield=True,
-            ) as portal:
-                with trio.fail_after(1):
-                    agen = await portal.run(
-                        'tractor._debug',
-                        '_hijack_stdin_relay_to_child',
-                        subactor_uid=actor.uid,
-                    )
-                async with aclosing(agen):
-
-                    # block until first yield above
-                    async for val in agen:
-
-                        assert val == 'Locked'
-                        task_status.started()
-
-                        # with trio.CancelScope(shield=True):
-                        await do_unlock.wait()
-
-                        # trigger cancellation of remote stream
-                        break
-        finally:
-            log.debug(f"Exiting debugger for actor {actor}")
-            global _in_debug
-            _in_debug = False
-            # actor.statespace['_in_debug'] = False
-            log.debug(f"Child {actor} released parent stdio lock")
+        global _debugger_request_cs
+        with trio.CancelScope() as cs:
+            _debugger_request_cs = cs
+            try:
+                async with tractor._portal.open_portal(
+                    actor._parent_chan,
+                    start_msg_loop=False,
+                    # shield=True,
+                ) as portal:
+                    with trio.fail_after(1):
+                        agen = await portal.run(
+                            'tractor._debug',
+                            '_hijack_stdin_relay_to_child',
+                            subactor_uid=actor.uid,
+                        )
+                    async with aclosing(agen):
+
+                        # block until first yield above
+                        async for val in agen:
+
+                            assert val == 'Locked'
+                            task_status.started()
+
+                            # with trio.CancelScope(shield=True):
+                            await do_unlock.wait()
+
+                            # trigger cancellation of remote stream
+                            break
+            finally:
+                log.debug(f"Exiting debugger for actor {actor}")
+                global _in_debug
+                _in_debug = False
+                # actor.statespace['_in_debug'] = False
+                log.debug(f"Child {actor} released parent stdio lock")
 
     async def _bp():
         """Async breakpoint which schedules a parent stdio lock, and once complete
@@ -237,7 +236,6 @@ async def _bp():
                 f"Actor {actor.uid} already has a debug lock, waiting...")
             await do_unlock.wait()
             await trio.sleep(0.1)
-            # return
 
         # assign unlock callback for debugger teardown hooks
         global _pdb_release_hook
@@ -253,9 +251,6 @@ async def _bp():
             # being restricted by the scope of a new task nursery.
             await actor._service_n.start(wait_for_parent_stdin_hijack)
 
-            # block here one (at the appropriate frame *up* where
-            # ``breakpoint()`` was awaited and begin handling stdio
-            # debug_func(actor)
         else:
             # we also wait in the root-parent for any child that
             # may have the tty locked prior
@@ -270,6 +265,7 @@ async def _lock(
 
         # block here one (at the appropriate frame *up* where
         # ``breakpoint()`` was awaited and begin handling stdio
+        log.debug("Entering the synchronous world of the debugger")
         debug_func(actor)
 
     # user code **must** await this!

From 5dd2d35fc5ca34f803d9720806b7b7a87473434f Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Mon, 28 Sep 2020 13:11:22 -0400
Subject: [PATCH 23/61] Huh, maybe we don't need to block SIGINT

Seems like the request task cancel scope is actually solving all the
deadlock issues and masking SIGINT isn't changing much behaviour at all.
I think let's keep it unmasked for now in case it does turn out useful
in cancelling from unrecoverable states while in debug.
---
 tractor/_debug.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/tractor/_debug.py b/tractor/_debug.py
index 99814ea2a..b9ed5a2c6 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -142,15 +142,15 @@ def handler(signum, frame):
 pdbpp.pdb.Pdb.sigint_handler = handler
 
 
-@contextmanager
-def _disable_sigint():
-    try:
-        # disable sigint handling while in debug
-        prior_handler = signal.signal(signal.SIGINT, handler)
-        yield
-    finally:
-        # restore SIGINT handling
-        signal.signal(signal.SIGINT, prior_handler)
+# @contextmanager
+# def _disable_sigint():
+#     try:
+#         # disable sigint handling while in debug
+#         prior_handler = signal.signal(signal.SIGINT, handler)
+#         yield
+#     finally:
+#         # restore SIGINT handling
+#         signal.signal(signal.SIGINT, prior_handler)
 
 
 async def _hijack_stdin_relay_to_child(
@@ -162,13 +162,14 @@ async def _hijack_stdin_relay_to_child(
     async with _acquire_debug_lock():
         log.warning(f"Actor {subactor_uid} ACQUIRED stdin hijack lock")
 
-        with _disable_sigint():
-            # indicate to child that we've locked stdio
-            yield 'Locked'
+        # with _disable_sigint():
+
+        # indicate to child that we've locked stdio
+        yield 'Locked'
 
-            # wait for cancellation of stream by child
-            # indicating debugger is dis-engaged
-            await trio.sleep_forever()
+        # wait for cancellation of stream by child
+        # indicating debugger is dis-engaged
+        await trio.sleep_forever()
 
     log.debug(f"Actor {subactor_uid} RELEASED stdin hijack lock")
 
@@ -218,7 +219,6 @@ async def wait_for_parent_stdin_hijack(
                 log.debug(f"Exiting debugger for actor {actor}")
                 global _in_debug
                 _in_debug = False
-                # actor.statespace['_in_debug'] = False
                 log.debug(f"Child {actor} released parent stdio lock")
 
     async def _bp():
@@ -226,7 +226,6 @@ async def _bp():
         enters the ``pdbpp`` debugging console.
         """
         global _in_debug
-        # in_debug = actor.statespace.setdefault('_in_debug', False)
 
         if _in_debug:
             # if **this** actor is already in debug mode block here
@@ -241,7 +240,8 @@ async def _bp():
         global _pdb_release_hook
         _pdb_release_hook = do_unlock.set
 
-        # actor.statespace['_in_debug'] = True
+        # mark local actor as "in debug mode" to avoid recurrent
+        # entries/requests to the root process
         _in_debug = True
 
         # TODO: need a more robust check for the "root" actor

From d7a472c7f2d85139ec3169fcf76278f2c52dffb4 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Mon, 28 Sep 2020 13:13:53 -0400
Subject: [PATCH 24/61] Update our debugging example to wait on results

---
 debugging/mp_debug.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/debugging/mp_debug.py b/debugging/mp_debug.py
index a27971590..fff92d3c6 100644
--- a/debugging/mp_debug.py
+++ b/debugging/mp_debug.py
@@ -9,7 +9,7 @@ async def bubble():
         await tractor.breakpoint()
 
 
-async def bail():
+async def name_error():
     getattr(doggy)
 
 
@@ -19,13 +19,21 @@ async def main():
     async with tractor.open_nursery() as n:
 
         portal1 = await n.run_in_actor('bubble', bubble)
-        portal = await n.run_in_actor('bail', bail)
-        # await portal.result()
-        # await portal1.result()
+        portal = await n.run_in_actor('name_error', name_error)
+        await portal1.result()
+        await portal.result()
 
     # The ``async with`` will unblock here since the 'some_linguist'
     # actor has completed its main task ``cellar_door``.
 
 
+# TODO:
+# - recurrent entry from single actor
+# - recurrent entry to breakpoint() from single actor *after* and an
+#   error
+# - root error alongside child errors
+# - recurrent root errors
+
+
 if __name__ == '__main__':
-    tractor.run(main, loglevel='error', debug_mode=True)
+    tractor.run(main, loglevel='info', debug_mode=True)

From fc2cb610b90302de1b453df1369976131fb9af59 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Mon, 28 Sep 2020 13:49:45 -0400
Subject: [PATCH 25/61] Make "hard kill" just a `Process.terminate()`

It's not like any of this code is really being used anyway since we
aren't indefinitely blocking for cancelled subactors to terminate (yet).
Drop the `do_hard_kill()` bit for now and just rely on the underlying
process api. Oh, and mark the nursery as cancelled asap.
---
 tractor/_trionics.py | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/tractor/_trionics.py b/tractor/_trionics.py
index d51a59b8d..d78e1106b 100644
--- a/tractor/_trionics.py
+++ b/tractor/_trionics.py
@@ -137,19 +137,14 @@ async def cancel(self, hard_kill: bool = False) -> None:
         If ``hard_killl`` is set to ``True`` then kill the processes
         directly without any far end graceful ``trio`` cancellation.
         """
-        def do_hard_kill(proc):
-            log.warning(f"Hard killing subactors {self._children}")
-            proc.terminate()
-            # XXX: below doesn't seem to work?
-            # send KeyBoardInterrupt (trio abort signal) to sub-actors
-            # os.kill(proc.pid, signal.SIGINT)
-
-        log.debug("Cancelling nursery")
+        self.cancelled = True
+
+        log.critical("Cancelling nursery")
         with trio.move_on_after(3) as cs:
             async with trio.open_nursery() as nursery:
                 for subactor, proc, portal in self._children.values():
                     if hard_kill:
-                        do_hard_kill(proc)
+                        proc.terminate()
                     else:
                         if portal is None:  # actor hasn't fully spawned yet
                             event = self._actor._peer_connected[subactor.uid]
@@ -169,7 +164,7 @@ def do_hard_kill(proc):
                                 if chan:
                                     portal = Portal(chan)
                                 else:  # there's no other choice left
-                                    do_hard_kill(proc)
+                                    proc.terminate()
 
                         # spawn cancel tasks for each sub-actor
                         assert portal
@@ -178,13 +173,13 @@ def do_hard_kill(proc):
         # if we cancelled the cancel (we hung cancelling remote actors)
         # then hard kill all sub-processes
         if cs.cancelled_caught:
-            log.error(f"Failed to gracefully cancel {self}, hard killing!")
-            async with trio.open_nursery():
-                for subactor, proc, portal in self._children.values():
-                    nursery.start_soon(do_hard_kill, proc)
+            log.error(
+                f"Failed to cancel {self}\nHard killing process tree!")
+            for subactor, proc, portal in self._children.values():
+                log.warning(f"Hard killing process {proc}")
+                proc.terminate()
 
         # mark ourselves as having (tried to have) cancelled all subactors
-        self.cancelled = True
         self._join_procs.set()
 
 

From 29ed065dc44dbe3f7b887be2ece77adeff24d080 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Mon, 28 Sep 2020 13:56:42 -0400
Subject: [PATCH 26/61] Ack our inability to hard kill sub-procs

---
 tractor/_portal.py |  3 ++-
 tractor/_spawn.py  | 15 +++++++++++++--
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/tractor/_portal.py b/tractor/_portal.py
index e749ec6ed..af3c1b5c1 100644
--- a/tractor/_portal.py
+++ b/tractor/_portal.py
@@ -278,7 +278,8 @@ async def cancel_actor(self):
         try:
             # send cancel cmd - might not get response
             # XXX: sure would be nice to make this work with a proper shield
-            # with trio.CancelScope(shield=True):
+            # with trio.CancelScope() as cancel_scope:
+            # with trio.CancelScope(shield=True) as cancel_scope:
             with trio.move_on_after(0.5) as cancel_scope:
                 cancel_scope.shield = True
                 await self.run('self', 'cancel')
diff --git a/tractor/_spawn.py b/tractor/_spawn.py
index 6d1961467..2a2f602db 100644
--- a/tractor/_spawn.py
+++ b/tractor/_spawn.py
@@ -188,8 +188,17 @@ async def spawn_subactor(
         # the outer scope since no actor zombies are
         # ever allowed. This ``__aexit__()`` also shields
         # internally.
-        async with proc:
-            log.debug(f"Terminating {proc}")
+        log.debug(f"Attempting to kill {proc}")
+
+        # NOTE: this timeout effectively does nothing right now since
+        # we are shielding the ``.wait()`` inside ``new_proc()`` which
+        # will pretty much never release until the process exits.
+        with trio.move_on_after(3) as cs:
+            async with proc:
+                log.debug(f"Terminating {proc}")
+        if cs.cancelled_caught:
+            log.critical(f"HARD KILLING {proc}")
+            proc.kill()
 
 
 async def new_proc(
@@ -354,6 +363,8 @@ async def new_proc(
                 await proc_waiter(proc)
             proc.join()
 
+        # This is again common logic for all backends:
+
         log.debug(f"Joined {proc}")
         # pop child entry to indicate we are no longer managing this subactor
         subactor, proc, portal = actor_nursery._children.pop(subactor.uid)

From 0a2a94fee038075a3496a750f02fcb09221a5848 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sat, 3 Oct 2020 19:35:18 -0400
Subject: [PATCH 27/61] Add initial root actor debugger tests

---
 {debugging => examples/debugging}/mp_debug.py |   0
 examples/debugging/root_actor_breakpoint.py   |  15 +++
 examples/debugging/root_actor_error.py        |   9 ++
 tests/test_debugger.py                        | 106 ++++++++++++++++++
 tests/test_docs_examples.py                   |  15 ++-
 5 files changed, 141 insertions(+), 4 deletions(-)
 rename {debugging => examples/debugging}/mp_debug.py (100%)
 create mode 100644 examples/debugging/root_actor_breakpoint.py
 create mode 100644 examples/debugging/root_actor_error.py
 create mode 100644 tests/test_debugger.py

diff --git a/debugging/mp_debug.py b/examples/debugging/mp_debug.py
similarity index 100%
rename from debugging/mp_debug.py
rename to examples/debugging/mp_debug.py
diff --git a/examples/debugging/root_actor_breakpoint.py b/examples/debugging/root_actor_breakpoint.py
new file mode 100644
index 000000000..bd4dcb122
--- /dev/null
+++ b/examples/debugging/root_actor_breakpoint.py
@@ -0,0 +1,15 @@
+import trio
+import tractor
+
+
+async def main():
+
+    await trio.sleep(0.1)
+
+    await tractor.breakpoint()
+
+    await trio.sleep(0.1)
+
+
+if __name__ == '__main__':
+    tractor.run(main, debug_mode=True)
diff --git a/examples/debugging/root_actor_error.py b/examples/debugging/root_actor_error.py
new file mode 100644
index 000000000..748669905
--- /dev/null
+++ b/examples/debugging/root_actor_error.py
@@ -0,0 +1,9 @@
+import tractor
+
+
+async def main():
+    assert 0
+
+
+if __name__ == '__main__':
+    tractor.run(main, debug_mode=True)
diff --git a/tests/test_debugger.py b/tests/test_debugger.py
new file mode 100644
index 000000000..9d746c650
--- /dev/null
+++ b/tests/test_debugger.py
@@ -0,0 +1,106 @@
+"""
+That native debug better work!
+"""
+from os import path
+
+import pytest
+import pexpect
+
+from .test_docs_examples import repodir
+
+
+def examples_dir():
+    """Return the abspath to the examples directory.
+    """
+    return path.join(repodir(), 'examples', 'debugging/')
+
+
+def mk_cmd(ex_name: str) -> str:
+    """Generate a command suitable to pass to ``pexpect.spawn()``.
+    """
+    return ' '.join(
+        ['python',
+         path.join(examples_dir(), f'{ex_name}.py')]
+    )
+
+
+def spawn(
+    cmd: str,
+    testdir,
+) -> pexpect.spawn:
+    return testdir.spawn(
+        cmd=mk_cmd(cmd),
+        expect_timeout=3,
+    )
+
+
+@pytest.mark.parametrize(
+    'user_in_out',
+    [
+        ('c', 'AssertionError'),
+        ('q', 'AssertionError'),
+    ],
+    ids=lambda item: item[1],
+)
+def test_root_actor_error(testdir, user_in_out):
+    """Demonstrate crash handler entering pdbpp from basic error in root actor.
+    """
+    user_input, expect_err_str = user_in_out
+
+    child = spawn('root_actor_error', testdir)
+
+    # scan for the pdbpp prompt
+    child.expect("\(Pdb\+\+\)")
+
+    # make sure expected logging and error arrives
+    assert 'TTY lock acquired' in str(child.before)
+    assert 'AssertionError' in str(child.before)
+
+    # send user command
+    child.sendline(user_input)
+    child.expect('\r\n')
+    child.expect('TTY lock released')
+
+    # process should exit
+    child.expect(pexpect.EOF)
+    assert expect_err_str in str(child.before)
+
+
+@pytest.mark.parametrize(
+    'user_in_out',
+    [
+        ('c', None),
+        ('q', 'bdb.BdbQuit'),
+    ],
+    ids=lambda item: f'{item[0]} -> {item[1]}',
+)
+def test_root_actor_bp(testdir, user_in_out):
+    """Demonstrate breakpoint from in root actor.
+    """
+    user_input, expect_err_str = user_in_out
+    child = spawn('root_actor_breakpoint', testdir)
+
+    # scan for the pdbpp prompt
+    child.expect("\(Pdb\+\+\)")
+
+    assert 'Error' not in str(child.before)
+
+    # send user command
+    child.sendline(user_input)
+    child.expect('\r\n')
+
+    # process should exit
+    child.expect(pexpect.EOF)
+
+    if expect_err_str is None:
+        assert 'Error' not in str(child.before)
+    else:
+        assert expect_err_str in str(child.before)
+
+
+def test_subactor_error(testdir):
+    ...
+
+
+def test_subactor_breakpoint(testdir):
+    ...
diff --git a/tests/test_docs_examples.py b/tests/test_docs_examples.py
index c6d5df860..67b34ee57 100644
--- a/tests/test_docs_examples.py
+++ b/tests/test_docs_examples.py
@@ -85,15 +85,22 @@ def run(script_code):
 
 @pytest.mark.parametrize(
     'example_script',
-    [f for f in os.listdir(examples_dir()) if '__' not in f],
+    [
+        f for f in os.listdir(examples_dir())
+        if (
+            ('__' not in f) and
+            ('debugging' not in f)
+        )
+    ],
 )
 def test_example(run_example_in_subproc, example_script):
     """Load and run scripts from this repo's ``examples/`` dir as a user
     would copy and pasing them into their editor.
 
-    On windows a little more "finessing" is done to make ``multiprocessing`` play nice:
-    we copy the ``__main__.py`` into the test directory and invoke the script as a module
-    with ``python -m test_example``.
+    On windows a little more "finessing" is done to make
+    ``multiprocessing`` play nice: we copy the ``__main__.py`` into the
+    test directory and invoke the script as a module with ``python -m
+    test_example``.
     """
     ex_file = os.path.join(examples_dir(), example_script)
     with open(ex_file, 'r') as ex:

From 9067bb2a41b0722ef33c2cfef9a9e3ceb5c7bcd8 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sat, 3 Oct 2020 19:36:04 -0400
Subject: [PATCH 28/61] Shorten arbiter contact timeout

---
 tractor/_actor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tractor/_actor.py b/tractor/_actor.py
index 045ebb202..6517ed334 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -754,7 +754,7 @@ async def _async_main(
                     self._arb_addr is not None
             ):
                 failed = False
-                with trio.move_on_after(5) as cs:
+                with trio.move_on_after(0.5) as cs:
                     cs.shield = True
                     try:
                         async with get_arbiter(*self._arb_addr) as arb_portal:

From 73a32f7d9c48255f77a42f389b7c0dbfd3f88464 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sun, 4 Oct 2020 09:54:36 -0400
Subject: [PATCH 29/61] Add initial subactor debug tests

---
 examples/debugging/subactor_breakpoint.py |  25 ++++++
 examples/debugging/subactor_error.py      |  16 ++++
 tests/test_debugger.py                    | 100 +++++++++++++++++++---
 3 files changed, 128 insertions(+), 13 deletions(-)
 create mode 100644 examples/debugging/subactor_breakpoint.py
 create mode 100644 examples/debugging/subactor_error.py

diff --git a/examples/debugging/subactor_breakpoint.py b/examples/debugging/subactor_breakpoint.py
new file mode 100644
index 000000000..35db6cf6f
--- /dev/null
+++ b/examples/debugging/subactor_breakpoint.py
@@ -0,0 +1,25 @@
+import trio
+import tractor
+
+
+async def breakpoint_forever():
+    """Indefinitely re-enter debugger in child actor.
+    """
+    while True:
+        await trio.sleep(0.1)
+        await tractor.breakpoint()
+
+
+async def main():
+
+    async with tractor.open_nursery() as n:
+
+        portal = await n.run_in_actor(
+            'breakpoint_forever',
+            breakpoint_forever,
+        )
+        await portal.result()
+
+
+if __name__ == '__main__':
+    tractor.run(main, debug_mode=True)
diff --git a/examples/debugging/subactor_error.py b/examples/debugging/subactor_error.py
new file mode 100644
index 000000000..67ec1b69e
--- /dev/null
+++ b/examples/debugging/subactor_error.py
@@ -0,0 +1,16 @@
+import tractor
+
+
+async def name_error():
+    getattr(doggypants)
+
+
+async def main():
+    async with tractor.open_nursery() as n:
+
+        portal = await n.run_in_actor('name_error', name_error)
+        await portal.result()
+
+
+if __name__ == '__main__':
+    tractor.run(main, debug_mode=True)
diff --git a/tests/test_debugger.py b/tests/test_debugger.py
index 9d746c650..a26f9ceb9 100644
--- a/tests/test_debugger.py
+++ b/tests/test_debugger.py
@@ -24,14 +24,19 @@ def mk_cmd(ex_name: str) -> str:
     )
 
 
+@pytest.fixture
 def spawn(
-    cmd: str,
     testdir,
+    arb_addr,
 ) -> pexpect.spawn:
-    return testdir.spawn(
-        cmd=mk_cmd(cmd),
-        expect_timeout=3,
-    )
+
+    def _spawn(cmd):
+        return testdir.spawn(
+            cmd=mk_cmd(cmd),
+            expect_timeout=3,
+        )
+
+    return _spawn
 
 
 @pytest.mark.parametrize(
@@ -42,12 +47,12 @@ def spawn(
     ],
     ids=lambda item: item[1],
 )
-def test_root_actor_error(testdir, user_in_out):
+def test_root_actor_error(spawn, user_in_out):
     """Demonstrate crash handler entering pdbpp from basic error in root actor.
     """
     user_input, expect_err_str = user_in_out
 
-    child = spawn('root_actor_error', testdir)
+    child = spawn('root_actor_error')
 
     # scan for the pdbpp prompt
     child.expect("\(Pdb\+\+\)")
@@ -74,11 +79,11 @@ def test_root_actor_error(testdir, user_in_out):
     ],
     ids=lambda item: f'{item[0]} -> {item[1]}',
 )
-def test_root_actor_bp(testdir, user_in_out):
+def test_root_actor_bp(spawn, user_in_out):
     """Demonstrate breakpoint from in root actor.
     """
     user_input, expect_err_str = user_in_out
-    child = spawn('root_actor_breakpoint', testdir)
+    child = spawn('root_actor_breakpoint')
 
     # scan for the pdbpp prompt
     child.expect("\(Pdb\+\+\)")
@@ -98,9 +103,78 @@ def test_root_actor_bp(testdir, user_in_out):
         assert expect_err_str in str(child.before)
 
 
-def test_subactor_error(testdir):
-    ...
+def test_subactor_error(spawn):
+    child = spawn('subactor_error')
+
+    # scan for the pdbpp prompt
+    child.expect("\(Pdb\+\+\)")
+
+    before = str(child.before.decode())
+    assert "Attaching to pdb in crashed actor: ('name_error'" in before
+
+    # send user command
+    # (in this case it's the same for 'continue' vs. 'quit')
+    child.sendline('continue')
+    child.expect('\r\n')
+
+    # the debugger should enter a second time in the nursery
+    # creating actor
+
+    child.expect("\(Pdb\+\+\)")
+
+    before = str(child.before.decode())
+
+    # root actor gets debugger engaged
+    assert "Attaching to pdb in crashed actor: ('arbiter'" in before
+
+    # error is a remote error propagated from the subactor
+    assert "RemoteActorError: ('name_error'" in before
+
+    child.sendline('c')
+    child.expect('\r\n')
+
+    # process should exit
+    child.expect(pexpect.EOF)
+
+
+def test_subactor_breakpoint(spawn):
+    child = spawn('subactor_breakpoint')
+
+    # scan for the pdbpp prompt
+    child.expect("\(Pdb\+\+\)")
+
+    before = str(child.before.decode())
+    assert "Attaching pdb to actor: ('breakpoint_forever'" in before
 
+    # do some "next" commands to demonstrate recurrent breakpoint
+    # entries
+    for _ in range(10):
+        child.sendline('next')
+        child.expect("\(Pdb\+\+\)")
+
+    # now run some "continues" to show re-entries
+    for _ in range(5):
+        child.sendline('continue')
+        child.expect("\(Pdb\+\+\)")
+        before = str(child.before.decode())
+        assert "Attaching pdb to actor: ('breakpoint_forever'" in before
+
+    # finally quit the loop
+    child.sendline('q')
+
+    # child process should exit but parent will capture pdb.BdbQuit
+    child.expect("\(Pdb\+\+\)")
+
+    before = str(child.before.decode())
+    assert "RemoteActorError: ('breakpoint_forever'" in before
+    assert 'bdb.BdbQuit' in before
+
+    # quit the parent
+    child.sendline('c')
+
+    # process should exit
+    child.expect(pexpect.EOF)
 
-def test_subactor_breakpoint(testdir):
-    ...
+    before = str(child.before.decode())
+    assert "RemoteActorError: ('breakpoint_forever'" in before
+    assert 'bdb.BdbQuit' in before

From a2151cdd4d4091e60db49b7fef612677141f5102 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sun, 4 Oct 2020 09:55:34 -0400
Subject: [PATCH 30/61] Allow re-entrant breakpoints during pdb stepping

---
 tractor/_debug.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tractor/_debug.py b/tractor/_debug.py
index b9ed5a2c6..a0a5fc6d7 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -180,7 +180,6 @@ def _breakpoint(debug_func) -> Awaitable[None]:
     """``tractor`` breakpoint entry for engaging pdb machinery
     in subactors.
     """
-    global _in_debug
     actor = tractor.current_actor()
     do_unlock = trio.Event()
 
@@ -196,7 +195,7 @@ async def wait_for_parent_stdin_hijack(
                     start_msg_loop=False,
                     # shield=True,
                 ) as portal:
-                    with trio.fail_after(1):
+                    with trio.fail_after(.5):
                         agen = await portal.run(
                             'tractor._debug',
                             '_hijack_stdin_relay_to_child',
@@ -225,9 +224,15 @@ async def _bp():
         """Async breakpoint which schedules a parent stdio lock, and once complete
         enters the ``pdbpp`` debugging console.
         """
+        task_name = trio.lowlevel.current_task()
+
         global _in_debug
+        if _in_debug :
+            if _in_debug == task_name:
+                # this task already has the lock and is
+                # likely recurrently entering a breakpoint
+                return
 
-        if _in_debug:
             # if **this** actor is already in debug mode block here
             # waiting for the control to be released - this allows
             # support for recursive entries to `tractor.breakpoint()`
@@ -242,7 +247,7 @@ async def _bp():
 
         # mark local actor as "in debug mode" to avoid recurrent
         # entries/requests to the root process
-        _in_debug = True
+        _in_debug = task_name
 
         # TODO: need a more robust check for the "root" actor
         if actor._parent_chan:

From e387e8b3221741ac7caab2983601bd421a8593a0 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sun, 4 Oct 2020 17:31:02 -0400
Subject: [PATCH 31/61] Add a multi-subactor test with nesting

---
 examples/debugging/mp_debug.py        | 39 -----------
 examples/debugging/multi_subactors.py | 51 ++++++++++++++
 tests/test_debugger.py                | 95 ++++++++++++++++++++++++---
 3 files changed, 137 insertions(+), 48 deletions(-)
 delete mode 100644 examples/debugging/mp_debug.py
 create mode 100644 examples/debugging/multi_subactors.py

diff --git a/examples/debugging/mp_debug.py b/examples/debugging/mp_debug.py
deleted file mode 100644
index fff92d3c6..000000000
--- a/examples/debugging/mp_debug.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import tractor
-import trio
-
-
-async def bubble():
-    print('IN BUBBLE')
-    while True:
-        await trio.sleep(.1)
-        await tractor.breakpoint()
-
-
-async def name_error():
-    getattr(doggy)
-
-
-async def main():
-    """The main ``tractor`` routine.
-    """
-    async with tractor.open_nursery() as n:
-
-        portal1 = await n.run_in_actor('bubble', bubble)
-        portal = await n.run_in_actor('name_error', name_error)
-        await portal1.result()
-        await portal.result()
-
-    # The ``async with`` will unblock here since the 'some_linguist'
-    # actor has completed its main task ``cellar_door``.
-
-
-# TODO:
-# - recurrent entry from single actor
-# - recurrent entry to breakpoint() from single actor *after* and an
-#   error
-# - root error alongside child errors
-# - recurrent root errors
-
-
-if __name__ == '__main__':
-    tractor.run(main, loglevel='info', debug_mode=True)
diff --git a/examples/debugging/multi_subactors.py b/examples/debugging/multi_subactors.py
new file mode 100644
index 000000000..97de906e8
--- /dev/null
+++ b/examples/debugging/multi_subactors.py
@@ -0,0 +1,51 @@
+import tractor
+import trio
+
+
+async def breakpoint_forever():
+    "Indefinitely re-enter debugger in child actor."
+    while True:
+        await trio.sleep(0.1)
+        await tractor.breakpoint()
+
+
+async def name_error():
+    "Raise a ``NameError``"
+    getattr(doggypants)
+
+
+async def spawn_error():
+    """"A nested nursery that triggers another ``NameError``.
+    """
+    async with tractor.open_nursery() as n:
+        portal = await n.run_in_actor('name_error_1', name_error)
+        return await portal.result()
+
+
+async def main():
+    """The main ``tractor`` routine.
+
+    The process tree should look as approximately as follows:
+
+    -python examples/debugging/multi_subactors.py
+    |-python -m tractor._child --uid ('name_error', 'a7caf490 ...)
+    |-python -m tractor._child --uid ('bp_forever', '1f787a7e ...)
+    `-python -m tractor._child --uid ('spawn_error', '52ee14a5 ...)
+       `-python -m tractor._child --uid ('name_error', '3391222c ...)
+    """
+    async with tractor.open_nursery() as n:
+
+        # spawn both actors
+        portal1 = await n.run_in_actor('bp_forever', breakpoint_forever)
+        portal = await n.run_in_actor('name_error', name_error)
+        portal2 = await n.run_in_actor('spawn_error', spawn_error)
+
+        # attempt to collect results (which raises error in parent)
+        # still has some issues where the parent seems to get stuck
+        # await portal.result()
+        # await portal1.result()
+        # await portal2.result()
+
+
+if __name__ == '__main__':
+    tractor.run(main, debug_mode=True)
diff --git a/tests/test_debugger.py b/tests/test_debugger.py
index a26f9ceb9..de76247a8 100644
--- a/tests/test_debugger.py
+++ b/tests/test_debugger.py
@@ -9,6 +9,17 @@
 from .test_docs_examples import repodir
 
 
+# TODO:
+# - recurrent entry from single actor
+# - recurrent entry to breakpoint() from single actor *after* and an
+#   error
+# - root error before child errors
+# - root error after child errors
+# - root error before child breakpoint
+# - root error after child breakpoint
+# - recurrent root errors
+
+
 def examples_dir():
     """Return the abspath to the examples directory.
     """
@@ -45,7 +56,7 @@ def _spawn(cmd):
         ('c', 'AssertionError'),
         ('q', 'AssertionError'),
     ],
-    ids=lambda item: item[1],
+    ids=lambda item: f'{item[0]} -> {item[1]}',
 )
 def test_root_actor_error(spawn, user_in_out):
     """Demonstrate crash handler entering pdbpp from basic error in root actor.
@@ -55,7 +66,7 @@ def test_root_actor_error(spawn, user_in_out):
     child = spawn('root_actor_error')
 
     # scan for the pdbpp prompt
-    child.expect("\(Pdb\+\+\)")
+    child.expect(r"\(Pdb\+\+\)")
 
     # make sure expected logging and error arrives
     assert 'TTY lock acquired' in str(child.before)
@@ -86,7 +97,7 @@ def test_root_actor_bp(spawn, user_in_out):
     child = spawn('root_actor_breakpoint')
 
     # scan for the pdbpp prompt
-    child.expect("\(Pdb\+\+\)")
+    child.expect(r"\(Pdb\+\+\)")
 
     assert 'Error' not in str(child.before)
 
@@ -104,10 +115,12 @@ def test_root_actor_bp(spawn, user_in_out):
 
 
 def test_subactor_error(spawn):
+    "Single subactor raising an error"
+
     child = spawn('subactor_error')
 
     # scan for the pdbpp prompt
-    child.expect("\(Pdb\+\+\)")
+    child.expect(r"\(Pdb\+\+\)")
 
     before = str(child.before.decode())
     assert "Attaching to pdb in crashed actor: ('name_error'" in before
@@ -120,7 +133,7 @@ def test_subactor_error(spawn):
     # the debugger should enter a second time in the nursery
     # creating actor
 
-    child.expect("\(Pdb\+\+\)")
+    child.expect(r"\(Pdb\+\+\)")
 
     before = str(child.before.decode())
 
@@ -138,10 +151,12 @@ def test_subactor_error(spawn):
 
 
 def test_subactor_breakpoint(spawn):
+    "Single subactor with an infinite breakpoint loop"
+
     child = spawn('subactor_breakpoint')
 
     # scan for the pdbpp prompt
-    child.expect("\(Pdb\+\+\)")
+    child.expect(r"\(Pdb\+\+\)")
 
     before = str(child.before.decode())
     assert "Attaching pdb to actor: ('breakpoint_forever'" in before
@@ -150,12 +165,12 @@ def test_subactor_breakpoint(spawn):
     # entries
     for _ in range(10):
         child.sendline('next')
-        child.expect("\(Pdb\+\+\)")
+        child.expect(r"\(Pdb\+\+\)")
 
     # now run some "continues" to show re-entries
     for _ in range(5):
         child.sendline('continue')
-        child.expect("\(Pdb\+\+\)")
+        child.expect(r"\(Pdb\+\+\)")
         before = str(child.before.decode())
         assert "Attaching pdb to actor: ('breakpoint_forever'" in before
 
@@ -163,7 +178,7 @@ def test_subactor_breakpoint(spawn):
     child.sendline('q')
 
     # child process should exit but parent will capture pdb.BdbQuit
-    child.expect("\(Pdb\+\+\)")
+    child.expect(r"\(Pdb\+\+\)")
 
     before = str(child.before.decode())
     assert "RemoteActorError: ('breakpoint_forever'" in before
@@ -178,3 +193,65 @@ def test_subactor_breakpoint(spawn):
     before = str(child.before.decode())
     assert "RemoteActorError: ('breakpoint_forever'" in before
     assert 'bdb.BdbQuit' in before
+
+
+def test_multi_subactors(spawn):
+    """Multiple subactors, both erroring and breakpointing as well as
+    a nested subactor erroring.
+    """
+    child = spawn(r'multi_subactors')
+
+    # scan for the pdbpp prompt
+    child.expect(r"\(Pdb\+\+\)")
+
+    before = str(child.before.decode())
+    assert "Attaching pdb to actor: ('bp_forever'" in before
+
+    # do some "next" commands to demonstrate recurrent breakpoint
+    # entries
+    for _ in range(10):
+        child.sendline('next')
+        child.expect(r"\(Pdb\+\+\)")
+
+    # continue to next error
+    child.sendline('c')
+
+    # first name_error failure
+    child.expect(r"\(Pdb\+\+\)")
+    before = str(child.before.decode())
+    assert "NameError" in before
+
+    # continue again
+    child.sendline('c')
+
+    # 2nd name_error failure
+    child.expect(r"\(Pdb\+\+\)")
+    before = str(child.before.decode())
+    assert "NameError" in before
+
+    # breakpoint loop should re-engage
+    child.sendline('c')
+    child.expect(r"\(Pdb\+\+\)")
+    before = str(child.before.decode())
+    assert "Attaching pdb to actor: ('bp_forever'" in before
+
+    # now run some "continues" to show re-entries
+    for _ in range(5):
+        child.sendline('c')
+        child.expect(r"\(Pdb\+\+\)")
+
+    # quit the loop and expect parent to attach
+    child.sendline('q')
+    child.expect(r"\(Pdb\+\+\)")
+    before = str(child.before.decode())
+    assert "Attaching to pdb in crashed actor: ('arbiter'" in before
+    assert "RemoteActorError: ('bp_forever'" in before
+    assert 'bdb.BdbQuit' in before
+
+    # process should exit
+    child.sendline('c')
+    child.expect(pexpect.EOF)
+
+    before = str(child.before.decode())
+    assert "RemoteActorError: ('bp_forever'" in before
+    assert 'bdb.BdbQuit' in before

From 83a45119e9e6d103c57dedc6af225b9a9968dbca Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sun, 4 Oct 2020 17:58:41 -0400
Subject: [PATCH 32/61] Add "root mailbox" contact info passing

Every subactor in the tree now receives the socket (or whatever the
mailbox type ends up being) during startup and can call the new
`tractor._discovery.get_root()` function to get a portal to the current
root actor in their tree. The main reason for adding this atm is to
support nested child actors gaining access to the root's tty lock for
debugging.

Also, when a channel disconnects from a message loop, might as well kill
all its rpc tasks.
---
 tractor/_actor.py     | 31 +++++++++++++++++++++++++------
 tractor/_debug.py     | 18 ++++++++++--------
 tractor/_discovery.py | 12 +++++++++++-
 tractor/_state.py     |  1 +
 4 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/tractor/_actor.py b/tractor/_actor.py
index 6517ed334..b79106f67 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -539,7 +539,9 @@ async def _process_messages(
                         f"Waiting on next msg for {chan} from {chan.uid}")
                 else:
                     # channel disconnect
-                    log.debug(f"{chan} from {chan.uid} disconnected")
+                    log.debug(
+                        f"{chan} from {chan.uid} disconnected, cancelling all rpc tasks")
+                    self.cancel_rpc_tasks(chan)
 
         except trio.ClosedResourceError:
             log.error(f"{chan} form {chan.uid} broke")
@@ -676,6 +678,9 @@ async def _async_main(
                             accept_port=port
                         )
                     )
+                    accept_addr = self.accept_addr
+                    if _state._runtime_vars['_is_root']:
+                        _state._runtime_vars['_root_mailbox'] = accept_addr
 
                     # Register with the arbiter if we're told its addr
                     log.debug(f"Registering {self} for role `{self.name}`")
@@ -686,7 +691,7 @@ async def _async_main(
                             'self',
                             'register_actor',
                             uid=self.uid,
-                            sockaddr=self.accept_addr,
+                            sockaddr=accept_addr,
                         )
 
                     registered_with_arbiter = True
@@ -895,15 +900,23 @@ async def _cancel_task(self, cid, chan):
             f"Sucessfully cancelled task:\ncid: {cid}\nfunc: {func}\n"
             f"peer: {chan.uid}\n")
 
-    async def cancel_rpc_tasks(self) -> None:
+    async def cancel_rpc_tasks(
+        self,
+        only_chan: Optional[Channel] = None,
+    ) -> None:
         """Cancel all existing RPC responder tasks using the cancel scope
         registered for each.
         """
         tasks = self._rpc_tasks
         log.info(f"Cancelling all {len(tasks)} rpc tasks:\n{tasks} ")
         for (chan, cid) in tasks.copy():
+            if only_chan is not None:
+                if only_chan != chan:
+                    continue
+
             # TODO: this should really done in a nursery batch
             await self._cancel_task(cid, chan)
+
         log.info(
             f"Waiting for remaining rpc tasks to complete {tasks}")
         await self._ongoing_rpc_tasks.wait()
@@ -1058,9 +1071,15 @@ async def _start_actor(
         try:
             result = await main()
         except (Exception, trio.MultiError) as err:
-            log.exception("Actor crashed:")
-            await _debug._maybe_enter_pm(err)
-            raise
+            try:
+                log.exception("Actor crashed:")
+                await _debug._maybe_enter_pm(err)
+
+                raise
+
+            finally:
+                actor._service_n.cancel_scope.cancel()
+                await actor.cancel()
 
         # XXX: the actor is cancelled when this context is complete
         # given that there are no more active peer channels connected
diff --git a/tractor/_debug.py b/tractor/_debug.py
index a0a5fc6d7..2e1c10726 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -15,6 +15,7 @@
 
 from .log import get_logger
 from . import _state
+from ._discovery import get_root
 
 try:
     # wtf: only exported when installed in dev mode?
@@ -190,11 +191,7 @@ async def wait_for_parent_stdin_hijack(
         with trio.CancelScope() as cs:
             _debugger_request_cs = cs
             try:
-                async with tractor._portal.open_portal(
-                    actor._parent_chan,
-                    start_msg_loop=False,
-                    # shield=True,
-                ) as portal:
+                async with get_root() as portal:
                     with trio.fail_after(.5):
                         agen = await portal.run(
                             'tractor._debug',
@@ -262,9 +259,14 @@ async def _bp():
             async def _lock(
                 task_status=trio.TASK_STATUS_IGNORED
             ):
-                async with _acquire_debug_lock():
-                    task_status.started()
-                    await do_unlock.wait()
+                try:
+                    async with _acquire_debug_lock():
+                        task_status.started()
+                        await do_unlock.wait()
+                finally:
+                    global _in_debug
+                    _in_debug = False
+                    log.debug(f"{actor} released tty lock")
 
             await actor._service_n.start(_lock)
 
diff --git a/tractor/_discovery.py b/tractor/_discovery.py
index 0407896bc..13da85d02 100644
--- a/tractor/_discovery.py
+++ b/tractor/_discovery.py
@@ -11,7 +11,7 @@
     open_portal,
     LocalPortal,
 )
-from ._state import current_actor
+from ._state import current_actor, _runtime_vars
 
 
 @asynccontextmanager
@@ -37,6 +37,16 @@ async def get_arbiter(
                 yield arb_portal
 
 
+@asynccontextmanager
+async def get_root(
+**kwargs,
+) -> typing.AsyncGenerator[Union[Portal, LocalPortal], None]:
+    host, port = _runtime_vars['_root_mailbox']
+    async with _connect_chan(host, port) as chan:
+        async with open_portal(chan, **kwargs) as portal:
+            yield portal
+
+
 @asynccontextmanager
 async def find_actor(
     name: str,
diff --git a/tractor/_state.py b/tractor/_state.py
index 2a18a4ca4..2728b7813 100644
--- a/tractor/_state.py
+++ b/tractor/_state.py
@@ -11,6 +11,7 @@
 _runtime_vars = {
     '_debug_mode': False,
     '_is_root': False,
+    '_root_mailbox': (None, None)
 }
 
 

From 31c1a32d58c6c2252bc788fb26815dbbb7ae1da1 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Sun, 4 Oct 2020 19:39:00 -0400
Subject: [PATCH 33/61] Add re-entrant root breakpoint test; demonstrates a
 bug.

---
 .../root_actor_breakpoint_forever.py          | 11 +++++++++
 tests/test_debugger.py                        | 23 ++++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 examples/debugging/root_actor_breakpoint_forever.py

diff --git a/examples/debugging/root_actor_breakpoint_forever.py b/examples/debugging/root_actor_breakpoint_forever.py
new file mode 100644
index 000000000..0332ab6af
--- /dev/null
+++ b/examples/debugging/root_actor_breakpoint_forever.py
@@ -0,0 +1,11 @@
+import tractor
+
+
+async def main():
+
+    while True:
+        await tractor.breakpoint()
+
+
+if __name__ == '__main__':
+    tractor.run(main, debug_mode=True)
diff --git a/tests/test_debugger.py b/tests/test_debugger.py
index de76247a8..0770a14c2 100644
--- a/tests/test_debugger.py
+++ b/tests/test_debugger.py
@@ -114,6 +114,28 @@ def test_root_actor_bp(spawn, user_in_out):
         assert expect_err_str in str(child.before)
 
 
+def test_root_actor_bp_forever(spawn):
+    "Re-enter a breakpoint from the root actor-task."
+    child = spawn('root_actor_breakpoint_forever')
+
+    # do some "next" commands to demonstrate recurrent breakpoint
+    # entries
+    for _ in range(10):
+        child.sendline('next')
+        child.expect(r"\(Pdb\+\+\)")
+
+    # do one continue which should trigger a new task to lock the tty
+    child.sendline('continue')
+    child.expect(r"\(Pdb\+\+\)")
+
+    # XXX: this previously caused a bug!
+    child.sendline('n')
+    child.expect(r"\(Pdb\+\+\)")
+
+    child.sendline('n')
+    child.expect(r"\(Pdb\+\+\)")
+
+
 def test_subactor_error(spawn):
     "Single subactor raising an error"
 
@@ -128,7 +150,6 @@ def test_subactor_error(spawn):
     # send user command
     # (in this case it's the same for 'continue' vs. 'quit')
     child.sendline('continue')
-    child.expect('\r\n')
 
     # the debugger should enter a second time in the nursery
     # creating actor

From d43d367153f731275de02f7336cb1276c5fbfe11 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Mon, 5 Oct 2020 10:07:06 -0400
Subject: [PATCH 34/61] Facepalm: tty locking from root doesn't require an
 extra task

---
 tractor/_actor.py    |  1 -
 tractor/_debug.py    | 80 ++++++++++++++++++++------------------------
 tractor/_trionics.py |  2 +-
 3 files changed, 38 insertions(+), 45 deletions(-)

diff --git a/tractor/_actor.py b/tractor/_actor.py
index b79106f67..2813f8d15 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -1078,7 +1078,6 @@ async def _start_actor(
                 raise
 
             finally:
-                actor._service_n.cancel_scope.cancel()
                 await actor.cancel()
 
         # XXX: the actor is cancelled when this context is complete
diff --git a/tractor/_debug.py b/tractor/_debug.py
index 2e1c10726..a00594ae7 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -4,9 +4,9 @@
 import bdb
 import sys
 from functools import partial
-from contextlib import asynccontextmanager, contextmanager
+from contextlib import asynccontextmanager
 from typing import Awaitable, Tuple, Optional, Callable
-import signal
+# import signal
 
 from async_generator import aclosing
 import tractor
@@ -16,6 +16,7 @@
 from .log import get_logger
 from . import _state
 from ._discovery import get_root
+from ._state import is_root_process
 
 try:
     # wtf: only exported when installed in dev mode?
@@ -35,7 +36,7 @@
 # placeholder for function to set a ``trio.Event`` on debugger exit
 _pdb_release_hook: Optional[Callable] = None
 
-# actor-wide flag
+# actor-wide variable pointing to current task name using debugger
 _in_debug = False
 
 # lock in root actor preventing multi-access to local tty
@@ -120,14 +121,15 @@ async def _acquire_debug_lock():
     """Acquire a actor local FIFO lock meant to mutex entry to a local
     debugger entry point to avoid tty clobbering by multiple processes.
     """
+    task_name = trio.lowlevel.current_task()
     try:
-        log.error("TTY BEING ACQUIRED")
+        log.error(f"TTY BEING ACQUIRED by {task_name}")
         await _debug_lock.acquire()
-        log.error("TTY lock acquired")
+        log.error(f"TTY lock acquired by {task_name}")
         yield
     finally:
         _debug_lock.release()
-        log.error("TTY lock released")
+        log.error(f"TTY lock released by {task_name}")
 
 
 def handler(signum, frame):
@@ -221,54 +223,46 @@ async def _bp():
         """Async breakpoint which schedules a parent stdio lock, and once complete
         enters the ``pdbpp`` debugging console.
         """
-        task_name = trio.lowlevel.current_task()
+        task_name = trio.lowlevel.current_task().name
 
         global _in_debug
-        if _in_debug :
-            if _in_debug == task_name:
-                # this task already has the lock and is
-                # likely recurrently entering a breakpoint
-                return
-
-            # if **this** actor is already in debug mode block here
-            # waiting for the control to be released - this allows
-            # support for recursive entries to `tractor.breakpoint()`
-            log.warning(
-                f"Actor {actor.uid} already has a debug lock, waiting...")
-            await do_unlock.wait()
-            await trio.sleep(0.1)
-
-        # assign unlock callback for debugger teardown hooks
-        global _pdb_release_hook
-        _pdb_release_hook = do_unlock.set
-
-        # mark local actor as "in debug mode" to avoid recurrent
-        # entries/requests to the root process
-        _in_debug = task_name
 
         # TODO: need a more robust check for the "root" actor
-        if actor._parent_chan:
+        if actor._parent_chan and not is_root_process():
+            if _in_debug:
+                if _in_debug == task_name:
+                    # this task already has the lock and is
+                    # likely recurrently entering a breakpoint
+                    return
+
+                # if **this** actor is already in debug mode block here
+                # waiting for the control to be released - this allows
+                # support for recursive entries to `tractor.breakpoint()`
+                log.warning(
+                    f"Actor {actor.uid} already has a debug lock, waiting...")
+                await do_unlock.wait()
+                await trio.sleep(0.1)
+
+            # assign unlock callback for debugger teardown hooks
+            global _pdb_release_hook
+            _pdb_release_hook = do_unlock.set
+
+            # mark local actor as "in debug mode" to avoid recurrent
+            # entries/requests to the root process
+            _in_debug = task_name
+
             # this **must** be awaited by the caller and is done using the
             # root nursery so that the debugger can continue to run without
             # being restricted by the scope of a new task nursery.
             await actor._service_n.start(wait_for_parent_stdin_hijack)
 
-        else:
+        elif is_root_process():
             # we also wait in the root-parent for any child that
             # may have the tty locked prior
-            async def _lock(
-                task_status=trio.TASK_STATUS_IGNORED
-            ):
-                try:
-                    async with _acquire_debug_lock():
-                        task_status.started()
-                        await do_unlock.wait()
-                finally:
-                    global _in_debug
-                    _in_debug = False
-                    log.debug(f"{actor} released tty lock")
-
-            await actor._service_n.start(_lock)
+            if _debug_lock.locked():  # root process already has it; ignore
+                return
+            await _debug_lock.acquire()
+            _pdb_release_hook = _debug_lock.release
 
         # block here one (at the appropriate frame *up* where
         # ``breakpoint()`` was awaited and begin handling stdio
diff --git a/tractor/_trionics.py b/tractor/_trionics.py
index d78e1106b..2d57ac67f 100644
--- a/tractor/_trionics.py
+++ b/tractor/_trionics.py
@@ -139,7 +139,7 @@ async def cancel(self, hard_kill: bool = False) -> None:
         """
         self.cancelled = True
 
-        log.critical("Cancelling nursery")
+        log.warning(f"Cancelling nursery in {self._actor.uid}")
         with trio.move_on_after(3) as cs:
             async with trio.open_nursery() as nursery:
                 for subactor, proc, portal in self._children.values():

From 371025947ab3ffa827ca247dfeecf62756e528d7 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Mon, 5 Oct 2020 11:38:39 -0400
Subject: [PATCH 35/61] Add a multi-subactor test where the root errors

---
 .../debugging/multi_subactor_root_errors.py   | 43 +++++++++++++++++
 examples/debugging/multi_subactors.py         | 16 +++----
 tests/test_debugger.py                        | 46 ++++++++++++++++---
 3 files changed, 89 insertions(+), 16 deletions(-)
 create mode 100644 examples/debugging/multi_subactor_root_errors.py

diff --git a/examples/debugging/multi_subactor_root_errors.py b/examples/debugging/multi_subactor_root_errors.py
new file mode 100644
index 000000000..f7a43724e
--- /dev/null
+++ b/examples/debugging/multi_subactor_root_errors.py
@@ -0,0 +1,43 @@
+import tractor
+
+
+async def name_error():
+    "Raise a ``NameError``"
+    getattr(doggypants)
+
+
+async def spawn_error():
+    """"A nested nursery that triggers another ``NameError``.
+    """
+    async with tractor.open_nursery() as n:
+        portal = await n.run_in_actor('name_error_1', name_error)
+        return await portal.result()
+
+
+async def main():
+    """The main ``tractor`` routine.
+
+    The process tree should look as approximately as follows:
+
+    -python examples/debugging/multi_subactors.py
+    |-python -m tractor._child --uid ('name_error', 'a7caf490 ...)
+    `-python -m tractor._child --uid ('spawn_error', '52ee14a5 ...)
+       `-python -m tractor._child --uid ('name_error', '3391222c ...)
+    """
+    async with tractor.open_nursery() as n:
+
+        # spawn both actors
+        portal = await n.run_in_actor('name_error', name_error)
+        portal1 = await n.run_in_actor('spawn_error', spawn_error)
+
+        # trigger a root actor error
+        assert 0
+
+        # attempt to collect results (which raises error in parent)
+        # still has some issues where the parent seems to get stuck
+        await portal.result()
+        await portal1.result()
+
+
+if __name__ == '__main__':
+    tractor.run(main, debug_mode=True)
diff --git a/examples/debugging/multi_subactors.py b/examples/debugging/multi_subactors.py
index 97de906e8..16ff22d86 100644
--- a/examples/debugging/multi_subactors.py
+++ b/examples/debugging/multi_subactors.py
@@ -35,16 +35,12 @@ async def main():
     """
     async with tractor.open_nursery() as n:
 
-        # spawn both actors
-        portal1 = await n.run_in_actor('bp_forever', breakpoint_forever)
-        portal = await n.run_in_actor('name_error', name_error)
-        portal2 = await n.run_in_actor('spawn_error', spawn_error)
-
-        # attempt to collect results (which raises error in parent)
-        # still has some issues where the parent seems to get stuck
-        # await portal.result()
-        # await portal1.result()
-        # await portal2.result()
+        # Spawn both actors, don't bother with collecting results
+        # (would result in a different debugger outcome due to parent's
+        # cancellation).
+        await n.run_in_actor('bp_forever', breakpoint_forever)
+        await n.run_in_actor('name_error', name_error)
+        await n.run_in_actor('spawn_error', spawn_error)
 
 
 if __name__ == '__main__':
diff --git a/tests/test_debugger.py b/tests/test_debugger.py
index 0770a14c2..8c877919f 100644
--- a/tests/test_debugger.py
+++ b/tests/test_debugger.py
@@ -1,5 +1,8 @@
 """
 That native debug better work!
+
+All these tests can be understood (somewhat) by running the equivalent
+`examples/debugging/` scripts manually.
 """
 from os import path
 
@@ -10,9 +13,8 @@
 
 
 # TODO:
-# - recurrent entry from single actor
 # - recurrent entry to breakpoint() from single actor *after* and an
-#   error
+#   error in another task?
 # - root error before child errors
 # - root error after child errors
 # - root error before child breakpoint
@@ -68,14 +70,14 @@ def test_root_actor_error(spawn, user_in_out):
     # scan for the pdbpp prompt
     child.expect(r"\(Pdb\+\+\)")
 
+    before = str(child.before.decode())
+
     # make sure expected logging and error arrives
-    assert 'TTY lock acquired' in str(child.before)
-    assert 'AssertionError' in str(child.before)
+    assert "Attaching to pdb in crashed actor: ('arbiter'" in before
+    assert 'AssertionError' in before
 
     # send user command
     child.sendline(user_input)
-    child.expect('\r\n')
-    child.expect('TTY lock released')
 
     # process should exit
     child.expect(pexpect.EOF)
@@ -276,3 +278,35 @@ def test_multi_subactors(spawn):
     before = str(child.before.decode())
     assert "RemoteActorError: ('bp_forever'" in before
     assert 'bdb.BdbQuit' in before
+
+
+def test_multi_subactors_root_errors(spawn):
+    """Multiple subactors, both erroring and breakpointing as well as
+    a nested subactor erroring.
+    """
+    child = spawn('multi_subactor_root_errors')
+
+    # scan for the pdbpp prompt
+    child.expect(r"\(Pdb\+\+\)")
+
+    # at most one subactor should attach before the root is cancelled
+    before = str(child.before.decode())
+    assert "NameError: name 'doggypants' is not defined" in before
+
+    # continue again
+    child.sendline('c')
+    child.expect(r"\(Pdb\+\+\)")
+
+    # should now get attached in root with assert error
+    before = str(child.before.decode())
+    # should have come just after the prior prompt
+    assert "Cancelling nursery in ('spawn_error'," in before
+    assert "Attaching to pdb in crashed actor: ('arbiter'" in before
+    assert "AssertionError" in before
+
+    # continue again
+    child.sendline('c')
+    child.expect(pexpect.EOF)
+
+    before = str(child.before.decode())
+    assert "AssertionError" in before

From 2b53c74b1c174723e8c1efe58196ae53d3c66709 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Mon, 5 Oct 2020 11:42:13 -0400
Subject: [PATCH 36/61] Change to relative conftest.py imports

---
 tests/test_cancellation.py  | 2 +-
 tests/test_discovery.py     | 2 +-
 tests/test_local.py         | 2 +-
 tests/test_multi_program.py | 2 +-
 tests/test_spawning.py      | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py
index e6d460c52..dbb05024d 100644
--- a/tests/test_cancellation.py
+++ b/tests/test_cancellation.py
@@ -11,7 +11,7 @@
 import trio
 import tractor
 
-from conftest import tractor_test, no_windows
+from .conftest import tractor_test, no_windows
 
 
 async def assert_err(delay=0):
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index 52a1dc831..7f071e24e 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -11,7 +11,7 @@
 import tractor
 import trio
 
-from conftest import tractor_test
+from .conftest import tractor_test
 
 
 @tractor_test
diff --git a/tests/test_local.py b/tests/test_local.py
index 0a594d0ef..c6a356997 100644
--- a/tests/test_local.py
+++ b/tests/test_local.py
@@ -7,7 +7,7 @@
 import trio
 import tractor
 
-from conftest import tractor_test
+from .conftest import tractor_test
 
 
 @pytest.mark.trio
diff --git a/tests/test_multi_program.py b/tests/test_multi_program.py
index 12ca3ef2d..2d08f2f5c 100644
--- a/tests/test_multi_program.py
+++ b/tests/test_multi_program.py
@@ -6,7 +6,7 @@
 
 import pytest
 import tractor
-from conftest import (
+from .conftest import (
     tractor_test,
     sig_prog,
     _INT_SIGNAL,
diff --git a/tests/test_spawning.py b/tests/test_spawning.py
index 220af3e1e..0800836cd 100644
--- a/tests/test_spawning.py
+++ b/tests/test_spawning.py
@@ -5,7 +5,7 @@
 import trio
 import tractor
 
-from conftest import tractor_test
+from .conftest import tractor_test
 
 statespace = {'doggy': 10, 'kitty': 4}
 

From abf8bb2813cb09364a293542506d17c55166acfc Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 6 Oct 2020 09:21:53 -0400
Subject: [PATCH 37/61] Add a deep nested error propagation test

---
 ...ed_subactors_error_up_through_nurseries.py | 58 +++++++++++++++++++
 .../debugging/multi_subactor_root_errors.py   |  4 +-
 tests/test_debugger.py                        | 21 +++++++
 3 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 examples/debugging/multi_nested_subactors_error_up_through_nurseries.py

diff --git a/examples/debugging/multi_nested_subactors_error_up_through_nurseries.py b/examples/debugging/multi_nested_subactors_error_up_through_nurseries.py
new file mode 100644
index 000000000..488fffaae
--- /dev/null
+++ b/examples/debugging/multi_nested_subactors_error_up_through_nurseries.py
@@ -0,0 +1,58 @@
+import tractor
+
+
+async def name_error():
+    "Raise a ``NameError``"
+    getattr(doggypants)
+
+
+async def breakpoint_forever():
+    "Indefinitely re-enter debugger in child actor."
+    while True:
+        await tractor.breakpoint()
+
+
+async def spawn_until(depth=0):
+    """"A nested nursery that triggers another ``NameError``.
+    """
+    async with tractor.open_nursery() as n:
+        if depth < 1:
+            # await n.run_in_actor('breakpoint_forever', breakpoint_forever)
+            await n.run_in_actor('name_error', name_error)
+        else:
+            depth -= 1
+            await n.run_in_actor(f'spawn_until_{depth}', spawn_until, depth=depth)
+
+
+async def main():
+    """The main ``tractor`` routine.
+
+    The process tree should look as approximately as follows when the debugger
+    first engages:
+
+    python examples/debugging/multi_nested_subactors_bp_forever.py
+    ├─ python -m tractor._child --uid ('spawner1', '7eab8462 ...)
+    │  └─ python -m tractor._child --uid ('spawn_until_3', 'afcba7a8 ...)
+    │     └─ python -m tractor._child --uid ('spawn_until_2', 'd2433d13 ...)
+    │        └─ python -m tractor._child --uid ('spawn_until_1', '1df589de ...)
+    │           └─ python -m tractor._child --uid ('spawn_until_0', '3720602b ...)
+    │
+    └─ python -m tractor._child --uid ('spawner0', '1d42012b ...)
+       └─ python -m tractor._child --uid ('spawn_until_2', '2877e155 ...)
+          └─ python -m tractor._child --uid ('spawn_until_1', '0502d786 ...)
+             └─ python -m tractor._child --uid ('spawn_until_0', 'de918e6d ...)
+
+    """
+    async with tractor.open_nursery() as n:
+
+        # spawn both actors
+        portal = await n.run_in_actor('spawner0', spawn_until, depth=3)
+        portal1 = await n.run_in_actor('spawner1', spawn_until, depth=4)
+
+        # gah still an issue here.
+        # await portal.result()
+        # await portal1.result()
+
+
+if __name__ == '__main__':
+    tractor.run(main, debug_mode=True)
diff --git a/examples/debugging/multi_subactor_root_errors.py b/examples/debugging/multi_subactor_root_errors.py
index f7a43724e..05f0fa713 100644
--- a/examples/debugging/multi_subactor_root_errors.py
+++ b/examples/debugging/multi_subactor_root_errors.py
@@ -19,8 +19,8 @@ async def main():
 
     The process tree should look as approximately as follows:
 
-    -python examples/debugging/multi_subactors.py
-    |-python -m tractor._child --uid ('name_error', 'a7caf490 ...)
+    python examples/debugging/multi_subactors.py
+    ├─ python -m tractor._child --uid ('name_error', 'a7caf490 ...)
     `-python -m tractor._child --uid ('spawn_error', '52ee14a5 ...)
        `-python -m tractor._child --uid ('name_error', '3391222c ...)
     """
diff --git a/tests/test_debugger.py b/tests/test_debugger.py
index 8c877919f..2540c2e12 100644
--- a/tests/test_debugger.py
+++ b/tests/test_debugger.py
@@ -310,3 +310,24 @@ def test_multi_subactors_root_errors(spawn):
 
     before = str(child.before.decode())
     assert "AssertionError" in before
+
+
+def test_multi_nested_subactors_error_through_nurseries(spawn):
+    """Verify deeply nested actors that error trigger debugger entries
+    at each level up the tree.
+    """
+
+    # TODO: inside this script there's still a bug where if the parent
+    # errors before an actor 2 levels lower has released the lock, the
+    # parent tries to cancel it but it's stuck in the debugger?
+
+    child = spawn('multi_nested_subactors_error_up_through_nurseries')
+
+    for _ in range(12):
+        child.expect(r"\(Pdb\+\+\)")
+        child.sendline('c')
+
+    child.expect(pexpect.EOF)
+
+    before = str(child.before.decode())
+    assert "NameError" in before

From 07112089d045a99ca3f58196cd5b4a4bea8c5b44 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Wed, 7 Oct 2020 05:53:26 -0400
Subject: [PATCH 38/61] Mention subactor uid during locking

---
 tractor/_debug.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/tractor/_debug.py b/tractor/_debug.py
index a00594ae7..04c963740 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -11,7 +11,7 @@
 from async_generator import aclosing
 import tractor
 import trio
-# from trio.testing import wait_all_tasks_blocked
+from trio.testing import wait_all_tasks_blocked
 
 from .log import get_logger
 from . import _state
@@ -117,19 +117,19 @@ def set_quit(self):
 
 
 @asynccontextmanager
-async def _acquire_debug_lock():
+async def _acquire_debug_lock(uid: Tuple[str, str]) -> None:
     """Acquire a actor local FIFO lock meant to mutex entry to a local
     debugger entry point to avoid tty clobbering by multiple processes.
     """
     task_name = trio.lowlevel.current_task()
     try:
-        log.error(f"TTY BEING ACQUIRED by {task_name}")
+        log.error(f"TTY BEING ACQUIRED by {task_name}:{uid}")
         await _debug_lock.acquire()
-        log.error(f"TTY lock acquired by {task_name}")
+        log.error(f"TTY lock acquired by {task_name}:{uid}")
         yield
     finally:
         _debug_lock.release()
-        log.error(f"TTY lock released by {task_name}")
+        log.error(f"TTY lock released by {task_name}:{uid}")
 
 
 def handler(signum, frame):
@@ -162,7 +162,7 @@ async def _hijack_stdin_relay_to_child(
     # TODO: when we get to true remote debugging
     # this will deliver stdin data
     log.warning(f"Actor {subactor_uid} is WAITING on stdin hijack lock")
-    async with _acquire_debug_lock():
+    async with _acquire_debug_lock(subactor_uid):
         log.warning(f"Actor {subactor_uid} ACQUIRED stdin hijack lock")
 
         # with _disable_sigint():
@@ -266,9 +266,10 @@ async def _bp():
 
         # block here one (at the appropriate frame *up* where
         # ``breakpoint()`` was awaited and begin handling stdio
-        log.debug("Entering the synchronous world of the debugger")
+        log.debug("Entering the synchronous world of pdb")
         debug_func(actor)
 
+
     # user code **must** await this!
     return _bp()
 
@@ -288,7 +289,7 @@ def _set_trace(actor):
 
 
 def _post_mortem(actor):
-    log.error(f"\nAttaching to pdb in crashed actor: {actor.uid}\n")
+    log.critical(f"\nAttaching to pdb in crashed actor: {actor.uid}\n")
     # custom Pdb post-mortem entry
     pdbpp.xpm(Pdb=PdbwTeardown)
 

From acb4cb0b2be7b293b6aee501809380411b886f9c Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 6 Oct 2020 22:39:20 -0400
Subject: [PATCH 39/61] Add test showing issue with child in tty lock when
 cancelled

---
 ...root_cancelled_but_child_is_in_tty_lock.py | 48 +++++++++++++++++++
 tests/test_debugger.py                        | 13 +++++
 2 files changed, 61 insertions(+)
 create mode 100644 examples/debugging/root_cancelled_but_child_is_in_tty_lock.py

diff --git a/examples/debugging/root_cancelled_but_child_is_in_tty_lock.py b/examples/debugging/root_cancelled_but_child_is_in_tty_lock.py
new file mode 100644
index 000000000..04ec7672c
--- /dev/null
+++ b/examples/debugging/root_cancelled_but_child_is_in_tty_lock.py
@@ -0,0 +1,48 @@
+import tractor
+
+
+async def name_error():
+    "Raise a ``NameError``"
+    getattr(doggypants)
+
+
+async def spawn_until(depth=0):
+    """"A nested nursery that triggers another ``NameError``.
+    """
+    async with tractor.open_nursery() as n:
+        if depth < 1:
+            # await n.run_in_actor('breakpoint_forever', breakpoint_forever)
+            await n.run_in_actor('name_error', name_error)
+        else:
+            depth -= 1
+            await n.run_in_actor(f'spawn_until_{depth}', spawn_until, depth=depth)
+
+
+async def main():
+    """The main ``tractor`` routine.
+
+    The process tree should look as approximately as follows when the debugger
+    first engages:
+
+    python examples/debugging/multi_nested_subactors_bp_forever.py
+    ├─ python -m tractor._child --uid ('spawner1', '7eab8462 ...)
+    │  └─ python -m tractor._child --uid ('spawn_until_0', '3720602b ...)
+    │     └─ python -m tractor._child --uid ('name_error', '505bf71d ...)
+    │
+    └─ python -m tractor._child --uid ('spawner0', '1d42012b ...)
+       └─ python -m tractor._child --uid ('name_error', '6c2733b8 ...)
+
+    """
+    async with tractor.open_nursery() as n:
+
+        # spawn both actors
+        portal = await n.run_in_actor('spawner0', spawn_until, depth=0)
+        portal1 = await n.run_in_actor('spawner1', spawn_until, depth=1)
+
+        # nursery cancellation should be triggered due to propagated error
+        await portal.result()
+        await portal1.result()
+
+
+if __name__ == '__main__':
+    tractor.run(main, debug_mode=True, loglevel='warning')
diff --git a/tests/test_debugger.py b/tests/test_debugger.py
index 2540c2e12..76113c967 100644
--- a/tests/test_debugger.py
+++ b/tests/test_debugger.py
@@ -331,3 +331,16 @@ def test_multi_nested_subactors_error_through_nurseries(spawn):
 
     before = str(child.before.decode())
     assert "NameError" in before
+
+
+def test_root_nursery_cancels_before_child_releases_tty_lock(spawn):
+    """Exemplifies a bug where the root sends a cancel message before a nested child
+    which has the tty lock (and is in pdb) doesn't cancel after exiting the debugger.
+    """
+    child = spawn('root_cancelled_but_child_is_in_tty_lock')
+
+    for _ in range(5):
+        child.expect(r"\(Pdb\+\+\)")
+        child.sendline('c')
+
+    # child.expect(pexpect.EOF)

From 0e344eead877c30bca4adad37db9bdb729694392 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Mon, 12 Oct 2020 08:56:49 -0400
Subject: [PATCH 40/61] Add a "cancel arrives during a sync sleep in child"
 test

This appears to demonstrate the same bug found in #156. It looks like
cancelling a subactor with a child, while that child is running sync code,
can result in the child never getting cancelled due to some strange
condition where the internal nurseries aren't being torn down as
expected when a `trio.Cancelled` is raised.
---
 tests/test_cancellation.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py
index dbb05024d..80232bfd0 100644
--- a/tests/test_cancellation.py
+++ b/tests/test_cancellation.py
@@ -413,3 +413,34 @@ async def main():
 
     with pytest.raises(KeyboardInterrupt):
         tractor.run(main)
+
+
+
+async def spin_for(period=3):
+    "Sync sleep."
+    time.sleep(period)
+
+
+async def spawn():
+    async with tractor.open_nursery() as tn:
+        portal = await tn.run_in_actor('sleeper', spin_for)
+
+
+def test_cancel_while_childs_child_in_sync_sleep(
+    loglevel,
+    start_method,
+    spawn_backend,
+):
+    """Verify that a child cancelled while executing sync code is torn
+    down even when that cancellation is triggered by the parent
+    2 nurseries "up".
+    """
+    async def main():
+        with trio.fail_after(2):
+            async with tractor.open_nursery() as tn:
+                portal = await tn.run_in_actor('spawn', spawn)
+                await trio.sleep(1)
+                assert 0
+
+    with pytest.raises(AssertionError):
+        tractor.run(main)

From 79c38b04e712259dee1137f0fd2aa93384b3dc80 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Mon, 12 Oct 2020 23:28:36 -0400
Subject: [PATCH 41/61] Report `trio.Cancelled` when exhausting portals..

For reliable remote cancellation we need to "report" `trio.Cancelled`s
(just like any other error) when exhausting a portal such that the
caller can make decisions about cancelling the respective actor if need
be.

Resolves #156
---
 tractor/_spawn.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tractor/_spawn.py b/tractor/_spawn.py
index 2a2f602db..206596750 100644
--- a/tractor/_spawn.py
+++ b/tractor/_spawn.py
@@ -112,6 +112,11 @@ async def exhaust_portal(
     except (Exception, trio.MultiError) as err:
         # we reraise in the parent task via a ``trio.MultiError``
         return err
+    except trio.Cancelled as err:
+        # lol, of course we need this too ;P
+        # TODO: merge with above?
+        log.warning(f"Cancelled result waiter for {portal.actor.uid}")
+        return err
     else:
         log.debug(f"Returning final result: {final}")
         return final

From a88a6ba7a3143d89d90748adad07377d206f3136 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 00:36:34 -0400
Subject: [PATCH 42/61] Add pattern matching to test

---
 tests/test_debugger.py | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/tests/test_debugger.py b/tests/test_debugger.py
index 76113c967..071febf7d 100644
--- a/tests/test_debugger.py
+++ b/tests/test_debugger.py
@@ -334,13 +334,30 @@ def test_multi_nested_subactors_error_through_nurseries(spawn):
 
 
 def test_root_nursery_cancels_before_child_releases_tty_lock(spawn):
-    """Exemplifies a bug where the root sends a cancel message before a nested child
-    which has the tty lock (and is in pdb) doesn't cancel after exiting the debugger.
+    """Test that when the root sends a cancel message before a nested
+    child has unblocked (which can happen when it has the tty lock and
+    is engaged in pdb) it is indeed cancelled after exiting the debugger.
     """
     child = spawn('root_cancelled_but_child_is_in_tty_lock')
 
-    for _ in range(5):
+    child.expect(r"\(Pdb\+\+\)")
+
+    before = str(child.before.decode())
+    assert "NameError: name 'doggypants' is not defined" in before
+    assert "tractor._exceptions.RemoteActorError: ('name_error'" not in before
+    child.sendline('c')
+
+    for _ in range(4):
         child.expect(r"\(Pdb\+\+\)")
+
+        before = str(child.before.decode())
+        assert "NameError: name 'doggypants' is not defined" in before
+
         child.sendline('c')
 
-    # child.expect(pexpect.EOF)
+
+    child.expect(pexpect.EOF)
+    before = str(child.before.decode())
+    assert "tractor._exceptions.RemoteActorError: ('spawner0'" in before
+    assert "tractor._exceptions.RemoteActorError: ('name_error'" in before
+    assert "NameError: name 'doggypants' is not defined" in before

From c41e5c8313ac73c78544152c9cd764a676cc3e2c Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 00:45:29 -0400
Subject: [PATCH 43/61] Fix missing await

---
 tractor/_actor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tractor/_actor.py b/tractor/_actor.py
index 2813f8d15..74c204955 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -541,7 +541,7 @@ async def _process_messages(
                     # channel disconnect
                     log.debug(
                         f"{chan} from {chan.uid} disconnected, cancelling all rpc tasks")
-                    self.cancel_rpc_tasks(chan)
+                    await self.cancel_rpc_tasks(chan)
 
         except trio.ClosedResourceError:
             log.error(f"{chan} form {chan.uid} broke")

From 1710b642a5e89ef97ba4aa4989f1bc1cebb24ab5 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 10:50:21 -0400
Subject: [PATCH 44/61] Make tests a package (for relative imports)

---
 tests/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tests/__init__.py

diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb

From c375a2d028e727b41d45cd65ebb2af836e541682 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 11:03:55 -0400
Subject: [PATCH 45/61] mypy fixes

---
 tractor/__init__.py |  5 ++++-
 tractor/_actor.py   |  2 +-
 tractor/_debug.py   | 12 ++++++------
 tractor/_state.py   |  6 +++---
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/tractor/__init__.py b/tractor/__init__.py
index b65811ac8..d3aedc416 100644
--- a/tractor/__init__.py
+++ b/tractor/__init__.py
@@ -17,6 +17,7 @@
 from ._actor import Actor, _start_actor, Arbiter
 from ._trionics import open_nursery
 from ._state import current_actor
+from . import _state
 from ._exceptions import RemoteActorError, ModuleNotExposed
 from ._debug import breakpoint, post_mortem
 from . import msg
@@ -24,6 +25,8 @@
 
 
 __all__ = [
+    'breakpoint',
+    'post_mortem',
     'current_actor',
     'find_actor',
     'get_arbiter',
@@ -51,7 +54,7 @@ async def _main(
     name: Optional[str] = None,
     start_method: Optional[str] = None,
     debug_mode: bool = False,
-    **kwargs: typing.Dict[str, typing.Any],
+    **kwargs,
 ) -> typing.Any:
     """Async entry point for ``tractor``.
     """
diff --git a/tractor/_actor.py b/tractor/_actor.py
index 74c204955..80a7dd524 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -249,7 +249,7 @@ def __init__(
         self._parent_chan: Optional[Channel] = None
         self._forkserver_info: Optional[
             Tuple[Any, Any, Any, Any, Any]] = None
-        self._actoruid2nursery: Dict[str, 'ActorNursery'] = {}  # noqa
+        self._actoruid2nursery: Dict[str, 'ActorNursery'] = {}  # type: ignore
 
     async def wait_for_peer(
         self, uid: Tuple[str, str]
diff --git a/tractor/_debug.py b/tractor/_debug.py
index 04c963740..29c0430e4 100644
--- a/tractor/_debug.py
+++ b/tractor/_debug.py
@@ -5,7 +5,7 @@
 import sys
 from functools import partial
 from contextlib import asynccontextmanager
-from typing import Awaitable, Tuple, Optional, Callable
+from typing import Awaitable, Tuple, Optional, Callable, AsyncIterator
 # import signal
 
 from async_generator import aclosing
@@ -22,9 +22,9 @@
     # wtf: only exported when installed in dev mode?
     import pdbpp
 except ImportError:
-    # pdbpp is installed in regular mode...
+    # pdbpp is installed in regular mode...it monkey patches stuff
     import pdb
-    assert pdb.xpm, "pdbpp is not installed?"
+    assert pdb.xpm, "pdbpp is not installed?"  # type: ignore
     pdbpp = pdb
 
 log = get_logger(__name__)
@@ -45,7 +45,7 @@
 # XXX: set by the current task waiting on the root tty lock
 # and must be cancelled if this actor is cancelled via message
 # otherwise deadlocks with the parent actor may ensure
-_debugger_request_cs: trio.CancelScope = None
+_debugger_request_cs: Optional[trio.CancelScope] = None
 
 
 class TractorConfig(pdbpp.DefaultConfig):
@@ -117,7 +117,7 @@ def set_quit(self):
 
 
 @asynccontextmanager
-async def _acquire_debug_lock(uid: Tuple[str, str]) -> None:
+async def _acquire_debug_lock(uid: Tuple[str, str]) -> AsyncIterator[None]:
     """Acquire a actor local FIFO lock meant to mutex entry to a local
     debugger entry point to avoid tty clobbering by multiple processes.
     """
@@ -158,7 +158,7 @@ def handler(signum, frame):
 
 async def _hijack_stdin_relay_to_child(
     subactor_uid: Tuple[str, str]
-) -> None:
+) -> AsyncIterator[str]:
     # TODO: when we get to true remote debugging
     # this will deliver stdin data
     log.warning(f"Actor {subactor_uid} is WAITING on stdin hijack lock")
diff --git a/tractor/_state.py b/tractor/_state.py
index 2728b7813..03ddf1390 100644
--- a/tractor/_state.py
+++ b/tractor/_state.py
@@ -1,14 +1,14 @@
 """
 Per process state
 """
-from typing import Optional
+from typing import Optional, Dict, Any
 from collections import Mapping
 import multiprocessing as mp
 
 import trio
 
 _current_actor: Optional['Actor'] = None  # type: ignore
-_runtime_vars = {
+_runtime_vars: Dict[str, Any] = {
     '_debug_mode': False,
     '_is_root': False,
     '_root_mailbox': (None, None)
@@ -54,7 +54,7 @@ def debug_mode() -> bool:
     """Bool determining if "debug mode" is on which enables
     remote subactor pdb entry on crashes.
     """
-    return _runtime_vars['_debug_mode']
+    return bool(_runtime_vars['_debug_mode'])
 
 
 def is_root_process() -> bool:

From 0ce6d2b55c3654ee4a40da71958cf657809c6f80 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 11:04:16 -0400
Subject: [PATCH 46/61] Add `pexpect` dep for debugger tests

---
 requirements-test.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements-test.txt b/requirements-test.txt
index e9dae3236..2e2d1b53b 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -3,3 +3,4 @@ pytest-trio
 pdbpp
 mypy
 trio_typing
+pexpect

From 573b8fef73fb515695093354fe0d470ab5182814 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 11:48:52 -0400
Subject: [PATCH 47/61] Add better actor cancellation tracking

Add `Actor._cancel_called` and `._cancel_complete` making it possible to
determine whether the actor has started the cancellation sequence and
whether that sequence has fully completed. This allows for blocking in
internal machinery tasks as necessary. Also, always trigger the end of
ongoing rpc tasks even if the last task errors; there's no guarantee the
trio cancellation semantics will give us a nice internal "state"
without this.
---
 tractor/_actor.py | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/tractor/_actor.py b/tractor/_actor.py
index 80a7dd524..f4b979579 100644
--- a/tractor/_actor.py
+++ b/tractor/_actor.py
@@ -152,10 +152,10 @@ async def _invoke(
             # cancel scope will not have been inserted yet
             log.warn(
                 f"Task {func} likely errored or cancelled before it started")
-
-        if not actor._rpc_tasks:
-            log.info("All RPC tasks have completed")
-            actor._ongoing_rpc_tasks.set()
+        finally:
+            if not actor._rpc_tasks:
+                log.info("All RPC tasks have completed")
+                actor._ongoing_rpc_tasks.set()
 
 
 def _get_mod_abspath(module):
@@ -198,7 +198,9 @@ def __init__(
         """
         self.name = name
         self.uid = (name, uid or str(uuid.uuid4()))
-        self._is_cancelled: bool = False
+
+        self._cancel_complete = trio.Event()
+        self._cancel_called: bool = False
 
         # retreive and store parent `__main__` data which
         # will be passed to children
@@ -531,7 +533,10 @@ async def _process_messages(
                     else:
                         # self.cancel() was called so kill this msg loop
                         # and break out into ``_async_main()``
-                        log.warning(f"{self.uid} was remotely cancelled")
+                        log.warning(
+                            f"{self.uid} was remotely cancelled; "
+                            "waiting on cancellation completion..")
+                        await self._cancel_complete.wait()
                         loop_cs.cancel()
                         break
 
@@ -540,8 +545,9 @@ async def _process_messages(
                 else:
                     # channel disconnect
                     log.debug(
-                        f"{chan} from {chan.uid} disconnected, cancelling all rpc tasks")
-                    await self.cancel_rpc_tasks(chan)
+                        f"{chan} for {chan.uid} disconnected, cancelling tasks"
+                    )
+                    self.cancel_rpc_tasks(chan)
 
         except trio.ClosedResourceError:
             log.error(f"{chan} form {chan.uid} broke")
@@ -833,7 +839,8 @@ async def cancel(self) -> bool:
               spawning new rpc tasks
             - return control the parent channel message loop
         """
-        self._is_cancelled = True
+        log.warning(f"{self.uid} is trying to cancel")
+        self._cancel_called = True
 
         # cancel all ongoing rpc tasks
         with trio.CancelScope(shield=True):
@@ -848,14 +855,16 @@ async def cancel(self) -> bool:
             # kill all ongoing tasks
             await self.cancel_rpc_tasks()
 
+            # cancel all rpc tasks permanently
+            if self._service_n:
+                self._service_n.cancel_scope.cancel()
+
             # stop channel server
             self.cancel_server()
             await self._server_down.wait()
 
-            # rekt all channel loops
-            if self._service_n:
-                self._service_n.cancel_scope.cancel()
-
+        log.warning(f"{self.uid} was successfully cancelled")
+        self._cancel_complete.set()
         return True
 
     # XXX: hard kill logic if needed?
@@ -895,6 +904,9 @@ async def _cancel_task(self, cid, chan):
 
         scope.cancel()
         # wait for _invoke to mark the task complete
+        log.debug(
+            f"Waiting on task to cancel:\ncid: {cid}\nfunc: {func}\n"
+            f"peer: {chan.uid}\n")
         await is_complete.wait()
         log.debug(
             f"Sucessfully cancelled task:\ncid: {cid}\nfunc: {func}\n"

From 08ff98963180e4c3aa8e7093a37b12b9d5916a16 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 11:59:18 -0400
Subject: [PATCH 48/61] Add some comments

---
 tractor/_trionics.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tractor/_trionics.py b/tractor/_trionics.py
index 2d57ac67f..aecf16e4b 100644
--- a/tractor/_trionics.py
+++ b/tractor/_trionics.py
@@ -230,7 +230,7 @@ async def open_nursery() -> typing.AsyncGenerator[ActorNursery, None]:
                     # after we yield upwards
                     yield anursery
                     log.debug(
-                        f"Waiting on subactors {anursery._children}"
+                        f"Waiting on subactors {anursery._children} "
                         "to complete"
                     )
                 except BaseException as err:
@@ -274,6 +274,7 @@ async def open_nursery() -> typing.AsyncGenerator[ActorNursery, None]:
 
                 # ria_nursery scope end
 
+        # XXX: do we need a `trio.Cancelled` catch here as well?
         except (Exception, trio.MultiError) as err:
             # If actor-local error was raised while waiting on
             # ".run_in_actor()" actors then we also want to cancel all
@@ -295,6 +296,8 @@ async def open_nursery() -> typing.AsyncGenerator[ActorNursery, None]:
                 if anursery._children:
                     with trio.CancelScope(shield=True):
                         await anursery.cancel()
+
+                # use `MultiError` as needed
                 if len(errors) > 1:
                     raise trio.MultiError(tuple(errors.values()))
                 else:

From 24ef9193343f746083a455347c3bb551d84b79ce Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 14:16:20 -0400
Subject: [PATCH 49/61] Skip sync sleep test on mp backend

---
 tests/test_cancellation.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py
index 80232bfd0..aace021d7 100644
--- a/tests/test_cancellation.py
+++ b/tests/test_cancellation.py
@@ -435,6 +435,9 @@ def test_cancel_while_childs_child_in_sync_sleep(
     down even when that cancellation is triggered by the parent
     2 nurseries "up".
     """
+    if start_method == 'forkserver':
+        pytest.skip("Forksever sux hard at resuming from sync sleep...")
+
     async def main():
         with trio.fail_after(2):
             async with tractor.open_nursery() as tn:

From ba52de79e1a3af42ebe0107542982efd27e043df Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 14:20:19 -0400
Subject: [PATCH 50/61] Skip quad ex on local mp tests as well

---
 tests/test_streaming.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/test_streaming.py b/tests/test_streaming.py
index 17ab9ea15..98855e0f8 100644
--- a/tests/test_streaming.py
+++ b/tests/test_streaming.py
@@ -203,9 +203,8 @@ async def cancel_after(wait):
 
 @pytest.fixture(scope='module')
 def time_quad_ex(arb_addr, ci_env, spawn_backend):
-    if ci_env and spawn_backend == 'mp' and (platform.system() != 'Windows'):
-        """no idea, but the travis and github actions, mp *nix runs are
-        flaking out here often
+    if spawn_backend == 'mp' and (platform.system() != 'Windows'):
+        """no idea but the  mp *nix runs are flaking out here often...
         """
         pytest.skip("Test is too flaky on mp in CI")
 

From e3c26943baad06b2142d9017ad427833ac68addf Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 14:20:44 -0400
Subject: [PATCH 51/61] Support debug mode only on the trio backend

---
 tractor/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tractor/__init__.py b/tractor/__init__.py
index d3aedc416..5ed843f22 100644
--- a/tractor/__init__.py
+++ b/tractor/__init__.py
@@ -63,11 +63,13 @@ async def _main(
     if start_method is not None:
         _spawn.try_set_start_method(start_method)
 
-    if debug_mode:
+    if debug_mode and _spawn._spawn_method == 'trio':
         _state._runtime_vars['_debug_mode'] = True
         # expose internal debug module to every actor allowing
         # for use of ``await tractor.breakpoint()``
         kwargs.setdefault('rpc_module_paths', []).append('tractor._debug')
+    elif debug_mode:
+        raise RuntimeError("Debug mode is only supported for the `trio` backend!")
 
     main = partial(async_fn, *args)
 

From 666966097ab26d02648f27d676dddba92dbac711 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 14:42:02 -0400
Subject: [PATCH 52/61] Revert "Change to relative conftest.py imports"

This reverts commit 2b53c74b1c174723e8c1efe58196ae53d3c66709.
---
 tests/test_cancellation.py  | 2 +-
 tests/test_discovery.py     | 2 +-
 tests/test_local.py         | 2 +-
 tests/test_multi_program.py | 2 +-
 tests/test_spawning.py      | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py
index aace021d7..45452b597 100644
--- a/tests/test_cancellation.py
+++ b/tests/test_cancellation.py
@@ -11,7 +11,7 @@
 import trio
 import tractor
 
-from .conftest import tractor_test, no_windows
+from conftest import tractor_test, no_windows
 
 
 async def assert_err(delay=0):
diff --git a/tests/test_discovery.py b/tests/test_discovery.py
index 7f071e24e..52a1dc831 100644
--- a/tests/test_discovery.py
+++ b/tests/test_discovery.py
@@ -11,7 +11,7 @@
 import tractor
 import trio
 
-from .conftest import tractor_test
+from conftest import tractor_test
 
 
 @tractor_test
diff --git a/tests/test_local.py b/tests/test_local.py
index c6a356997..0a594d0ef 100644
--- a/tests/test_local.py
+++ b/tests/test_local.py
@@ -7,7 +7,7 @@
 import trio
 import tractor
 
-from .conftest import tractor_test
+from conftest import tractor_test
 
 
 @pytest.mark.trio
diff --git a/tests/test_multi_program.py b/tests/test_multi_program.py
index 2d08f2f5c..12ca3ef2d 100644
--- a/tests/test_multi_program.py
+++ b/tests/test_multi_program.py
@@ -6,7 +6,7 @@
 
 import pytest
 import tractor
-from .conftest import (
+from conftest import (
     tractor_test,
     sig_prog,
     _INT_SIGNAL,
diff --git a/tests/test_spawning.py b/tests/test_spawning.py
index 0800836cd..220af3e1e 100644
--- a/tests/test_spawning.py
+++ b/tests/test_spawning.py
@@ -5,7 +5,7 @@
 import trio
 import tractor
 
-from .conftest import tractor_test
+from conftest import tractor_test
 
 statespace = {'doggy': 10, 'kitty': 4}
 

From a49deb46f142875e6fcaede43004b7ea83f690b5 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 14:42:16 -0400
Subject: [PATCH 53/61] Revert "Make tests a package (for relative imports)"

This reverts commit 1710b642a5e89ef97ba4aa4989f1bc1cebb24ab5.
---
 tests/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 tests/__init__.py

diff --git a/tests/__init__.py b/tests/__init__.py
deleted file mode 100644
index e69de29bb..000000000

From a934eb063c11e3b6e55a605ee0db62b381039415 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 14:49:31 -0400
Subject: [PATCH 54/61] Factor `repodir()` helper into conftest.py

---
 tests/conftest.py           | 10 ++++++++++
 tests/test_debugger.py      |  2 +-
 tests/test_docs_examples.py | 10 +---------
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 6a9f938b2..347cc7de9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -39,6 +39,16 @@
 )
 
 
+def repodir():
+    """Return the abspath to the repo directory.
+    """
+    dirname = os.path.dirname
+    dirpath = os.path.abspath(
+        dirname(dirname(os.path.realpath(__file__)))
+        )
+    return dirpath
+
+
 def pytest_addoption(parser):
     parser.addoption(
         "--ll", action="store", dest='loglevel',
diff --git a/tests/test_debugger.py b/tests/test_debugger.py
index 071febf7d..fe357e8ce 100644
--- a/tests/test_debugger.py
+++ b/tests/test_debugger.py
@@ -9,7 +9,7 @@
 import pytest
 import pexpect
 
-from .test_docs_examples import repodir
+from conftest import repodir
 
 
 # TODO:
diff --git a/tests/test_docs_examples.py b/tests/test_docs_examples.py
index 67b34ee57..73e6a6907 100644
--- a/tests/test_docs_examples.py
+++ b/tests/test_docs_examples.py
@@ -11,15 +11,7 @@
 
 import pytest
 
-
-def repodir():
-    """Return the abspath to the repo directory.
-    """
-    dirname = os.path.dirname
-    dirpath = os.path.abspath(
-        dirname(dirname(os.path.realpath(__file__)))
-        )
-    return dirpath
+from conftest import repodir
 
 
 def examples_dir():

From fd59f4ad1692511299b8bc066b4bf728a862756e Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 14:56:26 -0400
Subject: [PATCH 55/61] On windows .spawn does not exist?

---
 tests/test_debugger.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test_debugger.py b/tests/test_debugger.py
index fe357e8ce..227e8f5a6 100644
--- a/tests/test_debugger.py
+++ b/tests/test_debugger.py
@@ -41,7 +41,7 @@ def mk_cmd(ex_name: str) -> str:
 def spawn(
     testdir,
     arb_addr,
-) -> pexpect.spawn:
+) -> 'pexpect.spawn':
 
     def _spawn(cmd):
         return testdir.spawn(
@@ -355,7 +355,6 @@ def test_root_nursery_cancels_before_child_releases_tty_lock(spawn):
 
         child.sendline('c')
 
-
     child.expect(pexpect.EOF)
     before = str(child.before.decode())
     assert "tractor._exceptions.RemoteActorError: ('spawner0'" in before

From 15edcc622d9c9d6a64939d452462e92c798ec15a Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 15:13:24 -0400
Subject: [PATCH 56/61] Skip it on windows too

---
 tests/test_streaming.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_streaming.py b/tests/test_streaming.py
index 98855e0f8..919b2787b 100644
--- a/tests/test_streaming.py
+++ b/tests/test_streaming.py
@@ -203,7 +203,7 @@ async def cancel_after(wait):
 
 @pytest.fixture(scope='module')
 def time_quad_ex(arb_addr, ci_env, spawn_backend):
-    if spawn_backend == 'mp' and (platform.system() != 'Windows'):
+    if spawn_backend == 'mp':
         """no idea but the  mp *nix runs are flaking out here often...
         """
         pytest.skip("Test is too flaky on mp in CI")

From 1b6ee2ecf670007121e7ae79995d4aec26c1cae1 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 15:26:14 -0400
Subject: [PATCH 57/61] Skip sync sleep test on windows

---
 tests/test_cancellation.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_cancellation.py b/tests/test_cancellation.py
index 45452b597..9e3c9f934 100644
--- a/tests/test_cancellation.py
+++ b/tests/test_cancellation.py
@@ -426,6 +426,7 @@ async def spawn():
         portal = await tn.run_in_actor('sleeper', spin_for)
 
 
+@no_windows
 def test_cancel_while_childs_child_in_sync_sleep(
     loglevel,
     start_method,

From 0177268f13bdd49391b2dde9f329133fd4c2a3de Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 16:30:19 -0400
Subject: [PATCH 58/61] Report on skipped tests

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 689e36450..3e271ffad 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -38,4 +38,4 @@ jobs:
       - name: Install dependencies
         run: pip install -U . -r requirements-test.txt -r requirements-docs.txt --upgrade-strategy eager
       - name: Run tests
-        run: pytest tests/ --spawn-backend=${{ matrix.spawn_backend }}
\ No newline at end of file
+        run: pytest tests/ --spawn-backend=${{ matrix.spawn_backend }} -rs

From 1c25f25ab0cc9f85512c7db1e5546fc5900ab785 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 21:27:17 -0400
Subject: [PATCH 59/61] Drop travisCI; it's slower and has worse windows
 support.

---
 .travis.yml | 68 -----------------------------------------------------
 1 file changed, 68 deletions(-)
 delete mode 100644 .travis.yml

diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 484f64d7e..000000000
--- a/.travis.yml
+++ /dev/null
@@ -1,68 +0,0 @@
-language: python
-dist: xenial
-sudo: required
-
-matrix:
-    include:
-        - name: "Windows, Python Latest: multiprocessing"
-          os: windows
-          language: sh
-          python: 3.x  # only works on linux
-          env: SPAWN_BACKEND="mp"
-          before_install:
-              - choco install python3 --params "/InstallDir:C:\\Python"
-              - export PATH="/c/Python:/c/Python/Scripts:$PATH"
-              - python -m pip install --upgrade pip wheel
-
-        - name: "Windows, Python Latest: trio"
-          os: windows
-          language: sh
-          python: 3.x  # only works on linux
-          env: SPAWN_BACKEND="trio"
-          before_install:
-              - choco install python3 --params "/InstallDir:C:\\Python"
-              - export PATH="/c/Python:/c/Python/Scripts:$PATH"
-              - python -m pip install --upgrade pip wheel
-
-        - name: "Windows, Python 3.7: multiprocessing"
-          os: windows
-          python: 3.7  # only works on linux
-          env: SPAWN_BACKEND="mp"
-          language: sh
-          before_install:
-              - choco install python3 --version 3.7.4 --params "/InstallDir:C:\\Python"
-              - export PATH="/c/Python:/c/Python/Scripts:$PATH"
-              - python -m pip install --upgrade pip wheel
-
-        - name: "Windows, Python 3.7: trio"
-          os: windows
-          python: 3.7  # only works on linux
-          env: SPAWN_BACKEND="trio"
-          language: sh
-          before_install:
-              - choco install python3 --version 3.7.4 --params "/InstallDir:C:\\Python"
-              - export PATH="/c/Python:/c/Python/Scripts:$PATH"
-              - python -m pip install --upgrade pip wheel
-
-        - name: "Python 3.7: multiprocessing"
-          python: 3.7  # this works for Linux but is ignored on macOS or Windows
-          env: SPAWN_BACKEND="mp"
-        - name: "Python 3.7: trio"
-          python: 3.7  # this works for Linux but is ignored on macOS or Windows
-          env: SPAWN_BACKEND="trio"
-
-        - name: "Python 3.8: multiprocessing"
-          python: 3.8  # this works for Linux but is ignored on macOS or Windows
-          env: SPAWN_BACKEND="mp"
-        - name: "Python 3.8: trio"
-          python: 3.8  # this works for Linux but is ignored on macOS or Windows
-          env: SPAWN_BACKEND="trio"
-
-install:
-    - cd $TRAVIS_BUILD_DIR
-    - pip install -U pip
-    - pip install -U . -r requirements-test.txt -r requirements-docs.txt --upgrade-strategy eager
-
-script:
-    - mypy tractor/ --ignore-missing-imports
-    - pytest tests/  --spawn-backend=${SPAWN_BACKEND}

From bba47e4c7a2a63a5bcc8db55c48ed85f06c76b65 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Tue, 13 Oct 2020 21:33:22 -0400
Subject: [PATCH 60/61] Add gh actions badge

---
 README.rst | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index 168f2dade..70f9df381 100644
--- a/README.rst
+++ b/README.rst
@@ -2,7 +2,8 @@ tractor
 =======
 A `structured concurrent`_, async-native "`actor model`_" built on trio_ and multiprocessing_.
 
-|travis| |docs|
+|gh_actions|
+|docs|
 
 .. _actor model: https://en.wikipedia.org/wiki/Actor_model
 .. _trio: https://github.com/python-trio/trio
@@ -54,8 +55,8 @@ say hi, please feel free to ping me on the `trio gitter channel`_!
 .. _trio gitter channel: https://gitter.im/python-trio/general
 
 
-.. |travis| image:: https://img.shields.io/travis/goodboy/tractor/master.svg
-    :target: https://travis-ci.org/goodboy/tractor
+.. |gh_actions| image:: https://img.shields.io/endpoint.svg?url=https%3A%2F%2Factions-badge.atrox.dev%2Fgoodboy%2Ftractor%2Fbadge&style=popout-square
+    :target: https://actions-badge.atrox.dev/goodboy/tractor/goto
 .. |docs| image:: https://readthedocs.org/projects/tractor/badge/?version=latest
     :target: https://tractor.readthedocs.io/en/latest/?badge=latest
     :alt: Documentation Status

From 61a8df358c74c8f06fd375345134248274949346 Mon Sep 17 00:00:00 2001
From: Tyler Goodlet <jgbt@protonmail.com>
Date: Wed, 14 Oct 2020 09:06:40 -0400
Subject: [PATCH 61/61] Comments tweak

---
 tests/test_debugger.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/test_debugger.py b/tests/test_debugger.py
index 227e8f5a6..5ea4f7123 100644
--- a/tests/test_debugger.py
+++ b/tests/test_debugger.py
@@ -3,6 +3,8 @@
 
 All these tests can be understood (somewhat) by running the equivalent
 `examples/debugging/` scripts manually.
+
+TODO: None of these tests have been run successfully on windows yet.
 """
 from os import path
 
@@ -12,7 +14,7 @@
 from conftest import repodir
 
 
-# TODO:
+# TODO: The next great debugger audit could be done by you!
 # - recurrent entry to breakpoint() from single actor *after* and an
 #   error in another task?
 # - root error before child errors
@@ -314,12 +316,14 @@ def test_multi_subactors_root_errors(spawn):
 
 def test_multi_nested_subactors_error_through_nurseries(spawn):
     """Verify deeply nested actors that error trigger debugger entries
-    at each level up the tree.
+    at each actor nursery (level) all the way up the tree.
     """
 
-    # TODO: inside this script there's still a bug where if the parent
-    # errors before a 2 levels lower actor has released the lock, the
-    # parent tries to cancel it but it's stuck in the debugger?
+    # NOTE: previously, inside this script was a bug where if the
+    # parent errors before a 2-levels-lower actor has released the lock,
+    # the parent tries to cancel it but it's stuck in the debugger?
+    # A test (below) has now been added to explicitly verify this is
+    # fixed.
 
     child = spawn('multi_nested_subactors_error_up_through_nurseries')