[tune] Add TuneController [no_early_kickoff] #33499

Merged 67 commits on Mar 25, 2023

Changes from all commits (67 commits)
c942c1d
Initial refactoring
Mar 11, 2023
1381d47
Merge remote-tracking branch 'upstream/master' into tune/trial-runner…
Mar 20, 2023
a7d154a
Pending trial list
Mar 20, 2023
3230fbd
Reuse
Mar 20, 2023
5ffbf1e
Split TuneController
Mar 20, 2023
bd07f50
whitelist
Mar 20, 2023
27e4cd9
Undo changes to example
Mar 20, 2023
4eb0311
Set trial executor early
Mar 20, 2023
0cda6a0
lru cache
Mar 20, 2023
0b433db
Merge remote-tracking branch 'upstream/master' into tune/trial-runner…
Mar 20, 2023
e88f06d
Rebase
Mar 20, 2023
91a79fe
revert example changes
Mar 20, 2023
f25c089
Merge remote-tracking branch 'upstream/master' into tune/tune-controller
Mar 21, 2023
734d1fe
Enable per default to run examples
krfricke Mar 21, 2023
628bad3
Add initial test
krfricke Mar 21, 2023
65dcff5
Fix checkpointing
krfricke Mar 21, 2023
d39600c
error handling
Mar 21, 2023
1d7df82
Fix reuse
Mar 21, 2023
1a64fb3
fix set
Mar 21, 2023
0822463
Fix controller init
Mar 21, 2023
914d74c
Merge remote-tracking branch 'upstream/master' into tune/tune-controller
Mar 21, 2023
6a93ab2
Merge remote-tracking branch 'upstream/master' into tune/tune-controller
Mar 21, 2023
d07f7b7
[no_early_kickoff] Add comments
Mar 21, 2023
58f5224
Merge remote-tracking branch 'upstream/master' into tune/tune-controller
Mar 22, 2023
a70c48c
Fix cleanup
Mar 22, 2023
199898d
Do not schedule new actor if already exists
Mar 22, 2023
8eab356
Safeguard
Mar 22, 2023
c11145f
chdir per default
Mar 22, 2023
6cfc830
Trigger on trial start callback
Mar 22, 2023
499df31
Merge remote-tracking branch 'upstream/master' into tune/tune-controller
Mar 22, 2023
00d1dcf
Fix dynamic resource update, hyperband
Mar 22, 2023
20bfa63
Catch actor creation failures
Mar 22, 2023
448dee9
Actor removal test
Mar 22, 2023
10ec386
No logging
Mar 22, 2023
24ef8f1
Only remove trials that are not stopping
Mar 22, 2023
241f0ca
failed actor IDs
Mar 22, 2023
6315ea3
Disable callbacks on actor cache clear
Mar 23, 2023
8a144d5
graceful trainable shutdown
Mar 23, 2023
b103d72
cleanup
krfricke Mar 23, 2023
59ec736
Cleanup on exception
krfricke Mar 23, 2023
8bf3712
Clear futures on actor stop
krfricke Mar 23, 2023
50e68f3
Fix tests
Mar 23, 2023
8f40fae
failed actor is not started
Mar 23, 2023
ce5a8a8
Merge remote-tracking branch 'upstream/master' into tune/tune-controller
Mar 23, 2023
a49c114
Merge remote-tracking branch 'upstream/master' into tune/tune-controller
Mar 23, 2023
3d7f6c7
[no_early_kickoff] Fix fixed resource manager cleanup
Mar 23, 2023
c86d670
[no_early_kickoff] fix export
Mar 23, 2023
5716d4e
[no_early_kickoff] Graceful trial stop
Mar 23, 2023
cdf0233
[no_early_kickoff] Add separate test jobs for new execution path
Mar 23, 2023
dd8e2c8
[no_early_kickoff] Progress reporter fixes
Mar 23, 2023
e65ac6c
[no_early_kickoff] Skip cluster searcher test on new execution backend
Mar 24, 2023
0bd1ae0
Custom stop future, removing actors
Mar 24, 2023
9bfe24e
Increase PBT perturbation interval
Mar 24, 2023
71da9a0
Only kill if still running
Mar 24, 2023
40f7574
Skip bohb warm start tests
Mar 24, 2023
4ad8bbb
Disable BOHB test, cleanup stopping actors after graceful stop
Mar 24, 2023
52d1fc4
[no_early_kickoff] python/ray/air/execution/_internal/actor_manager.py
krfricke Mar 24, 2023
16e839c
Do not eagerly run PAUSED trials
Mar 24, 2023
547c4b8
Do not eagerly run PAUSED trials
Mar 24, 2023
c9827ef
Remove PAUSED status update exception
Mar 24, 2023
83419db
Exclude bohb_example from testing
Mar 24, 2023
8b3ee45
Merge remote-tracking branch 'upstream/master' into tune/tune-controller
Mar 24, 2023
06b18f0
[no_early_kickoff] kick off tests again
Mar 24, 2023
7431ad1
[no_early_kickoff] kick off tests again
Mar 24, 2023
1e00cd7
Merge remote-tracking branch 'upstream/master' into tune/tune-controller
Mar 24, 2023
41dc7e6
Merge [no_early_kickoff]
Mar 24, 2023
8a45f72
used_resources_str [no_early_kickoff]
Mar 24, 2023
63 changes: 63 additions & 0 deletions .buildkite/pipeline.ml.yml
@@ -326,6 +326,69 @@
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only python/ray/tests/horovod/...
- bazel test --config=ci $(./ci/run/bazel_export_options) python/ray/tests/ray_lightning/...

### NEW EXECUTION PATH


- label: ":octopus: :sunny: New execution path: Tune tests and examples (small)"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"]
instance_size: small
parallelism: 3
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- TUNE_TESTING=1 ./ci/env/install-dependencies.sh
- ./ci/env/env_info.sh
- ./ci/run/run_bazel_test_with_sharding.sh
--config=ci $(./ci/run/bazel_export_options) --build_tests_only
--test_env=TUNE_NEW_EXECUTION=1
--test_tag_filters=-medium_instance,-py37,-soft_imports,-gpu_only,-rllib,-multinode,-exclude_new_execution
python/ray/tune/...

- label: ":octopus: :sunny: New execution path:Tune tests and examples (medium)"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"]
instance_size: medium
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- TUNE_TESTING=1 ./ci/env/install-dependencies.sh
- ./ci/env/env_info.sh
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only
--test_env=TUNE_NEW_EXECUTION=1
--test_tag_filters=medium_instance,-py37,-soft_imports,-gpu_only,-rllib,-multinode,-exclude_new_execution
python/ray/tune/...

- label: ":octopus: :brain: :sunny: New execution path: Tune tests and examples {using RLlib}"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED", "RAY_CI_RLLIB_AFFECTED"]
instance_size: large
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- TUNE_TESTING=1 ./ci/env/install-dependencies.sh
- ./ci/env/env_info.sh
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only
--test_env=TUNE_NEW_EXECUTION=1
--test_tag_filters=-gpu_only,rllib,-exclude_new_execution python/ray/tune/...

- label: ":octopus: :sunny: New execution path: Tune tests and examples. Python 3.7"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"]
instance_size: small
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- TUNE_TESTING=1 INSTALL_HOROVOD=1 ./ci/env/install-dependencies.sh
- ./ci/env/env_info.sh
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only
--test_env=TUNE_NEW_EXECUTION=1
--test_tag_filters=py37,-client python/ray/tune/...

- label: ":octopus: :sunny: New execution path: ML library integrations tests and examples. Python 3.7"
conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"]
instance_size: small
commands:
- cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
- TUNE_TESTING=1 INSTALL_HOROVOD=1 ./ci/env/install-dependencies.sh
- ./ci/env/env_info.sh
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_env=TUNE_NEW_EXECUTION=1 python/ray/tests/xgboost/...
- bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_env=TUNE_NEW_EXECUTION=1 python/ray/tests/horovod/...
- bazel test --config=ci $(./ci/run/bazel_export_options) --test_env=TUNE_NEW_EXECUTION=1 python/ray/tests/ray_lightning/...


# TODO(amogkam): Re-enable Ludwig tests after Ludwig supports Ray 2.0
#- label: ":octopus: Ludwig tests and examples. Python 3.7"
# conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_TUNE_AFFECTED"]
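The jobs added above mirror the existing Tune CI jobs but set --test_env=TUNE_NEW_EXECUTION=1 and exclude tests tagged exclude_new_execution. A minimal sketch of how such a flag can be read on the Python side; the helper name is hypothetical and not part of this diff:

import os

def _use_new_execution_backend() -> bool:
    # The CI jobs above set TUNE_NEW_EXECUTION=1 to select the new TuneController path.
    return os.environ.get("TUNE_NEW_EXECUTION", "0") == "1"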
131 changes: 107 additions & 24 deletions python/ray/air/execution/_internal/actor_manager.py
@@ -3,7 +3,6 @@
import time
import uuid
from collections import defaultdict, Counter
from functools import lru_cache
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union

import ray
@@ -18,7 +17,6 @@
from ray.air.execution._internal.tracked_actor_task import TrackedActorTask
from ray.exceptions import RayTaskError, RayActorError


logger = logging.getLogger(__name__)


@@ -150,12 +148,16 @@ def __init__(self, resource_manager: ResourceManager):
self._live_actors_to_ray_actors_resources: Dict[
TrackedActor, Tuple[ray.actor.ActorHandle, AcquiredResources]
] = {}
self._live_resource_cache: Optional[Dict[str, Any]] = None

# This dict contains all actors that should be killed (after calling
# `remove_actor()`). Kill requests will be handled in wait().
self._live_actors_to_kill: Set[TrackedActor] = set()

def next(self, timeout: Optional[Union[int, float]] = None) -> None:
# Track failed actors
self._failed_actor_ids: Set[int] = set()

def next(self, timeout: Optional[Union[int, float]] = None) -> bool:
"""Yield control to event manager to await the next event and invoke callbacks.

Calling this method will wait for up to ``timeout`` seconds for the next
@@ -178,6 +180,9 @@ def next(self, timeout: Optional[Union[int, float]] = None) -> None:
Args:
timeout: Timeout in seconds to wait for next event.

Returns:
True if at least one event was processed.

"""
# First issue any pending forceful actor kills
actor_killed = self._try_kill_actor()
@@ -187,7 +192,7 @@

# If an actor was killed, this was our event, and we return.
if actor_killed:
return
return True

# Otherwise, collect all futures and await the next.
resource_futures = self._resource_manager.get_resource_futures()
@@ -209,7 +214,7 @@ def next(self, timeout: Optional[Union[int, float]] = None) -> None:
ready, _ = ray.wait(all_futures, num_returns=1, timeout=timeout)

if not ready:
return
return False

[future] = ready

@@ -228,6 +233,7 @@ def next(self, timeout: Optional[Union[int, float]] = None) -> None:
)

self._try_start_actors()
return True
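With the boolean return value, callers can distinguish a processed event from a timeout. A minimal driver sketch, assuming the RayActorManager class defined in this file and the FixedResourceManager import path (both assumptions, not shown in this diff); actor setup is elided:

from ray.air.execution import FixedResourceManager
from ray.air.execution._internal.actor_manager import RayActorManager

manager = RayActorManager(resource_manager=FixedResourceManager())
# ... add actors and schedule tasks via add_actor() / schedule_actor_task() ...

for _ in range(100):
    if not manager.next(timeout=0.1):
        break  # timed out without processing an event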

def _actor_start_resolved(self, tracked_actor: TrackedActor, future: ray.ObjectRef):
"""Callback to be invoked when actor started"""
@@ -245,6 +251,8 @@ def _actor_stop_resolved(self, tracked_actor: TrackedActor):

def _actor_start_failed(self, tracked_actor: TrackedActor, exception: Exception):
"""Callback to be invoked when actor start/stop failed"""
self._failed_actor_ids.add(tracked_actor.actor_id)

self._cleanup_actor(tracked_actor=tracked_actor)

if tracked_actor._on_error:
@@ -262,16 +270,19 @@ def _actor_task_failed(
tracked_actor = tracked_actor_task._tracked_actor

if isinstance(exception, RayActorError):
# Here the actual actor process died.
# First, clean up any references to the actor and its futures
self._failed_actor_ids.add(tracked_actor.actor_id)

# Clean up any references to the actor and its futures
self._cleanup_actor(tracked_actor=tracked_actor)

# Handle actor state callbacks
if tracked_actor._on_error:
tracked_actor._on_error(tracked_actor, exception)

# Then trigger actor task error callback
if tracked_actor_task._on_error:
tracked_actor_task._on_error(tracked_actor, exception)

elif isinstance(exception, RayTaskError):
# Otherwise only the task failed. Invoke callback
if tracked_actor_task._on_error:
@@ -385,7 +396,7 @@ def on_error(exception: Exception):
actor,
acquired_resources,
)
self.get_live_actors_resources.cache_clear()
self._live_resource_cache = None

self._enqueue_cached_actor_tasks(tracked_actor=tracked_actor)

Expand Down Expand Up @@ -422,27 +433,21 @@ def _try_kill_actor(self) -> bool:
# Hard kill if requested
ray.kill(ray_actor)

self._cleanup_actor_futures(tracked_actor)

self._actor_stop_resolved(tracked_actor)

return True

def _cleanup_actor(self, tracked_actor: TrackedActor):
# Remove all actor task futures
futures = self._tracked_actors_to_task_futures.pop(tracked_actor, [])
for future in futures:
self._actor_task_events.discard_future(future)

# Remove all actor state futures
futures = self._tracked_actors_to_state_futures.pop(tracked_actor, [])
for future in futures:
self._actor_state_events.discard_future(future)
self._cleanup_actor_futures(tracked_actor)

# Remove from tracked actors
(
ray_actor,
acquired_resources,
) = self._live_actors_to_ray_actors_resources.pop(tracked_actor)
self.get_live_actors_resources.cache_clear()
self._live_resource_cache = None

# Return resources
self._resource_manager.free_resources(acquired_resource=acquired_resources)
@@ -482,13 +487,16 @@ def num_actor_tasks(self):
"""Return number of pending tasks"""
return self._actor_task_events.num_futures

@lru_cache()
def get_live_actors_resources(self):
if self._live_resource_cache:
return self._live_resource_cache

counter = Counter()
for _, acq in self._live_actors_to_ray_actors_resources.values():
for bdl in acq.resource_request.bundles:
counter.update(bdl)
return dict(counter)
self._live_resource_cache = dict(counter)
return self._live_resource_cache
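The @lru_cache decorator is replaced by an explicit _live_resource_cache field, presumably because an lru_cache on a bound method is shared across instances, keeps self alive, and can only be cleared wholesale via cache_clear(); the explicit field is invalidated per instance by setting it to None whenever the live-actor mapping changes. A generic, self-contained sketch of that pattern (names hypothetical):

from collections import Counter

class _CachedTotals:
    def __init__(self):
        self._bundles: list = []
        self._cache = None

    def add_bundle(self, bundle: dict):
        self._bundles.append(bundle)
        self._cache = None  # invalidate on every mutation

    def totals(self) -> dict:
        if self._cache is None:
            counter = Counter()
            for bundle in self._bundles:
                counter.update(bundle)
            self._cache = dict(counter)
        return self._cache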

def add_actor(
self,
@@ -535,6 +543,7 @@ def remove_actor(
self,
tracked_actor: TrackedActor,
kill: bool = False,
stop_future: Optional[ray.ObjectRef] = None,
) -> None:
"""Remove a tracked actor.

@@ -546,7 +555,6 @@ def remove_actor(
If the actor has only been requested, but not started, yet, this will cancel
the actor request. This will not trigger any callback.


If ``kill=True``, this will use ``ray.kill()`` to forcefully terminate the
actor. Otherwise, graceful actor deconstruction will be scheduled after
all currently tracked futures are resolved.
@@ -555,23 +563,51 @@ def remove_actor(
tracked_actor: Tracked actor to be removed.
kill: If set, will forcefully terminate the actor instead of gracefully
scheduling termination.
stop_future: If set, use this future to track actor termination.
Otherwise, schedule a ``__ray_terminate__`` future.
"""
if tracked_actor in self._live_actors_to_ray_actors_resources:
if tracked_actor.actor_id in self._failed_actor_ids:
logger.debug(
f"Tracked actor already failed, no need to remove: {tracked_actor}"
)
elif tracked_actor in self._live_actors_to_ray_actors_resources:
# Ray actor is running.

if not kill:
# Schedule __ray_terminate__ future
ray_actor, _ = self._live_actors_to_ray_actors_resources[tracked_actor]

# Clear state futures here to avoid resolving __ray_ready__ futures
for future in list(
self._tracked_actors_to_state_futures[tracked_actor]
):
self._actor_state_events.discard_future(future)
self._tracked_actors_to_state_futures[tracked_actor].remove(future)

# If the __ray_ready__ future hasn't resolved yet, but we already
# scheduled the actor via Actor.remote(), we just want to stop
# it but not trigger any callbacks. This is in accordance with
# the contract defined in the docstring.
tracked_actor._on_start = None
tracked_actor._on_stop = None
tracked_actor._on_error = None

def on_actor_stop(*args, **kwargs):
self._actor_stop_resolved(tracked_actor=tracked_actor)

stop_future = ray_actor.__ray_terminate__.remote()
if stop_future:
# If the stop future was scheduled via the actor manager,
# discard it here and track it as a state future instead.
self._actor_task_events.discard_future(stop_future)
else:
stop_future = ray_actor.__ray_terminate__.remote()

self._actor_state_events.track_future(
future=stop_future,
on_result=on_actor_stop,
on_error=on_actor_stop,
)

self._tracked_actors_to_state_futures[tracked_actor].add(stop_future)

else:
@@ -581,6 +617,9 @@ def on_actor_stop(*args, **kwargs):
elif tracked_actor in self._pending_actors_to_attrs:
# Actor is pending, stop
_, _, resource_request = self._pending_actors_to_attrs.pop(tracked_actor)
self._resource_request_to_pending_actors[resource_request].remove(
tracked_actor
)
self._resource_manager.cancel_resource_request(
resource_request=resource_request
)
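The new stop_future argument lets the caller drive graceful shutdown with its own remote call instead of the default __ray_terminate__ task. A sketch of the intended call pattern; the actor handle and its stop method are hypothetical, only the remove_actor signature comes from this diff:

def stop_actor_gracefully(manager, tracked_actor, actor_handle):
    # Schedule the actor's own stop method and hand the future to the manager,
    # which re-tracks it as a state future and resolves the stop callbacks on it.
    stop_future = actor_handle.stop.remote()
    manager.remove_actor(tracked_actor, kill=False, stop_future=stop_future)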
@@ -593,7 +632,13 @@ def is_actor_started(self, tracked_actor: TrackedActor) -> bool:
Args:
tracked_actor: Tracked actor object.
"""
return tracked_actor in self._live_actors_to_ray_actors_resources
return (
tracked_actor in self._live_actors_to_ray_actors_resources
and tracked_actor.actor_id not in self._failed_actor_ids
)

def is_actor_failed(self, tracked_actor: TrackedActor) -> bool:
return tracked_actor.actor_id in self._failed_actor_ids

def get_actor_resources(
self, tracked_actor: TrackedActor
@@ -675,6 +720,7 @@ def schedule_actor_task(
method_name=method_name,
args=args,
kwargs=kwargs,
_return_future=_return_future,
)
if _return_future:
return res[1]
@@ -794,3 +840,40 @@ def schedule_actor_tasks(
on_result=on_result,
on_error=on_error,
)

def clear_actor_task_futures(self, tracked_actor: TrackedActor):
"""Discard all actor task futures from a tracked actor."""
futures = self._tracked_actors_to_task_futures.pop(tracked_actor, [])
for future in futures:
self._actor_task_events.discard_future(future)

def _cleanup_actor_futures(self, tracked_actor: TrackedActor):
# Remove all actor task futures
self.clear_actor_task_futures(tracked_actor=tracked_actor)

# Remove all actor state futures
futures = self._tracked_actors_to_state_futures.pop(tracked_actor, [])
for future in futures:
self._actor_state_events.discard_future(future)

def cleanup(self):
for (
actor,
acquired_resources,
) in self._live_actors_to_ray_actors_resources.values():
ray.kill(actor)
self._resource_manager.free_resources(acquired_resources)

for (
resource_request,
pending_actors,
) in self._resource_request_to_pending_actors.items():
for i in range(len(pending_actors)):
self._resource_manager.cancel_resource_request(resource_request)

self._resource_manager.clear()

self.__init__(resource_manager=self._resource_manager)

def __del__(self):
self.cleanup()
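cleanup() force-kills any remaining live actors, cancels outstanding resource requests, and re-initializes the manager; __del__ calls it as a last resort. A usage sketch under the same import assumptions as above:

from ray.air.execution import FixedResourceManager
from ray.air.execution._internal.actor_manager import RayActorManager

manager = RayActorManager(resource_manager=FixedResourceManager())
try:
    # Drive the event loop; a real caller would add actors and tasks first.
    while manager.next(timeout=1.0):
        pass
finally:
    manager.cleanup()  # release actors and resources even on error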
8 changes: 6 additions & 2 deletions python/ray/air/execution/_internal/event_manager.py
@@ -109,9 +109,13 @@ def resolve_future(self, future: ray.ObjectRef):
try:
result = ray.get(future)
except Exception as e:
on_error(e)
if on_error:
on_error(e)
else:
raise e
else:
on_result(result)
if on_result:
on_result(result)
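With these guards, futures can be tracked without callbacks: errors re-raise when no on_error is given, and results are dropped when no on_result is given. This matters for callers such as actor task scheduling without result handlers. A small sketch, assuming schedule_actor_task accepts the method name positionally and that omitting the callbacks is allowed:

def fire_and_forget(manager, tracked_actor):
    # Hypothetical method name "commit"; no on_result/on_error is passed, which
    # resolve_future() now tolerates instead of calling a None callback.
    manager.schedule_actor_task(tracked_actor, "commit")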

def wait(
self,