From 8f1dc84d4533f639fc10fae4f34f332f0eb40324 Mon Sep 17 00:00:00 2001
From: Cindy Zhang <cindyzyx9@gmail.com>
Date: Tue, 25 Jun 2024 15:36:39 -0700
Subject: [PATCH] [serve] change default for max and target ongoing requests

Change default for `max_ongoing_requests` from 100 to 5
Change default for `target_ongoing_requests` from 1.0 to 2.0.
Add warnings in rest api and @serve.deployment about these changes.

Signed-off-by: Cindy Zhang <cindyzyx9@gmail.com>
---
 .../modules/serve/serve_rest_api_impl.py      | 37 +++++++++++++++++++
 .../advanced-guides/advanced-autoscaling.md   | 12 +++---
 .../ray/serve/_private/application_state.py   | 11 +-----
 python/ray/serve/_private/config.py           | 10 +----
 python/ray/serve/_private/constants.py        |  4 +-
 python/ray/serve/api.py                       | 30 ++++++++++++++-
 python/ray/serve/config.py                    |  7 ++--
 python/ray/serve/tests/test_api.py            |  9 +++--
 .../serve/tests/test_autoscaling_policy.py    | 10 +++--
 python/ray/serve/tests/test_controller.py     |  6 +--
 .../tests/unit/test_autoscaling_policy.py     | 18 ++++-----
 python/ray/serve/tests/unit/test_config.py    |  4 +-
 12 files changed, 105 insertions(+), 53 deletions(-)

diff --git a/dashboard/modules/serve/serve_rest_api_impl.py b/dashboard/modules/serve/serve_rest_api_impl.py
index 3d59ebaf9c94..9b729518e89e 100644
--- a/dashboard/modules/serve/serve_rest_api_impl.py
+++ b/dashboard/modules/serve/serve_rest_api_impl.py
@@ -167,6 +167,8 @@ async def put_all_applications(self, req: Request) -> Response:
                     text=repr(e),
                 )
 
+            self.log_config_change_default_warning(config)
+
             config_http_options = config.http_options.dict()
             location = ProxyLocation._to_deployment_mode(config.proxy_location)
             full_http_options = dict({"location": location}, **config_http_options)
@@ -213,6 +215,41 @@ def validate_http_options(self, client, http_options):
                     f"updated: {divergent_http_options}."
                 )
 
+        def log_config_change_default_warning(self, config):
+            from ray.serve.config import AutoscalingConfig
+
+            for deployment in [
+                d for app in config.applications for d in app.deployments
+            ]:
+                if "max_ongoing_requests" not in deployment.dict(exclude_unset=True):
+                    logger.warning(
+                        "The default value for `max_ongoing_requests` has changed "
+                        "from 100 to 5 in Ray 2.32.0."
+                    )
+                    break
+
+            for deployment in [
+                d for app in config.applications for d in app.deployments
+            ]:
+                if isinstance(deployment.autoscaling_config, dict):
+                    autoscaling_config = deployment.autoscaling_config
+                elif isinstance(deployment.autoscaling_config, AutoscalingConfig):
+                    autoscaling_config = deployment.autoscaling_config.dict(
+                        exclude_unset=True
+                    )
+                else:
+                    continue
+
+                if (
+                    "target_num_ongoing_requests_per_replica" not in autoscaling_config
+                    and "target_ongoing_requests" not in autoscaling_config
+                ):
+                    logger.warning(
+                        "The default value for `target_ongoing_requests` has changed "
+                        "from 1.0 to 2.0 in Ray 2.32.0."
+                    )
+                    break
+
         async def get_serve_controller(self):
             """Gets the ServeController to the this cluster's Serve app.
 
diff --git a/doc/source/serve/advanced-guides/advanced-autoscaling.md b/doc/source/serve/advanced-guides/advanced-autoscaling.md
index c8514d4e746b..c82c888ccbd8 100644
--- a/doc/source/serve/advanced-guides/advanced-autoscaling.md
+++ b/doc/source/serve/advanced-guides/advanced-autoscaling.md
@@ -14,12 +14,12 @@ In this section, we go into more detail about Serve autoscaling concepts as well
 
 To define what the steady state of your deployments should be, set values for `target_ongoing_requests` and `max_ongoing_requests`.
 
-#### **target_num_ongoing_requests_per_replica [default=1]**
+#### **target_num_ongoing_requests_per_replica [default=2]**
 This parameter is renamed to `target_ongoing_requests`. `target_num_ongoing_requests_per_replica` will be removed in a future release.
 
-#### **target_ongoing_requests [default=1]**
+#### **target_ongoing_requests [default=2]**
 :::{note}
-The default for `target_ongoing_requests` will be changed to 2.0 in an upcoming Ray release. You can continue to set it manually to override the default.
+The default for `target_ongoing_requests` changed from 1.0 to 2.0 in Ray 2.32.0. You can continue to set it manually to override the default.
 :::
 Serve scales the number of replicas for a deployment up or down based on the average number of ongoing requests per replica. Specifically, Serve compares the *actual* number of ongoing requests per replica with the target value you set in the autoscaling config and makes upscale or downscale decisions from that. Set the target value with `target_ongoing_requests`, and Serve attempts to ensure that each replica has roughly that number
 of requests being processed and waiting in the queue. 
@@ -30,12 +30,12 @@ Always load test your workloads. For example, if the use case is latency sensiti
 As an example, suppose you have two replicas of a synchronous deployment that has 100ms latency, serving a traffic load of 30 QPS. Then Serve assigns requests to replicas faster than the replicas can finish processing them; more and more requests queue up at the replica (these requests are "ongoing requests") as time progresses, and then the average number of ongoing requests at each replica steadily increases. Latency also increases because new requests have to wait for old requests to finish processing. If you set `target_ongoing_requests = 1`, Serve detects a higher than desired number of ongoing requests per replica, and adds more replicas. At 3 replicas, your system would be able to process 30 QPS with 1 ongoing request per replica on average.
 :::
 
-#### **max_concurrent_queries [default=100] (DEPRECATED)**
+#### **max_concurrent_queries [default=5] (DEPRECATED)**
 This parameter is renamed to `max_ongoing_requests`. `max_concurrent_queries` will be removed in a future release.
 
-#### **max_ongoing_requests [default=100]**
+#### **max_ongoing_requests [default=5]**
 :::{note}
-The default for `max_ongoing_requests` will be changed to 5 in an upcoming Ray release. You can continue to set it manually to override the default.
+The default for `max_ongoing_requests` changed from 100 to 5 in Ray 2.32.0. You can continue to set it manually to override the default.
 :::
 There is also a maximum queue limit that proxies respect when assigning requests to replicas. Define the limit with `max_ongoing_requests`. Set `max_ongoing_requests` to ~20 to 50% higher than `target_ongoing_requests`. Note that `target_ongoing_requests` should always be strictly less than `max_ongoing_requests`, otherwise the deployment never scales up.
 
diff --git a/python/ray/serve/_private/application_state.py b/python/ray/serve/_private/application_state.py
index 8f8917e0afee..fda4865b78d5 100644
--- a/python/ray/serve/_private/application_state.py
+++ b/python/ray/serve/_private/application_state.py
@@ -21,11 +21,7 @@
     TargetCapacityDirection,
 )
 from ray.serve._private.config import DeploymentConfig
-from ray.serve._private.constants import (
-    NEW_DEFAULT_MAX_ONGOING_REQUESTS,
-    RAY_SERVE_ENABLE_TASK_EVENTS,
-    SERVE_LOGGER_NAME,
-)
+from ray.serve._private.constants import RAY_SERVE_ENABLE_TASK_EVENTS, SERVE_LOGGER_NAME
 from ray.serve._private.deploy_utils import (
     deploy_args_to_deployment_info,
     get_app_code_version,
@@ -1109,11 +1105,6 @@ def override_deployment_info(
         # `num_replicas="auto"`
         if options.get("num_replicas") == "auto":
             options["num_replicas"] = None
-            if (
-                "max_ongoing_requests"
-                not in original_options["user_configured_option_names"]
-            ):
-                options["max_ongoing_requests"] = NEW_DEFAULT_MAX_ONGOING_REQUESTS
 
             new_config = AutoscalingConfig.default().dict()
             # If `autoscaling_config` is specified, its values override
diff --git a/python/ray/serve/_private/config.py b/python/ray/serve/_private/config.py
index 6396ed7685f4..37a6b3662ab7 100644
--- a/python/ray/serve/_private/config.py
+++ b/python/ray/serve/_private/config.py
@@ -25,7 +25,6 @@
     DEFAULT_HEALTH_CHECK_TIMEOUT_S,
     DEFAULT_MAX_ONGOING_REQUESTS,
     MAX_REPLICAS_PER_NODE_MAX_VALUE,
-    NEW_DEFAULT_MAX_ONGOING_REQUESTS,
 )
 from ray.serve._private.utils import DEFAULT, DeploymentOptionUpdateType
 from ray.serve.config import AutoscalingConfig
@@ -87,7 +86,7 @@ class DeploymentConfig(BaseModel):
             handles requests to this deployment. Defaults to 1.
         max_ongoing_requests: The maximum number of queries
             that is sent to a replica of this deployment without receiving
-            a response. Defaults to 100.
+            a response. Defaults to 5.
         max_queued_requests: Maximum number of requests to this deployment that will be
             queued at each *caller* (proxy or DeploymentHandle). Once this limit is
             reached, subsequent requests will raise a BackPressureError (for handles) or
@@ -332,19 +331,12 @@ def handle_num_replicas_auto(
     """Return modified `max_ongoing_requests` and `autoscaling_config`
     for when num_replicas="auto".
 
-    If `max_ongoing_requests` is unspecified (DEFAULT.VALUE), returns
-    the modified value NEW_DEFAULT_MAX_ONGOING_REQUESTS. Otherwise,
-    doesn't change `max_ongoing_requests`.
-
     If `autoscaling_config` is unspecified, returns the modified value
     AutoscalingConfig.default().
     If it is specified, the specified fields in `autoscaling_config`
     override that of AutoscalingConfig.default().
     """
 
-    if max_ongoing_requests is DEFAULT.VALUE:
-        max_ongoing_requests = NEW_DEFAULT_MAX_ONGOING_REQUESTS
-
     if autoscaling_config in [DEFAULT.VALUE, None]:
         # If autoscaling config wasn't specified, use default
         # configuration
diff --git a/python/ray/serve/_private/constants.py b/python/ray/serve/_private/constants.py
index ff9b7156dd78..4f0e1ed4a766 100644
--- a/python/ray/serve/_private/constants.py
+++ b/python/ray/serve/_private/constants.py
@@ -102,8 +102,8 @@
 DEFAULT_GRACEFUL_SHUTDOWN_WAIT_LOOP_S = 2
 DEFAULT_HEALTH_CHECK_PERIOD_S = 10
 DEFAULT_HEALTH_CHECK_TIMEOUT_S = 30
-DEFAULT_MAX_ONGOING_REQUESTS = 100
-NEW_DEFAULT_MAX_ONGOING_REQUESTS = 5
+DEFAULT_MAX_ONGOING_REQUESTS = 5
+DEFAULT_TARGET_ONGOING_REQUESTS = 2
 
 # HTTP Proxy health check configs
 PROXY_HEALTH_CHECK_TIMEOUT_S = (
diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py
index 00f28cddacd7..2fb4ef0211dd 100644
--- a/python/ray/serve/api.py
+++ b/python/ray/serve/api.py
@@ -306,9 +306,9 @@ class MyDeployment:
             can be updated dynamically without restarting the replicas of the
             deployment. The user_config must be fully JSON-serializable.
         max_concurrent_queries: [DEPRECATED] Maximum number of queries that are sent to
-            a replica of this deployment without receiving a response. Defaults to 100.
+            a replica of this deployment without receiving a response. Defaults to 5.
         max_ongoing_requests: Maximum number of requests that are sent to a
-            replica of this deployment without receiving a response. Defaults to 100.
+            replica of this deployment without receiving a response. Defaults to 5.
         max_queued_requests: [EXPERIMENTAL] Maximum number of requests to this
             deployment that will be queued at each *caller* (proxy or DeploymentHandle).
             Once this limit is reached, subsequent requests will raise a
@@ -350,10 +350,30 @@ class MyDeployment:
                 "version."
             )
 
+        if (
+            isinstance(autoscaling_config, dict)
+            and "target_num_ongoing_requests_per_replica" not in autoscaling_config
+            and "target_ongoing_requests" not in autoscaling_config
+        ) or (
+            isinstance(autoscaling_config, AutoscalingConfig)
+            and "target_num_ongoing_requests_per_replica"
+            not in autoscaling_config.dict(exclude_unset=True)
+            and "target_ongoing_requests"
+            not in autoscaling_config.dict(exclude_unset=True)
+        ):
+            logger.warning(
+                "The default value for `target_ongoing_requests` has changed from 1.0 "
+                "to 2.0 in Ray 2.32.0."
+            )
+
     if max_ongoing_requests is None:
         raise ValueError("`max_ongoing_requests` must be non-null, got None.")
     elif max_ongoing_requests is DEFAULT.VALUE:
         if max_concurrent_queries is None:
+            logger.warning(
+                "The default value for `max_ongoing_requests` has changed from "
+                "100 to 5 in Ray 2.32.0."
+            )
             max_ongoing_requests = DEFAULT_MAX_ONGOING_REQUESTS
         else:
             max_ongoing_requests = max_concurrent_queries
@@ -408,6 +428,12 @@ class MyDeployment:
             "been deprecated and replaced by `max_ongoing_requests`."
         )
 
+    if max_ongoing_requests is DEFAULT.VALUE:
+        logger.warning(
+            "The default value for `max_ongoing_requests` has changed from 100 to 5 in "
+            "Ray 2.32.0."
+        )
+
     if isinstance(logging_config, LoggingConfig):
         logging_config = logging_config.dict()
 
diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py
index bd169f36cebd..5521d4a5519f 100644
--- a/python/ray/serve/config.py
+++ b/python/ray/serve/config.py
@@ -20,6 +20,7 @@
     DEFAULT_GRPC_PORT,
     DEFAULT_HTTP_HOST,
     DEFAULT_HTTP_PORT,
+    DEFAULT_TARGET_ONGOING_REQUESTS,
     DEFAULT_UVICORN_KEEP_ALIVE_TIMEOUT_S,
     SERVE_LOGGER_NAME,
 )
@@ -42,7 +43,7 @@ class AutoscalingConfig(BaseModel):
 
     # DEPRECATED: replaced by target_ongoing_requests
     target_num_ongoing_requests_per_replica: PositiveFloat = Field(
-        default=1.0,
+        default=DEFAULT_TARGET_ONGOING_REQUESTS,
         description="[DEPRECATED] Please use `target_ongoing_requests` instead.",
     )
     # Will default to 1.0 in the future.
@@ -134,10 +135,10 @@ def serialize_policy(self) -> None:
     @classmethod
     def default(cls):
         return cls(
+            target_num_ongoing_requests_per_replica=DEFAULT_TARGET_ONGOING_REQUESTS,
+            target_ongoing_requests=DEFAULT_TARGET_ONGOING_REQUESTS,
             min_replicas=1,
             max_replicas=100,
-            target_num_ongoing_requests_per_replica=2,
-            target_ongoing_requests=2,
         )
 
     def get_policy(self) -> Callable:
diff --git a/python/ray/serve/tests/test_api.py b/python/ray/serve/tests/test_api.py
index 9fb36047bf7b..f7e6d4629663 100644
--- a/python/ray/serve/tests/test_api.py
+++ b/python/ray/serve/tests/test_api.py
@@ -14,7 +14,10 @@
 from ray._private.test_utils import SignalActor, wait_for_condition
 from ray.serve._private.api import call_app_builder_with_args_if_necessary
 from ray.serve._private.common import DeploymentID
-from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME
+from ray.serve._private.constants import (
+    DEFAULT_MAX_ONGOING_REQUESTS,
+    SERVE_DEFAULT_APP_NAME,
+)
 from ray.serve.deployment import Application
 from ray.serve.exceptions import RayServeException
 from ray.serve.handle import DeploymentHandle
@@ -1080,10 +1083,10 @@ class A:
         ).bind()
 
     serve.run(serve.deployment(max_concurrent_queries=None)(A).bind())
-    assert get_max_ongoing_requests() == 100
+    assert get_max_ongoing_requests() == DEFAULT_MAX_ONGOING_REQUESTS
 
     serve.run(serve.deployment(A).options(max_concurrent_queries=None).bind())
-    assert get_max_ongoing_requests() == 100
+    assert get_max_ongoing_requests() == DEFAULT_MAX_ONGOING_REQUESTS
 
     serve.run(
         serve.deployment(max_ongoing_requests=8, max_concurrent_queries=None)(A).bind()
diff --git a/python/ray/serve/tests/test_autoscaling_policy.py b/python/ray/serve/tests/test_autoscaling_policy.py
index d14a25cbb6d3..6a0164db75d9 100644
--- a/python/ray/serve/tests/test_autoscaling_policy.py
+++ b/python/ray/serve/tests/test_autoscaling_policy.py
@@ -297,7 +297,7 @@ async def __call__(self):
                 await signal.wait.remote()
                 return "sup"
 
-        @serve.deployment(graceful_shutdown_timeout_s=1)
+        @serve.deployment(graceful_shutdown_timeout_s=1, max_ongoing_requests=50)
         class Router:
             def __init__(self, handle: DeploymentHandle):
                 if use_get_handle_api:
@@ -1144,8 +1144,9 @@ def test_max_ongoing_requests_set_to_one(
 
     @serve.deployment(
         autoscaling_config=AutoscalingConfig(
+            target_ongoing_requests=1.0,
             min_replicas=1,
-            max_replicas=5,
+            max_replicas=3,
             upscale_delay_s=0.5,
             downscale_delay_s=0.5,
             metrics_interval_s=0.5,
@@ -1171,11 +1172,12 @@ async def f():
     # 2. Wait for the number of waiters on signal to increase by 1.
     # 3. Assert the number of replicas has increased by 1.
     refs = []
-    for i in range(5):
+    for i in range(3):
         refs.append(h.remote())
 
         def check_num_waiters(target: int):
-            assert ray.get(signal.cur_num_waiters.remote()) == target
+            num_waiters = ray.get(signal.cur_num_waiters.remote())
+            assert num_waiters == target
             return True
 
         wait_for_condition(check_num_waiters, target=i + 1)
diff --git a/python/ray/serve/tests/test_controller.py b/python/ray/serve/tests/test_controller.py
index 2588fedd5f19..5c885e47bb77 100644
--- a/python/ray/serve/tests/test_controller.py
+++ b/python/ray/serve/tests/test_controller.py
@@ -156,15 +156,15 @@ def autoscaling_app():
                             "message": "",
                             "deployment_config": {
                                 "name": "autoscaling_app",
-                                "max_concurrent_queries": 100,
-                                "max_ongoing_requests": 100,
+                                "max_concurrent_queries": 5,
+                                "max_ongoing_requests": 5,
                                 "max_queued_requests": -1,
                                 "user_config": None,
                                 "autoscaling_config": {
                                     "min_replicas": 1,
                                     "initial_replicas": None,
                                     "max_replicas": 10,
-                                    "target_num_ongoing_requests_per_replica": 1.0,
+                                    "target_num_ongoing_requests_per_replica": 2.0,
                                     "target_ongoing_requests": None,
                                     "metrics_interval_s": 10.0,
                                     "look_back_period_s": 30.0,
diff --git a/python/ray/serve/tests/unit/test_autoscaling_policy.py b/python/ray/serve/tests/unit/test_autoscaling_policy.py
index e3d3c6f24a55..2dabba57e7df 100644
--- a/python/ray/serve/tests/unit/test_autoscaling_policy.py
+++ b/python/ray/serve/tests/unit/test_autoscaling_policy.py
@@ -116,9 +116,9 @@ def test_scaling_factor(
         config = {"min_replicas": 0, "max_replicas": 100}
 
         if use_target_ongoing_requests:
-            config["target_ongoing_requests"] = 1
+            config["target_ongoing_requests"] = 2
         if use_target_num_ongoing_requests_per_replica:
-            config["target_num_ongoing_requests_per_replica"] = 1
+            config["target_num_ongoing_requests_per_replica"] = 2
 
         if use_deprecated_smoothing_factor:
             config["smoothing_factor"] = 0.5
@@ -129,7 +129,7 @@ def test_scaling_factor(
         config = AutoscalingConfig(**config)
         num_replicas = 10
 
-        num_ongoing_requests = 4.0 * num_replicas
+        num_ongoing_requests = 8.0 * num_replicas
         desired_num_replicas = _calculate_desired_num_replicas(
             autoscaling_config=config,
             total_num_requests=num_ongoing_requests,
@@ -158,9 +158,9 @@ def test_upscaling_factor(
     ):
         config = {"min_replicas": 0, "max_replicas": 100}
         if use_target_ongoing_requests:
-            config["target_ongoing_requests"] = 1
+            config["target_ongoing_requests"] = 2
         if use_target_num_ongoing_requests_per_replica:
-            config["target_num_ongoing_requests_per_replica"] = 1
+            config["target_num_ongoing_requests_per_replica"] = 2
 
         if use_deprecated_smoothing_factor:
             config["upscale_smoothing_factor"] = 0.5
@@ -171,7 +171,7 @@ def test_upscaling_factor(
         num_replicas = 10
 
         # Should use upscale smoothing factor of 0.5
-        num_ongoing_requests = 4.0 * num_replicas
+        num_ongoing_requests = 8.0 * num_replicas
         desired_num_replicas = _calculate_desired_num_replicas(
             autoscaling_config=config,
             total_num_requests=num_ongoing_requests,
@@ -201,9 +201,9 @@ def test_downscaling_factor(
     ):
         config = {"min_replicas": 0, "max_replicas": 100}
         if use_target_ongoing_requests:
-            config["target_ongoing_requests"] = 1
+            config["target_ongoing_requests"] = 2
         if use_target_num_ongoing_requests_per_replica:
-            config["target_num_ongoing_requests_per_replica"] = 1
+            config["target_num_ongoing_requests_per_replica"] = 2
 
         if use_deprecated_smoothing_factor:
             config["downscale_smoothing_factor"] = 0.5
@@ -214,7 +214,7 @@ def test_downscaling_factor(
         num_replicas = 10
 
         # Should use upscale smoothing factor of 1 (default)
-        num_ongoing_requests = 4.0 * num_replicas
+        num_ongoing_requests = 8.0 * num_replicas
         desired_num_replicas = _calculate_desired_num_replicas(
             autoscaling_config=config,
             total_num_requests=num_ongoing_requests,
diff --git a/python/ray/serve/tests/unit/test_config.py b/python/ray/serve/tests/unit/test_config.py
index 5ebf5025a5d1..8df5b083ceb9 100644
--- a/python/ray/serve/tests/unit/test_config.py
+++ b/python/ray/serve/tests/unit/test_config.py
@@ -88,7 +88,7 @@ def test_deployment_config_validation(self):
             DeploymentConfig(num_replicas=-1)
 
         # Test dynamic default for max_ongoing_requests.
-        assert DeploymentConfig().max_ongoing_requests == 100
+        assert DeploymentConfig().max_ongoing_requests == 5
 
     def test_deployment_config_update(self):
         b = DeploymentConfig(num_replicas=1, max_ongoing_requests=1)
@@ -460,7 +460,7 @@ def f():
 class TestAutoscalingConfig:
     def test_target_ongoing_requests(self):
         autoscaling_config = AutoscalingConfig()
-        assert autoscaling_config.get_target_ongoing_requests() == 1
+        assert autoscaling_config.get_target_ongoing_requests() == 2
 
         autoscaling_config = AutoscalingConfig(target_ongoing_requests=7)
         assert autoscaling_config.get_target_ongoing_requests() == 7