From eddb973c34e85055546a893a94e967ebf2c6f0c6 Mon Sep 17 00:00:00 2001
From: chenxu02
Date: Fri, 16 Aug 2024 16:54:19 +0800
Subject: [PATCH 1/2] add disable-custom-all-reduce

---
 python/sglang/srt/model_executor/model_runner.py | 2 ++
 python/sglang/srt/server_args.py                 | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index b74a19e60df..6854aed52a7 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -37,6 +37,7 @@
     get_tp_group,
     init_distributed_environment,
     initialize_model_parallel,
+    set_custom_all_reduce
 )
 from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.model_executor.model_loader import get_model
@@ -105,6 +106,7 @@ def __init__(
             nccl_init_method = f"tcp://{server_args.nccl_init_addr}"
         else:
             nccl_init_method = f"tcp://127.0.0.1:{self.nccl_port}"
+        set_custom_all_reduce(not server_args.disable_custom_all_reduce)
         init_distributed_environment(
             backend="nccl",
             world_size=self.tp_size,
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 99ecff6a588..04fb3fda540 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -86,6 +86,7 @@ class ServerArgs:
     enable_mla: bool = False
     attention_reduce_in_fp32: bool = False
     efficient_weight_load: bool = False
+    disable_custom_all_reduce: bool = False
 
     # Distributed args
     nccl_init_addr: Optional[str] = None
@@ -428,6 +429,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
             action="store_true",
             help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).",
         )
+        parser.add_argument('--disable-custom-all-reduce',
+                            action='store_true',
+                            default=False,
+                            help='Disable the custom all-reduce kernel and fall back to NCCL.')
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):

From 5e065b3f57c0fd8a255aa5ce8b0cb6abb267f319 Mon Sep 17 00:00:00 2001
From: chenxu02
Date: Mon, 19 Aug 2024 18:15:06 +0800
Subject: [PATCH 2/2] fix: isort

---
 python/sglang/srt/model_executor/model_runner.py |  2 +-
 python/sglang/srt/server_args.py                 | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 6854aed52a7..b2855ff351f 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -37,7 +37,7 @@
     get_tp_group,
     init_distributed_environment,
     initialize_model_parallel,
-    set_custom_all_reduce
+    set_custom_all_reduce,
 )
 from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.model_executor.model_loader import get_model
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 04fb3fda540..c7120564c17 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -429,10 +429,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
             action="store_true",
             help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).",
         )
-        parser.add_argument('--disable-custom-all-reduce',
-                            action='store_true',
-                            default=False,
-                            help='Disable the custom all-reduce kernel and fall back to NCCL.')
+        parser.add_argument(
+            "--disable-custom-all-reduce",
+            action="store_true",
+            default=False,
+            help="Disable the custom all-reduce kernel and fall back to NCCL.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
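
Not part of the patch series: a minimal sketch of how the new flag is exercised once the commits above are applied. It drives the same add_cli_args/from_cli_args path touched in server_args.py; the model path is a placeholder, and the --model-path and --tp-size flag names are assumed to match the existing sglang CLI.

# Minimal sketch, assuming this patch series is applied and sglang is importable.
# The model path is a placeholder; --model-path and --tp-size are assumed to be
# the existing sglang CLI flag names.
import argparse

from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args(
    [
        "--model-path", "meta-llama/Llama-2-7b-chat-hf",
        "--tp-size", "2",
        "--disable-custom-all-reduce",
    ]
)
server_args = ServerArgs.from_cli_args(args)

# ModelRunner calls set_custom_all_reduce(not server_args.disable_custom_all_reduce)
# before init_distributed_environment, so with the flag set the tensor-parallel
# all-reduce falls back to NCCL instead of the custom all-reduce kernel.
assert server_args.disable_custom_all_reduce is True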