From eddb973c34e85055546a893a94e967ebf2c6f0c6 Mon Sep 17 00:00:00 2001
From: chenxu02
Date: Fri, 16 Aug 2024 16:54:19 +0800
Subject: [PATCH 1/2] add disable-custom-all-reduce

---
 python/sglang/srt/model_executor/model_runner.py | 2 ++
 python/sglang/srt/server_args.py                 | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index b74a19e60df..6854aed52a7 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -37,6 +37,7 @@
     get_tp_group,
     init_distributed_environment,
     initialize_model_parallel,
+    set_custom_all_reduce
 )
 from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.model_executor.model_loader import get_model
@@ -105,6 +106,7 @@ def __init__(
             nccl_init_method = f"tcp://{server_args.nccl_init_addr}"
         else:
             nccl_init_method = f"tcp://127.0.0.1:{self.nccl_port}"
+        set_custom_all_reduce(not server_args.disable_custom_all_reduce)
         init_distributed_environment(
             backend="nccl",
             world_size=self.tp_size,
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 99ecff6a588..04fb3fda540 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -86,6 +86,7 @@ class ServerArgs:
     enable_mla: bool = False
     attention_reduce_in_fp32: bool = False
     efficient_weight_load: bool = False
+    disable_custom_all_reduce: bool = False
 
     # Distributed args
     nccl_init_addr: Optional[str] = None
@@ -428,6 +429,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
             action="store_true",
             help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).",
         )
+        parser.add_argument('--disable-custom-all-reduce',
+                            action='store_true',
+                            default=False,
+                            help='Disable the custom all-reduce kernel and fall back to NCCL.')
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):

From 5e065b3f57c0fd8a255aa5ce8b0cb6abb267f319 Mon Sep 17 00:00:00 2001
From: chenxu02
Date: Mon, 19 Aug 2024 18:15:06 +0800
Subject: [PATCH 2/2] fix: isort

---
 python/sglang/srt/model_executor/model_runner.py |  2 +-
 python/sglang/srt/server_args.py                 | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 6854aed52a7..b2855ff351f 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -37,7 +37,7 @@
     get_tp_group,
     init_distributed_environment,
     initialize_model_parallel,
-    set_custom_all_reduce
+    set_custom_all_reduce,
 )
 from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.model_executor.model_loader import get_model
diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py
index 04fb3fda540..c7120564c17 100644
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -429,10 +429,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
             action="store_true",
             help="Turn on memory efficient weight loading with quantization (quantize per layer during loading).",
         )
-        parser.add_argument('--disable-custom-all-reduce',
-                            action='store_true',
-                            default=False,
-                            help='Disable the custom all-reduce kernel and fall back to NCCL.')
+        parser.add_argument(
+            "--disable-custom-all-reduce",
+            action="store_true",
+            default=False,
+            help="Disable the custom all-reduce kernel and fall back to NCCL.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
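
Not part of the patch series: a minimal sketch of how the new flag is exercised once the commits above are applied. It drives the same add_cli_args/from_cli_args path touched in server_args.py; the model path is a placeholder, and the --model-path and --tp-size flag names are assumed to match the existing sglang CLI.

# Minimal sketch, assuming this patch series is applied and sglang is importable.
# The model path is a placeholder; --model-path and --tp-size are assumed to be
# the existing sglang CLI flag names.
import argparse

from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args(
    [
        "--model-path", "meta-llama/Llama-2-7b-chat-hf",
        "--tp-size", "2",
        "--disable-custom-all-reduce",
    ]
)
server_args = ServerArgs.from_cli_args(args)

# ModelRunner calls set_custom_all_reduce(not server_args.disable_custom_all_reduce)
# before init_distributed_environment, so with the flag set the tensor-parallel
# all-reduce falls back to NCCL instead of the custom all-reduce kernel.
assert server_args.disable_custom_all_reduce is True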