From e5cbb73c8c38f58a4498586883fd0c4e09f599ac Mon Sep 17 00:00:00 2001
From: wuhuachaocoding
Date: Wed, 21 Sep 2022 07:21:49 +0000
Subject: [PATCH 1/4] update moe recompute.

---
 examples/language_model/moe/dygraph/modeling.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/examples/language_model/moe/dygraph/modeling.py b/examples/language_model/moe/dygraph/modeling.py
index 64c1f220ca1d..60ea35d327c2 100644
--- a/examples/language_model/moe/dygraph/modeling.py
+++ b/examples/language_model/moe/dygraph/modeling.py
@@ -35,8 +35,6 @@
     MoeLayer = moe.MoELayer
 from utils import get_timers
 
-from paddle.distributed.fleet.meta_parallel.pp_utils.utils import _initialize_recompute_setting, _initialize_recompute_hcg
-
 __all__ = [
     'GPTModel',
     "GPTPretrainedModel",
@@ -769,11 +767,6 @@ def __init__(self,
         self.hidden_size = hidden_size
         self.vocab_size = vocab_size
 
-        if recompute_interval > 0:
-            _initialize_recompute_hcg(hcg)
-            _initialize_recompute_setting(recompute_offload,
-                                          recompute_partition)
-
         self.embeddings = GPTEmbeddings(vocab_size, hidden_size,
                                         hidden_dropout_prob,
                                         max_position_embeddings,

From 8eaa327426aec3fd654aa82ac5a530f51b1c7e59 Mon Sep 17 00:00:00 2001
From: sljlp
Date: Wed, 21 Sep 2022 15:40:03 +0800
Subject: [PATCH 2/4] fix broadcast

---
 examples/language_model/moe/dygraph/run_moe_pretrain.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/language_model/moe/dygraph/run_moe_pretrain.py b/examples/language_model/moe/dygraph/run_moe_pretrain.py
index cabeeb926473..9f12c19a9698 100644
--- a/examples/language_model/moe/dygraph/run_moe_pretrain.py
+++ b/examples/language_model/moe/dygraph/run_moe_pretrain.py
@@ -143,12 +143,12 @@ def initialize_mp_dp_parameters(model, hcg):
             paddle.distributed.broadcast(param.detach(),
                                          src=mp_src_rank,
                                          group=mp_group,
-                                         use_calc_stream=True)
+                                         sync_op=True)
 
             paddle.distributed.broadcast(param.detach(),
                                          src=dp_src_rank,
                                          group=dp_group,
-                                         use_calc_stream=True)
+                                         sync_op=True)
 
 
 def unscale_method(self, optimizer):
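Note on patches 2 and 4: newer Paddle releases renamed the use_calc_stream keyword of the collective communication ops to sync_op, which is why the broadcast, all_reduce, and wait calls in this series change spelling only. Below is a minimal sketch of the new keyword; broadcast_params is a hypothetical helper that mirrors initialize_mp_dp_parameters from the diff, the src_rank and group arguments are assumed to come from an already initialized fleet HybridCommunicateGroup, and the script has to be started through paddle.distributed.launch for the broadcast to be meaningful.

import paddle
import paddle.distributed as dist


def broadcast_params(model, src_rank, group):
    # Broadcast every parameter from src_rank within the given group.
    # sync_op=True replaces the old use_calc_stream=True and keeps the
    # call synchronous, matching the behavior the patch preserves.
    with paddle.no_grad():
        for param in model.parameters():
            dist.broadcast(param.detach(),
                           src=src_rank,
                           group=group,
                           sync_op=True)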
From 1fddd1f6221f101c5f42afd5ad7d47e1c464038a Mon Sep 17 00:00:00 2001
From: wuhuachaocoding
Date: Wed, 21 Sep 2022 07:46:49 +0000
Subject: [PATCH 3/4] update.

---
 examples/language_model/moe/dygraph/modeling.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/examples/language_model/moe/dygraph/modeling.py b/examples/language_model/moe/dygraph/modeling.py
index 60ea35d327c2..71b1172d16fd 100644
--- a/examples/language_model/moe/dygraph/modeling.py
+++ b/examples/language_model/moe/dygraph/modeling.py
@@ -408,7 +408,9 @@ def __init__(self,
                  top_k=2,
                  hcg=None,
                  gate=None,
-                 recompute_interval=0):
+                 recompute_interval=0,
+                 recompute_partition=False,
+                 recompute_offload=False):
         self._config = locals()
         self._config.pop("self")
         self._config.pop("__class__", None) # py3
@@ -452,12 +454,19 @@ def __init__(self,
                 "type": "gshard",
                 "top_k": top_k,
             }
+
+            recompute_ctx = {
+                "mp_group": mp_group,
+                "offload": recompute_offload,
+                "partition": recompute_partition
+            }
             self.moe_mlp = MoeLayer(d_model=d_model,
                                     experts=experts_list,
                                     gate=gate_config,
                                     moe_group=moe_group,
                                     mp_group=mp_group,
-                                    recompute_interval=self.recompute_interval)
+                                    recompute_interval=self.recompute_interval,
+                                    recompute_ctx=recompute_ctx)
         else:
             self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
                 d_model,
@@ -793,7 +802,9 @@ def __init__(self,
                     top_k=top_k,
                     hcg=hcg,
                     gate=gate,
-                    recompute_interval=recompute_interval))
+                    recompute_interval=recompute_interval,
+                    recompute_partition=recompute_partition,
+                    recompute_offload=recompute_offload))
 
         self.decoder = TransformerDecoder(decoder_layers,
                                           num_hidden_layers,

From 725dd8b310cc9d7d5be6c4021c2828fa24693833 Mon Sep 17 00:00:00 2001
From: sljlp
Date: Wed, 21 Sep 2022 15:53:53 +0800
Subject: [PATCH 4/4] change use_calc_stream to sync_op

---
 examples/language_model/moe/dygraph/run_moe_pretrain.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/language_model/moe/dygraph/run_moe_pretrain.py b/examples/language_model/moe/dygraph/run_moe_pretrain.py
index 9f12c19a9698..183a96f39f69 100644
--- a/examples/language_model/moe/dygraph/run_moe_pretrain.py
+++ b/examples/language_model/moe/dygraph/run_moe_pretrain.py
@@ -206,7 +206,7 @@ def all_reduce_parameters(params, group):
     with paddle.framework.no_grad():
         for p in params:
             grad = p.grad.scale_(div_factor)
-            paddle.distributed.all_reduce(grad, use_calc_stream=True)
+            paddle.distributed.all_reduce(grad, sync_op=True)
 
 
 def parameters_classify(model, use_sharding=False):
@@ -492,9 +492,9 @@ def do_train(args):
                     dist.broadcast(p,
                                    src=sharding_group.ranks[0],
                                    group=sharding_group,
-                                   use_calc_stream=True)
+                                   sync_op=True)
 
                 # Multi stream operation will be supported later
-                dist.wait(tensor=p, group=sharding_group, use_calc_stream=True)
+                dist.wait(tensor=p, group=sharding_group, sync_op=True)
         else:
             initialize_mp_dp_parameters(model, hcg)
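Note on patches 1 and 3: the process-wide _initialize_recompute_hcg / _initialize_recompute_setting helpers are removed, and the recompute options now travel with each MoE layer through a recompute_ctx dict. The sketch below only mirrors the constructor call shown in the diff above; build_moe_mlp is a hypothetical helper, experts_list and the moe/mp process groups are assumed to be created elsewhere (for example from fleet's HybridCommunicateGroup), and the MoELayer keyword arguments are taken from the patch itself rather than from Paddle documentation.

from paddle.incubate.distributed.models import moe


def build_moe_mlp(d_model, experts_list, moe_group, mp_group,
                  recompute_interval=0, recompute_partition=False,
                  recompute_offload=False, top_k=2):
    # Per-layer recompute context that replaces the old process-wide settings.
    recompute_ctx = {
        "mp_group": mp_group,
        "offload": recompute_offload,
        "partition": recompute_partition,
    }
    # Same gate config and kwargs as the diff: a gshard gate with the given top_k.
    return moe.MoELayer(d_model=d_model,
                        experts=experts_list,
                        gate={"type": "gshard", "top_k": top_k},
                        moe_group=moe_group,
                        mp_group=mp_group,
                        recompute_interval=recompute_interval,
                        recompute_ctx=recompute_ctx)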