[IR] Add Construct event for new ir interpretercore #55555

Merged · 24 commits · Jul 30, 2023
33 changes: 22 additions & 11 deletions paddle/fluid/framework/new_executor/instruction/instruction_base.h
@@ -45,9 +45,9 @@ class InstructionBase {
  OpFuncType KernelType() const;
  void SetKernelType(OpFuncType type) { type_ = type; }

-  int GetStreamPriority() const { return scheduling_priority_; }
-  void SetStreamPriority(SchedulingPriority scheduling_priority) {
-    scheduling_priority_ = scheduling_priority;
+  int GetStreamPriority() const { return stream_priority_; }
+  void SetStreamPriority(int stream_priority) {
+    stream_priority_ = stream_priority;
  }

  SchedulingPriority GetSchedulingPriority() const {
@@ -107,22 +107,31 @@ class InstructionBase {
  std::map<int, int>& GetMutableInplaceBackMap() { return inplace_back_map_; }
  const std::map<int, int>& GetInplaceBackMap() { return inplace_back_map_; }

-  const std::unordered_map<ir::Value, std::vector<int>>& Inputs() const {
+  const std::unordered_map<::ir::Value, std::vector<int>>& Inputs() const {
    return input_index_;
  }
-  std::unordered_map<ir::Value, std::vector<int>>& GetMutableInputs() {
+  std::unordered_map<::ir::Value, std::vector<int>>& GetMutableInputs() {
    return input_index_;
  }
-  void SetInputs(const std::unordered_map<ir::Value, std::vector<int>>& inputs);
+  void SetInputs(
+      const std::unordered_map<::ir::Value, std::vector<int>>& inputs);

-  const std::unordered_map<ir::Value, std::vector<int>>& Outputs() const {
+  const std::unordered_map<::ir::Value, std::vector<int>>& Outputs() const {
    return output_index_;
  }
-  std::unordered_map<ir::Value, std::vector<int>>& GetMutableOutputs() {
+  std::unordered_map<::ir::Value, std::vector<int>>& GetMutableOutputs() {
    return output_index_;
  }
  void SetOutputs(
-      const std::unordered_map<ir::Value, std::vector<int>>& outputs);
+      const std::unordered_map<::ir::Value, std::vector<int>>& outputs);
+
+  const std::unordered_set<::ir::Value>& NoNeedBuffer() const {
+    return no_need_buffer_values_;
+  }
+  void SetNoNeedBuffer(
+      const std::unordered_set<::ir::Value>& no_need_buffer_values) {
+    no_need_buffer_values_ = no_need_buffer_values;
+  }

  virtual void Run() = 0;

@@ -159,9 +168,11 @@ class InstructionBase {

  std::map<int, int> inplace_back_map_;

-  std::unordered_map<ir::Value, std::vector<int>> input_index_;
+  std::unordered_map<::ir::Value, std::vector<int>> input_index_;

-  std::unordered_map<ir::Value, std::vector<int>> output_index_;
+  std::unordered_map<::ir::Value, std::vector<int>> output_index_;
+
+  std::unordered_set<::ir::Value> no_need_buffer_values_;
};

} // namespace framework
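
For orientation, here is a minimal sketch of how the new no-need-buffer bookkeeping could be consumed by a caller. The CanSkipRetain helper below is hypothetical and not part of this PR; it only illustrates what tracking no_need_buffer_values_ enables, namely identifying inputs whose allocation the kernel does not actually read.

// Hypothetical helper, not part of this PR: returns true when an input value's
// underlying buffer is not needed by the instruction's kernel (only metadata
// such as shape/dtype is used), so an executor could release or reuse that
// allocation early.
bool CanSkipRetain(const paddle::framework::InstructionBase& instr,
                   ::ir::Value input) {
  // NoNeedBuffer() is the set built from OpYamlInfoParser::NoNeedBufferIds().
  return instr.NoNeedBuffer().count(input) > 0;
}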
paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
@@ -15,12 +15,15 @@
 #include "paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h"

 #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
+#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/ir/dialect/pd_dialect.h"
 #include "paddle/fluid/ir/interface/infermeta.h"
 #include "paddle/fluid/ir/interface/op_yaml_info.h"
 #include "paddle/fluid/ir/interface/op_yaml_info_parser.h"
 #include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/core/meta_tensor.h"
 #include "paddle/phi/core/type_defs.h"
@@ -32,6 +35,77 @@
namespace paddle {
namespace framework {

+platform::DeviceContext* ParseDeviceContext(
+    ir::Operation* op,
+    platform::DeviceContext* origin_dev_ctx,
+    const platform::Place& place,
+    const std::string& execution_stream,
+    const int stream_priority) {
+  auto op_attributes = op->attributes();
+  auto op_name =
+      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
+  interpreter::ContextManager& ctx_manager =
+      interpreter::ContextManager::Instance();
+
+  platform::DeviceContext* dev_ctx = nullptr;
+
+  // Only GPU needs a stream update here; XPU does not, because the XPU memcpy
+  // op kernel is synchronous.
+  if (platform::is_gpu_place(place) || platform::is_custom_place(place)) {
+    VLOG(6) << "Parse DeviceContext for " << op_name
+            << ", execution stream = " << execution_stream;
+    if (execution_stream != kDefaultStream) {
+      dev_ctx = ctx_manager
+                    .Get(std::string(kCustomStream) + "-" + execution_stream,
+                         place,
+                         stream_priority)
+                    .get()
+                    .get();
+      interpreter::SetDeviceCommContext(op, dev_ctx);
+      return dev_ctx;
+    }
+
+    if (op_name == interpreter::kMemcpyD2H) {
+      dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority)
+                    .get()
+                    .get();
+      interpreter::SetDeviceCommContext(op, dev_ctx);
+      return dev_ctx;
+    } else if (op_name == interpreter::kMemcpyH2D) {
+      dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority)
+                    .get()
+                    .get();
+      interpreter::SetDeviceCommContext(op, dev_ctx);
+      return dev_ctx;
+    }
+
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    // NOTE(Ruibiao): This supports multi-stream overlap for c_allreduce_sum
+    // with use_calc_stream==false by returning a device context obtained from
+    // the global NCCLCommContext instance. When use_calc_stream==false, the OP
+    // kernel launches the NCCL communication on a stream taken directly from
+    // the global NCCLCommContext instance rather than on the DeviceContext
+    // passed in by the executor (see CAllReduceOpCUDAKernel in
+    // c_allreduce_op.h). For now this is a temporary solution ONLY for
+    // c_allreduce_sum, which is used in ResNet50 distributed training.
+    if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream")
+                                                .dyn_cast<::ir::BoolAttribute>()
+                                                .data() == false) {
+      int ring_id =
+          op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data();
+      return platform::NCCLCommContext::Instance()
+          .Get(ring_id, place)
+          ->dev_context();
+    }
+#endif
+  }
+
+  if (origin_dev_ctx != nullptr) {
+    interpreter::SetDeviceCommContext(op, origin_dev_ctx);
+  }
+  return origin_dev_ctx;
+}
+
OpFuncType AnalyseOpFuncType(ir::Operation* op, const platform::Place& place) {
  if (platform::is_cpu_place(place)) {
    return OpFuncType::kCpuSync;
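
A minimal usage sketch of the new ParseDeviceContext follows; the op, place, and stream arguments are assumed placeholders, not taken from this PR. On a GPU or custom place it routes memcpy ops and ops with a named execution stream to dedicated device contexts from the ContextManager; everything else falls back to origin_dev_ctx.

// Sketch only: `op` is an assumed ir::Operation* and `place` an assumed GPU place.
platform::DeviceContext* ctx = ParseDeviceContext(
    op,
    phi::DeviceContextPool::Instance().Get(place),  // default calc-stream context
    place,
    /*execution_stream=*/kDefaultStream,            // no custom stream requested
    /*stream_priority=*/0);
// An op named interpreter::kMemcpyD2H or kMemcpyH2D would instead receive the
// dedicated D2H/H2D stream context; a non-default execution_stream maps to a
// context keyed by kCustomStream + "-" + execution_stream.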
@@ -172,15 +246,27 @@ PhiKernelInstruction::PhiKernelInstruction(
  kernel_context_.SetDeviceContext(phi::DeviceContextPool::Instance().Get(
      phi::TransToPhiPlace(kernel_key.backend())));
  VLOG(6) << "finish process kernel context";

-  SetDeviceContext(phi::DeviceContextPool::Instance().Get(
-      phi::TransToPhiPlace(kernel_key.backend())));
+  SetDeviceContext(
+      ParseDeviceContext(op,
+                         phi::DeviceContextPool::Instance().Get(
+                             phi::TransToPhiPlace(kernel_key.backend())),
+                         place,
+                         GetExecutionStream(),
+                         GetStreamPriority()));
  VLOG(6) << "finish process device context";

  Scope* inner_scope = local_scope == nullptr ? scope : local_scope;
  InitInputsOutputsIds(
      op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name);
  VLOG(6) << "finish process inputs outputs index";

+  auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds();
+  std::unordered_set<::ir::Value> no_need_buffer_values;
+  for (size_t id = 0; id < no_need_buffer_ids.size(); id++) {
+    no_need_buffer_values.insert(op->operand(no_need_buffer_ids[id]));
+  }
+  SetNoNeedBuffer(no_need_buffer_values);
+  VLOG(6) << "finish process no need buffer";
}

std::vector<int> GetValueIds(