[NewIR] new ir dygraph to static support gpu (PaddlePaddle#55620)
* add kernel dialect

* change DenseTensorTypeStorage to DenseTensorType

* add test case

* add first pd_op to kernel dialect

* lower pd op to kernel dialect

* update

* update

* remove useless code

* add attribute print test

* fix bug

* update

* update

* update

* update

* polish code

* fix bug

* polish code and add python test

* add test

* fix test error

* relax constraint when inserting get_parameter

* add env flag

* fix bug

* dygraph2static support new ir

* fix bug

* revert test env

* change cc_test_old to cc_test

* update

* fix build_static bug

* update test

* fix type test error

* update cmake

* disable test in windows

* fix inference compile

* fix program translator error

* only run on cpu, gpu not supported yet

* fix conflict

* polish code

* fix bug

* add feed with place op

* update

* remove useless unittest

* update mkldnn

* update

* update

* align mkldnn version

* new ir support builtin slice op

* fix bug

* fix phi kernel adaptor bug

* add enable static

* add enable_static

* remove useless test case

* change feed list to single variable

* update

* add feed with place and shaddow output op

* fix bug

* remove useless code

* support gpu

* fix bug

* fix bug

* remove template

* add more data type

* fix compile bug

* update

* remove useless code

* revert dygraph2st test

* remove useless code

* revert op

* fix bug

* new ir dygraph2static support gpu

* remove useless code

* code polish

* add const

* revert code and remove useless code

* revert code

* revert legacy op yaml

* remove useless code

* delete std::move

---------

Co-authored-by: kangguangli <[email protected]>
2 people authored and wyf committed Aug 30, 2023
1 parent da7a95c commit 5098ba4
Showing 17 changed files with 459 additions and 36 deletions.
paddle/fluid/eager/to_static/run_program_op_node.h (47 additions, 12 deletions)
@@ -19,12 +19,16 @@
#include "paddle/fluid/eager/tensor_wrapper.h"
#include "paddle/fluid/framework/new_executor/interpretercore.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h"
#include "paddle/fluid/ir_adaptor/translator/program_translator.h"
#include "paddle/fluid/operators/run_program_op.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/ir/core/program.h"
#include "paddle/ir/core/value.h"

PHI_DECLARE_bool(enable_new_ir_in_executor);

namespace details {
using Tensor = paddle::Tensor;

@@ -367,16 +371,32 @@ inline void RunProgramAPI(
details::ShareTensorsIntoScope(x, global_inner_scope);
details::ShareTensorsIntoScope(params, global_inner_scope);
// Step 2. create new interpretercore
-    interpreter_core =
-        paddle::framework::CreateInterpreterCoreInfoToCache(*forward_program,
-                                                            place,
-                                                            /*is_grad=*/false,
-                                                            program_id,
-                                                            global_inner_scope);
+    if (FLAGS_enable_new_ir_in_executor) {
+      // build new ir program
+      auto ir_program = paddle::framework::ConstructFowardIrProgram(
+          forward_global_block, backward_global_block, output_names, x);
+      interpreter_core =
+          paddle::framework::CreateNewIRInterpreterCoreInfoToCache(
+              std::move(ir_program),
+              place,
+              /*is_grad=*/false,
+              program_id,
+              global_inner_scope);
+    } else {
+      interpreter_core =
+          paddle::framework::CreateProgramInterpreterCoreInfoToCache(
+              *forward_program,
+              place,
+              /*is_grad=*/false,
+              program_id,
+              global_inner_scope);
+    }
// Step 3. get all eager gc vars
std::set<std::string> skip_eager_delete_vars =
paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet(
*backward_program);

// all out_vars are skip_eager_var
skip_eager_delete_vars.insert(output_names.begin(), output_names.end());
skip_eager_delete_vars.insert(dout_names.begin(), dout_names.end());
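Note: the branch above (and the matching one in RunProgramGradAPI below) is
gated by the enable_new_ir_in_executor flag declared via PHI_DECLARE_bool at
the top of this header. A minimal sketch of switching the new path on from
C++ follows; the include path and the direct assignment are assumptions (in
practice the flag is typically set through the environment), not part of
this commit:

#include "paddle/phi/core/flags.h"

PHI_DECLARE_bool(enable_new_ir_in_executor);

void EnableNewIrExecutorForTest() {
  // Route RunProgramAPI through the new-IR interpreter-core path.
  FLAGS_enable_new_ir_in_executor = true;
}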
@@ -504,12 +524,27 @@ inline void RunProgramGradAPI(
1);
VLOG(2) << "No interpretercore cahce, so create a new interpretercore";
details::ShareTensorsIntoScope(out_grad, global_inner_scope);
-    interpreter_core =
-        paddle::framework::CreateInterpreterCoreInfoToCache(*backward_program,
-                                                            place,
-                                                            /*is_grad=*/true,
-                                                            program_id,
-                                                            global_inner_scope);
+    if (FLAGS_enable_new_ir_in_executor) {
+      auto res = paddle::framework::ConstructBackwardIrProgram(
+          backward_global_block, out_grad, x_grad, params_grad);
+
+      interpreter_core =
+          paddle::framework::CreateNewIRInterpreterCoreInfoToCache(
+              std::move(res),
+              place,
+              /*is_grad=*/true,
+              program_id,
+              global_inner_scope);
+    } else {
+      interpreter_core =
+          paddle::framework::CreateProgramInterpreterCoreInfoToCache(
+              *backward_program,
+              place,
+              /*is_grad=*/true,
+              program_id,
+              global_inner_scope);
+    }

// share threadpool
// NOTE(zhiqiu): this only works interpreter_core is executed strictly
paddle/fluid/framework/CMakeLists.txt (2 additions, 1 deletion)
@@ -1033,7 +1033,8 @@
cc_library(
executor_cache
SRCS executor_cache.cc
-  DEPS parallel_executor standalone_executor)
+  DEPS parallel_executor standalone_executor phi_kernel_adaptor
+       pd_op_to_kernel_pass ir)
if(WITH_PSCORE)
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
if(WITH_HETERPS)
paddle/fluid/framework/executor_cache.cc (164 additions, 3 deletions)
@@ -15,6 +15,8 @@
#include "paddle/fluid/framework/executor_cache.h"
#include "paddle/fluid/framework/new_executor/interpretercore.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h"
#include "paddle/fluid/ir_adaptor/translator/translate.h"
#include "paddle/ir/core/program.h"
#include "paddle/ir/core/value.h"

@@ -288,7 +290,7 @@ InterpreterCoreInfoCache &InterpreterCoreInfoCache::Instance() {
return g_info_cache;
}

-std::shared_ptr<InterpreterCore> CreateInterpreterCoreInfoToCache(
+std::shared_ptr<InterpreterCore> CreateProgramInterpreterCoreInfoToCache(
const ProgramDesc &program_desc,
const platform::Place &place,
bool is_grad,
@@ -304,13 +306,172 @@ std::shared_ptr<InterpreterCore> CreateInterpreterCoreInfoToCache(
interpreter::ExecutionConfig execution_config;
execution_config.create_local_scope = false;
execution_config.used_for_jit = true;
-  auto core = std::make_shared<InterpreterCore>(
-      place, program_desc.Block(0), scope, execution_config);
+  std::shared_ptr<InterpreterCore> core = nullptr;
+
+  core.reset(new InterpreterCore(
+      place, program_desc.Block(0), scope, execution_config));

auto &cached_value =
interpretercore_info_cache.GetMutable(program_id, is_grad);
cached_value.core_ = core;
return core;
}

std::shared_ptr<InterpreterCore> CreateNewIRInterpreterCoreInfoToCache(
std::unique_ptr<::ir::Program> ir_program,
const platform::Place &place,
bool is_grad,
int64_t program_id,
framework::Scope *scope) {
auto &interpretercore_info_cache =
framework::InterpreterCoreInfoCache::Instance();
if (interpretercore_info_cache.Size() > 10u /* max_cached_size*/) {
VLOG(2) << "The cached info size has exceeded max_cached_size: 4, clear "
"all cache!";
interpretercore_info_cache.Finalize();
}
interpreter::ExecutionConfig execution_config;
execution_config.create_local_scope = false;
execution_config.used_for_jit = true;

std::shared_ptr<InterpreterCore> core = nullptr;

core.reset(new InterpreterCore(
place, std::move(ir_program), scope, execution_config));

auto &cached_value =
interpretercore_info_cache.GetMutable(program_id, is_grad);
cached_value.core_ = core;
return core;
}

std::unique_ptr<::ir::Program> ConstructFowardIrProgram(
const paddle::framework::BlockDesc *forward_global_block,
const paddle::framework::BlockDesc *backward_global_block,
const std::vector<std::string> output_names,
const std::vector<paddle::Tensor> &x) {
auto ir_ctx = ::ir::IrContext::Instance();
auto program = std::make_unique<::ir::Program>(ir_ctx);

std::set<std::string> set_output_names;
auto local_program =
paddle::framework::ProgramDesc(*(forward_global_block->Program()));

for (auto op_desc : local_program.Block(0).AllOps()) {
    for (const auto &n : op_desc->Outputs()) {
      const auto &output_var_names = n.second;
      for (const auto &var_name : output_var_names) {
set_output_names.insert(var_name);
}
}
}

  // add feed_with_place ops for the input tensors
for (auto &in_t : x) {
auto name = in_t.name();
auto place = in_t.place().GetType();

auto op_desc = local_program.MutableBlock(0)->PrependOp();
op_desc->SetType("feed_with_place");
op_desc->SetAttr("index", 0);
// TODO(phlrain) : using tensor dtype
op_desc->SetAttr("dtype", 0);
op_desc->SetAttr("place", static_cast<int>(place));
op_desc->SetAttr("name", name);
op_desc->SetOutput("out", {name});
}

std::set<std::string> set_parameter_names;
for (auto op_desc : backward_global_block->Program()->Block(0).AllOps()) {
for (const auto &n : op_desc->Inputs()) {
const auto &input_var_names = n.second;
for (const auto &var_name : input_var_names) {
set_parameter_names.insert(var_name);
}
}
}

for (auto &t : output_names) {
set_parameter_names.insert(t);
}

for (auto &name : set_parameter_names) {
if (!set_output_names.count(name)) {
continue;
}

auto op_desc = local_program.MutableBlock(0)->AppendOp();
op_desc->SetType("shaddow_output");
op_desc->SetAttr("name", name);
op_desc->SetInput("x", {name});
op_desc->SetOutput("out", {"@EMPTY@"});
}

paddle::translator::ProgramTranslator program_translator(&local_program,
program.get());

program_translator.Translate();

auto ir_res = paddle::dialect::PdOpLowerToKernelPass(program.get());

return ir_res;
}

std::unique_ptr<::ir::Program> ConstructBackwardIrProgram(
const paddle::framework::BlockDesc *backward_global_block,
const std::vector<paddle::Tensor> &out_grad,
const std::vector<paddle::Tensor *> &x_grad,
const std::vector<paddle::Tensor *> &params_grad) {
auto ir_ctx = ::ir::IrContext::Instance();
auto program = std::make_unique<::ir::Program>(ir_ctx);

auto local_program =
paddle::framework::ProgramDesc(*(backward_global_block->Program()));
  // add feed_with_place ops for the incoming gradients
for (auto &out_grad_t : out_grad) {
auto name = out_grad_t.name();
auto place = out_grad_t.place().GetType();
if (name == "@EMPTY@") {
continue;
}
auto op_desc = local_program.MutableBlock(0)->PrependOp();
op_desc->SetType("feed_with_place");
op_desc->SetAttr("index", 0);
// TODO(phlrain) : using tensor dtype
op_desc->SetAttr("dtype", 0);
op_desc->SetAttr("place", static_cast<int>(place));
op_desc->SetAttr("name", name);
op_desc->SetOutput("out", {name});
}

std::vector<std::string> param_grad_names;
for (auto &p_g : params_grad) {
param_grad_names.push_back(p_g->name());
}

for (auto &t : x_grad) {
param_grad_names.push_back(t->name());
}
for (auto &name : param_grad_names) {
if (name == "@EMPTY@") {
continue;
}
auto op_desc = local_program.MutableBlock(0)->AppendOp();
op_desc->SetType("shaddow_output");
op_desc->SetAttr("name", name);
op_desc->SetInput("x", {name});
op_desc->SetOutput("out", {"@EMPTY@"});
}

paddle::translator::ProgramTranslator program_translator(&local_program,
program.get());
program_translator.Translate();

auto res = paddle::dialect::PdOpLowerToKernelPass(program.get());

return res;
}

} // namespace framework
} // namespace paddle
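Both Construct*IrProgram helpers above share the same translate-then-lower
shape. A condensed sketch of that shared structure (TranslateAndLower is a
hypothetical name; every call inside it appears verbatim in the functions
above):

// Translate a legacy ProgramDesc into the new IR, then lower pd.* ops into
// the kernel dialect so the new executor can run them.
std::unique_ptr<::ir::Program> TranslateAndLower(
    paddle::framework::ProgramDesc *legacy_program) {
  auto *ctx = ::ir::IrContext::Instance();
  auto program = std::make_unique<::ir::Program>(ctx);

  paddle::translator::ProgramTranslator translator(legacy_program,
                                                   program.get());
  translator.Translate();

  return paddle::dialect::PdOpLowerToKernelPass(program.get());
}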
paddle/fluid/framework/executor_cache.h (25 additions, 1 deletion)
@@ -29,6 +29,11 @@
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/string/string_helper.h"

#include "paddle/fluid/ir_adaptor/translator/program_translator.h"
#include "paddle/ir/core/dialect.h"
#include "paddle/ir/core/ir_context.h"
#include "paddle/ir/core/program.h"

namespace paddle {
namespace framework {
namespace ir {
@@ -218,12 +223,31 @@ class InterpreterCoreInfoCache {
std::unordered_map<int64_t, InterpreterCoreInfo> info_map_;
};

-std::shared_ptr<InterpreterCore> CreateInterpreterCoreInfoToCache(
+std::shared_ptr<InterpreterCore> CreateProgramInterpreterCoreInfoToCache(
const ProgramDesc& program_desc,
const platform::Place& place,
bool is_grad,
int64_t program_id,
framework::Scope* scope);

std::shared_ptr<InterpreterCore> CreateNewIRInterpreterCoreInfoToCache(
std::unique_ptr<::ir::Program> ir_prog,
const platform::Place& place,
bool is_grad,
int64_t program_id,
framework::Scope* scope);

std::unique_ptr<::ir::Program> ConstructFowardIrProgram(
const paddle::framework::BlockDesc* forward_global_block,
const paddle::framework::BlockDesc* backward_global_block,
const std::vector<std::string> output_names,
const std::vector<paddle::Tensor>& x);

std::unique_ptr<::ir::Program> ConstructBackwardIrProgram(
const paddle::framework::BlockDesc* backward_global_block,
const std::vector<paddle::Tensor>& out_grad,
const std::vector<paddle::Tensor*>& x_grad,
const std::vector<paddle::Tensor*>& params_grad);

} // namespace framework
} // namespace paddle
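A hedged sketch of how these declarations compose at a call site, mirroring
the run_program_op_node.h change above (BuildForwardCore and its parameter
names are placeholders, not part of this commit):

std::shared_ptr<paddle::framework::InterpreterCore> BuildForwardCore(
    const paddle::framework::BlockDesc *forward_block,
    const paddle::framework::BlockDesc *backward_block,
    const std::vector<std::string> &output_names,
    const std::vector<paddle::Tensor> &inputs,
    const paddle::platform::Place &place,
    int64_t program_id,
    paddle::framework::Scope *scope) {
  // Lower the legacy blocks to a kernel-dialect program, then cache an
  // interpreter core that executes it.
  auto ir_program = paddle::framework::ConstructFowardIrProgram(
      forward_block, backward_block, output_names, inputs);
  return paddle::framework::CreateNewIRInterpreterCoreInfoToCache(
      std::move(ir_program), place, /*is_grad=*/false, program_id, scope);
}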
@@ -958,7 +958,8 @@ void BuildOpFuncList(

if (op_name == "builtin.combine" || op_name == "pd.feed" ||
op_name == "builtin.set_parameter" ||
op_name == "builtin.get_parameter" || op_name == "builtin.slice") {
op_name == "builtin.get_parameter" || op_name == "builtin.slice" ||
op_name == "pd.feed_with_place" || op_name == "pd.shaddow_output") {
VLOG(6) << "skip process " << op_name;
continue;
}
paddle/fluid/framework/tensor_util.cc (1 addition, 1 deletion)
@@ -984,7 +984,7 @@ std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) {
do { \
if (paddle::framework::TransToProtoVarType(tensor.dtype()) == \
proto_type) { \
os << " - dtype: " << proto_type << "\n"; \
os << " - dtype: " << tensor.dtype() << "\n"; \
paddle::framework::print_tensor<cpp_type>(os, tensor); \
return os; \
} \