[NewIR] new ir dygraph to static support gpu (PaddlePaddle#55620)
* add kernel dialect

* change DenseTensorTypeStorage to DenseTensorType

* add test case

* add first pd_op to kernel dialect

* lower pd op to kernel dialect

* update

* update

* remove useless code

* add attribute print test

* fix bug

* update

* update

* update

* update

* polish code

* fix bug

* polish code and add python test

* add test

* fix test error

* relax constraint when inserting get_parameter

* add env flag

* fix bug

* dygraph2static support new ir

* fix bug

* revert test env

* change cc_test_old to cc_test

* update

* fix build_static bug

* update test

* fix type test error

* update cmake

* disable test in windows

* fix inference compile

* fix program translator error

* only run on cpu, gpu not supported yet

* fix conflict

* polish code

* fix bug

* add feed with place op

* update

* remove useless unittest

* update mkldnn

* update

* update

* align mkldnn version

* new ir support builtin slice op

* fix bug

* fix phi kernel adaptor bug

* add enable static

* add enable_static

* remove useless test case

* change feed list to single variable

* update

* add feed with place and shaddow output op

* fix bug

* remove useless code

* support gpu

* fix bug

* fix bug

* remove template

* add more data type

* fix compile bug

* update

* remove useless code

* revert dygraph2st test

* remove useless code

* revert op

* fix bug

* new ir dygraph2static support gpu

* remove useless code

* code polish

* add const

* revert code and remove useless code

* revert code

* revert legacy op yaml

* remove useless code

* delete std::move

---------

Co-authored-by: kangguangli <[email protected]>
2 people authored and wyf committed Aug 30, 2023
1 parent da7a95c commit 5098ba4
Showing 17 changed files with 459 additions and 36 deletions.
paddle/fluid/eager/to_static/run_program_op_node.h (47 additions, 12 deletions)
@@ -19,12 +19,16 @@
#include "paddle/fluid/eager/tensor_wrapper.h"
#include "paddle/fluid/framework/new_executor/interpretercore.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h"
#include "paddle/fluid/ir_adaptor/translator/program_translator.h"
#include "paddle/fluid/operators/run_program_op.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/ir/core/program.h"
#include "paddle/ir/core/value.h"

PHI_DECLARE_bool(enable_new_ir_in_executor);

namespace details {
using Tensor = paddle::Tensor;

@@ -367,16 +371,32 @@ inline void RunProgramAPI(
details::ShareTensorsIntoScope(x, global_inner_scope);
details::ShareTensorsIntoScope(params, global_inner_scope);
// Step 2. create new interpretercore
-    interpreter_core =
-        paddle::framework::CreateInterpreterCoreInfoToCache(*forward_program,
-                                                            place,
-                                                            /*is_grad=*/false,
-                                                            program_id,
-                                                            global_inner_scope);
+    if (FLAGS_enable_new_ir_in_executor) {
+      // build new ir program
+      auto ir_program = paddle::framework::ConstructFowardIrProgram(
+          forward_global_block, backward_global_block, output_names, x);
+      interpreter_core =
+          paddle::framework::CreateNewIRInterpreterCoreInfoToCache(
+              std::move(ir_program),
+              place,
+              /*is_grad=*/false,
+              program_id,
+              global_inner_scope);
+    } else {
+      interpreter_core =
+          paddle::framework::CreateProgramInterpreterCoreInfoToCache(
+              *forward_program,
+              place,
+              /*is_grad=*/false,
+              program_id,
+              global_inner_scope);
+    }
// Step 3. get all eager gc vars
std::set<std::string> skip_eager_delete_vars =
paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet(
*backward_program);

// all out_vars are skip_eager_var
skip_eager_delete_vars.insert(output_names.begin(), output_names.end());
skip_eager_delete_vars.insert(dout_names.begin(), dout_names.end());
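Note: the branch above (and the matching one in RunProgramGradAPI below) is
gated by the enable_new_ir_in_executor flag declared via PHI_DECLARE_bool at
the top of this header. A minimal sketch of switching the new path on from
C++ follows; the include path and the direct assignment are assumptions (in
practice the flag is typically set through the environment), not part of
this commit:

#include "paddle/phi/core/flags.h"

PHI_DECLARE_bool(enable_new_ir_in_executor);

void EnableNewIrExecutorForTest() {
  // Route RunProgramAPI through the new-IR interpreter-core path.
  FLAGS_enable_new_ir_in_executor = true;
}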
@@ -504,12 +524,27 @@ inline void RunProgramGradAPI(
1);
VLOG(2) << "No interpretercore cahce, so create a new interpretercore";
details::ShareTensorsIntoScope(out_grad, global_inner_scope);
-    interpreter_core =
-        paddle::framework::CreateInterpreterCoreInfoToCache(*backward_program,
-                                                            place,
-                                                            /*is_grad=*/true,
-                                                            program_id,
-                                                            global_inner_scope);
+    if (FLAGS_enable_new_ir_in_executor) {
+      auto res = paddle::framework::ConstructBackwardIrProgram(
+          backward_global_block, out_grad, x_grad, params_grad);
+
+      interpreter_core =
+          paddle::framework::CreateNewIRInterpreterCoreInfoToCache(
+              std::move(res),
+              place,
+              /*is_grad=*/true,
+              program_id,
+              global_inner_scope);
+    } else {
+      interpreter_core =
+          paddle::framework::CreateProgramInterpreterCoreInfoToCache(
+              *backward_program,
+              place,
+              /*is_grad=*/true,
+              program_id,
+              global_inner_scope);
+    }

// share threadpool
// NOTE(zhiqiu): this only works interpreter_core is executed strictly
paddle/fluid/framework/CMakeLists.txt (2 additions, 1 deletion)
@@ -1033,7 +1033,8 @@
cc_library(
executor_cache
SRCS executor_cache.cc
-  DEPS parallel_executor standalone_executor)
+  DEPS parallel_executor standalone_executor phi_kernel_adaptor
+       pd_op_to_kernel_pass ir)
if(WITH_PSCORE)
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
if(WITH_HETERPS)
paddle/fluid/framework/executor_cache.cc (164 additions, 3 deletions)
@@ -15,6 +15,8 @@
#include "paddle/fluid/framework/executor_cache.h"
#include "paddle/fluid/framework/new_executor/interpretercore.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h"
#include "paddle/fluid/ir_adaptor/translator/translate.h"
#include "paddle/ir/core/program.h"
#include "paddle/ir/core/value.h"

@@ -288,7 +290,7 @@ InterpreterCoreInfoCache &InterpreterCoreInfoCache::Instance() {
return g_info_cache;
}

-std::shared_ptr<InterpreterCore> CreateInterpreterCoreInfoToCache(
+std::shared_ptr<InterpreterCore> CreateProgramInterpreterCoreInfoToCache(
const ProgramDesc &program_desc,
const platform::Place &place,
bool is_grad,
@@ -304,13 +306,172 @@ std::shared_ptr<InterpreterCore> CreateInterpreterCoreInfoToCache(
interpreter::ExecutionConfig execution_config;
execution_config.create_local_scope = false;
execution_config.used_for_jit = true;
-  auto core = std::make_shared<InterpreterCore>(
-      place, program_desc.Block(0), scope, execution_config);
+  std::shared_ptr<InterpreterCore> core = nullptr;
+
+  core.reset(new InterpreterCore(
+      place, program_desc.Block(0), scope, execution_config));

auto &cached_value =
interpretercore_info_cache.GetMutable(program_id, is_grad);
cached_value.core_ = core;
return core;
}

std::shared_ptr<InterpreterCore> CreateNewIRInterpreterCoreInfoToCache(
std::unique_ptr<::ir::Program> ir_program,
const platform::Place &place,
bool is_grad,
int64_t program_id,
framework::Scope *scope) {
auto &interpretercore_info_cache =
framework::InterpreterCoreInfoCache::Instance();
if (interpretercore_info_cache.Size() > 10u /* max_cached_size*/) {
VLOG(2) << "The cached info size has exceeded max_cached_size: 4, clear "
"all cache!";
interpretercore_info_cache.Finalize();
}
interpreter::ExecutionConfig execution_config;
execution_config.create_local_scope = false;
execution_config.used_for_jit = true;

std::shared_ptr<InterpreterCore> core = nullptr;

core.reset(new InterpreterCore(
place, std::move(ir_program), scope, execution_config));

auto &cached_value =
interpretercore_info_cache.GetMutable(program_id, is_grad);
cached_value.core_ = core;
return core;
}

std::unique_ptr<::ir::Program> ConstructFowardIrProgram(
const paddle::framework::BlockDesc *forward_global_block,
const paddle::framework::BlockDesc *backward_global_block,
const std::vector<std::string> output_names,
const std::vector<paddle::Tensor> &x) {
auto ir_ctx = ::ir::IrContext::Instance();
auto program = std::make_unique<::ir::Program>(ir_ctx);

std::set<std::string> set_output_names;
auto local_program =
paddle::framework::ProgramDesc(*(forward_global_block->Program()));

for (auto op_desc : local_program.Block(0).AllOps()) {
    for (const auto &n : op_desc->Outputs()) {
      const auto &output_var_names = n.second;
      for (const auto &var_name : output_var_names) {
set_output_names.insert(var_name);
}
}
}

  // add feed_with_place ops for the input tensors
for (auto &in_t : x) {
auto name = in_t.name();
auto place = in_t.place().GetType();

auto op_desc = local_program.MutableBlock(0)->PrependOp();
op_desc->SetType("feed_with_place");
op_desc->SetAttr("index", 0);
// TODO(phlrain) : using tensor dtype
op_desc->SetAttr("dtype", 0);
op_desc->SetAttr("place", static_cast<int>(place));
op_desc->SetAttr("name", name);
op_desc->SetOutput("out", {name});
}

std::set<std::string> set_parameter_names;
for (auto op_desc : backward_global_block->Program()->Block(0).AllOps()) {
for (const auto &n : op_desc->Inputs()) {
const auto &input_var_names = n.second;
for (const auto &var_name : input_var_names) {
set_parameter_names.insert(var_name);
}
}
}

for (auto &t : output_names) {
set_parameter_names.insert(t);
}

for (auto &name : set_parameter_names) {
if (!set_output_names.count(name)) {
continue;
}

auto op_desc = local_program.MutableBlock(0)->AppendOp();
op_desc->SetType("shaddow_output");
op_desc->SetAttr("name", name);
op_desc->SetInput("x", {name});
op_desc->SetOutput("out", {"@EMPTY@"});
}

paddle::translator::ProgramTranslator program_translator(&local_program,
program.get());

program_translator.Translate();

auto ir_res = paddle::dialect::PdOpLowerToKernelPass(program.get());

return ir_res;
}

std::unique_ptr<::ir::Program> ConstructBackwardIrProgram(
const paddle::framework::BlockDesc *backward_global_block,
const std::vector<paddle::Tensor> &out_grad,
const std::vector<paddle::Tensor *> &x_grad,
const std::vector<paddle::Tensor *> &params_grad) {
auto ir_ctx = ::ir::IrContext::Instance();
auto program = std::make_unique<::ir::Program>(ir_ctx);

auto local_program =
paddle::framework::ProgramDesc(*(backward_global_block->Program()));
  // add feed_with_place ops for the incoming gradients
for (auto &out_grad_t : out_grad) {
auto name = out_grad_t.name();
auto place = out_grad_t.place().GetType();
if (name == "@EMPTY@") {
continue;
}
auto op_desc = local_program.MutableBlock(0)->PrependOp();
op_desc->SetType("feed_with_place");
op_desc->SetAttr("index", 0);
// TODO(phlrain) : using tensor dtype
op_desc->SetAttr("dtype", 0);
op_desc->SetAttr("place", static_cast<int>(place));
op_desc->SetAttr("name", name);
op_desc->SetOutput("out", {name});
}

std::vector<std::string> param_grad_names;
for (auto &p_g : params_grad) {
param_grad_names.push_back(p_g->name());
}

for (auto &t : x_grad) {
param_grad_names.push_back(t->name());
}
for (auto &name : param_grad_names) {
if (name == "@EMPTY@") {
continue;
}
auto op_desc = local_program.MutableBlock(0)->AppendOp();
op_desc->SetType("shaddow_output");
op_desc->SetAttr("name", name);
op_desc->SetInput("x", {name});
op_desc->SetOutput("out", {"@EMPTY@"});
}

paddle::translator::ProgramTranslator program_translator(&local_program,
program.get());
program_translator.Translate();

auto res = paddle::dialect::PdOpLowerToKernelPass(program.get());

return res;
}

} // namespace framework
} // namespace paddle
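Both Construct*IrProgram helpers above share the same translate-then-lower
shape. A condensed sketch of that shared structure (TranslateAndLower is a
hypothetical name; every call inside it appears verbatim in the functions
above):

// Translate a legacy ProgramDesc into the new IR, then lower pd.* ops into
// the kernel dialect so the new executor can run them.
std::unique_ptr<::ir::Program> TranslateAndLower(
    paddle::framework::ProgramDesc *legacy_program) {
  auto *ctx = ::ir::IrContext::Instance();
  auto program = std::make_unique<::ir::Program>(ctx);

  paddle::translator::ProgramTranslator translator(legacy_program,
                                                   program.get());
  translator.Translate();

  return paddle::dialect::PdOpLowerToKernelPass(program.get());
}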
paddle/fluid/framework/executor_cache.h (25 additions, 1 deletion)
@@ -29,6 +29,11 @@
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/string/string_helper.h"

#include "paddle/fluid/ir_adaptor/translator/program_translator.h"
#include "paddle/ir/core/dialect.h"
#include "paddle/ir/core/ir_context.h"
#include "paddle/ir/core/program.h"

namespace paddle {
namespace framework {
namespace ir {
@@ -218,12 +223,31 @@ class InterpreterCoreInfoCache {
std::unordered_map<int64_t, InterpreterCoreInfo> info_map_;
};

-std::shared_ptr<InterpreterCore> CreateInterpreterCoreInfoToCache(
+std::shared_ptr<InterpreterCore> CreateProgramInterpreterCoreInfoToCache(
const ProgramDesc& program_desc,
const platform::Place& place,
bool is_grad,
int64_t program_id,
framework::Scope* scope);

std::shared_ptr<InterpreterCore> CreateNewIRInterpreterCoreInfoToCache(
std::unique_ptr<::ir::Program> ir_prog,
const platform::Place& place,
bool is_grad,
int64_t program_id,
framework::Scope* scope);

std::unique_ptr<::ir::Program> ConstructFowardIrProgram(
const paddle::framework::BlockDesc* forward_global_block,
const paddle::framework::BlockDesc* backward_global_block,
const std::vector<std::string> output_names,
const std::vector<paddle::Tensor>& x);

std::unique_ptr<::ir::Program> ConstructBackwardIrProgram(
const paddle::framework::BlockDesc* backward_global_block,
const std::vector<paddle::Tensor>& out_grad,
const std::vector<paddle::Tensor*>& x_grad,
const std::vector<paddle::Tensor*>& params_grad);

} // namespace framework
} // namespace paddle
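A hedged sketch of how these declarations compose at a call site, mirroring
the run_program_op_node.h change above (BuildForwardCore and its parameter
names are placeholders, not part of this commit):

std::shared_ptr<paddle::framework::InterpreterCore> BuildForwardCore(
    const paddle::framework::BlockDesc *forward_block,
    const paddle::framework::BlockDesc *backward_block,
    const std::vector<std::string> &output_names,
    const std::vector<paddle::Tensor> &inputs,
    const paddle::platform::Place &place,
    int64_t program_id,
    paddle::framework::Scope *scope) {
  // Lower the legacy blocks to a kernel-dialect program, then cache an
  // interpreter core that executes it.
  auto ir_program = paddle::framework::ConstructFowardIrProgram(
      forward_block, backward_block, output_names, inputs);
  return paddle::framework::CreateNewIRInterpreterCoreInfoToCache(
      std::move(ir_program), place, /*is_grad=*/false, program_id, scope);
}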
@@ -958,7 +958,8 @@ void BuildOpFuncList(

if (op_name == "builtin.combine" || op_name == "pd.feed" ||
op_name == "builtin.set_parameter" ||
op_name == "builtin.get_parameter" || op_name == "builtin.slice") {
op_name == "builtin.get_parameter" || op_name == "builtin.slice" ||
op_name == "pd.feed_with_place" || op_name == "pd.shaddow_output") {
VLOG(6) << "skip process " << op_name;
continue;
}
paddle/fluid/framework/tensor_util.cc (1 addition, 1 deletion)
@@ -984,7 +984,7 @@ std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) {
do { \
if (paddle::framework::TransToProtoVarType(tensor.dtype()) == \
proto_type) { \
os << " - dtype: " << proto_type << "\n"; \
os << " - dtype: " << tensor.dtype() << "\n"; \
paddle::framework::print_tensor<cpp_type>(os, tensor); \
return os; \
} \