[IR] Add Construct event for new ir interpretercore #55555

Merged · 24 commits · Jul 30, 2023
33 changes: 22 additions & 11 deletions paddle/fluid/framework/new_executor/instruction/instruction_base.h
@@ -45,9 +45,9 @@ class InstructionBase {
  OpFuncType KernelType() const;
  void SetKernelType(OpFuncType type) { type_ = type; }

-  int GetStreamPriority() const { return scheduling_priority_; }
-  void SetStreamPriority(SchedulingPriority scheduling_priority) {
-    scheduling_priority_ = scheduling_priority;
+  int GetStreamPriority() const { return stream_priority_; }
+  void SetStreamPriority(int stream_priority) {
+    stream_priority_ = stream_priority;
  }

  SchedulingPriority GetSchedulingPriority() const {
@@ -107,22 +107,31 @@ class InstructionBase {
  std::map<int, int>& GetMutableInplaceBackMap() { return inplace_back_map_; }
  const std::map<int, int>& GetInplaceBackMap() { return inplace_back_map_; }

-  const std::unordered_map<ir::Value, std::vector<int>>& Inputs() const {
+  const std::unordered_map<::ir::Value, std::vector<int>>& Inputs() const {
    return input_index_;
  }
-  std::unordered_map<ir::Value, std::vector<int>>& GetMutableInputs() {
+  std::unordered_map<::ir::Value, std::vector<int>>& GetMutableInputs() {
    return input_index_;
  }
-  void SetInputs(const std::unordered_map<ir::Value, std::vector<int>>& inputs);
+  void SetInputs(
+      const std::unordered_map<::ir::Value, std::vector<int>>& inputs);

-  const std::unordered_map<ir::Value, std::vector<int>>& Outputs() const {
+  const std::unordered_map<::ir::Value, std::vector<int>>& Outputs() const {
    return output_index_;
  }
-  std::unordered_map<ir::Value, std::vector<int>>& GetMutableOutputs() {
+  std::unordered_map<::ir::Value, std::vector<int>>& GetMutableOutputs() {
    return output_index_;
  }
  void SetOutputs(
-      const std::unordered_map<ir::Value, std::vector<int>>& outputs);
+      const std::unordered_map<::ir::Value, std::vector<int>>& outputs);
+
+  const std::unordered_set<::ir::Value>& NoNeedBuffer() const {
+    return no_need_buffer_values_;
+  }
+  void SetNoNeedBuffer(
+      const std::unordered_set<::ir::Value>& no_need_buffer_values) {
+    no_need_buffer_values_ = no_need_buffer_values;
+  }

  virtual void Run() = 0;

@@ -159,9 +168,11 @@ class InstructionBase {

  std::map<int, int> inplace_back_map_;

-  std::unordered_map<ir::Value, std::vector<int>> input_index_;
+  std::unordered_map<::ir::Value, std::vector<int>> input_index_;

-  std::unordered_map<ir::Value, std::vector<int>> output_index_;
+  std::unordered_map<::ir::Value, std::vector<int>> output_index_;
+
+  std::unordered_set<::ir::Value> no_need_buffer_values_;
};

} // namespace framework
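
For orientation, here is a minimal sketch of how the new no-need-buffer bookkeeping could be consumed by a caller. The CanSkipRetain helper below is hypothetical and not part of this PR; it only illustrates what tracking no_need_buffer_values_ enables, namely identifying inputs whose allocation the kernel does not actually read.

// Hypothetical helper, not part of this PR: returns true when an input value's
// underlying buffer is not needed by the instruction's kernel (only metadata
// such as shape/dtype is used), so an executor could release or reuse that
// allocation early.
bool CanSkipRetain(const paddle::framework::InstructionBase& instr,
                   ::ir::Value input) {
  // NoNeedBuffer() is the set built from OpYamlInfoParser::NoNeedBufferIds().
  return instr.NoNeedBuffer().count(input) > 0;
}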
paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
@@ -15,12 +15,15 @@
 #include "paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h"

 #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
+#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/ir/dialect/pd_dialect.h"
 #include "paddle/fluid/ir/interface/infermeta.h"
 #include "paddle/fluid/ir/interface/op_yaml_info.h"
 #include "paddle/fluid/ir/interface/op_yaml_info_parser.h"
 #include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/core/meta_tensor.h"
 #include "paddle/phi/core/type_defs.h"
@@ -32,6 +35,77 @@
namespace paddle {
namespace framework {

+platform::DeviceContext* ParseDeviceContext(
+    ir::Operation* op,
+    platform::DeviceContext* origin_dev_ctx,
+    const platform::Place& place,
+    const std::string& execution_stream,
+    const int stream_priority) {
+  auto op_attributes = op->attributes();
+  auto op_name =
+      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
+  interpreter::ContextManager& ctx_manager =
+      interpreter::ContextManager::Instance();
+
+  platform::DeviceContext* dev_ctx = nullptr;
+
+  // Only GPU needs a stream update here; XPU does not, because the XPU memcpy
+  // op kernel is synchronous.
+  if (platform::is_gpu_place(place) || platform::is_custom_place(place)) {
+    VLOG(6) << "Parse DeviceContext for " << op_name
+            << ", execution stream = " << execution_stream;
+    if (execution_stream != kDefaultStream) {
+      dev_ctx = ctx_manager
+                    .Get(std::string(kCustomStream) + "-" + execution_stream,
+                         place,
+                         stream_priority)
+                    .get()
+                    .get();
+      interpreter::SetDeviceCommContext(op, dev_ctx);
+      return dev_ctx;
+    }
+
+    if (op_name == interpreter::kMemcpyD2H) {
+      dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority)
+                    .get()
+                    .get();
+      interpreter::SetDeviceCommContext(op, dev_ctx);
+      return dev_ctx;
+    } else if (op_name == interpreter::kMemcpyH2D) {
+      dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority)
+                    .get()
+                    .get();
+      interpreter::SetDeviceCommContext(op, dev_ctx);
+      return dev_ctx;
+    }
+
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    // NOTE(Ruibiao): This supports multi-stream overlap for c_allreduce_sum
+    // with use_calc_stream==false by returning a device context obtained from
+    // the global NCCLCommContext instance. When use_calc_stream==false, the OP
+    // kernel launches the NCCL communication on a stream taken directly from
+    // the global NCCLCommContext instance rather than on the DeviceContext
+    // passed in by the executor (see CAllReduceOpCUDAKernel in
+    // c_allreduce_op.h). For now this is a temporary solution ONLY for
+    // c_allreduce_sum, which is used in ResNet50 distributed training.
+    if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream")
+                                                .dyn_cast<::ir::BoolAttribute>()
+                                                .data() == false) {
+      int ring_id =
+          op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data();
+      return platform::NCCLCommContext::Instance()
+          .Get(ring_id, place)
+          ->dev_context();
+    }
+#endif
+  }
+
+  if (origin_dev_ctx != nullptr) {
+    interpreter::SetDeviceCommContext(op, origin_dev_ctx);
+  }
+  return origin_dev_ctx;
+}
+
OpFuncType AnalyseOpFuncType(ir::Operation* op, const platform::Place& place) {
  if (platform::is_cpu_place(place)) {
    return OpFuncType::kCpuSync;
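
A minimal usage sketch of the new ParseDeviceContext follows; the op, place, and stream arguments are assumed placeholders, not taken from this PR. On a GPU or custom place it routes memcpy ops and ops with a named execution stream to dedicated device contexts from the ContextManager; everything else falls back to origin_dev_ctx.

// Sketch only: `op` is an assumed ir::Operation* and `place` an assumed GPU place.
platform::DeviceContext* ctx = ParseDeviceContext(
    op,
    phi::DeviceContextPool::Instance().Get(place),  // default calc-stream context
    place,
    /*execution_stream=*/kDefaultStream,            // no custom stream requested
    /*stream_priority=*/0);
// An op named interpreter::kMemcpyD2H or kMemcpyH2D would instead receive the
// dedicated D2H/H2D stream context; a non-default execution_stream maps to a
// context keyed by kCustomStream + "-" + execution_stream.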
@@ -172,15 +246,27 @@ PhiKernelInstruction::PhiKernelInstruction(
  kernel_context_.SetDeviceContext(phi::DeviceContextPool::Instance().Get(
      phi::TransToPhiPlace(kernel_key.backend())));
  VLOG(6) << "finish process kernel context";

-  SetDeviceContext(phi::DeviceContextPool::Instance().Get(
-      phi::TransToPhiPlace(kernel_key.backend())));
+  SetDeviceContext(
+      ParseDeviceContext(op,
+                         phi::DeviceContextPool::Instance().Get(
+                             phi::TransToPhiPlace(kernel_key.backend())),
+                         place,
+                         GetExecutionStream(),
+                         GetStreamPriority()));
  VLOG(6) << "finish process device context";

  Scope* inner_scope = local_scope == nullptr ? scope : local_scope;
  InitInputsOutputsIds(
      op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name);
  VLOG(6) << "finish process inputs outputs index";

+  auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds();
+  std::unordered_set<::ir::Value> no_need_buffer_values;
+  for (size_t id = 0; id < no_need_buffer_ids.size(); id++) {
+    no_need_buffer_values.insert(op->operand(no_need_buffer_ids[id]));
+  }
+  SetNoNeedBuffer(no_need_buffer_values);
+  VLOG(6) << "finish process no need buffer";
}

std::vector<int> GetValueIds(