Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangbo9674 committed Nov 30, 2023
2 parents 48be391 + 6579c90 commit 7340d73
Show file tree
Hide file tree
Showing 109 changed files with 3,991 additions and 1,526 deletions.
10 changes: 0 additions & 10 deletions paddle/cinn/hlir/framework/pir/op_lowering_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -502,16 +502,6 @@ std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
}
}

// add fake symbolic args for test
if (FLAGS_cinn_bucket_compile) {
group_func_args.emplace_back(ir::_Var_::Make("fake_symbol1", Int(32)),
ir::Argument::IO::kOutput);
group_func_args.emplace_back(ir::_Var_::Make("fake_symbol2", Int(32)),
ir::Argument::IO::kOutput);
group->output_names.push_back("fake_symbol1");
group->output_names.push_back("fake_symbol2");
}

#ifdef CINN_WITH_CUDA
optim::OptimizeExprGPU(&(func_body));
#endif
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/distributed/collective/bkcl_tools.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ std::string BKCLDTypeToString(BKCLDataType dtype) {

#undef PD_BKCL_DTYPE_TO_STR
PADDLE_THROW(phi::errors::InvalidArgument(
"This datatype %d in nccl is not supported.", static_cast<int>(dtype)));
"This datatype %d in bkcl is not supported.", static_cast<int>(dtype)));
}

std::string BKCLRedTypeToString(BKCLOp op) {
Expand Down
12 changes: 6 additions & 6 deletions paddle/fluid/distributed/collective/process_group_bkcl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Recv(
[&](phi::distributed::BKCLCommContext* comm_context,
XPUStream stream,
int rank_in_group) {
VLOG(3) << "[bkcl_recv] "
VLOG(3) << "bkcl_recv "
<< "recvbuff: " << tensor->data()
<< ", count: " << tensor->numel() << ", datatype: "
<< BKCLDTypeToString(phi::ToBKCLDataType(tensor->dtype()))
Expand Down Expand Up @@ -146,7 +146,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Send(
[&](phi::distributed::BKCLCommContext* comm_context,
XPUStream stream,
int rank_in_group) {
VLOG(3) << "[bkcl_send] "
VLOG(3) << "bkcl_send "
<< "sendbuff: " << tensor_maybe_partial.data()
<< ", count: " << tensor_maybe_partial.numel() << ", datatype: "
<< BKCLDTypeToString(
Expand Down Expand Up @@ -360,7 +360,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
[&](phi::distributed::BKCLCommContext* comm_context, XPUStream stream) {
int root = opts.source_rank + opts.source_root;

VLOG(3) << "[bkcl_broadcast] "
VLOG(3) << "bkcl_broadcast "
<< "sendbuff: " << tensor_tmp.data()
<< ", recvbuff: " << out_tensor->data()
<< ", count: " << tensor_tmp.numel() << ", datatype: "
Expand Down Expand Up @@ -397,7 +397,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllGather(
phi::AllocationType::XPU);
return Collective(
[&](phi::distributed::BKCLCommContext* comm_context, XPUStream stream) {
VLOG(3) << "bkcl_all_gather"
VLOG(3) << "bkcl_all_gather "
<< "sendbuff: " << in_tensor_maybe_partial.data()
<< ", recvbuff: " << out_tensor->data()
<< ", count: " << in_tensor_maybe_partial.numel()
Expand Down Expand Up @@ -427,7 +427,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Reduce(
paddle::experimental::CheckAndTrans2NewContiguousTensor(in_tensor);
return Collective(
[&](phi::distributed::BKCLCommContext* comm_context, XPUStream stream) {
VLOG(3) << "[bkcl_reduce] "
VLOG(3) << "bkcl_reduce "
<< "sendbuff: " << tensor_tmp.data()
<< ", recvbuff: " << out_tensor->data()
<< ", count: " << tensor_tmp.numel() << ", datatype: "
Expand Down Expand Up @@ -461,7 +461,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::ReduceScatter(
paddle::experimental::CheckAndTrans2NewContiguousTensor(in_tensor);
return Collective(
[&](phi::distributed::BKCLCommContext* comm_context, XPUStream stream) {
VLOG(3) << "[bkcl_reduce_scatter] "
VLOG(3) << "bkcl_reduce_scatter "
<< "sendbuff: " << tensor_tmp.data()
<< ", recvbuff: " << out_tensor->data()
<< ", count: " << tensor_tmp.numel() << ", datatype: "
Expand Down
91 changes: 67 additions & 24 deletions paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -489,8 +489,9 @@ void Conv2dXPUFusePass::CreateTheReplicatedWeights(
true,
platform::errors::InvalidArgument("conv node ptr can not be null"));
auto conv_filter_name = conv->Op()->Input("Filter")[0];
std::string replicated_filter_name =
conv_filter_name + "_copy_" + std::to_string(conv->id());
std::string replicated_filter_name = conv_filter_name + "_copy_" +
std::to_string(block->ID()) + "_" +
std::to_string(conv->id());
auto* replicated_filter_var = scope->FindVar(replicated_filter_name);
if (replicated_filter_var == nullptr) {
auto* filter_tensor =
Expand Down Expand Up @@ -536,8 +537,9 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias(
auto conv_filter_name = conv->Op()->Input("Filter")[0];
Node* conv_filter = FindNodeWithName(graph, conv_filter_name);
CreateTheReplicatedWeights(graph, scope, block, nodes_map);
std::string replicated_filter_name =
conv_filter_name + "_copy_" + std::to_string(conv->id());
std::string replicated_filter_name = conv_filter_name + "_copy_" +
std::to_string(block->ID()) + "_" +
std::to_string(conv->id());
auto* conv_filter_replicated_node =
FindNodeWithName(graph, replicated_filter_name);
auto* filter_t =
Expand Down Expand Up @@ -718,28 +720,69 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias(
Node* filter_intx = nullptr;
Node* filter_max = nullptr;
Node* scale_max = nullptr;
bool per_channel_quant = false;

std::map<std::string, int> default_type;
default_type.insert(std::make_pair("conv2d", -1));
auto quant_post_type =
Has("quant_post_dynamic_weight_methods")
? Get<std::map<std::string, int>>("quant_post_dynamic_weight_methods")
: default_type;

for (auto it = quant_post_type.begin(); it != quant_post_type.end(); ++it) {
VLOG(5) << "Key:" << it->first;
VLOG(5) << "Value:" << it->second;
}

if (op_weights_precision != "int8") {
PrepareWeight<float, int16_t>(graph,
scope,
block,
conv_filter_replicated_node,
&filter_intx,
&filter_max,
&scale_max,
false,
weight_scale,
per_channel_quant);
if (quant_post_type.find("conv2d") != quant_post_type.end() &&
quant_post_type.find("conv2d")->second == 2 ||
quant_post_type.find("conv2d") != quant_post_type.end() &&
quant_post_type.find("conv2d")->second == -1) {
VLOG(5) << "Use int16 per-tensor weight";
PrepareWeight<float, int16_t>(graph,
scope,
block,
conv_filter_replicated_node,
&filter_intx,
&filter_max,
&scale_max,
false,
weight_scale,
false);
} else if (quant_post_type.find("conv2d") != quant_post_type.end() &&
quant_post_type.find("conv2d")->second == 3) {
VLOG(5) << "Use int16 per-channel weight";
PrepareWeight<float, int16_t>(graph,
scope,
block,
conv_filter_replicated_node,
&filter_intx,
&filter_max,
&scale_max,
false,
weight_scale,
true);
} else {
VLOG(5) << "Unsupported type weight by non-int8!";
}

} else {
PrepareWeight<int8_t, int8_t>(graph,
scope,
block,
conv_filter_replicated_node,
&filter_intx,
&filter_max,
&scale_max,
false,
weight_scale);
if (quant_post_type.find("conv2d") != quant_post_type.end() &&
quant_post_type.find("conv2d")->second == 0 ||
quant_post_type.find("conv2d") != quant_post_type.end() &&
quant_post_type.find("conv2d")->second == 1) {
PrepareWeight<int8_t, int8_t>(graph,
scope,
block,
conv_filter_replicated_node,
&filter_intx,
&filter_max,
&scale_max,
false,
weight_scale);
} else {
VLOG(5) << "Unsupported type weight!";
}
}

(*fusion_nodes_map)["filter"] = filter_intx;
Expand Down
106 changes: 80 additions & 26 deletions paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -299,8 +299,9 @@ void FcXPUFusePass::CreateTheReplicatedWeights(
true,
platform::errors::InvalidArgument("mul node ptr can not be null"));
auto mul_w_name = mul->Op()->Input("Y")[0];
std::string replicated_w_name =
mul_w_name + "_copy_" + std::to_string(mul->id());
std::string replicated_w_name = mul_w_name + "_copy_" +
std::to_string(block->ID()) + "_" +
std::to_string(mul->id());
auto* replicated_w_var = scope->FindVar(replicated_w_name);
if (replicated_w_var == nullptr) {
auto* filter_tensor =
Expand Down Expand Up @@ -395,8 +396,9 @@ void FcXPUFusePass::CreateFusionWeightsAndBias(
auto mul_w_name = mul->Op()->Input("Y")[0];
Node* mul_w = FindNodeWithName(graph, mul_w_name);
CreateTheReplicatedWeights(graph, scope, block, nodes_map);
std::string replicated_w_name =
mul_w_name + "_copy_" + std::to_string(mul->id());
std::string replicated_w_name = mul_w_name + "_copy_" +
std::to_string(block->ID()) + "_" +
std::to_string(mul->id());
auto* mul_w_replicated_node = FindNodeWithName(graph, replicated_w_name);
// Transform the filter from fp16 to fp32
auto* filter_t = scope->FindVar(mul_w_replicated_node->Name())
Expand Down Expand Up @@ -527,31 +529,83 @@ void FcXPUFusePass::CreateFusionWeightsAndBias(
Node* filter_intx = nullptr;
Node* filter_max = nullptr;
Node* scale_max = nullptr;
bool per_channel_quant =
std::getenv("FLAGS_fc_gemm_use_per_channel") == nullptr ? false : true;

std::map<std::string, int> default_type;
default_type.insert(std::make_pair("fc", -1));
auto quant_post_type =
Has("quant_post_dynamic_weight_methods")
? Get<std::map<std::string, int>>("quant_post_dynamic_weight_methods")
: default_type;

for (auto it = quant_post_type.begin(); it != quant_post_type.end(); ++it) {
VLOG(5) << "Key:" << it->first;
VLOG(5) << "Value:" << it->second;
}

if (op_weights_precision != "int8") {
PrepareWeight<float, int16_t>(graph,
scope,
block,
mul_w_replicated_node,
&filter_intx,
&filter_max,
&scale_max,
!transpose_w,
weight_scale,
per_channel_quant);
if (quant_post_type.find("fc") != quant_post_type.end() &&
quant_post_type.find("fc")->second == 2 ||
quant_post_type.find("fc") != quant_post_type.end() &&
quant_post_type.find("fc")->second == -1) {
VLOG(5) << "Use int16 per-tensor weight";
PrepareWeight<float, int16_t>(graph,
scope,
block,
mul_w_replicated_node,
&filter_intx,
&filter_max,
&scale_max,
!transpose_w,
weight_scale,
false);
} else if (quant_post_type.find("fc") != quant_post_type.end() &&
quant_post_type.find("fc")->second == 3) {
VLOG(5) << "Use int16 per-channel weight";
PrepareWeight<float, int16_t>(graph,
scope,
block,
mul_w_replicated_node,
&filter_intx,
&filter_max,
&scale_max,
!transpose_w,
weight_scale,
true);
} else {
VLOG(5) << "Unsupported type weight by non-int8!";
}
} else {
PrepareWeight<int8_t, int8_t>(graph,
scope,
block,
mul_w_replicated_node,
&filter_intx,
&filter_max,
&scale_max,
!transpose_w,
weight_scale,
per_channel_quant);
if (quant_post_type.find("fc") != quant_post_type.end() &&
quant_post_type.find("fc")->second == 0) {
VLOG(5) << "Use int8 per-tensor weight";
PrepareWeight<int8_t, int8_t>(graph,
scope,
block,
mul_w_replicated_node,
&filter_intx,
&filter_max,
&scale_max,
!transpose_w,
weight_scale,
false);
} else if (quant_post_type.find("fc") != quant_post_type.end() &&
quant_post_type.find("fc")->second == 1) {
VLOG(5) << "Use int8 per-channel weight";
PrepareWeight<int8_t, int8_t>(graph,
scope,
block,
mul_w_replicated_node,
&filter_intx,
&filter_max,
&scale_max,
!transpose_w,
weight_scale,
true);
} else {
VLOG(5) << "Unsupported type weight!";
}
}

(*fusion_nodes_map)["w"] = filter_intx;
(*fusion_nodes_map)["w_max"] = filter_max;
(*fusion_nodes_map)["scale_max"] = scale_max;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,21 +69,21 @@ CinnJitInstruction::CinnJitInstruction(
size_t id,
const platform::Place& place,
::pir::Operation* op,
const ValueExecutionInfo& value_exec_info)
const ValueExecutionInfo* value_exec_info)
: InstructionBase(id, place) {
auto jit_kernel_op = op->dyn_cast<cinn::dialect::JitKernelOp>();
fn_ptr_impl_ = std::make_shared<FnPtrImpl>(jit_kernel_op.cuda_jit_info());
op_ = op;

place_ = place;

InitInputsOutputsIds(op, value_exec_info);
InitInputsOutputsIds(op, *value_exec_info);

for (size_t i = 0; i < op->num_operands(); ++i) {
auto in = op->operand_source(i);

auto var_name = value_exec_info.GetVarName(in);
auto tensor = value_exec_info.GetScope()
auto var_name = value_exec_info->GetVarName(in);
auto tensor = value_exec_info->GetScope()
->FindVar(var_name)
->GetMutable<phi::DenseTensor>();

Expand All @@ -94,9 +94,9 @@ CinnJitInstruction::CinnJitInstruction(

for (size_t i = 0; i < op->num_results(); ++i) {
pir::Value result = op->result(i);
auto var_name = value_exec_info.GetVarName(result);
auto var_name = value_exec_info->GetVarName(result);

auto tensor = value_exec_info.GetScope()
auto tensor = value_exec_info->GetScope()
->Var(var_name)
->GetMutable<phi::DenseTensor>();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class CinnJitInstruction : public InstructionBase {
CinnJitInstruction(size_t id,
const platform::Place& place,
::pir::Operation* op,
const ValueExecutionInfo& value_exec_info);
const ValueExecutionInfo* value_exec_info);

// TODO(Aurelius84): Only the core interface is implemented so far; GC and
// Event logic still need to be implemented.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ std::unordered_set<pir::Value> GetInternalOutputs(pir::Block* block) {
for (size_t arg_id = 0; arg_id < block->args_size(); ++arg_id) {
inner_outputs.insert(block->arg(arg_id));
}
for (auto& op : (*block)) {
for (auto& op : *block) {
VLOG(8) << "GetInternalOutputs of " << op.name();
if (op.num_regions()) {
for (size_t i = 0; i < op.num_regions(); ++i) {
Expand All @@ -248,7 +248,7 @@ std::unordered_set<pir::Value> GetInternalOutputs(pir::Block* block) {

std::unordered_set<pir::Value> GetInternalInputs(pir::Block* block) {
std::unordered_set<pir::Value> inner_inputs;
for (auto& op : (*block)) {
for (auto& op : *block) {
VLOG(8) << "GetInternalInputs of " << op.name();
if (op.num_regions()) {
for (size_t i = 0; i < op.num_regions(); ++i) {
Expand Down Expand Up @@ -299,7 +299,7 @@ std::vector<pir::Value> GetExternalInputs(

std::unordered_set<pir::Value> GetTuplePushContainer(pir::Block* block) {
std::unordered_set<pir::Value> inner_outputs;
for (auto& op : (*block)) {
for (auto& op : *block) {
VLOG(8) << "GetTuplePushContainer of " << op.name();
if (op.num_regions()) {
for (size_t i = 0; i < op.num_regions(); ++i) {
Expand Down
Loading

0 comments on commit 7340d73

Please sign in to comment.