fix

PaddlePaddle · Nov 30, 2023 · 7340d73 · 7340d73
2 parents 48be391 + 6579c90
commit 7340d73
Show file tree

Hide file tree

Showing 109 changed files with 3,991 additions and 1,526 deletions.
diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc
@@ -502,16 +502,6 @@ std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
     }
   }
 
-  // add fake symbolic args for test
-  if (FLAGS_cinn_bucket_compile) {
-    group_func_args.emplace_back(ir::_Var_::Make("fake_symbol1", Int(32)),
-                                 ir::Argument::IO::kOutput);
-    group_func_args.emplace_back(ir::_Var_::Make("fake_symbol2", Int(32)),
-                                 ir::Argument::IO::kOutput);
-    group->output_names.push_back("fake_symbol1");
-    group->output_names.push_back("fake_symbol2");
-  }
-
 #ifdef CINN_WITH_CUDA
   optim::OptimizeExprGPU(&(func_body));
 #endif

diff --git a/paddle/fluid/distributed/collective/bkcl_tools.cc b/paddle/fluid/distributed/collective/bkcl_tools.cc
@@ -54,7 +54,7 @@ std::string BKCLDTypeToString(BKCLDataType dtype) {
 
 #undef PD_BKCL_DTYPE_TO_STR
   PADDLE_THROW(phi::errors::InvalidArgument(
-      "This datatype %d in nccl is not supported.", static_cast<int>(dtype)));
+      "This datatype %d in bkcl is not supported.", static_cast<int>(dtype)));
 }
 
 std::string BKCLRedTypeToString(BKCLOp op) {

diff --git a/paddle/fluid/distributed/collective/process_group_bkcl.cc b/paddle/fluid/distributed/collective/process_group_bkcl.cc
@@ -110,7 +110,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Recv(
       [&](phi::distributed::BKCLCommContext* comm_context,
           XPUStream stream,
           int rank_in_group) {
-        VLOG(3) << "[bkcl_recv] "
+        VLOG(3) << "bkcl_recv "
                 << "recvbuff: " << tensor->data()
                 << ", count: " << tensor->numel() << ", datatype: "
                 << BKCLDTypeToString(phi::ToBKCLDataType(tensor->dtype()))
@@ -146,7 +146,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Send(
       [&](phi::distributed::BKCLCommContext* comm_context,
           XPUStream stream,
           int rank_in_group) {
-        VLOG(3) << "[bkcl_send] "
+        VLOG(3) << "bkcl_send "
                 << "sendbuff: " << tensor_maybe_partial.data()
                 << ", count: " << tensor_maybe_partial.numel() << ", datatype: "
                 << BKCLDTypeToString(
@@ -360,7 +360,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
       [&](phi::distributed::BKCLCommContext* comm_context, XPUStream stream) {
         int root = opts.source_rank + opts.source_root;
 
-        VLOG(3) << "[bkcl_broadcast] "
+        VLOG(3) << "bkcl_broadcast "
                 << "sendbuff: " << tensor_tmp.data()
                 << ", recvbuff: " << out_tensor->data()
                 << ", count: " << tensor_tmp.numel() << ", datatype: "
@@ -397,7 +397,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllGather(
                                                      phi::AllocationType::XPU);
   return Collective(
       [&](phi::distributed::BKCLCommContext* comm_context, XPUStream stream) {
-        VLOG(3) << "bkcl_all_gather"
+        VLOG(3) << "bkcl_all_gather "
                 << "sendbuff: " << in_tensor_maybe_partial.data()
                 << ", recvbuff: " << out_tensor->data()
                 << ", count: " << in_tensor_maybe_partial.numel()
@@ -427,7 +427,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Reduce(
       paddle::experimental::CheckAndTrans2NewContiguousTensor(in_tensor);
   return Collective(
       [&](phi::distributed::BKCLCommContext* comm_context, XPUStream stream) {
-        VLOG(3) << "[bkcl_reduce] "
+        VLOG(3) << "bkcl_reduce "
                 << "sendbuff: " << tensor_tmp.data()
                 << ", recvbuff: " << out_tensor->data()
                 << ", count: " << tensor_tmp.numel() << ", datatype: "
@@ -461,7 +461,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::ReduceScatter(
       paddle::experimental::CheckAndTrans2NewContiguousTensor(in_tensor);
   return Collective(
       [&](phi::distributed::BKCLCommContext* comm_context, XPUStream stream) {
-        VLOG(3) << "[bkcl_reduce_scatter] "
+        VLOG(3) << "bkcl_reduce_scatter "
                 << "sendbuff: " << tensor_tmp.data()
                 << ", recvbuff: " << out_tensor->data()
                 << ", count: " << tensor_tmp.numel() << ", datatype: "

diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc
@@ -489,8 +489,9 @@ void Conv2dXPUFusePass::CreateTheReplicatedWeights(
       true,
       platform::errors::InvalidArgument("conv node ptr can not be null"));
   auto conv_filter_name = conv->Op()->Input("Filter")[0];
-  std::string replicated_filter_name =
-      conv_filter_name + "_copy_" + std::to_string(conv->id());
+  std::string replicated_filter_name = conv_filter_name + "_copy_" +
+                                       std::to_string(block->ID()) + "_" +
+                                       std::to_string(conv->id());
   auto* replicated_filter_var = scope->FindVar(replicated_filter_name);
   if (replicated_filter_var == nullptr) {
     auto* filter_tensor =
@@ -536,8 +537,9 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias(
   auto conv_filter_name = conv->Op()->Input("Filter")[0];
   Node* conv_filter = FindNodeWithName(graph, conv_filter_name);
   CreateTheReplicatedWeights(graph, scope, block, nodes_map);
-  std::string replicated_filter_name =
-      conv_filter_name + "_copy_" + std::to_string(conv->id());
+  std::string replicated_filter_name = conv_filter_name + "_copy_" +
+                                       std::to_string(block->ID()) + "_" +
+                                       std::to_string(conv->id());
   auto* conv_filter_replicated_node =
       FindNodeWithName(graph, replicated_filter_name);
   auto* filter_t =
@@ -718,28 +720,69 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias(
   Node* filter_intx = nullptr;
   Node* filter_max = nullptr;
   Node* scale_max = nullptr;
-  bool per_channel_quant = false;
+
+  std::map<std::string, int> default_type;
+  default_type.insert(std::make_pair("conv2d", -1));
+  auto quant_post_type =
+      Has("quant_post_dynamic_weight_methods")
+          ? Get<std::map<std::string, int>>("quant_post_dynamic_weight_methods")
+          : default_type;
+
+  for (auto it = quant_post_type.begin(); it != quant_post_type.end(); ++it) {
+    VLOG(5) << "Key:" << it->first;
+    VLOG(5) << "Value:" << it->second;
+  }
+
   if (op_weights_precision != "int8") {
-    PrepareWeight<float, int16_t>(graph,
-                                  scope,
-                                  block,
-                                  conv_filter_replicated_node,
-                                  &filter_intx,
-                                  &filter_max,
-                                  &scale_max,
-                                  false,
-                                  weight_scale,
-                                  per_channel_quant);
+    if (quant_post_type.find("conv2d") != quant_post_type.end() &&
+            quant_post_type.find("conv2d")->second == 2 ||
+        quant_post_type.find("conv2d") != quant_post_type.end() &&
+            quant_post_type.find("conv2d")->second == -1) {
+      VLOG(5) << "Use int16 per-tensor weight";
+      PrepareWeight<float, int16_t>(graph,
+                                    scope,
+                                    block,
+                                    conv_filter_replicated_node,
+                                    &filter_intx,
+                                    &filter_max,
+                                    &scale_max,
+                                    false,
+                                    weight_scale,
+                                    false);
+    } else if (quant_post_type.find("conv2d") != quant_post_type.end() &&
+               quant_post_type.find("conv2d")->second == 3) {
+      VLOG(5) << "Use int16 per-channel weight";
+      PrepareWeight<float, int16_t>(graph,
+                                    scope,
+                                    block,
+                                    conv_filter_replicated_node,
+                                    &filter_intx,
+                                    &filter_max,
+                                    &scale_max,
+                                    false,
+                                    weight_scale,
+                                    true);
+    } else {
+      VLOG(5) << "Unsupported type weight by non-int8!";
+    }
+
   } else {
-    PrepareWeight<int8_t, int8_t>(graph,
-                                  scope,
-                                  block,
-                                  conv_filter_replicated_node,
-                                  &filter_intx,
-                                  &filter_max,
-                                  &scale_max,
-                                  false,
-                                  weight_scale);
+    if (quant_post_type.find("conv2d") != quant_post_type.end() &&
+            quant_post_type.find("conv2d")->second == 0 ||
+        quant_post_type.find("conv2d") != quant_post_type.end() &&
+            quant_post_type.find("conv2d")->second == 1) {
+      PrepareWeight<int8_t, int8_t>(graph,
+                                    scope,
+                                    block,
+                                    conv_filter_replicated_node,
+                                    &filter_intx,
+                                    &filter_max,
+                                    &scale_max,
+                                    false,
+                                    weight_scale);
+    } else {
+      VLOG(5) << "Unsupported type weight!";
+    }
   }
 
   (*fusion_nodes_map)["filter"] = filter_intx;

diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
@@ -299,8 +299,9 @@ void FcXPUFusePass::CreateTheReplicatedWeights(
       true,
       platform::errors::InvalidArgument("mul node ptr can not be null"));
   auto mul_w_name = mul->Op()->Input("Y")[0];
-  std::string replicated_w_name =
-      mul_w_name + "_copy_" + std::to_string(mul->id());
+  std::string replicated_w_name = mul_w_name + "_copy_" +
+                                  std::to_string(block->ID()) + "_" +
+                                  std::to_string(mul->id());
   auto* replicated_w_var = scope->FindVar(replicated_w_name);
   if (replicated_w_var == nullptr) {
     auto* filter_tensor =
@@ -395,8 +396,9 @@ void FcXPUFusePass::CreateFusionWeightsAndBias(
   auto mul_w_name = mul->Op()->Input("Y")[0];
   Node* mul_w = FindNodeWithName(graph, mul_w_name);
   CreateTheReplicatedWeights(graph, scope, block, nodes_map);
-  std::string replicated_w_name =
-      mul_w_name + "_copy_" + std::to_string(mul->id());
+  std::string replicated_w_name = mul_w_name + "_copy_" +
+                                  std::to_string(block->ID()) + "_" +
+                                  std::to_string(mul->id());
   auto* mul_w_replicated_node = FindNodeWithName(graph, replicated_w_name);
   // transfilter fp16 --> fp32
   auto* filter_t = scope->FindVar(mul_w_replicated_node->Name())
@@ -527,31 +529,83 @@ void FcXPUFusePass::CreateFusionWeightsAndBias(
   Node* filter_intx = nullptr;
   Node* filter_max = nullptr;
   Node* scale_max = nullptr;
-  bool per_channel_quant =
-      std::getenv("FLAGS_fc_gemm_use_per_channel") == nullptr ? false : true;
+
+  std::map<std::string, int> default_type;
+  default_type.insert(std::make_pair("fc", -1));
+  auto quant_post_type =
+      Has("quant_post_dynamic_weight_methods")
+          ? Get<std::map<std::string, int>>("quant_post_dynamic_weight_methods")
+          : default_type;
+
+  for (auto it = quant_post_type.begin(); it != quant_post_type.end(); ++it) {
+    VLOG(5) << "Key:" << it->first;
+    VLOG(5) << "Value:" << it->second;
+  }
+
   if (op_weights_precision != "int8") {
-    PrepareWeight<float, int16_t>(graph,
-                                  scope,
-                                  block,
-                                  mul_w_replicated_node,
-                                  &filter_intx,
-                                  &filter_max,
-                                  &scale_max,
-                                  !transpose_w,
-                                  weight_scale,
-                                  per_channel_quant);
+    if (quant_post_type.find("fc") != quant_post_type.end() &&
+            quant_post_type.find("fc")->second == 2 ||
+        quant_post_type.find("fc") != quant_post_type.end() &&
+            quant_post_type.find("fc")->second == -1) {
+      VLOG(5) << "Use int16 per-tensor weight";
+      PrepareWeight<float, int16_t>(graph,
+                                    scope,
+                                    block,
+                                    mul_w_replicated_node,
+                                    &filter_intx,
+                                    &filter_max,
+                                    &scale_max,
+                                    !transpose_w,
+                                    weight_scale,
+                                    false);
+    } else if (quant_post_type.find("fc") != quant_post_type.end() &&
+               quant_post_type.find("fc")->second == 3) {
+      VLOG(5) << "Use int16 per-channel weight";
+      PrepareWeight<float, int16_t>(graph,
+                                    scope,
+                                    block,
+                                    mul_w_replicated_node,
+                                    &filter_intx,
+                                    &filter_max,
+                                    &scale_max,
+                                    !transpose_w,
+                                    weight_scale,
+                                    true);
+    } else {
+      VLOG(5) << "Unsupported type weight by non-int8!";
+    }
   } else {
-    PrepareWeight<int8_t, int8_t>(graph,
-                                  scope,
-                                  block,
-                                  mul_w_replicated_node,
-                                  &filter_intx,
-                                  &filter_max,
-                                  &scale_max,
-                                  !transpose_w,
-                                  weight_scale,
-                                  per_channel_quant);
+    if (quant_post_type.find("fc") != quant_post_type.end() &&
+        quant_post_type.find("fc")->second == 0) {
+      VLOG(5) << "Use int8  per-tensor weight";
+      PrepareWeight<int8_t, int8_t>(graph,
+                                    scope,
+                                    block,
+                                    mul_w_replicated_node,
+                                    &filter_intx,
+                                    &filter_max,
+                                    &scale_max,
+                                    !transpose_w,
+                                    weight_scale,
+                                    false);
+    } else if (quant_post_type.find("fc") != quant_post_type.end() &&
+               quant_post_type.find("fc")->second == 1) {
+      VLOG(5) << "Use int8  per-channel weight";
+      PrepareWeight<int8_t, int8_t>(graph,
+                                    scope,
+                                    block,
+                                    mul_w_replicated_node,
+                                    &filter_intx,
+                                    &filter_max,
+                                    &scale_max,
+                                    !transpose_w,
+                                    weight_scale,
+                                    true);
+    } else {
+      VLOG(5) << "Unsupported type weight!";
+    }
   }
+
   (*fusion_nodes_map)["w"] = filter_intx;
   (*fusion_nodes_map)["w_max"] = filter_max;
   (*fusion_nodes_map)["scale_max"] = scale_max;

diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc
@@ -69,21 +69,21 @@ CinnJitInstruction::CinnJitInstruction(
     size_t id,
     const platform::Place& place,
     ::pir::Operation* op,
-    const ValueExecutionInfo& value_exec_info)
+    const ValueExecutionInfo* value_exec_info)
     : InstructionBase(id, place) {
   auto jit_kernel_op = op->dyn_cast<cinn::dialect::JitKernelOp>();
   fn_ptr_impl_ = std::make_shared<FnPtrImpl>(jit_kernel_op.cuda_jit_info());
   op_ = op;
 
   place_ = place;
 
-  InitInputsOutputsIds(op, value_exec_info);
+  InitInputsOutputsIds(op, *value_exec_info);
 
   for (size_t i = 0; i < op->num_operands(); ++i) {
     auto in = op->operand_source(i);
 
-    auto var_name = value_exec_info.GetVarName(in);
-    auto tensor = value_exec_info.GetScope()
+    auto var_name = value_exec_info->GetVarName(in);
+    auto tensor = value_exec_info->GetScope()
                       ->FindVar(var_name)
                       ->GetMutable<phi::DenseTensor>();
 
@@ -94,9 +94,9 @@ CinnJitInstruction::CinnJitInstruction(
 
   for (size_t i = 0; i < op->num_results(); ++i) {
     pir::Value result = op->result(i);
-    auto var_name = value_exec_info.GetVarName(result);
+    auto var_name = value_exec_info->GetVarName(result);
 
-    auto tensor = value_exec_info.GetScope()
+    auto tensor = value_exec_info->GetScope()
                       ->Var(var_name)
                       ->GetMutable<phi::DenseTensor>();
 

diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h
@@ -30,7 +30,7 @@ class CinnJitInstruction : public InstructionBase {
   CinnJitInstruction(size_t id,
                      const platform::Place& place,
                      ::pir::Operation* op,
-                     const ValueExecutionInfo& value_exec_info);
+                     const ValueExecutionInfo* value_exec_info);
 
   // TODO(Aurelius84): Only implement core interface and need implement GC and
   // Event logic.

diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc
@@ -228,7 +228,7 @@ std::unordered_set<pir::Value> GetInternalOutputs(pir::Block* block) {
   for (size_t arg_id = 0; arg_id < block->args_size(); ++arg_id) {
     inner_outputs.insert(block->arg(arg_id));
   }
-  for (auto& op : (*block)) {
+  for (auto& op : *block) {
     VLOG(8) << "GetInternalOutputs of " << op.name();
     if (op.num_regions()) {
       for (size_t i = 0; i < op.num_regions(); ++i) {
@@ -248,7 +248,7 @@ std::unordered_set<pir::Value> GetInternalOutputs(pir::Block* block) {
 
 std::unordered_set<pir::Value> GetInternalInputs(pir::Block* block) {
   std::unordered_set<pir::Value> inner_inputs;
-  for (auto& op : (*block)) {
+  for (auto& op : *block) {
     VLOG(8) << "GetInternalInputs of " << op.name();
     if (op.num_regions()) {
       for (size_t i = 0; i < op.num_regions(); ++i) {
@@ -299,7 +299,7 @@ std::vector<pir::Value> GetExternalInputs(
 
 std::unordered_set<pir::Value> GetTuplePushContainer(pir::Block* block) {
   std::unordered_set<pir::Value> inner_outputs;
-  for (auto& op : (*block)) {
+  for (auto& op : *block) {
     VLOG(8) << "GetTuplePushContainer of " << op.name();
     if (op.num_regions()) {
       for (size_t i = 0; i < op.num_regions(); ++i) {