diff --git a/.clang-tidy b/.clang-tidy
index 820a244182d230..c0d5b09ac1f168 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -83,7 +83,7 @@ clang-analyzer-cplusplus.InnerPointer,
 -clang-analyzer-nullability.NullableDereferenced,
 -clang-analyzer-nullability.NullablePassedToNonnull,
 -clang-analyzer-nullability.NullableReturnedFromNonnull,
--clang-analyzer-optin.cplusplus.UninitializedObject,
+clang-analyzer-optin.cplusplus.UninitializedObject,
 -clang-analyzer-optin.cplusplus.VirtualCall,
 -clang-analyzer-optin.mpi.MPI-Checker,
 -clang-analyzer-optin.osx.OSObjectCStyleCast,
@@ -162,7 +162,7 @@ cppcoreguidelines-c-copy-assignment-signature,
 -cppcoreguidelines-pro-type-member-init,
 -cppcoreguidelines-slicing,
 -hicpp-avoid-goto,
--hicpp-exception-baseclass,
+hicpp-exception-baseclass,
 misc-unused-alias-decls,
 misc-unused-using-decls,
 modernize-avoid-bind,
@@ -170,7 +170,7 @@ modernize-avoid-c-arrays,
 -modernize-deprecated-headers,
 -modernize-deprecated-ios-base-aliases,
 modernize-loop-convert,
--modernize-make-shared,
+modernize-make-shared,
 modernize-make-unique,
 -modernize-pass-by-value,
 modernize-raw-string-literal,
diff --git a/.gitignore b/.gitignore
index 98a3d18b8a2a60..cc68215a8d2dc8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -98,3 +98,4 @@ paddle/phi/kernels/fusion/cutlass/conv2d/generated/*
 python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py
 paddle/fluid/ir_adaptor/translator/op_compat_info.cc
 paddle/fluid/pybind/static_op_function.*
+paddle/fluid/pybind/ops_api.cc
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 34e748c794f690..318c9df489311b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -254,6 +254,7 @@ option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
 option(ON_INFER "Turn on inference optimization and inference-lib generation"
        ON)
 option(WITH_CPP_DIST "Install PaddlePaddle C++ distribution" OFF)
+option(WITH_GFLAGS "Compile PaddlePaddle with gflags support" OFF)
 ################################ Internal Configurations #######################################
 option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF)
 option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools"
diff --git a/cmake/cinn.cmake b/cmake/cinn.cmake
index de13f71526c8a3..ca25a7d5d30a56 100644
--- a/cmake/cinn.cmake
+++ b/cmake/cinn.cmake
@@ -183,6 +183,11 @@ if(WITH_MKL)
   endif()
 endif()

+if(NOT WITH_GFLAGS)
+  target_link_libraries(cinnapi gflags)
+  add_dependencies(cinnapi gflags)
+endif()
+
 if(WITH_GPU)
   target_link_libraries(
     cinnapi
@@ -237,6 +242,11 @@ function(gen_cinncore LINKTYPE)
     endif()
   endif()

+  if(NOT WITH_GFLAGS)
+    target_link_libraries(${CINNCORE_TARGET} gflags)
+    add_dependencies(${CINNCORE_TARGET} gflags)
+  endif()
+
   if(WITH_GPU)
     target_link_libraries(
       ${CINNCORE_TARGET}
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index dc661fce388fe1..4d0b04209c059b 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -201,6 +201,10 @@ if(WITH_DISTRIBUTE)
   add_definitions(-DPADDLE_WITH_DISTRIBUTE)
 endif()

+if(WITH_GFLAGS)
+  add_definitions(-DPADDLE_WITH_GFLAGS)
+endif()
+
 if(WITH_PSCORE)
   add_definitions(-DPADDLE_WITH_PSCORE)
 endif()
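The configure.cmake hunk above only wires up the PADDLE_WITH_GFLAGS define; the flag macros themselves come from paddle/utils/flags.h, which the later hunks include everywhere. A minimal sketch of how such a header can dispatch between gflags and the bundled paddle_flags implementation, assuming this two-way layering (illustrative, not the verbatim Paddle header):

    // Sketch of a paddle/utils/flags.h-style dispatch (assumed layering):
    #ifdef PADDLE_WITH_GFLAGS
    #include "gflags/gflags.h"
    // Forward the PD_* spellings used throughout this patch to gflags.
    #define PD_DEFINE_bool(name, val, txt) DEFINE_bool(name, val, txt)
    #define PD_DECLARE_bool(name) DECLARE_bool(name)
    #else
    // Self-contained implementation shipped with Paddle; this is the header
    // that inference_lib.cmake below installs when WITH_GFLAGS=OFF.
    #include "paddle/utils/flags_native.h"
    #endif

Either way, downstream code spells flags identically, which is what makes the mechanical renames in the rest of this diff sufficient.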
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
index ff33e142addbd6..3c9f2b69620483 100755
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -91,3 +91,16 @@
 add_dependencies(brpc extern_brpc)
 add_definitions(-DBRPC_WITH_GLOG)
 list(APPEND external_project_dependencies brpc)
+
+set(EXTERNAL_BRPC_DEPS
+    brpc
+    protobuf
+    ssl
+    crypto
+    leveldb
+    glog
+    snappy)
+
+if(NOT WITH_GFLAGS)
+  set(EXTERNAL_BRPC_DEPS ${EXTERNAL_BRPC_DEPS} gflags)
+endif()
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 06827732e5966a..75436783c7ede2 100755
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -102,3 +102,14 @@ if(WIN32)
     set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib)
   endif()
 endif()
+
+# We have implemented a custom flags tool paddle_flags to replace gflags.
+# User can also choose to use gflags by setting WITH_GFLAGS=ON. But when
+# using paddle_flags, gflags is also needed for other third party libraries
+# including glog and brpc. So we can not remove gflags completely.
+set(flags_dep)
+if(WITH_GFLAGS)
+  list(APPEND flags_dep gflags)
+else()
+  list(APPEND flags_dep paddle_flags)
+endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index c97a68a0175e1a..13fce9613650f1 100755
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -336,11 +336,22 @@
 copy(
   inference_lib_dist
   SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/flat_hash_map.h
   DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/utils/)
+copy(
+  inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/flags.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/utils/)
 copy(
   inference_lib_dist
   SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h
   DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/)
+if(NOT WITH_GFLAGS)
+  copy(
+    inference_lib_dist
+    SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/flags_native.h
+    DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/paddle/utils/)
+endif()
+
 # the include path of phi needs to be changed to adapt to inference api path
 add_custom_command(
   TARGET inference_lib_dist
diff --git a/paddle/cinn/hlir/dialect/.gitignore b/paddle/cinn/hlir/dialect/.gitignore
index a21ba08d95acf3..3e41ce4c67822f 100644
--- a/paddle/cinn/hlir/dialect/.gitignore
+++ b/paddle/cinn/hlir/dialect/.gitignore
@@ -1 +1,2 @@
 generated/**
+generated/*
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
index cf4046950964a2..ba468269b8230b 100644
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
@@ -19,7 +19,6 @@
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/elementwise_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h"
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h"
@@ -32,9 +31,6 @@ namespace paddle {
 namespace distributed {
 namespace auto_parallel {

-// matmul rule
-REGISTER_SPMD_RULE(matmul, MatmulSPMDRule);
-
 // reduction rules
 REGISTER_SPMD_RULE(all, ReductionSPMDRule);
 REGISTER_SPMD_RULE(amax, ReductionSPMDRule);
diff --git a/paddle/fluid/distributed/collective/process_group_custom.cc b/paddle/fluid/distributed/collective/process_group_custom.cc
index 6d5c30da3133b5..a905fcc095599f 100644
--- a/paddle/fluid/distributed/collective/process_group_custom.cc
+++ b/paddle/fluid/distributed/collective/process_group_custom.cc
@@ -27,7 +27,7 @@
 constexpr int64_t kWaitBlockTImeout = 10;

-DECLARE_bool(use_stream_safe_cuda_allocator);
+PD_DECLARE_bool(use_stream_safe_cuda_allocator);

 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/collective/process_group_nccl.cc b/paddle/fluid/distributed/collective/process_group_nccl.cc
index a8539278d2f280..7ffe00b8cd8244 100644
--- a/paddle/fluid/distributed/collective/process_group_nccl.cc
+++ b/paddle/fluid/distributed/collective/process_group_nccl.cc
@@ -29,7 +29,7 @@
 #include "paddle/phi/core/distributed/comm_context_manager.h"

 PHI_DECLARE_bool(nccl_blocking_wait);
-DECLARE_bool(use_stream_safe_cuda_allocator);
+PD_DECLARE_bool(use_stream_safe_cuda_allocator);

 // set this flag to `true` and recompile to enable dynamic checks
 constexpr bool FLAGS_enable_nccl_dynamic_check = false;
diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc
index c3262b8db8d568..378668499c90b6 100644
--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
@@ -18,7 +18,7 @@
 #include "paddle/phi/backends/device_manager.h"
 #include "paddle/phi/core/flags.h"

-DECLARE_bool(use_stream_safe_cuda_allocator);
+PD_DECLARE_bool(use_stream_safe_cuda_allocator);
 PHI_DECLARE_string(allocator_strategy);

 namespace paddle {
diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
index eea257b5ddf4fc..9c28205520129c 100755
--- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
+++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
@@ -7,16 +7,7 @@ proto_library(interceptor_message_proto SRCS interceptor_message.proto)
 if(WITH_ARM_BRPC)
   set(BRPC_DEPS arm_brpc snappy phi glog)
 elseif(WITH_DISTRIBUTE AND NOT WITH_PSLIB)
-  set(BRPC_DEPS
-      brpc
-      ssl
-      crypto
-      protobuf
-      zlib
-      leveldb
-      snappy
-      phi
-      glog)
+  set(BRPC_DEPS ${EXTERNAL_BRPC_DEPS} zlib phi)
 else()
   set(BRPC_DEPS "")
 endif()
diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc
index 1dc29493af9d8b..bb128c1287c9cf 100644
--- a/paddle/fluid/distributed/fleet_executor/carrier.cc
+++ b/paddle/fluid/distributed/fleet_executor/carrier.cc
@@ -17,7 +17,6 @@
 #include 
 #include 

-#include "gflags/gflags.h"
 #include "paddle/fluid/distributed/fleet_executor/global.h"
 #include "paddle/fluid/distributed/fleet_executor/interceptor.h"
 #include "paddle/fluid/distributed/fleet_executor/message_bus.h"
@@ -29,6 +28,7 @@
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/flags.h"
+#include "paddle/utils/flags.h"

 PADDLE_DEFINE_EXPORTED_bool(
     fleet_executor_with_standalone,
     false,
diff --git a/paddle/fluid/distributed/ps/service/CMakeLists.txt b/paddle/fluid/distributed/ps/service/CMakeLists.txt
index 585dd111bf7d11..c23f26c6352180 100755
--- a/paddle/fluid/distributed/ps/service/CMakeLists.txt
+++ b/paddle/fluid/distributed/ps/service/CMakeLists.txt
@@ -3,34 +3,11 @@
 set_source_files_properties(${BRPC_SRCS})

 if(WITH_HETERPS)
-  set(BRPC_DEPS
-      brpc
-      ssl
-      crypto
-      protobuf
-      phi
-      glog
-      zlib
-      leveldb
-      snappy
-      glog
-      device_context
-      rocksdb)
+  set(BRPC_DEPS ${EXTERNAL_BRPC_DEPS} phi zlib device_context rocksdb)
 else()
-  set(BRPC_DEPS
-      brpc
-      ssl
-      crypto
-      protobuf
-      phi
-      glog
-      zlib
-      leveldb
-      snappy
-      glog
-      device_context)
+  set(BRPC_DEPS ${EXTERNAL_BRPC_DEPS} phi zlib device_context)
 endif()
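The source hunks above and below are a mechanical rename: gflags' DEFINE_*/DECLARE_* become PD_DEFINE_*/PD_DECLARE_*, while the generated FLAGS_<name> symbols keep their spelling. A minimal two-file sketch of the pairing, using a hypothetical flag name for illustration:

    // owner.cc -- defines and owns the flag (hypothetical name):
    #include "paddle/utils/flags.h"
    PD_DEFINE_int32(demo_rpc_timeout_ms, 500, "demo rpc timeout in ms");

    // user.cc -- any other translation unit referencing the same flag:
    #include "paddle/utils/flags.h"
    PD_DECLARE_int32(demo_rpc_timeout_ms);
    int EffectiveTimeoutMs() { return FLAGS_demo_rpc_timeout_ms; }

Because the FLAGS_<name> accessor is unchanged, call sites such as options.timeout_ms = FLAGS_pserver_timeout_ms need no edits; only definition and declaration sites are touched, which is exactly what the brpc_ps_client.cc hunks below show.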
diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
index 93fe8c849be691..9ad8768e0927d5 100644
--- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
+++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
@@ -34,49 +34,53 @@ class Variable;
 namespace paddle {
 namespace distributed {

-DEFINE_int32(pserver_push_dense_merge_limit,
-             12,
-             "limit max push_dense local merge requests");
+PD_DEFINE_int32(pserver_push_dense_merge_limit,
+                12,
+                "limit max push_dense local merge requests");

-DEFINE_int32(pserver_push_sparse_merge_limit,
-             12,
-             "limit max push_sparse local merge requests");
+PD_DEFINE_int32(pserver_push_sparse_merge_limit,
+                12,
+                "limit max push_sparse local merge requests");

-DEFINE_int32(pserver_pull_dense_limit,
-             12,
-             "limit max push_sparse local merge requests");
+PD_DEFINE_int32(pserver_pull_dense_limit,
+                12,
+                "limit max push_sparse local merge requests");

-DEFINE_int32(pserver_async_push_dense_interval_ms,
-             10,
-             "async push_dense to server interval");
+PD_DEFINE_int32(pserver_async_push_dense_interval_ms,
+                10,
+                "async push_dense to server interval");

-DEFINE_int32(pserver_async_push_sparse_interval_ms,
-             10,
-             "async push_sparse to server interval");
+PD_DEFINE_int32(pserver_async_push_sparse_interval_ms,
+                10,
+                "async push_sparse to server interval");

-DEFINE_bool(pserver_scale_gradient_by_merge,
-            false,
-            "scale dense gradient when merged");
+PD_DEFINE_bool(pserver_scale_gradient_by_merge,
+               false,
+               "scale dense gradient when merged");

-DEFINE_int32(pserver_communicate_compress_type,
-             0,
-             "none:0 snappy:1 gzip:2 zlib:3 lz4:4");
+PD_DEFINE_int32(pserver_communicate_compress_type,
+                0,
+                "none:0 snappy:1 gzip:2 zlib:3 lz4:4");

-DEFINE_int32(pserver_max_async_call_num,
-             13,
-             "max task num in async_call_server");
+PD_DEFINE_int32(pserver_max_async_call_num,
+                13,
+                "max task num in async_call_server");

-DEFINE_int32(pserver_timeout_ms, 500000, "pserver request server timeout_ms");
+PD_DEFINE_int32(pserver_timeout_ms,
+                500000,
+                "pserver request server timeout_ms");

-DEFINE_int32(pserver_connect_timeout_ms,
-             10000,
-             "pserver connect server timeout_ms");
+PD_DEFINE_int32(pserver_connect_timeout_ms,
+                10000,
+                "pserver connect server timeout_ms");

-DEFINE_int32(pserver_sparse_merge_thread, 1, "pserver sparse merge thread num");
+PD_DEFINE_int32(pserver_sparse_merge_thread,
+                1,
+                "pserver sparse merge thread num");

-DEFINE_int32(pserver_sparse_table_shard_num,
-             1000,
-             "sparse table shard for save & load");
+PD_DEFINE_int32(pserver_sparse_table_shard_num,
+                1000,
+                "sparse table shard for save & load");

 inline size_t get_sparse_shard(uint32_t shard_num,
                                uint32_t server_num,
@@ -140,7 +144,7 @@ int32_t BrpcPsClient::StartFlClientService(const std::string &self_endpoint) {
   if (_fl_server.Start(self_endpoint.c_str(), &options) != 0) {
     VLOG(0) << "fl-ps > StartFlClientService failed. Try again.";
-    auto ip_port = paddle::string::Split(self_endpoint, ':');
+    auto ip_port = ::paddle::string::Split(self_endpoint, ':');
     std::string ip = ip_port[0];
     int port = std::stoi(ip_port[1]);
     std::string int_ip_port = GetIntTypeEndpoint(ip, port);
@@ -202,8 +206,7 @@ int32_t BrpcPsClient::InitializeFlWorker(const std::string &self_endpoint) {
   options.protocol = "baidu_std";
   options.timeout_ms = FLAGS_pserver_timeout_ms;
   options.connection_type = "pooled";
-  options.connect_timeout_ms =
-      paddle::distributed::FLAGS_pserver_connect_timeout_ms;
+  options.connect_timeout_ms = FLAGS_pserver_connect_timeout_ms;
   options.max_retry = 3;
   // fetch the coordinator list and connect to it
   std::string coordinator_ip_port;
@@ -336,11 +339,11 @@ int32_t BrpcPsClient::Initialize() {
     auto table_id = worker_param.downpour_table_param(i).table_id();
     if (type == PS_DENSE_TABLE) {
       _push_dense_task_queue_map[table_id] =
-          paddle::framework::MakeChannel();
+          ::paddle::framework::MakeChannel();
     }
     if (type == PS_SPARSE_TABLE) {
       _push_sparse_task_queue_map[table_id] =
-          paddle::framework::MakeChannel();
+          ::paddle::framework::MakeChannel();
       _push_sparse_merge_count_map[table_id] = 0;
     }
   }
@@ -446,7 +449,7 @@ std::future BrpcPsClient::PrintTableStat(uint32_t table_id) {
         int ret = 0;
         uint64_t feasign_size = 0;
         uint64_t mf_size = 0;
-        paddle::framework::BinaryArchive ar;
+        ::paddle::framework::BinaryArchive ar;
         auto *closure = reinterpret_cast(done);
         for (size_t i = 0; i < request_call_num; ++i) {
           if (closure->check_response(i, PS_PRINT_TABLE_STAT) != 0) {
diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc
index 84784c32f3b2d9..28ac123fa08ff7 100644
--- a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc
+++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc
@@ -30,15 +30,15 @@ class RpcController;
 }  // namespace protobuf
 }  // namespace google

-DEFINE_int32(pserver_timeout_ms_s2s,
-             10000,
-             "pserver request server timeout_ms");
-DEFINE_int32(pserver_connect_timeout_ms_s2s,
-             10000,
-             "pserver connect server timeout_ms");
-DEFINE_string(pserver_connection_type_s2s,
-              "pooled",
-              "pserver connection_type[pooled:single]");
+PD_DEFINE_int32(pserver_timeout_ms_s2s,
+                10000,
+                "pserver request server timeout_ms");
+PD_DEFINE_int32(pserver_connect_timeout_ms_s2s,
+                10000,
+                "pserver connect server timeout_ms");
+PD_DEFINE_string(pserver_connection_type_s2s,
+                 "pooled",
+                 "pserver connection_type[pooled:single]");

 namespace paddle {
 namespace distributed {
@@ -169,7 +169,7 @@ int32_t BrpcPsServer::ReceiveFromPServer(int msg_type,
     LOG(WARNING) << "SERVER>>RESPONSE>>msg = 0 Finish S2S Response";
     return 0;
   }
-  paddle::framework::BinaryArchive ar;
+  ::paddle::framework::BinaryArchive ar;
   ar.SetReadBuffer(const_cast(msg.c_str()), msg.length(), nullptr);
   if (ar.Cursor() == ar.Finish()) {
     LOG(WARNING) << "SERVER>>RESPONSE ar = 0>> Finish S2S Response";
@@ -598,7 +598,7 @@ int32_t BrpcPsService::PrintTableStat(Table *table,
                                       brpc::Controller *cntl) {
   CHECK_TABLE_EXIST(table, request, response)
   std::pair ret = table->PrintTableStat();
-  paddle::framework::BinaryArchive ar;
+  ::paddle::framework::BinaryArchive ar;
   ar << ret.first << ret.second;
   std::string table_info(ar.Buffer(), ar.Length());
   response.set_data(table_info);
@@ -723,7 +723,7 @@ int32_t BrpcPsService::CacheShuffle(Table *table,
   table->Flush();
   double cache_threshold = std::stod(request.params(2));
   LOG(INFO) << "cache threshold for cache shuffle: " << cache_threshold;
-  // auto shuffled_ins = paddle::ps::make_channel>();
+  // auto shuffled_ins = ::paddle::ps::make_channel>();
   // shuffled_ins->set_block_size(80000);
   _server->StartS2S();
diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc
index 1ad58d9eb0fca0..9932343fa779bd 100644
--- a/paddle/fluid/distributed/ps/service/communicator/communicator.cc
+++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc
@@ -16,11 +16,11 @@ limitations under the License. */

 #include 

-#include "gflags/gflags.h"
 #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
 #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/utils/flags.h"

 #define LEARNING_RATE_DECAY_COUNTER "@LR_DECAY_COUNTER@"
 #define STEP_COUNTER "@PS_STEP_COUNTER@"
@@ -42,7 +42,7 @@ Communicator::Communicator() = default;

 void Communicator::InitGFlag(const std::string &gflags) {
   VLOG(3) << "Init With Gflags:" << gflags;
-  std::vector flags = paddle::string::split_string(gflags);
+  std::vector flags = ::paddle::string::split_string(gflags);
   if (flags.empty()) {
     flags.push_back("-max_body_size=314217728");
     flags.push_back("-bthread_concurrency=40");
@@ -57,7 +57,7 @@
   }
   int params_cnt = flags.size();
   char **params_ptr = &(flags_ptr[0]);
-  ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&params_cnt, &params_ptr, true);
+  ::paddle::flags::ParseCommandLineFlags(&params_cnt, &params_ptr);
 }

 std::once_flag Communicator::init_flag_;
@@ -66,7 +66,7 @@ std::shared_ptr Communicator::communicator_(nullptr);

 void Communicator::InitBrpcClient(
     const std::string &dist_desc,
     const std::vector &host_sign_list) {
-  auto fleet = paddle::distributed::FleetWrapper::GetInstance();
+  auto fleet = ::paddle::distributed::FleetWrapper::GetInstance();
   if (_worker_ptr.get() == nullptr) {
     _worker_ptr = fleet->worker_ptr_;
   }
@@ -92,7 +92,7 @@ void Communicator::RpcRecvDense(const std::vector &varnames,
   platform::RecordEvent record_event("Communicator->RpcRecvDense",
                                      platform::TracerEventType::Communication,
                                      1);
-  std::vector<paddle::distributed::Region> regions;
+  std::vector<::paddle::distributed::Region> regions;
   regions.reserve(varnames.size());
   for (auto &t : varnames) {
     Variable *var = scope->Var(t);
@@ -103,7 +103,7 @@
       phi::DenseTensor *temp_tensor = temp_var->GetMutable();
       temp_tensor->Resize(tensor->dims());
       float *temp_data = temp_tensor->mutable_data(platform::CPUPlace());
-      paddle::distributed::Region reg(temp_data, tensor->numel());
+      ::paddle::distributed::Region reg(temp_data, tensor->numel());
       regions.emplace_back(std::move(reg));
       VLOG(1) << "Communicator::RpcRecvDense Var " << t << " table_id "
               << table_id << " Temp_data[0] " << temp_data[0]
@@ -111,7 +111,7 @@
 #endif
     } else {
       float *w = tensor->mutable_data(tensor->place());
-      paddle::distributed::Region reg(w, tensor->numel());
+      ::paddle::distributed::Region reg(w, tensor->numel());
       regions.emplace_back(std::move(reg));
     }
   }
@@ -152,7 +152,7 @@ void Communicator::RpcSendDenseParam(const std::vector &varnames,
                                      platform::TracerEventType::Communication,
                                      1);
   auto place = platform::CPUPlace();
-  std::vector<paddle::distributed::Region> regions;
+  std::vector<::paddle::distributed::Region> regions;
   for (auto &t : varnames) {
     Variable *var = scope.FindVar(t);
     CHECK(var != nullptr) << "var[" << t << "] not found";
@@ -164,7 +164,7 @@ void Communicator::RpcSendDenseParam(const std::vector &varnames,
       temp_tensor->Resize(tensor->dims());
       float *temp_data = temp_tensor->mutable_data(platform::CPUPlace());
       framework::TensorCopy(*tensor, platform::CPUPlace(), temp_tensor);
-      paddle::distributed::Region reg(temp_data, tensor->numel());
+      ::paddle::distributed::Region reg(temp_data, tensor->numel());
       regions.emplace_back(std::move(reg));
       VLOG(1) << "rpc_send_dense_param Var " << t << " table_id " << table_id
               << " Temp_data[0] " << temp_data[0] << " Temp_data[-1] "
@@ -172,7 +172,7 @@
 #endif
     } else {
       float *w = tensor->mutable_data(place);
-      paddle::distributed::Region reg(w, tensor->numel());
+      ::paddle::distributed::Region reg(w, tensor->numel());
       regions.emplace_back(reg);
       VLOG(1) << "rpc_send_dense_param Var " << t << " table_id " << table_id
               << " Temp_data[0] " << w[0] << " Temp_data[-1] "
@@ -1096,10 +1096,10 @@ void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx,
       parallel_task_nums_ += 1;
       sparse_id_queues_.insert(
           std::pair>>>(
              splited_var,
-             paddle::framework::MakeChannel<
+             ::paddle::framework::MakeChannel<
                  std::shared_ptr>>(send_queue_size_)));
     }
   }
@@ -1509,7 +1509,7 @@ void GeoCommunicator::MainThread() {
 void FLCommunicator::InitBrpcClient(
     const std::string &dist_desc,
     const std::vector &host_sign_list) {
-  auto fleet = paddle::distributed::FleetWrapper::GetInstance();
+  auto fleet = ::paddle::distributed::FleetWrapper::GetInstance();
   if (_worker_ptr.get() == nullptr) {
     VLOG(0) << "fl-ps > FLCommunicator::InitBrpcClient get _worker_ptr";
     _worker_ptr =
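Communicator::InitGFlag above is the behavioral edge of the migration: ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&cnt, &ptr, true) becomes ::paddle::flags::ParseCommandLineFlags(&cnt, &ptr), dropping gflags' trailing remove_flags argument. A hedged sketch of the calling convention, mirroring the call site above (the helper name is hypothetical):

    #include <string>
    #include <vector>
    #include "paddle/utils/flags.h"

    // Build an argv-style buffer from flag strings and hand it to the
    // paddle_flags parser, the same way InitGFlag does above.
    void ParseLikeInitGFlag(std::vector<std::string>& flags) {
      std::vector<char*> flags_ptr;
      for (auto& f : flags) {
        flags_ptr.push_back(const_cast<char*>(f.c_str()));
      }
      int params_cnt = static_cast<int>(flags_ptr.size());
      char** params_ptr = flags_ptr.data();
      // Two arguments only; there is no gflags-style remove_flags parameter.
      ::paddle::flags::ParseCommandLineFlags(&params_cnt, &params_ptr);
    }

The same two-argument form appears again in PSCore::InitGFlag further down, so both parse paths stay consistent.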
diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h
index 643c91b5b05e94..f6d062460b826e 100644
--- a/paddle/fluid/distributed/ps/service/communicator/communicator.h
+++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h
@@ -29,7 +29,7 @@ limitations under the License. */
 #include 
 #include 

-#include "gflags/gflags.h"
 #include "paddle/fluid/distributed/ps/service/communicator/communicator_common.h"
 #include "paddle/fluid/distributed/ps/service/coordinator_client.h"
 #include "paddle/fluid/distributed/ps/service/ps_client.h"
@@ -45,6 +44,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/selected_rows_functor.h"
+#include "paddle/utils/flags.h"

 namespace paddle {
 namespace distributed {
@@ -410,8 +410,8 @@ class Communicator {
   }

   void InitGFlag(const std::string &gflags);
-  paddle::distributed::PSParameter _ps_param;
-  paddle::distributed::PaddlePSEnvironment _ps_env;
+  ::paddle::distributed::PSParameter _ps_param;
+  ::paddle::distributed::PaddlePSEnvironment _ps_env;
   int servers_ = 0;
   int trainers_;
   int trainer_id_ = 0;
@@ -661,7 +661,7 @@ class GeoCommunicator : public AsyncCommunicator {
   std::unordered_map<
       std::string,
-      paddle::framework::Channel>>>
+      ::paddle::framework::Channel>>>
       sparse_id_queues_;
 };
diff --git a/paddle/fluid/distributed/ps/service/coordinator_client.cc b/paddle/fluid/distributed/ps/service/coordinator_client.cc
index 9a77170b37c53c..c9c2ba49c9bf39 100644
--- a/paddle/fluid/distributed/ps/service/coordinator_client.cc
+++ b/paddle/fluid/distributed/ps/service/coordinator_client.cc
@@ -28,8 +28,8 @@ static const int MAX_PORT = 65535;
 namespace paddle {
 namespace distributed {

-DEFINE_uint64(total_fl_client_size, 100, "supported total fl client size");
-DEFINE_uint32(coordinator_wait_all_clients_max_time, 60, "uint32: s");
+PD_DEFINE_uint64(total_fl_client_size, 100, "supported total fl client size");
+PD_DEFINE_uint32(coordinator_wait_all_clients_max_time, 60, "uint32: s");

 void CoordinatorService::FLService(
     ::google::protobuf::RpcController* controller,
@@ -62,10 +62,10 @@ int32_t CoordinatorClient::Initialize(
     const std::vector& trainer_endpoints) {
   brpc::ChannelOptions options;
   options.protocol = "baidu_std";
-  options.timeout_ms = paddle::distributed::FLAGS_pserver_timeout_ms;
+  options.timeout_ms = ::paddle::distributed::FLAGS_pserver_timeout_ms;
   options.connection_type = "pooled";
   options.connect_timeout_ms =
-      paddle::distributed::FLAGS_pserver_connect_timeout_ms;
+      ::paddle::distributed::FLAGS_pserver_connect_timeout_ms;
   options.max_retry = 3;

   std::string server_ip_port;
@@ -109,7 +109,7 @@
   }
   for (size_t i = 0; i < trainer_endpoints.size(); i++) {
     std::vector addr =
-        paddle::string::Split(trainer_endpoints[i], ':');
+        ::paddle::string::Split(trainer_endpoints[i], ':');
     fl_client_list[i].ip = addr[0];
     fl_client_list[i].port = std::stol(addr[1]);
     fl_client_list[i].rank = i;  // TO CHECK
@@ -152,7 +152,7 @@ int32_t CoordinatorClient::StartClientService() {
     LOG(ERROR) << "fl-ps > coordinator server endpoint not set";
     return -1;
   }
-  auto addr = paddle::string::Split(_endpoint, ':');
+  auto addr = ::paddle::string::Split(_endpoint, ':');
   std::string ip = addr[0];
   std::string port = addr[1];
   std::string rank = addr[2];
diff --git a/paddle/fluid/distributed/ps/service/coordinator_client.h b/paddle/fluid/distributed/ps/service/coordinator_client.h
index bd1f0f7754d8c8..8db08c3fc79994 100644
--- a/paddle/fluid/distributed/ps/service/coordinator_client.h
+++ b/paddle/fluid/distributed/ps/service/coordinator_client.h
@@ -34,10 +34,10 @@ namespace paddle {
 namespace distributed {

-DECLARE_int32(pserver_timeout_ms);
-DECLARE_int32(pserver_connect_timeout_ms);
-DECLARE_uint64(total_fl_client_size);
-DECLARE_uint32(coordinator_wait_all_clients_max_time);
+PD_DECLARE_int32(pserver_timeout_ms);
+PD_DECLARE_int32(pserver_connect_timeout_ms);
+PD_DECLARE_uint64(total_fl_client_size);
+PD_DECLARE_uint32(coordinator_wait_all_clients_max_time);

 using CoordinatorServiceFunc = std::function
 bool {
         while (query_wait_time <
-                paddle::distributed::
-                    FLAGS_coordinator_wait_all_clients_max_time) {  // in case that
-                                                                    // some
-                                                                    // clients down
+                FLAGS_coordinator_wait_all_clients_max_time) {  // in case that
+                                                                // some
+                                                                // clients down
           if (_is_all_clients_info_collected == true) {
             // LOG(INFO) << "fl-ps > _is_all_clients_info_collected";
             return true;
diff --git a/paddle/fluid/distributed/ps/service/env.h b/paddle/fluid/distributed/ps/service/env.h
index aa230f86c9d172..d6b403523496c5 100644
--- a/paddle/fluid/distributed/ps/service/env.h
+++ b/paddle/fluid/distributed/ps/service/env.h
@@ -25,8 +25,8 @@
 #include 
 #include 

-#include "gflags/gflags.h"
 #include "paddle/phi/core/macros.h"
+#include "paddle/utils/flags.h"

 namespace paddle {
 namespace distributed {
Try Again"; - auto ip_port = paddle::string::Split(previous_xpu_list_[i], ':'); + auto ip_port = ::paddle::string::Split(previous_xpu_list_[i], ':'); std::string ip = ip_port[0]; int port = std::stoi(ip_port[1]); std::string int_ip_port = GetIntTypeEndpoint(ip, port); @@ -181,11 +181,11 @@ void HeterClient::SendAndRecvAsync( std::future HeterClient::SendCmd( uint32_t table_id, int cmd_id, const std::vector& params) { size_t request_call_num = xpu_channels_.size(); - paddle::distributed::DownpourBrpcClosure* closure = - new paddle::distributed::DownpourBrpcClosure( + ::paddle::distributed::DownpourBrpcClosure* closure = + new ::paddle::distributed::DownpourBrpcClosure( request_call_num, [request_call_num, cmd_id](void* done) { int ret = 0; - auto* closure = (paddle::distributed::DownpourBrpcClosure*)done; + auto* closure = (::paddle::distributed::DownpourBrpcClosure*)done; for (size_t i = 0; i < request_call_num; ++i) { if (closure->check_response(i, cmd_id) != 0) { ret = -1; diff --git a/paddle/fluid/distributed/ps/service/heter_client.h b/paddle/fluid/distributed/ps/service/heter_client.h index 10d916b7100118..e6c231338ac529 100755 --- a/paddle/fluid/distributed/ps/service/heter_client.h +++ b/paddle/fluid/distributed/ps/service/heter_client.h @@ -42,7 +42,7 @@ class Scope; namespace paddle { namespace distributed { -DECLARE_int32(pserver_timeout_ms); +PD_DECLARE_int32(pserver_timeout_ms); using MultiVarMsg = ::paddle::distributed::MultiVariableMessage; using VarMsg = ::paddle::distributed::VariableMessage; @@ -116,7 +116,7 @@ class HeterClient { if ((*client_channels)[i]->Init(node_list[i].c_str(), "", &options) != 0) { VLOG(0) << "client channel init failed! try again"; - auto ip_port = paddle::string::Split(node_list[i], ':'); + auto ip_port = ::paddle::string::Split(node_list[i], ':'); std::string ip = ip_port[0]; int port = std::stoi(ip_port[1]); std::string int_ip_port = GetIntTypeEndpoint(ip, port); diff --git a/paddle/fluid/distributed/ps/service/heter_server.cc b/paddle/fluid/distributed/ps/service/heter_server.cc index d5d8546bf792e9..eb4d9b83045139 100644 --- a/paddle/fluid/distributed/ps/service/heter_server.cc +++ b/paddle/fluid/distributed/ps/service/heter_server.cc @@ -18,8 +18,8 @@ namespace paddle { namespace distributed { -// DEFINE_string(cert_path, "./cert.pem", "cert.pem path"); -// DEFINE_string(key_path, "./key.pem", "key.pem path"); +// PD_DEFINE_string(cert_path, "./cert.pem", "cert.pem path"); +// PD_DEFINE_string(key_path, "./key.pem", "key.pem path"); std::shared_ptr HeterServer::s_instance_ = nullptr; std::mutex HeterServer::mtx_; @@ -37,7 +37,7 @@ void HeterServer::StartHeterService(bool neeed_encrypt) { } if (server_.Start(endpoint_.c_str(), &options) != 0) { VLOG(0) << "HeterServer start fail. Try again."; - auto ip_port = paddle::string::Split(endpoint_, ':'); + auto ip_port = ::paddle::string::Split(endpoint_, ':'); std::string ip = ip_port[0]; int port = std::stoi(ip_port[1]); std::string int_ip_port = GetIntTypeEndpoint(ip, port); @@ -72,7 +72,7 @@ void HeterServer::StartHeterInterService(bool neeed_encrypt) { } if (server_inter_.Start(endpoint_inter_.c_str(), &options) != 0) { VLOG(4) << "switch inter server start fail. 
Try again."; - auto ip_port = paddle::string::Split(endpoint_inter_, ':'); + auto ip_port = ::paddle::string::Split(endpoint_inter_, ':'); std::string ip = ip_port[0]; int port = std::stoi(ip_port[1]); std::string int_ip_port = GetIntTypeEndpoint(ip, port); diff --git a/paddle/fluid/distributed/ps/service/heter_server.h b/paddle/fluid/distributed/ps/service/heter_server.h index c4e9d05ac13520..1f134d4db18aed 100755 --- a/paddle/fluid/distributed/ps/service/heter_server.h +++ b/paddle/fluid/distributed/ps/service/heter_server.h @@ -57,9 +57,9 @@ PHI_DECLARE_double(eager_delete_tensor_gb); namespace paddle { namespace distributed { -DECLARE_int32(pserver_timeout_ms); -DECLARE_int32(heter_world_size); -DECLARE_int32(switch_send_recv_timeout_s); +PD_DECLARE_int32(pserver_timeout_ms); +PD_DECLARE_int32(heter_world_size); +PD_DECLARE_int32(switch_send_recv_timeout_s); using MultiVarMsg = MultiVariableMessage; using VarMsg = VariableMessage; @@ -216,8 +216,8 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase { // get microID from request // deserialize variable to micro scope // Push to heter worker's task_queue - std::unique_ptr local_scope_ptr( - new paddle::framework::Scope()); + std::unique_ptr<::paddle::framework::Scope> local_scope_ptr( + new ::paddle::framework::Scope()); auto& local_scope = *(local_scope_ptr.get()); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::CPUPlace cpu_place; @@ -257,7 +257,7 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase { auto* minibatch_scope = &(scope_->NewScope()); (*mini_scopes_)[minibatch_index] = minibatch_scope; (*micro_scopes_)[minibatch_index].reset( - new std::vector{}); + new std::vector<::paddle::framework::Scope*>{}); for (int i = 0; i < num_microbatch_; i++) { auto* micro_scope = &(minibatch_scope->NewScope()); (*((*micro_scopes_)[minibatch_index])).push_back(micro_scope); @@ -300,7 +300,7 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase { public: using shard_type = SparseTableShard; - std::shared_ptr local_scope_ptr; // for switch + std::shared_ptr<::paddle::framework::Scope> local_scope_ptr; // for switch std::unordered_map> vars_ready_flag; std::unique_ptr _local_shards; @@ -344,7 +344,7 @@ class HeterService : public PsService { std::placeholders::_3); service_handler_.local_scope_ptr = - std::make_shared(); + std::make_shared<::paddle::framework::Scope>(); } virtual ~HeterService() {} @@ -613,7 +613,7 @@ class HeterServer { void SetLocalScope() { request_handler_->local_scope_ptr = - std::make_shared(); + std::make_shared<::paddle::framework::Scope>(); } void SetInterEndpoint(const std::string& endpoint) { diff --git a/paddle/fluid/distributed/ps/service/ps_client.cc b/paddle/fluid/distributed/ps/service/ps_client.cc index 3dd2b8c775cbee..ecdcd8b874377b 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_client.cc @@ -37,7 +37,8 @@ REGISTER_PSCORE_CLASS(PSClient, PsGraphClient); int32_t PSClient::Configure( // called in FleetWrapper::InitWorker const PSParameter &config, - const std::map> ®ions, + const std::map> + ®ions, PSEnvironment &env, size_t client_id) { _env = &env; @@ -88,7 +89,7 @@ PSClient *PSClientFactory::Create(const PSParameter &ps_config) { PSClient *client = NULL; #if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_GPU_GRAPH) - auto gloo = paddle::framework::GlooWrapper::GetInstance(); + auto gloo = ::paddle::framework::GlooWrapper::GetInstance(); if (client_name == 
"PsLocalClient" && gloo->Size() > 1) { client = CREATE_PSCORE_CLASS(PSClient, "PsGraphClient"); LOG(WARNING) << "change PsLocalClient to PsGraphClient"; diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index 33490281981d54..44836e7661b5f1 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -143,13 +143,13 @@ class GraphPyServer : public GraphPyService { void start_server(bool block = true); ::paddle::distributed::PSParameter GetServerProto(); - std::shared_ptr get_ps_server() { + std::shared_ptr<::paddle::distributed::GraphBrpcServer> get_ps_server() { return pserver_ptr; } protected: int rank; - std::shared_ptr pserver_ptr; + std::shared_ptr<::paddle::distributed::GraphBrpcServer> pserver_ptr; std::thread* server_thread; }; class GraphPyClient : public GraphPyService { @@ -162,14 +162,14 @@ class GraphPyClient : public GraphPyService { set_client_id(client_id); GraphPyService::set_up(ips_str, shard_num, node_types, edge_types); } - std::shared_ptr get_ps_client() { + std::shared_ptr<::paddle::distributed::GraphBrpcClient> get_ps_client() { return worker_ptr; } void bind_local_server(int local_channel_index, GraphPyServer& server) { // NOLINT worker_ptr->set_local_channel(local_channel_index); worker_ptr->set_local_graph_service( - (paddle::distributed::GraphBrpcService*)server.get_ps_server() + (::paddle::distributed::GraphBrpcService*)server.get_ps_server() ->get_service()); } void StopServer(); @@ -209,7 +209,7 @@ class GraphPyClient : public GraphPyService { protected: mutable std::mutex mutex_; int client_id; - std::shared_ptr worker_ptr; + std::shared_ptr<::paddle::distributed::GraphBrpcClient> worker_ptr; std::thread* client_thread; bool stoped_ = false; }; diff --git a/paddle/fluid/distributed/ps/service/ps_service/service.cc b/paddle/fluid/distributed/ps/service/ps_service/service.cc index d62cdb4c133eda..e66475e88d8755 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/service.cc @@ -28,9 +28,9 @@ using namespace std; // NOLINT namespace paddle { namespace distributed { -paddle::distributed::PSParameter load_from_prototxt( +::paddle::distributed::PSParameter load_from_prototxt( const std::string& filename) { - paddle::distributed::PSParameter param; + ::paddle::distributed::PSParameter param; int file_descriptor = open(filename.c_str(), O_RDONLY); if (file_descriptor == -1) { @@ -50,7 +50,7 @@ paddle::distributed::PSParameter load_from_prototxt( void PSCore::InitGFlag(const std::string& gflags) { VLOG(3) << "Init With Gflags:" << gflags; - std::vector flags = paddle::string::split_string(gflags); + std::vector flags = ::paddle::string::split_string(gflags); if (flags.empty()) { flags.push_back("-max_body_size=314217728"); flags.push_back("-socket_max_unwritten_bytes=2048000000"); @@ -64,7 +64,7 @@ void PSCore::InitGFlag(const std::string& gflags) { } int params_cnt = flags.size(); char** params_ptr = &(flags_ptr[0]); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(¶ms_cnt, ¶ms_ptr, true); + ::paddle::flags::ParseCommandLineFlags(¶ms_cnt, ¶ms_ptr); } int PSCore::InitServer( @@ -76,12 +76,12 @@ int PSCore::InitServer( const std::vector& server_sub_program) { google::protobuf::TextFormat::ParseFromString(dist_desc, &_ps_param); InitGFlag(_ps_param.init_gflags()); - _ps_env = paddle::distributed::PaddlePSEnvironment(); 
+ _ps_env = ::paddle::distributed::PaddlePSEnvironment(); _ps_env.SetPsServers(host_sign_list, node_num); _ps_env.SetTrainers(trainers); int ret = 0; - _server_ptr = std::shared_ptr( - paddle::distributed::PSServerFactory::Create(_ps_param)); + _server_ptr = std::shared_ptr<::paddle::distributed::PSServer>( + ::paddle::distributed::PSServerFactory::Create(_ps_param)); ret = _server_ptr->Configure(_ps_param, _ps_env, index, server_sub_program); CHECK(ret == 0) << "failed to configure server"; return ret; @@ -89,13 +89,14 @@ int PSCore::InitServer( int PSCore::InitWorker( const std::string& dist_desc, - const std::map>& regions, + const std::map>& + regions, const std::vector* host_sign_list, int node_num, int index) { google::protobuf::TextFormat::ParseFromString(dist_desc, &_ps_param); InitGFlag(_ps_param.init_gflags()); - _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env = ::paddle::distributed::PaddlePSEnvironment(); _ps_env.SetPsServers(host_sign_list, node_num); int ret = 0; VLOG(1) << "PSCore::InitWorker"; @@ -132,6 +133,6 @@ int PSCore::StopServer() { stop_status.wait(); return 0; } -paddle::distributed::PSParameter* PSCore::GetParam() { return &_ps_param; } +::paddle::distributed::PSParameter* PSCore::GetParam() { return &_ps_param; } } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/service/ps_service/service.h b/paddle/fluid/distributed/ps/service/ps_service/service.h index eb190073fbd834..4c3c6db61e2bc4 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/service.h @@ -33,9 +33,9 @@ class PsRequestMessage; class PsResponseMessage; class PsService; -using paddle::distributed::PsRequestMessage; -using paddle::distributed::PsResponseMessage; -using paddle::distributed::PsService; +using ::paddle::distributed::PsRequestMessage; +using ::paddle::distributed::PsResponseMessage; +using ::paddle::distributed::PsService; class PSCore { public: @@ -51,7 +51,7 @@ class PSCore { const std::vector& server_sub_program = {}); virtual int InitWorker( const std::string& dist_desc, - const std::map>& + const std::map>& regions, const std::vector* host_sign_list, int node_num, @@ -63,16 +63,16 @@ class PSCore { virtual int CreateClient2ClientConnection(int pserver_timeout_ms, int pserver_connect_timeout_ms, int max_retry); - std::shared_ptr + std::shared_ptr<::paddle::distributed::PSServer> _server_ptr; // pointer to server - std::shared_ptr + std::shared_ptr<::paddle::distributed::PSClient> _worker_ptr; // pointer to worker - virtual paddle::distributed::PSParameter* GetParam(); + virtual ::paddle::distributed::PSParameter* GetParam(); private: void InitGFlag(const std::string& gflags); - paddle::distributed::PSParameter _ps_param; - paddle::distributed::PaddlePSEnvironment _ps_env; + ::paddle::distributed::PSParameter _ps_param; + ::paddle::distributed::PaddlePSEnvironment _ps_env; }; } // namespace distributed diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h index 48b32d22cac799..fc1d4a2bd343ba 100644 --- a/paddle/fluid/distributed/ps/service/server.h +++ b/paddle/fluid/distributed/ps/service/server.h @@ -57,8 +57,8 @@ namespace distributed { class Table; -using paddle::distributed::PsRequestMessage; -using paddle::distributed::PsResponseMessage; +using ::paddle::distributed::PsRequestMessage; +using ::paddle::distributed::PsResponseMessage; class PSServer { public: @@ -134,7 +134,7 @@ class PSServer { return -1; } - 
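A side note on spellings like std::make_shared<::paddle::framework::Scope>() and shared_ptr<::paddle::distributed::PSServer> introduced above: the <:: token sequence is only unambiguous thanks to a C++11 lexing rule; before that, <: was a digraph for [, so make_shared<::paddle... would not even parse. Modern compilers accept it with no space between < and ::, as this quick self-contained check illustrates:

    #include <memory>

    namespace paddle { namespace framework { class Scope {}; } }

    int main() {
      // "<::" lexes as "<" followed by "::" since C++11; under C++03 the
      // "<:" digraph would have turned this into "[" and failed to compile.
      auto scope = std::make_shared<::paddle::framework::Scope>();
      return scope ? 0 : 1;
    }

That is why the patch can add the leading :: directly inside template argument lists without extra whitespace.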
diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h
index 48b32d22cac799..fc1d4a2bd343ba 100644
--- a/paddle/fluid/distributed/ps/service/server.h
+++ b/paddle/fluid/distributed/ps/service/server.h
@@ -57,8 +57,8 @@ namespace distributed {

 class Table;

-using paddle::distributed::PsRequestMessage;
-using paddle::distributed::PsResponseMessage;
+using ::paddle::distributed::PsRequestMessage;
+using ::paddle::distributed::PsResponseMessage;

 class PSServer {
  public:
@@ -134,7 +134,7 @@ class PSServer {
     return -1;
   }

-  paddle::framework::Channel> _shuffled_ins;
+  ::paddle::framework::Channel> _shuffled_ins;

 protected:
  virtual int32_t Initialize() = 0;
diff --git a/paddle/fluid/distributed/ps/service/simple_rpc/baidu_rpc_server.cc b/paddle/fluid/distributed/ps/service/simple_rpc/baidu_rpc_server.cc
index f3e501dd00ce1b..a10e78fe941629 100644
--- a/paddle/fluid/distributed/ps/service/simple_rpc/baidu_rpc_server.cc
+++ b/paddle/fluid/distributed/ps/service/simple_rpc/baidu_rpc_server.cc
@@ -19,8 +19,8 @@
 #include "paddle/phi/core/enforce.h"

 namespace brpc {
-DECLARE_uint64(max_body_size);
-DECLARE_int64(socket_max_unwritten_bytes);
+PD_DECLARE_uint64(max_body_size);
+PD_DECLARE_int64(socket_max_unwritten_bytes);
 }  // namespace brpc

 namespace paddle {
diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc
index 316b2295c3389f..153c67317d54db 100644
--- a/paddle/fluid/distributed/ps/table/common_graph_table.cc
+++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc
@@ -58,14 +58,14 @@ int32_t GraphTable::Load_to_ssd(const std::string &path,
   return 0;
 }

-paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea(
+::paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea(
     int gpu_id, std::vector &node_ids, int slot_num) {
   size_t shard_num = 64;
   std::vector> bags(shard_num);
   std::vector feature_array[shard_num];
   std::vector slot_id_array[shard_num];
   std::vector node_id_array[shard_num];
-  std::vector<paddle::framework::GpuPsFeaInfo> node_fea_info_array[shard_num];
+  std::vector<::paddle::framework::GpuPsFeaInfo> node_fea_info_array[shard_num];
   for (size_t i = 0; i < shard_num; i++) {
     auto predsize = node_ids.size() / shard_num;
     bags[i].reserve(predsize * 1.2);
@@ -92,7 +92,7 @@
     if (bags[i].size() > 0) {
       tasks.push_back(_cpu_worker_pool[gpu_id]->enqueue([&, i, this]() -> int {
         uint64_t node_id;
-        paddle::framework::GpuPsFeaInfo x;
+        ::paddle::framework::GpuPsFeaInfo x;
         std::vector feature_ids;
         for (size_t j = 0; j < bags[i].size(); j++) {
           Node *v = find_node(GraphTableType::FEATURE_TABLE, bags[i][j]);
@@ -134,7 +134,7 @@

   tasks.clear();

-  paddle::framework::GpuPsCommGraphFea res;
+  ::paddle::framework::GpuPsCommGraphFea res;
   uint64_t tot_len = 0;
   for (size_t i = 0; i < shard_num; i++) {
     tot_len += feature_array[i].size();
@@ -165,7 +165,7 @@
   return res;
 }

-paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph(
+::paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph(
     int idx, const std::vector &ids) {
   std::vector> bags(task_pool_size_);
   for (int i = 0; i < task_pool_size_; i++) {
@@ -179,7 +179,7 @@
   std::vector> tasks;
   std::vector node_array[task_pool_size_];  // node id list
-  std::vector<paddle::framework::GpuPsNodeInfo> info_array[task_pool_size_];
+  std::vector<::paddle::framework::GpuPsNodeInfo> info_array[task_pool_size_];
   std::vector edge_array[task_pool_size_];  // edge id list

   for (size_t i = 0; i < bags.size(); i++) {
@@ -215,7 +215,7 @@
     tot_len += edge_array[i].size();
   }

-  paddle::framework::GpuPsCommGraph res;
+  ::paddle::framework::GpuPsCommGraph res;
   res.init_on_cpu(tot_len, ids.size());
   int64_t offset = 0, ind = 0;
   for (int i = 0; i < task_pool_size_; i++) {
@@ -516,13 +516,13 @@ void GraphTable::release_graph() {
   build_graph_type_keys();

   if (FLAGS_gpugraph_storage_mode ==
-      paddle::framework::GpuGraphStorageMode::WHOLE_HBM) {
+      ::paddle::framework::GpuGraphStorageMode::WHOLE_HBM) {
     build_graph_total_keys();
   }
   // clear graph
-  if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::
-          MEM_EMB_FEATURE_AND_GPU_GRAPH ||
-      FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::
-          SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH) {
+  if (FLAGS_gpugraph_storage_mode == ::paddle::framework::GpuGraphStorageMode::
+          MEM_EMB_FEATURE_AND_GPU_GRAPH ||
+      FLAGS_gpugraph_storage_mode == ::paddle::framework::GpuGraphStorageMode::
+          SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH) {
     clear_edge_shard();
   } else {
@@ -532,7 +532,7 @@ void GraphTable::release_graph() {

 void GraphTable::release_graph_edge() {
   if (FLAGS_gpugraph_storage_mode ==
-      paddle::framework::GpuGraphStorageMode::WHOLE_HBM) {
+      ::paddle::framework::GpuGraphStorageMode::WHOLE_HBM) {
     build_graph_total_keys();
   }
   clear_edge_shard();
@@ -543,10 +543,12 @@ void GraphTable::release_graph_node() {
   if (FLAGS_graph_metapath_split_opt) {
     clear_feature_shard();
   } else {
-    if (FLAGS_gpugraph_storage_mode != paddle::framework::GpuGraphStorageMode::
-            MEM_EMB_FEATURE_AND_GPU_GRAPH &&
-        FLAGS_gpugraph_storage_mode != paddle::framework::GpuGraphStorageMode::
-            SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH) {
+    if (FLAGS_gpugraph_storage_mode !=
+            ::paddle::framework::GpuGraphStorageMode::
+                MEM_EMB_FEATURE_AND_GPU_GRAPH &&
+        FLAGS_gpugraph_storage_mode !=
+            ::paddle::framework::GpuGraphStorageMode::
+                SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH) {
       clear_feature_shard();
     } else {
       merge_feature_shard();
@@ -666,7 +668,7 @@ int32_t GraphTable::load_edges_to_ssd(const std::string &path,
     idx = edge_to_id[edge_type];
   }
   total_memory_cost = 0;
-  auto paths = paddle::string::split_string(path, ";");
+  auto paths = ::paddle::string::split_string(path, ";");
   int64_t count = 0;
   std::string sample_type = "random";
   for (auto path : paths) {
@@ -674,11 +676,12 @@
     std::string line;
     while (std::getline(file, line)) {
       VLOG(0) << "get a line from file " << line;
-      auto values = paddle::string::split_string(line, "\t");
+      auto values = ::paddle::string::split_string(line, "\t");
       count++;
       if (values.size() < 2) continue;
       auto src_id = std::stoll(values[0]);
-      auto dist_ids = paddle::string::split_string(values[1], ";");
+      auto dist_ids =
+          ::paddle::string::split_string(values[1], ";");
       std::vector dist_data;
       for (auto x : dist_ids) {
         dist_data.push_back(std::stoll(x));
@@ -798,7 +801,7 @@ int CompleteGraphSampler::run_graph_sampling() {
   sample_nodes.resize(gpu_num);
   sample_neighbors.resize(gpu_num);
   sample_res.resize(gpu_num);
-  std::vector>>
+  std::vector>>
       sample_nodes_ex(graph_table->task_pool_size_);
   std::vector>> sample_neighbors_ex(
       graph_table->task_pool_size_);
@@ -812,7 +815,7 @@
       graph_table->_shards_task_pool[i % graph_table->task_pool_size_]
           ->enqueue([&, i, this]() -> int {
             if (this->status == GraphSamplerStatus::terminating) return 0;
-            paddle::framework::GpuPsGraphNode node;
+            ::paddle::framework::GpuPsGraphNode node;
             std::vector &v =
                 this->graph_table->shards[i]->get_bucket();
             size_t ind = i % this->graph_table->task_pool_size_;
@@ -962,7 +965,7 @@ int BasicBfsGraphSampler::run_graph_sampling() {
   sample_nodes.resize(gpu_num);
   sample_neighbors.resize(gpu_num);
   sample_res.resize(gpu_num);
-  std::vector>>
+  std::vector>>
      sample_nodes_ex(graph_table->task_pool_size_);
   std::vector>> sample_neighbors_ex(
       graph_table->task_pool_size_);
@@ -977,7 +980,7 @@
         if (this->status == GraphSamplerStatus::terminating) {
           return 0;
         }
-        paddle::framework::GpuPsGraphNode node;
+        ::paddle::framework::GpuPsGraphNode node;
         auto iter = sample_neighbors_map[i].begin();
         size_t ind = i;
         for (; iter != sample_neighbors_map[i].end(); iter++) {
@@ -1237,7 +1240,7 @@ int32_t GraphTable::Load(const std::string &path, const std::string &param) {
 }

 std::string GraphTable::get_inverse_etype(std::string &etype) {
-  auto etype_split = paddle::string::split_string(etype, "2");
+  auto etype_split = ::paddle::string::split_string(etype, "2");
   std::string res;
   if (etype_split.size() == 3) {
     res = etype_split[2] + "2" + etype_split[1] + "2" + etype_split[0];
@@ -1253,13 +1256,13 @@ int32_t GraphTable::parse_type_to_typepath(
     std::vector &res_type,
     std::unordered_map &res_type2path) {
   auto type2files_split =
-      paddle::string::split_string(type2files, ",");
+      ::paddle::string::split_string(type2files, ",");
   if (type2files_split.empty()) {
     return -1;
   }
   for (auto one_type2file : type2files_split) {
     auto one_type2file_split =
-        paddle::string::split_string(one_type2file, ":");
+        ::paddle::string::split_string(one_type2file, ":");
     auto type = one_type2file_split[0];
     auto type_dir = one_type2file_split[1];
     res_type.push_back(type);
@@ -1304,17 +1307,17 @@
       VLOG(1) << "only_load_reverse_edge is False, etype[" << etypes[i]
               << "], file_path[" << etype_path << "]";
     }
-    auto etype_path_list = paddle::framework::localfs_list(etype_path);
+    auto etype_path_list = ::paddle::framework::localfs_list(etype_path);
     std::string etype_path_str;
     if (part_num > 0 && part_num < static_cast(etype_path_list.size())) {
       std::vector sub_etype_path_list(
           etype_path_list.begin(), etype_path_list.begin() + part_num);
       etype_path_str =
-          paddle::string::join_strings(sub_etype_path_list, delim);
+          ::paddle::string::join_strings(sub_etype_path_list, delim);
     } else {
       etype_path_str =
-          paddle::string::join_strings(etype_path_list, delim);
+          ::paddle::string::join_strings(etype_path_list, delim);
     }
     if (!only_load_reverse_edge) {
       this->load_edges(etype_path_str, false, etypes[i]);
@@ -1345,14 +1348,14 @@ int32_t GraphTable::parse_node_and_load(std::string ntype2files,
   }
   std::string delim = ";";
   std::string npath = node_to_nodedir[ntypes[0]];
-  auto npath_list = paddle::framework::localfs_list(npath);
+  auto npath_list = ::paddle::framework::localfs_list(npath);
   std::string npath_str;
   if (part_num > 0 && part_num < static_cast(npath_list.size())) {
     std::vector sub_npath_list(npath_list.begin(),
                                npath_list.begin() + part_num);
-    npath_str = paddle::string::join_strings(sub_npath_list, delim);
+    npath_str = ::paddle::string::join_strings(sub_npath_list, delim);
   } else {
-    npath_str = paddle::string::join_strings(npath_list, delim);
+    npath_str = ::paddle::string::join_strings(npath_list, delim);
   }

   if (ntypes.empty()) {
@@ -1425,17 +1428,18 @@ int32_t GraphTable::load_node_and_edge_file(
       VLOG(1) << "only_load_reverse_edge is False, etype[" << etypes[i]
               << "], file_path[" << etype_path << "]";
     }
-    auto etype_path_list = paddle::framework::localfs_list(etype_path);
+    auto etype_path_list =
+        ::paddle::framework::localfs_list(etype_path);
     std::string etype_path_str;
     if (part_num > 0 &&
         part_num < static_cast(etype_path_list.size())) {
       std::vector sub_etype_path_list(
           etype_path_list.begin(), etype_path_list.begin() + part_num);
       etype_path_str =
-          paddle::string::join_strings(sub_etype_path_list, delim);
+          ::paddle::string::join_strings(sub_etype_path_list, delim);
     } else {
       etype_path_str =
-          paddle::string::join_strings(etype_path_list, delim);
+          ::paddle::string::join_strings(etype_path_list, delim);
     }
     if (!only_load_reverse_edge) {
       this->load_edges(etype_path_str, false, etypes[i]);
@@ -1448,15 +1452,15 @@
     }
   } else {
     std::string npath = node_to_nodedir[ntypes[0]];
-    auto npath_list = paddle::framework::localfs_list(npath);
+    auto npath_list = ::paddle::framework::localfs_list(npath);
     std::string npath_str;
     if (part_num > 0 && part_num < static_cast(npath_list.size())) {
       std::vector sub_npath_list(
           npath_list.begin(), npath_list.begin() + part_num);
-      npath_str = paddle::string::join_strings(sub_npath_list, delim);
+      npath_str = ::paddle::string::join_strings(sub_npath_list, delim);
     } else {
-      npath_str = paddle::string::join_strings(npath_list, delim);
+      npath_str = ::paddle::string::join_strings(npath_list, delim);
     }

     if (ntypes.empty()) {
@@ -1553,14 +1557,14 @@ std::pair GraphTable::parse_node_file(
   uint64_t local_valid_count = 0;

   int num = 0;
-  std::vector<paddle::string::str_ptr> vals;
+  std::vector<::paddle::string::str_ptr> vals;
   size_t n = node_type.length();
   while (std::getline(file, line)) {
     if (strncmp(line.c_str(), node_type.c_str(), n) != 0) {
       continue;
     }
     vals.clear();
-    num = paddle::string::split_string_ptr(
+    num = ::paddle::string::split_string_ptr(
         line.c_str() + n + 1, line.length() - n - 1, '\t', &vals);
     if (num == 0) {
       continue;
@@ -1603,15 +1607,15 @@ std::pair GraphTable::parse_node_file(
   uint64_t local_valid_count = 0;
   int idx = 0;

-  auto path_split = paddle::string::split_string(path, "/");
+  auto path_split = ::paddle::string::split_string(path, "/");
   auto path_name = path_split[path_split.size() - 1];

   int num = 0;
-  std::vector<paddle::string::str_ptr> vals;
+  std::vector<::paddle::string::str_ptr> vals;

   while (std::getline(file, line)) {
     vals.clear();
-    num = paddle::string::split_string_ptr(
+    num = ::paddle::string::split_string_ptr(
         line.c_str(), line.length(), '\t', &vals);
     if (vals.empty()) {
       continue;
@@ -1654,7 +1658,7 @@
 // // TODO(danleifeng): opt load all node_types in once reading
 int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) {
-  auto paths = paddle::string::split_string(path, ";");
+  auto paths = ::paddle::string::split_string(path, ";");
   uint64_t count = 0;
   uint64_t valid_count = 0;
   int idx = 0;
@@ -1725,8 +1729,8 @@ std::pair GraphTable::parse_edge_file(
   uint64_t local_valid_count = 0;
   uint64_t part_num = 0;
   if (FLAGS_graph_load_in_parallel) {
-    auto path_split = paddle::string::split_string(path, "/");
-    auto part_name_split = paddle::string::split_string(
+    auto path_split = ::paddle::string::split_string(path, "/");
+    auto part_name_split = ::paddle::string::split_string(
         path_split[path_split.size() - 1], "-");
     part_num = std::stoull(part_name_split[part_name_split.size() - 1]);
   }
@@ -1793,7 +1797,7 @@ int32_t GraphTable::load_edges(const std::string &path,
     idx = edge_to_id[edge_type];
   }

-  auto paths = paddle::string::split_string(path, ";");
+  auto paths = ::paddle::string::split_string(path, ";");
   uint64_t count = 0;
   uint64_t valid_count = 0;
@@ -1865,7 +1869,7 @@ Node *GraphTable::find_node(GraphTableType table_type, uint64_t id) {
       table_type == GraphTableType::EDGE_TABLE ? edge_shards : feature_shards;
   for (auto &search_shard : search_shards) {
     PADDLE_ENFORCE_NOT_NULL(search_shard[index],
-                            paddle::platform::errors::InvalidArgument(
+                            ::paddle::platform::errors::InvalidArgument(
                                 "search_shard[%d] should not be null.", index));
     node = search_shard[index]->find_node(id);
     if (node != nullptr) {
@@ -1885,7 +1889,7 @@ Node *GraphTable::find_node(GraphTableType table_type, int idx, uint64_t id) {
                                  ? edge_shards[idx]
                                  : feature_shards[idx];
   PADDLE_ENFORCE_NOT_NULL(search_shards[index],
-                          paddle::platform::errors::InvalidArgument(
+                          ::paddle::platform::errors::InvalidArgument(
                               "search_shard[%d] should not be null.", index));
   Node *node = search_shards[index]->find_node(id);
   return node;
@@ -2164,8 +2168,8 @@ void string_vector_2_string(std::vector::iterator strs_begin,
 }

 void string_vector_2_string(
-    std::vector<paddle::string::str_ptr>::iterator strs_begin,
-    std::vector<paddle::string::str_ptr>::iterator strs_end,
+    std::vector<::paddle::string::str_ptr>::iterator strs_begin,
+    std::vector<::paddle::string::str_ptr>::iterator strs_end,
     char delim,
     std::string *output) {
   size_t i = 0;
@@ -2184,19 +2188,19 @@ int GraphTable::parse_feature(int idx,
                               FeatureNode *node) {
   // Return (feat_id, btyes) if name are in this->feat_name, else return (-1,
   // "")
-  thread_local std::vector<paddle::string::str_ptr> fields;
+  thread_local std::vector<::paddle::string::str_ptr> fields;
   fields.clear();
   char c = slot_feature_separator_.at(0);
-  paddle::string::split_string_ptr(feat_str, len, c, &fields);
+  ::paddle::string::split_string_ptr(feat_str, len, c, &fields);

-  thread_local std::vector<paddle::string::str_ptr> fea_fields;
+  thread_local std::vector<::paddle::string::str_ptr> fea_fields;
   fea_fields.clear();
   c = feature_separator_.at(0);
-  paddle::string::split_string_ptr(fields[1].ptr,
-                                   fields[1].len,
-                                   c,
-                                   &fea_fields,
-                                   FLAGS_gpugraph_slot_feasign_max_num);
+  ::paddle::string::split_string_ptr(fields[1].ptr,
+                                     fields[1].len,
+                                     c,
+                                     &fea_fields,
+                                     FLAGS_gpugraph_slot_feasign_max_num);
   std::string name = fields[0].to_string();
   auto it = feat_id_map[idx].find(name);
   if (it != feat_id_map[idx].end()) {
@@ -2522,14 +2526,14 @@ int32_t GraphTable::Initialize(const TableParameter &config,
 }

 void GraphTable::load_node_weight(int type_id, int idx, std::string path) {
-  auto paths = paddle::string::split_string(path, ";");
+  auto paths = ::paddle::string::split_string(path, ";");
   int64_t count = 0;
   auto &weight_map = node_weight[type_id][idx];
   for (auto path : paths) {
     std::ifstream file(path);
     std::string line;
     while (std::getline(file, line)) {
-      auto values = paddle::string::split_string(line, "\t");
+      auto values = ::paddle::string::split_string(line, "\t");
       count++;
       if (values.size() < 2) continue;
       auto src_id = std::stoull(values[0]);
@@ -2546,7 +2550,7 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) {
   _db = NULL;
   search_level = graph.search_level();
   if (search_level >= 2) {
-    _db = paddle::distributed::RocksDBHandler::GetInstance();
+    _db = ::paddle::distributed::RocksDBHandler::GetInstance();
     _db->initialize("./temp_gpups_db", task_pool_size_);
   }
   // gpups_mode = true;
paddle::framework::GpuPsCommGraph make_gpu_ps_graph( + virtual ::paddle::framework::GpuPsCommGraph make_gpu_ps_graph( int idx, const std::vector<uint64_t> &ids); - virtual paddle::framework::GpuPsCommGraphFea make_gpu_ps_graph_fea( + virtual ::paddle::framework::GpuPsCommGraphFea make_gpu_ps_graph_fea( int gpu_id, std::vector<uint64_t> &node_ids, int slot_num); // NOLINT int32_t Load_to_ssd(const std::string &path, const std::string &param); int64_t load_graph_to_memory_from_ssd(int idx, @@ -786,7 +786,7 @@ class GraphTable : public Table { std::shared_ptr<pthread_rwlock_t> rw_lock; #ifdef PADDLE_WITH_HETERPS // paddle::framework::GpuPsGraphTable gpu_graph_table; - paddle::distributed::RocksDBHandler *_db; + ::paddle::distributed::RocksDBHandler *_db; // std::shared_ptr<::ThreadPool> graph_sample_pool; // std::shared_ptr graph_sampler; // REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) @@ -847,8 +847,8 @@ class BasicBfsGraphSampler : public GraphSampler { namespace std { template <> -struct hash<paddle::distributed::SampleKey> { - size_t operator()(const paddle::distributed::SampleKey &s) const { +struct hash<::paddle::distributed::SampleKey> { + size_t operator()(const ::paddle::distributed::SampleKey &s) const { return s.idx ^ s.node_key ^ s.sample_size; } }; diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc index 61e748a5413673..ca634572a462ec 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -14,10 +14,9 @@ #include "paddle/fluid/distributed/ps/table/ctr_accessor.h" -#include <gflags/gflags.h> - #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/utils/flags.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc index 35c27242fe3556..46d3ebf400d5f4 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc @@ -14,10 +14,9 @@ #include "paddle/fluid/distributed/ps/table/ctr_double_accessor.h" -#include <gflags/gflags.h> - #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/utils/flags.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc index 4824ab8946b9d0..a2943c2237cec0 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -14,10 +14,9 @@ #include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" -#include <gflags/gflags.h> - #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/utils/flags.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/depends/dense.h b/paddle/fluid/distributed/ps/table/depends/dense.h index 3f09376b42db37..272fb0b00504f1 100644 --- a/paddle/fluid/distributed/ps/table/depends/dense.h +++ b/paddle/fluid/distributed/ps/table/depends/dense.h @@ -22,8 +22,8 @@ #include #include -#include "gflags/gflags.h" #include "paddle/fluid/distributed/common/utils.h" +#include "paddle/utils/flags.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/depends/feature_value.h b/paddle/fluid/distributed/ps/table/depends/feature_value.h index c91502a8552e84..ce01c8762253cb 100644 --- a/paddle/fluid/distributed/ps/table/depends/feature_value.h +++
b/paddle/fluid/distributed/ps/table/depends/feature_value.h @@ -14,11 +14,12 @@ #pragma once -#include #include -#include "gflags/gflags.h" +#include + #include "paddle/fluid/distributed/common/chunk_allocator.h" +#include "paddle/utils/flags.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/depends/initializers.h b/paddle/fluid/distributed/ps/table/depends/initializers.h index 467227097674b1..3ebe5549de4f76 100644 --- a/paddle/fluid/distributed/ps/table/depends/initializers.h +++ b/paddle/fluid/distributed/ps/table/depends/initializers.h @@ -21,9 +21,9 @@ #include #include -#include "gflags/gflags.h" #include "paddle/fluid/operators/truncated_gaussian_random_op.h" #include "paddle/phi/core/generator.h" +#include "paddle/utils/flags.h" namespace paddle { namespace distributed { @@ -124,13 +124,13 @@ class TruncatedGaussianInitializer : public Initializer { } float GetValue() override { - paddle::operators::TruncatedNormal truncated_normal(mean_, std_); + ::paddle::operators::TruncatedNormal truncated_normal(mean_, std_); float value = truncated_normal(dist_(*random_engine_)); return value; } void GetValue(float *value, int numel) { - paddle::operators::TruncatedNormal truncated_normal(mean_, std_); + ::paddle::operators::TruncatedNormal truncated_normal(mean_, std_); for (int x = 0; x < numel; ++x) { value[x] = truncated_normal(dist_(*random_engine_)); } diff --git a/paddle/fluid/distributed/ps/table/graph/class_macro.h b/paddle/fluid/distributed/ps/table/graph/class_macro.h index bf59dbacb25370..895e59d09af62a 100644 --- a/paddle/fluid/distributed/ps/table/graph/class_macro.h +++ b/paddle/fluid/distributed/ps/table/graph/class_macro.h @@ -36,4 +36,4 @@ #define DECLARE_11_FRIEND_CLASS(a, ...) \ DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_10_FRIEND_CLASS(__VA_ARGS__) #define REGISTER_GRAPH_FRIEND_CLASS(n, ...) 
\ - DECLARE_##n##_FRIEND_CLASS(__VA_ARGS__) + PD_DECLARE_##n##_FRIEND_CLASS(__VA_ARGS__) diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index 5ee23010b526e0..dbdff119141a5c 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -26,16 +26,18 @@ // #include "boost/lexical_cast.hpp" #include "paddle/fluid/platform/enforce.h" -DEFINE_bool(pserver_print_missed_key_num_every_push, - false, - "pserver_print_missed_key_num_every_push"); -DEFINE_bool(pserver_create_value_when_push, - true, - "pserver create value when push"); -DEFINE_bool(pserver_enable_create_feasign_randomly, - false, - "pserver_enable_create_feasign_randomly"); -DEFINE_int32(pserver_table_save_max_retry, 3, "pserver_table_save_max_retry"); +PD_DEFINE_bool(pserver_print_missed_key_num_every_push, + false, + "pserver_print_missed_key_num_every_push"); +PD_DEFINE_bool(pserver_create_value_when_push, + true, + "pserver create value when push"); +PD_DEFINE_bool(pserver_enable_create_feasign_randomly, + false, + "pserver_enable_create_feasign_randomly"); +PD_DEFINE_int32(pserver_table_save_max_retry, + 3, + "pserver_table_save_max_retry"); namespace paddle { namespace distributed { @@ -333,7 +335,7 @@ int32_t MemorySparseTable::Save(const std::string &dirname, TopkCalculator tk(_real_local_shard_num, tk_size); std::string table_path = TableDir(dirname); - _afs_client.remove(paddle::string::format_string( + _afs_client.remove(::paddle::string::format_string( "%s/part-%03d-*", table_path.c_str(), _shard_idx)); std::atomic feasign_size_all{0}; @@ -350,15 +352,15 @@ int32_t MemorySparseTable::Save(const std::string &dirname, FsChannelConfig channel_config; if (_config.compress_in_save() && (save_param == 0 || save_param == 3)) { channel_config.path = - paddle::string::format_string("%s/part-%03d-%05d.gz", - table_path.c_str(), - _shard_idx, - file_start_idx + i); + ::paddle::string::format_string("%s/part-%03d-%05d.gz", + table_path.c_str(), + _shard_idx, + file_start_idx + i); } else { - channel_config.path = paddle::string::format_string("%s/part-%03d-%05d", - table_path.c_str(), - _shard_idx, - file_start_idx + i); + channel_config.path = ::paddle::string::format_string("%s/part-%03d-%05d", + table_path.c_str(), + _shard_idx, + file_start_idx + i); } channel_config.converter = _value_accesor->Converter(save_param).converter; channel_config.deconverter = @@ -385,7 +387,7 @@ int32_t MemorySparseTable::Save(const std::string &dirname, if (_value_accesor->Save(it.value().data(), save_param)) { std::string format_value = _value_accesor->ParseToString( it.value().data(), it.value().size()); - if (0 != write_channel->write_line(paddle::string::format_string( + if (0 != write_channel->write_line(::paddle::string::format_string( "%lu %s", it.key(), format_value.c_str()))) { ++retry_num; is_write_failed = true; @@ -432,7 +434,7 @@ int32_t MemorySparseTable::SavePatch(const std::string &path, int save_param) { } size_t file_start_idx = _m_avg_local_shard_num * _shard_idx; std::string table_path = TableDir(path); - _afs_client.remove(paddle::string::format_string( + _afs_client.remove(::paddle::string::format_string( "%s/part-%03d-*", table_path.c_str(), _shard_idx)); int thread_num = _m_real_local_shard_num < 20 ? 
_m_real_local_shard_num : 20; @@ -442,10 +444,10 @@ int32_t MemorySparseTable::SavePatch(const std::string &path, int save_param) { #pragma omp parallel for schedule(dynamic) for (int i = 0; i < _m_real_local_shard_num; ++i) { FsChannelConfig channel_config; - channel_config.path = paddle::string::format_string("%s/part-%03d-%05d", - table_path.c_str(), - _shard_idx, - file_start_idx + i); + channel_config.path = ::paddle::string::format_string("%s/part-%03d-%05d", + table_path.c_str(), + _shard_idx, + file_start_idx + i); channel_config.converter = _value_accesor->Converter(save_param).converter; channel_config.deconverter = @@ -469,8 +471,9 @@ int32_t MemorySparseTable::SavePatch(const std::string &path, int save_param) { if (_value_accesor->Save(it.value().data(), save_param)) { std::string format_value = _value_accesor->ParseToString( it.value().data(), it.value().size()); - if (0 != write_channel->write_line(paddle::string::format_string( - "%lu %s", it.key(), format_value.c_str()))) { + if (0 != + write_channel->write_line(::paddle::string::format_string( + "%lu %s", it.key(), format_value.c_str()))) { ++retry_num; is_write_failed = true; LOG(ERROR) << "MemorySparseTable save failed, retry it! path:" @@ -503,10 +506,10 @@ int32_t MemorySparseTable::SavePatch(const std::string &path, int save_param) { feasign_size_all += feasign_size; } LOG(INFO) << "MemorySparseTable save patch success, path:" - << paddle::string::format_string("%s/%03d/part-%03d-", - path.c_str(), - _config.table_id(), - _shard_idx) + << ::paddle::string::format_string("%s/%03d/part-%03d-", + path.c_str(), + _config.table_id(), + _shard_idx) << " from " << file_start_idx << " to " << file_start_idx + _m_real_local_shard_num - 1 << ", feasign size: " << feasign_size_all; @@ -519,7 +522,7 @@ int64_t MemorySparseTable::CacheShuffle( double cache_threshold, std::function( int msg_type, int to_pserver_id, std::string &msg)> send_msg_func, - paddle::framework::Channel> + ::paddle::framework::Channel> &shuffled_channel, const std::vector &table_ptrs) { LOG(INFO) << "cache shuffle with cache threshold: " << cache_threshold; @@ -536,24 +539,24 @@ int64_t MemorySparseTable::CacheShuffle( int thread_num = _real_local_shard_num < 20 ? 
_real_local_shard_num : 20; std::vector< - paddle::framework::ChannelWriter>> + ::paddle::framework::ChannelWriter>> writers(_real_local_shard_num); std::vector>> datas( _real_local_shard_num); int feasign_size = 0; - std::vector>> + std::vector<::paddle::framework::Channel>> tmp_channels; for (int i = 0; i < _real_local_shard_num; ++i) { tmp_channels.push_back( - paddle::framework::MakeChannel>()); + ::paddle::framework::MakeChannel>()); } omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) for (int i = 0; i < _real_local_shard_num; ++i) { - paddle::framework::ChannelWriter> &writer = - writers[i]; + ::paddle::framework::ChannelWriter> + &writer = writers[i]; writer.Reset(tmp_channels[i].get()); for (auto table_ptr : table_ptrs) { @@ -579,15 +582,15 @@ int64_t MemorySparseTable::CacheShuffle( // shard num: " << _real_local_shard_num; std::vector> local_datas; for (int idx_shard = 0; idx_shard < _real_local_shard_num; ++idx_shard) { - paddle::framework::ChannelWriter> &writer = - writers[idx_shard]; + ::paddle::framework::ChannelWriter> + &writer = writers[idx_shard]; auto channel = writer.channel(); std::vector> &data = datas[idx_shard]; - std::vector ars(shuffle_node_num); + std::vector<::paddle::framework::BinaryArchive> ars(shuffle_node_num); while (channel->Read(data)) { for (auto &t : data) { auto pserver_id = - paddle::distributed::local_random_engine()() % shuffle_node_num; + ::paddle::distributed::local_random_engine()() % shuffle_node_num; if (pserver_id != _shard_idx) { ars[pserver_id] << t; } else { @@ -618,7 +621,7 @@ int64_t MemorySparseTable::CacheShuffle( t.wait(); } ars.clear(); - ars = std::vector(shuffle_node_num); + ars = std::vector<::paddle::framework::BinaryArchive>(shuffle_node_num); data = std::vector>(); } } @@ -629,20 +632,20 @@ int64_t MemorySparseTable::CacheShuffle( int32_t MemorySparseTable::SaveCache( const std::string &path, const std::string ¶m, - paddle::framework::Channel> + ::paddle::framework::Channel> &shuffled_channel) { if (_shard_idx >= _config.sparse_table_cache_file_num()) { return 0; } int save_param = atoi(param.c_str()); // batch_model:0 xbox:1 - std::string table_path = paddle::string::format_string( + std::string table_path = ::paddle::string::format_string( "%s/%03d_cache/", path.c_str(), _config.table_id()); - _afs_client.remove(paddle::string::format_string( + _afs_client.remove(::paddle::string::format_string( "%s/part-%03d", table_path.c_str(), _shard_idx)); uint32_t feasign_size = 0; FsChannelConfig channel_config; // not compress cache model - channel_config.path = paddle::string::format_string( + channel_config.path = ::paddle::string::format_string( "%s/part-%03d", table_path.c_str(), _shard_idx); channel_config.converter = _value_accesor->Converter(save_param).converter; channel_config.deconverter = @@ -654,7 +657,7 @@ int32_t MemorySparseTable::SaveCache( while (shuffled_channel->Read(data)) { for (auto &t : data) { ++feasign_size; - if (0 != write_channel->write_line(paddle::string::format_string( + if (0 != write_channel->write_line(::paddle::string::format_string( "%lu %s", t.first, t.second.c_str()))) { LOG(ERROR) << "Cache Table save failed, " "path:" diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.cc b/paddle/fluid/distributed/ps/table/sparse_accessor.cc index afa94703233cf0..77460a8d17e47b 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.cc +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.cc @@ -14,10 +14,9 @@ #include 
"paddle/fluid/distributed/ps/table/sparse_accessor.h" -#include - #include "glog/logging.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/utils/flags.h" namespace paddle { namespace distributed { @@ -300,7 +299,7 @@ std::string SparseAccessor::ParseToString(const float* v, int param) { int SparseAccessor::ParseFromString(const std::string& str, float* value) { _embedx_sgd_rule->InitValue(value + sparse_feature_value.EmbedxWIndex(), value + sparse_feature_value.EmbedxG2SumIndex()); - auto ret = paddle::string::str_to_float(str.data(), value); + auto ret = ::paddle::string::str_to_float(str.data(), value); CHECK(ret >= 6) << "expect more than 6 real:" << ret; return ret; } diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc index 6ab4506d29e4c6..0c66e9d407aa4d 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc @@ -14,11 +14,11 @@ #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" -#include - #include "glog/logging.h" -DEFINE_bool(enable_show_scale_gradient, true, "enable show scale gradient"); +#include "paddle/utils/flags.h" + +PD_DEFINE_bool(enable_show_scale_gradient, true, "enable show scale gradient"); namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index 7d96e0f49d1adf..bb6de81cbb357e 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -20,11 +20,11 @@ #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/flags.h" #include "paddle/utils/string/string_helper.h" -DECLARE_bool(pserver_print_missed_key_num_every_push); -DECLARE_bool(pserver_create_value_when_push); -DECLARE_bool(pserver_enable_create_feasign_randomly); -DEFINE_bool(pserver_open_strict_check, false, "pserver_open_strict_check"); -DEFINE_int32(pserver_load_batch_size, 5000, "load batch size for ssd"); +PD_DECLARE_bool(pserver_print_missed_key_num_every_push); +PD_DECLARE_bool(pserver_create_value_when_push); +PD_DECLARE_bool(pserver_enable_create_feasign_randomly); +PD_DEFINE_bool(pserver_open_strict_check, false, "pserver_open_strict_check"); +PD_DEFINE_int32(pserver_load_batch_size, 5000, "load batch size for ssd"); PADDLE_DEFINE_EXPORTED_string(rocksdb_path, "database", "path of sparse table rocksdb file"); @@ -34,7 +34,7 @@ namespace distributed { int32_t SSDSparseTable::Initialize() { MemorySparseTable::Initialize(); - _db = paddle::distributed::RocksDBHandler::GetInstance(); + _db = ::paddle::distributed::RocksDBHandler::GetInstance(); _db->initialize(FLAGS_rocksdb_path, _real_local_shard_num); VLOG(0) << "initalize SSDSparseTable succ"; VLOG(0) << "SSD FLAGS_pserver_print_missed_key_num_every_push:" @@ -135,7 +135,7 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, } else { data_size = tmp_string.size() / sizeof(float); memcpy(data_buffer_ptr, - paddle::string::str_to_float(tmp_string), + ::paddle::string::str_to_float(tmp_string), data_size * sizeof(float)); // from rocksdb to mem auto& feature_value = local_shard[key]; @@ -239,7 +239,7 @@ int32_t SSDSparseTable::PullSparsePtr(int shard_id, auto& feature_value = local_shard[cur_key]; feature_value.resize(data_size); memcpy(const_cast(feature_value.data()), - paddle::string::str_to_float( + ::paddle::string::str_to_float( cur_ctx->batch_values[idx].data()), data_size * 
sizeof(float)); _db->del_data(shard_id, @@ -302,7 +302,7 @@ int32_t SSDSparseTable::PullSparsePtr(int shard_id, feature_value.resize(data_size); memcpy( const_cast(feature_value.data()), - paddle::string::str_to_float(cur_ctx->batch_values[idx].data()), + ::paddle::string::str_to_float(cur_ctx->batch_values[idx].data()), data_size * sizeof(float)); _db->del_data( shard_id, reinterpret_cast(&cur_key), sizeof(uint64_t)); @@ -529,7 +529,7 @@ int32_t SSDSparseTable::Shrink(const std::string& param) { auto* it = _db->get_iterator(i); for (it->SeekToFirst(); it->Valid(); it->Next()) { if (_value_accesor->Shrink( - paddle::string::str_to_float(it->value().data()))) { + ::paddle::string::str_to_float(it->value().data()))) { _db->del_data(i, it->key().data(), it->key().size()); ssd_count++; } else { @@ -627,7 +627,7 @@ int32_t SSDSparseTable::SaveWithString(const std::string& path, VLOG(0) << "TopkCalculator top n:" << _cache_tk_size; size_t file_start_idx = _avg_local_shard_num * _shard_idx; std::string table_path = TableDir(path); - _afs_client.remove(paddle::string::format_string( + _afs_client.remove(::paddle::string::format_string( "%s/part-%03d-*", table_path.c_str(), _shard_idx)); #ifdef PADDLE_WITH_GPU_GRAPH int thread_num = _real_local_shard_num; @@ -640,12 +640,11 @@ int32_t SSDSparseTable::SaveWithString(const std::string& path, // feasign_size = 0; std::vector< - paddle::framework::Channel>>> + ::paddle::framework::Channel>>> fs_channel; for (int i = 0; i < _real_local_shard_num; i++) { - fs_channel.push_back( - paddle::framework::MakeChannel>>( - 10240)); + fs_channel.push_back(::paddle::framework::MakeChannel< + std::pair>>(10240)); } std::vector threads; threads.resize(_real_local_shard_num); @@ -659,29 +658,29 @@ int32_t SSDSparseTable::SaveWithString(const std::string& path, FsChannelConfig channel_config; if (_config.compress_in_save() && (save_param == 0 || save_param == 3)) { channel_config.path = - paddle::string::format_string("%s/part-%03d-%05d.gz", - table_path.c_str(), - _shard_idx, - file_start_idx + file_num); + ::paddle::string::format_string("%s/part-%03d-%05d.gz", + table_path.c_str(), + _shard_idx, + file_start_idx + file_num); } else { channel_config.path = - paddle::string::format_string("%s/part-%03d-%05d", - table_path.c_str(), - _shard_idx, - file_start_idx + file_num); + ::paddle::string::format_string("%s/part-%03d-%05d", + table_path.c_str(), + _shard_idx, + file_start_idx + file_num); } channel_config.converter = _value_accesor->Converter(save_param).converter; channel_config.deconverter = _value_accesor->Converter(save_param).deconverter; auto write_channel = _afs_client.open_w(channel_config, 1024 * 1024 * 40, &err_no); - paddle::framework::ChannelReader>> + ::paddle::framework::ChannelReader>> reader(fs_channel[file_num].get()); std::pair> out_str; while (reader >> out_str) { std::string format_value = _value_accesor->ParseToString( out_str.second.data(), out_str.second.size()); - if (0 != write_channel->write_line(paddle::string::format_string( + if (0 != write_channel->write_line(::paddle::string::format_string( "%lu %s", out_str.first, format_value.c_str()))) { LOG(FATAL) << "SSDSparseTable save failed, retry it! 
path:" << channel_config.path; @@ -693,8 +692,8 @@ int32_t SSDSparseTable::SaveWithString(const std::string& path, threads[i] = std::thread(save_func, i); } - std::vector< - paddle::framework::ChannelWriter>>> + std::vector<::paddle::framework::ChannelWriter< + std::pair>>> writers(_real_local_shard_num); omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) @@ -726,14 +725,14 @@ int32_t SSDSparseTable::SaveWithString(const std::string& path, auto* it = _db->get_iterator(i); for (it->SeekToFirst(); it->Valid(); it->Next()) { bool need_save = _value_accesor->Save( - paddle::string::str_to_float(it->value().data()), save_param); + ::paddle::string::str_to_float(it->value().data()), save_param); _value_accesor->UpdateStatAfterSave( - paddle::string::str_to_float(it->value().data()), save_param); + ::paddle::string::str_to_float(it->value().data()), save_param); if (need_save) { std::vector feature_value; feature_value.resize(it->value().size() / sizeof(float)); memcpy(const_cast(feature_value.data()), - paddle::string::str_to_float(it->value().data()), + ::paddle::string::str_to_float(it->value().data()), it->value().size()); writer << std::make_pair(*(reinterpret_cast( const_cast(it->key().data()))), @@ -766,10 +765,10 @@ int32_t SSDSparseTable::SaveWithString(const std::string& path, } VLOG(0) << "SSDSparseTable save success, feasign size:" << feasign_size_all << ", path:" - << paddle::string::format_string("%s/%03d/part-%03d-", - path.c_str(), - _config.table_id(), - _shard_idx) + << ::paddle::string::format_string("%s/%03d/part-%03d-", + path.c_str(), + _config.table_id(), + _shard_idx) << " from " << file_start_idx << " to " << file_start_idx + _real_local_shard_num - 1; _local_show_threshold = tk.top(); @@ -800,7 +799,7 @@ int32_t SSDSparseTable::SaveWithStringMultiOutput(const std::string& path, VLOG(0) << "TopkCalculator top n:" << _cache_tk_size; size_t file_start_idx = _avg_local_shard_num * _shard_idx; std::string table_path = TableDir(path); - _afs_client.remove(paddle::string::format_string( + _afs_client.remove(::paddle::string::format_string( "%s/part-%03d-*", table_path.c_str(), _shard_idx)); #ifdef PADDLE_WITH_GPU_GRAPH int thread_num = _real_local_shard_num; @@ -809,17 +808,17 @@ int32_t SSDSparseTable::SaveWithStringMultiOutput(const std::string& path, #endif std::atomic feasign_size_all{0}; - std::vector>> + std::vector<::paddle::framework::Channel>> busy_channel; - std::vector>> + std::vector<::paddle::framework::Channel>> free_channel; std::vector threads; for (int i = 0; i < _real_local_shard_num; i++) { busy_channel.push_back( - paddle::framework::MakeChannel>()); + ::paddle::framework::MakeChannel>()); free_channel.push_back( - paddle::framework::MakeChannel>()); + ::paddle::framework::MakeChannel>()); } threads.resize(_real_local_shard_num); @@ -848,14 +847,14 @@ int32_t SSDSparseTable::SaveWithStringMultiOutput(const std::string& path, int split_num) { if (compress && (save_param == 0 || save_param == 3)) { // return - // paddle::string::format_string("%s/part-%03d-%05d-%03d-%03d.gz", + // ::paddle::string::format_string("%s/part-%03d-%05d-%03d-%03d.gz", // table_path, node_num, shard_num, part_num, split_num); - return paddle::string::format_string( + return ::paddle::string::format_string( "%s/part-%05d-%03d.gz", table_path, shard_num, split_num); } else { - // return paddle::string::format_string("%s/part-%03d-%05d-%03d-%03d", + // return ::paddle::string::format_string("%s/part-%03d-%05d-%03d-%03d", // table_path, node_num, shard_num, 
part_num, split_num); - return paddle::string::format_string( + return ::paddle::string::format_string( "%s/part-%05d-%03d", table_path, shard_num, split_num); } }; @@ -899,7 +898,7 @@ int32_t SSDSparseTable::SaveWithStringMultiOutput(const std::string& path, int dim = len / sizeof(float); std::string format_value = _value_accesor->ParseToString(value, dim); - if (0 != write_channel->write_line(paddle::string::format_string( + if (0 != write_channel->write_line(::paddle::string::format_string( "%lu %s", k, format_value.c_str()))) { VLOG(0) << "SSDSparseTable save failed, retry it! path:" << channel_config.path; @@ -985,9 +984,9 @@ int32_t SSDSparseTable::SaveWithStringMultiOutput(const std::string& path, auto* it = _db->get_iterator(i); for (it->SeekToFirst(); it->Valid(); it->Next()) { bool need_save = _value_accesor->Save( - paddle::string::str_to_float(it->value().data()), save_param); + ::paddle::string::str_to_float(it->value().data()), save_param); _value_accesor->UpdateStatAfterSave( - paddle::string::str_to_float(it->value().data()), save_param); + ::paddle::string::str_to_float(it->value().data()), save_param); if (need_save) { uint32_t len = sizeof(uint64_t) + it->value().size() + sizeof(uint32_t); @@ -1052,10 +1051,10 @@ int32_t SSDSparseTable::SaveWithStringMultiOutput(const std::string& path, } VLOG(0) << "DownpourSparseSSDTable save success, feasign size:" << feasign_size_all << " ,path:" - << paddle::string::format_string("%s/%03d/part-%03d-", - path.c_str(), - _config.table_id(), - _shard_idx) + << ::paddle::string::format_string("%s/%03d/part-%03d-", + path.c_str(), + _config.table_id(), + _shard_idx) << " from " << file_start_idx << " to " << file_start_idx + _real_local_shard_num - 1; if (_config.enable_sparse_table_cache()) { @@ -1085,7 +1084,7 @@ int32_t SSDSparseTable::SaveWithBinary(const std::string& path, VLOG(0) << "TopkCalculator top n:" << _cache_tk_size; size_t file_start_idx = _avg_local_shard_num * _shard_idx; std::string table_path = TableDir(path); - _afs_client.remove(paddle::string::format_string( + _afs_client.remove(::paddle::string::format_string( "%s/part-%03d-*", table_path.c_str(), _shard_idx)); #ifdef PADDLE_WITH_GPU_GRAPH int thread_num = _real_local_shard_num; @@ -1094,17 +1093,17 @@ int32_t SSDSparseTable::SaveWithBinary(const std::string& path, #endif std::atomic feasign_size_all{0}; - std::vector>> + std::vector<::paddle::framework::Channel>> busy_channel; - std::vector>> + std::vector<::paddle::framework::Channel>> free_channel; std::vector threads; for (int i = 0; i < _real_local_shard_num; i++) { busy_channel.push_back( - paddle::framework::MakeChannel>()); + ::paddle::framework::MakeChannel>()); free_channel.push_back( - paddle::framework::MakeChannel>()); + ::paddle::framework::MakeChannel>()); } threads.resize(_real_local_shard_num); @@ -1132,19 +1131,19 @@ int32_t SSDSparseTable::SaveWithBinary(const std::string& path, int part_num, int split_num) { if (compress && (save_param == 0 || save_param == 3)) { - return paddle::string::format_string("%s/part-%03d-%05d-%03d-%03d.gz", - table_path, - node_num, - shard_num, - part_num, - split_num); + return ::paddle::string::format_string("%s/part-%03d-%05d-%03d-%03d.gz", + table_path, + node_num, + shard_num, + part_num, + split_num); } else { - return paddle::string::format_string("%s/part-%03d-%05d-%03d-%03d", - table_path, - node_num, - shard_num, - part_num, - split_num); + return ::paddle::string::format_string("%s/part-%03d-%05d-%03d-%03d", + table_path, + node_num, + shard_num, + 
part_num, + split_num); } }; std::shared_ptr region = nullptr; @@ -1206,7 +1205,7 @@ int32_t SSDSparseTable::SaveWithBinary(const std::string& path, int dim = len / sizeof(float); std::string format_value = _value_accesor->ParseToString(value, dim); - if (0 != write_channel->write_line(paddle::string::format_string( + if (0 != write_channel->write_line(::paddle::string::format_string( "%lu %s", k, format_value.c_str()))) { LOG(FATAL) << "SSDSparseTable save failed, retry it! path:" << channel_config.path; @@ -1277,9 +1276,9 @@ int32_t SSDSparseTable::SaveWithBinary(const std::string& path, auto* it = _db->get_iterator(i); for (it->SeekToFirst(); it->Valid(); it->Next()) { bool need_save = _value_accesor->Save( - paddle::string::str_to_float(it->value().data()), save_param); + ::paddle::string::str_to_float(it->value().data()), save_param); _value_accesor->UpdateStatAfterSave( - paddle::string::str_to_float(it->value().data()), save_param); + ::paddle::string::str_to_float(it->value().data()), save_param); if (need_save) { uint32_t len = sizeof(uint64_t) + it->value().size() + sizeof(uint32_t); @@ -1344,10 +1343,10 @@ int32_t SSDSparseTable::SaveWithBinary(const std::string& path, } VLOG(0) << "DownpourSparseSSDTable save success, feasign size:" << feasign_size_all << " ,path:" - << paddle::string::format_string("%s/%03d/part-%03d-", - path.c_str(), - _config.table_id(), - _shard_idx) + << ::paddle::string::format_string("%s/%03d/part-%03d-", + path.c_str(), + _config.table_id(), + _shard_idx) << " from " << file_start_idx << " to " << file_start_idx + _real_local_shard_num - 1; if (_config.enable_sparse_table_cache()) { @@ -1364,7 +1363,7 @@ int64_t SSDSparseTable::CacheShuffle( double cache_threshold, std::function( int msg_type, int to_pserver_id, std::string& msg)> send_msg_func, - paddle::framework::Channel>& + ::paddle::framework::Channel>& shuffled_channel, const std::vector& table_ptrs) { LOG(INFO) << "cache shuffle with cache threshold: " << cache_threshold @@ -1381,27 +1380,27 @@ int64_t SSDSparseTable::CacheShuffle( int thread_num = _real_local_shard_num < 20 ? 
_real_local_shard_num : 20; std::vector< - paddle::framework::ChannelWriter>> + ::paddle::framework::ChannelWriter>> writers(_real_local_shard_num); std::vector>> datas( _real_local_shard_num); int feasign_size = 0; - std::vector>> + std::vector<::paddle::framework::Channel>> tmp_channels; for (int i = 0; i < _real_local_shard_num; ++i) { tmp_channels.push_back( - paddle::framework::MakeChannel>()); + ::paddle::framework::MakeChannel>()); } omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) for (int i = 0; i < _real_local_shard_num; ++i) { - paddle::framework::ChannelWriter>& writer = - writers[i]; - // std::shared_ptr>& + writer = writers[i]; + // std::shared_ptr<::paddle::framework::ChannelObject>> tmp_chan = - // paddle::framework::MakeChannel>(); writer.Reset(tmp_channels[i].get()); @@ -1426,15 +1425,15 @@ int64_t SSDSparseTable::CacheShuffle( << _real_local_shard_num; std::vector> local_datas; for (int idx_shard = 0; idx_shard < _real_local_shard_num; ++idx_shard) { - paddle::framework::ChannelWriter>& writer = - writers[idx_shard]; + ::paddle::framework::ChannelWriter>& + writer = writers[idx_shard]; auto channel = writer.channel(); std::vector>& data = datas[idx_shard]; - std::vector ars(shuffle_node_num); + std::vector<::paddle::framework::BinaryArchive> ars(shuffle_node_num); while (channel->Read(data)) { for (auto& t : data) { auto pserver_id = - paddle::distributed::local_random_engine()() % shuffle_node_num; + ::paddle::distributed::local_random_engine()() % shuffle_node_num; if (pserver_id != _shard_idx) { ars[pserver_id] << t; } else { @@ -1465,7 +1464,7 @@ int64_t SSDSparseTable::CacheShuffle( t.wait(); } ars.clear(); - ars = std::vector(shuffle_node_num); + ars = std::vector<::paddle::framework::BinaryArchive>(shuffle_node_num); data = std::vector>(); } } @@ -1477,20 +1476,20 @@ int64_t SSDSparseTable::CacheShuffle( int32_t SSDSparseTable::SaveCache( const std::string& path, const std::string& param, - paddle::framework::Channel>& + ::paddle::framework::Channel>& shuffled_channel) { if (_shard_idx >= _config.sparse_table_cache_file_num()) { return 0; } int save_param = atoi(param.c_str()); // batch_model:0 xbox:1 - std::string table_path = paddle::string::format_string( + std::string table_path = ::paddle::string::format_string( "%s/%03d_cache/", path.c_str(), _config.table_id()); - _afs_client.remove(paddle::string::format_string( + _afs_client.remove(::paddle::string::format_string( "%s/part-%03d", table_path.c_str(), _shard_idx)); uint32_t feasign_size = 0; FsChannelConfig channel_config; // not compress cache model - channel_config.path = paddle::string::format_string( + channel_config.path = ::paddle::string::format_string( "%s/part-%03d", table_path.c_str(), _shard_idx); channel_config.converter = _value_accesor->Converter(save_param).converter; channel_config.deconverter = @@ -1502,7 +1501,7 @@ int32_t SSDSparseTable::SaveCache( while (shuffled_channel->Read(data)) { for (auto& t : data) { ++feasign_size; - if (0 != write_channel->write_line(paddle::string::format_string( + if (0 != write_channel->write_line(::paddle::string::format_string( "%lu %s", t.first, t.second.c_str()))) { LOG(ERROR) << "Cache Table save failed, " "path:" @@ -1580,7 +1579,7 @@ int32_t SSDSparseTable::LoadWithString( #endif for (int i = 0; i < _real_local_shard_num; i++) { - _fs_channel.push_back(paddle::framework::MakeChannel(30000)); + _fs_channel.push_back(::paddle::framework::MakeChannel(30000)); } std::vector threads; @@ -1598,7 +1597,7 @@ int32_t 
SSDSparseTable::LoadWithString( std::string line_data; auto read_channel = _afs_client.open_r(channel_config, 0, &err_no); - paddle::framework::ChannelWriter writer( + ::paddle::framework::ChannelWriter writer( _fs_channel[file_num].get()); while (read_channel->read_line(line_data) == 0 && line_data.size() > 1) { writer << line_data; @@ -1638,7 +1637,8 @@ int32_t SSDSparseTable::LoadWithString( uint64_t filter_time = 0; uint64_t filter_begin = 0; - paddle::framework::ChannelReader reader(_fs_channel[i].get()); + ::paddle::framework::ChannelReader reader( + _fs_channel[i].get()); while (reader >> line_data) { uint64_t key = std::strtoul(line_data.data(), &end, 10); @@ -1724,8 +1724,8 @@ int32_t SSDSparseTable::LoadWithBinary(const std::string& path, int param) { _value_accesor->GetAccessorInfo().mf_size / sizeof(float); // task pool _file_num_one_shard default 7 auto task_pool = std::make_shared<::ThreadPool>(_real_local_shard_num * 7); - auto filelists = _afs_client.list( - paddle::string::format_string("%s/part-%03d*", path.c_str(), _shard_idx)); + auto filelists = _afs_client.list(::paddle::string::format_string( + "%s/part-%03d*", path.c_str(), _shard_idx)); // #pragma omp parallel for schedule(dynamic) std::vector> tasks; @@ -1736,7 +1736,7 @@ int32_t SSDSparseTable::LoadWithBinary(const std::string& path, int param) { // _value_accesor->Converter(param).deconverter; for (auto& filename : filelists) { std::vector split_filename_string = - paddle::string::split_string(filename, "-"); + ::paddle::string::split_string(filename, "-"); int file_split_idx = atoi(split_filename_string[split_filename_string.size() - 1].c_str()); int file_shard_idx = @@ -1798,10 +1798,10 @@ int32_t SSDSparseTable::LoadWithBinary(const std::string& path, int param) { int use_sst = 0; if (file_split_idx != 0) { std::string path = - paddle::string::format_string("%s_%d/part-%03d.sst", - FLAGS_rocksdb_path.c_str(), - shard_idx, - file_split_idx); + ::paddle::string::format_string("%s_%d/part-%03d.sst", + FLAGS_rocksdb_path.c_str(), + shard_idx, + file_split_idx); rocksdb::Status status = sst_writer.Open(path); if (!status.ok()) { VLOG(0) << "sst writer open " << path << "failed"; @@ -1925,7 +1925,7 @@ int32_t SSDSparseTable::LoadWithBinary(const std::string& path, int param) { } tasks.clear(); for (int shard_idx = 0; shard_idx < _real_local_shard_num; shard_idx++) { - auto sst_filelist = _afs_client.list(paddle::string::format_string( + auto sst_filelist = _afs_client.list(::paddle::string::format_string( "%s_%d/part-*", FLAGS_rocksdb_path.c_str(), shard_idx)); if (!sst_filelist.empty()) { int ret = _db->ingest_externel_file(shard_idx, sst_filelist); @@ -2034,10 +2034,10 @@ int32_t SSDSparseTable::CacheTable(uint16_t pass_id) { if (!datas.empty()) { rocksdb::SstFileWriter sst_writer(rocksdb::EnvOptions(), options); std::string filename = - paddle::string::format_string("%s_%d/cache-%05d.sst", - FLAGS_rocksdb_path.c_str(), - shard_id, - cache_table_count); + ::paddle::string::format_string("%s_%d/cache-%05d.sst", + FLAGS_rocksdb_path.c_str(), + shard_id, + cache_table_count); rocksdb::Status status = sst_writer.Open(filename); if (!status.ok()) { VLOG(0) << "sst writer open " << filename << "failed" diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h index e5561c5e42b991..c003061d815162 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h @@ -14,9 +14,9 @@ #pragma once 
-#include "gflags/gflags.h" #include "paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h" #include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" +#include "paddle/utils/flags.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h index b64e05e3b0a111..dc44831e891ca1 100644 --- a/paddle/fluid/distributed/ps/table/table.h +++ b/paddle/fluid/distributed/ps/table/table.h @@ -118,7 +118,7 @@ class Table { virtual int32_t SaveCache( const std::string &path UNUSED, const std::string ¶m UNUSED, - paddle::framework::Channel> + ::paddle::framework::Channel> &shuffled_channel UNUSED) { return 0; } @@ -130,7 +130,7 @@ class Table { std::function( int msg_type, int to_pserver_id, std::string &msg)> // NOLINT send_msg_func UNUSED, - paddle::framework::Channel> + ::paddle::framework::Channel> &shuffled_channel UNUSED, const std::vector
&table_ptrs UNUSED) { return 0; @@ -161,7 +161,7 @@ class Table { virtual int32_t InitializeAccessor(); virtual int32_t InitializeShard() = 0; virtual std::string TableDir(const std::string &model_dir) { - return paddle::string::format_string( + return ::paddle::string::format_string( "%s/%03d/", model_dir.c_str(), _config.table_id()); } diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index d25ad89d504fad..87eb250545a5bf 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -30,8 +30,10 @@ const uint32_t MAX_FEASIGN_NUM = 1024 * 100 * 100; std::shared_ptr FleetWrapper::s_instance_ = NULL; bool FleetWrapper::is_initialized_ = false; -std::shared_ptr FleetWrapper::pserver_ptr_ = NULL; -std::shared_ptr FleetWrapper::worker_ptr_ = NULL; +std::shared_ptr<::paddle::distributed::PSCore> FleetWrapper::pserver_ptr_ = + NULL; +std::shared_ptr<::paddle::distributed::PSClient> FleetWrapper::worker_ptr_ = + NULL; int FleetWrapper::RegisterHeterCallback(HeterCallBackFunc handler) { VLOG(0) << "RegisterHeterCallback support later"; @@ -76,8 +78,8 @@ void FleetWrapper::InitServer( const std::vector& server_sub_program) { if (!is_initialized_) { VLOG(3) << "Going to init server"; - pserver_ptr_ = std::shared_ptr( - new paddle::distributed::PSCore()); + pserver_ptr_ = std::shared_ptr<::paddle::distributed::PSCore>( + new ::paddle::distributed::PSCore()); pserver_ptr_->InitServer(dist_desc, &host_sign_list, host_sign_list.size(), @@ -92,7 +94,7 @@ void FleetWrapper::InitServer( void FleetWrapper::InitGFlag(const std::string& gflags) { VLOG(3) << "Init With Gflags:" << gflags; - std::vector flags = paddle::string::split_string(gflags); + std::vector flags = ::paddle::string::split_string(gflags); if (flags.empty()) { flags.push_back("-max_body_size=314217728"); flags.push_back("-bthread_concurrency=40"); @@ -107,7 +109,7 @@ void FleetWrapper::InitGFlag(const std::string& gflags) { } int params_cnt = flags.size(); char** params_ptr = &(flags_ptr[0]); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(¶ms_cnt, ¶ms_ptr, true); + ::paddle::flags::ParseCommandLineFlags(¶ms_cnt, ¶ms_ptr); } void FleetWrapper::InitWorker(const std::string& dist_desc, @@ -116,17 +118,17 @@ void FleetWrapper::InitWorker(const std::string& dist_desc, if (!is_initialized_) { // not used, just for psclient's init // TODO(zhaocaibei123): remove this later - std::map> + std::map> dense_pull_regions; if (worker_ptr_.get() == nullptr) { - paddle::distributed::PSParameter ps_param; + ::paddle::distributed::PSParameter ps_param; google::protobuf::TextFormat::ParseFromString(dist_desc, &ps_param); InitGFlag(ps_param.init_gflags()); int servers = host_sign_list.size(); ps_env_.SetPsServers(&host_sign_list, servers); - worker_ptr_ = std::shared_ptr( - paddle::distributed::PSClientFactory::Create(ps_param)); + worker_ptr_ = std::shared_ptr<::paddle::distributed::PSClient>( + ::paddle::distributed::PSClientFactory::Create(ps_param)); worker_ptr_->Configure(ps_param, dense_pull_regions, ps_env_, index); } dist_desc_ = dist_desc; @@ -392,7 +394,7 @@ void FleetWrapper::PullDenseVarsAsync( Variable* var = scope.FindVar(varname); phi::DenseTensor* tensor = var->GetMutable(); float* w = tensor->data(); - paddle::distributed::Region reg(w, tensor->numel()); + ::paddle::distributed::Region reg(w, tensor->numel()); regions[i] = std::move(reg); } @@ -412,7 +414,7 @@ void FleetWrapper::PullDenseVarsSync( phi::DenseTensor* tensor = var->GetMutable(); if 
(!platform::is_gpu_place(tensor->place())) { float* w = tensor->data(); - paddle::distributed::Region reg(w, tensor->numel()); + ::paddle::distributed::Region reg(w, tensor->numel()); regions.emplace_back(std::move(reg)); } } @@ -425,14 +427,14 @@ void FleetWrapper::PushDenseParamSync( const uint64_t table_id, const std::vector& var_names) { auto place = platform::CPUPlace(); - std::vector regions; + std::vector<::paddle::distributed::Region> regions; for (auto& t : var_names) { Variable* var = scope.FindVar(t); CHECK(var != nullptr) << "var[" << t << "] not found"; phi::DenseTensor* tensor = var->GetMutable(); if (!platform::is_gpu_place(tensor->place())) { float* g = tensor->mutable_data(place); - paddle::distributed::Region reg(g, tensor->numel()); + ::paddle::distributed::Region reg(g, tensor->numel()); regions.emplace_back(std::move(reg)); } } @@ -456,7 +458,7 @@ void FleetWrapper::PushDenseVarsAsync( float scale_datanorm, int batch_size) { auto place = platform::CPUPlace(); - std::vector regions; + std::vector<::paddle::distributed::Region> regions; for (auto& t : var_names) { Variable* var = scope.FindVar(t); CHECK(var != nullptr) << "var[" << t << "] not found"; @@ -479,7 +481,7 @@ void FleetWrapper::PushDenseVarsAsync( } } - paddle::distributed::Region reg(g, tensor->numel()); + ::paddle::distributed::Region reg(g, tensor->numel()); regions.emplace_back(std::move(reg)); VLOG(3) << "FleetWrapper::PushDenseVarsAsync Var " << t << " talbe_id " << table_id << " Temp_data[0] " << g[0] << " Temp_data[-1] " @@ -774,7 +776,7 @@ void FleetWrapper::ShrinkDenseTable(int table_id, std::vector var_list, float decay, int emb_dim) { - std::vector regions; + std::vector<::paddle::distributed::Region> regions; for (std::string& name : var_list) { if (name.find("batch_sum") != std::string::npos) { Variable* var = scope->FindVar(name); @@ -795,14 +797,14 @@ void FleetWrapper::ShrinkDenseTable(int table_id, for (int k = 0; k < tensor->numel(); k += emb_dim) { g[k] = g[k] + g_size[k] * log(decay); } - paddle::distributed::Region reg(g, tensor->numel()); + ::paddle::distributed::Region reg(g, tensor->numel()); regions.emplace_back(std::move(reg)); } else { Variable* var = scope->FindVar(name); CHECK(var != nullptr) << "var[" << name << "] not found"; phi::DenseTensor* tensor = var->GetMutable(); float* g = tensor->data(); - paddle::distributed::Region reg(g, tensor->numel()); + ::paddle::distributed::Region reg(g, tensor->numel()); regions.emplace_back(std::move(reg)); } } diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h index 9bf6f3c84a945d..22dc0f1af724ab 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.h +++ b/paddle/fluid/distributed/ps/wrapper/fleet.h @@ -295,7 +295,7 @@ class FleetWrapper { // FleetWrapper singleton static std::shared_ptr GetInstance() { if (NULL == s_instance_) { - s_instance_.reset(new paddle::distributed::FleetWrapper()); + s_instance_.reset(new ::paddle::distributed::FleetWrapper()); } return s_instance_; } @@ -322,13 +322,13 @@ class FleetWrapper { std::string PullFlStrategy(); //********** - static std::shared_ptr pserver_ptr_; - static std::shared_ptr worker_ptr_; + static std::shared_ptr<::paddle::distributed::PSCore> pserver_ptr_; + static std::shared_ptr<::paddle::distributed::PSClient> worker_ptr_; private: static std::shared_ptr s_instance_; std::string dist_desc_; - paddle::distributed::PaddlePSEnvironment ps_env_; + ::paddle::distributed::PaddlePSEnvironment ps_env_; size_t GetAbsoluteSum(size_t start, 
size_t end, size_t level, @@ -336,7 +336,7 @@ class FleetWrapper { protected: static bool is_initialized_; - std::map<uint64_t, std::vector<paddle::distributed::Region>> regions_; + std::map<uint64_t, std::vector<::paddle::distributed::Region>> regions_; bool scale_sparse_gradient_with_batch_size_; int32_t sleep_seconds_before_fail_exit_; int client2client_request_timeout_ms_; diff --git a/paddle/fluid/distributed/rpc/CMakeLists.txt b/paddle/fluid/distributed/rpc/CMakeLists.txt index 76c6dc001104c3..4042a6fe3ccfeb 100644 --- a/paddle/fluid/distributed/rpc/CMakeLists.txt +++ b/paddle/fluid/distributed/rpc/CMakeLists.txt @@ -12,17 +12,7 @@ set_source_files_properties( set_source_files_properties(rpc_agent.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set(PADDLE_RPC_DEPS - brpc - ssl - crypto - protobuf - zlib - leveldb - snappy - phi - glog - pybind) +set(PADDLE_RPC_DEPS ${EXTERNAL_BRPC_DEPS} zlib phi pybind) proto_library(paddle_rpc_proto SRCS rpc.proto) cc_library( paddle_rpc diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc index d14832b80a1db8..7dca372a23ba02 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc @@ -83,8 +83,8 @@ paddle::Tensor add_n_ad_func(const std::vector<paddle::Tensor>& x) { egr::EagerUtils::PassStopGradient(false, out_autograd_meta); // Node Construction - auto grad_node = - std::shared_ptr<AddNGradNodeFinal>(new AddNGradNodeFinal(1, 1)); + auto grad_node = std::shared_ptr<AddNGradNodeFinal>( // NOLINT + new AddNGradNodeFinal(1, 1)); // Set forward's stack if (FLAGS_check_nan_inf) { diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index b7ca5a7c267100..5ef6fcf9c3fa1c 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -137,8 +137,8 @@ paddle::Tensor conv2d_ad_func(const paddle::Tensor& input, egr::EagerUtils::PassStopGradient(false, out_autograd_meta); // Node Construction - auto grad_node = - std::shared_ptr<Conv2dGradNodeFinal>(new Conv2dGradNodeFinal(1, 2)); + auto grad_node = std::shared_ptr<Conv2dGradNodeFinal>( // NOLINT + new Conv2dGradNodeFinal(1, 2)); // Set forward's stack if (FLAGS_check_nan_inf) { diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc index d9b34643034a45..0a72ab810fc648 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc @@ -132,8 +132,8 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x, egr::EagerUtils::PassStopGradient(false, out_autograd_meta); // Node Construction - auto grad_node = - std::shared_ptr<MultiplyGradNode>(new MultiplyGradNode(1, 2)); + auto grad_node = std::shared_ptr<MultiplyGradNode>( // NOLINT + new MultiplyGradNode(1, 2)); // Set for forward trace if (FLAGS_check_nan_inf) { grad_node->SetForwardTrace(egr::Controller::Instance().GetPythonStack()); } @@ -275,7 +275,8 @@ paddle::Tensor& multiply__ad_func(paddle::Tensor& x, // NOLINT paddle::platform::TracerEventType::OperatorInner, 1); - grad_node = std::shared_ptr<MultiplyGradNode>(new MultiplyGradNode(1, 2)); + grad_node = std::shared_ptr<MultiplyGradNode>( // NOLINT + new MultiplyGradNode(1, 2)); // Set for forward trace if (FLAGS_check_nan_inf) { grad_node->SetForwardTrace(egr::Controller::Instance().GetPythonStack()); } @@ -462,8 +463,8 @@ paddle::Tensor
multiply_ad_func(const paddle::Tensor& x, egr::EagerUtils::PassStopGradient(false, out_autograd_meta); // Node Construction - auto grad_node = - std::shared_ptr(new MultiplyGradNode(1, 2)); + auto grad_node = std::shared_ptr( // NOLINT + new MultiplyGradNode(1, 2)); // Set for forward trace if (FLAGS_check_nan_inf) { grad_node->SetForwardTrace(egr::Controller::Instance().GetPythonStack()); diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc index a5b0c4d70b07f2..79b5c60c66b19b 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc @@ -225,8 +225,8 @@ sync_batch_norm__ad_func(const paddle::Tensor& x, reserve_space_autograd_meta); // Node Construction - auto grad_node = - std::shared_ptr(new SyncBatchNormGradNode(6, 5)); + auto grad_node = std::shared_ptr( // NOLINT + new SyncBatchNormGradNode(6, 5)); // Set forward's stack if (FLAGS_check_nan_inf) { @@ -567,8 +567,8 @@ sync_batch_norm__ad_func(const paddle::Tensor& x, reserve_space_autograd_meta); // Node Construction - auto grad_node = - std::shared_ptr(new SyncBatchNormGradNode(6, 5)); + auto grad_node = std::shared_ptr( // NOLINT + new SyncBatchNormGradNode(6, 5)); egr::Controller::Instance().PushBackForceSequentialNodes(grad_node.get()); // SetAttributes if needed grad_node->SetAttributemomentum(momentum); diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc index a7d00f8df18028..438188ea4b7f6e 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc @@ -123,7 +123,7 @@ Conv2dGradNodeFinal::operator()( 1); // Node Construction - auto grad_node = std::shared_ptr( + auto grad_node = std::shared_ptr( // NOLINT new Conv2dDoubleGradNodeFinal(2, 3)); // SetAttributes if needed grad_node->SetAttributestrides(strides); diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc index f7a90c43e7d933..f3ae667777aa69 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc @@ -158,7 +158,7 @@ MultiplyGradNode::operator()( 1); // Node Construction - auto grad_node = std::shared_ptr( + auto grad_node = std::shared_ptr( // NOLINT new MultiplyDoubleGradNode(2, 3)); // SetAttributes if needed grad_node->SetAttributeaxis(axis); diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc index 77ecc8a30e19f3..0f1192ae1bd288 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc @@ -390,8 +390,9 @@ fused_attention_dygraph_function( p_autograd_CacheKVOut, p_autograd_Y); // Create GradOpNode - auto grad_node = std::shared_ptr( - new fused_attentionGradNodeCompat(20, 23)); + auto grad_node = + std::shared_ptr( // NOLINT + new fused_attentionGradNodeCompat(20, 23)); bool pre_layer_norm = false; if (attrs.count("pre_layer_norm")) { diff --git 
a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc index 4b57d2e3c5ba25..1e61714525f483 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc @@ -184,9 +184,9 @@ fused_bias_dropout_residual_layer_norm_dygraph_function( p_autograd_LnVariance, p_autograd_Y); // Create GradOpNode - auto grad_node = - std::shared_ptr( - new fused_bias_dropout_residual_layer_normGradNodeCompat(5, 5)); + auto grad_node = std::shared_ptr< // NOLINT + fused_bias_dropout_residual_layer_normGradNodeCompat>( + new fused_bias_dropout_residual_layer_normGradNodeCompat(5, 5)); // Set Attributes grad_node->SetAttrMap(std::move(attrs)); diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc index 9f13579c5aab71..e46e677c318a19 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc @@ -310,8 +310,9 @@ fused_feedforward_dygraph_function( p_autograd_Dropout1Out, p_autograd_Dropout2Out); // Create GradOpNode - auto grad_node = std::shared_ptr( - new fused_feedforwardGradNodeCompat(11, 11)); + auto grad_node = + std::shared_ptr( // NOLINT + new fused_feedforwardGradNodeCompat(11, 11)); bool pre_layer_norm = false; if (attrs.count("pre_layer_norm")) { diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc index 546b60438fedcc..8c66a106311424 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc @@ -301,8 +301,9 @@ fused_gate_attention_dygraph_function( p_autograd_GateOut, p_autograd_Out); // Create GradOpNode - auto grad_node = std::shared_ptr( - new fused_gate_attentionGradNodeCompat(9, 12)); + auto grad_node = + std::shared_ptr( // NOLINT + new fused_gate_attentionGradNodeCompat(9, 12)); bool merge_qkv = true; if (attrs.count("merge_qkv")) { diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc index 2eb7327601189d..74e10cd0685adf 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc @@ -102,8 +102,9 @@ paddle::Tensor fused_gemm_epilogue_dygraph_function( VLOG(6) << " Construct Grad for fused_gemm_epilogue "; egr::EagerUtils::PassStopGradient(false, p_autograd_Out); // Create GradOpNode - auto grad_node = std::shared_ptr( - new fused_gemm_epilogueGradNodeCompat(1, 3)); + auto grad_node = + std::shared_ptr( // NOLINT + new fused_gemm_epilogueGradNodeCompat(1, 3)); // Set Attributes grad_node->SetAttrMap(std::move(attrs)); diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 6cc98ed6fb374c..d250989199a528 100644 --- 
a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -827,9 +827,8 @@ static bool CollectGradInformationFromOpInfo( const std::string& in_name = op_proto.inputs()[0].name(); ins[in_name] = {}; for (size_t i = 0; i < NUM_CREATED_DUP_INPUTS; i++) { - ins[in_name].emplace_back(std::shared_ptr( - new paddle::imperative::VarBase("auto_" + in_name + "_" + - std::to_string(i)))); + ins[in_name].emplace_back(std::make_shared( + "auto_" + in_name + "_" + std::to_string(i))); ins[in_name][i]->SetOverridedStopGradient(false); ins[in_name][i]->MutableVar()->GetMutable(); } @@ -852,8 +851,8 @@ static bool CollectGradInformationFromOpInfo( // but we only need to identify the slot name order, // therefore fill in 1 single input VarBase is enough in this scenario - ins[in_name] = {std::shared_ptr( - new paddle::imperative::VarBase("auto_" + in_name))}; + ins[in_name] = { + std::make_shared("auto_" + in_name)}; ins[in_name][0]->SetOverridedStopGradient(false); ins[in_name][0]->MutableVar()->GetMutable(); } @@ -870,8 +869,8 @@ static bool CollectGradInformationFromOpInfo( // We always create output VarBase regardless of its dispensability. // We dont know the exact number of outputs during code generation, // however, simply identifying the slot name order would be enough - outs[out_name] = {std::shared_ptr( - new paddle::imperative::VarBase("auto_" + out_name))}; + outs[out_name] = { + std::make_shared("auto_" + out_name)}; outs[out_name][0]->SetOverridedStopGradient(false); outs[out_name][0]->MutableVar()->GetMutable(); } @@ -1179,7 +1178,7 @@ static std::string GenerateGradNodeCreationContent( const char* GRAD_OP_NODE_TEMPLATE = " auto grad_node = std::shared_ptr<%sGradNodeCompat>(new " "%sGradNodeCompat(%d, " - "%d));\n"; + "%d)); // NOLINT\n"; grad_node_creation_str += " // Create GradOpNode\n"; grad_node_creation_str += paddle::string::Sprintf(GRAD_OP_NODE_TEMPLATE, op_type, diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 9bbe26d0f8a4e1..da4a9aab538707 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -953,13 +953,13 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False): # Helper indent = GetIndent(2) - # NOTE(Aurelius74): DO NOT use make_shared here. Because some Node contains experimental::Scalar + # NOTE(Aurelius84): DO NOT use make_shared here. Because some Node contains experimental::Scalar # which contains "complex128" as data. "complex128" is memory-aligned manually. But make_shared # request MEMALIGN for allocation (Maybe). 
# See https://stackoverflow.com/questions/31228656/how-can-shared-ptr-disrupt-alignment # and https://github.com/MRtrix3/mrtrix3/issues/957 - node_construction_str = f"{indent}auto grad_node = std::shared_ptr<{grad_node_name}>(new {grad_node_name}({num_backward_inputs}, {num_backward_outputs}));" - node_assignment_str = f"{indent}grad_node = std::shared_ptr<{grad_node_name}>(new {grad_node_name}({num_backward_inputs}, {num_backward_outputs}));" + node_construction_str = f"{indent}auto grad_node = std::shared_ptr<{grad_node_name}>(new {grad_node_name}({num_backward_inputs}, {num_backward_outputs})); // NOLINT" + node_assignment_str = f"{indent}grad_node = std::shared_ptr<{grad_node_name}>(new {grad_node_name}({num_backward_inputs}, {num_backward_outputs})); // NOLINT" # SetAttributes set_attributes_list = [] diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 182c83cc37bdee..055163ed6206be 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -228,13 +228,13 @@ cc_test( set(BRPC_DEPS "") if(WITH_PSCORE) - set(BRPC_DEPS brpc ssl crypto) + set(BRPC_DEPS ${EXTERNAL_BRPC_DEPS}) endif() if(WITH_PSLIB) if(WITH_PSLIB_BRPC) set(BRPC_DEPS pslib_brpc) elseif(NOT WITH_HETERPS) - set(BRPC_DEPS brpc ssl crypto) + set(BRPC_DEPS ${EXTERNAL_BRPC_DEPS}) endif() if(WITH_ARM_BRPC) set(BRPC_DEPS arm_brpc) @@ -833,7 +833,7 @@ if(WITH_DISTRIBUTE) heter_service_proto fleet heter_server - brpc + ${${EXTERNAL_BRPC_DEPS}} phi) set(DISTRIBUTE_COMPILE_FLAGS "") if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index c5d898beba1ae2..055381f0d58aa0 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/async_executor.h" -#include "gflags/gflags.h" #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/message.h" #include "google/protobuf/text_format.h" @@ -32,6 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" +#include "paddle/utils/flags.h" // phi #include "paddle/phi/kernels/declarations.h" diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc index 10e0b76f00459d..9b892c0c1b092e 100644 --- a/paddle/fluid/framework/copy_same_tensor_test.cc +++ b/paddle/fluid/framework/copy_same_tensor_test.cc @@ -16,15 +16,15 @@ #include -#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" +#include "paddle/utils/flags.h" -DECLARE_bool(use_system_allocator); +PD_DECLARE_bool(use_system_allocator); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/bkcl_op_handle.h b/paddle/fluid/framework/details/bkcl_op_handle.h index 4ca8bf4cb58749..8a5afcf04bf9a0 100644 --- a/paddle/fluid/framework/details/bkcl_op_handle.h +++ b/paddle/fluid/framework/details/bkcl_op_handle.h @@ -24,7 +24,7 @@ #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #include "xpu/bkcl.h" -DECLARE_bool(sync_bkcl_allreduce); +PD_DECLARE_bool(sync_bkcl_allreduce); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index d295ff6ad5a685..5a6f4e6e70d4c1 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -22,10 +22,10 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" #include "paddle/phi/core/flags.h" -DECLARE_bool(convert_all_blocks); +PD_DECLARE_bool(convert_all_blocks); PHI_DECLARE_bool(use_mkldnn); #ifdef PADDLE_WITH_CINN -DECLARE_bool(use_cinn); +PD_DECLARE_bool(use_cinn); #endif namespace paddle { diff --git a/paddle/fluid/framework/details/build_strategy_test.cc b/paddle/fluid/framework/details/build_strategy_test.cc index 7ec7d93ee66109..0990f134b3e1bf 100644 --- a/paddle/fluid/framework/details/build_strategy_test.cc +++ b/paddle/fluid/framework/details/build_strategy_test.cc @@ -30,7 +30,7 @@ #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/platform/place.h" -DECLARE_bool(convert_all_blocks); +PD_DECLARE_bool(convert_all_blocks); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 66428661503120..a075b4702e946f 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -21,7 +21,7 @@ #include "paddle/phi/backends/device_memory_aligment.h" #include "paddle/phi/core/flags.h" -DEFINE_bool(skip_fused_all_reduce_check, false, ""); // NOLINT +PD_DEFINE_bool(skip_fused_all_reduce_check, false, ""); // NOLINT PHI_DECLARE_bool(allreduce_record_one_event); namespace paddle { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 50c8cc926dc7f5..f18705ef099abf 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -31,7 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/phi/core/flags.h" -DECLARE_bool(benchmark); +PD_DECLARE_bool(benchmark); PHI_DECLARE_bool(use_mkldnn); namespace paddle { diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 4bbcba2151b26c..a3fb850373a5a5 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/message.h" #include "google/protobuf/text_format.h" @@ -34,6 +33,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/pybind/pybind.h" +#include "paddle/utils/flags.h" // phi #include "paddle/phi/kernels/declarations.h" diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 7ebc58e61b588f..e2ba9c6d4f43ea 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -3,7 +3,7 @@ if(WITH_PSLIB) set(BRPC_DEPS pslib_brpc) else() if(NOT WITH_HETERPS) - set(BRPC_DEPS brpc) + set(BRPC_DEPS ${EXTERNAL_BRPC_DEPS}) endif() endif() cc_library( diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 3a514d26ddb476..01f1baf9be1919 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -16,10 +16,10 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif -#include "gflags/gflags.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/phi/core/flags.h" +#include "paddle/utils/flags.h" PHI_DECLARE_double(eager_delete_tensor_gb); PHI_DECLARE_double(memory_fraction_of_eager_deletion); diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index ec5766882e6643..fa45bbcbdd2311 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -20,9 +20,9 @@ #include // NOLINT #include -#include "gflags/gflags.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/stream_callback_manager.h" +#include "paddle/utils/flags.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 3eb2df7011c7ed..3596f4e0f0e29e 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -14,8 +14,6 @@ limitations under the License. */ #pragma once -#include - #include #include #include @@ -27,8 +25,9 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/utils/any.h" +#include "paddle/utils/flags.h" -DECLARE_bool(convert_all_blocks); +PD_DECLARE_bool(convert_all_blocks); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index aee64ba89ac0d5..87a710cd036aff 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -29,7 +29,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/collective_helper.h" #endif #include "paddle/fluid/platform/flags.h" -DECLARE_bool(convert_all_blocks); +PD_DECLARE_bool(convert_all_blocks); PADDLE_DEFINE_EXPORTED_string(print_sub_graph_dir, "", "FLAGS_print_sub_graph_dir is used " diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc index f57cdd9d9746ca..e1ed5ff5b041f5 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -14,11 +14,10 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph_to_program_pass.h" -#include - #include #include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/utils/flags.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc index e5a8c5330e15ac..07675a3f4efeb5 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc @@ -99,8 +99,8 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse( } else if (matmul_desc->Inputs().at("Y").at(0) == input_var_name) { matmul_input_name = "Y"; } else { - throw platform::errors::InvalidArgument("Unexpected input to " + - matmul_type + " encountered."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Unexpected input to %s encountered.", matmul_type)); } // Return if input of fused_matmul is already fused diff --git a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc index 0b12559e31deb8..0403330f77cd18 100644 --- a/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc +++ b/paddle/fluid/framework/ir/trt_support_nhwc_pass.cc @@ -161,7 +161,7 @@ void TrtSupportNHWCPass::ApplyImpl(Graph *graph) const { "affine_channel", "softmax", "temporal_shift"}; // OPs unrelated to layout are consistent according to the layout of input // var! - std::unordered_set any_layout_ops{"relu"}; + std::unordered_set any_layout_ops{"relu", "elementwise_add"}; // // // TODO(liuyuanle): Add other op if needed! 
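The recurring change across these framework files is the flags migration: `DECLARE_*`/`DEFINE_*` macros from `gflags/gflags.h` are replaced by `PD_DECLARE_*`/`PD_DEFINE_*` from `paddle/utils/flags.h`, which resolve to gflags or to the in-tree paddle_flags implementation depending on `WITH_GFLAGS`. A minimal sketch of the migrated API, reusing a flag name from this PR (the default value and help text below are invented for illustration):

  // flags_example.cc -- sketch only, assumes the paddle source tree.
  #include "paddle/utils/flags.h"

  // The translation unit that owns the flag defines it:
  PD_DEFINE_bool(convert_all_blocks, true, "Convert all sub-blocks if set.");

  // Every other translation unit declares it instead:
  //   PD_DECLARE_bool(convert_all_blocks);

  int main() {
    // Reads still go through the FLAGS_ symbol, so call sites are unchanged:
    bool before = FLAGS_convert_all_blocks;
    // Runtime updates use paddle::flags and report success as a bool, unlike
    // gflags' SetCommandLineOption, which returned a std::string:
    bool ok = paddle::flags::SetFlagValue("convert_all_blocks", "false");
    return (before && ok && !FLAGS_convert_all_blocks) ? 0 : 1;
  }

This is why call sites such as `if (FLAGS_benchmark)` need no edits in this PR; only the declaration and definition macros change.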
diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc index 14436e9c763585..9d789c9957e6cb 100644 --- a/paddle/fluid/framework/new_executor/executor_statistics.cc +++ b/paddle/fluid/framework/new_executor/executor_statistics.cc @@ -28,7 +28,7 @@ #include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/profiler/utils.h" -DECLARE_bool(use_stream_safe_cuda_allocator); +PD_DECLARE_bool(use_stream_safe_cuda_allocator); PADDLE_DEFINE_EXPORTED_string(static_executor_perfstat_filepath, "", "FLAGS_static_executor_perfstat_filepath " diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h index 4e3fb8d1b24505..18a26ea770cece 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/new_executor/new_executor_defs.h" -DECLARE_bool(new_executor_sequential_run); +PD_DECLARE_bool(new_executor_sequential_run); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc index 1e6a6f02e22303..cf3195bb8c2960 100644 --- a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc +++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc @@ -22,7 +22,7 @@ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/xpu/xpu_info.h" -DECLARE_bool(new_executor_serial_run); +PD_DECLARE_bool(new_executor_serial_run); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 934dd44771e7fa..88ac481d16c5b7 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -20,8 +20,8 @@ #include #include #include -#include "gflags/gflags.h" #include "paddle/fluid/platform/flags.h" +#include "paddle/utils/flags.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h" @@ -38,14 +38,14 @@ #include "paddle/fluid/platform/device_event.h" #include "paddle/phi/backends/device_manager.h" -DECLARE_bool(new_executor_serial_run); -DECLARE_bool(new_executor_static_build); -DECLARE_bool(new_executor_use_inplace); -DECLARE_bool(new_executor_use_local_scope); +PD_DECLARE_bool(new_executor_serial_run); +PD_DECLARE_bool(new_executor_static_build); +PD_DECLARE_bool(new_executor_use_inplace); +PD_DECLARE_bool(new_executor_use_local_scope); PHI_DECLARE_bool(check_nan_inf); -DECLARE_bool(benchmark); -DECLARE_uint64(executor_log_deps_every_microseconds); +PD_DECLARE_bool(benchmark); +PD_DECLARE_uint64(executor_log_deps_every_microseconds); PHI_DECLARE_bool(new_executor_use_cuda_graph); PHI_DECLARE_bool(enable_new_ir_in_executor); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 191f0b92eb8b2a..f01c12b27c3a1f 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -15,7 +15,7 @@ #include 
"paddle/fluid/framework/new_executor/interpreter_base_impl.h" -DECLARE_bool(new_executor_use_local_scope); +PD_DECLARE_bool(new_executor_use_local_scope); namespace ir { class Program; diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 28752ef1f7eb1a..e026e914adb575 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -16,7 +16,7 @@ #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" @@ -42,6 +42,11 @@ #include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h" #include "paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h" #include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h" +#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h" +#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.h" +#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h" +#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.h" +#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/legacy_kernel_op.h" #include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" #include "paddle/ir/core/builtin_attribute.h" @@ -517,7 +522,7 @@ void NewIRInterpreter::BuildInstruction() { } VLOG(6) << "process " << op_name; - if (dialect::IsLegacyOp(op_name)) { + if (op->name().compare(paddle::dialect::LegacyKernelOp::name()) == 0) { vec_instruction_base_.emplace_back( std::make_unique(op_idx++, place_, diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index c7193c68d3c9cc..558ae8f9b9078a 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -16,7 +16,6 @@ limitations under the License. */ #include -#include "gflags/gflags.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/library_type.h" @@ -25,8 +24,9 @@ limitations under the License. */ #include "paddle/phi/core/device_context.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_factory.h" +#include "paddle/utils/flags.h" -DECLARE_bool(use_stride_kernel); +PD_DECLARE_bool(use_stride_kernel); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 65a150b7ccbdf3..0c03486fdd7500 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/data_type_transform.h" @@ -43,6 +42,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/ops/compat/signatures.h" +#include "paddle/utils/flags.h" namespace phi { class DenseTensor; @@ -62,9 +62,9 @@ class DenseTensor; #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #endif -DECLARE_bool(benchmark); +PD_DECLARE_bool(benchmark); PHI_DECLARE_bool(check_nan_inf); -DECLARE_bool(enable_unused_var_check); +PD_DECLARE_bool(enable_unused_var_check); PHI_DECLARE_bool(run_kp_kernel); PHI_DECLARE_bool(enable_host_event_recorder_hook); diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index 1d57efd875f069..baca5b3f06743a 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/init.h" -DECLARE_bool(enable_unused_var_check); +PD_DECLARE_bool(enable_unused_var_check); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index 9e2da446b26606..fd0dcff440b5db 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/cinn/frontend/op_mapper_registry.h" #include "paddle/cinn/frontend/op_mappers/use_op_mappers.h" @@ -38,9 +37,10 @@ limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_launch_op.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" +#include "paddle/utils/flags.h" -DECLARE_string(allow_cinn_ops); -DECLARE_string(deny_cinn_ops); +PD_DECLARE_string(allow_cinn_ops); +PD_DECLARE_string(deny_cinn_ops); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 29ea6098088b67..e0ddafd37da704 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -22,7 +22,6 @@ #include #include -#include "gflags/gflags.h" #include "paddle/cinn/auto_schedule/auto_tuner.h" #include "paddle/cinn/auto_schedule/tuning.h" #include "paddle/cinn/common/target.h" @@ -52,6 +51,7 @@ #include "paddle/ir/core/program.h" #include "paddle/ir/core/value.h" #include "paddle/phi/core/flags.h" +#include "paddle/utils/flags.h" PHI_DECLARE_bool(enable_pe_launch_cinn); PHI_DECLARE_bool(enable_cinn_auto_tune); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index 4c66bc787ef894..519b78115748be 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -23,7 +23,6 @@ #include #include -#include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/cinn/common/target.h" @@ -38,6 +37,7 @@ #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/flags.h" +#include "paddle/utils/flags.h" PHI_DECLARE_string(allow_cinn_ops); PHI_DECLARE_string(deny_cinn_ops); diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 36b40657bb2b40..744ce8923a2d52 100644 --- a/paddle/fluid/framework/scope.cc +++ 
b/paddle/fluid/framework/scope.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/platform/flags.h" -DECLARE_bool(benchmark); +PD_DECLARE_bool(benchmark); PADDLE_DEFINE_EXPORTED_bool( eager_delete_scope, diff --git a/paddle/fluid/framework/unused_var_check.h b/paddle/fluid/framework/unused_var_check.h index cc4977e439c4cc..55a3a020a06b5e 100644 --- a/paddle/fluid/framework/unused_var_check.h +++ b/paddle/fluid/framework/unused_var_check.h @@ -19,7 +19,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 11a4d37d799f26..d336488a42327c 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -36,7 +36,7 @@ #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(check_nan_inf); -DECLARE_bool(benchmark); +PD_DECLARE_bool(benchmark); PHI_DECLARE_bool(run_kp_kernel); namespace paddle { diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index f1374bc8f7bd7c..bd3bbf2108efc1 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -38,7 +38,7 @@ PHI_DECLARE_bool(use_mkldnn); PHI_DECLARE_string(tracer_mkldnn_ops_on); PHI_DECLARE_string(tracer_mkldnn_ops_off); -DECLARE_bool(use_stride_kernel); +PD_DECLARE_bool(use_stride_kernel); namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 5e426724aaec3a..49b94c743fdb8d 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -32,7 +32,7 @@ #include "paddle/fluid/platform/macros.h" #include "paddle/phi/core/compat/arg_map_context.h" -DECLARE_bool(use_stride_kernel); +PD_DECLARE_bool(use_stride_kernel); namespace paddle { namespace imperative { diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 7b6175a975676c..48c9f79f34de19 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -37,6 +37,10 @@ get_property(ir_targets GLOBAL PROPERTY IR_TARGETS) get_property(not_infer_modules GLOBAL PROPERTY NOT_INFER_MODULES) set(utils_modules pretty_log string_helper benchmark utf8proc) +if(NOT WITH_GFLAGS) + set(utils_modules ${utils_modules} paddle_flags) +endif() + add_subdirectory(api) # Create static inference library if needed diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 3c053283666748..60f86ba10eb197 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -37,8 +37,8 @@ limitations under the License. 
*/ #include -#include "gflags/gflags.h" #include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/utils/flags.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index fa35ffc45c2a86..221b25cae00992 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/dense_tensor.h" -DEFINE_bool( // NOLINT +PD_DEFINE_bool( // NOLINT custom_model_save_cpu, false, "Keep old mode for developers, the model is saved on cpu not device."); diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index 6c7690a4779bf9..29d123d44ad450 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -18,9 +18,9 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/utils/flags.h" namespace paddle { namespace inference { @@ -30,7 +30,7 @@ extern void ReadBinaryFile(const std::string& filename, std::string* contents); namespace analysis { -DEFINE_string(inference_model_dir, "", "inference test model dir"); +PD_DEFINE_string(inference_model_dir, "", "inference test model dir"); } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 7d60d203f2b281..1fb7e2c157134d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1708,10 +1708,10 @@ CreatePaddlePredictor( auto SetGflags = [](const AnalysisConfig &config) { auto SetGflag = [](const char *name, const char *value) { - std::string ret = ::GFLAGS_NAMESPACE::SetCommandLineOption(name, value); + bool success = paddle::flags::SetFlagValue(name, value); PADDLE_ENFORCE_EQ( - ret.empty(), - false, + success, + true, platform::errors::InvalidArgument( "Fail to set gflag: %s, please make sure the gflag exists.", name)); @@ -3089,8 +3089,8 @@ std::tuple GetTrtRuntimeVersion() { #endif } -std::string UpdateDllFlag(const char *name, const char *value) { - return paddle::UpdateDllFlag(name, value); +void UpdateDllFlag(const char *name, const char *value) { + paddle::UpdateDllFlag(name, value); } void ConvertToMixedPrecision(const std::string &model_file, diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 93b88632984a8f..a15f8be18bf344 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -14,13 +14,13 @@ #include -#include "gflags/gflags.h" #include "paddle/fluid/framework/commit.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/utils/flags.h" namespace paddle { @@ -134,20 +134,18 @@ std::string get_version() { return ss.str(); } -std::string UpdateDllFlag(const char *name, const char *value) { +void UpdateDllFlag(const char *name, const char *value) { std::string ret; LOG(WARNING) << "The function 
\"UpdateDllFlag\" is only used to update the flag " "on the Windows shared library"; - ret = ::GFLAGS_NAMESPACE::SetCommandLineOption(name, value); + bool success = paddle::flags::SetFlagValue(name, value); PADDLE_ENFORCE_EQ( - ret.empty(), - false, + success, + true, platform::errors::InvalidArgument( "Fail to update flag: %s, please make sure the flag exists.", name)); - LOG(INFO) << ret; - return ret; } #ifdef PADDLE_WITH_CRYPTO diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index a2c2d099d771aa..d318042719a16c 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -26,7 +26,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" -DEFINE_bool(profile, false, "Turn on profiler for fluid"); // NOLINT +PD_DEFINE_bool(profile, false, "Turn on profiler for fluid"); // NOLINT namespace paddle { namespace { @@ -373,7 +373,6 @@ CreatePaddlePredictor( std::vector flags; if (config.fraction_of_gpu_memory >= 0.0f || config.fraction_of_gpu_memory <= 0.95f) { - flags.emplace_back("dummpy"); std::string flag = "--fraction_of_gpu_memory_to_use=" + num2str(config.fraction_of_gpu_memory); flags.push_back(flag); diff --git a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc index fb5cee4e050346..04310139e5d022 100644 --- a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc @@ -87,7 +87,7 @@ void Main() { } // namespace paddle int main(int argc, char** argv) { - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, true); paddle::demo::Main(); return 0; } diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc index 66f2bc7056a668..dca147e8353e30 100644 --- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc +++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc @@ -133,7 +133,7 @@ void MainThreads(int num_threads, bool use_gpu) { } // namespace paddle int main(int argc, char** argv) { - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, true); paddle::demo::Main(false /* use_gpu*/); paddle::demo::MainThreads(1, false /* use_gpu*/); paddle::demo::MainThreads(4, false /* use_gpu*/); diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index b6b20a901b2bb7..b0f05d4f268c05 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -73,7 +73,7 @@ void Main() { } // namespace paddle int main(int argc, char** argv) { - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, true); paddle::demo::Main(); return 0; } diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 7850b4edb10982..022c1249af001b 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -28,9 +28,6 @@ DEFINE_string(data, "path of data; each line is a record, format is " "'\t MakeCipher( const std::string& config_file); diff --git a/paddle/fluid/inference/api/paddle_inference_api.h 
b/paddle/fluid/inference/api/paddle_inference_api.h index 18b8b6dfd43ea0..0366a33a5f5e0d 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -235,7 +235,7 @@ PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype); PD_INFER_DECL std::string GetVersion(); PD_INFER_DECL std::tuple GetTrtCompileVersion(); PD_INFER_DECL std::tuple GetTrtRuntimeVersion(); -PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value); +PD_INFER_DECL void UpdateDllFlag(const char* name, const char* value); PD_INFER_DECL void ConvertToMixedPrecision( const std::string& model_file, diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index fe502a15798aa0..000d4f5430ed44 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -29,12 +29,12 @@ limitations under the License. */ // phi #include "paddle/phi/kernels/declarations.h" -DEFINE_string(devices, // NOLINT - "", - "The devices to be used which is joined by comma."); -DEFINE_int32(math_num_threads, - 1, - "Number of threads used to run math functions."); +PD_DEFINE_string(devices, // NOLINT + "", + "The devices to be used which is joined by comma."); +PD_DEFINE_int32(math_num_threads, + 1, + "Number of threads used to run math functions."); namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/paddle_inference.map b/paddle/fluid/inference/paddle_inference.map index 93d90238e340a0..191f5934166c4e 100644 --- a/paddle/fluid/inference/paddle_inference.map +++ b/paddle/fluid/inference/paddle_inference.map @@ -45,6 +45,7 @@ *paddle::RegisterSymbolsFor*; *paddle::from_blob*; *paddle::InitPhi*; + *paddle::flags*; /* ut needs the following symbol, we need to modify all the ut to hidden such symbols */ diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index f08a8a75ba4067..b939dfaadc94da 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -31,7 +31,7 @@ namespace nvinfer1 { class ITensor; } // namespace nvinfer1 -DECLARE_bool(profile); +PD_DECLARE_bool(profile); namespace paddle { namespace inference { diff --git a/paddle/fluid/ir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/ir/dialect/op_generator/ops_api_gen.py new file mode 100644 index 00000000000000..135d75ecf9fd8b --- /dev/null +++ b/paddle/fluid/ir/dialect/op_generator/ops_api_gen.py @@ -0,0 +1,115 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
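+#
+# Summary of what this script emits: for every op parsed from the yaml files
+# (ops with no infer_meta function are skipped unless they appear in
+# PD_MANUAL_OP_LIST), it generates a PyObject* wrapper forwarding to the
+# matching static_api_<op> function and a PyMethodDef entry for it;
+# BindOpsAPI then registers the resulting table on the core.ops module.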
+ +import argparse +import os + +from api_gen import NAMESPACE_TEMPLATE, PD_MANUAL_OP_LIST, CodeGen + +CPP_FILE_TEMPLATE = """ +#include + +#include "paddle/fluid/pybind/static_op_function.h" +#include "paddle/phi/core/enforce.h" + +{body} + +""" + +NAMESPACE_INNER_TEMPLATE = """ +{function_impl} + +static PyMethodDef OpsAPI[] = {{ +{ops_api} +{{nullptr, nullptr, 0, nullptr}} +}}; + +void BindOpsAPI(pybind11::module *module) {{ + if (PyModule_AddFunctions(module->ptr(), OpsAPI) < 0) {{ + PADDLE_THROW(phi::errors::Fatal("Add C++ api to core.ops failed!")); + }} +}} +""" + +FUNCTION_IMPL_TEMPLATE = """ +static PyObject *{name}(PyObject *self, PyObject *args, PyObject *kwargs) {{ + return static_api_{name}(self, args, kwargs); +}}""" + +OPS_API_TEMPLATE = """ +{{"{name}", (PyCFunction)(void (*)(void)){name}, METH_VARARGS | METH_KEYWORDS, "C++ interface function for {name}."}},""" + + +class OpsAPIGen(CodeGen): + def __init__(self) -> None: + super().__init__() + + def _gen_one_function_impl(self, name): + return FUNCTION_IMPL_TEMPLATE.format(name=name) + + def _gen_one_ops_api(self, name): + return OPS_API_TEMPLATE.format(name=name) + + def gen_cpp_file( + self, op_yaml_files, op_compat_yaml_file, namespaces, cpp_file_path + ): + if os.path.exists(cpp_file_path): + os.remove(cpp_file_path) + op_info_items = self._parse_yaml(op_yaml_files, op_compat_yaml_file) + function_impl_str = '' + ops_api_str = '' + for op_info in op_info_items: + for op_name in op_info.op_phi_name: + if ( + op_info.infer_meta_func is None + and op_name not in PD_MANUAL_OP_LIST + ): + continue + function_impl_str += self._gen_one_function_impl(op_name) + ops_api_str += self._gen_one_ops_api(op_name) + + inner_body = NAMESPACE_INNER_TEMPLATE.format( + function_impl=function_impl_str, ops_api=ops_api_str + ) + + body = inner_body + for namespace in reversed(namespaces): + body = NAMESPACE_TEMPLATE.format(namespace=namespace, body=body) + with open(cpp_file_path, 'w') as f: + f.write(CPP_FILE_TEMPLATE.format(body=body)) + + +def ParseArguments(): + parser = argparse.ArgumentParser( + description='Generate Dialect Python C Files By Yaml' + ) + parser.add_argument('--op_yaml_files', type=str) + parser.add_argument('--op_compat_yaml_file', type=str) + parser.add_argument('--namespaces', type=str) + parser.add_argument('--ops_api_file', type=str) + return parser.parse_args() + + +if __name__ == '__main__': + args = ParseArguments() + op_yaml_files = args.op_yaml_files.split(",") + op_compat_yaml_file = args.op_compat_yaml_file + if args.namespaces is not None: + namespaces = args.namespaces.split(",") + ops_api_file = args.ops_api_file + + code_gen = OpsAPIGen() + code_gen.gen_cpp_file( + op_yaml_files, op_compat_yaml_file, namespaces, ops_api_file + ) diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/CMakeLists.txt b/paddle/fluid/ir/dialect/paddle_dialect/ir/CMakeLists.txt index 86ade99a3cc228..d7269369ae1a39 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/CMakeLists.txt +++ b/paddle/fluid/ir/dialect/paddle_dialect/ir/CMakeLists.txt @@ -113,6 +113,32 @@ add_custom_command( add_custom_target(static_op_function_gen ALL DEPENDS ${python_c_header_file} ${python_c_source_file}) +set(ops_api_gen_file + ${PADDLE_SOURCE_DIR}/paddle/fluid/ir/dialect/op_generator/ops_api_gen.py) +set(ops_api_source_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/ops_api.cc) +set(ops_api_source_file_tmp ${ops_api_source_file}.tmp) + +add_custom_command( + OUTPUT ${ops_api_source_file} + COMMAND + ${PYTHON_EXECUTABLE} ${ops_api_gen_file} 
--op_yaml_files ${op_yaml_files} + --op_compat_yaml_file ${op_compat_yaml_file} --namespaces "paddle,pybind" + --ops_api_file ${ops_api_source_file_tmp} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${ops_api_source_file_tmp} + ${ops_api_source_file} + COMMENT "copy_if_different ${ops_api_source_file}" + DEPENDS ${ops_api_gen_file} + ${op_forward_yaml_file1} + ${op_forward_yaml_file2} + ${op_backward_yaml_file1} + ${op_backward_yaml_file2} + ${op_compat_yaml_file} + ${python_c_header_file} + ${python_c_source_file} + VERBATIM) + +add_custom_target(ops_api_gen ALL DEPENDS ${ops_api_source_file}) + cc_library( pd_dialect_core SRCS pd_attribute.cc pd_type.cc diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.cc b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.cc index 070c2e49ac6d5a..b95d78a74f4703 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.cc +++ b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.cc @@ -29,5 +29,15 @@ ir::OpResult split_grad(std::vector out_grads, return split_grad_op.x_grad(); } + +ir::OpResult split_grad(std::vector out_grads, int axis) { + auto combine_op = + APIBuilder::Instance().GetBuilder()->Build(out_grads); + paddle::dialect::SplitGradOp split_grad_op = + APIBuilder::Instance().GetBuilder()->Build( + combine_op.out(), axis); + + return split_grad_op.x_grad(); +} } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.h b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.h index 1d16bc07937882..5eba73e5182bd0 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.h +++ b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.h @@ -25,5 +25,6 @@ namespace dialect { ir::OpResult split_grad(std::vector out_grads, ir::OpResult axis); +ir::OpResult split_grad(std::vector out_grads, int axis); } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.cc b/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.cc index bb3d52d7664052..5269fe62892a53 100644 --- a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.cc +++ b/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h" #include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h" #include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.h" +#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/legacy_kernel_op.h" #include "paddle/fluid/platform/init_phi.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/ddim.h" @@ -32,9 +33,9 @@ PaddleKernelDialect::PaddleKernelDialect(ir::IrContext *context) void PaddleKernelDialect::initialize() { RegisterTypes(); - RegisterTypes(); - RegisterOps(); - + RegisterTypes(); + RegisterOps(); RegisterAttributes(); } diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h b/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h index 5cbd4b0b434b38..91dae1cfd560fe 100644 --- a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h +++ b/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h @@ -24,7 +24,7 @@ namespace dialect { class PhiKernelOp : public ir::Op { public: using Op::Op; - static const char *name() { return "phi.kernel"; } + static const char *name() { return "pd_kernel.phi_kernel"; } static constexpr uint32_t attributes_num = 3; static const char 
*attributes_name[attributes_num]; std::string op_name(); diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/legacy_kernel_op.cc b/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/legacy_kernel_op.cc new file mode 100644 index 00000000000000..ca15657e104856 --- /dev/null +++ b/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/legacy_kernel_op.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/legacy_kernel_op.h" +#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h" +#include "paddle/ir/core/builtin_attribute.h" +#include "paddle/phi/core/enforce.h" + +namespace paddle { +namespace dialect { + +const char* LegacyKernelOp::attributes_name[attributes_num] = { // NOLINT + "op_name", + "kernel_name", + "kernel_key"}; + +void LegacyKernelOp::Verify() { + VLOG(4) << "Verifying inputs, outputs and attributes for: LegacyKernelOp."; + + auto& attributes = this->attributes(); + + PADDLE_ENFORCE(attributes.count("op_name") > 0 && + attributes.at("op_name").isa(), + phi::errors::PreconditionNotMet( + "Type of attribute: op_name is not right.")); + + PADDLE_ENFORCE(attributes.count("kernel_name") > 0 && + attributes.at("kernel_name").isa(), + phi::errors::PreconditionNotMet( + "Type of attribute: kernel_name is not right.")); + + PADDLE_ENFORCE(attributes.count("kernel_key") > 0 && + attributes.at("kernel_key").isa(), + phi::errors::PreconditionNotMet( + "Type of attribute: kernel_key is not right.")); +} + +std::string LegacyKernelOp::op_name() { + return attributes().at("op_name").dyn_cast().AsString(); +} +std::string LegacyKernelOp::kernel_name() { + return attributes().at("kernel_name").dyn_cast().AsString(); +} +phi::KernelKey LegacyKernelOp::kernel_key() { + return attributes().at("kernel_key").dyn_cast().data(); +} + +} // namespace dialect +} // namespace paddle + +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::LegacyKernelOp) diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_split_functor.h b/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/legacy_kernel_op.h similarity index 50% rename from paddle/phi/core/distributed/auto_parallel/reshard_split_functor.h rename to paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/legacy_kernel_op.h index 87b9f2301ad0ba..524505a48305fc 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard_split_functor.h +++ b/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/legacy_kernel_op.h @@ -14,19 +14,25 @@ #pragma once -#include -#include -#include "paddle/phi/common/int_array.h" +#include "paddle/ir/core/builder.h" +#include "paddle/ir/core/op_base.h" +#include "paddle/phi/core/kernel_factory.h" -namespace phi { -class DeviceContext; -class DenseTensor; +namespace paddle { +namespace dialect { +class LegacyKernelOp : public ir::Op { + public: + using Op::Op; + static const char *name() { return "pd_kernel.legacy_kernel"; } + static constexpr uint32_t attributes_num 
= 3; + static const char *attributes_name[attributes_num]; + std::string op_name(); + std::string kernel_name(); + phi::KernelKey kernel_key(); + void Verify(); +}; -namespace distributed { -std::vector<DenseTensor> ReshardSplitFunctor(const DeviceContext& dev_ctx, - const DenseTensor& input, - const IntArray& sections, - int64_t axis); +} // namespace dialect +} // namespace paddle -} // namespace distributed -} // namespace phi +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::LegacyKernelOp) diff --git a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc index 4198098f2bd4fb..84f18baa55aeaf 100644 --- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc @@ -27,6 +27,7 @@ #include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.h" #include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h" #include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.h" +#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/legacy_kernel_op.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" @@ -93,6 +94,28 @@ bool NeedFallBackCpu(const ir::Operation* op, return false; } +phi::Backend GetDstBackend(const std::string& op_name, + phi::Place place, + OpYamlInfoParser* op_yaml_info_parser, + phi::Backend kernel_def_backend, + size_t input_index) { + if (op_name == "builtin.set_parameter" && + place.GetType() == phi::AllocationType::GPU) { + // NOTE: align the old executor; all parameters are initialized + // on the backend of the place the executor was created with + return phi::TransToPhiBackend(place); + } + + auto dst_backend = kernel_def_backend; + if (op_yaml_info_parser != nullptr && + op_yaml_info_parser->IsTensorAttribute(input_index)) { + // Tensor attributes should be on the CPU backend for better performance + dst_backend = phi::Backend::CPU; + } + + return dst_backend; +} + bool NeedFallBackFromGPUDNN2GPU(ir::Operation* op, const phi::KernelKey kernel_key) { // NOTE(phlrain): keep the same kernel select strategy with @@ -181,6 +204,10 @@ ir::OpResult AddPlaceTransferOp(ir::OpResult in, ir::Operation* op = ir::Operation::Create({in}, op_attribute, {out_type}, op_info); + if (in.GetDefiningOp()->HasAttribute(kAttrIsPersisable)) { + op->set_attribute(kAttrIsPersisable, + in.GetDefiningOp()->attribute(kAttrIsPersisable)); + } program->block()->push_back(op); auto new_in = op->result(0); @@ -209,6 +236,50 @@ ir::OpResult AddPlaceTransferOp(ir::OpResult in, } } +ir::Type BuildOutputType(ir::Type type, + const phi::Place& place, + phi::DataType data_type, + ir::IrContext* ctx) { + if (type.isa<dialect::DenseTensorType>()) { + auto dense_tensor_type = type.dyn_cast<dialect::DenseTensorType>(); + auto out_dtype = dense_tensor_type.dtype(); + + // TODO(phlrain): open this after fixing the pr(55509) conflict + // if (data_type != phi::DataType::UNDEFINED) { + // out_dtype = TransToIrDataType(data_type, ctx); + // } + + return dialect::AllocatedDenseTensorType::get( + ctx, + place, + out_dtype, + dense_tensor_type.dims(), + dense_tensor_type.data_layout(), + dense_tensor_type.lod(), + dense_tensor_type.offset()); + + } else if (type.isa<dialect::SelectedRowsType>()) { + auto selected_rows_type = type.dyn_cast<dialect::SelectedRowsType>(); + auto out_dtype = selected_rows_type.dtype(); + + // TODO(phlrain): open this after fixing the pr(55509) conflict + // if (data_type != phi::DataType::UNDEFINED) { + // out_dtype = TransToIrDataType(data_type, ctx); + // } + return dialect::AllocatedSelectedRowsType::get( + ctx, + place, + out_dtype,
selected_rows_type.dims(), + selected_rows_type.data_layout(), + selected_rows_type.lod(), + selected_rows_type.offset()); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "BuildOutputType only support DenseTensorType and SelectedRowsType")); + } +} + phi::DataType GetKernelDataTypeByYamlInfo( const ir::Operation* op, const std::unordered_map& map_value_pair, @@ -507,6 +578,10 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, std::string phi_kernel_op_name = paddle::dialect::PhiKernelOp::name(); ir::OpInfo phi_kernel_op_info = ctx->GetRegisteredOpInfo(phi_kernel_op_name); + std::string legacy_kernel_op_name = paddle::dialect::LegacyKernelOp::name(); + ir::OpInfo legacy_kernel_op_info = + ctx->GetRegisteredOpInfo(legacy_kernel_op_name); + auto skip_feed_names = GetSkipFeedNames(block); for (auto op_item : *block) { @@ -520,6 +595,7 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, std::vector out_places; // Copy op inputs std::vector vec_inputs; + std::vector vec_inner_types; if (op_item->num_operands() > 0) { for (size_t i = 0; i < op_item->num_operands(); ++i) { auto cur_in = op_item->operand_source(i); @@ -535,6 +611,7 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, op_item->name())); auto new_in = map_value_pair.at(cur_in); vec_inputs.push_back(new_in); + vec_inner_types.push_back(new_in.type()); if (new_in.type().isa()) { out_places.push_back( new_in.type() @@ -548,49 +625,9 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, } // Copy op output type std::vector op_output_types; - if (op_item->num_results() > 0) { - for (size_t i = 0; i < op_item->num_results(); ++i) { - auto result_type = op_item->result(i).type(); - if (!result_type) { - op_output_types.push_back(result_type); - } else if (result_type.isa()) { - std::vector vec_inner_types; - auto base_types = result_type.dyn_cast().data(); - for (size_t idx = 0; idx < base_types.size(); idx++) { - auto& base_type = base_types[idx]; - if (base_type) { - if (base_type.isa()) { - auto allocated_dense_tensor_dtype = - paddle::dialect::AllocatedDenseTensorType::get( - ctx, - out_places[idx], - base_type.dyn_cast()); - vec_inner_types.push_back(allocated_dense_tensor_dtype); - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "only support dense tensor in vector type for now")); - } - } else { - // NOTE(phlrain), kernel not support a nullptr in output - ir::Type fp32_dtype = ir::Float32Type::get(ctx); - phi::DDim dims = {}; - phi::DataLayout data_layout = phi::DataLayout::NCHW; - phi::LoD lod = {{}}; - size_t offset = 0; - auto dense_tensor_dtype = paddle::dialect::DenseTensorType::get( - ctx, fp32_dtype, dims, data_layout, lod, offset); - vec_inner_types.push_back(dense_tensor_dtype); - } - } - ir::Type t1 = ir::VectorType::get(ctx, vec_inner_types); - op_output_types.push_back(t1); - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "builtin.combine Result type only support " - "VectorType")); - } - } - } + ir::Type t1 = ir::VectorType::get(ctx, vec_inner_types); + op_output_types.push_back(t1); + // Get op info ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_item->name()); // Generate new op @@ -609,9 +646,8 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, } if (op_item->name() == "builtin.slice") { - phi::Place out_place = place; - // Copy op inputs std::vector vec_inputs; + std::vector op_output_types; if (op_item->num_operands() > 0) { for (size_t i = 0; i < op_item->num_operands(); ++i) { auto cur_in = op_item->operand_source(i); @@ -630,39 +666,18 @@ 
std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, if (new_in.type().isa()) { auto vec_types = new_in.type().dyn_cast().data(); - out_place = - vec_types[op_item->attributes() - .at("index") - .dyn_cast() - .data()] - .dyn_cast() - .place(); + auto index = op_item->attributes() + .at("index") + .dyn_cast() + .data(); + op_output_types.push_back(vec_types[index]); } else { PADDLE_THROW( phi::errors::Unimplemented("only support vector type for now")); } } } - // Copy op output type - std::vector op_output_types; - if (op_item->num_results() > 0) { - for (size_t i = 0; i < op_item->num_results(); ++i) { - auto result_type = op_item->result(i).type(); - if (!result_type) { - op_output_types.push_back(result_type); - } else if (result_type.isa()) { - auto allocated_dense_tensor_dtype = - paddle::dialect::AllocatedDenseTensorType::get( - ctx, - out_place, - result_type.dyn_cast()); - op_output_types.push_back(allocated_dense_tensor_dtype); - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "builtin.slice Result type only support DenseTensorType")); - } - } - } + // Get op info ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_item->name()); // Generate new op @@ -684,6 +699,7 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, std::vector out_places(op_item->num_results()); // Copy op inputs std::vector vec_inputs; + std::vector op_output_types; if (op_item->num_operands() > 0) { for (size_t i = 0; i < op_item->num_operands(); ++i) { auto cur_in = op_item->operand_source(i); @@ -703,10 +719,7 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, if (new_in.type().isa()) { auto vec_types = new_in.type().dyn_cast().data(); for (uint64_t idx = 0; idx < vec_types.size(); idx++) { - out_places[idx] = - vec_types[idx] - .dyn_cast() - .place(); + op_output_types.push_back(vec_types[idx]); } } else { PADDLE_THROW( @@ -714,26 +727,7 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, } } } - // Copy op output type - std::vector op_output_types; - if (op_item->num_results() > 0) { - for (size_t i = 0; i < op_item->num_results(); ++i) { - auto result_type = op_item->result(i).type(); - if (!result_type) { - op_output_types.push_back(result_type); - } else if (result_type.isa()) { - auto allocated_dense_tensor_dtype = - paddle::dialect::AllocatedDenseTensorType::get( - ctx, - out_places[i], - result_type.dyn_cast()); - op_output_types.push_back(allocated_dense_tensor_dtype); - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "builtin.split Result type only support DenseTensorType")); - } - } - } + // Get op info ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_item->name()); // Generate new op @@ -800,36 +794,30 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, } for (size_t i = 0; i < op_item->num_results(); ++i) { - phi::Place out_place; + phi::Place out_place = phi::TransToPhiPlace(kernel_key.backend()); + + phi::DataType out_phi_dtype = phi::DataType::UNDEFINED; if ((!UnchangeOutputOps.count(op_item->name())) && (!IsLegacyOp(op_item->name())) && phi_kernel.IsValid()) { out_place = phi::TransToPhiPlace(output_defs[i].backend); - } else { - out_place = phi::TransToPhiPlace(kernel_key.backend()); + out_phi_dtype = output_defs[i].dtype; } auto result_type = op_item->result(i).type(); if (!result_type) { op_output_types.push_back(result_type); - } else if (result_type.isa()) { - auto allocated_dense_tensor_dtype = - paddle::dialect::AllocatedDenseTensorType::get( - ctx, - out_place, - result_type.dyn_cast()); - 
op_output_types.push_back(allocated_dense_tensor_dtype); + } else if (result_type.isa() || + result_type.isa()) { + op_output_types.push_back( + BuildOutputType(result_type, out_place, out_phi_dtype, ctx)); } else if (result_type.isa()) { std::vector vec_inner_types; auto base_types = result_type.dyn_cast().data(); for (auto& base_type : base_types) { if (base_type) { if (base_type.isa()) { - auto allocated_dense_tensor_dtype = - paddle::dialect::AllocatedDenseTensorType::get( - ctx, - out_place, - base_type.dyn_cast()); - vec_inner_types.push_back(allocated_dense_tensor_dtype); + vec_inner_types.push_back( + BuildOutputType(base_type, out_place, out_phi_dtype, ctx)); } else { PADDLE_THROW(phi::errors::Unimplemented( "only support dense tensor in vector type for now")); @@ -852,16 +840,10 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, ir::Type t1 = ir::VectorType::get(ctx, vec_inner_types); op_output_types.push_back(t1); - } else if (result_type.isa()) { - auto allocated_selected_rows_dtype = - paddle::dialect::AllocatedSelectedRowsType::get( - ctx, - out_place, - result_type.dyn_cast()); - op_output_types.emplace_back(allocated_selected_rows_dtype); } else { PADDLE_THROW(phi::errors::Unimplemented( - "Result type only support DenseTensorType and VectorType")); + "Result type only support DenseTensorType, SelectedRowType and " + "VectorType")); } } } @@ -888,10 +870,14 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, auto& kernel = phi::KernelFactory::Instance().SelectKernelWithGPUDNN( kernel_fn_str, kernel_key); - if (kernel.IsValid() && (!UnchangeOutputOps.count(op_item->name()))) { + bool check_place_transfer = + (op_item->name() == "builtin.set_parameter") || + (kernel.IsValid() && (!UnchangeOutputOps.count(op_item->name()))); + + if (check_place_transfer) { if (new_in_type.isa()) { // allocated type - auto place = + auto in_place = new_in_type.dyn_cast() .place(); @@ -899,17 +885,21 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, auto args_def = kernel.args_def(); auto input_defs = args_def.input_defs(); + auto dst_backend = GetDstBackend(op_item->name(), + place, + op_info_parser.get(), + kernel.InputAt(i).backend, + i); + bool need_trans = - (place.GetType() != phi::AllocationType::UNDEFINED) && - (op_info_parser != nullptr && - !op_info_parser->IsTensorAttribute(i)) && + (in_place.GetType() != phi::AllocationType::UNDEFINED) && (paddle::experimental::NeedTransformPlace( - place, kernel.InputAt(i).backend, {})); + in_place, dst_backend, {})); if (need_trans) { - VLOG(6) << "need trans from " << place << " to " + VLOG(6) << "need trans from " << in_place << " to " << kernel_key.backend(); // build memcopy op - auto out_place = phi::TransToPhiPlace(kernel.InputAt(i).backend); + auto out_place = phi::TransToPhiPlace(dst_backend); auto new_in_alloc_type = new_in_type.dyn_cast(); auto out_type = dialect::AllocatedDenseTensorType::get( @@ -922,7 +912,7 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, new_in_alloc_type.offset()); new_in = AddPlaceTransferOp(new_in, out_type, - place, + in_place, out_place, kernel_key, program.get()); @@ -1008,7 +998,6 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, {"op_name", ir::StrAttribute::get(ctx, op_item->name())}, {"kernel_name", ir::StrAttribute::get(ctx, kernel_fn_str)}, {"kernel_key", dialect::KernelAttribute::get(ctx, kernel_key)}}; - auto op_attr_map = op_item->attributes(); for (auto& map_item : op_attr_map) { @@ -1019,8 +1008,14 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* 
prog, op_attribute.emplace("is_inplace", ir::BoolAttribute::get(ctx, true)); } - ir::Operation* op = ir::Operation::Create( - vec_inputs, op_attribute, op_output_types, phi_kernel_op_info); + ir::Operation* op; + if (dialect::IsLegacyOp(op_item->name())) { + op = ir::Operation::Create( + vec_inputs, op_attribute, op_output_types, legacy_kernel_op_info); + } else { + op = ir::Operation::Create( + vec_inputs, op_attribute, op_output_types, phi_kernel_op_info); + } map_op_pair[op_item] = op; diff --git a/paddle/fluid/ir_adaptor/translator/op_compat_gen.py b/paddle/fluid/ir_adaptor/translator/op_compat_gen.py index 7c1a78305e71cc..60a4a09204066a 100644 --- a/paddle/fluid/ir_adaptor/translator/op_compat_gen.py +++ b/paddle/fluid/ir_adaptor/translator/op_compat_gen.py @@ -128,6 +128,7 @@ def insert_new_mutable_attributes( # special mapping list op_arg_name_mappings["set_value_grad"]["values_grad"] = "ValueTensor@GRAD" + op_arg_name_mappings["fetch"] = {"x": "X"} op_name_normailzer_template = env.get_template("op_compat_info.cc.j2") with open(output_source_file, 'wt') as f: diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index cad900ee7f6da0..38d833fc312de5 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -1699,6 +1699,7 @@ OpTranslator::OpTranslator() { special_handlers["cast"] = CastOpTranscriber(); special_handlers["feed"] = FeedOpTranscriber(); special_handlers["data"] = DataOpTranscriber(); + special_handlers["fetch"] = FetchOpTranscriber(); special_handlers["fetch_v2"] = FetchOpTranscriber(); special_handlers["fill_constant"] = FillConstantTranscriber(); special_handlers["grad_add"] = GradAddOpTranscriber(); diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc index 1e09c43c4f12f7..21f734c7272666 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc @@ -22,7 +22,7 @@ PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); -DECLARE_int64(gpu_allocator_retry_time); +PD_DECLARE_int64(gpu_allocator_retry_time); #endif PHI_DECLARE_string(allocator_strategy); diff --git a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc index 63e3eab3256c9f..049807f63888b6 100644 --- a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc +++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc @@ -22,7 +22,7 @@ PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); -DECLARE_int64(gpu_allocator_retry_time); +PD_DECLARE_int64(gpu_allocator_retry_time); #endif PHI_DECLARE_string(allocator_strategy); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc index bfd05b6b323fed..15dc505517907f 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc @@ 
-26,7 +26,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); -DECLARE_int64(gpu_allocator_retry_time); +PD_DECLARE_int64(gpu_allocator_retry_time); #endif PHI_DECLARE_string(allocator_strategy); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 890ebde2aa3c9e..350cdbd9b4c248 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -19,8 +19,8 @@ #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/aligned_allocator.h" -DECLARE_bool(free_idle_chunk); -DECLARE_bool(free_when_no_cache_hit); +PD_DECLARE_bool(free_idle_chunk); +PD_DECLARE_bool(free_when_no_cache_hit); namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 09053ec5dede71..1493913f5b2a8f 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -41,7 +41,7 @@ PADDLE_DEFINE_EXPORTED_bool( PHI_DECLARE_double(fraction_of_gpu_memory_to_use); PHI_DECLARE_uint64(initial_gpu_memory_in_mb); PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb); -DECLARE_bool(benchmark); +PD_DECLARE_bool(benchmark); namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/stream_safe_custom_device_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_custom_device_allocator.cc index b88f40f6a27329..ce63ab807e01e7 100644 --- a/paddle/fluid/memory/allocation/stream_safe_custom_device_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_custom_device_allocator.cc @@ -88,8 +88,7 @@ bool StreamSafeCustomDeviceAllocation::CanBeFreed() { } std::call_once(once_flag_, [this] { phi::DeviceManager::SetDevice(place_); }); for (auto it = outstanding_event_map_.begin(); - it != outstanding_event_map_.end(); - ++it) { + it != outstanding_event_map_.end();) { auto& event = it->second; if (!event->Query()) { VLOG(9) << "Event " << event->raw_event() << " for " << ptr() @@ -98,6 +97,7 @@ bool StreamSafeCustomDeviceAllocation::CanBeFreed() { } VLOG(8) << "Destroy event " << event->raw_event(); event->Destroy(); + it = outstanding_event_map_.erase(it); } outstanding_event_map_.clear(); return true; diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index e533960c8a648f..b4e0f511f6d61b 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -287,11 +287,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, thrust::device_vector* keys, thrust::device_vector* values, thrust::device_vector* key_index) { -#ifdef PADDLE_WITH_HIP - int block = 256; -#else int block = 1024; -#endif const auto& dev_ctx = ctx.cuda_device_context(); int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int grid_tmp = (num_input + block - 1) / block; @@ -377,12 +373,8 @@ void ReindexFunc(const framework::ExecutionContext& ctx, subset->resize(unique_items.size()); thrust::copy(unique_items.begin(), unique_items.end(), subset->begin()); -// Fill outputs with reindex result. -#ifdef PADDLE_WITH_HIP - int block = 256; -#else + // Fill outputs with reindex result. 
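Note on the `CanBeFreed` hunk above: the old loop advanced `++it` in the `for` header and only emptied the map via `clear()` at the end, so an early exit could leave already-destroyed events in `outstanding_event_map_` for the next call. The fix adopts the standard erase-while-iterating idiom. A minimal sketch (`Event` is a stand-in for the device event type):

```cpp
#include <iostream>
#include <map>

struct Event {
  bool ready;
  bool Query() const { return ready; }
  void Destroy() { std::cout << "destroy event\n"; }
};

bool CanBeFreed(std::map<int, Event>& outstanding) {
  for (auto it = outstanding.begin(); it != outstanding.end();) {
    if (!it->second.Query()) {
      // Not ready yet: keep this and later entries; a future call resumes
      // without re-destroying the events already erased below.
      return false;
    }
    it->second.Destroy();
    it = outstanding.erase(it);  // erase() returns the next valid iterator
  }
  return true;
}

int main() {
  std::map<int, Event> events{{0, {true}}, {1, {false}}, {2, {true}}};
  std::cout << CanBeFreed(events) << " remaining=" << events.size() << "\n";
}
```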
int block = 1024; -#endif const auto& dev_ctx = ctx.cuda_device_context(); int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (outputs->size() + block - 1) / block; diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index 0ecac6c5fb07a7..a386772405ab6e 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -126,11 +126,7 @@ class Unpool2dMaxFunctor { const T* input_data = input.data(); const int* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); -#ifdef __HIPCC__ - int threads = 256; -#else int threads = 1024; -#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMax <<>>(input.numel(), @@ -167,11 +163,7 @@ class Unpool2dMaxGradFunctor { const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); -#ifdef __HIPCC__ - int threads = 256; -#else int threads = 1024; -#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMaxGrad <<>>(input.numel(), @@ -206,11 +198,7 @@ class Unpool3dMaxFunctor { const T* input_data = input.data(); const int* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); -#ifdef __HIPCC__ - int threads = 256; -#else int threads = 1024; -#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool3dMax <<>>(input.numel(), @@ -251,11 +239,7 @@ class Unpool3dMaxGradFunctor { const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); -#ifdef __HIPCC__ - int threads = 256; -#else int threads = 1024; -#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool3dMaxGrad <<>>(input.numel(), diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc index aeda36e537c7f0..59313bc95af0ae 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace operators { @@ -116,9 +117,3 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(distributed_fused_lamb_init, ops::DistributedFusedLambInitOp, ops::DistributedFusedLambInitOpMaker); - -PD_REGISTER_STRUCT_KERNEL(distributed_fused_lamb_init, - CPU, - ALL_LAYOUT, - ops::DistributedFusedLambInitOpKernel, - float) {} diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu deleted file mode 100644 index 8841544366e87c..00000000000000 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ /dev/null @@ -1,797 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
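Note on these kernel-launch hunks: they settle on a single 1024-thread block for both CUDA and HIP, removing the `#ifdef`-guarded 256-thread branches, and keep the usual ceiling-division grid computation. A sketch of that arithmetic (`kMaxGridDimX` is an illustrative clamp, not Paddle's constant):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

constexpr int64_t kMaxGridDimX = 2147483647;  // typical CUDA limit on grid.x

struct LaunchConfig {
  int threads;
  int64_t blocks;
};

LaunchConfig MakeLaunchConfig(int64_t numel) {
  const int threads = 1024;  // one block size for both CUDA and HIP now
  int64_t blocks = (numel + threads - 1) / threads;  // ceil(numel / threads)
  blocks = std::min(std::max<int64_t>(blocks, 1), kMaxGridDimX);
  return {threads, blocks};
}

int main() {
  assert(MakeLaunchConfig(1).blocks == 1);
  assert(MakeLaunchConfig(1024).blocks == 1);
  assert(MakeLaunchConfig(1025).blocks == 2);
}
```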
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/optimizers/cast_with_ptr.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/kernels/funcs/algorithm.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/tensor_to_string.h" - -namespace paddle { -namespace operators { - -using phi::funcs::FlattenToString; -using phi::funcs::ToVector; - -struct ParamGradInfo { - phi::DenseTensor *param_t{nullptr}; - phi::DenseTensor *grad_t{nullptr}; - size_t idx{0}; - size_t numel{0}; - size_t numel_with_padding{0}; - size_t numel_offset{0}; -}; - -static std::ostream &operator<<(std::ostream &os, const ParamGradInfo &info) { - return os << "{Param(" << info.param_t << "),Grad(" << info.grad_t << "),idx(" - << info.idx << "),numel(" << info.numel << "),numel_with_padding(" - << info.numel_with_padding << "),numel_offset(" << info.numel_offset - << "),padding(" << info.numel_offset + info.numel_with_padding - << "-" << info.numel_offset + info.numel << "=" - << info.numel_with_padding - info.numel << ")}"; -} - -struct ParamGradInfoNumelOffsetCompFunctor { - bool operator()(const ParamGradInfo &x, const ParamGradInfo &y) const { - return x.numel_offset < y.numel_offset; - } - - bool operator()(const ParamGradInfo &x, size_t y) const { - return x.numel_offset < y; - } - - bool operator()(size_t x, const ParamGradInfo &y) const { - return x < y.numel_offset; - } - - bool operator()(size_t x, size_t y) const { return x < y; } -}; - -static size_t GetAlignSize(size_t n, size_t alignment) { - auto remainder = n % alignment; - return remainder == 0 ? 
n : n + alignment - remainder; -} - -// Shard the ParamGradInfo list by the numel size [start_size, end_size) -// The final results should be: -// -// start_size = sum(infos[0:i].numel_with_padding) + start_numel_offset, where -// start_numel_offset <= infos[i].numel_with_padding -// -// end_size = sum(infos[0:j].numel_with_padding) + end_numel_offset, where -// end_numel_offset <= infos[j].numel_with_padding -static void GetParamGradShardInfo(const std::vector &infos, - size_t start_size, - size_t end_size, - size_t *start_idx, - size_t *end_idx, - size_t *start_numel_offset, - size_t *end_numel_offset) { - VLOG(10) << "NumelOffset: " - << string::join_strings(infos, ",", [](const ParamGradInfo &info) { - return info.numel_offset; - }); - VLOG(10) << "start_size = " << start_size << " , end_size = " << end_size; - - if (infos.empty()) { - PADDLE_ENFORCE_EQ( - start_size, - 0, - platform::errors::InvalidArgument("start_size should be 0.")); - PADDLE_ENFORCE_EQ( - end_size, - 0, - platform::errors::InvalidArgument("end_size should be 0.")); - *start_idx = 0; - *end_idx = 0; - *start_numel_offset = 0; - *end_numel_offset = 0; - return; - } - - PADDLE_ENFORCE_LT(start_size, - end_size, - platform::errors::InvalidArgument( - "start_size should be less than end_size.")); - size_t n = infos.size(); - ParamGradInfoNumelOffsetCompFunctor comp; - auto i = static_cast( - std::lower_bound(infos.begin(), infos.end(), start_size, comp) - - infos.begin()); - if (i == n || infos[i].numel_offset != start_size) { - PADDLE_ENFORCE_GT( - i, - 0, - platform::errors::InvalidArgument( - "Cannot find suitable sharding which is between [%d, %d)", - start_size, - end_size)); - --i; - } - PADDLE_ENFORCE_LT( - i, - n, - platform::errors::InvalidArgument( - "Cannot find suitable sharding which is between [%d, %d)", - start_size, - end_size)); - *start_idx = i; - *start_numel_offset = start_size - infos[i].numel_offset; - auto j = static_cast( - std::lower_bound(infos.begin(), infos.end(), end_size, comp) - - infos.begin()); - *end_idx = j - 1; - *end_numel_offset = end_size - infos[j - 1].numel_offset; - PADDLE_ENFORCE_GT(*end_numel_offset, - 0, - platform::errors::InvalidArgument( - "Internal error when sharding, this may be a bug " - "caused by empty parameter.")); - VLOG(10) << "Sharding [start_size=" << start_size << ", end_size=" << end_size - << "): " << (*start_idx) << ":" << (*start_numel_offset) << " -> " - << (*end_idx) << ":" << (*end_numel_offset); -} - -static size_t FillAlignmentPaddingInfo(std::vector *infos, - size_t alignment, - size_t nranks, - phi::DataType dtype) { - auto sizeof_dtype = phi::SizeOf(dtype); - PADDLE_ENFORCE_EQ( - alignment % sizeof_dtype, - 0, - platform::errors::InvalidArgument( - "The attr(alignment) should be exactly divided by sizeof(T) %d.", - sizeof_dtype)); - alignment /= sizeof_dtype; - - size_t total_numel_sum_with_padding = 0; - size_t n = infos->size(); - for (size_t i = 0; i < n; ++i) { - auto &info = (*infos)[i]; - size_t numel_with_padding; - if (i + 1 == n) { - // the total fused numel must be a factor of alignment * nranks - numel_with_padding = - GetAlignSize(info.numel + total_numel_sum_with_padding, - alignment * nranks) - - total_numel_sum_with_padding; - } else { - numel_with_padding = GetAlignSize(info.numel, alignment); - } - info.numel_with_padding = numel_with_padding; - info.numel_offset = total_numel_sum_with_padding; - total_numel_sum_with_padding += numel_with_padding; - } - return total_numel_sum_with_padding; -} - -template -static T 
*TensorFillConstant(const phi::GPUContext &dev_ctx, - phi::DenseTensor *tensor, - const framework::DDim &dims, - T value) { - tensor->Resize(dims); - auto *ptr = tensor->mutable_data(dev_ctx.GetPlace()); - phi::funcs::SetConstant set_constant; - set_constant(dev_ctx, tensor, value); - return ptr; -} - -static phi::DenseTensor CastDataForInitedTensor(const phi::GPUContext &dev_ctx, - phi::DenseTensor *origin, - phi::DenseTensor *fused_out, - size_t numel_offset) { - PADDLE_ENFORCE_EQ(origin->IsInitialized(), - true, - platform::errors::InvalidArgument( - "The tensor to be cast should be initialized.")); - - PADDLE_ENFORCE_EQ(fused_out->dtype(), - phi::DataType::FLOAT32, - platform::errors::InvalidArgument( - "The dst tensor to be cast should be FP32 tensor.")); - PADDLE_ENFORCE_EQ(origin->dtype(), - phi::DataType::FLOAT16, - platform::errors::InvalidArgument( - "The src tensor to be cast should be FP16 tensor.")); - auto *dst = fused_out->data() + numel_offset; - auto *src = origin->data(); - auto numel = origin->numel(); - LaunchCastKernel(dev_ctx, src, dst, numel); - VLOG(10) << "Cast from FP32 -> FP16, range: [" << numel_offset << ", " - << numel_offset + numel << ")" - << " , total: [0, " << fused_out->numel() << ")"; - framework::DDim fused_out_dim = fused_out->dims(); - auto fused_out_numel = fused_out->numel(); - fused_out->Resize({fused_out_numel}); - auto sliced_tensor = fused_out->Slice(numel_offset, numel + numel_offset); - fused_out->Resize(fused_out_dim); - return sliced_tensor; -} - -static phi::DenseTensor CopyAndShareBufferForInitedTensor( - phi::DenseTensor *origin, - phi::DenseTensor *fused_out, - size_t numel_offset, - gpuStream_t stream) { - PADDLE_ENFORCE_EQ( - origin->IsInitialized(), - true, - platform::errors::InvalidArgument( - "The tensor to be copied and shared data should be initialized.")); - auto dtype = fused_out->type(); - PADDLE_ENFORCE_EQ(origin->type(), - dtype, - platform::errors::InvalidArgument( - "The tensor to be copied and shared data should be " - "have the same data type.")); - auto place = fused_out->place(); - PADDLE_ENFORCE_EQ( - origin->place(), - place, - platform::errors::InvalidArgument("The tensor to be copied and shared " - "data should be have the same place.")); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(place), - true, - platform::errors::InvalidArgument( - "The tensor to be copied and shared data should be on GPU place.")); - - auto numel = origin->numel(); - framework::DDim fused_out_dim = fused_out->dims(); - auto fused_out_numel = fused_out->numel(); - auto sliced_tensor = fused_out->Resize({fused_out_numel}) - .Slice(numel_offset, numel + numel_offset); - memory::Copy(place, - sliced_tensor.data(), - place, - origin->data(), - numel * phi::SizeOf(dtype), - stream); - origin->ShareBufferWith(sliced_tensor); - fused_out->Resize(fused_out_dim); - VLOG(10) << "Copy and share buffer, range: [" << numel_offset << ", " - << numel_offset + numel << ") , total: [0, " << fused_out->numel() - << ") , dtype = " << dtype; - return sliced_tensor; -} - -static void ShareBufferForNonInitedTensor(phi::DenseTensor *origin, - phi::DenseTensor *fused_out, - size_t numel_offset, - const framework::DDim &dims) { - PADDLE_ENFORCE_EQ( - origin->IsInitialized(), - false, - platform::errors::InvalidArgument( - "The tensor to be shared data should not be initialized.")); - - framework::DDim fused_out_dim = fused_out->dims(); - auto fused_out_numel = fused_out->numel(); - auto numel = phi::product(dims); - *origin = fused_out->Resize({fused_out_numel}) - 
.Slice(numel_offset, numel + numel_offset); - origin->Resize(dims); - fused_out->Resize(fused_out_dim); - VLOG(10) << "Share buffer for non-inited, range: [" << numel_offset << ", " - << numel_offset + numel << "), total: [0, " << fused_out->numel() - << ") , dtype = " << fused_out->dtype(); -} - -template -static void CopyVectorToCPUTensor(const std::vector &src, - phi::DenseTensor *dst) { - dst->Resize({static_cast(src.size())}); - T *dst_ptr = dst->mutable_data(platform::CPUPlace()); - const T *src_ptr = src.data(); - auto nbytes = src.size() * sizeof(T); - std::memcpy(dst_ptr, src_ptr, nbytes); -} - -static size_t ReorderParamGradInfoList(const std::vector &flags, - std::vector *infos) { - size_t n = infos->size(); - std::vector cur_flags; - cur_flags.reserve(n); - for (size_t i = 0; i < n; ++i) { - auto idx = (*infos)[i].idx; - cur_flags.push_back(flags[idx]); - } - - auto origin_infos = *infos; - size_t j = 0; - for (size_t i = 0; i < n; ++i) { - if (cur_flags[i]) { - (*infos)[j] = origin_infos[i]; - ++j; - } - } - size_t ret_idx = j; - - for (size_t i = 0; i < n; ++i) { - if (!cur_flags[i]) { - (*infos)[j] = origin_infos[i]; - ++j; - } - } - return ret_idx; -} - -template -static T ClipByBound(T x, T low_value, T high_value) { - if (x < low_value) return low_value; - if (x > high_value) return high_value; - return x; -} - -template -class DistributedFusedLambInitOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - VLOG(10) << "starts to run DistributedFusedLambInitOp"; - auto &dev_ctx = ctx.template device_context(); - auto place = ctx.GetPlace(); - auto stream = dev_ctx.stream(); - - // Step 1: Check Input(Param) and Output(ParamOut), Input(Grad) and - // Output(GradOut) - auto params = ctx.MultiInput("Param"); - auto grads = ctx.MultiInput("Grad"); - auto master_params = ctx.MultiOutput("MasterParamOut"); - std::vector fp32_infos, fp16_infos; - { - PADDLE_ENFORCE_EQ(params.size(), - grads.size(), - platform::errors::InvalidArgument( - "The parameter number and parameter gradient " - "number should be the same.")); - - auto params_out = ctx.MultiOutput("ParamOut"); - auto grads_out = ctx.MultiOutput("GradOut"); - PADDLE_ENFORCE_EQ( - params.size(), - params_out.size(), - platform::errors::InvalidArgument("Input(Param) and Output(ParamOut) " - "should have the same number.")); - PADDLE_ENFORCE_EQ( - grads.size(), - grads_out.size(), - platform::errors::InvalidArgument( - "Input(Grad) and Output(GradOut) should have the same number.")); - size_t n = params.size(); - VLOG(10) << "parameter number: " << n; - for (size_t i = 0; i < n; ++i) { - auto *p = params[i]; - auto *g = grads[i]; - auto *p_out = params_out[i]; - auto *g_out = grads_out[i]; - - PADDLE_ENFORCE_NOT_NULL( - p, - platform::errors::InvalidArgument( - "The %d-th parameter should not be nullptr.", i)); - PADDLE_ENFORCE_EQ(p->IsInitialized(), - true, - platform::errors::InvalidArgument( - "The %d-th parameter should be initialized.", i)); - PADDLE_ENFORCE_EQ( - p->place(), - place, - platform::errors::InvalidArgument( - "The %d-th parameter is not initialized on the right place.", - i)); - PADDLE_ENFORCE_EQ(p, - p_out, - platform::errors::InvalidArgument( - "The %d-th Input(Param) and Output(ParamOut) " - "should be the same tensor.", - i)); - - auto dtype = p->dtype(); - PADDLE_ENFORCE_NOT_NULL( - g, - platform::errors::InvalidArgument( - "The %d-th gradient should not be nullptr.", i)); - PADDLE_ENFORCE_EQ(g, - g_out, - 
platform::errors::InvalidArgument( - "The %d-th Input(Grad) and Output(Grad) should " - "be the same tensor.")); - auto numel = p->numel(); - PADDLE_ENFORCE_GT(numel, - 0, - platform::errors::InvalidArgument( - "The %d-th Input(Param) have no elements.")); - - void *g_data = nullptr; - if (g->IsInitialized()) { - PADDLE_ENFORCE_EQ(g->dtype(), - dtype, - platform::errors::InvalidArgument( - "The %d-th Input(Param) and Input(Grad) should " - "have the same data type %s.", - i, - dtype)); - PADDLE_ENFORCE_EQ(g->dims(), - p->dims(), - platform::errors::InvalidArgument( - "The %d-th Input(Param) and Input(Grad) should " - "have the same shape.", - i)); - g_data = g_out->data(); - } - - ParamGradInfo *info; - if (dtype == phi::DataType::FLOAT32) { - fp32_infos.emplace_back(); - info = &fp32_infos.back(); - } else if (dtype == phi::DataType::FLOAT16) { - fp16_infos.emplace_back(); - info = &fp16_infos.back(); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported data type %s.", dtype)); - } - - VLOG(10) << "Found " << dtype << " parameter " << i << " shape=[" - << p_out->dims() << "] numel=" << numel - << " grad.IsInitialized()=" - << (g_out->IsInitialized() ? "true" : "false"); - - info->param_t = p_out; - info->grad_t = g_out; - info->idx = i; - info->numel = numel; - info->numel_with_padding = 0; // not determined yet - info->numel_offset = 0; // not determined yet - } - } - const auto &apply_weight_decay = - ctx.Attr>("apply_weight_decay"); - size_t fp32_wd_end_idx = - ReorderParamGradInfoList(apply_weight_decay, &fp32_infos); - size_t fp16_wd_end_idx = - ReorderParamGradInfoList(apply_weight_decay, &fp16_infos); - - auto *param_order_t = ctx.Output("ParamOrder"); - auto param_num = fp32_infos.size() + fp16_infos.size(); - param_order_t->Resize({static_cast(param_num)}); - auto *param_order = param_order_t->mutable_data(platform::CPUPlace()); - for (size_t i = 0; i < fp32_infos.size(); ++i) { - param_order[i] = static_cast(fp32_infos[i].idx); - } - for (size_t i = 0; i < fp16_infos.size(); ++i) { - param_order[i + fp32_infos.size()] = static_cast(fp16_infos[i].idx); - } - - VLOG(10) << "Fill ParamGradInfo ends"; - - // Step 2: determine the numel_with_padding and numel_offset - auto rank = ctx.Attr("rank"); - auto nranks = ctx.Attr("nranks"); - auto alignment = ctx.Attr("alignment"); - VLOG(10) << "rank = " << rank << ", nranks = " << nranks - << " , alignment = " << alignment; - if (alignment <= 0) { - alignment = platform::GpuMinChunkSize(); - } - PADDLE_ENFORCE_GE(alignment, - 1, - platform::errors::InvalidArgument( - "The attr(alignment) should be larger than 0.")); - PADDLE_ENFORCE_EQ(alignment & (alignment - 1), - 0, - platform::errors::InvalidArgument( - "The attr(alignment) should be the power of 2.")); - PADDLE_ENFORCE_GE( - rank, - 0, - platform::errors::InvalidArgument( - "The attr(rank) should be equal to or larger than 0.")); - PADDLE_ENFORCE_LT( - rank, - nranks, - platform::errors::InvalidArgument( - "The attr(rank) should be less than the attr(nranks).")); - // NOTE: We guarantee that both fp32_numel and fp16_numel can be exactly - // divided by alignment and nranks. 
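Note on the padding logic in this deleted initializer (the fluid registration is removed above, so the kernel has presumably been migrated elsewhere, e.g. phi): each parameter's numel is rounded up to `alignment`, and the final parameter absorbs extra padding so the fused total divides evenly by `alignment * nranks`; the power-of-two check uses the `alignment & (alignment - 1)` idiom. A self-contained sketch of that arithmetic, mirroring `GetAlignSize`/`FillAlignmentPaddingInfo`:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

size_t GetAlignSize(size_t n, size_t alignment) {
  size_t remainder = n % alignment;
  return remainder == 0 ? n : n + alignment - remainder;
}

size_t TotalNumelWithPadding(const std::vector<size_t>& numels,
                             size_t alignment, size_t nranks) {
  assert((alignment & (alignment - 1)) == 0);  // power-of-two check, as above
  size_t total = 0;
  for (size_t i = 0; i < numels.size(); ++i) {
    size_t padded = (i + 1 == numels.size())
        // Last entry absorbs the padding that makes the sum shardable.
        ? GetAlignSize(numels[i] + total, alignment * nranks) - total
        : GetAlignSize(numels[i], alignment);
    total += padded;
  }
  return total;
}

int main() {
  // Two params of 100 and 30 elements, alignment 64, 2 ranks:
  // 100 -> 128, then 128 + 30 = 158 -> rounded up to 256 (multiple of 128).
  assert(TotalNumelWithPadding({100, 30}, 64, 2) == 256);
  assert(TotalNumelWithPadding({100, 30}, 64, 2) % (64 * 2) == 0);
}
```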
- auto fp32_numel = FillAlignmentPaddingInfo( - &fp32_infos, alignment, nranks, phi::DataType::FLOAT32); - VLOG(10) << "FP32 ParamGradInfo: " << string::join_strings(fp32_infos, " "); - auto fp16_numel = FillAlignmentPaddingInfo( - &fp16_infos, alignment, nranks, phi::DataType::FLOAT16); - VLOG(10) << "FP16 ParamGradInfo: " << string::join_strings(fp16_infos, " "); - auto total_numel = fp32_numel + fp16_numel; - PADDLE_ENFORCE_LT( - total_numel, - std::numeric_limits::max(), - platform::errors::InvalidArgument("Too many parameter number.")); - - auto fp32_numel_each_device = fp32_numel / nranks; - auto fp16_numel_each_device = fp16_numel / nranks; - auto numel_each_device = fp32_numel_each_device + fp16_numel_each_device; - VLOG(10) << "Fill padding ends. total_numel = " << total_numel - << ", fp32_numel = " << fp32_numel - << ", fp16_numel = " << fp16_numel - << ", fp32_numel_each_device = " << fp32_numel_each_device - << ", fp16_numel_each_device = " << fp16_numel_each_device; - - // Step 3: allocate output tensor and do initialization - float *fused_fp32_param = nullptr, *fused_fp32_grad = nullptr; - platform::float16 *fused_fp16_param = nullptr, *fused_fp16_grad = nullptr; - phi::DenseTensor *fp32_p_t = nullptr, *fp16_p_t = nullptr, - *fp32_g_t = nullptr, *fp16_g_t = nullptr; - std::vector fp16_master_params; - if (total_numel > 0) { - fp32_p_t = ctx.Output("FP32FusedParam"); - fused_fp32_param = TensorFillConstant( - dev_ctx, fp32_p_t, {static_cast(total_numel)}, 0.0f); - } - - if (fp32_numel > 0) { - fp32_g_t = ctx.Output("FP32FusedGrad"); - fused_fp32_grad = TensorFillConstant( - dev_ctx, fp32_g_t, {static_cast(fp32_numel)}, 0.0f); - } - - if (fp16_numel > 0) { - fp16_p_t = ctx.Output("FP16FusedParam"); - fused_fp16_param = TensorFillConstant( - dev_ctx, - fp16_p_t, - {static_cast(fp16_numel)}, - static_cast(0)); - - fp16_g_t = ctx.Output("FP16FusedGrad"); - fused_fp16_grad = TensorFillConstant( - dev_ctx, - fp16_g_t, - {static_cast(fp16_numel)}, - static_cast(0)); - } - VLOG(10) << "Allocate FP32FusedParam/Grad, FP16FusedParam/Grad ends"; - - // (1) For FP32FusedParam, memcpy for fp32 param and then share data, cast - // for fp16 master weight - // (2) For FP16FusedParam, memcpy and then share data - // (3) For FP32FusedGrad/FP16FusedGrad, memcpy if gradient has been inited - for (const auto &info : fp32_infos) { - auto sliced_tensor = CopyAndShareBufferForInitedTensor( - info.param_t, fp32_p_t, info.numel_offset, stream); - master_params[info.idx]->Resize(info.param_t->dims()); - master_params[info.idx]->ShareBufferWith(sliced_tensor); - PADDLE_ENFORCE_EQ(master_params[info.idx]->mutable_data(place), - sliced_tensor.data(), - platform::errors::InvalidArgument( - "Invalid master weight tensor pointer.")); - if (info.grad_t->IsInitialized()) { - CopyAndShareBufferForInitedTensor( - info.grad_t, fp32_g_t, info.numel_offset, stream); - } else { - ShareBufferForNonInitedTensor( - info.grad_t, fp32_g_t, info.numel_offset, info.param_t->dims()); - } - } - - size_t fp16_numel_offset = 0; - if (fp32_numel > 0) { - auto last_fp32_info = fp32_infos.back(); - fp16_numel_offset = - last_fp32_info.numel_offset + last_fp32_info.numel_with_padding; - } - - for (const auto &info : fp16_infos) { - auto master_weight_offset = info.numel_offset + fp16_numel_offset; - auto sliced_tensor = CastDataForInitedTensor( - dev_ctx, info.param_t, fp32_p_t, master_weight_offset); - master_params[info.idx]->Resize(info.param_t->dims()); - master_params[info.idx]->ShareBufferWith(sliced_tensor); - - 
CopyAndShareBufferForInitedTensor( - info.param_t, fp16_p_t, info.numel_offset, stream); - PADDLE_ENFORCE_EQ(master_params[info.idx]->mutable_data(place), - sliced_tensor.data(), - platform::errors::InvalidArgument( - "Invalid master weight tensor pointer.")); - - if (info.grad_t->IsInitialized()) { - CopyAndShareBufferForInitedTensor( - info.grad_t, fp16_g_t, info.numel_offset, stream); - } else { - ShareBufferForNonInitedTensor( - info.grad_t, fp16_g_t, info.numel_offset, info.param_t->dims()); - } - } - VLOG(10) << "Copy/share data for Param/Grad ends"; - - // Step 4: For Moment1, Moment2, Beta1Pow, Beta2Pow, just fill constant - TensorFillConstant(dev_ctx, - ctx.Output("Moment1"), - {static_cast(numel_each_device)}, - 0.0f); - TensorFillConstant(dev_ctx, - ctx.Output("Moment2"), - {static_cast(numel_each_device)}, - 0.0f); - TensorFillConstant(dev_ctx, - ctx.Output("Beta1Pow"), - {1}, - ctx.Attr("beta1")); - TensorFillConstant(dev_ctx, - ctx.Output("Beta2Pow"), - {1}, - ctx.Attr("beta2")); - VLOG(10) << "Init Moment and BetaPow ends"; - - // Step 5: Do sharding - size_t fp32_start_idx, fp32_end_idx, fp32_start_numel_offset, - fp32_end_numel_offset; - GetParamGradShardInfo(fp32_infos, - rank * fp32_numel_each_device, - (rank + 1) * fp32_numel_each_device, - &fp32_start_idx, - &fp32_end_idx, - &fp32_start_numel_offset, - &fp32_end_numel_offset); - size_t fp16_start_idx, fp16_end_idx, fp16_start_numel_offset, - fp16_end_numel_offset; - GetParamGradShardInfo(fp16_infos, - rank * fp16_numel_each_device, - (rank + 1) * fp16_numel_each_device, - &fp16_start_idx, - &fp16_end_idx, - &fp16_start_numel_offset, - &fp16_end_numel_offset); - size_t fp32_local_param_num = - fp32_numel_each_device > 0 ? fp32_end_idx - fp32_start_idx + 1 : 0; - size_t fp16_local_param_num = - fp16_numel_each_device > 0 ? 
fp16_end_idx - fp16_start_idx + 1 : 0; - size_t total_local_param_num = fp32_local_param_num + fp16_local_param_num; - VLOG(10) << "Found the sharding arguments"; - - auto *param_info_t = ctx.Output("ParamInfo"); - param_info_t->Resize({8}); - auto *param_info = param_info_t->mutable_data(platform::CPUPlace()); - param_info[0] = static_cast(fp32_start_idx); - param_info[1] = static_cast(fp32_local_param_num); - param_info[2] = static_cast(fp32_infos.size()); - param_info[3] = ClipByBound(fp32_wd_end_idx, - fp32_start_idx, - fp32_start_idx + fp32_local_param_num) - - static_cast(fp32_start_idx); - param_info[4] = static_cast(fp16_start_idx + fp32_infos.size()); - param_info[5] = static_cast(fp16_local_param_num); - param_info[6] = static_cast(fp16_infos.size()); - param_info[7] = ClipByBound(fp16_wd_end_idx, - fp16_start_idx, - fp16_start_idx + fp16_local_param_num) - - static_cast(fp16_start_idx); - - VLOG(10) << "Start FP32 idx: " << param_info[0]; - VLOG(10) << "Local FP32 param num: " << param_info[1]; - VLOG(10) << "Global FP32 param num: " << param_info[2]; - - VLOG(10) << "Start FP16 idx: " << param_info[4]; - VLOG(10) << "Local FP16 param num: " << param_info[5]; - VLOG(10) << "Global FP16 param num: " << param_info[6]; - - std::vector numel_offsets; - numel_offsets.reserve(params.size() + 1); - for (const auto &info : fp32_infos) { - numel_offsets.push_back(info.numel_offset); - } - for (const auto &info : fp16_infos) { - numel_offsets.push_back(info.numel_offset + fp16_numel_offset); - } - numel_offsets.push_back(fp32_numel + fp16_numel); - PADDLE_ENFORCE_EQ(numel_offsets.size(), - params.size() + 1, - platform::errors::InvalidArgument( - "The numel_offsets number must be one larger than " - "the parameter number.")); - VLOG(10) << "Total numel offset: " << FlattenToString(numel_offsets); - - std::vector fp32_partial_numel_offsets; - fp32_partial_numel_offsets.reserve(fp32_local_param_num + 1); - fp32_partial_numel_offsets.push_back(0); - // Fill the partial_numel_offsets - for (size_t i = fp32_start_idx; i < fp32_start_idx + fp32_local_param_num; - ++i) { - size_t valid_start_n = 0; - if (i == fp32_start_idx) { - valid_start_n = fp32_start_numel_offset; - } - - size_t end_n = fp32_infos[i].numel_with_padding; - if (i + 1 == fp32_start_idx + fp32_local_param_num) { - end_n = std::min(end_n, fp32_end_numel_offset); - } - - PADDLE_ENFORCE_NE(valid_start_n, - end_n, - platform::errors::InvalidArgument( - "Indices sharding error. This may be a bug.")); - VLOG(10) << "FP32 Partial numel = [" - << valid_start_n + fp32_infos[i].numel << "," - << end_n + fp32_infos[i].numel; - auto len = end_n - valid_start_n; - fp32_partial_numel_offsets.push_back(fp32_partial_numel_offsets.back() + - len); - } - - std::vector fp16_partial_numel_offsets; - fp16_partial_numel_offsets.reserve(fp16_local_param_num + 1); - fp16_partial_numel_offsets.push_back(0); - for (size_t i = fp16_start_idx; i < fp16_start_idx + fp16_local_param_num; - ++i) { - size_t valid_start_n = 0; - if (i == fp16_start_idx) { - valid_start_n = fp16_start_numel_offset; - } - - size_t end_n = fp16_infos[i].numel_with_padding; - if (i + 1 == fp16_start_idx + fp16_local_param_num) { - end_n = std::min(end_n, fp16_end_numel_offset); - } - - PADDLE_ENFORCE_NE(valid_start_n, - end_n, - platform::errors::InvalidArgument( - "Indices sharding error. 
This may be a bug.")); - auto len = end_n - valid_start_n; - fp16_partial_numel_offsets.push_back(fp16_partial_numel_offsets.back() + - len); - } - - CopyVectorToCPUTensor(numel_offsets, - ctx.Output("FusedParamOffsets")); - CopyVectorToCPUTensor( - fp32_partial_numel_offsets, - ctx.Output("FP32ShardFusedParamOffsets")); - CopyVectorToCPUTensor( - fp16_partial_numel_offsets, - ctx.Output("FP16ShardFusedParamOffsets")); - - auto *global_scale = ctx.Output("GlobalScale"); - if (!global_scale->IsInitialized()) { - TensorFillConstant(dev_ctx, global_scale, {1}, 1.0f); - } - VLOG(10) << "Init global scale ends"; - - TensorFillConstant(dev_ctx, - ctx.Output("Step"), - {1}, - static_cast(0)); - - dev_ctx.Wait(); - VLOG(10) << "Wait for H2D copy"; - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -PD_REGISTER_STRUCT_KERNEL(distributed_fused_lamb_init, - GPU, - ALL_LAYOUT, - ops::DistributedFusedLambInitOpKernel, - float) {} diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.h b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.h deleted file mode 100644 index 7c314cd9e37908..00000000000000 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
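Note: among the helpers removed with this file is `CopyVectorToCPUTensor`, a thin resize-plus-memcpy wrapper used above for the offset tensors. An equivalent standalone version (`CpuTensor` is a stand-in for a CPU-place `phi::DenseTensor`):

```cpp
#include <cassert>
#include <cstring>
#include <vector>

template <typename T>
struct CpuTensor {
  std::vector<T> storage;
  void Resize(size_t n) { storage.resize(n); }
  T* mutable_data() { return storage.data(); }
};

// Resize the destination to the vector's length, then copy the raw bytes.
template <typename T>
void CopyVectorToCPUTensor(const std::vector<T>& src, CpuTensor<T>* dst) {
  dst->Resize(src.size());
  std::memcpy(dst->mutable_data(), src.data(), src.size() * sizeof(T));
}

int main() {
  std::vector<int> offsets = {0, 128, 256};
  CpuTensor<int> t;
  CopyVectorToCPUTensor(offsets, &t);
  assert(t.storage[2] == 256);
}
```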
- -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -template -class DistributedFusedLambInitOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "The distributed_fused_lamb_init operator does not support CPU yet.")); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index cad7e38ba1c1a3..fdec898edbe914 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -170,11 +170,7 @@ static void MultiTensorL2Norm(const phi::GPUPlace &place, constexpr int kNumTensor = MaxTensorNumPerLaunch; constexpr int kNumChunk = MaxChunkNumPerLaunch; -#ifdef PADDLE_WITH_HIP - constexpr int kBlockDim = 256; -#else constexpr int kBlockDim = 512; -#endif int max_chunk_num = -1; int vec_size = 8; @@ -812,11 +808,7 @@ static void MultiTensorUpdateLambParamAndBetaPows( phi::errors::InvalidArgument("Beta2Pow should be nullptr.")); } -#ifdef PADDLE_WITH_HIP - const int block_dim = 256; -#else const int block_dim = 512; -#endif int vec_size = 8; for (int i = 0; i < n; ++i) { diff --git a/paddle/fluid/operators/pscore/CMakeLists.txt b/paddle/fluid/operators/pscore/CMakeLists.txt index 5a397699951ca0..dea89806bc202c 100755 --- a/paddle/fluid/operators/pscore/CMakeLists.txt +++ b/paddle/fluid/operators/pscore/CMakeLists.txt @@ -37,15 +37,9 @@ else() ps_framework_proto framework_proto sendrecv_rpc - brpc - leveldb - ssl - crypto - protobuf + ${EXTERNAL_BRPC_DEPS} phi - glog zlib - snappy device_context) endif() diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index bc1f5a0d34f60e..2450d64542a780 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -101,7 +101,9 @@ PD_REGISTER_KERNEL(save, int16_t, int64_t, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(save_sr, CPU, @@ -115,7 +117,9 @@ PD_REGISTER_KERNEL(save_sr, int16_t, int64_t, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(save, @@ -130,7 +134,9 @@ PD_REGISTER_KERNEL(save, int16_t, int64_t, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(save_sr, GPU, @@ -144,5 +150,7 @@ PD_REGISTER_KERNEL(save_sr, int16_t, int64_t, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} #endif diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h index 0ebac190cda310..1a126b0a8853cc 100644 --- a/paddle/fluid/operators/tdm_child_op.h +++ b/paddle/fluid/operators/tdm_child_op.h @@ -21,9 +21,9 @@ #include #include -#include "gflags/gflags.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/mixed_vector.h" +#include "paddle/utils/flags.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/tdm_sampler_op.h 
b/paddle/fluid/operators/tdm_sampler_op.h index 4baff820784713..f3b55c4a5cc34b 100644 --- a/paddle/fluid/operators/tdm_sampler_op.h +++ b/paddle/fluid/operators/tdm_sampler_op.h @@ -21,10 +21,10 @@ #include #include -#include "gflags/gflags.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/sampler.h" #include "paddle/phi/core/mixed_vector.h" +#include "paddle/utils/flags.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/platform/cpu_info_test.cc b/paddle/fluid/platform/cpu_info_test.cc index 6f7ccd03cff2f2..fef343112dc030 100644 --- a/paddle/fluid/platform/cpu_info_test.cc +++ b/paddle/fluid/platform/cpu_info_test.cc @@ -15,10 +15,10 @@ #include -#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/core/flags.h" +#include "paddle/utils/flags.h" PHI_DECLARE_double(fraction_of_cpu_memory_to_use); diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index 5799863f0aad70..d15f67e9965fa8 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -19,7 +19,7 @@ #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/core/flags.h" -DECLARE_bool(use_stream_safe_cuda_allocator); +PD_DECLARE_bool(use_stream_safe_cuda_allocator); PHI_DECLARE_bool(new_executor_use_cuda_graph); namespace paddle { diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 10ba4ce6e1d127..ba2494a2f4891b 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -20,7 +20,6 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/enforce.h" @@ -32,6 +31,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/utils/flags.h" #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index d253a92c986cee..98c6e379342f25 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -32,14 +32,9 @@ #include "paddle/fluid/platform/device_context.h" -#ifdef __HIPCC__ -// HIP results in error or nan if > 256 -#define PREDEFINED_BLOCK_SIZE 256 -#else /* CUDA performs better as thread_per_block num is between [64, 512] */ #define PREDEFINED_BLOCK_SIZE 512 -#endif namespace paddle { namespace platform { @@ -58,11 +53,7 @@ static inline int RoundToPowerOfTwo(int n) { n |= (n >> 4); n |= (n >> 8); n |= (n >> 16); -#ifdef __HIPCC__ - return std::min(256, std::max(32, (n + 1))); -#else return std::min(1024, std::max(32, (n + 1))); -#endif } #ifdef WITH_NV_JETSON diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index 3193d5d881989c..374166faeb6f7e 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -14,7 +14,6 @@ limitations under the License. 
*/ #include #include -#include "gflags/gflags.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/enforce_xpu.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" @@ -23,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/xpu/xpu_info.h" +#include "paddle/utils/flags.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 40f69a87f37f86..94a96b67cd6b1c 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/utils/flags.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 425d4939b565f5..d9c9398461d5c7 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -60,10 +60,10 @@ limitations under the License. */ #endif #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/utils/flags.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/to_string.h" diff --git a/paddle/fluid/platform/flags.h b/paddle/fluid/platform/flags.h index b08aececd2b816..66d8f9557ef189 100644 --- a/paddle/fluid/platform/flags.h +++ b/paddle/fluid/platform/flags.h @@ -23,7 +23,7 @@ #define __PADDLE_DEFINE_EXPORTED_FLAG( \ __name, __is_writable, __cpp_type, __gflag_type, __default_value, __doc) \ - DEFINE_##__gflag_type(__name, __default_value, __doc); /* NOLINT */ \ + PD_DEFINE_##__gflag_type(__name, __default_value, __doc); /* NOLINT */ \ struct __PaddleRegisterFlag_##__name { \ __PaddleRegisterFlag_##__name() { \ using FlagDeclaredType = \ diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 2ae413db5e6caf..4e0803c59ce598 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -57,6 +57,11 @@ limitations under the License. */ #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/custom_kernel.h" +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) +#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" +#endif + PHI_DECLARE_int32(paddle_num_threads); PADDLE_DEFINE_EXPORTED_int32( multiple_of_cupti_buffer_size, @@ -97,8 +102,8 @@ bool InitGflags(std::vector args) { << ", Init commandline: " << line; char **arr = argv.data(); - ::GFLAGS_NAMESPACE::AllowCommandLineReparsing(); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &arr, true); + paddle::flags::AllowUndefinedFlags(); + paddle::flags::ParseCommandLineFlags(&argc, &arr); successed = true; VLOG(1) << "After Parse: argc is " << argc; @@ -440,6 +445,41 @@ void InitMemoryMethod() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage; #endif + +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) + // TODO(GhostScreaming): Use phi methods later. 
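Note on the `InitGflags` hunk below: `::GFLAGS_NAMESPACE::AllowCommandLineReparsing()` / `ParseCommandLineFlags(&argc, &arr, true)` are replaced by `paddle::flags::AllowUndefinedFlags()` / `ParseCommandLineFlags(&argc, &arr)`. The toy parser here only illustrates the lenient-parsing contract that call sequence relies on; it is not the real paddle_flags implementation:

```cpp
#include <iostream>
#include <map>
#include <string>

std::map<std::string, std::string> g_flags = {{"paddle_num_threads", "1"}};
bool g_allow_undefined = false;

void AllowUndefinedFlags() { g_allow_undefined = true; }

bool ParseCommandLineFlags(int argc, char** argv) {
  for (int i = 1; i < argc; ++i) {
    std::string arg = argv[i];
    if (arg.rfind("--", 0) != 0) continue;      // not a flag argument
    size_t eq = arg.find('=');
    std::string name = arg.substr(2, eq - 2);   // text between "--" and "="
    if (g_flags.count(name) == 0) {
      if (!g_allow_undefined) return false;     // strict mode: reject
      continue;                                 // lenient mode: skip silently
    }
    g_flags[name] = (eq == std::string::npos) ? "true" : arg.substr(eq + 1);
  }
  return true;
}

int main(int argc, char** argv) {
  AllowUndefinedFlags();  // unknown flags are ignored instead of fatal
  ParseCommandLineFlags(argc, argv);
  std::cout << "paddle_num_threads=" << g_flags["paddle_num_threads"] << "\n";
}
```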
+ memory_method->get_allocator = + [](int device_id, phi::gpuStream_t stream) -> phi::Allocator * { + return paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::GPUPlace(device_id), stream) + .get(); + }; + memory_method->get_host_allocator = []() -> phi::Allocator * { + return paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::CPUPlace()) + .get(); + }; + memory_method->get_zero_allocator = [](int device_id) -> phi::Allocator * { + return paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(phi::GPUPlace(device_id)) + .get(); + }; + memory_method->get_host_zero_allocator = []() -> phi::Allocator * { + return paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(phi::CPUPlace()) + .get(); + }; + memory_method->get_pinned_allocator = []() -> phi::Allocator * { + return paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(phi::GPUPinnedPlace()) + .get(); + }; + memory_method->get_new_cuda_event = [](int device_id) { + return paddle::platform::CudaEventResourcePool::Instance().New(device_id); + }; +#endif + memory_method->emplace_device_contexts = paddle::platform::EmplaceDeviceContexts; memory_method->init_devices = InitDevices; diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h index 2d5c34002b4bb9..4ff99f2866e605 100644 --- a/paddle/fluid/platform/init.h +++ b/paddle/fluid/platform/init.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/utils/flags.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 54d27cbd7b7557..efee8a264bc636 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -38,7 +38,9 @@ PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); -DEFINE_bool(enable_record_memory, false, "enable memory recorder"); // NOLINT +PD_DEFINE_bool(enable_record_memory, + false, + "enable memory recorder"); // NOLINT #if defined(_WIN32) && defined(PHI_SHARED) phi::ProfilerState phi::ProfilerHelper::g_state = phi::ProfilerState::kDisabled; diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index 6907ccd3b044bf..56fe468838b836 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -17,17 +17,9 @@ #include "glog/logging.h" #include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/common_event.h" #include "paddle/fluid/platform/profiler/host_event_recorder.h" -// Used to filter events, works like glog VLOG(level). -// RecordEvent will works if host_trace_level >= level. 
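Note on the `InitMemoryMethod` additions above: capture-less lambdas are registered into a method table so the lower layer (phi) can reach fluid's allocator facade without a link-time dependency on it. A stripped-down sketch of the same dependency inversion (all types illustrative):

```cpp
#include <functional>
#include <iostream>

struct Allocator {
  const char* name;
};

// Lower layer owns only this hook table; it never includes the upper layer.
struct MemoryMethods {
  std::function<Allocator*(int /*device_id*/)> get_allocator;
  std::function<Allocator*()> get_host_allocator;
};

// Upper layer's facade, standing in for AllocatorFacade::Instance().
Allocator* FacadeGetDeviceAllocator(int device_id) {
  static Allocator gpu{"gpu_allocator"};
  (void)device_id;
  return &gpu;
}

int main() {
  MemoryMethods methods;
  // Registration mirrors memory_method->get_allocator = [](...) { ... };
  methods.get_allocator = [](int device_id) -> Allocator* {
    return FacadeGetDeviceAllocator(device_id);
  };
  methods.get_host_allocator = []() -> Allocator* {
    static Allocator cpu{"cpu_allocator"};
    return &cpu;
  };
  std::cout << methods.get_allocator(0)->name << "\n";
}
```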
-PADDLE_DEFINE_EXPORTED_int64(host_trace_level, - 1, - "RecordEvent will works " - "if host_trace_level >= level."); - namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 78e6443cbac6d6..4f58b0e3ccef46 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/cuda_tracer.h" #include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h" #include "paddle/fluid/platform/profiler/extra_info.h" @@ -36,6 +37,13 @@ #include "paddle/phi/backends/device_manager.h" #endif +// Used to filter events, works like glog VLOG(level). +// RecordEvent will works if host_trace_level >= level. +PADDLE_DEFINE_EXPORTED_int64(host_trace_level, + 1, + "RecordEvent will works " + "if host_trace_level >= level."); + namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index 28cf7a2d385722..4ab98bab53050d 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -27,7 +27,7 @@ #include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/tracer_base.h" -DECLARE_int64(host_trace_level); +PD_DECLARE_int64(host_trace_level); namespace paddle { namespace platform { diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py index 62c777664f3038..7b3072675ec9a9 100644 --- a/paddle/fluid/primitive/codegen/gen.py +++ b/paddle/fluid/primitive/codegen/gen.py @@ -37,7 +37,15 @@ # fmt: on -VJPS = ['tanh_grad', 'mean_grad', 'add_grad', 'divide_grad', 'sum_grad'] +VJPS = [ + 'tanh_grad', + 'mean_grad', + 'add_grad', + 'divide_grad', + 'sum_grad', + 'concat_grad', + 'split_grad', +] VJP_COMPS = ['divide_grad', 'sum_grad'] BACKENDS = [ 'add_n', @@ -58,6 +66,8 @@ 'add_grad', 'divide_grad', 'sum_grad', + 'concat_grad', + 'split_grad', ] @@ -180,7 +190,11 @@ def to_apis_dict(apis): def get_inplace_api(apis): inplace_apis = [] for api in apis: - if 'inplace' in api and api['inplace'] is not None: + if ( + 'inplace' in api + and api['inplace'] is not None + and not api['name'].endswith('_') + ): inplace_api = api.copy() inplace_api['name'] = api['name'] + '_' inplace_apis.append(inplace_api) @@ -196,14 +210,13 @@ def extend_compat_info(apis, compats): apis_dict = to_apis_dict(apis) for compat_item in compats: fwd_op_name = compat_item["op"] - backward_op_names = ( - compat_item["backward"].split(',') - if 'backward' in compat_item - else [] - ) if fwd_op_name not in apis_dict: continue fwd_api = apis_dict[fwd_op_name] + backward_op_names = [] + while fwd_op_name is not None and fwd_op_name in apis_dict: + backward_op_names.append(apis_dict[fwd_op_name]['backward']) + fwd_op_name = apis_dict[fwd_op_name]['backward'] backward_apis = [] for backward_op_name in backward_op_names: if backward_op_name in apis_dict: @@ -215,6 +228,7 @@ def extend_compat_info(apis, compats): if ( 'support_tensor' in attr_info and attr_info['support_tensor'] is True + or 'tensor_name' in attr_info ): support_tensor_attrs_names.append(attr_name) if 'data_type' in attr_info: @@ -226,6 +240,8 @@ def extend_compat_info(apis, compats): if ( 'support_tensor' in attr_info and attr_info['support_tensor'] is True + or 
'tensor_name' in attr_info + or 'tensors_name' in attr_info ): support_tensor_attrs_names.append(attr_name) if len(support_tensor_attrs_names) > 0: diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 index 8e38af42f9a2e5..4c2f02224e2f70 100644 --- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 +++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 @@ -34,10 +34,9 @@ return ::{{name}}_ad_func({{common.args(input_names, attr_names)}}); {% for api in apis %} - {#- TODO(cxxly): codegen for reshape -#} - {%- if api.is_prim and api.name in backend_white_list and api.name != 'reshape' -%} -{{sig(api.name, api.inputs, api.attrs, api.outputs)}} { -{{body(api.name, api.inputs, api.attrs, api.outputs)}} + {%- if api.is_prim and api.name in backend_white_list -%} +{{sig(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}} { +{{body(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}} } {% endif %} diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 index dd567490692fb4..a5d090162d04f5 100644 --- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 +++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 @@ -53,15 +53,32 @@ template <> {%- endif -%} {% endfor %} auto op_res = paddle::dialect::{{name}}({{common.args(input_names, attr_names)}}); - {% if outputs|length > 1 %} - return std::make_tuple( + {% if outputs|length == 1 %} + {% if outputs[0].typename == 'Tensor' %} + Tensor {{outputs[0].name}}(std::make_shared(op_res)); + return {{outputs[0].name}}; + {% elif outputs[0].typename == 'Tensor []' %} + std::vector {{outputs[0].name}}(op_res.size()); + std::transform(op_res.begin(), op_res.end(), {{outputs[0].name}}.begin(), [](const OpResult& res) { + return Tensor(std::make_shared(res)); + }); + return {{outputs[0].name}}; + {% else %} {#- render nothing -#} + {% endif %} + {% elif outputs|length > 1 %} {% for i in range(outputs|length) %} - Tensor(std::make_shared(std::get<{{i}}>(op_res))){%- if i!=outputs|length - 1 -%}, {% endif %} - + auto op_res_{{i}} = std::get<{{i}}>(op_res); + {% if outputs[i].typename == 'Tensor' %} + Tensor {{outputs[i].name}}(std::make_shared(op_res_{{i}})); + {% elif outputs[i].typename == 'Tensor []' %} + std::vector {{outputs[i].name}}(op_res_{{i}}.size()); + std::transform(op_res_{{i}}.begin(), op_res_{{i}}.end(), {{outputs[i].name}}.begin(), [](const OpResult& res) { + return Tensor(std::make_shared(res)); + }); + {% else %} {#- render nothing -#} + {% endif %} {% endfor %} - ); - {% elif outputs|length == 1 %} - return Tensor(std::make_shared(op_res)); + return std::make_tuple({% for i in range(outputs|length) %}{{outputs[i].name}}{%- if i!=outputs|length - 1 -%}, {% endif %}{% endfor %}); {% else %} {#- render nothing -#} {% endif %} {% endmacro %} diff --git a/paddle/fluid/primitive/codegen/templates/primitive/primitive.h.j2 b/paddle/fluid/primitive/codegen/templates/primitive/primitive.h.j2 index ccc4b44e139746..34ee37c5898e65 100644 --- a/paddle/fluid/primitive/codegen/templates/primitive/primitive.h.j2 +++ b/paddle/fluid/primitive/codegen/templates/primitive/primitive.h.j2 @@ -13,7 +13,7 @@ 
using Tensor = paddle::Tensor; using IntArray = paddle::experimental::IntArray; {% for api in apis %} -{%- if api.is_prim and api.name in backend_white_list -%} +{%- if api.is_prim and api.name in backend_white_list and api.name[-1] != '_' -%} {%- set input_names = [] -%} {%- for i in api.inputs -%} {%- do input_names.append(i.name) -%} {%- endfor -%} {%- set attr_names = [] -%} diff --git a/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc b/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc index 86d83dbee249d1..c56ac5c5f79ab0 100644 --- a/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc +++ b/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc @@ -24,49 +24,5 @@ #include "paddle/ir/core/operation.h" namespace paddle { -namespace primitive { - -std::vector> concat_vjp( - const std::vector& x, - const Tensor& out_grad, - const Tensor& axis, - const std::vector>& stop_gradients) { - std::vector> vjp_res(2, std::vector()); - // get concat_grad res. - std::vector op_res = - backend::concat_grad(x, out_grad, axis); - - // construct vjp result by op result and stop_gradients info - vjp_res[0].resize(op_res.size()); - for (uint64_t idx = 0; idx < op_res.size(); idx++) { - if (!stop_gradients[0][idx]) { - vjp_res[0][idx] = op_res[idx]; - } - } - // vjp_res[1] is axis's grad which is attribute (no grad). - vjp_res[1].resize(1); - return vjp_res; -} - -std::vector> split_vjp( - const std::vector& out_grads, - const Tensor& axis, - const std::vector>& stop_gradients) { - std::vector> vjp_res(3, std::vector(1)); - // get concat_grad res. - Tensor op_res = backend::split_grad(out_grads, axis); - - // construct vjp result by op result and stop_gradients info - if (!stop_gradients[0][0]) { - vjp_res[0][0] = op_res; - } - - // vjp_res[1] is sections's grad which is attribute (no grad). - // vjp_res[2] is axis's grad which is attribute (no grad). 
- vjp_res[1].resize(stop_gradients[1].size()); - vjp_res[2].resize(stop_gradients[2].size()); - return vjp_res; -} - -} // namespace primitive +namespace primitive {} // namespace primitive } // namespace paddle diff --git a/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.h b/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.h index 87e1f33bb9ebdd..0fffd6ba31a4ce 100644 --- a/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.h +++ b/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.h @@ -23,17 +23,5 @@ namespace paddle { namespace primitive { using IntArray = paddle::experimental::IntArray; - -std::vector> concat_vjp( - const std::vector& x, - const Tensor& out_grad, - const Tensor& axis, - const std::vector>& stop_gradients); - -std::vector> split_vjp( - const std::vector& out_grads, - const Tensor& axis, - const std::vector>& stop_gradients); - } // namespace primitive } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 64388a4c88eb9d..478efa7a5af959 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -57,18 +57,7 @@ if(WITH_PSCORE) endif() endif() if(WITH_RPC) - set(PYBIND_DEPS - ${PYBIND_DEPS} - paddle_rpc - brpc - ssl - crypto - protobuf - zlib - leveldb - snappy - phi - glog) + set(PYBIND_DEPS ${PYBIND_DEPS} paddle_rpc ${EXTERNAL_BRPC_DEPS} zlib phi) endif() if(WITH_GPU OR WITH_ROCM) set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda) @@ -536,6 +525,7 @@ if(WITH_PYTHON) # PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON) # endif() + add_dependencies(${SHARD_LIB_NAME} ops_api_gen) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_dependencies(${SHARD_LIB_NAME} legacy_eager_codegen) add_dependencies(${SHARD_LIB_NAME} eager_legacy_op_function_generator_cmd) diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc index e03292faa9e42c..6f639f145dceab 100644 --- a/paddle/fluid/pybind/auto_parallel_py.cc +++ b/paddle/fluid/pybind/auto_parallel_py.cc @@ -15,13 +15,16 @@ #include #include +#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/pybind/auto_parallel_py.h" +#include "paddle/fluid/pybind/pybind_variant_caster.h" #include "paddle/phi/core/device_context.h" #include "paddle/phi/core/distributed/auto_parallel/device_mesh.h" #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" #include "paddle/phi/core/distributed/auto_parallel/dist_mapper.h" +#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" #include "paddle/phi/core/distributed/auto_parallel/process_mesh.h" #include "paddle/utils/optional.h" #include "paddle/utils/pybind.h" @@ -32,6 +35,10 @@ #include "paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.h" #include "paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.h" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/phi/infermeta/spmd_rules/rules.h" +#endif + namespace py = pybind11; namespace paddle { @@ -42,6 +49,7 @@ using paddle::distributed::auto_parallel::kDefault; using paddle::distributed::auto_parallel::OperatorDistAttr; using paddle::distributed::auto_parallel::SPMDRuleBase; using paddle::distributed::auto_parallel::SPMDRuleMap; +using paddle::framework::BlockDesc; using paddle::framework::OpDesc; using paddle::framework::VarDesc; using phi::distributed::ProcessMesh; @@ -343,6 +351,41 @@ void BindAutoParallel(py::module *m) { &SPMDRuleBase::InferBackward)); // .def("infer_backward", 
&SPMDRuleBase::InferBackward) [revert in future] + py::class_(*m, "SpmdRule") + .def("infer_forward", + [](const phi::distributed::SpmdRule &self, + const std::vector &input_specs, + const std::vector &attrs) { + phi::distributed::InferSpmdContext ctx; + for (auto &spec : input_specs) { + ctx.EmplaceBackInput(phi::distributed::DistMetaTensor( + phi::make_ddim(spec.shape()), spec.dist_attr())); + } + for (auto &attr : attrs) { + ctx.EmplaceBackAttr(attr); + } + return self.InferForward(ctx); + }) + .def("infer_backward", + [](const phi::distributed::SpmdRule &self, + const std::vector &input_specs, + const std::vector &output_specs, + const std::vector &attrs) { + phi::distributed::InferSpmdContext ctx; + for (auto &spec : input_specs) { + ctx.EmplaceBackInput(phi::distributed::DistMetaTensor( + phi::make_ddim(spec.shape()), spec.dist_attr())); + } + for (auto &spec : output_specs) { + ctx.EmplaceBackInput(phi::distributed::DistMetaTensor( + phi::make_ddim(spec.shape()), spec.dist_attr())); + } + for (auto &attr : attrs) { + ctx.EmplaceBackAttr(attr); + } + return self.InferBackward(ctx); + }); + py::class_(*m, "DistTensorSpec") .def(py::init<>()) .def(py::init()) @@ -472,6 +515,14 @@ void BindAutoParallel(py::module *m) { }, py::return_value_policy::reference); + m->def( + "get_phi_spmd_rule", + [](const std::string op_type) { + return phi::distributed::SpmdRuleFactory::Instance().GetSpmdRule( + op_type); + }, + py::return_value_policy::reference); + // TODO(liuzhenhai): DistributedMapper is not used for now, but // dist_mapper_test need the symbols forch DistributedMapper to be linked, // remove it latter diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc index bf58f1d6ac0d84..82408b5236936e 100644 --- a/paddle/fluid/pybind/communication.cc +++ b/paddle/fluid/pybind/communication.cc @@ -28,8 +28,8 @@ limitations under the License. 
*/ #include #include -#include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h" #include "paddle/phi/core/distributed/comm_context_manager.h" +#include "paddle/phi/core/distributed/store/store_utils.h" #include "paddle/phi/core/distributed/store/tcp_store.h" namespace py = pybind11; @@ -42,14 +42,14 @@ void BindCommContextManager(py::module *m) { py::class_>( *m, "CommContextManager") + .def_static("set_device_id", + &phi::distributed::CommContextManager::SetDeviceId, + py::call_guard()) #if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) .def_static( "create_nccl_comm_context", &phi::distributed::CommContextManager::CreateNCCLCommContext, py::call_guard()) - .def_static("set_cuda_device_id", - &phi::distributed::CommContextManager::SetCUDADeviceId, - py::call_guard()) #endif #if defined(PADDLE_WITH_GLOO) .def_static( diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index db6ce75c7d0450..d03a20537eee66 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -1187,8 +1187,8 @@ static PyObject* eager_api_to_uva_tensor(PyObject* self, PyObject* kwargs) { EAGER_TRY VLOG(4) << "Running in eager_api_to_uva_tensor."; - auto new_tensor = std::shared_ptr( - new paddle::Tensor(egr::Controller::Instance().GenerateUniqueName())); + auto new_tensor = std::make_shared( + egr::Controller::Instance().GenerateUniqueName()); PyObject* obj = PyTuple_GET_ITEM(args, 0); auto array = py::cast(py::handle(obj)); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index f667602a493105..7d3037a076d572 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -67,7 +67,7 @@ typedef SSIZE_T ssize_t; #include "paddle/utils/pybind.h" PHI_DECLARE_bool(set_to_1d); -DECLARE_bool(use_stride_kernel); +PD_DECLARE_bool(use_stride_kernel); namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index e6f25199c43554..95d86f544c4bf4 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -664,7 +664,7 @@ paddle::DataType CastPyArg2DataTypeDirectly(PyObject* obj, } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s: argument (position %d) must be " - "one of core.VarDesc.VarType, " + "one of paddle::DataType, " "but got %s", op_type, arg_pos + 1, diff --git a/paddle/fluid/pybind/generate_file_structures.py b/paddle/fluid/pybind/generate_file_structures.py index 966c512a31eb27..97d8ffc7fc149c 100644 --- a/paddle/fluid/pybind/generate_file_structures.py +++ b/paddle/fluid/pybind/generate_file_structures.py @@ -23,6 +23,7 @@ empty_files = [os.path.join(pybind_dir, "eager_legacy_op_function.cc")] empty_files.append(os.path.join(pybind_dir, "eager_op_function.cc")) empty_files.append(os.path.join(pybind_dir, "static_op_function.cc")) + empty_files.append(os.path.join(pybind_dir, "ops_api.cc")) for path in empty_files: if not os.path.exists(path): diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index 94e3ca1ba41bdb..7f8cac9ee1edf9 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -22,12 +22,12 @@ #include #include -#include "gflags/gflags.h" #include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" 
#include "paddle/phi/core/macros.h" +#include "paddle/utils/flags.h" #include "pybind11/stl.h" // FIXME(zengjinle): these 2 flags may be removed by the linker when compiling @@ -41,8 +41,8 @@ PADDLE_FORCE_LINK_FLAG(free_when_no_cache_hit); // NOTE: where are these 2 flags from? #ifdef PADDLE_WITH_DISTRIBUTE -DECLARE_int32(rpc_get_thread_num); -DECLARE_int32(rpc_prefetch_thread_num); +PD_DECLARE_int32(rpc_get_thread_num); +PD_DECLARE_int32(rpc_prefetch_thread_num); #endif namespace paddle { diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 2e67906f4f2a91..111e8ebdfdf86b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -976,8 +976,8 @@ void BindImperative(py::module *m_ptr) { "to_uva_tensor", [](const py::object &obj, int device_id) { const auto &tracer = imperative::GetCurrentTracer(); - auto new_tensor = std::shared_ptr( - new imperative::VarBase(tracer->GenerateUniqueName())); + auto new_tensor = + std::make_shared(tracer->GenerateUniqueName()); auto array = obj.cast(); if (py::isinstance>(array)) { SetUVATensorFromPyArray(new_tensor, array, device_id); diff --git a/paddle/fluid/pybind/ops_api.cc b/paddle/fluid/pybind/ops_api.cc deleted file mode 100644 index 9efe49a97c8c9e..00000000000000 --- a/paddle/fluid/pybind/ops_api.cc +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "paddle/fluid/pybind/static_op_function.h" -#include "paddle/phi/core/enforce.h" - -namespace paddle { -namespace pybind { - -static PyObject *add_n(PyObject *self, PyObject *args, PyObject *kwargs) { - return static_api_add_n(self, args, kwargs); -} - -static PyObject *mean(PyObject *self, PyObject *args, PyObject *kwargs) { - return static_api_mean(self, args, kwargs); -} - -static PyObject *sum(PyObject *self, PyObject *args, PyObject *kwargs) { - return static_api_sum(self, args, kwargs); -} - -static PyObject *full(PyObject *self, PyObject *args, PyObject *kwargs) { - return static_api_full(self, args, kwargs); -} - -static PyObject *divide(PyObject *self, PyObject *args, PyObject *kwargs) { - return static_api_divide(self, args, kwargs); -} - -static PyObject *data(PyObject *self, PyObject *args, PyObject *kwargs) { - return static_api_data(self, args, kwargs); -} - -static PyObject *fetch(PyObject *self, PyObject *args, PyObject *kwargs) { - return static_api_fetch(self, args, kwargs); -} - -static PyObject *concat(PyObject *self, PyObject *args, PyObject *kwargs) { - return static_api_concat(self, args, kwargs); -} - -static PyObject *split(PyObject *self, PyObject *args, PyObject *kwargs) { - return static_api_split(self, args, kwargs); -} - -static PyMethodDef OpsAPI[] = {{"add_n", - (PyCFunction)(void (*)(void))add_n, - METH_VARARGS | METH_KEYWORDS, - "C++ interface function for add_n."}, - {"mean", - (PyCFunction)(void (*)(void))mean, - METH_VARARGS | METH_KEYWORDS, - "C++ interface function for mean."}, - {"sum", - (PyCFunction)(void (*)(void))sum, - METH_VARARGS | METH_KEYWORDS, - "C++ interface function for sum."}, - {"divide", - (PyCFunction)(void (*)(void))divide, - METH_VARARGS | METH_KEYWORDS, - "C++ interface function for divide."}, - {"concat", - (PyCFunction)(void (*)(void))concat, - METH_VARARGS | METH_KEYWORDS, - "C++ interface function for concat."}, - {"full", - (PyCFunction)(void (*)(void))full, - METH_VARARGS | METH_KEYWORDS, - "C++ interface function for full."}, - {"split", - (PyCFunction)(void (*)(void))split, - METH_VARARGS | METH_KEYWORDS, - "C++ interface function for split."}, - {"data", - (PyCFunction)(void (*)(void))data, - METH_VARARGS | METH_KEYWORDS, - "C++ interface function for data."}, - {"fetch", - (PyCFunction)(void (*)(void))fetch, - METH_VARARGS | METH_KEYWORDS, - "C++ interface function for fetch."}, - {nullptr, nullptr, 0, nullptr}}; - -void BindOpsAPI(pybind11::module *module) { - if (PyModule_AddFunctions(module->ptr(), OpsAPI) < 0) { - PADDLE_THROW(phi::errors::Fatal("Add C++ api to core.ops failed!")); - } -} - -} // namespace pybind -} // namespace paddle diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index 0ae52f7fdd74b0..0e581e45b5970d 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -23,7 +23,6 @@ #include "Python.h" -#include "gflags/gflags.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/tracer.h" @@ -33,6 +32,7 @@ #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/flags.h" +#include "paddle/utils/flags.h" #include "pybind11/stl.h" PHI_DECLARE_bool(reader_queue_speed_test_mode); diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index d161b2a912fcaa..8e34cbb8ab9dc0 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -18,5 +18,5 @@ #include #include 
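
The ops_api.cc file deleted above was a hand-written PyMethodDef table; the PR switches it to code generation (generate_file_structures.py now creates an empty placeholder, as shown earlier). For reference, a minimal sketch of the same registration pattern; module and function names here are illustrative only:

#include <Python.h>

static PyObject *demo_op(PyObject *self, PyObject *args, PyObject *kwargs) {
  Py_RETURN_NONE;  // a real entry would dispatch to a static_api_<op> function
}

static PyMethodDef DemoOpsAPI[] = {
    {"demo_op",
     (PyCFunction)(void (*)(void))demo_op,
     METH_VARARGS | METH_KEYWORDS,
     "C++ interface function for demo_op."},
    {nullptr, nullptr, 0, nullptr}};  // sentinel terminates the table

void BindDemoOpsAPI(PyObject *module) {
  // PyModule_AddFunctions appends every entry up to the sentinel.
  if (PyModule_AddFunctions(module, DemoOpsAPI) < 0) {
    PyErr_Print();  // the real code raises via PADDLE_THROW instead
  }
}
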
-#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "paddle/utils/string/pretty_log.h" diff --git a/paddle/ir/core/ir_printer.cc b/paddle/ir/core/ir_printer.cc index 080e0bafc966a7..25f23b31e28541 100644 --- a/paddle/ir/core/ir_printer.cc +++ b/paddle/ir/core/ir_printer.cc @@ -324,4 +324,9 @@ std::ostream& operator<<(std::ostream& os, Attribute attr) { return os; } +std::ostream& operator<<(std::ostream& os, const Program& prog) { + prog.Print(os); + return os; +} + } // namespace ir diff --git a/paddle/ir/core/program.h b/paddle/ir/core/program.h index 0e2ecb58d91816..6f44a3fe4699ce 100644 --- a/paddle/ir/core/program.h +++ b/paddle/ir/core/program.h @@ -71,4 +71,6 @@ class IR_API Program { ParameterMap parameters_; }; +std::ostream& operator<<(std::ostream& os, const Program& prog); + } // namespace ir diff --git a/paddle/ir/core/value.cc b/paddle/ir/core/value.cc index d38bcdca36314b..58622a47540231 100644 --- a/paddle/ir/core/value.cc +++ b/paddle/ir/core/value.cc @@ -264,10 +264,7 @@ uint32_t OpResultImpl::GetResultIndex() const { return ir::dyn_cast(this)->GetResultIndex(); } -OpResultImpl::~OpResultImpl() { - assert(use_empty() && - owner()->name() + " operation destroyed but still has uses."); -} +OpResultImpl::~OpResultImpl() { assert(use_empty()); } ir::Operation *OpResultImpl::owner() const { // For inline result, pointer offset index to obtain the address of op. diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 83d8006957ac46..cfbf8fec0adfdc 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -36,7 +36,7 @@ endif() set(PHI_DEPS phi_profiler_proto auto_parallel_proto - gflags + ${flags_dep} glog warpctc warprnnt diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index 73a1ed4c7fdc85..80e169fae10c89 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/api/lib/api_gen_utils.h" -#include "gflags/gflags.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/strided_copy_kernel.h" +#include "paddle/utils/flags.h" -DECLARE_bool(use_stride_kernel); +PD_DECLARE_bool(use_stride_kernel); #include "glog/logging.h" diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 3d717c6dbfe059..a8eb379e359fc4 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include "glog/logging.h" -#include "gflags/gflags.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/allocator.h" @@ -28,8 +27,9 @@ limitations under the License. */ #include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/contiguous_kernel.h" #include "paddle/phi/kernels/transfer_layout_kernel.h" +#include "paddle/utils/flags.h" -DECLARE_bool(use_stride_kernel); +PD_DECLARE_bool(use_stride_kernel); namespace paddle { namespace experimental { diff --git a/paddle/phi/api/profiler/device_tracer.cc b/paddle/phi/api/profiler/device_tracer.cc index b16579491f49d5..e294130da7bab8 100644 --- a/paddle/phi/api/profiler/device_tracer.cc +++ b/paddle/phi/api/profiler/device_tracer.cc @@ -21,11 +21,11 @@ limitations under the License. 
*/ #include #include // NOLINT -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/core/enforce.h" +#include "paddle/utils/flags.h" -DECLARE_bool(enable_host_event_recorder_hook); +PD_DECLARE_bool(enable_host_event_recorder_hook); namespace phi { diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml index 648384422ca8ab..09ccd2fe7d87d5 100644 --- a/paddle/phi/api/yaml/fused_ops.yaml +++ b/paddle/phi/api/yaml/fused_ops.yaml @@ -160,6 +160,16 @@ backward: fused_rotary_position_embedding_grad support_dygraph_mode : true +- op : fused_scale_bias_relu_conv_bnstats + args : (Tensor x, Tensor w, Tensor scale, Tensor bias, Tensor bn_scale, Tensor bn_bias, Tensor input_running_mean, Tensor input_running_var, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, str data_format, float momentum, float epsilon, bool fuse_prologue, bool exhaustive_search, int64_t accumulation_count = 0) + optional : scale, bias + output : Tensor(out), Tensor(out_running_mean), Tensor(out_running_var), Tensor(saved_mean), Tensor(saved_var), Tensor(eq_scale), Tensor(eq_bias) + infer_meta : + func : FusedScaleBiasReluConvBnstatsInferMeta + kernel : + func : fused_scale_bias_relu_conv_bnstats + data_type : x + - op : generate_sequence_xpu args : (Tensor x, DataType dtype) output : Tensor diff --git a/paddle/phi/api/yaml/generator/api_gen.py b/paddle/phi/api/yaml/generator/api_gen.py index 50ba2078b045f9..5164ebda840355 100644 --- a/paddle/phi/api/yaml/generator/api_gen.py +++ b/paddle/phi/api/yaml/generator/api_gen.py @@ -360,7 +360,7 @@ def source_include(header_file_path): #include #include "glog/logging.h" -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_gen_utils.h" @@ -379,8 +379,8 @@ def source_include(header_file_path): #include "paddle/phi/api/profiler/event_tracing.h" #include "paddle/phi/api/profiler/supplement_tracing.h" -DECLARE_bool(conv2d_disable_cudnn); -DECLARE_int32(low_precision_op_list); +PD_DECLARE_bool(conv2d_disable_cudnn); +PD_DECLARE_int32(low_precision_op_list); """ diff --git a/paddle/phi/api/yaml/generator/backward_api_gen.py b/paddle/phi/api/yaml/generator/backward_api_gen.py index 183904f642c986..9347552dbb1342 100644 --- a/paddle/phi/api/yaml/generator/backward_api_gen.py +++ b/paddle/phi/api/yaml/generator/backward_api_gen.py @@ -275,7 +275,7 @@ def source_include(header_file_path, fw_header_file_path): #include #include "glog/logging.h" -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_gen_utils.h" @@ -290,8 +290,8 @@ def source_include(header_file_path, fw_header_file_path): #include "paddle/phi/api/profiler/event_tracing.h" #include "paddle/phi/api/profiler/supplement_tracing.h" -DECLARE_bool(conv2d_disable_cudnn); -DECLARE_int32(low_precision_op_list); +PD_DECLARE_bool(conv2d_disable_cudnn); +PD_DECLARE_int32(low_precision_op_list); """ diff --git a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py index ac6f5f2e916eb3..95a6f94706eeee 100644 --- a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py @@ -138,7 +138,7 @@ def source_include(header_file_path, fw_header_file_path): #include #include "glog/logging.h" -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "paddle/phi/api/lib/api_custom_impl.h" #include 
"paddle/phi/api/lib/api_gen_utils.h" @@ -153,8 +153,8 @@ def source_include(header_file_path, fw_header_file_path): #include "paddle/phi/api/profiler/event_tracing.h" #include "paddle/phi/api/profiler/supplement_tracing.h" -DECLARE_bool(conv2d_disable_cudnn); -DECLARE_int32(low_precision_op_list); +PD_DECLARE_bool(conv2d_disable_cudnn); +PD_DECLARE_int32(low_precision_op_list); """ diff --git a/paddle/phi/api/yaml/generator/intermediate_api_gen.py b/paddle/phi/api/yaml/generator/intermediate_api_gen.py index e8f9c1cf751525..5c3b6ddd32d5d9 100644 --- a/paddle/phi/api/yaml/generator/intermediate_api_gen.py +++ b/paddle/phi/api/yaml/generator/intermediate_api_gen.py @@ -36,7 +36,7 @@ def source_include(header_file_path): #include #include "glog/logging.h" -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_gen_utils.h" @@ -56,7 +56,7 @@ def source_include(header_file_path): #include "paddle/phi/api/profiler/event_tracing.h" #include "paddle/phi/api/profiler/supplement_tracing.h" -DECLARE_int32(low_precision_op_list); +PD_DECLARE_int32(low_precision_op_list); """ diff --git a/paddle/phi/api/yaml/generator/sparse_api_gen.py b/paddle/phi/api/yaml/generator/sparse_api_gen.py index af28fcb0948c90..9a017725d68889 100644 --- a/paddle/phi/api/yaml/generator/sparse_api_gen.py +++ b/paddle/phi/api/yaml/generator/sparse_api_gen.py @@ -426,7 +426,7 @@ def source_include(header_file_path): #include #include "glog/logging.h" -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" @@ -442,7 +442,7 @@ def source_include(header_file_path): #include "paddle/phi/infermeta/sparse/binary.h" #include "paddle/phi/infermeta/sparse/multiary.h" -DECLARE_int32(low_precision_op_list); +PD_DECLARE_int32(low_precision_op_list); """ diff --git a/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py index 67a8514bf19373..064cf07d0dbf76 100644 --- a/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py @@ -121,7 +121,7 @@ def source_include(header_file_path): #include #include "glog/logging.h" -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/lib/api_gen_utils.h" @@ -137,7 +137,7 @@ def source_include(header_file_path): #include "paddle/phi/infermeta/sparse/binary.h" #include "paddle/phi/infermeta/sparse/backward.h" -DECLARE_int32(low_precision_op_list); +PD_DECLARE_int32(low_precision_op_list); """ diff --git a/paddle/phi/api/yaml/generator/strings_api_gen.py b/paddle/phi/api/yaml/generator/strings_api_gen.py index ed0bfe42bd8b78..4e66bd5f2fdc6b 100644 --- a/paddle/phi/api/yaml/generator/strings_api_gen.py +++ b/paddle/phi/api/yaml/generator/strings_api_gen.py @@ -329,8 +329,8 @@ def source_include(header_file_path): return f""" #include "{header_file_path}" -#include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/utils/flags.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/core/kernel_context.h" @@ -340,7 +340,7 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/core/kernel_registry.h" -DECLARE_int32(low_precision_op_list); +PD_DECLARE_int32(low_precision_op_list); """ diff --git a/paddle/phi/api/yaml/generator/tensor_operants_gen.py 
b/paddle/phi/api/yaml/generator/tensor_operants_gen.py index f86efeaaefac88..1ca80a8bd76e82 100644 --- a/paddle/phi/api/yaml/generator/tensor_operants_gen.py +++ b/paddle/phi/api/yaml/generator/tensor_operants_gen.py @@ -441,16 +441,16 @@ class OperantsManager { #include "paddle/phi/api/include/operants_manager.h" -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" +#include "paddle/utils/flags.h" """ operants_manager_source_start = """ -DECLARE_string(tensor_operants_mode); +PD_DECLARE_string(tensor_operants_mode); namespace paddle { diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index eca5b93e24f883..fbc058ff64e78f 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1515,7 +1515,7 @@ args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : - func : ElementwiseInferMeta + func : LogicalBinaryInferMeta kernel : func : logical_and data_type : x @@ -1526,7 +1526,7 @@ args : (Tensor x) output : Tensor(out) infer_meta : - func : UnchangedInferMeta + func : LogicalNotInfermeta kernel : func : logical_not data_type : x @@ -1537,7 +1537,7 @@ args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : - func : ElementwiseInferMeta + func : LogicalBinaryInferMeta kernel : func : logical_or data_type : x @@ -1548,7 +1548,7 @@ args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : - func : ElementwiseInferMeta + func : LogicalBinaryInferMeta kernel : func : logical_xor data_type : x diff --git a/paddle/phi/backends/cpu/cpu_info.cc b/paddle/phi/backends/cpu/cpu_info.cc index 8d1d757dfd5983..2eda0104877543 100644 --- a/paddle/phi/backends/cpu/cpu_info.cc +++ b/paddle/phi/backends/cpu/cpu_info.cc @@ -35,9 +35,9 @@ limitations under the License. */ #include "paddle/phi/core/flags.h" -DECLARE_double(fraction_of_cpu_memory_to_use); -DECLARE_uint64(initial_cpu_memory_in_mb); -DECLARE_double(fraction_of_cuda_pinned_memory_to_use); +PD_DECLARE_double(fraction_of_cpu_memory_to_use); +PD_DECLARE_uint64(initial_cpu_memory_in_mb); +PD_DECLARE_double(fraction_of_cuda_pinned_memory_to_use); // If use_pinned_memory is true, CPUAllocator calls mlock, which // returns pinned and locked memory as staging areas for data exchange diff --git a/paddle/phi/backends/device_base.cc b/paddle/phi/backends/device_base.cc index 2b7d0411fedcab..5b6b8fcfc2fe9a 100644 --- a/paddle/phi/backends/device_base.cc +++ b/paddle/phi/backends/device_base.cc @@ -14,13 +14,13 @@ #include "paddle/phi/backends/device_base.h" -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/core/enforce.h" +#include "paddle/utils/flags.h" -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_uint64(initial_gpu_memory_in_mb); -DECLARE_uint64(reallocate_gpu_memory_in_mb); +PD_DECLARE_double(fraction_of_gpu_memory_to_use); +PD_DECLARE_uint64(initial_gpu_memory_in_mb); +PD_DECLARE_uint64(reallocate_gpu_memory_in_mb); constexpr static float fraction_reserve_gpu_memory = 0.05f; diff --git a/paddle/phi/backends/dynload/cudnn_frontend.h b/paddle/phi/backends/dynload/cudnn_frontend.h index 4d0b67ce2285c2..ef680d7cba035e 100644 --- a/paddle/phi/backends/dynload/cudnn_frontend.h +++ b/paddle/phi/backends/dynload/cudnn_frontend.h @@ -14,13 +14,13 @@ See the License for the specific language governing permissions and limitations under the License. 
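
The gflags-to-paddle/utils/flags.h substitutions running through these files (cpu_info.cc, device_base.cc, and the generators above) all rely on a define/declare macro pair backed by a registry. A rough, self-contained sketch of how such a facility can work; everything below is illustrative, only the PD_DEFINE_/PD_DECLARE_ naming convention comes from the diff:

#include <iostream>
#include <map>
#include <string>

// Registry mapping a flag name to the address of its storage.
std::map<std::string, bool*>& BoolFlagRegistry() {
  static std::map<std::string, bool*> registry;
  return registry;
}

struct BoolFlagRegistrar {
  BoolFlagRegistrar(const char* name, bool* storage) {
    BoolFlagRegistry()[name] = storage;  // runs before main()
  }
};

// The define macro creates the storage plus a static registrar...
#define SKETCH_DEFINE_bool(name, default_value, help)  \
  bool FLAGS_##name = default_value;                   \
  static BoolFlagRegistrar registrar_##name(#name, &FLAGS_##name)

// ...and the declare macro is just an extern for other translation units.
#define SKETCH_DECLARE_bool(name) extern bool FLAGS_##name

SKETCH_DEFINE_bool(enable_cudnn_frontend, false,
                   "Example flag mirroring one declared nearby.");

int main() {
  // A setter, e.g. driven by the environment, can now find the flag by name:
  *BoolFlagRegistry()["enable_cudnn_frontend"] = true;
  std::cout << FLAGS_enable_cudnn_frontend << std::endl;
  return 0;
}
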
*/ #pragma once -#include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/utils/flags.h" #include "paddle/phi/backends/dynload/cudnn.h" #include "paddle/phi/backends/gpu/gpu_info.h" -DECLARE_bool(enable_cudnn_frontend); +PD_DECLARE_bool(enable_cudnn_frontend); // Redirect the CUDNN APIs in the cudnn_frontend namespace to // the functions in phi::dynload diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 4621a9c3ddafa9..6989f32b18e9e0 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -103,7 +103,7 @@ PHI_DEFINE_string(rccl_dir, #endif #ifdef PADDLE_WITH_XPU -DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so."); +PD_DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so."); #endif namespace phi { diff --git a/paddle/phi/backends/gpu/cuda/cudnn_helper.h b/paddle/phi/backends/gpu/cuda/cudnn_helper.h index 8b6c04090d88e2..651a4247a12df0 100644 --- a/paddle/phi/backends/gpu/cuda/cudnn_helper.h +++ b/paddle/phi/backends/gpu/cuda/cudnn_helper.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "paddle/phi/backends/dynload/cudnn.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" @@ -26,8 +25,9 @@ limitations under the License. */ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" #include "paddle/phi/core/macros.h" +#include "paddle/utils/flags.h" -DECLARE_bool(cudnn_deterministic); +PD_DECLARE_bool(cudnn_deterministic); namespace phi { namespace backends { diff --git a/paddle/phi/backends/gpu/gpu_info.cc b/paddle/phi/backends/gpu/gpu_info.cc index 417ff4c72e86c9..f6ca9d4168b2c8 100644 --- a/paddle/phi/backends/gpu/gpu_info.cc +++ b/paddle/phi/backends/gpu/gpu_info.cc @@ -17,12 +17,12 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/utils/flags.h" #include "paddle/phi/common/memory_utils.h" -DECLARE_string(selected_gpus); +PD_DECLARE_string(selected_gpus); namespace phi { namespace backends { diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index a7a7ad03ad6647..fd712baf754803 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -34,13 +34,8 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/enforce.h" -#ifdef __HIPCC__ -// HIP results in error or nan if > 256 -#define PREDEFINED_BLOCK_SIZE 256 -#else // CUDA performs better when thread_per_block is between [64, 512] #define PREDEFINED_BLOCK_SIZE 512 -#endif namespace phi { namespace backends { @@ -69,11 +64,7 @@ inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val = 1) { inline int64_t RoundToPowerOfTwo(int64_t n) { constexpr int64_t min_val = 32; int64_t num = RoundToNextHighPowOfTwo(n, min_val); -#ifdef __HIPCC__ - int64_t max_val = 256; -#else int64_t max_val = 1024; -#endif return std::min(max_val, num); } diff --git a/paddle/phi/backends/gpu/rocm/miopen_helper.h b/paddle/phi/backends/gpu/rocm/miopen_helper.h index 095f32ba460d01..b8ce6e22e939be 100644 --- a/paddle/phi/backends/gpu/rocm/miopen_helper.h +++ b/paddle/phi/backends/gpu/rocm/miopen_helper.h @@ -17,7 +17,7 @@ limitations under the License. 
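
The gpu_launch_config.h hunk above removes the HIP-only 256 cap, so `RoundToPowerOfTwo` now clamps to 1024 on every backend. A runnable sketch of the two helpers; the loop body of `RoundToNextHighPowOfTwo` is reconstructed here as an assumption, since only its signature appears in the hunk:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>

int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val = 1) {
  int64_t result = min_val;
  while (result < n) result <<= 1;  // grow until result >= n
  return result;
}

int64_t RoundToPowerOfTwo(int64_t n) {
  constexpr int64_t min_val = 32;
  int64_t num = RoundToNextHighPowOfTwo(n, min_val);
  int64_t max_val = 1024;  // the HIP-specific 256 cap was removed above
  return std::min(max_val, num);
}

int main() {
  assert(RoundToPowerOfTwo(1) == 32);      // clamped up to min_val
  assert(RoundToPowerOfTwo(100) == 128);   // next power of two
  assert(RoundToPowerOfTwo(5000) == 1024); // clamped down to max_val
  std::cout << "ok" << std::endl;
  return 0;
}
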
*/ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/common/bfloat16.h" @@ -31,7 +31,7 @@ limitations under the License. */ // MIOPEN do not have epslion definition #define CUDNN_BN_MIN_EPSILON 1e-05 -DECLARE_bool(cudnn_deterministic); +PD_DECLARE_bool(cudnn_deterministic); namespace phi { namespace backends { diff --git a/paddle/phi/common/memory_utils.cc b/paddle/phi/common/memory_utils.cc index f9ef606049297d..1af8cc442a1178 100644 --- a/paddle/phi/common/memory_utils.cc +++ b/paddle/phi/common/memory_utils.cc @@ -90,6 +90,34 @@ void EmplaceDeviceContexts( stream_priority); } +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) +const phi::Allocator* GetAllocator(int device_id, phi::gpuStream_t stream) { + return MemoryUtils::Instance().GetAllocator(device_id, stream); +} + +const phi::Allocator* GetHostAllocator() { + return MemoryUtils::Instance().GetHostAllocator(); +} + +const phi::Allocator* GetZeroAllocator(int device_id) { + return MemoryUtils::Instance().GetZeroAllocator(device_id); +} + +const phi::Allocator* GetHostZeroAllocator() { + return MemoryUtils::Instance().GetHostZeroAllocator(); +} + +const phi::Allocator* GetPinnedAllocator() { + return MemoryUtils::Instance().GetPinnedAllocator(); +} + +std::shared_ptr::type> GetCudaEvent( + int device_id) { + return MemoryUtils::Instance().GetCudaEvent(device_id); +} +#endif + } // namespace memory_utils } // namespace phi diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h index f6a4afcea2f789..5f4766f8b6b91b 100644 --- a/paddle/phi/common/memory_utils.h +++ b/paddle/phi/common/memory_utils.h @@ -24,6 +24,15 @@ #include "paddle/phi/core/macros.h" #include "paddle/phi/core/stream.h" +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + namespace phi { struct MemoryInterface { @@ -150,6 +159,17 @@ struct MemoryInterface { const std::vector& places, bool disable_setting_default_stream_for_allocator, int stream_priority); + +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) + phi::Allocator* (*get_allocator)(int device_id, phi::gpuStream_t stream); + phi::Allocator* (*get_host_allocator)(); + phi::Allocator* (*get_zero_allocator)(int device_id); + phi::Allocator* (*get_host_zero_allocator)(); + phi::Allocator* (*get_pinned_allocator)(); + std::shared_ptr::type> ( + *get_new_cuda_event)(int device_id); +#endif }; class MemoryUtils { @@ -323,6 +343,34 @@ class MemoryUtils { "Fluid. 
You can call InitMemoryMethod() for initialization.")); } +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) + const phi::Allocator* GetAllocator(int device_id, phi::gpuStream_t stream) { + return memory_method_->get_allocator(device_id, stream); + } + + const phi::Allocator* GetHostAllocator() { + return memory_method_->get_host_allocator(); + } + + const phi::Allocator* GetZeroAllocator(int device_id) { + return memory_method_->get_zero_allocator(device_id); + } + + const phi::Allocator* GetHostZeroAllocator() { + return memory_method_->get_host_zero_allocator(); + } + + const phi::Allocator* GetPinnedAllocator() { + return memory_method_->get_pinned_allocator(); + } + + std::shared_ptr::type> GetCudaEvent( + int device_id) { + return memory_method_->get_new_cuda_event(device_id); + } +#endif + private: MemoryUtils() = default; @@ -385,6 +433,22 @@ void EmplaceDeviceContexts( bool disable_setting_default_stream_for_allocator, int stream_priority); +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) +const Allocator* GetAllocator(int device_id, phi::gpuStream_t stream); + +const Allocator* GetHostAllocator(); + +const Allocator* GetZeroAllocator(int device_id); + +const Allocator* GetHostZeroAllocator(); + +const Allocator* GetPinnedAllocator(); + +std::shared_ptr::type> GetCudaEvent( + int device_id); +#endif + class Buffer { public: explicit Buffer(const phi::Place& place) : place_(place) {} diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index 09f7c53cde80df..82731d9984dc8f 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -54,7 +54,8 @@ const char* AllocationTypeStr(AllocationType type); /// \brief The place is used to specify where the data is stored. class PADDLE_API Place { public: - Place() : device(0), alloc_type_(AllocationType::UNDEFINED) {} + Place() + : device(0), alloc_type_(AllocationType::UNDEFINED), device_type_id_(0) {} explicit Place(AllocationType type, int8_t id, diff --git a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt index 0aee1b53638389..d9d1c27ed23f3d 100644 --- a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt +++ b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt @@ -9,9 +9,8 @@ collect_srcs( dist_mapper.cc reshard_utils.cc dist_tensor.cc + dist_meta_tensor.cc + inferspmd_utils.cc reshard_function.cc - reshard_split_functor.cc - reshard_concat_functor.cc - reshard_all_gather_functor.cc r_to_s_reshard_function.cc s_to_r_reshard_function.cc) diff --git a/paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.cc b/paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.cc new file mode 100644 index 00000000000000..dc5d6c20e62b33 --- /dev/null +++ b/paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
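
memory_utils.h above extends `MemoryInterface`, a struct of raw function pointers that the higher fluid layer fills in at startup so phi can reach allocator services without linking against them. A stripped-down sketch of that decoupling pattern; all names are stand-ins, not Paddle's:

#include <cstdio>
#include <stdexcept>

// The function-pointer table; the upper layer fills it in at init time.
struct BackendInterface {
  int (*get_device_count)();
};

class BackendUtils {
 public:
  static BackendUtils& Instance() {
    static BackendUtils instance;
    return instance;
  }
  void Init(const BackendInterface& iface) { iface_ = iface; }
  int GetDeviceCount() const {
    if (iface_.get_device_count == nullptr) {
      throw std::runtime_error("backend method not initialized");
    }
    return iface_.get_device_count();  // dispatch through the pointer
  }

 private:
  BackendUtils() = default;
  BackendInterface iface_{};  // value-initialized: all pointers null
};

// Lives in the "upper" layer; the lower layer never links against it.
static int FakeDeviceCount() { return 2; }

int main() {
  BackendUtils::Instance().Init(BackendInterface{&FakeDeviceCount});
  std::printf("%d\n", BackendUtils::Instance().GetDeviceCount());
  return 0;
}
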
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+
+namespace phi {
+namespace distributed {
+
+phi::DDim DistMetaTensor::dims() const {
+  // member values in tensor_ have higher priority than those in DistMetaTensor
+  if (tensor_ != nullptr) {
+    PADDLE_ENFORCE_EQ(this->is_dist(),
+                      true,
+                      phi::errors::InvalidArgument(
+                          "The current MetaTensor doesn't contain a "
+                          "DistTensor when the `dims` method is called."));
+    return MetaTensor::dims();
+  } else {
+    return dims_;
+  }
+}
+
+const distributed::TensorDistAttr& DistMetaTensor::dist_attr() const {
+  // member values in tensor_ have higher priority than those in DistMetaTensor
+  if (tensor_ != nullptr) {
+    PADDLE_ENFORCE_EQ(this->is_dist(),
+                      true,
+                      phi::errors::InvalidArgument(
+                          "The current MetaTensor doesn't contain a "
+                          "DistTensor when the `dist_attr` method is called."));
+    return static_cast<DistTensor*>(tensor_)->dist_attr();
+  } else {
+    return dist_attr_;
+  }
+}
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h b/paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h
new file mode 100644
index 00000000000000..efbf38d28f9f0a
--- /dev/null
+++ b/paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/meta_tensor.h"
+
+namespace phi {
+namespace distributed {
+
+class DistMetaTensor : public MetaTensor {
+ public:
+  // supporting implicit construction is easier to use
+  DistMetaTensor(TensorBase* tensor)  // NOLINT
+      : MetaTensor(tensor) {}
+  DistMetaTensor(const TensorBase& tensor)  // NOLINT
+      : MetaTensor(tensor) {}
+  DistMetaTensor(const TensorBase* tensor)  // NOLINT
+      : MetaTensor(tensor) {}
+  DistMetaTensor(TensorBase& tensor)  // NOLINT
+      : MetaTensor(tensor) {}
+  // For static mode only
+  DistMetaTensor(const phi::DDim& dims, const TensorDistAttr& dist_attr)
+      : dims_(dims), dist_attr_(dist_attr) {}
+
+  DistMetaTensor(DistMetaTensor&&) = default;
+  DistMetaTensor& operator=(DistMetaTensor&&) = default;
+  DistMetaTensor(const DistMetaTensor&) = default;
+  DistMetaTensor& operator=(const DistMetaTensor&) = default;
+
+  virtual ~DistMetaTensor() = default;
+
+  DDim dims() const override;
+
+  const distributed::TensorDistAttr& dist_attr() const;
+
+ private:
+  /**
+   * Note: When using the semi-automatic parallel segmentation derivation rules
+   * of the static graph, in order to facilitate the packaging of the input
+   * parameters of the construction, the DistMetaTensor is inherited and
+   * encapsulated, and the class members dims_ and dist_attr_ are added to it.
+ * + * The information contained in these two members is also in the tensor of the + * meta_tensor of the base class, and there is redundancy. + * + * We need to pay attention when using it to ensure the consistency. + * These two members are read-only, and their values cannot be changed + * after construction. To change their values, they need to be set + * directly in tensor_*/ + phi::DDim dims_; + TensorDistAttr dist_attr_; +}; + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc new file mode 100644 index 00000000000000..3b94dc017e5e71 --- /dev/null +++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc @@ -0,0 +1,97 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" + +namespace phi { +namespace distributed { + +void InferSpmdContext::EmplaceBackInput(DistMetaTensor input) { + inputs_.emplace_back(std::move(input)); +} + +void InferSpmdContext::EmplaceBackAttr(Attribute attr) { + attrs_.emplace_back(std::move(attr)); +} + +const DistMetaTensor& InferSpmdContext::InputAt(size_t idx) const { + return inputs_.at(idx); +} + +template +AttrType InferSpmdContext::AttrAt(size_t idx) const { + try { + return paddle::get(attrs_.at(idx)); + } catch (paddle::bad_variant_access const& e) { + PADDLE_THROW(phi::errors::InvalidArgument( + "Attribute cast error in InferSpmd Context, the input attr type is " + "`%s`, but the expected attribute type is `%s`.", + attrs_.at(idx).type().name(), + std::type_index(typeid(AttrType)).name())); + } +} + +template <> +bool InferSpmdContext::AttrAt(size_t idx) const { + try { + auto attr = attrs_.at(idx); + if (attr.type() == typeid(int)) { + return static_cast(paddle::get(attr)); + } else { + return paddle::get(attr); + } + } catch (paddle::bad_variant_access const& e) { + PADDLE_THROW(phi::errors::InvalidArgument( + "Attribute cast error in InferSpmd Context, the input attr type is " + "`%s`, but the expected attribute type is `bool`.", + attrs_.at(idx).type().name())); + } +} + +const Attribute& InferSpmdContext::AttrAt(size_t idx) const { + return attrs_.at(idx); +} + +SpmdRuleFactory& SpmdRuleFactory::Instance() { + static SpmdRuleFactory g_spmd_rule_map; + return g_spmd_rule_map; +} + +bool SpmdRuleFactory::ContainsSpmdRule(const std::string& kernel_name) const { + return spmd_rule_map_.count(kernel_name) > 0; +} + +int SpmdRuleFactory::InsertSpmdRule(std::string kernel_name, SpmdRule rule) { + PADDLE_ENFORCE_NE( + ContainsSpmdRule(kernel_name), + true, + phi::errors::AlreadyExists( + "`%s` Kernel's Spmd rules has been registered.", kernel_name)); + spmd_rule_map_.insert({std::move(kernel_name), std::move(rule)}); + return 0; +} + +const SpmdRule& SpmdRuleFactory::GetSpmdRule( + const std::string& kernel_name) const { + auto it = spmd_rule_map_.find(kernel_name); + PADDLE_ENFORCE_NE( + it, + 
spmd_rule_map_.end(), + phi::errors::NotFound("`%s` Kernel's Spmd rules is not registered.", + kernel_name)); + return it->second; +} + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h new file mode 100644 index 00000000000000..bccee2bf5981ab --- /dev/null +++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h @@ -0,0 +1,186 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/attribute.h" +#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" +#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h" +#include "paddle/phi/core/distributed/type_defs.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/macros.h" +#include "paddle/phi/core/type_defs.h" +#include "paddle/utils/any.h" +#include "paddle/utils/flat_hash_map.h" +#include "paddle/utils/small_vector.h" + +namespace phi { +namespace distributed { + +class InferSpmdContext { + public: + InferSpmdContext() = default; + InferSpmdContext( + paddle::small_vector inputs, + paddle::small_vector attrs) + : inputs_(std::move(inputs)), attrs_(std::move(attrs)) {} + + void EmplaceBackInput(DistMetaTensor input); + void EmplaceBackAttr(Attribute attr); + + const DistMetaTensor& InputAt(size_t idx) const; + + template + AttrType AttrAt(size_t idx) const; + + const Attribute& AttrAt(size_t idx) const; + + private: + // Now we only need `inputs`, for backward, the `output` is passed as input + paddle::small_vector inputs_; + // Because the attribute arguments of dygraph do not have `attr name`, + // so we use vector instead of map + paddle::small_vector attrs_; +}; + +using InferSpmdFn = SpmdInfo (*)(const InferSpmdContext&); + +#define PD_INFER_SPMD(...) \ + ::phi::distributed::InferSpmdFnImpl::Call + +template +struct InferSpmdTypeTag {}; + +template +struct InferSpmdFnImpl; + +template +struct InferSpmdFnImpl { + static SpmdInfo Call(const InferSpmdContext& ctx) { + return InferSpmdFnCallHelper>:: + template Call<0, 0>(ctx); + } + + private: + template + struct InferSpmdFnCallHelper; + + // TODO(chenweihang): support other input type later as needed + template + struct InferSpmdFnCallHelper { + template + static SpmdInfo Call(const InferSpmdContext& ctx, PreviousArgs&... pargs) { + static_assert(attr_idx == 0, + "InferSpmd's Input should appear before Attributes."); + const DistMetaTensor& arg = ctx.InputAt(in_idx); + return InferSpmdFnCallHelper::template Call( + ctx, pargs..., arg); + } + }; + +#define PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_ATTRIBUTE(attr_type) \ + template \ + struct InferSpmdFnCallHelper { \ + template \ + static SpmdInfo Call(const InferSpmdContext& ctx, \ + PreviousArgs&... 
pargs) { \ + attr_type arg = ctx.AttrAt(attr_idx); \ + return InferSpmdFnCallHelper::template Call( \ + ctx, pargs..., arg); \ + } \ + } + + // TODO(chenweihang): support other attr type later as needed + PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_ATTRIBUTE(bool); + + /* End case */ + template + struct InferSpmdFnCallHelper> { + template + static SpmdInfo Call(const InferSpmdContext& ctx UNUSED, Args&... args) { + return infer_spmd_fn(args...); + } + }; +}; + +class SpmdRule { + public: + explicit SpmdRule(InferSpmdFn forward_fn) + : forward_fn_(forward_fn), backward_fn_(nullptr) {} + + SpmdRule(InferSpmdFn forward_fn, InferSpmdFn backward_fn) + : forward_fn_(forward_fn), backward_fn_(backward_fn) {} + + SpmdInfo InferForward(const InferSpmdContext& ctx) const { + PADDLE_ENFORCE_NE( + forward_fn_, + nullptr, + phi::errors::NotFound("Current SpmdRule's forward function is not " + "found, Please make sure " + "that you have registered the rule correctly.")); + return forward_fn_(ctx); + } + + SpmdInfo InferBackward(const InferSpmdContext& ctx) const { + PADDLE_ENFORCE_NE( + backward_fn_, + nullptr, + phi::errors::NotFound("Current SpmdRule's backward function is not " + "found, Please make sure " + "that you have registered the rule correctly.")); + return backward_fn_(ctx); + } + + private: + InferSpmdFn forward_fn_; + InferSpmdFn backward_fn_; +}; + +// SpmdRuleFactory manage the spmd rules and cache the propagate results +// TODO(chenweihang): Add spmd caching impl later +class SpmdRuleFactory { + public: + static SpmdRuleFactory& Instance(); + + bool ContainsSpmdRule(const std::string& kernel_name) const; + + int InsertSpmdRule(std::string kernel_name, SpmdRule rule); + + const SpmdRule& GetSpmdRule(const std::string& kernel_name) const; + + private: + SpmdRuleFactory() = default; + + paddle::flat_hash_map spmd_rule_map_; + + DISABLE_COPY_AND_ASSIGN(SpmdRuleFactory); +}; + +#define PD_REGISTER_SPMD_RULE(kernel_name, ...) 
\ + UNUSED static int ___registrar_spmd_rule_for_##kernel_name = \ + ::phi::distributed::SpmdRuleFactory::Instance().InsertSpmdRule( \ + #kernel_name, ::phi::distributed::SpmdRule(__VA_ARGS__)); + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc index 3a60e226793627..c0c59e5d8c3950 100644 --- a/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc @@ -15,12 +15,10 @@ #include "paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.h" #include "glog/logging.h" -#include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" -#include "paddle/phi/core/distributed/auto_parallel/reshard_split_functor.h" #include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h" -#include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/kernels/split_kernel.h" namespace phi { namespace distributed { @@ -73,17 +71,21 @@ void RToSReshardFunction::Eval(phi::DeviceContext* dev_ctx, BalancedSplit(in.dims()[split_axis], num_of_process); IntArray sections(split_num_vec); - std::vector split_out_vec = ReshardSplitFunctor( - *dev_ctx, in_physical_tensor_cur_rank, sections, split_axis); + std::vector split_out_vec; + auto dtype = in_physical_tensor_cur_rank.dtype(); + RESHARD_FUNCTOR(dev_ctx, + Split, + dtype, + in_physical_tensor_cur_rank, + sections, + split_axis, + &split_out_vec); VLOG(3) << "The current process will remain the idx " << coord_in_mesh[mesh_axis] << " piece of tensor"; - out_physical_tensor_cur_rank = split_out_vec[coord_in_mesh[mesh_axis]]; - VLOG(3) << "The shape of physical tensor after split is " - << out_physical_tensor_cur_rank.dims(); - - set_dist_props(out, out_physical_tensor_cur_rank, in.dims(), out_dist_attr); + SetValue(out, split_out_vec[coord_in_mesh[mesh_axis]]); + SetDistProps(out, in.dims(), out_dist_attr); } REGISTER_RESHARD_FUNC(RToSReshardFunction); diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_all_gather_functor.cc b/paddle/phi/core/distributed/auto_parallel/reshard_all_gather_functor.cc deleted file mode 100644 index 4cf18ca2181384..00000000000000 --- a/paddle/phi/core/distributed/auto_parallel/reshard_all_gather_functor.cc +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
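
The inferspmd_utils.h/.cc files added above turn a strongly typed rule function into a uniform `InferSpmdFn` by peeling arguments off the context one partial specialization at a time, and `PD_REGISTER_SPMD_RULE` stores the result in the factory via a static registrar. A compilable miniature of that unpacking mechanism; every type below is a stand-in for the real DistMetaTensor/Attribute machinery:

#include <iostream>
#include <string>
#include <vector>

struct Ctx {
  std::vector<int> inputs;   // stand-in for the DistMetaTensor inputs
  std::vector<bool> attrs;   // stand-in for the Attribute vector
  const int& InputAt(size_t i) const { return inputs.at(i); }
  bool AttrAt(size_t i) const { return attrs.at(i); }
};

template <typename T>
struct TypeTag {};

template <typename Fn, Fn fn>
struct FnImpl;

template <typename Return, typename... Args, Return (*fn)(Args...)>
struct FnImpl<Return (*)(Args...), fn> {
  static Return Call(const Ctx& ctx) {
    return CallHelper<Args..., TypeTag<int>>::template Call<0, 0>(ctx);
  }

 private:
  template <typename... RemainingArgs>
  struct CallHelper;

  // Case 1: the next parameter is an input tensor (here: const int&).
  template <typename... Tail>
  struct CallHelper<const int&, Tail...> {
    template <int in_idx, int attr_idx, typename... Prev>
    static Return Call(const Ctx& ctx, const Prev&... prev) {
      const int& arg = ctx.InputAt(in_idx);
      return CallHelper<Tail...>::template Call<in_idx + 1, attr_idx>(
          ctx, prev..., arg);
    }
  };

  // Case 2: the next parameter is an attribute (here: bool).
  template <typename... Tail>
  struct CallHelper<bool, Tail...> {
    template <int in_idx, int attr_idx, typename... Prev>
    static Return Call(const Ctx& ctx, const Prev&... prev) {
      bool arg = ctx.AttrAt(attr_idx);
      return CallHelper<Tail...>::template Call<in_idx, attr_idx + 1>(
          ctx, prev..., arg);
    }
  };

  // End case: all arguments peeled off, invoke the typed rule.
  template <typename T>
  struct CallHelper<TypeTag<T>> {
    template <int in_idx, int attr_idx, typename... Prev>
    static Return Call(const Ctx& ctx, const Prev&... prev) {
      (void)ctx;
      return fn(prev...);
    }
  };
};

// A typed "rule": two tensor stand-ins plus one bool attribute.
std::string MatmulLikeRule(const int& x, const int& y, bool trans_x) {
  return "rule(" + std::to_string(x) + ", " + std::to_string(y) +
         ", trans_x=" + (trans_x ? "true" : "false") + ")";
}

int main() {
  Ctx ctx{{7, 9}, {true}};
  // Roughly what PD_INFER_SPMD(MatmulLikeRule) expands to:
  std::cout << FnImpl<decltype(&MatmulLikeRule), &MatmulLikeRule>::Call(ctx)
            << std::endl;
  return 0;
}
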
- -#include "paddle/phi/core/distributed/auto_parallel/reshard_all_gather_functor.h" - -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/all_gather_kernel.h" - -namespace phi { -namespace distributed { - -DenseTensor ReshardAllGatherFunctor(DeviceContext* dev_ctx, - const DenseTensor& input, - const std::vector& process_ids) { - DenseTensor out; - - int64_t world_size = process_ids.size(); - auto* comm_context = CreateOrGetCommContext(*dev_ctx, process_ids); - dev_ctx->SetCommContext(comm_context); - - if (phi::CPUContext::classof(dev_ctx)) { - PD_VISIT_FLOATING_AND_INTEGRAL_TYPES( - input.dtype(), "AllGather", ([&] { - AllGather(static_cast(*dev_ctx), - input, - world_size, - &out); - })); - return out; - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (phi::GPUContext::classof(dev_ctx)) { - PD_VISIT_FLOATING_AND_INTEGRAL_TYPES( - input.dtype(), "AllGather", ([&] { - AllGather(static_cast(*dev_ctx), - input, - world_size, - &out); - })); - return out; - } -#endif -#ifdef PADDLE_WITH_CUSTOM_DEVICE - if (phi::CustomContext::classof(dev_ctx)) { - PD_VISIT_FLOATING_AND_INTEGRAL_TYPES( - input.dtype(), "AllGather", ([&] { - AllGather(static_cast(*dev_ctx), - input, - world_size, - &out); - })); - return out; - } -#endif - PADDLE_THROW(phi::errors::Unimplemented( - "The all_gather in reshard only supported on CPU and GPU for now.")); -} - -} // namespace distributed -} // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_concat_functor.cc b/paddle/phi/core/distributed/auto_parallel/reshard_concat_functor.cc deleted file mode 100644 index 49115dbffd0bd9..00000000000000 --- a/paddle/phi/core/distributed/auto_parallel/reshard_concat_functor.cc +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/distributed/auto_parallel/reshard_concat_functor.h" - -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/concat_kernel.h" - -namespace phi { -namespace distributed { - -DenseTensor ReshardConcatFunctor(const DeviceContext& dev_ctx, - const std::vector& input, - int64_t axis) { - DenseTensor result; - auto dtype = (*input.begin())->dtype(); - - if (phi::CPUContext::classof(&dev_ctx)) { - PD_VISIT_ALL_TYPES( - dtype, "Concat", ([&] { - Concat( - static_cast(dev_ctx), input, axis, &result); - })); - return result; - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (phi::GPUContext::classof(&dev_ctx)) { - PD_VISIT_ALL_TYPES( - dtype, "Concat", ([&] { - Concat( - static_cast(dev_ctx), input, axis, &result); - })); - return result; - } -#endif - PADDLE_THROW(phi::errors::Unimplemented( - "The concat in reshard only supported on CPU and GPU for now.")); -} - -} // namespace distributed -} // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard_function.cc index 637af9641d3d36..63044549a2e370 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard_function.cc @@ -29,20 +29,26 @@ std::shared_ptr ReshardFunction::Eval( return out; } -void ReshardFunction::set_dist_props(DistTensor* tensor, - const DenseTensor& value, - const DDim& dims, - const TensorDistAttr& dist_attr) { +void ReshardFunction::SetValue(DistTensor* tensor, const DenseTensor& value) { + tensor->value_ = value; +} + +void ReshardFunction::SetDistProps(DistTensor* tensor, + const DDim& dims, + const TensorDistAttr& dist_attr) { PADDLE_ENFORCE_EQ(dist_attr.verify(vectorize(dims)), true, phi::errors::InvalidArgument( "The input dist_attr and dims are improper.")); - tensor->value_ = value; tensor->dims_ = dims; tensor->dist_attr_ = dist_attr; } +DenseTensor* ReshardFunction::GetMutableTensor(DistTensor* tensor) { + return &tensor->value_; +} + ReshardFunction* ChooseProperReshardFunction( const DistTensor& in, const TensorDistAttr& out_dist_attr) { for (const auto& func : GetReshardFunctionList()) { diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_function.h b/paddle/phi/core/distributed/auto_parallel/reshard_function.h index 305a9af337c240..48d9fe64eabcc8 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard_function.h +++ b/paddle/phi/core/distributed/auto_parallel/reshard_function.h @@ -44,10 +44,11 @@ class ReshardFunction { DistTensor* out) = 0; protected: - void set_dist_props(DistTensor* tensor, - const DenseTensor& value, - const DDim& dims, - const TensorDistAttr& dist_attr); + void SetValue(DistTensor* tensor, const DenseTensor& value); + void SetDistProps(DistTensor* tensor, + const DDim& dims, + const TensorDistAttr& dist_attr); + DenseTensor* GetMutableTensor(DistTensor* tensor); }; std::vector>& GetReshardFunctionList(); diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_split_functor.cc b/paddle/phi/core/distributed/auto_parallel/reshard_split_functor.cc deleted file mode 100644 index 4d0818eed4c0aa..00000000000000 --- a/paddle/phi/core/distributed/auto_parallel/reshard_split_functor.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
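The reworked ReshardFunction API above separates writing the output's local value (SetValue / GetMutableTensor) from stamping its global dims and dist_attr (SetDistProps, which verifies the dist_attr first). A toy, Paddle-free sketch of that two-step pattern (all names hypothetical):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Toy mirror of the DistTensor split: a value buffer plus metadata that
    // is only set through a validating setter (like SetDistProps above).
    struct ToyDistTensor {
      std::vector<float> value;  // local shard
      std::vector<int> dims;     // global dims, set after value is ready
    };

    std::vector<float>* GetMutableValue(ToyDistTensor* t) { return &t->value; }

    void SetDims(ToyDistTensor* t, const std::vector<int>& dims) {
      // Stand-in for the dist_attr.verify(vectorize(dims)) check in
      // SetDistProps: metadata is validated before being stamped on.
      std::size_t numel = 1;
      for (int d : dims) numel *= static_cast<std::size_t>(d);
      assert(numel == t->value.size());
      t->dims = dims;
    }

    int main() {
      ToyDistTensor out;
      GetMutableValue(&out)->assign(6, 0.f);  // a kernel writes the value
      SetDims(&out, {2, 3});                  // metadata stamped afterwards
      return 0;
    }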
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/distributed/auto_parallel/reshard_split_functor.h" - -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" -#include "paddle/phi/core/visit_type.h" -#include "paddle/phi/kernels/split_kernel.h" - -namespace phi { -namespace distributed { - -std::vector ReshardSplitFunctor(const DeviceContext& dev_ctx, - const DenseTensor& input, - const IntArray& sections, - int64_t axis) { - std::vector result; - - if (phi::CPUContext::classof(&dev_ctx)) { - PD_VISIT_ALL_TYPES(input.dtype(), "Split", ([&] { - Split(static_cast(dev_ctx), - input, - sections, - axis, - &result); - })); - return result; - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (phi::GPUContext::classof(&dev_ctx)) { - PD_VISIT_ALL_TYPES(input.dtype(), "Split", ([&] { - Split(static_cast(dev_ctx), - input, - sections, - axis, - &result); - })); - return result; - } -#endif - PADDLE_THROW(phi::errors::Unimplemented( - "The split in reshard only supported on CPU and GPU for now.")); -} - -} // namespace distributed -} // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc b/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc index c1b569b1dbcb05..0c86eb864ed2b6 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc @@ -14,22 +14,31 @@ #include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h" -#include - -// the needs to be included before , otherwise -// there will be symbol redefinition error on windows -#include "paddle/phi/core/distributed/store/tcp_store.h" - #include "glog/logging.h" -#include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/device_context.h" #include "paddle/phi/core/distributed/auto_parallel/process_mesh.h" -#include "paddle/phi/core/distributed/auto_parallel/utils.h" #include "paddle/phi/core/distributed/comm_context_manager.h" +#include "paddle/phi/core/distributed/store/store_utils.h" namespace phi { namespace distributed { -using auto_parallel::str_split; + +namespace { +int64_t GetLocalRankInParticipate(const std::vector& process_ids) { + int64_t cur_global_rank = GetCurGlobalRank(); + auto iter = + std::find(process_ids.begin(), process_ids.end(), cur_global_rank); + return iter - process_ids.begin(); +} + +std::string GenUniqueCommKey(const std::vector& process_ids) { + std::string unique_comm_key = "ReshardGroup"; + for (const auto& id : process_ids) { + unique_comm_key += "/" + std::to_string(id); + } + return unique_comm_key; +} +} // namespace bool IsDimsMappingShard(const std::vector& dims_mapping) { return std::any_of(dims_mapping.begin(), @@ -70,93 +79,6 @@ std::vector GetCurRankCoordInMesh(const ProcessMesh& process_mesh) { return coord; } -std::map GetSplitAxisWithDimsMapping( - const std::vector& dims_mapping) { - std::map split_axis_to_mesh_axis; - for (size_t i = 0; i < 
dims_mapping.size(); ++i) {
-    if (dims_mapping[i] != -1) {
-      split_axis_to_mesh_axis.emplace(i, dims_mapping[i]);
-    }
-  }
-  return split_axis_to_mesh_axis;
-}
-
-int64_t GetCurGlobalRank() {
-  const char* cur_rank = std::getenv("PADDLE_TRAINER_ID");
-  PADDLE_ENFORCE_NOT_NULL(
-      cur_rank,
-      phi::errors::NotFound(
-          "The environment variable 'PADDLE_TRAINER_ID' cannot be found."));
-  return std::atoi(cur_rank);
-}
-
-int64_t GetGlobalWorldSize() {
-  const char* world_size = std::getenv("PADDLE_TRAINERS_NUM");
-  PADDLE_ENFORCE_NOT_NULL(
-      world_size,
-      phi::errors::NotFound(
-          "The environment variable 'PADDLE_TRAINERS_NUM' cannot be found."));
-  return std::atoi(world_size);
-}
-
-namespace {
-std::string GetMasterEndpoint() {
-  const char* master_endpoint = std::getenv("PADDLE_MASTER");
-  if (!master_endpoint) {
-    const char* trainer_endpoints = std::getenv("PADDLE_TRAINER_ENDPOINTS");
-    PADDLE_ENFORCE_NOT_NULL(
-        trainer_endpoints,
-        phi::errors::NotFound("The environment variable "
-                              "'PADDLE_TRAINER_ENDPOINTS' cannot be found."));
-    return str_split(trainer_endpoints, ",")[0];
-  }
-
-  PADDLE_ENFORCE_NOT_NULL(
-      master_endpoint,
-      phi::errors::NotFound(
-          "The environment variable 'PADDLE_MASTER' cannot be found."));
-  return master_endpoint;
-}
-
-std::string GenUniqueCommKey(const std::vector<int64_t>& process_ids) {
-  std::string unique_comm_key = "ReshardGroup";
-  for (const auto& id : process_ids) {
-    unique_comm_key += "/" + std::to_string(id);
-  }
-  return unique_comm_key;
-}
-
-int64_t GetLocalRankInParticipate(const std::vector<int64_t>& process_ids) {
-  int64_t cur_global_rank = GetCurGlobalRank();
-  auto iter =
-      std::find(process_ids.begin(), process_ids.end(), cur_global_rank);
-  return iter - process_ids.begin();
-}
-
-} // namespace
-
-std::string GetMasterAddr() {
-  std::string master_endpoint = GetMasterEndpoint();
-  return str_split(master_endpoint, ":")[0];
-}
-
-uint16_t GetMasterPort() {
-  std::string master_endpoint = GetMasterEndpoint();
-  return std::stoi(str_split(master_endpoint, ":")[1]);
-}
-
-std::shared_ptr<TCPStore> CreateOrGetGlobalTCPStore() {
-  std::string host = GetMasterAddr();
-  uint16_t port = GetMasterPort();
-  int64_t cur_rank = GetCurGlobalRank();
-  int64_t world_size = GetGlobalWorldSize();
-  bool is_master = (cur_rank == 0);
-
-  static std::shared_ptr<TCPStore> store =
-      std::make_shared<TCPStore>(host, port, is_master, world_size);
-  return store;
-}
-
 CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
                                     const std::vector<int64_t>& process_ids) {
   std::string unique_comm_key = GenUniqueCommKey(process_ids);
@@ -202,6 +124,17 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
   return comm_context;
 }
 
+std::map<int64_t, int64_t> GetSplitAxisWithDimsMapping(
+    const std::vector<int64_t>& dims_mapping) {
+  std::map<int64_t, int64_t> split_axis_to_mesh_axis;
+  for (size_t i = 0; i < dims_mapping.size(); ++i) {
+    if (dims_mapping[i] != -1) {
+      split_axis_to_mesh_axis.emplace(i, dims_mapping[i]);
+    }
+  }
+  return split_axis_to_mesh_axis;
+}
+
 std::vector<int64_t> BalancedSplit(int64_t total_nums, int64_t num_of_pieces) {
   std::vector<int64_t> result(num_of_pieces, total_nums / num_of_pieces);
   int64_t remain_nums = total_nums % num_of_pieces;
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
index f50d9496d39fe8..ebd54544f15e30 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
+++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
@@ -20,13 +20,14 @@
 #include
 #include
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/device_context.h"
+#include "paddle/phi/core/visit_type.h"
+
 namespace phi {
 class DeviceContext;
 
 namespace distributed {
-class CommContext;
-class TCPStore;
-
 class ProcessMesh;
 
 bool IsDimsMappingShard(const std::vector<int64_t>& dims_mapping);
@@ -46,6 +47,11 @@ std::vector<int64_t> GetCurRankCoordInMesh(const ProcessMesh& process_mesh);
 
 std::map<int64_t, int64_t> GetSplitAxisWithDimsMapping(
     const std::vector<int64_t>& dims_mapping);
 
+// Evenly split a given number into multiple pieces, assigning the remainder
+// to the leading pieces. For example, splitting 12 into 5 pieces returns
+// {3, 3, 2, 2, 2}.
+std::vector<int64_t> BalancedSplit(int64_t total_nums, int64_t num_of_pieces);
+
 // Create a comm context of the input process_ids. Once the newly comm context
 // created, it will be cached in the global instance, and get from the global
 // cache later. If the input dev_ctx is GPU, then nccl comm context will be
@@ -53,20 +59,54 @@ std::map<int64_t, int64_t> GetSplitAxisWithDimsMapping(
 CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
                                     const std::vector<int64_t>& process_ids);
 
-int64_t GetCurGlobalRank();
-
-std::string GetMasterAddr();
-
-int64_t GetGlobalWorldSize();
-
-uint16_t GetMasterPort();
-
-std::shared_ptr<TCPStore> CreateOrGetGlobalTCPStore();
-
-// If given a number, balance split it to multiple pieces.
-// For example, the input value is 12, split it to 5 pieces, then return
-// {3, 3, 2, 2, 2}.
-std::vector<int64_t> BalancedSplit(int64_t total_nums, int64_t num_of_pieces);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#define RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, ...)                 \
+  do {                                                                     \
+    if (phi::CPUContext::classof(dev_ctx)) {                               \
+      PD_VISIT_FLOATING_AND_INTEGRAL_TYPES(                                \
+          dtype, #fn_name, ([&] {                                          \
+            fn_name<data_t>(static_cast<const CPUContext&>(*dev_ctx),      \
+                            __VA_ARGS__);                                  \
+          }));                                                             \
+    } else if (phi::GPUContext::classof(dev_ctx)) {                        \
+      PD_VISIT_FLOATING_AND_INTEGRAL_TYPES(                                \
+          dtype, #fn_name, ([&] {                                          \
+            fn_name<data_t>(static_cast<const GPUContext&>(*dev_ctx),      \
+                            __VA_ARGS__);                                  \
+          }));                                                             \
+    } else {                                                               \
+      PADDLE_THROW(phi::errors::Unimplemented(                             \
+          "The %s in reshard is only supported on CPU and GPU for now.",   \
+          #fn_name));                                                      \
+    }                                                                      \
+  } while (0)
+#else
+#define RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, ...)                 \
+  do {                                                                     \
+    if (phi::CPUContext::classof(dev_ctx)) {                               \
+      PD_VISIT_FLOATING_AND_INTEGRAL_TYPES(                                \
+          dtype, #fn_name, ([&] {                                          \
+            fn_name<data_t>(static_cast<const CPUContext&>(*dev_ctx),      \
+                            __VA_ARGS__);                                  \
+          }));                                                             \
+    } else {                                                               \
+      PADDLE_THROW(phi::errors::Unimplemented(                             \
+          "The %s in reshard is only supported on CPU for now.",           \
+          #fn_name));                                                      \
+    }                                                                      \
+  } while (0)
+#endif
+
+#define RESHARD_FUNCTOR_WITH_COMM(dev_ctx, fn_name, dtype, process_ids, ...) \
+  do {                                                                       \
+    auto* comm_context = CreateOrGetCommContext(*dev_ctx, process_ids);      \
+    dev_ctx->SetCommContext(comm_context);                                   \
+    RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, __VA_ARGS__);              \
+  } while (0)
+
+#define RESHARD_FUNCTOR(dev_ctx, fn_name, dtype, ...)
\ + do { \ + RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, __VA_ARGS__); \ + } while (0) } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc index e10587237ba3e7..61f57e1d6695cb 100644 --- a/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc @@ -18,10 +18,10 @@ #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" -#include "paddle/phi/core/distributed/auto_parallel/reshard_all_gather_functor.h" -#include "paddle/phi/core/distributed/auto_parallel/reshard_concat_functor.h" -#include "paddle/phi/core/distributed/auto_parallel/reshard_split_functor.h" #include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h" +#include "paddle/phi/kernels/all_gather_kernel.h" +#include "paddle/phi/kernels/concat_kernel.h" +#include "paddle/phi/kernels/split_kernel.h" namespace phi { namespace distributed { @@ -60,17 +60,22 @@ void SToRReshardFunction::Eval(DeviceContext* dev_ctx, const DistTensor& in, const TensorDistAttr& out_dist_attr, DistTensor* out) { - const DenseTensor& in_physical_tensor_cur_rank = in.value(); const auto& in_dist_attr = in.dist_attr(); const auto& in_dims_mapping = in_dist_attr.dims_mapping(); const auto& in_process_mesh = in_dist_attr.process_mesh(); const auto& in_process_ids = in_process_mesh.process_ids(); + auto dtype = in.dtype(); // Since the precondition ensure the out_process_ids is equal to the // in_process_ids, so the participate process ids mush equal to either // in_process_ids or out_process_ids. - DenseTensor out_all_gather = ReshardAllGatherFunctor( - dev_ctx, in_physical_tensor_cur_rank, in_process_ids); + RESHARD_FUNCTOR_WITH_COMM(dev_ctx, + AllGather, + dtype, + in_process_ids, + in.value(), + in_process_ids.size(), + GetMutableTensor(out)); std::map split_axis_to_mesh_axis = GetSplitAxisWithDimsMapping(in_dims_mapping); @@ -79,7 +84,7 @@ void SToRReshardFunction::Eval(DeviceContext* dev_ctx, if (split_axis == 0) { // If the input dist tensor is shard(0), the subsequent split // and concat is unnecessary. - set_dist_props(out, out_all_gather, out_all_gather.dims(), out_dist_attr); + SetDistProps(out, in.dims(), out_dist_attr); } else { // Since the result of all_gather always concat the tensor on axis 0, // first we need to split the result on axis 0, @@ -88,21 +93,30 @@ void SToRReshardFunction::Eval(DeviceContext* dev_ctx, int64_t num_of_process = in_process_ids.size(); IntArray sections(std::vector( - num_of_process, - in_physical_tensor_cur_rank.dims()[default_split_axis])); - std::vector split_out_vec = ReshardSplitFunctor( - *dev_ctx, out_all_gather, sections, default_split_axis); + num_of_process, in.value().dims()[default_split_axis])); + std::vector split_out_vec; + RESHARD_FUNCTOR(dev_ctx, + Split, + dtype, + out->value(), + sections, + default_split_axis, + &split_out_vec); // Concat the result after split on correct axis. 
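Conceptually, the RESHARD_FUNCTOR* macros above replace the per-functor files with a double dispatch: branch on the context type at runtime, then visit the dtype to instantiate the templated kernel. A toy, Paddle-free sketch of that shape (names hypothetical):

    #include <cstdint>
    #include <iostream>
    #include <stdexcept>

    enum class DType { kFloat32, kInt64 };

    // Stand-in for a templated phi kernel such as Split<data_t, Context>.
    template <typename T>
    void SplitSketch(const char* ctx, int axis) {
      std::cout << ctx << ": Split<" << sizeof(T) << "-byte type> on axis "
                << axis << "\n";
    }

    // Same shape as RESHARD_FUNCTOR_IMPL: context branch outside, dtype
    // visitation inside, and an error for unsupported backends.
    void Dispatch(bool is_cpu, bool is_gpu, DType dtype, int axis) {
      auto visit = [&](const char* ctx) {
        switch (dtype) {
          case DType::kFloat32: SplitSketch<float>(ctx, axis); break;
          case DType::kInt64:   SplitSketch<int64_t>(ctx, axis); break;
        }
      };
      if (is_cpu)      visit("CPUContext");
      else if (is_gpu) visit("GPUContext");
      else throw std::runtime_error("Split in reshard: unsupported backend");
    }

    int main() { Dispatch(true, false, DType::kFloat32, 0); }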
std::vector concat_input_vec; for (const auto& tensor : split_out_vec) { concat_input_vec.emplace_back(&tensor); } - DenseTensor concat_out_tensor = - ReshardConcatFunctor(*dev_ctx, concat_input_vec, split_axis); - set_dist_props( - out, concat_out_tensor, concat_out_tensor.dims(), out_dist_attr); + RESHARD_FUNCTOR(dev_ctx, + Concat, + dtype, + concat_input_vec, + split_axis, + GetMutableTensor(out)); + + SetDistProps(out, in.dims(), out_dist_attr); } } diff --git a/paddle/phi/core/distributed/auto_parallel/utils.h b/paddle/phi/core/distributed/auto_parallel/utils.h index c9e69dd550abb8..915c1565296700 100644 --- a/paddle/phi/core/distributed/auto_parallel/utils.h +++ b/paddle/phi/core/distributed/auto_parallel/utils.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include #include #include #include diff --git a/paddle/phi/core/distributed/comm_context_manager.cc b/paddle/phi/core/distributed/comm_context_manager.cc index 7bbf0612ab323e..385bbb137cfea0 100644 --- a/paddle/phi/core/distributed/comm_context_manager.cc +++ b/paddle/phi/core/distributed/comm_context_manager.cc @@ -12,44 +12,53 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_GLOO) -#include - -#include "paddle/phi/core/distributed/gloo_comm_context.h" -#include "paddle/phi/core/distributed/gloo_utils.h" -#include "paddle/phi/core/distributed/store/gloo_store.h" -#endif - #include "paddle/phi/core/distributed/comm_context_manager.h" #include #include +#include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/distributed/store/store.h" #include "paddle/phi/core/enforce.h" +#if defined(PADDLE_WITH_GLOO) +#include +#include "paddle/phi/core/distributed/gloo_comm_context.h" +#include "paddle/phi/core/distributed/gloo_utils.h" +#include "paddle/phi/core/distributed/store/gloo_store.h" +#endif + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE -#include "glog/logging.h" #include "paddle/phi/core/distributed/xccl_comm_context.h" #endif namespace phi { namespace distributed { +int CommContextManager::device_id = -1; + +void CommContextManager::SetDeviceId(int dev_id) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -void CommContextManager::SetCUDADeviceId(int dev_id) { phi::backends::gpu::SetDeviceId(dev_id); + CommContextManager::device_id = dev_id; +#endif } +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void CommContextManager::CreateNCCLCommContext( const std::shared_ptr& store, const std::string& unique_comm_key, int rank, int size) { + auto& comm_context_manager = CommContextManager::GetInstance(); + if (comm_context_manager.Has(unique_comm_key)) { + return; + } ncclUniqueId nccl_id; if (rank == 0) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetUniqueId(&nccl_id)); @@ -68,7 +77,28 @@ void CommContextManager::CreateNCCLCommContext( auto nccl_comm_context = std::make_unique(rank, size, nccl_id); - auto& comm_context_manager = CommContextManager::GetInstance(); + + if (CommContextManager::device_id != -1) { + std::unique_ptr dev_ctx( + new phi::GPUContext(phi::GPUPlace(CommContextManager::device_id))); + dev_ctx->SetAllocator(phi::memory_utils::GetAllocator( + CommContextManager::device_id, dev_ctx->stream())); + dev_ctx->SetHostAllocator(phi::memory_utils::GetHostAllocator()); + 
dev_ctx->SetZeroAllocator( + phi::memory_utils::GetZeroAllocator(CommContextManager::device_id)); + dev_ctx->SetHostZeroAllocator(phi::memory_utils::GetHostZeroAllocator()); + dev_ctx->SetPinnedAllocator(phi::memory_utils::GetPinnedAllocator()); + dev_ctx->PartialInitWithAllocator(); + auto compute_event = + phi::memory_utils::GetCudaEvent(CommContextManager::device_id); + auto comm_event = + phi::memory_utils::GetCudaEvent(CommContextManager::device_id); + + nccl_comm_context->SetDevContext(std::move(dev_ctx)); + nccl_comm_context->SetComputeEvent(std::move(compute_event)); + nccl_comm_context->SetCommEvent(std::move(comm_event)); + } + comm_context_manager.SetStore(store); comm_context_manager.Emplace(unique_comm_key, std::move(nccl_comm_context)); } diff --git a/paddle/phi/core/distributed/comm_context_manager.h b/paddle/phi/core/distributed/comm_context_manager.h index 6d82e89f92ba02..55fa831c27014c 100644 --- a/paddle/phi/core/distributed/comm_context_manager.h +++ b/paddle/phi/core/distributed/comm_context_manager.h @@ -46,13 +46,13 @@ class CommContextManager { bool Has(const std::string& unique_comm_key) const; + static void SetDeviceId(int dev_id); + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void CreateNCCLCommContext(const std::shared_ptr& store, const std::string& unique_comm_key, int rank, int size); - - static void SetCUDADeviceId(int dev_id); #endif #if defined(PADDLE_WITH_GLOO) @@ -76,6 +76,7 @@ class CommContextManager { std::unordered_map> id_to_comm_context_; std::shared_ptr store_; + static int device_id; }; } // namespace distributed diff --git a/paddle/phi/core/distributed/nccl_comm_context.cc b/paddle/phi/core/distributed/nccl_comm_context.cc index 2ad3ece71a5d6d..90b6a4c447c925 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.cc +++ b/paddle/phi/core/distributed/nccl_comm_context.cc @@ -37,6 +37,30 @@ NCCLCommContext::NCCLCommContext(int rank, int size, ncclUniqueId nccl_id) ncclComm_t NCCLCommContext::GetNcclComm() { return nccl_comm_; } +gpuStream_t NCCLCommContext::GetStream() { return dev_ctx_->stream(); } + +phi::GPUContext* NCCLCommContext::GetDevContext() { return dev_ctx_.get(); } + +void NCCLCommContext::SetDevContext( + std::unique_ptr&& dev_ctx) { + dev_ctx_ = std::move(dev_ctx); +} + +gpuEvent_t NCCLCommContext::GetComputeEvent() { return compute_event_.get(); } + +void NCCLCommContext::SetComputeEvent( + std::shared_ptr::type>&& + compute_event) { + compute_event_ = std::move(compute_event); +} + +gpuEvent_t NCCLCommContext::GetCommEvent() { return comm_event_.get(); } + +void NCCLCommContext::SetCommEvent( + std::shared_ptr::type>&& comm_event) { + comm_event_ = std::move(comm_event); +} + void NCCLCommContext::Broadcast(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, int root, diff --git a/paddle/phi/core/distributed/nccl_comm_context.h b/paddle/phi/core/distributed/nccl_comm_context.h index b8f14cef131e41..fdd45793a6387c 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.h +++ b/paddle/phi/core/distributed/nccl_comm_context.h @@ -13,6 +13,16 @@ // limitations under the License. 
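For reference, the communicator cache consulted above is keyed by the string built in GenUniqueCommKey earlier: the fixed prefix "ReshardGroup" plus the joined process ids, so the same group always hits the same cache entry. A runnable illustration (sketch only):

    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // Same construction as the GenUniqueCommKey helper shown earlier.
    std::string GenUniqueCommKeySketch(const std::vector<int64_t>& process_ids) {
      std::string key = "ReshardGroup";
      for (const auto& id : process_ids) {
        key += "/" + std::to_string(id);
      }
      return key;
    }

    int main() {
      // Prints "ReshardGroup/0/2/3".
      std::cout << GenUniqueCommKeySketch({0, 2, 3}) << "\n";
      return 0;
    }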
#pragma once +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/core/distributed/comm_context.h" #include "paddle/phi/core/macros.h" @@ -30,9 +40,27 @@ namespace distributed { class NCCLCommContext final : public CommContext { public: NCCLCommContext(int rank, int size, ncclUniqueId nccl_id); + ~NCCLCommContext() {} ncclComm_t GetNcclComm(); + gpuStream_t GetStream(); + + gpuEvent_t GetComputeEvent(); + + void SetComputeEvent( + std::shared_ptr::type>&& + compute_event); + + gpuEvent_t GetCommEvent(); + + void SetCommEvent( + std::shared_ptr::type>&& comm_event); + + phi::GPUContext* GetDevContext(); + + void SetDevContext(std::unique_ptr&& dev_ctx); + void Broadcast(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, int root, @@ -75,6 +103,14 @@ class NCCLCommContext final : public CommContext { DISABLE_COPY_AND_ASSIGN(NCCLCommContext); ncclComm_t nccl_comm_; + + std::unique_ptr dev_ctx_; + + // used for comm wait compute, compute_stream-->event-->comm_stream + std::shared_ptr::type> compute_event_; + + // used for compute wait comm, comm_stream-->event-->compute_stream + std::shared_ptr::type> comm_event_; }; } // namespace distributed diff --git a/paddle/phi/core/distributed/store/CMakeLists.txt b/paddle/phi/core/distributed/store/CMakeLists.txt index 8eaa76eac1c677..3b62a1367eea9e 100644 --- a/paddle/phi/core/distributed/store/CMakeLists.txt +++ b/paddle/phi/core/distributed/store/CMakeLists.txt @@ -1,4 +1,5 @@ -set(STORE_COMMON_SRCS tcp_store.cc tcp_utils.cc socket.cpp store.cc) +set(STORE_COMMON_SRCS tcp_store.cc tcp_utils.cc socket.cpp store.cc + store_utils.cc) if(WITH_GLOO) list(APPEND STORE_COMMON_SRCS gloo_store.cc) diff --git a/paddle/phi/core/distributed/store/store_utils.cc b/paddle/phi/core/distributed/store/store_utils.cc new file mode 100644 index 00000000000000..c2679ef2192a33 --- /dev/null +++ b/paddle/phi/core/distributed/store/store_utils.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
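The store utilities moved into store_utils.cc below resolve rank and world size from environment variables (PADDLE_TRAINER_ID, PADDLE_TRAINERS_NUM). A hedged sketch of the same lookup pattern, with fallbacks instead of PADDLE_ENFORCE so it runs anywhere:

    #include <cstdlib>
    #include <iostream>

    // Mirrors GetCurGlobalRank/GetGlobalWorldSize, but falls back to a
    // default instead of throwing NotFound when the variable is unset.
    long long EnvInt64Or(const char* name, long long fallback) {
      const char* value = std::getenv(name);
      return value != nullptr ? std::atoll(value) : fallback;
    }

    int main() {
      const long long rank = EnvInt64Or("PADDLE_TRAINER_ID", 0);
      const long long world_size = EnvInt64Or("PADDLE_TRAINERS_NUM", 1);
      // Rank 0 acts as the TCP store master, as in CreateOrGetGlobalTCPStore.
      std::cout << "rank " << rank << "/" << world_size
                << (rank == 0 ? " (master)" : "") << "\n";
      return 0;
    }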
+ +#include "paddle/phi/core/distributed/store/store_utils.h" + +#include + +// the needs to be included before , otherwise +// there will be symbol redefinition error on windows +#include "paddle/phi/core/distributed/store/tcp_store.h" + +#include "paddle/phi/core/distributed/auto_parallel/utils.h" + +namespace phi { +namespace distributed { +using auto_parallel::str_split; + +namespace { +std::string GetMasterEndpoint() { + const char* master_endpoint = std::getenv("PADDLE_MASTER"); + if (!master_endpoint) { + const char* trainer_endpoints = std::getenv("PADDLE_TRAINER_ENDPOINTS"); + PADDLE_ENFORCE_NOT_NULL( + trainer_endpoints, + phi::errors::NotFound("The environment variable " + "'PADDLE_TRAINER_ENDPOINTS' cannot be found.")); + return str_split(trainer_endpoints, ",")[0]; + } + + PADDLE_ENFORCE_NOT_NULL( + master_endpoint, + phi::errors::NotFound( + "The environment variable 'PADDLE_MASTER' cannot be found.")); + return master_endpoint; +} +} // namespace + +int64_t GetCurGlobalRank() { + const char* cur_rank = std::getenv("PADDLE_TRAINER_ID"); + PADDLE_ENFORCE_NOT_NULL( + cur_rank, + phi::errors::NotFound( + "The environment variable 'PADDLE_TRAINER_ID' cannot be found.")); + return std::atoi(cur_rank); +} + +int64_t GetGlobalWorldSize() { + const char* world_size = std::getenv("PADDLE_TRAINERS_NUM"); + PADDLE_ENFORCE_NOT_NULL( + world_size, + phi::errors::NotFound( + "The environment variable 'PADDLE_TRAINERS_NUM' cannot be found.")); + return std::atoi(world_size); +} + +std::string GetMasterAddr() { + std::string master_endpoint = GetMasterEndpoint(); + return str_split(master_endpoint, ":")[0]; +} + +uint16_t GetMasterPort() { + std::string master_endpoint = GetMasterEndpoint(); + return std::stoi(str_split(master_endpoint, ":")[1]); +} + +std::shared_ptr CreateOrGetGlobalTCPStore() { + std::string host = GetMasterAddr(); + uint16_t port = GetMasterPort(); + int64_t cur_rank = GetCurGlobalRank(); + int64_t world_size = GetGlobalWorldSize(); + bool is_master = (cur_rank == 0); + + static std::shared_ptr store = + std::make_shared(host, port, is_master, world_size); + return store; +} + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_concat_functor.h b/paddle/phi/core/distributed/store/store_utils.h similarity index 75% rename from paddle/phi/core/distributed/auto_parallel/reshard_concat_functor.h rename to paddle/phi/core/distributed/store/store_utils.h index ce4798458bde7a..3aad27a46b5ea1 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard_concat_functor.h +++ b/paddle/phi/core/distributed/store/store_utils.h @@ -15,16 +15,22 @@ #pragma once #include -#include +#include +#include namespace phi { -class DeviceContext; -class DenseTensor; namespace distributed { +class Store; -DenseTensor ReshardConcatFunctor(const DeviceContext& dev_ctx, - const std::vector& input, - int64_t axis); +int64_t GetCurGlobalRank(); + +std::string GetMasterAddr(); + +int64_t GetGlobalWorldSize(); + +uint16_t GetMasterPort(); + +std::shared_ptr CreateOrGetGlobalTCPStore(); } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_all_gather_functor.h b/paddle/phi/core/distributed/type_defs.h similarity index 67% rename from paddle/phi/core/distributed/auto_parallel/reshard_all_gather_functor.h rename to paddle/phi/core/distributed/type_defs.h index 311bd74765d2e5..cd201ac5c5aafe 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard_all_gather_functor.h +++ 
b/paddle/phi/core/distributed/type_defs.h @@ -1,4 +1,4 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,18 +14,16 @@ #pragma once -#include +#include +#include #include namespace phi { -class DenseTensor; -class DeviceContext; - namespace distributed { +class TensorDistAttr; -DenseTensor ReshardAllGatherFunctor(DeviceContext* dev_ctx, - const DenseTensor& input, - const std::vector& process_ids); +using SpmdInfo = + std::pair, std::vector>; } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/enforce.cc b/paddle/phi/core/enforce.cc index 69bcbf91ef463a..f47eb91e97d97e 100644 --- a/paddle/phi/core/enforce.cc +++ b/paddle/phi/core/enforce.cc @@ -20,16 +20,16 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/common/scalar.h" #include "paddle/utils/blank.h" +#include "paddle/utils/flags.h" #ifdef PADDLE_WITH_CUDA #include "paddle/phi/core/external_error.pb.h" #endif // PADDLE_WITH_CUDA -DECLARE_int32(call_stack_level); +PD_DECLARE_int32(call_stack_level); namespace egr { class EagerVariable; diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 2470981f6617c9..41d2dc8003bb6b 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1056,19 +1056,21 @@ PHI_DEFINE_EXPORTED_uint64(executor_log_deps_every_microseconds, 0, "Enable new executor log deps every n microseconds"); -DEFINE_int32(record_pool_max_size, - 2000000, - "SlotRecordDataset slot record pool max size"); -DEFINE_int32(slotpool_thread_num, 1, "SlotRecordDataset slot pool thread num"); -DEFINE_bool(enable_slotpool_wait_release, // NOLINT - false, - "enable slotrecord object wait release, default false"); -DEFINE_bool(enable_slotrecord_reset_shrink, // NOLINT - false, - "enable slotrecord object reset shrink memory, default false"); -DEFINE_bool(enable_ins_parser_file, // NOLINT - false, - "enable parser ins file, default false"); +PD_DEFINE_int32(record_pool_max_size, + 2000000, + "SlotRecordDataset slot record pool max size"); +PD_DEFINE_int32(slotpool_thread_num, + 1, + "SlotRecordDataset slot pool thread num"); +PD_DEFINE_bool(enable_slotpool_wait_release, // NOLINT + false, + "enable slotrecord object wait release, default false"); +PD_DEFINE_bool(enable_slotrecord_reset_shrink, // NOLINT + false, + "enable slotrecord object reset shrink memory, default false"); +PD_DEFINE_bool(enable_ins_parser_file, // NOLINT + false, + "enable parser ins file, default false"); PHI_DEFINE_EXPORTED_bool( gpugraph_enable_hbm_table_collision_stat, false, diff --git a/paddle/phi/core/flags.h b/paddle/phi/core/flags.h index 278090ff97d267..776e268b201c7c 100644 --- a/paddle/phi/core/flags.h +++ b/paddle/phi/core/flags.h @@ -20,9 +20,8 @@ #include #include -#include "gflags/gflags.h" #include "paddle/phi/core/macros.h" - +#include "paddle/utils/flags.h" #include "paddle/utils/variant.h" #if defined(_WIN32) @@ -33,6 +32,7 @@ #define PHI_IMPORT_FLAG #endif // _WIN32 +#ifdef PADDLE_WITH_GFLAGS // We redefine the gflags' macro for exporting global variable // ----------------------------DECLARE FLAGS---------------------------- @@ -127,6 +127,23 @@ clstring& FLAGS_##name = *FLAGS_no##name; \ } /* NOLINT */ \ using fLS::FLAGS_##name +#else // PADDLE_WITH_GFLAGS +#define PHI_DEFINE_bool(name, val, 
txt) PD_DEFINE_bool(name, val, txt) +#define PHI_DEFINE_int32(name, val, txt) PD_DEFINE_int32(name, val, txt) +#define PHI_DEFINE_uint32(name, val, txt) PD_DEFINE_uint32(name, val, txt) +#define PHI_DEFINE_int64(name, val, txt) PD_DEFINE_int64(name, val, txt) +#define PHI_DEFINE_uint64(name, val, txt) PD_DEFINE_uint64(name, val, txt) +#define PHI_DEFINE_double(name, val, txt) PD_DEFINE_double(name, val, txt) +#define PHI_DEFINE_string(name, val, txt) PD_DEFINE_string(name, val, txt) + +#define PHI_DECLARE_bool(name) PD_DECLARE_bool(name) +#define PHI_DECLARE_int32(name) PD_DECLARE_int32(name) +#define PHI_DECLARE_uint32(name) PD_DECLARE_uint32(name) +#define PHI_DECLARE_int64(name) PD_DECLARE_int64(name) +#define PHI_DECLARE_uint64(name) PD_DECLARE_uint64(name) +#define PHI_DECLARE_double(name) PD_DECLARE_double(name) +#define PHI_DECLARE_string(name) PD_DECLARE_string(name) +#endif namespace phi { diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 543fe4fef7611a..3ead251e332207 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -14,9 +14,9 @@ #include "paddle/phi/core/kernel_factory.h" -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/core/enforce.h" +#include "paddle/utils/flags.h" #if defined(PADDLE_WITH_XPU) #include "paddle/phi/backends/xpu/xpu_op_list.h" #include "paddle/phi/common/data_type.h" @@ -34,9 +34,9 @@ PADDLE_DEFINE_EXPORTED_bool( true, "Whether to use strdie kernel if op support stride."); -DECLARE_int32(low_precision_op_list); -DECLARE_bool(enable_api_kernel_fallback); -DECLARE_bool(run_kp_kernel); +PD_DECLARE_int32(low_precision_op_list); +PD_DECLARE_bool(enable_api_kernel_fallback); +PD_DECLARE_bool(run_kp_kernel); namespace phi { const static Kernel empty_kernel; // NOLINT diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index 4ef20d99582843..146e0bc4fc662d 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -17,8 +17,6 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/phi/core/dense_tensor.h" - -#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/selected_rows.h" #include "paddle/phi/core/string_tensor.h" @@ -208,6 +206,9 @@ bool MetaTensor::is_dense() const { return DenseTensor::classof(tensor_); } bool MetaTensor::is_selected_rows() const { return SelectedRows::classof(tensor_); } +bool MetaTensor::is_dist() const { + return distributed::DistTensor::classof(tensor_); +} bool MetaTensor::is_tensor_array() const { return false; } bool MetaTensor::is_same_tensor(const MetaTensor& meta_tensor) const { diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 900bfb3eb6b3fb..e7ccc1a61c5f27 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -23,7 +23,6 @@ limitations under the License. */ namespace phi { -// TODO(chenweihang): add other flags if needed struct MetaConfig { bool is_runtime{true}; bool is_run_mkldnn_kernel{false}; @@ -82,6 +81,8 @@ class MetaTensor { virtual bool is_selected_rows() const; virtual bool is_dense() const; + virtual bool is_dist() const; + // TODO(YuanRisheng) This API is for compatible with Fluid // and it will be deleted in the future. 
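With the PHI_* to PD_* mapping above, flag call sites look the same whether gflags or paddle_flags is underneath. A hedged usage sketch (the flag name is made up, and it assumes, as the threadpool and enforce hunks suggest, that the PD_* macros expose gflags-style FLAGS_<name> variables):

    #include "paddle/utils/flags.h"

    // Define in exactly one translation unit (same shape as the
    // io_threadpool_size flag above); "demo_pool_size" is hypothetical.
    PD_DEFINE_int32(demo_pool_size, 100, "demo: number of worker threads");

    // Any other translation unit pulls it in with a declaration instead:
    //   PD_DECLARE_int32(demo_pool_size);

    int DemoPoolSize() { return FLAGS_demo_pool_size; }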
virtual bool is_tensor_array() const; @@ -97,7 +98,7 @@ class MetaTensor { protected: static void unspecified_bool_true() {} - private: + protected: // Because the lod in compiletime and runtime is different, // so `LoD` cannot in public methods const LoD& lod() const; diff --git a/paddle/phi/core/threadpool.cc b/paddle/phi/core/threadpool.cc index 5e146023c29cec..7538087f4e8553 100644 --- a/paddle/phi/core/threadpool.cc +++ b/paddle/phi/core/threadpool.cc @@ -16,14 +16,14 @@ #include -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/core/enforce.h" +#include "paddle/utils/flags.h" -DECLARE_int32(dist_threadpool_size); -DEFINE_int32(io_threadpool_size, - 100, - "number of threads used for doing IO, default 100"); +PD_DECLARE_int32(dist_threadpool_size); +PD_DEFINE_int32(io_threadpool_size, + 100, + "number of threads used for doing IO, default 100"); namespace phi { diff --git a/paddle/phi/infermeta/CMakeLists.txt b/paddle/phi/infermeta/CMakeLists.txt index f53f655b244095..ef68ac8632ce42 100644 --- a/paddle/phi/infermeta/CMakeLists.txt +++ b/paddle/phi/infermeta/CMakeLists.txt @@ -1,5 +1,10 @@ add_subdirectory(strings) add_subdirectory(sparse) + +if(WITH_DISTRIBUTE) + add_subdirectory(spmd_rules) +endif() + collect_srcs( infermeta_srcs SRCS diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 0118db6041203d..a9b14d2df3d172 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -1942,6 +1942,15 @@ void LogLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void LogicalBinaryInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out) { + ElementwiseInferMeta(x, y, out); + if (!(out->is_same_tensor(x))) { + out->set_dtype(DataType::BOOL); + } +} + void LUUnpackInferMeta(const MetaTensor& x, const MetaTensor& pivots, bool unpack_ludata, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 8aa4114e740460..9060d2abc6564d 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -300,6 +300,10 @@ void IndexAddInferMeta(const MetaTensor& x, void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); +void LogicalBinaryInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out); + void LogLossInferMeta(const MetaTensor& input, const MetaTensor& label, float epsilon, diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index 3143c5cde2e1e5..993fb5d5887b82 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -821,4 +821,138 @@ void FastLayernormXPUInferMeta(const MetaTensor& x, out->set_layout(x.layout()); } +void FusedScaleBiasReluConvBnstatsInferMeta( + const MetaTensor& x, + const MetaTensor& w, + const MetaTensor& scale, + const MetaTensor& bias, + const MetaTensor& bn_scale, + const MetaTensor& bn_bias, + const MetaTensor& input_running_mean, + const MetaTensor& input_running_var, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const std::string& padding_algorithm, + int groups, + const std::string& data_format, + float momentum, + float epsilon, + bool fuse_prologue, + bool exhaustive_search, + int64_t accumulation_count, + MetaTensor* out, + MetaTensor* out_running_mean, + MetaTensor* out_running_var, + MetaTensor* saved_mean, + MetaTensor* saved_var, + MetaTensor* eq_scale, + MetaTensor* eq_bias) { + auto in_dims = x.dims(); + auto filter_dims = w.dims(); + // do some checks + PADDLE_ENFORCE_EQ( + 
in_dims.size(),
+      4,
+      phi::errors::InvalidArgument(
+          "The input of Op(FusedScaleBiasReluConvBnstats) should be a 4-D "
+          "Tensor. But "
+          "received: input's dimension is %u, input's shape is [%s].",
+          in_dims.size(),
+          in_dims));
+
+  PADDLE_ENFORCE_EQ(
+      in_dims.size(),
+      filter_dims.size(),
+      phi::errors::InvalidArgument(
+          "The input's dimension and filter's dimension of "
+          "Op(FusedScaleBiasReluConvBnstats) should be equal. But received: "
+          "the input's"
+          " shape is [%s], "
+          "the input's dimension is %d; the filter's shape is [%s], "
+          "the filter's dimension is %d.",
+          in_dims,
+          in_dims.size(),
+          filter_dims,
+          filter_dims.size()));
+
+  // Check if data format is NHWC
+  PADDLE_ENFORCE_EQ(
+      data_format,
+      "NHWC",
+      phi::errors::InvalidArgument(
+          "Operator(FusedScaleBiasReluConvBnstats) only supports data format "
+          "of "
+          "channel last (NHWC) now. But received: data_format = '%s'.",
+          data_format));
+
+  PADDLE_ENFORCE_EQ(
+      groups,
+      1,
+      phi::errors::InvalidArgument("Expect group to be 1, got %d.", groups));
+
+  const auto input_channels = in_dims[in_dims.size() - 1];
+  int dilation_size = dilations.size();
+  for (int i = 0; i < dilation_size; ++i) {
+    PADDLE_ENFORCE_GT(
+        dilations[i],
+        0,
+        phi::errors::InvalidArgument(
+            "The dilation of Op(Conv) should be larger than 0, but received "
+            "dilation is %d.",
+            dilations[i]));
+  }
+
+  PADDLE_ENFORCE_EQ(
+      input_channels,
+      filter_dims[1] * groups,
+      phi::errors::InvalidArgument(
+          "The number of input's channels should be equal to filter's channels "
+          "* groups for Op(FusedScaleBiasReluConvBnstats). But received: the "
+          "input's"
+          " channels is %d, "
+          "the input's shape is [%s]; the filter's channels is %d, the "
+          "filter's shape is [%s]; the groups is %d. ",
+          input_channels,
+          in_dims,
+          filter_dims[1],
+          filter_dims,
+          groups));
+
+  // update paddings and dilations according to padding_algorithm
+  std::vector<int> paddings_vec = paddings;
+  std::vector<int> dilations_vec = dilations;
+  // get "HW" from "NHWC"
+  DDim in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
+  DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size());
+  std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
+  phi::UpdatePaddingAndDilation(&paddings_vec,
+                                &dilations_vec,
+                                padding_algorithm,
+                                in_data_dims,
+                                strides,
+                                ksize);
+
+  std::vector<int64_t> out_shape({in_dims[0]});
+  for (size_t i = 0; i < strides.size(); ++i) {
+    out_shape.push_back(ConvOutSize(in_dims[i + 1],
+                                    filter_dims[i + 2],
+                                    dilations[i],
+                                    paddings_vec[i * 2],
+                                    paddings_vec[i * 2 + 1],
+                                    strides[i]));
+  }
+  out_shape.push_back(filter_dims[0]);
+  // make shape for other outputs
+  auto c_dims = phi::make_ddim({filter_dims[0]});
+  // set output dims
+  out->set_dims(DDim(out_shape.data(), out_shape.size()));
+  out_running_mean->set_dims(c_dims);
+  out_running_var->set_dims(c_dims);
+  saved_mean->set_dims(c_dims);
+  saved_var->set_dims(c_dims);
+  eq_scale->set_dims(c_dims);
+  eq_bias->set_dims(c_dims);
+}
+
 } // namespace phi
diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h
index 25c27bdd406b96..3d7ba19c4ec3f5 100644
--- a/paddle/phi/infermeta/fusion.h
+++ b/paddle/phi/infermeta/fusion.h
@@ -201,4 +201,32 @@ void FastLayernormXPUInferMeta(const MetaTensor& x,
                                float epsilon,
                                MetaTensor* out);
 
+void FusedScaleBiasReluConvBnstatsInferMeta(
+    const MetaTensor& x,
+    const MetaTensor& w,
+    const MetaTensor& scale,
+    const MetaTensor& bias,
+    const MetaTensor& bn_scale,
+    const MetaTensor& bn_bias,
+    const MetaTensor&
input_running_mean, + const MetaTensor& input_running_var, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const std::string& padding_algorithm, + int groups, + const std::string& data_format, + float momentum, + float epsilon, + bool fuse_prologue, + bool exhaustive_search, + int64_t accumulation_count, + MetaTensor* out, + MetaTensor* out_running_mean, + MetaTensor* out_running_var, + MetaTensor* saved_mean, + MetaTensor* saved_var, + MetaTensor* eq_scale, + MetaTensor* eq_bias); + } // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/CMakeLists.txt b/paddle/phi/infermeta/spmd_rules/CMakeLists.txt new file mode 100644 index 00000000000000..c28cd85c718c88 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/CMakeLists.txt @@ -0,0 +1,6 @@ +file( + GLOB spmd_rules_srcs + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "*.cc") + +collect_srcs(infermeta_srcs SRCS ${spmd_rules_srcs}) diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.cc b/paddle/phi/infermeta/spmd_rules/matmul.cc similarity index 69% rename from paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.cc rename to paddle/phi/infermeta/spmd_rules/matmul.cc index d280ccec37d7a1..088f9ab16363ad 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.cc +++ b/paddle/phi/infermeta/spmd_rules/matmul.cc @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -12,22 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.h" +#include "paddle/phi/infermeta/spmd_rules/matmul.h" +#include "glog/logging.h" + +#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" +#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" #include "paddle/phi/core/distributed/auto_parallel/utils.h" +#include "paddle/phi/infermeta/spmd_rules/utils.h" -namespace paddle { +namespace phi { namespace distributed { -namespace auto_parallel { + using phi::distributed::auto_parallel::str_join; -TensorDistAttr GetInferedDistAttr( +////////////////// Utils Functions ////////////////// + +TensorDistAttr GetMatmulInferedDistAttr( const TensorDistAttr& origin_dist_attr, const std::vector& shape, const std::string& tensor_axis, const std::unordered_map& axis_to_dim_map, - const bool trans_axis) { - TensorDistAttr dist_attr_ = CopyTensorDistAttrForOutput(origin_dist_attr); + bool trans_axis) { + TensorDistAttr dist_attr = CopyTensorDistAttrForOutput(origin_dist_attr); std::vector infered_dims_mapping; infered_dims_mapping.reserve(tensor_axis.size()); @@ -50,8 +57,8 @@ TensorDistAttr GetInferedDistAttr( infered_dims_mapping.end() - 1); } - dist_attr_.set_dims_mapping(infered_dims_mapping); - return dist_attr_; + dist_attr.set_dims_mapping(infered_dims_mapping); + return dist_attr; } void FillMatmulOperandNotation(const int x_ndim, @@ -105,42 +112,35 @@ void FillMatmulOperandNotation(const int x_ndim, } } -std::pair, std::vector> -MatmulSPMDRule::InferForward(const std::vector& input_specs, - const paddle::framework::AttributeMap& attrs) { - // step0: verify input args based on matmul logic - auto input_specs_size = input_specs.size(); - PADDLE_ENFORCE_EQ( - input_specs_size, - 2, - phi::errors::InvalidArgument( - "The size of InputSpec of matmul should be 2, but got [%d].", - input_specs_size)); - auto x_shape = input_specs[0].shape(); - auto y_shape = input_specs[1].shape(); +////////////////// InferMeta(Contains SPMD) Functions ////////////////// + +SpmdInfo MatmulSpmdInferForward(const DistMetaTensor& x, + const DistMetaTensor& y, + bool trans_x, + bool trans_y) { + // Step0: verify input args based on matmul logic + auto x_shape = phi::vectorize(x.dims()); + auto y_shape = phi::vectorize(y.dims()); int x_ndim = x_shape.size(); int y_ndim = y_shape.size(); - auto x_dist_attr_src = input_specs[0].dist_attr(); - auto y_dist_attr_src = input_specs[1].dist_attr(); + auto x_dist_attr_src = x.dist_attr(); + auto y_dist_attr_src = y.dist_attr(); std::vector x_dims_mapping = x_dist_attr_src.dims_mapping(); std::vector y_dims_mapping = y_dist_attr_src.dims_mapping(); PADDLE_ENFORCE_EQ( x_ndim, x_dims_mapping.size(), - phi::errors::InvalidArgument( - "Mismatch of X's tensor size: [%d] and X's dims_mapping size [%d].", - x_ndim, - x_dims_mapping.size())); + phi::errors::InvalidArgument("The Tensor X's rank [%d] and X's " + "dims_mapping size [%d] are not matched.", + x_ndim, + x_dims_mapping.size())); PADDLE_ENFORCE_EQ( y_ndim, y_dims_mapping.size(), - phi::errors::InvalidArgument( - "Mismatch of Y's tensor size: [%d] and Y's dims_mapping size [%d].", - y_ndim, - y_dims_mapping.size())); - - bool trans_x = ExtractAttr("trans_x", attrs); - bool trans_y = ExtractAttr("trans_y", attrs); + phi::errors::InvalidArgument("The Tensor Y's rank [%d] and Y's " + "dims_mapping size [%d] are not matched.", + y_ndim, + y_dims_mapping.size())); VLOG(6) << "MatmulSPMDRule InferForward Inputs: " << "X shape: [" << str_join(x_shape) << "], x_dims_mapping: [" @@ 
-151,37 +151,37 @@ MatmulSPMDRule::InferForward(const std::vector<DistTensorSpec>& input_specs,
          << "trans_y: "
          << "[" << (trans_y ? "true" : "false") << "]; ";
 
-  // step1: build Einsum Notation
+  // Step1: build Einsum Notation
   std::string x_axes;
   std::string y_axes;
   std::string out_axes;
   FillMatmulOperandNotation(x_ndim, y_ndim, &x_axes, &y_axes, &out_axes);
 
-  // step2: Sharding Propogation
+  // Step2: Sharding Propagation
   if (trans_x) {
-    PADDLE_ENFORCE_GE(
-        x_ndim,
-        2,
-        phi::errors::InvalidArgument("When trans_x is True, the size of X "
-                                     "tensor should be 2, but got [%d].",
-                                     x_ndim));
+    PADDLE_ENFORCE_GE(x_ndim,
+                      2,
+                      phi::errors::InvalidArgument(
+                          "When trans_x is True, the size of X "
+                          "tensor should be at least 2, but got [%d].",
+                          x_ndim));
     std::iter_swap(x_dims_mapping.end() - 2, x_dims_mapping.end() - 1);
   }
   if (trans_y) {
-    PADDLE_ENFORCE_GE(
-        y_ndim,
-        2,
-        phi::errors::InvalidArgument("When trans_x is True, the size of X "
-                                     "tensor should be 2, but got [%d].",
-                                     y_ndim));
+    PADDLE_ENFORCE_GE(y_ndim,
+                      2,
+                      phi::errors::InvalidArgument(
+                          "When trans_y is True, the size of Y "
+                          "tensor should be at least 2, but got [%d].",
+                          y_ndim));
     std::iter_swap(y_dims_mapping.end() - 2, y_dims_mapping.end() - 1);
   }
 
-  // step2.1: Sharding Merge
+  // Step2.1: Sharding Merge
   std::pair<std::string, std::vector<int64_t>> x_pair(x_axes, x_dims_mapping);
   std::pair<std::string, std::vector<int64_t>> y_pair(y_axes, y_dims_mapping);
   auto axis_to_dim_map = ShardingMergeForTensors({x_pair, y_pair});
 
-  // step2.2: Infer Output's Dims Mapping.
+  // Step2.2: Infer Output's Dims Mapping.
   TensorDistAttr output_dist_attr_dst =
       CopyTensorDistAttrForOutput(x_dist_attr_src);
   std::vector<int64_t> out_dims_mapping;
@@ -191,13 +191,13 @@ MatmulSPMDRule::InferForward(const std::vector<DistTensorSpec>& input_specs,
   }
   output_dist_attr_dst.set_dims_mapping(out_dims_mapping);
 
-  // step2.3: Merge and get Inputs' New Dims Mapping.
-  TensorDistAttr x_dist_attr_dst = GetInferedDistAttr(
+  // Step2.3: Merge and get Inputs' New Dims Mapping.
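A worked trace of the forward steps may help (hedged illustration; the axis letters follow the usual matmul einsum convention bmk,kn->bmn):

    x: shape [B, M, K], dims_mapping [0, -1, 1]  ->  axes "bmk"
    y: shape [K, N],    dims_mapping [1, -1]     ->  axes "kn"
    ShardingMergeForTensors: {b: 0, m: -1, k: 1, n: -1}  (k merges as 1)
    out axes "bmn"  ->  out dims_mapping [0, -1, -1]
    k is sharded on mesh dim 1 but absent from "bmn"
      ->  the output is partial on mesh dim 1 (ResoluteOutputPartialDimension)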
+ TensorDistAttr x_dist_attr_dst = GetMatmulInferedDistAttr( x_dist_attr_src, x_shape, x_axes, axis_to_dim_map, trans_x); - TensorDistAttr y_dist_attr_dst = GetInferedDistAttr( + TensorDistAttr y_dist_attr_dst = GetMatmulInferedDistAttr( y_dist_attr_src, y_shape, y_axes, axis_to_dim_map, trans_y); - // step2.3: Handle Partial + // Step2.3: Handle Partial // Step2.3.1 Output Partial std::vector partial_on_dims = ResoluteOutputPartialDimension(axis_to_dim_map, out_axes); @@ -221,24 +221,16 @@ MatmulSPMDRule::InferForward(const std::vector& input_specs, return {{x_dist_attr_dst, y_dist_attr_dst}, {output_dist_attr_dst}}; } -std::pair, std::vector> -MatmulSPMDRule::InferBackward(const std::vector& input_specs, - const std::vector& output_specs, - const paddle::framework::AttributeMap& attrs) { - // extra & verify input - auto output_specs_size = output_specs.size(); - PADDLE_ENFORCE_EQ( - output_specs_size, - 1, - phi::errors::InvalidArgument( - "The size of OutputSpec of matmul should be 1, but got [%d].", - output_specs_size)); - - auto out_shape = output_specs[0].shape(); +SpmdInfo MatmulSpmdInferBackward(const DistMetaTensor& x, + const DistMetaTensor& y, + const DistMetaTensor& out, + bool trans_x, + bool trans_y) { + auto out_shape = phi::vectorize(out.dims()); int out_ndim = out_shape.size(); - auto x_shape = input_specs[0].shape(); - auto y_shape = input_specs[1].shape(); + auto x_shape = phi::vectorize(x.dims()); + auto y_shape = phi::vectorize(y.dims()); int x_ndim = x_shape.size(); int y_ndim = y_shape.size(); int max_ndim = std::max(x_ndim, y_ndim); @@ -250,10 +242,7 @@ MatmulSPMDRule::InferBackward(const std::vector& input_specs, max_ndim, out_ndim)); - bool trans_x = ExtractAttr("trans_x", attrs); - bool trans_y = ExtractAttr("trans_y", attrs); - - auto out_dist_attr_src = output_specs[0].dist_attr(); + auto out_dist_attr_src = out.dist_attr(); std::vector out_dims_mapping = out_dist_attr_src.dims_mapping(); // step1: build Einsum Notation @@ -267,10 +256,10 @@ MatmulSPMDRule::InferBackward(const std::vector& input_specs, auto axis_to_dim_map = ShardingMergeForTensors({{out_axes, out_dims_mapping}}, false); - TensorDistAttr x_dist_attr_dst = GetInferedDistAttr( - input_specs[0].dist_attr(), x_shape, x_axes, axis_to_dim_map, trans_x); - TensorDistAttr y_dist_attr_dst = GetInferedDistAttr( - input_specs[1].dist_attr(), y_shape, y_axes, axis_to_dim_map, trans_y); + TensorDistAttr x_dist_attr_dst = GetMatmulInferedDistAttr( + x.dist_attr(), x_shape, x_axes, axis_to_dim_map, trans_x); + TensorDistAttr y_dist_attr_dst = GetMatmulInferedDistAttr( + y.dist_attr(), y_shape, y_axes, axis_to_dim_map, trans_y); // step3: Handle Partial // NOTE we skip the partial backward inference in Partial Stage-I. @@ -289,6 +278,5 @@ MatmulSPMDRule::InferBackward(const std::vector& input_specs, return {{x_dist_attr_dst, y_dist_attr_dst}, {out_dist_attr_src}}; } -} // namespace auto_parallel } // namespace distributed -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/matmul.h b/paddle/phi/infermeta/spmd_rules/matmul.h new file mode 100644 index 00000000000000..64cfba26a7445c --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/matmul.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
+#include "paddle/phi/core/distributed/type_defs.h"
+
+namespace phi {
+namespace distributed {
+
+SpmdInfo MatmulSpmdInferForward(const DistMetaTensor& x,
+                                const DistMetaTensor& y,
+                                bool trans_x,
+                                bool trans_y);
+
+SpmdInfo MatmulSpmdInferBackward(const DistMetaTensor& x,
+                                 const DistMetaTensor& y,
+                                 const DistMetaTensor& out,
+                                 bool trans_x,
+                                 bool trans_y);
+
+} // namespace distributed
+} // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h
new file mode 100644
index 00000000000000..ad519ff287a33b
--- /dev/null
+++ b/paddle/phi/infermeta/spmd_rules/rules.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
+
+#include "paddle/phi/infermeta/spmd_rules/matmul.h"
+
+/**
+ * Design Notes:
+ *
+ * 1. SPMD info is the special meta info of DistTensor, so we put the SPMD
+ * infer functions in the `infermeta` directory.
+ *
+ * 2. Since the SPMD forward and backward infer functions are closely related
+ * and need to be registered together, we manage them in one file.
+ *
+ * 3. SPMD rules are much smaller than infermeta functions, so we manage the
+ * files at operator granularity.
+ *
+ * 4. The previous registration used compile-time regular matching, which was
+ * less flexible; here the SPMD rules are instead registered by declaring
+ * them directly in the header file.
+ */
+
+namespace phi {
+namespace distributed {
+
+// matmul rule
+PD_REGISTER_SPMD_RULE(matmul,
+                      PD_INFER_SPMD(phi::distributed::MatmulSpmdInferForward),
+                      PD_INFER_SPMD(phi::distributed::MatmulSpmdInferBackward));
+
+} // namespace distributed
+} // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/utils.cc b/paddle/phi/infermeta/spmd_rules/utils.cc
new file mode 100644
index 00000000000000..2252de98a78b35
--- /dev/null
+++ b/paddle/phi/infermeta/spmd_rules/utils.cc
@@ -0,0 +1,159 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/infermeta/spmd_rules/utils.h"
+
+#include "glog/logging.h"
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/utils.h"
+#include "paddle/phi/core/enforce.h"
+
+namespace phi {
+namespace distributed {
+
+using phi::distributed::auto_parallel::str_join;
+
+std::string GetBroadcastAxes(const int64_t& tenosr_ndim,
+                             const int64_t& broadcast_ndim,
+                             const std::string& alphabet) {
+  PADDLE_ENFORCE_GE(
+      alphabet.size(),
+      broadcast_ndim,
+      phi::errors::InvalidArgument(
+          "The size of alphabet [%d] is less than broadcast ndim [%d]",
+          alphabet.size(),
+          broadcast_ndim));
+  PADDLE_ENFORCE_GE(broadcast_ndim,
+                    tenosr_ndim,
+                    phi::errors::InvalidArgument(
+                        "The broadcast ndim [%d] is less than tensor ndim [%d]",
+                        broadcast_ndim,
+                        tenosr_ndim));
+  if (tenosr_ndim <= 0) {
+    return std::string();
+  }
+  return alphabet.substr(broadcast_ndim - tenosr_ndim, tenosr_ndim);
+}
+
+// Rule1: A replicated dimension could be merged by any sharded dimension.
+// Rule2: A tensor axis could at most be sharded by one mesh dimension.
+// (TODO: trigger heuristic cost model and reshard to handle the case of an
+// axis sharded by multiple mesh dimensions.)
+int64_t ShardingMergeForAxis(const std::string& axis,
+                             const int64_t& mesh_dim1,
+                             const int64_t& mesh_dim2) {
+  if (mesh_dim1 != mesh_dim2) {
+    if (mesh_dim1 == -1) {
+      return mesh_dim2;
+    } else if (mesh_dim2 == -1) {
+      return mesh_dim1;
+    } else {
+      // (TODO) local cost model here.
+      PADDLE_THROW(
+          phi::errors::Unimplemented("Tensor Axis[%s] is Sharded by two "
+                                     "different mesh dimension [%d] and [%d].",
+                                     axis,
+                                     mesh_dim1,
+                                     mesh_dim2));
+    }
+
+  } else {
+    return mesh_dim1;
+  }
+}
+
+std::unordered_map<std::string, int64_t> ShardingMergeForTensors(
+    const std::vector<std::pair<std::string, std::vector<int64_t>>>&
+        tensor_axes_to_dim_pairs,
+    const bool merge_conflicts) {
+  std::unordered_map<std::string, int64_t> axis_to_dim_map;
+  std::unordered_map<int64_t, std::string> dim_to_axis_map;
+  int64_t merge_dim;
+
+  for (auto& pair : tensor_axes_to_dim_pairs) {
+    for (size_t i = 0; i < pair.second.size(); ++i) {
+      auto tensor_axis = pair.first.substr(i, 1);
+      auto mesh_dim = pair.second[i];
+
+      if (axis_to_dim_map.count(tensor_axis) == 0) {
+        merge_dim = mesh_dim;
+      } else {
+        merge_dim = ShardingMergeForAxis(
+            tensor_axis, mesh_dim, axis_to_dim_map[tensor_axis]);
+      }
+      axis_to_dim_map[tensor_axis] = merge_dim;
+      if (merge_dim != -1) {
+        if (dim_to_axis_map.count(merge_dim) == 0) {
+          dim_to_axis_map.insert({merge_dim, tensor_axis});
+        } else if (dim_to_axis_map[merge_dim].find(tensor_axis) ==
+                   std::string::npos) {
+          dim_to_axis_map[merge_dim] += tensor_axis;
+        }
+      }
+    }
+  }
+
+  // Resolve the "mesh_dim sharded by more than one axis" conflict.
+  // For now we just pick the first axis naively.
+  // (TODO) use local cost model to pick the axis with the lowest cost (in
+  // terms of memory, communication, or computation).
+  for (auto& it : dim_to_axis_map) {
+    if (it.second.size() > 1) {
+      if (merge_conflicts) {
+        VLOG(4) << "Sharding Conflict: Mesh_Dim [" << it.first
+                << "] is Sharding Multiple Tensor Axes: [" << it.second
+                << "]. The Axis: [" << it.second[0] << "] is Picked.";
+        for (size_t i = 1; i < it.second.size(); ++i) {
+          axis_to_dim_map[it.second.substr(i, 1)] = -1;
+        }
+      } else {
+        PADDLE_THROW(phi::errors::PreconditionNotMet(
+            "Multiple Tensor Axes [%s] are sharded by same mesh dimension [%d].",
+            str_join(it.second),
+            it.first));
+      }
+    }
+  }
+
+  return axis_to_dim_map;
+}
+
+TensorDistAttr CopyTensorDistAttrForOutput(
+    const TensorDistAttr& src_dist_attr) {
+  TensorDistAttr new_dist_attr = TensorDistAttr();
+  new_dist_attr.set_process_mesh(src_dist_attr.process_mesh());
+  new_dist_attr.set_batch_dim(src_dist_attr.batch_dim());
+  new_dist_attr.set_dynamic_dims(src_dist_attr.dynamic_dims());
+  // new_dist_attr.set_annotated(false); TODO unset field is false by default.
+  return new_dist_attr;
+}
+
+std::vector<int64_t> ResoluteOutputPartialDimension(
+    const std::unordered_map<std::string, int64_t>& axis_to_dim_map,
+    const std::string& tensor_axes) {
+  std::vector<int64_t> partial_on_dims;
+
+  for (auto& it : axis_to_dim_map) {
+    if (tensor_axes.find(it.first) == std::string::npos) {
+      if (it.second > -1) {
+        partial_on_dims.push_back(it.second);
+      }
+    }
+  }
+  return partial_on_dims;
+}
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/utils.h b/paddle/phi/infermeta/spmd_rules/utils.h
new file mode 100644
index 00000000000000..5e3c3a3d0961c7
--- /dev/null
+++ b/paddle/phi/infermeta/spmd_rules/utils.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace phi {
+namespace distributed {
+class TensorDistAttr;
+
+// Generate the axis notation of a tensor for the einsum notation of a
+// broadcast operation (aligned starting from the rightmost axis).
+// tenosr_ndim: the ndim of the tensor. broadcast_ndim: the maximum ndim of
+// the tensors in this broadcast operation. alphabet: the characters used to
+// represent the axes of the tensor; the length of alphabet should be
+// >= broadcast_ndim.
+std::string GetBroadcastAxes(const int64_t& tenosr_ndim,
+                             const int64_t& broadcast_ndim,
+                             const std::string& alphabet);
+
+// Merge the sharding specification (dims mapping) for one tensor axis.
+// Rule1: A replicated dimension could be merged by any sharded dimension.
+// Rule2: A tensor axis could at most be sharded by one mesh dimension.
+// (TODO: trigger heuristic cost model and reshard to handle the case of an
+// axis sharded by multiple mesh dimensions.)
+int64_t ShardingMergeForAxis(const std::string& axis,
+                             const int64_t& mesh_dim1,
+                             const int64_t& mesh_dim2);
+
+// Merge the sharding specifications (dims mappings) of the given tensors.
+// The same axes of different tensors will be merged.
+std::unordered_map<std::string, int64_t> ShardingMergeForTensors(
+    const std::vector<std::pair<std::string, std::vector<int64_t>>>&
+        tensor_axes_to_dim_pairs,
+    const bool merge_conflicts = true);
+
+// Intended for generating the TensorDistAttr of an output based on the
+// input activation TensorDistAttr.
+// The process_mesh, batch_dim and dynamic_dims are copied, annotated is
+// forced to false, and dims_mapping is left empty.
+TensorDistAttr CopyTensorDistAttrForOutput(const TensorDistAttr& src_dist_attr);
+
+// Resolve the partial mesh dimensions of an output tensor, given the merged
+// sharding specification of the input tensors and the axis names of the
+// output tensor.
+std::vector<int64_t> ResoluteOutputPartialDimension(
+    const std::unordered_map<std::string, int64_t>& axis_to_dim_map,
+    const std::string& tensor_axes);
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index 6e04a149266ac1..28b80b58155fa4 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 #include
 #include
 
-#include "gflags/gflags.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/type_traits.h"
 #include "paddle/phi/core/enforce.h"
@@ -31,6 +30,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/unfold_functor.h"
 #include "paddle/phi/kernels/funcs/unsqueeze.h"
 #include "paddle/phi/kernels/impl/einsum_impl.h"
+#include "paddle/utils/flags.h"
 
 namespace phi {
 
@@ -2083,6 +2083,13 @@ void KthvalueInferMeta(const MetaTensor& x,
   indices->set_dtype(x.dtype());
 }
 
+void LogicalNotInfermeta(const MetaTensor& x, MetaTensor* out) {
+  UnchangedInferMeta(x, out);
+  if (!(out->is_same_tensor(x))) {
+    out->set_dtype(DataType::BOOL);
+  }
+}
+
 void LogsumexpInferMeta(const MetaTensor& input,
                         const std::vector<int64_t>& axis,
                         bool keepdim,
diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h
index 136b8c240e5f33..2bf90048d30d36 100644
--- a/paddle/phi/infermeta/unary.h
+++ b/paddle/phi/infermeta/unary.h
@@ -298,6 +298,8 @@ void KthvalueInferMeta(const MetaTensor& x,
                        MetaTensor* indices,
                        MetaConfig config = MetaConfig());
 
+void LogicalNotInfermeta(const MetaTensor& x, MetaTensor* out);
+
 void LogsumexpInferMeta(const MetaTensor& input,
                         const std::vector<int64_t>& axis,
                         bool keepdim,
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index 9a917004c83148..cc8df692a1267f 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -94,6 +94,11 @@ if(WITH_CUTLASS)
   list(APPEND kernel_cu ${cutlass_cu})
 endif()
 
+if(NOT WITH_CUDNN_FRONTEND)
+  list(REMOVE_ITEM kernel_cu
+       "fusion/gpu/fused_scale_bias_relu_conv_bnstats_kernel.cu")
+endif()
+
 set(cc_search_pattern
     "*.cc"
     "cpu/*.cc"
diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc
index 6ff1296b5135e7..ba48e2e00ce54f 100644
--- a/paddle/phi/kernels/autotune/cache.cc
+++ b/paddle/phi/kernels/autotune/cache.cc
@@ -47,6 +47,11 @@ std::string AlgorithmTypeString(int64_t algo_type) {
   } else if (algo_type ==
              static_cast<int64_t>(AlgorithmType::kConvBackwardFilterV8)) {
     return "conv_backward_filter_v8";
+  } else if (algo_type ==
+             static_cast<int64_t>(AlgorithmType::kScaleBiasReluConvBNstats)) {
+    return "scale_bias_relu_conv_bnstats";
+  } else if (algo_type == static_cast<int64_t>(AlgorithmType::kBNFinalize)) {
+    return "bn_finalize";
   }
 #endif
   return std::to_string(algo_type);
diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h
index 188faaed71be3a..34b98e28f50c7b 100644
--- a/paddle/phi/kernels/autotune/cache.h
+++ b/paddle/phi/kernels/autotune/cache.h
@@ -55,7 +55,9 @@ enum class AlgorithmType {
   kConvForwardV8 = 10,
   kConvBackwardDataV8 = 11,
   kConvBackwardFilterV8 = 12,
-  kAlgorithmCount = 13
+  kScaleBiasReluConvBNstats =
13, + kBNFinalize = 14, + kAlgorithmCount = 15 #endif }; @@ -178,9 +180,8 @@ class AutoTuneCache { conv_auto_tune_map_[key] = cache; } #ifdef PADDLE_WITH_CUDNN_FRONTEND - } else if (algo_type == AlgorithmType::kConvForwardV8 || - algo_type == AlgorithmType::kConvBackwardDataV8 || - algo_type == AlgorithmType::kConvBackwardFilterV8) { + } else if (algo_type >= AlgorithmType::kConvForwardV8 && + algo_type <= AlgorithmType::kBNFinalize) { int64_t key = static_cast(algo_type); if (cudnn_v8_auto_tune_map_.find(key) == cudnn_v8_auto_tune_map_.end()) { CudnnFrontendPlanCache cache; diff --git a/paddle/phi/kernels/autotune/cache_cudnn_frontend.h b/paddle/phi/kernels/autotune/cache_cudnn_frontend.h index 095cedccb991c2..cfd16e5143393f 100644 --- a/paddle/phi/kernels/autotune/cache_cudnn_frontend.h +++ b/paddle/phi/kernels/autotune/cache_cudnn_frontend.h @@ -22,7 +22,7 @@ #include "paddle/phi/backends/dynload/cudnn_frontend.h" -DECLARE_int32(cudnn_cache_saturation_count); +PD_DECLARE_int32(cudnn_cache_saturation_count); namespace phi { namespace autotune { @@ -79,10 +79,10 @@ class CudnnFrontendPlanCache { return ret; } - void GetPlan(const cudnn_frontend::feature_vector_t &feature, - const cudnn_frontend::ExecutionPlan **plan, - int64_t *workspace_size, - cudnnHandle_t handle) { + void GetPlanAndWorkspaceSize(const cudnn_frontend::feature_vector_t &feature, + const cudnn_frontend::ExecutionPlan **plan, + int64_t *workspace_size, + cudnnHandle_t handle) { // Note(tizheng): CUDNNv8 execution plan is not thread-safe. // A shared plan being executed by different threads is // generally not safe (for now). @@ -90,11 +90,11 @@ class CudnnFrontendPlanCache { auto &local_map = map_[hasher(std::this_thread::get_id())]; auto it = local_map.find(GetExtendedFeature(feature, handle)); - if (it == local_map.end()) { - PADDLE_THROW(phi::errors::InvalidArgument( - "[cudnn_frontend] Cached Plan Not Found.")); - return; - } + PADDLE_ENFORCE_NE(it, + local_map.end(), + phi::errors::InvalidArgument( + "[cudnn_frontend] Cached Plan Not Found.")); + *plan = &(it->second); *workspace_size = (*plan)->getWorkspaceSize(); VLOG(4) << "Cached execution plan found." 
<< (*plan)->getTag() @@ -133,11 +133,12 @@ class CudnnFrontendPlanCache { return FindPlan(op_graph.getFeatureVector(), handle); } - void GetPlan(const cudnn_frontend::OperationGraph &op_graph, - const cudnn_frontend::ExecutionPlan **plan, - int64_t *workspace_size, - cudnnHandle_t handle) { - GetPlan(op_graph.getFeatureVector(), plan, workspace_size, handle); + void GetPlanAndWorkspaceSize(const cudnn_frontend::OperationGraph &op_graph, + const cudnn_frontend::ExecutionPlan **plan, + int64_t *workspace_size, + cudnnHandle_t handle) { + GetPlanAndWorkspaceSize( + op_graph.getFeatureVector(), plan, workspace_size, handle); } void InsertPlan(const cudnn_frontend::OperationGraph &op_graph, @@ -176,5 +177,49 @@ class CudnnFrontendPlanCache { int64_t cache_misses_{0}; }; // class CudnnFrontendPlanCache +template +inline void BuildFeatureVectorSingle(cudnn_frontend::feature_vector_t *v, + const T &value) { + v->push_back(static_cast(value)); +} + +template <> +inline void BuildFeatureVectorSingle(cudnn_frontend::feature_vector_t *v, + const float &value) { + int64_t val = 0; + memcpy(&val, &value, sizeof(float)); + v->push_back(val); +} + +template <> +inline void BuildFeatureVectorSingle>( + cudnn_frontend::feature_vector_t *v, const std::vector &value) { + v->insert(v->end(), value.begin(), value.end()); +} + +template <> +inline void BuildFeatureVectorSingle>( + cudnn_frontend::feature_vector_t *v, const std::vector &value) { + for (auto &val : value) { + v->push_back(static_cast(val)); + } +} + +template <> +inline void BuildFeatureVectorSingle( + cudnn_frontend::feature_vector_t *v, const std::string &value) { + v->push_back(std::hash()(value)); +} + +inline void BuildFeatureVector(cudnn_frontend::feature_vector_t *v) { return; } + +template +inline void BuildFeatureVector(cudnn_frontend::feature_vector_t *v, + const T &value, + Args... 
args) { + BuildFeatureVectorSingle(v, value); + BuildFeatureVector(v, args...); +} + } // namespace autotune } // namespace phi diff --git a/paddle/phi/kernels/autotune/switch_autotune.cc b/paddle/phi/kernels/autotune/switch_autotune.cc index 3742749b3bf032..e287705d08b79b 100644 --- a/paddle/phi/kernels/autotune/switch_autotune.cc +++ b/paddle/phi/kernels/autotune/switch_autotune.cc @@ -14,10 +14,10 @@ #include "paddle/phi/kernels/autotune/switch_autotune.h" -#include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/utils/flags.h" -DECLARE_bool(use_autotune); +PD_DECLARE_bool(use_autotune); namespace phi { namespace autotune { diff --git a/paddle/phi/kernels/cpu/adam_kernel.cc b/paddle/phi/kernels/cpu/adam_kernel.cc index 083c9dab74001c..1a63b779b02a19 100644 --- a/paddle/phi/kernels/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/cpu/adam_kernel.cc @@ -24,7 +24,7 @@ #include "paddle/phi/kernels/funcs/adam_functors.h" #include "paddle/phi/kernels/funcs/jit/kernels.h" -DECLARE_int32(inner_op_parallelism); +PD_DECLARE_int32(inner_op_parallelism); namespace phi { diff --git a/paddle/phi/kernels/cpu/isclose_kernel.cc b/paddle/phi/kernels/cpu/isclose_kernel.cc index dca21494b3ee95..33457921df61e2 100644 --- a/paddle/phi/kernels/cpu/isclose_kernel.cc +++ b/paddle/phi/kernels/cpu/isclose_kernel.cc @@ -18,5 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/isclose_kernel_impl.h" -PD_REGISTER_KERNEL( - isclose, CPU, ALL_LAYOUT, phi::IscloseKernel, float, double) {} +PD_REGISTER_KERNEL(isclose, + CPU, + ALL_LAYOUT, + phi::IscloseKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/logical_kernel.cc b/paddle/phi/kernels/cpu/logical_kernel.cc index 33e3ea6e05f7e5..06dff8428533fe 100644 --- a/paddle/phi/kernels/cpu/logical_kernel.cc +++ b/paddle/phi/kernels/cpu/logical_kernel.cc @@ -77,6 +77,8 @@ void LogicalNotKernel(const Context& dev_ctx, int64_t, \ int, \ int8_t, \ + phi::dtype::complex, \ + phi::dtype::complex, \ int16_t) {} REGISTER_LOGICAL_CPU_KERNEL(logical_and, And) diff --git a/paddle/phi/kernels/distributed_fused_lamb_init_kernel.h b/paddle/phi/kernels/distributed_fused_lamb_init_kernel.h new file mode 100644 index 00000000000000..182c79ab80319f --- /dev/null +++ b/paddle/phi/kernels/distributed_fused_lamb_init_kernel.h @@ -0,0 +1,52 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
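
A usage sketch for the variadic BuildFeatureVector helpers added to
cache_cudnn_frontend.h above; the shapes and values here are made-up example
inputs, not taken from this diff:

    cudnn_frontend::feature_vector_t key;
    std::vector<int64_t> dim_x = {8, 64, 56, 56};  // hypothetical NCHW dims
    float alpha = 1.0f;                        // folded in bit-wise via memcpy
    std::string tag = "scale_bias_relu_conv";  // folded in via std::hash
    phi::autotune::BuildFeatureVector(&key, dim_x, alpha, tag);

The recursion peels one argument per call: BuildFeatureVectorSingle encodes
each value as one or more int64_t entries, and the zero-argument overload
terminates the expansion, so key ends up holding the dims, the float's bit
pattern, and the string hash in argument order.
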
+ +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DistributedFusedLambInitOpKernel( + const Context& dev_ctx, + const std::vector& param, + const std::vector& grad, + float beta1, + float beta2, + const std::vector& apply_weight_decay, + int alignment, + int rank, + int nranks, + DenseTensor* fp32_fused_param, + DenseTensor* fp32_fused_grad, + DenseTensor* fp16_fused_param, + DenseTensor* fp16_fused_grad, + DenseTensor* moment1, + DenseTensor* moment2, + DenseTensor* beta1_pow, + DenseTensor* beta2_pow, + DenseTensor* fused_param_offsets, + DenseTensor* fp32_shard_fused_param_offsets, + DenseTensor* fp16_shard_fused_param_offsets, + DenseTensor* param_info, + DenseTensor* param_order, + std::vector param_out, + std::vector master_param_out, + std::vector grad_out, + DenseTensor* global_scale, + DenseTensor* step); + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 84eea97da9f510..6e4f7e22781f13 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -17,8 +17,8 @@ #if defined(__NVCC__) #include #endif -#include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/utils/flags.h" #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h index 805a718ab85ed4..224cf4d6cb4970 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h @@ -14,13 +14,13 @@ #pragma once -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "paddle/phi/backends/dynload/rocblas.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" -DECLARE_bool(enable_cublas_tensor_op_math); +PD_DECLARE_bool(enable_cublas_tensor_op_math); namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/cross_entropy.cu b/paddle/phi/kernels/funcs/cross_entropy.cu index add838106bfe8d..20a15f9e944fef 100644 --- a/paddle/phi/kernels/funcs/cross_entropy.cu +++ b/paddle/phi/kernels/funcs/cross_entropy.cu @@ -124,11 +124,7 @@ void CrossEntropyFunctor::operator()( int batch_size = prob->dims()[0]; int class_num = prob->dims()[1]; -#ifdef __HIPCC__ - constexpr int kMaxBlockDim = 256; -#else constexpr int kMaxBlockDim = 512; -#endif if (softLabel) { const T* label_data = labels->data(); diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h index df9e93474446e6..5ff70c86d5fe8b 100644 --- a/paddle/phi/kernels/funcs/elementwise_grad_base.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -32,11 +32,7 @@ limitations under the License. 
*/ #endif -#ifdef __HIPCC__ -constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; -#else constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; -#endif #define BLOCK_X 32 #define BLOCK_Y 32 diff --git a/paddle/phi/kernels/funcs/for_range.h b/paddle/phi/kernels/funcs/for_range.h index 9648a7d845ff05..484fbd21dc7709 100644 --- a/paddle/phi/kernels/funcs/for_range.h +++ b/paddle/phi/kernels/funcs/for_range.h @@ -65,10 +65,7 @@ struct ForRange { template inline void operator()(Function func) const { -#ifdef __HIPCC__ - // HIP will throw core dump when threads > 256 - constexpr int num_threads = 256; -#elif WITH_NV_JETSON +#if WITH_NV_JETSON // JETSON_NANO will throw core dump when threads > 128 int num_thread = 256; backends::gpu::ChangeThreadNum(dev_ctx_, &num_thread, 128); diff --git a/paddle/phi/kernels/funcs/fused_gemm_epilogue.h b/paddle/phi/kernels/funcs/fused_gemm_epilogue.h index 6f4eb46bf4eb74..eb5f0fa540f8d3 100644 --- a/paddle/phi/kernels/funcs/fused_gemm_epilogue.h +++ b/paddle/phi/kernels/funcs/fused_gemm_epilogue.h @@ -26,7 +26,6 @@ limitations under the License. */ #if CUDA_VERSION >= 11060 -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/dynload/cublasLt.h" @@ -38,9 +37,10 @@ limitations under the License. */ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/scope_guard.h" #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +#include "paddle/utils/flags.h" #include "paddle/utils/optional.h" -DECLARE_int64(cublaslt_exhaustive_search_times); +PD_DECLARE_int64(cublaslt_exhaustive_search_times); namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/jit/benchmark.cc b/paddle/phi/kernels/funcs/jit/benchmark.cc index ad7146db0d93a6..d5d1da7385797f 100644 --- a/paddle/phi/kernels/funcs/jit/benchmark.cc +++ b/paddle/phi/kernels/funcs/jit/benchmark.cc @@ -15,18 +15,18 @@ #include #include -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/api/profiler/device_tracer.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/jit/kernels.h" +#include "paddle/utils/flags.h" -DEFINE_int32(burning, 10, "Burning times."); -DEFINE_int32(repeat, 3000, "Repeat times."); -DEFINE_int32(max_size, 1000, "The Max size would be tested."); -DEFINE_string(filter, "", "The Benchmark name would be run."); // NOLINT +PD_DEFINE_int32(burning, 10, "Burning times."); +PD_DEFINE_int32(repeat, 3000, "Repeat times."); +PD_DEFINE_int32(max_size, 1000, "The Max size would be tested."); +PD_DEFINE_string(filter, "", "The Benchmark name would be run."); // NOLINT class BenchJITKernel { public: @@ -546,7 +546,7 @@ BENCH_FP32_CPU(VBroadcast); // --max_size: the max size would be tested // --filter: the bench name would be run int main(int argc, char* argv[]) { - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + paddle::flags::ParseCommandLineFlags(&argc, &argv); google::InitGoogleLogging(argv[0]); LOG(INFO) << "Burning " << FLAGS_burning << " times, Repeat " << FLAGS_repeat << " times."; diff --git a/paddle/phi/kernels/funcs/jit/gen_base.h b/paddle/phi/kernels/funcs/jit/gen_base.h index dfad19eff34482..f96f08005ab033 100644 --- a/paddle/phi/kernels/funcs/jit/gen_base.h +++ b/paddle/phi/kernels/funcs/jit/gen_base.h @@ -22,9 +22,9 @@ #include // for _aligned_malloc #endif -#include "gflags/gflags.h" #include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/jit/kernel_base.h" 
+#include "paddle/utils/flags.h" PHI_DECLARE_bool(dump_jitcode); diff --git a/paddle/phi/kernels/funcs/jit/test.cc b/paddle/phi/kernels/funcs/jit/test.cc index 0dd5f6c6ba5aef..d388d95975cff9 100644 --- a/paddle/phi/kernels/funcs/jit/test.cc +++ b/paddle/phi/kernels/funcs/jit/test.cc @@ -16,15 +16,15 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/jit/kernels.h" +#include "paddle/utils/flags.h" -DEFINE_double(acc, 1e-5, "Test accuracy threshold."); +PD_DEFINE_double(acc, 1e-5, "Test accuracy threshold."); template void RandomVec(const int n, diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 778f13634300b1..1a52e57e45f236 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -42,11 +42,10 @@ template using LayerNormParamType = typename CudnnDataType::BatchNormParamType; inline static int GetDesiredBlockDim(int64_t block_dim) { + const int kMaxBlockDim = 512; #ifdef __HIPCC__ - const int kMaxBlockDim = 256; const int lwarpSize = 64; #else - const int kMaxBlockDim = 512; const int lwarpSize = 32; #endif return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; @@ -1875,11 +1874,7 @@ static void LayerNormBackward( int64_t feature_size, const phi::GPUContext &dev_ctx) { auto stream = dev_ctx.stream(); -#ifdef __HIPCC__ - const int kMaxBlockDim = 256; -#else const int kMaxBlockDim = 512; -#endif const int kMaxBlockNum = 128; int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) | ((d_scale != nullptr ? 1 : 0) << 1) | diff --git a/paddle/phi/kernels/fusion/cpu/distributed_fused_lamb_init_kernel.cc b/paddle/phi/kernels/fusion/cpu/distributed_fused_lamb_init_kernel.cc new file mode 100644 index 00000000000000..3cb37ccf2ed89d --- /dev/null +++ b/paddle/phi/kernels/fusion/cpu/distributed_fused_lamb_init_kernel.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
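
A minimal sketch of the flag pattern this PR migrates to, mirroring the
benchmark.cc and test.cc hunks above (the flag name below is a made-up
example, not one defined in this diff):

    #include "paddle/utils/flags.h"

    PD_DEFINE_int32(example_iters, 100, "Example iteration count.");

    int main(int argc, char* argv[]) {
      // replaces ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true)
      paddle::flags::ParseCommandLineFlags(&argc, &argv);
      return FLAGS_example_iters > 0 ? 0 : 1;  // flags are still read as FLAGS_*
    }
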
+
+#include "paddle/phi/kernels/distributed_fused_lamb_init_kernel.h"
+#include "paddle/phi/core/errors.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+namespace fusion {
+
+template <typename T, typename Context>
+void DistributedFusedLambInitOpKernel(
+    const Context& dev_ctx,
+    const std::vector<const DenseTensor*>& param,
+    const std::vector<const DenseTensor*>& grad,
+    float beta1,
+    float beta2,
+    const std::vector<int>& apply_weight_decay,
+    int alignment,
+    int rank,
+    int nranks,
+    DenseTensor* fp32_fused_param,
+    DenseTensor* fp32_fused_grad,
+    DenseTensor* fp16_fused_param,
+    DenseTensor* fp16_fused_grad,
+    DenseTensor* moment1,
+    DenseTensor* moment2,
+    DenseTensor* beta1_pow,
+    DenseTensor* beta2_pow,
+    DenseTensor* fused_param_offsets,
+    DenseTensor* fp32_shard_fused_param_offsets,
+    DenseTensor* fp16_shard_fused_param_offsets,
+    DenseTensor* param_info,
+    DenseTensor* param_order,
+    std::vector<DenseTensor*> param_out,
+    std::vector<DenseTensor*> master_param_out,
+    std::vector<DenseTensor*> grad_out,
+    DenseTensor* global_scale,
+    DenseTensor* step) {
+  PADDLE_THROW(phi::errors::Unavailable(
+      "The distributed_fused_lamb_init kernel does not support CPU now."));
+}
+}  // namespace fusion
+}  // namespace phi
+
+PD_REGISTER_KERNEL(distributed_fused_lamb_init,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::fusion::DistributedFusedLambInitOpKernel,
+                   float) {
+  kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32);
+  kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
+  kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT16);
+  kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT16);
+  kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32);
+  kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32);
+  kernel->OutputAt(6).SetDataType(phi::DataType::FLOAT32);
+  kernel->OutputAt(7).SetDataType(phi::DataType::FLOAT32);
+  kernel->OutputAt(8).SetDataType(phi::DataType::INT32);
+  kernel->OutputAt(9).SetDataType(phi::DataType::INT32);
+  kernel->OutputAt(10).SetDataType(phi::DataType::INT32);
+  kernel->OutputAt(11).SetDataType(phi::DataType::INT32);
+  kernel->OutputAt(12).SetDataType(phi::DataType::INT32);
+  kernel->OutputAt(13).SetDataType(kernel_key.dtype());
+  kernel->OutputAt(14).SetDataType(phi::DataType::FLOAT32);
+  kernel->OutputAt(15).SetDataType(kernel_key.dtype());
+  kernel->OutputAt(16).SetDataType(phi::DataType::FLOAT32);
+  kernel->OutputAt(17).SetDataType(phi::DataType::INT64);
+}
diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h
similarity index 74%
rename from paddle/fluid/operators/optimizers/cast_with_ptr.h
rename to paddle/phi/kernels/fusion/gpu/cast_with_ptr.h
index 205eb2853a3419..5ae8aed256ccdd 100644
--- a/paddle/fluid/operators/optimizers/cast_with_ptr.h
+++ b/paddle/phi/kernels/fusion/gpu/cast_with_ptr.h
@@ -14,28 +14,24 @@
 
 #pragma once
 
-#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
 #include "paddle/phi/api/include/tensor.h"
+#include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 
-namespace paddle {
-namespace operators {
-namespace details {
+namespace phi {
 
 template <typename InT, typename OutT>
 struct CastFunctor {
   HOSTDEVICE OutT operator()(InT x) const { return static_cast<OutT>(x); }
 };
-
 template <typename InT, typename OutT, int VecSize>
 static void VecCastKernel(const phi::GPUContext &ctx,
                           const InT *x,
                           OutT *y,
                           size_t n) {
-  auto config = platform::GetGpuLaunchConfig1D(ctx, n, VecSize);
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, n,
VecSize); auto block = config.GetGridSize(); auto thread = config.GetBlockSize(); auto main_offset = n / (VecSize * thread) * VecSize * thread; @@ -50,8 +46,6 @@ static void VecCastKernel(const phi::GPUContext &ctx, in_arr, out_arr, n, main_offset, VecSize, FunctorT()); } -} // namespace details - template static void LaunchCastKernel(const phi::GPUContext &ctx, const InT *x, @@ -61,20 +55,19 @@ static void LaunchCastKernel(const phi::GPUContext &ctx, PADDLE_ENFORCE_NE( static_cast(x), static_cast(y), - platform::errors::InvalidArgument("Inplace cast is not supported yet.")); + errors::InvalidArgument("Inplace cast is not supported yet.")); int vec_size = std::min(phi::GetVectorizedSize(x), phi::GetVectorizedSize(y)); switch (vec_size) { case 4: - return details::VecCastKernel(ctx, x, y, n); + return VecCastKernel(ctx, x, y, n); case 2: - return details::VecCastKernel(ctx, x, y, n); + return VecCastKernel(ctx, x, y, n); case 1: - return details::VecCastKernel(ctx, x, y, n); + return VecCastKernel(ctx, x, y, n); default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The vectorized size must be 1, 2 or 4.")); + PADDLE_THROW( + errors::InvalidArgument("The vectorized size must be 1, 2 or 4.")); } } -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu b/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu new file mode 100644 index 00000000000000..3ae7f0682bc75b --- /dev/null +++ b/paddle/phi/kernels/fusion/gpu/distributed_fused_lamb_init_kernel.cu @@ -0,0 +1,804 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
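
The cast helper relocated above picks its vector width from the alignment of
both pointers and dispatches the matching VecCastKernel instantiation. A
hypothetical call site (src and dst assumed device-resident with n elements;
in-place casts are rejected by the enforce check):

    // vec_size = min(GetVectorizedSize(src), GetVectorizedSize(dst)), so
    // misaligned buffers fall back to the scalar (VecSize = 1) path.
    phi::LaunchCastKernel<phi::dtype::float16, float>(dev_ctx, src, dst, n);
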
+ +#include "paddle/phi/kernels/distributed_fused_lamb_init_kernel.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/algorithm.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/tensor_to_string.h" +#include "paddle/phi/kernels/fusion/gpu/cast_with_ptr.h" + +namespace phi { +namespace fusion { + +using phi::funcs::FlattenToString; +using phi::funcs::ToVector; + +struct ParamGradInfo { + DenseTensor *param_t{nullptr}; + DenseTensor *grad_t{nullptr}; + size_t idx{0}; + size_t numel{0}; + size_t numel_with_padding{0}; + size_t numel_offset{0}; +}; + +static std::ostream &operator<<(std::ostream &os, const ParamGradInfo &info) { + return os << "{Param(" << info.param_t << "),Grad(" << info.grad_t << "),idx(" + << info.idx << "),numel(" << info.numel << "),numel_with_padding(" + << info.numel_with_padding << "),numel_offset(" << info.numel_offset + << "),padding(" << info.numel_offset + info.numel_with_padding + << "-" << info.numel_offset + info.numel << "=" + << info.numel_with_padding - info.numel << ")}"; +} + +struct ParamGradInfoNumelOffsetCompFunctor { + bool operator()(const ParamGradInfo &x, const ParamGradInfo &y) const { + return x.numel_offset < y.numel_offset; + } + + bool operator()(const ParamGradInfo &x, size_t y) const { + return x.numel_offset < y; + } + + bool operator()(size_t x, const ParamGradInfo &y) const { + return x < y.numel_offset; + } + + bool operator()(size_t x, size_t y) const { return x < y; } +}; + +static size_t GetAlignSize(size_t n, size_t alignment) { + auto remainder = n % alignment; + return remainder == 0 ? n : n + alignment - remainder; +} + +// Shard the ParamGradInfo list by the numel size [start_size, end_size) +// The final results should be: +// +// start_size = sum(infos[0:i].numel_with_padding) + start_numel_offset, where +// start_numel_offset <= infos[i].numel_with_padding +// +// end_size = sum(infos[0:j].numel_with_padding) + end_numel_offset, where +// end_numel_offset <= infos[j].numel_with_padding +static void GetParamGradShardInfo(const std::vector &infos, + size_t start_size, + size_t end_size, + size_t *start_idx, + size_t *end_idx, + size_t *start_numel_offset, + size_t *end_numel_offset) { + VLOG(10) << "NumelOffset: " + << paddle::string::join_strings( + infos, ",", [](const ParamGradInfo &info) { + return info.numel_offset; + }); + VLOG(10) << "start_size = " << start_size << " , end_size = " << end_size; + + if (infos.empty()) { + PADDLE_ENFORCE_EQ( + start_size, 0, errors::InvalidArgument("start_size should be 0.")); + PADDLE_ENFORCE_EQ( + end_size, 0, errors::InvalidArgument("end_size should be 0.")); + *start_idx = 0; + *end_idx = 0; + *start_numel_offset = 0; + *end_numel_offset = 0; + return; + } + + PADDLE_ENFORCE_LT( + start_size, + end_size, + errors::InvalidArgument("start_size should be less than end_size.")); + size_t n = infos.size(); + ParamGradInfoNumelOffsetCompFunctor comp; + auto i = static_cast( + std::lower_bound(infos.begin(), infos.end(), start_size, comp) - + infos.begin()); + if (i == n || infos[i].numel_offset != start_size) { + PADDLE_ENFORCE_GT( + i, + 0, + errors::InvalidArgument( + "Cannot find suitable sharding which is between [%d, %d)", + start_size, + end_size)); + --i; + } + PADDLE_ENFORCE_LT( + i, + n, + errors::InvalidArgument( + "Cannot find suitable sharding which is between [%d, %d)", + start_size, + end_size)); + *start_idx = i; + 
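+  // For example (illustrative numbers, not from a real run): with padded
+  // numels {4, 8, 4} the offsets are {0, 4, 12}; for start_size = 6,
+  // std::lower_bound lands on offset 12 != 6, so we step back to i = 1 and
+  // the shard starts inside that tensor at offset 6 - 4 = 2, computed below.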
*start_numel_offset = start_size - infos[i].numel_offset; + auto j = static_cast( + std::lower_bound(infos.begin(), infos.end(), end_size, comp) - + infos.begin()); + *end_idx = j - 1; + *end_numel_offset = end_size - infos[j - 1].numel_offset; + PADDLE_ENFORCE_GT( + *end_numel_offset, + 0, + errors::InvalidArgument("Internal error when sharding, this may be a bug " + "caused by empty parameter.")); + VLOG(10) << "Sharding [start_size=" << start_size << ", end_size=" << end_size + << "): " << (*start_idx) << ":" << (*start_numel_offset) << " -> " + << (*end_idx) << ":" << (*end_numel_offset); +} + +static size_t FillAlignmentPaddingInfo(std::vector *infos, + size_t alignment, + size_t nranks, + phi::DataType dtype) { + auto sizeof_dtype = phi::SizeOf(dtype); + PADDLE_ENFORCE_EQ( + alignment % sizeof_dtype, + 0, + errors::InvalidArgument( + "The attr(alignment) should be exactly divided by sizeof(T) %d.", + sizeof_dtype)); + alignment /= sizeof_dtype; + + size_t total_numel_sum_with_padding = 0; + size_t n = infos->size(); + for (size_t i = 0; i < n; ++i) { + auto &info = (*infos)[i]; + size_t numel_with_padding; + if (i + 1 == n) { + // the total fused numel must be a factor of alignment * nranks + numel_with_padding = + GetAlignSize(info.numel + total_numel_sum_with_padding, + alignment * nranks) - + total_numel_sum_with_padding; + } else { + numel_with_padding = GetAlignSize(info.numel, alignment); + } + info.numel_with_padding = numel_with_padding; + info.numel_offset = total_numel_sum_with_padding; + total_numel_sum_with_padding += numel_with_padding; + } + return total_numel_sum_with_padding; +} + +template +static T *TensorFillConstant(const phi::GPUContext &dev_ctx, + DenseTensor *tensor, + const DDim &dims, + T value) { + tensor->Resize(dims); + auto *ptr = dev_ctx.template Alloc(tensor); + phi::funcs::SetConstant set_constant; + set_constant(dev_ctx, tensor, value); + return ptr; +} + +static DenseTensor CastDataForInitedTensor(const phi::GPUContext &dev_ctx, + DenseTensor *origin, + DenseTensor *fused_out, + size_t numel_offset) { + PADDLE_ENFORCE_EQ( + origin->IsInitialized(), + true, + errors::InvalidArgument("The tensor to be cast should be initialized.")); + + PADDLE_ENFORCE_EQ(fused_out->dtype(), + phi::DataType::FLOAT32, + errors::InvalidArgument( + "The dst tensor to be cast should be FP32 tensor.")); + PADDLE_ENFORCE_EQ(origin->dtype(), + phi::DataType::FLOAT16, + errors::InvalidArgument( + "The src tensor to be cast should be FP16 tensor.")); + auto *dst = fused_out->data() + numel_offset; + auto *src = origin->data(); + auto numel = origin->numel(); + LaunchCastKernel(dev_ctx, src, dst, numel); + VLOG(10) << "Cast from FP32 -> FP16, range: [" << numel_offset << ", " + << numel_offset + numel << ")" + << " , total: [0, " << fused_out->numel() << ")"; + DDim fused_out_dim = fused_out->dims(); + auto fused_out_numel = fused_out->numel(); + fused_out->Resize({fused_out_numel}); + auto sliced_tensor = fused_out->Slice(numel_offset, numel + numel_offset); + fused_out->Resize(fused_out_dim); + return sliced_tensor; +} + +static DenseTensor CopyAndShareBufferForInitedTensor( + const phi::GPUContext &dev_ctx, + DenseTensor *origin, + DenseTensor *fused_out, + size_t numel_offset) { + PADDLE_ENFORCE_EQ( + origin->IsInitialized(), + true, + errors::InvalidArgument( + "The tensor to be copied and shared data should be initialized.")); + auto dtype = fused_out->type(); + PADDLE_ENFORCE_EQ(origin->type(), + dtype, + errors::InvalidArgument( + "The tensor to be copied and shared data 
should be " + "have the same data type.")); + auto place = fused_out->place(); + PADDLE_ENFORCE_EQ( + origin->place(), + place, + errors::InvalidArgument("The tensor to be copied and shared " + "data should be have the same place.")); + PADDLE_ENFORCE_EQ( + dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU, + true, + errors::InvalidArgument( + "The tensor to be copied and shared data should be on GPU place.")); + + auto numel = origin->numel(); + DDim fused_out_dim = fused_out->dims(); + auto fused_out_numel = fused_out->numel(); + auto sliced_tensor = fused_out->Resize({fused_out_numel}) + .Slice(numel_offset, numel + numel_offset); + phi::Copy(dev_ctx, *origin, dev_ctx.GetPlace(), false, &sliced_tensor); + origin->ShareBufferWith(sliced_tensor); + fused_out->Resize(fused_out_dim); + VLOG(10) << "Copy and share buffer, range: [" << numel_offset << ", " + << numel_offset + numel << ") , total: [0, " << fused_out->numel() + << ") , dtype = " << dtype; + return sliced_tensor; +} + +static void ShareBufferForNonInitedTensor(DenseTensor *origin, + DenseTensor *fused_out, + size_t numel_offset, + const DDim &dims) { + PADDLE_ENFORCE_EQ( + origin->IsInitialized(), + false, + errors::InvalidArgument( + "The tensor to be shared data should not be initialized.")); + + DDim fused_out_dim = fused_out->dims(); + auto fused_out_numel = fused_out->numel(); + auto numel = phi::product(dims); + *origin = fused_out->Resize({fused_out_numel}) + .Slice(numel_offset, numel + numel_offset); + origin->Resize(dims); + fused_out->Resize(fused_out_dim); + VLOG(10) << "Share buffer for non-inited, range: [" << numel_offset << ", " + << numel_offset + numel << "), total: [0, " << fused_out->numel() + << ") , dtype = " << fused_out->dtype(); +} + +template +static void CopyVectorToCPUTensor(const phi::GPUContext &dev_ctx, + const std::vector &src, + DenseTensor *dst) { + dst->Resize({static_cast(src.size())}); + T *dst_ptr = dev_ctx.template HostAlloc(dst); + const T *src_ptr = src.data(); + auto nbytes = src.size() * sizeof(T); + std::memcpy(dst_ptr, src_ptr, nbytes); +} + +static size_t ReorderParamGradInfoList(const std::vector &flags, + std::vector *infos) { + size_t n = infos->size(); + std::vector cur_flags; + cur_flags.reserve(n); + for (size_t i = 0; i < n; ++i) { + auto idx = (*infos)[i].idx; + cur_flags.push_back(flags[idx]); + } + + auto origin_infos = *infos; + size_t j = 0; + for (size_t i = 0; i < n; ++i) { + if (cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + size_t ret_idx = j; + + for (size_t i = 0; i < n; ++i) { + if (!cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + return ret_idx; +} + +template +static T ClipByBound(T x, T low_value, T high_value) { + if (x < low_value) return low_value; + if (x > high_value) return high_value; + return x; +} + +template +void DistributedFusedLambInitOpKernel( + const Context &dev_ctx, + const std::vector ¶m, + const std::vector &grad, + float beta1, + float beta2, + const std::vector &apply_weight_decay, + int alignment, + int rank, + int nranks, + DenseTensor *fp32_fused_param, + DenseTensor *fp32_fused_grad, + DenseTensor *fp16_fused_param, + DenseTensor *fp16_fused_grad, + DenseTensor *moment1, + DenseTensor *moment2, + DenseTensor *beta1_pow, + DenseTensor *beta2_pow, + DenseTensor *fused_param_offsets, + DenseTensor *fp32_shard_fused_param_offsets, + DenseTensor *fp16_shard_fused_param_offsets, + DenseTensor *param_info, + DenseTensor *param_order, + std::vector param_out, + std::vector master_param_out, + 
std::vector grad_out, + DenseTensor *global_scale, + DenseTensor *step) { + VLOG(10) << "starts to run DistributedFusedLambInitOp"; + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + + // Step 1: Check Input(Param) and Output(ParamOut), Input(Grad) and + // Output(GradOut) + std::vector fp32_infos, fp16_infos; + { + PADDLE_ENFORCE_EQ( + param.size(), + grad.size(), + errors::InvalidArgument("The parameter number and parameter gradient " + "number should be the same.")); + + PADDLE_ENFORCE_EQ( + param.size(), + param_out.size(), + errors::InvalidArgument("Input(Param) and Output(ParamOut) " + "should have the same number.")); + PADDLE_ENFORCE_EQ( + grad.size(), + grad_out.size(), + errors::InvalidArgument( + "Input(Grad) and Output(GradOut) should have the same number.")); + size_t n = param.size(); + VLOG(10) << "parameter number: " << n; + for (size_t i = 0; i < n; ++i) { + auto *p = param[i]; + auto *g = grad[i]; + auto *p_out = param_out[i]; + auto *g_out = grad_out[i]; + + PADDLE_ENFORCE_NOT_NULL( + p, + errors::InvalidArgument("The %d-th parameter should not be nullptr.", + i)); + PADDLE_ENFORCE_EQ(p->IsInitialized(), + true, + errors::InvalidArgument( + "The %d-th parameter should be initialized.", i)); + PADDLE_ENFORCE_EQ( + p->place(), + place, + errors::InvalidArgument( + "The %d-th parameter is not initialized on the right place.", i)); + PADDLE_ENFORCE_EQ( + p, + p_out, + errors::InvalidArgument("The %d-th Input(Param) and Output(ParamOut) " + "should be the same tensor.", + i)); + + auto dtype = p->dtype(); + PADDLE_ENFORCE_NOT_NULL( + g, + errors::InvalidArgument("The %d-th gradient should not be nullptr.", + i)); + PADDLE_ENFORCE_EQ(g, + g_out, + errors::InvalidArgument( + "The %d-th Input(Grad) and Output(Grad) should " + "be the same tensor.")); + auto numel = p->numel(); + PADDLE_ENFORCE_GT( + numel, + 0, + errors::InvalidArgument("The %d-th Input(Param) have no elements.")); + + void *g_data = nullptr; + if (g->IsInitialized()) { + PADDLE_ENFORCE_EQ(g->dtype(), + dtype, + errors::InvalidArgument( + "The %d-th Input(Param) and Input(Grad) should " + "have the same data type %s.", + i, + dtype)); + PADDLE_ENFORCE_EQ(g->dims(), + p->dims(), + errors::InvalidArgument( + "The %d-th Input(Param) and Input(Grad) should " + "have the same shape.", + i)); + g_data = g_out->data(); + } + + ParamGradInfo *info; + if (dtype == phi::DataType::FLOAT32) { + fp32_infos.emplace_back(); + info = &fp32_infos.back(); + } else if (dtype == phi::DataType::FLOAT16) { + fp16_infos.emplace_back(); + info = &fp16_infos.back(); + } else { + PADDLE_THROW( + errors::InvalidArgument("Unsupported data type %s.", dtype)); + } + + VLOG(10) << "Found " << dtype << " parameter " << i << " shape=[" + << p_out->dims() << "] numel=" << numel + << " grad.IsInitialized()=" + << (g_out->IsInitialized() ? 
"true" : "false"); + + info->param_t = p_out; + info->grad_t = g_out; + info->idx = i; + info->numel = numel; + info->numel_with_padding = 0; // not determined yet + info->numel_offset = 0; // not determined yet + } + } + + size_t fp32_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp32_infos); + size_t fp16_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp16_infos); + + auto param_num = fp32_infos.size() + fp16_infos.size(); + param_order->Resize({static_cast(param_num)}); + auto *param_order_t = dev_ctx.template HostAlloc(param_order); + for (size_t i = 0; i < fp32_infos.size(); ++i) { + param_order_t[i] = static_cast(fp32_infos[i].idx); + } + for (size_t i = 0; i < fp16_infos.size(); ++i) { + param_order_t[i + fp32_infos.size()] = static_cast(fp16_infos[i].idx); + } + + VLOG(10) << "Fill ParamGradInfo ends"; + + // Step 2: determine the numel_with_padding and numel_offset + VLOG(10) << "rank = " << rank << ", nranks = " << nranks + << " , alignment = " << alignment; + if (alignment <= 0) { + alignment = phi::backends::gpu::GpuMinChunkSize(); + } + PADDLE_ENFORCE_GE( + alignment, + 1, + errors::InvalidArgument("The attr(alignment) should be larger than 0.")); + PADDLE_ENFORCE_EQ( + alignment & (alignment - 1), + 0, + errors::InvalidArgument("The attr(alignment) should be the power of 2.")); + PADDLE_ENFORCE_GE(rank, + 0, + errors::InvalidArgument( + "The attr(rank) should be equal to or larger than 0.")); + PADDLE_ENFORCE_LT( + rank, + nranks, + errors::InvalidArgument( + "The attr(rank) should be less than the attr(nranks).")); + // NOTE: We guarantee that both fp32_numel and fp16_numel can be exactly + // divided by alignment and nranks. + auto fp32_numel = FillAlignmentPaddingInfo( + &fp32_infos, alignment, nranks, phi::DataType::FLOAT32); + VLOG(10) << "FP32 ParamGradInfo: " + << paddle::string::join_strings(fp32_infos, " "); + auto fp16_numel = FillAlignmentPaddingInfo( + &fp16_infos, alignment, nranks, phi::DataType::FLOAT16); + VLOG(10) << "FP16 ParamGradInfo: " + << paddle::string::join_strings(fp16_infos, " "); + auto total_numel = fp32_numel + fp16_numel; + PADDLE_ENFORCE_LT(total_numel, + std::numeric_limits::max(), + errors::InvalidArgument("Too many parameter number.")); + + auto fp32_numel_each_device = fp32_numel / nranks; + auto fp16_numel_each_device = fp16_numel / nranks; + auto numel_each_device = fp32_numel_each_device + fp16_numel_each_device; + VLOG(10) << "Fill padding ends. 
total_numel = " << total_numel + << ", fp32_numel = " << fp32_numel << ", fp16_numel = " << fp16_numel + << ", fp32_numel_each_device = " << fp32_numel_each_device + << ", fp16_numel_each_device = " << fp16_numel_each_device; + + // Step 3: allocate output tensor and do initialization + float *fused_fp32_param = nullptr, *fused_fp32_grad = nullptr; + dtype::float16 *fused_fp16_param = nullptr, *fused_fp16_grad = nullptr; + DenseTensor *fp32_p_t = nullptr, *fp16_p_t = nullptr, *fp32_g_t = nullptr, + *fp16_g_t = nullptr; + std::vector fp16_master_params; + if (total_numel > 0) { + fp32_p_t = fp32_fused_param; + fused_fp32_param = TensorFillConstant( + dev_ctx, fp32_p_t, {static_cast(total_numel)}, 0.0f); + } + + if (fp32_numel > 0) { + fp32_g_t = fp32_fused_grad; + fused_fp32_grad = TensorFillConstant( + dev_ctx, fp32_g_t, {static_cast(fp32_numel)}, 0.0f); + } + + if (fp16_numel > 0) { + fp16_p_t = fp16_fused_param; + fused_fp16_param = + TensorFillConstant(dev_ctx, + fp16_p_t, + {static_cast(fp16_numel)}, + static_cast(0)); + + fp16_g_t = fp16_fused_grad; + fused_fp16_grad = + TensorFillConstant(dev_ctx, + fp16_g_t, + {static_cast(fp16_numel)}, + static_cast(0)); + } + VLOG(10) << "Allocate FP32FusedParam/Grad, FP16FusedParam/Grad ends"; + + // (1) For FP32FusedParam, memcpy for fp32 param and then share data, cast + // for fp16 master weight + // (2) For FP16FusedParam, memcpy and then share data + // (3) For FP32FusedGrad/FP16FusedGrad, memcpy if gradient has been inited + for (const auto &info : fp32_infos) { + auto sliced_tensor = CopyAndShareBufferForInitedTensor( + dev_ctx, info.param_t, fp32_p_t, info.numel_offset); + master_param_out[info.idx]->Resize(info.param_t->dims()); + master_param_out[info.idx]->ShareBufferWith(sliced_tensor); + float *master_param_tmp = + dev_ctx.template Alloc(master_param_out[info.idx]); + float *sliced_tensor_tmp = reinterpret_cast(sliced_tensor.data()); + PADDLE_ENFORCE_EQ( + master_param_tmp, + sliced_tensor_tmp, + errors::InvalidArgument("Invalid master weight tensor pointer.")); + + if (info.grad_t->IsInitialized()) { + CopyAndShareBufferForInitedTensor( + dev_ctx, info.grad_t, fp32_g_t, info.numel_offset); + } else { + ShareBufferForNonInitedTensor( + info.grad_t, fp32_g_t, info.numel_offset, info.param_t->dims()); + } + } + + size_t fp16_numel_offset = 0; + if (fp32_numel > 0) { + auto last_fp32_info = fp32_infos.back(); + fp16_numel_offset = + last_fp32_info.numel_offset + last_fp32_info.numel_with_padding; + } + + for (const auto &info : fp16_infos) { + auto master_weight_offset = info.numel_offset + fp16_numel_offset; + auto sliced_tensor = CastDataForInitedTensor( + dev_ctx, info.param_t, fp32_p_t, master_weight_offset); + master_param_out[info.idx]->Resize(info.param_t->dims()); + master_param_out[info.idx]->ShareBufferWith(sliced_tensor); + + CopyAndShareBufferForInitedTensor( + dev_ctx, info.param_t, fp16_p_t, info.numel_offset); + float *master_param_tmp = + dev_ctx.template Alloc(master_param_out[info.idx]); + float *sliced_tensor_tmp = reinterpret_cast(sliced_tensor.data()); + PADDLE_ENFORCE_EQ( + master_param_tmp, + sliced_tensor_tmp, + errors::InvalidArgument("Invalid master weight tensor pointer.")); + + if (info.grad_t->IsInitialized()) { + CopyAndShareBufferForInitedTensor( + dev_ctx, info.grad_t, fp16_g_t, info.numel_offset); + } else { + ShareBufferForNonInitedTensor( + info.grad_t, fp16_g_t, info.numel_offset, info.param_t->dims()); + } + } + VLOG(10) << "Copy/share data for Param/Grad ends"; + + // Step 4: For Moment1, Moment2, 
Beta1Pow, Beta2Pow, just fill constant + TensorFillConstant( + dev_ctx, moment1, {static_cast(numel_each_device)}, 0.0f); + TensorFillConstant( + dev_ctx, moment2, {static_cast(numel_each_device)}, 0.0f); + TensorFillConstant(dev_ctx, beta1_pow, {1}, beta1); + TensorFillConstant(dev_ctx, beta2_pow, {1}, beta2); + VLOG(10) << "Init Moment and BetaPow ends"; + + // Step 5: Do sharding + size_t fp32_start_idx, fp32_end_idx, fp32_start_numel_offset, + fp32_end_numel_offset; + GetParamGradShardInfo(fp32_infos, + rank * fp32_numel_each_device, + (rank + 1) * fp32_numel_each_device, + &fp32_start_idx, + &fp32_end_idx, + &fp32_start_numel_offset, + &fp32_end_numel_offset); + size_t fp16_start_idx, fp16_end_idx, fp16_start_numel_offset, + fp16_end_numel_offset; + GetParamGradShardInfo(fp16_infos, + rank * fp16_numel_each_device, + (rank + 1) * fp16_numel_each_device, + &fp16_start_idx, + &fp16_end_idx, + &fp16_start_numel_offset, + &fp16_end_numel_offset); + size_t fp32_local_param_num = + fp32_numel_each_device > 0 ? fp32_end_idx - fp32_start_idx + 1 : 0; + size_t fp16_local_param_num = + fp16_numel_each_device > 0 ? fp16_end_idx - fp16_start_idx + 1 : 0; + size_t total_local_param_num = fp32_local_param_num + fp16_local_param_num; + VLOG(10) << "Found the sharding arguments"; + + param_info->Resize({8}); + auto *param_info_t = dev_ctx.template HostAlloc(param_info); + param_info_t[0] = static_cast(fp32_start_idx); + param_info_t[1] = static_cast(fp32_local_param_num); + param_info_t[2] = static_cast(fp32_infos.size()); + param_info_t[3] = ClipByBound(fp32_wd_end_idx, + fp32_start_idx, + fp32_start_idx + fp32_local_param_num) - + static_cast(fp32_start_idx); + param_info_t[4] = static_cast(fp16_start_idx + fp32_infos.size()); + param_info_t[5] = static_cast(fp16_local_param_num); + param_info_t[6] = static_cast(fp16_infos.size()); + param_info_t[7] = ClipByBound(fp16_wd_end_idx, + fp16_start_idx, + fp16_start_idx + fp16_local_param_num) - + static_cast(fp16_start_idx); + + VLOG(10) << "Start FP32 idx: " << param_info_t[0]; + VLOG(10) << "Local FP32 param num: " << param_info_t[1]; + VLOG(10) << "Global FP32 param num: " << param_info_t[2]; + + VLOG(10) << "Start FP16 idx: " << param_info_t[4]; + VLOG(10) << "Local FP16 param num: " << param_info_t[5]; + VLOG(10) << "Global FP16 param num: " << param_info_t[6]; + + std::vector numel_offsets; + numel_offsets.reserve(param.size() + 1); + for (const auto &info : fp32_infos) { + numel_offsets.push_back(info.numel_offset); + } + for (const auto &info : fp16_infos) { + numel_offsets.push_back(info.numel_offset + fp16_numel_offset); + } + numel_offsets.push_back(fp32_numel + fp16_numel); + PADDLE_ENFORCE_EQ(numel_offsets.size(), + param.size() + 1, + errors::InvalidArgument( + "The numel_offsets number must be one larger than " + "the parameter number.")); + VLOG(10) << "Total numel offset: " << FlattenToString(numel_offsets); + + std::vector fp32_partial_numel_offsets; + fp32_partial_numel_offsets.reserve(fp32_local_param_num + 1); + fp32_partial_numel_offsets.push_back(0); + // Fill the partial_numel_offsets + for (size_t i = fp32_start_idx; i < fp32_start_idx + fp32_local_param_num; + ++i) { + size_t valid_start_n = 0; + if (i == fp32_start_idx) { + valid_start_n = fp32_start_numel_offset; + } + + size_t end_n = fp32_infos[i].numel_with_padding; + if (i + 1 == fp32_start_idx + fp32_local_param_num) { + end_n = std::min(end_n, fp32_end_numel_offset); + } + + PADDLE_ENFORCE_NE( + valid_start_n, + end_n, + errors::InvalidArgument("Indices sharding error. 
This may be a bug.")); + VLOG(10) << "FP32 Partial numel = [" << valid_start_n + fp32_infos[i].numel + << "," << end_n + fp32_infos[i].numel; + auto len = end_n - valid_start_n; + fp32_partial_numel_offsets.push_back(fp32_partial_numel_offsets.back() + + len); + } + + std::vector fp16_partial_numel_offsets; + fp16_partial_numel_offsets.reserve(fp16_local_param_num + 1); + fp16_partial_numel_offsets.push_back(0); + for (size_t i = fp16_start_idx; i < fp16_start_idx + fp16_local_param_num; + ++i) { + size_t valid_start_n = 0; + if (i == fp16_start_idx) { + valid_start_n = fp16_start_numel_offset; + } + + size_t end_n = fp16_infos[i].numel_with_padding; + if (i + 1 == fp16_start_idx + fp16_local_param_num) { + end_n = std::min(end_n, fp16_end_numel_offset); + } + + PADDLE_ENFORCE_NE( + valid_start_n, + end_n, + errors::InvalidArgument("Indices sharding error. This may be a bug.")); + auto len = end_n - valid_start_n; + fp16_partial_numel_offsets.push_back(fp16_partial_numel_offsets.back() + + len); + } + + CopyVectorToCPUTensor(dev_ctx, numel_offsets, fused_param_offsets); + CopyVectorToCPUTensor( + dev_ctx, fp32_partial_numel_offsets, fp32_shard_fused_param_offsets); + CopyVectorToCPUTensor( + dev_ctx, fp16_partial_numel_offsets, fp16_shard_fused_param_offsets); + + if (!global_scale->IsInitialized()) { + TensorFillConstant(dev_ctx, global_scale, {1}, 1.0f); + } + VLOG(10) << "Init global scale ends"; + + TensorFillConstant(dev_ctx, step, {1}, static_cast(0)); + + dev_ctx.Wait(); + VLOG(10) << "Wait for H2D copy"; +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(distributed_fused_lamb_init, + GPU, + ALL_LAYOUT, + phi::fusion::DistributedFusedLambInitOpKernel, + float) { + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT16); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT16); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(6).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(7).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(8).SetDataType(phi::DataType::INT32); + kernel->OutputAt(9).SetDataType(phi::DataType::INT32); + kernel->OutputAt(10).SetDataType(phi::DataType::INT32); + kernel->OutputAt(11).SetDataType(phi::DataType::INT32); + kernel->OutputAt(12).SetDataType(phi::DataType::INT32); + kernel->OutputAt(13).SetDataType(kernel_key.dtype()); + kernel->OutputAt(14).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(15).SetDataType(kernel_key.dtype()); + kernel->OutputAt(16).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(17).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bnstats_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bnstats_kernel.cu new file mode 100644 index 00000000000000..e19996d63c7913 --- /dev/null +++ b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_relu_conv_bnstats_kernel.cu @@ -0,0 +1,618 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/flags.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h" + +DECLARE_bool(cudnn_deterministic); +DECLARE_bool(cudnn_exhaustive_search); + +namespace phi { +namespace fusion { + +using helper = phi::CudnnFrontendConvHelper; +template +using CudnnDataType = phi::backends::gpu::CudnnDataType; + +/* + * Implements Scale + Bias + ReLU + Conv + BNStats fusion pattern. + * Same as the following (x and output are in NHWC format): + * ``` + * output = conv2d(relu(x * scale + bias), w) + * sum_output, sqsum_output = bnstats(output) + * ``` + * Here, bnstats generates per-channel statistics, same as: + * ``` + * sum_output = output.sum(axis=[0,1,2]) + * sqsum_output = (output ** 2).sum(axis=[0,1,2]) + * ``` + * More details: + * https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#genstats-runtime-fusion-engine + */ +template +void FusedScaleBiasReluConvBnstatsImpl( + const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& w, + const paddle::optional& scale, + const paddle::optional& bias, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const std::string& padding_algorithm, + bool fuse_prologue, + bool exhaustive_search, + bool deterministic, + DenseTensor* output, + DenseTensor* sum_output, + DenseTensor* sqsum_output) { + auto& plan_cache = phi::autotune::AutoTuneCache::Instance().GetConvV8( + phi::autotune::AlgorithmType::kScaleBiasReluConvBNstats); + + // transformed tensor + DenseTensor w_transformed(w.dtype()); + // Assume input and output already in NHWC. + // No transformation is needed for them. 
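+ // Only the filter is transformed below: the cuDNN runtime fusion engine + // consumes NHWC tensors, and while x and output are required to arrive + // already in NHWC, the filter is still stored in NCHW layout.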
+ VLOG(3) << "Transform filter tensor from NCHW to NHWC."; + ResizeToChannelLast(dev_ctx, &w, &w_transformed); + TransToChannelLast(dev_ctx, &w, &w_transformed); + + // update padding and dilation + std::vector paddings_vec = paddings; + std::vector dilations_vec = dilations; + auto in_dims = x.dims(); + auto filter_dims = w_transformed.dims(); + DDim in_data_dims = slice_ddim(in_dims, 1, in_dims.size() - 1); + DDim filter_data_dims = slice_ddim(filter_dims, 1, filter_dims.size() - 1); + std::vector ksize = phi::vectorize(filter_data_dims); + phi::UpdatePaddingAndDilation(&paddings_vec, + &dilations_vec, + padding_algorithm, + in_data_dims, + strides, + ksize); + + int data_dim = strides.size(); // 2d only + + std::vector pre_padding(data_dim, 0); + std::vector post_padding(data_dim, 0); + for (size_t i = 0; i < data_dim; ++i) { + pre_padding[i] = static_cast(paddings_vec[2 * i]); + post_padding[i] = static_cast(paddings_vec[2 * i + 1]); + } + + // input pointers + T* input_data = const_cast(x.data()); + T* filter_data = w_transformed.data(); + + // output pointers + T* output_data = output->data(); + float* sum_output_data = sum_output->data(); + float* sqsum_output_data = sqsum_output->data(); + + auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + // build tensors + cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC; + auto tensor_format = phi::backends::gpu::ToCudnnDataType(x.dtype()); + + auto tensor_format_math = CUDNN_DATA_FLOAT; + auto compute_dtype = CUDNN_DATA_FLOAT; + + // get dims in CUDNN manner: [N, C, H, W] + auto dim_x = + phi::backends::gpu::TransformDimOrder(phi::vectorize(in_dims)); + auto dim_filt = phi::backends::gpu::TransformDimOrder( + phi::vectorize(filter_dims)); + auto dim_y = phi::backends::gpu::TransformDimOrder( + phi::vectorize(output->dims())); + std::vector dim_scale(dim_x.size(), 1); + dim_scale[1] = dim_x[1]; // [1, C, 1, 1] + std::vector dim_sum(dim_x.size(), 1); // [1, K, 1, 1] + dim_sum[1] = dim_filt[0]; + + std::vector data_ptrs; + std::vector uids; + int64_t uid = 100; + + // inputs + auto input_desc = helper::GetGeneralTensorDescriptor( + dim_x, layout_format, ++uid, 16, tensor_format); + data_ptrs.push_back(input_data); + uids.push_back(uid); + + auto filter_desc = helper::GetGeneralTensorDescriptor( + dim_filt, layout_format, ++uid, 16, tensor_format); + data_ptrs.push_back(filter_data); + uids.push_back(uid); + + // dispensable inputs + auto scale_desc = helper::GetGeneralTensorDescriptor( + dim_scale, layout_format, ++uid, 16, tensor_format); + if (fuse_prologue) { + data_ptrs.push_back(const_cast(scale->data())); + uids.push_back(uid); + } + + auto bias_desc = helper::GetGeneralTensorDescriptor( + dim_scale, layout_format, ++uid, 16, tensor_format); + if (fuse_prologue) { + data_ptrs.push_back(const_cast(bias->data())); + uids.push_back(uid); + } + + // outputs + auto output_desc = helper::GetGeneralTensorDescriptor( + dim_y, layout_format, ++uid, 16, tensor_format); + data_ptrs.push_back(output_data); + uids.push_back(uid); + + auto sum_output_desc = helper::GetGeneralTensorDescriptor( + dim_sum, layout_format, ++uid, 16, tensor_format_math); + data_ptrs.push_back(sum_output_data); + uids.push_back(uid); + + auto sqsum_output_desc = helper::GetGeneralTensorDescriptor( + dim_sum, layout_format, ++uid, 16, tensor_format_math); + data_ptrs.push_back(sqsum_output_data); + uids.push_back(uid); + + // virtual outputs + auto after_scale = helper::GetGeneralTensorDescriptor( + dim_x, layout_format, 
++uid, 16, tensor_format_math, true); + auto after_bias = helper::GetGeneralTensorDescriptor( + dim_x, layout_format, ++uid, 16, tensor_format_math, true); + auto after_relu = helper::GetGeneralTensorDescriptor( + dim_x, layout_format, ++uid, 16, tensor_format_math, true); + + // create ops + auto scale_op = helper::MakePointwiseOp( + CUDNN_POINTWISE_MUL, compute_dtype, input_desc, scale_desc, after_scale); + + auto bias_op = helper::MakePointwiseOp( + CUDNN_POINTWISE_ADD, compute_dtype, after_scale, bias_desc, after_bias); + + auto relu_desc = cudnn_frontend::PointWiseDescBuilder() + .setMode(CUDNN_POINTWISE_RELU_FWD) + .setComputeType(compute_dtype) + .build(); + + auto relu_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(after_bias) + .setyDesc(after_relu) + .setpwDesc(relu_desc) + .build(); + VLOG(6) << relu_op.describe(); + + std::vector stride_int64 = helper::GetInt64Array(strides); + std::vector dilation_int64 = helper::GetInt64Array(dilations_vec); + auto conv_desc = cudnn_frontend::ConvDescBuilder() + .setComputeType(compute_dtype) + .setMathMode(CUDNN_CROSS_CORRELATION) + .setSpatialDimCount(data_dim) + .setSpatialStride(data_dim, stride_int64.data()) + .setPrePadding(data_dim, pre_padding.data()) + .setPostPadding(data_dim, post_padding.data()) + .setDilation(data_dim, dilation_int64.data()) + .build(); + + float alpha = 1.0f; + float beta = 0.0f; + cudnn_frontend::Tensor* input_to_conv = + fuse_prologue ? &after_relu : &input_desc; + auto conv_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) + .setxDesc(*input_to_conv) + .setwDesc(filter_desc) + .setyDesc(output_desc) + .setcDesc(conv_desc) + .setAlpha(alpha) + .setBeta(beta) + .build(); + VLOG(6) << conv_op.describe(); + + auto genstat_op = cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR) + .setxDesc(output_desc) + .setComputeType(compute_dtype) + .setGenStatsMode(CUDNN_GENSTATS_SUM_SQSUM) + .setSumDesc(sum_output_desc) + .setSqSumDesc(sqsum_output_desc) + .build(); + VLOG(6) << genstat_op.describe(); + + // build op graph + std::vector ops; + if (fuse_prologue) { + ops = std::vector( + {&scale_op, &bias_op, &relu_op, &conv_op, &genstat_op}); + } else { + ops = + std::vector({&conv_op, &genstat_op}); + } + + auto op_graph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(ops.size(), ops.data()) + .build(); + VLOG(6) << op_graph.describe(); + + cudnn_frontend::feature_vector_t feature_vector; + phi::autotune::BuildFeatureVector(&feature_vector, + dim_x, + dim_filt, + strides, + paddings, + dilations, + pre_padding, + post_padding, + fuse_prologue); + + helper::QueryCacheAndExecute(handle, + &workspace_handle, + &op_graph, + &data_ptrs, + &uids, + exhaustive_search, + deterministic, + feature_vector, + &plan_cache); +} + +/* + * Implements BNFinalize pattern. 
It works with aforementioned bnstats node: + * ``` + * y = bn_finalize(genstats(conv_out)) + * ``` + * is the same as: + * ``` + * y = batchnorm2d(conv_out) + * ``` + */ +template +void BNFinalizeImpl(const Context& dev_ctx, + const DenseTensor& sum_tensor, + const DenseTensor& sqsum_tensor, + const DenseTensor& bn_scale, + const DenseTensor& bn_bias, + const DenseTensor& input_running_mean, + const DenseTensor& input_running_var, + int64_t accumulation_count, + float exp_decay, + float epsilon, + bool exhaustive_search, + bool deterministic, + DenseTensor* out_running_mean, + DenseTensor* out_running_var, + DenseTensor* saved_mean, + DenseTensor* saved_var, + DenseTensor* eq_scale, + DenseTensor* eq_bias) { + auto& plan_cache = phi::autotune::AutoTuneCache::Instance().GetConvV8( + phi::autotune::AlgorithmType::kBNFinalize); + + auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + // set dtypes + cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC; + auto tensor_format_bn = + phi::backends::gpu::ToCudnnDataType(sum_tensor.dtype()); + auto tensor_format = phi::backends::gpu::ToCudnnDataType(eq_scale->dtype()); + auto compute_dtype = CUDNN_DATA_FLOAT; + // create tensor descriptors + auto dim_input = phi::vectorize(sum_tensor.dims()); + std::vector dim_c = {1, dim_input[0], 1, 1}; // [1, C, 1, 1] + std::vector dim_scalar = {1, 1, 1, 1}; + std::vector stride_scalar = {1, 1, 1, 1}; + + std::vector data_ptrs; + std::vector uids; + int64_t uid = 100; + + // inputs + auto sum_desc = helper::GetGeneralTensorDescriptor( + dim_c, layout_format, ++uid, 16, tensor_format_bn); + data_ptrs.push_back(const_cast(sum_tensor.data())); + uids.push_back(uid); + + auto sqsum_desc = helper::GetGeneralTensorDescriptor( + dim_c, layout_format, ++uid, 16, tensor_format_bn); + data_ptrs.push_back(const_cast(sqsum_tensor.data())); + uids.push_back(uid); + + auto scale_desc = helper::GetGeneralTensorDescriptor( + dim_c, layout_format, ++uid, 16, tensor_format_bn); + data_ptrs.push_back(const_cast(bn_scale.data())); + uids.push_back(uid); + + auto bias_desc = helper::GetGeneralTensorDescriptor( + dim_c, layout_format, ++uid, 16, tensor_format_bn); + data_ptrs.push_back(const_cast(bn_bias.data())); + uids.push_back(uid); + + auto input_running_mean_desc = helper::GetGeneralTensorDescriptor( + dim_c, layout_format, ++uid, 16, tensor_format_bn); + data_ptrs.push_back(const_cast(input_running_mean.data())); + uids.push_back(uid); + + auto input_running_var_desc = helper::GetGeneralTensorDescriptor( + dim_c, layout_format, ++uid, 16, tensor_format_bn); + data_ptrs.push_back(const_cast(input_running_var.data())); + uids.push_back(uid); + + // outputs + auto updated_running_mean_desc = helper::GetGeneralTensorDescriptor( + dim_c, layout_format, ++uid, 16, tensor_format_bn); + data_ptrs.push_back(out_running_mean->data()); + uids.push_back(uid); + + auto updated_running_var_desc = helper::GetGeneralTensorDescriptor( + dim_c, layout_format, ++uid, 16, tensor_format_bn); + data_ptrs.push_back(out_running_var->data()); + uids.push_back(uid); + + auto saved_mean_desc = helper::GetGeneralTensorDescriptor( + dim_c, layout_format, ++uid, 16, tensor_format_bn); + data_ptrs.push_back(saved_mean->data()); + uids.push_back(uid); + + auto saved_inv_var_desc = helper::GetGeneralTensorDescriptor( + dim_c, layout_format, ++uid, 16, tensor_format_bn); + data_ptrs.push_back(saved_var->data()); + uids.push_back(uid); + + auto eq_scale_desc = helper::GetGeneralTensorDescriptor( + dim_c, 
layout_format, ++uid, 16, tensor_format); + data_ptrs.push_back(eq_scale->data()); + uids.push_back(uid); + + auto eq_bias_desc = helper::GetGeneralTensorDescriptor( + dim_c, layout_format, ++uid, 16, tensor_format); + data_ptrs.push_back(eq_bias->data()); + uids.push_back(uid); + + // scalar descriptors + auto epsilon_desc = cudnn_frontend::TensorBuilder() + .setDim(dim_scalar.size(), dim_scalar.data()) + .setStride(stride_scalar.size(), stride_scalar.data()) + .setId(++uid) + .setAlignment(16) + .setDataType(CUDNN_DATA_FLOAT) + .setByValue(true) + .build(); + data_ptrs.push_back(&epsilon); + uids.push_back(uid); + + auto exp_decay_desc = + cudnn_frontend::TensorBuilder() + .setDim(dim_scalar.size(), dim_scalar.data()) + .setStride(stride_scalar.size(), stride_scalar.data()) + .setId(++uid) + .setAlignment(16) + .setDataType(CUDNN_DATA_FLOAT) + .setByValue(true) + .build(); + data_ptrs.push_back(&exp_decay); + uids.push_back(uid); + + auto accum_count_desc = + cudnn_frontend::TensorBuilder() + .setDim(dim_scalar.size(), dim_scalar.data()) + .setStride(stride_scalar.size(), stride_scalar.data()) + .setId(++uid) + .setAlignment(16) + .setDataType(CUDNN_DATA_INT64) + .setByValue(true) + .build(); + data_ptrs.push_back(&accumulation_count); + uids.push_back(uid); + + // build ops + auto finalize_stat_op = + cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR) + .setComputeType(compute_dtype) + .setBNFinalizeMode(CUDNN_BN_FINALIZE_STATISTICS_TRAINING) + .setSumDesc(sum_desc) + .setSqSumDesc(sqsum_desc) + .setScaleAndBias(scale_desc, bias_desc) + .setEqScaleAndBias(eq_scale_desc, eq_bias_desc) + .setPrevRunningMeanAndVar(input_running_mean_desc, + input_running_var_desc) + .setNextRunningMeanAndVar(updated_running_mean_desc, + updated_running_var_desc) + .setSavedMeanAndInvVar(saved_mean_desc, saved_inv_var_desc) + .setEpsilonTensor(epsilon_desc) + .setAccumCountTensor(accum_count_desc) + .setExpDecayFactorTensor(exp_decay_desc) + .build(); + + std::array ops = {&finalize_stat_op}; + auto op_graph = cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(ops.size(), ops.data()) + .build(); + VLOG(6) << op_graph.describe(); + + cudnn_frontend::feature_vector_t feature_vector; + phi::autotune::BuildFeatureVector( + &feature_vector, dim_input, accumulation_count, exp_decay, epsilon); + + helper::QueryCacheAndExecute(handle, + &workspace_handle, + &op_graph, + &data_ptrs, + &uids, + exhaustive_search, + deterministic, + feature_vector, + &plan_cache); +} + +template +void FusedScaleBiasReluConvBnstatsKernel( + const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& w, + const paddle::optional& scale, + const paddle::optional& bias, + const DenseTensor& bn_scale, + const DenseTensor& bn_bias, + const DenseTensor& input_running_mean, + const DenseTensor& input_running_var, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const std::string& padding_algorithm, + int groups, + const std::string& data_format, + float momentum, + float epsilon, + bool fuse_prologue, + bool exhaustive_search, + int64_t accumulation_count, + DenseTensor* out, + DenseTensor* out_running_mean, + DenseTensor* out_running_var, + DenseTensor* saved_mean, + DenseTensor* saved_var, + DenseTensor* eq_scale, + DenseTensor* eq_bias) { + auto cudnn_version = phi::backends::gpu::DnnVersion(); + PADDLE_ENFORCE_GE(cudnn_version, + 8800, + phi::errors::PreconditionNotMet( + "This op only supports CUDNN version >= 
8800, " + "but got %d.", + cudnn_version)); + PADDLE_ENFORCE_GE(dev_ctx.GetComputeCapability(), + 80, + phi::errors::PreconditionNotMet( + "This op only supports Ampere and later devices, " + "but got compute capability: %d.", + dev_ctx.GetComputeCapability())); + // attr + float exp_decay = 1. - momentum; + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = + std::max(epsilon, static_cast(CUDNN_BN_MIN_EPSILON + FLT_EPSILON)); + // exhaustive search + exhaustive_search = exhaustive_search || FLAGS_cudnn_exhaustive_search; + bool deterministic = FLAGS_cudnn_deterministic; + PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, + false, + phi::errors::InvalidArgument( + "Can't set exhaustive_search to true and " + "FLAGS_cudnn_deterministic to true at the same time.")); + // check optional inputs + if (fuse_prologue) { + PADDLE_ENFORCE_EQ( + scale && bias, + true, + phi::errors::InvalidArgument( + "\"scale\" and \"bias\" must be provided " + "when fuse_prologue = true. Got scale = %d; bias = %d.", + scale, + bias)); + } + + // alloc output variables + dev_ctx.template Alloc(out); + dev_ctx.template Alloc(out_running_mean); + dev_ctx.template Alloc(out_running_var); + dev_ctx.template Alloc(saved_mean); + dev_ctx.template Alloc(saved_var); + dev_ctx.template Alloc(eq_scale); + dev_ctx.template Alloc(eq_bias); + + // deal with strides, dilations and paddings + if (accumulation_count == 0) { + // dim_out = [N, H, W, C] + // accumulation_count = N * H * W + auto dim_out = phi::vectorize(out->dims()); + accumulation_count = dim_out[0] * dim_out[1] * dim_out[2]; + } + + // Step 1: Scale Bias ReLU Conv BNStats + auto bn_dims = bn_scale.dims(); + DenseTensor sum_tensor(bn_scale.dtype()); + DenseTensor sqsum_tensor(bn_scale.dtype()); + sum_tensor.Resize(bn_dims); + sqsum_tensor.Resize(bn_dims); + dev_ctx.template Alloc(&sum_tensor); + dev_ctx.template Alloc(&sqsum_tensor); + FusedScaleBiasReluConvBnstatsImpl(dev_ctx, + x, + w, + scale, + bias, + paddings, + dilations, + strides, + padding_algorithm, + fuse_prologue, + exhaustive_search, + deterministic, + out, + &sum_tensor, + &sqsum_tensor); + // Step 2: BN Finalize + BNFinalizeImpl(dev_ctx, + sum_tensor, + sqsum_tensor, + bn_scale, + bn_bias, + input_running_mean, + input_running_var, + accumulation_count, + exp_decay, + epsilon, + exhaustive_search, + deterministic, + out_running_mean, + out_running_var, + saved_mean, + saved_var, + eq_scale, + eq_bias); +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_scale_bias_relu_conv_bnstats, + GPU, + ALL_LAYOUT, + phi::fusion::FusedScaleBiasReluConvBnstatsKernel, + phi::dtype::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); +} diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index 0609bf945d9b38..78c3723ceedcbd 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -34,7 +34,7 @@ #define LAUNCH_BOUNDS(BlockDim) #endif -DECLARE_bool(cudnn_batchnorm_spatial_persistent); +PD_DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace phi { template @@ -1387,7 +1387,6 @@ PD_REGISTER_KERNEL(batch_norm_grad,
phi::dtype::float16) { if (kernel_key.dtype() == phi::DataType::FLOAT16 || kernel_key.dtype() == phi::DataType::BFLOAT16) { - kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad } @@ -1405,7 +1404,6 @@ PD_REGISTER_KERNEL(batch_norm_grad, double, phi::dtype::float16) { if (kernel_key.dtype() == phi::DataType::FLOAT16) { - kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad } diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 1a07e5f0d49098..ad276ec6f1812b 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -41,7 +41,7 @@ namespace cub = hipcub; #define LAUNCH_BOUNDS(BlockDim) #endif -DECLARE_bool(cudnn_batchnorm_spatial_persistent); +PD_DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace phi { diff --git a/paddle/phi/kernels/gpu/c_embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/c_embedding_grad_kernel.cu index aaa5c6865be474..66057db357e50a 100644 --- a/paddle/phi/kernels/gpu/c_embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/c_embedding_grad_kernel.cu @@ -14,7 +14,6 @@ #include "paddle/phi/kernels/c_embedding_kernel.h" -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -22,8 +21,9 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/embedding_grad.h" +#include "paddle/utils/flags.h" -DECLARE_int64(embedding_deterministic); +PD_DECLARE_int64(embedding_deterministic); namespace phi { diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index 99ba12b1d62135..a7c75e64a462ad 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -15,7 +15,6 @@ #include "paddle/phi/kernels/embedding_grad_kernel.h" #include "paddle/phi/kernels/funcs/embedding_grad.h" -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" @@ -26,8 +25,9 @@ #include "paddle/phi/core/mixed_vector.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/embedding_util.h" +#include "paddle/utils/flags.h" -DECLARE_int64(embedding_deterministic); +PD_DECLARE_int64(embedding_deterministic); namespace phi { diff --git a/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu index 60a1c54d72678e..7b76a5f458dddf 100644 --- a/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu @@ -24,7 +24,7 @@ #include "paddle/phi/kernels/gpu/flash_attn_utils.h" #include "paddle/phi/kernels/reshape_kernel.h" -DECLARE_bool(cudnn_deterministic); +PD_DECLARE_bool(cudnn_deterministic); namespace phi { diff --git a/paddle/phi/kernels/gpu/flash_attn_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_kernel.cu index bcf8791d3c17f6..4bd5e28c09fedc 100644 --- a/paddle/phi/kernels/gpu/flash_attn_kernel.cu +++ b/paddle/phi/kernels/gpu/flash_attn_kernel.cu @@ -24,7 +24,7 @@ #include "paddle/phi/kernels/gpu/flash_attn_utils.h" #include 
"paddle/phi/kernels/reshape_kernel.h" -DECLARE_bool(cudnn_deterministic); +PD_DECLARE_bool(cudnn_deterministic); namespace phi { diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu index b1ffa921f912b7..2b847fb216bb9c 100644 --- a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu @@ -21,7 +21,7 @@ #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/gpu/gelu_funcs.h" -DECLARE_bool(use_fast_math); +PD_DECLARE_bool(use_fast_math); namespace phi { diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu index e0792c387d7510..8400b5d8cd5a5f 100644 --- a/paddle/phi/kernels/gpu/gelu_kernel.cu +++ b/paddle/phi/kernels/gpu/gelu_kernel.cu @@ -25,7 +25,7 @@ #include "paddle/phi/kernels/gpu/gelu_funcs.h" // clang-format on -DECLARE_bool(use_fast_math); +PD_DECLARE_bool(use_fast_math); namespace phi { diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu index 9ba5bde7f61360..8fd15d5435f98b 100644 --- a/paddle/phi/kernels/gpu/index_add_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_kernel.cu @@ -14,15 +14,15 @@ #include "paddle/phi/kernels/index_add_kernel.h" -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/utils/flags.h" -DECLARE_bool(cudnn_deterministic); +PD_DECLARE_bool(cudnn_deterministic); namespace phi { diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index 64e3428cc9aaf8..03f74888fca0f6 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -14,7 +14,6 @@ #include "paddle/phi/kernels/index_select_grad_kernel.h" -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" @@ -22,8 +21,9 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/utils/flags.h" -DECLARE_bool(cudnn_deterministic); +PD_DECLARE_bool(cudnn_deterministic); namespace phi { diff --git a/paddle/phi/kernels/gpu/isclose_kernel.cu b/paddle/phi/kernels/gpu/isclose_kernel.cu index cfae8d0bbda29a..1242269242e0bf 100644 --- a/paddle/phi/kernels/gpu/isclose_kernel.cu +++ b/paddle/phi/kernels/gpu/isclose_kernel.cu @@ -25,4 +25,6 @@ PD_REGISTER_KERNEL(isclose, phi::IscloseKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/lars_momentum_kernel.cu b/paddle/phi/kernels/gpu/lars_momentum_kernel.cu index 14b7f1ca328604..be8b1ff3796d01 100644 --- a/paddle/phi/kernels/gpu/lars_momentum_kernel.cu +++ b/paddle/phi/kernels/gpu/lars_momentum_kernel.cu @@ -25,11 +25,7 @@ #include #endif -#ifdef __HIPCC__ -#define LARS_BLOCK_SIZE 256 -#else #define LARS_BLOCK_SIZE 512 -#endif #define LARS_MAX_MERGED_OPS 60 diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index c5bb0c288f2609..eb85d9ac826d0a 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ 
b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -13,13 +13,13 @@ // limitations under the License. #include "paddle/phi/kernels/layer_norm_kernel.h" -#include "gflags/gflags.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h" #include "paddle/phi/kernels/funcs/layer_norm_util.h" +#include "paddle/utils/flags.h" -DECLARE_bool(use_fast_math); +PD_DECLARE_bool(use_fast_math); namespace phi { diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index 2ad512701e097f..4c6597b93f91fd 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -26,7 +26,6 @@ namespace cub = hipcub; #endif -#include "gflags/gflags.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/memory_utils.h" @@ -34,6 +33,7 @@ namespace cub = hipcub; #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/randint_kernel.h" +#include "paddle/utils/flags.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu index d368c43a297532..e455714c50829d 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu @@ -113,11 +113,7 @@ void CalculateXGrad(const Context& ctx, const DenseTensor& out_grad_tensor, const DenseTensor* dst_count = nullptr, const DenseTensor* out = nullptr) { -#ifdef PADDLE_WITH_HIP - int block = 256; -#else int block = 1024; -#endif int64_t n = slice_size * index_size; int max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (n + block - 1) / block; diff --git a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu index 764490bd1cb8b0..7274b391e8d135 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu @@ -101,11 +101,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, const dim3 grid(nbx, nby); const dim3 block(ntx, nty); int64_t input_size = x.dims()[0]; -#ifdef PADDLE_WITH_HIP - int block_ = 256; -#else int block_ = 1024; -#endif if (reduce_op == "SUM" || reduce_op == "MEAN") { GraphSendUERecvSumCUDAFunctor sum_functor; if (message_op == "ADD") { diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h index 3962d86c3e7b9b..307b51a1ca119c 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h @@ -35,11 +35,7 @@ namespace cub = hipcub; namespace phi { -#ifdef __HIPCC__ -static constexpr int kNumCUDAThreads = 256; -#else static constexpr int kNumCUDAThreads = 512; -#endif static constexpr int kNumMaxinumNumBlocks = 4096; static inline int NumBlocks(const int N) { diff --git a/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu b/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu index 653a64b127a254..87bae7fe5647fa 100644 --- a/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu @@ -16,11 +16,11 @@ limitations under the License. 
*/ #include -#include "gflags/gflags.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" +#include "paddle/utils/flags.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/uniform_kernel.cu b/paddle/phi/kernels/gpu/uniform_kernel.cu index 04217db0a74c1a..2a514947bb7177 100644 --- a/paddle/phi/kernels/gpu/uniform_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_kernel.cu @@ -16,10 +16,10 @@ #include -#include "gflags/gflags.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" +#include "paddle/utils/flags.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/unpool_grad_kernel.cu b/paddle/phi/kernels/gpu/unpool_grad_kernel.cu index 959544cdbb969c..7cf08d92401cb7 100644 --- a/paddle/phi/kernels/gpu/unpool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/unpool_grad_kernel.cu @@ -88,11 +88,7 @@ class Unpool2dMaxGradFunctor { const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = dev_ctx.template Alloc(input_grad); -#ifdef __HIPCC__ - int threads = 256; -#else int threads = 1024; -#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMaxGrad <<>>(input.numel(), @@ -131,11 +127,7 @@ class Unpool3dMaxGradFunctor { const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = dev_ctx.template Alloc(input_grad); -#ifdef __HIPCC__ - int threads = 256; -#else int threads = 1024; -#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool3dMaxGrad <<>>(input.numel(), diff --git a/paddle/phi/kernels/gpu/unpool_kernel.cu b/paddle/phi/kernels/gpu/unpool_kernel.cu index 9365c286195be6..1e09323642b673 100644 --- a/paddle/phi/kernels/gpu/unpool_kernel.cu +++ b/paddle/phi/kernels/gpu/unpool_kernel.cu @@ -80,11 +80,7 @@ class Unpool2dMaxFunctor { const T* input_data = input.data(); const int* indices_data = indices.data(); T* output_data = dev_ctx.template Alloc(output); -#ifdef __HIPCC__ - int threads = 256; -#else int threads = 1024; -#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMax <<>>(input.numel(), @@ -117,11 +113,7 @@ class Unpool3dMaxFunctor { const T* input_data = input.data(); const int* indices_data = indices.data(); T* output_data = dev_ctx.template Alloc(output); -#ifdef __HIPCC__ - int threads = 256; -#else int threads = 1024; -#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool3dMax <<>>(input.numel(), diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h index ef8e606e547ce5..d0bdcc10beaa83 100644 --- a/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h @@ -367,6 +367,48 @@ class CudnnFrontendConvHelper { plan_cache); } + static void QueryCacheAndExecute( + cudnnHandle_t handle, + phi::DnnWorkspaceHandle* workspace_handle, + cudnn_frontend::OperationGraph* op_graph_pointer, + std::vector* data_ptrs, + std::vector* uids, + bool exhaustive_search, + bool deterministic, + const cudnn_frontend::feature_vector_t& feature_vector, + phi::autotune::CudnnFrontendPlanCache* plan_cache) { + if (plan_cache->FindPlan(feature_vector, handle)) { + const cudnn_frontend::ExecutionPlan* cached_plan = nullptr; + int64_t workspace_size = 0; + plan_cache->GetPlanAndWorkspaceSize( + 
feature_vector, &cached_plan, &workspace_size, handle); + ExecutePlan(handle, + workspace_handle, + data_ptrs, + uids, + cached_plan->get_raw_desc(), + workspace_size); + return; + } + + auto plans = FindExecutionPlans(op_graph_pointer, + exhaustive_search, + deterministic, + data_ptrs, + uids, + handle, + workspace_handle); + + ExecutePlansAndCache(handle, + workspace_handle, + data_ptrs, + uids, + &plans, + exhaustive_search, + feature_vector, + plan_cache); + } + static cudnn_frontend::Operation MakePointwiseOp( cudnnPointwiseMode_t mode, cudnnDataType_t dtype, @@ -435,7 +477,7 @@ void CudnnConvBwdDataV8(const DenseTensor* dy_tensor, if (plan_cache_bwd_data.FindPlan(op_graph, handle)) { const cudnn_frontend::ExecutionPlan* cached_plan = nullptr; int64_t workspace_size = 0; - plan_cache_bwd_data.GetPlan( + plan_cache_bwd_data.GetPlanAndWorkspaceSize( op_graph, &cached_plan, &workspace_size, handle); helper::ExecutePlan(handle, workspace_handle, @@ -509,7 +551,7 @@ void CudnnConvBwdFilterV8(const DenseTensor* x_tensor, if (plan_cache_bwd_filter.FindPlan(op_graph, handle)) { const cudnn_frontend::ExecutionPlan* cached_plan = nullptr; int64_t workspace_size = 0; - plan_cache_bwd_filter.GetPlan( + plan_cache_bwd_filter.GetPlanAndWorkspaceSize( op_graph, &cached_plan, &workspace_size, handle); helper::ExecutePlan(handle, workspace_handle, diff --git a/paddle/phi/kernels/gpudnn/conv_gpudnn_info.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_info.h index 9b7b35f4357cd4..0dd3d7f0cdea1b 100644 --- a/paddle/phi/kernels/gpudnn/conv_gpudnn_info.h +++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_info.h @@ -20,9 +20,9 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_dnn.h" -DECLARE_int64(conv_workspace_size_limit); -DECLARE_bool(cudnn_exhaustive_search); -DECLARE_int64(cudnn_exhaustive_search_times); +PD_DECLARE_int64(conv_workspace_size_limit); +PD_DECLARE_bool(cudnn_exhaustive_search); +PD_DECLARE_int64(cudnn_exhaustive_search_times); namespace phi { diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index 6dc7fc9e6131de..65418673827cd5 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -264,7 +264,8 @@ void ConvCudnnKernelImplV8(const DenseTensor* input_tensor, if (plan_cache.FindPlan(op_graph, handle)) { const cudnn_frontend::ExecutionPlan* cached_plan = nullptr; int64_t workspace_size = 0; - plan_cache.GetPlan(op_graph, &cached_plan, &workspace_size, handle); + plan_cache.GetPlanAndWorkspaceSize( + op_graph, &cached_plan, &workspace_size, handle); helper::ExecutePlan(handle, &workspace_handle, input_data, diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index fb434b5c9cfd75..a4571b83e39e75 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -870,11 +870,7 @@ static void GetGridDim( } static void GetBlockDim(int mid_dim, int low_dim, dim3* block) { -#ifdef __HIPCC__ - constexpr int max_num_threads = 256; -#else constexpr int max_num_threads = 1024; -#endif int block_x = 1 << Log2Ceil(low_dim); int block_y = 1 << Log2Ceil(mid_dim); block->x = std::min(block_x, 32); diff --git a/paddle/phi/kernels/impl/conv_cudnn_impl.h b/paddle/phi/kernels/impl/conv_cudnn_impl.h index c918eeec831219..81a7c7726203ea 100644 --- a/paddle/phi/kernels/impl/conv_cudnn_impl.h +++ b/paddle/phi/kernels/impl/conv_cudnn_impl.h @@ -30,9 +30,9 @@ #include "paddle/phi/kernels/funcs/batch_norm_utils.h" #include 
"paddle/phi/kernels/funcs/padding.h" -DECLARE_bool(cudnn_deterministic); -DECLARE_int64(conv_workspace_size_limit); -DECLARE_bool(cudnn_exhaustive_search); +PD_DECLARE_bool(cudnn_deterministic); +PD_DECLARE_int64(conv_workspace_size_limit); +PD_DECLARE_bool(cudnn_exhaustive_search); namespace phi { diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index d297e786845bc1..e32f64f347f4c6 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -15,8 +15,8 @@ #include -#include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/utils/flags.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/diagonal_kernel.h" @@ -27,7 +27,7 @@ #include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/utils/string/string_helper.h" -DECLARE_bool(einsum_opt); +PD_DECLARE_bool(einsum_opt); namespace phi { diff --git a/paddle/phi/kernels/impl/isclose_kernel_impl.h b/paddle/phi/kernels/impl/isclose_kernel_impl.h index de59cb0c32ca13..93dfb7790b4abd 100644 --- a/paddle/phi/kernels/impl/isclose_kernel_impl.h +++ b/paddle/phi/kernels/impl/isclose_kernel_impl.h @@ -19,6 +19,7 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" @@ -86,6 +87,40 @@ struct IscloseFunctor { } }; +template +struct IscloseFunctor> { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& in, + const DenseTensor& other, + const double rtol, + const double atol, + bool equal_nan, + DenseTensor* output) { + auto* in_a = in.data>(); + auto* in_b = other.data>(); + auto* out_data = ctx.template Alloc(output); + auto num = in.numel(); + // *out_data = true; + for (int i = 0; i < num; i++) { + out_data[i] = true; + } + for (int i = 0; i < num; i++) { + const phi::dtype::complex a = in_a[i], b = in_b[i]; + bool val; + if (std::isnan(a) || std::isnan(b)) { + val = equal_nan && std::isnan(a) == std::isnan(b); + } else { + T left = abs(a - b); + T right = atol + rtol * abs(b); + T diff = abs(left - right); + val = a == b || left <= right || diff <= 1e-15; + // *out_data &= val; + out_data[i] = val; + } + } + } +}; + #if defined(__NVCC__) || defined(__HIPCC__) template __global__ void IscloseCUDAKernel(const T* in_data, @@ -113,7 +148,59 @@ __global__ void IscloseCUDAKernel(const T* in_data, // if (!val) *out_data = false; } } +template <> +__global__ void IscloseCUDAKernel>( + const phi::dtype::complex* in_data, + const phi::dtype::complex* other_data, + const double rtol, + const double atol, + bool equal_nan, + int num, + bool* out_data) { + unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; + bool val; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + const phi::dtype::complex a = in_data[i]; + const phi::dtype::complex b = other_data[i]; + if (isnan(a) || isnan(b)) { + val = equal_nan && isnan(a) == isnan(b); + } else { + float left = abs(a - b); + float right = atol + rtol * abs(b); + float diff = abs(left - right); + val = a == b || left <= right || diff <= 1e-15; + } + out_data[i] = val; + // if (!val) *out_data = false; + } +} +template <> +__global__ void IscloseCUDAKernel>( + const phi::dtype::complex* in_data, + const phi::dtype::complex* other_data, + const double rtol, + const double atol, + bool equal_nan, + int num, + bool* out_data) { + 
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; + bool val; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + const phi::dtype::complex a = in_data[i]; + const phi::dtype::complex b = other_data[i]; + if (isnan(a) || isnan(b)) { + val = equal_nan && isnan(a) == isnan(b); + } else { + double left = abs(a - b); + double right = atol + rtol * abs(b); + double diff = abs(left - right); + val = a == b || left <= right || diff <= 1e-15; + } + out_data[i] = val; + // if (!val) *out_data = false; + } +} template struct GetTensorValue { T operator()(const phi::GPUContext& dev_ctx, diff --git a/paddle/phi/kernels/kps/logical_kernel.cu b/paddle/phi/kernels/kps/logical_kernel.cu index d2a6346fd3f04e..f7c390e65d0ff5 100644 --- a/paddle/phi/kernels/kps/logical_kernel.cu +++ b/paddle/phi/kernels/kps/logical_kernel.cu @@ -97,6 +97,8 @@ PD_REGISTER_KERNEL(logical_xor, KPS, ALL_LAYOUT, phi::LogicalXorKernel, int) { int64_t, \ int, \ int8_t, \ + phi::dtype::complex, \ + phi::dtype::complex, \ int16_t) {} REGISTER_LOGICAL_CUDA_KERNEL(logical_and, And) diff --git a/paddle/phi/kernels/legacy/gpu/uniform_kernel.cu b/paddle/phi/kernels/legacy/gpu/uniform_kernel.cu index 609238435c96f1..abf51cf61f2b5c 100644 --- a/paddle/phi/kernels/legacy/gpu/uniform_kernel.cu +++ b/paddle/phi/kernels/legacy/gpu/uniform_kernel.cu @@ -16,10 +16,10 @@ #include -#include "gflags/gflags.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" +#include "paddle/utils/flags.h" namespace phi { diff --git a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc index c4732db041edf7..b93975c188b01e 100644 --- a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/selected_rows/adam_kernel.h" -#include "gflags/gflags.h" #include "glog/logging.h" +#include "paddle/utils/flags.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" @@ -24,7 +24,7 @@ #include "paddle/phi/kernels/funcs/adam_functors.h" #include "paddle/phi/kernels/funcs/selected_rows_functor.h" -DECLARE_int32(inner_op_parallelism); +PD_DECLARE_int32(inner_op_parallelism); namespace phi { namespace sr { diff --git a/paddle/phi/ops/compat/distributed_fused_lamb_init_sig.cc b/paddle/phi/ops/compat/distributed_fused_lamb_init_sig.cc new file mode 100644 index 00000000000000..90c64a1d2ef7c0 --- /dev/null +++ b/paddle/phi/ops/compat/distributed_fused_lamb_init_sig.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
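+ +// Maps the legacy fluid operator "distributed_fused_lamb_init" to the phi +// kernel of the same name: the three lists below give the op's input names, +// attribute names, and output names in the order the phi kernel expects them.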
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DistributedFusedLambInitOpArgumentMapping( + const ArgumentMappingContext& ctx UNUSED) { + return KernelSignature( + "distributed_fused_lamb_init", + {"Param", "Grad"}, + {"beta1", "beta2", "apply_weight_decay", "alignment", "rank", "nranks"}, + {"FP32FusedParam", + "FP32FusedGrad", + "FP16FusedParam", + "FP16FusedGrad", + "Moment1", + "Moment2", + "Beta1Pow", + "Beta2Pow", + "FusedParamOffsets", + "FP32ShardFusedParamOffsets", + "FP16ShardFusedParamOffsets", + "ParamInfo", + "ParamOrder", + "ParamOut", + "MasterParamOut", + "GradOut", + "GlobalScale", + "Step"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(distributed_fused_lamb_init, + phi::DistributedFusedLambInitOpArgumentMapping); diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index a6884b0e1e7b70..5098e025fea5e2 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -402,8 +402,8 @@ if %day_now% NEQ %day_before% ( ) echo set -ex > cache.sh -echo md5_content=$(cat %work_dir:\=/%/cmake/external/*.cmake ^|md5sum ^| awk '{print $1}') >> cache.sh -echo echo ${md5_content}^>md5.txt >> cache.sh +echo md5_content=$(cat %work_dir:\=/%/cmake/external/*.cmake^|md5sum^|awk '{print $1}')$(git submodule status^|md5sum^|awk '{print $1}')>>cache.sh +echo echo ${md5_content}^>md5.txt>>cache.sh %cache_dir%\tools\busybox64.exe cat cache.sh %cache_dir%\tools\busybox64.exe bash cache.sh @@ -415,6 +415,51 @@ if "%WITH_GPU%"=="ON" ( ) else ( set sub_dir=cpu ) + +@ECHO ON +cd /d %work_dir% +python -c "import wget;wget.download('https://paddle-windows.bj.bcebos.com/third_party_code/%sub_dir%/%md5%.tar.gz')" 2>nul +if !ERRORLEVEL! EQU 0 ( + echo Getting source code of third party : extracting ... + tar -xf %md5%.tar.gz + del %md5%.tar.gz + if !errorlevel! EQU 0 ( + echo Getting source code of third party : successful + ) +) else ( + git submodule update --init --recursive + set BCE_FILE=%cache_dir%\bce-python-sdk-new\BosClient.py + echo Uploading source code of third_party: checking bce ... + if not exist %cache_dir%\bce-python-sdk-new ( + echo There is no bce in this PC, will install bce. + cd /d %cache_dir% + echo Download package from https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz + python -c "import wget;wget.download('https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz')" + python -c "import shutil;shutil.unpack_archive('bos_new.tar.gz', extract_dir='./bce-python-sdk-new',format='gztar')" + ) + python -m pip install pycryptodome + python -m pip install bce-python-sdk==0.8.74 + if !errorlevel! EQU 0 ( + cd /d %work_dir% + echo Uploading source code of third party: compressing ... + tar -zcf %md5%.tar.gz ./third_party ./.git/modules + if !errorlevel! EQU 0 ( + echo Uploading source code of third party: uploading ... + python !BCE_FILE! %md5%.tar.gz paddle-windows/third_party_code/%sub_dir% 1>nul + if !errorlevel! EQU 0 ( + echo Upload source code of third party %md5% to bos paddle-windows/third_party_code/%sub_dir% successfully. + ) else ( + echo Failed upload source code of third party to bos, reason: upload failed. + ) + ) else ( + echo Failed upload source code of third party to bos, reason: compress failed. + ) + del %md5%.tar.gz + ) else ( + echo Failed upload source code of third party to bos, reason: install bce failed. 
+ ) +) + set THIRD_PARTY_HOME=%cache_dir:\=/%/third_party/%sub_dir% set THIRD_PARTY_PATH=%THIRD_PARTY_HOME%/%md5% @@ -450,7 +495,9 @@ if not exist %THIRD_PARTY_PATH% ( echo Found reusable third_party cache in %THIRD_PARTY_PATH%, will reuse it. ) + :cmake_impl +cd /d %work_dir%\%BUILD_DIR% echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ @@ -582,8 +629,9 @@ if "%UPLOAD_TP_FILE%"=="ON" ( echo Download package from https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz python -c "import wget;wget.download('https://xly-devops.bj.bcebos.com/home/bos_new.tar.gz')" python -c "import shutil;shutil.unpack_archive('bos_new.tar.gz', extract_dir='./bce-python-sdk-new',format='gztar')" - python -m pip install bce-python-sdk==0.8.74 ) + python -m pip install pycryptodome + python -m pip install bce-python-sdk==0.8.74 if !errorlevel! EQU 0 ( cd /d %THIRD_PARTY_HOME% echo Uploading third_party: compressing ... diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index b22f0c3a450497..6701693545e49e 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/platform/init.h" #include "paddle/phi/core/flags.h" +#include "paddle/utils/flags.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DECLARE_bool(enable_gpu_memory_usage_log); +PD_DECLARE_bool(enable_gpu_memory_usage_log); #endif int main(int argc, char** argv) { // NOLINT @@ -32,11 +32,8 @@ int main(int argc, char** argv) { // NOLINT } std::vector envs; - std::vector undefok; #if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_PSLIB) - std::string str_max_body_size; - if (::GFLAGS_NAMESPACE::GetCommandLineOption("max_body_size", - &str_max_body_size)) { + if (paddle::flags::FindFlag("max_body_size")) { setenv("FLAGS_max_body_size", "2147483647", 1); envs.push_back("max_body_size"); } @@ -45,18 +42,8 @@ int main(int argc, char** argv) { // NOLINT const auto& flag_map = phi::GetExportedFlagInfoMap(); for (const auto& pair : flag_map) { const std::string& name = pair.second.name; - // NOTE(zhiqiu): some names may not linked in some tests, so add to - // `undefok`. - // One way to handle that is to check each flag item by item, and put it in - // `envs` or `undefok`; - // another way is to add all flags to `envs` and `undeok`, basically it is - // not a good design, - // but it can simplify the procedure of creating new flag and seems no side - // effects. 
- // see details: https://gflags.github.io/gflags/#special if (pair.second.is_writable) { // means public envs.push_back(name); - undefok.push_back(name); } } @@ -72,20 +59,8 @@ int main(int argc, char** argv) { // NOLINT VLOG(1) << "gtest env_string:" << env_string; } - char* undefok_str = nullptr; - if (!undefok.empty()) { - std::string undefok_string = "--undefok="; - for (auto t : undefok) { - undefok_string += t + ","; - } - undefok_string = undefok_string.substr(0, undefok_string.length() - 1); - undefok_str = strdup(undefok_string.c_str()); - new_argv.push_back(undefok_str); - VLOG(1) << "gtest undefok_string:" << undefok_string; - } - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (strstr(undefok_str, "enable_gpu_memory_usage_log")) { + if (strstr(env_str, "enable_gpu_memory_usage_log")) { VLOG(1) << "Set FLAGS_enable_gpu_memory_usage_log to true"; FLAGS_enable_gpu_memory_usage_log = true; } @@ -93,8 +68,8 @@ int main(int argc, char** argv) { // NOLINT int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags( - &new_argc, &new_argv_address, false); + paddle::flags::AllowUndefinedFlags(); + paddle::flags::ParseCommandLineFlags(&new_argc, &new_argv_address); paddle::framework::InitMemoryMethod(); paddle::framework::InitDevices(); paddle::framework::InitDefaultKernelSignatureMap(); @@ -102,6 +77,5 @@ int main(int argc, char** argv) { // NOLINT int ret = RUN_ALL_TESTS(); if (env_str) free(env_str); - if (undefok_str) free(undefok_str); return ret; } diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index dbb53a3ac368a2..bb177e7578bb40 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -23,3 +23,11 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) SRCS pybind.cc DEPS phi) endif() + +if(NOT WITH_GFLAGS) + cc_library(paddle_flags SRCS flags_native.cc) + cc_test( + flags_native_test + SRCS flags_native_test.cc + DEPS paddle_flags) +endif() diff --git a/paddle/utils/flags.h b/paddle/utils/flags.h new file mode 100644 index 00000000000000..5a019f1439b434 --- /dev/null +++ b/paddle/utils/flags.h @@ -0,0 +1,77 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
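+ +// Facade over the two flag backends: with WITH_GFLAGS=ON the PD_* macros +// expand to their gflags counterparts; otherwise flags_native.h supplies them +// via paddle_flags. A minimal usage sketch (the flag name is hypothetical): +// +// PD_DEFINE_bool(enable_foo, false, "Enable the foo pass."); // one TU +// PD_DECLARE_bool(enable_foo); // other TUs +// if (FLAGS_enable_foo) { ... }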
+ +#pragma once + +#ifdef PADDLE_WITH_GFLAGS +#include "gflags/gflags.h" +#else +#include "paddle/utils/flags_native.h" +#endif + +#ifdef PADDLE_WITH_GFLAGS +#define PD_DEFINE_bool(name, val, txt) DEFINE_bool(name, val, txt) +#define PD_DEFINE_int32(name, val, txt) DEFINE_int32(name, val, txt) +#define PD_DEFINE_uint32(name, val, txt) DEFINE_uint32(name, val, txt) +#define PD_DEFINE_int64(name, val, txt) DEFINE_int64(name, val, txt) +#define PD_DEFINE_uint64(name, val, txt) DEFINE_uint64(name, val, txt) +#define PD_DEFINE_double(name, val, txt) DEFINE_double(name, val, txt) +#define PD_DEFINE_string(name, val, txt) DEFINE_string(name, val, txt) + +#define PD_DECLARE_bool(name) DECLARE_bool(name) +#define PD_DECLARE_int32(name) DECLARE_int32(name) +#define PD_DECLARE_uint32(name) DECLARE_uint32(name) +#define PD_DECLARE_int64(name) DECLARE_int64(name) +#define PD_DECLARE_uint64(name) DECLARE_uint64(name) +#define PD_DECLARE_double(name) DECLARE_double(name) +#define PD_DECLARE_string(name) DECLARE_string(name) +#endif + +namespace paddle { +namespace flags { + +#ifdef PADDLE_WITH_GFLAGS +inline void ParseCommandLineFlags(int* argc, char*** argv) { + gflags::ParseCommandLineFlags(argc, argv, true); +} +#else +using paddle::flags::ParseCommandLineFlags; +#endif + +#ifdef PADDLE_WITH_GFLAGS +inline bool SetFlagValue(const char* name, const char* value) { + std::string ret = gflags::SetCommandLineOption(name, value); + return ret.empty() ? false : true; +} +#else +using paddle::flags::SetFlagValue; +#endif + +#ifdef PADDLE_WITH_GFLAGS +inline bool FindFlag(const char* name) { + std::string value; + return gflags::GetCommandLineOption(name, &value); +} +#else +using paddle::flags::FindFlag; +#endif + +#ifdef PADDLE_WITH_GFLAGS +inline void AllowUndefinedFlags() { gflags::AllowCommandLineReparsing(); } +#else +using paddle::flags::AllowUndefinedFlags; +#endif + +} // namespace flags +} // namespace paddle diff --git a/paddle/utils/flags_native.cc b/paddle/utils/flags_native.cc new file mode 100644 index 00000000000000..05d90d8adf21cc --- /dev/null +++ b/paddle/utils/flags_native.cc @@ -0,0 +1,484 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
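+ +// Registration flow of this gflags replacement: each PD_DEFINE_* (declared in +// flags_native.h) constructs a FlagRegisterer, whose constructor wraps the +// flag's name, help text, defining file, type, default value and current value +// in a Flag object and records it in the singleton FlagRegistry defined below.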
+ +#include "paddle/utils/flags_native.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace flags { + +std::stringstream& ErrorStream() { + static std::stringstream err_ss; + return err_ss; +} + +inline void exit_with_errors() { + std::cerr << ErrorStream().str(); + exit(-1); +} + +#define LOG_FLAG_ERROR(message) \ + ErrorStream() << "paddle flags error: " << message << " (at " << __FILE__ \ + << ":" << __LINE__ << ")" << std::endl + +#define LOG_FLAG_FATAL_ERROR(message) \ + LOG_FLAG_ERROR(message); \ + exit_with_errors() + +enum class FlagType : uint8_t { + BOOL = 0, + INT32 = 1, + UINT32 = 2, + INT64 = 3, + UINT64 = 4, + DOUBLE = 5, + STRING = 6, + UNDEFINED = 7, +}; + +class Flag { + public: + Flag(std::string name, + std::string description, + std::string file, + FlagType type, + const void* default_value, + void* value) + : name_(name), + description_(description), + file_(file), + type_(type), + default_value_(default_value), + value_(value) {} + ~Flag() = default; + + // Summary: --name_: type_, description_ (default: default_value_) + std::string Summary() const; + + void SetValueFromString(const std::string& value); + + private: + friend class FlagRegistry; + + const std::string name_; + const std::string description_; + const std::string file_; + const FlagType type_; + const void* default_value_; + void* value_; +}; + +class FlagRegistry { + public: + static FlagRegistry* Instance() { + static FlagRegistry* global_registry_ = new FlagRegistry(); + return global_registry_; + } + + void RegisterFlag(Flag* flag); + + bool SetFlagValue(const std::string& name, const std::string& value); + + bool HasFlag(const std::string& name) const; + + void PrintAllFlagHelp(std::ostream& os) const; + + private: + FlagRegistry() = default; + + std::map flags_; + + struct FlagCompare { + bool operator()(const Flag* flag1, const Flag* flag2) const { + return flag1->name_ < flag2->name_; + } + }; + + std::map> flags_by_file_; + + std::mutex mutex_; +}; + +template +struct FlagTypeTraits { + static constexpr FlagType Type = FlagType::UNDEFINED; +}; + +#define DEFINE_FLAG_TYPE_TRAITS(type, flag_type) \ + template <> \ + struct FlagTypeTraits { \ + static constexpr FlagType Type = flag_type; \ + } + +DEFINE_FLAG_TYPE_TRAITS(bool, FlagType::BOOL); +DEFINE_FLAG_TYPE_TRAITS(int32_t, FlagType::INT32); +DEFINE_FLAG_TYPE_TRAITS(uint32_t, FlagType::UINT32); +DEFINE_FLAG_TYPE_TRAITS(int64_t, FlagType::INT64); +DEFINE_FLAG_TYPE_TRAITS(uint64_t, FlagType::UINT64); +DEFINE_FLAG_TYPE_TRAITS(double, FlagType::DOUBLE); +DEFINE_FLAG_TYPE_TRAITS(std::string, FlagType::STRING); + +#undef DEFINE_FLAG_TYPE_TRAITS + +template +FlagRegisterer::FlagRegisterer(std::string name, + std::string help, + std::string file, + const T* default_value, + T* value) { + FlagType type = FlagTypeTraits::Type; + Flag* flag = new Flag(name, help, file, type, default_value, value); + FlagRegistry::Instance()->RegisterFlag(flag); +} + +// Instantiate FlagRegisterer for supported types. 
+ +// Instantiate FlagRegisterer for supported types. +#define INSTANTIATE_FLAG_REGISTERER(type) \ + template FlagRegisterer::FlagRegisterer(std::string name, \ + std::string help, \ + std::string file, \ + const type* default_value, \ + type* value) + +INSTANTIATE_FLAG_REGISTERER(bool); +INSTANTIATE_FLAG_REGISTERER(int32_t); +INSTANTIATE_FLAG_REGISTERER(uint32_t); +INSTANTIATE_FLAG_REGISTERER(int64_t); +INSTANTIATE_FLAG_REGISTERER(uint64_t); +INSTANTIATE_FLAG_REGISTERER(double); +INSTANTIATE_FLAG_REGISTERER(std::string); + +#undef INSTANTIATE_FLAG_REGISTERER + +std::string FlagType2String(FlagType type) { + switch (type) { + case FlagType::BOOL: + return "bool"; + case FlagType::INT32: + return "int32"; + case FlagType::UINT32: + return "uint32"; + case FlagType::INT64: + return "int64"; + case FlagType::UINT64: + return "uint64"; + case FlagType::DOUBLE: + return "double"; + case FlagType::STRING: + return "string"; + default: + return "undefined"; + } +} + +std::string Value2String(const void* value, FlagType type) { + switch (type) { + case FlagType::BOOL: { + const bool* val = static_cast<const bool*>(value); + return *val ? "true" : "false"; + } + case FlagType::INT32: { + const int32_t* val = static_cast<const int32_t*>(value); + return std::to_string(*val); + } + case FlagType::UINT32: { + const uint32_t* val = static_cast<const uint32_t*>(value); + return std::to_string(*val); + } + case FlagType::INT64: { + const int64_t* val = static_cast<const int64_t*>(value); + return std::to_string(*val); + } + case FlagType::UINT64: { + const uint64_t* val = static_cast<const uint64_t*>(value); + return std::to_string(*val); + } + case FlagType::DOUBLE: { + const double* val = static_cast<const double*>(value); + return std::to_string(*val); + } + case FlagType::STRING: { + const std::string* val = static_cast<const std::string*>(value); + return *val; + } + default: + LOG_FLAG_FATAL_ERROR("flag type is undefined."); + return ""; + } +} + +std::string Flag::Summary() const { + return "--" + name_ + ": " + FlagType2String(type_) + ", " + description_ + + " (default: " + Value2String(default_value_, type_) + ")"; +} + +void Flag::SetValueFromString(const std::string& value) { + try { + switch (type_) { + case FlagType::BOOL: { + bool* val = static_cast<bool*>(value_); + if (value == "true" || value == "True" || value == "TRUE" || + value == "1") { + *val = true; + } else if (value == "false" || value == "False" || value == "FALSE" || + value == "0") { + *val = false; + } else { + throw std::invalid_argument( + ", please use [true, True, TRUE, 1] or [false, False, FALSE, " + "0]."); + } + break; + } + case FlagType::INT32: { + int32_t* val = static_cast<int32_t*>(value_); + *val = std::stoi(value); + break; + } + case FlagType::UINT32: { + uint32_t* val = static_cast<uint32_t*>(value_); + *val = std::stoul(value); + break; + } + case FlagType::INT64: { + int64_t* val = static_cast<int64_t*>(value_); + *val = std::stoll(value); + break; + } + case FlagType::UINT64: { + uint64_t* val = static_cast<uint64_t*>(value_); + *val = std::stoull(value); + break; + } + case FlagType::DOUBLE: { + double* val = static_cast<double*>(value_); + *val = std::stod(value); + break; + } + case FlagType::STRING: { + std::string* val = static_cast<std::string*>(value_); + *val = value; + break; + } + default: { + LOG_FLAG_FATAL_ERROR("flag type is undefined."); + } + } + } catch (const std::exception& e) { + std::string error_msg = "value: \"" + value + "\" is invalid for " + + FlagType2String(type_) + " flag \"" + name_ + "\""; + if (type_ == FlagType::BOOL) { + error_msg += e.what(); + } else { + error_msg += "."; + } + LOG_FLAG_ERROR(error_msg); + } +} + +void FlagRegistry::RegisterFlag(Flag* flag) { + auto iter = flags_.find(flag->name_); + if
(iter != flags_.end()) { + LOG_FLAG_FATAL_ERROR("illegal RegisterFlag, flag \"" + flag->name_ + + "\" has been defined in " + iter->second->file_); + } else { + std::lock_guard<std::mutex> lock(mutex_); + flags_[flag->name_] = flag; + flags_by_file_[flag->file_].insert(flag); + } +} + +bool FlagRegistry::SetFlagValue(const std::string& name, + const std::string& value) { + if (HasFlag(name)) { + std::lock_guard<std::mutex> lock(mutex_); + flags_[name]->SetValueFromString(value); + return true; + } else { + LOG_FLAG_ERROR("illegal SetFlagValue, flag \"" + name + + "\" is not defined."); + return false; + } +} + +bool FlagRegistry::HasFlag(const std::string& name) const { + return flags_.find(name) != flags_.end(); +} + +void FlagRegistry::PrintAllFlagHelp(std::ostream& os) const { + for (const auto& iter : flags_by_file_) { + os << std::endl << "Flags defined in " << iter.first << ":" << std::endl; + for (const auto& flag : iter.second) { + os << " " << flag->Summary() << std::endl; + } + } + os << std::endl; +} + +void PrintAllFlagHelp(bool to_file, const std::string& file_path) { + if (to_file) { + std::ofstream fout(file_path); + FlagRegistry::Instance()->PrintAllFlagHelp(fout); + } else { + FlagRegistry::Instance()->PrintAllFlagHelp(std::cout); + } +} + +bool SetFlagValue(const std::string& name, const std::string& value) { + return FlagRegistry::Instance()->SetFlagValue(name, value); +} + +bool FindFlag(const std::string& name) { + return FlagRegistry::Instance()->HasFlag(name); +} + +bool GetValueFromEnv(const std::string& name, std::string* value) { + const char* env_var = std::getenv(name.c_str()); + if (env_var == nullptr) { + return false; + } + *value = std::string(env_var); + return true; +} + +void SetFlagsFromEnv(const std::vector<std::string>& flags, bool error_fatal) { + bool success = true; + for (const std::string& flag_name : flags) { + std::string env_var_name = std::string("FLAGS_") + flag_name; + std::string env_var_value; + if (GetValueFromEnv(env_var_name, &env_var_value)) { + success = + FlagRegistry::Instance()->SetFlagValue(flag_name, env_var_value); + } else if (error_fatal) { + LOG_FLAG_ERROR("environment variable \"" + env_var_name + + "\" is not set."); + success = false; + } + } + if (error_fatal && !success) { + exit_with_errors(); + } +} + +static bool allow_undefined_flags = false; + +void AllowUndefinedFlags() { allow_undefined_flags = true; } + +void ParseCommandLineFlags(int* pargc, char*** pargv) { + assert(*pargc > 0); + size_t argv_num = *pargc - 1; + std::vector<std::string> argvs(*pargv + 1, *pargv + *pargc); + + std::string arg_format_help = + "please follow the formats: \"--help(h)\", \"--name=value\"" + " or \"--name value\"."; + for (size_t i = 0; i < argv_num; i++) { + const std::string& argv = argvs[i]; + + if (argv.size() < 2 || argv[0] != '-') { + LOG_FLAG_FATAL_ERROR("invalid commandline argument: \"" + argv + "\", " + + arg_format_help); + } + + // parse arg name and value + size_t hyphen_num = argv[1] == '-' ? 2 : 1; + std::string name, value; + size_t split_pos = argv.find('='); + if (split_pos == std::string::npos) { + // the argv format is "--name" or "--name value" + name = argv.substr(hyphen_num); + if (name.empty()) { + LOG_FLAG_FATAL_ERROR("invalid commandline argument: \"" + argv + + "\", " + arg_format_help); + } + + // print help message + if (name == "help" || name == "h") { + FlagRegistry::Instance()->PrintAllFlagHelp(std::cout); + exit(1); + } + + // get the value from next argv.
+ if (++i == argv_num) { + LOG_FLAG_FATAL_ERROR("expected value of flag \"" + name + + "\" but found none."); + } else { + value = argvs[i]; + } + } else { + // the argv format is "--name=value" + if (split_pos == hyphen_num || split_pos == argv.size() - 1) { + LOG_FLAG_FATAL_ERROR("invalid commandline argument: \"" + argv + + "\", " + arg_format_help); + } + name = argv.substr(hyphen_num, split_pos - hyphen_num); + value = argv.substr(split_pos + 1); + } + + // special case for flag value enclosed in "" + if (value[0] == '"') { + value = value.substr(1); + if (value.back() == '"') { + value.pop_back(); + } else { + while (i < argv_num) { + value += " "; + value += argvs[++i]; + if (value.back() == '"') { + break; + } + } + if (value.back() == '"') { + value.pop_back(); + } else { + LOG_FLAG_FATAL_ERROR("unexpected end of flag \"" + name + + "\" value while looking for matching `\"'"); + } + } + } + + if (name == "fromenv" || name == "tryfromenv") { + // Value of --fromenv or --tryfromenv should be + // a comma separated list of env var names. + std::vector<std::string> env_flag_names; + for (size_t start_pos = 0, end_pos = 0; + start_pos < value.size() && end_pos != std::string::npos; + start_pos = end_pos + 1) { + end_pos = value.find(',', start_pos); + env_flag_names.push_back(value.substr(start_pos, end_pos - start_pos)); + } + if (name == "fromenv") { + SetFlagsFromEnv(env_flag_names, true); + } else { + SetFlagsFromEnv(env_flag_names, false); + } + continue; + } + + FlagRegistry::Instance()->SetFlagValue(name, value); + } + if (!allow_undefined_flags && !ErrorStream().str().empty()) { + exit_with_errors(); + } +} + +} // namespace flags +} // namespace paddle diff --git a/paddle/utils/flags_native.h b/paddle/utils/flags_native.h new file mode 100644 index 00000000000000..04814a4f679507 --- /dev/null +++ b/paddle/utils/flags_native.h @@ -0,0 +1,131 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <string> +#include <vector> + +// This is a simple commandline flags tool for paddle, which is inspired by +// gflags but only implements the following necessary features: +// 1. Define or declare a flag. +// 2. Parse commandline flags. +// 3. Other utility functions. + +namespace paddle { +namespace flags { +/** + * @brief Parse commandline flags. + * + * It receives the commandline arguments passed in argc and argv from the main + * function; argv[0] is the program name, and argv[1:] are the commandline + * arguments, which match the format "--name=value" or "--name value". After + * parsing, the corresponding flag values will be updated. + */ +void ParseCommandLineFlags(int* argc, char*** argv); + +/** + * @brief Allow undefined flags in ParseCommandLineFlags() + */ +void AllowUndefinedFlags(); + +/** + * @brief Set flags from environment variables.
+ * + * It receives a list of flag names, and will look up the corresponding + * environment variables named "FLAGS_name"; if found, the flag is set to the + * environment variable's value. If error_fatal is true, the program will exit + * when the environment variable is not set or the flag is not defined, which + * has the same effect as the commandline argument + * "--fromenv=var_name1,var_name2,...". Otherwise, the errors above will be + * ignored, which has the same effect as the commandline argument + * "--tryfromenv=var_name1,var_name2,...". + */ +void SetFlagsFromEnv(const std::vector<std::string>& flags, bool error_fatal); + +/** + * @brief Set a single flag value, return true on success. + */ +bool SetFlagValue(const std::string& name, const std::string& value); + +/** + * @brief Find a flag by name, return true if found. + */ +bool FindFlag(const std::string& name); + +/** + * @brief Print all registered flags' help message. If to_file is true, + * write the help message to a file. + */ +void PrintAllFlagHelp(bool to_file = false, + const std::string& file_name = "all_flags.txt"); +} // namespace flags +} // namespace paddle + +// ----------------------------DECLARE FLAGS---------------------------- +#define PD_DECLARE_VARIABLE(type, name) \ + namespace paddle { \ + namespace flags { \ + extern type FLAGS_##name; \ + } \ + } \ + using paddle::flags::FLAGS_##name + +#define PD_DECLARE_bool(name) PD_DECLARE_VARIABLE(bool, name) +#define PD_DECLARE_int32(name) PD_DECLARE_VARIABLE(int32_t, name) +#define PD_DECLARE_uint32(name) PD_DECLARE_VARIABLE(uint32_t, name) +#define PD_DECLARE_int64(name) PD_DECLARE_VARIABLE(int64_t, name) +#define PD_DECLARE_uint64(name) PD_DECLARE_VARIABLE(uint64_t, name) +#define PD_DECLARE_double(name) PD_DECLARE_VARIABLE(double, name) +#define PD_DECLARE_string(name) PD_DECLARE_VARIABLE(std::string, name) + +namespace paddle { +namespace flags { +class FlagRegisterer { + public: + template <typename T> + FlagRegisterer(std::string name, + std::string description, + std::string file, + const T* default_value, + T* value); +}; +} // namespace flags +} // namespace paddle + +// ----------------------------DEFINE FLAGS---------------------------- +#define PD_DEFINE_VARIABLE(type, name, default_value, description) \ + namespace paddle { \ + namespace flags { \ + static const type FLAGS_##name##_default = default_value; \ + type FLAGS_##name = default_value; \ + /* Register FLAG */ \ + static ::paddle::flags::FlagRegisterer flag_##name##_registerer( \ + #name, description, __FILE__, &FLAGS_##name##_default, &FLAGS_##name); \ + } \ + } \ + using paddle::flags::FLAGS_##name + +#define PD_DEFINE_bool(name, val, txt) PD_DEFINE_VARIABLE(bool, name, val, txt) +#define PD_DEFINE_int32(name, val, txt) \ + PD_DEFINE_VARIABLE(int32_t, name, val, txt) +#define PD_DEFINE_uint32(name, val, txt) \ + PD_DEFINE_VARIABLE(uint32_t, name, val, txt) +#define PD_DEFINE_int64(name, val, txt) \ + PD_DEFINE_VARIABLE(int64_t, name, val, txt) +#define PD_DEFINE_uint64(name, val, txt) \ + PD_DEFINE_VARIABLE(uint64_t, name, val, txt) +#define PD_DEFINE_double(name, val, txt) \ + PD_DEFINE_VARIABLE(double, name, val, txt) +#define PD_DEFINE_string(name, val, txt) \ + PD_DEFINE_VARIABLE(std::string, name, val, txt)
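Putting the pieces of this header together, typical usage looks roughly like the sketch below; this is an illustrative sketch, not part of the patch, and the program and flag name are hypothetical. ParseCommandLineFlags accepts both "--name=value" and "--name value", prints every registered flag's summary and exits on "--help"/"-h", and treats unknown flags as fatal unless AllowUndefinedFlags() was called first.

#include "paddle/utils/flags_native.h"

PD_DEFINE_string(hypothetical_model_dir, "./model", "where to load the model from");

int main(int argc, char** argv) {
  paddle::flags::AllowUndefinedFlags();  // tolerate flags we did not define
  paddle::flags::ParseCommandLineFlags(&argc, &argv);
  // Flags can also be inspected and changed programmatically after parsing.
  if (paddle::flags::FindFlag("hypothetical_model_dir")) {
    paddle::flags::SetFlagValue("hypothetical_model_dir", "/tmp/model");
  }
  return 0;
}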
diff --git a/paddle/utils/flags_native_test.cc b/paddle/utils/flags_native_test.cc new file mode 100644 index 00000000000000..2b26a8cd1e9dbb --- /dev/null +++ b/paddle/utils/flags_native_test.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/utils/flags_native.h" + +#include <cstdlib> +#include "gtest/gtest.h" + +PD_DEFINE_int32(paddle_test_int32, 1, "test int32 flag"); +PD_DEFINE_uint32(paddle_test_uint32, 2, "test uint32 flag"); +PD_DEFINE_string(paddle_test_string, "raw", "test string flag"); + +using namespace paddle::flags; // NOLINT + +void SplitCommandlineArg(const std::string& commandline, + int* argc, + char*** argv) { + static std::vector<std::string> args; + args.clear(); + for (size_t start_pos = 0, end_pos = 0; + start_pos < commandline.size() && end_pos != std::string::npos; + start_pos = end_pos + 1) { + end_pos = commandline.find(' ', start_pos); + args.push_back(commandline.substr(start_pos, end_pos - start_pos)); + } + *argc = args.size(); + *argv = new char*[*argc]; + for (size_t i = 0; i < args.size(); i++) { + (*argv)[i] = const_cast<char*>(args[i].c_str()); + } +} + +TEST(flags_native_test, ParseCommandLineFlags) { + uint32_t test_int32 = 2; + ASSERT_EQ(FLAGS_paddle_test_int32, 1); + ASSERT_EQ(FLAGS_paddle_test_uint32, test_int32); + ASSERT_EQ(FLAGS_paddle_test_string, "raw"); + + // Construct commandline arguments input + std::string commandline = + "test --paddle_test_int32=3 --paddle_test_uint32=\"4\" " + "--paddle_test_string \"modified string\""; + int argc; + char** argv; + SplitCommandlineArg(commandline, &argc, &argv); + + // Parse commandline flags and check + ParseCommandLineFlags(&argc, &argv); + delete[] argv; + + test_int32 = 4; + ASSERT_EQ(FLAGS_paddle_test_int32, 3); + ASSERT_EQ(FLAGS_paddle_test_uint32, test_int32); + ASSERT_EQ(FLAGS_paddle_test_string, "modified string"); + + // test FindFlag and SetFlagValue + ASSERT_TRUE(FindFlag("paddle_test_int32")); + + SetFlagValue("paddle_test_int32", "9"); + ASSERT_EQ(FLAGS_paddle_test_int32, 9); +} + +#if defined(_POSIX_C_SOURCE) && \ + _POSIX_C_SOURCE >= 200112L // environments where setenv is available +bool SetEnvVar(const std::string& var_name, const std::string& var_value) { + int res = setenv(var_name.c_str(), var_value.c_str(), 1); + return res == 0; +} + +PD_DEFINE_bool(paddle_test_env_bool, false, "test env bool flag"); +PD_DEFINE_double(paddle_test_env_double, 3.14, "test env double flag"); + +TEST(flags_native_test, SetFlagsFromEnv) { + ASSERT_EQ(FLAGS_paddle_test_env_bool, false); + ASSERT_EQ(FLAGS_paddle_test_env_double, 3.14); + + ASSERT_TRUE(SetEnvVar("FLAGS_paddle_test_env_bool", "true")); + ASSERT_TRUE(SetEnvVar("FLAGS_paddle_test_env_double", "2.71")); + + std::string commandline = + "test --fromenv=paddle_test_env_bool,paddle_test_env_double"; + int argc; + char** argv; + SplitCommandlineArg(commandline, &argc, &argv); + ParseCommandLineFlags(&argc, &argv); + delete[] argv; + + ASSERT_EQ(FLAGS_paddle_test_env_bool, true); + ASSERT_EQ(FLAGS_paddle_test_env_double, 2.71); +} +#endif + +TEST(flags_native_test, PrintAllFlagHelp) { PrintAllFlagHelp(); }
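One pattern the tests above do not exercise: a flag is defined in exactly one translation unit and referenced everywhere else through the declare macros. A hypothetical sketch of that split, using the wrapper header from earlier in this patch:

// producer.cc -- defines the flag (exactly one translation unit)
#include "paddle/utils/flags.h"
PD_DEFINE_bool(hypothetical_verbose, false, "enable verbose output");

// consumer.cc -- uses the same flag without redefining it
#include "paddle/utils/flags.h"
PD_DECLARE_bool(hypothetical_verbose);

bool ShouldLogVerbosely() { return FLAGS_hypothetical_verbose; }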
diff --git a/paddle/utils/string/pretty_log.cc b/paddle/utils/string/pretty_log.cc index 44bbbd0cc1bb62..bb84ad201848ce 100644 --- a/paddle/utils/string/pretty_log.cc +++ b/paddle/utils/string/pretty_log.cc @@ -14,9 +14,9 @@ #include "paddle/utils/string/pretty_log.h" -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" -DEFINE_bool(color, true, "Whether to turn on pretty log"); // NOLINT +PD_DEFINE_bool(color, true, "Whether to turn on pretty log"); // NOLINT namespace paddle { namespace string {} // namespace string diff --git a/paddle/utils/string/pretty_log.h b/paddle/utils/string/pretty_log.h index 9de7ce24abd72d..546bf1eec7d1da 100644 --- a/paddle/utils/string/pretty_log.h +++ b/paddle/utils/string/pretty_log.h @@ -18,10 +18,10 @@ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "paddle/utils/string/printf.h" -DECLARE_bool(color); +PD_DECLARE_bool(color); namespace paddle { diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index ca7ab3485e52d6..b59aadf5cc0df2 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -478,9 +478,10 @@ disable_static() -from .new_ir_utils import _switch_to_new_ir # noqa: F401 +from .new_ir_utils import IrChange # noqa: F401 -_switch_to_new_ir() +ir_change = IrChange() +ir_change._switch_to_new_ir() __all__ = [ # noqa 'iinfo', diff --git a/python/paddle/autograd/backward.py b/python/paddle/autograd/ir_backward.py similarity index 99% rename from python/paddle/autograd/backward.py rename to python/paddle/autograd/ir_backward.py index 4439d5e172efd0..fc2619bbdcf5b3 100644 --- a/python/paddle/autograd/backward.py +++ b/python/paddle/autograd/ir_backward.py @@ -707,26 +707,26 @@ def grad( outputs, 'outputs', ((paddle.ir.Value, paddle.ir.OpResult), list, tuple), - 'paddle.autograd.backward.grad', + 'paddle.autograd.ir_backward.grad', ) check_type( inputs, 'inputs', ((paddle.ir.Value, paddle.ir.OpResult), list, tuple), - 'paddle.autograd.backward.grad', + 'paddle.autograd.ir_backward.grad', ) check_type( grad_outputs, 'grad_outputs', ((paddle.ir.Value, paddle.ir.OpResult), list, tuple, type(None)), - 'paddle.autograd.backward.grad', + 'paddle.autograd.ir_backward.grad', ) check_type( no_grad_vars, 'no_grad_vars', ((paddle.ir.Value, paddle.ir.OpResult), list, tuple, set, type(None)), - 'paddle.autograd.backward.grad', + 'paddle.autograd.ir_backward.grad', ) outputs = _as_list(outputs) inputs = _as_list(inputs) diff --git a/python/paddle/decomposition/decomp.py b/python/paddle/decomposition/decomp.py index 9bd288dacd4816..940249d0ae5265 100644 --- a/python/paddle/decomposition/decomp.py +++ b/python/paddle/decomposition/decomp.py @@ -27,7 +27,7 @@ def _build_tensor_tuple(xs): return (xs,) elif isinstance(xs, typing.Sequence): return tuple(xs) - return TypeError(f"Type {type(xs)} is not supported") + raise TypeError(f"Type {type(xs)} is not supported.") def _prepare_python_api_arguments(op): @@ -125,6 +125,8 @@ def decompose( Returns: dst_vars (list): A list contains all vars which replace origin ones in src_vars.
""" + if not core._is_fwd_prim_enabled(): + return src_vars if not isinstance(program, Program): raise TypeError(f"Expect type Program, but got type {type(program)}.") block = program.block() diff --git a/python/paddle/decomposition/rules.py b/python/paddle/decomposition/rules.py index ec8959cc960c0a..4184dbdbea62e4 100644 --- a/python/paddle/decomposition/rules.py +++ b/python/paddle/decomposition/rules.py @@ -20,8 +20,9 @@ def mean(x, axis, keepdim): """define composite rule of op mean""" x_shape = x.shape - axes = axis or tuple(range(0, len(x_shape))) - axes = (axes,) if isinstance(axes, int) else axes + if axis in (None, []): + axis = tuple(range(0, len(x_shape))) + axes = (axis,) if isinstance(axis, int) else axis sum_x = sum(x, axis=axes, keepdim=keepdim) value_to_fill = 1 for axis in axes: diff --git a/python/paddle/distributed/auto_parallel/static/cluster.py b/python/paddle/distributed/auto_parallel/static/cluster.py index b9a37cf66b2e63..3145153893f479 100644 --- a/python/paddle/distributed/auto_parallel/static/cluster.py +++ b/python/paddle/distributed/auto_parallel/static/cluster.py @@ -429,6 +429,7 @@ def __init__(self): # This property only be valid when the cluster consists of machines, # which have the same number accelerators. self._num_devices_per_machine = None + self._gpu_model = None def gen_default_config_cluster( self, @@ -451,6 +452,7 @@ def gen_default_config_cluster( dcu_models = ["DCU"] all_gpu_models = gpu_models + xpu_models + dcu_models self._num_devices_per_machine = device_count + self._gpu_model = gpu_model def _convert_to_type(gpu_model): type = None diff --git a/python/paddle/distributed/auto_parallel/static/cost/__init__.py b/python/paddle/distributed/auto_parallel/static/cost/__init__.py index e8ba0300d45dfe..33af1093c746dd 100644 --- a/python/paddle/distributed/auto_parallel/static/cost/__init__.py +++ b/python/paddle/distributed/auto_parallel/static/cost/__init__.py @@ -22,6 +22,7 @@ from .base_cost import build_comm_desc_from_dist_op from .base_cost import build_comm_costs_from_descs from .base_cost import build_comp_costs_from_descs +from .base_cost import calc_time_by_cost_model from .comp_op_cost import EmbeddingOpCost from .comp_op_cost import EmbeddingGradOpCost diff --git a/python/paddle/distributed/auto_parallel/static/cost/base_cost.py b/python/paddle/distributed/auto_parallel/static/cost/base_cost.py index 77f590b0ce8558..197656d6ea8454 100644 --- a/python/paddle/distributed/auto_parallel/static/cost/base_cost.py +++ b/python/paddle/distributed/auto_parallel/static/cost/base_cost.py @@ -19,7 +19,7 @@ import paddle from paddle.utils.flops import flops -from ..cluster import LinkType +from ..cluster import LinkType, get_default_cluster from ..dist_tensor import DistributedTensor from ..process_group import get_process_group from ..utils import _get_comm_group, _get_idx_in_axis @@ -785,9 +785,12 @@ def comm_count(self): if self.op is not None: vars = self.op.block.vars # NOTE: The tensor communicated input_name is "X" in default. 
Otherwise, this function should be overridden - var_name = self.op.input("X")[0] + try: + var_name = self.op.input("X")[0] + except: + var_name = self.op.output("Out")[0] var = get_var_with_recursion( - var_name, self.op.block, self.program + var_name, self.op.block, self.op.block.program ) dtype = var.dtype shape = var.shape @@ -838,7 +841,7 @@ def group_ranks(self): if self.op_desc is not None: self._group_ranks = self.op_desc["group_ranks"] elif self.op is not None: - ring_id = self.op.attrs("ring_id") + ring_id = self.op.attr("ring_id") process_group = get_process_group(ring_id) if process_group is None: raise ValueError( @@ -921,3 +924,57 @@ def calc_time_by_modeling(op=None, desc=None, cluster=None): ) time = op_cost.calc_time() return time + + +def calc_time_by_cost_model(op, cluster=None): + """Calc op time by cost model and the unit is microsecond.""" + if not isinstance(op, paddle.fluid.framework.Operator): + raise TypeError( + "OP must be paddle.fluid.framework.Operator, but got {}.".format( + type(op) + ) + ) + if not cluster: + cluster = get_default_cluster() + time = 0.0 + op_type = op.type + # calc comp op time by flops + if op_type not in NON_COMP_TYPE: + attrs = op.all_attrs() + # build comp op inputs desc to calc flops. + # for example, a matmul op inputs desc will be {"X": [(1024, 1024)], "Y": [(1024, 1024)]} + inputs = {} + for input_name in op.input_names: + var_names = op.input(input_name) + inputs[input_name] = [] + for var_name in var_names: + var = op.block._var_recursive(var_name) + inputs[input_name].append(var.shape) + + # empirically, the time of a grad operator is about twice that of its forward operator + if "_grad" in op_type: + op_type = op_type[: len(op_type) - 5] + flops_count = 2 * flops(op_type, inputs, attrs) + else: + flops_count = flops(op_type, inputs, attrs) + + if cluster._gpu_model == "V100": + time = flops_count * 2.9e-7 * 2.6 + elif cluster._gpu_model == "A100": + time = flops_count * 2.9e-7 + else: + raise ValueError( + "Only V100 and A100 GPUs are supported currently." + ) + + # calc comm op time by communication modeling formula + elif op_type in COMM_OP_TYPE: + op_cost = _g_op_cost_factory[op_type]( + op=op, comm_context=CommContext(cluster) + ) + time = op_cost.calc_time() + + else: + raise ValueError(f"The {op_type} op is not supported currently.") + + return time
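To make the constants above concrete, a back-of-the-envelope example with purely illustrative numbers: suppose flops() reports 2 * 1024**3 FLOPs for a hypothetical forward op. The time estimates, in microseconds as the docstring states, come out as follows.

# Illustrative arithmetic only; mirrors the V100/A100 branches above.
flops_count = 2 * 1024**3                # hypothetical flops() result
v100_time = flops_count * 2.9e-7 * 2.6   # ~1619 us on the V100 branch
a100_time = flops_count * 2.9e-7         # ~623 us on the A100 branch
grad_time = 2 * flops_count * 2.9e-7     # grad ops are costed at 2x flops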
diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 0e3c316a5470ca..2b66ac4e16a22d 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -242,6 +242,14 @@ elif self._strategy.pipeline.enable: self._acc_steps = self._strategy.pipeline.accumulate_steps + if ( + self._strategy.pipeline.enable + and self._strategy.pipeline.schedule_mode == "1F1B" + ): + assert ( + os.getenv("CUDA_MODULE_LOADING") != "LAZY" + ), "EXP_CUDA_MODULE_LOADING_LAZY not supported in 1F1B pipeline." + self.history = None paddle.framework.set_flags({'FLAGS_new_executor_sequential_run': 1}) @@ -941,6 +949,7 @@ def fit( ... batch_size=64) """ self._mode = 'train' + self._inputs_spec, self._labels_spec = self._prepare_data_spec( train_data, train_sample_split, batch_size ) @@ -1003,14 +1012,14 @@ def fit( logs = {} cbks.on_epoch_begin(epoch) - for step, data in enumerate(train_dataloader): + for step, batch in enumerate(train_dataloader): if auto_utils.use_new_executor(): - feeds = self._validate_feed(data) + batches = self._validate_batch(batch) else: - feeds = [{}] + batches = [{}] try: - for micro_feed in feeds: + for micro_batch in batches: with paddle.profiler.utils._nvprof_range( iter_id=step, start=nvprof_range[0], @@ -1019,7 +1028,7 @@ cbks.on_batch_begin('train', step, logs) outs = self._executor.run( self.main_program, - feed=micro_feed, + feed=micro_batch, fetch_list=fetch_names, use_program_cache=self._strategy.use_cache, return_numpy=self._strategy.return_numpy, @@ -1128,12 +1137,13 @@ def evaluate( self._inputs_spec, self._labels_spec = self._prepare_data_spec( valid_data, valid_sample_split, batch_size ) - micro_batch_size = self._validate_batch_size(batch_size) + if not self._has_prepared[self._mode]: self._prepare_program(self._mode) else: self._switch_mode(self._mode) + micro_batch_size = self._validate_batch_size(batch_size) valid_dataloader = self._prepare_dataloader_from_generator( dataset=valid_data, capacity=70, @@ -1235,12 +1245,13 @@ def predict( self._inputs_spec, self._labels_spec = self._prepare_data_spec( test_data, test_sample_split, batch_size ) - micro_batch_size = self._validate_batch_size(batch_size) + if not self._has_prepared[self._mode]: self._prepare_program(self._mode) else: self._switch_mode(self._mode) + micro_batch_size = self._validate_batch_size(batch_size) test_dataloader = self._prepare_dataloader_from_generator( dataset=test_data, capacity=70, @@ -1296,19 +1307,21 @@ def dataloader( ): if mode is not None: self.to_mode(mode) + self._inputs_spec, self._labels_spec = self._prepare_data_spec( dataset, sample_split, batch_size ) - micro_batch_size = self._validate_batch_size(batch_size) + if not self._has_prepared[self._mode]: self._prepare_program(self._mode) else: self._switch_mode(self._mode) + batch_size = self._validate_batch_size(batch_size) dataloader = self._prepare_dataloader( dataset, return_list=False, - batch_size=micro_batch_size, + batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, collate_fn=collate_fn, @@ -1343,12 +1356,13 @@ def dataloader_from_generator( self._inputs_spec, self._labels_spec = self._prepare_data_spec( dataset, sample_split, batch_size ) - micro_batch_size = self._validate_batch_size(batch_size) + if not self._has_prepared[self._mode]: self._prepare_program(self._mode) else: self._switch_mode(self._mode) + micro_batch_size = self._validate_batch_size(batch_size) dataloader = self._prepare_dataloader_from_generator( dataset=dataset, capacity=capacity,
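The _validate_batch rewrite in the next hunk splits one gradient-merge batch along axis 0 into self._acc_steps micro-batches; a standalone sketch of that splitting, with hypothetical function and names:

import numpy as np

def split_into_micro_batches(batch, acc_steps):
    # batch has the dataloader's shape, [{feed_name: array-like}]; each
    # array's leading dimension must be divisible by acc_steps.
    feed_names = list(batch[0].keys())
    split_batches = [
        np.split(np.array(batch[0][name]), acc_steps, 0) for name in feed_names
    ]
    return [
        dict(zip(feed_names, [parts[i] for parts in split_batches]))
        for i in range(acc_steps)
    ]

# e.g. 8 samples with acc_steps=4 yield 4 micro-batches of 2 samples each
micro = split_into_micro_batches([{"x": np.zeros((8, 16))}], 4)
assert len(micro) == 4 and micro[0]["x"].shape == (2, 16)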
different data parallel groups".format( + len(set(self._dp_world_sizes)) + ) + assert ( + batch_size % self._dp_world_sizes[0] == 0 + ), "batch_size [{}] is not divisible by dp_world_size [{}]".format( + str(batch_size), str(self._dp_world_sizes[0]) + ) + return batch_size // self._dp_world_sizes[0] + else: + assert ( + batch_size % self._acc_steps == 0 + ), "Requires batch_size:[{}] to be divisible by acc_steps:[{}].".format( + batch_size, self._acc_steps + ) + return batch_size // self._acc_steps + + def _validate_batch(self, batch): + if batch is None: return [None] - # pp with schedule or navie-pp + if self._strategy.pipeline.enable or self._acc_steps == 1: - return feed - - # split feed data with gradient_merge k_steps - feed_names = [] - split_feeds = [] - for feed_name, cur_feed in feed[0].items(): - feed_names.append(feed_name) - split_feeds.append(np.split(np.array(cur_feed), self._acc_steps, 0)) - micro_feeds = [] - for i in range(self._acc_steps): - split_feed = [sf[i] for sf in split_feeds] - micro_feeds.append(dict(zip(feed_names, split_feed))) - return micro_feeds + # pp with schedule or navie-pp + return batch + else: + # split feed data with gradient_merge k_steps + feed_names = [] + split_batches = [] + for feed_name, cur_feed in batch[0].items(): + feed_names.append(feed_name) + split_batches.append( + np.split(np.array(cur_feed), self._acc_steps, 0) + ) + baches = [] + for i in range(self._acc_steps): + micro_batch = [split_batch[i] for split_batch in split_batches] + baches.append(dict(zip(feed_names, micro_batch))) + return baches def _validate_spec(self, specs): specs = auto_utils.to_list(specs) diff --git a/python/paddle/distributed/auto_parallel/static/process_group.py b/python/paddle/distributed/auto_parallel/static/process_group.py index 285184db14cd9b..9bbe1fbcf48e10 100644 --- a/python/paddle/distributed/auto_parallel/static/process_group.py +++ b/python/paddle/distributed/auto_parallel/static/process_group.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License +import os from collections import OrderedDict import paddle @@ -146,9 +147,6 @@ def instantiate(self): global_rank = genv.rank if self.nranks >= 2 and global_rank in self.ranks: - logger.info( - f"group_id: {self.id}, ranks: {self.ranks}, nranks: {self.nranks}, trainer_endpoints: {genv.current_endpoint}" - ) strategy = core.ParallelStrategy() strategy.nranks = self.nranks strategy.local_rank = self.local_rank(global_rank) @@ -159,9 +157,22 @@ def instantiate(self): strategy.nrings = 1 if core.is_compiled_with_cuda(): place = core.CUDAPlace(genv.device_id) - core.NCCLParallelContext(strategy, place).init_with_ring_id( - ring_id + use_new_comm = os.getenv( + "FLAGS_dynamic_static_unified_comm", "0" ) + if use_new_comm in ["1", "True", "true"]: + store = core.create_or_get_global_tcp_store() + core.CommContextManager.set_device_id(genv.device_id) + core.CommContextManager.create_nccl_comm_context( + store, + str(ring_id), + strategy.local_rank, + strategy.nranks, + ) + else: + core.NCCLParallelContext(strategy, place).init_with_ring_id( + ring_id + ) elif core.is_compiled_with_xpu(): place = core.XPUPlace(genv.device_id) core.BKCLParallelContext(strategy, place).init_with_ring_id( diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index b1d61196e056d9..144be980a75999 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -330,7 +330,7 @@ def 
_init_parallel_env(backend): store, "0", rank, world_size ) elif backend == "nccl": - core.CommContextManager.set_cuda_device_id(dev_id) + core.CommContextManager.set_device_id(dev_id) core.CommContextManager.create_nccl_comm_context( store, "0", rank, world_size ) diff --git a/python/paddle/distributed/entry_attr.py b/python/paddle/distributed/entry_attr.py index dcd5153bb5f1ea..831fff7e2c9e53 100644 --- a/python/paddle/distributed/entry_attr.py +++ b/python/paddle/distributed/entry_attr.py @@ -22,22 +22,26 @@ class EntryAttr: Examples: .. code-block:: python - import paddle + >>> import paddle + >>> paddle.enable_static() - sparse_feature_dim = 1024 - embedding_size = 64 + >>> sparse_feature_dim = 1024 + >>> embedding_size = 64 - entry = paddle.distributed.ProbabilityEntry(0.1) + >>> entry = paddle.distributed.ProbabilityEntry(0.1) - input = paddle.static.data(name='ins', shape=[1], dtype='int64') + >>> input = paddle.static.data(name='ins', shape=[1], dtype='int64') - emb = paddle.static.nn.sparse_embedding(( - input=input, - size=[sparse_feature_dim, embedding_size], - is_test=False, - entry=entry, - param_attr=paddle.ParamAttr(name="SparseFeatFactors", - initializer=paddle.nn.initializer.Uniform())) + >>> emb = paddle.static.nn.sparse_embedding( + ... input=input, + ... size=[sparse_feature_dim, embedding_size], + ... is_test=False, + ... entry=entry, + ... param_attr=paddle.ParamAttr( + ... name="SparseFeatFactors", + ... initializer=paddle.nn.initializer.Uniform() + ... ) + ... ) """ @@ -59,22 +63,26 @@ class ProbabilityEntry(EntryAttr): Examples: .. code-block:: python - import paddle + >>> import paddle + >>> paddle.enable_static() - sparse_feature_dim = 1024 - embedding_size = 64 + >>> sparse_feature_dim = 1024 + >>> embedding_size = 64 - entry = paddle.distributed.ProbabilityEntry(0.1) + >>> entry = paddle.distributed.ProbabilityEntry(0.1) - input = paddle.static.data(name='ins', shape=[1], dtype='int64') + >>> input = paddle.static.data(name='ins', shape=[1], dtype='int64') - emb = paddle.static.nn.sparse_embedding(( - input=input, - size=[sparse_feature_dim, embedding_size], - is_test=False, - entry=entry, - param_attr=paddle.ParamAttr(name="SparseFeatFactors", - initializer=paddle.nn.initializer.Uniform())) + >>> emb = paddle.static.nn.sparse_embedding( + ... input=input, + ... size=[sparse_feature_dim, embedding_size], + ... is_test=False, + ... entry=entry, + ... param_attr=paddle.ParamAttr( + ... name="SparseFeatFactors", + ... initializer=paddle.nn.initializer.Uniform() + ... ) + ... ) """ @@ -100,22 +108,26 @@ class CountFilterEntry(EntryAttr): Examples: .. code-block:: python - import paddle + >>> import paddle + >>> paddle.enable_static() - sparse_feature_dim = 1024 - embedding_size = 64 + >>> sparse_feature_dim = 1024 + >>> embedding_size = 64 - entry = paddle.distributed.CountFilterEntry(10) + >>> entry = paddle.distributed.CountFilterEntry(10) - input = paddle.static.data(name='ins', shape=[1], dtype='int64') + >>> input = paddle.static.data(name='ins', shape=[1], dtype='int64') - emb = paddle.static.nn.sparse_embedding(( - input=input, - size=[sparse_feature_dim, embedding_size], - is_test=False, - entry=entry, - param_attr=paddle.ParamAttr(name="SparseFeatFactors", - initializer=paddle.nn.initializer.Uniform())) + >>> emb = paddle.static.nn.sparse_embedding( + ... input=input, + ... size=[sparse_feature_dim, embedding_size], + ... is_test=False, + ... entry=entry, + ... param_attr=paddle.ParamAttr( + ... name="SparseFeatFactors", + ... 
initializer=paddle.nn.initializer.Uniform() + ... ) + ... ) """ @@ -144,25 +156,28 @@ class ShowClickEntry(EntryAttr): Examples: .. code-block:: python - import paddle - paddle.enable_static() + >>> import paddle + >>> paddle.enable_static() - sparse_feature_dim = 1024 - embedding_size = 64 + >>> sparse_feature_dim = 1024 + >>> embedding_size = 64 - shows = paddle.static.data(name='show', shape=[1], dtype='int64') - clicks = paddle.static.data(name='click', shape=[1], dtype='int64') - input = paddle.static.data(name='ins', shape=[1], dtype='int64') + >>> shows = paddle.static.data(name='show', shape=[1], dtype='int64') + >>> clicks = paddle.static.data(name='click', shape=[1], dtype='int64') + >>> input = paddle.static.data(name='ins', shape=[1], dtype='int64') - entry = paddle.distributed.ShowClickEntry("show", "click") + >>> entry = paddle.distributed.ShowClickEntry("show", "click") - emb = paddle.static.nn.sparse_embedding( - input=input, - size=[sparse_feature_dim, embedding_size], - is_test=False, - entry=entry, - param_attr=paddle.ParamAttr(name="SparseFeatFactors", - initializer=paddle.nn.initializer.Uniform())) + >>> emb = paddle.static.nn.sparse_embedding( + ... input=input, + ... size=[sparse_feature_dim, embedding_size], + ... is_test=False, + ... entry=entry, + ... param_attr=paddle.ParamAttr( + ... name="SparseFeatFactors", + ... initializer=paddle.nn.initializer.Uniform() + ... ) + ... ) """ diff --git a/python/paddle/distributed/fleet/recompute/recompute.py b/python/paddle/distributed/fleet/recompute/recompute.py index b3bf3889a347b5..ba22372e791475 100755 --- a/python/paddle/distributed/fleet/recompute/recompute.py +++ b/python/paddle/distributed/fleet/recompute/recompute.py @@ -31,10 +31,24 @@ def detach_variable(inputs): out = [] for inp in inputs: - if not isinstance(inp, core.eager.Tensor): + if not isinstance(inp, core.eager.Tensor) and ( + type(inp) is not tuple or not isinstance(inp[0], core.eager.Tensor) + ): + # the inp is not a tensor or not a tuple of tensors out.append(inp) continue + if type(inp) is tuple: + detach_inp = [] + for i in inp: + # detach all tensors in the tuple + assert isinstance(i, core.eager.Tensor) + tmp_i = i.detach() + tmp_i.stop_gradient = i.stop_gradient + detach_inp.append(tmp_i) + out.append(tuple(detach_inp)) + continue + x = inp.detach() x.stop_gradient = inp.stop_gradient out.append(x) @@ -42,11 +56,16 @@ def detach_variable(inputs): def check_recompute_necessary(inputs): - if not any( - not input_.stop_gradient - for input_ in inputs - if isinstance(input_, (core.eager.Tensor, paddle.Tensor)) - ): + necessary_for_each_input = [] + for input_ in inputs: + if isinstance(input_, (core.eager.Tensor, paddle.Tensor)): + necessary_for_each_input.append(input_.stop_gradient) + elif type(input_) is tuple: + for i in input_: + # traverse all tensors in the tuple + if isinstance(i, (core.eager.Tensor, paddle.Tensor)): + necessary_for_each_input.append(i.stop_gradient) + if all(necessary_for_each_input): logger.warning( "[Recompute]: None of the inputs to current recompute block need grad, " "therefore there is NO need to recompute this block in backward !" 
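The recompute hunks that follow add support for tuple-of-tensor arguments; a minimal usage sketch under that assumption, with a hypothetical layer and shapes, using the public recompute entry point:

import paddle
from paddle.distributed.fleet.utils import recompute

class PairSum(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.fc = paddle.nn.Linear(16, 16)

    def forward(self, pair):
        # pair is a tuple of tensors; per the checks added below, all of
        # its elements must share the same stop_gradient setting.
        a, b = pair
        return self.fc(a) + self.fc(b)

layer = PairSum()
x = paddle.randn([4, 16])
y = paddle.randn([4, 16])
x.stop_gradient = False
y.stop_gradient = False
# The tuple is detached on entry and re-fed when the block is recomputed
# in backward; gradients for it come back as a tuple as well.
out = recompute(layer, (x, y))
out.sum().backward()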
@@ -81,12 +100,37 @@ def forward(ctx, run_function, preserve_rng_state, *args, **kwargs): # save input for backward ctx.inputs = [] ctx.tensor_indices = [] + ctx.duplicate_tensor = [False for _ in range(len(args))] tensor_inputs = [] for i, arg in enumerate(args): if paddle.is_tensor(arg): tensor_inputs.append(arg) ctx.tensor_indices.append(i) ctx.inputs.append(None) + elif type(arg) is tuple: + is_tensors = [paddle.is_tensor(a) for a in arg] + if all(is_tensors): + # the tuple is a tuple of tensors + tensors_stop_gradient = [a.stop_gradient for a in arg] + if not all(tensors_stop_gradient) and any( + tensors_stop_gradient + ): + # tensors in the tuple have different stop_gradient values, which PyLayer doesn't support + raise ValueError( + "Recompute received a tuple containing tensors with different stop_gradient values." + ) + tensor_inputs.append(arg) + ctx.tensor_indices.append(i) + # Mark this input as a tuple of tensors + ctx.duplicate_tensor[i] = True + ctx.inputs.append(None) + elif any(is_tensors): + # the tuple contains both tensors and non-tensor values + raise ValueError( + "Recompute received a tuple containing tensor and non-tensor values at the same time." + ) + else: + ctx.inputs.append(arg) else: ctx.inputs.append(arg) ctx.save_for_backward(*tensor_inputs) @@ -132,6 +176,7 @@ def backward(ctx, *args): # Restore inputs inputs = list(ctx.inputs) tensor_indices = ctx.tensor_indices + duplicate_tensor = ctx.duplicate_tensor tensors = ctx.saved_tensor() for i, idx in enumerate(tensor_indices): inputs[idx] = tensors[i] @@ -198,18 +243,23 @@ forward_outputs_with_grad, backward_inputs_with_grad ) + grads = [] + for idx, inp in enumerate(detached_inputs): + if isinstance(inp, core.eager.Tensor): + grads.append(inp._grad_ivar()) + elif type(inp) is tuple and duplicate_tensor[idx]: + # the input is a tuple of tensors + if all(i.stop_gradient for i in inp): + # none of the tensors in the tuple needs grad; return a single None for the whole tuple + grads.append(None) + else: + # some tensors in the tuple need grad; return a tuple of grads + grads.append(tuple(i._grad_ivar() for i in inp)) + if in_dynamic_mode(): - grads = tuple( - inp._grad_ivar() - for inp in detached_inputs - if isinstance(inp, core.eager.Tensor) - ) + grads = tuple(grads) else: - grads = [ - inp._grad_ivar() - for inp in detached_inputs - if isinstance(inp, core.eager.Tensor) - ] + grads = list(grads) return grads diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py index 62e8c189bd9bc4..5a2009b2fd0f21 100644 --- a/python/paddle/distributed/models/moe/utils.py +++ b/python/paddle/distributed/models/moe/utils.py @@ -27,17 +27,17 @@ def _number_count(numbers, upper_range): out (Tensor): The output expert count. Examples: .. 
code-block:: python - # required: distributed - import paddle - - numbers = [ - [0, 2], - [0, 2] - ] - upper_range = 6 - numbers = paddle.to_tensor(numbers, dtype="int32") - number_count = paddle.distributed.utils.number_count(numbers, upper_range) - print(number_count) # the result: [2, 0, 2, 0, 0, 0] + + >>> # doctest: +REQUIRES(env: DISTRIBUTED) + >>> import paddle + >>> from paddle.distributed.models.moe import utils + >>> numbers = [[0, 2], [0, 2]] + >>> upper_range = 6 + >>> numbers = paddle.to_tensor(numbers, dtype="int64") + >>> number_count = utils._number_count(numbers, upper_range) + >>> print(number_count) + Tensor(shape=[6], dtype=int64, place=Place(gpu:0), stop_gradient=True, + [2, 0, 2, 0, 0, 0]) """ if in_dynamic_mode(): return _legacy_C_ops.number_count(numbers, 'upper_range', upper_range) @@ -73,18 +73,18 @@ def _assign_pos(x, cum_count): Examples: .. code-block:: python - # required: distributed - import paddle - number_count = [2, 0, 2, 0] - numbers = [ - [0, 2], - [0, 2] - ] - number_count = paddle.to_tensor(number_count) - numbers = paddle.to_tensor(numbers, dtype="int32") - num_cum = paddle.cumsum(number_count) - pos = paddle.distributed.utils.assign_pos(x=numbers, cum_count=num_cum) - print(pos) # the result: (2, 0, 3, 1) + >>> # doctest: +REQUIRES(env: DISTRIBUTED) + >>> import paddle + >>> from paddle.distributed.models.moe import utils + >>> number_count = [2, 0, 2, 0] + >>> numbers = [[0, 2], [0, 2]] + >>> number_count = paddle.to_tensor(number_count, dtype="int64") + >>> numbers = paddle.to_tensor(numbers, dtype="int64") + >>> num_cum = paddle.cumsum(number_count) + >>> pos = utils._assign_pos(x=numbers, cum_count=num_cum) + >>> print(pos) + Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True, + [2, 0, 3, 1]) """ if in_dynamic_mode(): return _legacy_C_ops.assign_pos(x, cum_count, cum_count[-1]) @@ -140,15 +140,19 @@ def _limit_by_capacity(expert_count, capacity, n_worker): out (Tensor): The output expert count limit by capacity. Examples: .. code-block:: python - # required: distributed - import paddle - expert_count = [1, 2, 2, 8, 3, 6] - capacity = [5, 5, 5] - n_work = 2 - expert_count = paddle.to_tensor(expert_count, dtype="int32") - capacity = paddle.to_tensor(capacity, dtype="int32") - out = paddle.distributed.utils.limit_by_capacity(expert_count, capacity, n_work) - print(out) # the result: [1, 2, 2, 4, 3, 3] + + >>> # doctest: +REQUIRES(env: DISTRIBUTED) + >>> import paddle + >>> from paddle.distributed.models.moe import utils + >>> expert_count = [1, 2, 2, 8, 3, 6] + >>> capacity = [5, 5, 5] + >>> n_work = 2 + >>> expert_count = paddle.to_tensor(expert_count, dtype="int64") + >>> capacity = paddle.to_tensor(capacity, dtype="int64") + >>> out = utils._limit_by_capacity(expert_count, capacity, n_work) + >>> print(out) + Tensor(shape=[6], dtype=int64, place=Place(gpu:0), stop_gradient=True, + [1, 2, 2, 4, 3, 3]) """ if in_dynamic_mode(): return _legacy_C_ops.limit_by_capacity( @@ -186,14 +190,19 @@ def _prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker): Examples: .. 
code-block:: python - import paddle - gate_idx = paddle.to_tensor([1, 3, 3, 3, 3, 2, 1, 1], dtype='int32') - expert_count = paddle.to_tensor([0, 3, 1, 3, 0, 0, 0, 0], dtype='int32') - n_worker = 1 - new_gate_id = paddle.distributed.utils.prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker) - print(new_gate_id) - # Tensor(shape=[8], dtype=int32, place=CUDAPlace(0), stop_gradient=True, - [1, 3, 3, 3, -1, 2, 1, 1]) + >>> # doctest: +REQUIRES(env: DISTRIBUTED) + >>> import paddle + >>> from paddle.distributed.models.moe import utils + >>> gate_idx = paddle.to_tensor([1, 3, 3, 3, 3, 2, 1, 1], dtype='int64') + >>> expert_count = paddle.to_tensor([0, 3, 1, 3, 0, 0, 0, 0], dtype='int64') + >>> n_worker = 1 + >>> n_expert = 8 + >>> new_gate_id = utils._prune_gate_by_capacity( + ... gate_idx, expert_count, n_expert, n_worker + ... ) + >>> print(new_gate_id) + Tensor(shape=[8], dtype=int64, place=Place(gpu:0), stop_gradient=True, + [1, 3, 3, 3, -1, 2, 1, 1]) """ if in_dynamic_mode(): return _legacy_C_ops.prune_gate_by_capacity( diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 217920debe4de2..67452c1a4e1b2e 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -1084,13 +1084,7 @@ def train(): master_port = int(master_port) is_master = rank == 0 stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900")) - default_store = core.TCPStore( - master_addr, - master_port, - is_master, - world_size, - timeout=stop_check_timeout, - ) + default_store = core.create_or_get_global_tcp_store() _set_default_store(default_store) pg = _new_process_group_impl( backend, @@ -1108,6 +1102,11 @@ def train(): _add_new_group(group) parallel_helper._set_parallel_ctx(True) + # barrier will call CreateNCCLEnvCache which will call CreateNCCLCommContext. + # Set device_id to prevent creating null dev_ctx. + # TODO(mine): support XPU and other backends. + if backend in ["nccl", 'xccl', 'bkcl']: + core.CommContextManager.set_device_id(parallel_env.device_id) paddle.distributed.barrier(group=group) return group diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 01f63eca8576a2..39d589cb2a9009 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -7456,7 +7456,7 @@ def default_main_program(): paddle.enable_static() # Sample Network: x = paddle.static.data(name='x', shape=[100, 100], dtype='float32') - y = paddle.static.data(name='x', shape=[100, 100], dtype='float32') + y = paddle.static.data(name='y', shape=[100, 100], dtype='float32') out = paddle.add(x, y) #print the number of blocks in the program, 1 in this case diff --git a/python/paddle/incubate/asp/asp.py b/python/paddle/incubate/asp/asp.py index 7bf04dc151c7fe..8ba4966973fed9 100644 --- a/python/paddle/incubate/asp/asp.py +++ b/python/paddle/incubate/asp/asp.py @@ -51,71 +51,71 @@ def set_excluded_layers(param_names, main_program=None): .. 
code-block:: python - import paddle - - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.conv1 = paddle.nn.Conv2D( - in_channels=3, out_channels=4, kernel_size=3, padding=2) - self.linear1 = paddle.nn.Linear(4624, 100) - - def forward(self, img): - hidden = self.conv1(img) - hidden = paddle.flatten(hidden, start_axis=1) - prediction = self.linear1(hidden) - return prediction - - my_layer = MyLayer() - optimizer = paddle.optimizer.SGD( - learning_rate=0.01, parameters=my_layer.parameters()) - - # Need to set excluded layers before calling decorate - paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) - - optimizer = paddle.incubate.asp.decorate(optimizer) + >>> import paddle + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 100) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... prediction = self.linear1(hidden) + ... return prediction + + >>> my_layer = MyLayer() + >>> optimizer = paddle.optimizer.SGD( + ... learning_rate=0.01, parameters=my_layer.parameters()) + + >>> # Need to set excluded layers before calling decorate + >>> paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) + + >>> optimizer = paddle.incubate.asp.decorate(optimizer) 2. Usage of Static Graph .. code-block:: python - import paddle - - paddle.enable_static() - - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.conv1 = paddle.nn.Conv2D( - in_channels=3, out_channels=4, kernel_size=3, padding=2) - self.linear1 = paddle.nn.Linear(4624, 100) - - def forward(self, img): - hidden = self.conv1(img) - hidden = paddle.flatten(hidden, start_axis=1) - prediction = self.linear1(hidden) - return prediction - - main_program = paddle.static.Program() - startup_program = paddle.static.Program() - - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) - label = paddle.static.data(name='label', shape=[None, 100]) - my_layer = MyLayer() - prob = my_layer(input_data) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - - # Setup exluded layers out from ASP workflow. - # Please note, excluded_layers must be set before calling optimizer.minimize(). - paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) - - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - optimizer = paddle.static.amp.decorate(optimizer ) - # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which - # will insert necessary masking operations for ASP workflow. - optimizer = paddle.incubate.asp.decorate(optimizer) - optimizer.minimize(loss, startup_program) + >>> import paddle + + >>> paddle.enable_static() + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 100) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... prediction = self.linear1(hidden) + ... 
return prediction + + >>> main_program = paddle.static.Program() + >>> startup_program = paddle.static.Program() + + >>> with paddle.static.program_guard(main_program, startup_program): + ... input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + ... label = paddle.static.data(name='label', shape=[None, 100]) + ... my_layer = MyLayer() + ... prob = my_layer(input_data) + ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + ... + ... # Setup excluded layers out from ASP workflow. + ... # Please note, excluded_layers must be set before calling optimizer.minimize(). + ... paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) + ... + ... optimizer = paddle.optimizer.SGD(learning_rate=0.1) + ... optimizer = paddle.static.amp.decorate(optimizer) + ... # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + ... # will insert necessary masking operations for ASP workflow. + ... optimizer = paddle.incubate.asp.decorate(optimizer) + ... optimizer.minimize(loss, startup_program) """ if main_program is None: main_program = paddle.static.default_main_program() @@ -138,77 +138,77 @@ def reset_excluded_layers(main_program=None): .. code-block:: python - import paddle - - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.conv1 = paddle.nn.Conv2D( - in_channels=3, out_channels=4, kernel_size=3, padding=2) - self.linear1 = paddle.nn.Linear(4624, 100) - - def forward(self, img): - hidden = self.conv1(img) - hidden = paddle.flatten(hidden, start_axis=1) - prediction = self.linear1(hidden) - return prediction - - my_layer = MyLayer() - optimizer = paddle.optimizer.SGD( - learning_rate=0.01, parameters=my_layer.parameters()) - - # Need to set excluded layers before calling decorate - paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) - # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. - # Please note, reset_excluded_layers also must be called before calling asp.decorate(). - paddle.incubate.asp.reset_excluded_layers() - - optimizer = paddle.incubate.asp.decorate(optimizer) + >>> import paddle + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 100) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... prediction = self.linear1(hidden) + ... return prediction + + >>> my_layer = MyLayer() + >>> optimizer = paddle.optimizer.SGD( + ... learning_rate=0.01, parameters=my_layer.parameters()) + + >>> # Need to set excluded layers before calling decorate + >>> paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) + >>> # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. + >>> # Please note, reset_excluded_layers also must be called before calling asp.decorate(). + >>> paddle.incubate.asp.reset_excluded_layers() + + >>> optimizer = paddle.incubate.asp.decorate(optimizer) 2. Usage of Static Graph .. 
code-block:: python - import paddle - - paddle.enable_static() - - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.conv1 = paddle.nn.Conv2D( - in_channels=3, out_channels=4, kernel_size=3, padding=2) - self.linear1 = paddle.nn.Linear(4624, 100) - - def forward(self, img): - hidden = self.conv1(img) - hidden = paddle.flatten(hidden, start_axis=1) - prediction = self.linear1(hidden) - return prediction - - main_program = paddle.static.Program() - startup_program = paddle.static.Program() - - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) - label = paddle.static.data(name='label', shape=[None, 100]) - my_layer = MyLayer() - prob = my_layer(input_data) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - - # Setup exluded layers out from ASP workflow. - # Please note, excluded_layers must be set before calling optimizer.minimize(). - paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) - # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. - # Please note, reset_excluded_layers also must be called before calling optimizer.minimize(). - paddle.incubate.asp.reset_excluded_layers(main_program) - - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - optimizer = paddle.static.amp.decorate(optimizer ) - # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which - # will insert necessary masking operations for ASP workflow. - optimizer = paddle.incubate.asp.decorate(optimizer) - optimizer.minimize(loss, startup_program) + >>> import paddle + + >>> paddle.enable_static() + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 100) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... prediction = self.linear1(hidden) + ... return prediction + + >>> main_program = paddle.static.Program() + >>> startup_program = paddle.static.Program() + + >>> with paddle.static.program_guard(main_program, startup_program): + ... input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + ... label = paddle.static.data(name='label', shape=[None, 100]) + ... my_layer = MyLayer() + ... prob = my_layer(input_data) + ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + ... + ... # Setup excluded layers out from ASP workflow. + ... # Please note, excluded_layers must be set before calling optimizer.minimize(). + ... paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) + ... # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. + ... # Please note, reset_excluded_layers also must be called before calling optimizer.minimize(). + ... paddle.incubate.asp.reset_excluded_layers(main_program) + ... + ... optimizer = paddle.optimizer.SGD(learning_rate=0.1) + ... optimizer = paddle.static.amp.decorate(optimizer) + ... # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + ... # will insert necessary masking operations for ASP workflow. + ... optimizer = paddle.incubate.asp.decorate(optimizer) + ... 
optimizer.minimize(loss, startup_program) """ ASPHelper.reset_excluded_layers(main_program=main_program) @@ -229,72 +229,72 @@ def decorate(optimizer): .. code-block:: python - import paddle - - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.conv1 = paddle.nn.Conv2D( - in_channels=3, out_channels=4, kernel_size=3, padding=2) - self.linear1 = paddle.nn.Linear(4624, 32) - self.linear2 = paddle.nn.Linear(32, 32) - self.linear3 = paddle.nn.Linear(32, 10) - - def forward(self, img): - hidden = self.conv1(img) - hidden = paddle.flatten(hidden, start_axis=1) - hidden = self.linear1(hidden) - hidden = self.linear2(hidden) - prediction = self.linear3(hidden) - return prediction - - my_layer = MyLayer() - optimizer = paddle.optimizer.SGD( - learning_rate=0.01, parameters=my_layer.parameters()) - - # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which - # will apply necessary masking operations for ASP workflow. - # In dynamic graph mode, ASP would create related mask variables during decoration. - optimizer = paddle.incubate.asp.decorate(optimizer) + >>> import paddle + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 32) + ... self.linear2 = paddle.nn.Linear(32, 32) + ... self.linear3 = paddle.nn.Linear(32, 10) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... hidden = self.linear1(hidden) + ... hidden = self.linear2(hidden) + ... prediction = self.linear3(hidden) + ... return prediction + + >>> my_layer = MyLayer() + >>> optimizer = paddle.optimizer.SGD( + ... learning_rate=0.01, parameters=my_layer.parameters()) + + >>> # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which + >>> # will apply necessary masking operations for ASP workflow. + >>> # In dynamic graph mode, ASP would create related mask variables during decoration. + >>> optimizer = paddle.incubate.asp.decorate(optimizer) 2. Usage of Static Graph .. code-block:: python - import paddle - - paddle.enable_static() - - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.conv1 = paddle.nn.Conv2D( - in_channels=3, out_channels=4, kernel_size=3, padding=2) - self.linear1 = paddle.nn.Linear(4624, 100) - - def forward(self, img): - hidden = self.conv1(img) - hidden = paddle.flatten(hidden, start_axis=1) - prediction = self.linear1(hidden) - return prediction - - main_program = paddle.static.Program() - startup_program = paddle.static.Program() - - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) - label = paddle.static.data(name='label', shape=[None, 100]) - my_layer = MyLayer() - prob = my_layer(input_data) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which - # will insert necessary masking operations for ASP workflow. - # In static graph mode, ASP creates related mask variables - # during minimize(). - optimizer = paddle.incubate.asp.decorate(optimizer) - optimizer.minimize(loss, startup_program) + >>> import paddle + + >>> paddle.enable_static() + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... 
super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 100) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... prediction = self.linear1(hidden) + ... return prediction + + >>> main_program = paddle.static.Program() + >>> startup_program = paddle.static.Program() + + >>> with paddle.static.program_guard(main_program, startup_program): + ... input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + ... label = paddle.static.data(name='label', shape=[None, 100]) + ... my_layer = MyLayer() + ... prob = my_layer(input_data) + ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + ... + ... optimizer = paddle.optimizer.SGD(learning_rate=0.1) + ... # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + ... # will insert necessary masking operations for ASP workflow. + ... # In static graph mode, ASP creates related mask variables + ... # during minimize(). + ... optimizer = paddle.incubate.asp.decorate(optimizer) + ... optimizer.minimize(loss, startup_program) """ return ASPHelper.decorate(optimizer) @@ -326,112 +326,112 @@ def prune_model(model, n=2, m=4, mask_algo='mask_1d', with_mask=True): .. code-block:: python - import paddle - import numpy as np - - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.conv1 = paddle.nn.Conv2D( - in_channels=3, out_channels=4, kernel_size=3, padding=2) - self.linear1 = paddle.nn.Linear(4624, 32) - self.linear2 = paddle.nn.Linear(32, 32) - self.linear3 = paddle.nn.Linear(32, 10) - - def forward(self, img): - hidden = self.conv1(img) - hidden = paddle.flatten(hidden, start_axis=1) - hidden = self.linear1(hidden) - hidden = self.linear2(hidden) - prediction = self.linear3(hidden) - return prediction - - my_layer = MyLayer() - loss_fn = paddle.nn.MSELoss(reduction='mean') - - optimizer = paddle.optimizer.SGD( - learning_rate=0.01, parameters=my_layer.parameters()) - - # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which - # will apply necessary masking operations for ASP workflow. - # In dynamic graph mode, ASP would create related mask variables during decoration. - optimizer = paddle.incubate.asp.decorate(optimizer) - - # Must call paddle.incubate.asp.decorate() first before calling paddle.incubate.asp.prune_model() - paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') - - for i in range(10): - imgs = paddle.to_tensor( - np.random.randn(64, 3, 32, 32), - dtype='float32', stop_gradient=False) - labels = paddle.to_tensor( - np.random.randint(10, size=(64, 1)), - dtype='float32', stop_gradient=False) - output = my_layer(imgs) - loss = loss_fn(output, labels) - loss.backward() - optimizer.step() - optimizer.clear_grad() + >>> import paddle + >>> import numpy as np + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 32) + ... self.linear2 = paddle.nn.Linear(32, 32) + ... self.linear3 = paddle.nn.Linear(32, 10) + ... + ... def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... hidden = self.linear1(hidden) + ... hidden = self.linear2(hidden) + ... prediction = self.linear3(hidden) + ... 
return prediction + + >>> my_layer = MyLayer() + >>> loss_fn = paddle.nn.MSELoss(reduction='mean') + + >>> optimizer = paddle.optimizer.SGD( + ... learning_rate=0.01, parameters=my_layer.parameters()) + + >>> # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which + >>> # will apply necessary masking operations for ASP workflow. + >>> # In dynamic graph mode, ASP would create related mask variables during decoration. + >>> optimizer = paddle.incubate.asp.decorate(optimizer) + + >>> # Must call paddle.incubate.asp.decorate() first before calling paddle.incubate.asp.prune_model() + >>> paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') + + >>> for i in range(10): + ... imgs = paddle.to_tensor( + ... np.random.randn(64, 3, 32, 32), + ... dtype='float32', stop_gradient=False) + ... labels = paddle.to_tensor( + ... np.random.randint(10, size=(64, 1)), + ... dtype='float32', stop_gradient=False) + ... output = my_layer(imgs) + ... loss = loss_fn(output, labels) + ... loss.backward() + ... optimizer.step() + ... optimizer.clear_grad() 2. Usage of Static Graph .. code-block:: python - import paddle - import numpy as np - - paddle.enable_static() - - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.conv1 = paddle.nn.Conv2D( - in_channels=3, out_channels=4, kernel_size=3, padding=2) - self.linear1 = paddle.nn.Linear(4624, 32) - self.linear2 = paddle.nn.Linear(32, 32) - self.linear3 = paddle.nn.Linear(32, 10) - - def forward(self, img): - hidden = self.conv1(img) - hidden = paddle.flatten(hidden, start_axis=1) - hidden = self.linear1(hidden) - hidden = self.linear2(hidden) - prediction = self.linear3(hidden) - return prediction - - main_program = paddle.static.Program() - startup_program = paddle.static.Program() - - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 3, 32, 32]) - label = paddle.static.data(name='label', shape=[None, 1]) - my_layer = MyLayer() - prob = my_layer(input_data) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which - # will insert necessary masking operations for ASP workflow. - # In static graph mode, ASP creates related mask variables - # during minimize(). - optimizer = paddle.incubate.asp.decorate(optimizer) - optimizer.minimize(loss, startup_program) - - device = paddle.device.get_device() - place = paddle.set_device(device) - - exe = paddle.static.Executor(place) - exe.run(startup_program) - - # Must call exe.run(startup_program) first before calling paddle.asp.prune_model() - paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') - # it also be accepted to call - # paddle.incubate.asp.prune_model(main_program, mask_algo='mask_2d_best') - - for i in range(10): - imgs = np.random.randn(64, 3, 32, 32).astype('float32') - labels = np.random.randint(10, size=(64, 1)).astype('float32') - exe.run(main_program, feed={'data':imgs, 'label':labels}) + >>> import paddle + >>> import numpy as np + + >>> paddle.enable_static() + + >>> class MyLayer(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.conv1 = paddle.nn.Conv2D( + ... in_channels=3, out_channels=4, kernel_size=3, padding=2) + ... self.linear1 = paddle.nn.Linear(4624, 32) + ... self.linear2 = paddle.nn.Linear(32, 32) + ... self.linear3 = paddle.nn.Linear(32, 10) + ... + ... 
def forward(self, img): + ... hidden = self.conv1(img) + ... hidden = paddle.flatten(hidden, start_axis=1) + ... hidden = self.linear1(hidden) + ... hidden = self.linear2(hidden) + ... prediction = self.linear3(hidden) + ... return prediction + + >>> main_program = paddle.static.Program() + >>> startup_program = paddle.static.Program() + + >>> with paddle.static.program_guard(main_program, startup_program): + ... input_data = paddle.static.data(name='data', shape=[None, 3, 32, 32]) + ... label = paddle.static.data(name='label', shape=[None, 1]) + ... my_layer = MyLayer() + ... prob = my_layer(input_data) + ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + ... + ... optimizer = paddle.optimizer.SGD(learning_rate=0.1) + ... # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + ... # will insert necessary masking operations for ASP workflow. + ... # In static graph mode, ASP creates related mask variables + ... # during minimize(). + ... optimizer = paddle.incubate.asp.decorate(optimizer) + ... optimizer.minimize(loss, startup_program) + + >>> device = paddle.device.get_device() + >>> place = paddle.set_device(device) + + >>> exe = paddle.static.Executor(place) + >>> exe.run(startup_program) + + >>> # Must call exe.run(startup_program) first before calling paddle.incubate.asp.prune_model() + >>> paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') + >>> # it is also acceptable to call + >>> # paddle.incubate.asp.prune_model(main_program, mask_algo='mask_2d_best') + + >>> for i in range(10): + ... imgs = np.random.randn(64, 3, 32, 32).astype('float32') + ... labels = np.random.randint(10, size=(64, 1)).astype('float32') + ... exe.run(main_program, feed={'data':imgs, 'label':labels}) """ device = paddle.device.get_device() place = paddle.set_device(device) @@ -726,19 +726,21 @@ def _is_supported_layer(cls, main_program, param_name): Examples: .. code-block:: python - from paddle.incubate.asp import ASPHelper + >>> import paddle + >>> from paddle.incubate.asp import ASPHelper + >>> paddle.enable_static() - main_program = paddle.static.Program() - startup_program = paddle.static.Program() + >>> main_program = paddle.static.Program() + >>> startup_program = paddle.static.Program() - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 128]) - fc = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None) + >>> with paddle.static.program_guard(main_program, startup_program): + ... input_data = paddle.static.data(name='data', shape=[None, 128]) + ... fc = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None) - for param in main_program.global_block().all_parameters(): - ASPHelper._is_supported_layer(main_program, param.name) - # fc_0.w_0 -> True - # fc_0.b_0 -> False + >>> for param in main_program.global_block().all_parameters(): + ... 
print(param.name,'->',ASPHelper._is_supported_layer(main_program, param.name)) + fc_0.w_0 -> True + fc_0.b_0 -> False """ param_name_list = param_name.split('.') diff --git a/python/paddle/incubate/autograd/composite_rules.py b/python/paddle/incubate/autograd/composite_rules.py index caedc31a3c17d7..1ca8ab62a83633 100644 --- a/python/paddle/incubate/autograd/composite_rules.py +++ b/python/paddle/incubate/autograd/composite_rules.py @@ -171,11 +171,11 @@ def layernorm_composite(x, scale, bias, epsilon, begin_norm_axis): out = difference * rsqrt_var if scale is not None: - if x.shape[begin_norm_axis:] is not scale.shape: + if x.shape[begin_norm_axis:] != scale.shape: scale = reshape(scale, x.shape[begin_norm_axis:]) out = out * scale if bias is not None: - if x.shape[begin_norm_axis:] is not bias.shape: + if x.shape[begin_norm_axis:] != bias.shape: bias = reshape(bias, x.shape[begin_norm_axis:]) out = out + bias @@ -266,8 +266,9 @@ def mean_composite(x, axis, keepdim): is_amp = True x = cast(x, "float32") - axes = axis or list(range(0, len(x.shape))) - axes = [axes] if isinstance(axes, int) else axes + if axis in (None, []): + axis = tuple(range(0, len(x.shape))) + axes = (axis,) if isinstance(axis, int) else axis sum_x = sum(x, axis=axes, keepdim=keepdim) ele_nums_list = [x.shape[axis] for axis in axes] if ele_nums_list == []: diff --git a/python/paddle/incubate/nn/loss.py b/python/paddle/incubate/nn/loss.py index 09d41e3f82de75..d31fe41d8ce3fe 100644 --- a/python/paddle/incubate/nn/loss.py +++ b/python/paddle/incubate/nn/loss.py @@ -49,10 +49,10 @@ def identity_loss(x, reduction="none"): .. code-block:: python - import paddle - paddle.enable_static() - loss = paddle.static.data(name="loss", shape=[-1, 1], dtype="float32") - out = paddle.incubate.identity_loss(loss, reduction=1) + >>> import paddle + >>> paddle.enable_static() + >>> loss = paddle.static.data(name="loss", shape=[-1, 1], dtype="float32") + >>> out = paddle.incubate.identity_loss(loss, reduction=1) """ if isinstance(reduction, str): reduction = {"sum": 0, "mean": 1, "none": 2}.get(reduction.lower()) diff --git a/python/paddle/incubate/operators/graph_khop_sampler.py b/python/paddle/incubate/operators/graph_khop_sampler.py index 06b079d7f1c36c..9cade59f1fff3c 100644 --- a/python/paddle/incubate/operators/graph_khop_sampler.py +++ b/python/paddle/incubate/operators/graph_khop_sampler.py @@ -70,17 +70,17 @@ def graph_khop_sampler( Examples: .. 
code-block:: python - import paddle + >>> import paddle - row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] - colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] - nodes = [0, 8, 1, 2] - sample_sizes = [2, 2] - row = paddle.to_tensor(row, dtype="int64") - colptr = paddle.to_tensor(colptr, dtype="int64") - nodes = paddle.to_tensor(nodes, dtype="int64") + >>> row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] + >>> colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] + >>> nodes = [0, 8, 1, 2] + >>> sample_sizes = [2, 2] + >>> row = paddle.to_tensor(row, dtype="int64") + >>> colptr = paddle.to_tensor(colptr, dtype="int64") + >>> nodes = paddle.to_tensor(nodes, dtype="int64") - edge_src, edge_dst, sample_index, reindex_nodes = paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False) + >>> edge_src, edge_dst, sample_index, reindex_nodes = paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False) """ diff --git a/python/paddle/incubate/optimizer/lookahead.py b/python/paddle/incubate/optimizer/lookahead.py index bfa08c40556bec..40e0fd55e4b502 100644 --- a/python/paddle/incubate/optimizer/lookahead.py +++ b/python/paddle/incubate/optimizer/lookahead.py @@ -50,66 +50,63 @@ class LookAhead(Optimizer): .. code-block:: python - import numpy as np - import paddle - import paddle.nn as nn - - BATCH_SIZE = 16 - BATCH_NUM = 4 - EPOCH_NUM = 4 - - IMAGE_SIZE = 784 - CLASS_NUM = 10 - # define a random dataset - class RandomDataset(paddle.io.Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([IMAGE_SIZE]).astype('float32') - label = np.random.randint(0, CLASS_NUM - 1, - (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - class LinearNet(nn.Layer): - def __init__(self): - super().__init__() - self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) - self.bias = self._linear.bias - - @paddle.jit.to_static - def forward(self, x): - return self._linear(x) - - def train(layer, loader, loss_fn, opt): - for epoch_id in range(EPOCH_NUM): - for batch_id, (image, label) in enumerate(loader()): - out = layer(image) - loss = loss_fn(out, label) - loss.backward() - opt.step() - opt.clear_grad() - print("Train Epoch {} batch {}: loss = {}".format( - epoch_id, batch_id, np.mean(loss.numpy()))) - - layer = LinearNet() - loss_fn = nn.CrossEntropyLoss() - optimizer = paddle.optimizer.SGD(learning_rate=0.1, parameters=layer.parameters()) - lookahead = paddle.incubate.LookAhead(optimizer, alpha=0.2, k=5) - - # create data loader - dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) - loader = paddle.io.DataLoader( - dataset, - batch_size=BATCH_SIZE, - shuffle=True, - drop_last=True, - num_workers=2) - - train(layer, loader, loss_fn, lookahead) + >>> import numpy as np + >>> import paddle + >>> import paddle.nn as nn + + >>> BATCH_SIZE = 16 + >>> BATCH_NUM = 4 + >>> EPOCH_NUM = 4 + + >>> IMAGE_SIZE = 784 + >>> CLASS_NUM = 10 + >>> # define a random dataset + >>> class RandomDataset(paddle.io.Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... def __getitem__(self, idx): + ... image = np.random.random([IMAGE_SIZE]).astype('float32') + ... label = np.random.randint(0, CLASS_NUM - 1, + ... (1, )).astype('int64') + ... return image, label + ... def __len__(self): + ... return self.num_samples + + >>> class LinearNet(nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) + ... 
self.bias = self._linear.bias + ... @paddle.jit.to_static + ... def forward(self, x): + ... return self._linear(x) + + >>> def train(layer, loader, loss_fn, opt): + ... for epoch_id in range(EPOCH_NUM): + ... for batch_id, (image, label) in enumerate(loader()): + ... out = layer(image) + ... loss = loss_fn(out, label) + ... loss.backward() + ... opt.step() + ... opt.clear_grad() + ... print("Train Epoch {} batch {}: loss = {}".format( + ... epoch_id, batch_id, np.mean(loss.numpy()))) + >>> layer = LinearNet() + >>> loss_fn = nn.CrossEntropyLoss() + >>> optimizer = paddle.optimizer.SGD(learning_rate=0.1, parameters=layer.parameters()) + >>> lookahead = paddle.incubate.LookAhead(optimizer, alpha=0.2, k=5) + + >>> # create data loader + >>> dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + >>> loader = paddle.io.DataLoader( + ... dataset, + ... batch_size=BATCH_SIZE, + ... shuffle=True, + ... drop_last=True, + ... num_workers=2) + + >>> # doctest: +SKIP('The run time is too long to pass the CI check.') + >>> train(layer, loader, loss_fn, lookahead) """ _slow_str = "slow" @@ -161,16 +158,16 @@ def step(self): .. code-block:: python - import paddle - inp = paddle.rand([1,10], dtype="float32") - linear = paddle.nn.Linear(10, 1) - out = linear(inp) - loss = paddle.mean(out) - sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters()) - lookahead = paddle.incubate.LookAhead(sgd, alpha=0.2, k=5) - loss.backward() - lookahead.step() - lookahead.clear_grad() + >>> import paddle + >>> inp = paddle.rand([1,10], dtype="float32") + >>> linear = paddle.nn.Linear(10, 1) + >>> out = linear(inp) + >>> loss = paddle.mean(out) + >>> sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters()) + >>> lookahead = paddle.incubate.LookAhead(sgd, alpha=0.2, k=5) + >>> loss.backward() + >>> lookahead.step() + >>> lookahead.clear_grad() """ self.inner_optimizer.step() @@ -274,17 +271,17 @@ def minimize( .. code-block:: python - import paddle - - inp = paddle.rand([1, 10], dtype="float32") - linear = paddle.nn.Linear(10, 1) - out = linear(inp) - loss = paddle.mean(out) - sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters()) - lookahead = paddle.incubate.LookAhead(sgd, alpha=0.2, k=5) - loss.backward() - lookahead.minimize(loss) - lookahead.clear_grad() + >>> import paddle + + >>> inp = paddle.rand([1, 10], dtype="float32") + >>> linear = paddle.nn.Linear(10, 1) + >>> out = linear(inp) + >>> loss = paddle.mean(out) + >>> sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters()) + >>> lookahead = paddle.incubate.LookAhead(sgd, alpha=0.2, k=5) + >>> loss.backward() + >>> lookahead.minimize(loss) + >>> lookahead.clear_grad() """ assert isinstance(loss, Variable), "The loss should be an Tensor." diff --git a/python/paddle/incubate/tensor/manipulation.py b/python/paddle/incubate/tensor/manipulation.py index 4e1380a0bd7237..70c017a8eec363 100644 --- a/python/paddle/incubate/tensor/manipulation.py +++ b/python/paddle/incubate/tensor/manipulation.py @@ -40,12 +40,14 @@ def _npu_identity(x, format=-1): Examples: .. 
code-block:: python - # required: npu - import paddle + >>> # doctest: +REQUIRES(env:NPU) + >>> import paddle + >>> paddle.device.set_device('npu') - x = paddle.ones(shape=[6]) - y = paddle.incubate._npu_identity(x, 3) # ACL_FORMAT_NC1HWC0 = 3 - # y.shape = [1, 1, 1, 1, 16] + >>> x = paddle.ones(shape=[6]) + >>> y = paddle.incubate._npu_identity(x, 3) # ACL_FORMAT_NC1HWC0 = 3 + >>> print(y.shape) + [1, 1, 1, 1, 16] """ if in_dynamic_mode(): return _C_ops.npu_identity(x, format) diff --git a/python/paddle/ir/core.py b/python/paddle/ir/core.py index fe496b55f91297..c7f2a73f2ad5e5 100644 --- a/python/paddle/ir/core.py +++ b/python/paddle/ir/core.py @@ -141,7 +141,7 @@ def default_main_program(): paddle.enable_static() # Sample Network: x = paddle.static.data(name='x', shape=[100, 100], dtype='float32') - y = paddle.static.data(name='x', shape=[100, 100], dtype='float32') + y = paddle.static.data(name='y', shape=[100, 100], dtype='float32') out = paddle.add(x, y) #print the number of blocks in the program, 1 in this case diff --git a/python/paddle/jit/api.py b/python/paddle/jit/api.py index 58d49065d3c1ca..dd96a1001eae5f 100644 --- a/python/paddle/jit/api.py +++ b/python/paddle/jit/api.py @@ -418,6 +418,9 @@ def __init__(self): # when need to save a prune model, use input_names_after_prune to specify the inputs left after pruning self.input_names_after_prune = None + # in LLM inference scenarios, pruning the program can cause unexpected results, so an option to skip pruning is necessary + self.skip_prune_program = False + @property def output_spec(self): return self._output_spec @@ -497,6 +500,7 @@ def _parse_save_configs(configs): "clip_extra", "skip_forward", "input_names_after_prune", + "skip_prune_program", ] # input check @@ -517,6 +521,7 @@ def _parse_save_configs(configs): inner_config.input_names_after_prune = configs.get( "input_names_after_prune", None ) + inner_config.skip_prune_program = configs.get("skip_prune_program", False) return inner_config @@ -1248,10 +1253,10 @@ def save(layer, path, input_spec=None, **configs): file_prefix = file_prefix + '.' + attr_func file_prefix = os.path.join(model_path, file_prefix) with scope_guard(scope): - input_vars = [] - for var in concrete_program.main_program.clone().list_vars(): - if var.name in input_var_names: - input_vars.append(var) + input_vars = [ + concrete_program.main_program.global_block().var(name) + for name in input_var_names + ] save_inference_model( path_prefix=file_prefix, feed_vars=input_vars, @@ -1259,6 +1264,7 @@ def save(layer, path, input_spec=None, **configs): executor=Executor(_current_expected_place()), program=concrete_program.main_program.clone(), clip_extra=configs.clip_extra, + skip_prune_program=configs.skip_prune_program, ) if combine_params: diff --git a/python/paddle/new_ir_utils.py b/python/paddle/new_ir_utils.py index 443ac48ae829c4..83c9b5f826d8d5 100644 --- a/python/paddle/new_ir_utils.py +++ b/python/paddle/new_ir_utils.py @@ -12,17 +12,67 @@ # See the License for the specific language governing permissions and # limitations under the License.
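+# NOTE: a short orientation comment for the class added below. IrChange snapshots
+# the legacy static-graph entry points (Program, program_guard, default_main_program)
+# so that _switch_to_new_ir can patch paddle.static over to the new IR API and
+# _switch_to_old_ir can restore the originals; _newir_guard wraps the pair in a context manager.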
+ import paddle +from .fluid.wrapped_decorator import signature_safe_contextmanager + + +class IrChange: + def __init__(self): + old_flag = paddle.fluid.framework.get_flags("FLAGS_enable_new_ir_api") + paddle.fluid.framework.set_flags({"FLAGS_enable_new_ir_api": False}) + if not paddle.ir.core._use_new_ir_api(): + self.old_Program = paddle.static.Program + self.old_program_guard = paddle.fluid.program_guard + self.old_default_main_program = paddle.static.default_main_program + else: + raise RuntimeError( + "IrChange can only be initialized when paddle.ir.core._use_new_ir_api() is false, \ + please set FLAGS_enable_new_ir_api = false" + ) + paddle.fluid.framework.set_flags(old_flag) + + def _switch_to_new_ir(self): + if paddle.ir.core._use_new_ir_api(): + paddle.framework.set_flags( + {"FLAGS_enable_new_ir_in_executor": True} + ) + paddle.ir.register_paddle_dialect() + paddle.static.Program = paddle.ir.Program + paddle.fluid.Program = paddle.ir.Program + paddle.fluid.program_guard = paddle.ir.core.program_guard + paddle.static.program_guard = paddle.ir.core.program_guard + paddle.framework.default_main_program = ( + paddle.ir.core.default_main_program + ) + + def _switch_to_old_ir(self): + if not paddle.ir.core._use_new_ir_api(): + paddle.framework.set_flags( + {"FLAGS_enable_new_ir_in_executor": False} + ) + paddle.static.Program = self.old_Program + paddle.fluid.Program = self.old_Program + paddle.fluid.program_guard = self.old_program_guard + paddle.static.program_guard = self.old_program_guard + paddle.framework.default_main_program = ( + self.old_default_main_program + ) + else: + raise RuntimeError( + "IrChange._switch_to_old_ir only works when paddle.ir.core._use_new_ir_api() is false, \ + please set FLAGS_enable_new_ir_api = false" + ) + -def _switch_to_new_ir(): - if paddle.ir.core._use_new_ir_api(): - paddle.framework.set_flags({"FLAGS_enable_new_ir_in_executor": True}) - paddle.ir.register_paddle_dialect() - paddle.static.Program = paddle.ir.Program - paddle.fluid.Program = paddle.ir.Program - paddle.fluid.program_guard = paddle.ir.core.program_guard - paddle.static.program_guard = paddle.ir.core.program_guard - paddle.framework.default_main_program = ( - paddle.ir.core.default_main_program - ) +@signature_safe_contextmanager +def _newir_guard(): + ir_change = IrChange() + paddle.framework.set_flags({"FLAGS_enable_new_ir_api": True}) + ir_change._switch_to_new_ir() + try: + yield + finally: + paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False}) + ir_change._switch_to_old_ir() diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 26d81c7e38d1c9..dbbed03b3aac4e 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -49,12 +49,15 @@ def celu(x, alpha=1.0, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - x = paddle.to_tensor([[-1., 6.], [1., 15.6]]) - out = F.celu(x, alpha=0.2) - # [[-0.19865242, 6. ], - # [ 1. , 15.60000038]] + >>> import paddle + >>> import paddle.nn.functional as F + + >>> x = paddle.to_tensor([[-1., 6.], [1., 15.6]]) + >>> out = F.celu(x, alpha=0.2) + >>> print(out) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.19865242, 6. ], + [ 1. , 15.60000038]]) """ if alpha == 0: raise ZeroDivisionError("alpha cannot be 0 for celu") @@ -100,13 +103,15 @@ def elu(x, alpha=1.0, name=None): Examples: ..
code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([[-1., 6.], [1., 15.6]]) - out = F.elu(x, alpha=0.2) - # [[-0.12642411 6. ] - # [ 1. 15.6 ]] + >>> x = paddle.to_tensor([[-1., 6.], [1., 15.6]]) + >>> out = F.elu(x, alpha=0.2) + >>> print(out) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.12642412, 6. ], + [ 1. , 15.60000038]]) """ if in_dynamic_mode(): @@ -168,16 +173,20 @@ def gelu(x, approximate=False, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - x = paddle.to_tensor([[-1, 0.5], [1, 1.5]]) - out1 = F.gelu(x) - # [[-0.15865529, 0.34573123], - # [ 0.84134471, 1.39978933]] - out2 = F.gelu(x, True) - # [[-0.15880799, 0.34571400], - # [ 0.84119201, 1.39957154]] + >>> import paddle + >>> import paddle.nn.functional as F + + >>> x = paddle.to_tensor([[-1, 0.5], [1, 1.5]]) + >>> out1 = F.gelu(x) + >>> print(out1) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.15865529, 0.34573123], + [ 0.84134471, 1.39978933]]) + >>> out2 = F.gelu(x, True) + >>> print(out2) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.15880796, 0.34571400], + [ 0.84119201, 1.39957154]]) """ if in_dynamic_mode(): @@ -223,11 +232,15 @@ def hardshrink(x, threshold=0.5, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F + + >>> x = paddle.to_tensor([-1, 0.3, 2.5]) + >>> out = F.hardshrink(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [-1. , 0. , 2.50000000]) - x = paddle.to_tensor([-1, 0.3, 2.5]) - out = F.hardshrink(x) # [-1., 0., 2.5] """ if in_dynamic_mode(): @@ -274,11 +287,14 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([-1.5, 0.3, 2.5]) - out = F.hardtanh(x) # [-1., 0.3, 1.] + >>> x = paddle.to_tensor([-1.5, 0.3, 2.5]) + >>> out = F.hardtanh(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [-1. , 0.30000001, 1. ]) """ if in_dynamic_mode(): @@ -338,11 +354,14 @@ def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([-4., 5., 1.]) - out = F.hardsigmoid(x) # [0., 1., 0.666667] + >>> x = paddle.to_tensor([-4., 5., 1.]) + >>> out = F.hardsigmoid(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [0. , 1. , 0.66666669]) """ if in_dynamic_mode(): @@ -390,11 +409,14 @@ def hardswish(x, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([-4., 5., 1.]) - out = F.hardswish(x) # [0., 5., 0.666667] + >>> x = paddle.to_tensor([-4., 5., 1.]) + >>> out = F.hardswish(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0. , 5. , 0.66666669]) """ if in_dynamic_mode(): return _C_ops.hardswish(x) @@ -442,13 +464,14 @@ def leaky_relu(x, negative_slope=0.01, name=None): Examples: .. 
code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([-2., 0., 1.]) - out = F.leaky_relu(x) - print(out) - # [-0.02, 0., 1.] + >>> x = paddle.to_tensor([-2., 0., 1.]) + >>> out = F.leaky_relu(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.02000000, 0. , 1. ]) """ if in_dynamic_mode(): @@ -502,25 +525,26 @@ def prelu(x, weight, data_format="NCHW", name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - data = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], - [ 3.0, -4.0, 5.0, -6.0], - [-7.0, -8.0, 8.0, 9.0]], - [[ 1.0, -2.0, -3.0, 4.0], - [-5.0, 6.0, 7.0, -8.0], - [ 6.0, 7.0, 8.0, 9.0]]]], dtype='float32') - - w = paddle.to_tensor([0.25], dtype='float32') - out = F.prelu(data, w) - print(out) - # [[[[-0.5 , 3. , -1. , 5. ], - # [ 3. , -1. , 5. , -1.5 ], - # [-1.75, -2. , 8. , 9. ]], - # [[ 1. , -0.5 , -0.75, 4. ], - # [-1.25, 6. , 7. , -2. ], - # [ 6. , 7. , 8. , 9. ]]]] + >>> import paddle + >>> import paddle.nn.functional as F + + >>> data = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], + ... [ 3.0, -4.0, 5.0, -6.0], + ... [-7.0, -8.0, 8.0, 9.0]], + ... [[ 1.0, -2.0, -3.0, 4.0], + ... [-5.0, 6.0, 7.0, -8.0], + ... [ 6.0, 7.0, 8.0, 9.0]]]], dtype='float32') + + >>> w = paddle.to_tensor([0.25], dtype='float32') + >>> out = F.prelu(data, w) + >>> print(out) + Tensor(shape=[1, 2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[-0.50000000, 3. , -1. , 5. ], + [ 3. , -1. , 5. , -1.50000000], + [-1.75000000, -2. , 8. , 9. ]], + [[ 1. , -0.50000000, -0.75000000, 4. ], + [-1.25000000, 6. , 7. , -2. ], + [ 6. , 7. , 8. , 9. ]]]]) """ assert ( len(weight.shape) == 0 or len(weight.shape) == 1 @@ -634,24 +658,24 @@ def rrelu(x, lower=1.0 / 8.0, upper=1.0 / 3.0, training=True, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - input_tensor = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], - [ 3.0, -4.0, 5.0, -6.0], - [-7.0, -8.0, 8.0, 9.0]], - [[ 1.0, -2.0, -3.0, 4.0], - [-5.0, 6.0, 7.0, -8.0], - [ 6.0, 7.0, 8.0, 9.0]]]], dtype='float32') - - out = F.rrelu(input_tensor, 0.1, 0.3) - print(out) - #[[[[-0.20000899 3. -0.8810822 5. ] - # [ 3. -0.55175185 5. -1.0776101 ] - # [-1.0680687 -1.9896201 8. 9. ]] - # [[ 1. -0.5238267 -0.65515125 4. ] - # [-1.3766339 6. 7. -2.3465784 ] - # [ 6. 7. 8. 9. ]]]] + >>> import paddle + >>> import paddle.nn.functional as F + >>> paddle.seed(1) + >>> input_tensor = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], + ... [ 3.0, -4.0, 5.0, -6.0], + ... [-7.0, -8.0, 8.0, 9.0]], + ... [[ 1.0, -2.0, -3.0, 4.0], + ... [-5.0, 6.0, 7.0, -8.0], + ... [ 6.0, 7.0, 8.0, 9.0]]]], dtype='float32') + >>> out = F.rrelu(input_tensor, 0.1, 0.3) + >>> print(out) + Tensor(shape=[1, 2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[-0.20715050, 3. , -1.01193857, 5. ], + [ 3. , -0.94084597, 5. , -0.65544695], + [-1.24268556, -2.34339547, 8. , 9. ]], + [[ 1. , -0.44942653, -0.68969047, 4. ], + [-1.03736508, 6. , 7. , -0.95799232], + [ 6. , 7. , 8. , 9. ]]]]) """ if not isinstance(lower, float) or not isinstance(upper, float): raise TypeError( @@ -722,13 +746,14 @@ def relu(x, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([-2, 0, 1], dtype='float32') - out = F.relu(x) - print(out) - # [0., 0., 1.] 
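+ >>> # relu keeps non-negative inputs unchanged and maps every negative input to 0.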
+ >>> x = paddle.to_tensor([-2, 0, 1], dtype='float32') + >>> out = F.relu(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [0., 0., 1.]) """ if in_dynamic_mode(): @@ -770,11 +795,14 @@ def log_sigmoid(x, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) - out = F.log_sigmoid(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499] + >>> x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + >>> out = F.log_sigmoid(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.31326166, -0.12692805, -0.04858733, -0.01814996]) """ if in_dynamic_mode(): @@ -830,20 +858,25 @@ def maxout(x, groups, axis=1, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - x = paddle.rand([1, 2, 3, 4]) - # [[[[0.5002636 0.22272532 0.17402348 0.2874594 ] - # [0.95313174 0.6228939 0.7129065 0.7087491 ] - # [0.02879342 0.88725346 0.61093384 0.38833922]] - # [[0.5231306 0.03807496 0.91661984 0.15602879] - # [0.666127 0.616567 0.30741522 0.24044901] - # [0.7142536 0.7351477 0.31588817 0.23782359]]]] - out = F.maxout(x, groups=2) - # [[[[0.5231306 0.22272532 0.91661984 0.2874594 ] - # [0.95313174 0.6228939 0.7129065 0.7087491 ] - # [0.7142536 0.88725346 0.61093384 0.38833922]]]] + >>> import paddle + >>> import paddle.nn.functional as F + + >>> paddle.seed(2023) + >>> x = paddle.rand([1, 2, 3, 4]) + >>> print(x) + Tensor(shape=[1, 2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[0.86583614, 0.52014720, 0.25960937, 0.90525323], + [0.42400089, 0.40641287, 0.97020894, 0.74437362], + [0.51785129, 0.73292869, 0.97786582, 0.04315904]], + [[0.42639419, 0.71958369, 0.20811461, 0.19731510], + [0.38424349, 0.14603184, 0.22713774, 0.44607511], + [0.21657862, 0.67685395, 0.46460176, 0.92382854]]]]) + >>> out = F.maxout(x, groups=2) + >>> print(out) + Tensor(shape=[1, 1, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[0.86583614, 0.71958369, 0.25960937, 0.90525323], + [0.42400089, 0.40641287, 0.97020894, 0.74437362], + [0.51785129, 0.73292869, 0.97786582, 0.92382854]]]]) """ if in_dynamic_mode(): return _C_ops.maxout(x, groups, axis) @@ -888,13 +921,14 @@ def relu6(x, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([-1, 0.3, 6.5]) - out = F.relu6(x) - print(out) - # [0, 0.3, 6] + >>> x = paddle.to_tensor([-1, 0.3, 6.5]) + >>> out = F.relu6(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [0. , 0.30000001, 6. ]) """ threshold = 6.0 if in_dynamic_mode(): @@ -945,13 +979,15 @@ def selu( Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([[0.0, 1.0],[2.0, 3.0]]) - out = F.selu(x) - print(out) - # [[0, 1.050701],[2.101402, 3.152103]] + >>> x = paddle.to_tensor([[0.0, 1.0],[2.0, 3.0]]) + >>> out = F.selu(x) + >>> print(out) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0. , 1.05070102], + [2.10140204, 3.15210295]]) """ if scale <= 1.0: raise ValueError( @@ -1000,11 +1036,14 @@ def silu(x, name=None): Examples: .. 
code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) - out = F.silu(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ] + >>> x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + >>> out = F.silu(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.73105860, 1.76159406, 2.85772228, 3.92805505]) """ if in_dynamic_mode(): @@ -1111,25 +1150,35 @@ def softmax(x, axis=-1, dtype=None, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - x = paddle.to_tensor([[[2.0, 3.0, 4.0, 5.0], - [3.0, 4.0, 5.0, 6.0], - [7.0, 8.0, 8.0, 9.0]], - [[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [6.0, 7.0, 8.0, 9.0]]],dtype='float32') - out1 = F.softmax(x) - out2 = F.softmax(x, dtype='float64') - # out1's data type is float32; out2's data type is float64 - # out1 and out2's value is as follows: - # [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426], - # [0.0320586 , 0.08714432, 0.23688282, 0.64391426], - # [0.07232949, 0.19661193, 0.19661193, 0.53444665]], - # [[0.0320586 , 0.08714432, 0.23688282, 0.64391426], - # [0.0320586 , 0.08714432, 0.23688282, 0.64391426], - # [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]] + >>> import paddle + >>> import paddle.nn.functional as F + + >>> x = paddle.to_tensor([[[2.0, 3.0, 4.0, 5.0], + ... [3.0, 4.0, 5.0, 6.0], + ... [7.0, 8.0, 8.0, 9.0]], + ... [[1.0, 2.0, 3.0, 4.0], + ... [5.0, 6.0, 7.0, 8.0], + ... [6.0, 7.0, 8.0, 9.0]]],dtype='float32') + >>> out1 = F.softmax(x) + >>> out2 = F.softmax(x, dtype='float64') + >>> #out1's data type is float32; out2's data type is float64 + >>> #out1 and out2's value is as follows: + >>> print(out1) + Tensor(shape=[2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[0.03205860, 0.08714432, 0.23688284, 0.64391428], + [0.03205860, 0.08714432, 0.23688284, 0.64391428], + [0.07232949, 0.19661194, 0.19661194, 0.53444666]], + [[0.03205860, 0.08714432, 0.23688284, 0.64391428], + [0.03205860, 0.08714432, 0.23688284, 0.64391428], + [0.03205860, 0.08714432, 0.23688284, 0.64391428]]]) + >>> print(out2) + Tensor(shape=[2, 3, 4], dtype=float64, place=Place(cpu), stop_gradient=True, + [[[0.03205860, 0.08714432, 0.23688282, 0.64391426], + [0.03205860, 0.08714432, 0.23688282, 0.64391426], + [0.07232949, 0.19661193, 0.19661193, 0.53444665]], + [[0.03205860, 0.08714432, 0.23688282, 0.64391426], + [0.03205860, 0.08714432, 0.23688282, 0.64391426], + [0.03205860, 0.08714432, 0.23688282, 0.64391426]]]) """ if (dtype is not None) and (not isinstance(dtype, core.VarDesc.VarType)): @@ -1214,11 +1263,14 @@ def softplus(x, beta=1, threshold=20, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3], dtype='float32') - out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355] + >>> x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3], dtype='float32') + >>> out = F.softplus(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [0.51301527, 0.59813893, 0.74439669, 0.85435522]) """ if in_dynamic_mode(): @@ -1264,14 +1316,14 @@ def softshrink(x, threshold=0.5, name=None): Examples: ..
code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([-0.9, -0.2, 0.1, 0.8]) - out = F.softshrink(x) - print(out) - # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [-0.39999998, 0. , 0. , 0.30000001]) + >>> x = paddle.to_tensor([-0.9, -0.2, 0.1, 0.8]) + >>> out = F.softshrink(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.39999998, 0. , 0. , 0.30000001]) """ if threshold < 0: raise ValueError( @@ -1315,14 +1367,14 @@ def softsign(x, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.softsign(x) - print(out) - # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [-0.28571430, -0.16666666, 0.09090909, 0.23076925]) + >>> x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + >>> out = F.softsign(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.28571430, -0.16666666, 0.09090909, 0.23076925]) """ if in_dynamic_mode(): return _C_ops.softsign(x) @@ -1354,14 +1406,14 @@ def swish(x, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([-2., 0., 1.]) - out = F.swish(x) - print(out) - # Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [-0.23840584, 0. , 0.73105854]) + >>> x = paddle.to_tensor([-2., 0., 1.]) + >>> out = F.swish(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.23840584, 0. , 0.73105860]) """ if in_dynamic_mode(): return _C_ops.swish(x) @@ -1403,11 +1455,14 @@ def mish(x, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([-5., 0., 5.]) - out = F.mish(x) # [-0.03357624, 0., 4.99955208] + >>> x = paddle.to_tensor([-5., 0., 5.]) + >>> out = F.mish(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.03357624, 0. , 4.99955177]) """ if in_dynamic_mode(): return _C_ops.mish(x, 20) @@ -1439,14 +1494,14 @@ def tanhshrink(x, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.tanhshrink(x) - print(out) - # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [-0.02005106, -0.00262468, 0.00033200, 0.00868741]) + >>> x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + >>> out = F.tanhshrink(x) + >>> print(out) + Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + [-0.02005100, -0.00262472, 0.00033201, 0.00868741]) """ if in_dynamic_mode(): return _C_ops.tanh_shrink(x) @@ -1488,14 +1543,14 @@ def thresholded_relu(x, threshold=1.0, name=None): Examples: .. 
code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.to_tensor([2., 0., 1.]) - out = F.thresholded_relu(x) - print(out) - # Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [2., 0., 0.]) + >>> x = paddle.to_tensor([2., 0., 1.]) + >>> out = F.thresholded_relu(x) + >>> print(out) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [2., 0., 0.]) """ if in_dynamic_mode(): @@ -1561,26 +1616,35 @@ def log_softmax(x, axis=-1, dtype=None, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - x = [[[-2.0, 3.0, -4.0, 5.0], - [3.0, -4.0, 5.0, -6.0], - [-7.0, -8.0, 8.0, 9.0]], - [[1.0, -2.0, -3.0, 4.0], - [-5.0, 6.0, 7.0, -8.0], - [6.0, 7.0, 8.0, 9.0]]] - x = paddle.to_tensor(x) - out1 = F.log_softmax(x) - out2 = F.log_softmax(x, dtype='float64') - # out1's data type is float32; out2's data type is float64 - # out1 and out2's value is as follows: - # [[[ -7.1278396 -2.1278396 -9.127839 -0.12783948] - # [ -2.1270514 -9.127051 -0.12705144 -11.127051 ] - # [-16.313261 -17.313261 -1.3132617 -0.31326184]] - # [[ -3.0518122 -6.051812 -7.051812 -0.051812 ] - # [-12.313267 -1.3132664 -0.3132665 -15.313267 ] - # [ -3.4401896 -2.4401896 -1.4401896 -0.44018966]]] + >>> import paddle + >>> import paddle.nn.functional as F + >>> x = [[[-2.0, 3.0, -4.0, 5.0], + ... [3.0, -4.0, 5.0, -6.0], + ... [-7.0, -8.0, 8.0, 9.0]], + ... [[1.0, -2.0, -3.0, 4.0], + ... [-5.0, 6.0, 7.0, -8.0], + ... [6.0, 7.0, 8.0, 9.0]]] + >>> x = paddle.to_tensor(x) + >>> out1 = F.log_softmax(x) + >>> out2 = F.log_softmax(x, dtype='float64') + >>> #out1's data type is float32; out2's data type is float64 + >>> #out1 and out2's value is as follows: + >>> print(out1) + Tensor(shape=[2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[-7.12783957 , -2.12783957 , -9.12783909 , -0.12783945 ], + [-2.12705135 , -9.12705135 , -0.12705141 , -11.12705135], + [-16.31326103, -17.31326103, -1.31326187 , -0.31326184 ]], + [[-3.05181193 , -6.05181217 , -7.05181217 , -0.05181199 ], + [-12.31326675, -1.31326652 , -0.31326646 , -15.31326675], + [-3.44018984 , -2.44018984 , -1.44018972 , -0.44018975 ]]]) + >>> print(out2) + Tensor(shape=[2, 3, 4], dtype=float64, place=Place(cpu), stop_gradient=True, + [[[-7.12783948 , -2.12783948 , -9.12783948 , -0.12783948 ], + [-2.12705141 , -9.12705141 , -0.12705141 , -11.12705141], + [-16.31326180, -17.31326180, -1.31326180 , -0.31326180 ]], + [[-3.05181198 , -6.05181198 , -7.05181198 , -0.05181198 ], + [-12.31326640, -1.31326640 , -0.31326640 , -15.31326640], + [-3.44018970 , -2.44018970 , -1.44018970 , -0.44018970 ]]]) """ if (dtype is not None) and (not isinstance(dtype, core.VarDesc.VarType)): @@ -1655,17 +1719,16 @@ def glu(x, axis=-1, name=None): Examples: .. code-block:: python - import paddle - from paddle.nn import functional as F - - x = paddle.to_tensor( - [[-0.22014759, -1.76358426, 0.80566144, 0.04241343], - [-1.94900405, -1.89956081, 0.17134808, -1.11280477]] - ) - print(F.glu(x)) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[-0.15216254, -0.90048921], - # [-1.05778778, -0.46985325]]) + >>> import paddle + >>> from paddle.nn import functional as F + >>> x = paddle.to_tensor( + ... [[-0.22014759, -1.76358426, 0.80566144, 0.04241343], + ... [-1.94900405, -1.89956081, 0.17134808, -1.11280477]] + ... 
) + >>> print(F.glu(x)) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.15216254, -0.90048921], + [-1.05778778, -0.46985325]]) """ check_variable_and_dtype( @@ -1727,18 +1790,19 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - logits = paddle.randn([4, 6]) - temperature = 0.01 - gumbel_softmax = F.gumbel_softmax(logits, temperature) - print(gumbel_softmax) - # out's value is as follows: - # [[0.00000001, 1. , 0.00000000, 0.00000000, 0.00000006, 0.00000000], - # [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 1. ], - # [0.00000062, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.99999940], - # [0.00000000, 0.00000000, 0.00000000, 0.00001258, 0.99998736, 0.00000000]] + >>> import paddle + >>> import paddle.nn.functional as F + + >>> paddle.seed(2023) + >>> logits = paddle.randn([4, 6]) + >>> temperature = 0.01 + >>> gumbel_softmax = F.gumbel_softmax(logits, temperature) + >>> print(gumbel_softmax) + Tensor(shape=[4, 6], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0.00000000, 1. , 0.00000000, 0.00000000, 0.00000000, 0.00000000], + [0.00000000, 0.00000000, 1. , 0.00000000, 0.00000000, 0.00000000], + [0.00000000, 0.00000004, 0.00000000, 0.00000000, 1. , 0.00000000], + [0.00000000, 1. , 0.00000000, 0.00000000, 0.00000000, 0.00000000]]) """ if in_dynamic_mode(): diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index e513fb0670ef7d..bc43e73c4163db 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -95,11 +95,11 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x = paddle.randn((100,3,224,224)) - y = F.unfold(x, [3, 3], 1, 1, 1) + >>> x = paddle.randn((100,3,224,224)) + >>> y = F.unfold(x, [3, 3], 1, 1, 1) """ helper = LayerHelper("unfold", **locals()) @@ -348,23 +348,21 @@ def interpolate( Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) - output_1 = F.interpolate(x=input_data, size=[12,12]) - print(output_1.shape) - # [2L, 3L, 12L, 12L] - - # given scale - output_2 = F.interpolate(x=input_data, scale_factor=[2,1]) - print(output_2.shape) - # [2L, 3L, 12L, 10L] - - # bilinear interp - output_3 = F.interpolate(x=input_data, scale_factor=[2,1], mode="bilinear") - print(output_2.shape) - # [2L, 3L, 12L, 10L] + >>> import paddle + >>> import paddle.nn.functional as F + + >>> input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) + >>> output_1 = F.interpolate(x=input_data, size=[12,12]) + >>> print(output_1.shape) + [2, 3, 12, 12] + >>> # given scale + >>> output_2 = F.interpolate(x=input_data, scale_factor=[2,1]) + >>> print(output_2.shape) + [2, 3, 12, 10] + >>> # bilinear interp + >>> output_3 = F.interpolate(x=input_data, scale_factor=[2,1], mode="bilinear") + >>> print(output_3.shape) + [2, 3, 12, 10] """ data_format = data_format.upper() resample = mode.upper() @@ -877,15 +875,14 @@ def upsample( Examples: ..
code-block:: python - import paddle - import paddle.nn as nn - - input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) - upsample_out = paddle.nn.Upsample(size=[12,12]) + >>> import paddle + >>> import paddle.nn as nn - output = upsample_out(x=input_data) - print(output.shape) - # [2L, 3L, 12L, 12L] + >>> input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) + >>> upsample_out = paddle.nn.Upsample(size=[12,12]) + >>> output = upsample_out(x=input_data) + >>> print(output.shape) + [2, 3, 12, 12] """ return interpolate( @@ -913,17 +910,16 @@ def bilinear(x1, x2, weight, bias=None, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x1 = paddle.randn((5, 5)).astype(paddle.float32) - x2 = paddle.randn((5, 4)).astype(paddle.float32) - w = paddle.randn((1000, 5, 4)).astype(paddle.float32) - b = paddle.randn((1, 1000)).astype(paddle.float32) - - result = F.bilinear(x1, x2, w, b) - print(result.shape) - # [5, 1000] + >>> x1 = paddle.randn((5, 5)).astype(paddle.float32) + >>> x2 = paddle.randn((5, 4)).astype(paddle.float32) + >>> w = paddle.randn((1000, 5, 4)).astype(paddle.float32) + >>> b = paddle.randn((1, 1000)).astype(paddle.float32) + >>> result = F.bilinear(x1, x2, w, b) + >>> print(result.shape) + [5, 1000] """ if in_dynamic_mode(): @@ -1061,39 +1057,38 @@ def dropout( .. code-block:: python - import paddle - - x = paddle.to_tensor([[1,2,3], [4,5,6]]).astype(paddle.float32) - y_train = paddle.nn.functional.dropout(x, 0.5) - y_test = paddle.nn.functional.dropout(x, 0.5, training=False) - y_0 = paddle.nn.functional.dropout(x, axis=0) - y_1 = paddle.nn.functional.dropout(x, axis=1) - y_01 = paddle.nn.functional.dropout(x, axis=[0,1]) - print(x) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[1., 2., 3.], - # [4., 5., 6.]]) - print(y_train) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[2. , 0. , 6. ], - # [8. , 0. , 12.]]) - print(y_test) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[1., 2., 3.], - # [4., 5., 6.]]) - print(y_0) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[0. , 0. , 0. ], - # [8. , 10., 12.]]) - print(y_1) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[2. , 0. , 6. ], - # [8. , 0. , 12.]]) - print(y_01) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[0. , 0. , 0. ], - # [8. , 0. , 12.]]) - + >>> import paddle + >>> paddle.seed(2023) + >>> x = paddle.to_tensor([[1,2,3], [4,5,6]]).astype(paddle.float32) + >>> y_train = paddle.nn.functional.dropout(x, 0.5) + >>> y_test = paddle.nn.functional.dropout(x, 0.5, training=False) + >>> y_0 = paddle.nn.functional.dropout(x, axis=0) + >>> y_1 = paddle.nn.functional.dropout(x, axis=1) + >>> y_01 = paddle.nn.functional.dropout(x, axis=[0,1]) + >>> print(x) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[1., 2., 3.], + [4., 5., 6.]]) + >>> print(y_train) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[2., 4., 0.], + [8., 0., 0.]]) + >>> print(y_test) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[1., 2., 3.], + [4., 5., 6.]]) + >>> print(y_0) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[2., 4., 6.], + [8. 
, 10., 12.]]) + >>> print(y_1) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[2. , 4. , 6. ], + [8. , 10., 12.]]) + >>> print(y_01) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0., 0., 6.], + [0., 0., 0.]]) """ if not isinstance(p, (float, int, Variable)): raise TypeError("p argument should be a number or Variable") @@ -1258,17 +1253,106 @@ def dropout2d(x, p=0.5, training=True, data_format='NCHW', name=None): Examples: .. code-block:: python - import paddle - - x = paddle.randn(shape=(2, 3, 4, 5)).astype(paddle.float32) - y_train = paddle.nn.functional.dropout2d(x) #train - y_test = paddle.nn.functional.dropout2d(x, training=False) #test - for i in range(2): - for j in range(3): - print(x[i,j,:,:]) - print(y_train[i,j,:,:]) # may all 0 - print(y_test[i,j,:,:]) - + >>> import paddle + >>> paddle.seed(1) + >>> x = paddle.randn(shape=(2, 3, 4, 5)).astype(paddle.float32) + >>> y_train = paddle.nn.functional.dropout2d(x) #train + >>> y_test = paddle.nn.functional.dropout2d(x, training=False) #test + >>> for i in range(2): + ... for j in range(3): + ... print(x[i,j,:,:]) + ... print(y_train[i,j,:,:]) # may all 0 + ... print(y_test[i,j,:,:]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.30557564, 0.11855337, 0.41220093, -0.09968963, 1.50014710], + [ 1.24004936, -0.92485696, 0.08612321, 1.15149164, -0.09276631], + [ 1.22873247, -1.46587241, -1.30802727, 0.19496460, 1.73776841], + [ 0.40092674, 0.67630458, 0.72265440, 1.31720388, -1.41899264]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.61115128, 0.23710674, 0.82440186, -0.19937925, 3.00029421], + [ 2.48009872, -1.84971392, 0.17224643, 2.30298328, -0.18553263], + [ 2.45746493, -2.93174481, -2.61605453, 0.38992921, 3.47553682], + [ 0.80185348, 1.35260916, 1.44530880, 2.63440776, -2.83798528]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.30557564, 0.11855337, 0.41220093, -0.09968963, 1.50014710], + [ 1.24004936, -0.92485696, 0.08612321, 1.15149164, -0.09276631], + [ 1.22873247, -1.46587241, -1.30802727, 0.19496460, 1.73776841], + [ 0.40092674, 0.67630458, 0.72265440, 1.31720388, -1.41899264]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[ 0.88350385, -1.14767575, 0.51043051, -0.10051888, -0.61305630], + [-0.12084112, 0.48506257, -1.13189507, 0.62806708, -0.80003673], + [ 0.51513153, -0.08890446, 0.22753835, 0.11557858, 0.78117645], + [ 1.47505593, 0.84618902, -0.38528305, -1.05887091, 0.16592593]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[ 1.76700771, -2.29535151, 1.02086103, -0.20103776, -1.22611260], + [-0.24168225, 0.97012514, -2.26379013, 1.25613415, -1.60007346], + [ 1.03026307, -0.17780893, 0.45507669, 0.23115715, 1.56235290], + [ 2.95011187, 1.69237804, -0.77056611, -2.11774182, 0.33185187]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[ 0.88350385, -1.14767575, 0.51043051, -0.10051888, -0.61305630], + [-0.12084112, 0.48506257, -1.13189507, 0.62806708, -0.80003673], + [ 0.51513153, -0.08890446, 0.22753835, 0.11557858, 0.78117645], + [ 1.47505593, 0.84618902, -0.38528305, -1.05887091, 0.16592593]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-1.46668839, -0.38117948, 1.18678427, 0.38740095, 0.29117522], + [-0.13538910, -0.14527084, -0.04912176, -0.26063353, 0.23640174], + [ 0.45643106, 0.60587281, -1.03242552, 
-0.45319262, -1.57911122], + [-0.08732958, -0.75898546, 0.14563090, -1.73751652, -0.89109969]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0., -0., 0. , 0. , 0. ], + [-0., -0., -0., -0., 0. ], + [0. , 0. , -0., -0., -0.], + [-0., -0., 0. , -0., -0.]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-1.46668839, -0.38117948, 1.18678427, 0.38740095, 0.29117522], + [-0.13538910, -0.14527084, -0.04912176, -0.26063353, 0.23640174], + [ 0.45643106, 0.60587281, -1.03242552, -0.45319262, -1.57911122], + [-0.08732958, -0.75898546, 0.14563090, -1.73751652, -0.89109969]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.32110816, -0.76044011, 0.34456784, -0.39410326, 0.37896338], + [ 0.52747023, 0.72711533, 0.29204839, 0.72493637, 0.31128070], + [ 0.58046782, -1.78499067, -1.67504823, -0.38590902, -0.26243693], + [ 0.96669912, 0.43670532, -0.38109761, 0.78405094, -2.17882323]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0., -0., 0. , -0., 0. ], + [0. , 0. , 0. , 0. , 0. ], + [0. , -0., -0., -0., -0.], + [0. , 0. , -0., 0. , -0.]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.32110816, -0.76044011, 0.34456784, -0.39410326, 0.37896338], + [ 0.52747023, 0.72711533, 0.29204839, 0.72493637, 0.31128070], + [ 0.58046782, -1.78499067, -1.67504823, -0.38590902, -0.26243693], + [ 0.96669912, 0.43670532, -0.38109761, 0.78405094, -2.17882323]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[ 0.17168395, 0.45112833, 0.63307828, 2.38763475, -1.27247131], + [ 0.56171960, -1.09584677, 0.38300961, -0.57512099, 0.31011426], + [-0.95336407, -1.04852903, -0.21312937, -0.53549880, -0.00074209], + [ 2.22819090, 1.12403083, -0.04198794, -1.51167727, -0.42699185]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0. , 0. , 0. , 0. , -0.], + [0. , -0., 0. , -0., 0. ], + [-0., -0., -0., -0., -0.], + [0. , 0. , -0., -0., -0.]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[ 0.17168395, 0.45112833, 0.63307828, 2.38763475, -1.27247131], + [ 0.56171960, -1.09584677, 0.38300961, -0.57512099, 0.31011426], + [-0.95336407, -1.04852903, -0.21312937, -0.53549880, -0.00074209], + [ 2.22819090, 1.12403083, -0.04198794, -1.51167727, -0.42699185]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[ 0.62503546, -0.20989063, -0.22046235, -0.38679042, -1.02590704], + [ 1.04561794, 1.08428383, -0.52219963, -1.56003857, 0.89213932], + [-0.16578521, 0.14524542, -0.45563069, 0.48180851, 1.35843253], + [ 1.07669640, -0.84535235, -1.18651557, 0.79144061, -0.45565742]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0. , -0., -0., -0., -0.], + [0. , 0. , -0., -0., 0. ], + [-0., 0. , -0., 0. , 0. ], + [0. , -0., -0., 0. , -0.]]) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[ 0.62503546, -0.20989063, -0.22046235, -0.38679042, -1.02590704], + [ 1.04561794, 1.08428383, -0.52219963, -1.56003857, 0.89213932], + [-0.16578521, 0.14524542, -0.45563069, 0.48180851, 1.35843253], + [ 1.07669640, -0.84535235, -1.18651557, 0.79144061, -0.45565742]]) """ input_shape = x.shape if len(input_shape) != 4: @@ -1317,14 +1401,14 @@ def dropout3d(x, p=0.5, training=True, data_format='NCDHW', name=None): Examples: .. 
code-block:: python - import paddle + >>> import paddle - x = paddle.randn(shape=(2, 3, 4, 5, 6)).astype(paddle.float32) - y_train = paddle.nn.functional.dropout3d(x) #train - y_test = paddle.nn.functional.dropout3d(x, training=False) #test - print(x[0,0,:,:,:]) - print(y_train[0,0,:,:,:]) # may all 0 - print(y_test[0,0,:,:,:]) + >>> x = paddle.randn(shape=(2, 3, 4, 5, 6)).astype(paddle.float32) + >>> y_train = paddle.nn.functional.dropout3d(x) #train + >>> y_test = paddle.nn.functional.dropout3d(x, training=False) #test + >>> print(x[0,0,:,:,:]) + >>> print(y_train[0,0,:,:,:]) # may all 0 + >>> print(y_test[0,0,:,:,:]) """ @@ -1371,19 +1455,19 @@ def alpha_dropout(x, p=0.5, training=True, name=None): Examples: .. code-block:: python - import paddle - - x = paddle.to_tensor([[-1, 1], [-1, 1]]).astype(paddle.float32) - y_train = paddle.nn.functional.alpha_dropout(x, 0.5) - y_test = paddle.nn.functional.alpha_dropout(x, 0.5, training=False) - print(y_train) - # Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[-0.10721093, -0.77919382], - # [-0.10721093, 1.66559887]]) (randomly) - print(y_test) - # Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[-1., 1.], - # [-1., 1.]]) + >>> import paddle + >>> paddle.seed(1) + >>> x = paddle.to_tensor([[-1, 1], [-1, 1]]).astype(paddle.float32) + >>> y_train = paddle.nn.functional.alpha_dropout(x, 0.5) + >>> y_test = paddle.nn.functional.alpha_dropout(x, 0.5, training=False) + >>> print(y_train) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-0.77919382, 1.66559887], + [-0.10721093, -0.77919382]]) + >>> print(y_test) + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[-1., 1.], + [-1., 1.]]) """ if not isinstance(p, (float, int)): raise TypeError("p argument should be a float or int") @@ -1516,32 +1600,35 @@ def pad(x, pad, mode='constant', value=0.0, data_format="NCHW", name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - # example 1 - x_shape = (1, 1, 3) - x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1 - y = F.pad(x, [0, 0, 0, 0, 2, 3], value=1, mode='constant', data_format="NCL") - print(y) - # [[[1. 1. 1. 2. 3. 1. 1. 1.]]] - - # example 2 - x_shape = (1, 1, 3) - x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1 - y = F.pad(x, [2, 3], value=1, mode='constant', data_format="NCL") - print(y) - # [[[1. 1. 1. 2. 3. 1. 1. 1.]]] - - # example 3 - x_shape = (1, 1, 2, 3) - x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1 - y = F.pad(x, [1, 2, 1, 1], value=1, mode='circular') - print(y) - # [[[[6. 4. 5. 6. 4. 5.] - # [3. 1. 2. 3. 1. 2.] - # [6. 4. 5. 6. 4. 5.] - # [3. 1. 2. 3. 1. 
2.]]]] + >>> import paddle + >>> import paddle.nn.functional as F + + >>> # example 1 + >>> x_shape = (1, 1, 3) + >>> x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1 + >>> y = F.pad(x, [0, 0, 0, 0, 2, 3], value=1, mode='constant', data_format="NCL") + >>> print(y) + Tensor(shape=[1, 1, 8], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[1., 1., 1., 2., 3., 1., 1., 1.]]]) + + >>> # example 2 + >>> x_shape = (1, 1, 3) + >>> x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1 + >>> y = F.pad(x, [2, 3], value=1, mode='constant', data_format="NCL") + >>> print(y) + Tensor(shape=[1, 1, 8], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[1., 1., 1., 2., 3., 1., 1., 1.]]]) + + >>> # example 3 + >>> x_shape = (1, 1, 2, 3) + >>> x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1 + >>> y = F.pad(x, [1, 2, 1, 1], value=1, mode='circular') + >>> print(y) + Tensor(shape=[1, 1, 4, 6], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[6., 4., 5., 6., 4., 5.], + [3., 1., 2., 3., 1., 2.], + [6., 4., 5., 6., 4., 5.], + [3., 1., 2., 3., 1., 2.]]]]) """ assert mode in [ 'reflect', @@ -1713,16 +1800,18 @@ def zeropad2d(x, padding, data_format="NCHW", name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - x_shape = paddle.to_tensor([1, 1, 2, 3]) - x = paddle.arange(paddle.prod(x_shape), dtype="float32").reshape(x_shape) + 1 - y = F.zeropad2d(x, [1, 2, 1, 1]) - print(y) - # [[[[0. 0. 0. 0. 0. 0.] - # [0. 1. 2. 3. 0. 0.] - # [0. 4. 5. 6. 0. 0.] - # [0. 0. 0. 0. 0. 0.]]]] + >>> import paddle + >>> import paddle.nn.functional as F + + >>> x_shape = paddle.to_tensor([1, 1, 2, 3]) + >>> x = paddle.arange(paddle.prod(x_shape), dtype="float32").reshape(x_shape) + 1 + >>> y = F.zeropad2d(x, [1, 2, 1, 1]) + >>> print(y) + Tensor(shape=[1, 1, 4, 6], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[0., 0., 0., 0., 0., 0.], + [0., 1., 2., 3., 0., 0.], + [0., 4., 5., 6., 0., 0.], + [0., 0., 0., 0., 0., 0.]]]]) """ return pad( @@ -1767,16 +1856,17 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): Code Examples: .. code-block:: python - import paddle - import paddle.nn as nn + >>> import paddle + >>> import paddle.nn as nn - paddle.seed(1) - x1 = paddle.randn(shape=[2, 3]) - x2 = paddle.randn(shape=[2, 3]) + >>> paddle.seed(1) + >>> x1 = paddle.randn(shape=[2, 3]) + >>> x2 = paddle.randn(shape=[2, 3]) - result = paddle.nn.functional.cosine_similarity(x1, x2, axis=0) - print(result) - # [0.97689527, 0.99996042, -0.55138415] + >>> result = paddle.nn.functional.cosine_similarity(x1, x2, axis=0) + >>> print(result) + Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + [ 0.97689527, 0.99996042, -0.55138415]) """ w12 = sum(paddle.multiply(x1, x2), axis=axis) @@ -1822,21 +1912,29 @@ def linear(x, weight, bias=None, name=None): Examples: .. code-block:: python - import paddle - - x = paddle.randn((3, 2), dtype="float32") - # x: [[-0.32342386 -1.200079 ] - # [ 0.7979031 -0.90978354] - # [ 0.40597573 1.8095392 ]] - weight = paddle.full(shape=[2, 4], fill_value="0.5", dtype="float32", name="weight") - # weight: [[0.5 0.5 0.5 0.5] - # [0.5 0.5 0.5 0.5]] - bias = paddle.ones(shape=[4], dtype="float32", name="bias") - # bias: [1. 1. 1. 1.] 
- y = paddle.nn.functional.linear(x, weight, bias)
- # y: [[0.23824859 0.23824859 0.23824859 0.23824859]
- # [0.9440598 0.9440598 0.9440598 0.9440598 ]
- # [2.1077576 2.1077576 2.1077576 2.1077576 ]]
+ >>> import paddle
+ >>> paddle.seed(2023)
+ >>> x = paddle.randn((3, 2), dtype="float32")
+ >>> print(x)
+ Tensor(shape=[3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+ [[ 0.06132207, 1.11349595],
+ [ 0.41906244, -0.24858207],
+ [-1.85169315, -1.50370061]])
+ >>> weight = paddle.full(shape=[2, 4], fill_value="0.5", dtype="float32", name="weight")
+ >>> print(weight)
+ Tensor(shape=[2, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+ [[0.50000000, 0.50000000, 0.50000000, 0.50000000],
+ [0.50000000, 0.50000000, 0.50000000, 0.50000000]])
+ >>> bias = paddle.ones(shape=[4], dtype="float32", name="bias")
+ >>> print(bias)
+ Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
+ [1., 1., 1., 1.])
+ >>> y = paddle.nn.functional.linear(x, weight, bias)
+ >>> print(y)
+ Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+ [[ 1.58740902, 1.58740902, 1.58740902, 1.58740902],
+ [ 1.08524013, 1.08524013, 1.08524013, 1.08524013],
+ [-0.67769694, -0.67769694, -0.67769694, -0.67769694]])
"""
if in_dynamic_mode():
    # TODO(jiabin): using addmm for fast forward route
@@ -1921,17 +2019,17 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None):
Examples:
.. code-block:: python

- import paddle
- paddle.disable_static()
+ >>> import paddle
+ >>> paddle.disable_static()
- x = paddle.to_tensor([[[0, 1, 0],
- [ 1, 0, 1]]], dtype="float32", stop_gradient=False)
+ >>> x = paddle.to_tensor([[[0, 1, 0],
+ ... [ 1, 0, 1]]], dtype="float32", stop_gradient=False)
- output = paddle.nn.functional.label_smooth(x)
- print(output)
- # Tensor(shape=[1, 2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=False,
- # [[[0.03333334, 0.93333334, 0.03333334],
- # [0.93333334, 0.03333334, 0.93333334]]])
+ >>> output = paddle.nn.functional.label_smooth(x)
+ >>> print(output)
+ Tensor(shape=[1, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=False,
+ [[[0.03333334, 0.93333334, 0.03333334],
+ [0.93333334, 0.03333334, 0.93333334]]])
"""
if epsilon > 1.0 or epsilon < 0.0:
    raise ValueError("The value of epsilon must be between 0 and 1.")
@@ -2002,67 +2100,64 @@ class centers and the shape of sampled_class_center will be [num_positive_class_
.. 
code-block:: python :name: code-example1 - # CPU or single GPU - import paddle - num_classes = 20 - batch_size = 10 - num_samples = 6 - label = paddle.randint(low=0, high=num_classes, shape=[batch_size], dtype='int64') - remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample(label, num_classes, num_samples) - - print(label) - print(remapped_label) - print(sampled_class_index) - - # the output is - #Tensor(shape=[10], dtype=int64, place=CPUPlace, stop_gradient=True, - # [11, 5 , 1 , 3 , 12, 2 , 15, 19, 18, 19]) - #Tensor(shape=[10], dtype=int64, place=CPUPlace, stop_gradient=True, - # [4, 3, 0, 2, 5, 1, 6, 8, 7, 8]) - #Tensor(shape=[9], dtype=int64, place=CPUPlace, stop_gradient=True, - # [1 , 2 , 3 , 5 , 11, 12, 15, 18, 19]) + >>> # CPU or single GPU + >>> import paddle + >>> num_classes = 20 + >>> batch_size = 10 + >>> num_samples = 6 + >>> paddle.seed(2023) + >>> label = paddle.randint(low=0, high=num_classes, shape=[batch_size], dtype='int64') + >>> remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample(label, num_classes, num_samples) + >>> print(label) + Tensor(shape=[10], dtype=int64, place=Place(cpu), stop_gradient=True, + [17, 10, 5 , 18, 8 , 8 , 19, 14, 10, 14]) + >>> print(remapped_label) + Tensor(shape=[10], dtype=int64, place=Place(cpu), stop_gradient=True, + [4, 2, 0, 5, 1, 1, 6, 3, 2, 3]) + >>> print(sampled_class_index) + Tensor(shape=[7], dtype=int64, place=Place(cpu), stop_gradient=True, + [5 , 8 , 10, 14, 17, 18, 19]) .. code-block:: python :name: code-example2 - # required: distributed - # Multi GPU, test_class_center_sample.py - import paddle - import paddle.distributed as dist - strategy = dist.fleet.DistributedStrategy() - dist.fleet.init(is_collective=True, strategy=strategy) - batch_size = 10 - num_samples = 6 - rank_id = dist.get_rank() - # num_classes of each GPU can be different, e.g num_classes_list = [10, 8] - num_classes_list = [10, 10] - num_classes = paddle.sum(paddle.to_tensor(num_classes_list)) - label = paddle.randint(low=0, high=num_classes.item(), shape=[batch_size], dtype='int64') - label_list = [] - dist.all_gather(label_list, label) - label = paddle.concat(label_list, axis=0) - remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample(label, num_classes_list[rank_id], num_samples) - - print(label) - print(remapped_label) - print(sampled_class_index) - - #python -m paddle.distributed.launch --gpus=0,1 test_class_center_sample.py - # rank 0 output: - #Tensor(shape=[20], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ]) - #Tensor(shape=[20], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ]) - #Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True, - # [0, 2, 4, 8, 9, 3]) - - # rank 1 output: - #Tensor(shape=[20], dtype=int64, place=CUDAPlace(1), stop_gradient=True, - # [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ]) - #Tensor(shape=[20], dtype=int64, place=CUDAPlace(1), stop_gradient=True, - # [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ]) - #Tensor(shape=[7], dtype=int64, place=CUDAPlace(1), stop_gradient=True, - # [0, 1, 2, 3, 5, 7, 8]) + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> # required: distributed + >>> # Multi GPU, test_class_center_sample.py + >>> import paddle + >>> import paddle.distributed as dist + >>> 
strategy = dist.fleet.DistributedStrategy()
+ >>> dist.fleet.init(is_collective=True, strategy=strategy)
+ >>> batch_size = 10
+ >>> num_samples = 6
+ >>> rank_id = dist.get_rank()
+ >>> # num_classes of each GPU can be different, e.g. num_classes_list = [10, 8]
+ >>> num_classes_list = [10, 10]
+ >>> num_classes = paddle.sum(paddle.to_tensor(num_classes_list))
+ >>> label = paddle.randint(low=0, high=num_classes.item(), shape=[batch_size], dtype='int64')
+ >>> label_list = []
+ >>> dist.all_gather(label_list, label)
+ >>> label = paddle.concat(label_list, axis=0)
+ >>> remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample(label, num_classes_list[rank_id], num_samples)
+
+ >>> print(label)
+ >>> print(remapped_label)
+ >>> print(sampled_class_index)
+ >>> # python -m paddle.distributed.launch --gpus=0,1 test_class_center_sample.py
+ >>> # rank 0 output:
+ Tensor(shape=[20], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+ [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ])
+ Tensor(shape=[20], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+ [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ])
+ Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+ [0, 2, 4, 8, 9, 3])
+ >>> # rank 1 output:
+ Tensor(shape=[20], dtype=int64, place=CUDAPlace(1), stop_gradient=True,
+ [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ])
+ Tensor(shape=[20], dtype=int64, place=CUDAPlace(1), stop_gradient=True,
+ [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ])
+ Tensor(shape=[7], dtype=int64, place=CUDAPlace(1), stop_gradient=True,
+ [0, 1, 2, 3, 5, 7, 8])
"""
if not (group is False or group is None or hasattr(group, 'is_member')):
    raise ValueError(
@@ -2216,12 +2311,15 @@ def fold(
.. code-block:: python

- import paddle
- import paddle.nn.functional as F
+ >>> import paddle
+ >>> import paddle.nn.functional as F
- x = paddle.randn([2,3*2*2,12])
- y = F.fold(x, output_sizes=[4, 5], kernel_sizes=2)
- # y.shape = [2,3,4,5]
+ >>> x = paddle.randn([2,3*2*2,12])
+ >>> y = F.fold(x, output_sizes=[4, 5], kernel_sizes=2)
+ >>> print(y.shape)
+ [2, 3, 4, 5]
"""
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index 29bcbf880c2596..81af0d9be5f4fd 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -368,24 +368,24 @@ def conv1d(
Examples:
.. code-block:: python

- import paddle
- import paddle.nn.functional as F
-
- x = paddle.to_tensor([[[4, 8, 1, 9],
- [7, 2, 0, 9],
- [6, 9, 2, 6]]], dtype="float32")
- w = paddle.to_tensor([[[9, 3, 4],
- [0, 0, 7],
- [2, 5, 6]],
- [[0, 3, 4],
- [2, 9, 7],
- [5, 6, 8]]], dtype="float32")
-
- y = F.conv1d(x, w)
- print(y)
- # Tensor(shape=[1, 2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
- # [[[133., 238.],
- # [160., 211.]]])
+ >>> import paddle
+ >>> import paddle.nn.functional as F
+
+ >>> x = paddle.to_tensor([[[4, 8, 1, 9],
+ ... [7, 2, 0, 9],
+ ... [6, 9, 2, 6]]], dtype="float32")
+ >>> w = paddle.to_tensor([[[9, 3, 4],
+ ... [0, 0, 7],
+ ... [2, 5, 6]],
+ ... [[0, 3, 4],
+ ... [2, 9, 7],
+ ... 
[5, 6, 8]]], dtype="float32")
+
+ >>> y = F.conv1d(x, w)
+ >>> print(y)
+ Tensor(shape=[1, 2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+ [[[133., 238.],
+ [160., 211.]]])
"""
cudnn_version = get_cudnn_version()
if cudnn_version is not None:
@@ -632,16 +632,16 @@ def conv2d(
Examples:
.. code-block:: python

- import paddle
- import paddle.nn.functional as F
+ >>> import paddle
+ >>> import paddle.nn.functional as F
- x_var = paddle.randn((2, 3, 8, 8), dtype='float32')
- w_var = paddle.randn((6, 3, 3, 3), dtype='float32')
+ >>> x_var = paddle.randn((2, 3, 8, 8), dtype='float32')
+ >>> w_var = paddle.randn((6, 3, 3, 3), dtype='float32')
- y_var = F.conv2d(x_var, w_var)
+ >>> y_var = F.conv2d(x_var, w_var)
- print(y_var.shape)
- # [2, 6, 6, 6]
+ >>> print(y_var.shape)
+ [2, 6, 6, 6]
"""
# entry checks
if data_format not in ["NCHW", "NHWC"]:
@@ -887,20 +887,20 @@ def conv1d_transpose(
Examples:
.. code-block:: python

- import paddle
- import paddle.nn.functional as F
+ >>> import paddle
+ >>> import paddle.nn.functional as F
- # shape: (1, 2, 4)
- x = paddle.to_tensor([[[4, 0, 9, 7],
- [8, 0, 9, 2,]]], dtype="float32")
- # shape: (2, 1, 2)
- w = paddle.to_tensor([[[7, 0]],
- [[4, 2]]], dtype="float32")
+ >>> # shape: (1, 2, 4)
+ >>> x = paddle.to_tensor([[[4, 0, 9, 7],
+ ... [8, 0, 9, 2,]]], dtype="float32")
+ >>> # shape: (2, 1, 2)
+ >>> w = paddle.to_tensor([[[7, 0]],
+ ... [[4, 2]]], dtype="float32")
- y = F.conv1d_transpose(x, w)
- print(y)
- # Tensor(shape=[1, 1, 5], dtype=float32, place=Place(gpu:0), stop_gradient=True,
- # [[[60., 16., 99., 75., 4. ]]])
+ >>> y = F.conv1d_transpose(x, w)
+ >>> print(y)
+ Tensor(shape=[1, 1, 5], dtype=float32, place=Place(cpu), stop_gradient=True,
+ [[[60., 16., 99., 75., 4. ]]])
"""
cudnn_version = get_cudnn_version()
if cudnn_version is not None:
@@ -1183,16 +1183,16 @@ def conv2d_transpose(
Examples:
.. code-block:: python

- import paddle
- import paddle.nn.functional as F
+ >>> import paddle
+ >>> import paddle.nn.functional as F
- x_var = paddle.randn((2, 3, 8, 8), dtype='float32')
- w_var = paddle.randn((3, 6, 3, 3), dtype='float32')
+ >>> x_var = paddle.randn((2, 3, 8, 8), dtype='float32')
+ >>> w_var = paddle.randn((3, 6, 3, 3), dtype='float32')
- y_var = F.conv2d_transpose(x_var, w_var)
+ >>> y_var = F.conv2d_transpose(x_var, w_var)
- print(y_var.shape)
- # [2, 6, 10, 10]
+ >>> print(y_var.shape)
+ [2, 6, 10, 10]
"""
if data_format not in ['NCHW', 'NHWC']:
@@ -1476,16 +1476,16 @@ def conv3d(
Examples:
.. code-block:: python

- import paddle
- import paddle.nn.functional as F
+ >>> import paddle
+ >>> import paddle.nn.functional as F
- x_var = paddle.randn((2, 3, 8, 8, 8), dtype='float32')
- w_var = paddle.randn((6, 3, 3, 3, 3), dtype='float32')
+ >>> x_var = paddle.randn((2, 3, 8, 8, 8), dtype='float32')
+ >>> w_var = paddle.randn((6, 3, 3, 3, 3), dtype='float32')
- y_var = F.conv3d(x_var, w_var)
+ >>> y_var = F.conv3d(x_var, w_var)
- print(y_var.shape)
- # [2, 6, 6, 6, 6]
+ >>> print(y_var.shape)
+ [2, 6, 6, 6, 6]
"""
# entry check
if data_format not in ["NCDHW", "NDHWC"]:
@@ -1688,18 +1688,18 @@ def conv3d_transpose(
variable storing transposed convolution and non-linearity activation result.
Examples:
- .. code-block:: python
+ .. 
code-block:: python - import paddle - import paddle.nn.functional as F + >>> import paddle + >>> import paddle.nn.functional as F - x_var = paddle.randn((2, 3, 8, 8, 8), dtype='float32') - w_var = paddle.randn((3, 6, 3, 3, 3), dtype='float32') + >>> x_var = paddle.randn((2, 3, 8, 8, 8), dtype='float32') + >>> w_var = paddle.randn((3, 6, 3, 3, 3), dtype='float32') - y_var = F.conv3d_transpose(x_var, w_var) + >>> y_var = F.conv3d_transpose(x_var, w_var) - print(y_var.shape) - # [2, 6, 10, 10, 10] + >>> print(y_var.shape) + [2, 6, 10, 10, 10] """ # entry checks if data_format not in ["NCDHW", "NDHWC"]: diff --git a/python/paddle/nn/functional/distance.py b/python/paddle/nn/functional/distance.py index cb004fe9b622fe..64352cd051dead 100644 --- a/python/paddle/nn/functional/distance.py +++ b/python/paddle/nn/functional/distance.py @@ -59,14 +59,13 @@ def pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False, name=None): Examples: .. code-block:: python - import paddle - x = paddle.to_tensor([[1., 3.], [3., 5.]], dtype=paddle.float64) - y = paddle.to_tensor([[5., 6.], [7., 8.]], dtype=paddle.float64) - distance = paddle.nn.functional.pairwise_distance(x, y) - print(distance) - # Tensor(shape=[2], dtype=float64, place=Place(gpu:0), stop_gradient=True, - # [4.99999860, 4.99999860]) - + >>> import paddle + >>> x = paddle.to_tensor([[1., 3.], [3., 5.]], dtype=paddle.float64) + >>> y = paddle.to_tensor([[5., 6.], [7., 8.]], dtype=paddle.float64) + >>> distance = paddle.nn.functional.pairwise_distance(x, y) + >>> print(distance) + Tensor(shape=[2], dtype=float64, place=Place(cpu), stop_gradient=True, + [4.99999860, 4.99999860]) """ if in_dynamic_mode(): sub = _C_ops.subtract(x, y) diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index a5c1e2ef42e3a7..adfb976293987a 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -55,48 +55,46 @@ def diag_embed(input, offset=0, dim1=-2, dim2=-1): Examples: .. 
code-block:: python - import paddle - import paddle.nn.functional as F - - diag_embed_input = paddle.arange(6) - - diag_embed_output1 = F.diag_embed(diag_embed_input) - print(diag_embed_output1) - # Tensor(shape=[6, 6], dtype=int64, place=Place(cpu), stop_gradient=True, - # [[0, 0, 0, 0, 0, 0], - # [0, 1, 0, 0, 0, 0], - # [0, 0, 2, 0, 0, 0], - # [0, 0, 0, 3, 0, 0], - # [0, 0, 0, 0, 4, 0], - # [0, 0, 0, 0, 0, 5]]) - - diag_embed_output2 = F.diag_embed(diag_embed_input, offset=-1, dim1=0,dim2=1 ) - print(diag_embed_output2) - # Tensor(shape=[7, 7], dtype=int64, place=Place(cpu), stop_gradient=True, - # [[0, 0, 0, 0, 0, 0, 0], - # [0, 0, 0, 0, 0, 0, 0], - # [0, 1, 0, 0, 0, 0, 0], - # [0, 0, 2, 0, 0, 0, 0], - # [0, 0, 0, 3, 0, 0, 0], - # [0, 0, 0, 0, 4, 0, 0], - # [0, 0, 0, 0, 0, 5, 0]]) - - diag_embed_input_2dim = paddle.reshape(diag_embed_input,[2,3]) - print(diag_embed_input_2dim) - # Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, - # [[0, 1, 2], - # [3, 4, 5]]) - diag_embed_output3 = F.diag_embed(diag_embed_input_2dim,offset= 0, dim1=0, dim2=2 ) - print(diag_embed_output3) - # Tensor(shape=[3, 2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, - # [[[0, 0, 0], - # [3, 0, 0]], - - # [[0, 1, 0], - # [0, 4, 0]], - - # [[0, 0, 2], - # [0, 0, 5]]]) + >>> import paddle + >>> import paddle.nn.functional as F + + >>> diag_embed_input = paddle.arange(6) + + >>> diag_embed_output1 = F.diag_embed(diag_embed_input) + >>> print(diag_embed_output1) + Tensor(shape=[6, 6], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0], + [0, 0, 2, 0, 0, 0], + [0, 0, 0, 3, 0, 0], + [0, 0, 0, 0, 4, 0], + [0, 0, 0, 0, 0, 5]]) + + >>> diag_embed_output2 = F.diag_embed(diag_embed_input, offset=-1, dim1=0,dim2=1 ) + >>> print(diag_embed_output2) + Tensor(shape=[7, 7], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0, 0], + [0, 0, 2, 0, 0, 0, 0], + [0, 0, 0, 3, 0, 0, 0], + [0, 0, 0, 0, 4, 0, 0], + [0, 0, 0, 0, 0, 5, 0]]) + + >>> diag_embed_input_2dim = paddle.reshape(diag_embed_input,[2,3]) + >>> print(diag_embed_input_2dim) + Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0, 1, 2], + [3, 4, 5]]) + >>> diag_embed_output3 = F.diag_embed(diag_embed_input_2dim,offset= 0, dim1=0, dim2=2 ) + >>> print(diag_embed_output3) + Tensor(shape=[3, 2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[[0, 0, 0], + [3, 0, 0]], + [[0, 1, 0], + [0, 4, 0]], + [[0, 0, 2], + [0, 0, 5]]]) """ if not isinstance(input, Variable): input = assign(input) @@ -200,16 +198,16 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): Examples: .. code-block:: python - import paddle + >>> import paddle - lengths = paddle.to_tensor([10, 9, 8]) - mask = paddle.nn.functional.sequence_mask(lengths) + >>> lengths = paddle.to_tensor([10, 9, 8]) + >>> mask = paddle.nn.functional.sequence_mask(lengths) - print(mask) - # Tensor(shape=[3, 10], dtype=int64, place=Place(gpu:0), stop_gradient=True, - # [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - # [1, 1, 1, 1, 1, 1, 1, 1, 1, 0], - # [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]) + >>> print(mask) + Tensor(shape=[3, 10], dtype=int64, place=Place(cpu), stop_gradient=True, + [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]) """ @@ -296,14 +294,24 @@ def gather_tree(ids, parents): Examples: .. 
code-block:: python

- import paddle
+ >>> import paddle
- ids = paddle.to_tensor([[[2, 2], [6, 1]], [[3, 9], [6, 1]], [[0, 1], [9, 0]]])
+ >>> ids = paddle.to_tensor([[[2, 2], [6, 1]], [[3, 9], [6, 1]], [[0, 1], [9, 0]]])
- parents = paddle.to_tensor([[[0, 0], [1, 1]], [[1, 0], [1, 0]], [[0, 0], [0, 1]]])
+ >>> parents = paddle.to_tensor([[[0, 0], [1, 1]], [[1, 0], [1, 0]], [[0, 0], [0, 1]]])
+
+ >>> final_sequences = paddle.nn.functional.gather_tree(ids, parents)
+ >>> print(final_sequences)
+ Tensor(shape=[3, 2, 2], dtype=int64, place=Place(cpu), stop_gradient=True,
+ [[[2, 2],
+ [1, 6]],
+ [[3, 3],
+ [6, 1]],
+ [[0, 1],
+ [9, 0]]])
- final_sequences = paddle.nn.functional.gather_tree(ids, parents)
- # [[[2, 2], [1, 6]], [[3, 3], [6, 1]], [[0, 1], [9, 0]]]
"""
if ids.ndim != 3:
@@ -388,11 +396,11 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"):
Examples:
.. code-block:: python

- import paddle
- import paddle.nn.functional as F
+ >>> import paddle
+ >>> import paddle.nn.functional as F
- input = paddle.randn([6, 4, 2, 2])
- out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2)
+ >>> input = paddle.randn([6, 4, 2, 2])
+ >>> out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2)
"""
if data_format not in ["NCHW", "NHWC"]:
    raise ValueError(
diff --git a/python/paddle/nn/functional/flash_attention.py b/python/paddle/nn/functional/flash_attention.py
index d6eb44e66e251a..822348d5be8524 100644
--- a/python/paddle/nn/functional/flash_attention.py
+++ b/python/paddle/nn/functional/flash_attention.py
@@ -181,13 +181,12 @@ def flash_attention(
Examples:
.. code-block:: python

- # required: skiptest
- import paddle
+ >>> import paddle
- q = paddle.rand((1, 128, 2, 16), dtype=paddle.float16)
+ >>> paddle.seed(1)
+ >>> q = paddle.rand((1, 128, 2, 16))
- output = paddle.nn.functional.flash_attention(q, q, q, 0.9, False, False)
- print(output)
+ >>> output = paddle.nn.functional.flash_attention.flash_attention(q, q, q, 0.9, False, False)
"""
head_dim = query.shape[3]
sdp_func_name = _select_sdp(head_dim)
@@ -340,13 +339,12 @@ def flash_attn_unpadded(
Examples:
.. code-block:: python

- # required: skiptest
- import paddle
-
- q = paddle.rand((1, 128, 2, 16), dtype=paddle.float16)
+ >>> import paddle
+ >>> paddle.seed(1)
+ >>> q = paddle.rand((1, 128, 2, 16))
- output = paddle.nn.functional.flash_attn_unpadded(q, q, q, 0.9, False, False)
- print(output)
+ >>> output = paddle.nn.functional.flash_attention.flash_attn_unpadded(q, q, q, 0.9, False, False)
+ >>> print(output)
"""
if in_dynamic_mode():
    (
diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py
index 1a556a7c5106e6..81ae8efd37b050 100644
--- a/python/paddle/nn/functional/norm.py
+++ b/python/paddle/nn/functional/norm.py
@@ -53,28 +53,29 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
.. code-block:: python

- import paddle
- import paddle.nn.functional as F
-
- paddle.disable_static()
- x = paddle.arange(6, dtype="float32").reshape([2,3])
- y = F.normalize(x)
- print(y)
- # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True,
- # [[0. , 0.44721359, 0.89442718],
- # [0.42426404, 0.56568539, 0.70710671]])
-
- y = F.normalize(x, p=1.5)
- print(y)
- # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True,
- # [[0. 
, 0.40862012, 0.81724024], - # [0.35684016, 0.47578689, 0.59473360]]) - - y = F.normalize(x, axis=0) - print(y) - # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[0. , 0.24253564, 0.37139067], - # [1. , 0.97014254, 0.92847669]]) + >>> import paddle + >>> import paddle.nn.functional as F + + >>> paddle.disable_static() + >>> x = paddle.arange(6, dtype="float32").reshape([2,3]) + >>> y = F.normalize(x) + >>> print(y) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0. , 0.44721359, 0.89442718], + [0.42426404, 0.56568539, 0.70710671]]) + + >>> y = F.normalize(x, p=1.5) + >>> print(y) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0. , 0.40862012, 0.81724024], + [0.35684016, 0.47578689, 0.59473360]]) + + >>> y = F.normalize(x, axis=0) + >>> print(y) + Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0. , 0.24253564, 0.37139067], + [1. , 0.97014254, 0.92847669]]) + """ if in_dygraph_mode(): @@ -148,31 +149,29 @@ def batch_norm( Examples: .. code-block:: python - import paddle - - x = paddle.arange(12, dtype="float32").reshape([2, 1, 2, 3]) - print(x) - # Tensor(shape=[2, 1, 2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[[[0. , 1. , 2. ], - # [3. , 4. , 5. ]]], - - # [[[6. , 7. , 8. ], - # [9. , 10., 11.]]]]) - - running_mean = paddle.to_tensor([0], dtype="float32") - running_variance = paddle.to_tensor([1], dtype="float32") - weight = paddle.to_tensor([2], dtype="float32") - bias = paddle.to_tensor([1], dtype="float32") - - batch_norm_out = paddle.nn.functional.batch_norm(x, running_mean, - running_variance, weight, bias) - print(batch_norm_out) - # Tensor(shape=[2, 1, 2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[[[1. , 2.99998999 , 4.99997997 ], - # [6.99996948 , 8.99995995 , 10.99994946]]], + >>> import paddle + + >>> x = paddle.arange(12, dtype="float32").reshape([2, 1, 2, 3]) + >>> print(x) + Tensor(shape=[2, 1, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[0. , 1. , 2. ], + [3. , 4. , 5. ]]], + [[[6. , 7. , 8. ], + [9. , 10., 11.]]]]) + >>> running_mean = paddle.to_tensor([0], dtype="float32") + >>> running_variance = paddle.to_tensor([1], dtype="float32") + >>> weight = paddle.to_tensor([2], dtype="float32") + >>> bias = paddle.to_tensor([1], dtype="float32") + + >>> batch_norm_out = paddle.nn.functional.batch_norm(x, running_mean, + ... running_variance, weight, bias) + >>> print(batch_norm_out) + Tensor(shape=[2, 1, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[1. , 2.99998999 , 4.99997997 ], + [6.99996948 , 8.99995995 , 10.99994946]]], + [[[12.99993896, 14.99992943, 16.99991989], + [18.99990845, 20.99989891, 22.99988937]]]]) - # [[[12.99993896, 14.99992943, 16.99991989], - # [18.99990845, 20.99989891, 22.99988937]]]]) """ assert len(x.shape) >= 2, "input dim must be larger than 1" @@ -300,11 +299,21 @@ def layer_norm( .. 
code-block:: python - import paddle + >>> import paddle + >>> paddle.seed(2023) + >>> x = paddle.rand((2, 2, 2, 3)) + >>> layer_norm_out = paddle.nn.functional.layer_norm(x, x.shape[1:]) + >>> print(layer_norm_out) + Tensor(shape=[2, 2, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[ 0.87799639, -0.32706568, -1.23529339], + [ 1.01540327, -0.66222906, -0.72354043]], + [[ 1.24183702, 0.45458138, -0.33506915], + [ 0.41468468, 1.26852870, -1.98983312]]], + [[[ 0.02837803, 1.27684665, -0.90110683], + [-0.94709367, -0.15110941, -1.16546965]], + [[-0.82010198, 0.11218392, -0.86506516], + [ 1.09489357, 0.19107464, 2.14656854]]]]) - x = paddle.rand((2, 2, 2, 3)) - layer_norm_out = paddle.nn.functional.layer_norm(x, x.shape[1:]) - print(layer_norm_out) """ input_shape = list(x.shape) input_ndim = len(input_shape) @@ -415,12 +424,21 @@ def instance_norm( .. code-block:: python - import paddle - - x = paddle.rand((2, 2, 2, 3)) - instance_norm_out = paddle.nn.functional.instance_norm(x) - - print(instance_norm_out) + >>> import paddle + >>> paddle.seed(2023) + >>> x = paddle.rand((2, 2, 2, 3)) + >>> instance_norm_out = paddle.nn.functional.instance_norm(x) + + >>> print(instance_norm_out) + Tensor(shape=[2, 2, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[[ 1.25768495, -0.18054862, -1.26451230], + [ 1.42167914, -0.58056390, -0.65373862]], + [[ 0.95882601, 0.25075224, -0.45947552], + [ 0.21486834, 0.98283297, -1.94780385]]], + [[[ 0.40697321, 1.90885782, -0.71117985], + [-0.76650119, 0.19105314, -1.02920341]], + [[-1.06926346, -0.18710862, -1.11180890], + [ 0.74275863, -0.11246002, 1.73788261]]]]) """ if in_dygraph_mode(): @@ -510,13 +528,15 @@ def local_response_norm( Examples: - .. code-block:: python + .. code-block:: python + + >>> import paddle - import paddle + >>> x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32") + >>> y = paddle.nn.functional.local_response_norm(x, size=5) + >>> print(y.shape) + [3, 3, 112, 112] - x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32") - y = paddle.nn.functional.local_response_norm(x, size=5) - print(y.shape) # [3, 3, 112, 112] """ if not in_dynamic_mode(): check_variable_and_dtype( diff --git a/python/paddle/nn/initializer/uniform.py b/python/paddle/nn/initializer/uniform.py index cd64a15b7519ed..a46ae2a41e3cdd 100644 --- a/python/paddle/nn/initializer/uniform.py +++ b/python/paddle/nn/initializer/uniform.py @@ -158,24 +158,33 @@ class Uniform(UniformInitializer): Examples: .. code-block:: python - import paddle - - data = paddle.ones(shape=[3, 1, 2], dtype='float32') - weight_attr = paddle.framework.ParamAttr( - name="linear_weight", - initializer=paddle.nn.initializer.Uniform(low=-0.5, high=0.5)) - bias_attr = paddle.framework.ParamAttr( - name="linear_bias", - initializer=paddle.nn.initializer.Uniform(low=-0.5, high=0.5)) - linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr) - # linear.weight: [[-0.46245047 0.05260676] - # [ 0.38054508 0.29169726]] - # linear.bias: [-0.2734719 0.23939109] - - res = linear(data) - # res: [[[-0.3553773 0.5836951]] - # [[-0.3553773 0.5836951]] - # [[-0.3553773 0.5836951]]] + >>> import paddle + >>> paddle.seed(1) + >>> data = paddle.ones(shape=[3, 1, 2], dtype='float32') + >>> weight_attr = paddle.framework.ParamAttr( + ... name="linear_weight", + ... initializer=paddle.nn.initializer.Uniform(low=-0.5, high=0.5)) + >>> bias_attr = paddle.framework.ParamAttr( + ... name="linear_bias", + ... 
initializer=paddle.nn.initializer.Uniform(low=-0.5, high=0.5)) + >>> linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr) + >>> print(linear.weight) + Parameter containing: + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=False, + [[-0.48212373, 0.26492310], + [ 0.17605734, -0.45379421]]) + + >>> print(linear.bias) + Parameter containing: + Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=False, + [-0.11236754, 0.46462214]) + + >>> res = linear(data) + >>> print(res) + Tensor(shape=[3, 1, 2], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[-0.41843393, 0.27575102]], + [[-0.41843393, 0.27575102]], + [[-0.41843393, 0.27575102]]]) """ def __init__(self, low=-1.0, high=1.0, name=None): diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index 6d17c029f587c2..60242ecf5b27c3 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -214,24 +214,33 @@ class XavierNormal(XavierInitializer): Examples: .. code-block:: python - import paddle - - data = paddle.ones(shape=[3, 1, 2], dtype='float32') - weight_attr = paddle.framework.ParamAttr( - name="linear_weight", - initializer=paddle.nn.initializer.XavierNormal()) - bias_attr = paddle.framework.ParamAttr( - name="linear_bias", - initializer=paddle.nn.initializer.XavierNormal()) - linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr) - # inear.weight: [[ 0.06910077 -0.18103665] - # [-0.02546741 -1.0402188 ]] - # linear.bias: [-0.5012929 0.12418364] - - res = linear(data) - # res: [[[-0.4576595 -1.0970719]] - # [[-0.4576595 -1.0970719]] - # [[-0.4576595 -1.0970719]]] + >>> import paddle + >>> paddle.seed(1) + >>> data = paddle.ones(shape=[3, 1, 2], dtype='float32') + >>> weight_attr = paddle.framework.ParamAttr( + ... name="linear_weight", + ... initializer=paddle.nn.initializer.XavierNormal()) + >>> bias_attr = paddle.framework.ParamAttr( + ... name="linear_bias", + ... initializer=paddle.nn.initializer.XavierNormal()) + >>> linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr) + >>> print(linear.weight) + Parameter containing: + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=False, + [[-0.21607460, 0.08382989], + [ 0.29147008, -0.07049121]]) + + >>> print(linear.bias) + Parameter containing: + Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=False, + [1.06076419, 0.87684733]) + + >>> res = linear(data) + >>> print(res) + Tensor(shape=[3, 1, 2], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[1.13615966, 0.89018601]], + [[1.13615966, 0.89018601]], + [[1.13615966, 0.89018601]]]) """ def __init__(self, fan_in=None, fan_out=None, name=None): @@ -266,24 +275,32 @@ class XavierUniform(XavierInitializer): Examples: .. 
code-block:: python - import paddle - - data = paddle.ones(shape=[3, 1, 2], dtype='float32') - weight_attr = paddle.framework.ParamAttr( - name="linear_weight", - initializer=paddle.nn.initializer.XavierUniform()) - bias_attr = paddle.framework.ParamAttr( - name="linear_bias", - initializer=paddle.nn.initializer.XavierUniform()) - linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr) - # linear.weight: [[-0.04229349 -1.1248565 ] - # [-0.10789523 -0.5938053 ]] - # linear.bias: [ 1.1983747 -0.40201235] - - res = linear(data) - # res: [[[ 1.0481861 -2.1206741]] - # [[ 1.0481861 -2.1206741]] - # [[ 1.0481861 -2.1206741]]] + >>> import paddle + >>> paddle.seed(1) + >>> data = paddle.ones(shape=[3, 1, 2], dtype='float32') + >>> weight_attr = paddle.framework.ParamAttr( + ... name="linear_weight", + ... initializer=paddle.nn.initializer.XavierUniform()) + >>> bias_attr = paddle.framework.ParamAttr( + ... name="linear_bias", + ... initializer=paddle.nn.initializer.XavierUniform()) + >>> linear = paddle.nn.Linear(2, 2, weight_attr=weight_attr, bias_attr=bias_attr) + >>> print(linear.weight) + Parameter containing: + Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=False, + [[-1.18095720, 0.64892638], + [ 0.43125069, -1.11156428]]) + >>> print(linear.bias) + Parameter containing: + Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=False, + [-0.27524316, 1.13808715]) + + >>> res = linear(data) + >>> print(res) + Tensor(shape=[3, 1, 2], dtype=float32, place=Place(cpu), stop_gradient=False, + [[[-1.02494967, 0.67544925]], + [[-1.02494967, 0.67544925]], + [[-1.02494967, 0.67544925]]]) """ def __init__(self, fan_in=None, fan_out=None, name=None): diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py index b810a7b8c43cb1..4ce0ed643c3430 100644 --- a/python/paddle/reader/__init__.py +++ b/python/paddle/reader/__init__.py @@ -31,9 +31,9 @@ items. It can be any function with no parameter that creates a iterable (anything can be used in :code:`for x in iterable`)\: -.. code-block:: python +.. code-block:: python - iterable = data_reader() + >>> iterable = data_reader() Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of @@ -43,23 +43,23 @@ An example implementation for single item data reader creator: -.. code-block:: python +.. code-block:: python - def reader_creator_random_image(width, height): - def reader(): - while True: - yield numpy.random.uniform(-1, 1, size=width*height) - return reader + >>> def reader_creator_random_image(width, height): + ... def reader(): + ... while True: + ... yield numpy.random.uniform(-1, 1, size=width*height) + ... return reader An example implementation for multiple item data reader creator: -.. code-block:: python +.. code-block:: python - def reader_creator_random_image_and_label(width, height, label): - def reader(): - while True: - yield numpy.random.uniform(-1, 1, size=width*height), label - return reader + >>> def reader_creator_random_image_and_label(width, height, label): + ... def reader(): + ... while True: + ... yield numpy.random.uniform(-1, 1, size=width*height), label + ... return reader """ diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index bd40c4553e89d4..fba81b96f63553 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -60,18 +60,20 @@ def cache(reader): Examples: .. 
code-block:: python

- import paddle
-
- def reader():
- for i in range(3):
- yield i
-
- # All data is cached into memory
- cached_reader = paddle.io.cache(reader)
-
- # Output: 0 1 2
- for i in cached_reader():
- print(i)
+ >>> import paddle
+
+ >>> def reader():
+ ... for i in range(3):
+ ... yield i
+ ...
+ >>> # All data is cached into memory
+ >>> cached_reader = paddle.fluid.io.cache(reader)
+
+ >>> for i in cached_reader():
+ ... print(i)
+ 0
+ 1
+ 2
"""
all_data = tuple(reader())
@@ -103,14 +105,14 @@ def map_readers(func, *readers):
.. code-block:: python

- import paddle.reader
- d = {"h": 0, "i": 1}
- def func(x):
- return d[x]
- def reader():
- yield "h"
- yield "i"
- map_reader_result = paddle.reader.map_readers(func, reader)
+ >>> import paddle.reader
+ >>> d = {"h": 0, "i": 1}
+ >>> def func(x):
+ ... return d[x]
+ >>> def reader():
+ ... yield "h"
+ ... yield "i"
+ >>> map_reader_result = paddle.reader.map_readers(func, reader)
"""
def reader():
@@ -142,15 +144,14 @@ def shuffle(reader, buf_size):
Examples:
.. code-block:: python

- import paddle.fluid as fluid
-
- def reader():
- for i in range(5):
- yield i
- shuffled_reader = fluid.io.shuffle(reader, 3)
- for e in shuffled_reader():
- print(e)
- # outputs are 0~4 unordered arrangement
+ >>> # doctest: +SKIP('outputs are 0~4 unordered arrangement')
+ >>> import paddle
+ >>> def reader():
+ ... for i in range(5):
+ ... yield i
+ >>> shuffled_reader = paddle.reader.decorator.shuffle(reader, 3)
+ >>> for e in shuffled_reader():
+ ... print(e)
+ >>> # outputs are 0~4 unordered arrangement
"""
def data_reader():
@@ -195,29 +196,28 @@ def chain(*readers):
callable: the new chained data reader.
Examples:
- .. code-block:: python
-
- import paddle
-
- def reader_creator_3(start):
- def reader():
- for i in range(start, start + 3):
- yield [i, i, i]
- return reader
-
- c = paddle.reader.chain(reader_creator_3(0), reader_creator_3(10), reader_creator_3(20))
- for e in c():
- print(e)
- # Output:
- # [0, 0, 0]
- # [1, 1, 1]
- # [2, 2, 2]
- # [10, 10, 10]
- # [11, 11, 11]
- # [12, 12, 12]
- # [20, 20, 20]
- # [21, 21, 21]
- # [22, 22, 22]
+ .. code-block:: python
+
+ >>> import paddle
+
+ >>> def reader_creator_3(start):
+ ... def reader():
+ ... for i in range(start, start + 3):
+ ... yield [i, i, i]
+ ... return reader
+ ...
+ >>> c = paddle.reader.chain(reader_creator_3(0), reader_creator_3(10), reader_creator_3(20))
+ >>> for e in c():
+ ... print(e)
+ [0, 0, 0]
+ [1, 1, 1]
+ [2, 2, 2]
+ [10, 10, 10]
+ [11, 11, 11]
+ [12, 12, 12]
+ [20, 20, 20]
+ [21, 21, 21]
+ [22, 22, 22]
"""
@@ -257,13 +257,12 @@ def compose(*readers, **kwargs):
Examples:
.. code-block:: python

- import paddle.fluid as fluid
- def reader_creator_10(dur):
- def reader():
- for i in range(10):
- yield i
- return reader
- reader = fluid.io.compose(reader_creator_10(0), reader_creator_10(0))
+ >>> import paddle
+ >>> def reader_creator_10(dur):
+ ... def reader():
+ ... for i in range(10):
+ ... yield i
+ ... return reader
+ >>> reader = paddle.reader.decorator.compose(reader_creator_10(0), reader_creator_10(0))
"""
check_alignment = kwargs.pop('check_alignment', True)
@@ -311,18 +310,21 @@ def buffered(reader, size):
Examples:
.. code-block:: python

- import paddle
-
- def reader():
- for i in range(3):
- yield i
-
- # Create a buffered reader, and the buffer size is 2.
- buffered_reader = paddle.io.buffered(reader, 2)
-
- # Output: 0 1 2
- for i in buffered_reader():
- print(i)
+ >>> import paddle
+
+ >>> def reader():
+ ... for i in range(3):
+ ... yield i
+ ... 
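+ >>> # (Note: 'buffered' works by prefetching entries from 'reader' into an
+ >>> # internal buffer that holds at most 'size' items, so the consumer below
+ >>> # can iterate while the underlying reader keeps producing.)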
+ >>> # Create a buffered reader, and the buffer size is 2.
+ >>> buffered_reader = paddle.reader.decorator.buffered(reader, 2)
+
+ >>> # Output: 0 1 2
+ >>> for i in buffered_reader():
+ ... print(i)
+ 0
+ 1
+ 2
"""
class EndSignal:
@@ -373,15 +375,17 @@ def firstn(reader, n):
Examples:
.. code-block:: python

- import paddle.fluid as fluid
-
- def reader():
- for i in range(100):
- yield i
- firstn_reader = fluid.io.firstn(reader, 5)
- for e in firstn_reader():
- print(e)
- # the outputs are: 0 1 2 3 4
+ >>> import paddle
+ >>> def reader():
+ ... for i in range(100):
+ ... yield i
+ >>> firstn_reader = paddle.reader.decorator.firstn(reader, 5)
+ >>> for e in firstn_reader():
+ ... print(e)
+ 0
+ 1
+ 2
+ 3
+ 4
"""
# TODO(yuyang18): Check if just drop the reader, could clean the opened
@@ -523,60 +527,56 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
Example:
- .. code-block:: python
- import paddle
- import paddle.fluid as fluid
- from paddle.fluid.io import multiprocess_reader
- import numpy as np
-
- sample_files = ['sample_file_1', 'sample_file_2']
-
- def fake_input_files():
- with open(sample_files[0], 'w') as f:
- np.savez(f, a=np.array([1, 2]), b=np.array([3, 4]), c=np.array([5, 6]), d=np.array([7, 8]))
- with open(sample_files[1], 'w') as f:
- np.savez(f, a=np.array([9, 10]), b=np.array([11, 12]), c=np.array([13, 14]))
-
-
- def generate_reader(file_name):
- # load data file
- def _impl():
- data = np.load(file_name)
- for item in sorted(data.files):
- yield data[item],
- return _impl
-
- if __name__ == '__main__':
- # generate sample input files
- fake_input_files()
-
- with fluid.program_guard(fluid.Program(), fluid.Program()):
- place = fluid.CPUPlace()
- # the 1st 2 is batch size
-
- image = paddle.static.data(name='image', dtype='int64', shape=[2, 1, 2])
- paddle.static.Print(image)
- # print detailed tensor info of image variable
-
- reader = fluid.io.PyReader(feed_list=[image], capacity=2)
-
- decorated_reader = multiprocess_reader(
- [generate_reader(sample_files[0]), generate_reader(sample_files[1])], False)
-
- reader.decorate_sample_generator(decorated_reader, batch_size=2, places=[place])
-
- exe = fluid.Executor(place)
- exe.run(fluid.default_startup_program())
-
- for data in reader():
- res = exe.run(feed=data, fetch_list=[image])
- print(res[0])
- # print below content in this case
- # [[[1 2]], [[3 4]]]
- # [[[5 6]], [[7 8]]]
- # [[[9 10]], [[11 12]]]
- # [13,14] will be dropped
+ .. code-block:: python
+
+ >>> import paddle
+ >>> import paddle.fluid as fluid
+ >>> import numpy as np
+
+ >>> sample_files = ['sample_file_1', 'sample_file_2']
+
+ >>> def fake_input_files():
+ ... with open(sample_files[0], 'wb') as f:
+ ... np.savez(f, a=np.array([1, 2]), b=np.array([3, 4]), c=np.array([5, 6]), d=np.array([7, 8]))
+ ... with open(sample_files[1], 'wb') as f:
+ ... np.savez(f, a=np.array([9, 10]), b=np.array([11, 12]), c=np.array([13, 14]))
+ ...
+ ...
+ >>> def generate_reader(file_name):
+ ... # load data file
+ ... def _impl():
+ ... data = np.load(file_name)
+ ... for item in sorted(data.files):
+ ... yield data[item],
+ ... return _impl
+ ...
+ >>> if __name__ == '__main__':
+ ... # generate sample input files
+ ... fake_input_files()
+ ...
+ ... with fluid.program_guard(fluid.Program(), fluid.Program()):
+ ... place = fluid.CPUPlace()
+ ... # the 1st 2 is batch size
+ ...
+ ... image = paddle.static.data(name='image', dtype='int64', shape=[2, 1, 2])
+ ... paddle.static.Print(image)
+ ... # print detailed tensor info of image variable
+ ...
+ ... 
reader = fluid.io.PyReader(feed_list=[image], capacity=2) + ... + ... decorated_reader = paddle.reader.multiprocess_reader( + ... [generate_reader(sample_files[0]), generate_reader(sample_files[1])], False) + ... + ... reader.decorate_sample_generator(decorated_reader, batch_size=2, places=[place]) + ... + ... exe = fluid.Executor(place) + ... exe.run(fluid.default_startup_program()) + ... + ... for data in reader(): + ... res = exe.run(feed=data, fetch_list=[image]) + ... print(res[0]) + [[[1 2]], [[3 4]]] + [[[5 6]], [[7 8]]] + [[[9 10]], [[11 12]]] """ if sys.platform == 'win32': diff --git a/python/paddle/regularizer.py b/python/paddle/regularizer.py index 9a746cb4bf53ca..501a06e32f89e7 100644 --- a/python/paddle/regularizer.py +++ b/python/paddle/regularizer.py @@ -67,42 +67,42 @@ class L1Decay(WeightDecayRegularizer): .. code-block:: python :name: code-example1 - # Example1: set Regularizer in optimizer - import paddle - from paddle.regularizer import L1Decay - - linear = paddle.nn.Linear(10, 10) - inp = paddle.rand(shape=[10, 10], dtype="float32") - out = linear(inp) - loss = paddle.mean(out) - beta1 = paddle.to_tensor([0.9], dtype="float32") - beta2 = paddle.to_tensor([0.99], dtype="float32") - momentum = paddle.optimizer.Momentum( - learning_rate=0.1, - parameters=linear.parameters(), - weight_decay=L1Decay(0.0001)) - back = out.backward() - momentum.step() - momentum.clear_grad() + >>> # Example1: set Regularizer in optimizer + >>> import paddle + >>> from paddle.regularizer import L1Decay + + >>> linear = paddle.nn.Linear(10, 10) + >>> inp = paddle.rand(shape=[10, 10], dtype="float32") + >>> out = linear(inp) + >>> loss = paddle.mean(out) + >>> beta1 = paddle.to_tensor([0.9], dtype="float32") + >>> beta2 = paddle.to_tensor([0.99], dtype="float32") + >>> momentum = paddle.optimizer.Momentum( + ... learning_rate=0.1, + ... parameters=linear.parameters(), + ... weight_decay=L1Decay(0.0001)) + >>> back = out.backward() + >>> momentum.step() + >>> momentum.clear_grad() .. code-block:: python :name: code-example2 - # Example2: set Regularizer in parameters - # Set L1 regularization in parameters. - # Global regularizer does not take effect on my_conv2d for this case. - from paddle.nn import Conv2D - from paddle import ParamAttr - from paddle.regularizer import L2Decay - - my_conv2d = Conv2D( - in_channels=10, - out_channels=10, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(regularizer=L2Decay(coeff=0.01)), - bias_attr=False) + >>> # Example2: set Regularizer in parameters + >>> # Set L1 regularization in parameters. + >>> # Global regularizer does not take effect on my_conv2d for this case. + >>> from paddle.nn import Conv2D + >>> from paddle import ParamAttr + >>> from paddle.regularizer import L1Decay + + >>> my_conv2d = Conv2D( + ... in_channels=10, + ... out_channels=10, + ... kernel_size=1, + ... stride=1, + ... padding=0, + ... weight_attr=ParamAttr(regularizer=L1Decay(coeff=0.01)), + ... bias_attr=False) """ def __init__(self, coeff=0.0): @@ -178,40 +178,41 @@ class L2Decay(WeightDecayRegularizer): .. 
code-block:: python :name: code-example1 - # Example1: set Regularizer in optimizer - import paddle - from paddle.regularizer import L2Decay - linear = paddle.nn.Linear(10, 10) - inp = paddle.rand(shape=[10, 10], dtype="float32") - out = linear(inp) - loss = paddle.mean(out) - beta1 = paddle.to_tensor([0.9], dtype="float32") - beta2 = paddle.to_tensor([0.99], dtype="float32") - momentum = paddle.optimizer.Momentum( - learning_rate=0.1, - parameters=linear.parameters(), - weight_decay=L2Decay(0.0001)) - back = out.backward() - momentum.step() - momentum.clear_grad() + >>> # Example1: set Regularizer in optimizer + >>> import paddle + >>> from paddle.regularizer import L2Decay + >>> linear = paddle.nn.Linear(10, 10) + >>> inp = paddle.rand(shape=[10, 10], dtype="float32") + >>> out = linear(inp) + >>> loss = paddle.mean(out) + >>> beta1 = paddle.to_tensor([0.9], dtype="float32") + >>> beta2 = paddle.to_tensor([0.99], dtype="float32") + >>> momentum = paddle.optimizer.Momentum( + ... learning_rate=0.1, + ... parameters=linear.parameters(), + ... weight_decay=L2Decay(0.0001)) + >>> back = out.backward() + >>> momentum.step() + >>> momentum.clear_grad() .. code-block:: python :name: code-example2 - # Example2: set Regularizer in parameters - # Set L2 regularization in parameters. - # Global regularizer does not take effect on my_conv2d for this case. - from paddle.nn import Conv2D - from paddle import ParamAttr - from paddle.regularizer import L2Decay - - my_conv2d = Conv2D( - in_channels=10, - out_channels=10, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(regularizer=L2Decay(coeff=0.01)), - bias_attr=False) + + >>> # Example2: set Regularizer in parameters + >>> # Set L2 regularization in parameters. + >>> # Global regularizer does not take effect on my_conv2d for this case. + >>> from paddle.nn import Conv2D + >>> from paddle import ParamAttr + >>> from paddle.regularizer import L2Decay + + >>> my_conv2d = Conv2D( + ... in_channels=10, + ... out_channels=10, + ... kernel_size=1, + ... stride=1, + ... padding=0, + ... weight_attr=ParamAttr(regularizer=L2Decay(coeff=0.01)), + ... bias_attr=False) """ def __init__(self, coeff=0.0): diff --git a/python/paddle/signal.py b/python/paddle/signal.py index f61df7a2b07a36..d1dc910f043ea4 100644 --- a/python/paddle/signal.py +++ b/python/paddle/signal.py @@ -49,60 +49,57 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): Examples: - .. 
code-block:: python - - import paddle - from paddle.signal import frame - - # 1D - x = paddle.arange(8) - y0 = frame(x, frame_length=4, hop_length=2, axis=-1) # [4, 3] - # [[0, 2, 4], - # [1, 3, 5], - # [2, 4, 6], - # [3, 5, 7]] - - y1 = frame(x, frame_length=4, hop_length=2, axis=0) # [3, 4] - # [[0, 1, 2, 3], - # [2, 3, 4, 5], - # [4, 5, 6, 7]] - - # 2D - x0 = paddle.arange(16).reshape([2, 8]) - y0 = frame(x0, frame_length=4, hop_length=2, axis=-1) # [2, 4, 3] - # [[[0, 2, 4], - # [1, 3, 5], - # [2, 4, 6], - # [3, 5, 7]], - # - # [[8 , 10, 12], - # [9 , 11, 13], - # [10, 12, 14], - # [11, 13, 15]]] - - x1 = paddle.arange(16).reshape([8, 2]) - y1 = frame(x1, frame_length=4, hop_length=2, axis=0) # [3, 4, 2] - # [[[0 , 1 ], - # [2 , 3 ], - # [4 , 5 ], - # [6 , 7 ]], - # - # [4 , 5 ], - # [6 , 7 ], - # [8 , 9 ], - # [10, 11]], - # - # [8 , 9 ], - # [10, 11], - # [12, 13], - # [14, 15]]] - - # > 2D - x0 = paddle.arange(32).reshape([2, 2, 8]) - y0 = frame(x0, frame_length=4, hop_length=2, axis=-1) # [2, 2, 4, 3] - - x1 = paddle.arange(32).reshape([8, 2, 2]) - y1 = frame(x1, frame_length=4, hop_length=2, axis=0) # [3, 4, 2, 2] + .. code-block:: python + + >>> import paddle + >>> from paddle import signal + + >>> # 1D + >>> x = paddle.arange(8) + >>> y0 = signal.frame(x, frame_length=4, hop_length=2, axis=-1) + >>> print(y0) + Tensor(shape=[4, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0, 2, 4], + [1, 3, 5], + [2, 4, 6], + [3, 5, 7]]) + + >>> y1 = signal.frame(x, frame_length=4, hop_length=2, axis=0) + >>> print(y1) + Tensor(shape=[3, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0, 1, 2, 3], + [2, 3, 4, 5], + [4, 5, 6, 7]]) + + >>> # 2D + >>> x0 = paddle.arange(16).reshape([2, 8]) + >>> y0 = signal.frame(x0, frame_length=4, hop_length=2, axis=-1) + >>> print(y0) + Tensor(shape=[2, 4, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + [[[0 , 2 , 4 ], + [1 , 3 , 5 ], + [2 , 4 , 6 ], + [3 , 5 , 7 ]], + [[8 , 10, 12], + [9 , 11, 13], + [10, 12, 14], + [11, 13, 15]]]) + + >>> x1 = paddle.arange(16).reshape([8, 2]) + >>> y1 = signal.frame(x1, frame_length=4, hop_length=2, axis=0) + >>> print(y1.shape) + [3, 4, 2] + + >>> # > 2D + >>> x0 = paddle.arange(32).reshape([2, 2, 8]) + >>> y0 = signal.frame(x0, frame_length=4, hop_length=2, axis=-1) + >>> print(y0.shape) + [2, 2, 4, 3] + + >>> x1 = paddle.arange(32).reshape([8, 2, 2]) + >>> y1 = signal.frame(x1, frame_length=4, hop_length=2, axis=0) + >>> print(y1.shape) + [3, 4, 2, 2] """ if axis not in [0, -1]: raise ValueError(f'Unexpected axis: {axis}. It should be 0 or -1.') @@ -167,36 +164,53 @@ def overlap_add(x, hop_length, axis=-1, name=None): Examples: - .. code-block:: python - - import paddle - from paddle.signal import overlap_add - - # 2D - x0 = paddle.arange(16).reshape([8, 2]) - # [[0 , 1 ], - # [2 , 3 ], - # [4 , 5 ], - # [6 , 7 ], - # [8 , 9 ], - # [10, 11], - # [12, 13], - # [14, 15]] - y0 = overlap_add(x0, hop_length=2, axis=-1) # [10] - # [0 , 2 , 5 , 9 , 13, 17, 21, 25, 13, 15] - - x1 = paddle.arange(16).reshape([2, 8]) - # [[0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 ], - # [8 , 9 , 10, 11, 12, 13, 14, 15]] - y1 = overlap_add(x1, hop_length=2, axis=0) # [10] - # [0 , 1 , 10, 12, 14, 16, 18, 20, 14, 15] - - # > 2D - x0 = paddle.arange(32).reshape([2, 1, 8, 2]) - y0 = overlap_add(x0, hop_length=2, axis=-1) # [2, 1, 10] - - x1 = paddle.arange(32).reshape([2, 8, 1, 2]) - y1 = overlap_add(x1, hop_length=2, axis=0) # [10, 1, 2] + .. 
code-block:: python + + >>> import paddle + >>> from paddle.signal import overlap_add + + >>> # 2D + >>> x0 = paddle.arange(16).reshape([8, 2]) + >>> print(x0) + Tensor(shape=[8, 2], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0 , 1 ], + [2 , 3 ], + [4 , 5 ], + [6 , 7 ], + [8 , 9 ], + [10, 11], + [12, 13], + [14, 15]]) + + + >>> y0 = overlap_add(x0, hop_length=2, axis=-1) + >>> print(y0) + Tensor(shape=[10], dtype=int64, place=Place(cpu), stop_gradient=True, + [0 , 2 , 5 , 9 , 13, 17, 21, 25, 13, 15]) + + >>> x1 = paddle.arange(16).reshape([2, 8]) + >>> print(x1) + Tensor(shape=[2, 8], dtype=int64, place=Place(cpu), stop_gradient=True, + [[0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 ], + [8 , 9 , 10, 11, 12, 13, 14, 15]]) + + + >>> y1 = overlap_add(x1, hop_length=2, axis=0) + >>> print(y1) + Tensor(shape=[10], dtype=int64, place=Place(cpu), stop_gradient=True, + [0 , 1 , 10, 12, 14, 16, 18, 20, 14, 15]) + + + >>> # > 2D + >>> x0 = paddle.arange(32).reshape([2, 1, 8, 2]) + >>> y0 = overlap_add(x0, hop_length=2, axis=-1) + >>> print(y0.shape) + [2, 1, 10] + + >>> x1 = paddle.arange(32).reshape([2, 8, 1, 2]) + >>> y1 = overlap_add(x1, hop_length=2, axis=0) + >>> print(y1.shape) + [10, 1, 2] """ if axis not in [0, -1]: raise ValueError(f'Unexpected axis: {axis}. It should be 0 or -1.') @@ -289,18 +303,30 @@ def stft( Examples: .. code-block:: python - import paddle - from paddle.signal import stft + >>> import paddle + >>> from paddle.signal import stft + + >>> # real-valued input + >>> x = paddle.randn([8, 48000], dtype=paddle.float64) + >>> y1 = stft(x, n_fft=512) + >>> print(y1.shape) + [8, 257, 376] + + >>> y2 = stft(x, n_fft=512, onesided=False) + >>> print(y2.shape) + [8, 512, 376] - # real-valued input - x = paddle.randn([8, 48000], dtype=paddle.float64) - y1 = stft(x, n_fft=512) # [8, 257, 376] - y2 = stft(x, n_fft=512, onesided=False) # [8, 512, 376] + >>> # complex input + >>> x = paddle.randn([8, 48000], dtype=paddle.float64) + \ + ... paddle.randn([8, 48000], dtype=paddle.float64)*1j + >>> print(x.shape) + [8, 48000] + >>> print(x.dtype) + paddle.complex128 - # complex input - x = paddle.randn([8, 48000], dtype=paddle.float64) + \ - paddle.randn([8, 48000], dtype=paddle.float64)*1j # [8, 48000] complex128 - y1 = stft(x, n_fft=512, center=False, onesided=False) # [8, 512, 372] + >>> y1 = stft(x, n_fft=512, center=False, onesided=False) + >>> print(y1.shape) + [8, 512, 372] """ @@ -465,20 +491,25 @@ def istft( Examples: .. 
code-block:: python - import numpy as np - import paddle - from paddle.signal import stft, istft + >>> import numpy as np + >>> import paddle + >>> from paddle.signal import stft, istft - paddle.seed(0) + >>> paddle.seed(0) - # STFT - x = paddle.randn([8, 48000], dtype=paddle.float64) - y = stft(x, n_fft=512) # [8, 257, 376] + >>> # STFT + >>> x = paddle.randn([8, 48000], dtype=paddle.float64) + >>> y = stft(x, n_fft=512) + >>> print(y.shape) + [8, 257, 376] - # ISTFT - x_ = istft(y, n_fft=512) # [8, 48000] + >>> # ISTFT + >>> x_ = istft(y, n_fft=512) + >>> print(x_.shape) + [8, 48000] - np.allclose(x, x_) # True + >>> np.allclose(x, x_) + True """ check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'istft') diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py index 8cdcb5f551a7c3..30a853336c976e 100644 --- a/python/paddle/static/input.py +++ b/python/paddle/static/input.py @@ -98,44 +98,47 @@ def data(name, shape, dtype=None, lod_level=0): [2.]]], dtype=float32)] """ - helper = LayerHelper('data', **locals()) - check_type(name, 'name', (bytes, str), 'data') - check_type(shape, 'shape', (list, tuple), 'data') - - shape = list(shape) - for i in range(len(shape)): - if shape[i] is None: - shape[i] = -1 - - if dtype: - out = helper.create_global_variable( - name=name, - shape=shape, - dtype=dtype, - type=core.VarDesc.VarType.LOD_TENSOR, - stop_gradient=True, - lod_level=lod_level, - is_data=True, - need_check_feed=True, - ) - - else: - out = helper.create_global_variable( - name=name, - shape=shape, - dtype=paddle.get_default_dtype(), - type=core.VarDesc.VarType.LOD_TENSOR, - stop_gradient=True, - lod_level=lod_level, - is_data=True, - need_check_feed=True, - ) - dtype = paddle.get_default_dtype() if paddle.ir.core._use_new_ir_api(): + if not dtype: + dtype = paddle.get_default_dtype() ir_dtype = paddle.ir.core.convert_np_dtype_to_dtype_(dtype) return paddle._ir_ops.data(name, shape, ir_dtype, core.Place()) + else: + helper = LayerHelper('data', **locals()) + check_type(name, 'name', (bytes, str), 'data') + check_type(shape, 'shape', (list, tuple), 'data') + + shape = list(shape) + for i in range(len(shape)): + if shape[i] is None: + shape[i] = -1 + + if dtype: + out = helper.create_global_variable( + name=name, + shape=shape, + dtype=dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + stop_gradient=True, + lod_level=lod_level, + is_data=True, + need_check_feed=True, + ) + + else: + out = helper.create_global_variable( + name=name, + shape=shape, + dtype=paddle.get_default_dtype(), + type=core.VarDesc.VarType.LOD_TENSOR, + stop_gradient=True, + lod_level=lod_level, + is_data=True, + need_check_feed=True, + ) + is_new_ir_mode = os.environ.get("FLAGS_enable_new_ir_in_executor", None) if evaluate_flag(is_new_ir_mode): helper = LayerHelper('data', **locals()) diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py index feaaf905bc70b5..1b6f65e28e5eda 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -187,7 +187,7 @@ def append_fetch_ops( ) -def normalize_program(program, feed_vars, fetch_vars): +def normalize_program(program, feed_vars, fetch_vars, **kwargs): """ Normalize/Optimize a program according to feed_vars and fetch_vars. @@ -196,6 +196,8 @@ def normalize_program(program, feed_vars, fetch_vars): program(Program): Specify a program you want to optimize. feed_vars(Tensor | list[Tensor]): Variables needed by inference. fetch_vars(Tensor | list[Tensor]): Variables returned by inference. 
+ kwargs: Supported keys including ``skip_prune_program``. + - skip_prune_program(bool): whether to skip pruning the program. Defaults to False. Returns: Program: Normalized/Optimized program. @@ -277,9 +279,12 @@ def normalize_program(program, feed_vars, fetch_vars): copy_program.desc.flush() feed_var_names = [var.name for var in feed_vars] - copy_program = copy_program._prune_with_input( - feeded_var_names=feed_var_names, targets=fetch_vars - ) + + skip_prune_program = kwargs.get('skip_prune_program', False) + if not skip_prune_program: + copy_program = copy_program._prune_with_input( + feeded_var_names=feed_var_names, targets=fetch_vars + ) copy_program = copy_program._inference_optimize(prune_read_op=True) fetch_var_names = [var.name for var in fetch_vars] prepend_feed_ops(copy_program, feed_var_names) @@ -569,7 +574,12 @@ def save_inference_model( program = _get_valid_program(kwargs.get('program', None)) clip_extra = kwargs.get('clip_extra', True) - program = normalize_program(program, feed_vars, fetch_vars) + program = normalize_program( + program, + feed_vars, + fetch_vars, + skip_prune_program=kwargs.get('skip_prune_program', False), + ) # serialize and save program legacy_format = kwargs.get('legacy_format', False) diff --git a/python/paddle/sysconfig.py b/python/paddle/sysconfig.py index 720f07a4ce2fcc..e717eaa05d38da 100644 --- a/python/paddle/sysconfig.py +++ b/python/paddle/sysconfig.py @@ -27,8 +27,8 @@ def get_include(): Examples: .. code-block:: python - import paddle - include_dir = paddle.sysconfig.get_include() + >>> import paddle + >>> include_dir = paddle.sysconfig.get_include() """ import paddle @@ -46,8 +46,8 @@ def get_lib(): Examples: .. code-block:: python - import paddle - include_dir = paddle.sysconfig.get_lib() + >>> import paddle + >>> lib_dir = paddle.sysconfig.get_lib() """ import paddle diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index c379136e26287f..11fb7e6f47607f 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -953,73 +953,69 @@ def einsum(equation, *operands): Examples: .. 
code-block:: python - import paddle - paddle.seed(102) - x = paddle.rand([4]) - y = paddle.rand([5]) - - # sum - print(paddle.einsum('i->', x)) - # Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # 1.95791852) - - # dot - print(paddle.einsum('i,i->', x, x)) - # Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # 1.45936954) - - # outer - print(paddle.einsum("i,j->ij", x, y)) - # Tensor(shape=[4, 5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[0.00079869, 0.00120950, 0.00136844, 0.00187187, 0.00192194], - # [0.23455200, 0.35519385, 0.40186870, 0.54970956, 0.56441545], - # [0.11773264, 0.17828843, 0.20171674, 0.27592498, 0.28330654], - # [0.32897076, 0.49817693, 0.56364071, 0.77099484, 0.79162055]]) - - A = paddle.rand([2, 3, 2]) - B = paddle.rand([2, 2, 3]) - - # transpose - print(paddle.einsum('ijk->kji', A)) - # Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.95649719, 0.49684682], - # [0.80071914, 0.46258664], - # [0.49814570, 0.33383518]], - # - # [[0.07637714, 0.29374704], - # [0.51470858, 0.51907635], - # [0.99066722, 0.55802226]]]) - - # batch matrix multiplication - print(paddle.einsum('ijk, ikl->ijl', A,B)) - # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.32172769, 0.50617385, 0.41394392], - # [0.51736701, 0.49921003, 0.38730967], - # [0.69078457, 0.42282537, 0.30161136]], - # - # [[0.32043904, 0.18164253, 0.27810261], - # [0.50226176, 0.24512935, 0.39881429], - # [0.51476848, 0.23367381, 0.39229113]]]) - - # Ellipsis transpose - print(paddle.einsum('...jk->...kj', A)) - # Tensor(shape=[2, 2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.95649719, 0.80071914, 0.49814570], - # [0.07637714, 0.51470858, 0.99066722]], - # - # [[0.49684682, 0.46258664, 0.33383518], - # [0.29374704, 0.51907635, 0.55802226]]]) - - # Ellipsis batch matrix multiplication - print(paddle.einsum('...jk, ...kl->...jl', A,B)) - # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.32172769, 0.50617385, 0.41394392], - # [0.51736701, 0.49921003, 0.38730967], - # [0.69078457, 0.42282537, 0.30161136]], - # - # [[0.32043904, 0.18164253, 0.27810261], - # [0.50226176, 0.24512935, 0.39881429], - # [0.51476848, 0.23367381, 0.39229113]]]) + >>> import paddle + >>> paddle.seed(102) + >>> x = paddle.rand([4]) + >>> y = paddle.rand([5]) + + >>> # sum + >>> print(paddle.einsum('i->', x)) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 1.81225157) + + >>> # dot + >>> print(paddle.einsum('i,i->', x, x)) + Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True, + 1.13530672) + + >>> # outer + >>> print(paddle.einsum("i,j->ij", x, y)) + Tensor(shape=[4, 5], dtype=float32, place=Place(cpu), stop_gradient=True, + [[0.26443148, 0.05962684, 0.25360870, 0.21900642, 0.56994802], + [0.20955276, 0.04725220, 0.20097610, 0.17355499, 0.45166403], + [0.35836059, 0.08080698, 0.34369346, 0.29680005, 0.77240014], + [0.00484230, 0.00109189, 0.00464411, 0.00401047, 0.01043695]]) + + >>> A = paddle.rand([2, 3, 2]) + >>> B = paddle.rand([2, 2, 3]) + + >>> # transpose + >>> print(paddle.einsum('ijk->kji', A)) + Tensor(shape=[2, 3, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[0.50882483, 0.56067896], + [0.84598064, 0.36310029], + [0.55289471, 0.33273944]], + [[0.04836850, 0.73811269], + [0.29769155, 0.28137168], + [0.84636718, 0.67521429]]]) + + >>> # batch matrix multiplication + >>> 
print(paddle.einsum('ijk, ikl->ijl', A,B)) + Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[0.36321065, 0.42009076, 0.40849245], + [0.74353045, 0.79189068, 0.81345987], + [0.90488225, 0.79786193, 0.93451476]], + [[0.12680580, 1.06945944, 0.79821426], + [0.07774551, 0.55068684, 0.44512171], + [0.08053084, 0.80583858, 0.56031936]]]) + + >>> # Ellipsis transpose + >>> print(paddle.einsum('...jk->...kj', A)) + Tensor(shape=[2, 2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[0.50882483, 0.84598064, 0.55289471], + [0.04836850, 0.29769155, 0.84636718]], + [[0.56067896, 0.36310029, 0.33273944], + [0.73811269, 0.28137168, 0.67521429]]]) + + >>> # Ellipsis batch matrix multiplication + >>> print(paddle.einsum('...jk, ...kl->...jl', A,B)) + Tensor(shape=[2, 3, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + [[[0.36321065, 0.42009076, 0.40849245], + [0.74353045, 0.79189068, 0.81345987], + [0.90488225, 0.79786193, 0.93451476]], + [[0.12680580, 1.06945944, 0.79821426], + [0.07774551, 0.55068684, 0.44512171], + [0.08053084, 0.80583858, 0.56031936]]]) """ import os diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index c9009b37ab71a7..58a3b1fc0ea897 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -53,6 +53,8 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): "float32", "float64", "uint16", + "complex64", + "complex128", ], op_name, ) @@ -70,6 +72,8 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): "float32", "float64", "uint16", + "complex64", + "complex128", ], op_name, ) @@ -114,8 +118,8 @@ def logical_and(x, y, out=None, name=None): .. _Introduction to Tensor: ../../guides/beginner/tensor_en.html#chapter5-broadcasting-of-tensor Args: - x (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float16, float32, float64. - y (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float16, float32, float64. + x (Tensor): the input tensor, its data type should be one of bool, int8, int16, int32, int64, float16, float32, float64, complex64, complex128. + y (Tensor): the input tensor, its data type should be one of bool, int8, int16, int32, int64, float16, float32, float64, complex64, complex128. out(Tensor, optional): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -173,8 +177,8 @@ def logical_or(x, y, out=None, name=None): .. _Introduction to Tensor: ../../guides/beginner/tensor_en.html#chapter5-broadcasting-of-tensor Args: - x (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float16, float32, float64. - y (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float16, float32, float64. + x (Tensor): the input tensor, its data type should be one of bool, int8, int16, int32, int64, float16, float32, float64, complex64, complex128. + y (Tensor): the input tensor, its data type should be one of bool, int8, int16, int32, int64, float16, float32, float64, complex64, complex128. out(Tensor): The ``Variable`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. 
The default value is None, and a new ``Tensor`` will be created to save the output. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -234,8 +238,8 @@ def logical_xor(x, y, out=None, name=None): .. _Introduction to Tensor: ../../guides/beginner/tensor_en.html#chapter5-broadcasting-of-tensor Args: - x (Tensor): the input tensor, it's data type should be one of bool, int8, int16, int32, int64, float16, float32, float64. - y (Tensor): the input tensor, it's data type should be one of bool, int8, int16, int32, int64, float16, float32, float64. + x (Tensor): the input tensor, its data type should be one of bool, int8, int16, int32, int64, float16, float32, float64, complex64, complex128. + y (Tensor): the input tensor, its data type should be one of bool, int8, int16, int32, int64, float16, float32, float64, complex64, complex128. out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -296,7 +300,7 @@ def logical_not(x, out=None, name=None): .. _Introduction to Tensor: ../../guides/beginner/tensor_en.html#chapter5-broadcasting-of-tensor Args: - x(Tensor): Operand of logical_not operator. Must be a Tensor of type bool, int8, int16, in32, in64, float16, float32, or float64. + x(Tensor): Operand of logical_not operator. Must be a Tensor of type bool, int8, int16, int32, int64, float16, float32, float64, complex64, or complex128. out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor` will be created to save the output. name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -1316,8 +1320,8 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): two tensors are elementwise equal within a tolerance. Args: - x(Tensor): The input tensor, it's data type should be float16, float32, float64. - y(Tensor): The input tensor, it's data type should be float16, float32, float64. + x(Tensor): The input tensor, its data type should be float16, float32, float64, complex64, complex128. + y(Tensor): The input tensor, its data type should be float16, float32, float64, complex64, complex128. rtol(rtoltype, optional): The relative tolerance. Default: :math:`1e-5` . atol(atoltype, optional): The absolute tolerance. Default: :math:`1e-8` . equal_nan(equalnantype, optional): If :math:`True` , then two :math:`NaNs` will be compared as equal. Default: :math:`False` . 
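For context, the dtype additions above mean the ``paddle.logical_*`` ops and ``paddle.isclose`` now also accept ``complex64``/``complex128`` inputs. A minimal sketch of what this permits, assuming a build with this patch applied (the logical ops presumably treat a complex value as True when it is nonzero, mirroring the bool conversion used for the other dtypes):

.. code-block:: python

    >>> import paddle
    >>> a = paddle.to_tensor([1 + 1j, 0 + 0j], dtype="complex64")
    >>> b = paddle.to_tensor([1 + 0j, 2 + 2j], dtype="complex64")
    >>> # nonzero complex values behave as True (assumption noted above)
    >>> res = paddle.logical_and(a, b)
    >>> # isclose now compares complex inputs elementwise within rtol/atol
    >>> flags = paddle.isclose(a, a, rtol=1e-05, atol=1e-08)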
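Likewise, the ``normalize_program``/``save_inference_model`` change earlier in this diff threads a ``skip_prune_program`` keyword through to optionally skip pruning the program to its feed/fetch variables. A minimal usage sketch; the path and the small network here are illustrative, not part of the patch:

.. code-block:: python

    >>> import paddle
    >>> from paddle import static
    >>> paddle.enable_static()
    >>> exe = static.Executor()
    >>> x = static.data(name="x", shape=[None, 10], dtype="float32")
    >>> out = static.nn.fc(x, size=1)
    >>> exe.run(static.default_startup_program())
    >>> # keep the full program instead of pruning it before saving
    >>> static.save_inference_model(
    ...     "./infer/model", [x], [out], exe, skip_prune_program=True)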
@@ -1355,10 +1359,16 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): return _C_ops.isclose(x, y, rtol, atol, equal_nan) else: check_variable_and_dtype( - x, "input", ['float16', 'float32', 'float64'], 'isclose' + x, + "input", + ['float16', 'float32', 'float64', 'complex64', 'complex128'], + 'isclose', ) check_variable_and_dtype( - y, "input", ['float16', 'float32', 'float64'], 'isclose' + y, + "input", + ['float16', 'float32', 'float64', 'complex64', 'complex128'], + 'isclose', ) check_type(rtol, 'rtol', float, 'isclose') check_type(atol, 'atol', float, 'isclose') diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 086edfd4dd70b3..d620b57124207e 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -270,6 +270,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): "x", [ 'float16', + 'bfloat16', 'uint16', 'float32', 'float64', @@ -278,6 +279,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): 'int32', 'int64', 'uint8', + 'complex64', + 'complex128', ], "scale", ) @@ -435,7 +438,7 @@ def multiplex(inputs, index, name=None): def scale_(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): """ Inplace version of ``scale`` API, the output Tensor will be inplaced with input ``x``. - Please refer to :ref:`api_tensor_scale`. + Please refer to :ref:`api_paddle_scale`. """ if in_dynamic_mode(): return _C_ops.scale_(x, scale, float(bias), bias_after_scale) @@ -685,7 +688,7 @@ def add(x, y, name=None): def add_(x, y, name=None): """ Inplace version of ``add`` API, the output Tensor will be inplaced with input ``x``. - Please refer to :ref:`api_tensor_add`. + Please refer to :ref:`api_paddle_add`. """ out_shape = broadcast_shape(x.shape, y.shape) @@ -825,7 +828,7 @@ def subtract(x, y, name=None): def subtract_(x, y, name=None): """ Inplace version of ``subtract`` API, the output Tensor will be inplaced with input ``x``. - Please refer to :ref:`api_tensor_subtract`. + Please refer to :ref:`api_paddle_subtract`. """ out_shape = broadcast_shape(x.shape, y.shape) @@ -1003,7 +1006,7 @@ def remainder(x, y, name=None): def remainder_(x, y, name=None): r""" Inplace version of ``remainder`` API, the output Tensor will be inplaced with input ``x``. - Please refer to :ref:`api_tensor_remainder`. + Please refer to :ref:`api_paddle_remainder`. """ out_shape = broadcast_shape(x.shape, y.shape) if out_shape != x.shape: @@ -1086,7 +1089,7 @@ def multiply(x, y, name=None): def multiply_(x, y, name=None): """ Inplace version of ``multiply`` API, the output Tensor will be inplaced with input ``x``. - Please refer to :ref:`api_tensor_multiply`. + Please refer to :ref:`api_paddle_multiply`. """ out_shape = broadcast_shape(x.shape, y.shape) @@ -2287,7 +2290,7 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): def addmm_(input, x, y, beta=1.0, alpha=1.0, name=None): """ Inplace version of ``addmm`` API, the output Tensor will be inplaced with input ``x``. - Please refer to :ref:`api_label_addmm`. + Please refer to :ref:`api_paddle_addmm`. """ input_shape = input.shape x_shape = x.shape @@ -3575,7 +3578,7 @@ def clip(x, min=None, max=None, name=None): def clip_(x, min=None, max=None, name=None): """ Inplace version of ``clip`` API, the output Tensor will be inplaced with input ``x``. - Please refer to :ref:`api_tensor_clip`. + Please refer to :ref:`api_paddle_clip`. 
""" fmin = float(np.finfo(np.float32).min) fmax = float(np.finfo(np.float32).max) @@ -4638,7 +4641,7 @@ def tanh(x, name=None): def tanh_(x, name=None): r""" Inplace version of ``tanh`` API, the output Tensor will be inplaced with input ``x``. - Please refer to :ref:`api_tensor_tanh`. + Please refer to :ref:`api_paddle_tanh`. """ return _C_ops.tanh_(x) @@ -5287,7 +5290,7 @@ def lerp(x, y, weight, name=None): def lerp_(x, y, weight, name=None): r""" Inplace version of ``lerp`` API, the output Tensor will be inplaced with input ``x``. - Please refer to :ref:`api_tensor_lerp`. + Please refer to :ref:`api_paddle_lerp`. """ out_shape = broadcast_shape(x.shape, y.shape) check_type(weight, 'weight', (float, paddle.Tensor, Variable), 'lerp') @@ -5347,7 +5350,7 @@ def erfinv(x, name=None): def erfinv_(x, name=None): r""" Inplace version of ``erfinv`` API, the output Tensor will be inplaced with input ``x``. - Please refer to :ref:`api_tensor_erfinv`. + Please refer to :ref:`api_paddle_erfinv`. """ check_type(x, 'x', (paddle.Tensor, Variable), 'erfinv') return _C_ops.erfinv_(x) diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt index 817508e57034cf..eef02ecb28cc9b 100644 --- a/test/auto_parallel/CMakeLists.txt +++ b/test/auto_parallel/CMakeLists.txt @@ -162,6 +162,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_rule_based_tuner MODULES test_rule_based_tuner) py_test_modules(test_dist_tensor MODULES test_dist_tensor) py_test_modules(test_shard_tensor_api MODULES test_shard_tensor_api) + py_test_modules(test_cost_interface MODULES test_cost_interface) # End of unittests WITH single card WITHOUT timeout endif() diff --git a/test/auto_parallel/spmd_rules/test_matmul_rule.py b/test/auto_parallel/spmd_rules/test_matmul_rule.py index 59e47113302db9..1cf2f49860b331 100644 --- a/test/auto_parallel/spmd_rules/test_matmul_rule.py +++ b/test/auto_parallel/spmd_rules/test_matmul_rule.py @@ -13,23 +13,23 @@ # limitations under the License. 
import unittest +from collections import OrderedDict -from paddle.distributed.auto_parallel.static.completion import get_spmd_rule from paddle.distributed.auto_parallel.static.dist_attribute import ( DistTensorSpec, TensorDistAttr, ) from paddle.distributed.fleet import auto +from paddle.framework import core class TestMatmulSPMDRule(unittest.TestCase): def setUp(self): - self.rule = get_spmd_rule("matmul") + # Once all spmd rules are replaced by the phi implementations, we can + # restore the api name to `get_spmd_rule` + self.rule = core.get_phi_spmd_rule("matmul") - self.attrs = { - 'trans_x': False, - 'trans_y': False, - } + self.attrs = OrderedDict([('trans_x', False), ('trans_y', False)]) def test_matmul_infer_forward(self): # forward setup @@ -49,7 +49,8 @@ def test_matmul_infer_forward(self): # TODO test partial: mk[1, 0],kn[0, -1] --> mk[1, 0],kn[0, -1] = nm[1, -1] partial[0] result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs + [self.x_dist_tensor_spec, self.y_dist_tensor_spec], + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -68,7 +69,8 @@ def test_matmul_infer_forward(self): self.x_dist_tensor_spec.set_dims_mapping([1, -1]) self.y_dist_tensor_spec.set_dims_mapping([-1, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs + [self.x_dist_tensor_spec, self.y_dist_tensor_spec], + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -82,7 +84,8 @@ def test_matmul_infer_forward(self): self.x_dist_tensor_spec.set_dims_mapping([1, -1]) self.y_dist_tensor_spec.set_dims_mapping([-1, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs + [self.x_dist_tensor_spec, self.y_dist_tensor_spec], + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -95,7 +98,8 @@ def test_matmul_infer_forward(self): self.x_dist_tensor_spec.set_dims_mapping([-1, -1]) self.y_dist_tensor_spec.set_dims_mapping([-1, 0]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs + [self.x_dist_tensor_spec, self.y_dist_tensor_spec], + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -108,7 +112,8 @@ def test_matmul_infer_forward(self): self.x_dist_tensor_spec.set_dims_mapping([1, 0]) self.y_dist_tensor_spec.set_dims_mapping([-1, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs + [self.x_dist_tensor_spec, self.y_dist_tensor_spec], + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -122,7 +127,8 @@ def test_matmul_infer_forward(self): self.x_dist_tensor_spec.set_dims_mapping([-1, -1]) self.y_dist_tensor_spec.set_dims_mapping([1, 0]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs + [self.x_dist_tensor_spec, self.y_dist_tensor_spec], + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -137,7 +143,8 @@ def test_matmul_infer_forward(self): self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1, -1]) 
self.y_dist_tensor_spec.set_dims_mapping([-1, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs + [self.x_dist_tensor_spec, self.y_dist_tensor_spec], + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -154,7 +161,8 @@ def test_matmul_infer_forward(self): self.x_dist_tensor_spec.set_dims_mapping([1, -1, -1, 0]) self.y_dist_tensor_spec.set_dims_mapping([-1, -1]) result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs + [self.x_dist_tensor_spec, self.y_dist_tensor_spec], + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -173,7 +181,8 @@ def test_matmul_infer_forward(self): self.y_dist_tensor_spec.set_dims_mapping([-1, -1]) self.attrs['trans_x'] = True result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs + [self.x_dist_tensor_spec, self.y_dist_tensor_spec], + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -192,7 +201,8 @@ def test_matmul_infer_forward(self): self.attrs['trans_x'] = False self.attrs['trans_y'] = True result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs + [self.x_dist_tensor_spec, self.y_dist_tensor_spec], + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -215,7 +225,8 @@ def test_matmul_infer_forward(self): self.attrs['trans_x'] = True self.attrs['trans_y'] = True result_dist_attrs = self.rule.infer_forward( - [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs + [self.x_dist_tensor_spec, self.y_dist_tensor_spec], + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -239,7 +250,8 @@ def test_matmul_infer_forward(self): self.attrs['trans_y'] = True with self.assertRaises(NotImplementedError): self.rule.infer_forward( - [self.x_dist_tensor_spec, self.y_dist_tensor_spec], self.attrs + [self.x_dist_tensor_spec, self.y_dist_tensor_spec], + list(self.attrs.values()), ) def test_matmul_infer_backward(self): @@ -270,7 +282,7 @@ def test_matmul_infer_backward(self): result_dist_attrs = self.rule.infer_backward( [self.x_dist_tensor_spec, self.y_dist_tensor_spec], [self.out_dist_tensor_spec], - self.attrs, + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -307,7 +319,7 @@ def test_matmul_infer_backward(self): result_dist_attrs = self.rule.infer_backward( [self.x_dist_tensor_spec, self.y_dist_tensor_spec], [self.out_dist_tensor_spec], - self.attrs, + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -329,7 +341,7 @@ def test_matmul_infer_backward(self): result_dist_attrs = self.rule.infer_backward( [self.x_dist_tensor_spec, self.y_dist_tensor_spec], [self.out_dist_tensor_spec], - self.attrs, + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -354,7 +366,7 @@ def test_matmul_infer_backward(self): result_dist_attrs = self.rule.infer_backward( [self.x_dist_tensor_spec, self.y_dist_tensor_spec], [self.out_dist_tensor_spec], 
- self.attrs, + list(self.attrs.values()), ) infered_input_dist_attrs = result_dist_attrs[0] infered_output_dist_attrs = result_dist_attrs[1] @@ -376,7 +388,7 @@ def test_matmul_infer_backward(self): self.rule.infer_backward( [self.x_dist_tensor_spec, self.y_dist_tensor_spec], [self.out_dist_tensor_spec], - self.attrs, + list(self.attrs.values()), ) diff --git a/test/auto_parallel/test_cost_interface.py b/test/auto_parallel/test_cost_interface.py new file mode 100644 index 00000000000000..cdd63143fd1aab --- /dev/null +++ b/test/auto_parallel/test_cost_interface.py @@ -0,0 +1,185 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +import paddle.nn.functional as F +from paddle import nn, static, utils +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.static.cluster import Cluster +from paddle.distributed.auto_parallel.static.completion import Completer +from paddle.distributed.auto_parallel.static.cost import calc_time_by_cost_model +from paddle.distributed.auto_parallel.static.dist_context import ( + DistributedContext, +) +from paddle.distributed.auto_parallel.static.parallelizer import ( + AutoParallelizer, +) +from paddle.distributed.auto_parallel.static.partitioner import Partitioner +from paddle.distributed.auto_parallel.static.reshard import Resharder +from paddle.distributed.fleet import auto + +paddle.enable_static() +_global_parallel_strategy = "dp_mp_pp" +_global_process_mesh = auto.ProcessMesh( + [[[0, 1], [4, 5]], [[2, 3], [6, 7]]], dim_names=["x", "y", "z"] +) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], dim_names=["x", "y"]) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], dim_names=["x", "y"]) + + +class MLPLayer(nn.Layer): + def __init__( + self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02, + ): + super().__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=initializer_range) + ) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr + ) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr + ) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, "y"]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, ["y", None]) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + param = paddle.create_parameter([1024, 4096], paddle.float32) + auto.shard_tensor(param, PP_MESH_1, [None, "y"]) + out = paddle.matmul(out, param) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard( + train_program, start_program + ), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data( + name="input", 
shape=[batch_size, hidden_size], dtype='float32' + ) + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32' + ) + + auto.shard_tensor(input, PP_MESH_0, ["x", None]) + auto.shard_tensor(label, PP_MESH_1, ["x", None]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02, + ) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.process_mesh = _global_process_mesh + loss, train_program, startup_program = mlp_forward( + train_program, startup_program + ) + + fleet._user_defined_strategy = fleet.DistributedStrategy() + fleet.user_defined_optimizer = paddle.optimizer.Adam() + parallelizer = AutoParallelizer(fleet) + parallelizer._dist_context = dist_context + + # serial forward & backward completion + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program + ) + dist_context.block_state.parse_forward_blocks(complete_train_program) + params_grads = parallelizer._generate_backward( + complete_train_program, + startup_program, + loss, + parameter_list=None, + no_grad_set=None, + callbacks=None, + ) + + # logical partition + partitioner = Partitioner(dist_context, rank_id) + ( + auto_parallel_main_prog, + auto_parallel_startup_prog, + dist_params_grads, + ) = partitioner.partition( + complete_train_program, startup_program, params_grads + ) + + partitioned_optimize_ops = parallelizer._apply_optimize( + auto_parallel_main_prog, auto_parallel_startup_prog, dist_params_grads + ) + + return ( + auto_parallel_main_prog, + auto_parallel_startup_prog, + dist_params_grads, + ) + + +class TestCostInterface(unittest.TestCase): + def test_cost_interface(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 2 + dist_main_prog, dist_startup_prog, dist_params_grads = get_dist_prog( + train_program, startup_program, dist_context, rank_id + ) + + resharder = Resharder( + dist_main_prog, + dist_startup_prog, + rank_id, + dist_context, + dist_params_grads, + ) + resharder.reshard() + cluster = Cluster() + cluster.gen_default_config_cluster(node_count=1, device_count=8) + for op in dist_main_prog.global_block().ops: + time = calc_time_by_cost_model(op, cluster) + assert time > -1 + + +if __name__ == "__main__": + unittest.main() diff --git a/test/auto_parallel/test_dist_tensor.py b/test/auto_parallel/test_dist_tensor.py index 45aa8c9fbcaae0..0bf2d88db42370 100644 --- a/test/auto_parallel/test_dist_tensor.py +++ b/test/auto_parallel/test_dist_tensor.py @@ -83,6 +83,21 @@ def test_relu_api_for_dist_tensor(self): dist_out.backward() self.check_tensor_eq(local_in.grad, dist_in.grad) + def test_matmul_api_for_dist_tensor(self): + x = np.random.random(size=[4, 4]).astype("float32") + y = np.random.random(size=[4, 4]).astype("float32") + local_x, dist_x = self.create_local_and_dist_tensor_pair(x) + local_y, dist_y = self.create_local_and_dist_tensor_pair(y) + local_out = paddle.matmul(local_x, local_y) + dist_out = paddle.matmul(dist_x, dist_y) + self.check_tensor_eq(local_out, dist_out) + + # test backward + local_out.backward() + dist_out.backward() + self.check_tensor_eq(local_x.grad, dist_x.grad) + self.check_tensor_eq(local_y.grad, dist_y.grad) + if 
__name__ == "__main__": unittest.main() diff --git a/test/cpp/auto_parallel/CMakeLists.txt b/test/cpp/auto_parallel/CMakeLists.txt index c5912a6fa10210..ae7300bf62f089 100644 --- a/test/cpp/auto_parallel/CMakeLists.txt +++ b/test/cpp/auto_parallel/CMakeLists.txt @@ -9,8 +9,7 @@ if(WITH_DISTRIBUTE) dist_tensor_test SRCS dist_tensor_test.cc DEPS phi) + cc_test_old(spmd_rule_test SRCS spmd_rule_test.cc DEPS spmd_rules) endif() cc_test_old(dist_mapper_test SRCS dist_mapper_test.cc DEPS phi) - -cc_test_old(spmd_rule_test SRCS spmd_rule_test.cc DEPS spmd_rules) diff --git a/test/cpp/auto_parallel/spmd_rule_test.cc b/test/cpp/auto_parallel/spmd_rule_test.cc index dfd8394faa16ae..30907b707aa9e1 100644 --- a/test/cpp/auto_parallel/spmd_rule_test.cc +++ b/test/cpp/auto_parallel/spmd_rule_test.cc @@ -14,12 +14,16 @@ limitations under the License. */ #include #include + +#include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h" #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" +#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" #include "paddle/phi/core/distributed/auto_parallel/process_mesh.h" +#include "paddle/phi/infermeta/spmd_rules/rules.h" namespace paddle { namespace distributed { @@ -45,22 +49,20 @@ TEST(MatmulSPMDRule, Ctor) { y_dist_attr.set_dims_mapping(std::vector<int64_t>({-1, -1})); y_dist_attr.set_dynamic_dims(std::vector<bool>({false, false})); - DistTensorSpec x_dist_tensor_spec = DistTensorSpec(x_shape, x_dist_attr); - DistTensorSpec y_dist_tensor_spec = DistTensorSpec(y_shape, y_dist_attr); + size_t input_size = 2; + size_t output_size = 1; - paddle::framework::AttributeMap attrs; - attrs["trans_x"] = false; - attrs["trans_y"] = false; + phi::distributed::DistMetaTensor x(phi::make_ddim(x_shape), x_dist_attr); + phi::distributed::DistMetaTensor y(phi::make_ddim(y_shape), y_dist_attr); - SPMDRuleBase* matmul_rule = SPMDRuleMap::Instance().Get("matmul"); + auto matmul_spmd_rule = + phi::distributed::SpmdRuleFactory::Instance().GetSpmdRule("matmul"); // mk[1, -1],kn[-1, -1] --> mk[1, -1],kn[-1, -1] = nm[1, -1] partial[] - std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>> - infered_dist_attrs = matmul_rule->InferForward( - {x_dist_tensor_spec, y_dist_tensor_spec}, attrs); + phi::distributed::InferSpmdContext ctx( + {x, y}, {/*trans_x=*/false, /*trans_y=*/false}); + auto infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); - size_t input_size = 2; - size_t output_size = 1; EXPECT_EQ(infered_dist_attrs.first.size(), input_size); EXPECT_EQ(infered_dist_attrs.second.size(), output_size); @@ -74,10 +76,13 @@ TEST(MatmulSPMDRule, Ctor) { VLOG(4) << "test1 done." 
<< std::endl << std::endl << std::endl; // mk[-1,-1],kn[-1,0] --> mk[-1,-1],kn[-1,0] = nm[-1,0] partial[] - x_dist_tensor_spec.set_dims_mapping({-1, -1}); - y_dist_tensor_spec.set_dims_mapping({-1, 0}); - infered_dist_attrs = matmul_rule->InferForward( - {x_dist_tensor_spec, y_dist_tensor_spec}, attrs); + x_dist_attr.set_dims_mapping({-1, -1}); + y_dist_attr.set_dims_mapping({-1, 0}); + x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + ctx = phi::distributed::InferSpmdContext( + {x, y}, {/*trans_x=*/false, /*trans_y=*/false}); + infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(), std::vector<int64_t>({-1, -1})); EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(), @@ -88,10 +93,13 @@ TEST(MatmulSPMDRule, Ctor) { VLOG(4) << "test2 done." << std::endl << std::endl << std::endl; // mk[1, 0],kn[-1,-1] --> mk[1, 0],kn[0, -1] = nm[1, -1] partial[0]: done - x_dist_tensor_spec.set_dims_mapping({1, 0}); - y_dist_tensor_spec.set_dims_mapping({-1, -1}); - infered_dist_attrs = matmul_rule->InferForward( - {x_dist_tensor_spec, y_dist_tensor_spec}, attrs); + x_dist_attr.set_dims_mapping({1, 0}); + y_dist_attr.set_dims_mapping({-1, -1}); + x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + ctx = phi::distributed::InferSpmdContext( + {x, y}, {/*trans_x=*/false, /*trans_y=*/false}); + infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(), std::vector<int64_t>({1, 0})); EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(), @@ -104,10 +112,13 @@ TEST(MatmulSPMDRule, Ctor) { VLOG(4) << "test3 done." 
<< std::endl << std::endl << std::endl; // mk[-1,-1],kn[1,0] --> mk[-1, 1],kn[1, 0] = nm[-1, 0] partial[1]: done - x_dist_tensor_spec.set_dims_mapping({-1, -1}); - y_dist_tensor_spec.set_dims_mapping({1, 0}); - infered_dist_attrs = matmul_rule->InferForward( - {x_dist_tensor_spec, y_dist_tensor_spec}, attrs); + x_dist_attr.set_dims_mapping({-1, -1}); + y_dist_attr.set_dims_mapping({1, 0}); + x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + ctx = phi::distributed::InferSpmdContext( + {x, y}, {/*trans_x=*/false, /*trans_y=*/false}); + infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(), std::vector<int64_t>({-1, 1})); EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(), @@ -121,11 +132,14 @@ TEST(MatmulSPMDRule, Ctor) { // abcmk[1, 0, -1, -1],kn[-1, -1] --> abcmk[1, 0, -1, -1],kn[-1, -1] = // abcmn[1, 0, -1, -1] partial[]: done - x_dist_tensor_spec.set_shape({512, 48, 64, 32}); - x_dist_tensor_spec.set_dims_mapping({0, 1, -1, -1}); - y_dist_tensor_spec.set_dims_mapping({-1, -1}); - infered_dist_attrs = matmul_rule->InferForward( - {x_dist_tensor_spec, y_dist_tensor_spec}, attrs); + x_shape = {512, 48, 64, 32}; + x_dist_attr.set_dims_mapping({0, 1, -1, -1}); + y_dist_attr.set_dims_mapping({-1, -1}); + x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + ctx = phi::distributed::InferSpmdContext( + {x, y}, {/*trans_x=*/false, /*trans_y=*/false}); + infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(), std::vector<int64_t>({0, 1, -1, -1})); EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(), @@ -137,10 +151,13 @@ TEST(MatmulSPMDRule, Ctor) { // abcmk[1, -1, -1, 0],kn[-1, -1] --> abcmk[1, -1, -1, 0],kn[0, -1] = abcmn[1, // -1, -1, -1] partial[0]: done - x_dist_tensor_spec.set_dims_mapping({1, -1, -1, 0}); - y_dist_tensor_spec.set_dims_mapping({-1, -1}); - infered_dist_attrs = matmul_rule->InferForward( - {x_dist_tensor_spec, y_dist_tensor_spec}, attrs); + x_dist_attr.set_dims_mapping({1, -1, -1, 0}); + y_dist_attr.set_dims_mapping({-1, -1}); + x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + ctx = phi::distributed::InferSpmdContext( + {x, y}, {/*trans_x=*/false, /*trans_y=*/false}); + infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(), std::vector<int64_t>({1, -1, -1, 0})); EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(), @@ -154,11 +171,13 @@ TEST(MatmulSPMDRule, Ctor) { // abcmk[1, -1, -1, 0], kn[-1, -1] --> abcmk[1, -1, -1, 0],kn[-1, -1] = // abcmn[1, -1, 0, -1] partial[]: done - x_dist_tensor_spec.set_dims_mapping({1, -1, -1, 0}); - y_dist_tensor_spec.set_dims_mapping({-1, -1}); - attrs["trans_x"] = true; - infered_dist_attrs = matmul_rule->InferForward( - {x_dist_tensor_spec, y_dist_tensor_spec}, attrs); + x_dist_attr.set_dims_mapping({1, -1, -1, 0}); + y_dist_attr.set_dims_mapping({-1, -1}); + x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + ctx = phi::distributed::InferSpmdContext( + {x, y}, {/*trans_x=*/true, /*trans_y=*/false}); + infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); 
EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(), std::vector<int64_t>({1, -1, -1, 0})); EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(), @@ -170,12 +189,13 @@ TEST(MatmulSPMDRule, Ctor) { // abcmk[-1, -1, -1, -1], kn[1, 0] --> abcmk[-1, -1, -1, 0],kn[1, 0] = // abcmn[-1, -1, -1, 1] partial[0]: done - x_dist_tensor_spec.set_dims_mapping({-1, -1, -1, -1}); - y_dist_tensor_spec.set_dims_mapping({1, 0}); - attrs["trans_x"] = false; - attrs["trans_y"] = true; - infered_dist_attrs = matmul_rule->InferForward( - {x_dist_tensor_spec, y_dist_tensor_spec}, attrs); + x_dist_attr.set_dims_mapping({-1, -1, -1, -1}); + y_dist_attr.set_dims_mapping({1, 0}); + x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + ctx = phi::distributed::InferSpmdContext( + {x, y}, {/*trans_x=*/false, /*trans_y=*/true}); + infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(), std::vector<int64_t>({-1, -1, -1, 0})); EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(), @@ -191,12 +211,13 @@ TEST(MatmulSPMDRule, Ctor) { // abcmk[-1, -1, -1, -1], kn[1, 0] --> abcmk[-1, -1, -1, 0],kn[1, 0] = // abcmn[-1, -1, -1, 1] partial[0]: done - x_dist_tensor_spec.set_dims_mapping({-1, -1, 0, 1}); - y_dist_tensor_spec.set_dims_mapping({1, 0}); - attrs["trans_y"] = true; - attrs["trans_x"] = true; - infered_dist_attrs = matmul_rule->InferForward( - {x_dist_tensor_spec, y_dist_tensor_spec}, attrs); + x_dist_attr.set_dims_mapping({-1, -1, 0, 1}); + y_dist_attr.set_dims_mapping({1, 0}); + x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + ctx = phi::distributed::InferSpmdContext( + {x, y}, {/*trans_x=*/true, /*trans_y=*/true}); + infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(), std::vector<int64_t>({-1, -1, 0, 1})); EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(), @@ -214,23 +235,25 @@ TEST(MatmulSPMDRule, Ctor) { // abcmk[-1, -1, 1, 0], kn[1, 0] --> abcmk[-1, -1, -1, 0],kn[1, 0] = // abcmn[-1, -1, -1, 1] partial[0]: done - x_dist_tensor_spec.set_dims_mapping({-1, -1, 1, 0}); - y_dist_tensor_spec.set_dims_mapping({1, 0}); - attrs["trans_y"] = true; - attrs["trans_x"] = true; - EXPECT_ANY_THROW(infered_dist_attrs = matmul_rule->InferForward( - {x_dist_tensor_spec, y_dist_tensor_spec}, attrs)); + x_dist_attr.set_dims_mapping({-1, -1, 1, 0}); + y_dist_attr.set_dims_mapping({1, 0}); + x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + ctx = phi::distributed::InferSpmdContext( + {x, y}, {/*trans_x=*/true, /*trans_y=*/true}); + EXPECT_ANY_THROW(infered_dist_attrs = matmul_spmd_rule.InferForward(ctx)); // Error VLOG(4) << "test10 done." 
<< std::endl << std::endl << std::endl; // abcmk[-1, -1, -1, -1], kn[1, 0] --> abcmk[-1, -1, -1, 0],kn[1, 0] = // abcmn[-1, -1, -1, 1] partial[0]: - x_dist_tensor_spec.set_dims_mapping({-1, -1, 0, 1}); - y_dist_tensor_spec.set_dims_mapping({1, 0}); - attrs["trans_y"] = true; - attrs["trans_x"] = true; - infered_dist_attrs = matmul_rule->InferForward( - {x_dist_tensor_spec, y_dist_tensor_spec}, attrs); + x_dist_attr.set_dims_mapping({-1, -1, 0, 1}); + y_dist_attr.set_dims_mapping({1, 0}); + x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr); + y = phi::distributed::DistMetaTensor(phi::make_ddim(y_shape), y_dist_attr); + ctx = phi::distributed::InferSpmdContext( + {x, y}, {/*trans_x=*/true, /*trans_y=*/true}); + infered_dist_attrs = matmul_spmd_rule.InferForward(ctx); EXPECT_ANY_THROW(infered_dist_attrs.second[0].clean_partial_dims( std::vector<int64_t>({1}))); infered_dist_attrs.second[0].set_partial_status(std::vector<int64_t>({1})); @@ -242,7 +265,6 @@ std::set<int64_t>({0})); infered_dist_attrs.second[0].clean_partial_dims(std::vector<int64_t>({0})); EXPECT_EQ(infered_dist_attrs.second[0].is_partial(), false); - VLOG(4) << "test11 done." << std::endl << std::endl << std::endl; } @@ -372,25 +394,21 @@ TEST(MatmulSPMDRuleInferBackward, Ctor) { out_dist_attr.set_dynamic_dims(std::vector<bool>({false, false})); out_dist_attr.set_partial_status(std::vector<int64_t>({0})); - DistTensorSpec x_dist_tensor_spec = DistTensorSpec(x_shape, x_dist_attr); - DistTensorSpec y_dist_tensor_spec = DistTensorSpec(y_shape, y_dist_attr); - DistTensorSpec out_dist_tensor_spec = - DistTensorSpec(out_shape, out_dist_attr); - - paddle::framework::AttributeMap attrs; - attrs["trans_x"] = false; - attrs["trans_y"] = false; + phi::distributed::DistMetaTensor x(phi::make_ddim(x_shape), x_dist_attr); + phi::distributed::DistMetaTensor y(phi::make_ddim(y_shape), y_dist_attr); + phi::distributed::DistMetaTensor out(phi::make_ddim(out_shape), + out_dist_attr); - SPMDRuleBase* matmul_rule = SPMDRuleMap::Instance().Get("matmul"); + auto matmul_spmd_rule = + phi::distributed::SpmdRuleFactory::Instance().GetSpmdRule("matmul"); // TODO(zyc) update in future: propogate the partial in inferbackward // abmn[-1, -1, 1, -1] + partial[0] --> abmk[-1, -1, 1, -1], a1kn[-1, -1, -1, // -1] + phi::distributed::InferSpmdContext ctx( + {x, y, out}, {/*trans_x=*/false, /*trans_y=*/false}); std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>> - infered_dist_attrs = - matmul_rule->InferBackward({x_dist_tensor_spec, y_dist_tensor_spec}, - {out_dist_tensor_spec}, - attrs); + infered_dist_attrs = matmul_spmd_rule.InferBackward(ctx); size_t input_size = 2; size_t output_size = 1; diff --git a/test/cpp/eager/performance_tests/benchmark_utils.cc b/test/cpp/eager/performance_tests/benchmark_utils.cc index 83d14a6b45b897..73c4404fca7a8b 100644 --- a/test/cpp/eager/performance_tests/benchmark_utils.cc +++ b/test/cpp/eager/performance_tests/benchmark_utils.cc @@ -241,9 +241,7 @@ void benchmark_fluid_scale(const std::shared_ptr<imperative::VarBase>& X, for (size_t i = 0; i < max_num_runs; i++) { imperative::NameVarBaseMap ins = {{"X", {tmp_out}}}; imperative::NameVarBaseMap outs = { - {"Out", - {std::shared_ptr<imperative::VarBase>( - new 
imperative::VarBase(true, "Out"))}}}; + {"Out", {std::make_shared(true, "Out")}}}; tracer.TraceOp("matmul_v2", ins, outs, attrs, place, true); @@ -316,17 +312,13 @@ void benchmark_fluid_mlp( for (size_t i = 0; i < MLP_NUM_LINEAR; i++) { // Matmul0 ins = {{"X", {input0}}, {"Y", {Ws[0]}}}; - outs = {{"Out", - {std::shared_ptr( - new imperative::VarBase(true, "Out"))}}}; + outs = {{"Out", {std::make_shared(true, "Out")}}}; tracer.TraceOp("matmul_v2", ins, outs, attrs, place, true); // EW-Add0 ins = {{"X", outs["Out"]}, {"Y", {Bs[i]}}}; - outs = {{"Out", - {std::shared_ptr( - new imperative::VarBase(true, "Out"))}}}; + outs = {{"Out", {std::make_shared(true, "Out")}}}; tracer.TraceOp("elementwise_add", ins, outs, attrs, place, true); input0 = outs["Out"][0]; @@ -334,9 +326,7 @@ void benchmark_fluid_mlp( // ReduceSum ins = {{"X", {input0}}}; - outs = {{"Out", - {std::shared_ptr( - new imperative::VarBase(true, "Out"))}}}; + outs = {{"Out", {std::make_shared(true, "Out")}}}; attrs = {{"reduce_all", true}}; tracer.TraceOp("reduce_sum", ins, outs, attrs, place, true); diff --git a/test/cpp/fluid/benchmark/op_tester.cc b/test/cpp/fluid/benchmark/op_tester.cc index 0ab90f03999da6..6f68ab23a45669 100644 --- a/test/cpp/fluid/benchmark/op_tester.cc +++ b/test/cpp/fluid/benchmark/op_tester.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include -#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" @@ -25,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/pybind/pybind.h" +#include "paddle/utils/flags.h" // phi #include "paddle/phi/kernels/declarations.h" @@ -33,8 +33,8 @@ namespace paddle { namespace operators { namespace benchmark { -DEFINE_string(op_config_list, "", "Path of op config file."); // NOLINT -DEFINE_int32(specified_config_id, -1, "Test the specified op config."); +PD_DEFINE_string(op_config_list, "", "Path of op config file."); // NOLINT +PD_DEFINE_int32(specified_config_id, -1, "Test the specified op config."); void OpTester::Init(const std::string &filename) { Init(OpTesterConfig(filename)); @@ -57,13 +57,13 @@ void OpTester::Init(const OpTesterConfig &config) { } if (config_.device_id >= 0) { - place_ = paddle::platform::CUDAPlace(config_.device_id); + place_ = ::paddle::platform::CUDAPlace(config_.device_id); } else { - place_ = paddle::platform::CPUPlace(); + place_ = ::paddle::platform::CPUPlace(); } framework::InitDevices(); - scope_ = std::make_unique(); + scope_ = std::make_unique<::paddle::framework::Scope>(); op_ = framework::OpRegistry::CreateOp(op_desc_); CreateVariables(scope_.get()); @@ -318,7 +318,7 @@ void OpTester::SetupTensor(phi::DenseTensor *tensor, } if (!platform::is_cpu_place(place_)) { - paddle::framework::TensorCopySync(cpu_tensor, place_, tensor); + ::paddle::framework::TensorCopySync(cpu_tensor, place_, tensor); } } diff --git a/test/cpp/fluid/mkldnn/test_mkldnn_cpu_quantize_pass.cc b/test/cpp/fluid/mkldnn/test_mkldnn_cpu_quantize_pass.cc index 6f1ac7e56f3045..d6ba2f7a0c2322 100644 --- a/test/cpp/fluid/mkldnn/test_mkldnn_cpu_quantize_pass.cc +++ b/test/cpp/fluid/mkldnn/test_mkldnn_cpu_quantize_pass.cc @@ -29,7 +29,7 @@ using std::pair; using std::string; using std::unordered_map; -DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); +PD_DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); namespace paddle { namespace pass { diff --git 
a/test/cpp/fluid/pscore/CMakeLists.txt b/test/cpp/fluid/pscore/CMakeLists.txt index c19df6b4696056..07d3efaa311102 100644 --- a/test/cpp/fluid/pscore/CMakeLists.txt +++ b/test/cpp/fluid/pscore/CMakeLists.txt @@ -18,7 +18,7 @@ if(WITH_ARM_BRPC) framework_proto sendrecv_rpc arm_brpc - gflags + ${flags_dep} glog snappy device_context) @@ -35,15 +35,9 @@ else() ps_framework_proto framework_proto sendrecv_rpc - brpc - leveldb - ssl - crypto - protobuf - gflags - glog + ${EXTERNAL_BRPC_DEPS} + ${flags_dep} zlib - snappy device_context) endif() diff --git a/test/cpp/fluid/pscore/switch_server_test.cc b/test/cpp/fluid/pscore/switch_server_test.cc index a5e6fff4804af4..5ea2e28d4543b8 100644 --- a/test/cpp/fluid/pscore/switch_server_test.cc +++ b/test/cpp/fluid/pscore/switch_server_test.cc @@ -21,18 +21,18 @@ limitations under the License. */ #include #include // NOLINT -#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/distributed/ps/service/heter_server.h" +#include "paddle/utils/flags.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace distributed = paddle::distributed; -DEFINE_string(switch_addr_inner, "127.0.0.1:6000", "addr of inner cluster"); -DEFINE_string(switch_addr_heter, "127.0.0.1:6100", "add of inter cluster"); -DEFINE_string(peer_switch_addr, "127.0.0.1:7100", "add of inter cluster"); +PD_DEFINE_string(switch_addr_inner, "127.0.0.1:6000", "addr of inner cluster"); +PD_DEFINE_string(switch_addr_heter, "127.0.0.1:6100", "addr of inter cluster"); +PD_DEFINE_string(peer_switch_addr, "127.0.0.1:7100", "addr of inter cluster"); void StartSwitchServer( std::shared_ptr<distributed::HeterServer>& switch_server_ptr, // NOLINT @@ -61,7 +61,7 @@ int main(int argc, char* argv[]) { framework::ProgramDesc program; exe.Prepare(program, 0); // solve undefined symbol: tensor_table.cc - google::ParseCommandLineFlags(&argc, &argv, true); + paddle::flags::ParseCommandLineFlags(&argc, &argv); std::string switch_a_endpoint(FLAGS_switch_addr_inner); std::string switch_a_endpoint_inter(FLAGS_switch_addr_heter); diff --git a/test/cpp/inference/api/analysis_predictor_tester.cc b/test/cpp/inference/api/analysis_predictor_tester.cc index 2f794a4c784fa4..35c07c3a83790c 100644 --- a/test/cpp/inference/api/analysis_predictor_tester.cc +++ b/test/cpp/inference/api/analysis_predictor_tester.cc @@ -31,7 +31,7 @@ #include "paddle/phi/backends/cpu/cpu_info.h" #include "test/cpp/inference/api/tester_helper.h" -DEFINE_string(dirname, "", "dirname to tests."); +PD_DEFINE_string(dirname, "", "dirname to tests."); namespace paddle { diff --git a/test/cpp/inference/api/analyzer_bfloat16_image_classification_tester.cc b/test/cpp/inference/api/analyzer_bfloat16_image_classification_tester.cc index f6d4d8e8f769fb..fe7d2a3a6f6cf9 100644 --- a/test/cpp/inference/api/analyzer_bfloat16_image_classification_tester.cc +++ b/test/cpp/inference/api/analyzer_bfloat16_image_classification_tester.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/phi/backends/cpu/cpu_info.h" #include "test/cpp/inference/api/tester_helper.h" -DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); +PD_DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); namespace paddle { namespace inference { diff --git a/test/cpp/inference/api/analyzer_dam_tester.cc b/test/cpp/inference/api/analyzer_dam_tester.cc index 21276c71f63ec3..d17f8670adcf43 100644 --- a/test/cpp/inference/api/analyzer_dam_tester.cc +++ b/test/cpp/inference/api/analyzer_dam_tester.cc @@ -130,7 +130,7 @@ void PrepareInputs(std::vector *input_slots, auto one_batch = data->NextBatch(); PADDLE_ENFORCE( !one_batch.response.empty(), - paddle::platform::errors::Fatal("The response of one batch is empty.")); + ::paddle::platform::errors::Fatal("The response of one batch is empty.")); int size = one_batch.response[0].size(); CHECK_EQ(size, kMaxTurnLen); // turn tensor assignment @@ -228,17 +228,17 @@ void profile(bool use_mkldnn = false) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { PADDLE_ENFORCE_GT(outputs.size(), 0, - paddle::platform::errors::Fatal( + ::paddle::platform::errors::Fatal( "The size of outputs should be greater than 0.")); auto output = outputs.back(); PADDLE_ENFORCE_GT(output.size(), 0, - paddle::platform::errors::Fatal( + ::paddle::platform::errors::Fatal( "The size of output should be greater than 0.")); size_t size = GetSize(output[0]); PADDLE_ENFORCE_GT(size, 0, - paddle::platform::errors::Fatal( + ::paddle::platform::errors::Fatal( "The size of output should be greater than 0.")); float *result = static_cast(output[0].data.data()); for (size_t i = 0; i < size; i++) { diff --git a/test/cpp/inference/api/analyzer_detect_functional_mkldnn_tester.cc b/test/cpp/inference/api/analyzer_detect_functional_mkldnn_tester.cc index 389e0c9648d291..8d5627fce3cd8b 100644 --- a/test/cpp/inference/api/analyzer_detect_functional_mkldnn_tester.cc +++ b/test/cpp/inference/api/analyzer_detect_functional_mkldnn_tester.cc @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "test/cpp/inference/api/tester_helper.h" -DEFINE_string(infer_shape, "", "data shape file"); -DEFINE_int32(sample, 20, "number of sample"); +PD_DEFINE_string(infer_shape, "", "data shape file"); +PD_DEFINE_int32(sample, 20, "number of sample"); namespace paddle { namespace inference { diff --git a/test/cpp/inference/api/analyzer_detect_tester.cc b/test/cpp/inference/api/analyzer_detect_tester.cc index 62b97d635b5e23..72e498521cd6a0 100644 --- a/test/cpp/inference/api/analyzer_detect_tester.cc +++ b/test/cpp/inference/api/analyzer_detect_tester.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include "test/cpp/inference/api/tester_helper.h" -DEFINE_string(infer_shape, "", "data shape file"); -DEFINE_int32(sample, 20, "number of sample"); +PD_DEFINE_string(infer_shape, "", "data shape file"); +PD_DEFINE_int32(sample, 20, "number of sample"); namespace paddle { namespace inference { diff --git a/test/cpp/inference/api/analyzer_image_classification_tester.cc b/test/cpp/inference/api/analyzer_image_classification_tester.cc index 93b8f92c797d99..1df6d4488614d4 100644 --- a/test/cpp/inference/api/analyzer_image_classification_tester.cc +++ b/test/cpp/inference/api/analyzer_image_classification_tester.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "test/cpp/inference/api/tester_helper.h" -DEFINE_bool(disable_mkldnn_fc, false, "Disable usage of MKL-DNN's FC op"); +PD_DEFINE_bool(disable_mkldnn_fc, false, "Disable usage of MKL-DNN's FC op"); namespace paddle { namespace inference { diff --git a/test/cpp/inference/api/analyzer_int8_image_classification_tester.cc b/test/cpp/inference/api/analyzer_int8_image_classification_tester.cc index 57552ccb82e602..77c12dcfe0f524 100644 --- a/test/cpp/inference/api/analyzer_int8_image_classification_tester.cc +++ b/test/cpp/inference/api/analyzer_int8_image_classification_tester.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "test/cpp/inference/api/tester_helper.h" -DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); +PD_DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); namespace paddle { namespace inference { @@ -53,7 +53,7 @@ TEST(Analyzer_int8_image_classification, quantization) { // prepare warmup batch from input data read earlier // warmup batch size can be different than batch size std::shared_ptr> warmup_data = - paddle::inference::GetWarmupData(input_slots_all); + ::paddle::inference::GetWarmupData(input_slots_all); // INT8 implies FC oneDNN passes to be used q_cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); diff --git a/test/cpp/inference/api/analyzer_int8_object_detection_tester.cc b/test/cpp/inference/api/analyzer_int8_object_detection_tester.cc index a3b13dd691cd7b..311fb0946ca006 100644 --- a/test/cpp/inference/api/analyzer_int8_object_detection_tester.cc +++ b/test/cpp/inference/api/analyzer_int8_object_detection_tester.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "test/cpp/inference/api/tester_helper.h" -DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); +PD_DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); // setting iterations to 0 means processing the whole dataset namespace paddle { @@ -153,7 +153,7 @@ std::shared_ptr> GetWarmupData( PADDLE_ENFORCE_LE( static_cast(num_images), iterations * test_data_batch_size, - paddle::platform::errors::Fatal( + ::paddle::platform::errors::Fatal( "The requested quantization warmup data size " + std::to_string(num_images) + " is bigger than all test data size.")); @@ -247,9 +247,9 @@ std::shared_ptr> GetWarmupData( PADDLE_ENFORCE_EQ( static_cast(num_objects), static_cast(objects_accum), - paddle::platform::errors::Fatal("The requested num of objects " + - std::to_string(num_objects) + - " is the same as objects_accum.")); + ::paddle::platform::errors::Fatal("The requested num of objects " + + std::to_string(num_objects) + + " is the same as objects_accum.")); auto warmup_data = std::make_shared>(4); (*warmup_data)[0] = std::move(images); diff --git a/test/cpp/inference/api/analyzer_lac_tester.cc b/test/cpp/inference/api/analyzer_lac_tester.cc index fb82bbc3b2d947..9bdb819e5fbd68 100644 --- a/test/cpp/inference/api/analyzer_lac_tester.cc +++ b/test/cpp/inference/api/analyzer_lac_tester.cc @@ -99,10 +99,10 @@ void GetOneBatch(std::vector *input_slots, input_tensor.name = "word"; input_tensor.dtype = PaddleDType::INT64; TensorAssignData(&input_tensor, {one_batch.data}, one_batch.lod); - PADDLE_ENFORCE_EQ( - batch_size, - static_cast(one_batch.lod.size() - 1), - paddle::platform::errors::Fatal("The lod size of one batch is invaild.")); + PADDLE_ENFORCE_EQ(batch_size, + static_cast(one_batch.lod.size() - 1), + ::paddle::platform::errors::Fatal( + "The lod size of one batch is 
invalid.")); input_slots->assign({input_tensor}); } @@ -145,19 +145,19 @@ TEST(Analyzer_LAC, profile) { 15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23}; PADDLE_ENFORCE_GT(outputs.size(), 0, - paddle::platform::errors::Fatal( + ::paddle::platform::errors::Fatal( "The size of output should be greater than 0.")); auto output = outputs.back(); PADDLE_ENFORCE_EQ(output.size(), 1UL, - paddle::platform::errors::Fatal( + ::paddle::platform::errors::Fatal( "The size of output should be equal to 1.")); size_t size = GetSize(output[0]); size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t); PADDLE_ENFORCE_GE( size, batch1_size, - paddle::platform::errors::Fatal("The size of batch is invaild.")); + ::paddle::platform::errors::Fatal("The size of batch is invalid.")); int64_t *pdata = static_cast<int64_t *>(output[0].data.data()); for (size_t i = 0; i < batch1_size; ++i) { EXPECT_EQ(pdata[i], lac_ref_data[i]); } diff --git a/test/cpp/inference/api/analyzer_mmp_tester.cc b/test/cpp/inference/api/analyzer_mmp_tester.cc index a432d5c10b2295..92345fc8950a53 100644 --- a/test/cpp/inference/api/analyzer_mmp_tester.cc +++ b/test/cpp/inference/api/analyzer_mmp_tester.cc @@ -18,8 +18,8 @@ #include "test/cpp/inference/api/tester_helper.h" // Here add missing commands -DEFINE_string(infer_model2, "", "model path"); -DEFINE_string(infer_model3, "", "model path"); +PD_DEFINE_string(infer_model2, "", "model path"); +PD_DEFINE_string(infer_model3, "", "model path"); namespace paddle { namespace inference { @@ -100,12 +100,12 @@ void compare(bool use_mkldnn = false) { xx2_output.begin(), [](const float& l, const float& r) { return fabs(l - r) < 1e-4; }); - PADDLE_ENFORCE_EQ( - result, - true, - paddle::platform::errors::Fatal("Results of model run independently " - "differs from results of the same model " - "run as a sequence of models")); + PADDLE_ENFORCE_EQ(result, + true, + ::paddle::platform::errors::Fatal( + "Results of model run independently " + "differ from results of the same model " + "run as a sequence of models")); } TEST(Analyzer_mmp, compare) { compare(); } diff --git a/test/cpp/inference/api/analyzer_quant_image_classification_tester.cc b/test/cpp/inference/api/analyzer_quant_image_classification_tester.cc index 69b627275cd056..e9b841ec772abc 100644 --- a/test/cpp/inference/api/analyzer_quant_image_classification_tester.cc +++ b/test/cpp/inference/api/analyzer_quant_image_classification_tester.cc @@ -18,7 +18,7 @@ limitations under the License.
*/ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "test/cpp/inference/api/tester_helper.h" -DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); +PD_DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN"); namespace paddle { namespace inference { diff --git a/test/cpp/inference/api/analyzer_rnn1_tester.cc b/test/cpp/inference/api/analyzer_rnn1_tester.cc index 7f0f11f5515bdd..c5c7df887dd5ed 100644 --- a/test/cpp/inference/api/analyzer_rnn1_tester.cc +++ b/test/cpp/inference/api/analyzer_rnn1_tester.cc @@ -14,7 +14,7 @@ #include "test/cpp/inference/api/tester_helper.h" -DEFINE_bool(with_precision_check, true, "turn on test"); +PD_DEFINE_bool(with_precision_check, true, "turn on test"); namespace paddle { namespace inference { diff --git a/test/cpp/inference/api/analyzer_seq_pool1_tester_helper.h b/test/cpp/inference/api/analyzer_seq_pool1_tester_helper.h index 3c0ad4b5f823d8..0d75eacbbdf4e6 100644 --- a/test/cpp/inference/api/analyzer_seq_pool1_tester_helper.h +++ b/test/cpp/inference/api/analyzer_seq_pool1_tester_helper.h @@ -65,7 +65,7 @@ struct DataRecord { PADDLE_ENFORCE_EQ( slot_data.size() % 11, 0UL, - paddle::platform::errors::Fatal( + ::paddle::platform::errors::Fatal( "line %d, %s should be divisible", num_lines, name)); datasets[name].emplace_back(std::move(slot_data)); } @@ -73,19 +73,19 @@ struct DataRecord { PADDLE_ENFORCE_EQ( num_samples * num_slots, static_cast(num_lines), - paddle::platform::errors::Fatal("num samples should be divisible")); + ::paddle::platform::errors::Fatal("num samples should be divisible")); PADDLE_ENFORCE_GT(num_samples, 0UL, - paddle::platform::errors::Fatal( + ::paddle::platform::errors::Fatal( "The num of samples should be greater than 0.")); } void Prepare(int bs) { for (auto it = datasets.begin(); it != datasets.end(); ++it) { - PADDLE_ENFORCE_EQ( - it->second.size(), - num_samples, - paddle::platform::errors::Fatal("size of each slot should be equal")); + PADDLE_ENFORCE_EQ(it->second.size(), + num_samples, + ::paddle::platform::errors::Fatal( + "size of each slot should be equal")); } size_t num_batches = num_samples / bs; EXPECT_GT(num_batches, 0UL); @@ -110,7 +110,7 @@ struct DataRecord { PADDLE_ENFORCE_EQ( len * 11, datas[id].size(), - paddle::platform::errors::Fatal( + ::paddle::platform::errors::Fatal( "%s %d size should be divisible", slot.name, id)); lod[k + 1] = lod[k] + len; } diff --git a/test/cpp/inference/api/analyzer_vis_tester.cc b/test/cpp/inference/api/analyzer_vis_tester.cc index cf79d26847f452..d8b15393ad6013 100644 --- a/test/cpp/inference/api/analyzer_vis_tester.cc +++ b/test/cpp/inference/api/analyzer_vis_tester.cc @@ -64,7 +64,7 @@ void SetInput(std::vector> *inputs) { PADDLE_ENFORCE_EQ( FLAGS_test_all_data, 0, - paddle::platform::errors::Fatal("Only have single batch of data.")); + ::paddle::platform::errors::Fatal("Only have single batch of data.")); std::string line; std::ifstream file(FLAGS_infer_data); std::getline(file, line); @@ -107,7 +107,7 @@ void profile(bool use_mkldnn = false) { PADDLE_ENFORCE_GT(outputs.size(), 0, - paddle::platform::errors::Fatal( + ::paddle::platform::errors::Fatal( "The size of output should be greater than 0.")); auto &output = outputs.back().front(); size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); diff --git a/test/cpp/inference/api/api_impl_tester.cc b/test/cpp/inference/api/api_impl_tester.cc index 934ff06535054d..78e908189cc1d4 100644 --- a/test/cpp/inference/api/api_impl_tester.cc +++ b/test/cpp/inference/api/api_impl_tester.cc @@ -17,9 +17,9 @@ 
limitations under the License. */ #include // NOLINT -#include "gflags/gflags.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/test_helper.h" #ifdef __clang__ @@ -28,10 +28,10 @@ limitations under the License. */ #define ACC_DIFF 1e-3 #endif -DEFINE_string(word2vec_dirname, - "", - "Directory of the word2vec inference model."); -DEFINE_string(book_dirname, "", "Directory of the book inference model."); +PD_DEFINE_string(word2vec_dirname, + "", + "Directory of the word2vec inference model."); +PD_DEFINE_string(book_dirname, "", "Directory of the book inference model."); namespace paddle { @@ -67,11 +67,11 @@ NativeConfig GetConfig() { return config; } -void MainWord2Vec(const paddle::PaddlePlace& place) { +void MainWord2Vec(const ::paddle::PaddlePlace& place) { NativeConfig config = GetConfig(); auto predictor = CreatePaddlePredictor(config); - config.use_gpu = paddle::gpu_place_used(place); - config.use_xpu = paddle::xpu_place_used(place); + config.use_gpu = ::paddle::gpu_place_used(place); + config.use_xpu = ::paddle::xpu_place_used(place); phi::DenseTensor first_word, second_word, third_word, fourth_word; framework::LoD lod{{0, 1}}; @@ -105,7 +105,7 @@ void MainWord2Vec(const paddle::PaddlePlace& place) { cpu_feeds.push_back(&fourth_word); framework::FetchType output1; - std::vector cpu_fetchs1; + std::vector<::paddle::framework::FetchType*> cpu_fetchs1; cpu_fetchs1.push_back(&output1); TestInference(config.model_dir, cpu_feeds, cpu_fetchs1); @@ -118,12 +118,12 @@ void MainWord2Vec(const paddle::PaddlePlace& place) { } } -void MainImageClassification(const paddle::PaddlePlace& place) { +void MainImageClassification(const ::paddle::PaddlePlace& place) { int batch_size = 2; bool repeat = false; NativeConfig config = GetConfig(); - config.use_gpu = paddle::gpu_place_used(place); - config.use_xpu = paddle::xpu_place_used(place); + config.use_gpu = ::paddle::gpu_place_used(place); + config.use_xpu = ::paddle::xpu_place_used(place); config.model_dir = FLAGS_book_dirname + "/image_classification_resnet.inference.model"; @@ -163,10 +163,10 @@ void MainImageClassification(const paddle::PaddlePlace& place) { } } -void MainThreadsWord2Vec(const paddle::PaddlePlace& place) { +void MainThreadsWord2Vec(const ::paddle::PaddlePlace& place) { NativeConfig config = GetConfig(); - config.use_gpu = paddle::gpu_place_used(place); - config.use_xpu = paddle::xpu_place_used(place); + config.use_gpu = ::paddle::gpu_place_used(place); + config.use_xpu = ::paddle::xpu_place_used(place); auto main_predictor = CreatePaddlePredictor(config); // prepare inputs data and reference results @@ -186,7 +186,7 @@ void MainThreadsWord2Vec(const paddle::PaddlePlace& place) { // get reference result of each job std::vector ref_feeds; - std::vector ref_fetches(1, &refs[i]); + std::vector<::paddle::framework::FetchType*> ref_fetches(1, &refs[i]); for (auto& word : jobs[i]) { ref_feeds.push_back(&word); } @@ -225,12 +225,12 @@ void MainThreadsWord2Vec(const paddle::PaddlePlace& place) { } } -void MainThreadsImageClassification(const paddle::PaddlePlace& place) { +void MainThreadsImageClassification(const ::paddle::PaddlePlace& place) { constexpr int num_jobs = 4; // each job run 1 batch constexpr int batch_size = 1; NativeConfig config = GetConfig(); - config.use_gpu = paddle::gpu_place_used(place); - config.use_xpu = paddle::xpu_place_used(place); + config.use_gpu = ::paddle::gpu_place_used(place); + config.use_xpu = 
::paddle::xpu_place_used(place); config.model_dir = FLAGS_book_dirname + "/image_classification_resnet.inference.model"; @@ -280,53 +280,53 @@ void MainThreadsImageClassification(const paddle::PaddlePlace& place) { } TEST(inference_api_native, word2vec_cpu) { - MainWord2Vec(paddle::PaddlePlace::kCPU); + MainWord2Vec(::paddle::PaddlePlace::kCPU); } TEST(inference_api_native, word2vec_cpu_threads) { - MainThreadsWord2Vec(paddle::PaddlePlace::kCPU); + MainThreadsWord2Vec(::paddle::PaddlePlace::kCPU); } TEST(inference_api_native, image_classification_cpu) { - MainImageClassification(paddle::PaddlePlace::kCPU); + MainImageClassification(::paddle::PaddlePlace::kCPU); } TEST(inference_api_native, image_classification_cpu_threads) { - MainThreadsImageClassification(paddle::PaddlePlace::kCPU); + MainThreadsImageClassification(::paddle::PaddlePlace::kCPU); } #ifdef PADDLE_WITH_XPU TEST(inference_api_native, word2vec_xpu) { - MainWord2Vec(paddle::PaddlePlace::kXPU); + MainWord2Vec(::paddle::PaddlePlace::kXPU); } TEST(inference_api_native, image_classification_xpu) { - MainImageClassification(paddle::PaddlePlace::kXPU); + MainImageClassification(::paddle::PaddlePlace::kXPU); } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(inference_api_native, word2vec_gpu) { - MainWord2Vec(paddle::PaddlePlace::kGPU); + MainWord2Vec(::paddle::PaddlePlace::kGPU); } // Turn off temporarily for the unstable result. // TEST(inference_api_native, word2vec_gpu_threads) { -// MainThreadsWord2Vec(paddle::PaddlePlace::kGPU); +// MainThreadsWord2Vec(::paddle::PaddlePlace::kGPU); // } TEST(inference_api_native, image_classification_gpu) { - MainImageClassification(paddle::PaddlePlace::kGPU); + MainImageClassification(::paddle::PaddlePlace::kGPU); } // Turn off temporarily for the unstable result. // TEST(inference_api_native, image_classification_gpu_threads) { -// MainThreadsImageClassification(paddle::PaddlePlace::kGPU); +// MainThreadsImageClassification(::paddle::PaddlePlace::kGPU); // } #endif #ifdef PADDLE_WITH_DNNL TEST(inference_api_native, image_classification_cpu_onednn) { FLAGS_use_mkldnn = true; - MainImageClassification(paddle::PaddlePlace::kCPU); + MainImageClassification(::paddle::PaddlePlace::kCPU); } TEST(inference_api_native, word2vec_cpu_onednn) { FLAGS_use_mkldnn = true; - MainWord2Vec(paddle::PaddlePlace::kCPU); + MainWord2Vec(::paddle::PaddlePlace::kCPU); } #endif diff --git a/test/cpp/inference/api/ipu_multi_model_profile.cc b/test/cpp/inference/api/ipu_multi_model_profile.cc index 3c5b1af1594e68..d5b8ce2532244b 100644 --- a/test/cpp/inference/api/ipu_multi_model_profile.cc +++ b/test/cpp/inference/api/ipu_multi_model_profile.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/tester_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/ipu_resnet50_fp16_test.cc b/test/cpp/inference/api/ipu_resnet50_fp16_test.cc index 99f0d58926dbdc..1e3ddb51cb8235 100644 --- a/test/cpp/inference/api/ipu_resnet50_fp16_test.cc +++ b/test/cpp/inference/api/ipu_resnet50_fp16_test.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/tester_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/ipu_resnet50_test.cc b/test/cpp/inference/api/ipu_resnet50_test.cc index 5a414bf9415bc7..d45cd9bf49ae25 100644 --- a/test/cpp/inference/api/ipu_resnet50_test.cc +++ b/test/cpp/inference/api/ipu_resnet50_test.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/tester_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/ipu_word2vec_sample.cc b/test/cpp/inference/api/ipu_word2vec_sample.cc index ba8f28ee5e19fa..e43d03c5108599 100644 --- a/test/cpp/inference/api/ipu_word2vec_sample.cc +++ b/test/cpp/inference/api/ipu_word2vec_sample.cc @@ -24,11 +24,11 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/utils/flags.h" -DEFINE_string(infer_model, "", "Directory of the inference model."); +PD_DEFINE_string(infer_model, "", "Directory of the inference model."); using paddle_infer::Config; using paddle_infer::CreatePredictor; @@ -70,7 +70,7 @@ void inference(std::string model_path, } int main(int argc, char *argv[]) { - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + ::paddle::flags::ParseCommandLineFlags(&argc, &argv); std::vector ipu_result; std::vector cpu_result; inference(FLAGS_infer_model, true, &ipu_result); diff --git a/test/cpp/inference/api/lite_mul_model_test.cc b/test/cpp/inference/api/lite_mul_model_test.cc index e600a3bab916c9..3fa8e545a57da4 100644 --- a/test/cpp/inference/api/lite_mul_model_test.cc +++ b/test/cpp/inference/api/lite_mul_model_test.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include // NOLINT #include // NOLINT -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/tester_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/lite_resnet50_test.cc b/test/cpp/inference/api/lite_resnet50_test.cc index e35e28388234d4..dce9a8932fe32e 100644 --- a/test/cpp/inference/api/lite_resnet50_test.cc +++ b/test/cpp/inference/api/lite_resnet50_test.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/tester_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/mkldnn_quantizer_tester.cc b/test/cpp/inference/api/mkldnn_quantizer_tester.cc index 5e699a8b4c6a58..8edad9fe27127a 100644 --- a/test/cpp/inference/api/mkldnn_quantizer_tester.cc +++ b/test/cpp/inference/api/mkldnn_quantizer_tester.cc @@ -19,7 +19,7 @@ #include "paddle/fluid/inference/api/mkldnn_quantizer.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" -DEFINE_string(dirname, "", "dirname to tests."); +PD_DEFINE_string(dirname, "", "dirname to tests."); namespace paddle { diff --git a/test/cpp/inference/api/paddle_infer_api_copy_tensor_tester.cc b/test/cpp/inference/api/paddle_infer_api_copy_tensor_tester.cc index 4674b77091a513..56b1b5b5d7fe06 100644 --- a/test/cpp/inference/api/paddle_infer_api_copy_tensor_tester.cc +++ b/test/cpp/inference/api/paddle_infer_api_copy_tensor_tester.cc @@ -19,10 +19,10 @@ limitations under the License. 
*/ #include #include -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/fluid/inference/api/paddle_infer_contrib.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle_infer { diff --git a/test/cpp/inference/api/paddle_infer_api_errors_tester.cc b/test/cpp/inference/api/paddle_infer_api_errors_tester.cc index c716115ce2a4c9..4fc1f6f0843c0d 100644 --- a/test/cpp/inference/api/paddle_infer_api_errors_tester.cc +++ b/test/cpp/inference/api/paddle_infer_api_errors_tester.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/inference/api/paddle_infer_contrib.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/utils/flags.h" namespace paddle_infer { namespace contrib { diff --git a/test/cpp/inference/api/paddle_infer_api_test.cc b/test/cpp/inference/api/paddle_infer_api_test.cc index c59ac40e5e5ecb..e53473a520d803 100644 --- a/test/cpp/inference/api/paddle_infer_api_test.cc +++ b/test/cpp/inference/api/paddle_infer_api_test.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/tester_helper.h" namespace paddle_infer { diff --git a/test/cpp/inference/api/tester_helper.h b/test/cpp/inference/api/tester_helper.h index fe016abdaee02b..a204c31cfd306b 100644 --- a/test/cpp/inference/api/tester_helper.h +++ b/test/cpp/inference/api/tester_helper.h @@ -40,83 +40,87 @@ #include "test/cpp/inference/api/config_printer.h" #include "test/cpp/inference/test_helper.h" -DEFINE_string(model_name, "", "model name"); -DEFINE_string(infer_model, "", "model path"); -DEFINE_string(fp32_model, "", "FP32 model path"); -DEFINE_string(int8_model, "", "INT8 model path"); -DEFINE_string(infer_data, "", "data file"); -DEFINE_string(refer_result, "", "reference result for comparison"); -DEFINE_int32(batch_size, 1, "batch size"); -DEFINE_bool(ernie_large, false, "Test ernie large"); -DEFINE_bool(with_accuracy_layer, - true, - "Calculate the accuracy while label is in the input"); -DEFINE_bool(enable_fp32, true, "Enable FP32 type prediction"); -DEFINE_bool(enable_bf16, false, "Enable BF16 type prediction"); -DEFINE_bool(enable_int8_ptq, - false, - "Enable INT8 post-training quantization prediction"); -DEFINE_bool(enable_int8_qat, - false, - "Enable INT8 quant-aware training prediction"); -DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup"); +PD_DEFINE_string(model_name, "", "model name"); +PD_DEFINE_string(infer_model, "", "model path"); +PD_DEFINE_string(fp32_model, "", "FP32 model path"); +PD_DEFINE_string(int8_model, "", "INT8 model path"); +PD_DEFINE_string(infer_data, "", "data file"); +PD_DEFINE_string(refer_result, "", "reference result for comparison"); +PD_DEFINE_int32(batch_size, 1, "batch size"); +PD_DEFINE_bool(ernie_large, false, "Test ernie large"); +PD_DEFINE_bool(with_accuracy_layer, + true, + "Calculate the accuracy while label is in the input"); +PD_DEFINE_bool(enable_fp32, true, "Enable FP32 type prediction"); +PD_DEFINE_bool(enable_bf16, false, "Enable BF16 type prediction"); +PD_DEFINE_bool(enable_int8_ptq, + false, + "Enable INT8 post-training quantization prediction"); +PD_DEFINE_bool(enable_int8_qat, + false, + "Enable INT8 quant-aware training prediction"); 
+PD_DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup"); // setting iterations to 0 means processing the whole dataset -DEFINE_int32(iterations, 0, "number of batches to process"); -DEFINE_int32(repeat, 1, "Running the inference program repeat times."); -DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); -DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); -DEFINE_bool(use_analysis, - true, - "Running the inference program in analysis mode."); -DEFINE_bool(record_benchmark, - false, - "Record benchmark after profiling the model"); -DEFINE_double(accuracy, 1e-3, "Result Accuracy."); -DEFINE_double(quantized_accuracy, 1e-2, "Result Quantized Accuracy."); -DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch."); -DEFINE_bool(warmup, - false, - "Use warmup to calculate elapsed_time more accurately. " - "To reduce CI time, it sets false in default."); -DEFINE_int32(warmup_iters, 1, "Number of batches to process during warmup."); - -DEFINE_bool(enable_profile, false, "Turn on profiler for fluid"); -DEFINE_int32(cpu_num_threads, 1, "Number of threads for each paddle instance."); -DEFINE_bool(fuse_multi_gru, - false, - "Running the inference program with multi_gru_fuse_pass"); +PD_DEFINE_int32(iterations, 0, "number of batches to process"); +PD_DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +PD_DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); +PD_DEFINE_int32(num_threads, + 1, + "Running the inference program in multi-threads."); +PD_DEFINE_bool(use_analysis, + true, + "Running the inference program in analysis mode."); +PD_DEFINE_bool(record_benchmark, + false, + "Record benchmark after profiling the model"); +PD_DEFINE_double(accuracy, 1e-3, "Result Accuracy."); +PD_DEFINE_double(quantized_accuracy, 1e-2, "Result Quantized Accuracy."); +PD_DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch."); +PD_DEFINE_bool(warmup, + false, + "Use warmup to calculate elapsed_time more accurately. 
" + "To reduce CI time, it sets false in default."); +PD_DEFINE_int32(warmup_iters, 1, "Number of batches to process during warmup."); + +PD_DEFINE_bool(enable_profile, false, "Turn on profiler for fluid"); +PD_DEFINE_int32(cpu_num_threads, + 1, + "Number of threads for each paddle instance."); +PD_DEFINE_bool(fuse_multi_gru, + false, + "Running the inference program with multi_gru_fuse_pass"); // ipu related -DEFINE_int32(ipu_micro_batch_size, 1, "micro batch size"); -DEFINE_int32(ipu_device_num, 1, "device num"); -DEFINE_bool(ipu_enable_pipelining, false, "enable pipelining"); -DEFINE_int32(ipu_batches_per_step, - 1, - "the number of batches per run in pipelining"); -DEFINE_bool(ipu_enable_fp16, false, "enable fp16"); -DEFINE_int32(ipu_replica_num, 1, "replica num"); -DEFINE_double(ipu_available_memory_proportion, - 1.0, - "available memory proportion"); -DEFINE_bool(ipu_enable_half_partial, false, "enable half partial"); +PD_DEFINE_int32(ipu_micro_batch_size, 1, "micro batch size"); +PD_DEFINE_int32(ipu_device_num, 1, "device num"); +PD_DEFINE_bool(ipu_enable_pipelining, false, "enable pipelining"); +PD_DEFINE_int32(ipu_batches_per_step, + 1, + "the number of batches per run in pipelining"); +PD_DEFINE_bool(ipu_enable_fp16, false, "enable fp16"); +PD_DEFINE_int32(ipu_replica_num, 1, "replica num"); +PD_DEFINE_double(ipu_available_memory_proportion, + 1.0, + "available memory proportion"); +PD_DEFINE_bool(ipu_enable_half_partial, false, "enable half partial"); namespace paddle { namespace inference { -using paddle::framework::proto::VarType; -using float16 = paddle::platform::float16; +using ::paddle::framework::proto::VarType; +using float16 = ::paddle::platform::float16; template -constexpr paddle::PaddleDType GetPaddleDType(); +constexpr ::paddle::PaddleDType GetPaddleDType(); template <> -constexpr paddle::PaddleDType GetPaddleDType() { - return paddle::PaddleDType::INT64; +constexpr ::paddle::PaddleDType GetPaddleDType() { + return ::paddle::PaddleDType::INT64; } template <> -constexpr paddle::PaddleDType GetPaddleDType() { - return paddle::PaddleDType::FLOAT32; +constexpr ::paddle::PaddleDType GetPaddleDType() { + return ::paddle::PaddleDType::FLOAT32; } void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { @@ -521,7 +525,7 @@ void PredictionWarmUp(PaddlePredictor *predictor, PrintTime( batch_size, 1, num_threads, tid, batch_latency, iterations, data_type); if (FLAGS_enable_profile) { - paddle::platform::ResetProfiler(); + ::paddle::platform::ResetProfiler(); } } @@ -749,7 +753,7 @@ float CompareAccuracyOne( if (output_slots[i][compared_idx].lod.size() > 0) throw std::invalid_argument("CompareAccuracy: output has nonempty LoD."); - if (output_slots[i][compared_idx].dtype != paddle::PaddleDType::FLOAT32) + if (output_slots[i][compared_idx].dtype != ::paddle::PaddleDType::FLOAT32) throw std::invalid_argument( "CompareAccuracy: output is of a wrong type."); @@ -1156,7 +1160,7 @@ static bool CompareTensor(const phi::DenseTensor &a, return true; } -void ConvertFP32toFP16(paddle::PaddleTensor &tensor // NOLINT +void ConvertFP32toFP16(::paddle::PaddleTensor &tensor // NOLINT ) { int num = 1; for (auto dim : tensor.shape) { @@ -1177,7 +1181,7 @@ void ConvertFP32toFP16(paddle::PaddleTensor &tensor // NOLINT tensor.dtype = PaddleDType::FLOAT16; } -void ConvertFP16toFP32(paddle::PaddleTensor &tensor // NOLINT +void ConvertFP16toFP32(::paddle::PaddleTensor &tensor // NOLINT ) { int num = 1; for (auto dim : tensor.shape) { diff --git 
a/test/cpp/inference/api/trt_cascade_rcnn_test.cc b/test/cpp/inference/api/trt_cascade_rcnn_test.cc index 86759c33e472ee..710e6481d018ca 100644 --- a/test/cpp/inference/api/trt_cascade_rcnn_test.cc +++ b/test/cpp/inference/api/trt_cascade_rcnn_test.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc b/test/cpp/inference/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc index 806950ca8d6fda..34ddb8fa3c330f 100644 --- a/test/cpp/inference/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc +++ b/test/cpp/inference/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc @@ -23,7 +23,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc b/test/cpp/inference/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc index bcf82d66f781ed..7946b7c2428b2d 100644 --- a/test/cpp/inference/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc +++ b/test/cpp/inference/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc @@ -23,7 +23,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h b/test/cpp/inference/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h index 09b20d23e9708d..e046181dbf094c 100644 --- a/test/cpp/inference/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h +++ b/test/cpp/inference/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h @@ -25,7 +25,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_dynamic_shape_ernie_test.cc b/test/cpp/inference/api/trt_dynamic_shape_ernie_test.cc index 8abf7224a135e4..fd31613c2b6289 100644 --- a/test/cpp/inference/api/trt_dynamic_shape_ernie_test.cc +++ b/test/cpp/inference/api/trt_dynamic_shape_ernie_test.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "test/cpp/inference/api/trt_test_helper.h" diff --git a/test/cpp/inference/api/trt_dynamic_shape_test.cc b/test/cpp/inference/api/trt_dynamic_shape_test.cc index 505c0cdf083c0b..8f284d75b7e3ce 100644 --- a/test/cpp/inference/api/trt_dynamic_shape_test.cc +++ b/test/cpp/inference/api/trt_dynamic_shape_test.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_dynamic_shape_transformer_prune_test.cc b/test/cpp/inference/api/trt_dynamic_shape_transformer_prune_test.cc index 2e71da39b5f453..ff8c60df005595 100644 --- a/test/cpp/inference/api/trt_dynamic_shape_transformer_prune_test.cc +++ b/test/cpp/inference/api/trt_dynamic_shape_transformer_prune_test.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_fc_prelu_test.cc b/test/cpp/inference/api/trt_fc_prelu_test.cc index a1ef33407596ad..5f10c12bf3dd13 100644 --- a/test/cpp/inference/api/trt_fc_prelu_test.cc +++ b/test/cpp/inference/api/trt_fc_prelu_test.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_instance_norm_converter_test.cc b/test/cpp/inference/api/trt_instance_norm_converter_test.cc index b58ddb2d919d8e..fc78219a9db6d1 100644 --- a/test/cpp/inference/api/trt_instance_norm_converter_test.cc +++ b/test/cpp/inference/api/trt_instance_norm_converter_test.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_mark_trt_engine_outputs_test.cc b/test/cpp/inference/api/trt_mark_trt_engine_outputs_test.cc index 9c6a87a6d161a7..d34d640cfaf40e 100644 --- a/test/cpp/inference/api/trt_mark_trt_engine_outputs_test.cc +++ b/test/cpp/inference/api/trt_mark_trt_engine_outputs_test.cc @@ -12,7 +12,6 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_mobilenet_test.cc b/test/cpp/inference/api/trt_mobilenet_test.cc index 7cae99e0d3479a..670eaa7b1169e0 100644 --- a/test/cpp/inference/api/trt_mobilenet_test.cc +++ b/test/cpp/inference/api/trt_mobilenet_test.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_quant_int8_test.cc b/test/cpp/inference/api/trt_quant_int8_test.cc index f40b2197fb2292..46c1fb7c9f742b 100644 --- a/test/cpp/inference/api/trt_quant_int8_test.cc +++ b/test/cpp/inference/api/trt_quant_int8_test.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_quant_int8_yolov3_r50_test.cc b/test/cpp/inference/api/trt_quant_int8_yolov3_r50_test.cc index ce058a1275c926..412aeae6ed75ad 100644 --- a/test/cpp/inference/api/trt_quant_int8_yolov3_r50_test.cc +++ b/test/cpp/inference/api/trt_quant_int8_yolov3_r50_test.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_rebind_stream_test.cc b/test/cpp/inference/api/trt_rebind_stream_test.cc index 3a42af93427c50..8c8e78a51907e3 100644 --- a/test/cpp/inference/api/trt_rebind_stream_test.cc +++ b/test/cpp/inference/api/trt_rebind_stream_test.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/tester_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_resnet50_test.cc b/test/cpp/inference/api/trt_resnet50_test.cc index 8dde6a0f5dd8ce..085b64ef882b56 100644 --- a/test/cpp/inference/api/trt_resnet50_test.cc +++ b/test/cpp/inference/api/trt_resnet50_test.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_resnext_test.cc b/test/cpp/inference/api/trt_resnext_test.cc index a80058468d5768..65e09d3532d86b 100644 --- a/test/cpp/inference/api/trt_resnext_test.cc +++ b/test/cpp/inference/api/trt_resnext_test.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_split_converter_test.cc b/test/cpp/inference/api/trt_split_converter_test.cc index ca41ac5681e4e1..8d87b98f6e34bf 100644 --- a/test/cpp/inference/api/trt_split_converter_test.cc +++ b/test/cpp/inference/api/trt_split_converter_test.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/trt_test_helper.h" namespace paddle { diff --git a/test/cpp/inference/api/trt_test_helper.h b/test/cpp/inference/api/trt_test_helper.h index db446e64054c03..dccbb589bdb0b2 100644 --- a/test/cpp/inference/api/trt_test_helper.h +++ b/test/cpp/inference/api/trt_test_helper.h @@ -17,17 +17,17 @@ limitations under the License. */ #include #include -#include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/tester_helper.h" namespace paddle { namespace inference { -DEFINE_bool(use_tensorrt, true, "Test the performance of TensorRT engine."); -DEFINE_string(prog_filename, "", "Name of model file."); -DEFINE_string(param_filename, "", "Name of parameters file."); +PD_DEFINE_bool(use_tensorrt, true, "Test the performance of TensorRT engine."); +PD_DEFINE_string(prog_filename, "", "Name of model file."); +PD_DEFINE_string(param_filename, "", "Name of parameters file."); template void SetConfig(ConfigType* config, diff --git a/test/cpp/inference/api/xpu_config_resnet50_test.cc b/test/cpp/inference/api/xpu_config_resnet50_test.cc index ce3796e420970d..d118eef9e88fd4 100644 --- a/test/cpp/inference/api/xpu_config_resnet50_test.cc +++ b/test/cpp/inference/api/xpu_config_resnet50_test.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/tester_helper.h" namespace paddle_infer { diff --git a/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc b/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc index b9ab6ea68d7b5c..94de193c89513c 100644 --- a/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc +++ b/test/cpp/inference/api/xpu_runtime_config_resnet50_test.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include #include -#include "gflags/gflags.h" +#include "paddle/utils/flags.h" #include "test/cpp/inference/api/tester_helper.h" #include "xpu/runtime.h" #include "xpu/xdnn.h" diff --git a/test/cpp/inference/infer_ut/test_LeViT.cc b/test/cpp/inference/infer_ut/test_LeViT.cc index 056371b0ae662a..ed30f04e7301bd 100644 --- a/test/cpp/inference/infer_ut/test_LeViT.cc +++ b/test/cpp/inference/infer_ut/test_LeViT.cc @@ -248,6 +248,6 @@ TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/test/cpp/inference/infer_ut/test_det_mv3_db.cc b/test/cpp/inference/infer_ut/test_det_mv3_db.cc index a407251ccba666..474d0701225f57 100644 --- a/test/cpp/inference/infer_ut/test_det_mv3_db.cc +++ b/test/cpp/inference/infer_ut/test_det_mv3_db.cc @@ -183,6 +183,6 @@ TEST(mkldnn_tester_det_mv3_db, multi_thread2_mkl_fp32_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/test/cpp/inference/infer_ut/test_ernie_text_cls.cc b/test/cpp/inference/infer_ut/test_ernie_text_cls.cc index 2ffeb604230f7b..ddf2dbc49d8979 100644 --- a/test/cpp/inference/infer_ut/test_ernie_text_cls.cc +++ b/test/cpp/inference/infer_ut/test_ernie_text_cls.cc @@ -134,6 +134,6 @@ TEST(mkldnn_tester_ernie_text_cls, multi_thread4_mkl_fp32_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/test/cpp/inference/infer_ut/test_ernie_xnli_int8.cc b/test/cpp/inference/infer_ut/test_ernie_xnli_int8.cc index e3030d08021ce0..4e9c96c530a100 100644 --- a/test/cpp/inference/infer_ut/test_ernie_xnli_int8.cc +++ b/test/cpp/inference/infer_ut/test_ernie_xnli_int8.cc @@ -192,7 +192,7 @@ TEST(tensorrt_tester_ernie_xnli, oss_varlen_truth_data_int8) { int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, true); #if IS_TRT_VERSION_GE(7200) return RUN_ALL_TESTS(); diff --git a/test/cpp/inference/infer_ut/test_mobilnetv1.cc b/test/cpp/inference/infer_ut/test_mobilnetv1.cc index 582c34e1b0b47b..2660cc5cbd5d65 100644 --- a/test/cpp/inference/infer_ut/test_mobilnetv1.cc +++ b/test/cpp/inference/infer_ut/test_mobilnetv1.cc @@ -81,6 +81,6 @@ TEST(tensorrt_tester_mobilenetv1, tuned_dynamic_trt_fp32_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } 
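Note for reviewers: the standalone infer_ut binaries above and below intentionally keep gflags for argument parsing (only the ::GFLAGS_NAMESPACE:: spelling is normalized to gflags::), presumably because they still link gflags directly; the in-tree tests migrate to the paddle_flags API instead. A minimal sketch of the migrated in-tree pattern, assuming a hypothetical test source file — the flag name, default, and exit logic are illustrative, not taken from this patch:

    #include "paddle/utils/flags.h"

    // PD_DEFINE_* mirrors gflags' DEFINE_* macros and still emits a FLAGS_*
    // symbol, so existing call sites that read the flag stay unchanged.
    PD_DEFINE_string(infer_model, "", "Directory of the inference model.");

    int main(int argc, char* argv[]) {
      // Unlike gflags' ParseCommandLineFlags, the paddle::flags variant used
      // throughout this patch takes no trailing `remove_flags` bool.
      paddle::flags::ParseCommandLineFlags(&argc, &argv);
      return FLAGS_infer_model.empty() ? 1 : 0;
    }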
diff --git a/test/cpp/inference/infer_ut/test_ppyolo_mbv3.cc b/test/cpp/inference/infer_ut/test_ppyolo_mbv3.cc index a075192a58054b..407e7c87dc97c6 100644 --- a/test/cpp/inference/infer_ut/test_ppyolo_mbv3.cc +++ b/test/cpp/inference/infer_ut/test_ppyolo_mbv3.cc @@ -155,6 +155,6 @@ TEST(DISABLED_mkldnn_tester_ppyolo_mbv3, multi_thread4_mkl_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/test/cpp/inference/infer_ut/test_ppyolov2_r50vd.cc b/test/cpp/inference/infer_ut/test_ppyolov2_r50vd.cc index ad91cb5bc9e855..c90256fb8b8c12 100644 --- a/test/cpp/inference/infer_ut/test_ppyolov2_r50vd.cc +++ b/test/cpp/inference/infer_ut/test_ppyolov2_r50vd.cc @@ -154,6 +154,6 @@ TEST(mkldnn_tester_ppyolov2_r50vd, multi_thread2_mkl_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/test/cpp/inference/infer_ut/test_resnet50.cc b/test/cpp/inference/infer_ut/test_resnet50.cc index 1c9b70564929f2..50b0b71c2e1da8 100644 --- a/test/cpp/inference/infer_ut/test_resnet50.cc +++ b/test/cpp/inference/infer_ut/test_resnet50.cc @@ -242,6 +242,6 @@ TEST(DISABLED_tensorrt_tester_resnet50, profile_multi_thread_trt_fp32) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/test/cpp/inference/infer_ut/test_resnet50_quant.cc b/test/cpp/inference/infer_ut/test_resnet50_quant.cc index 452cf31b311392..c2c44102f56b69 100644 --- a/test/cpp/inference/infer_ut/test_resnet50_quant.cc +++ b/test/cpp/inference/infer_ut/test_resnet50_quant.cc @@ -171,6 +171,6 @@ TEST(DISABLED_tensorrt_tester_resnet50_quant, multi_thread_multi_instance) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/test/cpp/inference/infer_ut/test_yolov3.cc b/test/cpp/inference/infer_ut/test_yolov3.cc index 4dc35af4959354..4a3e65ba5a574c 100644 --- a/test/cpp/inference/infer_ut/test_yolov3.cc +++ b/test/cpp/inference/infer_ut/test_yolov3.cc @@ -154,6 +154,6 @@ TEST(test_yolov3, multi_thread4_mkl_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + gflags::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/test/cpp/ir/core/ir_program_test.cc b/test/cpp/ir/core/ir_program_test.cc index dcc81e9c517827..c7729ae89fde8a 100644 --- a/test/cpp/ir/core/ir_program_test.cc +++ b/test/cpp/ir/core/ir_program_test.cc @@ -14,6 +14,8 @@ #include +#include <sstream> + #include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" #include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h" #include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" @@ -193,7 +195,13 @@ TEST(program_test, program) { EXPECT_EQ(program.block()->size() == 5, true); EXPECT_EQ(program.parameters_num() == 3, true); - program.Print(std::cout); + std::stringstream ss; + program.Print(ss); + + std::stringstream ss_ostream; + ss_ostream << program; + 
EXPECT_EQ(ss.str(), ss_ostream.str()); } TEST(program_test, slice_combine_test) { diff --git a/test/cpp/phi/api/scale_api.h b/test/cpp/phi/api/scale_api.h index 571ab0defbce74..104034d5effe99 100644 --- a/test/cpp/phi/api/scale_api.h +++ b/test/cpp/phi/api/scale_api.h @@ -14,7 +14,6 @@ #pragma once -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/kernel_dispatch.h" @@ -26,6 +25,7 @@ #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/scale_kernel.h" +#include "paddle/utils/flags.h" PHI_DECLARE_int32(low_precision_op_list); namespace paddle { diff --git a/test/cpp/prim/test_static_prim.cc b/test/cpp/prim/test_static_prim.cc index e26f54a44be430..d4f5dcb8998ae7 100644 --- a/test/cpp/prim/test_static_prim.cc +++ b/test/cpp/prim/test_static_prim.cc @@ -28,7 +28,7 @@ #include "paddle/phi/core/flags.h" #include "paddle/phi/core/kernel_registry.h" -DECLARE_bool(prim_enabled); +PD_DECLARE_bool(prim_enabled); PHI_DECLARE_string(tensor_operants_mode); PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); diff --git a/test/ir/inference/test_trt_support_nhwc_pass.py b/test/ir/inference/test_trt_support_nhwc_pass.py index 7c0a6eb4b4a890..0648202aba30c4 100644 --- a/test/ir/inference/test_trt_support_nhwc_pass.py +++ b/test/ir/inference/test_trt_support_nhwc_pass.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import shutil +import tempfile import unittest import numpy as np @@ -53,6 +55,15 @@ def __init__(self): data_format='NHWC', ) self.relu3 = nn.ReLU() + self.conv4 = nn.Conv2D( + in_channels=2, + out_channels=1, + kernel_size=3, + stride=2, + padding=0, + data_format='NHWC', + ) + self.relu4 = nn.ReLU() self.flatten = nn.Flatten() self.fc = nn.Linear(729, 10) self.softmax = nn.Softmax() @@ -62,8 +73,12 @@ def forward(self, x): x = self.relu1(x) x = self.conv2(x) x = self.relu2(x) + res = x x = self.conv3(x) x = self.relu3(x) + res = self.conv4(res) + res = self.relu4(res) + x = x + res x = self.flatten(x) x = self.fc(x) x = self.softmax(x) @@ -73,7 +88,11 @@ def forward(self, x): class TRTNHWCConvertTest(unittest.TestCase): def setUp(self): self.place = paddle.CUDAPlace(0) - self.path = './inference_pass/nhwc_convert/infer_model' + self.temp_dir = tempfile.TemporaryDirectory() + self.path = os.path.join( + self.temp_dir.name, 'inference_pass', 'nhwc_converter', '' + ) + self.model_prefix = self.path + 'infer_model' def create_model(self): image = static.data( @@ -82,11 +101,13 @@ def create_model(self): predict = SimpleNet()(image) exe = paddle.static.Executor(self.place) exe.run(paddle.static.default_startup_program()) - paddle.static.save_inference_model(self.path, [image], [predict], exe) + paddle.static.save_inference_model( + self.model_prefix, [image], [predict], exe + ) def create_predictor(self): config = paddle.inference.Config( - self.path + '.pdmodel', self.path + '.pdiparams' + self.model_prefix + '.pdmodel', self.model_prefix + '.pdiparams' ) config.enable_memory_optim() config.enable_use_gpu(100, 0) @@ -123,7 +144,7 @@ def test_nhwc_convert(self): result = self.infer(predictor, img=[img]) def tearDown(self): - shutil.rmtree('./inference_pass/nhwc_convert/') + shutil.rmtree(self.path) if __name__ == '__main__': diff --git a/test/ir/new_ir/test_build_model.py b/test/ir/new_ir/test_build_model.py index 67c634821fd821..8c9cbb32aa28fa 100644 --- a/test/ir/new_ir/test_build_model.py
+++ b/test/ir/new_ir/test_build_model.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import unittest import numpy as np @@ -29,15 +28,25 @@ def test_basic_network(self): y = paddle.static.data('y', [4, 4], dtype='float32') divide_out = paddle.divide(x, y) sum_out = paddle.sum(divide_out) - exe = paddle.static.Executor() x_feed = np.ones([4, 4], dtype=np.float32) * 10 y_feed = np.ones([4, 4], dtype=np.float32) * 2 (sum_value,) = exe.run( - feed={'x': x_feed, 'y': y_feed}, fetch_list=[sum_out] + main_program, + feed={'x': x_feed, 'y': y_feed}, + fetch_list=[sum_out], ) self.assertEqual(sum_value, 5 * 4 * 4) + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program): + x = paddle.static.data('x', [4, 4], dtype='float32') + out = paddle.mean(x) + exe = paddle.static.Executor() + x_feed = np.ones([4, 4], dtype=np.float32) * 10 + (sum_value,) = exe.run(feed={'x': x_feed}, fetch_list=[out]) + self.assertEqual(sum_value, 10) + if __name__ == "__main__": unittest.main() diff --git a/test/ir/new_ir/test_ir_backward.py b/test/ir/new_ir/test_ir_backward.py index be29baa1069d2f..5d858fc2b76026 100644 --- a/test/ir/new_ir/test_ir_backward.py +++ b/test/ir/new_ir/test_ir_backward.py @@ -16,7 +16,7 @@ import paddle from paddle import ir -from paddle.autograd.backward import grad +from paddle.autograd.ir_backward import grad paddle.enable_static() diff --git a/test/ir/new_ir/test_standalone_new_ir.py b/test/ir/new_ir/test_standalone_new_ir.py index fd5ee675707694..51843b8b5037eb 100644 --- a/test/ir/new_ir/test_standalone_new_ir.py +++ b/test/ir/new_ir/test_standalone_new_ir.py @@ -345,6 +345,26 @@ def func(x, y): np.testing.assert_array_equal(z.numpy(), gold_res) +# TODO(phlrain): open this after fix pr(55509) confict +# class TestNewIrLogicalDygraph(unittest.TestCase): +# def test_with_new_ir(self): +# paddle.disable_static() + +# @paddle.jit.to_static +# def func(x, y, z): +# a = paddle.logical_and(x, y) +# return z + a.cast("float32") + +# x = paddle.ones([2, 2], dtype='float32') +# y = paddle.ones([2, 2], dtype='float32') +# z = paddle.ones([2, 2], dtype='float32') + +# z = func(x, y, z) + +# gold_res = np.ones([2, 2], dtype="float32") * 2 +# np.testing.assert_array_equal(z.numpy(), gold_res) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 079233a9c1676c..86443907bd9aab 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -503,6 +503,10 @@ if(NOT WITH_GPU list(REMOVE_ITEM TEST_OPS test_build_strategy_fusion_group_pass) endif() +if(NOT WITH_CUDNN_FRONTEND) + list(REMOVE_ITEM TEST_OPS test_fused_scale_bias_relu_conv_bnstats_op) +endif() + # Some ops need to check results when gc is enabled # Currently, only ops that register NoNeedBufferVarsInference need to do this test set(TEST_OPS_WITH_GC diff --git a/test/legacy_test/eager_op_test.py b/test/legacy_test/eager_op_test.py index c8b7ec70b70a7e..3c50f3d6f5a6f7 100644 --- a/test/legacy_test/eager_op_test.py +++ b/test/legacy_test/eager_op_test.py @@ -31,6 +31,7 @@ from white_list import ( check_shape_white_list, compile_vs_runtime_white_list, + new_ir_python_api_grad_white_list, no_check_set_white_list, no_grad_set_white_list, op_accuracy_white_list, @@ -39,6 +40,7 @@ import paddle from paddle import fluid +from 
paddle.autograd.ir_backward import grad as ir_grad from paddle.fluid import core, unique_name from paddle.fluid.backward import append_backward from paddle.fluid.executor import Executor @@ -1201,6 +1203,164 @@ def _calc_dygraph_output( ) return outputs + def get_kernel_signature(self, place, egr_inps=None, egr_oups=None): + with fluid.dygraph.base.guard(place=place): + block = fluid.default_main_program().global_block() + op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) + # prepare input variable + dygraph_tensor_inputs = ( + egr_inps + if egr_inps + else self.append_input_output_for_dygraph( + op_proto, self.inputs, True, False, block + ) + ) + # prepare output variable + dygraph_tensor_outputs = ( + egr_oups + if egr_oups + else self.append_input_output_for_dygraph( + op_proto, self.outputs, False, False, block + ) + ) + + # prepare attributes + attrs_outputs = {} + if hasattr(self, "attrs"): + for attrs_name in self.attrs: + if self.attrs[attrs_name] is not None: + attrs_outputs[attrs_name] = self.attrs[attrs_name] + + kernel_sig = OpTestUtils._get_kernel_signature( + self.op_type, + dygraph_tensor_inputs, + dygraph_tensor_outputs, + canonicalize_attrs(attrs_outputs, op_proto), + ) + if not kernel_sig or ( + len(kernel_sig[0]) == 0 + and len(kernel_sig[1]) == 0 + and len(kernel_sig[2]) == 0 + ): + return None + if not hasattr(self, "python_api"): + print(kernel_sig) + assert hasattr(self, "python_api"), ( + "Detect there is KernelSignature for `%s` op, please set the `self.python_api` if you set check_dygraph = True" + % self.op_type + ) + return kernel_sig + + def get_ir_input_attr_dict_and_feed(self, stop_gradient): + attrs_outputs = {} + if hasattr(self, "attrs"): + for attrs_name in self.attrs: + if self.attrs[attrs_name] is not None: + attrs_outputs[attrs_name] = self.attrs[attrs_name] + input_dict = {} + static_inputs = defaultdict(list) + feed = {} + for name, item in self.inputs.items(): + if isinstance(item, list): + for tup in item: + dtype = ( + "bfloat16" + if OpTestUtils.is_bfloat16_type(tup[1].dtype) + else tup[1].dtype + ) + x = paddle.static.data( + name=str(tup[0]), shape=tup[1].shape, dtype=dtype + ) + x.stop_gradient = stop_gradient + static_inputs[name].append(x) + feed.update({str(tup[0]): tup[1]}) + input_dict.update({str(tup[0]): x}) + else: + dtype = ( + "bfloat16" + if OpTestUtils.is_bfloat16_type(item.dtype) + else item.dtype + ) + x = paddle.static.data(name=name, shape=item.shape, dtype=dtype) + x.stop_gradient = stop_gradient + static_inputs[name].append(x) + feed.update({name: item}) + input_dict.update({name: x}) + return static_inputs, attrs_outputs, input_dict, feed + + def _calc_new_ir_output( + self, place, no_check_set=None, inps=None, oups=None + ): + """set egr_inps and egr_oups = None if you want to create it by yourself.""" + + def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): + if hasattr(self, "python_out_sig"): + output_sig = self.python_out_sig + if not isinstance(ret_tuple, (tuple, list)): + ret_tuple = [ret_tuple] + if len(output_sig) == len(ret_tuple): + # [assumption]: we assume {"Out": [Tensor]} + return {a: [b] for a, b in zip(output_sig, ret_tuple)} + else: + # [assumption]: return multi-Tensor in a single output. such as paddle.split() + assert ( + len(output_sig) == 1 + ), "Don't support multi-output with multi-tensor output. 
+    def _calc_new_ir_output(
+        self, place, no_check_set=None, inps=None, oups=None
+    ):
+        """Set egr_inps and egr_oups to None to have them created here."""
+
+        def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
+            if hasattr(self, "python_out_sig"):
+                output_sig = self.python_out_sig
+            if not isinstance(ret_tuple, (tuple, list)):
+                ret_tuple = [ret_tuple]
+            if len(output_sig) == len(ret_tuple):
+                # [assumption]: we assume {"Out": [Tensor]}
+                return {a: [b] for a, b in zip(output_sig, ret_tuple)}
+            else:
+                # [assumption]: multiple Tensors returned in a single output, e.g. paddle.split()
+                assert (
+                    len(output_sig) == 1
+                ), "Don't support multi-output with multi-tensor output. (Maybe you can set `python_out_sig`; see `test_squeeze2_op` for an example.)"
+                return {output_sig[0]: ret_tuple}
+
+        # get kernel signature
+        kernel_sig = self.get_kernel_signature(place)
+        ir_program = paddle.static.Program()
+        with paddle.static.program_guard(ir_program):
+            # prepare inputs, attributes and feed
+            (
+                static_inputs,
+                attrs,
+                input_dict,
+                feed,
+            ) = self.get_ir_input_attr_dict_and_feed(stop_gradient=True)
+            # prepare args
+            args = OpTestUtils.prepare_python_api_arguments(
+                self.python_api,
+                static_inputs,
+                attrs,
+                kernel_sig,
+            )
+            inputs_sig, attrs_sig, outputs_sig = kernel_sig
+            args = OpTestUtils.assumption_assert_and_transform(
+                args, len(inputs_sig)
+            )
+            ret_tuple = self.python_api(*args)
+            result = construct_output_dict_by_kernel_sig(ret_tuple, outputs_sig)
+            if hasattr(self, "python_out_sig_sub_name"):
+                for key in self.python_out_sig_sub_name.keys():
+                    for i in range(len(self.python_out_sig_sub_name[key])):
+                        result[key][0][i].name = self.python_out_sig_sub_name[
+                            key
+                        ][i]
+            fetch_list = getattr(self, "fetch_list", [])
+            # If fetch_list is customized by the user, use it directly;
+            # otherwise, fill it with the outputs configured in the test.
+            if len(fetch_list) == 0:
+                for var in result.items():
+                    if no_check_set is not None and var in no_check_set:
+                        continue
+                    if isinstance(var[1], list):
+                        for v in var[1]:
+                            fetch_list.append(v)
+                    else:
+                        fetch_list.append(var[1])
+
+            # executor run
+            executor = Executor(place)
+            (outs,) = executor.run(
+                ir_program,
+                feed=feed,
+                fetch_list=fetch_list,
+            )
+            return outs
+
     def _check_ir_output(self, place, program, feed_map, fetch_list, outs):
         if os.getenv("FLAGS_NEW_IR_OPTEST") is None:
             return
@@ -2123,6 +2283,114 @@ def _is_skip_name(self, name):
                 return True
             return super()._is_skip_name(name)

+        class NewIRChecker(Checker):
+            def init(self):
+                self.checker_name = "new ir checker"
+
+            def calculate_output(self):
+                self.is_python_api_test = True
+                new_ir_outs = self.op_test._calc_new_ir_output(place)
+                if new_ir_outs is None:
+                    self.is_python_api_test = False
+                    # missing KernelSignature, fall back to eager (dygraph) output.
+                    new_ir_outs = self.op_test._calc_dygraph_output(
+                        place, no_check_set=no_check_set
+                    )
+                self.outputs = new_ir_outs
+                if self.op_test.is_compared_with_fp32():
+                    self.op_test.enable_cal_ref_output()
+                    self.is_python_api_test = True
+                    self.ref_outputs = self.op_test._calc_new_ir_output(place)
+                    if self.ref_outputs is None:
+                        self.is_python_api_test = False
+                        # missing KernelSignature, fall back to eager (dygraph) output.
+                        self.ref_outputs = self.op_test._calc_dygraph_output(
+                            place, no_check_set=no_check_set
+                        )
+                    self.op_test.disable_cal_ref_output()
+
+            def _compare_numpy(self, name, actual_np, expect_np):
+                expect_np = np.array(expect_np)
+                assert (
+                    actual_np.shape == expect_np.shape
+                ), "Operator ({}): Output ({}) shape mismatch, expected shape is {}, but actual shape is {}".format(
+                    self.op_type, name, expect_np.shape, actual_np.shape
+                )
+                np.testing.assert_allclose(
+                    actual_np,
+                    expect_np,
+                    atol=atol,
+                    rtol=self.rtol if hasattr(self, 'rtol') else rtol,
+                    equal_nan=equal_nan,
+                    err_msg=(
+                        "Operator ("
+                        + self.op_type
+                        + ") Output ("
+                        + name
+                        + ") has diff at "
+                        + str(place)
+                        + " in "
+                        + self.checker_name
+                    ),
+                )
+
+            def convert_uint16_to_float_ifneed(self, actual_np, expect_np):
+                if actual_np.dtype == np.uint16:
+                    self.rtol = 1.0e-2
+                elif actual_np.dtype == np.float16:
+                    self.rtol = 1.0e-3
+                else:
+                    self.rtol = 1.0e-5
+                if self.op_test.is_bfloat16_op():
+                    if actual_np.dtype == np.uint16:
+                        actual_np = convert_uint16_to_float(actual_np)
+                    if expect_np.dtype == np.uint16:
+                        expect_np = convert_uint16_to_float(expect_np)
+                return actual_np, expect_np
+
+            def find_actual_value(self, target_name):
+                with paddle.ir.core.program_guard(
+                    paddle.ir.core.default_main_program()
+                ):
+                    actual = self.outputs
+                    actual_t = np.array(actual)
+                    return actual, actual_t
+
+            def find_expect_value(self, name):
+                with paddle.ir.core.program_guard(
+                    paddle.ir.core.default_main_program()
+                ):
+                    expect = self.ref_outputs
+                    expect_t = np.array(expect)
+                    return expect, expect_t
+
+            def _compare_list(self, name, actual, expect):
+                """If expect is a tuple, compare as lists."""
+                with paddle.ir.core.program_guard(place=place):
+                    self.op_test.assertListEqual(
+                        actual.value()
+                        .get_tensor()
+                        .recursive_sequence_lengths(),
+                        expect[1],
+                        "Operator ("
+                        + self.op_type
+                        + ") Output ("
+                        + name
+                        + ") has different lod at "
+                        + str(place)
+                        + " in new ir mode",
+                    )
+
+            def _is_skip_name(self, name):
+                # If in final state and the kernel signature doesn't contain the name, skip it.
+                if (
+                    self.is_python_api_test
+                    and hasattr(self.op_test, "python_out_sig")
+                    and name not in self.op_test.python_out_sig
+                ):
+                    return True
+                return super()._is_skip_name(name)
+
         # set some flags by the combination of arguments.
         if self.is_float16_op():
             self.dtype = np.float16
@@ -2184,6 +2452,21 @@ def _is_skip_name(self, name):
             dygraph_checker.check()
             dygraph_dygraph_outs = dygraph_checker.outputs

+        if (
+            self.op_type
+            in new_ir_python_api_grad_white_list.new_ir_python_api_grad_white_list
+        ):
+            if (
+                type(place) is paddle.fluid.libpaddle.CPUPlace
+                or type(place) is paddle.fluid.libpaddle.CUDAPlace
+            ):
+                print("New IR checker begins...........")
+                with paddle.new_ir_utils._newir_guard():
+                    new_ir_checker = NewIRChecker(self, self.outputs)
+                    new_ir_checker.check()
+                print("New IR checker ends...........")
+
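A note on the `uint16` branches in `convert_uint16_to_float_ifneed` above: bfloat16 outputs travel through these tests as uint16 arrays. The real `convert_uint16_to_float` helper lives in the test utilities; a plausible numpy equivalent, assuming a bfloat16 value is stored as the high 16 bits of an IEEE-754 float32, is:

```python
import numpy as np


# Plausible equivalent of convert_uint16_to_float (illustration only):
# widen each uint16 to uint32, shift into the high half of the word,
# then reinterpret the bits as float32.
def uint16_to_float(x):
    return (x.astype(np.uint32) << 16).view(np.float32)


raw = np.array([0x3F80, 0x4000], dtype=np.uint16)  # bfloat16 bit patterns
print(uint16_to_float(raw))  # -> [1. 2.]
```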
     # Note(zhiqiu): inplace_atol should be only set when op doesn't ensure
     # computational consistency.
     # For example, group_norm uses AtomicAdd on CUDAPlace, which does not ensure
@@ -2567,14 +2850,6 @@ def check_grad_with_place(
             numeric_grad_delta = 1e-5
             max_relative_error = 1e-7

-        if (
-            self.dtype == np.complex128
-            and self.op_type
-            not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST
-        ):
-            numeric_grad_delta = 1e-5
-            max_relative_error = 1e-6
-
         cache_list = None
         if hasattr(self, "cache_name_list"):
             cache_list = self.cache_name_list
@@ -2728,6 +3003,33 @@ def check_grad_with_place(
                 "Gradient Check On %s" % str(place),
                 atol=atol,
             )
+        # get new ir gradient
+        if (
+            self.op_type
+            in new_ir_python_api_grad_white_list.new_ir_python_api_grad_white_list
+        ):
+            if (
+                type(place) is paddle.fluid.libpaddle.CPUPlace
+                or type(place) is paddle.fluid.libpaddle.CUDAPlace
+            ):
+                print("New IR gradient begins...........")
+                with paddle.new_ir_utils._newir_guard():
+                    new_ir_grad = self._get_ir_gradient(
+                        inputs_to_check,
+                        place,
+                        output_names,
+                        user_defined_grad_outputs,
+                        no_grad_set,
+                    )
+                print("New IR gradient ends...........")
+                self._assert_is_close(
+                    numeric_grads,
+                    [new_ir_grad],
+                    inputs_to_check,
+                    max_relative_error,
+                    "Gradient Check On %s" % str(place),
+                    atol=atol,
+                )

     def _find_var_in_dygraph(self, output_vars, name):
         if name in output_vars:
@@ -3073,6 +3375,106 @@ def _get_gradient(

         return res

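For context on the tolerances being shuffled here (`numeric_grad_delta`, `max_relative_error` are removed from the global complex128 branch above and re-added per-test in test_elementwise_div_op.py below): they tune a central-difference gradient check. A minimal self-contained sketch of that check, not taken from eager_op_test.py:

```python
import numpy as np


# numeric_grad_delta is the finite-difference step; max_relative_error is
# the pass/fail bar when comparing numeric and analytic gradients.
def numeric_grad(f, x, delta=1e-5):
    g = np.zeros_like(x)
    for i in range(x.size):
        xp, xm = x.copy(), x.copy()
        xp.flat[i] += delta
        xm.flat[i] -= delta
        g.flat[i] = (f(xp) - f(xm)) / (2 * delta)  # central difference
    return g


x = np.random.rand(3)
analytic = np.cos(x)  # d/dx of sin(x)
numeric = numeric_grad(lambda v: np.sin(v).sum(), x, delta=1e-5)
rel_err = np.abs(numeric - analytic) / np.maximum(np.abs(analytic), 1e-8)
assert rel_err.max() < 1e-6  # i.e. max_relative_error
```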
+    def _get_ir_gradient(
+        self,
+        inputs_to_check,
+        place,
+        output_names,
+        user_defined_grad_outputs=None,
+        no_grad_set=None,
+    ):
+        def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
+            if hasattr(self, "python_out_sig"):
+                output_sig = self.python_out_sig
+            if not isinstance(ret_tuple, (tuple, list)):
+                ret_tuple = [ret_tuple]
+            if len(output_sig) == len(ret_tuple):
+                # [assumption]: we assume {"Out": [Tensor]}
+                return {a: [b] for a, b in zip(output_sig, ret_tuple)}
+            else:
+                # [assumption]: multiple Tensors returned in a single output, e.g. paddle.split()
+                assert (
+                    len(output_sig) == 1
+                ), "Don't support multi-output with multi-tensor output. (Maybe you can set `python_out_sig`; see `test_squeeze2_op` for an example.)"
+                return {output_sig[0]: ret_tuple}
+
+        # get kernel signature
+        kernel_sig = self.get_kernel_signature(place)
+        ir_program = paddle.static.Program()
+        with paddle.static.program_guard(ir_program):
+            # prepare inputs, attributes and feed
+            (
+                static_inputs,
+                attrs,
+                input_dict,
+                feed,
+            ) = self.get_ir_input_attr_dict_and_feed(stop_gradient=False)
+            # prepare args
+            args = OpTestUtils.prepare_python_api_arguments(
+                self.python_api,
+                static_inputs,
+                attrs,
+                kernel_sig,
+            )
+            inputs_sig, attrs_sig, outputs_sig = kernel_sig
+            args = OpTestUtils.assumption_assert_and_transform(
+                args, len(inputs_sig)
+            )
+            ret_tuple = self.python_api(*args)
+            result = construct_output_dict_by_kernel_sig(ret_tuple, outputs_sig)
+            if hasattr(self, "python_out_sig_sub_name"):
+                for key in self.python_out_sig_sub_name.keys():
+                    for i in range(len(self.python_out_sig_sub_name[key])):
+                        result[key][0][i].name = self.python_out_sig_sub_name[
+                            key
+                        ][i]
+            fetch_list = getattr(self, "fetch_list", [])
+            if len(fetch_list) == 0:
+                for var in result.items():
+                    if isinstance(var[1], list):
+                        for v in var[1]:
+                            fetch_list.append(v)
+                    else:
+                        fetch_list.append(var[1])
+            outputs = result
+            outputs_valid = outputs
+            grad_inputs = inputs_to_check
+            if user_defined_grad_outputs is None:
+                if len(outputs_valid) == 1:
+                    for outputs_valid_key in outputs_valid:
+                        loss = paddle.mean(outputs_valid[outputs_valid_key][0])
+                    grad_inputs = ir_grad(
+                        outputs=paddle.utils.flatten(loss),
+                        inputs=paddle.utils.flatten(static_inputs),
+                        grad_outputs=None,
+                    )
+            else:
+                # user_defined_grad_outputs here are numpy arrays
+                if not isinstance(user_defined_grad_outputs, list):
+                    user_defined_grad_outputs = [user_defined_grad_outputs]
+                grad_outputs = []
+                for grad_out_value in user_defined_grad_outputs:
+                    grad_outputs.append(paddle.to_tensor(grad_out_value))
+                # delete the inputs that don't need gradient computation
+                for no_grad_val in no_grad_set:
+                    del static_inputs[no_grad_val]
+
+                grad_inputs = ir_grad(
+                    outputs=paddle.utils.flatten(outputs),
+                    inputs=paddle.utils.flatten(static_inputs),
+                    grad_outputs=grad_outputs,
+                )
+            fetch_list = list(grad_inputs)
+
+            # executor run
+            executor = paddle.static.Executor()
+            (outs,) = executor.run(
+                ir_program,
+                feed=feed,
+                fetch_list=fetch_list,
+            )
+            return outs
+

 class OpTestTool:
     @classmethod
diff --git a/test/legacy_test/test_elementwise_div_op.py b/test/legacy_test/test_elementwise_div_op.py
index 5972f8089fd552..7fb7ec87e45013 100644
--- a/test/legacy_test/test_elementwise_div_op.py
+++ b/test/legacy_test/test_elementwise_div_op.py
@@ -538,6 +538,8 @@ def test_check_grad_normal(self):
         self.check_grad(
             ['X', 'Y'],
             'Out',
+            numeric_grad_delta=1e-5,
+            max_relative_error=1e-6,
         )

     def test_check_grad_ingore_x(self):
@@ -545,6 +547,8 @@ def test_check_grad_ingore_x(self):
             ['Y'],
             'Out',
             no_grad_set=set("X"),
+            numeric_grad_delta=1e-5,
+            max_relative_error=1e-6,
         )

     def test_check_grad_ingore_y(self):
@@ -552,6 +556,8 @@ def test_check_grad_ingore_y(self):
             ['X'],
             'Out',
             no_grad_set=set('Y'),
+            numeric_grad_delta=1e-5,
+            max_relative_error=1e-6,
         )
diff --git a/test/legacy_test/test_fused_scale_bias_relu_conv_bnstats_op.py b/test/legacy_test/test_fused_scale_bias_relu_conv_bnstats_op.py
new file mode 100644
index 00000000000000..cbed4e5b33fcf2
--- /dev/null
+++ b/test/legacy_test/test_fused_scale_bias_relu_conv_bnstats_op.py
@@ -0,0 +1,239 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+import numpy as np
+from eager_op_test import OpTest, skip_check_grad_ci
+
+import paddle
+from paddle import nn
+from paddle.fluid import core
+
+
+def skip_unit_test():
+    return (
+        not paddle.is_compiled_with_cuda()
+        or paddle.device.cuda.get_device_capability()[0] < 8
+        or paddle.get_cudnn_version() < 8800
+    )
+
+
+skip_msg = (
+    "only supported with CUDA and cuDNN 8.8 or later,"
+    " and only on Ampere or later devices"
+)
+
+
+@skip_check_grad_ci(reason="no grad op")
+@unittest.skipIf(skip_unit_test(), skip_msg)
+class TestFusedScaleBiasReluConvBnstatsOp(OpTest):
+    def setUp(self):
+        self.__class__.op_type = "fused_scale_bias_relu_conv_bnstats"
+        self.dtype = np.float16
+        self.outputs = None
+        self.padding_algorithm = "EXPLICIT"
+        self.data_format = "NHWC"
+        self.groups = 1
+        self.init_attr()
+        self.init_test_case()
+        self.rtol = 1e-5
+        self.atol = 2e-2
+
+        self.attrs = {
+            'fuse_prologue': self.fuse_prologue,
+            'strides': self.stride,
+            'paddings': self.pad,
+            'dilations': self.dilations,
+            'data_format': self.data_format,
+            'padding_algorithm': self.padding_algorithm,
+            'accumulation_count': self.accumulation_count,
+            'momentum': self.momentum,
+            'epsilon': self.epsilon,
+            'exhaustive_search': self.exhaustive_search,
+            'groups': self.groups,
+        }
+
+        # prepare inputs
+        np.random.seed(0)
+        self.x_input = np.random.random(self.x_size).astype(self.dtype)
+        self.bias_input = np.random.random(self.in_channel_num).astype(
+            self.dtype
+        )
+        self.scale_input = np.random.random(self.in_channel_num).astype(
+            self.dtype
+        )
+
+        self.x_input_prologue = self.x_input.astype(np.float32)
+        if self.fuse_prologue:
+            self.x_input_prologue *= self.scale_input.reshape(
+                (1, 1, 1, self.in_channel_num)
+            ).astype(
+                np.float32
+            )  # scale
+            self.x_input_prologue += self.bias_input.reshape(
+                (1, 1, 1, self.in_channel_num)
+            ).astype(
+                np.float32
+            )  # bias
+            self.x_input_prologue = np.maximum(self.x_input_prologue, 0)  # relu
+            self.x_input_prologue = self.x_input_prologue.astype(self.dtype)
+
+        paddle.disable_static()
+        paddle.seed(0)
+        paddle.set_default_dtype(self.dtype)
+
+        self.conv = nn.Conv2D(
+            in_channels=self.x_size[-1],
+            out_channels=self.filter_size[0],
+            kernel_size=self.filter_size[-1],
+            stride=self.stride,
+            padding=self.pad,
+            groups=self.groups,
+            bias_attr=False,
+            data_format=self.data_format,
+        )
+
+        self.bn = nn.BatchNorm(
+            self.filter_size[0],
+            momentum=self.momentum,
+            epsilon=self.epsilon,
+            data_layout=self.data_format,
+        )
+
+        self.w_input = self.conv.weight.numpy().astype(self.dtype)
+        self.bn_scale_input = self.bn.weight.numpy()
+        self.bn_bias_input = self.bn.bias.numpy()
+        self.bn_running_mean_input = self.bn._mean.numpy()
+        self.bn_running_var_input = self.bn._variance.numpy()
+
+        (
+            y_ref,
+            running_mean_out_ref,
+            running_var_out_ref,
+            saved_mean_out_ref,
+            saved_invvar_out_ref,
+            eqscale_ref,
+            eqbias_ref,
+        ) = self.calc_ref()
+
+        self.inputs = {
+            'x': self.x_input,
+            'w': self.w_input,
+            'bn_scale': self.bn_scale_input,
+            'bn_bias': self.bn_bias_input,
+            'input_running_mean': self.bn_running_mean_input,
+            'input_running_var': self.bn_running_var_input,
+        }
+        if self.fuse_prologue:
+            extra_inputs = {
+                'bias': self.bias_input,
+                'scale': self.scale_input,
+            }
+            self.inputs.update(extra_inputs)
+
+        self.outputs = {
+            'out': y_ref,
+            'out_running_mean': running_mean_out_ref,
+            'out_running_var': running_var_out_ref,
+            'saved_mean': saved_mean_out_ref,
+            'saved_var': saved_invvar_out_ref,
+            'eq_scale': eqscale_ref,
+            'eq_bias': eqbias_ref,
+        }
+
+    def calc_ref(self):
+        # Calculate the reference: (scale + bias + relu +) conv + BN
+        x_input_np = self.x_input
+        if self.fuse_prologue:
+            x_input_np = self.x_input_prologue
+        x_tensor = paddle.to_tensor(x_input_np, stop_gradient=False)
+        after_conv = self.conv(x_tensor)
+        after_bn = self.bn(after_conv)
+        # Calculate reference for saved_mean and saved_invvar
+        after_conv_np = (
+            after_conv.numpy()
+            .astype(np.float32)
+            .reshape((-1, after_conv.shape[-1]))
+        )
+        mean_np = after_conv_np.mean(axis=0)
+        var_np = after_conv_np.var(axis=0)
+        invstd_np = 1 / np.sqrt(var_np + self.epsilon)
+        # Calculate reference for eqscale and eqbias
+        eqscale_np = self.bn_scale_input * invstd_np
+        eqbias_np = (
+            self.bn_bias_input - self.bn_scale_input * mean_np * invstd_np
+        )
+        return (
+            after_conv.numpy().astype(self.dtype),
+            self.bn._mean.numpy(),
+            self.bn._variance.numpy(),
+            mean_np,
+            invstd_np,
+            eqscale_np,
+            eqbias_np,
+        )
+
+    def has_cuda(self):
+        return core.is_compiled_with_cuda()
+
+    def test_check_output(self):
+        if self.has_cuda():
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(
+                place, atol=self.atol, rtol=self.rtol, check_dygraph=False
+            )
+
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+
+        self.x_size = [8, 16, 16, 32]  # NHWC
+        self.filter_size = [64, 32, 1, 1]
+        self.y_size = [8, 16, 16, 64]
+        self.in_channel_num = self.x_size[-1]
+        self.out_channel_num = self.y_size[-1]
+        self.scale_size = [self.in_channel_num]
+        self.bn_size = [self.out_channel_num]
+        self.momentum = 0.9
+        self.epsilon = 1e-5
+        self.accumulation_count = (
+            self.y_size[0] * self.y_size[1] * self.y_size[2]
+        )
+
+    def init_attr(self):
+        self.fuse_prologue = True
+        self.exhaustive_search = False
+
+
+class TestFusedScaleBiasReluConvBnstatsOpNoPrologue(
+    TestFusedScaleBiasReluConvBnstatsOp
+):
+    def init_attr(self):
+        self.fuse_prologue = False
+        self.exhaustive_search = False
+
+
+class TestFusedScaleBiasReluConvBnstatsOpExhaustive(
+    TestFusedScaleBiasReluConvBnstatsOp
+):
+    def init_attr(self):
+        self.fuse_prologue = True
+        self.exhaustive_search = True
+
+
+if __name__ == '__main__':
+    unittest.main()
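The `eq_scale`/`eq_bias` outputs checked above fold batch norm into a single affine transform. A small numpy sanity check of that folding, illustrative only and assuming the textbook BN formula:

```python
import numpy as np

# Batch norm gamma*(x - mean)*invstd + beta collapses to y = x*eqscale + eqbias,
# with eqscale = gamma * invstd and eqbias = beta - gamma * mean * invstd.
rng = np.random.default_rng(0)
x = rng.standard_normal((128, 64)).astype(np.float32)
gamma, beta = rng.standard_normal(64), rng.standard_normal(64)
mean, var, eps = x.mean(0), x.var(0), 1e-5

invstd = 1.0 / np.sqrt(var + eps)
bn = gamma * (x - mean) * invstd + beta  # textbook batch norm
eqscale = gamma * invstd
eqbias = beta - gamma * mean * invstd
np.testing.assert_allclose(bn, x * eqscale + eqbias, rtol=1e-5, atol=1e-6)
```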
diff --git a/test/legacy_test/test_isclose_op.py b/test/legacy_test/test_isclose_op.py
index c09d7fd7751a68..2074a160c5b3d9 100644
--- a/test/legacy_test/test_isclose_op.py
+++ b/test/legacy_test/test_isclose_op.py
@@ -214,7 +214,7 @@ def test_fp16(self):
         y_data = np.random.rand(10, 10).astype('float16')
         with paddle.static.program_guard(paddle.static.Program()):
             x = paddle.static.data(shape=[10, 10], name='x', dtype='float16')
-            y = paddle.static.data(shape=[10, 10], name='x', dtype='float16')
+            y = paddle.static.data(shape=[10, 10], name='y', dtype='float16')
             out = paddle.isclose(x, y, rtol=1e-05, atol=1e-08)
             if core.is_compiled_with_cuda():
                 place = paddle.CUDAPlace(0)
@@ -259,6 +259,69 @@ def test_check_output(self):
         self.check_output()


+class TestIscloseOpCp64(unittest.TestCase):
+    def test_cp64(self):
+        x_data = (
+            np.random.rand(10, 10) + 1.0j * np.random.rand(10, 10)
+        ).astype(np.complex64)
+        y_data = (
+            np.random.rand(10, 10) + 1.0j * np.random.rand(10, 10)
+        ).astype(np.complex64)
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.static.data(shape=[10, 10], name='x', dtype=np.complex64)
+            y = paddle.static.data(shape=[10, 10], name='y', dtype=np.complex64)
+            out = paddle.isclose(x, y, rtol=1e-05, atol=1e-08)
+            if core.is_compiled_with_cuda():
+                place = paddle.CUDAPlace(0)
+                exe = paddle.static.Executor(place)
+                exe.run(paddle.static.default_startup_program())
+                out = exe.run(feed={'x': x_data, 'y': y_data}, fetch_list=[out])
+
+
+class TestIscloseOpCp128(unittest.TestCase):
+    def test_cp128(self):
+        x_data = (
+            np.random.rand(10, 10) + 1.0j * np.random.rand(10, 10)
+        ).astype(np.complex128)
+        y_data = (
+            np.random.rand(10, 10) + 1.0j * np.random.rand(10, 10)
+        ).astype(np.complex128)
+        with paddle.static.program_guard(paddle.static.Program()):
+            x = paddle.static.data(
+                shape=[10, 10], name='x', dtype=np.complex128
+            )
+            y = paddle.static.data(
+                shape=[10, 10], name='y', dtype=np.complex128
+            )
+            out = paddle.isclose(x, y, rtol=1e-05, atol=1e-08)
+            if core.is_compiled_with_cuda():
+                place = paddle.CUDAPlace(0)
+                exe = paddle.static.Executor(place)
+                exe.run(paddle.static.default_startup_program())
+                out = exe.run(feed={'x': x_data, 'y': y_data}, fetch_list=[out])
+
+
+class TestIscloseOpComplex64(TestIscloseOp):
+    def set_args(self):
+        self.input = np.array([10.1 + 0.1j]).astype(np.complex64)
+        self.other = np.array([10 + 0j]).astype(np.complex64)
+        self.rtol = np.array([0.01]).astype("float64")
+        self.atol = np.array([0]).astype("float64")
+        self.equal_nan = False
+
+
+class TestIscloseOpComplex128(TestIscloseOp):
+    def set_args(self):
+        self.input = np.array([10.1 + 0.1j]).astype(np.complex128)
+        self.other = np.array([10 + 0j]).astype(np.complex128)
+        self.rtol = np.array([0.01]).astype("float64")
+        self.atol = np.array([0]).astype("float64")
+        self.equal_nan = False
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestIscloseOpLargeDimInput(TestIscloseOp):
     def set_args(self):
         self.input = np.array(np.zeros([2048, 1024])).astype("float64")
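For reviewers unfamiliar with `isclose` on complex inputs: numpy applies the usual rule |a - b| <= atol + rtol * |b| with |.| taken as the complex magnitude, and the new cases presumably expect `paddle.isclose` to follow the same convention. A quick illustration of the `10.1 + 0.1j` vs `10 + 0j` case above:

```python
import numpy as np

a = np.array([10.1 + 0.1j], dtype=np.complex64)
b = np.array([10.0 + 0.0j], dtype=np.complex64)
lhs = np.abs(a - b)           # sqrt(0.1**2 + 0.1**2) ~= 0.1414
rhs = 0.0 + 0.01 * np.abs(b)  # atol + rtol * |b| = 0.1
print(lhs, rhs, np.isclose(a, b, rtol=0.01, atol=0.0))
```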
diff --git a/test/legacy_test/test_logical_op.py b/test/legacy_test/test_logical_op.py
index 80a42d4145181d..10bf33e40a5e09 100755
--- a/test/legacy_test/test_logical_op.py
+++ b/test/legacy_test/test_logical_op.py
@@ -31,6 +31,8 @@
     np.float16,
     np.float32,
     np.float64,
+    np.complex64,
+    np.complex128,
 ]

 TEST_META_OP_DATA = [
@@ -124,6 +126,10 @@ def np_data_generator(np_shape, dtype, *args, **kwargs):
     elif dtype == np.uint16:
         x = np.random.uniform(0.0, 1.0, np_shape).astype(np.float32)
         return convert_float_to_uint16(x)
+    elif dtype == np.complex64 or dtype == np.complex128:
+        return np.random.normal(0, 1, np_shape).astype(dtype) + (
+            1.0j * np.random.normal(0, 1, np_shape)
+        ).astype(dtype)
     else:
         return np.random.normal(0, 1, np_shape).astype(dtype)
@@ -169,6 +175,41 @@ def test(unit_test, use_gpu=False, test_error=False):
                 (dygraph_result.numpy() == np_result).all()
             )
             unit_test.assertTrue((eager_result.numpy() == np_result).all())
+            # add some corner cases for the complex dtypes
+            for complex_data_type in [np.complex64, np.complex128]:
+                for x_data in (0 + 0j, 0 + 1j, 1 + 0j, 1 + 1j):
+                    for y_data in (0 + 0j, 0 + 1j, 1 + 0j, 1 + 1j):
+                        meta_data['x_np'] = (
+                            x_data * np.ones(shape_data['x_shape'])
+                        ).astype(complex_data_type)
+                        meta_data['y_np'] = (
+                            y_data * np.ones(shape_data['y_shape'])
+                        ).astype(complex_data_type)
+                        if meta_data['binary_op'] and test_error:
+                            # catch C++ Exception
+                            unit_test.assertRaises(
+                                BaseException, run_static, **meta_data
+                            )
+                            unit_test.assertRaises(
+                                BaseException, run_dygraph, **meta_data
+                            )
+                            continue
+                        static_result = run_static(**meta_data)
+                        dygraph_result = run_dygraph(**meta_data)
+                        eager_result = run_eager(**meta_data)
+                        if meta_data['binary_op']:
+                            np_result = np_op(
+                                meta_data['x_np'], meta_data['y_np']
+                            )
+                        else:
+                            np_result = np_op(meta_data['x_np'])
+                        unit_test.assertTrue((static_result == np_result).all())
+                        unit_test.assertTrue(
+                            (dygraph_result.numpy() == np_result).all()
+                        )
+                        unit_test.assertTrue(
+                            (eager_result.numpy() == np_result).all()
+                        )


 def test_type_error(unit_test, use_gpu, type_str_map):
@@ -180,7 +221,9 @@ def check_type(op_str, x, y, binary_op):
         y = paddle.to_tensor(y)
         error_type = BaseException
         if binary_op:
-            if type_str_map['x'] != type_str_map['y']:
+            if type_str_map['x'] != type_str_map['y'] and type_str_map[
+                'x'
+            ] not in [np.complex64, np.complex128]:
                 unit_test.assertRaises(error_type, op, x=x, y=y)
             if not in_dynamic_mode():
                 error_type = TypeError
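The four-way corner cases above hinge on complex truthiness: a complex value is falsy only when it is exactly 0+0j. A quick numpy illustration:

```python
import numpy as np

# Only 0+0j maps to False; any nonzero real or imaginary part is truthy.
vals = np.array([0 + 0j, 0 + 1j, 1 + 0j, 1 + 1j], dtype=np.complex64)
print(vals.astype(bool))                 # [False  True  True  True]
print(np.logical_and(vals, vals[::-1]))  # elementwise AND on truthiness
```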
diff --git a/test/legacy_test/test_recompute_with_tuple_input.py b/test/legacy_test/test_recompute_with_tuple_input.py
new file mode 100644
index 00000000000000..90b6c37dca14a9
--- /dev/null
+++ b/test/legacy_test/test_recompute_with_tuple_input.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle
+from paddle.distributed.fleet.utils import recompute
+
+
+class Layer(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+        self.linear1 = paddle.nn.Linear(10, 10)
+        self.linear2 = paddle.nn.Linear(10, 10)
+        self.linear3 = paddle.nn.Linear(10, 10)
+        self.silu1 = paddle.nn.Silu()
+        self.silu2 = paddle.nn.Silu()
+        self.silu3 = paddle.nn.Silu()
+
+    def forward(self, x, y):
+        assert type(x) is tuple
+        assert len(x) == 2
+        o1 = self.silu1(self.linear1(x[0]))
+        o2 = self.silu2(self.linear2(x[1]))
+        o3 = self.silu3(self.linear3(y))
+        o = o1 + o2 + o3
+        return o
+
+
+class TestPyLayer(unittest.TestCase):
+    def test_tuple_input(self):
+        layer = Layer()
+        x1 = paddle.rand(shape=[10, 10])
+        x1.stop_gradient = False
+        x2 = paddle.rand(shape=[10, 10])
+        x2.stop_gradient = False
+        y = paddle.rand(shape=[10, 10])
+        y.stop_gradient = False
+        o = recompute(layer, (x1, x2), y)
+        loss = paddle.mean(o, keepdim=True)
+        loss.backward()
+
+    def test_tuple_input_with_non_tensor(self):
+        layer = Layer()
+        x1 = paddle.rand(shape=[10, 10])
+        x1.stop_gradient = False
+        y = paddle.rand(shape=[10, 10])
+        y.stop_gradient = False
+        try:
+            o = recompute(layer, (x1, True), y)
+        except ValueError:
+            pass
+
+    def test_tuple_input_with_different_stop_gradient(self):
+        layer = Layer()
+        x1 = paddle.rand(shape=[10, 10])
+        x1.stop_gradient = False
+        x2 = paddle.rand(shape=[10, 10])
+        y = paddle.rand(shape=[10, 10])
+        y.stop_gradient = False
+        try:
+            o = recompute(layer, (x1, x2), y)
+        except ValueError:
+            pass
+
+    def test_tuple_input_all_no_gradient(self):
+        layer = Layer()
+        x1 = paddle.rand(shape=[10, 10])
+        x2 = paddle.rand(shape=[10, 10])
+        y = paddle.rand(shape=[10, 10])
+        y.stop_gradient = False
+        o = recompute(layer, (x1, x2), y)
+        loss = paddle.mean(o, keepdim=True)
+        loss.backward()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/prim/new_ir_prim/CMakeLists.txt b/test/prim/new_ir_prim/CMakeLists.txt
index 393bc869d9beae..85611d846cbbe0 100644
--- a/test/prim/new_ir_prim/CMakeLists.txt
+++ b/test/prim/new_ir_prim/CMakeLists.txt
@@ -1,10 +1,20 @@
+set(TEST_PRIM_PURE_NEW_IR_CASES test_prim_program)
+
+foreach(target ${TEST_PRIM_PURE_NEW_IR_CASES})
+  py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1
+                  FLAGS_enable_new_ir_api=true)
+endforeach()
+
 file(
-  GLOB TEST_INTERP_CASES
+  GLOB TEST_PRIM_TRANS_NEW_IR_CASES
   RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
   "test_*.py")
-string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}")
+string(REPLACE ".py" "" TEST_PRIM_TRANS_NEW_IR_CASES
+               "${TEST_PRIM_TRANS_NEW_IR_CASES}")
+
+list(REMOVE_ITEM TEST_PRIM_TRANS_NEW_IR_CASES ${TEST_PRIM_PURE_NEW_IR_CASES})

-foreach(target ${TEST_INTERP_CASES})
+foreach(target ${TEST_PRIM_TRANS_NEW_IR_CASES})
   py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1
                   FLAGS_enable_new_ir_in_executor=true)
 endforeach()
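The next hunk brackets `decompose()` with `core._set_prim_forward_enabled(True)` / `(False)`. A hypothetical convenience wrapper, not part of this patch, could keep those two calls balanced even when `decompose()` raises:

```python
from contextlib import contextmanager

from paddle.framework import core


# Hypothetical helper (illustration only): pairing the enable/disable calls
# in a context manager guarantees the flag is reset on any exit path.
@contextmanager
def prim_forward_enabled():
    core._set_prim_forward_enabled(True)
    try:
        yield
    finally:
        core._set_prim_forward_enabled(False)


# usage sketch, mirroring the hunk below:
# with prim_forward_enabled():
#     y_new = decompose(newir_program, y)
```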
diff --git a/test/prim/new_ir_prim/test_decomp_op.py b/test/prim/new_ir_prim/test_decomp_op.py
index f90e0fe24391bd..413008f814f7fd 100644
--- a/test/prim/new_ir_prim/test_decomp_op.py
+++ b/test/prim/new_ir_prim/test_decomp_op.py
@@ -17,6 +17,7 @@
 import paddle
 from paddle import ir
 from paddle.decomposition import decompose
+from paddle.framework import core

 paddle.enable_static()
@@ -44,7 +45,9 @@ def test_build_op(self):
         y = newir_program.block().ops[-2].results()
         orig_shape = y[0].shape
         paddle.framework.set_flags({"FLAGS_enable_new_ir_api": True})
+        core._set_prim_forward_enabled(True)
         y_new = decompose(newir_program, y)
+        core._set_prim_forward_enabled(False)
         new_shape = y_new[0].shape
         assert (
             orig_shape == new_shape
diff --git a/test/prim/new_ir_prim/test_prim_program.py b/test/prim/new_ir_prim/test_prim_program.py
new file mode 100644
index 00000000000000..c4cc0187b1ad8b
--- /dev/null
+++ b/test/prim/new_ir_prim/test_prim_program.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle.autograd.ir_backward import grad
+from paddle.decomposition import decompose
+from paddle.framework import core
+
+paddle.enable_static()
+
+
+class TestPrimMode(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(2023)
+        self.shape_x = [8, 16, 32, 64]
+        self.shape_y = [8, 16, 32, 64]
+        self.x = np.random.random(self.shape_x).astype("float32")
+        self.y = np.random.random(self.shape_y).astype("float32")
+
+    def base_net(self, flag=None):
+        if flag == "forward":
+            core._set_prim_forward_enabled(True)
+        elif flag == "backward":
+            core._set_prim_backward_enabled(True)
+        elif flag == "all":
+            core._set_prim_all_enabled(True)
+        main_program = paddle.static.Program()
+        with paddle.static.program_guard(main_program):
+            x = paddle.static.data('x', self.shape_x, dtype='float32')
+            y = paddle.static.data('y', self.shape_y, dtype='float32')
+            x.stop_gradient = False
+            y.stop_gradient = False
+            divide_out = paddle.divide(x, y)
+            mean_out = paddle.mean(divide_out, axis=0)
+            [new_out] = decompose(main_program, [mean_out])
+            gradients = grad(new_out, (x, y))
+
+            exe = paddle.static.Executor()
+            [fwd, dx, dy] = exe.run(
+                feed={'x': self.x, 'y': self.y}, fetch_list=[new_out, gradients]
+            )
+
+        whole_ops = [op.name() for op in main_program.block().ops]
+        if flag == "forward":
+            core._set_prim_forward_enabled(False)
+            assert 'pd.mean' not in whole_ops and 'pd.divide_grad' in whole_ops
+        elif flag == "backward":
+            core._set_prim_backward_enabled(False)
+            assert 'pd.mean' in whole_ops and 'pd.divide_grad' not in whole_ops
+        elif flag == "all":
+            core._set_prim_all_enabled(False)
+            assert (
+                'pd.mean' not in whole_ops and 'pd.divide_grad' not in whole_ops
+            )
+        else:
+            assert 'pd.mean' in whole_ops and 'pd.divide_grad' in whole_ops
+        return fwd, dx, dy
+
+    def test_prim_forward(self):
+        res_ref = self.base_net()
+        res = self.base_net("forward")
+        for ref, actual in zip(res_ref, res):
+            np.testing.assert_equal(ref, actual)
+
+    def test_prim_backward(self):
+        res_ref = self.base_net()
+        res = self.base_net("backward")
+        for ref, actual in zip(res_ref, res):
+            np.testing.assert_allclose(ref, actual, rtol=1e-6)
+
+    def test_prim_all(self):
+        res_ref = self.base_net()
+        res = self.base_net("all")
+        for ref, actual in zip(res_ref, res):
+            np.testing.assert_allclose(ref, actual, rtol=1e-6)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/white_list/new_ir_python_api_grad_white_list.py b/test/white_list/new_ir_python_api_grad_white_list.py
new file mode 100644
index 00000000000000..81ab325a12aeb6
--- /dev/null
+++ b/test/white_list/new_ir_python_api_grad_white_list.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+new_ir_python_api_grad_white_list = [
+    "mean",
+]
diff --git a/test/white_list/op_accuracy_white_list.py b/test/white_list/op_accuracy_white_list.py
index 12a97a160aab67..49b501e765b541 100644
--- a/test/white_list/op_accuracy_white_list.py
+++ b/test/white_list/op_accuracy_white_list.py
@@ -92,6 +92,7 @@

 NO_FP16_COMPARED_WITH_FP32_OP_LIST = [
     'fake_quantize_moving_average_abs_max',
+    'fused_scale_bias_relu_conv_bnstats',
     'p_norm',
 ]
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index 6c86c2d5e32223..140ba2091e1403 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -108,6 +108,19 @@ if [ "$inference_approve" != "" ]; then
   check_approval 1 qingqing01 heavengate
 fi

+filter_fluid=`git diff --name-only upstream/develop | grep "py$" | grep "^test/"`
+filter_fluid+=" `git diff --name-only upstream/develop | grep "py$" | grep -v "^python/paddle/fluid"| grep "^python/paddle"`"
+has_fluid=`git diff -U0 upstream/$BRANCH -- $filter_fluid | grep '^\+' | grep -v '^++' | grep -E "(fluid\.)|(paddle\.fluid)"`
+if [ "${has_fluid}" != "" ]; then
+    for fluid in "${has_fluid}";
+    do
+        echo "${fluid}"
+    done
+    echo_line="You must have one RD (zoooo0820 (recommended) or jeff41404) approval for using the fluid API, because the fluid API is going to be removed.\n"
+    check_approval 1 zoooo0820 jeff41404
+fi
+
+
 DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_DEV.spec
 PR_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_PR.spec
 ADDED_OP_USE_DEFAULT_GRAD_MAKER=`python ${PADDLE_ROOT}/tools/diff_use_default_grad_op_maker.py ${DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC} ${PR_OP_USE_DEFAULT_GRAD_MAKER_SPEC}`
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index a78a545a5a6600..73428595c7d7e4 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -86,7 +86,9 @@ API_FILES=("CMakeLists.txt"
            "paddle/fluid/prim/api/manual_prim/prim_manual_api.h"
            "paddle/fluid/prim/api/api.yaml"
            "python/paddle/incubate/autograd/composite_rules.py"
-           "python/paddle/incubate/autograd/primitives.py"
+           "python/paddle/incubate/autograd/primitives.py"
+           "python/paddle/autograd/ir_backward.py"
+           "python/paddle/autograd/backward_utils.py"
            )

 approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
@@ -218,6 +220,9 @@ for API_FILE in ${API_FILES[*]}; do
     elif [ "${API_FILE}" == "python/paddle/incubate/autograd/primitives.py" ] || [ "${API_FILE}" == "python/paddle/incubate/autograd/composite_rules.py" ]; then
       echo_line="You must have one RD (cyber-pioneer(chenzhuo), xiaoguoguo626807(wangruting), Charles-hit(wanghao), JiabinYang) approval for changing ${API_FILE}, which manages the composite rules.\n"
      check_approval 1 cyber-pioneer xiaoguoguo626807 Charles-hit JiabinYang
+    elif [ "${API_FILE}" == "python/paddle/autograd/ir_backward.py" ] || [ "${API_FILE}" == "python/paddle/autograd/backward_utils.py" ]; then
+      echo_line="You must be approved by Aurelius84(zhangliujie) or cxxly(chenxiaoxu) or xiaoguoguo626807(wangruting) or changeyoung98(chenzhiyang) for python/paddle/autograd/ir_backward.py or python/paddle/autograd/backward_utils.py changes.\n"
+      check_approval 1 Aurelius84 cxxly xiaoguoguo626807 changeyoung98
     else
       echo_line="You must have one RD (XiaoguangHu01,chenwhql,zhiqiu,Xreki,luotao1,qili93,Aurelius84) approval for ${API_FILE}, which manages the underlying code for fluid.\n"
       check_approval 1 XiaoguangHu01 chenwhql zhiqiu Xreki luotao1 qili93 Aurelius84
@@ -270,6 +275,12 @@ if [ "${HAS_MODIFIED_DECLARATIONS}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
   check_approval 1 chenwhql zyfncg
 fi

+HAS_USED_CCTESTOLD=`git diff -U0 upstream/$BRANCH |grep "cc_test_old" || true`
+if [ "${HAS_USED_CCTESTOLD}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+  echo_line="You must be approved by phlrain or risemeup1 or zhangbo9674 for using cc_test_old. Thanks!\n"
+  check_approval 1 phlrain risemeup1 zhangbo9674
+fi
+
 HAS_MODIFIED_API_COMPAT_YAML=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/api/yaml/op_compat.yaml" || true`
 if [ "${HAS_MODIFIED_API_COMPAT_YAML}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
   echo_line="You must be approved by chenwhql or zyfncg or heavyrain-lzy for paddle/phi/api/yaml/op_compat.yaml changes, which manages the extra params of Op and name mapping between Yaml and OpMaker. In order to ensure compatibility of framework, this file isn't allowed to be modified at will!\n"
diff --git a/tools/cinn/build.sh b/tools/cinn/build.sh
index e2c1d0d0025d26..a32ae972e340b0 100755
--- a/tools/cinn/build.sh
+++ b/tools/cinn/build.sh
@@ -24,7 +24,10 @@
 cinn_whl_path=python/dist/cinn-0.0.0-py3-none-any.whl

 #export LLVM11_DIR=${workspace}/THIRDS/usr

-JOBS=8
+if [[ -z "${JOBS}" ]]; then
+  JOBS=`nproc`
+fi
+
 cuda_config=OFF
 cudnn_config=OFF
@@ -118,19 +121,6 @@ function prepare_model {
   proxy_on
   mkdir -p $build_dir/paddle
-  cd $build_dir/paddle
-  if [[ ! -f "libexternal_kernels.so.tgz" ]]; then
-    wget https://github.com/T8T9/files/raw/main/libexternal_kernels.so.tgz
-  fi
-  tar -zxvf libexternal_kernels.so.tgz
-  if [[ ! -f "paddle_1.8_fc_model.tgz" ]]; then
-    wget https://github.com/T8T9/files/raw/main/paddle_1.8_fc_model.tgz
-  fi
-  tar -zxvf paddle_1.8_fc_model.tgz
-  if [[ ! -f "mkldnn.tgz" ]]; then
-    wget https://github.com/T8T9/files/raw/main/mkldnn.tgz
-  fi
-  tar -zxvf mkldnn.tgz
   cd $build_dir/third_party
   python${py_version} $workspace/test/cinn/fake_model/naive_mul.py
   python${py_version} $workspace/test/cinn/fake_model/naive_multi_fc.py