Merge branch 'develop' into broadcast_div
Zjq9409 committed Jan 4, 2022
2 parents 8259c34 + 26b845e commit d2f3776
Showing 50 changed files with 2,754 additions and 140 deletions.
2 changes: 1 addition & 1 deletion paddle/fluid/framework/fleet/CMakeLists.txt
@@ -4,7 +4,7 @@ if(WITH_PSLIB)
   else()
     set(BRPC_DEPS brpc)
   endif(WITH_PSLIB_BRPC)
-  cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope ${BRPC_DEPS} pslib)
+  cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto proto_desc op_registry variable_helper scope ${BRPC_DEPS} pslib)
 else()
   cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_PSLIB)
36 changes: 18 additions & 18 deletions paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
@@ -31,6 +31,22 @@ namespace ir {
 class Node;
 
 MulGRUFusePass::MulGRUFusePass() {
+  AddOpCompat(OpCompat("mul"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("x_num_col_dims")
+      .IsNumEQ(1)
+      .End()
+      .AddAttr("y_num_col_dims")
+      .IsNumEQ(1)
+      .End();
   AddOpCompat(OpCompat("gru"))
       .AddInput("Input")
       .IsTensor()
@@ -58,10 +74,10 @@ MulGRUFusePass::MulGRUFusePass() {
       .IsTensor()
       .End()
       .AddAttr("activation")
-      .IsStringIn({"sigmoid", "tanh", "relu", "identity"})
+      .IsStringIn({"sigmoid", "tanh"})
       .End()
       .AddAttr("gate_activation")
-      .IsStringIn({"sigmoid", "tanh", "relu", "identity"})
+      .IsStringIn({"sigmoid", "tanh"})
       .End()
       .AddAttr("is_reverse")
       .IsType<bool>()
@@ -70,22 +86,6 @@ MulGRUFusePass::MulGRUFusePass() {
       .IsType<bool>()
       .IsOptional()
       .End();
-  AddOpCompat(OpCompat("mul"))
-      .AddInput("X")
-      .IsTensor()
-      .End()
-      .AddInput("Y")
-      .IsTensor()
-      .End()
-      .AddOutput("Out")
-      .IsTensor()
-      .End()
-      .AddAttr("x_num_col_dims")
-      .IsNumEQ(1)
-      .End()
-      .AddAttr("y_num_col_dims")
-      .IsNumEQ(1)
-      .End();
 }
 
 FCGRUFusePass::FCGRUFusePass() {
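Note on this hunk: the "mul" compatibility block moves ahead of the "gru" block, and the allowed activations are narrowed from {"sigmoid", "tanh", "relu", "identity"} to {"sigmoid", "tanh"}. These AddOpCompat rules act as guards: the fusion runs only when every matched op satisfies them. A minimal sketch of how a fuse pass typically consults the rules, assuming Paddle's OpCompatSensiblePass interface (handler body illustrative, not the exact pass code):

// Inside the pattern handler of a fuse pass (sketch):
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                   Graph *g) {
  // IsCompat() replays the AddOpCompat rules against the matched ops;
  // after this commit a gru with activation "relu" fails the check.
  if (!IsCompat(subgraph, g)) {
    LOG(WARNING) << "fc_gru_fuse_pass: op compat check failed, skip fusion.";
    return;
  }
  // ... otherwise create the fused op and rewire the graph ...
};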
59 changes: 59 additions & 0 deletions paddle/fluid/framework/tensor_util.cc
@@ -357,6 +357,36 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
         "Copying from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
+#ifdef PADDLE_WITH_MLU
+  else if (platform::is_mlu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    auto src_mlu_place = BOOST_GET_CONST(platform::MLUPlace, src_place);
+    auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place);
+    auto stream =
+        reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
+    memory::Copy(dst_cpu_place, dst_ptr, src_mlu_place, src_ptr, size, stream);
+  }
+  else if (platform::is_cpu_place(src_place) &&  // NOLINT
+           platform::is_mlu_place(dst_place)) {
+    auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place);
+    auto dst_mlu_place = BOOST_GET_CONST(platform::MLUPlace, dst_place);
+    auto stream =
+        reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
+    memory::Copy(dst_mlu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
+  }
+  else if (platform::is_mlu_place(src_place) &&  // NOLINT
+           platform::is_mlu_place(dst_place)) {
+    auto src_mlu_place = BOOST_GET_CONST(platform::MLUPlace, src_place);
+    auto dst_mlu_place = BOOST_GET_CONST(platform::MLUPlace, dst_place);
+    auto stream =
+        reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
+    memory::Copy(dst_mlu_place, dst_ptr, src_mlu_place, src_ptr, size, stream);
+  }
+  else {  // NOLINT
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Copying from %s to %s is not supported.", src_place, dst_place));
+  }
+#endif
 }
 
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
@@ -526,6 +556,35 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
         "Copy from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
+#ifdef PADDLE_WITH_MLU
+  else if (platform::is_mlu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::MLUPlace, src_place), src_ptr, size,
+                 nullptr);
+  }
+  else if (platform::is_cpu_place(src_place) &&  // NOLINT
+           platform::is_mlu_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size,
+                 nullptr);
+  }
+  else if (platform::is_mlu_place(src_place) &&  // NOLINT
+           platform::is_mlu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {
+      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+              << dst_place;
+      return;
+    }
+    memory::Copy(BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::MLUPlace, src_place), src_ptr, size,
+                 nullptr);
+  }
+  else {  // NOLINT
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Copy from %s to %s is not supported.", src_place, dst_place));
+  }
+#endif
 }
 
 template <typename Predicate, typename DevCtx>
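For context, these new branches sit behind the public copy entry points, so user code never touches MLU streams directly. A minimal usage sketch, assuming a PADDLE_WITH_MLU build (shapes and values illustrative):

// Round-trip a tensor through MLU device 0 (sketch).
framework::Tensor cpu_in, on_mlu, cpu_out;
cpu_in.mutable_data<float>(framework::make_ddim({2, 3}), platform::CPUPlace());

// Asynchronous copy: enqueued on the MLUDeviceContext stream, as in the
// first hunk above.
framework::TensorCopy(cpu_in, platform::MLUPlace(0), &on_mlu);

// Blocking copy: the second hunk passes a nullptr stream, so this returns
// only after the data has landed on the host.
framework::TensorCopySync(on_mlu, platform::CPUPlace(), &cpu_out);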
10 changes: 10 additions & 0 deletions paddle/fluid/imperative/prepared_operator.cc
@@ -217,6 +217,16 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
     expected_kernel_key.place_ = platform::CPUPlace();
     kernel_iter = kernels.find(expected_kernel_key);
   }
 #endif
+#ifdef PADDLE_WITH_MLU
+  if (kernel_iter == kernels.end() &&
+      is_mlu_place(expected_kernel_key.place_)) {
+    VLOG(3) << "missing MLU kernel: " << op.Type()
+            << ", expected_kernel_key:" << expected_kernel_key
+            << ", fallbacking to CPU one!";
+    expected_kernel_key.place_ = platform::CPUPlace();
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
+#endif
   // TODO(jiabin): Add operator.cc's line 1000 part back when we need that
   // case
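This mirrors the XPU and NPU fallback blocks directly above it in PrepareImpl: when no MLU kernel is registered for an op, the expected kernel key is rewritten to CPUPlace and the lookup is retried, so the op still runs on the host instead of erroring out. A self-contained sketch of the pattern with deliberately simplified types (not Paddle's real OpKernelMap):

#include <map>
#include <string>
#include <utility>

enum class Place { kCPU, kMLU };
using KernelKey = std::pair<std::string, Place>;  // (op type, place)
using KernelFn = void (*)();

// Look up a kernel; if the device has none, retry the CPU registration.
KernelFn FindKernelWithCpuFallback(const std::map<KernelKey, KernelFn> &kernels,
                                   KernelKey key) {
  auto it = kernels.find(key);
  if (it == kernels.end() && key.second == Place::kMLU) {
    key.second = Place::kCPU;  // fall back to CPU, as the diff does
    it = kernels.find(key);
  }
  return it == kernels.end() ? nullptr : it->second;
}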
5 changes: 4 additions & 1 deletion paddle/fluid/inference/api/analysis_predictor.cc
@@ -409,7 +409,10 @@ void AnalysisPredictor::MkldnnPreSet(
 void AnalysisPredictor::MkldnnPostReset() {
 #ifdef PADDLE_WITH_MKLDNN
   // In cache clearing mode.
-  if (config_.mkldnn_cache_capacity_ > 0) {
+  if (config_.mkldnn_cache_capacity_ > 0 &&
+      static_cast<platform::MKLDNNDeviceContext *>(
+          (&platform::DeviceContextPool::Instance())->Get(platform::CPUPlace()))
+              ->GetCachedObjectsNumber() > 0) {
     if (VLOG_IS_ON(2)) {
       auto shape_blob_size = static_cast<platform::MKLDNNDeviceContext *>(
           (&platform::DeviceContextPool::Instance())
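The added condition makes MkldnnPostReset a no-op when the oneDNN blob cache is already empty, so cache-clearing mode no longer pays for clearing (or logging) nothing. A reduced sketch of the guard, with an illustrative stand-in for the device context's cache bookkeeping:

#include <cstddef>
#include <string>
#include <unordered_map>

// Illustrative stand-in for MKLDNNDeviceContext's cached-blob state.
struct BlobCache {
  std::unordered_map<std::string, void *> blobs;
  std::size_t GetCachedObjectsNumber() const { return blobs.size(); }
  void ResetBlobMap() { blobs.clear(); }
};

void PostReset(BlobCache *cache, int cache_capacity) {
  // Clear only in cache-clearing mode AND when something is cached,
  // matching the two-part condition added in the diff.
  if (cache_capacity > 0 && cache->GetCachedObjectsNumber() > 0) {
    cache->ResetBlobMap();
  }
}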
2 changes: 1 addition & 1 deletion paddle/fluid/memory/allocation/allocator_facade.cc
@@ -690,7 +690,7 @@ class AllocatorFacadePrivate {
 #ifdef PADDLE_WITH_MLU
     int device_count = platform::GetMLUDeviceCount();
     for (int i = 0; i < device_count; ++i) {
-      platform::XPUPlace p(i);
+      platform::MLUPlace p(i);
       system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
     }
 #endif
24 changes: 12 additions & 12 deletions paddle/fluid/operators/elementwise/elementwise_div_op.cu
@@ -28,30 +28,30 @@ ElementwiseDivGrad(const framework::ExecutionContext& ctx,
                    const framework::Tensor* out, const framework::Tensor* dout,
                    framework::Tensor* dx, framework::Tensor* dy) {
   int axis = ctx.Attr<int>("axis");
-  const auto& dev_ctx =
-      ctx.template device_context<platform::CUDADeviceContext>();
+  const auto& dev_ctx = ctx.template device_context<DeviceContext>();
+  const auto place = ctx.GetPlace();
   if (dx != nullptr && dy != nullptr) {
-    dx->mutable_data<T>(platform::CUDAPlace());
+    dx->mutable_data<T>(place);
     if (dx->IsSharedBufferWith(*dout)) {
       dx->clear();
-      dx->mutable_data<T>(x->dims(), platform::CUDAPlace());
+      dx->mutable_data<T>(x->dims(), place);
     }
     std::vector<const framework::Tensor*> ins = {dout, out, y};
-    GetGradXAndYOut<ElementwiseType::kTernary, T>(dev_ctx, axis, ins, dout, dx,
-                                                  dy, DivGradXYFunctor<T, T>());
+    GetGradXAndYOut<ElementwiseType::kTernary, T>(
+        dev_ctx, place, axis, ins, dout, dx, dy, DivGradXYFunctor<T, T>());
   } else if (dx != nullptr && dy == nullptr) {
-    dx->mutable_data<T>(platform::CUDAPlace());
+    dx->mutable_data<T>(place);
     if (dx->IsSharedBufferWith(*dout)) {
       dx->clear();
-      dx->mutable_data<T>(x->dims(), platform::CUDAPlace());
+      dx->mutable_data<T>(x->dims(), place);
     }
     std::vector<const framework::Tensor*> ins = {dout, y};
-    GetGradXOrYOut<ElementwiseType::kBinary, T>(dev_ctx, axis, ins, dout, dx,
-                                                DivGradXFunctor<T>());
+    GetGradXOrYOut<ElementwiseType::kBinary, T>(dev_ctx, place, axis, ins, dout,
+                                                dx, DivGradXFunctor<T>());
   } else if (dy != nullptr && dx == nullptr) {
     std::vector<const framework::Tensor*> ins = {dout, out, y};
-    GetGradXOrYOut<ElementwiseType::kTernary, T>(dev_ctx, axis, ins, dout, dy,
-                                                 DivGradYFunctor<T>());
+    GetGradXOrYOut<ElementwiseType::kTernary, T>(
+        dev_ctx, place, axis, ins, dout, dy, DivGradYFunctor<T>());
   }
 }
 
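The recurring edit in this file replaces hard-coded platform::CUDAPlace() with the place the op was actually dispatched to: a default-constructed CUDAPlace() refers to device 0, so the old code could allocate gradient buffers on the wrong card in a multi-GPU run. A minimal sketch of the corrected allocation pattern (function name illustrative):

// Allocate an op's output on the device the op runs on (sketch).
template <typename T>
void AllocOnOpDevice(const framework::ExecutionContext &ctx,
                     framework::Tensor *t) {
  const auto place = ctx.GetPlace();  // carries the real device id
  t->mutable_data<T>(place);          // not platform::CUDAPlace(), i.e. GPU 0
}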
20 changes: 11 additions & 9 deletions paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -2631,25 +2631,26 @@ void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis,
 }
 
 template <ElementwiseType ET, typename T, typename Functor>
-void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, int axis,
+void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx,
+                     const platform::Place &place, int axis,
                      std::vector<const framework::Tensor *> ins,
                      const framework::Tensor *dout, framework::Tensor *dx,
                      framework::Tensor *dy, Functor func) {
   framework::Tensor tmp_dx;
   framework::Tensor tmp_dy;
-  dy->mutable_data<T>(platform::CUDAPlace());
+  dy->mutable_data<T>(place);
   std::vector<framework::Tensor *> outs;
   if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) {
     outs = {dx, dy};
   } else if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) {
-    tmp_dx.mutable_data<T>(dout->dims(), platform::CUDAPlace());
+    tmp_dx.mutable_data<T>(dout->dims(), place);
     outs = {&tmp_dx, dy};
   } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) {
-    tmp_dy.mutable_data<T>(dout->dims(), platform::CUDAPlace());
+    tmp_dy.mutable_data<T>(dout->dims(), place);
     outs = {dx, &tmp_dy};
   } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) {
-    tmp_dy.mutable_data<T>(dout->dims(), platform::CUDAPlace());
-    tmp_dx.mutable_data<T>(dout->dims(), platform::CUDAPlace());
+    tmp_dy.mutable_data<T>(dout->dims(), place);
+    tmp_dx.mutable_data<T>(dout->dims(), place);
     outs = {&tmp_dx, &tmp_dy};
   }
 
@@ -2667,16 +2668,17 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, int axis,
 }
 
 template <ElementwiseType ET, typename T, typename Functor>
-void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, int axis,
+void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx,
+                    const platform::Place &place, int axis,
                     std::vector<const framework::Tensor *> ins,
                     const framework::Tensor *dout, framework::Tensor *dxy,
                     Functor func) {
   framework::Tensor tmp_dxy;
-  dxy->mutable_data<T>(platform::CUDAPlace());
+  dxy->mutable_data<T>(place);
 
   std::vector<framework::Tensor *> outs;
   if (dxy->dims() != dout->dims()) {
-    tmp_dxy.mutable_data<T>(dout->dims(), platform::CUDAPlace());
+    tmp_dxy.mutable_data<T>(dout->dims(), place);
     outs = {&tmp_dxy};
   } else {
     outs = {dxy};
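With the extra platform::Place parameter, these helpers no longer allocate their temporaries on a default-constructed CUDAPlace(); callers thread the execution place through instead. A call-site sketch matching the new signatures (tensor variables as in the elementwise_div_op.cu hunk above):

// New-style call: the place travels alongside the device context.
const auto &dev_ctx =
    ctx.template device_context<platform::CUDADeviceContext>();
const auto place = ctx.GetPlace();
std::vector<const framework::Tensor *> ins = {dout, out, y};
GetGradXAndYOut<ElementwiseType::kTernary, T>(
    dev_ctx, place, axis, ins, dout, dx, dy, DivGradXYFunctor<T, T>());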
(42 more changed files not shown.)