use elementwise to optimize gelu forward implementation on GPU #38188

Merged
merged 3 commits on Dec 21, 2021
Changes from 2 commits
54 changes: 54 additions & 0 deletions paddle/fluid/operators/gelu_op.cu
@@ -12,9 +12,63 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/gelu_op.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace operators {

template <typename T>
struct GeluWithApproximateFunctor {
using MT = typename details::MPTypeTrait<T>::Type;
inline HOSTDEVICE T operator()(T x) {
// this function is tanh approximation of gelu
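// gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))), where sqrt(2/pi) ~= 0.79788456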
MT mx = static_cast<MT>(x);

Contributor: The naming here can follow the naming convention used in activation_op.cu.
Contributor Author: Done.

MT out = mx * static_cast<MT>(0.5) *
(static_cast<MT>(1.0) +
tanh(static_cast<MT>(0.79788456) * mx *
(static_cast<MT>(1) + static_cast<MT>(0.044715) * mx * mx)));
return static_cast<T>(out);
}
};

template <typename T>
struct GeluNoApproximateFunctor {

Contributor: Change the name so it matches the functor above; use "Without".
Contributor Author: Done.

using MT = typename details::MPTypeTrait<T>::Type;
inline HOSTDEVICE T operator()(T x) {
// actual gelu with approximation=false
// x * 0.5 * (1.0 + erf(x * 0.70710678))
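// i.e. x * Phi(x), with the normal CDF Phi(x) = 0.5 * (1 + erf(x / sqrt(2))) and 1/sqrt(2) ~= 0.70710678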
MT mx = static_cast<MT>(x);
MT temp = erf(mx * static_cast<MT>(M_SQRT1_2));
MT out = mx * static_cast<MT>(0.5) * (static_cast<MT>(1) + temp);
return static_cast<T>(out);
}
};

template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
default_gelu_fw(const framework::ExecutionContext& ctx,

Contributor: There is no need to write a new function; directly specializing a CUDA version of GeluKernel would be enough.
Contributor Author: Done

const framework::Tensor* in, const bool approximate,
framework::Tensor* out) {
std::vector<const framework::Tensor*> ins = {in};
std::vector<framework::Tensor*> outs = {out};
const auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
if (approximate) {
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor<T>());
} else {
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, ins, &outs, 0, GeluNoApproximateFunctor<T>());
}
}

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
gelu, ops::GeluKernel<paddle::platform::CUDADeviceContext, float>,
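For context on what the elementwise launch above buys: LaunchElementwiseCudaKernel fuses the functor into a single pass over the tensor. The sketch below is a hypothetical, simplified stand-in (ElementwiseApply and GeluTanhApprox are made-up names, and the real launcher additionally handles vectorized memory access, broadcasting, and type dispatch); it only illustrates the grid-stride kernel such a launch conceptually reduces to.

#include <cuda_runtime.h>
#include <math.h>
#include <stdint.h>

// Hypothetical float-only stand-in for GeluWithApproximateFunctor.
struct GeluTanhApprox {
  __device__ float operator()(float x) const {
    return 0.5f * x *
           (1.0f + tanhf(0.79788456f * x * (1.0f + 0.044715f * x * x)));
  }
};

// Grid-stride loop applying a functor element by element; conceptually what
// an elementwise launcher generates, minus vectorization and tuning.
template <typename T, typename Functor>
__global__ void ElementwiseApply(const T* in, T* out, int64_t n, Functor f) {
  int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
  for (; i < n; i += stride) {
    out[i] = f(in[i]);
  }
}

// Example launch (d_in and d_out are device pointers holding n floats):
//   ElementwiseApply<<<256, 256>>>(d_in, d_out, n, GeluTanhApprox{});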
33 changes: 26 additions & 7 deletions paddle/fluid/operators/gelu_op.h
@@ -184,6 +184,31 @@ struct GeluGradFunctor {
}
};

template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
default_gelu_fw(const framework::ExecutionContext& ctx,
const framework::Tensor* in, const bool approximate,
framework::Tensor* out) {
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);

auto& place =
*ctx.template device_context<platform::CPUDeviceContext>().eigen_device();

GeluFunctor<T> functor;
functor(place, eigen_in, eigen_out, approximate);
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
default_gelu_fw(const framework::ExecutionContext& ctx,
const framework::Tensor* in, const bool approximate,
framework::Tensor* out);
#endif

template <typename DeviceContext, typename T>
class GeluKernel : public framework::OpKernel<T> {
public:
@@ -193,13 +218,7 @@ class GeluKernel : public framework::OpKernel<T> {
auto approximate = context.Attr<bool>("approximate");
out->mutable_data<T>(in->place());

auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();

GeluFunctor<T> functor;
functor(place, eigen_in, eigen_out, approximate);
default_gelu_fw<DeviceContext, T>(context, in, approximate, out);
}
};

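A note on the dispatch mechanism used above: the two default_gelu_fw overloads are selected at compile time with std::enable_if on the DeviceContext parameter, so GeluKernel takes the Eigen path on CPU and the elementwise CUDA path on GPU without any runtime branch. Below is a minimal standalone sketch of the same pattern, using hypothetical CpuCtx/GpuCtx tag types rather than Paddle's real device contexts.

#include <iostream>
#include <type_traits>

struct CpuCtx {};  // stand-in for platform::CPUDeviceContext
struct GpuCtx {};  // stand-in for platform::CUDADeviceContext

// Participates in overload resolution only when Ctx is CpuCtx.
template <typename Ctx>
typename std::enable_if<std::is_same<Ctx, CpuCtx>::value>::type gelu_fw() {
  std::cout << "CPU path: Eigen GeluFunctor\n";
}

// Participates in overload resolution only when Ctx is GpuCtx.
template <typename Ctx>
typename std::enable_if<std::is_same<Ctx, GpuCtx>::value>::type gelu_fw() {
  std::cout << "GPU path: elementwise CUDA launch\n";
}

int main() {
  gelu_fw<CpuCtx>();  // resolves to the first overload
  gelu_fw<GpuCtx>();  // resolves to the second overload
}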