use elementwise to optimize gelu forward implementation on GPU #38188

Merged
merged 3 commits on Dec 21, 2021
Changes from 2 commits
54 changes: 54 additions & 0 deletions paddle/fluid/operators/gelu_op.cu
@@ -12,9 +12,63 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/gelu_op.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace operators {

template <typename T>
struct GeluWithApproximateFunctor {
using MT = typename details::MPTypeTrait<T>::Type;
inline HOSTDEVICE T operator()(T x) {
// this function is tanh approximation of gelu
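// gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))), where sqrt(2/pi) ~= 0.79788456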
MT mx = static_cast<MT>(x);

Contributor: The naming here can follow the naming convention used in activation_op.cu.
Contributor Author: Done.

MT out = mx * static_cast<MT>(0.5) *
(static_cast<MT>(1.0) +
tanh(static_cast<MT>(0.79788456) * mx *
(static_cast<MT>(1) + static_cast<MT>(0.044715) * mx * mx)));
return static_cast<T>(out);
}
};

template <typename T>
struct GeluNoApproximateFunctor {

Contributor: Change the name so it matches the functor above; use "Without".
Contributor Author: Done.

using MT = typename details::MPTypeTrait<T>::Type;
inline HOSTDEVICE T operator()(T x) {
// actual gelu with approximation=false
// x * 0.5 * (1.0 + erf(x * 0.70710678))
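// i.e. x * Phi(x), with the normal CDF Phi(x) = 0.5 * (1 + erf(x / sqrt(2))) and 1/sqrt(2) ~= 0.70710678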
MT mx = static_cast<MT>(x);
MT temp = erf(mx * static_cast<MT>(M_SQRT1_2));
MT out = mx * static_cast<MT>(0.5) * (static_cast<MT>(1) + temp);
return static_cast<T>(out);
}
};

template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
default_gelu_fw(const framework::ExecutionContext& ctx,

Contributor: There is no need to write a new function; directly specializing a CUDA version of GeluKernel would be enough.
Contributor Author: Done

const framework::Tensor* in, const bool approximate,
framework::Tensor* out) {
std::vector<const framework::Tensor*> ins = {in};
std::vector<framework::Tensor*> outs = {out};
const auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
if (approximate) {
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor<T>());
} else {
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, ins, &outs, 0, GeluNoApproximateFunctor<T>());
}
}

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
gelu, ops::GeluKernel<paddle::platform::CUDADeviceContext, float>,
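For context on what the elementwise launch above buys: LaunchElementwiseCudaKernel fuses the functor into a single pass over the tensor. The sketch below is a hypothetical, simplified stand-in (ElementwiseApply and GeluTanhApprox are made-up names, and the real launcher additionally handles vectorized memory access, broadcasting, and type dispatch); it only illustrates the grid-stride kernel such a launch conceptually reduces to.

#include <cuda_runtime.h>
#include <math.h>
#include <stdint.h>

// Hypothetical float-only stand-in for GeluWithApproximateFunctor.
struct GeluTanhApprox {
  __device__ float operator()(float x) const {
    return 0.5f * x *
           (1.0f + tanhf(0.79788456f * x * (1.0f + 0.044715f * x * x)));
  }
};

// Grid-stride loop applying a functor element by element; conceptually what
// an elementwise launcher generates, minus vectorization and tuning.
template <typename T, typename Functor>
__global__ void ElementwiseApply(const T* in, T* out, int64_t n, Functor f) {
  int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
  for (; i < n; i += stride) {
    out[i] = f(in[i]);
  }
}

// Example launch (d_in and d_out are device pointers holding n floats):
//   ElementwiseApply<<<256, 256>>>(d_in, d_out, n, GeluTanhApprox{});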
33 changes: 26 additions & 7 deletions paddle/fluid/operators/gelu_op.h
@@ -184,6 +184,31 @@ struct GeluGradFunctor {
}
};

template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
default_gelu_fw(const framework::ExecutionContext& ctx,
const framework::Tensor* in, const bool approximate,
framework::Tensor* out) {
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);

auto& place =
*ctx.template device_context<platform::CPUDeviceContext>().eigen_device();

GeluFunctor<T> functor;
functor(place, eigen_in, eigen_out, approximate);
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
default_gelu_fw(const framework::ExecutionContext& ctx,
const framework::Tensor* in, const bool approximate,
framework::Tensor* out);
#endif

template <typename DeviceContext, typename T>
class GeluKernel : public framework::OpKernel<T> {
public:
@@ -193,13 +218,7 @@ class GeluKernel : public framework::OpKernel<T> {
auto approximate = context.Attr<bool>("approximate");
out->mutable_data<T>(in->place());

auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();

GeluFunctor<T> functor;
functor(place, eigen_in, eigen_out, approximate);
default_gelu_fw<DeviceContext, T>(context, in, approximate, out);
}
};

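A note on the dispatch mechanism used above: the two default_gelu_fw overloads are selected at compile time with std::enable_if on the DeviceContext parameter, so GeluKernel takes the Eigen path on CPU and the elementwise CUDA path on GPU without any runtime branch. Below is a minimal standalone sketch of the same pattern, using hypothetical CpuCtx/GpuCtx tag types rather than Paddle's real device contexts.

#include <iostream>
#include <type_traits>

struct CpuCtx {};  // stand-in for platform::CPUDeviceContext
struct GpuCtx {};  // stand-in for platform::CUDADeviceContext

// Participates in overload resolution only when Ctx is CpuCtx.
template <typename Ctx>
typename std::enable_if<std::is_same<Ctx, CpuCtx>::value>::type gelu_fw() {
  std::cout << "CPU path: Eigen GeluFunctor\n";
}

// Participates in overload resolution only when Ctx is GpuCtx.
template <typename Ctx>
typename std::enable_if<std::is_same<Ctx, GpuCtx>::value>::type gelu_fw() {
  std::cout << "GPU path: elementwise CUDA launch\n";
}

int main() {
  gelu_fw<CpuCtx>();  // resolves to the first overload
  gelu_fw<GpuCtx>();  // resolves to the second overload
}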