Skip to content
This repository has been archived by the owner on Jan 24, 2024. It is now read-only.

cuBLAS GEMM: add fp64 (double-precision) support #1421

Merged
merged 2 commits on
May 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions cinn/runtime/cuda/cublas_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,23 @@ inline cublasStatus_t cublasGemm(cudaDataType_t dtype,
reinterpret_cast<const float *>(&beta),
reinterpret_cast<float *>(C),
ldc);
} else if (dtype == CUDA_R_64F) {
const double alpha_fp64 = static_cast<double>(alpha);
const double beta_fp64 = static_cast<double>(beta);
return cublasDgemm(handle,
transa,
transb,
m,
n,
k,
&alpha_fp64,
reinterpret_cast<const double *>(A),
lda,
reinterpret_cast<const double *>(B),
ldb,
&beta_fp64,
reinterpret_cast<double *>(C),
ldc);
} else if (dtype == CUDA_R_16F) {
common::float16 alpha_fp16{alpha};
common::float16 beta_fp16{beta};
Expand Down Expand Up @@ -135,6 +152,27 @@ inline cublasStatus_t cublasGemmStridedBatched(cudaDataType_t dtype,
ldc,
strideC,
batchCount);
} else if (dtype == CUDA_R_64F) {
const double alpha_fp64 = static_cast<double>(alpha);
const double beta_fp64 = static_cast<double>(beta);
return cublasDgemmStridedBatched(handle,
transa,
transb,
m,
n,
k,
&alpha_fp64,
reinterpret_cast<const double *>(A),
lda,
strideA,
reinterpret_cast<const double *>(B),
ldb,
strideB,
&beta_fp64,
reinterpret_cast<double *>(C),
ldc,
strideC,
batchCount);
} else if (dtype == CUDA_R_16F) {
common::float16 alpha_fp16{alpha};
common::float16 beta_fp16{beta};
Expand Down Expand Up @@ -220,6 +258,24 @@ inline cublasStatus_t cublasGemmBatched(cudaDataType_t dtype,
reinterpret_cast<float **>(C),
ldc,
batchCount);
} else if (dtype == CUDA_R_64F) {
const double alpha_fp64 = static_cast<double>(alpha);
const double beta_fp64 = static_cast<double>(beta);
return cublasDgemmBatched(handle,
transa,
transb,
m,
n,
k,
&alpha_fp64,
reinterpret_cast<double **>(A),
lda,
reinterpret_cast<double **>(B),
ldb,
&beta_fp64,
reinterpret_cast<double **>(C),
ldc,
batchCount);
} else if (dtype == CUDA_R_16F) {
__half alpha_fp16{alpha};
__half beta_fp16{beta};
Expand Down
4 changes: 4 additions & 0 deletions cinn/runtime/cuda/cuda_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@ void cinn_call_cublas(void *v_args,
cuda_dtype = CUDA_R_16F;
} else if (is_float && bytes == sizeof(float)) {
cuda_dtype = CUDA_R_32F;
} else if (is_float && bytes == sizeof(double)) {
cuda_dtype = CUDA_R_64F;
} else if (is_bfloat16) {
cuda_dtype = CUDA_R_16BF;
} else {
Expand Down Expand Up @@ -326,6 +328,8 @@ void cinn_call_batched_cublas(void *v_args,
cuda_dtype = CUDA_R_16F;
} else if (is_float && bytes == sizeof(float)) {
cuda_dtype = CUDA_R_32F;
} else if (is_float && bytes == sizeof(double)) {
cuda_dtype = CUDA_R_64F;
} else if (is_bfloat16) {
cuda_dtype = CUDA_R_16BF;
} else {
Expand Down
Loading