cuBLAS: non-contiguous tensor support #1215

Merged · 7 commits · Apr 28, 2023
Changes from all commits
28 changes: 28 additions & 0 deletions ggml-cuda.cu
@@ -339,3 +339,31 @@ void ggml_init_cublas(void) {
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
}
}

+cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
+    const uint64_t ne0 = src->ne[0];
+    const uint64_t ne1 = src->ne[1];
+    const uint64_t nb0 = src->nb[0];
+    const uint64_t nb1 = src->nb[1];
+    const uint64_t nb2 = src->nb[2];
+    const uint64_t nb3 = src->nb[3];
+    const enum ggml_type type = src->type;
+    const size_t ts = ggml_type_size(type);
+    const size_t bs = ggml_blck_size(type);
+
+    const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
+    if (nb0 == ts && nb1 == ts*ne0/bs) {
+        return cudaMemcpyAsync(dst, x, ne1*nb1, cudaMemcpyHostToDevice, stream);
+    } else if (nb0 == ts) {
+        return cudaMemcpy2DAsync(dst, ts*ne0/bs, x, nb1, ts*ne0/bs, ne1, cudaMemcpyHostToDevice, stream);
+    } else {
+        for (uint64_t i1 = 0; i1 < ne1; i1++) {
+            const void * rx = (const void *) ((const char *) x + i1*nb1);
+            void * rd = (void *) ((char *) dst + i1*ts*ne0/bs);
+            // pretend the row is a matrix with cols=1
+            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream);
+            if (r != cudaSuccess) return r;
+        }
+        return cudaSuccess;
+    }
+}
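A note on the three branches above: the first covers a fully contiguous slice (one bulk copy), the second covers rows that are internally contiguous but strided apart (one pitched copy), and the last falls back to a per-row pitched copy for arbitrary element strides. For readers unfamiliar with cudaMemcpy2DAsync, a minimal standalone sketch of the middle case; the function name, buffer names, and f32 element type are illustrative, not part of the patch:

```c
#include <cuda_runtime.h>

// Hypothetical example: pack a host matrix whose rows are contiguous but
// separated by src_pitch_bytes > row payload into a dense device buffer,
// mirroring the cudaMemcpy2DAsync branch of ggml_cuda_h2d_tensor_2d.
static cudaError_t pack_strided_rows(float * d_dst, const float * h_src,
                                     size_t rows, size_t cols,
                                     size_t src_pitch_bytes,
                                     cudaStream_t stream) {
    const size_t row_bytes = cols * sizeof(float); // payload per row
    // dpitch == row_bytes packs the destination rows densely
    return cudaMemcpy2DAsync(d_dst, row_bytes,
                             h_src, src_pitch_bytes,
                             row_bytes, rows,
                             cudaMemcpyHostToDevice, stream);
}
```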
3 changes: 3 additions & 0 deletions ggml-cuda.h
@@ -1,5 +1,6 @@
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include "ggml.h"

#ifdef __cplusplus
extern "C" {
@@ -39,6 +40,8 @@ void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);

+cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream);

#ifdef __cplusplus
}
#endif
24 changes: 13 additions & 11 deletions ggml.c
@@ -8124,8 +8124,12 @@ static bool ggml_compute_forward_mul_mat_use_blas(
const int64_t ne1 = dst->ne[1];

// TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
+    if (
+#if !defined(GGML_USE_CUBLAS)
+        ggml_is_contiguous(src0) &&
+        ggml_is_contiguous(src1) &&
+#endif
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {

/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
return true;
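With cuBLAS enabled, the size thresholds alone now decide whether the BLAS branch is taken; contiguity is handled later by the upload helper. A hypothetical graph snippet that this relaxation allows onto the cuBLAS path (the shapes and the helper name build_example are illustrative only):

```c
#include "ggml.h"

// Illustrative only: a mat-mul whose second operand is a transposed, hence
// non-contiguous, view. With GGML_USE_CUBLAS and the change above, the
// non-contiguity no longer disqualifies it from the BLAS branch.
static struct ggml_tensor * build_example(struct ggml_context * ctx) {
    struct ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 128);
    struct ggml_tensor * b  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 64);
    struct ggml_tensor * bt = ggml_transpose(ctx, b); // view, not contiguous
    return ggml_mul_mat(ctx, a, bt);                  // 128x128 result
}
```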
@@ -8235,15 +8239,16 @@ static void ggml_compute_forward_mul_mat_f32(

for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
+#if !defined(GGML_USE_CUBLAS)
            const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
            const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
+#endif
            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

#if defined(GGML_USE_CUBLAS)
            // copy data to device
-           CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
-           CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
+           CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
+           CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));

// compute
CUBLAS_CHECK(
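The CUBLAS_CHECK above is cut off by the diff viewer. For orientation, a sketch of the shape of the sgemm call it wraps; the wrapper function and its parameter layout are assumptions inferred from the packed buffers, not code from this diff:

```c
#include <cublas_v2.h>

// Sketch only: because ggml_cuda_h2d_tensor_2d packs each slice densely,
// the leading dimensions are the plain row lengths (ne00, ne10, ne01),
// whatever the host tensor's original strides were.
static cublasStatus_t gemm_packed(cublasHandle_t handle,
                                  int ne00, int ne01, int ne10, int ne11,
                                  const float * d_X, const float * d_Y,
                                  float * d_D) {
    const float alpha = 1.0f;
    const float beta  = 0.0f;
    return cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                       ne01, ne11, ne10,
                       &alpha, d_X, ne00,
                               d_Y, ne10,
                       &beta,  d_D, ne01);
}
```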
@@ -8463,13 +8468,12 @@ static void ggml_compute_forward_mul_mat_f16_f32(
#endif

#if defined(GGML_USE_CUBLAS)
-        const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03);
const ggml_fp16_t * y = (ggml_fp16_t *) wdata;

float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

// copy data to device
-        CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
+        CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));

// compute
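Note why only the d_X upload switches to the new helper in this hunk: src1 is first converted from f32 to f16 into wdata, and that conversion already writes a densely packed buffer regardless of src1's strides, so the plain cudaMemcpyAsync of d_Y remains correct. A sketch of that conversion, assumed from the surrounding ggml.c rather than shown in this diff:

```c
#include "ggml.h"

// Sketch (assumption, not part of this diff): convert one (i03, i02) slice
// of an f32 tensor to f16, gathering through its byte strides and writing
// sequentially -- the output is contiguous by construction.
static void slice_f32_to_f16(ggml_fp16_t * wdata, const struct ggml_tensor * src1,
                             int64_t i03, int64_t i02) {
    const int64_t ne10 = src1->ne[0], ne11 = src1->ne[1];
    const size_t  nb10 = src1->nb[0], nb11 = src1->nb[1];
    const size_t  nb12 = src1->nb[2], nb13 = src1->nb[3];
    size_t id = 0;
    for (int64_t i01 = 0; i01 < ne11; ++i01) {
        for (int64_t i00 = 0; i00 < ne10; ++i00) {
            const float v = *(const float *) ((const char *) src1->data +
                i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10);
            wdata[id++] = ggml_fp32_to_fp16(v);
        }
    }
}
```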
@@ -8736,9 +8740,7 @@ static void ggml_compute_forward_mul_mat_q_f32(

#if defined(GGML_USE_CUBLAS)
// copy and dequantize on device
-        CUDA_CHECK(
-            cudaMemcpyAsync(d_Q, (char *) src0->data + i03*nb03 + i02*nb02,
-                GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], cudaMemcpyHostToDevice, g_cudaStream));
+        CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream));

dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
CUDA_CHECK(cudaGetLastError());
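One detail worth spelling out for the quantized case: for a contiguous src0 the helper copies ne1*nb1 bytes, where a row of ne0 elements occupies ggml_type_size(type)*ne0/ggml_blck_size(type) bytes, so the weights cross the bus in compressed form and are only expanded on the device. A small worked example; the 20-byte/32-element block shape is illustrative of a q4-style format, not a constant taken from ggml:

```c
#include <assert.h>
#include <stddef.h>

// Illustrative arithmetic for the per-row byte count ts*ne0/bs used above.
static size_t quantized_row_bytes(size_t ts, size_t bs, size_t ne0) {
    return ts * ne0 / bs;
}

int main(void) {
    // A hypothetical block format storing 32 elements in 20 bytes:
    // a 4096-element row costs 20*4096/32 = 2560 bytes on the bus,
    // versus 4096*4 = 16384 bytes if it were uploaded as f32.
    assert(quantized_row_bytes(20, 32, 4096) == 2560);
    return 0;
}
```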
@@ -8758,7 +8760,7 @@ static void ggml_compute_forward_mul_mat_q_f32(

#if defined(GGML_USE_CUBLAS)
// copy data to device
-        CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
+        CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));

// compute
CUBLAS_CHECK(