Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix OpenCL kernels for the new formats #1422

Merged
merged 2 commits into from
May 13, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 90 additions & 99 deletions ggml-opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,109 +12,129 @@
#define MULTILINE_QUOTE(...) #__VA_ARGS__
const char * clblast_dequant = MULTILINE_QUOTE(

typedef uchar uint8_t;
typedef int int32_t;
typedef uint uint32_t;

constant uint QK4_0 = 32;
struct block_q4_0
{
float d;
uchar qs[16];
uint8_t qs[QK4_0 / 2];
};

__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);

const float d = blocks[i].d;

const uchar vi = blocks[i].qs[l];

const uint index = i*32 + l*2;
result[index + 0] = ((vi & 0xf) - 8)*d;
result[index + 1] = ((vi >> 4) - 8)*d;
}

constant uint QK4_1 = 32;
struct block_q4_1
{
float d;
float m;
uchar qs[16];
uint8_t qs[QK4_1 / 2];
};

__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);

const float d = blocks[i].d;
const float m = blocks[i].m;

const uchar vi = blocks[i].qs[l];
constant uint QK5_0 = 32;
struct __attribute__ ((packed)) block_q5_0
{
half d;
uint32_t qh;
uint8_t qs[QK5_0 / 2];
};

const uint index = i*32 + l*2;
result[index + 0] = (vi & 0xf) * d + m;
result[index + 1] = (vi >> 4) * d + m;
}
constant uint QK5_1 = 32;
struct block_q5_1
{
half d;
half m;
uint32_t qh;
uint8_t qs[QK5_1 / 2];
};

struct block_q5_0
constant uint QK8_0 = 32;
struct block_q8_0
{
float d;
uint qh;
uchar qs[16];
uint8_t qs[QK8_0];
};

__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);

const float d = blocks[i].d;
__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
constant uint qk = QK4_0;

const uchar vi = blocks[i].qs[l];
const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);

const uint l2 = l * 2;
const float d = x[i].d;

const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
const int x0 = (x[i].qs[j] & 0xf) - 8;
const int x1 = (x[i].qs[j] >> 4) - 8;

const uint index = i*32 + l2;
result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
}

struct block_q5_1
{
ushort d;
ushort m;
uint qh;
uchar qs[16];
};
__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
constant uint qk = QK4_1;

__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);

const float d = vload_half(0, (__global half*) &blocks[i].d);
const float m = vload_half(0, (__global half*) &blocks[i].m);
const float d = x[i].d;
const float m = x[i].m;

const uchar vi = blocks[i].qs[l];
const int x0 = (x[i].qs[j] & 0xf);
const int x1 = (x[i].qs[j] >> 4);

y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
}

const uint l2 = l * 2;
__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
constant uint qk = QK5_0;

const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);

const uint index = i*32 + l2;
result[index + 0] = ((vi & 0xf) | vh0)*d + m;
result[index + 1] = ((vi >> 4) | vh1)*d + m;
const float d = vload_half(0, (__global half*) &x[i].d);

uint32_t qh = x[i].qh;

const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;

y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
}

struct block_q8_0
{
float d;
char qs[32];
};
__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
constant uint qk = QK5_1;

const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);

const float d = vload_half(0, (__global half*) &x[i].d);
const float m = vload_half(0, (__global half*) &x[i].m);

__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
uint32_t qh = x[i].qh;

result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

const int x0 = (x[i].qs[j] & 0xf) | xh_0;
const int x1 = (x[i].qs[j] >> 4) | xh_1;

y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
}

__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
constant uint qk = QK8_0;
const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);

const float d = x[i].d;
y[i*qk + j] = x[i].qs[j]*d;
}

);
Expand All @@ -128,20 +148,6 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
} \
} while (0)

#define QK5_0 32
typedef struct {
ggml_fp16_t d; // delta
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;


typedef struct {
float d; // delta
uint32_t qh; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} cl_block_q5_0;

static cl_platform_id platform;
static cl_device_id device;
static cl_context context;
Expand Down Expand Up @@ -252,7 +258,6 @@ void ggml_cl_sgemm_wrapper(
cl_kernel kernel;
size_t global = n * k, local, size_qb;
bool dequant;
cl_block_q5_0* cl_host_b;

switch (btype) {
case GGML_TYPE_F32:
Expand All @@ -274,18 +279,7 @@ void ggml_cl_sgemm_wrapper(
dequant = true;
kernel = kernel_q5_0;
local = 16;
// For some reason OpenCL seems to be incapable of working with structs of size 22.
// 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
// TODO Find the reason, fix and remove workaround.
const block_q5_0* b = (const block_q5_0*) host_b;
cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
for (size_t i = 0; i < global / 32; i++) {
cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
}
host_b = (const float*) cl_host_b;
size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
break;
case GGML_TYPE_Q5_1:
dequant = true;
Expand Down Expand Up @@ -364,7 +358,4 @@ void ggml_cl_sgemm_wrapper(
clWaitForEvents(1, &ev_c);
clReleaseEvent(ev_sgemm);
clReleaseEvent(ev_c);
if (btype == GGML_TYPE_Q5_0) {
free((void*) cl_host_b);
}
}