Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix OpenCL kernels for the new formats #1422

Merged
merged 2 commits into from
May 13, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 89 additions & 69 deletions ggml-opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,109 +12,129 @@
#define MULTILINE_QUOTE(...) #__VA_ARGS__
const char * clblast_dequant = MULTILINE_QUOTE(

typedef uchar uint8_t;
typedef int int32_t;
typedef uint uint32_t;

constant uint QK4_0 = 32;
struct block_q4_0
{
float d;
uchar qs[16];
uchar qs[QK4_0 / 2];
};

__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);

const float d = blocks[i].d;

const uchar vi = blocks[i].qs[l];

const uint index = i*32 + l*2;
result[index + 0] = ((vi & 0xf) - 8)*d;
result[index + 1] = ((vi >> 4) - 8)*d;
}

constant uint QK4_1 = 32;
struct block_q4_1
{
float d;
float m;
uchar qs[16];
uchar qs[QK4_1 / 2];
};

__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);

const float d = blocks[i].d;
const float m = blocks[i].m;

const uchar vi = blocks[i].qs[l];

const uint index = i*32 + l*2;
result[index + 0] = (vi & 0xf) * d + m;
result[index + 1] = (vi >> 4) * d + m;
}

constant uint QK5_0 = 32;
struct block_q5_0
{
float d;
uint qh;
uchar qs[16];
uchar qs[QK5_0 / 2];
};

constant uint QK5_1 = 32;
struct block_q5_1
{
ushort d;
ushort m;
uint qh;
uchar qs[QK5_1 / 2];
};

constant uint QK8_0 = 32;
struct block_q8_0
{
float d;
char qs[QK8_0];
};

__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);

const float d = blocks[i].d;
__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
constant uint qk = QK4_0;

const uchar vi = blocks[i].qs[l];
const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);

const uint l2 = l * 2;
const float d = x[i].d;

const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
const int x0 = (x[i].qs[j] & 0xf) - 8;
const int x1 = (x[i].qs[j] >> 4) - 8;

const uint index = i*32 + l2;
result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
}

struct block_q5_1
{
ushort d;
ushort m;
uint qh;
uchar qs[16];
};
__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
constant uint qk = QK4_1;

const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);

const float d = x[i].d;
const float m = x[i].m;

const int x0 = (x[i].qs[j] & 0xf);
const int x1 = (x[i].qs[j] >> 4);

y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
}

__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
constant uint qk = QK5_0;

__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);

const float d = vload_half(0, (__global half*) &blocks[i].d);
const float m = vload_half(0, (__global half*) &blocks[i].m);
const float d = x[i].d;

const uchar vi = blocks[i].qs[l];
uint32_t qh = x[i].qh;

const uint l2 = l * 2;
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;

const uint index = i*32 + l2;
result[index + 0] = ((vi & 0xf) | vh0)*d + m;
result[index + 1] = ((vi >> 4) | vh1)*d + m;
y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
}

struct block_q8_0
{
float d;
char qs[32];
};
__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
constant uint qk = QK5_1;

const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);

const float d = vload_half(0, (__global half*) &x[i].d);
const float m = vload_half(0, (__global half*) &x[i].m);

uint32_t qh = x[i].qh;

const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;

const int x0 = (x[i].qs[j] & 0xf) | xh_0;
const int x1 = (x[i].qs[j] >> 4) | xh_1;

y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
}

__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
constant uint qk = QK8_0;
const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);

result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
const float d = x[i].d;
y[i*qk + j] = x[i].qs[j]*d;
}

);
Expand Down