Allow specifying p scale factor for ggml rope and rope_back ops #1967

Closed · wants to merge 4 commits · Changes from 2 commits
4 changes: 4 additions & 0 deletions Makefile
@@ -133,6 +133,10 @@ ifndef LLAMA_NO_K_QUANTS
OBJS += k_quants.o
endif

ifdef LLAMA_ROPE_SCALE
CXXFLAGS += -DLLAMA_ROPE_SCALE=$(LLAMA_ROPE_SCALE)
endif

ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
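With this hunk the scale becomes a build-time knob; for example (0.5 is an illustrative value, not a default shipped by this PR):

    make LLAMA_ROPE_SCALE=0.5

This forwards -DLLAMA_ROPE_SCALE=0.5 through CXXFLAGS, overriding the fallback #define in llama.cpp shown further down.
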
156 changes: 105 additions & 51 deletions ggml.c
@@ -6603,6 +6603,7 @@ struct ggml_tensor * ggml_rope_impl(
int n_past,
int n_dims,
int mode,
float p_scale,
bool inplace) {
GGML_ASSERT(n_past >= 0);
bool is_node = false;
@@ -6615,11 +6616,13 @@ struct ggml_tensor * ggml_rope_impl(

ggml_scratch_save(ctx);

struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
ggml_set_name(b, "n_past, n_dims, mode, p_scale");

((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_dims;
((int32_t *) b->data)[2] = mode;
((float *) b->data)[0] = (float)n_past;
((float *) b->data)[1] = (float)n_dims;
((float *) b->data)[2] = (float)mode;
((float *) b->data)[3] = p_scale;

ggml_scratch_load(ctx);

@@ -6637,7 +6640,7 @@ struct ggml_tensor * ggml_rope(
int n_past,
int n_dims,
int mode) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 1.0, false);
}

struct ggml_tensor * ggml_rope_inplace(
@@ -6646,17 +6649,39 @@ struct ggml_tensor * ggml_rope_inplace(
int n_past,
int n_dims,
int mode) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 1.0, true);
}

struct ggml_tensor * ggml_rope_scaled(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
float p_scale) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, p_scale, false);
}

struct ggml_tensor * ggml_rope_scaled_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
float p_scale) {
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, p_scale, true);
}


// ggml_rope_back

struct ggml_tensor * ggml_rope_back(
struct ggml_tensor * ggml_rope_back_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode) {
int mode,
float p_scale) {
GGML_ASSERT(n_past >= 0);
bool is_node = false;

@@ -6668,12 +6693,13 @@

ggml_scratch_save(ctx);

struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
ggml_set_name(b, "n_past, n_dims, mode");
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
ggml_set_name(b, "n_past, n_dims, mode, p_scale");

((int32_t *) b->data)[0] = n_past;
((int32_t *) b->data)[1] = n_dims;
((int32_t *) b->data)[2] = mode;
((float *) b->data)[0] = (float)n_past;
((float *) b->data)[1] = (float)n_dims;
((float *) b->data)[2] = (float)mode;
((float *) b->data)[3] = p_scale;
Owner comment on lines +6699 to +6702:

Use memcpy to store the params so we can all sleep well knowing this is not UB :)

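A minimal sketch of the suggested memcpy packing, assuming the surrounding variables from this hunk (ggml.c already pulls in <string.h>):

    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_set_name(b, "n_past, n_dims, mode, p_scale");

    // Pack the parameters into a local array and copy the raw bytes in one
    // shot, rather than storing through casted pointers into b->data.
    const float params[4] = { (float) n_past, (float) n_dims, (float) mode, p_scale };
    memcpy(b->data, params, sizeof(params));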

ggml_scratch_load(ctx);

@@ -6685,6 +6711,26 @@
return result;
}

struct ggml_tensor * ggml_rope_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode) {
return ggml_rope_back_impl(ctx, a, n_past, n_dims, mode, 1.0);
}

struct ggml_tensor * ggml_rope_back_scaled(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
float p_scale) {
return ggml_rope_back_impl(ctx, a, n_past, n_dims, mode, p_scale);
}


// ggml_alibi

struct ggml_tensor * ggml_alibi(
@@ -12110,16 +12156,17 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 3);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_nelements(src1) == 4);

if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}

const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_past = (int)((float *) src1->data)[0];
const int n_dims = (int)((float *) src1->data)[1];
const int mode = (int)((float *) src1->data)[2];
const float p_scale = ((float *) src1->data)[3];

assert(n_past >= 0);

@@ -12172,7 +12219,7 @@
if (ir++ < ir0) continue;
if (ir > ir1) break;

float theta = (float)p;
float theta = p_scale * (float)p;

if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
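The kernel-side change is the same single line in all four rope variants: the rotation angle is now computed from a scaled position. With ggml's geometric frequency schedule (base 10000), the angle for dimension pair i becomes theta_i = p_scale * p * 10000^(-2i/n_dims), so a p_scale below 1 compresses out-of-range positions back into the span the model was trained on; with p_scale = 0.5, position 4096 rotates like position 2048.
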
@@ -12223,16 +12270,17 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 3);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_nelements(src1) == 4);

if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}

const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_past = (int)((float *) src1->data)[0];
const int n_dims = (int)((float *) src1->data)[1];
const int mode = (int)((float *) src1->data)[2];
const float p_scale = ((float *) src1->data)[3];

assert(n_past >= 0);

@@ -12285,7 +12333,7 @@
if (ir++ < ir0) continue;
if (ir > ir1) break;

float theta = (float)p;
float theta = p_scale * (float)p;

if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
@@ -12359,8 +12407,8 @@ static void ggml_compute_forward_rope_back_f32(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 3);
assert(src1->type == GGML_TYPE_F32);
assert(ggml_nelements(src1) == 4);

if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
@@ -12370,9 +12418,10 @@
// dx = rope_back(dy, src1)
// src0 is dy, src1 contains options

const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_past = (int)((float *) src1->data)[0];
const int n_dims = (int)((float *) src1->data)[1];
const int mode = (int)((float *) src1->data)[2];
const float p_scale = ((float *) src1->data)[3];

assert(n_past >= 0);

@@ -12423,7 +12472,7 @@
if (ir++ < ir0) continue;
if (ir > ir1) break;

float theta = (float)p;
float theta = p_scale * (float)p;

if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
@@ -12472,8 +12521,8 @@ static void ggml_compute_forward_rope_back_f16(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 3);
assert(src1->type == GGML_TYPE_F32);
assert(ggml_nelements(src1) == 4);

if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
@@ -12483,9 +12532,10 @@
// dx = rope_back(dy, src1)
// src0 is dy, src1 contains options

const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
const int n_past = (int)((float *) src1->data)[0];
const int n_dims = (int)((float *) src1->data)[1];
const int mode = (int)((float *) src1->data)[2];
const float p_scale = ((float *) src1->data)[3];

assert(n_past >= 0);

@@ -12536,7 +12586,7 @@
if (ir++ < ir0) continue;
if (ir > ir1) break;

float theta = (float)p;
float theta = p_scale * (float)p;

if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
@@ -15713,18 +15763,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
// necessary for llama
if (src0->grad) {
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 3);
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
assert(src1->type == GGML_TYPE_F32);
assert(ggml_nelements(src1) == 4);
const int n_past = (int)((float *) src1->data)[0];
const int n_dims = (int)((float *) src1->data)[1];
const int mode = (int)((float *) src1->data)[2];
const float p_scale = ((float *) src1->data)[3];
src0->grad = ggml_add_impl(ctx,
src0->grad,
ggml_rope_back(ctx,
ggml_rope_back_scaled(ctx,
tensor->grad,
n_past,
n_dims,
mode),
mode,
p_scale),
inplace);
}
if (src1->grad) {
@@ -15734,18 +15786,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
case GGML_OP_ROPE_BACK:
{
if (src0->grad) {
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 3);
const int n_past = ((int32_t *) src1->data)[0];
const int n_dims = ((int32_t *) src1->data)[1];
const int mode = ((int32_t *) src1->data)[2];
assert(src1->type == GGML_TYPE_F32);
assert(ggml_nelements(src1) == 4);
const int n_past = (int)((float *) src1->data)[0];
const int n_dims = (int)((float *) src1->data)[1];
const int mode = (int)((float *) src1->data)[2];
const float p_scale = ((float *) src1->data)[3];
src0->grad = ggml_add_impl(ctx,
src0->grad,
ggml_rope(ctx,
ggml_rope_scaled(ctx,
tensor->grad,
n_past,
n_dims,
mode),
mode,
p_scale),
inplace);
}
if (src1->grad) {
27 changes: 27 additions & 0 deletions ggml.h
@@ -1044,6 +1044,24 @@ extern "C" {
int n_dims,
int mode);

// same as ggml_rope but allows specifying p scale factor
GGML_API struct ggml_tensor * ggml_rope_scaled(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
float p_scale);

// same as ggml_rope_inplace but allows specifying p scale factor
GGML_API struct ggml_tensor * ggml_rope_scaled_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
float p_scale);
Owner comment:

No need to extend the API - add p_scale to original ggml_rope_xxx() and add comment to use p_scale == 1.0f for regular computation. Add GGML_ASSERT(p_scale == 1.0f) in backward call

Collaborator (PR author) reply:

> No need to extend the API - add p_scale to original ggml_rope_xxx()

Won't this break every single thing that currently uses the llama.cpp version of GGML?

What do you think about using a define to enable the p_scale argument for rope and having it be off by default? That way existing stuff can opt in.

It might also be worth thinking about adding something like GGML_API_VERSION which could be bumped when incompatible changes occur, so stuff building against GGML could handle API changes more gracefully.

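For a concrete picture of the opt-in API under discussion, here is a hedged usage sketch mirroring the llama.cpp call sites further down in this diff (0.5f is an illustrative value, not anything the PR defaults to):

    // Rotate Q with positions compressed by half: a model trained on a
    // 2048-token context then addresses 4096 positions inside its trained
    // rotary range. p_scale == 1.0f reproduces plain ggml_rope_inplace().
    struct ggml_tensor * Qcur = ggml_rope_scaled_inplace(
        ctx0,
        ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N),
        n_past, n_rot, /*mode=*/0, /*p_scale=*/0.5f);
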

// rotary position embedding backward, i.e compute dx from dy
// a - dy
GGML_API struct ggml_tensor * ggml_rope_back(
@@ -1053,6 +1071,15 @@ extern "C" {
int n_dims,
int mode);

// same as ggml_rope_back but allows specifying p scale factor
GGML_API struct ggml_tensor * ggml_rope_back_scaled(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode,
float p_scale);

// alibi position embedding
// in-place, returns view(a)
struct ggml_tensor * ggml_alibi(
8 changes: 6 additions & 2 deletions llama.cpp
@@ -52,6 +52,10 @@
#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16

#ifndef LLAMA_ROPE_SCALE
#define LLAMA_ROPE_SCALE 1.0
#endif

// available llama models
enum e_model {
MODEL_UNKNOWN,
@@ -1473,11 +1477,11 @@ static bool llama_eval_internal(
offload_func_kq(tmpq);
ggml_set_name(tmpq, "tmpq");

struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope_scaled_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, LLAMA_ROPE_SCALE);
offload_func_kq(Kcur);
ggml_set_name(Kcur, "Kcur");

struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Qcur = ggml_rope_scaled_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, LLAMA_ROPE_SCALE);
offload_func_kq(Qcur);
ggml_set_name(Qcur, "Qcur");
