From bc17e115908815d2b84ba315ebb634c55a13c99f Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 22 Jun 2023 05:29:11 -0600 Subject: [PATCH 1/4] Allow specifying p scale factor for ggml rope and rope_back ops This adds ggml_rope_scaled, ggml_rope_scaled_inplace, ggml_rope_back_scaled ops Add LLAMA_ROPE_SCALE to Makefile (note not in cmake yet), if not specified defaults to 1.0 --- Makefile | 4 ++ ggml.c | 124 ++++++++++++++++++++++++++++++++++++++---------------- ggml.h | 27 ++++++++++++ llama.cpp | 8 +++- 4 files changed, 124 insertions(+), 39 deletions(-) diff --git a/Makefile b/Makefile index 5dd676fada417..bf777ac638d54 100644 --- a/Makefile +++ b/Makefile @@ -133,6 +133,10 @@ ifndef LLAMA_NO_K_QUANTS OBJS += k_quants.o endif +ifdef LLAMA_ROPE_SCALE + CXXFLAGS += -DLLAMA_ROPE_SCALE=$(LLAMA_ROPE_SCALE) +endif + ifndef LLAMA_NO_ACCELERATE # Mac M1 - include Accelerate framework. # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time). 
diff --git a/ggml.c b/ggml.c index 4319683f5186e..7376a15f571d3 100644 --- a/ggml.c +++ b/ggml.c @@ -6603,6 +6603,7 @@ struct ggml_tensor * ggml_rope_impl( int n_past, int n_dims, int mode, + float p_scale, bool inplace) { GGML_ASSERT(n_past >= 0); bool is_node = false; @@ -6615,11 +6616,13 @@ struct ggml_tensor * ggml_rope_impl( ggml_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4); + ggml_set_name(b, "n_past, n_dims, mode, p_scale"); - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = n_dims; - ((int32_t *) b->data)[2] = mode; + ((float *) b->data)[0] = (float)n_past; + ((float *) b->data)[1] = (float)n_dims; + ((float *) b->data)[2] = (float)mode; + ((float *) b->data)[3] = p_scale; ggml_scratch_load(ctx); @@ -6637,7 +6640,7 @@ struct ggml_tensor * ggml_rope( int n_past, int n_dims, int mode) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false, 1.0); } struct ggml_tensor * ggml_rope_inplace( @@ -6646,17 +6649,39 @@ struct ggml_tensor * ggml_rope_inplace( int n_past, int n_dims, int mode) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true, 1.0); +} + +struct ggml_tensor * ggml_rope_scaled( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + float p_scale) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false, p_scale); +} + +struct ggml_tensor * ggml_rope_scaled_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + float p_scale) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true, p_scale); } + // ggml_rope_back -struct ggml_tensor * ggml_rope_back( +struct ggml_tensor * ggml_rope_back_impl( struct ggml_context * ctx, struct ggml_tensor * a, int n_past, int n_dims, - int mode) { + int 
mode, + float p_scale) { GGML_ASSERT(n_past >= 0); bool is_node = false; @@ -6668,12 +6693,13 @@ struct ggml_tensor * ggml_rope_back( ggml_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); - ggml_set_name(b, "n_past, n_dims, mode"); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4); + ggml_set_name(b, "n_past, n_dims, mode, p_scale"); - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = n_dims; - ((int32_t *) b->data)[2] = mode; + ((float *) b->data)[0] = (float)n_past; + ((float *) b->data)[1] = (float)n_dims; + ((float *) b->data)[2] = (float)mode; + ((float *) b->data)[3] = p_scale; ggml_scratch_load(ctx); @@ -6685,6 +6711,26 @@ struct ggml_tensor * ggml_rope_back( return result; } +struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode) { + return ggml_rope_back_impl(ctx, a, n_past, n_dims, mode, 1.0); +} + +struct ggml_tensor * ggml_rope_back_scaled( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + float p_scale) { + return ggml_rope_back_impl(ctx, a, n_past, n_dims, mode, p_scale); +} + + // ggml_alibi struct ggml_tensor * ggml_alibi( @@ -12110,16 +12156,17 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 3); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(src1) == 4); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + const int n_past = (int)((float *) src1->data)[0]; + const int n_dims = (int)((float *) src1->data)[1]; + const int mode = (int)((float *) src1->data)[2]; + const float p_scale 
= ((float *) src1->data)[3]; assert(n_past >= 0); @@ -12172,7 +12219,7 @@ static void ggml_compute_forward_rope_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = p_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { @@ -12223,16 +12270,17 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 3); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(src1) == 4); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + const int n_past = (int)((float *) src1->data)[0]; + const int n_dims = (int)((float *) src1->data)[1]; + const int mode = (int)((float *) src1->data)[2]; + const float p_scale = ((float *) src1->data)[3]; assert(n_past >= 0); @@ -12285,7 +12333,7 @@ static void ggml_compute_forward_rope_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = p_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { @@ -12359,8 +12407,8 @@ static void ggml_compute_forward_rope_back_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + assert(src1->type == GGML_TYPE_F32); + assert(ggml_nelements(src1) == 4); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -12370,9 +12418,10 @@ static void ggml_compute_forward_rope_back_f32( // dx = rope_back(dy, src1) // src0 is dy, src1 contains options - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) 
src1->data)[2]; + const int n_past = (int)((float *) src1->data)[0]; + const int n_dims = (int)((float *) src1->data)[1]; + const int mode = (int)((float *) src1->data)[2]; + const float p_scale = ((float *) src1->data)[3]; assert(n_past >= 0); @@ -12423,7 +12472,7 @@ static void ggml_compute_forward_rope_back_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = p_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { @@ -12472,8 +12521,8 @@ static void ggml_compute_forward_rope_back_f16( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + assert(src1->type == GGML_TYPE_F32); + assert(ggml_nelements(src1) == 4); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -12483,9 +12532,10 @@ static void ggml_compute_forward_rope_back_f16( // dx = rope_back(dy, src1) // src0 is dy, src1 contains options - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + const int n_past = (int)((float *) src1->data)[0]; + const int n_dims = (int)((float *) src1->data)[1]; + const int mode = (int)((float *) src1->data)[2]; + const float p_scale = ((float *) src1->data)[3]; assert(n_past >= 0); @@ -12536,7 +12586,7 @@ static void ggml_compute_forward_rope_back_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = p_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { diff --git a/ggml.h b/ggml.h index 18c78551f3dcd..c4e40da768f5b 100644 --- a/ggml.h +++ b/ggml.h @@ -1044,6 +1044,24 @@ extern "C" { int n_dims, int mode); + // same as ggml_rope but allows specifying p scale factor + GGML_API struct ggml_tensor * ggml_rope_scaled( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int 
mode, + float p_scale); + + // same as ggml_rope_inplace but allows specifying p scale factor + GGML_API struct ggml_tensor * ggml_rope_scaled_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + float p_scale); + // rotary position embedding backward, i.e compute dx from dy // a - dy GGML_API struct ggml_tensor * ggml_rope_back( @@ -1053,6 +1071,15 @@ extern "C" { int n_dims, int mode); + // same as ggml_rope_back but allows specifying p scale factor + GGML_API struct ggml_tensor * ggml_rope_back_scaled( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + float p_scale); + // alibi position embedding // in-place, returns view(a) struct ggml_tensor * ggml_alibi( diff --git a/llama.cpp b/llama.cpp index e597f5048234b..e875486b2de12 100644 --- a/llama.cpp +++ b/llama.cpp @@ -52,6 +52,10 @@ #define LLAMA_USE_SCRATCH #define LLAMA_MAX_SCRATCH_BUFFERS 16 +#ifndef LLAMA_ROPE_SCALE +#define LLAMA_ROPE_SCALE 1.0 +#endif + // available llama models enum e_model { MODEL_UNKNOWN, @@ -1473,11 +1477,11 @@ static bool llama_eval_internal( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_scaled_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, LLAMA_ROPE_SCALE); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Qcur = ggml_rope_scaled_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, LLAMA_ROPE_SCALE); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); From 4bf45a7dbeb239a428b49b2c68e49c41d5ed6e80 Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 22 Jun 2023 06:37:21 -0600 Subject: [PATCH 
2/4] Helps to pass args in the correct order --- ggml.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/ggml.c b/ggml.c index 7376a15f571d3..3d2900fe3b95c 100644 --- a/ggml.c +++ b/ggml.c @@ -6640,7 +6640,7 @@ struct ggml_tensor * ggml_rope( int n_past, int n_dims, int mode) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false, 1.0); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 1.0, false); } struct ggml_tensor * ggml_rope_inplace( @@ -6649,7 +6649,7 @@ struct ggml_tensor * ggml_rope_inplace( int n_past, int n_dims, int mode) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true, 1.0); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 1.0, true); } struct ggml_tensor * ggml_rope_scaled( @@ -6659,7 +6659,7 @@ struct ggml_tensor * ggml_rope_scaled( int n_dims, int mode, float p_scale) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false, p_scale); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, p_scale, false); } struct ggml_tensor * ggml_rope_scaled_inplace( @@ -6669,7 +6669,7 @@ struct ggml_tensor * ggml_rope_scaled_inplace( int n_dims, int mode, float p_scale) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true, p_scale); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, p_scale, true); } @@ -15763,18 +15763,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + assert(src1->type == GGML_TYPE_F32); + assert(ggml_nelements(src1) == 4); + const int n_past = (int)((float *) src1->data)[0]; + const int n_dims = (int)((float *) src1->data)[1]; + const int mode = (int)((float *) src1->data)[2]; + const float p_scale = ((float *) src1->data)[3]; src0->grad = 
ggml_add_impl(ctx, src0->grad, - ggml_rope_back(ctx, + ggml_rope_back_scaled(ctx, tensor->grad, n_past, n_dims, - mode), + mode, + p_scale), inplace); } if (src1->grad) { @@ -15784,18 +15786,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_ROPE_BACK: { if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + assert(src1->type == GGML_TYPE_F32); + assert(ggml_nelements(src1) == 4); + const int n_past = (int)((float *) src1->data)[0]; + const int n_dims = (int)((float *) src1->data)[1]; + const int mode = (int)((float *) src1->data)[2]; + const float p_scale = ((float *) src1->data)[3]; src0->grad = ggml_add_impl(ctx, src0->grad, - ggml_rope(ctx, + ggml_rope_scaled(ctx, tensor->grad, n_past, n_dims, - mode), + mode, + p_scale), inplace); } if (src1->grad) { From 887694acfd3f8fe9737e01537cab7090d202a84b Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 22 Jun 2023 08:18:01 -0600 Subject: [PATCH 3/4] Handle rope params in CUDA, Metal Bail out if p_scale != 1.0 in rope operation for the time being --- ggml-cuda.cu | 10 +++++++--- ggml-metal.m | 11 +++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 36a251ecce973..a64547cd929fb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1906,10 +1906,14 @@ inline void ggml_cuda_op_rope( const int64_t ne00 = src0->ne[0]; const int64_t i01_diff = i01_high - i01_low; - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + assert(src1->type == GGML_TYPE_F32); + assert(ggml_nelements(src1) == 4); + const int n_past = (int)((float *) src1->data)[0]; + const int n_dims = (int)((float *) src1->data)[1]; + const int mode = (int)((float *) src1->data)[2]; +
const float p_scale = ((float *) src1->data)[3]; GGML_ASSERT(mode == 0); + GGML_ASSERT(p_scale == 1.0); const float theta_scale = powf(10000.0, -2.0f/n_dims); const float p = ((mode & 1) == 0 ? n_past + i02 : i02); diff --git a/ggml-metal.m b/ggml-metal.m index a7e104dc76fca..1798b68e62ab4 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -861,10 +861,13 @@ void ggml_metal_graph_compute( encoder = [command_buffer computeCommandEncoder]; } - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - - const int n_past = ((int32_t *)(src1->data))[0]; + assert(src1->type == GGML_TYPE_F32); + assert(ggml_nelements(src1) == 4); + const int n_past = (int)((float *) src1->data)[0]; + const int n_dims = (int)((float *) src1->data)[1]; + const int mode = (int)((float *) src1->data)[2]; + const float p_scale = ((float *) src1->data)[3]; + GGML_ASSERT(p_scale == 1.0 && "no Metal support for rope p_scale != 1.0"); [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; From e92795f2f4d02c4dfb660b932477f6eacec17c2a Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Thu, 22 Jun 2023 14:06:13 -0600 Subject: [PATCH 4/4] Add CUDA and hopefully Metal support for p_scale --- ggml-cuda.cu | 3 +-- ggml-metal.m | 40 ++++++++++++++++++++-------------------- ggml-metal.metal | 3 ++- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a64547cd929fb..f2897c3ada25a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1913,10 +1913,9 @@ inline void ggml_cuda_op_rope( const int mode = (int)((float *) src1->data)[2]; const float p_scale = ((float *) src1->data)[3]; GGML_ASSERT(mode == 0); - GGML_ASSERT(p_scale == 1.0); const float theta_scale = powf(10000.0, -2.0f/n_dims); - const float p = ((mode & 1) == 0 ? n_past + i02 : i02); + const float p = p_scale * ((mode & 1) == 0 ? 
n_past + i02 : i02); // compute rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main); diff --git a/ggml-metal.m b/ggml-metal.m index 1798b68e62ab4..2cb146b6790b7 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -867,30 +867,30 @@ void ggml_metal_graph_compute( const int n_dims = (int)((float *) src1->data)[1]; const int mode = (int)((float *) src1->data)[2]; const float p_scale = ((float *) src1->data)[3]; - GGML_ASSERT(p_scale == 1.0 && "no Metal support for rope p_scale != 1.0"); [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; - [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; + [encoder 
setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; + [encoder setBytes:&mode length:sizeof( int) atIndex:20]; + [encoder setBytes:&p_scale length:sizeof( float) atIndex:21]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index d1e49222db2eb..f50bfd8119e79 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -615,6 +615,7 @@ kernel void kernel_rope( constant int & n_past, constant int & n_dims, constant int & mode, + constant float & p_scale, uint3 tpig[[thread_position_in_grid]]) { const int64_t i3 = tpig[2]; const int64_t i2 = tpig[1]; @@ -625,7 +626,7 @@ kernel void kernel_rope( const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); - float theta = (float)p; + float theta = p_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) {