From bc17e115908815d2b84ba315ebb634c55a13c99f Mon Sep 17 00:00:00 2001
From: KerfuffleV2 <kerfliffle@keemail.me>
Date: Thu, 22 Jun 2023 05:29:11 -0600
Subject: [PATCH 1/4] Allow specifying p scale factor for ggml rope and
 rope_back ops

This adds ggml_rope_scaled, ggml_rope_scaled_inplace, ggml_rope_back_scaled ops

Add LLAMA_ROPE_SCALE to Makefile (note: not in CMake yet); if not specified, it defaults to 1.0
---
 Makefile  |   4 ++
 ggml.c    | 124 ++++++++++++++++++++++++++++++++++++++----------------
 ggml.h    |  27 ++++++++++++
 llama.cpp |   8 +++-
 4 files changed, 124 insertions(+), 39 deletions(-)

diff --git a/Makefile b/Makefile
index 5dd676fada417..bf777ac638d54 100644
--- a/Makefile
+++ b/Makefile
@@ -133,6 +133,10 @@ ifndef LLAMA_NO_K_QUANTS
 	OBJS     += k_quants.o
 endif
 
+ifdef LLAMA_ROPE_SCALE
+	CXXFLAGS  += -DLLAMA_ROPE_SCALE=$(LLAMA_ROPE_SCALE)
+endif
+
 ifndef LLAMA_NO_ACCELERATE
 	# Mac M1 - include Accelerate framework.
 	# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
diff --git a/ggml.c b/ggml.c
index 4319683f5186e..7376a15f571d3 100644
--- a/ggml.c
+++ b/ggml.c
@@ -6603,6 +6603,7 @@ struct ggml_tensor * ggml_rope_impl(
         int                   n_past,
         int                   n_dims,
         int                   mode,
+        float                 p_scale,
         bool                  inplace) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
@@ -6615,11 +6616,13 @@ struct ggml_tensor * ggml_rope_impl(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
+    ggml_set_name(b, "n_past, n_dims, mode, p_scale");
 
-    ((int32_t *) b->data)[0] = n_past;
-    ((int32_t *) b->data)[1] = n_dims;
-    ((int32_t *) b->data)[2] = mode;
+    ((float *) b->data)[0] = (float)n_past;
+    ((float *) b->data)[1] = (float)n_dims;
+    ((float *) b->data)[2] = (float)mode;
+    ((float *) b->data)[3] = p_scale;
 
     ggml_scratch_load(ctx);
 
@@ -6637,7 +6640,7 @@ struct ggml_tensor * ggml_rope(
         int                   n_past,
         int                   n_dims,
         int                   mode) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false, 1.0);
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -6646,17 +6649,39 @@ struct ggml_tensor * ggml_rope_inplace(
         int                   n_past,
         int                   n_dims,
         int                   mode) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true, 1.0);
+}
+
+struct ggml_tensor * ggml_rope_scaled(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        float                 p_scale) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false, p_scale);
+}
+
+struct ggml_tensor * ggml_rope_scaled_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        float                 p_scale) {
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true, p_scale);
 }
 
+
 // ggml_rope_back
 
-struct ggml_tensor * ggml_rope_back(
+struct ggml_tensor * ggml_rope_back_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         int                   n_past,
         int                   n_dims,
-        int                   mode) {
+        int                   mode,
+        float                 p_scale) {
     GGML_ASSERT(n_past >= 0);
     bool is_node = false;
 
@@ -6668,12 +6693,13 @@ struct ggml_tensor * ggml_rope_back(
 
     ggml_scratch_save(ctx);
 
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
-    ggml_set_name(b, "n_past, n_dims, mode");
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
+    ggml_set_name(b, "n_past, n_dims, mode, p_scale");
 
-    ((int32_t *) b->data)[0] = n_past;
-    ((int32_t *) b->data)[1] = n_dims;
-    ((int32_t *) b->data)[2] = mode;
+    ((float *) b->data)[0] = (float)n_past;
+    ((float *) b->data)[1] = (float)n_dims;
+    ((float *) b->data)[2] = (float)mode;
+    ((float *) b->data)[3] = p_scale;
 
     ggml_scratch_load(ctx);
 
@@ -6685,6 +6711,26 @@ struct ggml_tensor * ggml_rope_back(
     return result;
 }
 
+struct ggml_tensor * ggml_rope_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode) {
+    return ggml_rope_back_impl(ctx, a, n_past, n_dims, mode, 1.0);
+}
+
+struct ggml_tensor * ggml_rope_back_scaled(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_dims,
+        int                   mode,
+        float                 p_scale) {
+    return ggml_rope_back_impl(ctx, a, n_past, n_dims, mode, p_scale);
+}
+
+
 // ggml_alibi
 
 struct ggml_tensor * ggml_alibi(
@@ -12110,16 +12156,17 @@ static void ggml_compute_forward_rope_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 3);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_nelements(src1) == 4);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_dims = ((int32_t *) src1->data)[1];
-    const int mode   = ((int32_t *) src1->data)[2];
+    const int n_past    = (int)((float *) src1->data)[0];
+    const int n_dims    = (int)((float *) src1->data)[1];
+    const int mode      = (int)((float *) src1->data)[2];
+    const float p_scale = ((float *) src1->data)[3];
 
     assert(n_past >= 0);
 
@@ -12172,7 +12219,7 @@ static void ggml_compute_forward_rope_f32(
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
 
-                float theta = (float)p;
+                float theta = p_scale * (float)p;
 
                 if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
@@ -12223,16 +12270,17 @@ static void ggml_compute_forward_rope_f16(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(ggml_nelements(src1) == 3);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_nelements(src1) == 4);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }
 
-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_dims = ((int32_t *) src1->data)[1];
-    const int mode   = ((int32_t *) src1->data)[2];
+    const int n_past    = (int)((float *) src1->data)[0];
+    const int n_dims    = (int)((float *) src1->data)[1];
+    const int mode      = (int)((float *) src1->data)[2];
+    const float p_scale = ((float *) src1->data)[3];
 
     assert(n_past >= 0);
 
@@ -12285,7 +12333,7 @@ static void ggml_compute_forward_rope_f16(
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
 
-                float theta = (float)p;
+                float theta = p_scale * (float)p;
 
                 if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
@@ -12359,8 +12407,8 @@ static void ggml_compute_forward_rope_back_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 3);
+    assert(src1->type == GGML_TYPE_F32);
+    assert(ggml_nelements(src1) == 4);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -12370,9 +12418,10 @@ static void ggml_compute_forward_rope_back_f32(
     // dx = rope_back(dy, src1)
     // src0 is dy, src1 contains options
 
-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_dims = ((int32_t *) src1->data)[1];
-    const int mode   = ((int32_t *) src1->data)[2];
+    const int n_past    = (int)((float *) src1->data)[0];
+    const int n_dims    = (int)((float *) src1->data)[1];
+    const int mode      = (int)((float *) src1->data)[2];
+    const float p_scale = ((float *) src1->data)[3];
 
     assert(n_past >= 0);
 
@@ -12423,7 +12472,7 @@ static void ggml_compute_forward_rope_back_f32(
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
 
-                float theta = (float)p;
+                float theta = p_scale * (float)p;
 
                 if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
@@ -12472,8 +12521,8 @@ static void ggml_compute_forward_rope_back_f16(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 3);
+    assert(src1->type == GGML_TYPE_F32);
+    assert(ggml_nelements(src1) == 4);
 
     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
@@ -12483,9 +12532,10 @@ static void ggml_compute_forward_rope_back_f16(
     // dx = rope_back(dy, src1)
     // src0 is dy, src1 contains options
 
-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_dims = ((int32_t *) src1->data)[1];
-    const int mode   = ((int32_t *) src1->data)[2];
+    const int n_past    = (int)((float *) src1->data)[0];
+    const int n_dims    = (int)((float *) src1->data)[1];
+    const int mode      = (int)((float *) src1->data)[2];
+    const float p_scale = ((float *) src1->data)[3];
 
     assert(n_past >= 0);
 
@@ -12536,7 +12586,7 @@ static void ggml_compute_forward_rope_back_f16(
                 if (ir++ < ir0) continue;
                 if (ir   > ir1) break;
 
-                float theta = (float)p;
+                float theta = p_scale * (float)p;
 
                 if (!is_neox) {
                     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
diff --git a/ggml.h b/ggml.h
index 18c78551f3dcd..c4e40da768f5b 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1044,6 +1044,24 @@ extern "C" {
             int                   n_dims,
             int                   mode);
 
+    // same as ggml_rope but allows specifying p scale factor
+    GGML_API struct ggml_tensor * ggml_rope_scaled(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            float                 p_scale);
+
+    // same as ggml_rope_inplace but allows specifying p scale factor
+    GGML_API struct ggml_tensor * ggml_rope_scaled_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            float                 p_scale);
+
     // rotary position embedding backward, i.e compute dx from dy
     // a - dy
     GGML_API struct ggml_tensor * ggml_rope_back(
@@ -1053,6 +1071,15 @@ extern "C" {
             int                   n_dims,
             int                   mode);
 
+    // same as ggml_rope_back but allows specifying p scale factor
+    GGML_API struct ggml_tensor * ggml_rope_back_scaled(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            float                 p_scale);
+
     // alibi position embedding
     // in-place, returns view(a)
     struct ggml_tensor * ggml_alibi(
diff --git a/llama.cpp b/llama.cpp
index e597f5048234b..e875486b2de12 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -52,6 +52,10 @@
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
+#ifndef LLAMA_ROPE_SCALE
+#define LLAMA_ROPE_SCALE 1.0
+#endif
+
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
@@ -1473,11 +1477,11 @@ static bool llama_eval_internal(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
 
-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope_scaled_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, LLAMA_ROPE_SCALE);
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Qcur = ggml_rope_scaled_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, LLAMA_ROPE_SCALE);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 

From 4bf45a7dbeb239a428b49b2c68e49c41d5ed6e80 Mon Sep 17 00:00:00 2001
From: KerfuffleV2 <kerfliffle@keemail.me>
Date: Thu, 22 Jun 2023 06:37:21 -0600
Subject: [PATCH 2/4] Helps to pass args in the correct order

---
 ggml.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/ggml.c b/ggml.c
index 7376a15f571d3..3d2900fe3b95c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -6640,7 +6640,7 @@ struct ggml_tensor * ggml_rope(
         int                   n_past,
         int                   n_dims,
         int                   mode) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false, 1.0);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 1.0, false);
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -6649,7 +6649,7 @@ struct ggml_tensor * ggml_rope_inplace(
         int                   n_past,
         int                   n_dims,
         int                   mode) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true, 1.0);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, 1.0, true);
 }
 
 struct ggml_tensor * ggml_rope_scaled(
@@ -6659,7 +6659,7 @@ struct ggml_tensor * ggml_rope_scaled(
         int                   n_dims,
         int                   mode,
         float                 p_scale) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, false, p_scale);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, p_scale, false);
 }
 
 struct ggml_tensor * ggml_rope_scaled_inplace(
@@ -6669,7 +6669,7 @@ struct ggml_tensor * ggml_rope_scaled_inplace(
         int                   n_dims,
         int                   mode,
         float                 p_scale) {
-    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, true, p_scale);
+    return ggml_rope_impl(ctx, a, n_past, n_dims, mode, p_scale, true);
 }
 
 
@@ -15763,18 +15763,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 // necessary for llama
                 if (src0->grad) {
-                    assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) == 3);
-                    const int n_past = ((int32_t *) src1->data)[0];
-                    const int n_dims = ((int32_t *) src1->data)[1];
-                    const int mode   = ((int32_t *) src1->data)[2];
+                    assert(src1->type == GGML_TYPE_F32);
+                    assert(ggml_nelements(src1) == 4);
+                    const int n_past    = (int)((float *) src1->data)[0];
+                    const int n_dims    = (int)((float *) src1->data)[1];
+                    const int mode      = (int)((float *) src1->data)[2];
+                    const float p_scale = ((float *) src1->data)[3];
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
-                            ggml_rope_back(ctx,
+                            ggml_rope_back_scaled(ctx,
                                 tensor->grad,
                                 n_past,
                                 n_dims,
-                                mode),
+                                mode,
+                                p_scale),
                             inplace);
                 }
                 if (src1->grad) {
@@ -15784,18 +15786,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
         case GGML_OP_ROPE_BACK:
             {
                 if (src0->grad) {
-                    assert(src1->type == GGML_TYPE_I32);
-                    assert(ggml_nelements(src1) == 3);
-                    const int n_past = ((int32_t *) src1->data)[0];
-                    const int n_dims = ((int32_t *) src1->data)[1];
-                    const int mode   = ((int32_t *) src1->data)[2];
+                    assert(src1->type == GGML_TYPE_F32);
+                    assert(ggml_nelements(src1) == 4);
+                    const int n_past    = (int)((float *) src1->data)[0];
+                    const int n_dims    = (int)((float *) src1->data)[1];
+                    const int mode      = (int)((float *) src1->data)[2];
+                    const float p_scale = ((float *) src1->data)[3];
                     src0->grad = ggml_add_impl(ctx,
                             src0->grad,
-                            ggml_rope(ctx,
+                            ggml_rope_scaled(ctx,
                                 tensor->grad,
                                 n_past,
                                 n_dims,
-                                mode),
+                                mode,
+                                p_scale),
                             inplace);
                 }
                 if (src1->grad) {

From 887694acfd3f8fe9737e01537cab7090d202a84b Mon Sep 17 00:00:00 2001
From: KerfuffleV2 <kerfliffle@keemail.me>
Date: Thu, 22 Jun 2023 08:18:01 -0600
Subject: [PATCH 3/4] Handle rope params in CUDA, Metal

Bail out if p_scale != 1.0 in rope operation for the time being
---
 ggml-cuda.cu | 10 +++++++---
 ggml-metal.m | 11 +++++++----
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 36a251ecce973..a64547cd929fb 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1906,10 +1906,14 @@ inline void ggml_cuda_op_rope(
     const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_dims = ((int32_t *) src1->data)[1];
-    const int mode   = ((int32_t *) src1->data)[2];
+    assert(src1->type == GGML_TYPE_F32);
+    assert(ggml_nelements(src1) == 4);
+    const int n_past    = (int)((float *) src1->data)[0];
+    const int n_dims    = (int)((float *) src1->data)[1];
+    const int mode      = (int)((float *) src1->data)[2];
+    const float p_scale = ((float *) src1->data)[3];
     GGML_ASSERT(mode == 0);
+    GGML_ASSERT(p_scale == 1.0);
 
     const float theta_scale = powf(10000.0, -2.0f/n_dims);
     const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
diff --git a/ggml-metal.m b/ggml-metal.m
index a7e104dc76fca..1798b68e62ab4 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -861,10 +861,13 @@ void ggml_metal_graph_compute(
                                 encoder = [command_buffer computeCommandEncoder];
                             }
 
-                            const int n_dims = ((int32_t *) src1->data)[1];
-                            const int mode   = ((int32_t *) src1->data)[2];
-
-                            const int n_past = ((int32_t *)(src1->data))[0];
+                            assert(src1->type == GGML_TYPE_F32);
+                            assert(ggml_nelements(src1) == 4);
+                            const int n_past    = (int)((float *) src1->data)[0];
+                            const int n_dims    = (int)((float *) src1->data)[1];
+                            const int mode      = (int)((float *) src1->data)[2];
+                            const float p_scale = ((float *) src1->data)[3];
+                            GGML_ASSERT(p_scale == 1.0 && "no Metal support for rope p_scale != 1.0");
 
                             [encoder setComputePipelineState:ctx->pipeline_rope];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];

From e92795f2f4d02c4dfb660b932477f6eacec17c2a Mon Sep 17 00:00:00 2001
From: KerfuffleV2 <kerfliffle@keemail.me>
Date: Thu, 22 Jun 2023 14:06:13 -0600
Subject: [PATCH 4/4] Add CUDA and hopefully Metal support for p_scale

---
 ggml-cuda.cu     |  3 +--
 ggml-metal.m     | 40 ++++++++++++++++++++--------------------
 ggml-metal.metal |  3 ++-
 3 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index a64547cd929fb..f2897c3ada25a 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1913,10 +1913,9 @@ inline void ggml_cuda_op_rope(
     const int mode      = (int)((float *) src1->data)[2];
     const float p_scale = ((float *) src1->data)[3];
     GGML_ASSERT(mode == 0);
-    GGML_ASSERT(p_scale == 1.0);
 
     const float theta_scale = powf(10000.0, -2.0f/n_dims);
-    const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
+    const float p = p_scale * ((mode & 1) == 0 ? n_past + i02 : i02);
 
     // compute
     rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
diff --git a/ggml-metal.m b/ggml-metal.m
index 1798b68e62ab4..2cb146b6790b7 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -867,30 +867,30 @@ void ggml_metal_graph_compute(
                             const int n_dims    = (int)((float *) src1->data)[1];
                             const int mode      = (int)((float *) src1->data)[2];
                             const float p_scale = ((float *) src1->data)[3];
-                            GGML_ASSERT(p_scale == 1.0 && "no Metal support for rope p_scale != 1.0");
 
                             [encoder setComputePipelineState:ctx->pipeline_rope];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00   length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01   length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02   length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne03   length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&nb00   length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb01   length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb02   length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb03   length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&ne0    length:sizeof( int64_t) atIndex:10];
-                            [encoder setBytes:&ne1    length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne2    length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne3    length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&nb0    length:sizeof(uint64_t) atIndex:14];
-                            [encoder setBytes:&nb1    length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb2    length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb3    length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&n_past length:sizeof(     int) atIndex:18];
-                            [encoder setBytes:&n_dims length:sizeof(     int) atIndex:19];
-                            [encoder setBytes:&mode   length:sizeof(     int) atIndex:20];
+                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
+                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
+                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
+                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
+                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
+                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
+                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
+                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
+                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
+                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
+                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
+                            [encoder setBytes:&n_past  length:sizeof(     int) atIndex:18];
+                            [encoder setBytes:&n_dims  length:sizeof(     int) atIndex:19];
+                            [encoder setBytes:&mode    length:sizeof(     int) atIndex:20];
+                            [encoder setBytes:&p_scale length:sizeof(   float) atIndex:21];
 
                             [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                         } break;
diff --git a/ggml-metal.metal b/ggml-metal.metal
index d1e49222db2eb..f50bfd8119e79 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -615,6 +615,7 @@ kernel void kernel_rope(
         constant       int & n_past,
         constant       int & n_dims,
         constant       int & mode,
+        constant     float & p_scale,
         uint3 tpig[[thread_position_in_grid]]) {
     const int64_t i3 = tpig[2];
     const int64_t i2 = tpig[1];
@@ -625,7 +626,7 @@ kernel void kernel_rope(
 
     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
 
-    float theta = (float)p;
+    float theta = p_scale * (float)p;
 
     if (!is_neox) {
         for (int64_t i0 = 0; i0 < ne0; i0 += 2) {