From c5df72e8486a0c1f28b564d5e99c6653bc531b35 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 17 Sep 2023 17:54:14 +0300 Subject: [PATCH 01/55] tests : verify that RoPE is "additive" --- ggml.c | 9 +- tests/CMakeLists.txt | 2 + tests/test-rope.cpp | 211 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 217 insertions(+), 5 deletions(-) create mode 100644 tests/test-rope.cpp diff --git a/ggml.c b/ggml.c index a0be068d6c9f7..fd3a51ba776dc 100644 --- a/ggml.c +++ b/ggml.c @@ -6977,7 +6977,6 @@ static struct ggml_tensor * ggml_rope_impl( float xpos_base, bool xpos_down, bool inplace) { - GGML_ASSERT(n_past >= 0); bool is_node = false; if (a->grad) { @@ -12645,8 +12644,6 @@ static void ggml_compute_forward_rope_f32( memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); - assert(n_past >= 0); - GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); @@ -12674,12 +12671,14 @@ static void ggml_compute_forward_rope_f32( const float theta_scale = powf(freq_base, -2.0f/n_dims); + const bool is_skip = mode & 1; const bool is_neox = mode & 2; const bool is_glm = mode & 4; + const bool is_diff = mode & 8; // TODO: temporary for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i2 = (is_skip ? n_past : 0); i2 < ne2; i2++) { + const int64_t p = is_diff ? n_past : is_skip ? i2 : n_past + i2; for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 916dc9d055a2d..a19e1376ed389 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -37,6 +37,8 @@ llama_build_and_test_executable(test-llama-grammar.cpp) llama_build_and_test_executable(test-grad0.cpp) # SLOW # llama_build_and_test_executable(test-opt.cpp) # SLOW +llama_build_and_test_executable(test-rope.cpp) + # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp new file mode 100644 index 0000000000000..a35bbd35352bb --- /dev/null +++ b/tests/test-rope.cpp @@ -0,0 +1,211 @@ +#include "ggml.h" + +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wdouble-promotion" +#endif + +#define MAX_NARGS 3 + +#undef MIN +#undef MAX +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define GGML_SILU_FP16 + +// +// logging +// + +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +#define GGML_PRINT(...) 
printf(__VA_ARGS__) + +static float frand(void) { + return (float)rand()/(float)RAND_MAX; +} + +static int irand(int n) { + if (n == 0) return 0; + return rand()%n; +} + +static void get_random_dims(int64_t * dims, int ndims) { + dims[0] = dims[1] = dims[2] = dims[3] = 1; + + for (int i = 0; i < ndims; i++) { + dims[i] = 1 + irand(4); + } +} + +static struct ggml_tensor * get_random_tensor_f32( + struct ggml_context * ctx0, + int ndims, + const int64_t ne[], + float fmin, + float fmax) { + struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); + + switch (ndims) { + case 1: + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin; + } + break; + case 2: + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + break; + case 3: + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + break; + case 4: + for (int i3 = 0; i3 < ne[3]; i3++) { + for (int i2 = 0; i2 < ne[2]; i2++) { + for (int i1 = 0; i1 < ne[1]; i1++) { + for (int i0 = 0; i0 < ne[0]; i0++) { + ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; + } + } + } + } + break; + default: + assert(false); + }; + + return result; +} + +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +int main(int /*argc*/, const char ** /*argv*/) { + struct ggml_init_params params = { + /* .mem_size = */ 128*1024*1024, + /* .mem_buffer = */ NULL, + /* .no_alloc = */ false, + }; + + std::vector work_buffer; + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * x; + + // rope f32 + for (int m = 0; m < 3; ++m) { + const int ndims = 4; + + const int64_t n_rot = 128; + const int64_t ne[4] = { 2*n_rot, 32, 73, 1 }; + + const int n_past_0 = 100; + const int n_past_1 = 33; + + // test mode 0, 2, 4 (standard, GPT-NeoX, GLM) + const int mode = m == 0 ? 0 : m == 1 ? 
2 : 4; + + x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); + + // 100, 101, 102, ..., 172 + struct ggml_tensor * r0 = ggml_rope(ctx0, x, n_past_0, n_rot, mode, 1024); + // -67, -67, -67, ..., -67 + struct ggml_tensor * r1 = ggml_rope(ctx0, r0, n_past_1 - n_past_0, n_rot, mode + 8, 1024); // diff mode + + // 33, 34, 35, ..., 105 + struct ggml_tensor * r2 = ggml_rope(ctx0, x, n_past_1, n_rot, mode, 1024); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + ggml_build_forward_expand(gf, r0); + ggml_build_forward_expand(gf, r1); + ggml_build_forward_expand(gf, r2); + + ggml_graph_compute_helper(work_buffer, gf, 4); + + // check that r1 and r2 are the same + { + double sum0 = 0.0f; + double sum1 = 0.0f; + double diff = 0.0f; + + const float * r1_data = (float *) r1->data; + const float * r2_data = (float *) r2->data; + + const int n_elements = ggml_nelements(r1); + + for (int i = 0; i < n_elements; ++i) { + sum0 += fabs(r1_data[i]); + sum1 += fabs(r2_data[i]); + diff += fabs(r1_data[i] - r2_data[i]); + //if (fabs(r1_data[i] - r2_data[i]) > 0.0001f) { + // printf("%d: %f %f\n", i, r1_data[i], r2_data[i]); + // printf("diff: %f\n", fabs(r1_data[i] - r2_data[i])); + //} + } + + //for (int i = 4096; i < 4096 + 128; ++i) { + // printf("%f %f\n", r1_data[i], r2_data[i]); + //} + + printf("mode: %d\n", mode); + printf("sum0: %f\n", sum0); + printf("sum1: %f\n", sum1); + printf("diff: %f\n", diff); + printf("rel err: %f\n", diff / sum0); + printf("rel err: %f\n", diff / sum1); + + GGML_ASSERT(diff / sum0 < 0.0001f); + GGML_ASSERT(diff / sum1 < 0.0001f); + } + } + + ggml_free(ctx0); + + return 0; +} + From 3b4bab6a38502d9e68587c2c19f26472480ec4dd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 17 Sep 2023 19:42:39 +0300 Subject: [PATCH 02/55] llama : replace ggml_diag_mask_inf with ggml_add (custom -inf mask) --- ggml-metal.m | 50 +++++++++++++++++++++++++++++++++------- ggml-metal.metal | 59 +++++++++++++++++++++++++++++++++++++++++++----- ggml.c | 2 -- llama.cpp | 50 +++++++++++++++++++++++++++------------- 4 files changed, 129 insertions(+), 32 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 1139ee3114610..d793083d99e7a 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -736,25 +736,59 @@ void ggml_metal_graph_compute( GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src1)); - // utilize float4 - GGML_ASSERT(ne00 % 4 == 0); - const int64_t nb = ne00/4; + bool bcast_row = false; - if (ggml_nelements(src1) == ne10) { + int64_t nb = ne00; + + if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) { // src1 is a row GGML_ASSERT(ne11 == 1); + + nb = ne00 / 4; [encoder setComputePipelineState:ctx->pipeline_add_row]; + + bcast_row = true; } else { [encoder setComputePipelineState:ctx->pipeline_add]; } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - - const int64_t n = ggml_nelements(dst)/4; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&ne10 length:sizeof(ne10) 
atIndex:11]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17]; + [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:27]; + + if (bcast_row) { + const int64_t n = ggml_nelements(dst)/4; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } else { + const int nth = MIN(1024, ne0); - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } } break; case GGML_OP_MUL: { diff --git a/ggml-metal.metal b/ggml-metal.metal index 7f1c3d9ea74bd..c913fe1d95c6c 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -24,12 +24,59 @@ typedef struct { int8_t qs[QK8_0]; // quants } block_q8_0; +// general-purpose kernel for addition of two tensors +// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3 +// cons: not very efficient kernel void kernel_add( - device const float4 * src0, - device const float4 * src1, - device float4 * dst, - uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] + src1[tpig]; + device const char * src0, + device const char * src1, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & nb00, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant int64_t & nb10, + constant int64_t & nb11, + constant int64_t & nb12, + constant int64_t & nb13, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant int64_t & nb0, + constant int64_t & nb1, + constant int64_t & nb2, + constant int64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig.z; + const int64_t i02 = tgpig.y; + const int64_t i01 = tgpig.x; + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00; + device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10; + device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0; + + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0] + ((device float *)src1_ptr)[0]; + + src0_ptr += ntg.x*nb00; + src1_ptr += ntg.x*nb10; + dst_ptr += ntg.x*nb0; + } } // assumption: src1 is a row @@ -38,7 +85,7 @@ kernel void 
kernel_add_row( device const float4 * src0, device const float4 * src1, device float4 * dst, - constant int64_t & nb, + constant int64_t & nb [[buffer(27)]], uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] + src1[tpig % nb]; } diff --git a/ggml.c b/ggml.c index fd3a51ba776dc..37124f1e51970 100644 --- a/ggml.c +++ b/ggml.c @@ -8797,8 +8797,6 @@ static void ggml_compute_forward_add_f32( #else ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); #endif - // } - // } } } else { // src1 is not contiguous diff --git a/llama.cpp b/llama.cpp index 0cab18093a848..f129c59f30899 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2404,6 +2404,7 @@ static struct ggml_cgraph * llm_build_llama( } #endif // GGML_USE_CUBLAS + // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -2411,6 +2412,22 @@ static struct ggml_cgraph * llm_build_llama( } ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + // KQ_mask + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, N, 1); + ggml_allocr_alloc(lctx.alloc, KQ_mask); + if (!ggml_allocr_is_measure(lctx.alloc)) { + float * data = (float *) KQ_mask->data; + memset(data, 0, ggml_nbytes(KQ_mask)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < N; ++j) { + for (int i = n_past + j + 1; i < n_past + N; ++i) { + data[h*(n_past + N)*N + j*(n_past + N) + i] = -INFINITY; + } + } + } + } + for (int il = 0; il < n_layer; ++il) { ggml_format_name(inpL, "layer_inp_%d", il); @@ -2447,11 +2464,11 @@ static struct ggml_cgraph * llm_build_llama( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -2502,17 +2519,18 @@ static struct ggml_cgraph * llm_build_llama( // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); + //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); offload_func_kq(KQ_masked); ggml_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); @@ -2783,8 +2801,8 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * Qcur; switch (model.type) { case MODEL_7B: - Kcur = 
ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); break; case MODEL_13B: Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N); @@ -2847,7 +2865,7 @@ static struct ggml_cgraph * llm_build_baichaun( // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); @@ -2856,7 +2874,7 @@ static struct ggml_cgraph * llm_build_baichaun( switch (model.type) { case MODEL_7B: - KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); break; case MODEL_13B: KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8); @@ -2867,13 +2885,13 @@ static struct ggml_cgraph * llm_build_baichaun( GGML_ASSERT(false); } // KQ_masked = mask_past(KQ_scaled) - // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); // offload_func_kq(KQ_masked); // ggml_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); @@ -3179,9 +3197,9 @@ static struct ggml_cgraph * llm_build_falcon( offload_func_v(tmpv); // using mode = 2 for neox mode - struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale); offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale); + struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale); offload_func_kq(Kcur); { @@ -3220,15 +3238,15 @@ static struct ggml_cgraph * llm_build_falcon( offload_func_kq(KQ); ggml_set_name(KQ, "KQ"); - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); offload_func_kq(KQ_masked); ggml_set_name(KQ_masked, "KQ_masked"); - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); From 1fb033fd85f8125d2830bbfe6d384be3baa17ae8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 17 Sep 
2023 21:12:51 +0300 Subject: [PATCH 03/55] ggml : ggml_rope now takes a vector with positions instead of n_past --- examples/baby-llama/baby-llama.cpp | 37 +++++- .../train-text-from-scratch.cpp | 14 ++- ggml-metal.m | 49 ++++---- ggml-metal.metal | 55 +++++---- ggml.c | 113 ++++++++++-------- ggml.h | 17 +-- llama.cpp | 84 +++++++++++-- tests/test-grad0.cpp | 14 ++- tests/test-rope.cpp | 18 ++- 9 files changed, 270 insertions(+), 131 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index a99ece9a66fd1..b5916f2306354 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -556,6 +556,14 @@ struct ggml_tensor * forward( struct ggml_tensor * kc = kv_self.k; struct ggml_tensor * vc = kv_self.v; + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + // inpL shape [n_embd,N,1,1] struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); for (int il = 0; il < n_layer; ++il) { @@ -583,8 +591,8 @@ struct ggml_tensor * forward( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, 1] // Kcur shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0); // store key and value to memory { @@ -810,9 +818,18 @@ struct ggml_tensor * forward_batch( struct ggml_tensor * kc = kv_self.k; struct ggml_tensor * vc = kv_self.v; + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + // inpL shape [n_embd,N*n_batch,1] struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); assert_shape_2d(inpL, n_embd, N*n_batch); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -840,8 +857,8 @@ struct ggml_tensor * forward_batch( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, n_batch] // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); @@ -1100,6 +1117,14 @@ 
struct ggml_tensor * forward_lora( struct ggml_tensor * kc = kv_self.k; struct ggml_tensor * vc = kv_self.v; + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + // inpL shape [n_embd,N,1,1] struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); for (int il = 0; il < n_layer; ++il) { @@ -1133,7 +1158,7 @@ struct ggml_tensor * forward_lora( model->layers[il].wqb, cur)), n_embd/n_head, n_head, N), - n_past, n_rot, 0, 0); + KQ_pos, n_rot, 0, 0); struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, @@ -1142,7 +1167,7 @@ struct ggml_tensor * forward_lora( model->layers[il].wkb, cur)), n_embd/n_head, n_head, N), - n_past, n_rot, 0, 0); + KQ_pos, n_rot, 0, 0); // store key and value to memory { diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 947aa7ed3bd3e..d91566586fb14 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -679,15 +679,23 @@ struct ggml_tensor * llama_build_train_graphs( } }; + // KQ_pos - contains the positions + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); + { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + // rope has so much parameters that we make a custom function for it - auto rope = [ctx, n_rot, n_ctx, rope_freq_base, rope_freq_scale] + auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] (struct ggml_tensor * t) -> struct ggml_tensor * { // not capturing these, to silcence warnings - const int n_past = 0; const int rope_mode = 0; return ggml_rope_custom(ctx, - t, n_past, n_rot, rope_mode, n_ctx, + t, KQ_pos, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale); }; diff --git a/ggml-metal.m b/ggml-metal.m index d793083d99e7a..d4027d35099b7 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1210,7 +1210,9 @@ void ggml_metal_graph_compute( } break; case GGML_OP_ROPE: { - const int n_past = ((int32_t *) dst->op_params)[0]; + GGML_ASSERT(ne10 == ne02); + + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; @@ -1221,28 +1223,29 @@ void ggml_metal_graph_compute( [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3]; - [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; - [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; - [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; - [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; - [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; - [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; - [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; - [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; - [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; - [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; - [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; - [encoder setBytes:&nb2 
length:sizeof(uint64_t) atIndex:16]; - [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; - [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; - [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; - [encoder setBytes:&mode length:sizeof( int) atIndex:20]; - [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; - [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:14]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; + //[encoder setBytes:&n_past length:sizeof( int) atIndex:19]; + [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; + [encoder setBytes:&mode length:sizeof( int) atIndex:21]; + [encoder setBytes:&freq_base length:sizeof(float) atIndex:22]; + [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index c913fe1d95c6c..16937be5e17c1 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -854,29 +854,30 @@ kernel void kernel_alibi_f32( } kernel void kernel_rope( - device const void * src0, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - constant int & n_past, - constant int & n_dims, - constant int & mode, - constant float & freq_base, - constant float & freq_scale, + device const void * src0, + device const int32_t * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int & n_past, + constant int & n_dims, + constant int & mode, + constant float & freq_base, + constant float & freq_scale, uint tiitg[[thread_index_in_threadgroup]], uint3 tptg[[threads_per_threadgroup]], uint3 tgpig[[threadgroup_position_in_grid]]) { 
@@ -886,7 +887,9 @@ kernel void kernel_rope( const bool is_neox = mode & 2; - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + device const int32_t * pos = src1; + + const int64_t p = pos[i2]; const float theta_0 = freq_scale * (float)p; const float inv_ndims = -1.f/n_dims; @@ -1320,8 +1323,8 @@ kernel void kernel_mul_mat_q3_K_f32( float yl[32]; - const uint16_t kmask1 = 0x3030; - const uint16_t kmask2 = 0x0f0f; + //const uint16_t kmask1 = 0x3030; + //const uint16_t kmask2 = 0x0f0f; const int tid = tiisg/4; const int ix = tiisg%4; diff --git a/ggml.c b/ggml.c index 37124f1e51970..e4faafee6d115 100644 --- a/ggml.c +++ b/ggml.c @@ -6968,7 +6968,7 @@ struct ggml_tensor * ggml_soft_max_back_inplace( static struct ggml_tensor * ggml_rope_impl( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -6977,6 +6977,10 @@ static struct ggml_tensor * ggml_rope_impl( float xpos_base, bool xpos_down, bool inplace) { + GGML_ASSERT(ggml_is_vector(b)); + GGML_ASSERT(b->type == GGML_TYPE_I32); + GGML_ASSERT(a->ne[2] == b->ne[0]); + bool is_node = false; if (a->grad) { @@ -6985,7 +6989,7 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[8] = { n_past, n_dims, mode, n_ctx }; + int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float)); memcpy(params + 6, &xpos_base, sizeof(float)); @@ -6995,6 +6999,7 @@ static struct ggml_tensor * ggml_rope_impl( result->op = GGML_OP_ROPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; + result->src[1] = b; return result; } @@ -7002,55 +7007,55 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * ggml_rope( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); + return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); } struct ggml_tensor * ggml_rope_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); + return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); } struct ggml_tensor * ggml_rope_custom( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, float freq_base, float freq_scale) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); + return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); } struct ggml_tensor * ggml_rope_custom_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, float freq_base, float freq_scale) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); + return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); } struct ggml_tensor * ggml_rope_xpos_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, float base, bool down) { - 
return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); + return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); } // ggml_rope_back @@ -7058,7 +7063,7 @@ struct ggml_tensor * ggml_rope_xpos_inplace( struct ggml_tensor * ggml_rope_back( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -7066,7 +7071,10 @@ struct ggml_tensor * ggml_rope_back( float freq_scale, float xpos_base, bool xpos_down) { - GGML_ASSERT(n_past >= 0); + GGML_ASSERT(ggml_is_vector(b)); + GGML_ASSERT(b->type == GGML_TYPE_I32); + GGML_ASSERT(a->ne[2] == b->ne[0]); + GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); bool is_node = false; @@ -7077,7 +7085,7 @@ struct ggml_tensor * ggml_rope_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - int32_t params[8] = { n_past, n_dims, mode, n_ctx }; + int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float)); memcpy(params + 6, &xpos_base, sizeof(float)); @@ -7087,6 +7095,7 @@ struct ggml_tensor * ggml_rope_back( result->op = GGML_OP_ROPE_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; + result->src[1] = b; return result; } @@ -12620,8 +12629,8 @@ static void ggml_compute_forward_clamp( static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -12631,9 +12640,9 @@ static void ggml_compute_forward_rope_f32( // these two only relevant for xPos RoPE: float xpos_base; - bool xpos_down; + bool xpos_down; - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; @@ -12669,14 +12678,14 @@ static void ggml_compute_forward_rope_f32( const float theta_scale = powf(freq_base, -2.0f/n_dims); - const bool is_skip = mode & 1; const bool is_neox = mode & 2; const bool is_glm = mode & 4; - const bool is_diff = mode & 8; // TODO: temporary + + const int32_t * pos = (const int32_t *) src1->data; for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = (is_skip ? n_past : 0); i2 < ne2; i2++) { - const int64_t p = is_diff ? n_past : is_skip ? i2 : n_past + i2; + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -12713,7 +12722,7 @@ static void ggml_compute_forward_rope_f32( const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); // zeta scaling for xPos only: - float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; + float zeta = xpos_base != 0.0f ? 
powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; theta *= theta_scale; @@ -12758,8 +12767,8 @@ static void ggml_compute_forward_rope_f32( static void ggml_compute_forward_rope_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -12767,15 +12776,13 @@ static void ggml_compute_forward_rope_f16( float freq_base; float freq_scale; - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - assert(n_past >= 0); - GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); @@ -12806,9 +12813,11 @@ static void ggml_compute_forward_rope_f16( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + const int32_t * pos = (const int32_t *) src1->data; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -12887,15 +12896,16 @@ static void ggml_compute_forward_rope_f16( static void ggml_compute_forward_rope( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_f16(params, src0, dst); + ggml_compute_forward_rope_f16(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_f32(params, src0, dst); + ggml_compute_forward_rope_f32(params, src0, src1, dst); } break; default: { @@ -12909,6 +12919,7 @@ static void ggml_compute_forward_rope( static void ggml_compute_forward_rope_back_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -12926,7 +12937,7 @@ static void ggml_compute_forward_rope_back_f32( float xpos_base; bool xpos_down; - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx); @@ -12935,8 +12946,6 @@ static void ggml_compute_forward_rope_back_f32( memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); - assert(n_past >= 0); - GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); @@ -12963,9 +12972,11 @@ static void ggml_compute_forward_rope_back_f32( const bool is_neox = mode & 2; + const int32_t * pos = (const int32_t *) src1->data; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int64_t p = ((mode & 1) == 0 ? 
n_past + i2 : i2); + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -12977,7 +12988,7 @@ static void ggml_compute_forward_rope_back_f32( const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); // zeta scaling for xPos only: - float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; + float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; if (xpos_down) zeta = 1.0f / zeta; theta *= theta_scale; @@ -13020,6 +13031,7 @@ static void ggml_compute_forward_rope_back_f32( static void ggml_compute_forward_rope_back_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { @@ -13030,12 +13042,10 @@ static void ggml_compute_forward_rope_back_f16( // dx = rope_back(dy, src1) // src0 is dy, src1 contains options - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - assert(n_past >= 0); - GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); @@ -13062,9 +13072,11 @@ static void ggml_compute_forward_rope_back_f16( const bool is_neox = mode & 2; + const int32_t * pos = (const int32_t *) src1->data; + for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + for (int64_t i2 = 0; i2 < ne2; i2++) { + const int64_t p = pos[i2]; for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; @@ -13116,15 +13128,16 @@ static void ggml_compute_forward_rope_back_f16( static void ggml_compute_forward_rope_back( const struct ggml_compute_params * params, const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_back_f16(params, src0, dst); + ggml_compute_forward_rope_back_f16(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_back_f32(params, src0, dst); + ggml_compute_forward_rope_back_f32(params, src0, src1, dst); } break; default: { @@ -15861,11 +15874,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_ROPE: { - ggml_compute_forward_rope(params, tensor->src[0], tensor); + ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ROPE_BACK: { - ggml_compute_forward_rope_back(params, tensor->src[0], tensor); + ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_ALIBI: { @@ -16503,7 +16516,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - const int n_past = ((int32_t *) tensor->op_params)[0]; + //const int n_past = ((int32_t *) tensor->op_params)[0]; const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; const int n_ctx = ((int32_t *) tensor->op_params)[3]; @@ -16520,7 +16533,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad, ggml_rope_back(ctx, 
tensor->grad, - n_past, + src1, n_dims, mode, n_ctx, @@ -16534,7 +16547,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_ROPE_BACK: { if (src0->grad) { - const int n_past = ((int32_t *) tensor->op_params)[0]; + //const int n_past = ((int32_t *) tensor->op_params)[0]; const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; const int n_ctx = ((int32_t *) tensor->op_params)[3]; @@ -16551,7 +16564,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad, ggml_rope_impl(ctx, tensor->grad, - n_past, + src1, n_dims, mode, n_ctx, diff --git a/ggml.h b/ggml.h index f45456876da62..e2bfa1ae4f18d 100644 --- a/ggml.h +++ b/ggml.h @@ -1219,14 +1219,15 @@ extern "C" { struct ggml_tensor * b); // rotary position embedding - // if mode & 1 == 1, skip n_past elements + // if mode & 1 == 1, skip n_past elements (DEPRECATED) // if mode & 2 == 1, GPT-NeoX style // if mode & 4 == 1, ChatGLM style - // TODO: avoid creating a new tensor every time + // + // b is an int32 vector with size a->ne[2], it contains the positions GGML_API struct ggml_tensor * ggml_rope( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx); @@ -1235,7 +1236,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_rope_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx); @@ -1244,7 +1245,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_rope_custom( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -1255,7 +1256,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_rope_custom_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, @@ -1266,7 +1267,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, float base, bool down); @@ -1276,7 +1277,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_rope_back( struct ggml_context * ctx, struct ggml_tensor * a, - int n_past, + struct ggml_tensor * b, int n_dims, int mode, int n_ctx, diff --git a/llama.cpp b/llama.cpp index f129c59f30899..9d41689f78a3f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2428,6 +2428,16 @@ static struct ggml_cgraph * llm_build_llama( } } + // KQ_pos - contains the positions + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(lctx.alloc, KQ_pos); + if (!ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + for (int il = 0; il < n_layer; ++il) { ggml_format_name(inpL, "layer_inp_%d", il); @@ -2464,11 +2474,11 @@ static struct ggml_cgraph * llm_build_llama( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, 
n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -2754,6 +2764,7 @@ static struct ggml_cgraph * llm_build_baichaun( } #endif // GGML_USE_CUBLAS + // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -2761,6 +2772,32 @@ static struct ggml_cgraph * llm_build_baichaun( } ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + // KQ_mask + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, N, 1); + ggml_allocr_alloc(lctx.alloc, KQ_mask); + if (!ggml_allocr_is_measure(lctx.alloc)) { + float * data = (float *) KQ_mask->data; + memset(data, 0, ggml_nbytes(KQ_mask)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < N; ++j) { + for (int i = n_past + j + 1; i < n_past + N; ++i) { + data[h*(n_past + N)*N + j*(n_past + N) + i] = -INFINITY; + } + } + } + } + + // KQ_pos - contains the positions + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(lctx.alloc, KQ_pos); + if (!ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + for (int il = 0; il < n_layer; ++il) { ggml_format_name(inpL, "layer_inp_%d", il); @@ -2801,11 +2838,11 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * Qcur; switch (model.type) { case MODEL_7B: - Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); break; case MODEL_13B: - Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N); + Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N); Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N); break; default: @@ -2874,12 +2911,14 @@ static struct ggml_cgraph * llm_build_baichaun( switch (model.type) { case MODEL_7B: - KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); + //KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); break; case MODEL_13B: KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8); ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); + KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); + //KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); break; default: GGML_ASSERT(false); @@ -3114,6 +3153,7 @@ static struct ggml_cgraph * llm_build_falcon( } #endif // GGML_USE_CUBLAS + // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -3121,6 +3161,32 @@ static struct ggml_cgraph * llm_build_falcon( } ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + // KQ_mask + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, N, 1); + 
ggml_allocr_alloc(lctx.alloc, KQ_mask); + if (!ggml_allocr_is_measure(lctx.alloc)) { + float * data = (float *) KQ_mask->data; + memset(data, 0, ggml_nbytes(KQ_mask)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < N; ++j) { + for (int i = n_past + j + 1; i < n_past + N; ++i) { + data[h*(n_past + N)*N + j*(n_past + N) + i] = -INFINITY; + } + } + } + } + + // KQ_pos - contains the positions + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(lctx.alloc, KQ_pos); + if (!ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -3197,9 +3263,9 @@ static struct ggml_cgraph * llm_build_falcon( offload_func_v(tmpv); // using mode = 2 for neox mode - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale); + struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); offload_func_kq(Kcur); { diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 468cde66adc65..7b0c0fcdbb54c 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -1404,6 +1404,11 @@ int main(int argc, const char ** argv) { for (int n_past = 1; n_past < ne2[2]; ++n_past) { x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); + struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]); + for (int i = 0; i < ne2[2]; ++i) { + ((int32_t *) p->data)[i] = n_past + i; + } + ggml_set_param(ctx0, x[0]); const bool skip_past = (mode & 1); @@ -1415,7 +1420,7 @@ int main(int argc, const char ** argv) { continue; } - struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0)); + struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0)); GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY); @@ -1438,6 +1443,11 @@ int main(int argc, const char ** argv) { for (int n_past = 1; n_past < ne2[2]; ++n_past) { x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f); + struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]); + for (int i = 0; i < ne2[2]; ++i) { + ((int32_t *) p->data)[i] = n_past + i; + } + ggml_set_param(ctx0, x[0]); const bool skip_past = (mode & 1); @@ -1449,7 +1459,7 @@ int main(int argc, const char ** argv) { continue; } - struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0)); + struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0)); GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY); diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index a35bbd35352bb..26c1f42dc0e95 100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -144,7 +144,17 @@ int main(int /*argc*/, const char ** /*argv*/) { const int64_t ne[4] = { 2*n_rot, 32, 73, 1 }; const int n_past_0 = 100; - const int n_past_1 = 33; + const int n_past_2 = 33; + + struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]); + struct 
ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]); + struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]); + + for (int i = 0; i < ne[2]; ++i) { + ((int32_t *) p0->data)[i] = n_past_0 + i; + ((int32_t *) p1->data)[i] = n_past_2 - n_past_0; + ((int32_t *) p2->data)[i] = n_past_2 + i; + } // test mode 0, 2, 4 (standard, GPT-NeoX, GLM) const int mode = m == 0 ? 0 : m == 1 ? 2 : 4; @@ -152,12 +162,12 @@ int main(int /*argc*/, const char ** /*argv*/) { x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); // 100, 101, 102, ..., 172 - struct ggml_tensor * r0 = ggml_rope(ctx0, x, n_past_0, n_rot, mode, 1024); + struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode, 1024); // -67, -67, -67, ..., -67 - struct ggml_tensor * r1 = ggml_rope(ctx0, r0, n_past_1 - n_past_0, n_rot, mode + 8, 1024); // diff mode + struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode, 1024); // "context swap", i.e. forget n_past_0 - n_past_2 tokens // 33, 34, 35, ..., 105 - struct ggml_tensor * r2 = ggml_rope(ctx0, x, n_past_1, n_rot, mode, 1024); + struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode, 1024); ggml_cgraph * gf = ggml_new_graph(ctx0); From fad56936d484a48eede12f30d194a26e1ea9e6b1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 17 Sep 2023 23:09:48 +0300 Subject: [PATCH 04/55] metal : add rope_f16 kernel + optimize cpy kernels --- ggml-metal.m | 36 +++++++++++++++++++++++------------- ggml-metal.metal | 45 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 62 insertions(+), 19 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index d4027d35099b7..231debcfd04a0 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -100,7 +100,8 @@ GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); - GGML_METAL_DECL_KERNEL(rope); + GGML_METAL_DECL_KERNEL(rope_f32); + GGML_METAL_DECL_KERNEL(rope_f16); GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f32); @@ -261,7 +262,8 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); - GGML_METAL_ADD_KERNEL(rope); + GGML_METAL_ADD_KERNEL(rope_f32); + GGML_METAL_ADD_KERNEL(rope_f16); GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f32); @@ -335,7 +337,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); - GGML_METAL_DEL_KERNEL(rope); + GGML_METAL_DEL_KERNEL(rope_f32); + GGML_METAL_DEL_KERNEL(rope_f16); GGML_METAL_DEL_KERNEL(alibi_f32); GGML_METAL_DEL_KERNEL(cpy_f32_f16); GGML_METAL_DEL_KERNEL(cpy_f32_f32); @@ -870,7 +873,7 @@ void ggml_metal_graph_compute( } break; case GGML_OP_SOFT_MAX: { - const int nth = 32; + const int nth = MIN(32, ne00); if (ne00%4 == 0) { [encoder setComputePipelineState:ctx->pipeline_soft_max_4]; @@ -1134,7 +1137,7 @@ void ggml_metal_graph_compute( float eps; memcpy(&eps, dst->op_params, sizeof(float)); - const int nth = 512; + const int nth = MIN(512, ne00); [encoder setComputePipelineState:ctx->pipeline_rms_norm]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1153,7 +1156,7 @@ void ggml_metal_graph_compute( float eps; memcpy(&eps, dst->op_params, sizeof(float)); - const int nth = 256; + const int nth = MIN(256, ne00); [encoder 
setComputePipelineState:ctx->pipeline_norm]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1171,6 +1174,8 @@ void ggml_metal_graph_compute( { GGML_ASSERT((src0t == GGML_TYPE_F32)); + const int nth = MIN(1024, ne00); + const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); const int n_head = ((int32_t *) dst->op_params)[1]; float max_bias; @@ -1204,15 +1209,15 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; [encoder setBytes:&m0 length:sizeof( float) atIndex:18]; - const int nth = 32; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_ROPE: { GGML_ASSERT(ne10 == ne02); - //const int n_past = ((int32_t *) dst->op_params)[0]; + const int nth = MIN(1024, ne00); + + const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; @@ -1221,7 +1226,12 @@ void ggml_metal_graph_compute( memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - [encoder setComputePipelineState:ctx->pipeline_rope]; + switch (src0->type) { + case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break; + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_rope_f16]; break; + default: GGML_ASSERT(false); + }; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; @@ -1241,19 +1251,19 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:16]; [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:17]; [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:18]; - //[encoder setBytes:&n_past length:sizeof( int) atIndex:19]; + [encoder setBytes:&n_past length:sizeof( int) atIndex:19]; [encoder setBytes:&n_dims length:sizeof( int) atIndex:20]; [encoder setBytes:&mode length:sizeof( int) atIndex:21]; [encoder setBytes:&freq_base length:sizeof(float) atIndex:22]; [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23]; - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_DUP: case GGML_OP_CPY: case GGML_OP_CONT: { - const int nth = 32; + const int nth = MIN(1024, ne00); switch (src0t) { case GGML_TYPE_F32: diff --git a/ggml-metal.metal b/ggml-metal.metal index 16937be5e17c1..5e1af6a092aed 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -853,6 +853,36 @@ kernel void kernel_alibi_f32( } } +typedef void (rope_t)( + device const void * src0, + device const int32_t * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int & n_past, + constant int & n_dims, + constant int & mode, + constant float & freq_base, + constant float & freq_scale, + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg[[threads_per_threadgroup]], + 
uint3 tgpig[[threadgroup_position_in_grid]]); + +template kernel void kernel_rope( device const void * src0, device const int32_t * src1, @@ -901,11 +931,11 @@ kernel void kernel_rope( const float cos_theta = cos(theta); const float sin_theta = sin(theta); - device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = src[0]; - const float x1 = src[1]; + const T x0 = src[0]; + const T x1 = src[1]; dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[1] = x0*sin_theta + x1*cos_theta; @@ -920,8 +950,8 @@ kernel void kernel_rope( const int64_t i0 = ib*n_dims + ic/2; - device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); const float x0 = src[0]; const float x1 = src[n_dims/2]; @@ -933,6 +963,9 @@ kernel void kernel_rope( } } +template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope; +template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope; + kernel void kernel_cpy_f16_f16( device const half * src0, device half * dst, From d29e76937c3c5e1a128d2883bd1149e3b5dfb652 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Sep 2023 10:08:22 +0300 Subject: [PATCH 05/55] llama : unified KV cache + batch inference API --- common/common.cpp | 6 +- common/common.h | 1 - examples/beam-search/beam-search.cpp | 3 +- examples/main/main.cpp | 9 - examples/perplexity/perplexity.cpp | 2 +- examples/simple/simple.cpp | 6 +- ggml.c | 6 +- llama.cpp | 466 ++++++++++++++++----------- llama.h | 34 +- tests/test-tokenizer-1-llama.cpp | 18 +- 10 files changed, 315 insertions(+), 236 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 9969cb97d3c2a..8fbff1da783b1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -436,8 +436,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.use_mmap = false; } else if (arg == "--numa") { params.numa = true; - } else if (arg == "--export") { - params.export_cgraph = true; } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -685,7 +683,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" Not recommended since this is both slower and uses more VRAM.\n"); #endif // GGML_USE_CUBLAS #endif - printf(" --export export the computation graph to 'llama.ggml'\n"); printf(" --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); @@ -782,7 +779,7 @@ std::tuple llama_init_from_gpt_par { LOG("warming up the model with an empty run\n"); - const std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; + std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; llama_eval(lctx, tmp.data(), 
std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads); llama_reset_timings(lctx); } @@ -1182,7 +1179,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); - fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false"); fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty); dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); diff --git a/common/common.h b/common/common.h index 37d15415f15dd..504e5944e9c37 100644 --- a/common/common.h +++ b/common/common.h @@ -111,7 +111,6 @@ struct gpt_params { bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool numa = false; // attempt optimizations that help on some NUMA systems - bool export_cgraph = false; // export the computation graph bool verbose_prompt = false; // print prompt tokens before generation }; diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index 6b31aea78823e..37c9f81a9297d 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -158,7 +158,8 @@ int main(int argc, char ** argv) } std::cout << std::flush; - int n_past = llama_get_kv_cache_token_count(ctx); + int n_past = 0; + if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads)) { fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ ); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index a8179f1bf011f..b9d26bc758696 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -198,15 +198,6 @@ int main(int argc, char ** argv) { params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } - // export the cgraph and exit - if (params.export_cgraph) { - llama_eval_export(ctx, "llama.ggml"); - llama_free(ctx); - llama_free_model(model); - - return 0; - } - std::string path_session = params.path_prompt_cache; std::vector session_tokens; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 3a1c8c28da09b..2b7472dcc3bb5 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -400,7 +400,7 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) { return {tokens, ppl, logit_history, prob_history}; } -std::vector hellaswag_evaluate_tokens(llama_context * ctx, const std::vector& tokens, int n_past, int n_batch, +std::vector hellaswag_evaluate_tokens(llama_context * ctx, const std::vector & tokens, int n_past, int n_batch, int n_vocab, int n_thread) { std::vector result; result.reserve(tokens.size() * n_vocab); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index ba5de0cc61e54..9f160376a4148 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -73,10 +73,12 @@ int main(int argc, char ** argv) { const int n_gen = std::min(32, max_context_size); - while (llama_get_kv_cache_token_count(ctx) < n_gen) { + int n_cur = 0; + + while (n_cur < n_gen) { // evaluate the transformer - if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), 
params.n_threads)) { + if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), n_cur, params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return 1; } diff --git a/ggml.c b/ggml.c index e4faafee6d115..2075617946a6c 100644 --- a/ggml.c +++ b/ggml.c @@ -12462,13 +12462,11 @@ static void ggml_compute_forward_alibi_f16( return; } - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_head = ((int32_t *) dst->op_params)[1]; float max_bias; memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); - assert(n_past >= 0); - const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past const int ne2 = src0->ne[2]; // n_head -> this is k @@ -12483,7 +12481,7 @@ static void ggml_compute_forward_alibi_f16( //const int nb3 = src0->nb[3]; GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; + //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) diff --git a/llama.cpp b/llama.cpp index 9d41689f78a3f..532937da8f56c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -71,6 +71,7 @@ #include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -975,7 +976,25 @@ struct llama_layer { struct ggml_tensor * w3; // ffn_up }; +struct llama_kv_cell { + llama_pos pos = -1; + + std::set seq_id; + + bool has_seq_id(const llama_seq_id & id) const { + return seq_id.find(id) != seq_id.end(); + } +}; + +// ring-buffer of cached KV data struct llama_kv_cache { + bool is_roped = false; + + uint32_t head = 0; + uint32_t size = 0; + + std::vector cells; + struct ggml_tensor * k = NULL; struct ggml_tensor * v = NULL; @@ -983,8 +1002,6 @@ struct llama_kv_cache { llama_buffer buf; - int n; // number of tokens currently in the cache - ~llama_kv_cache() { if (ctx) { ggml_free(ctx); @@ -1167,16 +1184,21 @@ static bool llama_kv_cache_init( const struct llama_hparams & hparams, struct llama_kv_cache & cache, ggml_type wtype, - int n_ctx, int n_gpu_layers) { - const int n_embd = hparams.n_embd_gqa(); - const int n_layer = hparams.n_layer; + const uint32_t n_embd = hparams.n_embd_gqa(); + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_ctx = hparams.n_ctx; const int64_t n_mem = n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; + cache.head = 0; + cache.size = n_ctx; + + cache.cells.clear(); + cache.cells.resize(n_ctx); + cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); - cache.n = 0; struct ggml_init_params params; params.mem_size = cache.buf.size; @@ -1208,6 +1230,68 @@ static bool llama_kv_cache_init( return true; } +// find an empty slot of size "n_tokens" in the cache +// updates the cache head +static bool llama_kv_cache_find_slot( + struct llama_kv_cache & cache, + struct llama_batch & batch) { + const uint32_t n_ctx = cache.size; + const uint32_t n_tokens = batch.n_tokens; + + if (n_tokens > n_ctx) { + LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx); + return false; + } + + uint32_t n_tested = 0; + + while (true) { + if (cache.head + n_tokens > n_ctx) { + cache.head = 0; + n_tested += n_ctx - cache.head; + continue; + } + + bool found = true; + for (uint32_t i = 0; i < n_tokens; i++) { + if (cache.cells[cache.head + i].pos >= 0) { + found = false; + cache.head += i + 1; + n_tested += i + 1; + break; + } + } + + if (found) { + break; + } + + if (n_tested >= n_ctx) { + 
LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); + return false; + } + } + + for (uint32_t i = 0; i < n_tokens; i++) { + cache.cells[cache.head + i].pos = batch.pos[i]; + cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]); + } + + return true; +} + +void llama_kv_cache_clear(struct llama_kv_cache & cache, int32_t p0, int32_t p1) { + cache.head = p0; + + if (p0 < 0) p0 = 0; + if (p1 < 0) p1 = cache.size; + + for (int32_t i = p0; i < p1; ++i) { + cache.cells[i].pos = -1; + cache.cells[i].seq_id.clear(); + } +} + // // model loading and saving // @@ -2308,15 +2392,7 @@ static bool llama_model_load( static struct ggml_cgraph * llm_build_llama( llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { - - GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT - - const int N = n_tokens; - + llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -2340,6 +2416,8 @@ static struct ggml_cgraph * llm_build_llama( const int n_gpu_layers = model.n_gpu_layers; + const int32_t n_tokens = batch.n_tokens; + auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { @@ -2357,12 +2435,12 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (tokens) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + if (batch.token) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_allocr_alloc(lctx.alloc, inp_tokens); if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); } ggml_set_name(inp_tokens, "inp_tokens"); @@ -2372,11 +2450,11 @@ static struct ggml_cgraph * llm_build_llama( GGML_ASSERT(false && "not implemented"); #endif - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); ggml_allocr_alloc(lctx.alloc, inpL); if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); } } @@ -2408,33 +2486,35 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); } ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - // KQ_mask - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, N, 1); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_tokens, 1); ggml_allocr_alloc(lctx.alloc, KQ_mask); if (!ggml_allocr_is_measure(lctx.alloc)) { float * data = (float *) KQ_mask->data; memset(data, 0, ggml_nbytes(KQ_mask)); for (int h = 0; h < 1; ++h) { - for (int j = 0; j < N; ++j) { - for (int i = n_past + j + 1; i < n_past + N; ++i) { - data[h*(n_past + N)*N + j*(n_past + N) + i] = -INFINITY; + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_ctx; ++i) { + if (!kv_self.cells[i].has_seq_id(batch.seq_id[j]) || kv_self.cells[i].pos > batch.pos[j]) { + data[h*(n_ctx*n_tokens) + j*n_ctx + i] = -INFINITY; + } } } } } // KQ_pos - 
contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_allocr_alloc(lctx.alloc, KQ_pos); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; } } @@ -2474,33 +2554,33 @@ static struct ggml_cgraph * llm_build_llama( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); // store key and value to memory { - // compute the transposed [N, n_embd] V matrix + // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); offload_func_v(tmpv); ggml_set_name(tmpv, "tmpv"); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_self.head)); offload_func_kq(k); ggml_set_name(k, "k"); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_self.head*ggml_element_size(kv_self.v)); offload_func_v(v); ggml_set_name(v, "v"); @@ -2515,7 +2595,7 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_past + N, n_head_kv, + n_embd_head, n_ctx, n_head_kv, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); @@ -2528,14 +2608,13 @@ static struct ggml_cgraph * llm_build_llama( ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] + // KQ_scaled shape [n_ctx, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); offload_func_kq(KQ_masked); ggml_set_name(KQ_masked, "KQ_masked"); @@ -2547,7 +2626,7 @@ static struct 
ggml_cgraph * llm_build_llama( // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd_head, n_head_kv, + n_ctx, n_embd_head, n_head_kv, ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); @@ -2562,7 +2641,7 @@ static struct ggml_cgraph * llm_build_llama( // make V contiguous in memory to speed up the matmul, however we waste time on the copy // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation // is there a better way? - struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); + struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head)); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); #endif @@ -2571,10 +2650,10 @@ static struct ggml_cgraph * llm_build_llama( offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); - // cur = KQV_merged.contiguous().view(n_embd, N) + // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cpy(ctx0, KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens)); offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); @@ -2665,18 +2744,9 @@ static struct ggml_cgraph * llm_build_llama( return gf; } - static struct ggml_cgraph * llm_build_baichaun( llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { - - GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT - - const int N = n_tokens; - + llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -2700,6 +2770,8 @@ static struct ggml_cgraph * llm_build_baichaun( const int n_gpu_layers = model.n_gpu_layers; + const int32_t n_tokens = batch.n_tokens; + auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { @@ -2717,12 +2789,12 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (tokens) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + if (batch.token) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_allocr_alloc(lctx.alloc, inp_tokens); if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); } ggml_set_name(inp_tokens, "inp_tokens"); @@ -2732,11 +2804,11 @@ static struct ggml_cgraph * llm_build_baichaun( GGML_ASSERT(false && "not implemented"); #endif - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); ggml_allocr_alloc(lctx.alloc, inpL); if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); } } @@ -2772,29 +2844,31 @@ static struct ggml_cgraph * llm_build_baichaun( } ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - // KQ_mask - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, N, 1); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_tokens, 1); 
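        // How the unified-cache mask is built (sketch; kv_cell_visible below is an illustrative
        // helper, not a function that exists in llama.cpp): instead of masking a triangular
        // [n_past + N, N] block, each batch token j is checked against every cell i of the KV
        // ring buffer, and the cell is hidden unless it belongs to the same sequence and is not
        // in token j's future:
        //
        //     static bool kv_cell_visible(const llama_kv_cell & cell, llama_seq_id seq_id, llama_pos pos) {
        //         return cell.has_seq_id(seq_id) && cell.pos <= pos; // per-sequence causal attention
        //     }
        //
        // The loops below write -INFINITY into KQ_mask for every (i, j) pair that fails this check.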
ggml_allocr_alloc(lctx.alloc, KQ_mask); if (!ggml_allocr_is_measure(lctx.alloc)) { float * data = (float *) KQ_mask->data; memset(data, 0, ggml_nbytes(KQ_mask)); for (int h = 0; h < 1; ++h) { - for (int j = 0; j < N; ++j) { - for (int i = n_past + j + 1; i < n_past + N; ++i) { - data[h*(n_past + N)*N + j*(n_past + N) + i] = -INFINITY; + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_ctx; ++i) { + if (!kv_self.cells[i].has_seq_id(batch.seq_id[j]) || kv_self.cells[i].pos > batch.pos[j]) { + data[h*(n_ctx*n_tokens) + j*n_ctx + i] = -INFINITY; + } } } } } // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_allocr_alloc(lctx.alloc, KQ_pos); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; } } @@ -2838,12 +2912,12 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * Qcur; switch (model.type) { case MODEL_7B: - Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); break; case MODEL_13B: - Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N); - Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N); + Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens); + Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens); break; default: GGML_ASSERT(false); @@ -2857,23 +2931,23 @@ static struct ggml_cgraph * llm_build_baichaun( // store key and value to memory { - // compute the transposed [N, n_embd] V matrix + // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); offload_func_v(tmpv); ggml_set_name(tmpv, "tmpv"); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_self.head)); offload_func_kq(k); ggml_set_name(k, "k"); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_self.head*ggml_element_size(kv_self.v)); offload_func_v(v); ggml_set_name(v, "v"); @@ -2888,7 +2962,7 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_past + N, n_head_kv, + n_embd_head, 
n_ctx, n_head_kv, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); @@ -2901,7 +2975,7 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] + // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); @@ -2912,22 +2986,16 @@ static struct ggml_cgraph * llm_build_baichaun( switch (model.type) { case MODEL_7B: KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - //KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); break; case MODEL_13B: - KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8); + // TODO: replace with ggml_add() + KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - //KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); break; default: GGML_ASSERT(false); } - // KQ_masked = mask_past(KQ_scaled) - // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past); - // offload_func_kq(KQ_masked); - // ggml_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); @@ -2937,34 +3005,26 @@ static struct ggml_cgraph * llm_build_baichaun( // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd_head, n_head_kv, + n_ctx, n_embd_head, n_head_kv, ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); offload_func_v(V); ggml_set_name(V, "V"); -#if 1 struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); offload_func_v(KQV); ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? 
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); - // cur = KQV_merged.contiguous().view(n_embd, N) + // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cpy(ctx0, KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens)); offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); @@ -3057,15 +3117,7 @@ static struct ggml_cgraph * llm_build_baichaun( static struct ggml_cgraph * llm_build_falcon( llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { - - GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT - - const int N = n_tokens; - + llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -3089,6 +3141,8 @@ static struct ggml_cgraph * llm_build_falcon( const int n_gpu_layers = model.n_gpu_layers; + const int32_t n_tokens = batch.n_tokens; + auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { @@ -3106,12 +3160,12 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (tokens) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + if (batch.token) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_allocr_alloc(lctx.alloc, inp_tokens); if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); + memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); } ggml_set_name(inp_tokens, "inp_tokens"); @@ -3121,11 +3175,11 @@ static struct ggml_cgraph * llm_build_falcon( GGML_ASSERT(false && "not implemented"); #endif - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); ggml_allocr_alloc(lctx.alloc, inpL); if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); + memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); } } @@ -3161,29 +3215,31 @@ static struct ggml_cgraph * llm_build_falcon( } ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - // KQ_mask - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, N, 1); + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_tokens, 1); ggml_allocr_alloc(lctx.alloc, KQ_mask); if (!ggml_allocr_is_measure(lctx.alloc)) { float * data = (float *) KQ_mask->data; memset(data, 0, ggml_nbytes(KQ_mask)); for (int h = 0; h < 1; ++h) { - for (int j = 0; j < N; ++j) { - for (int i = n_past + j + 1; i < n_past + N; ++i) { - data[h*(n_past + N)*N + j*(n_past + N) + i] = -INFINITY; + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_ctx; ++i) { + if (!kv_self.cells[i].has_seq_id(batch.seq_id[j]) || kv_self.cells[i].pos > batch.pos[j]) { + data[h*(n_ctx*n_tokens) + j*n_ctx + i] = -INFINITY; + } } } } } // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + struct ggml_tensor * KQ_pos = 
ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_allocr_alloc(lctx.alloc, KQ_pos); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; } } @@ -3242,21 +3298,21 @@ static struct ggml_cgraph * llm_build_falcon( // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for // non-contiguous views is added for the rope operator struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head, N, + ctx0, cur, n_embd_head, n_head, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), 0)); offload_func_kq(tmpq); struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, N, + ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * n_head)); offload_func_kq(tmpk); struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, N, + ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * (n_head + n_head_kv)); @@ -3269,18 +3325,18 @@ static struct ggml_cgraph * llm_build_falcon( offload_func_kq(Kcur); { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N)); + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); offload_func_v(Vcur); offload_func_v(Vcur->src[0]->src[0]); ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_self.head)); offload_func_kq(k); ggml_set_name(k, "k"); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_self.head*ggml_element_size(kv_self.v)); offload_func_v(v); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -3293,7 +3349,7 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_past + N, n_head_kv, + n_embd_head, n_ctx, n_head_kv, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); @@ -3308,7 +3364,7 @@ static struct ggml_cgraph * llm_build_falcon( offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); offload_func_kq(KQ_masked); ggml_set_name(KQ_masked, "KQ_masked"); @@ -3318,7 +3374,7 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd_head, n_head_kv, + n_ctx, n_embd_head, n_head_kv, ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); @@ -3333,7 +3389,7 @@ static struct ggml_cgraph * llm_build_falcon( 
offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); - cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens)); offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); @@ -3391,10 +3447,7 @@ static struct ggml_cgraph * llm_build_falcon( static struct ggml_cgraph * llama_build_graph( llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past) { + llama_batch & batch) { const auto & model = lctx.model; struct ggml_cgraph * result = NULL; @@ -3402,15 +3455,15 @@ static struct ggml_cgraph * llama_build_graph( switch (model.arch) { case LLM_ARCH_LLAMA: { - result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past); + result = llm_build_llama(lctx, batch); } break; case LLM_ARCH_BAICHUAN: { - result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past); + result = llm_build_baichaun(lctx, batch); } break; case LLM_ARCH_FALCON: { - result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past); + result = llm_build_falcon(lctx, batch); } break; default: GGML_ASSERT(false); @@ -3422,52 +3475,48 @@ static struct ggml_cgraph * llama_build_graph( // evaluate the transformer // // - lctx: llama context -// - tokens: new batch of tokens to process -// - embd embeddings input -// - n_tokens number of tokens -// - n_past: the context size so far +// - batch: batch to evaluate // - n_threads: number of threads to use // static bool llama_eval_internal( llama_context & lctx, - const llama_token * tokens, - const float * embd, - int n_tokens, - int n_past, - int n_threads, - const char * cgraph_fname) { + llama_batch & batch, + int n_threads) { + const uint32_t n_tokens = batch.n_tokens; - GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT + if (n_tokens == 0) { + LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); + return false; + } - GGML_ASSERT(n_tokens > 0); - GGML_ASSERT(n_past >= 0); - // TODO: keep the values of n_batch and n_ctx - // GGML_ASSERT(n_tokens <= n_batch); - // GGML_ASSERT(n_past + n_tokens <= n_ctx); + GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT const int64_t t_start_us = ggml_time_us(); #ifdef GGML_USE_MPI + // TODO: needs fix after #3228 ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); #endif GGML_ASSERT(n_threads > 0); - const int N = n_tokens; - const auto & model = lctx.model; const auto & hparams = model.hparams; - const auto & kv_self = lctx.kv_self; + auto & kv_self = lctx.kv_self; GGML_ASSERT(!!kv_self.ctx); const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; + if (!llama_kv_cache_find_slot(kv_self, batch)) { + return false; + } + ggml_allocr_reset(lctx.alloc); - ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past); + ggml_cgraph * gf = llama_build_graph(lctx, batch); ggml_allocr_alloc_graph(lctx.alloc, gf); @@ -3494,7 +3543,7 @@ static bool llama_eval_internal( // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering // with the BLAS calls. 
need a better solution - if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { + if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { n_threads = std::min(4, n_threads); } @@ -3524,12 +3573,8 @@ static bool llama_eval_internal( ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); #endif - // update kv token count - lctx.kv_self.n = n_past + N; - - if (cgraph_fname) { - ggml_graph_export(gf, cgraph_fname); - } + // update the kv ring buffer head + lctx.kv_self.head += n_tokens; #ifdef GGML_PERF // print timing information per ggml operation (for debugging purposes) @@ -3547,12 +3592,12 @@ static bool llama_eval_internal( auto & logits_out = lctx.logits; if (lctx.logits_all) { - logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); + logits_out.resize(n_vocab * n_tokens); + memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); } else { // return result for just the last token logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); } } @@ -3561,17 +3606,17 @@ static bool llama_eval_internal( auto & embedding_out = lctx.embedding; embedding_out.resize(n_embd); - memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); + memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd); } // measure the performance only for the single-token evals - if (N == 1) { + if (n_tokens == 1) { lctx.t_eval_us += ggml_time_us() - t_start_us; lctx.n_eval++; } - else if (N > 1) { + else if (n_tokens > 1) { lctx.t_p_eval_us += ggml_time_us() - t_start_us; - lctx.n_p_eval += N; + lctx.n_p_eval += n_tokens; } return true; @@ -6043,12 +6088,16 @@ struct llama_context * llama_new_context_with_model( // reserve memory for context buffers if (!params.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, params.n_gpu_layers)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; } + if (model->arch == LLM_ARCH_LLAMA) { + ctx->kv_self.is_roped = true; + } + { const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); @@ -6076,10 +6125,11 @@ struct llama_context * llama_new_context_with_model( ctx->alloc = ggml_allocr_new_measure(tensor_alignment); // build worst-case graph - int n_tokens = std::min((int)hparams.n_ctx, params.n_batch); - int n_past = hparams.n_ctx - n_tokens; + uint32_t n_tokens = std::min((int)hparams.n_ctx, params.n_batch); llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past); + llama_batch batch = { n_tokens, &token, nullptr, nullptr, nullptr }; + ggml_cgraph * gf = llama_build_graph(*ctx, batch); + #ifdef GGML_USE_METAL if (params.n_gpu_layers > 0) { ctx->ctx_metal = ggml_metal_init(1); @@ -6279,7 +6329,11 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const 
cha } int llama_get_kv_cache_token_count(const struct llama_context * ctx) { - return ctx->kv_self.n; + return ctx->kv_self.head; +} + +void llama_kv_clear(struct llama_context * ctx, int32_t p0, int32_t p1) { + llama_kv_cache_clear(ctx->kv_self, p0, p1); } #define LLAMA_MAX_RNG_STATE (64*1024) @@ -6376,6 +6430,16 @@ struct llama_data_file_context : llama_data_context { * */ void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { + // TODO: does not support multi-sequence states + { + const auto & kv_self = ctx->kv_self; + for (uint32_t i = 0; i < kv_self.head; ++i) { + GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i); + GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1); + GGML_ASSERT(kv_self.cells[i].has_seq_id(0)); + } + } + // copy rng { std::stringstream rng_ss; @@ -6431,7 +6495,7 @@ void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_conte const int n_ctx = hparams.n_ctx; const size_t kv_size = kv_self.buf.size; - const int kv_ntok = llama_get_kv_cache_token_count(ctx); + const int kv_ntok = kv_self.head; data_ctx->write(&kv_size, sizeof(kv_size)); data_ctx->write(&kv_ntok, sizeof(kv_ntok)); @@ -6575,7 +6639,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_free(cpy_ctx); } - ctx->kv_self.n = kv_ntok; + ctx->kv_self.head = kv_ntok; } const size_t nread = inp - src; @@ -6671,10 +6735,24 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi int llama_eval( struct llama_context * ctx, const llama_token * tokens, - int n_tokens, + uint32_t n_tokens, int n_past, int n_threads) { - if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) { + std::vector pos(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + pos[i] = n_past + i; + } + + std::vector seq_id(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + seq_id[i] = 0; + } + + llama_batch batch = { n_tokens, tokens, nullptr, pos.data(), seq_id.data(), }; + + llama_kv_cache_clear(ctx->kv_self, n_past, -1); + + if (!llama_eval_internal(*ctx, batch, n_threads)) { LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); return 1; } @@ -6692,10 +6770,22 @@ int llama_eval( int llama_eval_embd( struct llama_context * ctx, const float * embd, - int n_tokens, + uint32_t n_tokens, int n_past, int n_threads) { - if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) { + std::vector pos(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + pos[i] = n_past + i; + } + + std::vector seq_id(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + seq_id[i] = 0; + } + + llama_batch batch = { n_tokens, nullptr, embd, pos.data(), seq_id.data(), }; + + if (!llama_eval_internal(*ctx, batch, n_threads)) { LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); return 1; } @@ -6710,20 +6800,6 @@ int llama_eval_embd( return 0; } -int llama_eval_export(struct llama_context * ctx, const char * fname) { - const int n_batch = 1; - const int n_ctx = 512 - n_batch; - - const std::vector tmp(n_batch, llama_token_bos(ctx)); - - if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) { - LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); - return 1; - } - - return 0; -} - float * llama_get_logits(struct llama_context * ctx) { return ctx->logits.data(); } diff --git a/llama.h b/llama.h index 37975bebed22e..043b62e10f106 100644 --- a/llama.h +++ b/llama.h @@ -60,7 +60,20 @@ extern "C" { struct llama_model; struct llama_context; - typedef int llama_token; + 
typedef int32_t llama_pos; + typedef int32_t llama_token; + typedef int32_t llama_seq_id; + + // data used for batch inference + typedef struct llama_batch { + uint32_t n_tokens; + + // TODO: not sure about these consts - might just get in the way all the time with no benefit + const llama_token * token; + const float * embd; + const llama_pos * pos; + const llama_seq_id * seq_id; + } llama_seq; enum llama_log_level { LLAMA_LOG_LEVEL_ERROR = 2, @@ -289,8 +302,15 @@ extern "C" { const char * path_base_model, int n_threads); + // + // KV cache API + // + // Returns the number of tokens in the KV cache - LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); + LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx), + "avoid using this, it will be removed in the future"); + + LLAMA_API void llama_kv_clear(struct llama_context * ctx, int32_t p0, int32_t p1); // Sets the current rng seed. LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); @@ -319,7 +339,7 @@ extern "C" { LLAMA_API int llama_eval( struct llama_context * ctx, const llama_token * tokens, - int n_tokens, + uint32_t n_tokens, int n_past, int n_threads); @@ -327,16 +347,10 @@ extern "C" { LLAMA_API int llama_eval_embd( struct llama_context * ctx, const float * embd, - int n_tokens, + uint32_t n_tokens, int n_past, int n_threads); - // Export a static computation graph for context of 511 and batch size of 1 - // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these - // parameters here to keep things simple - // IMPORTANT: do not use for anything else other than debugging and testing! - LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname); - // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row // Can be mutated in order to change the probabilities of the next token diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp index ab3d822f2b1cd..24c42037282c6 100644 --- a/tests/test-tokenizer-1-llama.cpp +++ b/tests/test-tokenizer-1-llama.cpp @@ -87,10 +87,11 @@ int main(int argc, char **argv) { std::vector tokens = llama_tokenize(ctx, str, false); std::string check = llama_detokenize_spm(ctx, tokens); if (check != str) { - fprintf(stderr, "%s : error: token %d detokenizes to >%s<(%llu) but tokenization of this detokenizes to >%s<(%llu)\n", - __func__, i, str.c_str(), str.length(), check.c_str(), check.length()); - if(i != 3) + fprintf(stderr, "%s : error: token %d detokenizes to >%s<(%d) but tokenization of this detokenizes to >%s<(%d)\n", + __func__, i, str.c_str(), (int) str.length(), check.c_str(), (int) check.length()); + if (i != 3) { return 2; + } } } @@ -100,10 +101,11 @@ int main(int argc, char **argv) { std::vector tokens = llama_tokenize(ctx, str, false); std::string check = llama_detokenize_spm(ctx, tokens); if (str != check) { - fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); - if(cp != 0 && cp != 9601) + fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%d) instead of >%s<(%d)\n", + __func__, cp, check.c_str(), (int) check.length(), str.c_str(), (int) str.length()); + if (cp != 0 && cp != 9601) { return 3; + } } } } @@ -112,8 +114,8 @@ int main(int argc, char **argv) { std::vector tokens = llama_tokenize(ctx, str, false); std::string check = 
llama_detokenize_spm(ctx, tokens); if (str != check) { - fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n", - __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); + fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%d) instead of >%s<(%d)\n", + __func__, cp, check.c_str(), (int) check.length(), str.c_str(), (int) str.length()); return 4; } } From 9f42e75489e38d09792ccc169f2eb25a4387afdd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Sep 2023 14:23:52 +0300 Subject: [PATCH 06/55] llama : add new llama_decode() API that works with llama_batch --- common/common.cpp | 2 +- examples/beam-search/beam-search.cpp | 2 +- examples/embd-input/embd-input-lib.cpp | 5 +- examples/embedding/embedding.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 4 +- examples/main/main.cpp | 4 +- examples/perplexity/perplexity.cpp | 6 +- examples/save-load-state/save-load-state.cpp | 16 +-- examples/server/server.cpp | 2 +- examples/simple/simple.cpp | 2 +- examples/speculative/speculative.cpp | 12 +- llama.cpp | 119 ++++++++++++------- llama.h | 45 +++++-- 13 files changed, 146 insertions(+), 75 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index b8d306ae22511..b638efe9ebae8 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -780,7 +780,7 @@ std::tuple llama_init_from_gpt_par LOG("warming up the model with an empty run\n"); std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; - llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads); + llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0), params.n_threads); llama_reset_timings(lctx); } diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index 2e0481ad6e2f5..63da7c3ec02a5 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -160,7 +160,7 @@ int main(int argc, char ** argv) int n_past = 0; - if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads)) + if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0), params.n_threads)) { fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ ); return 1; diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index fc6e44eb20680..ed0966a511f0b 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -79,7 +79,8 @@ bool eval_float(void * model, float * input, int N){ if (n_eval > n_batch) { n_eval = n_batch; } - if (llama_eval_embd(ctx, (input+i*n_emb), n_eval, n_past, params.n_threads)) { + llama_batch batch = { uint32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, n_past, 1, 0, false }; + if (llama_decode(ctx, batch, params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; } @@ -100,7 +101,7 @@ bool eval_tokens(void * model, std::vector tokens) { if (n_eval > params.n_batch) { n_eval = params.n_batch; } - if (llama_eval(ctx, &tokens[i], n_eval, n_past, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0), params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; } diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 0788f362c7460..54a156b28a009 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -77,7 +77,7 
@@ int main(int argc, char ** argv) { while (!embd_inp.empty()) { int n_tokens = std::min(params.n_batch, (int) embd_inp.size()); - if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0), params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return 1; } diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 34ddfde39d295..2551f84224cd6 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -891,7 +891,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat int n_processed = 0; while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); - llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads); + llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0), n_threads); n_processed += n_tokens; } } @@ -899,7 +899,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { llama_token token = llama_token_bos(ctx); for (int i = 0; i < n_gen; i++) { - llama_eval(ctx, &token, 1, n_past + i, n_threads); + llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0), n_threads); } } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 19cbbb2a12440..3e78fdaa05459 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -571,7 +571,7 @@ int main(int argc, char ** argv) { for (int i = 0; i < input_size; i += params.n_batch) { int n_eval = std::min(input_size - i, params.n_batch); - if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) { + if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0), params.n_threads)) { LOG_TEE("%s : failed to eval\n", __func__); return 1; } @@ -588,7 +588,7 @@ int main(int argc, char ** argv) { LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); - if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0), params.n_threads)) { LOG_TEE("%s : failed to eval\n", __func__); return 1; } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 4958cdfb9a392..2a046d55e8c3c 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -199,7 +199,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const int batch_size = std::min(end - batch_start, n_batch); //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); - if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0), params.n_threads)) { //fprintf(stderr, "%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } @@ -331,7 +331,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par tokens[batch_start] = llama_token_bos(ctx); } - if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0), params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", 
__func__); return {tokens, -1, logit_history, prob_history}; } @@ -409,7 +409,7 @@ static std::vector hellaswag_evaluate_tokens( for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) { size_t n_tokens = tokens.size() - i_chunk * n_batch; n_tokens = std::min(n_tokens, size_t(n_batch)); - if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) { + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0), n_thread)) { fprintf(stderr, "%s : failed to eval\n", __func__); return {}; } diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index eac307904fbc1..5e1a097be2477 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -34,11 +34,11 @@ int main(int argc, char ** argv) { auto last_n_tokens_data = std::vector(params.repeat_last_n, 0); // init - auto model = llama_load_model_from_file(params.model.c_str(), lparams); + auto * model = llama_load_model_from_file(params.model.c_str(), lparams); if (model == nullptr) { return 1; } - auto ctx = llama_new_context_with_model(model, lparams); + auto * ctx = llama_new_context_with_model(model, lparams); if (ctx == nullptr) { llama_free_model(model); return 1; @@ -53,7 +53,7 @@ int main(int argc, char ** argv) { } // evaluate prompt - llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads); + llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0), params.n_threads); last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens); n_past += n_prompt_tokens; @@ -77,7 +77,7 @@ int main(int argc, char ** argv) { printf("\n%s", params.prompt.c_str()); for (auto i = 0; i < params.n_predict; i++) { - auto logits = llama_get_logits(ctx); + auto * logits = llama_get_logits(ctx); auto n_vocab = llama_n_vocab(ctx); std::vector candidates; candidates.reserve(n_vocab); @@ -90,7 +90,7 @@ int main(int argc, char ** argv) { last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); - if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0), params.n_threads)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); llama_free(ctx); llama_free_model(model); @@ -105,7 +105,7 @@ int main(int argc, char ** argv) { llama_free(ctx); // make new context - auto ctx2 = llama_new_context_with_model(model, lparams); + auto * ctx2 = llama_new_context_with_model(model, lparams); // Load state (rng, logits, embedding and kv_cache) from file { @@ -137,7 +137,7 @@ int main(int argc, char ** argv) { // second run for (auto i = 0; i < params.n_predict; i++) { - auto logits = llama_get_logits(ctx2); + auto * logits = llama_get_logits(ctx2); auto n_vocab = llama_n_vocab(ctx2); std::vector candidates; candidates.reserve(n_vocab); @@ -150,7 +150,7 @@ int main(int argc, char ** argv) { last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); - if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0), params.n_threads)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); llama_free(ctx2); llama_free_model(model); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1bb8e92c0f95e..6c81bd618d5e5 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -434,7 +434,7 @@ struct 
llama_server_context { n_eval = params.n_batch; } - if (llama_eval(ctx, &embd[n_past], n_eval, n_past, params.n_threads)) + if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0), params.n_threads)) { LOG_ERROR("failed to eval", { {"n_eval", n_eval}, diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 37eaf3b2cd0ad..33ef0770b2f87 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -76,7 +76,7 @@ int main(int argc, char ** argv) { while (n_cur < n_gen) { // evaluate the transformer - if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), n_cur, params.n_threads)) { + if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), int(tokens_list.size()), n_cur, 0), params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return 1; } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index aa904183fa2d8..06173393ccedc 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -70,9 +70,9 @@ int main(int argc, char ** argv) { const auto t_enc_start = ggml_time_us(); // eval the prompt with both models - llama_eval(ctx_tgt, inp.data(), int(inp.size() - 1), 0, params.n_threads); - llama_eval(ctx_tgt, &inp.back(), 1, inp.size() - 1, params.n_threads); - llama_eval(ctx_dft, inp.data(), int(inp.size()), 0, params.n_threads); + llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0), params.n_threads); + llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0), params.n_threads); + llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0), params.n_threads); const auto t_enc_end = ggml_time_us(); @@ -172,7 +172,7 @@ int main(int argc, char ** argv) { LOG("out of drafted tokens\n"); } - llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads); + llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0), params.n_threads); ++n_past_dft; // heuristic for n_draft @@ -256,7 +256,7 @@ int main(int argc, char ** argv) { } // evaluate the drafted token on the draft model - llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads); + llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0), params.n_threads); ++n_past_cur; if (grammar_dft != NULL) { @@ -265,7 +265,7 @@ int main(int argc, char ** argv) { } // evaluate the target model on the drafted tokens - llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads); + llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0), params.n_threads); ++n_past_tgt; // the first token is always proposed by the traget model before the speculation loop diff --git a/llama.cpp b/llama.cpp index 0e1c8755c7f0f..601f557ef4e54 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1265,7 +1265,7 @@ static bool llama_kv_cache_init( // updates the cache head static bool llama_kv_cache_find_slot( struct llama_kv_cache & cache, - struct llama_batch & batch) { + const struct llama_batch & batch) { const uint32_t n_ctx = cache.size; const uint32_t n_tokens = batch.n_tokens; @@ -2522,7 +2522,7 @@ static bool llama_model_load( static struct ggml_cgraph * llm_build_llama( llama_context & lctx, - llama_batch & batch) { + const llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -2876,7 +2876,7 @@ static struct ggml_cgraph * llm_build_llama( static struct ggml_cgraph * llm_build_baichaun( llama_context & lctx, - llama_batch & batch) { + const 
llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -3247,7 +3247,7 @@ static struct ggml_cgraph * llm_build_baichaun( static struct ggml_cgraph * llm_build_falcon( llama_context & lctx, - llama_batch & batch) { + const llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -3577,7 +3577,7 @@ static struct ggml_cgraph * llm_build_falcon( static struct ggml_cgraph * llm_build_starcoder( llama_context & lctx, - llama_batch & batch) { + const llama_batch & batch) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -3819,7 +3819,7 @@ static struct ggml_cgraph * llm_build_starcoder( static struct ggml_cgraph * llama_build_graph( llama_context & lctx, - llama_batch & batch) { + const llama_batch & batch) { const auto & model = lctx.model; struct ggml_cgraph * result = NULL; @@ -3856,7 +3856,7 @@ static struct ggml_cgraph * llama_build_graph( // static bool llama_eval_internal( llama_context & lctx, - llama_batch & batch, + llama_batch batch, int n_threads) { const uint32_t n_tokens = batch.n_tokens; @@ -3886,6 +3886,31 @@ static bool llama_eval_internal( const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; + std::vector pos; + std::vector seq_id; + + if (batch.pos == nullptr) { + pos.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + pos[i] = batch.all_pos_0 + i*batch.all_pos_1; + } + + batch.pos = pos.data(); + } + + if (batch.seq_id == nullptr) { + seq_id.resize(n_tokens); + for (uint32_t i = 0; i < n_tokens; i++) { + seq_id[i] = batch.all_seq_id; + } + + batch.seq_id = seq_id.data(); + } + + if (batch.clear_kv) { + llama_kv_cache_clear(kv_self, 0, -1); + } + if (!llama_kv_cache_find_slot(kv_self, batch)) { return false; } @@ -4820,6 +4845,13 @@ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) // sampling // +void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) { + if (seed == LLAMA_DEFAULT_SEED) { + seed = time(NULL); + } + ctx->rng.seed(seed); +} + void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) { GGML_ASSERT(candidates->size > 0); @@ -5469,7 +5501,7 @@ struct llama_beam_search_data { } else { // beam is not at end-of-sentence, so branch with next top_k tokens. if (!beam.tokens.empty()) { - llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads); + llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0), n_threads); } llama_logit_info logit_info(ctx); std::vector next_tokens = logit_info.top_k(n_beams); @@ -5543,7 +5575,7 @@ struct llama_beam_search_data { callback(callback_data, get_beams_state(false)); // Sets common_prefix_length update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed. if (common_prefix_length) { - llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads); + llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0), n_threads); n_past += common_prefix_length; } // Zero-out next_beam probabilities to place them last in following min-heap. 
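[The mechanical change repeated across the examples above boils down to one pattern. A rough caller-side sketch, not part of the patch; `ctx`, `tokens`, `n_past` and `params.n_threads` stand for whatever state the calling program already has:]

    // single sequence, positions n_past, n_past+1, ... - the common case:
    //   old: llama_eval(ctx, tokens.data(), tokens.size(), n_past, params.n_threads)
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0), params.n_threads)) {
        fprintf(stderr, "failed to decode\n");
    }

    // callers that need explicit per-token positions / sequence ids can fill the
    // batch fields directly instead of using the all_pos_0/all_pos_1/all_seq_id helpers:
    const uint32_t n_tokens = tokens.size();

    std::vector<llama_pos>    pos   (n_tokens);
    std::vector<llama_seq_id> seq_id(n_tokens);
    for (uint32_t i = 0; i < n_tokens; ++i) {
        pos[i]    = n_past + i;
        seq_id[i] = 0;
    }

    llama_batch batch = {
        n_tokens,
        tokens.data(),     // token
        nullptr,           // embd
        pos.data(),        // pos
        seq_id.data(),     // seq_id
        0, 1, 0,           // all_pos_0, all_pos_1, all_seq_id (unused when pos/seq_id are set)
        false,             // clear_kv
    };

    if (llama_decode(ctx, batch, params.n_threads)) {
        fprintf(stderr, "failed to decode\n");
    }

[Field order follows the llama_batch definition introduced in this patch.]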
@@ -6505,8 +6537,7 @@ struct llama_context * llama_new_context_with_model( // build worst-case graph uint32_t n_tokens = std::min((int)hparams.n_ctx, params.n_batch); llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - llama_batch batch = { n_tokens, &token, nullptr, nullptr, nullptr }; - ggml_cgraph * gf = llama_build_graph(*ctx, batch); + ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, 0, 0)); #ifdef GGML_USE_METAL if (params.n_gpu_layers > 0) { @@ -6714,15 +6745,6 @@ void llama_kv_clear(struct llama_context * ctx, int32_t p0, int32_t p1) { llama_kv_cache_clear(ctx->kv_self, p0, p1); } -#define LLAMA_MAX_RNG_STATE (64*1024) - -void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) { - if (seed == LLAMA_DEFAULT_SEED) { - seed = time(NULL); - } - ctx->rng.seed(seed); -} - // Returns the *maximum* size of the state size_t llama_get_state_size(const struct llama_context * ctx) { // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. @@ -7116,21 +7138,9 @@ int llama_eval( uint32_t n_tokens, int n_past, int n_threads) { - std::vector pos(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - pos[i] = n_past + i; - } - - std::vector seq_id(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - seq_id[i] = 0; - } - - llama_batch batch = { n_tokens, tokens, nullptr, pos.data(), seq_id.data(), }; - llama_kv_cache_clear(ctx->kv_self, n_past, -1); - if (!llama_eval_internal(*ctx, batch, n_threads)) { + if (!llama_eval_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0), n_threads)) { LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); return 1; } @@ -7151,18 +7161,47 @@ int llama_eval_embd( uint32_t n_tokens, int n_past, int n_threads) { - std::vector pos(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - pos[i] = n_past + i; + llama_kv_cache_clear(ctx->kv_self, n_past, -1); + + llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, n_past, 1, 0, n_past == 0, }; + + if (!llama_eval_internal(*ctx, batch, n_threads)) { + LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); + return 1; } - std::vector seq_id(n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - seq_id[i] = 0; + // get a more accurate load time, upon first eval + // TODO: fix this + if (!ctx->has_evaluated_once) { + ctx->t_load_us = ggml_time_us() - ctx->t_start_us; + ctx->has_evaluated_once = true; } - llama_batch batch = { n_tokens, nullptr, embd, pos.data(), seq_id.data(), }; + return 0; +} +struct llama_batch llama_batch_get_one( + const llama_token * tokens, + uint32_t n_tokens, + llama_pos pos_0, + llama_seq_id seq_id) { + return { + /*n_tokens =*/ n_tokens, + /*tokens =*/ tokens, + /*embd =*/ nullptr, + /*pos =*/ nullptr, + /*seq_id =*/ nullptr, + /*all_pos_0 =*/ pos_0, + /*all_pos_1 =*/ 1, + /*all_seq_id =*/ seq_id, + /*clear_kv =*/ pos_0 == 0, + }; +} + +int llama_decode( + struct llama_context * ctx, + struct llama_batch batch, + int n_threads) { if (!llama_eval_internal(*ctx, batch, n_threads)) { LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); return 1; diff --git a/llama.h b/llama.h index 0af9c10896ed0..b844e172bb9e6 100644 --- a/llama.h +++ b/llama.h @@ -37,6 +37,8 @@ #define LLAMA_DEFAULT_SEED 0xFFFFFFFF +#define LLAMA_MAX_RNG_STATE (64*1024) + #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN @@ -70,9 +72,20 @@ extern "C" { // 
TODO: not sure about these consts - might just get in the way all the time with no benefit const llama_token * token; - const float * embd; + const float * embd; const llama_pos * pos; const llama_seq_id * seq_id; + + // NOTE: helpers for smooth API transition - can be deprecated in the future + // for future-proof code, use the above fields instead and ignore everything below + // + // pos[i] = all_pos_0 + i*all_pos_1 + // + llama_pos all_pos_0; // used if pos == NULL + llama_pos all_pos_1; // used if pos == NULL + llama_seq_id all_seq_id; // used if seq_id == NULL + + bool clear_kv; // if true, clear the entire KV cache. common usage for perplexity calculations } llama_seq; enum llama_log_level { @@ -312,9 +325,6 @@ extern "C" { LLAMA_API void llama_kv_clear(struct llama_context * ctx, int32_t p0, int32_t p1); - // Sets the current rng seed. - LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); - // Returns the maximum size in bytes of the state (rng, logits, embedding // and kv_cache) - will often be smaller after compacting tokens LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); @@ -336,19 +346,37 @@ extern "C" { // tokens + n_tokens is the provided batch of new tokens to process // n_past is the number of tokens to use from previous eval calls // Returns 0 on success - LLAMA_API int llama_eval( + LLAMA_API DEPRECATED(int llama_eval( struct llama_context * ctx, const llama_token * tokens, uint32_t n_tokens, int n_past, - int n_threads); + int n_threads), + "please use llama_decode() instead"); // Same as llama_eval, but use float matrix input directly. - LLAMA_API int llama_eval_embd( + LLAMA_API DEPRECATED(int llama_eval_embd( struct llama_context * ctx, const float * embd, uint32_t n_tokens, int n_past, + int n_threads), + "please use llama_decode() instead"); + + // Return batch for single sequence of tokens starting at pos_0 + // If pos_0 == 0, the clear_kv flag will be auto set to true + // + // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it + // + LLAMA_API struct llama_batch llama_batch_get_one( + const llama_token * tokens, + uint32_t n_tokens, + llama_pos pos_0, + llama_seq_id seq_id); + + LLAMA_API int llama_decode( + struct llama_context * ctx, + struct llama_batch batch, int n_threads); // Token logits obtained from the last call to llama_eval() @@ -434,6 +462,9 @@ extern "C" { // Sampling functions // + // Sets the current rng seed. + LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); + /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. 
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); From 6952a460b9857d4d603853042e7b417c802d4137 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Sep 2023 15:31:24 +0300 Subject: [PATCH 07/55] llama : add cell_max heuristic for more efficient kv_cache --- llama.cpp | 119 +++++++++++++++++++++++++++++++++++++++++------------- llama.h | 12 +++++- 2 files changed, 102 insertions(+), 29 deletions(-) diff --git a/llama.cpp b/llama.cpp index 601f557ef4e54..4867d348f86b9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1023,6 +1023,9 @@ struct llama_kv_cache { uint32_t head = 0; uint32_t size = 0; + // largest index of an occupied cell (used for a basic optimization heuristic) + uint32_t cell_max = 0; + std::vector cells; struct ggml_tensor * k = NULL; @@ -1226,6 +1229,8 @@ static bool llama_kv_cache_init( cache.head = 0; cache.size = n_ctx; + cache.cell_max = 0; + cache.cells.clear(); cache.cells.resize(n_ctx); @@ -1311,6 +1316,16 @@ static bool llama_kv_cache_find_slot( return true; } +void llama_kv_cache_update_cell_max(struct llama_kv_cache & cache) { + cache.cell_max = 0; + + for (uint32_t i = 0; i < cache.size; i++) { + if (cache.cells[i].pos >= 0) { + cache.cell_max = i + 1; + } + } +} + void llama_kv_cache_clear(struct llama_kv_cache & cache, int32_t p0, int32_t p1) { cache.head = p0; @@ -1321,6 +1336,8 @@ void llama_kv_cache_clear(struct llama_kv_cache & cache, int32_t p0, int32_t p1) cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); } + + llama_kv_cache_update_cell_max(cache); } // @@ -2547,6 +2564,7 @@ static struct ggml_cgraph * llm_build_llama( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = kv_self.cell_max + n_tokens; auto & buf_compute = lctx.buf_compute; @@ -2621,7 +2639,7 @@ static struct ggml_cgraph * llm_build_llama( ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); ggml_allocr_alloc(lctx.alloc, KQ_mask); if (!ggml_allocr_is_measure(lctx.alloc)) { float * data = (float *) KQ_mask->data; @@ -2629,9 +2647,19 @@ static struct ggml_cgraph * llm_build_llama( for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_ctx; ++i) { - if (!kv_self.cells[i].has_seq_id(batch.seq_id[j]) || kv_self.cells[i].pos > batch.pos[j]) { - data[h*(n_ctx*n_tokens) + j*n_ctx + i] = -INFINITY; + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + + // TODO: temporary heuristic verification - if this fails then there is a bug with cell_max computation + for (int i = n_kv; i < n_ctx; ++i) { + if (kv_self.cells[i].has_seq_id(seq_id) && kv_self.cells[i].pos >= 0) { + GGML_ASSERT(false && "cell_max is too small - this might indicate a bug"); } } } @@ -2725,7 +2753,7 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_ctx, n_head_kv, + n_embd_head, n_kv, n_head_kv, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, 
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); @@ -2738,7 +2766,7 @@ static struct ggml_cgraph * llm_build_llama( ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_ctx, n_tokens, n_head, 1] + // KQ_scaled shape [n_kv, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); @@ -2756,7 +2784,7 @@ static struct ggml_cgraph * llm_build_llama( // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, - n_ctx, n_embd_head, n_head_kv, + n_kv, n_embd_head, n_head_kv, ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); @@ -2901,6 +2929,7 @@ static struct ggml_cgraph * llm_build_baichaun( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = kv_self.cell_max + n_tokens; auto & buf_compute = lctx.buf_compute; @@ -2975,7 +3004,7 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); ggml_allocr_alloc(lctx.alloc, KQ_mask); if (!ggml_allocr_is_measure(lctx.alloc)) { float * data = (float *) KQ_mask->data; @@ -2983,9 +3012,19 @@ static struct ggml_cgraph * llm_build_baichaun( for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_ctx; ++i) { - if (!kv_self.cells[i].has_seq_id(batch.seq_id[j]) || kv_self.cells[i].pos > batch.pos[j]) { - data[h*(n_ctx*n_tokens) + j*n_ctx + i] = -INFINITY; + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + + // TODO: temporary heuristic verification - if this fails then there is a bug with cell_max computation + for (int i = n_kv; i < n_ctx; ++i) { + if (kv_self.cells[i].has_seq_id(seq_id) && kv_self.cells[i].pos >= 0) { + GGML_ASSERT(false && "cell_max is too small - this might indicate a bug"); } } } @@ -3092,7 +3131,7 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_ctx, n_head_kv, + n_embd_head, n_kv, n_head_kv, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); @@ -3135,7 +3174,7 @@ static struct ggml_cgraph * llm_build_baichaun( // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, - n_ctx, n_embd_head, n_head_kv, + n_kv, n_embd_head, n_head_kv, ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); @@ -3272,6 +3311,7 @@ static struct ggml_cgraph * llm_build_falcon( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = kv_self.cell_max + n_tokens; auto & buf_compute = lctx.buf_compute; @@ -3346,7 +3386,7 @@ static struct ggml_cgraph * llm_build_falcon( ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = 
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); ggml_allocr_alloc(lctx.alloc, KQ_mask); if (!ggml_allocr_is_measure(lctx.alloc)) { float * data = (float *) KQ_mask->data; @@ -3354,9 +3394,19 @@ static struct ggml_cgraph * llm_build_falcon( for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_ctx; ++i) { - if (!kv_self.cells[i].has_seq_id(batch.seq_id[j]) || kv_self.cells[i].pos > batch.pos[j]) { - data[h*(n_ctx*n_tokens) + j*n_ctx + i] = -INFINITY; + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + + // TODO: temporary heuristic verification - if this fails then there is a bug with cell_max computation + for (int i = n_kv; i < n_ctx; ++i) { + if (kv_self.cells[i].has_seq_id(seq_id) && kv_self.cells[i].pos >= 0) { + GGML_ASSERT(false && "cell_max is too small - this might indicate a bug"); } } } @@ -3479,7 +3529,7 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_ctx, n_head_kv, + n_embd_head, n_kv, n_head_kv, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); @@ -3504,7 +3554,7 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, - n_ctx, n_embd_head, n_head_kv, + n_kv, n_embd_head, n_head_kv, ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); @@ -3598,6 +3648,7 @@ static struct ggml_cgraph * llm_build_starcoder( const float norm_eps = hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = kv_self.cell_max + n_tokens; auto & buf_compute = lctx.buf_compute; @@ -3664,7 +3715,7 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_ctx, n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); ggml_allocr_alloc(lctx.alloc, KQ_mask); if (!ggml_allocr_is_measure(lctx.alloc)) { float * data = (float *) KQ_mask->data; @@ -3672,9 +3723,19 @@ static struct ggml_cgraph * llm_build_starcoder( for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_ctx; ++i) { - if (!kv_self.cells[i].has_seq_id(batch.seq_id[j]) || kv_self.cells[i].pos > batch.pos[j]) { - data[h*(n_ctx*n_tokens) + j*n_ctx + i] = -INFINITY; + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + + // TODO: temporary heuristic verification - if this fails then there is a bug with cell_max computation + for (int i = n_kv; i < n_ctx; ++i) { + if (kv_self.cells[i].has_seq_id(seq_id) && kv_self.cells[i].pos >= 0) { + GGML_ASSERT(false && "cell_max is too small - this might indicate a bug"); } } } @@ -3727,7 +3788,7 @@ static struct ggml_cgraph * llm_build_starcoder( struct ggml_tensor * K = ggml_view_3d(ctx0, 
kv_self.k, - n_embd_head, n_ctx, n_head_kv, + n_embd_head, n_kv, n_head_kv, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); @@ -3753,7 +3814,7 @@ static struct ggml_cgraph * llm_build_starcoder( // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, - n_ctx, n_embd_head, n_head_kv, + n_kv, n_embd_head, n_head_kv, ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); @@ -3974,8 +4035,9 @@ static bool llama_eval_internal( ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); #endif - // update the kv ring buffer head - lctx.kv_self.head += n_tokens; + // update the kv ring buffer + lctx.kv_self.head += n_tokens; + lctx.kv_self.cell_max = std::max(lctx.kv_self.cell_max, lctx.kv_self.head); #ifdef GGML_PERF // print timing information per ggml operation (for debugging purposes) @@ -7040,6 +7102,9 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { } ctx->kv_self.head = kv_ntok; + ctx->kv_self.size = kv_size; + + ctx->kv_self.cell_max = kv_ntok; } const size_t nread = inp - src; diff --git a/llama.h b/llama.h index b844e172bb9e6..ae7ac5e3d868a 100644 --- a/llama.h +++ b/llama.h @@ -316,15 +316,19 @@ extern "C" { int n_threads); // - // KV cache API + // KV cache // // Returns the number of tokens in the KV cache LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx), - "avoid using this, it will be removed in the future"); + "avoid using this, it will be removed in the future, instead - count the tokens in user code"); LLAMA_API void llama_kv_clear(struct llama_context * ctx, int32_t p0, int32_t p1); + // + // State / sessions + // + // Returns the maximum size in bytes of the state (rng, logits, embedding // and kv_cache) - will often be smaller after compacting tokens LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); @@ -342,6 +346,10 @@ extern "C" { LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); + // + // Decoding + // + // Run the llama inference to obtain the logits and probabilities for the next token. 
// tokens + n_tokens is the provided batch of new tokens to process // n_past is the number of tokens to use from previous eval calls From 4d76d762ef6d0292506007e6721a9f2b8bd52861 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Sep 2023 15:53:03 +0300 Subject: [PATCH 08/55] llama : extend llama_kv_cache API --- examples/embd-input/embd-input-lib.cpp | 2 +- examples/perplexity/perplexity.cpp | 41 ++++++++++++----- llama.cpp | 62 ++++++++++++++++++-------- llama.h | 11 +++-- 4 files changed, 84 insertions(+), 32 deletions(-) diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index ed0966a511f0b..344a8b2c3262e 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -79,7 +79,7 @@ bool eval_float(void * model, float * input, int N){ if (n_eval > n_batch) { n_eval = n_batch; } - llama_batch batch = { uint32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, n_past, 1, 0, false }; + llama_batch batch = { uint32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, n_past, 1, 0, }; if (llama_decode(ctx, batch, params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 2a046d55e8c3c..fd2160bbf2b6e 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -79,7 +79,9 @@ static void write_logfile( static std::vector softmax(const std::vector& logits) { std::vector probs(logits.size()); float max_logit = logits[0]; - for (float v : logits) max_logit = std::max(max_logit, v); + for (float v : logits) { + max_logit = std::max(max_logit, v); + } double sum_exp = 0.0; for (size_t i = 0; i < logits.size(); i++) { // Subtract the maximum logit value from the current logit value for numerical stability @@ -88,15 +90,21 @@ static std::vector softmax(const std::vector& logits) { sum_exp += exp_logit; probs[i] = exp_logit; } - for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; + for (size_t i = 0; i < probs.size(); i++) { + probs[i] /= sum_exp; + } return probs; } static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { float max_logit = logits[0]; - for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]); + for (int i = 1; i < n_vocab; ++i) { + max_logit = std::max(max_logit, logits[i]); + } double sum_exp = 0.0; - for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit); + for (int i = 0; i < n_vocab; ++i) { + sum_exp += expf(logits[i] - max_logit); + } return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; } @@ -107,7 +115,8 @@ static void process_logits( std::mutex mutex; int counter = 0; auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { - double local_nll = 0, local_nll2 = 0; + double local_nll = 0; + double local_nll2 = 0; while (true) { std::unique_lock lock(mutex); int i = counter++; @@ -125,10 +134,13 @@ static void process_logits( prob_history[i] = results.prob; } }; - for (auto & w : workers) w = std::thread(compute); + for (auto & w : workers) { + w = std::thread(compute); + } compute(); - for (auto & w : workers) w.join(); - + for (auto & w : workers) { + w.join(); + } } static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { @@ -151,8 +163,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const 
gpt_params & return {std::move(tokens), 0., {}, {}}; } - std::vector logit_history; - std::vector prob_history; + std::vector logit_history; + std::vector prob_history; logit_history.resize(tokens.size()); prob_history.resize(tokens.size()); @@ -194,6 +206,9 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const auto t_start = std::chrono::high_resolution_clock::now(); + // clear the KV cache + llama_kv_cache_keep_seq(ctx, -1); + for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); @@ -319,6 +334,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const auto t_start = std::chrono::high_resolution_clock::now(); + // clear the KV cache + llama_kv_cache_keep_seq(ctx, -1); + for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); @@ -549,6 +567,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { query_embd.resize(32); } + // clear the KV cache + llama_kv_cache_keep_seq(ctx, -1); + auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads); if (logits.empty()) { fprintf(stderr, "%s : failed to eval\n", __func__); diff --git a/llama.cpp b/llama.cpp index 4867d348f86b9..ce7ea408bdf97 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1316,7 +1316,8 @@ static bool llama_kv_cache_find_slot( return true; } -void llama_kv_cache_update_cell_max(struct llama_kv_cache & cache) { +void llama_kv_cache_update(struct llama_kv_cache & cache) { + // compute new cell_max cache.cell_max = 0; for (uint32_t i = 0; i < cache.size; i++) { @@ -1326,18 +1327,40 @@ void llama_kv_cache_update_cell_max(struct llama_kv_cache & cache) { } } -void llama_kv_cache_clear(struct llama_kv_cache & cache, int32_t p0, int32_t p1) { - cache.head = p0; +void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t c0, int32_t c1) { + if (c0 < 0) c0 = 0; + if (c1 < 0) c1 = cache.size; - if (p0 < 0) p0 = 0; - if (p1 < 0) p1 = cache.size; - - for (int32_t i = p0; i < p1; ++i) { + for (int32_t i = c0; i < c1; ++i) { cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); } - llama_kv_cache_update_cell_max(cache); + llama_kv_cache_update(cache); +} + +void llama_kv_cache_rm_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) { + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id)) { + cache.cells[i].seq_id.erase(seq_id); + if (cache.cells[i].seq_id.empty()) { + cache.cells[i].pos = -1; + } + } + } + + llama_kv_cache_update(cache); +} + +void llama_kv_cache_keep_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) { + for (uint32_t i = 0; i < cache.size; ++i) { + if (!cache.cells[i].has_seq_id(seq_id)) { + cache.cells[i].pos = -1; + cache.cells[i].seq_id.clear(); + } + } + + llama_kv_cache_update(cache); } // @@ -3968,10 +3991,6 @@ static bool llama_eval_internal( batch.seq_id = seq_id.data(); } - if (batch.clear_kv) { - llama_kv_cache_clear(kv_self, 0, -1); - } - if (!llama_kv_cache_find_slot(kv_self, batch)) { return false; } @@ -6803,8 +6822,16 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) { return ctx->kv_self.head; } -void llama_kv_clear(struct llama_context * ctx, int32_t p0, int32_t p1) { - llama_kv_cache_clear(ctx->kv_self, p0, p1); +void llama_kv_cache_rm_tokens(struct llama_context * ctx, int32_t c0, int32_t c1) { + 
llama_kv_cache_rm_tokens(ctx->kv_self, c0, c1); +} + +void llama_kv_cache_rm_seq(struct llama_context * ctx, llama_seq_id seq_id) { + llama_kv_cache_rm_seq(ctx->kv_self, seq_id); +} + +void llama_kv_cache_keep_seq(struct llama_context * ctx, llama_seq_id seq_id) { + llama_kv_cache_keep_seq(ctx->kv_self, seq_id); } // Returns the *maximum* size of the state @@ -7203,7 +7230,7 @@ int llama_eval( uint32_t n_tokens, int n_past, int n_threads) { - llama_kv_cache_clear(ctx->kv_self, n_past, -1); + llama_kv_cache_rm_tokens(ctx->kv_self, n_past, -1); if (!llama_eval_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0), n_threads)) { LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); @@ -7226,9 +7253,9 @@ int llama_eval_embd( uint32_t n_tokens, int n_past, int n_threads) { - llama_kv_cache_clear(ctx->kv_self, n_past, -1); + llama_kv_cache_rm_tokens(ctx->kv_self, n_past, -1); - llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, n_past, 1, 0, n_past == 0, }; + llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, n_past, 1, 0, }; if (!llama_eval_internal(*ctx, batch, n_threads)) { LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); @@ -7259,7 +7286,6 @@ struct llama_batch llama_batch_get_one( /*all_pos_0 =*/ pos_0, /*all_pos_1 =*/ 1, /*all_seq_id =*/ seq_id, - /*clear_kv =*/ pos_0 == 0, }; } diff --git a/llama.h b/llama.h index ae7ac5e3d868a..4b70509b09229 100644 --- a/llama.h +++ b/llama.h @@ -84,8 +84,6 @@ extern "C" { llama_pos all_pos_0; // used if pos == NULL llama_pos all_pos_1; // used if pos == NULL llama_seq_id all_seq_id; // used if seq_id == NULL - - bool clear_kv; // if true, clear the entire KV cache. common usage for perplexity calculations } llama_seq; enum llama_log_level { @@ -323,7 +321,14 @@ extern "C" { LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx), "avoid using this, it will be removed in the future, instead - count the tokens in user code"); - LLAMA_API void llama_kv_clear(struct llama_context * ctx, int32_t p0, int32_t p1); + // Remove all tokens between cells [c0, c1) + LLAMA_API void llama_kv_cache_rm_tokens(struct llama_context * ctx, int32_t c0, int32_t c1); + + // Removes all tokens that belong to the specified sequence + LLAMA_API void llama_kv_cache_rm_seq(struct llama_context * ctx, llama_seq_id seq_id); + + // Removes all tokens that do not belong to the specified sequence + LLAMA_API void llama_kv_cache_keep_seq(struct llama_context * ctx, llama_seq_id seq_id); // // State / sessions From f015b266892b1634ac5a738c5861ea6976848ab4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Sep 2023 17:15:25 +0300 Subject: [PATCH 09/55] llama : more robust cell_max heuristic + wip shift --- examples/llama-bench/llama-bench.cpp | 4 ++ llama.cpp | 81 +++++++++++----------------- llama.h | 6 ++- 3 files changed, 39 insertions(+), 52 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 2551f84224cd6..8fdbd80330f19 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -977,6 +977,8 @@ int main(int argc, char ** argv) { test t(inst, lmodel, ctx); + llama_kv_cache_keep_seq(ctx, -1); + // warmup run if (t.n_prompt > 0) { test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads); @@ -986,6 +988,8 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { + llama_kv_cache_keep_seq(ctx, -1); + uint64_t t_start = get_time_ns(); if (t.n_prompt > 0) { test_prompt(ctx, t.n_prompt, 
0, t.n_batch, t.n_threads); diff --git a/llama.cpp b/llama.cpp index ce7ea408bdf97..1ef615811e5bd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1023,9 +1023,6 @@ struct llama_kv_cache { uint32_t head = 0; uint32_t size = 0; - // largest index of an occupied cell (used for a basic optimization heuristic) - uint32_t cell_max = 0; - std::vector cells; struct ggml_tensor * k = NULL; @@ -1229,8 +1226,6 @@ static bool llama_kv_cache_init( cache.head = 0; cache.size = n_ctx; - cache.cell_max = 0; - cache.cells.clear(); cache.cells.resize(n_ctx); @@ -1316,15 +1311,16 @@ static bool llama_kv_cache_find_slot( return true; } -void llama_kv_cache_update(struct llama_kv_cache & cache) { - // compute new cell_max - cache.cell_max = 0; +int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { + int32_t res = 0; for (uint32_t i = 0; i < cache.size; i++) { - if (cache.cells[i].pos >= 0) { - cache.cell_max = i + 1; + if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) { + res = i + 1; } } + + return res; } void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t c0, int32_t c1) { @@ -1335,8 +1331,6 @@ void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t c0, int32_t cache.cells[i].pos = -1; cache.cells[i].seq_id.clear(); } - - llama_kv_cache_update(cache); } void llama_kv_cache_rm_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) { @@ -1348,8 +1342,6 @@ void llama_kv_cache_rm_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) { } } } - - llama_kv_cache_update(cache); } void llama_kv_cache_keep_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) { @@ -1359,8 +1351,22 @@ void llama_kv_cache_keep_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) cache.cells[i].seq_id.clear(); } } +} + +void llama_kv_cache_shift( + struct llama_context & ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta) { + auto & hparams = ctx.model.hparams; + auto & cache = ctx.kv_self; - llama_kv_cache_update(cache); + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + cache.cells[i].pos += delta; + } + } } // @@ -2587,7 +2593,7 @@ static struct ggml_cgraph * llm_build_llama( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = kv_self.cell_max + n_tokens; + const int32_t n_kv = llama_kv_cache_cell_max(kv_self); auto & buf_compute = lctx.buf_compute; @@ -2678,13 +2684,6 @@ static struct ggml_cgraph * llm_build_llama( data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; } } - - // TODO: temporary heuristic verification - if this fails then there is a bug with cell_max computation - for (int i = n_kv; i < n_ctx; ++i) { - if (kv_self.cells[i].has_seq_id(seq_id) && kv_self.cells[i].pos >= 0) { - GGML_ASSERT(false && "cell_max is too small - this might indicate a bug"); - } - } } } } @@ -2952,7 +2951,7 @@ static struct ggml_cgraph * llm_build_baichaun( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = kv_self.cell_max + n_tokens; + const int32_t n_kv = llama_kv_cache_cell_max(kv_self); auto & buf_compute = lctx.buf_compute; @@ -3043,13 +3042,6 @@ static struct ggml_cgraph * llm_build_baichaun( data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; } } - - // TODO: temporary heuristic verification - if this fails then there is a bug with cell_max computation - for (int i = n_kv; i < n_ctx; ++i) { - if (kv_self.cells[i].has_seq_id(seq_id) && 
kv_self.cells[i].pos >= 0) { - GGML_ASSERT(false && "cell_max is too small - this might indicate a bug"); - } - } } } } @@ -3334,7 +3326,7 @@ static struct ggml_cgraph * llm_build_falcon( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = kv_self.cell_max + n_tokens; + const int32_t n_kv = llama_kv_cache_cell_max(kv_self); auto & buf_compute = lctx.buf_compute; @@ -3425,13 +3417,6 @@ static struct ggml_cgraph * llm_build_falcon( data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; } } - - // TODO: temporary heuristic verification - if this fails then there is a bug with cell_max computation - for (int i = n_kv; i < n_ctx; ++i) { - if (kv_self.cells[i].has_seq_id(seq_id) && kv_self.cells[i].pos >= 0) { - GGML_ASSERT(false && "cell_max is too small - this might indicate a bug"); - } - } } } } @@ -3671,7 +3656,7 @@ static struct ggml_cgraph * llm_build_starcoder( const float norm_eps = hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = kv_self.cell_max + n_tokens; + const int32_t n_kv = llama_kv_cache_cell_max(kv_self); auto & buf_compute = lctx.buf_compute; @@ -3754,13 +3739,6 @@ static struct ggml_cgraph * llm_build_starcoder( data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; } } - - // TODO: temporary heuristic verification - if this fails then there is a bug with cell_max computation - for (int i = n_kv; i < n_ctx; ++i) { - if (kv_self.cells[i].has_seq_id(seq_id) && kv_self.cells[i].pos >= 0) { - GGML_ASSERT(false && "cell_max is too small - this might indicate a bug"); - } - } } } } @@ -4055,8 +4033,7 @@ static bool llama_eval_internal( #endif // update the kv ring buffer - lctx.kv_self.head += n_tokens; - lctx.kv_self.cell_max = std::max(lctx.kv_self.cell_max, lctx.kv_self.head); + lctx.kv_self.head += n_tokens; #ifdef GGML_PERF // print timing information per ggml operation (for debugging purposes) @@ -6834,6 +6811,10 @@ void llama_kv_cache_keep_seq(struct llama_context * ctx, llama_seq_id seq_id) { llama_kv_cache_keep_seq(ctx->kv_self, seq_id); } +void llama_kv_cache_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + llama_kv_cache_shift(*ctx, seq_id, p0, p1, delta); +} + // Returns the *maximum* size of the state size_t llama_get_state_size(const struct llama_context * ctx) { // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. 
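[The shift call added above is the building block for the context-shift scheme ("no more context swaps") that a later patch in this series wires into examples/main: when the context fills up, a window of old tokens is dropped from the sequence and the remaining ones are slid back so their positions stay contiguous, instead of re-evaluating the kept tokens. A rough sketch of that pattern, using the names the API settles on two patches later (llama_kv_cache_rm_seq gains a [p0, p1) range and the shift becomes llama_kv_cache_shift_seq); `n_keep` and `n_past` are the caller's own bookkeeping:]

    const int n_left    = n_past - n_keep - 1;
    const int n_discard = n_left/2;

    // drop the oldest n_discard tokens of sequence 0, keeping BOS plus the first n_keep tokens ...
    llama_kv_cache_rm_seq   (ctx, 0, n_keep + 1,             n_keep + n_discard + 1);

    // ... and shift what is left back by n_discard positions
    llama_kv_cache_shift_seq(ctx, 0, n_keep + 1 + n_discard, n_past, -n_discard);

    n_past -= n_discard;

[In the later patches, because the cached K data is RoPEd with absolute positions, the shift is recorded per cell (has_shift/delta) and a K_shift pass in the build graph re-ropes the affected rows on the next decode, so none of the kept tokens has to be re-evaluated.]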
@@ -7130,8 +7111,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ctx->kv_self.head = kv_ntok; ctx->kv_self.size = kv_size; - - ctx->kv_self.cell_max = kv_ntok; } const size_t nread = inp - src; diff --git a/llama.h b/llama.h index 4b70509b09229..ec05fa6ea0eec 100644 --- a/llama.h +++ b/llama.h @@ -321,7 +321,7 @@ extern "C" { LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx), "avoid using this, it will be removed in the future, instead - count the tokens in user code"); - // Remove all tokens between cells [c0, c1) + // Remove all tokens data of cells in [c0, c1) LLAMA_API void llama_kv_cache_rm_tokens(struct llama_context * ctx, int32_t c0, int32_t c1); // Removes all tokens that belong to the specified sequence @@ -330,6 +330,10 @@ extern "C" { // Removes all tokens that do not belong to the specified sequence LLAMA_API void llama_kv_cache_keep_seq(struct llama_context * ctx, llama_seq_id seq_id); + // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) + // If the KV cache is RoPEd, the KV data is updated accordingly + LLAMA_API void llama_kv_cache_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); + // // State / sessions // From 86c90e34f5ef87f15da654c9358d41de7a0550ce Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Sep 2023 18:00:01 +0300 Subject: [PATCH 10/55] metal : disable concurrency optimization --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 1ef615811e5bd..6634f753fd165 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6605,8 +6605,8 @@ struct llama_context * llama_new_context_with_model( llama_free(ctx); return NULL; } - ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); - ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); + //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false); + //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); } #endif // measure memory requirements for the graph @@ -6621,7 +6621,7 @@ struct llama_context * llama_new_context_with_model( ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment); #ifdef GGML_USE_METAL if (ctx->ctx_metal) { - ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); + //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal)); } #endif #ifdef GGML_USE_CUBLAS From 0cbf3bfef88b176df470bd2238a7bb265e940e03 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Sep 2023 18:00:25 +0300 Subject: [PATCH 11/55] llama : add llama_kv_cache_shift_seq + no more context swaps --- common/common.cpp | 1 + examples/main/main.cpp | 21 ++++++++----- llama.cpp | 67 ++++++++++++++++++++++++++++++------------ llama.h | 6 ++-- 4 files changed, 66 insertions(+), 29 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index b638efe9ebae8..fd50891f8cdbc 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -781,6 +781,7 @@ std::tuple llama_init_from_gpt_par std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0), params.n_threads); + 
llama_kv_cache_keep_seq(lctx, -1); llama_reset_timings(lctx); } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 3e78fdaa05459..ed2d9e2f706c5 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -499,17 +499,22 @@ int main(int argc, char ** argv) { break; } - const int n_left = n_past - params.n_keep; - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep); + const int n_left = n_past - params.n_keep - 1; + const int n_discard = n_left/2; - // always keep the first token - BOS - n_past = std::max(1, params.n_keep); - n_past_guidance = std::max(1, params.n_keep + guidance_offset); + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); - LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + llama_kv_cache_rm_seq (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_shift_seq(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + + n_past -= n_discard; - // insert n_left/2 tokens at the start of embd from last_tokens - embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size()); + if (ctx_guidance) { + n_past_guidance -= n_discard; + } + + LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd)); diff --git a/llama.cpp b/llama.cpp index 6634f753fd165..c4059c9eb0454 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1007,7 +1007,8 @@ struct llama_layer { }; struct llama_kv_cell { - llama_pos pos = -1; + llama_pos pos = -1; + llama_pos delta = 0; std::set seq_id; @@ -1018,7 +1019,7 @@ struct llama_kv_cell { // ring-buffer of cached KV data struct llama_kv_cache { - bool is_roped = false; + bool has_shift = false; uint32_t head = 0; uint32_t size = 0; @@ -1223,6 +1224,8 @@ static bool llama_kv_cache_init( const int64_t n_mem = n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; + cache.has_shift = false; + cache.head = 0; cache.size = n_ctx; @@ -1333,9 +1336,13 @@ void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t c0, int32_t } } -void llama_kv_cache_rm_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) { +void llama_kv_cache_rm_seq( + struct llama_kv_cache & cache, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1) { for (uint32_t i = 0; i < cache.size; ++i) { - if (cache.cells[i].has_seq_id(seq_id)) { + if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { cache.cells[i].seq_id.erase(seq_id); if (cache.cells[i].seq_id.empty()) { cache.cells[i].pos = -1; @@ -1353,18 +1360,22 @@ void llama_kv_cache_keep_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) } } -void llama_kv_cache_shift( - struct llama_context & ctx, +void llama_kv_cache_shift_seq( + struct llama_kv_cache & cache, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - auto & hparams = ctx.model.hparams; - auto & cache = ctx.kv_self; - for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { cache.cells[i].pos += delta; + if (cache.cells[i].pos < 0) { + cache.cells[i].pos = -1; + cache.cells[i].seq_id.clear(); + } else { + cache.has_shift = true; + cache.cells[i].delta = delta; + } } } } @@ -2595,6 +2606,8 @@ static struct ggml_cgraph * llm_build_llama( const int32_t n_tokens = 
batch.n_tokens; const int32_t n_kv = llama_kv_cache_cell_max(kv_self); + const bool do_rope_shift = kv_self.has_shift || ggml_allocr_is_measure(lctx.alloc); + auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { @@ -2698,6 +2711,16 @@ static struct ggml_cgraph * llm_build_llama( } } + // K_shift + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + ggml_allocr_alloc(lctx.alloc, K_shift); + if (!ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) K_shift->data; + for (int i = 0; i < n_ctx; ++i) { + data[i] = kv_self.cells[i].delta; + } + } + for (int il = 0; il < n_layer; ++il) { ggml_format_name(inpL, "layer_inp_%d", il); @@ -2723,6 +2746,17 @@ static struct ggml_cgraph * llm_build_llama( ggml_set_name(cur, "attention_norm_0"); } + if (do_rope_shift) { + ggml_build_forward_expand(gf, + ggml_rope_custom_inplace(ctx0, + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_head_kv, n_ctx, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_embd_head, 0, 0, freq_base, freq_scale)); + } + // self-attention { // compute Q and K and RoPE them @@ -4033,7 +4067,8 @@ static bool llama_eval_internal( #endif // update the kv ring buffer - lctx.kv_self.head += n_tokens; + lctx.kv_self.head += n_tokens; + lctx.kv_self.has_shift = false; #ifdef GGML_PERF // print timing information per ggml operation (for debugging purposes) @@ -6562,10 +6597,6 @@ struct llama_context * llama_new_context_with_model( return nullptr; } - if (model->arch == LLM_ARCH_LLAMA) { - ctx->kv_self.is_roped = true; - } - { const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); @@ -6803,16 +6834,16 @@ void llama_kv_cache_rm_tokens(struct llama_context * ctx, int32_t c0, int32_t c1 llama_kv_cache_rm_tokens(ctx->kv_self, c0, c1); } -void llama_kv_cache_rm_seq(struct llama_context * ctx, llama_seq_id seq_id) { - llama_kv_cache_rm_seq(ctx->kv_self, seq_id); +void llama_kv_cache_rm_seq(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + llama_kv_cache_rm_seq(ctx->kv_self, seq_id, p0, p1); } void llama_kv_cache_keep_seq(struct llama_context * ctx, llama_seq_id seq_id) { llama_kv_cache_keep_seq(ctx->kv_self, seq_id); } -void llama_kv_cache_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - llama_kv_cache_shift(*ctx, seq_id, p0, p1, delta); +void llama_kv_cache_shift_seq(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + llama_kv_cache_shift_seq(ctx->kv_self, seq_id, p0, p1, delta); } // Returns the *maximum* size of the state diff --git a/llama.h b/llama.h index ec05fa6ea0eec..4a5f2e3bf70cf 100644 --- a/llama.h +++ b/llama.h @@ -324,15 +324,15 @@ extern "C" { // Remove all tokens data of cells in [c0, c1) LLAMA_API void llama_kv_cache_rm_tokens(struct llama_context * ctx, int32_t c0, int32_t c1); - // Removes all tokens that belong to the specified sequence - LLAMA_API void llama_kv_cache_rm_seq(struct llama_context * ctx, llama_seq_id seq_id); + // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) + LLAMA_API void llama_kv_cache_rm_seq(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1); // Removes all tokens that do not belong to the specified sequence LLAMA_API void 
llama_kv_cache_keep_seq(struct llama_context * ctx, llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly - LLAMA_API void llama_kv_cache_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); + LLAMA_API void llama_kv_cache_shift_seq(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); // // State / sessions From 7c1bdd0e8af2debf8defeced205d8513d69ab823 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Sep 2023 18:26:05 +0300 Subject: [PATCH 12/55] llama : apply K-cache roping for Falcon and Baichuan --- llama.cpp | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/llama.cpp b/llama.cpp index c4059c9eb0454..a7c7604d90b4a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2746,6 +2746,7 @@ static struct ggml_cgraph * llm_build_llama( ggml_set_name(cur, "attention_norm_0"); } + // shift the entire K-cache if needed if (do_rope_shift) { ggml_build_forward_expand(gf, ggml_rope_custom_inplace(ctx0, @@ -2987,6 +2988,8 @@ static struct ggml_cgraph * llm_build_baichaun( const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = llama_kv_cache_cell_max(kv_self); + const bool do_rope_shift = kv_self.has_shift || ggml_allocr_is_measure(lctx.alloc); + auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { @@ -3090,6 +3093,16 @@ static struct ggml_cgraph * llm_build_baichaun( } } + // K_shift + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + ggml_allocr_alloc(lctx.alloc, K_shift); + if (!ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) K_shift->data; + for (int i = 0; i < n_ctx; ++i) { + data[i] = kv_self.cells[i].delta; + } + } + for (int il = 0; il < n_layer; ++il) { ggml_format_name(inpL, "layer_inp_%d", il); @@ -3115,6 +3128,18 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_set_name(cur, "attention_norm_0"); } + // shift the entire K-cache if needed + if (do_rope_shift) { + ggml_build_forward_expand(gf, + ggml_rope_custom_inplace(ctx0, + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_head_kv, n_ctx, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_embd_head, 0, 0, freq_base, freq_scale)); + } + // self-attention { // compute Q and K and RoPE them @@ -3362,6 +3387,8 @@ static struct ggml_cgraph * llm_build_falcon( const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = llama_kv_cache_cell_max(kv_self); + const bool do_rope_shift = kv_self.has_shift || ggml_allocr_is_measure(lctx.alloc); + auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { @@ -3465,6 +3492,16 @@ static struct ggml_cgraph * llm_build_falcon( } } + // K_shift + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + ggml_allocr_alloc(lctx.alloc, K_shift); + if (!ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) K_shift->data; + for (int i = 0; i < n_ctx; ++i) { + data[i] = kv_self.cells[i].delta; + } + } + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -3476,6 +3513,18 @@ static struct ggml_cgraph * llm_build_falcon( } #endif // GGML_USE_CUBLAS + // shift the entire K-cache if needed + if (do_rope_shift) { + ggml_build_forward_expand(gf, + ggml_rope_custom_inplace(ctx0, + 
ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_head_kv, n_ctx, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_embd_head, 2, 0, freq_base, freq_scale)); + } + // self-attention // TODO: refactor into common function (shared with LLaMA) { From 1f17ea631c863e50f292354c8916046de01aacf7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Sep 2023 19:01:20 +0300 Subject: [PATCH 13/55] speculative : fix KV cache management --- examples/speculative/speculative.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 06173393ccedc..053073397d146 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -172,6 +172,7 @@ int main(int argc, char ** argv) { LOG("out of drafted tokens\n"); } + llama_kv_cache_rm_seq(ctx_dft, 0, n_past_dft, n_ctx); llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0), params.n_threads); ++n_past_dft; @@ -217,6 +218,7 @@ int main(int argc, char ** argv) { // sample n_draft tokens from the draft model using greedy decoding int n_past_cur = n_past_dft; + for (int i = 0; i < n_draft; ++i) { float * logits = llama_get_logits(ctx_dft); @@ -256,6 +258,7 @@ int main(int argc, char ** argv) { } // evaluate the drafted token on the draft model + llama_kv_cache_rm_seq(ctx_dft, 0, n_past_cur, n_ctx); llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0), params.n_threads); ++n_past_cur; @@ -265,6 +268,7 @@ int main(int argc, char ** argv) { } // evaluate the target model on the drafted tokens + llama_kv_cache_rm_seq(ctx_tgt, 0, n_past_tgt, n_ctx); llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0), params.n_threads); ++n_past_tgt; From 0161372b9a1a72d245b4694b547ed6905c2d7167 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Sep 2023 20:30:05 +0300 Subject: [PATCH 14/55] parallel : example for serving multiple users in parallel --- common/common.cpp | 8 +- common/common.h | 2 +- examples/CMakeLists.txt | 1 + examples/main/main.cpp | 2 +- examples/parallel/CMakeLists.txt | 8 + examples/parallel/parallel.cpp | 244 +++++++++++++++++++++++++++ examples/perplexity/perplexity.cpp | 2 +- examples/speculative/speculative.cpp | 6 +- llama.cpp | 2 +- 9 files changed, 262 insertions(+), 13 deletions(-) create mode 100644 examples/parallel/CMakeLists.txt create mode 100644 examples/parallel/parallel.cpp diff --git a/common/common.cpp b/common/common.cpp index fd50891f8cdbc..ff1b5ee9fc6b5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -454,8 +454,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { if (params.logdir.back() != DIRECTORY_SEPARATOR) { params.logdir += DIRECTORY_SEPARATOR; } - } else if (arg == "--perplexity") { - params.perplexity = true; + } else if (arg == "--perplexity" || arg == "--all-logits") { + params.logits_all = true; } else if (arg == "--ppl-stride") { if (++i >= argc) { invalid_param = true; @@ -653,7 +653,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); printf(" --temp N temperature (default: %.1f)\n", (double)params.temp); - printf(" --perplexity compute perplexity over each ctx window of the 
prompt\n"); + printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); @@ -735,7 +735,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param lparams.f16_kv = params.memory_f16; lparams.use_mmap = params.use_mmap; lparams.use_mlock = params.use_mlock; - lparams.logits_all = params.perplexity; + lparams.logits_all = params.logits_all; lparams.embedding = params.embedding; lparams.rope_freq_base = params.rope_freq_base; lparams.rope_freq_scale = params.rope_freq_scale; diff --git a/common/common.h b/common/common.h index 2cc11966b6b53..9454032740634 100644 --- a/common/common.h +++ b/common/common.h @@ -113,7 +113,7 @@ struct gpt_params { bool ignore_eos = false; // ignore generated EOS tokens bool instruct = false; // instruction mode (used for Alpaca models) bool penalize_nl = true; // consider newlines as a repeatable token - bool perplexity = false; // compute perplexity over the prompt + bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory bool numa = false; // attempt optimizations that help on some NUMA systems diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 884c4276422eb..df7307072c1b6 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -24,6 +24,7 @@ else() add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(simple) add_subdirectory(speculative) + add_subdirectory(parallel) add_subdirectory(embd-input) add_subdirectory(llama-bench) add_subdirectory(beam-search) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index ed2d9e2f706c5..9c5f2746affbc 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -124,7 +124,7 @@ int main(int argc, char ** argv) { console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); - if (params.perplexity) { + if (params.logits_all) { printf("\n************\n"); printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); printf("************\n\n"); diff --git a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt new file mode 100644 index 0000000000000..0bbf89eaefce6 --- /dev/null +++ b/examples/parallel/CMakeLists.txt @@ -0,0 +1,8 @@ +set(TARGET parallel) +add_executable(${TARGET} parallel.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp new file mode 100644 index 0000000000000..1bb4d497fa220 --- /dev/null +++ b/examples/parallel/parallel.cpp @@ -0,0 +1,244 @@ +// A basic application simulating a server with multiple clients. +// The clients submite requests to the server and they are processed in parallel. 
+ +#include "build-info.h" + +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include + +// trim whitespace from the beginning and end of a string +static std::string trim(const std::string & str) { + size_t start = 0; + size_t end = str.size(); + + while (start < end && isspace(str[start])) { + start += 1; + } + + while (end > start && isspace(str[end - 1])) { + end -= 1; + } + + return str.substr(start, end - start); +} + +static std::string k_system = R"( +Transcript of a dialog, where the User interacts with an Assistant. +The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. + +User: Hello, what is the temperature outside? +Assistant: It is 72 degrees Fahrenheit. +User: What is the definition of a prime number? +Assistant: A prime number is a number that is divisible only by itself and 1. +User: )"; + +static std::vector k_prompts = { + "What is the meaning of life?", + "What is the population of Europe?", + "List all planets in the Solar System.", + "What is the capital of France?", + "Tell me an interesting fact about llamas.", + "What is the best way to cook a steak?", + "Are you familiar with the Special Theory of Relativity and can you explain it to me?", + "Recommend some interesting books to read.", + "What is the best way to learn a new language?", + "How to get a job at Google?", + "If you could have any superpower, what would it be?", + "I want to learn how to play the piano.", +}; + +struct client { + int32_t id = 0; + + llama_seq_id seq_id = -1; + + llama_token sampled; + + int32_t n_prompt = 0; + int32_t n_decoded = 0; + int32_t i_batch = -1; + + std::string input; + std::string prompt; + std::string response; + + std::vector last_tokens; +}; + +int main(int argc, char ** argv) { + gpt_params params; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + const int n_clients = 16; + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("parallel", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS + + // init llama.cpp + llama_backend_init(params.numa); + + llama_model * model = NULL; + + llama_context * ctx = NULL; + + // load the target model + params.logits_all = true; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + + fprintf(stderr, "\n\n"); + fflush(stderr); + + const int n_ctx = llama_n_ctx(ctx); + const int n_vocab = llama_n_vocab(ctx); + + std::vector clients(n_clients); + for (size_t i = 0; i < clients.size(); ++i) { + auto & client = clients[i]; + client.id = i; + client.last_tokens.resize(n_ctx); + std::fill(client.last_tokens.begin(), client.last_tokens.end(), 0); + } + + std::vector candidates; + candidates.reserve(n_vocab); + + auto t_main_start = ggml_time_us(); + + int64_t n_tokens_total = 0; + + llama_seq_id g_seq_id = 0; + + std::vector batch_token; + std::vector batch_pos; + std::vector batch_seq_id; + std::vector batch_clients; + + while (true) { + uint32_t n_tokens = 0; + + batch_token.clear(); + batch_pos.clear(); + batch_seq_id.clear(); + + for (auto & client : clients) { + if (client.seq_id == -1) { + client.seq_id = g_seq_id; + client.input = k_prompts[rand() % k_prompts.size()]; + client.prompt = k_system + client.input + "\nAssistant:"; + client.response = ""; + std::fill(client.last_tokens.begin(), client.last_tokens.end(), 0); + + std::vector prompt_tokens; + prompt_tokens = ::llama_tokenize(ctx, client.prompt, true); + + for (size_t i = 0; i < 
prompt_tokens.size(); ++i) { + batch_token.push_back(prompt_tokens[i]); + batch_pos.push_back(i); + batch_seq_id.push_back(client.seq_id); + batch_clients.push_back(&client); + } + client.n_prompt = prompt_tokens.size(); + client.n_decoded = prompt_tokens.size(); + client.i_batch = batch_token.size() - 1; + + g_seq_id += 1; + } else { + batch_token.push_back(client.sampled); + batch_pos.push_back(client.n_decoded); + batch_seq_id.push_back(client.seq_id); + batch_clients.push_back(&client); + client.n_decoded += 1; + client.i_batch = batch_token.size() - 1; + } + } + + // process in chunks of params.n_batch + for (size_t i = 0; i < batch_token.size(); i += params.n_batch) { + n_tokens = std::min(params.n_batch, (int32_t) (batch_token.size() - i)); + + llama_batch batch = { + n_tokens, + batch_token.data() + i, + nullptr, + batch_pos.data() + i, + batch_seq_id.data() + i, + 0, 0, 0, // unused + }; + + if (llama_decode(ctx, batch, params.n_threads)) { + LOG_TEE("%s : failed to decode batch\n", __func__); + return 1; + } + + for (auto & client : clients) { + if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) { + continue; + } + + const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.last_tokens, candidates, client.i_batch - i); + + // remember which tokens were sampled - used for repetition penalties during sampling + client.last_tokens.erase(client.last_tokens.begin()); + client.last_tokens.push_back(id); + + const std::string token_str = llama_token_to_piece(ctx, id); + client.response += token_str; + client.sampled = id; + + //printf("client %d, seq %d, token %d, pos %d, batch %d: %s\n", + // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str()); + + if (id == llama_token_eos(ctx) || client.n_decoded > params.n_predict || client.response.find("User:") != std::string::npos) { + const size_t pos = client.response.find("User:"); + if (pos != std::string::npos) { + client.response = client.response.substr(0, pos); + } + + llama_kv_cache_rm_seq(ctx, client.seq_id, 0, n_ctx); + + const auto t_main_end = ggml_time_us(); + + n_tokens_total += client.n_decoded - client.n_prompt; + + printf("\033[1mClient %d, seq %d, prompt %d t, response %d t, speed: %.2f t/s\033[0m: \n\nInput: %s\nResponse: %s\n\n", + client.id, client.seq_id, client.n_prompt, client.n_decoded - client.n_prompt, + (double) n_tokens_total / (t_main_end - t_main_start) * 1e6, + client.input.c_str(), ::trim(client.response).c_str()); + + client.seq_id = -1; + } + } + } + + static bool is_first = true; + if (is_first) { + t_main_start = ggml_time_us(); + n_tokens_total = 0; + is_first = false; + } + } + + LOG_TEE("\n\n"); + + llama_print_timings(ctx); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + fprintf(stderr, "\n\n"); + + return 0; +} diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index fd2160bbf2b6e..8386a3d16869a 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -681,7 +681,7 @@ int main(int argc, char ** argv) { return 1; } - params.perplexity = true; + params.logits_all = true; params.n_batch = std::min(params.n_batch, params.n_ctx); if (params.ppl_stride > 0) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 053073397d146..526d98e668f16 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -37,7 +37,7 @@ int main(int argc, char ** argv) { llama_context * 
ctx_dft = NULL; // load the target model - params.perplexity = true; // HACK: enable logits_all = true + params.logits_all = true; std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); // load the draft model @@ -172,7 +172,6 @@ int main(int argc, char ** argv) { LOG("out of drafted tokens\n"); } - llama_kv_cache_rm_seq(ctx_dft, 0, n_past_dft, n_ctx); llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0), params.n_threads); ++n_past_dft; @@ -218,7 +217,6 @@ int main(int argc, char ** argv) { // sample n_draft tokens from the draft model using greedy decoding int n_past_cur = n_past_dft; - for (int i = 0; i < n_draft; ++i) { float * logits = llama_get_logits(ctx_dft); @@ -258,7 +256,6 @@ int main(int argc, char ** argv) { } // evaluate the drafted token on the draft model - llama_kv_cache_rm_seq(ctx_dft, 0, n_past_cur, n_ctx); llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0), params.n_threads); ++n_past_cur; @@ -268,7 +265,6 @@ int main(int argc, char ** argv) { } // evaluate the target model on the drafted tokens - llama_kv_cache_rm_seq(ctx_tgt, 0, n_past_tgt, n_ctx); llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0), params.n_threads); ++n_past_tgt; diff --git a/llama.cpp b/llama.cpp index a7c7604d90b4a..875fd5227c910 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6673,7 +6673,7 @@ struct llama_context * llama_new_context_with_model( ctx->alloc = ggml_allocr_new_measure(tensor_alignment); // build worst-case graph - uint32_t n_tokens = std::min((int)hparams.n_ctx, params.n_batch); + uint32_t n_tokens = std::max((int)hparams.n_ctx, params.n_batch); llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, 0, 0)); From 466b513851ff8ec73889ce6414b8a15d570f77c7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Sep 2023 21:34:20 +0300 Subject: [PATCH 15/55] parallel : disable hot-plug to avoid cache fragmentation --- examples/parallel/parallel.cpp | 91 ++++++++++++++++++++++------------ llama.cpp | 4 ++ 2 files changed, 64 insertions(+), 31 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 1bb4d497fa220..23fda9d58d07d 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -28,7 +28,7 @@ static std::string trim(const std::string & str) { } static std::string k_system = R"( -Transcript of a dialog, where the User interacts with an Assistant. +Transcript of a never ending dialog, where the User interacts with an Assistant. The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. User: Hello, what is the temperature outside? 
@@ -59,6 +59,9 @@ struct client { llama_token sampled; + int64_t t_start_prompt; + int64_t t_start_gen; + int32_t n_prompt = 0; int32_t n_decoded = 0; int32_t i_batch = -1; @@ -133,33 +136,47 @@ int main(int argc, char ** argv) { for (auto & client : clients) { if (client.seq_id == -1) { - client.seq_id = g_seq_id; - client.input = k_prompts[rand() % k_prompts.size()]; - client.prompt = k_system + client.input + "\nAssistant:"; - client.response = ""; - std::fill(client.last_tokens.begin(), client.last_tokens.end(), 0); - - std::vector prompt_tokens; - prompt_tokens = ::llama_tokenize(ctx, client.prompt, true); - - for (size_t i = 0; i < prompt_tokens.size(); ++i) { - batch_token.push_back(prompt_tokens[i]); - batch_pos.push_back(i); - batch_seq_id.push_back(client.seq_id); - batch_clients.push_back(&client); + continue; + } + + batch_token.push_back(client.sampled); + batch_pos.push_back(client.n_decoded); + batch_seq_id.push_back(client.seq_id); + batch_clients.push_back(&client); + client.n_decoded += 1; + client.i_batch = batch_token.size() - 1; + } + + if (batch_token.empty()) { + // all sequences have ended - clear the entire KV cache + llama_kv_cache_rm_tokens(ctx, -1, -1); + + for (auto & client : clients) { + if (client.seq_id == -1) { + client.seq_id = g_seq_id; + client.t_start_prompt = ggml_time_us(); + client.t_start_gen = 0; + + client.input = k_prompts[rand() % k_prompts.size()]; + client.prompt = k_system + client.input + "\nAssistant:"; + client.response = ""; + std::fill(client.last_tokens.begin(), client.last_tokens.end(), 0); + + std::vector prompt_tokens; + prompt_tokens = ::llama_tokenize(ctx, client.prompt, true); + + for (size_t i = 0; i < prompt_tokens.size(); ++i) { + batch_token.push_back(prompt_tokens[i]); + batch_pos.push_back(i); + batch_seq_id.push_back(client.seq_id); + batch_clients.push_back(&client); + } + client.n_prompt = prompt_tokens.size(); + client.n_decoded = prompt_tokens.size(); + client.i_batch = batch_token.size() - 1; + + g_seq_id += 1; } - client.n_prompt = prompt_tokens.size(); - client.n_decoded = prompt_tokens.size(); - client.i_batch = batch_token.size() - 1; - - g_seq_id += 1; - } else { - batch_token.push_back(client.sampled); - batch_pos.push_back(client.n_decoded); - batch_seq_id.push_back(client.seq_id); - batch_clients.push_back(&client); - client.n_decoded += 1; - client.i_batch = batch_token.size() - 1; } } @@ -188,6 +205,10 @@ int main(int argc, char ** argv) { const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.last_tokens, candidates, client.i_batch - i); + if (client.t_start_gen == 0) { + client.t_start_gen = ggml_time_us(); + } + // remember which tokens were sampled - used for repetition penalties during sampling client.last_tokens.erase(client.last_tokens.begin()); client.last_tokens.push_back(id); @@ -199,7 +220,10 @@ int main(int argc, char ** argv) { //printf("client %d, seq %d, token %d, pos %d, batch %d: %s\n", // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str()); - if (id == llama_token_eos(ctx) || client.n_decoded > params.n_predict || client.response.find("User:") != std::string::npos) { + if (id == llama_token_eos(ctx) || client.n_decoded > params.n_predict || + client.response.find("User:") != std::string::npos || + client.response.find('\n') != std::string::npos) { + // basic reverse prompt const size_t pos = client.response.find("User:"); if (pos != std::string::npos) { client.response = client.response.substr(0, pos); @@ -211,13 +235,18 @@ int main(int 
argc, char ** argv) { n_tokens_total += client.n_decoded - client.n_prompt; - printf("\033[1mClient %d, seq %d, prompt %d t, response %d t, speed: %.2f t/s\033[0m: \n\nInput: %s\nResponse: %s\n\n", + printf("\033[1mClient %2d, seq %4d, prompt %4d t, response %4d t, speed: PP %5.2f t/s, TG %5.2f, AVG %5.2f \033[0m: \n\nInput: %s\nResponse: %s\n\n", client.id, client.seq_id, client.n_prompt, client.n_decoded - client.n_prompt, - (double) n_tokens_total / (t_main_end - t_main_start) * 1e6, - client.input.c_str(), ::trim(client.response).c_str()); + (double) (client.n_prompt ) / (client.t_start_gen - client.t_start_prompt) * 1e6, + (double) (client.n_decoded - client.n_prompt) / (t_main_end - client.t_start_gen) * 1e6, + (double) (client.n_decoded ) / (t_main_end - client.t_start_prompt) * 1e6, + ::trim(client.input).c_str(), + ::trim(client.response).c_str()); client.seq_id = -1; } + + client.i_batch = -1; } } diff --git a/llama.cpp b/llama.cpp index 875fd5227c910..f56ecc272c14c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2606,6 +2606,8 @@ static struct ggml_cgraph * llm_build_llama( const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = llama_kv_cache_cell_max(kv_self); + //printf("n_kv = %d\n", n_kv); + const bool do_rope_shift = kv_self.has_shift || ggml_allocr_is_measure(lctx.alloc); auto & buf_compute = lctx.buf_compute; @@ -4052,6 +4054,8 @@ static bool llama_eval_internal( batch.seq_id = seq_id.data(); } + kv_self.head = 0; + if (!llama_kv_cache_find_slot(kv_self, batch)) { return false; } From 897caccdf4248a3500f3b10a355b5eb487682c2d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Sep 2023 22:00:02 +0300 Subject: [PATCH 16/55] fixes : speculative KV cache + llama worst-case graph --- examples/parallel/parallel.cpp | 20 +++++--------------- examples/speculative/speculative.cpp | 3 +++ llama.cpp | 8 ++++---- 3 files changed, 12 insertions(+), 19 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 23fda9d58d07d..a8b6f629d51d7 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -80,7 +80,7 @@ int main(int argc, char ** argv) { return 1; } - const int n_clients = 16; + const int n_clients = 4; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("parallel", "log")); @@ -116,10 +116,6 @@ int main(int argc, char ** argv) { std::vector candidates; candidates.reserve(n_vocab); - auto t_main_start = ggml_time_us(); - - int64_t n_tokens_total = 0; - llama_seq_id g_seq_id = 0; std::vector batch_token; @@ -203,6 +199,9 @@ int main(int argc, char ** argv) { continue; } + //printf("client %d, seq %d, token %d, pos %d, batch %d\n", + // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); + const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.last_tokens, candidates, client.i_batch - i); if (client.t_start_gen == 0) { @@ -233,9 +232,7 @@ int main(int argc, char ** argv) { const auto t_main_end = ggml_time_us(); - n_tokens_total += client.n_decoded - client.n_prompt; - - printf("\033[1mClient %2d, seq %4d, prompt %4d t, response %4d t, speed: PP %5.2f t/s, TG %5.2f, AVG %5.2f \033[0m: \n\nInput: %s\nResponse: %s\n\n", + printf("\033[1mClient %2d, seq %4d, prompt %4d t, response %4d t, speed: PP %5.2f t/s, TG %5.2f t/s, AVG %5.2f t/s \033[0m: \n\nInput: %s\nResponse: %s\n\n", client.id, client.seq_id, client.n_prompt, client.n_decoded - client.n_prompt, (double) (client.n_prompt ) / (client.t_start_gen - client.t_start_prompt) * 1e6, (double) 
(client.n_decoded - client.n_prompt) / (t_main_end - client.t_start_gen) * 1e6, @@ -249,13 +246,6 @@ int main(int argc, char ** argv) { client.i_batch = -1; } } - - static bool is_first = true; - if (is_first) { - t_main_start = ggml_time_us(); - n_tokens_total = 0; - is_first = false; - } } LOG_TEE("\n\n"); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 526d98e668f16..ea628211b2226 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -172,6 +172,7 @@ int main(int argc, char ** argv) { LOG("out of drafted tokens\n"); } + llama_kv_cache_rm_seq(ctx_dft, 0, n_past_dft, n_ctx); llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0), params.n_threads); ++n_past_dft; @@ -256,6 +257,7 @@ int main(int argc, char ** argv) { } // evaluate the drafted token on the draft model + llama_kv_cache_rm_seq(ctx_dft, 0, n_past_cur, n_ctx); llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0), params.n_threads); ++n_past_cur; @@ -265,6 +267,7 @@ int main(int argc, char ** argv) { } // evaluate the target model on the drafted tokens + llama_kv_cache_rm_seq(ctx_tgt, 0, n_past_tgt, n_ctx); llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0), params.n_threads); ++n_past_tgt; diff --git a/llama.cpp b/llama.cpp index f56ecc272c14c..3a4a2b6acc0d9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2604,7 +2604,7 @@ static struct ggml_cgraph * llm_build_llama( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = llama_kv_cache_cell_max(kv_self); + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : llama_kv_cache_cell_max(kv_self); //printf("n_kv = %d\n", n_kv); @@ -2775,7 +2775,7 @@ static struct ggml_cgraph * llm_build_llama( offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -6677,9 +6677,9 @@ struct llama_context * llama_new_context_with_model( ctx->alloc = ggml_allocr_new_measure(tensor_alignment); // build worst-case graph - uint32_t n_tokens = std::max((int)hparams.n_ctx, params.n_batch); + const uint32_t n_tokens = std::min((int) hparams.n_ctx, params.n_batch); llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph - ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, 0, 0)); + ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, hparams.n_ctx - n_tokens, 0)); #ifdef GGML_USE_METAL if (params.n_gpu_layers > 0) { From fa0e67782086a19d47f327d51a4be14a45e4b891 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 00:24:13 +0300 Subject: [PATCH 17/55] llama : extend batch API to select which logits to output --- examples/embd-input/embd-input-lib.cpp | 2 +- examples/parallel/parallel.cpp | 34 ++++++++++++++++++++++++-- llama.cpp | 14 +++++++++-- llama.h | 2 +- 4 files changed, 46 insertions(+), 6 deletions(-) diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 344a8b2c3262e..339612cceed6c 100644 
--- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -79,7 +79,7 @@ bool eval_float(void * model, float * input, int N){ if (n_eval > n_batch) { n_eval = n_batch; } - llama_batch batch = { uint32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, n_past, 1, 0, }; + llama_batch batch = { uint32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, }; if (llama_decode(ctx, batch, params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index a8b6f629d51d7..6e68c5afce4ba 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -82,6 +82,9 @@ int main(int argc, char ** argv) { const int n_clients = 4; + // insert new requests as soon as the previous one is done + const bool hot_swap = true; + #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("parallel", "log")); LOG_TEE("Log start\n"); @@ -121,14 +124,23 @@ int main(int argc, char ** argv) { std::vector batch_token; std::vector batch_pos; std::vector batch_seq_id; + std::vector batch_logits; std::vector batch_clients; - while (true) { + int32_t n_total_prompt = 0; + int32_t n_total_gen = 0; + + float t_avg = 0.0f; + + const int32_t n_seq = 128; + + while (g_seq_id < n_seq + n_clients) { uint32_t n_tokens = 0; batch_token.clear(); batch_pos.clear(); batch_seq_id.clear(); + batch_logits.clear(); for (auto & client : clients) { if (client.seq_id == -1) { @@ -138,6 +150,7 @@ int main(int argc, char ** argv) { batch_token.push_back(client.sampled); batch_pos.push_back(client.n_decoded); batch_seq_id.push_back(client.seq_id); + batch_logits.push_back(true); batch_clients.push_back(&client); client.n_decoded += 1; client.i_batch = batch_token.size() - 1; @@ -146,7 +159,9 @@ int main(int argc, char ** argv) { if (batch_token.empty()) { // all sequences have ended - clear the entire KV cache llama_kv_cache_rm_tokens(ctx, -1, -1); + } + if (hot_swap || batch_token.empty()) { for (auto & client : clients) { if (client.seq_id == -1) { client.seq_id = g_seq_id; @@ -166,7 +181,10 @@ int main(int argc, char ** argv) { batch_pos.push_back(i); batch_seq_id.push_back(client.seq_id); batch_clients.push_back(&client); + batch_logits.push_back(false); } + batch_logits.back() = true; + client.n_prompt = prompt_tokens.size(); client.n_decoded = prompt_tokens.size(); client.i_batch = batch_token.size() - 1; @@ -186,6 +204,7 @@ int main(int argc, char ** argv) { nullptr, batch_pos.data() + i, batch_seq_id.data() + i, + batch_logits.data() + i, 0, 0, 0, // unused }; @@ -232,14 +251,20 @@ int main(int argc, char ** argv) { const auto t_main_end = ggml_time_us(); - printf("\033[1mClient %2d, seq %4d, prompt %4d t, response %4d t, speed: PP %5.2f t/s, TG %5.2f t/s, AVG %5.2f t/s \033[0m: \n\nInput: %s\nResponse: %s\n\n", + printf("\033[1mClient %2d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed: PP %5.2f t/s, TG %5.2f t/s, AVG %5.2f t/s \033[0m: \n\nInput: %s\nResponse: %s\n\n", client.id, client.seq_id, client.n_prompt, client.n_decoded - client.n_prompt, + (t_main_end - client.t_start_prompt) / 1e6, (double) (client.n_prompt ) / (client.t_start_gen - client.t_start_prompt) * 1e6, (double) (client.n_decoded - client.n_prompt) / (t_main_end - client.t_start_gen) * 1e6, (double) (client.n_decoded ) / (t_main_end - client.t_start_prompt) * 1e6, ::trim(client.input).c_str(), ::trim(client.response).c_str()); + n_total_prompt += client.n_prompt; + 
n_total_gen += client.n_decoded - client.n_prompt; + + t_avg += (t_main_end - client.t_start_prompt) / 1e6; + client.seq_id = -1; } @@ -248,6 +273,11 @@ int main(int argc, char ** argv) { } } + LOG_TEE("\n\n"); + LOG_TEE("Total prompt tokens: %d\n", n_total_prompt); + LOG_TEE("Total gen tokens: %d\n", n_total_gen); + LOG_TEE("Avg time per seq: %.2f s\n", t_avg / n_seq); + LOG_TEE("\n\n"); llama_print_timings(ctx); diff --git a/llama.cpp b/llama.cpp index 3a4a2b6acc0d9..3e54fed7c2253 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4140,7 +4140,16 @@ static bool llama_eval_internal( if (lctx.logits_all) { logits_out.resize(n_vocab * n_tokens); - memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); + if (batch.logits) { + for (uint32_t i = 0; i < n_tokens; i++) { + if (batch.logits[i] == 0) { + continue; + } + memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); + } + } else { + memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); + } } else { // return result for just the last token logits_out.resize(n_vocab); @@ -7318,7 +7327,7 @@ int llama_eval_embd( int n_threads) { llama_kv_cache_rm_tokens(ctx->kv_self, n_past, -1); - llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, n_past, 1, 0, }; + llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, }; if (!llama_eval_internal(*ctx, batch, n_threads)) { LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); @@ -7346,6 +7355,7 @@ struct llama_batch llama_batch_get_one( /*embd =*/ nullptr, /*pos =*/ nullptr, /*seq_id =*/ nullptr, + /*logits =*/ nullptr, /*all_pos_0 =*/ pos_0, /*all_pos_1 =*/ 1, /*all_seq_id =*/ seq_id, diff --git a/llama.h b/llama.h index 4a5f2e3bf70cf..e4f02c9787da7 100644 --- a/llama.h +++ b/llama.h @@ -70,11 +70,11 @@ extern "C" { typedef struct llama_batch { uint32_t n_tokens; - // TODO: not sure about these consts - might just get in the way all the time with no benefit const llama_token * token; const float * embd; const llama_pos * pos; const llama_seq_id * seq_id; + const int8_t * logits; // if 0, do not extract logits for that token // NOTE: helpers for smooth API transition - can be deprecated in the future // for future-proof code, use the above fields instead and ignore everything below From daf4c6d360b8883ea294d7a858991d0411e898c3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 11:05:08 +0300 Subject: [PATCH 18/55] llama : fix worst case graph build --- common/common.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 4 +- llama.cpp | 175 +++++++++++++++------------ 3 files changed, 100 insertions(+), 81 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index ff1b5ee9fc6b5..52387e2a6123d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -781,7 +781,7 @@ std::tuple llama_init_from_gpt_par std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0), params.n_threads); - llama_kv_cache_keep_seq(lctx, -1); + llama_kv_cache_rm_tokens(lctx, -1, -1); llama_reset_timings(lctx); } diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 8fdbd80330f19..7a3d3b97fcfd3 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -977,7 +977,7 @@ int main(int argc, char ** argv) { test t(inst, lmodel, ctx); - llama_kv_cache_keep_seq(ctx, -1); + 
llama_kv_cache_rm_tokens(ctx, -1, -1); // warmup run if (t.n_prompt > 0) { @@ -988,7 +988,7 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_keep_seq(ctx, -1); + llama_kv_cache_rm_tokens(ctx, -1, -1); uint64_t t_start = get_time_ns(); if (t.n_prompt > 0) { diff --git a/llama.cpp b/llama.cpp index 3e54fed7c2253..5f40a9b5f194a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1024,6 +1024,9 @@ struct llama_kv_cache { uint32_t head = 0; uint32_t size = 0; + // computed before each graph build + uint32_t cell_max = 0; + std::vector cells; struct ggml_tensor * k = NULL; @@ -1314,16 +1317,15 @@ static bool llama_kv_cache_find_slot( return true; } +// find how many cells are currently in use int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { - int32_t res = 0; - - for (uint32_t i = 0; i < cache.size; i++) { + for (uint32_t i = cache.size - 2; i > 0; --i) { if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) { - res = i + 1; + return i + 1; } } - return res; + return 0; } void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t c0, int32_t c1) { @@ -2604,11 +2606,12 @@ static struct ggml_cgraph * llm_build_llama( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : llama_kv_cache_cell_max(kv_self); + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max + n_tokens; + const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; - //printf("n_kv = %d\n", n_kv); + const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; - const bool do_rope_shift = kv_self.has_shift || ggml_allocr_is_measure(lctx.alloc); + //printf("n_kv = %d\n", n_kv); auto & buf_compute = lctx.buf_compute; @@ -2713,13 +2716,26 @@ static struct ggml_cgraph * llm_build_llama( } } - // K_shift - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; + // shift the entire K-cache if needed + if (do_rope_shift) { + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + ggml_allocr_alloc(lctx.alloc, K_shift); + if (!ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) K_shift->data; + for (int i = 0; i < n_ctx; ++i) { + data[i] = kv_self.cells[i].delta; + } + } + + for (int il = 0; il < n_layer; ++il) { + ggml_build_forward_expand(gf, + ggml_rope_custom_inplace(ctx0, + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_head_kv, n_ctx, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_embd_head, 0, 0, freq_base, freq_scale)); } } @@ -2748,18 +2764,6 @@ static struct ggml_cgraph * llm_build_llama( ggml_set_name(cur, "attention_norm_0"); } - // shift the entire K-cache if needed - if (do_rope_shift) { - ggml_build_forward_expand(gf, - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale)); - } - // self-attention { // compute Q and K and RoPE them @@ -2791,13 +2795,13 @@ static 
struct ggml_cgraph * llm_build_llama( offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_self.head)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); offload_func_kq(k); ggml_set_name(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_self.head*ggml_element_size(kv_self.v)); + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); offload_func_v(v); ggml_set_name(v, "v"); @@ -2988,9 +2992,10 @@ static struct ggml_cgraph * llm_build_baichaun( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = llama_kv_cache_cell_max(kv_self); + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max + n_tokens; + const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = kv_self.has_shift || ggml_allocr_is_measure(lctx.alloc); + const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; auto & buf_compute = lctx.buf_compute; @@ -3095,13 +3100,26 @@ static struct ggml_cgraph * llm_build_baichaun( } } - // K_shift - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; + // shift the entire K-cache if needed + if (do_rope_shift) { + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + ggml_allocr_alloc(lctx.alloc, K_shift); + if (!ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) K_shift->data; + for (int i = 0; i < n_ctx; ++i) { + data[i] = kv_self.cells[i].delta; + } + } + + for (int il = 0; il < n_layer; ++il) { + ggml_build_forward_expand(gf, + ggml_rope_custom_inplace(ctx0, + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_head_kv, n_ctx, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_embd_head, 0, 0, freq_base, freq_scale)); } } @@ -3130,18 +3148,6 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_set_name(cur, "attention_norm_0"); } - // shift the entire K-cache if needed - if (do_rope_shift) { - ggml_build_forward_expand(gf, - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale)); - } - // self-attention { // compute Q and K and RoPE them @@ -3186,13 +3192,13 @@ static struct ggml_cgraph * llm_build_baichaun( offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_self.head)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); offload_func_kq(k); ggml_set_name(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, 
n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_self.head*ggml_element_size(kv_self.v)); + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); offload_func_v(v); ggml_set_name(v, "v"); @@ -3387,9 +3393,13 @@ static struct ggml_cgraph * llm_build_falcon( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = llama_kv_cache_cell_max(kv_self); + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max + n_tokens; + const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = kv_self.has_shift || ggml_allocr_is_measure(lctx.alloc); + const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + + //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n", + // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift); auto & buf_compute = lctx.buf_compute; @@ -3494,13 +3504,26 @@ static struct ggml_cgraph * llm_build_falcon( } } - // K_shift - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; + // shift the entire K-cache if needed + if (do_rope_shift) { + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + ggml_allocr_alloc(lctx.alloc, K_shift); + if (!ggml_allocr_is_measure(lctx.alloc)) { + int * data = (int *) K_shift->data; + for (int i = 0; i < n_ctx; ++i) { + data[i] = kv_self.cells[i].delta; + } + } + + for (int il = 0; il < n_layer; ++il) { + ggml_build_forward_expand(gf, + ggml_rope_custom_inplace(ctx0, + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_head_kv, n_ctx, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_embd_head, 2, 0, freq_base, freq_scale)); } } @@ -3515,18 +3538,6 @@ static struct ggml_cgraph * llm_build_falcon( } #endif // GGML_USE_CUBLAS - // shift the entire K-cache if needed - if (do_rope_shift) { - ggml_build_forward_expand(gf, - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 2, 0, freq_base, freq_scale)); - } - // self-attention // TODO: refactor into common function (shared with LLaMA) { @@ -3603,13 +3614,13 @@ static struct ggml_cgraph * llm_build_falcon( offload_func_v(Vcur->src[0]->src[0]); ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_self.head)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); offload_func_kq(k); ggml_set_name(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_self.head*ggml_element_size(kv_self.v)); + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); 
offload_func_v(v); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -3741,7 +3752,8 @@ static struct ggml_cgraph * llm_build_starcoder( const float norm_eps = hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = llama_kv_cache_cell_max(kv_self); + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max + n_tokens; + const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; auto & buf_compute = lctx.buf_compute; @@ -3853,12 +3865,12 @@ static struct ggml_cgraph * llm_build_starcoder( struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_self.head)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); ggml_set_name(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_self.head*ggml_element_size(kv_self.v)); + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); @@ -4054,8 +4066,15 @@ static bool llama_eval_internal( batch.seq_id = seq_id.data(); } + // we always start to search for a free slot from the start of the cache + // TODO: better strategies can be implemented kv_self.head = 0; + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + kv_self.cell_max = llama_kv_cache_cell_max(kv_self); + if (!llama_kv_cache_find_slot(kv_self, batch)) { return false; } From 7e2b9974d1ded87010d8d300cb09e194bf7f156d Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 19 Sep 2023 10:31:36 +0200 Subject: [PATCH 19/55] ggml-cuda : update rope implementation for parallel decoding (#3254) * ggml-cuda : update rope implementation for parallel decoding * better solution for p0 computation * fix rope * simpler rope implementation --------- Co-authored-by: Georgi Gerganov --- ggml-cuda.cu | 79 +++++++++++++++++++++++++++++++++++----------------- llama.cpp | 6 ++++ 2 files changed, 60 insertions(+), 25 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 08428ea3fab3b..14b1ecf7d2cf3 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -439,6 +439,7 @@ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullpt struct ggml_tensor_extra_gpu { void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs + bool copied; }; // this is faster on Windows @@ -4355,8 +4356,9 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, } // rope == RoPE == rotary positional embedding -static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale) { + +static __global__ void rope_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale, + const int 
p_delta_rows, const float theta_scale) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (col >= ncols) { @@ -4365,8 +4367,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c const int row = blockDim.x*blockIdx.x + threadIdx.x; const int i = row*ncols + col; + const int i2 = row/p_delta_rows; - const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2); + const int p = pos != nullptr ? pos[i2] : 0; + const float p0 = p * freq_scale; + const float theta = p0*powf(theta_scale, col/2); const float sin_theta = sinf(theta); const float cos_theta = cosf(theta); @@ -4377,8 +4382,8 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c dst[i + 1] = x0*sin_theta + x1*cos_theta; } -static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale) { +static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (col >= ncols) { @@ -4387,8 +4392,11 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco const int row = blockDim.x*blockIdx.x + threadIdx.x; const int i = row*ncols + col/2; + const int i2 = row/p_delta_rows; - const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2); + const int p = pos != nullptr ? pos[i2] : 0; + const float p0 = p * freq_scale; + const float theta = p0*powf(theta_scale, col/2); const float sin_theta = sinf(theta); const float cos_theta = cosf(theta); @@ -4399,8 +4407,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco dst[i + ncols/2] = x0*sin_theta + x1*cos_theta; } -static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx) { +static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale, const int n_ctx) { const int col = blockDim.x*blockIdx.x + threadIdx.x; const int half_n_dims = ncols/4; @@ -4410,11 +4418,13 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol const int row = blockDim.y*blockIdx.y + threadIdx.y; const int i = row*ncols + col; + const int i2 = row/p_delta_rows; const float col_theta_scale = powf(theta_scale, col); - const float p = p0 + p_delta*(row/p_delta_rows); + // FIXME: this is likely wrong + const int p = pos != nullptr ? 
pos[i2] : 0; - const float theta = min(p, p_delta*(n_ctx - 2))*col_theta_scale; + const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale; const float sin_theta = sinf(theta); const float cos_theta = cosf(theta); @@ -4424,7 +4434,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol dst[i + 0] = x0*cos_theta - x1*sin_theta; dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; - const float block_theta = max(p - p_delta*(n_ctx - 2), 0.f)*col_theta_scale; + const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale; const float sin_block_theta = sinf(block_theta); const float cos_block_theta = cosf(block_theta); @@ -5361,31 +5371,31 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons scale_f32<<>>(x, dst, scale, k); } -static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { +static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale, cudaStream_t stream) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); - rope_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); + rope_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); } -static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { +static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale, cudaStream_t stream) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); - rope_neox_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); + rope_neox_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); } -static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) { +static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) { GGML_ASSERT(ncols % 4 == 0); const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1); const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; const dim3 block_nums(num_blocks_x, nrows, 1); - rope_glm_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale, n_ctx); + rope_glm_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx); } static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, @@ -6069,9 +6079,10 @@ inline void ggml_cuda_op_rope( const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; + const int64_t ne2 = dst->ne[2]; const int64_t nrows = ggml_nrows(src0); - const int n_past = ((int32_t *) 
dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; @@ -6082,19 +6093,37 @@ inline void ggml_cuda_op_rope( memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); - const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale; + // const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale; + + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(src1->ne[0] == ne2); + GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + + int * pos = nullptr; + if ((mode & 1) == 0) { + struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + pos = (int *) src1_extra->data_device[id]; + if (!src1_extra->copied) { + CUDA_CHECK(cudaMemcpyAsync(pos, src1->data, ggml_nbytes(src1), cudaMemcpyHostToDevice, main_stream)); + src1_extra->copied = true; + } + } const bool is_neox = mode & 2; const bool is_glm = mode & 4; // compute if (is_glm) { - rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, n_ctx, main_stream); + GGML_ASSERT(false); + rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream); } else if (is_neox) { GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet"); - rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream); + rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); } else { - rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream); + rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); } (void) src1; diff --git a/llama.cpp b/llama.cpp index 5f40a9b5f194a..df0b39bfb302f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2708,6 +2708,7 @@ static struct ggml_cgraph * llm_build_llama( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + offload_func_kq(KQ_pos); ggml_allocr_alloc(lctx.alloc, KQ_pos); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) KQ_pos->data; @@ -2719,6 +2720,7 @@ static struct ggml_cgraph * llm_build_llama( // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + offload_func_kq(K_shift); ggml_allocr_alloc(lctx.alloc, K_shift); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) K_shift->data; @@ -3092,6 +3094,7 @@ static struct ggml_cgraph * llm_build_baichaun( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + offload_func_kq(KQ_pos); ggml_allocr_alloc(lctx.alloc, KQ_pos); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) KQ_pos->data; @@ -3103,6 +3106,7 @@ static struct ggml_cgraph * llm_build_baichaun( // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + offload_func_kq(K_shift); ggml_allocr_alloc(lctx.alloc, K_shift); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) K_shift->data; @@ -3496,6 +3500,7 @@ static struct ggml_cgraph * llm_build_falcon( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, 
GGML_TYPE_I32, n_tokens); + offload_func_kq(KQ_pos); ggml_allocr_alloc(lctx.alloc, KQ_pos); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) KQ_pos->data; @@ -3507,6 +3512,7 @@ static struct ggml_cgraph * llm_build_falcon( // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + offload_func_kq(K_shift); ggml_allocr_alloc(lctx.alloc, K_shift); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) K_shift->data; From 25bd2540895bf34fd71c2c64d4d3f48d0a75df4d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 11:37:02 +0300 Subject: [PATCH 20/55] make : add parallel to build + fix static functions in llama.cpp --- .gitignore | 1 + Makefile | 5 ++++- llama.cpp | 10 +++++----- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index b862a0415f279..1f841c8308a41 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,7 @@ models-mnt /server /simple /speculative +/parallel /train-text-from-scratch /vdot build-info.h diff --git a/Makefile b/Makefile index dc8ae38075653..9b631c2a56244 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel tests/test-c.o # Binaries only useful for tests TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama @@ -563,6 +563,9 @@ beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o co speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + ifdef LLAMA_METAL metal: examples/metal/metal.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) diff --git a/llama.cpp b/llama.cpp index df0b39bfb302f..abfc16c1a5b28 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1318,7 +1318,7 @@ static bool llama_kv_cache_find_slot( } // find how many cells are currently in use -int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { +static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { for (uint32_t i = cache.size - 2; i > 0; --i) { if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) { return i + 1; @@ -1328,7 +1328,7 @@ int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { return 0; } -void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t c0, int32_t c1) { +static void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t c0, int32_t c1) { if (c0 < 0) c0 = 0; if (c1 < 0) c1 = cache.size; @@ -1338,7 +1338,7 @@ void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t c0, int32_t } } -void llama_kv_cache_rm_seq( 
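Note: these cache-level helpers back the public llama_kv_cache_* calls that the examples already use for context shifting. Below is a minimal caller-side sketch of that pattern; it is not part of the patch, it uses the API names as they exist at this point in the series (they are renamed to llama_kv_cache_seq_* in a later commit), and the helper name and the choice to discard half of the non-kept tokens are illustrative assumptions.

    #include "llama.h"

    // illustrative sketch: free the oldest generated tokens of sequence 0 and slide the
    // remaining ones back, so decoding can continue without re-evaluating the kept prefix
    static void shift_context(llama_context * ctx, int & n_past, const int n_keep) {
        const int n_left    = n_past - n_keep - 1;
        const int n_discard = n_left / 2; // assumed policy: drop half of the non-kept tokens

        llama_kv_cache_rm_seq   (ctx, 0, n_keep + 1,             n_keep + n_discard + 1);
        llama_kv_cache_shift_seq(ctx, 0, n_keep + 1 + n_discard, n_past, -n_discard);

        n_past -= n_discard;
    }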
+static void llama_kv_cache_rm_seq( struct llama_kv_cache & cache, llama_seq_id seq_id, llama_pos p0, @@ -1353,7 +1353,7 @@ void llama_kv_cache_rm_seq( } } -void llama_kv_cache_keep_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) { +static void llama_kv_cache_keep_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) { for (uint32_t i = 0; i < cache.size; ++i) { if (!cache.cells[i].has_seq_id(seq_id)) { cache.cells[i].pos = -1; @@ -1362,7 +1362,7 @@ void llama_kv_cache_keep_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) } } -void llama_kv_cache_shift_seq( +static void llama_kv_cache_shift_seq( struct llama_kv_cache & cache, llama_seq_id seq_id, llama_pos p0, From 467e307931ef353676f092be4bb3b2bb3c7d3bbf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 11:45:33 +0300 Subject: [PATCH 21/55] simple : fix token counting --- examples/simple/simple.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 33ef0770b2f87..593949c87bd6c 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -81,6 +81,7 @@ int main(int argc, char ** argv) { return 1; } + n_cur += tokens_list.size(); tokens_list.clear(); // sample the next token From 36714e16d0404963052c83433251e8ea09e0f309 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 12:29:37 +0300 Subject: [PATCH 22/55] parallel : various improvements --- examples/parallel/parallel.cpp | 60 ++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 6e68c5afce4ba..3c3fe6ddb9a11 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -80,10 +80,13 @@ int main(int argc, char ** argv) { return 1; } - const int n_clients = 4; + const int n_clients = 8; // insert new requests as soon as the previous one is done - const bool hot_swap = true; + const bool hot_plug = false; + + // requests to simulate + const int32_t n_seq = 128; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("parallel", "log")); @@ -95,7 +98,6 @@ int main(int argc, char ** argv) { llama_backend_init(params.numa); llama_model * model = NULL; - llama_context * ctx = NULL; // load the target model @@ -130,11 +132,9 @@ int main(int argc, char ** argv) { int32_t n_total_prompt = 0; int32_t n_total_gen = 0; - float t_avg = 0.0f; + const auto t_main_start = ggml_time_us(); - const int32_t n_seq = 128; - - while (g_seq_id < n_seq + n_clients) { + while (true) { uint32_t n_tokens = 0; batch_token.clear(); @@ -148,7 +148,7 @@ int main(int argc, char ** argv) { } batch_token.push_back(client.sampled); - batch_pos.push_back(client.n_decoded); + batch_pos.push_back(client.n_decoded + client.n_prompt); batch_seq_id.push_back(client.seq_id); batch_logits.push_back(true); batch_clients.push_back(&client); @@ -161,12 +161,12 @@ int main(int argc, char ** argv) { llama_kv_cache_rm_tokens(ctx, -1, -1); } - if (hot_swap || batch_token.empty()) { + if (hot_plug || batch_token.empty()) { for (auto & client : clients) { - if (client.seq_id == -1) { + if (client.seq_id == -1 && g_seq_id < n_seq) { client.seq_id = g_seq_id; client.t_start_prompt = ggml_time_us(); - client.t_start_gen = 0; + client.t_start_gen = 0; client.input = k_prompts[rand() % k_prompts.size()]; client.prompt = k_system + client.input + "\nAssistant:"; @@ -186,14 +186,21 @@ int main(int argc, char ** argv) { batch_logits.back() = true; client.n_prompt = 
prompt_tokens.size(); - client.n_decoded = prompt_tokens.size(); + client.n_decoded = 0; client.i_batch = batch_token.size() - 1; g_seq_id += 1; + if (hot_plug) { + break; + } } } } + if (batch_token.empty()) { + break; + } + // process in chunks of params.n_batch for (size_t i = 0; i < batch_token.size(); i += params.n_batch) { n_tokens = std::min(params.n_batch, (int32_t) (batch_token.size() - i)); @@ -223,7 +230,9 @@ int main(int argc, char ** argv) { const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.last_tokens, candidates, client.i_batch - i); - if (client.t_start_gen == 0) { + if (client.n_decoded == 1) { + // start measuring generation time after the first token to make sure all concurrent clients + // have their prompt already processed client.t_start_gen = ggml_time_us(); } @@ -238,9 +247,10 @@ int main(int argc, char ** argv) { //printf("client %d, seq %d, token %d, pos %d, batch %d: %s\n", // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str()); - if (id == llama_token_eos(ctx) || client.n_decoded > params.n_predict || - client.response.find("User:") != std::string::npos || - client.response.find('\n') != std::string::npos) { + if (client.n_decoded > 2 && + (id == llama_token_eos(ctx) || client.n_decoded > params.n_predict || + client.response.find("User:") != std::string::npos || + client.response.find('\n') != std::string::npos)) { // basic reverse prompt const size_t pos = client.response.find("User:"); if (pos != std::string::npos) { @@ -252,18 +262,16 @@ int main(int argc, char ** argv) { const auto t_main_end = ggml_time_us(); printf("\033[1mClient %2d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed: PP %5.2f t/s, TG %5.2f t/s, AVG %5.2f t/s \033[0m: \n\nInput: %s\nResponse: %s\n\n", - client.id, client.seq_id, client.n_prompt, client.n_decoded - client.n_prompt, + client.id, client.seq_id, client.n_prompt, client.n_decoded, (t_main_end - client.t_start_prompt) / 1e6, (double) (client.n_prompt ) / (client.t_start_gen - client.t_start_prompt) * 1e6, - (double) (client.n_decoded - client.n_prompt) / (t_main_end - client.t_start_gen) * 1e6, - (double) (client.n_decoded ) / (t_main_end - client.t_start_prompt) * 1e6, + (double) (client.n_decoded ) / (t_main_end - client.t_start_gen) * 1e6, + (double) (client.n_decoded + client.n_prompt) / (t_main_end - client.t_start_prompt) * 1e6, ::trim(client.input).c_str(), ::trim(client.response).c_str()); n_total_prompt += client.n_prompt; - n_total_gen += client.n_decoded - client.n_prompt; - - t_avg += (t_main_end - client.t_start_prompt) / 1e6; + n_total_gen += client.n_decoded; client.seq_id = -1; } @@ -273,10 +281,12 @@ int main(int argc, char ** argv) { } } + const auto t_main_end = ggml_time_us(); + LOG_TEE("\n\n"); - LOG_TEE("Total prompt tokens: %d\n", n_total_prompt); - LOG_TEE("Total gen tokens: %d\n", n_total_gen); - LOG_TEE("Avg time per seq: %.2f s\n", t_avg / n_seq); + LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); + LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); + LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); LOG_TEE("\n\n"); From ddad2277827865f69456e1864973f580e6c241c3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 13:21:12 +0300 Subject: [PATCH 23/55] llama : fix cell_max 
logic + rename functions --- llama.cpp | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/llama.cpp b/llama.cpp index abfc16c1a5b28..0ecda72682784 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1319,7 +1319,7 @@ static bool llama_kv_cache_find_slot( // find how many cells are currently in use static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { - for (uint32_t i = cache.size - 2; i > 0; --i) { + for (uint32_t i = cache.size - 1; i > 0; --i) { if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) { return i + 1; } @@ -2606,7 +2606,7 @@ static struct ggml_cgraph * llm_build_llama( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max + n_tokens; + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; @@ -2994,7 +2994,7 @@ static struct ggml_cgraph * llm_build_baichaun( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max + n_tokens; + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; @@ -3397,7 +3397,7 @@ static struct ggml_cgraph * llm_build_falcon( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max + n_tokens; + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; @@ -3758,7 +3758,7 @@ static struct ggml_cgraph * llm_build_starcoder( const float norm_eps = hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max + n_tokens; + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; auto & buf_compute = lctx.buf_compute; @@ -4013,13 +4013,13 @@ static struct ggml_cgraph * llama_build_graph( return result; } -// evaluate the transformer +// decode a batch of tokens by evaluating the transformer // // - lctx: llama context // - batch: batch to evaluate // - n_threads: number of threads to use // -static bool llama_eval_internal( +static bool llama_decode_internal( llama_context & lctx, llama_batch batch, int n_threads) { @@ -4051,6 +4051,8 @@ static bool llama_eval_internal( const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; + // helpers for smoother batch API transistion + // after deprecating the llama_eval calls, these will be removed std::vector pos; std::vector seq_id; @@ -4076,14 +4078,15 @@ static bool llama_eval_internal( // TODO: better strategies can be implemented kv_self.head = 0; + if (!llama_kv_cache_find_slot(kv_self, batch)) { + return false; + } + // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important kv_self.cell_max = llama_kv_cache_cell_max(kv_self); - - if (!llama_kv_cache_find_slot(kv_self, batch)) { - return false; - } + //printf("kv_self.cell_max = %d\n", kv_self.cell_max); ggml_allocr_reset(lctx.alloc); @@ -7329,7 +7332,7 @@ int llama_eval( int n_threads) { llama_kv_cache_rm_tokens(ctx->kv_self, n_past, -1); - if (!llama_eval_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0), n_threads)) { + if (!llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0), n_threads)) { LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); return 1; } @@ -7354,7 +7357,7 @@ int llama_eval_embd( llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, }; - if (!llama_eval_internal(*ctx, batch, n_threads)) { + if (!llama_decode_internal(*ctx, batch, n_threads)) { LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); return 1; } @@ -7391,7 +7394,7 @@ int llama_decode( struct llama_context * ctx, struct llama_batch batch, int n_threads) { - if (!llama_eval_internal(*ctx, batch, n_threads)) { + if (!llama_decode_internal(*ctx, batch, n_threads)) { LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); return 1; } From 806d397c1a13416c10e01908d95b04ce0e9fa8a4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 13:21:36 +0300 Subject: [PATCH 24/55] parallel : try smaller batches when the KV cache is fragmented --- examples/parallel/parallel.cpp | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 3c3fe6ddb9a11..c35552e4a0b41 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -83,7 +83,7 @@ int main(int argc, char ** argv) { const int n_clients = 8; // insert new requests as soon as the previous one is done - const bool hot_plug = false; + const bool hot_plug = true; // requests to simulate const int32_t n_seq = 128; @@ -202,8 +202,10 @@ int main(int argc, char ** argv) { } // process in chunks of params.n_batch - for (size_t i = 0; i < batch_token.size(); i += params.n_batch) { - n_tokens = std::min(params.n_batch, (int32_t) (batch_token.size() - i)); + int32_t n_batch = params.n_batch; + + for (int32_t i = 0; i < (int32_t) batch_token.size(); i += n_batch) { + n_tokens = std::min(n_batch, (int32_t) (batch_token.size() - i)); 
llama_batch batch = { n_tokens, @@ -216,10 +218,22 @@ int main(int argc, char ** argv) { }; if (llama_decode(ctx, batch, params.n_threads)) { - LOG_TEE("%s : failed to decode batch\n", __func__); - return 1; + if (n_batch == 1) { + LOG_TEE("%s : failed to decode batch\n", __func__); + return 1; + } + + LOG("%s : failed to decode batch, retrying with n_batch = %d\n", __func__, n_batch / 2); + + // retry with half the batch size to try to find a free slot in the KV cache + n_batch /= 2; + i -= n_batch; + + continue; } + LOG_TEE("%s : decoded batch of %d tokens\n", __func__, n_tokens); + for (auto & client : clients) { if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) { continue; From 16090a5ddeb53783ca29fcc0b4ee3893fed64f90 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 13:29:29 +0300 Subject: [PATCH 25/55] parallel : fix sequence termination criteria --- examples/parallel/parallel.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index c35552e4a0b41..55fd7921ff819 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -232,7 +232,7 @@ int main(int argc, char ** argv) { continue; } - LOG_TEE("%s : decoded batch of %d tokens\n", __func__, n_tokens); + LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens); for (auto & client : clients) { if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) { @@ -262,7 +262,7 @@ int main(int argc, char ** argv) { // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str()); if (client.n_decoded > 2 && - (id == llama_token_eos(ctx) || client.n_decoded > params.n_predict || + (id == llama_token_eos(ctx) || client.n_decoded + client.n_prompt >= params.n_predict || client.response.find("User:") != std::string::npos || client.response.find('\n') != std::string::npos)) { // basic reverse prompt @@ -275,7 +275,7 @@ int main(int argc, char ** argv) { const auto t_main_end = ggml_time_us(); - printf("\033[1mClient %2d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed: PP %5.2f t/s, TG %5.2f t/s, AVG %5.2f t/s \033[0m: \n\nInput: %s\nResponse: %s\n\n", + LOG_TEE("\033[1mClient %2d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed: PP %5.2f t/s, TG %5.2f t/s, AVG %5.2f t/s \033[0m: \n\nInput: %s\nResponse: %s\n\n", client.id, client.seq_id, client.n_prompt, client.n_decoded, (t_main_end - client.t_start_prompt) / 1e6, (double) (client.n_prompt ) / (client.t_start_gen - client.t_start_prompt) * 1e6, From d37081ae5dadb2484615098b293bc46d72d82568 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 13:39:52 +0300 Subject: [PATCH 26/55] llama : silence errors KV cache errors --- llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0ecda72682784..089b87f560256 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1304,7 +1304,7 @@ static bool llama_kv_cache_find_slot( } if (n_tested >= n_ctx) { - LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); + //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); return false; } } @@ -7333,7 +7333,7 @@ int llama_eval( llama_kv_cache_rm_tokens(ctx->kv_self, n_past, -1); if (!llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0), n_threads)) { - LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); + //LLAMA_LOG_ERROR("%s: failed to decode\n", __func__); return 1; } 
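Note: the log suppressed here reflects that a failed KV-cache slot search is now a recoverable condition for the caller. The sketch below is not part of the patch; it mirrors the retry-with-half-the-batch approach used by the parallel example above and the return-value convention (positive for a missing KV slot, negative for a fatal error) introduced later in this series. The helper name and the starting chunk size are assumptions.

    #include <algorithm>
    #include "llama.h"

    // illustrative sketch: decode `tokens` for sequence 0 starting at position n_past,
    // halving the chunk size whenever no KV cache slot can be found for the whole chunk
    static int decode_with_retry(llama_context * ctx, llama_token * tokens, int32_t n_tokens,
                                 llama_pos n_past, int n_threads) {
        int32_t n_batch = 512; // assumed starting chunk size

        for (int32_t i = 0; i < n_tokens; i += n_batch) {
            const int32_t n_eval = std::min(n_batch, n_tokens - i);

            const int ret = llama_decode(ctx, llama_batch_get_one(tokens + i, n_eval, n_past + i, 0), n_threads);
            if (ret == 0) {
                continue;
            }

            if (ret < 0 || n_batch == 1) {
                return ret; // a real error, or nothing left to shrink
            }

            // no slot for n_eval tokens - retry the same position with half the batch size
            n_batch /= 2;
            i -= n_batch; // cancels the upcoming loop increment
        }

        return 0;
    }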
@@ -7358,7 +7358,7 @@ int llama_eval_embd( llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, }; if (!llama_decode_internal(*ctx, batch, n_threads)) { - LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); + //LLAMA_LOG_ERROR("%s: failed to decode\n", __func__); return 1; } @@ -7395,7 +7395,7 @@ int llama_decode( struct llama_batch batch, int n_threads) { if (!llama_decode_internal(*ctx, batch, n_threads)) { - LLAMA_LOG_ERROR("%s: failed to eval\n", __func__); + //LLAMA_LOG_ERROR("%s: failed to decode\n", __func__); return 1; } From 82e20e9ba0ec5a543a1a4fdd30b6c09c314e3f89 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 13:54:41 +0300 Subject: [PATCH 27/55] parallel : remove new line from prompt --- examples/parallel/parallel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 55fd7921ff819..c6e8d9f5c4630 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -27,8 +27,8 @@ static std::string trim(const std::string & str) { return str.substr(start, end - start); } -static std::string k_system = R"( -Transcript of a never ending dialog, where the User interacts with an Assistant. +static std::string k_system = +R"(Transcript of a never ending dialog, where the User interacts with an Assistant. The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. User: Hello, what is the temperature outside? From 4b5f3cd6bf51d7974c7480fa23a44563b0a785a4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 17:00:42 +0300 Subject: [PATCH 28/55] parallel : process system prompt once + configurable paramters + llama API --- common/common.cpp | 20 ++++- common/common.h | 3 + examples/llama-bench/llama-bench.cpp | 4 +- examples/main/main.cpp | 4 +- examples/parallel/parallel.cpp | 101 ++++++++++++++++------- examples/perplexity/perplexity.cpp | 6 +- examples/speculative/speculative.cpp | 6 +- llama.cpp | 115 ++++++++++++++++----------- llama.h | 15 +++- 9 files changed, 184 insertions(+), 90 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 52387e2a6123d..8bd0069601b9d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -317,6 +317,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_chunks = std::stoi(argv[i]); + } else if (arg == "-np" || arg == "--parallel") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_parallel = std::stoi(argv[i]); + } else if (arg == "-ns" || arg == "--sequences") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_sequences = std::stoi(argv[i]); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -360,6 +372,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.multiline_input = true; } else if (arg == "--simple-io") { params.simple_io = true; + } else if (arg == "--hot-plug") { + params.hot_plug = true; } else if (arg == "--color") { params.use_color = true; } else if (arg == "--mlock") { @@ -659,6 +673,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --chunks N max number of chunks to 
process (default: %d, -1 = all)\n", params.n_chunks); + printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); + printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); + printf(" --hot-plug enable hot-plugging of new sequences for decoding (default: disabled)\n"); if (llama_mlock_supported()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } @@ -781,7 +798,7 @@ std::tuple llama_init_from_gpt_par std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0), params.n_threads); - llama_kv_cache_rm_tokens(lctx, -1, -1); + llama_kv_cache_tokens_rm(lctx, -1, -1); llama_reset_timings(lctx); } @@ -1253,6 +1270,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed); fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); + fprintf(stream, "hot_plug: %s # default: false\n", params.hot_plug ? "true" : "false"); fprintf(stream, "temp: %f # default: 0.8\n", params.temp); const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES); diff --git a/common/common.h b/common/common.h index 9454032740634..9269a5d3604d6 100644 --- a/common/common.h +++ b/common/common.h @@ -43,6 +43,8 @@ struct gpt_params { int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_draft = 16; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) + int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_sequences = 1; // number of sequences to decode int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors @@ -108,6 +110,7 @@ struct gpt_params { bool interactive_first = false; // wait for user input immediately bool multiline_input = false; // reverse the usage of `\` bool simple_io = false; // improves compatibility with subprocesses and limited consoles + bool hot_plug = false; // hot-plug new sequences for decoding bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool ignore_eos = false; // ignore generated EOS tokens diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 7a3d3b97fcfd3..4d23db5ee23e8 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -977,7 +977,7 @@ int main(int argc, char ** argv) { test t(inst, lmodel, ctx); - llama_kv_cache_rm_tokens(ctx, -1, -1); + llama_kv_cache_tokens_rm(ctx, -1, -1); // warmup run if (t.n_prompt > 0) { @@ -988,7 +988,7 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_rm_tokens(ctx, -1, -1); + llama_kv_cache_tokens_rm(ctx, -1, -1); uint64_t t_start = get_time_ns(); if (t.n_prompt > 0) { diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 9c5f2746affbc..1ed543cbc627a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -505,8 +505,8 @@ int main(int argc, char ** argv) 
{ LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_rm_seq (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_shift_seq(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index c6e8d9f5c4630..20918fd31dca9 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -35,7 +35,7 @@ User: Hello, what is the temperature outside? Assistant: It is 72 degrees Fahrenheit. User: What is the definition of a prime number? Assistant: A prime number is a number that is divisible only by itself and 1. -User: )"; +User:)"; static std::vector k_prompts = { "What is the meaning of life?", @@ -70,7 +70,7 @@ struct client { std::string prompt; std::string response; - std::vector last_tokens; + std::vector tokens_prev; }; int main(int argc, char ** argv) { @@ -80,13 +80,14 @@ int main(int argc, char ** argv) { return 1; } - const int n_clients = 8; - - // insert new requests as soon as the previous one is done - const bool hot_plug = true; + // number of simultaneous "clients" to simulate + const int32_t n_clients = params.n_parallel; // requests to simulate - const int32_t n_seq = 128; + const int32_t n_seq = params.n_sequences; + + // insert new requests as soon as the previous one is done + const bool hot_plug = params.hot_plug; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("parallel", "log")); @@ -114,13 +115,17 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.last_tokens.resize(n_ctx); - std::fill(client.last_tokens.begin(), client.last_tokens.end(), 0); + client.tokens_prev.resize(n_ctx); + std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0); } std::vector candidates; candidates.reserve(n_vocab); + std::vector tokens_system; + tokens_system = ::llama_tokenize(ctx, k_system, true); + const uint32_t n_tokens_system = tokens_system.size(); + llama_seq_id g_seq_id = 0; std::vector batch_token; @@ -134,6 +139,44 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); + LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__); + LOG_TEE("%s: n_parallel = %d, n_sequences = %d, hot_plug = %d, system tokens = %d\n", __func__, n_clients, n_seq, hot_plug, n_tokens_system); + LOG_TEE("\n"); + + { + LOG_TEE("%s: Evaluating the system prompt ...\n", __func__); + + batch_pos.clear(); + batch_seq_id.clear(); + + for (size_t i = 0; i < n_tokens_system; ++i) { + batch_pos.push_back(i); + batch_seq_id.push_back(0); + } + + llama_batch batch = { + n_tokens_system, + tokens_system.data(), + nullptr, + batch_pos.data(), + batch_seq_id.data(), + nullptr, + 0, 0, 0, // unused + }; + + if (llama_decode(ctx, batch, params.n_threads) != 0) { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return 1; + } + + // assign the system KV cachce to all parallel sequences + for (int32_t i = 1; i < n_clients; ++i) { + llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system); + } + + LOG_TEE("\n"); + } + while (true) { uint32_t n_tokens = 0; @@ -148,7 +191,7 @@ int main(int argc, char ** argv) { } batch_token.push_back(client.sampled); - 
batch_pos.push_back(client.n_decoded + client.n_prompt); + batch_pos.push_back(n_tokens_system + client.n_prompt + client.n_decoded); batch_seq_id.push_back(client.seq_id); batch_logits.push_back(true); batch_clients.push_back(&client); @@ -158,34 +201,36 @@ int main(int argc, char ** argv) { if (batch_token.empty()) { // all sequences have ended - clear the entire KV cache - llama_kv_cache_rm_tokens(ctx, -1, -1); + for (int i = 0; i < n_clients; ++i) { + llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1); + } } if (hot_plug || batch_token.empty()) { for (auto & client : clients) { if (client.seq_id == -1 && g_seq_id < n_seq) { - client.seq_id = g_seq_id; + client.seq_id = client.id; client.t_start_prompt = ggml_time_us(); client.t_start_gen = 0; client.input = k_prompts[rand() % k_prompts.size()]; - client.prompt = k_system + client.input + "\nAssistant:"; + client.prompt = client.input + "\nAssistant:"; client.response = ""; - std::fill(client.last_tokens.begin(), client.last_tokens.end(), 0); + std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0); - std::vector prompt_tokens; - prompt_tokens = ::llama_tokenize(ctx, client.prompt, true); + std::vector tokens_prompt; + tokens_prompt = ::llama_tokenize(ctx, client.prompt, true); - for (size_t i = 0; i < prompt_tokens.size(); ++i) { - batch_token.push_back(prompt_tokens[i]); - batch_pos.push_back(i); + for (size_t i = 0; i < tokens_prompt.size(); ++i) { + batch_token.push_back(tokens_prompt[i]); + batch_pos.push_back(i + n_tokens_system); batch_seq_id.push_back(client.seq_id); batch_clients.push_back(&client); batch_logits.push_back(false); } batch_logits.back() = true; - client.n_prompt = prompt_tokens.size(); + client.n_prompt = tokens_prompt.size(); client.n_decoded = 0; client.i_batch = batch_token.size() - 1; @@ -217,9 +262,10 @@ int main(int argc, char ** argv) { 0, 0, 0, // unused }; - if (llama_decode(ctx, batch, params.n_threads)) { - if (n_batch == 1) { - LOG_TEE("%s : failed to decode batch\n", __func__); + const int ret = llama_decode(ctx, batch, params.n_threads); + if (ret != 0) { + if (n_batch == 1 || ret < 0) { + LOG_TEE("%s : failed to decode batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); return 1; } @@ -242,7 +288,7 @@ int main(int argc, char ** argv) { //printf("client %d, seq %d, token %d, pos %d, batch %d\n", // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); - const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.last_tokens, candidates, client.i_batch - i); + const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i); if (client.n_decoded == 1) { // start measuring generation time after the first token to make sure all concurrent clients @@ -251,8 +297,8 @@ int main(int argc, char ** argv) { } // remember which tokens were sampled - used for repetition penalties during sampling - client.last_tokens.erase(client.last_tokens.begin()); - client.last_tokens.push_back(id); + client.tokens_prev.erase(client.tokens_prev.begin()); + client.tokens_prev.push_back(id); const std::string token_str = llama_token_to_piece(ctx, id); client.response += token_str; @@ -271,7 +317,8 @@ int main(int argc, char ** argv) { client.response = client.response.substr(0, pos); } - llama_kv_cache_rm_seq(ctx, client.seq_id, 0, n_ctx); + // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache + llama_kv_cache_seq_rm(ctx, client.seq_id, n_tokens_system, n_ctx); const auto t_main_end = ggml_time_us(); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 8386a3d16869a..be87011d123d7 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -207,7 +207,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_keep_seq(ctx, -1); + llama_kv_cache_tokens_rm(ctx, -1, -1); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -335,7 +335,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_keep_seq(ctx, -1); + llama_kv_cache_tokens_rm(ctx, -1, -1); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -568,7 +568,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } // clear the KV cache - llama_kv_cache_keep_seq(ctx, -1); + llama_kv_cache_tokens_rm(ctx, -1, -1); auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads); if (logits.empty()) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index ea628211b2226..df93c9cd4d3af 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -172,7 +172,7 @@ int main(int argc, char ** argv) { LOG("out of drafted tokens\n"); } - llama_kv_cache_rm_seq(ctx_dft, 0, n_past_dft, n_ctx); + llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, n_ctx); llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0), params.n_threads); ++n_past_dft; @@ -257,7 +257,7 @@ int main(int argc, char ** argv) { } // evaluate the drafted token on the draft model - llama_kv_cache_rm_seq(ctx_dft, 0, n_past_cur, n_ctx); + llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, n_ctx); llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0), params.n_threads); ++n_past_cur; @@ -267,7 +267,7 @@ int main(int argc, char ** argv) { } // evaluate the target model on the drafted tokens - llama_kv_cache_rm_seq(ctx_tgt, 0, n_past_tgt, n_ctx); + llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_ctx); llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0), params.n_threads); ++n_past_tgt; diff --git a/llama.cpp b/llama.cpp index 089b87f560256..12b8c49d0b18c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1328,7 +1328,7 @@ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) { return 0; } -static void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t c0, int32_t c1) { +static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) { if (c0 < 0) c0 = 0; if (c1 < 0) c1 = cache.size; @@ -1338,7 +1338,7 @@ static void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t c0, } } -static void llama_kv_cache_rm_seq( +static void llama_kv_cache_seq_rm( struct llama_kv_cache & cache, llama_seq_id seq_id, llama_pos p0, @@ -1353,7 +1353,20 @@ static void llama_kv_cache_rm_seq( } } -static void llama_kv_cache_keep_seq(struct llama_kv_cache & cache, llama_seq_id seq_id) { +static void llama_kv_cache_seq_cp( + struct llama_kv_cache & cache, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + 
llama_pos p0, + llama_pos p1) { + for (uint32_t i = 0; i < cache.size; ++i) { + if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { + cache.cells[i].seq_id.insert(seq_id_dst); + } + } +} + +static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) { for (uint32_t i = 0; i < cache.size; ++i) { if (!cache.cells[i].has_seq_id(seq_id)) { cache.cells[i].pos = -1; @@ -1362,7 +1375,7 @@ static void llama_kv_cache_keep_seq(struct llama_kv_cache & cache, llama_seq_id } } -static void llama_kv_cache_shift_seq( +static void llama_kv_cache_seq_shift( struct llama_kv_cache & cache, llama_seq_id seq_id, llama_pos p0, @@ -4019,7 +4032,11 @@ static struct ggml_cgraph * llama_build_graph( // - batch: batch to evaluate // - n_threads: number of threads to use // -static bool llama_decode_internal( +// return 0 on success +// return positive int on warning +// return negative int on error +// +static int llama_decode_internal( llama_context & lctx, llama_batch batch, int n_threads) { @@ -4027,7 +4044,7 @@ static bool llama_decode_internal( if (n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); - return false; + return -1; } GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT @@ -4079,7 +4096,7 @@ static bool llama_decode_internal( kv_self.head = 0; if (!llama_kv_cache_find_slot(kv_self, batch)) { - return false; + return 1; } // a heuristic, to avoid attending the full cache if it is not yet utilized @@ -4203,7 +4220,14 @@ static bool llama_decode_internal( lctx.n_p_eval += n_tokens; } - return true; + // get a more accurate load time, upon first eval + // TODO: fix this + if (!lctx.has_evaluated_once) { + lctx.t_load_us = ggml_time_us() - lctx.t_start_us; + lctx.has_evaluated_once = true; + } + + return 0; } // @@ -6920,20 +6944,24 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) { return ctx->kv_self.head; } -void llama_kv_cache_rm_tokens(struct llama_context * ctx, int32_t c0, int32_t c1) { - llama_kv_cache_rm_tokens(ctx->kv_self, c0, c1); +void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) { + llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1); +} + +void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); } -void llama_kv_cache_rm_seq(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - llama_kv_cache_rm_seq(ctx->kv_self, seq_id, p0, p1); +void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1); } -void llama_kv_cache_keep_seq(struct llama_context * ctx, llama_seq_id seq_id) { - llama_kv_cache_keep_seq(ctx->kv_self, seq_id); +void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) { + llama_kv_cache_seq_keep(ctx->kv_self, seq_id); } -void llama_kv_cache_shift_seq(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { - llama_kv_cache_shift_seq(ctx->kv_self, seq_id, p0, p1, delta); +void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) { + llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta); } // Returns the *maximum* size of the state @@ -7330,21 +7358,18 @@ int llama_eval( uint32_t n_tokens, int n_past, int 
n_threads) { - llama_kv_cache_rm_tokens(ctx->kv_self, n_past, -1); + llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); - if (!llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0), n_threads)) { - //LLAMA_LOG_ERROR("%s: failed to decode\n", __func__); - return 1; - } + const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0), n_threads); + if (ret != 0) { + if (ret < 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); + } - // get a more accurate load time, upon first eval - // TODO: fix this - if (!ctx->has_evaluated_once) { - ctx->t_load_us = ggml_time_us() - ctx->t_start_us; - ctx->has_evaluated_once = true; + return ret; } - return 0; + return ret; } int llama_eval_embd( @@ -7353,23 +7378,20 @@ int llama_eval_embd( uint32_t n_tokens, int n_past, int n_threads) { - llama_kv_cache_rm_tokens(ctx->kv_self, n_past, -1); + llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, }; - if (!llama_decode_internal(*ctx, batch, n_threads)) { - //LLAMA_LOG_ERROR("%s: failed to decode\n", __func__); - return 1; - } + const int ret = llama_decode_internal(*ctx, batch, n_threads); + if (ret != 0) { + if (ret < 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); + } - // get a more accurate load time, upon first eval - // TODO: fix this - if (!ctx->has_evaluated_once) { - ctx->t_load_us = ggml_time_us() - ctx->t_start_us; - ctx->has_evaluated_once = true; + return ret; } - return 0; + return ret; } struct llama_batch llama_batch_get_one( @@ -7394,19 +7416,16 @@ int llama_decode( struct llama_context * ctx, struct llama_batch batch, int n_threads) { - if (!llama_decode_internal(*ctx, batch, n_threads)) { - //LLAMA_LOG_ERROR("%s: failed to decode\n", __func__); - return 1; - } + const int ret = llama_decode_internal(*ctx, batch, n_threads); + if (ret != 0) { + if (ret < 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); + } - // get a more accurate load time, upon first eval - // TODO: fix this - if (!ctx->has_evaluated_once) { - ctx->t_load_us = ggml_time_us() - ctx->t_start_us; - ctx->has_evaluated_once = true; + return ret; } - return 0; + return ret; } float * llama_get_logits(struct llama_context * ctx) { diff --git a/llama.h b/llama.h index e4f02c9787da7..2f344eb14d9c7 100644 --- a/llama.h +++ b/llama.h @@ -322,17 +322,20 @@ extern "C" { "avoid using this, it will be removed in the future, instead - count the tokens in user code"); // Remove all tokens data of cells in [c0, c1) - LLAMA_API void llama_kv_cache_rm_tokens(struct llama_context * ctx, int32_t c0, int32_t c1); + LLAMA_API void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) - LLAMA_API void llama_kv_cache_rm_seq(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1); + LLAMA_API void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1); + + // Copy all tokens that belong to the specified sequence to another sequence + LLAMA_API void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); // Removes all tokens that do not belong to the specified sequence - LLAMA_API void llama_kv_cache_keep_seq(struct llama_context * ctx, llama_seq_id seq_id); + LLAMA_API void 
llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly - LLAMA_API void llama_kv_cache_shift_seq(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); + LLAMA_API void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); // // State / sessions @@ -391,6 +394,10 @@ extern "C" { llama_pos pos_0, llama_seq_id seq_id); + // Positive return values do not mean a fatal error, but rather a warning. + // 0 - success + // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) + // < 0 - error LLAMA_API int llama_decode( struct llama_context * ctx, struct llama_batch batch, From 8a9aca37c196b3496306d6c5bc78755394bd661a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 23:34:30 +0300 Subject: [PATCH 29/55] parallel : remove question with short answers --- examples/parallel/parallel.cpp | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 20918fd31dca9..8f2ce2e98ec5a 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -31,17 +31,14 @@ static std::string k_system = R"(Transcript of a never ending dialog, where the User interacts with an Assistant. The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. -User: Hello, what is the temperature outside? -Assistant: It is 72 degrees Fahrenheit. -User: What is the definition of a prime number? -Assistant: A prime number is a number that is divisible only by itself and 1. +User: Recommend a nice restaurant in the area. +Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays. +User: Who is Richard Feynman? +Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
User:)"; static std::vector k_prompts = { "What is the meaning of life?", - "What is the population of Europe?", - "List all planets in the Solar System.", - "What is the capital of France?", "Tell me an interesting fact about llamas.", "What is the best way to cook a steak?", "Are you familiar with the Special Theory of Relativity and can you explain it to me?", @@ -74,6 +71,8 @@ struct client { }; int main(int argc, char ** argv) { + srand(1234); + gpt_params params; if (gpt_params_parse(argc, argv, params) == false) { @@ -177,6 +176,8 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); } + LOG_TEE("Processing requests ...\n\n"); + while (true) { uint32_t n_tokens = 0; @@ -192,7 +193,7 @@ int main(int argc, char ** argv) { batch_token.push_back(client.sampled); batch_pos.push_back(n_tokens_system + client.n_prompt + client.n_decoded); - batch_seq_id.push_back(client.seq_id); + batch_seq_id.push_back(client.id); batch_logits.push_back(true); batch_clients.push_back(&client); client.n_decoded += 1; @@ -209,7 +210,7 @@ int main(int argc, char ** argv) { if (hot_plug || batch_token.empty()) { for (auto & client : clients) { if (client.seq_id == -1 && g_seq_id < n_seq) { - client.seq_id = client.id; + client.seq_id = g_seq_id; client.t_start_prompt = ggml_time_us(); client.t_start_gen = 0; @@ -224,7 +225,7 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < tokens_prompt.size(); ++i) { batch_token.push_back(tokens_prompt[i]); batch_pos.push_back(i + n_tokens_system); - batch_seq_id.push_back(client.seq_id); + batch_seq_id.push_back(client.id); batch_clients.push_back(&client); batch_logits.push_back(false); } @@ -236,7 +237,7 @@ int main(int argc, char ** argv) { g_seq_id += 1; if (hot_plug) { - break; + //break; } } } @@ -318,11 +319,11 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache - llama_kv_cache_seq_rm(ctx, client.seq_id, n_tokens_system, n_ctx); + llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, n_ctx); const auto t_main_end = ggml_time_us(); - LOG_TEE("\033[1mClient %2d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed: PP %5.2f t/s, TG %5.2f t/s, AVG %5.2f t/s \033[0m: \n\nInput: %s\nResponse: %s\n\n", + LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed: PP %5.2f t/s, TG %5.2f t/s, AVG %5.2f t/s \033[0m: \n\nInput: %s\nResponse: %s\n\n", client.id, client.seq_id, client.n_prompt, client.n_decoded, (t_main_end - client.t_start_prompt) / 1e6, (double) (client.n_prompt ) / (client.t_start_gen - client.t_start_prompt) * 1e6, From eed3fd4234ea055493657ea58b4ad14e6797922f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 23:47:47 +0300 Subject: [PATCH 30/55] parallel : count cache misses --- examples/parallel/parallel.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 8f2ce2e98ec5a..b674a034425bf 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -135,6 +135,7 @@ int main(int argc, char ** argv) { int32_t n_total_prompt = 0; int32_t n_total_gen = 0; + int32_t n_cache_miss = 0; const auto t_main_start = ggml_time_us(); @@ -272,6 +273,8 @@ int main(int argc, char ** argv) { LOG("%s : failed to decode batch, retrying with n_batch = %d\n", __func__, n_batch / 2); + n_cache_miss += 1; + // retry with half the batch size to try to find a free slot in the KV cache n_batch /= 2; i -= n_batch; @@ -349,6 +352,7 @@ int main(int argc, char ** argv) { LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); + LOG_TEE("Cache misses: %6d\n", n_cache_miss); LOG_TEE("\n\n"); From 6028879f56a9b8c2ac1b0d14270f38998c8ec0f2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Sep 2023 23:50:05 +0300 Subject: [PATCH 31/55] parallel : print misses on each request --- examples/parallel/parallel.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index b674a034425bf..0f1df36d61ab2 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -326,12 +326,10 @@ int main(int argc, char ** argv) { const auto t_main_end = ggml_time_us(); - LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed: PP %5.2f t/s, TG %5.2f t/s, AVG %5.2f t/s \033[0m: \n\nInput: %s\nResponse: %s\n\n", + LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, cache miss %d \033[0m: \n\nInput: %s\nResponse: %s\n\n", client.id, client.seq_id, client.n_prompt, client.n_decoded, (t_main_end - client.t_start_prompt) / 1e6, - (double) (client.n_prompt ) / (client.t_start_gen - client.t_start_prompt) * 1e6, - (double) (client.n_decoded ) / (t_main_end - client.t_start_gen) * 1e6, - (double) (client.n_decoded + client.n_prompt) / (t_main_end - client.t_start_prompt) * 1e6, + n_cache_miss, ::trim(client.input).c_str(), ::trim(client.response).c_str()); From 7b7472ee26158f87607d85e145a1ab927d202562 Mon Sep 
17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 Sep 2023 00:35:10 +0300 Subject: [PATCH 32/55] parallel : minor --- examples/parallel/parallel.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 0f1df36d61ab2..a7b5bad71f870 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -326,9 +326,10 @@ int main(int argc, char ** argv) { const auto t_main_end = ggml_time_us(); - LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, cache miss %d \033[0m: \n\nInput: %s\nResponse: %s\n\n", + LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n", client.id, client.seq_id, client.n_prompt, client.n_decoded, (t_main_end - client.t_start_prompt) / 1e6, + (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6, n_cache_miss, ::trim(client.input).c_str(), ::trim(client.response).c_str()); From e1067efbfa0895115cc639ead8b22cdceef4eca1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 Sep 2023 09:17:05 +0300 Subject: [PATCH 33/55] llama : fix n_kv to never become 0 --- llama.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/llama.cpp b/llama.cpp index 12b8c49d0b18c..f38a033a59fbd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1025,7 +1025,7 @@ struct llama_kv_cache { uint32_t size = 0; // computed before each graph build - uint32_t cell_max = 0; + uint32_t n = 0; std::vector cells; @@ -2619,7 +2619,7 @@ static struct ggml_cgraph * llm_build_llama( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max; + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; @@ -3007,7 +3007,7 @@ static struct ggml_cgraph * llm_build_baichaun( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max; + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; @@ -3410,7 +3410,7 @@ static struct ggml_cgraph * llm_build_falcon( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max; + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; @@ -3771,7 +3771,7 @@ static struct ggml_cgraph * llm_build_starcoder( const float norm_eps = hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max; + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; auto & buf_compute = lctx.buf_compute; @@ -4102,8 +4102,10 @@ static int llama_decode_internal( // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - kv_self.cell_max = llama_kv_cache_cell_max(kv_self); - //printf("kv_self.cell_max = %d\n", kv_self.cell_max); + //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA? + kv_self.n = std::max(32, llama_kv_cache_cell_max(kv_self)); + + //printf("kv_self.n = %d\n", kv_self.n); ggml_allocr_reset(lctx.alloc); From a1327c71c6cfee3b1697aa6646f52f1de249120b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 Sep 2023 09:24:02 +0300 Subject: [PATCH 34/55] parallel : rename hot-plug to continuous-batching --- common/common.cpp | 8 ++++---- common/common.h | 2 +- examples/parallel/parallel.cpp | 12 ++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 8bd0069601b9d..303b38240d5b1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -372,8 +372,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.multiline_input = true; } else if (arg == "--simple-io") { params.simple_io = true; - } else if (arg == "--hot-plug") { - params.hot_plug = true; + } else if (arg == "-cb" || arg == "--cont-batching") { + params.cont_batching = true; } else if (arg == "--color") { params.use_color = true; } else if (arg == "--mlock") { @@ -675,7 +675,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); - printf(" --hot-plug enable hot-plugging of new sequences for decoding (default: disabled)\n"); + printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); if (llama_mlock_supported()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } @@ -1270,7 +1270,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed); fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); - fprintf(stream, "hot_plug: %s # default: false\n", params.hot_plug ? "true" : "false"); + fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? 
"true" : "false"); fprintf(stream, "temp: %f # default: 0.8\n", params.temp); const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES); diff --git a/common/common.h b/common/common.h index 9269a5d3604d6..4218f4698191d 100644 --- a/common/common.h +++ b/common/common.h @@ -110,7 +110,7 @@ struct gpt_params { bool interactive_first = false; // wait for user input immediately bool multiline_input = false; // reverse the usage of `\` bool simple_io = false; // improves compatibility with subprocesses and limited consoles - bool hot_plug = false; // hot-plug new sequences for decoding + bool cont_batching = false; // insert new sequences for decoding on-the-fly bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool ignore_eos = false; // ignore generated EOS tokens diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index a7b5bad71f870..4af4d2cd27f8d 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -86,7 +86,7 @@ int main(int argc, char ** argv) { const int32_t n_seq = params.n_sequences; // insert new requests as soon as the previous one is done - const bool hot_plug = params.hot_plug; + const bool cont_batching = params.cont_batching; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("parallel", "log")); @@ -140,7 +140,7 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__); - LOG_TEE("%s: n_parallel = %d, n_sequences = %d, hot_plug = %d, system tokens = %d\n", __func__, n_clients, n_seq, hot_plug, n_tokens_system); + LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); LOG_TEE("\n"); { @@ -208,7 +208,7 @@ int main(int argc, char ** argv) { } } - if (hot_plug || batch_token.empty()) { + if (cont_batching || batch_token.empty()) { for (auto & client : clients) { if (client.seq_id == -1 && g_seq_id < n_seq) { client.seq_id = g_seq_id; @@ -237,9 +237,9 @@ int main(int argc, char ** argv) { client.i_batch = batch_token.size() - 1; g_seq_id += 1; - if (hot_plug) { - //break; - } + //if (cont_batching) { + // break; + //} } } } From addae65fd44d362995acd8c05b99c3351c214df8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 Sep 2023 10:46:18 +0300 Subject: [PATCH 35/55] llama : improve llama_batch API + simplify parallel example --- examples/parallel/parallel.cpp | 107 +++++++++++++-------------- examples/perplexity/perplexity.cpp | 2 +- examples/simple/simple.cpp | 8 +- examples/speculative/speculative.cpp | 2 +- llama.cpp | 30 +++++++- llama.h | 32 +++++--- 6 files changed, 111 insertions(+), 70 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 4af4d2cd27f8d..e252b0f53fa00 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -127,11 +127,7 @@ int main(int argc, char ** argv) { llama_seq_id g_seq_id = 0; - std::vector batch_token; - std::vector batch_pos; - std::vector batch_seq_id; - std::vector batch_logits; - std::vector batch_clients; + llama_batch batch = llama_batch_init(params.n_batch, 0); int32_t n_total_prompt = 0; int32_t n_total_gen = 0; @@ -146,24 +142,15 @@ int main(int argc, char ** argv) { { LOG_TEE("%s: Evaluating the system prompt ...\n", __func__); - batch_pos.clear(); - batch_seq_id.clear(); + batch.n_tokens = n_tokens_system; - for (size_t i = 0; i < 
n_tokens_system; ++i) { - batch_pos.push_back(i); - batch_seq_id.push_back(0); + for (uint32_t i = 0; i < batch.n_tokens; ++i) { + batch.token[i] = tokens_system[i]; + batch.pos[i] = i; + batch.seq_id[i] = 0; + batch.logits[i] = false; } - llama_batch batch = { - n_tokens_system, - tokens_system.data(), - nullptr, - batch_pos.data(), - batch_seq_id.data(), - nullptr, - 0, 0, 0, // unused - }; - if (llama_decode(ctx, batch, params.n_threads) != 0) { LOG_TEE("%s: llama_decode() failed\n", __func__); return 1; @@ -180,63 +167,72 @@ int main(int argc, char ** argv) { LOG_TEE("Processing requests ...\n\n"); while (true) { - uint32_t n_tokens = 0; - - batch_token.clear(); - batch_pos.clear(); - batch_seq_id.clear(); - batch_logits.clear(); + batch.n_tokens = 0; + // decode any currently ongoing sequences for (auto & client : clients) { if (client.seq_id == -1) { continue; } - batch_token.push_back(client.sampled); - batch_pos.push_back(n_tokens_system + client.n_prompt + client.n_decoded); - batch_seq_id.push_back(client.id); - batch_logits.push_back(true); - batch_clients.push_back(&client); + batch.token [batch.n_tokens] = client.sampled; + batch.pos [batch.n_tokens] = n_tokens_system + client.n_prompt + client.n_decoded; + batch.seq_id[batch.n_tokens] = client.id; + batch.logits[batch.n_tokens] = true; + client.n_decoded += 1; - client.i_batch = batch_token.size() - 1; + client.i_batch = batch.n_tokens; + + batch.n_tokens += 1; } - if (batch_token.empty()) { + if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache for (int i = 0; i < n_clients; ++i) { llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1); } } - if (cont_batching || batch_token.empty()) { + // insert new sequences for decoding + if (cont_batching || batch.n_tokens == 0) { for (auto & client : clients) { if (client.seq_id == -1 && g_seq_id < n_seq) { client.seq_id = g_seq_id; + client.t_start_prompt = ggml_time_us(); client.t_start_gen = 0; - client.input = k_prompts[rand() % k_prompts.size()]; - client.prompt = client.input + "\nAssistant:"; + client.input = k_prompts[rand() % k_prompts.size()]; + client.prompt = client.input + "\nAssistant:"; client.response = ""; + std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0); std::vector tokens_prompt; tokens_prompt = ::llama_tokenize(ctx, client.prompt, true); for (size_t i = 0; i < tokens_prompt.size(); ++i) { - batch_token.push_back(tokens_prompt[i]); - batch_pos.push_back(i + n_tokens_system); - batch_seq_id.push_back(client.id); - batch_clients.push_back(&client); - batch_logits.push_back(false); + batch.token [batch.n_tokens] = tokens_prompt[i]; + batch.pos [batch.n_tokens] = i + n_tokens_system; + batch.seq_id[batch.n_tokens] = client.id; + batch.logits[batch.n_tokens] = false; + batch.n_tokens += 1; + } + + // extract the logits only for the last token + if (batch.n_tokens > 0) { + batch.logits[batch.n_tokens - 1] = true; } - batch_logits.back() = true; client.n_prompt = tokens_prompt.size(); client.n_decoded = 0; - client.i_batch = batch_token.size() - 1; + client.i_batch = batch.n_tokens - 1; + + LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); g_seq_id += 1; + + // insert new requests one-by-one //if (cont_batching) { // break; //} @@ -244,34 +240,35 @@ int main(int argc, char ** argv) { } } - if (batch_token.empty()) { + if (batch.n_tokens == 0) { break; } // process in chunks of params.n_batch int32_t n_batch = params.n_batch; - for (int32_t i = 0; i < (int32_t) batch_token.size(); i 
+= n_batch) { - n_tokens = std::min(n_batch, (int32_t) (batch_token.size() - i)); + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { + const uint32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); - llama_batch batch = { + llama_batch batch_view = { n_tokens, - batch_token.data() + i, + batch.token + i, nullptr, - batch_pos.data() + i, - batch_seq_id.data() + i, - batch_logits.data() + i, + batch.pos + i, + batch.seq_id + i, + batch.logits + i, 0, 0, 0, // unused }; - const int ret = llama_decode(ctx, batch, params.n_threads); + const int ret = llama_decode(ctx, batch_view, params.n_threads); if (ret != 0) { if (n_batch == 1 || ret < 0) { - LOG_TEE("%s : failed to decode batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); + // if you get here, it means the KV cache is full - try increasing it via the context size + LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); return 1; } - LOG("%s : failed to decode batch, retrying with n_batch = %d\n", __func__, n_batch / 2); + LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); n_cache_miss += 1; @@ -357,6 +354,8 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); + llama_batch_free(batch); + llama_free(ctx); llama_free_model(model); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index be87011d123d7..19063416718e8 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -419,7 +419,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } static std::vector hellaswag_evaluate_tokens( - llama_context * ctx, const std::vector& tokens, int n_past, int n_batch, int n_vocab, int n_thread + llama_context * ctx, std::vector & tokens, int n_past, int n_batch, int n_vocab, int n_thread ) { std::vector result; result.reserve(tokens.size() * n_vocab); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 593949c87bd6c..8a9a1bf5461a6 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -10,10 +10,12 @@ int main(int argc, char ** argv) { gpt_params params; if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]); + printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]); return 1 ; } + int n_parallel = 1; + if (argc >= 2) { params.model = argv[1]; } @@ -22,6 +24,10 @@ int main(int argc, char ** argv) { params.prompt = argv[2]; } + if (argc >= 4) { + n_parallel = std::atoi(argv[3]); + } + if (params.prompt.empty()) { params.prompt = "Hello my name is"; } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index df93c9cd4d3af..2445d78dc9788 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -134,7 +134,7 @@ int main(int argc, char ** argv) { while (true) { // sample from the target model - const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft); + llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft); // remember which tokens were sampled - used for repetition penalties during sampling last_tokens.erase(last_tokens.begin()); diff --git a/llama.cpp b/llama.cpp index f38a033a59fbd..f47d9b5984994 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7356,7 +7356,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi int llama_eval( struct 
llama_context * ctx, - const llama_token * tokens, + llama_token * tokens, uint32_t n_tokens, int n_past, int n_threads) { @@ -7376,7 +7376,7 @@ int llama_eval( int llama_eval_embd( struct llama_context * ctx, - const float * embd, + float * embd, uint32_t n_tokens, int n_past, int n_threads) { @@ -7397,7 +7397,7 @@ int llama_eval_embd( } struct llama_batch llama_batch_get_one( - const llama_token * tokens, + llama_token * tokens, uint32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { @@ -7414,6 +7414,30 @@ struct llama_batch llama_batch_get_one( }; } +struct llama_batch llama_batch_init(uint32_t n_tokens, int32_t embd) { + llama_batch batch = { n_tokens, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; + + if (embd) { + batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); + } else { + batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens); + } + + batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); + batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens); + batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); + + return batch; +} + +void llama_batch_free(struct llama_batch batch) { + if (batch.token) free(batch.token); + if (batch.embd) free(batch.embd); + if (batch.pos) free(batch.pos); + if (batch.seq_id) free(batch.seq_id); + if (batch.logits) free(batch.logits); +} + int llama_decode( struct llama_context * ctx, struct llama_batch batch, diff --git a/llama.h b/llama.h index 2f344eb14d9c7..3a46e1ea049d3 100644 --- a/llama.h +++ b/llama.h @@ -70,11 +70,11 @@ extern "C" { typedef struct llama_batch { uint32_t n_tokens; - const llama_token * token; - const float * embd; - const llama_pos * pos; - const llama_seq_id * seq_id; - const int8_t * logits; // if 0, do not extract logits for that token + llama_token * token; + float * embd; + llama_pos * pos; + llama_seq_id * seq_id; + int8_t * logits; // if 0, do not extract logits for that token // NOTE: helpers for smooth API transition - can be deprecated in the future // for future-proof code, use the above fields instead and ignore everything below @@ -84,7 +84,7 @@ extern "C" { llama_pos all_pos_0; // used if pos == NULL llama_pos all_pos_1; // used if pos == NULL llama_seq_id all_seq_id; // used if seq_id == NULL - } llama_seq; + } llama_batch; enum llama_log_level { LLAMA_LOG_LEVEL_ERROR = 2, @@ -366,34 +366,46 @@ extern "C" { // tokens + n_tokens is the provided batch of new tokens to process // n_past is the number of tokens to use from previous eval calls // Returns 0 on success + // DEPRECATED: use llama_decode() instead LLAMA_API DEPRECATED(int llama_eval( struct llama_context * ctx, - const llama_token * tokens, + llama_token * tokens, uint32_t n_tokens, int n_past, int n_threads), "please use llama_decode() instead"); // Same as llama_eval, but use float matrix input directly. 
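For orientation, a minimal usage sketch of the batch API added above (an illustration only, not part of the patch; it assumes the llama_batch_init/llama_batch_free semantics shown in this hunk, and `ctx`, `tokens` and `n_threads` stand in for an existing context, a tokenized prompt and a thread count):

    // allocate a token batch with room for up to 512 tokens (embd == 0 -> token mode)
    llama_batch batch = llama_batch_init(512, 0);

    // submit the prompt as a single sequence with seq_id 0
    batch.n_tokens = (int32_t) tokens.size();
    for (int32_t i = 0; i < (int32_t) batch.n_tokens; ++i) {
        batch.token [i] = tokens[i];
        batch.pos   [i] = i;      // absolute position within the sequence
        batch.seq_id[i] = 0;
        batch.logits[i] = false;  // logits are only needed ...
    }
    batch.logits[batch.n_tokens - 1] = true; // ... for the last prompt token

    if (llama_decode(ctx, batch, n_threads) != 0) {
        fprintf(stderr, "llama_decode() failed\n");
    }

    llama_batch_free(batch); // the members are heap-allocated, so the batch must be freed
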
+ // DEPRECATED: use llama_decode() instead LLAMA_API DEPRECATED(int llama_eval_embd( struct llama_context * ctx, - const float * embd, + float * embd, uint32_t n_tokens, int n_past, int n_threads), "please use llama_decode() instead"); // Return batch for single sequence of tokens starting at pos_0 - // If pos_0 == 0, the clear_kv flag will be auto set to true // // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it // LLAMA_API struct llama_batch llama_batch_get_one( - const llama_token * tokens, + llama_token * tokens, uint32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id); + // Allocates a batch of tokens on the heap + // The batch needs to be freed with llama_batch_free() + // If embd > 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float) + // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token + // The rest of the llama_batch members are allocated with size n_tokens + // All members are left uninitialized + LLAMA_API struct llama_batch llama_batch_init(uint32_t n_tokens, int32_t embd); + + // Frees a batch of tokens allocated with llama_batch_init() + LLAMA_API void llama_batch_free(struct llama_batch batch); + // Positive return values does not mean a fatal error, but rather a warning. // 0 - success // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) From b377bf2266fcc7e98bef0ac2b8318b8b7c523947 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 Sep 2023 13:06:34 +0300 Subject: [PATCH 36/55] simple : add parallel decoding support --- common/common.cpp | 6 +- examples/embd-input/embd-input-lib.cpp | 8 +- examples/parallel/parallel.cpp | 8 +- examples/server/server.cpp | 6 +- examples/simple/simple.cpp | 182 +++++++++++++++++++------ llama.cpp | 34 ++--- llama.h | 15 +- 7 files changed, 185 insertions(+), 74 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 303b38240d5b1..6da466bbe4214 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -956,11 +956,11 @@ llama_token llama_sample_token( if (mirostat == 1) { static float mirostat_mu = 2.0f * mirostat_tau; const int mirostat_m = 100; - llama_sample_temperature(ctx, &cur_p, temp); + llama_sample_temp(ctx, &cur_p, temp); id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); } else if (mirostat == 2) { static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &cur_p, temp); + llama_sample_temp(ctx, &cur_p, temp); id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu); } else { // Temperature sampling @@ -968,7 +968,7 @@ llama_token llama_sample_token( llama_sample_tail_free (ctx, &cur_p, tfs_z, 1); llama_sample_typical (ctx, &cur_p, typical_p, 1); llama_sample_top_p (ctx, &cur_p, top_p, 1); - llama_sample_temperature(ctx, &cur_p, temp); + llama_sample_temp(ctx, &cur_p, temp); { const int n_top = 10; diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 339612cceed6c..f0089e1f9557c 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -79,7 +79,7 @@ bool eval_float(void * model, float * input, int N){ if (n_eval > n_batch) { n_eval = n_batch; } - llama_batch batch = { uint32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, }; + llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 
0, }; if (llama_decode(ctx, batch, params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; @@ -183,11 +183,11 @@ llama_token sampling_id(struct MyModel* mymodel) { if (mirostat == 1) { static float mirostat_mu = 2.0f * mirostat_tau; const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); } else if (mirostat == 2) { static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); } else { // Temperature sampling @@ -195,7 +195,7 @@ llama_token sampling_id(struct MyModel* mymodel) { llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1); llama_sample_typical(ctx, &candidates_p, typical_p, 1); llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); id = llama_sample_token(ctx, &candidates_p); } } diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index e252b0f53fa00..b8bd6d93662e4 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -123,7 +123,7 @@ int main(int argc, char ** argv) { std::vector tokens_system; tokens_system = ::llama_tokenize(ctx, k_system, true); - const uint32_t n_tokens_system = tokens_system.size(); + const int32_t n_tokens_system = tokens_system.size(); llama_seq_id g_seq_id = 0; @@ -144,7 +144,7 @@ int main(int argc, char ** argv) { batch.n_tokens = n_tokens_system; - for (uint32_t i = 0; i < batch.n_tokens; ++i) { + for (int32_t i = 0; i < batch.n_tokens; ++i) { batch.token[i] = tokens_system[i]; batch.pos[i] = i; batch.seq_id[i] = 0; @@ -156,7 +156,7 @@ int main(int argc, char ** argv) { return 1; } - // assign the system KV cachce to all parallel sequences + // assign the system KV cache to all parallel sequences for (int32_t i = 1; i < n_clients; ++i) { llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system); } @@ -248,7 +248,7 @@ int main(int argc, char ** argv) { int32_t n_batch = params.n_batch; for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { - const uint32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); llama_batch batch_view = { n_tokens, diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6c81bd618d5e5..35908b7f086e9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -523,13 +523,13 @@ struct llama_server_context { static float mirostat_mu = 2.0f * mirostat_tau; const int mirostat_m = 100; - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); } else if (mirostat == 2) { static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); } else @@ -540,7 +540,7 @@ struct llama_server_context llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep); llama_sample_typical(ctx, &candidates_p, typical_p, min_keep); 
llama_sample_top_p(ctx, &candidates_p, top_p, min_keep); - llama_sample_temperature(ctx, &candidates_p, temp); + llama_sample_temp(ctx, &candidates_p, temp); result.tok = llama_sample_token(ctx, &candidates_p); } } diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 8a9a1bf5461a6..88d087354f409 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -32,12 +32,18 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; } + // total length of the sequences including the prompt + const int n_len = 32; + // init LLM llama_backend_init(params.numa); llama_context_params ctx_params = llama_context_default_params(); + ctx_params.seed = 1234; + ctx_params.n_ctx = 2048; + llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params); if (model == NULL) { @@ -47,20 +53,29 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_new_context_with_model(model, ctx_params); + if (ctx == NULL) { + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + return 1; + } + // tokenize the prompt std::vector tokens_list; tokens_list = ::llama_tokenize(ctx, params.prompt, true); - const int max_context_size = llama_n_ctx(ctx); - const int max_tokens_list_size = max_context_size - 4; + const int n_ctx = llama_n_ctx(ctx); + const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel; - if ((int) tokens_list.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size); + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_parallel, n_kv_req); + + // make sure wi + if (n_kv_req > n_ctx) { + LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); + LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); return 1; } - fprintf(stderr, "\n\n"); + fprintf(stderr, "\n"); for (auto id : tokens_list) { fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); @@ -68,66 +83,157 @@ int main(int argc, char ** argv) { fflush(stderr); + // create a llama_batch with size 512 + // we use this object to submit token data for decoding + + llama_batch batch = llama_batch_init(512, 0); + + // evaluate the initial prompt + batch.n_tokens = tokens_list.size(); + + for (int32_t i = 0; i < batch.n_tokens; i++) { + batch.token[i] = tokens_list[i]; + batch.pos[i] = i; + batch.seq_id[i] = 0; + batch.logits[i] = false; + } + + // llama_decode will output logits only for the last token of the prompt + batch.logits[batch.n_tokens - 1] = true; + + if (llama_decode(ctx, batch, params.n_threads) != 0) { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return 1; + } + + // assign the system KV cache to all parallel sequences + for (int32_t i = 1; i < n_parallel; ++i) { + llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens); + } + + if (n_parallel > 1) { + LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); + } + // main loop - // The LLM keeps a contextual cache memory of previous token evaluation. - // Usually, once this cache is full, it is required to recompute a compressed context based on previous - // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist - // example, we will just stop the loop once this cache is full or once an end of stream is detected. 
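As a condensed sketch of the pattern the reworked example follows (illustrative only; `n_prompt_tokens`, `last_token`, `i_batch` and `n_cur` are placeholder names, not part of the patch): the prompt is decoded once into sequence 0, its KV cache entries are shared with the other sequences, and each generation step then submits at most one new token per still-active stream.

    // share the already-decoded prompt with sequences 1..n_parallel-1
    for (int32_t i = 1; i < n_parallel; ++i) {
        llama_kv_cache_seq_cp(ctx, 0, i, 0, n_prompt_tokens);
    }

    // one generation step: at most one new token per live stream
    batch.n_tokens = 0;
    for (int32_t i = 0; i < n_parallel; ++i) {
        if (i_batch[i] < 0) {
            continue; // this stream has already finished
        }
        batch.token [batch.n_tokens] = last_token[i];
        batch.pos   [batch.n_tokens] = n_cur; // current position, shared by all streams
        batch.seq_id[batch.n_tokens] = i;     // each stream decodes under its own sequence id
        batch.logits[batch.n_tokens] = true;  // we sample from every stream
        i_batch[i] = batch.n_tokens;          // remember which logits row belongs to stream i
        batch.n_tokens += 1;
    }
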
+ // we will store the parallel decoded sequences in this vector + std::vector streams(n_parallel); - const int n_gen = std::min(32, max_context_size); + // remember the batch index of the last tokenn for each parallel sequence + // we will use this to know which logits to sample from + std::vector i_batch(n_parallel, batch.n_tokens - 1); - int n_cur = 0; + int n_cur = batch.n_tokens; + int n_decode = 0; - while (n_cur < n_gen) { - // evaluate the transformer + const auto t_main_start = ggml_time_us(); - if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), int(tokens_list.size()), n_cur, 0), params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); + while (n_cur <= n_len) { + // evaluate the current batch with the transformer model + if (llama_decode(ctx, batch, params.n_threads)) { + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); return 1; } - n_cur += tokens_list.size(); - tokens_list.clear(); + // prepare the next batch + batch.n_tokens = 0; - // sample the next token + // sample the next token for each parallel sequence / stream + for (int32_t i = 0; i < n_parallel; ++i) { + if (i_batch[i] < 0) { + // the stream has already finished + continue; + } - llama_token new_token_id = 0; + auto n_vocab = llama_n_vocab(ctx); + auto logits = llama_get_logits(ctx) + i_batch[i] * n_vocab; - auto logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(ctx); + std::vector candidates; + candidates.reserve(n_vocab); - std::vector candidates; - candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); - } + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + const int top_k = 40; + const float top_p = 0.9f; + const float temp = 0.4f; - new_token_id = llama_sample_token_greedy(ctx , &candidates_p); + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_temp (ctx, &candidates_p, temp); - // is it an end of stream ? - if (new_token_id == llama_token_eos(ctx)) { - fprintf(stderr, " [end of text]\n"); + const llama_token new_token_id = llama_sample_token(ctx, &candidates_p); + + //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + + // is it an end of stream ? 
+ // mark this stream as finished + if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) { + i_batch[i] = -1; + LOG_TEE("\n"); + if (n_parallel > 1) { + LOG_TEE("%s: stream %d finished", __func__, i); + } + + continue; + } + + if (n_parallel == 1) { + // print the new token : + LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + fflush(stdout); + } + + streams[i] += llama_token_to_piece(ctx, new_token_id); + + // push this new token for next evaluation + batch.token [batch.n_tokens] = new_token_id; + batch.pos [batch.n_tokens] = n_cur; + batch.seq_id[batch.n_tokens] = i; + batch.logits[batch.n_tokens] = true; + + i_batch[i] = batch.n_tokens; + + batch.n_tokens += 1; + + n_decode += 1; + } + + if (batch.n_tokens == 0) { + // all streams are finished break; } - // print the new token : - printf("%s", llama_token_to_piece(ctx, new_token_id).c_str()); - fflush(stdout); + n_cur += 1; + } + + LOG_TEE("\n"); - // push this new token for next evaluation - tokens_list.push_back(new_token_id); + if (n_parallel > 1) { + LOG_TEE("\n"); + + for (int32_t i = 0; i < n_parallel; ++i) { + LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); + } } + const auto t_main_end = ggml_time_us(); + + LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); + + llama_print_timings(ctx); + + fprintf(stderr, "\n"); + llama_free(ctx); llama_free_model(model); llama_backend_free(); - fprintf(stderr, "\n\n"); - return 0; } diff --git a/llama.cpp b/llama.cpp index f47d9b5984994..ce3f2c8bb4707 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4185,20 +4185,18 @@ static int llama_decode_internal( { auto & logits_out = lctx.logits; - if (lctx.logits_all) { + if (batch.logits) { logits_out.resize(n_vocab * n_tokens); - if (batch.logits) { - for (uint32_t i = 0; i < n_tokens; i++) { - if (batch.logits[i] == 0) { - continue; - } - memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); + for (uint32_t i = 0; i < n_tokens; i++) { + if (batch.logits[i] == 0) { + continue; } - } else { - memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); + memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); } + } else if (lctx.logits_all) { + logits_out.resize(n_vocab * n_tokens); + memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); } else { - // return result for just the last token logits_out.resize(n_vocab); memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); } @@ -5269,7 +5267,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c } } -void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { +void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { const int64_t t_start_sample_us = ggml_time_us(); for (size_t i = 0; i < candidates_p->size; ++i) { @@ -5281,6 +5279,10 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array } } +void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) { + llama_sample_temp(ctx, candidates_p, temp); +} + void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, 
const llama_token * last_tokens, size_t last_tokens_size, float penalty) { if (last_tokens_size == 0 || penalty == 1.0f) { return; @@ -7357,7 +7359,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi int llama_eval( struct llama_context * ctx, llama_token * tokens, - uint32_t n_tokens, + int32_t n_tokens, int n_past, int n_threads) { llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); @@ -7377,7 +7379,7 @@ int llama_eval( int llama_eval_embd( struct llama_context * ctx, float * embd, - uint32_t n_tokens, + int32_t n_tokens, int n_past, int n_threads) { llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); @@ -7398,7 +7400,7 @@ int llama_eval_embd( struct llama_batch llama_batch_get_one( llama_token * tokens, - uint32_t n_tokens, + int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { return { @@ -7414,8 +7416,8 @@ struct llama_batch llama_batch_get_one( }; } -struct llama_batch llama_batch_init(uint32_t n_tokens, int32_t embd) { - llama_batch batch = { n_tokens, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; +struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) { + llama_batch batch = { -1, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, }; if (embd) { batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd); diff --git a/llama.h b/llama.h index 3a46e1ea049d3..54eab8f0885a3 100644 --- a/llama.h +++ b/llama.h @@ -68,7 +68,7 @@ extern "C" { // data used for batch inference typedef struct llama_batch { - uint32_t n_tokens; + int32_t n_tokens; llama_token * token; float * embd; @@ -370,7 +370,7 @@ extern "C" { LLAMA_API DEPRECATED(int llama_eval( struct llama_context * ctx, llama_token * tokens, - uint32_t n_tokens, + int32_t n_tokens, int n_past, int n_threads), "please use llama_decode() instead"); @@ -380,7 +380,7 @@ extern "C" { LLAMA_API DEPRECATED(int llama_eval_embd( struct llama_context * ctx, float * embd, - uint32_t n_tokens, + int32_t n_tokens, int n_past, int n_threads), "please use llama_decode() instead"); @@ -391,7 +391,7 @@ extern "C" { // LLAMA_API struct llama_batch llama_batch_get_one( llama_token * tokens, - uint32_t n_tokens, + int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id); @@ -401,7 +401,7 @@ extern "C" { // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token // The rest of the llama_batch members are allocated with size n_tokens // All members are left uninitialized - LLAMA_API struct llama_batch llama_batch_init(uint32_t n_tokens, int32_t embd); + LLAMA_API struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd); // Frees a batch of tokens allocated with llama_batch_init() LLAMA_API void llama_batch_free(struct llama_batch batch); @@ -531,7 +531,10 @@ extern "C" { /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. 
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); - LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); + LLAMA_API void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates, float temp); + + LLAMA_API DEPRECATED(void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp), + "Use llama_sample_temp instead"); /// @details Apply constraints from grammar LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar); From db0fc2da0632731a2358840981419827767ef3fb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 Sep 2023 13:54:20 +0300 Subject: [PATCH 37/55] simple : improve comments + free batch --- examples/simple/simple.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 88d087354f409..cf48ce0c01bc3 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -68,13 +68,15 @@ int main(int argc, char ** argv) { LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_parallel, n_kv_req); - // make sure wi + // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); return 1; } + // print the prompt token-by-token + fprintf(stderr, "\n"); for (auto id : tokens_list) { @@ -107,6 +109,7 @@ int main(int argc, char ** argv) { } // assign the system KV cache to all parallel sequences + // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them for (int32_t i = 1; i < n_parallel; ++i) { llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens); } @@ -120,8 +123,8 @@ int main(int argc, char ** argv) { // we will store the parallel decoded sequences in this vector std::vector streams(n_parallel); - // remember the batch index of the last tokenn for each parallel sequence - // we will use this to know which logits to sample from + // remember the batch index of the last token for each parallel sequence + // we need this to determine which logits to sample from std::vector i_batch(n_parallel, batch.n_tokens - 1); int n_cur = batch.n_tokens; @@ -170,8 +173,7 @@ int main(int argc, char ** argv) { //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); - // is it an end of stream ? - // mark this stream as finished + // is it an end of stream? 
-> mark the stream as finished if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) { i_batch[i] = -1; LOG_TEE("\n"); @@ -182,8 +184,8 @@ int main(int argc, char ** argv) { continue; } + // if there is only one stream, we print immediately to stdout if (n_parallel == 1) { - // print the new token : LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); fflush(stdout); } @@ -203,8 +205,8 @@ int main(int argc, char ** argv) { n_decode += 1; } + // all streams are finished if (batch.n_tokens == 0) { - // all streams are finished break; } @@ -230,6 +232,8 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n"); + llama_batch_free(batch); + llama_free(ctx); llama_free_model(model); From e04dc519887d0a6682c6ca495236e410ef899dc7 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Sep 2023 13:00:28 +0200 Subject: [PATCH 38/55] ggml-cuda : add rope f16, restore performance with parallel decoding (#3272) * ggml-cuda : add rope f16, restore performance * offload KQ_mask with all models * fix rope shift --------- Co-authored-by: Georgi Gerganov --- ggml-cuda.cu | 130 ++++++++++++++++++++++++++++++--------------------- ggml-cuda.h | 1 + ggml.c | 2 +- llama.cpp | 44 ++++++++++++----- 4 files changed, 110 insertions(+), 67 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 14b1ecf7d2cf3..87d2e2e71a8ad 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -439,7 +439,6 @@ static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullpt struct ggml_tensor_extra_gpu { void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs - bool copied; }; // this is faster on Windows @@ -4357,8 +4356,9 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, // rope == RoPE == rotary positional embedding -static __global__ void rope_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale) { +template +static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (col >= ncols) { @@ -4369,8 +4369,8 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c const int i = row*ncols + col; const int i2 = row/p_delta_rows; - const int p = pos != nullptr ? pos[i2] : 0; - const float p0 = p * freq_scale; + const int p = has_pos ? 
pos[i2] : 0; + const float p0 = p*freq_scale; const float theta = p0*powf(theta_scale, col/2); const float sin_theta = sinf(theta); const float cos_theta = cosf(theta); @@ -4382,8 +4382,9 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c dst[i + 1] = x0*sin_theta + x1*cos_theta; } -static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale) { +template +static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale, + const int p_delta_rows, const float theta_scale) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (col >= ncols) { @@ -4394,8 +4395,8 @@ static __global__ void rope_neox_f32(const float * x, float * dst, const int nco const int i = row*ncols + col/2; const int i2 = row/p_delta_rows; - const int p = pos != nullptr ? pos[i2] : 0; - const float p0 = p * freq_scale; + const int p = has_pos ? pos[i2] : 0; + const float p0 = p*freq_scale; const float theta = p0*powf(theta_scale, col/2); const float sin_theta = sinf(theta); const float cos_theta = cosf(theta); @@ -5371,22 +5372,32 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons scale_f32<<>>(x, dst, scale, k); } -static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, +template +static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); - rope_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + if (pos == nullptr) { + rope<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + } else { + rope<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + } } -static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, +template +static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); - rope_neox_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + if (pos == nullptr) { + rope_neox<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + } else { + rope_neox<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + } } static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, @@ -6036,7 +6047,7 @@ inline void ggml_cuda_op_mul_mat_cublas( const int64_t ne0 = dst->ne[0]; const int64_t row_diff = row_high - row_low; - float * src0_ddq_as_f32; + float * src0_ddq_as_f32 = nullptr; size_t src0_as = 0; if (src0->type != GGML_TYPE_F32) { @@ -6074,8 +6085,9 @@ inline void ggml_cuda_op_rope( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const 
float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == dst->type); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; @@ -6093,23 +6105,12 @@ inline void ggml_cuda_op_rope( memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); - // const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale; - - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(src1->ne[0] == ne2); - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); - int id; - CUDA_CHECK(cudaGetDevice(&id)); - - int * pos = nullptr; + const int32_t * pos = nullptr; if ((mode & 1) == 0) { - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - pos = (int *) src1_extra->data_device[id]; - if (!src1_extra->copied) { - CUDA_CHECK(cudaMemcpyAsync(pos, src1->data, ggml_nbytes(src1), cudaMemcpyHostToDevice, main_stream)); - src1_extra->copied = true; - } + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(src1->ne[0] == ne2); + pos = (const int32_t *) src1_dd; } const bool is_neox = mode & 2; @@ -6121,9 +6122,21 @@ inline void ggml_cuda_op_rope( rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream); } else if (is_neox) { GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet"); - rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + if (src0->type == GGML_TYPE_F32) { + rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + } else if (src0->type == GGML_TYPE_F16) { + rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + } else { + GGML_ASSERT(false); + } } else { - rope_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + if (src0->type == GGML_TYPE_F32) { + rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + } else if (src0->type == GGML_TYPE_F16) { + rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + } else { + GGML_ASSERT(false); + } } (void) src1; @@ -6294,7 +6307,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s } } -void ggml_cuda_set_peer_access(const int n_tokens) { +static void ggml_cuda_set_peer_access(const int n_tokens) { static bool peer_access_enabled = false; const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE; @@ -6622,27 +6635,27 @@ static void ggml_cuda_op_mul_mat( } } -void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add); } -void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul); } -void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { 
+static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu); } -void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu); } -void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm); } -void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); } @@ -6663,7 +6676,7 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te return false; } -void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ +static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation @@ -6692,7 +6705,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); } -void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ +static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); @@ -6726,7 +6739,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); } -void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; @@ -6770,11 +6783,11 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ } } -void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale); } -void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); @@ -6822,29 +6835,29 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens (void) dst; } -void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void 
ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_cpy(src0, dst, nullptr); (void) src1; } -void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf); } -void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max); } -void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope); } -void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); } -void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { (void) src0; (void) src1; (void) dst; @@ -6967,11 +6980,13 @@ static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { return extra; } -void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) { +static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) { if (scratch && g_scratch_size == 0) { return; } + tensor->backend = GGML_BACKEND_GPU; + // recursively assign CUDA buffers until a compute tensor is found if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { const ggml_op src0_op = tensor->src[0]->op; @@ -6983,8 +6998,6 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); } - tensor->backend = GGML_BACKEND_GPU; - if (scratch && no_alloc) { return; } @@ -7069,6 +7082,15 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) tensor->extra = extra; } +void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) { + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(ggml_is_contiguous(tensor)); + + struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice)); +} + void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { ggml_cuda_assign_buffers_impl(tensor, true, false, false); } diff --git a/ggml-cuda.h b/ggml-cuda.h index a72e82069b9f1..fda704b665623 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -31,6 +31,7 @@ GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tens GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor); GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset); +GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * 
tensor); GGML_API void ggml_cuda_set_main_device(int main_device); GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q); diff --git a/ggml.c b/ggml.c index 2075617946a6c..7648659396e89 100644 --- a/ggml.c +++ b/ggml.c @@ -6343,7 +6343,7 @@ static struct ggml_tensor * ggml_cpy_impl( } // make a view of the destination - struct ggml_tensor * result = ggml_view_tensor(ctx, b); + struct ggml_tensor * result = b->op == GGML_OP_NONE ? b : ggml_view_tensor(ctx, b); if (strlen(b->name) > 0) { ggml_format_name(result, "%s (copy of %s)", b->name, a->name); } else { diff --git a/llama.cpp b/llama.cpp index ce3f2c8bb4707..d490d4e95d676 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1256,10 +1256,10 @@ static bool llama_kv_cache_init( (void) n_gpu_layers; #ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer + 1) { + if (n_gpu_layers > (int)n_layer + 1) { ggml_cuda_assign_buffers_no_scratch(cache.v); } - if (n_gpu_layers > n_layer + 2) { + if (n_gpu_layers > (int)n_layer + 2) { ggml_cuda_assign_buffers_no_scratch(cache.k); } #endif // GGML_USE_CUBLAS @@ -2692,14 +2692,16 @@ static struct ggml_cgraph * llm_build_llama( // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!ggml_allocr_is_measure(lctx.alloc)) { ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); } - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + offload_func_kq(KQ_mask); + ggml_set_name(KQ_mask, "KQ_mask"); ggml_allocr_alloc(lctx.alloc, KQ_mask); if (!ggml_allocr_is_measure(lctx.alloc)) { float * data = (float *) KQ_mask->data; @@ -2722,6 +2724,7 @@ static struct ggml_cgraph * llm_build_llama( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); offload_func_kq(KQ_pos); + ggml_set_name(KQ_pos, "KQ_pos"); ggml_allocr_alloc(lctx.alloc, KQ_pos); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) KQ_pos->data; @@ -2734,6 +2737,7 @@ static struct ggml_cgraph * llm_build_llama( if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); offload_func_kq(K_shift); + ggml_set_name(K_shift, "K_shift"); ggml_allocr_alloc(lctx.alloc, K_shift); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) K_shift->data; @@ -2743,14 +2747,16 @@ static struct ggml_cgraph * llm_build_llama( } for (int il = 0; il < n_layer; ++il) { - ggml_build_forward_expand(gf, + struct ggml_tensor * tmp = ggml_rope_custom_inplace(ctx0, ggml_view_3d(ctx0, kv_self.k, n_embd_head, n_head_kv, n_ctx, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale)); + K_shift, n_embd_head, 0, 0, freq_base, freq_scale); + offload_func_kq(tmp); + ggml_build_forward_expand(gf, tmp); } } @@ -3078,14 +3084,16 @@ static struct ggml_cgraph * llm_build_baichaun( // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!ggml_allocr_is_measure(lctx.alloc)) { ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); } - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct 
ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + offload_func_kq(KQ_mask); + ggml_set_name(KQ_mask, "KQ_mask"); ggml_allocr_alloc(lctx.alloc, KQ_mask); if (!ggml_allocr_is_measure(lctx.alloc)) { float * data = (float *) KQ_mask->data; @@ -3108,6 +3116,7 @@ static struct ggml_cgraph * llm_build_baichaun( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); offload_func_kq(KQ_pos); + ggml_set_name(KQ_pos, "KQ_pos"); ggml_allocr_alloc(lctx.alloc, KQ_pos); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) KQ_pos->data; @@ -3120,6 +3129,7 @@ static struct ggml_cgraph * llm_build_baichaun( if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); offload_func_kq(K_shift); + ggml_set_name(K_shift, "K_shift"); ggml_allocr_alloc(lctx.alloc, K_shift); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) K_shift->data; @@ -3129,14 +3139,16 @@ static struct ggml_cgraph * llm_build_baichaun( } for (int il = 0; il < n_layer; ++il) { - ggml_build_forward_expand(gf, + struct ggml_tensor * tmp = ggml_rope_custom_inplace(ctx0, ggml_view_3d(ctx0, kv_self.k, n_embd_head, n_head_kv, n_ctx, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale)); + K_shift, n_embd_head, 0, 0, freq_base, freq_scale); + offload_func_kq(tmp); + ggml_build_forward_expand(gf, tmp); } } @@ -3484,14 +3496,16 @@ static struct ggml_cgraph * llm_build_falcon( // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!ggml_allocr_is_measure(lctx.alloc)) { ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); } - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + offload_func_kq(KQ_mask); + ggml_set_name(KQ_mask, "KQ_mask"); ggml_allocr_alloc(lctx.alloc, KQ_mask); if (!ggml_allocr_is_measure(lctx.alloc)) { float * data = (float *) KQ_mask->data; @@ -3514,6 +3528,7 @@ static struct ggml_cgraph * llm_build_falcon( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); offload_func_kq(KQ_pos); + ggml_set_name(KQ_pos, "KQ_pos"); ggml_allocr_alloc(lctx.alloc, KQ_pos); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) KQ_pos->data; @@ -3526,6 +3541,7 @@ static struct ggml_cgraph * llm_build_falcon( if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); offload_func_kq(K_shift); + ggml_set_name(K_shift, "K_shift"); ggml_allocr_alloc(lctx.alloc, K_shift); if (!ggml_allocr_is_measure(lctx.alloc)) { int * data = (int *) K_shift->data; @@ -3535,14 +3551,16 @@ static struct ggml_cgraph * llm_build_falcon( } for (int il = 0; il < n_layer; ++il) { - ggml_build_forward_expand(gf, + struct ggml_tensor * tmp = ggml_rope_custom_inplace(ctx0, ggml_view_3d(ctx0, kv_self.k, n_embd_head, n_head_kv, n_ctx, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 2, 0, freq_base, freq_scale)); + K_shift, n_embd_head, 2, 0, freq_base, freq_scale); + offload_func_kq(tmp); + 
ggml_build_forward_expand(gf, tmp); } } @@ -3832,14 +3850,15 @@ static struct ggml_cgraph * llm_build_starcoder( // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); ggml_allocr_alloc(lctx.alloc, KQ_scale); if (!ggml_allocr_is_measure(lctx.alloc)) { ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); } - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + ggml_set_name(KQ_mask, "KQ_mask"); ggml_allocr_alloc(lctx.alloc, KQ_mask); if (!ggml_allocr_is_measure(lctx.alloc)) { float * data = (float *) KQ_mask->data; @@ -4118,6 +4137,7 @@ static int llama_decode_internal( ggml_tensor * node = gf->leafs[i]; if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data); + ggml_cuda_copy_to_device(node); } } From 54206962c71fa4acea5bdf2288f6d6d970cb8e6a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 Sep 2023 14:06:41 +0300 Subject: [PATCH 39/55] llama : disable MPI for now ggml-ci --- llama.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index d490d4e95d676..e3c3568c806c5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4072,7 +4072,8 @@ static int llama_decode_internal( #ifdef GGML_USE_MPI // TODO: needs fix after #3228 - ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); + GGML_ASSERT(false && "not implemented"); + //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); #endif GGML_ASSERT(n_threads > 0); @@ -6846,8 +6847,10 @@ struct llama_context * llama_new_context_with_model( if (ggml_mpi_rank(ctx->ctx_mpi) > 0) { // Enter a blocking eval loop with dummy input, letting rank=0 drive the process - const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx)); - while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; + // TODO: needs fix after #3228 + GGML_ASSERT(false && "not implemented"); + //const std::vector tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx)); + //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {}; llama_backend_free(); exit(1); } From 2f3a46fccf047788a108cfef480206c865e3c4cb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 Sep 2023 14:14:50 +0300 Subject: [PATCH 40/55] train : make KQ_pos memory buffer permanent via dummy scale op --- examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 025eac2a60945..5f541a14100e0 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -795,6 +795,8 @@ struct ggml_tensor * llama_build_train_graphs( ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); // input gradient ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); + // KQ_pos + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one)); GGML_ASSERT(t36->grad->data == NULL && !ggml_is_view(t36->grad)); ggml_allocr_alloc(alloc, t36->grad); // gradient tensors (will be set to zero by ggml_graph_reset) From 1be2b8c19b318a4637682719bc6c03f4ea0823d3 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Sep 2023 15:12:51 +0200 Subject: 
[PATCH 41/55] ggml : revert change to ggml_cpy, add ggml_cont_Nd instead (#3275) ggml-ci --- ggml.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- ggml.h | 28 +++++++++++++++++++++++++++- llama.cpp | 14 ++++---------- 3 files changed, 80 insertions(+), 12 deletions(-) diff --git a/ggml.c b/ggml.c index 7648659396e89..35751342f9b16 100644 --- a/ggml.c +++ b/ggml.c @@ -6343,7 +6343,7 @@ static struct ggml_tensor * ggml_cpy_impl( } // make a view of the destination - struct ggml_tensor * result = b->op == GGML_OP_NONE ? b : ggml_view_tensor(ctx, b); + struct ggml_tensor * result = ggml_view_tensor(ctx, b); if (strlen(b->name) > 0) { ggml_format_name(result, "%s (copy of %s)", b->name, a->name); } else { @@ -6406,6 +6406,54 @@ struct ggml_tensor * ggml_cont_inplace( return ggml_cont_impl(ctx, a, true); } + +// make contiguous, with new shape +GGML_API struct ggml_tensor * ggml_cont_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0) { + return ggml_cont_4d(ctx, a, ne0, 1, 1, 1); +} + +GGML_API struct ggml_tensor * ggml_cont_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1) { + return ggml_cont_4d(ctx, a, ne0, ne1, 1, 1); +} + +GGML_API struct ggml_tensor * ggml_cont_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2) { + return ggml_cont_4d(ctx, a, ne0, ne1, ne2, 1); +} + +struct ggml_tensor * ggml_cont_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3) { + GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3)); + + bool is_node = false; + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); + ggml_format_name(result, "%s (cont)", a->name); + + result->op = GGML_OP_CONT; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + + // ggml_reshape struct ggml_tensor * ggml_reshape( diff --git a/ggml.h b/ggml.h index e2bfa1ae4f18d..545ca438a04be 100644 --- a/ggml.h +++ b/ggml.h @@ -1049,7 +1049,6 @@ extern "C" { size_t nb1, size_t offset); - // a -> b, return view(b) GGML_API struct ggml_tensor * ggml_cpy( struct ggml_context * ctx, @@ -1072,6 +1071,33 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // make contiguous, with new shape + GGML_API struct ggml_tensor * ggml_cont_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_cont_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1); + + GGML_API struct ggml_tensor * ggml_cont_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_cont_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + // return view(a), b specifies the new shape // TODO: when we start computing gradient, make a copy instead of view GGML_API struct ggml_tensor * ggml_reshape( diff --git a/llama.cpp b/llama.cpp index e3c3568c806c5..1576c3b866933 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2893,9 +2893,7 @@ static struct ggml_cgraph * llm_build_llama( ggml_set_name(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens)); + cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); @@ -3302,9 +3300,7 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_set_name(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens)); + cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); @@ -3710,7 +3706,7 @@ static struct ggml_cgraph * llm_build_falcon( offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); - cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens)); + cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); @@ -3964,9 +3960,7 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_set_name(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens)); + cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); ggml_set_name(cur, "KQV_merged_contiguous"); } From ee1d670cc6eef301d913b698864e1f4cbbe4d912 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 Sep 2023 17:32:21 +0300 Subject: [PATCH 42/55] parallel : fix bug (extra BOS) + smaller token_prev array --- examples/parallel/parallel.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index b8bd6d93662e4..9c7cfd0dcb692 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -114,7 +114,7 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.tokens_prev.resize(n_ctx); + 
client.tokens_prev.resize(params.n_predict); std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0); } @@ -191,6 +191,8 @@ int main(int argc, char ** argv) { for (int i = 0; i < n_clients; ++i) { llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1); } + + LOG_TEE("%s: clearing the KV cache\n", __func__); } // insert new sequences for decoding @@ -208,8 +210,9 @@ int main(int argc, char ** argv) { std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0); + // do not prepend BOS because we have a system prompt! std::vector tokens_prompt; - tokens_prompt = ::llama_tokenize(ctx, client.prompt, true); + tokens_prompt = ::llama_tokenize(ctx, client.prompt, false); for (size_t i = 0; i < tokens_prompt.size(); ++i) { batch.token [batch.n_tokens] = tokens_prompt[i]; From ded9b43cad687d229c28c707c60d1dbac5c9c1ea Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 Sep 2023 19:09:25 +0300 Subject: [PATCH 43/55] parallel : fix cases where the input prompts can overflow the batch --- examples/parallel/parallel.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 9c7cfd0dcb692..abf3991a1c38d 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -127,7 +127,9 @@ int main(int argc, char ** argv) { llama_seq_id g_seq_id = 0; - llama_batch batch = llama_batch_init(params.n_batch, 0); + // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple + // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time + llama_batch batch = llama_batch_init(params.n_ctx, 0); int32_t n_total_prompt = 0; int32_t n_total_gen = 0; From b2debf65f2f0953777fd2f5a760da0036c14e530 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 Sep 2023 20:14:05 +0300 Subject: [PATCH 44/55] parallel : add disabled experimental batch chunking in powers of two --- examples/parallel/parallel.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index abf3991a1c38d..c7fb6d81a3561 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -253,6 +253,13 @@ int main(int argc, char ** argv) { int32_t n_batch = params.n_batch; for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { + // experiment: process in powers of 2 + //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) { + // n_batch /= 2; + // i -= n_batch; + // continue; + //} + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); llama_batch batch_view = { From 5a3369d8e851891ab365816c9aa6cdbf0f874d7b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 Sep 2023 19:51:32 +0200 Subject: [PATCH 45/55] llama : llama.h formatting + comments --- llama.cpp | 4 + llama.h | 235 ++++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 181 insertions(+), 58 deletions(-) diff --git a/llama.cpp b/llama.cpp index 1576c3b866933..73a636cea59a4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7477,6 +7477,10 @@ float * llama_get_logits(struct llama_context * ctx) { return ctx->logits.data(); } +float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { + return ctx->logits.data() + i*ctx->model.hparams.n_vocab; +} + float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } diff --git a/llama.h b/llama.h index 54eab8f0885a3..590af79bb0061 100644 
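// A minimal sketch (illustrative only, not part of the patch) of how the
// llama_get_logits_ith() helper introduced just above could be used after a
// successful llama_decode(); it assumes batch.logits[i] was set to true for
// batch index i. The helper name greedy_pick_ith is made up for illustration.

static llama_token greedy_pick_ith(struct llama_context * ctx, int32_t i) {
    const int     n_vocab = llama_n_vocab(ctx);            // vocabulary size
    const float * logits  = llama_get_logits_ith(ctx, i);  // row i of the logits

    llama_token best = 0;
    for (llama_token t = 1; t < n_vocab; ++t) {
        if (logits[t] > logits[best]) {
            best = t; // greedy argmax, for illustration only
        }
    }
    return best;
}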
--- a/llama.h +++ b/llama.h @@ -66,26 +66,6 @@ extern "C" { typedef int32_t llama_token; typedef int32_t llama_seq_id; - // data used for batch inference - typedef struct llama_batch { - int32_t n_tokens; - - llama_token * token; - float * embd; - llama_pos * pos; - llama_seq_id * seq_id; - int8_t * logits; // if 0, do not extract logits for that token - - // NOTE: helpers for smooth API transition - can be deprecated in the future - // for future-proof code, use the above fields instead and ignore everything below - // - // pos[i] = all_pos_0 + i*all_pos_1 - // - llama_pos all_pos_0; // used if pos == NULL - llama_pos all_pos_1; // used if pos == NULL - llama_seq_id all_seq_id; // used if seq_id == NULL - } llama_batch; - enum llama_log_level { LLAMA_LOG_LEVEL_ERROR = 2, LLAMA_LOG_LEVEL_WARN = 3, @@ -146,6 +126,35 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); + // Input data for llama_decode + // A llama_batch object can contain input about one or many sequences + // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens + // + // - token : the token ids of the input (used when embd is NULL) + // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) + // - pos : the positions of the respective token in the sequence + // - seq_id : the sequence to which the respective token belongs + // - logits : if zero, the logits for the respective token will not be output + // + typedef struct llama_batch { + int32_t n_tokens; + + llama_token * token; + float * embd; + llama_pos * pos; + llama_seq_id * seq_id; + int8_t * logits; + + // NOTE: helpers for smooth API transition - can be deprecated in the future + // for future-proof code, use the above fields instead and ignore everything below + // + // pos[i] = all_pos_0 + i*all_pos_1 + // + llama_pos all_pos_0; // used if pos == NULL + llama_pos all_pos_1; // used if pos == NULL + llama_seq_id all_seq_id; // used if seq_id == NULL + } llama_batch; + struct llama_context_params { uint32_t seed; // RNG seed, -1 for random int32_t n_ctx; // text context @@ -239,6 +248,7 @@ extern "C" { int32_t n_eval; }; + // Helpers for getting default parameters LLAMA_API struct llama_context_params llama_context_default_params(void); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); @@ -283,8 +293,10 @@ extern "C" { // Get a string describing the model type LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size); + // Returns the total size of all the tensors in the model in bytes LLAMA_API uint64_t llama_model_size(const struct llama_model * model); + // Returns the total number of parameters in the model LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model); @@ -305,7 +317,7 @@ extern "C" { const char * path_lora, const char * path_base_model, int n_threads), - "please use llama_model_apply_lora_from_file instead"); + "use llama_model_apply_lora_from_file instead"); LLAMA_API int llama_model_apply_lora_from_file( const struct llama_model * model, @@ -322,20 +334,40 @@ extern "C" { "avoid using this, it will be removed in the future, instead - count the tokens in user code"); // Remove all tokens data of cells in [c0, c1) - LLAMA_API void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1); + LLAMA_API void llama_kv_cache_tokens_rm( + struct llama_context * ctx, + int32_t c0, + int32_t c1); // Removes all tokens that belong to the specified sequence and 
have positions in [p0, p1) - LLAMA_API void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1); + LLAMA_API void llama_kv_cache_seq_rm( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1); // Copy all tokens that belong to the specified sequence to another sequence - LLAMA_API void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); + // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence + LLAMA_API void llama_kv_cache_seq_cp( + struct llama_context * ctx, + llama_seq_id seq_id_src, + llama_seq_id seq_id_dst, + llama_pos p0, + llama_pos p1); // Removes all tokens that do not belong to the specified sequence - LLAMA_API void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id); + LLAMA_API void llama_kv_cache_seq_keep( + struct llama_context * ctx, + llama_seq_id seq_id); // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // If the KV cache is RoPEd, the KV data is updated accordingly - LLAMA_API void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); + LLAMA_API void llama_kv_cache_seq_shift( + struct llama_context * ctx, + llama_seq_id seq_id, + llama_pos p0, + llama_pos p1, + llama_pos delta); // // State / sessions @@ -348,21 +380,35 @@ extern "C" { // Copies the state to the specified destination address. // Destination needs to have allocated enough memory. // Returns the number of bytes copied - LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); + LLAMA_API size_t llama_copy_state_data( + struct llama_context * ctx, + uint8_t * dst); // Set the state reading from the specified address // Returns the number of bytes read - LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); + LLAMA_API size_t llama_set_state_data( + struct llama_context * ctx, + uint8_t * src); // Save/load session file - LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); - LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); + LLAMA_API bool llama_load_session_file( + struct llama_context * ctx, + const char * path_session, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out); + + LLAMA_API bool llama_save_session_file( + struct llama_context * ctx, + const char * path_session, + const llama_token * tokens, + size_t n_token_count); // // Decoding // - // Run the llama inference to obtain the logits and probabilities for the next token. + // Run the llama inference to obtain the logits and probabilities for the next token(s). // tokens + n_tokens is the provided batch of new tokens to process // n_past is the number of tokens to use from previous eval calls // Returns 0 on success @@ -373,7 +419,7 @@ extern "C" { int32_t n_tokens, int n_past, int n_threads), - "please use llama_decode() instead"); + "use llama_decode() instead"); // Same as llama_eval, but use float matrix input directly. 
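// A minimal sketch (illustrative only, not part of the patch) of the KV cache
// sequence calls reformatted above; it assumes a context `ctx` whose sequence 0
// already holds a shared prompt of n_prompt tokens. The helper name is made up
// for illustration.

static void fork_and_trim_sequences(struct llama_context * ctx, int n_prompt, int n_seq) {
    // share the prompt cells with additional sequences without copying any KV data
    for (llama_seq_id s = 1; s < n_seq; ++s) {
        llama_kv_cache_seq_cp(ctx, 0, s, 0, n_prompt);
    }

    // later, drop what sequence 1 generated past the prompt
    // (the parallel example in this series passes p1 = -1 in the same way)
    llama_kv_cache_seq_rm(ctx, 1, n_prompt, -1);
}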
// DEPRECATED: use llama_decode() instead @@ -383,7 +429,7 @@ extern "C" { int32_t n_tokens, int n_past, int n_threads), - "please use llama_decode() instead"); + "use llama_decode() instead"); // Return batch for single sequence of tokens starting at pos_0 // @@ -396,12 +442,14 @@ extern "C" { llama_seq_id seq_id); // Allocates a batch of tokens on the heap - // The batch needs to be freed with llama_batch_free() - // If embd > 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float) + // The batch has to be freed with llama_batch_free() + // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float) // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token // The rest of the llama_batch members are allocated with size n_tokens // All members are left uninitialized - LLAMA_API struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd); + LLAMA_API struct llama_batch llama_batch_init( + int32_t n_tokens, + int32_t embd); // Frees a batch of tokens allocated with llama_batch_init() LLAMA_API void llama_batch_free(struct llama_batch batch); @@ -417,11 +465,15 @@ extern "C" { // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row - // Can be mutated in order to change the probabilities of the next token - // Rows: n_tokens + // Logits for which llama_batch.logits[i] == 0 are undefined + // Rows: n_tokens provided with llama_batch // Cols: n_vocab LLAMA_API float * llama_get_logits(struct llama_context * ctx); + // Logits for the ith token. Equivalent to: + // llama_get_logits(ctx) + i*n_vocab + LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); + // Get the embeddings for the input // shape: [n_embd] (1-dimensional) LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); @@ -502,10 +554,21 @@ extern "C" { LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); + LLAMA_API void llama_sample_repetition_penalty( + struct llama_context * ctx, + llama_token_data_array * candidates, + const llama_token * last_tokens, + size_t last_tokens_size, + float penalty); /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. - LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); + LLAMA_API void llama_sample_frequency_and_presence_penalties( + struct llama_context * ctx, + llama_token_data_array * candidates, + const llama_token * last_tokens, + size_t last_tokens_size, + float alpha_frequency, + float alpha_presence); /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. 
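// A minimal sketch (illustrative only, not part of the patch) tying together the
// batch helpers documented above, with llama_decode() as it stands at this point
// in the series (still taking n_threads). Error handling and sampling are omitted;
// the helper name decode_prompt is made up for illustration.

static int decode_prompt(struct llama_context * ctx, const llama_token * tok, int32_t n, int n_threads) {
    llama_batch batch = llama_batch_init(n, 0);   // token batch (embd == 0)

    batch.n_tokens = n;
    for (int32_t i = 0; i < n; ++i) {
        batch.token [i] = tok[i];
        batch.pos   [i] = i;
        batch.seq_id[i] = 0;
        batch.logits[i] = i == n - 1;             // only extract logits for the last token
    }

    const int ret = llama_decode(ctx, batch, n_threads);

    llama_batch_free(batch);
    return ret;
}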
@@ -518,26 +581,54 @@ extern "C" { float scale); /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. - LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); + LLAMA_API void llama_sample_softmax( + struct llama_context * ctx, + llama_token_data_array * candidates); /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); + LLAMA_API void llama_sample_top_k( + struct llama_context * ctx, + llama_token_data_array * candidates, + int k, + size_t min_keep); /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 - LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); + LLAMA_API void llama_sample_top_p( + struct llama_context * ctx, + llama_token_data_array * candidates, + float p, + size_t min_keep); /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. - LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); + LLAMA_API void llama_sample_tail_free( + struct llama_context * ctx, + llama_token_data_array * candidates, + float z, + size_t min_keep); /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. - LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); - LLAMA_API void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates, float temp); + LLAMA_API void llama_sample_typical( + struct llama_context * ctx, + llama_token_data_array * candidates, + float p, + size_t min_keep); + + LLAMA_API void llama_sample_temp( + struct llama_context * ctx, + llama_token_data_array * candidates, + float temp); - LLAMA_API DEPRECATED(void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp), - "Use llama_sample_temp instead"); + LLAMA_API DEPRECATED(void llama_sample_temperature( + struct llama_context * ctx, + llama_token_data_array * candidates, + float temp), + "use llama_sample_temp instead"); /// @details Apply constraints from grammar - LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar); + LLAMA_API void llama_sample_grammar( + struct llama_context * ctx, + llama_token_data_array * candidates, + const struct llama_grammar * grammar); /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. @@ -545,23 +636,41 @@ extern "C" { /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param m The number of tokens considered in the estimation of `s_hat`. 
This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); + LLAMA_API llama_token llama_sample_token_mirostat( + struct llama_context * ctx, + llama_token_data_array * candidates, + float tau, + float eta, + int m, + float * mu); /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); + LLAMA_API llama_token llama_sample_token_mirostat_v2( + struct llama_context * ctx, + llama_token_data_array * candidates, + float tau, + float eta, + float * mu); /// @details Selects the token with the highest probability. - LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); + LLAMA_API llama_token llama_sample_token_greedy( + struct llama_context * ctx, + llama_token_data_array * candidates); /// @details Randomly selects a token from the candidates based on their probabilities. - LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); + LLAMA_API llama_token llama_sample_token( + struct llama_context * ctx, + llama_token_data_array * candidates); /// @details Accepts the sampled token into the grammar - LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token); + LLAMA_API void llama_grammar_accept_token( + struct llama_context * ctx, + struct llama_grammar * grammar, + llama_token token); // // Beam search @@ -569,9 +678,10 @@ extern "C" { struct llama_beam_view { const llama_token * tokens; + size_t n_tokens; - float p; // Cumulative beam probability (renormalized relative to all beams) - bool eob; // Callback should set this to true when a beam is at end-of-beam. + float p; // Cumulative beam probability (renormalized relative to all beams) + bool eob; // Callback should set this to true when a beam is at end-of-beam. }; // Passed to beam_search_callback function. 
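// A minimal sketch (illustrative only, not part of the patch) of the sampling calls
// declared above, building a candidates array from one row of logits. The constants
// (top_k = 40, top_p = 0.9, temp = 0.4) mirror the batched example added later in
// this series; the helper name is made up for illustration.

static llama_token sample_from_logits(struct llama_context * ctx, const float * logits) {
    const int n_vocab = llama_n_vocab(ctx);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token t = 0; t < n_vocab; ++t) {
        candidates.push_back(llama_token_data{ t, logits[t], 0.0f });
    }

    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    llama_sample_top_k(ctx, &candidates_p, 40,   1);   // keep the 40 most likely tokens
    llama_sample_top_p(ctx, &candidates_p, 0.9f, 1);   // nucleus sampling
    llama_sample_temp (ctx, &candidates_p, 0.4f);      // temperature

    return llama_sample_token(ctx, &candidates_p);
}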
@@ -580,9 +690,10 @@ extern "C" { // These pointers are valid only during the synchronous callback, so should not be saved. struct llama_beams_state { struct llama_beam_view * beam_views; + size_t n_beams; // Number of elements in beam_views[]. size_t common_prefix_length; // Current max length of prefix tokens shared by all beams. - bool last_call; // True iff this is the last callback invocation. + bool last_call; // True iff this is the last callback invocation. }; // Type of pointer to the beam_search_callback function. @@ -598,10 +709,18 @@ extern "C" { /// @param n_past Number of tokens already evaluated. /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier. /// @param n_threads Number of threads as passed to llama_eval(). - LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads); + LLAMA_API void llama_beam_search( + struct llama_context * ctx, + llama_beam_search_callback_fn_t callback, + void * callback_data, + size_t n_beams, + int n_past, + int n_predict, + int n_threads); // Performance information LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); + LLAMA_API void llama_print_timings(struct llama_context * ctx); LLAMA_API void llama_reset_timings(struct llama_context * ctx); From 8845160058a058233c8b26886c78598a778b89cb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 Sep 2023 20:10:14 +0200 Subject: [PATCH 46/55] simple : add README.md --- examples/parallel/README.md | 3 ++ examples/simple/README.md | 67 +++++++++++++++++++++++++++++++++++++ llama.h | 36 ++++++++++---------- 3 files changed, 88 insertions(+), 18 deletions(-) create mode 100644 examples/parallel/README.md create mode 100644 examples/simple/README.md diff --git a/examples/parallel/README.md b/examples/parallel/README.md new file mode 100644 index 0000000000000..4d0fe5cef12fa --- /dev/null +++ b/examples/parallel/README.md @@ -0,0 +1,3 @@ +# llama.cpp/example/parallel + +Simplified simluation for serving incoming requests in parallel diff --git a/examples/simple/README.md b/examples/simple/README.md new file mode 100644 index 0000000000000..765d32210db2e --- /dev/null +++ b/examples/simple/README.md @@ -0,0 +1,67 @@ +# llama.cpp/example/simple + +The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt. +The example demonstrates single-batch as well as parallel generation. + +## Single-batch generation + +```bash +./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 1 + +... + +main: n_len = 32, n_ctx = 2048, n_parallel = 1, n_kv_req = 32 + + Hello my name is Shawn and I'm a 20 year old male from the United States. I'm a 20 year old + +main: decoded 27 tokens in 2.31 s, speed: 11.68 t/s + +llama_print_timings: load time = 579.15 ms +llama_print_timings: sample time = 0.72 ms / 28 runs ( 0.03 ms per token, 38888.89 tokens per second) +llama_print_timings: prompt eval time = 655.63 ms / 10 tokens ( 65.56 ms per token, 15.25 tokens per second) +llama_print_timings: eval time = 2180.97 ms / 27 runs ( 80.78 ms per token, 12.38 tokens per second) +llama_print_timings: total time = 2891.13 ms +``` + +## Parallel generation + +```bash +./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4 + +... + +main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113 + + Hello my name is + +main: generating 4 sequences ... 
+ +main: stream 0 finished +main: stream 1 finished +main: stream 2 finished +main: stream 3 finished + +sequence 0: + +Hello my name is Shirley. I am a 25-year-old female who has been working for over 5 years as a b + +sequence 1: + +Hello my name is Renee and I'm a 32 year old female from the United States. I'm looking for a man between + +sequence 2: + +Hello my name is Diana. I am looking for a housekeeping job. I have experience with children and have my own transportation. I am + +sequence 3: + +Hello my name is Cody. I am a 3 year old neutered male. I am a very friendly cat. I am very playful and + +main: decoded 108 tokens in 3.57 s, speed: 30.26 t/s + +llama_print_timings: load time = 587.00 ms +llama_print_timings: sample time = 2.56 ms / 112 runs ( 0.02 ms per token, 43664.72 tokens per second) +llama_print_timings: prompt eval time = 4089.11 ms / 118 tokens ( 34.65 ms per token, 28.86 tokens per second) +llama_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_print_timings: total time = 4156.04 ms +``` diff --git a/llama.h b/llama.h index 590af79bb0061..e7b0805239856 100644 --- a/llama.h +++ b/llama.h @@ -90,24 +90,24 @@ extern "C" { // model file types enum llama_ftype { LLAMA_FTYPE_ALL_F32 = 0, - LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed - // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed - LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors - LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors + LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed + // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed + LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; From c1596f633fa141f4cde8a92bb8895fd10dc91869 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 27 Sep 2023 18:12:43 +0300 Subject: [PATCH 47/55] llama : fix kv cache heuristic when 
context is less than 32 --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 73a636cea59a4..b409b0d12344a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4117,7 +4117,7 @@ static int llama_decode_internal( // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA? - kv_self.n = std::max(32, llama_kv_cache_cell_max(kv_self)); + kv_self.n = std::min((int32_t) hparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self))); //printf("kv_self.n = %d\n", kv_self.n); From 4ad0676927330ccc84c66b8ab7c27ddf18aea43d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 28 Sep 2023 15:48:38 +0300 Subject: [PATCH 48/55] parallel : fix crash when `-n -1` --- examples/parallel/parallel.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index c7fb6d81a3561..790189af98876 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -114,7 +114,7 @@ int main(int argc, char ** argv) { for (size_t i = 0; i < clients.size(); ++i) { auto & client = clients[i]; client.id = i; - client.tokens_prev.resize(params.n_predict); + client.tokens_prev.resize(std::max(256, params.n_predict)); std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0); } @@ -321,7 +321,8 @@ int main(int argc, char ** argv) { // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str()); if (client.n_decoded > 2 && - (id == llama_token_eos(ctx) || client.n_decoded + client.n_prompt >= params.n_predict || + (id == llama_token_eos(ctx) || + (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) || client.response.find("User:") != std::string::npos || client.response.find('\n') != std::string::npos)) { // basic reverse prompt From e9463792d35d57e79fcb36904a6744c8a9010376 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 28 Sep 2023 16:01:49 +0300 Subject: [PATCH 49/55] llama : simplify returns if/else branches --- llama.cpp | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/llama.cpp b/llama.cpp index 86379f88a9c6d..140533553c93e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7380,12 +7380,8 @@ int llama_eval( llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1); const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0), n_threads); - if (ret != 0) { - if (ret < 0) { - LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); - } - - return ret; + if (ret < 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); } return ret; @@ -7402,12 +7398,8 @@ int llama_eval_embd( llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, }; const int ret = llama_decode_internal(*ctx, batch, n_threads); - if (ret != 0) { - if (ret < 0) { - LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); - } - - return ret; + if (ret < 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); } return ret; @@ -7460,12 +7452,8 @@ int llama_decode( struct llama_batch batch, int n_threads) { const int ret = llama_decode_internal(*ctx, batch, n_threads); - if (ret != 0) { - if (ret < 0) { - LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); - } - - return ret; + 
if (ret < 0) { + LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret); } return ret; From 4c72ab13b20c1589c3af7fc18d14299c983ad556 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 28 Sep 2023 16:02:20 +0300 Subject: [PATCH 50/55] metal : use mm kernels for batch size > 2 --- ggml-metal.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-metal.m b/ggml-metal.m index 71d69d38d9bb3..b3c463f03ad3d 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -958,7 +958,7 @@ void ggml_metal_graph_compute( src1t == GGML_TYPE_F32 && [ctx->device supportsFamily:MTLGPUFamilyApple7] && ne00%32 == 0 && - ne11 > 1) { + ne11 > 2) { switch (src0->type) { case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; From d008733e6b292bf110e738dfef0b5440baede986 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 28 Sep 2023 16:05:37 +0300 Subject: [PATCH 51/55] examples : utilize new llama_get_logits_ith() --- common/common.cpp | 2 +- common/common.h | 2 +- examples/simple/simple.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 059d7a76aa5e7..7c3e11875cb0b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -905,7 +905,7 @@ llama_token llama_sample_token( llama_token id = 0; - float * logits = llama_get_logits(ctx) + idx * n_vocab; + float * logits = llama_get_logits_ith(ctx, idx); // Apply params.logit_bias map for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { diff --git a/common/common.h b/common/common.h index ed86aa95b000a..16e30b2f5ccbf 100644 --- a/common/common.h +++ b/common/common.h @@ -183,7 +183,7 @@ std::string llama_detokenize_bpe( // - ctx_guidance: context to use for classifier-free guidance, ignore if NULL // - grammar: grammar to use for sampling, ignore if NULL // - last_tokens: needed for repetition penalty, ignore if empty -// - idx: sample from llama_get_logits(ctx) + idx * n_vocab +// - idx: sample from llama_get_logits_ith(ctx, idx) // // returns: // - token: sampled token diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index cf48ce0c01bc3..08b082b336b9b 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -150,7 +150,7 @@ int main(int argc, char ** argv) { } auto n_vocab = llama_n_vocab(ctx); - auto logits = llama_get_logits(ctx) + i_batch[i] * n_vocab; + auto logits = llama_get_logits_ith(ctx, i_batch[i]); std::vector candidates; candidates.reserve(n_vocab); From a2075615034f71d4c80d4e1cce277b55a5e1d63b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 28 Sep 2023 17:32:04 +0300 Subject: [PATCH 52/55] examples : add example for batched decoding --- .gitignore | 1 + Makefile | 5 +- examples/CMakeLists.txt | 1 + examples/batched/CMakeLists.txt | 5 + examples/batched/README.md | 44 ++++++ examples/batched/batched.cpp | 243 ++++++++++++++++++++++++++++++++ examples/simple/README.md | 48 +------ examples/simple/simple.cpp | 93 +++--------- 8 files changed, 315 insertions(+), 125 deletions(-) create mode 100644 examples/batched/CMakeLists.txt create mode 100644 examples/batched/README.md create mode 100644 examples/batched/batched.cpp diff --git a/.gitignore b/.gitignore index 1f841c8308a41..b54723a15052d 100644 --- a/.gitignore +++ b/.gitignore @@ -51,6 +51,7 @@ models-mnt /save-load-state /server /simple +/batched /speculative /parallel /train-text-from-scratch diff --git 
a/Makefile b/Makefile index 207559d8c30e2..c7f6a808ed379 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel tests/test-c.o +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel tests/test-c.o # Binaries only useful for tests TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama @@ -519,6 +519,9 @@ main: examples/main/main.cpp build-info.h ggml. simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +batched: examples/batched/batched.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index df7307072c1b6..129cc01163957 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -23,6 +23,7 @@ else() add_subdirectory(train-text-from-scratch) add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(simple) + add_subdirectory(batched) add_subdirectory(speculative) add_subdirectory(parallel) add_subdirectory(embd-input) diff --git a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt new file mode 100644 index 0000000000000..6aa178d4d5911 --- /dev/null +++ b/examples/batched/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET batched) +add_executable(${TARGET} batched.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/batched/README.md b/examples/batched/README.md new file mode 100644 index 0000000000000..5d730331769fb --- /dev/null +++ b/examples/batched/README.md @@ -0,0 +1,44 @@ +# llama.cpp/example/batched + +The example demonstrates batched generation from a given prompt + +```bash +./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4 + +... + +main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113 + + Hello my name is + +main: generating 4 sequences ... + +main: stream 0 finished +main: stream 1 finished +main: stream 2 finished +main: stream 3 finished + +sequence 0: + +Hello my name is Shirley. I am a 25-year-old female who has been working for over 5 years as a b + +sequence 1: + +Hello my name is Renee and I'm a 32 year old female from the United States. I'm looking for a man between + +sequence 2: + +Hello my name is Diana. I am looking for a housekeeping job. I have experience with children and have my own transportation. I am + +sequence 3: + +Hello my name is Cody. I am a 3 year old neutered male. I am a very friendly cat. 
I am very playful and + +main: decoded 108 tokens in 3.57 s, speed: 30.26 t/s + +llama_print_timings: load time = 587.00 ms +llama_print_timings: sample time = 2.56 ms / 112 runs ( 0.02 ms per token, 43664.72 tokens per second) +llama_print_timings: prompt eval time = 4089.11 ms / 118 tokens ( 34.65 ms per token, 28.86 tokens per second) +llama_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) +llama_print_timings: total time = 4156.04 ms +``` diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp new file mode 100644 index 0000000000000..08b082b336b9b --- /dev/null +++ b/examples/batched/batched.cpp @@ -0,0 +1,243 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include + +int main(int argc, char ** argv) { + gpt_params params; + + if (argc == 1 || argv[1][0] == '-') { + printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]); + return 1 ; + } + + int n_parallel = 1; + + if (argc >= 2) { + params.model = argv[1]; + } + + if (argc >= 3) { + params.prompt = argv[2]; + } + + if (argc >= 4) { + n_parallel = std::atoi(argv[3]); + } + + if (params.prompt.empty()) { + params.prompt = "Hello my name is"; + } + + // total length of the sequences including the prompt + const int n_len = 32; + + // init LLM + + llama_backend_init(params.numa); + + llama_context_params ctx_params = llama_context_default_params(); + + ctx_params.seed = 1234; + ctx_params.n_ctx = 2048; + + llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params); + + if (model == NULL) { + fprintf(stderr , "%s: error: unable to load model\n" , __func__); + return 1; + } + + llama_context * ctx = llama_new_context_with_model(model, ctx_params); + + if (ctx == NULL) { + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + return 1; + } + + // tokenize the prompt + + std::vector tokens_list; + tokens_list = ::llama_tokenize(ctx, params.prompt, true); + + const int n_ctx = llama_n_ctx(ctx); + const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel; + + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_parallel, n_kv_req); + + // make sure the KV cache is big enough to hold all the prompt and generated tokens + if (n_kv_req > n_ctx) { + LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); + LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); + return 1; + } + + // print the prompt token-by-token + + fprintf(stderr, "\n"); + + for (auto id : tokens_list) { + fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + } + + fflush(stderr); + + // create a llama_batch with size 512 + // we use this object to submit token data for decoding + + llama_batch batch = llama_batch_init(512, 0); + + // evaluate the initial prompt + batch.n_tokens = tokens_list.size(); + + for (int32_t i = 0; i < batch.n_tokens; i++) { + batch.token[i] = tokens_list[i]; + batch.pos[i] = i; + batch.seq_id[i] = 0; + batch.logits[i] = false; + } + + // llama_decode will output logits only for the last token of the prompt + batch.logits[batch.n_tokens - 1] = true; + + if (llama_decode(ctx, batch, params.n_threads) != 0) { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return 1; + } + + // assign the system KV cache to all parallel sequences + // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them + for (int32_t i = 1; i < n_parallel; ++i) { 
+ llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens); + } + + if (n_parallel > 1) { + LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); + } + + // main loop + + // we will store the parallel decoded sequences in this vector + std::vector streams(n_parallel); + + // remember the batch index of the last token for each parallel sequence + // we need this to determine which logits to sample from + std::vector i_batch(n_parallel, batch.n_tokens - 1); + + int n_cur = batch.n_tokens; + int n_decode = 0; + + const auto t_main_start = ggml_time_us(); + + while (n_cur <= n_len) { + // evaluate the current batch with the transformer model + if (llama_decode(ctx, batch, params.n_threads)) { + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + return 1; + } + + // prepare the next batch + batch.n_tokens = 0; + + // sample the next token for each parallel sequence / stream + for (int32_t i = 0; i < n_parallel; ++i) { + if (i_batch[i] < 0) { + // the stream has already finished + continue; + } + + auto n_vocab = llama_n_vocab(ctx); + auto logits = llama_get_logits_ith(ctx, i_batch[i]); + + std::vector candidates; + candidates.reserve(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + const int top_k = 40; + const float top_p = 0.9f; + const float temp = 0.4f; + + llama_sample_top_k(ctx, &candidates_p, top_k, 1); + llama_sample_top_p(ctx, &candidates_p, top_p, 1); + llama_sample_temp (ctx, &candidates_p, temp); + + const llama_token new_token_id = llama_sample_token(ctx, &candidates_p); + + //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + + // is it an end of stream? 
-> mark the stream as finished + if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) { + i_batch[i] = -1; + LOG_TEE("\n"); + if (n_parallel > 1) { + LOG_TEE("%s: stream %d finished", __func__, i); + } + + continue; + } + + // if there is only one stream, we print immediately to stdout + if (n_parallel == 1) { + LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + fflush(stdout); + } + + streams[i] += llama_token_to_piece(ctx, new_token_id); + + // push this new token for next evaluation + batch.token [batch.n_tokens] = new_token_id; + batch.pos [batch.n_tokens] = n_cur; + batch.seq_id[batch.n_tokens] = i; + batch.logits[batch.n_tokens] = true; + + i_batch[i] = batch.n_tokens; + + batch.n_tokens += 1; + + n_decode += 1; + } + + // all streams are finished + if (batch.n_tokens == 0) { + break; + } + + n_cur += 1; + } + + LOG_TEE("\n"); + + if (n_parallel > 1) { + LOG_TEE("\n"); + + for (int32_t i = 0; i < n_parallel; ++i) { + LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); + } + } + + const auto t_main_end = ggml_time_us(); + + LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); + + llama_print_timings(ctx); + + fprintf(stderr, "\n"); + + llama_batch_free(batch); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + return 0; +} diff --git a/examples/simple/README.md b/examples/simple/README.md index 765d32210db2e..5d24b1046935c 100644 --- a/examples/simple/README.md +++ b/examples/simple/README.md @@ -1,12 +1,9 @@ # llama.cpp/example/simple The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt. -The example demonstrates single-batch as well as parallel generation. - -## Single-batch generation ```bash -./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 1 +./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" ... @@ -22,46 +19,3 @@ llama_print_timings: prompt eval time = 655.63 ms / 10 tokens ( 65.56 ms llama_print_timings: eval time = 2180.97 ms / 27 runs ( 80.78 ms per token, 12.38 tokens per second) llama_print_timings: total time = 2891.13 ms ``` - -## Parallel generation - -```bash -./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4 - -... - -main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113 - - Hello my name is - -main: generating 4 sequences ... - -main: stream 0 finished -main: stream 1 finished -main: stream 2 finished -main: stream 3 finished - -sequence 0: - -Hello my name is Shirley. I am a 25-year-old female who has been working for over 5 years as a b - -sequence 1: - -Hello my name is Renee and I'm a 32 year old female from the United States. I'm looking for a man between - -sequence 2: - -Hello my name is Diana. I am looking for a housekeeping job. I have experience with children and have my own transportation. I am - -sequence 3: - -Hello my name is Cody. I am a 3 year old neutered male. I am a very friendly cat. 
I am very playful and - -main: decoded 108 tokens in 3.57 s, speed: 30.26 t/s - -llama_print_timings: load time = 587.00 ms -llama_print_timings: sample time = 2.56 ms / 112 runs ( 0.02 ms per token, 43664.72 tokens per second) -llama_print_timings: prompt eval time = 4089.11 ms / 118 tokens ( 34.65 ms per token, 28.86 tokens per second) -llama_print_timings: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) -llama_print_timings: total time = 4156.04 ms -``` diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 08b082b336b9b..2acdc72739ba1 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -10,12 +10,10 @@ int main(int argc, char ** argv) { gpt_params params; if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]); + printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]); return 1 ; } - int n_parallel = 1; - if (argc >= 2) { params.model = argv[1]; } @@ -24,15 +22,11 @@ int main(int argc, char ** argv) { params.prompt = argv[2]; } - if (argc >= 4) { - n_parallel = std::atoi(argv[3]); - } - if (params.prompt.empty()) { params.prompt = "Hello my name is"; } - // total length of the sequences including the prompt + // total length of the sequence including the prompt const int n_len = 32; // init LLM @@ -64,9 +58,9 @@ int main(int argc, char ** argv) { tokens_list = ::llama_tokenize(ctx, params.prompt, true); const int n_ctx = llama_n_ctx(ctx); - const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel; + const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_parallel, n_kv_req); + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { @@ -108,25 +102,8 @@ int main(int argc, char ** argv) { return 1; } - // assign the system KV cache to all parallel sequences - // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them - for (int32_t i = 1; i < n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens); - } - - if (n_parallel > 1) { - LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); - } - // main loop - // we will store the parallel decoded sequences in this vector - std::vector streams(n_parallel); - - // remember the batch index of the last token for each parallel sequence - // we need this to determine which logits to sample from - std::vector i_batch(n_parallel, batch.n_tokens - 1); - int n_cur = batch.n_tokens; int n_decode = 0; @@ -139,18 +116,10 @@ int main(int argc, char ** argv) { return 1; } - // prepare the next batch - batch.n_tokens = 0; - - // sample the next token for each parallel sequence / stream - for (int32_t i = 0; i < n_parallel; ++i) { - if (i_batch[i] < 0) { - // the stream has already finished - continue; - } - + // sample the next token + { auto n_vocab = llama_n_vocab(ctx); - auto logits = llama_get_logits_ith(ctx, i_batch[i]); + auto logits = llama_get_logits_ith(ctx, batch.n_tokens - 1); std::vector candidates; candidates.reserve(n_vocab); @@ -161,68 +130,38 @@ int main(int argc, char ** argv) { llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - const int top_k = 40; - const float top_p = 0.9f; - const float temp = 0.4f; + // sample the most likely token + 
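
The single-sequence example switches to llama_sample_token_greedy, which conceptually is just an argmax over the candidate logits. A tiny standalone illustration (not the actual implementation; the logit values are made up):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> logits = { -0.3f, 2.1f, 0.7f, 1.9f };
    // greedy sampling: pick the token id with the highest logit
    const int best = (int) (std::max_element(logits.begin(), logits.end()) - logits.begin());
    printf("greedy token id: %d\n", best);  // 1
    return 0;
}
```
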
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); - llama_sample_top_k(ctx, &candidates_p, top_k, 1); - llama_sample_top_p(ctx, &candidates_p, top_p, 1); - llama_sample_temp (ctx, &candidates_p, temp); - - const llama_token new_token_id = llama_sample_token(ctx, &candidates_p); - - //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); - - // is it an end of stream? -> mark the stream as finished + // is it an end of stream? if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) { - i_batch[i] = -1; LOG_TEE("\n"); - if (n_parallel > 1) { - LOG_TEE("%s: stream %d finished", __func__, i); - } - continue; + break; } - // if there is only one stream, we print immediately to stdout - if (n_parallel == 1) { - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); - fflush(stdout); - } + LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + fflush(stdout); - streams[i] += llama_token_to_piece(ctx, new_token_id); + // prepare the next batch + batch.n_tokens = 0; // push this new token for next evaluation batch.token [batch.n_tokens] = new_token_id; batch.pos [batch.n_tokens] = n_cur; - batch.seq_id[batch.n_tokens] = i; + batch.seq_id[batch.n_tokens] = 0; batch.logits[batch.n_tokens] = true; - i_batch[i] = batch.n_tokens; - batch.n_tokens += 1; n_decode += 1; } - // all streams are finished - if (batch.n_tokens == 0) { - break; - } - n_cur += 1; } LOG_TEE("\n"); - if (n_parallel > 1) { - LOG_TEE("\n"); - - for (int32_t i = 0; i < n_parallel; ++i) { - LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); - } - } - const auto t_main_end = ggml_time_us(); LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", From 2b8830af7153a75fba5a899a8b331389025d0d03 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 28 Sep 2023 17:48:25 +0300 Subject: [PATCH 53/55] examples : do not eval prompt 2 times (close #3348) --- examples/batched/batched.cpp | 29 ++++++++++++++++------------- examples/simple/simple.cpp | 16 ++++++++-------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 08b082b336b9b..4dd1d553d1c18 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -1,6 +1,7 @@ #include "common.h" #include "llama.h" +#include #include #include #include @@ -42,7 +43,9 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_default_params(); ctx_params.seed = 1234; - ctx_params.n_ctx = 2048; + ctx_params.n_ctx = n_len*n_parallel; // FIXME: use n_kv_req instead (tokenize with model after #3301) + ctx_params.n_batch = std::max(n_len, n_parallel); + // ctx_params.n_gpu_layers = 99; // offload all layers to the GPU llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params); @@ -66,11 +69,11 @@ int main(int argc, char ** argv) { const int n_ctx = llama_n_ctx(ctx); const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel; - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_parallel, n_kv_req); + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { - LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); + LOG_TEE("%s: 
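
The reordering in this patch is the whole point of "do not eval prompt 2 times": the prompt batch is decoded once before the loop, each iteration first samples from the logits of the previous decode, builds the next single-token batch, and only then calls llama_decode. A schematic dry run of the call order, where decode() and sample() are stand-ins rather than llama.cpp functions:

```cpp
// decode() and sample() are stand-ins used only to show the call order,
// they are not llama.cpp functions
#include <cstdio>

static int n_evals = 0;

static void decode(const char * what) { printf("decode #%d: %s\n", ++n_evals, what); }
static void sample(void)              { printf("  sample from the logits of the previous decode\n"); }

int main() {
    decode("prompt batch"); // done once, before the generation loop

    // old loop shape: llama_decode at the top, so iteration 1 re-evaluated the prompt batch
    // new loop shape: sample first, build the next batch, decode last
    for (int n_cur = 0; n_cur < 3; ++n_cur) {
        sample();
        decode("next token batch");
    }
    return 0;
}
```
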
error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); return 1; } @@ -88,7 +91,7 @@ int main(int argc, char ** argv) { // create a llama_batch with size 512 // we use this object to submit token data for decoding - llama_batch batch = llama_batch_init(512, 0); + llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0); // evaluate the initial prompt batch.n_tokens = tokens_list.size(); @@ -133,12 +136,6 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); while (n_cur <= n_len) { - // evaluate the current batch with the transformer model - if (llama_decode(ctx, batch, params.n_threads)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); - return 1; - } - // prepare the next batch batch.n_tokens = 0; @@ -149,8 +146,8 @@ int main(int argc, char ** argv) { continue; } - auto n_vocab = llama_n_vocab(ctx); - auto logits = llama_get_logits_ith(ctx, i_batch[i]); + auto n_vocab = llama_n_vocab(ctx); + auto * logits = llama_get_logits_ith(ctx, i_batch[i]); std::vector candidates; candidates.reserve(n_vocab); @@ -178,7 +175,7 @@ int main(int argc, char ** argv) { i_batch[i] = -1; LOG_TEE("\n"); if (n_parallel > 1) { - LOG_TEE("%s: stream %d finished", __func__, i); + LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); } continue; @@ -211,6 +208,12 @@ int main(int argc, char ** argv) { } n_cur += 1; + + // evaluate the current batch with the transformer model + if (llama_decode(ctx, batch, params.n_threads)) { + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + return 1; + } } LOG_TEE("\n"); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 2acdc72739ba1..1616a4a7581a3 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -110,16 +110,10 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); while (n_cur <= n_len) { - // evaluate the current batch with the transformer model - if (llama_decode(ctx, batch, params.n_threads)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); - return 1; - } - // sample the next token { - auto n_vocab = llama_n_vocab(ctx); - auto logits = llama_get_logits_ith(ctx, batch.n_tokens - 1); + auto n_vocab = llama_n_vocab(ctx); + auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1); std::vector candidates; candidates.reserve(n_vocab); @@ -158,6 +152,12 @@ int main(int argc, char ** argv) { } n_cur += 1; + + // evaluate the current batch with the transformer model + if (llama_decode(ctx, batch, params.n_threads)) { + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + return 1; + } } LOG_TEE("\n"); From ce2d995af2d2d1163f384737cfdd534cb7025dab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 28 Sep 2023 18:12:39 +0300 Subject: [PATCH 54/55] server : clear the KV cache beyond n_past before llama_decode --- examples/server/server.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3587d901447c6..c5b1328d9e2fa 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -434,6 +434,10 @@ struct llama_server_context { n_eval = params.n_batch; } + + // since #3228 we now have to manually manage the KV cache + llama_kv_cache_tokens_rm(ctx, n_past, -1); + if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0), 
params.n_threads)) { LOG_ERROR("failed to eval", { From c5650ed470f6264046431c8b5e43cbfe3e680d17 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 28 Sep 2023 19:03:36 +0300 Subject: [PATCH 55/55] server : avoid context swaps by shifting the KV cache --- examples/server/server.cpp | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c5b1328d9e2fa..273eb36f4284d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -381,6 +381,10 @@ struct llama_server_context // compare the evaluated prompt with the new prompt n_past = common_part(embd, prompt_tokens); + + // since #3228 we now have to manually manage the KV cache + llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx); + embd = prompt_tokens; if (n_past == num_prompt_tokens) { @@ -411,19 +415,27 @@ struct llama_server_context if (embd.size() >= (size_t)params.n_ctx) { - // Reset context - const int n_left = (params.n_ctx - params.n_keep) / 2; + // Shift context + + const int n_left = n_past - params.n_keep - 1; + const int n_discard = n_left/2; + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + + for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++) + { + embd[i - n_discard] = embd[i]; + } + embd.resize(embd.size() - n_discard); + + n_past -= n_discard; - std::vector new_tokens(embd.begin(), embd.begin() + params.n_keep); - new_tokens.insert(new_tokens.end(), embd.end() - n_left, embd.end()); - embd = new_tokens; - n_past = params.n_keep; truncated = true; LOG_VERBOSE("input truncated", { {"n_ctx", params.n_ctx}, {"n_keep", params.n_keep}, {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, }); } @@ -435,9 +447,6 @@ struct llama_server_context n_eval = params.n_batch; } - // since #3228 we now have to manually manage the KV cache - llama_kv_cache_tokens_rm(ctx, n_past, -1); - if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0), params.n_threads)) { LOG_ERROR("failed to eval", {
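
The shift arithmetic above can be dry-run in isolation. The sketch below applies the same n_left/n_discard formulas and the same embd compaction to a toy token history; the actual llama_kv_cache_seq_rm / llama_kv_cache_seq_shift calls are omitted and plain integers stand in for token ids.

```cpp
#include <cstdio>
#include <vector>

int main() {
    int n_keep = 2;
    int n_past = 12;
    std::vector<int> embd = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; // toy token history

    const int n_left    = n_past - n_keep - 1;
    const int n_discard = n_left/2;

    // in the server this is where the KV cache drops cells
    // [n_keep + 1, n_keep + 1 + n_discard) and slides the rest back by n_discard

    for (size_t i = n_keep + 1 + n_discard; i < embd.size(); i++) {
        embd[i - n_discard] = embd[i];
    }
    embd.resize(embd.size() - n_discard);
    n_past -= n_discard;

    printf("n_left = %d, n_discard = %d, new n_past = %d\n", n_left, n_discard, n_past);
    printf("kept tokens:");
    for (int t : embd) {
        printf(" %d", t);
    }
    printf("\n");
    return 0;
}
```

Running it prints n_left = 9, n_discard = 4, new n_past = 8 and keeps tokens 0 1 2 7 8 9 10 11, i.e. the first n_keep + 1 positions plus the most recent part of the remainder, which is exactly what the context swap used to achieve by re-evaluating the truncated prompt.
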