From 8dec38c35ca6682bcbdde1b55438b1527020a389 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Mon, 17 Jul 2023 20:07:15 -0400 Subject: [PATCH 01/28] llama: implement NTK-By-Parts (NTKv2) RoPE scaling --- examples/common.cpp | 16 ++++ examples/common.h | 2 + examples/server/server.cpp | 18 +++++ ggml.c | 155 +++++++++++++++++++++++++++---------- ggml.h | 8 +- llama.cpp | 38 ++++++--- llama.h | 2 + 7 files changed, 189 insertions(+), 50 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 21f4a0357d422..957022d0cb360 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -194,6 +194,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--rope-ntk-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_ntk_factor = std::stof(argv[i]); + } else if (arg == "--rope-ext-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_ext_factor = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -566,6 +578,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); + fprintf(stdout, " --rope-ntk-factor N RoPE NTK mix factor (default: %.1f)\n", params.rope_ntk_factor); + fprintf(stdout, " --rope-ext-factor N RoPE extrapolation mix factor (default: %.1f)\n", params.rope_ext_factor); fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); fprintf(stdout, " --no-penalize-nl do not penalize newline token\n"); fprintf(stdout, " --memory-f32 use f32 
instead of f16 for memory key+value (default: disabled)\n"); @@ -657,6 +671,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param lparams.embedding = params.embedding; lparams.rope_freq_base = params.rope_freq_base; lparams.rope_freq_scale = params.rope_freq_scale; + lparams.rope_ntk_factor = params.rope_ntk_factor; + lparams.rope_ext_factor = params.rope_ext_factor; return lparams; } diff --git a/examples/common.h b/examples/common.h index 375bc0a3db416..677676ad131b5 100644 --- a/examples/common.h +++ b/examples/common.h @@ -32,6 +32,8 @@ struct gpt_params { float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon float rope_freq_base = 10000.0f; // RoPE base frequency float rope_freq_scale = 1.0f; // RoPE frequency scaling factor + float rope_ntk_factor = 0.0f; // RoPE NTK mix factor + float rope_ext_factor = 0.0f; // RoPE extrapolation mix factor // sampling parameters std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6f7a66da108c8..49d2dd0508e0f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -612,6 +612,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params, fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! 
use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); + fprintf(stdout, " --rope-ntk-factor N RoPE NTK mix factor (default: %.1f)\n", params.rope_ntk_factor); + fprintf(stdout, " --rope-ext-factor N RoPE extrapolation mix factor (default: %.1f)\n", params.rope_ext_factor); fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); @@ -764,6 +766,22 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.rope_freq_scale = std::stof(argv[i]); } + else if (arg == "--rope-ntk-factor") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_ntk_factor = std::stof(argv[i]); + } + else if (arg == "--rope-ext-factor") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_ext_factor = std::stof(argv[i]); + } else if (arg == "--memory-f32" || arg == "--memory_f32") { params.memory_f16 = false; diff --git a/ggml.c b/ggml.c index beb7f464167d5..8c5f7ac2641ef 100644 --- a/ggml.c +++ b/ggml.c @@ -1,5 +1,6 @@ #define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows +#define _USE_MATH_DEFINES // For M_PI on MSVC #include "ggml.h" @@ -6711,6 +6712,8 @@ static struct ggml_tensor * ggml_rope_impl( int n_ctx, float freq_base, float freq_scale, + float ntk_factor, + float ext_factor, bool inplace) { GGML_ASSERT(n_past >= 0); bool is_node = false; @@ -6721,9 +6724,11 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[6] = { n_past, n_dims, mode, n_ctx }; + int32_t params[8] = { n_past, n_dims, mode, n_ctx }; memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_scale, sizeof(float)); + memcpy(params + 6, &ntk_factor, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; @@ -6740,7 +6745,7 @@ struct ggml_tensor * ggml_rope( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, false); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 0.0f, false); } struct ggml_tensor * ggml_rope_inplace( @@ -6750,7 +6755,7 @@ struct ggml_tensor * ggml_rope_inplace( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 0.0f, true); } struct ggml_tensor * ggml_rope_custom( @@ -6761,8 +6766,10 @@ struct ggml_tensor * ggml_rope_custom( int mode, int n_ctx, float freq_base, - float freq_scale) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false); + float freq_scale, + float ntk_factor, + float ext_factor) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ntk_factor, ext_factor, false); } struct ggml_tensor * ggml_rope_custom_inplace( @@ -6773,8 +6780,10 @@ struct ggml_tensor * ggml_rope_custom_inplace( int mode, int n_ctx, float freq_base, - float freq_scale) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, true); + float freq_scale, + float ntk_factor, + float ext_factor) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ntk_factor, ext_factor, true); } // ggml_rope_back @@ -12003,6 +12012,52 @@ static void ggml_compute_forward_clamp( // 
ggml_compute_forward_rope +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +#define NTKV2_MAX_POS_EMB 2048 +#define NTKV2_CORRECTION_FACTOR(n_rot) (__builtin_logf(NTKV2_MAX_POS_EMB / ((n_rot) * 2 * (float)M_PI)) / 2) + +static inline float rope_ntkv2_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / MIN(0.001f, high - low); + return 1 - MIN(1, MAX(0, y)); +} + +// NTKv2 algorithm based on LlamaPartNTKScaledRotaryEmbedding.py from https://github.com/jquesnelle/scaled-rope +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static float rope_ntkv2( + const float theta_base, + const float theta_ntk, + const float dims_over_base, + const float freq_scale, + const int64_t i0, + const float ntk_factor, + const float ext_factor, + const int n_dims) { + // Interpolation constants found experimentally for LLaMA (might not be totally optimal though) + // Do not change unless there is a good reason for doing so! 
+ static const float BETA_0 = 1.75f; + static const float BETA_1 = 1.25f; + static const float GAMMA_0 = 16.0f; + static const float GAMMA_1 = 2.0f; + + static const float low_1p = NTKV2_CORRECTION_FACTOR(BETA_0); + static const float high_1p = NTKV2_CORRECTION_FACTOR(BETA_1); + static const float low_2p = NTKV2_CORRECTION_FACTOR(GAMMA_0); + static const float high_2p = NTKV2_CORRECTION_FACTOR(GAMMA_1); + + // start and end correction factors + const float low_1 = MAX(0, floorf(low_1p * dims_over_base)); + const float high_1 = MIN(n_dims - 1, ceilf(high_1p * dims_over_base)); + const float low_2 = MAX(0, floorf(low_2p * dims_over_base)); + const float high_2 = MIN(n_dims - 1, ceilf(high_2p * dims_over_base)); + + const float theta_linear = freq_scale * theta_base; + const float ramp_mix = rope_ntkv2_ramp(low_1, high_1, i0) * ntk_factor; + const float theta_mix = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; + const float ramp_final = rope_ntkv2_ramp(low_2, high_2, i0) * ext_factor; + return theta_mix * (1 - ramp_final) + theta_base * ramp_final; +} + static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -12014,6 +12069,8 @@ static void ggml_compute_forward_rope_f32( float freq_base; float freq_scale; + float ntk_factor; + float ext_factor; const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; @@ -12021,6 +12078,8 @@ static void ggml_compute_forward_rope_f32( const int n_ctx = ((int32_t *) dst->op_params)[3]; memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); assert(n_past >= 0); @@ -12050,6 +12109,8 @@ static void ggml_compute_forward_rope_f32( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const 
float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); + const float dims_over_base = n_dims / logf(freq_base); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12061,18 +12122,19 @@ static void ggml_compute_forward_rope_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = (float)p; + float theta_ntk = theta_base; if (is_glm) { - theta = MIN(p, n_ctx - 2); + theta_base = MIN(p, n_ctx - 2); float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); const float cos_block_theta = cosf(block_theta); const float sin_block_theta = sinf(block_theta); - theta *= theta_scale; + theta_base *= theta_scale; block_theta *= theta_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); @@ -12090,10 +12152,13 @@ static void ggml_compute_forward_rope_f32( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float theta = rope_ntkv2(theta_base, theta_ntk, dims_over_base, + freq_scale, i0, ntk_factor, ext_factor, n_dims); const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); - theta *= theta_scale; + theta_base *= theta_scale; + theta_ntk *= theta_ntk_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -12107,12 +12172,13 @@ static void ggml_compute_forward_rope_f32( } else { // TODO: this is probably wrong, but I can't figure it out .. 
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -12143,6 +12209,8 @@ static void ggml_compute_forward_rope_f16( float freq_base; float freq_scale; + float ntk_factor; + float ext_factor; const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; @@ -12150,6 +12218,8 @@ static void ggml_compute_forward_rope_f16( const int n_ctx = ((int32_t *) dst->op_params)[3]; memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); assert(n_past >= 0); @@ -12179,6 +12249,8 @@ static void ggml_compute_forward_rope_f16( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); + const float dims_over_base = n_dims / logf(freq_base); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12190,18 +12262,19 @@ static void ggml_compute_forward_rope_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = freq_scale * (float)p; + float theta_base = (float)p; + float theta_ntk = theta_base; if (is_glm) { - theta = MIN(p, n_ctx - 2); + theta_base = MIN(p, n_ctx - 2); float block_theta = MAX(p - (n_ctx - 2), 0); for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const 
float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); const float cos_block_theta = cosf(block_theta); const float sin_block_theta = sinf(block_theta); - theta *= theta_scale; + theta_base *= theta_scale; block_theta *= theta_scale; const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); @@ -12219,10 +12292,13 @@ static void ggml_compute_forward_rope_f16( } } if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float theta = rope_ntkv2(theta_base, theta_ntk, dims_over_base, + freq_scale, i0, ntk_factor, ext_factor, n_dims); const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); - theta *= theta_scale; + theta_base *= theta_scale; + theta_ntk *= theta_ntk_scale; const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -12236,12 +12312,13 @@ static void ggml_compute_forward_rope_f16( } else { // TODO: this is probably wrong, but I can't figure it out .. 
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 + theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -12335,14 +12412,14 @@ static void ggml_compute_forward_rope_back_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta_base = (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -12356,10 +12433,10 @@ static void ggml_compute_forward_rope_back_f32( } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; @@ -12431,14 +12508,14 @@ static void ggml_compute_forward_rope_back_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta_base = (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + 
theta_base *= theta_scale; const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -12452,10 +12529,10 @@ static void ggml_compute_forward_rope_back_f16( } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta_base); + const float sin_theta = sinf(theta_base); - theta *= theta_scale; + theta_base *= theta_scale; const int64_t i0 = ib*n_dims + ic/2; diff --git a/ggml.h b/ggml.h index bdbd128004332..459d217df8068 100644 --- a/ggml.h +++ b/ggml.h @@ -1194,7 +1194,9 @@ extern "C" { int mode, int n_ctx, float freq_base, - float freq_scale); + float freq_scale, + float ntk_factor, + float ext_factor); // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_custom_inplace( @@ -1205,7 +1207,9 @@ extern "C" { int mode, int n_ctx, float freq_base, - float freq_scale); + float freq_scale, + float ntk_factor, + float ext_factor); // rotary position embedding backward, i.e compute dx from dy // a - dy diff --git a/llama.cpp b/llama.cpp index 39aefd499dd0c..945215de8a31e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -196,6 +196,8 @@ struct llama_hparams { float rope_freq_base = 10000.0f; float rope_freq_scale = 1.0f; + float rope_ntk_factor = 0.0f; + float rope_ext_factor = 0.0f; enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; @@ -898,6 +900,8 @@ struct llama_context_params llama_context_default_params() { /*.tensor_split =*/ nullptr, /*.rope_freq_base =*/ 10000.0f, /*.rope_freq_scale =*/ 1.0f, + /*.rope_ntk_factor =*/ 0.0f, + /*.rope_ext_factor =*/ 0.0f, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, @@ -1032,6 +1036,8 @@ static void llama_model_load_internal( const bool mul_mat_q, float rope_freq_base, float 
rope_freq_scale, + float rope_ntk_factor, + float rope_ext_factor, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1083,6 +1089,8 @@ static void llama_model_load_internal( hparams.rope_freq_base = rope_freq_base; hparams.rope_freq_scale = rope_freq_scale; + hparams.rope_ntk_factor = rope_ntk_factor; + hparams.rope_ext_factor = rope_ext_factor; } // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199 @@ -1106,6 +1114,8 @@ static void llama_model_load_internal( fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); + fprintf(stderr, "%s: ntk_factor = %g\n", __func__, hparams.rope_ntk_factor); + fprintf(stderr, "%s: ext_factor = %g\n", __func__, hparams.rope_ext_factor); fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } @@ -1374,6 +1384,8 @@ static bool llama_model_load( const bool mul_mat_q, float rope_freq_base, float rope_freq_scale, + float rope_ntk_factor, + float rope_ext_factor, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1382,9 +1394,10 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, - main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type, - use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, + tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, rope_ntk_factor, + rope_ext_factor, low_vram, memory_type, use_mmap, use_mlock, vocab_only, + 
progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { fprintf(stderr, "error loading model: %s\n", err.what()); @@ -1422,6 +1435,8 @@ static struct ggml_cgraph * llama_build_graph( const float freq_base = hparams.rope_freq_base; const float freq_scale = hparams.rope_freq_scale; + const float ntk_factor = hparams.rope_ntk_factor; + const float ext_factor = hparams.rope_ext_factor; const float rms_norm_eps = hparams.f_rms_norm_eps; const int n_gpu_layers = model.n_gpu_layers; @@ -1551,11 +1566,15 @@ static struct ggml_cgraph * llama_build_graph( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Kcur = ggml_rope_custom_inplace( + ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, + freq_scale, ntk_factor, ext_factor); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_rope_custom_inplace( + ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, + freq_scale, ntk_factor, ext_factor); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -3197,10 +3216,11 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers, - params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram, - memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, - params.progress_callback_user_data)) { + if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, + params.rms_norm_eps, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.mul_mat_q, + params.rope_freq_base, params.rope_freq_scale, params.rope_ntk_factor, params.rope_ext_factor, + params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, + params.progress_callback, params.progress_callback_user_data)) { delete model; fprintf(stderr, "%s: failed to load model\n", __func__); return nullptr; diff --git a/llama.h b/llama.h index fa1977f2d9492..25bb3952a73f8 100644 --- a/llama.h +++ b/llama.h @@ -100,6 +100,8 @@ extern "C" { // ref: https://github.com/ggerganov/llama.cpp/pull/2054 float rope_freq_base; // RoPE base frequency float rope_freq_scale; // RoPE frequency scaling factor + float rope_ntk_factor; // RoPE NTK mix factor + float rope_ext_factor; // RoPE extrapolation mix factor // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; From 6aeb46b343ee15f28583e381fcf5b5f687fb09aa Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Tue, 18 Jul 2023 22:28:27 -0400 Subject: [PATCH 02/28] CUDA implementation --- ggml-cuda.cu | 84 +++++++++++++++++++++++++++++++++++++++++++++------- ggml.c | 70 +++++++++++++++++++++++-------------------- ggml.h | 3 ++ 3 files changed, 115 insertions(+), 42 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9d42efb0d0b03..91a6edca60011 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3558,9 +3558,49 @@ static 
__global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, cpy_1(cx + x_offset, cdst + dst_offset); } +static __device__ float rope_ntkv2_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / max(0.001f, high - low); /* max() floors the ramp width at 0.001 so equal bounds cannot divide by zero */ + return 1.0f - min(1.0f, max(0.0f, y)); +} + +struct rope_corr_factors { + float v[4]; +}; + +// NTKv2 algorithm based on LlamaPartNTKScaledRotaryEmbedding.py from https://github.com/jquesnelle/scaled-rope +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static __device__ float rope_ntkv2( + const float theta_base, + const float theta_linear, + const float theta_ntk, + const rope_corr_factors corr_factors, + const int64_t i0, + const float ntk_factor, + const float ext_factor) { + float ramp_mix; + float theta; + + ramp_mix = rope_ntkv2_ramp(corr_factors.v[0], corr_factors.v[1], i0) * ntk_factor; + theta = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; + + ramp_mix = rope_ntkv2_ramp(corr_factors.v[2], corr_factors.v[3], i0) * ext_factor; + theta = theta * (1 - ramp_mix) + theta_base * ramp_mix; + return theta; +} + // rope == RoPE == rotary positional embedding -static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale) { +static __global__ void rope_f32( + const float * x, + float * dst, + const int ncols, + const float freq_scale, + const float ntk_factor, + const float ext_factor, + const float theta_scale, + const float theta_ntk_scale, + const float p0, + const int p_delta_rows, + const rope_corr_factors corr_factors) { const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x); if (col >= ncols) { @@ -3570,7 +3610,11 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c const int row = blockDim.y*blockIdx.y + threadIdx.y; const int i = row*ncols + col; - const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2); 
+ const float p = p0 + row / p_delta_rows; + const float theta_base = p*powf(theta_scale, col/2); + const float theta_linear = freq_scale * theta_base; + const float theta_ntk = p*powf(theta_ntk_scale, col/2); + const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, col, ntk_factor, ext_factor); const float sin_theta = sinf(theta); const float cos_theta = cosf(theta); @@ -4234,13 +4278,26 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k); } -static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, - const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { +static void rope_f32_cuda( + const float * x, + float * dst, + const int ncols, + const int nrows, + const float freq_scale, + const float ntk_factor, + const float ext_factor, + const float theta_scale, + const float theta_ntk_scale, + const float p0, + const int p_delta_rows, + const rope_corr_factors corr_factors, + cudaStream_t stream) { GGML_ASSERT(nrows % 2 == 0); const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(num_blocks_x, nrows, 1); - rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); + rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, freq_scale, ntk_factor, ext_factor, theta_scale, + theta_ntk_scale, p0, p_delta_rows, corr_factors); } static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) { @@ -4941,11 +4998,13 @@ inline void ggml_cuda_op_rope( const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; - // RoPE alteration for extended context - float freq_base, freq_scale; + // RoPE alteration for 
extended context + float freq_base, freq_scale, ntk_factor, ext_factor; memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); @@ -4958,8 +5017,13 @@ inline void ggml_cuda_op_rope( const float block_p = max(p - (n_ctx - 2.f), 0.f); rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main); } else { - const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale; - rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main); + const float p0 = (mode & 1) == 0 ? n_past : 0; + const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); + rope_corr_factors corr_factors; + ggml_rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors.v); + + rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, freq_scale, ntk_factor, ext_factor, theta_scale, + theta_ntk_scale, p0, ne01, corr_factors, cudaStream_main); } (void) src1; diff --git a/ggml.c b/ggml.c index 8c5f7ac2641ef..8a57391811dcc 100644 --- a/ggml.c +++ b/ggml.c @@ -12012,11 +12012,6 @@ static void ggml_compute_forward_clamp( // ggml_compute_forward_rope -// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get -// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -#define NTKV2_MAX_POS_EMB 2048 -#define NTKV2_CORRECTION_FACTOR(n_rot) (__builtin_logf(NTKV2_MAX_POS_EMB / ((n_rot) * 2 * (float)M_PI)) / 2) - static inline float rope_ntkv2_ramp(const float low, const float high, const int i0) { - const float y = (i0 / 2 - low) / MIN(0.001f, high - low); + const float y = (i0 / 2 - low) / MAX(0.001f, high - low); /* MAX floors the ramp width at 0.001 so equal bounds cannot divide by zero */ return 1 - MIN(1, MAX(0, y)); } @@ -12026,36 +12021,43 @@ static inline float rope_ntkv2_ramp(const float 
low, const float high, const int // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. static float rope_ntkv2( const float theta_base, + const float theta_linear, const float theta_ntk, - const float dims_over_base, - const float freq_scale, + const float corr_factors[4], const int64_t i0, const float ntk_factor, - const float ext_factor, - const int n_dims) { + const float ext_factor) { + float ramp_mix; + float theta; + + ramp_mix = rope_ntkv2_ramp(corr_factors[0], corr_factors[1], i0) * ntk_factor; + theta = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; + + ramp_mix = rope_ntkv2_ramp(corr_factors[2], corr_factors[3], i0) * ext_factor; + theta = theta * (1 - ramp_mix) + theta_base * ramp_mix; + return theta; +} + +// Solving `max_pos_emb = n_rot * 2pi * base^((2 * x) / n_dims)` for x, we get +// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float ggml_rope_ntkv2_corr_factor(const int n_dims, const float n_rot, const float base) { + static const float max_pos_emb = 2048; + return n_dims * logf(max_pos_emb / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +} + +void ggml_rope_ntkv2_corr_factors(int n_dims, const float freq_base, float factors[4]) { // Interpolation constants found experimentally for LLaMA (might not be totally optimal though) // Do not change unless there is a good reason for doing so! 
- static const float BETA_0 = 1.75f; - static const float BETA_1 = 1.25f; + static const float BETA_0 = 1.75f; + static const float BETA_1 = 1.25f; static const float GAMMA_0 = 16.0f; static const float GAMMA_1 = 2.0f; - static const float low_1p = NTKV2_CORRECTION_FACTOR(BETA_0); - static const float high_1p = NTKV2_CORRECTION_FACTOR(BETA_1); - static const float low_2p = NTKV2_CORRECTION_FACTOR(GAMMA_0); - static const float high_2p = NTKV2_CORRECTION_FACTOR(GAMMA_1); - // start and end correction factors - const float low_1 = MAX(0, floorf(low_1p * dims_over_base)); - const float high_1 = MIN(n_dims - 1, ceilf(high_1p * dims_over_base)); - const float low_2 = MAX(0, floorf(low_2p * dims_over_base)); - const float high_2 = MIN(n_dims - 1, ceilf(high_2p * dims_over_base)); - - const float theta_linear = freq_scale * theta_base; - const float ramp_mix = rope_ntkv2_ramp(low_1, high_1, i0) * ntk_factor; - const float theta_mix = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; - const float ramp_final = rope_ntkv2_ramp(low_2, high_2, i0) * ext_factor; - return theta_mix * (1 - ramp_final) + theta_base * ramp_final; + factors[0] = MAX(0, floorf(ggml_rope_ntkv2_corr_factor(n_dims, BETA_0, freq_base))); + factors[1] = MIN(n_dims - 1, ceilf(ggml_rope_ntkv2_corr_factor(n_dims, BETA_1, freq_base))); + factors[2] = MAX(0, floorf(ggml_rope_ntkv2_corr_factor(n_dims, GAMMA_0, freq_base))); + factors[3] = MIN(n_dims - 1, ceilf(ggml_rope_ntkv2_corr_factor(n_dims, GAMMA_1, freq_base))); } static void ggml_compute_forward_rope_f32( @@ -12110,7 +12112,8 @@ static void ggml_compute_forward_rope_f32( const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); - const float dims_over_base = n_dims / logf(freq_base); + float corr_factors[4]; + ggml_rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12152,8 
+12155,9 @@ static void ggml_compute_forward_rope_f32( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float theta = rope_ntkv2(theta_base, theta_ntk, dims_over_base, - freq_scale, i0, ntk_factor, ext_factor, n_dims); + const float theta_linear = freq_scale * theta_base; + const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, + i0, ntk_factor, ext_factor); const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); @@ -12250,7 +12254,8 @@ static void ggml_compute_forward_rope_f16( const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); - const float dims_over_base = n_dims / logf(freq_base); + float corr_factors[4]; + ggml_rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12292,8 +12297,9 @@ static void ggml_compute_forward_rope_f16( } } if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float theta = rope_ntkv2(theta_base, theta_ntk, dims_over_base, - freq_scale, i0, ntk_factor, ext_factor, n_dims); + const float theta_linear = freq_scale * theta_base; + const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, + i0, ntk_factor, ext_factor); const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); diff --git a/ggml.h b/ggml.h index 459d217df8068..c2c6b7b1d376c 100644 --- a/ggml.h +++ b/ggml.h @@ -1211,6 +1211,9 @@ extern "C" { float ntk_factor, float ext_factor); + // compute correction factors for NTKv2 RoPE scaling + void ggml_rope_ntkv2_corr_factors(int n_dims, const float freq_base, float factors[4]); + // rotary position embedding backward, i.e compute dx from dy // a - dy GGML_API struct ggml_tensor * ggml_rope_back( From 9348aa4df9889707f8dd4189e544cf4b816ff798 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Fri, 21 Jul 2023 17:10:57 -0400 Subject: [PATCH 
03/28] Metal implementation --- ggml-metal.m | 7 +++-- ggml-metal.metal | 67 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index b47a98e214b61..372d3e696e9c3 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1035,10 +1035,11 @@ void ggml_metal_graph_compute( const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - float freq_base; - float freq_scale; + float freq_base, freq_scale, ntk_factor, ext_factor; memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1064,6 +1065,8 @@ void ggml_metal_graph_compute( [encoder setBytes:&mode length:sizeof( int) atIndex:20]; [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; + [encoder setBytes:&ntk_factor length:sizeof(float) atIndex:23]; + [encoder setBytes:&ext_factor length:sizeof(float) atIndex:24]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 8d26b5ec2dfa4..347fd17ac093e 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -597,6 +597,55 @@ kernel void kernel_alibi_f32( } } +static float rope_ntkv2_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / min(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +// NTKv2 algorithm based on LlamaPartNTKScaledRotaryEmbedding.py from https://github.com/jquesnelle/scaled-rope +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. 
+static float rope_ntkv2( + const float theta_base, + const float theta_linear, + const float theta_ntk, + const float corr_factors[4], + const int64_t i0, + const float ntk_factor, + const float ext_factor) { + float ramp_mix; + float theta; + + ramp_mix = rope_ntkv2_ramp(corr_factors[0], corr_factors[1], i0) * ntk_factor; + theta = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; + + ramp_mix = rope_ntkv2_ramp(corr_factors[2], corr_factors[3], i0) * ext_factor; + theta = theta * (1 - ramp_mix) + theta_base * ramp_mix; + return theta; +} + +// Interpolation constants found experimentally for LLaMA (might not be totally optimal though) +// Do not change unless there is a good reason for doing so! +constant float BETA_0 = 1.75f; +constant float BETA_1 = 1.25f; +constant float GAMMA_0 = 16.0f; +constant float GAMMA_1 = 2.0f; + +constant float max_pos_emb = 2048; + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float rope_ntkv2_corr_factor(const int n_dims, const float n_rot, const float base) { + return n_dims * log(max_pos_emb / (n_rot * 2 * M_PI_F)) / (2 * log(base)); +} + +static void rope_ntkv2_corr_factors(int n_dims, const float freq_base, float factors[4]) { + // start and end correction factors + factors[0] = max(0.0f, floor(rope_ntkv2_corr_factor(n_dims, BETA_0, freq_base))); + factors[1] = min(n_dims - 1.0f, ceil(rope_ntkv2_corr_factor(n_dims, BETA_1, freq_base))); + factors[2] = max(0.0f, floor(rope_ntkv2_corr_factor(n_dims, GAMMA_0, freq_base))); + factors[3] = min(n_dims - 1.0f, ceil(rope_ntkv2_corr_factor(n_dims, GAMMA_1, freq_base))); +} + kernel void kernel_rope( device const void * src0, device float * dst, @@ -621,24 +670,33 @@ kernel void kernel_rope( constant int & mode, constant float & freq_base, constant float & freq_scale, + constant float & ntk_factor, + constant float & ext_factor, uint3 
tpig[[thread_position_in_grid]]) { const int64_t i3 = tpig[2]; const int64_t i2 = tpig[1]; const int64_t i1 = tpig[0]; - const bool is_neox = mode & 2; const float theta_scale = pow(freq_base, -2.0f/n_dims); + const float theta_ntk_scale = pow(freq_base * pow(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); + float corr_factors[4]; + rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors); - const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2); + float theta_base = (mode & 1) == 0 ? n_past + i2 : i2; + float theta_ntk = theta_base; - float theta = freq_scale * (float)p; + const bool is_neox = mode & 2; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float theta_linear = freq_scale * theta_base; + const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, + i0, ntk_factor, ext_factor); const float cos_theta = cos(theta); const float sin_theta = sin(theta); - theta *= theta_scale; + theta_base *= theta_scale; + theta_ntk *= theta_ntk_scale; device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -650,6 +708,7 @@ kernel void kernel_rope( dst_data[1] = x0*sin_theta + x1*cos_theta; } } else { + theta_base *= freq_scale; // TODO: implement } } From a30ae2095c91adbb9e3d626c1bae234fcb43e669 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Mon, 4 Sep 2023 20:08:17 -0400 Subject: [PATCH 04/28] implement new YaRN algorithm --- examples/common.cpp | 64 +++++++++------ examples/common.h | 4 +- examples/server/server.cpp | 28 +++++-- ggml-cuda.cu | 81 +++++++++---------- ggml-metal.m | 22 +++--- ggml-metal.metal | 72 +++++++---------- ggml.c | 158 +++++++++++++++++-------------------- ggml.h | 16 ++-- llama.cpp | 111 +++++++++++++++----------- llama.h | 10 ++- 10 files changed, 300 insertions(+), 266 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp 
index 957022d0cb360..ef08c403c2eb8 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -194,18 +194,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); - } else if (arg == "--rope-ntk-factor") { + } else if (arg == "--rope-ext-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_ntk_factor = std::stof(argv[i]); - } else if (arg == "--rope-ext-factor") { + params.rope_ext_factor = std::stof(argv[i]); + } else if (arg == "--rope-attn-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_ext_factor = std::stof(argv[i]); + params.rope_attn_factor = std::stof(argv[i]); + } else if (arg == "--rope-beta-fast") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_beta_fast = std::stof(argv[i]); + } else if (arg == "--rope-beta-slow") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_beta_slow = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -578,8 +590,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); - fprintf(stdout, " --rope-ntk-factor N RoPE NTK mix factor (default: %.1f)\n", params.rope_ntk_factor); fprintf(stdout, " --rope-ext-factor N RoPE extrapolation mix factor (default: %.1f)\n", params.rope_ext_factor); + fprintf(stdout, " --rope-attn-factor N RoPE magnitude scaling factor (default: %.1f)\n", params.rope_attn_factor); + fprintf(stdout, " --rope-beta-fast N RoPE low correction dim (default: %.1f)\n", params.rope_beta_fast); + fprintf(stdout, " --rope-beta-slow N RoPE high 
correction dim (default: %.1f)\n", params.rope_beta_slow); fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); fprintf(stdout, " --no-penalize-nl do not penalize newline token\n"); fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); @@ -654,25 +668,27 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto lparams = llama_context_default_params(); - lparams.n_ctx = params.n_ctx; - lparams.n_batch = params.n_batch; - lparams.n_gqa = params.n_gqa; - lparams.rms_norm_eps = params.rms_norm_eps; - lparams.n_gpu_layers = params.n_gpu_layers; - lparams.main_gpu = params.main_gpu; - lparams.tensor_split = params.tensor_split; - lparams.low_vram = params.low_vram; - lparams.mul_mat_q = params.mul_mat_q; - lparams.seed = params.seed; - lparams.f16_kv = params.memory_f16; - lparams.use_mmap = params.use_mmap; - lparams.use_mlock = params.use_mlock; - lparams.logits_all = params.perplexity; - lparams.embedding = params.embedding; - lparams.rope_freq_base = params.rope_freq_base; - lparams.rope_freq_scale = params.rope_freq_scale; - lparams.rope_ntk_factor = params.rope_ntk_factor; - lparams.rope_ext_factor = params.rope_ext_factor; + lparams.n_ctx = params.n_ctx; + lparams.n_batch = params.n_batch; + lparams.n_gqa = params.n_gqa; + lparams.rms_norm_eps = params.rms_norm_eps; + lparams.n_gpu_layers = params.n_gpu_layers; + lparams.main_gpu = params.main_gpu; + lparams.tensor_split = params.tensor_split; + lparams.low_vram = params.low_vram; + lparams.mul_mat_q = params.mul_mat_q; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.use_mmap = params.use_mmap; + lparams.use_mlock = params.use_mlock; + lparams.logits_all = params.perplexity; + lparams.embedding = params.embedding; + lparams.rope_freq_base = params.rope_freq_base; + 
lparams.rope_freq_scale = params.rope_freq_scale; + lparams.rope_ext_factor = params.rope_ext_factor; + lparams.rope_attn_factor = params.rope_attn_factor; + lparams.rope_beta_fast = params.rope_beta_fast; + lparams.rope_beta_slow = params.rope_beta_slow; return lparams; } diff --git a/examples/common.h b/examples/common.h index 677676ad131b5..8410b38a5f48b 100644 --- a/examples/common.h +++ b/examples/common.h @@ -32,8 +32,10 @@ struct gpt_params { float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon float rope_freq_base = 10000.0f; // RoPE base frequency float rope_freq_scale = 1.0f; // RoPE frequency scaling factor - float rope_ntk_factor = 0.0f; // RoPE NTK mix factor float rope_ext_factor = 0.0f; // RoPE extrapolation mix factor + float rope_attn_factor = 1.0f; // RoPE magnitude scaling factor + float rope_beta_fast = 32.0f; // RoPE low correction dim + float rope_beta_slow = 1.0f; // RoPE high correction dim // sampling parameters std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 49d2dd0508e0f..9721f269233be 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -612,8 +612,10 @@ static void server_print_usage(const char *argv0, const gpt_params &params, fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! 
use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); - fprintf(stdout, " --rope-ntk-factor N RoPE NTK mix factor (default: %.1f)\n", params.rope_ntk_factor); fprintf(stdout, " --rope-ext-factor N RoPE extrapolation mix factor (default: %.1f)\n", params.rope_ext_factor); + fprintf(stdout, " --rope-attn-factor N RoPE magnitude scaling factor (default: %.1f)\n", params.rope_attn_factor); + fprintf(stdout, " --rope-beta-fast N RoPE low correction dim (default: %.1f)\n", params.rope_beta_fast); + fprintf(stdout, " --rope-beta-slow N RoPE high correction dim (default: %.1f)\n", params.rope_beta_slow); fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); @@ -766,21 +768,37 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.rope_freq_scale = std::stof(argv[i]); } - else if (arg == "--rope-ntk-factor") + else if (arg == "--rope-ext-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_ntk_factor = std::stof(argv[i]); + params.rope_ext_factor = std::stof(argv[i]); } - else if (arg == "--rope-ext-factor") + else if (arg == "--rope-attn-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_ext_factor = std::stof(argv[i]); + params.rope_attn_factor = std::stof(argv[i]); + } + else if (arg == "--rope-beta-fast") + { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_beta_fast = std::stof(argv[i]); + } + else if (arg == "--rope-beta-slow") + { + if (++i >= argc) { + invalid_param = true; + break; + } 
+ params.rope_beta_slow = std::stof(argv[i]); } else if (arg == "--memory-f32" || arg == "--memory_f32") { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 91a6edca60011..dedb87efd2ffa 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3558,34 +3558,31 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, cpy_1(cx + x_offset, cdst + dst_offset); } -static __device__ float rope_ntkv2_ramp(const float low, const float high, const int i0) { +static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / min(0.001f, high - low); return 1.0f - min(1.0f, max(0.0f, y)); } -struct rope_corr_factors { +struct rope_corr_dims { float v[4]; }; -// NTKv2 algorithm based on LlamaPartNTKScaledRotaryEmbedding.py from https://github.com/jquesnelle/scaled-rope +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. 
-static __device__ float rope_ntkv2( - const float theta_base, - const float theta_linear, - const float theta_ntk, - const rope_corr_factors corr_factors, - const int64_t i0, - const float ntk_factor, - const float ext_factor) { - float ramp_mix; - float theta; - - ramp_mix = rope_ntkv2_ramp(corr_factors.v[0], corr_factors.v[1], i0) * ntk_factor; - theta = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; - - ramp_mix = rope_ntkv2_ramp(corr_factors.v[2], corr_factors.v[3], i0) * ext_factor; - theta = theta * (1 - ramp_mix) + theta_base * ramp_mix; - return theta; +static __device__ void rope_yarn( + float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; + float theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + if (freq_scale > 1.0f) + mscale *= 1.0f + 0.1f * logf(freq_scale); + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; } // rope == RoPE == rotary positional embedding @@ -3594,13 +3591,11 @@ static __global__ void rope_f32( float * dst, const int ncols, const float freq_scale, - const float ntk_factor, const float ext_factor, const float theta_scale, - const float theta_ntk_scale, const float p0, const int p_delta_rows, - const rope_corr_factors corr_factors) { + const rope_corr_dims corr_dims) { const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x); if (col >= ncols) { @@ -3612,11 +3607,9 @@ static __global__ void rope_f32( const float p = p0 + row / p_delta_rows; const float theta_base = p*powf(theta_scale, col/2); - const float theta_linear = freq_scale * theta_base; - const float theta_ntk = p*powf(theta_ntk_scale, col/2); - const float theta = 
rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, col, ntk_factor, ext_factor); - const float sin_theta = sinf(theta); - const float cos_theta = cosf(theta); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); const float x0 = x[i + 0]; const float x1 = x[i + 1]; @@ -4284,20 +4277,19 @@ static void rope_f32_cuda( const int ncols, const int nrows, const float freq_scale, - const float ntk_factor, const float ext_factor, const float theta_scale, - const float theta_ntk_scale, const float p0, const int p_delta_rows, - const rope_corr_factors corr_factors, + const rope_corr_dims corr_dims, cudaStream_t stream) { GGML_ASSERT(nrows % 2 == 0); const dim3 block_dims(2*CUDA_ROPE_BLOCK_SIZE, 1, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(num_blocks_x, nrows, 1); - rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, freq_scale, ntk_factor, ext_factor, theta_scale, - theta_ntk_scale, p0, p_delta_rows, corr_factors); + rope_f32<<<block_nums, block_dims, 0, stream>>>( + x, dst, ncols, freq_scale, ext_factor, theta_scale, p0, p_delta_rows, corr_dims + ); } static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) { @@ -5000,11 +4992,13 @@ inline void ggml_cuda_op_rope( const int n_ctx = ((int32_t *) dst->op_params)[3]; // RoPE alteration for extended context - float freq_base, freq_scale, ntk_factor, ext_factor; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, 
(int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); @@ -5018,12 +5012,13 @@ inline void ggml_cuda_op_rope( rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main); } else { const float p0 = (mode & 1) == 0 ? n_past : 0; - const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); - rope_corr_factors corr_factors; - ggml_rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors.v); + rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims.v); - rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, freq_scale, ntk_factor, ext_factor, theta_scale, - theta_ntk_scale, p0, ne01, corr_factors, cudaStream_main); + rope_f32_cuda( + src0_ddf_i, dst_ddf_i, ne00, i01_diff, freq_scale, ext_factor, theta_scale, p0, ne01, corr_dims, + cudaStream_main + ); } (void) src1; diff --git a/ggml-metal.m b/ggml-metal.m index 372d3e696e9c3..0e8b0f9a958b3 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1035,11 +1035,13 @@ void ggml_metal_graph_compute( const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - float freq_base, freq_scale, ntk_factor, ext_factor; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + 
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1063,10 +1065,12 @@ void ggml_metal_graph_compute( [encoder setBytes:&n_past length:sizeof( int) atIndex:18]; [encoder setBytes:&n_dims length:sizeof( int) atIndex:19]; [encoder setBytes:&mode length:sizeof( int) atIndex:20]; - [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; - [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; - [encoder setBytes:&ntk_factor length:sizeof(float) atIndex:23]; - [encoder setBytes:&ext_factor length:sizeof(float) atIndex:24]; + [encoder setBytes:&freq_base length:sizeof(float) atIndex:21]; + [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22]; + [encoder setBytes:&ext_factor length:sizeof(float) atIndex:23]; + [encoder setBytes:&attn_factor length:sizeof(float) atIndex:24]; + [encoder setBytes:&beta_fast length:sizeof(float) atIndex:25]; + [encoder setBytes:&beta_slow length:sizeof(float) atIndex:26]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 347fd17ac093e..f5a98d09278e3 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -597,53 +597,41 @@ kernel void kernel_alibi_f32( } } -static float rope_ntkv2_ramp(const float low, const float high, const int i0) { +static float rope_yarn_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / min(0.001f, high - low); return 1.0f - min(1.0f, max(0.0f, y)); } -// NTKv2 algorithm based on LlamaPartNTKScaledRotaryEmbedding.py from 
https://github.com/jquesnelle/scaled-rope +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. -static float rope_ntkv2( - const float theta_base, - const float theta_linear, - const float theta_ntk, - const float corr_factors[4], - const int64_t i0, - const float ntk_factor, - const float ext_factor) { - float ramp_mix; - float theta; - - ramp_mix = rope_ntkv2_ramp(corr_factors[0], corr_factors[1], i0) * ntk_factor; - theta = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; - - ramp_mix = rope_ntkv2_ramp(corr_factors[2], corr_factors[3], i0) * ext_factor; - theta = theta * (1 - ramp_mix) + theta_base * ramp_mix; - return theta; +static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + float theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + if (freq_scale > 1.0f) + mscale *= 1.0f + 0.1f * logf(freq_scale); + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; } -// Interpolation constants found experimentally for LLaMA (might not be totally optimal though) -// Do not change unless there is a good reason for doing so! 
-constant float BETA_0 = 1.75f; -constant float BETA_1 = 1.25f; -constant float GAMMA_0 = 16.0f; -constant float GAMMA_1 = 2.0f; - constant float max_pos_emb = 2048; // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get // `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -static float rope_ntkv2_corr_factor(const int n_dims, const float n_rot, const float base) { +static float rope_yarn_corr_factor(const int n_dims, const float n_rot, const float base) { return n_dims * log(max_pos_emb / (n_rot * 2 * M_PI_F)) / (2 * log(base)); } -static void rope_ntkv2_corr_factors(int n_dims, const float freq_base, float factors[4]) { - // start and end correction factors - factors[0] = max(0.0f, floor(rope_ntkv2_corr_factor(n_dims, BETA_0, freq_base))); - factors[1] = min(n_dims - 1.0f, ceil(rope_ntkv2_corr_factor(n_dims, BETA_1, freq_base))); - factors[2] = max(0.0f, floor(rope_ntkv2_corr_factor(n_dims, GAMMA_0, freq_base))); - factors[3] = min(n_dims - 1.0f, ceil(rope_ntkv2_corr_factor(n_dims, GAMMA_1, freq_base))); +static void rope_yarn_corr_dims(int n_dims, const float freq_base, float beta_fast, float beta_slow, float dims[2]) { + // start and end correction dims + dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, beta_fast, freq_base))); + dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, beta_slow, freq_base))); } kernel void kernel_rope( @@ -670,33 +658,29 @@ kernel void kernel_rope( constant int & mode, constant float & freq_base, constant float & freq_scale, - constant float & ntk_factor, constant float & ext_factor, + constant float & attn_factor, + constant float & beta_fast, + constant float & beta_slow, uint3 tpig[[thread_position_in_grid]]) { const int64_t i3 = tpig[2]; const int64_t i2 = tpig[1]; const int64_t i1 = tpig[0]; const float theta_scale = pow(freq_base, -2.0f/n_dims); - const float theta_ntk_scale = pow(freq_base * pow(freq_scale, (n_dims / (n_dims - 2.0f))), 
-2.0f/n_dims); - float corr_factors[4]; - rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors); + float corr_dims[2]; + rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims); float theta_base = (mode & 1) == 0 ? n_past + i2 : i2; - float theta_ntk = theta_base; const bool is_neox = mode & 2; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float theta_linear = freq_scale * theta_base; - const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, - i0, ntk_factor, ext_factor); - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); theta_base *= theta_scale; - theta_ntk *= theta_ntk_scale; device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); diff --git a/ggml.c b/ggml.c index 8a57391811dcc..8bf7f52e223c5 100644 --- a/ggml.c +++ b/ggml.c @@ -6712,8 +6712,10 @@ static struct ggml_tensor * ggml_rope_impl( int n_ctx, float freq_base, float freq_scale, - float ntk_factor, float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow, bool inplace) { GGML_ASSERT(n_past >= 0); bool is_node = false; @@ -6724,11 +6726,13 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[8] = { n_past, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); - memcpy(params + 5, &freq_scale, sizeof(float)); - memcpy(params + 6, &ntk_factor, sizeof(float)); - memcpy(params + 7, &ext_factor, sizeof(float)); + int32_t params[10] = { n_past, n_dims, mode, n_ctx }; + memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 5, &freq_scale, sizeof(float)); + memcpy(params + 6, &ext_factor, sizeof(float)); + memcpy(params + 7, &attn_factor, sizeof(float)); + memcpy(params + 8, &beta_fast, sizeof(float)); + memcpy(params + 9, &beta_slow, sizeof(float)); ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; @@ -6745,7 +6749,7 @@ struct ggml_tensor * ggml_rope( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 0.0f, false); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false); } struct ggml_tensor * ggml_rope_inplace( @@ -6755,7 +6759,7 @@ struct ggml_tensor * ggml_rope_inplace( int n_dims, int mode, int n_ctx) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 0.0f, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true); } struct ggml_tensor * ggml_rope_custom( @@ -6767,9 +6771,13 @@ struct ggml_tensor * ggml_rope_custom( int n_ctx, float freq_base, float freq_scale, - float ntk_factor, - float ext_factor) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ntk_factor, ext_factor, false); + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return ggml_rope_impl( + ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, false + ); } struct ggml_tensor * ggml_rope_custom_inplace( @@ -6781,9 +6789,13 @@ struct 
ggml_tensor * ggml_rope_custom_inplace( int n_ctx, float freq_base, float freq_scale, - float ntk_factor, - float ext_factor) { - return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ntk_factor, ext_factor, true); + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + return ggml_rope_impl( + ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, true + ); } // ggml_rope_back @@ -12012,52 +12024,40 @@ static void ggml_compute_forward_clamp( // ggml_compute_forward_rope -static inline float rope_ntkv2_ramp(const float low, const float high, const int i0) { +static inline float rope_yarn_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / MIN(0.001f, high - low); return 1 - MIN(1, MAX(0, y)); } -// NTKv2 algorithm based on LlamaPartNTKScaledRotaryEmbedding.py from https://github.com/jquesnelle/scaled-rope +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. 
-static float rope_ntkv2( - const float theta_base, - const float theta_linear, - const float theta_ntk, - const float corr_factors[4], - const int64_t i0, - const float ntk_factor, - const float ext_factor) { - float ramp_mix; - float theta; - - ramp_mix = rope_ntkv2_ramp(corr_factors[0], corr_factors[1], i0) * ntk_factor; - theta = theta_linear * (1 - ramp_mix) + theta_ntk * ramp_mix; - - ramp_mix = rope_ntkv2_ramp(corr_factors[2], corr_factors[3], i0) * ext_factor; - theta = theta * (1 - ramp_mix) + theta_base * ramp_mix; - return theta; +static void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + float theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + if (freq_scale > 1.0f) + mscale *= 1.0f + 0.1f * logf(freq_scale); + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; } // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get -// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -static float ggml_rope_ntkv2_corr_factor(const int n_dims, const float n_rot, const float base) { +// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float ggml_rope_yarn_corr_dim(const int n_dims, const float n_rot, const float base) { static const float max_pos_emb = 2048; return n_dims * logf(max_pos_emb / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); } -void ggml_rope_ntkv2_corr_factors(int n_dims, const float freq_base, float factors[4]) { - // Interpolation constants found experimentally for LLaMA (might not be totally optimal though) - // Do not change unless there is a good reason 
for doing so! - static const float BETA_0 = 1.75f; - static const float BETA_1 = 1.25f; - static const float GAMMA_0 = 16.0f; - static const float GAMMA_1 = 2.0f; - - // start and end correction factors - factors[0] = MAX(0, floorf(ggml_rope_ntkv2_corr_factor(n_dims, BETA_0, freq_base))); - factors[1] = MIN(n_dims - 1, ceilf(ggml_rope_ntkv2_corr_factor(n_dims, BETA_1, freq_base))); - factors[2] = MAX(0, floorf(ggml_rope_ntkv2_corr_factor(n_dims, GAMMA_0, freq_base))); - factors[3] = MIN(n_dims - 1, ceilf(ggml_rope_ntkv2_corr_factor(n_dims, GAMMA_1, freq_base))); +void ggml_rope_yarn_corr_dims(int n_dims, const float freq_base, float beta_fast, float beta_slow, float dims[2]) { + // start and end correction dims + dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, beta_fast, freq_base))); + dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, beta_slow, freq_base))); } static void ggml_compute_forward_rope_f32( @@ -12069,19 +12069,18 @@ static void ggml_compute_forward_rope_f32( return; } - float freq_base; - float freq_scale; - float ntk_factor; - float ext_factor; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 8, 
sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); assert(n_past >= 0); @@ -12111,9 +12110,8 @@ static void ggml_compute_forward_rope_f32( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); - const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); - float corr_factors[4]; - ggml_rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors); + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12126,7 +12124,6 @@ static void ggml_compute_forward_rope_f32( if (ir > ir1) break; float theta_base = (float)p; - float theta_ntk = theta_base; if (is_glm) { theta_base = MIN(p, n_ctx - 2); @@ -12155,14 +12152,12 @@ static void ggml_compute_forward_rope_f32( } } else if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float theta_linear = freq_scale * theta_base; - const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, - i0, ntk_factor, ext_factor); - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); theta_base *= theta_scale; - theta_ntk *= theta_ntk_scale; const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); @@ -12211,19 +12206,18 @@ static void ggml_compute_forward_rope_f16( return; } - float freq_base; - float freq_scale; - float ntk_factor; - float ext_factor; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; const int n_ctx = ((int32_t *) 
dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ntk_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); assert(n_past >= 0); @@ -12253,9 +12247,8 @@ static void ggml_compute_forward_rope_f16( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); - const float theta_ntk_scale = powf(freq_base * powf(freq_scale, (n_dims / (n_dims - 2.0f))), -2.0f/n_dims); - float corr_factors[4]; - ggml_rope_ntkv2_corr_factors(n_dims, freq_base, corr_factors); + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12268,7 +12261,6 @@ static void ggml_compute_forward_rope_f16( if (ir > ir1) break; float theta_base = (float)p; - float theta_ntk = theta_base; if (is_glm) { theta_base = MIN(p, n_ctx - 2); @@ -12297,14 +12289,12 @@ static void ggml_compute_forward_rope_f16( } } if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { - const float theta_linear = freq_scale * theta_base; - const float theta = rope_ntkv2(theta_base, theta_linear, theta_ntk, corr_factors, - i0, ntk_factor, ext_factor); - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta + ); theta_base *= theta_scale; - theta_ntk *= 
theta_ntk_scale; const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); diff --git a/ggml.h b/ggml.h index c2c6b7b1d376c..06e1cbf7b7312 100644 --- a/ggml.h +++ b/ggml.h @@ -1195,8 +1195,10 @@ extern "C" { int n_ctx, float freq_base, float freq_scale, - float ntk_factor, - float ext_factor); + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_custom_inplace( @@ -1208,11 +1210,13 @@ extern "C" { int n_ctx, float freq_base, float freq_scale, - float ntk_factor, - float ext_factor); + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); - // compute correction factors for NTKv2 RoPE scaling - void ggml_rope_ntkv2_corr_factors(int n_dims, const float freq_base, float factors[4]); + // compute correction dims for YaRN RoPE scaling + void ggml_rope_yarn_corr_dims(int n_dims, const float freq_base, float beta_fast, float beta_slow, float dims[2]); // rotary position embedding backward, i.e compute dx from dy // a - dy diff --git a/llama.cpp b/llama.cpp index 945215de8a31e..3ea0650827357 100644 --- a/llama.cpp +++ b/llama.cpp @@ -194,10 +194,12 @@ struct llama_hparams { float f_ffn_mult = 1.0f; float f_rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; - float rope_freq_base = 10000.0f; - float rope_freq_scale = 1.0f; - float rope_ntk_factor = 0.0f; - float rope_ext_factor = 0.0f; + float rope_freq_base = 10000.0f; + float rope_freq_scale = 1.0f; + float rope_ext_factor = 0.0f; + float rope_attn_factor = 1.0f; + float rope_beta_fast = 0.0f; + float rope_beta_slow = 0.0f; enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; @@ -900,8 +902,10 @@ struct llama_context_params llama_context_default_params() { /*.tensor_split =*/ nullptr, /*.rope_freq_base =*/ 10000.0f, /*.rope_freq_scale =*/ 1.0f, - /*.rope_ntk_factor =*/ 
0.0f, /*.rope_ext_factor =*/ 0.0f, + /*.rope_attn_factor =*/ 1.0f, + /*.rope_beta_fast =*/ 32.0f, + /*.rope_beta_slow =*/ 1.0f, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, @@ -1036,8 +1040,10 @@ static void llama_model_load_internal( const bool mul_mat_q, float rope_freq_base, float rope_freq_scale, - float rope_ntk_factor, float rope_ext_factor, + float rope_attn_factor, + float rope_beta_fast, + float rope_beta_slow, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1087,10 +1093,12 @@ static void llama_model_load_internal( hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model } - hparams.rope_freq_base = rope_freq_base; - hparams.rope_freq_scale = rope_freq_scale; - hparams.rope_ntk_factor = rope_ntk_factor; - hparams.rope_ext_factor = rope_ext_factor; + hparams.rope_freq_base = rope_freq_base; + hparams.rope_freq_scale = rope_freq_scale; + hparams.rope_ext_factor = rope_ext_factor; + hparams.rope_attn_factor = rope_attn_factor; + hparams.rope_beta_fast = rope_beta_fast; + hparams.rope_beta_slow = rope_beta_slow; } // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199 @@ -1100,24 +1108,26 @@ static void llama_model_load_internal( //const uint32_t n_ff = 28672; { - fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); - fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); - fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); - fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); - fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); - fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); - fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); - fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. 
n_embd_head, n_head_dim - fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa()); - fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps); - fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); - fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); - fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); - fprintf(stderr, "%s: ntk_factor = %g\n", __func__, hparams.rope_ntk_factor); - fprintf(stderr, "%s: ext_factor = %g\n", __func__, hparams.rope_ext_factor); - fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); - fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); + fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); + fprintf(stderr, "%s: n_vocab = %u\n", __func__, hparams.n_vocab); + fprintf(stderr, "%s: n_ctx = %u\n", __func__, hparams.n_ctx); + fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); + fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); + fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); + fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); + fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. 
n_embd_head, n_head_dim + fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa()); + fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps); + fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); + fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); + fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); + fprintf(stderr, "%s: ext_factor = %g\n", __func__, hparams.rope_ext_factor); + fprintf(stderr, "%s: attn_factor = %g\n", __func__, hparams.rope_attn_factor); + fprintf(stderr, "%s: beta_fast = %g\n", __func__, hparams.rope_beta_fast); + fprintf(stderr, "%s: beta_slow = %g\n", __func__, hparams.rope_beta_slow); + fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); + fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { @@ -1384,8 +1394,10 @@ static bool llama_model_load( const bool mul_mat_q, float rope_freq_base, float rope_freq_scale, - float rope_ntk_factor, float rope_ext_factor, + float rope_attn_factor, + float rope_beta_fast, + float rope_beta_slow, bool low_vram, ggml_type memory_type, bool use_mmap, @@ -1394,10 +1406,11 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, - tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, rope_ntk_factor, - rope_ext_factor, low_vram, memory_type, use_mmap, use_mlock, vocab_only, - progress_callback, progress_callback_user_data); + llama_model_load_internal( + fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, mul_mat_q, + rope_freq_base, rope_freq_scale, rope_ext_factor, rope_attn_factor, rope_beta_fast, rope_beta_slow, + low_vram, memory_type, use_mmap, use_mlock, vocab_only, 
progress_callback, progress_callback_user_data + ); return true; } catch (const std::exception & err) { fprintf(stderr, "error loading model: %s\n", err.what()); @@ -1433,10 +1446,12 @@ static struct ggml_cgraph * llama_build_graph( LLAMA_ASSERT(n_embd_head == hparams.n_rot); - const float freq_base = hparams.rope_freq_base; - const float freq_scale = hparams.rope_freq_scale; - const float ntk_factor = hparams.rope_ntk_factor; - const float ext_factor = hparams.rope_ext_factor; + const float freq_base = hparams.rope_freq_base; + const float freq_scale = hparams.rope_freq_scale; + const float ext_factor = hparams.rope_ext_factor; + const float attn_factor = hparams.rope_attn_factor; + const float beta_fast = hparams.rope_beta_fast; + const float beta_slow = hparams.rope_beta_slow; const float rms_norm_eps = hparams.f_rms_norm_eps; const int n_gpu_layers = model.n_gpu_layers; @@ -1567,14 +1582,16 @@ static struct ggml_cgraph * llama_build_graph( ggml_set_name(tmpq, "tmpq"); struct ggml_tensor * Kcur = ggml_rope_custom_inplace( - ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, - freq_scale, ntk_factor, ext_factor); + ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, + freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); struct ggml_tensor * Qcur = ggml_rope_custom_inplace( - ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, - freq_scale, ntk_factor, ext_factor); + ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, + freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -3216,11 +3233,13 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, - params.rms_norm_eps, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.mul_mat_q, - params.rope_freq_base, params.rope_freq_scale, params.rope_ntk_factor, params.rope_ext_factor, - params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, - params.progress_callback, params.progress_callback_user_data)) { + if (!llama_model_load( + path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, + params.n_gpu_layers, params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, + params.rope_freq_scale, params.rope_ext_factor, params.rope_attn_factor, params.rope_beta_fast, + params.rope_beta_slow, params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, + params.progress_callback, params.progress_callback_user_data + )) { delete model; fprintf(stderr, "%s: failed to load model\n", __func__); return nullptr; diff --git a/llama.h b/llama.h index 25bb3952a73f8..66c78c7619dd5 100644 --- a/llama.h +++ b/llama.h @@ -98,10 +98,12 @@ extern "C" { const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency - float rope_freq_scale; // RoPE frequency scaling factor - float rope_ntk_factor; // RoPE NTK mix factor - float rope_ext_factor; // RoPE extrapolation mix factor + float rope_freq_base; // RoPE base frequency + float rope_freq_scale; // RoPE frequency scaling factor + float rope_ext_factor; // RoPE extrapolation mix factor + float rope_attn_factor; // RoPE magnitude scaling factor + float rope_beta_fast; // RoPE low correction dim + float rope_beta_slow; // RoPE high correction dim // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback 
progress_callback; From 826269adc52894d4c44165cf1437fd8d1585c653 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Mon, 4 Sep 2023 22:15:59 -0400 Subject: [PATCH 05/28] ggml : increase GGML_MAX_OP_PARAMS --- ggml.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.h b/ggml.h index 1d52f8f048a3d..7084687cf8caa 100644 --- a/ggml.h +++ b/ggml.h @@ -211,7 +211,7 @@ #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 #define GGML_MAX_NAME 64 -#define GGML_MAX_OP_PARAMS 32 +#define GGML_MAX_OP_PARAMS 48 #define GGML_DEFAULT_N_THREADS 4 #if UINTPTR_MAX == 0xFFFFFFFF From cf731d56480b8f155cc163d9bd45b681c80fba47 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Tue, 5 Sep 2023 14:14:05 -0400 Subject: [PATCH 06/28] YaRN : avoid NaN if unused betas are zero --- ggml-cuda.cu | 7 +++++-- ggml-metal.metal | 7 +++++-- ggml.c | 7 +++++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 73a3399c539aa..c649e90a13060 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4058,8 +4058,11 @@ static __device__ void rope_yarn( ) { // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; - float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; - float theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + } // Get n-d magnitude scaling corrected for interpolation if (freq_scale > 1.0f) diff --git a/ggml-metal.metal b/ggml-metal.metal index 6b0194d51e4c2..a1eb2d0d8c677 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -688,8 +688,11 @@ static void rope_yarn( ) { // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; - float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * 
ext_factor; - float theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + float theta = theta_interp; + if (ext_factor != 0.0f) { + ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + } // Get n-d magnitude scaling corrected for interpolation if (freq_scale > 1.0f) diff --git a/ggml.c b/ggml.c index 94a47faa166eb..85316a3a16812 100644 --- a/ggml.c +++ b/ggml.c @@ -12626,8 +12626,11 @@ static void rope_yarn( ) { // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; - float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; - float theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + } // Get n-d magnitude scaling corrected for interpolation if (freq_scale > 1.0f) From dcb058ce5dbed14cce4a4e3ee628a5df7802d71a Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Tue, 5 Sep 2023 14:17:50 -0400 Subject: [PATCH 07/28] YaRN : fix missing parameter in CUDA impl --- ggml-cuda.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c649e90a13060..87ad07b0bc874 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4073,8 +4073,8 @@ static __device__ void rope_yarn( // rope == RoPE == rotary positional embedding static __global__ void rope_f32( - const float * x, float * dst, const int ncols, const float freq_scale, const float ext_factor, - const float theta_scale, const float p0, const int p_delta_rows, const rope_corr_dims corr_dims + float * x, float * dst, int ncols, float freq_scale, float ext_factor, float attn_factor, float theta_scale, + float p0, int p_delta_rows, rope_corr_dims corr_dims ) { const int col = 
2*(blockDim.y*blockIdx.y + threadIdx.y); @@ -4086,7 +4086,7 @@ static __global__ void rope_f32( const int i = row*ncols + col; const float p = p0 + row / p_delta_rows; - const float theta_base = p*powf(theta_scale, col/2); + const float theta_base = p*powf(theta_scale, col/2); float cos_theta, sin_theta; rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); @@ -5001,15 +5001,15 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons } static void rope_f32_cuda( - const float * x, float * dst, const int ncols, const int nrows, const float freq_scale, const float ext_factor, - const float theta_scale, const float p0, const int p_delta_rows, const rope_corr_dims corr_dims, cudaStream_t stream + float * x, float * dst, int ncols, int nrows, float freq_scale, float ext_factor, float attn_factor, + float theta_scale, float p0, int p_delta_rows, rope_corr_dims corr_dims, cudaStream_t stream ) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); rope_f32<<>>( - x, dst, ncols, freq_scale, ext_factor, theta_scale, p0, p_delta_rows, corr_dims + x, dst, ncols, freq_scale, ext_factor, attn_factor, theta_scale, p0, p_delta_rows, corr_dims ); } @@ -5785,8 +5785,8 @@ inline void ggml_cuda_op_rope( ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims.v); rope_f32_cuda( - src0_ddf_i, dst_ddf_i, ne00, i01_diff, freq_scale, ext_factor, theta_scale, p0, ne01, corr_dims, - cudaStream_main + src0_ddf_i, dst_ddf_i, ne00, i01_diff, freq_scale, ext_factor, attn_factor, theta_scale, p0, ne01, + corr_dims, cudaStream_main ); } From 281b26e647cc4f6a8f53fed504a5a490c79cd0cd Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Wed, 6 Sep 2023 13:00:04 -0400 Subject: [PATCH 08/28] convert : reduce unnecessary variables in Params --- convert.py 
| 63 ++++++++++++++++-------------------------------------- 1 file changed, 18 insertions(+), 45 deletions(-) diff --git a/convert.py b/convert.py index 5a7483b43d563..5a05a90de3ec4 100755 --- a/convert.py +++ b/convert.py @@ -210,23 +210,12 @@ def guessed(model: LazyModel) -> Params: def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_vocab = config["vocab_size"] - n_embd = config["hidden_size"] - n_layer = config["num_hidden_layers"] - n_ff = config["intermediate_size"] - n_head = config["num_attention_heads"] - n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head - f_norm_eps = config["rms_norm_eps"] - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None - rope_scaling = config.get("rope_scaling") if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear": f_rope_scale = config["rope_scaling"].get("factor") else: f_rope_scale = None - n_mult = Params.find_n_mult(n_ff, n_embd) - if "max_sequence_length" in config: n_ctx = config["max_sequence_length"] elif "max_position_embeddings" in config: @@ -236,16 +225,16 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: "Suggestion: provide 'config.json' of the model in the same directory containing model files.") return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_mult = n_mult, - n_layer = n_layer, + n_vocab = config["vocab_size"], + n_embd = config["hidden_size"], + n_mult = Params.find_n_mult(n_ff, n_embd), + n_layer = config["num_hidden_layers"], n_ctx = n_ctx, - n_ff = n_ff, - n_head = n_head, - n_head_kv = n_head_kv, - f_norm_eps = f_norm_eps, - f_rope_freq_base = f_rope_freq_base, + n_ff = config["intermediate_size"], + n_head = config["num_attention_heads"], + n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head, + f_norm_eps = config["rms_norm_eps"], + f_rope_freq_base = config["rope_theta"] if "rope_theta" 
in config else None, f_rope_scale = f_rope_scale, ) @@ -255,16 +244,6 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_vocab = config["vocab_size"] if "vocab_size" in config else -1 - n_embd = config["dim"] - n_layer = config["n_layers"] - n_mult = config["multiple_of"] - n_ff = -1 - n_head = config["n_heads"] - n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head - f_norm_eps = config["norm_eps"] - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None - # hack to determine LLaMA v1 vs v2 vs CodeLlama if f_rope_freq_base and f_rope_freq_base == 1000000: # CodeLlama @@ -276,23 +255,17 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: # LLaMA v1 n_ctx = 2048 - if n_vocab == -1: - n_vocab = model["tok_embeddings.weight"].shape[0] - - if n_ff == -1: - n_ff = model["layers.0.feed_forward.w1.weight"].shape[0] - return Params( - n_vocab = n_vocab, - n_embd = n_embd, - n_mult = n_mult, - n_layer = n_layer, + n_vocab = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]), + n_embd = config["dim"], + n_mult = config["multiple_of"], + n_layer = config["n_layers"], n_ctx = n_ctx, - n_ff = n_ff, - n_head = n_head, - n_head_kv = n_head_kv, - f_norm_eps = f_norm_eps, - f_rope_freq_base = f_rope_freq_base, + n_ff = model["layers.0.feed_forward.w1.weight"].shape[0], + n_head = config["n_heads"], + n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head, + f_norm_eps = config["norm_eps"], + f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None, ) @staticmethod From dc26a0dd326cbdcdfd8b2e66536d06fe60189eeb Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Wed, 20 Sep 2023 21:33:33 -0400 Subject: [PATCH 09/28] llama : simplify use of context params --- llama.cpp | 95 +++++++++++++++++-------------------------------------- 1 file 
changed, 29 insertions(+), 66 deletions(-) diff --git a/llama.cpp b/llama.cpp index 3c4d1ef5e5df5..87aea24682087 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1650,22 +1650,21 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) { } } -static void llm_load_hparams( - llama_model_loader & ml, - llama_model & model, - int n_ctx, - float rope_freq_base, - float rope_freq_scale, - float rope_ext_factor, - float rope_attn_factor, - float rope_beta_fast, - float rope_beta_slow) { +static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const llama_context_params & params) { struct gguf_context * ctx = ml.ctx_gguf; const auto kv = LLM_KV(model.arch); auto & hparams = model.hparams; + hparams.n_ctx = params.n_ctx; + hparams.rope_freq_base = params.rope_freq_base; + hparams.rope_freq_scale = params.rope_freq_scale; + hparams.rope_ext_factor = params.rope_ext_factor; + hparams.rope_attn_factor = params.rope_attn_factor; + hparams.rope_beta_fast = params.rope_beta_fast; + hparams.rope_beta_slow = params.rope_beta_slow; + // get general kv GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME)); @@ -1682,16 +1681,17 @@ static void llm_load_hparams( GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); // rope_freq_base (optional) - if (rope_freq_base == 0.0f) { - rope_freq_base = 10000.0f; + if (hparams.rope_freq_base == 0.0f) { + float rope_freq_base = 10000.0f; GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + hparams.rope_freq_base = rope_freq_base; } // rope_freq_scale (inverse of the kv) is optional - if (rope_freq_scale == 0.0f) { + if (hparams.rope_freq_scale == 0.0f) { float ropescale = 1.0f; GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - rope_freq_scale = 1.0f/ropescale; + hparams.rope_freq_scale = 
1.0f/ropescale; } // sanity check for n_rot (optional) @@ -1759,14 +1759,6 @@ static void llm_load_hparams( }; model.ftype = ml.ftype; - - hparams.n_ctx = n_ctx; - hparams.rope_freq_base = rope_freq_base; - hparams.rope_freq_scale = rope_freq_scale; - hparams.rope_ext_factor = rope_ext_factor; - hparams.rope_attn_factor = rope_attn_factor; - hparams.rope_beta_fast = rope_beta_fast; - hparams.rope_beta_slow = rope_beta_slow; } // TODO: This should probably be in llama.h @@ -2388,37 +2380,13 @@ static void llm_load_tensors( model.t_load_us = ggml_time_us() - model.t_start_us; } -static bool llama_model_load( - const std::string & fname, - llama_model & model, - int n_ctx, - int n_batch, - int n_gpu_layers, - int main_gpu, - const float * tensor_split, - const bool mul_mat_q, - float rope_freq_base, - float rope_freq_scale, - float rope_ext_factor, - float rope_attn_factor, - float rope_beta_fast, - float rope_beta_slow, - bool low_vram, - ggml_type memory_type, - bool use_mmap, - bool use_mlock, - bool vocab_only, - llama_progress_callback progress_callback, - void *progress_callback_user_data) { +static bool llama_model_load(const std::string & fname, llama_model & model, const llama_context_params & params) { try { - std::unique_ptr ml(new llama_model_loader(fname, use_mmap)); + std::unique_ptr ml(new llama_model_loader(fname, params.use_mmap)); - llm_load_arch(*ml, model); - llm_load_hparams( - *ml, model, n_ctx, rope_freq_base, rope_freq_scale, rope_ext_factor, rope_attn_factor, rope_beta_fast, - rope_beta_slow - ); - llm_load_vocab(*ml, model); + llm_load_arch (*ml, model); + llm_load_hparams(*ml, model, params); + llm_load_vocab (*ml, model); llm_load_print_meta(*ml, model); @@ -2426,15 +2394,18 @@ static bool llama_model_load( throw std::runtime_error("vocab size mismatch"); } - if (vocab_only) { + if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); return true; } + ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; + llm_load_tensors( - *ml, model, n_batch, n_gpu_layers, - main_gpu, tensor_split, mul_mat_q, low_vram, memory_type, - use_mlock, progress_callback, progress_callback_user_data); + *ml, model, params.n_batch, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.mul_mat_q, + params.low_vram, memory_type, params.use_mlock, params.progress_callback, + params.progress_callback_user_data + ); } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); return false; @@ -5694,8 +5665,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::unique_ptr ml(new llama_model_loader(fname_inp, /*use_mmap*/ false)); llama_model model; - llm_load_arch(*ml, model); - llm_load_hparams(*ml, model, 0, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f); + llm_load_arch (*ml, model); + llm_load_hparams(*ml, model, llama_context_default_params()); if (params->only_copy) { ftype = model.ftype; @@ -6298,8 +6269,6 @@ struct llama_model * llama_load_model_from_file( llama_model * model = new llama_model; - ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; - unsigned cur_percentage = 0; if (params.progress_callback == NULL) { params.progress_callback_user_data = &cur_percentage; @@ -6316,13 +6285,7 @@ struct llama_model * llama_load_model_from_file( }; } - if (!llama_model_load( - path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu, params.tensor_split, - params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale, params.rope_ext_factor, - params.rope_attn_factor, params.rope_beta_fast, params.rope_beta_slow, params.low_vram, memory_type, - params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, - params.progress_callback_user_data - )) { + if (!llama_model_load(path_model, *model, params)) { LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; return nullptr; From 904d4edfa1c3ad79e32e48365168bbe0e5bc36f2 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 14 Sep 2023 13:26:10 -0400 Subject: [PATCH 10/28] llama : store YaRN parameters in GGUF --- common/common.cpp | 73 +++++++++++++--------- common/common.h | 10 +-- convert.py | 53 +++++++++++----- examples/server/server.cpp | 39 ++++++++---- gguf-py/gguf/gguf.py | 27 ++++++-- llama.cpp | 124 ++++++++++++++++++++++++++----------- llama.h | 34 ++++++---- 7 files changed, 245 insertions(+), 115 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 9e4452dabd021..ca4b9c1cc8a16 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -192,36 +192,46 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--rope-scaling") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = 
LLAMA_ROPE_SCALING_YARN; } + else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); - } else if (arg == "--rope-ext-factor") { + } else if (arg == "--yarn-ext-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_ext_factor = std::stof(argv[i]); - } else if (arg == "--rope-attn-factor") { + params.yarn_ext_factor = std::stof(argv[i]); + } else if (arg == "--yarn-attn-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_attn_factor = std::stof(argv[i]); - } else if (arg == "--rope-beta-fast") { + params.yarn_attn_factor = std::stof(argv[i]); + } else if (arg == "--yarn-beta-fast") { if (++i >= argc) { invalid_param = true; break; } - params.rope_beta_fast = std::stof(argv[i]); - } else if (arg == "--rope-beta-slow") { + params.yarn_beta_fast = std::stof(argv[i]); + } else if (arg == "--yarn-beta-slow") { if (++i >= argc) { invalid_param = true; break; } - params.rope_beta_slow = std::stof(argv[i]); + params.yarn_beta_slow = std::stof(argv[i]); } else if (arg == "--memory-f32") { params.memory_f16 = false; } else if (arg == "--top-p") { @@ -671,13 +681,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --cfg-negative-prompt-file FNAME\n"); printf(" negative prompt file to use for guidance. 
(default: empty)\n"); printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); - printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n"); + printf(" --rope-scaling {none,linear,yarn}\n"); + printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); + printf(" --rope-scale N RoPE context scaling factor, inverse of --rope-freq-scale\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n"); - printf(" --rope-ext-factor N RoPE extrapolation mix factor (default: %.1f)\n", params.rope_ext_factor); - printf(" --rope-attn-factor N RoPE magnitude scaling factor (default: %.1f)\n", params.rope_attn_factor); - printf(" --rope-beta-fast N RoPE low correction dim (default: %.1f)\n", params.rope_beta_fast); - printf(" --rope-beta-slow N RoPE high correction dim (default: %.1f)\n", params.rope_beta_slow); + printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); + printf(" --yarn-ext-factor N YaRN extrapolation mix factor (default: %.1f)\n", params.yarn_ext_factor); + printf(" --yarn-attn-factor N YaRN magnitude scaling factor (default: %.1f)\n", params.yarn_attn_factor); + printf(" --yarn-beta-fast N YaRN low correction dim (default: %.1f)\n", params.yarn_beta_fast); + printf(" --yarn-beta-slow N YaRN high correction dim (default: %.1f)\n", params.yarn_beta_slow); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); @@ -758,22 +770,23 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param if (params.n_gpu_layers != -1) { lparams.n_gpu_layers = 
params.n_gpu_layers; } - lparams.main_gpu = params.main_gpu; - lparams.tensor_split = params.tensor_split; - lparams.low_vram = params.low_vram; - lparams.mul_mat_q = params.mul_mat_q; - lparams.seed = params.seed; - lparams.f16_kv = params.memory_f16; - lparams.use_mmap = params.use_mmap; - lparams.use_mlock = params.use_mlock; - lparams.logits_all = params.perplexity; - lparams.embedding = params.embedding; - lparams.rope_freq_base = params.rope_freq_base; - lparams.rope_freq_scale = params.rope_freq_scale; - lparams.rope_ext_factor = params.rope_ext_factor; - lparams.rope_attn_factor = params.rope_attn_factor; - lparams.rope_beta_fast = params.rope_beta_fast; - lparams.rope_beta_slow = params.rope_beta_slow; + lparams.main_gpu = params.main_gpu; + lparams.tensor_split = params.tensor_split; + lparams.low_vram = params.low_vram; + lparams.mul_mat_q = params.mul_mat_q; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.use_mmap = params.use_mmap; + lparams.use_mlock = params.use_mlock; + lparams.logits_all = params.perplexity; + lparams.embedding = params.embedding; + lparams.rope_scaling_type = params.rope_scaling_type; + lparams.rope_freq_base = params.rope_freq_base; + lparams.rope_freq_scale = params.rope_freq_scale; + lparams.yarn_ext_factor = params.yarn_ext_factor; + lparams.yarn_attn_factor = params.yarn_attn_factor; + lparams.yarn_beta_fast = params.yarn_beta_fast; + lparams.yarn_beta_slow = params.yarn_beta_slow; return lparams; } diff --git a/common/common.h b/common/common.h index 0b45b4278d73c..a1e7da128c113 100644 --- a/common/common.h +++ b/common/common.h @@ -50,10 +50,12 @@ struct gpt_params { int32_t n_beams = 0; // if non-zero then use beam search of given width. 
float rope_freq_base = 10000.0f; // RoPE base frequency float rope_freq_scale = 1.0f; // RoPE frequency scaling factor - float rope_ext_factor = 0.0f; // RoPE extrapolation mix factor - float rope_attn_factor = 1.0f; // RoPE magnitude scaling factor - float rope_beta_fast = 32.0f; // RoPE low correction dim - float rope_beta_slow = 1.0f; // RoPE high correction dim + float yarn_ext_factor = 0.0f; // YaRN extrapolation mix factor + float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor + float yarn_beta_fast = 32.0f; // YaRN low correction dim + float yarn_beta_slow = 1.0f; // YaRN high correction dim + + llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // sampling parameters int32_t top_k = 40; // <= 0 to use vocab size diff --git a/convert.py b/convert.py index 649624cff7e6d..f08cf01c5786e 100755 --- a/convert.py +++ b/convert.py @@ -152,8 +152,11 @@ class Params: n_head_kv: int f_norm_eps: float + rope_scaling_type: gguf.RopeScalingType | None = None f_rope_freq_base: float | None = None f_rope_scale: float | None = None + n_orig_ctx: int | None = None + rope_finetuned: bool | None = None ftype: GGMLFileType | None = None @@ -199,11 +202,20 @@ def guessed(model: LazyModel) -> Params: def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) + rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None rope_scaling = config.get("rope_scaling") - if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear": - f_rope_scale = config["rope_scaling"].get("factor") - else: - f_rope_scale = None + + if rope_scaling is not None and typ := rope_scaling.get("type"): + rope_factor = rope_scaling.get("factor") + f_rope_scale = rope_factor + if typ == "linear": + rope_scaling_type = RopeScalingType.LINEAR + elif typ == "yarn": + rope_scaling_type = RopeScalingType.YARN + n_orig_ctx = rope_scaling['original_max_position_embeddings'] + rope_finetuned = 
rope_scaling['finetuned'] + else: + raise NotImplementedError(f'Unknown rope scaling type: {typ}') if "max_sequence_length" in config: n_ctx = config["max_sequence_length"] @@ -214,16 +226,18 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: "Suggestion: provide 'config.json' of the model in the same directory containing model files.") return Params( - n_vocab = config["vocab_size"], - n_embd = config["hidden_size"], - n_layer = config["num_hidden_layers"], - n_ctx = n_ctx, - n_ff = config["intermediate_size"], - n_head = config["num_attention_heads"], - n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head, - f_norm_eps = config["rms_norm_eps"], - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None, - f_rope_scale = f_rope_scale, + n_vocab = config["vocab_size"], + n_embd = config["hidden_size"], + n_layer = config["num_hidden_layers"], + n_ctx = n_ctx, + n_ff = config["intermediate_size"], + n_head = config["num_attention_heads"], + n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head, + f_norm_eps = config["rms_norm_eps"], + f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None, + f_rope_scale = f_rope_scale, + n_orig_ctx = n_orig_ctx, + rope_finetuned = rope_finetuned, ) # LLaMA v2 70B params.json @@ -819,8 +833,15 @@ def add_meta_arch(self, params: Params) -> None: if params.f_rope_freq_base is not None: self.gguf.add_rope_freq_base(params.f_rope_freq_base) - if params.f_rope_scale is not None: - self.gguf.add_rope_scale_linear(params.f_rope_scale) + if params.rope_scaling_type: + self.gguf.add_rope_scaling_type(params.rope_scaling_type) + self.gguf.add_rope_scaling_factor(params.f_rope_scale) + + if params.n_orig_ctx is not None: + self.gguf.add_rope_original_context_length(params.n_orig_ctx) + + if params.rope_finetuned is not None: + self.gguf.add_rope_finetuned(params.rope_finetuned) if params.ftype is not None: 
self.gguf.add_file_type(params.ftype) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0fb4e2c324b55..3a1c55b1c939c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -701,12 +701,14 @@ static void server_print_usage(const char *argv0, const gpt_params &params, printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" --rope-scaling {none,linear,yarn}\n"); + printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); - printf(" --rope-ext-factor N RoPE extrapolation mix factor (default: %.1f)\n", params.rope_ext_factor); - printf(" --rope-attn-factor N RoPE magnitude scaling factor (default: %.1f)\n", params.rope_attn_factor); - printf(" --rope-beta-fast N RoPE low correction dim (default: %.1f)\n", params.rope_beta_fast); - printf(" --rope-beta-slow N RoPE high correction dim (default: %.1f)\n", params.rope_beta_slow); + printf(" --yarn-ext-factor N YaRN extrapolation mix factor (default: %.1f)\n", params.yarn_ext_factor); + printf(" --yarn-attn-factor N YaRN magnitude scaling factor (default: %.1f)\n", params.yarn_attn_factor); + printf(" --yarn-beta-fast N YaRN low correction dim (default: %.1f)\n", params.yarn_beta_fast); + printf(" --yarn-beta-slow N YaRN high correction dim (default: %.1f)\n", params.yarn_beta_slow); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no
measurable increase in quality\n"); @@ -824,6 +826,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_ctx = std::stoi(argv[i]); } + else if (arg == "--rope-scaling") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + else { invalid_param = true; break; } + } else if (arg == "--rope-freq-base") { if (++i >= argc) @@ -842,37 +857,37 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.rope_freq_scale = std::stof(argv[i]); } - else if (arg == "--rope-ext-factor") + else if (arg == "--yarn-ext-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_ext_factor = std::stof(argv[i]); + params.yarn_ext_factor = std::stof(argv[i]); } - else if (arg == "--rope-attn-factor") + else if (arg == "--yarn-attn-factor") { if (++i >= argc) { invalid_param = true; break; } - params.rope_attn_factor = std::stof(argv[i]); + params.yarn_attn_factor = std::stof(argv[i]); } - else if (arg == "--rope-beta-fast") + else if (arg == "--yarn-beta-fast") { if (++i >= argc) { invalid_param = true; break; } - params.rope_beta_fast = std::stof(argv[i]); + params.yarn_beta_fast = std::stof(argv[i]); } - else if (arg == "--rope-beta-slow") + else if (arg == "--yarn-beta-slow") { if (++i >= argc) { invalid_param = true; break; } - params.rope_beta_slow = std::stof(argv[i]); + params.yarn_beta_slow = std::stof(argv[i]); } else if (arg == "--memory-f32" || arg == "--memory_f32") { diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index e0e0dbcbbe840..742ce214356e2 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -52,9 +52,12 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = 
"{arch}.attention.layer_norm_rms_epsilon" # RoPE -KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count" -KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base" -KEY_ROPE_SCALE_LINEAR = "{arch}.rope.scale_linear" +KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count" +KEY_ROPE_FREQ_BASE = "{arch}.rope.freq_base" +KEY_ROPE_SCALING_TYPE = "{arch}.rope.scaling.type" +KEY_ROPE_SCALING_FACTOR = "{arch}.rope.scaling.factor" +KEY_ROPE_SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" +KEY_ROPE_SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" # tokenization KEY_TOKENIZER_MODEL = "tokenizer.ggml.model" @@ -407,6 +410,11 @@ class TokenType(IntEnum): UNUSED = 5 BYTE = 6 +class RopeScalingType(IntEnum): + NONE = 0 + LINEAR = 1 + YARN = 2 + # # implementation # @@ -760,8 +768,17 @@ def add_rope_dimension_count(self, count: int): def add_rope_freq_base(self, value: float): self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value) - def add_rope_scale_linear(self, value: float): - self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value) + def add_rope_scaling_type(self, value: RopeScalingType): + self.add_uint8(KEY_ROPE_SCALING_TYPE.format(arch=self.arch), int(value)) + + def add_rope_scaling_factor(self, value: float): + self.add_float32(KEY_ROPE_SCALING_FACTOR.format(arch=self.arch), value) + + def add_rope_scaling_orig_ctx_len(self, value: int): + self.add_uint32(KEY_ROPE_SCALING_ORIG_CTX_LEN.format(arch=self.arch), value) + + def add_rope_scaling_finetuned(self, value: bool): + self.add_bool(KEY_ROPE_SCALING_FINETUNED.format(arch=self.arch), value) def add_tokenizer_model(self, model: str): self.add_string(KEY_TOKENIZER_MODEL, model) diff --git a/llama.cpp b/llama.cpp index 87aea24682087..cd545b254d1eb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -204,7 +204,10 @@ enum llm_kv { LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, - LLM_KV_ROPE_SCALE_LINEAR, + LLM_KV_ROPE_SCALING_TYPE, + LLM_KV_ROPE_SCALING_FACTOR, + 
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, + LLM_KV_ROPE_SCALING_FINETUNED, LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_LIST, @@ -246,9 +249,12 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = { { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, - { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, - { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, + { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, + { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -943,12 +949,17 @@ struct llama_hparams { float f_norm_eps; float f_norm_rms_eps; - float rope_freq_base; - float rope_freq_scale; - float rope_ext_factor; - float rope_attn_factor; - float rope_beta_fast; - float rope_beta_slow; + float rope_freq_base; + float rope_freq_scale; + bool rope_finetuned; + uint32_t n_yarn_orig_ctx; + + // These hyperparameters are not exposed in GGUF, because all + // existing YaRN models use the same values for them.
+ float yarn_ext_factor; + float yarn_attn_factor; + float yarn_beta_fast; + float yarn_beta_slow; bool operator!=(const llama_hparams & other) const { return static_cast(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT @@ -1660,10 +1671,10 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const hparams.n_ctx = params.n_ctx; hparams.rope_freq_base = params.rope_freq_base; hparams.rope_freq_scale = params.rope_freq_scale; - hparams.rope_ext_factor = params.rope_ext_factor; - hparams.rope_attn_factor = params.rope_attn_factor; - hparams.rope_beta_fast = params.rope_beta_fast; - hparams.rope_beta_slow = params.rope_beta_slow; + hparams.yarn_ext_factor = params.yarn_ext_factor; + hparams.yarn_attn_factor = params.yarn_attn_factor; + hparams.yarn_beta_fast = params.yarn_beta_fast; + hparams.yarn_beta_slow = params.yarn_beta_slow; // get general kv GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME)); @@ -1680,6 +1691,14 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const hparams.n_head_kv = hparams.n_head; GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); + hparams.rope_finetuned = false; + GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false, + kv(LLM_KV_ROPE_SCALING_FINETUNED)); + + hparams.n_yarn_orig_ctx = 0; + GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, + kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN)); + // rope_freq_base (optional) if (hparams.rope_freq_base == 0.0f) { float rope_freq_base = 10000.0f; @@ -1687,13 +1706,28 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const hparams.rope_freq_base = rope_freq_base; } + llama_rope_scaling_type rope_scaling_type = params.rope_scaling_type; + + if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { + uint8_t type = LLAMA_ROPE_SCALING_LINEAR; + 
GGUF_GET_KEY(ctx, type, gguf_get_val_u8, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE)); + rope_scaling_type = llama_rope_scaling_type(type); + } + GGML_ASSERT(rope_scaling_type >= 0 && rope_scaling_type <= LLAMA_ROPE_SCALING_MAX_VALUE); + // rope_freq_scale (inverse of the kv) is optional - if (hparams.rope_freq_scale == 0.0f) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { + hparams.rope_freq_scale = 1.0f; + } else if (hparams.rope_freq_scale == 0.0f) { float ropescale = 1.0f; - GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); hparams.rope_freq_scale = 1.0f/ropescale; } + if (rope_scaling_type == LLAMA_ROPE_SCALING_YARN) { + hparams.yarn_ext_factor = 1.0f; // enable YaRN + } + // sanity check for n_rot (optional) { hparams.n_rot = hparams.n_embd / hparams.n_head; @@ -1902,6 +1936,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); + LLAMA_LOG_INFO("%s: YaRN scaling = %g\n", __func__, hparams.yarn_ext_factor); + LLAMA_LOG_INFO("%s: YaRN orig ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); + LLAMA_LOG_INFO("%s: YaRN beta_fast = %f\n", __func__, hparams.yarn_beta_fast); + LLAMA_LOG_INFO("%s: YaRN beta_slow = %f\n", __func__, hparams.yarn_beta_slow); + LLAMA_LOG_INFO("%s: RoPE finetuned = %s\n", __func__, hparams.rope_finetuned ? 
"yes" : "no"); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); @@ -2444,10 +2483,10 @@ static struct ggml_cgraph * llm_build_llama( const float freq_base = hparams.rope_freq_base; const float freq_scale = hparams.rope_freq_scale; - const float ext_factor = hparams.rope_ext_factor; - const float attn_factor = hparams.rope_attn_factor; - const float beta_fast = hparams.rope_beta_fast; - const float beta_slow = hparams.rope_beta_slow; + const float ext_factor = hparams.yarn_ext_factor; + const float attn_factor = hparams.yarn_attn_factor; + const float beta_fast = hparams.yarn_beta_fast; + const float beta_slow = hparams.yarn_beta_slow; const float norm_rms_eps = hparams.f_norm_eps; const int n_gpu_layers = model.n_gpu_layers; @@ -2561,15 +2600,13 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * Kcur = ggml_rope_custom_inplace( ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, - freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); + freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); struct ggml_tensor * Qcur = ggml_rope_custom_inplace( ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, - freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); + freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -2786,6 +2823,10 @@ static struct ggml_cgraph * llm_build_baichaun( const float freq_base = hparams.rope_freq_base; const float freq_scale = hparams.rope_freq_scale; + const float ext_factor = hparams.yarn_ext_factor; + const float attn_factor = hparams.yarn_attn_factor; + const float beta_fast = 
hparams.yarn_beta_fast; + const float beta_slow = hparams.yarn_beta_slow; const float norm_rms_eps = hparams.f_norm_rms_eps; const int n_gpu_layers = model.n_gpu_layers; @@ -2901,8 +2942,16 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * Qcur; switch (model.type) { case MODEL_7B: - Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom_inplace( + ctx0, + ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), + n_past, n_embd_head, 0, 0, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + Qcur = ggml_rope_custom_inplace( + ctx0, + ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), + n_past, n_embd_head, 0, 0, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); break; case MODEL_13B: Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N); @@ -3146,10 +3195,10 @@ static struct ggml_cgraph * llm_build_falcon( const float freq_base = hparams.rope_freq_base; const float freq_scale = hparams.rope_freq_scale; - const float ext_factor = hparams.rope_ext_factor; - const float attn_factor = hparams.rope_attn_factor; - const float beta_fast = hparams.rope_beta_fast; - const float beta_slow = hparams.rope_beta_slow; + const float ext_factor = hparams.yarn_ext_factor; + const float attn_factor = hparams.yarn_attn_factor; + const float beta_fast = hparams.yarn_beta_fast; + const float beta_slow = hparams.yarn_beta_slow; const float norm_eps = hparams.f_norm_eps; const int n_gpu_layers = model.n_gpu_layers; @@ -3302,11 +3351,13 @@ static struct ggml_cgraph * llm_build_falcon( // using mode = 2 for neox mode struct ggml_tensor * Qcur = ggml_rope_custom_inplace( - ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale, ext_factor, attn_factor, 
beta_fast, beta_slow + ctx0, tmpq, n_past, n_embd_head, 2, 0, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); offload_func_kq(Qcur); struct ggml_tensor * Kcur = ggml_rope_custom_inplace( - ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ctx0, tmpk, n_past, n_embd_head, 2, 0, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); offload_func_kq(Kcur); @@ -6186,10 +6237,11 @@ struct llama_context_params llama_context_default_params() { /*.tensor_split =*/ nullptr, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, - /*.rope_ext_factor =*/ 0.0f, - /*.rope_attn_factor =*/ 1.0f, - /*.rope_beta_fast =*/ 32.0f, - /*.rope_beta_slow =*/ 1.0f, + /*.yarn_ext_factor =*/ 0.0f, + /*.yarn_attn_factor =*/ 1.0f, + /*.yarn_beta_fast =*/ 32.0f, + /*.yarn_beta_slow =*/ 1.0f, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, diff --git a/llama.h b/llama.h index 762362bc6b51e..5d69997bf5a4b 100644 --- a/llama.h +++ b/llama.h @@ -108,6 +108,14 @@ extern "C" { LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; + enum llama_rope_scaling_type: int8_t { + LLAMA_ROPE_SCALING_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_NONE = 0, + LLAMA_ROPE_SCALING_LINEAR = 1, + LLAMA_ROPE_SCALING_YARN = 2, + LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, + }; + typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -134,10 +142,12 @@ extern "C" { // ref: https://github.com/ggerganov/llama.cpp/pull/2054 float rope_freq_base; // RoPE base frequency float rope_freq_scale; // RoPE frequency scaling factor - float rope_ext_factor; // RoPE extrapolation mix factor - float rope_attn_factor; // RoPE magnitude scaling factor - float rope_beta_fast; // RoPE low correction dim - float rope_beta_slow; // RoPE high correction dim + 
float yarn_ext_factor; // YaRN extrapolation mix factor + float yarn_attn_factor; // YaRN magnitude scaling factor + float yarn_beta_fast; // YaRN low correction dim + float yarn_beta_slow; // YaRN high correction dim + + llama_rope_scaling_type rope_scaling_type; // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; @@ -145,14 +155,14 @@ extern "C" { void * progress_callback_user_data; // Keep the booleans together to avoid misalignment during copy-by-value. - bool low_vram; // if true, reduce VRAM usage at the cost of performance - bool mul_mat_q; // if true, use experimental mul_mat_q kernels - bool f16_kv; // use fp16 for KV cache - bool logits_all; // the llama_eval() call computes all logits, not just the last one - bool vocab_only; // only load the vocabulary, no weights - bool use_mmap; // use mmap if possible - bool use_mlock; // force system to keep model in RAM - bool embedding; // embedding mode only + bool low_vram; // if true, reduce VRAM usage at the cost of performance + bool mul_mat_q; // if true, use experimental mul_mat_q kernels + bool f16_kv; // use fp16 for KV cache + bool logits_all; // the llama_eval() call computes all logits, not just the last one + bool vocab_only; // only load the vocabulary, no weights + bool use_mmap; // use mmap if possible + bool use_mlock; // force system to keep model in RAM + bool embedding; // embedding mode only }; // Signature for logging events From 56abb9a406ef34a995c56be838ebe6529cd50438 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Wed, 20 Sep 2023 22:25:21 -0400 Subject: [PATCH 11/28] fix convert scripts --- convert-baichuan-hf-to-gguf.py | 3 ++- convert.py | 25 +++++++++++++------------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py index 8bd34dc440769..ef68d5819fef2 100755 --- a/convert-baichuan-hf-to-gguf.py +++ b/convert-baichuan-hf-to-gguf.py @@ -154,7 +154,8 @@ 
def parse_args() -> argparse.Namespace: if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: if "type" in hparams["rope_scaling"]: if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) + gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) # TOKENIZATION diff --git a/convert.py b/convert.py index f08cf01c5786e..e098e9eef38ee 100755 --- a/convert.py +++ b/convert.py @@ -205,13 +205,13 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None rope_scaling = config.get("rope_scaling") - if rope_scaling is not None and typ := rope_scaling.get("type"): + if rope_scaling is not None and (typ := rope_scaling.get("type")): rope_factor = rope_scaling.get("factor") f_rope_scale = rope_factor if typ == "linear": - rope_scaling_type = RopeScalingType.LINEAR + rope_scaling_type = gguf.RopeScalingType.LINEAR elif typ == "yarn": - rope_scaling_type = RopeScalingType.YARN + rope_scaling_type = gguf.RopeScalingType.YARN n_orig_ctx = rope_scaling['original_max_position_embeddings'] rope_finetuned = rope_scaling['finetuned'] else: @@ -231,10 +231,10 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: n_layer = config["num_hidden_layers"], n_ctx = n_ctx, n_ff = config["intermediate_size"], - n_head = config["num_attention_heads"], - n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head, + n_head = (n_head := config["num_attention_heads"]), + n_head_kv = config.get("num_key_value_heads", n_head), f_norm_eps = config["rms_norm_eps"], - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None, + f_rope_freq_base = config.get("rope_theta"), f_rope_scale = f_rope_scale, n_orig_ctx = n_orig_ctx, rope_finetuned = rope_finetuned, 
@@ -247,7 +247,7 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) # hack to determine LLaMA v1 vs v2 vs CodeLlama - if f_rope_freq_base == 1000000: + if config.get("rope_theta") == 1000000: # CodeLlama n_ctx = 16384 elif config["norm_eps"] == 1e-05: @@ -263,10 +263,10 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: n_layer = config["n_layers"], n_ctx = n_ctx, n_ff = model["layers.0.feed_forward.w1.weight"].shape[0], - n_head = config["n_heads"], - n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head, + n_head = (n_head := config["n_heads"]), + n_head_kv = config.get("n_kv_heads", n_head), f_norm_eps = config["norm_eps"], - f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None, + f_rope_freq_base = config.get("rope_theta"), ) @staticmethod @@ -834,14 +834,15 @@ def add_meta_arch(self, params: Params) -> None: self.gguf.add_rope_freq_base(params.f_rope_freq_base) if params.rope_scaling_type: + assert params.f_rope_scale is not None self.gguf.add_rope_scaling_type(params.rope_scaling_type) self.gguf.add_rope_scaling_factor(params.f_rope_scale) if params.n_orig_ctx is not None: - self.gguf.add_rope_original_context_length(params.n_orig_ctx) + self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx) if params.rope_finetuned is not None: - self.gguf.add_rope_finetuned(params.rope_finetuned) + self.gguf.add_rope_scaling_finetuned(params.rope_finetuned) if params.ftype is not None: self.gguf.add_file_type(params.ftype) From 43eaf06a2f27e9a8dc65109a03b6025441c22f9a Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Wed, 20 Sep 2023 23:29:08 -0400 Subject: [PATCH 12/28] llama : fix C compatibility --- common/common.h | 3 +-- llama.cpp | 6 +++--- llama.h | 27 +++++++++++++-------------- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/common/common.h b/common/common.h index a1e7da128c113..e49db6fedcdaf 100644 --- 
a/common/common.h +++ b/common/common.h @@ -54,8 +54,7 @@ struct gpt_params { float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim - - llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; + int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // sampling parameters int32_t top_k = 40; // <= 0 to use vocab size diff --git a/llama.cpp b/llama.cpp index cd545b254d1eb..56c511b594333 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1706,12 +1706,12 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const hparams.rope_freq_base = rope_freq_base; } - llama_rope_scaling_type rope_scaling_type = params.rope_scaling_type; + int8_t rope_scaling_type = params.rope_scaling_type; if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { uint8_t type = LLAMA_ROPE_SCALING_LINEAR; GGUF_GET_KEY(ctx, type, gguf_get_val_u8, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE)); - rope_scaling_type = llama_rope_scaling_type(type); + rope_scaling_type = int8_t(type); } GGML_ASSERT(rope_scaling_type >= 0 && rope_scaling_type <= LLAMA_ROPE_SCALING_MAX_VALUE); @@ -6234,6 +6234,7 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_gpu_layers =*/ 0, /*.main_gpu =*/ 0, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, /*.tensor_split =*/ nullptr, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, @@ -6241,7 +6242,6 @@ struct llama_context_params llama_context_default_params() { /*.yarn_attn_factor =*/ 1.0f, /*.yarn_beta_fast =*/ 32.0f, /*.yarn_beta_slow =*/ 1.0f, - /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, diff --git a/llama.h b/llama.h index 5d69997bf5a4b..6528254cb9f5d 100644 --- a/llama.h +++ b/llama.h @@ -108,7 +108,7 @@ extern "C" { 
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; - enum llama_rope_scaling_type: int8_t { + enum llama_rope_scaling_type { LLAMA_ROPE_SCALING_UNSPECIFIED = -1, LLAMA_ROPE_SCALING_NONE = 0, LLAMA_ROPE_SCALING_LINEAR = 1, @@ -131,23 +131,22 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); struct llama_context_params { - uint32_t seed; // RNG seed, -1 for random - int32_t n_ctx; // text context - int32_t n_batch; // prompt processing batch size - int32_t n_gpu_layers; // number of layers to store in VRAM - int32_t main_gpu; // the GPU that is used for scratch and small tensors + uint32_t seed; // RNG seed, -1 for random + int32_t n_ctx; // text context + int32_t n_batch; // prompt processing batch size + int32_t n_gpu_layers; // number of layers to store in VRAM + int32_t main_gpu; // the GPU that is used for scratch and small tensors + int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency - float rope_freq_scale; // RoPE frequency scaling factor - float yarn_ext_factor; // YaRN extrapolation mix factor - float yarn_attn_factor; // YaRN magnitude scaling factor - float yarn_beta_fast; // YaRN low correction dim - float yarn_beta_slow; // YaRN high correction dim - - llama_rope_scaling_type rope_scaling_type; + float rope_freq_base; // RoPE base frequency + float rope_freq_scale; // RoPE frequency scaling factor + float yarn_ext_factor; // YaRN extrapolation mix factor + float yarn_attn_factor; // YaRN magnitude scaling factor + float yarn_beta_fast; // YaRN low correction dim + float yarn_beta_slow; // YaRN high correction dim // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; From fe788c45c8d5a2c6e6c2f0ec04978419555c65b2 Mon 
Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 21 Sep 2023 00:01:48 -0400 Subject: [PATCH 13/28] don't hardcode max_pos_emb --- .../train-text-from-scratch.cpp | 2 +- ggml-cuda.cu | 27 ++--- ggml-metal.m | 19 ++-- ggml-metal.metal | 17 +-- ggml.c | 100 ++++++++++-------- ggml.h | 7 +- llama.cpp | 67 +++++++----- 7 files changed, 131 insertions(+), 108 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 8a5ad82bf3441..36415398e6fee 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -687,7 +687,7 @@ struct ggml_tensor * llama_build_train_graphs( const int rope_mode = 0; return ggml_rope_custom( - ctx, t, n_past, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f + ctx, t, n_past, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f ); }; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 394fd81fa3cd3..bd788ce4da4e1 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4386,7 +4386,7 @@ static __device__ void rope_yarn( // rope == RoPE == rotary positional embedding static __global__ void rope_f32( - float * x, float * dst, int ncols, float freq_scale, float ext_factor, float attn_factor, float theta_scale, + const float * x, float * dst, int ncols, float freq_scale, float ext_factor, float attn_factor, float theta_scale, float p0, int p_delta_rows, rope_corr_dims corr_dims ) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); @@ -5396,7 +5396,7 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons } static void rope_f32_cuda( - float * x, float * dst, int ncols, int nrows, float freq_scale, float ext_factor, float attn_factor, + const float * x, float * dst, int ncols, int nrows, float freq_scale, float ext_factor, float attn_factor, float theta_scale, float p0, int p_delta_rows, 
rope_corr_dims corr_dims, cudaStream_t stream ) { GGML_ASSERT(ncols % 2 == 0); @@ -6109,19 +6109,20 @@ inline void ggml_cuda_op_rope( const int64_t ne01 = src0->ne[1]; const int64_t nrows = ggml_nrows(src0); - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; // RoPE alteration for extended context float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); const float p0 = (mode & 1) == 0 ? 
n_past : 0; @@ -6137,7 +6138,7 @@ inline void ggml_cuda_op_rope( rope_neox_f32_cuda(src0_dd, dst_dd, ne00, nrows, p0, freq_scale, ne01, theta_scale, main_stream); } else { rope_corr_dims corr_dims; - ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims.v); + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); rope_f32_cuda( src0_dd, dst_dd, ne00, nrows, freq_scale, ext_factor, attn_factor, theta_scale, p0, ne01, corr_dims, diff --git a/ggml-metal.m b/ggml-metal.m index 06d97695bc0b2..c0607e844d146 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1176,17 +1176,18 @@ void ggml_metal_graph_compute( } break; case GGML_OP_ROPE: { - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[3]; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); [encoder 
setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; diff --git a/ggml-metal.metal b/ggml-metal.metal index c5e0ee8a042f9..ddf81fb7b9e04 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -832,18 +832,18 @@ static void rope_yarn( *sin_theta = sinf(theta) * mscale; } -constant float max_pos_emb = 2048; - // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get // `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -static float rope_yarn_corr_factor(const int n_dims, const float n_rot, const float base) { - return n_dims * log(max_pos_emb / (n_rot * 2 * M_PI_F)) / (2 * log(base)); +static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base)); } -static void rope_yarn_corr_dims(int n_dims, const float freq_base, float beta_fast, float beta_slow, float dims[2]) { +static void rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { // start and end correction dims - dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, beta_fast, freq_base))); - dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, beta_slow, freq_base))); + dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base))); } kernel void kernel_rope( @@ -868,6 +868,7 @@ kernel void kernel_rope( constant int & n_past, constant int & n_dims, constant int & mode, + constant int & n_orig_ctx, constant float & freq_base, constant float & freq_scale, constant float & ext_factor, @@ -884,7 +885,7 @@ kernel void kernel_rope( const bool is_neox = mode & 2; float corr_dims[2]; - rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims); + rope_yarn_corr_dims(n_dims, n_orig_ctx, 
freq_base, beta_fast, beta_slow, corr_dims); const int64_t p = (mode & 1) == 0 ? n_past + i2 : i2; diff --git a/ggml.c b/ggml.c index 53137924d0014..56b9fdd291aa0 100644 --- a/ggml.c +++ b/ggml.c @@ -6973,6 +6973,7 @@ static struct ggml_tensor * ggml_rope_impl( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, float ext_factor, @@ -6991,15 +6992,15 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[12] = { n_past, n_dims, mode, n_ctx }; - memcpy(params + 4, &freq_base, sizeof(float)); - memcpy(params + 5, &freq_scale, sizeof(float)); - memcpy(params + 6, &ext_factor, sizeof(float)); - memcpy(params + 7, &attn_factor, sizeof(float)); - memcpy(params + 8, &beta_fast, sizeof(float)); - memcpy(params + 9, &beta_slow, sizeof(float)); - memcpy(params + 10, &xpos_base, sizeof(float)); - memcpy(params + 11, &xpos_down, sizeof(bool)); + int32_t params[13] = { n_past, n_dims, mode, n_ctx, n_orig_ctx }; + memcpy(params + 5, &freq_base, sizeof(float)); + memcpy(params + 6, &freq_scale, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); + memcpy(params + 8, &attn_factor, sizeof(float)); + memcpy(params + 9, &beta_fast, sizeof(float)); + memcpy(params + 10, &beta_slow, sizeof(float)); + memcpy(params + 11, &xpos_base, sizeof(float)); + memcpy(params + 12, &xpos_down, sizeof(bool)); ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; @@ -7017,7 +7018,7 @@ struct ggml_tensor * ggml_rope( int mode, int n_ctx) { return ggml_rope_impl( - ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false + ctx, a, n_past, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false ); } @@ -7029,7 +7030,7 @@ struct ggml_tensor * ggml_rope_inplace( int mode, int n_ctx) { return ggml_rope_impl( - ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, 
1.0f, 0.0f, 0.0f, 0.0f, false, true + ctx, a, n_past, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true ); } @@ -7040,6 +7041,7 @@ struct ggml_tensor * ggml_rope_custom( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, float ext_factor, @@ -7047,8 +7049,8 @@ struct ggml_tensor * ggml_rope_custom( float beta_fast, float beta_slow) { return ggml_rope_impl( - ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, - false, false + ctx, a, n_past, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false ); } @@ -7059,6 +7061,7 @@ struct ggml_tensor * ggml_rope_custom_inplace( int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, float ext_factor, @@ -7066,8 +7069,8 @@ struct ggml_tensor * ggml_rope_custom_inplace( float beta_fast, float beta_slow) { return ggml_rope_impl( - ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, - false, true + ctx, a, n_past, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true ); } @@ -7078,7 +7081,7 @@ struct ggml_tensor * ggml_rope_xpos_inplace( int n_dims, float base, bool down) { - return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true); } // ggml_rope_back @@ -12675,15 +12678,16 @@ static void rope_yarn( // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get // `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -static float ggml_rope_yarn_corr_dim(const int n_dims, const float n_rot, const float base) { - static const float max_pos_emb = 2048; - return 
n_dims * logf(max_pos_emb / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); } -void ggml_rope_yarn_corr_dims(int n_dims, const float freq_base, float beta_fast, float beta_slow, float dims[2]) { +void ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { // start and end correction dims - dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, beta_fast, freq_base))); - dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, beta_slow, freq_base))); + dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base))); } static void ggml_compute_forward_rope_f32( @@ -12701,18 +12705,20 @@ static void ggml_compute_forward_rope_f32( float xpos_base; bool xpos_down; - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&xpos_base, (int32_t *) dst->op_params + 10, sizeof(float)); - memcpy(&xpos_down, (int32_t *) dst->op_params + 11, sizeof(bool)); + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int 
n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool)); assert(n_past >= 0); @@ -12743,7 +12749,7 @@ static void ggml_compute_forward_rope_f32( const float theta_scale = powf(freq_base, -2.0f/n_dims); float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims); + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12844,16 +12850,17 @@ static void ggml_compute_forward_rope_f16( float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float)); + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + 
const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); assert(n_past >= 0); @@ -12884,7 +12891,7 @@ static void ggml_compute_forward_rope_f16( const float theta_scale = powf(freq_base, -2.0f/n_dims); float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, freq_base, beta_fast, beta_slow, corr_dims); + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -16641,6 +16648,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor n_past, n_dims, mode, + 0, n_ctx, freq_base, freq_scale, diff --git a/ggml.h b/ggml.h index 5078fb7b5c4b7..26f7cf024f8ab 100644 --- a/ggml.h +++ b/ggml.h @@ -219,7 +219,7 @@ #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 #define GGML_MAX_NAME 64 -#define GGML_MAX_OP_PARAMS 48 +#define GGML_MAX_OP_PARAMS 64 #define GGML_DEFAULT_N_THREADS 4 #if UINTPTR_MAX == 0xFFFFFFFF @@ -1248,6 +1248,7 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, float ext_factor, @@ -1263,6 +1264,7 @@ extern "C" { int n_dims, int mode, int n_ctx, + int n_orig_ctx, float freq_base, float freq_scale, float ext_factor, @@ -1271,7 +1273,8 @@ extern "C" { float beta_slow); // compute correction dims for YaRN RoPE scaling - void ggml_rope_yarn_corr_dims(int n_dims, const float freq_base, float beta_fast, float beta_slow, float dims[2]); + void ggml_rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, 
float beta_slow, float dims[2]); // xPos RoPE, in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( diff --git a/llama.cpp b/llama.cpp index 56c511b594333..7184c376c8a45 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2471,13 +2471,14 @@ static struct ggml_cgraph * llm_build_llama( GGML_ASSERT(!!kv_self.ctx); - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int32_t n_embd = hparams.n_embd; + const int32_t n_layer = hparams.n_layer; + const int32_t n_ctx = hparams.n_ctx; + const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_head = hparams.n_head; + const int32_t n_head_kv = hparams.n_head_kv; + const int32_t n_embd_head = hparams.n_embd_head(); + const int32_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -2599,14 +2600,18 @@ static struct ggml_cgraph * llm_build_llama( ggml_set_name(tmpq, "tmpq"); struct ggml_tensor * Kcur = ggml_rope_custom_inplace( - ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, - freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), + n_past, n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); struct ggml_tensor * Qcur = ggml_rope_custom_inplace( - ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, - freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), + n_past, n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + 
); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -2811,13 +2816,14 @@ static struct ggml_cgraph * llm_build_baichaun( GGML_ASSERT(!!kv_self.ctx); - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int32_t n_embd = hparams.n_embd; + const int32_t n_layer = hparams.n_layer; + const int32_t n_ctx = hparams.n_ctx; + const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_head = hparams.n_head; + const int32_t n_head_kv = hparams.n_head_kv; + const int32_t n_embd_head = hparams.n_embd_head(); + const int32_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -2945,12 +2951,14 @@ static struct ggml_cgraph * llm_build_baichaun( Kcur = ggml_rope_custom_inplace( ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), - n_past, n_embd_head, 0, 0, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + n_past, n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow ); Qcur = ggml_rope_custom_inplace( ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), - n_past, n_embd_head, 0, 0, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + n_past, n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow ); break; case MODEL_13B: @@ -3183,13 +3191,14 @@ static struct ggml_cgraph * llm_build_falcon( GGML_ASSERT(!!kv_self.ctx); - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = 
hparams.n_embd_gqa(); + const int32_t n_embd = hparams.n_embd; + const int32_t n_layer = hparams.n_layer; + const int32_t n_ctx = hparams.n_ctx; + const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_head = hparams.n_head; + const int32_t n_head_kv = hparams.n_head_kv; + const int32_t n_embd_head = hparams.n_embd_head(); + const int32_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3351,12 +3360,12 @@ static struct ggml_cgraph * llm_build_falcon( // using mode = 2 for neox mode struct ggml_tensor * Qcur = ggml_rope_custom_inplace( - ctx0, tmpq, n_past, n_embd_head, 2, 0, + ctx0, tmpq, n_past, n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); offload_func_kq(Qcur); struct ggml_tensor * Kcur = ggml_rope_custom_inplace( - ctx0, tmpk, n_past, n_embd_head, 2, 0, + ctx0, tmpk, n_past, n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); offload_func_kq(Kcur); From e0b120c3cae9e38f3a0b5eae26d43fda87c1c910 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 21 Sep 2023 15:00:08 -0400 Subject: [PATCH 14/28] address review comments --- common/common.cpp | 4 ++-- examples/server/server.cpp | 2 +- ggml.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index ca4b9c1cc8a16..3e3bdf18b348a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -683,9 +683,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); printf(" --rope-scaling {none,linear,yarn}\n"); printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); - printf(" --rope-scale N RoPE context scaling factor, inverse of --rope-freq-scale\n"); + printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); printf(" 
--rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); printf(" --yarn-ext-factor N YaRN extrapolation mix factor (default: %.1f)\n", params.yarn_ext_factor); printf(" --yarn-attn-factor N YaRN magnitude scaling factor (default: %.1f)\n", params.yarn_attn_factor); printf(" --yarn-beta-fast N YaRN low correction dim (default: %.1f)\n", params.yarn_beta_fast); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3a1c55b1c939c..e87bfd508fcdc 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -704,7 +704,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --rope-scaling {none,linear,yarn}\n"); printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); printf(" --yarn-ext-factor N YaRN extrapolation mix factor (default: %.1f)\n", params.yarn_ext_factor); printf(" --yarn-attn-factor N YaRN magnitude scaling factor (default: %.1f)\n", params.yarn_attn_factor); printf(" --yarn-beta-fast N YaRN low correction dim (default: %.1f)\n", params.yarn_beta_fast); diff --git a/ggml.c b/ggml.c index 56b9fdd291aa0..398b88cbe37d2 100644 --- a/ggml.c +++ b/ggml.c @@ -12650,7 +12650,7 @@ static void ggml_compute_forward_clamp( // ggml_compute_forward_rope -static inline float rope_yarn_ramp(const float low, const float high, const int i0) { +static float rope_yarn_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / 
MIN(0.001f, high - low); return 1 - MIN(1, MAX(0, y)); } From 19bb74e74c036e8eddfcfe101cfd8940f42df078 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 21 Sep 2023 15:10:39 -0400 Subject: [PATCH 15/28] restore backwards compatibility with *.rope.scale_linear --- llama.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 7184c376c8a45..d862541d1f80c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -204,6 +204,7 @@ enum llm_kv { LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_FREQ_BASE, + LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_FACTOR, LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, @@ -251,6 +252,7 @@ static std::map LLM_KV_NAMES = { { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, @@ -1719,9 +1721,12 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { hparams.rope_freq_scale = 1.0f; } else if (hparams.rope_freq_scale == 0.0f) { - float ropescale = 1.0f; + float ropescale = 0.0f; GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); - hparams.rope_freq_scale = 1.0f/ropescale; + if (ropescale == 0.0f) { // try the old key name + GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + } + hparams.rope_freq_scale = ropescale == 0.0f ? 
1.0f : 1.0f/ropescale; } if (rope_scaling_type == LLAMA_ROPE_SCALING_YARN) { From 4d5fe73449ae71f6163b0ec165cae5de56415efc Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 21 Sep 2023 15:41:06 -0400 Subject: [PATCH 16/28] better option descriptions in help --- common/common.cpp | 8 ++++---- examples/server/server.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 3e3bdf18b348a..6159df5654211 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -686,10 +686,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); - printf(" --yarn-ext-factor N YaRN extrapolation mix factor (default: %.1f)\n", params.yarn_ext_factor); - printf(" --yarn-attn-factor N YaRN magnitude scaling factor (default: %.1f)\n", params.yarn_attn_factor); - printf(" --yarn-beta-fast N YaRN low correction dim (default: %.1f)\n", params.yarn_beta_fast); - printf(" --yarn-beta-slow N YaRN high correction dim (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); + printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); + printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: 
disabled)\n"); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e87bfd508fcdc..33bc8c2b8c76f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -705,10 +705,10 @@ static void server_print_usage(const char *argv0, const gpt_params &params, printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); - printf(" --yarn-ext-factor N YaRN extrapolation mix factor (default: %.1f)\n", params.yarn_ext_factor); - printf(" --yarn-attn-factor N YaRN magnitude scaling factor (default: %.1f)\n", params.yarn_attn_factor); - printf(" --yarn-beta-fast N YaRN low correction dim (default: %.1f)\n", params.yarn_beta_fast); - printf(" --yarn-beta-slow N YaRN high correction dim (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); + printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); + printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); From 746641574a5597a3c0d4c11fcea05327e9c9556c Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sat, 7 Oct 2023 12:57:55 -0400 Subject: [PATCH 17/28] gguf : store scaling type as a string instead of an int --- gguf-py/gguf/gguf.py | 10 +++++----- llama.cpp | 22 +++++++++++++++++++--- 2 files changed, 24 insertions(+), 
8 deletions(-) diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 742ce214356e2..6bcb33701f71e 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -410,10 +410,10 @@ class TokenType(IntEnum): UNUSED = 5 BYTE = 6 -class RopeScalingType(IntEnum): - NONE = 0 - LINEAR = 1 - YARN = 2 +class RopeScalingType(Enum): + NONE = 'none' + LINEAR = 'linear' + YARN = 'yarn' # # implementation @@ -769,7 +769,7 @@ def add_rope_freq_base(self, value: float): self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value) def add_rope_scaling_type(self, value: RopeScalingType): - self.add_uint8(KEY_ROPE_SCALING_TYPE.format(arch=self.arch), int(value)) + self.add_string(KEY_ROPE_SCALING_TYPE.format(arch=self.arch), value.value) def add_rope_scaling_factor(self, value: float): self.add_float32(KEY_ROPE_SCALING_FACTOR.format(arch=self.arch), value) diff --git a/llama.cpp b/llama.cpp index d862541d1f80c..4f68ba69259bc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -470,6 +470,22 @@ struct LLM_TN { } \ } +static std::map LLAMA_ROPE_SCALING_TYPES = { + { LLAMA_ROPE_SCALING_NONE, "none" }, + { LLAMA_ROPE_SCALING_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_YARN, "yarn" }, +}; + +static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) { + for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) { + if (kv.second == name) { + return kv.first; + } + } + + return LLAMA_ROPE_SCALING_UNSPECIFIED; +} + // // ggml helpers // @@ -1711,9 +1727,9 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const int8_t rope_scaling_type = params.rope_scaling_type; if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { - uint8_t type = LLAMA_ROPE_SCALING_LINEAR; - GGUF_GET_KEY(ctx, type, gguf_get_val_u8, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE)); - rope_scaling_type = int8_t(type); + std::string type("linear"); + GGUF_GET_KEY(ctx, type, gguf_get_val_str, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE)); + 
rope_scaling_type = int8_t(llama_rope_scaling_type_from_string(type)); } GGML_ASSERT(rope_scaling_type >= 0 && rope_scaling_type <= LLAMA_ROPE_SCALING_MAX_VALUE); From 4f4e94804dd986e00c7a8978900f015c5ac08c7e Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sat, 7 Oct 2023 12:59:27 -0400 Subject: [PATCH 18/28] improve printing of YaRN parameters --- llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 4f68ba69259bc..25f129e570205 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1957,11 +1957,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); - LLAMA_LOG_INFO("%s: YaRN scaling = %g\n", __func__, hparams.yarn_ext_factor); + LLAMA_LOG_INFO("%s: YaRN extension = %g\n", __func__, hparams.yarn_ext_factor); LLAMA_LOG_INFO("%s: YaRN orig ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); - LLAMA_LOG_INFO("%s: YaRN beta_fast = %f\n", __func__, hparams.yarn_beta_fast); - LLAMA_LOG_INFO("%s: YaRN beta_slow = %f\n", __func__, hparams.yarn_beta_slow); - LLAMA_LOG_INFO("%s: RoPE finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "no"); + LLAMA_LOG_INFO("%s: YaRN beta_fast = %.1f\n", __func__, hparams.yarn_beta_fast); + LLAMA_LOG_INFO("%s: YaRN beta_slow = %.1f\n", __func__, hparams.yarn_beta_slow); + LLAMA_LOG_INFO("%s: RoPE finetuned = %s\n", __func__, hparams.rope_finetuned ? 
"yes" : "unknown"); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); From 5d7a3a5c0dcfb4ad31a2532d3223e43dde78741b Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sat, 7 Oct 2023 13:20:33 -0400 Subject: [PATCH 19/28] allow forcing ext_factor to zero if scaling type is YaRN --- llama.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 25f129e570205..e6ce4486b17ca 100644 --- a/llama.cpp +++ b/llama.cpp @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -1735,7 +1736,7 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const // rope_freq_scale (inverse of the kv) is optional if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { - hparams.rope_freq_scale = 1.0f; + hparams.rope_freq_scale = 1.0f; // never scale if scaling type is none } else if (hparams.rope_freq_scale == 0.0f) { float ropescale = 0.0f; GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR)); @@ -1745,8 +1746,8 @@ static void llm_load_hparams(llama_model_loader & ml, llama_model & model, const hparams.rope_freq_scale = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; } - if (rope_scaling_type == LLAMA_ROPE_SCALING_YARN) { - hparams.yarn_ext_factor = 1.0f; // enable YaRN + if (std::isnan(hparams.yarn_ext_factor)) { // NaN indicates 'not set' + hparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 
1.0f : 0.0f; } // sanity check for n_rot (optional) @@ -6268,7 +6269,7 @@ struct llama_context_params llama_context_default_params() { /*.tensor_split =*/ nullptr, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, - /*.yarn_ext_factor =*/ 0.0f, + /*.yarn_ext_factor =*/ NAN, /*.yarn_attn_factor =*/ 1.0f, /*.yarn_beta_fast =*/ 32.0f, /*.yarn_beta_slow =*/ 1.0f, From babf0e0c8fce8c2045c36e1ba26c456968967664 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sun, 8 Oct 2023 14:31:35 -0400 Subject: [PATCH 20/28] fix rope_cuda parameter order --- ggml-cuda.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 487ccdd18c7d4..8e1ad45620b73 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5469,8 +5469,8 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons template static void rope_cuda( - const T * x, T * dst, int ncols, int nrows, float freq_scale, float ext_factor, float attn_factor, - float theta_scale, const int32_t * pos, int p_delta_rows, rope_corr_dims corr_dims, cudaStream_t stream + const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float theta_scale, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream ) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); From 0050e1ecc6fcb3f659ccbc964866350c35376114 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sun, 8 Oct 2023 16:51:51 -0400 Subject: [PATCH 21/28] default n_yarn_orig_ctx to n_ctx_train --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index c5403902816eb..6b3627c0fab3d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2011,7 +2011,7 @@ static void llm_load_hparams( GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false, kv(LLM_KV_ROPE_SCALING_FINETUNED)); - hparams.n_yarn_orig_ctx = 0; + hparams.n_yarn_orig_ctx = hparams.n_ctx_train; 
GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN)); From 09c31027db2e620d7b97b827fe5b6e3945fd7504 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sun, 8 Oct 2023 16:59:23 -0400 Subject: [PATCH 22/28] fix uninitialized cparams --- llama.cpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index 6b3627c0fab3d..ab2342bf390b9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7888,14 +7888,18 @@ struct llama_context * llama_new_context_with_model( const auto & hparams = model->hparams; auto & cparams = ctx->cparams; - cparams.n_batch = params.n_batch; - cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; - cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; - cparams.yarn_ext_factor = params.yarn_ext_factor; - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.mul_mat_q = params.mul_mat_q; + cparams.n_batch = params.n_batch; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.mul_mat_q = params.mul_mat_q; + + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; + cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? 
hparams.rope_freq_scale_train : params.rope_freq_scale; auto rope_scaling_type = params.rope_scaling_type; if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { From 57c3442ea5e0a84339e932807671835181220e63 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Sun, 8 Oct 2023 18:10:02 -0400 Subject: [PATCH 23/28] make printed param formatting more consistent --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index ab2342bf390b9..107dca5a63a2b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2256,11 +2256,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); - LLAMA_LOG_INFO("%s: RoPE scaling = %s\n", __func__, rope_scaling_type.c_str()); + LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); - LLAMA_LOG_INFO("%s: YaRN orig ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); - LLAMA_LOG_INFO("%s: RoPE finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); + LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); + LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? 
"yes" : "unknown"); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9); From a20b3e6cf05d7d4d9135bb418ceef4cdf1f34c65 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Wed, 11 Oct 2023 17:36:11 -0400 Subject: [PATCH 24/28] fix missing import --- gguf-py/gguf/gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 3a6cd75dd0afe..0a54bc7da3800 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -7,7 +7,7 @@ import struct import sys import tempfile -from enum import IntEnum, auto +from enum import Enum, IntEnum, auto from io import BufferedWriter from pathlib import Path from typing import IO, Any, BinaryIO, Callable, Sequence From 9ae10b3aee5e104a8f6477e9ed0f9708281e16e2 Mon Sep 17 00:00:00 2001 From: Jeffrey Quesnelle Date: Thu, 19 Oct 2023 19:36:16 -0700 Subject: [PATCH 25/28] Fix YaRN inverted scaling and add "rope.scaling.type" to GGUF (#1) --- convert.py | 1 + ggml-cuda.cu | 4 ++-- ggml-metal.metal | 4 ++-- ggml.c | 4 ++-- llama.cpp | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/convert.py b/convert.py index 6e294b503e785..175f4b14b588c 100755 --- a/convert.py +++ b/convert.py @@ -234,6 +234,7 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: n_head_kv = config.get("num_key_value_heads", n_head), f_norm_eps = config["rms_norm_eps"], f_rope_freq_base = config.get("rope_theta"), + rope_scaling_type = rope_scaling_type, f_rope_scale = f_rope_scale, n_orig_ctx = n_orig_ctx, rope_finetuned = rope_finetuned, diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 95e1ae4c6c9f2..ff7b1e90a2758 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4429,8 +4429,8 @@ static __device__ void rope_yarn( } // Get n-d magnitude scaling corrected for interpolation 
- if (freq_scale > 1.0f) - mscale *= 1.0f + 0.1f * logf(freq_scale); + if (freq_scale < 1.0f) + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); *cos_theta = cosf(theta) * mscale; *sin_theta = sinf(theta) * mscale; } diff --git a/ggml-metal.metal b/ggml-metal.metal index 6fd3f9aa006ff..2064884fffe41 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -899,8 +899,8 @@ static void rope_yarn( } // Get n-d magnitude scaling corrected for interpolation - if (freq_scale > 1.0f) - mscale *= 1.0f + 0.1f * logf(freq_scale); + if (freq_scale < 1.0f) + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); *cos_theta = cosf(theta) * mscale; *sin_theta = sinf(theta) * mscale; } diff --git a/ggml.c b/ggml.c index 4b40a4e71fa6b..a243418106ee3 100644 --- a/ggml.c +++ b/ggml.c @@ -13364,8 +13364,8 @@ static void rope_yarn( } // Get n-d magnitude scaling corrected for interpolation - if (freq_scale > 1.0f) - mscale *= 1.0f + 0.1f * logf(freq_scale); + if (freq_scale < 1.0f) + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); *cos_theta = cosf(theta) * mscale; *sin_theta = sinf(theta) * mscale; } diff --git a/llama.cpp b/llama.cpp index faeee0d3ac073..cbab5f580969c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2055,7 +2055,7 @@ static void llm_load_hparams( GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); std::string rope_scaling("linear"); - GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_UINT8, false, kv(LLM_KV_ROPE_SCALING_TYPE)); + GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE)); hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); From 14cf93b14c3e5160b383136f09bbb1344c1bf0ba Mon Sep 17 00:00:00 2001 From: Jeffrey Quesnelle Date: Fri, 20 Oct 2023 06:18:17 -0700 Subject: [PATCH 26/28] fix YaRN ramp, make mscale conditional, add 
--yarn-orig-ctx (#2) --- common/common.cpp | 8 ++++++++ common/common.h | 5 +++-- ggml-cuda.cu | 7 +++---- ggml-metal.metal | 7 +++---- ggml.c | 7 +++---- llama.cpp | 10 ++++++---- llama.h | 13 +++++++------ 7 files changed, 33 insertions(+), 24 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 3fafdfb38fef1..d0b05c1ba0a83 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -220,6 +220,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); + } else if (arg == "--yarn-orig-ctx") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_orig_ctx = std::stoi(argv[i]); } else if (arg == "--yarn-ext-factor") { if (++i >= argc) { invalid_param = true; @@ -737,6 +743,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); + printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n"); printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); @@ -861,6 +868,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_attn_factor = params.yarn_attn_factor; cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; + cparams.yarn_orig_ctx = params.yarn_orig_ctx; return cparams; } diff --git a/common/common.h b/common/common.h index 91993dba1850f..01c2661b03fe9 100644 
--- a/common/common.h +++ b/common/common.h @@ -57,8 +57,9 @@ struct gpt_params { float rope_freq_scale = 0.0f; // RoPE frequency scaling factor float yarn_ext_factor = NAN; // YaRN extrapolation mix factor float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor - float yarn_beta_fast = 32.0f; // YaRN low correction dim - float yarn_beta_slow = 1.0f; // YaRN high correction dim + float yarn_beta_fast = 32.0f;// YaRN low correction dim + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // // sampling parameters diff --git a/ggml-cuda.cu b/ggml-cuda.cu index ff7b1e90a2758..4c6a36ca11d3a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4406,7 +4406,7 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, } static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) { - const float y = (i0 / 2 - low) / min(0.001f, high - low); + const float y = (i0 / 2 - low) / max(0.001f, high - low); return 1.0f - min(1.0f, max(0.0f, y)); } @@ -4426,11 +4426,10 @@ static __device__ void rope_yarn( if (ext_factor != 0.0f) { float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - } - // Get n-d magnitude scaling corrected for interpolation - if (freq_scale < 1.0f) + // Get n-d magnitude scaling corrected for interpolation mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } *cos_theta = cosf(theta) * mscale; *sin_theta = sinf(theta) * mscale; } diff --git a/ggml-metal.metal b/ggml-metal.metal index 2064884fffe41..42729177433ba 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -880,7 +880,7 @@ kernel void kernel_alibi_f32( } static float rope_yarn_ramp(const float low, const float high, const int i0) { - const float y = (i0 / 2 - low) / min(0.001f, high - low); + const float y = (i0 / 2 - low) / 
max(0.001f, high - low); return 1.0f - min(1.0f, max(0.0f, y)); } @@ -896,11 +896,10 @@ static void rope_yarn( if (ext_factor != 0.0f) { ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - } - // Get n-d magnitude scaling corrected for interpolation - if (freq_scale < 1.0f) + // Get n-d magnitude scaling corrected for interpolation mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } *cos_theta = cosf(theta) * mscale; *sin_theta = sinf(theta) * mscale; } diff --git a/ggml.c b/ggml.c index a243418106ee3..111ee3e56fe75 100644 --- a/ggml.c +++ b/ggml.c @@ -13345,7 +13345,7 @@ static void ggml_compute_forward_clamp( // ggml_compute_forward_rope static float rope_yarn_ramp(const float low, const float high, const int i0) { - const float y = (i0 / 2 - low) / MIN(0.001f, high - low); + const float y = (i0 / 2 - low) / MAX(0.001f, high - low); return 1 - MIN(1, MAX(0, y)); } @@ -13361,11 +13361,10 @@ static void rope_yarn( if (ext_factor != 0.0f) { float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - } - // Get n-d magnitude scaling corrected for interpolation - if (freq_scale < 1.0f) + // Get n-d magnitude scaling corrected for interpolation mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } *cos_theta = cosf(theta) * mscale; *sin_theta = sinf(theta) * mscale; } diff --git a/llama.cpp b/llama.cpp index cbab5f580969c..01e219a48e523 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1113,6 +1113,7 @@ struct llama_cparams { float rope_freq_base; float rope_freq_scale; + uint32_t n_yarn_orig_ctx; // These hyperparameters are not exposed in GGUF, because all // existing YaRN models use the same values for them. 
float yarn_ext_factor; @@ -3028,7 +3029,7 @@ static struct ggml_cgraph * llm_build_llama( const int32_t n_embd = hparams.n_embd; const int32_t n_layer = hparams.n_layer; const int32_t n_ctx = cparams.n_ctx; - const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; const int32_t n_head = hparams.n_head; const int32_t n_head_kv = hparams.n_head_kv; const int32_t n_embd_head = hparams.n_embd_head(); @@ -3430,7 +3431,7 @@ static struct ggml_cgraph * llm_build_baichaun( const int32_t n_embd = hparams.n_embd; const int32_t n_layer = hparams.n_layer; const int32_t n_ctx = cparams.n_ctx; - const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; const int32_t n_head = hparams.n_head; const int32_t n_head_kv = hparams.n_head_kv; const int32_t n_embd_head = hparams.n_embd_head(); @@ -4194,7 +4195,7 @@ static struct ggml_cgraph * llm_build_falcon( const int32_t n_embd = hparams.n_embd; const int32_t n_layer = hparams.n_layer; const int32_t n_ctx = cparams.n_ctx; - const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; const int32_t n_head = hparams.n_head; const int32_t n_head_kv = hparams.n_head_kv; const int32_t n_embd_head = hparams.n_embd_head(); @@ -4818,7 +4819,7 @@ static struct ggml_cgraph * llm_build_persimmon( const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; - const int32_t n_orig_ctx = hparams.n_yarn_orig_ctx; + const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_head = hparams.n_head; const int64_t n_embd_head = hparams.n_embd_head(); @@ -8676,6 +8677,7 @@ struct llama_context * llama_new_context_with_model( cparams.mul_mat_q = params.mul_mat_q; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; + cparams.n_yarn_orig_ctx = params.yarn_orig_ctx == 0 ? 
hparams.n_ctx_train : params.yarn_orig_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; diff --git a/llama.h b/llama.h index 48e12cfea79d5..5f6b14e192c7a 100644 --- a/llama.h +++ b/llama.h @@ -182,12 +182,13 @@ extern "C" { int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` // ref: https://github.com/ggerganov/llama.cpp/pull/2054 - float rope_freq_base; // RoPE base frequency, 0 = from model - float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model - float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model - float yarn_attn_factor; // YaRN magnitude scaling factor - float yarn_beta_fast; // YaRN low correction dim - float yarn_beta_slow; // YaRN high correction dim + float rope_freq_base; // RoPE base frequency, 0 = from model + float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model + float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model + float yarn_attn_factor; // YaRN magnitude scaling factor + float yarn_beta_fast; // YaRN low correction dim + float yarn_beta_slow; // YaRN high correction dim + uint32_t yarn_orig_ctx; // YaRN original context size // Keep the booleans together to avoid misalignment during copy-by-value. 
bool mul_mat_q; // if true, use experimental mul_mat_q kernels From 9fc823826e815f0fba3544f0818326fb0045bfb7 Mon Sep 17 00:00:00 2001 From: Jeffrey Quesnelle Date: Mon, 30 Oct 2023 08:35:51 -0700 Subject: [PATCH 27/28] fix loading rope.scaling.original_context_length from GGUF (#3) Co-authored-by: cebtenzzre --- llama.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index e0794f90cac32..a28c6f9601ae2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8968,10 +8968,13 @@ struct llama_context * llama_new_context_with_model( cparams.mul_mat_q = params.mul_mat_q; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; - cparams.n_yarn_orig_ctx = params.yarn_orig_ctx == 0 ? hparams.n_ctx_train : params.yarn_orig_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_yarn_orig_ctx != 0 ? 
hparams.n_yarn_orig_ctx : + hparams.n_ctx_train; + auto rope_scaling_type = params.rope_scaling_type; if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { rope_scaling_type = hparams.rope_scaling_type_train; From 15f26efdb129c8ed116c58bb276897c760a8bbe2 Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Wed, 1 Nov 2023 16:44:49 -0400 Subject: [PATCH 28/28] implement YaRN for GPT-NeoX RoPE --- ggml-cuda.cu | 81 ++++++++++++++++++++++++++++++------------------ ggml-metal.metal | 9 ++++-- ggml.c | 22 ++++++++++--- 3 files changed, 74 insertions(+), 38 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index a43a5873d4a2d..4c117344bc849 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4439,7 +4439,7 @@ static __device__ void rope_yarn( // rope == RoPE == rotary positional embedding template static __global__ void rope( - const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float theta_scale, + const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims ) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); @@ -4453,7 +4453,7 @@ static __global__ void rope( const int i2 = row/p_delta_rows; const int p = has_pos ? 
pos[i2] : 0; - const float theta_base = p*powf(theta_scale, col/2); + const float theta_base = p*powf(freq_base, -float(col)/ncols); float cos_theta, sin_theta; rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); @@ -4466,8 +4466,10 @@ static __global__ void rope( } template -static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale) { +static __global__ void rope_neox( + const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + float ext_factor, float attn_factor, rope_corr_dims corr_dims +) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (col >= ncols) { @@ -4478,11 +4480,14 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in const int i = row*ncols + col/2; const int i2 = row/p_delta_rows; + // float exponent -col/ncols (int/int would truncate to 0); no per-row term: caller asserts ne00 == n_dims, so block index is 0 + const float cur_rot = -float(col)/ncols; + const int p = has_pos ? 
pos[i2] : 0; - const float p0 = p*freq_scale; - const float theta = p0*powf(theta_scale, col/2); - const float sin_theta = sinf(theta); - const float cos_theta = cosf(theta); + const float theta_base = p*powf(freq_base, cur_rot); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); const float x0 = x[i + 0]; const float x1 = x[i + ncols/2]; @@ -4491,8 +4496,10 @@ static __global__ void rope_neox(const T * x, T * dst, const int ncols, const in dst[i + ncols/2] = x0*sin_theta + x1*cos_theta; } -static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, const int n_ctx) { +static __global__ void rope_glm_f32( + const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + int n_ctx +) { const int col = blockDim.x*blockIdx.x + threadIdx.x; const int half_n_dims = ncols/4; @@ -4504,7 +4511,7 @@ static __global__ void rope_glm_f32(const float * x, float * dst, const int ncol const int i = row*ncols + col; const int i2 = row/p_delta_rows; - const float col_theta_scale = powf(theta_scale, col); + const float col_theta_scale = powf(freq_base, -2.0f*col/ncols); // FIXME: this is likely wrong const int p = pos != nullptr ? 
pos[i2] : 0; @@ -5525,7 +5532,7 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const template static void rope_cuda( const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, - float theta_scale, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream ) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); @@ -5533,36 +5540,44 @@ static void rope_cuda( const dim3 block_nums(nrows, num_blocks_x, 1); if (pos == nullptr) { rope<<>>( - x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, ext_factor, attn_factor, corr_dims + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims ); } else { rope<<>>( - x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, ext_factor, attn_factor, corr_dims + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims ); } } template -static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, cudaStream_t stream) { +static void rope_neox_cuda( + const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream +) { GGML_ASSERT(ncols % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); if (pos == nullptr) { - rope_neox<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale); + rope_neox<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); } else { - rope_neox<<>>(x, dst, ncols, pos, 
freq_scale, p_delta_rows, theta_scale); + rope_neox<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); } } -static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, - const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) { +static void rope_glm_f32_cuda( + const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, int n_ctx, cudaStream_t stream +) { GGML_ASSERT(ncols % 4 == 0); const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1); const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; const dim3 block_nums(num_blocks_x, nrows, 1); - rope_glm_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx); + rope_glm_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx); } static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, @@ -6425,8 +6440,6 @@ inline void ggml_cuda_op_rope( memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - const float theta_scale = powf(freq_base, -2.0f/n_dims); - const int32_t * pos = nullptr; if ((mode & 1) == 0) { GGML_ASSERT(src1->type == GGML_TYPE_I32); @@ -6437,31 +6450,37 @@ inline void ggml_cuda_op_rope( const bool is_neox = mode & 2; const bool is_glm = mode & 4; + rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); + // compute if (is_glm) { GGML_ASSERT(false); - rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream); + rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream); } else if (is_neox) { GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for 
CUDA yet"); if (src0->type == GGML_TYPE_F32) { - rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + rope_neox_cuda( + (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); } else if (src0->type == GGML_TYPE_F16) { - rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream); + rope_neox_cuda( + (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, main_stream + ); } else { GGML_ASSERT(false); } } else { - rope_corr_dims corr_dims; - ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); - if (src0->type == GGML_TYPE_F32) { rope_cuda( - (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, ext_factor, + (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, attn_factor, corr_dims, main_stream ); } else if (src0->type == GGML_TYPE_F16) { rope_cuda( - (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, ext_factor, + (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, attn_factor, corr_dims, main_stream ); } else { diff --git a/ggml-metal.metal b/ggml-metal.metal index ddfe378131ce6..791b0c0c47de9 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1125,9 +1125,12 @@ kernel void kernel_rope( for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) { - const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib); - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + // simplified from `(ib * n_dims + ic) * inv_ndims` + const float cur_rot = inv_ndims*ic - ib; + + const float theta = theta_0 * pow(freq_base, cur_rot); 
+ float cos_theta, sin_theta; + rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); const int64_t i0 = ib*n_dims + ic/2; diff --git a/ggml.c b/ggml.c index 0034067d55a3f..2c6555e15964c 100644 --- a/ggml.c +++ b/ggml.c @@ -13486,6 +13486,7 @@ static void ggml_compute_forward_rope_f32( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); @@ -13556,8 +13557,14 @@ static void ggml_compute_forward_rope_f32( theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; + + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); theta_base *= theta_scale; @@ -13628,6 +13635,7 @@ static void ggml_compute_forward_rope_f16( int ir = 0; const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.f/n_dims; float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); @@ -13694,8 +13702,14 @@ static void ggml_compute_forward_rope_f16( theta_base *= freq_scale; for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base); + // simplified from `(ib * n_dims + ic) * inv_ndims` + float cur_rot = inv_ndims * ic - ib; + + float cos_theta, sin_theta; + rope_yarn( + theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, + &cos_theta, &sin_theta + ); theta_base *= theta_scale;