From 64331c7165c68f60ad563992c21d98108034c46a Mon Sep 17 00:00:00 2001
From: zhink <771809832@qq.com>
Date: Fri, 3 Jan 2025 14:18:40 +0800
Subject: [PATCH] Add environment variable description

Document the Append Attention tuning environment variables
(FLAGS_cascade_attention_max_partition_size, FLAGS_dec_block_shape_q,
FLAGS_enc_block_shape_q) and add the helpers that read the block-shape
flags. The flags are parsed with std::stoul to stay consistent with
get_max_partition_size and to avoid a negative value silently wrapping
into uint32_t.
---
 csrc/gpu/append_attn/append_attention_kernel.h | 14 ++++++++++++++
 llm/docs/predict/best_practices.md             |  6 ++++++
 2 files changed, 20 insertions(+)

diff --git a/csrc/gpu/append_attn/append_attention_kernel.h b/csrc/gpu/append_attn/append_attention_kernel.h
index 10932d8f411d..55220266646b 100644
--- a/csrc/gpu/append_attn/append_attention_kernel.h
+++ b/csrc/gpu/append_attn/append_attention_kernel.h
@@ -299,4 +299,18 @@ inline uint32_t get_max_partition_size(int bsz) {
   static const uint32_t max_partition_size =
       max_partition_size_env == nullptr ? 0 : std::stoul(std::string(max_partition_size_env));
   return (max_partition_size != 0 ? max_partition_size : (bsz == 1 ? 128 : 512));
+}
+
+inline uint32_t get_decoder_block_shape_q() {
+  static const char* decoder_block_shape_q_env = std::getenv("FLAGS_dec_block_shape_q");
+  static const uint32_t decoder_block_shape_q =
+      decoder_block_shape_q_env == nullptr ? 16 : std::stoul(std::string(decoder_block_shape_q_env));
+  return decoder_block_shape_q;
+}
+
+inline uint32_t get_encoder_block_shape_q() {
+  static const char* encoder_block_shape_q_env = std::getenv("FLAGS_enc_block_shape_q");
+  static const uint32_t encoder_block_shape_q =
+      encoder_block_shape_q_env == nullptr ? 64 : std::stoul(std::string(encoder_block_shape_q_env));
+  return encoder_block_shape_q;
 }
\ No newline at end of file
diff --git a/llm/docs/predict/best_practices.md b/llm/docs/predict/best_practices.md
index 450a2f59fa53..496df200f3fd 100644
--- a/llm/docs/predict/best_practices.md
+++ b/llm/docs/predict/best_practices.md
@@ -22,3 +22,9 @@ PaddleNLP 提供了多种环境变量,用于优化推理性能和资源使用
 
 - `FLAGS_fraction_of_gpu_memory_to_use`:GPU 显存使用率,默认值为0.9。设置为0.9即可。
 - `FLAGS_gemm_use_half_precision_compute_type`:是否使用半精度浮点数计算,默认值为0。设置为0即可。
+
+**Append Attention 优化**
+
+- `FLAGS_cascade_attention_max_partition_size`:Append Attention decoder计算时对cache_kv进行分chunk的chunk大小,batchsize为1时默认值为128,batchsize大于1时为512。显式设置时不区分batchsize。
+- `FLAGS_dec_block_shape_q`:Append Attention decoder计算时对q进行分块的分块大小,默认值为16。设置为16即可。
+- `FLAGS_enc_block_shape_q`:Append Attention encoder计算时对q进行分块的分块大小,默认值为64。设置为64即可。