From b6562e26d662216bc760107a24c67d5f767b64e8 Mon Sep 17 00:00:00 2001 From: zhink <771809832@qq.com> Date: Thu, 12 Dec 2024 11:13:28 +0800 Subject: [PATCH 1/4] add FLAGS instead max_partition_size --- csrc/gpu/append_attention.cu | 47 +------------------ .../append_attn/append_attention_c16_impl.cuh | 14 ++---- .../append_attn/append_attention_c4_impl.cuh | 14 ++---- .../append_attn/append_attention_c8_impl.cuh | 14 ++---- .../gpu/append_attn/append_attention_kernel.h | 28 +++++------ .../get_block_shape_and_split_kv_block.cu | 8 ++-- ..._attention_c16_bfloat16_bfloat16_kernel.cu | 2 - ...ppend_attention_c16_bfloat16_fp8_kernel.cu | 2 - ...pend_attention_c16_bfloat16_int8_kernel.cu | 2 - ...nd_attention_c16_float16_float16_kernel.cu | 2 - ...append_attention_c16_float16_fp8_kernel.cu | 2 - ...ppend_attention_c16_float16_int8_kernel.cu | 2 - ...d_attention_c4_bfloat16_bfloat16_kernel.cu | 2 - ...append_attention_c4_bfloat16_fp8_kernel.cu | 2 - ...ppend_attention_c4_bfloat16_int8_kernel.cu | 2 - ...end_attention_c4_float16_float16_kernel.cu | 2 - .../append_attention_c4_float16_fp8_kernel.cu | 2 - ...append_attention_c4_float16_int8_kernel.cu | 2 - ...d_attention_c8_bfloat16_bfloat16_kernel.cu | 2 - ...append_attention_c8_bfloat16_fp8_kernel.cu | 2 - ...ppend_attention_c8_bfloat16_int8_kernel.cu | 2 - ...end_attention_c8_float16_float16_kernel.cu | 2 - .../append_attention_c8_float16_fp8_kerne.cu | 2 - .../append_attention_c8_float16_int8_kerne.cu | 2 - csrc/gpu/helper.h | 14 ++++++ .../transformers/fused_transformer_layers.py | 18 ------- 26 files changed, 45 insertions(+), 148 deletions(-) diff --git a/csrc/gpu/append_attention.cu b/csrc/gpu/append_attention.cu index f80f8cee5d3d..d24a20e48d11 100644 --- a/csrc/gpu/append_attention.cu +++ b/csrc/gpu/append_attention.cu @@ -59,10 +59,6 @@ std::vector AppendAttentionKernel( const float quant_max_bound, const float quant_min_bound, const float out_linear_in_scale, - const int encoder_block_shape_q, - const int decoder_block_shape_q, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool speculate_decoder) { @@ -76,7 +72,8 @@ std::vector AppendAttentionKernel( int max_enc_len_this_time_data = max_enc_len_this_time.data()[0]; int max_dec_len_this_time_data = max_dec_len_this_time.data()[0]; int max_len_kv_data = max_len_kv.data()[0]; - + const int encoder_block_shape_q = get_encoder_block_shape_q(); + const int decoder_block_shape_q = get_decoder_block_shape_q(); auto main_stream = qkv.stream(); static cudaEvent_t main_event; static cudaEvent_t decoder_event; @@ -209,8 +206,6 @@ std::vector AppendAttentionKernel( quant_max_bound, quant_min_bound, out_linear_in_scale, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, causal, false, @@ -248,8 +243,6 @@ std::vector AppendAttentionKernel( quant_max_bound, quant_min_bound, out_linear_in_scale, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, causal, false, @@ -292,8 +285,6 @@ std::vector AppendAttentionKernel( quant_max_bound, quant_min_bound, out_linear_in_scale, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, causal, false, @@ -440,8 +431,6 @@ std::vector AppendAttentionKernel( quant_max_bound, quant_min_bound, out_linear_in_scale, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, causal, !speculate_decoder, @@ -479,8 +468,6 @@ std::vector AppendAttentionKernel( quant_max_bound, quant_min_bound, out_linear_in_scale, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, causal, !speculate_decoder, @@ -524,8 +511,6 @@ std::vector AppendAttentionKernel( quant_max_bound, quant_min_bound, out_linear_in_scale, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, causal, !speculate_decoder, @@ -583,10 +568,6 @@ std::vector AppendAttention( const float quant_max_bound, const float quant_min_bound, const float out_linear_in_scale, - const int encoder_block_shape_q, - const int decoder_block_shape_q, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool speculate_decoder) { @@ -648,10 +629,6 @@ std::vector AppendAttention( quant_max_bound, quant_min_bound, out_linear_in_scale, - encoder_block_shape_q, - decoder_block_shape_q, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, causal, speculate_decoder); @@ -698,10 +675,6 @@ std::vector AppendAttention( quant_max_bound, quant_min_bound, out_linear_in_scale, - encoder_block_shape_q, - decoder_block_shape_q, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, causal, speculate_decoder); @@ -749,10 +722,6 @@ std::vector AppendAttention( quant_max_bound, quant_min_bound, out_linear_in_scale, - encoder_block_shape_q, - decoder_block_shape_q, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, causal, speculate_decoder); @@ -798,10 +767,6 @@ std::vector AppendAttention( quant_max_bound, quant_min_bound, out_linear_in_scale, - encoder_block_shape_q, - decoder_block_shape_q, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, causal, speculate_decoder); @@ -903,10 +868,6 @@ std::vector AppendAttentionInferDtype( const float quant_max_bound, const float quant_min_bound, const float out_linear_in_scale, - const int encoder_block_shape_q, - const int decoder_block_shape_q, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool speculate_decoder) { @@ -983,10 +944,6 @@ PD_BUILD_OP(append_attention) "quant_max_bound: float", "quant_min_bound: float", "out_linear_in_scale: float", - "encoder_block_shape_q: int", - "decoder_block_shape_q: int", - "max_partition_size: int", - "encoder_max_partition_size: int", "speculate_max_draft_token_num: int", "causal: bool", "speculate_decoder: bool"}) diff --git a/csrc/gpu/append_attn/append_attention_c16_impl.cuh b/csrc/gpu/append_attn/append_attention_c16_impl.cuh index ed181836d73c..2d6d098cf7ec 100644 --- a/csrc/gpu/append_attn/append_attention_c16_impl.cuh +++ b/csrc/gpu/append_attn/append_attention_c16_impl.cuh @@ -786,8 +786,6 @@ void MultiQueryAppendAttention( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool is_decoder, cudaStream_t &stream, @@ -839,9 +837,9 @@ void MultiQueryAppendAttention( int sm_count; cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); - uint32_t chunk_size = static_cast(max_partition_size); + uint32_t chunk_size = get_max_partition_size(bsz); if (!is_decoder) { - chunk_size = static_cast(encoder_max_partition_size); + chunk_size = get_encoder_max_partition_size(); } const int num_chunks = div_up(max_dec_len, chunk_size); dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads); @@ -1058,9 +1056,9 @@ void MultiQueryAppendAttention( int sm_count; cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); - uint32_t chunk_size = static_cast(max_partition_size); + uint32_t chunk_size = get_max_partition_size(bsz); if (!is_decoder) { - chunk_size = static_cast(encoder_max_partition_size); + chunk_size = get_encoder_max_partition_size(); } const int num_chunks = div_up(max_dec_len, chunk_size); @@ -1301,8 +1299,6 @@ void CascadeAppendAttentionC16Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, @@ -1363,8 +1359,6 @@ void CascadeAppendAttentionC16Kernel( quant_max_bound, quant_min_bound, in_scale, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, is_decoder, stream, diff --git a/csrc/gpu/append_attn/append_attention_c4_impl.cuh b/csrc/gpu/append_attn/append_attention_c4_impl.cuh index 586bde4dc741..883e71e18816 100644 --- a/csrc/gpu/append_attn/append_attention_c4_impl.cuh +++ b/csrc/gpu/append_attn/append_attention_c4_impl.cuh @@ -973,8 +973,6 @@ void MultiQueryAppendC4Attention( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool is_decoder, cudaStream_t &stream, @@ -1036,9 +1034,9 @@ void MultiQueryAppendC4Attention( const float ratio = static_cast(num_blocks_need) / static_cast(num_blocks_per_wave); - uint32_t chunk_size = static_cast(max_partition_size); + uint32_t chunk_size = get_max_partition_size(bsz); if (!is_decoder) { - chunk_size = static_cast(encoder_max_partition_size); + chunk_size = get_encoder_max_partition_size(); } const int num_chunks = div_up(max_dec_len, chunk_size); @@ -1282,9 +1280,9 @@ void MultiQueryAppendC4Attention( static_cast(num_blocks_per_wave); - uint32_t chunk_size = static_cast(max_partition_size); + uint32_t chunk_size = get_max_partition_size(bsz); if (!is_decoder) { - chunk_size = static_cast(encoder_max_partition_size); + chunk_size = get_encoder_max_partition_size(); } const int num_chunks = div_up(max_dec_len, chunk_size); dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads); @@ -1538,8 +1536,6 @@ void CascadeAppendAttentionC4Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, @@ -1604,8 +1600,6 @@ void CascadeAppendAttentionC4Kernel( quant_max_bound, quant_min_bound, in_scale, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, is_decoder, stream, diff --git a/csrc/gpu/append_attn/append_attention_c8_impl.cuh b/csrc/gpu/append_attn/append_attention_c8_impl.cuh index d5d1cc38e1b4..3bff1bc15bd5 100644 --- a/csrc/gpu/append_attn/append_attention_c8_impl.cuh +++ b/csrc/gpu/append_attn/append_attention_c8_impl.cuh @@ -860,8 +860,6 @@ void MultiQueryAppendC8Attention( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool is_decoder, cudaStream_t &stream, @@ -914,9 +912,9 @@ void MultiQueryAppendC8Attention( const int dev_id = 0; int sm_count; cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); - uint32_t chunk_size = static_cast(max_partition_size); + uint32_t chunk_size = get_max_partition_size(bsz); if (!is_decoder) { - chunk_size = static_cast(encoder_max_partition_size); + chunk_size = get_encoder_max_partition_size(); } const int num_chunks = div_up(max_dec_len, chunk_size); dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads); @@ -1136,9 +1134,9 @@ void MultiQueryAppendC8Attention( const int dev_id = 0; int sm_count; cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); - uint32_t chunk_size = static_cast(max_partition_size); + uint32_t chunk_size = get_max_partition_size(bsz); if (!is_decoder) { - chunk_size = static_cast(encoder_max_partition_size); + chunk_size = get_encoder_max_partition_size(); } const int num_chunks = div_up(max_dec_len, chunk_size); @@ -1377,8 +1375,6 @@ void CascadeAppendAttentionC8Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, @@ -1441,8 +1437,6 @@ void CascadeAppendAttentionC8Kernel( quant_max_bound, quant_min_bound, in_scale, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, is_decoder, stream, diff --git a/csrc/gpu/append_attn/append_attention_kernel.h b/csrc/gpu/append_attn/append_attention_kernel.h index 59532b2400c5..a00b925b0f7e 100644 --- a/csrc/gpu/append_attn/append_attention_kernel.h +++ b/csrc/gpu/append_attn/append_attention_kernel.h @@ -52,8 +52,6 @@ void CascadeAppendAttentionC16Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, @@ -97,8 +95,6 @@ void CascadeAppendAttentionC8Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, @@ -142,8 +138,6 @@ void CascadeAppendAttentionC4Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, @@ -188,8 +182,6 @@ void CascadeAppendAttentionKernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, @@ -223,8 +215,6 @@ void CascadeAppendAttentionKernel( quant_max_bound, quant_min_bound, in_scale, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, causal, is_decoder, @@ -258,8 +248,6 @@ void CascadeAppendAttentionKernel( quant_max_bound, quant_min_bound, in_scale, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, causal, is_decoder, @@ -293,8 +281,6 @@ void CascadeAppendAttentionKernel( quant_max_bound, quant_min_bound, in_scale, - max_partition_size, - encoder_max_partition_size, speculate_max_draft_token_num, causal, is_decoder, @@ -307,3 +293,17 @@ void CascadeAppendAttentionKernel( "cache_int4_zp]"); } } + +inline uint32_t get_max_partition_size(int bsz) { + static const char* max_partition_size_env = std::getenv("FLAGS_cascade_attention_max_partition_size"); + static const uint32_t max_partition_size = + max_partition_size_env == nullptr ? 0 : std::stoul(std::string(max_partition_size_env)); + return (max_partition_size != 0 ? max_partition_size : (bsz == 1 ? 128 : 512)); +} + +inline uint32_t get_encoder_max_partition_size() { + static const char* encoder_max_partition_size_env = std::getenv("FLAGS_cascade_encoder_attention_max_partition_size"); + static const uint32_t encoder_max_partition_size = + encoder_max_partition_size_env == nullptr ? 32768 : std::stoul(std::string(encoder_max_partition_size_env)); + return encoder_max_partition_size; +} \ No newline at end of file diff --git a/csrc/gpu/append_attn/get_block_shape_and_split_kv_block.cu b/csrc/gpu/append_attn/get_block_shape_and_split_kv_block.cu index 26ace708c40d..7cf9ab9068eb 100644 --- a/csrc/gpu/append_attn/get_block_shape_and_split_kv_block.cu +++ b/csrc/gpu/append_attn/get_block_shape_and_split_kv_block.cu @@ -104,13 +104,13 @@ std::vector GetBlockShapeAndSplitKVBlock( const paddle::Tensor& max_enc_len_this_time, const paddle::Tensor& seq_lens_this_time, const paddle::Tensor& cum_offsets, - const int encoder_block_shape_q, - const int decoder_block_shape_q, const int group_size, const int block_size, const int decoder_step_token_num) { auto stream = seq_lens_encoder.stream(); int bsz = cum_offsets.shape()[0]; + const int encoder_block_shape_q = get_encoder_block_shape_q(); + const int decoder_block_shape_q = get_decoder_block_shape_q(); // decoder const uint32_t decoder_max_tile_size_per_bs_q = @@ -284,9 +284,7 @@ PD_BUILD_OP(get_block_shape_and_split_kv_block) "decoder_tile_ids_per_batch", "decoder_num_blocks", "max_len_kv"}) - .Attrs({"encoder_block_shape_q: int", - "decoder_block_shape_q: int", - "group_size: int", + .Attrs({"group_size: int", "block_size: int", "decoder_step_token_num: int"}) .SetKernelFn(PD_KERNEL(GetBlockShapeAndSplitKVBlock)) diff --git a/csrc/gpu/append_attn/template_instantiation/append_attention_c16_bfloat16_bfloat16_kernel.cu b/csrc/gpu/append_attn/template_instantiation/append_attention_c16_bfloat16_bfloat16_kernel.cu index 7dafef74ba88..79ba5cd7bc85 100644 --- a/csrc/gpu/append_attn/template_instantiation/append_attention_c16_bfloat16_bfloat16_kernel.cu +++ b/csrc/gpu/append_attn/template_instantiation/append_attention_c16_bfloat16_bfloat16_kernel.cu @@ -49,8 +49,6 @@ template void CascadeAppendAttentionC16Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, diff --git a/csrc/gpu/append_attn/template_instantiation/append_attention_c16_float16_float16_kernel.cu b/csrc/gpu/append_attn/template_instantiation/append_attention_c16_float16_float16_kernel.cu index 806eecbb529d..09e149c25233 100644 --- a/csrc/gpu/append_attn/template_instantiation/append_attention_c16_float16_float16_kernel.cu +++ b/csrc/gpu/append_attn/template_instantiation/append_attention_c16_float16_float16_kernel.cu @@ -48,8 +48,6 @@ template void CascadeAppendAttentionC16Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, diff --git a/csrc/gpu/append_attn/template_instantiation/append_attention_c16_float16_fp8_kernel.cu b/csrc/gpu/append_attn/template_instantiation/append_attention_c16_float16_fp8_kernel.cu index c677686d68aa..648d301880b8 100644 --- a/csrc/gpu/append_attn/template_instantiation/append_attention_c16_float16_fp8_kernel.cu +++ b/csrc/gpu/append_attn/template_instantiation/append_attention_c16_float16_fp8_kernel.cu @@ -48,8 +48,6 @@ template void CascadeAppendAttentionC16Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, diff --git a/csrc/gpu/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu b/csrc/gpu/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu index 75c6e80c3056..a3f0c95f02e2 100644 --- a/csrc/gpu/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu +++ b/csrc/gpu/append_attn/template_instantiation/append_attention_c4_bfloat16_bfloat16_kernel.cu @@ -48,8 +48,6 @@ template void CascadeAppendAttentionC4Kernel const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, diff --git a/csrc/gpu/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu b/csrc/gpu/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu index 065834d6d0d8..63b03741b0e7 100644 --- a/csrc/gpu/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu +++ b/csrc/gpu/append_attn/template_instantiation/append_attention_c4_bfloat16_fp8_kernel.cu @@ -48,8 +48,6 @@ template void CascadeAppendAttentionC4Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, diff --git a/csrc/gpu/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu b/csrc/gpu/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu index 3a2b13a89045..aae73a837de4 100644 --- a/csrc/gpu/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu +++ b/csrc/gpu/append_attn/template_instantiation/append_attention_c4_float16_float16_kernel.cu @@ -49,8 +49,6 @@ template void CascadeAppendAttentionC4Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, diff --git a/csrc/gpu/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu b/csrc/gpu/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu index 4f5dedb15dc5..57c5e36fca93 100644 --- a/csrc/gpu/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu +++ b/csrc/gpu/append_attn/template_instantiation/append_attention_c4_float16_fp8_kernel.cu @@ -48,8 +48,6 @@ template void CascadeAppendAttentionC4Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, diff --git a/csrc/gpu/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu b/csrc/gpu/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu index 606c9128a973..e5d85cad2b5e 100644 --- a/csrc/gpu/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu +++ b/csrc/gpu/append_attn/template_instantiation/append_attention_c8_bfloat16_bfloat16_kernel.cu @@ -50,8 +50,6 @@ CascadeAppendAttentionC8Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, diff --git a/csrc/gpu/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu b/csrc/gpu/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu index efc54738fafc..e115efacf907 100644 --- a/csrc/gpu/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu +++ b/csrc/gpu/append_attn/template_instantiation/append_attention_c8_bfloat16_fp8_kernel.cu @@ -48,8 +48,6 @@ template void CascadeAppendAttentionC8Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, diff --git a/csrc/gpu/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu b/csrc/gpu/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu index 83728df8d409..cfa10da809da 100644 --- a/csrc/gpu/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu +++ b/csrc/gpu/append_attn/template_instantiation/append_attention_c8_float16_float16_kernel.cu @@ -48,8 +48,6 @@ template void CascadeAppendAttentionC8Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, diff --git a/csrc/gpu/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu b/csrc/gpu/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu index 35267a59f55b..842fb6415fca 100644 --- a/csrc/gpu/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu +++ b/csrc/gpu/append_attn/template_instantiation/append_attention_c8_float16_fp8_kerne.cu @@ -48,8 +48,6 @@ template void CascadeAppendAttentionC8Kernel( const float quant_max_bound, const float quant_min_bound, const float in_scale, - const int max_partition_size, - const int encoder_max_partition_size, const int speculate_max_draft_token_num, const bool causal, const bool is_decoder, diff --git a/csrc/gpu/helper.h b/csrc/gpu/helper.h index 4e8aa488141a..93ced1b7193c 100644 --- a/csrc/gpu/helper.h +++ b/csrc/gpu/helper.h @@ -221,3 +221,17 @@ __device__ inline bool is_in_end(const int64_t id, const int64_t *end_ids, int l } return flag; } + +inline uint32_t get_decoder_block_shape_q() { + static const char* decoder_block_shape_q_env = std::getenv("FLAGS_flag_dec_block_shape_q"); + static const uint32_t decoder_block_shape_q = + decoder_block_shape_q_env == nullptr ? 16 : std::stoi(std::string(decoder_block_shape_q_env)); + return decoder_block_shape_q; +} + +inline uint32_t get_encoder_block_shape_q() { + static const char* encoder_block_shape_q_env = std::getenv("FLAGS_flag_block_shape_q"); + static const uint32_t encoder_block_shape_q = + encoder_block_shape_q_env == nullptr ? 64 : std::stoi(std::string(encoder_block_shape_q_env)); + return encoder_block_shape_q; +} diff --git a/paddlenlp/experimental/transformers/fused_transformer_layers.py b/paddlenlp/experimental/transformers/fused_transformer_layers.py index cdf5730c7a86..8810c3fe19e4 100644 --- a/paddlenlp/experimental/transformers/fused_transformer_layers.py +++ b/paddlenlp/experimental/transformers/fused_transformer_layers.py @@ -1033,10 +1033,6 @@ def forward( kwargs["max_dec_len_this_time"] = max_dec_len_this_time if self.config.append_attn: - kwargs["encoder_block_shape_q"] = 64 - kwargs["decoder_block_shape_q"] = 16 - kwargs["max_partition_size"] = 32768 - kwargs["encoder_max_partition_size"] = 32768 from paddlenlp_ops import get_block_shape_and_split_kv_block @@ -1057,8 +1053,6 @@ def forward( max_enc_len_this_time, kwargs.get("seq_lens_this_time", None), kwargs.get("cum_offsets", None), - kwargs.get("encoder_block_shape_q", 64), - kwargs.get("decoder_block_shape_q", 16), self.num_heads // self.kv_num_heads, kwargs.get("block_size", 64), self.config.speculate_config.speculate_max_draft_token_num, @@ -2197,10 +2191,6 @@ def compute_attn( 0.0, 0.0, 0.0, # out_linear_in_scale - kwargs.get("encoder_block_shape_q", 64), - kwargs.get("decoder_block_shape_q", 16), - kwargs.get("max_partition_size", 32768), - kwargs.get("encoder_max_partition_size", 32768), self.config.speculate_config.speculate_max_draft_token_num, True, # causal self.config.speculate_config.speculate_method is not None, # speculate_decoder @@ -2395,10 +2385,6 @@ def compute_attn( self.quant_max_bound, self.quant_min_bound, self.act_scales["out_linear_in_scale"][i], - kwargs.get("encoder_block_shape_q", 64), - kwargs.get("decoder_block_shape_q", 16), - kwargs.get("max_partition_size", 32768), - kwargs.get("encoder_max_partition_size", 32768), self.config.speculate_config.speculate_max_draft_token_num, True, # causal self.config.speculate_config.speculate_method is not None, # speculate_decoder @@ -2760,10 +2746,6 @@ def compute_attn( self.quant_max_bound, self.quant_min_bound, self.act_scales["out_linear_in_scale"][i], - kwargs.get("encoder_block_shape_q", 64), - kwargs.get("decoder_block_shape_q", 16), - kwargs.get("max_partition_size", 32768), - kwargs.get("encoder_max_partition_size", 32768), self.config.speculate_config.speculate_max_draft_token_num, True, # causal False, # speculate_decoder From d402f236d87f713764deb99b6bfb6be86b27fcbc Mon Sep 17 00:00:00 2001 From: zhink <771809832@qq.com> Date: Mon, 30 Dec 2024 14:40:55 +0800 Subject: [PATCH 2/4] encoder_max_partition_size eq max_seq_len when encoder --- csrc/gpu/append_attn/append_attention_c16_impl.cuh | 4 ++-- csrc/gpu/append_attn/append_attention_c4_impl.cuh | 4 ++-- csrc/gpu/append_attn/append_attention_c8_impl.cuh | 4 ++-- csrc/gpu/append_attn/append_attention_kernel.h | 7 ------- 4 files changed, 6 insertions(+), 13 deletions(-) diff --git a/csrc/gpu/append_attn/append_attention_c16_impl.cuh b/csrc/gpu/append_attn/append_attention_c16_impl.cuh index 2d6d098cf7ec..3b08d0a85dbc 100644 --- a/csrc/gpu/append_attn/append_attention_c16_impl.cuh +++ b/csrc/gpu/append_attn/append_attention_c16_impl.cuh @@ -839,7 +839,7 @@ void MultiQueryAppendAttention( uint32_t chunk_size = get_max_partition_size(bsz); if (!is_decoder) { - chunk_size = get_encoder_max_partition_size(); + chunk_size = max_seq_len; } const int num_chunks = div_up(max_dec_len, chunk_size); dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads); @@ -1058,7 +1058,7 @@ void MultiQueryAppendAttention( uint32_t chunk_size = get_max_partition_size(bsz); if (!is_decoder) { - chunk_size = get_encoder_max_partition_size(); + chunk_size = max_seq_len; } const int num_chunks = div_up(max_dec_len, chunk_size); diff --git a/csrc/gpu/append_attn/append_attention_c4_impl.cuh b/csrc/gpu/append_attn/append_attention_c4_impl.cuh index 883e71e18816..7d49de3966e0 100644 --- a/csrc/gpu/append_attn/append_attention_c4_impl.cuh +++ b/csrc/gpu/append_attn/append_attention_c4_impl.cuh @@ -1036,7 +1036,7 @@ void MultiQueryAppendC4Attention( uint32_t chunk_size = get_max_partition_size(bsz); if (!is_decoder) { - chunk_size = get_encoder_max_partition_size(); + chunk_size = max_seq_len; } const int num_chunks = div_up(max_dec_len, chunk_size); @@ -1282,7 +1282,7 @@ void MultiQueryAppendC4Attention( uint32_t chunk_size = get_max_partition_size(bsz); if (!is_decoder) { - chunk_size = get_encoder_max_partition_size(); + chunk_size = max_seq_len; } const int num_chunks = div_up(max_dec_len, chunk_size); dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads); diff --git a/csrc/gpu/append_attn/append_attention_c8_impl.cuh b/csrc/gpu/append_attn/append_attention_c8_impl.cuh index 3bff1bc15bd5..e0ede51a9c81 100644 --- a/csrc/gpu/append_attn/append_attention_c8_impl.cuh +++ b/csrc/gpu/append_attn/append_attention_c8_impl.cuh @@ -914,7 +914,7 @@ void MultiQueryAppendC8Attention( cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); uint32_t chunk_size = get_max_partition_size(bsz); if (!is_decoder) { - chunk_size = get_encoder_max_partition_size(); + chunk_size = max_seq_len; } const int num_chunks = div_up(max_dec_len, chunk_size); dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads); @@ -1136,7 +1136,7 @@ void MultiQueryAppendC8Attention( cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id); uint32_t chunk_size = get_max_partition_size(bsz); if (!is_decoder) { - chunk_size = get_encoder_max_partition_size(); + chunk_size = max_seq_len; } const int num_chunks = div_up(max_dec_len, chunk_size); diff --git a/csrc/gpu/append_attn/append_attention_kernel.h b/csrc/gpu/append_attn/append_attention_kernel.h index a00b925b0f7e..10932d8f411d 100644 --- a/csrc/gpu/append_attn/append_attention_kernel.h +++ b/csrc/gpu/append_attn/append_attention_kernel.h @@ -299,11 +299,4 @@ inline uint32_t get_max_partition_size(int bsz) { static const uint32_t max_partition_size = max_partition_size_env == nullptr ? 0 : std::stoul(std::string(max_partition_size_env)); return (max_partition_size != 0 ? max_partition_size : (bsz == 1 ? 128 : 512)); -} - -inline uint32_t get_encoder_max_partition_size() { - static const char* encoder_max_partition_size_env = std::getenv("FLAGS_cascade_encoder_attention_max_partition_size"); - static const uint32_t encoder_max_partition_size = - encoder_max_partition_size_env == nullptr ? 32768 : std::stoul(std::string(encoder_max_partition_size_env)); - return encoder_max_partition_size; } \ No newline at end of file From 7a1e00a40402d5a52492ef980526efe10a2e122e Mon Sep 17 00:00:00 2001 From: zhink <771809832@qq.com> Date: Fri, 3 Jan 2025 14:18:40 +0800 Subject: [PATCH 3/4] Add environment variable description --- csrc/gpu/append_attn/append_attention_kernel.h | 2 +- csrc/gpu/helper.h | 4 ++-- llm/docs/predict/best_practices.md | 6 ++++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/csrc/gpu/append_attn/append_attention_kernel.h b/csrc/gpu/append_attn/append_attention_kernel.h index 10932d8f411d..b0fabcf893d3 100644 --- a/csrc/gpu/append_attn/append_attention_kernel.h +++ b/csrc/gpu/append_attn/append_attention_kernel.h @@ -299,4 +299,4 @@ inline uint32_t get_max_partition_size(int bsz) { static const uint32_t max_partition_size = max_partition_size_env == nullptr ? 0 : std::stoul(std::string(max_partition_size_env)); return (max_partition_size != 0 ? max_partition_size : (bsz == 1 ? 128 : 512)); -} \ No newline at end of file +} diff --git a/csrc/gpu/helper.h b/csrc/gpu/helper.h index 93ced1b7193c..99087ad1878d 100644 --- a/csrc/gpu/helper.h +++ b/csrc/gpu/helper.h @@ -223,14 +223,14 @@ __device__ inline bool is_in_end(const int64_t id, const int64_t *end_ids, int l } inline uint32_t get_decoder_block_shape_q() { - static const char* decoder_block_shape_q_env = std::getenv("FLAGS_flag_dec_block_shape_q"); + static const char* decoder_block_shape_q_env = std::getenv("FLAGS_dec_block_shape_q"); static const uint32_t decoder_block_shape_q = decoder_block_shape_q_env == nullptr ? 16 : std::stoi(std::string(decoder_block_shape_q_env)); return decoder_block_shape_q; } inline uint32_t get_encoder_block_shape_q() { - static const char* encoder_block_shape_q_env = std::getenv("FLAGS_flag_block_shape_q"); + static const char* encoder_block_shape_q_env = std::getenv("FLAGS_enc_block_shape_q"); static const uint32_t encoder_block_shape_q = encoder_block_shape_q_env == nullptr ? 64 : std::stoi(std::string(encoder_block_shape_q_env)); return encoder_block_shape_q; diff --git a/llm/docs/predict/best_practices.md b/llm/docs/predict/best_practices.md index 450a2f59fa53..496df200f3fd 100644 --- a/llm/docs/predict/best_practices.md +++ b/llm/docs/predict/best_practices.md @@ -22,3 +22,9 @@ PaddleNLP 提供了多种环境变量,用于优化推理性能和资源使用 - `FLAGS_fraction_of_gpu_memory_to_use`:GPU 显存使用率,默认值为0.9。设置为0.9即可。 - `FLAGS_gemm_use_half_precision_compute_type`:是否使用半精度浮点数计算,默认值为0。设置为0即可。 + +**Append Attention 优化** + +- `FLAGS_cascade_attention_max_partition_size`:Append Attention decoder计算时对cache_kv进行分chunk的chunk大小,batchsize为1时默认值为128,batchsize大于时512。显示设置时不区分batchsize。 +- `FLAGS_dec_block_shape_q`:Append Attention decoder计算时对q进行分块的分块大小,默认值为16。设置为16即可。 +- `FLAGS_enc_block_shape_q`:Append Attention encoder计算时对q进行分块的分块大小,默认值为64。设置为64即可。 From 87d0b782140482c8143ff99401b5b847d5b762d8 Mon Sep 17 00:00:00 2001 From: zhink <771809832@qq.com> Date: Fri, 3 Jan 2025 18:57:21 +0800 Subject: [PATCH 4/4] style --- llm/docs/predict/best_practices.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llm/docs/predict/best_practices.md b/llm/docs/predict/best_practices.md index 6075c8c19a33..77b29fcb5ebe 100644 --- a/llm/docs/predict/best_practices.md +++ b/llm/docs/predict/best_practices.md @@ -29,6 +29,6 @@ PaddleNLP 提供了多种环境变量,用于优化推理性能和资源使用 **Append Attention 优化** -- `FLAGS_cascade_attention_max_partition_size`:Append Attention decoder计算时对cache_kv进行分chunk的chunk大小,batchsize为1时默认值为128,batchsize大于时512。显示设置时不区分batchsize。 -- `FLAGS_dec_block_shape_q`:Append Attention decoder计算时对q进行分块的分块大小,默认值为16。设置为16即可。 -- `FLAGS_enc_block_shape_q`:Append Attention encoder计算时对q进行分块的分块大小,默认值为64。设置为64即可。 +- `FLAGS_cascade_attention_max_partition_size`:Append Attention decoder计算时对cache_kv进行分chunk的chunk大小,默认值根据batchsize设置,batchsize=1时设置为128,batchsize>1时设置为512。显式设置时不再区分batchsize。 +- `FLAGS_dec_block_shape_q`:Append Attention decoder计算时对q进行分块的分块大小,默认值为16。 +- `FLAGS_enc_block_shape_q`:Append Attention encoder计算时对q进行分块的分块大小,默认值为64。