Clean up the comments and names under python/sglang/srt/layers #1047

Merged 3 commits on Aug 12, 2024
2 changes: 2 additions & 0 deletions python/sglang/srt/layers/activation.py
@@ -11,6 +11,8 @@
limitations under the License.
"""

"""Fused operators for activation layers."""

import torch
import torch.nn as nn
import torch.nn.functional as F
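The new module docstring describes activation.py as holding fused operators for activation layers. For orientation, here is a plain, unfused PyTorch sketch of the SiLU-and-mul pattern that such operators commonly fuse in LLaMA-style MLPs; this is an illustrative reference only, not the kernel code in this file.

```python
import torch
import torch.nn.functional as F

def silu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    # Unfused reference: split the last dimension in half, apply SiLU to
    # the first half (the "gate"), and multiply by the second half.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

# Example: a batch of 4 vectors with a gate/up projection of width 2 * 128.
x = torch.randn(4, 256)
print(silu_and_mul_reference(x).shape)  # torch.Size([4, 128])
```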
@@ -13,6 +13,10 @@
limitations under the License.
"""

"""
Memory-efficient attention for decoding.
"""

# Adapted from
# https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py
# https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_softmax_and_reducev.py
@@ -194,7 +198,7 @@ def _fwd_kernel_stage2(
tl.store(out_ptrs, acc)


-def _token_att_m_fwd(
+def _decode_att_m_fwd(
q,
k_buffer,
att_out,
@@ -254,7 +258,7 @@ def _token_att_m_fwd(
)


-def _token_softmax_reducev_fwd(
+def _decode_softmax_reducev_fwd(
logics,
v_buffer,
o,
@@ -292,7 +296,7 @@ def _token_softmax_reducev_fwd(
)


-def token_attention_fwd(
+def decode_attention_fwd(
q,
k_buffer,
v_buffer,
@@ -312,7 +316,7 @@ def token_attention_fwd(
(q.shape[-2], total_num_tokens), dtype=REDUCE_TORCH_TYPE, device="cuda"
)

-_token_att_m_fwd(
+_decode_att_m_fwd(
q,
k_buffer,
att_m,
@@ -324,7 +328,7 @@ def token_attention_fwd(
sm_scale,
logit_cap,
)
-_token_softmax_reducev_fwd(
+_decode_softmax_reducev_fwd(
att_m,
v_buffer,
o,
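To make the renamed entry points easier to review, here is a dense, unfused PyTorch sketch of what decode-time attention computes: each request contributes a single new query token that attends over its cached keys and values. The real kernel runs in Triton over a flattened token pool addressed by per-request indices; the dense [batch, heads, seq, dim] layout below is an assumption made purely for illustration.

```python
import torch

def decode_attention_reference(q, k_cache, v_cache, sm_scale):
    # q:       [batch, heads, head_dim]          one new token per request
    # k_cache: [batch, heads, seq_len, head_dim] cached keys
    # v_cache: [batch, heads, seq_len, head_dim] cached values
    scores = torch.einsum("bhd,bhsd->bhs", q, k_cache) * sm_scale
    probs = torch.softmax(scores, dim=-1)  # attention weights over the cache
    return torch.einsum("bhs,bhsd->bhd", probs, v_cache)

q = torch.randn(2, 4, 64)
k = torch.randn(2, 4, 16, 64)
v = torch.randn(2, 4, 16, 64)
out = decode_attention_reference(q, k, v, sm_scale=64 ** -0.5)
print(out.shape)  # torch.Size([2, 4, 64])
```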
7 changes: 6 additions & 1 deletion python/sglang/srt/layers/extend_attention.py
@@ -13,11 +13,16 @@
limitations under the License.
"""

"""
Memory-efficient attention for prefill.
It supporst page size = 1 and prefill with KV cache (i.e. extend).
"""

import torch
import triton
import triton.language as tl

-from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
+from sglang.srt.layers.prefill_attention import context_attention_fwd

CUDA_CAPABILITY = torch.cuda.get_device_capability()

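The new docstring frames this file as prefill attention with a KV cache ("extend"): newly arrived tokens attend to the cached prefix and, causally, to each other. A single-sequence, unfused PyTorch sketch of that computation follows; the tensor layout and function name are assumptions for illustration, not the Triton kernel's actual interface.

```python
import torch

def extend_attention_reference(q_ext, k_prefix, v_prefix, k_ext, v_ext, sm_scale):
    # q_ext:              [ext_len, heads, head_dim]     newly arrived tokens
    # k_prefix, v_prefix: [prefix_len, heads, head_dim]  already-cached tokens
    # k_ext, v_ext:       [ext_len, heads, head_dim]     KV of the new tokens
    k = torch.cat([k_prefix, k_ext], dim=0)
    v = torch.cat([v_prefix, v_ext], dim=0)
    prefix_len, ext_len = k_prefix.shape[0], q_ext.shape[0]

    scores = torch.einsum("qhd,khd->hqk", q_ext, k) * sm_scale
    # New token i may see the whole prefix plus new tokens at positions <= i.
    key_pos = torch.arange(prefix_len + ext_len)
    query_pos = prefix_len + torch.arange(ext_len)
    causal_mask = key_pos[None, :] > query_pos[:, None]  # [ext_len, total]
    scores = scores.masked_fill(causal_mask[None, :, :], float("-inf"))

    probs = torch.softmax(scores, dim=-1)
    return torch.einsum("hqk,khd->qhd", probs, v)

q = torch.randn(3, 2, 8)
kp, vp = torch.randn(5, 2, 8), torch.randn(5, 2, 8)
ke, ve = torch.randn(3, 2, 8), torch.randn(3, 2, 8)
print(extend_attention_reference(q, kp, vp, ke, ve, 8 ** -0.5).shape)  # torch.Size([3, 2, 8])
```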
2 changes: 2 additions & 0 deletions python/sglang/srt/layers/layernorm.py
@@ -13,6 +13,8 @@
limitations under the License.
"""

"""Fused operators for normalization layers."""

from typing import Optional, Tuple, Union

import torch
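The docstring added here mirrors the one in activation.py: layernorm.py holds fused operators for normalization layers. As a point of reference, an unfused RMSNorm in plain PyTorch is sketched below; fused variants typically fold the residual add and the normalization into a single kernel, which this sketch deliberately does not do.

```python
import torch

def rms_norm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Normalize by the root-mean-square over the last dimension,
    # then apply the learned per-channel scale.
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * torch.rsqrt(variance + eps) * weight

x = torch.randn(2, 16)
print(rms_norm_reference(x, torch.ones(16)).shape)  # torch.Size([2, 16])
```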