Clean up the comments and names under python/sglang/srt/layers (#1047)
merrymercy authored Aug 12, 2024
1 parent fb7421d commit fb1f28c
Showing 9 changed files with 26 additions and 1,633 deletions.
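The headline renames map the old token_* attention entry points to decode_* and move context_attention_fwd into a prefill_attention module. For downstream code that imports the old names, a minimal compatibility shim might look like the following Python sketch; both module paths are assumptions rather than something this diff view shows:

try:
    # Names introduced by this commit.
    from sglang.srt.layers.decode_attention import decode_attention_fwd
except ImportError:
    # Pre-rename name, aliased so call sites only ever use the new name.
    from sglang.srt.layers.token_attention import (
        token_attention_fwd as decode_attention_fwd,
    )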
2 changes: 2 additions & 0 deletions python/sglang/srt/layers/activation.py
@@ -11,6 +11,8 @@
 limitations under the License.
 """

+"""Fused operators for activation layers."""
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
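The new docstring points at fused activation operators such as the gate-and-multiply pattern used by LLaMA-style MLPs. A minimal sketch of that pattern, assuming the usual [..., 2 * d] gate/up input layout (the class body is not part of this diff):

import torch
import torch.nn as nn
import torch.nn.functional as F

class SiluAndMul(nn.Module):
    # Split the last dimension into gate and up halves: silu(gate) * up.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        return F.silu(x[..., :d]) * x[..., d:]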
python/sglang/srt/layers/decode_attention.py
@@ -13,6 +13,10 @@
 limitations under the License.
 """

+"""
+Memory-efficient attention for decoding.
+"""
+
 # Adapted from
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_softmax_and_reducev.py
@@ -194,7 +198,7 @@ def _fwd_kernel_stage2(
     tl.store(out_ptrs, acc)


-def _token_att_m_fwd(
+def _decode_att_m_fwd(
     q,
     k_buffer,
     att_out,
@@ -254,7 +258,7 @@ def _token_att_m_fwd(
     )


-def _token_softmax_reducev_fwd(
+def _decode_softmax_reducev_fwd(
     logics,
     v_buffer,
     o,
@@ -292,7 +296,7 @@ def _token_softmax_reducev_fwd(
     )


-def token_attention_fwd(
+def decode_attention_fwd(
     q,
     k_buffer,
     v_buffer,
@@ -312,7 +316,7 @@ def token_attention_fwd(
         (q.shape[-2], total_num_tokens), dtype=REDUCE_TORCH_TYPE, device="cuda"
     )

-    _token_att_m_fwd(
+    _decode_att_m_fwd(
         q,
         k_buffer,
         att_m,
@@ -324,7 +328,7 @@ def token_attention_fwd(
         sm_scale,
         logit_cap,
     )
-    _token_softmax_reducev_fwd(
+    _decode_softmax_reducev_fwd(
         att_m,
         v_buffer,
         o,
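The renames make the two-stage structure explicit: _decode_att_m_fwd produces attention logits for the current token against the KV cache, and _decode_softmax_reducev_fwd applies softmax and reduces over V. A plain-PyTorch reference sketch of that structure for a single decoding step; shapes and the soft-capping formula are assumptions, not code from this diff:

import torch

def decode_attention_reference(q, k_buffer, v_buffer, sm_scale, logit_cap=0.0):
    # q: [num_heads, head_dim]; k_buffer, v_buffer: [num_tokens, num_heads, head_dim].
    # Stage 1 (_decode_att_m_fwd): logits of the query against every cached key.
    att_m = torch.einsum("hd,thd->ht", q, k_buffer) * sm_scale
    if logit_cap > 0:
        att_m = logit_cap * torch.tanh(att_m / logit_cap)  # tanh soft-capping
    # Stage 2 (_decode_softmax_reducev_fwd): softmax, then weighted sum over V.
    probs = torch.softmax(att_m, dim=-1)
    return torch.einsum("ht,thd->hd", probs, v_buffer)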
7 changes: 6 additions & 1 deletion python/sglang/srt/layers/extend_attention.py
@@ -13,11 +13,16 @@
 limitations under the License.
 """

+"""
+Memory-efficient attention for prefill.
+It supports page size = 1 and prefill with KV cache (i.e. extend).
+"""
+
 import torch
 import triton
 import triton.language as tl

-from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
+from sglang.srt.layers.prefill_attention import context_attention_fwd

 CUDA_CAPABILITY = torch.cuda.get_device_capability()

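The docstring added above describes "extend" prefill: new query tokens extend a prefix that already sits in the KV cache, attending to that prefix fully and to each other causally. A plain-PyTorch reference sketch of the computation; tensor names and layouts are assumptions:

import torch

def extend_attention_reference(q_extend, k_prefix, v_prefix, k_extend, v_extend, sm_scale):
    # q_extend: [new_len, heads, dim]; *_prefix: cached KV; *_extend: KV of the new tokens.
    k = torch.cat([k_prefix, k_extend], dim=0)
    v = torch.cat([v_prefix, v_extend], dim=0)
    scores = torch.einsum("qhd,khd->hqk", q_extend, k) * sm_scale
    # Causal mask: new token i may attend to key positions [0, prefix_len + i].
    prefix_len, new_len = k_prefix.shape[0], q_extend.shape[0]
    key_pos = torch.arange(k.shape[0])
    q_pos = prefix_len + torch.arange(new_len)
    scores = scores.masked_fill(key_pos[None, None, :] > q_pos[None, :, None], float("-inf"))
    probs = torch.softmax(scores, dim=-1)
    return torch.einsum("hqk,khd->qhd", probs, v)

A page size of 1 means each KV-cache slot holds a single token, so the cached prefix can be gathered token by token rather than in fixed-size blocks.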
2 changes: 2 additions & 0 deletions python/sglang/srt/layers/layernorm.py
@@ -13,6 +13,8 @@
 limitations under the License.
 """

+"""Fused operators for normalization layers."""
+
 from typing import Optional, Tuple, Union

 import torch
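The layernorm module's new docstring refers to fused normalization; the common fusion is a residual add folded into RMSNorm. A minimal unfused sketch of that interface, offered as an assumption about the pattern rather than a copy of this file's code:

from typing import Optional

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x: torch.Tensor, residual: Optional[torch.Tensor] = None):
        if residual is not None:
            x = x + residual  # fused kernels perform this add in the same pass
            residual = x
        var = x.pow(2).mean(dim=-1, keepdim=True)
        out = x * torch.rsqrt(var + self.eps) * self.weight
        return out if residual is None else (out, residual)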
