Clean up the comments and names under python/sglang/srt/layers #1047

Merged 3 commits on Aug 12, 2024
2 changes: 2 additions & 0 deletions python/sglang/srt/layers/activation.py
@@ -11,6 +11,8 @@
limitations under the License.
"""

"""Fused operators for activation layers."""

import torch
import torch.nn as nn
import torch.nn.functional as F
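The new module docstring describes activation.py as holding fused operators for activation layers. For orientation, here is a plain, unfused PyTorch sketch of the SiLU-and-mul pattern that such operators commonly fuse in LLaMA-style MLPs; this is an illustrative reference only, not the kernel code in this file.

```python
import torch
import torch.nn.functional as F

def silu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    # Unfused reference: split the last dimension in half, apply SiLU to
    # the first half (the "gate"), and multiply by the second half.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

# Example: a batch of 4 vectors with a gate/up projection of width 2 * 128.
x = torch.randn(4, 256)
print(silu_and_mul_reference(x).shape)  # torch.Size([4, 128])
```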
@@ -13,6 +13,10 @@
limitations under the License.
"""

"""
Memory-efficient attention for decoding.
"""

# Adapted from
# https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py
# https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_softmax_and_reducev.py
@@ -194,7 +198,7 @@ def _fwd_kernel_stage2(
tl.store(out_ptrs, acc)


-def _token_att_m_fwd(
+def _decode_att_m_fwd(
q,
k_buffer,
att_out,
@@ -254,7 +258,7 @@ def _token_att_m_fwd(
)


-def _token_softmax_reducev_fwd(
+def _decode_softmax_reducev_fwd(
logics,
v_buffer,
o,
@@ -292,7 +296,7 @@ def _token_softmax_reducev_fwd(
)


-def token_attention_fwd(
+def decode_attention_fwd(
q,
k_buffer,
v_buffer,
@@ -312,7 +316,7 @@ def token_attention_fwd(
(q.shape[-2], total_num_tokens), dtype=REDUCE_TORCH_TYPE, device="cuda"
)

-_token_att_m_fwd(
+_decode_att_m_fwd(
q,
k_buffer,
att_m,
@@ -324,7 +328,7 @@ def token_attention_fwd(
sm_scale,
logit_cap,
)
-_token_softmax_reducev_fwd(
+_decode_softmax_reducev_fwd(
att_m,
v_buffer,
o,
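To make the renamed entry points easier to review, here is a dense, unfused PyTorch sketch of what decode-time attention computes: each request contributes a single new query token that attends over its cached keys and values. The real kernel runs in Triton over a flattened token pool addressed by per-request indices; the dense [batch, heads, seq, dim] layout below is an assumption made purely for illustration.

```python
import torch

def decode_attention_reference(q, k_cache, v_cache, sm_scale):
    # q:       [batch, heads, head_dim]          one new token per request
    # k_cache: [batch, heads, seq_len, head_dim] cached keys
    # v_cache: [batch, heads, seq_len, head_dim] cached values
    scores = torch.einsum("bhd,bhsd->bhs", q, k_cache) * sm_scale
    probs = torch.softmax(scores, dim=-1)  # attention weights over the cache
    return torch.einsum("bhs,bhsd->bhd", probs, v_cache)

q = torch.randn(2, 4, 64)
k = torch.randn(2, 4, 16, 64)
v = torch.randn(2, 4, 16, 64)
out = decode_attention_reference(q, k, v, sm_scale=64 ** -0.5)
print(out.shape)  # torch.Size([2, 4, 64])
```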
7 changes: 6 additions & 1 deletion python/sglang/srt/layers/extend_attention.py
@@ -13,11 +13,16 @@
limitations under the License.
"""

"""
Memory-efficient attention for prefill.
It supporst page size = 1 and prefill with KV cache (i.e. extend).
"""

import torch
import triton
import triton.language as tl

-from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
+from sglang.srt.layers.prefill_attention import context_attention_fwd

CUDA_CAPABILITY = torch.cuda.get_device_capability()

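The new docstring frames this file as prefill attention with a KV cache ("extend"): newly arrived tokens attend to the cached prefix and, causally, to each other. A single-sequence, unfused PyTorch sketch of that computation follows; the tensor layout and function name are assumptions for illustration, not the Triton kernel's actual interface.

```python
import torch

def extend_attention_reference(q_ext, k_prefix, v_prefix, k_ext, v_ext, sm_scale):
    # q_ext:              [ext_len, heads, head_dim]     newly arrived tokens
    # k_prefix, v_prefix: [prefix_len, heads, head_dim]  already-cached tokens
    # k_ext, v_ext:       [ext_len, heads, head_dim]     KV of the new tokens
    k = torch.cat([k_prefix, k_ext], dim=0)
    v = torch.cat([v_prefix, v_ext], dim=0)
    prefix_len, ext_len = k_prefix.shape[0], q_ext.shape[0]

    scores = torch.einsum("qhd,khd->hqk", q_ext, k) * sm_scale
    # New token i may see the whole prefix plus new tokens at positions <= i.
    key_pos = torch.arange(prefix_len + ext_len)
    query_pos = prefix_len + torch.arange(ext_len)
    causal_mask = key_pos[None, :] > query_pos[:, None]  # [ext_len, total]
    scores = scores.masked_fill(causal_mask[None, :, :], float("-inf"))

    probs = torch.softmax(scores, dim=-1)
    return torch.einsum("hqk,khd->qhd", probs, v)

q = torch.randn(3, 2, 8)
kp, vp = torch.randn(5, 2, 8), torch.randn(5, 2, 8)
ke, ve = torch.randn(3, 2, 8), torch.randn(3, 2, 8)
print(extend_attention_reference(q, kp, vp, ke, ve, 8 ** -0.5).shape)  # torch.Size([3, 2, 8])
```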
2 changes: 2 additions & 0 deletions python/sglang/srt/layers/layernorm.py
@@ -13,6 +13,8 @@
limitations under the License.
"""

"""Fused operators for normalization layers."""

from typing import Optional, Tuple, Union

import torch
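The docstring added here mirrors the one in activation.py: layernorm.py holds fused operators for normalization layers. As a point of reference, an unfused RMSNorm in plain PyTorch is sketched below; fused variants typically fold the residual add and the normalization into a single kernel, which this sketch deliberately does not do.

```python
import torch

def rms_norm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Normalize by the root-mean-square over the last dimension,
    # then apply the learned per-channel scale.
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * torch.rsqrt(variance + eps) * weight

x = torch.randn(2, 16)
print(rms_norm_reference(x, torch.ones(16)).shape)  # torch.Size([2, 16])
```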