Question and suggestions about custom allreduce #2918

leizhao1234 · 2025-01-16T09:27:01Z

leizhao1234
Jan 16, 2025

In TensorRT-LLM's twoshot allreduce, there is one multi_gpu_barrier and one block_barrier, whereas in SGLang's twoshot allreduce, there appear to be two block_barriers.
I noticed that the first block barrier does not use a memory fence, similar to vLLM. However, does the first barrier need to be at the block level? I believe a multi_gpu_barrier without a memory fence would suffice. Here is my code:
`inline device void multi_gpu_barrier(uint32_t** signals, uint32_t const flag, size_t const local_rank,
size_t const world_size, int const tidx, int const bidx)
{
// NOTE(review): "inline device" is presumably "__inline__ __device__" with the double underscores
// consumed by markdown rendering -- confirm against the author's actual source.
//
// Cross-rank barrier in which only block 0 of each rank emits the signal, while the first
// world_size threads of every block poll for the signals of the peers.
// After this function, at least one block in each GPU has reached the barrier
if (tidx < world_size)
{
// we can think of signals having the shape [world_size, world_size]
// Dimension 0 is the "listening" dimension, dimension 1 is "emitting" dimension
// NOTE(review): the parity-based offset below selects between two groups of world_size slots,
// so the region touched per rank spans 2 * world_size flags rather than world_size.

    // Block 0 broadcasts its flag (local_rank on emitting dimension) to all receivers
    // Odd flags use the slots starting at world_size, even flags the slots starting at 0.
    size_t offset = (flag % 2) ? world_size : 0;

    if (bidx == 0)
    {
        // Thread tidx writes this rank's flag into the signal array of peer tidx.
        // The release-store variant is kept below, commented out, for comparison.
        st_flag_volatile(flag, signals[tidx] + offset + local_rank);
        //st_flag_release(flag, signals[tidx] + offset + local_rank);
    }

    // All blocks check that corresponding block 0 on other GPUs have set the flag
    // No deadlock because block #0 is always the first block started
    uint32_t* peer_barrier_d = signals[local_rank] + offset + tidx;
    /*
    while (ld_flag_acquire(peer_barrier_d) != flag)
    {
    }
    */
    // Spin until the value read through ld_flag_volatile for peer tidx equals flag.
    while (ld_flag_volatile(peer_barrier_d) != flag)
    {
    }
}

// Reached by every thread of the block, including those that did not poll.
__syncthreads();

}

inline device void block_barrier(uint32_t** signals, uint32_t const flag, size_t const local_rank,
size_t const world_size, int const tidx, int const bidx, int const grid_size)
{
// NOTE(review): "inline device" is presumably "__inline__ __device__" with the double underscores
// consumed by markdown rendering -- confirm against the author's actual source.
//
// Per-block cross-rank barrier: the block with index bidx on this rank signals, and waits for,
// the block with the same index on every other rank.
// Block-wide sync before signalling, so no thread of this block is still ahead of the barrier.
__syncthreads();
// After this function, the block of id == bidx of each GPU has reached the barrier
if (tidx < world_size)
{
// we can think of signals having the shape [world_size, 2, num_blocks, world_size]
// (+ an offset on dim 2 to account for flags used in multi_gpu_barrier)
// Dimension 0 is the "listening" dimension, dimension 3 is "emitting" dimension
// NOTE(review): the offset computed below depends only on bidx and world_size; it uses neither
// the parity of flag (the "2" dimension) nor grid_size, and adds no offset for the
// multi_gpu_barrier flags. That disagrees with the layout described above -- confirm intent.

    // Block broadcast its flag (local_rank on emitting dimension) to all receivers
    uint32_t flag_block_offset = bidx * world_size;

    
    // Thread tidx writes this rank's flag into the signal array of peer tidx.
    st_flag_release(flag, signals[tidx] + flag_block_offset + local_rank);

    // Blocks check that corresponding blocks on other GPUs have also set the flag
    uint32_t* peer_barrier_d = signals[local_rank] + flag_block_offset + tidx;

    // Spin until the value read through ld_flag_acquire for peer tidx equals flag.
    while (ld_flag_acquire(peer_barrier_d) != flag)
    {
    }
}

// Reached by every thread of the block, including those that did not poll.
__syncthreads();

}`
I think oneshot only requires one multi_gpu_barrier, while twoshot requires both one multi_gpu_barrier and one block_barrier.

leizhao1234 · 2025-01-16T09:45:32Z

leizhao1234
Jan 16, 2025
Author

// Cross-rank barrier signalled by block 0 only.
//
// Block 0 of this rank publishes `flag` into every peer's signal array; the first
// `world_size` threads of every block then spin until each peer's flag has shown up
// in this rank's own signal array. On return, at least one block (block 0) of every
// rank is known to have reached the barrier.
//
// signals    : one flag array per rank, indexed by rank (the "listening" side);
//              within an array, slots are indexed by the emitting rank
// flag       : value to publish and wait for; its parity picks one of two slot groups
// local_rank : index of the calling rank
// world_size : number of participating ranks
// tidx, bidx : thread index within the block / block index within the grid
//
// The stores and loads go through the *_volatile helpers; the release/acquire
// variants (st_flag_release / ld_flag_acquire) were deliberately swapped out by the
// author to experiment with a barrier that carries no memory fence.
__inline__ __device__ void multi_gpu_barrier(uint32_t** signals, uint32_t const flag, size_t const local_rank,
    size_t const world_size, int const tidx, int const bidx)
{
    // One thread per peer rank takes part in the handshake.
    if (tidx < world_size)
    {
        // Odd flags use the slot group starting at world_size, even flags the one at 0.
        size_t const parity_base = (flag % 2) ? world_size : 0;

        // Only block 0 emits: thread tidx writes our flag into peer tidx's array,
        // at the slot reserved for local_rank.
        if (bidx == 0)
        {
            uint32_t* const outgoing_slot = signals[tidx] + parity_base + local_rank;
            st_flag_volatile(flag, outgoing_slot);
        }

        // Every block listens on the slot that peer tidx's block 0 writes into our array.
        // Per the author, this cannot deadlock because block 0 is always the first
        // block to be started.
        uint32_t* const incoming_slot = signals[local_rank] + parity_base + tidx;
        for (;;)
        {
            if (ld_flag_volatile(incoming_slot) == flag)
            {
                break;
            }
        }
    }

    // Hold the remaining threads of the block until the polling threads are through.
    __syncthreads();
}

// Per-block cross-rank barrier.
//
// The block with index `bidx` on this rank publishes `flag` to the same-index block on
// every peer rank and then waits until all of those peer blocks have published `flag`
// back. On return, block `bidx` of every rank has reached the barrier.
//
// signals    : one flag array per rank, indexed by rank (the "listening" side)
// flag       : value to publish and wait for; its parity selects one of two slot banks
// local_rank : index of the calling rank
// world_size : number of participating ranks
// tidx, bidx : thread index within the block / block index within the grid
// grid_size  : number of blocks in the grid; determines the stride between the two banks
//
// Slot layout inside each signals[r], matching the upstream TensorRT-LLM implementation:
//   [2 banks (flag parity)] x [1 + grid_size groups] x [world_size emitting ranks]
// where group 0 of a bank is the region skipped for the flags of multi_gpu_barrier.
// NOTE(review): this assumes every signals[r] holds at least
// 2 * (grid_size + 1) * world_size flags -- confirm against the buffer allocation.
__inline__ __device__ void block_barrier(uint32_t** signals, uint32_t const flag, size_t const local_rank,
    size_t const world_size, int const tidx, int const bidx, int const grid_size)
{
    // Make sure every thread of this block has finished its preceding work before the
    // block announces that it reached the barrier.
    __syncthreads();

    // After this function, the block of id == bidx of each GPU has reached the barrier
    if (tidx < world_size)
    {
        // we can think of signals having the shape [world_size, 2, num_blocks, world_size]
        // (+ an offset on dim 2 to account for flags used in multi_gpu_barrier)
        // Dimension 0 is the "listening" dimension, dimension 3 is "emitting" dimension

        // Skip the leading world_size slots used by multi_gpu_barrier, then select the
        // group that belongs to this block.
        uint32_t flag_block_offset = world_size + bidx * world_size;

        // Double-buffer on flag parity. With a single bank, a fast rank could already
        // overwrite a slot with the next flag value before a slow peer has observed the
        // current one, and that peer would then spin forever on the equality test below.
        if (flag % 2 == 1)
        {
            flag_block_offset += (grid_size + 1) * world_size;
        }

        // Block broadcast its flag (local_rank on emitting dimension) to all receivers
        st_flag_release(flag, signals[tidx] + flag_block_offset + local_rank);

        // Blocks check that corresponding blocks on other GPUs have also set the flag
        uint32_t* peer_barrier_d = signals[local_rank] + flag_block_offset + tidx;

        while (ld_flag_acquire(peer_barrier_d) != flag)
        {
        }
    }

    // Hold the remaining threads of the block until the polling threads are through.
    __syncthreads();
}

Block 0 is always the first block started, and each rank's data is already in global memory (cudaMemcpy).

0 replies

yizhang2077 · 2025-01-16T10:06:40Z

yizhang2077
Jan 16, 2025
Collaborator

I think that when copy_mode is true, it still needs a block barrier (without a fence), but if copy_mode is false, a multi_gpu_barrier without a memory fence suffices.

0 replies

leizhao1234 · 2025-01-16T10:26:14Z

leizhao1234
Jan 16, 2025
Author

What is the meaning of copy_mode?

5 replies

yizhang2077 Jan 16, 2025
Collaborator

When copy_mode is true, it copies data from local input to ipc buffer in kernel. In cuda graph mode, it can capture local input and register it as ipc buffer first, so that it does not need additional copy.

leizhao1234 Jan 16, 2025
Author

However, I tested a multi_gpu_barrier without a memory fence in copy_mode for both oneshot and twoshot, and the results were correct. Could you please clarify why a block barrier is needed?

yizhang2077 Jan 16, 2025
Collaborator

sglang/sgl-kernel/src/sgl-kernel/csrc/trt_reduce_internal.cu

Lines 164 to 177 in 7596417

    
           // Suppose that two GPUs participate in the AR exchange, and we start four blocks. 
        
           // The message is partitioned into chunks as detailed below: 
        
           //               message 
        
           //       |-------------------| 
        
           // GPU 0 | B0 | B1 | B2 | B3 | 
        
           // GPU 1 | B0 | B1 | B2 | B3 | 
        
           // 
        
           // Here the step-by-step behavior of one block: 
        
           // 1. B0 copies the chunk it  is responsible for, from local_input to shareable buffer 
        
           // 2. B0 on GPU 0 and B0 on GPU 1 wait for each other (block_barrier) 
        
           // 3. B0 on GPU 0 pull and sum the chunk from GPU 1, writes the result to local_output 
        
           // 
        
           // With COPY_INPUT == false, skip step 1. and use gpu_barrier instead of block barrier during step 2. 
        
           // We only to know if the other GPU as arrived at the AR kernel, that would mean that data is ready

As step 2 of the comment says: since each IPC buffer is zero when the kernel starts, each block must wait until the same block on the other GPUs has copied its data into the IPC buffer; only then can it do the reduce work.

leizhao1234 Jan 16, 2025
Author

https://github.com/NVIDIA/TensorRT-LLM/blob/0d0583a639cb120f09ae4af50dd0722bdd60a5df/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu#L1778-L1786
But in trtllm, oneshot uses multi_gpu_barrier for copy_mode.

yizhang2077 Jan 16, 2025
Collaborator

https://github.com/NVIDIA/TensorRT-LLM/blob/0d0583a639cb120f09ae4af50dd0722bdd60a5df/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu#L1357-L1381
I think oneshot also uses block_barrier for copy_mode

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Question and suggestions about custom allreduce #2918

{{title}}

Replies: 3 comments 5 replies

{{title}}

{{title}}

{{editor}}'s edit

{{editor}}'s edit

{{title}}

{{title}}

{{title}}

{{title}}

{{editor}}'s edit

{{editor}}'s edit

{{title}}

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Select a reply

Question and suggestions about custom allreduce #2918

leizhao1234 Jan 16, 2025

Replies: 3 comments · 5 replies

leizhao1234 Jan 16, 2025 Author

yizhang2077 Jan 16, 2025 Collaborator

leizhao1234 Jan 16, 2025 Author

yizhang2077 Jan 16, 2025 Collaborator

leizhao1234 Jan 16, 2025 Author

yizhang2077 Jan 16, 2025 Collaborator

leizhao1234 Jan 16, 2025 Author

yizhang2077 Jan 16, 2025 Collaborator

leizhao1234
Jan 16, 2025

Replies: 3 comments 5 replies

leizhao1234
Jan 16, 2025
Author

yizhang2077
Jan 16, 2025
Collaborator

leizhao1234
Jan 16, 2025
Author

yizhang2077 Jan 16, 2025
Collaborator

leizhao1234 Jan 16, 2025
Author

yizhang2077 Jan 16, 2025
Collaborator

leizhao1234 Jan 16, 2025
Author

yizhang2077 Jan 16, 2025
Collaborator