sgl-project · zhyncs · Jan 23, 2025 · Jan 24, 2025 · Jan 24, 2025 · Jan 24, 2025
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
@@ -51,6 +51,7 @@ jobs:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
     strategy:
+      fail-fast: false
       matrix:
         range: [0-6, 6-15, 15-22, 22-32, 32-40, 40-100]
     steps:

diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -27,7 +27,7 @@ runtime_common = [
 ]
 srt = [
     "sglang[runtime_common]", "cuda-python",
-    "sgl-kernel>=0.0.2.post14", "torch", "vllm==0.6.4.post1",
+    "sgl-kernel>=0.0.2.post16", "torch", "vllm==0.6.4.post1",
     "flashinfer==0.1.6"
 ]
 

@@ -20,10 +20,18 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import (
+    enable_use_sgl_kernel_first,
+    is_cuda_available,
+    is_flashinfer_available,
+)
 
-if is_flashinfer_available():
-    from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+if enable_use_sgl_kernel_first:
+    if is_cuda_available():
+        from sgl_kernel import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+else:
+    if is_flashinfer_available():
+        from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
 
 from vllm.model_executor.custom_op import CustomOp
 

@@ -19,15 +19,28 @@
 import torch
 import torch.nn as nn
 
-from sglang.srt.utils import is_flashinfer_available
-
-if is_flashinfer_available():
-    from flashinfer.norm import (
-        fused_add_rmsnorm,
-        gemma_fused_add_rmsnorm,
-        gemma_rmsnorm,
-        rmsnorm,
-    )
+from sglang.srt.utils import (
+    enable_use_sgl_kernel_first,
+    is_cuda_available,
+    is_flashinfer_available,
+)
+
+if enable_use_sgl_kernel_first:
+    if is_cuda_available():
+        from sgl_kernel import (
+            fused_add_rmsnorm,
+            gemma_fused_add_rmsnorm,
+            gemma_rmsnorm,
+            rmsnorm,
+        )
+else:
+    if is_flashinfer_available():
+        from flashinfer.norm import (
+            fused_add_rmsnorm,
+            gemma_fused_add_rmsnorm,
+            gemma_rmsnorm,
+            rmsnorm,
+        )
 
 from vllm.model_executor.custom_op import CustomOp
 

@@ -12,17 +12,28 @@
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.utils import (
     crash_on_warnings,
+    enable_use_sgl_kernel_first,
     get_bool_env_var,
+    is_cuda_available,
     is_flashinfer_available,
 )
 
-if is_flashinfer_available():
-    from flashinfer.sampling import (
-        min_p_sampling_from_probs,
-        top_k_renorm_prob,
-        top_k_top_p_sampling_from_probs,
-        top_p_renorm_prob,
-    )
+if enable_use_sgl_kernel_first:
+    if is_cuda_available():
+        from sgl_kernel import (
+            min_p_sampling_from_probs,
+            top_k_renorm_prob,
+            top_k_top_p_sampling_from_probs,
+            top_p_renorm_prob,
+        )
+else:
+    if is_flashinfer_available():
+        from flashinfer.sampling import (
+            min_p_sampling_from_probs,
+            top_k_renorm_prob,
+            top_k_top_p_sampling_from_probs,
+            top_p_renorm_prob,
+        )
 
 
 logger = logging.getLogger(__name__)

@@ -60,8 +60,18 @@
 
 is_hip_ = is_hip()
 
-if is_flashinfer_available():
-    from flashinfer import bmm_fp8
+from sglang.srt.utils import (
+    enable_use_sgl_kernel_first,
+    is_cuda_available,
+    is_flashinfer_available,
+)
+
+if enable_use_sgl_kernel_first:
+    if is_cuda_available():
+        from sgl_kernel import bmm_fp8
+else:
+    if is_flashinfer_available():
+        from flashinfer import bmm_fp8
 
 
 class DeepseekV2MLP(nn.Module):

@@ -40,10 +40,18 @@
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import (
+    enable_use_sgl_kernel_first,
+    is_cuda_available,
+    is_flashinfer_available,
+)
 
-if is_flashinfer_available():
-    from flashinfer import bmm_fp8
+if enable_use_sgl_kernel_first:
+    if is_cuda_available():
+        from sgl_kernel import bmm_fp8
+else:
+    if is_flashinfer_available():
+        from flashinfer import bmm_fp8
 
 
 class MiniCPM3MLP(nn.Module):

@@ -66,6 +66,13 @@
 show_time_cost = False
 time_infos = {}
 
+# Prefer sgl-kernel over flashinfer for fused ops unless explicitly disabled.
+# Accept the usual boolean spellings so that values such as "false" do not
+# raise ValueError at import time (as bool(int(...)) would).
+enable_use_sgl_kernel_first = os.getenv(
+    "ENABLE_USE_SGL_KERNEL_FIRST", "1"
+).strip().lower() not in ("", "0", "false", "no", "off")
+
 
 def is_hip() -> bool:
     """Return whether it is HIP on the AMD ROCm platform."""