[Bug Fix] Fix the support check for FP8 CUTLASS #5352

Merged · 1 commit · Jun 8, 2024
14 changes: 7 additions & 7 deletions vllm/model_executor/layers/quantization/fp8.py
@@ -20,16 +20,16 @@
 def cutlass_fp8_supported() -> bool:
     capability = torch.cuda.get_device_capability()
     capability = capability[0] * 10 + capability[1]
-    version = torch.version.cuda
-    version = version[0] * 10 + version[1]
+    major, minor = torch.version.cuda.split(".")
+    version = int(major) * 10 + int(minor)

     # CUTLASS FP8 kernels need at least
     #   CUDA 12.0 on SM90 systems (Hopper)
     #   CUDA 12.4 on SM89 systems (Lovelace)
     gpu_is_supported = False
-    if capability >= 900:
+    if capability >= 90:
         gpu_is_supported = version > 120
-    elif capability >= 890:
+    elif capability >= 89:
         gpu_is_supported = version > 124

     return gpu_is_supported
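The bug being fixed: torch.version.cuda is a string such as "12.1", so the old code indexed characters rather than numbers, and torch.cuda.get_device_capability() returns a tuple like (9, 0), so the packed capability is 90, not 900. A minimal sketch of the corrected parsing, outside the diff and assuming a CUDA build of PyTorch with a visible GPU:

import torch

# torch.version.cuda is a string, e.g. "12.1". The old code computed
# version[0] * 10 + version[1] on that string, which yields "11111111112",
# and the >= 900 / >= 890 thresholds (which a packed capability of 90 or 89
# can never reach) meant the check silently always returned False.
major, minor = torch.version.cuda.split(".")
version = int(major) * 10 + int(minor)  # "12.1" -> 121

# get_device_capability() returns e.g. (8, 9) for Lovelace or (9, 0) for
# Hopper, so the packed value is 89 or 90.
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]

print(f"CUDA {version}, SM{capability}")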
@@ -103,7 +103,7 @@ class Fp8LinearMethod(LinearMethodBase):
     1. Only support per-tensor quantization due to torch._scaled_mm support.
     2. Only support float8_e4m3fn data type due to the limitation of
        torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856)

     Args:
         quant_config: The quantization config.
     """
@@ -298,8 +298,8 @@ def __init__(self, quant_config: Fp8Config):
         self.quant_config = quant_config

     def create_weights(self, layer: torch.nn.Module):
-        """Create "weight" (aka kv_scale) for an attention layer.
+        """Create "weight" (aka kv_scale) for an attention layer.

         Args:
             layer: The layer that is using the QuantizeMethodBase factory.
         """