diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 1c25d1e621d43..c66fce3d4316a 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -272,7 +272,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
 
         assert params_dtype == torch.float16, (
-            "float16 is required for MoE compressd models. Set dtype=torch.float16"  # noqa: E501
+            "float16 is required for MoE compressed models. Set dtype=torch.float16"  # noqa: E501
         )
 
         # Will transpose the loaded weight along the
@@ -306,7 +306,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
         load_full_w2 = self.actorder and self.group_size != -1
         w2_scales_size = (intermediate_full
                           if load_full_w2 else intermediate_size)
-        # @eliza TODO: is this condition actually needed/is it doing anything?
+
         self.is_k_full = (not self.actorder) or (intermediate_size
                                                  == intermediate_full)
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
index 61d1c911cd1ad..2e1b5e3c2d3b1 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
@@ -62,7 +62,7 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
                        **kwargs):
 
         assert params_dtype == torch.float16, (
-            "float16 is required for marlin24 compressd models. Set dtype=torch.float16"  # noqa: E501
+            "float16 is required for marlin24 compressed models. Set dtype=torch.float16"  # noqa: E501
         )
 
         pack_factor = 32 // self.quant_type.size_bits
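Note on the second hunk: it only drops the stale TODO; the `is_k_full` predicate it questioned is kept unchanged. Below is a minimal standalone sketch of that predicate for illustration. The helper name `compute_is_k_full` and the example sizes are hypothetical, not part of vLLM, and the reading that the flag tracks whether this rank holds the full reduction dimension is an assumption based on how act-order interacts with tensor-parallel sharding.

```python
def compute_is_k_full(actorder: bool, intermediate_size: int,
                      intermediate_full: int) -> bool:
    """Sketch of the predicate kept by the hunk above (not vLLM code).

    True when activation reordering is disabled, or when the local shard
    covers the whole intermediate dimension (i.e. no TP sharding of it).
    """
    return (not actorder) or (intermediate_size == intermediate_full)


# Hypothetical sizes: a 4096-wide intermediate dim split across 2 ranks.
assert compute_is_k_full(actorder=False, intermediate_size=2048, intermediate_full=4096)
assert compute_is_k_full(actorder=True, intermediate_size=4096, intermediate_full=4096)
assert not compute_is_k_full(actorder=True, intermediate_size=2048, intermediate_full=4096)
```

Only the combination of activation reordering with a sharded intermediate dimension yields `False`, which is exactly the expression the diff retains.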