
Commit

Lazily import HPU-dependent components
kzawora-intel committed Oct 4, 2024
1 parent 38e60f4 commit e62f43d
Showing 2 changed files with 4 additions and 7 deletions.
vllm/executor/hpu_executor.py: 4 changes (1 addition, 3 deletions)

@@ -6,8 +6,6 @@
 import os
 from typing import Any, Dict, List, Optional, Set, Tuple
 
-from vllm_hpu_extension.profiler import HabanaMemoryProfiler
-
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
@@ -86,7 +84,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
         # remains to abstract away the device for non-GPU configurations.
         logger.info("# HPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
                     num_cpu_blocks)
-
+        from vllm_hpu_extension.profiler import HabanaMemoryProfiler
         with HabanaMemoryProfiler() as cache_init_m:
             self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
         msg = f"init_cache_engine took {cache_init_m.get_summary_string()}"
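For context, this hunk is the commit's lazy-import pattern applied to the executor: the vllm_hpu_extension import moves from module scope into initialize_cache, so vllm/executor/hpu_executor.py can be imported on machines that do not have the HPU extension installed. A minimal sketch of the idea follows; the module and function names are hypothetical and the body is abbreviated, so treat it as an illustration rather than the actual vLLM code.

# lazy_hpu_import_sketch.py -- illustrative only; names are hypothetical.

def init_cache_with_profiling(num_gpu_blocks: int, num_cpu_blocks: int) -> None:
    # Importing at the call site defers the dependency: importing the module
    # that defines this function never requires vllm_hpu_extension, and the
    # package is only resolved when the HPU code path actually runs. Python
    # caches it in sys.modules, so repeated calls pay no extra import cost.
    from vllm_hpu_extension.profiler import HabanaMemoryProfiler

    with HabanaMemoryProfiler() as cache_init_m:
        ...  # allocate the HPU and CPU cache blocks here
    print(f"init_cache_engine took {cache_init_m.get_summary_string()}")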
vllm/model_executor/layers/rotary_embedding.py: 7 changes (3 additions, 4 deletions)

@@ -30,10 +30,6 @@
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 
-if current_platform.is_hpu():
-    from vllm_hpu_extension.rotary_embed import (HpuLlama3RotaryEmbedding,
-                                                 HpuRotaryEmbedding)
-
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
     x1 = x[..., :x.shape[-1] // 2]
@@ -923,6 +919,7 @@ def get_rope(
 
     if rope_scaling is None:
         if current_platform.is_hpu():
+            from vllm_hpu_extension.rotary_embed import HpuRotaryEmbedding
             rotary_emb = HpuRotaryEmbedding(head_size,
                                             rotary_dim,
                                             max_position,
@@ -945,6 +942,8 @@
             original_max_position = rope_scaling[
                 "original_max_position_embeddings"]
             if current_platform.is_hpu():
+                from vllm_hpu_extension.rotary_embed import (
+                    HpuLlama3RotaryEmbedding)
                 rotary_emb = HpuLlama3RotaryEmbedding(
                     head_size,
                     rotary_dim,
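The same pattern applies here: the module-level import block guarded by current_platform.is_hpu() is removed, and each branch of get_rope imports the HPU rotary-embedding class right before constructing it, so importing rotary_embedding.py no longer requires vllm_hpu_extension. A heavily simplified, hypothetical sketch of the resulting dispatch (not the real get_rope signature; the non-HPU fallthrough is elided):

# Hypothetical sketch of the lazy class selection inside get_rope.
from vllm.platforms import current_platform


def pick_hpu_rope_class(uses_llama3_scaling: bool):
    # uses_llama3_scaling is a made-up parameter standing in for the
    # rope_scaling checks performed by the real get_rope.
    if current_platform.is_hpu():
        if uses_llama3_scaling:
            from vllm_hpu_extension.rotary_embed import HpuLlama3RotaryEmbedding
            return HpuLlama3RotaryEmbedding
        from vllm_hpu_extension.rotary_embed import HpuRotaryEmbedding
        return HpuRotaryEmbedding
    # Non-HPU platforms never reach the imports above, so they never need
    # the vllm_hpu_extension package at all.
    return None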
