Support vLLM single and multi-host TPUs on GKE #7613

Merged 20 commits on Aug 30, 2024
Changes from 13 commits
2 changes: 1 addition & 1 deletion requirements-tpu.txt
@@ -4,4 +4,4 @@
# Dependencies for TPU
# Currently, the TPU backend uses a nightly version of PyTorch XLA.
# You can install the dependencies in Dockerfile.tpu.
ray
ray[default,serve]
Collaborator

Please remove Ray Serve from the dependency. vLLM does not need to be used with Ray Serve, and we'd like to minimize the dependencies.

Contributor Author

Can we still install ray[default]? The reason is that the GCS endpoint needs to run through the Ray dashboard, which does not get installed if you just do pip install ray. The endpoint is needed for other Ray workers to join the cluster for multi-host inference.

Collaborator

Oh, how is ray[default] different from just ray?

Contributor Author

According to https://docs.ray.io/en/latest/ray-overview/installation.html, ray[default] includes the Ray dashboard while ray is just the Ray core libraries.

The reason for including the dashboard (in addition to debuggability) is that the GCS and Ray Job endpoints are exposed through the dashboard, so without it the other Ray nodes cannot join the cluster. For example, the Kubernetes operator initializes Ray worker nodes by having them ping the GCS endpoint.
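As an illustration, here is a minimal sketch of how one might confirm from the head node that the worker hosts have joined through the GCS endpoint before starting vLLM. The expected host count is a made-up example and the check itself is not part of this PR:

import ray

# Connect to the already-running cluster started with ray[default].
ray.init(address="auto")

expected_tpu_hosts = 2  # hypothetical two-host TPU slice
alive_tpu_nodes = [
    node for node in ray.nodes()
    if node["Alive"] and node["Resources"].get("TPU", 0) > 0
]
print(f"TPU hosts joined: {len(alive_tpu_nodes)}/{expected_tpu_hosts}")
print("Total TPUs in cluster:", ray.cluster_resources().get("TPU", 0))
assert len(alive_tpu_nodes) == expected_tpu_hosts, (
    "Worker hosts have not registered with the GCS yet")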

5 changes: 4 additions & 1 deletion vllm/attention/backends/pallas.py
@@ -123,7 +123,10 @@ def __init__(
raise NotImplementedError("TPU version must be 4 or higher.")

self.megacore_mode = None
tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower()
tpu_env = torch_xla.tpu.get_tpu_env()
tpu_type = tpu_env.get("TYPE") or tpu_env.get("ACCELERATOR_TYPE")
tpu_type = tpu_type.lower()

if "lite" not in tpu_type:
if self.num_kv_heads % 2 == 0:
self.megacore_mode = "kv_head"
33 changes: 32 additions & 1 deletion vllm/distributed/device_communicators/tpu_communicator.py
@@ -1,3 +1,5 @@
import os

import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup
@@ -8,6 +10,7 @@
import ray
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr
from ray._private.accelerators import TPUAcceleratorManager
from torch_xla._internal import pjrt


@@ -24,9 +27,37 @@ def __init__(self, group: ProcessGroup):
# be simply calculated as follows.
global_rank = dist.get_rank(group)
global_world_size = dist.get_world_size(group)
num_nodes = len(ray.nodes())

# Calculate how many TPU nodes are in the current deployment. This
# is the Ray placement group if it is deployed with Ray. Default
# to the number of TPU nodes in the Ray cluster. The number of TPU
# nodes is computed by the total number of TPUs divided by the
# number of TPU accelerators per node, to account for clusters
# with both CPUs and TPUs.
cluster_resources = ray.cluster_resources()
total_tpus = int(cluster_resources["TPU"])
tpus_per_node = (
TPUAcceleratorManager.get_current_node_num_accelerators())
num_nodes = total_tpus // tpus_per_node
Collaborator

nit: Can we add assert total_tpus % tpus_per_node == 0?

Collaborator

Can we move this to ray_utils.py?

Contributor Author

Done.
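
For reference, a rough sketch of what such helpers in ray_utils.py could look like, including the suggested assertion; the names and structure are illustrative rather than the exact merged code:

import ray
from ray._private.accelerators import TPUAcceleratorManager


def _get_num_tpu_nodes() -> int:
    # Total TPUs in the cluster divided by TPUs per host, so clusters that
    # mix CPU-only and TPU nodes are counted correctly.
    total_tpus = int(ray.cluster_resources()["TPU"])
    tpus_per_node = TPUAcceleratorManager.get_current_node_num_accelerators()
    assert total_tpus % tpus_per_node == 0, (
        "Total TPU count must be divisible by the TPUs per node.")
    return total_tpus // tpus_per_node


def _get_num_nodes_in_placement_group() -> int:
    # Number of distinct nodes backing the current placement group,
    # or 0 if the caller is not running inside one.
    current_pg = ray.util.get_current_placement_group()
    if current_pg is None:
        return 0
    pg_table = ray.util.placement_group_table()
    nodes_in_pg = {
        node
        for pg_key, pg in pg_table.items()
        if pg_key == current_pg.id.hex()
        for node in pg["bundles_to_node_id"].values()
    }
    return len(nodes_in_pg)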


pg_table = ray.util.placement_group_table()
current_pg = ray.util.get_current_placement_group()

if current_pg:
nodes_in_pg = set()
for pg_key, pg in pg_table.items():
if pg_key == current_pg.id.hex():
for _, node in pg["bundles_to_node_id"].items():
nodes_in_pg.add(node)
num_nodes = len(nodes_in_pg)

local_world_size = global_world_size // num_nodes
local_rank = global_rank % local_world_size

# Ensure environment variables are set for multihost deployments.
os.environ["CLOUD_TPU_TASK_ID"] = str(global_rank)
os.environ["TPU_VISIBLE_CHIPS"] = str(local_rank)
Collaborator

Why is this needed?

Contributor Author

This is needed for libtpu and the TPU driver to know which TPU chip is actually visible. On GKE these need to be set; otherwise the TPU driver will fail to initialize because the number of devices would differ from the number of visible worker addresses.


pjrt.initialize_multiprocess(local_rank, local_world_size)
xr._init_world_size_ordinal()

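For intuition, a small walk-through of the rank arithmetic above for a hypothetical deployment with 2 TPU hosts and 4 chips per host; the numbers are assumed for the example and not taken from the PR:

# Hypothetical setup: 2 TPU hosts x 4 chips per host -> 8 workers total.
num_nodes = 2
global_world_size = 8
local_world_size = global_world_size // num_nodes  # 4 chips visible per host

for global_rank in range(global_world_size):
    local_rank = global_rank % local_world_size
    # The communicator would export CLOUD_TPU_TASK_ID=global_rank (0..7)
    # and TPU_VISIBLE_CHIPS=local_rank (0..3) for each worker process.
    print(f"rank {global_rank}: host {global_rank // local_world_size}, chip {local_rank}")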
48 changes: 35 additions & 13 deletions vllm/executor/ray_tpu_executor.py
WoosukKwon marked this conversation as resolved.
@@ -70,6 +70,19 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
worker_module_name = "vllm.worker.tpu_worker"
worker_class_name = "TPUWorker"

# GKE does not fetch environment information from metadata server
# and instead sets these from within the Ray process. Therefore we
# need to override the Ray environment variables manually.
override_env = {}
if "TPU_CHIPS_PER_HOST_BOUNDS" in os.environ:
override_env.update({
"TPU_CHIPS_PER_HOST_BOUNDS":
os.environ["TPU_CHIPS_PER_HOST_BOUNDS"]
})
if "TPU_HOST_BOUNDS" in os.environ:
override_env.update(
{"TPU_HOST_BOUNDS": os.environ["TPU_HOST_BOUNDS"]})

worker = ray.remote(
Collaborator

QQ: can we use the runtime_env arg in ray.remote instead?

Contributor Author

I think runtime_env will be overwritten when Ray starts up and tries to initialize the environment from within the Ray process. We are using this to manually override the environment after the Ray task starts up.

num_cpus=0,
resources={"TPU": 1},
@@ -80,6 +93,8 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
worker_class_name=worker_class_name,
trust_remote_code=self.model_config.trust_remote_code,
)
if override_env:
worker.override_env_vars.remote(override_env)

worker_ip = ray.get(worker.get_node_ip.remote())
if worker_ip == driver_ip and self.driver_dummy_worker is None:
@@ -118,8 +133,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
"VLLM_TRACE_FUNCTION":
str(envs.VLLM_TRACE_FUNCTION),
}, ) for _ in worker_node_and_gpu_ids]
self._run_workers("update_environment_variables",
all_args=all_args_to_update_environment_variables)
self._run_workers(
"update_environment_variables",
all_args=all_args_to_update_environment_variables,
)

if len(node_workers) == 1:
# in single node case, we don't need to get the IP address.
@@ -145,9 +162,11 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)

self._run_workers("init_device")
self._run_workers("load_model",
max_concurrent_workers=self.parallel_config.
max_parallel_loading_workers)
self._run_workers(
"load_model",
max_concurrent_workers=self.parallel_config.
max_parallel_loading_workers,
)

def _driver_execute_model(
self,
@@ -190,10 +209,10 @@ def _run_workers(
"max_concurrent_workers is not supported yet.")

count = len(self.workers)
all_worker_args = repeat(args, count) if all_args is None \
else islice(all_args, 1, None)
all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
else islice(all_kwargs, 1, None)
all_worker_args = (repeat(args, count) if all_args is None else islice(
all_args, 1, None))
all_worker_kwargs = (repeat(kwargs, count) if all_kwargs is None else
islice(all_kwargs, 1, None))

# Start the ray workers first.
ray_worker_outputs = [
@@ -241,9 +260,11 @@ def initialize_cache(self, num_gpu_blocks: int,
num_cpu_blocks)
self.cache_config.num_gpu_blocks = num_gpu_blocks
self.cache_config.num_cpu_blocks = num_cpu_blocks
self._run_workers("initialize_cache",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks)
self._run_workers(
"initialize_cache",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
)

def execute_model(
self,
@@ -253,7 +274,8 @@ def execute_model(
self.parallel_worker_tasks = self._run_workers(
"start_worker_execution_loop",
async_run_remote_workers_only=True,
**self.extra_execute_model_run_workers_kwargs)
**self.extra_execute_model_run_workers_kwargs,
)

# Only the driver worker returns the sampling results.
return self._driver_execute_model(execute_model_req)
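Regarding the runtime_env question in the ray_tpu_executor.py thread above, a minimal sketch contrasting the two approaches; the actor, method, and example value are illustrative, and per the discussion the values set via runtime_env may be overwritten when the process initializes its environment on GKE:

import os

import ray


@ray.remote
class EnvWorker:

    def override_env_vars(self, env_vars: dict) -> None:
        # Applied after the actor process has started, so it takes effect
        # even if startup re-initialized the environment.
        os.environ.update(env_vars)

    def get_env(self, key: str):
        return os.environ.get(key)


ray.init(ignore_reinit_error=True)

# Approach 1: set variables through runtime_env before the process starts.
worker = EnvWorker.options(
    runtime_env={"env_vars": {"TPU_HOST_BOUNDS": "1,1,1"}}).remote()

# Approach 2 (what this PR does): push the values after the actor is up.
ray.get(worker.override_env_vars.remote({"TPU_HOST_BOUNDS": "1,1,1"}))
print(ray.get(worker.get_env.remote("TPU_HOST_BOUNDS")))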
4 changes: 4 additions & 0 deletions vllm/executor/ray_utils.py
@@ -1,3 +1,4 @@
import os
import time
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union
@@ -84,6 +85,9 @@ def execute_model_spmd(

return output

def override_env_vars(self, vars):
os.environ.update(vars)

ray_import_err = None

except ImportError as e: