Add TPU doc (#1113)

* init tpu doc * add tpu node mnist example * file ending * file ending * add tpunode * doc update * add tpuvm option * runtime version * update * address comments * yapf * update * fix test * add trc * comments * change default tpunode
skypilot-org · Aug 24, 2022 · 70aab9d · 70aab9d
1 parent c50a904
commit 70aab9d
Show file tree

Hide file tree

Showing 5 changed files with 261 additions and 4 deletions.
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -54,6 +54,7 @@ Use the clouds **easily** and **cost effectively**, without needing cloud infra
    reference/auto-stop
    examples/spot-jobs
    reference/benchmark/index
+   reference/tpu
 
 .. toctree::
    :maxdepth: 1

diff --git a/docs/source/reference/tpu.rst b/docs/source/reference/tpu.rst
@@ -0,0 +1,193 @@
+.. _tpu:
+
+Cloud TPU
+================================
+
+SkyPilot supports running jobs on Google's `Cloud TPU <https://cloud.google.com/tpu/docs/intro-to-tpu>`_.
+Two different TPU architectures are available on GCP:
+
+- `TPU Nodes <https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu-node>`_
+- `TPU VMs <https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu-vm>`_
+
+Both are supported by SkyPilot.
+
+The two architectures differ as follows.
+For TPU Nodes, a host VM communicates with the TPU host over gRPC.
+For TPU VMs, you can SSH directly into a VM that is physically connected to the TPU device.
+For more details, please refer to the GCP `documentation <https://cloud.google.com/tpu/docs/system-architecture-tpu-vm#tpu-arch>`_.
+
+
+.. note::
+
+   We encourage researchers to apply for free TPU access through the `TPU Research Cloud (TRC) <https://sites.research.google/trc/about/>`_ program.
+
+
+Getting TPUs in one command
+--------------------------------
+
+As with :ref:`GPUs <interactive-nodes>`, SkyPilot provides a simple command to quickly get TPUs for development:
+
+.. code-block:: bash
+
+   sky tpunode                                # By default TPU v2-8 is used
+   sky tpunode --use-spot                     # Preemptible TPUs
+   sky tpunode --tpus tpu-v3-8                # Change TPU type to tpu-v3-8
+   sky tpunode --instance-type n1-highmem-16  # Change the host VM type to n1-highmem-16
+   sky tpunode --tpu-vm                       # Use TPU VM (instead of TPU Node)
+
+After the command has finished, you will be dropped into the host VM and can start developing code right away!
+
+Below we demonstrate how to run MNIST training on both TPU Nodes and TPU VMs with SkyPilot YAML.
+
+TPU Nodes
+--------------------------------
+
+To use a TPU Node, a host CPU VM needs to be created together with the TPU node, and the two need to be configured correctly to connect to each other.
+SkyPilot automates the above process with a simple interface:
+
+.. code-block:: yaml
+
+   resources:
+      instance_type: n1-highmem-8
+      accelerators: tpu-v2-8
+      accelerator_args:
+         runtime_version: 2.5.0 # TPU software version to be used.
+
+The above YAML uses :code:`n1-highmem-8` as the host machine and :code:`tpu-v2-8` as the TPU node resource.
+You may modify the host instance type or TPU type as you wish.
+To show more TPU accelerators, you may run the command :code:`sky show-gpus`.
+
+Now, we show a complete YAML for running `MNIST training <https://cloud.google.com/tpu/docs/tutorials/mnist-2.x>`_ on a TPU Node with TensorFlow.
+
+.. code-block:: yaml
+
+   # Task name (optional), used for display purposes.
+   name: mnist-tpu-node
+
+   resources:
+      accelerators: tpu-v2-8
+      accelerator_args:
+         runtime_version: 2.5.0 # TPU software version to be used.
+
+   # TPU node requires loading data from a GCS bucket.
+   file_mounts:
+      /dataset:
+         name: mnist-tpu-node
+         store: gcs
+         mode: MOUNT
+
+   # The setup command.  Will be run under the working directory.
+   setup: |
+      git clone https://github.com/tensorflow/models.git
+
+      conda activate mnist
+      if [ $? -eq 0 ]; then
+         echo 'conda env exists'
+      else
+         conda create -n mnist python=3.8 -y
+         conda activate mnist
+         pip install tensorflow==2.5.0 tensorflow-datasets tensorflow-model-optimization cloud-tpu-client
+      fi
+
+   # The command to run.  Will be run under the working directory.
+   run: |
+      conda activate mnist
+      cd models/official/legacy/image_classification/
+
+      export STORAGE_BUCKET=gs://mnist-tpu-node
+      export MODEL_DIR=${STORAGE_BUCKET}/mnist
+      export DATA_DIR=${STORAGE_BUCKET}/data
+
+      export PYTHONPATH=/home/gcpuser/sky_workdir/models
+
+      python3 mnist_main.py \
+         --tpu=${TPU_NAME} \
+         --model_dir=${MODEL_DIR} \
+         --data_dir=${DATA_DIR} \
+         --train_epochs=10 \
+         --distribution_strategy=tpu \
+         --download
+
+.. note::
+
+   A TPU Node requires loading data from a GCS bucket, so we add a :code:`file_mounts` section to create a new bucket.
+   Check :ref:`SkyPilot Storage <sky-storage>` for more details.
+
+.. note::
+   The environment variable :code:`$TPU_NAME` is automatically set by SkyPilot and is used to connect to the TPU device.
+
+With the above YAML, you should be able to launch the training job with :code:`sky launch`!
+
+.. code-block:: console
+
+   $ sky launch mnist-tpu-node.yaml -c mycluster
+   ...
+   (mnist-tpu-node pid=28961) Epoch 9/10
+   (mnist-tpu-node pid=28961) 58/58 [==============================] - 1s 19ms/step - loss: 0.1181 - sparse_categorical_accuracy: 0.9646 - val_loss: 0.0921 - val_sparse_categorical_accuracy: 0.9719
+   (mnist-tpu-node pid=28961) Epoch 10/10
+   (mnist-tpu-node pid=28961) 58/58 [==============================] - 1s 20ms/step - loss: 0.1139 - sparse_categorical_accuracy: 0.9655 - val_loss: 0.0831 - val_sparse_categorical_accuracy: 0.9742
+   ...
+   (mnist-tpu-node pid=28961) {'accuracy_top_1': 0.9741753339767456, 'eval_loss': 0.0831054300069809, 'loss': 0.11388632655143738, 'training_accuracy_top_1': 0.9654667377471924}
+
+
+
+TPU VMs
+--------------------------------
+
+To use TPU VMs, you only need to add :code:`tpu_vm: True` and the desired TPU runtime version in :code:`accelerator_args`, as shown below:
+
+.. code-block:: yaml
+
+   resources:
+      accelerators: tpu-v2-8
+      accelerator_args:
+         runtime_version: tpu-vm-base
+         tpu_vm: True
+
+
+Note that :code:`instance_type` is no longer needed because a TPU VM is a standalone host VM that is physically connected to the TPU device.
+
+Now we show an example of running `MNIST training <https://cloud.google.com/tpu/docs/run-calculation-jax#running_jax_code_on_a_tpu_vm>`_ on a TPU VM with JAX.
+
+.. code-block:: yaml
+
+   name: mnist-tpu-vm
+
+   resources:
+      accelerators: tpu-v2-8
+      accelerator_args:
+         runtime_version: tpu-vm-base
+         tpu_vm: True
+
+   setup: |
+      git clone https://github.com/google/flax.git
+
+      conda activate flax
+      if [ $? -eq 0 ]; then
+         echo 'conda env exists'
+      else
+         conda create -n flax python=3.8 -y
+         conda activate flax
+         # Make sure to install TPU related packages in a conda env to avoid package conflicts.
+         pip install "jax[tpu]>=0.2.16" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+         pip install --upgrade clu
+         pip install -e flax
+      fi
+
+   run: |
+      conda activate flax
+      cd flax/examples/mnist
+      python3 main.py --workdir=/tmp/mnist \
+      --config=configs/default.py \
+      --config.learning_rate=0.05 \
+      --config.num_epochs=10
+
+A GCS bucket is not required as the TPU VM is physically linked to the TPU device, which can access data directly.
+You should see output similar to the following when the job finishes.
+
+.. code-block:: console
+
+   $ sky launch examples/tpu/tpuvm_mnist.yaml -c mycluster
+   ...
+   (mnist-tpu-vm pid=10155) I0823 07:49:25.468526 139641357117440 train.py:146] epoch:  9, train_loss: 0.0120, train_accuracy: 99.64, test_loss: 0.0278, test_accuracy: 99.02
+   (mnist-tpu-vm pid=10155) I0823 07:49:26.966874 139641357117440 train.py:146] epoch: 10, train_loss: 0.0095, train_accuracy: 99.73, test_loss: 0.0264, test_accuracy: 99.19
diff --git a/examples/tpu/tpu_node_mnist.yml b/examples/tpu/tpu_node_mnist.yml
@@ -0,0 +1,46 @@
+name: mnist-tpu-node
+
+resources:
+  instance_type: n1-highmem-8
+  accelerators: tpu-v2-8
+  accelerator_args:
+    runtime_version: 2.5.0
+
+file_mounts:
+  /dataset:
+    name: demo-mnist-tpu
+    store: gcs
+    mode: MOUNT
+
+
+# The setup command.  Will be run under the working directory.
+setup: | 
+  git clone https://github.com/tensorflow/models.git
+
+  conda activate mnist
+  if [ $? -eq 0 ]; then
+    echo 'conda env exists'
+  else
+    conda create -n mnist python=3.8 -y
+    conda activate mnist
+    pip install tensorflow==2.5.0 tensorflow-datasets tensorflow-model-optimization cloud-tpu-client
+  fi
+
+# The command to run.  Will be run under the working directory.
+run: |
+  conda activate mnist
+  cd models/official/legacy/image_classification/
+
+  export STORAGE_BUCKET=gs://demo-mnist-tpu
+  export MODEL_DIR=${STORAGE_BUCKET}/mnist
+  export DATA_DIR=${STORAGE_BUCKET}/data
+
+  export PYTHONPATH=/home/gcpuser/sky_workdir/models
+
+  python3 mnist_main.py \
+    --tpu=${TPU_NAME} \
+    --model_dir=${MODEL_DIR} \
+    --data_dir=${DATA_DIR} \
+    --train_epochs=10 \
+    --distribution_strategy=tpu \
+    --download
diff --git a/sky/cli.py b/sky/cli.py
@@ -92,7 +92,7 @@
                              use_spot=False),
     'tpunode': sky.Resources(cloud=sky.GCP(),
                              instance_type=None,
-                             accelerators={'tpu-v3-8': 1},
+                             accelerators={'tpu-v2-8': 1},
                              accelerator_args={'runtime_version': '2.5.0'},
                              use_spot=False),
 }
@@ -176,6 +176,11 @@ def _interactive_node_cli_command(cli_func):
                                is_flag=True,
                                help='If true, use spot instances.')
 
+    tpuvm_option = click.option('--tpu-vm',
+                                default=False,
+                                is_flag=True,
+                                help='If true, use TPU VMs.')
+
     disk_size = click.option('--disk-size',
                              default=None,
                              type=int,
@@ -200,6 +205,7 @@ def _interactive_node_cli_command(cli_func):
         *([gpus] if cli_func.__name__ == 'gpunode' else []),
         *([tpus] if cli_func.__name__ == 'tpunode' else []),
         spot_option,
+        *([tpuvm_option] if cli_func.__name__ == 'tpunode' else []),
 
         # Attach options
         screen_option,
@@ -1865,8 +1871,9 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
 # pylint: disable=redefined-outer-name
 def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
             instance_type: Optional[str], tpus: Optional[str],
-            use_spot: Optional[bool], screen: Optional[bool],
-            tmux: Optional[bool], disk_size: Optional[int]):
+            use_spot: Optional[bool], tpu_vm: Optional[bool],
+            screen: Optional[bool], tmux: Optional[bool],
+            disk_size: Optional[int]):
     """Launch or attach to an interactive TPU node.
 
     Examples:
@@ -1905,6 +1912,10 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
     user_requested_resources = not (instance_type is None and tpus is None and
                                     use_spot is None)
     default_resources = _INTERACTIVE_NODE_DEFAULT_RESOURCES['tpunode']
+    accelerator_args = default_resources.accelerator_args
+    if tpu_vm:
+        accelerator_args['tpu_vm'] = True
+        accelerator_args['runtime_version'] = 'tpu-vm-base'
     if instance_type is None:
         instance_type = default_resources.instance_type
     if tpus is None:
@@ -1914,6 +1925,7 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
     resources = sky.Resources(cloud=sky.GCP(),
                               instance_type=instance_type,
                               accelerators=tpus,
+                              accelerator_args=accelerator_args,
                               use_spot=use_spot,
                               disk_size=disk_size)
 

diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py
@@ -197,7 +197,12 @@ def get_common_gpus() -> List[str]:
 
 def get_tpus() -> List[str]:
     """Returns a list of TPU names."""
-    return ['tpu-v2-8', 'tpu-v2-32', 'tpu-v2-128', 'tpu-v3-8']
+    # TODO(wei-lin): refactor below hard-coded list.
+    return [
+        'tpu-v2-8', 'tpu-v2-32', 'tpu-v2-128', 'tpu-v2-256', 'tpu-v2-512',
+        'tpu-v3-8', 'tpu-v3-32', 'tpu-v3-64', 'tpu-v3-128', 'tpu-v3-256',
+        'tpu-v3-512', 'tpu-v3-1024', 'tpu-v3-2048'
+    ]
 
 
 __all__ = [