Added Distributed (Tensor Parallel) Inference Recipe (#2245)
Co-authored-by: JessicaZhong <[email protected]>
acisseJZhong and jessicazhongeee authored Jan 18, 2025
1 parent 1036095 commit 779569e
Showing 14 changed files with 618 additions and 13 deletions.
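The new recipe runs generation with the model sharded across GPUs via tensor parallelism; each distributed config below points parallelize_plan at torchtune.models.llama3.base_llama_tp_plan. As a rough illustration of what such a plan expresses (a sketch only, with hypothetical module names, not torchtune's actual plan), PyTorch's DTensor tensor-parallel API maps submodule names to sharding styles:

# Illustrative sketch of a tensor-parallel plan for a Llama-style block using
# PyTorch's DTensor API. Module names here are hypothetical, not torchtune's.
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

# Shard projections column-wise going into attention/MLP and row-wise coming
# out, so each rank holds a slice of every weight matrix.
layer_plan = {
    "attn.q_proj": ColwiseParallel(),
    "attn.k_proj": ColwiseParallel(),
    "attn.v_proj": ColwiseParallel(),
    "attn.output_proj": RowwiseParallel(),
    "mlp.w1": ColwiseParallel(),
    "mlp.w2": RowwiseParallel(),
    "mlp.w3": ColwiseParallel(),
}

# In the recipe, something equivalent to the following would be applied per
# transformer layer (the layer module and device mesh are assumed to exist):
# parallelize_module(layer, device_mesh, layer_plan)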
7 changes: 6 additions & 1 deletion recipes/configs/generation.yaml
@@ -1,4 +1,9 @@
-# Config for running the InferenceRecipe in generate.py to generate output from an LLM
+# Config for running the InferenceRecipe in generate.py to generate output
+# from Llama2 7B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+# tune download meta-llama/Llama-2-7b-hf --output-dir /tmp/Llama-2-7b-hf --ignore-patterns "*.safetensors" --hf-token <HF_TOKEN>
#
# To launch, run the following command from root torchtune directory:
# tune run generate --config generation
50 changes: 50 additions & 0 deletions recipes/configs/llama3/70B_generation_distributed.yaml
@@ -0,0 +1,50 @@
# Config for running the InferenceRecipe in dev/generate_v2_distributed.py to generate output
# using a Llama3 70B Instruct model
#
# This config assumes that you've run the following command before launching:
# tune download meta-llama/Meta-Llama-3-70B-Instruct --output-dir /tmp/Meta-Llama-3-70B-Instruct --ignore-patterns "original/consolidated*" --hf-token <HF_TOKEN>
#
# To launch, run the following command from root torchtune directory:
# tune run --nproc_per_node 8 dev/generate_v2_distributed --config llama3/70B_generation_distributed

output_dir: ./

# Model arguments
model:
  _component_: torchtune.models.llama3.llama3_70b

parallelize_plan:
  _component_: torchtune.models.llama3.base_llama_tp_plan

# Transform arguments
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: /tmp/Meta-Llama-3-70B-Instruct/original/tokenizer.model
  prompt_template: null
  max_seq_len: 8192

# Checkpointer
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Meta-Llama-3-70B-Instruct
  checkpoint_files:
    filename_format: model-{}-of-{}.safetensors
    max_filename: "00030"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA3

# Device
device: cuda
dtype: bf16
seed: 1234
log_level: INFO

# Generation arguments
prompt:
  system: null
  user:
    text: Tell a joke.
max_new_tokens: 200
temperature: 0.6 # 0.8 and 0.6 are popular values to try
top_k: 300
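A note on the checkpointer block above: checkpoint_files is given as a filename_format plus max_filename pair rather than an explicit list, describing the sharded safetensors files to load. A minimal sketch of how such a pattern could expand (illustrative only, not torchtune's internal logic):

# Expand "model-{}-of-{}.safetensors" with max_filename "00030" into the shard
# names model-00001-of-00030.safetensors .. model-00030-of-00030.safetensors.
fmt = "model-{}-of-{}.safetensors"
max_filename = "00030"
n = int(max_filename)
files = [fmt.format(f"{i:05d}", max_filename) for i in range(1, n + 1)]
print(files[0], files[-1])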
50 changes: 50 additions & 0 deletions recipes/configs/llama3_1/70B_generation_distributed.yaml
@@ -0,0 +1,50 @@
# Config for running the InferenceRecipe in dev/generate_v2_distributed.py to generate output
# using a Llama3.1 70B Instruct model
#
# This config assumes that you've run the following command before launching:
# tune download meta-llama/Meta-Llama-3.1-70B-Instruct --output-dir /tmp/Meta-Llama-3.1-70B-Instruct --ignore-patterns "original/consolidated*" --hf-token <HF_TOKEN>
#
# To launch, run the following command from root torchtune directory:
# tune run --nproc_per_node 8 dev/generate_v2_distributed --config llama3_1/70B_generation_distributed

output_dir: ./

# Model arguments
model:
  _component_: torchtune.models.llama3_1.llama3_1_70b

parallelize_plan:
  _component_: torchtune.models.llama3.base_llama_tp_plan

# Transform arguments
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: /tmp/Meta-Llama-3.1-70B-Instruct/original/tokenizer.model
  prompt_template: null
  max_seq_len: 8192

# Checkpointer
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Meta-Llama-3.1-70B-Instruct/
  checkpoint_files:
    filename_format: model-{}-of-{}.safetensors
    max_filename: "00030"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA3

# Device
device: cuda
dtype: bf16
seed: 1234
log_level: INFO

# Generation arguments
prompt:
  system: null
  user:
    text: Tell a joke.
max_new_tokens: 200
temperature: 0.6 # 0.8 and 0.6 are popular values to try
top_k: 300
2 changes: 1 addition & 1 deletion recipes/configs/llama3_2_vision/11B_generation_v2.yaml
@@ -7,7 +7,7 @@
# To launch, run the following command from root torchtune directory:
# tune run dev/generate_v2 --config llama3_2_vision/generation_v2

-output_dir: ./ # Not needed
+output_dir: ./

# Model arguments
model:
50 changes: 50 additions & 0 deletions recipes/configs/llama3_3/70B_generation_distributed.yaml
@@ -0,0 +1,50 @@
# Config for running the InferenceRecipe in dev/generate_v2_distributed.py to generate output
# using a Llama3.3 70B Instruct model
#
# This config assumes that you've run the following command before launching:
# tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "original/consolidated*" --hf-token <HF_TOKEN>
#
# To launch, run the following command from root torchtune directory:
# tune run --nproc_per_node 8 dev/generate_v2_distributed --config llama3_3/70B_generation_distributed

output_dir: ./

# Model arguments
model:
  _component_: torchtune.models.llama3_3.llama3_3_70b

parallelize_plan:
  _component_: torchtune.models.llama3.base_llama_tp_plan

# Transform arguments
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: /tmp/Llama-3.3-70B-Instruct/original/tokenizer.model
  prompt_template: null
  max_seq_len: 8192

# Checkpointer
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Llama-3.3-70B-Instruct/
  checkpoint_files:
    filename_format: model-{}-of-{}.safetensors
    max_filename: "00030"
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA3

# Device
device: cuda
dtype: bf16
seed: 1234
log_level: INFO

# Generation arguments
prompt:
  system: null
  user:
    text: Tell a joke.
max_new_tokens: 200
temperature: 0.6 # 0.8 and 0.6 are popular values to try
top_k: 300
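For a rough sense of why the launch commands above use --nproc_per_node 8: with pure tensor parallelism the bf16 weights are sharded evenly across ranks. A back-of-the-envelope estimate (assumed parameter count, ignoring activations and the KV cache):

# Each of the 8 GPUs holds roughly 1/8 of the ~140 GB of bf16 parameters.
params = 70e9            # ~70B parameters (approximate)
bytes_per_param = 2      # bf16
world_size = 8           # matches --nproc_per_node 8 above
per_gpu_gb = params * bytes_per_param / world_size / 1e9
print(f"~{per_gpu_gb:.1f} GB of weights per GPU")  # ~17.5 GB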
18 changes: 11 additions & 7 deletions recipes/dev/generate_v2.py
@@ -39,18 +39,22 @@ def __call__(self, prompt: Dict[str, Any]) -> List[Message]:

        # Iterate through roles and add content
        for role, content in prompt.items():
-            if isinstance(content, str):
+            if content is None:
+                continue
+            elif isinstance(content, str):
                new_content = [{"type": "text", "content": content}]
-            else:
-                assert (
-                    "image" in content.keys()
-                ), "Multiple entries per role expect an image key"
+            elif "image" in content.keys():
                image_loc = content["image"]
                image = load_image(image_loc)
                new_content = [
                    {"type": "image", "content": image},
                    {"type": "text", "content": content["text"]},
                ]
+            else:
+                assert (
+                    "text" in content.keys()
+                ), "Multiple entries per role expect at least a text key"
+                new_content = [{"type": "text", "content": content["text"]}]
            messages.append(Message(role=role, content=new_content))

        # Finally, add an empty assistant message to kick-start generation
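With the reworked branching above, a role's entry in the prompt config may be omitted (null), given as a plain string, or given as a dict with an image and/or text key. A few illustrative prompt dicts (shapes inferred from the configs in this commit, not taken from the recipe itself):

# Each dict maps role -> content, mirroring the prompt section of the configs.
text_only = {"system": None, "user": "Tell a joke."}                 # str branch
keyed_text = {"system": None, "user": {"text": "Tell a joke."}}      # new "text"-key branch
with_image = {
    "system": "You are a helpful assistant.",
    "user": {"image": "path/or/url/to/image.png", "text": "What is in this image?"},
}  # image branch: the image is loaded and paired with the text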
@@ -109,12 +113,12 @@ def log_metrics(self, total_time: int, tokens_per_second: float) -> None:
f"Time for inference: {total_time:.02f} sec total, {tokens_per_second:.02f} tokens/sec"
)
self._logger.info(
f"Bandwidth achieved: {model_size * tokens_per_second / 1e9:.02f} GB/s"
f"Bandwidth achieved: {model_size * tokens_per_second / (1024**3):.02f} GiB/s"
)
if self._device.type != "cpu":
torch_device = utils.get_torch_device_namespace()
self._logger.info(
f"Max memory allocated: {torch_device.max_memory_allocated() / 1e9:.02f} GB"
f"Max memory allocated: {torch_device.max_memory_allocated() / (1024**3):.02f} GiB"
)

@torch.inference_mode()
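The two metric changes above only swap decimal gigabytes (1e9 bytes) for binary gibibytes (1024**3 bytes); the measured values are unchanged and the reported numbers shrink by roughly 7%. A quick check with assumed inputs:

model_size = 140e9            # assumed: ~70B parameters in bf16, in bytes
tokens_per_second = 10.0      # assumed throughput
print(f"{model_size * tokens_per_second / 1e9:.02f} GB/s")         # 1400.00 GB/s
print(f"{model_size * tokens_per_second / (1024**3):.02f} GiB/s")  # ~1303.85 GiB/s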
