IBM Dataloader #19

Open · wants to merge 7 commits into base: main
3 changes: 2 additions & 1 deletion .ci/docker/requirements.txt
@@ -6,4 +6,5 @@ sentencepiece
 tiktoken
 blobfile
 tabulate
-transformers
+transformers
+orjson
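
Note: orjson is presumably pulled in by the experimental dataloader for fast (de)serialization of dataloader state or raw-text records; the call site is not visible in this diff. For reference, a minimal sketch of the orjson API it would rely on (orjson.dumps returns bytes, unlike the stdlib json module):

import orjson

# orjson.dumps returns bytes rather than str; orjson.loads accepts bytes or str.
state = {"shard": 5, "offset": 1024}
blob = orjson.dumps(state)     # b'{"shard":5,"offset":1024}'
restored = orjson.loads(blob)  # {'shard': 5, 'offset': 1024}
assert restored == state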
4 changes: 2 additions & 2 deletions run_llama_train.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/bash
+#!/usr/bin/env bash
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.

@@ -11,7 +11,7 @@ set -ex
 # e.g.
 # LOG_RANK=0,1 NGPU=4 ./run_llama_train.sh
 NGPU=${NGPU:-"2"}
-LOG_RANK=0,1
+LOG_RANK="0,1"
 CONFIG_FILE=${CONFIG_FILE:-"./train_configs/debug_model.toml"}
 MAX_RESTARTS=5
61 changes: 61 additions & 0 deletions torchtitan/config_manager.py
@@ -385,6 +385,67 @@ def __init__(self):
             help="Python garbage control scheduling interval, in steps",
         )
 
+        # experimental dataloader flags
+        self.parser.add_argument(
+            "--dataset.use_experimental_dataloader",
+            action="store_true",
+            help="Whether to use the experimental dataloader instead of default HF",
+        )
+        self.parser.add_argument(
+            "--dataset.data_logical_shards",
+            type=int,
+            default=768,
+            help="Dataloader logical shards. All divisors are possible world sizes.",
+        )
+        self.parser.add_argument(
+            "--dataset.bos_token",
+            type=int,
+            default=-1,
+            help="BOS token index value. If not using, leave as -1.",
+        )
+        self.parser.add_argument(
+            "--dataset.eos_token",
+            type=int,
+            default=0,
+            help="EOS or SEP token index value.",
+        )
+        self.parser.add_argument(
+            "--dataset.drop_tokens",
+            type=str,
+            default="",
+            help="Dummy token values to drop from begin/end of sequences (comma-separated ints)",
+        )
+        self.parser.add_argument(
+            "--dataset.datasets",
+            type=str,
+            default="c4_mini",
+            help="Datasets to use for training, comma-separated",
+        )
+        self.parser.add_argument(
+            "--dataset.dataset_weights",
+            type=str,
+            default="1",
+            help="Sampling ratios for sub-datasets, comma-separated. Do not need to sum to 1.",
+        )
+        self.parser.add_argument(
+            "--dataset.num_data_workers",
+            type=int,
+            default=1,
+            help="Number of parallel dataloader processes per device.",
+        )
+        self.parser.add_argument(
+            "--dataset.file_type",
+            type=str,
+            default="arrow",
+            help="Shard file format. Current options are pretokenized 'arrow' and raw text 'hf_parquet'.",
+        )
+        self.parser.add_argument(
+            "--dataset.col_name",
+            type=str,
+            default="tokens",
+            help="Which column/field of the file shard contains the readable data.",
+        )
+
         # checkpointing configs
         self.parser.add_argument(
             "--checkpoint.enable_checkpoint",
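
A note on the defaults above: 768 = 2^8 * 3, so it is divisible by most common world sizes. Below is a short sketch of the invariant behind the --dataset.data_logical_shards help text, plus how the comma-separated flag values would presumably be split downstream (illustrative only, not code from this PR):

# Each rank can own an equal, disjoint slice of the logical shards exactly
# when world_size divides the shard count -- hence "all divisors are
# possible world sizes".
LOGICAL_SHARDS = 768  # default of --dataset.data_logical_shards

def shards_for_rank(rank: int, world_size: int) -> list[int]:
    assert LOGICAL_SHARDS % world_size == 0, "world size must divide shard count"
    per_rank = LOGICAL_SHARDS // world_size
    return list(range(rank * per_rank, (rank + 1) * per_rank))

print(shards_for_rank(rank=3, world_size=64))  # 12 shards: [36, 37, ..., 47]

# The string-valued flags are comma-separated lists; e.g. dataset weights
# are relative sampling ratios and need not sum to 1.
weights = [float(w) for w in "0.7,0.3".split(",")]  # --dataset.dataset_weights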
4 changes: 3 additions & 1 deletion torchtitan/datasets/__init__.py
@@ -4,8 +4,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from torchtitan.datasets.experimental_datasets import build_experimental_data_loader
 from torchtitan.datasets.hf_datasets import build_hf_data_loader
 
 __all__ = [
-    "build_hf_data_loader"
+    "build_hf_data_loader",
+    "build_experimental_data_loader",
 ]
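
With this change both builders are importable from the package root. The experimental builder's call signature is not shown in this diff, so only the import is illustrated:

from torchtitan.datasets import (
    build_experimental_data_loader,
    build_hf_data_loader,
)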
65 changes: 65 additions & 0 deletions torchtitan/datasets/download_tokenizer.py
@@ -0,0 +1,65 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Optional

from requests.exceptions import HTTPError


def hf_download(
    repo_id: str, tokenizer_path: str, local_dir: str, hf_token: Optional[str] = None
) -> None:
    from huggingface_hub import hf_hub_download

    tokenizer_path = (
        f"{tokenizer_path}/tokenizer.model" if tokenizer_path else "tokenizer.model"
    )

    try:
        hf_hub_download(
            repo_id=repo_id,
            filename=tokenizer_path,
            local_dir=local_dir,
            local_dir_use_symlinks=False,
            token=hf_token,
        )
    except HTTPError as e:
        if e.response.status_code == 401:
            print(
                "You need to pass a valid `--hf_token=...` to download private checkpoints."
            )
        else:
            raise e


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Download tokenizer from HuggingFace.")
    parser.add_argument(
        "--repo_id",
        type=str,
        default="meta-llama/Meta-Llama-3-8B",
        help="Repository ID to download from. default to Llama-3-8B",
    )
    parser.add_argument(
        "--tokenizer_path",
        type=str,
        default="",
        help="the tokenizer.model path relative to repo_id",
    )
    parser.add_argument(
        "--hf_token", type=str, default=None, help="HuggingFace API token"
    )
    parser.add_argument(
        "--local_dir",
        type=str,
        default="torchtitan/datasets/tokenizer/",
        help="local directory to save the tokenizer.model",
    )

    args = parser.parse_args()
    hf_download(args.repo_id, args.tokenizer_path, args.local_dir, args.hf_token)
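
A minimal usage sketch of the new script's entry point; the "original" subdirectory is an assumption about the Meta-Llama-3-8B repo layout, and gated repos need a valid token:

from torchtitan.datasets.download_tokenizer import hf_download

# Fetches {tokenizer_path}/tokenizer.model from the repo into local_dir.
hf_download(
    repo_id="meta-llama/Meta-Llama-3-8B",
    tokenizer_path="original",  # assumption about the repo layout
    local_dir="torchtitan/datasets/tokenizer/",
    hf_token="hf_...",  # placeholder; required for gated/private repos
)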
Binary file not shown.
@@ -0,0 +1,2 @@
dataset/filename,documents,tokens
/c4_mini/c4_mini.arrow,45000,24174478
Binary file not shown.
@@ -0,0 +1,2 @@
dataset/filename,documents,tokens
/c4_mini/c4_mini.arrow,45000,20505558
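
These two-line meta CSVs record per-shard document and token counts, which presumably let the dataloader weight and partition shards without scanning the binary .arrow files. A sketch of reading one (the path is hypothetical; the actual location is not visible in this diff):

import csv

with open("torchtitan/datasets/c4_mini/meta/counts.csv", newline="") as f:
    for row in csv.DictReader(f):
        print(row["dataset/filename"], int(row["documents"]), int(row["tokens"]))
# e.g. /c4_mini/c4_mini.arrow 45000 24174478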