IBM Dataloader #19

Open · wants to merge 7 commits into base: main
3 changes: 2 additions & 1 deletion .ci/docker/requirements.txt
@@ -6,4 +6,5 @@ sentencepiece
 tiktoken
 blobfile
 tabulate
-transformers
+transformers
+orjson
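
Note: orjson is presumably pulled in by the experimental dataloader for fast (de)serialization of dataloader state or raw-text records; the call site is not visible in this diff. For reference, a minimal sketch of the orjson API it would rely on (orjson.dumps returns bytes, unlike the stdlib json module):

import orjson

# orjson.dumps returns bytes rather than str; orjson.loads accepts bytes or str.
state = {"shard": 5, "offset": 1024}
blob = orjson.dumps(state)     # b'{"shard":5,"offset":1024}'
restored = orjson.loads(blob)  # {'shard': 5, 'offset': 1024}
assert restored == state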
4 changes: 2 additions & 2 deletions run_llama_train.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/bash
+#!/usr/bin/env bash
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.

@@ -11,7 +11,7 @@ set -ex
 # e.g.
 # LOG_RANK=0,1 NGPU=4 ./run_llama_train.sh
 NGPU=${NGPU:-"2"}
-LOG_RANK=0,1
+LOG_RANK="0,1"
 CONFIG_FILE=${CONFIG_FILE:-"./train_configs/debug_model.toml"}
 MAX_RESTARTS=5
61 changes: 61 additions & 0 deletions torchtitan/config_manager.py
@@ -385,6 +385,67 @@ def __init__(self):
             help="Python garbage control scheduling interval, in steps",
         )
 
+        # experimental dataloader flags
+        self.parser.add_argument(
+            "--dataset.use_experimental_dataloader",
+            action="store_true",
+            help="Whether to use the experimental dataloader instead of default HF",
+        )
+        self.parser.add_argument(
+            "--dataset.data_logical_shards",
+            type=int,
+            default=768,
+            help="Dataloader logical shards. All divisors are possible world sizes.",
+        )
+        self.parser.add_argument(
+            "--dataset.bos_token",
+            type=int,
+            default=-1,
+            help="BOS token index value. If not using, leave as -1.",
+        )
+        self.parser.add_argument(
+            "--dataset.eos_token",
+            type=int,
+            default=0,
+            help="EOS or SEP token index value.",
+        )
+        self.parser.add_argument(
+            "--dataset.drop_tokens",
+            type=str,
+            default="",
+            help="Dummy token values to drop from begin/end of sequences (comma-separated ints)",
+        )
+        self.parser.add_argument(
+            "--dataset.datasets",
+            type=str,
+            default="c4_mini",
+            help="Datasets to use for training, comma-separated",
+        )
+        self.parser.add_argument(
+            "--dataset.dataset_weights",
+            type=str,
+            default="1",
+            help="Sampling ratios for sub-datasets, comma-separated. Do not need to sum to 1.",
+        )
+        self.parser.add_argument(
+            "--dataset.num_data_workers",
+            type=int,
+            default=1,
+            help="Number of parallel dataloader processes per device.",
+        )
+        self.parser.add_argument(
+            "--dataset.file_type",
+            type=str,
+            default="arrow",
+            help="Shard file format. Current options are pretokenized 'arrow' and raw text 'hf_parquet'.",
+        )
+        self.parser.add_argument(
+            "--dataset.col_name",
+            type=str,
+            default="tokens",
+            help="Which column/field of the file shard contains the readable data.",
+        )
+
         # checkpointing configs
         self.parser.add_argument(
             "--checkpoint.enable_checkpoint",
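
A note on the defaults above: 768 = 2^8 * 3, so it is divisible by most common world sizes. Below is a short sketch of the invariant behind the --dataset.data_logical_shards help text, plus how the comma-separated flag values would presumably be split downstream (illustrative only, not code from this PR):

# Each rank can own an equal, disjoint slice of the logical shards exactly
# when world_size divides the shard count -- hence "all divisors are
# possible world sizes".
LOGICAL_SHARDS = 768  # default of --dataset.data_logical_shards

def shards_for_rank(rank: int, world_size: int) -> list[int]:
    assert LOGICAL_SHARDS % world_size == 0, "world size must divide shard count"
    per_rank = LOGICAL_SHARDS // world_size
    return list(range(rank * per_rank, (rank + 1) * per_rank))

print(shards_for_rank(rank=3, world_size=64))  # 12 shards: [36, 37, ..., 47]

# The string-valued flags are comma-separated lists; e.g. dataset weights
# are relative sampling ratios and need not sum to 1.
weights = [float(w) for w in "0.7,0.3".split(",")]  # --dataset.dataset_weights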
4 changes: 3 additions & 1 deletion torchtitan/datasets/__init__.py
@@ -4,8 +4,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from torchtitan.datasets.experimental_datasets import build_experimental_data_loader
 from torchtitan.datasets.hf_datasets import build_hf_data_loader
 
 __all__ = [
-    "build_hf_data_loader"
+    "build_hf_data_loader",
+    "build_experimental_data_loader",
 ]
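
With this change both builders are importable from the package root. The experimental builder's call signature is not shown in this diff, so only the import is illustrated:

from torchtitan.datasets import (
    build_experimental_data_loader,
    build_hf_data_loader,
)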
65 changes: 65 additions & 0 deletions torchtitan/datasets/download_tokenizer.py
@@ -0,0 +1,65 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import Optional

from requests.exceptions import HTTPError


def hf_download(
    repo_id: str, tokenizer_path: str, local_dir: str, hf_token: Optional[str] = None
) -> None:
    from huggingface_hub import hf_hub_download

    tokenizer_path = (
        f"{tokenizer_path}/tokenizer.model" if tokenizer_path else "tokenizer.model"
    )

    try:
        hf_hub_download(
            repo_id=repo_id,
            filename=tokenizer_path,
            local_dir=local_dir,
            local_dir_use_symlinks=False,
            token=hf_token,
        )
    except HTTPError as e:
        if e.response.status_code == 401:
            print(
                "You need to pass a valid `--hf_token=...` to download private checkpoints."
            )
        else:
            raise e


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Download tokenizer from HuggingFace.")
    parser.add_argument(
        "--repo_id",
        type=str,
        default="meta-llama/Meta-Llama-3-8B",
        help="Repository ID to download from. default to Llama-3-8B",
    )
    parser.add_argument(
        "--tokenizer_path",
        type=str,
        default="",
        help="the tokenizer.model path relative to repo_id",
    )
    parser.add_argument(
        "--hf_token", type=str, default=None, help="HuggingFace API token"
    )
    parser.add_argument(
        "--local_dir",
        type=str,
        default="torchtitan/datasets/tokenizer/",
        help="local directory to save the tokenizer.model",
    )

    args = parser.parse_args()
    hf_download(args.repo_id, args.tokenizer_path, args.local_dir, args.hf_token)
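
A minimal usage sketch of the new script's entry point; the "original" subdirectory is an assumption about the Meta-Llama-3-8B repo layout, and gated repos need a valid token:

from torchtitan.datasets.download_tokenizer import hf_download

# Fetches {tokenizer_path}/tokenizer.model from the repo into local_dir.
hf_download(
    repo_id="meta-llama/Meta-Llama-3-8B",
    tokenizer_path="original",  # assumption about the repo layout
    local_dir="torchtitan/datasets/tokenizer/",
    hf_token="hf_...",  # placeholder; required for gated/private repos
)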
Binary file not shown.
@@ -0,0 +1,2 @@
dataset/filename,documents,tokens
/c4_mini/c4_mini.arrow,45000,24174478
Binary file not shown.
@@ -0,0 +1,2 @@
dataset/filename,documents,tokens
/c4_mini/c4_mini.arrow,45000,20505558
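
These two-line meta CSVs record per-shard document and token counts, which presumably let the dataloader weight and partition shards without scanning the binary .arrow files. A sketch of reading one (the path is hypothetical; the actual location is not visible in this diff):

import csv

with open("torchtitan/datasets/c4_mini/meta/counts.csv", newline="") as f:
    for row in csv.DictReader(f):
        print(row["dataset/filename"], int(row["documents"]), int(row["tokens"]))
# e.g. /c4_mini/c4_mini.arrow 45000 24174478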