Rename to torchtitan (#221)
Trying out a full renaming pass from torchtrain -> torchtitan,
including:
1. directory structure
2. all names inside the repo itself.
wanchaol authored Apr 11, 2024
1 parent 8763924 commit 2636cac
Showing 32 changed files with 76 additions and 71 deletions.
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
@@ -1,4 +1,4 @@
# Contributing to torchtrain
# Contributing to torchtitan
We want to make contributing to this project as easy and transparent as
possible.

@@ -28,5 +28,5 @@ disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.

## License
By contributing to `torchtrain`, you agree that your contributions will be licensed
By contributing to `torchtitan`, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
8 changes: 4 additions & 4 deletions README.md
@@ -1,11 +1,11 @@
# torchtrain
# torchtitan

Note: This repository is currently under heavy development.

torchtrain is a native PyTorch library with PyTorch native parallelisms and various training techniques to train large models.
torchtitan is a native PyTorch library with PyTorch native parallelisms and various training techniques to train large models.
## Design Principles

While torchtrain utilizes the PyTorch ecosystem for things like data loading (i.e. HuggingFace datasets), the core functionality is written in PyTorch.
While torchtitan utilizes the PyTorch ecosystem for things like data loading (i.e. HuggingFace datasets), the core functionality is written in PyTorch.

* Designed to be easy to understand, use and extend for different training purposes.
* Minimal changes to the model code, when applying 1D/2D or 3D Parallelisms.
@@ -41,7 +41,7 @@ To visualize TensorBoard metrics of models trained on a remote server via a loca
ssh -L 6006:127.0.0.1:6006 [username]@[hostname]
```

3. Inside the SSH tunnel that logged into the remote server, go to the torchtrain repo, and start the TensorBoard backend
3. Inside the SSH tunnel that logged into the remote server, go to the torchtitan repo, and start the TensorBoard backend
```
tensorboard --logdir=./outputs/tb
```
2 changes: 1 addition & 1 deletion docs/fsdp.md
@@ -11,7 +11,7 @@ Compared to FSDP1:

In the future, FSDP2 will offer an extension point to customize the all-gather (e.g. for fp8 all-gather for fp8 linears) and improved `torch.compile` support.

We have validated FSDP2 numerics and performance using torchtrain (e.g. see this [PR](https://github.com/pytorch/torchtrain/pull/165)). For example, on some Llama-7B runs on 8x H100s, FSDP2 achieves higher MFU with 7% lower peak memory than FSDP1, matching the same loss curve.
We have validated FSDP2 numerics and performance using torchtitan (e.g. see this [PR](https://github.com/pytorch/torchtitan/pull/165)). For example, on some Llama-7B runs on 8x H100s, FSDP2 achieves higher MFU with 7% lower peak memory than FSDP1, matching the same loss curve.

For more details on motivation, API, and system design, refer to [here](https://github.com/pytorch/pytorch/issues/114299). In this README, we try to provide more user-facing info and less system design details.
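As background for the FSDP2 API referenced above, here is a minimal, hedged sketch of per-module sharding with `fully_shard` (not part of this commit; it assumes the prototype `torch.distributed._composable.fsdp` entry point available around this time and a distributed launch such as `torchrun`):

```
# Illustrative sketch only: FSDP2-style per-parameter sharding applied to each
# submodule and then to the root module. Requires a distributed launch
# (e.g. torchrun) so a process group can be initialized.
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed._composable.fsdp import fully_shard  # prototype FSDP2 API

dist.init_process_group("nccl")
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

model = nn.Sequential(nn.Linear(1024, 1024), nn.Linear(1024, 1024)).cuda()
for layer in model:
    fully_shard(layer)   # shard each submodule's parameters
fully_shard(model)       # shard the remaining root parameters last
```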

2 changes: 1 addition & 1 deletion multinode_trainer.slurm
@@ -5,7 +5,7 @@
# --- constraint issues or to tune for improved performance.
# ---

#SBATCH --job-name=torchtrain_multi_node
#SBATCH --job-name=torchtitan_multi_node

#SBATCH --ntasks=4

2 changes: 1 addition & 1 deletion run_llama_train.sh
@@ -5,7 +5,7 @@ set -ex
# libUV is a scalable backend for TCPStore which is used in processGroup
# rendezvous. This is the recommended backend for distributed training.
export USE_LIBUV=1
TRAINER_DIR=${1:-/home/$USER/local/torchtrain}
TRAINER_DIR=${1:-/home/$USER/local/torchtitan}

# use envs as local overrides for convenience
# e.g.
7 changes: 5 additions & 2 deletions setup.py
@@ -1,3 +1,6 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
@@ -13,7 +16,7 @@
from setuptools import find_packages, setup

setup(
name="torchtrain",
name="torchtitan",
version="0.0.1",
packages=find_packages(),
install_requires=[
@@ -22,5 +25,5 @@
description="Package for training large models using native PyTorch",
long_description=open("README.md").read(),
long_description_content_type="text/markdown",
url="https://github.com/pytorch-labs/torchtrain",
url="https://github.com/pytorch-labs/torchtitan",
)
2 changes: 1 addition & 1 deletion test/test_job_config.py
@@ -4,7 +4,7 @@
import tempfile

import pytest
from torchtrain.config_manager import JobConfig
from torchtitan.config_manager import JobConfig


class TestJobConfig:
2 changes: 1 addition & 1 deletion torchtrain/checkpoint.py → torchtitan/checkpoint.py
@@ -17,7 +17,7 @@
set_model_state_dict,
set_optimizer_state_dict,
)
from torchtrain.logging_utils import logger
from torchtitan.logging_utils import logger


class IntervalType(enum.Enum):
8 changes: 4 additions & 4 deletions torchtrain/config_manager.py → torchtitan/config_manager.py
@@ -13,7 +13,7 @@
except ModuleNotFoundError:
import tomli as tomllib

from torchtrain.logging_utils import logger
from torchtitan.logging_utils import logger


class JobConfig:
@@ -40,7 +40,7 @@ class JobConfig:

def __init__(self):
# main parser
self.parser = argparse.ArgumentParser(description="TorchTrain arg parser.")
self.parser = argparse.ArgumentParser(description="torchtitan arg parser.")
self.parser.add_argument(
"--job.config_file",
type=str,
@@ -52,7 +52,7 @@ def __init__(self):
self.parser.add_argument(
"--job.dump_folder",
type=str,
default="./torchtrain/outputs",
default="./torchtitan/outputs",
help="folder to dump job outputs",
)
self.parser.add_argument(
@@ -134,7 +134,7 @@ def __init__(self):
self.parser.add_argument(
"--model.tokenizer_path",
type=str,
default="./torchtrain/datasets/tokenizer/tokenizer.model",
default="./torchtitan/datasets/tokenizer/tokenizer.model",
help="tokenizer path",
)
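The flags above follow the dotted `--section.key` naming convention used throughout the config manager. Below is a small standalone sketch of that argparse pattern using two of the defaults shown in this diff; it is illustrative only and not the actual JobConfig implementation:

```
# Standalone illustration of the dotted "--section.key" argparse pattern; the
# flag names and defaults mirror the hunks above, but this is not JobConfig.
import argparse

parser = argparse.ArgumentParser(description="torchtitan arg parser.")
parser.add_argument(
    "--job.dump_folder",
    type=str,
    default="./torchtitan/outputs",
    help="folder to dump job outputs",
)
parser.add_argument(
    "--model.tokenizer_path",
    type=str,
    default="./torchtitan/datasets/tokenizer/tokenizer.model",
    help="tokenizer path",
)

args = parser.parse_args(["--job.dump_folder", "./outputs"])
# argparse keeps the dot in the attribute name, so read it back with getattr
print(getattr(args, "job.dump_folder"))       # ./outputs
print(getattr(args, "model.tokenizer_path"))  # default tokenizer path
```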

@@ -1,8 +1,8 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

from torchtrain.datasets.hf_datasets import build_hf_data_loader
from torchtrain.datasets.tokenizer import create_tokenizer
from torchtitan.datasets.hf_datasets import build_hf_data_loader
from torchtitan.datasets.tokenizer import create_tokenizer

__all__ = [
"build_hf_data_loader",
@@ -18,7 +18,7 @@ def hf_download(repo_id: Optional[str] = None, hf_token: Optional[str] = None) -
hf_hub_download(
repo_id,
"tokenizer.model",
local_dir="torchtrain/datasets/tokenizer/",
local_dir="torchtitan/datasets/tokenizer/",
local_dir_use_symlinks=False,
token=hf_token,
)
@@ -6,8 +6,8 @@
import torch
from torch.utils.data import DataLoader, IterableDataset

from torchtrain.datasets.tokenizer import TokenizerIf
from torchtrain.logging_utils import logger
from torchtitan.datasets.tokenizer import TokenizerIf
from torchtitan.logging_utils import logger

from datasets import load_dataset, load_from_disk
from datasets.distributed import split_dataset_by_node
@@ -12,7 +12,7 @@

from sentencepiece import SentencePieceProcessor

from torchtrain.logging_utils import logger
from torchtitan.logging_utils import logger


class TokenizerIf(ABC):
6 changes: 3 additions & 3 deletions torchtrain/float8_linear.py → torchtitan/float8_linear.py
@@ -4,9 +4,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved

from torchtrain.config_manager import JobConfig
from torchtrain.logging_utils import logger
from torchtrain.models.llama import Transformer
from torchtitan.config_manager import JobConfig
from torchtitan.logging_utils import logger
from torchtitan.models.llama import Transformer


def build_fp8_linear(model: Transformer, job_config: JobConfig):
File renamed without changes.
@@ -5,7 +5,7 @@
# All rights reserved.

from torch.optim.lr_scheduler import LambdaLR
from torchtrain.config_manager import JobConfig
from torchtitan.config_manager import JobConfig

# global states for scheduling
# these are needed as LambdaLR does not support argument passing
4 changes: 2 additions & 2 deletions torchtrain/metrics.py → torchtitan/metrics.py
@@ -11,8 +11,8 @@

import torch
from torch.utils.tensorboard import SummaryWriter
from torchtrain.config_manager import JobConfig
from torchtrain.logging_utils import logger
from torchtitan.config_manager import JobConfig
from torchtitan.logging_utils import logger


# named tuple for passing GPU memory stats for logging
16 changes: 16 additions & 0 deletions torchtitan/models/__init__.py
@@ -0,0 +1,16 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

from torchtitan.models.llama import llama_configs, Transformer

models_config = {
"llama": llama_configs,
}

model_name_to_cls = {
"llama": Transformer,
}

model_name_to_tokenizer = {
"llama": "sentencepiece",
}
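The new `torchtitan/models/__init__.py` registers each model's config table, class, and tokenizer kind. A hedged sketch of how these registries might be looked up (assuming the renamed package is importable and using the "debugmodel" flavor that appears in the debug config later in this diff):

```
# Hedged sketch of consuming the registries defined in the new file above;
# assumes the torchtitan package from this commit is importable.
from torchtitan.models import model_name_to_cls, model_name_to_tokenizer, models_config

name, flavor = "llama", "debugmodel"            # flavor key used by debug_model.toml
model_cls = model_name_to_cls[name]             # Transformer class
model_args = models_config[name][flavor]        # llama config entry for this flavor
tokenizer_kind = model_name_to_tokenizer[name]  # "sentencepiece"
print(model_cls.__name__, tokenizer_kind)
```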
@@ -1,7 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

from torchtrain.models.llama.model import ModelArgs, Transformer
from torchtitan.models.llama.model import ModelArgs, Transformer

__all__ = ["Transformer"]

@@ -7,7 +7,7 @@
import torch
import torch.nn.functional as F
from torch import nn
from torchtrain.models.norms import create_norm
from torchtitan.models.norms import create_norm


@dataclass
1 change: 0 additions & 1 deletion torchtrain/models/norms.py → torchtitan/models/norms.py
@@ -126,7 +126,6 @@ def _rms_norm_fwd_kernel(
N, # num cols
block_N: tl.constexpr,
):

row = tl.program_id(0)
cols = tl.arange(0, block_N)

@@ -5,8 +5,8 @@
from functools import cached_property

from torch.distributed.device_mesh import init_device_mesh
from torchtrain.logging_utils import logger
from torchtrain.parallelisms.parallelize_llama import parallelize_llama
from torchtitan.logging_utils import logger
from torchtitan.parallelisms.parallelize_llama import parallelize_llama

models_parallelize_fns = {
"llama": parallelize_llama,
@@ -25,8 +25,8 @@

from torch.utils.checkpoint import _pt2_selective_checkpoint_context_fn_gen, checkpoint

from torchtrain.config_manager import JobConfig
from torchtrain.logging_utils import logger
from torchtitan.config_manager import JobConfig
from torchtitan.logging_utils import logger


# for selective AC
4 changes: 2 additions & 2 deletions torchtrain/profiling.py → torchtitan/profiling.py
@@ -5,8 +5,8 @@
import os

import torch
from torchtrain.config_manager import JobConfig
from torchtrain.logging_utils import logger
from torchtitan.config_manager import JobConfig
from torchtitan.logging_utils import logger


@contextlib.contextmanager
2 changes: 1 addition & 1 deletion torchtrain/utils.py → torchtitan/utils.py
@@ -10,7 +10,7 @@
import torch.distributed._functional_collectives as funcol
import torch.distributed.distributed_c10d as c10d
from torch.distributed.device_mesh import DeviceMesh
from torchtrain.logging_utils import logger
from torchtitan.logging_utils import logger


def dist_max(x: Union[int, float], mesh: DeviceMesh) -> float:
13 changes: 0 additions & 13 deletions torchtrain/models/__init__.py

This file was deleted.

22 changes: 11 additions & 11 deletions train.py
@@ -18,17 +18,17 @@
from torch.distributed.elastic.multiprocessing.errors import record
from torch.distributed.tensor.parallel import loss_parallel

from torchtrain.checkpoint import CheckpointManager, IntervalType
from torchtrain.config_manager import JobConfig
from torchtrain.datasets import create_tokenizer, dataloader_fn
from torchtrain.float8_linear import build_fp8_linear
from torchtrain.logging_utils import init_logger, logger
from torchtrain.lr_scheduling import get_lr_scheduler
from torchtrain.metrics import build_gpu_memory_monitor, build_metric_logger
from torchtrain.models import model_name_to_cls, model_name_to_tokenizer, models_config
from torchtrain.parallelisms import models_parallelize_fns, ParallelDims
from torchtrain.profiling import maybe_run_profiler
from torchtrain.utils import (
from torchtitan.checkpoint import CheckpointManager, IntervalType
from torchtitan.config_manager import JobConfig
from torchtitan.datasets import create_tokenizer, dataloader_fn
from torchtitan.float8_linear import build_fp8_linear
from torchtitan.logging_utils import init_logger, logger
from torchtitan.lr_scheduling import get_lr_scheduler
from torchtitan.metrics import build_gpu_memory_monitor, build_metric_logger
from torchtitan.models import model_name_to_cls, model_name_to_tokenizer, models_config
from torchtitan.parallelisms import models_parallelize_fns, ParallelDims
from torchtitan.profiling import maybe_run_profiler
from torchtitan.utils import (
Color,
dist_max,
dist_mean,
4 changes: 2 additions & 2 deletions train_configs/debug_model.toml
@@ -1,4 +1,4 @@
# TorchTrain Config.toml
# torchtitan Config.toml
[job]
dump_folder = "./outputs"
description = "LLaMA debug training"
@@ -20,7 +20,7 @@ save_tb_folder = "tb"
name = "llama"
flavor = "debugmodel"
norm_type = "fused_rmsnorm" # layernorm / np_layernorm / rmsnorm / fused_rmsnorm
tokenizer_path = "./torchtrain/datasets/tokenizer/tokenizer.model"
tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model"

[optimizer]
name = "AdamW"
4 changes: 2 additions & 2 deletions train_configs/llama_13b.toml
@@ -1,4 +1,4 @@
# TorchTrain Config.toml
# torchtitan Config.toml
[job]
dump_folder = "./outputs"
description = "LLaMA 13B training"
@@ -18,7 +18,7 @@ save_tb_folder = "tb"
name = "llama"
flavor = "13B"
norm_type = "rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]
tokenizer_path = "./torchtrain/datasets/tokenizer/tokenizer.model"
tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model"

[optimizer]
name = "AdamW"
4 changes: 2 additions & 2 deletions train_configs/llama_70b.toml
@@ -1,4 +1,4 @@
# TorchTrain Config.toml
# torchtitan Config.toml
[job]
dump_folder = "./outputs"
description = "LLaMA 70B training"
@@ -18,7 +18,7 @@ save_tb_folder = "tb"
name = "llama"
flavor = "70B"
norm_type = "rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]
tokenizer_path = "./torchtrain/datasets/tokenizer/tokenizer.model"
tokenizer_path = "./torchtitan/datasets/tokenizer/tokenizer.model"

[optimizer]
name = "AdamW"