Add openwebtext dataset for larger scale training without shuffling (pytorch#130)

This PR adds the openwebtext 1M dataset.
This is a homogeneous dataset, so we are able to train successfully without any shuffling in our dataset loader.

1 - adds the dataset to hf_datasets (see the usage sketch below)
2 - makes openwebtext the default dataset for the 13B and 70B train configs, since it is the preferred choice for larger scale training.
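For reference, a minimal sketch of pulling the same records directly with the Hugging Face `datasets` library. The `Skylion007/openwebtext` hub path comes from the diff below; `streaming=True` is an assumption here, purely to avoid downloading the full corpus:

```python
# Hedged sketch: inspect the raw openwebtext records this PR wires up.
# "Skylion007/openwebtext" is the hub path registered in hf_datasets.py below;
# streaming=True is an assumption so the full corpus is not downloaded.
from datasets import load_dataset

ds = load_dataset("Skylion007/openwebtext", split="train", streaming=True)
sample = next(iter(ds))
print(sample["text"][:200])  # each record is a single {"text": ...} field
```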

Testing - ran 5K iterations (9 nodes) to verify there were no spiking issues:

![Screenshot 2024-03-12 at 9 50 57 AM](https://github.com/pytorch/torchtrain/assets/46302957/420fa1fc-50f8-47bc-9b07-02c8fa132e7c)
lessw2020 authored Mar 12, 2024
1 parent 66c196b commit 10229d6
Showing 4 changed files with 16 additions and 4 deletions.
1 change: 1 addition & 0 deletions torchtrain/datasets/__init__.py
@@ -13,4 +13,5 @@
     "alpaca": build_hf_data_loader,
     "minipile": build_hf_data_loader,
     "c4": build_hf_data_loader,
+    "openwebtext": build_hf_data_loader,
 }
15 changes: 13 additions & 2 deletions torchtrain/datasets/hf_datasets.py
@@ -17,6 +17,7 @@
     "alpaca": "tatsu-lab/alpaca",
     "minipile": "JeanKaddour/minipile",
     "c4": "allenai/c4",
+    "openwebtext": "Skylion007/openwebtext",
 }


@@ -32,9 +33,10 @@ class HuggingFaceDataset(IterableDataset):
         rank (int): rank of the current data parallel process
         infinite (bool): whether to loop infinitely over the dataset
-    We currently support three datasets:
+    We currently support four datasets:
     alpaca (52K training entries)
-    minipile (1M training entries)
+    minipile (1M training entries, amalgamated from other datasets)
+    openwebtext (1M training entries, same type of data for entire dataset)
     c4 (177M training entries - this dataset is streamed due to the size)
@@ -65,6 +67,15 @@ class HuggingFaceDataset(IterableDataset):
         'timestamp': '2019-04-25T12:57:54Z'
     }
+    >> OpenWebText <<:
+    OpenWeb crawl, English
+    Example:
+    {
+        'text': "Amazon has launched a new cheaper version of its Echo Dot voice-controlled device today.
+        The launch comes six months after Amazon first introduced two new Echo devices —
+        one of which was the $90 Echo Dot,..."
+    }
+
     Example use (alpaca):
     >>> alpaca_ds = HuggingFaceDataset(dataset_name="alpaca", dataset_path=None, tokenizer=tokenizer)
     >>> for batch in Dataloader(alpaca_ds, batch_size=8):
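Following the alpaca example in the docstring above, a hypothetical equivalent for the new dataset (a sketch only; the `HuggingFaceDataset` constructor signature is taken from the docstring, and `tokenizer` stands in for whatever tokenizer the training script builds):

```python
# Sketch mirroring the docstring's alpaca example, swapped to openwebtext.
# Assumes `tokenizer` is already constructed by the surrounding training code.
from torch.utils.data import DataLoader

owt_ds = HuggingFaceDataset(dataset_name="openwebtext", dataset_path=None, tokenizer=tokenizer)
for batch in DataLoader(owt_ds, batch_size=8):
    ...  # consume tokenized batches; note that no shuffling is applied
```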
2 changes: 1 addition & 1 deletion train_configs/llama_13b.toml
@@ -40,4 +40,4 @@ compile = false
 checkpoint_interval = 3600
 checkpoint_interval_type = "steps"
 checkpoint_folder = ""
-dataset = "minipile"
+dataset = "openwebtext"
2 changes: 1 addition & 1 deletion train_configs/llama_70b.toml
@@ -40,4 +40,4 @@ compile = false
 checkpoint_interval = 3600
 checkpoint_interval_type = "steps"
 checkpoint_folder = ""
-dataset = "minipile"
+dataset = "openwebtext"
