llama2 7B pretrain standard case #289

Merged 5 commits on Oct 27, 2023
27 changes: 27 additions & 0 deletions training/benchmarks/llama2_7b/deepspeed-nvidia/README.md
@@ -0,0 +1,27 @@
## Model Information

Llama 2 is a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. Meta's fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Llama 2 models outperform open-source chat models on most benchmarks Meta's researchers tested, and based on their human evaluations for helpfulness and safety, may be a suitable substitute for closed-source models. Meta provides a detailed description of their approach to fine-tuning and safety improvements of Llama 2-Chat in order to enable the community to build on their work and contribute to the responsible development of LLMs.

## Model Config and Tokenizer Preparation

This test case is a pretraining case, which requires downloading the model config file and the tokenizer.

A prepared llama2_7b_hf/ directory is already provided under this test case directory. Compared with the original Hugging Face version, it changes Llama 2's max_position_embeddings from 2048 to 4096; the original value of 2048 is a bug in the Hugging Face transformers library.
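
A minimal sanity-check sketch (assuming the transformers library is available and that the command is run from this case directory so that llama2_7b_hf/ resolves; the check itself is not part of the case):

```python
from transformers import AutoConfig, AutoTokenizer

# Load the provided (patched) config and tokenizer from the case directory.
config = AutoConfig.from_pretrained("llama2_7b_hf")
tokenizer = AutoTokenizer.from_pretrained("llama2_7b_hf")

# The provided config carries the corrected context length.
assert config.max_position_embeddings == 4096
print(config.model_type, config.max_position_embeddings, len(tokenizer))
```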

## Data Preparation

Data preparation for this test case consists of 4 steps:

1. Download the raw openwebtext archive, namely:

   the 12 GB openwebtext.tar.xz from https://drive.google.com/drive/folders/1IaD_SIIB-K3Sij_-JjWoPy_UrWqQRdjx

2. Extract everything

   After extracting the 12 GB archive above, you will get a number of compressed files named like urlsf_subsetxxxxxx.xz. Extract all of them into the same directory; in the end you obtain more than 7,000,000 txt files.

3. Run the data preprocessing script

   Run preprocess/data_process.py with its 4 command-line arguments configured. The recommended default token count is 100M, i.e. 100 million tokens. With this configuration, training is expected to take about 1 hour on 8 A800 GPUs.

4. Place the output file (usually openwebtext_llama2_100M.npy) under data_dir; a quick check of the placed file is sketched after this list.
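
The preprocessing output is consumed downstream as a flat array of token ids. A minimal sketch of checking the placed file (the filename is the one mentioned above; the data_dir value below is a placeholder):

```python
import os

import numpy as np

data_dir = "/path/to/data_dir"  # placeholder; use the data_dir configured for this case
tokens = np.load(os.path.join(data_dir, "openwebtext_llama2_100M.npy"))

# Expect a 1-D array of roughly 100 million token ids within the Llama 2 vocab (32000).
print(tokens.shape, tokens.dtype)
assert tokens.ndim == 1 and int(tokens.max()) < 32000
```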
@@ -0,0 +1 @@
from .llama_dataset import get_llama_dataset
@@ -0,0 +1,27 @@
import os

import numpy as np
import torch
from torch.utils.data import Dataset


class Llama2PretrainDataset(Dataset):
    """Pretraining dataset that slices a flat token-id array into fixed-length samples."""

    def __init__(self, npy_file, item_length):
        # Load the preprocessed token ids and keep them as a single flat tensor.
        data = np.load(npy_file)
        self.data = torch.from_numpy(data)
        self.item_length = item_length
        # Keep only the largest prefix that is a whole multiple of item_length.
        self.length = len(data) // item_length * item_length

    def __getitem__(self, index):
        # Each sample is a contiguous, non-overlapping chunk of item_length tokens.
        start = index * self.item_length
        end = start + self.item_length
        return self.data[start:end]

    def __len__(self):
        return self.length // self.item_length


def get_llama_dataset(args, seqlength, datafilename):
    # Build the dataset from the preprocessed .npy file located under args.data_dir.
    dataset = Llama2PretrainDataset(os.path.join(args.data_dir, datafilename),
                                    seqlength)
    return dataset
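
A minimal usage sketch for the dataset above (the args object and file name below are illustrative; the real training entry point supplies its own arguments):

```python
from types import SimpleNamespace

from torch.utils.data import DataLoader

# Hypothetical args; only data_dir is needed by get_llama_dataset.
args = SimpleNamespace(data_dir="/path/to/data_dir")
dataset = get_llama_dataset(args, 4096, "openwebtext_llama2_100M.npy")

loader = DataLoader(dataset, batch_size=1, shuffle=True, drop_last=True)
for batch in loader:
    # Each batch is a tensor of token ids with shape (batch_size, 4096).
    print(batch.shape)
    break
```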
40 changes: 40 additions & 0 deletions training/benchmarks/llama2_7b/deepspeed-nvidia/ds_config.json
@@ -0,0 +1,40 @@
{
    "gradient_accumulation_steps": 1,
    "train_micro_batch_size_per_gpu": 1,
    "prescale_gradients": false,
    "zero_allow_untested_optimizer": true,
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 1e-5,
            "weight_decay": 0.1,
            "betas": [
                0.9,
                0.95
            ],
            "eps": 1e-5
        }
    },
    "zero_optimization": {
        "stage": 3,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_prefetch_bucket_size": 1e7,
        "sub_group_size": 1e9,
        "contiguous_gradients": true,
        "allgather_bucket_size": 1e8,
        "reduce_bucket_size": 1e7,
        "overlap_comm": true,
        "reduce_scatter": true
    },
    "steps_per_print": 50,
    "gradient_clipping": 1.0,
    "wall_clock_breakdown": false,
    "bf16": {
        "enabled": true
    },
    "activation_checkpointing": {
        "partition_activations": true,
        "contiguous_memory_optimization": false
    }
}
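
A minimal sketch of how a config file like this is typically consumed (assuming deepspeed is installed and the script is started via the deepspeed launcher; the toy model below stands in for the real LlamaForCausalLM):

```python
import deepspeed
import torch

# Toy stand-in model; the actual case builds LlamaForCausalLM from the provided config.
model = torch.nn.Linear(4096, 4096)

# deepspeed.initialize accepts a path to the JSON config (or an equivalent dict).
engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="ds_config.json",
)
```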
@@ -0,0 +1,25 @@
{
    "architectures": [
        "LlamaForCausalLM"
    ],
    "bos_token_id": 1,
    "eos_token_id": 2,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "initializer_range": 0.02,
    "intermediate_size": 11008,
    "max_position_embeddings": 4096,
    "model_type": "llama",
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "num_key_value_heads": 32,
    "pad_token_id": 0,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-05,
    "rope_scaling": null,
    "tie_word_embeddings": false,
    "torch_dtype": "float16",
    "transformers_version": "4.31.0",
    "use_cache": true,
    "vocab_size": 32000
}
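
For a pretraining case, a config like this is normally used to build a randomly initialized model rather than to load pretrained weights. A minimal sketch, assuming transformers is installed and the file sits in llama2_7b_hf/ together with the tokenizer:

```python
from transformers import LlamaConfig, LlamaForCausalLM

# Instantiate the 7B architecture from the config only; weights start randomly initialized.
config = LlamaConfig.from_pretrained("llama2_7b_hf")
model = LlamaForCausalLM(config)
print(f"{sum(p.numel() for p in model.parameters()) / 1e9:.1f}B parameters")
```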
@@ -0,0 +1,23 @@
{
    "bos_token": {
        "content": "<s>",
        "lstrip": false,
        "normalized": true,
        "rstrip": false,
        "single_word": false
    },
    "eos_token": {
        "content": "</s>",
        "lstrip": false,
        "normalized": true,
        "rstrip": false,
        "single_word": false
    },
    "unk_token": {
        "content": "<unk>",
        "lstrip": false,
        "normalized": true,
        "rstrip": false,
        "single_word": false
    }
}