[LLM-paddle] add llama1-7b pretrain with callback (#239)
* modify gitignore

* add paddle llama

* add recompute and sharding for llama7b

* adapt to the driver & fix start_paddle_task

* fix llama1-7b config files and trainer
fix llama1-7b docker run cmd
modify docker paddle version

* [callback] llama1-7B pretrain

* modify the llama case config name in test_conf.py
fix llama run_pretraining.py
fix llama1-13b config
fix llama1-7b and llama1-13b readme
[LLM] add llama1-13b pretrain
[LLM] llama1-7b pretrain with callback

* update config

* update config

* add metrics in README.md

* update README.md

* remove llama 13B files

---------

Co-authored-by: DrownFish19 <[email protected]>
LaiXinyi823 and DrownFish19 authored Sep 28, 2023
1 parent 9491411 commit bba2b79
Showing 26 changed files with 1,930 additions and 20 deletions.
2 changes: 1 addition & 1 deletion training/benchmarks/driver/__init__.py
@@ -1,3 +1,3 @@
-from .event import Event
 from .base import Driver
+from .event import Event
 from .log_event import LogEventManager
2 changes: 0 additions & 2 deletions training/benchmarks/driver/check.py
@@ -4,8 +4,6 @@

 import os
 import os.path as ospath
-from .dist_pytorch import global_batch_size
-
 
 def get_config_arg(config, name):
     if hasattr(config, name):
114 changes: 114 additions & 0 deletions training/benchmarks/driver/dist_paddle.py
@@ -0,0 +1,114 @@
import os
from contextlib import contextmanager

import paddle
import paddle.distributed as dist
from paddlenlp.trainer import (
TrainerCallback,
TrainerControl,
TrainerState,
TrainingArguments,
)
from paddlenlp.trainer.trainer_utils import IntervalStrategy

from .base import Driver
from .event import Event
from typing import Dict

def barrier():
if dist.is_initialized():
dist.barrier()

def is_main_process():
if dist.is_initialized():
if "PADDLE_TRAINER_ID" in os.environ:
return int(os.environ["PADDLE_TRAINER_ID"]) == 0
else:
return dist.get_rank() == 0

return True

class PaddleCallback(TrainerCallback):
def __init__(self, driver: Driver):
self.driver = driver

def on_init_end(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
self.driver.event(Event.INIT_END)

def on_train_begin(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
self.driver.event(Event.TRAIN_START)

def on_train_end(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
self.driver.event(Event.TRAIN_END)

def on_epoch_begin(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
self.driver.event(Event.EPOCH_BEGIN, epoch=state.epoch)

def on_epoch_end(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
self.driver.event(Event.EPOCH_END, epoch=state.epoch)

def on_step_begin(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
self.driver.event(Event.STEP_BEGIN, step=state.global_step + 1)

def on_evaluate(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
logs = kwargs["metrics"]
logs["global_step"] = state.global_step
self.driver.event(Event.EVALUATE, result=logs)
if kwargs["metrics"]["eval_ppl"] < self.driver.config.target_ppl:
control.should_training_stop = True

def on_log(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
logs=None,
**kwargs
):
_ = logs.pop("total_flos", None)
if state.is_local_process_zero:
self.driver.logger.log(Event.STEP_END, message=logs)
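For reference, a `TrainerCallback` like the `PaddleCallback` above is attached by passing it in the `callbacks` list when the PaddleNLP `Trainer` is constructed. The sketch below is illustrative only; the driver, model, and dataset objects are placeholders and this is not the `run_pretraining.py` added by this commit:

```
# Illustrative sketch: model/tokenizer/dataset/driver are placeholders, not
# this commit's actual run_pretraining.py wiring.
from paddlenlp.trainer import Trainer, TrainingArguments


def build_trainer(driver, model, tokenizer, train_dataset, eval_dataset):
    args = TrainingArguments(
        output_dir="llama-paddle/output",
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=1,
        logging_steps=20,
        eval_steps=1000,
    )
    return Trainer(
        model=model,
        args=args,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        # The callback maps Trainer hooks to FlagPerf driver events
        # (INIT_END, TRAIN_START, STEP_BEGIN, EVALUATE, STEP_END, ...).
        callbacks=[PaddleCallback(driver=driver)],
    )
```

Note that `on_evaluate` above also sets `control.should_training_stop` once `eval_ppl` drops below `config.target_ppl`, so the benchmark can stop at the target quality rather than running a fixed number of steps.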
34 changes: 34 additions & 0 deletions training/benchmarks/llama1_7B/README.md
@@ -0,0 +1,34 @@
### Model Information
#### Model Introduction
We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.

Please refer to this paper for a detailed description of LLaMA1:
[LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)

#### Model Code Source
Paddle case code source:
https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/llama licensed under the Apache License, Version 2.0.

#### Dataset
##### Test Dataset Download
The test dataset provides preprocessed training samples from 100k OpenWebText documents:
```
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
```
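
A quick way to sanity-check the downloaded files with NumPy; the layout is assumed to be a flat token-id array plus per-document index arrays, so the key names are only inspected, not relied on:

```
# Sanity check of the downloaded dataset files (assumed layout: token ids
# in the .npy file, document index arrays in the .npz file).
import numpy as np

ids = np.load("llama_openwebtext_100k_ids.npy", mmap_mode="r")
idx = np.load("llama_openwebtext_100k_idx.npz")

print("token ids:", ids.shape, ids.dtype)
print("index arrays:", idx.files)
```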

##### Preprocessing
> No preprocessing is required.
#### Model Implementation
* Loaded automatically at runtime

#### Model Checkpoint
* Downloaded automatically at runtime
* Use of the Paddle LLaMA model weights must comply with the [License](../../paddlenlp/transformers/llama/LICENSE)

### Framework and Chip Support
|            | PyTorch | Paddle | TensorFlow2 |
| ---- | ---- | ---- | ---- |
| Nvidia GPU | N/A | ✅ | N/A |
| | | | |
2 changes: 2 additions & 0 deletions training/benchmarks/llama1_7B/paddle/config/__init__.py
@@ -0,0 +1,2 @@
from ._base import *
from .mutable_params import mutable_params
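`mutable_params` is defined in mutable_params.py, which is not shown in this excerpt; in FlagPerf cases it is commonly a plain list naming the config entries a vendor may override. A hypothetical illustration of its shape:

```
# Hypothetical illustration only; the actual list lives in
# training/benchmarks/llama1_7B/paddle/config/mutable_params.py.
mutable_params = [
    "vendor",
    "dist_backend",
    "per_device_train_batch_size",
    "gradient_accumulation_steps",
    "use_flash_attention",
    "use_fused_rms_norm",
]
```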
221 changes: 221 additions & 0 deletions training/benchmarks/llama1_7B/paddle/config/_base.py
@@ -0,0 +1,221 @@
# =========================================================
# Required parameters
# =========================================================
vendor: str = None

device: str = "gpu"


# =========================================================
# data
# =========================================================
# The name of the dataset to use (via the datasets library).
input_dir: str = "data"

# Train/valid/test data split.
split: str = "949,50,1"

# The maximum total input sequence length after tokenization. Sequences longer
# than this will be truncated, sequences shorter will be padded.
max_seq_length: int = 2048

# Mask token prob.
masked_lm_prob: float = 0.15

# Short sequence prob.
short_seq_prob: float = 0.

# Use share folder for data dir and output dir on multi machine.
share_folder: bool = False

# Whether to favor long ngrams
favor_longer_ngram: bool = False

# Max N Grams
max_ngrams: int = 3

# mmap/lazy format converted from preprocessed data.
data_impl: str = "mmap"

# Drop the last incomplete batch if it is not divisible by the batch size.
dataloader_drop_last: bool = False

# Number of subprocesses to use for data loading.
# 0 means that the data will be loaded in the main process.
dataloader_num_workers: int = 1


# =========================================================
# Model
# =========================================================
# Only support for llama pre-training for now.
model_type: str = "llama"

# Path to pretrained model or model identifier from https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html
model_name_or_path: str = "facebook/llama-7b"

# Pretrained tokenizer name or path if not the same as model_name
tokenizer_name_or_path: str = "facebook/llama-7b"

# Pre-training from existing PaddleNLP model weights. Defaults to False, in which case the model trains from scratch. If set to True, the model_name_or_path argument must exist in the PaddleNLP models.
continue_training: bool = True

# use flash attention
use_flash_attention: bool = False

# use fused rms_norm
use_fused_rms_norm: bool = False

# =========================================================
# trainer args
# =========================================================
# The output directory where the model predictions and checkpoints will be written.
output_dir: str = None

# Whether to run training.
do_train: bool = True

# Whether to run eval on the dev set.
do_eval: bool = True

# Batch size per GPU core/CPU for training.
per_device_train_batch_size: int = 1

# Batch size per GPU core/CPU for evaluation.
per_device_eval_batch_size: int = 1

# Number of updates steps to accumulate before performing a backward/update pass.
gradient_accumulation_steps: int = 1

# If > 0: set total number of training steps to perform. Override num_train_epochs.
max_steps: int = -1

# Log every X updates steps.
logging_steps: int = 20
log_freq = logging_steps

# Random seed that will be set at the beginning of training.
seed: int = 42

# Whether or not to use Paddle Sharding Data Parallel training (in distributed training
# only). The base option should be `stage1`, `stage2` or `stage3`, and you can add
# CPU offload to `stage2` or `stage3` like this: `stage2 offload` or `stage3 offload`.
# sharding: str = None

# tensor_parallel_degree sets how many parts each transformer layer is split into.
# Default -1 disables tensor parallelism. tensor_parallel_degree <= 8 is suggested for better performance.
# Note: this requires model support in the source code.
tensor_parallel_degree: int = -1

# pipeline_parallel_degree sets how many pipeline stages the transformer layers are split into.
# Default -1 disables pipeline parallelism.
# Note: this requires model support in the source code; see the llama modeling_pp.py file.
pipeline_parallel_degree: int = -1

# Recompute the forward pass to calculate gradients. Used for saving memory.
recompute: bool = True

# Whether or not to disable the tqdm progress bars.
disable_tqdm: bool = True

# Run an evaluation every X steps.
eval_steps: int = 1000

# Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
save_steps: int = 5000

# The steps used to control the learning rate. If step > decay_steps, min_lr is used.
decay_steps: int = None

# virtual_pp_degree
virtual_pp_degree: int = 1

# Use sequence parallelism. If mp_degree=1, sequence_parallel is forced to False.
sequence_parallel: bool = False

# Whether to use distributed dataloader
distributed_dataloader: bool = True

# Granularity of recompute during training.
# Options: `full`, `full_attn`, `core_attn`.
# `full` recomputes the entire transformer layer.
# `full_attn` recomputes only the self-attention part.
# `core_attn` recomputes only the `softmax(qkT)v` part.
# Note: in terms of memory usage, `core_attn` > `full_attn` > `full`; if the chosen strategy causes OOM, adjust it accordingly.
recompute_granularity: str = "full"

# target perplexity value
target_ppl: float = 10.0

# =========================================================
# fp16 config args
# =========================================================
# Whether to use fp16 (mixed) precision instead of 32-bit
fp16: bool = True

# For fp16: AMP optimization level selected in ['O0', 'O1', 'O2'].
fp16_opt_level: str = 'O0'

# Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA
# architecture or using CPU (no_cuda). This is an experimental API and it may change.
bf16: bool = False

# The value of initial scale_loss for fp16.
scale_loss: float = 1024.0


# =========================================================
# dist args
# =========================================================
# Whether to read local rank from ENVVAR
use_env: bool = True

# Communication backend for distributed training on gpus
dist_backend: str = "nccl"

local_rank: int = -1


# =========================================================
# lr_scheduler args
# =========================================================
# initial learning rate
learning_rate: float = 0.0001

# Minimum learning rate decayed to.
min_learning_rate: float = 1e-05

# Linear warmup over warmup_ratio fraction of total steps.
warmup_ratio: float = 0.01

# Linear warmup over warmup_steps.
warmup_steps: int = 0

# weight decay coefficient for L2 regularization
weight_decay: float = 0.01

# The scheduler type to use. Supported: linear, cosine, constant, constant_with_warmup.
lr_scheduler_type: str = "linear"


# =========================================================
# optimizer args
# =========================================================
# Beta1 for AdamW optimizer
adam_beta1: float = 0.9

# Beta2 for AdamW optimizer
adam_beta2: float = 0.999

# Epsilon for AdamW optimizer.
adam_epsilon: float = 1e-8

# Max gradient norm.
max_grad_norm: float = 1.0


# =========================================================
# load and save args
# =========================================================
# Path to a directory containing a model checkpoint.
output_dir: str = "llama-paddle/output"