[LLM-paddle] add llama1-7b pretrain with callback (#239)
* modify gitignore

* add paddle llama

* add recompute and sharding for llama7b

* adapt to the driver & fix start_paddle_task

* fix llama1-7b config files and trainer
fix llama1-7b docker run cmd
modify docker paddle version

* [callback] llama1-7B pretrain

* modify the llama case config name in test_conf.py
fix llama run_pretraining.py
fix llama1-13b config
fix llama1-7b and llama1-13b readme
[LLM] add llama1-13b pretrain
[LLM] llama1-7b pretrain with callback

* update config

* update config

* add metrics in README.md

* update README.md

* remove llama 13B files

---------

Co-authored-by: DrownFish19 <[email protected]>
LaiXinyi823 and DrownFish19 authored Sep 28, 2023
1 parent 9491411 commit bba2b79
Showing 26 changed files with 1,930 additions and 20 deletions.
2 changes: 1 addition & 1 deletion training/benchmarks/driver/__init__.py
@@ -1,3 +1,3 @@
-from .event import Event
 from .base import Driver
+from .event import Event
 from .log_event import LogEventManager
2 changes: 0 additions & 2 deletions training/benchmarks/driver/check.py
@@ -4,8 +4,6 @@

 import os
 import os.path as ospath
-from .dist_pytorch import global_batch_size
-
 
 def get_config_arg(config, name):
     if hasattr(config, name):
114 changes: 114 additions & 0 deletions training/benchmarks/driver/dist_paddle.py
@@ -0,0 +1,114 @@
import os
from contextlib import contextmanager

import paddle
import paddle.distributed as dist
from paddlenlp.trainer import (
TrainerCallback,
TrainerControl,
TrainerState,
TrainingArguments,
)
from paddlenlp.trainer.trainer_utils import IntervalStrategy

from .base import Driver
from .event import Event
from typing import Dict

def barrier():
if dist.is_initialized():
dist.barrier()

def is_main_process():
if dist.is_initialized():
if "PADDLE_TRAINER_ID" in os.environ:
return int(os.environ["PADDLE_TRAINER_ID"]) == 0
else:
return dist.get_rank() == 0

return True

class PaddleCallback(TrainerCallback):
def __init__(self, driver: Driver):
self.driver = driver

def on_init_end(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
self.driver.event(Event.INIT_END)

def on_train_begin(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
self.driver.event(Event.TRAIN_START)

def on_train_end(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
self.driver.event(Event.TRAIN_END)

def on_epoch_begin(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
self.driver.event(Event.EPOCH_BEGIN, epoch=state.epoch)

def on_epoch_end(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
self.driver.event(Event.EPOCH_END, epoch=state.epoch)

def on_step_begin(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
self.driver.event(Event.STEP_BEGIN, step=state.global_step + 1)

def on_evaluate(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs
):
logs = kwargs["metrics"]
logs["global_step"] = state.global_step
self.driver.event(Event.EVALUATE, result=logs)
if kwargs["metrics"]["eval_ppl"] < self.driver.config.target_ppl:
control.should_training_stop = True

def on_log(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
logs=None,
**kwargs
):
_ = logs.pop("total_flos", None)
if state.is_local_process_zero:
self.driver.logger.log(Event.STEP_END, message=logs)
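For reference, a `TrainerCallback` like the `PaddleCallback` above is attached by passing it in the `callbacks` list when the PaddleNLP `Trainer` is constructed. The sketch below is illustrative only; the driver, model, and dataset objects are placeholders and this is not the `run_pretraining.py` added by this commit:

```
# Illustrative sketch: model/tokenizer/dataset/driver are placeholders, not
# this commit's actual run_pretraining.py wiring.
from paddlenlp.trainer import Trainer, TrainingArguments


def build_trainer(driver, model, tokenizer, train_dataset, eval_dataset):
    args = TrainingArguments(
        output_dir="llama-paddle/output",
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=1,
        logging_steps=20,
        eval_steps=1000,
    )
    return Trainer(
        model=model,
        args=args,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        # The callback maps Trainer hooks to FlagPerf driver events
        # (INIT_END, TRAIN_START, STEP_BEGIN, EVALUATE, STEP_END, ...).
        callbacks=[PaddleCallback(driver=driver)],
    )
```

Note that `on_evaluate` above also sets `control.should_training_stop` once `eval_ppl` drops below `config.target_ppl`, so the benchmark can stop at the target quality rather than running a fixed number of steps.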
34 changes: 34 additions & 0 deletions training/benchmarks/llama1_7B/README.md
@@ -0,0 +1,34 @@
### Model Information
#### Model Introduction
We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.

Please refer to this paper for a detailed description of LLaMA1:
[LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)

#### Model Code Source
Paddle case code source:
https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/llama licensed under the Apache License, Version 2.0.

#### Dataset
##### Test Dataset Download
The test dataset provides preprocessed training samples from 100k OpenWebText documents:
```
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
```
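
A quick way to sanity-check the downloaded files with NumPy; the layout is assumed to be a flat token-id array plus per-document index arrays, so the key names are only inspected, not relied on:

```
# Sanity check of the downloaded dataset files (assumed layout: token ids
# in the .npy file, document index arrays in the .npz file).
import numpy as np

ids = np.load("llama_openwebtext_100k_ids.npy", mmap_mode="r")
idx = np.load("llama_openwebtext_100k_idx.npz")

print("token ids:", ids.shape, ids.dtype)
print("index arrays:", idx.files)
```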

##### Preprocessing
> No preprocessing is required.
#### Model Implementation
* Loaded automatically at runtime

#### Model Checkpoint
* Downloaded automatically at runtime
* Use of the Paddle LLaMA model weights must comply with the [License](../../paddlenlp/transformers/llama/LICENSE)

### Framework and Chip Support
|            | PyTorch | Paddle | TensorFlow2 |
| ---- | ---- | ---- | ---- |
| Nvidia GPU | N/A | ✅ | N/A |
| | | | |
2 changes: 2 additions & 0 deletions training/benchmarks/llama1_7B/paddle/config/__init__.py
@@ -0,0 +1,2 @@
from ._base import *
from .mutable_params import mutable_params
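`mutable_params` is defined in mutable_params.py, which is not shown in this excerpt; in FlagPerf cases it is commonly a plain list naming the config entries a vendor may override. A hypothetical illustration of its shape:

```
# Hypothetical illustration only; the actual list lives in
# training/benchmarks/llama1_7B/paddle/config/mutable_params.py.
mutable_params = [
    "vendor",
    "dist_backend",
    "per_device_train_batch_size",
    "gradient_accumulation_steps",
    "use_flash_attention",
    "use_fused_rms_norm",
]
```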
221 changes: 221 additions & 0 deletions training/benchmarks/llama1_7B/paddle/config/_base.py
@@ -0,0 +1,221 @@
# =========================================================
# Required parameters
# =========================================================
vendor: str = None

device: str = "gpu"


# =========================================================
# data
# =========================================================
# The name of the dataset to use (via the datasets library).
input_dir: str = "data"

# Train/valid/test data split.
split: str = "949,50,1"

# The maximum total input sequence length after tokenization. Sequences longer
# than this will be truncated, sequences shorter will be padded.
max_seq_length: int = 2048

# Mask token prob.
masked_lm_prob: float = 0.15

# Short sequence prob.
short_seq_prob: float = 0.

# Use share folder for data dir and output dir on multi machine.
share_folder: bool = False

# Whether to favor long ngrams
favor_longer_ngram: bool = False

# Max N Grams
max_ngrams: int = 3

# mmap/lazy format converted from preprocessed data.
data_impl: str = "mmap"

# Drop the last incomplete batch if it is not divisible by the batch size.
dataloader_drop_last: bool = False

# Number of subprocesses to use for data loading.
# 0 means that the data will be loaded in the main process.
dataloader_num_workers: int = 1


# =========================================================
# Model
# =========================================================
# Only support for llama pre-training for now.
model_type: str = "llama"

# Path to pretrained model or model identifier from https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html
model_name_or_path: str = "facebook/llama-7b"

# Pretrained tokenizer name or path if not the same as model_name
tokenizer_name_or_path: str = "facebook/llama-7b"

# Pre-training from existing PaddleNLP model weights. Defaults to False, in which case the model trains from scratch. If set to True, the model_name_or_path argument must exist in the PaddleNLP models.
continue_training: bool = True

# use flash attention
use_flash_attention: bool = False

# use fused rms_norm
use_fused_rms_norm: bool = False

# =========================================================
# trainer args
# =========================================================
# The output directory where the model predictions and checkpoints will be written.
output_dir: str = None

# Whether to run training.
do_train: bool = True

# Whether to run eval on the dev set.
do_eval: bool = True

# Batch size per GPU core/CPU for training.
per_device_train_batch_size: int = 1

# Batch size per GPU core/CPU for evaluation.
per_device_eval_batch_size: int = 1

# Number of updates steps to accumulate before performing a backward/update pass.
gradient_accumulation_steps: int = 1

# If > 0: set total number of training steps to perform. Override num_train_epochs.
max_steps: int = -1

# Log every X updates steps.
logging_steps: int = 20
log_freq = logging_steps

# Random seed that will be set at the beginning of training.
seed: int = 42

# Whether or not to use Paddle Sharding Data Parallel training (in distributed training
# only). The base option should be `stage1`, `stage2` or `stage3`, and you can add
# CPU offload to `stage2` or `stage3` like this: `stage2 offload` or `stage3 offload`.
# sharding: str = None

# tensor_parallel_degree sets how many parts each transformer layer is split into.
# Default -1 disables tensor parallelism. tensor_parallel_degree <= 8 is suggested for better performance.
# Note: this requires model support in the source code.
tensor_parallel_degree: int = -1

# pipeline_parallel_degree sets how many pipeline stages the transformer layers are split into.
# Default -1 disables pipeline parallelism.
# Note: this requires model support in the source code; see the llama modeling_pp.py file.
pipeline_parallel_degree: int = -1

# Recompute the forward pass to calculate gradients. Used for saving memory.
recompute: bool = True

# Whether or not to disable the tqdm progress bars.
disable_tqdm: bool = True

# Run an evaluation every X steps.
eval_steps: int = 1000

# Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
save_steps: int = 5000

# The steps used to control the learning rate. If step > decay_steps, min_lr is used.
decay_steps: int = None

# virtual_pp_degree
virtual_pp_degree: int = 1

# Use sequence parallelism. If mp_degree=1, sequence_parallel is forced to False.
sequence_parallel: bool = False

# Whether to use distributed dataloader
distributed_dataloader: bool = True

# Granularity of recompute during training.
# Options: `full`, `full_attn`, `core_attn`.
# `full` recomputes the entire transformer layer.
# `full_attn` recomputes only the self-attention part.
# `core_attn` recomputes only the `softmax(qkT)v` part.
# Note: in terms of memory usage, `core_attn` > `full_attn` > `full`; if the chosen strategy causes OOM, adjust it accordingly.
recompute_granularity: str = "full"

# target perplexity value
target_ppl: float = 10.0

# =========================================================
# fp16 config args
# =========================================================
# Whether to use fp16 (mixed) precision instead of 32-bit
fp16: bool = True

# For fp16: AMP optimization level selected in ['O0', 'O1', 'O2'].
fp16_opt_level: str = 'O0'

# Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA
# architecture or using CPU (no_cuda). This is an experimental API and it may change.
bf16: bool = False

# The value of initial scale_loss for fp16.
scale_loss: float = 1024.0


# =========================================================
# dist args
# =========================================================
# Whether to read local rank from ENVVAR
use_env: bool = True

# Communication backend for distributed training on gpus
dist_backend: str = "nccl"

local_rank: int = -1


# =========================================================
# lr_scheduler args
# =========================================================
# initial learning rate
learning_rate: float = 0.0001

# Minimum learning rate decayed to.
min_learning_rate: float = 1e-05

# Linear warmup over warmup_ratio fraction of total steps.
warmup_ratio: float = 0.01

# Linear warmup over warmup_steps.
warmup_steps: int = 0

# weight decay coefficient for L2 regularization
weight_decay: float = 0.01

# The scheduler type to use. Supported: linear, cosine, constant, constant_with_warmup.
lr_scheduler_type: str = "linear"


# =========================================================
# optimizer args
# =========================================================
# Beta1 for AdamW optimizer
adam_beta1: float = 0.9

# Beta2 for AdamW optimizer
adam_beta2: float = 0.999

# Epsilon for AdamW optimizer.
adam_epsilon: float = 1e-8

# Max gradient norm.
max_grad_norm: float = 1.0


# =========================================================
# load and save args
# =========================================================
# Path to a directory containing a model checkpoint.
output_dir: str = "llama-paddle/output"