diff --git a/training/benchmarks/driver/__init__.py b/training/benchmarks/driver/__init__.py index b83d7ce65..09a25f5e8 100644 --- a/training/benchmarks/driver/__init__.py +++ b/training/benchmarks/driver/__init__.py @@ -1,4 +1,3 @@ from .base import Driver -from .callback_paddle import PaddleCallback from .event import Event from .log_event import LogEventManager diff --git a/training/benchmarks/driver/callback_paddle.py b/training/benchmarks/driver/callback_paddle.py deleted file mode 100644 index 6f4eeae92..000000000 --- a/training/benchmarks/driver/callback_paddle.py +++ /dev/null @@ -1,92 +0,0 @@ -from .base import Driver -from. event import Event -from paddlenlp.trainer import ( - TrainerCallback, - TrainerControl, - TrainerState, - TrainingArguments, -) - - -class PaddleCallback(TrainerCallback): - def __init__(self, driver: Driver): - self.driver = driver - - def on_init_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerState, - **kwargs - ): - self.driver.event(Event.INIT_END) - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs - ): - self.driver.event(Event.TRAIN_START) - - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs - ): - self.driver.event(Event.TRAIN_END) - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs - ): - self.driver.event(Event.EPOCH_BEGIN, epoch=state.epoch) - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs - ): - self.driver.event(Event.EPOCH_END, epoch=state.epoch) - - def on_step_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs - ): - # print("STATE:", state) - self.driver.event(Event.STEP_BEGIN, step=state.global_step + 1) - - # def on_step_end( - # self, - # args: TrainingArguments, - # state: TrainerState, - # control: TrainerControl, - # **kwargs - # ): - # pass - # # self.driver.event(Event.STEP_END, step=state.global_step) - - def on_evaluate( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs - ): - self.driver.event(Event.EVALUATE) - - def on_log(self, args, state, control, logs=None, **kwargs): - _ = logs.pop("total_flos", None) - if state.is_local_process_zero: - self.driver.logger.log(Event.STEP_END, message=logs) diff --git a/training/benchmarks/driver/dist_paddle.py b/training/benchmarks/driver/dist_paddle.py index dc6c19570..e6e3f4690 100644 --- a/training/benchmarks/driver/dist_paddle.py +++ b/training/benchmarks/driver/dist_paddle.py @@ -1,82 +1,24 @@ import os from contextlib import contextmanager -import random -import numpy as np + import paddle import paddle.distributed as dist - +from paddlenlp.trainer import ( + TrainerCallback, + TrainerControl, + TrainerState, + TrainingArguments, +) +from paddlenlp.trainer.trainer_utils import IntervalStrategy + +from .base import Driver +from .event import Event +from typing import Dict def barrier(): if dist.is_initialized(): dist.barrier() -def set_seed(args): - if args.device == "cpu": - idx = 0 - else: - idx = paddle.distributed.get_rank() - random.seed(args.seed + idx) - np.random.seed(args.seed + idx) - paddle.seed(args.seed + idx) - - -def get_rank(default=0): - """ - Gets distributed rank or returns zero if distributed is not initialized. 
- """ - if dist.is_initialized(): - rank = dist.get_rank() - else: - rank = default - return rank - - -def get_world_size(): - """ - Gets total number of distributed workers or returns one if distributed is - not initialized. - """ - if dist.is_initialized(): - world_size = dist.get_world_size() - else: - world_size = 1 - return world_size - - -def main_proc_print(*args, **kwargs): - if is_main_process(): - print(*args, **kwargs) - - -def init_dist_training_env(config): - if dist.get_world_size() <= 1: - config.device = paddle.device.get_device() - config.world_size = get_world_size() - else: - dist.init_parallel_env() - config.device = paddle.device.get_device() - config.world_size = get_world_size() - print("------------------------") - print("device numbers:", config.world_size) - print("the processing uses", config.device) - return - - -def global_batch_size(config): - - return config.per_device_train_batch_size * config.world_size - - -@contextmanager -def sync_workers(): - """ - Yields distributed rank and synchronizes all workers on exit. - """ - rank = get_rank() - yield rank - barrier() - - def is_main_process(): if dist.is_initialized(): if "PADDLE_TRAINER_ID" in os.environ: @@ -86,15 +28,83 @@ def is_main_process(): return True - -def format_step(step): - if isinstance(step, str): - return step - s = "" - if len(step) > 0: - s += "Training Epoch: {} ".format(step[0]) - if len(step) > 1: - s += "Training Iteration: {} ".format(step[1]) - if len(step) > 2: - s += "Validation Iteration: {} ".format(step[2]) - return s +class PaddleCallback(TrainerCallback): + def __init__(self, driver: Driver): + self.driver = driver + + def on_init_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerState, + **kwargs + ): + self.driver.event(Event.INIT_END) + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): + self.driver.event(Event.TRAIN_START) + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): + self.driver.event(Event.TRAIN_END) + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): + self.driver.event(Event.EPOCH_BEGIN, epoch=state.epoch) + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): + self.driver.event(Event.EPOCH_END, epoch=state.epoch) + + def on_step_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): + self.driver.event(Event.STEP_BEGIN, step=state.global_step + 1) + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): + logs = kwargs["metrics"] + logs["global_step"] = state.global_step + self.driver.event(Event.EVALUATE, result=logs) + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs=None, + **kwargs + ): + _ = logs.pop("total_flos", None) + if state.is_local_process_zero: + self.driver.logger.log(Event.STEP_END, message=logs) \ No newline at end of file diff --git a/training/benchmarks/llama1_13B/README.md b/training/benchmarks/llama1_13B/README.md new file mode 100644 index 000000000..242c88313 --- /dev/null +++ b/training/benchmarks/llama1_13B/README.md @@ -0,0 +1,43 @@ +### 模型信息 +#### 模型介绍 +We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. 
diff --git a/training/benchmarks/llama1_13B/README.md b/training/benchmarks/llama1_13B/README.md new file mode 100644 index 000000000..242c88313 --- /dev/null +++ b/training/benchmarks/llama1_13B/README.md @@ -0,0 +1,43 @@
+### Model Information
+#### Model Introduction
+We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.
+
+Please refer to this paper for a detailed description of LLaMA1:
+[LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
+
+#### Model Code Source
+The Paddle case code comes from:
+https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/llama, licensed under the Apache License, Version 2.0.
+
+
+#### Dataset
+##### Test Dataset Download
+The test dataset provides preprocessed training samples covering 100k documents:
+```
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
+```
+
+##### Preprocessing
+> No preprocessing is required
+
+#### Model Implementation
+* Loaded automatically at run time
+
+#### Model Checkpoint
+* Downloaded automatically at run time; parameter count: 13B
+* Use of the Paddle LLaMA model weights must follow the [License](../../paddlenlp/transformers/llama/LICENSE).
+
+### Framework and Hardware Support
+| | Pytorch |Paddle|TensorFlow2|
+| ---- | ---- | ---- | ---- |
+| Nvidia GPU |N/A |✅ |N/A|
+| | | | |
diff --git a/training/benchmarks/llama1_13B/paddle b/training/benchmarks/llama1_13B/paddle new file mode 120000 index 000000000..8b077270f --- /dev/null +++ b/training/benchmarks/llama1_13B/paddle @@ -0,0 +1 @@
+/ssd2/laixinyi/projects/FlagPerf/training/benchmarks/llama1_7B/paddle
\ No newline at end of file
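Both READMEs reference the same preprocessed OpenWebText sample. As a quick post-download sanity check, the two files can be inspected with NumPy; this is an illustrative sketch based on the field layout the old dataloader (removed later in this patch) relied on, where the per-document lengths in `_idx.npz` should sum to the length of the flat token array in `_ids.npy`:

```python
import numpy as np

# Flat 1-D array holding the token ids of all documents, end to end.
ids = np.load("llama_openwebtext_100k_ids.npy", mmap_mode="r", allow_pickle=True)
# Per-document token counts; their sum should equal len(ids).
lens = np.load("llama_openwebtext_100k_idx.npz")["lens"]

assert lens.sum() == len(ids)
print(f"{len(lens)} documents, {len(ids)} tokens in total")
```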
diff --git a/training/benchmarks/llama1_7B/README.md b/training/benchmarks/llama1_7B/README.md index e69de29bb..a007dcf8c 100644 --- a/training/benchmarks/llama1_7B/README.md +++ b/training/benchmarks/llama1_7B/README.md @@ -0,0 +1,43 @@
+### Model Information
+#### Model Introduction
+We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly available datasets exclusively, without resorting to proprietary and inaccessible datasets. In particular, LLaMA-13B outperforms GPT-3 (175B) on most benchmarks, and LLaMA-65B is competitive with the best models, Chinchilla-70B and PaLM-540B. We release all our models to the research community.
+
+Please refer to this paper for a detailed description of LLaMA1:
+[LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
+
+#### Model Code Source
+The Paddle case code comes from:
+https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm/llama, licensed under the Apache License, Version 2.0.
+
+
+#### Dataset
+##### Test Dataset Download
+The test dataset provides preprocessed training samples covering 100k documents:
+```
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
+```
+
+##### Preprocessing
+> No preprocessing is required
+
+#### Model Implementation
+* Loaded automatically at run time
+
+#### Model Checkpoint
+* Downloaded automatically at run time; parameter count: 7B
+* Use of the Paddle LLaMA model weights must follow the [License](../../paddlenlp/transformers/llama/LICENSE).
+
+### Framework and Hardware Support
+| | Pytorch |Paddle|TensorFlow2|
+| ---- | ---- | ---- | ---- |
+| Nvidia GPU |N/A |✅ |N/A|
+| | | | |
\ No newline at end of file
diff --git a/training/benchmarks/llama1_7B/paddle/config/_base.py b/training/benchmarks/llama1_7B/paddle/config/_base.py index c7884e5fc..b84c49907 100644 --- a/training/benchmarks/llama1_7B/paddle/config/_base.py +++ b/training/benchmarks/llama1_7B/paddle/config/_base.py @@ -9,23 +9,39 @@
 # =========================================================
 # data
 # =========================================================
-# vocab file path
-tokenizer_vocab_file : str = 'sentencepiece.bpe.model'
-
 # The name of the dataset to use (via the datasets library).
 input_dir : str = "data"
 
 # Train/valid/test data split.
 split: str = "949,50,1"
 
-# The maximum total input sequence length after tokenization. Sequences longer
+# The maximum total input sequence length after tokenization. Sequences longer
+# than this will be truncated, sequences shorter will be padded.
 max_seq_length: int = 2048
 
+# Mask token prob.
+masked_lm_prob: float = 0.15
+
+# Short sequence prob.
+short_seq_prob: float = 0.
+
 # Use share folder for data dir and output dir on multi machine.
 share_folder: bool = False
 
-dataset_rank: int = 0
+# Whether to favor long ngrams
+favor_longer_ngram: bool = False
+
+# Max N Grams
+max_ngrams: int = 3
 
+# mmap/lazy format converted from preprocessed data.
+data_impl: str = "mmap"
+
+# Drop the last incomplete batch if it is not divisible by the batch size.
+dataloader_drop_last: bool = False
+
+# Number of subprocesses to use for data loading.
+# 0 means that the data will be loaded in the main process.
 dataloader_num_workers: int = 1
 
@@ -35,139 +51,106 @@
 # Only support for llama pre-training for now.
 model_type: str = "llama"
 
-model_name_or_path: str = "facebook/llama-7b"
-
-hidden_size: int = 4096  # 4096, 768
-
-initializer_range: float = 0.02
-
-intermediate_size: int = 11008
-
-lm_shift_labels: bool = False
-
-max_position_embeddings: int = 2048
-
-num_attention_heads: int = 32  # 32, 8
-
-num_hidden_layers: int = 32  # 32, 2
-
-rms_norm_eps: float = 1e-06
-
-vocab_size: int = 32000
-
-bos_token_id: int = 1
-
-eos_token_id: int = 2
-
-pad_token_id: int = 0
-
-use_cache: bool = False
-
-recompute: bool = True
-
-tensor_parallel_output: bool = True
-
-tie_word_embeddings: bool = False
-
-use_flash_attention: bool = False
+# Path to pretrained model or model identifier from https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html
+model_name_or_path: str = "facebook/llama-7b"
 
 # Pretrained tokenizer name or path if not the same as model_name
 tokenizer_name_or_path: str = "facebook/llama-7b"
 
-# llama, use_fused_rms_norm
-use_fused_rms_norm: bool = False
-
-# gpt, fuse_attention_qkv
-fuse_attention_qkv: bool = True
-
-fuse_attention_ffn: bool = False
-
-# full core_attn
-recompute_granularity: str = "full"
-
 # Pre-training from existing paddlenlp model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddlenlp models.
 continue_training: bool = True
 
-num_workers: int = 1
-
-dataloader_drop_last: bool = False
-
-dataset_world_size: int = 1
+# use flash attention
+use_flash_attention: bool = False
+
+# use fused rms_norm
+use_fused_rms_norm: bool = False
 
 # =========================================================
 # trainer args
 # =========================================================
-# Do trainingFalse
+# The output directory where the model predictions and checkpoints will be written.
+output_dir: str = None
+
+# Whether to run training.
 do_train: bool = True
 
+# Whether to run eval on the dev set.
 do_eval: bool = True
 
-# Total number of training steps to perform.
-max_steps: int = 10000
-
+# Batch size per GPU core/CPU for training.
 per_device_train_batch_size: int = 1
 
+# Batch size per GPU core/CPU for evaluation.
 per_device_eval_batch_size: int = 1
 
-# Total number of training samples to run.
-max_samples_termination: float = 120000
+# Number of updates steps to accumulate before performing a backward/update pass.
+gradient_accumulation_steps: int = 1
 
-# frequency of logging loss. If not positive, no logging is provided for training loss
-logging_steps: int = 20
+# If > 0: set total number of training steps to perform. Overrides num_train_epochs.
+max_steps: int = -1
 
+# Log every X updates steps.
+logging_steps: int = 20
 log_freq = logging_steps
-logging_dir: str = None
-
-eval_steps: int = 1000
-
-# Sample to begin performing eval.
-eval_iter_start_samples: int = 1
+# Random seed that will be set at the beginning of training.
+seed: int = 42
 
-eval_iters: int = 10
+# Whether or not to use Paddle Sharding Data Parallel training (in distributed training
+# only). The base option should be `stage1`, `stage2` or `stage3`, and you can add
+# CPU offload to `stage2` or `stage3` like this: `stage2 offload` or `stage3 offload`.
+# sharding: str = None
 
-test_iters = eval_iters * 10
+# tensor_parallel_degree means how many parts the transformer layer is split into.
+# Default -1 means tensor parallel is not used; suggest tensor_parallel_degree<=8 for better performance.
+# Note: this needs model support in the source code.
+tensor_parallel_degree: int = -1
 
-# The steps use to control the learing rate. If the step > decay_steps, will use the min_learning_rate.
-decay_steps: float = None
+# pipeline_parallel_degree means how many stages all transformer layers are split into.
+# Default -1 means pipeline parallel is not used.
+# Note: this needs model support in the source code; see the llama modeling_pp.py file.
+pipeline_parallel_degree: int = -1
 
-# Number of updates steps to accumulate before performing a backward/update pass.
-gradient_accumulation_steps : int = 1
+# Recompute the forward pass to calculate gradients. Used for saving memory.
+recompute: bool = True
 
-local_rank : int = -1
+# Whether or not to disable the tqdm progress bars.
+disable_tqdm : bool = True
 
-local_process_index : int = 0
+# Run an evaluation every X steps.
+eval_steps: int = 1000
 
-# random seed
-seed: int = 42
+# Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
+save_steps: int = 5000
 
-world_size : int = 1
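+# Worked example (illustrative, assuming the single-node 8-GPU V100 settings from
+# config_V100x1x8.py): one optimizer step consumes per_device_train_batch_size (1)
+# * gradient_accumulation_steps (1) * 8 data-parallel workers = 8 sequences of
+# max_seq_length tokens, and warmup_ratio (0.01) * max_steps (1000) = 10 linear
+# warmup steps before the learning rate decays toward min_learning_rate.
+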
+# The steps used to control the learning rate. If step > decay_steps, min_learning_rate will be used.
+decay_steps: int = None
 
-max_grad_norm: float = 1.0
+virtual_pp_degree: int = 1
 
-use_hybrid_parallel: bool = True
+sequence_parallel: bool = False
 
-sharding: str = "stage2"
+distributed_dataloader: bool = True
 
-disable_tqdm : bool = True
+recompute_granularity: str = "full"
 
 
 # =========================================================
 # fp16 config args
 # =========================================================
-# Run model in fp16 mode
+# Whether to use fp16 (mixed) precision instead of 32-bit
 fp16: bool = True
 
-fp16_opt_level: str = 'O2'
+# For fp16: AMP optimization level selected in ['O0', 'O1', 'O2'].
+fp16_opt_level: str = 'O0'
 
+# Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA
+# architecture or using CPU (no_cuda). This is an experimental API and it may change.
 bf16: bool = False
 
+# The value of initial scale_loss for fp16.
 scale_loss: float = 1024.0
 
-amp_custom_white_list = None
-
-amp_custom_black_list = None
-
 
 # =========================================================
 # dist args
@@ -178,6 +161,8 @@
 # Communication backend for distributed training on gpus
 dist_backend: str = "nccl"
 
+local_rank: int = -1
+
 
 # =========================================================
 # lr_scheduler args
@@ -188,32 +173,37 @@
 
 # Minimum learning rate decayed to.
 min_learning_rate : float = 1e-05
 
-# number of iterations to decay LR over, If None defaults to `--train-iters`*`--epochs`
-lr_decay_steps: int = 10
-
-# learning rate decay function
-lr_scheduler_type: str = "linear"
-
-# percentage of data to warmup on (.01 = 1% of all training iters). Default 0.01
+# Linear warmup over warmup_ratio fraction of total steps.
 warmup_ratio: float = 0.01
 
+# Linear warmup over warmup_steps.
 warmup_steps: int = 0
 
 # weight decay coefficient for L2 regularization
 weight_decay: float = 0.01
 
+# The scheduler type to use. Supports linear, cosine, constant, constant_with_warmup.
+lr_scheduler_type: str = "linear"
 
 # =========================================================
 # optimizer args
 # =========================================================
+# Beta1 for AdamW optimizer
 adam_beta1: float = 0.9
+
+# Beta2 for AdamW optimizer
 adam_beta2: float = 0.999
-adam_epsilon: float = 1e-08
+
+# Epsilon for AdamW optimizer.
+adam_epsilon: float = 1e-8
+
+# Max gradient norm.
+max_grad_norm: float = 1.0
 
 # =========================================================
 # load and save args
 # =========================================================
 # Path to a directory containing a model checkpoint.
-init_checkpoint: str = "model_state.pdparams" output_dir: str = "llama-paddle/output" \ No newline at end of file diff --git a/training/benchmarks/llama1_7B/paddle/config/mutable_params.py b/training/benchmarks/llama1_7B/paddle/config/mutable_params.py index 1ed4b077f..aac1597ea 100644 --- a/training/benchmarks/llama1_7B/paddle/config/mutable_params.py +++ b/training/benchmarks/llama1_7B/paddle/config/mutable_params.py @@ -1,23 +1,40 @@ mutable_params = [ - "split", - "max_seq_length", - "per_device_train_batch_size", + "model_name_or_path", + "tokenizer_name_or_path", + "input_dir", + "output_dir", + "split", + "max_seq_length", + "per_device_train_batch_size", "per_device_eval_batch_size", - "use_flash_attention", - "use_fused_rms_norm", - "fp16", - "fp16_opt_level", - "gradient_accumulation_steps", - "max_steps", - "eval_steps", - "learning_rate", - "min_learning_rate", - "weight_decay", + "use_flash_attention", + "use_fused_rms_norm", + "fp16", + "fp16_opt_level", + "scale_loss", + "learning_rate", + "min_learning_rate", + "max_steps", + "save_steps", + "weight_decay", "warmup_ratio", - "seed", - "sharding", - "recompute" + "max_grad_norm", + "logging_steps", + "dataloader_num_workers", + "eval_steps", + "disable_tqdm", + "continue_training", + "recompute", + "do_train", + "do_eval", + "data_impl", + "gradient_accumulation_steps", + "tensor_parallel_degree", + "pipeline_parallel_degree", + "virtual_pp_degree", + "sequence_parallel", + "distributed_dataloader", ] -mutable_params += ["local_rank", "do_train", "input_dir", "logging_steps"] \ No newline at end of file +mutable_params += ["local_rank", "dist_backend"] \ No newline at end of file diff --git a/training/benchmarks/llama1_7B/paddle/dataloaders/dataloader.py b/training/benchmarks/llama1_7B/paddle/dataloaders/dataloader.py index 5d3f63c0b..8aac7886e 100644 --- a/training/benchmarks/llama1_7B/paddle/dataloaders/dataloader.py +++ b/training/benchmarks/llama1_7B/paddle/dataloaders/dataloader.py @@ -4,38 +4,17 @@ import paddle from paddlenlp.utils.log import logger -from .dataset import GPTDataset, get_train_valid_test_split_ -def get_train_data_file(args): - if len(args.input_dir.split()) > 1: - # weight-1 data-prefix-1 weight-2 data-prefix-2 ... 
- return args.input_dir.split() - else: - files = [ - os.path.join(args.input_dir, f) - for f in os.listdir(args.input_dir) - if (os.path.isfile(os.path.join(args.input_dir, f)) and "_idx.npz" in str(f)) - ] - files = [x.replace("_idx.npz", "") for x in files] - - if len(files) > 1: - ret = [] - logger.info("You are using multi-dataset:") - for x in files: - ret.append(1.0) - ret.append(x) - logger.info(" > set weight of %s dataset to 1.0" % x) - return ret +from paddlenlp.data.causal_dataset import build_train_valid_test_datasets, print_rank_0 - return files def create_pretrained_dataset( data_args, training_args, data_file, tokenizer, + need_data=True, ): - - train_valid_test_num_samples = [ + train_val_test_num_samples = [ training_args.per_device_train_batch_size * training_args.dataset_world_size * training_args.max_steps @@ -47,73 +26,70 @@ def create_pretrained_dataset( training_args.per_device_eval_batch_size * training_args.dataset_world_size * training_args.test_iters, ] - input_prefix = data_file[0] - - for suffix in ["_ids.npy", "_idx.npz"]: - if not os.path.isfile(input_prefix + suffix): - raise ValueError("File Not found, %s" % (input_prefix + suffix)) - - sample_ids = np.load(input_prefix + "_ids.npy", mmap_mode="r", allow_pickle=True) - # All documment ids, extend as 1-D array. - - process_data = np.load(input_prefix + "_idx.npz") - # The len(sample_lens) num of docs - # The sum(sample_lens) should equal len(sample_ids) - sample_lens = process_data["lens"] - - splits = get_train_valid_test_split_(data_args.split, len(sample_lens)) - assert len(sample_lens) >= splits[-1], "The document nums should larger than max of splits, but %s < %s" % ( - len(sample_lens), - splits[-1], + print_rank_0(" > datasets target sizes (minimum size):") + print_rank_0(" train: {}".format(train_val_test_num_samples[0])) + print_rank_0(" validation: {}".format(train_val_test_num_samples[1])) + print_rank_0(" test: {}".format(train_val_test_num_samples[2])) + + # Build the datasets. 
+ train_dataset, valid_dataset, test_dataset = build_train_valid_test_datasets( + data_prefix=data_file, + data_impl=data_args.data_impl, + splits_string=data_args.split, + train_val_test_num_samples=train_val_test_num_samples, + seq_length=data_args.max_seq_length, + seed=training_args.seed, + skip_warmup=data_args.skip_warmup, + data_cache_path=data_args.data_cache, + need_data=need_data, ) def print_dataset(data, mode="train"): - # logger.info(f"Sample data for {mode} mode") - input_ids, loss_mask, attention_mask, position_ids, labels = data - # logger.info(tokenizer._decode(input_ids)) - # logger.info(tokenizer._decode(labels)) - # logger.info(tokenizer.convert_ids_to_tokens(input_ids)) - - def build_dataset(index, name): - dataset = GPTDataset( - file_prefix=input_prefix, - build_data_file=training_args.local_process_index == 0, - micro_batch_size=training_args.per_device_train_batch_size - if name == "train" - else training_args.per_device_eval_batch_size, - name="gpt_" + name, - max_seq_len=data_args.max_seq_length, - num_samples=train_valid_test_num_samples[index], - documents=np.arange(splits[index], splits[index + 1]), - sample_ids=sample_ids, - sample_lens=sample_lens, - eos_id=tokenizer.eos_token_id, - seed=training_args.seed, - ) - print_dataset(dataset[0], name) - return dataset + logger.info(f"Sample data for {mode} mode.") + # input_ids, loss_mask, attention_mask, position_ids, labels = data + input_ids = data["text"] + + logger.info(tokenizer._decode(input_ids)) from paddlenlp.data import Stack def _collate_data(data, stack_fn=Stack()): - num_fields = len(data[0]) - out = [None] * num_fields - # 0:input_ids, 1:loss_mask, 2:attention_mask, 3:position_ids, 4:labels - for i in (0, 1, 2, 3, 4): - out[i] = stack_fn([x[i] for x in data]) + tokens_ = stack_fn([x["text"] for x in data]) + + labels = tokens_[:, 1:] + tokens = tokens_[:, :-1] return { - "input_ids": out[0], - # "token_type_ids": out[1], - # "attention_mask": out[2], - # "loss_mask": out[3], - "labels": out[4], + "input_ids": tokens, + "labels": labels, } - # Note, data should be broardcast to all devices. - # for train, valid, test, the distinct data num is data_world_size - train_dataset = build_dataset(0, "train") - valid_dataset = build_dataset(1, "valid") - test_dataset = build_dataset(2, "test") - + # if need_data: + # print_dataset(train_dataset[0], "train") + # print_dataset(valid_dataset[0], "valid") + # print_dataset(test_dataset[0], "test") return train_dataset, valid_dataset, test_dataset, _collate_data + +def get_train_data_file(args): + if len(args.input_dir.split()) > 1: + # weight-1 data-prefix-1 weight-2 data-prefix-2 ... 
+        return args.input_dir.split()
+    else:
+        files = [
+            os.path.join(args.input_dir, f)
+            for f in os.listdir(args.input_dir)
+            if (os.path.isfile(os.path.join(args.input_dir, f)) and ("_idx.npz" in str(f) or ".idx" in str(f)))
+        ]
+        files = [x.replace("_idx.npz", "") for x in files]
+        files = [x.replace(".idx", "") for x in files]  # also strip the ".idx" suffix
+
+        if len(files) > 1:
+            ret = []
+            logger.info("You are using multi-dataset:")
+            for x in files:
+                ret.append(1.0)
+                ret.append(x)
+                logger.info(" > set weight of %s dataset to 1.0" % x)
+            return ret
+
+    return files
\ No newline at end of file
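As a rough orientation for the sizing logic above, an illustrative sketch (assuming the single-node 8-GPU V100 configuration: per-device train batch size 1, max_steps=1000) of the minimum train sample count requested from `build_train_valid_test_datasets`:

```python
# Sketch of the first entry of train_val_test_num_samples in
# create_pretrained_dataset, under the assumed 1x8 V100 settings.
per_device_train_batch_size = 1
dataset_world_size = 8  # data-parallel workers
max_steps = 1000

train_samples = per_device_train_batch_size * dataset_world_size * max_steps
print(train_samples)  # 8000 sequences, each max_seq_length tokens long
```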
diff --git a/training/benchmarks/llama1_7B/paddle/dataloaders/sentencepiece.bpe.model b/training/benchmarks/llama1_7B/paddle/dataloaders/sentencepiece.bpe.model deleted file mode 100755 index 22bccbcb4..000000000 Binary files a/training/benchmarks/llama1_7B/paddle/dataloaders/sentencepiece.bpe.model and /dev/null differ
diff --git a/training/benchmarks/llama1_7B/paddle/model/models/modeling_pp.py b/training/benchmarks/llama1_7B/paddle/model/models/modeling_pp.py index 8411bd96e..9a1969903 100644 --- a/training/benchmarks/llama1_7B/paddle/model/models/modeling_pp.py +++ b/training/benchmarks/llama1_7B/paddle/model/models/modeling_pp.py @@ -1,25 +1,10 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pass
 import paddle
 import paddle.distributed.fleet as fleet
 import paddle.nn as nn
 from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
 from paddlenlp.transformers import PretrainedModel
 
-from .modeling import (
+from paddlenlp.transformers.llama.modeling import (
     LlamaConfig,
     LlamaDecoderLayer,
     LlamaLMHead,
@@ -72,6 +57,8 @@ class LlamaEmbeddingPipe(nn.Layer):
 
     def __init__(self, config):
         super(LlamaEmbeddingPipe, self).__init__()
+        self.sequence_parallel = config.sequence_parallel
+        self.hidden_size = config.hidden_size
         if config.tensor_parallel_degree > 1:
             self.embed_tokens = fleet.meta_parallel.VocabParallelEmbedding(
                 config.vocab_size,
@@ -91,8 +78,16 @@ def forward(self, args):
             _type_: _description_
         """
         input_ids, attention_mask, position_ids = parse_args(args)
         input_embeds = self.embed_tokens(input_ids)
+        if self.sequence_parallel:
+            from paddlenlp.transformers import ScatterOp
+
+            # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim]
+            bs, seq_len, hidden_size = input_embeds.shape
+            input_embeds = paddle.reshape_(input_embeds, [bs * seq_len, hidden_size])
+            # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism)
+            input_embeds = ScatterOp.apply(input_embeds)
+
         batch_size, seq_length = input_ids.shape
         if attention_mask is not None:
             attention_mask = LlamaModel._prepare_decoder_attention_mask(
@@ -206,18 +201,18 @@ class LlamaForCausalLMPipe(PipelinePretrainedModel, PipelineLayer):
     def __init__(
         self,
         config,
-        # use_recompute=None,
         # scale_qk_by_layer_num=True,
-        # recompute_granularity="full",
         # virtual_pp_degree=4,
-        # sequence_parallel=False,
-        # no_recompute_layers=None,
-        pp_recompute_interval=1,
     ):
         self.config = config
 
-        use_recompute = self.config.use_recompute
-        recompute_granularity = self.config.recompute_granularity
+        self.use_recompute = self.config.use_recompute
+        self.recompute_granularity = self.config.recompute_granularity
+        self.pp_recompute_interval = self.config.pp_recompute_interval
+        self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else []
+        if self.recompute_granularity == "full":
+            assert len(self.no_recompute_layers) == 0, "for pp with full recompute, no_recompute_layers is not supported"
+
         # virtual_pp_degree = self.config.virtual_pp_degree
         virtual_pp_degree = getattr(self.config, "virtual_pp_degree", 1)
 
@@ -230,17 +225,20 @@ def __init__(
         self.add_sequential_layer(LayerDesc(LlamaEmbeddingPipe, config=config), "llama")
         for i in range(config.num_hidden_layers):
-            self.add_sequential_layer(LayerDesc(LlamaDecoderLayerPipe, config=config), f"llama.layers.{i}")
+            self.add_sequential_layer(
+                LayerDesc(LlamaDecoderLayerPipe, config=config, layerwise_recompute=i not in self.no_recompute_layers),
+                f"llama.layers.{i}",
+            )
         self.add_sequential_layer(LayerDesc(LlamaRMSNormPipe, config=config), "llama.norm")
         self.add_sequential_layer(LayerDesc(LlamaLMHead, config=config), "lm_head")
 
         recompute_interval = 0
-        if use_recompute and recompute_granularity == "full":
-            assert pp_recompute_interval <= config.num_hidden_layers // (
+        if self.use_recompute and self.recompute_granularity == "full":
+            assert self.config.pp_recompute_interval <= config.num_hidden_layers // (
                 virtual_pp_degree * get_hcg().topology().get_dim_size("pipe")
             ), "pp recompute interval should be smaller than the number of layers in each pp chunk"
-            recompute_interval = pp_recompute_interval
+            recompute_interval = self.config.pp_recompute_interval
 
         seg_method = "layer:LlamaDecoderLayer"
         if config.num_hidden_layers % 
get_hcg().topology().get_dim_size("pipe") != 0: diff --git a/training/benchmarks/llama1_7B/paddle/run_pretraining.py b/training/benchmarks/llama1_7B/paddle/run_pretraining.py index e1e235e59..dd8923045 100644 --- a/training/benchmarks/llama1_7B/paddle/run_pretraining.py +++ b/training/benchmarks/llama1_7B/paddle/run_pretraining.py @@ -1,18 +1,17 @@ """LLaMA Pretraining""" -from __future__ import absolute_import, division, print_function - +import math import argparse import os import random import sys import time - from dataclasses import dataclass, field -from typing import Optional +from typing import List, Optional import numpy as np import paddle + from paddlenlp.trainer import ( PdArgumentParser, Trainer, @@ -27,16 +26,18 @@ LinearAnnealingWithWarmupDecay, LlamaConfig, LlamaForCausalLM, + register_sequence_parallel_allreduce_hooks, ) + from paddlenlp.utils.batch_sampler import DistributedBatchSampler from paddlenlp.utils.log import logger CURR_PATH = os.path.abspath(os.path.dirname(__file__)) sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../"))) -from driver import Driver, Event, PaddleCallback, dist_paddle +from driver import Driver, Event, dist_paddle from driver.config_manager import get_properties_from_config -from dataloaders.dataloader import get_train_data_file -from dataloaders.dataloader import create_pretrained_dataset +from dataloaders.dataloader import create_pretrained_dataset, get_train_data_file +from model.models.modeling_pp import LlamaForCausalLMPipe from train.trainer import PretrainingTrainer from train.training_state import TrainingState @@ -67,6 +68,13 @@ class PreTrainingArguments(TrainingArguments): "help": "The steps use to control the learing rate. If the step > decay_steps, will use the min_learning_rate." }, ) + enable_linear_fused_grad_add: bool = field( + default=False, + metadata={ + "help": "Enable fused linear grad add strategy, which will reduce elementwise add for grad accumulation in the backward of nn.Linear ." 
+ }, + ) + @dataclass class DataArguments: @@ -93,6 +101,14 @@ class DataArguments: metadata={"help": "Use share folder for data dir and output dir on multi machine."}, ) + data_impl: str = field(default="mmap", metadata={"help": "The format of the preprocessed data."}) + skip_warmup: bool = field( + default=True, + metadata={"help": "Whether to skip the warmup process of mmap files."}, + ) + data_cache: str = field(default=None, metadata={"help": "The path of the cached dataset."}) + + @dataclass class ModelArguments: """ @@ -103,7 +119,7 @@ class ModelArguments: default="llama", metadata={"help": "Only support for llama pre-training for now."} ) model_name_or_path: str = field( - default="facebook/tiny-random-llama", + default="__internal_testing__/tiny-random-llama", metadata={ "help": "Path to pretrained model or model identifier from https://paddlenlp.readthedocs.io/zh/latest/model_zoo/transformers.html" }, @@ -124,24 +140,67 @@ class ModelArguments: metadata={"help": "llama, use_fused_rms_norm"}, ) fuse_attention_qkv: bool = field( - default=True, - metadata={"help": "gpt, fuse_attention_qkv"}, + default=False, + metadata={"help": "whether to fuse attention qkv"}, + ) + fuse_attention_ffn: bool = field( + default=False, + metadata={"help": "whether to fuse first up and gate proj in mlp block"}, ) recompute_granularity: str = field( default="full", - metadata={"help": "full core_attn"}, + metadata={"help": "Choose among ['full', 'core_attn', 'full_attn']"}, ) virtual_pp_degree: int = field( default=1, metadata={"help": "virtual_pp_degree"}, ) - continue_training: bool = field( default=False, metadata={ - "help": "Pre-training from existing paddlenlp model weights. Default Fasle and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddlenlp models." + "help": "Pre-training from existing paddlenlp model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddlenlp models." }, ) + sequence_parallel: bool = field( + default=False, + metadata={"help": "whether to use sequence parallel"}, + ) + fuse_sequence_parallel_allreduce: bool = field( + default=False, + metadata={"help": "whether to use fuse sequence parallel allreduce"}, + ) + rope_fusion_level: Optional[str] = field( + default=None, + metadata={ + "help": "The level of fusion of rope embedding. Can be chosen from:\n" + "(1) 'full': fuse sin cos compute and rope embedding\n" + "(2) 'core': only fuse rope embedding, will compute the sin and cos\n" + "(3) None: don't fuse any part of the rope embedding" + }, + ) + no_recompute_layers: Optional[List[int]] = field( + default=None, + metadata={"help": "Specify the full transformer layers that should not be recomputed."}, + ) + pp_recompute_interval: int = field( + default=1, + metadata={ + "help": "The interval for the number of layers at which recomputation occurs. A value of 0 indicates no recomputation. Default is 0." 
+ }, + ) + recompute_use_reentrant: bool = field( + default=False, + metadata={"help": "recompute_use_reentrant"}, + ) + +def set_seed(args): + if args.device == "cpu": + idx = 0 + else: + idx = paddle.distributed.get_rank() + random.seed(args.seed + idx) + np.random.seed(args.seed + idx) + paddle.seed(args.seed + idx) def main(): import config @@ -154,15 +213,17 @@ def main(): dist_paddle.barrier() llama_driver.event(Event.INIT_START) init_start_time = llama_driver.logger.previous_log_time - parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) + model_args, data_args, training_args = parser.parse_dict( get_properties_from_config(config) ) + if model_args.tokenizer_name_or_path is None: model_args.tokenizer_name_or_path = model_args.model_name_or_path - - dist_paddle.set_seed(training_args) + + + set_seed(training_args) paddle.set_device(training_args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() @@ -183,11 +244,6 @@ def main(): last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - # if last_checkpoint is None and len( - # os.listdir(training_args.output_dir)) > 1: - # raise ValueError( - # f"Output directory ({training_args.output_dir}) already exists and is not empty. " - # "Use --overwrite_output_dir to overcome.") if last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " @@ -199,19 +255,33 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path) llama_config = config_class.from_pretrained(model_args.model_name_or_path) - llama_config.max_position_embeddings = max(llama_config.max_position_embeddings, data_args.max_seq_length) + + llama_config.seq_length = data_args.max_seq_length + + if not model_args.continue_training: + llama_config.max_position_embeddings = max(llama_config.max_position_embeddings, data_args.max_seq_length) + if not model_args.continue_training: llama_config.vocab_size = max(llama_config.vocab_size, ((tokenizer.vocab_size - 1) // 128 + 1) * 128) logger.info(f"Reset vocab size to {llama_config.vocab_size} for batter amp peformance.") - llama_config.lm_shift_labels = False + if model_args.no_recompute_layers is not None: + model_args.no_recompute_layers.sort() + llama_config.use_flash_attention = model_args.use_flash_attention llama_config.use_fused_rms_norm = model_args.use_fused_rms_norm - llama_config.fuse_attention_qkv = False + llama_config.fuse_attention_qkv = model_args.fuse_attention_qkv + llama_config.fuse_attention_ffn = model_args.fuse_attention_ffn llama_config.recompute_granularity = model_args.recompute_granularity llama_config.virtual_pp_degree = model_args.virtual_pp_degree - llama_config.use_recompute = training_args.recompute + llama_config.sequence_parallel = model_args.sequence_parallel + llama_config.fuse_sequence_parallel_allreduce = model_args.fuse_sequence_parallel_allreduce + llama_config.rope_fusion_level = model_args.rope_fusion_level + llama_config.no_recompute_layers = model_args.no_recompute_layers + llama_config.pp_recompute_interval = model_args.pp_recompute_interval + llama_config.recompute_use_reentrant = model_args.recompute_use_reentrant + llama_config.use_recompute = training_args.recompute llama_config.tensor_parallel_degree = 
training_args.tensor_parallel_degree llama_config.tensor_parallel_rank = training_args.tensor_parallel_rank @@ -237,6 +307,15 @@ def main(): else: model = model_class._from_config(llama_config, dtype=dtype) + if model_args.sequence_parallel: + register_sequence_parallel_allreduce_hooks( + model, training_args.gradient_accumulation_steps, model_args.fuse_sequence_parallel_allreduce + ) + + if training_args.recompute: + model.recompute_enable() + + # Create the learning_rate sheduler and optimizer if training_args.decay_steps is None: training_args.decay_steps = training_args.max_steps warmup_steps = training_args.warmup_ratio * training_args.max_steps @@ -261,11 +340,20 @@ def main(): data_file = get_train_data_file(data_args) train_dataset, eval_dataset, test_dataset, data_collator = create_pretrained_dataset( - data_args, training_args, data_file, tokenizer + data_args, + training_args, + data_file, + tokenizer, + need_data=training_args.should_load_dataset, ) - print(f"train_dataset length:{len(train_dataset)}") - print(f"eval_dataset length:{len(eval_dataset)}") + total_effective_tokens = ( + training_args.per_device_train_batch_size + * training_args.dataset_world_size + * training_args.max_steps + * training_args.gradient_accumulation_steps + * data_args.max_seq_length + ) trainer = PretrainingTrainer( model=model, @@ -275,7 +363,7 @@ def main(): eval_dataset=eval_dataset if training_args.do_eval else None, optimizers=(None, lr_scheduler), tokenizer=tokenizer, - callbacks=[PaddleCallback(driver=llama_driver)], + callbacks=[dist_paddle.PaddleCallback(driver=llama_driver)], ) checkpoint = None @@ -284,46 +372,72 @@ def main(): elif last_checkpoint is not None: checkpoint = last_checkpoint - if not config.do_train: - return config, training_state - + dist_paddle.barrier() llama_driver.event(Event.INIT_END) init_end_time = llama_driver.logger.previous_log_time training_state.init_time = (init_end_time - init_start_time) / 1e+3 + # Init Evaluation + # dist_paddle.barrier() + # eval_metrics = trainer.evaluate() + + # Training + # dist_paddle.barrier() + # try: + # if training_args.do_train: + # train_result = trainer.train(resume_from_checkpoint=checkpoint) + # metrics = train_result.metrics + # trainer.save_model() + # trainer.log_metrics("train", metrics) + # trainer.save_metrics("train", metrics) + # trainer.save_state() + # training_state.raw_train_time = train_metrics["train_runtime"] + # training_state.training_sequences_per_second = train_metrics[ + # "train_samples_per_second" + # ] + # training_state.loss = train_metrics["train_loss"] + # except: + # training_state.end_training = False + dist_paddle.barrier() + if training_args.do_train: + train_result = trainer.train(resume_from_checkpoint=checkpoint) + train_metrics = train_result.metrics + trainer.log_metrics("train", train_metrics) + trainer.save_metrics("train", train_metrics) + trainer.save_state() + training_state.raw_train_time = train_metrics["train_runtime"] + training_state.training_sequences_per_second = train_metrics[ + "train_samples_per_second" + ] + training_state.loss = train_metrics["train_loss"] + + # End Evaluation + dist_paddle.barrier() + eval_metrics = trainer.evaluate() + training_state.eval_loss = eval_metrics["eval_loss"] - train_start_time = time.time() - train_result = trainer.train(resume_from_checkpoint=checkpoint) - metrics = train_result.metrics - trainer.save_model() - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) - trainer.save_state() - 
training_state.raw_train_time = time.time() - train_start_time - - return config, training_state + return training_args, training_state, llama_driver if __name__ == "__main__": now = time.time() - config, state = main() + training_args, state, llama_driver = main() if not dist_paddle.is_main_process(): exit() e2e_time = time.time() - now - training_perf = (dist_paddle.global_batch_size(config) * - state.global_steps) / state.raw_train_time - if config.do_train: + + if training_args.do_train: finished_info = { "e2e_time": e2e_time, - "training_sequences_per_second": training_perf, + "training_sequences_per_second": state.training_sequences_per_second, "converged": state.converged, - "final_loss": state.eval_avg_loss, + "final_loss": state.eval_loss, "raw_train_time": state.raw_train_time, "init_time": state.init_time, } else: finished_info = {"e2e_time": e2e_time} - logger.log(Event.FINISHED, message=finished_info, stacklevel=0) + llama_driver.logger.log(Event.FINISHED, message=finished_info, stacklevel=0) \ No newline at end of file diff --git a/training/benchmarks/llama1_7B/paddle/schedulers/__init__.py b/training/benchmarks/llama1_7B/paddle/schedulers/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/training/benchmarks/llama1_7B/paddle/train/training_state.py b/training/benchmarks/llama1_7B/paddle/train/training_state.py index c97cc9006..9fb4d54cf 100644 --- a/training/benchmarks/llama1_7B/paddle/train/training_state.py +++ b/training/benchmarks/llama1_7B/paddle/train/training_state.py @@ -24,6 +24,8 @@ class TrainingState: init_time = 0 raw_train_time = 0 + training_sequences_per_second = 0 + def status(self): if self.converged: self._status = "success" diff --git a/training/nvidia/docker_image/paddle_2.5.1/Dockerfile b/training/nvidia/docker_image/paddle_2.5.1/Dockerfile index f975bca00..11ceea601 100644 --- a/training/nvidia/docker_image/paddle_2.5.1/Dockerfile +++ b/training/nvidia/docker_image/paddle_2.5.1/Dockerfile @@ -1,5 +1,6 @@ -# FROM paddlepaddle/paddle:2.5.0rc1-gpu-cuda11.2-cudnn8.2-trt8.0 -# FROM paddlepaddle/paddle:2.5.1-gpu-cuda11.2-cudnn8.2-trt8.0 FROM registry.baidubce.com/paddlepaddle/paddle:2.5.1-gpu-cuda11.2-cudnn8.2-trt8.0 RUN /bin/bash -c "uname -a" -RUN pip3 uninstall -y pylint \ No newline at end of file +RUN pip3 uninstall -y pylint +RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +RUN pip3 install --upgrade typing-extensions +RUN pip3 install --pre --upgrade paddlenlp -f https://www.paddlepaddle.org.cn/whl/paddlenlp.html \ No newline at end of file diff --git a/training/nvidia/docker_image/paddle_2.5.1/paddle_2.5.1_install.sh b/training/nvidia/docker_image/paddle_2.5.1/paddle_2.5.1_install.sh deleted file mode 100644 index e75068895..000000000 --- a/training/nvidia/docker_image/paddle_2.5.1/paddle_2.5.1_install.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -# pip install https://paddlenlp.bj.bcebos.com/wheels/paddlenlp-2.5.2.post0-py3-none-any.whl \ No newline at end of file diff --git a/training/benchmarks/llama1_7B/paddle/optimizers/__init__.py b/training/nvidia/llama1_13B-paddle/config/config_A100x1x8.py similarity index 100% rename from training/benchmarks/llama1_7B/paddle/optimizers/__init__.py rename to training/nvidia/llama1_13B-paddle/config/config_A100x1x8.py diff --git a/training/nvidia/llama1_13B-paddle/config/environment_variables.sh b/training/nvidia/llama1_13B-paddle/config/environment_variables.sh new file mode 120000 index 000000000..601eb1ce8 --- /dev/null +++ 
b/training/nvidia/llama1_13B-paddle/config/environment_variables.sh @@ -0,0 +1 @@ +/ssd2/laixinyi/projects/FlagPerf/training/nvidia/llama1_7B-paddle/config/environment_variables.sh \ No newline at end of file diff --git a/training/nvidia/llama1_13B-paddle/config/requirements.txt b/training/nvidia/llama1_13B-paddle/config/requirements.txt new file mode 120000 index 000000000..8fd0d0025 --- /dev/null +++ b/training/nvidia/llama1_13B-paddle/config/requirements.txt @@ -0,0 +1 @@ +/ssd2/laixinyi/projects/FlagPerf/training/nvidia/llama1_7B-paddle/config/requirements.txt \ No newline at end of file diff --git a/training/nvidia/llama1_7B-paddle/README.md b/training/nvidia/llama1_7B-paddle/README.md deleted file mode 100644 index 4838a1279..000000000 --- a/training/nvidia/llama1_7B-paddle/README.md +++ /dev/null @@ -1,82 +0,0 @@ - -### 模型Checkpoint下载 -[模型Checkpoint下载](../../benchmarks/bert/README.md#模型checkpoint下载) - - -### 测试数据集下载 -[测试数据集下载](../../benchmarks/bert/README.md#测试数据集下载) - - -### Paddle版本运行指南 - -单卡运行命令: -● 依赖包,paddlepaddle-gpu - -''' -python -m pip install paddlepaddle-gpu==2.4.0rc0 -i https://pypi.tuna.tsinghua.edu.cn/simple -''' - -● bash环境变量: -``` -export MASTER_ADDR=user_ip -export MASTER_PORT=user_port -export WORLD_SIZE=1 -export NODE_RANK=0 -export CUDA_VISIBLE_DEVICES=0,1#可用的GPU索引 -export RANK=0 -export LOCAL_RANK=0 -``` -example: -``` -export MASTER_ADDR=10.21.226.184 -export MASTER_PORT=29501 -export WORLD_SIZE=1 -export NODE_RANK=0 -export CUDA_VISIBLE_DEVICES=0,1#可用的GPU索引 -export RANK=0 -export LOCAL_RANK=0 -``` - -● 运行脚本: - -在该路径目录下 - -``` -python run_pretraining.py ---data_dir data_path ---extern_config_dir config_path ---extern_config_file config_file.py -``` - -example: -``` -python run_pretraining.py ---data_dir /ssd2/yangjie40/data_config ---extern_config_dir /ssd2/yangjie40/flagperf/training/nvidia/bert-pytorch/config ---extern_config_file config_A100x1x2.py -``` - - -### Nvidia GPU配置与运行信息参考 -#### 环境配置 -- ##### 硬件环境 - - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB - - 多机网络类型、带宽: InfiniBand,200Gb/s -- ##### 软件环境 - - OS版本:Ubuntu 20.04 - - OS kernel版本: 5.4.0-113-generic - - 加速卡驱动版本:470.129.06 - - Docker 版本:20.10.16 - - 训练框架版本: paddle-2.4.0-rc - - 依赖软件版本: - - cuda: cuda_11.2.r11.2 - - -### 运行情况 -| 训练资源 | 配置文件 | 运行时长(s) | 目标精度 | 收敛精度 | Steps数 | 性能(samples/s)| -| -------- | --------------- | ----------- | -------- | -------- | ------- | ---------------- | -| 单机1卡 | config_A100x1x1 | N/A | 0.67 | N/A | N/A | N/A | -| 单机2卡 | config_A100x1x2 | N/A | 0.67 | N/A | N/A | N/A | -| 单机4卡 | config_A100x1x4 | 1715.28 | 0.67 | 0.6809 | 6250 | 180.07 | -| 单机8卡 | config_A100x1x8 | 1315.42 | 0.67 | 0.6818 | 4689 | 355.63 | - diff --git a/training/nvidia/llama1_7B-paddle/config/config_V100x1x8.py b/training/nvidia/llama1_7B-paddle/config/config_V100x1x8.py index a7dfac802..fbb25e188 100644 --- a/training/nvidia/llama1_7B-paddle/config/config_V100x1x8.py +++ b/training/nvidia/llama1_7B-paddle/config/config_V100x1x8.py @@ -1,20 +1,26 @@ +model_name_or_path: str = "facebook/llama-7b" # "facebook/tiny-random-llama" # +tokenizer_name_or_path: str = "facebook/llama-7b" # "facebook/tiny-random-llama" # split = "949,50,1" max_seq_length = 2048 per_device_train_batch_size = 1 per_device_eval_batch_size = 1 -use_flash_attention = 0 -use_fused_rms_norm = 0 +use_flash_attention = False +use_fused_rms_norm = False fp16 = True fp16_opt_level = "O2" -gradient_accumulation_steps = 1 -max_steps = 1000 -eval_steps = 1000 +scale_loss = 1024 learning_rate = 0.0001 min_learning_rate = 0.00001 +max_steps = 1000 +save_steps 
= 5000 weight_decay = 0.01 warmup_ratio = 0.01 -logging_steps = 20 +max_grad_norm = 1.0 +logging_steps = 1 log_freq = logging_steps -seed = 42 +dataloader_num_workers = 1 sharding = "stage3" -use_recompute = True \ No newline at end of file +eval_steps = 1000 +disable_tqdm = True +continue_training = True +recompute = True \ No newline at end of file diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index c9c559085..505724af9 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -74,5 +74,5 @@ # "transformer:pytorch_1.13:A100:1:8:1": "/home/datasets_ckpt/transformer/train/", # "swin_transformer:pytorch_1.8:A100:1:8:1": "/raid/dataset/ImageNet_1k_2012/", - "llama:paddle_2.5.1:V100:1:8:1": "/ssd2/laixinyi/projects/FlagPerf/training/benchmarks/llama/paddle/data" + # "llama1_7B:paddle_2.5.1:V100:1:8:1": "/home/dataset/llama1_7B/" } \ No newline at end of file diff --git a/training/run_benchmarks/dev.py b/training/run_benchmarks/dev.py index 180421842..3097edab5 100644 --- a/training/run_benchmarks/dev.py +++ b/training/run_benchmarks/dev.py @@ -167,7 +167,7 @@ def prepare_running_env(dp_path, container_name, case_config, stdout, nullout): + "/run_benchmarks/prepare_in_container.py --framework " \ + framework + " --model " + model + " --vendor " \ + tc.VENDOR + " --pipsource " + tc.PIP_SOURCE + "\"" - pre_env_cmd = "docker exec -i " + container_name + " bash -c \"" + "python3 " \ + pre_env_cmd = "sudo docker exec -i " + container_name + " bash -c \"" + "python3 " \ + tc.FLAGPERF_PATH + "/" \ + "/run_benchmarks/prepare_in_container.py --framework " \ + framework + " --model " + model + " --vendor " \ @@ -200,7 +200,7 @@ def start_container_in_cluster(dp_path, run_args, container_name, image_name, + " utils/container_manager.py -o runnew " \ + " -c " + container_name + " -i " + image_name + " -a \"" \ + run_args + "\"" - start_container_cmd = "nvidia-docker run " + run_args + \ + start_container_cmd = "sudo docker run " + run_args + \ " --name=" + container_name + " \"" + image_name + "\" " + \ "sleep infinity" sys.stdout = stdout @@ -241,7 +241,7 @@ def clear_caches_cluster(clear, nnodes): RUN_LOGGER.info("Caches clear config is NOT set.") return - clear_cmd = "sync /sbin/sysctl vm.drop_caches=3" + clear_cmd = "sync && sudo /sbin/sysctl vm.drop_caches=3" timeout = 30 RUN_LOGGER.debug("Run cmd in the cluster to clear the system cache: " + clear_cmd + " timeout=" + str(timeout)) diff --git a/training/run_benchmarks/paddle/start_paddle_task.py b/training/run_benchmarks/paddle/start_paddle_task.py index 23d6bddc2..8f5a540cc 100644 --- a/training/run_benchmarks/paddle/start_paddle_task.py +++ b/training/run_benchmarks/paddle/start_paddle_task.py @@ -133,28 +133,6 @@ def _set_common_ddp_envs(task_args): task_args.visible_dev_env] = current_env['PADDLE_WORLD_DEVICE_IDS'] return current_env -def _set_paddle_container_envs(task_args): - '''Set and return env items for paddle. 
- ''' - - # set Paddle distributed related environmental variables - current_env = os.environ.copy() - current_env['PADDLE_WORLD_DEVICE_IDS'] = ','.join( - [str(i) for i in range(task_args.nproc)]) - current_env["PADDLE_TRAINER_ENDPOINTS"] = str(task_args.master_addr) \ - + ':' + str(task_args.master_port) - current_env["FLAGS_embedding_deterministic"] = "1" - current_env["FLAGS_cudnn_deterministic"] = "1" - current_env["NVIDIA_TF32_OVERRIDE"] = "0" - current_env["NCCL_ALGO"] = "Tree" - - # set GPU/MLU device env, TODO other vendor's device - if task_args.visible_dev_env is not None: - current_env[ - task_args.visible_dev_env] = current_env['PADDLE_WORLD_DEVICE_IDS'] - return current_env - - def _get_basic_train_script_args(task_args): '''Generate basic train script args according to the script options.''' config_dir, config_file = helper.get_config_dir_file(task_args) @@ -177,46 +155,6 @@ def _get_basic_train_script_args(task_args): + "--extern_module_dir " + extern_module_dir return basic_train_script_args - -def test_main(): - '''Parse args and start the training task. Support DDP. - ''' - task_args = parse_args() - task_args.framework = "paddle" - - task_log_dir = helper.init_flagperf_logger(START_LOGGER, task_args) - helper.write_pid_file(task_args.log_dir, "start_paddle_task.pid") - - # Check and get train script & its basic args. - basic_train_script_args = _get_basic_train_script_args(task_args) - if basic_train_script_args is None: - START_LOGGER.error("Can't get args of train script.") - sys.exit(3) - - train_script_path = helper.get_train_script_path(task_args) - if train_script_path is None: - START_LOGGER.error("Can't find path of train script.") - sys.exit(4) - - current_env = _set_paddle_container_envs(task_args) - - start_cmd = sys.executable + " -u -m paddle.distributed.launch " + " --log_dir " + task_log_dir \ - + " " + train_script_path + " " + basic_train_script_args - # + " 2>&1 | tee " \ - # + task_log_dir + "/rank" + str(task_args.node_rank) \ - # + ".out.log" - - START_LOGGER.info("Start task with command: " + start_cmd) - START_LOGGER.debug("----------- Process envs -----------") - for environ in current_env.keys(): - START_LOGGER.debug(environ + ":" + current_env[environ]) - START_LOGGER.debug("start command: " + start_cmd) - process = subprocess.Popen(start_cmd, shell=True, env=current_env) - process.wait() - - START_LOGGER.stop() - - def main(): '''Parse args and start the training task. Support DDP. ''' @@ -260,10 +198,10 @@ def main(): # + ':' + str(task_args.master_port1) \ # + ',' + str(task_args.master_addr) \ # + ':' + str(task_args.master_port2) - current_env["FLAGS_embedding_deterministic"] = "1" - current_env["FLAGS_cudnn_deterministic"] = "1" - current_env["NVIDIA_TF32_OVERRIDE"] = "0" - current_env["NCCL_ALGO"] = "Tree" + # current_env["FLAGS_embedding_deterministic"] = "1" + # current_env["FLAGS_cudnn_deterministic"] = "1" + # current_env["NVIDIA_TF32_OVERRIDE"] = "0" + # current_env["NCCL_ALGO"] = "Tree" start_cmd = sys.executable + " -u " + train_script_path + " " \ + basic_train_script_args + " 2>&1 | tee " \