Add training scripts (#21)
* add training scripts

* fix codacy issues

* update training scripts

* fix wrong link

* Update run_sft.sh
iMountTai authored May 6, 2024
1 parent 8c8a80f commit 7ddc9a6
Showing 5 changed files with 1,279 additions and 0 deletions.
82 changes: 82 additions & 0 deletions scripts/training/build_dataset.py
@@ -0,0 +1,82 @@
import logging
import os
from typing import Union, List
import datasets
import torch
from datasets import load_dataset, concatenate_datasets
import transformers


IGNORE_INDEX = -100
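# Label value ignored by PyTorch's cross-entropy loss; used below to mask prompt tokens.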

logger = logging.getLogger(__name__)

DEFAULT_SYSTEM_PROMPT = """You are a helpful assistant. 你是一个乐于助人的助手。"""
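# Llama-3 instruct chat-template segments for the system, user, and assistant turns.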
system_format = '<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>'
user_format = '<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
assistant_format = '{content}<|eot_id|>'

def build_instruction_dataset(data_path: Union[List[str], str],
                              tokenizer: transformers.PreTrainedTokenizer,
                              max_seq_length: int,
                              data_cache_dir=None,
                              preprocessing_num_workers=None,
                              ):
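    """Tokenize instruction-tuning JSON files into (input_ids, labels) pairs,
    masking the prompt portion of each example with IGNORE_INDEX and caching
    the processed datasets on disk."""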

    def tokenization(examples):
        sources = []
        targets = []
        for instruction, input_text, output in zip(examples['instruction'], examples['input'], examples['output']):
            if input_text is not None and input_text != "":
                instruction = instruction + '\n' + input_text
            source = system_format.format(content=DEFAULT_SYSTEM_PROMPT) + user_format.format(content=instruction)
            target = output

            sources.append(source)
            targets.append(target)

        tokenized_sources = tokenizer(sources, return_attention_mask=False, add_special_tokens=False)
        tokenized_targets = tokenizer(targets, return_attention_mask=False, add_special_tokens=False)
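        # Prompts and responses are tokenized separately so the prompt length is
        # known when the labels are masked below.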

        all_input_ids = []
        all_labels = []
        for s, t in zip(tokenized_sources['input_ids'], tokenized_targets['input_ids']):
            input_ids = torch.LongTensor(s + t)[:max_seq_length]
            labels = torch.LongTensor([IGNORE_INDEX] * len(s) + t)[:max_seq_length]
            all_input_ids.append(input_ids)
            all_labels.append(labels)

        results = {'input_ids': all_input_ids, 'labels': all_labels}
        return results


    logging.warning("building dataset...")
    all_datasets = []

    if not isinstance(data_path, (list, tuple)):
        data_path = [data_path]
    for file in data_path:

        if data_cache_dir is None:
            data_cache_dir = str(os.path.dirname(file))
        cache_path = os.path.join(data_cache_dir, os.path.basename(file).split('.')[0] + f"_{max_seq_length}")
        os.makedirs(cache_path, exist_ok=True)
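        # Reuse a previously tokenized dataset from the cache directory when available;
        # otherwise tokenize the raw JSON file and save the result for later runs.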
        try:
            processed_dataset = datasets.load_from_disk(cache_path)
            logger.info(f'training datasets-{file} has been loaded from disk')
        except Exception:
            raw_dataset = load_dataset("json", data_files=file, cache_dir=cache_path)
            tokenization_func = tokenization
            tokenized_dataset = raw_dataset.map(
                tokenization_func,
                batched=True,
                num_proc=preprocessing_num_workers,
                remove_columns=["instruction", "input", "output"],
                keep_in_memory=False,
                desc="preprocessing on dataset",
            )
            processed_dataset = tokenized_dataset
            processed_dataset.save_to_disk(cache_path)
        processed_dataset.set_format('torch')
        all_datasets.append(processed_dataset['train'])
    all_datasets = concatenate_datasets(all_datasets)
    return all_datasets
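
For context, a minimal usage sketch of build_instruction_dataset, assuming a local Llama-3 tokenizer directory and an Alpaca-style JSON file with instruction/input/output fields; the paths and values below are placeholders, not part of this commit:

from transformers import AutoTokenizer

# Placeholder tokenizer path and data file for illustration only.
tokenizer = AutoTokenizer.from_pretrained("path/to/llama-3-tokenizer")
train_dataset = build_instruction_dataset(
    data_path=["path/to/alpaca_data.json"],
    tokenizer=tokenizer,
    max_seq_length=1024,
    preprocessing_num_workers=8,
)
# The returned dataset holds tokenized 'input_ids' and prompt-masked 'labels'.
print(len(train_dataset), train_dataset.column_names)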
