-
Notifications
You must be signed in to change notification settings - Fork 155
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add training scripts * fix codacy issues * update training scripts * fix wrong link * Update run_sft.sh
- Loading branch information
Showing
5 changed files
with
1,279 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
import logging | ||
import os | ||
from typing import Union, List | ||
import datasets | ||
import torch | ||
from datasets import load_dataset, concatenate_datasets | ||
import transformers | ||
|
||
|
||
# Label value written over prompt positions in build_instruction_dataset so
# that only the response tokens carry a real label.
IGNORE_INDEX = -100

# Module-level logger. Pass the module's actual ``__name__`` (not the literal
# string '__name__') so the logger can be configured per module.
logger = logging.getLogger(__name__)

# System prompt inserted at the start of every training sample.
DEFAULT_SYSTEM_PROMPT = """You are a helpful assistant. 你是一个乐于助人的助手。"""

# Chat-template fragments; ``{content}`` is filled in with str.format().
system_format='<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>'
# The user fragment also opens the assistant header, so the response text
# can be appended directly after it.
user_format='<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
assistant_format='{content}<|eot_id|>'
|
||
def build_instruction_dataset(data_path: Union[List[str],str],
                tokenizer: transformers.PreTrainedTokenizer,
                max_seq_length: int, data_cache_dir = None,
                preprocessing_num_workers = None,
                ):
    """Build a tokenized instruction-tuning dataset from one or more JSON files.

    Each record must provide ``instruction``, ``input`` and ``output`` fields.
    The prompt is the system prompt plus the user turn (instruction, with the
    optional input appended on a new line); the response is ``output``.
    Prompt positions are labelled ``IGNORE_INDEX``. Both ``input_ids`` and
    ``labels`` are truncated to ``max_seq_length``.

    Args:
        data_path: A single JSON file path, or a list/tuple of them.
        tokenizer: Tokenizer used to encode prompts and responses.
        max_seq_length: Maximum number of tokens kept per sample.
        data_cache_dir: Directory under which processed datasets are cached.
            When ``None``, each file is cached next to itself.
        preprocessing_num_workers: Forwarded to ``Dataset.map`` as ``num_proc``.

    Returns:
        The concatenation of the ``train`` split of every processed file,
        with its format set to ``'torch'``.
    """

    def tokenization(examples):
        """Tokenize one batch of records into ``input_ids`` and ``labels``."""
        sources = []
        targets = []
        for instruction, input_text, output in zip(
            examples['instruction'], examples['input'], examples['output']
        ):
            if input_text is not None and input_text != "":
                instruction = instruction + '\n' + input_text
            source = (
                system_format.format(content=DEFAULT_SYSTEM_PROMPT)
                + user_format.format(content=instruction)
            )
            # NOTE(review): the target is the raw output with no `<|eot_id|>`
            # terminator, and the module-level `assistant_format` is never
            # used here - confirm whether the end-of-turn token is appended
            # elsewhere (e.g. by the data collator).
            sources.append(source)
            targets.append(output)

        tokenized_sources = tokenizer(sources, return_attention_mask=False, add_special_tokens=False)
        tokenized_targets = tokenizer(targets, return_attention_mask=False, add_special_tokens=False)

        all_input_ids = []
        all_labels = []
        for s, t in zip(tokenized_sources['input_ids'], tokenized_targets['input_ids']):
            # Truncate the Python lists first so no oversized tensor is built.
            input_ids = torch.LongTensor((s + t)[:max_seq_length])
            # Prompt tokens are masked out; only response tokens keep labels.
            labels = torch.LongTensor(([IGNORE_INDEX] * len(s) + t)[:max_seq_length])
            all_input_ids.append(input_ids)
            all_labels.append(labels)

        return {'input_ids': all_input_ids, 'labels': all_labels}

    logger.warning("building dataset...")
    all_datasets = []

    if not isinstance(data_path, (list, tuple)):
        data_path = [data_path]

    for file in data_path:
        # Resolve the cache root per file. Overwriting ``data_cache_dir`` here
        # would make every later file reuse the first file's directory.
        if data_cache_dir is None:
            cache_root = str(os.path.dirname(file))
        else:
            cache_root = data_cache_dir
        cache_path = os.path.join(
            cache_root,
            os.path.basename(file).split('.')[0] + f"_{max_seq_length}",
        )
        os.makedirs(cache_path, exist_ok=True)
        try:
            processed_dataset = datasets.load_from_disk(cache_path)
            logger.info('training datasets-%s has been loaded from disk', file)
        except Exception:
            # Deliberate best effort: any failure to load the cache (missing
            # or unreadable) falls through to rebuilding it from the raw file.
            raw_dataset = load_dataset("json", data_files=file, cache_dir=cache_path)
            processed_dataset = raw_dataset.map(
                tokenization,
                batched=True,
                num_proc=preprocessing_num_workers,
                remove_columns=["instruction", "input", "output"],
                keep_in_memory=False,
                desc="preprocessing on dataset",
            )
            processed_dataset.save_to_disk(cache_path)
        processed_dataset.set_format('torch')
        all_datasets.append(processed_dataset['train'])

    return concatenate_datasets(all_datasets)
Oops, something went wrong.