Add training scripts #21

Merged · 5 commits · May 6, 2024
82 changes: 82 additions & 0 deletions scripts/training/build_dataset.py
@@ -0,0 +1,82 @@
import logging
import os
from typing import Union, List
import datasets
import torch
from datasets import load_dataset, concatenate_datasets
import transformers


IGNORE_INDEX = -100  # default ignore_index of torch.nn.CrossEntropyLoss; masked label positions contribute no loss

logger = logging.getLogger(__name__)

DEFAULT_SYSTEM_PROMPT = """You are a helpful assistant. 你是一个乐于助人的助手。"""

# Llama-3 chat-template fragments used to assemble each training example.
system_format = '<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>'
user_format = '<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
assistant_format = '{content}<|eot_id|>'
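
# For reference, one fully assembled training example looks like the sketch
# below ({instruction} and {output} stand for the dataset fields, and the
# \n escapes are literal newlines in the template strings):
#   <|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant. ...<|eot_id|>
#   <|start_header_id|>user<|end_header_id|>\n\n{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n
#   {output}<|eot_id|>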

def build_instruction_dataset(data_path: Union[List[str], str],
                              tokenizer: transformers.PreTrainedTokenizer,
                              max_seq_length: int,
                              data_cache_dir=None,
                              preprocessing_num_workers=None,
                              ):
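    """Tokenize one or more JSON instruction files into a training dataset.

    Each record is expected to carry 'instruction', 'input', and 'output'
    fields; prompts are rendered with the Llama-3 template strings above,
    and results are cached on disk per file and max_seq_length.
    """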

    def tokenization(examples):
        sources = []
        targets = []
        for instruction, input_text, output in zip(examples['instruction'], examples['input'], examples['output']):
            # Fold the optional 'input' field into the instruction, Alpaca-style.
            if input_text is not None and input_text != "":
                instruction = instruction + '\n' + input_text
            source = system_format.format(content=DEFAULT_SYSTEM_PROMPT) + user_format.format(content=instruction)
            target = assistant_format.format(content=output)

            sources.append(source)
            targets.append(target)

        tokenized_sources = tokenizer(sources, return_attention_mask=False, add_special_tokens=False)
        tokenized_targets = tokenizer(targets, return_attention_mask=False, add_special_tokens=False)

        all_input_ids = []
        all_labels = []
        for s, t in zip(tokenized_sources['input_ids'], tokenized_targets['input_ids']):
            input_ids = torch.LongTensor(s + t)[:max_seq_length]
            # Mask the prompt tokens with IGNORE_INDEX so the loss covers only the response.
            labels = torch.LongTensor([IGNORE_INDEX] * len(s) + t)[:max_seq_length]
            all_input_ids.append(input_ids)
            all_labels.append(labels)

        results = {'input_ids': all_input_ids, 'labels': all_labels}
        return results
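
    # Illustration of the alignment tokenization() produces (token ids are
    # hypothetical): for a 3-token prompt and a 3-token response,
    #   input_ids: [s0, s1, s2, t0, t1, t2]
    #   labels:    [-100, -100, -100, t0, t1, t2]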


    logger.info("building dataset...")
    all_datasets = []

    if not isinstance(data_path, (list, tuple)):
        data_path = [data_path]
    for file in data_path:

        if data_cache_dir is None:
            data_cache_dir = str(os.path.dirname(file))
        # Cache per file and sequence length: <data_cache_dir>/<stem>_<max_seq_length>
        cache_path = os.path.join(data_cache_dir, os.path.basename(file).split('.')[0] + f"_{max_seq_length}")
        os.makedirs(cache_path, exist_ok=True)
        try:
            processed_dataset = datasets.load_from_disk(cache_path)
            logger.info(f'training dataset {file} has been loaded from disk')
        except Exception:
            # Cache miss: load the raw JSON and tokenize it in parallel.
            raw_dataset = load_dataset("json", data_files=file, cache_dir=cache_path)
            tokenized_dataset = raw_dataset.map(
                tokenization,
                batched=True,
                num_proc=preprocessing_num_workers,
                remove_columns=["instruction", "input", "output"],
                keep_in_memory=False,
                desc="preprocessing on dataset",
            )
            processed_dataset = tokenized_dataset
            processed_dataset.save_to_disk(cache_path)
        processed_dataset.set_format('torch')
        all_datasets.append(processed_dataset['train'])
    all_datasets = concatenate_datasets(all_datasets)
    return all_datasets
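
A minimal usage sketch; the checkpoint name, data path, and hyperparameters are placeholders (the file ships no __main__ entry, so it is meant to be imported by the training script):

from transformers import AutoTokenizer, DataCollatorForSeq2Seq

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")  # placeholder checkpoint
train_dataset = build_instruction_dataset(
    data_path=["data/sft_train.json"],  # placeholder file of instruction/input/output records
    tokenizer=tokenizer,
    max_seq_length=1024,
    preprocessing_num_workers=8,
)
# DataCollatorForSeq2Seq pads input_ids and labels to the batch maximum and
# fills label padding with -100, matching IGNORE_INDEX above.
data_collator = DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=IGNORE_INDEX)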