Add training scripts (#21)
* add training scripts

* fix codacy issues

* update training scripts

* fix wrong link

* Update run_sft.sh
iMountTai authored May 6, 2024
1 parent 8c8a80f commit 7ddc9a6
Showing 5 changed files with 1,279 additions and 0 deletions.
82 changes: 82 additions & 0 deletions scripts/training/build_dataset.py
@@ -0,0 +1,82 @@
import logging
import os
from typing import Union, List
import datasets
import torch
from datasets import load_dataset, concatenate_datasets
import transformers


IGNORE_INDEX = -100
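# Label value ignored by PyTorch's cross-entropy loss; used below to mask prompt tokens.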

logger = logging.getLogger(__name__)

DEFAULT_SYSTEM_PROMPT = """You are a helpful assistant. 你是一个乐于助人的助手。"""
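# Llama-3 instruct chat-template segments for the system, user, and assistant turns.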
system_format = '<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>'
user_format = '<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'
assistant_format = '{content}<|eot_id|>'

def build_instruction_dataset(data_path: Union[List[str], str],
                              tokenizer: transformers.PreTrainedTokenizer,
                              max_seq_length: int,
                              data_cache_dir=None,
                              preprocessing_num_workers=None,
                              ):
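    """Tokenize instruction-tuning JSON files into (input_ids, labels) pairs,
    masking the prompt portion of each example with IGNORE_INDEX and caching
    the processed datasets on disk."""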

    def tokenization(examples):
        sources = []
        targets = []
        for instruction, input_text, output in zip(examples['instruction'], examples['input'], examples['output']):
            if input_text is not None and input_text != "":
                instruction = instruction + '\n' + input_text
            source = system_format.format(content=DEFAULT_SYSTEM_PROMPT) + user_format.format(content=instruction)
            target = output

            sources.append(source)
            targets.append(target)

        tokenized_sources = tokenizer(sources, return_attention_mask=False, add_special_tokens=False)
        tokenized_targets = tokenizer(targets, return_attention_mask=False, add_special_tokens=False)
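        # Prompts and responses are tokenized separately so the prompt length is
        # known when the labels are masked below.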

        all_input_ids = []
        all_labels = []
        for s, t in zip(tokenized_sources['input_ids'], tokenized_targets['input_ids']):
            input_ids = torch.LongTensor(s + t)[:max_seq_length]
            labels = torch.LongTensor([IGNORE_INDEX] * len(s) + t)[:max_seq_length]
            all_input_ids.append(input_ids)
            all_labels.append(labels)

        results = {'input_ids': all_input_ids, 'labels': all_labels}
        return results


    logging.warning("building dataset...")
    all_datasets = []

    if not isinstance(data_path, (list, tuple)):
        data_path = [data_path]
    for file in data_path:

        if data_cache_dir is None:
            data_cache_dir = str(os.path.dirname(file))
        cache_path = os.path.join(data_cache_dir, os.path.basename(file).split('.')[0] + f"_{max_seq_length}")
        os.makedirs(cache_path, exist_ok=True)
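        # Reuse a previously tokenized dataset from the cache directory when available;
        # otherwise tokenize the raw JSON file and save the result for later runs.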
        try:
            processed_dataset = datasets.load_from_disk(cache_path)
            logger.info(f'training datasets-{file} has been loaded from disk')
        except Exception:
            raw_dataset = load_dataset("json", data_files=file, cache_dir=cache_path)
            tokenization_func = tokenization
            tokenized_dataset = raw_dataset.map(
                tokenization_func,
                batched=True,
                num_proc=preprocessing_num_workers,
                remove_columns=["instruction", "input", "output"],
                keep_in_memory=False,
                desc="preprocessing on dataset",
            )
            processed_dataset = tokenized_dataset
            processed_dataset.save_to_disk(cache_path)
        processed_dataset.set_format('torch')
        all_datasets.append(processed_dataset['train'])
    all_datasets = concatenate_datasets(all_datasets)
    return all_datasets
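
For context, a minimal usage sketch of build_instruction_dataset, assuming a local Llama-3 tokenizer directory and an Alpaca-style JSON file with instruction/input/output fields; the paths and values below are placeholders, not part of this commit:

from transformers import AutoTokenizer

# Placeholder tokenizer path and data file for illustration only.
tokenizer = AutoTokenizer.from_pretrained("path/to/llama-3-tokenizer")
train_dataset = build_instruction_dataset(
    data_path=["path/to/alpaca_data.json"],
    tokenizer=tokenizer,
    max_seq_length=1024,
    preprocessing_num_workers=8,
)
# The returned dataset holds tokenized 'input_ids' and prompt-masked 'labels'.
print(len(train_dataset), train_dataset.column_names)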
