craft_train_data.py
import json
import os
import random

# Crawl the per-repo JSON outputs, drop overly long samples, and keep at most
# max_lens_for_repo records per file in a single training-data JSON.
saved_name = "codes_train_data_v11_24.03.08.json"
max_lens_for_repo = 50
code_data_size = 2000
nl_data_size = 2000

# # Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM
# tokenizer = AutoTokenizer.from_pretrained("/Users/zandaoguang/Desktop/CodeLlama-7b-Instruct")
# def cal_tokens(text):
#     inputs = tokenizer(text)
#     input_ids = inputs["input_ids"]
#     return len(input_ids)

path = "/Users/zandaoguang/Desktop/Intern/huawei/codes/outputs"

new_data = []
over_num = 0
for root, dirs, files in os.walk(path):
    for file_name in files:
        if file_name.endswith(".json"):
            file_path = os.path.join(root, file_name)
            with open(file_path, "r") as f:
                data = json.load(f)

            filtered_data = []
            for item in data:
                # this_tokens = cal_tokens(item["instruction"] + item["output"])
                # print(this_tokens)
                # Rough length filter: skip samples longer than 10,000
                # whitespace-separated words instead of counting tokenizer tokens.
                words_len = len((item["instruction"] + item["output"]).split(" "))
                if words_len > 10000:
                    over_num += 1
                    continue
                # if this_tokens > max_sizes:
                #     max_sizes = this_tokens
                filtered_data.append(item)

            # Shuffle and sample at most max_lens_for_repo records per file.
            random.shuffle(filtered_data)
            new_data.extend(filtered_data[:max_lens_for_repo])

print(f"over: {over_num}")
print(f"saved: {len(new_data)}")

saved_path = f"/Users/zandaoguang/Desktop/Intern/huawei/codes/training_data/{saved_name}"
with open(saved_path, "w+") as f:
    json.dump(new_data, f, indent=4)

print("done!")