From ce7a4ae416bc1bf406f14aa9774752ffbf885eb4 Mon Sep 17 00:00:00 2001 From: zealot52099 Date: Mon, 18 Mar 2024 10:31:34 +0800 Subject: [PATCH 01/13] add more directories --- rag/src/config/config.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/rag/src/config/config.py b/rag/src/config/config.py index b84327f..d803d64 100644 --- a/rag/src/config/config.py +++ b/rag/src/config/config.py @@ -13,11 +13,16 @@ # data data_dir = os.path.join(base_dir, 'data') # data knowledge_json_path = os.path.join(data_dir, 'knowledge.json') # json -knowledge_pkl_path = os.path.join(data_dir, 'knowledge.pkl') # pickle +knowledge_pkl_path = os.path.join(data_dir, 'knowledge.pkl') # pkl +doc_dir = os.path.join(data_dir, 'txt') +qa_dir = os.path.join(data_dir, 'json') # log log_dir = os.path.join(base_dir, 'log') # log log_path = os.path.join(log_dir, 'log.log') # file +# vector DB +vector_db_dir = os.path.join(data_dir, 'vector_db.pkl') + select_num = 3 retrieval_num = 10 \ No newline at end of file From 5879afffe6fdd0eef12f9fadaebfe54a803d1a71 Mon Sep 17 00:00:00 2001 From: zealot52099 Date: Mon, 18 Mar 2024 10:32:27 +0800 Subject: [PATCH 02/13] add data_processing.py --- rag/src/data_processing.py | 262 +++++++++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 rag/src/data_processing.py diff --git a/rag/src/data_processing.py b/rag/src/data_processing.py new file mode 100644 index 0000000..e3215fd --- /dev/null +++ b/rag/src/data_processing.py @@ -0,0 +1,262 @@ +import json +import pickle +from loguru import logger +from sentence_transformers import SentenceTransformer + +from config.config import embedding_path, doc_dir, qa_dir, knowledge_pkl_path, data_dir, base_dir, vector_db_dir +import os +import faiss +import platform +from langchain_community.document_loaders import DirectoryLoader, TextLoader, JSONLoader +from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter +from BCEmbedding import EmbeddingModel, RerankerModel +from util.pipeline import EmoLLMRAG +import pickle +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch +import streamlit as st +from openxlab.model import download + + +''' +1)根据QA对/TXT 文本生成 embedding +2)调用 langchain FAISS 接口构建 vector DB +3)存储到 openxlab.dataset 中,方便后续调用 +4)提供 embedding 的接口函数,方便后续调用 +5)提供 rerank 的接口函数,方便后续调用 +''' + +""" +加载向量模型 +""" +def load_embedding_model(): + logger.info('Loading embedding model...') + # model = EmbeddingModel(model_name_or_path="huggingface/bce-embedding-base_v1") + model = EmbeddingModel(model_name_or_path="maidalun1020/bce-embedding-base_v1") + logger.info('Embedding model loaded.') + return model + +def load_rerank_model(): + logger.info('Loading rerank_model...') + model = RerankerModel(model_name_or_path="maidalun1020/bce-reranker-base_v1") + # model = RerankerModel(model_name_or_path="huggingface/bce-reranker-base_v1") + logger.info('Rerank model loaded.') + return model + + +def split_document(data_path, chunk_size=1000, chunk_overlap=100): + # text_spliter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + text_spliter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + split_docs = [] + logger.info(f'Loading txt files from {data_path}') + if os.path.isdir(data_path): + # 如果是文件夹,则遍历读取 + for root, dirs, files in os.walk(data_path): + for file in files: + if file.endswith('.txt'): + file_path = os.path.join(root, file) + # logger.info(f'splitting file {file_path}') + 
text_loader = TextLoader(file_path, encoding='utf-8') + text = text_loader.load() + + splits = text_spliter.split_documents(text) + # logger.info(f"splits type {type(splits[0])}") + # logger.info(f'splits size {len(splits)}') + split_docs += splits + elif file.endswith('.txt'): + file_path = os.path.join(root, file) + # logger.info(f'splitting file {file_path}') + text_loader = TextLoader(file_path, encoding='utf-8') + text = text_loader.load() + splits = text_spliter.split_documents(text) + # logger.info(f"splits type {type(splits[0])}") + # logger.info(f'splits size {len(splits)}') + split_docs = splits + logger.info(f'split_docs size {len(split_docs)}') + return split_docs + + +##TODO 1、读取system prompt 2、限制序列长度 +def split_conversation(path): + ''' + data format: + [ + { + "conversation": [ + { + "input": Q1 + "output": A1 + }, + { + "input": Q2 + "output": A2 + }, + ] + }, + ] + ''' + qa_pairs = [] + logger.info(f'Loading json files from {path}') + if os.path.isfile(path): + with open(path, 'r', encoding='utf-8') as file: + data = json.load(file) + for conversation in data: + for dialog in conversation['conversation']: + # input_text = dialog['input'] + # output_text = dialog['output'] + # if len(input_text) > max_length or len(output_text) > max_length: + # continue + qa_pairs.append(dialog) + elif os.path.isdir(path): + # 如果是文件夹,则遍历读取 + for root, dirs, files in os.walk(path): + for file in files: + if file.endswith('.json'): + file_path = os.path.join(root, file) + logger.info(f'splitting file {file_path}') + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + for conversation in data: + for dialog in conversation['conversation']: + qa_pairs.append(dialog) + return qa_pairs + + + +# 加载本地索引 +def load_index_and_knowledge(): + current_os = platform.system() + split_doc = [] + split_qa = [] + #读取知识库 + if not os.path.exists(knowledge_pkl_path): + split_doc = split_document(doc_dir) + split_qa = split_conversation(qa_dir) + # logger.info(f'split_qa size:{len(split_qa)}') + # logger.info(f'type of split_qa:{type(split_qa[0])}') + # logger.info(f'split_doc size:{len(split_doc)}') + # logger.info(f'type of doc:{type(split_doc[0])}') + knowledge_chunks = split_doc + split_qa + with open(knowledge_pkl_path, 'wb') as file: + pickle.dump(knowledge_chunks, file) + else: + with open(knowledge_pkl_path , 'rb') as f: + knowledge_chunks = pickle.load(f) + + #读取vector DB + if not os.path.exists(vector_db_dir): + logger.info(f'Creating index...') + emb_model = load_embedding_model() + if not split_doc: + split_doc = split_document(doc_dir) + if not split_qa: + split_qa = split_conversation(qa_dir) + # 创建索引,windows不支持faiss-gpu + if current_os == 'Linux': + index = create_index_gpu(split_doc, split_qa, emb_model, vector_db_dir) + else: + index = create_index_cpu(split_doc, split_qa, emb_model, vector_db_dir) + else: + if current_os == 'Linux': + res = faiss.StandardGpuResources() + index = faiss.index_cpu_to_gpu(res, 0, index, vector_db_dir) + else: + index = faiss.read_index(vector_db_dir) + + return index, knowledge_chunks + + +def create_index_cpu(split_doc, split_qa, emb_model, knowledge_pkl_path, dimension = 768, question_only=False): + # 假设BCE嵌入的维度是768,根据你选择的模型可能不同 + faiss_index_cpu = faiss.IndexFlatIP(dimension) # 创建一个使用内积的FAISS索引 + # 将问答对转换为向量并添加到FAISS索引中 + for doc in split_doc: + # type_of_docs = type(split_doc) + text = f"{doc.page_content}" + vector = emb_model.encode([text]) + faiss_index_cpu.add(vector) + for qa in split_qa: + #仅对Q对进行编码 + text = f"{qa['input']}" + vector 
= emb_model.encode([text]) + faiss_index_cpu.add(vector) + faiss.write_index(faiss_index_cpu, knowledge_pkl_path) + return faiss_index_cpu + +def create_index_gpu(split_doc, split_qa, emb_model, knowledge_pkl_path, dimension = 768, question_only=False): + res = faiss.StandardGpuResources() + index = faiss.IndexFlatIP(dimension) + faiss_index_gpu = faiss.index_cpu_to_gpu(res, 0, index) + for doc in split_doc: + # type_of_docs = type(split_doc) + text = f"{doc.page_content}" + vector = emb_model.encode([text]) + faiss_index_gpu.add(vector) + for qa in split_qa: + #仅对Q对进行编码 + text = f"{qa['input']}" + vector = emb_model.encode([text]) + faiss_index_gpu.add(vector) + faiss.write_index(faiss_index_gpu, knowledge_pkl_path) + return faiss_index_gpu + + + +# 根据query搜索相似文本 +def find_top_k(query, faiss_index, k=5): + emb_model = load_embedding_model() + emb_query = emb_model.encode([query]) + distances, indices = faiss_index.search(emb_query, k) + return distances, indices + +def rerank(query, indices, knowledge_chunks): + passages = [] + for index in indices[0]: + content = knowledge_chunks[index] + ''' + txt: 'langchain_core.documents.base.Document' + json: dict + ''' + # logger.info(f'retrieved content:{content}') + # logger.info(f'type of content:{type(content)}') + if type(content) == dict: + content = content["input"] + '\n' + content["output"] + else: + content = content.page_content + passages.append(content) + + model = load_rerank_model() + rerank_results = model.rerank(query, passages) + return rerank_results + +@st.cache_resource +def load_model(): + model = ( + AutoModelForCausalLM.from_pretrained("model", trust_remote_code=True) + .to(torch.bfloat16) + .cuda() + ) + tokenizer = AutoTokenizer.from_pretrained("model", trust_remote_code=True) + return model, tokenizer + +if __name__ == "__main__": + logger.info(data_dir) + if not os.path.exists(data_dir): + os.mkdir(data_dir) + faiss_index, knowledge_chunks = load_index_and_knowledge() + # 按照query进行查询 + # query = "她要阻挠姐姐的婚姻,即使她自己的尸体在房门跟前" + # query = "肯定的。我最近睡眠很差,总是做噩梦。而且我吃得也不好,体重一直在下降" + # query = "序言 (一) 变态心理学是心理学本科生的必修课程之一,教材更新的问题一直在困扰着我们。" + query = "心理咨询师,我觉得我的胸闷症状越来越严重了,这让我很害怕" + distances, indices = find_top_k(query, faiss_index, 5) + logger.info(f'distances==={distances}') + logger.info(f'indices==={indices}') + + + # rerank无法返回id,先实现按整个问答对排序 + rerank_results = rerank(query, indices, knowledge_chunks) + for passage, score in zip(rerank_results['rerank_passages'], rerank_results['rerank_scores']): + print(str(score)+'\n') + print(passage+'\n') + \ No newline at end of file From 74db6d98932ee9abe5d248359e96ebf34ad3842f Mon Sep 17 00:00:00 2001 From: zealot52099 Date: Mon, 18 Mar 2024 10:33:01 +0800 Subject: [PATCH 03/13] update main.py --- rag/src/main.py | 112 ++++++++++++++++++++---------------------------- 1 file changed, 46 insertions(+), 66 deletions(-) diff --git a/rag/src/main.py b/rag/src/main.py index 97f60a0..7dd7639 100644 --- a/rag/src/main.py +++ b/rag/src/main.py @@ -5,87 +5,67 @@ from typing import Tuple from sentence_transformers import SentenceTransformer -from config.config import knowledge_json_path, knowledge_pkl_path, model_repo +from config.config import knowledge_json_path, knowledge_pkl_path, model_repo, model_dir, base_dir from util.encode import load_embedding, encode_qa from util.pipeline import EmoLLMRAG - +from loguru import logger from transformers import AutoTokenizer, AutoModelForCausalLM import torch import streamlit as st from openxlab.model import download +from data_processing import 
load_index_and_knowledge, create_index_cpu, create_index_gpu, find_top_k, rerank +from config.config import embedding_path, doc_dir, qa_dir, knowledge_pkl_path, data_dir -download( - model_repo=model_repo, - output='model' -) - - -""" -读取知识库 -""" -def load_knowledge() -> Tuple[list, list]: - # 如果 pkl 不存在,则先编码存储 - if not os.path.exists(knowledge_pkl_path): - encode_qa(knowledge_json_path, knowledge_pkl_path) - - # 加载 json 和 pkl - with open(knowledge_json_path, 'r', encoding='utf-8') as f1, open(knowledge_pkl_path, 'rb') as f2: - knowledge = json.load(f1) - encoded_knowledge = pickle.load(f2) - return knowledge, encoded_knowledge - - -""" -召回 top_k 个相关的文本段 -""" -def find_top_k( - emb: SentenceTransformer, - query: str, - knowledge: list, - encoded_knowledge: list, - k=3 -) -> list[str]: - # 编码 query - query_embedding = emb.encode(query) - - # 查找 top_k - scores = query_embedding @ encoded_knowledge.T - # 使用 argpartition 找出每行第 k 个大的值的索引,第 k 个位置左侧都是比它大的值,右侧都是比它小的值 - top_k_indices = np.argpartition(scores, -k)[-k:] - # 由于 argpartition 不保证顺序,我们需要对提取出的 k 个索引进行排序 - top_k_values_sorted_indices = np.argsort(scores[top_k_indices])[::-1] - top_k_indices = top_k_indices[top_k_values_sorted_indices] +''' + 1)构建完整的 RAG pipeline。输入为用户 query,输出为 answer + 2)调用 embedding 提供的接口对 query 向量化 + 3)下载基于 FAISS 预构建的 vector DB ,并检索对应信息 + 4)调用 rerank 接口重排序检索内容 + 5)调用 prompt 接口获取 system prompt 和 prompt template + 6)拼接 prompt 并调用模型返回结果 - # 返回 - contents = [knowledge[index] for index in top_k_indices] - return contents - - -def main(): - emb = load_embedding() - knowledge, encoded_knowledge = load_knowledge() - query = "认知心理学研究哪些心理活动?" - contents = find_top_k(emb, query, knowledge, encoded_knowledge, 2) - print('召回的 top-k 条相关内容如下:') - print(json.dumps(contents, ensure_ascii=False, indent=2)) - # 这里我没实现 LLM 部分,如果有 LLM - ## 1. 读取 LLM - ## 2. 将 contents 拼接为 prompt,传给 LLM,作为 {已知内容} - ## 3. 
要求 LLM 根据已知内容回复 +''' +# download( +# model_repo=model_repo, +# output='model' +# ) @st.cache_resource def load_model(): + model_dir = os.path.join(base_dir,'../model') + logger.info(f'Loading model from {model_dir}') model = ( - AutoModelForCausalLM.from_pretrained("model", trust_remote_code=True) + AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True) .to(torch.bfloat16) .cuda() ) - tokenizer = AutoTokenizer.from_pretrained("model", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) return model, tokenizer -if __name__ == '__main__': - #main() - query = '' +def get_prompt(): + pass + +def get_prompt_template(): + pass + +def main(query, system_prompt): model, tokenizer = load_model() - rag_obj = EmoLLMRAG(model) - response = rag_obj.main(query) \ No newline at end of file + model = model.eval() + if not os.path.exists(data_dir): + os.mkdir(data_dir) + # 下载基于 FAISS 预构建的 vector DB 以及原始知识库 + faiss_index, knowledge_chunks = load_index_and_knowledge() + distances, indices = find_top_k(query, faiss_index, 5) + rerank_results = rerank(query, indices, knowledge_chunks) + messages = [(system_prompt, rerank_results['rerank_passages'][0])] + logger.info(f'messages:{messages}') + response, history = model.chat(tokenizer, query, history=messages) + messages.append((query, response)) + print(f"robot >>> {response}") + +if __name__ == '__main__': + # query = '你好' + query = "心理咨询师,我觉得我的胸闷症状越来越严重了,这让我很害怕" + #TODO system_prompt = get_prompt() + system_prompt = "你是一个由aJupyter、Farewell、jujimeizuo、Smiling&Weeping研发(排名按字母顺序排序,不分先后)、散步提供技术支持、上海人工智能实验室提供支持开发的心理健康大模型。现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。" + main(query, system_prompt) \ No newline at end of file From 98ecdda78d9c7e8536eceef3b66db0d52f22fcc4 Mon Sep 17 00:00:00 2001 From: zealot52099 Date: Mon, 18 Mar 2024 10:46:09 +0800 Subject: [PATCH 04/13] fix bug --- rag/src/data_processing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rag/src/data_processing.py b/rag/src/data_processing.py index e3215fd..45ff3f0 100644 --- a/rag/src/data_processing.py +++ b/rag/src/data_processing.py @@ -63,8 +63,8 @@ def split_document(data_path, chunk_size=1000, chunk_overlap=100): # logger.info(f"splits type {type(splits[0])}") # logger.info(f'splits size {len(splits)}') split_docs += splits - elif file.endswith('.txt'): - file_path = os.path.join(root, file) + elif data_path.endswith('.txt'): + file_path = os.path.join(root, data_path) # logger.info(f'splitting file {file_path}') text_loader = TextLoader(file_path, encoding='utf-8') text = text_loader.load() From 042146af56b830bad8860323dc79524c6a3dd079 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Mon, 18 Mar 2024 22:13:35 +0900 Subject: [PATCH 05/13] Revert "modified merge_jsonl and merge_jsonl_r" This reverts commit a38ef600587df3f036c4ad9572998d6e4882fed2. 
--- .gitignore | 1 - generate_data/final_data/merge_jsonl.py | 60 ------------------ generate_data/final_data/merge_jsonl_r.py | 75 ----------------------- 3 files changed, 136 deletions(-) delete mode 100644 generate_data/final_data/merge_jsonl.py delete mode 100644 generate_data/final_data/merge_jsonl_r.py diff --git a/.gitignore b/.gitignore index 7467647..b2c615a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ zhipuai/ data/ *.jsonl -*.json # ./generate_data/*.josnl # ./generate_data/*/*/*.josnl diff --git a/generate_data/final_data/merge_jsonl.py b/generate_data/final_data/merge_jsonl.py deleted file mode 100644 index b8edd10..0000000 --- a/generate_data/final_data/merge_jsonl.py +++ /dev/null @@ -1,60 +0,0 @@ -import json -import os - - -def save_merge_json(data_lis, file_path): - with open(file_path, 'wt', encoding='utf-8') as file: - json.dump(data_lis, file, ensure_ascii=False, separators=(',\n',':')) - - -def get_all_file_paths(folder_path, file_type='.jsonl'): - # 确保传入的是一个目录 - if not os.path.isdir(folder_path): - raise ValueError(f"{folder_path} is not a valid directory") - - # 获取文件夹下所有文件的路径 - file_paths = [os.path.join(folder_path, file) for file in os.listdir( - folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)] - return file_paths - - -if __name__ == '__main__': - conversion_lis = [] - - folder_path = r'./' - - merge_path = folder_path.split('/')[-1] - try: - merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2]!='.' else '' - except: - merge_last_path = '' - print(f'merge_path={merge_path},merge_last_path={merge_last_path}') - - - for path in get_all_file_paths(folder_path): - print(path) - - with open(path, 'rt', encoding='utf-8') as file: - for line in file: - # # 移除行尾的换行符 - # if line == '\n': - # line = line.rstrip('\n') - line = line.rstrip('\n') - # 解析JSON - try: - data = json.loads(line) - conversion_lis.append(data) - # conversion_lis.append('\n') - except json.JSONDecodeError as e: - print(f"Error decoding JSON: {e}") - - if merge_last_path!='': - save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json' - elif merge_path!='': - save_merge_json_path = rf'./{merge_path}_merge.json' - else: - save_merge_json_path = rf'./curr_merge.json' - - save_merge_json(data_lis=conversion_lis, - file_path=save_merge_json_path) - print(len(conversion_lis),save_merge_json_path) diff --git a/generate_data/final_data/merge_jsonl_r.py b/generate_data/final_data/merge_jsonl_r.py deleted file mode 100644 index a29c951..0000000 --- a/generate_data/final_data/merge_jsonl_r.py +++ /dev/null @@ -1,75 +0,0 @@ -import json -import os - - -def save_merge_json(data_lis, file_path): - with open(file_path, 'wt', encoding='utf-8') as file: - json.dump(data_lis, file, ensure_ascii=False, separators=(',\n',':')) - - -def get_all_file_paths(folder_path, file_type='.jsonl'): - # 确保传入的是一个目录 - if not os.path.isdir(folder_path): - raise ValueError(f"{folder_path} is not a valid directory") - - # 获取文件夹下所有文件的路径 - file_paths = [os.path.join(folder_path, file) for file in os.listdir( - folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)] - return file_paths - - -if __name__ == '__main__': - - data_ai = 'qwen' # python merge_jsonl_r.py > qwen.txt - # data_ai = 'zhipuai' # python merge_jsonl_r.py > zhipuai.txt - root_dir = rf'./{data_ai}/' - - save_final_merge_json_path = f'{data_ai}_final_merge.json' - - subfolders = [os.path.join(root_dir, d) for d in os.listdir(root_dir) if 
os.path.isdir(os.path.join(root_dir, d))] - - final_list = [] - for folder_path in subfolders: - conversion_lis = [] - merge_path = folder_path.split('/')[-1] - try: - merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2]!='.' else '' - except: - merge_last_path = '' - print(f'merge_path={merge_path},merge_last_path={merge_last_path}') - - - for path in get_all_file_paths(folder_path): - print(path) - - with open(path, 'rt', encoding='utf-8') as file: - for line in file: - # # 移除行尾的换行符 - # if line == '\n': - # line = line.rstrip('\n') - line = line.rstrip('\n') - # 解析JSON - try: - data = json.loads(line) - conversion_lis.append(data) - # conversion_lis.append('\n') - except json.JSONDecodeError as e: - print(f"Error decoding JSON: {e}") - - if merge_last_path!='': - save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json' - elif merge_path!='': - save_merge_json_path = rf'./{merge_path}_merge.json' - else: - save_merge_json_path = rf'./curr_merge.json' - - save_merge_json(data_lis=conversion_lis, - file_path=save_merge_json_path) - - final_list = final_list+conversion_lis - print(len(conversion_lis),len(final_list),save_merge_json_path) - - save_merge_json(data_lis=final_list,file_path=save_final_merge_json_path) - print(save_final_merge_json_path) - - From c16761e289825c631d7c54d8ba3baaaf188f2a58 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Mon, 18 Mar 2024 23:35:21 +0900 Subject: [PATCH 06/13] update three merge_json*.py files and corresponding tutorial in CN and EN update three merge_json*.py files and corresponding tutorial in CN and EN --- .gitignore | 2 + generate_data/merge_json.py | 40 ++++++++++++++++++ generate_data/merge_jsonl.py | 62 +++++++++++++++++++++++++++ generate_data/merge_jsonl_r.py | 77 ++++++++++++++++++++++++++++++++++ generate_data/tutorial.md | 71 ++++++++++++++++++++----------- generate_data/tutorial_EN.md | 75 +++++++++++++++++++++------------ 6 files changed, 276 insertions(+), 51 deletions(-) create mode 100644 generate_data/merge_json.py create mode 100644 generate_data/merge_jsonl.py create mode 100644 generate_data/merge_jsonl_r.py diff --git a/.gitignore b/.gitignore index 2d26489..d6ca709 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,8 @@ pdf/ .idea/ *.jsonl +*.json +*.txt # ./generate_data/*.josnl # ./generate_data/*/*/*.josnl diff --git a/generate_data/merge_json.py b/generate_data/merge_json.py new file mode 100644 index 0000000..714befb --- /dev/null +++ b/generate_data/merge_json.py @@ -0,0 +1,40 @@ +import json +import os + + +def save_merge_json(data_lis, file_path): + import json + + with open(file_path, 'wt', encoding='utf-8') as file: + json.dump(data_lis, file, ensure_ascii=False) + + +def get_all_file_paths(folder_path): + # 确保传入的是一个目录 + if not os.path.isdir(folder_path): + raise ValueError(f"{folder_path} is not a valid directory") + + # 获取文件夹下所有文件的路径 + file_paths = [os.path.join(folder_path, file) for file in os.listdir( + folder_path) if os.path.isfile(os.path.join(folder_path, file))] + return file_paths + + +if __name__ == '__main__': + conversion_lis = [] + + for path in get_all_file_paths(r'data\res-aiwei'): + print(path) + + with open(path, 'rt', encoding='utf-8') as file: + for line in file: + # 移除行尾的换行符 + line = line.rstrip('\n') + # 解析JSON + try: + data = json.loads(line) + conversion_lis.append(data) + except json.JSONDecodeError as e: + print(f"Error decoding JSON: {e}") + save_merge_json(data_lis=conversion_lis, + file_path=r'.\merge.json') diff --git a/generate_data/merge_jsonl.py 
b/generate_data/merge_jsonl.py new file mode 100644 index 0000000..7887ab0 --- /dev/null +++ b/generate_data/merge_jsonl.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- + +import json +import os + + +def save_merge_json(data_lis, file_path): + with open(file_path, 'wt', encoding='utf-8') as file: + json.dump(data_lis, file, ensure_ascii=False, separators=(',\n',':')) + + +def get_all_file_paths(folder_path, file_type='.jsonl'): + # 确保传入的是一个目录 + if not os.path.isdir(folder_path): + raise ValueError(f"{folder_path} is not a valid directory") + + # 获取文件夹下所有文件的路径 + file_paths = [os.path.join(folder_path, file) for file in os.listdir( + folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)] + return file_paths + + +if __name__ == '__main__': + conversion_lis = [] + + folder_path = r'./' # python merge_jsonl.py > curr.txt + + merge_path = folder_path.split('/')[-1] + try: + merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2]!='.' else '' + except: + merge_last_path = '' + print(f'merge_path={merge_path},merge_last_path={merge_last_path}') + + + for path in get_all_file_paths(folder_path): + print(path.encode("utf-8")) + + with open(path, 'rt', encoding='utf-8') as file: + for line in file: + # # 移除行尾的换行符 + # if line == '\n': + # line = line.rstrip('\n') + line = line.rstrip('\n') + # 解析JSON + try: + data = json.loads(line) + conversion_lis.append(data) + # conversion_lis.append('\n') + except json.JSONDecodeError as e: + print(f"Error decoding JSON: {e}") + + if merge_last_path!='': + save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json' + elif merge_path!='': + save_merge_json_path = rf'./{merge_path}_merge.json' + else: + save_merge_json_path = rf'./curr_merge.json' + + save_merge_json(data_lis=conversion_lis, + file_path=save_merge_json_path) + print(len(conversion_lis),save_merge_json_path) diff --git a/generate_data/merge_jsonl_r.py b/generate_data/merge_jsonl_r.py new file mode 100644 index 0000000..cf4998a --- /dev/null +++ b/generate_data/merge_jsonl_r.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- + +import json +import os + + +def save_merge_json(data_lis, file_path): + with open(file_path, 'wt', encoding='utf-8') as file: + json.dump(data_lis, file, ensure_ascii=False, separators=(',\n',':')) + + +def get_all_file_paths(folder_path, file_type='.jsonl'): + # 确保传入的是一个目录 + if not os.path.isdir(folder_path): + raise ValueError(f"{folder_path} is not a valid directory") + + # 获取文件夹下所有文件的路径 + file_paths = [os.path.join(folder_path, file) for file in os.listdir( + folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)] + return file_paths + + +if __name__ == '__main__': + + data_ai = 'qwen' # python merge_jsonl_r.py > qwen.txt + # data_ai = 'zhipuai' # python merge_jsonl_r.py > zhipuai.txt + root_dir = rf'./{data_ai}/' + + save_final_merge_json_path = f'{data_ai}_final_merge.json' + + subfolders = [os.path.join(root_dir, d) for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))] + + final_list = [] + for folder_path in subfolders: + conversion_lis = [] + merge_path = folder_path.split('/')[-1] + try: + merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2]!='.' 
else '' + except: + merge_last_path = '' + print(f'merge_path={merge_path},merge_last_path={merge_last_path}'.encode("utf-8")) + + + for path in get_all_file_paths(folder_path): + print(path.encode("utf-8")) + + with open(path, 'rt', encoding='utf-8') as file: + for line in file: + # # 移除行尾的换行符 + # if line == '\n': + # line = line.rstrip('\n') + line = line.rstrip('\n') + # 解析JSON + try: + data = json.loads(line) + conversion_lis.append(data) + # conversion_lis.append('\n') + except json.JSONDecodeError as e: + print(f"Error decoding JSON: {e}") + + if merge_last_path!='': + save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json' + elif merge_path!='': + save_merge_json_path = rf'./{merge_path}_merge.json' + else: + save_merge_json_path = rf'./curr_merge.json' + + save_merge_json(data_lis=conversion_lis, + file_path=save_merge_json_path) + + final_list = final_list+conversion_lis + print(f'{len(conversion_lis)},{len(final_list)},{save_merge_json_path}'.encode("utf-8")) + + save_merge_json(data_lis=final_list,file_path=save_final_merge_json_path) + print(len(conversion_lis),save_final_merge_json_path.encode("utf-8")) + + diff --git a/generate_data/tutorial.md b/generate_data/tutorial.md index 80426b4..f7af989 100644 --- a/generate_data/tutorial.md +++ b/generate_data/tutorial.md @@ -22,7 +22,7 @@ ## **三、实践步骤** -1. **初始化** +### 1. **初始化** * 安装所需的软件和库 @@ -34,49 +34,62 @@ 可参见 `config.yml`均有注释 -2. **模型选择与配置** +### 2. **模型选择与配置** * 根据需求选择适合的模型 为了使大家都能够玩上大模型,我们选用InterLLM2-7B作为我们的基线模型(消费级显卡也可部署微调的哦) * 对模型进行必要的配置和调整 根据我们的数据集以及配置策略,使用XTuner进行微调 -3. **数据生成** +### 3. **数据生成** + +#### **三种改进前的数据生成方法** * 使用通义千问大模型进行数据生成 - ```bash +```bash # 终端运行 bash run_qwen.bash - - # 或者不使用终端运行 - python qwen_gen_data_NoBash.py - ``` +``` * 使用百度文心大模型进行数据生成 - ```bash +```bash # 终端运行 python ernie_gen_data.py - ``` +``` -* 使用智谱GLM大模型进行数据生成 +* 使用讯飞星火大模型进行数据生成 - ```bash +```bash # 终端运行 - python zhipuai_gen_data.py - ``` + python ./xinghuo/gen_data.py +``` -* 使用讯飞星火大模型进行数据生成 +#### **改进的两种数据生成方法** + +采用改进的数据生成方法生成多轮对话时,首先需要定义`ai_tool`变量,该变量表示LLM模型的名称(`qwen`或`zhipuai`)。根据`ai_tool`变量的值,创建一个`{ai_tool}`文件夹。 + +然后,遍历所有的`area`值,接着根据不同的`emotion`值生成多轮对话。生成的对话会每隔`save_interval`次迭代写入到`./{ai_tool}/{area}/{emotion}.jsonl`文件中。这个过程会重复执行`total_num_each_emo_area`次。 + +* 使用**改进的**通义千问大模型数据生成方法 - ```bash +```bash + # 或者不使用bash,直接运行 + python qwen_gen_data_NoBash.py +``` + +* 使用**改进的**智谱GLM大模型数据生成方法 + +```bash # 终端运行 - python ./xinghuo/gen_data.py - ``` + python zhipuai_gen_data.py +``` -1. **自我认知数据集的整合** +### 4. **自我认知数据集的整合** * 自我认知数据集需要按照格式手动生成,如下格式即可 + ```json [ { @@ -98,19 +111,27 @@ ] ``` -5. **数据集整合** +### 5. 
**数据集整合** + +#### Case 1: 使用`python ernie_gen_data.py`、`bash run_qwen.bash`或者`python ./xinghuo/gen_data.py` + +* 首先使用`check.py`进行数据检查。在进行数据集整合之前,我们要检查生成的数据是否存在格式错误,类型不符合等情况。 +* 然后使用`merge_json.py`将所有的json(或者使用`merge_jsonl.py`将所有的jsonl)文件整合为一个总的json文件。 + +#### Case 2: 使用`python qwen_gen_data_NoBash.py`或者`python zhipuai_gen_data.py` - 在进行数据集整合之前,我们要检查生成的数据是否存在格式错误,类型不符合等情况。 +在这种情况下,我们需要在使用两种改进的生成方法生成多轮对话后,将`{data_ai}`文件夹下所有`{area}`子文件夹中的所有`{emotion}.jsonl`文件合并为`{data_ai}_final_merge.json`文件。 -* 首先使用`check.py`进行数据检查。 -* 然后使用`merge_json.py`将所有的json整合为一个总的json文件。 +* 由于采用了改进的数据生成方法和不同的存储生成对话结构,因此我们可以免除对数据集的检查。 +* 然后使用`merge_jsonl_r.py`将`qwen`或者`zhipuai`定义为`data_ai`变量,并将其文件夹下所有领域(`area`)下所有的jsonl文件整合为一个总的json文件并取名为`{area}_merge.json`,最终在`{data_ai}`文件夹下生成`{data_ai}_final_merge.json`。 +* 然后我们可以手动合成`qwen_final_merge.json`和`zhipuai_final_merge.json`为`qwen_zhipuai_final_merge.json`文件了, 注意合并后的json文件夹中,最外面只有一对`[]`,中间是`{}`包裹的多轮对话。 -6. **评估与优化** +### 6. **评估与优化** * 使用适当的评估指标对生成的数据集进行评估 * 根据评估结果进行必要的优化和调整 -7. **测试与部署** +### 7. **测试与部署** * 使用独立测试集对训练好的模型进行评估 * 根据测试结果进行必要的调整和优化 diff --git a/generate_data/tutorial_EN.md b/generate_data/tutorial_EN.md index 25e10e2..85acf33 100644 --- a/generate_data/tutorial_EN.md +++ b/generate_data/tutorial_EN.md @@ -22,7 +22,7 @@ In order to have a better representation of our large mental models, we must hav ## **III. Practical steps** -1. **Initialize** +### 1. **Initialize** * Install the required software and libraries @@ -34,7 +34,7 @@ In order to have a better representation of our large mental models, we must hav See `config.yml` for annotations -2. **Model selection and configuration** +### 2. **Model selection and configuration** * Select the right model for your needs In order to enable everyone to play with the large model, we chose the InterLLM2-7B as our baseline model (consumer graphics cards can also be deployed fine-tuned oh). @@ -42,40 +42,52 @@ In order to have a better representation of our large mental models, we must hav * Make necessary configurations and adjustments to the model Use XTuner for fine-tuning based on our dataset and configuration strategy. -3. **Data generation** +### 3. **Data generation** -* Data generation using Tongyi Qianwen +#### **Three original methods for data generation** + +* 1.Data generation using Tongyi Qianwen - ```bash +```bash # Terminal operation bash run_qwen.bash +``` - # Or just use python without bash - python qwen_gen_data_NoBash.py - ``` - -* Data generation using Wenxin Yiyan +* 2.Data generation using Wenxin Yiyan - ```bash +```bash # Terminal operation python ernie_gen_data.py - ``` +``` -* Data generation using Zhipu GLM +* 3.Data generation using IFlystar Fire - ```bash +```bash # Terminal operation - python zhipuai_gen_data.py - ``` + python ./xinghuo/gen_data.py +``` + +#### **Two improved methods for data generation** + +When generating multi-turn dialogues with these two improved methods, the first step is to define the value of the `ai_tool` variable, which represents the LLM model name (`qwen` or `zhipuai`). Based on the value of this `ai_tool` variable, a `{ai_tool}` folder is created. -* Data generation using IFlystar Fire +Then, all `area` values are traversed, followed by different `emotion` values for generating multi-turn dialogues. The generated dialogues are written to the `./{ai_tool}/{area}/{emotion}.jsonl` file every `save_interval` iterations. This process is repeated `total_num_each_emo_area` times. 
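As an editorial illustration of the loop described above, a minimal sketch might look like the following. The function `generate_dialogue` and the argument names here are hypothetical placeholders, not the actual code of `qwen_gen_data_NoBash.py` or `zhipuai_gen_data.py`:

```python
import json
import os

def generate_all(ai_tool, areas, emotions, total_num_each_emo_area,
                 save_interval, generate_dialogue):
    """Hypothetical outline of the improved generation loop described above."""
    for area in areas:
        out_dir = os.path.join(ai_tool, area)          # ./{ai_tool}/{area}/
        os.makedirs(out_dir, exist_ok=True)
        for emotion in emotions:
            out_path = os.path.join(out_dir, f'{emotion}.jsonl')
            buffer = []
            for i in range(total_num_each_emo_area):
                # generate_dialogue stands in for the actual LLM call (qwen / zhipuai)
                buffer.append(generate_dialogue(area, emotion))
                # flush to {emotion}.jsonl every save_interval iterations
                if (i + 1) % save_interval == 0:
                    with open(out_path, 'a', encoding='utf-8') as f:
                        for conv in buffer:
                            f.write(json.dumps(conv, ensure_ascii=False) + '\n')
                    buffer = []
            if buffer:  # flush any remainder at the end of the run
                with open(out_path, 'a', encoding='utf-8') as f:
                    for conv in buffer:
                        f.write(json.dumps(conv, ensure_ascii=False) + '\n')
```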
+ +* 1.Using the **improved** method for generating data with the Qwen model: - ```bash - # Terminal operation - python ./xinghuo/gen_data.py - ``` +```bash + # Alternatively, you can run it directly without using bash + python qwen_gen_data_NoBash.py +``` -4. **Integration of self-cognition datasets** +* 2.Using the **improved** method for generating data with the Zhipuai GLM-4 model: + +```bash + # Alternatively, you can run it directly without using bash + python zhipuai_gen_data.py +``` + +### 4. **Integration of self-cognition datasets** * Self-cognition dataset this needs to be manually generated in accordance with the format, the following format can be @@ -100,16 +112,27 @@ In order to have a better representation of our large mental models, we must hav ] ``` -5. **dataset integration** +### 5. **Dataset Integration** + +#### **Case 1**: Using `python ernie_gen_data.py`, `bash run_qwen.bash`, or `python ./xinghuo/gen_data.py` + +* First, use `check.py` to check the data. Before integrating the dataset, we need to check whether the generated data has format errors or type mismatches. +* Then, use `merge_json.py` to consolidate all json files (or use `merge_jsonl.py` to consolidate all jsonl files) into one overall json file. + +#### **Case 2**: Using `python qwen_gen_data_NoBash.py` or `python zhipuai_gen_data.py` + +In this case, we need to merge all `{emotion}.jsonl` files in all `{area}` subfolders under the `{data_ai}` folder into `{data_ai}_final_merge.json` after we use two improved generation methods to generate multi-round conversations. -Before dataset integration, we need to check whether the generated data has formatting errors, type mismatches, etc. We need check.py to check the data. Finally, merge_json.py is used to combine all the json into one overall json file. +* As we have adopted improved data generation methods and different storage generation dialog structures, we can avoid checking the dataset. +* Then, use `merge_jsonl_r.py` to define `qwen` or `zhipuai` as the `data_ai` variable, and consolidate all jsonl files in all areas (`area`) into one overall json file named `{area}_merge.json`. Finally, generate `{data_ai}_final_merge.json` in the `{data_ai}` folder. +* We can then manually merge `qwen_final_merge.json` and `zhipuai_final_merge.json` into `qwen_zhipuai_final_merge.json`. Note that in the merged json file, there is only one pair of `[]` on the outside, and the multi-round dialogues are wrapped in `{}`. -6. **Evaluation and optimization** +### 6. **Evaluation and optimization** * Evaluate the generated dataset using appropriate evaluation metrics * Make necessary optimizations and adjustments based on the evaluation results -7. **Testing and deployment** +### 7. 
**Testing and deployment** * Evaluate the trained model using an independent test set * Make necessary adjustments and optimizations based on test results From 275f2497099122dc293f3131367de4d5e91bb925 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Mon, 18 Mar 2024 23:39:49 +0900 Subject: [PATCH 07/13] small update --- generate_data/tutorial.md | 2 +- generate_data/tutorial_EN.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/generate_data/tutorial.md b/generate_data/tutorial.md index f7af989..454895a 100644 --- a/generate_data/tutorial.md +++ b/generate_data/tutorial.md @@ -118,7 +118,7 @@ * 首先使用`check.py`进行数据检查。在进行数据集整合之前,我们要检查生成的数据是否存在格式错误,类型不符合等情况。 * 然后使用`merge_json.py`将所有的json(或者使用`merge_jsonl.py`将所有的jsonl)文件整合为一个总的json文件。 -#### Case 2: 使用`python qwen_gen_data_NoBash.py`或者`python zhipuai_gen_data.py` +#### Case 2: 使用改进的生成保存方法:`python qwen_gen_data_NoBash.py`或者`python zhipuai_gen_data.py` 在这种情况下,我们需要在使用两种改进的生成方法生成多轮对话后,将`{data_ai}`文件夹下所有`{area}`子文件夹中的所有`{emotion}.jsonl`文件合并为`{data_ai}_final_merge.json`文件。 diff --git a/generate_data/tutorial_EN.md b/generate_data/tutorial_EN.md index 85acf33..fdd5d69 100644 --- a/generate_data/tutorial_EN.md +++ b/generate_data/tutorial_EN.md @@ -119,7 +119,7 @@ Then, all `area` values are traversed, followed by different `emotion` values fo * First, use `check.py` to check the data. Before integrating the dataset, we need to check whether the generated data has format errors or type mismatches. * Then, use `merge_json.py` to consolidate all json files (or use `merge_jsonl.py` to consolidate all jsonl files) into one overall json file. -#### **Case 2**: Using `python qwen_gen_data_NoBash.py` or `python zhipuai_gen_data.py` +#### **Case 2**: Using improved generation method: `python qwen_gen_data_NoBash.py` or `python zhipuai_gen_data.py` In this case, we need to merge all `{emotion}.jsonl` files in all `{area}` subfolders under the `{data_ai}` folder into `{data_ai}_final_merge.json` after we use two improved generation methods to generate multi-round conversations. 
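As a side note to the tutorial changes above: the "manual" merge of `qwen_final_merge.json` and `zhipuai_final_merge.json` is not scripted in this PR. A minimal sketch, assuming each file holds a single JSON array of conversation objects as the tutorial describes, could be:

```python
import json

# Minimal sketch: concatenate the two final-merge files into one array,
# keeping one outer [] that wraps every {} conversation.
with open('qwen_final_merge.json', 'r', encoding='utf-8') as f:
    qwen_data = json.load(f)
with open('zhipuai_final_merge.json', 'r', encoding='utf-8') as f:
    zhipuai_data = json.load(f)

with open('qwen_zhipuai_final_merge.json', 'w', encoding='utf-8') as f:
    json.dump(qwen_data + zhipuai_data, f, ensure_ascii=False)
```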
From 1de2cf5a86009a9e797fba9f6b233acb9b14d73e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=8F=8B=E6=98=89?= Date: Mon, 18 Mar 2024 23:13:00 +0800 Subject: [PATCH 08/13] update README --- README.md | 39 ++++++++++++++++++++------------------- README_EN.md | 39 ++++++++++++++++++++------------------- 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index b2abf79..2dfb036 100644 --- a/README.md +++ b/README.md @@ -210,25 +210,26 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git ### 作者(排名不分先后) -| 用户名 | 学校/组织 | 备注 | 贡献 | -| :----------: | :--------------------: | :-------------------: | :----------: | -| [aJupyter](https://github.com/aJupyter) | 南开大学在读硕士 | DataWhale成员 | 项目发起人 | -| [jujimeizuo](https://github.com/jujimeizuo) | 江南大学在读硕士 | | | -| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | 哈尔滨工业大学(威海)在读本科生 | | | -| [8baby8](https://github.com/8baby8) | 飞桨领航团区域主管 | 文心大模型核心开发者 | | -| [zxazys](https://github.com/zxazys) | 南开大学在读硕士 | | | -| [MING-ZCH](https://github.com/MING-ZCH) | 华中科技大学在读本科生 | | | -| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | swufe | | | -| [MrCatAI](https://github.com/MrCatAI) | AI搬用工 | | | -| [ZeyuBa](https://github.com/ZeyuBa) | 自动化所在读硕士 | | | -| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | 宾夕法尼亚大学在读硕士 | | | -| [Nobody-ML](https://github.com/Nobody-ML) | 中国石油大学(华东)在读本科生 | | | -| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora/) |MiniSora主要维护|数据清洗、文档翻译| -| [Mxoder](https://github.com/Mxoder) | 北京航空航天大学在读本科生 | | | -| [Anooyman](https://github.com/Anooyman) | 南京理工大学硕士 | | | -| [Vicky-3021](https://github.com/Vicky-3021) | 西安电子科技大学硕士(研0) | | | -| [SantiagoTOP](https://github.com/santiagoTOP) | 太原理工大学在读硕士 | | | -| [zealot52099](https://github.com/zealot52099) | AI搬用工 | |清洗数据、RAG| +| 用户名 | 学校/组织 | 备注 | 贡献 | +|:-------------------------------------------------------------:|:--------------------------------------------------:| :-------------------: | :----------: | +| [aJupyter](https://github.com/aJupyter) | 南开大学在读硕士 | DataWhale成员 | 项目发起人 | +| [jujimeizuo](https://github.com/jujimeizuo) | 江南大学在读硕士 | | | +| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | 哈尔滨工业大学(威海)在读本科生 | | | +| [8baby8](https://github.com/8baby8) | 飞桨领航团区域主管 | 文心大模型核心开发者 | | +| [zxazys](https://github.com/zxazys) | 南开大学在读硕士 | | | +| [MING-ZCH](https://github.com/MING-ZCH) | 华中科技大学在读本科生 | | | +| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | swufe | | | +| [MrCatAI](https://github.com/MrCatAI) | AI搬用工 | | | +| [ZeyuBa](https://github.com/ZeyuBa) | 自动化所在读硕士 | | | +| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | 宾夕法尼亚大学在读硕士 | | | +| [Nobody-ML](https://github.com/Nobody-ML) | 中国石油大学(华东)在读本科生 | | | +| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora/) |MiniSora主要维护|数据清洗、文档翻译| +| [Mxoder](https://github.com/Mxoder) | 北京航空航天大学在读本科生 | | | +| [Anooyman](https://github.com/Anooyman) | 南京理工大学硕士 | | | +| [Vicky-3021](https://github.com/Vicky-3021) | 西安电子科技大学硕士(研0) | | | +| [SantiagoTOP](https://github.com/santiagoTOP) | 太原理工大学在读硕士 | | | +| [zealot52099](https://github.com/zealot52099) | AI搬用工 | |清洗数据、RAG| +| [wwwyfff](https://github.com/wwwyfff) | 复旦大学在读硕士 | || ### 版权说明 diff --git a/README_EN.md b/README_EN.md index a8a5a3e..0d7bc5b 100644 --- a/README_EN.md +++ b/README_EN.md @@ -226,25 +226,26 @@ This 
project uses Git for version control. You can see the currently available v ### Authors (in no particular order) -| Username | School/Organization | Remarks | Contributions | -| :-------: | :-------------------: | :------------------: | :--------: | -| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator | -| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | | -| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | | -| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | | -| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | | -| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | | -| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | | -| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | | -| [ZeyuBa](https://github.com/ZeyuBa) | Institute of Automation, Master's student | | | -| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | | -| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East China), Undergraduate student | | | -| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation| -| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | | -| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | | -| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | | -| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | | -| [zealot52099](https://github.com/zealot52099) | AI Mover | |Data Processing and RAG| +| Username | School/Organization | Remarks | Contributions | +|:-------------------------------------------------------------:|:--------------------------------------------------------------------:| :------------------: | :--------: | +| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator | +| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | | +| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | | +| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | | +| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | | +| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | | +| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | | +| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | | +| [ZeyuBa](https://github.com/ZeyuBa) | Institute of Automation, Master's student | | | +| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | | +| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East 
China), Undergraduate student | | | +| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation| +| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | | +| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | | +| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | | +| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | | +| [zealot52099](https://github.com/zealot52099) | AI Mover | |Data Processing and RAG| +| [wwwyfff](https://github.com/wwwyfff) | FuDan University, Master's student | || ### Copyright Notice From 7bbe3842dcb88e30ee1aa2ad29b6b891f113f15a Mon Sep 17 00:00:00 2001 From: jkhumor <3323637090@qq.com> Date: Tue, 19 Mar 2024 12:34:51 +0800 Subject: [PATCH 09/13] modify readme --- README.md | 3 ++- README_EN.md | 41 +++++++++++++++++++++-------------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 2dfb036..48f254f 100644 --- a/README.md +++ b/README.md @@ -229,7 +229,8 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git | [Vicky-3021](https://github.com/Vicky-3021) | 西安电子科技大学硕士(研0) | | | | [SantiagoTOP](https://github.com/santiagoTOP) | 太原理工大学在读硕士 | | | | [zealot52099](https://github.com/zealot52099) | AI搬用工 | |清洗数据、RAG| -| [wwwyfff](https://github.com/wwwyfff) | 复旦大学在读硕士 | || +| [wwwyfff](https://github.com/wwwyfff) | 复旦大学在读硕士 | || +| [jkhumor](https://github.com/jkhumor) | 南开大学在读硕士 | |RAG| ### 版权说明 diff --git a/README_EN.md b/README_EN.md index 0d7bc5b..564ae67 100644 --- a/README_EN.md +++ b/README_EN.md @@ -226,26 +226,27 @@ This project uses Git for version control. 
You can see the currently available v ### Authors (in no particular order) -| Username | School/Organization | Remarks | Contributions | -|:-------------------------------------------------------------:|:--------------------------------------------------------------------:| :------------------: | :--------: | -| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator | -| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | | -| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | | -| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | | -| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | | -| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | | -| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | | -| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | | -| [ZeyuBa](https://github.com/ZeyuBa) | Institute of Automation, Master's student | | | -| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | | -| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East China), Undergraduate student | | | -| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation| -| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | | -| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | | -| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | | -| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | | -| [zealot52099](https://github.com/zealot52099) | AI Mover | |Data Processing and RAG| -| [wwwyfff](https://github.com/wwwyfff) | FuDan University, Master's student | || +| Username | School/Organization | Remarks | Contributions | +|:-------------------------------------------------------------:|:------------------------------------------------------------------------------:| :------------------: | :--------: | +| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator | +| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | | +| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | | +| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | | +| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | | +| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | | +| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | | +| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | | +| [ZeyuBa](https://github.com/ZeyuBa) | Institute of Automation, Master's student | | | +| 
[aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | | +| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East China), Undergraduate student | | | +| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation| +| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | | +| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | | +| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | | +| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | | +| [zealot52099](https://github.com/zealot52099) | AI Mover | |Data Processing and RAG| +| [wwwyfff](https://github.com/wwwyfff) | FuDan University, Master's student | || +| [jkhumor](https://github.com/jkhumor) | Nankai University, Master's student | |RAG| ### Copyright Notice From 1ee3a481b858de5ace0867bc0ac4130771e63609 Mon Sep 17 00:00:00 2001 From: jkhumor <3323637090@qq.com> Date: Tue, 19 Mar 2024 13:11:39 +0800 Subject: [PATCH 10/13] update readme --- README.md | 2 +- README_EN.md | 42 +++++++++++++++++++++--------------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 48f254f..4a8ba6a 100644 --- a/README.md +++ b/README.md @@ -230,7 +230,7 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git | [SantiagoTOP](https://github.com/santiagoTOP) | 太原理工大学在读硕士 | | | | [zealot52099](https://github.com/zealot52099) | AI搬用工 | |清洗数据、RAG| | [wwwyfff](https://github.com/wwwyfff) | 复旦大学在读硕士 | || -| [jkhumor](https://github.com/jkhumor) | 南开大学在读硕士 | |RAG| +| [jkhumor](https://github.com/jkhumor) | 南开大学在读硕士 | |RAG| ### 版权说明 diff --git a/README_EN.md b/README_EN.md index 564ae67..cdf933e 100644 --- a/README_EN.md +++ b/README_EN.md @@ -226,27 +226,27 @@ This project uses Git for version control. 
You can see the currently available v ### Authors (in no particular order) -| Username | School/Organization | Remarks | Contributions | -|:-------------------------------------------------------------:|:------------------------------------------------------------------------------:| :------------------: | :--------: | -| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator | -| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | | -| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | | -| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | | -| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | | -| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | | -| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | | -| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | | -| [ZeyuBa](https://github.com/ZeyuBa) | Institute of Automation, Master's student | | | -| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | | -| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East China), Undergraduate student | | | -| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation| -| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | | -| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | | -| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | | -| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | | -| [zealot52099](https://github.com/zealot52099) | AI Mover | |Data Processing and RAG| -| [wwwyfff](https://github.com/wwwyfff) | FuDan University, Master's student | || -| [jkhumor](https://github.com/jkhumor) | Nankai University, Master's student | |RAG| +| Username | School/Organization | Remarks | Contributions | +|:-------------------------------------------------------------:|:--------------------------------------------------------------------:| :------------------: | :--------: | +| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator | +| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | | +| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | | +| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | | +| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | | +| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | | +| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | | +| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | | +| [ZeyuBa](https://github.com/ZeyuBa) | 
Institute of Automation, Master's student | | | +| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | | +| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East China), Undergraduate student | | | +| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation| +| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | | +| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | | +| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | | +| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | | +| [zealot52099](https://github.com/zealot52099) | AI Mover | |Data Processing and RAG| +| [wwwyfff](https://github.com/wwwyfff) | FuDan University, Master's student | || +| [jkhumor](https://github.com/jkhumor) | Nankai University, Master's student | |RAG| ### Copyright Notice From 861f12d47a6595549f14d219dab16eb36889d276 Mon Sep 17 00:00:00 2001 From: zealot52099 Date: Tue, 19 Mar 2024 16:41:09 +0800 Subject: [PATCH 11/13] add deduplicate.py --- datasets/deduplicate.py | 68 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 datasets/deduplicate.py diff --git a/datasets/deduplicate.py b/datasets/deduplicate.py new file mode 100644 index 0000000..776396e --- /dev/null +++ b/datasets/deduplicate.py @@ -0,0 +1,68 @@ +import json +from loguru import logger +import os +from datasketch import MinHash +from hashlib import md5 + +def is_json_file(filename): + return filename.endswith('.json') + +# 绝对匹配 +def is_duplicate_absolutely(d1, d2): + return md5(d1.encode('utf-8')).hexdigest() == md5(d2.encode('utf-8')).hexdigest() + +# 使用MinHash生成器计算dict的签名 +def hash_dict(dict_obj): + m = MinHash() + for key, value in sorted(dict_obj.items()): + # 对于非str类型值需要先转为str + m.update(str(value).encode('utf8')) + return m + +# 使用绝对匹配和MinHash对dict列表去重 +def deduplicate_json(data_list, threshold=0.8): + seen_hashes = [] + duplicates_removed = [] + + for item in data_list: + # print(item) + # print('###########') + min_hash = hash_dict(item) + # print(f'min_hash: {min_hash}') + + # 绝对匹配去重 + if not any(is_duplicate_absolutely(str(item), str(existing)) for existing in duplicates_removed): + # MinHash相似性去重 + has_similar = False + for stored_min_hash, stored_text in seen_hashes: + if stored_min_hash.jaccard(min_hash) > threshold: + has_similar = True + break + if not has_similar: + seen_hashes.append((min_hash,item)) + duplicates_removed.append(item) + + + return duplicates_removed + +if __name__ == '__main__': + data_ai = 'qwen' + root_dir = rf'./{data_ai}/' + dedup_output_dir = os.path.join(root_dir,'dedup') + if not os.path.exists(dedup_output_dir): + os.mkdir(dedup_output_dir) + if not os.path.exists(root_dir): + logger.error(f"folder {root_dir} not exist" ) + + else: + for file in os.listdir(root_dir): + file_path = os.path.join(root_dir, file) + if os.path.isfile(file_path): + print(f'file name: {file_path}') + if is_json_file(file_path): + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + dedup_data = deduplicate_json(data) + with open(os.path.join(root_dir, 'dedup','dedup_' + file), 'w', encoding='utf-8') as output_file: + json.dump(dedup_data, output_file, ensure_ascii=False, 
indent=4) + \ No newline at end of file From 6e7bd5e5d4954cf11cb7659dafdab9ef209df877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=8F=8B=E6=98=89?= Date: Tue, 19 Mar 2024 18:03:26 +0800 Subject: [PATCH 12/13] GLM-6B ft --- xtuner_config/ChatGLM3-6b-ft.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xtuner_config/ChatGLM3-6b-ft.md b/xtuner_config/ChatGLM3-6b-ft.md index 37015c8..a1867be 100644 --- a/xtuner_config/ChatGLM3-6b-ft.md +++ b/xtuner_config/ChatGLM3-6b-ft.md @@ -65,8 +65,7 @@ LLM 的微调一般指指令微调过程。所谓指令微调,是说我们使 def process_func(example): MAX_LENGTH = 512 input_ids, labels = [], [] - instruction = tokenizer.encode(text="\n".join(["<|system|>", "现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。", "<|user|>", - example["system"] + example["input"] + "<|assistant|>"]).strip() + "\n", + instruction = tokenizer.encode(text="\n".join(["<|system|>", example["system"], "<|user|>", example["input"] + "<|assistant|>"]).strip() + "\n", add_special_tokens=True, truncation=True, max_length=MAX_LENGTH) response = tokenizer.encode(text=example["output"], add_special_tokens=False, truncation=True, From 96f6ce307e6049aa35f3165f438db9c74e274458 Mon Sep 17 00:00:00 2001 From: Anooyman <875734078@qq.com> Date: Tue, 19 Mar 2024 21:11:10 +0800 Subject: [PATCH 13/13] Update --- rag/src/config/config.py | 11 +++++++++- rag/src/{util => }/pipeline.py | 40 +++++++++++++++------------------- 2 files changed, 27 insertions(+), 24 deletions(-) rename rag/src/{util => }/pipeline.py (76%) diff --git a/rag/src/config/config.py b/rag/src/config/config.py index d803d64..d4dcfe3 100644 --- a/rag/src/config/config.py +++ b/rag/src/config/config.py @@ -25,4 +25,13 @@ vector_db_dir = os.path.join(data_dir, 'vector_db.pkl') select_num = 3 -retrieval_num = 10 \ No newline at end of file +retrieval_num = 10 +system_prompt = """ + 你是一个拥有丰富心理学知识的温柔邻家温柔大姐姐艾薇,我有一些心理问题,请你用专业的知识和温柔、可爱、俏皮、的口吻帮我解决,回复中可以穿插一些可爱的Emoji表情符号或者文本符号。\n +""" +prompt_template = """ + {system_prompt} + 根据下面检索回来的信息,回答问题。 + {content} + 问题:{question} +""" \ No newline at end of file diff --git a/rag/src/util/pipeline.py b/rag/src/pipeline.py similarity index 76% rename from rag/src/util/pipeline.py rename to rag/src/pipeline.py index a6f2cdf..214eef3 100644 --- a/rag/src/util/pipeline.py +++ b/rag/src/pipeline.py @@ -2,7 +2,8 @@ from langchain_core.prompts import PromptTemplate from transformers.utils import logging -from config.config import retrieval_num, select_num +from data_processing import DataProcessing +from config.config import retrieval_num, select_num, system_prompt, prompt_template logger = logging.get_logger(__name__) @@ -16,7 +17,7 @@ class EmoLLMRAG(object): 4. 
将 query 和检索回来的 content 传入 LLM 中 """ - def __init__(self, model) -> None: + def __init__(self, model, retrieval_num, rerank_flag=False, select_num=3) -> None: """ 输入 Model 进行初始化 @@ -30,42 +31,35 @@ def __init__(self, model) -> None: self.vectorstores = self._load_vector_db() self.system_prompt = self._get_system_prompt() self.prompt_template = self._get_prompt_template() - - # 等待 embedding team 封装对应接口 - #self.data_process_obj = DataProcessing() + self.data_processing_obj = DataProcessing() + self.system_prompt = system_prompt + self.prompt_template = prompt_template + self.retrieval_num = retrieval_num + self.rerank_flag = rerank_flag + self.select_num = select_num def _load_vector_db(self): """ 调用 embedding 模块给出接口 load vector DB """ - return - - def _get_system_prompt(self) -> str: - """ - 加载 system prompt - """ - return '' + vectorstores = self.data_processing_obj.load_vector_db() + if not vectorstores: + vectorstores = self.data_processing_obj.load_index_and_knowledge() - def _get_prompt_template(self) -> str: - """ - 加载 prompt template - """ - return '' + return vectorstores - def get_retrieval_content(self, query, rerank_flag=False) -> str: + def get_retrieval_content(self, query) -> str: """ Input: 用户提问, 是否需要rerank ouput: 检索后并且 rerank 的内容 """ content = '' - documents = self.vectorstores.similarity_search(query, k=retrieval_num) + documents = self.vectorstores.similarity_search(query, k=self.retrieval_num) # 如果需要rerank,调用接口对 documents 进行 rerank - if rerank_flag: - pass - # 等后续调用接口 - #documents = self.data_process_obj.rerank_documents(documents, select_num) + if self.rerank_flag: + documents = self.data_processing_obj.rerank(documents, self.select_num) for doc in documents: content += doc.page_content
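
Editorial note: the diff above is cut off inside `get_retrieval_content`, so the code that assembles the final prompt is not shown. As a rough, hypothetical sketch of how the `prompt_template` added to `config.py` in this patch could be combined with the retrieved passages (the helper name `build_prompt` is an illustration, not part of the PR):

```python
from typing import List

from config.config import prompt_template, system_prompt

def build_prompt(question: str, passages: List[str]) -> str:
    # Fill the template from config.py with the retrieval/rerank results;
    # the equivalent logic inside EmoLLMRAG is truncated in the diff above.
    content = '\n'.join(passages)
    return prompt_template.format(
        system_prompt=system_prompt,
        content=content,
        question=question,
    )
```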