From ce7a4ae416bc1bf406f14aa9774752ffbf885eb4 Mon Sep 17 00:00:00 2001 From: zealot52099 Date: Mon, 18 Mar 2024 10:31:34 +0800 Subject: [PATCH 01/13] add more directories --- rag/src/config/config.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/rag/src/config/config.py b/rag/src/config/config.py index b84327f..d803d64 100644 --- a/rag/src/config/config.py +++ b/rag/src/config/config.py @@ -13,11 +13,16 @@ # data data_dir = os.path.join(base_dir, 'data') # data knowledge_json_path = os.path.join(data_dir, 'knowledge.json') # json -knowledge_pkl_path = os.path.join(data_dir, 'knowledge.pkl') # pickle +knowledge_pkl_path = os.path.join(data_dir, 'knowledge.pkl') # pkl +doc_dir = os.path.join(data_dir, 'txt') +qa_dir = os.path.join(data_dir, 'json') # log log_dir = os.path.join(base_dir, 'log') # log log_path = os.path.join(log_dir, 'log.log') # file +# vector DB +vector_db_dir = os.path.join(data_dir, 'vector_db.pkl') + select_num = 3 retrieval_num = 10 \ No newline at end of file From 5879afffe6fdd0eef12f9fadaebfe54a803d1a71 Mon Sep 17 00:00:00 2001 From: zealot52099 Date: Mon, 18 Mar 2024 10:32:27 +0800 Subject: [PATCH 02/13] add data_processing.py --- rag/src/data_processing.py | 262 +++++++++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 rag/src/data_processing.py diff --git a/rag/src/data_processing.py b/rag/src/data_processing.py new file mode 100644 index 0000000..e3215fd --- /dev/null +++ b/rag/src/data_processing.py @@ -0,0 +1,262 @@ +import json +import pickle +from loguru import logger +from sentence_transformers import SentenceTransformer + +from config.config import embedding_path, doc_dir, qa_dir, knowledge_pkl_path, data_dir, base_dir, vector_db_dir +import os +import faiss +import platform +from langchain_community.document_loaders import DirectoryLoader, TextLoader, JSONLoader +from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter +from BCEmbedding import EmbeddingModel, RerankerModel +from util.pipeline import EmoLLMRAG +import pickle +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch +import streamlit as st +from openxlab.model import download + + +''' +1)根据QA对/TXT 文本生成 embedding +2)调用 langchain FAISS 接口构建 vector DB +3)存储到 openxlab.dataset 中,方便后续调用 +4)提供 embedding 的接口函数,方便后续调用 +5)提供 rerank 的接口函数,方便后续调用 +''' + +""" +加载向量模型 +""" +def load_embedding_model(): + logger.info('Loading embedding model...') + # model = EmbeddingModel(model_name_or_path="huggingface/bce-embedding-base_v1") + model = EmbeddingModel(model_name_or_path="maidalun1020/bce-embedding-base_v1") + logger.info('Embedding model loaded.') + return model + +def load_rerank_model(): + logger.info('Loading rerank_model...') + model = RerankerModel(model_name_or_path="maidalun1020/bce-reranker-base_v1") + # model = RerankerModel(model_name_or_path="huggingface/bce-reranker-base_v1") + logger.info('Rerank model loaded.') + return model + + +def split_document(data_path, chunk_size=1000, chunk_overlap=100): + # text_spliter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + text_spliter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) + split_docs = [] + logger.info(f'Loading txt files from {data_path}') + if os.path.isdir(data_path): + # 如果是文件夹,则遍历读取 + for root, dirs, files in os.walk(data_path): + for file in files: + if file.endswith('.txt'): + file_path = os.path.join(root, file) + # logger.info(f'splitting file {file_path}') + 
text_loader = TextLoader(file_path, encoding='utf-8') + text = text_loader.load() + + splits = text_spliter.split_documents(text) + # logger.info(f"splits type {type(splits[0])}") + # logger.info(f'splits size {len(splits)}') + split_docs += splits + elif file.endswith('.txt'): + file_path = os.path.join(root, file) + # logger.info(f'splitting file {file_path}') + text_loader = TextLoader(file_path, encoding='utf-8') + text = text_loader.load() + splits = text_spliter.split_documents(text) + # logger.info(f"splits type {type(splits[0])}") + # logger.info(f'splits size {len(splits)}') + split_docs = splits + logger.info(f'split_docs size {len(split_docs)}') + return split_docs + + +##TODO 1、读取system prompt 2、限制序列长度 +def split_conversation(path): + ''' + data format: + [ + { + "conversation": [ + { + "input": Q1 + "output": A1 + }, + { + "input": Q2 + "output": A2 + }, + ] + }, + ] + ''' + qa_pairs = [] + logger.info(f'Loading json files from {path}') + if os.path.isfile(path): + with open(path, 'r', encoding='utf-8') as file: + data = json.load(file) + for conversation in data: + for dialog in conversation['conversation']: + # input_text = dialog['input'] + # output_text = dialog['output'] + # if len(input_text) > max_length or len(output_text) > max_length: + # continue + qa_pairs.append(dialog) + elif os.path.isdir(path): + # 如果是文件夹,则遍历读取 + for root, dirs, files in os.walk(path): + for file in files: + if file.endswith('.json'): + file_path = os.path.join(root, file) + logger.info(f'splitting file {file_path}') + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + for conversation in data: + for dialog in conversation['conversation']: + qa_pairs.append(dialog) + return qa_pairs + + + +# 加载本地索引 +def load_index_and_knowledge(): + current_os = platform.system() + split_doc = [] + split_qa = [] + #读取知识库 + if not os.path.exists(knowledge_pkl_path): + split_doc = split_document(doc_dir) + split_qa = split_conversation(qa_dir) + # logger.info(f'split_qa size:{len(split_qa)}') + # logger.info(f'type of split_qa:{type(split_qa[0])}') + # logger.info(f'split_doc size:{len(split_doc)}') + # logger.info(f'type of doc:{type(split_doc[0])}') + knowledge_chunks = split_doc + split_qa + with open(knowledge_pkl_path, 'wb') as file: + pickle.dump(knowledge_chunks, file) + else: + with open(knowledge_pkl_path , 'rb') as f: + knowledge_chunks = pickle.load(f) + + #读取vector DB + if not os.path.exists(vector_db_dir): + logger.info(f'Creating index...') + emb_model = load_embedding_model() + if not split_doc: + split_doc = split_document(doc_dir) + if not split_qa: + split_qa = split_conversation(qa_dir) + # 创建索引,windows不支持faiss-gpu + if current_os == 'Linux': + index = create_index_gpu(split_doc, split_qa, emb_model, vector_db_dir) + else: + index = create_index_cpu(split_doc, split_qa, emb_model, vector_db_dir) + else: + if current_os == 'Linux': + res = faiss.StandardGpuResources() + index = faiss.index_cpu_to_gpu(res, 0, index, vector_db_dir) + else: + index = faiss.read_index(vector_db_dir) + + return index, knowledge_chunks + + +def create_index_cpu(split_doc, split_qa, emb_model, knowledge_pkl_path, dimension = 768, question_only=False): + # 假设BCE嵌入的维度是768,根据你选择的模型可能不同 + faiss_index_cpu = faiss.IndexFlatIP(dimension) # 创建一个使用内积的FAISS索引 + # 将问答对转换为向量并添加到FAISS索引中 + for doc in split_doc: + # type_of_docs = type(split_doc) + text = f"{doc.page_content}" + vector = emb_model.encode([text]) + faiss_index_cpu.add(vector) + for qa in split_qa: + #仅对Q对进行编码 + text = f"{qa['input']}" + vector 
= emb_model.encode([text]) + faiss_index_cpu.add(vector) + faiss.write_index(faiss_index_cpu, knowledge_pkl_path) + return faiss_index_cpu + +def create_index_gpu(split_doc, split_qa, emb_model, knowledge_pkl_path, dimension = 768, question_only=False): + res = faiss.StandardGpuResources() + index = faiss.IndexFlatIP(dimension) + faiss_index_gpu = faiss.index_cpu_to_gpu(res, 0, index) + for doc in split_doc: + # type_of_docs = type(split_doc) + text = f"{doc.page_content}" + vector = emb_model.encode([text]) + faiss_index_gpu.add(vector) + for qa in split_qa: + #仅对Q对进行编码 + text = f"{qa['input']}" + vector = emb_model.encode([text]) + faiss_index_gpu.add(vector) + faiss.write_index(faiss_index_gpu, knowledge_pkl_path) + return faiss_index_gpu + + + +# 根据query搜索相似文本 +def find_top_k(query, faiss_index, k=5): + emb_model = load_embedding_model() + emb_query = emb_model.encode([query]) + distances, indices = faiss_index.search(emb_query, k) + return distances, indices + +def rerank(query, indices, knowledge_chunks): + passages = [] + for index in indices[0]: + content = knowledge_chunks[index] + ''' + txt: 'langchain_core.documents.base.Document' + json: dict + ''' + # logger.info(f'retrieved content:{content}') + # logger.info(f'type of content:{type(content)}') + if type(content) == dict: + content = content["input"] + '\n' + content["output"] + else: + content = content.page_content + passages.append(content) + + model = load_rerank_model() + rerank_results = model.rerank(query, passages) + return rerank_results + +@st.cache_resource +def load_model(): + model = ( + AutoModelForCausalLM.from_pretrained("model", trust_remote_code=True) + .to(torch.bfloat16) + .cuda() + ) + tokenizer = AutoTokenizer.from_pretrained("model", trust_remote_code=True) + return model, tokenizer + +if __name__ == "__main__": + logger.info(data_dir) + if not os.path.exists(data_dir): + os.mkdir(data_dir) + faiss_index, knowledge_chunks = load_index_and_knowledge() + # 按照query进行查询 + # query = "她要阻挠姐姐的婚姻,即使她自己的尸体在房门跟前" + # query = "肯定的。我最近睡眠很差,总是做噩梦。而且我吃得也不好,体重一直在下降" + # query = "序言 (一) 变态心理学是心理学本科生的必修课程之一,教材更新的问题一直在困扰着我们。" + query = "心理咨询师,我觉得我的胸闷症状越来越严重了,这让我很害怕" + distances, indices = find_top_k(query, faiss_index, 5) + logger.info(f'distances==={distances}') + logger.info(f'indices==={indices}') + + + # rerank无法返回id,先实现按整个问答对排序 + rerank_results = rerank(query, indices, knowledge_chunks) + for passage, score in zip(rerank_results['rerank_passages'], rerank_results['rerank_scores']): + print(str(score)+'\n') + print(passage+'\n') + \ No newline at end of file From 74db6d98932ee9abe5d248359e96ebf34ad3842f Mon Sep 17 00:00:00 2001 From: zealot52099 Date: Mon, 18 Mar 2024 10:33:01 +0800 Subject: [PATCH 03/13] update main.py --- rag/src/main.py | 112 ++++++++++++++++++++---------------------------- 1 file changed, 46 insertions(+), 66 deletions(-) diff --git a/rag/src/main.py b/rag/src/main.py index 97f60a0..7dd7639 100644 --- a/rag/src/main.py +++ b/rag/src/main.py @@ -5,87 +5,67 @@ from typing import Tuple from sentence_transformers import SentenceTransformer -from config.config import knowledge_json_path, knowledge_pkl_path, model_repo +from config.config import knowledge_json_path, knowledge_pkl_path, model_repo, model_dir, base_dir from util.encode import load_embedding, encode_qa from util.pipeline import EmoLLMRAG - +from loguru import logger from transformers import AutoTokenizer, AutoModelForCausalLM import torch import streamlit as st from openxlab.model import download +from data_processing import 
load_index_and_knowledge, create_index_cpu, create_index_gpu, find_top_k, rerank +from config.config import embedding_path, doc_dir, qa_dir, knowledge_pkl_path, data_dir -download( - model_repo=model_repo, - output='model' -) - - -""" -读取知识库 -""" -def load_knowledge() -> Tuple[list, list]: - # 如果 pkl 不存在,则先编码存储 - if not os.path.exists(knowledge_pkl_path): - encode_qa(knowledge_json_path, knowledge_pkl_path) - - # 加载 json 和 pkl - with open(knowledge_json_path, 'r', encoding='utf-8') as f1, open(knowledge_pkl_path, 'rb') as f2: - knowledge = json.load(f1) - encoded_knowledge = pickle.load(f2) - return knowledge, encoded_knowledge - - -""" -召回 top_k 个相关的文本段 -""" -def find_top_k( - emb: SentenceTransformer, - query: str, - knowledge: list, - encoded_knowledge: list, - k=3 -) -> list[str]: - # 编码 query - query_embedding = emb.encode(query) - - # 查找 top_k - scores = query_embedding @ encoded_knowledge.T - # 使用 argpartition 找出每行第 k 个大的值的索引,第 k 个位置左侧都是比它大的值,右侧都是比它小的值 - top_k_indices = np.argpartition(scores, -k)[-k:] - # 由于 argpartition 不保证顺序,我们需要对提取出的 k 个索引进行排序 - top_k_values_sorted_indices = np.argsort(scores[top_k_indices])[::-1] - top_k_indices = top_k_indices[top_k_values_sorted_indices] +''' + 1)构建完整的 RAG pipeline。输入为用户 query,输出为 answer + 2)调用 embedding 提供的接口对 query 向量化 + 3)下载基于 FAISS 预构建的 vector DB ,并检索对应信息 + 4)调用 rerank 接口重排序检索内容 + 5)调用 prompt 接口获取 system prompt 和 prompt template + 6)拼接 prompt 并调用模型返回结果 - # 返回 - contents = [knowledge[index] for index in top_k_indices] - return contents - - -def main(): - emb = load_embedding() - knowledge, encoded_knowledge = load_knowledge() - query = "认知心理学研究哪些心理活动?" - contents = find_top_k(emb, query, knowledge, encoded_knowledge, 2) - print('召回的 top-k 条相关内容如下:') - print(json.dumps(contents, ensure_ascii=False, indent=2)) - # 这里我没实现 LLM 部分,如果有 LLM - ## 1. 读取 LLM - ## 2. 将 contents 拼接为 prompt,传给 LLM,作为 {已知内容} - ## 3. 
要求 LLM 根据已知内容回复 +''' +# download( +# model_repo=model_repo, +# output='model' +# ) @st.cache_resource def load_model(): + model_dir = os.path.join(base_dir,'../model') + logger.info(f'Loading model from {model_dir}') model = ( - AutoModelForCausalLM.from_pretrained("model", trust_remote_code=True) + AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True) .to(torch.bfloat16) .cuda() ) - tokenizer = AutoTokenizer.from_pretrained("model", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) return model, tokenizer -if __name__ == '__main__': - #main() - query = '' +def get_prompt(): + pass + +def get_prompt_template(): + pass + +def main(query, system_prompt): model, tokenizer = load_model() - rag_obj = EmoLLMRAG(model) - response = rag_obj.main(query) \ No newline at end of file + model = model.eval() + if not os.path.exists(data_dir): + os.mkdir(data_dir) + # 下载基于 FAISS 预构建的 vector DB 以及原始知识库 + faiss_index, knowledge_chunks = load_index_and_knowledge() + distances, indices = find_top_k(query, faiss_index, 5) + rerank_results = rerank(query, indices, knowledge_chunks) + messages = [(system_prompt, rerank_results['rerank_passages'][0])] + logger.info(f'messages:{messages}') + response, history = model.chat(tokenizer, query, history=messages) + messages.append((query, response)) + print(f"robot >>> {response}") + +if __name__ == '__main__': + # query = '你好' + query = "心理咨询师,我觉得我的胸闷症状越来越严重了,这让我很害怕" + #TODO system_prompt = get_prompt() + system_prompt = "你是一个由aJupyter、Farewell、jujimeizuo、Smiling&Weeping研发(排名按字母顺序排序,不分先后)、散步提供技术支持、上海人工智能实验室提供支持开发的心理健康大模型。现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。" + main(query, system_prompt) \ No newline at end of file From 98ecdda78d9c7e8536eceef3b66db0d52f22fcc4 Mon Sep 17 00:00:00 2001 From: zealot52099 Date: Mon, 18 Mar 2024 10:46:09 +0800 Subject: [PATCH 04/13] fix bug --- rag/src/data_processing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rag/src/data_processing.py b/rag/src/data_processing.py index e3215fd..45ff3f0 100644 --- a/rag/src/data_processing.py +++ b/rag/src/data_processing.py @@ -63,8 +63,8 @@ def split_document(data_path, chunk_size=1000, chunk_overlap=100): # logger.info(f"splits type {type(splits[0])}") # logger.info(f'splits size {len(splits)}') split_docs += splits - elif file.endswith('.txt'): - file_path = os.path.join(root, file) + elif data_path.endswith('.txt'): + file_path = os.path.join(root, data_path) # logger.info(f'splitting file {file_path}') text_loader = TextLoader(file_path, encoding='utf-8') text = text_loader.load() From 042146af56b830bad8860323dc79524c6a3dd079 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Mon, 18 Mar 2024 22:13:35 +0900 Subject: [PATCH 05/13] Revert "modified merge_jsonl and merge_jsonl_r" This reverts commit a38ef600587df3f036c4ad9572998d6e4882fed2. 
--- .gitignore | 1 - generate_data/final_data/merge_jsonl.py | 60 ------------------ generate_data/final_data/merge_jsonl_r.py | 75 ----------------------- 3 files changed, 136 deletions(-) delete mode 100644 generate_data/final_data/merge_jsonl.py delete mode 100644 generate_data/final_data/merge_jsonl_r.py diff --git a/.gitignore b/.gitignore index 7467647..b2c615a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,6 @@ zhipuai/ data/ *.jsonl -*.json # ./generate_data/*.josnl # ./generate_data/*/*/*.josnl diff --git a/generate_data/final_data/merge_jsonl.py b/generate_data/final_data/merge_jsonl.py deleted file mode 100644 index b8edd10..0000000 --- a/generate_data/final_data/merge_jsonl.py +++ /dev/null @@ -1,60 +0,0 @@ -import json -import os - - -def save_merge_json(data_lis, file_path): - with open(file_path, 'wt', encoding='utf-8') as file: - json.dump(data_lis, file, ensure_ascii=False, separators=(',\n',':')) - - -def get_all_file_paths(folder_path, file_type='.jsonl'): - # 确保传入的是一个目录 - if not os.path.isdir(folder_path): - raise ValueError(f"{folder_path} is not a valid directory") - - # 获取文件夹下所有文件的路径 - file_paths = [os.path.join(folder_path, file) for file in os.listdir( - folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)] - return file_paths - - -if __name__ == '__main__': - conversion_lis = [] - - folder_path = r'./' - - merge_path = folder_path.split('/')[-1] - try: - merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2]!='.' else '' - except: - merge_last_path = '' - print(f'merge_path={merge_path},merge_last_path={merge_last_path}') - - - for path in get_all_file_paths(folder_path): - print(path) - - with open(path, 'rt', encoding='utf-8') as file: - for line in file: - # # 移除行尾的换行符 - # if line == '\n': - # line = line.rstrip('\n') - line = line.rstrip('\n') - # 解析JSON - try: - data = json.loads(line) - conversion_lis.append(data) - # conversion_lis.append('\n') - except json.JSONDecodeError as e: - print(f"Error decoding JSON: {e}") - - if merge_last_path!='': - save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json' - elif merge_path!='': - save_merge_json_path = rf'./{merge_path}_merge.json' - else: - save_merge_json_path = rf'./curr_merge.json' - - save_merge_json(data_lis=conversion_lis, - file_path=save_merge_json_path) - print(len(conversion_lis),save_merge_json_path) diff --git a/generate_data/final_data/merge_jsonl_r.py b/generate_data/final_data/merge_jsonl_r.py deleted file mode 100644 index a29c951..0000000 --- a/generate_data/final_data/merge_jsonl_r.py +++ /dev/null @@ -1,75 +0,0 @@ -import json -import os - - -def save_merge_json(data_lis, file_path): - with open(file_path, 'wt', encoding='utf-8') as file: - json.dump(data_lis, file, ensure_ascii=False, separators=(',\n',':')) - - -def get_all_file_paths(folder_path, file_type='.jsonl'): - # 确保传入的是一个目录 - if not os.path.isdir(folder_path): - raise ValueError(f"{folder_path} is not a valid directory") - - # 获取文件夹下所有文件的路径 - file_paths = [os.path.join(folder_path, file) for file in os.listdir( - folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)] - return file_paths - - -if __name__ == '__main__': - - data_ai = 'qwen' # python merge_jsonl_r.py > qwen.txt - # data_ai = 'zhipuai' # python merge_jsonl_r.py > zhipuai.txt - root_dir = rf'./{data_ai}/' - - save_final_merge_json_path = f'{data_ai}_final_merge.json' - - subfolders = [os.path.join(root_dir, d) for d in os.listdir(root_dir) if 
os.path.isdir(os.path.join(root_dir, d))] - - final_list = [] - for folder_path in subfolders: - conversion_lis = [] - merge_path = folder_path.split('/')[-1] - try: - merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2]!='.' else '' - except: - merge_last_path = '' - print(f'merge_path={merge_path},merge_last_path={merge_last_path}') - - - for path in get_all_file_paths(folder_path): - print(path) - - with open(path, 'rt', encoding='utf-8') as file: - for line in file: - # # 移除行尾的换行符 - # if line == '\n': - # line = line.rstrip('\n') - line = line.rstrip('\n') - # 解析JSON - try: - data = json.loads(line) - conversion_lis.append(data) - # conversion_lis.append('\n') - except json.JSONDecodeError as e: - print(f"Error decoding JSON: {e}") - - if merge_last_path!='': - save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json' - elif merge_path!='': - save_merge_json_path = rf'./{merge_path}_merge.json' - else: - save_merge_json_path = rf'./curr_merge.json' - - save_merge_json(data_lis=conversion_lis, - file_path=save_merge_json_path) - - final_list = final_list+conversion_lis - print(len(conversion_lis),len(final_list),save_merge_json_path) - - save_merge_json(data_lis=final_list,file_path=save_final_merge_json_path) - print(save_final_merge_json_path) - - From c16761e289825c631d7c54d8ba3baaaf188f2a58 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Mon, 18 Mar 2024 23:35:21 +0900 Subject: [PATCH 06/13] update three merge_json*.py files and corresponding tutorial in CN and EN update three merge_json*.py files and corresponding tutorial in CN and EN --- .gitignore | 2 + generate_data/merge_json.py | 40 ++++++++++++++++++ generate_data/merge_jsonl.py | 62 +++++++++++++++++++++++++++ generate_data/merge_jsonl_r.py | 77 ++++++++++++++++++++++++++++++++++ generate_data/tutorial.md | 71 ++++++++++++++++++++----------- generate_data/tutorial_EN.md | 75 +++++++++++++++++++++------------ 6 files changed, 276 insertions(+), 51 deletions(-) create mode 100644 generate_data/merge_json.py create mode 100644 generate_data/merge_jsonl.py create mode 100644 generate_data/merge_jsonl_r.py diff --git a/.gitignore b/.gitignore index 2d26489..d6ca709 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,8 @@ pdf/ .idea/ *.jsonl +*.json +*.txt # ./generate_data/*.josnl # ./generate_data/*/*/*.josnl diff --git a/generate_data/merge_json.py b/generate_data/merge_json.py new file mode 100644 index 0000000..714befb --- /dev/null +++ b/generate_data/merge_json.py @@ -0,0 +1,40 @@ +import json +import os + + +def save_merge_json(data_lis, file_path): + import json + + with open(file_path, 'wt', encoding='utf-8') as file: + json.dump(data_lis, file, ensure_ascii=False) + + +def get_all_file_paths(folder_path): + # 确保传入的是一个目录 + if not os.path.isdir(folder_path): + raise ValueError(f"{folder_path} is not a valid directory") + + # 获取文件夹下所有文件的路径 + file_paths = [os.path.join(folder_path, file) for file in os.listdir( + folder_path) if os.path.isfile(os.path.join(folder_path, file))] + return file_paths + + +if __name__ == '__main__': + conversion_lis = [] + + for path in get_all_file_paths(r'data\res-aiwei'): + print(path) + + with open(path, 'rt', encoding='utf-8') as file: + for line in file: + # 移除行尾的换行符 + line = line.rstrip('\n') + # 解析JSON + try: + data = json.loads(line) + conversion_lis.append(data) + except json.JSONDecodeError as e: + print(f"Error decoding JSON: {e}") + save_merge_json(data_lis=conversion_lis, + file_path=r'.\merge.json') diff --git a/generate_data/merge_jsonl.py 
b/generate_data/merge_jsonl.py new file mode 100644 index 0000000..7887ab0 --- /dev/null +++ b/generate_data/merge_jsonl.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- + +import json +import os + + +def save_merge_json(data_lis, file_path): + with open(file_path, 'wt', encoding='utf-8') as file: + json.dump(data_lis, file, ensure_ascii=False, separators=(',\n',':')) + + +def get_all_file_paths(folder_path, file_type='.jsonl'): + # 确保传入的是一个目录 + if not os.path.isdir(folder_path): + raise ValueError(f"{folder_path} is not a valid directory") + + # 获取文件夹下所有文件的路径 + file_paths = [os.path.join(folder_path, file) for file in os.listdir( + folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)] + return file_paths + + +if __name__ == '__main__': + conversion_lis = [] + + folder_path = r'./' # python merge_jsonl.py > curr.txt + + merge_path = folder_path.split('/')[-1] + try: + merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2]!='.' else '' + except: + merge_last_path = '' + print(f'merge_path={merge_path},merge_last_path={merge_last_path}') + + + for path in get_all_file_paths(folder_path): + print(path.encode("utf-8")) + + with open(path, 'rt', encoding='utf-8') as file: + for line in file: + # # 移除行尾的换行符 + # if line == '\n': + # line = line.rstrip('\n') + line = line.rstrip('\n') + # 解析JSON + try: + data = json.loads(line) + conversion_lis.append(data) + # conversion_lis.append('\n') + except json.JSONDecodeError as e: + print(f"Error decoding JSON: {e}") + + if merge_last_path!='': + save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json' + elif merge_path!='': + save_merge_json_path = rf'./{merge_path}_merge.json' + else: + save_merge_json_path = rf'./curr_merge.json' + + save_merge_json(data_lis=conversion_lis, + file_path=save_merge_json_path) + print(len(conversion_lis),save_merge_json_path) diff --git a/generate_data/merge_jsonl_r.py b/generate_data/merge_jsonl_r.py new file mode 100644 index 0000000..cf4998a --- /dev/null +++ b/generate_data/merge_jsonl_r.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- + +import json +import os + + +def save_merge_json(data_lis, file_path): + with open(file_path, 'wt', encoding='utf-8') as file: + json.dump(data_lis, file, ensure_ascii=False, separators=(',\n',':')) + + +def get_all_file_paths(folder_path, file_type='.jsonl'): + # 确保传入的是一个目录 + if not os.path.isdir(folder_path): + raise ValueError(f"{folder_path} is not a valid directory") + + # 获取文件夹下所有文件的路径 + file_paths = [os.path.join(folder_path, file) for file in os.listdir( + folder_path) if os.path.isfile(os.path.join(folder_path, file)) and (file_type in file)] + return file_paths + + +if __name__ == '__main__': + + data_ai = 'qwen' # python merge_jsonl_r.py > qwen.txt + # data_ai = 'zhipuai' # python merge_jsonl_r.py > zhipuai.txt + root_dir = rf'./{data_ai}/' + + save_final_merge_json_path = f'{data_ai}_final_merge.json' + + subfolders = [os.path.join(root_dir, d) for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))] + + final_list = [] + for folder_path in subfolders: + conversion_lis = [] + merge_path = folder_path.split('/')[-1] + try: + merge_last_path = folder_path.split('/')[-2] if folder_path.split('/')[-2]!='.' 
else '' + except: + merge_last_path = '' + print(f'merge_path={merge_path},merge_last_path={merge_last_path}'.encode("utf-8")) + + + for path in get_all_file_paths(folder_path): + print(path.encode("utf-8")) + + with open(path, 'rt', encoding='utf-8') as file: + for line in file: + # # 移除行尾的换行符 + # if line == '\n': + # line = line.rstrip('\n') + line = line.rstrip('\n') + # 解析JSON + try: + data = json.loads(line) + conversion_lis.append(data) + # conversion_lis.append('\n') + except json.JSONDecodeError as e: + print(f"Error decoding JSON: {e}") + + if merge_last_path!='': + save_merge_json_path = rf'./{merge_last_path}/{merge_path}_merge.json' + elif merge_path!='': + save_merge_json_path = rf'./{merge_path}_merge.json' + else: + save_merge_json_path = rf'./curr_merge.json' + + save_merge_json(data_lis=conversion_lis, + file_path=save_merge_json_path) + + final_list = final_list+conversion_lis + print(f'{len(conversion_lis)},{len(final_list)},{save_merge_json_path}'.encode("utf-8")) + + save_merge_json(data_lis=final_list,file_path=save_final_merge_json_path) + print(len(conversion_lis),save_final_merge_json_path.encode("utf-8")) + + diff --git a/generate_data/tutorial.md b/generate_data/tutorial.md index 80426b4..f7af989 100644 --- a/generate_data/tutorial.md +++ b/generate_data/tutorial.md @@ -22,7 +22,7 @@ ## **三、实践步骤** -1. **初始化** +### 1. **初始化** * 安装所需的软件和库 @@ -34,49 +34,62 @@ 可参见 `config.yml`均有注释 -2. **模型选择与配置** +### 2. **模型选择与配置** * 根据需求选择适合的模型 为了使大家都能够玩上大模型,我们选用InterLLM2-7B作为我们的基线模型(消费级显卡也可部署微调的哦) * 对模型进行必要的配置和调整 根据我们的数据集以及配置策略,使用XTuner进行微调 -3. **数据生成** +### 3. **数据生成** + +#### **三种改进前的数据生成方法** * 使用通义千问大模型进行数据生成 - ```bash +```bash # 终端运行 bash run_qwen.bash - - # 或者不使用终端运行 - python qwen_gen_data_NoBash.py - ``` +``` * 使用百度文心大模型进行数据生成 - ```bash +```bash # 终端运行 python ernie_gen_data.py - ``` +``` -* 使用智谱GLM大模型进行数据生成 +* 使用讯飞星火大模型进行数据生成 - ```bash +```bash # 终端运行 - python zhipuai_gen_data.py - ``` + python ./xinghuo/gen_data.py +``` -* 使用讯飞星火大模型进行数据生成 +#### **改进的两种数据生成方法** + +采用改进的数据生成方法生成多轮对话时,首先需要定义`ai_tool`变量,该变量表示LLM模型的名称(`qwen`或`zhipuai`)。根据`ai_tool`变量的值,创建一个`{ai_tool}`文件夹。 + +然后,遍历所有的`area`值,接着根据不同的`emotion`值生成多轮对话。生成的对话会每隔`save_interval`次迭代写入到`./{ai_tool}/{area}/{emotion}.jsonl`文件中。这个过程会重复执行`total_num_each_emo_area`次。 + +* 使用**改进的**通义千问大模型数据生成方法 - ```bash +```bash + # 或者不使用bash,直接运行 + python qwen_gen_data_NoBash.py +``` + +* 使用**改进的**智谱GLM大模型数据生成方法 + +```bash # 终端运行 - python ./xinghuo/gen_data.py - ``` + python zhipuai_gen_data.py +``` -1. **自我认知数据集的整合** +### 4. **自我认知数据集的整合** * 自我认知数据集需要按照格式手动生成,如下格式即可 + ```json [ { @@ -98,19 +111,27 @@ ] ``` -5. **数据集整合** +### 5. 
**数据集整合** + +#### Case 1: 使用`python ernie_gen_data.py`、`bash run_qwen.bash`或者`python ./xinghuo/gen_data.py` + +* 首先使用`check.py`进行数据检查。在进行数据集整合之前,我们要检查生成的数据是否存在格式错误,类型不符合等情况。 +* 然后使用`merge_json.py`将所有的json(或者使用`merge_jsonl.py`将所有的jsonl)文件整合为一个总的json文件。 + +#### Case 2: 使用`python qwen_gen_data_NoBash.py`或者`python zhipuai_gen_data.py` - 在进行数据集整合之前,我们要检查生成的数据是否存在格式错误,类型不符合等情况。 +在这种情况下,我们需要在使用两种改进的生成方法生成多轮对话后,将`{data_ai}`文件夹下所有`{area}`子文件夹中的所有`{emotion}.jsonl`文件合并为`{data_ai}_final_merge.json`文件。 -* 首先使用`check.py`进行数据检查。 -* 然后使用`merge_json.py`将所有的json整合为一个总的json文件。 +* 由于采用了改进的数据生成方法和不同的存储生成对话结构,因此我们可以免除对数据集的检查。 +* 然后使用`merge_jsonl_r.py`将`qwen`或者`zhipuai`定义为`data_ai`变量,并将其文件夹下所有领域(`area`)下所有的jsonl文件整合为一个总的json文件并取名为`{area}_merge.json`,最终在`{data_ai}`文件夹下生成`{data_ai}_final_merge.json`。 +* 然后我们可以手动合成`qwen_final_merge.json`和`zhipuai_final_merge.json`为`qwen_zhipuai_final_merge.json`文件了, 注意合并后的json文件夹中,最外面只有一对`[]`,中间是`{}`包裹的多轮对话。 -6. **评估与优化** +### 6. **评估与优化** * 使用适当的评估指标对生成的数据集进行评估 * 根据评估结果进行必要的优化和调整 -7. **测试与部署** +### 7. **测试与部署** * 使用独立测试集对训练好的模型进行评估 * 根据测试结果进行必要的调整和优化 diff --git a/generate_data/tutorial_EN.md b/generate_data/tutorial_EN.md index 25e10e2..85acf33 100644 --- a/generate_data/tutorial_EN.md +++ b/generate_data/tutorial_EN.md @@ -22,7 +22,7 @@ In order to have a better representation of our large mental models, we must hav ## **III. Practical steps** -1. **Initialize** +### 1. **Initialize** * Install the required software and libraries @@ -34,7 +34,7 @@ In order to have a better representation of our large mental models, we must hav See `config.yml` for annotations -2. **Model selection and configuration** +### 2. **Model selection and configuration** * Select the right model for your needs In order to enable everyone to play with the large model, we chose the InterLLM2-7B as our baseline model (consumer graphics cards can also be deployed fine-tuned oh). @@ -42,40 +42,52 @@ In order to have a better representation of our large mental models, we must hav * Make necessary configurations and adjustments to the model Use XTuner for fine-tuning based on our dataset and configuration strategy. -3. **Data generation** +### 3. **Data generation** -* Data generation using Tongyi Qianwen +#### **Three original methods for data generation** + +* 1.Data generation using Tongyi Qianwen - ```bash +```bash # Terminal operation bash run_qwen.bash +``` - # Or just use python without bash - python qwen_gen_data_NoBash.py - ``` - -* Data generation using Wenxin Yiyan +* 2.Data generation using Wenxin Yiyan - ```bash +```bash # Terminal operation python ernie_gen_data.py - ``` +``` -* Data generation using Zhipu GLM +* 3.Data generation using IFlystar Fire - ```bash +```bash # Terminal operation - python zhipuai_gen_data.py - ``` + python ./xinghuo/gen_data.py +``` + +#### **Two improved methods for data generation** + +When generating multi-turn dialogues with these two improved methods, the first step is to define the value of the `ai_tool` variable, which represents the LLM model name (`qwen` or `zhipuai`). Based on the value of this `ai_tool` variable, a `{ai_tool}` folder is created. -* Data generation using IFlystar Fire +Then, all `area` values are traversed, followed by different `emotion` values for generating multi-turn dialogues. The generated dialogues are written to the `./{ai_tool}/{area}/{emotion}.jsonl` file every `save_interval` iterations. This process is repeated `total_num_each_emo_area` times. 
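As an editorial illustration of the loop described above, a minimal sketch might look like the following. The function `generate_dialogue` and the argument names here are hypothetical placeholders, not the actual code of `qwen_gen_data_NoBash.py` or `zhipuai_gen_data.py`:

```python
import json
import os

def generate_all(ai_tool, areas, emotions, total_num_each_emo_area,
                 save_interval, generate_dialogue):
    """Hypothetical outline of the improved generation loop described above."""
    for area in areas:
        out_dir = os.path.join(ai_tool, area)          # ./{ai_tool}/{area}/
        os.makedirs(out_dir, exist_ok=True)
        for emotion in emotions:
            out_path = os.path.join(out_dir, f'{emotion}.jsonl')
            buffer = []
            for i in range(total_num_each_emo_area):
                # generate_dialogue stands in for the actual LLM call (qwen / zhipuai)
                buffer.append(generate_dialogue(area, emotion))
                # flush to {emotion}.jsonl every save_interval iterations
                if (i + 1) % save_interval == 0:
                    with open(out_path, 'a', encoding='utf-8') as f:
                        for conv in buffer:
                            f.write(json.dumps(conv, ensure_ascii=False) + '\n')
                    buffer = []
            if buffer:  # flush any remainder at the end of the run
                with open(out_path, 'a', encoding='utf-8') as f:
                    for conv in buffer:
                        f.write(json.dumps(conv, ensure_ascii=False) + '\n')
```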
+ +* 1.Using the **improved** method for generating data with the Qwen model: - ```bash - # Terminal operation - python ./xinghuo/gen_data.py - ``` +```bash + # Alternatively, you can run it directly without using bash + python qwen_gen_data_NoBash.py +``` -4. **Integration of self-cognition datasets** +* 2.Using the **improved** method for generating data with the Zhipuai GLM-4 model: + +```bash + # Alternatively, you can run it directly without using bash + python zhipuai_gen_data.py +``` + +### 4. **Integration of self-cognition datasets** * Self-cognition dataset this needs to be manually generated in accordance with the format, the following format can be @@ -100,16 +112,27 @@ In order to have a better representation of our large mental models, we must hav ] ``` -5. **dataset integration** +### 5. **Dataset Integration** + +#### **Case 1**: Using `python ernie_gen_data.py`, `bash run_qwen.bash`, or `python ./xinghuo/gen_data.py` + +* First, use `check.py` to check the data. Before integrating the dataset, we need to check whether the generated data has format errors or type mismatches. +* Then, use `merge_json.py` to consolidate all json files (or use `merge_jsonl.py` to consolidate all jsonl files) into one overall json file. + +#### **Case 2**: Using `python qwen_gen_data_NoBash.py` or `python zhipuai_gen_data.py` + +In this case, we need to merge all `{emotion}.jsonl` files in all `{area}` subfolders under the `{data_ai}` folder into `{data_ai}_final_merge.json` after we use two improved generation methods to generate multi-round conversations. -Before dataset integration, we need to check whether the generated data has formatting errors, type mismatches, etc. We need check.py to check the data. Finally, merge_json.py is used to combine all the json into one overall json file. +* As we have adopted improved data generation methods and different storage generation dialog structures, we can avoid checking the dataset. +* Then, use `merge_jsonl_r.py` to define `qwen` or `zhipuai` as the `data_ai` variable, and consolidate all jsonl files in all areas (`area`) into one overall json file named `{area}_merge.json`. Finally, generate `{data_ai}_final_merge.json` in the `{data_ai}` folder. +* We can then manually merge `qwen_final_merge.json` and `zhipuai_final_merge.json` into `qwen_zhipuai_final_merge.json`. Note that in the merged json file, there is only one pair of `[]` on the outside, and the multi-round dialogues are wrapped in `{}`. -6. **Evaluation and optimization** +### 6. **Evaluation and optimization** * Evaluate the generated dataset using appropriate evaluation metrics * Make necessary optimizations and adjustments based on the evaluation results -7. **Testing and deployment** +### 7. 
**Testing and deployment** * Evaluate the trained model using an independent test set * Make necessary adjustments and optimizations based on test results From 275f2497099122dc293f3131367de4d5e91bb925 Mon Sep 17 00:00:00 2001 From: HongCheng Date: Mon, 18 Mar 2024 23:39:49 +0900 Subject: [PATCH 07/13] small update --- generate_data/tutorial.md | 2 +- generate_data/tutorial_EN.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/generate_data/tutorial.md b/generate_data/tutorial.md index f7af989..454895a 100644 --- a/generate_data/tutorial.md +++ b/generate_data/tutorial.md @@ -118,7 +118,7 @@ * 首先使用`check.py`进行数据检查。在进行数据集整合之前,我们要检查生成的数据是否存在格式错误,类型不符合等情况。 * 然后使用`merge_json.py`将所有的json(或者使用`merge_jsonl.py`将所有的jsonl)文件整合为一个总的json文件。 -#### Case 2: 使用`python qwen_gen_data_NoBash.py`或者`python zhipuai_gen_data.py` +#### Case 2: 使用改进的生成保存方法:`python qwen_gen_data_NoBash.py`或者`python zhipuai_gen_data.py` 在这种情况下,我们需要在使用两种改进的生成方法生成多轮对话后,将`{data_ai}`文件夹下所有`{area}`子文件夹中的所有`{emotion}.jsonl`文件合并为`{data_ai}_final_merge.json`文件。 diff --git a/generate_data/tutorial_EN.md b/generate_data/tutorial_EN.md index 85acf33..fdd5d69 100644 --- a/generate_data/tutorial_EN.md +++ b/generate_data/tutorial_EN.md @@ -119,7 +119,7 @@ Then, all `area` values are traversed, followed by different `emotion` values fo * First, use `check.py` to check the data. Before integrating the dataset, we need to check whether the generated data has format errors or type mismatches. * Then, use `merge_json.py` to consolidate all json files (or use `merge_jsonl.py` to consolidate all jsonl files) into one overall json file. -#### **Case 2**: Using `python qwen_gen_data_NoBash.py` or `python zhipuai_gen_data.py` +#### **Case 2**: Using improved generation method: `python qwen_gen_data_NoBash.py` or `python zhipuai_gen_data.py` In this case, we need to merge all `{emotion}.jsonl` files in all `{area}` subfolders under the `{data_ai}` folder into `{data_ai}_final_merge.json` after we use two improved generation methods to generate multi-round conversations. 
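As a side note to the tutorial changes above: the "manual" merge of `qwen_final_merge.json` and `zhipuai_final_merge.json` is not scripted in this PR. A minimal sketch, assuming each file holds a single JSON array of conversation objects as the tutorial describes, could be:

```python
import json

# Minimal sketch: concatenate the two final-merge files into one array,
# keeping one outer [] that wraps every {} conversation.
with open('qwen_final_merge.json', 'r', encoding='utf-8') as f:
    qwen_data = json.load(f)
with open('zhipuai_final_merge.json', 'r', encoding='utf-8') as f:
    zhipuai_data = json.load(f)

with open('qwen_zhipuai_final_merge.json', 'w', encoding='utf-8') as f:
    json.dump(qwen_data + zhipuai_data, f, ensure_ascii=False)
```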
From 1de2cf5a86009a9e797fba9f6b233acb9b14d73e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=8F=8B=E6=98=89?= Date: Mon, 18 Mar 2024 23:13:00 +0800 Subject: [PATCH 08/13] update README --- README.md | 39 ++++++++++++++++++++------------------- README_EN.md | 39 ++++++++++++++++++++------------------- 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index b2abf79..2dfb036 100644 --- a/README.md +++ b/README.md @@ -210,25 +210,26 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git ### 作者(排名不分先后) -| 用户名 | 学校/组织 | 备注 | 贡献 | -| :----------: | :--------------------: | :-------------------: | :----------: | -| [aJupyter](https://github.com/aJupyter) | 南开大学在读硕士 | DataWhale成员 | 项目发起人 | -| [jujimeizuo](https://github.com/jujimeizuo) | 江南大学在读硕士 | | | -| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | 哈尔滨工业大学(威海)在读本科生 | | | -| [8baby8](https://github.com/8baby8) | 飞桨领航团区域主管 | 文心大模型核心开发者 | | -| [zxazys](https://github.com/zxazys) | 南开大学在读硕士 | | | -| [MING-ZCH](https://github.com/MING-ZCH) | 华中科技大学在读本科生 | | | -| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | swufe | | | -| [MrCatAI](https://github.com/MrCatAI) | AI搬用工 | | | -| [ZeyuBa](https://github.com/ZeyuBa) | 自动化所在读硕士 | | | -| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | 宾夕法尼亚大学在读硕士 | | | -| [Nobody-ML](https://github.com/Nobody-ML) | 中国石油大学(华东)在读本科生 | | | -| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora/) |MiniSora主要维护|数据清洗、文档翻译| -| [Mxoder](https://github.com/Mxoder) | 北京航空航天大学在读本科生 | | | -| [Anooyman](https://github.com/Anooyman) | 南京理工大学硕士 | | | -| [Vicky-3021](https://github.com/Vicky-3021) | 西安电子科技大学硕士(研0) | | | -| [SantiagoTOP](https://github.com/santiagoTOP) | 太原理工大学在读硕士 | | | -| [zealot52099](https://github.com/zealot52099) | AI搬用工 | |清洗数据、RAG| +| 用户名 | 学校/组织 | 备注 | 贡献 | +|:-------------------------------------------------------------:|:--------------------------------------------------:| :-------------------: | :----------: | +| [aJupyter](https://github.com/aJupyter) | 南开大学在读硕士 | DataWhale成员 | 项目发起人 | +| [jujimeizuo](https://github.com/jujimeizuo) | 江南大学在读硕士 | | | +| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | 哈尔滨工业大学(威海)在读本科生 | | | +| [8baby8](https://github.com/8baby8) | 飞桨领航团区域主管 | 文心大模型核心开发者 | | +| [zxazys](https://github.com/zxazys) | 南开大学在读硕士 | | | +| [MING-ZCH](https://github.com/MING-ZCH) | 华中科技大学在读本科生 | | | +| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | swufe | | | +| [MrCatAI](https://github.com/MrCatAI) | AI搬用工 | | | +| [ZeyuBa](https://github.com/ZeyuBa) | 自动化所在读硕士 | | | +| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | 宾夕法尼亚大学在读硕士 | | | +| [Nobody-ML](https://github.com/Nobody-ML) | 中国石油大学(华东)在读本科生 | | | +| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora/) |MiniSora主要维护|数据清洗、文档翻译| +| [Mxoder](https://github.com/Mxoder) | 北京航空航天大学在读本科生 | | | +| [Anooyman](https://github.com/Anooyman) | 南京理工大学硕士 | | | +| [Vicky-3021](https://github.com/Vicky-3021) | 西安电子科技大学硕士(研0) | | | +| [SantiagoTOP](https://github.com/santiagoTOP) | 太原理工大学在读硕士 | | | +| [zealot52099](https://github.com/zealot52099) | AI搬用工 | |清洗数据、RAG| +| [wwwyfff](https://github.com/wwwyfff) | 复旦大学在读硕士 | || ### 版权说明 diff --git a/README_EN.md b/README_EN.md index a8a5a3e..0d7bc5b 100644 --- a/README_EN.md +++ b/README_EN.md @@ -226,25 +226,26 @@ This 
project uses Git for version control. You can see the currently available v ### Authors (in no particular order) -| Username | School/Organization | Remarks | Contributions | -| :-------: | :-------------------: | :------------------: | :--------: | -| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator | -| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | | -| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | | -| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | | -| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | | -| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | | -| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | | -| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | | -| [ZeyuBa](https://github.com/ZeyuBa) | Institute of Automation, Master's student | | | -| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | | -| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East China), Undergraduate student | | | -| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation| -| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | | -| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | | -| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | | -| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | | -| [zealot52099](https://github.com/zealot52099) | AI Mover | |Data Processing and RAG| +| Username | School/Organization | Remarks | Contributions | +|:-------------------------------------------------------------:|:--------------------------------------------------------------------:| :------------------: | :--------: | +| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator | +| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | | +| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | | +| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | | +| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | | +| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | | +| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | | +| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | | +| [ZeyuBa](https://github.com/ZeyuBa) | Institute of Automation, Master's student | | | +| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | | +| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East 
China), Undergraduate student | | | +| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation| +| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | | +| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | | +| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | | +| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | | +| [zealot52099](https://github.com/zealot52099) | AI Mover | |Data Processing and RAG| +| [wwwyfff](https://github.com/wwwyfff) | FuDan University, Master's student | || ### Copyright Notice From 7bbe3842dcb88e30ee1aa2ad29b6b891f113f15a Mon Sep 17 00:00:00 2001 From: jkhumor <3323637090@qq.com> Date: Tue, 19 Mar 2024 12:34:51 +0800 Subject: [PATCH 09/13] modify readme --- README.md | 3 ++- README_EN.md | 41 +++++++++++++++++++++-------------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 2dfb036..48f254f 100644 --- a/README.md +++ b/README.md @@ -229,7 +229,8 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git | [Vicky-3021](https://github.com/Vicky-3021) | 西安电子科技大学硕士(研0) | | | | [SantiagoTOP](https://github.com/santiagoTOP) | 太原理工大学在读硕士 | | | | [zealot52099](https://github.com/zealot52099) | AI搬用工 | |清洗数据、RAG| -| [wwwyfff](https://github.com/wwwyfff) | 复旦大学在读硕士 | || +| [wwwyfff](https://github.com/wwwyfff) | 复旦大学在读硕士 | || +| [jkhumor](https://github.com/jkhumor) | 南开大学在读硕士 | |RAG| ### 版权说明 diff --git a/README_EN.md b/README_EN.md index 0d7bc5b..564ae67 100644 --- a/README_EN.md +++ b/README_EN.md @@ -226,26 +226,27 @@ This project uses Git for version control. 
You can see the currently available v ### Authors (in no particular order) -| Username | School/Organization | Remarks | Contributions | -|:-------------------------------------------------------------:|:--------------------------------------------------------------------:| :------------------: | :--------: | -| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator | -| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | | -| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | | -| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | | -| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | | -| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | | -| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | | -| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | | -| [ZeyuBa](https://github.com/ZeyuBa) | Institute of Automation, Master's student | | | -| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | | -| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East China), Undergraduate student | | | -| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation| -| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | | -| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | | -| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | | -| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | | -| [zealot52099](https://github.com/zealot52099) | AI Mover | |Data Processing and RAG| -| [wwwyfff](https://github.com/wwwyfff) | FuDan University, Master's student | || +| Username | School/Organization | Remarks | Contributions | +|:-------------------------------------------------------------:|:------------------------------------------------------------------------------:| :------------------: | :--------: | +| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator | +| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | | +| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | | +| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | | +| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | | +| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | | +| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | | +| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | | +| [ZeyuBa](https://github.com/ZeyuBa) | Institute of Automation, Master's student | | | +| 
[aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | | +| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East China), Undergraduate student | | | +| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation| +| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | | +| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | | +| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | | +| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | | +| [zealot52099](https://github.com/zealot52099) | AI Mover | |Data Processing and RAG| +| [wwwyfff](https://github.com/wwwyfff) | FuDan University, Master's student | || +| [jkhumor](https://github.com/jkhumor) | Nankai University, Master's student | |RAG| ### Copyright Notice From 1ee3a481b858de5ace0867bc0ac4130771e63609 Mon Sep 17 00:00:00 2001 From: jkhumor <3323637090@qq.com> Date: Tue, 19 Mar 2024 13:11:39 +0800 Subject: [PATCH 10/13] update readme --- README.md | 2 +- README_EN.md | 42 +++++++++++++++++++++--------------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 48f254f..4a8ba6a 100644 --- a/README.md +++ b/README.md @@ -230,7 +230,7 @@ git clone https://github.com/SmartFlowAI/EmoLLM.git | [SantiagoTOP](https://github.com/santiagoTOP) | 太原理工大学在读硕士 | | | | [zealot52099](https://github.com/zealot52099) | AI搬用工 | |清洗数据、RAG| | [wwwyfff](https://github.com/wwwyfff) | 复旦大学在读硕士 | || -| [jkhumor](https://github.com/jkhumor) | 南开大学在读硕士 | |RAG| +| [jkhumor](https://github.com/jkhumor) | 南开大学在读硕士 | |RAG| ### 版权说明 diff --git a/README_EN.md b/README_EN.md index 564ae67..cdf933e 100644 --- a/README_EN.md +++ b/README_EN.md @@ -226,27 +226,27 @@ This project uses Git for version control. 
You can see the currently available v ### Authors (in no particular order) -| Username | School/Organization | Remarks | Contributions | -|:-------------------------------------------------------------:|:------------------------------------------------------------------------------:| :------------------: | :--------: | -| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator | -| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | | -| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | | -| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | | -| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | | -| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | | -| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | | -| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | | -| [ZeyuBa](https://github.com/ZeyuBa) | Institute of Automation, Master's student | | | -| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | | -| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East China), Undergraduate student | | | -| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation| -| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | | -| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | | -| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | | -| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | | -| [zealot52099](https://github.com/zealot52099) | AI Mover | |Data Processing and RAG| -| [wwwyfff](https://github.com/wwwyfff) | FuDan University, Master's student | || -| [jkhumor](https://github.com/jkhumor) | Nankai University, Master's student | |RAG| +| Username | School/Organization | Remarks | Contributions | +|:-------------------------------------------------------------:|:--------------------------------------------------------------------:| :------------------: | :--------: | +| [aJupyter](https://github.com/aJupyter) | Nankai University, Master's student | DataWhale member | Project initiator | +| [jujimeizuo](https://github.com/jujimeizuo) | Jiangnan University, Master's student | | | +| [Smiling-Weeping-zhr](https://github.com/Smiling-Weeping-zhr) | Harbin Institute of Technology (Weihai), Undergraduate student | | | +| [8baby8](https://github.com/8baby8) | PaddlePaddle Pilot Team Regional Director | Wenxin Large Model core developer | | +| [zxazys](https://github.com/zxazys) | Nankai University, Master's student | | | +| [MING-ZCH](https://github.com/MING-ZCH) | Huazhong University of Science and Technology, Undergraduate student | | | +| [JasonLLLLLLLLLLL](https://github.com/JasonLLLLLLLLLLL) | SWUFE (Southwestern University of Finance and Economics) | | | +| [MrCatAI](https://github.com/MrCatAI) | AI Mover | | | +| [ZeyuBa](https://github.com/ZeyuBa) | 
Institute of Automation, Master's student | | | +| [aiyinyuedejustin](https://github.com/aiyinyuedejustin) | University of Pennsylvania, Master's student | | | +| [Nobody-ML](https://github.com/Nobody-ML) | China University of Petroleum (East China), Undergraduate student | | | +| [chg0901](https://github.com/chg0901) | [MiniSora](https://github.com/mini-sora/minisora) |Maintainer and Admin|Data Cleaning and Docs Translation| +| [Mxoder](https://github.com/Mxoder) | Beihang University, Undergraduate student | | | +| [Anooyman](https://github.com/Anooyman) | Nanjing University of Science and Technology, Master's student | | | +| [Vicky-3021](https://github.com/Vicky-3021) | Xidian University, Master's student (Research Year 0) | | | +| [SantiagoTOP](https://github.com/santiagoTOP) | Taiyuan University of Technology, Master's student | | | +| [zealot52099](https://github.com/zealot52099) | AI Mover | |Data Processing and RAG| +| [wwwyfff](https://github.com/wwwyfff) | FuDan University, Master's student | || +| [jkhumor](https://github.com/jkhumor) | Nankai University, Master's student | |RAG| ### Copyright Notice From 861f12d47a6595549f14d219dab16eb36889d276 Mon Sep 17 00:00:00 2001 From: zealot52099 Date: Tue, 19 Mar 2024 16:41:09 +0800 Subject: [PATCH 11/13] add deduplicate.py --- datasets/deduplicate.py | 68 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 datasets/deduplicate.py diff --git a/datasets/deduplicate.py b/datasets/deduplicate.py new file mode 100644 index 0000000..776396e --- /dev/null +++ b/datasets/deduplicate.py @@ -0,0 +1,68 @@ +import json +from loguru import logger +import os +from datasketch import MinHash +from hashlib import md5 + +def is_json_file(filename): + return filename.endswith('.json') + +# 绝对匹配 +def is_duplicate_absolutely(d1, d2): + return md5(d1.encode('utf-8')).hexdigest() == md5(d2.encode('utf-8')).hexdigest() + +# 使用MinHash生成器计算dict的签名 +def hash_dict(dict_obj): + m = MinHash() + for key, value in sorted(dict_obj.items()): + # 对于非str类型值需要先转为str + m.update(str(value).encode('utf8')) + return m + +# 使用绝对匹配和MinHash对dict列表去重 +def deduplicate_json(data_list, threshold=0.8): + seen_hashes = [] + duplicates_removed = [] + + for item in data_list: + # print(item) + # print('###########') + min_hash = hash_dict(item) + # print(f'min_hash: {min_hash}') + + # 绝对匹配去重 + if not any(is_duplicate_absolutely(str(item), str(existing)) for existing in duplicates_removed): + # MinHash相似性去重 + has_similar = False + for stored_min_hash, stored_text in seen_hashes: + if stored_min_hash.jaccard(min_hash) > threshold: + has_similar = True + break + if not has_similar: + seen_hashes.append((min_hash,item)) + duplicates_removed.append(item) + + + return duplicates_removed + +if __name__ == '__main__': + data_ai = 'qwen' + root_dir = rf'./{data_ai}/' + dedup_output_dir = os.path.join(root_dir,'dedup') + if not os.path.exists(dedup_output_dir): + os.mkdir(dedup_output_dir) + if not os.path.exists(root_dir): + logger.error(f"folder {root_dir} not exist" ) + + else: + for file in os.listdir(root_dir): + file_path = os.path.join(root_dir, file) + if os.path.isfile(file_path): + print(f'file name: {file_path}') + if is_json_file(file_path): + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + dedup_data = deduplicate_json(data) + with open(os.path.join(root_dir, 'dedup','dedup_' + file), 'w', encoding='utf-8') as output_file: + json.dump(dedup_data, output_file, ensure_ascii=False, 
indent=4) + \ No newline at end of file From 6e7bd5e5d4954cf11cb7659dafdab9ef209df877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E5=8F=8B=E6=98=89?= Date: Tue, 19 Mar 2024 18:03:26 +0800 Subject: [PATCH 12/13] GLM-6B ft --- xtuner_config/ChatGLM3-6b-ft.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xtuner_config/ChatGLM3-6b-ft.md b/xtuner_config/ChatGLM3-6b-ft.md index 37015c8..a1867be 100644 --- a/xtuner_config/ChatGLM3-6b-ft.md +++ b/xtuner_config/ChatGLM3-6b-ft.md @@ -65,8 +65,7 @@ LLM 的微调一般指指令微调过程。所谓指令微调,是说我们使 def process_func(example): MAX_LENGTH = 512 input_ids, labels = [], [] - instruction = tokenizer.encode(text="\n".join(["<|system|>", "现在你是一个心理专家,我有一些心理问题,请你用专业的知识帮我解决。", "<|user|>", - example["system"] + example["input"] + "<|assistant|>"]).strip() + "\n", + instruction = tokenizer.encode(text="\n".join(["<|system|>", example["system"], "<|user|>", example["input"] + "<|assistant|>"]).strip() + "\n", add_special_tokens=True, truncation=True, max_length=MAX_LENGTH) response = tokenizer.encode(text=example["output"], add_special_tokens=False, truncation=True, From 96f6ce307e6049aa35f3165f438db9c74e274458 Mon Sep 17 00:00:00 2001 From: Anooyman <875734078@qq.com> Date: Tue, 19 Mar 2024 21:11:10 +0800 Subject: [PATCH 13/13] Update --- rag/src/config/config.py | 11 +++++++++- rag/src/{util => }/pipeline.py | 40 +++++++++++++++------------------- 2 files changed, 27 insertions(+), 24 deletions(-) rename rag/src/{util => }/pipeline.py (76%) diff --git a/rag/src/config/config.py b/rag/src/config/config.py index d803d64..d4dcfe3 100644 --- a/rag/src/config/config.py +++ b/rag/src/config/config.py @@ -25,4 +25,13 @@ vector_db_dir = os.path.join(data_dir, 'vector_db.pkl') select_num = 3 -retrieval_num = 10 \ No newline at end of file +retrieval_num = 10 +system_prompt = """ + 你是一个拥有丰富心理学知识的温柔邻家温柔大姐姐艾薇,我有一些心理问题,请你用专业的知识和温柔、可爱、俏皮、的口吻帮我解决,回复中可以穿插一些可爱的Emoji表情符号或者文本符号。\n +""" +prompt_template = """ + {system_prompt} + 根据下面检索回来的信息,回答问题。 + {content} + 问题:{question} +""" \ No newline at end of file diff --git a/rag/src/util/pipeline.py b/rag/src/pipeline.py similarity index 76% rename from rag/src/util/pipeline.py rename to rag/src/pipeline.py index a6f2cdf..214eef3 100644 --- a/rag/src/util/pipeline.py +++ b/rag/src/pipeline.py @@ -2,7 +2,8 @@ from langchain_core.prompts import PromptTemplate from transformers.utils import logging -from config.config import retrieval_num, select_num +from data_processing import DataProcessing +from config.config import retrieval_num, select_num, system_prompt, prompt_template logger = logging.get_logger(__name__) @@ -16,7 +17,7 @@ class EmoLLMRAG(object): 4. 
将 query 和检索回来的 content 传入 LLM 中 """ - def __init__(self, model) -> None: + def __init__(self, model, retrieval_num, rerank_flag=False, select_num=3) -> None: """ 输入 Model 进行初始化 @@ -30,42 +31,35 @@ def __init__(self, model) -> None: self.vectorstores = self._load_vector_db() self.system_prompt = self._get_system_prompt() self.prompt_template = self._get_prompt_template() - - # 等待 embedding team 封装对应接口 - #self.data_process_obj = DataProcessing() + self.data_processing_obj = DataProcessing() + self.system_prompt = system_prompt + self.prompt_template = prompt_template + self.retrieval_num = retrieval_num + self.rerank_flag = rerank_flag + self.select_num = select_num def _load_vector_db(self): """ 调用 embedding 模块给出接口 load vector DB """ - return - - def _get_system_prompt(self) -> str: - """ - 加载 system prompt - """ - return '' + vectorstores = self.data_processing_obj.load_vector_db() + if not vectorstores: + vectorstores = self.data_processing_obj.load_index_and_knowledge() - def _get_prompt_template(self) -> str: - """ - 加载 prompt template - """ - return '' + return vectorstores - def get_retrieval_content(self, query, rerank_flag=False) -> str: + def get_retrieval_content(self, query) -> str: """ Input: 用户提问, 是否需要rerank ouput: 检索后并且 rerank 的内容 """ content = '' - documents = self.vectorstores.similarity_search(query, k=retrieval_num) + documents = self.vectorstores.similarity_search(query, k=self.retrieval_num) # 如果需要rerank,调用接口对 documents 进行 rerank - if rerank_flag: - pass - # 等后续调用接口 - #documents = self.data_process_obj.rerank_documents(documents, select_num) + if self.rerank_flag: + documents = self.data_processing_obj.rerank(documents, self.select_num) for doc in documents: content += doc.page_content
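
Editorial note: the diff above is cut off inside `get_retrieval_content`, so the code that assembles the final prompt is not shown. As a rough, hypothetical sketch of how the `prompt_template` added to `config.py` in this patch could be combined with the retrieved passages (the helper name `build_prompt` is an illustration, not part of the PR):

```python
from typing import List

from config.config import prompt_template, system_prompt

def build_prompt(question: str, passages: List[str]) -> str:
    # Fill the template from config.py with the retrieval/rerank results;
    # the equivalent logic inside EmoLLMRAG is truncated in the diff above.
    content = '\n'.join(passages)
    return prompt_template.format(
        system_prompt=system_prompt,
        content=content,
        question=question,
    )
```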