Skip to content

Commit

Permalink
Don't use parallel processing when the data size is big (#108)
Browse files — browse the repository at this point in the history
  • Loading branch information
moria97 authored Jul 26, 2024
1 parent 8f192cd commit da268b8
Showing 1 changed file with 2 additions and 29 deletions.
31 changes: 2 additions & 29 deletions src/pai_rag/modules/index/pai_bm25_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from typing import Callable, List, cast, Dict
from llama_index.core.schema import BaseNode, TextNode
from pai_rag.utils.tokenizer import jieba_tokenizer
import concurrent.futures
from scipy.sparse import csr_matrix
from pai_rag.integrations.retrievers.fusion_retriever import MyNodeWithScore

Expand Down Expand Up @@ -168,34 +167,8 @@ def add_docs(self, nodes: List[BaseNode]):
self.index.doc_lens, (0, pad_size), "constant", constant_values=(0)
)

chunk_size = 1000000000
start_pos = 0
if len(text_list) < 2 * chunk_size:
tokens_list = self.split_doc(text_list, self.tokenizer)
self.process_token_list(tokens_list, id_list)
else:
with concurrent.futures.ProcessPoolExecutor(
max_workers=self.workers
) as executor:
futures = []
future2startpos = {}
while start_pos < len(text_list):
fut = executor.submit(
self.split_doc,
text_list[start_pos : start_pos + chunk_size],
self.tokenizer,
)
futures.append(fut)
future2startpos[fut] = start_pos
start_pos += chunk_size

i = 0
for fut in concurrent.futures.as_completed(futures):
start_pos = future2startpos[fut]
i += 1
tokens_list = fut.result()
batch_id_list = id_list[start_pos : start_pos + chunk_size]
self.process_token_list(tokens_list, batch_id_list)
tokens_list = self.split_doc(text_list, self.tokenizer)
self.process_token_list(tokens_list, id_list)

self.construct_index_matrix()

Expand Down

0 comments on commit da268b8

Please sign in to comment.