Skip to content

Commit

Permalink
Fix node hash for special file types (#54)
Browse files Browse the repository at this point in the history
* Fix node hash for special file types

* Remove print
  • Loading branch information
moria97 authored Jun 6, 2024
1 parent 4bb4ce8 commit 8b72d8f
Showing 1 changed file with 18 additions and 2 deletions.
20 changes: 18 additions & 2 deletions src/pai_rag/data/rag_dataloader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os
from typing import Any, Dict
from llama_index.core import Settings
from llama_index.core.schema import TextNode
from llama_index.llms.huggingface import HuggingFaceLLM
Expand All @@ -14,6 +16,8 @@

DEFAULT_LOCAL_QA_MODEL_PATH = "/huggingface/transformers/qwen_1.8b"

# File extensions (lowercase, with the leading dot) that RagDataLoader.load
# compares against each document's file-name extension; a matching document
# is wrapped directly in a TextNode instead of being split further.
DOC_TYPES_DO_NOT_NEED_CHUNKING = {
    ".csv",
    ".xlsx",
    ".md",
    ".xls",
    ".htm",
    ".html",
}


class RagDataLoader:
"""
Expand Down Expand Up @@ -49,13 +53,25 @@ def __init__(

logger.info("RagDataLoader initialized.")

def _extract_file_type(self, metadata: Dict[str, Any]):
file_name = metadata.get("file_name", "dummy.txt")
return os.path.splitext(file_name)[1]

async def load(self, file_directory: str, enable_qa_extraction: bool):
data_reader = self.datareader_factory.get_reader(file_directory)
docs = data_reader.load_data()
nodes = []

doc_cnt_map = {}
for doc in docs:
if doc.metadata.get("file_type", "Unknown") == "HTML":
node_id = node_id_hash(0, doc)
doc_type = self._extract_file_type(doc.metadata)

if doc_type in DOC_TYPES_DO_NOT_NEED_CHUNKING:
doc_key = f"""{doc.metadata.get("file_path", "dummy")}"""
if doc_key not in doc_cnt_map:
doc_cnt_map[doc_key] = 0
doc_cnt_map[doc_key] += 1
node_id = node_id_hash(doc_cnt_map[doc_key], doc)
nodes.append(
TextNode(id_=node_id, text=doc.text, metadata=doc.metadata)
)
Expand Down

0 comments on commit 8b72d8f

Please sign in to comment.