Skip to content

Commit

Permalink
remove table_summary and raptor (#339)
Browse files: browse the repository at this point in the history
  • Loading branch information
Ceceliachenen authored Jan 16, 2025
1 parent cefc173 commit 8696675
Show file tree
Hide file tree
Showing 11 changed files with 9 additions and 87 deletions.
31 changes: 0 additions & 31 deletions src/pai_rag/app/web/tabs/upload_tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,8 @@ def upload_oss_knowledge(
oss_path,
chunk_size,
chunk_overlap,
enable_raptor,
enable_multimodal,
enable_mandatory_ocr,
enable_table_summary,
upload_index,
):
if not oss_path:
Expand All @@ -34,10 +32,8 @@ def upload_oss_knowledge(
oss_path=oss_path,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
enable_raptor=enable_raptor,
enable_multimodal=enable_multimodal,
enable_mandatory_ocr=enable_mandatory_ocr,
enable_table_summary=enable_table_summary,
index_name=upload_index,
from_oss=True,
):
Expand All @@ -48,10 +44,8 @@ def upload_files(
upload_files,
chunk_size,
chunk_overlap,
enable_raptor,
enable_multimodal,
enable_mandatory_ocr,
enable_table_summary,
upload_index,
):
if not upload_files:
Expand All @@ -68,10 +62,8 @@ def upload_files(
oss_path=None,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
enable_raptor=enable_raptor,
enable_multimodal=enable_multimodal,
enable_mandatory_ocr=enable_mandatory_ocr,
enable_table_summary=enable_table_summary,
index_name=upload_index,
):
yield state_info
Expand All @@ -82,10 +74,8 @@ def upload_knowledge(
oss_path,
chunk_size,
chunk_overlap,
enable_raptor,
enable_multimodal,
enable_mandatory_ocr,
enable_table_summary,
index_name,
from_oss: bool = False,
):
Expand All @@ -95,7 +85,6 @@ def upload_knowledge(
"chunk_size": chunk_size,
"chunk_overlap": chunk_overlap,
"enable_mandatory_ocr": enable_mandatory_ocr,
"enable_table_summary": enable_table_summary,
}
)
except RagApiError as api_error:
Expand All @@ -105,15 +94,13 @@ def upload_knowledge(
if from_oss:
response = rag_client.add_knowledge(
oss_path=oss_path,
enable_raptor=enable_raptor,
index_name=index_name,
enable_multimodal=enable_multimodal,
)
my_upload_files.append(MyUploadFile(oss_path, response["task_id"]))
else:
response = rag_client.add_knowledge(
input_files=[file.name for file in upload_files],
enable_raptor=enable_raptor,
index_name=index_name,
enable_multimodal=enable_multimodal,
)
Expand Down Expand Up @@ -187,11 +174,6 @@ def create_upload_tab() -> Dict[str, Any]:
label="\N{fire} Chunk Overlap (The portion of adjacent document chunks that overlap with each other)",
elem_id="chunk_overlap",
)
enable_raptor = gr.Checkbox(
label="Yes",
info="Process with Raptor Node Enhancement",
elem_id="enable_raptor",
)
enable_multimodal = gr.Checkbox(
label="Yes",
info="Process with MultiModal",
Expand All @@ -204,11 +186,6 @@ def create_upload_tab() -> Dict[str, Any]:
elem_id="enable_mandatory_ocr",
visible=True,
)
enable_table_summary = gr.Checkbox(
label="Yes",
info="Process with Table Summary ",
elem_id="enable_table_summary",
)
with gr.Column(scale=8):
with gr.Tab("Files"):
upload_file = gr.File(
Expand Down Expand Up @@ -246,10 +223,8 @@ def create_upload_tab() -> Dict[str, Any]:
oss_path,
chunk_size,
chunk_overlap,
enable_raptor,
enable_multimodal,
enable_mandatory_ocr,
enable_table_summary,
upload_index,
],
outputs=[upload_oss_state_df, upload_oss_state],
Expand All @@ -262,10 +237,8 @@ def create_upload_tab() -> Dict[str, Any]:
upload_file,
chunk_size,
chunk_overlap,
enable_raptor,
enable_multimodal,
enable_mandatory_ocr,
enable_table_summary,
upload_index,
],
outputs=[upload_file_state_df, upload_file_state],
Expand All @@ -285,10 +258,8 @@ def create_upload_tab() -> Dict[str, Any]:
dummy_component,
chunk_size,
chunk_overlap,
enable_raptor,
enable_multimodal,
enable_mandatory_ocr,
enable_table_summary,
upload_index,
],
outputs=[upload_dir_state_df, upload_dir_state],
Expand All @@ -304,8 +275,6 @@ def create_upload_tab() -> Dict[str, Any]:
upload_index.elem_id: upload_index,
chunk_size.elem_id: chunk_size,
chunk_overlap.elem_id: chunk_overlap,
enable_raptor.elem_id: enable_raptor,
enable_multimodal.elem_id: enable_multimodal,
enable_mandatory_ocr.elem_id: enable_mandatory_ocr,
enable_table_summary.elem_id: enable_table_summary,
}
6 changes: 0 additions & 6 deletions src/pai_rag/app/web/view_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,7 @@ class ViewModel(BaseModel):

# reader
reader_type: str = "SimpleDirectoryReader"
enable_raptor: bool = False
enable_mandatory_ocr: bool = False
enable_table_summary: bool = False

config_file: str = None

Expand Down Expand Up @@ -185,7 +183,6 @@ def from_app_config(config: RagConfig):
view_model.chunk_size = config.node_parser.chunk_size

view_model.enable_mandatory_ocr = config.data_reader.enable_mandatory_ocr
view_model.enable_table_summary = config.data_reader.enable_table_summary

view_model.similarity_top_k = config.retriever.similarity_top_k
view_model.image_similarity_top_k = config.retriever.image_similarity_top_k
Expand Down Expand Up @@ -309,7 +306,6 @@ def to_app_config(self):
config["node_parser"]["chunk_overlap"] = int(self.chunk_overlap)

config["data_reader"]["enable_mandatory_ocr"] = self.enable_mandatory_ocr
config["data_reader"]["enable_table_summary"] = self.enable_table_summary

config["retriever"]["similarity_top_k"] = self.similarity_top_k
config["retriever"]["image_similarity_top_k"] = self.image_similarity_top_k
Expand Down Expand Up @@ -547,10 +543,8 @@ def to_component_settings(self) -> Dict[str, Dict[str, Any]]:

settings["chunk_size"] = {"value": self.chunk_size}
settings["chunk_overlap"] = {"value": self.chunk_overlap}
settings["enable_raptor"] = {"value": self.enable_raptor}
settings["enable_multimodal"] = {"value": self.enable_multimodal}
settings["enable_mandatory_ocr"] = {"value": self.enable_mandatory_ocr}
settings["enable_table_summary"] = {"value": self.enable_table_summary}

# retrieval and rerank
settings["retrieval_mode"] = {"value": self.retrieval_mode}
Expand Down
8 changes: 5 additions & 3 deletions src/pai_rag/integrations/nodes/raptor_nodes_enhance.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(
embed_model=embed_model,
)

def __call__(self, nodes: List[BaseNode]) -> List[BaseNode]:
def __call__(self, nodes: List[BaseNode], **kwargs: Any) -> List[BaseNode]:
"""Given a set of nodes, this function inserts higher level of abstractions within the index.
For later retrieval
Expand Down Expand Up @@ -65,7 +65,8 @@ def __call__(self, nodes: List[BaseNode]) -> List[BaseNode]:
logger.info(f"Generating embeddings for level {level}.")

embeddings = embed_model.get_text_embedding_batch(
[node.get_content(metadata_mode="embed") for node in cur_nodes]
[node.get_content(metadata_mode="embed") for node in cur_nodes],
**kwargs,
)
assert len(embeddings) == len(cur_nodes)
id_to_embedding = {
Expand Down Expand Up @@ -122,7 +123,8 @@ def __call__(self, nodes: List[BaseNode]) -> List[BaseNode]:

if level == self.tree_depth - 1:
embeddings = embed_model.get_text_embedding_batch(
[node.get_content(metadata_mode="embed") for node in cur_nodes]
[node.get_content(metadata_mode="embed") for node in cur_nodes],
**kwargs,
)
assert len(embeddings) == len(cur_nodes)
id_to_embedding = {
Expand Down
7 changes: 0 additions & 7 deletions src/pai_rag/integrations/readers/pai/pai_data_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
class BaseDataReaderConfig(BaseModel):
concat_csv_rows: bool = False
enable_mandatory_ocr: bool = False
enable_table_summary: bool = False
format_sheet_data_to_json: bool = False
sheet_column_filters: List[str] | None = None

Expand All @@ -36,28 +35,22 @@ def get_file_readers(reader_config: BaseDataReaderConfig = None, oss_store: Any

file_readers = {
".html": PaiHtmlReader(
enable_table_summary=reader_config.enable_table_summary,
oss_cache=oss_store, # Storing html images
),
".htm": PaiHtmlReader(
enable_table_summary=reader_config.enable_table_summary,
oss_cache=oss_store, # Storing html images
),
".docx": PaiDocxReader(
enable_table_summary=reader_config.enable_table_summary,
oss_cache=oss_store, # Storing docx images
),
".pdf": PaiPDFReader(
enable_mandatory_ocr=reader_config.enable_mandatory_ocr,
enable_table_summary=reader_config.enable_table_summary,
oss_cache=oss_store, # Storing pdf images
),
".pptx": PaiPptxReader(
enable_table_summary=reader_config.enable_table_summary,
oss_cache=oss_store, # Storing pptx images
),
".md": PaiMarkdownReader(
enable_table_summary=reader_config.enable_table_summary,
oss_cache=oss_store, # Storing markdown images
),
".csv": PaiPandasCSVReader(
Expand Down
7 changes: 1 addition & 6 deletions src/pai_rag/integrations/readers/pai_docx_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,14 @@ class PaiDocxReader(BaseReader):
"""Read docx files including texts, tables, images.
Args:
enable_table_summary (bool): whether to use table_summary to process tables
oss_cache : oss_cache
"""

def __init__(
self,
enable_table_summary: bool = False,
oss_cache: Any = None,
) -> None:
self.enable_table_summary = enable_table_summary
self._oss_cache = oss_cache
logger.info(
f"PaiDocxReader created with enable_table_summary : {self.enable_table_summary}"
)

def _transform_local_to_oss(
self, image_blob: bytes, image_filename: str, doc_name: str
Expand Down
7 changes: 1 addition & 6 deletions src/pai_rag/integrations/readers/pai_html_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,14 @@ class PaiHtmlReader(BaseReader):
"""Read html files including texts, tables, images.
Args:
enable_table_summary (bool): whether to use table_summary to process tables
oss_cache : oss_cache
"""

def __init__(
self,
enable_table_summary: bool = False,
oss_cache: Any = None,
) -> None:
self.enable_table_summary = enable_table_summary
self._oss_cache = oss_cache
logger.info(
f"PaiHtmlReader created with enable_table_summary : {self.enable_table_summary}"
)

def _extract_tables(self, html):
soup = BeautifulSoup(html, "html.parser")
Expand Down
5 changes: 0 additions & 5 deletions src/pai_rag/integrations/readers/pai_markdown_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,9 @@
class PaiMarkdownReader(BaseReader):
def __init__(
self,
enable_table_summary: bool = False,
oss_cache: Any = None,
) -> None:
self.enable_table_summary = enable_table_summary
self._oss_cache = oss_cache
logger.info(
f"PaiMarkdownReader created with enable_table_summary : {self.enable_table_summary}"
)

def replace_image_paths(self, markdown_name: str, content: str):
markdown_image_matches = MARKDOWN_IMAGE_PATTERN.finditer(content)
Expand Down
8 changes: 2 additions & 6 deletions src/pai_rag/integrations/readers/pai_pdf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,17 @@ class PaiPDFReader(BaseReader):
"""Read PDF files including texts, tables, images.
Args:
enable_table_summary (bool): whether to use table_summary to process tables
enable_mandatory_ocr (bool): whether to use ocr to files
oss_cache: oss_cache
"""

def __init__(
self,
enable_mandatory_ocr: bool = False,
enable_table_summary: bool = False,
oss_cache: Any = None,
) -> None:
self.enable_table_summary = enable_table_summary
self.enable_mandatory_ocr = enable_mandatory_ocr
self._oss_cache = oss_cache
logger.info(
f"PaiPdfReader created with enable_table_summary : {self.enable_table_summary}"
)
logger.info(
f"PaiPdfReader created with enable_mandatory_ocr : {self.enable_mandatory_ocr}"
)
Expand Down
5 changes: 0 additions & 5 deletions src/pai_rag/integrations/readers/pai_pptx_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,9 @@
class PaiPptxReader(BaseReader):
def __init__(
self,
enable_table_summary: bool = False,
oss_cache: Any = None,
) -> None:
self.enable_table_summary = enable_table_summary
self._oss_cache = oss_cache
logger.info(
f"PaiPptxReader created with enable_table_summary : {self.enable_table_summary}"
)

def _extract_shape(self, slide_number, shape):
image_flag = False
Expand Down
3 changes: 0 additions & 3 deletions src/pai_rag/tools/data_process/ops/parser_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ def __init__(
self,
concat_csv_rows: bool = False,
enable_mandatory_ocr: bool = False,
enable_table_summary: bool = False,
format_sheet_data_to_json: bool = False,
sheet_column_filters: List[str] = None,
oss_bucket: str = None,
Expand All @@ -40,7 +39,6 @@ def __init__(
self.data_reader_config = BaseDataReaderConfig(
concat_csv_rows=concat_csv_rows,
enable_mandatory_ocr=enable_mandatory_ocr,
enable_table_summary=enable_table_summary,
format_sheet_data_to_json=format_sheet_data_to_json,
sheet_column_filters=sheet_column_filters,
)
Expand All @@ -61,7 +59,6 @@ def __init__(
f"""ParserActor [PaiDataReader] init finished with following parameters:
concat_csv_rows: {concat_csv_rows}
enable_mandatory_ocr: {enable_mandatory_ocr}
enable_table_summary: {enable_table_summary}
format_sheet_data_to_json: {format_sheet_data_to_json}
sheet_column_filters: {sheet_column_filters}
oss_bucket: {oss_bucket}
Expand Down
9 changes: 0 additions & 9 deletions src/pai_rag/tools/data_process/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ def process_parser(args):
"accelerator",
"enable_mandatory_ocr",
"concat_csv_rows",
"enable_table_summary",
"format_sheet_data_to_json",
"sheet_column_filters",
"oss_bucket",
Expand Down Expand Up @@ -204,14 +203,6 @@ def init_configs():
default=False,
help="Whether to concat csv rows for rag_parser operator.",
)
parser.add_argument(
"--enable_table_summary",
type=str2bool,
nargs="?",
const=True,
default=False,
help="Whether to enable table summary for rag_parser operator.",
)
parser.add_argument(
"--format_sheet_data_to_json",
type=str2bool,
Expand Down

0 comments on commit 8696675

Please sign in to comment.