Skip to content

Commit

Permalink
Refactor gmft parser
Browse files Browse the repository at this point in the history
  • Loading branch information
DL committed Aug 3, 2024
1 parent 377f827 commit b655b04
Show file tree
Hide file tree
Showing 3 changed files with 230 additions and 182 deletions.
251 changes: 118 additions & 133 deletions dev/gmft.ipynb

Large diffs are not rendered by default.

47 changes: 47 additions & 0 deletions src/llmsearch/parsers/tables/generic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pandas as pd

from abc import ABC, abstractmethod


class GenericParsedTable(ABC):
    """Parser-agnostic interface for a table extracted from a document.

    Concrete table parsers subclass this and implement the ``df``,
    ``caption`` and ``xml`` properties.

    Args:
        page_number (int): Page number supplied by the parser; stored as
            ``page_num``.
    """

    def __init__(self, page_number: int):
        # Common field shared by every implementation.
        self.page_num = page_number

    @property
    @abstractmethod
    def df(self) -> pd.DataFrame:
        """Returns Pandas DF corresponding to a table"""

    @property
    @abstractmethod
    def caption(self) -> str:
        """Returns caption of the table"""

    @property
    @abstractmethod
    def xml(self) -> str:
        """Returns xml representation of the table"""


def pandas_df_to_xml(df: pd.DataFrame) -> str:
    """Converts Pandas df to a simplified xml representation digestible by LLMs

    Each row becomes a ``<row>`` element containing one
    ``<col name="...">value</col>`` child per column, in column order.
    Column names and cell values are stringified and XML-escaped so that
    characters such as ``&``, ``<`` or ``"`` cannot break the markup.

    Args:
        df (pd.DataFrame): Pandas df
    Returns:
        str: xml string, rows separated by newlines; empty string when the
            frame has no rows
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape

    rendered_rows = []
    for _, row in df.iterrows():
        lines = ["<row>"]
        # Walk the cells positionally: a label lookup (row[name]) returns a
        # whole Series instead of a scalar when column names are duplicated.
        for name, value in row.items():
            lines.append(
                ' <col name="{0}">{1}</col>'.format(
                    # The name lives inside a double-quoted attribute, so the
                    # quote character has to be escaped as well.
                    escape(str(name), {'"': "&quot;"}),
                    escape(str(value)),
                )
            )
        lines.append("</row>")
        rendered_rows.append("\n".join(lines))
    return "\n".join(rendered_rows)
114 changes: 65 additions & 49 deletions src/llmsearch/parsers/tables/gmft_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,62 +2,65 @@
import pandas as pd
from typing import Any, List, Optional, Tuple
from gmft.pdf_bindings import PyPDFium2Document
from gmft import CroppedTable, TableDetector, AutoFormatConfig, AutoTableFormatter, TATRTableFormatter
from gmft import (
CroppedTable,
TableDetector,
AutoFormatConfig,
AutoTableFormatter,
)
from pathlib import Path
from loguru import logger
from dataclasses import dataclass
# from pydantic import BaseModel, Field

from llmsearch.parsers.tables.generic import pandas_df_to_xml, GenericParsedTable

class ParsedTable:

class GMFTParsedTable(GenericParsedTable):
    """Table detected by gmft, exposed through the generic parsed-table API.

    Args:
        table (CroppedTable): Cropped table produced by the gmft detector.
        page_num (int): Number of the page the table was found on.
    """

    def __init__(self, table: CroppedTable, page_num: int) -> None:
        # The base class stores the page number as ``self.page_num``.
        super().__init__(page_number=page_num)
        self._table = table
        # Default gmft formatting config; not consulted elsewhere in this class.
        self.config = AutoFormatConfig()
        self.formatter = AutoTableFormatter()
        # Flipped to True when dataframe extraction raises ValueError.
        self.failed = False

    @cached_property
    def _captions(self) -> List[str]:
        """Returns the captions reported for the table, without blank ones"""
        return [c for c in self._table.captions() if c.strip()]

    @cached_property
    def caption(self) -> str:
        """Returns the distinct captions joined by newlines"""
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # of strings may iterate in a different order on every interpreter
        # run, which made the joined caption unstable.
        return "\n".join(dict.fromkeys(self._captions))

    @property
    def df(self) -> Optional[pd.DataFrame]:
        """Returns the table as a Pandas DF, or None when extraction fails

        The table is re-extracted on every access. When gmft raises
        ValueError the error is logged and ``self.failed`` is set.
        """
        ft = self.formatter.extract(self._table)
        try:
            return ft.df()
        except ValueError as ex:
            logger.error(f"Couldn't extract df on page {self.page_num}: {str(ex)}")
            self.failed = True
            # NOTE: a retry with relaxed AutoFormatConfig overrides used to
            # follow here; it is currently disabled.
            return None

    @property
    def xml(self) -> str:
        """Returns xml representation of the table, "" when there is no df"""
        # Read the property once: every access to ``self.df`` re-runs the
        # extraction (and repeats the error log on failure).
        df = self.df
        if df is None:
            return ""
        return pandas_df_to_xml(df)


@dataclass
Expand All @@ -70,12 +73,14 @@ def n_tables(self):
return len(self.cropped_tables)


class GMFTTableParser:
class GMFTParser:
def __init__(self, fn: Path) -> None:
self.fn = fn
logger.info("Loading document.")
self._doc = None
self._parsed_tables = None

def detect_tables(self) -> Tuple[List[PageTables], Any]:
def detect_page_tables(self) -> Tuple[List[PageTables], Any]:
"""Detects tables in a document and returns list of page tables"""

logger.info("Detecting tables...")
Expand All @@ -85,28 +90,39 @@ def detect_tables(self) -> Tuple[List[PageTables], Any]:

for page in doc:
pt.append(
PageTables(page_num=page.page_number, cropped_tables=detector.extract(page))
PageTables(
page_num=page.page_number, cropped_tables=detector.extract(page)
)
)

# doc.close()
return pt, doc

@property
def parsed_tables(self) -> List[GenericParsedTable]:
    """Returns parsed tables for the document, detecting them on first access.

    The first access runs table detection, keeps the returned document
    handle in ``self._doc`` and stores the resulting list; later accesses
    return the stored list.
    """
    if self._parsed_tables is not None:
        return self._parsed_tables

    detected_pages, self._doc = self.detect_page_tables()
    logger.info("Parsing tables ...")

    # One GMFTParsedTable per cropped table, in page order.
    self._parsed_tables = [
        GMFTParsedTable(cropped, page.page_num)
        for page in detected_pages
        for cropped in page.cropped_tables
    ]
    return self._parsed_tables


if __name__ == "__main__":
# fn = Path("/home/snexus/Downloads/ws90.pdf")
fn = Path("/home/snexus/Downloads/SSRN-id2741701.pdf")

parser = GMFTTableParser(fn=fn)
page_tables, doc = parser.detect_tables()

for t in page_tables:
print(f" ========= PAGE {t.page_num} ============")
print(f"Number detected tables: {t.n_tables}")
for table in t.cropped_tables:
p = ParsedTable(table, t.page_num)
print(p.captions)
print(p.xml)
print('-------------')

doc.close()
parser = GMFTParser(fn=fn)
for p in parser.parsed_tables:
print("-------------")
print(p.page_num)
print(p.caption)
print(p.xml)
del parser

0 comments on commit b655b04

Please sign in to comment.