Refactor gmft parser

snexus · Aug 3, 2024 · b655b04 · b655b04
1 parent 377f827
commit b655b04
Show file tree

Hide file tree

Showing 3 changed files with 230 additions and 182 deletions.
diff --git a/dev/gmft.ipynb b/dev/gmft.ipynb
diff --git a/src/llmsearch/parsers/tables/generic.py b/src/llmsearch/parsers/tables/generic.py
@@ -0,0 +1,47 @@
+import pandas as pd
+
+from abc import ABC, abstractmethod
+
+
+class GenericParsedTable(ABC):  # Abstract interface for a parsed table: exposes df, caption and xml
+    def __init__(self, page_number: int):
+        self.page_num = page_number  # Common field; stored as `page_num` although the parameter is `page_number`
+
+    @property
+    @abstractmethod
+    def df(self) -> pd.DataFrame:
+        """Returns the Pandas DataFrame corresponding to the table (must be implemented by subclasses)"""
+        pass
+
+    @property
+    @abstractmethod
+    def caption(self) -> str:
+        """Returns the caption of the table (must be implemented by subclasses)"""
+        pass
+
+    @property
+    @abstractmethod
+    def xml(self) -> str:
+        """Returns the xml representation of the table (must be implemented by subclasses)"""
+        pass
+
+
+def pandas_df_to_xml(df: pd.DataFrame) -> str:
+    """Converts a Pandas df to a simplified xml representation digestible by LLMs: one <row> element per row
+
+    Args:
+        df (pd.DataFrame): Pandas df; each column label becomes the `name` attribute of a <col> element
+
+    Returns:
+        str: newline-joined <row> elements; labels and cell values are inserted verbatim (no XML escaping)
+    """
+
+    def func(row):  # Renders a single row as a <row> element with one <col> child per column label
+        xml = ["<row>"]
+        for field in row.index:
+            xml.append('  <col name="{0}">{1}</col>'.format(field, row[field]))
+        xml.append("</row>")
+        return "\n".join(xml)
+
+    items = df.apply(func, axis=1)  # axis=1: func is called once per row
+    return "\n".join(items)
diff --git a/src/llmsearch/parsers/tables/gmft_parser.py b/src/llmsearch/parsers/tables/gmft_parser.py
@@ -2,62 +2,65 @@
 import pandas as pd
 from typing import Any, List, Optional, Tuple
 from gmft.pdf_bindings import PyPDFium2Document
-from gmft import CroppedTable, TableDetector, AutoFormatConfig, AutoTableFormatter, TATRTableFormatter
+from gmft import (
+    CroppedTable,
+    TableDetector,
+    AutoFormatConfig,
+    AutoTableFormatter,
+)
 from pathlib import Path
 from loguru import logger
 from dataclasses import dataclass
-# from pydantic import BaseModel, Field
 
+from llmsearch.parsers.tables.generic import pandas_df_to_xml, GenericParsedTable
 
-class ParsedTable:
+
+class GMFTParsedTable(GenericParsedTable):
     def __init__(self, table: CroppedTable, page_num: int) -> None:
+        super().__init__(
+            page_number=page_num
+        )  # Initialize the field from the abstract class
         self._table = table
         self.page_num = page_num
-        self.config = AutoFormatConfig()
         self.formatter = AutoTableFormatter()
+        self.failed = False
 
     @cached_property
-    def captions(self) -> List[str]:
-        try:
-            return self._table.captions()
-        except Exception as ex:
-            logger.error(f"Couldn't parse captions: {str(ex)}")
-            return []
-    
+    def _captions(self) -> List[str]:
+        return [c for c in self._table.captions() if c.strip()]
+
+    @cached_property
+    def caption(self) -> str:
+        return "\n".join(set(self._captions))
+
     @property
     def df(self) -> Optional[pd.DataFrame]:
-        ft =  self.formatter.extract(self._table)
+        ft = self.formatter.extract(self._table)
         try:
             df = ft.df()
         except ValueError as ex:
             logger.error(f"Couldn't extract df on page {self.page_num}: {str(ex)}")
+            self.failed = True
+            return None
 
-            config = AutoFormatConfig()
-            config.total_overlap_reject_threshold = 0.8
-            config.large_table_threshold = 0
+            # config = AutoFormatConfig()
+            # config.total_overlap_reject_threshold = 0.8
+            # config.large_table_threshold = 0
 
-            try:
-                logger.info("\tTrying to reover")
-                df = ft.df(config_overrides = config)
-            except ValueError:
-                logger.error(f"\tCouldn't recover, page {self.page_num}: {str(ex)}")
-                return None
+            # try:
+            # logger.info("\tTrying to recover")
+            # df = ft.df(config_overrides = config)
+            # except ValueError:
+            # logger.error(f"\tCouldn't recover, page {self.page_num}: {str(ex)}")
+            # return None
 
         return df
-    
+
     @property
     def xml(self) -> str:
-        def func(row):
-            xml = ['<row>']
-            for field in row.index:
-                xml.append('  <col name="{0}">{1}</col>'.format(field, row[field]))
-            xml.append('</row>')
-            return '\n'.join(xml)
-        df = self.df
-        if df is None:
+        if self.df is None:
             return ""
-        items = df.apply(func, axis=1)
-        return '\n'.join(items)
+        return pandas_df_to_xml(self.df)
 
 
 @dataclass
@@ -70,12 +73,14 @@ def n_tables(self):
         return len(self.cropped_tables)
 
 
-class GMFTTableParser:
+class GMFTParser:
     def __init__(self, fn: Path) -> None:
         self.fn = fn
         logger.info("Loading document.")
+        self._doc = None
+        self._parsed_tables = None
 
-    def detect_tables(self) -> Tuple[List[PageTables], Any]:
+    def detect_page_tables(self) -> Tuple[List[PageTables], Any]:
         """Detects tables in a document and returns list of page tables"""
 
         logger.info("Detecting tables...")
@@ -85,28 +90,39 @@ def detect_tables(self) -> Tuple[List[PageTables], Any]:
 
         for page in doc:
             pt.append(
-                PageTables(page_num=page.page_number, cropped_tables=detector.extract(page))
+                PageTables(
+                    page_num=page.page_number, cropped_tables=detector.extract(page)
+                )
             )
 
-        # doc.close()
         return pt, doc
 
+    @property
+    def parsed_tables(self) -> List[GenericParsedTable]:  # Builds the list on first access, then returns the cached list
+        if self._parsed_tables is None:  # Nothing cached yet: run detection
+            page_tables, self._doc = self.detect_page_tables()  # Document handle is kept on the instance; NOTE(review): nothing visible here closes it - confirm
+            logger.info("Parsing tables ...")
+
+            out_tables = []
+
+            for page_table in page_tables:
+                for cropped_table in page_table.cropped_tables:
+                    out_tables.append(  # One wrapper per cropped table, tagged with its page number
+                        GMFTParsedTable(cropped_table, page_table.page_num)
+                    )
+            self._parsed_tables = out_tables  # Cache so later accesses skip detection
+        return self._parsed_tables
+
 
 if __name__ == "__main__":
     # fn = Path("/home/snexus/Downloads/ws90.pdf")
     fn = Path("/home/snexus/Downloads/SSRN-id2741701.pdf")
 
-    parser = GMFTTableParser(fn=fn)
-    page_tables, doc = parser.detect_tables()
-
-    for t in page_tables:
-        print(f" ========= PAGE {t.page_num} ============")
-        print(f"Number detected tables: {t.n_tables}")
-        for table in t.cropped_tables:
-            p = ParsedTable(table, t.page_num)
-            print(p.captions)
-            print(p.xml)
-            print('-------------')
-
-    doc.close()
+    parser = GMFTParser(fn=fn)
+    for p in parser.parsed_tables:
+        print("-------------")
+        print(p.page_num)
+        print(p.caption)
+        print(p.xml)
+    del parser