HazyResearch · senwu · Oct 3, 2020 · Sep 25, 2020 · Sep 25, 2020 · Sep 25, 2020
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -6,6 +6,16 @@ Added
 * `@HiromuHota`_: Support spaCy v2.3.
   (`#506 <https://github.com/HazyResearch/fonduer/pull/506>`_)
 
+Changed
+^^^^^^^
+* `@HiromuHota`_: Renamed :class:`VisualLinker` to :class:`PdfVisualParser`,
+  which takes ``pdf_path``, and changed :class:`Parser`'s signature as follows:
+  (`#518 <https://github.com/HazyResearch/fonduer/pull/518>`_)
+
+    * Renamed ``vizlink`` to ``visual_parser``.
+    * Removed ``pdf_path``. Now this is required only by :class:`PdfVisualParser`.
+    * Removed ``visual``. Provide ``visual_parser`` if visual information is to be parsed.
+
 0.8.3_ - 2020-09-11
 -------------------
 

diff --git a/docs/user/parser.rst b/docs/user/parser.rst
@@ -24,11 +24,6 @@ This is Fonduer_'s core Parser object.
     :inherited-members:
     :show-inheritance:
 
-.. automodule:: fonduer.parser.visual_linker
-    :members:
-    :inherited-members:
-    :show-inheritance:
-
 Lingual Parsers
 ---------------
 
@@ -40,6 +35,17 @@ and enrich them with NLP.
     :inherited-members:
     :show-inheritance:
 
+Visual Parsers
+--------------
+
+The following docs describe various visual parsers. They parse visual information,
+e.g., bounding boxes of each word.
+
+.. automodule:: fonduer.parser.visual_parser
+    :members:
+    :inherited-members:
+    :show-inheritance:
+
 Preprocessors
 -------------
 

diff --git a/src/fonduer/parser/parser.py b/src/fonduer/parser/parser.py
@@ -35,7 +35,7 @@
     Table,
 )
 from fonduer.parser.models.utils import construct_stable_id
-from fonduer.parser.visual_linker import VisualLinker
+from fonduer.parser.visual_parser import VisualParser
 from fonduer.utils.udf import UDF, UDFRunner
 
 logger = logging.getLogger(__name__)
@@ -62,12 +62,8 @@ class Parser(UDFRunner):
         replaces various unicode variants of a hyphen (e.g. emdash, endash,
         minus, etc.) with a standard ASCII hyphen.
     :param tabular: Whether to include tabular information in the parse.
-    :param visual: Whether to include visual information in the parse.
-        Requires PDFs for each input document.
-    :param vizlink: A custom visual linker that inherits
-        :class:`VisualLinker <fonduer.parser.visual_linker.VisualLinker>`.
-        Unless otherwise specified, :class:`VisualLinker` will be used.
-    :param pdf_path: The path to the corresponding PDFs use for visual info.
+    :param visual_parser: A visual parser that parses visual information.
+        Defaults to None (visual information is not parsed).
     """
 
     def __init__(
@@ -88,9 +84,7 @@ def __init__(
             ("[\u2010\u2011\u2012\u2013\u2014\u2212]", "-")
         ],
         tabular: bool = True,  # tabular information
-        visual: bool = False,  # visual information
-        vizlink: Optional[VisualLinker] = None,  # visual linker
-        pdf_path: Optional[str] = None,
+        visual_parser: Optional[VisualParser] = None,  # visual parser
     ) -> None:
         """Initialize Parser."""
         super().__init__(
@@ -105,9 +99,7 @@ def __init__(
             strip=strip,
             replacements=replacements,
             tabular=tabular,
-            visual=visual,
-            vizlink=vizlink,
-            pdf_path=pdf_path,
+            visual_parser=visual_parser,
             language=language,
         )
 
@@ -117,14 +109,11 @@ def apply(  # type: ignore
         clear: bool = True,
         parallelism: Optional[int] = None,
         progress_bar: bool = True,
-        pdf_path: Optional[str] = None,
     ) -> None:
         """Run the Parser.
 
         :param doc_loader: An iteratable of ``Documents`` to parse. Typically,
             one of Fonduer's document preprocessors.
-        :param pdf_path: The path to the PDF documents, if any. This path will
-            override the one used in initialization, if provided.
         :param clear: Whether or not to clear the labels table before applying
             these LFs.
         :param parallelism: How many threads to use for extraction. This will
@@ -135,7 +124,6 @@ def apply(  # type: ignore
         """
         super().apply(
             doc_loader,
-            pdf_path=pdf_path,
             clear=clear,
             parallelism=parallelism,
             progress_bar=progress_bar,
@@ -146,11 +134,8 @@ def _add(self, doc: Union[Document, None]) -> None:
         if doc:
             self.session.add(doc)
 
-    def clear(self, pdf_path: Optional[str] = None) -> None:  # type: ignore
-        """Clear all of the ``Context`` objects in the database.
-
-        :param pdf_path: This parameter is ignored.
-        """
+    def clear(self) -> None:  # type: ignore
+        """Clear all of the ``Context`` objects in the database."""
         self.session.query(Context).delete(synchronize_session="fetch")
 
     def get_last_documents(self) -> List[Document]:
@@ -186,18 +171,12 @@ def __init__(
         strip: bool,
         replacements: List[Tuple[str, str]],
         tabular: bool,
-        visual: bool,
-        vizlink: Optional[VisualLinker],
-        pdf_path: Optional[str],
+        visual_parser: Optional[VisualParser],
         language: Optional[str],
         **kwargs: Any,
     ) -> None:
         """Initialize Parser UDF.
 
-        :param visual: boolean, if True visual features are used in the model
-        :param pdf_path: directory where pdf are saved, if a pdf file is not
-            found, it will be created from the html document and saved in that
-            directory
         :param replacements: a list of (_pattern_, _replace_) tuples where
             _pattern_ isinstance a regex and _replace_ is a character string.
             All occurents of _pattern_ in the text will be replaced by
@@ -238,40 +217,23 @@ def __init__(
         self.tabular = tabular
 
         # visual setup
-        self.visual = visual
-        self.vizlink = vizlink
-        if self.visual:
-            self.pdf_path = pdf_path
-            if not self.vizlink:
-                # Use the provided pdf_path if present
-                if not self.pdf_path:
-                    warnings.warn(
-                        "Visual parsing failed: pdf_path is required. "
-                        + "Proceeding without visual parsing.",
-                        RuntimeWarning,
-                    )
-                    self.visual = False
-                else:
-                    self.vizlink = VisualLinker(pdf_path)
+        self.visual_parser = visual_parser
 
     def apply(  # type: ignore
-        self, document: Document, pdf_path: Optional[str] = None, **kwargs: Any
+        self, document: Document, **kwargs: Any
     ) -> Optional[Document]:
         """Parse a text in an instance of Document.
 
         :param document: document to parse.
-        :param pdf_path: path of a pdf file that the document is visually linked with.
         """
         try:
             [y for y in self.parse(document, document.text)]
-            if self.visual:
-                # Use the provided pdf_path if present
-                self.pdf_path = pdf_path if pdf_path else self.pdf_path
-                if not self.vizlink.is_linkable(document.name, self.pdf_path):
+            if self.visual_parser:
+                if not self.visual_parser.is_parsable(document.name):
                     warnings.warn(
                         (
                             f"Visual parse failed. "
-                            f"{self.pdf_path + document.name} not a PDF. "
+                            f"{document.name} not a PDF. "
                             f"Proceeding without visual parsing."
                         ),
                         RuntimeWarning,
@@ -280,8 +242,8 @@ def apply(  # type: ignore
                     # Add visual attributes
                     [
                         y
-                        for y in self.vizlink.link(
-                            document.name, document.sentences, self.pdf_path
+                        for y in self.visual_parser.parse(
+                            document.name, document.sentences
                         )
                     ]
             return document

diff --git a/src/fonduer/parser/visual_parser/__init__.py b/src/fonduer/parser/visual_parser/__init__.py
@@ -0,0 +1,5 @@
+"""Fonduer's visual parser module."""
+from fonduer.parser.visual_parser.pdf_visual_parser import PdfVisualParser
+from fonduer.parser.visual_parser.visual_parser import VisualParser
+
+__all__ = ["VisualParser", "PdfVisualParser"]
diff --git a/src/fonduer/parser/visual_linker.py → ...parser/visual_parser/pdf_visual_parser.py b/src/fonduer/parser/visual_linker.py → ...parser/visual_parser/pdf_visual_parser.py
@@ -1,20 +1,21 @@
-"""Fonduer visual linker."""
+"""Fonduer visual parser."""
 import logging
 import os
 import re
 import shutil
 import subprocess
-from builtins import object, range, zip
+from builtins import range, zip
 from collections import OrderedDict, defaultdict
 from operator import attrgetter
-from typing import DefaultDict, Dict, Iterator, List, Optional, Tuple
+from typing import DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple
 
 import numpy as np
 from bs4 import BeautifulSoup
 from bs4.element import Tag
 from editdistance import eval as editdist  # Alternative library: python-levenshtein
 
 from fonduer.parser.models import Sentence
+from fonduer.parser.visual_parser.visual_parser import VisualParser
 from fonduer.utils.utils_visual import Bbox
 
 logger = logging.getLogger(__name__)
@@ -36,17 +37,28 @@
 HtmlWord = Tuple[HtmlWordId, str]
 
 
-class VisualLinker(object):
-    """Link visual information with sentences."""
+class PdfVisualParser(VisualParser):
+    """Link visual information, extracted from PDF, with parsed sentences.
 
-    def __init__(
-        self, pdf_path: str, time: bool = False, verbose: bool = False
-    ) -> None:
-        """Initialize VisualLinker."""
+    This parser assumes the following conditions for expected results:
+
+    - The PDF file exists in a directory specified by `pdf_path`.
+    - The basename of the PDF file is same as the *document name*
+      and its extension is either ".pdf" or ".PDF".
+    - A PDF has a text layer.
+    """
+
+    def __init__(self, pdf_path: str, verbose: bool = False) -> None:
+        """Initialize PdfVisualParser.
+
+        :param pdf_path: a path to directory that contains PDF files.
+        :param verbose: whether to turn on verbose logging.
+        """
+        if not os.path.isdir(pdf_path):
+            raise ValueError(f"No directory exists at {pdf_path}!")
         self.pdf_path = pdf_path
         self.pdf_file: Optional[str] = None
         self.verbose = verbose
-        self.time = time
         self.coordinate_map: Optional[Dict[PdfWordId, Bbox]] = None
         self.pdf_word_list: Optional[List[PdfWord]] = None
         self.html_word_list: Optional[List[HtmlWord]] = None
@@ -70,21 +82,18 @@ def __init__(
                 f"but should be 0.36.0 or above"
             )
 
-    def link(
-        self, document_name: str, sentences: List[Sentence], pdf_path: str = None
+    def parse(
+        self, document_name: str, sentences: Iterable[Sentence]
     ) -> Iterator[Sentence]:
         """Link visual information with sentences.
 
         :param document_name: the document name.
         :param sentences: sentences to be linked with visual information.
-        :param pdf_path: The path to the PDF documents, if any, defaults to None.
-            This path will override the one used in initialization, if provided.
         :return: A generator of ``Sentence``.
         """
         # sentences should be sorted as their order is not deterministic.
         self.sentences = sorted(sentences, key=attrgetter("position"))
-        self.pdf_path = pdf_path if pdf_path is not None else self.pdf_path
-        self.pdf_file = self._get_linked_pdf_path(document_name, self.pdf_path)
+        self.pdf_file = self._get_linked_pdf_path(document_name)
         try:
             self._extract_pdf_words()
         except RuntimeError as e:
@@ -129,36 +138,26 @@ def _extract_pdf_words(self) -> None:
         if self.verbose:
             logger.info(f"Extracted {len(self.pdf_word_list)} pdf words")
 
-    def _get_linked_pdf_path(self, filename: str, pdf_path: str = None) -> str:
-        """Get the linked pdf file path, return None if it doesn't exist.
+    def _get_linked_pdf_path(self, document_name: str) -> str:
+        """Get the pdf file path, return None if it doesn't exist.
 
-        :param filename: The name to the PDF document.
-        :param pdf_path: The path to the PDF documents, if any, defaults to None.
+        :param document_name: a document name.
         """
-        path = pdf_path if pdf_path is not None else self.pdf_path
-        # If path is file, but not PDF.
-        if os.path.isfile(path) and path.lower().endswith(".pdf"):
-            return path
-        else:
-            full_path = os.path.join(path, filename)
-            if os.path.isfile(full_path) and full_path.lower().endswith(".pdf"):
-                return full_path
-            full_path = os.path.join(path, filename + ".pdf")
-            if os.path.isfile(full_path):
-                return full_path
-            full_path = os.path.join(path, filename + ".PDF")
-            if os.path.isfile(full_path):
-                return full_path
+        full_path = os.path.join(self.pdf_path, document_name + ".pdf")
+        if os.path.isfile(full_path):
+            return full_path
+        full_path = os.path.join(self.pdf_path, document_name + ".PDF")
+        if os.path.isfile(full_path):
+            return full_path
 
         return None
 
-    def is_linkable(self, filename: str, pdf_path: str = None) -> bool:
+    def is_parsable(self, document_name: str) -> bool:
         """Verify that the file exists and has a PDF extension.
 
-        :param filename: The path to the PDF document.
-        :param pdf_path: The path to the PDF documents, if any, defaults to None.
+        :param document_name: the document name.
         """
-        return False if self._get_linked_pdf_path(filename, pdf_path) is None else True
+        return False if self._get_linked_pdf_path(document_name) is None else True
 
     def _coordinates_from_HTML(
         self, page: Tag, page_num: int

diff --git a/src/fonduer/parser/visual_parser/visual_parser.py b/src/fonduer/parser/visual_parser/visual_parser.py
@@ -0,0 +1,32 @@
+"""Abstract visual parser."""
+from abc import ABC, abstractmethod
+from typing import Iterable, Iterator
+
+from fonduer.parser.models import Sentence
+
+
+class VisualParser(ABC):
+    """Abstract visual parser."""
+
+    @abstractmethod
+    def parse(
+        self,
+        document_name: str,
+        sentences: Iterable[Sentence],
+    ) -> Iterator[Sentence]:
+        """Parse visual information and link it with the given sentences.
+
+        :param document_name: the document name.
+        :param sentences: sentences to be linked with visual information.
+        :yield: sentences with visual information.
+        """
+        pass
+
+    @abstractmethod
+    def is_parsable(self, document_name: str) -> bool:
+        """Check if visual information can be parsed.
+
+        :param document_name: the document name.
+        :return: Whether visual information is parsable.
+        """
+        pass