Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename "VisualLinker" to "PdfVisualParser" to welcome "HocrVisualParser" #518

Merged
merged 7 commits into from
Oct 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,23 @@ Added
* `@HiromuHota`_: Support spaCy v2.3.
(`#506 <https://github.com/HazyResearch/fonduer/pull/506>`_)

Changed
^^^^^^^
* `@HiromuHota`_: Renamed :class:`VisualLinker` to :class:`PdfVisualParser`,
which assumes the following:
(`#518 <https://github.com/HazyResearch/fonduer/pull/518>`_)

* ``pdf_path`` should be a directory path, where PDF files exist, and cannot be a file path.
* The PDF file should have the same basename (:class:`os.path.basename`) as the document.
E.g., the PDF file should be either "123.pdf" or "123.PDF" for "123.html".

* `@HiromuHota`_: Changed :class:`Parser`'s signature as follows:
(`#518 <https://github.com/HazyResearch/fonduer/pull/518>`_)

* Renamed ``vizlink`` to ``visual_parser``.
* Removed ``pdf_path``. Now this is required only by :class:`PdfVisualParser`.
* Removed ``visual``. Provide ``visual_parser`` if visual information is to be parsed.

0.8.3_ - 2020-09-11
-------------------

Expand Down
16 changes: 11 additions & 5 deletions docs/user/parser.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,6 @@ This is Fonduer_'s core Parser object.
:inherited-members:
:show-inheritance:

.. automodule:: fonduer.parser.visual_linker
:members:
:inherited-members:
:show-inheritance:

Lingual Parsers
---------------

Expand All @@ -40,6 +35,17 @@ and enrich them with NLP.
:inherited-members:
:show-inheritance:

Visual Parsers
--------------

The following docs describe various visual parsers. They parse visual information,
e.g., bounding boxes of each word.

.. automodule:: fonduer.parser.visual_parser
:members:
:inherited-members:
:show-inheritance:

Preprocessors
-------------

Expand Down
68 changes: 15 additions & 53 deletions src/fonduer/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
Table,
)
from fonduer.parser.models.utils import construct_stable_id
from fonduer.parser.visual_linker import VisualLinker
from fonduer.parser.visual_parser import VisualParser
from fonduer.utils.udf import UDF, UDFRunner

logger = logging.getLogger(__name__)
Expand All @@ -62,12 +62,8 @@ class Parser(UDFRunner):
replaces various unicode variants of a hyphen (e.g. emdash, endash,
minus, etc.) with a standard ASCII hyphen.
:param tabular: Whether to include tabular information in the parse.
:param visual: Whether to include visual information in the parse.
Requires PDFs for each input document.
:param vizlink: A custom visual linker that inherits
:class:`VisualLinker <fonduer.parser.visual_linker.VisualLinker>`.
Unless otherwise specified, :class:`VisualLinker` will be used.
:param pdf_path: The path to the corresponding PDFs use for visual info.
:param visual_parser: A visual parser that parses visual information.
Defaults to None (visual information is not parsed).
"""

def __init__(
Expand All @@ -88,9 +84,7 @@ def __init__(
("[\u2010\u2011\u2012\u2013\u2014\u2212]", "-")
],
tabular: bool = True, # tabular information
visual: bool = False, # visual information
vizlink: Optional[VisualLinker] = None, # visual linker
pdf_path: Optional[str] = None,
visual_parser: Optional[VisualParser] = None, # visual parser
) -> None:
"""Initialize Parser."""
super().__init__(
Expand All @@ -105,9 +99,7 @@ def __init__(
strip=strip,
replacements=replacements,
tabular=tabular,
visual=visual,
vizlink=vizlink,
pdf_path=pdf_path,
visual_parser=visual_parser,
language=language,
)

Expand All @@ -117,14 +109,11 @@ def apply( # type: ignore
clear: bool = True,
parallelism: Optional[int] = None,
progress_bar: bool = True,
pdf_path: Optional[str] = None,
) -> None:
"""Run the Parser.

:param doc_loader: An iterable of ``Documents`` to parse. Typically,
one of Fonduer's document preprocessors.
:param pdf_path: The path to the PDF documents, if any. This path will
override the one used in initialization, if provided.
:param clear: Whether or not to clear the labels table before applying
these LFs.
:param parallelism: How many threads to use for extraction. This will
Expand All @@ -135,7 +124,6 @@ def apply( # type: ignore
"""
super().apply(
doc_loader,
pdf_path=pdf_path,
clear=clear,
parallelism=parallelism,
progress_bar=progress_bar,
Expand All @@ -146,11 +134,8 @@ def _add(self, doc: Union[Document, None]) -> None:
if doc:
self.session.add(doc)

def clear(self, pdf_path: Optional[str] = None) -> None: # type: ignore
"""Clear all of the ``Context`` objects in the database.

:param pdf_path: This parameter is ignored.
"""
def clear(self) -> None: # type: ignore
"""Clear all of the ``Context`` objects in the database."""
self.session.query(Context).delete(synchronize_session="fetch")

def get_last_documents(self) -> List[Document]:
Expand Down Expand Up @@ -186,18 +171,12 @@ def __init__(
strip: bool,
replacements: List[Tuple[str, str]],
tabular: bool,
visual: bool,
vizlink: Optional[VisualLinker],
pdf_path: Optional[str],
visual_parser: Optional[VisualParser],
language: Optional[str],
**kwargs: Any,
) -> None:
"""Initialize Parser UDF.

:param visual: boolean, if True visual features are used in the model
:param pdf_path: directory where pdf are saved, if a pdf file is not
found, it will be created from the html document and saved in that
directory
:param replacements: a list of (_pattern_, _replace_) tuples where
_pattern_ is a regex and _replace_ is a character string.
All occurrences of _pattern_ in the text will be replaced by
Expand Down Expand Up @@ -238,40 +217,23 @@ def __init__(
self.tabular = tabular

# visual setup
self.visual = visual
self.vizlink = vizlink
if self.visual:
self.pdf_path = pdf_path
if not self.vizlink:
# Use the provided pdf_path if present
if not self.pdf_path:
warnings.warn(
"Visual parsing failed: pdf_path is required. "
+ "Proceeding without visual parsing.",
RuntimeWarning,
)
self.visual = False
else:
self.vizlink = VisualLinker(pdf_path)
self.visual_parser = visual_parser

def apply( # type: ignore
self, document: Document, pdf_path: Optional[str] = None, **kwargs: Any
self, document: Document, **kwargs: Any
) -> Optional[Document]:
"""Parse a text in an instance of Document.

:param document: document to parse.
:param pdf_path: path of a pdf file that the document is visually linked with.
"""
try:
[y for y in self.parse(document, document.text)]
if self.visual:
# Use the provided pdf_path if present
self.pdf_path = pdf_path if pdf_path else self.pdf_path
if not self.vizlink.is_linkable(document.name, self.pdf_path):
if self.visual_parser:
if not self.visual_parser.is_parsable(document.name):
warnings.warn(
(
f"Visual parse failed. "
f"{self.pdf_path + document.name} not a PDF. "
f"{document.name} not a PDF. "
f"Proceeding without visual parsing."
),
RuntimeWarning,
Expand All @@ -280,8 +242,8 @@ def apply( # type: ignore
# Add visual attributes
[
y
for y in self.vizlink.link(
document.name, document.sentences, self.pdf_path
for y in self.visual_parser.parse(
document.name, document.sentences
)
]
return document
Expand Down
5 changes: 5 additions & 0 deletions src/fonduer/parser/visual_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Fonduer's visual parser module."""
from fonduer.parser.visual_parser.pdf_visual_parser import PdfVisualParser
from fonduer.parser.visual_parser.visual_parser import VisualParser

__all__ = ["VisualParser", "PdfVisualParser"]
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
"""Fonduer visual linker."""
"""Fonduer visual parser."""
import logging
import os
import re
import shutil
import subprocess
from builtins import object, range, zip
from builtins import range, zip
from collections import OrderedDict, defaultdict
from operator import attrgetter
from typing import DefaultDict, Dict, Iterator, List, Optional, Tuple
from typing import DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple

import numpy as np
from bs4 import BeautifulSoup
from bs4.element import Tag
from editdistance import eval as editdist # Alternative library: python-levenshtein

from fonduer.parser.models import Sentence
from fonduer.parser.visual_parser.visual_parser import VisualParser
from fonduer.utils.utils_visual import Bbox

logger = logging.getLogger(__name__)
Expand All @@ -36,17 +37,28 @@
HtmlWord = Tuple[HtmlWordId, str]


class VisualLinker(object):
"""Link visual information with sentences."""
class PdfVisualParser(VisualParser):
"""Link visual information, extracted from PDF, with parsed sentences.

def __init__(
self, pdf_path: str, time: bool = False, verbose: bool = False
) -> None:
"""Initialize VisualLinker."""
This linker assumes the following conditions for expected results:

- The PDF file exists in a directory specified by `pdf_path`.
- The basename of the PDF file is the same as the *document name*
and its extension is either ".pdf" or ".PDF".
- A PDF has a text layer.
"""

def __init__(self, pdf_path: str, verbose: bool = False) -> None:
"""Initialize VisualParser.

:param pdf_path: a path to directory that contains PDF files.
:param verbose: whether to turn on verbose logging.
"""
if not os.path.isdir(pdf_path):
raise ValueError(f"No directory exists at {pdf_path}!")
self.pdf_path = pdf_path
self.pdf_file: Optional[str] = None
self.verbose = verbose
self.time = time
self.coordinate_map: Optional[Dict[PdfWordId, Bbox]] = None
self.pdf_word_list: Optional[List[PdfWord]] = None
self.html_word_list: Optional[List[HtmlWord]] = None
Expand All @@ -70,21 +82,18 @@ def __init__(
f"but should be 0.36.0 or above"
)

def link(
self, document_name: str, sentences: List[Sentence], pdf_path: str = None
def parse(
self, document_name: str, sentences: Iterable[Sentence]
) -> Iterator[Sentence]:
"""Link visual information with sentences.

:param document_name: the document name.
:param sentences: sentences to be linked with visual information.
:param pdf_path: The path to the PDF documents, if any, defaults to None.
This path will override the one used in initialization, if provided.
:return: A generator of ``Sentence``.
"""
# sentences should be sorted as their order is not deterministic.
self.sentences = sorted(sentences, key=attrgetter("position"))
self.pdf_path = pdf_path if pdf_path is not None else self.pdf_path
self.pdf_file = self._get_linked_pdf_path(document_name, self.pdf_path)
self.pdf_file = self._get_linked_pdf_path(document_name)
try:
self._extract_pdf_words()
except RuntimeError as e:
Expand Down Expand Up @@ -129,36 +138,26 @@ def _extract_pdf_words(self) -> None:
if self.verbose:
logger.info(f"Extracted {len(self.pdf_word_list)} pdf words")

def _get_linked_pdf_path(self, filename: str, pdf_path: str = None) -> str:
"""Get the linked pdf file path, return None if it doesn't exist.
def _get_linked_pdf_path(self, document_name: str) -> str:
"""Get the pdf file path, return None if it doesn't exist.

:param filename: The name to the PDF document.
:param pdf_path: The path to the PDF documents, if any, defaults to None.
:param document_name: a document name.
"""
path = pdf_path if pdf_path is not None else self.pdf_path
# If path is file, but not PDF.
if os.path.isfile(path) and path.lower().endswith(".pdf"):
return path
else:
full_path = os.path.join(path, filename)
if os.path.isfile(full_path) and full_path.lower().endswith(".pdf"):
return full_path
full_path = os.path.join(path, filename + ".pdf")
if os.path.isfile(full_path):
return full_path
full_path = os.path.join(path, filename + ".PDF")
if os.path.isfile(full_path):
return full_path
full_path = os.path.join(self.pdf_path, document_name + ".pdf")
if os.path.isfile(full_path):
return full_path
full_path = os.path.join(self.pdf_path, document_name + ".PDF")
if os.path.isfile(full_path):
return full_path

return None

def is_linkable(self, filename: str, pdf_path: str = None) -> bool:
def is_parsable(self, document_name: str) -> bool:
"""Verify that the file exists and has a PDF extension.

:param filename: The path to the PDF document.
:param pdf_path: The path to the PDF documents, if any, defaults to None.
:param document_name: the document name.
"""
return False if self._get_linked_pdf_path(filename, pdf_path) is None else True
return False if self._get_linked_pdf_path(document_name) is None else True

def _coordinates_from_HTML(
self, page: Tag, page_num: int
Expand Down
32 changes: 32 additions & 0 deletions src/fonduer/parser/visual_parser/visual_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Abstract visual parser."""
from abc import ABC, abstractmethod
from typing import Iterable, Iterator

from fonduer.parser.models import Sentence


class VisualParser(ABC):
"""Abstract visual parser.

Concrete visual parsers must implement both :meth:`parse` and
:meth:`is_parsable`.
"""

@abstractmethod
def parse(
self,
document_name: str,
sentences: Iterable[Sentence],
) -> Iterator[Sentence]:
"""Parse visual information and link it with the given sentences.

:param document_name: the document name.
:param sentences: sentences to be linked with visual information.
:yield: sentences with visual information.
"""
pass

@abstractmethod
def is_parsable(self, document_name: str) -> bool:
"""Check if visual information can be parsed.

:param document_name: the document name.
:return: Whether visual information is parsable.
"""
pass
Loading