Skip to content

Commit

Permalink
fix test
Browse files Browse the repository at this point in the history
  • Loading branch information
senwu committed Jun 18, 2020
1 parent b6322f3 commit e641137
Showing 1 changed file with 14 additions and 9 deletions.
23 changes: 14 additions & 9 deletions src/fonduer/parser/visual_linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,18 +69,21 @@ def __init__(
f"but should be 0.36.0 or above"
)

def link(self, document_name: str, sentences: List[Sentence]) -> Iterator[Sentence]:
def link(
self, document_name: str, sentences: List[Sentence], pdf_path: str = None
) -> Iterator[Sentence]:
"""Link visual information with sentences.
:param document_name: the document name.
:param sentences: sentences to be linked with visual information.
:param pdf_path: The path to the PDF documents, if any. This path will
override the one used in initialization, if provided.
:param pdf_path: The path to the PDF documents, if any, defaults to None.
This path will override the one used in initialization, if provided.
:return: A generator of ``Sentence``.
"""
# sentences should be sorted as their order is not deterministic.
self.sentences = sorted(sentences, key=attrgetter("position"))
self.pdf_file = self._get_linked_pdf_path(document_name)
self.pdf_path = pdf_path if pdf_path is not None else self.pdf_path
self.pdf_file = self._get_linked_pdf_path(document_name, self.pdf_path)
try:
self._extract_pdf_words()
except RuntimeError as e:
Expand Down Expand Up @@ -129,12 +132,13 @@ def _extract_pdf_words(self) -> None:
if self.verbose:
self.logger.info(f"Extracted {len(self.pdf_word_list)} pdf words")

def _get_linked_pdf_path(self, filename: str) -> str:
def _get_linked_pdf_path(self, filename: str, pdf_path: str = None) -> str:
"""Get the linked pdf file path, return None if it doesn't exist.
:param filename: The path to the PDF document.
:param filename: The name of the PDF document.
:param pdf_path: The path to the PDF documents, if any, defaults to None.
"""
path = self.pdf_path
path = pdf_path if pdf_path is not None else self.pdf_path
# If path is itself a PDF file, use it directly.
if os.path.isfile(path) and path.lower().endswith(".pdf"):
return path
Expand All @@ -151,12 +155,13 @@ def _get_linked_pdf_path(self, filename: str) -> str:

return None

def is_linkable(self, filename: str) -> bool:
def is_linkable(self, filename: str, pdf_path: str = None) -> bool:
"""Verify that the file exists and has a PDF extension.
:param filename: The name of the PDF document.
:param pdf_path: The path to the PDF documents, if any, defaults to None.
"""
return False if self._get_linked_pdf_path(filename) is None else True
return False if self._get_linked_pdf_path(filename, pdf_path) is None else True

def _coordinates_from_HTML(
self, page: Tag, page_num: int
Expand Down

0 comments on commit e641137

Please sign in to comment.