From e641137cacfdf58ae9be17f4cb12d0acf5ef200c Mon Sep 17 00:00:00 2001 From: Sen Wu Date: Wed, 17 Jun 2020 14:09:30 -0700 Subject: [PATCH] fix test --- src/fonduer/parser/visual_linker.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/fonduer/parser/visual_linker.py b/src/fonduer/parser/visual_linker.py index 7ffbaed3f..02bb75197 100644 --- a/src/fonduer/parser/visual_linker.py +++ b/src/fonduer/parser/visual_linker.py @@ -69,18 +69,21 @@ def __init__( f"but should be 0.36.0 or above" ) - def link(self, document_name: str, sentences: List[Sentence]) -> Iterator[Sentence]: + def link( + self, document_name: str, sentences: List[Sentence], pdf_path: str = None + ) -> Iterator[Sentence]: """Link visual information with sentences. :param document_name: the document name. :param sentences: sentences to be linked with visual information. - :param pdf_path: The path to the PDF documents, if any. This path will - override the one used in initialization, if provided. + :param pdf_path: The path to the PDF documents, if any, defaults to None. + This path will override the one used in initialization, if provided. :return: A generator of ``Sentence``. """ # sentences should be sorted as their order is not deterministic. self.sentences = sorted(sentences, key=attrgetter("position")) - self.pdf_file = self._get_linked_pdf_path(document_name) + self.pdf_path = pdf_path if pdf_path is not None else self.pdf_path + self.pdf_file = self._get_linked_pdf_path(document_name, self.pdf_path) try: self._extract_pdf_words() except RuntimeError as e: @@ -129,12 +132,13 @@ def _extract_pdf_words(self) -> None: if self.verbose: self.logger.info(f"Extracted {len(self.pdf_word_list)} pdf words") - def _get_linked_pdf_path(self, filename: str) -> str: + def _get_linked_pdf_path(self, filename: str, pdf_path: str = None) -> str: """Get the linked pdf file path, return None if it doesn't exist. - :param filename: The path to the PDF document. 
+ :param filename: The name of the PDF document. + :param pdf_path: The path to the PDF documents, if any, defaults to None. """ - path = self.pdf_path + path = pdf_path if pdf_path is not None else self.pdf_path # If path is file, but not PDF. if os.path.isfile(path) and path.lower().endswith(".pdf"): return path @@ -151,12 +155,13 @@ def _get_linked_pdf_path(self, filename: str) -> str: return None - def is_linkable(self, filename: str) -> bool: + def is_linkable(self, filename: str, pdf_path: str = None) -> bool: """Verify that the file exists and has a PDF extension. :param filename: The path to the PDF document. + :param pdf_path: The path to the PDF documents, if any, defaults to None. """ - return False if self._get_linked_pdf_path(filename) is None else True + return False if self._get_linked_pdf_path(filename, pdf_path) is None else True def _coordinates_from_HTML( self, page: Tag, page_num: int