Fix pdf_path not working without a trailing slash #459

Merged · 6 commits · Jun 18, 2020
2 changes: 2 additions & 0 deletions .github/workflows/pythonpackage.yml
@@ -56,6 +56,7 @@ jobs:
createdb e2e_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb inc_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb meta_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb parser_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb pg_test -p 5432 -h 127.0.0.1 -w -U postgres
cd tests && ./download_data.sh && cd ..
python -m spacy download en
@@ -122,6 +123,7 @@ jobs:
createdb e2e_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb inc_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb meta_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb parser_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb pg_test -p 5432 -h 127.0.0.1 -w -U postgres
cd tests && ./download_data.sh && cd ..
- name: Print Version Info
3 changes: 3 additions & 0 deletions CHANGELOG.rst
@@ -32,6 +32,9 @@ Deprecated

Fixed
^^^^^
* `@senwu`_: Fix pdf_path not working without a trailing slash.
(`#442 <https://github.com/HazyResearch/fonduer/issues/442>`_)
(`#457 <https://github.com/HazyResearch/fonduer/pull/457>`_)
* `@kaikun213`_: Fix bug in table range difference calculations.
(`#420 <https://github.com/HazyResearch/fonduer/pull/420>`_)
* `@HiromuHota`_: mention_extractor.apply with clear=True now works even if it's not the first run.
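For context, a minimal sketch (not part of the diff) of why the trailing slash mattered: the old `link()` built the PDF file name by plain string concatenation, which silently produces a wrong path when the directory is given without a trailing slash, while `os.path.join`, as used in this PR, handles both forms. The paths below come from the test data used in this PR.

```python
import os

document_name = "md"

for pdf_path in ("tests/data/pdf_simple/", "tests/data/pdf_simple"):
    # Old behaviour (see the visual_linker.py hunk below): concatenation.
    concatenated = pdf_path + document_name + ".pdf"
    # New behaviour: os.path.join is indifferent to the trailing slash.
    joined = os.path.join(pdf_path, document_name + ".pdf")
    print(concatenated)  # .../pdf_simple/md.pdf, then .../pdf_simplemd.pdf
    print(joined)        # .../pdf_simple/md.pdf in both cases
```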
1 change: 1 addition & 0 deletions docs/dev/tests.rst
@@ -9,6 +9,7 @@ In order to run the tests, you will need to create the local databases used
by the tests::

$ createdb meta_test
$ createdb parser_test
$ createdb e2e_test
$ createdb inc_test
$ createdb pg_test
2 changes: 1 addition & 1 deletion src/fonduer/parser/parser.py
@@ -262,7 +262,7 @@ def apply( # type: ignore
if self.visual:
# Use the provided pdf_path if present
self.pdf_path = pdf_path if pdf_path else self.pdf_path
if not self.vizlink.is_linkable(document.name):
if not self.vizlink.is_linkable(document.name, self.pdf_path):
warnings.warn(
(
f"Visual parse failed. "
46 changes: 27 additions & 19 deletions src/fonduer/parser/visual_linker.py
@@ -70,23 +70,20 @@ def __init__(
)

def link(
self, document_name: str, sentences: List[Sentence], pdf_path: str
self, document_name: str, sentences: List[Sentence], pdf_path: str = None
) -> Iterator[Sentence]:
"""Link visual information with sentences.

:param document_name: the document name.
:param sentences: sentences to be linked with visual information.
:param pdf_path: The path to the PDF documents, if any. This path will
override the one used in initialization, if provided.
:param pdf_path: The path to the PDF documents, if any, defaults to None.
This path will override the one used in initialization, if provided.
:return: A generator of ``Sentence``.
"""
# sentences should be sorted as their order is not deterministic.
self.sentences = sorted(sentences, key=attrgetter("position"))
self.pdf_file = (
pdf_path if os.path.isfile(pdf_path) else pdf_path + document_name + ".pdf"
)
if not os.path.isfile(self.pdf_file):
self.pdf_file = self.pdf_file[:-3] + "PDF"
self.pdf_path = pdf_path if pdf_path is not None else self.pdf_path
self.pdf_file = self._get_linked_pdf_path(document_name, self.pdf_path)
try:
self._extract_pdf_words()
except RuntimeError as e:
@@ -135,25 +132,36 @@ def _extract_pdf_words(self) -> None:
if self.verbose:
self.logger.info(f"Extracted {len(self.pdf_word_list)} pdf words")

def is_linkable(self, filename: str) -> bool:
"""Verify that the file exists and has a PDF extension.
def _get_linked_pdf_path(self, filename: str, pdf_path: str = None) -> str:
"""Get the linked pdf file path, return None if it doesn't exist.

:param filename: The path to the PDF document.
:param filename: The name of the PDF document.
:param pdf_path: The path to the PDF documents, if any, defaults to None.
"""
path = self.pdf_path
path = pdf_path if pdf_path is not None else self.pdf_path
# If path is itself a file with a PDF extension, use it directly.
if os.path.isfile(path) and path.lower().endswith(".pdf"):
return True
return path
else:
full_path = os.path.join(path, filename)
if os.path.isfile(full_path) and full_path.lower().endswith(".pdf"):
return True
elif os.path.isfile(os.path.join(path, filename + ".pdf")):
return True
elif os.path.isfile(os.path.join(path, filename + ".PDF")):
return True
return full_path
full_path = os.path.join(path, filename + ".pdf")
if os.path.isfile(full_path):
return full_path
full_path = os.path.join(path, filename + ".PDF")
if os.path.isfile(full_path):
return full_path

return None

return False
def is_linkable(self, filename: str, pdf_path: str = None) -> bool:
"""Verify that the file exists and has a PDF extension.

:param filename: The path to the PDF document.
:param pdf_path: The path to the PDF documents, if any, defaults to None.
"""
return self._get_linked_pdf_path(filename, pdf_path) is not None

def _coordinates_from_HTML(
self, page: Tag, page_num: int
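Because the hunk above interleaves the removed is_linkable body with the new code, here is a standalone sketch of the lookup order the new _get_linked_pdf_path implements (a reading aid under the same assumptions as the diff, not the committed code verbatim): pdf_path may itself point at a PDF file, or at a directory, with or without a trailing slash, containing the document name, the document name plus ".pdf", or plus ".PDF".

```python
import os
from typing import Optional

def resolve_pdf(pdf_path: str, document_name: str) -> Optional[str]:
    """Return the path of the matching PDF, or None if nothing is found."""
    # Case 1: pdf_path is already a file with a PDF extension.
    if os.path.isfile(pdf_path) and pdf_path.lower().endswith(".pdf"):
        return pdf_path
    # Case 2: pdf_path is a directory; try the document name with and without
    # an explicit extension. os.path.join makes the trailing slash irrelevant.
    for candidate in (document_name, document_name + ".pdf", document_name + ".PDF"):
        full_path = os.path.join(pdf_path, candidate)
        if os.path.isfile(full_path) and full_path.lower().endswith(".pdf"):
            return full_path
    return None
```

The test added below, test_various_file_path_formats, exercises each of these combinations with and without trailing slashes.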
125 changes: 125 additions & 0 deletions tests/parser/test_parser.py
@@ -4,6 +4,8 @@

import pytest

import fonduer
from fonduer.parser import Parser
from fonduer.parser.lingual_parser import SpacyParser
from fonduer.parser.models import Document
from fonduer.parser.parser import ParserUDF, SimpleParser
@@ -14,6 +16,15 @@
TSVDocPreprocessor,
)

DB = "parser_test"
if "CI" in os.environ:
CONN_STRING = (
f"postgresql://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}"
+ f"@{os.environ['POSTGRES_HOST']}:{os.environ['POSTGRES_PORT']}/{DB}"
)
else:
CONN_STRING = f"postgresql://127.0.0.1:5432/{DB}"


def get_parser_udf(
structural=True, # structural information
@@ -812,3 +823,117 @@ def test_parser_no_image():

# Check that doc has no figures
assert len(doc.figures) == 0


def test_various_file_path_formats():
"""Test the parser with various file path formats."""
session = fonduer.Meta.init(CONN_STRING).Session()

def parse(docs_path, pdf_path):
# Preprocessor for the Docs
doc_preprocessor = HTMLDocPreprocessor(docs_path)

# Create a Parser and parse the documents
corpus_parser = Parser(
session,
parallelism=1,
structural=True,
lingual=True,
visual=True,
pdf_path=pdf_path,
)

corpus_parser.clear()
corpus_parser.apply(doc_preprocessor)
return corpus_parser

# Test that doc_path is a file path, and pdf_path is a file path
docs_path = "tests/data/html_simple/md.html"
pdf_path = "tests/data/pdf_simple/md.pdf"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 1
assert docs[0].sentences[0].top is not None

# Test that doc_path is a directory, and pdf_path is a directory
docs_path = "tests/data/html_simple/"
pdf_path = "tests/data/pdf_simple/"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 6
for doc in docs:
# table_span.pdf and no_image_unsorted.pdf do not exist, so no
# coordinate info available
if doc.name in ["table_span", "no_image_unsorted"]:
assert doc.sentences[0].top is None
else:
assert doc.sentences[0].top is not None

# Test that doc_path is a directory (no trailing slash), and pdf_path is a
# directory
docs_path = "tests/data/html_simple"
pdf_path = "tests/data/pdf_simple/"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 6
for doc in docs:
# table_span.pdf and no_image_unsorted.pdf do not exist, so no
# coordinate info available
if doc.name in ["table_span", "no_image_unsorted"]:
assert doc.sentences[0].top is None
else:
assert doc.sentences[0].top is not None

# Test that doc_path is a directory, and pdf_path is a directory (no trailing
# slash)
docs_path = "tests/data/html_simple/"
pdf_path = "tests/data/pdf_simple"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 6
for doc in docs:
# table_span.pdf and no_image_unsorted.pdf do not exist, so no
# coordinate info available
if doc.name in ["table_span", "no_image_unsorted"]:
assert doc.sentences[0].top is None
else:
assert doc.sentences[0].top is not None

# Test that doc_path is a directory (no trailing slash), and pdf_path is a
# directory (no trailing slash)
docs_path = "tests/data/html_simple"
pdf_path = "tests/data/pdf_simple"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 6
for doc in docs:
# table_span.pdf and no_image_unsorted.pdf do not exist, so no
# coordinate info available
if doc.name in ["table_span", "no_image_unsorted"]:
assert doc.sentences[0].top is None
else:
assert doc.sentences[0].top is not None

# Test that doc_path is a file path, and pdf_path is a directory
docs_path = "tests/data/html_extended/ext_diseases.html"
pdf_path = "tests/data/pdf_extended/"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 1
assert docs[0].sentences[0].top is not None

# Test that doc_path is a file path, and pdf_path is a directory (no trailing
# slash)
docs_path = "tests/data/html_extended/ext_diseases.html"
pdf_path = "tests/data/pdf_extended"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 1
assert docs[0].sentences[0].top is not None