Fix pdf_path not working without a trailing slash #459

Merged · 6 commits · Jun 18, 2020
2 changes: 2 additions & 0 deletions .github/workflows/pythonpackage.yml
@@ -56,6 +56,7 @@ jobs:
createdb e2e_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb inc_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb meta_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb parser_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb pg_test -p 5432 -h 127.0.0.1 -w -U postgres
cd tests && ./download_data.sh && cd ..
python -m spacy download en
@@ -122,6 +123,7 @@ jobs:
createdb e2e_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb inc_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb meta_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb parser_test -p 5432 -h 127.0.0.1 -w -U postgres
createdb pg_test -p 5432 -h 127.0.0.1 -w -U postgres
cd tests && ./download_data.sh && cd ..
- name: Print Version Info
3 changes: 3 additions & 0 deletions CHANGELOG.rst
@@ -32,6 +32,9 @@ Deprecated

Fixed
^^^^^
* `@senwu`_: Fix pdf_path not working without a trailing slash.
(`#442 <https://github.com/HazyResearch/fonduer/issues/442>`_)
(`#457 <https://github.com/HazyResearch/fonduer/pull/457>`_)
* `@kaikun213`_: Fix bug in table range difference calculations.
(`#420 <https://github.com/HazyResearch/fonduer/pull/420>`_)
* `@HiromuHota`_: mention_extractor.apply with clear=True now works even if it's not the first run.
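For context, a minimal sketch (not part of the diff) of why the trailing slash mattered: the old `link()` built the PDF file name by plain string concatenation, which silently produces a wrong path when the directory is given without a trailing slash, while `os.path.join`, as used in this PR, handles both forms. The paths below come from the test data used in this PR.

```python
import os

document_name = "md"

for pdf_path in ("tests/data/pdf_simple/", "tests/data/pdf_simple"):
    # Old behaviour (see the visual_linker.py hunk below): concatenation.
    concatenated = pdf_path + document_name + ".pdf"
    # New behaviour: os.path.join is indifferent to the trailing slash.
    joined = os.path.join(pdf_path, document_name + ".pdf")
    print(concatenated)  # .../pdf_simple/md.pdf, then .../pdf_simplemd.pdf
    print(joined)        # .../pdf_simple/md.pdf in both cases
```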
1 change: 1 addition & 0 deletions docs/dev/tests.rst
@@ -9,6 +9,7 @@ In order to run the tests, you will need to create the local databases used
by the tests::

$ createdb meta_test
$ createdb parser_test
$ createdb e2e_test
$ createdb inc_test
$ createdb pg_test
2 changes: 1 addition & 1 deletion src/fonduer/parser/parser.py
@@ -262,7 +262,7 @@ def apply( # type: ignore
if self.visual:
# Use the provided pdf_path if present
self.pdf_path = pdf_path if pdf_path else self.pdf_path
if not self.vizlink.is_linkable(document.name):
if not self.vizlink.is_linkable(document.name, self.pdf_path):
warnings.warn(
(
f"Visual parse failed. "
46 changes: 27 additions & 19 deletions src/fonduer/parser/visual_linker.py
@@ -70,23 +70,20 @@ def __init__(
)

def link(
self, document_name: str, sentences: List[Sentence], pdf_path: str
self, document_name: str, sentences: List[Sentence], pdf_path: str = None
) -> Iterator[Sentence]:
"""Link visual information with sentences.

:param document_name: the document name.
:param sentences: sentences to be linked with visual information.
:param pdf_path: The path to the PDF documents, if any. This path will
override the one used in initialization, if provided.
:param pdf_path: The path to the PDF documents, if any, defaults to None.
This path will override the one used in initialization, if provided.
:return: A generator of ``Sentence``.
"""
# sentences should be sorted as their order is not deterministic.
self.sentences = sorted(sentences, key=attrgetter("position"))
self.pdf_file = (
pdf_path if os.path.isfile(pdf_path) else pdf_path + document_name + ".pdf"
)
if not os.path.isfile(self.pdf_file):
self.pdf_file = self.pdf_file[:-3] + "PDF"
self.pdf_path = pdf_path if pdf_path is not None else self.pdf_path
self.pdf_file = self._get_linked_pdf_path(document_name, self.pdf_path)
try:
self._extract_pdf_words()
except RuntimeError as e:
@@ -135,25 +132,36 @@ def _extract_pdf_words(self) -> None:
if self.verbose:
self.logger.info(f"Extracted {len(self.pdf_word_list)} pdf words")

def is_linkable(self, filename: str) -> bool:
"""Verify that the file exists and has a PDF extension.
def _get_linked_pdf_path(self, filename: str, pdf_path: str = None) -> str:
"""Get the linked pdf file path, return None if it doesn't exist.

:param filename: The path to the PDF document.
:param filename: The name of the PDF document.
:param pdf_path: The path to the PDF documents, if any, defaults to None.
"""
path = self.pdf_path
path = pdf_path if pdf_path is not None else self.pdf_path
# If path is itself a file with a PDF extension, use it directly.
if os.path.isfile(path) and path.lower().endswith(".pdf"):
return True
return path
else:
full_path = os.path.join(path, filename)
if os.path.isfile(full_path) and full_path.lower().endswith(".pdf"):
return True
elif os.path.isfile(os.path.join(path, filename + ".pdf")):
return True
elif os.path.isfile(os.path.join(path, filename + ".PDF")):
return True
return full_path
full_path = os.path.join(path, filename + ".pdf")
if os.path.isfile(full_path):
return full_path
full_path = os.path.join(path, filename + ".PDF")
if os.path.isfile(full_path):
return full_path

return None

return False
def is_linkable(self, filename: str, pdf_path: str = None) -> bool:
"""Verify that the file exists and has a PDF extension.

:param filename: The path to the PDF document.
:param pdf_path: The path to the PDF documents, if any, defaults to None.
"""
return self._get_linked_pdf_path(filename, pdf_path) is not None

def _coordinates_from_HTML(
self, page: Tag, page_num: int
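Because the hunk above interleaves the removed is_linkable body with the new code, here is a standalone sketch of the lookup order the new _get_linked_pdf_path implements (a reading aid under the same assumptions as the diff, not the committed code verbatim): pdf_path may itself point at a PDF file, or at a directory, with or without a trailing slash, containing the document name, the document name plus ".pdf", or plus ".PDF".

```python
import os
from typing import Optional

def resolve_pdf(pdf_path: str, document_name: str) -> Optional[str]:
    """Return the path of the matching PDF, or None if nothing is found."""
    # Case 1: pdf_path is already a file with a PDF extension.
    if os.path.isfile(pdf_path) and pdf_path.lower().endswith(".pdf"):
        return pdf_path
    # Case 2: pdf_path is a directory; try the document name with and without
    # an explicit extension. os.path.join makes the trailing slash irrelevant.
    for candidate in (document_name, document_name + ".pdf", document_name + ".PDF"):
        full_path = os.path.join(pdf_path, candidate)
        if os.path.isfile(full_path) and full_path.lower().endswith(".pdf"):
            return full_path
    return None
```

The test added below, test_various_file_path_formats, exercises each of these combinations with and without trailing slashes.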
125 changes: 125 additions & 0 deletions tests/parser/test_parser.py
@@ -4,6 +4,8 @@

import pytest

import fonduer
from fonduer.parser import Parser
from fonduer.parser.lingual_parser import SpacyParser
from fonduer.parser.models import Document
from fonduer.parser.parser import ParserUDF, SimpleParser
@@ -14,6 +16,15 @@
TSVDocPreprocessor,
)

DB = "parser_test"
if "CI" in os.environ:
CONN_STRING = (
f"postgresql://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}"
+ f"@{os.environ['POSTGRES_HOST']}:{os.environ['POSTGRES_PORT']}/{DB}"
)
else:
CONN_STRING = f"postgresql://127.0.0.1:5432/{DB}"


def get_parser_udf(
structural=True, # structural information
@@ -812,3 +823,117 @@ def test_parser_no_image():

# Check that doc has no figures
assert len(doc.figures) == 0


def test_various_file_path_formats():
"""Test the parser with various file path formats."""
session = fonduer.Meta.init(CONN_STRING).Session()

def parse(docs_path, pdf_path):
# Preprocessor for the Docs
doc_preprocessor = HTMLDocPreprocessor(docs_path)

# Create a Parser and parse the documents
corpus_parser = Parser(
session,
parallelism=1,
structural=True,
lingual=True,
visual=True,
pdf_path=pdf_path,
)

corpus_parser.clear()
corpus_parser.apply(doc_preprocessor)
return corpus_parser

# Test that doc_path is a file path, and pdf_path is a file path
docs_path = "tests/data/html_simple/md.html"
pdf_path = "tests/data/pdf_simple/md.pdf"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 1
assert docs[0].sentences[0].top is not None

# Test that doc_path is a directory, and pdf_path is a directory
docs_path = "tests/data/html_simple/"
pdf_path = "tests/data/pdf_simple/"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 6
for doc in docs:
# table_span.pdf and no_image_unsorted.pdf do not exist, so no
# coordinate info available
if doc.name in ["table_span", "no_image_unsorted"]:
assert doc.sentences[0].top is None
else:
assert doc.sentences[0].top is not None

# Test that doc_path is a directory (no trailing slash), and pdf_path is a
# directory
docs_path = "tests/data/html_simple"
pdf_path = "tests/data/pdf_simple/"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 6
for doc in docs:
# table_span.pdf and no_image_unsorted.pdf do not exist, so no
# coordinate info available
if doc.name in ["table_span", "no_image_unsorted"]:
assert doc.sentences[0].top is None
else:
assert doc.sentences[0].top is not None

# Test that doc_path is a directory, and pdf_path is a directory (no trailing
# slash)
docs_path = "tests/data/html_simple/"
pdf_path = "tests/data/pdf_simple"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 6
for doc in docs:
# table_span.pdf and no_image_unsorted.pdf do not exist, so no
# coordinate info available
if doc.name in ["table_span", "no_image_unsorted"]:
assert doc.sentences[0].top is None
else:
assert doc.sentences[0].top is not None

# Test that doc_path is a directory (no trailing slash), and pdf_path is a
# directory (no trailing slash)
docs_path = "tests/data/html_simple"
pdf_path = "tests/data/pdf_simple"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 6
for doc in docs:
# table_span.pdf and no_image_unsorted.pdf do not exist, so no
# coordinate info available
if doc.name in ["table_span", "no_image_unsorted"]:
assert doc.sentences[0].top is None
else:
assert doc.sentences[0].top is not None

# Test that doc_path is a file path, and pdf_path is a directory
docs_path = "tests/data/html_extended/ext_diseases.html"
pdf_path = "tests/data/pdf_extended/"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 1
assert docs[0].sentences[0].top is not None

# Test that doc_path is a file path, and pdf_path is a directory (no trailing
# slash)
docs_path = "tests/data/html_extended/ext_diseases.html"
pdf_path = "tests/data/pdf_extended"

corpus_parser = parse(docs_path, pdf_path)
docs = corpus_parser.get_documents()
assert len(docs) == 1
assert docs[0].sentences[0].top is not None