save_pdf can now also save paper metadata in json format (#57)
* chore: version and deps bump

* feat: Save json with paper metadata

* test: remove mock of PDF
jannisborn authored Nov 13, 2024
1 parent 3f4f8e2 commit 0afbada
Showing 5 changed files with 142 additions and 45 deletions.
2 changes: 1 addition & 1 deletion paperscraper/__init__.py
@@ -1,7 +1,7 @@
 """Initialize the module."""
 
 __name__ = "paperscraper"
-__version__ = "0.2.14"
+__version__ = "0.2.15"
 
 import logging
 import os
113 changes: 88 additions & 25 deletions paperscraper/pdf.py
@@ -1,12 +1,14 @@
 """Functionalities to scrape PDF files of publications."""
 
+import json
 import logging
 import os
 import sys
 from pathlib import Path
 from typing import Any, Dict
 
 import requests
+import tldextract
 from bs4 import BeautifulSoup
 from tqdm import tqdm
 
@@ -15,55 +17,111 @@
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
+ABSTRACT_ATTRIBUTE = {
+    "biorxiv": ["DC.Description"],
+    "arxiv": ["citation_abstract"],
+    "chemrxiv": ["citation_abstract"],
+}
+DEFAULT_ATTRIBUTES = ["citation_abstract", "description"]
+
 
-def save_pdf(paper_metadata: Dict[str, Any], filepath: str) -> None:
+def save_pdf(
+    paper_metadata: Dict[str, Any], filepath: str, save_metadata: bool = False
+) -> None:
     """
     Save a PDF file of a paper.
 
     Args:
-        paper_metadata (Dict[str, Any]): A dictionary with the paper metadata. Must
+        paper_metadata: A dictionary with the paper metadata. Must
             contain the `doi` key.
-        filepath (str): Path to the file to be saved.
+        filepath: Path to the PDF file to be saved (with or without suffix).
+        save_metadata: A boolean indicating whether to save paper metadata as a separate json.
     """
     if not isinstance(paper_metadata, Dict):
         raise TypeError(f"paper_metadata must be a dict, not {type(paper_metadata)}.")
     if "doi" not in paper_metadata.keys():
         raise KeyError("paper_metadata must contain the key 'doi'.")
     if not isinstance(filepath, str):
         raise TypeError(f"filepath must be a string, not {type(filepath)}.")
-    if not filepath.endswith(".pdf"):
-        raise ValueError("Please provide a filepath with .pdf extension.")
-    if not Path(filepath).parent.exists():
-        raise ValueError(f"The folder: {Path(filepath).parent} seems to not exist.")
 
+    output_path = Path(filepath)
+
+    if not Path(output_path).parent.exists():
+        raise ValueError(f"The folder: {output_path.parent} seems to not exist.")
 
     url = f"https://doi.org/{paper_metadata['doi']}"
     try:
         response = requests.get(url, timeout=60)
-    except Exception:
-        logger.warning(f"Could not download {url}.")
+        response.raise_for_status()
+    except Exception as e:
+        logger.error(f"Could neither download paper nor metadata from {url}: {e}")
         return
 
     soup = BeautifulSoup(response.text, features="lxml")
 
-    metas = soup.find("meta", {"name": "citation_pdf_url"})
-    if metas is None:
-        logger.warning(
-            f"Could not find PDF for: {url} (either there's a paywall or the host "
-            "blocks PDF scraping)."
-        )
-        return
-    pdf_url = metas.attrs.get("content")
-
-    try:
-        response = requests.get(pdf_url, timeout=60)
-    except Exception:
-        logger.warning(f"Could not download {pdf_url}.")
-        return
-    with open(filepath, "wb+") as f:
-        f.write(response.content)
+    meta_pdf = soup.find("meta", {"name": "citation_pdf_url"})
+    if meta_pdf and meta_pdf.get("content"):
+        pdf_url = meta_pdf.get("content")
+        try:
+            response = requests.get(pdf_url, timeout=60)
+            response.raise_for_status()
+
+            if response.content[:4] != b"%PDF":
+                logger.warning(
+                    f"The file from {url} does not appear to be a valid PDF."
+                )
+            else:
+                with open(output_path.with_suffix(".pdf"), "wb+") as f:
+                    f.write(response.content)
+        except Exception as e:
+            logger.warning(f"Could not download {pdf_url}: {e}")
+
+    if not save_metadata:
+        return
+
+    metadata = {}
+
+    # Extract title
+    title_tag = soup.find("meta", {"name": "citation_title"})
+    metadata["title"] = title_tag["content"] if title_tag else "Title not found"
+
+    # Extract authors
+    authors = []
+    for author_tag in soup.find_all("meta", {"name": "citation_author"}):
+        if author_tag.get("content"):
+            authors.append(author_tag["content"])
+    metadata["authors"] = authors if authors else ["Author information not found"]
+
+    # Extract abstract
+    domain = tldextract.extract(url).domain
+    abstract_keys = ABSTRACT_ATTRIBUTE.get(domain, DEFAULT_ATTRIBUTES)
+
+    for key in abstract_keys:
+        abstract_tag = soup.find("meta", {"name": key})
+        if abstract_tag:
+            raw_abstract = BeautifulSoup(
+                abstract_tag["content"], "html.parser"
+            ).get_text(separator="\n")
+            if raw_abstract.strip().startswith("Abstract"):
+                raw_abstract = raw_abstract.strip()[8:]
+            metadata["abstract"] = raw_abstract.strip()
+            break
+
+    if "abstract" not in metadata.keys():
+        metadata["abstract"] = "Abstract not found"
+        logger.warning(f"Could not find abstract for {url}")
+    elif metadata["abstract"].endswith("..."):
+        logger.warning(f"Abstract truncated from {url}")
+
+    # Save metadata to JSON
+    try:
+        with open(output_path.with_suffix(".json"), "w", encoding="utf-8") as f:
+            json.dump(metadata, f, ensure_ascii=False, indent=4)
+    except Exception as e:
+        logger.error(f"Failed to save metadata to {str(output_path)}: {e}")
 
 
-def save_pdf_from_dump(dump_path: str, pdf_path: str, key_to_save: str = "doi") -> None:
+def save_pdf_from_dump(
+    dump_path: str, pdf_path: str, key_to_save: str = "doi", save_metadata: bool = False
+) -> None:
     """
     Receives a path to a `.jsonl` dump with paper metadata and saves the PDF files of
     each paper.
@@ -73,6 +131,7 @@ def save_pdf_from_dump(dump_path: str, pdf_path: str, key_to_save: str = "doi")
         pdf_path: Path to a folder where the files will be stored.
         key_to_save: Key in the paper metadata to use as filename.
             Has to be `doi` or `title`. Defaults to `doi`.
+        save_metadata: A boolean indicating whether to save paper metadata as a separate json.
     """
 
     if not isinstance(dump_path, str):
@@ -98,4 +157,8 @@ def save_pdf_from_dump(dump_path: str, pdf_path: str, key_to_save: str = "doi")
             logger.warning(f"Skipping {paper['title']} since no DOI available.")
             continue
         filename = paper[key_to_save].replace("/", "_")
-        save_pdf(paper, os.path.join(pdf_path, f"{filename}.pdf"))
+        save_pdf(
+            paper,
+            os.path.join(pdf_path, f"{filename}.pdf"),
+            save_metadata=save_metadata,
+        )
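
For orientation, a minimal usage sketch of the updated API (the DOI comes from the test suite below; the dump path and output folder are illustrative placeholders):

```python
from paperscraper.pdf import save_pdf, save_pdf_from_dump

# Single paper: writes gt4sd.pdf and, because save_metadata=True, a gt4sd.json
# sidecar with title, authors, and abstract scraped from the DOI landing page.
save_pdf(
    {"doi": "10.48550/arXiv.2207.03928"},
    filepath="gt4sd.pdf",
    save_metadata=True,
)

# Batch mode: one PDF (plus JSON sidecar) per entry of a .jsonl dump,
# with files named by DOI. The target folder must already exist.
save_pdf_from_dump(
    "arxiv_dump.jsonl",  # placeholder path to a paperscraper .jsonl dump
    pdf_path="pdfs/",
    key_to_save="doi",
    save_metadata=True,
)
```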
68 changes: 50 additions & 18 deletions paperscraper/tests/test_pdf.py
@@ -5,7 +5,6 @@
 from unittest.mock import MagicMock, mock_open, patch
 
 import pytest
-
 from paperscraper.load_dumps import QUERY_FN_DICT
 from paperscraper.pdf import save_pdf, save_pdf_from_dump

@@ -24,9 +23,56 @@ def paper_data(self):
 
     def test_basic_search(self):
         paper_data = {"doi": "10.48550/arXiv.2207.03928"}
-        save_pdf(paper_data, filepath="gt4sd_paper.pdf")
-        assert os.path.exists("gt4sd_paper.pdf")
-        os.remove("gt4sd_paper.pdf")
+        save_pdf(paper_data, filepath="gt4sd.pdf", save_metadata=True)
+        assert os.path.exists("gt4sd.pdf")
+        assert os.path.exists("gt4sd.json")
+        os.remove("gt4sd.pdf")
+        os.remove("gt4sd.json")
+
+        # chemrxiv
+        paper_data = {"doi": "10.26434/chemrxiv-2021-np7xj-v4"}
+        save_pdf(paper_data, filepath="kinases.pdf", save_metadata=True)
+        assert os.path.exists("kinases.pdf")
+        assert os.path.exists("kinases.json")
+        os.remove("kinases.pdf")
+        os.remove("kinases.json")
+
+        # biorxiv
+        paper_data = {"doi": "10.1101/798496"}
+        save_pdf(paper_data, filepath="taskload.pdf", save_metadata=True)
+        assert os.path.exists("taskload.pdf")
+        assert os.path.exists("taskload.json")
+        os.remove("taskload.pdf")
+        os.remove("taskload.json")
+
+        # medrxiv
+        paper_data = {"doi": "10.1101/2020.09.02.20187096"}
+        save_pdf(paper_data, filepath="covid_review.pdf", save_metadata=True)
+        assert os.path.exists("covid_review.pdf")
+        assert os.path.exists("covid_review.json")
+        os.remove("covid_review.pdf")
+        os.remove("covid_review.json")
+
+        # journal with OA paper
+        paper_data = {"doi": "10.1038/s42256-023-00639-z"}
+        save_pdf(paper_data, filepath="regression_transformer", save_metadata=True)
+        assert os.path.exists("regression_transformer.pdf")
+        assert os.path.exists("regression_transformer.json")
+        os.remove("regression_transformer.pdf")
+        os.remove("regression_transformer.json")
+
+        # book chapter with paywall
+        paper_data = {"doi": "10.1007/978-981-97-4828-0_7"}
+        save_pdf(paper_data, filepath="clm_chapter", save_metadata=True)
+        assert not os.path.exists("clm_chapter.pdf")
+        assert os.path.exists("clm_chapter.json")
+        os.remove("clm_chapter.json")
+
+        # journal without OA paper
+        paper_data = {"doi": "10.1126/science.adk9587"}
+        save_pdf(paper_data, filepath="color", save_metadata=True)
+        assert not os.path.exists("color.pdf")
+        assert not os.path.exists("color.json")
 
     def test_missing_doi(self):
         with pytest.raises(KeyError):
@@ -81,20 +127,6 @@ def test_network_issues_on_pdf_url_request(self, mock_get, paper_data):
         save_pdf(paper_metadata=paper_data, filepath="output.pdf")
         assert not os.path.exists("output.pdf")
 
-    @patch("requests.get")
-    @patch("builtins.open", new_callable=mock_open)
-    def test_successful_pdf_download_and_save(self, mock_file, mock_get, paper_data):
-        response_doi = MagicMock()
-        response_doi.text = (
-            '<meta name="citation_pdf_url" content="http://valid.url/document.pdf">'
-        )
-        response_pdf = MagicMock()
-        response_pdf.content = b"PDF content"
-        mock_get.side_effect = [response_doi, response_pdf]
-        save_pdf(paper_metadata=paper_data, filepath="output.pdf")
-        mock_file.assert_called_once_with("output.pdf", "wb+")
-        mock_file().write.assert_called_once_with(b"PDF content")
-
     def test_save_pdf_from_dump_without_path(self):
         with pytest.raises(ValueError):
             save_pdf_from_dump(TEST_FILE_PATH, pdf_path=SAVE_PATH, key_to_save="doi")
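The mock-based download test is removed in favor of the live downloads in `test_basic_search` above. As a sketch of what the new JSON sidecar holds (keys per the `save_pdf` implementation; the filename assumes the first test case ran):

```python
import json

# Read the metadata written next to the PDF by save_pdf(..., save_metadata=True).
with open("gt4sd.json", encoding="utf-8") as f:
    meta = json.load(f)

print(meta["title"])     # from the citation_title meta tag
print(meta["authors"])   # list collected from citation_author meta tags
print(meta["abstract"])  # cleaned abstract (citation_abstract / DC.Description)
```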
3 changes: 2 additions & 1 deletion requirements.txt
@@ -10,4 +10,5 @@ matplotlib-venn>=0.11.5
 bs4>=0.0.1
 impact-factor>=1.1.1
 thefuzz>=0.20.0
-pytest
+pytest
+tldextract
1 change: 1 addition & 0 deletions setup.py
@@ -40,6 +40,7 @@
         "impact-factor>=1.1.1",
         "thefuzz",
         "pytest",
+        "tldextract",
     ],
     keywords=[
         "Academics",