Adding support for self-referencing (#59) (#60)
* refactor: change default logging level

* feat: tested version of self_references

* chore

* feat: Support and test asynchronous calls

* chore
jannisborn authored Nov 26, 2024
1 parent 0afbada commit c546e80
Showing 15 changed files with 255 additions and 14 deletions.
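The headline feature is a new paperscraper.citations subpackage. A minimal usage sketch (the DOIs are the ones used in the new test suite; output values depend on the live Semantic Scholar API):

from paperscraper.citations import self_references

# Single DOI: returns raw self-citation counts per author of that paper
result = self_references("10.1038/s43586-024-00334-2")

# Several DOIs: the underlying requests run concurrently; relative=True
# reports self-citations as percentages of each paper's reference count
results = self_references(
    ["10.1038/s41586-023-06600-9", "10.1016/j.neunet.2014.09.003"],
    relative=True,
)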
2 changes: 1 addition & 1 deletion paperscraper/__init__.py
@@ -11,7 +11,7 @@
from .load_dumps import QUERY_FN_DICT
from .utils import get_filename_from_query

-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
logger = logging.getLogger(__name__)

# Set urllib logging depth
2 changes: 2 additions & 0 deletions paperscraper/citations/__init__.py
@@ -0,0 +1,2 @@
from ..scholar import get_citations_from_title
from .core import self_references, self_references_paper
119 changes: 119 additions & 0 deletions paperscraper/citations/core.py
@@ -0,0 +1,119 @@
import asyncio
import logging
import re
import sys
from typing import Dict, Iterable, Union

import httpx

from ..utils import optional_async
from .utils import check_overlap, doi_pattern

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)
logging.getLogger("httpx").setLevel(logging.WARNING)


@optional_async
async def self_references(
    inputs: Union[str, Iterable[str]],
    relative: bool = False,
    verbose: bool = False,
) -> Dict[str, Dict[str, Union[float, int]]]:
    """
    Analyze self-references for a DOI or a list of DOIs.

    Args:
        inputs: A single DOI or an iterable of DOIs.
        relative: If True, returns self-citations as percentages; otherwise, as
            raw counts. Defaults to False.
        verbose: Whether to log detailed information. Defaults to False.

    Returns:
        A dictionary where the keys are DOIs and the values are dictionaries
        mapping authors to their self-citations.

    Raises:
        NotImplementedError: If the input does not match a DOI format.
    """
    if isinstance(inputs, str):
        inputs = [inputs]

    results: Dict[str, Dict[str, Union[float, int]]] = {}

    tasks = []

    for sample in inputs:
        dois = re.findall(doi_pattern, sample, re.IGNORECASE)
        if len(dois) == 1:
            # This is a DOI
            tasks.append(
                (
                    sample,
                    self_references_paper(dois[0], verbose=verbose, relative=relative),
                )
            )
        elif len(dois) == 0:
            # TODO: Check that it is a proper name or an ORCID ID
            raise NotImplementedError(
                "Analyzing self-references of whole authors is not yet implemented."
            )

    completed_tasks = await asyncio.gather(*[task[1] for task in tasks])
    for sample, task_result in zip(tasks, completed_tasks):
        results[sample[0]] = task_result

    return results


@optional_async
async def self_references_paper(
    doi: str,
    relative: bool = False,
    verbose: bool = False,
) -> Dict[str, Union[float, int]]:
    """
    Analyze self-references for a single DOI.

    Args:
        doi: The DOI to analyze.
        relative: If True, returns self-citations as percentages; otherwise, as
            raw counts. Defaults to False.
        verbose: Whether to log detailed information. Defaults to False.

    Returns:
        A dictionary mapping authors to their self-citations.

    Raises:
        ValueError: If no references are found for the given DOI.
    """
    async with httpx.AsyncClient() as client:
        response = await client.get(
            f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}",
            params={"fields": "title,authors,references.authors"},
        )
        response.raise_for_status()
        paper = response.json()

    if not paper["references"]:
        raise ValueError("Could not find citations from Semantic Scholar")

    authors: Dict[str, int] = {a["name"]: 0 for a in paper["authors"]}

    for ref in paper["references"]:
        ref_authors = {a["name"] for a in ref["authors"]}
        for author in authors:
            if any(check_overlap(author, ra) for ra in ref_authors):
                authors[author] += 1
    total = len(paper["references"])

    if verbose:
        logger.info(f"Self references in \"{paper['title']}\"")
        logger.info(f"  N = {len(paper['references'])}")
        for author, self_cites in authors.items():
            logger.info(f"  {author}: {100 * (self_cites / total):.2f}% self-references")

    if relative:
        for author, self_cites in authors.items():
            authors[author] = round(100 * self_cites / total, 2)

    return authors
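As a concrete example of the relative flag: an author with 5 self-citations among N = 50 references is reported as 5 by default, and as round(100 * 5 / 50, 2) = 10.0 when relative=True.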
Empty file.
84 changes: 84 additions & 0 deletions paperscraper/citations/tests/test_self_references.py
@@ -0,0 +1,84 @@
import asyncio
import logging
import time

import pytest

from paperscraper.citations import self_references

logging.disable(logging.INFO)


class TestSelfReferences:
    @pytest.fixture
    def dois(self):
        return [
            "10.1038/s43586-024-00334-2",
            "10.1038/s41586-023-06600-9",
            "10.1016/j.neunet.2014.09.003",
        ]

    def test_single_doi(self, dois):
        for relative in [True, False]:
            result = self_references(dois[0], relative=relative)
            assert isinstance(result, dict)
            assert len(result) > 0
            for doi, self_cite_dict in result.items():
                assert isinstance(doi, str)
                assert isinstance(self_cite_dict, dict)
                for author, self_cites in self_cite_dict.items():
                    assert isinstance(author, str)
                    if relative:
                        assert isinstance(self_cites, float)
                        assert 0 <= self_cites <= 100
                    else:
                        assert isinstance(self_cites, int)
                        assert self_cites >= 0

    def test_multiple_dois(self, dois):
        for relative in [True, False]:
            result = self_references(dois[1:], relative=relative)
            assert isinstance(result, dict)
            assert len(result) == len(dois[1:])
            for doi, self_cite_dict in result.items():
                assert isinstance(doi, str)
                assert isinstance(self_cite_dict, dict)
                for author, self_cites in self_cite_dict.items():
                    assert isinstance(author, str)
                    if relative:
                        assert isinstance(self_cites, float)
                        assert 0 <= self_cites <= 100
                    else:
                        assert isinstance(self_cites, int)
                        assert self_cites >= 0

    def test_not_implemented_error(self):
        with pytest.raises(NotImplementedError):
            self_references("John Jumper")

    def test_compare_async_and_sync_performance(self, dois):
        """
        Compares the execution time of asynchronous and synchronous `self_references`
        for a list of DOIs.
        """
        start_time = time.perf_counter()
        self_references(dois)
        async_duration = time.perf_counter() - start_time

        # Measure synchronous execution time (three independent calls)
        start_time = time.perf_counter()
        for doi in dois:
            self_references(doi)
        sync_duration = time.perf_counter() - start_time

        print(f"Asynchronous execution time (batch): {async_duration:.2f} seconds")
        print(
            f"Synchronous execution time (independent calls): {sync_duration:.2f} seconds"
        )

        # Assert that async execution (batch) is faster or at least not slower
        assert async_duration <= sync_duration, (
            f"Async execution ({async_duration:.2f}s) is slower than sync execution "
            f"({sync_duration:.2f}s)"
        )
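Assuming pytest is installed, the new suite can be run in isolation with:

pytest paperscraper/citations/tests/test_self_references.py

Note that these tests query the live Semantic Scholar API, so they need network access, and their timing (and hence the performance assertion) can vary between runs.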
23 changes: 23 additions & 0 deletions paperscraper/citations/utils.py
@@ -0,0 +1,23 @@
from typing import List

import httpx

doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"


def check_overlap(n1: str, n2: str) -> bool:
    """
    Check whether two author names likely refer to the same person.
    TODO: This can be made more robust

    Args:
        n1: first name.
        n2: second name.

    Returns:
        bool: True if every token of the second name also occurs in the first.
    """
    # remove initials, then check that all name tokens of n2 are contained in n1
    s1 = {w for w in n1.lower().replace(".", "").split() if len(w) > 1}
    s2 = {w for w in n2.lower().replace(".", "").split() if len(w) > 1}
    return len(s1 | s2) == len(s1)
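A quick illustration of the matching rule (a sketch; note the check is asymmetric, which is why core.py passes the paper author as n1 and the reference author as n2):

from paperscraper.citations.utils import check_overlap

check_overlap("John Jumper", "J. Jumper")  # True: {"jumper"} is contained in {"john", "jumper"}
check_overlap("J. Jumper", "John Jumper")  # False: "john" has no counterpart in {"jumper"}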
2 changes: 1 addition & 1 deletion paperscraper/get_dumps/chemrxiv.py
@@ -10,7 +10,7 @@

from .utils.chemrxiv import ChemrxivAPI, download_full, parse_dump

-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

today = datetime.today().strftime("%Y-%m-%d")
2 changes: 1 addition & 1 deletion paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py
@@ -7,7 +7,7 @@

import requests

-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

now_datetime = datetime.now()
4 changes: 1 addition & 3 deletions paperscraper/get_dumps/utils/chemrxiv/utils.py
@@ -13,7 +13,7 @@

from .chemrxiv_api import ChemrxivAPI

-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

today = datetime.today().strftime("%Y-%m-%d")
@@ -90,7 +90,6 @@ def parse_dump(source_path: str, target_path: str) -> None:
    dump = []
    # Read source dump
    for file_name in tqdm(os.listdir(source_path)):
-
        if not file_name.endswith(".json"):
            continue
        filepath = os.path.join(source_path, file_name)
@@ -131,7 +130,6 @@ def download_full(save_dir: str, api: Optional[ChemrxivAPI] = None) -> None:

    os.makedirs(save_dir, exist_ok=True)
    for preprint in tqdm(api.all_preprints()):
-
        path = os.path.join(save_dir, f"{preprint['item']['id']}.json")
        if os.path.exists(path):
            continue
2 changes: 1 addition & 1 deletion paperscraper/load_dumps.py
@@ -9,7 +9,7 @@
from .pubmed import get_and_dump_pubmed_papers
from .xrxiv.xrxiv_query import XRXivQuery

-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up the query dictionary
2 changes: 1 addition & 1 deletion paperscraper/pdf.py
@@ -14,7 +14,7 @@

from .utils import load_jsonl

-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

ABSTRACT_ATTRIBUTE = {
3 changes: 1 addition & 2 deletions paperscraper/postprocessing.py
@@ -5,7 +5,7 @@
import numpy as np
import pandas as pd

-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)


@@ -75,7 +75,6 @@ def aggregate_paper(
    # At least one synonym per keyword needs to be in either title or
    # abstract.
    if filtering and filter_keys != list():
-
        # Filter out papers with undesired terms
        unwanted = False
        for unwanted_key in unwanted_keys:
3 changes: 1 addition & 2 deletions paperscraper/scholar/scholar.py
@@ -7,7 +7,7 @@

from ..utils import dump_papers

-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)


@@ -49,7 +49,6 @@ def get_scholar_papers(

    processed = []
    for paper in matches:
-
        # Extracts title, author, year, journal, abstract
        entry = {
            scholar_field_mapper.get(key, key): process_fields.get(
19 changes: 18 additions & 1 deletion paperscraper/utils.py
@@ -1,11 +1,13 @@
import asyncio
import json
import logging
import sys
from functools import wraps
from typing import Dict, List

import pandas as pd

-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)


@@ -68,3 +70,18 @@ def load_jsonl(filepath: str) -> List[Dict[str, str]]:
    with open(filepath, "r") as f:
        data = [json.loads(line) for line in f.readlines()]
    return data


def optional_async(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Check whether an event loop is already running
        try:
            loop = asyncio.get_running_loop()
            # Inside an async context: return the coroutine for the caller to await
            return func(*args, **kwargs)
        except RuntimeError:
            # No running loop: execute the coroutine synchronously via asyncio.run
            return asyncio.run(func(*args, **kwargs))

    return wrapper
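A quick usage sketch of the decorator (the coroutine below is hypothetical, purely for illustration):

import asyncio

@optional_async
async def fetch_answer() -> int:
    await asyncio.sleep(0)  # stand-in for real async work
    return 42

print(fetch_answer())  # no running loop: executed via asyncio.run, prints 42

Inside an already-running event loop, the same call returns a coroutine that must be awaited.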
2 changes: 1 addition & 1 deletion paperscraper/xrxiv/xrxiv_query.py
@@ -6,7 +6,7 @@

import pandas as pd

-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)


