Feature/mx 1604 wikidata search endpoint (#91)

# Added - `/wikidata` endpoint to fetch all matching organizations from wikidata
robert-koch-institut · Jul 26, 2024 · b5a2111 · b5a2111
1 parent fccd5ca
commit b5a2111
Show file tree

Hide file tree

Showing 9 changed files with 458 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- `/wikidata` endpoint to fetch all matching organizations from wikidata
 - add support for computed fields in graph queries
 
 ### Changes

diff --git a/mex/backend/auxiliary/__init__.py b/mex/backend/auxiliary/__init__.py
diff --git a/mex/backend/auxiliary/models.py b/mex/backend/auxiliary/models.py
@@ -0,0 +1,14 @@
+from typing import Generic, TypeVar
+
+from pydantic import BaseModel
+
+T = TypeVar("T")  # item type carried in PagedResponseSchema.results
+
+
+class PagedResponseSchema(BaseModel, Generic[T]):
+    """Generic response envelope for any paged API, parameterized by item type T."""
+
+    total: int  # total count reported alongside the page
+    offset: int  # offset of this page
+    limit: int  # page size limit of this page
+    results: list[T]  # items of this page
diff --git a/mex/backend/auxiliary/wikidata.py b/mex/backend/auxiliary/wikidata.py
@@ -0,0 +1,74 @@
+from functools import cache
+from typing import Annotated
+
+from fastapi import APIRouter, Query
+
+from mex.backend.auxiliary.models import PagedResponseSchema
+from mex.common.models import ExtractedOrganization, ExtractedPrimarySource
+from mex.common.primary_source.extract import extract_seed_primary_sources
+from mex.common.primary_source.transform import (
+    get_primary_sources_by_name,
+    transform_seed_primary_sources_to_extracted_primary_sources,
+)
+from mex.common.types import TextLanguage
+from mex.common.wikidata.extract import (
+    get_count_of_found_organizations_by_label,
+    search_organizations_by_label,
+)
+from mex.common.wikidata.transform import (
+    transform_wikidata_organizations_to_extracted_organizations,
+)
+
+router = APIRouter()  # mounted via include_router() in mex/backend/main.py
+
+
+@router.get("/wikidata", status_code=200, tags=["wikidata"])
+def search_organization_in_wikidata(
+    q: Annotated[str, Query(min_length=1, max_length=1000)],
+    offset: Annotated[int, Query(ge=0, le=10e10)] = 0,
+    limit: Annotated[int, Query(ge=1, le=100)] = 10,
+    lang: TextLanguage = TextLanguage.EN,
+) -> PagedResponseSchema[ExtractedOrganization]:
+    """Search an organization in wikidata.
+
+    Args:
+        q: label of the organization to be searched
+        offset: start page number
+        limit: end page number
+        lang: language of the label. Example: en, de
+
+    Returns:
+        Paginated list of ExtractedOrganization
+    """
+    total_orgs = get_count_of_found_organizations_by_label(q, lang)
+    organizations = search_organizations_by_label(q, offset, limit, lang)
+
+    extracted_organizations = list(
+        transform_wikidata_organizations_to_extracted_organizations(
+            organizations, extracted_primary_source_wikidata()
+        )
+    )
+
+    return PagedResponseSchema(
+        total=total_orgs,
+        offset=offset,
+        limit=limit,
+        results=[organization for organization in extracted_organizations],
+    )
+
+
+@cache
+def extracted_primary_source_wikidata() -> ExtractedPrimarySource:
+    """Load and return wikidata primary source."""
+    seed_primary_sources = extract_seed_primary_sources()
+    extracted_primary_sources = list(
+        transform_seed_primary_sources_to_extracted_primary_sources(
+            seed_primary_sources
+        )
+    )
+    (extracted_primary_source_wikidata,) = get_primary_sources_by_name(
+        extracted_primary_sources,
+        "wikidata",
+    )
+
+    return extracted_primary_source_wikidata
diff --git a/mex/backend/main.py b/mex/backend/main.py
@@ -9,6 +9,7 @@
 from fastapi.openapi.utils import get_openapi
 from pydantic import BaseModel
 
+from mex.backend.auxiliary.wikidata import router as wikidata_router
 from mex.backend.exceptions import handle_uncaught_exception
 from mex.backend.extracted.main import router as extracted_router
 from mex.backend.identity.main import router as identity_router
@@ -88,6 +89,7 @@ async def lifespan(_: FastAPI) -> AsyncIterator[None]:
 router.include_router(ingest_router, dependencies=[Depends(has_write_access)])
 router.include_router(merged_router, dependencies=[Depends(has_read_access)])
 router.include_router(rules_router, dependencies=[Depends(has_write_access)])
+router.include_router(wikidata_router, dependencies=[Depends(has_read_access)])
 
 
 class SystemStatus(BaseModel):

diff --git a/tests/auxiliary/__init__.py b/tests/auxiliary/__init__.py
diff --git a/tests/auxiliary/conftest.py b/tests/auxiliary/conftest.py
@@ -0,0 +1,91 @@
+import json
+from pathlib import Path
+from typing import Any
+from unittest.mock import MagicMock, Mock
+
+import pytest
+import requests
+from pytest import MonkeyPatch
+from requests import Response
+
+from mex.common.wikidata.connector import (
+    WikidataAPIConnector,
+    WikidataQueryServiceConnector,
+)
+from mex.common.wikidata.models.organization import WikidataOrganization
+
+TEST_DATA_DIR = Path(__file__).parent / "test_data"  # test_data/ beside this file
+
+
+@pytest.fixture
+def wikidata_organization_raw() -> dict[str, Any]:
+    """Return a raw wikidata organization."""
+    with open(TEST_DATA_DIR / "wikidata_organization_raw.json") as fh:
+        return json.load(fh)
+
+
+@pytest.fixture
+def wikidata_organization(
+    wikidata_organization_raw: dict[str, Any],
+) -> WikidataOrganization:
+    """Validate the raw test organization into a WikidataOrganization model."""
+    return WikidataOrganization.model_validate(wikidata_organization_raw)
+
+
+@pytest.fixture
+def mocked_wikidata(
+    monkeypatch: MonkeyPatch, wikidata_organization_raw: dict[str, Any]
+) -> None:
+    """Patch both wikidata connectors with a mocked session and canned results."""
+    response_query = Mock(spec=Response, status_code=200)
+    # the list side_effect serves this response for a single session.get() call
+    session = MagicMock(spec=requests.Session)
+    session.get = MagicMock(side_effect=[response_query])
+    # replacement __init__ installed on both connector classes below
+    def mocked_init(self: WikidataQueryServiceConnector) -> None:
+        self.session = session
+
+    monkeypatch.setattr(WikidataQueryServiceConnector, "__init__", mocked_init)
+    monkeypatch.setattr(WikidataAPIConnector, "__init__", mocked_init)
+
+    # stub WikidataQueryServiceConnector.get_data_by_query with one canned row
+
+    def get_data_by_query(
+        self: WikidataQueryServiceConnector, query: str
+    ) -> list[dict[str, dict[str, str]]]:
+        return [
+            {
+                "item": {
+                    "type": "uri",
+                    "value": "http://www.wikidata.org/entity/Q26678",
+                },
+                "itemLabel": {"xml:lang": "en", "type": "literal", "value": "BMW"},
+                "itemDescription": {
+                    "xml:lang": "en",
+                    "type": "literal",
+                    "value": "German automotive manufacturer, and conglomerate",
+                },
+                "count": {
+                    "datatype": "http://www.w3.org/2001/XMLSchema#integer",
+                    "type": "literal",
+                    "value": "3",
+                },
+            },
+        ]
+
+    monkeypatch.setattr(
+        WikidataQueryServiceConnector, "get_data_by_query", get_data_by_query
+    )
+
+    # stub WikidataAPIConnector.get_wikidata_item_details_by_id with the raw fixture
+
+    def get_wikidata_item_details_by_id(
+        self: WikidataAPIConnector, item_id: str
+    ) -> dict[str, Any]:
+        return wikidata_organization_raw
+
+    monkeypatch.setattr(
+        WikidataAPIConnector,
+        "get_wikidata_item_details_by_id",
+        get_wikidata_item_details_by_id,
+    )