From b5a2111d16a4d216a8bb2c2107645530f224b3e1 Mon Sep 17 00:00:00 2001 From: Kamran Ali <33874616+mr-kamran-ali@users.noreply.github.com> Date: Fri, 26 Jul 2024 16:13:32 +0200 Subject: [PATCH] Feature/mx 1604 wikidata search endpoint (#91) # Added - `/wikidata` endpoint to fetch all matching organizations from wikidata --- CHANGELOG.md | 1 + mex/backend/auxiliary/__init__.py | 0 mex/backend/auxiliary/models.py | 14 ++ mex/backend/auxiliary/wikidata.py | 74 ++++++ mex/backend/main.py | 2 + tests/auxiliary/__init__.py | 0 tests/auxiliary/conftest.py | 91 +++++++ .../test_data/wikidata_organization_raw.json | 228 ++++++++++++++++++ tests/auxiliary/test_wikidata.py | 48 ++++ 9 files changed, 458 insertions(+) create mode 100644 mex/backend/auxiliary/__init__.py create mode 100644 mex/backend/auxiliary/models.py create mode 100644 mex/backend/auxiliary/wikidata.py create mode 100644 tests/auxiliary/__init__.py create mode 100644 tests/auxiliary/conftest.py create mode 100644 tests/auxiliary/test_data/wikidata_organization_raw.json create mode 100644 tests/auxiliary/test_wikidata.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d4b4578..acfae9d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- `/wikidata` endpoint to fetch all matching organizations from wikidata - add support for computed fields in graph queries ### Changes diff --git a/mex/backend/auxiliary/__init__.py b/mex/backend/auxiliary/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/mex/backend/auxiliary/models.py b/mex/backend/auxiliary/models.py new file mode 100644 index 00000000..0327e1f1 --- /dev/null +++ b/mex/backend/auxiliary/models.py @@ -0,0 +1,14 @@ +from typing import Generic, TypeVar + +from pydantic import BaseModel + +T = TypeVar("T") + + +class PagedResponseSchema(BaseModel, Generic[T]): + """Response schema for any paged API.""" + + total: int + offset: int + limit: int + results: list[T] diff --git a/mex/backend/auxiliary/wikidata.py b/mex/backend/auxiliary/wikidata.py new file mode 100644 index 00000000..04857f60 --- /dev/null +++ b/mex/backend/auxiliary/wikidata.py @@ -0,0 +1,74 @@ +from functools import cache +from typing import Annotated + +from fastapi import APIRouter, Query + +from mex.backend.auxiliary.models import PagedResponseSchema +from mex.common.models import ExtractedOrganization, ExtractedPrimarySource +from mex.common.primary_source.extract import extract_seed_primary_sources +from mex.common.primary_source.transform import ( + get_primary_sources_by_name, + transform_seed_primary_sources_to_extracted_primary_sources, +) +from mex.common.types import TextLanguage +from mex.common.wikidata.extract import ( + get_count_of_found_organizations_by_label, + search_organizations_by_label, +) +from mex.common.wikidata.transform import ( + transform_wikidata_organizations_to_extracted_organizations, +) + +router = APIRouter() + + +@router.get("/wikidata", status_code=200, tags=["wikidata"]) +def search_organization_in_wikidata( + q: Annotated[str, Query(min_length=1, max_length=1000)], + offset: Annotated[int, Query(ge=0, le=10e10)] = 0, + limit: Annotated[int, Query(ge=1, le=100)] = 10, + lang: TextLanguage = TextLanguage.EN, +) -> PagedResponseSchema[ExtractedOrganization]: + """Search an organization in wikidata. + + Args: + q: label of the organization to be searched + offset: start page number + limit: end page number + lang: language of the label. Example: en, de + + Returns: + Paginated list of ExtractedOrganization + """ + total_orgs = get_count_of_found_organizations_by_label(q, lang) + organizations = search_organizations_by_label(q, offset, limit, lang) + + extracted_organizations = list( + transform_wikidata_organizations_to_extracted_organizations( + organizations, extracted_primary_source_wikidata() + ) + ) + + return PagedResponseSchema( + total=total_orgs, + offset=offset, + limit=limit, + results=[organization for organization in extracted_organizations], + ) + + +@cache +def extracted_primary_source_wikidata() -> ExtractedPrimarySource: + """Load and return wikidata primary source.""" + seed_primary_sources = extract_seed_primary_sources() + extracted_primary_sources = list( + transform_seed_primary_sources_to_extracted_primary_sources( + seed_primary_sources + ) + ) + (extracted_primary_source_wikidata,) = get_primary_sources_by_name( + extracted_primary_sources, + "wikidata", + ) + + return extracted_primary_source_wikidata diff --git a/mex/backend/main.py b/mex/backend/main.py index c0038b89..ee1df61e 100644 --- a/mex/backend/main.py +++ b/mex/backend/main.py @@ -9,6 +9,7 @@ from fastapi.openapi.utils import get_openapi from pydantic import BaseModel +from mex.backend.auxiliary.wikidata import router as wikidata_router from mex.backend.exceptions import handle_uncaught_exception from mex.backend.extracted.main import router as extracted_router from mex.backend.identity.main import router as identity_router @@ -88,6 +89,7 @@ async def lifespan(_: FastAPI) -> AsyncIterator[None]: router.include_router(ingest_router, dependencies=[Depends(has_write_access)]) router.include_router(merged_router, dependencies=[Depends(has_read_access)]) router.include_router(rules_router, dependencies=[Depends(has_write_access)]) +router.include_router(wikidata_router, dependencies=[Depends(has_read_access)]) class SystemStatus(BaseModel): diff --git a/tests/auxiliary/__init__.py b/tests/auxiliary/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/auxiliary/conftest.py b/tests/auxiliary/conftest.py new file mode 100644 index 00000000..043f3139 --- /dev/null +++ b/tests/auxiliary/conftest.py @@ -0,0 +1,91 @@ +import json +from pathlib import Path +from typing import Any +from unittest.mock import MagicMock, Mock + +import pytest +import requests +from pytest import MonkeyPatch +from requests import Response + +from mex.common.wikidata.connector import ( + WikidataAPIConnector, + WikidataQueryServiceConnector, +) +from mex.common.wikidata.models.organization import WikidataOrganization + +TEST_DATA_DIR = Path(__file__).parent / "test_data" + + +@pytest.fixture +def wikidata_organization_raw() -> dict[str, Any]: + """Return a raw wikidata organization.""" + with open(TEST_DATA_DIR / "wikidata_organization_raw.json") as fh: + return json.load(fh) + + +@pytest.fixture +def wikidata_organization( + wikidata_organization_raw: dict[str, Any], +) -> WikidataOrganization: + """Return a wikidata organization instance.""" + return WikidataOrganization.model_validate(wikidata_organization_raw) + + +@pytest.fixture +def mocked_wikidata( + monkeypatch: MonkeyPatch, wikidata_organization_raw: dict[str, Any] +) -> None: + """Mock wikidata connector.""" + response_query = Mock(spec=Response, status_code=200) + + session = MagicMock(spec=requests.Session) + session.get = MagicMock(side_effect=[response_query]) + + def mocked_init(self: WikidataQueryServiceConnector) -> None: + self.session = session + + monkeypatch.setattr(WikidataQueryServiceConnector, "__init__", mocked_init) + monkeypatch.setattr(WikidataAPIConnector, "__init__", mocked_init) + + # mock search_wikidata_with_query + + def get_data_by_query( + self: WikidataQueryServiceConnector, query: str + ) -> list[dict[str, dict[str, str]]]: + return [ + { + "item": { + "type": "uri", + "value": "http://www.wikidata.org/entity/Q26678", + }, + "itemLabel": {"xml:lang": "en", "type": "literal", "value": "BMW"}, + "itemDescription": { + "xml:lang": "en", + "type": "literal", + "value": "German automotive manufacturer, and conglomerate", + }, + "count": { + "datatype": "http://www.w3.org/2001/XMLSchema#integer", + "type": "literal", + "value": "3", + }, + }, + ] + + monkeypatch.setattr( + WikidataQueryServiceConnector, "get_data_by_query", get_data_by_query + ) + + # mock get_wikidata_org_with_org_id + + def get_wikidata_item_details_by_id( + self: WikidataQueryServiceConnector, item_id: str + ) -> dict[str, str]: + return wikidata_organization_raw + + monkeypatch.setattr( + WikidataAPIConnector, + "get_wikidata_item_details_by_id", + get_wikidata_item_details_by_id, + ) diff --git a/tests/auxiliary/test_data/wikidata_organization_raw.json b/tests/auxiliary/test_data/wikidata_organization_raw.json new file mode 100644 index 00000000..d8afc9ca --- /dev/null +++ b/tests/auxiliary/test_data/wikidata_organization_raw.json @@ -0,0 +1,228 @@ +{ + "aliases": { + "de": [ + { + "language": "de", + "value": "alias_de_1" + }, + { + "language": "de", + "value": "alias_de_2" + }, + { + "language": "de", + "value": "alias_de_3" + } + ], + "en": [ + { + "language": "en", + "value": "alias_en_1" + }, + { + "language": "en", + "value": "alias_en_2" + }, + { + "language": "en", + "value": "alias_en_3" + }, + { + "language": "en", + "value": "alias_en_4" + } + ] + }, + "claims": { + "P1813": [ + { + "id": "Q679041$AAE01E9A-03EA-424E-A51A-222A4858C4DD", + "mainsnak": { + "datatype": "monolingualtext", + "datavalue": { + "type": "monolingualtext", + "value": { + "language": "en", + "text": "RKI" + } + }, + "hash": "6cd9c230521797cef15c529e5bb006a0c51e801e", + "property": "P1813", + "snaktype": "value" + }, + "rank": "normal", + "type": "statement" + }, + { + "id": "Q679041$20A515C6-206D-4001-A408-4DA10F41533A", + "mainsnak": { + "datatype": "monolingualtext", + "datavalue": { + "type": "monolingualtext", + "value": { + "language": "de", + "text": "RKI" + } + }, + "hash": "03dcb3e47ca24e8ab90a1b11eb7602ceca2d07ad", + "property": "P1813", + "snaktype": "value" + }, + "rank": "normal", + "type": "statement" + }, + { + "id": "Q679041$ac3e29c1-4ace-df94-91f7-d74b410c3582", + "mainsnak": { + "datatype": "monolingualtext", + "datavalue": { + "type": "monolingualtext", + "value": { + "language": "fr", + "text": "IRK" + } + }, + "hash": "966f7d0aee390d96edaafd00d04a07ec88844a1e", + "property": "P1813", + "snaktype": "value" + }, + "rank": "normal", + "type": "statement" + } + ], + "P213": [ + { + "id": "Q679041$0ABA944D-81E3-4ED0-A792-52EC80175170", + "mainsnak": { + "datatype": "external-id", + "datavalue": { + "type": "string", + "value": "0000 0001 0940 3744" + }, + "hash": "17d825de2b5559de23b14b54519731a55a733ba4", + "property": "P213", + "snaktype": "value" + }, + "rank": "normal", + "type": "statement" + } + ], + "P6782": [ + { + "id": "Q679041$42EED77F-B584-48C1-B1D7-DD1C27815BA6", + "mainsnak": { + "datatype": "external-id", + "datavalue": { + "type": "string", + "value": "01k5qnb77" + }, + "hash": "dd1172552e08b0ce0ac4f5af1c3b086fe95f4bdb", + "property": "P6782", + "snaktype": "value" + }, + "rank": "normal", + "type": "statement" + } + ], + "P856": [ + { + "id": "Q679041$ccd210f4-4f33-9140-5060-a83edd44a7f2", + "mainsnak": { + "datatype": "url", + "datavalue": { + "type": "string", + "value": "https://www.rki.de/" + }, + "hash": "d07d9f8d73b9fa174b86cbbc7c5d3154f84e7a29", + "property": "P856", + "snaktype": "value" + }, + "rank": "normal", + "type": "statement" + }, + { + "id": "Q679041$3FE8023E-41AE-4DB3-B0B7-51419DA6CAE7", + "mainsnak": { + "datatype": "url", + "datavalue": { + "type": "string", + "value": "https://www.rki.de/DE/Home/homepage_node.html" + }, + "hash": "4da1521afde56c04ad95ba5d0b5977dc4cda248f", + "property": "P856", + "snaktype": "value" + }, + "qualifiers": { + "P407": [ + { + "datatype": "wikibase-item", + "datavalue": { + "type": "wikibase-entityid", + "value": { + "entity-type": "item", + "id": "Q188", + "numeric-id": 188 + } + }, + "hash": "46bfd327b830f66f7061ea92d1be430c135fa91f", + "property": "P407", + "snaktype": "value" + } + ] + }, + "qualifiers-order": [ + "P407" + ], + "rank": "normal", + "type": "statement" + }, + { + "id": "Q679041$324BC651-7212-4CE7-89A1-9E9135AAAA09", + "mainsnak": { + "datatype": "url", + "datavalue": { + "type": "string", + "value": "https://www.rki.de/EN/Home/homepage_node.html" + }, + "hash": "9e7237708fdfec88603db5ead3645b9d5d825808", + "property": "P856", + "snaktype": "value" + }, + "qualifiers": { + "P407": [ + { + "datatype": "wikibase-item", + "datavalue": { + "type": "wikibase-entityid", + "value": { + "entity-type": "item", + "id": "Q1860", + "numeric-id": 1860 + } + }, + "hash": "daf1c4fcb58181b02dff9cc89deb084004ddae4b", + "property": "P407", + "snaktype": "value" + } + ] + }, + "qualifiers-order": [ + "P407" + ], + "rank": "normal", + "type": "statement" + } + ] + }, + "id": "Q679041", + "labels": { + "de": { + "language": "de", + "value": "Robert Koch-Institut" + }, + "en": { + "language": "en", + "value": "Robert Koch Institute" + } + } +} diff --git a/tests/auxiliary/test_wikidata.py b/tests/auxiliary/test_wikidata.py new file mode 100644 index 00000000..5b9492d8 --- /dev/null +++ b/tests/auxiliary/test_wikidata.py @@ -0,0 +1,48 @@ +import pytest +from fastapi.testclient import TestClient +from pytest import MonkeyPatch + +from mex.backend.auxiliary import wikidata +from mex.common.models import ( + ExtractedPrimarySource, +) +from mex.common.types import Text + + +@pytest.mark.usefixtures( + "mocked_wikidata", +) +def test_search_organization_in_wikidata_mocked( + client_with_api_key_read_permission: TestClient, monkeypatch: MonkeyPatch +) -> None: + def extracted_primary_source_wikidata() -> ExtractedPrimarySource: + return ExtractedPrimarySource( + hadPrimarySource="00000000000000", + identifierInPrimarySource="wikidata", + title=[Text(value="Wikidata", language=None)], + entityType="ExtractedPrimarySource", + ) + + monkeypatch.setattr( + wikidata, "extracted_primary_source_wikidata", extracted_primary_source_wikidata + ) + + expected_total = 3 + expected_organization_identifier = "Q679041" + expected_organization_official_name = [ + {"value": "Robert Koch Institute", "language": "en"}, + {"value": "Robert Koch-Institut", "language": "de"}, + ] + organizations = client_with_api_key_read_permission.get( + "/v0/wikidata", params={"q": "rki"} + ).json() + + assert organizations["total"] == expected_total + assert ( + organizations["results"][0]["identifierInPrimarySource"] + == expected_organization_identifier + ) + assert ( + organizations["results"][0]["officialName"] + == expected_organization_official_name + )