Feature/mx 1604 wikidata search endpoint (#91)

# Added
- `/wikidata` endpoint to fetch all matching organizations from wikidata

Commit b5a2111, 1 parent fccd5ca
Showing 9 changed files with 458 additions and 0 deletions.
Empty file.
@@ -0,0 +1,14 @@
from typing import Generic, TypeVar

from pydantic import BaseModel

T = TypeVar("T")


class PagedResponseSchema(BaseModel, Generic[T]):
    """Response schema for any paged API."""

    total: int
    offset: int
    limit: int
    results: list[T]
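For illustration, the generic parameter is bound per endpoint. A minimal sketch of the schema in use; the `str` type argument and the payload values here are made up for the example, while the real endpoint below binds it to `ExtractedOrganization`:

```python
from mex.backend.auxiliary.models import PagedResponseSchema

# Illustrative values only; the /wikidata endpoint parametrizes the schema
# with ExtractedOrganization instead of str.
page = PagedResponseSchema[str](
    total=3,
    offset=0,
    limit=10,
    results=["Q26678"],
)
print(page.model_dump())  # {'total': 3, 'offset': 0, 'limit': 10, 'results': ['Q26678']}
```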
@@ -0,0 +1,74 @@
from functools import cache
from typing import Annotated

from fastapi import APIRouter, Query

from mex.backend.auxiliary.models import PagedResponseSchema
from mex.common.models import ExtractedOrganization, ExtractedPrimarySource
from mex.common.primary_source.extract import extract_seed_primary_sources
from mex.common.primary_source.transform import (
    get_primary_sources_by_name,
    transform_seed_primary_sources_to_extracted_primary_sources,
)
from mex.common.types import TextLanguage
from mex.common.wikidata.extract import (
    get_count_of_found_organizations_by_label,
    search_organizations_by_label,
)
from mex.common.wikidata.transform import (
    transform_wikidata_organizations_to_extracted_organizations,
)

router = APIRouter()


@router.get("/wikidata", status_code=200, tags=["wikidata"])
def search_organization_in_wikidata(
    q: Annotated[str, Query(min_length=1, max_length=1000)],
    offset: Annotated[int, Query(ge=0, le=10e10)] = 0,
    limit: Annotated[int, Query(ge=1, le=100)] = 10,
    lang: TextLanguage = TextLanguage.EN,
) -> PagedResponseSchema[ExtractedOrganization]:
    """Search for an organization in wikidata.

    Args:
        q: label of the organization to be searched
        offset: number of matching results to skip before the returned page
        limit: maximum number of results to return per page
        lang: language of the label. Example: en, de

    Returns:
        Paginated list of ExtractedOrganization
    """
    total_orgs = get_count_of_found_organizations_by_label(q, lang)
    organizations = search_organizations_by_label(q, offset, limit, lang)

    extracted_organizations = list(
        transform_wikidata_organizations_to_extracted_organizations(
            organizations, extracted_primary_source_wikidata()
        )
    )

    return PagedResponseSchema(
        total=total_orgs,
        offset=offset,
        limit=limit,
        results=extracted_organizations,
    )


@cache
def extracted_primary_source_wikidata() -> ExtractedPrimarySource:
    """Load and return the wikidata primary source."""
    seed_primary_sources = extract_seed_primary_sources()
    extracted_primary_sources = list(
        transform_seed_primary_sources_to_extracted_primary_sources(
            seed_primary_sources
        )
    )
    (extracted_primary_source_wikidata,) = get_primary_sources_by_name(
        extracted_primary_sources,
        "wikidata",
    )

    return extracted_primary_source_wikidata
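For context, a minimal sketch of exercising the endpoint in-process. The module path of the router is an assumption (the commit page does not show file names), mounting it on a bare FastAPI app is a simplification of however the backend actually wires its routers, and the call performs a live Wikidata lookup and needs the usual mex settings unless the connectors are mocked:

```python
from fastapi import FastAPI
from fastapi.testclient import TestClient

# Assumed module path; adjust to wherever this router actually lives.
from mex.backend.auxiliary.wikidata import router

app = FastAPI()
app.include_router(router)
client = TestClient(app)

# Performs a real Wikidata lookup unless the connectors are mocked
# (see the test fixtures further down).
response = client.get("/wikidata", params={"q": "BMW", "offset": 0, "limit": 10})
page = response.json()
print(page["total"], len(page["results"]))  # total matches, organizations on this page
```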
Empty file.
@@ -0,0 +1,91 @@
import json
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, Mock

import pytest
import requests
from pytest import MonkeyPatch
from requests import Response

from mex.common.wikidata.connector import (
    WikidataAPIConnector,
    WikidataQueryServiceConnector,
)
from mex.common.wikidata.models.organization import WikidataOrganization

TEST_DATA_DIR = Path(__file__).parent / "test_data"


@pytest.fixture
def wikidata_organization_raw() -> dict[str, Any]:
    """Return a raw wikidata organization."""
    with open(TEST_DATA_DIR / "wikidata_organization_raw.json") as fh:
        return json.load(fh)


@pytest.fixture
def wikidata_organization(
    wikidata_organization_raw: dict[str, Any],
) -> WikidataOrganization:
    """Return a wikidata organization instance."""
    return WikidataOrganization.model_validate(wikidata_organization_raw)


@pytest.fixture
def mocked_wikidata(
    monkeypatch: MonkeyPatch, wikidata_organization_raw: dict[str, Any]
) -> None:
    """Mock the wikidata connectors."""
    response_query = Mock(spec=Response, status_code=200)

    session = MagicMock(spec=requests.Session)
    session.get = MagicMock(side_effect=[response_query])

    def mocked_init(self: WikidataQueryServiceConnector) -> None:
        self.session = session

    monkeypatch.setattr(WikidataQueryServiceConnector, "__init__", mocked_init)
    monkeypatch.setattr(WikidataAPIConnector, "__init__", mocked_init)

    # mock search_wikidata_with_query
    def get_data_by_query(
        self: WikidataQueryServiceConnector, query: str
    ) -> list[dict[str, dict[str, str]]]:
        return [
            {
                "item": {
                    "type": "uri",
                    "value": "http://www.wikidata.org/entity/Q26678",
                },
                "itemLabel": {"xml:lang": "en", "type": "literal", "value": "BMW"},
                "itemDescription": {
                    "xml:lang": "en",
                    "type": "literal",
                    "value": "German automotive manufacturer, and conglomerate",
                },
                "count": {
                    "datatype": "http://www.w3.org/2001/XMLSchema#integer",
                    "type": "literal",
                    "value": "3",
                },
            },
        ]

    monkeypatch.setattr(
        WikidataQueryServiceConnector, "get_data_by_query", get_data_by_query
    )

    # mock get_wikidata_org_with_org_id
    def get_wikidata_item_details_by_id(
        self: WikidataAPIConnector, item_id: str
    ) -> dict[str, Any]:
        return wikidata_organization_raw

    monkeypatch.setattr(
        WikidataAPIConnector,
        "get_wikidata_item_details_by_id",
        get_wikidata_item_details_by_id,
    )
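The endpoint tests themselves belong to the other changed files and are not shown on this page. Purely as an illustration of what the fixture provides, a hedged sketch of a test that exercises the mocked connectors directly; the test name is made up for the example:

```python
from typing import Any

import pytest

from mex.common.wikidata.connector import (
    WikidataAPIConnector,
    WikidataQueryServiceConnector,
)


@pytest.mark.usefixtures("mocked_wikidata")
def test_mocked_wikidata_connectors(wikidata_organization_raw: dict[str, Any]) -> None:
    # With the fixture applied, instantiating a connector does not open a real
    # session, and the lookup methods return the canned payloads defined above.
    query_connector = WikidataQueryServiceConnector()
    results = query_connector.get_data_by_query("any query")
    assert results[0]["itemLabel"]["value"] == "BMW"

    api_connector = WikidataAPIConnector()
    details = api_connector.get_wikidata_item_details_by_id("Q26678")
    assert details == wikidata_organization_raw
```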