Skip to content

Commit

Permalink
Feature/mx 1604 wikidata search endpoint (#91)
Browse files Browse the repository at this point in the history
# Added
- `/wikidata` endpoint to fetch all matching organizations from wikidata
  • Loading branch information
mr-kamran-ali authored Jul 26, 2024
1 parent fccd5ca commit b5a2111
Show file tree
Hide file tree
Showing 9 changed files with 458 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- `/wikidata` endpoint to fetch all matching organizations from wikidata
- add support for computed fields in graph queries

### Changes
Expand Down
Empty file.
14 changes: 14 additions & 0 deletions mex/backend/auxiliary/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from typing import Generic, TypeVar

from pydantic import BaseModel

T = TypeVar("T")


class PagedResponseSchema(BaseModel, Generic[T]):
    """Response schema for any paged API.

    Generic over ``T``, the type of the items carried in ``results``.
    """

    # number of items reported for the whole request
    # NOTE(review): presumably the count across all pages, not just this one
    total: int
    # offset value this page was requested with
    offset: int
    # limit value this page was requested with
    limit: int
    # the items of this page
    results: list[T]
74 changes: 74 additions & 0 deletions mex/backend/auxiliary/wikidata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from functools import cache
from typing import Annotated

from fastapi import APIRouter, Query

from mex.backend.auxiliary.models import PagedResponseSchema
from mex.common.models import ExtractedOrganization, ExtractedPrimarySource
from mex.common.primary_source.extract import extract_seed_primary_sources
from mex.common.primary_source.transform import (
get_primary_sources_by_name,
transform_seed_primary_sources_to_extracted_primary_sources,
)
from mex.common.types import TextLanguage
from mex.common.wikidata.extract import (
get_count_of_found_organizations_by_label,
search_organizations_by_label,
)
from mex.common.wikidata.transform import (
transform_wikidata_organizations_to_extracted_organizations,
)

router = APIRouter()


@router.get("/wikidata", status_code=200, tags=["wikidata"])
def search_organization_in_wikidata(
    q: Annotated[str, Query(min_length=1, max_length=1000)],
    offset: Annotated[int, Query(ge=0, le=10e10)] = 0,
    limit: Annotated[int, Query(ge=1, le=100)] = 10,
    lang: TextLanguage = TextLanguage.EN,
) -> PagedResponseSchema[ExtractedOrganization]:
    """Search for organizations in wikidata by label.

    Args:
        q: label of the organization to search for
        offset: number of matching organizations to skip before the first result
        limit: maximum number of organizations to return
        lang: language of the label. Example: en, de

    Returns:
        Paginated list of ExtractedOrganization
    """
    # the total is fetched separately, the search itself only returns one page
    total_orgs = get_count_of_found_organizations_by_label(q, lang)
    organizations = search_organizations_by_label(q, offset, limit, lang)

    extracted_organizations = list(
        transform_wikidata_organizations_to_extracted_organizations(
            organizations, extracted_primary_source_wikidata()
        )
    )

    return PagedResponseSchema(
        total=total_orgs,
        offset=offset,
        limit=limit,
        results=extracted_organizations,
    )


@cache
def extracted_primary_source_wikidata() -> ExtractedPrimarySource:
    """Return the extracted primary source for wikidata.

    The result is memoized by ``functools.cache``, so the seed primary sources
    are extracted and transformed on the first call only.
    """
    all_primary_sources = list(
        transform_seed_primary_sources_to_extracted_primary_sources(
            extract_seed_primary_sources()
        )
    )
    # single-element unpacking fails unless exactly one source is returned
    (wikidata_source,) = get_primary_sources_by_name(all_primary_sources, "wikidata")
    return wikidata_source
2 changes: 2 additions & 0 deletions mex/backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from fastapi.openapi.utils import get_openapi
from pydantic import BaseModel

from mex.backend.auxiliary.wikidata import router as wikidata_router
from mex.backend.exceptions import handle_uncaught_exception
from mex.backend.extracted.main import router as extracted_router
from mex.backend.identity.main import router as identity_router
Expand Down Expand Up @@ -88,6 +89,7 @@ async def lifespan(_: FastAPI) -> AsyncIterator[None]:
router.include_router(ingest_router, dependencies=[Depends(has_write_access)])
router.include_router(merged_router, dependencies=[Depends(has_read_access)])
router.include_router(rules_router, dependencies=[Depends(has_write_access)])
router.include_router(wikidata_router, dependencies=[Depends(has_read_access)])


class SystemStatus(BaseModel):
Expand Down
Empty file added tests/auxiliary/__init__.py
Empty file.
91 changes: 91 additions & 0 deletions tests/auxiliary/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import json
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, Mock

import pytest
import requests
from pytest import MonkeyPatch
from requests import Response

from mex.common.wikidata.connector import (
WikidataAPIConnector,
WikidataQueryServiceConnector,
)
from mex.common.wikidata.models.organization import WikidataOrganization

TEST_DATA_DIR = Path(__file__).parent / "test_data"


@pytest.fixture
def wikidata_organization_raw() -> dict[str, Any]:
    """Return a raw wikidata organization.

    The payload is loaded from ``wikidata_organization_raw.json`` in the test
    data directory. The file is read as UTF-8 explicitly so the fixture does
    not depend on the platform's default locale encoding.
    """
    with open(
        TEST_DATA_DIR / "wikidata_organization_raw.json", encoding="utf-8"
    ) as fh:
        return json.load(fh)


@pytest.fixture
def wikidata_organization(
    wikidata_organization_raw: dict[str, Any],
) -> WikidataOrganization:
    """Build a WikidataOrganization model from the raw organization fixture."""
    organization = WikidataOrganization.model_validate(wikidata_organization_raw)
    return organization


@pytest.fixture
def mocked_wikidata(
    monkeypatch: MonkeyPatch, wikidata_organization_raw: dict[str, Any]
) -> None:
    """Mock the wikidata connectors.

    Replaces the constructors of both connectors with one that only attaches a
    mocked session, makes every query return the same single search hit and
    makes every item lookup return the raw organization fixture.
    """
    response_query = Mock(spec=Response, status_code=200)

    session = MagicMock(spec=requests.Session)
    # a list side_effect hands out one value per call: only one `get` is served
    session.get = MagicMock(side_effect=[response_query])

    # shared replacement constructor, patched onto both connector classes below
    def mocked_init(self: WikidataQueryServiceConnector) -> None:
        self.session = session

    monkeypatch.setattr(WikidataQueryServiceConnector, "__init__", mocked_init)
    monkeypatch.setattr(WikidataAPIConnector, "__init__", mocked_init)

    # stub query execution: every query yields the same single search hit

    def get_data_by_query(
        self: WikidataQueryServiceConnector, query: str
    ) -> list[dict[str, dict[str, str]]]:
        return [
            {
                "item": {
                    "type": "uri",
                    "value": "http://www.wikidata.org/entity/Q26678",
                },
                "itemLabel": {"xml:lang": "en", "type": "literal", "value": "BMW"},
                "itemDescription": {
                    "xml:lang": "en",
                    "type": "literal",
                    "value": "German automotive manufacturer, and conglomerate",
                },
                "count": {
                    "datatype": "http://www.w3.org/2001/XMLSchema#integer",
                    "type": "literal",
                    "value": "3",
                },
            },
        ]

    monkeypatch.setattr(
        WikidataQueryServiceConnector, "get_data_by_query", get_data_by_query
    )

    # stub item lookup: every item id resolves to the raw organization fixture

    def get_wikidata_item_details_by_id(
        self: WikidataAPIConnector, item_id: str
    ) -> dict[str, Any]:
        return wikidata_organization_raw

    monkeypatch.setattr(
        WikidataAPIConnector,
        "get_wikidata_item_details_by_id",
        get_wikidata_item_details_by_id,
    )
Loading

0 comments on commit b5a2111

Please sign in to comment.