Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New HealthDCAT-AP profile #326

Merged
merged 22 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
dfbf750
Update tests for changes in date parsing in rdflib
amercader Nov 21, 2024
1e7314c
Fix agent mbox value to be also without mailto
Markus92 Nov 26, 2024
7ba4777
Initial HealthDCAT-AP profile
Markus92 Nov 25, 2024
86b85d2
Initial passing unit tests for example dataset
Markus92 Nov 26, 2024
fb6ecc5
More fields and more tests
Markus92 Nov 26, 2024
899ac2c
Additional HealthDCAT-AP fields
Markus92 Nov 27, 2024
221c002
Fix Wikidata URIs in example so they actually resolve
Markus92 Dec 2, 2024
f5b7216
Add coding system attribute
Markus92 Dec 2, 2024
73e4c88
Create initial CKAN JSON data implementing HealthDCAT scheme
Markus92 Dec 2, 2024
4e78c47
Add a whole bunch of test cases
Markus92 Dec 4, 2024
e4bcca7
Implemented code values, qualified relations and analytics
Markus92 Dec 4, 2024
d90a4c8
Add URL property to contactPoint (VCARD.hasURL)
Markus92 Dec 4, 2024
4458afe
Wrote some documentation regarding the extension
Markus92 Dec 4, 2024
d9818fa
Merge branch 'master' into healthdcat_ap
Markus92 Dec 4, 2024
3f8cd85
dpv:hasPersonalData and some cleanup
Markus92 Dec 6, 2024
4422323
Small documentation update
Markus92 Dec 6, 2024
5a6d402
Merge remote-tracking branch 'upstream/master' into healthdcat_ap
Markus92 Dec 18, 2024
655370b
Fix cardinality of qualified_relation
Markus92 Dec 18, 2024
1140071
Fix test case for spatial_coverage in HealthDCAT-AP profile
Markus92 Dec 18, 2024
fa18da6
Merge remote-tracking branch 'origin/healthdcat_ap' into healthdcat_ap
Markus92 Dec 18, 2024
9966dd9
Small cleanup
Markus92 Dec 18, 2024
30f3bac
Move qualified relations to generic CKAN DCAT scheming class
Markus92 Dec 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ckanext/dcat/profiles/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@
from .euro_dcat_ap_3 import EuropeanDCATAP3Profile
from .dcat_us_3 import DCATUS3Profile
from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile
from .euro_health_dcat_ap import EuropeanHealthDCATAPProfile
from .schemaorg import SchemaOrgProfile
25 changes: 14 additions & 11 deletions ckanext/dcat/profiles/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@
import json
from urllib.parse import quote

from ckan.lib.helpers import resource_formats
from ckan.model.license import LicenseRegister
from ckantoolkit import ObjectNotFound, asbool, aslist, config, get_action, url_for
from dateutil.parser import parse as parse_date
from rdflib import term, URIRef, BNode, Literal
from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS, ORG
from geomet import wkt, InvalidGeoJSONException
from geomet import InvalidGeoJSONException, wkt
from rdflib import BNode, Literal, URIRef, term
from rdflib.namespace import ORG, RDF, RDFS, SKOS, XSD, Namespace

from ckantoolkit import config, url_for, asbool, aslist, get_action, ObjectNotFound
from ckan.model.license import LicenseRegister
from ckan.lib.helpers import resource_formats
from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS
from ckanext.dcat.validators import is_year, is_year_month, is_date
from ckanext.dcat.validators import is_date, is_year, is_year_month

CNT = Namespace("http://www.w3.org/2011/content#")
DCT = Namespace("http://purl.org/dc/terms/")
Expand Down Expand Up @@ -536,7 +536,9 @@ def _agents_details(self, subject, predicate):
agent_details = {}
agent_details["uri"] = str(agent) if isinstance(agent, term.URIRef) else ""
agent_details["name"] = self._object_value(agent, FOAF.name)
agent_details["email"] = self._object_value(agent, FOAF.mbox)
agent_details["email"] = self._without_mailto(
self._object_value(agent, FOAF.mbox)
)
if not agent_details["email"]:
agent_details["email"] = self._without_mailto(
self._object_value(agent, VCARD.hasEmail)
Expand Down Expand Up @@ -573,6 +575,9 @@ def _contact_details(self, subject, predicate):
)

contact["identifier"] = self._get_vcard_property_value(agent, VCARD.hasUID)

contact["url"] = self._get_vcard_property_value(agent, VCARD.hasURL)

contacts.append(contact)

return contacts
Expand Down Expand Up @@ -818,9 +823,7 @@ def _add_spatial_value_to_graph(self, spatial_ref, predicate, value):
or object.
"""
spatial_formats = aslist(
config.get(
"ckanext.dcat.output_spatial_format", DEFAULT_SPATIAL_FORMATS
)
config.get("ckanext.dcat.output_spatial_format", DEFAULT_SPATIAL_FORMATS)
)

if isinstance(value, str):
Expand Down
4 changes: 2 additions & 2 deletions ckanext/dcat/profiles/euro_dcat_ap_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,12 +144,12 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref):
contact = self._contact_details(dataset_ref, ADMS.contactPoint)
if contact:
contact = contact[0]
for key in ("uri", "name", "email", "identifier"):
for key in ("uri", "name", "email", "identifier", "url"):
if contact.get(key):
dataset_dict["extras"].append(
{
"key": "contact_{0}".format(key),
"value": contact.get(key)
"value": contact.get(key),
}
)

Expand Down
7 changes: 7 additions & 0 deletions ckanext/dcat/profiles/euro_dcat_ap_scheming.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,13 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref):
"identifier",
_type=URIRefOrLiteral,
)
self._add_triple_from_dict(
item,
contact_details,
VCARD.hasURL,
"url",
_type=URIRef,
)

self._add_agents(dataset_ref, dataset_dict, "publisher", DCT.publisher)
self._add_agents(dataset_ref, dataset_dict, "creator", DCT.creator)
Expand Down
254 changes: 254 additions & 0 deletions ckanext/dcat/profiles/euro_health_dcat_ap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
"""Test document"""
amercader marked this conversation as resolved.
Show resolved Hide resolved

import json

from rdflib import RDF, SKOS, XSD, BNode, Literal, URIRef, term
from rdflib.namespace import Namespace

from ckanext.dcat.profiles.base import DCAT, DCT, CleanedURIRef, URIRefOrLiteral
from ckanext.dcat.profiles.euro_dcat_ap_3 import EuropeanDCATAP3Profile

# HealthDCAT-AP namespace. Note: not finalized yet
HEALTHDCATAP = Namespace("http://healthdataportal.eu/ns/health#")

# Data Privacy Vocabulary namespace
DPV = Namespace("https://w3id.org/dpv#")

# Prefix -> namespace mapping; graph_from_dataset binds every entry on the
# output graph so that serializations use these prefixes.
namespaces = {
    "healthdcatap": HEALTHDCATAP,
    "dpv": DPV,
}


class EuropeanHealthDCATAPProfile(EuropeanDCATAP3Profile):
"""
A profile implementing HealthDCAT-AP, a health-related extension of the DCAT application profile
for sharing information about Catalogues containing Datasets and Data Services descriptions in Europe.
"""

def parse_dataset(self, dataset_dict, dataset_ref):
    """
    Parse a dataset from the graph into a CKAN dataset dict.

    The parent profile is run first, then the HealthDCAT-AP specific
    fields are added on top of its result.

    Returns the resulting dataset dict.
    """
    # Let the parent profile handle everything that is not health specific
    parent_result = super(EuropeanHealthDCATAPProfile, self).parse_dataset(
        dataset_dict, dataset_ref
    )

    return self._parse_health_fields(parent_result, dataset_ref)

def _parse_health_fields(self, dataset_dict, dataset_ref):
self.__parse_healthdcat_stringvalues(dataset_dict, dataset_ref)

self.__parse_healthdcat_intvalues(dataset_dict, dataset_ref)

# Add the HDAB. There should only ever be one but you never know
agents = self._agents_details(dataset_ref, HEALTHDCATAP.hdab)
if agents:
dataset_dict["hdab"] = agents

# Add any qualifiedRelations
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really like the qualified relationships handling (parsing and serializing), and AFAICT there is nothing health specific to it and could be moved to one of the base DCAT AP profiles right? (euro_dcat_ap_scheming.py would be the right place I think as this was added in DCAT 2)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nothing health specific indeed, it's DCAT 2+. I will move it to the generic scheming.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved it to the generic class, but no longer had time to add it to the test cases, sorry! It is still covered by the HealthDCAT-AP test cases.

qual_relations = self._relationship_details(dataset_ref, DCAT.qualifiedRelation)
if qual_relations:
dataset_dict["qualified_relation"] = qual_relations

# Retention period
retention_start, retention_end = self._time_interval(
dataset_ref, HEALTHDCATAP.retentionPeriod, dcat_ap_version=2
)
retention_dict = {}
if retention_start is not None:
retention_dict["start"] = retention_start
if retention_end is not None:
retention_dict["end"] = retention_end
if retention_dict:
dataset_dict["retention_period"] = [retention_dict]

return dataset_dict

def __parse_healthdcat_intvalues(self, dataset_dict, dataset_ref):
    """
    Copy the integer-valued HealthDCAT-AP properties of the dataset into
    dataset_dict (modified in place).

    dataset_dict: CKAN dataset dict being built
    dataset_ref: subject of the dataset in the graph
    """
    integer_fields = {
        "min_typical_age": HEALTHDCATAP.minTypicalAge,
        "max_typical_age": HEALTHDCATAP.maxTypicalAge,
        "number_of_records": HEALTHDCATAP.numberOfRecords,
        "number_of_unique_individuals": HEALTHDCATAP.numberOfUniqueIndividuals,
    }
    for field_name, rdf_property in integer_fields.items():
        parsed = self._object_value_int(dataset_ref, rdf_property)
        # Compare against None explicitly: 0 is falsy but is a real value
        if parsed is None:
            continue
        dataset_dict[field_name] = parsed

def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref):
    """
    Copy the list-valued HealthDCAT-AP and DPV properties of the dataset
    into dataset_dict (modified in place).

    A key is only set when at least one value was found for its predicate.

    dataset_dict: CKAN dataset dict being built
    dataset_ref: subject of the dataset in the graph
    """
    list_fields = {
        "analytics": HEALTHDCATAP.analytics,
        "code_values": HEALTHDCATAP.hasCodeValues,
        "coding_system": HEALTHDCATAP.hasCodingSystem,
        "health_category": HEALTHDCATAP.healthCategory,
        "health_theme": HEALTHDCATAP.healthTheme,
        "legal_basis": DPV.hasLegalBasis,
        "personal_data": DPV.hasPersonalData,
        "population_coverage": HEALTHDCATAP.populationCoverage,
        "publisher_note": HEALTHDCATAP.publisherNote,
        "publisher_type": HEALTHDCATAP.publisherType,
        "purpose": DPV.hasPurpose,
    }
    for field_name, rdf_property in list_fields.items():
        found = self._object_value_list(dataset_ref, rdf_property)
        if not found:
            continue
        dataset_dict[field_name] = found

def graph_from_dataset(self, dataset_dict, dataset_ref):
    """
    Serialize a CKAN dataset dict to the graph, including the
    HealthDCAT-AP properties.

    The parent profile is run first, then the prefixes in `namespaces`
    are bound and the health specific triples are added.

    dataset_dict: CKAN dataset dict to read the values from
    dataset_ref: subject of the dataset in the graph
    """
    super().graph_from_dataset(dataset_dict, dataset_ref)
    for prefix, namespace in namespaces.items():
        self.g.bind(prefix, namespace)

    # key, predicate, fallbacks, _type
    # The predicates must match the ones read back when parsing the graph,
    # otherwise values do not survive a serialize / parse round-trip.
    items = [
        ("analytics", HEALTHDCATAP.analytics, None, URIRefOrLiteral),
        ("code_values", HEALTHDCATAP.hasCodeValues, None, URIRefOrLiteral),
        ("coding_system", HEALTHDCATAP.hasCodingSystem, None, URIRefOrLiteral),
        ("health_category", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral),
        # health_theme has its own predicate; it is not a health category
        ("health_theme", HEALTHDCATAP.healthTheme, None, URIRefOrLiteral),
        ("legal_basis", DPV.hasLegalBasis, None, URIRefOrLiteral),
        (
            "population_coverage",
            HEALTHDCATAP.populationCoverage,
            None,
            URIRefOrLiteral,
        ),
        ("personal_data", DPV.hasPersonalData, None, URIRef),
        ("publisher_note", HEALTHDCATAP.publisherNote, None, URIRefOrLiteral),
        ("publisher_type", HEALTHDCATAP.publisherType, None, URIRefOrLiteral),
        ("purpose", DPV.hasPurpose, None, URIRefOrLiteral),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Integer valued properties, serialized as xsd:nonNegativeInteger
    items = [
        ("min_typical_age", HEALTHDCATAP.minTypicalAge),
        ("max_typical_age", HEALTHDCATAP.maxTypicalAge),
        ("number_of_records", HEALTHDCATAP.numberOfRecords),
        ("number_of_unique_individuals", HEALTHDCATAP.numberOfUniqueIndividuals),
    ]
    for key, predicate in items:
        self._add_nonneg_integer_triple(dataset_dict, dataset_ref, key, predicate)

    self._add_agents(dataset_ref, dataset_dict, "hdab", HEALTHDCATAP.hdab)
    self._add_relationship(
        dataset_ref, dataset_dict, "qualified_relation", DCAT.qualifiedRelation
    )

def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate):
    """
    Adds non-negative integers to the Dataset graph (xsd:nonNegativeInteger)

    Values that cannot be converted to a non-negative integer are added as
    a plain literal instead, so that no information is silently dropped.

    dataset_dict: CKAN dataset dict to read the value from
    dataset_ref: subject of Graph
    key: scheming key in CKAN
    predicate: predicate to use
    """
    value = self._get_dict_value(dataset_dict, key)

    # A numeric zero is falsy but is a perfectly valid non-negative integer,
    # so it must not be treated as a missing value. bool is excluded because
    # it is a subclass of int and False is not a number supplied by the user.
    is_numeric_zero = (
        isinstance(value, (int, float))
        and not isinstance(value, bool)
        and value == 0
    )
    if not value and not is_numeric_zero:
        return

    try:
        number = int(value)
        if number < 0:
            raise ValueError("Not a non-negative integer")
    except (ValueError, TypeError):
        # Best effort: keep the original value as an untyped literal
        self.g.add((dataset_ref, predicate, Literal(value)))
    else:
        self.g.add(
            (
                dataset_ref,
                predicate,
                Literal(number, datatype=XSD.nonNegativeInteger),
            )
        )

def _add_timeframe_triple(self, dataset_dict, dataset_ref):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like the same logic as the standard temporal_coverage handling. If there's no change, it's best to delete the method to avoid duplication.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's the same logic (copy-pasted the lines you're mentioning), moved to a separate function instead of inline. I could also split it off to a separate function in that file to avoid duplication?

temporal = dataset_dict.get("temporal_coverage")
if (
isinstance(temporal, list)
and len(temporal)
and self._not_empty_dict(temporal[0])
):
for item in temporal:
temporal_ref = BNode()
self.g.add((temporal_ref, RDF.type, DCT.PeriodOfTime))
if item.get("start"):
self._add_date_triple(temporal_ref, DCAT.startDate, item["start"])
if item.get("end"):
self._add_date_triple(temporal_ref, DCAT.endDate, item["end"])
self.g.add((dataset_ref, DCT.temporal, temporal_ref))

def _relationship_details(self, subject, predicate):
    """
    Collects the details of every dcat:Relationship node linked from
    subject through predicate (e.g. dcat:qualifiedRelation)

    Both subject and predicate must be rdflib URIRef or BNode objects

    Returns a list with one dict per node found, with the keys uri, role
    and relation. The uri is an empty string when the node is not a URIRef;
    role and relation hold whatever _object_value returns for dcat:hadRole
    and dct:relation.
    """
    details = []
    for node in self.g.objects(subject, predicate):
        # Blank nodes have no meaningful URI to expose
        node_uri = str(node) if isinstance(node, term.URIRef) else ""
        details.append(
            {
                "uri": node_uri,
                "role": self._object_value(node, DCAT.hadRole),
                "relation": self._object_value(node, DCT.relation),
            }
        )

    return details

def _add_relationship(
    self,
    dataset_ref,
    dataset_dict,
    relation_key,
    rdf_predicate,
):
    """
    Adds one or more Relationships to the RDF graph.

    Nothing is added unless the value stored under relation_key is a
    non-empty list whose first item is a non-empty dict.

    :param dataset_ref: The RDF reference of the dataset
    :param dataset_dict: The dataset dictionary containing relationship information
    :param relation_key: field name in the CKAN dict (e.g. "qualified_relation")
    :param rdf_predicate: The RDF predicate (DCAT.qualifiedRelation)
    """
    relations = dataset_dict.get(relation_key)
    if not (
        isinstance(relations, list)
        and len(relations)
        and self._not_empty_dict(relations[0])
    ):
        return

    for relation in relations:
        # Reuse the given URI for the relationship node, otherwise fall
        # back to a blank node
        relation_uri = relation.get("uri")
        if relation_uri:
            relation_ref = CleanedURIRef(relation_uri)
        else:
            relation_ref = BNode()

        # rdf:type (not dct:type) is what declares the node to be an
        # instance of the dcat:Relationship class
        self.g.add((relation_ref, RDF.type, DCAT.Relationship))
        self.g.add((dataset_ref, rdf_predicate, relation_ref))

        self._add_triple_from_dict(
            relation,
            relation_ref,
            DCT.relation,
            "relation",
            _type=URIRefOrLiteral,
        )
        self._add_triple_from_dict(
            relation,
            relation_ref,
            DCAT.hadRole,
            "role",
            _type=URIRefOrLiteral,
        )

def graph_from_catalog(self, catalog_dict, catalog_ref):
    """
    Serialize the catalog by delegating to the parent profile; no
    HealthDCAT-AP specific triples are added here.
    """
    super(EuropeanHealthDCATAPProfile, self).graph_from_catalog(
        catalog_dict, catalog_ref
    )
5 changes: 4 additions & 1 deletion ckanext/dcat/schemas/dcat_ap_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,10 @@ dataset_fields:
- field_name: identifier
label: Identifier
help_text: Unique identifier for the contact point. Such as a ROR ID.


- field_name: url
label: URL
help_text: A URL associated with the contact
help_text: Contact information for enquiries about the dataset.

- field_name: publisher
Expand Down
5 changes: 4 additions & 1 deletion ckanext/dcat/schemas/dcat_ap_multilingual.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,10 @@ dataset_fields:
- field_name: identifier
label: Identifier
help_text: Unique identifier for the contact point. Such as a ROR ID.


- field_name: url
label: URL
help_text: A URL associated with the contact
help_text: Contact information for enquiries about the dataset.

- field_name: license_id
Expand Down
4 changes: 4 additions & 0 deletions ckanext/dcat/schemas/dcat_ap_recommended.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ dataset_fields:
- field_name: email
label: Email
display_snippet: email.html

- field_name: url
label: URL
help_text: A URL associated with the contact
help_text: Contact information for enquiries about the dataset.

- field_name: publisher
Expand Down
4 changes: 4 additions & 0 deletions ckanext/dcat/schemas/dcat_us_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ dataset_fields:
- field_name: email
label: Email
display_snippet: email.html

- field_name: url
label: URL
help_text: A URL associated with the contact
help_text: Contact information for enquiries about the dataset.

- field_name: publisher
Expand Down
Loading
Loading