From dfbf7501517a116278e4402349b5e2dd5fd1dec1 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 21 Nov 2024 13:00:43 +0100 Subject: [PATCH 01/19] Update tests for changes in date parsing in rdflib Rdflib 7.1 introduced changes in [date parsing][1] that made some base profile tests fail. Basically the previous rdflib versions incomplete dates like 1904 were expanded to `1904-01-01`. Of course this is not a valid date and should be expressed using `gYear`: 1904 and we should be expecting `1904`. This should play nice with the time properties we are generating in CKAN as they already handle automatically `gYear`, `gYearMonth`, `date` and `dateTime`. Sites importing external DCAT representations that use the wrong encoding might need to check their parsers. [1] https://github.com/RDFLib/rdflib/pull/2929 --- ckanext/dcat/profiles/base.py | 3 -- .../tests/profiles/base/test_base_profile.py | 28 +++++++++---------- requirements.txt | 2 +- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 5cf15c55..efb30d29 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -433,9 +433,6 @@ def _time_interval(self, subject, predicate, dcat_ap_version=1): It checks for time intervals defined with DCAT, W3C Time hasBeginning & hasEnd and schema.org startDate & endDate. - Note that partial dates will be expanded to the first month / day - value, eg '1904' -> '1904-01-01'. 
- Returns a tuple with the start and end date values, both of which can be None if not found """ diff --git a/ckanext/dcat/tests/profiles/base/test_base_profile.py b/ckanext/dcat/tests/profiles/base/test_base_profile.py index aa8daf1e..f7b6da9f 100644 --- a/ckanext/dcat/tests/profiles/base/test_base_profile.py +++ b/ckanext/dcat/tests/profiles/base/test_base_profile.py @@ -510,7 +510,7 @@ def test_time_interval_w3c_time_inXSDDateTime(self): - 1904 + 1904 @@ -532,7 +532,7 @@ def test_time_interval_w3c_time_inXSDDateTime(self): start, end = p._time_interval(URIRef('http://example.org'), DCT.temporal) - assert start == '1904-01-01' + assert start == '1904' assert end == '2014-03-22' def test_time_interval_w3c_time_inXSDDateTimeStamp(self): @@ -548,7 +548,7 @@ def test_time_interval_w3c_time_inXSDDateTimeStamp(self): - 1904 + 1904 @@ -570,7 +570,7 @@ def test_time_interval_w3c_time_inXSDDateTimeStamp(self): start, end = p._time_interval(URIRef('http://example.org'), DCT.temporal) - assert start == '1904-01-01' + assert start == '1904' assert end == '2014-03-22' def test_time_interval_w3c_time_inXSDDate(self): @@ -586,7 +586,7 @@ def test_time_interval_w3c_time_inXSDDate(self): - 1904 + 1904 @@ -608,7 +608,7 @@ def test_time_interval_w3c_time_inXSDDate(self): start, end = p._time_interval(URIRef('http://example.org'), DCT.temporal) - assert start == '1904-01-01' + assert start == '1904' assert end == '2014-03-22' def test_time_interval_multiple_w3c_time(self): @@ -627,9 +627,9 @@ def test_time_interval_multiple_w3c_time(self): - 2005 - 1904 - 1974 + 2005 + 1904 + 1974 @@ -653,7 +653,7 @@ def test_time_interval_multiple_w3c_time(self): start, end = p._time_interval(URIRef('http://example.org'), DCT.temporal) - assert start == '1904-01-01' + assert start == '1904' assert end == '2014-03-22' def test_time_interval_dcat(self): @@ -706,7 +706,7 @@ def test_time_interval_all_dcat_ap_2_dcat_found(self): - 1904 + 1904 @@ -763,7 +763,7 @@ def 
test_time_interval_all_dcat_ap_1_schema_org_found(self): - 1904 + 1904 @@ -827,7 +827,7 @@ def test_time_interval_all_dcat_ap_2_w3c_time_found(self): - 1904 + 1904 @@ -849,7 +849,7 @@ def test_time_interval_all_dcat_ap_2_w3c_time_found(self): start, end = p._time_interval(URIRef('http://example.org'), DCT.temporal, dcat_ap_version=2) - assert start == '1904-01-01' + assert start == '1904' assert end == '2014-03-22' def test_publisher_foaf(self): diff --git a/requirements.txt b/requirements.txt index 28bad239..0eb6dabe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -rdflib>=6.1.1,<7.1.0 +rdflib>=6.1.1,<7.2.0 geomet>=0.2.0 ckantoolkit>=0.0.7 future>=0.18.2 From 1e7314c7ef2d7d47d04b66f9fc61c8d41445b1d2 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Tue, 26 Nov 2024 03:32:43 +0100 Subject: [PATCH 02/19] Fix agent mbox value to be also without mailto --- ckanext/dcat/profiles/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index efb30d29..9edb3db6 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -536,7 +536,9 @@ def _agents_details(self, subject, predicate): agent_details = {} agent_details["uri"] = str(agent) if isinstance(agent, term.URIRef) else "" agent_details["name"] = self._object_value(agent, FOAF.name) - agent_details["email"] = self._object_value(agent, FOAF.mbox) + agent_details["email"] = self._without_mailto( + self._object_value(agent, FOAF.mbox) + ) if not agent_details["email"]: agent_details["email"] = self._without_mailto( self._object_value(agent, VCARD.hasEmail) From 7ba477793b89613b5f68d486bd2e542a425e5f05 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Mon, 25 Nov 2024 12:50:11 +0100 Subject: [PATCH 03/19] Initial HealthDCAT-AP profile --- ckanext/dcat/profiles/euro_health_dcat_ap.py | 98 ++++ ckanext/dcat/schemas/healthdcat_ap.yaml | 461 ++++++++++++++++++ .../tests/profiles/health_dcat_ap/__init__.py | 
0
 .../test_euro_health_dcat_ap_profile_parse.py | 156 ++++++
 examples/dcat/dataset_health.ttl              | 327 +++++++++++++
 5 files changed, 1042 insertions(+)
 create mode 100644 ckanext/dcat/profiles/euro_health_dcat_ap.py
 create mode 100644 ckanext/dcat/schemas/healthdcat_ap.yaml
 create mode 100644 ckanext/dcat/tests/profiles/health_dcat_ap/__init__.py
 create mode 100644 ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py
 create mode 100644 examples/dcat/dataset_health.ttl

diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py
new file mode 100644
index 00000000..b428528b
--- /dev/null
+++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py
@@ -0,0 +1,89 @@
+"""HealthDCAT-AP profile, a health-related extension of the DCAT-AP 3 profile."""
+
+import json
+
+from rdflib import SKOS, XSD, Literal
+from rdflib.namespace import Namespace
+
+from ckanext.dcat.profiles.base import URIRefOrLiteral
+from ckanext.dcat.profiles.euro_dcat_ap_3 import EuropeanDCATAP3Profile
+
+HEALTHDCATAP = Namespace("http://healthdataportal.eu/ns/health#")
+
+
+class EuropeanHealthDCATAPProfile(EuropeanDCATAP3Profile):
+    """
+    A profile implementing HealthDCAT-AP, a health-related extension of the DCAT application profile
+    for sharing information about Catalogues containing Datasets and Data Services descriptions in Europe.
+    """
+
+    def parse_dataset(self, dataset_dict, dataset_ref):
+        """Parse the DCAT-AP 3 properties, then the HealthDCAT-AP ones.
+
+        Returns the updated ``dataset_dict``.
+        """
+        # Call super method for DCAT-AP 3 properties
+        dataset_dict = super().parse_dataset(dataset_dict, dataset_ref)
+
+        return self._parse_mandatory_fields(dataset_dict, dataset_ref)
+
+    def _parse_mandatory_fields(self, dataset_dict, dataset_ref):
+        """Add purpose, health theme and number of records to ``dataset_dict``.
+
+        A key is only set when the graph holds a value for its predicate.
+        Returns the updated ``dataset_dict``.
+        """
+        # Lists for "purpose" and "health theme", stored as JSON-encoded lists
+        for key, predicate in (
+            ("purpose", HEALTHDCATAP.purpose),
+            ("health_theme", HEALTHDCATAP.healthTheme),
+        ):
+            values = self._object_value_list(dataset_ref, predicate)
+            if values:
+                # Assign instead of append: nothing here creates the key first,
+                # so dataset_dict[key].append(...) would fail with a KeyError
+                dataset_dict[key] = json.dumps(values)
+
+        # Find number of records
+        number_of_records = self._object_value_int(
+            dataset_ref, HEALTHDCATAP.numberOfRecords
+        )
+        # Test against None rather than truthiness so a parsed 0 is kept
+        if number_of_records is not None:
+            dataset_dict["number_of_records"] = number_of_records
+
+        return dataset_dict
+
+    def graph_from_dataset(self, dataset_dict, dataset_ref):
+        """Add the DCAT-AP 3 triples, then the HealthDCAT-AP ones, to the graph."""
+        super().graph_from_dataset(dataset_dict, dataset_ref)
+
+        # List items:
+        #  - Purpose
+        #  - Health theme
+        items = [
+            ("purpose", HEALTHDCATAP.purpose, None, URIRefOrLiteral),
+            (
+                "health_theme",
+                HEALTHDCATAP.healthTheme,
+                None,
+                URIRefOrLiteral,
+                # The SKOS class is spelled skos:Concept; skos:concept is not a SKOS term
+                SKOS.Concept,
+            ),
+        ]
+        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)
+
+        # Number of records: skip only missing/empty values, 0 is a valid count
+        number_of_records = dataset_dict.get("number_of_records")
+        if number_of_records not in (None, ""):
+            try:
+                value = Literal(number_of_records, datatype=XSD.nonNegativeInteger)
+            except (ValueError, TypeError):
+                # Fall back to an untyped literal if the typed one cannot be built
+                value = Literal(number_of_records)
+            self.g.add((dataset_ref, HEALTHDCATAP.numberOfRecords, value))
+
+    def graph_from_catalog(self, catalog_dict, catalog_ref):
+        """Serialize the catalog; only the parent implementation is run."""
+        super().graph_from_catalog(catalog_dict, catalog_ref)
diff --git a/ckanext/dcat/schemas/healthdcat_ap.yaml b/ckanext/dcat/schemas/healthdcat_ap.yaml
new file mode 100644
index
00000000..58f361eb --- /dev/null +++ b/ckanext/dcat/schemas/healthdcat_ap.yaml @@ -0,0 +1,461 @@ +scheming_version: 2 +dataset_type: dataset +about: Schema for HealthDCAT-AP +about_url: http://github.com/ckan/ckanext-dcat + +dataset_fields: + +- field_name: title + label: Title + preset: title + required: true + help_text: A descriptive title for the dataset. + +- field_name: name + label: URL + preset: dataset_slug + form_placeholder: eg. my-dataset + +- field_name: notes + label: Description + required: true + form_snippet: markdown.html + help_text: A free-text account of the dataset. + +- field_name: tag_string + label: Keywords + preset: tag_string_autocomplete + form_placeholder: eg. economy, mental health, government + help_text: Keywords or tags describing the dataset. Use commas to separate multiple values. + +- field_name: contact + label: Contact points + repeating_label: Contact point + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the contact point. Such as a ROR ID. + + help_text: Contact information for enquiries about the dataset. + +- field_name: publisher + label: Publisher + repeating_label: Publisher + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the publisher, such as a ROR ID. + help_text: Entity responsible for making the dataset available. 
+ +- field_name: creator + label: Creator + repeating_label: Creator + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + help_text: URI of the creator, if available. + + - field_name: name + label: Name + help_text: Name of the entity or person who created the dataset. + + - field_name: email + label: Email + display_snippet: email.html + help_text: Contact email of the creator. + + - field_name: url + label: URL + display_snippet: link.html + help_text: URL for more information about the creator. + + - field_name: type + label: Type + help_text: Type of creator (e.g., Organization, Person). + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the creator, such as an ORCID or ROR ID. + +- field_name: license_id + label: License + form_snippet: license.html + help_text: License definitions and additional information can be found at http://opendefinition.org/. + +- field_name: owner_org + label: Organization + preset: dataset_organization + help_text: The CKAN organization the dataset belongs to. + +- field_name: url + label: Landing page + form_placeholder: http://example.com/dataset.json + display_snippet: link.html + help_text: Web page that can be navigated to gain access to the dataset, its distributions and/or additional information. + + # Note: this will fall back to metadata_created if not present +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the dataset. + + # Note: this will fall back to metadata_modified if not present +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the dataset was changed, updated or modified. + +- field_name: version + label: Version + validators: ignore_missing unicode_safe package_version_validator + help_text: Version number or other version designation of the dataset. 
+ +- field_name: version_notes + label: Version notes + validators: ignore_missing unicode_safe + form_snippet: markdown.html + display_snippet: markdown.html + help_text: A description of the differences between this version and a previous version of the dataset. + + # Note: CKAN will generate a unique identifier for each dataset +- field_name: identifier + label: Identifier + help_text: A unique identifier of the dataset. + +- field_name: frequency + label: Frequency + help_text: The frequency at which dataset is published. + +- field_name: provenance + label: Provenance + form_snippet: markdown.html + display_snippet: markdown.html + help_text: A statement about the lineage of the dataset. + +- field_name: dcat_type + label: Type + help_text: The type of the dataset. + # TODO: controlled vocabulary? + +- field_name: temporal_coverage + label: Temporal coverage + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + help_text: The temporal period or periods the dataset covers. + +- field_name: temporal_resolution + label: Temporal resolution + help_text: Minimum time period resolvable in the dataset. + +- field_name: spatial_coverage + label: Spatial coverage + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: text + label: Label + + - field_name: geom + label: Geometry + + - field_name: bbox + label: Bounding Box + + - field_name: centroid + label: Centroid + help_text: A geographic region that is covered by the dataset. + +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + help_text: Minimum spatial separation resolvable in a dataset, measured in meters. + +- field_name: access_rights + label: Access rights + validators: ignore_missing unicode_safe + help_text: Information that indicates whether the dataset is Open Data, has access restrictions or is not public. 
+ +- field_name: alternate_identifier + label: Other identifier + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: This property refers to a secondary identifier of the dataset, such as MAST/ADS, DataCite, DOI, etc. + +- field_name: theme + label: Theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A category of the dataset. A Dataset may be associated with multiple themes. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the dataset. + # TODO: language form snippet / validator / graph + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this dataset. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An implementing rule or other specification that the dataset follows. + +- field_name: is_referenced_by + label: Is referenced by + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A related resource, such as a publication, that references, cites, or otherwise points to the dataset. + +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the dataset. + + +- field_name: purpose + label: Purpose + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A free text statement of the purpose of the processing of data or personal data. 
+ +- field_name: health_theme + label: Health theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + The health category to which this dataset belongs as described in the Commission Regulation + on the European Health Data Space laying down a list of categories of electronic data for secondary use, Art.33. + +- field_name: number_of_records + label: Number of records + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Size of the dataset in terms of the number of records. + + +#- field_name: hvd_category +# label: HVD Category +# preset: multiple_text +# validators: ignore_missing scheming_multiple_text +# TODO: implement separately as part of wider HVD support + +# Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this dataset (if not provided it will be autogenerated). + +# TODO: relation-based properties are not yet included (e.g. is_version_of, source, sample, etc) +# +resource_fields: + +- field_name: url + label: URL + preset: resource_url_upload + +- field_name: name + label: Name + form_placeholder: + help_text: A descriptive title for the resource. + +- field_name: description + label: Description + form_snippet: markdown.html + help_text: A free-text account of the resource. + +- field_name: format + label: Format + preset: resource_format_autocomplete + help_text: File format. If not provided it will be guessed. + +- field_name: mimetype + label: Media type + validators: if_empty_guess_format ignore_missing unicode_safe + help_text: Media type for this format. If not provided it will be guessed. + +- field_name: compress_format + label: Compress format + help_text: The format of the file in which the data is contained in a compressed form. + +- field_name: package_format + label: Package format + help_text: The format of the file in which one or more data files are grouped together. 
+ +- field_name: size + label: Size + validators: ignore_missing int_validator + form_snippet: number.html + display_snippet: file_size.html + help_text: File size in bytes + +- field_name: hash + label: Hash + help_text: Checksum of the downloaded file. + +- field_name: hash_algorithm + label: Hash Algorithm + help_text: Algorithm used to calculate to checksum. + +- field_name: rights + label: Rights + form_snippet: markdown.html + display_snippet: markdown.html + help_text: Some statement about the rights associated with the resource. + +- field_name: availability + label: Availability + help_text: Indicates how long it is planned to keep the resource available. + +- field_name: status + label: Status + preset: select + choices: + - value: http://purl.org/adms/status/Completed + label: Completed + - value: http://purl.org/adms/status/UnderDevelopment + label: Under Development + - value: http://purl.org/adms/status/Deprecated + label: Deprecated + - value: http://purl.org/adms/status/Withdrawn + label: Withdrawn + help_text: The status of the resource in the context of maturity lifecycle. + +- field_name: license + label: License + help_text: License in which the resource is made available. If not provided will be inherited from the dataset. + +- field_name: has_version + label: Has version + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_inline: true + help_text: This property refers to a related Dataset that is a version, edition, or adaptation of the described Dataset. + + # Note: this falls back to the standard resource url field +- field_name: access_url + label: Access URL + help_text: URL that gives access to the dataset (defaults to the standard resource URL). + + # Note: this falls back to the standard resource url field +- field_name: download_url + label: Download URL + display_snippet: link.html + help_text: URL that provides a direct link to a downloadable file (defaults to the standard resource URL). 
+ +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the resource. + +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the resource was changed, updated or modified. + +- field_name: temporal_resolution + label: Temporal resolution + help_text: Minimum time period resolvable in the distribution. + +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + help_text: Minimum spatial separation resolvable in the distribution, measured in meters. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the resource. + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this resource. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An established schema to which the described resource conforms. + +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the resource. + +- field_name: access_services + label: Access services + repeating_label: Access service + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: title + label: Title + + - field_name: endpoint_description + label: Endpoint description + + - field_name: endpoint_url + label: Endpoint URL + preset: multiple_text + + - field_name: serves_dataset + label: Serves dataset + preset: multiple_text + validators: ignore_missing scheming_multiple_text + + help_text: A data service that gives access to the resource. 
+
+  # Note: if not provided, this will be autogenerated
+- field_name: uri
+  label: URI
+  help_text: An URI for this resource (if not provided it will be autogenerated).
diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/__init__.py b/ckanext/dcat/tests/profiles/health_dcat_ap/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py
new file mode 100644
index 00000000..dcb83a07
--- /dev/null
+++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py
@@ -0,0 +1,156 @@
+import pytest
+
+from ckan.tests.helpers import call_action
+from ckanext.dcat.processors import RDFParser
+from ckanext.dcat.tests.utils import BaseParseTest
+
+
+@pytest.mark.usefixtures("with_plugins", "clean_db")
+@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets")
+@pytest.mark.ckan_config(
+    "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml"  # NOTE(review): generic schema, not the healthdcat_ap.yaml added in this patch - confirm
+)
+@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_dcat_ap_3")  # NOTE(review): generic DCAT-AP 3 profile, so the HealthDCAT-AP profile is presumably not exercised - confirm
+class TestSchemingParseSupport(BaseParseTest):
+    def test_e2e_dcat_to_ckan(self):
+        """
+        Parse a DCAT RDF graph into a CKAN dataset dict, create a dataset with
+        package_create and check that all expected fields are there
+        """
+        contents = self._get_file_contents("dataset_health.ttl")  # NOTE(review): the dataset_health.ttl added in this patch has a different dct:title than the one asserted below - confirm which fixture is loaded
+
+        p = RDFParser()
+
+        p.parse(contents, _format="turtle")
+
+        datasets = [d for d in p.datasets()]
+
+        assert len(datasets) == 1
+
+        dataset_dict = datasets[0]
+
+        dataset_dict["name"] = "test-dcat-1"
+        dataset = call_action("package_create", **dataset_dict)
+
+        # Core fields
+
+        assert dataset["title"] == "Zimbabwe Regional Geochemical Survey."
+        assert (
+            dataset["notes"]
+            == "During the period 1982-86 a team of geologists from the British Geological Survey ..."
+        )
+        assert dataset["url"] == "http://dataset.info.org"
+        assert dataset["version"] == "2.3"
+        assert dataset["license_id"] == "cc-nc"
+        assert sorted([t["name"] for t in dataset["tags"]]) == [
+            "exploration",
+            "geochemistry",
+            "geology",
+        ]
+
+        # Standard fields
+        assert dataset["version_notes"] == "New schema added"
+        assert dataset["identifier"] == "9df8df51-63db-37a8-e044-0003ba9b0d98"
+        assert dataset["frequency"] == "http://purl.org/cld/freq/daily"
+        assert dataset["access_rights"] == "public"
+        assert dataset["provenance"] == "Some statement about provenance"
+        assert dataset["dcat_type"] == "test-type"
+
+        assert dataset["issued"] == "2012-05-10"
+        assert dataset["modified"] == "2012-05-10T21:04:00"
+        assert dataset["temporal_resolution"] == "PT15M"
+        assert dataset["spatial_resolution_in_meters"] == "1.5"
+
+        # List fields
+        assert sorted(dataset["conforms_to"]) == ["Standard 1", "Standard 2"]
+        assert sorted(dataset["language"]) == ["ca", "en", "es"]
+        assert sorted(dataset["theme"]) == [
+            "Earth Sciences",
+            "http://eurovoc.europa.eu/100142",
+            "http://eurovoc.europa.eu/209065",
+        ]
+        assert sorted(dataset["alternate_identifier"]) == [
+            "alternate-identifier-1",
+            "alternate-identifier-2",
+        ]
+        assert sorted(dataset["documentation"]) == [
+            "http://dataset.info.org/doc1",
+            "http://dataset.info.org/doc2",
+        ]
+
+        assert sorted(dataset["is_referenced_by"]) == [
+            "https://doi.org/10.1038/sdata.2018.22",
+            "test_isreferencedby",
+        ]
+        assert sorted(dataset["applicable_legislation"]) == [
+            "http://data.europa.eu/eli/reg_impl/2023/138/oj",
+            "http://data.europa.eu/eli/reg_impl/2023/138/oj_alt",
+        ]
+        # Repeating subfields
+
+        assert dataset["contact"][0]["name"] == "Point of Contact"
+        assert dataset["contact"][0]["email"] == "contact@some.org"
+
+        assert (
+            dataset["publisher"][0]["name"] == "Publishing Organization for dataset 1"
+        )
+        assert dataset["publisher"][0]["email"] == "contact@some.org"
+        assert dataset["publisher"][0]["url"] == "http://some.org"
+        assert (
+            dataset["publisher"][0]["type"]
+            == "http://purl.org/adms/publishertype/NonProfitOrganisation"
+        )
+        assert dataset["temporal_coverage"][0]["start"] == "1905-03-01"
+        assert dataset["temporal_coverage"][0]["end"] == "2013-01-05"
+
+        resource = dataset["resources"][0]
+
+        # Resources: core fields
+        assert resource["url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html"
+
+        # Resources: standard fields
+        assert resource["license"] == "http://creativecommons.org/licenses/by-nc/2.0/"
+        assert resource["rights"] == "Some statement about rights"
+        assert resource["issued"] == "2012-05-11"
+        assert resource["modified"] == "2012-05-01T00:04:06"
+        assert resource["temporal_resolution"] == "PT15M"
+        assert resource["spatial_resolution_in_meters"] == 1.5
+        assert resource["status"] == "http://purl.org/adms/status/Completed"
+        assert resource["size"] == 12323
+        assert (
+            resource["availability"]
+            == "http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL"
+        )
+        assert (
+            resource["compress_format"]
+            == "http://www.iana.org/assignments/media-types/application/gzip"
+        )
+        assert (
+            resource["package_format"]
+            == "http://publications.europa.eu/resource/authority/file-type/TAR"
+        )
+
+        assert resource["hash"] == "4304cf2e751e6053c90b1804c89c0ebb758f395a"
+        assert (
+            resource["hash_algorithm"]
+            == "http://spdx.org/rdf/terms#checksumAlgorithm_sha1"
+        )
+
+        assert (
+            resource["access_url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html"
+        )
+        assert "download_url" not in resource
+
+        # Resources: list fields
+        assert sorted(resource["language"]) == ["ca", "en", "es"]
+        assert sorted(resource["documentation"]) == [
+            "http://dataset.info.org/distribution1/doc1",
+            "http://dataset.info.org/distribution1/doc2",
+        ]
+        assert sorted(resource["conforms_to"]) == ["Standard 1", "Standard 2"]
+
+        # Resources: repeating subfields
+        assert resource["access_services"][0]["title"] == "Sparql-end Point"
+        assert resource["access_services"][0]["endpoint_url"] == [
+            "http://publications.europa.eu/webapi/rdf/sparql"
+        ]
diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl
new file mode 100644
index 00000000..fc7d8906
--- /dev/null
+++ b/examples/dcat/dataset_health.ttl
@@ -0,0 +1,327 @@
+@prefix adms: .
+@prefix dcat: .
+@prefix dcatap: .
+@prefix dct: .
+@prefix dqv: .
+@prefix foaf: .
+@prefix locn: .
+@prefix oa: .
+@prefix prov: .
+@prefix rdfs: .
+@prefix skos: .
+@prefix spdx: .
+@prefix vcard: .
+
+
+ a dcat:Resource , dcat:Dataset;
+ dcatap:applicableLegislation ;
+
+ ;
+
+ , ;
+
+ , ;
+
+ [ a foaf:Organization;
+ locn:address [ a locn:Address;
+ locn:adminUnitL1 "http://publications.europa.eu/resource/authority/country/BEL";
+ locn:fullAddress "Galileelaan 5, Bus 2";
+ locn:postCode "1210";
+ locn:postName "Saint-Josse-ten-Noode"
+ ];
+ foaf:homepage ;
+ foaf:mbox ;
+ foaf:name "Belgian Health Data Agency"
+ ];
+
+ , , , ;
+
+ , ;
+
+ "110"^^;
+
+ "0"^^;
+
+ "124866488"^^;
+
+ "8914722"^^;
+
+ "The population targeted by the LINK-VACC project comprises all individuals in Belgium who have received a COVID-19 vaccine, undergone testing for COVID-19, or have been hospitalized with a confirmed diagnosis of COVID-19. The project also considers healthcare professionals and the general Belgian population for understanding vaccination coverage and effectiveness, especially among those with comorbidities and varying socio-economic backgrounds."@en;
+
+ "Sciensano is a research institute and the national public health institute of Belgium.
It is a so-called federal scientific institution that operates under the authority of the federal minister of Public Health and the federal minister of Agriculture of Belgium"@en; + + ; + + [ a dct:PeriodOfTime; + rdfs:comment "As stated in the CSI deliberation"@en; + dcat:endDate "2034-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dct:accessRights ; + dct:accrualPeriodicity ; + dct:alternative "LINK-VACC"@en; + dct:conformsTo ; + dct:creator ; + dct:description "The LINK-VACC project links selected variables from existing registries for COVID-19 vaccine surveillance, in order to ensure the monitoring of COVID- 19 vaccines in the phase following their marketing authorization (post-authorization surveillance). This includes the measurement of uptake and coverage of the vaccination, the estimation of vaccine effectiveness, and continuous monitoring of the vaccine’s safety. For these purposes, existing pseudonymized data on COVID-19 laboratory test results, hospitalized COVID-19 patients, COVID-19 vaccinations, underlying health problems, socio-demographic and -economic factors, and healthcare worker status are linked. | Funding: The project is funded by the Belgian Federal Authorities | Geo coverage: Nuts 3 | Target population: General population | Personal identifier: National identifier | Level of aggregation: Individual | Linkage possible: Yes | Regulations for data sharing: External investigators with a request for selected data should fill in the data request form (https://epistat.wiv-isp.be/datarequest). 
Depending on the type of desired data (anonymous or pseudonymized), the provision of data will have to be assessed by the Belgian Information Security Committee Social Security & Health based on legal and ethical regulations, and is outlined in a data transfer agreement with the data owner (Sciensano)."@en; + dct:identifier "http://fdp2.healthdataportal.eu/dataset/8bb235c9-7bcd-4290-a188-49fe33c2170c"^^; + dct:isPartOf ; + dct:isReferencedBy , ; + dct:issued "2023-01-20T08:51:00Z"^^; + dct:language , , ; + dct:modified "2024-10-09T00:00:00Z"^^; + dct:provenance [ a dct:ProvenanceStatement; + rdfs:label "The data for the LINK-VACC project is sourced from several existing databases, including Vaccinnet+, HealthData COVID-19 database (Contact tracing and Clinic database), CoBRHA, STATBEL, and the AIM database. These databases collectively provide comprehensive demographic, clinical, and socio-economic data relevant to the project's objectives"@en + ]; + dct:publisher [ a foaf:Organization , foaf:Agent; + foaf:homepage ; + foaf:mbox ; + foaf:name "Sciensano" + ]; + dct:relation ; + dct:spatial ; + dct:temporal [ a dct:PeriodOfTime; + dcat:endDate "2024-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dct:title "[Adapted] Linking of registers for COVID-19 vaccine surveillance"@en; + dct:type [ a skos:Concept; + skos:inScheme ; + skos:prefLabel "Personal Data"@en + ]; + adms:identifier ; + adms:sample ; + adms:versionNotes "Dataset continuously updated"@en; + dcat:contactPoint ; + dcat:distribution ; + dcat:hasVersion ; + dcat:keyword "COVID-19"@en , "Vaccination"@en , "Vaccine effectiveness"@en , "Vaccine"@en , "Surveillance"@en , "Post-market authorization"@en , "corona virus"@en , "SARS-CoV-2"@en , "Data linkage"@en , "Public health surveillance"@en; + dcat:landingPage [ a foaf:Document; + rdfs:label "Landing Page for Sciensano"@en; + foaf:homepage + ]; + dcat:spatialResolutionInMeters "10"^^; + dcat:temporalResolution "P1D"^^; + dcat:theme ; + dcat:version "Project 
HDBP0250"; + dqv:hasQualityAnnotation [ a dqv:QualityCertificate; + oa:hasBody ; + oa:hasTarget ; + oa:motivatedBy dqv:qualityAssessment + ]; + prov:qualifiedAttribution ; + prov:wasGeneratedBy ; + foaf:page [ a foaf:Document; + rdfs:label "Landing Page for Sciensano"@en; + foaf:homepage + ]; + + ; + + , , , , ; + + [ a ; + dct:description "The primary objective of Sciensano's LINK-VACC project is to monitor COVID-19 vaccines post-authorization and evaluate the public health value of prioritizing vaccination for people with comorbidities. This involves assessing the vaccines' effectiveness and safety in the broader population context, beyond the limited scope of clinical trials, and determining future vaccination policies in public health emergencies such as epidemics or pandemics"@en + ] . + + + a dcat:Distribution; + dcatap:applicableLegislation ; + dct:format ; + dct:identifier "http://ehelse.healthdataportal.eu/analytics/47f55653-a151-48c1-8d90-940561da6e57"; + dct:isPartOf ; + dct:issued "2024-06-03T08:51:00Z"^^; + dct:license ; + dct:modified "2024-06-04T18:00:00Z"^^; + dct:rights [ a dct:RightsStatement; + rdfs:label "_g_L202C11377"@en , "internalURI:wasGeneratedBy0"@en , "_g_L123C7733"@en + ]; + dct:title "Technical report number of unique study subjects available by environment for project HDBP0250"@en; + dcat:accessURL ; + dcat:downloadURL ; + dcat:mediaType . + + + a dct:MediaType . + + + a foaf:Agent; + foaf:homepage ; + foaf:mbox ; + foaf:name "Sciensano" . + + + a adms:Identifier; + skos:notation "https://www.healthinformationportal.eu/health-information-sources/linking-registers-covid-19-vaccine-surveillance"^^; + adms:schemaAgency "Health Information Portal" . + + + a vcard:Organization , vcard:Kind; + vcard:fn "Sciensano"; + vcard:hasEmail ; + vcard:hasURL ; + vcard:organisationName "Sciensano"; + vcard:organisationUnit "Health Information" . 
+ + + a dcat:CatalogRecord; + dct:creator ; + dct:identifier "16e16149-bf41-42f6-8741-225e8c97a35e"; + dct:issued "2024-10-04T14:28:36Z"^^; + dct:modified "2024-10-09T17:34:28Z"^^; + spdx:checksum [ a spdx:Checksum; + spdx:algorithm spdx:checksumAlgorithm_md5; + spdx:checksumValue "ea77c251b6945e450ae4d66c581495d4" + ]; + foaf:primaryTopic . + + + a ; + dct:description "The protocol of the LINK-VACC project was approved by the medical ethics committee University Hospital Brussels – Vrije Universiteit Brussel (VUB) on 03/02/2021 (reference number 2020/523) and obtained authorization from the Information Security Committee (ISC) Social Security and Health (reference number IVC/KSZG/21/034)."@en; + dct:source , . + + + a dct:LinguisticSystem . + + + a ; + dct:title "ID_TU_STATBEL_POP"@en; + + ; + dcat:keyword "LINK-VACC"@en . + + + a dcat:Distribution; + dcatap:applicableLegislation ; + dct:format ; + dct:identifier "http://ehelse.healthdataportal.eu/sample/fe921169-4619-4386-8bfe-60ea131dbe96"; + dct:isPartOf ; + dct:issued "2024-06-03T08:51:00Z"^^; + dct:language ; + dct:license ; + dct:modified "2024-06-04T18:00:00Z"^^; + dct:rights [ a dct:RightsStatement; + rdfs:label "Free access."@en + ]; + dct:title "Proxy data generating for the EHDS2 Pilot project Sciensano Use Case"@en; + dcat:accessURL ; + dcat:downloadURL ; + dcat:mediaType . + + + a dct:Standard; + dct:alternative "ICD-10-PCS"@en; + dct:identifier "https://www.wikidata.org/wiki/Property:P1690"^^; + dct:title "identifier in the ICD-10-PCS (Procedure Coding System, International Classification of Diseases, 10th revision)"@en; + dcat:version "10th Revision" . + + + a dct:LinguisticSystem . + + + a dct:Standard; + rdfs:label "Fast Healthcare Interoperability Resources"@en . + + + a skos:Concept; + skos:prefLabel "vaccine efficacy"@en . + + + a dct:LinguisticSystem . + + + a skos:Concept; + skos:prefLabel "National Public Health Institute"@en . 
+ + + a dct:Standard; + dct:alternative "ICD-10-CM"@en; + dct:identifier "https://www.wikidata.org/wiki/Property:P4229"^^; + dct:title "identifier in the ICD-10-CM (International Classification of Diseases, 10th Revision, Clinical Modification)"@en; + dcat:version "10th Revision" . + + + a dct:RightsStatement . + + + a dct:Frequency . + + + a prov:Attribution; + dcat:hadRole ; + prov:agent [ a foaf:Organization; + foaf:homepage ; + foaf:mbox ; + foaf:name "Sciensano" + ] . + + + a dct:Location . + + + a skos:Concept; + dct:identifier "https://icd.who.int/browse10/2019/en#/Y59.0"^^; + skos:definition "Viral vaccines"@en; + skos:hasTopConcept ; + skos:notation "Y59.0"; + skos:prefLabel "Viral vaccines"@en . + + + a dct:MediaTypeOrExtent . + + + a dcat:Distribution; + dcatap:applicableLegislation ; + dct:description "Belgian Health Data Agency For better Healthcare, Research & Policy Making"@en; + dct:format ; + dct:identifier "http://ehelse.healthdataportal.eu/distribution/13a3851d-6cdf-4570-a7f0-7f03015d1925"; + dct:isPartOf ; + dct:issued "2024-06-03T08:51:00Z"^^; + dct:license ; + dct:modified "2024-06-04T18:00:00Z"^^; + dct:rights [ a dct:RightsStatement; + rdfs:label "Access to data is conditional on the issuance of a permit by the HDAB after submission of a data request application (English)"@en + ]; + dct:title "Belgian Health Data Agency"@en; + dcat:accessURL ; + dcat:byteSize "80000"^^ . + + + a prov:Activity; + rdfs:label "http://dbpedia.org/resource/Record_linkage"@en; + rdfs:seeAlso ; + dct:type ; + prov:startedAtTime "2021-01-01T00:00:00Z"^^; + prov:wasAssociatedWith [ a prov:Agent; + prov:actedOnBehalfOf [ a prov:Organization , prov:Agent; + foaf:name "Sciensano" + ]; + foaf:homepage ; + foaf:mbox ; + foaf:name "Dr. Joris van Loenhout" + ]; + foaf:page . + + + a ; + + ; + + "Patient death reason\tInformation on wheter the cause of death was COVID-19."@en; + + "CD_COD_COVID" . 
+ + + a skos:Concept; + dct:identifier "https://icd.who.int/browse10/2019/en#/U07.1"^^; + skos:definition "COVID-19, virus identified"@en; + skos:hasTopConcept ; + skos:notation "U07.1"; + skos:prefLabel "COVID-19"@en . + + + a dct:LicenseDocument; + rdfs:label "Creative Commons Attribution–NonCommercial–NoDerivs 3.0 Unported"@en . + + + a skos:Concept; + skos:prefLabel "viral vaccines"@en . From 86b85d2a7163a38b2b88abffda200f8d39456c52 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Tue, 26 Nov 2024 03:32:15 +0100 Subject: [PATCH 04/19] Initial passing unit tests for example dataset --- .../test_euro_health_dcat_ap_profile_parse.py | 225 ++++++++++-------- examples/dcat/dataset_health.ttl | 47 ++-- 2 files changed, 145 insertions(+), 127 deletions(-) diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index dcb83a07..6289eecc 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -1,6 +1,7 @@ +# test import pytest - from ckan.tests.helpers import call_action + from ckanext.dcat.processors import RDFParser from ckanext.dcat.tests.utils import BaseParseTest @@ -17,7 +18,8 @@ def test_e2e_dcat_to_ckan(self): Parse a DCAT RDF graph into a CKAN dataset dict, create a dataset with package_create and check that all expected fields are there """ - contents = self._get_file_contents("dataset_health.ttl") + + contents = self._get_file_contents("dcat/dataset_health.ttl") p = RDFParser() @@ -34,123 +36,142 @@ def test_e2e_dcat_to_ckan(self): # Core fields - assert dataset["title"] == "Zimbabwe Regional Geochemical Survey." 
+ assert ( + dataset["title"] + == "[Adapted] Linking of registers for COVID-19 vaccine surveillance" + ) assert ( dataset["notes"] - == "During the period 1982-86 a team of geologists from the British Geological Survey ..." + == "The LINK-VACC project links selected variables from existing registries..." ) - assert dataset["url"] == "http://dataset.info.org" - assert dataset["version"] == "2.3" - assert dataset["license_id"] == "cc-nc" + # assert dataset["url"] == "http://dataset.info.org" + # assert dataset["version"] == "Project HDBP0250" + # assert dataset["license_id"] == "cc-nc" assert sorted([t["name"] for t in dataset["tags"]]) == [ - "exploration", - "geochemistry", - "geology", + "COVID-19", + "Vaccination", + "Vaccine effectiveness", ] # Standard fields - assert dataset["version_notes"] == "New schema added" - assert dataset["identifier"] == "9df8df51-63db-37a8-e044-0003ba9b0d98" - assert dataset["frequency"] == "http://purl.org/cld/freq/daily" - assert dataset["access_rights"] == "public" - assert dataset["provenance"] == "Some statement about provenance" - assert dataset["dcat_type"] == "test-type" - - assert dataset["issued"] == "2012-05-10" - assert dataset["modified"] == "2012-05-10T21:04:00" - assert dataset["temporal_resolution"] == "PT15M" - assert dataset["spatial_resolution_in_meters"] == "1.5" - - # List fields - assert sorted(dataset["conforms_to"]) == ["Standard 1", "Standard 2"] - assert sorted(dataset["language"]) == ["ca", "en", "es"] - assert sorted(dataset["theme"]) == [ - "Earth Sciences", - "http://eurovoc.europa.eu/100142", - "http://eurovoc.europa.eu/209065", - ] - assert sorted(dataset["alternate_identifier"]) == [ - "alternate-identifier-1", - "alternate-identifier-2", - ] - assert sorted(dataset["documentation"]) == [ - "http://dataset.info.org/doc1", - "http://dataset.info.org/doc2", - ] - - assert sorted(dataset["is_referenced_by"]) == [ - "https://doi.org/10.1038/sdata.2018.22", - "test_isreferencedby", - ] - assert 
sorted(dataset["applicable_legislation"]) == [ - "http://data.europa.eu/eli/reg_impl/2023/138/oj", - "http://data.europa.eu/eli/reg_impl/2023/138/oj_alt", - ] - # Repeating subfields - - assert dataset["contact"][0]["name"] == "Point of Contact" - assert dataset["contact"][0]["email"] == "contact@some.org" - - assert ( - dataset["publisher"][0]["name"] == "Publishing Organization for dataset 1" - ) - assert dataset["publisher"][0]["email"] == "contact@some.org" - assert dataset["publisher"][0]["url"] == "http://some.org" + assert dataset["version_notes"] == "Dataset continuously updated" assert ( - dataset["publisher"][0]["type"] - == "http://purl.org/adms/publishertype/NonProfitOrganisation" + dataset["identifier"] + == "http://fdp2.healthdataportal.eu/dataset/8bb235c9-7bcd-4290-a188-49fe33c2170c" ) - assert dataset["temporal_coverage"][0]["start"] == "1905-03-01" - assert dataset["temporal_coverage"][0]["end"] == "2013-01-05" - - resource = dataset["resources"][0] - - # Resources: core fields - assert resource["url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html" - - # Resources: standard fields - assert resource["license"] == "http://creativecommons.org/licenses/by-nc/2.0/" - assert resource["rights"] == "Some statement about rights" - assert resource["issued"] == "2012-05-11" - assert resource["modified"] == "2012-05-01T00:04:06" - assert resource["temporal_resolution"] == "PT15M" - assert resource["spatial_resolution_in_meters"] == 1.5 - assert resource["status"] == "http://purl.org/adms/status/Completed" - assert resource["size"] == 12323 assert ( - resource["availability"] - == "http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL" + dataset["frequency"] + == "http://publications.europa.eu/resource/authority/frequency/DAILY" ) assert ( - resource["compress_format"] - == "http://www.iana.org/assignments/media-types/application/gzip" + dataset["access_rights"] + == 
"http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC" ) assert ( - resource["package_format"] - == "http://publications.europa.eu/resource/authority/file-type/TAR" + dataset["provenance"] + == "The data for the LINK-VACC project is sourced from..." ) - assert resource["hash"] == "4304cf2e751e6053c90b1804c89c0ebb758f395a" - assert ( - resource["hash_algorithm"] - == "http://spdx.org/rdf/terms#checksumAlgorithm_sha1" - ) + # Hard to map (example uses a blind node which doesn't work well in CKAN) + # assert dataset["dcat_type"] == "test-type" - assert ( - resource["access_url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html" - ) - assert "download_url" not in resource + assert dataset["issued"] == "2023-01-20T08:51:00+00:00" + assert dataset["modified"] == "2024-10-09T00:00:00+00:00" + assert dataset["temporal_resolution"] == "P1D" + assert dataset["spatial_resolution_in_meters"] == "10.0" - # Resources: list fields - assert sorted(resource["language"]) == ["ca", "en", "es"] - assert sorted(resource["documentation"]) == [ - "http://dataset.info.org/distribution1/doc1", - "http://dataset.info.org/distribution1/doc2", + # List fields + assert sorted(dataset["conforms_to"]) == [ + "https://www.wikidata.org/wiki/Q19597236" ] - assert sorted(resource["conforms_to"]) == ["Standard 1", "Standard 2"] - - # Resources: repeating subfields - assert resource["access_services"][0]["title"] == "Sparql-end Point" - assert resource["access_services"][0]["endpoint_url"] == [ - "http://publications.europa.eu/webapi/rdf/sparql" + assert sorted(dataset["language"]) == [ + "http://publications.europa.eu/resource/authority/language/ENG", + "http://publications.europa.eu/resource/authority/language/FRA", + "http://publications.europa.eu/resource/authority/language/NLD", ] + assert sorted(dataset["theme"]) == [ + "http://publications.europa.eu/resource/authority/data-theme/HEAL" + ] + # assert sorted(dataset["alternate_identifier"]) == [ + # "alternate-identifier-1", 
+ # "alternate-identifier-2", + # ] + # assert sorted(dataset["documentation"]) == [ + # "http://dataset.info.org/doc1", + # "http://dataset.info.org/doc2", + # ] + + # <> , ; + assert sorted(dataset["is_referenced_by"]) == [ + "https://doi.org/10.1136/jech-2023-220751", + "https://doi.org/10.1186/s13690-021-00709-x", + ] + assert sorted(dataset["applicable_legislation"]) == [ + "http://data.europa.eu/eli/reg/2022/868/oj", + ] + # Repeating subfields + + assert dataset["contact"][0]["name"] == "Sciensano" + assert dataset["contact"][0]["email"] == "covacsurv@sciensano.be" + + assert dataset["publisher"][0]["name"] == "Sciensano" + assert dataset["publisher"][0]["email"] == "info@sciensano.be" + assert dataset["publisher"][0]["url"] == "https://sciensano.be" + # assert ( + # dataset["publisher"][0]["type"] + # == "http://purl.org/adms/publishertype/NonProfitOrganisation" + # ) + assert dataset["temporal_coverage"][0]["start"] == "2020-03-01" + assert dataset["temporal_coverage"][0]["end"] == "2024-12-31" + + # resource = dataset["resources"][0] + + # # Resources: core fields + # assert resource["url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html" + + # # Resources: standard fields + # assert resource["license"] == "http://creativecommons.org/licenses/by-nc/2.0/" + # assert resource["rights"] == "Some statement about rights" + # assert resource["issued"] == "2012-05-11" + # assert resource["modified"] == "2012-05-01T00:04:06" + # assert resource["temporal_resolution"] == "PT15M" + # assert resource["spatial_resolution_in_meters"] == 1.5 + # assert resource["status"] == "http://purl.org/adms/status/Completed" + # assert resource["size"] == 12323 + # assert ( + # resource["availability"] + # == "http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL" + # ) + # assert ( + # resource["compress_format"] + # == "http://www.iana.org/assignments/media-types/application/gzip" + # ) + # assert ( + # resource["package_format"] + # == 
"http://publications.europa.eu/resource/authority/file-type/TAR" + # ) + + # assert resource["hash"] == "4304cf2e751e6053c90b1804c89c0ebb758f395a" + # assert ( + # resource["hash_algorithm"] + # == "http://spdx.org/rdf/terms#checksumAlgorithm_sha1" + # ) + + # assert ( + # resource["access_url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html" + # ) + # assert "download_url" not in resource + + # # Resources: list fields + # assert sorted(resource["language"]) == ["ca", "en", "es"] + # assert sorted(resource["documentation"]) == [ + # "http://dataset.info.org/distribution1/doc1", + # "http://dataset.info.org/distribution1/doc2", + # ] + # assert sorted(resource["conforms_to"]) == ["Standard 1", "Standard 2"] + + # # Resources: repeating subfields + # assert resource["access_services"][0]["title"] == "Sparql-end Point" + # assert resource["access_services"][0]["endpoint_url"] == [ + # "http://publications.europa.eu/webapi/rdf/sparql" + # ] diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl index fc7d8906..f81a541d 100644 --- a/examples/dcat/dataset_health.ttl +++ b/examples/dcat/dataset_health.ttl @@ -62,7 +62,7 @@ dct:alternative "LINK-VACC"@en; dct:conformsTo ; dct:creator ; - dct:description "The LINK-VACC project links selected variables from existing registries for COVID-19 vaccine surveillance, in order to ensure the monitoring of COVID- 19 vaccines in the phase following their marketing authorization (post-authorization surveillance). This includes the measurement of uptake and coverage of the vaccination, the estimation of vaccine effectiveness, and continuous monitoring of the vaccine’s safety. For these purposes, existing pseudonymized data on COVID-19 laboratory test results, hospitalized COVID-19 patients, COVID-19 vaccinations, underlying health problems, socio-demographic and -economic factors, and healthcare worker status are linked. 
| Funding: The project is funded by the Belgian Federal Authorities | Geo coverage: Nuts 3 | Target population: General population | Personal identifier: National identifier | Level of aggregation: Individual | Linkage possible: Yes | Regulations for data sharing: External investigators with a request for selected data should fill in the data request form (https://epistat.wiv-isp.be/datarequest). Depending on the type of desired data (anonymous or pseudonymized), the provision of data will have to be assessed by the Belgian Information Security Committee Social Security & Health based on legal and ethical regulations, and is outlined in a data transfer agreement with the data owner (Sciensano)."@en; + dct:description "The LINK-VACC project links selected variables from existing registries..."@en; dct:identifier "http://fdp2.healthdataportal.eu/dataset/8bb235c9-7bcd-4290-a188-49fe33c2170c"^^; dct:isPartOf ; dct:isReferencedBy , ; @@ -70,7 +70,7 @@ dct:language , , ; dct:modified "2024-10-09T00:00:00Z"^^; dct:provenance [ a dct:ProvenanceStatement; - rdfs:label "The data for the LINK-VACC project is sourced from several existing databases, including Vaccinnet+, HealthData COVID-19 database (Contact tracing and Clinic database), CoBRHA, STATBEL, and the AIM database. 
These databases collectively provide comprehensive demographic, clinical, and socio-economic data relevant to the project's objectives"@en + rdfs:label "The data for the LINK-VACC project is sourced from..."@en ]; dct:publisher [ a foaf:Organization , foaf:Agent; foaf:homepage ; @@ -92,17 +92,14 @@ adms:sample ; adms:versionNotes "Dataset continuously updated"@en; dcat:contactPoint ; - dcat:distribution ; + # dcat:distribution ; dcat:hasVersion ; - dcat:keyword "COVID-19"@en , "Vaccination"@en , "Vaccine effectiveness"@en , "Vaccine"@en , "Surveillance"@en , "Post-market authorization"@en , "corona virus"@en , "SARS-CoV-2"@en , "Data linkage"@en , "Public health surveillance"@en; - dcat:landingPage [ a foaf:Document; - rdfs:label "Landing Page for Sciensano"@en; - foaf:homepage - ]; + dcat:keyword "COVID-19"@en , "Vaccination"@en , "Vaccine effectiveness"@en; dcat:spatialResolutionInMeters "10"^^; dcat:temporalResolution "P1D"^^; dcat:theme ; - dcat:version "Project HDBP0250"; + # dcat:version is not mapped in ckan + # dcat:version "Project HDBP0250"; dqv:hasQualityAnnotation [ a dqv:QualityCertificate; oa:hasBody ; oa:hasTarget ; @@ -268,22 +265,22 @@ a dct:MediaTypeOrExtent . - - a dcat:Distribution; - dcatap:applicableLegislation ; - dct:description "Belgian Health Data Agency For better Healthcare, Research & Policy Making"@en; - dct:format ; - dct:identifier "http://ehelse.healthdataportal.eu/distribution/13a3851d-6cdf-4570-a7f0-7f03015d1925"; - dct:isPartOf ; - dct:issued "2024-06-03T08:51:00Z"^^; - dct:license ; - dct:modified "2024-06-04T18:00:00Z"^^; - dct:rights [ a dct:RightsStatement; - rdfs:label "Access to data is conditional on the issuance of a permit by the HDAB after submission of a data request application (English)"@en - ]; - dct:title "Belgian Health Data Agency"@en; - dcat:accessURL ; - dcat:byteSize "80000"^^ . 
+# +# a dcat:Distribution; +# dcatap:applicableLegislation ; +# dct:description "Belgian Health Data Agency For better Healthcare, Research & Policy Making"@en; +# dct:format ; +# dct:identifier "http://ehelse.healthdataportal.eu/distribution/13a3851d-6cdf-4570-a7f0-7f03015d1925"; +# dct:isPartOf ; +# dct:issued "2024-06-03T08:51:00Z"^^; +# dct:license ; +# dct:modified "2024-06-04T18:00:00Z"^^; +# dct:rights [ a dct:RightsStatement; +# rdfs:label "Access to data is conditional on the issuance of a permit by the HDAB after submission of a data request application (English)"@en +# ]; +# dct:title "Belgian Health Data Agency"@en; +# dcat:accessURL ; +# dcat:byteSize "80000"^^ . a prov:Activity; From fb6ecc516646f70df817d99851c26be6e6860763 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Tue, 26 Nov 2024 17:39:12 +0100 Subject: [PATCH 05/19] More fields and more tests --- ckanext/dcat/profiles/__init__.py | 1 + ckanext/dcat/profiles/euro_health_dcat_ap.py | 36 +++++++------ ckanext/dcat/schemas/healthdcat_ap.yaml | 54 ++++++++++++++++++- .../test_euro_health_dcat_ap_profile_parse.py | 18 ++++++- examples/dcat/dataset_health.ttl | 2 +- 5 files changed, 91 insertions(+), 20 deletions(-) diff --git a/ckanext/dcat/profiles/__init__.py b/ckanext/dcat/profiles/__init__.py index 6d30a244..668de499 100644 --- a/ckanext/dcat/profiles/__init__.py +++ b/ckanext/dcat/profiles/__init__.py @@ -25,4 +25,5 @@ from .euro_dcat_ap_3 import EuropeanDCATAP3Profile from .dcat_us_3 import DCATUS3Profile from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile +from .euro_health_dcat_ap import EuropeanHealthDCATAPProfile from .schemaorg import SchemaOrgProfile diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index b428528b..65495c3b 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -23,30 +23,39 @@ def parse_dataset(self, dataset_dict, dataset_ref): dataset_dict, 
dataset_ref ) - dataset_dict = self._parse_mandatory_fields(dataset_dict, dataset_ref) + dataset_dict = self._parse_health_fields(dataset_dict, dataset_ref) return dataset_dict - def _parse_mandatory_fields(self, dataset_dict, dataset_ref): - - # Lists for "purpose" and "health theme" + def _parse_health_fields(self, dataset_dict, dataset_ref): for ( key, predicate, ) in ( - ("purpose", HEALTHDCATAP.purpose), + # ("purpose", HEALTHDCATAP.purpose), + ("health_category", HEALTHDCATAP.healthCategory), ("health_theme", HEALTHDCATAP.healthTheme), ): values = self._object_value_list(dataset_ref, predicate) if values: - dataset_dict[key].append(json.dumps(values)) + dataset_dict[key] = values - # Find number of records - number_of_records = self._object_value_int( - dataset_ref, HEALTHDCATAP.numberOfRecords - ) - if number_of_records is not None: - dataset_dict["number_of_records"] = number_of_records + for key, predicate in ( + ("min_typical_age", HEALTHDCATAP.minTypicalAge), + ("max_typical_age", HEALTHDCATAP.maxTypicalAge), + ("number_of_records", HEALTHDCATAP.numberOfRecords), + ): + value = self._object_value_int(dataset_ref, predicate) + # a zero value evaluates as False but is definitely not a None + if value is not None: + dataset_dict[key] = value + + # Purpose is a dpv:Purpose, inside is a dct:Description + + # Add the HDAB. 
There should only ever be one but you never know + agents = self._agents_details(dataset_ref, HEALTHDCATAP.hdab) + if agents: + dataset_dict["hdab"] = agents return dataset_dict @@ -93,6 +102,3 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): def graph_from_catalog(self, catalog_dict, catalog_ref): super().graph_from_catalog(catalog_dict, catalog_ref) - - def __init__(self): - return None diff --git a/ckanext/dcat/schemas/healthdcat_ap.yaml b/ckanext/dcat/schemas/healthdcat_ap.yaml index 58f361eb..92e45d78 100644 --- a/ckanext/dcat/schemas/healthdcat_ap.yaml +++ b/ckanext/dcat/schemas/healthdcat_ap.yaml @@ -265,13 +265,33 @@ dataset_fields: validators: ignore_missing scheming_multiple_text help_text: A free text statement of the purpose of the processing of data or personal data. +- field_name: health_category + label: Health category + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + The health category to which this dataset belongs as described in the Commission Regulation on + the European Health Data Space laying down a list of categories of electronic data for + secondary use, Art.33. + - field_name: health_theme label: Health theme preset: multiple_text validators: ignore_missing scheming_multiple_text help_text: > - The health category to which this dataset belongs as described in the Commission Regulation - on the European Health Data Space laying down a list of categories of electronic data for secondary use, Art.33. + A category of the Dataset or tag describing the Dataset. + +- field_name: min_typical_age + label: Minimum typical age + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Minimum typical age of the population within the dataset. + +- field_name: max_typical_age + label: Maximum typical age + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Maximum typical age of the population within the dataset. 
- field_name: number_of_records label: Number of records @@ -280,6 +300,36 @@ dataset_fields: help_text: Size of the dataset in terms of the number of records. +# Officially there can only be one HDAB for now, but keep it repeating subfield just in case +- field_name: hdab + label: Health data access body + repeating_label: Health data access body + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the HDAB, such as a ROR ID. + help_text: Health Data Access Body supporting access to data in the Member State. + + #- field_name: hvd_category # label: HVD Category # preset: multiple_text diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 6289eecc..7b0c6ef1 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -9,9 +9,9 @@ @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config( - "scheming.dataset_schemas", "ckanext.dcat.schemas:dcat_ap_full.yaml" + "scheming.dataset_schemas", "ckanext.dcat.schemas:healthdcat_ap.yaml" ) -@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_dcat_ap_3") +@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") class TestSchemingParseSupport(BaseParseTest): def test_e2e_dcat_to_ckan(self): """ @@ -124,6 +124,20 @@ def test_e2e_dcat_to_ckan(self): assert dataset["temporal_coverage"][0]["start"] == "2020-03-01" assert 
dataset["temporal_coverage"][0]["end"] == "2024-12-31" + ## HealthDCAT specific + assert sorted(dataset["health_theme"]) == [ + "https://www.wikidata.org/wiki/Q58624061", + "https://www.wikidata.org/wiki/Q7907952", + ] + + assert dataset["hdab"][0]["name"] == "Belgian Health Data Agency" + assert dataset["hdab"][0]["email"] == "info@hda.fgov.be" + assert dataset["hdab"][0]["url"] == "https://www.hda.belgium.be" + + assert dataset["number_of_records"] == "124866488" + assert dataset["min_typical_age"] == "0" + assert dataset["max_typical_age"] == "110" + # resource = dataset["resources"][0] # # Resources: core fields diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl index f81a541d..513d8f57 100644 --- a/examples/dcat/dataset_health.ttl +++ b/examples/dcat/dataset_health.ttl @@ -117,7 +117,7 @@ , , , , ; [ a ; - dct:description "The primary objective of Sciensano's LINK-VACC project is to monitor COVID-19 vaccines post-authorization and evaluate the public health value of prioritizing vaccination for people with comorbidities. This involves assessing the vaccines' effectiveness and safety in the broader population context, beyond the limited scope of clinical trials, and determining future vaccination policies in public health emergencies such as epidemics or pandemics"@en + dct:description "The primary objective of Sciensano's LINK-VACC project is to ..."@en ] . 
From 899ac2ce90b813c2979770e98f21522bad5e2cd4 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Wed, 27 Nov 2024 21:04:28 +0100 Subject: [PATCH 06/19] Additional HealthDCAT-AP fields --- ckanext/dcat/profiles/euro_health_dcat_ap.py | 60 +++++++++++++------ ckanext/dcat/schemas/healthdcat_ap.yaml | 36 ++++++++++- .../test_euro_health_dcat_ap_profile_parse.py | 17 +++++- examples/dcat/dataset_health.ttl | 4 +- 4 files changed, 94 insertions(+), 23 deletions(-) diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 65495c3b..7e469390 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -28,6 +28,45 @@ def parse_dataset(self, dataset_dict, dataset_ref): return dataset_dict def _parse_health_fields(self, dataset_dict, dataset_ref): + self.__parse_healthdcat_stringvalues(dataset_dict, dataset_ref) + + self.__parse_healthdcat_intvalues(dataset_dict, dataset_ref) + + # Purpose is a dpv:Purpose, inside is a dct:Description + pass + + # Add the HDAB. 
There should only ever be one but you never know + agents = self._agents_details(dataset_ref, HEALTHDCATAP.hdab) + if agents: + dataset_dict["hdab"] = agents + + # Retention period + retention_start, retention_end = self._time_interval( + dataset_ref, HEALTHDCATAP.retentionPeriod, dcat_ap_version=2 + ) + retention_dict = {} + if retention_start is not None: + retention_dict["start"] = retention_start + if retention_end is not None: + retention_dict["end"] = retention_end + if retention_dict: + dataset_dict["retention_period"] = [retention_dict] + + return dataset_dict + + def __parse_healthdcat_intvalues(self, dataset_dict, dataset_ref): + for key, predicate in ( + ("min_typical_age", HEALTHDCATAP.minTypicalAge), + ("max_typical_age", HEALTHDCATAP.maxTypicalAge), + ("number_of_records", HEALTHDCATAP.numberOfRecords), + ("number_of_unique_individuals", HEALTHDCATAP.numberOfUniqueIndividuals), + ): + value = self._object_value_int(dataset_ref, predicate) + # A zero value evaluates as False but is definitely not a None + if value is not None: + dataset_dict[key] = value + + def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): for ( key, predicate, @@ -35,30 +74,13 @@ def _parse_health_fields(self, dataset_dict, dataset_ref): # ("purpose", HEALTHDCATAP.purpose), ("health_category", HEALTHDCATAP.healthCategory), ("health_theme", HEALTHDCATAP.healthTheme), + ("population_coverage", HEALTHDCATAP.populationCoverage), + ("publisher_note", HEALTHDCATAP.publisherNote), ): values = self._object_value_list(dataset_ref, predicate) if values: dataset_dict[key] = values - for key, predicate in ( - ("min_typical_age", HEALTHDCATAP.minTypicalAge), - ("max_typical_age", HEALTHDCATAP.maxTypicalAge), - ("number_of_records", HEALTHDCATAP.numberOfRecords), - ): - value = self._object_value_int(dataset_ref, predicate) - # a zero value evaluates as False but is definitely not a None - if value is not None: - dataset_dict[key] = value - - # Purpose is a dpv:Purpose, inside 
is a dct:Description - - # Add the HDAB. There should only ever be one but you never know - agents = self._agents_details(dataset_ref, HEALTHDCATAP.hdab) - if agents: - dataset_dict["hdab"] = agents - - return dataset_dict - def graph_from_dataset(self, dataset_dict, dataset_ref): super().graph_from_dataset(dataset_dict, dataset_ref) diff --git a/ckanext/dcat/schemas/healthdcat_ap.yaml b/ckanext/dcat/schemas/healthdcat_ap.yaml index 92e45d78..42d7957e 100644 --- a/ckanext/dcat/schemas/healthdcat_ap.yaml +++ b/ckanext/dcat/schemas/healthdcat_ap.yaml @@ -297,7 +297,41 @@ dataset_fields: label: Number of records validators: ignore_missing int_validator form_snippet: number.html - help_text: Size of the dataset in terms of the number of records. + help_text: Size of the dataset in terms of the number of records + +- field_name: number_of_unique_individuals + label: Number of records for unique individuals. + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Number of records for unique individuals. + +- field_name: publisher_note + label: Publisher note + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A description of the publisher activities. + +- field_name: population_coverage + label: Population coverage + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A definition of the population within the dataset. + +- field_name: retention_period + label: Retention period + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + + help_text: A temporal period which the dataset is available for secondary use. 
# Officially there can only be one HDAB for now, but keep it repeating subfield just in case diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 7b0c6ef1..46cf3e48 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -134,9 +134,24 @@ def test_e2e_dcat_to_ckan(self): assert dataset["hdab"][0]["email"] == "info@hda.fgov.be" assert dataset["hdab"][0]["url"] == "https://www.hda.belgium.be" - assert dataset["number_of_records"] == "124866488" assert dataset["min_typical_age"] == "0" assert dataset["max_typical_age"] == "110" + assert dataset["number_of_records"] == "124866488" + assert dataset["number_of_unique_individuals"] == "8914722" + + assert dataset["population_coverage"] == [ + "The population targeted by the LINK-VACC project comprises all individuals ..." + ] + assert dataset["publisher_note"] == [ + "Sciensano is a research institute and the national public health institute ..." + ] + + assert dataset["retention_period"] == [ + { + "start": "2020-03-01", + "end": "2034-12-31", + } + ] # resource = dataset["resources"][0] diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl index 513d8f57..b9b580ac 100644 --- a/examples/dcat/dataset_health.ttl +++ b/examples/dcat/dataset_health.ttl @@ -46,9 +46,9 @@ "8914722"^^; - "The population targeted by the LINK-VACC project comprises all individuals in Belgium who have received a COVID-19 vaccine, undergone testing for COVID-19, or have been hospitalized with a confirmed diagnosis of COVID-19. 
The project also considers healthcare professionals and the general Belgian population for understanding vaccination coverage and effectiveness, especially among those with comorbidities and varying socio-economic backgrounds."@en; + "The population targeted by the LINK-VACC project comprises all individuals ..."@en; - "Sciensano is a research institute and the national public health institute of Belgium. It is a so-called federal scientific institution that operates under the authority of the federal minister of Public Health and the federal minister of Agriculture of Belgium"@en; + "Sciensano is a research institute and the national public health institute ..."@en; ; From 221c00243045f2dc9b0f9d8b7d3445a1b0f31118 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Mon, 2 Dec 2024 19:26:03 +0100 Subject: [PATCH 07/19] Fix Wikidata URIs in example so they actually resolve --- .../test_euro_health_dcat_ap_profile_parse.py | 4 +- examples/dcat/dataset_health.ttl | 37 +++---------------- 2 files changed, 8 insertions(+), 33 deletions(-) diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 46cf3e48..19fbd74d 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -126,8 +126,8 @@ def test_e2e_dcat_to_ckan(self): ## HealthDCAT specific assert sorted(dataset["health_theme"]) == [ - "https://www.wikidata.org/wiki/Q58624061", - "https://www.wikidata.org/wiki/Q7907952", + "http://www.wikidata.org/entity/Q58624061", + "http://www.wikidata.org/entity/Q7907952", ] assert dataset["hdab"][0]["name"] == "Belgian Health Data Agency" diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl index b9b580ac..4c16cc60 100644 --- a/examples/dcat/dataset_health.ttl +++ 
b/examples/dcat/dataset_health.ttl @@ -19,8 +19,8 @@ ; , ; - - , ; + , + ; [ a foaf:Organization; locn:address [ a locn:Address; @@ -36,7 +36,7 @@ , , , ; - , ; + , ; "110"^^; @@ -60,7 +60,7 @@ dct:accessRights ; dct:accrualPeriodicity ; dct:alternative "LINK-VACC"@en; - dct:conformsTo ; + dct:conformsTo ; dct:creator ; dct:description "The LINK-VACC project links selected variables from existing registries..."@en; dct:identifier "http://fdp2.healthdataportal.eu/dataset/8bb235c9-7bcd-4290-a188-49fe33c2170c"^^; @@ -204,24 +204,10 @@ dcat:downloadURL ; dcat:mediaType . - - a dct:Standard; - dct:alternative "ICD-10-PCS"@en; - dct:identifier "https://www.wikidata.org/wiki/Property:P1690"^^; - dct:title "identifier in the ICD-10-PCS (Procedure Coding System, International Classification of Diseases, 10th revision)"@en; - dcat:version "10th Revision" . a dct:LinguisticSystem . - - a dct:Standard; - rdfs:label "Fast Healthcare Interoperability Resources"@en . - - - a skos:Concept; - skos:prefLabel "vaccine efficacy"@en . - a dct:LinguisticSystem . @@ -229,13 +215,6 @@ a skos:Concept; skos:prefLabel "National Public Health Institute"@en . - - a dct:Standard; - dct:alternative "ICD-10-CM"@en; - dct:identifier "https://www.wikidata.org/wiki/Property:P4229"^^; - dct:title "identifier in the ICD-10-CM (International Classification of Diseases, 10th Revision, Clinical Modification)"@en; - dcat:version "10th Revision" . - a dct:RightsStatement . @@ -258,7 +237,7 @@ a skos:Concept; dct:identifier "https://icd.who.int/browse10/2019/en#/Y59.0"^^; skos:definition "Viral vaccines"@en; - skos:hasTopConcept ; + skos:hasTopConcept ; skos:notation "Y59.0"; skos:prefLabel "Viral vaccines"@en . @@ -311,14 +290,10 @@ a skos:Concept; dct:identifier "https://icd.who.int/browse10/2019/en#/U07.1"^^; skos:definition "COVID-19, virus identified"@en; - skos:hasTopConcept ; + skos:hasTopConcept ; skos:notation "U07.1"; skos:prefLabel "COVID-19"@en . 
a dct:LicenseDocument; rdfs:label "Creative Commons Attribution–NonCommercial–NoDerivs 3.0 Unported"@en . - - - a skos:Concept; - skos:prefLabel "viral vaccines"@en . From f5b7216dfc156e29fe3017b930927bb72a2f5947 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Mon, 2 Dec 2024 19:25:45 +0100 Subject: [PATCH 08/19] Add coding system attribute --- ckanext/dcat/profiles/euro_health_dcat_ap.py | 1 + ckanext/dcat/schemas/healthdcat_ap.yaml | 7 +++++++ .../test_euro_health_dcat_ap_profile_parse.py | 18 +++++++++++++++++- pyproject.toml | 3 ++- 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 7e469390..ab2fe6fd 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -72,6 +72,7 @@ def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): predicate, ) in ( # ("purpose", HEALTHDCATAP.purpose), + ("coding_system", HEALTHDCATAP.hasCodingSystem), ("health_category", HEALTHDCATAP.healthCategory), ("health_theme", HEALTHDCATAP.healthTheme), ("population_coverage", HEALTHDCATAP.populationCoverage), diff --git a/ckanext/dcat/schemas/healthdcat_ap.yaml b/ckanext/dcat/schemas/healthdcat_ap.yaml index 42d7957e..7688afc5 100644 --- a/ckanext/dcat/schemas/healthdcat_ap.yaml +++ b/ckanext/dcat/schemas/healthdcat_ap.yaml @@ -258,6 +258,13 @@ dataset_fields: validators: ignore_missing scheming_multiple_text help_text: The legislation that mandates the creation or management of the dataset. +- field_name: coding_system + label: Coding system + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + Coding systems in use (e.g. ICD-10-CM, DGRs, SNOMED CT, ...). + To comply with HealthDCAT-AP, Wikidata URIs MUST be used. 
- field_name: purpose label: Purpose diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 19fbd74d..fe89b156 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -80,9 +80,25 @@ def test_e2e_dcat_to_ckan(self): assert dataset["temporal_resolution"] == "P1D" assert dataset["spatial_resolution_in_meters"] == "10.0" + assert sorted(dataset["coding_system"]) == [ + "http://www.wikidata.org/entity/P1690", + "http://www.wikidata.org/entity/P4229", + ] + + # Doesn't really get mapped for some reason + # assert dataset["spatial_coverage"] == [ + # { + # "uri": "http://publications.europa.eu/resource/authority/country/BEL", + # "text": None, + # "geom": None, + # "bbox": None, + # "cent": None, + # } + # ] + # List fields assert sorted(dataset["conforms_to"]) == [ - "https://www.wikidata.org/wiki/Q19597236" + "http://www.wikidata.org/entity/Q19597236" ] assert sorted(dataset["language"]) == [ "http://publications.europa.eu/resource/authority/language/ENG", diff --git a/pyproject.toml b/pyproject.toml index b7634286..80033250 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,5 +61,6 @@ euro_dcat_ap = "ckanext.dcat.profiles:EuropeanDCATAPProfile" euro_dcat_ap_2 = "ckanext.dcat.profiles:EuropeanDCATAP2Profile" euro_dcat_ap_3 = "ckanext.dcat.profiles:EuropeanDCATAP3Profile" euro_dcat_ap_scheming = "ckanext.dcat.profiles:EuropeanDCATAPSchemingProfile" -dcat_us_3="ckanext.dcat.profiles:DCATUS3Profile" +euro_health_dcat_ap = "ckanext.dcat.profiles:EuropeanHealthDCATAPProfile" +dcat_us_3 = "ckanext.dcat.profiles:DCATUS3Profile" schemaorg = "ckanext.dcat.profiles:SchemaOrgProfile" From 73e4c88c5fa742957c3a6228d98ca48f929b364f Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Mon, 2 Dec 2024 22:05:59 +0100 
Subject: [PATCH 09/19] Create initial CKAN JSON data implementing HealthDCAT scheme --- examples/ckan/healthdcat_ap.json | 175 +++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 examples/ckan/healthdcat_ap.json diff --git a/examples/ckan/healthdcat_ap.json b/examples/ckan/healthdcat_ap.json new file mode 100644 index 00000000..46ec76ef --- /dev/null +++ b/examples/ckan/healthdcat_ap.json @@ -0,0 +1,175 @@ +{ + "access_rights": "http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC", + "alternate_identifier": [ + "internalURI:admsIdentifier0" + ], + "applicable_legislation": [ + "http://data.europa.eu/eli/reg/2022/868/oj" + ], + "author": null, + "author_email": null, + "coding_system": [ + "http://www.wikidata.org/entity/P1690", + "http://www.wikidata.org/entity/P4229" + ], + "conforms_to": [ + "http://www.wikidata.org/entity/Q19597236" + ], + "creator_user_id": null, + "dcat_type": "n1049372e768c4429a6b2200c22f5f1a4b7", + "documentation": [ + "n1049372e768c4429a6b2200c22f5f1a4b9" + ], + "frequency": "http://publications.europa.eu/resource/authority/frequency/DAILY", + "health_category": [ + "http://13.81.34.152:1101/resource/authority/healthcategories/PHDR", + "http://13.81.34.152:1101/resource/authority/healthcategories/IDHP", + "http://13.81.34.152:1101/resource/authority/healthcategories/DIOH", + "http://13.81.34.152:1101/resource/authority/healthcategories/EHRS" + ], + "health_theme": [ + "http://www.wikidata.org/entity/Q7907952", + "http://www.wikidata.org/entity/Q58624061" + ], + "id": "e7ccf79d-705c-427f-8e96-f87bcd6e5318", + "identifier": "http://fdp2.healthdataportal.eu/dataset/8bb235c9-7bcd-4290-a188-49fe33c2170c", + "is_referenced_by": [ + "https://doi.org/10.1136/jech-2023-220751", + "https://doi.org/10.1186/s13690-021-00709-x" + ], + "isopen": false, + "issued": "2023-01-20T08:51:00+00:00", + "language": [ + "http://publications.europa.eu/resource/authority/language/ENG", + 
"http://publications.europa.eu/resource/authority/language/NLD", + "http://publications.europa.eu/resource/authority/language/FRA" + ], + "license_id": "", + "license_title": "", + "maintainer": null, + "maintainer_email": null, + "max_typical_age": "110", + "metadata_created": "2024-12-02T19:00:30.897399", + "metadata_modified": "2024-12-02T19:00:30.897406", + "min_typical_age": "0", + "modified": "2024-10-09T00:00:00+00:00", + "name": "test-dcat-1", + "notes": "The LINK-VACC project links selected variables from existing registries...", + "num_resources": 0, + "num_tags": 3, + "number_of_records": "124866488", + "number_of_unique_individuals": "8914722", + "organization": null, + "population_coverage": [ + "The population targeted by the LINK-VACC project comprises all individuals ..." + ], + "private": false, + "provenance": "The data for the LINK-VACC project is sourced from...", + "publisher_note": [ + "Sciensano is a research institute and the national public health institute ..." + ], + "publisher_type": [ + "http: //healthdataportal.eu/resource/authority/publisher-type/nationalPublicHealthInstitute" + ], + "spatial_resolution_in_meters": "10.0", + "state": "active", + "temporal_resolution": "P1D", + "theme": [ + "http://publications.europa.eu/resource/authority/data-theme/HEAL" + ], + "title": "[Adapted] Linking of registers for COVID-19 vaccine surveillance", + "type": "dataset", + "uri": "http://piveau.io/set/data/16e16149-bf41-42f6-8741-225e8c97a35e", + "version_notes": "Dataset continuously updated", + "contact": [ + { + "email": "covacsurv@sciensano.be", + "identifier": "", + "name": "Sciensano", + "uri": "internalURI:contactPoint0" + } + ], + "creator": [ + { + "email": "info@sciensano.be", + "identifier": "", + "name": "Sciensano", + "type": "", + "uri": "internalURI:creator0", + "url": "https://org.belgif.be/id/CbeRegisteredEntity/0693876830" + } + ], + "extras": [ + { + "key": "related_resource", + "value": 
"[\"http://ehelse.healthdataportal.eu/dataset/32987e34-3f23-4e14-b7a3-8a884eb79e51\"]" + }, + { + "key": "sample", + "value": "[\"http: //piveau.io/set/distribution/12f5eb54-a2f7-4549-b4ac-1e1d10003cef\"]" + }, + { + "key": "spatial_uri", + "value": "http://publications.europa.eu/resource/authority/country/BEL" + } + ], + "hdab": [ + { + "email": "info@hda.fgov.be", + "identifier": "", + "name": "Belgian Health Data Agency", + "type": "", + "uri": "", + "url": "https: //www.hda.belgium.be" + } + ], + "publisher": [ + { + "email": "info@sciensano.be", + "identifier": "", + "name": "Sciensano", + "type": "", + "uri": "", + "url": "https://sciensano.be" + } + ], + "retention_period": [ + { + "end": "2034-12-31", + "start": "2020-03-01" + } + ], + "tags": [ + { + "display_name": "COVID-19", + "id": "5c418ec2-cb41-4c42-9b9c-f5d1e3a831e5", + "name": "COVID-19", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "Vaccination", + "id": "c4117ace-2114-470d-b6e9-0df7580a12d8", + "name": "Vaccination", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "Vaccine effectiveness", + "id": "d5a5288d-3bff-431e-be94-12c71d25d75b", + "name": "Vaccine effectiveness", + "state": "active", + "vocabulary_id": null + } + ], + "temporal_coverage": [ + { + "end": "2024-12-31", + "start": "2020-03-01" + } + ], + "resources": [], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] +} \ No newline at end of file From 4e78c478d86c021e42ecd6c1592c62537ae2eca0 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Wed, 4 Dec 2024 17:29:17 +0100 Subject: [PATCH 10/19] Add a whole bunch of test cases --- ckanext/dcat/profiles/base.py | 14 +- ckanext/dcat/profiles/euro_health_dcat_ap.py | 101 +++-- ckanext/dcat/schemas/healthdcat_ap.yaml | 13 + .../test_euro_health_dcat_ap_profile_parse.py | 65 ++-- ...t_euro_health_dcat_ap_profile_serialize.py | 65 ++++ examples/ckan/healthdcat_ap.json | 356 +++++++++--------- 
examples/dcat/dataset_health.ttl | 130 +++---- examples/dcat/dataset_health_no_blank.ttl | 76 ++++ 8 files changed, 516 insertions(+), 304 deletions(-) create mode 100644 ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py create mode 100644 examples/dcat/dataset_health_no_blank.ttl diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 9edb3db6..9beee13c 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -2,16 +2,16 @@ import json from urllib.parse import quote +from ckan.lib.helpers import resource_formats +from ckan.model.license import LicenseRegister +from ckantoolkit import ObjectNotFound, asbool, aslist, config, get_action, url_for from dateutil.parser import parse as parse_date -from rdflib import term, URIRef, BNode, Literal -from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS, ORG -from geomet import wkt, InvalidGeoJSONException +from geomet import InvalidGeoJSONException, wkt +from rdflib import BNode, Literal, URIRef, term +from rdflib.namespace import ORG, RDF, RDFS, SKOS, XSD, Namespace -from ckantoolkit import config, url_for, asbool, aslist, get_action, ObjectNotFound -from ckan.model.license import LicenseRegister -from ckan.lib.helpers import resource_formats from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS -from ckanext.dcat.validators import is_year, is_year_month, is_date +from ckanext.dcat.validators import is_date, is_year, is_year_month CNT = Namespace("http://www.w3.org/2011/content#") DCT = Namespace("http://purl.org/dc/terms/") diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index ab2fe6fd..517b1b3b 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -2,14 +2,23 @@ import json -from rdflib import SKOS, XSD, Literal +from rdflib import RDF, SKOS, XSD, BNode, Literal from rdflib.namespace import Namespace -from 
ckanext.dcat.profiles.base import URIRefOrLiteral +from ckanext.dcat.profiles.base import DCAT, DCT, URIRefOrLiteral from ckanext.dcat.profiles.euro_dcat_ap_3 import EuropeanDCATAP3Profile +# HealthDCAT-AP namespace. Note: not finalized yet HEALTHDCATAP = Namespace("http://healthdataportal.eu/ns/health#") +# Data Privacy Vocabulary namespace +DPV = Namespace("https://w3id.org/dpv#") + +namespaces = { + "healthdcatap": HEALTHDCATAP, + "dpv": DPV, +} + class EuropeanHealthDCATAPProfile(EuropeanDCATAP3Profile): """ @@ -75,8 +84,11 @@ def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): ("coding_system", HEALTHDCATAP.hasCodingSystem), ("health_category", HEALTHDCATAP.healthCategory), ("health_theme", HEALTHDCATAP.healthTheme), + ("legal_basis", DPV.hasLegalBasis), ("population_coverage", HEALTHDCATAP.populationCoverage), ("publisher_note", HEALTHDCATAP.publisherNote), + ("publisher_type", HEALTHDCATAP.publisherType), + ("purpose", DPV.hasPurpose), ): values = self._object_value_list(dataset_ref, predicate) if values: @@ -84,44 +96,89 @@ def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): def graph_from_dataset(self, dataset_dict, dataset_ref): super().graph_from_dataset(dataset_dict, dataset_ref) + for prefix, namespace in namespaces.items(): + self.g.bind(prefix, namespace) + + # g = self.g - g = self.g + # ("coding_system", HEALTHDCATAP.hasCodingSystem), + # ("health_category", HEALTHDCATAP.healthCategory), + # ("health_theme", HEALTHDCATAP.healthTheme), + # ("population_coverage", HEALTHDCATAP.populationCoverage), + # ("publisher_note", HEALTHDCATAP.publisherNote), + # ("publisher_type", HEALTHDCATAP.publisherType), # List items: # - Purpose # - Health theme + + ## key, predicate, fallbacks, _type, _class items = [ - ("purpose", HEALTHDCATAP.purpose, None, URIRefOrLiteral), + ("coding_system", HEALTHDCATAP.hasCodingSystem, None, URIRefOrLiteral), + ("health_category", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral), + 
("health_theme", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral), + ("legal_basis", DPV.hasLegalBasis, None, URIRefOrLiteral), ( - "health_theme", - HEALTHDCATAP.healthTheme, + "population_coverage", + HEALTHDCATAP.populationCoverage, None, URIRefOrLiteral, - SKOS.concept, ), + ("publisher_note", HEALTHDCATAP.publisherNote, None, URIRefOrLiteral), + ("publisher_type", HEALTHDCATAP.publisherType, None, URIRefOrLiteral), + ("purpose", DPV.hasPurpose, None, URIRefOrLiteral), ] self._add_list_triples_from_dict(dataset_dict, dataset_ref, items) - # Number of records - if dataset_dict.get("number_of_records"): + items = [ + ("min_typical_age", HEALTHDCATAP.minTypicalAge), + ("max_typical_age", HEALTHDCATAP.maxTypicalAge), + ("number_of_records", HEALTHDCATAP.numberOfRecords), + ("number_of_unique_individuals", HEALTHDCATAP.numberOfUniqueIndividuals), + ] + for key, predicate in items: + self._add_nonneg_integer_triple(dataset_dict, dataset_ref, key, predicate) + + self._add_agents(dataset_ref, dataset_dict, "hdab", HEALTHDCATAP.hdab) + + def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate): + """ + Adds non-negative integers to the Dataset graph (xsd:nonNegativeInteger) + + dataset_ref: subject of Graph + key: scheming key in CKAN + predicate: predicate to use + """ + value = self._get_dict_value(dataset_dict, key) + + if value: try: - g.add( + if int(value) < 0: + raise ValueError("Not a non-negative integer") + self.g.add( ( dataset_ref, - HEALTHDCATAP.numberOfRecords, - Literal( - dataset_dict["number_of_records"], - datatype=XSD.nonNegativeInteger, - ), + predicate, + Literal(int(value), datatype=XSD.nonNegativeInteger), ) ) except (ValueError, TypeError): - g.add( - ( - dataset_ref, - HEALTHDCATAP.numberOfRecords, - Literal(dataset_dict["number_of_records"]), - ) - ) + self.g.add((dataset_ref, predicate, Literal(value))) + + def _add_timeframe_triple(self, dataset_dict, dataset_ref): + temporal = dataset_dict.get("temporal_coverage") + 
if ( + isinstance(temporal, list) + and len(temporal) + and self._not_empty_dict(temporal[0]) + ): + for item in temporal: + temporal_ref = BNode() + self.g.add((temporal_ref, RDF.type, DCT.PeriodOfTime)) + if item.get("start"): + self._add_date_triple(temporal_ref, DCAT.startDate, item["start"]) + if item.get("end"): + self._add_date_triple(temporal_ref, DCAT.endDate, item["end"]) + self.g.add((dataset_ref, DCT.temporal, temporal_ref)) def graph_from_catalog(self, catalog_dict, catalog_ref): super().graph_from_catalog(catalog_dict, catalog_ref) diff --git a/ckanext/dcat/schemas/healthdcat_ap.yaml b/ckanext/dcat/schemas/healthdcat_ap.yaml index 7688afc5..9fbc7ff2 100644 --- a/ckanext/dcat/schemas/healthdcat_ap.yaml +++ b/ckanext/dcat/schemas/healthdcat_ap.yaml @@ -288,6 +288,12 @@ dataset_fields: help_text: > A category of the Dataset or tag describing the Dataset. +- field_name: legal_basis + label: Legal basis + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legal basis used to justify processing of personal data. + - field_name: min_typical_age label: Minimum typical age validators: ignore_missing int_validator @@ -319,6 +325,13 @@ dataset_fields: help_text: > A description of the publisher activities. +- field_name: publisher_type + label: Publisher type + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A type of organisation that makes the Dataset available. 
+ - field_name: population_coverage label: Population coverage preset: multiple_text diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index fe89b156..60aaf169 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -1,10 +1,16 @@ # test +import json +import logging +from pprint import pprint + import pytest from ckan.tests.helpers import call_action from ckanext.dcat.processors import RDFParser from ckanext.dcat.tests.utils import BaseParseTest +log = logging.getLogger(__name__) + @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @@ -36,29 +42,23 @@ def test_e2e_dcat_to_ckan(self): # Core fields - assert ( - dataset["title"] - == "[Adapted] Linking of registers for COVID-19 vaccine surveillance" - ) + assert dataset["title"] == "HealthDCAT-AP test dataset" assert ( dataset["notes"] - == "The LINK-VACC project links selected variables from existing registries..." 
+ == "This dataset is an example of using HealthDCAT-AP in CKAN" ) # assert dataset["url"] == "http://dataset.info.org" # assert dataset["version"] == "Project HDBP0250" # assert dataset["license_id"] == "cc-nc" assert sorted([t["name"] for t in dataset["tags"]]) == [ - "COVID-19", - "Vaccination", - "Vaccine effectiveness", + "Test 1", + "Test 2", + "Test 3", ] # Standard fields assert dataset["version_notes"] == "Dataset continuously updated" - assert ( - dataset["identifier"] - == "http://fdp2.healthdataportal.eu/dataset/8bb235c9-7bcd-4290-a188-49fe33c2170c" - ) + assert dataset["identifier"] == "http://example.com/dataset/1234567890" assert ( dataset["frequency"] == "http://publications.europa.eu/resource/authority/frequency/DAILY" @@ -69,14 +69,14 @@ def test_e2e_dcat_to_ckan(self): ) assert ( dataset["provenance"] - == "The data for the LINK-VACC project is sourced from..." + == "This example dataset is partly sourced from TEHDAS2" ) # Hard to map (example uses a blind node which doesn't work well in CKAN) # assert dataset["dcat_type"] == "test-type" - assert dataset["issued"] == "2023-01-20T08:51:00+00:00" - assert dataset["modified"] == "2024-10-09T00:00:00+00:00" + assert dataset["issued"] == "2024-01-01T00:00:00+00:00" + assert dataset["modified"] == "2024-12-31T23:59:59+00:00" assert dataset["temporal_resolution"] == "P1D" assert dataset["spatial_resolution_in_meters"] == "10.0" @@ -117,22 +117,22 @@ def test_e2e_dcat_to_ckan(self): # "http://dataset.info.org/doc2", # ] - # <> , ; + # <> , ; assert sorted(dataset["is_referenced_by"]) == [ - "https://doi.org/10.1136/jech-2023-220751", - "https://doi.org/10.1186/s13690-021-00709-x", + "https://doi.org/10.1038/sdata.2016.18", + "https://dx.doi.org/10.1002/jmri.28679", ] assert sorted(dataset["applicable_legislation"]) == [ "http://data.europa.eu/eli/reg/2022/868/oj", ] # Repeating subfields - assert dataset["contact"][0]["name"] == "Sciensano" + assert dataset["contact"][0]["name"] == "Contact Point" assert 
dataset["contact"][0]["email"] == "covacsurv@sciensano.be" - assert dataset["publisher"][0]["name"] == "Sciensano" - assert dataset["publisher"][0]["email"] == "info@sciensano.be" - assert dataset["publisher"][0]["url"] == "https://sciensano.be" + assert dataset["publisher"][0]["name"] == "Contact Point" + assert dataset["publisher"][0]["email"] == "info@example.com" + assert dataset["publisher"][0]["url"] == "https://healthdata.nl" # assert ( # dataset["publisher"][0]["type"] # == "http://purl.org/adms/publishertype/NonProfitOrganisation" @@ -146,21 +146,28 @@ def test_e2e_dcat_to_ckan(self): "http://www.wikidata.org/entity/Q7907952", ] - assert dataset["hdab"][0]["name"] == "Belgian Health Data Agency" - assert dataset["hdab"][0]["email"] == "info@hda.fgov.be" - assert dataset["hdab"][0]["url"] == "https://www.hda.belgium.be" + assert dataset["legal_basis"] == ["https://w3id.org/dpv#Consent"] + + assert dataset["hdab"][0]["name"] == "EU Health Data Access Body" + assert dataset["hdab"][0]["email"] == "hdab@example.com" + assert dataset["hdab"][0]["url"] == "https://www.example.com/hdab" assert dataset["min_typical_age"] == "0" assert dataset["max_typical_age"] == "110" - assert dataset["number_of_records"] == "124866488" - assert dataset["number_of_unique_individuals"] == "8914722" + assert dataset["number_of_records"] == "123456789" + assert dataset["number_of_unique_individuals"] == "7654321" assert dataset["population_coverage"] == [ - "The population targeted by the LINK-VACC project comprises all individuals ..." + "This example includes a very non-descript population" ] assert dataset["publisher_note"] == [ - "Sciensano is a research institute and the national public health institute ..." + "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation." 
] + assert dataset["publisher_type"] == [ + "http://example.com/publisherType/undefined" + ] + + assert dataset["purpose"] == ["https://w3id.org/dpv#AcademicResearch"] assert dataset["retention_period"] == [ { diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py new file mode 100644 index 00000000..64ccd189 --- /dev/null +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py @@ -0,0 +1,65 @@ +import json + +import pytest +from ckan.tests.helpers import call_action +from geomet import wkt +from rdflib import Graph +from rdflib.namespace import RDF +from rdflib.term import URIRef + +from ckanext.dcat import utils +from ckanext.dcat.processors import RDFSerializer +from ckanext.dcat.profiles import ( + ADMS, + DCAT, + DCATAP, + DCT, + FOAF, + GSP, + LOCN, + OWL, + RDF, + RDFS, + SKOS, + SPDX, + VCARD, + XSD, +) +from ckanext.dcat.tests.utils import BaseSerializeTest + +DCAT_AP_PROFILES = ["euro_dcat_ap_3"] + + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", "ckanext.dcat.schemas:healthdcat_ap.yaml" +) +@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") +class TestEuroDCATAP3ProfileSerializeDataset(BaseSerializeTest): + def test_e2e_ckan_to_dcat(self): + dataset_dict = json.loads(self._get_file_contents("ckan/healthdcat_ap.json"))[0] + + dataset = call_action("package_create", **dataset_dict) + + # Make sure schema was used + assert dataset["hdab"][0]["name"] == "EU Health Data Access Body" + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + # Test dataset URI + assert str(dataset_ref) == utils.dataset_uri(dataset) + + # Load Reference graph that only containes + contents = 
self._get_file_contents("dcat/dataset_health_no_blank.ttl") + reference = Graph() + reference.parse(data=contents, format="turtle") + + # First check that all non-blind nodes from the reference are present in the output + assert all(triple in g for triple in reference) + + print(s.g.serialize(format="turtle")) + assert False diff --git a/examples/ckan/healthdcat_ap.json b/examples/ckan/healthdcat_ap.json index 46ec76ef..54779ef1 100644 --- a/examples/ckan/healthdcat_ap.json +++ b/examples/ckan/healthdcat_ap.json @@ -1,175 +1,181 @@ -{ - "access_rights": "http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC", - "alternate_identifier": [ - "internalURI:admsIdentifier0" - ], - "applicable_legislation": [ - "http://data.europa.eu/eli/reg/2022/868/oj" - ], - "author": null, - "author_email": null, - "coding_system": [ - "http://www.wikidata.org/entity/P1690", - "http://www.wikidata.org/entity/P4229" - ], - "conforms_to": [ - "http://www.wikidata.org/entity/Q19597236" - ], - "creator_user_id": null, - "dcat_type": "n1049372e768c4429a6b2200c22f5f1a4b7", - "documentation": [ - "n1049372e768c4429a6b2200c22f5f1a4b9" - ], - "frequency": "http://publications.europa.eu/resource/authority/frequency/DAILY", - "health_category": [ - "http://13.81.34.152:1101/resource/authority/healthcategories/PHDR", - "http://13.81.34.152:1101/resource/authority/healthcategories/IDHP", - "http://13.81.34.152:1101/resource/authority/healthcategories/DIOH", - "http://13.81.34.152:1101/resource/authority/healthcategories/EHRS" - ], - "health_theme": [ - "http://www.wikidata.org/entity/Q7907952", - "http://www.wikidata.org/entity/Q58624061" - ], - "id": "e7ccf79d-705c-427f-8e96-f87bcd6e5318", - "identifier": "http://fdp2.healthdataportal.eu/dataset/8bb235c9-7bcd-4290-a188-49fe33c2170c", - "is_referenced_by": [ - "https://doi.org/10.1136/jech-2023-220751", - "https://doi.org/10.1186/s13690-021-00709-x" - ], - "isopen": false, - "issued": "2023-01-20T08:51:00+00:00", - "language": [ 
- "http://publications.europa.eu/resource/authority/language/ENG", - "http://publications.europa.eu/resource/authority/language/NLD", - "http://publications.europa.eu/resource/authority/language/FRA" - ], - "license_id": "", - "license_title": "", - "maintainer": null, - "maintainer_email": null, - "max_typical_age": "110", - "metadata_created": "2024-12-02T19:00:30.897399", - "metadata_modified": "2024-12-02T19:00:30.897406", - "min_typical_age": "0", - "modified": "2024-10-09T00:00:00+00:00", - "name": "test-dcat-1", - "notes": "The LINK-VACC project links selected variables from existing registries...", - "num_resources": 0, - "num_tags": 3, - "number_of_records": "124866488", - "number_of_unique_individuals": "8914722", - "organization": null, - "population_coverage": [ - "The population targeted by the LINK-VACC project comprises all individuals ..." - ], - "private": false, - "provenance": "The data for the LINK-VACC project is sourced from...", - "publisher_note": [ - "Sciensano is a research institute and the national public health institute ..." 
- ], - "publisher_type": [ - "http: //healthdataportal.eu/resource/authority/publisher-type/nationalPublicHealthInstitute" - ], - "spatial_resolution_in_meters": "10.0", - "state": "active", - "temporal_resolution": "P1D", - "theme": [ - "http://publications.europa.eu/resource/authority/data-theme/HEAL" - ], - "title": "[Adapted] Linking of registers for COVID-19 vaccine surveillance", - "type": "dataset", - "uri": "http://piveau.io/set/data/16e16149-bf41-42f6-8741-225e8c97a35e", - "version_notes": "Dataset continuously updated", - "contact": [ - { - "email": "covacsurv@sciensano.be", - "identifier": "", - "name": "Sciensano", - "uri": "internalURI:contactPoint0" - } - ], - "creator": [ - { - "email": "info@sciensano.be", - "identifier": "", - "name": "Sciensano", - "type": "", - "uri": "internalURI:creator0", - "url": "https://org.belgif.be/id/CbeRegisteredEntity/0693876830" - } - ], - "extras": [ - { - "key": "related_resource", - "value": "[\"http://ehelse.healthdataportal.eu/dataset/32987e34-3f23-4e14-b7a3-8a884eb79e51\"]" - }, - { - "key": "sample", - "value": "[\"http: //piveau.io/set/distribution/12f5eb54-a2f7-4549-b4ac-1e1d10003cef\"]" - }, - { - "key": "spatial_uri", - "value": "http://publications.europa.eu/resource/authority/country/BEL" - } - ], - "hdab": [ - { - "email": "info@hda.fgov.be", - "identifier": "", - "name": "Belgian Health Data Agency", - "type": "", - "uri": "", - "url": "https: //www.hda.belgium.be" - } - ], - "publisher": [ - { - "email": "info@sciensano.be", - "identifier": "", - "name": "Sciensano", - "type": "", - "uri": "", - "url": "https://sciensano.be" - } - ], - "retention_period": [ - { - "end": "2034-12-31", - "start": "2020-03-01" - } - ], - "tags": [ - { - "display_name": "COVID-19", - "id": "5c418ec2-cb41-4c42-9b9c-f5d1e3a831e5", - "name": "COVID-19", - "state": "active", - "vocabulary_id": null - }, - { - "display_name": "Vaccination", - "id": "c4117ace-2114-470d-b6e9-0df7580a12d8", - "name": "Vaccination", - "state": 
"active", - "vocabulary_id": null - }, - { - "display_name": "Vaccine effectiveness", - "id": "d5a5288d-3bff-431e-be94-12c71d25d75b", - "name": "Vaccine effectiveness", - "state": "active", - "vocabulary_id": null - } - ], - "temporal_coverage": [ - { - "end": "2024-12-31", - "start": "2020-03-01" - } - ], - "resources": [], - "groups": [], - "relationships_as_subject": [], - "relationships_as_object": [] -} \ No newline at end of file +[ + { + "access_rights": "http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC", + "alternate_identifier": [ + "internalURI:admsIdentifier0" + ], + "applicable_legislation": [ + "http://data.europa.eu/eli/reg/2022/868/oj" + ], + "author": null, + "author_email": null, + "coding_system": [ + "http://www.wikidata.org/entity/P1690", + "http://www.wikidata.org/entity/P4229" + ], + "conforms_to": [ + "http://www.wikidata.org/entity/Q19597236" + ], + "creator_user_id": null, + "dcat_type": "n1049372e768c4429a6b2200c22f5f1a4b7", + "documentation": [ + "n1049372e768c4429a6b2200c22f5f1a4b9" + ], + "frequency": "http://publications.europa.eu/resource/authority/frequency/DAILY", + "health_category": [ + "http://example.com/ontology/resource/authority/healthcategories/PHDR", + "http://example.com/ontology/resource/authority/healthcategories/IDHP", + "http://example.com/ontology/resource/authority/healthcategories/DIOH", + "http://example.com/ontology/resource/authority/healthcategories/EHRS" + ], + "health_theme": [ + "http://www.wikidata.org/entity/Q7907952", + "http://www.wikidata.org/entity/Q58624061" + ], + "id": "e7ccf79d-705c-427f-8e96-f87bcd6e5318", + "identifier": "http://example.com/dataset/1234567890", + "is_referenced_by": [ + "https://doi.org/10.1038/sdata.2016.18", + "https://dx.doi.org/10.1002/jmri.28679" + ], + "isopen": false, + "issued": "2024-01-01T00:00:00+00:00", + "language": [ + "http://publications.europa.eu/resource/authority/language/ENG", + 
"http://publications.europa.eu/resource/authority/language/NLD", + "http://publications.europa.eu/resource/authority/language/FRA" + ], + "legal_basis": [ + "https://w3id.org/dpv#Consent" + ], + "license_id": "", + "license_title": "", + "maintainer": null, + "maintainer_email": null, + "max_typical_age": "110", + "metadata_created": "2024-12-02T19:00:30.897399", + "metadata_modified": "2024-12-02T19:00:30.897406", + "min_typical_age": "0", + "modified": "2024-12-31T23:59:59+00:00", + "name": "test-dcat-1", + "notes": "This dataset is an example of using HealthDCAT-AP in CKAN", + "num_resources": 0, + "num_tags": 3, + "number_of_records": "123456789", + "number_of_unique_individuals": "7654321", + "organization": null, + "population_coverage": [ + "This example includes a very non-descript population" + ], + "private": false, + "provenance": "This example dataset is partly sourced from TEHDAS2", + "publisher_note": [ + "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation." 
+ ], + "publisher_type": [ + "http://example.com/publisherType/undefined" + ], + "purpose": [ + "https://w3id.org/dpv#AcademicResearch" + ], + "spatial_resolution_in_meters": "10.0", + "state": "active", + "temporal_resolution": "P1D", + "theme": [ + "http://publications.europa.eu/resource/authority/data-theme/HEAL" + ], + "title": "HealthDCAT-AP test dataset", + "type": "dataset", + "uri": "http://example.healthdata.nl/set/dataset", + "version_notes": "Dataset continuously updated", + "contact": [ + { + "email": "covacsurv@sciensano.be", + "identifier": "", + "name": "Contact Point" + } + ], + "creator": [ + { + "email": "info@example.com", + "identifier": "", + "name": "Contact Point", + "type": "", + "url": "https:/example.com/homepage" + } + ], + "extras": [ + { + "key": "related_resource", + "value": "[\"http://example.com/dataset/9876543210\"]" + }, + { + "key": "sample", + "value": "[\"http://piveau.io/set/distribution/12f5eb54-a2f7-4549-b4ac-1e1d10003cef\"]" + }, + { + "key": "spatial_uri", + "value": "http://publications.europa.eu/resource/authority/country/BEL" + } + ], + "hdab": [ + { + "email": "hdab@example.com", + "identifier": "", + "name": "EU Health Data Access Body", + "type": "", + "uri": "", + "url": "https: //www.hda.belgium.be" + } + ], + "publisher": [ + { + "email": "info@example.com", + "identifier": "", + "name": "Contact Point", + "type": "", + "uri": "", + "url": "https://healthdata.nl" + } + ], + "retention_period": [ + { + "end": "2034-12-31", + "start": "2020-03-01" + } + ], + "tags": [ + { + "display_name": "Test 1", + "id": "5c418ec2-cb41-4c42-9b9c-f5d1e3a831e5", + "name": "Test 1", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "Test 2", + "id": "c4117ace-2114-470d-b6e9-0df7580a12d8", + "name": "Test 2", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "Test 3", + "id": "d5a5288d-3bff-431e-be94-12c71d25d75b", + "name": "Test 3", + "state": "active", + "vocabulary_id": null + } + ], + 
"temporal_coverage": [ + { + "end": "2024-12-31", + "start": "2020-03-01" + } + ], + "resources": [], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } +] \ No newline at end of file diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl index 4c16cc60..ca92043a 100644 --- a/examples/dcat/dataset_health.ttl +++ b/examples/dcat/dataset_health.ttl @@ -12,7 +12,7 @@ @prefix spdx: . @prefix vcard: . - + a dcat:Resource , dcat:Dataset; dcatap:applicableLegislation ; @@ -23,18 +23,12 @@ ; [ a foaf:Organization; - locn:address [ a locn:Address; - locn:adminUnitL1 "http://publications.europa.eu/resource/authority/country/BEL"; - locn:fullAddress "Galileelaan 5, Bus 2"; - locn:postCode "1210"; - locn:postName "Saint-Josse-ten-Noode" - ]; - foaf:homepage ; - foaf:mbox ; - foaf:name "Belgian Health Data Agency" + foaf:homepage ; + foaf:mbox ; + foaf:name "EU Health Data Access Body" ]; - , , , ; + , , , ; , ; @@ -42,59 +36,59 @@ "0"^^; - "124866488"^^; + "123456789"^^; - "8914722"^^; + "7654321"^^; - "The population targeted by the LINK-VACC project comprises all individuals ..."@en; + "This example includes a very non-descript population"; - "Sciensano is a research institute and the national public health institute ..."@en; + "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation."; - ; + ; [ a dct:PeriodOfTime; - rdfs:comment "As stated in the CSI deliberation"@en; + rdfs:comment "As stated in the CSI deliberation"; dcat:endDate "2034-12-31"^^; dcat:startDate "2020-03-01"^^ ]; dct:accessRights ; dct:accrualPeriodicity ; - dct:alternative "LINK-VACC"@en; + dct:alternative "TEST-DATASET"; dct:conformsTo ; dct:creator ; - dct:description "The LINK-VACC project links selected variables from existing registries..."@en; - dct:identifier "http://fdp2.healthdataportal.eu/dataset/8bb235c9-7bcd-4290-a188-49fe33c2170c"^^; - dct:isPartOf ; - 
dct:isReferencedBy , ; - dct:issued "2023-01-20T08:51:00Z"^^; + dct:description "This dataset is an example of using HealthDCAT-AP in CKAN"; + dct:identifier "http://example.com/dataset/1234567890"^^; + dct:isPartOf ; + dct:isReferencedBy , ; + dct:issued "2024-01-01T00:00:00Z"^^; dct:language , , ; - dct:modified "2024-10-09T00:00:00Z"^^; + dct:modified "2024-12-31T23:59:59Z"^^; dct:provenance [ a dct:ProvenanceStatement; - rdfs:label "The data for the LINK-VACC project is sourced from..."@en + rdfs:label "This example dataset is partly sourced from TEHDAS2" ]; dct:publisher [ a foaf:Organization , foaf:Agent; - foaf:homepage ; - foaf:mbox ; - foaf:name "Sciensano" + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact Point" ]; - dct:relation ; + dct:relation ; dct:spatial ; dct:temporal [ a dct:PeriodOfTime; dcat:endDate "2024-12-31"^^; dcat:startDate "2020-03-01"^^ ]; - dct:title "[Adapted] Linking of registers for COVID-19 vaccine surveillance"@en; + dct:title "HealthDCAT-AP test dataset"; dct:type [ a skos:Concept; skos:inScheme ; - skos:prefLabel "Personal Data"@en + skos:prefLabel "Personal Data" ]; adms:identifier ; adms:sample ; - adms:versionNotes "Dataset continuously updated"@en; + adms:versionNotes "Dataset continuously updated"; dcat:contactPoint ; # dcat:distribution ; - dcat:hasVersion ; - dcat:keyword "COVID-19"@en , "Vaccination"@en , "Vaccine effectiveness"@en; + dcat:hasVersion ; + dcat:keyword "Test 1" , "Test 2" , "Test 3"; dcat:spatialResolutionInMeters "10"^^; dcat:temporalResolution "P1D"^^; dcat:theme ; @@ -108,17 +102,15 @@ prov:qualifiedAttribution ; prov:wasGeneratedBy ; foaf:page [ a foaf:Document; - rdfs:label "Landing Page for Sciensano"@en; + rdfs:label "Landing Page for Sciensano"; foaf:homepage ]; - ; + ; , , , , ; - [ a ; - dct:description "The primary objective of Sciensano's LINK-VACC project is to ..."@en - ] . + . 
a dcat:Distribution; @@ -130,9 +122,9 @@ dct:license ; dct:modified "2024-06-04T18:00:00Z"^^; dct:rights [ a dct:RightsStatement; - rdfs:label "_g_L202C11377"@en , "internalURI:wasGeneratedBy0"@en , "_g_L123C7733"@en + rdfs:label "_g_L202C11377" , "internalURI:wasGeneratedBy0" , "_g_L123C7733" ]; - dct:title "Technical report number of unique study subjects available by environment for project HDBP0250"@en; + dct:title "Technical report number of unique study subjects available by environment for project HDBP0250"; dcat:accessURL ; dcat:downloadURL ; dcat:mediaType . @@ -142,9 +134,9 @@ a foaf:Agent; - foaf:homepage ; - foaf:mbox ; - foaf:name "Sciensano" . + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact Point" . a adms:Identifier; @@ -153,10 +145,10 @@ a vcard:Organization , vcard:Kind; - vcard:fn "Sciensano"; - vcard:hasEmail ; - vcard:hasURL ; - vcard:organisationName "Sciensano"; + vcard:fn "Contact Point"; + vcard:hasEmail ; + vcard:hasURL ; + vcard:organisationName "Contact Point"; vcard:organisationUnit "Health Information" . @@ -171,20 +163,16 @@ ]; foaf:primaryTopic . - - a ; - dct:description "The protocol of the LINK-VACC project was approved by the medical ethics committee University Hospital Brussels – Vrije Universiteit Brussel (VUB) on 03/02/2021 (reference number 2020/523) and obtained authorization from the Information Security Committee (ISC) Social Security and Health (reference number IVC/KSZG/21/034)."@en; - dct:source , . a dct:LinguisticSystem . a ; - dct:title "ID_TU_STATBEL_POP"@en; + dct:title "ID_TU_STATBEL_POP"; ; - dcat:keyword "LINK-VACC"@en . + dcat:keyword "TEST-DATASET" . a dcat:Distribution; @@ -197,9 +185,9 @@ dct:license ; dct:modified "2024-06-04T18:00:00Z"^^; dct:rights [ a dct:RightsStatement; - rdfs:label "Free access."@en + rdfs:label "Free access." 
]; - dct:title "Proxy data generating for the EHDS2 Pilot project Sciensano Use Case"@en; + dct:title "Proxy data generating for the EHDS2 Pilot project Sciensano Use Case"; dcat:accessURL ; dcat:downloadURL ; dcat:mediaType . @@ -211,9 +199,9 @@ a dct:LinguisticSystem . - + a skos:Concept; - skos:prefLabel "National Public Health Institute"@en . + skos:prefLabel "National Public Health Institute" . a dct:RightsStatement . @@ -227,7 +215,7 @@ prov:agent [ a foaf:Organization; foaf:homepage ; foaf:mbox ; - foaf:name "Sciensano" + foaf:name "Contact Point" ] . @@ -236,10 +224,10 @@ a skos:Concept; dct:identifier "https://icd.who.int/browse10/2019/en#/Y59.0"^^; - skos:definition "Viral vaccines"@en; + skos:definition "Viral vaccines"; skos:hasTopConcept ; skos:notation "Y59.0"; - skos:prefLabel "Viral vaccines"@en . + skos:prefLabel "Viral vaccines" . a dct:MediaTypeOrExtent . @@ -247,7 +235,7 @@ # # a dcat:Distribution; # dcatap:applicableLegislation ; -# dct:description "Belgian Health Data Agency For better Healthcare, Research & Policy Making"@en; +# dct:description "EU Health Data Access Body For better Healthcare, Research & Policy Making"; # dct:format ; # dct:identifier "http://ehelse.healthdataportal.eu/distribution/13a3851d-6cdf-4570-a7f0-7f03015d1925"; # dct:isPartOf ; @@ -255,21 +243,21 @@ # dct:license ; # dct:modified "2024-06-04T18:00:00Z"^^; # dct:rights [ a dct:RightsStatement; -# rdfs:label "Access to data is conditional on the issuance of a permit by the HDAB after submission of a data request application (English)"@en +# rdfs:label "Access to data is conditional on the issuance of a permit by the HDAB after submission of a data request application (English)" # ]; -# dct:title "Belgian Health Data Agency"@en; -# dcat:accessURL ; +# dct:title "EU Health Data Access Body"; +# dcat:accessURL ; # dcat:byteSize "80000"^^ . 
a prov:Activity; - rdfs:label "http://dbpedia.org/resource/Record_linkage"@en; + rdfs:label "http://dbpedia.org/resource/Record_linkage"; rdfs:seeAlso ; dct:type ; prov:startedAtTime "2021-01-01T00:00:00Z"^^; prov:wasAssociatedWith [ a prov:Agent; prov:actedOnBehalfOf [ a prov:Organization , prov:Agent; - foaf:name "Sciensano" + foaf:name "Contact Point" ]; foaf:homepage ; foaf:mbox ; @@ -282,18 +270,18 @@ ; - "Patient death reason\tInformation on wheter the cause of death was COVID-19."@en; + "Patient death reason\tInformation on wheter the cause of death was COVID-19."; "CD_COD_COVID" . a skos:Concept; dct:identifier "https://icd.who.int/browse10/2019/en#/U07.1"^^; - skos:definition "COVID-19, virus identified"@en; + skos:definition "COVID-19, virus identified"; skos:hasTopConcept ; skos:notation "U07.1"; - skos:prefLabel "COVID-19"@en . + skos:prefLabel "Test 1" . a dct:LicenseDocument; - rdfs:label "Creative Commons Attribution–NonCommercial–NoDerivs 3.0 Unported"@en . + rdfs:label "Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported" . diff --git a/examples/dcat/dataset_health_no_blank.ttl b/examples/dcat/dataset_health_no_blank.ttl new file mode 100644 index 00000000..ee627867 --- /dev/null +++ b/examples/dcat/dataset_health_no_blank.ttl @@ -0,0 +1,76 @@ +# This Graph contains no blind nodes, to allow for easy comparison between a generated graph +# The blind nodes can be compared manually + +@prefix adms: . +@prefix dcat: . +@prefix dcatap: . +@prefix dct: . +@prefix dpv: . +@prefix foaf: . +@prefix healthdcatap: . +@prefix rdfs: . +@prefix skos: . +@prefix vcard: . +@prefix xsd: . 
+ + a dcat:Dataset ; +# healthdcatap:hdab [ a foaf:Agent ; +# vcard:hasEmail ; +# foaf:homepage ; +# foaf:name "EU Health Data Access Body" ] ; +# dct:provenance [ a dct:ProvenanceStatement ; +# rdfs:label "This example dataset is partly sourced from TEHDAS2" ] ; +# dct:publisher [ a foaf:Agent ; +# vcard:hasEmail ; +# foaf:homepage ; +# foaf:name "Contact Point" ] ; +# dct:temporal [ a dct:PeriodOfTime ; +# dcat:endDate "2024-12-31"^^xsd:date ; +# dcat:startDate "2020-03-01"^^xsd:date ] ; +# adms:identifier [ a adms:Identifier ; +# skos:notation "internalURI:admsIdentifier0" ] ; +# dcat:contactPoint [ a vcard:Kind ; +# vcard:fn "Contact Point" ; +# vcard:hasEmail ] ; + dcatap:applicableLegislation ; + healthdcatap:hasCodingSystem , + ; + healthdcatap:healthCategory , + , + , + , + , + ; + healthdcatap:maxTypicalAge "110"^^xsd:nonNegativeInteger ; + healthdcatap:minTypicalAge "0"^^xsd:nonNegativeInteger ; + healthdcatap:numberOfRecords "123456789"^^xsd:nonNegativeInteger ; + healthdcatap:numberOfUniqueIndividuals "7654321"^^xsd:nonNegativeInteger ; + healthdcatap:populationCoverage "This example includes a very non-descript population" ; + healthdcatap:publisherNote "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation." 
; + healthdcatap:publisherType ; + dct:accessRights ; + dct:accrualPeriodicity ; + dct:conformsTo ; + dct:description "This dataset is an example of using HealthDCAT-AP in CKAN" ; + dct:identifier ; + dct:isReferencedBy , + ; + dct:issued "2024-01-01T00:00:00+00:00"^^xsd:dateTime ; + dct:language , + , + ; + dct:modified "2024-12-31T23:59:59+00:00"^^xsd:dateTime ; + dct:relation ; + dct:title "HealthDCAT-AP test dataset" ; + dct:type "n1049372e768c4429a6b2200c22f5f1a4b7" ; + adms:sample "http://piveau.io/set/distribution/12f5eb54-a2f7-4549-b4ac-1e1d10003cef" ; + adms:versionNotes "Dataset continuously updated" ; + dcat:keyword "Test 1", + "Test 2", + "Test 3" ; + dcat:spatialResolutionInMeters 10.0 ; + dcat:temporalResolution "P1D"^^xsd:duration ; + dcat:theme ; + foaf:page "n1049372e768c4429a6b2200c22f5f1a4b9" ; + dpv:hasLegalBasis dpv:Consent ; + dpv:hasPurpose dpv:AcademicResearch . From e4bcca7705caec49863775f4bdf9ba1bccda3a10 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Wed, 4 Dec 2024 20:35:14 +0100 Subject: [PATCH 11/19] Implemented code values, qualified relations and analytics --- ckanext/dcat/profiles/euro_health_dcat_ap.py | 89 ++++++++++++++++++- ckanext/dcat/schemas/healthdcat_ap.yaml | 38 ++++++-- .../test_euro_health_dcat_ap_profile_parse.py | 25 ++++-- ...t_euro_health_dcat_ap_profile_serialize.py | 41 ++++++++- examples/ckan/healthdcat_ap.json | 19 +++- examples/dcat/dataset_health.ttl | 18 ++-- examples/dcat/dataset_health_no_blank.ttl | 8 +- 7 files changed, 208 insertions(+), 30 deletions(-) diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 517b1b3b..5896a71d 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -2,10 +2,10 @@ import json -from rdflib import RDF, SKOS, XSD, BNode, Literal +from rdflib import RDF, SKOS, XSD, BNode, Literal, term from rdflib.namespace import Namespace -from ckanext.dcat.profiles.base import 
DCAT, DCT, URIRefOrLiteral +from ckanext.dcat.profiles.base import DCAT, DCT, CleanedURIRef, URIRefOrLiteral from ckanext.dcat.profiles.euro_dcat_ap_3 import EuropeanDCATAP3Profile # HealthDCAT-AP namespace. Note: not finalized yet @@ -49,6 +49,11 @@ def _parse_health_fields(self, dataset_dict, dataset_ref): if agents: dataset_dict["hdab"] = agents + # Add any qualifiedRelations + qual_relations = self._relationship_details(dataset_ref, DCAT.qualifiedRelation) + if qual_relations: + dataset_dict["qualified_relation"] = qual_relations + # Retention period retention_start, retention_end = self._time_interval( dataset_ref, HEALTHDCATAP.retentionPeriod, dcat_ap_version=2 @@ -80,7 +85,8 @@ def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): key, predicate, ) in ( - # ("purpose", HEALTHDCATAP.purpose), + ("analytics", HEALTHDCATAP.analytics), + ("code_values", HEALTHDCATAP.hasCodeValues), ("coding_system", HEALTHDCATAP.hasCodingSystem), ("health_category", HEALTHDCATAP.healthCategory), ("health_theme", HEALTHDCATAP.healthTheme), @@ -113,6 +119,8 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): ## key, predicate, fallbacks, _type, _class items = [ + ("analytics", HEALTHDCATAP.analytics, None, URIRefOrLiteral), + ("code_values", HEALTHDCATAP.hasCodeValues, None, URIRefOrLiteral), ("coding_system", HEALTHDCATAP.hasCodingSystem, None, URIRefOrLiteral), ("health_category", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral), ("health_theme", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral), @@ -139,6 +147,9 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): self._add_nonneg_integer_triple(dataset_dict, dataset_ref, key, predicate) self._add_agents(dataset_ref, dataset_dict, "hdab", HEALTHDCATAP.hdab) + self._add_relationship( + dataset_ref, dataset_dict, "qualified_relation", DCAT.qualifiedRelation + ) def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate): """ @@ -180,5 +191,77 @@ def 
_add_timeframe_triple(self, dataset_dict, dataset_ref): self._add_date_triple(temporal_ref, DCAT.endDate, item["end"]) self.g.add((dataset_ref, DCT.temporal, temporal_ref)) + def _relationship_details(self, subject, predicate): + """ + Returns a list of dicts with details about a dcat:Relationship property, e.g. + dcat:qualifiedRelation + + Both subject and predicate must be rdflib URIRef or BNode objects + + Returns keys for uri, role, and relation with the values set to + an empty string if they could not be found. + """ + + relations = [] + for relation in self.g.objects(subject, predicate): + relation_details = {} + relation_details["uri"] = ( + str(relation) if isinstance(relation, term.URIRef) else "" + ) + relation_details["role"] = self._object_value(relation, DCAT.hadRole) + relation_details["relation"] = self._object_value(relation, DCT.relation) + relations.append(relation_details) + + return relations + + def _add_relationship( + self, + dataset_ref, + dataset_dict, + relation_key, + rdf_predicate, + ): + """ + Adds one or more Relationships to the RDF graph. + + :param dataset_ref: The RDF reference of the dataset + :param dataset_dict: The dataset dictionary containing agent information + :param relation_key: field name in the CKAN dict (.e.g. 
"qualifiedRelation") + :param rdf_predicate: The RDF predicate (DCAT.qualifiedRelation) + """ + relation = dataset_dict.get(relation_key) + if ( + isinstance(relation, list) + and len(relation) + and self._not_empty_dict(relation[0]) + ): + relations = relation + + for relation in relations: + + agent_uri = relation.get("uri") + if agent_uri: + agent_ref = CleanedURIRef(agent_uri) + else: + agent_ref = BNode() + + self.g.add((agent_ref, DCT.type, DCAT.Relationship)) + self.g.add((dataset_ref, rdf_predicate, agent_ref)) + + self._add_triple_from_dict( + relation, + agent_ref, + DCT.relation, + "relation", + _type=URIRefOrLiteral, + ) + self._add_triple_from_dict( + relation, + agent_ref, + DCAT.hadRole, + "role", + _type=URIRefOrLiteral, + ) + def graph_from_catalog(self, catalog_dict, catalog_ref): super().graph_from_catalog(catalog_dict, catalog_ref) diff --git a/ckanext/dcat/schemas/healthdcat_ap.yaml b/ckanext/dcat/schemas/healthdcat_ap.yaml index 9fbc7ff2..20320ecf 100644 --- a/ckanext/dcat/schemas/healthdcat_ap.yaml +++ b/ckanext/dcat/schemas/healthdcat_ap.yaml @@ -252,12 +252,29 @@ dataset_fields: validators: ignore_missing scheming_multiple_text help_text: A related resource, such as a publication, that references, cites, or otherwise points to the dataset. +- field_name: analytics + label: Analytics + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + An analytics distribution of the dataset. + Publishers are encouraged to provide URLs pointing to API endpoints or document + repositories where users can access or request associated resources such as + technical reports of the dataset, quality measurements, usability indicators,... + or analytics services. + - field_name: applicable_legislation label: Applicable legislation preset: multiple_text validators: ignore_missing scheming_multiple_text help_text: The legislation that mandates the creation or management of the dataset. 
+- field_name: code_values + label: Code values + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Health classifications and their codes associated with the dataset. + - field_name: coding_system label: Coding system preset: multiple_text @@ -383,12 +400,23 @@ dataset_fields: help_text: Unique identifier for the HDAB, such as a ROR ID. help_text: Health Data Access Body supporting access to data in the Member State. +- field_name: qualified_relation + label: Qualified relation + repeating_label: Relationship + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: relation + label: Relation + help_text: The resource related to the source resource. -#- field_name: hvd_category -# label: HVD Category -# preset: multiple_text -# validators: ignore_missing scheming_multiple_text -# TODO: implement separately as part of wider HVD support + - field_name: role + label: Role + help_text: The function of an entity or agent with respect to another entity or resource. + help_text: A description of a relationship with another resource. 
# Note: if not provided, this will be autogenerated - field_name: uri diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 60aaf169..d7f7f98c 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -72,14 +72,18 @@ def test_e2e_dcat_to_ckan(self): == "This example dataset is partly sourced from TEHDAS2" ) - # Hard to map (example uses a blind node which doesn't work well in CKAN) + # Hard to map (example uses a blank node which doesn't work well in CKAN) # assert dataset["dcat_type"] == "test-type" assert dataset["issued"] == "2024-01-01T00:00:00+00:00" assert dataset["modified"] == "2024-12-31T23:59:59+00:00" assert dataset["temporal_resolution"] == "P1D" - assert dataset["spatial_resolution_in_meters"] == "10.0" + assert dataset["analytics"] == ["http://example.com/analytics"] + assert sorted(dataset["code_values"]) == [ + "http://example.com/code1", + "http://example.com/code2", + ] assert sorted(dataset["coding_system"]) == [ "http://www.wikidata.org/entity/P1690", "http://www.wikidata.org/entity/P4229", @@ -128,15 +132,22 @@ def test_e2e_dcat_to_ckan(self): # Repeating subfields assert dataset["contact"][0]["name"] == "Contact Point" - assert dataset["contact"][0]["email"] == "covacsurv@sciensano.be" + assert dataset["contact"][0]["email"] == "contact@example.com" assert dataset["publisher"][0]["name"] == "Contact Point" assert dataset["publisher"][0]["email"] == "info@example.com" assert dataset["publisher"][0]["url"] == "https://healthdata.nl" - # assert ( - # dataset["publisher"][0]["type"] - # == "http://purl.org/adms/publishertype/NonProfitOrganisation" - # ) + + assert len(dataset["qualified_relation"]) == 1 + assert ( + dataset["qualified_relation"][0]["relation"] + == 
"http://example.com/dataset/3.141592" + ) + assert ( + dataset["qualified_relation"][0]["role"] + == "http://www.iana.org/assignments/relation/related" + ) + assert dataset["temporal_coverage"][0]["start"] == "2020-03-01" assert dataset["temporal_coverage"][0]["end"] == "2024-12-31" diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py index 64ccd189..26a6f783 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py @@ -25,6 +25,7 @@ VCARD, XSD, ) +from ckanext.dcat.profiles.euro_health_dcat_ap import HEALTHDCATAP from ckanext.dcat.tests.utils import BaseSerializeTest DCAT_AP_PROFILES = ["euro_dcat_ap_3"] @@ -38,6 +39,12 @@ @pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") class TestEuroDCATAP3ProfileSerializeDataset(BaseSerializeTest): def test_e2e_ckan_to_dcat(self): + """ + End to end testing of CKAN dataset to RDF triples. + + Note: in this HealthDCAT-AP profile, only the HealthDCAT-AP specific triples are tested for. + Triples in other profiles could be tested, but should mainly be tested by their respective + profiles.""" dataset_dict = json.loads(self._get_file_contents("ckan/healthdcat_ap.json"))[0] dataset = call_action("package_create", **dataset_dict) @@ -58,8 +65,36 @@ def test_e2e_ckan_to_dcat(self): reference = Graph() reference.parse(data=contents, format="turtle") + print(s.g.serialize(format="turtle")) # First check that all non-blind nodes from the reference are present in the output - assert all(triple in g for triple in reference) + # Any other nodes added by other profiles (e.g. 
DCAT-AP 3) we do not have an opinion about + for triple in reference: + assert triple in g, f"Triple {triple} not in output graph" + # assert all(triple in g for triple in reference) - print(s.g.serialize(format="turtle")) - assert False + # Test HealthDCAT-AP specific HDAB triples + # We can assume other blank nodes (e.g. contact point, publisher, temporal) are taken care + # of by the base profile. + hdab = [t for t in g.triples((dataset_ref, HEALTHDCATAP.hdab, None))] + assert len(hdab) == 1 + hdab_items = [ + (FOAF.name, dataset_dict["hdab"][0]["name"]), + (VCARD.hasEmail, URIRef("mailto:" + dataset_dict["hdab"][0]["email"])), + (FOAF.homepage, URIRef(dataset_dict["hdab"][0]["url"])), + ] + for predicate, value in hdab_items: + assert self._triple( + g, hdab[0][2], predicate, value + ), f"HDAB Predicate {predicate} does not have value {value}" + + # Test qualified relation + relation = [t for t in g.triples((dataset_ref, DCAT.qualifiedRelation, None))] + assert len(relation) == 1 + relation_items = [ + (DCT.relation, URIRef(dataset_dict["qualified_relation"][0]["relation"])), + (DCAT.hadRole, URIRef(dataset_dict["qualified_relation"][0]["role"])), + ] + for predicate, value in relation_items: + assert self._triple( + g, relation[0][2], predicate, value + ), f"relation Predicate {predicate} does not have value {value}" diff --git a/examples/ckan/healthdcat_ap.json b/examples/ckan/healthdcat_ap.json index 54779ef1..21e502d6 100644 --- a/examples/ckan/healthdcat_ap.json +++ b/examples/ckan/healthdcat_ap.json @@ -1,6 +1,9 @@ [ { "access_rights": "http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC", + "analytics": [ + "http://example.com/analytics" + ], "alternate_identifier": [ "internalURI:admsIdentifier0" ], @@ -9,6 +12,10 @@ ], "author": null, "author_email": null, + "code_values": [ + "http://example.com/code1", + "http://example.com/code2" + ], "coding_system": [ "http://www.wikidata.org/entity/P1690", 
"http://www.wikidata.org/entity/P4229" @@ -78,7 +85,13 @@ "purpose": [ "https://w3id.org/dpv#AcademicResearch" ], - "spatial_resolution_in_meters": "10.0", + "qualified_relation": [ + { + "uri": "", + "relation": "http://example.com/dataset/3.141592", + "role": "http://www.iana.org/assignments/relation/related" + } + ], "state": "active", "temporal_resolution": "P1D", "theme": [ @@ -111,7 +124,7 @@ }, { "key": "sample", - "value": "[\"http://piveau.io/set/distribution/12f5eb54-a2f7-4549-b4ac-1e1d10003cef\"]" + "value": "[\"http://example.com/sample\"]" }, { "key": "spatial_uri", @@ -125,7 +138,7 @@ "name": "EU Health Data Access Body", "type": "", "uri": "", - "url": "https: //www.hda.belgium.be" + "url": "https://www.example.com/hdab" } ], "publisher": [ diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl index ca92043a..8b47f7e3 100644 --- a/examples/dcat/dataset_health.ttl +++ b/examples/dcat/dataset_health.ttl @@ -16,9 +16,10 @@ a dcat:Resource , dcat:Dataset; dcatap:applicableLegislation ; - ; + ; - , ; + , + ; , ; @@ -72,6 +73,11 @@ foaf:name "Contact Point" ]; dct:relation ; + dcat:qualifiedRelation [ + a dcat:Relationship ; + dct:relation ; + dcat:hadRole + ]; dct:spatial ; dct:temporal [ a dct:PeriodOfTime; dcat:endDate "2024-12-31"^^; @@ -83,7 +89,7 @@ skos:prefLabel "Personal Data" ]; adms:identifier ; - adms:sample ; + adms:sample ; adms:versionNotes "Dataset continuously updated"; dcat:contactPoint ; # dcat:distribution ; @@ -92,7 +98,7 @@ dcat:spatialResolutionInMeters "10"^^; dcat:temporalResolution "P1D"^^; dcat:theme ; - # dcat:version is not mapped in ckan + # dcat:version is not mapped in ckan and should be hasVersion # dcat:version "Project HDBP0250"; dqv:hasQualityAnnotation [ a dqv:QualityCertificate; oa:hasBody ; @@ -112,7 +118,7 @@ . - + a dcat:Distribution; dcatap:applicableLegislation ; dct:format ; @@ -174,7 +180,7 @@ ; dcat:keyword "TEST-DATASET" . 
- + a dcat:Distribution; dcatap:applicableLegislation ; dct:format ; diff --git a/examples/dcat/dataset_health_no_blank.ttl b/examples/dcat/dataset_health_no_blank.ttl index ee627867..e8021267 100644 --- a/examples/dcat/dataset_health_no_blank.ttl +++ b/examples/dcat/dataset_health_no_blank.ttl @@ -1,4 +1,4 @@ -# This Graph contains no blind nodes, to allow for easy comparison between a generated graph +# This Graph contains no blank nodes, to allow for easy comparison between a generated graph # The blind nodes can be compared manually @prefix adms: . @@ -33,6 +33,9 @@ # vcard:fn "Contact Point" ; # vcard:hasEmail ] ; dcatap:applicableLegislation ; + healthdcatap:analytics ; + healthdcatap:hasCodeValues , + ; healthdcatap:hasCodingSystem , ; healthdcatap:healthCategory , @@ -63,12 +66,11 @@ dct:relation ; dct:title "HealthDCAT-AP test dataset" ; dct:type "n1049372e768c4429a6b2200c22f5f1a4b7" ; - adms:sample "http://piveau.io/set/distribution/12f5eb54-a2f7-4549-b4ac-1e1d10003cef" ; + adms:sample ; adms:versionNotes "Dataset continuously updated" ; dcat:keyword "Test 1", "Test 2", "Test 3" ; - dcat:spatialResolutionInMeters 10.0 ; dcat:temporalResolution "P1D"^^xsd:duration ; dcat:theme ; foaf:page "n1049372e768c4429a6b2200c22f5f1a4b9" ; From d90a4c8509c593edc16e8e9ad3ca97d387fdc29d Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Wed, 4 Dec 2024 15:03:48 +0100 Subject: [PATCH 12/19] Add URL property to contactPoint (VCARD.hasURL) --- ckanext/dcat/profiles/base.py | 7 ++++--- ckanext/dcat/profiles/euro_dcat_ap_base.py | 4 ++-- ckanext/dcat/profiles/euro_dcat_ap_scheming.py | 7 +++++++ ckanext/dcat/schemas/dcat_ap_full.yaml | 5 ++++- ckanext/dcat/schemas/dcat_ap_multilingual.yaml | 5 ++++- ckanext/dcat/schemas/dcat_ap_recommended.yaml | 4 ++++ ckanext/dcat/schemas/dcat_us_full.yaml | 4 ++++ ckanext/dcat/schemas/dcat_us_recommended.yaml | 4 ++++ .../profiles/dcat_ap_3/test_euro_dcatap_3_profile_parse.py | 1 + .../dcat_ap_3/test_euro_dcatap_3_profile_serialize.py | 3 
+++ examples/ckan/ckan_full_dataset_dcat_ap.json | 3 ++- examples/dcat/dataset.rdf | 1 + 12 files changed, 40 insertions(+), 8 deletions(-) diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 9beee13c..a93eeb5c 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -575,6 +575,9 @@ def _contact_details(self, subject, predicate): ) contact["identifier"] = self._get_vcard_property_value(agent, VCARD.hasUID) + + contact["url"] = self._get_vcard_property_value(agent, VCARD.hasURL) + contacts.append(contact) return contacts @@ -820,9 +823,7 @@ def _add_spatial_value_to_graph(self, spatial_ref, predicate, value): or object. """ spatial_formats = aslist( - config.get( - "ckanext.dcat.output_spatial_format", DEFAULT_SPATIAL_FORMATS - ) + config.get("ckanext.dcat.output_spatial_format", DEFAULT_SPATIAL_FORMATS) ) if isinstance(value, str): diff --git a/ckanext/dcat/profiles/euro_dcat_ap_base.py b/ckanext/dcat/profiles/euro_dcat_ap_base.py index 2356a2d4..e42c6afb 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_base.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_base.py @@ -144,12 +144,12 @@ def _parse_dataset_base(self, dataset_dict, dataset_ref): contact = self._contact_details(dataset_ref, ADMS.contactPoint) if contact: contact = contact[0] - for key in ("uri", "name", "email", "identifier"): + for key in ("uri", "name", "email", "identifier", "url"): if contact.get(key): dataset_dict["extras"].append( { "key": "contact_{0}".format(key), - "value": contact.get(key) + "value": contact.get(key), } ) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index ca72cd21..b8402a65 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -168,6 +168,13 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): "identifier", _type=URIRefOrLiteral, ) + self._add_triple_from_dict( + item, + 
contact_details, + VCARD.hasURL, + "url", + _type=URIRef, + ) self._add_agents(dataset_ref, dataset_dict, "publisher", DCT.publisher) self._add_agents(dataset_ref, dataset_dict, "creator", DCT.creator) diff --git a/ckanext/dcat/schemas/dcat_ap_full.yaml b/ckanext/dcat/schemas/dcat_ap_full.yaml index 82ee2dfd..0126dce4 100644 --- a/ckanext/dcat/schemas/dcat_ap_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_full.yaml @@ -46,7 +46,10 @@ dataset_fields: - field_name: identifier label: Identifier help_text: Unique identifier for the contact point. Such as a ROR ID. - + + - field_name: url + label: URL + help_text: A URL associated with the contact help_text: Contact information for enquiries about the dataset. - field_name: publisher diff --git a/ckanext/dcat/schemas/dcat_ap_multilingual.yaml b/ckanext/dcat/schemas/dcat_ap_multilingual.yaml index 63c07c7a..98e9ffb4 100644 --- a/ckanext/dcat/schemas/dcat_ap_multilingual.yaml +++ b/ckanext/dcat/schemas/dcat_ap_multilingual.yaml @@ -104,7 +104,10 @@ dataset_fields: - field_name: identifier label: Identifier help_text: Unique identifier for the contact point. Such as a ROR ID. - + + - field_name: url + label: URL + help_text: A URL associated with the contact help_text: Contact information for enquiries about the dataset. - field_name: license_id diff --git a/ckanext/dcat/schemas/dcat_ap_recommended.yaml b/ckanext/dcat/schemas/dcat_ap_recommended.yaml index ca38ab16..3ab995dd 100644 --- a/ckanext/dcat/schemas/dcat_ap_recommended.yaml +++ b/ckanext/dcat/schemas/dcat_ap_recommended.yaml @@ -42,6 +42,10 @@ dataset_fields: - field_name: email label: Email display_snippet: email.html + + - field_name: url + label: URL + help_text: A URL associated with the contact help_text: Contact information for enquiries about the dataset. 
- field_name: publisher diff --git a/ckanext/dcat/schemas/dcat_us_full.yaml b/ckanext/dcat/schemas/dcat_us_full.yaml index 24e8dedd..6f55903f 100644 --- a/ckanext/dcat/schemas/dcat_us_full.yaml +++ b/ckanext/dcat/schemas/dcat_us_full.yaml @@ -42,6 +42,10 @@ dataset_fields: - field_name: email label: Email display_snippet: email.html + + - field_name: url + label: URL + help_text: A URL associated with the contact help_text: Contact information for enquiries about the dataset. - field_name: publisher diff --git a/ckanext/dcat/schemas/dcat_us_recommended.yaml b/ckanext/dcat/schemas/dcat_us_recommended.yaml index f5aea9b3..0cacdb53 100644 --- a/ckanext/dcat/schemas/dcat_us_recommended.yaml +++ b/ckanext/dcat/schemas/dcat_us_recommended.yaml @@ -42,6 +42,10 @@ dataset_fields: - field_name: email label: Email display_snippet: email.html + + - field_name: url + label: URL + help_text: A URL associated with the contact help_text: Contact information for enquiries about the dataset. - field_name: publisher diff --git a/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_parse.py index b54774a2..25cd2c64 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_parse.py @@ -91,6 +91,7 @@ def test_e2e_dcat_to_ckan(self): assert dataset["contact"][0]["name"] == "Point of Contact" assert dataset["contact"][0]["email"] == "contact@some.org" + assert dataset["contact"][0]["url"] == "https://example.org" assert ( dataset["publisher"][0]["name"] == "Publishing Organization for dataset 1" diff --git a/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py b/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py index 75d359a3..b509e37d 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py +++ 
b/ckanext/dcat/tests/profiles/dcat_ap_3/test_euro_dcatap_3_profile_serialize.py @@ -148,6 +148,9 @@ def test_e2e_ckan_to_dcat(self): VCARD.hasUID, dataset_dict["contact"][0]["identifier"], ) + assert self._triple( + g, contact_details[0][2], VCARD.hasURL, URIRef(dataset_dict["contact"][0]["url"]) + ) assert self._triple( g, contact_details[1][2], VCARD.fn, dataset_dict["contact"][1]["name"] ) diff --git a/examples/ckan/ckan_full_dataset_dcat_ap.json b/examples/ckan/ckan_full_dataset_dcat_ap.json index 922e947d..6adb770d 100644 --- a/examples/ckan/ckan_full_dataset_dcat_ap.json +++ b/examples/ckan/ckan_full_dataset_dcat_ap.json @@ -53,7 +53,8 @@ { "name": "Contact 1", "email": "contact1@example.org", - "identifier": "123" + "identifier": "123", + "url": "https://example.org" }, { "name": "Contact 2", diff --git a/examples/dcat/dataset.rdf b/examples/dcat/dataset.rdf index 7ab556cb..ff280d9a 100644 --- a/examples/dcat/dataset.rdf +++ b/examples/dcat/dataset.rdf @@ -68,6 +68,7 @@ Point of Contact + From 4458afe4864d435937f13b03c3fd9162396da79d Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Wed, 4 Dec 2024 21:03:16 +0100 Subject: [PATCH 13/19] Wrote some documentation regarding the extension --- docs/healthdcat.md | 59 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 docs/healthdcat.md diff --git a/docs/healthdcat.md b/docs/healthdcat.md new file mode 100644 index 00000000..fea7f760 --- /dev/null +++ b/docs/healthdcat.md @@ -0,0 +1,59 @@ +# HealthDCAT-AP + +## Introduction + +This extension contains a profile for the proposed [HealthDCAT-AP](https://healthdcat-ap.github.io/) +extension, a health-related extension of the DCAT application profile for sharing information about +Catalogues containing Datasets and Data Services descriptions in Europe (DCAT-AP). The CKAN +HealthDCAT-AP profile was developed to implement this. 
+ +The development of a healthDCAT application profile, as an extension of the DCAT application +profile, aims to standardize health metadata within the scope of EHDS, fostering greater +interoperability, findability and accessibility of electronic health data across the EU. + +**Note:** HealthDCAT-AP is still under active development and not finalized yet. Cardinalities, +certain vocabularies and the namespace have not officially been ratified yet. + +The goal of this profile is to provide the wider FAIR community and other EU portals with a starting +point for implementing HealthDCAT-AP within their own data catalogs. + +## Implementation details + +The HealthDCAT-AP profile is an extension of the DCAT-AP v3 profile. Just like that profile, +this profile requires *ckanext-scheming*. + +## Usage and settings + +This profile has currently no additional settings. To select the profile, make sure +`scheming.dataset_schemas` includes `ckanext.dcat.schemas:healthdcat_ap.yaml`, and +`ckanext.dcat.rdf.profiles` includes `euro_health_dcat_ap`. + +## Limitations and deviations + +As HealthDCAT-AP is still a draft, it is bound for change. There are currently still some +inconsistencies in the standard and unclarities regarding certain properties. Below a short summary: + +1. Cardinalities have not yet been finalized for HealthDCAT-AP. This CKAN scheme has taken a very + liberal approach and takes all values as strictly optional (no failed validation for missing + fields). Note that some mandatory fields are currently impossible to fill with real data e.g. the + Health Data Access Body (HDAB) field: the EHDS legislation has not been implemented yet and no HDABs + have been formally appointed. +2. The HealthDCAT-AP namespace is not formally defined yet. For now, + `http://healthdataportal.eu/ns/health#` is used. This will be updated once the final namespace is + standardized. +3. 
The official examples of the standard use the `dct:description` property to encode the data + purpose. This does not seem to be according to the Data Privacy Vocabulary specification, which + proposes a controlled vocabulary. See + () for the German perspective on + this. +4. The distributions proposed by HealthDCAT-AP, *analytics* and *sample*, are not specifically + implemented. URIs are linked, the resources themselves are not loaded. For *sample*, as this is + an upstream DCAT-AP property, this can be included once picked up there. +5. Documentation (*foaf:page*) is implemented as a URI. There is some HealthDCAT-AP example data + out in the wild that uses a blank node for this and adds several properties, however this is + inconsistent with other DCAT implementations. +6. DatasetSeries are not supported yet by CKAN, and also not by this profile. +7. The *quality annotation* property has not been implemented due to it being very vaguely specified + for now. +8. There is no multilingual support yet. +9. For other properties, any limitations from the DCAT-AP profiles still apply. 
From 3f8cd85ed0b1dfa09c39f4d8a0879e3b63f85d61 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Fri, 6 Dec 2024 02:05:24 +0100 Subject: [PATCH 14/19] dpv:hasPersonalData and some cleanup --- ckanext/dcat/profiles/euro_health_dcat_ap.py | 19 +---- ...healthdcat_ap.yaml => health_dcat_ap.yaml} | 6 ++ .../test_euro_health_dcat_ap_profile_parse.py | 77 +++---------------- ...t_euro_health_dcat_ap_profile_serialize.py | 6 +- docs/healthdcat.md | 2 +- ...healthdcat_ap.json => health_dcat_ap.json} | 5 ++ examples/dcat/dataset_health.ttl | 4 +- examples/dcat/dataset_health_no_blank.ttl | 5 +- 8 files changed, 37 insertions(+), 87 deletions(-) rename ckanext/dcat/schemas/{healthdcat_ap.yaml => health_dcat_ap.yaml} (98%) rename examples/ckan/{healthdcat_ap.json => health_dcat_ap.json} (97%) diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 5896a71d..93f0e004 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -2,7 +2,7 @@ import json -from rdflib import RDF, SKOS, XSD, BNode, Literal, term +from rdflib import RDF, SKOS, XSD, BNode, Literal, URIRef, term from rdflib.namespace import Namespace from ckanext.dcat.profiles.base import DCAT, DCT, CleanedURIRef, URIRefOrLiteral @@ -41,9 +41,6 @@ def _parse_health_fields(self, dataset_dict, dataset_ref): self.__parse_healthdcat_intvalues(dataset_dict, dataset_ref) - # Purpose is a dpv:Purpose, inside is a dct:Description - pass - # Add the HDAB. 
There should only ever be one but you never know agents = self._agents_details(dataset_ref, HEALTHDCATAP.hdab) if agents: @@ -91,6 +88,7 @@ def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): ("health_category", HEALTHDCATAP.healthCategory), ("health_theme", HEALTHDCATAP.healthTheme), ("legal_basis", DPV.hasLegalBasis), + ("personal_data", DPV.hasPersonalData), ("population_coverage", HEALTHDCATAP.populationCoverage), ("publisher_note", HEALTHDCATAP.publisherNote), ("publisher_type", HEALTHDCATAP.publisherType), @@ -105,18 +103,6 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): for prefix, namespace in namespaces.items(): self.g.bind(prefix, namespace) - # g = self.g - - # ("coding_system", HEALTHDCATAP.hasCodingSystem), - # ("health_category", HEALTHDCATAP.healthCategory), - # ("health_theme", HEALTHDCATAP.healthTheme), - # ("population_coverage", HEALTHDCATAP.populationCoverage), - # ("publisher_note", HEALTHDCATAP.publisherNote), - # ("publisher_type", HEALTHDCATAP.publisherType), - # List items: - # - Purpose - # - Health theme - ## key, predicate, fallbacks, _type, _class items = [ ("analytics", HEALTHDCATAP.analytics, None, URIRefOrLiteral), @@ -131,6 +117,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): None, URIRefOrLiteral, ), + ("personal_data", DPV.hasPersonalData, None, URIRef), ("publisher_note", HEALTHDCATAP.publisherNote, None, URIRefOrLiteral), ("publisher_type", HEALTHDCATAP.publisherType, None, URIRefOrLiteral), ("purpose", DPV.hasPurpose, None, URIRefOrLiteral), diff --git a/ckanext/dcat/schemas/healthdcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml similarity index 98% rename from ckanext/dcat/schemas/healthdcat_ap.yaml rename to ckanext/dcat/schemas/health_dcat_ap.yaml index 20320ecf..a06bd6ad 100644 --- a/ckanext/dcat/schemas/healthdcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -335,6 +335,12 @@ dataset_fields: form_snippet: number.html help_text: Number of records for unique 
individuals. +- field_name: personal_data + label: Personal data + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Key elements that represent an individual in the dataset. + - field_name: publisher_note label: Publisher note preset: multiple_text diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index d7f7f98c..465b69f8 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -15,7 +15,7 @@ @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config( - "scheming.dataset_schemas", "ckanext.dcat.schemas:healthdcat_ap.yaml" + "scheming.dataset_schemas", "ckanext.dcat.schemas:health_dcat_ap.yaml" ) @pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") class TestSchemingParseSupport(BaseParseTest): @@ -47,9 +47,7 @@ def test_e2e_dcat_to_ckan(self): dataset["notes"] == "This dataset is an example of using HealthDCAT-AP in CKAN" ) - # assert dataset["url"] == "http://dataset.info.org" - # assert dataset["version"] == "Project HDBP0250" - # assert dataset["license_id"] == "cc-nc" + assert sorted([t["name"] for t in dataset["tags"]]) == [ "Test 1", "Test 2", @@ -90,6 +88,7 @@ def test_e2e_dcat_to_ckan(self): ] # Doesn't really get mapped for some reason + # Should be covered in DCAT-AP base profile # assert dataset["spatial_coverage"] == [ # { # "uri": "http://publications.europa.eu/resource/authority/country/BEL", @@ -112,16 +111,7 @@ def test_e2e_dcat_to_ckan(self): assert sorted(dataset["theme"]) == [ "http://publications.europa.eu/resource/authority/data-theme/HEAL" ] - # assert sorted(dataset["alternate_identifier"]) == [ - # "alternate-identifier-1", - # 
"alternate-identifier-2", - # ] - # assert sorted(dataset["documentation"]) == [ - # "http://dataset.info.org/doc1", - # "http://dataset.info.org/doc2", - # ] - # <> , ; assert sorted(dataset["is_referenced_by"]) == [ "https://doi.org/10.1038/sdata.2016.18", "https://dx.doi.org/10.1002/jmri.28679", @@ -129,8 +119,8 @@ def test_e2e_dcat_to_ckan(self): assert sorted(dataset["applicable_legislation"]) == [ "http://data.europa.eu/eli/reg/2022/868/oj", ] - # Repeating subfields + # Repeating subfields assert dataset["contact"][0]["name"] == "Contact Point" assert dataset["contact"][0]["email"] == "contact@example.com" @@ -163,11 +153,18 @@ def test_e2e_dcat_to_ckan(self): assert dataset["hdab"][0]["email"] == "hdab@example.com" assert dataset["hdab"][0]["url"] == "https://www.example.com/hdab" + # CKAN converts these to strings, but also converts back to decimal/nonneg int assert dataset["min_typical_age"] == "0" assert dataset["max_typical_age"] == "110" assert dataset["number_of_records"] == "123456789" assert dataset["number_of_unique_individuals"] == "7654321" + assert sorted(dataset["personal_data"]) == [ + "https://w3id.org/dpv/dpv-pd#Age", + "https://w3id.org/dpv/dpv-pd#Gender", + "https://w3id.org/dpv/dpv-pd#HealthRecord", + ] + assert dataset["population_coverage"] == [ "This example includes a very non-descript population" ] @@ -186,55 +183,3 @@ def test_e2e_dcat_to_ckan(self): "end": "2034-12-31", } ] - - # resource = dataset["resources"][0] - - # # Resources: core fields - # assert resource["url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html" - - # # Resources: standard fields - # assert resource["license"] == "http://creativecommons.org/licenses/by-nc/2.0/" - # assert resource["rights"] == "Some statement about rights" - # assert resource["issued"] == "2012-05-11" - # assert resource["modified"] == "2012-05-01T00:04:06" - # assert resource["temporal_resolution"] == "PT15M" - # assert resource["spatial_resolution_in_meters"] == 1.5 - # assert 
resource["status"] == "http://purl.org/adms/status/Completed" - # assert resource["size"] == 12323 - # assert ( - # resource["availability"] - # == "http://publications.europa.eu/resource/authority/planned-availability/EXPERIMENTAL" - # ) - # assert ( - # resource["compress_format"] - # == "http://www.iana.org/assignments/media-types/application/gzip" - # ) - # assert ( - # resource["package_format"] - # == "http://publications.europa.eu/resource/authority/file-type/TAR" - # ) - - # assert resource["hash"] == "4304cf2e751e6053c90b1804c89c0ebb758f395a" - # assert ( - # resource["hash_algorithm"] - # == "http://spdx.org/rdf/terms#checksumAlgorithm_sha1" - # ) - - # assert ( - # resource["access_url"] == "http://www.bgs.ac.uk/gbase/geochemcd/home.html" - # ) - # assert "download_url" not in resource - - # # Resources: list fields - # assert sorted(resource["language"]) == ["ca", "en", "es"] - # assert sorted(resource["documentation"]) == [ - # "http://dataset.info.org/distribution1/doc1", - # "http://dataset.info.org/distribution1/doc2", - # ] - # assert sorted(resource["conforms_to"]) == ["Standard 1", "Standard 2"] - - # # Resources: repeating subfields - # assert resource["access_services"][0]["title"] == "Sparql-end Point" - # assert resource["access_services"][0]["endpoint_url"] == [ - # "http://publications.europa.eu/webapi/rdf/sparql" - # ] diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py index 26a6f783..1ea287d4 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py @@ -34,7 +34,7 @@ @pytest.mark.usefixtures("with_plugins", "clean_db") @pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") @pytest.mark.ckan_config( - "scheming.dataset_schemas", 
"ckanext.dcat.schemas:healthdcat_ap.yaml" + "scheming.dataset_schemas", "ckanext.dcat.schemas:health_dcat_ap.yaml" ) @pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") class TestEuroDCATAP3ProfileSerializeDataset(BaseSerializeTest): @@ -45,7 +45,9 @@ def test_e2e_ckan_to_dcat(self): Note: in this HealthDCAT-AP profile, only the HealthDCAT-AP specific triples are tested for. Triples in other profiles could be tested, but should mainly be tested by their respective profiles.""" - dataset_dict = json.loads(self._get_file_contents("ckan/healthdcat_ap.json"))[0] + dataset_dict = json.loads(self._get_file_contents("ckan/health_dcat_ap.json"))[ + 0 + ] dataset = call_action("package_create", **dataset_dict) diff --git a/docs/healthdcat.md b/docs/healthdcat.md index fea7f760..4060adef 100644 --- a/docs/healthdcat.md +++ b/docs/healthdcat.md @@ -25,7 +25,7 @@ this profile requires *ckanext-scheming*. ## Usage and settings This profile has currently no additional settings. To select the profile, make sure -`scheming.dataset_schemas` includes `ckanext.dcat.schemas:healthdcat_ap.yaml`, and +`scheming.dataset_schemas` includes `ckanext.dcat.schemas:health_dcat_ap.yaml`, and `ckanext.dcat.rdf.profiles` includes `euro_health_dcat_ap`. 
## Limitations and deviations diff --git a/examples/ckan/healthdcat_ap.json b/examples/ckan/health_dcat_ap.json similarity index 97% rename from examples/ckan/healthdcat_ap.json rename to examples/ckan/health_dcat_ap.json index 21e502d6..2670c77b 100644 --- a/examples/ckan/healthdcat_ap.json +++ b/examples/ckan/health_dcat_ap.json @@ -71,6 +71,11 @@ "number_of_records": "123456789", "number_of_unique_individuals": "7654321", "organization": null, + "personal_data": [ + "https://w3id.org/dpv/dpv-pd#Age", + "https://w3id.org/dpv/dpv-pd#Gender", + "https://w3id.org/dpv/dpv-pd#HealthRecord" + ], "population_coverage": [ "This example includes a very non-descript population" ], diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl index 8b47f7e3..a665c1ee 100644 --- a/examples/dcat/dataset_health.ttl +++ b/examples/dcat/dataset_health.ttl @@ -114,7 +114,9 @@ ; - , , , , ; + , + , + ; . diff --git a/examples/dcat/dataset_health_no_blank.ttl b/examples/dcat/dataset_health_no_blank.ttl index e8021267..ba854b5f 100644 --- a/examples/dcat/dataset_health_no_blank.ttl +++ b/examples/dcat/dataset_health_no_blank.ttl @@ -75,4 +75,7 @@ dcat:theme ; foaf:page "n1049372e768c4429a6b2200c22f5f1a4b9" ; dpv:hasLegalBasis dpv:Consent ; - dpv:hasPurpose dpv:AcademicResearch . + dpv:hasPurpose dpv:AcademicResearch ; + dpv:hasPersonalData , + , + . From 442232396795a29178df8c070d3fd3a9c3b7d142 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Fri, 6 Dec 2024 05:27:07 +0100 Subject: [PATCH 15/19] Small documentation update --- docs/healthdcat.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/healthdcat.md b/docs/healthdcat.md index 4060adef..ac5f1a4a 100644 --- a/docs/healthdcat.md +++ b/docs/healthdcat.md @@ -12,7 +12,8 @@ profile, aims to standardize health metadata within the scope of EHDS, fostering interoperability, findability and accessibility of electronic health data across the EU. 
**Note:** HealthDCAT-AP is still under active development and not finalized yet. Cardinalities, -certain vocabularies and the namespace have not officially been ratified yet. +certain vocabularies and the namespace have not officially been ratified yet. These are expected to +be finalized after the public consultation in Q1 2025. The goal of this profile is to provide the wider FAIR community and other EU portals with a starting point for implementing HealthDCAT-AP within their own data catalogs. @@ -31,7 +32,8 @@ This profile has currently no additional settings. To select the profile, make s ## Limitations and deviations As HealthDCAT-AP is still a draft, it is bound for change. There are currently still some -inconsistencies in the standard and unclarities regarding certain properties. Below a short summary: +inconsistencies in the standard and unclarities regarding certain properties. Below a short summary +of limitations and implementation decisions made during development of this profile. 1. Cardinalities have not yet been finalized for HealthDCAT-AP. This CKAN scheme has taken a very liberal approach and takes all values as strictly optional (no failed validation for missing @@ -53,7 +55,7 @@ inconsistencies in the standard and unclarities regarding certain properties. Be out in the wild that uses a blank node for this and adds several properties, however this is inconsistent with other DCAT implementations. 6. DatasetSeries are not supported yet by CKAN, and also not by this profile. -7. The *quality annotation* property has not been implemented due to it being very vaguely specified - for now. +7. The *quality annotation* property has not been implemented due to usage not being completely +defined yet. 8. There is no multilingual support yet. 9. For other properties, any limitations from the DCAT-AP profiles still apply. 
From 655370b4d2e3c82c1ce70a848b051a73cd8f5631 Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 18 Dec 2024 13:17:47 +0100 Subject: [PATCH 16/19] Fix cardinality of qualified_relation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrià Mercader --- ckanext/dcat/schemas/health_dcat_ap.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml index a06bd6ad..6245756d 100644 --- a/ckanext/dcat/schemas/health_dcat_ap.yaml +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -409,7 +409,6 @@ dataset_fields: - field_name: qualified_relation label: Qualified relation repeating_label: Relationship - repeating_once: true repeating_subfields: - field_name: uri From 114007118a261363dd65ecbad9fe9df4057f8afc Mon Sep 17 00:00:00 2001 From: Mark Date: Wed, 18 Dec 2024 13:18:10 +0100 Subject: [PATCH 17/19] Fix test case for spatial_coverage in HealthDCAT-AP profile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrià Mercader --- .../test_euro_health_dcat_ap_profile_parse.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py index 465b69f8..7abcacb4 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -87,17 +87,11 @@ def test_e2e_dcat_to_ckan(self): "http://www.wikidata.org/entity/P4229", ] - # Doesn't really get mapped for some reason - # Should be covered in DCAT-AP base profile - # assert dataset["spatial_coverage"] == [ - # { - # "uri": "http://publications.europa.eu/resource/authority/country/BEL", - # "text": None, - # "geom": None, - # "bbox": 
None, - # "cent": None, - # } - # ] + assert dataset["spatial_coverage"] == [ + { + "uri": "http://publications.europa.eu/resource/authority/country/BEL", + } + ] # List fields assert sorted(dataset["conforms_to"]) == [ From 9966dd932ca3deeb863bb101682833f1b0f55b32 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Wed, 18 Dec 2024 20:57:31 +0100 Subject: [PATCH 18/19] Small cleanup --- ckanext/dcat/profiles/euro_health_dcat_ap.py | 4 ---- .../profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py | 1 - .../test_euro_health_dcat_ap_profile_serialize.py | 1 - 3 files changed, 6 deletions(-) diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 93f0e004..2144f097 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -1,7 +1,3 @@ -"""Test document""" - -import json - from rdflib import RDF, SKOS, XSD, BNode, Literal, URIRef, term from rdflib.namespace import Namespace diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py index fdda473f..1bce901c 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py @@ -555,7 +555,6 @@ def test_dataset_distribution_access_service_list_values_only(self): # List endpoint_url_list = access_service.get('endpoint_url') - print(access_service) assert len(endpoint_url_list) == 1 assert 'http://publications.europa.eu/webapi/rdf/sparql' in endpoint_url_list diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py index 1ea287d4..0bfade6e 100644 --- a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py +++ 
b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py @@ -67,7 +67,6 @@ def test_e2e_ckan_to_dcat(self): reference = Graph() reference.parse(data=contents, format="turtle") - print(s.g.serialize(format="turtle")) # First check that all non-blind nodes from the reference are present in the output # Any other nodes added by other profiles (e.g. DCAT-AP 3) we do not have an opinion about for triple in reference: From 30f3bac18a78d0c921a182d0329b3ff7e56881d1 Mon Sep 17 00:00:00 2001 From: Mark Janse Date: Wed, 18 Dec 2024 21:35:55 +0100 Subject: [PATCH 19/19] Move qualified relations to generic CKAN DCAT scheming class --- .../dcat/profiles/euro_dcat_ap_scheming.py | 83 ++++++++++++++++++- ckanext/dcat/profiles/euro_health_dcat_ap.py | 31 ------- 2 files changed, 82 insertions(+), 32 deletions(-) diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index 3a2742a1..7a70bd77 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -1,6 +1,6 @@ import json -from rdflib import URIRef, BNode, Literal +from rdflib import URIRef, BNode, Literal, term from .base import RDFProfile, CleanedURIRef, URIRefOrLiteral from .base import ( RDF, @@ -118,6 +118,11 @@ def _parse_list_value(data_dict, field_name): if agents: dataset_dict[key] = agents + # Add any qualifiedRelations + qual_relations = self._relationship_details(dataset_ref, DCAT.qualifiedRelation) + if qual_relations: + dataset_dict["qualified_relation"] = qual_relations + # Repeating subfields: resources for schema_field in self._dataset_schema["resource_fields"]: if "repeating_subfields" in schema_field: @@ -227,6 +232,10 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): spatial_ref, field[1], item[field[0]] ) + self._add_relationship( + dataset_ref, dataset_dict, "qualified_relation", DCAT.qualifiedRelation + ) + resources = 
dataset_dict.get("resources", []) for resource in resources: if resource.get("access_services"): @@ -292,6 +301,78 @@ def _add_agents( _type=URIRefOrLiteral, ) + def _relationship_details(self, subject, predicate): + """ + Returns a list of dicts with details about a dcat:Relationship property, e.g. + dcat:qualifiedRelation + + Both subject and predicate must be rdflib URIRef or BNode objects + + Returns keys for uri, role, and relation with the values set to + an empty string if they could not be found. + """ + + relations = [] + for relation in self.g.objects(subject, predicate): + relation_details = {} + relation_details["uri"] = ( + str(relation) if isinstance(relation, term.URIRef) else "" + ) + relation_details["role"] = self._object_value(relation, DCAT.hadRole) + relation_details["relation"] = self._object_value(relation, DCT.relation) + relations.append(relation_details) + + return relations + + def _add_relationship( + self, + dataset_ref, + dataset_dict, + relation_key, + rdf_predicate, + ): + """ + Adds one or more Relationships to the RDF graph. + + :param dataset_ref: The RDF reference of the dataset + :param dataset_dict: The dataset dictionary containing relationship information + :param relation_key: field name in the CKAN dict (e.g. 
"qualifiedRelation") + :param rdf_predicate: The RDF predicate (DCAT.qualifiedRelation) + """ + relation = dataset_dict.get(relation_key) + if ( + isinstance(relation, list) + and len(relation) + and self._not_empty_dict(relation[0]) + ): + relations = relation + + for relation in relations: + + agent_uri = relation.get("uri") + if agent_uri: + agent_ref = CleanedURIRef(agent_uri) + else: + agent_ref = BNode() + + self.g.add((agent_ref, DCT.type, DCAT.Relationship)) + self.g.add((dataset_ref, rdf_predicate, agent_ref)) + + self._add_triple_from_dict( + relation, + agent_ref, + DCT.relation, + "relation", + _type=URIRefOrLiteral, + ) + self._add_triple_from_dict( + relation, + agent_ref, + DCAT.hadRole, + "role", + _type=URIRefOrLiteral, + ) + @staticmethod def _not_empty_dict(data_dict): return any(data_dict.values()) diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py index 2144f097..6e560b8c 100644 --- a/ckanext/dcat/profiles/euro_health_dcat_ap.py +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -42,11 +42,6 @@ def _parse_health_fields(self, dataset_dict, dataset_ref): if agents: dataset_dict["hdab"] = agents - # Add any qualifiedRelations - qual_relations = self._relationship_details(dataset_ref, DCAT.qualifiedRelation) - if qual_relations: - dataset_dict["qualified_relation"] = qual_relations - # Retention period retention_start, retention_end = self._time_interval( dataset_ref, HEALTHDCATAP.retentionPeriod, dcat_ap_version=2 @@ -130,9 +125,6 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): self._add_nonneg_integer_triple(dataset_dict, dataset_ref, key, predicate) self._add_agents(dataset_ref, dataset_dict, "hdab", HEALTHDCATAP.hdab) - self._add_relationship( - dataset_ref, dataset_dict, "qualified_relation", DCAT.qualifiedRelation - ) def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate): """ @@ -174,29 +166,6 @@ def _add_timeframe_triple(self, dataset_dict, 
dataset_ref): self._add_date_triple(temporal_ref, DCAT.endDate, item["end"]) self.g.add((dataset_ref, DCT.temporal, temporal_ref)) - def _relationship_details(self, subject, predicate): - """ - Returns a list of dicts with details about a dcat:Relationship property, e.g. - dcat:qualifiedRelation - - Both subject and predicate must be rdflib URIRef or BNode objects - - Returns keys for uri, role, and relation with the values set to - an empty string if they could not be found. - """ - - relations = [] - for relation in self.g.objects(subject, predicate): - relation_details = {} - relation_details["uri"] = ( - str(relation) if isinstance(relation, term.URIRef) else "" - ) - relation_details["role"] = self._object_value(relation, DCAT.hadRole) - relation_details["relation"] = self._object_value(relation, DCT.relation) - relations.append(relation_details) - - return relations - def _add_relationship( self, dataset_ref,