diff --git a/ckanext/dcat/profiles/__init__.py b/ckanext/dcat/profiles/__init__.py index 6d30a244..668de499 100644 --- a/ckanext/dcat/profiles/__init__.py +++ b/ckanext/dcat/profiles/__init__.py @@ -25,4 +25,5 @@ from .euro_dcat_ap_3 import EuropeanDCATAP3Profile from .dcat_us_3 import DCATUS3Profile from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile +from .euro_health_dcat_ap import EuropeanHealthDCATAPProfile from .schemaorg import SchemaOrgProfile diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 30b989e7..a93eeb5c 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -2,16 +2,16 @@ import json from urllib.parse import quote +from ckan.lib.helpers import resource_formats +from ckan.model.license import LicenseRegister +from ckantoolkit import ObjectNotFound, asbool, aslist, config, get_action, url_for from dateutil.parser import parse as parse_date -from rdflib import term, URIRef, BNode, Literal -from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS, ORG -from geomet import wkt, InvalidGeoJSONException +from geomet import InvalidGeoJSONException, wkt +from rdflib import BNode, Literal, URIRef, term +from rdflib.namespace import ORG, RDF, RDFS, SKOS, XSD, Namespace -from ckantoolkit import config, url_for, asbool, aslist, get_action, ObjectNotFound -from ckan.model.license import LicenseRegister -from ckan.lib.helpers import resource_formats from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS -from ckanext.dcat.validators import is_year, is_year_month, is_date +from ckanext.dcat.validators import is_date, is_year, is_year_month CNT = Namespace("http://www.w3.org/2011/content#") DCT = Namespace("http://purl.org/dc/terms/") diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index 3a2742a1..7a70bd77 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -1,6 +1,6 @@ 
def _relationship_details(self, subject, predicate):
    """
    Return the details of every dcat:Relationship linked from a subject.

    Walks the objects of ``(subject, predicate)`` in ``self.g`` (e.g. with
    ``dcat:qualifiedRelation`` as predicate) and builds one dict per object.

    :param subject: rdflib URIRef or BNode the relationships hang from
    :param predicate: rdflib URIRef of the property pointing at the relationships
    :returns: list of dicts with the keys ``uri``, ``role`` and ``relation``.
        ``uri`` is an empty string when the relationship node is not a URIRef
        (i.e. a blank node); ``role`` and ``relation`` hold whatever
        ``self._object_value`` returns for ``dcat:hadRole`` / ``dct:relation``.
    """
    return [
        {
            "uri": str(relation) if isinstance(relation, term.URIRef) else "",
            "role": self._object_value(relation, DCAT.hadRole),
            "relation": self._object_value(relation, DCT.relation),
        }
        for relation in self.g.objects(subject, predicate)
    ]


def _add_relationship(
    self,
    dataset_ref,
    dataset_dict,
    relation_key,
    rdf_predicate,
):
    """
    Add one or more dcat:Relationship nodes to the RDF graph.

    Nothing is added unless ``dataset_dict[relation_key]`` is a non-empty list
    whose first item has at least one truthy value.

    :param dataset_ref: The RDF reference of the dataset
    :param dataset_dict: The dataset dictionary containing the relationships
    :param relation_key: field name in the CKAN dict (e.g. "qualified_relation")
    :param rdf_predicate: The RDF predicate (e.g. DCAT.qualifiedRelation)
    """
    relations = dataset_dict.get(relation_key)
    if not (
        isinstance(relations, list)
        and relations
        and self._not_empty_dict(relations[0])
    ):
        return

    for relation in relations:
        # Reuse the provided URI when there is one, otherwise use a blank node
        relation_uri = relation.get("uri")
        relation_ref = CleanedURIRef(relation_uri) if relation_uri else BNode()

        # The node is an instance of dcat:Relationship, so it is typed with
        # rdf:type (dct:type would only state a "nature or genre" annotation)
        self.g.add((relation_ref, RDF.type, DCAT.Relationship))
        self.g.add((dataset_ref, rdf_predicate, relation_ref))

        for predicate, key in (
            (DCT.relation, "relation"),
            (DCAT.hadRole, "role"),
        ):
            self._add_triple_from_dict(
                relation,
                relation_ref,
                predicate,
                key,
                _type=URIRefOrLiteral,
            )


@staticmethod
def _not_empty_dict(data_dict):
    """Return True if at least one value of ``data_dict`` is truthy."""
    return any(data_dict.values())
# Note: the HealthDCAT-AP namespace URI is not finalized yet
HEALTHDCATAP = Namespace("http://healthdataportal.eu/ns/health#")

# Data Privacy Vocabulary namespace
DPV = Namespace("https://w3id.org/dpv#")

namespaces = {
    "healthdcatap": HEALTHDCATAP,
    "dpv": DPV,
}


class EuropeanHealthDCATAPProfile(EuropeanDCATAP3Profile):
    """
    A profile implementing HealthDCAT-AP, a health-related extension of the DCAT
    application profile for sharing information about Catalogues containing
    Datasets and Data Services descriptions in Europe.

    Parsing and serialization first delegate to the DCAT-AP 3 profile and then
    handle the HealthDCAT-AP specific properties.
    """

    def parse_dataset(self, dataset_dict, dataset_ref):
        """
        Parse a dataset from the graph into a CKAN dataset dict.

        :param dataset_dict: dataset dict to update
        :param dataset_ref: rdflib reference of the dataset in ``self.g``
        :returns: the updated dataset dict
        """
        # Call super method for DCAT-AP 3 properties
        dataset_dict = super().parse_dataset(dataset_dict, dataset_ref)

        return self._parse_health_fields(dataset_dict, dataset_ref)

    def _parse_health_fields(self, dataset_dict, dataset_ref):
        """
        Add the HealthDCAT-AP specific fields found in the graph to ``dataset_dict``.

        Handles the list-valued and integer-valued properties, the health data
        access body (``hdab``) and the retention period.

        :returns: the same ``dataset_dict`` instance, updated in place
        """
        self.__parse_healthdcat_stringvalues(dataset_dict, dataset_ref)

        self.__parse_healthdcat_intvalues(dataset_dict, dataset_ref)

        # Add the HDAB. There should only ever be one but you never know
        agents = self._agents_details(dataset_ref, HEALTHDCATAP.hdab)
        if agents:
            dataset_dict["hdab"] = agents

        # Retention period: only the boundaries that were found are stored
        retention_start, retention_end = self._time_interval(
            dataset_ref, HEALTHDCATAP.retentionPeriod, dcat_ap_version=2
        )
        retention_dict = {}
        if retention_start is not None:
            retention_dict["start"] = retention_start
        if retention_end is not None:
            retention_dict["end"] = retention_end
        if retention_dict:
            dataset_dict["retention_period"] = [retention_dict]

        return dataset_dict

    def __parse_healthdcat_intvalues(self, dataset_dict, dataset_ref):
        """Copy the integer-valued HealthDCAT-AP properties into ``dataset_dict``."""
        for key, predicate in (
            ("min_typical_age", HEALTHDCATAP.minTypicalAge),
            ("max_typical_age", HEALTHDCATAP.maxTypicalAge),
            ("number_of_records", HEALTHDCATAP.numberOfRecords),
            ("number_of_unique_individuals", HEALTHDCATAP.numberOfUniqueIndividuals),
        ):
            value = self._object_value_int(dataset_ref, predicate)
            # A zero value evaluates as False but is definitely not a None
            if value is not None:
                dataset_dict[key] = value

    def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref):
        """Copy the list-valued HealthDCAT-AP properties into ``dataset_dict``."""
        for key, predicate in (
            ("analytics", HEALTHDCATAP.analytics),
            ("code_values", HEALTHDCATAP.hasCodeValues),
            ("coding_system", HEALTHDCATAP.hasCodingSystem),
            ("health_category", HEALTHDCATAP.healthCategory),
            ("health_theme", HEALTHDCATAP.healthTheme),
            ("legal_basis", DPV.hasLegalBasis),
            ("personal_data", DPV.hasPersonalData),
            ("population_coverage", HEALTHDCATAP.populationCoverage),
            ("publisher_note", HEALTHDCATAP.publisherNote),
            ("publisher_type", HEALTHDCATAP.publisherType),
            ("purpose", DPV.hasPurpose),
        ):
            values = self._object_value_list(dataset_ref, predicate)
            if values:
                dataset_dict[key] = values

    def graph_from_dataset(self, dataset_dict, dataset_ref):
        """
        Serialize a CKAN dataset dict into the graph.

        Delegates to the parent profile first, binds the HealthDCAT-AP and DPV
        prefixes and then adds the HealthDCAT-AP specific triples.
        """
        super().graph_from_dataset(dataset_dict, dataset_ref)
        for prefix, namespace in namespaces.items():
            self.g.bind(prefix, namespace)

        # key, predicate, fallbacks, _type
        # The predicates must mirror the ones read back in
        # __parse_healthdcat_stringvalues so that values round-trip.
        items = [
            ("analytics", HEALTHDCATAP.analytics, None, URIRefOrLiteral),
            ("code_values", HEALTHDCATAP.hasCodeValues, None, URIRefOrLiteral),
            ("coding_system", HEALTHDCATAP.hasCodingSystem, None, URIRefOrLiteral),
            ("health_category", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral),
            ("health_theme", HEALTHDCATAP.healthTheme, None, URIRefOrLiteral),
            ("legal_basis", DPV.hasLegalBasis, None, URIRefOrLiteral),
            (
                "population_coverage",
                HEALTHDCATAP.populationCoverage,
                None,
                URIRefOrLiteral,
            ),
            ("personal_data", DPV.hasPersonalData, None, URIRef),
            ("publisher_note", HEALTHDCATAP.publisherNote, None, URIRefOrLiteral),
            ("publisher_type", HEALTHDCATAP.publisherType, None, URIRefOrLiteral),
            ("purpose", DPV.hasPurpose, None, URIRefOrLiteral),
        ]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        for key, predicate in (
            ("min_typical_age", HEALTHDCATAP.minTypicalAge),
            ("max_typical_age", HEALTHDCATAP.maxTypicalAge),
            ("number_of_records", HEALTHDCATAP.numberOfRecords),
            ("number_of_unique_individuals", HEALTHDCATAP.numberOfUniqueIndividuals),
        ):
            self._add_nonneg_integer_triple(dataset_dict, dataset_ref, key, predicate)

        self._add_agents(dataset_ref, dataset_dict, "hdab", HEALTHDCATAP.hdab)

        # NOTE(review): "retention_period" is parsed in _parse_health_fields but
        # no triple is written for it here -- confirm whether that is intended.

    def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate):
        """
        Add a non-negative integer to the Dataset graph (xsd:nonNegativeInteger).

        Values that cannot be read as a non-negative integer are still added,
        but as a plain literal. Missing values (``None`` or an empty string) are
        skipped; a zero is a legitimate value (e.g. a minimum typical age of 0)
        and is written out.

        :param dataset_dict: CKAN dataset dict the value is read from
        :param dataset_ref: subject of the triple
        :param key: scheming key in CKAN
        :param predicate: predicate to use
        """
        value = self._get_dict_value(dataset_dict, key)

        # Do not use plain truthiness here: it would drop an integer 0
        if value is None or value == "":
            return

        try:
            number = int(value)
        except (ValueError, TypeError):
            number = None

        if number is not None and number >= 0:
            literal = Literal(number, datatype=XSD.nonNegativeInteger)
        else:
            # Keep the original value so that no information is lost
            literal = Literal(value)

        self.g.add((dataset_ref, predicate, literal))

    def _add_timeframe_triple(self, dataset_dict, dataset_ref):
        """
        Add a dct:PeriodOfTime node for every item of ``temporal_coverage``.

        Each item may provide ``start`` and/or ``end``, written as
        dcat:startDate / dcat:endDate. Nothing is added unless the field is a
        non-empty list whose first item has at least one truthy value.
        """
        temporal = dataset_dict.get("temporal_coverage")
        if not (
            isinstance(temporal, list)
            and temporal
            and self._not_empty_dict(temporal[0])
        ):
            return

        for item in temporal:
            temporal_ref = BNode()
            self.g.add((temporal_ref, RDF.type, DCT.PeriodOfTime))
            if item.get("start"):
                self._add_date_triple(temporal_ref, DCAT.startDate, item["start"])
            if item.get("end"):
                self._add_date_triple(temporal_ref, DCAT.endDate, item["end"])
            self.g.add((dataset_ref, DCT.temporal, temporal_ref))

    def _add_relationship(
        self,
        dataset_ref,
        dataset_dict,
        relation_key,
        rdf_predicate,
    ):
        """
        Add one or more dcat:Relationship nodes to the RDF graph.

        Nothing is added unless ``dataset_dict[relation_key]`` is a non-empty
        list whose first item has at least one truthy value.

        :param dataset_ref: The RDF reference of the dataset
        :param dataset_dict: The dataset dictionary containing the relationships
        :param relation_key: field name in the CKAN dict (e.g. "qualified_relation")
        :param rdf_predicate: The RDF predicate (e.g. DCAT.qualifiedRelation)
        """
        relations = dataset_dict.get(relation_key)
        if not (
            isinstance(relations, list)
            and relations
            and self._not_empty_dict(relations[0])
        ):
            return

        for relation in relations:
            # Reuse the provided URI when there is one, otherwise use a blank node
            relation_uri = relation.get("uri")
            relation_ref = CleanedURIRef(relation_uri) if relation_uri else BNode()

            # The node is an instance of dcat:Relationship, so it is typed with
            # rdf:type, like the dct:PeriodOfTime nodes in _add_timeframe_triple
            self.g.add((relation_ref, RDF.type, DCAT.Relationship))
            self.g.add((dataset_ref, rdf_predicate, relation_ref))

            for predicate, key in (
                (DCT.relation, "relation"),
                (DCAT.hadRole, "role"),
            ):
                self._add_triple_from_dict(
                    relation,
                    relation_ref,
                    predicate,
                    key,
                    _type=URIRefOrLiteral,
                )

    def graph_from_catalog(self, catalog_dict, catalog_ref):
        """Serialize the catalog; HealthDCAT-AP adds nothing to the parent output."""
        super().graph_from_catalog(catalog_dict, catalog_ref)
+about: Schema for HealthDCAT-AP +about_url: http://github.com/ckan/ckanext-dcat + +dataset_fields: + +- field_name: title + label: Title + preset: title + required: true + help_text: A descriptive title for the dataset. + +- field_name: name + label: URL + preset: dataset_slug + form_placeholder: eg. my-dataset + +- field_name: notes + label: Description + required: true + form_snippet: markdown.html + help_text: A free-text account of the dataset. + +- field_name: tag_string + label: Keywords + preset: tag_string_autocomplete + form_placeholder: eg. economy, mental health, government + help_text: Keywords or tags describing the dataset. Use commas to separate multiple values. + +- field_name: contact + label: Contact points + repeating_label: Contact point + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the contact point. Such as a ROR ID. + + help_text: Contact information for enquiries about the dataset. + +- field_name: publisher + label: Publisher + repeating_label: Publisher + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the publisher, such as a ROR ID. + help_text: Entity responsible for making the dataset available. + +- field_name: creator + label: Creator + repeating_label: Creator + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + help_text: URI of the creator, if available. + + - field_name: name + label: Name + help_text: Name of the entity or person who created the dataset. 
+ + - field_name: email + label: Email + display_snippet: email.html + help_text: Contact email of the creator. + + - field_name: url + label: URL + display_snippet: link.html + help_text: URL for more information about the creator. + + - field_name: type + label: Type + help_text: Type of creator (e.g., Organization, Person). + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the creator, such as an ORCID or ROR ID. + +- field_name: license_id + label: License + form_snippet: license.html + help_text: License definitions and additional information can be found at http://opendefinition.org/. + +- field_name: owner_org + label: Organization + preset: dataset_organization + help_text: The CKAN organization the dataset belongs to. + +- field_name: url + label: Landing page + form_placeholder: http://example.com/dataset.json + display_snippet: link.html + help_text: Web page that can be navigated to gain access to the dataset, its distributions and/or additional information. + + # Note: this will fall back to metadata_created if not present +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the dataset. + + # Note: this will fall back to metadata_modified if not present +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the dataset was changed, updated or modified. + +- field_name: version + label: Version + validators: ignore_missing unicode_safe package_version_validator + help_text: Version number or other version designation of the dataset. + +- field_name: version_notes + label: Version notes + validators: ignore_missing unicode_safe + form_snippet: markdown.html + display_snippet: markdown.html + help_text: A description of the differences between this version and a previous version of the dataset. 
+ + # Note: CKAN will generate a unique identifier for each dataset +- field_name: identifier + label: Identifier + help_text: A unique identifier of the dataset. + +- field_name: frequency + label: Frequency + help_text: The frequency at which dataset is published. + +- field_name: provenance + label: Provenance + form_snippet: markdown.html + display_snippet: markdown.html + help_text: A statement about the lineage of the dataset. + +- field_name: dcat_type + label: Type + help_text: The type of the dataset. + # TODO: controlled vocabulary? + +- field_name: temporal_coverage + label: Temporal coverage + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + help_text: The temporal period or periods the dataset covers. + +- field_name: temporal_resolution + label: Temporal resolution + help_text: Minimum time period resolvable in the dataset. + +- field_name: spatial_coverage + label: Spatial coverage + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: text + label: Label + + - field_name: geom + label: Geometry + + - field_name: bbox + label: Bounding Box + + - field_name: centroid + label: Centroid + help_text: A geographic region that is covered by the dataset. + +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + help_text: Minimum spatial separation resolvable in a dataset, measured in meters. + +- field_name: access_rights + label: Access rights + validators: ignore_missing unicode_safe + help_text: Information that indicates whether the dataset is Open Data, has access restrictions or is not public. + +- field_name: alternate_identifier + label: Other identifier + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: This property refers to a secondary identifier of the dataset, such as MAST/ADS, DataCite, DOI, etc. 
+ +- field_name: theme + label: Theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A category of the dataset. A Dataset may be associated with multiple themes. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the dataset. + # TODO: language form snippet / validator / graph + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this dataset. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An implementing rule or other specification that the dataset follows. + +- field_name: is_referenced_by + label: Is referenced by + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A related resource, such as a publication, that references, cites, or otherwise points to the dataset. + +- field_name: analytics + label: Analytics + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + An analytics distribution of the dataset. + Publishers are encouraged to provide URLs pointing to API endpoints or document + repositories where users can access or request associated resources such as + technical reports of the dataset, quality measurements, usability indicators,... + or analytics services. + +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the dataset. + +- field_name: code_values + label: Code values + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Health classifications and their codes associated with the dataset. 
+ +- field_name: coding_system + label: Coding system + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + Coding systems in use (e.g. ICD-10-CM, DGRs, SNOMED CT, ...). + To comply with HealthDCAT-AP, Wikidata URIs MUST be used. + +- field_name: purpose + label: Purpose + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A free text statement of the purpose of the processing of data or personal data. + +- field_name: health_category + label: Health category + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + The health category to which this dataset belongs as described in the Commission Regulation on + the European Health Data Space laying down a list of categories of electronic data for + secondary use, Art.33. + +- field_name: health_theme + label: Health theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A category of the Dataset or tag describing the Dataset. + +- field_name: legal_basis + label: Legal basis + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legal basis used to justify processing of personal data. + +- field_name: min_typical_age + label: Minimum typical age + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Minimum typical age of the population within the dataset. + +- field_name: max_typical_age + label: Maximum typical age + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Maximum typical age of the population within the dataset. + +- field_name: number_of_records + label: Number of records + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Size of the dataset in terms of the number of records + +- field_name: number_of_unique_individuals + label: Number of records for unique individuals. 
+ validators: ignore_missing int_validator + form_snippet: number.html + help_text: Number of records for unique individuals. + +- field_name: personal_data + label: Personal data + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Key elements that represent an individual in the dataset. + +- field_name: publisher_note + label: Publisher note + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A description of the publisher activities. + +- field_name: publisher_type + label: Publisher type + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A type of organisation that makes the Dataset available. + +- field_name: population_coverage + label: Population coverage + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A definition of the population within the dataset. + +- field_name: retention_period + label: Retention period + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + + help_text: A temporal period which the dataset is available for secondary use. + + +# Officially there can only be one HDAB for now, but keep it repeating subfield just in case +- field_name: hdab + label: Health data access body + repeating_label: Health data access body + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the HDAB, such as a ROR ID. + help_text: Health Data Access Body supporting access to data in the Member State. 
+ +- field_name: qualified_relation + label: Qualified relation + repeating_label: Relationship + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: relation + label: Relation + help_text: The resource related to the source resource. + + - field_name: role + label: Role + help_text: The function of an entity or agent with respect to another entity or resource. + help_text: A description of a relationship with another resource. + +# Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this dataset (if not provided it will be autogenerated). + +# TODO: relation-based properties are not yet included (e.g. is_version_of, source, sample, etc) +# +resource_fields: + +- field_name: url + label: URL + preset: resource_url_upload + +- field_name: name + label: Name + form_placeholder: + help_text: A descriptive title for the resource. + +- field_name: description + label: Description + form_snippet: markdown.html + help_text: A free-text account of the resource. + +- field_name: format + label: Format + preset: resource_format_autocomplete + help_text: File format. If not provided it will be guessed. + +- field_name: mimetype + label: Media type + validators: if_empty_guess_format ignore_missing unicode_safe + help_text: Media type for this format. If not provided it will be guessed. + +- field_name: compress_format + label: Compress format + help_text: The format of the file in which the data is contained in a compressed form. + +- field_name: package_format + label: Package format + help_text: The format of the file in which one or more data files are grouped together. + +- field_name: size + label: Size + validators: ignore_missing int_validator + form_snippet: number.html + display_snippet: file_size.html + help_text: File size in bytes + +- field_name: hash + label: Hash + help_text: Checksum of the downloaded file. 
+ +- field_name: hash_algorithm + label: Hash Algorithm + help_text: Algorithm used to calculate the checksum. + +- field_name: rights + label: Rights + form_snippet: markdown.html + display_snippet: markdown.html + help_text: Some statement about the rights associated with the resource. + +- field_name: availability + label: Availability + help_text: Indicates how long it is planned to keep the resource available. + +- field_name: status + label: Status + preset: select + choices: + - value: http://purl.org/adms/status/Completed + label: Completed + - value: http://purl.org/adms/status/UnderDevelopment + label: Under Development + - value: http://purl.org/adms/status/Deprecated + label: Deprecated + - value: http://purl.org/adms/status/Withdrawn + label: Withdrawn + help_text: The status of the resource in the context of maturity lifecycle. + +- field_name: license + label: License + help_text: License in which the resource is made available. If not provided will be inherited from the dataset. + +- field_name: has_version + label: Has version + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_inline: true + help_text: This property refers to a related Dataset that is a version, edition, or adaptation of the described Dataset. + + # Note: this falls back to the standard resource url field +- field_name: access_url + label: Access URL + help_text: URL that gives access to the dataset (defaults to the standard resource URL). + + # Note: this falls back to the standard resource url field +- field_name: download_url + label: Download URL + display_snippet: link.html + help_text: URL that provides a direct link to a downloadable file (defaults to the standard resource URL). + +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the resource. 
+ +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the resource was changed, updated or modified. + +- field_name: temporal_resolution + label: Temporal resolution + help_text: Minimum time period resolvable in the distribution. + +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + help_text: Minimum spatial separation resolvable in the distribution, measured in meters. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the resource. + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this resource. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An established schema to which the described resource conforms. + +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the resource. + +- field_name: access_services + label: Access services + repeating_label: Access service + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: title + label: Title + + - field_name: endpoint_description + label: Endpoint description + + - field_name: endpoint_url + label: Endpoint URL + preset: multiple_text + + - field_name: serves_dataset + label: Serves dataset + preset: multiple_text + validators: ignore_missing scheming_multiple_text + + help_text: A data service that gives access to the resource. + + # Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this resource (if not provided it will be autogenerated). 
# End-to-end parse test for the HealthDCAT-AP profile (RDF -> CKAN dataset)
import json
import logging
from pprint import pprint

import pytest
from ckan.tests.helpers import call_action

from ckanext.dcat.processors import RDFParser
from ckanext.dcat.tests.utils import BaseParseTest

log = logging.getLogger(__name__)


# Runs with the dcat and scheming plugins, the HealthDCAT-AP scheming schema
# and the euro_health_dcat_ap profile enabled.
@pytest.mark.usefixtures("with_plugins", "clean_db")
@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets")
@pytest.mark.ckan_config(
    "scheming.dataset_schemas", "ckanext.dcat.schemas:health_dcat_ap.yaml"
)
@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap")
class TestSchemingParseSupport(BaseParseTest):
    def test_e2e_dcat_to_ckan(self):
        """
        Parse a DCAT RDF graph into a CKAN dataset dict, create a dataset with
        package_create and check that all expected fields are there
        """

        # The expected values below come from this Turtle fixture
        contents = self._get_file_contents("dcat/dataset_health.ttl")

        p = RDFParser()

        p.parse(contents, _format="turtle")

        datasets = [d for d in p.datasets()]

        assert len(datasets) == 1

        dataset_dict = datasets[0]

        # A name is set explicitly before the dataset is created
        dataset_dict["name"] = "test-dcat-1"
        dataset = call_action("package_create", **dataset_dict)

        # Core fields

        assert dataset["title"] == "HealthDCAT-AP test dataset"
        assert (
            dataset["notes"]
            == "This dataset is an example of using HealthDCAT-AP in CKAN"
        )

        assert sorted([t["name"] for t in dataset["tags"]]) == [
            "Test 1",
            "Test 2",
            "Test 3",
        ]

        # Standard fields
        assert dataset["version_notes"] == "Dataset continuously updated"
        assert dataset["identifier"] == "http://example.com/dataset/1234567890"
        assert (
            dataset["frequency"]
            == "http://publications.europa.eu/resource/authority/frequency/DAILY"
        )
        assert (
            dataset["access_rights"]
            == "http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC"
        )
        assert (
            dataset["provenance"]
            == "This example dataset is partly sourced from TEHDAS2"
        )

        # Hard to map (example uses a blank node which doesn't work well in CKAN)
        # assert dataset["dcat_type"] == "test-type"

        assert dataset["issued"] == "2024-01-01T00:00:00+00:00"
        assert dataset["modified"] == "2024-12-31T23:59:59+00:00"
        assert dataset["temporal_resolution"] == "P1D"

        assert dataset["analytics"] == ["http://example.com/analytics"]
        assert sorted(dataset["code_values"]) == [
            "http://example.com/code1",
            "http://example.com/code2",
        ]
        assert sorted(dataset["coding_system"]) == [
            "http://www.wikidata.org/entity/P1690",
            "http://www.wikidata.org/entity/P4229",
        ]

        assert dataset["spatial_coverage"] == [
            {
                "uri": "http://publications.europa.eu/resource/authority/country/BEL",
            }
        ]

        # List fields (sorted before comparing, so the stored order is not checked)
        assert sorted(dataset["conforms_to"]) == [
            "http://www.wikidata.org/entity/Q19597236"
        ]
        assert sorted(dataset["language"]) == [
            "http://publications.europa.eu/resource/authority/language/ENG",
            "http://publications.europa.eu/resource/authority/language/FRA",
            "http://publications.europa.eu/resource/authority/language/NLD",
        ]
        assert sorted(dataset["theme"]) == [
            "http://publications.europa.eu/resource/authority/data-theme/HEAL"
        ]

        assert sorted(dataset["is_referenced_by"]) == [
            "https://doi.org/10.1038/sdata.2016.18",
            "https://dx.doi.org/10.1002/jmri.28679",
        ]
        assert sorted(dataset["applicable_legislation"]) == [
            "http://data.europa.eu/eli/reg/2022/868/oj",
        ]

        # Repeating subfields
        assert dataset["contact"][0]["name"] == "Contact Point"
        assert dataset["contact"][0]["email"] == "contact@example.com"

        assert dataset["publisher"][0]["name"] == "Contact Point"
        assert dataset["publisher"][0]["email"] == "info@example.com"
        assert dataset["publisher"][0]["url"] == "https://healthdata.nl"

        # dcat:qualifiedRelation is exposed as the "qualified_relation" field
        assert len(dataset["qualified_relation"]) == 1
        assert (
            dataset["qualified_relation"][0]["relation"]
            == "http://example.com/dataset/3.141592"
        )
        assert (
            dataset["qualified_relation"][0]["role"]
            == "http://www.iana.org/assignments/relation/related"
        )

        assert dataset["temporal_coverage"][0]["start"] == "2020-03-01"
        assert dataset["temporal_coverage"][0]["end"] == "2024-12-31"

        ## HealthDCAT specific
        assert sorted(dataset["health_theme"]) == [
            "http://www.wikidata.org/entity/Q58624061",
            "http://www.wikidata.org/entity/Q7907952",
        ]

        assert dataset["legal_basis"] == ["https://w3id.org/dpv#Consent"]

        assert dataset["hdab"][0]["name"] == "EU Health Data Access Body"
        assert dataset["hdab"][0]["email"] == "hdab@example.com"
        assert dataset["hdab"][0]["url"] == "https://www.example.com/hdab"

        # CKAN converts these to strings, but also converts back to decimal/nonneg int
        # Note that the minimum typical age is a zero, which must not be dropped
        assert dataset["min_typical_age"] == "0"
        assert dataset["max_typical_age"] == "110"
        assert dataset["number_of_records"] == "123456789"
        assert dataset["number_of_unique_individuals"] == "7654321"

        assert sorted(dataset["personal_data"]) == [
            "https://w3id.org/dpv/dpv-pd#Age",
            "https://w3id.org/dpv/dpv-pd#Gender",
            "https://w3id.org/dpv/dpv-pd#HealthRecord",
        ]

        assert dataset["population_coverage"] == [
            "This example includes a very non-descript population"
        ]
        assert dataset["publisher_note"] == [
            "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation."
        ]
        assert dataset["publisher_type"] == [
            "http://example.com/publisherType/undefined"
        ]

        assert dataset["purpose"] == ["https://w3id.org/dpv#AcademicResearch"]

        # The retention period is stored as a single start/end item
        assert dataset["retention_period"] == [
            {
                "start": "2020-03-01",
                "end": "2034-12-31",
            }
        ]
+@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") +class TestEuroDCATAP3ProfileSerializeDataset(BaseSerializeTest): + def test_e2e_ckan_to_dcat(self): + """ + End to end testing of CKAN dataset to RDF triples. + + Note: in this HealthDCAT-AP profile, only the HealthDCAT-AP specific triples are tested for. + Triples in other profiles could be tested, but should mainly be tested by their respective + profiles.""" + dataset_dict = json.loads(self._get_file_contents("ckan/health_dcat_ap.json"))[ + 0 + ] + + dataset = call_action("package_create", **dataset_dict) + + # Make sure schema was used + assert dataset["hdab"][0]["name"] == "EU Health Data Access Body" + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + # Test dataset URI + assert str(dataset_ref) == utils.dataset_uri(dataset) + + # Load reference graph that only contains non-blank nodes + contents = self._get_file_contents("dcat/dataset_health_no_blank.ttl") + reference = Graph() + reference.parse(data=contents, format="turtle") + + # First check that all non-blank nodes from the reference are present in the output + # Any other nodes added by other profiles (e.g. DCAT-AP 3) we do not have an opinion about + for triple in reference: + assert triple in g, f"Triple {triple} not in output graph" + # assert all(triple in g for triple in reference) + + # Test HealthDCAT-AP specific HDAB triples + # We can assume other blank nodes (e.g. contact point, publisher, temporal) are taken care + # of by the base profile.
+ hdab = [t for t in g.triples((dataset_ref, HEALTHDCATAP.hdab, None))] + assert len(hdab) == 1 + hdab_items = [ + (FOAF.name, dataset_dict["hdab"][0]["name"]), + (VCARD.hasEmail, URIRef("mailto:" + dataset_dict["hdab"][0]["email"])), + (FOAF.homepage, URIRef(dataset_dict["hdab"][0]["url"])), + ] + for predicate, value in hdab_items: + assert self._triple( + g, hdab[0][2], predicate, value + ), f"HDAB Predicate {predicate} does not have value {value}" + + # Test qualified relation + relation = [t for t in g.triples((dataset_ref, DCAT.qualifiedRelation, None))] + assert len(relation) == 1 + relation_items = [ + (DCT.relation, URIRef(dataset_dict["qualified_relation"][0]["relation"])), + (DCAT.hadRole, URIRef(dataset_dict["qualified_relation"][0]["role"])), + ] + for predicate, value in relation_items: + assert self._triple( + g, relation[0][2], predicate, value + ), f"relation Predicate {predicate} does not have value {value}" diff --git a/docs/healthdcat.md b/docs/healthdcat.md new file mode 100644 index 00000000..ac5f1a4a --- /dev/null +++ b/docs/healthdcat.md @@ -0,0 +1,61 @@ +# HealthDCAT-AP + +## Introduction + +This extension contains a profile for the proposed [HealthDCAT-AP](https://healthdcat-ap.github.io/) +extension, a health-related extension of the DCAT application profile for sharing information about +Catalogues containing Datasets and Data Services descriptions in Europe (DCAT-AP). The CKAN +HealthDCAT-AP profile was developed to implement this. + +The development of a healthDCAT application profile, as an extension of the DCAT application +profile, aims to standardize health metadata within the scope of EHDS, fostering greater +interoperability, findability and accessibility of electronic health data across the EU. + +**Note:** HealthDCAT-AP is still under active development and not finalized yet. Cardinalities, +certain vocabularies and the namespace have not officially been ratified yet. 
These are expected to +be finalized after the public consultation in Q1 2025. + +The goal of this profile is to provide the wider FAIR community and other EU portals with a starting +point for implementing HealthDCAT-AP within their own data catalogs. + +## Implementation details + +The HealthDCAT-AP profile is an extension of the DCAT-AP v3 profile. Just like that profile, +this profile requires *ckanext-scheming*. + +## Usage and settings + +This profile currently has no additional settings. To select the profile, make sure +`scheming.dataset_schemas` includes `ckanext.dcat.schemas:health_dcat_ap.yaml`, and +`ckanext.dcat.rdf.profiles` includes `euro_health_dcat_ap`. + +## Limitations and deviations + +As HealthDCAT-AP is still a draft, it is subject to change. There are currently still some +inconsistencies in the standard and ambiguities regarding certain properties. Below is a short summary +of limitations and implementation decisions made during development of this profile. + +1. Cardinalities have not yet been finalized for HealthDCAT-AP. This CKAN schema has taken a very + liberal approach and takes all values as strictly optional (no failed validation for missing + fields). Note that some mandatory fields are currently impossible to fill with real data e.g. the + Health Data Access Body (HDAB) field: the EHDS legislation has not been implemented yet and no HDABs + have been formally appointed. +2. The HealthDCAT-AP namespace is not formally defined yet. For now, + `http://healthdataportal.eu/ns/health#` is used. This will be updated once the final namespace is + standardized. +3. The official examples of the standard use the `dct:description` property to encode the data + purpose. This does not seem to conform to the Data Privacy Vocabulary specification, which + proposes a controlled vocabulary. See + () for the German perspective on + this. +4. The distributions proposed by HealthDCAT-AP, *analytics* and *sample*, are not specifically + implemented.
URIs are linked, the resources themselves are not loaded. For *sample*, as this is + an upstream DCAT-AP property, this can be included once picked up there. +5. Documentation (*foaf:page*) is implemented as an URI. There is some HealthDCAT-AP example data + out in the wild that uses a blank node for this and adds several properties, however this is + inconsistent with other DCAT implementations. +6. DatasetSeries are not supported yet by CKAN, and also not by this profile. +7. The *quality annotation* property has not been implemented due to usage not being completely +defined yet. +8. There is no multilingual support yet. +9. For other properties, any limitations from the DCAT-AP profiles still apply. diff --git a/examples/ckan/health_dcat_ap.json b/examples/ckan/health_dcat_ap.json new file mode 100644 index 00000000..2670c77b --- /dev/null +++ b/examples/ckan/health_dcat_ap.json @@ -0,0 +1,199 @@ +[ + { + "access_rights": "http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC", + "analytics": [ + "http://example.com/analytics" + ], + "alternate_identifier": [ + "internalURI:admsIdentifier0" + ], + "applicable_legislation": [ + "http://data.europa.eu/eli/reg/2022/868/oj" + ], + "author": null, + "author_email": null, + "code_values": [ + "http://example.com/code1", + "http://example.com/code2" + ], + "coding_system": [ + "http://www.wikidata.org/entity/P1690", + "http://www.wikidata.org/entity/P4229" + ], + "conforms_to": [ + "http://www.wikidata.org/entity/Q19597236" + ], + "creator_user_id": null, + "dcat_type": "n1049372e768c4429a6b2200c22f5f1a4b7", + "documentation": [ + "n1049372e768c4429a6b2200c22f5f1a4b9" + ], + "frequency": "http://publications.europa.eu/resource/authority/frequency/DAILY", + "health_category": [ + "http://example.com/ontology/resource/authority/healthcategories/PHDR", + "http://example.com/ontology/resource/authority/healthcategories/IDHP", + "http://example.com/ontology/resource/authority/healthcategories/DIOH", + 
"http://example.com/ontology/resource/authority/healthcategories/EHRS" + ], + "health_theme": [ + "http://www.wikidata.org/entity/Q7907952", + "http://www.wikidata.org/entity/Q58624061" + ], + "id": "e7ccf79d-705c-427f-8e96-f87bcd6e5318", + "identifier": "http://example.com/dataset/1234567890", + "is_referenced_by": [ + "https://doi.org/10.1038/sdata.2016.18", + "https://dx.doi.org/10.1002/jmri.28679" + ], + "isopen": false, + "issued": "2024-01-01T00:00:00+00:00", + "language": [ + "http://publications.europa.eu/resource/authority/language/ENG", + "http://publications.europa.eu/resource/authority/language/NLD", + "http://publications.europa.eu/resource/authority/language/FRA" + ], + "legal_basis": [ + "https://w3id.org/dpv#Consent" + ], + "license_id": "", + "license_title": "", + "maintainer": null, + "maintainer_email": null, + "max_typical_age": "110", + "metadata_created": "2024-12-02T19:00:30.897399", + "metadata_modified": "2024-12-02T19:00:30.897406", + "min_typical_age": "0", + "modified": "2024-12-31T23:59:59+00:00", + "name": "test-dcat-1", + "notes": "This dataset is an example of using HealthDCAT-AP in CKAN", + "num_resources": 0, + "num_tags": 3, + "number_of_records": "123456789", + "number_of_unique_individuals": "7654321", + "organization": null, + "personal_data": [ + "https://w3id.org/dpv/dpv-pd#Age", + "https://w3id.org/dpv/dpv-pd#Gender", + "https://w3id.org/dpv/dpv-pd#HealthRecord" + ], + "population_coverage": [ + "This example includes a very non-descript population" + ], + "private": false, + "provenance": "This example dataset is partly sourced from TEHDAS2", + "publisher_note": [ + "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation." 
+ ], + "publisher_type": [ + "http://example.com/publisherType/undefined" + ], + "purpose": [ + "https://w3id.org/dpv#AcademicResearch" + ], + "qualified_relation": [ + { + "uri": "", + "relation": "http://example.com/dataset/3.141592", + "role": "http://www.iana.org/assignments/relation/related" + } + ], + "state": "active", + "temporal_resolution": "P1D", + "theme": [ + "http://publications.europa.eu/resource/authority/data-theme/HEAL" + ], + "title": "HealthDCAT-AP test dataset", + "type": "dataset", + "uri": "http://example.healthdata.nl/set/dataset", + "version_notes": "Dataset continuously updated", + "contact": [ + { + "email": "covacsurv@sciensano.be", + "identifier": "", + "name": "Contact Point" + } + ], + "creator": [ + { + "email": "info@example.com", + "identifier": "", + "name": "Contact Point", + "type": "", + "url": "https:/example.com/homepage" + } + ], + "extras": [ + { + "key": "related_resource", + "value": "[\"http://example.com/dataset/9876543210\"]" + }, + { + "key": "sample", + "value": "[\"http://example.com/sample\"]" + }, + { + "key": "spatial_uri", + "value": "http://publications.europa.eu/resource/authority/country/BEL" + } + ], + "hdab": [ + { + "email": "hdab@example.com", + "identifier": "", + "name": "EU Health Data Access Body", + "type": "", + "uri": "", + "url": "https://www.example.com/hdab" + } + ], + "publisher": [ + { + "email": "info@example.com", + "identifier": "", + "name": "Contact Point", + "type": "", + "uri": "", + "url": "https://healthdata.nl" + } + ], + "retention_period": [ + { + "end": "2034-12-31", + "start": "2020-03-01" + } + ], + "tags": [ + { + "display_name": "Test 1", + "id": "5c418ec2-cb41-4c42-9b9c-f5d1e3a831e5", + "name": "Test 1", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "Test 2", + "id": "c4117ace-2114-470d-b6e9-0df7580a12d8", + "name": "Test 2", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "Test 3", + "id": 
"d5a5288d-3bff-431e-be94-12c71d25d75b", + "name": "Test 3", + "state": "active", + "vocabulary_id": null + } + ], + "temporal_coverage": [ + { + "end": "2024-12-31", + "start": "2020-03-01" + } + ], + "resources": [], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } +] \ No newline at end of file diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl new file mode 100644 index 00000000..a665c1ee --- /dev/null +++ b/examples/dcat/dataset_health.ttl @@ -0,0 +1,295 @@ +@prefix adms: . +@prefix dcat: . +@prefix dcatap: . +@prefix dct: . +@prefix dqv: . +@prefix foaf: . +@prefix locn: . +@prefix oa: . +@prefix prov: . +@prefix rdfs: . +@prefix skos: . +@prefix spdx: . +@prefix vcard: . + + + a dcat:Resource , dcat:Dataset; + dcatap:applicableLegislation ; + + ; + + , + ; + , + ; + + [ a foaf:Organization; + foaf:homepage ; + foaf:mbox ; + foaf:name "EU Health Data Access Body" + ]; + + , , , ; + + , ; + + "110"^^; + + "0"^^; + + "123456789"^^; + + "7654321"^^; + + "This example includes a very non-descript population"; + + "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation."; + + ; + + [ a dct:PeriodOfTime; + rdfs:comment "As stated in the CSI deliberation"; + dcat:endDate "2034-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dct:accessRights ; + dct:accrualPeriodicity ; + dct:alternative "TEST-DATASET"; + dct:conformsTo ; + dct:creator ; + dct:description "This dataset is an example of using HealthDCAT-AP in CKAN"; + dct:identifier "http://example.com/dataset/1234567890"^^; + dct:isPartOf ; + dct:isReferencedBy , ; + dct:issued "2024-01-01T00:00:00Z"^^; + dct:language , , ; + dct:modified "2024-12-31T23:59:59Z"^^; + dct:provenance [ a dct:ProvenanceStatement; + rdfs:label "This example dataset is partly sourced from TEHDAS2" + ]; + dct:publisher [ a foaf:Organization , foaf:Agent; + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact 
Point" + ]; + dct:relation ; + dcat:qualifiedRelation [ + a dcat:Relationship ; + dct:relation ; + dcat:hadRole + ]; + dct:spatial ; + dct:temporal [ a dct:PeriodOfTime; + dcat:endDate "2024-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dct:title "HealthDCAT-AP test dataset"; + dct:type [ a skos:Concept; + skos:inScheme ; + skos:prefLabel "Personal Data" + ]; + adms:identifier ; + adms:sample ; + adms:versionNotes "Dataset continuously updated"; + dcat:contactPoint ; + # dcat:distribution ; + dcat:hasVersion ; + dcat:keyword "Test 1" , "Test 2" , "Test 3"; + dcat:spatialResolutionInMeters "10"^^; + dcat:temporalResolution "P1D"^^; + dcat:theme ; + # dcat:version is not mapped in ckan and should be hasVersion + # dcat:version "Project HDBP0250"; + dqv:hasQualityAnnotation [ a dqv:QualityCertificate; + oa:hasBody ; + oa:hasTarget ; + oa:motivatedBy dqv:qualityAssessment + ]; + prov:qualifiedAttribution ; + prov:wasGeneratedBy ; + foaf:page [ a foaf:Document; + rdfs:label "Landing Page for Sciensano"; + foaf:homepage + ]; + + ; + + , + , + ; + + . + + + a dcat:Distribution; + dcatap:applicableLegislation ; + dct:format ; + dct:identifier "http://ehelse.healthdataportal.eu/analytics/47f55653-a151-48c1-8d90-940561da6e57"; + dct:isPartOf ; + dct:issued "2024-06-03T08:51:00Z"^^; + dct:license ; + dct:modified "2024-06-04T18:00:00Z"^^; + dct:rights [ a dct:RightsStatement; + rdfs:label "_g_L202C11377" , "internalURI:wasGeneratedBy0" , "_g_L123C7733" + ]; + dct:title "Technical report number of unique study subjects available by environment for project HDBP0250"; + dcat:accessURL ; + dcat:downloadURL ; + dcat:mediaType . + + + a dct:MediaType . + + + a foaf:Agent; + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact Point" . + + + a adms:Identifier; + skos:notation "https://www.healthinformationportal.eu/health-information-sources/linking-registers-covid-19-vaccine-surveillance"^^; + adms:schemaAgency "Health Information Portal" . 
+ + + a vcard:Organization , vcard:Kind; + vcard:fn "Contact Point"; + vcard:hasEmail ; + vcard:hasURL ; + vcard:organisationName "Contact Point"; + vcard:organisationUnit "Health Information" . + + + a dcat:CatalogRecord; + dct:creator ; + dct:identifier "16e16149-bf41-42f6-8741-225e8c97a35e"; + dct:issued "2024-10-04T14:28:36Z"^^; + dct:modified "2024-10-09T17:34:28Z"^^; + spdx:checksum [ a spdx:Checksum; + spdx:algorithm spdx:checksumAlgorithm_md5; + spdx:checksumValue "ea77c251b6945e450ae4d66c581495d4" + ]; + foaf:primaryTopic . + + + + a dct:LinguisticSystem . + + + a ; + dct:title "ID_TU_STATBEL_POP"; + + ; + dcat:keyword "TEST-DATASET" . + + + a dcat:Distribution; + dcatap:applicableLegislation ; + dct:format ; + dct:identifier "http://ehelse.healthdataportal.eu/sample/fe921169-4619-4386-8bfe-60ea131dbe96"; + dct:isPartOf ; + dct:issued "2024-06-03T08:51:00Z"^^; + dct:language ; + dct:license ; + dct:modified "2024-06-04T18:00:00Z"^^; + dct:rights [ a dct:RightsStatement; + rdfs:label "Free access." + ]; + dct:title "Proxy data generating for the EHDS2 Pilot project Sciensano Use Case"; + dcat:accessURL ; + dcat:downloadURL ; + dcat:mediaType . + + + + a dct:LinguisticSystem . + + + a dct:LinguisticSystem . + + + a skos:Concept; + skos:prefLabel "National Public Health Institute" . + + + a dct:RightsStatement . + + + a dct:Frequency . + + + a prov:Attribution; + dcat:hadRole ; + prov:agent [ a foaf:Organization; + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact Point" + ] . + + + a dct:Location . + + + a skos:Concept; + dct:identifier "https://icd.who.int/browse10/2019/en#/Y59.0"^^; + skos:definition "Viral vaccines"; + skos:hasTopConcept ; + skos:notation "Y59.0"; + skos:prefLabel "Viral vaccines" . + + + a dct:MediaTypeOrExtent . 
+ +# +# a dcat:Distribution; +# dcatap:applicableLegislation ; +# dct:description "EU Health Data Access Body For better Healthcare, Research & Policy Making"; +# dct:format ; +# dct:identifier "http://ehelse.healthdataportal.eu/distribution/13a3851d-6cdf-4570-a7f0-7f03015d1925"; +# dct:isPartOf ; +# dct:issued "2024-06-03T08:51:00Z"^^; +# dct:license ; +# dct:modified "2024-06-04T18:00:00Z"^^; +# dct:rights [ a dct:RightsStatement; +# rdfs:label "Access to data is conditional on the issuance of a permit by the HDAB after submission of a data request application (English)" +# ]; +# dct:title "EU Health Data Access Body"; +# dcat:accessURL ; +# dcat:byteSize "80000"^^ . + + + a prov:Activity; + rdfs:label "http://dbpedia.org/resource/Record_linkage"; + rdfs:seeAlso ; + dct:type ; + prov:startedAtTime "2021-01-01T00:00:00Z"^^; + prov:wasAssociatedWith [ a prov:Agent; + prov:actedOnBehalfOf [ a prov:Organization , prov:Agent; + foaf:name "Contact Point" + ]; + foaf:homepage ; + foaf:mbox ; + foaf:name "Dr. Joris van Loenhout" + ]; + foaf:page . + + + a ; + + ; + + "Patient death reason\tInformation on wheter the cause of death was COVID-19."; + + "CD_COD_COVID" . + + + a skos:Concept; + dct:identifier "https://icd.who.int/browse10/2019/en#/U07.1"^^; + skos:definition "COVID-19, virus identified"; + skos:hasTopConcept ; + skos:notation "U07.1"; + skos:prefLabel "Test 1" . + + + a dct:LicenseDocument; + rdfs:label "Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported" . diff --git a/examples/dcat/dataset_health_no_blank.ttl b/examples/dcat/dataset_health_no_blank.ttl new file mode 100644 index 00000000..ba854b5f --- /dev/null +++ b/examples/dcat/dataset_health_no_blank.ttl @@ -0,0 +1,81 @@ +# This Graph contains no blank nodes, to allow for easy comparison between a generated graph +# The blind nodes can be compared manually + +@prefix adms: . +@prefix dcat: . +@prefix dcatap: . +@prefix dct: . +@prefix dpv: . +@prefix foaf: . +@prefix healthdcatap: . 
+@prefix rdfs: . +@prefix skos: . +@prefix vcard: . +@prefix xsd: . + + a dcat:Dataset ; +# healthdcatap:hdab [ a foaf:Agent ; +# vcard:hasEmail ; +# foaf:homepage ; +# foaf:name "EU Health Data Access Body" ] ; +# dct:provenance [ a dct:ProvenanceStatement ; +# rdfs:label "This example dataset is partly sourced from TEHDAS2" ] ; +# dct:publisher [ a foaf:Agent ; +# vcard:hasEmail ; +# foaf:homepage ; +# foaf:name "Contact Point" ] ; +# dct:temporal [ a dct:PeriodOfTime ; +# dcat:endDate "2024-12-31"^^xsd:date ; +# dcat:startDate "2020-03-01"^^xsd:date ] ; +# adms:identifier [ a adms:Identifier ; +# skos:notation "internalURI:admsIdentifier0" ] ; +# dcat:contactPoint [ a vcard:Kind ; +# vcard:fn "Contact Point" ; +# vcard:hasEmail ] ; + dcatap:applicableLegislation ; + healthdcatap:analytics ; + healthdcatap:hasCodeValues , + ; + healthdcatap:hasCodingSystem , + ; + healthdcatap:healthCategory , + , + , + , + , + ; + healthdcatap:maxTypicalAge "110"^^xsd:nonNegativeInteger ; + healthdcatap:minTypicalAge "0"^^xsd:nonNegativeInteger ; + healthdcatap:numberOfRecords "123456789"^^xsd:nonNegativeInteger ; + healthdcatap:numberOfUniqueIndividuals "7654321"^^xsd:nonNegativeInteger ; + healthdcatap:populationCoverage "This example includes a very non-descript population" ; + healthdcatap:publisherNote "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation." 
; + healthdcatap:publisherType ; + dct:accessRights ; + dct:accrualPeriodicity ; + dct:conformsTo ; + dct:description "This dataset is an example of using HealthDCAT-AP in CKAN" ; + dct:identifier ; + dct:isReferencedBy , + ; + dct:issued "2024-01-01T00:00:00+00:00"^^xsd:dateTime ; + dct:language , + , + ; + dct:modified "2024-12-31T23:59:59+00:00"^^xsd:dateTime ; + dct:relation ; + dct:title "HealthDCAT-AP test dataset" ; + dct:type "n1049372e768c4429a6b2200c22f5f1a4b7" ; + adms:sample ; + adms:versionNotes "Dataset continuously updated" ; + dcat:keyword "Test 1", + "Test 2", + "Test 3" ; + dcat:temporalResolution "P1D"^^xsd:duration ; + dcat:theme ; + foaf:page "n1049372e768c4429a6b2200c22f5f1a4b9" ; + dpv:hasLegalBasis dpv:Consent ; + dpv:hasPurpose dpv:AcademicResearch ; + dpv:hasPersonalData , + , + . diff --git a/pyproject.toml b/pyproject.toml index b7634286..80033250 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,5 +61,6 @@ euro_dcat_ap = "ckanext.dcat.profiles:EuropeanDCATAPProfile" euro_dcat_ap_2 = "ckanext.dcat.profiles:EuropeanDCATAP2Profile" euro_dcat_ap_3 = "ckanext.dcat.profiles:EuropeanDCATAP3Profile" euro_dcat_ap_scheming = "ckanext.dcat.profiles:EuropeanDCATAPSchemingProfile" -dcat_us_3="ckanext.dcat.profiles:DCATUS3Profile" +euro_health_dcat_ap = "ckanext.dcat.profiles:EuropeanHealthDCATAPProfile" +dcat_us_3 = "ckanext.dcat.profiles:DCATUS3Profile" schemaorg = "ckanext.dcat.profiles:SchemaOrgProfile"