diff --git a/mermaid/svg/flow.svg b/mermaid/svg/flow.svg index a158bd0..8f81e1d 100644 --- a/mermaid/svg/flow.svg +++ b/mermaid/svg/flow.svg @@ -1 +1 @@ -
TOP
B1
f1
i1
C
B2
f2
i2
A
B
\ No newline at end of file +
TOP
B1
f1
i1
C
B2
f2
i2
A
B
\ No newline at end of file diff --git a/mermaid/svg/publish.svg b/mermaid/svg/publish.svg index 40ec45f..688f424 100644 --- a/mermaid/svg/publish.svg +++ b/mermaid/svg/publish.svg @@ -1 +1 @@ -
prepare data
Generate CDS data structure
generate CDS data structure
Generate high level metadata
generate study_description.json
generate dataset_description.json
generate README.md
generate CHANGELOG.md
generate LICENSE
generate datasheet.md
generate citation.md
generate datatype_dictionary.json
generate participants.tsv
generate participants.json
generate CDS data structure
generate derivatives
User clicks 'publish' in UI
\ No newline at end of file +
prepare data
Generate CDS data structure
generate CDS data structure
Generate high level metadata
generate study_description.json
generate dataset_description.json
generate README.md
generate CHANGELOG.md
generate LICENSE
generate datasheet.md
generate citation.md
generate datatype_dictionary.json
generate participants.tsv
generate participants.json
generate CDS data structure
generate derivatives
User clicks 'publish' in UI
\ No newline at end of file diff --git a/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py b/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py index 699d25e..522d7b0 100644 --- a/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py +++ b/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py @@ -5,6 +5,7 @@ import pathlib import tempfile import uuid +import json import azure.storage.blob as azureblob import psycopg2 @@ -28,8 +29,8 @@ def pipeline(): cur = conn.cursor() - study_id = "c588f59c-cacb-4e52-99dd-95b37dcbfd5c" - dataset_id = "af4be921-e507-41a9-9328-4cbb4b7dca1c" + study_id = "e631d9c1-a74a-413f-a5ce-64535a7302b0" + dataset_id = "f636e555-4c2d-4c89-a79b-a0a63bc29664" cur.execute( "SELECT * FROM dataset WHERE id = %s AND study_id = %s", @@ -53,7 +54,7 @@ def pipeline(): identifier["identifierValue"] = doi[0] identifier["identifierType"] = "DOI" - dataset_metadata["Identifier"] = identifier + dataset_metadata["identifier"] = identifier titles = [] @@ -76,13 +77,13 @@ def pipeline(): titles.append(item) - dataset_metadata["Title"] = titles + dataset_metadata["title"] = titles # todo: generating a random uuid for now # Get the dataset version version = str(uuid.uuid4()) - dataset_metadata["Version"] = version + dataset_metadata["version"] = version alternate_identifiers = [] @@ -103,13 +104,13 @@ def pipeline(): alternate_identifiers.append(item) - dataset_metadata["AlternateIdentifier"] = alternate_identifiers + dataset_metadata["alternateIdentifier"] = alternate_identifiers creators = [] # Get the dataset creators cur.execute( - "SELECT name, name_type, name_identifier, name_identifier_scheme, name_identifier_scheme_uri, affiliations FROM dataset_contributor WHERE dataset_id = %s AND creator = true", + "SELECT family_name, given_name, name_type, name_identifier, name_identifier_scheme, name_identifier_scheme_uri, affiliations FROM dataset_contributor WHERE dataset_id = %s AND creator = true", (dataset_id,), ) @@ -119,54 +120,74 @@ def pipeline(): for creator in dataset_creators: item = {} - item["creatorName"] = creator[0] - item["nameType"] = creator[1] + creator_name = "" + + if (creator[0] is not None and creator[0] != "") and ( + creator[1] is not None and creator[1] != "" + ): + creator_name = f"{creator[0]}, {creator[1]}" + elif creator[0] is not None and creator[0] != "": + creator_name = creator[0] + elif creator[1] is not None and creator[1] != "": + creator_name = creator[1] + + item["creatorName"] = creator_name + item["nameType"] = creator[2] name_identifier = {} - name_identifier["nameIdentifierValue"] = creator[2] - name_identifier["nameIdentifierScheme"] = creator[3] - if creator[4] is not None and creator[4] != "": - name_identifier["schemeURI"] = creator[4] + name_identifier["nameIdentifierValue"] = creator[3] + name_identifier["nameIdentifierScheme"] = creator[4] + if creator[5] is not None and creator[5] != "": + name_identifier["schemeURI"] = creator[5] item["nameIdentifier"] = [name_identifier] - affiliations = creator[5] + affiliations = creator[6] item["affiliation"] = [] for affiliation in affiliations: affiliation_item = {} - affiliation_item["affiliationValue"] = affiliation["name"] + affiliation_item["affiliationName"] = affiliation["name"] + + affiliation_item["affiliationIdentifier"] = {} + if ( affiliation["identifier"] is not None and affiliation["identifier"] != "" ): - affiliation_item["affiliationIdentifier"] = affiliation[ - "identifier" - ] + affiliation_item["affiliationIdentifier"][ + "affiliationIdentifierValue" + ] = affiliation["identifier"] + if affiliation["scheme"] is not None and affiliation["scheme"] != "": - affiliation_item["affiliationIdentifierScheme"] = affiliation[ - "scheme" - ] + affiliation_item["affiliationIdentifier"][ + "affiliationIdentifierScheme" + ] = affiliation["scheme"] if ( affiliation["scheme_uri"] is not None and affiliation["scheme_uri"] != "" ): - affiliation_item["schemeURI"] = affiliation["scheme_uri"] + affiliation_item["affiliationIdentifier"]["schemeURI"] = ( + affiliation["scheme_uri"] + ) item["affiliation"].append(affiliation_item) + if item["affiliation"] == []: + del item["affiliation"] + creators.append(item) - dataset_metadata["Creator"] = creators + dataset_metadata["creator"] = creators contributors = [] # Get the dataset contributors cur.execute( - "SELECT name, name_type, name_identifier, name_identifier_scheme, name_identifier_scheme_uri, contributor_type, affiliations FROM dataset_contributor WHERE dataset_id = %s AND creator = false", + "SELECT family_name, given_name, name_type, name_identifier, name_identifier_scheme, name_identifier_scheme_uri, contributor_type, affiliations FROM dataset_contributor WHERE dataset_id = %s AND creator = false", (dataset_id,), ) @@ -176,7 +197,19 @@ def pipeline(): for contributor in dataset_contributors: item = {} - item["contributorName"] = contributor[0] + contributor_name = "" + + if (contributor[0] is not None and contributor[0] != "") and ( + contributor[1] is not None and contributor[1] != "" + ): + contributor_name = f"{contributor[0]}, {contributor[1]}" + elif contributor[0] is not None and contributor[0] != "": + contributor_name = contributor[0] + elif contributor[1] is not None and contributor[1] != "": + contributor_name = contributor[1] + + item["contributorName"] = contributor_name + item["nameType"] = contributor[1] name_identifier = {} @@ -197,34 +230,45 @@ def pipeline(): for affiliation in affiliations: affiliation_item = {} - affiliation_item["affiliationValue"] = affiliation["name"] + affiliation_item["affiliationName"] = affiliation["name"] + + affiliation_item["affiliationIdentifier"] = {} + if ( affiliation["identifier"] is not None and affiliation["identifier"] != "" ): - affiliation_item["affiliationIdentifier"] = affiliation[ - "identifier" - ] + affiliation_item["affiliationIdentifier"][ + "affiliationIdentifierValue" + ] = affiliation["identifier"] + if affiliation["scheme"] is not None and affiliation["scheme"] != "": - affiliation_item["affiliationIdentifierScheme"] = affiliation[ - "scheme" - ] + affiliation_item["affiliationIdentifier"][ + "affiliationIdentifierScheme" + ] = affiliation["scheme"] + if ( affiliation["scheme_uri"] is not None and affiliation["scheme_uri"] != "" ): - affiliation_item["schemeURI"] = affiliation["scheme_uri"] + affiliation_item["affiliationIdentifier"]["schemeURI"] = ( + affiliation["scheme_uri"] + ) item["affiliation"].append(affiliation_item) + if item["affiliation"] == []: + del item["affiliation"] + contributors.append(item) - dataset_metadata["Contributor"] = contributors + if len(contributors) > 0: + dataset_metadata["contributor"] = contributors # Get the publication year publication_year = str(datetime.datetime.now().year) - dataset_metadata["PublicationYear"] = publication_year + dataset_metadata["publicationYear"] = publication_year dates = [] @@ -249,7 +293,7 @@ def pipeline(): dates.append(item) - dataset_metadata["Date"] = dates + dataset_metadata["date"] = dates resource_type = {} @@ -264,23 +308,7 @@ def pipeline(): resource_type["resourceTypeValue"] = dataset_resource_type[0] resource_type["resourceTypeGeneral"] = "Dataset" - dataset_metadata["ResourceType"] = resource_type - - dataset_record_keys = {} - - # Get the dataset record keys - cur.execute( - "SELECT key_type, key_details FROM dataset_record_keys WHERE dataset_id = %s", - (dataset_id,), - ) - - record_keys = cur.fetchone() - - dataset_record_keys["keysType"] = record_keys[0] - if record_keys[1] is not None and record_keys[1] != "": - dataset_record_keys["keysDetails"] = record_keys[1] - - dataset_metadata["DatasetRecordKeys"] = dataset_record_keys + dataset_metadata["resourceType"] = resource_type dataset_de_ident_level = {} @@ -301,7 +329,7 @@ def pipeline(): if de_ident_level[6] is not None and de_ident_level[6] != "": dataset_de_ident_level["deIdentDetails"] = de_ident_level[6] - dataset_metadata["DatasetDeIdentLevel"] = dataset_de_ident_level + dataset_metadata["datasetDeIdentLevel"] = dataset_de_ident_level dataset_consent = {} @@ -322,7 +350,7 @@ def pipeline(): if consent[6] is not None and consent[6] != "": dataset_consent["consentsDetails"] = consent[6] - dataset_metadata["DatasetConsent"] = dataset_consent + dataset_metadata["datasetConsent"] = dataset_consent descriptions = [] @@ -343,7 +371,7 @@ def pipeline(): descriptions.append(item) - dataset_metadata["Description"] = descriptions + dataset_metadata["description"] = descriptions cur.execute( "SELECT language FROM dataset_other WHERE dataset_id = %s", @@ -353,7 +381,7 @@ def pipeline(): dataset_language = cur.fetchone() if dataset_language[0] is not None and dataset_language[0] != "": - dataset_metadata["Language"] = dataset_language[0] + dataset_metadata["language"] = dataset_language[0] subjects = [] @@ -370,37 +398,58 @@ def pipeline(): item = {} item["subjectValue"] = subject[0] + + item["subjectIdentifier"] = {} + if subject[1] is not None and subject[1] != "": - item["subjectScheme"] = subject[1] + item["subjectIdentifier"]["subjectScheme"] = subject[1] + if subject[2] is not None and subject[2] != "": - item["schemeURI"] = subject[2] + item["subjectIdentifier"]["schemeURI"] = subject[2] + if subject[3] is not None and subject[3] != "": - item["valueURI"] = subject[3] + item["subjectIdentifier"]["valueURI"] = subject[3] + if subject[4] is not None and subject[4] != "": - item["classificationCode"] = subject[4] + item["subjectIdentifier"]["classificationCode"] = subject[4] subjects.append(item) - dataset_metadata["Subject"] = subjects + dataset_metadata["subject"] = subjects - managing_organisation = {} + managing_organization = {} # Get the dataset managing organization cur.execute( - "SELECT managing_organization_name, managing_organization_ror_id FROM dataset_other WHERE dataset_id = %s", + "SELECT name, identifier, identifier_scheme, identifier_scheme_uri FROM dataset_managing_organization WHERE dataset_id = %s", (dataset_id,), ) - dataset_managing_organisation = cur.fetchone() + dataset_managing_organization = cur.fetchone() + print(dataset_managing_organization) - managing_organisation["name"] = dataset_managing_organisation[0] + managing_organization["name"] = dataset_managing_organization[0] if ( - dataset_managing_organisation[1] is not None - and dataset_managing_organisation[1] != "" + dataset_managing_organization[1] is not None + and dataset_managing_organization[1] != "" ): - managing_organisation["rorId"] = dataset_managing_organisation[1] + managing_organization["managingOrganizationIdentifier"] = {} + managing_organization["managingOrganizationIdentifier"]["managingOrganizationIdentifierValue"] = dataset_managing_organization[1] - dataset_metadata["ManagingOrganisation"] = managing_organisation + if ( + dataset_managing_organization[2] is not None + and dataset_managing_organization[2] != "" + ): + managing_organization["managingOrganizationIdentifier"]["managingOrganizationScheme"] = dataset_managing_organization[2] + + if ( + dataset_managing_organization[3] is not None + and dataset_managing_organization[3] != "" + ): + managing_organization["managingOrganizationIdentifier"]["schemeURI"] = dataset_managing_organization[3] + + print(managing_organization) + dataset_metadata["managingOrganization"] = managing_organization access_details = {} @@ -421,14 +470,14 @@ def pipeline(): access_details["urlLastChecked"] = timestamp.strftime("%Y-%m-%d") # Get the dataset Access Type - dataset_metadata["AccessType"] = dataset_access[0] - dataset_metadata["AccessDetails"] = access_details + dataset_metadata["accessType"] = dataset_access[0] + dataset_metadata["accessDetails"] = access_details rights = [] # Get the dataset rights cur.execute( - "SELECT rights, uri, identifier, identifier_scheme FROM dataset_rights WHERE dataset_id = %s", + "SELECT rights, uri, identifier, identifier_scheme, identifier_scheme_uri FROM dataset_rights WHERE dataset_id = %s", (dataset_id,), ) @@ -438,27 +487,28 @@ def pipeline(): for right in dataset_rights: item = {} - item["rightsValue"] = right[0] + item["rightsName"] = right[0] + if right[1] is not None and right[1] != "": item["rightsURI"] = right[1] + + item["rightsIdentifier"] = {} + if right[2] is not None and right[2] != "": - item["rightsIdentifier"] = right[2] + item["rightsIdentifier"]["rightsIdentifierValue"] = right[2] if right[3] is not None and right[3] != "": - item["rightsIdentifierScheme"] = right[3] + item["rightsIdentifier"]["rightsIdentifierScheme"] = right[3] + if right[4] is not None and right[4] != "": + item["rightsIdentifier"]["schemeURI"] = right[4] rights.append(item) - dataset_metadata["Rights"] = rights - - # Get the dataset publisher information - cur.execute( - "SELECT publisher FROM dataset_other WHERE dataset_id = %s", - (dataset_id,), - ) - - dataset_publisher = cur.fetchone() + dataset_metadata["rights"] = rights - dataset_metadata["Publisher"] = dataset_publisher[0] + # Create the publisher object + dataset_metadata["publisher"] = { + "publisherName": "FAIRhub", + } sizes = [] @@ -474,7 +524,23 @@ def pipeline(): for size in dataset_sizes[0]: sizes.append(size) - dataset_metadata["Size"] = sizes + dataset_metadata["size"] = sizes + + formats = [] + + # Get the dataset formats + cur.execute( + "SELECT format FROM dataset_other WHERE dataset_id = %s", + (dataset_id,), + ) + + dataset_formats = cur.fetchone() + + if len(dataset_formats[0]) > 0: + for dataset_format in dataset_formats[0]: + formats.append(dataset_format) + + dataset_metadata["format"] = formats funding_references = [] @@ -493,9 +559,8 @@ def pipeline(): item["funderName"] = funding_reference[0] item["funderIdentifier"] = {} item["funderIdentifier"]["funderIdentifierValue"] = funding_reference[1] + item["funderIdentifier"]["funderIdentifierType"] = funding_reference[2] - if funding_reference[2] is not None and funding_reference[2] != "": - item["funderIdentifier"]["funderIdentifierType"] = funding_reference[2] if funding_reference[3] is not None and funding_reference[3] != "": item["funderIdentifier"]["schemeURI"] = funding_reference[3] @@ -509,179 +574,38 @@ def pipeline(): funding_references.append(item) - dataset_metadata["FundingReference"] = funding_references + dataset_metadata["fundingReference"] = funding_references - related_items = [] + related_identifiers = [] - # Get the dataset related items + # Get the dataset related identifiers cur.execute( - "SELECT id, type, relation_type FROM dataset_related_item WHERE dataset_id = %s", + "SELECT identifier, identifier_type, relation_type, related_metadata_scheme, scheme_uri, scheme_type, resource_type FROM dataset_related_identifier WHERE dataset_id = %s", (dataset_id,), ) - dataset_related_items = cur.fetchall() - - if dataset_related_items is not None: - for related_item in dataset_related_items: - related_item_id = related_item[0] + dataset_related_identifiers = cur.fetchall() + if dataset_related_identifiers is not None: + for related_identifier in dataset_related_identifiers: item = {} - item["relatedItemType"] = related_item[1] - item["relationType"] = related_item[2] - - item_identifiers = [] - - # Get the related item's identifiers - cur.execute( - "SELECT identifier, type, metadata_scheme, scheme_uri, scheme_type FROM dataset_related_item_identifier WHERE dataset_related_item_id = %s", - (related_item_id,), - ) - - related_item_identifiers = cur.fetchall() - - if related_item_identifiers is not None: - for related_item_identifier in related_item_identifiers: - item_identifier = {} - - item_identifier["relatedItemIdentifierValue"] = ( - related_item_identifier[0] - ) - item_identifier["relatedItemIdentifierType"] = ( - related_item_identifier[1] - ) - if ( - related_item_identifier[2] is not None - and related_item_identifier[2] != "" - ): - item_identifier["relatedMetadataScheme"] = ( - related_item_identifier[2] - ) - if ( - related_item_identifier[3] is not None - and related_item_identifier[3] != "" - ): - item_identifier["schemeURI"] = related_item_identifier[3] - if ( - related_item_identifier[4] is not None - and related_item_identifier[4] != "" - ): - item_identifier["schemeType"] = related_item_identifier[4] - - item_identifiers.append(item_identifier) - - item["relatedItemIdentifier"] = item_identifiers - - related_items.append(item) - - item_creators = [] - - # Get the related item's creators - cur.execute( - "SELECT name, name_type FROM dataset_related_item_contributor WHERE dataset_related_item_id = %s AND creator = true", - (related_item_id,), - ) - - related_item_creators = cur.fetchall() - - if related_item_creators is not None: - for related_item_creator in related_item_creators: - item_creator = {} - - item_creator["creatorName"] = related_item_creator[0] - item_creator["nameType"] = related_item_creator[1] - - item_creators.append(item_creator) - - item["creator"] = item_creators - - item_contributors = [] - - # Get the related item's contributors - cur.execute( - "SELECT name, name_type, contributor_type FROM dataset_related_item_contributor WHERE dataset_related_item_id = %s AND creator = false", - (related_item_id,), - ) - - related_item_contributors = cur.fetchall() - - if related_item_contributors is not None: - for related_item_contributor in related_item_contributors: - item_contributor = {} - - item_contributor["contributorName"] = related_item_contributor[0] - if ( - related_item_contributor[1] is not None - and related_item_contributor[1] != "" - ): - item_contributor["nameType"] = related_item_contributor[1] - item_contributor["contributorType"] = related_item_contributor[2] - - item_contributors.append(item_contributor) - - item["contributor"] = item_contributors - - item_titles = [] - - # Get the related item's titles - cur.execute( - "SELECT title, type FROM dataset_related_item_title WHERE dataset_related_item_id = %s", - (related_item_id,), - ) - - related_item_titles = cur.fetchall() - - if related_item_titles is not None: - for related_item_title in related_item_titles: - item_title = {} - - item_title["titleValue"] = related_item_title[0] - - if not related_item_title[1] == "MainTitle": - item_title["titleType"] = related_item_title[1] - - item_titles.append(item_title) - - item["title"] = item_titles - - # Get the related item's dates - cur.execute( - "SELECT publication_year, volume, issue, number_value, number_type, first_page, last_page, publisher, edition FROM dataset_related_item_other WHERE dataset_related_item_id = %s", - (related_item_id,), - ) - - related_item_other = cur.fetchone() - - if related_item_other[0] is not None and related_item_other[0] != "": - timestamp = datetime.datetime.fromtimestamp( - related_item_other[0] / 1000 - ) - item["publicationYear"] = str(timestamp.year) - if related_item_other[1] is not None and related_item_other[1] != "": - item["volume"] = related_item_other[1] - if related_item_other[2] is not None and related_item_other[2] != "": - item["issue"] = related_item_other[2] - - item["number"] = {} - if related_item_other[3] is not None and related_item_other[3] != "": - item["number"]["numberValue"] = related_item_other[3] - if related_item_other[4] is not None and related_item_other[4] != "": - item["number"]["numberType"] = related_item_other[4] - if item["number"] == {}: - del item["number"] + item["relatedIdentifierValue"] = related_identifier[0] + item["relatedIdentifierType"] = related_identifier[1] + item["relationType"] = related_identifier[2] - if related_item_other[5] is not None and related_item_other[5] != "": - item["firstPage"] = related_item_other[5] - if related_item_other[6] is not None and related_item_other[6] != "": - item["lastPage"] = related_item_other[6] - if related_item_other[7] is not None and related_item_other[7] != "": - item["publisher"] = related_item_other[7] - if related_item_other[8] is not None and related_item_other[8] != "": - item["edition"] = related_item_other[8] + if related_identifier[3] is not None and related_identifier[3] != "": + item["relatedMetadataScheme"] = related_identifier[3] + if related_identifier[4] is not None and related_identifier[4] != "": + item["schemeURI"] = related_identifier[4] + if related_identifier[5] is not None and related_identifier[5] != "": + item["schemeType"] = related_identifier[5] + if related_identifier[6] is not None and related_identifier[6] != "": + item["resourceTypeGeneral"] = related_identifier[6] - dataset_metadata["RelatedItem"] = related_items + related_identifiers.append(item) - dataset_metadata["RelatedIdentifier"] = [] + dataset_metadata["relatedIdentifier"] = related_identifiers conn.commit() conn.close() @@ -690,11 +614,14 @@ def pipeline(): temp_folder_path = tempfile.mkdtemp() temp_file_path = pathlib.Path(temp_folder_path, "dataset_description.json") + + print(json.dumps(dataset_metadata)) data_is_valid = pyfairdatatools.validate.validate_dataset_description( data=dataset_metadata ) + # sourcery skip: raise-specific-error if not data_is_valid: raise Exception("Dataset description is not valid") diff --git a/publish_pipeline/generate_high_level_metadata/generate_discovery_metadata.py b/publish_pipeline/generate_high_level_metadata/generate_discovery_metadata.py index 88a67fe..17fa1d2 100644 --- a/publish_pipeline/generate_high_level_metadata/generate_discovery_metadata.py +++ b/publish_pipeline/generate_high_level_metadata/generate_discovery_metadata.py @@ -48,19 +48,19 @@ def pipeline(): title = dataset_title[0] - discovery_metadata["Title"] = title + discovery_metadata["title"] = title identifier = "10.5281/zenodo.7641684" - discovery_metadata["Identifier"] = identifier + discovery_metadata["identifier"] = identifier version = str(uuid.uuid4()) - discovery_metadata["Version"] = version + discovery_metadata["version"] = version publication_date = datetime.datetime.now().strftime("%Y-%m-%d") - discovery_metadata["PublicationDate"] = publication_date + discovery_metadata["publicationDate"] = publication_date detailed_description = "" @@ -70,7 +70,7 @@ def pipeline(): ) study_description = cur.fetchone() detailed_description = study_description[0] - discovery_metadata["About"] = detailed_description + discovery_metadata["about"] = detailed_description # license @@ -84,7 +84,7 @@ def pipeline(): dataset_rights = cur.fetchone() # license_text = dataset_other.join(",") license_text = dataset_rights[0] - discovery_metadata["License"] = license_text + discovery_metadata["license"] = license_text acknowledgement = "" @@ -96,7 +96,7 @@ def pipeline(): dataset_acknowledgement = cur.fetchone() acknowledgement = dataset_acknowledgement[0] if acknowledgement: - discovery_metadata["Acknowledgement"] = acknowledgement + discovery_metadata["acknowledgement"] = acknowledgement # conn.commit() conn.close() diff --git a/publish_pipeline/generate_high_level_metadata/generate_study_description.py b/publish_pipeline/generate_high_level_metadata/generate_study_description.py index 6cd37ae..3f005de 100644 --- a/publish_pipeline/generate_high_level_metadata/generate_study_description.py +++ b/publish_pipeline/generate_high_level_metadata/generate_study_description.py @@ -29,11 +29,11 @@ def pipeline(): study_id = "c588f59c-cacb-4e52-99dd-95b37dcbfd5c" - cur.execute("SELECT * FROM study WHERE id = %s", (study_id,)) + cur.execute("SELECT title, acronym FROM study WHERE id = %s", (study_id,)) study = cur.fetchone() - if study is None: + if study[0] is None: return "Study not found" identification_module = {} @@ -46,26 +46,31 @@ def pipeline(): primary_study_identification = cur.fetchone() - identification_module["OrgStudyIdInfo"] = {} + identification_module["officialTitle"] = study[0] + + if study[1] is not None and study[1] != "": + identification_module["acronym"] = {} + + identification_module["orgStudyIdInfo"] = {} # Study Identifier - identification_module["OrgStudyIdInfo"]["OrgStudyId"] = ( + identification_module["orgStudyIdInfo"]["orgStudyId"] = ( primary_study_identification[0] ) # Study Identifier Type - identification_module["OrgStudyIdInfo"]["OrgStudyIdType"] = ( + identification_module["orgStudyIdInfo"]["orgStudyIdType"] = ( primary_study_identification[1] ) if primary_study_identification[2] and primary_study_identification[2] != "": # Study Identifier Domain - identification_module["OrgStudyIdInfo"]["OrgStudyIdDomain"] = ( + identification_module["orgStudyIdInfo"]["orgStudyIdDomain"] = ( primary_study_identification[2] ) if primary_study_identification[3] and primary_study_identification[3] != "": # Study Identifier Link - identification_module["OrgStudyIdInfo"]["OrgStudyIdLink"] = ( + identification_module["orgStudyIdInfo"]["orgStudyIdLink"] = ( primary_study_identification[3] ) @@ -77,25 +82,25 @@ def pipeline(): secondary_study_identification = cur.fetchall() - identification_module["SecondaryIdInfoList"] = [] + identification_module["secondaryIdInfoList"] = [] for row in secondary_study_identification: item = {} # Study Identifier and Study Identifier Type - item["SecondaryId"] = row[0] - item["SecondaryIdType"] = row[1] + item["secondaryId"] = row[0] + item["secondaryIdType"] = row[1] if row[2]: # Study Identifer Domain - item["SecondaryIdDomain"] = row[2] + item["secondaryIdDomain"] = row[2] if row[3]: # Study Identifier Link - item["SecondaryIdLink"] = row[3] + item["secondaryIdLink"] = row[3] - identification_module["SecondaryIdInfoList"].append(item) + identification_module["secondaryIdInfoList"].append(item) - study_metadata["IdentificationModule"] = identification_module + study_metadata["identificationModule"] = identification_module status_module = {} @@ -107,75 +112,166 @@ def pipeline(): study_status = cur.fetchone() - status_module["OverallStatus"] = study_status[0] - status_module["WhyStopped"] = study_status[1] + status_module["overallStatus"] = study_status[0] + status_module["whyStopped"] = study_status[1] start_date = datetime.datetime.strptime(study_status[2], "%Y-%m-%d %H:%M:%S") - status_module["StartDateStruct"] = { + status_module["startDateStruct"] = { # date format: Month DD, YYYY - "StartDate": start_date.strftime("%B %d, %Y"), - "StartDateType": study_status[3], + "startDate": start_date.strftime("%B %d, %Y"), + "startDateType": study_status[3], } completion_date = datetime.datetime.strptime(study_status[4], "%Y-%m-%d %H:%M:%S") - status_module["CompletionDateStruct"] = { - "CompletionDate": completion_date.strftime("%B %d, %Y"), - "CompletionDateType": study_status[5], + status_module["completionDateStruct"] = { + "completionDate": completion_date.strftime("%B %d, %Y"), + "completionDateType": study_status[5], } - study_metadata["StatusModule"] = status_module + study_metadata["statusModule"] = status_module sponsor_collaborators_module = {} - # Get the study sponsor and collaborators metadata + # Get the study sponsor metadata cur.execute( - "SELECT responsible_party_type, responsible_party_investigator_name, responsible_party_investigator_title, responsible_party_investigator_affiliation, lead_sponsor_name, collaborator_name FROM study_sponsors_collaborators WHERE study_id = %s", + "SELECT responsible_party_type, responsible_party_investigator_first_name, responsible_party_investigator_last_name, responsible_party_investigator_title, responsible_party_investigator_identifier_value, responsible_party_investigator_identifier_scheme, responsible_party_investigator_identifier_scheme_uri, responsible_party_investigator_affiliation_name, responsible_party_investigator_affiliation_identifier_value, responsible_party_investigator_affiliation_identifier_scheme, responsible_party_investigator_affiliation_identifier_scheme_uri, lead_sponsor_name, lead_sponsor_identifier, lead_sponsor_scheme, lead_sponsor_scheme_uri FROM study_sponsors WHERE study_id = %s", (study_id,), ) - sponsor_collaborators = cur.fetchone() + study_sponsors = cur.fetchone() - sponsor_collaborators_module["ResponsibleParty"] = { - "ResponsiblePartyType": sponsor_collaborators[0], - "ResponsiblePartyInvestigatorFullName": sponsor_collaborators[1], - "ResponsiblePartyInvestigatorTitle": sponsor_collaborators[2], - "ResponsiblePartyInvestigatorAffiliation": sponsor_collaborators[3], - } + responsible_party = {} - sponsor_collaborators_module["LeadSponsor"] = { - "LeadSponsorName": sponsor_collaborators[4] - } + responsible_party["responsiblePartyType"] = study_sponsors[0] + + if study_sponsors[1] is not None and study_sponsors[1] != "": + responsible_party["responsiblePartyInvestigatorFirstName"] = study_sponsors[1] + if study_sponsors[2] is not None and study_sponsors[2] != "": + responsible_party["responsiblePartyInvestigatorLastName"] = study_sponsors[2] + if study_sponsors[3] is not None and study_sponsors[3] != "": + responsible_party["responsiblePartyInvestigatorTitle"] = study_sponsors[3] + if study_sponsors[4] is not None and study_sponsors[4] != "": + responsible_party["responsiblePartyInvestigatorIdentifier"] = {} + + responsible_party["responsiblePartyInvestigatorIdentifier"][ + "responsiblePartyInvestigatorIdentifierValue" + ] = study_sponsors[4] + + if study_sponsors[5] is not None and study_sponsors[5] != "": + responsible_party["responsiblePartyInvestigatorIdentifier"][ + "responsiblePartyInvestigatorIdentifierScheme" + ] = study_sponsors[5] - sponsor_collaborators_module["CollaboratorList"] = [] + if study_sponsors[6] is not None and study_sponsors[6] != "": + responsible_party["responsiblePartyInvestigatorIdentifier"]["schemeURI"] = ( + study_sponsors[6] + ) - sponsor_collaborators = sponsor_collaborators[5] + if study_sponsors[7] is not None and study_sponsors[7] != "": + responsible_party["responsiblePartyInvestigatorAffiliation"] = {} + + responsible_party["responsiblePartyInvestigatorAffiliation"][ + "responsiblePartyInvestigatorAffiliationName" + ] = study_sponsors[7] + + if study_sponsors[8] is not None and study_sponsors[8] != "": + responsible_party["responsiblePartyInvestigatorAffiliation"][ + "responsiblePartyInvestigatorAffiliationIdentifier" + ] = {} + + responsible_party["responsiblePartyInvestigatorAffiliation"][ + "responsiblePartyInvestigatorAffiliationIdentifier" + ][ + "responsiblePartyInvestigatorAffiliationIdentifierValue" + ] = study_sponsors[ + 8 + ] + + if study_sponsors[9] is not None and study_sponsors[9] != "": + responsible_party["responsiblePartyInvestigatorAffiliation"][ + "responsiblePartyInvestigatorAffiliationIdentifier" + ][ + "responsiblePartyInvestigatorAffiliationIdentifierScheme" + ] = study_sponsors[ + 9 + ] + + if study_sponsors[10] is not None and study_sponsors[10] != "": + responsible_party["responsiblePartyInvestigatorAffiliation"][ + "responsiblePartyInvestigatorAffiliationIdentifier" + ]["schemeURI"] = study_sponsors[10] + + sponsor_collaborators_module["responsibleParty"] = responsible_party + + lead_sponsor = {"leadSponsorName": study_sponsors[11]} + + if study_sponsors[12] is not None and study_sponsors[12] != "": + lead_sponsor["leadSponsor"]["leadSponsorIdentifier"] = { + "leadSponsorIdentifierValue": study_sponsors[12] + } + if study_sponsors[13] is not None and study_sponsors[13] != "": + lead_sponsor["leadSponsor"]["leadSponsorIdentifier"][ + "leadSponsorIdentifierScheme" + ] = study_sponsors[13] + + sponsor_collaborators_module["leadSponsor"] = lead_sponsor + + # Get the study collaborators metadata + cur.execute( + "SELECT name, identifier, scheme, scheme_uri FROM study_collaborators WHERE study_id = %s", + (study_id,), + ) - for row in sponsor_collaborators: - # Add the collabarator(s) to the list - item = {"CollaboratorName": row} + study_collaborators = cur.fetchall() + + collaborators = [] + + for row in study_collaborators: + item = {} - sponsor_collaborators_module["CollaboratorList"].append(item) + item["collaboratorName"] = row[0] - study_metadata["SponsorCollaboratorsModule"] = sponsor_collaborators_module + if row[1] is not None and row[1] != "": + item["collaboratorNameIdentifier"] = { + "collaboratorNameIdentifierValue": row[1] + } + + if row[2] is not None and row[2] != "": + item["collaboratorNameIdentifier"][ + "collaboratorNameIdentifierScheme" + ] = row[2] + if row[3] is not None and row[3] != "": + item["collaboratorNameIdentifier"]["schemeURI"] = row[3] + + collaborators.append(item) + + sponsor_collaborators_module["collaboratorList"] = collaborators + + study_metadata["sponsorCollaboratorsModule"] = sponsor_collaborators_module oversight_module = {} # Get the study oversight metadata cur.execute( - "SELECT oversight_has_dmc FROM study_other WHERE study_id = %s", + "SELECT fda_regulated_drug, fda_regulated_device, human_subject_review_status, has_dmc FROM study_oversight WHERE study_id = %s", (study_id,), ) study_oversight = cur.fetchone() - if study_oversight[0]: - oversight_module["OversightHasDMC"] = "Yes" - else: - oversight_module["OversightHasDMC"] = "No" + if study_oversight[0] is not None and study_oversight[0] != "": + oversight_module["isFDARegulatedDrug"] = study_oversight[0] + if study_oversight[1] is not None and study_oversight[1] != "": + oversight_module["isFDARegulatedDevice"] = study_oversight[1] + + oversight_module["humanSubjectReviewStatus"] = study_oversight[2] - study_metadata["OversightModule"] = oversight_module + if study_oversight[3] is not None and study_oversight[3] != "": + oversight_module["oversightHasDMC"] = study_oversight[3] + + study_metadata["oversightModule"] = oversight_module description_module = {} @@ -187,110 +283,157 @@ def pipeline(): study_description = cur.fetchone() - description_module["BriefSummary"] = study_description[0] + description_module["briefSummary"] = study_description[0] if study_description[1] and study_description[1] != "": - description_module["DetailedDescription"] = study_description[1] + description_module["detailedDescription"] = study_description[1] - study_metadata["DescriptionModule"] = description_module + study_metadata["descriptionModule"] = description_module conditions_module = {} # Get the study conditions metadata cur.execute( - "SELECT conditions, keywords FROM study_other WHERE study_id = %s", + "SELECT name, classification_code, scheme, scheme_uri, condition_uri FROM study_conditions WHERE study_id = %s", + (study_id,), + ) + + study_conditions = cur.fetchall() + + conditions_list = [] + + for row in study_conditions: + item = {} + + item["conditionName"] = row[0] + + if row[1] is not None and row[1] != "": + item["conditionIdentifier"] = {"conditionClassificationCode": row[1]} + + if row[2] is not None and row[2] != "": + item["conditionIdentifier"]["conditionScheme"] = row[2] + + if row[3] is not None and row[3] != "": + item["conditionIdentifier"]["schemeURI"] = row[3] + + if row[4] is not None and row[4] != "": + item["conditionIdentifier"]["conditionURI"] = row[4] + + conditions_list.append(item) + + conditions_module["conditionList"] = conditions_list + + # Get the study keywords metadata + cur.execute( + "SELECT name, classification_code, scheme, scheme_uri, keyword_uri FROM study_keywords WHERE study_id = %s", (study_id,), ) - study_conditions = cur.fetchone() + study_keywords = cur.fetchall() - conditions_module["ConditionList"] = [] - conditions = study_conditions[0] + keywords_list = [] - for row in conditions: - conditions_module["ConditionList"].append(row) + for row in study_keywords: + item = {} - # todo: add keywords from the UI and API - conditions_module["KeywordList"] = ["Dataset"] - keywords = study_conditions[1] - for row in keywords: - conditions_module["KeywordList"].append(row) + item["keywordName"] = row[0] - study_metadata["ConditionsModule"] = conditions_module + if row[1] is not None and row[1] != "": + item["keywordIdentifier"] = {"keywordClassificationCode": row[1]} + + if row[2] is not None and row[2] != "": + item["keywordIdentifier"]["keywordScheme"] = row[2] + + if row[3] is not None and row[3] != "": + item["keywordIdentifier"]["schemeURI"] = row[3] + + if row[4] is not None and row[4] != "": + item["keywordIdentifier"]["keywordURI"] = row[4] + + keywords_list.append(item) + + conditions_module["keywordList"] = keywords_list + + study_metadata["conditionsModule"] = conditions_module design_module = {} # Get the study design metadata cur.execute( - "SELECT study_type, design_allocation, design_intervention_model, design_intervention_model_description, design_primary_purpose, design_masking, design_masking_description, design_who_masked_list, phase_list, enrollment_count, enrollment_type, number_arms,design_observational_model_list, design_time_perspective_list, bio_spec_retention, bio_spec_description, target_duration, number_groups_cohorts FROM study_design WHERE study_id = %s", + "SELECT study_type, design_allocation, design_intervention_model, design_intervention_model_description, design_primary_purpose, design_masking, design_masking_description, design_who_masked_list, phase_list, enrollment_count, enrollment_type, number_arms,design_observational_model_list, design_time_perspective_list, bio_spec_retention, bio_spec_description, target_duration, number_groups_cohorts, isPatientRegistry FROM study_design WHERE study_id = %s", (study_id,), ) study_design = cur.fetchone() study_type = study_design[0] - design_module["StudyType"] = study_type + design_module["studyType"] = study_type if study_type == "Interventional": - design_module["DesignInfo"] = {} - design_module["DesignInfo"]["DesignAllocation"] = study_design[1] - design_module["DesignInfo"]["DesignInterventionModel"] = study_design[2] + design_module["designInfo"] = {} + design_module["designInfo"]["designAllocation"] = study_design[1] + design_module["designInfo"]["designInterventionModel"] = study_design[2] if study_design[3] and study_design[3] != "": - design_module["DesignInfo"]["DesignInterventionModelDescription"] = ( + design_module["designInfo"]["designInterventionModelDescription"] = ( study_design[3] ) - design_module["DesignInfo"]["DesignPrimaryPurpose"] = study_design[4] + design_module["designInfo"]["designPrimaryPurpose"] = study_design[4] - design_module["DesignInfo"]["DesignMaskingInfo"] = {} - design_module["DesignInfo"]["DesignMaskingInfo"]["DesignMasking"] = ( + design_module["designInfo"]["designMaskingInfo"] = {} + design_module["designInfo"]["designMaskingInfo"]["designMasking"] = ( study_design[5] ) - design_module["DesignInfo"]["DesignMaskingInfo"]["DesignMaskingDescription"] = ( + design_module["designInfo"]["designMaskingInfo"]["designMaskingDescription"] = ( study_design[6] ) - design_module["DesignInfo"]["DesignMaskingInfo"]["DesignWhoMaskedList"] = [] + design_module["designInfo"]["designMaskingInfo"]["designWhoMaskedList"] = [] if study_design[7] is not None: for row in study_design[7]: - design_module["DesignInfo"]["DesignMaskingInfo"][ - "DesignWhoMaskedList" + design_module["designInfo"]["designMaskingInfo"][ + "designWhoMaskedList" ].append(row) - design_module["PhaseList"] = [] + design_module["phaseList"] = [] if study_design[8] is not None: for row in study_design[8]: - design_module["PhaseList"].append(row) + design_module["phaseList"].append(row) - design_module["EnrollmentInfo"] = {} - design_module["EnrollmentInfo"]["EnrollmentCount"] = str(study_design[9]) - design_module["EnrollmentInfo"]["EnrollmentType"] = study_design[10] + design_module["enrollmentInfo"] = {} + design_module["enrollmentInfo"]["enrollmentCount"] = str(study_design[9]) + design_module["enrollmentInfo"]["enrollmentType"] = study_design[10] - if study_type == "Interventional": - design_module["NumberArms"] = str(study_design[11]) + if study_type == "interventional": + design_module["numberArms"] = str(study_design[11]) + + if study_type == "observational": + design_module["designInfo"] = {} + design_module["designInfo"]["designObservationalModelList"] = [] - if study_type == "Observational": - design_module["DesignInfo"] = {} - design_module["DesignInfo"]["DesignObservationalModelList"] = [] if study_design[12] is not None: for row in study_design[12]: - design_module["DesignInfo"]["DesignObservationalModelList"].append(row) + design_module["designInfo"]["designObservationalModelList"].append(row) - design_module["DesignInfo"]["DesignTimePerspectiveList"] = [] + design_module["designInfo"]["designTimePerspectiveList"] = [] if study_design[13] is not None: for row in study_design[13]: - design_module["DesignInfo"]["DesignTimePerspectiveList"].append(row) + design_module["designInfo"]["designTimePerspectiveList"].append(row) + + design_module["bioSpec"] = {} + design_module["bioSpec"]["bioSpecRetention"] = study_design[14] - design_module["BioSpec"] = {} - design_module["BioSpec"]["BioSpecRetention"] = study_design[14] if study_design[15] is not None and study_design[15] != "": - design_module["BioSpec"]["BioSpecDescription"] = study_design[15] + design_module["bioSpec"]["bioSpecDescription"] = study_design[15] - design_module["TargetDuration"] = study_design[16] - design_module["NumberGroupsCohorts"] = str(study_design[17]) + design_module["targetDuration"] = study_design[16] + design_module["numberGroupsCohorts"] = str(study_design[17]) - study_metadata["DesignModule"] = design_module + if study_design[18] is not None and study_design[18] != "": + design_module["isPatientRegistry"] = study_design[18] + + study_metadata["designModule"] = design_module arms_interventions_module = {} @@ -302,285 +445,251 @@ def pipeline(): study_arms = cur.fetchall() - arms_interventions_module["ArmGroupList"] = [] + arms_interventions_module["armGroupList"] = [] for row in study_arms: item = {} - item["ArmGroupLabel"] = row[0] + item["armGroupLabel"] = row[0] if study_type == "Interventional": - item["ArmGroupType"] = row[1] + item["armGroupType"] = row[1] - if row[2] is not None and row[2] != "": - item["ArmGroupDescription"] = row[2] + item["armGroupDescription"] = row[2] if study_type == "Interventional" and row[3] is not None and len(row[3]) > 0: - item["ArmGroupInterventionList"] = [] + item["armGroupInterventionList"] = [] for intervention in row[3]: - item["ArmGroupInterventionList"].append(intervention) + item["armGroupInterventionList"].append(intervention) - arms_interventions_module["ArmGroupList"].append(item) + arms_interventions_module["armGroupList"].append(item) # Get the study interventions metadata cur.execute( - "SELECT type, name, description, arm_group_label_list, other_name_list FROM study_intervention WHERE study_id = %s", + "SELECT type, name, description, other_name_list FROM study_intervention WHERE study_id = %s", (study_id,), ) study_interventions = cur.fetchall() - arms_interventions_module["InterventionList"] = [] + arms_interventions_module["interventionList"] = [] for row in study_interventions: item = {} - item["InterventionType"] = row[0] - item["InterventionName"] = row[1] - if row[2] is not None and row[2] != "": - item["InterventionDescription"] = row[2] - - item["InterventionArmGroupLabelList"] = [] + item["interventionType"] = row[0] + item["interventionName"] = row[1] + item["interventionDescription"] = row[2] - if row[3] is not None: - for arm_group_label in row[3]: - item["InterventionArmGroupLabelList"].append(arm_group_label) + if row[3] is not None and len(row[3]) > 0: + item["interventionOtherNameList"] = [] - item["InterventionOtherNameList"] = [] + for other_name in row[3]: + item["interventionOtherNameList"].append(other_name) - if row[4] is not None: - for other_name in row[4]: - item["InterventionOtherNameList"].append(other_name) + arms_interventions_module["interventionList"].append(item) - arms_interventions_module["InterventionList"].append(item) - - study_metadata["ArmsInterventionsModule"] = arms_interventions_module + study_metadata["armsInterventionsModule"] = arms_interventions_module eligibility_module = {} # Get the study eligibility metadata cur.execute( - "SELECT gender, gender_based, gender_description, minimum_age_value, minimum_age_unit, maximum_age_value, maximum_age_unit, healthy_volunteers, inclusion_criteria, exclusion_criteria, study_population, sampling_method FROM study_eligibility WHERE study_id = %s", + "SELECT sex, gender_based, gender_description, minimum_age_value, minimum_age_unit, maximum_age_value, maximum_age_unit, healthy_volunteers, inclusion_criteria, exclusion_criteria, study_population, sampling_method FROM study_eligibility WHERE study_id = %s", (study_id,), ) study_eligibility = cur.fetchone() - eligibility_module["Gender"] = study_eligibility[0] - eligibility_module["GenderBased"] = study_eligibility[1] - eligibility_module["GenderDescription"] = study_eligibility[2] - eligibility_module["MinimumAge"] = f"{study_eligibility[3]} {study_eligibility[4]}" - eligibility_module["MaximumAge"] = f"{study_eligibility[5]} {study_eligibility[6]}" - if study_eligibility[7] is not None and study_eligibility[7] != "": - eligibility_module["HealthyVolunteers"] = study_eligibility[7] - if study_type == "Observational": - eligibility_module["StudyPopulation"] = study_eligibility[10] - eligibility_module["SamplingMethod"] = study_eligibility[11] + eligibility_module["sex"] = study_eligibility[0] + eligibility_module["genderBased"] = study_eligibility[1] + eligibility_module["genderDescription"] = study_eligibility[2] + eligibility_module["minimumAge"] = f"{study_eligibility[3]} {study_eligibility[4]}" + eligibility_module["maximumAge"] = f"{study_eligibility[5]} {study_eligibility[6]}" + eligibility_module["healthyVolunteers"] = study_eligibility[7] - eligibility_criteria = "" - - if study_eligibility[8] is not None: - eligibility_criteria = "Inclusion Criteria\n" + if study_type == "Observational": + eligibility_module["studyPopulation"] = study_eligibility[10] + eligibility_module["samplingMethod"] = study_eligibility[11] - for criteria in study_eligibility[8]: - eligibility_criteria += f"* {criteria}\n" + eligibility_criteria = { + "eligibilityCriteriaInclusion": [], + "eligibilityCriteriaExclusion": [], + } - if study_eligibility[9] is not None: - eligibility_criteria += "\nExclusion Criteria\n" + if study_eligibility[8] is not None and len(study_eligibility[8]) > 0: + eligibility_criteria["eligibilityCriteriaInclusion"] = study_eligibility[8] - for criteria in study_eligibility[9]: - eligibility_criteria += f"* {criteria}\n" + if study_eligibility[9] is not None and len(study_eligibility[9]) > 0: + eligibility_criteria["eligibilityCriteriaExclusion"] = study_eligibility[9] - eligibility_module["EligibilityCriteria"] = eligibility_criteria + eligibility_module["eligibilityCriteria"] = eligibility_criteria - study_metadata["EligibilityModule"] = eligibility_module + study_metadata["eligibilityModule"] = eligibility_module contacts_locations_module = {} # Get the study contacts and locations metadata cur.execute( - "SELECT name, affiliation, phone, phone_ext, email_address FROM study_contact WHERE study_id = %s AND central_contact = true", + "SELECT first_name, last_name, degree, identifier, identifier_scheme, identifier_scheme_uri, affiliation, affiliation_identifier, affiliation_identifier_scheme, affiliation_identifier_scheme_uri, phone, phone_ext, email_address FROM study_central_contact WHERE study_id = %s", (study_id,), ) study_central_contacts = cur.fetchall() - contacts_locations_module["CentralContactList"] = [] + central_contacts = [] if study_central_contacts is not None: for row in study_central_contacts: item = {} - item["CentralContactName"] = row[0] - item["CentralContactAffiliation"] = row[1] - item["CentralContactPhone"] = row[2] - if row[3] is not None and row[3] != "": - item["CentralContactPhoneExt"] = row[3] - item["CentralContactEMail"] = row[4] + item["centralContactFirstName"] = row[0] + item["centralContactLastName"] = row[1] - contacts_locations_module["CentralContactList"].append(item) + if row[2] is not None and row[2] != "": + item["centralContactDegree"] = row[2] - # Get the study contacts metadata - cur.execute( - "SELECT name, affiliation, role FROM study_overall_official WHERE study_id = %s", - (study_id,), - ) + if row[3] is not None and row[3] != "": + item["centralContactIdentifier"] = {} - contacts_locations_module["OverallOfficialList"] = [] + item["centralContactIdentifier"]["centralContactIdentifierValue"] = row[ + 3 + ] + item["centralContactIdentifierScheme"] = row[4] - study_overall_officials = cur.fetchall() + if row[5] is not None and row[5] != "": + item["schemeURI"] = row[5] - if study_overall_officials is not None: - for row in study_overall_officials: - item = {} + item["centralContactAffiliation"] = { + "centralContactAffiliationName": row[6] + } - item["OverallOfficialName"] = row[0] - item["OverallOfficialAffiliation"] = row[1] - item["OverallOfficialRole"] = row[2] + if row[7] is not None and row[7] != "": + item["centralContactAffiliation"][ + "centralContactAffiliationIdentifier" + ] = {} - contacts_locations_module["OverallOfficialList"].append(item) + item["centralContactAffiliation"][ + "centralContactAffiliationIdentifier" + ]["centralContactAffiliationIdentifierValue"] = row[7] + item["centralContactAffiliation"][ + "centralContactAffiliationIdentifier" + ]["centralContactAffiliationIdentifierScheme"] = row[8] - # Get the study locations metadata - cur.execute( - "SELECT facility, status, city, state, zip, country FROM study_location WHERE study_id = %s", - (study_id,), - ) + if row[9] is not None and row[9] != "": + item["centralContactAffiliation"][ + "centralContactAffiliationIdentifier" + ]["schemeURI"] = row[9] - study_locations = cur.fetchall() + if row[10] is not None and row[10] != "": + item["centralContactPhone"] = row[10] - contacts_locations_module["LocationList"] = [] + if row[11] is not None and row[11] != "": + item["centralContactPhoneExt"] = row[11] - if study_locations is not None: - for row in study_locations: - item = {} + item["centralContactEMail"] = row[12] - item["LocationFacility"] = row[0] - item["LocationStatus"] = row[1] - item["LocationCity"] = row[2] - if row[3] is not None and row[3] != "": - item["LocationState"] = row[3] - if row[4] is not None and row[4] != "": - item["LocationZip"] = row[4] - item["LocationCountry"] = row[5] - - contacts_locations_module["LocationList"].append(item) + central_contacts.append(item) - study_metadata["ContactsLocationsModule"] = contacts_locations_module + contacts_locations_module["centralContactList"] = central_contacts - ipd_sharing_statement_module = {} - - # Get the study IPD sharing metadata + # Get the study contacts metadata cur.execute( - "SELECT ipd_sharing, ipd_sharing_description, ipd_sharing_info_type_list, ipd_sharing_time_frame, ipd_sharing_access_criteria, ipd_sharing_url FROM study_ipdsharing WHERE study_id = %s", + "SELECT first_name, last_name, degree, identifier, identifier_scheme, identifier_scheme_uri, affiliation, affiliation_identifier, affiliation_identifier_scheme, affiliation_identifier_scheme_uri, role FROM study_overall_official WHERE study_id = %s", (study_id,), ) - ipd_sharing = cur.fetchone() - - bool_ipd_share = ipd_sharing[0] - ipd_sharing_statement_module["IPDSharing"] = ipd_sharing[0] - if bool_ipd_share == "No" and ipd_sharing[1] is not None and ipd_sharing[1] != "": - ipd_sharing_statement_module["IPDSharingDescription"] = ipd_sharing[1] - if bool_ipd_share == "Yes": - ipd_sharing_statement_module["IPDSharingDescription"] = ipd_sharing[1] + study_overall_officials = cur.fetchall() - ipd_sharing_statement_module["IPDSharingInfoTypeList"] = [] - if ipd_sharing[2] is not None: - for row in ipd_sharing[2]: - ipd_sharing_statement_module["IPDSharingInfoTypeList"].append(row) + overall_officals = [] - if ( - bool_ipd_share == "No" - and ipd_sharing_statement_module["IPDSharingInfoTypeList"] == [] - ): - # Delete key if empty - del ipd_sharing_statement_module["IPDSharingInfoTypeList"] + if study_overall_officials is not None: + for row in study_overall_officials: + item = {} - if bool_ipd_share == "No" and ipd_sharing[3] is not None and ipd_sharing[3] != "": - ipd_sharing_statement_module["IPDSharingTimeFrame"] = ipd_sharing[3] - if bool_ipd_share == "Yes": - ipd_sharing_statement_module["IPDSharingTimeFrame"] = ipd_sharing[3] + item["overallOfficialFirstName"] = row[0] + item["overallOfficialLastName"] = row[1] + item["overallOfficialDegree"] = row[2] - if bool_ipd_share == "No" and ipd_sharing[4] is not None and ipd_sharing[4] != "": - ipd_sharing_statement_module["IPDSharingAccessCriteria"] = ipd_sharing[4] - if bool_ipd_share == "Yes": - ipd_sharing_statement_module["IPDSharingAccessCriteria"] = ipd_sharing[4] + if row[3] is not None and row[3] != "": + item["overallOfficialIdentifier"] = {} - if bool_ipd_share == "No" and ipd_sharing[5] is not None and ipd_sharing[5] != "": - ipd_sharing_statement_module["IPDSharingURL"] = ipd_sharing[5] - if bool_ipd_share == "Yes": - ipd_sharing_statement_module["IPDSharingURL"] = ipd_sharing[5] + item["overallOfficialIdentifier"]["overallOfficialIdentifierValue"] = ( + row[3] + ) + item["overallOfficialIdentifierScheme"] = row[4] - study_metadata["IPDSharingStatementModule"] = ipd_sharing_statement_module + if row[5] is not None and row[5] != "": + item["overallOfficialIdentifier"]["schemeURI"] = row[5] - references_module = {} + item["overallOfficialAffiliation"] = { + "overallOfficialAffiliationName": row[6] + } - # Get the study references metadata (publications) - cur.execute( - "SELECT identifier, type, citation FROM study_reference WHERE study_id = %s", - (study_id,), - ) + if row[7] is not None and row[7] != "": + item["overallOfficialAffiliation"][ + "overallOfficialAffiliationIdentifier" + ] = {} - study_references = cur.fetchall() + item["overallOfficialAffiliation"][ + "overallOfficialAffiliationIdentifier" + ]["overallOfficialAffiliationIdentifierValue"] = row[7] + item["overallOfficialAffiliation"][ + "overallOfficialAffiliationIdentifier" + ]["overallOfficialAffiliationIdentifierScheme"] = row[8] - references_module["ReferenceList"] = [] + if row[9] is not None and row[9] != "": + item["overallOfficialAffiliation"][ + "overallOfficialAffiliationIdentifier" + ]["schemeURI"] = row[9] - if study_references is not None: - for row in study_references: - item = {} + if row[10] is not None and row[10] != "": + item["overallOfficialRole"] = row[10] - if row[0] is not None and row[0] != "": - item["ReferenceID"] = row[0] - if row[1] is not None and row[1] != "": - item["ReferenceType"] = row[1] - if row[2] is not None and row[2] != "": - item["ReferenceCitation"] = row[2] + overall_officals.append(item) - references_module["ReferenceList"].append(item) + contacts_locations_module["overallOfficialList"] = overall_officals - # Get the study links metadata + # Get the study locations metadata cur.execute( - "SELECT url, title FROM study_link WHERE study_id = %s", + "SELECT facility, status, city, state, zip, country, identifier, identifier_scheme, identifier_scheme_uri FROM study_location WHERE study_id = %s", (study_id,), ) - study_links = cur.fetchall() + study_locations = cur.fetchall() - references_module["SeeAlsoLinkList"] = [] + location_list = [] - if study_links is not None: - for row in study_links: + if study_locations is not None: + for row in study_locations: item = {} - item["SeeAlsoLinkURL"] = row[0] - if row[1] is not None and row[1] != "": - item["SeeAlsoLinkLabel"] = row[1] + item["locationFacility"] = row[0] + item["locationStatus"] = row[1] + item["locationCity"] = row[2] - references_module["SeeAlsoLinkList"].append(item) + if row[3] is not None and row[3] != "": + item["locationState"] = row[3] - # Get the study available IPD - cur.execute( - "SELECT identifier, type, url, comment FROM study_available_ipd WHERE study_id = %s", - (study_id,), - ) + if row[4] is not None and row[4] != "": + item["locationZip"] = row[4] - study_available_ipd = cur.fetchall() + item["locationCountry"] = row[5] - references_module["AvailIPDList"] = [] + if row[6] is not None and row[6] != "": + item["locationIdentifier"] = {} - if study_available_ipd is not None: - for row in study_available_ipd: - item = {} + item["locationIdentifier"]["locationIdentifierValue"] = row[6] + item["locationIdentifierScheme"] = row[7] + + if row[8] is not None and row[8] != "": + item["locationIdentifier"]["schemeURI"] = row[8] - item["AvailIPDId"] = row[0] - item["AvailIPDType"] = row[1] - item["AvailIPDURL"] = row[2] - if row[3]: - item["AvailIPDComment"] = row[3] + location_list.append(item) - references_module["AvailIPDList"].append(item) + contacts_locations_module["locationList"] = location_list - study_metadata["ReferencesModule"] = references_module + study_metadata["contactsLocationsModule"] = contacts_locations_module conn.commit() conn.close() diff --git a/publish_pipeline/register_doi/register_doi.py b/publish_pipeline/register_doi/register_doi.py index 620f8b2..6a9d8a6 100644 --- a/publish_pipeline/register_doi/register_doi.py +++ b/publish_pipeline/register_doi/register_doi.py @@ -3,333 +3,14 @@ import base64 import datetime import json -import random -import string import azure.storage.blob as azureblob import requests +import pyfairdatatools import config -def generate_random_identifier(k): - """Generate a random identifier""" - return "".join(random.choices(string.ascii_lowercase + string.digits, k=k)) - - -def create_payload(dataset_description): - """Generate payload for DOI registration""" - # doi = dataset_description["Identifier"]["identifierValue"] - doi = f"10.82914/fairhub.{generate_random_identifier(6)}" - creators = [] - titles = [] - subjects = [] - contributors = [] - dates = [] - alternate_identifiers = [] - related_items = [] - funding_references = [] - rights_list = [] - descriptions = [] - - for description in dataset_description["Description"]: - description_obj = { - "description": description["descriptionValue"], - "descriptionType": description["descriptionType"], - } - descriptions.append(description_obj) - - for rights in dataset_description["Rights"]: - rights_obj = {"rights": rights["rightsValue"]} - if "rightsURI" in rights: - rights_obj["rightsUri"] = rights["rightsURI"] - if "rightsIdentifier" in rights: - rights_obj["rightsIdentifier"] = rights["rightsIdentifier"] - if "rightsIdentifierScheme" in rights: - rights_obj["rightsIdentifierScheme"] = rights["rightsIdentifierScheme"] - rights_list.append(rights_obj) - - for funder in dataset_description["FundingReference"]: - funder_obj = { - "funderName": funder["funderName"], - "funderIdentifier": funder["funderIdentifier"]["funderIdentifierValue"], - "awardNumber": funder["awardNumber"]["awardNumberValue"], - } - if "awardURI" in funder["awardNumber"]: - funder_obj["awardUri"] = funder["awardNumber"]["awardURI"] - if "awardTitle" in funder["awardNumber"]: - funder_obj["awardTitle"] = funder["awardNumber"]["awardTitle"] - if "funderIentifierType" in funder["funderIdentifier"]: - funder_obj["funderIdentifierType"] = funder["funderIdentifier"][ - "funderIdentifierType" - ] - else: - funder_obj["funderIdentifierType"] = "Other" - funding_references.append(funder_obj) - - for related_item in dataset_description["RelatedItem"]: - if "relatedItemIdentifier" in related_item: - related_item_identifiers = [] - for identifier in related_item["relatedItemIdentifier"]: - identifier_obj = { - "relatedItemIdentifier": identifier["relatedItemIdentifierValue"], - "relatedItemIdentifierType": identifier[ - "relatedItemIdentifierType" - ], - } - if "relatedMetadataScheme" in identifier: - identifier_obj["relatedMetadataScheme"] = identifier[ - "relatedMetadataScheme" - ] - if "schemeURI" in identifier: - identifier_obj["schemeUri"] = identifier["schemeURI"] - if "schemeType" in identifier: - identifier_obj["schemeType"] = identifier["schemeType"] - - related_item_identifiers.append(identifier_obj) - if "title" in related_item: - related_item_titles = [] - for title in related_item["title"]: - title_obj = {"title": title["titleValue"]} - if "titleType" in title: - title_obj["titleType"] = title["titleType"] - related_item_titles.append(title_obj) - if "creator" in related_item: - related_item_creators = [] - for creator in related_item["creator"]: - creator_obj = { - "name": creator["creatorName"], - "nameType": creator["nameType"], - } - related_item_creators.append(creator_obj) - if "contributor" in related_item: - related_item_contributors = [] - for contributor in related_item["contributor"]: - contributor_obj = { - "name": contributor["contributorName"], - "contributorType": contributor["contributorType"], - } - if "nameType" in contributor: - contributor_obj["nameType"] = contributor["nameType"] - related_item_contributors.append(contributor_obj) - - related_item_obj = { - "relationType": related_item["relationType"], - "relatedItemType": related_item["relatedItemType"], - } - if related_item_creators: - related_item_obj["creators"] = related_item_creators - if related_item_contributors: - related_item_obj["contributors"] = related_item_contributors - if related_item_titles: - related_item_obj["titles"] = related_item_titles - if related_item_identifiers: - related_item_obj["relatedItemIdentifier"] = related_item_identifiers - if "publicationYear" in related_item: - related_item_obj["publicationYear"] = related_item["publicationYear"] - if "volume" in related_item: - related_item_obj["volume"] = related_item["volume"] - if "issue" in related_item: - related_item_obj["issue"] = related_item["issue"] - if "number" in related_item and "numberValue" in related_item["number"]: - related_item_obj["number"] = related_item["number"]["numberValue"] - if "number" in related_item and "numberType" in related_item["number"]: - related_item_obj["numberType"] = related_item["number"]["numberType"] - if "firstPage" in related_item: - related_item_obj["firstPage"] = related_item["firstPage"] - if "lastPage" in related_item: - related_item_obj["last_page"] = related_item["lastPage"] - if "publisher" in related_item: - related_item_obj["publisher"] = related_item["publisher"] - if "edition" in related_item: - related_item_obj["edition"] = related_item["edition"] - - related_items.append(related_item_obj) - - for alternate_identifier in dataset_description["AlternateIdentifier"]: - alternate_identifiers.append( - { - "alternateIdentifier": alternate_identifier["alternateIdentifierValue"], - "alternateIdentifierType": alternate_identifier[ - "alternateIdentifierType" - ], - } - ) - - for date in dataset_description["Date"]: - date_obj = { - "date": date["dateValue"], - "dateType": date["dateType"], - } - if "dateInformation" in date: - date_obj["dateInformation"] = date["dateInformation"] - dates.append(date_obj) - - for contributor in dataset_description["Contributor"]: - if "affiliation" in contributor: - contributor_affiliations = [] - for affiliation in contributor["affiliation"]: - # TODO: VERIFY BY KEY IS AFFILIATIONVALUE AND NOT NAME - affiliate = { - "name": affiliation["affiliationValue"], - } - if "schemeURI" in affiliation: - affiliate["schemeUri"] = affiliation["schemeURI"] - if "affiliationIdentifierScheme" in affiliation: - affiliate["affiliationIdentifierScheme"] = affiliation[ - "affiliationIdentifierScheme" - ] - if "affiliationIdentifier" in affiliation: - affiliate["affiliationIdentifier"] = affiliation[ - "affiliationIdentifier" - ] - - print(affiliate) - contributor_affiliations.append(affiliate) - if "nameIdentifier" in contributor: - name_identifiers = [] - for name_identifier in contributor["nameIdentifier"]: - name_identifier = { - "nameIdentifier": name_identifier["nameIdentifierValue"], - "nameIdentifierScheme": name_identifier["nameIdentifierScheme"], - } - if "schemeURI" in name_identifier: - name_identifier["schemeURI"] = name_identifier["schemeURI"] - name_identifiers.append(name_identifier) - - contributor_obj = { - "name": contributor["contributorName"], - "nameType": contributor["nameType"], - "contributorType": contributor["contributorType"], - } - if contributor_affiliations: - print(contributor_affiliations) - contributor_obj["affiliation"] = contributor_affiliations - if name_identifiers: - contributor_obj["nameIdentifiers"] = name_identifiers - - contributors.append(contributor_obj) - - for subject in dataset_description["Subject"]: - subject_obj = {} - if "classificationCode" in subject: - subject_obj["classificationCode"] = subject["classificationCode"] - if "subjectScheme" in subject: - subject_obj["subjectScheme"] = subject["subjectScheme"] - if "schemeURI" in subject: - subject_obj["schemeUri"] = subject["schemeURI"] - subject_obj["subject"] = subject["subjectValue"] - subjects.append(subject_obj) - - for title in dataset_description["Title"]: - title_obj = {"title": title["titleValue"]} - if "titleType" in title: - title_obj["titleType"] = title["titleType"] - titles.append(title_obj) - - for creator in dataset_description["Creator"]: - if "affiliation" in creator: - creator_affiliations = [] - for affiliation in creator["affiliation"]: - affiliate = { - "name": affiliation["affiliationValue"], - } - if "schemeURI" in affiliation: - affiliate["schemeUri"] = affiliation["schemeURI"] - if "affiliationIdentifierScheme" in affiliation: - affiliate["affiliationIdentifierScheme"] = affiliation[ - "affiliationIdentifierScheme" - ] - if "affiliationIdentifier" in affiliation: - affiliate["affiliationIdentifier"] = affiliation[ - "affiliationIdentifier" - ] - - creator_affiliations.append(affiliate) - if "nameIdentifier" in creator: - name_identifiers = [] - for name_identifier in creator["nameIdentifier"]: - name_identifier = { - "nameIdentifier": name_identifier["nameIdentifierValue"], - "nameIdentifierScheme": name_identifier["nameIdentifierScheme"], - } - if "schemeURI" in name_identifier: - name_identifier["schemeURI"] = name_identifier["schemeURI"] - name_identifiers.append(name_identifier) - - creator_obj = { - "name": creator["creatorName"], - "nameType": creator["nameType"], - } - if creator_affiliations: - creator_obj["affiliation"] = creator_affiliations - if name_identifiers: - creator_obj["nameIdentifiers"] = name_identifiers - - creators.append(creator_obj) - - for funding_reference in dataset_description["FundingReference"]: - funder_obj = {"funderName": funding_reference["funderName"]} - if ( - "funderIdentifier" in funding_reference - and "funderIdentifierValue" in funding_reference["funderIdentifier"] - ): - funder_obj["funderIdentifer"] = funding_reference["funderIdentifier"][ - "funderIdentifierValue" - ] - if ( - "funderIdentifier" in funding_reference - and "funderIdentifierType" in funding_reference["funderIdentifier"] - ): - funder_obj["funderIdentifierType"] = funding_reference["funderIdentifier"][ - "funderIdentifierType" - ] - - payload = { - "data": { - "type": "dois", - "attributes": { - "event": "publish", - "doi": doi, - "creators": creators, - "titles": titles, - "publisher": {"name": dataset_description["Publisher"]}, - "publicationYear": dataset_description["PublicationYear"], - "subjects": subjects, - "contributors": contributors, - "dates": dates, - "alternateIdentifiers": alternate_identifiers, - "types": { - "resourceTypeGeneral": dataset_description["ResourceType"][ - "resourceTypeGeneral" - ], - "resourceType": dataset_description["ResourceType"][ - "resourceTypeValue" - ], - }, - # "relatedItems": related_items, - "rightsList": rights_list, - "description": descriptions, - "version": dataset_description["Version"], - "fundingReferences": funding_references, - "url": "https://staging.fairhub.io/datasets/2", - }, - } - } - - print(dataset_description["Version"]) - if len(dataset_description["RelatedIdentifier"]) > 0: - payload["data"]["attributes"]["relatedIdentifiers"] = dataset_description[ - "RelatedIdentifier" - ] - if len(dataset_description["Size"]) > 0: - payload["data"]["attributes"]["sizes"] = dataset_description["Size"] - if dataset_description["Language"]: - payload["data"]["attributes"]["language"] = dataset_description["Language"] - - return payload - - def pipeline(): """Register a DOI for the dataset""" @@ -368,11 +49,8 @@ def pipeline(): # Load the dataset_description.json file dataset_description = json.loads(stream) - # print(json.dumps(dataset_description)) # Create payload for doi registration - print("UHHHH") - payload = create_payload(dataset_description) - print(json.dumps(payload)) + payload = pyfairdatatools.utils.convert_for_datacite(dataset_description) url = f"{config.DATACITE_API_URL}/dois" headers = {