From 3fc2aa440ea64c314c32ccb24c5609888a4e6ad0 Mon Sep 17 00:00:00 2001 From: Sanjay Soundarajan Date: Tue, 27 Feb 2024 12:30:14 -0800 Subject: [PATCH 1/5] =?UTF-8?q?=E2=9C=A8=20feat:=20add=20support=20for=20d?= =?UTF-8?q?atacite=20v4.5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../generate_dataset_description.py | 317 ++++++------------ .../generate_discovery_metadata.py | 14 +- publish_pipeline/register_doi/register_doi.py | 166 ++++----- 3 files changed, 173 insertions(+), 324 deletions(-) diff --git a/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py b/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py index 699d25e..4580c29 100644 --- a/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py +++ b/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py @@ -53,7 +53,7 @@ def pipeline(): identifier["identifierValue"] = doi[0] identifier["identifierType"] = "DOI" - dataset_metadata["Identifier"] = identifier + dataset_metadata["identifier"] = identifier titles = [] @@ -76,13 +76,13 @@ def pipeline(): titles.append(item) - dataset_metadata["Title"] = titles + dataset_metadata["title"] = titles # todo: generating a random uuid for now # Get the dataset version version = str(uuid.uuid4()) - dataset_metadata["Version"] = version + dataset_metadata["version"] = version alternate_identifiers = [] @@ -103,13 +103,13 @@ def pipeline(): alternate_identifiers.append(item) - dataset_metadata["AlternateIdentifier"] = alternate_identifiers + dataset_metadata["alternateIdentifier"] = alternate_identifiers creators = [] # Get the dataset creators cur.execute( - "SELECT name, name_type, name_identifier, name_identifier_scheme, name_identifier_scheme_uri, affiliations FROM dataset_contributor WHERE dataset_id = %s AND creator = true", + "SELECT family_name, given_name, name_type, name_identifier, name_identifier_scheme, name_identifier_scheme_uri, affiliations FROM dataset_contributor WHERE dataset_id = %s AND creator = true", (dataset_id,), ) @@ -119,25 +119,36 @@ def pipeline(): for creator in dataset_creators: item = {} - item["creatorName"] = creator[0] - item["nameType"] = creator[1] + creator_name = "" + + if (creator[0] is not None and creator[0] != "") and ( + creator[1] is not None and creator[1] != "" + ): + creator_name = f"{creator[0]}, {creator[1]}" + elif creator[0] is not None and creator[0] != "": + creator_name = creator[0] + elif creator[1] is not None and creator[1] != "": + creator_name = creator[1] + + item["creatorName"] = creator_name + item["nameType"] = creator[2] name_identifier = {} - name_identifier["nameIdentifierValue"] = creator[2] - name_identifier["nameIdentifierScheme"] = creator[3] - if creator[4] is not None and creator[4] != "": - name_identifier["schemeURI"] = creator[4] + name_identifier["nameIdentifierValue"] = creator[3] + name_identifier["nameIdentifierScheme"] = creator[4] + if creator[5] is not None and creator[5] != "": + name_identifier["schemeURI"] = creator[5] item["nameIdentifier"] = [name_identifier] - affiliations = creator[5] + affiliations = creator[6] item["affiliation"] = [] for affiliation in affiliations: affiliation_item = {} - affiliation_item["affiliationValue"] = affiliation["name"] + affiliation_item["affiliationName"] = affiliation["name"] if ( affiliation["identifier"] is not None and affiliation["identifier"] != "" @@ -160,13 +171,13 @@ def pipeline(): creators.append(item) - dataset_metadata["Creator"] = creators + dataset_metadata["creator"] = creators contributors = [] # Get the dataset contributors cur.execute( - "SELECT name, name_type, name_identifier, name_identifier_scheme, name_identifier_scheme_uri, contributor_type, affiliations FROM dataset_contributor WHERE dataset_id = %s AND creator = false", + "SELECT family_name, given_name, name_type, name_identifier, name_identifier_scheme, name_identifier_scheme_uri, contributor_type, affiliations FROM dataset_contributor WHERE dataset_id = %s AND creator = false", (dataset_id,), ) @@ -176,7 +187,19 @@ def pipeline(): for contributor in dataset_contributors: item = {} - item["contributorName"] = contributor[0] + contributor_name = "" + + if (contributor[0] is not None and contributor[0] != "") and ( + contributor[1] is not None and contributor[1] != "" + ): + contributor_name = f"{contributor[0]}, {contributor[1]}" + elif contributor[0] is not None and contributor[0] != "": + contributor_name = contributor[0] + elif contributor[1] is not None and contributor[1] != "": + contributor_name = contributor[1] + + item["contributorName"] = contributor_name + item["nameType"] = contributor[1] name_identifier = {} @@ -197,7 +220,7 @@ def pipeline(): for affiliation in affiliations: affiliation_item = {} - affiliation_item["affiliationValue"] = affiliation["name"] + affiliation_item["affiliationName"] = affiliation["name"] if ( affiliation["identifier"] is not None and affiliation["identifier"] != "" @@ -219,12 +242,12 @@ def pipeline(): contributors.append(item) - dataset_metadata["Contributor"] = contributors + dataset_metadata["contributor"] = contributors # Get the publication year publication_year = str(datetime.datetime.now().year) - dataset_metadata["PublicationYear"] = publication_year + dataset_metadata["publicationYear"] = publication_year dates = [] @@ -249,7 +272,7 @@ def pipeline(): dates.append(item) - dataset_metadata["Date"] = dates + dataset_metadata["date"] = dates resource_type = {} @@ -264,23 +287,7 @@ def pipeline(): resource_type["resourceTypeValue"] = dataset_resource_type[0] resource_type["resourceTypeGeneral"] = "Dataset" - dataset_metadata["ResourceType"] = resource_type - - dataset_record_keys = {} - - # Get the dataset record keys - cur.execute( - "SELECT key_type, key_details FROM dataset_record_keys WHERE dataset_id = %s", - (dataset_id,), - ) - - record_keys = cur.fetchone() - - dataset_record_keys["keysType"] = record_keys[0] - if record_keys[1] is not None and record_keys[1] != "": - dataset_record_keys["keysDetails"] = record_keys[1] - - dataset_metadata["DatasetRecordKeys"] = dataset_record_keys + dataset_metadata["resourceType"] = resource_type dataset_de_ident_level = {} @@ -301,7 +308,7 @@ def pipeline(): if de_ident_level[6] is not None and de_ident_level[6] != "": dataset_de_ident_level["deIdentDetails"] = de_ident_level[6] - dataset_metadata["DatasetDeIdentLevel"] = dataset_de_ident_level + dataset_metadata["datasetDeIdentLevel"] = dataset_de_ident_level dataset_consent = {} @@ -322,7 +329,7 @@ def pipeline(): if consent[6] is not None and consent[6] != "": dataset_consent["consentsDetails"] = consent[6] - dataset_metadata["DatasetConsent"] = dataset_consent + dataset_metadata["datasetConsent"] = dataset_consent descriptions = [] @@ -343,7 +350,7 @@ def pipeline(): descriptions.append(item) - dataset_metadata["Description"] = descriptions + dataset_metadata["description"] = descriptions cur.execute( "SELECT language FROM dataset_other WHERE dataset_id = %s", @@ -353,7 +360,7 @@ def pipeline(): dataset_language = cur.fetchone() if dataset_language[0] is not None and dataset_language[0] != "": - dataset_metadata["Language"] = dataset_language[0] + dataset_metadata["language"] = dataset_language[0] subjects = [] @@ -381,7 +388,7 @@ def pipeline(): subjects.append(item) - dataset_metadata["Subject"] = subjects + dataset_metadata["subject"] = subjects managing_organisation = {} @@ -400,7 +407,7 @@ def pipeline(): ): managing_organisation["rorId"] = dataset_managing_organisation[1] - dataset_metadata["ManagingOrganisation"] = managing_organisation + dataset_metadata["managingOrganisation"] = managing_organisation access_details = {} @@ -421,14 +428,14 @@ def pipeline(): access_details["urlLastChecked"] = timestamp.strftime("%Y-%m-%d") # Get the dataset Access Type - dataset_metadata["AccessType"] = dataset_access[0] - dataset_metadata["AccessDetails"] = access_details + dataset_metadata["accessType"] = dataset_access[0] + dataset_metadata["accessDetails"] = access_details rights = [] # Get the dataset rights cur.execute( - "SELECT rights, uri, identifier, identifier_scheme FROM dataset_rights WHERE dataset_id = %s", + "SELECT rightsName, uri, identifier, identifier_scheme FROM dataset_rights WHERE dataset_id = %s", (dataset_id,), ) @@ -439,6 +446,7 @@ def pipeline(): item = {} item["rightsValue"] = right[0] + if right[1] is not None and right[1] != "": item["rightsURI"] = right[1] if right[2] is not None and right[2] != "": @@ -448,17 +456,28 @@ def pipeline(): rights.append(item) - dataset_metadata["Rights"] = rights + dataset_metadata["rights"] = rights # Get the dataset publisher information cur.execute( - "SELECT publisher FROM dataset_other WHERE dataset_id = %s", + "SELECT publisher, identifier, identifier_scheme, scheme_uri FROM dataset_publisher WHERE dataset_id = %s", (dataset_id,), ) dataset_publisher = cur.fetchone() - dataset_metadata["Publisher"] = dataset_publisher[0] + publisher = {} + + publisher["publisherName"] = dataset_publisher[0] + + if dataset_publisher[1] is not None and dataset_publisher[1] != "": + publisher["publisherIdentifier"] = dataset_publisher[1] + if dataset_publisher[2] is not None and dataset_publisher[2] != "": + publisher["publisherIdentifierScheme"] = dataset_publisher[2] + if dataset_publisher[3] is not None and dataset_publisher[3] != "": + publisher["schemeURI"] = dataset_publisher[3] + + dataset_metadata["publisher"] = publisher sizes = [] @@ -474,7 +493,23 @@ def pipeline(): for size in dataset_sizes[0]: sizes.append(size) - dataset_metadata["Size"] = sizes + dataset_metadata["size"] = sizes + + formats = [] + + # Get the dataset formats + cur.execute( + "SELECT format FROM dataset_other WHERE dataset_id = %s", + (dataset_id,), + ) + + dataset_formats = cur.fetchone() + + if len(dataset_formats[0]) > 0: + for dataset_format in dataset_formats[0]: + formats.append(dataset_format) + + dataset_metadata["format"] = formats funding_references = [] @@ -509,179 +544,38 @@ def pipeline(): funding_references.append(item) - dataset_metadata["FundingReference"] = funding_references + dataset_metadata["fundingReference"] = funding_references - related_items = [] + related_identifiers = [] - # Get the dataset related items + # Get the dataset related identifiers cur.execute( - "SELECT id, type, relation_type FROM dataset_related_item WHERE dataset_id = %s", + "SELECT identifier, identifier_type, relation_type, related_metadata_scheme, scheme_uri, scheme_type, resource_type FROM dataset_related_identifier WHERE dataset_id = %s", (dataset_id,), ) - dataset_related_items = cur.fetchall() - - if dataset_related_items is not None: - for related_item in dataset_related_items: - related_item_id = related_item[0] + dataset_related_identifiers = cur.fetchall() + if dataset_related_identifiers is not None: + for related_identifier in dataset_related_identifiers: item = {} - item["relatedItemType"] = related_item[1] - item["relationType"] = related_item[2] - - item_identifiers = [] - - # Get the related item's identifiers - cur.execute( - "SELECT identifier, type, metadata_scheme, scheme_uri, scheme_type FROM dataset_related_item_identifier WHERE dataset_related_item_id = %s", - (related_item_id,), - ) - - related_item_identifiers = cur.fetchall() - - if related_item_identifiers is not None: - for related_item_identifier in related_item_identifiers: - item_identifier = {} - - item_identifier["relatedItemIdentifierValue"] = ( - related_item_identifier[0] - ) - item_identifier["relatedItemIdentifierType"] = ( - related_item_identifier[1] - ) - if ( - related_item_identifier[2] is not None - and related_item_identifier[2] != "" - ): - item_identifier["relatedMetadataScheme"] = ( - related_item_identifier[2] - ) - if ( - related_item_identifier[3] is not None - and related_item_identifier[3] != "" - ): - item_identifier["schemeURI"] = related_item_identifier[3] - if ( - related_item_identifier[4] is not None - and related_item_identifier[4] != "" - ): - item_identifier["schemeType"] = related_item_identifier[4] - - item_identifiers.append(item_identifier) - - item["relatedItemIdentifier"] = item_identifiers - - related_items.append(item) - - item_creators = [] - - # Get the related item's creators - cur.execute( - "SELECT name, name_type FROM dataset_related_item_contributor WHERE dataset_related_item_id = %s AND creator = true", - (related_item_id,), - ) - - related_item_creators = cur.fetchall() - - if related_item_creators is not None: - for related_item_creator in related_item_creators: - item_creator = {} - - item_creator["creatorName"] = related_item_creator[0] - item_creator["nameType"] = related_item_creator[1] - - item_creators.append(item_creator) - - item["creator"] = item_creators - - item_contributors = [] - - # Get the related item's contributors - cur.execute( - "SELECT name, name_type, contributor_type FROM dataset_related_item_contributor WHERE dataset_related_item_id = %s AND creator = false", - (related_item_id,), - ) - - related_item_contributors = cur.fetchall() - - if related_item_contributors is not None: - for related_item_contributor in related_item_contributors: - item_contributor = {} - - item_contributor["contributorName"] = related_item_contributor[0] - if ( - related_item_contributor[1] is not None - and related_item_contributor[1] != "" - ): - item_contributor["nameType"] = related_item_contributor[1] - item_contributor["contributorType"] = related_item_contributor[2] - - item_contributors.append(item_contributor) - - item["contributor"] = item_contributors - - item_titles = [] - - # Get the related item's titles - cur.execute( - "SELECT title, type FROM dataset_related_item_title WHERE dataset_related_item_id = %s", - (related_item_id,), - ) - - related_item_titles = cur.fetchall() - - if related_item_titles is not None: - for related_item_title in related_item_titles: - item_title = {} - - item_title["titleValue"] = related_item_title[0] - - if not related_item_title[1] == "MainTitle": - item_title["titleType"] = related_item_title[1] - - item_titles.append(item_title) - - item["title"] = item_titles - - # Get the related item's dates - cur.execute( - "SELECT publication_year, volume, issue, number_value, number_type, first_page, last_page, publisher, edition FROM dataset_related_item_other WHERE dataset_related_item_id = %s", - (related_item_id,), - ) - - related_item_other = cur.fetchone() - - if related_item_other[0] is not None and related_item_other[0] != "": - timestamp = datetime.datetime.fromtimestamp( - related_item_other[0] / 1000 - ) - item["publicationYear"] = str(timestamp.year) - if related_item_other[1] is not None and related_item_other[1] != "": - item["volume"] = related_item_other[1] - if related_item_other[2] is not None and related_item_other[2] != "": - item["issue"] = related_item_other[2] - - item["number"] = {} - if related_item_other[3] is not None and related_item_other[3] != "": - item["number"]["numberValue"] = related_item_other[3] - if related_item_other[4] is not None and related_item_other[4] != "": - item["number"]["numberType"] = related_item_other[4] - if item["number"] == {}: - del item["number"] + item["relatedIdentifierValue"] = related_identifier[0] + item["relatedIdentifierType"] = related_identifier[1] + item["relationType"] = related_identifier[2] - if related_item_other[5] is not None and related_item_other[5] != "": - item["firstPage"] = related_item_other[5] - if related_item_other[6] is not None and related_item_other[6] != "": - item["lastPage"] = related_item_other[6] - if related_item_other[7] is not None and related_item_other[7] != "": - item["publisher"] = related_item_other[7] - if related_item_other[8] is not None and related_item_other[8] != "": - item["edition"] = related_item_other[8] + if related_identifier[3] is not None and related_identifier[3] != "": + item["relatedMetadataScheme"] = related_identifier[3] + if related_identifier[4] is not None and related_identifier[4] != "": + item["schemeURI"] = related_identifier[4] + if related_identifier[5] is not None and related_identifier[5] != "": + item["schemeType"] = related_identifier[5] + if related_identifier[6] is not None and related_identifier[6] != "": + item["resourceTypeGeneral"] = related_identifier[6] - dataset_metadata["RelatedItem"] = related_items + related_identifiers.append(item) - dataset_metadata["RelatedIdentifier"] = [] + dataset_metadata["relatedIdentifier"] = related_identifiers conn.commit() conn.close() @@ -695,6 +589,7 @@ def pipeline(): data=dataset_metadata ) + # sourcery skip: raise-specific-error if not data_is_valid: raise Exception("Dataset description is not valid") diff --git a/publish_pipeline/generate_high_level_metadata/generate_discovery_metadata.py b/publish_pipeline/generate_high_level_metadata/generate_discovery_metadata.py index 88a67fe..17fa1d2 100644 --- a/publish_pipeline/generate_high_level_metadata/generate_discovery_metadata.py +++ b/publish_pipeline/generate_high_level_metadata/generate_discovery_metadata.py @@ -48,19 +48,19 @@ def pipeline(): title = dataset_title[0] - discovery_metadata["Title"] = title + discovery_metadata["title"] = title identifier = "10.5281/zenodo.7641684" - discovery_metadata["Identifier"] = identifier + discovery_metadata["identifier"] = identifier version = str(uuid.uuid4()) - discovery_metadata["Version"] = version + discovery_metadata["version"] = version publication_date = datetime.datetime.now().strftime("%Y-%m-%d") - discovery_metadata["PublicationDate"] = publication_date + discovery_metadata["publicationDate"] = publication_date detailed_description = "" @@ -70,7 +70,7 @@ def pipeline(): ) study_description = cur.fetchone() detailed_description = study_description[0] - discovery_metadata["About"] = detailed_description + discovery_metadata["about"] = detailed_description # license @@ -84,7 +84,7 @@ def pipeline(): dataset_rights = cur.fetchone() # license_text = dataset_other.join(",") license_text = dataset_rights[0] - discovery_metadata["License"] = license_text + discovery_metadata["license"] = license_text acknowledgement = "" @@ -96,7 +96,7 @@ def pipeline(): dataset_acknowledgement = cur.fetchone() acknowledgement = dataset_acknowledgement[0] if acknowledgement: - discovery_metadata["Acknowledgement"] = acknowledgement + discovery_metadata["acknowledgement"] = acknowledgement # conn.commit() conn.close() diff --git a/publish_pipeline/register_doi/register_doi.py b/publish_pipeline/register_doi/register_doi.py index 620f8b2..49d1544 100644 --- a/publish_pipeline/register_doi/register_doi.py +++ b/publish_pipeline/register_doi/register_doi.py @@ -27,20 +27,20 @@ def create_payload(dataset_description): contributors = [] dates = [] alternate_identifiers = [] - related_items = [] + related_identifiers = [] funding_references = [] rights_list = [] descriptions = [] - for description in dataset_description["Description"]: + for description in dataset_description["description"]: description_obj = { "description": description["descriptionValue"], "descriptionType": description["descriptionType"], } descriptions.append(description_obj) - for rights in dataset_description["Rights"]: - rights_obj = {"rights": rights["rightsValue"]} + for rights in dataset_description["rights"]: + rights_obj = {"rights": rights["rightsName"]} if "rightsURI" in rights: rights_obj["rightsUri"] = rights["rightsURI"] if "rightsIdentifier" in rights: @@ -49,7 +49,7 @@ def create_payload(dataset_description): rights_obj["rightsIdentifierScheme"] = rights["rightsIdentifierScheme"] rights_list.append(rights_obj) - for funder in dataset_description["FundingReference"]: + for funder in dataset_description["fundingReference"]: funder_obj = { "funderName": funder["funderName"], "funderIdentifier": funder["funderIdentifier"]["funderIdentifierValue"], @@ -67,86 +67,7 @@ def create_payload(dataset_description): funder_obj["funderIdentifierType"] = "Other" funding_references.append(funder_obj) - for related_item in dataset_description["RelatedItem"]: - if "relatedItemIdentifier" in related_item: - related_item_identifiers = [] - for identifier in related_item["relatedItemIdentifier"]: - identifier_obj = { - "relatedItemIdentifier": identifier["relatedItemIdentifierValue"], - "relatedItemIdentifierType": identifier[ - "relatedItemIdentifierType" - ], - } - if "relatedMetadataScheme" in identifier: - identifier_obj["relatedMetadataScheme"] = identifier[ - "relatedMetadataScheme" - ] - if "schemeURI" in identifier: - identifier_obj["schemeUri"] = identifier["schemeURI"] - if "schemeType" in identifier: - identifier_obj["schemeType"] = identifier["schemeType"] - - related_item_identifiers.append(identifier_obj) - if "title" in related_item: - related_item_titles = [] - for title in related_item["title"]: - title_obj = {"title": title["titleValue"]} - if "titleType" in title: - title_obj["titleType"] = title["titleType"] - related_item_titles.append(title_obj) - if "creator" in related_item: - related_item_creators = [] - for creator in related_item["creator"]: - creator_obj = { - "name": creator["creatorName"], - "nameType": creator["nameType"], - } - related_item_creators.append(creator_obj) - if "contributor" in related_item: - related_item_contributors = [] - for contributor in related_item["contributor"]: - contributor_obj = { - "name": contributor["contributorName"], - "contributorType": contributor["contributorType"], - } - if "nameType" in contributor: - contributor_obj["nameType"] = contributor["nameType"] - related_item_contributors.append(contributor_obj) - - related_item_obj = { - "relationType": related_item["relationType"], - "relatedItemType": related_item["relatedItemType"], - } - if related_item_creators: - related_item_obj["creators"] = related_item_creators - if related_item_contributors: - related_item_obj["contributors"] = related_item_contributors - if related_item_titles: - related_item_obj["titles"] = related_item_titles - if related_item_identifiers: - related_item_obj["relatedItemIdentifier"] = related_item_identifiers - if "publicationYear" in related_item: - related_item_obj["publicationYear"] = related_item["publicationYear"] - if "volume" in related_item: - related_item_obj["volume"] = related_item["volume"] - if "issue" in related_item: - related_item_obj["issue"] = related_item["issue"] - if "number" in related_item and "numberValue" in related_item["number"]: - related_item_obj["number"] = related_item["number"]["numberValue"] - if "number" in related_item and "numberType" in related_item["number"]: - related_item_obj["numberType"] = related_item["number"]["numberType"] - if "firstPage" in related_item: - related_item_obj["firstPage"] = related_item["firstPage"] - if "lastPage" in related_item: - related_item_obj["last_page"] = related_item["lastPage"] - if "publisher" in related_item: - related_item_obj["publisher"] = related_item["publisher"] - if "edition" in related_item: - related_item_obj["edition"] = related_item["edition"] - - related_items.append(related_item_obj) - - for alternate_identifier in dataset_description["AlternateIdentifier"]: + for alternate_identifier in dataset_description["alternateIdentifier"]: alternate_identifiers.append( { "alternateIdentifier": alternate_identifier["alternateIdentifierValue"], @@ -156,7 +77,7 @@ def create_payload(dataset_description): } ) - for date in dataset_description["Date"]: + for date in dataset_description["date"]: date_obj = { "date": date["dateValue"], "dateType": date["dateType"], @@ -165,13 +86,13 @@ def create_payload(dataset_description): date_obj["dateInformation"] = date["dateInformation"] dates.append(date_obj) - for contributor in dataset_description["Contributor"]: + for contributor in dataset_description["contributor"]: if "affiliation" in contributor: contributor_affiliations = [] for affiliation in contributor["affiliation"]: # TODO: VERIFY BY KEY IS AFFILIATIONVALUE AND NOT NAME affiliate = { - "name": affiliation["affiliationValue"], + "name": affiliation["affiliationName"], } if "schemeURI" in affiliation: affiliate["schemeUri"] = affiliation["schemeURI"] @@ -202,6 +123,14 @@ def create_payload(dataset_description): "nameType": contributor["nameType"], "contributorType": contributor["contributorType"], } + + contributor_name = contributor["contributorName"] + split_name = contributor_name.split(",") + + if len(split_name) > 1: + contributor_obj["familyName"] = split_name[0] + contributor_obj["givenName"] = split_name[1] + if contributor_affiliations: print(contributor_affiliations) contributor_obj["affiliation"] = contributor_affiliations @@ -210,7 +139,7 @@ def create_payload(dataset_description): contributors.append(contributor_obj) - for subject in dataset_description["Subject"]: + for subject in dataset_description["subject"]: subject_obj = {} if "classificationCode" in subject: subject_obj["classificationCode"] = subject["classificationCode"] @@ -221,18 +150,18 @@ def create_payload(dataset_description): subject_obj["subject"] = subject["subjectValue"] subjects.append(subject_obj) - for title in dataset_description["Title"]: + for title in dataset_description["title"]: title_obj = {"title": title["titleValue"]} if "titleType" in title: title_obj["titleType"] = title["titleType"] titles.append(title_obj) - for creator in dataset_description["Creator"]: + for creator in dataset_description["creator"]: if "affiliation" in creator: creator_affiliations = [] for affiliation in creator["affiliation"]: affiliate = { - "name": affiliation["affiliationValue"], + "name": affiliation["affiliationName"], } if "schemeURI" in affiliation: affiliate["schemeUri"] = affiliation["schemeURI"] @@ -261,6 +190,14 @@ def create_payload(dataset_description): "name": creator["creatorName"], "nameType": creator["nameType"], } + + creator_name = creator["creatorName"] + split_name = creator_name.split(",") + + if len(split_name) > 1: + creator_obj["familyName"] = split_name[0] + creator_obj["givenName"] = split_name[1] + if creator_affiliations: creator_obj["affiliation"] = creator_affiliations if name_identifiers: @@ -268,7 +205,7 @@ def create_payload(dataset_description): creators.append(creator_obj) - for funding_reference in dataset_description["FundingReference"]: + for funding_reference in dataset_description["fundingReference"]: funder_obj = {"funderName": funding_reference["funderName"]} if ( "funderIdentifier" in funding_reference @@ -285,6 +222,27 @@ def create_payload(dataset_description): "funderIdentifierType" ] + for related_identifier in dataset_description["relatedIdentifier"]: + related_identifier_obj = { + "relatedIdentifier": related_identifier["relatedIdentifierValue"], + "relatedIdentifierType": related_identifier["relatedIdentifierType"], + "relationType": related_identifier["relationType"], + } + + if "relatedMetadataScheme" in related_identifier: + related_identifier_obj["relatedMetadataScheme"] = related_identifier[ + "relatedMetadataScheme" + ] + if "schemeURI" in related_identifier: + related_identifier_obj["schemeURI"] = related_identifier["schemeURI"] + if "schemeType" in related_identifier: + related_identifier_obj["schemeType"] = related_identifier["schemeType"] + if "resourceTypeGeneral" in related_identifier: + related_identifier_obj["resourceTypeGeneral"] = related_identifier[ + "resource_type" + ] + related_identifiers.append(related_identifier_obj) + payload = { "data": { "type": "dois", @@ -293,38 +251,37 @@ def create_payload(dataset_description): "doi": doi, "creators": creators, "titles": titles, - "publisher": {"name": dataset_description["Publisher"]}, - "publicationYear": dataset_description["PublicationYear"], + "publisher": {"name": dataset_description["publisher"]}, + "publicationYear": dataset_description["publicationYear"], "subjects": subjects, "contributors": contributors, "dates": dates, "alternateIdentifiers": alternate_identifiers, "types": { - "resourceTypeGeneral": dataset_description["ResourceType"][ + "resourceTypeGeneral": dataset_description["resourceType"][ "resourceTypeGeneral" ], - "resourceType": dataset_description["ResourceType"][ + "resourceType": dataset_description["resourceType"][ "resourceTypeValue" ], }, - # "relatedItems": related_items, "rightsList": rights_list, "description": descriptions, - "version": dataset_description["Version"], + "version": dataset_description["version"], "fundingReferences": funding_references, "url": "https://staging.fairhub.io/datasets/2", }, } } - print(dataset_description["Version"]) - if len(dataset_description["RelatedIdentifier"]) > 0: + print(dataset_description["version"]) + if len(dataset_description["relatedIdentifier"]) > 0: payload["data"]["attributes"]["relatedIdentifiers"] = dataset_description[ "RelatedIdentifier" ] - if len(dataset_description["Size"]) > 0: + if len(dataset_description["size"]) > 0: payload["data"]["attributes"]["sizes"] = dataset_description["Size"] - if dataset_description["Language"]: + if dataset_description["language"]: payload["data"]["attributes"]["language"] = dataset_description["Language"] return payload @@ -368,11 +325,8 @@ def pipeline(): # Load the dataset_description.json file dataset_description = json.loads(stream) - # print(json.dumps(dataset_description)) # Create payload for doi registration - print("UHHHH") payload = create_payload(dataset_description) - print(json.dumps(payload)) url = f"{config.DATACITE_API_URL}/dois" headers = { From 36c9d0ee87e2519ce4d6bf4eb188de34888d5f8a Mon Sep 17 00:00:00 2001 From: Sanjay Soundarajan Date: Wed, 28 Feb 2024 18:51:26 -0800 Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=90=9B=20fix:=20update=20dataset=20pu?= =?UTF-8?q?blisher?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../generate_dataset_description.py | 25 ++++--------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py b/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py index 4580c29..9574d24 100644 --- a/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py +++ b/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py @@ -458,26 +458,11 @@ def pipeline(): dataset_metadata["rights"] = rights - # Get the dataset publisher information - cur.execute( - "SELECT publisher, identifier, identifier_scheme, scheme_uri FROM dataset_publisher WHERE dataset_id = %s", - (dataset_id,), - ) - - dataset_publisher = cur.fetchone() - - publisher = {} - - publisher["publisherName"] = dataset_publisher[0] - - if dataset_publisher[1] is not None and dataset_publisher[1] != "": - publisher["publisherIdentifier"] = dataset_publisher[1] - if dataset_publisher[2] is not None and dataset_publisher[2] != "": - publisher["publisherIdentifierScheme"] = dataset_publisher[2] - if dataset_publisher[3] is not None and dataset_publisher[3] != "": - publisher["schemeURI"] = dataset_publisher[3] - - dataset_metadata["publisher"] = publisher + # Create the publisher object + dataset_metadata["publisher"] = { + "publisherName": "FAIRhub", + "publisherIdentifier": "https://fairhub.io", + } sizes = [] From 5b4cc54f49348180090585596be68c525e5bbb73 Mon Sep 17 00:00:00 2001 From: Sanjay Soundarajan Date: Thu, 29 Feb 2024 17:38:53 -0800 Subject: [PATCH 3/5] =?UTF-8?q?=F0=9F=90=9B=20fix:=20managing=20organizati?= =?UTF-8?q?on=20key=20in=20dataset=20metadata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../generate_dataset_description.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py b/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py index 9574d24..3850e1e 100644 --- a/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py +++ b/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py @@ -390,7 +390,7 @@ def pipeline(): dataset_metadata["subject"] = subjects - managing_organisation = {} + managing_organization = {} # Get the dataset managing organization cur.execute( @@ -398,16 +398,16 @@ def pipeline(): (dataset_id,), ) - dataset_managing_organisation = cur.fetchone() + dataset_managing_organization = cur.fetchone() - managing_organisation["name"] = dataset_managing_organisation[0] + managing_organization["name"] = dataset_managing_organization[0] if ( - dataset_managing_organisation[1] is not None - and dataset_managing_organisation[1] != "" + dataset_managing_organization[1] is not None + and dataset_managing_organization[1] != "" ): - managing_organisation["rorId"] = dataset_managing_organisation[1] + managing_organization["rorId"] = dataset_managing_organization[1] - dataset_metadata["managingOrganisation"] = managing_organisation + dataset_metadata["managingOrganization"] = managing_organization access_details = {} @@ -435,7 +435,7 @@ def pipeline(): # Get the dataset rights cur.execute( - "SELECT rightsName, uri, identifier, identifier_scheme FROM dataset_rights WHERE dataset_id = %s", + "SELECT rights, uri, identifier, identifier_scheme, identifier_scheme_uri FROM dataset_rights WHERE dataset_id = %s", (dataset_id,), ) @@ -445,7 +445,7 @@ def pipeline(): for right in dataset_rights: item = {} - item["rightsValue"] = right[0] + item["rightsName"] = right[0] if right[1] is not None and right[1] != "": item["rightsURI"] = right[1] @@ -453,6 +453,8 @@ def pipeline(): item["rightsIdentifier"] = right[2] if right[3] is not None and right[3] != "": item["rightsIdentifierScheme"] = right[3] + if right[4] is not None and right[4] != "": + item["schemeURI"] = right[4] rights.append(item) @@ -461,7 +463,6 @@ def pipeline(): # Create the publisher object dataset_metadata["publisher"] = { "publisherName": "FAIRhub", - "publisherIdentifier": "https://fairhub.io", } sizes = [] From 4a474fe0bcf041c5c4d21f6029d4df6e7006fc98 Mon Sep 17 00:00:00 2001 From: Sanjay Soundarajan Date: Thu, 29 Feb 2024 17:40:33 -0800 Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=90=9B=20fix:=20add=20funderIdentifie?= =?UTF-8?q?rType=20to=20funderIdentifier?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../generate_dataset_description.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py b/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py index 3850e1e..ac2b5e0 100644 --- a/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py +++ b/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py @@ -514,9 +514,8 @@ def pipeline(): item["funderName"] = funding_reference[0] item["funderIdentifier"] = {} item["funderIdentifier"]["funderIdentifierValue"] = funding_reference[1] + item["funderIdentifier"]["funderIdentifierType"] = funding_reference[2] - if funding_reference[2] is not None and funding_reference[2] != "": - item["funderIdentifier"]["funderIdentifierType"] = funding_reference[2] if funding_reference[3] is not None and funding_reference[3] != "": item["funderIdentifier"]["schemeURI"] = funding_reference[3] From aa91b8440cfc02428ed00eb9571da4678597898f Mon Sep 17 00:00:00 2001 From: Sanjay Soundarajan Date: Thu, 30 May 2024 09:57:49 -0700 Subject: [PATCH 5/5] =?UTF-8?q?=E2=9C=A8=20feat:=20update=20study=5Fdescri?= =?UTF-8?q?ption=20(#12)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 🐛 fix: add more datacite changes * ✨ feat: update study_description * refactor: ♻️ conversion function transfer (#14) * :sparkles: feat: convert function transferred to pyfairdatatools * :recycle: refactor: call updated columns on managing org table * :recycle: refactor: call new table for managing org * :hammer: chore: remove unused modules * :sparkles: working doi registration with conversion fn transfer * chore: generate mermaid diagrams Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --------- Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Dorian Portillo Co-authored-by: megasanjay --- mermaid/svg/flow.svg | 2 +- mermaid/svg/publish.svg | 2 +- .../generate_dataset_description.py | 99 ++- .../generate_study_description.py | 645 ++++++++++-------- publish_pipeline/register_doi/register_doi.py | 280 +------- 5 files changed, 454 insertions(+), 574 deletions(-) diff --git a/mermaid/svg/flow.svg b/mermaid/svg/flow.svg index a158bd0..8f81e1d 100644 --- a/mermaid/svg/flow.svg +++ b/mermaid/svg/flow.svg @@ -1 +1 @@ -
TOP
B1
f1
i1
C
B2
f2
i2
A
B
\ No newline at end of file +
TOP
B1
f1
i1
C
B2
f2
i2
A
B
\ No newline at end of file diff --git a/mermaid/svg/publish.svg b/mermaid/svg/publish.svg index 40ec45f..688f424 100644 --- a/mermaid/svg/publish.svg +++ b/mermaid/svg/publish.svg @@ -1 +1 @@ -
prepare data
Generate CDS data structure
generate CDS data structure
Generate high level metadata
generate study_description.json
generate dataset_description.json
generate README.md
generate CHANGELOG.md
generate LICENSE
generate datasheet.md
generate citation.md
generate datatype_dictionary.json
generate participants.tsv
generate participants.json
generate CDS data structure
generate derivatives
User clicks 'publish' in UI
\ No newline at end of file +
prepare data
Generate CDS data structure
generate CDS data structure
Generate high level metadata
generate study_description.json
generate dataset_description.json
generate README.md
generate CHANGELOG.md
generate LICENSE
generate datasheet.md
generate citation.md
generate datatype_dictionary.json
generate participants.tsv
generate participants.json
generate CDS data structure
generate derivatives
User clicks 'publish' in UI
\ No newline at end of file diff --git a/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py b/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py index ac2b5e0..522d7b0 100644 --- a/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py +++ b/publish_pipeline/generate_high_level_metadata/generate_dataset_description.py @@ -5,6 +5,7 @@ import pathlib import tempfile import uuid +import json import azure.storage.blob as azureblob import psycopg2 @@ -28,8 +29,8 @@ def pipeline(): cur = conn.cursor() - study_id = "c588f59c-cacb-4e52-99dd-95b37dcbfd5c" - dataset_id = "af4be921-e507-41a9-9328-4cbb4b7dca1c" + study_id = "e631d9c1-a74a-413f-a5ce-64535a7302b0" + dataset_id = "f636e555-4c2d-4c89-a79b-a0a63bc29664" cur.execute( "SELECT * FROM dataset WHERE id = %s AND study_id = %s", @@ -149,26 +150,35 @@ def pipeline(): affiliation_item = {} affiliation_item["affiliationName"] = affiliation["name"] + + affiliation_item["affiliationIdentifier"] = {} + if ( affiliation["identifier"] is not None and affiliation["identifier"] != "" ): - affiliation_item["affiliationIdentifier"] = affiliation[ - "identifier" - ] + affiliation_item["affiliationIdentifier"][ + "affiliationIdentifierValue" + ] = affiliation["identifier"] + if affiliation["scheme"] is not None and affiliation["scheme"] != "": - affiliation_item["affiliationIdentifierScheme"] = affiliation[ - "scheme" - ] + affiliation_item["affiliationIdentifier"][ + "affiliationIdentifierScheme" + ] = affiliation["scheme"] if ( affiliation["scheme_uri"] is not None and affiliation["scheme_uri"] != "" ): - affiliation_item["schemeURI"] = affiliation["scheme_uri"] + affiliation_item["affiliationIdentifier"]["schemeURI"] = ( + affiliation["scheme_uri"] + ) item["affiliation"].append(affiliation_item) + if item["affiliation"] == []: + del item["affiliation"] + creators.append(item) dataset_metadata["creator"] = creators @@ -221,28 +231,39 @@ def pipeline(): affiliation_item = {} affiliation_item["affiliationName"] = affiliation["name"] + + affiliation_item["affiliationIdentifier"] = {} + if ( affiliation["identifier"] is not None and affiliation["identifier"] != "" ): - affiliation_item["affiliationIdentifier"] = affiliation[ - "identifier" - ] + affiliation_item["affiliationIdentifier"][ + "affiliationIdentifierValue" + ] = affiliation["identifier"] + if affiliation["scheme"] is not None and affiliation["scheme"] != "": - affiliation_item["affiliationIdentifierScheme"] = affiliation[ - "scheme" - ] + affiliation_item["affiliationIdentifier"][ + "affiliationIdentifierScheme" + ] = affiliation["scheme"] + if ( affiliation["scheme_uri"] is not None and affiliation["scheme_uri"] != "" ): - affiliation_item["schemeURI"] = affiliation["scheme_uri"] + affiliation_item["affiliationIdentifier"]["schemeURI"] = ( + affiliation["scheme_uri"] + ) item["affiliation"].append(affiliation_item) + if item["affiliation"] == []: + del item["affiliation"] + contributors.append(item) - dataset_metadata["contributor"] = contributors + if len(contributors) > 0: + dataset_metadata["contributor"] = contributors # Get the publication year publication_year = str(datetime.datetime.now().year) @@ -377,14 +398,20 @@ def pipeline(): item = {} item["subjectValue"] = subject[0] + + item["subjectIdentifier"] = {} + if subject[1] is not None and subject[1] != "": - item["subjectScheme"] = subject[1] + item["subjectIdentifier"]["subjectScheme"] = subject[1] + if subject[2] is not None and subject[2] != "": - item["schemeURI"] = subject[2] + item["subjectIdentifier"]["schemeURI"] = subject[2] + if subject[3] is not None and subject[3] != "": - item["valueURI"] = subject[3] + item["subjectIdentifier"]["valueURI"] = subject[3] + if subject[4] is not None and subject[4] != "": - item["classificationCode"] = subject[4] + item["subjectIdentifier"]["classificationCode"] = subject[4] subjects.append(item) @@ -394,19 +421,34 @@ def pipeline(): # Get the dataset managing organization cur.execute( - "SELECT managing_organization_name, managing_organization_ror_id FROM dataset_other WHERE dataset_id = %s", + "SELECT name, identifier, identifier_scheme, identifier_scheme_uri FROM dataset_managing_organization WHERE dataset_id = %s", (dataset_id,), ) dataset_managing_organization = cur.fetchone() + print(dataset_managing_organization) managing_organization["name"] = dataset_managing_organization[0] if ( dataset_managing_organization[1] is not None and dataset_managing_organization[1] != "" ): - managing_organization["rorId"] = dataset_managing_organization[1] + managing_organization["managingOrganizationIdentifier"] = {} + managing_organization["managingOrganizationIdentifier"]["managingOrganizationIdentifierValue"] = dataset_managing_organization[1] + + if ( + dataset_managing_organization[2] is not None + and dataset_managing_organization[2] != "" + ): + managing_organization["managingOrganizationIdentifier"]["managingOrganizationScheme"] = dataset_managing_organization[2] + if ( + dataset_managing_organization[3] is not None + and dataset_managing_organization[3] != "" + ): + managing_organization["managingOrganizationIdentifier"]["schemeURI"] = dataset_managing_organization[3] + + print(managing_organization) dataset_metadata["managingOrganization"] = managing_organization access_details = {} @@ -449,12 +491,15 @@ def pipeline(): if right[1] is not None and right[1] != "": item["rightsURI"] = right[1] + + item["rightsIdentifier"] = {} + if right[2] is not None and right[2] != "": - item["rightsIdentifier"] = right[2] + item["rightsIdentifier"]["rightsIdentifierValue"] = right[2] if right[3] is not None and right[3] != "": - item["rightsIdentifierScheme"] = right[3] + item["rightsIdentifier"]["rightsIdentifierScheme"] = right[3] if right[4] is not None and right[4] != "": - item["schemeURI"] = right[4] + item["rightsIdentifier"]["schemeURI"] = right[4] rights.append(item) @@ -569,6 +614,8 @@ def pipeline(): temp_folder_path = tempfile.mkdtemp() temp_file_path = pathlib.Path(temp_folder_path, "dataset_description.json") + + print(json.dumps(dataset_metadata)) data_is_valid = pyfairdatatools.validate.validate_dataset_description( data=dataset_metadata diff --git a/publish_pipeline/generate_high_level_metadata/generate_study_description.py b/publish_pipeline/generate_high_level_metadata/generate_study_description.py index 6cd37ae..3f005de 100644 --- a/publish_pipeline/generate_high_level_metadata/generate_study_description.py +++ b/publish_pipeline/generate_high_level_metadata/generate_study_description.py @@ -29,11 +29,11 @@ def pipeline(): study_id = "c588f59c-cacb-4e52-99dd-95b37dcbfd5c" - cur.execute("SELECT * FROM study WHERE id = %s", (study_id,)) + cur.execute("SELECT title, acronym FROM study WHERE id = %s", (study_id,)) study = cur.fetchone() - if study is None: + if study[0] is None: return "Study not found" identification_module = {} @@ -46,26 +46,31 @@ def pipeline(): primary_study_identification = cur.fetchone() - identification_module["OrgStudyIdInfo"] = {} + identification_module["officialTitle"] = study[0] + + if study[1] is not None and study[1] != "": + identification_module["acronym"] = {} + + identification_module["orgStudyIdInfo"] = {} # Study Identifier - identification_module["OrgStudyIdInfo"]["OrgStudyId"] = ( + identification_module["orgStudyIdInfo"]["orgStudyId"] = ( primary_study_identification[0] ) # Study Identifier Type - identification_module["OrgStudyIdInfo"]["OrgStudyIdType"] = ( + identification_module["orgStudyIdInfo"]["orgStudyIdType"] = ( primary_study_identification[1] ) if primary_study_identification[2] and primary_study_identification[2] != "": # Study Identifier Domain - identification_module["OrgStudyIdInfo"]["OrgStudyIdDomain"] = ( + identification_module["orgStudyIdInfo"]["orgStudyIdDomain"] = ( primary_study_identification[2] ) if primary_study_identification[3] and primary_study_identification[3] != "": # Study Identifier Link - identification_module["OrgStudyIdInfo"]["OrgStudyIdLink"] = ( + identification_module["orgStudyIdInfo"]["orgStudyIdLink"] = ( primary_study_identification[3] ) @@ -77,25 +82,25 @@ def pipeline(): secondary_study_identification = cur.fetchall() - identification_module["SecondaryIdInfoList"] = [] + identification_module["secondaryIdInfoList"] = [] for row in secondary_study_identification: item = {} # Study Identifier and Study Identifier Type - item["SecondaryId"] = row[0] - item["SecondaryIdType"] = row[1] + item["secondaryId"] = row[0] + item["secondaryIdType"] = row[1] if row[2]: # Study Identifer Domain - item["SecondaryIdDomain"] = row[2] + item["secondaryIdDomain"] = row[2] if row[3]: # Study Identifier Link - item["SecondaryIdLink"] = row[3] + item["secondaryIdLink"] = row[3] - identification_module["SecondaryIdInfoList"].append(item) + identification_module["secondaryIdInfoList"].append(item) - study_metadata["IdentificationModule"] = identification_module + study_metadata["identificationModule"] = identification_module status_module = {} @@ -107,75 +112,166 @@ def pipeline(): study_status = cur.fetchone() - status_module["OverallStatus"] = study_status[0] - status_module["WhyStopped"] = study_status[1] + status_module["overallStatus"] = study_status[0] + status_module["whyStopped"] = study_status[1] start_date = datetime.datetime.strptime(study_status[2], "%Y-%m-%d %H:%M:%S") - status_module["StartDateStruct"] = { + status_module["startDateStruct"] = { # date format: Month DD, YYYY - "StartDate": start_date.strftime("%B %d, %Y"), - "StartDateType": study_status[3], + "startDate": start_date.strftime("%B %d, %Y"), + "startDateType": study_status[3], } completion_date = datetime.datetime.strptime(study_status[4], "%Y-%m-%d %H:%M:%S") - status_module["CompletionDateStruct"] = { - "CompletionDate": completion_date.strftime("%B %d, %Y"), - "CompletionDateType": study_status[5], + status_module["completionDateStruct"] = { + "completionDate": completion_date.strftime("%B %d, %Y"), + "completionDateType": study_status[5], } - study_metadata["StatusModule"] = status_module + study_metadata["statusModule"] = status_module sponsor_collaborators_module = {} - # Get the study sponsor and collaborators metadata + # Get the study sponsor metadata cur.execute( - "SELECT responsible_party_type, responsible_party_investigator_name, responsible_party_investigator_title, responsible_party_investigator_affiliation, lead_sponsor_name, collaborator_name FROM study_sponsors_collaborators WHERE study_id = %s", + "SELECT responsible_party_type, responsible_party_investigator_first_name, responsible_party_investigator_last_name, responsible_party_investigator_title, responsible_party_investigator_identifier_value, responsible_party_investigator_identifier_scheme, responsible_party_investigator_identifier_scheme_uri, responsible_party_investigator_affiliation_name, responsible_party_investigator_affiliation_identifier_value, responsible_party_investigator_affiliation_identifier_scheme, responsible_party_investigator_affiliation_identifier_scheme_uri, lead_sponsor_name, lead_sponsor_identifier, lead_sponsor_scheme, lead_sponsor_scheme_uri FROM study_sponsors WHERE study_id = %s", (study_id,), ) - sponsor_collaborators = cur.fetchone() + study_sponsors = cur.fetchone() - sponsor_collaborators_module["ResponsibleParty"] = { - "ResponsiblePartyType": sponsor_collaborators[0], - "ResponsiblePartyInvestigatorFullName": sponsor_collaborators[1], - "ResponsiblePartyInvestigatorTitle": sponsor_collaborators[2], - "ResponsiblePartyInvestigatorAffiliation": sponsor_collaborators[3], - } + responsible_party = {} - sponsor_collaborators_module["LeadSponsor"] = { - "LeadSponsorName": sponsor_collaborators[4] - } + responsible_party["responsiblePartyType"] = study_sponsors[0] + + if study_sponsors[1] is not None and study_sponsors[1] != "": + responsible_party["responsiblePartyInvestigatorFirstName"] = study_sponsors[1] + if study_sponsors[2] is not None and study_sponsors[2] != "": + responsible_party["responsiblePartyInvestigatorLastName"] = study_sponsors[2] + if study_sponsors[3] is not None and study_sponsors[3] != "": + responsible_party["responsiblePartyInvestigatorTitle"] = study_sponsors[3] + if study_sponsors[4] is not None and study_sponsors[4] != "": + responsible_party["responsiblePartyInvestigatorIdentifier"] = {} + + responsible_party["responsiblePartyInvestigatorIdentifier"][ + "responsiblePartyInvestigatorIdentifierValue" + ] = study_sponsors[4] + + if study_sponsors[5] is not None and study_sponsors[5] != "": + responsible_party["responsiblePartyInvestigatorIdentifier"][ + "responsiblePartyInvestigatorIdentifierScheme" + ] = study_sponsors[5] - sponsor_collaborators_module["CollaboratorList"] = [] + if study_sponsors[6] is not None and study_sponsors[6] != "": + responsible_party["responsiblePartyInvestigatorIdentifier"]["schemeURI"] = ( + study_sponsors[6] + ) - sponsor_collaborators = sponsor_collaborators[5] + if study_sponsors[7] is not None and study_sponsors[7] != "": + responsible_party["responsiblePartyInvestigatorAffiliation"] = {} + + responsible_party["responsiblePartyInvestigatorAffiliation"][ + "responsiblePartyInvestigatorAffiliationName" + ] = study_sponsors[7] + + if study_sponsors[8] is not None and study_sponsors[8] != "": + responsible_party["responsiblePartyInvestigatorAffiliation"][ + "responsiblePartyInvestigatorAffiliationIdentifier" + ] = {} + + responsible_party["responsiblePartyInvestigatorAffiliation"][ + "responsiblePartyInvestigatorAffiliationIdentifier" + ][ + "responsiblePartyInvestigatorAffiliationIdentifierValue" + ] = study_sponsors[ + 8 + ] + + if study_sponsors[9] is not None and study_sponsors[9] != "": + responsible_party["responsiblePartyInvestigatorAffiliation"][ + "responsiblePartyInvestigatorAffiliationIdentifier" + ][ + "responsiblePartyInvestigatorAffiliationIdentifierScheme" + ] = study_sponsors[ + 9 + ] + + if study_sponsors[10] is not None and study_sponsors[10] != "": + responsible_party["responsiblePartyInvestigatorAffiliation"][ + "responsiblePartyInvestigatorAffiliationIdentifier" + ]["schemeURI"] = study_sponsors[10] + + sponsor_collaborators_module["responsibleParty"] = responsible_party + + lead_sponsor = {"leadSponsorName": study_sponsors[11]} + + if study_sponsors[12] is not None and study_sponsors[12] != "": + lead_sponsor["leadSponsor"]["leadSponsorIdentifier"] = { + "leadSponsorIdentifierValue": study_sponsors[12] + } + if study_sponsors[13] is not None and study_sponsors[13] != "": + lead_sponsor["leadSponsor"]["leadSponsorIdentifier"][ + "leadSponsorIdentifierScheme" + ] = study_sponsors[13] + + sponsor_collaborators_module["leadSponsor"] = lead_sponsor + + # Get the study collaborators metadata + cur.execute( + "SELECT name, identifier, scheme, scheme_uri FROM study_collaborators WHERE study_id = %s", + (study_id,), + ) - for row in sponsor_collaborators: - # Add the collabarator(s) to the list - item = {"CollaboratorName": row} + study_collaborators = cur.fetchall() + + collaborators = [] + + for row in study_collaborators: + item = {} - sponsor_collaborators_module["CollaboratorList"].append(item) + item["collaboratorName"] = row[0] - study_metadata["SponsorCollaboratorsModule"] = sponsor_collaborators_module + if row[1] is not None and row[1] != "": + item["collaboratorNameIdentifier"] = { + "collaboratorNameIdentifierValue": row[1] + } + + if row[2] is not None and row[2] != "": + item["collaboratorNameIdentifier"][ + "collaboratorNameIdentifierScheme" + ] = row[2] + if row[3] is not None and row[3] != "": + item["collaboratorNameIdentifier"]["schemeURI"] = row[3] + + collaborators.append(item) + + sponsor_collaborators_module["collaboratorList"] = collaborators + + study_metadata["sponsorCollaboratorsModule"] = sponsor_collaborators_module oversight_module = {} # Get the study oversight metadata cur.execute( - "SELECT oversight_has_dmc FROM study_other WHERE study_id = %s", + "SELECT fda_regulated_drug, fda_regulated_device, human_subject_review_status, has_dmc FROM study_oversight WHERE study_id = %s", (study_id,), ) study_oversight = cur.fetchone() - if study_oversight[0]: - oversight_module["OversightHasDMC"] = "Yes" - else: - oversight_module["OversightHasDMC"] = "No" + if study_oversight[0] is not None and study_oversight[0] != "": + oversight_module["isFDARegulatedDrug"] = study_oversight[0] + if study_oversight[1] is not None and study_oversight[1] != "": + oversight_module["isFDARegulatedDevice"] = study_oversight[1] + + oversight_module["humanSubjectReviewStatus"] = study_oversight[2] - study_metadata["OversightModule"] = oversight_module + if study_oversight[3] is not None and study_oversight[3] != "": + oversight_module["oversightHasDMC"] = study_oversight[3] + + study_metadata["oversightModule"] = oversight_module description_module = {} @@ -187,110 +283,157 @@ def pipeline(): study_description = cur.fetchone() - description_module["BriefSummary"] = study_description[0] + description_module["briefSummary"] = study_description[0] if study_description[1] and study_description[1] != "": - description_module["DetailedDescription"] = study_description[1] + description_module["detailedDescription"] = study_description[1] - study_metadata["DescriptionModule"] = description_module + study_metadata["descriptionModule"] = description_module conditions_module = {} # Get the study conditions metadata cur.execute( - "SELECT conditions, keywords FROM study_other WHERE study_id = %s", + "SELECT name, classification_code, scheme, scheme_uri, condition_uri FROM study_conditions WHERE study_id = %s", + (study_id,), + ) + + study_conditions = cur.fetchall() + + conditions_list = [] + + for row in study_conditions: + item = {} + + item["conditionName"] = row[0] + + if row[1] is not None and row[1] != "": + item["conditionIdentifier"] = {"conditionClassificationCode": row[1]} + + if row[2] is not None and row[2] != "": + item["conditionIdentifier"]["conditionScheme"] = row[2] + + if row[3] is not None and row[3] != "": + item["conditionIdentifier"]["schemeURI"] = row[3] + + if row[4] is not None and row[4] != "": + item["conditionIdentifier"]["conditionURI"] = row[4] + + conditions_list.append(item) + + conditions_module["conditionList"] = conditions_list + + # Get the study keywords metadata + cur.execute( + "SELECT name, classification_code, scheme, scheme_uri, keyword_uri FROM study_keywords WHERE study_id = %s", (study_id,), ) - study_conditions = cur.fetchone() + study_keywords = cur.fetchall() - conditions_module["ConditionList"] = [] - conditions = study_conditions[0] + keywords_list = [] - for row in conditions: - conditions_module["ConditionList"].append(row) + for row in study_keywords: + item = {} - # todo: add keywords from the UI and API - conditions_module["KeywordList"] = ["Dataset"] - keywords = study_conditions[1] - for row in keywords: - conditions_module["KeywordList"].append(row) + item["keywordName"] = row[0] - study_metadata["ConditionsModule"] = conditions_module + if row[1] is not None and row[1] != "": + item["keywordIdentifier"] = {"keywordClassificationCode": row[1]} + + if row[2] is not None and row[2] != "": + item["keywordIdentifier"]["keywordScheme"] = row[2] + + if row[3] is not None and row[3] != "": + item["keywordIdentifier"]["schemeURI"] = row[3] + + if row[4] is not None and row[4] != "": + item["keywordIdentifier"]["keywordURI"] = row[4] + + keywords_list.append(item) + + conditions_module["keywordList"] = keywords_list + + study_metadata["conditionsModule"] = conditions_module design_module = {} # Get the study design metadata cur.execute( - "SELECT study_type, design_allocation, design_intervention_model, design_intervention_model_description, design_primary_purpose, design_masking, design_masking_description, design_who_masked_list, phase_list, enrollment_count, enrollment_type, number_arms,design_observational_model_list, design_time_perspective_list, bio_spec_retention, bio_spec_description, target_duration, number_groups_cohorts FROM study_design WHERE study_id = %s", + "SELECT study_type, design_allocation, design_intervention_model, design_intervention_model_description, design_primary_purpose, design_masking, design_masking_description, design_who_masked_list, phase_list, enrollment_count, enrollment_type, number_arms,design_observational_model_list, design_time_perspective_list, bio_spec_retention, bio_spec_description, target_duration, number_groups_cohorts, isPatientRegistry FROM study_design WHERE study_id = %s", (study_id,), ) study_design = cur.fetchone() study_type = study_design[0] - design_module["StudyType"] = study_type + design_module["studyType"] = study_type if study_type == "Interventional": - design_module["DesignInfo"] = {} - design_module["DesignInfo"]["DesignAllocation"] = study_design[1] - design_module["DesignInfo"]["DesignInterventionModel"] = study_design[2] + design_module["designInfo"] = {} + design_module["designInfo"]["designAllocation"] = study_design[1] + design_module["designInfo"]["designInterventionModel"] = study_design[2] if study_design[3] and study_design[3] != "": - design_module["DesignInfo"]["DesignInterventionModelDescription"] = ( + design_module["designInfo"]["designInterventionModelDescription"] = ( study_design[3] ) - design_module["DesignInfo"]["DesignPrimaryPurpose"] = study_design[4] + design_module["designInfo"]["designPrimaryPurpose"] = study_design[4] - design_module["DesignInfo"]["DesignMaskingInfo"] = {} - design_module["DesignInfo"]["DesignMaskingInfo"]["DesignMasking"] = ( + design_module["designInfo"]["designMaskingInfo"] = {} + design_module["designInfo"]["designMaskingInfo"]["designMasking"] = ( study_design[5] ) - design_module["DesignInfo"]["DesignMaskingInfo"]["DesignMaskingDescription"] = ( + design_module["designInfo"]["designMaskingInfo"]["designMaskingDescription"] = ( study_design[6] ) - design_module["DesignInfo"]["DesignMaskingInfo"]["DesignWhoMaskedList"] = [] + design_module["designInfo"]["designMaskingInfo"]["designWhoMaskedList"] = [] if study_design[7] is not None: for row in study_design[7]: - design_module["DesignInfo"]["DesignMaskingInfo"][ - "DesignWhoMaskedList" + design_module["designInfo"]["designMaskingInfo"][ + "designWhoMaskedList" ].append(row) - design_module["PhaseList"] = [] + design_module["phaseList"] = [] if study_design[8] is not None: for row in study_design[8]: - design_module["PhaseList"].append(row) + design_module["phaseList"].append(row) - design_module["EnrollmentInfo"] = {} - design_module["EnrollmentInfo"]["EnrollmentCount"] = str(study_design[9]) - design_module["EnrollmentInfo"]["EnrollmentType"] = study_design[10] + design_module["enrollmentInfo"] = {} + design_module["enrollmentInfo"]["enrollmentCount"] = str(study_design[9]) + design_module["enrollmentInfo"]["enrollmentType"] = study_design[10] - if study_type == "Interventional": - design_module["NumberArms"] = str(study_design[11]) + if study_type == "interventional": + design_module["numberArms"] = str(study_design[11]) + + if study_type == "observational": + design_module["designInfo"] = {} + design_module["designInfo"]["designObservationalModelList"] = [] - if study_type == "Observational": - design_module["DesignInfo"] = {} - design_module["DesignInfo"]["DesignObservationalModelList"] = [] if study_design[12] is not None: for row in study_design[12]: - design_module["DesignInfo"]["DesignObservationalModelList"].append(row) + design_module["designInfo"]["designObservationalModelList"].append(row) - design_module["DesignInfo"]["DesignTimePerspectiveList"] = [] + design_module["designInfo"]["designTimePerspectiveList"] = [] if study_design[13] is not None: for row in study_design[13]: - design_module["DesignInfo"]["DesignTimePerspectiveList"].append(row) + design_module["designInfo"]["designTimePerspectiveList"].append(row) + + design_module["bioSpec"] = {} + design_module["bioSpec"]["bioSpecRetention"] = study_design[14] - design_module["BioSpec"] = {} - design_module["BioSpec"]["BioSpecRetention"] = study_design[14] if study_design[15] is not None and study_design[15] != "": - design_module["BioSpec"]["BioSpecDescription"] = study_design[15] + design_module["bioSpec"]["bioSpecDescription"] = study_design[15] - design_module["TargetDuration"] = study_design[16] - design_module["NumberGroupsCohorts"] = str(study_design[17]) + design_module["targetDuration"] = study_design[16] + design_module["numberGroupsCohorts"] = str(study_design[17]) - study_metadata["DesignModule"] = design_module + if study_design[18] is not None and study_design[18] != "": + design_module["isPatientRegistry"] = study_design[18] + + study_metadata["designModule"] = design_module arms_interventions_module = {} @@ -302,285 +445,251 @@ def pipeline(): study_arms = cur.fetchall() - arms_interventions_module["ArmGroupList"] = [] + arms_interventions_module["armGroupList"] = [] for row in study_arms: item = {} - item["ArmGroupLabel"] = row[0] + item["armGroupLabel"] = row[0] if study_type == "Interventional": - item["ArmGroupType"] = row[1] + item["armGroupType"] = row[1] - if row[2] is not None and row[2] != "": - item["ArmGroupDescription"] = row[2] + item["armGroupDescription"] = row[2] if study_type == "Interventional" and row[3] is not None and len(row[3]) > 0: - item["ArmGroupInterventionList"] = [] + item["armGroupInterventionList"] = [] for intervention in row[3]: - item["ArmGroupInterventionList"].append(intervention) + item["armGroupInterventionList"].append(intervention) - arms_interventions_module["ArmGroupList"].append(item) + arms_interventions_module["armGroupList"].append(item) # Get the study interventions metadata cur.execute( - "SELECT type, name, description, arm_group_label_list, other_name_list FROM study_intervention WHERE study_id = %s", + "SELECT type, name, description, other_name_list FROM study_intervention WHERE study_id = %s", (study_id,), ) study_interventions = cur.fetchall() - arms_interventions_module["InterventionList"] = [] + arms_interventions_module["interventionList"] = [] for row in study_interventions: item = {} - item["InterventionType"] = row[0] - item["InterventionName"] = row[1] - if row[2] is not None and row[2] != "": - item["InterventionDescription"] = row[2] - - item["InterventionArmGroupLabelList"] = [] + item["interventionType"] = row[0] + item["interventionName"] = row[1] + item["interventionDescription"] = row[2] - if row[3] is not None: - for arm_group_label in row[3]: - item["InterventionArmGroupLabelList"].append(arm_group_label) + if row[3] is not None and len(row[3]) > 0: + item["interventionOtherNameList"] = [] - item["InterventionOtherNameList"] = [] + for other_name in row[3]: + item["interventionOtherNameList"].append(other_name) - if row[4] is not None: - for other_name in row[4]: - item["InterventionOtherNameList"].append(other_name) + arms_interventions_module["interventionList"].append(item) - arms_interventions_module["InterventionList"].append(item) - - study_metadata["ArmsInterventionsModule"] = arms_interventions_module + study_metadata["armsInterventionsModule"] = arms_interventions_module eligibility_module = {} # Get the study eligibility metadata cur.execute( - "SELECT gender, gender_based, gender_description, minimum_age_value, minimum_age_unit, maximum_age_value, maximum_age_unit, healthy_volunteers, inclusion_criteria, exclusion_criteria, study_population, sampling_method FROM study_eligibility WHERE study_id = %s", + "SELECT sex, gender_based, gender_description, minimum_age_value, minimum_age_unit, maximum_age_value, maximum_age_unit, healthy_volunteers, inclusion_criteria, exclusion_criteria, study_population, sampling_method FROM study_eligibility WHERE study_id = %s", (study_id,), ) study_eligibility = cur.fetchone() - eligibility_module["Gender"] = study_eligibility[0] - eligibility_module["GenderBased"] = study_eligibility[1] - eligibility_module["GenderDescription"] = study_eligibility[2] - eligibility_module["MinimumAge"] = f"{study_eligibility[3]} {study_eligibility[4]}" - eligibility_module["MaximumAge"] = f"{study_eligibility[5]} {study_eligibility[6]}" - if study_eligibility[7] is not None and study_eligibility[7] != "": - eligibility_module["HealthyVolunteers"] = study_eligibility[7] - if study_type == "Observational": - eligibility_module["StudyPopulation"] = study_eligibility[10] - eligibility_module["SamplingMethod"] = study_eligibility[11] + eligibility_module["sex"] = study_eligibility[0] + eligibility_module["genderBased"] = study_eligibility[1] + eligibility_module["genderDescription"] = study_eligibility[2] + eligibility_module["minimumAge"] = f"{study_eligibility[3]} {study_eligibility[4]}" + eligibility_module["maximumAge"] = f"{study_eligibility[5]} {study_eligibility[6]}" + eligibility_module["healthyVolunteers"] = study_eligibility[7] - eligibility_criteria = "" - - if study_eligibility[8] is not None: - eligibility_criteria = "Inclusion Criteria\n" + if study_type == "Observational": + eligibility_module["studyPopulation"] = study_eligibility[10] + eligibility_module["samplingMethod"] = study_eligibility[11] - for criteria in study_eligibility[8]: - eligibility_criteria += f"* {criteria}\n" + eligibility_criteria = { + "eligibilityCriteriaInclusion": [], + "eligibilityCriteriaExclusion": [], + } - if study_eligibility[9] is not None: - eligibility_criteria += "\nExclusion Criteria\n" + if study_eligibility[8] is not None and len(study_eligibility[8]) > 0: + eligibility_criteria["eligibilityCriteriaInclusion"] = study_eligibility[8] - for criteria in study_eligibility[9]: - eligibility_criteria += f"* {criteria}\n" + if study_eligibility[9] is not None and len(study_eligibility[9]) > 0: + eligibility_criteria["eligibilityCriteriaExclusion"] = study_eligibility[9] - eligibility_module["EligibilityCriteria"] = eligibility_criteria + eligibility_module["eligibilityCriteria"] = eligibility_criteria - study_metadata["EligibilityModule"] = eligibility_module + study_metadata["eligibilityModule"] = eligibility_module contacts_locations_module = {} # Get the study contacts and locations metadata cur.execute( - "SELECT name, affiliation, phone, phone_ext, email_address FROM study_contact WHERE study_id = %s AND central_contact = true", + "SELECT first_name, last_name, degree, identifier, identifier_scheme, identifier_scheme_uri, affiliation, affiliation_identifier, affiliation_identifier_scheme, affiliation_identifier_scheme_uri, phone, phone_ext, email_address FROM study_central_contact WHERE study_id = %s", (study_id,), ) study_central_contacts = cur.fetchall() - contacts_locations_module["CentralContactList"] = [] + central_contacts = [] if study_central_contacts is not None: for row in study_central_contacts: item = {} - item["CentralContactName"] = row[0] - item["CentralContactAffiliation"] = row[1] - item["CentralContactPhone"] = row[2] - if row[3] is not None and row[3] != "": - item["CentralContactPhoneExt"] = row[3] - item["CentralContactEMail"] = row[4] + item["centralContactFirstName"] = row[0] + item["centralContactLastName"] = row[1] - contacts_locations_module["CentralContactList"].append(item) + if row[2] is not None and row[2] != "": + item["centralContactDegree"] = row[2] - # Get the study contacts metadata - cur.execute( - "SELECT name, affiliation, role FROM study_overall_official WHERE study_id = %s", - (study_id,), - ) + if row[3] is not None and row[3] != "": + item["centralContactIdentifier"] = {} - contacts_locations_module["OverallOfficialList"] = [] + item["centralContactIdentifier"]["centralContactIdentifierValue"] = row[ + 3 + ] + item["centralContactIdentifierScheme"] = row[4] - study_overall_officials = cur.fetchall() + if row[5] is not None and row[5] != "": + item["schemeURI"] = row[5] - if study_overall_officials is not None: - for row in study_overall_officials: - item = {} + item["centralContactAffiliation"] = { + "centralContactAffiliationName": row[6] + } - item["OverallOfficialName"] = row[0] - item["OverallOfficialAffiliation"] = row[1] - item["OverallOfficialRole"] = row[2] + if row[7] is not None and row[7] != "": + item["centralContactAffiliation"][ + "centralContactAffiliationIdentifier" + ] = {} - contacts_locations_module["OverallOfficialList"].append(item) + item["centralContactAffiliation"][ + "centralContactAffiliationIdentifier" + ]["centralContactAffiliationIdentifierValue"] = row[7] + item["centralContactAffiliation"][ + "centralContactAffiliationIdentifier" + ]["centralContactAffiliationIdentifierScheme"] = row[8] - # Get the study locations metadata - cur.execute( - "SELECT facility, status, city, state, zip, country FROM study_location WHERE study_id = %s", - (study_id,), - ) + if row[9] is not None and row[9] != "": + item["centralContactAffiliation"][ + "centralContactAffiliationIdentifier" + ]["schemeURI"] = row[9] - study_locations = cur.fetchall() + if row[10] is not None and row[10] != "": + item["centralContactPhone"] = row[10] - contacts_locations_module["LocationList"] = [] + if row[11] is not None and row[11] != "": + item["centralContactPhoneExt"] = row[11] - if study_locations is not None: - for row in study_locations: - item = {} + item["centralContactEMail"] = row[12] - item["LocationFacility"] = row[0] - item["LocationStatus"] = row[1] - item["LocationCity"] = row[2] - if row[3] is not None and row[3] != "": - item["LocationState"] = row[3] - if row[4] is not None and row[4] != "": - item["LocationZip"] = row[4] - item["LocationCountry"] = row[5] - - contacts_locations_module["LocationList"].append(item) + central_contacts.append(item) - study_metadata["ContactsLocationsModule"] = contacts_locations_module + contacts_locations_module["centralContactList"] = central_contacts - ipd_sharing_statement_module = {} - - # Get the study IPD sharing metadata + # Get the study contacts metadata cur.execute( - "SELECT ipd_sharing, ipd_sharing_description, ipd_sharing_info_type_list, ipd_sharing_time_frame, ipd_sharing_access_criteria, ipd_sharing_url FROM study_ipdsharing WHERE study_id = %s", + "SELECT first_name, last_name, degree, identifier, identifier_scheme, identifier_scheme_uri, affiliation, affiliation_identifier, affiliation_identifier_scheme, affiliation_identifier_scheme_uri, role FROM study_overall_official WHERE study_id = %s", (study_id,), ) - ipd_sharing = cur.fetchone() - - bool_ipd_share = ipd_sharing[0] - ipd_sharing_statement_module["IPDSharing"] = ipd_sharing[0] - if bool_ipd_share == "No" and ipd_sharing[1] is not None and ipd_sharing[1] != "": - ipd_sharing_statement_module["IPDSharingDescription"] = ipd_sharing[1] - if bool_ipd_share == "Yes": - ipd_sharing_statement_module["IPDSharingDescription"] = ipd_sharing[1] + study_overall_officials = cur.fetchall() - ipd_sharing_statement_module["IPDSharingInfoTypeList"] = [] - if ipd_sharing[2] is not None: - for row in ipd_sharing[2]: - ipd_sharing_statement_module["IPDSharingInfoTypeList"].append(row) + overall_officals = [] - if ( - bool_ipd_share == "No" - and ipd_sharing_statement_module["IPDSharingInfoTypeList"] == [] - ): - # Delete key if empty - del ipd_sharing_statement_module["IPDSharingInfoTypeList"] + if study_overall_officials is not None: + for row in study_overall_officials: + item = {} - if bool_ipd_share == "No" and ipd_sharing[3] is not None and ipd_sharing[3] != "": - ipd_sharing_statement_module["IPDSharingTimeFrame"] = ipd_sharing[3] - if bool_ipd_share == "Yes": - ipd_sharing_statement_module["IPDSharingTimeFrame"] = ipd_sharing[3] + item["overallOfficialFirstName"] = row[0] + item["overallOfficialLastName"] = row[1] + item["overallOfficialDegree"] = row[2] - if bool_ipd_share == "No" and ipd_sharing[4] is not None and ipd_sharing[4] != "": - ipd_sharing_statement_module["IPDSharingAccessCriteria"] = ipd_sharing[4] - if bool_ipd_share == "Yes": - ipd_sharing_statement_module["IPDSharingAccessCriteria"] = ipd_sharing[4] + if row[3] is not None and row[3] != "": + item["overallOfficialIdentifier"] = {} - if bool_ipd_share == "No" and ipd_sharing[5] is not None and ipd_sharing[5] != "": - ipd_sharing_statement_module["IPDSharingURL"] = ipd_sharing[5] - if bool_ipd_share == "Yes": - ipd_sharing_statement_module["IPDSharingURL"] = ipd_sharing[5] + item["overallOfficialIdentifier"]["overallOfficialIdentifierValue"] = ( + row[3] + ) + item["overallOfficialIdentifierScheme"] = row[4] - study_metadata["IPDSharingStatementModule"] = ipd_sharing_statement_module + if row[5] is not None and row[5] != "": + item["overallOfficialIdentifier"]["schemeURI"] = row[5] - references_module = {} + item["overallOfficialAffiliation"] = { + "overallOfficialAffiliationName": row[6] + } - # Get the study references metadata (publications) - cur.execute( - "SELECT identifier, type, citation FROM study_reference WHERE study_id = %s", - (study_id,), - ) + if row[7] is not None and row[7] != "": + item["overallOfficialAffiliation"][ + "overallOfficialAffiliationIdentifier" + ] = {} - study_references = cur.fetchall() + item["overallOfficialAffiliation"][ + "overallOfficialAffiliationIdentifier" + ]["overallOfficialAffiliationIdentifierValue"] = row[7] + item["overallOfficialAffiliation"][ + "overallOfficialAffiliationIdentifier" + ]["overallOfficialAffiliationIdentifierScheme"] = row[8] - references_module["ReferenceList"] = [] + if row[9] is not None and row[9] != "": + item["overallOfficialAffiliation"][ + "overallOfficialAffiliationIdentifier" + ]["schemeURI"] = row[9] - if study_references is not None: - for row in study_references: - item = {} + if row[10] is not None and row[10] != "": + item["overallOfficialRole"] = row[10] - if row[0] is not None and row[0] != "": - item["ReferenceID"] = row[0] - if row[1] is not None and row[1] != "": - item["ReferenceType"] = row[1] - if row[2] is not None and row[2] != "": - item["ReferenceCitation"] = row[2] + overall_officals.append(item) - references_module["ReferenceList"].append(item) + contacts_locations_module["overallOfficialList"] = overall_officals - # Get the study links metadata + # Get the study locations metadata cur.execute( - "SELECT url, title FROM study_link WHERE study_id = %s", + "SELECT facility, status, city, state, zip, country, identifier, identifier_scheme, identifier_scheme_uri FROM study_location WHERE study_id = %s", (study_id,), ) - study_links = cur.fetchall() + study_locations = cur.fetchall() - references_module["SeeAlsoLinkList"] = [] + location_list = [] - if study_links is not None: - for row in study_links: + if study_locations is not None: + for row in study_locations: item = {} - item["SeeAlsoLinkURL"] = row[0] - if row[1] is not None and row[1] != "": - item["SeeAlsoLinkLabel"] = row[1] + item["locationFacility"] = row[0] + item["locationStatus"] = row[1] + item["locationCity"] = row[2] - references_module["SeeAlsoLinkList"].append(item) + if row[3] is not None and row[3] != "": + item["locationState"] = row[3] - # Get the study available IPD - cur.execute( - "SELECT identifier, type, url, comment FROM study_available_ipd WHERE study_id = %s", - (study_id,), - ) + if row[4] is not None and row[4] != "": + item["locationZip"] = row[4] - study_available_ipd = cur.fetchall() + item["locationCountry"] = row[5] - references_module["AvailIPDList"] = [] + if row[6] is not None and row[6] != "": + item["locationIdentifier"] = {} - if study_available_ipd is not None: - for row in study_available_ipd: - item = {} + item["locationIdentifier"]["locationIdentifierValue"] = row[6] + item["locationIdentifierScheme"] = row[7] + + if row[8] is not None and row[8] != "": + item["locationIdentifier"]["schemeURI"] = row[8] - item["AvailIPDId"] = row[0] - item["AvailIPDType"] = row[1] - item["AvailIPDURL"] = row[2] - if row[3]: - item["AvailIPDComment"] = row[3] + location_list.append(item) - references_module["AvailIPDList"].append(item) + contacts_locations_module["locationList"] = location_list - study_metadata["ReferencesModule"] = references_module + study_metadata["contactsLocationsModule"] = contacts_locations_module conn.commit() conn.close() diff --git a/publish_pipeline/register_doi/register_doi.py b/publish_pipeline/register_doi/register_doi.py index 49d1544..6a9d8a6 100644 --- a/publish_pipeline/register_doi/register_doi.py +++ b/publish_pipeline/register_doi/register_doi.py @@ -3,290 +3,14 @@ import base64 import datetime import json -import random -import string import azure.storage.blob as azureblob import requests +import pyfairdatatools import config -def generate_random_identifier(k): - """Generate a random identifier""" - return "".join(random.choices(string.ascii_lowercase + string.digits, k=k)) - - -def create_payload(dataset_description): - """Generate payload for DOI registration""" - # doi = dataset_description["Identifier"]["identifierValue"] - doi = f"10.82914/fairhub.{generate_random_identifier(6)}" - creators = [] - titles = [] - subjects = [] - contributors = [] - dates = [] - alternate_identifiers = [] - related_identifiers = [] - funding_references = [] - rights_list = [] - descriptions = [] - - for description in dataset_description["description"]: - description_obj = { - "description": description["descriptionValue"], - "descriptionType": description["descriptionType"], - } - descriptions.append(description_obj) - - for rights in dataset_description["rights"]: - rights_obj = {"rights": rights["rightsName"]} - if "rightsURI" in rights: - rights_obj["rightsUri"] = rights["rightsURI"] - if "rightsIdentifier" in rights: - rights_obj["rightsIdentifier"] = rights["rightsIdentifier"] - if "rightsIdentifierScheme" in rights: - rights_obj["rightsIdentifierScheme"] = rights["rightsIdentifierScheme"] - rights_list.append(rights_obj) - - for funder in dataset_description["fundingReference"]: - funder_obj = { - "funderName": funder["funderName"], - "funderIdentifier": funder["funderIdentifier"]["funderIdentifierValue"], - "awardNumber": funder["awardNumber"]["awardNumberValue"], - } - if "awardURI" in funder["awardNumber"]: - funder_obj["awardUri"] = funder["awardNumber"]["awardURI"] - if "awardTitle" in funder["awardNumber"]: - funder_obj["awardTitle"] = funder["awardNumber"]["awardTitle"] - if "funderIentifierType" in funder["funderIdentifier"]: - funder_obj["funderIdentifierType"] = funder["funderIdentifier"][ - "funderIdentifierType" - ] - else: - funder_obj["funderIdentifierType"] = "Other" - funding_references.append(funder_obj) - - for alternate_identifier in dataset_description["alternateIdentifier"]: - alternate_identifiers.append( - { - "alternateIdentifier": alternate_identifier["alternateIdentifierValue"], - "alternateIdentifierType": alternate_identifier[ - "alternateIdentifierType" - ], - } - ) - - for date in dataset_description["date"]: - date_obj = { - "date": date["dateValue"], - "dateType": date["dateType"], - } - if "dateInformation" in date: - date_obj["dateInformation"] = date["dateInformation"] - dates.append(date_obj) - - for contributor in dataset_description["contributor"]: - if "affiliation" in contributor: - contributor_affiliations = [] - for affiliation in contributor["affiliation"]: - # TODO: VERIFY BY KEY IS AFFILIATIONVALUE AND NOT NAME - affiliate = { - "name": affiliation["affiliationName"], - } - if "schemeURI" in affiliation: - affiliate["schemeUri"] = affiliation["schemeURI"] - if "affiliationIdentifierScheme" in affiliation: - affiliate["affiliationIdentifierScheme"] = affiliation[ - "affiliationIdentifierScheme" - ] - if "affiliationIdentifier" in affiliation: - affiliate["affiliationIdentifier"] = affiliation[ - "affiliationIdentifier" - ] - - print(affiliate) - contributor_affiliations.append(affiliate) - if "nameIdentifier" in contributor: - name_identifiers = [] - for name_identifier in contributor["nameIdentifier"]: - name_identifier = { - "nameIdentifier": name_identifier["nameIdentifierValue"], - "nameIdentifierScheme": name_identifier["nameIdentifierScheme"], - } - if "schemeURI" in name_identifier: - name_identifier["schemeURI"] = name_identifier["schemeURI"] - name_identifiers.append(name_identifier) - - contributor_obj = { - "name": contributor["contributorName"], - "nameType": contributor["nameType"], - "contributorType": contributor["contributorType"], - } - - contributor_name = contributor["contributorName"] - split_name = contributor_name.split(",") - - if len(split_name) > 1: - contributor_obj["familyName"] = split_name[0] - contributor_obj["givenName"] = split_name[1] - - if contributor_affiliations: - print(contributor_affiliations) - contributor_obj["affiliation"] = contributor_affiliations - if name_identifiers: - contributor_obj["nameIdentifiers"] = name_identifiers - - contributors.append(contributor_obj) - - for subject in dataset_description["subject"]: - subject_obj = {} - if "classificationCode" in subject: - subject_obj["classificationCode"] = subject["classificationCode"] - if "subjectScheme" in subject: - subject_obj["subjectScheme"] = subject["subjectScheme"] - if "schemeURI" in subject: - subject_obj["schemeUri"] = subject["schemeURI"] - subject_obj["subject"] = subject["subjectValue"] - subjects.append(subject_obj) - - for title in dataset_description["title"]: - title_obj = {"title": title["titleValue"]} - if "titleType" in title: - title_obj["titleType"] = title["titleType"] - titles.append(title_obj) - - for creator in dataset_description["creator"]: - if "affiliation" in creator: - creator_affiliations = [] - for affiliation in creator["affiliation"]: - affiliate = { - "name": affiliation["affiliationName"], - } - if "schemeURI" in affiliation: - affiliate["schemeUri"] = affiliation["schemeURI"] - if "affiliationIdentifierScheme" in affiliation: - affiliate["affiliationIdentifierScheme"] = affiliation[ - "affiliationIdentifierScheme" - ] - if "affiliationIdentifier" in affiliation: - affiliate["affiliationIdentifier"] = affiliation[ - "affiliationIdentifier" - ] - - creator_affiliations.append(affiliate) - if "nameIdentifier" in creator: - name_identifiers = [] - for name_identifier in creator["nameIdentifier"]: - name_identifier = { - "nameIdentifier": name_identifier["nameIdentifierValue"], - "nameIdentifierScheme": name_identifier["nameIdentifierScheme"], - } - if "schemeURI" in name_identifier: - name_identifier["schemeURI"] = name_identifier["schemeURI"] - name_identifiers.append(name_identifier) - - creator_obj = { - "name": creator["creatorName"], - "nameType": creator["nameType"], - } - - creator_name = creator["creatorName"] - split_name = creator_name.split(",") - - if len(split_name) > 1: - creator_obj["familyName"] = split_name[0] - creator_obj["givenName"] = split_name[1] - - if creator_affiliations: - creator_obj["affiliation"] = creator_affiliations - if name_identifiers: - creator_obj["nameIdentifiers"] = name_identifiers - - creators.append(creator_obj) - - for funding_reference in dataset_description["fundingReference"]: - funder_obj = {"funderName": funding_reference["funderName"]} - if ( - "funderIdentifier" in funding_reference - and "funderIdentifierValue" in funding_reference["funderIdentifier"] - ): - funder_obj["funderIdentifer"] = funding_reference["funderIdentifier"][ - "funderIdentifierValue" - ] - if ( - "funderIdentifier" in funding_reference - and "funderIdentifierType" in funding_reference["funderIdentifier"] - ): - funder_obj["funderIdentifierType"] = funding_reference["funderIdentifier"][ - "funderIdentifierType" - ] - - for related_identifier in dataset_description["relatedIdentifier"]: - related_identifier_obj = { - "relatedIdentifier": related_identifier["relatedIdentifierValue"], - "relatedIdentifierType": related_identifier["relatedIdentifierType"], - "relationType": related_identifier["relationType"], - } - - if "relatedMetadataScheme" in related_identifier: - related_identifier_obj["relatedMetadataScheme"] = related_identifier[ - "relatedMetadataScheme" - ] - if "schemeURI" in related_identifier: - related_identifier_obj["schemeURI"] = related_identifier["schemeURI"] - if "schemeType" in related_identifier: - related_identifier_obj["schemeType"] = related_identifier["schemeType"] - if "resourceTypeGeneral" in related_identifier: - related_identifier_obj["resourceTypeGeneral"] = related_identifier[ - "resource_type" - ] - related_identifiers.append(related_identifier_obj) - - payload = { - "data": { - "type": "dois", - "attributes": { - "event": "publish", - "doi": doi, - "creators": creators, - "titles": titles, - "publisher": {"name": dataset_description["publisher"]}, - "publicationYear": dataset_description["publicationYear"], - "subjects": subjects, - "contributors": contributors, - "dates": dates, - "alternateIdentifiers": alternate_identifiers, - "types": { - "resourceTypeGeneral": dataset_description["resourceType"][ - "resourceTypeGeneral" - ], - "resourceType": dataset_description["resourceType"][ - "resourceTypeValue" - ], - }, - "rightsList": rights_list, - "description": descriptions, - "version": dataset_description["version"], - "fundingReferences": funding_references, - "url": "https://staging.fairhub.io/datasets/2", - }, - } - } - - print(dataset_description["version"]) - if len(dataset_description["relatedIdentifier"]) > 0: - payload["data"]["attributes"]["relatedIdentifiers"] = dataset_description[ - "RelatedIdentifier" - ] - if len(dataset_description["size"]) > 0: - payload["data"]["attributes"]["sizes"] = dataset_description["Size"] - if dataset_description["language"]: - payload["data"]["attributes"]["language"] = dataset_description["Language"] - - return payload - - def pipeline(): """Register a DOI for the dataset""" @@ -326,7 +50,7 @@ def pipeline(): dataset_description = json.loads(stream) # Create payload for doi registration - payload = create_payload(dataset_description) + payload = pyfairdatatools.utils.convert_for_datacite(dataset_description) url = f"{config.DATACITE_API_URL}/dois" headers = {