From e646d0e9492d413b2e1d0afef5d8890c0afeafa1 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 24 Aug 2023 13:16:40 -0400 Subject: [PATCH 1/8] add study name and description for platform files --- bin/get_heal_platform_mds_data_dicts.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bin/get_heal_platform_mds_data_dicts.py b/bin/get_heal_platform_mds_data_dicts.py index e16b89ab..a48e77ba 100644 --- a/bin/get_heal_platform_mds_data_dicts.py +++ b/bin/get_heal_platform_mds_data_dicts.py @@ -225,9 +225,12 @@ def generate_dbgap_files(dbgap_dir, studies_with_data_dicts_dir): data_table.set('id', study['gen3_discovery']['@id']) else: logging.warning(f"No identifier found in data dictionary file {file_path}") - - if 'label' in study['gen3_discovery']: - data_table.set('label', study['gen3_discovery']['label']) + study_name = study.get('gen3_discovery', {}).get('label') or study.get('gen3_discovery', {}).get('study_metadata',{}).get('minimal_info',{}).get('study_name') + if study_name: + data_table.set('study_name', study_name) + study_description = study.get('gen3_discovery', {}).get('study_metadata',{}).get('minimal_info',{}).get('study_description') + if study_description: + data_table.set('study_description', study_description) # Determine the data_table study_id from the internal HEAL Data Platform (HDP) identifier. if '_hdp_uid' in study['gen3_discovery']: @@ -401,4 +404,5 @@ def get_heal_platform_mds_data_dicts(output, mds_metadata_endpoint, limit): # Run get_heal_platform_mds_data_dicts() if not used as a library. 
if __name__ == "__main__": - get_heal_platform_mds_data_dicts() + # get_heal_platform_mds_data_dicts() + generate_dbgap_files('mds_data/dbGaPs', 'mds_data/studies_with_data_dicts') From 020c7ebb3a4197ddbff8dffa016dd06352bc8c6b Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 24 Aug 2023 13:18:04 -0400 Subject: [PATCH 2/8] add heal parser --- src/dug/core/crawler.py | 2 +- src/dug/core/parsers/dbgap_parser.py | 2 +- src/dug/core/parsers/heal_dp_parser.py | 57 ++++++++++++++++++++++++++ src/dug/core/parsers/nida_parser.py | 2 +- src/dug/utils.py | 5 +++ 5 files changed, 65 insertions(+), 3 deletions(-) create mode 100644 src/dug/core/parsers/heal_dp_parser.py diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py index 3ae70574..61d6b05c 100644 --- a/src/dug/core/crawler.py +++ b/src/dug/core/crawler.py @@ -152,7 +152,7 @@ def annotate_element(self, element): concept = DugConcept(concept_id=identifier.id, name=identifier.label, desc=identifier.description, - concept_type=identifier.type) + concept_type=identifier.types) # Add to list of concepts self.concepts[identifier.id] = concept diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py index 2926b1f1..a362d028 100644 --- a/src/dug/core/parsers/dbgap_parser.py +++ b/src/dug/core/parsers/dbgap_parser.py @@ -39,7 +39,7 @@ def parse_study_name_from_gap_exchange_file(filepath: Path) -> str: def _get_element_type(self): - return "DbGaP" + return "dbGaP" def __call__(self, input_file: InputFile) -> List[Indexable]: logger.debug(input_file) diff --git a/src/dug/core/parsers/heal_dp_parser.py b/src/dug/core/parsers/heal_dp_parser.py new file mode 100644 index 00000000..2f3dddd2 --- /dev/null +++ b/src/dug/core/parsers/heal_dp_parser.py @@ -0,0 +1,57 @@ +import logging +import os +from typing import List +from xml.etree import ElementTree as ET + +from dug import utils as utils +from ._base import DugElement, FileParser, Indexable, InputFile + +logger = logging.getLogger('dug') + + 
+class HEALDPParser(FileParser): + # Parses a data dictionary converted from the HEAL Data Platform into a set of Dug Elements + + def __init__(self, study_type="HEAL Studies"): + super() + self.study_type = study_type + + + def get_study_type(self): + return self.study_type + + def set_study_type(self, study_type): + self.study_type = study_type + + def __call__(self, input_file: InputFile) -> List[Indexable]: + logger.debug(input_file) + tree = ET.parse(input_file) + root = tree.getroot() + study_id = root.attrib['study_id'] + participant_set = root.get('participant_set','0') + + # Parse study name from file handle + study_name = root.get('study_name') + + if study_name is None: + err_msg = f"Unable to parse study name from data dictionary: {input_file}!" + logger.error(err_msg) + raise IOError(err_msg) + + elements = [] + for variable in root.iter('variable'): + elem = DugElement(elem_id=f"{variable.attrib['id']}.p{participant_set}", + name=variable.find('name').text, + desc=variable.find('description').text.lower(), + elem_type=self.get_study_type(), + collection_id=f"{study_id}.p{participant_set}", + collection_name=study_name) + + # Create NIDA links as study/variable actions + elem.collection_action = utils.get_heal_platform_link(study_id=study_id) + # Add to set of variables + logger.debug(elem) + elements.append(elem) + + # You don't actually create any concepts + return elements diff --git a/src/dug/core/parsers/nida_parser.py b/src/dug/core/parsers/nida_parser.py index d7a7b47d..64174ead 100644 --- a/src/dug/core/parsers/nida_parser.py +++ b/src/dug/core/parsers/nida_parser.py @@ -43,7 +43,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: elem = DugElement(elem_id=f"{variable.attrib['id']}.p{participant_set}", name=variable.find('name').text, desc=variable.find('description').text.lower(), - elem_type="DbGaP", + elem_type="NIDA", collection_id=f"{study_id}.p{participant_set}", collection_name=study_name) diff --git a/src/dug/utils.py 
b/src/dug/utils.py index 9b224387..97ae558a 100644 --- a/src/dug/utils.py +++ b/src/dug/utils.py @@ -37,6 +37,11 @@ def get_nida_study_link(study_id): base_url = "https://datashare.nida.nih.gov/study" return f'{base_url}/{study_id}' +def get_heal_platform_link(study_id): + base_url = "https://healdata.org/portal/discovery" + accession = study_id.split(':')[1] + return f'{base_url}/{accession}' + def biolink_snake_case(arg): """Convert such SnakeCase to snake_case. From d30c80fa660360c92071377516c6851c3860b1ca Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 24 Aug 2023 14:09:03 -0400 Subject: [PATCH 3/8] add parsers --- src/dug/core/parsers/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/dug/core/parsers/__init__.py b/src/dug/core/parsers/__init__.py index 8143d508..8e158340 100644 --- a/src/dug/core/parsers/__init__.py +++ b/src/dug/core/parsers/__init__.py @@ -11,6 +11,7 @@ from .topmed_csv_parser import TOPMedCSVParser from .sprint_parser import SPRINTParser from .bacpac_parser import BACPACParser +from .heal_dp_parser import HEALDPParser logger = logging.getLogger('dug') @@ -30,6 +31,10 @@ def define_parsers(parser_dict: Dict[str, Parser]): parser_dict["kfdrc"] = KFDRCDbGaPParser() parser_dict["sprint"] = SPRINTParser() parser_dict["bacpac"] = BACPACParser() + parser_dict["heal-studies"] = HEALDPParser(study_type="HEAL Studies") + parser_dict["heal-reasearch"] = HEALDPParser(study_type="HEAL Research Programs") + + class ParserNotFoundException(Exception): From 5f1f4629c9f10187cfb1b9bf1d3b14bfb6dc81de Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 24 Aug 2023 17:37:10 -0400 Subject: [PATCH 4/8] fix typo on research --- src/dug/core/parsers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dug/core/parsers/__init__.py b/src/dug/core/parsers/__init__.py index 8e158340..0d0ebabe 100644 --- a/src/dug/core/parsers/__init__.py +++ b/src/dug/core/parsers/__init__.py @@ -32,7 +32,7 @@ def 
define_parsers(parser_dict: Dict[str, Parser]): parser_dict["sprint"] = SPRINTParser() parser_dict["bacpac"] = BACPACParser() parser_dict["heal-studies"] = HEALDPParser(study_type="HEAL Studies") - parser_dict["heal-reasearch"] = HEALDPParser(study_type="HEAL Research Programs") + parser_dict["heal-research"] = HEALDPParser(study_type="HEAL Research Programs") From e8b4b5001a663be01ee32ef0b536cc81b9cc8d29 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 30 Aug 2023 12:35:34 -0400 Subject: [PATCH 5/8] fix cde crawl --- src/dug/core/crawler.py | 4 ++-- src/dug/utils.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py index 61d6b05c..a0714980 100644 --- a/src/dug/core/crawler.py +++ b/src/dug/core/crawler.py @@ -5,7 +5,7 @@ from dug.core.parsers import Parser, DugElement, DugConcept import dug.core.tranql as tql -from dug.utils import biolink_snake_case +from dug.utils import biolink_snake_case, get_formatted_biolink_name logger = logging.getLogger('dug') @@ -218,7 +218,7 @@ def expand_to_dug_element(self, # convert the first type to snake case to be used in tranql query. # first type is the leaf type, this is coming from Node normalization. 
- node_type = biolink_snake_case(identifier.types[0].replace("biolink:", "")) + node_type = biolink_snake_case(get_formatted_biolink_name(identifier.types).replace("biolink:", "")) try: # Tranql query factory currently supports select node types as valid query # Types missing from QueryFactory.data_types will be skipped with this try catch diff --git a/src/dug/utils.py b/src/dug/utils.py index 97ae558a..957f80f0 100644 --- a/src/dug/utils.py +++ b/src/dug/utils.py @@ -1,4 +1,7 @@ import re +import bmt + +bmt_tk = bmt.Toolkit() class ObjectFactory: def __init__(self): @@ -62,4 +65,13 @@ def biolink_snake_case(arg): lambda c: c.group(0).lower(), tmp ) - return tmp \ No newline at end of file + return tmp + +def get_formatted_biolink_name(bl_type): + category = bl_type + if isinstance(bl_type, str): + bl_element = bmt_tk.get_element(bl_type) + category = bl_element.class_uri or bl_element.slot_uri + if isinstance(bl_type, list): + return get_formatted_biolink_name(bl_type[0]) + return category \ No newline at end of file From cd11e817683451e2a97cd954627ebec35209e50e Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 31 Aug 2023 08:43:51 -0400 Subject: [PATCH 6/8] remove testing code from platform fetch --- bin/get_heal_platform_mds_data_dicts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bin/get_heal_platform_mds_data_dicts.py b/bin/get_heal_platform_mds_data_dicts.py index a48e77ba..e6bdbc43 100644 --- a/bin/get_heal_platform_mds_data_dicts.py +++ b/bin/get_heal_platform_mds_data_dicts.py @@ -404,5 +404,4 @@ def get_heal_platform_mds_data_dicts(output, mds_metadata_endpoint, limit): # Run get_heal_platform_mds_data_dicts() if not used as a library. 
if __name__ == "__main__": - # get_heal_platform_mds_data_dicts() - generate_dbgap_files('mds_data/dbGaPs', 'mds_data/studies_with_data_dicts') + get_heal_platform_mds_data_dicts() From 88172d1bd950186f0114739054f96561c7cc0bf8 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 31 Aug 2023 13:49:43 -0400 Subject: [PATCH 7/8] DbGaP to dbGaP --- src/dug/core/parsers/topmed_csv_parser.py | 2 +- src/dug/core/parsers/topmed_tag_parser.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dug/core/parsers/topmed_csv_parser.py b/src/dug/core/parsers/topmed_csv_parser.py index 329c0a05..710bcb63 100644 --- a/src/dug/core/parsers/topmed_csv_parser.py +++ b/src/dug/core/parsers/topmed_csv_parser.py @@ -34,7 +34,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: elem = DugElement(elem_id=row['variable_full_accession'], name=row['variable_name'], desc=row['variable_desc'], - elem_type="DbGaP", + elem_type="dbGaP", collection_id=row['study_full_accession'], collection_name=row['study_name']) diff --git a/src/dug/core/parsers/topmed_tag_parser.py b/src/dug/core/parsers/topmed_tag_parser.py index 64b4064b..f10ed43d 100644 --- a/src/dug/core/parsers/topmed_tag_parser.py +++ b/src/dug/core/parsers/topmed_tag_parser.py @@ -59,7 +59,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: elem_id=row['variable_full_accession'], name=row['variable_name'] if 'variable_name' in row else row['variable_full_accession'], desc=row['variable_description'] if 'variable_description' in row else row['variable_full_accession'], - elem_type="DbGaP", + elem_type="dbGaP", collection_id=row['study_full_accession'], collection_name=row['study_name'] ) From 7f51c93b8c40be40f36ab446d7d2f1c737efae52 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Fri, 1 Sep 2023 16:36:34 -0400 Subject: [PATCH 8/8] Update heal_dp_parser.py remove participant id --- src/dug/core/parsers/heal_dp_parser.py | 5 ++--- 1 file changed, 2 
insertions(+), 3 deletions(-) diff --git a/src/dug/core/parsers/heal_dp_parser.py b/src/dug/core/parsers/heal_dp_parser.py index 2f3dddd2..e13c8790 100644 --- a/src/dug/core/parsers/heal_dp_parser.py +++ b/src/dug/core/parsers/heal_dp_parser.py @@ -28,7 +28,6 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: tree = ET.parse(input_file) root = tree.getroot() study_id = root.attrib['study_id'] - participant_set = root.get('participant_set','0') # Parse study name from file handle study_name = root.get('study_name') @@ -40,11 +39,11 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: elements = [] for variable in root.iter('variable'): - elem = DugElement(elem_id=f"{variable.attrib['id']}.p{participant_set}", + elem = DugElement(elem_id=f"{variable.attrib['id']}", name=variable.find('name').text, desc=variable.find('description').text.lower(), elem_type=self.get_study_type(), - collection_id=f"{study_id}.p{participant_set}", + collection_id=f"{study_id}", collection_name=study_name) # Create NIDA links as study/variable actions