helxplatform · YaphetKG · Sep 6, 2023 · Aug 24, 2023 · Aug 24, 2023 · Aug 24, 2023
diff --git a/bin/get_heal_platform_mds_data_dicts.py b/bin/get_heal_platform_mds_data_dicts.py
@@ -225,9 +225,12 @@ def generate_dbgap_files(dbgap_dir, studies_with_data_dicts_dir):
                     data_table.set('id', study['gen3_discovery']['@id'])
                 else:
                     logging.warning(f"No identifier found in data dictionary file {file_path}")
-
-                if 'label' in study['gen3_discovery']:
-                    data_table.set('label', study['gen3_discovery']['label'])
+                study_name =  study.get('gen3_discovery', {}).get('label') or study.get('gen3_discovery', {}).get('study_metadata',{}).get('minimal_info',{}).get('study_name')
+                if study_name:
+                    data_table.set('study_name', study_name)                
+                study_description = study.get('gen3_discovery', {}).get('study_metadata',{}).get('minimal_info',{}).get('study_description')
+                if study_description:
+                    data_table.set('study_description', study_description)
 
                 # Determine the data_table study_id from the internal HEAL Data Platform (HDP) identifier.
                 if '_hdp_uid' in study['gen3_discovery']:

diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py
@@ -5,7 +5,7 @@
 
 from dug.core.parsers import Parser, DugElement, DugConcept
 import dug.core.tranql as tql
-from dug.utils import biolink_snake_case
+from dug.utils import biolink_snake_case, get_formatted_biolink_name
 
 logger = logging.getLogger('dug')
 
@@ -152,7 +152,7 @@ def annotate_element(self, element):
                 concept = DugConcept(concept_id=identifier.id,
                                                        name=identifier.label,
                                                        desc=identifier.description,
-                                                       concept_type=identifier.type)
+                                                       concept_type=identifier.types)
                 # Add to list of concepts
                 self.concepts[identifier.id] = concept
 
@@ -218,7 +218,7 @@ def expand_to_dug_element(self,
 
             # convert the first type to snake case to be used in tranql query.
             # first type is the leaf type, this is coming from Node normalization.
-            node_type = biolink_snake_case(identifier.types[0].replace("biolink:", ""))
+            node_type = biolink_snake_case(get_formatted_biolink_name(identifier.types).replace("biolink:", ""))
             try:
                 # Tranql query factory currently supports select node types as valid query
                 # Types missing from QueryFactory.data_types will be skipped with this try catch

diff --git a/src/dug/core/parsers/__init__.py b/src/dug/core/parsers/__init__.py
@@ -11,6 +11,7 @@
 from .topmed_csv_parser import TOPMedCSVParser
 from .sprint_parser import SPRINTParser
 from .bacpac_parser import BACPACParser
+from .heal_dp_parser import HEALDPParser
 
 
 logger = logging.getLogger('dug')
@@ -30,6 +31,10 @@ def define_parsers(parser_dict: Dict[str, Parser]):
     parser_dict["kfdrc"] = KFDRCDbGaPParser()
     parser_dict["sprint"] = SPRINTParser()
     parser_dict["bacpac"] = BACPACParser()
+    parser_dict["heal-studies"] = HEALDPParser(study_type="HEAL Studies")
+    parser_dict["heal-research"] = HEALDPParser(study_type="HEAL Research Programs")
+
+
 
 
 class ParserNotFoundException(Exception):

diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py
@@ -39,7 +39,7 @@ def parse_study_name_from_gap_exchange_file(filepath: Path) -> str:
 
 
     def _get_element_type(self):
-        return "DbGaP"
+        return "dbGaP"
 
     def __call__(self, input_file: InputFile) -> List[Indexable]:
         logger.debug(input_file)

diff --git a/src/dug/core/parsers/heal_dp_parser.py b/src/dug/core/parsers/heal_dp_parser.py
@@ -0,0 +1,56 @@
+import logging
+import os
+from typing import List
+from xml.etree import ElementTree as ET
+
+from dug import utils as utils
+from ._base import DugElement, FileParser, Indexable, InputFile
+
+logger = logging.getLogger('dug')
+
+
+class HEALDPParser(FileParser):
+    """Parse a HEAL Data Platform data dictionary (converted to XML) into a list of DugElements."""
+
+    def __init__(self, study_type="HEAL Studies"):
+        # A bare ``super()`` only creates a proxy object; the parent initializer must be called explicitly.
+        super().__init__()
+        # Label reported as ``elem_type`` on every element this parser emits.
+        self.study_type = study_type
+
+    def get_study_type(self):
+        """Return the label used as ``elem_type`` for parsed elements."""
+        return self.study_type
+
+    def set_study_type(self, study_type):
+        """Replace the label used as ``elem_type`` for parsed elements."""
+        self.study_type = study_type
+
+    def __call__(self, input_file: InputFile) -> List[Indexable]:
+        """Return one DugElement per <variable> in the file; raise IOError if the root has no study_name."""
+        logger.debug(input_file)
+        root = ET.parse(input_file).getroot()
+        study_id = root.attrib['study_id']  # a missing study_id attribute surfaces as KeyError
+        study_name = root.get('study_name')
+        if study_name is None:
+            err_msg = f"Unable to parse study name from data dictionary: {input_file}!"
+            logger.error(err_msg)
+            raise IOError(err_msg)
+
+        elements = []
+        for variable in root.iter('variable'):
+            elem = DugElement(elem_id=f"{variable.attrib['id']}",
+                              name=variable.find('name').text,
+                              # findtext() yields '' for a missing or empty <description> instead of raising.
+                              desc=variable.findtext('description', default='').lower(),
+                              elem_type=self.get_study_type(),
+                              collection_id=f"{study_id}",
+                              collection_name=study_name)
+
+            # Link the element back to its study page on the HEAL Data Platform.
+            elem.collection_action = utils.get_heal_platform_link(study_id=study_id)
+            logger.debug(elem)
+            elements.append(elem)
+
+        # Only elements are returned; this parser does not create any concepts.
+        return elements
diff --git a/src/dug/core/parsers/nida_parser.py b/src/dug/core/parsers/nida_parser.py
@@ -43,7 +43,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
             elem = DugElement(elem_id=f"{variable.attrib['id']}.p{participant_set}",
                               name=variable.find('name').text,
                               desc=variable.find('description').text.lower(),
-                              elem_type="DbGaP",
+                              elem_type="NIDA",
                               collection_id=f"{study_id}.p{participant_set}",
                               collection_name=study_name)
 

diff --git a/src/dug/core/parsers/topmed_csv_parser.py b/src/dug/core/parsers/topmed_csv_parser.py
@@ -34,7 +34,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
                 elem = DugElement(elem_id=row['variable_full_accession'],
                                   name=row['variable_name'],
                                   desc=row['variable_desc'],
-                                  elem_type="DbGaP",
+                                  elem_type="dbGaP",
                                   collection_id=row['study_full_accession'],
                                   collection_name=row['study_name'])
 

diff --git a/src/dug/core/parsers/topmed_tag_parser.py b/src/dug/core/parsers/topmed_tag_parser.py
@@ -59,7 +59,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
                     elem_id=row['variable_full_accession'],
                     name=row['variable_name'] if 'variable_name' in row else row['variable_full_accession'],
                     desc=row['variable_description'] if 'variable_description' in row else row['variable_full_accession'],
-                    elem_type="DbGaP",
+                    elem_type="dbGaP",
                     collection_id=row['study_full_accession'],
                     collection_name=row['study_name']
                 )

diff --git a/src/dug/utils.py b/src/dug/utils.py
@@ -1,4 +1,7 @@
 import re
+import bmt
+
+bmt_tk = bmt.Toolkit()
 
 class ObjectFactory:
     def __init__(self):
@@ -37,6 +40,11 @@ def get_nida_study_link(study_id):
     base_url = "https://datashare.nida.nih.gov/study"
     return f'{base_url}/{study_id}'
 
+def get_heal_platform_link(study_id):
+    """Return the HEAL portal discovery URL for ``study_id``; a leading "PREFIX:" is dropped when present."""
+    parts = study_id.split(':')  # the accession is the segment after the first colon, or the whole id if none
+    return f"https://healdata.org/portal/discovery/{parts[1] if len(parts) > 1 else parts[0]}"
+
 
 def biolink_snake_case(arg):
     """Convert such SnakeCase to snake_case.
@@ -57,4 +65,13 @@ def biolink_snake_case(arg):
         lambda c: c.group(0).lower(),
         tmp
     )
-    return tmp
+    return tmp
+
+def get_formatted_biolink_name(bl_type):
+    """Return the bmt class/slot URI for a Biolink name (first entry of a list); unresolved input is returned as is."""
+    if isinstance(bl_type, list):
+        return get_formatted_biolink_name(bl_type[0])
+    # get_element() may return None for names bmt does not know; fall back to the input instead of raising.
+    bl_element = bmt_tk.get_element(bl_type) if isinstance(bl_type, str) else None
+    uri = getattr(bl_element, 'class_uri', None) or getattr(bl_element, 'slot_uri', None)
+    return uri or bl_type