From e646d0e9492d413b2e1d0afef5d8890c0afeafa1 Mon Sep 17 00:00:00 2001
From: YaphetKG <kebedey@renci.org>
Date: Thu, 24 Aug 2023 13:16:40 -0400
Subject: [PATCH 1/8] add study name and description for platform files

---
 bin/get_heal_platform_mds_data_dicts.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/bin/get_heal_platform_mds_data_dicts.py b/bin/get_heal_platform_mds_data_dicts.py
index e16b89ab..a48e77ba 100644
--- a/bin/get_heal_platform_mds_data_dicts.py
+++ b/bin/get_heal_platform_mds_data_dicts.py
@@ -225,9 +225,12 @@ def generate_dbgap_files(dbgap_dir, studies_with_data_dicts_dir):
                     data_table.set('id', study['gen3_discovery']['@id'])
                 else:
                     logging.warning(f"No identifier found in data dictionary file {file_path}")
-
-                if 'label' in study['gen3_discovery']:
-                    data_table.set('label', study['gen3_discovery']['label'])
+                study_name =  study.get('gen3_discovery', {}).get('label') or study.get('gen3_discovery', {}).get('study_metadata',{}).get('minimal_info',{}).get('study_name')
+                if study_name:
+                    data_table.set('study_name', study_name)                
+                study_description = study.get('gen3_discovery', {}).get('study_metadata',{}).get('minimal_info',{}).get('study_description')
+                if study_description:
+                    data_table.set('study_description', study_description)
 
                 # Determine the data_table study_id from the internal HEAL Data Platform (HDP) identifier.
                 if '_hdp_uid' in study['gen3_discovery']:
@@ -401,4 +404,5 @@ def get_heal_platform_mds_data_dicts(output, mds_metadata_endpoint, limit):
 
 # Run get_heal_platform_mds_data_dicts() if not used as a library.
 if __name__ == "__main__":
-    get_heal_platform_mds_data_dicts()
+    # get_heal_platform_mds_data_dicts()
+    generate_dbgap_files('mds_data/dbGaPs', 'mds_data/studies_with_data_dicts')

From 020c7ebb3a4197ddbff8dffa016dd06352bc8c6b Mon Sep 17 00:00:00 2001
From: YaphetKG <kebedey@renci.org>
Date: Thu, 24 Aug 2023 13:18:04 -0400
Subject: [PATCH 2/8] add heal parser

---
 src/dug/core/crawler.py                |  2 +-
 src/dug/core/parsers/dbgap_parser.py   |  2 +-
 src/dug/core/parsers/heal_dp_parser.py | 57 ++++++++++++++++++++++++++
 src/dug/core/parsers/nida_parser.py    |  2 +-
 src/dug/utils.py                       |  5 +++
 5 files changed, 65 insertions(+), 3 deletions(-)
 create mode 100644 src/dug/core/parsers/heal_dp_parser.py

diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py
index 3ae70574..61d6b05c 100644
--- a/src/dug/core/crawler.py
+++ b/src/dug/core/crawler.py
@@ -152,7 +152,7 @@ def annotate_element(self, element):
                 concept = DugConcept(concept_id=identifier.id,
                                                        name=identifier.label,
                                                        desc=identifier.description,
-                                                       concept_type=identifier.type)
+                                                       concept_type=identifier.types)
                 # Add to list of concepts
                 self.concepts[identifier.id] = concept
 
diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py
index 2926b1f1..a362d028 100644
--- a/src/dug/core/parsers/dbgap_parser.py
+++ b/src/dug/core/parsers/dbgap_parser.py
@@ -39,7 +39,7 @@ def parse_study_name_from_gap_exchange_file(filepath: Path) -> str:
 
 
     def _get_element_type(self):
-        return "DbGaP"
+        return "dbGaP"
 
     def __call__(self, input_file: InputFile) -> List[Indexable]:
         logger.debug(input_file)
diff --git a/src/dug/core/parsers/heal_dp_parser.py b/src/dug/core/parsers/heal_dp_parser.py
new file mode 100644
index 00000000..2f3dddd2
--- /dev/null
+++ b/src/dug/core/parsers/heal_dp_parser.py
@@ -0,0 +1,57 @@
+import logging
+import os
+from typing import List
+from xml.etree import ElementTree as ET
+
+from dug import utils as utils
+from ._base import DugElement, FileParser, Indexable, InputFile
+
+logger = logging.getLogger('dug')
+
+
+class HEALDPParser(FileParser):
+    # Parser that turns a converted HEAL Data Platform data dictionary (XML) into a set of Dug Elements
+
+    def __init__(self, study_type="HEAL Studies"):
+        super().__init__()  # bare super() only builds a proxy object; the base initializer must be called explicitly
+        self.study_type = study_type  # used as elem_type for every element this parser emits
+
+
+    def get_study_type(self):
+        return self.study_type
+    
+    def set_study_type(self, study_type):
+        self.study_type = study_type
+
+    def __call__(self, input_file: InputFile) -> List[Indexable]:
+        logger.debug(input_file)
+        tree = ET.parse(input_file)
+        root = tree.getroot()
+        study_id = root.attrib['study_id']
+        participant_set = root.get('participant_set','0')
+
+        # Parse study name from file handle
+        study_name = root.get('study_name')
+
+        if study_name is None:
+            err_msg = f"Unable to parse study name from data dictionary: {input_file}!"
+            logger.error(err_msg)
+            raise IOError(err_msg)
+
+        elements = []
+        for variable in root.iter('variable'):
+            elem = DugElement(elem_id=f"{variable.attrib['id']}.p{participant_set}",
+                              name=variable.find('name').text,
+                              desc=variable.find('description').text.lower(),
+                              elem_type=self.get_study_type(),
+                              collection_id=f"{study_id}.p{participant_set}",
+                              collection_name=study_name)
+
+            # Create NIDA links as study/variable actions
+            elem.collection_action = utils.get_heal_platform_link(study_id=study_id)
+            # Add to set of variables
+            logger.debug(elem)
+            elements.append(elem)
+
+        # You don't actually create any concepts
+        return elements
diff --git a/src/dug/core/parsers/nida_parser.py b/src/dug/core/parsers/nida_parser.py
index d7a7b47d..64174ead 100644
--- a/src/dug/core/parsers/nida_parser.py
+++ b/src/dug/core/parsers/nida_parser.py
@@ -43,7 +43,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
             elem = DugElement(elem_id=f"{variable.attrib['id']}.p{participant_set}",
                               name=variable.find('name').text,
                               desc=variable.find('description').text.lower(),
-                              elem_type="DbGaP",
+                              elem_type="NIDA",
                               collection_id=f"{study_id}.p{participant_set}",
                               collection_name=study_name)
 
diff --git a/src/dug/utils.py b/src/dug/utils.py
index 9b224387..97ae558a 100644
--- a/src/dug/utils.py
+++ b/src/dug/utils.py
@@ -37,6 +37,11 @@ def get_nida_study_link(study_id):
     base_url = "https://datashare.nida.nih.gov/study"
     return f'{base_url}/{study_id}'
 
+def get_heal_platform_link(study_id):
+    base_url = "https://healdata.org/portal/discovery"  # discovery page root; the accession is appended below
+    accession = study_id.split(':')[1] if ':' in study_id else study_id  # drop the prefix before ':'; ids without one are used as-is
+    return f'{base_url}/{accession}'
+
 
 def biolink_snake_case(arg):
     """Convert such SnakeCase to snake_case.

From d30c80fa660360c92071377516c6851c3860b1ca Mon Sep 17 00:00:00 2001
From: YaphetKG <kebedey@renci.org>
Date: Thu, 24 Aug 2023 14:09:03 -0400
Subject: [PATCH 3/8] add parsers

---
 src/dug/core/parsers/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/dug/core/parsers/__init__.py b/src/dug/core/parsers/__init__.py
index 8143d508..8e158340 100644
--- a/src/dug/core/parsers/__init__.py
+++ b/src/dug/core/parsers/__init__.py
@@ -11,6 +11,7 @@
 from .topmed_csv_parser import TOPMedCSVParser
 from .sprint_parser import SPRINTParser
 from .bacpac_parser import BACPACParser
+from .heal_dp_parser import HEALDPParser
 
 
 logger = logging.getLogger('dug')
@@ -30,6 +31,10 @@ def define_parsers(parser_dict: Dict[str, Parser]):
     parser_dict["kfdrc"] = KFDRCDbGaPParser()
     parser_dict["sprint"] = SPRINTParser()
     parser_dict["bacpac"] = BACPACParser()
+    parser_dict["heal-studies"] = HEALDPParser(study_type="HEAL Studies")
+    parser_dict["heal-reasearch"] = HEALDPParser(study_type="HEAL Research Programs")
+    
+
 
 
 class ParserNotFoundException(Exception):

From 5f1f4629c9f10187cfb1b9bf1d3b14bfb6dc81de Mon Sep 17 00:00:00 2001
From: YaphetKG <kebedey@renci.org>
Date: Thu, 24 Aug 2023 17:37:10 -0400
Subject: [PATCH 4/8] fix typo on research

---
 src/dug/core/parsers/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dug/core/parsers/__init__.py b/src/dug/core/parsers/__init__.py
index 8e158340..0d0ebabe 100644
--- a/src/dug/core/parsers/__init__.py
+++ b/src/dug/core/parsers/__init__.py
@@ -32,7 +32,7 @@ def define_parsers(parser_dict: Dict[str, Parser]):
     parser_dict["sprint"] = SPRINTParser()
     parser_dict["bacpac"] = BACPACParser()
     parser_dict["heal-studies"] = HEALDPParser(study_type="HEAL Studies")
-    parser_dict["heal-reasearch"] = HEALDPParser(study_type="HEAL Research Programs")
+    parser_dict["heal-research"] = HEALDPParser(study_type="HEAL Research Programs")
     
 
 

From e8b4b5001a663be01ee32ef0b536cc81b9cc8d29 Mon Sep 17 00:00:00 2001
From: YaphetKG <kebedey@renci.org>
Date: Wed, 30 Aug 2023 12:35:34 -0400
Subject: [PATCH 5/8] fix cde crawl

---
 src/dug/core/crawler.py |  4 ++--
 src/dug/utils.py        | 14 +++++++++++++-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py
index 61d6b05c..a0714980 100644
--- a/src/dug/core/crawler.py
+++ b/src/dug/core/crawler.py
@@ -5,7 +5,7 @@
 
 from dug.core.parsers import Parser, DugElement, DugConcept
 import dug.core.tranql as tql
-from dug.utils import biolink_snake_case
+from dug.utils import biolink_snake_case, get_formatted_biolink_name
 
 logger = logging.getLogger('dug')
 
@@ -218,7 +218,7 @@ def expand_to_dug_element(self,
 
             # convert the first type to snake case to be used in tranql query.
             # first type is the leaf type, this is coming from Node normalization.
-            node_type = biolink_snake_case(identifier.types[0].replace("biolink:", ""))
+            node_type = biolink_snake_case(get_formatted_biolink_name(identifier.types).replace("biolink:", ""))
             try:
                 # Tranql query factory currently supports select node types as valid query
                 # Types missing from QueryFactory.data_types will be skipped with this try catch
diff --git a/src/dug/utils.py b/src/dug/utils.py
index 97ae558a..957f80f0 100644
--- a/src/dug/utils.py
+++ b/src/dug/utils.py
@@ -1,4 +1,7 @@
 import re
+import bmt
+
+bmt_tk = bmt.Toolkit()
 
 class ObjectFactory:
     def __init__(self):
@@ -62,4 +65,13 @@ def biolink_snake_case(arg):
         lambda c: c.group(0).lower(),
         tmp
     )
-    return tmp
\ No newline at end of file
+    return tmp
+
+def get_formatted_biolink_name(bl_type):
+    # A list is resolved through its first entry (the crawler passes identifier.types, leaf type first).
+    if isinstance(bl_type, list):
+        return get_formatted_biolink_name(bl_type[0])
+    if not isinstance(bl_type, str):
+        return bl_type  # neither str nor list: hand the value back untouched, as before
+    bl_element = bmt_tk.get_element(bl_type)  # NOTE(review): bmt documents None for names missing from the model
+    return (bl_element.class_uri or bl_element.slot_uri) if bl_element else bl_type  # unknown name: keep the input
\ No newline at end of file

From cd11e817683451e2a97cd954627ebec35209e50e Mon Sep 17 00:00:00 2001
From: YaphetKG <kebedey@renci.org>
Date: Thu, 31 Aug 2023 08:43:51 -0400
Subject: [PATCH 6/8] remove testing code from platform fetch

---
 bin/get_heal_platform_mds_data_dicts.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/bin/get_heal_platform_mds_data_dicts.py b/bin/get_heal_platform_mds_data_dicts.py
index a48e77ba..e6bdbc43 100644
--- a/bin/get_heal_platform_mds_data_dicts.py
+++ b/bin/get_heal_platform_mds_data_dicts.py
@@ -404,5 +404,4 @@ def get_heal_platform_mds_data_dicts(output, mds_metadata_endpoint, limit):
 
 # Run get_heal_platform_mds_data_dicts() if not used as a library.
 if __name__ == "__main__":
-    # get_heal_platform_mds_data_dicts()
-    generate_dbgap_files('mds_data/dbGaPs', 'mds_data/studies_with_data_dicts')
+    get_heal_platform_mds_data_dicts()

From 88172d1bd950186f0114739054f96561c7cc0bf8 Mon Sep 17 00:00:00 2001
From: YaphetKG <kebedey@renci.org>
Date: Thu, 31 Aug 2023 13:49:43 -0400
Subject: [PATCH 7/8] DbGaP to dbGaP

---
 src/dug/core/parsers/topmed_csv_parser.py | 2 +-
 src/dug/core/parsers/topmed_tag_parser.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/dug/core/parsers/topmed_csv_parser.py b/src/dug/core/parsers/topmed_csv_parser.py
index 329c0a05..710bcb63 100644
--- a/src/dug/core/parsers/topmed_csv_parser.py
+++ b/src/dug/core/parsers/topmed_csv_parser.py
@@ -34,7 +34,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
                 elem = DugElement(elem_id=row['variable_full_accession'],
                                   name=row['variable_name'],
                                   desc=row['variable_desc'],
-                                  elem_type="DbGaP",
+                                  elem_type="dbGaP",
                                   collection_id=row['study_full_accession'],
                                   collection_name=row['study_name'])
 
diff --git a/src/dug/core/parsers/topmed_tag_parser.py b/src/dug/core/parsers/topmed_tag_parser.py
index 64b4064b..f10ed43d 100644
--- a/src/dug/core/parsers/topmed_tag_parser.py
+++ b/src/dug/core/parsers/topmed_tag_parser.py
@@ -59,7 +59,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
                     elem_id=row['variable_full_accession'],
                     name=row['variable_name'] if 'variable_name' in row else row['variable_full_accession'],
                     desc=row['variable_description'] if 'variable_description' in row else row['variable_full_accession'],
-                    elem_type="DbGaP",
+                    elem_type="dbGaP",
                     collection_id=row['study_full_accession'],
                     collection_name=row['study_name']
                 )

From 7f51c93b8c40be40f36ab446d7d2f1c737efae52 Mon Sep 17 00:00:00 2001
From: YaphetKG <45075777+YaphetKG@users.noreply.github.com>
Date: Fri, 1 Sep 2023 16:36:34 -0400
Subject: [PATCH 8/8] Update heal_dp_parser.py

remove participant id
---
 src/dug/core/parsers/heal_dp_parser.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/dug/core/parsers/heal_dp_parser.py b/src/dug/core/parsers/heal_dp_parser.py
index 2f3dddd2..e13c8790 100644
--- a/src/dug/core/parsers/heal_dp_parser.py
+++ b/src/dug/core/parsers/heal_dp_parser.py
@@ -28,7 +28,6 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
         tree = ET.parse(input_file)
         root = tree.getroot()
         study_id = root.attrib['study_id']
-        participant_set = root.get('participant_set','0')
 
         # Parse study name from file handle
         study_name = root.get('study_name')
@@ -40,11 +39,11 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
 
         elements = []
         for variable in root.iter('variable'):
-            elem = DugElement(elem_id=f"{variable.attrib['id']}.p{participant_set}",
+            elem = DugElement(elem_id=f"{variable.attrib['id']}",
                               name=variable.find('name').text,
                               desc=variable.find('description').text.lower(),
                               elem_type=self.get_study_type(),
-                              collection_id=f"{study_id}.p{participant_set}",
+                              collection_id=f"{study_id}",
                               collection_name=study_name)
 
             # Create NIDA links as study/variable actions