Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mds import to dev #316

Merged
merged 8 commits into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions bin/get_heal_platform_mds_data_dicts.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,12 @@ def generate_dbgap_files(dbgap_dir, studies_with_data_dicts_dir):
data_table.set('id', study['gen3_discovery']['@id'])
else:
logging.warning(f"No identifier found in data dictionary file {file_path}")

if 'label' in study['gen3_discovery']:
data_table.set('label', study['gen3_discovery']['label'])
study_name = study.get('gen3_discovery', {}).get('label') or study.get('gen3_discovery', {}).get('study_metadata',{}).get('minimal_info',{}).get('study_name')
if study_name:
data_table.set('study_name', study_name)
study_description = study.get('gen3_discovery', {}).get('study_metadata',{}).get('minimal_info',{}).get('study_description')
if study_description:
data_table.set('study_description', study_description)

# Determine the data_table study_id from the internal HEAL Data Platform (HDP) identifier.
if '_hdp_uid' in study['gen3_discovery']:
Expand Down
6 changes: 3 additions & 3 deletions src/dug/core/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from dug.core.parsers import Parser, DugElement, DugConcept
import dug.core.tranql as tql
from dug.utils import biolink_snake_case
from dug.utils import biolink_snake_case, get_formatted_biolink_name

logger = logging.getLogger('dug')

Expand Down Expand Up @@ -152,7 +152,7 @@ def annotate_element(self, element):
concept = DugConcept(concept_id=identifier.id,
name=identifier.label,
desc=identifier.description,
concept_type=identifier.type)
concept_type=identifier.types)
# Add to list of concepts
self.concepts[identifier.id] = concept

Expand Down Expand Up @@ -218,7 +218,7 @@ def expand_to_dug_element(self,

# convert the first type to snake case to be used in tranql query.
# first type is the leaf type, this is coming from Node normalization.
node_type = biolink_snake_case(identifier.types[0].replace("biolink:", ""))
node_type = biolink_snake_case(get_formatted_biolink_name(identifier.types).replace("biolink:", ""))
try:
# Tranql query factory currently supports select node types as valid query
# Types missing from QueryFactory.data_types will be skipped with this try catch
Expand Down
5 changes: 5 additions & 0 deletions src/dug/core/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .topmed_csv_parser import TOPMedCSVParser
from .sprint_parser import SPRINTParser
from .bacpac_parser import BACPACParser
from .heal_dp_parser import HEALDPParser


logger = logging.getLogger('dug')
Expand All @@ -30,6 +31,10 @@ def define_parsers(parser_dict: Dict[str, Parser]):
parser_dict["kfdrc"] = KFDRCDbGaPParser()
parser_dict["sprint"] = SPRINTParser()
parser_dict["bacpac"] = BACPACParser()
parser_dict["heal-studies"] = HEALDPParser(study_type="HEAL Studies")
parser_dict["heal-research"] = HEALDPParser(study_type="HEAL Research Programs")




class ParserNotFoundException(Exception):
Expand Down
2 changes: 1 addition & 1 deletion src/dug/core/parsers/dbgap_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def parse_study_name_from_gap_exchange_file(filepath: Path) -> str:


def _get_element_type(self):
return "DbGaP"
return "dbGaP"

def __call__(self, input_file: InputFile) -> List[Indexable]:
logger.debug(input_file)
Expand Down
56 changes: 56 additions & 0 deletions src/dug/core/parsers/heal_dp_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import logging
import os
from typing import List
from xml.etree import ElementTree as ET

from dug import utils as utils
from ._base import DugElement, FileParser, Indexable, InputFile

logger = logging.getLogger('dug')


class HEALDPParser(FileParser):
    """Parse a converted HEAL Data Platform data dictionary into Dug elements.

    The input is an XML document whose root element carries ``study_id`` and
    ``study_name`` attributes and contains ``variable`` elements. Each
    ``variable`` has an ``id`` attribute plus ``name`` and ``description``
    children, and becomes one ``DugElement``.

    The same parser class serves several HEAL collections; ``study_type`` is
    the label stored as the ``elem_type`` of every element it produces.
    """

    def __init__(self, study_type="HEAL Studies"):
        """Create a parser that tags its elements with *study_type*."""
        # A bare ``super()`` only builds a proxy object; the parent
        # initialiser has to be invoked explicitly.
        super().__init__()
        self.study_type = study_type

    def get_study_type(self):
        """Return the element-type label this parser assigns."""
        return self.study_type

    def set_study_type(self, study_type):
        """Replace the element-type label this parser assigns."""
        self.study_type = study_type

    def __call__(self, input_file: InputFile) -> List[Indexable]:
        """Parse *input_file* and return one ``DugElement`` per variable.

        Raises:
            KeyError: if the root element has no ``study_id`` attribute, or a
                ``variable`` element has no ``id`` attribute.
            IOError: if the root element has no ``study_name`` attribute.
        """
        logger.debug(input_file)
        tree = ET.parse(input_file)
        root = tree.getroot()
        study_id = root.attrib['study_id']

        # The study name is stored as an attribute on the root element.
        study_name = root.get('study_name')

        if study_name is None:
            err_msg = f"Unable to parse study name from data dictionary: {input_file}!"
            logger.error(err_msg)
            raise IOError(err_msg)

        elements = []
        for variable in root.iter('variable'):
            # findtext() yields '' for an empty or missing <description>,
            # so a variable without a description no longer aborts the
            # whole file with an AttributeError on ``None.lower()``.
            description = variable.findtext('description', default='').lower()
            elem = DugElement(elem_id=variable.attrib['id'],
                              name=variable.find('name').text,
                              desc=description,
                              elem_type=self.get_study_type(),
                              collection_id=study_id,
                              collection_name=study_name)

            # Link the study back to its HEAL Data Platform discovery page.
            elem.collection_action = utils.get_heal_platform_link(study_id=study_id)
            logger.debug(elem)
            elements.append(elem)

        # Only elements are produced here; no concepts are created.
        return elements
2 changes: 1 addition & 1 deletion src/dug/core/parsers/nida_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
elem = DugElement(elem_id=f"{variable.attrib['id']}.p{participant_set}",
name=variable.find('name').text,
desc=variable.find('description').text.lower(),
elem_type="DbGaP",
elem_type="NIDA",
collection_id=f"{study_id}.p{participant_set}",
collection_name=study_name)

Expand Down
2 changes: 1 addition & 1 deletion src/dug/core/parsers/topmed_csv_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
elem = DugElement(elem_id=row['variable_full_accession'],
name=row['variable_name'],
desc=row['variable_desc'],
elem_type="DbGaP",
elem_type="dbGaP",
collection_id=row['study_full_accession'],
collection_name=row['study_name'])

Expand Down
2 changes: 1 addition & 1 deletion src/dug/core/parsers/topmed_tag_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:
elem_id=row['variable_full_accession'],
name=row['variable_name'] if 'variable_name' in row else row['variable_full_accession'],
desc=row['variable_description'] if 'variable_description' in row else row['variable_full_accession'],
elem_type="DbGaP",
elem_type="dbGaP",
collection_id=row['study_full_accession'],
collection_name=row['study_name']
)
Expand Down
19 changes: 18 additions & 1 deletion src/dug/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import re
import bmt

bmt_tk = bmt.Toolkit()

class ObjectFactory:
def __init__(self):
Expand Down Expand Up @@ -37,6 +40,11 @@ def get_nida_study_link(study_id):
base_url = "https://datashare.nida.nih.gov/study"
return f'{base_url}/{study_id}'

def get_heal_platform_link(study_id):
    """Return the HEAL Data Platform discovery URL for *study_id*.

    For an identifier of the form ``PREFIX:ACCESSION``, the segment after the
    first colon is used as the accession. An identifier without a colon is
    used as-is instead of raising ``IndexError``.
    """
    base_url = "https://healdata.org/portal/discovery"
    parts = study_id.split(':')
    # parts[1] matches the previous behaviour for prefixed identifiers;
    # fall back to the whole identifier when there is no prefix to strip.
    accession = parts[1] if len(parts) > 1 else parts[0]
    return f'{base_url}/{accession}'


def biolink_snake_case(arg):
"""Convert such SnakeCase to snake_case.
Expand All @@ -57,4 +65,13 @@ def biolink_snake_case(arg):
lambda c: c.group(0).lower(),
tmp
)
return tmp
return tmp

def get_formatted_biolink_name(bl_type):
    """Resolve a biolink type name to the URI recorded in the Biolink model.

    Args:
        bl_type: a type name, or a list of type names. For a list only the
            first entry is resolved.

    Returns:
        The ``class_uri`` (or, failing that, the ``slot_uri``) of the element
        that the module-level Biolink Model Toolkit returns for the name.
        If the toolkit does not know the name, or the element carries no URI,
        the input name is returned unchanged so callers always get a string
        back for string input. Values that are neither ``str`` nor ``list``
        are returned as given.

    Raises:
        IndexError: if *bl_type* is an empty list.
    """
    if isinstance(bl_type, list):
        return get_formatted_biolink_name(bl_type[0])
    if not isinstance(bl_type, str):
        return bl_type

    bl_element = bmt_tk.get_element(bl_type)
    if bl_element is None:
        # Unknown to the toolkit: keep the caller's name rather than
        # failing on attribute access of None.
        return bl_type

    # getattr guards against element kinds that define only one of the two
    # URI attributes.
    uri = getattr(bl_element, 'class_uri', None) or getattr(bl_element, 'slot_uri', None)
    return uri or bl_type