From f30d95d23fe21f16ee4665138a247066d9afc5ec Mon Sep 17 00:00:00 2001 From: Hoid Date: Thu, 3 Aug 2023 11:11:40 -0400 Subject: [PATCH 1/6] feat: Parse study name from GapExchange file if present --- src/dug/core/parsers/dbgap_parser.py | 31 ++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py index b01432c4..bb6161c5 100644 --- a/src/dug/core/parsers/dbgap_parser.py +++ b/src/dug/core/parsers/dbgap_parser.py @@ -1,9 +1,10 @@ import logging -import re +import re, os from typing import List from xml.etree import ElementTree as ET from dug import utils as utils +from pathlib import Path from ._base import DugElement, FileParser, Indexable, InputFile logger = logging.getLogger('dug') @@ -13,13 +14,29 @@ class DbGaPParser(FileParser): # Class for parsers DBGaP Data dictionary into a set of Dug Elements @staticmethod - def parse_study_name_from_filename(filename: str): + def parse_study_name_from_filename(filename: str) -> str: # Parse the study name from the xml filename if it exists. Return None if filename isn't right format to get id from dbgap_file_pattern = re.compile(r'.*/*phs[0-9]+\.v[0-9]+\.pht[0-9]+\.v[0-9]+\.(.+)\.data_dict.*') match = re.match(dbgap_file_pattern, filename) if match is not None: return match.group(1) return None + + @staticmethod + def parse_study_name_from_gap_exchange_file(filepath: Path) -> str: + # Parse the study name from the GapExchange file adjacent to the file passed in + parent_dir = filepath.parent.absolute() + gap_exchange_filename_str = "GapExchange_" + parent_dir.name + gap_exchange_filepath = None + for item in os.scandir(parent_dir): + if item.is_file and gap_exchange_filename_str in item.name: + gap_exchange_filepath = item.path + if gap_exchange_filepath is None: + return None + tree = ET.parse(gap_exchange_filepath, ET.XMLParser(encoding='iso-8859-5')) + tree_root = tree.getroot() + return tree_root.attrib['Configuration']['StudyNameEntrez'].text + def _get_element_type(self): return "DbGaP" @@ -28,12 +45,14 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: logger.debug(input_file) tree = ET.parse(input_file, ET.XMLParser(encoding='iso-8859-5')) root = tree.getroot() - study_id = root.attrib['study_id'] + study_id = root.attrib['study_id'].text participant_set = root.get('participant_set','0') - # Parse study name from file handle - study_name = self.parse_study_name_from_filename(str(input_file)) - + # Parse study name from GapExchange file, and if that fails try from file handle + # If still None, raise an error message + study_name = self.parse_study_name_from_gap_exchange_file(Path(input_file)) + if study_name is None: + study_name = self.parse_study_name_from_filename(str(input_file)) if study_name is None: err_msg = f"Unable to parse DbGaP study name from data dictionary: {input_file}!" logger.error(err_msg) From 61904163a159b5b92b03a5327fae032c0f720d28 Mon Sep 17 00:00:00 2001 From: Hoid Date: Fri, 4 Aug 2023 09:40:36 -0400 Subject: [PATCH 2/6] fix: Fix small bug with .text --- src/dug/core/parsers/dbgap_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py index bb6161c5..7de924e5 100644 --- a/src/dug/core/parsers/dbgap_parser.py +++ b/src/dug/core/parsers/dbgap_parser.py @@ -35,7 +35,7 @@ def parse_study_name_from_gap_exchange_file(filepath: Path) -> str: return None tree = ET.parse(gap_exchange_filepath, ET.XMLParser(encoding='iso-8859-5')) tree_root = tree.getroot() - return tree_root.attrib['Configuration']['StudyNameEntrez'].text + return tree_root.attrib['Configuration']['StudyNameEntrez'] def _get_element_type(self): @@ -45,7 +45,7 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: logger.debug(input_file) tree = ET.parse(input_file, ET.XMLParser(encoding='iso-8859-5')) root = tree.getroot() - study_id = root.attrib['study_id'].text + study_id = root.attrib['study_id'] participant_set = root.get('participant_set','0') # Parse study name from GapExchange file, and if that fails try from file handle From 4257f9c1183e97a112907a82e7c1dfd731f061fc Mon Sep 17 00:00:00 2001 From: Hoid Date: Fri, 4 Aug 2023 09:49:29 -0400 Subject: [PATCH 3/6] fix: Fix test_loaders.py to have the right number of files in test dir --- tests/integration/test_loaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_loaders.py b/tests/integration/test_loaders.py index 9287da44..8e67609b 100644 --- a/tests/integration/test_loaders.py +++ b/tests/integration/test_loaders.py @@ -18,7 +18,7 @@ def test_filesystem_loader(): filepath=TEST_DATA_DIR, ) files = list(targets) - assert len(files) == 10 + assert len(files) == 12 with pytest.raises(ValueError): targets = load_from_filesystem( From 9c009398e4dfc68d93577c3dbc4735922074b121 Mon Sep 17 00:00:00 2001 From: Hoid Date: Fri, 4 Aug 2023 11:40:47 -0400 Subject: [PATCH 4/6] fix: Fix study name from gap exchange function --- src/dug/core/parsers/dbgap_parser.py | 2 +- .../GapExchange_phs001252.v1.p1.xml | 390 ++++++++++++++++++ ...pht006366.v1.ECLIPSE_Subject.data_dict.xml | 2 + tests/integration/test_loaders.py | 2 +- tests/integration/test_parsers.py | 7 + 5 files changed, 401 insertions(+), 2 deletions(-) create mode 100644 tests/integration/data/phs001252.v1.p1/GapExchange_phs001252.v1.p1.xml create mode 100644 tests/integration/data/phs001252.v1.p1/phs001252.v1.pht006366.v1.ECLIPSE_Subject.data_dict.xml diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py index 7de924e5..baa37c45 100644 --- a/src/dug/core/parsers/dbgap_parser.py +++ b/src/dug/core/parsers/dbgap_parser.py @@ -35,7 +35,7 @@ def parse_study_name_from_gap_exchange_file(filepath: Path) -> str: return None tree = ET.parse(gap_exchange_filepath, ET.XMLParser(encoding='iso-8859-5')) tree_root = tree.getroot() - return tree_root.attrib['Configuration']['StudyNameEntrez'] + return tree_root.find("./Studies/Study/Configuration/StudyNameEntrez").text def _get_element_type(self): diff --git a/tests/integration/data/phs001252.v1.p1/GapExchange_phs001252.v1.p1.xml b/tests/integration/data/phs001252.v1.p1/GapExchange_phs001252.v1.p1.xml new file mode 100644 index 00000000..33f615cb --- /dev/null +++ b/tests/integration/data/phs001252.v1.p1/GapExchange_phs001252.v1.p1.xml @@ -0,0 +1,390 @@ + + + + + + + + + + + + + + + + + + + + TitleNameInstitute + Principal InvestigatorJorgen Vestbo, Professorthe University of Manchester, Manchester, UK + Principal InvestigatorEdwin K. Silverman, MD, PhDBrigham and Women 's Hospital, Boston, MA + ECLIPSE InvestigatorsY. IvanovPleven, Bulgaria + ECLIPSE InvestigatorsK. KostovSofia, Bulgaria + ECLIPSE InvestigatorsJ. BourbeauMontreal, Canada + + ]]> + Evaluation of COPD Longitudinally to Identify Predictive Surrogate Endpoints (ECLIPSE) + Evaluation of COPD Longitudinally to Identify Predictive Surrogate Endpoints (ECLIPSE) + + Case-Control + Longitudinal Cohort + Cohort + + ECLIPSE was a longitudinal observational study of 2164 COPD subjects and a smaller number of smoking controls (337) and nonsmoking controls (245) followed regularly for three years, with three chest CT scans (at baseline, one year, and three years) (Vestbo, European Respiratory Journal 2008; 31: 869). Inclusion criteria included age 40-75, at least 10 pack-years of smoking, and spirometry in GOLD grades 2-4 (COPD cases) or normal spirometry with post-bronchodilator FEV1 >85% predicted and FEV1/FVC>0.7 (controls). Study visits were performed at enrollment, three months, and every six months thereafter with spirometry, questionnaires, and other clinical evaluations. The ECLIPSE CT scans have been analyzed with the VIDA software for emphysema and airway phenotypes. ECLIPSE has provided key insights into the clinical epidemiology of COPD, including COPD exacerbations (Hurst, NEJM 2010; 363: 1128) and lung function decline in COPD (Vestbo, NEJM 2011; 365: 1184). ECLIPSE has been used in a number of genetic studies of COPD susceptibility and protein biomarkers(Faner, Thorax 2014; 69: 666). Genome-wide gene expression microarray data are available in 147 induced sputum samples from COPD subjects and 248 peripheral blood samples from COPD and control subjects.

]]> +
+ Inclusion criteria included age 40-75, at least 10 pack-years of smoking, and spirometry in GOLD grades 2-4 (COPD cases) or normal spirometry with post-bronchodilator FEV1 >85% predicted and FEV1/FVC >0.7 (controls). Exclusion criteria included respiratory disorders other than COPD, known severe α1-antitrypsin deficiency, history of significant inflammatory disease other than COPD, a COPD exacerbation or blood transfusions within 4 weeks of enrollment, prior lung surgery, recent diagnosis of cancer, inability to walk, and therapy with oral corticosteroids at inclusion.

]]> +
+ + + + + + + + + + + + + + + + + + + + +
+ Jorgen Vestbo, Professor + the University of Manchester, Manchester, UK +
+
+ Edwin K. Silverman, MD, PhD + Brigham and Women 's Hospital, Boston, MA +
+
+ Y. Ivanov + Pleven, Bulgaria +
+
+ K. Kostov + Sofia, Bulgaria +
+
+ J. Bourbeau + Montreal, Canada +
+
+ M. Fitzgerald + Vancouver, BC, Canada +
+
+ P. Hernandez + Halifax, NS, Canada +
+
+ K. Killian + Hamilton, ON, Canada +
+
+ R. Levy + Vancouver, BC, Canada +
+
+ F. Maltais + Montreal, Canada +
+
+ D. O'Donnell + Kingston, ON, Canada +
+
+ J. Krepelka + Prague, Czech Republic +
+
+ J. Vestbo + Hvidovre, Denmark +
+
+ E. Wouters + Horn-Maastricht, The Netherlands +
+
+ D. Quinn + Wellington, New Zealand +
+
+ P. Bakke + Bergen, Norway +
+
+ M. Kosnik + Golnik, Slovenia +
+
+ A. Agusti + Spain +
+
+ J. Sauleda + Spain +
+
+ P. de Mallorca + Spain +
+
+ Y. Feschenko + Kiev, Ukraine +
+
+ V. Gavrisyuk + Kiev, Ukraine +
+
+ L. Yashina + Kiev, Ukraine +
+
+ N. Monogarova + Donetsk, Ukraine +
+
+ P. Calverley + Liverpool, United Kingdom +
+
+ D. Lomas + Cambridge, United Kingdom +
+
+ W. MacNee + Edinburgh, United Kingdom +
+
+ D. Singh + Manchester, United Kingdom +
+
+ J. Wedzicha + London, United Kingdom +
+
+ A. Anzueto + San Antonio, TX, USA +
+
+ S. Braman + Providence, RI, USA +
+
+ R. Casaburi + Torrance, CA, USA +
+
+ B. Celli + Boston, MA, USA +
+
+ G. Giessel + Richmond, VA, USA +
+
+ M. Gotfried + Phoenix, AZ, USA +
+
+ G. Greenwald + Rancho Mirage, CA, USA +
+
+ N. Hanania + Houston, TX, USA +
+
+ D. Mahler + Lebanon, NH, USA +
+
+ B. Make + Denver, CO, USA +
+
+ S. Rennard + Omaha, NE, USA +
+
+ C. Rochester + New Haven, CT, USA +
+
+ P. Scanlon + Rochester, MN, USA +
+
+ D. Schuller + Omaha, NE, USA +
+
+ F. Sciurba + Pittsburgh, PA, USA +
+
+ A. Sharafkhaneh + Houston, TX, USA +
+
+ T. Siler + St. Charles, MO, USA +
+
+ E. Silverman + Boston, MA, USA +
+
+ A. Wanner + Miami, FL, USA +
+
+ R. Wise + Baltimore, MD, USA +
+
+ R. ZuWallack + Hartford, CT, USA +
+
+ H. Coxson + Canada +
+
+ C. Crim + GSK, USA +
+
+ L. Edwards + GSK, USA +
+
+ D. Lomas + UK +
+
+ W. MacNee + UK +
+
+ E. Silverman + USA +
+
+ R. Tal-Singer + Co-chair, GSK, USA +
+
+ J. Vestbo + Co-chair, Denmark +
+
+ J. Yates + GSK, USA +
+
+ A. Agusti + Spain +
+
+ P. Calverley + UK +
+
+ B. Celli + USA +
+
+ C. Crim + GSK, USA +
+
+ B. Miller + GSK, USA +
+
+ W. MacNee + Chair, UK +
+
+ S. Rennard + USA +
+
+ R. Tal-Singer + GSK, USA +
+
+ E. Wouters + The Netherlands +
+
+ J. Yates + GSK, USA +
+
+ yes + + + + The ECLIPSE study protocol was finalized in 2005, and subject recruitment began in 2006.

]]> +
+ + + +
+ + + + NHLBI + National Heart, Lung, and Blood Institute DAC + 0 + + + + + yes + yes + 0 + 1 + 8 + no + + + http://dbgap.ncbi.nlm.nih.gov/aa/wga.cgi?page=DUC&view_pdf&stacc=phs001252.v1.p1 + + + + + + + + + Disease-Specific (Chronic Obstructive Pulmonary Disease, RD) + DS-COPD-RD + Use of the data must be related to Chronic Obstructive Pulmonary Disease and related disorders. + No + + + + + +
+ +
+ +
diff --git a/tests/integration/data/phs001252.v1.p1/phs001252.v1.pht006366.v1.ECLIPSE_Subject.data_dict.xml b/tests/integration/data/phs001252.v1.p1/phs001252.v1.pht006366.v1.ECLIPSE_Subject.data_dict.xml new file mode 100644 index 00000000..c9c083e9 --- /dev/null +++ b/tests/integration/data/phs001252.v1.p1/phs001252.v1.pht006366.v1.ECLIPSE_Subject.data_dict.xml @@ -0,0 +1,2 @@ + +SUBJECT_IDSubject IDstringCONSENTConsent group as determined by DACencoded valueDisease-Specific (Chronic Obstructive Pulmonary Disease, RD) (DS-COPD-RD) diff --git a/tests/integration/test_loaders.py b/tests/integration/test_loaders.py index 8e67609b..799fd9dc 100644 --- a/tests/integration/test_loaders.py +++ b/tests/integration/test_loaders.py @@ -18,7 +18,7 @@ def test_filesystem_loader(): filepath=TEST_DATA_DIR, ) files = list(targets) - assert len(files) == 12 + assert len(files) == 15 with pytest.raises(ValueError): targets = load_from_filesystem( diff --git a/tests/integration/test_parsers.py b/tests/integration/test_parsers.py index a22d00c7..fa5ea2de 100644 --- a/tests/integration/test_parsers.py +++ b/tests/integration/test_parsers.py @@ -1,6 +1,7 @@ from dug.core.parsers import DbGaPParser, NIDAParser, TOPMedTagParser, SciCrunchParser, AnvilDbGaPParser,\ CRDCDbGaPParser, KFDRCDbGaPParser, SPRINTParser, BACPACParser from tests.integration.conftest import TEST_DATA_DIR +from pathlib import Path def test_dbgap_parse_study_name_from_filename(): parser = DbGaPParser() @@ -21,6 +22,12 @@ def test_nida_parse_study_name_from_filename(): studyname = parser.parse_study_name_from_filename(filename) assert studyname == "NIDA-CSP1019" +def test_dbgap_parse_study_name_from_gap_exchange_file(): + parser = DbGaPParser() + parse_filepath = Path(TEST_DATA_DIR / "phs001252.v1.p1" / "phs001252.v1.pht006366.v1.ECLIPSE_Subject.data_dict.xml") + studyname = parser.parse_study_name_from_gap_exchange_file(parse_filepath) + assert studyname == "Evaluation of COPD Longitudinally to Identify Predictive Surrogate Endpoints (ECLIPSE)" + def test_dbgap_parser(): parser = DbGaPParser() parse_file = str(TEST_DATA_DIR / "phs000166.v2.pht000700.v1.CAMP_CData.data_dict_2009_09_03.xml") From 217d266726277fd7738c68dda745e7f217722ebc Mon Sep 17 00:00:00 2001 From: Hoid Date: Mon, 7 Aug 2023 12:00:38 -0400 Subject: [PATCH 5/6] fix: Skip GapExchange files in __call__() --- src/dug/core/parsers/dbgap_parser.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py index baa37c45..46254d77 100644 --- a/src/dug/core/parsers/dbgap_parser.py +++ b/src/dug/core/parsers/dbgap_parser.py @@ -43,8 +43,13 @@ def _get_element_type(self): def __call__(self, input_file: InputFile) -> List[Indexable]: logger.debug(input_file) + if "GapExchange" in str(input_file).split("/")[-1]: + msg = f"Skipping parsing for GapExchange file: {input_file}!" + logger.info(msg) + return [] tree = ET.parse(input_file, ET.XMLParser(encoding='iso-8859-5')) root = tree.getroot() + print(root.attrib) study_id = root.attrib['study_id'] participant_set = root.get('participant_set','0') From 58548cc2d82654b5c42a2dac2d8f6c69f40ce287 Mon Sep 17 00:00:00 2001 From: Hoid Date: Mon, 7 Aug 2023 12:02:34 -0400 Subject: [PATCH 6/6] fix: Remove print statement --- src/dug/core/parsers/dbgap_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py index 46254d77..2926b1f1 100644 --- a/src/dug/core/parsers/dbgap_parser.py +++ b/src/dug/core/parsers/dbgap_parser.py @@ -49,7 +49,6 @@ def __call__(self, input_file: InputFile) -> List[Indexable]: return [] tree = ET.parse(input_file, ET.XMLParser(encoding='iso-8859-5')) root = tree.getroot() - print(root.attrib) study_id = root.attrib['study_id'] participant_set = root.get('participant_set','0')