Skip to content

Commit

Permalink
More parsers (#248)
Browse files Browse the repository at this point in the history
* Release/2.8.0 (#198)

* Bumping version

* support for extracting dug elements from graph (#197)

* support for extracting dug elements from graph

* adding flag for enabling dug element extraction from graph

* adding new config for node_to dug element parsing

* adding more parameters to crawler to enable configuration of element extraction logic

* add tests

* add tests for crawler

Co-authored-by: Yaphetkg <[email protected]>

* Update _version.py

* Update _version.py

updating version for final push to master

* Update factory.py

Adding more comments

Co-authored-by: Carl Schreep <[email protected]>
Co-authored-by: Yaphetkg <[email protected]>

* Release/v2.9.0 (#201)

* Bumping version

* support for extracting dug elements from graph (#197)

* support for extracting dug elements from graph

* adding flag for enabling dug element extraction from graph

* adding new config for node_to dug element parsing

* adding more parameters to crawler to enable configuration of element extraction logic

* add tests

* add tests for crawler

Co-authored-by: Yaphetkg <[email protected]>

* Display es scores (#199)

* Include ES scores in variable results

* Round ES score to 6

* Update _version.py (#200)

* Update _version.py

Co-authored-by: Carl Schreep <[email protected]>
Co-authored-by: Yaphetkg <[email protected]>
Co-authored-by: Ginnie Hench <[email protected]>

* consolidate dbgap format parsers in a single file, add crdc and kfdrc parsers

* adding tests

* bump version

* parser when versions of studies are > 9

* test for version

* fix long text issues, and encoding errors

* nltk initialization

* change nltk approach for sliding window

Co-authored-by: Carl Schreep <[email protected]>
Co-authored-by: Yaphetkg <[email protected]>
Co-authored-by: Ginnie Hench <[email protected]>
  • Loading branch information
4 people authored Aug 17, 2022
1 parent 87b374d commit b6f4730
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 15 deletions.
2 changes: 1 addition & 1 deletion src/dug/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "2.9.3dev"
__version__ = "2.9.4dev"
35 changes: 33 additions & 2 deletions src/dug/core/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
import os
import urllib.parse
from typing import TypeVar, Generic, Union, List, Tuple, Optional

import requests
from requests import Session

import dug.core.tranql as tql


logger = logging.getLogger('dug')

logging.getLogger("requests").setLevel(logging.WARNING)
Expand Down Expand Up @@ -271,9 +271,40 @@ class Annotator(ApiClient[str, List[Identifier]]):
def __init__(self, url: str):
    """
    Initialize the annotator.

    :param url: stored unchanged on the instance as ``self.url``.
    """
    self.url = url

def sliding_window(self, text, max_characters=2000, padding_words=5):
    """
    Split a long text into overlapping chunks of whole words.

    The text is split on single spaces. Each chunk is filled with as many
    consecutive words as fit, and the next chunk starts ``padding_words``
    words before the point where the previous one stopped, so that context
    is preserved across chunk boundaries.

    Example: "aaaa bbb ccc ddd eeee" with max_characters 9 and padding 1
    yields "aaaa bbb ", "bbb ccc ", "ccc ddd ", "ddd eeee".

    Every chunk except the last one ends with a single trailing space; the
    last chunk is yielded as is. A text that fits in one chunk (including
    the empty string) is yielded unchanged as a single chunk.

    :param text: text to split.
    :param max_characters: upper bound on the length of a chunk, trailing
        space included. A single word that is longer than this bound is
        never cut; it is yielded whole in a chunk of its own.
    :param padding_words: number of words repeated at the start of the next
        chunk. Negative values are treated as 0. The overlap is reduced
        when the repeated words would leave no room for a new word.
    :return: generator of chunk strings.
    """
    words = text.split(' ')
    total_words = len(words)
    overlap = max(padding_words, 0)

    start = 0    # index of the first word of the current window
    covered = 0  # number of leading words already yielded in some chunk
    while True:
        # A window always takes its first word, even an oversized one,
        # so that the generator is guaranteed to make progress.
        end = start + 1
        length = len(words[start])
        while end < total_words:
            # +1 accounts for the separating space before the next word.
            grown = length + 1 + len(words[end])
            if grown >= max_characters:
                break
            length = grown
            end += 1

        if end <= covered:
            # The overlap alone fills the window and no new word fits:
            # shrink the overlap instead of yielding a redundant chunk.
            start += 1
            continue

        chunk = ' '.join(words[start:end])
        if end == total_words:
            yield chunk
            return

        yield chunk + ' '
        covered = end
        # Step back by the overlap, but always move forward by at least
        # one word so short windows cannot stall or run backwards.
        start = max(end - overlap, start + 1)

def annotate(self, text, http_session):
    """
    Annotate a text, splitting it into chunks first.

    The text is cut into chunks by ``self.sliding_window`` and each chunk is
    passed to ``self(...)`` separately; the per-chunk results are
    concatenated in chunk order.

    :param text: text to annotate.
    :param http_session: session object forwarded to every per-chunk call.
    :return: list with the identifiers returned for all chunks. Because
        consecutive chunks overlap, the same identifier may appear more
        than once.
    """
    logger.debug(f"Annotating: {text}")
    identifiers = []
    for chunk_text in self.sliding_window(text):
        identifiers.extend(self(chunk_text, http_session))
    return identifiers

def make_request(self, value: Input, http_session: Session):
value = urllib.parse.quote(value)
Expand Down
8 changes: 5 additions & 3 deletions src/dug/core/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
import pluggy

from ._base import DugElement, DugConcept, Indexable, Parser, FileParser
from .dbgap_parser import DbGaPParser
from .dbgap_parser import DbGaPParser, AnvilDbGaPParser, KFDRCDbGaPParser, CRDCDbGaPParser
from .nida_parser import NIDAParser
from .scicrunch_parser import SciCrunchParser
from .topmed_tag_parser import TOPMedTagParser
from .topmed_csv_parser import TOPMedCSVParser
from .anvil_dbgap_parser import AnvilDbGaPParser


logger = logging.getLogger('dug')

Expand All @@ -23,7 +23,9 @@ def define_parsers(parser_dict: Dict[str, Parser]):
parser_dict["topmedtag"] = TOPMedTagParser()
parser_dict["topmedcsv"] = TOPMedCSVParser()
parser_dict["scicrunch"] = SciCrunchParser()
parser_dict["anvil"] = AnvilDbGaPParser()
parser_dict["anvil"] = AnvilDbGaPParser()
parser_dict["crdc"] = CRDCDbGaPParser()
parser_dict["kfdrc"] = KFDRCDbGaPParser()


class ParserNotFoundException(Exception):
Expand Down
6 changes: 0 additions & 6 deletions src/dug/core/parsers/anvil_dbgap_parser.py

This file was deleted.

18 changes: 16 additions & 2 deletions src/dug/core/parsers/dbgap_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class DbGaPParser(FileParser):
@staticmethod
def parse_study_name_from_filename(filename: str):
# Parse the study name from the xml filename if it exists. Return None if filename isn't right format to get id from
dbgap_file_pattern = re.compile(r'.*/*phs[0-9]+\.v[0-9]\.pht[0-9]+\.v[0-9]\.(.+)\.data_dict.*')
dbgap_file_pattern = re.compile(r'.*/*phs[0-9]+\.v[0-9]+\.pht[0-9]+\.v[0-9]+\.(.+)\.data_dict.*')
match = re.match(dbgap_file_pattern, filename)
if match is not None:
return match.group(1)
Expand All @@ -26,7 +26,7 @@ def _get_element_type(self):

def __call__(self, input_file: InputFile) -> List[Indexable]:
logger.debug(input_file)
tree = ET.parse(input_file)
tree = ET.parse(input_file, ET.XMLParser(encoding='iso-8859-5'))
root = tree.getroot()
study_id = root.attrib['study_id']
participant_set = root.get('participant_set','0')
Expand Down Expand Up @@ -58,3 +58,17 @@ def __call__(self, input_file: InputFile) -> List[Indexable]:

# You don't actually create any concepts
return elements


class AnvilDbGaPParser(DbGaPParser):
    """DbGaPParser subclass whose element type is "AnVIL"."""

    def _get_element_type(self):
        # Only the element type differs from the base parser.
        return "AnVIL"


class CRDCDbGaPParser(DbGaPParser):
    """DbGaPParser subclass whose element type is "Cancer Data Commons"."""

    def _get_element_type(self):
        # Only the element type differs from the base parser.
        return "Cancer Data Commons"

class KFDRCDbGaPParser(DbGaPParser):
    """DbGaPParser subclass whose element type is "Kids First"."""

    def _get_element_type(self):
        # Only the element type differs from the base parser.
        return "Kids First"
25 changes: 24 additions & 1 deletion tests/integration/test_parsers.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
from dug.core.parsers import DbGaPParser, NIDAParser, TOPMedTagParser, SciCrunchParser, AnvilDbGaPParser
from dug.core.parsers import DbGaPParser, NIDAParser, TOPMedTagParser, SciCrunchParser, AnvilDbGaPParser,\
CRDCDbGaPParser, KFDRCDbGaPParser
from tests.integration.conftest import TEST_DATA_DIR

def test_dbgap_parse_study_name_from_filename():
    """The study name is extracted for single- and multi-digit versions."""
    dbgap_parser = DbGaPParser()
    candidate_paths = (
        "whatever/phs000166.v2.pht000700.v1.CAMP_CData.data_dict_2009_09_03.xml",
        # version numbers greater than 9
        "whatever/phs000166.v23.pht000700.v13.CAMP_CData.data_dict_2009_09_03.xml",
    )
    for path in candidate_paths:
        assert dbgap_parser.parse_study_name_from_filename(path) == "CAMP_CData"

def test_nida_parse_study_name_from_filename():
parser = NIDAParser()
Expand Down Expand Up @@ -70,3 +75,21 @@ def test_anvil_parser():
assert len(elements) == 3
for element in elements:
assert element.type == "AnVIL"


def test_crdc_parser():
    """The CRDC parser tags every parsed element as "Cancer Data Commons"."""
    data_dict = TEST_DATA_DIR / "phs001547.v1.pht009987.v1.TOPMed_CCDG_GENAF_Subject.data_dict.xml"
    parsed = CRDCDbGaPParser()(str(data_dict))
    assert len(parsed) == 3
    assert all(item.type == "Cancer Data Commons" for item in parsed)


def test_kfdrc_parser():
    """The KFDRC parser tags every parsed element as "Kids First"."""
    data_dict = TEST_DATA_DIR / "phs001547.v1.pht009987.v1.TOPMed_CCDG_GENAF_Subject.data_dict.xml"
    parsed = KFDRCDbGaPParser()(str(data_dict))
    assert len(parsed) == 3
    assert all(item.type == "Kids First" for item in parsed)
26 changes: 26 additions & 0 deletions tests/unit/test_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,29 @@ def test_ontology_helper(ontology_api):
assert name == 'primary circulatory organ'
assert description == 'A hollow, muscular organ, which, by contracting rhythmically, keeps up the circulation of the blood or analogs[GO,modified].'
assert ontology_type == 'anatomical entity'


def test_yield_partial_text():
annotator = Annotator('foo')
# long text fixture: several thousand characters spanning multiple lines
text = """COG Protocol number on which the patient was enrolled [901=Trial of mouse monoclonal Anti-GD-2 antibody 14.G2A plus IL-2 with or without GM-CSF in children with refractory NBL or melanoma; 911=I-131-MIBG for therapy of advanced neuroblastoma; 914=A dose escalation study of cisplatin, doxorubicin, VP-16, and ifosfamide followed by GM-CSF in advanced NBL and peripheral neuroepithelioma; 925=Study of topotecan; 935=Study of ch14.18 with GM-CSF in children with NBL and other GD2 positive malignancies immediately post ABMT or PBSC; 937=Phase I trial of ZD1694, an inhibitor of thymidylate synthase, in pediatric patients with advanced neoplastic disease; 9709=A phase I study of fenretinide in children with high risk solid tumors; 321P2=New intensive chemotherapy for CCG stage II (with N-myc amplification), stage III and stage IV neuroblastoma; 321P3=Treatment of poor prognosis neuroblastoma before disease progression with intensive multimodal therapy and BMT; 323P=Cyclic combination chemotherapy for newly diagnosed stage III neuroblastoma age 2 and older and stage IV Nneuroblastoma all ages; 3881=Biology and therapy of good, intermediate, and selected poor prognosis neuroblastoma; 3891=Conventional dose chemoradiotherapy vs ablative chemoradiotherapy with autologous BMT for high-risk neuroblastoma; 3951=Phase I pilot study of multiple cycles of high dose chemotherapy with peripheral blood stem cell infusions in advanced stage neuroblastoma.; 4941=National Wilms tumor study V - therapeutic trial & biology study; 8605=Study of the combination of ifosfamide, mesna, and VP-16 in children and young adults with recurrent sarcomas, PNET and other tumors; 8742=Phase III portion of 8741 for neuroblastoma; 9047=Neuroblastoma biology protocol; 9082=Protocol for the development of intervention strategies to reduce the time between symptom onset and diagnosis of childhood cancer -a pediatric oncology group cancer control study; 9140=Therapy for patients with recurrent or 
refractory neuroblastoma - a phase II study; 9262=A Phase II study of taxol in children with recurrent/refractory soft-tissue sarcoma, rhabdomyosarcoma, osteosarcoma, Ewing's sarcoma, neuroblastoma, germ cell tumors, Wilms' tumor, hepatoblastoma, and hepatocellular carcinoma, a POG study; 9280=Neuroblastoma epidemiology protocol - A Non-Therapeutic Study - A Joint Project of: The University of North Carolina, The Pediatric Oncology Group and The Children's Cancer Study Group; 9340=Treatment of patients >365 days at diagnosis with stage IV NBL: Upfront Phase II Window - A Phase II Study; 9341=Treatment of patients >365 days at diagnosis with stage IV and stage IIB/III (N-myc) NBL - a phase III study; 9342=Neuroblastoma #5, bone marrow transplant - a phase III study; 9343=Interleukin-6 in children receiving autologous bone marrow transplantation for advanced neuroblastoma - a pediatric oncology group phase I trial; 9361=Topotecan in pediatric patients with recurrent or progressive solid tumors - a pediatric oncology group phase II study; 9375=Topotecan plus cyclophosphamide in children with solid tumors - a pediatric oncology group phase I trial; 9464=Cyclophosphamide plus topotecan in children with recurrent or refractory solid tumors - a pediatric oncology group phase II study; 9640=Treatment of patients with high risk neuroblastoma (a feasibility pilot) using two cycles of marrow ablative chemotherapy followed by rescue With peripheral blood stem cells (PBSC), radiation therapy; A3973=A randomized study of purged vs. 
unpurged PBSC transplant following dose intensive induction therapy for high risk NBL; AADM01P1=Protocol for registration and consent to the childhood cancer research network: a limited institution pilot; AAML00P2=A dose finding study of the safety of gemtuzumab ozogamicin combined with conventional chemotherapy for patients with relapsed or refractory acute myeloid leukemia; ACCL0331=A Randomized double blind placebo controlled clinical trial to assess the efficacy of traumeel® S (IND # 66649) for the prevention and treatment of mucositis in children undergoing hematopoietic stem cell transplantation; ACCRN07=Protocol for the enrollment on the official COG registry, The Childhood Cancer Research Network (CCRN); ADVL0018=Phase I study of hu14.18-IL2 fusion protein in patients with refractory neuroblastoma and other refractory GD2 expressing tumors; ADVL0212=A Phase I study of depsipeptide (NSC#630176, IND# 51810) in pediatric patients with refractory solid tumors and leukemias; ADVL0214=A phase I study of single agent OSI-774 (Tarceva) (NSC # 718781, IND #63383) followed by OSI-774 with temozolomide for patients with selected recurrent/refractory solid tumors, including brain tumors; ADVL0215=A phase I study of decitabine in combination with doxorubicin and cyclophosphamide in the treatment of relapsed or refractory solid tumors; ADVL0421=A phase II study of oxaliplatin in children with recurrent solid tumors; ADVL0524=Phase II trial of ixabepilone (BMS-247550), an epothilone B analog, in children and young adults with refractory solid tumors; ADVL0525=A phase II study of pemetrexed in children with recurrent malignancies; ADVL06B1=A pharmacokinetic-pharmacodynamic-pharmacogenetic study of actinomycin-D and vincristine in children with cancer; ADVL0714=A phase I study of VEGF trap (NSC# 724770, IND# 100137) in children with refractory solid tumors; ALTE03N1=Key adverse events after childhood cancer; ALTE05N1=Umbrella long-term follow-up protocol; ANBL0032=Phase III 
randomized study of chimeric antibody 14.18 (Ch14.18) in high risk neuroblastoma following myeloablative therapy and autologous stem cell rescue; ANBL00B1=Neuroblastoma biology studies; ANBL00P1=A pilot study of tandem high dose chemotherapy with stem cell rescue following induction therapy in children with high risk neuroblastoma; ANBL02P1=A pilot induction regimen incorporating dose-intensive topotecan and cyclophosphamide for treatment of newly diagnosed high risk neuroblastoma; ANBL0321=Phase II study of fenretinide in pediatric patients with resistant or recurrent neuroblastoma; ANBL0322=A phase II study of hu14.18-IL2 (BB-IND-9728) in children with recurrent or refractory neuroblastoma; ANBL0532=Phase III randomized trial of single vs. tandem myeloablative as consolidation therapy for high-risk neuroblastoma; ANBL0621=A phase II study of ABT-751, an orally bioavailable tubulin binding agent, in children with relapsed or refractory neuroblastoma; B003=Diagnostic & prognostic studies in NBL; B903=Childhood cancer genetics; B947=Protocol for collection of biology specimens for research studies; B954=Opsoclonus-myoclonus-ataxia syndrome, neuroblastoma and the presence of anti-neuronal antibodies; B973=Laboratory-clinical studies of neuroblastoma; E04=Self-administered epidemiology questionnaire; E18=A case-control study of risk factors for neuroblastoma; I03=Neuroblastoma, diagnostic/prognostic; N891=Parents' perceptions of randomization; P9462=Randomized treatment of recurrent neuroblastoma with topotecan regimens following desferrioxamine (POG only) in an investigational window; P9641=Primary surgical therapy for biologically defined low-risk neuroblastoma; P9761=A phase II trial of irinotecan in children with refractory solid tumors; P9963=A phase II trial of rebeccamycin analogue (NSC #655649) in children with solid tumors; R9702=Prognostic implications of MIBG uptake in patients with neuroblastoma previously treated on CCG-3891; S31=Right atrial catheter 
study; S921=Comparison of urokinase vs heparin in preventing Infection in central venous devices in children with malignancies]"""
chunks = ""
is_the_beginning = True
max_chars = 2000
padding_words = 3
counter = 0
print(len(text))
# divvy up into chunks, sum of each chunk should equal the original text.
for chunk in annotator.sliding_window(text=text, max_characters=max_chars, padding_words= padding_words):
assert len(chunk) <= max_chars
counter += 1
if is_the_beginning:
chunks += chunk
else:
# remove redundant padded words from final result
chunks += " ".join(chunk.split(" ")[padding_words:])
is_the_beginning = False

print(counter)
# after dropping the overlapping padding words, the joined chunks must equal the original text
assert chunks == text

0 comments on commit b6f4730

Please sign in to comment.