Release/2.8.0 (#198)
* Bumping version

* support for extracting dug elements from graph (#197)

* support for extracting dug elements from graph

* adding flag for enabling dug element extraction from graph

* adding new config for node-to-dug-element parsing

* adding more parameters to crawler to enable configuration of the element extraction logic

* add tests

* add tests for crawler

Co-authored-by: Yaphetkg <[email protected]>

* Update _version.py

* Update _version.py

updating version for final push to master

* Update factory.py

Adding more comments

Co-authored-by: Carl Schreep <[email protected]>
Co-authored-by: Yaphetkg <[email protected]>
3 people authored Feb 8, 2022
1 parent 1b5c10a commit 60b473a
Showing 13 changed files with 491 additions and 29 deletions.
2 changes: 1 addition & 1 deletion src/dug/_version.py
@@ -1 +1 @@
__version__ = "2.7.0"
__version__ = "2.8.0"
11 changes: 11 additions & 0 deletions src/dug/cli.py
@@ -59,6 +59,14 @@ def get_argparser():
default=None
)

crawl_parser.add_argument(
"-x", "--extract-from-graph",
help="[Optional] Extract dug elements for tranql using concepts from annotation",
dest="extract_dug_elements",
default=False,
action="store_true"
)

# Search subcommand
search_parser = subparsers.add_parser('search', help='Apply semantic search')
search_parser.set_defaults(func=search)
@@ -95,6 +103,9 @@ def get_argparser():

def crawl(args):
config = Config.from_env()
if not args.extract_dug_elements:
# disable extraction
config.node_to_element_queries = {}
factory = DugFactory(config)
dug = Dug(factory)
dug.crawl(args.target, args.parser_type, args.element_type)
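A standalone sketch of how the new flag behaves; this reproduces the argparse wiring above in isolation rather than importing the real parser:

import argparse

# Reproduction of the crawl_parser wiring shown in the diff above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-x", "--extract-from-graph",
    help="[Optional] Extract dug elements for tranql using concepts from annotation",
    dest="extract_dug_elements",
    default=False,
    action="store_true"
)

args = parser.parse_args(["-x"])
print(args.extract_dug_elements)  # True; without -x it stays False, and
# cli.crawl() then empties config.node_to_element_queries to disable extraction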
8 changes: 8 additions & 0 deletions src/dug/config.py
@@ -63,6 +63,14 @@ class Config:
"phen_to_anat": ["phenotypic_feature", "anatomical_entity"],
})

node_to_element_queries: dict = field(default_factory=lambda: {
# Dug element type to cast the query kg nodes to
"cde": {
# Parse nodes matching criteria in kg
"node_type": "biolink:Publication"
}
})

concept_expander: dict = field(default_factory=lambda: {
"url": "https://tranql-dev.renci.org/tranql/query?dynamic_id_resolution=true&asynchronous=false",
"min_tranql_score": 0.0
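A minimal sketch of working with the new setting, assuming only the Config API shown in this diff and in cli.py; the override below simply restates the shipped default:

from dug.config import Config

# Build config from the environment, as cli.crawl() does.
config = Config.from_env()

# The shipped default: cast biolink:Publication kg nodes to "cde" dug elements.
config.node_to_element_queries = {
    "cde": {"node_type": "biolink:Publication"}
}

# Setting it to {} disables graph extraction, which is what cli.crawl()
# does when -x/--extract-from-graph is not passed.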
8 changes: 5 additions & 3 deletions src/dug/core/annotate.py
@@ -130,7 +130,7 @@ def __init__(self, url, min_tranql_score=0.2):
def is_acceptable_answer(self, answer):
return True

-def expand_identifier(self, identifier, query_factory, kg_filename):
+def expand_identifier(self, identifier, query_factory, kg_filename, include_all_attributes=False):

answer_kgs = []

@@ -182,9 +182,11 @@ def expand_identifier(self, identifier, query_factory, kg_filename):
# Temporarily surround in try/except because sometimes the answer graphs
# contain invalid references to edges/nodes
# This will be fixed in Robokop but for now just silently warn if answer is invalid
node_attributes_filter = None if include_all_attributes else self.include_node_keys
edge_attributes_filter = None if include_all_attributes else self.include_edge_keys
answer_kg = kg.get_answer_subgraph(answer,
-include_node_keys=self.include_node_keys,
-include_edge_keys=self.include_edge_keys)
+include_node_keys=node_attributes_filter,
+include_edge_keys=edge_attributes_filter)

# Add subgraph to list of acceptable answers to query
answer_kgs.append(answer_kg)
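The new keyword only switches between attribute filters; here is a standalone sketch of that selection, with stand-in values for the expander's configured key lists:

# Stand-in for the expander instance's include_node_keys.
include_node_keys = ["id", "name", "category"]

# include_all_attributes=True passes None through to get_answer_subgraph,
# which is the "keep every attribute" signal.
include_all_attributes = True
node_attributes_filter = None if include_all_attributes else include_node_keys
print(node_attributes_filter)  # None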
89 changes: 86 additions & 3 deletions src/dug/core/crawler.py
@@ -4,14 +4,17 @@
import traceback

from dug.core.parsers import Parser, DugElement, DugConcept
import dug.core.tranql as tql
from dug.utils import biolink_snake_case

logger = logging.getLogger('dug')


class Crawler:
def __init__(self, crawl_file: str, parser: Parser, annotator,
tranqlizer, tranql_queries,
-http_session, exclude_identifiers=None, element_type=None):
+http_session, exclude_identifiers=None, element_type=None,
+element_extraction=None):

if exclude_identifiers is None:
exclude_identifiers = []
@@ -24,6 +27,7 @@ def __init__(self, crawl_file: str, parser: Parser, annotator,
self.tranql_queries = tranql_queries
self.http_session = http_session
self.exclude_identifiers = exclude_identifiers
self.element_extraction = element_extraction
self.elements = []
self.concepts = {}
self.crawlspace = "crawl"
@@ -52,7 +56,10 @@ def crawl(self):
# Annotate elements
self.annotate_elements()

-# Expand concepts
+# if elements are extracted from the graph this array will contain the new dug elements
+dug_elements_from_graph = []
+
+# Expand concepts to other concepts
concept_file = open(f"{self.crawlspace}/concept_file.json", "w")
for concept_id, concept in self.concepts.items():
# Use TranQL queries to fetch knowledge graphs containing related but not synonymous biological terms
@@ -70,6 +77,21 @@
# Write concept out to a file
concept_file.write(f"{json.dumps(concept.get_searchable_dict(), indent=2)}")

if self.element_extraction:
for element_extraction_config in self.element_extraction:
casting_config = element_extraction_config['casting_config']
tranql_source = element_extraction_config['tranql_source']
dug_element_type = element_extraction_config['output_dug_type']
dug_elements_from_graph += self.expand_to_dug_element(
concept=concept,
casting_config=casting_config,
dug_element_type=dug_element_type,
tranql_source=tranql_source
)

# add new elements to parsed elements
self.elements += dug_elements_from_graph

# Set element optional terms now that concepts have been expanded
# Open variable file for writing
variable_file = open(f"{self.crawlspace}/element_file.json", "w")
@@ -117,7 +139,6 @@ def annotate_elements(self):
for concept_to_add in concepts_to_add:
element.add_concept(concept_to_add)


def annotate_element(self, element):

# Annotate with a set of normalized ontology identifiers
@@ -172,3 +193,65 @@ def expand_concept(self, concept):
# Add any answer knowledge graphs to
for answer in answers:
concept.add_kg_answer(answer, query_name=query_name)

def expand_to_dug_element(self,
concept,
casting_config,
dug_element_type,
tranql_source):
"""
Given a concept, query the knowledge graph and construct dug elements from the results.
Performs concept -> target_node_type crawls and converts each matching node into a dug
element of type `dug_element_type`.
"""
elements = []
# using node_type as the primary criterion for matching nodes to an element type.
target_node_type = casting_config["node_type"]
target_node_type_snake_case = biolink_snake_case(target_node_type.replace("biolink:", ""))
for ident_id, identifier in concept.identifiers.items():

# Check whether the concept identifier has types defined; these are used to create
# tranql queries below.
if not identifier.types:
continue

# Convert the first type to snake case for use in the tranql query.
# The first type is the leaf type, coming from node normalization.
node_type = biolink_snake_case(identifier.types[0].replace("biolink:", ""))
try:
# The tranql query factory currently supports only select node types as valid queries.
# Types missing from QueryFactory.data_types are skipped via this try/except.
query = tql.QueryFactory([node_type, target_node_type_snake_case], tranql_source)
except tql.InvalidQueryError as exception:
logger.debug(f"Skipping {ident_id}, {exception}")
continue

# check if tranql query object can use the curie.
if query.is_valid_curie(ident_id):
logger.info(f"Expanding {ident_id} to other dug elements")
# Fetch kg and answer
# replace ":" with "~" to avoid windows os errors
kg_outfile = f"{self.crawlspace}/" + f"{ident_id}_{target_node_type}.json".replace(":", "~")

# query tranql, answers will include all node and edge attributes
answers = self.tranqlizer.expand_identifier(ident_id, query,
kg_filename=kg_outfile,
include_all_attributes=True)

# for each answer construct a dug element
for answer in answers:
# Inspect the answers and create new dug elements for nodes matching
# the target node type.
for node_id, node in answer.nodes.items():
if target_node_type in node["category"]:
# @TODO make element creation more generic
# @TODO need to encode more data into the graph nodes, to parse them properly
element = DugElement(
elem_id=node_id,
name=node.get('name', ""),
desc=node.get('summary', ""),
elem_type=dug_element_type
)
element.add_concept(concept)
elements.append(element)
return elements
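A standalone sketch of the casting step at the end of expand_to_dug_element, using a hand-built stand-in for one TranQL answer's node map; the CURIEs and field values are hypothetical:

from dug.core.parsers import DugElement

target_node_type = "biolink:Publication"

# Hypothetical node map, shaped like answer.nodes in the loop above.
nodes = {
    "HEALCDE:0001": {"name": "Example CDE", "summary": "A common data element",
                     "category": ["biolink:Publication"]},
    "MONDO:0005148": {"name": "type 2 diabetes", "category": ["biolink:Disease"]},
}

elements = []
for node_id, node in nodes.items():
    if target_node_type in node["category"]:
        elements.append(DugElement(
            elem_id=node_id,
            name=node.get("name", ""),
            desc=node.get("summary", ""),
            elem_type="cde"
        ))

print(len(elements))  # 1 -- only the biolink:Publication node is cast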
28 changes: 27 additions & 1 deletion src/dug/core/factory.py
Expand Up @@ -40,7 +40,8 @@ def build_crawler(self, target, parser: Parser, element_type: str, tranql_source
tranql_queries=self.build_tranql_queries(tranql_source),
http_session=self.build_http_session(),
exclude_identifiers=self.config.tranql_exclude_identifiers,
-element_type=element_type
+element_type=element_type,
+element_extraction=self.build_element_extraction_parameters(),
)

return crawler
@@ -78,3 +79,28 @@ def build_tranql_queries(self, source=None) -> Dict[str, tql.QueryFactory]:

def build_search_obj(self, indices) -> Search:
return Search(self.config, indices=indices)

def build_element_extraction_parameters(self, source=None):
# Reformats the node_to_element_queries config object for the crawler.
# Uses the same tranql source used for concept crawling.
if source is None:
source = TRANQL_SOURCE
queries = self.config.node_to_element_queries
# Reformat the config as an array; the crawler loops over it
# to make calls to the expansion logic.
# The casting config is a set of conditions that casting is performed on.
# Currently we cast based on the node type returned from the tranql query;
# we might want to filter on curie type or other conditions if
# node type alone is too broad.
return [
{
"output_dug_type": dug_type,
"casting_config": {
"node_type": queries[dug_type]['node_type']
# CDEs are the only ones for now, but if we had two biolink:Publication
# queries that we wanted to conditionally cast to different
# output_dug_types, we could extend this config
},
"tranql_source": source
} for dug_type in queries
]
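Worked through by hand for the default config above, the method yields one entry per configured dug type; the source string below is a stand-in for TRANQL_SOURCE, whose real value lives in the factory module:

# Stand-ins mirroring the default node_to_element_queries and TRANQL_SOURCE.
queries = {"cde": {"node_type": "biolink:Publication"}}
source = "hypothetical:tranql-source"

params = [
    {
        "output_dug_type": dug_type,
        "casting_config": {"node_type": queries[dug_type]["node_type"]},
        "tranql_source": source
    } for dug_type in queries
]
print(params)
# [{'output_dug_type': 'cde',
#   'casting_config': {'node_type': 'biolink:Publication'},
#   'tranql_source': 'hypothetical:tranql-source'}]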
25 changes: 4 additions & 21 deletions src/dug/core/tranql.py
@@ -1,4 +1,5 @@
-import json, re
+import json
+from dug.utils import biolink_snake_case


class MissingNodeReferenceError(BaseException):
@@ -179,25 +180,7 @@ def get_kg(self):
return old_kg_model

def _snake_case(self, arg: str):
"""Convert string to snake_case.
Non-alphanumeric characters are replaced with _.
CamelCase is replaced with snake_case.
"""
# replace non-alphanumeric characters with _
tmp = re.sub(r'\W', '_', arg)
# replace X with _x
tmp = re.sub(
r'(?<=[a-z])[A-Z](?=[a-z])',
lambda c: '_' + c.group(0).lower(),
tmp
)
# lower-case first character
tmp = re.sub(
r'^[A-Z](?=[a-z])',
lambda c: c.group(0).lower(),
tmp
)
return tmp
return biolink_snake_case(arg)


class InvalidQueryError(BaseException):
@@ -207,7 +190,7 @@ class InvalidQueryError(BaseException):
class QueryFactory:

# Class member list of valid data types that can be included in query
data_types = ["phenotypic_feature", "gene", "disease", "chemical_substance",
data_types = ["publication", "phenotypic_feature", "gene", "disease", "chemical_substance",
"drug_exposure", "biological_process", "anatomical_entity", "small_molecule",
"chemical_mixture", "chemical_entity"]

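With "publication" added to data_types, a disease-to-publication query now constructs instead of raising; a sketch, with a source string that mirrors the test mocks below:

from dug.core.tranql import QueryFactory, InvalidQueryError

try:
    # Before this commit, "publication" was missing from data_types and
    # this constructor raised InvalidQueryError.
    query = QueryFactory(["disease", "publication"], source="test")
except InvalidQueryError as exc:
    print(f"rejected: {exc}")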
24 changes: 24 additions & 0 deletions src/dug/utils.py
@@ -1,3 +1,5 @@
import re

class ObjectFactory:
def __init__(self):
self._builders = {}
@@ -34,3 +36,25 @@ def get_dbgap_study_link(study_id):
def get_nida_study_link(study_id):
base_url = "https://datashare.nida.nih.gov/study"
return f'{base_url}/{study_id}'


def biolink_snake_case(arg):
"""Convert such SnakeCase to snake_case.
Non-alphanumeric characters are replaced with _.
CamelCase is replaced with snake_case.
"""
# replace non-alphanumeric characters with _
tmp = re.sub(r'\W', '_', arg)
# replace an interior capital X with _x
tmp = re.sub(
r'(?<=[a-z])[A-Z](?=[a-z])',
lambda c: '_' + c.group(0).lower(),
tmp
)
# lower-case first character
tmp = re.sub(
r'^[A-Z](?=[a-z])',
lambda c: c.group(0).lower(),
tmp
)
return tmp
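Expected behavior of the relocated helper, matching how the crawler applies it to biolink type strings:

from dug.utils import biolink_snake_case

print(biolink_snake_case("PhenotypicFeature"))  # phenotypic_feature
# The crawler strips the curie prefix first:
print(biolink_snake_case("biolink:Publication".replace("biolink:", "")))  # publication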
63 changes: 63 additions & 0 deletions tests/unit/mocks/MockCrawler.py
@@ -0,0 +1,63 @@
from unittest.mock import MagicMock, Mock

import pytest
import os
import json


from dug.core.annotate import Identifier
from dug.core.tranql import QueryFactory, QueryKG

# Make some simple mocks
ParserMock = MagicMock()
HTTPSessionMock = MagicMock()

# mocking tranql queries
TranqlQueriesMock = {}
for key, query in {
"disease": ["disease", "phenotypic_feature"],
"pheno": ["phenotypic_feature", "disease"]
}.items():
TranqlQueriesMock[key] = QueryFactory(query, source="test")


# For testing with no id exclusion
ExcludedIDs = []

ANNOTATED_IDS = [
Identifier("MONDO:0", "0", ["disease"]),
Identifier("PUBCHEM.COMPOUND:1", "1", ["chemical"])
]
for ids in ANNOTATED_IDS:
ids.type = ids.types[0]
# annotator with annotate method returning mocked concepts
AnnotatorMock = MagicMock()
AnnotatorMock.annotate = Mock(return_value=ANNOTATED_IDS)

# tranqlizer returning mock kg when expanding concepts
TranqlizerMock = MagicMock()

# Get example tranql answer
with open(os.path.join(os.path.dirname(__file__), "data", "tranql_response.json")) as stream:
tranql_json = json.load(stream)
kg_answer = QueryKG(kg_json=tranql_json)
TRANQL_ANSWERS = []
for answer in kg_answer.answers:
TRANQL_ANSWERS.append(kg_answer.get_answer_subgraph(answer))

TranqlizerMock.expand_identifier = Mock(return_value=TRANQL_ANSWERS)

# Mock a crawler with mock dependencies
@pytest.fixture
def crawler_init_args_no_graph_extraction():
return {
"crawl_file": "test",
"parser": ParserMock,
"annotator": AnnotatorMock,
"tranqlizer": TranqlizerMock,
"tranql_queries": TranqlQueriesMock,
"http_session": HTTPSessionMock,
"exclude_identifiers": ExcludedIDs,
"element_type": "TestElement",
"element_extraction": None
}
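A hypothetical test built on this fixture; the Crawler import path comes from this repo, while the test body is only a sketch of how the mocks compose:

from dug.core.crawler import Crawler
# Importing the fixture makes it available to tests in this module.
from tests.unit.mocks.MockCrawler import crawler_init_args_no_graph_extraction  # noqa: F401

def test_crawler_init_without_graph_extraction(crawler_init_args_no_graph_extraction):
    crawler = Crawler(**crawler_init_args_no_graph_extraction)
    # element_extraction=None means expand_to_dug_element is never invoked.
    assert crawler.element_extraction is None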
Empty file added tests/unit/mocks/__init__.py
