Release/2.8.0 (#198)
* Bumping version

* support for extracting dug elements from graph (#197)

* support for extracting dug elements from graph

* adding flag for enabling dug element extraction from graph

* adding new config for node-to-dug-element parsing

* adding more parameters to crawler to enable configuration of the element extraction logic

* add tests

* add tests for crawler

Co-authored-by: Yaphetkg <[email protected]>

* Update _version.py

* Update _version.py

updating version for final push to master

* Update factory.py

Adding more comments

Co-authored-by: Carl Schreep <[email protected]>
Co-authored-by: Yaphetkg <[email protected]>
3 people authored Feb 8, 2022
1 parent 1b5c10a commit 60b473a
Showing 13 changed files with 491 additions and 29 deletions.
2 changes: 1 addition & 1 deletion src/dug/_version.py
@@ -1 +1 @@
__version__ = "2.7.0"
__version__ = "2.8.0"
11 changes: 11 additions & 0 deletions src/dug/cli.py
@@ -59,6 +59,14 @@ def get_argparser():
default=None
)

crawl_parser.add_argument(
"-x", "--extract-from-graph",
help="[Optional] Extract dug elements for tranql using concepts from annotation",
dest="extract_dug_elements",
default=False,
action="store_true"
)

# Search subcommand
search_parser = subparsers.add_parser('search', help='Apply semantic search')
search_parser.set_defaults(func=search)
@@ -95,6 +103,9 @@ def get_argparser():

def crawl(args):
config = Config.from_env()
if not args.extract_dug_elements:
# disable extraction
config.node_to_element_queries = {}
factory = DugFactory(config)
dug = Dug(factory)
dug.crawl(args.target, args.parser_type, args.element_type)
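A standalone sketch of how the new flag behaves; this reproduces the argparse wiring above in isolation rather than importing the real parser:

import argparse

# Reproduction of the crawl_parser wiring shown in the diff above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-x", "--extract-from-graph",
    help="[Optional] Extract dug elements for tranql using concepts from annotation",
    dest="extract_dug_elements",
    default=False,
    action="store_true"
)

args = parser.parse_args(["-x"])
print(args.extract_dug_elements)  # True; without -x it stays False, and
# cli.crawl() then empties config.node_to_element_queries to disable extraction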
8 changes: 8 additions & 0 deletions src/dug/config.py
@@ -63,6 +63,14 @@ class Config:
"phen_to_anat": ["phenotypic_feature", "anatomical_entity"],
})

node_to_element_queries: dict = field(default_factory=lambda: {
# Dug element type to cast the query kg nodes to
"cde": {
# Parse nodes matching criteria in kg
"node_type": "biolink:Publication"
}
})

concept_expander: dict = field(default_factory=lambda: {
"url": "https://tranql-dev.renci.org/tranql/query?dynamic_id_resolution=true&asynchronous=false",
"min_tranql_score": 0.0
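A minimal sketch of working with the new setting, assuming only the Config API shown in this diff and in cli.py; the override below simply restates the shipped default:

from dug.config import Config

# Build config from the environment, as cli.crawl() does.
config = Config.from_env()

# The shipped default: cast biolink:Publication kg nodes to "cde" dug elements.
config.node_to_element_queries = {
    "cde": {"node_type": "biolink:Publication"}
}

# Setting it to {} disables graph extraction, which is what cli.crawl()
# does when -x/--extract-from-graph is not passed.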
8 changes: 5 additions & 3 deletions src/dug/core/annotate.py
@@ -130,7 +130,7 @@ def __init__(self, url, min_tranql_score=0.2):
def is_acceptable_answer(self, answer):
return True

-def expand_identifier(self, identifier, query_factory, kg_filename):
+def expand_identifier(self, identifier, query_factory, kg_filename, include_all_attributes=False):

answer_kgs = []

@@ -182,9 +182,11 @@ def expand_identifier(self, identifier, query_factory, kg_filename):
# Temporarily surround in try/except because sometimes the answer graphs
# contain invalid references to edges/nodes
# This will be fixed in Robokop but for now just silently warn if answer is invalid
node_attributes_filter = None if include_all_attributes else self.include_node_keys
edge_attributes_filter = None if include_all_attributes else self.include_edge_keys
answer_kg = kg.get_answer_subgraph(answer,
-include_node_keys=self.include_node_keys,
-include_edge_keys=self.include_edge_keys)
+include_node_keys=node_attributes_filter,
+include_edge_keys=edge_attributes_filter)

# Add subgraph to list of acceptable answers to query
answer_kgs.append(answer_kg)
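The new keyword only switches between attribute filters; here is a standalone sketch of that selection, with stand-in values for the expander's configured key lists:

# Stand-in for the expander instance's include_node_keys.
include_node_keys = ["id", "name", "category"]

# include_all_attributes=True passes None through to get_answer_subgraph,
# which is the "keep every attribute" signal.
include_all_attributes = True
node_attributes_filter = None if include_all_attributes else include_node_keys
print(node_attributes_filter)  # None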
89 changes: 86 additions & 3 deletions src/dug/core/crawler.py
@@ -4,14 +4,17 @@
import traceback

from dug.core.parsers import Parser, DugElement, DugConcept
import dug.core.tranql as tql
from dug.utils import biolink_snake_case

logger = logging.getLogger('dug')


class Crawler:
def __init__(self, crawl_file: str, parser: Parser, annotator,
tranqlizer, tranql_queries,
-http_session, exclude_identifiers=None, element_type=None):
+http_session, exclude_identifiers=None, element_type=None,
+element_extraction=None):

if exclude_identifiers is None:
exclude_identifiers = []
@@ -24,6 +27,7 @@ def __init__(self, crawl_file: str, parser: Parser, annotator,
self.tranql_queries = tranql_queries
self.http_session = http_session
self.exclude_identifiers = exclude_identifiers
self.element_extraction = element_extraction
self.elements = []
self.concepts = {}
self.crawlspace = "crawl"
@@ -52,7 +56,10 @@ def crawl(self):
# Annotate elements
self.annotate_elements()

-# Expand concepts
+# if elements are extracted from the graph this array will contain the new dug elements
+dug_elements_from_graph = []
+
+# Expand concepts to other concepts
concept_file = open(f"{self.crawlspace}/concept_file.json", "w")
for concept_id, concept in self.concepts.items():
# Use TranQL queries to fetch knowledge graphs containing related but not synonymous biological terms
@@ -70,6 +77,21 @@
# Write concept out to a file
concept_file.write(f"{json.dumps(concept.get_searchable_dict(), indent=2)}")

if self.element_extraction:
for element_extraction_config in self.element_extraction:
casting_config = element_extraction_config['casting_config']
tranql_source = element_extraction_config['tranql_source']
dug_element_type = element_extraction_config['output_dug_type']
dug_elements_from_graph += self.expand_to_dug_element(
concept=concept,
casting_config=casting_config,
dug_element_type=dug_element_type,
tranql_source=tranql_source
)

# add new elements to parsed elements
self.elements += dug_elements_from_graph

# Set element optional terms now that concepts have been expanded
# Open variable file for writing
variable_file = open(f"{self.crawlspace}/element_file.json", "w")
@@ -117,7 +139,6 @@ def annotate_elements(self):
for concept_to_add in concepts_to_add:
element.add_concept(concept_to_add)


def annotate_element(self, element):

# Annotate with a set of normalized ontology identifiers
@@ -172,3 +193,65 @@ def expand_concept(self, concept):
# Add any answer knowledge graphs to
for answer in answers:
concept.add_kg_answer(answer, query_name=query_name)

def expand_to_dug_element(self,
concept,
casting_config,
dug_element_type,
tranql_source):
"""
Given a concept, query the knowledge graph and construct dug elements from the results.
Performs concept -> target_node_type crawls and converts each matching node into a dug
element of type `dug_element_type`.
"""
elements = []
# using node_type as the primary criterion for matching nodes to an element type.
target_node_type = casting_config["node_type"]
target_node_type_snake_case = biolink_snake_case(target_node_type.replace("biolink:", ""))
for ident_id, identifier in concept.identifiers.items():

# Check whether the concept identifier has types defined; these are used to create
# tranql queries below.
if not identifier.types:
continue

# Convert the first type to snake case for use in the tranql query.
# The first type is the leaf type, coming from node normalization.
node_type = biolink_snake_case(identifier.types[0].replace("biolink:", ""))
try:
# The tranql query factory currently supports only select node types as valid queries.
# Types missing from QueryFactory.data_types are skipped via this try/except.
query = tql.QueryFactory([node_type, target_node_type_snake_case], tranql_source)
except tql.InvalidQueryError as exception:
logger.debug(f"Skipping {ident_id}, {exception}")
continue

# check if tranql query object can use the curie.
if query.is_valid_curie(ident_id):
logger.info(f"Expanding {ident_id} to other dug elements")
# Fetch kg and answer
# replace ":" with "~" to avoid windows os errors
kg_outfile = f"{self.crawlspace}/" + f"{ident_id}_{target_node_type}.json".replace(":", "~")

# query tranql, answers will include all node and edge attributes
answers = self.tranqlizer.expand_identifier(ident_id, query,
kg_filename=kg_outfile,
include_all_attributes=True)

# for each answer construct a dug element
for answer in answers:
# Inspect the answers and create new dug elements for nodes matching
# the target node type.
for node_id, node in answer.nodes.items():
if target_node_type in node["category"]:
# @TODO make element creation more generic
# @TODO need to encode more data into the graph nodes, to parse them properly
element = DugElement(
elem_id=node_id,
name=node.get('name', ""),
desc=node.get('summary', ""),
elem_type=dug_element_type
)
element.add_concept(concept)
elements.append(element)
return elements
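A standalone sketch of the casting step at the end of expand_to_dug_element, using a hand-built stand-in for one TranQL answer's node map; the CURIEs and field values are hypothetical:

from dug.core.parsers import DugElement

target_node_type = "biolink:Publication"

# Hypothetical node map, shaped like answer.nodes in the loop above.
nodes = {
    "HEALCDE:0001": {"name": "Example CDE", "summary": "A common data element",
                     "category": ["biolink:Publication"]},
    "MONDO:0005148": {"name": "type 2 diabetes", "category": ["biolink:Disease"]},
}

elements = []
for node_id, node in nodes.items():
    if target_node_type in node["category"]:
        elements.append(DugElement(
            elem_id=node_id,
            name=node.get("name", ""),
            desc=node.get("summary", ""),
            elem_type="cde"
        ))

print(len(elements))  # 1 -- only the biolink:Publication node is cast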
28 changes: 27 additions & 1 deletion src/dug/core/factory.py
Expand Up @@ -40,7 +40,8 @@ def build_crawler(self, target, parser: Parser, element_type: str, tranql_source
tranql_queries=self.build_tranql_queries(tranql_source),
http_session=self.build_http_session(),
exclude_identifiers=self.config.tranql_exclude_identifiers,
-element_type=element_type
+element_type=element_type,
+element_extraction=self.build_element_extraction_parameters(),
)

return crawler
@@ -78,3 +79,28 @@ def build_tranql_queries(self, source=None) -> Dict[str, tql.QueryFactory]:

def build_search_obj(self, indices) -> Search:
return Search(self.config, indices=indices)

def build_element_extraction_parameters(self, source=None):
# Reformats the node_to_element_queries config object for the crawler.
# Uses the same tranql source used for concept crawling.
if source is None:
source = TRANQL_SOURCE
queries = self.config.node_to_element_queries
# Reformat the config as an array; the crawler loops over it
# to make calls to the expansion logic.
# The casting config is a set of conditions that casting is performed on.
# Currently we cast based on the node type returned from the tranql query;
# we might want to filter on curie type or other conditions if
# node type alone is too broad.
return [
{
"output_dug_type": dug_type,
"casting_config": {
"node_type": queries[dug_type]['node_type']
# CDEs are the only ones for now, but if we had two biolink:Publication
# queries that we wanted to conditionally cast to different
# output_dug_types, we could extend this config
},
"tranql_source": source
} for dug_type in queries
]
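Worked through by hand for the default config above, the method yields one entry per configured dug type; the source string below is a stand-in for TRANQL_SOURCE, whose real value lives in the factory module:

# Stand-ins mirroring the default node_to_element_queries and TRANQL_SOURCE.
queries = {"cde": {"node_type": "biolink:Publication"}}
source = "hypothetical:tranql-source"

params = [
    {
        "output_dug_type": dug_type,
        "casting_config": {"node_type": queries[dug_type]["node_type"]},
        "tranql_source": source
    } for dug_type in queries
]
print(params)
# [{'output_dug_type': 'cde',
#   'casting_config': {'node_type': 'biolink:Publication'},
#   'tranql_source': 'hypothetical:tranql-source'}]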
25 changes: 4 additions & 21 deletions src/dug/core/tranql.py
@@ -1,4 +1,5 @@
-import json, re
+import json
+from dug.utils import biolink_snake_case


class MissingNodeReferenceError(BaseException):
@@ -179,25 +180,7 @@ def get_kg(self):
return old_kg_model

def _snake_case(self, arg: str):
"""Convert string to snake_case.
Non-alphanumeric characters are replaced with _.
CamelCase is replaced with snake_case.
"""
# replace non-alphanumeric characters with _
tmp = re.sub(r'\W', '_', arg)
# replace X with _x
tmp = re.sub(
r'(?<=[a-z])[A-Z](?=[a-z])',
lambda c: '_' + c.group(0).lower(),
tmp
)
# lower-case first character
tmp = re.sub(
r'^[A-Z](?=[a-z])',
lambda c: c.group(0).lower(),
tmp
)
return tmp
return biolink_snake_case(arg)


class InvalidQueryError(BaseException):
@@ -207,7 +190,7 @@ class InvalidQueryError(BaseException):
class QueryFactory:

# Class member list of valid data types that can be included in query
data_types = ["phenotypic_feature", "gene", "disease", "chemical_substance",
data_types = ["publication", "phenotypic_feature", "gene", "disease", "chemical_substance",
"drug_exposure", "biological_process", "anatomical_entity", "small_molecule",
"chemical_mixture", "chemical_entity"]

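With "publication" added to data_types, a disease-to-publication query now constructs instead of raising; a sketch, with a source string that mirrors the test mocks below:

from dug.core.tranql import QueryFactory, InvalidQueryError

try:
    # Before this commit, "publication" was missing from data_types and
    # this constructor raised InvalidQueryError.
    query = QueryFactory(["disease", "publication"], source="test")
except InvalidQueryError as exc:
    print(f"rejected: {exc}")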
24 changes: 24 additions & 0 deletions src/dug/utils.py
@@ -1,3 +1,5 @@
import re

class ObjectFactory:
def __init__(self):
self._builders = {}
@@ -34,3 +36,25 @@ def get_dbgap_study_link(study_id):
def get_nida_study_link(study_id):
base_url = "https://datashare.nida.nih.gov/study"
return f'{base_url}/{study_id}'


def biolink_snake_case(arg):
"""Convert such SnakeCase to snake_case.
Non-alphanumeric characters are replaced with _.
CamelCase is replaced with snake_case.
"""
# replace non-alphanumeric characters with _
tmp = re.sub(r'\W', '_', arg)
# replace an interior capital X with _x
tmp = re.sub(
r'(?<=[a-z])[A-Z](?=[a-z])',
lambda c: '_' + c.group(0).lower(),
tmp
)
# lower-case first character
tmp = re.sub(
r'^[A-Z](?=[a-z])',
lambda c: c.group(0).lower(),
tmp
)
return tmp
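Expected behavior of the relocated helper, matching how the crawler applies it to biolink type strings:

from dug.utils import biolink_snake_case

print(biolink_snake_case("PhenotypicFeature"))  # phenotypic_feature
# The crawler strips the curie prefix first:
print(biolink_snake_case("biolink:Publication".replace("biolink:", "")))  # publication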
63 changes: 63 additions & 0 deletions tests/unit/mocks/MockCrawler.py
@@ -0,0 +1,63 @@
from unittest.mock import MagicMock, Mock

import pytest
import os
import json


from dug.core.annotate import Identifier
from dug.core.tranql import QueryFactory, QueryKG

# Make some simple mocks
ParserMock = MagicMock()
HTTPSessionMock = MagicMock()

# mocking tranql queries
TranqlQueriesMock = {}
for key, query in {
"disease": ["disease", "phenotypic_feature"],
"pheno": ["phenotypic_feature", "disease"]
}.items():
TranqlQueriesMock[key] = QueryFactory(query, source="test")


# For testing with no id exclusion
ExcludedIDs = []

ANNOTATED_IDS = [
Identifier("MONDO:0", "0", ["disease"]),
Identifier("PUBCHEM.COMPOUND:1", "1", ["chemical"])
]
for ids in ANNOTATED_IDS:
ids.type = ids.types[0]
# annotator with annotate method returning mocked concepts
AnnotatorMock = MagicMock()
AnnotatorMock.annotate = Mock(return_value=ANNOTATED_IDS)

# tranqlizer returning mock kg when expanding concepts
TranqlizerMock = MagicMock()

# Get example tranql answer
with open(os.path.join(os.path.dirname(__file__), "data", "tranql_response.json")) as stream:
tranql_json = json.load(stream)
kg_answer = QueryKG(kg_json=tranql_json)
TRANQL_ANSWERS = []
for answer in kg_answer.answers:
TRANQL_ANSWERS.append(kg_answer.get_answer_subgraph(answer))

TranqlizerMock.expand_identifier = Mock(return_value=TRANQL_ANSWERS)

# Mock a crawler with mock dependencies
@pytest.fixture
def crawler_init_args_no_graph_extraction():
return {
"crawl_file": "test",
"parser": ParserMock,
"annotator": AnnotatorMock,
"tranqlizer": TranqlizerMock,
"tranql_queries": TranqlQueriesMock,
"http_session": HTTPSessionMock,
"exclude_identifiers": ExcludedIDs,
"element_type": "TestElement",
"element_extraction": None
}
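A hypothetical test built on this fixture; the Crawler import path comes from this repo, while the test body is only a sketch of how the mocks compose:

from dug.core.crawler import Crawler
# Importing the fixture makes it available to tests in this module.
from tests.unit.mocks.MockCrawler import crawler_init_args_no_graph_extraction  # noqa: F401

def test_crawler_init_without_graph_extraction(crawler_init_args_no_graph_extraction):
    crawler = Crawler(**crawler_init_args_no_graph_extraction)
    # element_extraction=None means expand_to_dug_element is never invoked.
    assert crawler.element_extraction is None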
Empty file added tests/unit/mocks/__init__.py
